AngleSharp by AngleSharp

<PackageReference Include="AngleSharp" Version="0.9.9.2" />

 XmlTokenizer

sealed class XmlTokenizer : BaseTokenizer
Performs the tokenization of the source code. Most of the information is taken from http://www.w3.org/TR/REC-xml/.
using AngleSharp.Extensions;
using AngleSharp.Html;
using AngleSharp.Services;

namespace AngleSharp.Parser.Xml
{
    /// <summary>
    /// Splits XML source text into tokens (tags, character data, CDATA,
    /// comments, processing instructions, the XML declaration and the
    /// doctype). Each call to <see cref="Get"/> produces the next token.
    /// </summary>
    internal sealed class XmlTokenizer : BaseTokenizer
    {
        /// <summary>
        /// Sentinel character (U+FFFF) that this tokenizer treats as the end
        /// of the input.
        /// </summary>
        private const char EndOfFile = '\uffff';

        private readonly IEntityProvider _resolver;

        // Position recorded at the start of the token currently being read;
        // handed to the token constructors.
        private TextPosition _position;

        /// <summary>
        /// Gets or sets whether invalid or unterminated character references
        /// are kept as literal text instead of raising a parse error.
        /// </summary>
        public bool IsSuppressingErrors { get; set; }

        /// <summary>
        /// Creates a tokenizer over the given source.
        /// </summary>
        /// <param name="source">The text source to read from.</param>
        /// <param name="resolver">Provider used to resolve named entities.</param>
        public XmlTokenizer(TextSource source, IEntityProvider resolver)
            : base(source)
        {
            _resolver = resolver;
        }

        /// <summary>
        /// Reads the next token from the source.
        /// </summary>
        /// <returns>The next token; an end-of-file token once the input is exhausted.</returns>
        public XmlToken Get()
        {
            char current = GetNext();

            if (current != EndOfFile)
            {
                _position = GetCurrentPosition();
                return Data(current);
            }

            return NewEof();
        }

        /// <summary>
        /// Dispatches on the first character of a token: markup, end of file
        /// or plain character data.
        /// </summary>
        private XmlToken Data(char c)
        {
            switch (c)
            {
                case '<':
                    return TagOpen();
                case EndOfFile:
                    return NewEof();
                default:
                    return DataText(c);
            }
        }

        /// <summary>
        /// Collects character data until the next '&lt;' or the end of the
        /// input, resolving character references on the way.
        /// </summary>
        private XmlToken DataText(char c)
        {
            while (true)
            {
                switch (c)
                {
                    case '<':
                    case EndOfFile:
                        // Leave the terminator for the next Get() call.
                        Back();
                        return NewCharacters();
                    case '&':
                        StringBuffer.Append(CharacterReference());
                        c = GetNext();
                        break;
                    case ']':
                        StringBuffer.Append(c);
                        c = CheckNextCharacter();
                        break;
                    default:
                        StringBuffer.Append(c);
                        c = GetNext();
                        break;
                }
            }
        }

        /// <summary>
        /// Called after a ']' in character data. Throws if the text continues
        /// with "]&gt;" (i.e. "]]&gt;" appears in character data); otherwise
        /// returns the character that followed the ']'.
        /// </summary>
        private char CheckNextCharacter()
        {
            char following = GetNext();

            if (following == ']')
            {
                if (GetNext() == '>')
                {
                    throw XmlParseError.XmlInvalidCharData.At(GetCurrentPosition());
                }

                // Only peeked at the third character; step back onto the second ']'.
                Back();
            }

            return following;
        }

        /// <summary>
        /// Reads the content of a CDATA section up to the closing "]]&gt;".
        /// </summary>
        private XmlCDataToken CData()
        {
            char current = GetNext();

            while (true)
            {
                if (current == EndOfFile)
                {
                    throw XmlParseError.EOF.At(GetCurrentPosition());
                }

                if (current == ']' && ContinuesWithSensitive("]]>"))
                {
                    Advance(2);
                    return NewCharacterData();
                }

                StringBuffer.Append(current);
                current = GetNext();
            }
        }

        /// <summary>
        /// Parses a character or entity reference; the leading '&amp;' has
        /// already been consumed. The shared string buffer is used as scratch
        /// space starting at its current length.
        /// </summary>
        /// <returns>
        /// The resolved text. When the reference is invalid and errors are
        /// suppressed, an empty string is returned and the raw reference text
        /// (prefixed with '&amp;') is left in the buffer instead.
        /// </returns>
        private string CharacterReference()
        {
            char c = GetNext();
            int start = StringBuffer.Length;
            bool numeric = c == '#';
            bool hex = false;

            if (numeric)
            {
                c = GetNext();
                hex = c == 'x' || c == 'X';

                if (hex)
                {
                    c = GetNext();

                    while (c.IsHex())
                    {
                        StringBuffer.Append(c);
                        c = GetNext();
                    }
                }
                else
                {
                    while (c.IsDigit())
                    {
                        StringBuffer.Append(c);
                        c = GetNext();
                    }
                }
            }
            else if (c.IsXmlNameStart())
            {
                do
                {
                    StringBuffer.Append(c);
                    c = GetNext();
                }
                while (c.IsXmlName());
            }

            if (c == ';' && StringBuffer.Length > start)
            {
                int length = StringBuffer.Length - start;
                string content = StringBuffer.ToString(start, length);

                if (numeric)
                {
                    // Decimal references (&#65;) must be read as decimal;
                    // only &#x...; references are hexadecimal.
                    int code = hex ? content.FromHex() : content.FromDec();

                    if (code.IsValidAsCharRef())
                    {
                        StringBuffer.Remove(start, length);
                        return code.ConvertFromUtf32();
                    }
                }
                else
                {
                    string symbol = _resolver.GetSymbol(content);

                    if (!string.IsNullOrEmpty(symbol))
                    {
                        StringBuffer.Remove(start, length);
                        return symbol;
                    }
                }

                if (!IsSuppressingErrors)
                {
                    throw XmlParseError.CharacterReferenceInvalidCode.At(_position);
                }

                // Keep the terminating ';' as part of the literal text.
                StringBuffer.Append(c);
            }

            if (!IsSuppressingErrors)
            {
                throw XmlParseError.CharacterReferenceNotTerminated.At(GetCurrentPosition());
            }

            // Errors suppressed: restore the '&' so the reference stays as text.
            StringBuffer.Insert(start, '&');
            return string.Empty;
        }

        /// <summary>
        /// Handles the character after '&lt;': markup declaration, processing
        /// instruction / XML declaration, end tag or start tag.
        /// </summary>
        private XmlToken TagOpen()
        {
            char c = GetNext();

            switch (c)
            {
                case '!':
                    return MarkupDeclaration();
                case '?':
                    c = GetNext();

                    if (ContinuesWithSensitive(TagNames.Xml))
                    {
                        Advance(2);
                        return DeclarationStart();
                    }

                    return ProcessingStart(c);
                case '/':
                    return TagEnd();
                default:
                    if (c.IsXmlNameStart())
                    {
                        StringBuffer.Append(c);
                        return TagName(NewOpenTag());
                    }

                    throw XmlParseError.XmlInvalidStartTag.At(GetCurrentPosition());
            }
        }

        /// <summary>
        /// Reads an end tag: a name, optional whitespace and '&gt;'.
        /// </summary>
        private XmlToken TagEnd()
        {
            char c = GetNext();

            if (c.IsXmlNameStart())
            {
                do
                {
                    StringBuffer.Append(c);
                    c = GetNext();
                }
                while (c.IsXmlName());

                while (c.IsSpaceCharacter())
                {
                    c = GetNext();
                }

                if (c == '>')
                {
                    XmlTagToken closeTag = NewCloseTag();
                    closeTag.Name = FlushBuffer();
                    return closeTag;
                }
            }

            if (c == EndOfFile)
            {
                throw XmlParseError.EOF.At(GetCurrentPosition());
            }

            throw XmlParseError.XmlInvalidEndTag.At(GetCurrentPosition());
        }

        /// <summary>
        /// Reads the remainder of a start tag name (its first character is
        /// already buffered) and continues with what follows it.
        /// </summary>
        private XmlToken TagName(XmlTagToken tag)
        {
            char c = GetNext();

            while (c.IsXmlName())
            {
                StringBuffer.Append(c);
                c = GetNext();
            }

            tag.Name = FlushBuffer();

            if (c == EndOfFile)
            {
                throw XmlParseError.EOF.At(GetCurrentPosition());
            }

            if (c == '>')
            {
                return tag;
            }

            if (c.IsSpaceCharacter())
            {
                return AttributeBeforeName(tag);
            }

            if (c == '/')
            {
                return TagSelfClosing(tag);
            }

            throw XmlParseError.XmlInvalidName.At(GetCurrentPosition());
        }

        /// <summary>
        /// Handles the character after '/' in a start tag; only '&gt;' is accepted.
        /// </summary>
        private XmlToken TagSelfClosing(XmlTagToken tag)
        {
            char c = GetNext();
            tag.IsSelfClosing = true;

            switch (c)
            {
                case '>':
                    return tag;
                case EndOfFile:
                    throw XmlParseError.EOF.At(GetCurrentPosition());
                default:
                    throw XmlParseError.XmlInvalidName.At(GetCurrentPosition());
            }
        }

        /// <summary>
        /// Handles "&lt;!": a comment, a doctype or a CDATA section.
        /// </summary>
        private XmlToken MarkupDeclaration()
        {
            // Step onto the first character after '!'; its value is examined
            // through ContinuesWithSensitive below.
            GetNext();

            if (ContinuesWithSensitive("--"))
            {
                Advance();
                return CommentStart();
            }

            if (ContinuesWithSensitive(TagNames.Doctype))
            {
                Advance(6);
                return Doctype();
            }

            if (ContinuesWithSensitive(Keywords.CData))
            {
                Advance(6);
                return CData();
            }

            throw XmlParseError.UndefinedMarkupDeclaration.At(GetCurrentPosition());
        }

        /// <summary>
        /// Handles the text after "&lt;?xml". If no whitespace follows, the
        /// input is treated as a processing instruction whose target merely
        /// starts with "xml"; otherwise a version attribute is required.
        /// </summary>
        private XmlToken DeclarationStart()
        {
            char c = GetNext();

            if (!c.IsSpaceCharacter())
            {
                StringBuffer.Append(TagNames.Xml);
                return ProcessingTarget(c, NewProcessing());
            }

            do
            {
                c = GetNext();
            }
            while (c.IsSpaceCharacter());

            if (ContinuesWithSensitive(AttributeNames.Version))
            {
                Advance(6);
                return DeclarationVersionAfterName(NewDeclaration());
            }

            throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
        }

        /// <summary>Expects '=' after the "version" attribute name.</summary>
        private XmlToken DeclarationVersionAfterName(XmlDeclarationToken decl)
        {
            if (SkipSpaces() != '=')
            {
                throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
            }

            return DeclarationVersionBeforeValue(decl);
        }

        /// <summary>Expects the opening quote of the version value.</summary>
        private XmlToken DeclarationVersionBeforeValue(XmlDeclarationToken decl)
        {
            char quote = SkipSpaces();

            if (quote != '"' && quote != '\'')
            {
                throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
            }

            return DeclarationVersionValue(decl, quote);
        }

        /// <summary>Reads the version value up to the matching quote.</summary>
        private XmlToken DeclarationVersionValue(XmlDeclarationToken decl, char quote)
        {
            char c = GetNext();

            while (c != quote)
            {
                if (c == EndOfFile)
                {
                    throw XmlParseError.EOF.At(GetCurrentPosition());
                }

                StringBuffer.Append(c);
                c = GetNext();
            }

            decl.Version = FlushBuffer();
            c = GetNext();

            if (c.IsSpaceCharacter())
            {
                return DeclarationAfterVersion(decl);
            }

            return DeclarationEnd(c, decl);
        }

        /// <summary>
        /// After the version value: an optional encoding or standalone
        /// attribute, or the end of the declaration.
        /// </summary>
        private XmlToken DeclarationAfterVersion(XmlDeclarationToken decl)
        {
            char c = SkipSpaces();

            if (ContinuesWithSensitive(AttributeNames.Encoding))
            {
                Advance(7);
                return DeclarationEncodingAfterName(decl);
            }

            if (ContinuesWithSensitive(AttributeNames.Standalone))
            {
                Advance(9);
                return DeclarationStandaloneAfterName(decl);
            }

            return DeclarationEnd(c, decl);
        }

        /// <summary>Expects '=' after the "encoding" attribute name.</summary>
        private XmlToken DeclarationEncodingAfterName(XmlDeclarationToken decl)
        {
            if (SkipSpaces() != '=')
            {
                throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
            }

            return DeclarationEncodingBeforeValue(decl);
        }

        /// <summary>
        /// Expects the opening quote of the encoding value followed by a
        /// letter, which becomes the first character of the value.
        /// </summary>
        private XmlToken DeclarationEncodingBeforeValue(XmlDeclarationToken decl)
        {
            char c = SkipSpaces();

            if (c == '"' || c == '\'')
            {
                char quote = c;
                c = GetNext();

                if (c.IsLetter())
                {
                    StringBuffer.Append(c);
                    return DeclarationEncodingValue(decl, quote);
                }
            }

            throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
        }

        /// <summary>
        /// Reads the rest of the encoding value; only ASCII alphanumerics,
        /// '.', '_' and '-' are accepted before the closing quote.
        /// </summary>
        private XmlToken DeclarationEncodingValue(XmlDeclarationToken decl, char quote)
        {
            char c = GetNext();

            while (c != quote)
            {
                bool allowed = c.IsAlphanumericAscii() || c == '.' || c == '_' || c == '-';

                if (!allowed)
                {
                    throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
                }

                StringBuffer.Append(c);
                c = GetNext();
            }

            decl.Encoding = FlushBuffer();
            c = GetNext();

            if (c.IsSpaceCharacter())
            {
                return DeclarationAfterEncoding(decl);
            }

            return DeclarationEnd(c, decl);
        }

        /// <summary>
        /// After the encoding value: an optional standalone attribute or the
        /// end of the declaration.
        /// </summary>
        private XmlToken DeclarationAfterEncoding(XmlDeclarationToken decl)
        {
            char c = SkipSpaces();

            if (ContinuesWithSensitive(AttributeNames.Standalone))
            {
                Advance(9);
                return DeclarationStandaloneAfterName(decl);
            }

            return DeclarationEnd(c, decl);
        }

        /// <summary>Expects '=' after the "standalone" attribute name.</summary>
        private XmlToken DeclarationStandaloneAfterName(XmlDeclarationToken decl)
        {
            if (SkipSpaces() != '=')
            {
                throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
            }

            return DeclarationStandaloneBeforeValue(decl);
        }

        /// <summary>Expects the opening quote of the standalone value.</summary>
        private XmlToken DeclarationStandaloneBeforeValue(XmlDeclarationToken decl)
        {
            char quote = SkipSpaces();

            if (quote != '"' && quote != '\'')
            {
                throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
            }

            return DeclarationStandaloneValue(decl, quote);
        }

        /// <summary>
        /// Reads the standalone value, which must match
        /// <c>Keywords.Yes</c> or <c>Keywords.No</c>.
        /// </summary>
        private XmlToken DeclarationStandaloneValue(XmlDeclarationToken decl, char quote)
        {
            char c = GetNext();

            while (c != quote)
            {
                if (c == EndOfFile)
                {
                    throw XmlParseError.EOF.At(GetCurrentPosition());
                }

                StringBuffer.Append(c);
                c = GetNext();
            }

            string value = FlushBuffer();

            if (value.Is(Keywords.Yes))
            {
                decl.Standalone = true;
            }
            else if (value.Is(Keywords.No))
            {
                decl.Standalone = false;
            }
            else
            {
                throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
            }

            return DeclarationEnd(GetNext(), decl);
        }

        /// <summary>
        /// Skips whitespace and expects the closing "?&gt;" of the declaration.
        /// </summary>
        private XmlDeclarationToken DeclarationEnd(char c, XmlDeclarationToken decl)
        {
            while (c.IsSpaceCharacter())
            {
                c = GetNext();
            }

            if (c != '?' || GetNext() != '>')
            {
                throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
            }

            return decl;
        }

        /// <summary>Expects whitespace after the doctype keyword.</summary>
        private XmlToken Doctype()
        {
            if (GetNext().IsSpaceCharacter())
            {
                return DoctypeNameBefore();
            }

            throw XmlParseError.DoctypeInvalid.At(GetCurrentPosition());
        }

        /// <summary>Expects the first character of the doctype name.</summary>
        private XmlToken DoctypeNameBefore()
        {
            char c = SkipSpaces();

            if (c.IsXmlNameStart())
            {
                StringBuffer.Append(c);
                return DoctypeName(NewDoctype());
            }

            throw XmlParseError.DoctypeInvalid.At(GetCurrentPosition());
        }

        /// <summary>Reads the remainder of the doctype name.</summary>
        private XmlToken DoctypeName(XmlDoctypeToken doctype)
        {
            char c = GetNext();

            while (c.IsXmlName())
            {
                StringBuffer.Append(c);
                c = GetNext();
            }

            doctype.Name = FlushBuffer();

            if (c == '>')
            {
                return doctype;
            }

            if (c.IsSpaceCharacter())
            {
                return DoctypeNameAfter(doctype);
            }

            throw XmlParseError.DoctypeInvalid.At(GetCurrentPosition());
        }

        /// <summary>
        /// After the doctype name: end of the doctype, a public or system
        /// identifier keyword, or a '['.
        /// </summary>
        private XmlToken DoctypeNameAfter(XmlDoctypeToken doctype)
        {
            char c = SkipSpaces();

            if (c == '>')
            {
                return doctype;
            }

            if (ContinuesWithSensitive(Keywords.Public))
            {
                Advance(5);
                return DoctypePublic(doctype);
            }

            if (ContinuesWithSensitive(Keywords.System))
            {
                Advance(5);
                return DoctypeSystem(doctype);
            }

            if (c == '[')
            {
                Advance();
                return DoctypeAfter(GetNext(), doctype);
            }

            throw XmlParseError.DoctypeInvalid.At(GetCurrentPosition());
        }

        /// <summary>
        /// After the public keyword: whitespace and the opening quote of the
        /// public identifier are required.
        /// </summary>
        private XmlToken DoctypePublic(XmlDoctypeToken doctype)
        {
            char c = GetNext();

            if (c.IsSpaceCharacter())
            {
                c = SkipSpaces();

                if (c == '"' || c == '\'')
                {
                    doctype.PublicIdentifier = string.Empty;
                    return DoctypePublicIdentifierValue(doctype, c);
                }
            }

            throw XmlParseError.DoctypeInvalid.At(GetCurrentPosition());
        }

        /// <summary>
        /// Reads the public identifier up to the matching quote; every
        /// character must satisfy <c>IsPubidChar</c>.
        /// </summary>
        private XmlToken DoctypePublicIdentifierValue(XmlDoctypeToken doctype, char quote)
        {
            char c = GetNext();

            while (c != quote)
            {
                if (!c.IsPubidChar())
                {
                    throw XmlParseError.XmlInvalidPubId.At(GetCurrentPosition());
                }

                StringBuffer.Append(c);
                c = GetNext();
            }

            doctype.PublicIdentifier = FlushBuffer();
            return DoctypePublicIdentifierAfter(doctype);
        }

        /// <summary>
        /// After the public identifier: end of the doctype, or whitespace
        /// leading to an optional system identifier.
        /// </summary>
        private XmlToken DoctypePublicIdentifierAfter(XmlDoctypeToken doctype)
        {
            char c = GetNext();

            if (c == '>')
            {
                return doctype;
            }

            if (c.IsSpaceCharacter())
            {
                return DoctypeBetween(doctype);
            }

            throw XmlParseError.DoctypeInvalid.At(GetCurrentPosition());
        }

        /// <summary>
        /// Between public and system identifier: end of the doctype or the
        /// opening quote of the system identifier.
        /// </summary>
        private XmlToken DoctypeBetween(XmlDoctypeToken doctype)
        {
            char c = SkipSpaces();

            switch (c)
            {
                case '>':
                    return doctype;
                case '"':
                case '\'':
                    doctype.SystemIdentifier = string.Empty;
                    return DoctypeSystemIdentifierValue(doctype, c);
                default:
                    throw XmlParseError.DoctypeInvalid.At(GetCurrentPosition());
            }
        }

        /// <summary>
        /// After the system keyword: whitespace and the opening quote of the
        /// system identifier are required.
        /// </summary>
        private XmlToken DoctypeSystem(XmlDoctypeToken doctype)
        {
            char c = GetNext();

            if (c.IsSpaceCharacter())
            {
                c = SkipSpaces();

                if (c == '"' || c == '\'')
                {
                    doctype.SystemIdentifier = string.Empty;
                    return DoctypeSystemIdentifierValue(doctype, c);
                }
            }

            throw XmlParseError.DoctypeInvalid.At(GetCurrentPosition());
        }

        /// <summary>Reads the system identifier up to the matching quote.</summary>
        private XmlToken DoctypeSystemIdentifierValue(XmlDoctypeToken doctype, char quote)
        {
            char c = GetNext();

            while (c != quote)
            {
                if (c == EndOfFile)
                {
                    throw XmlParseError.EOF.At(GetCurrentPosition());
                }

                StringBuffer.Append(c);
                c = GetNext();
            }

            doctype.SystemIdentifier = FlushBuffer();
            return DoctypeSystemIdentifierAfter(doctype);
        }

        /// <summary>
        /// After the system identifier: an optional '[' and then the end of
        /// the doctype.
        /// </summary>
        private XmlToken DoctypeSystemIdentifierAfter(XmlDoctypeToken doctype)
        {
            char c = SkipSpaces();

            if (c == '[')
            {
                Advance();
                c = GetNext();
            }

            return DoctypeAfter(c, doctype);
        }

        /// <summary>Skips whitespace and expects the closing '&gt;' of the doctype.</summary>
        private XmlToken DoctypeAfter(char c, XmlDoctypeToken doctype)
        {
            while (c.IsSpaceCharacter())
            {
                c = GetNext();
            }

            if (c == '>')
            {
                return doctype;
            }

            throw XmlParseError.DoctypeInvalid.At(GetCurrentPosition());
        }

        /// <summary>
        /// Inside a start tag after whitespace: end of the tag, a
        /// self-closing marker or the start of an attribute name.
        /// </summary>
        private XmlToken AttributeBeforeName(XmlTagToken tag)
        {
            char c = SkipSpaces();

            switch (c)
            {
                case '/':
                    return TagSelfClosing(tag);
                case '>':
                    return tag;
                case EndOfFile:
                    throw XmlParseError.EOF.At(GetCurrentPosition());
                default:
                    if (c.IsXmlNameStart())
                    {
                        StringBuffer.Append(c);
                        return AttributeName(tag);
                    }

                    throw XmlParseError.XmlInvalidAttribute.At(GetCurrentPosition());
            }
        }

        /// <summary>
        /// Reads the remainder of an attribute name, registers it on the tag
        /// and expects '=' (optionally preceded by whitespace).
        /// </summary>
        private XmlToken AttributeName(XmlTagToken tag)
        {
            char c = GetNext();

            while (c.IsXmlName())
            {
                StringBuffer.Append(c);
                c = GetNext();
            }

            string name = FlushBuffer();

            // NOTE(review): the uniqueness check only fires when the existing
            // attribute value is non-empty - confirm that an earlier attribute
            // with an empty value is meant to be exempt.
            if (!string.IsNullOrEmpty(tag.GetAttribute(name)))
            {
                throw XmlParseError.XmlUniqueAttribute.At(GetCurrentPosition());
            }

            tag.AddAttribute(name);

            while (c.IsSpaceCharacter())
            {
                c = GetNext();
            }

            if (c != '=')
            {
                throw XmlParseError.XmlInvalidAttribute.At(GetCurrentPosition());
            }

            return AttributeBeforeValue(tag);
        }

        /// <summary>Expects the opening quote of an attribute value.</summary>
        private XmlToken AttributeBeforeValue(XmlTagToken tag)
        {
            char quote = SkipSpaces();

            if (quote != '"' && quote != '\'')
            {
                throw XmlParseError.XmlInvalidAttribute.At(GetCurrentPosition());
            }

            return AttributeValue(tag, quote);
        }

        /// <summary>
        /// Reads an attribute value up to the matching quote, resolving
        /// character references and rejecting a literal '&lt;'.
        /// </summary>
        private XmlToken AttributeValue(XmlTagToken tag, char quote)
        {
            char c = GetNext();

            while (c != quote)
            {
                switch (c)
                {
                    case EndOfFile:
                        throw XmlParseError.EOF.At(GetCurrentPosition());
                    case '<':
                        throw XmlParseError.XmlLtInAttributeValue.At(GetCurrentPosition());
                    case '&':
                        StringBuffer.Append(CharacterReference());
                        break;
                    default:
                        StringBuffer.Append(c);
                        break;
                }

                c = GetNext();
            }

            tag.SetAttributeValue(FlushBuffer());
            return AttributeAfterValue(tag);
        }

        /// <summary>
        /// After an attribute value: whitespace (more attributes may follow),
        /// a self-closing marker or the end of the tag.
        /// </summary>
        private XmlToken AttributeAfterValue(XmlTagToken tag)
        {
            char c = GetNext();

            if (c.IsSpaceCharacter())
            {
                return AttributeBeforeName(tag);
            }

            switch (c)
            {
                case '/':
                    return TagSelfClosing(tag);
                case '>':
                    return tag;
                default:
                    throw XmlParseError.XmlInvalidAttribute.At(GetCurrentPosition());
            }
        }

        /// <summary>
        /// Starts a processing instruction; <paramref name="c"/> must be a
        /// valid first character of the target name.
        /// </summary>
        private XmlToken ProcessingStart(char c)
        {
            if (c.IsXmlNameStart())
            {
                StringBuffer.Append(c);
                return ProcessingTarget(GetNext(), NewProcessing());
            }

            throw XmlParseError.XmlInvalidPI.At(GetCurrentPosition());
        }

        /// <summary>
        /// Reads the remainder of the processing instruction target. A target
        /// matching <c>TagNames.Xml</c> (checked with <c>Isi</c>) is rejected.
        /// </summary>
        private XmlToken ProcessingTarget(char c, XmlPIToken pi)
        {
            while (c.IsXmlName())
            {
                StringBuffer.Append(c);
                c = GetNext();
            }

            pi.Target = FlushBuffer();

            if (pi.Target.Isi(TagNames.Xml))
            {
                throw XmlParseError.XmlInvalidPI.At(GetCurrentPosition());
            }

            if (c == '?')
            {
                c = GetNext();

                if (c == '>')
                {
                    return pi;
                }
            }
            else if (c.IsSpaceCharacter())
            {
                return ProcessingContent(pi);
            }

            throw XmlParseError.XmlInvalidPI.At(GetCurrentPosition());
        }

        /// <summary>
        /// Reads processing instruction content up to the closing "?&gt;".
        /// </summary>
        private XmlToken ProcessingContent(XmlPIToken pi)
        {
            char c = GetNext();

            while (true)
            {
                switch (c)
                {
                    case '?':
                        c = GetNext();

                        if (c == '>')
                        {
                            pi.Content = FlushBuffer();
                            return pi;
                        }

                        // A lone '?' is ordinary content; the character that
                        // followed it is handled by the next iteration.
                        StringBuffer.Append('?');
                        break;
                    case EndOfFile:
                        throw XmlParseError.EOF.At(GetCurrentPosition());
                    default:
                        StringBuffer.Append(c);
                        c = GetNext();
                        break;
                }
            }
        }

        /// <summary>Begins reading comment content after "&lt;!--".</summary>
        private XmlToken CommentStart()
        {
            return Comment(GetNext());
        }

        /// <summary>
        /// Collects comment characters until a '-' is met; a character that
        /// fails <c>IsXmlChar</c> is an error.
        /// </summary>
        private XmlToken Comment(char c)
        {
            while (c.IsXmlChar())
            {
                if (c == '-')
                {
                    return CommentDash();
                }

                StringBuffer.Append(c);
                c = GetNext();
            }

            throw XmlParseError.XmlInvalidComment.At(GetCurrentPosition());
        }

        /// <summary>
        /// Handles the character after a '-' inside a comment. A second '-'
        /// starts the comment end; anything else means the dash was ordinary
        /// comment content.
        /// </summary>
        private XmlToken CommentDash()
        {
            char c = GetNext();

            if (c == '-')
            {
                return CommentEnd();
            }

            // The single dash is part of the comment text; keep it rather
            // than silently dropping it.
            StringBuffer.Append('-');
            return Comment(c);
        }

        /// <summary>Expects '&gt;' after "--" inside a comment.</summary>
        private XmlToken CommentEnd()
        {
            if (GetNext() == '>')
            {
                return NewComment();
            }

            throw XmlParseError.XmlInvalidComment.At(GetCurrentPosition());
        }

        /// <summary>Creates an end-of-file token at the current position.</summary>
        private XmlEndOfFileToken NewEof()
        {
            return new XmlEndOfFileToken(GetCurrentPosition());
        }

        /// <summary>Creates a character token from the flushed buffer.</summary>
        private XmlCharacterToken NewCharacters()
        {
            string content = FlushBuffer();
            return new XmlCharacterToken(_position, content);
        }

        /// <summary>Creates a comment token from the flushed buffer.</summary>
        private XmlCommentToken NewComment()
        {
            string content = FlushBuffer();
            return new XmlCommentToken(_position, content);
        }

        /// <summary>Creates an empty processing instruction token.</summary>
        private XmlPIToken NewProcessing()
        {
            return new XmlPIToken(_position);
        }

        /// <summary>Creates an empty doctype token.</summary>
        private XmlDoctypeToken NewDoctype()
        {
            return new XmlDoctypeToken(_position);
        }

        /// <summary>Creates an empty XML declaration token.</summary>
        private XmlDeclarationToken NewDeclaration()
        {
            return new XmlDeclarationToken(_position);
        }

        /// <summary>Creates an empty start tag token.</summary>
        private XmlTagToken NewOpenTag()
        {
            return new XmlTagToken(XmlTokenType.StartTag, _position);
        }

        /// <summary>Creates an empty end tag token.</summary>
        private XmlTagToken NewCloseTag()
        {
            return new XmlTagToken(XmlTokenType.EndTag, _position);
        }

        /// <summary>Creates a CDATA token from the flushed buffer.</summary>
        private XmlCDataToken NewCharacterData()
        {
            string content = FlushBuffer();
            return new XmlCDataToken(_position, content);
        }
    }
}