AngleSharp by AngleSharp

<PackageReference Include="AngleSharp" Version="0.9.4" />

 XmlTokenizer

sealed class XmlTokenizer : BaseTokenizer
Performs the tokenization of the source code. Most of the information is taken from http://www.w3.org/TR/REC-xml/.
using AngleSharp.Events;
using AngleSharp.Extensions;
using AngleSharp.Html;
using AngleSharp.Services;
using System.Diagnostics;

namespace AngleSharp.Parser.Xml
{
    /// <summary>
    /// Performs the tokenization of XML source code. Most of the information
    /// is taken from http://www.w3.org/TR/REC-xml/.
    /// </summary>
    /// <remarks>
    /// The tokenizer is a hand-written state machine: every private method is
    /// one state that receives the current character, consumes input through
    /// the inherited reader helpers and either returns a finished token or
    /// throws one of the <c>XmlParseError</c> errors positioned in the source.
    /// </remarks>
    [DebuggerStepThrough]
    internal sealed class XmlTokenizer : BaseTokenizer
    {
        // NOTE(review): in the text this file was recovered from, the sentinel
        // showed up as an empty character literal (an unprintable character).
        // It is assumed to be U+FFFF, i.e. the value GetNext() yields once the
        // source is exhausted - confirm against BaseTokenizer.
        private const char EndOfFile = char.MaxValue;

        private readonly IEntityService _resolver;

        // Position of the first character of the token currently being built.
        private TextPosition _position;

        /// <summary>
        /// Creates a new tokenizer for XML documents.
        /// </summary>
        /// <param name="source">The source code manager.</param>
        /// <param name="events">The event aggregator handed to the base tokenizer.</param>
        /// <param name="resolver">The service used to resolve named entities.</param>
        public XmlTokenizer(TextSource source, IEventAggregator events, IEntityService resolver)
            : base(source, events)
        {
            _resolver = resolver;
        }

        /// <summary>
        /// Reads the next token from the source.
        /// </summary>
        /// <returns>The next token; an end-of-file token once the input is exhausted.</returns>
        public XmlToken Get()
        {
            char current = GetNext();

            if (current == EndOfFile)
            {
                return NewEof();
            }

            _position = GetCurrentPosition();
            return Data(current);
        }

        #region General

        /// <summary>
        /// Entry state: decides between markup, character data and end of file.
        /// </summary>
        private XmlToken Data(char c)
        {
            switch (c)
            {
                case '<':
                    return TagOpen(GetNext());
                case EndOfFile:
                    return NewEof();
                default:
                    return DataText(c);
            }
        }

        /// <summary>
        /// Collects character data until the next '&lt;' or the end of the input,
        /// resolving character references on the way.
        /// </summary>
        private XmlToken DataText(char c)
        {
            while (true)
            {
                switch (c)
                {
                    case EndOfFile:
                    case '<':
                        // Leave the terminator for the next call to Get().
                        Back();
                        return NewCharacters();

                    case '&':
                        _stringBuffer.Append(CharacterReference(GetNext()));
                        c = GetNext();
                        break;

                    case ']':
                        _stringBuffer.Append(c);
                        c = CheckCharacter(GetNext());
                        break;

                    default:
                        _stringBuffer.Append(c);
                        c = GetNext();
                        break;
                }
            }
        }

        /// <summary>
        /// Called with the character following a ']' in character data.
        /// Rejects the sequence "]]&gt;", which must not appear in character data.
        /// </summary>
        /// <returns>The character that was passed in.</returns>
        private char CheckCharacter(char ch)
        {
            if (ch == ']')
            {
                if (GetNext() == '>')
                {
                    throw XmlParseError.XmlInvalidCharData.At(GetCurrentPosition());
                }

                // Only a look-ahead: undo the extra read.
                Back();
            }

            return ch;
        }

        /// <summary>
        /// Collects the content of a CDATA section up to the closing "]]&gt;".
        /// </summary>
        private XmlCDataToken CData(char c)
        {
            while (true)
            {
                if (c == EndOfFile)
                {
                    throw XmlParseError.EOF.At(GetCurrentPosition());
                }

                if (c == ']' && ContinuesWithSensitive("]]>"))
                {
                    Advance(2);
                    return NewCharacterData();
                }

                _stringBuffer.Append(c);
                c = GetNext();
            }
        }

        /// <summary>
        /// Parses a character or entity reference; <paramref name="c"/> is the
        /// character directly after the '&amp;'.
        /// </summary>
        /// <returns>The replacement text of the reference.</returns>
        /// <remarks>
        /// Numeric references are decoded as hexadecimal only when introduced by
        /// "#x" / "#X"; a plain "#" reference is decoded as decimal. Named
        /// references are resolved through the entity service.
        /// </remarks>
        private string CharacterReference(char c)
        {
            // The reference is staged at the end of the shared buffer and
            // removed again once it has been read completely.
            int start = _stringBuffer.Length;
            bool isNumeric = c == '#';
            bool isHex = false;

            if (isNumeric)
            {
                c = GetNext();
                isHex = c == 'x' || c == 'X';

                if (isHex)
                {
                    c = GetNext();

                    while (c.IsHex())
                    {
                        _stringBuffer.Append(c);
                        c = GetNext();
                    }
                }
                else
                {
                    while (c.IsDigit())
                    {
                        _stringBuffer.Append(c);
                        c = GetNext();
                    }
                }
            }
            else if (c.IsXmlNameStart())
            {
                do
                {
                    _stringBuffer.Append(c);
                    c = GetNext();
                }
                while (c.IsXmlName());
            }

            if (c != ';' || _stringBuffer.Length <= start)
            {
                throw XmlParseError.CharacterReferenceNotTerminated.At(GetCurrentPosition());
            }

            int length = _stringBuffer.Length - start;
            string content = _stringBuffer.ToString(start, length);
            _stringBuffer.Remove(start, length);

            if (isNumeric)
            {
                int code = isHex ? content.FromHex() : content.FromDec();

                if (!code.IsValidAsCharRef())
                {
                    throw XmlParseError.CharacterReferenceInvalidNumber.At(_position);
                }

                return code.ConvertFromUtf32();
            }

            string symbol = _resolver.GetSymbol(content);

            if (string.IsNullOrEmpty(symbol))
            {
                throw XmlParseError.CharacterReferenceInvalidCode.At(_position);
            }

            return symbol;
        }

        #endregion

        #region Tags

        /// <summary>
        /// State after a '&lt;': dispatches to markup declarations, processing
        /// instructions / the XML declaration, end tags or start tags.
        /// </summary>
        private XmlToken TagOpen(char c)
        {
            switch (c)
            {
                case '!':
                    return MarkupDeclaration(GetNext());

                case '?':
                    c = GetNext();

                    if (ContinuesWithSensitive(TagNames.Xml))
                    {
                        Advance(2);
                        return DeclarationStart(GetNext());
                    }

                    return ProcessingStart(c);

                case '/':
                    return TagEnd(GetNext());

                default:
                    if (c.IsXmlNameStart())
                    {
                        _stringBuffer.Append(c);
                        return TagName(GetNext(), NewOpenTag());
                    }

                    throw XmlParseError.XmlInvalidStartTag.At(GetCurrentPosition());
            }
        }

        /// <summary>
        /// Reads an end tag: a name, optional whitespace and the closing '&gt;'.
        /// </summary>
        private XmlToken TagEnd(char c)
        {
            if (c.IsXmlNameStart())
            {
                do
                {
                    _stringBuffer.Append(c);
                    c = GetNext();
                }
                while (c.IsXmlName());

                while (c.IsSpaceCharacter())
                {
                    c = GetNext();
                }

                if (c == '>')
                {
                    XmlTagToken tag = NewCloseTag();
                    tag.Name = FlushBuffer();
                    return tag;
                }
            }

            if (c == EndOfFile)
            {
                throw XmlParseError.EOF.At(GetCurrentPosition());
            }

            throw XmlParseError.XmlInvalidEndTag.At(GetCurrentPosition());
        }

        /// <summary>
        /// Reads the remainder of a start tag name and continues with the
        /// attributes, the self-closing marker or the end of the tag.
        /// </summary>
        private XmlToken TagName(char c, XmlTagToken tag)
        {
            while (c.IsXmlName())
            {
                _stringBuffer.Append(c);
                c = GetNext();
            }

            tag.Name = FlushBuffer();

            if (c == EndOfFile)
            {
                throw XmlParseError.EOF.At(GetCurrentPosition());
            }

            if (c == '>')
            {
                return tag;
            }

            if (c.IsSpaceCharacter())
            {
                return AttributeBeforeName(GetNext(), tag);
            }

            if (c == '/')
            {
                return TagSelfClosing(GetNext(), tag);
            }

            throw XmlParseError.XmlInvalidName.At(GetCurrentPosition());
        }

        /// <summary>
        /// State after the '/' of a self-closing tag; only '&gt;' may follow.
        /// </summary>
        private XmlToken TagSelfClosing(char c, XmlTagToken tag)
        {
            tag.IsSelfClosing = true;

            switch (c)
            {
                case '>':
                    return tag;
                case EndOfFile:
                    throw XmlParseError.EOF.At(GetCurrentPosition());
                default:
                    throw XmlParseError.XmlInvalidName.At(GetCurrentPosition());
            }
        }

        /// <summary>
        /// State after "&lt;!": recognizes comments, the doctype and CDATA sections.
        /// </summary>
        private XmlToken MarkupDeclaration(char c)
        {
            if (ContinuesWithSensitive("--"))
            {
                Advance();
                return CommentStart(GetNext());
            }

            if (ContinuesWithSensitive(TagNames.Doctype))
            {
                Advance(6);
                return Doctype(GetNext());
            }

            if (ContinuesWithSensitive(Keywords.CData))
            {
                Advance(6);
                return CData(GetNext());
            }

            throw XmlParseError.UndefinedMarkupDeclaration.At(GetCurrentPosition());
        }

        #endregion

        #region XML Declaration

        /// <summary>
        /// State after "&lt;?xml". When no whitespace follows, the construct is a
        /// processing instruction whose target merely starts with "xml".
        /// </summary>
        private XmlToken DeclarationStart(char c)
        {
            if (!c.IsSpaceCharacter())
            {
                _stringBuffer.Append(TagNames.Xml);
                return ProcessingTarget(c, NewProcessing());
            }

            do
            {
                c = GetNext();
            }
            while (c.IsSpaceCharacter());

            if (ContinuesWithSensitive(AttributeNames.Version))
            {
                Advance(6);
                return DeclarationVersionAfterName(GetNext(), NewDeclaration());
            }

            throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
        }

        /// <summary>
        /// State after the "version" keyword; expects '='.
        /// </summary>
        private XmlToken DeclarationVersionAfterName(char c, XmlDeclarationToken decl)
        {
            while (c.IsSpaceCharacter())
            {
                c = GetNext();
            }

            if (c == '=')
            {
                return DeclarationVersionBeforeValue(GetNext(), decl);
            }

            throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
        }

        /// <summary>
        /// State after "version="; expects the opening quote of the value.
        /// </summary>
        private XmlToken DeclarationVersionBeforeValue(char c, XmlDeclarationToken decl)
        {
            while (c.IsSpaceCharacter())
            {
                c = GetNext();
            }

            if (c == '"' || c == '\'')
            {
                return DeclarationVersionValue(GetNext(), c, decl);
            }

            throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
        }

        /// <summary>
        /// Reads the version value up to the closing quote <paramref name="q"/>.
        /// </summary>
        private XmlToken DeclarationVersionValue(char c, char q, XmlDeclarationToken decl)
        {
            while (c != q)
            {
                if (c == EndOfFile)
                {
                    throw XmlParseError.EOF.At(GetCurrentPosition());
                }

                _stringBuffer.Append(c);
                c = GetNext();
            }

            decl.Version = FlushBuffer();
            c = GetNext();

            return c.IsSpaceCharacter()
                ? DeclarationAfterVersion(c, decl)
                : DeclarationEnd(c, decl);
        }

        /// <summary>
        /// State after the version value; "encoding", "standalone" or the end
        /// of the declaration may follow.
        /// </summary>
        private XmlToken DeclarationAfterVersion(char c, XmlDeclarationToken decl)
        {
            while (c.IsSpaceCharacter())
            {
                c = GetNext();
            }

            if (ContinuesWithSensitive(AttributeNames.Encoding))
            {
                Advance(7);
                return DeclarationEncodingAfterName(GetNext(), decl);
            }

            if (ContinuesWithSensitive(AttributeNames.Standalone))
            {
                Advance(9);
                return DeclarationStandaloneAfterName(GetNext(), decl);
            }

            return DeclarationEnd(c, decl);
        }

        /// <summary>
        /// State after the "encoding" keyword; expects '='.
        /// </summary>
        private XmlToken DeclarationEncodingAfterName(char c, XmlDeclarationToken decl)
        {
            while (c.IsSpaceCharacter())
            {
                c = GetNext();
            }

            if (c == '=')
            {
                return DeclarationEncodingBeforeValue(GetNext(), decl);
            }

            throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
        }

        /// <summary>
        /// State after "encoding="; expects a quote followed by a letter.
        /// </summary>
        private XmlToken DeclarationEncodingBeforeValue(char c, XmlDeclarationToken decl)
        {
            while (c.IsSpaceCharacter())
            {
                c = GetNext();
            }

            if (c == '"' || c == '\'')
            {
                char quote = c;
                c = GetNext();

                if (c.IsLetter())
                {
                    return DeclarationEncodingValue(c, quote, decl);
                }
            }

            throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
        }

        /// <summary>
        /// Reads the encoding name (ASCII alphanumerics, '.', '_' and '-') up to
        /// the closing quote <paramref name="q"/>.
        /// </summary>
        private XmlToken DeclarationEncodingValue(char c, char q, XmlDeclarationToken decl)
        {
            do
            {
                bool isAllowed = c.IsAlphanumericAscii() || c == '.' || c == '_' || c == '-';

                if (!isAllowed)
                {
                    throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
                }

                _stringBuffer.Append(c);
                c = GetNext();
            }
            while (c != q);

            decl.Encoding = FlushBuffer();
            c = GetNext();

            return c.IsSpaceCharacter()
                ? DeclarationAfterEncoding(c, decl)
                : DeclarationEnd(c, decl);
        }

        /// <summary>
        /// State after the encoding value; "standalone" or the end of the
        /// declaration may follow.
        /// </summary>
        private XmlToken DeclarationAfterEncoding(char c, XmlDeclarationToken decl)
        {
            while (c.IsSpaceCharacter())
            {
                c = GetNext();
            }

            if (ContinuesWithSensitive(AttributeNames.Standalone))
            {
                Advance(9);
                return DeclarationStandaloneAfterName(GetNext(), decl);
            }

            return DeclarationEnd(c, decl);
        }

        /// <summary>
        /// State after the "standalone" keyword; expects '='.
        /// </summary>
        private XmlToken DeclarationStandaloneAfterName(char c, XmlDeclarationToken decl)
        {
            while (c.IsSpaceCharacter())
            {
                c = GetNext();
            }

            if (c == '=')
            {
                return DeclarationStandaloneBeforeValue(GetNext(), decl);
            }

            throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
        }

        /// <summary>
        /// State after "standalone="; expects the opening quote of the value.
        /// </summary>
        private XmlToken DeclarationStandaloneBeforeValue(char c, XmlDeclarationToken decl)
        {
            while (c.IsSpaceCharacter())
            {
                c = GetNext();
            }

            if (c == '"' || c == '\'')
            {
                return DeclarationStandaloneValue(GetNext(), c, decl);
            }

            throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
        }

        /// <summary>
        /// Reads the standalone value up to the closing quote <paramref name="q"/>;
        /// only the yes / no keywords are accepted.
        /// </summary>
        private XmlToken DeclarationStandaloneValue(char c, char q, XmlDeclarationToken decl)
        {
            while (c != q)
            {
                if (c == EndOfFile)
                {
                    throw XmlParseError.EOF.At(GetCurrentPosition());
                }

                _stringBuffer.Append(c);
                c = GetNext();
            }

            string value = FlushBuffer();

            if (value.Is(Keywords.Yes))
            {
                decl.Standalone = true;
            }
            else if (value.Is(Keywords.No))
            {
                decl.Standalone = false;
            }
            else
            {
                throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
            }

            return DeclarationEnd(GetNext(), decl);
        }

        /// <summary>
        /// Expects optional whitespace followed by the closing "?&gt;".
        /// </summary>
        private XmlDeclarationToken DeclarationEnd(char c, XmlDeclarationToken decl)
        {
            while (c.IsSpaceCharacter())
            {
                c = GetNext();
            }

            if (c == '?' && GetNext() == '>')
            {
                return decl;
            }

            throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
        }

        #endregion

        #region Doctype

        /// <summary>
        /// State after the doctype keyword; whitespace is mandatory.
        /// </summary>
        private XmlToken Doctype(char c)
        {
            if (c.IsSpaceCharacter())
            {
                return DoctypeNameBefore(GetNext());
            }

            throw XmlParseError.DoctypeInvalid.At(GetCurrentPosition());
        }

        /// <summary>
        /// Skips whitespace and expects the first character of the doctype name.
        /// </summary>
        private XmlToken DoctypeNameBefore(char c)
        {
            while (c.IsSpaceCharacter())
            {
                c = GetNext();
            }

            if (c.IsXmlNameStart())
            {
                _stringBuffer.Append(c);
                return DoctypeName(GetNext(), NewDoctype());
            }

            throw XmlParseError.DoctypeInvalid.At(GetCurrentPosition());
        }

        /// <summary>
        /// Reads the remainder of the doctype name.
        /// </summary>
        private XmlToken DoctypeName(char c, XmlDoctypeToken doctype)
        {
            while (c.IsXmlName())
            {
                _stringBuffer.Append(c);
                c = GetNext();
            }

            doctype.Name = FlushBuffer();

            if (c == '>')
            {
                return doctype;
            }

            if (c.IsSpaceCharacter())
            {
                return DoctypeNameAfter(GetNext(), doctype);
            }

            throw XmlParseError.DoctypeInvalid.At(GetCurrentPosition());
        }

        /// <summary>
        /// State after the doctype name: the end of the doctype, an external
        /// identifier keyword or a '[' may follow.
        /// </summary>
        private XmlToken DoctypeNameAfter(char c, XmlDoctypeToken doctype)
        {
            while (c.IsSpaceCharacter())
            {
                c = GetNext();
            }

            if (c == '>')
            {
                return doctype;
            }

            if (ContinuesWithSensitive(Keywords.Public))
            {
                Advance(5);
                return DoctypePublic(GetNext(), doctype);
            }

            if (ContinuesWithSensitive(Keywords.System))
            {
                Advance(5);
                return DoctypeSystem(GetNext(), doctype);
            }

            if (c == '[')
            {
                // NOTE(review): this skips one character after '[' and then
                // requires the doctype to end - internal subsets do not seem
                // to be tokenized here; kept exactly as found.
                Advance();
                return DoctypeAfter(GetNext(), doctype);
            }

            throw XmlParseError.DoctypeInvalid.At(GetCurrentPosition());
        }

        /// <summary>
        /// State after the public keyword; expects whitespace and a quoted
        /// public identifier.
        /// </summary>
        private XmlToken DoctypePublic(char c, XmlDoctypeToken doctype)
        {
            if (c.IsSpaceCharacter())
            {
                while (c.IsSpaceCharacter())
                {
                    c = GetNext();
                }

                if (c == '"' || c == '\'')
                {
                    doctype.PublicIdentifier = string.Empty;
                    return DoctypePublicIdentifierValue(GetNext(), c, doctype);
                }
            }

            throw XmlParseError.DoctypeInvalid.At(GetCurrentPosition());
        }

        /// <summary>
        /// Reads the public identifier up to the closing quote
        /// <paramref name="q"/>; only PubidChar characters are accepted.
        /// </summary>
        private XmlToken DoctypePublicIdentifierValue(char c, char q, XmlDoctypeToken doctype)
        {
            while (c != q)
            {
                if (!c.IsPubidChar())
                {
                    throw XmlParseError.XmlInvalidPubId.At(GetCurrentPosition());
                }

                _stringBuffer.Append(c);
                c = GetNext();
            }

            doctype.PublicIdentifier = FlushBuffer();
            return DoctypePublicIdentifierAfter(GetNext(), doctype);
        }

        /// <summary>
        /// State after the public identifier.
        /// </summary>
        private XmlToken DoctypePublicIdentifierAfter(char c, XmlDoctypeToken doctype)
        {
            if (c == '>')
            {
                return doctype;
            }

            if (c.IsSpaceCharacter())
            {
                return DoctypeBetween(GetNext(), doctype);
            }

            throw XmlParseError.DoctypeInvalid.At(GetCurrentPosition());
        }

        /// <summary>
        /// State between the public and the system identifier.
        /// </summary>
        private XmlToken DoctypeBetween(char c, XmlDoctypeToken doctype)
        {
            while (c.IsSpaceCharacter())
            {
                c = GetNext();
            }

            switch (c)
            {
                case '>':
                    return doctype;

                case '"':
                case '\'':
                    doctype.SystemIdentifier = string.Empty;
                    return DoctypeSystemIdentifierValue(GetNext(), c, doctype);

                default:
                    throw XmlParseError.DoctypeInvalid.At(GetCurrentPosition());
            }
        }

        /// <summary>
        /// State after the system keyword; expects whitespace and a quoted
        /// system identifier.
        /// </summary>
        private XmlToken DoctypeSystem(char c, XmlDoctypeToken doctype)
        {
            if (c.IsSpaceCharacter())
            {
                while (c.IsSpaceCharacter())
                {
                    c = GetNext();
                }

                if (c == '"' || c == '\'')
                {
                    doctype.SystemIdentifier = string.Empty;
                    return DoctypeSystemIdentifierValue(GetNext(), c, doctype);
                }
            }

            throw XmlParseError.DoctypeInvalid.At(GetCurrentPosition());
        }

        /// <summary>
        /// Reads the system identifier up to the closing quote <paramref name="q"/>.
        /// </summary>
        private XmlToken DoctypeSystemIdentifierValue(char c, char q, XmlDoctypeToken doctype)
        {
            while (c != q)
            {
                if (c == EndOfFile)
                {
                    throw XmlParseError.EOF.At(GetCurrentPosition());
                }

                _stringBuffer.Append(c);
                c = GetNext();
            }

            doctype.SystemIdentifier = FlushBuffer();
            return DoctypeSystemIdentifierAfter(GetNext(), doctype);
        }

        /// <summary>
        /// State after the system identifier.
        /// </summary>
        private XmlToken DoctypeSystemIdentifierAfter(char c, XmlDoctypeToken doctype)
        {
            while (c.IsSpaceCharacter())
            {
                c = GetNext();
            }

            if (c == '[')
            {
                Advance();
                c = GetNext();
            }

            return DoctypeAfter(c, doctype);
        }

        /// <summary>
        /// Final doctype state: optional whitespace, then the closing '&gt;'.
        /// </summary>
        private XmlToken DoctypeAfter(char c, XmlDoctypeToken doctype)
        {
            while (c.IsSpaceCharacter())
            {
                c = GetNext();
            }

            if (c == '>')
            {
                return doctype;
            }

            throw XmlParseError.DoctypeInvalid.At(GetCurrentPosition());
        }

        #endregion

        #region Attributes

        /// <summary>
        /// State inside a start tag before an attribute name.
        /// </summary>
        private XmlToken AttributeBeforeName(char c, XmlTagToken tag)
        {
            while (c.IsSpaceCharacter())
            {
                c = GetNext();
            }

            switch (c)
            {
                case '/':
                    return TagSelfClosing(GetNext(), tag);

                case '>':
                    return tag;

                case EndOfFile:
                    throw XmlParseError.EOF.At(GetCurrentPosition());

                default:
                    if (c.IsXmlNameStart())
                    {
                        _stringBuffer.Append(c);
                        return AttributeName(GetNext(), tag);
                    }

                    throw XmlParseError.XmlInvalidAttribute.At(GetCurrentPosition());
            }
        }

        /// <summary>
        /// Reads the remainder of an attribute name, registers the attribute on
        /// the tag and expects '='.
        /// </summary>
        private XmlToken AttributeName(char c, XmlTagToken tag)
        {
            while (c.IsXmlName())
            {
                _stringBuffer.Append(c);
                c = GetNext();
            }

            string name = FlushBuffer();

            // NOTE(review): presumably GetAttribute returns the stored value,
            // in which case a repeated attribute whose earlier value is empty
            // would not be reported here - verify against XmlTagToken.
            if (!string.IsNullOrEmpty(tag.GetAttribute(name)))
            {
                throw XmlParseError.XmlUniqueAttribute.At(GetCurrentPosition());
            }

            tag.AddAttribute(name);

            while (c.IsSpaceCharacter())
            {
                c = GetNext();
            }

            if (c == '=')
            {
                return AttributeBeforeValue(GetNext(), tag);
            }

            throw XmlParseError.XmlInvalidAttribute.At(GetCurrentPosition());
        }

        /// <summary>
        /// State after '='; expects the opening quote of the attribute value.
        /// </summary>
        private XmlToken AttributeBeforeValue(char c, XmlTagToken tag)
        {
            while (c.IsSpaceCharacter())
            {
                c = GetNext();
            }

            if (c == '"' || c == '\'')
            {
                return AttributeValue(GetNext(), c, tag);
            }

            throw XmlParseError.XmlInvalidAttribute.At(GetCurrentPosition());
        }

        /// <summary>
        /// Reads an attribute value up to the closing quote <paramref name="q"/>,
        /// resolving character references; a literal '&lt;' is rejected.
        /// </summary>
        private XmlToken AttributeValue(char c, char q, XmlTagToken tag)
        {
            while (c != q)
            {
                switch (c)
                {
                    case EndOfFile:
                        throw XmlParseError.EOF.At(GetCurrentPosition());

                    case '&':
                        _stringBuffer.Append(CharacterReference(GetNext()));
                        break;

                    case '<':
                        throw XmlParseError.XmlLtInAttributeValue.At(GetCurrentPosition());

                    default:
                        _stringBuffer.Append(c);
                        break;
                }

                c = GetNext();
            }

            tag.SetAttributeValue(FlushBuffer());
            return AttributeAfterValue(GetNext(), tag);
        }

        /// <summary>
        /// State after the closing quote of an attribute value.
        /// </summary>
        private XmlToken AttributeAfterValue(char c, XmlTagToken tag)
        {
            if (c.IsSpaceCharacter())
            {
                return AttributeBeforeName(GetNext(), tag);
            }

            switch (c)
            {
                case '/':
                    return TagSelfClosing(GetNext(), tag);
                case '>':
                    return tag;
                default:
                    throw XmlParseError.XmlInvalidAttribute.At(GetCurrentPosition());
            }
        }

        #endregion

        #region Processing Instruction

        /// <summary>
        /// State after "&lt;?" when no XML declaration follows; expects the
        /// first character of the target name.
        /// </summary>
        private XmlToken ProcessingStart(char c)
        {
            if (c.IsXmlNameStart())
            {
                _stringBuffer.Append(c);
                return ProcessingTarget(GetNext(), NewProcessing());
            }

            throw XmlParseError.XmlInvalidPI.At(GetCurrentPosition());
        }

        /// <summary>
        /// Reads the remainder of the processing instruction target. A target
        /// equal to "xml" (ignoring case) is rejected.
        /// </summary>
        private XmlToken ProcessingTarget(char c, XmlPIToken pi)
        {
            while (c.IsXmlName())
            {
                _stringBuffer.Append(c);
                c = GetNext();
            }

            pi.Target = FlushBuffer();

            if (pi.Target.Isi(TagNames.Xml))
            {
                throw XmlParseError.XmlInvalidPI.At(GetCurrentPosition());
            }

            if (c == '?')
            {
                c = GetNext();

                if (c == '>')
                {
                    return pi;
                }
            }
            else if (c.IsSpaceCharacter())
            {
                return ProcessingContent(GetNext(), pi);
            }

            throw XmlParseError.XmlInvalidPI.At(GetCurrentPosition());
        }

        /// <summary>
        /// Reads the content of a processing instruction up to the closing "?&gt;".
        /// </summary>
        private XmlToken ProcessingContent(char c, XmlPIToken pi)
        {
            while (true)
            {
                switch (c)
                {
                    case '?':
                        c = GetNext();

                        if (c == '>')
                        {
                            pi.Content = FlushBuffer();
                            return pi;
                        }

                        // A lone '?' belongs to the content; the character
                        // read after it is examined in the next iteration.
                        _stringBuffer.Append('?');
                        break;

                    case EndOfFile:
                        throw XmlParseError.EOF.At(GetCurrentPosition());

                    default:
                        _stringBuffer.Append(c);
                        c = GetNext();
                        break;
                }
            }
        }

        #endregion

        #region Comments

        /// <summary>
        /// State after "&lt;!--".
        /// </summary>
        private XmlToken CommentStart(char c)
        {
            return Comment(c);
        }

        /// <summary>
        /// Collects the comment text. A single '-' is ordinary content, while
        /// "--" must be followed directly by '&gt;' to terminate the comment.
        /// </summary>
        private XmlToken Comment(char c)
        {
            while (c.IsXmlChar())
            {
                if (c == '-')
                {
                    c = GetNext();

                    if (c == '-')
                    {
                        return CommentEnd(GetNext());
                    }

                    // Keep the lone dash; the character after it is handled
                    // by the next loop iteration (no recursion, so the stack
                    // depth does not grow with the number of dashes).
                    _stringBuffer.Append('-');
                    continue;
                }

                _stringBuffer.Append(c);
                c = GetNext();
            }

            throw XmlParseError.XmlInvalidComment.At(GetCurrentPosition());
        }

        /// <summary>
        /// State after "--" inside a comment; only '&gt;' is allowed.
        /// </summary>
        private XmlToken CommentEnd(char c)
        {
            if (c == '>')
            {
                return NewComment();
            }

            throw XmlParseError.XmlInvalidComment.At(GetCurrentPosition());
        }

        #endregion

        #region Token factories

        /// <summary>Creates an end-of-file token at the current position.</summary>
        private XmlEndOfFileToken NewEof()
        {
            return new XmlEndOfFileToken(GetCurrentPosition());
        }

        /// <summary>Creates a character token from the flushed buffer.</summary>
        private XmlCharacterToken NewCharacters()
        {
            string content = FlushBuffer();
            return new XmlCharacterToken(_position, content);
        }

        /// <summary>Creates a comment token from the flushed buffer.</summary>
        private XmlCommentToken NewComment()
        {
            string content = FlushBuffer();
            return new XmlCommentToken(_position, content);
        }

        /// <summary>Creates an empty processing instruction token.</summary>
        private XmlPIToken NewProcessing()
        {
            return new XmlPIToken(_position);
        }

        /// <summary>Creates an empty doctype token.</summary>
        private XmlDoctypeToken NewDoctype()
        {
            return new XmlDoctypeToken(_position);
        }

        /// <summary>Creates an empty XML declaration token.</summary>
        private XmlDeclarationToken NewDeclaration()
        {
            return new XmlDeclarationToken(_position);
        }

        /// <summary>Creates an empty start tag token.</summary>
        private XmlTagToken NewOpenTag()
        {
            return new XmlTagToken(XmlTokenType.StartTag, _position);
        }

        /// <summary>Creates an empty end tag token.</summary>
        private XmlTagToken NewCloseTag()
        {
            return new XmlTagToken(XmlTokenType.EndTag, _position);
        }

        /// <summary>Creates a CDATA token from the flushed buffer.</summary>
        private XmlCDataToken NewCharacterData()
        {
            string content = FlushBuffer();
            return new XmlCDataToken(_position, content);
        }

        #endregion
    }
}