AngleSharp by Florian Rappl

<PackageReference Include="AngleSharp" Version="0.8.4" />

 XmlTokenizer

sealed class XmlTokenizer : BaseTokenizer
Performs the tokenization of the source code. Most of the information is taken from http://www.w3.org/TR/REC-xml/.
using AngleSharp.Events;
using AngleSharp.Extensions;
using AngleSharp.Html;
using System;
using System.Diagnostics;
using System.Text;

namespace AngleSharp.Parser.Xml
{
    /// <summary>
    /// Performs the tokenization of XML source code. Each call to
    /// <see cref="Get"/> reads characters from the underlying source and
    /// produces one token (character, tag, comment, CDATA section, entity,
    /// processing instruction, XML declaration, doctype or end-of-file).
    /// Malformed input is reported by throwing the error created through
    /// <c>XmlParseError.*.At(position)</c>.
    /// Most of the grammar follows http://www.w3.org/TR/REC-xml/.
    /// </summary>
    [DebuggerStepThrough]
    internal sealed class XmlTokenizer : BaseTokenizer
    {
        /// <summary>
        /// Sentinel character that marks the end of the input.
        /// NOTE(review): char.MaxValue (U+FFFF) is assumed to be the value
        /// the base tokenizer returns from GetNext() once the source is
        /// exhausted - confirm against the library's end-of-file symbol.
        /// </summary>
        private const char EndOfFile = char.MaxValue;

        private const string CDataOpening = "[CDATA[";
        private const string PublicIdentifier = "PUBLIC";
        private const string SystemIdentifier = "SYSTEM";
        private const string YesIdentifier = "yes";
        private const string NoIdentifier = "no";

        /// <summary>
        /// Position captured when the current token started; every token
        /// except the end-of-file token is created with it.
        /// </summary>
        private TextPosition _position;

        /// <summary>
        /// Creates a new tokenizer for the given source.
        /// </summary>
        /// <param name="source">The text source to read from.</param>
        /// <param name="events">The event aggregator handed to the base tokenizer.</param>
        public XmlTokenizer(TextSource source, IEventAggregator events)
            : base(source, events)
        {
        }

        /// <summary>
        /// Reads the next token from the source.
        /// </summary>
        /// <returns>The next token; an end-of-file token once the input is exhausted.</returns>
        public XmlToken Get()
        {
            char current = GetNext();

            if (current == EndOfFile)
            {
                return NewEof();
            }

            return Data(current);
        }

        #region General

        /// <summary>
        /// Entry state: records the token start position and dispatches on
        /// the first character of the token.
        /// </summary>
        private XmlToken Data(char c)
        {
            _position = GetCurrentPosition();

            switch (c)
            {
                case '&':
                    return CharacterReference(GetNext());
                case '<':
                    return TagOpen(GetNext());
                case EndOfFile:
                    return NewEof();
                case ']':
                    return CheckCharacter(GetNext());
                default:
                    return NewCharacter(c);
            }
        }

        /// <summary>
        /// Called after a ']' in character data. Rejects the sequence "]]&gt;"
        /// and otherwise rewinds so that only the first ']' is consumed.
        /// </summary>
        private XmlToken CheckCharacter(char ch)
        {
            if (ch == ']')
            {
                if (GetNext() == '>')
                {
                    throw XmlParseError.XmlInvalidCharData.At(GetCurrentPosition());
                }

                // Undo the look-ahead past the second ']'.
                Back();
            }

            // Undo the look-ahead past the first ']'.
            Back();
            return NewCharacter(']');
        }

        #endregion

        #region CDATA

        /// <summary>
        /// Collects the content of a CDATA section up to the closing "]]&gt;".
        /// </summary>
        private XmlCDataToken CData(char c)
        {
            _stringBuffer.Clear();

            while (true)
            {
                switch (c)
                {
                    case EndOfFile:
                        throw XmlParseError.EOF.At(GetCurrentPosition());
                    case ']':
                        if (ContinuesWith("]]>", true))
                        {
                            // Skip the remaining two characters of "]]>".
                            Advance(2);
                            return NewCData(_stringBuffer.ToString());
                        }

                        break;
                }

                _stringBuffer.Append(c);
                c = GetNext();
            }
        }

        #endregion

        #region Entities

        /// <summary>
        /// Reads a character or entity reference; <paramref name="c"/> is the
        /// character following the '&amp;'. Numeric references ("#" decimal,
        /// "#x"/"#X" hexadecimal) and named references must end with ';'.
        /// </summary>
        private XmlEntityToken CharacterReference(char c)
        {
            // A pooled builder is used instead of _stringBuffer because this
            // method is also invoked while _stringBuffer holds an attribute value.
            StringBuilder buffer = Pool.NewStringBuilder();

            if (c == '#')
            {
                c = GetNext();
                bool isHex = c == 'x' || c == 'X';

                if (isHex)
                {
                    c = GetNext();

                    while (c.IsHex())
                    {
                        buffer.Append(c);
                        c = GetNext();
                    }
                }
                else
                {
                    while (c.IsDigit())
                    {
                        buffer.Append(c);
                        c = GetNext();
                    }
                }

                if (buffer.Length > 0 && c == ';')
                {
                    return NewEntity(buffer.ToPool(), true, isHex);
                }
            }
            else if (c.IsXmlNameStart())
            {
                do
                {
                    buffer.Append(c);
                    c = GetNext();
                }
                while (c.IsXmlName());

                if (c == ';')
                {
                    return NewEntity(buffer.ToPool(), false, false);
                }
            }

            // Hand the builder back before failing; the produced string is not needed.
            buffer.ToPool();
            throw XmlParseError.CharacterReferenceNotTerminated.At(GetCurrentPosition());
        }

        #endregion

        #region Tags

        /// <summary>
        /// Dispatches on the character following '&lt;'.
        /// </summary>
        private XmlToken TagOpen(char c)
        {
            switch (c)
            {
                case '!':
                    return MarkupDeclaration(GetNext());

                case '?':
                    c = GetNext();

                    if (ContinuesWith(Tags.Xml, false))
                    {
                        Advance(2);
                        return DeclarationStart(GetNext());
                    }

                    return ProcessingStart(c);

                case '/':
                    return TagEnd(GetNext());

                default:
                    if (c.IsXmlNameStart())
                    {
                        _stringBuffer.Clear();
                        _stringBuffer.Append(c);
                        return TagName(GetNext(), NewOpenTag());
                    }

                    throw XmlParseError.XmlInvalidStartTag.At(GetCurrentPosition());
            }
        }

        /// <summary>
        /// Reads an end tag: a name, optional whitespace, then '&gt;'.
        /// </summary>
        private XmlToken TagEnd(char c)
        {
            if (c.IsXmlNameStart())
            {
                _stringBuffer.Clear();

                do
                {
                    _stringBuffer.Append(c);
                    c = GetNext();
                }
                while (c.IsXmlName());

                while (c.IsSpaceCharacter())
                {
                    c = GetNext();
                }

                if (c == '>')
                {
                    XmlTagToken tag = NewCloseTag();
                    tag.Name = _stringBuffer.ToString();
                    return tag;
                }
            }

            if (c == EndOfFile)
            {
                throw XmlParseError.EOF.At(GetCurrentPosition());
            }

            throw XmlParseError.XmlInvalidEndTag.At(GetCurrentPosition());
        }

        /// <summary>
        /// Reads the remainder of a start tag name and continues with the
        /// attributes, the self-closing marker or the end of the tag.
        /// </summary>
        private XmlToken TagName(char c, XmlTagToken tag)
        {
            while (c.IsXmlName())
            {
                _stringBuffer.Append(c);
                c = GetNext();
            }

            tag.Name = _stringBuffer.ToString();

            if (c == EndOfFile)
            {
                throw XmlParseError.EOF.At(GetCurrentPosition());
            }

            if (c == '>')
            {
                return tag;
            }

            if (c.IsSpaceCharacter())
            {
                return AttributeBeforeName(GetNext(), tag);
            }

            if (c == '/')
            {
                return TagSelfClosing(GetNext(), tag);
            }

            throw XmlParseError.XmlInvalidName.At(GetCurrentPosition());
        }

        /// <summary>
        /// Handles the character after '/' inside a start tag; only '&gt;' is valid.
        /// </summary>
        private XmlToken TagSelfClosing(char c, XmlTagToken tag)
        {
            tag.IsSelfClosing = true;

            switch (c)
            {
                case '>':
                    return tag;
                case EndOfFile:
                    throw XmlParseError.EOF.At(GetCurrentPosition());
                default:
                    throw XmlParseError.XmlInvalidName.At(GetCurrentPosition());
            }
        }

        /// <summary>
        /// Handles "&lt;!": a comment, a doctype or a CDATA section.
        /// In each branch the cursor is advanced to the last character of
        /// the matched keyword before the next character is fetched.
        /// </summary>
        private XmlToken MarkupDeclaration(char c)
        {
            if (ContinuesWith("--", true))
            {
                Advance();
                return CommentStart(GetNext());
            }

            if (ContinuesWith(Tags.Doctype, false))
            {
                Advance(6);
                return Doctype(GetNext());
            }

            if (ContinuesWith(CDataOpening, false))
            {
                Advance(6);
                return CData(GetNext());
            }

            throw XmlParseError.UndefinedMarkupDeclaration.At(GetCurrentPosition());
        }

        #endregion

        #region XML Declaration

        /// <summary>
        /// Called after "&lt;?xml". If no whitespace follows, the input is
        /// treated as a processing instruction whose target starts with
        /// "xml"; otherwise the mandatory version attribute is expected.
        /// </summary>
        private XmlToken DeclarationStart(char c)
        {
            if (!c.IsSpaceCharacter())
            {
                _stringBuffer.Clear();
                _stringBuffer.Append(Tags.Xml);
                return ProcessingTarget(c, NewProcessing());
            }

            do
            {
                c = GetNext();
            }
            while (c.IsSpaceCharacter());

            if (ContinuesWith(AttributeNames.Version, false))
            {
                Advance(6);
                return DeclarationVersionAfterName(GetNext(), NewDeclaration());
            }

            throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
        }

        /// <summary>
        /// Expects optional whitespace followed by '=' after "version".
        /// </summary>
        private XmlToken DeclarationVersionAfterName(char c, XmlDeclarationToken decl)
        {
            while (c.IsSpaceCharacter())
            {
                c = GetNext();
            }

            if (c == '=')
            {
                return DeclarationVersionBeforeValue(GetNext(), decl);
            }

            throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
        }

        /// <summary>
        /// Expects optional whitespace followed by the opening quote of the version value.
        /// </summary>
        private XmlToken DeclarationVersionBeforeValue(char c, XmlDeclarationToken decl)
        {
            while (c.IsSpaceCharacter())
            {
                c = GetNext();
            }

            if (c == '"' || c == '\'')
            {
                _stringBuffer.Clear();
                return DeclarationVersionValue(GetNext(), c, decl);
            }

            throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
        }

        /// <summary>
        /// Reads the version value up to the closing quote <paramref name="q"/>.
        /// </summary>
        private XmlToken DeclarationVersionValue(char c, char q, XmlDeclarationToken decl)
        {
            while (c != q)
            {
                if (c == EndOfFile)
                {
                    throw XmlParseError.EOF.At(GetCurrentPosition());
                }

                _stringBuffer.Append(c);
                c = GetNext();
            }

            decl.Version = _stringBuffer.ToString();
            c = GetNext();

            if (c.IsSpaceCharacter())
            {
                return DeclarationAfterVersion(c, decl);
            }

            return DeclarationEnd(c, decl);
        }

        /// <summary>
        /// After the version: optionally "encoding" or "standalone", else the end of the declaration.
        /// </summary>
        private XmlToken DeclarationAfterVersion(char c, XmlDeclarationToken decl)
        {
            while (c.IsSpaceCharacter())
            {
                c = GetNext();
            }

            if (ContinuesWith(AttributeNames.Encoding, false))
            {
                Advance(7);
                return DeclarationEncodingAfterName(GetNext(), decl);
            }

            if (ContinuesWith(AttributeNames.Standalone, false))
            {
                Advance(9);
                return DeclarationStandaloneAfterName(GetNext(), decl);
            }

            return DeclarationEnd(c, decl);
        }

        /// <summary>
        /// Expects optional whitespace followed by '=' after "encoding".
        /// </summary>
        private XmlToken DeclarationEncodingAfterName(char c, XmlDeclarationToken decl)
        {
            while (c.IsSpaceCharacter())
            {
                c = GetNext();
            }

            if (c == '=')
            {
                return DeclarationEncodingBeforeValue(GetNext(), decl);
            }

            throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
        }

        /// <summary>
        /// Expects the opening quote of the encoding value; the value must start with a letter.
        /// </summary>
        private XmlToken DeclarationEncodingBeforeValue(char c, XmlDeclarationToken decl)
        {
            while (c.IsSpaceCharacter())
            {
                c = GetNext();
            }

            if (c == '"' || c == '\'')
            {
                char quote = c;
                _stringBuffer.Clear();
                c = GetNext();

                if (c.IsLetter())
                {
                    return DeclarationEncodingValue(c, quote, decl);
                }
            }

            throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
        }

        /// <summary>
        /// Reads the encoding name (ASCII alphanumerics, '.', '_' and '-')
        /// up to the closing quote <paramref name="q"/>. Any other character,
        /// including the end of the input, makes the declaration invalid.
        /// </summary>
        private XmlToken DeclarationEncodingValue(char c, char q, XmlDeclarationToken decl)
        {
            do
            {
                bool allowed = c.IsAlphanumericAscii() || c == '.' || c == '_' || c == '-';

                if (!allowed)
                {
                    throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
                }

                _stringBuffer.Append(c);
                c = GetNext();
            }
            while (c != q);

            decl.Encoding = _stringBuffer.ToString();
            c = GetNext();

            if (c.IsSpaceCharacter())
            {
                return DeclarationAfterEncoding(c, decl);
            }

            return DeclarationEnd(c, decl);
        }

        /// <summary>
        /// After the encoding: optionally "standalone", else the end of the declaration.
        /// </summary>
        private XmlToken DeclarationAfterEncoding(char c, XmlDeclarationToken decl)
        {
            while (c.IsSpaceCharacter())
            {
                c = GetNext();
            }

            if (ContinuesWith(AttributeNames.Standalone, false))
            {
                Advance(9);
                return DeclarationStandaloneAfterName(GetNext(), decl);
            }

            return DeclarationEnd(c, decl);
        }

        /// <summary>
        /// Expects optional whitespace followed by '=' after "standalone".
        /// </summary>
        private XmlToken DeclarationStandaloneAfterName(char c, XmlDeclarationToken decl)
        {
            while (c.IsSpaceCharacter())
            {
                c = GetNext();
            }

            if (c == '=')
            {
                return DeclarationStandaloneBeforeValue(GetNext(), decl);
            }

            throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
        }

        /// <summary>
        /// Expects optional whitespace followed by the opening quote of the standalone value.
        /// </summary>
        private XmlToken DeclarationStandaloneBeforeValue(char c, XmlDeclarationToken decl)
        {
            while (c.IsSpaceCharacter())
            {
                c = GetNext();
            }

            if (c == '"' || c == '\'')
            {
                _stringBuffer.Clear();
                return DeclarationStandaloneValue(GetNext(), c, decl);
            }

            throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
        }

        /// <summary>
        /// Reads the standalone value, which must be exactly "yes" or "no".
        /// </summary>
        private XmlToken DeclarationStandaloneValue(char c, char q, XmlDeclarationToken decl)
        {
            while (c != q)
            {
                if (c == EndOfFile)
                {
                    throw XmlParseError.EOF.At(GetCurrentPosition());
                }

                _stringBuffer.Append(c);
                c = GetNext();
            }

            string value = _stringBuffer.ToString();

            if (value.Equals(YesIdentifier))
            {
                decl.Standalone = true;
            }
            else if (value.Equals(NoIdentifier))
            {
                decl.Standalone = false;
            }
            else
            {
                throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
            }

            return DeclarationEnd(GetNext(), decl);
        }

        /// <summary>
        /// Expects optional whitespace followed by "?&gt;".
        /// </summary>
        private XmlDeclarationToken DeclarationEnd(char c, XmlDeclarationToken decl)
        {
            while (c.IsSpaceCharacter())
            {
                c = GetNext();
            }

            if (c != '?' || GetNext() != '>')
            {
                throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
            }

            return decl;
        }

        #endregion

        #region Doctype

        /// <summary>
        /// Called after "&lt;!DOCTYPE"; at least one whitespace character must follow.
        /// </summary>
        private XmlToken Doctype(char c)
        {
            if (c.IsSpaceCharacter())
            {
                return DoctypeNameBefore(GetNext());
            }

            throw XmlParseError.DoctypeInvalid.At(GetCurrentPosition());
        }

        /// <summary>
        /// Skips whitespace and expects the first character of the doctype name.
        /// </summary>
        private XmlToken DoctypeNameBefore(char c)
        {
            while (c.IsSpaceCharacter())
            {
                c = GetNext();
            }

            if (c.IsXmlNameStart())
            {
                _stringBuffer.Clear();
                _stringBuffer.Append(c);
                return DoctypeName(GetNext(), NewDoctype());
            }

            throw XmlParseError.DoctypeInvalid.At(GetCurrentPosition());
        }

        /// <summary>
        /// Reads the remainder of the doctype name. The buffer is cleared
        /// afterwards because the identifier states append to it directly.
        /// </summary>
        private XmlToken DoctypeName(char c, XmlDoctypeToken doctype)
        {
            while (c.IsXmlName())
            {
                _stringBuffer.Append(c);
                c = GetNext();
            }

            doctype.Name = _stringBuffer.ToString();
            _stringBuffer.Clear();

            if (c == '>')
            {
                return doctype;
            }

            if (c.IsSpaceCharacter())
            {
                return DoctypeNameAfter(GetNext(), doctype);
            }

            throw XmlParseError.DoctypeInvalid.At(GetCurrentPosition());
        }

        /// <summary>
        /// After the doctype name: end of doctype, PUBLIC, SYSTEM or '['.
        /// </summary>
        private XmlToken DoctypeNameAfter(char c, XmlDoctypeToken doctype)
        {
            while (c.IsSpaceCharacter())
            {
                c = GetNext();
            }

            if (c == '>')
            {
                return doctype;
            }

            if (ContinuesWith(PublicIdentifier, false))
            {
                Advance(5);
                return DoctypePublic(GetNext(), doctype);
            }

            if (ContinuesWith(SystemIdentifier, false))
            {
                Advance(5);
                return DoctypeSystem(GetNext(), doctype);
            }

            if (c == '[')
            {
                // NOTE(review): the internal subset is not parsed here; the
                // cursor is merely moved forward before expecting '>'.
                Advance();
                return DoctypeAfter(GetNext(), doctype);
            }

            throw XmlParseError.DoctypeInvalid.At(GetCurrentPosition());
        }

        /// <summary>
        /// After "PUBLIC": requires whitespace and then a quoted public identifier.
        /// </summary>
        private XmlToken DoctypePublic(char c, XmlDoctypeToken doctype)
        {
            if (c.IsSpaceCharacter())
            {
                while (c.IsSpaceCharacter())
                {
                    c = GetNext();
                }

                if (c == '"' || c == '\'')
                {
                    doctype.PublicIdentifier = string.Empty;
                    return DoctypePublicIdentifierValue(GetNext(), c, doctype);
                }
            }

            throw XmlParseError.DoctypeInvalid.At(GetCurrentPosition());
        }

        /// <summary>
        /// Reads the public identifier up to the closing quote
        /// <paramref name="q"/>; only PubidChar characters are accepted.
        /// </summary>
        private XmlToken DoctypePublicIdentifierValue(char c, char q, XmlDoctypeToken doctype)
        {
            while (c != q)
            {
                if (!c.IsPubidChar())
                {
                    throw XmlParseError.XmlInvalidPubId.At(GetCurrentPosition());
                }

                _stringBuffer.Append(c);
                c = GetNext();
            }

            doctype.PublicIdentifier = _stringBuffer.ToString();
            _stringBuffer.Clear();
            return DoctypePublicIdentifierAfter(GetNext(), doctype);
        }

        /// <summary>
        /// After the public identifier: end of doctype or whitespace before the system identifier.
        /// </summary>
        private XmlToken DoctypePublicIdentifierAfter(char c, XmlDoctypeToken doctype)
        {
            if (c == '>')
            {
                return doctype;
            }

            if (c.IsSpaceCharacter())
            {
                return DoctypeBetween(GetNext(), doctype);
            }

            throw XmlParseError.DoctypeInvalid.At(GetCurrentPosition());
        }

        /// <summary>
        /// Between public and system identifier: end of doctype or a quoted system identifier.
        /// </summary>
        private XmlToken DoctypeBetween(char c, XmlDoctypeToken doctype)
        {
            while (c.IsSpaceCharacter())
            {
                c = GetNext();
            }

            switch (c)
            {
                case '>':
                    return doctype;

                case '"':
                case '\'':
                    doctype.SystemIdentifier = string.Empty;
                    return DoctypeSystemIdentifierValue(GetNext(), c, doctype);

                default:
                    throw XmlParseError.DoctypeInvalid.At(GetCurrentPosition());
            }
        }

        /// <summary>
        /// After "SYSTEM": requires whitespace and then a quoted system identifier.
        /// </summary>
        private XmlToken DoctypeSystem(char c, XmlDoctypeToken doctype)
        {
            if (c.IsSpaceCharacter())
            {
                while (c.IsSpaceCharacter())
                {
                    c = GetNext();
                }

                if (c == '"' || c == '\'')
                {
                    doctype.SystemIdentifier = string.Empty;
                    return DoctypeSystemIdentifierValue(GetNext(), c, doctype);
                }
            }

            throw XmlParseError.DoctypeInvalid.At(GetCurrentPosition());
        }

        /// <summary>
        /// Reads the system identifier up to the closing quote <paramref name="q"/>.
        /// </summary>
        private XmlToken DoctypeSystemIdentifierValue(char c, char q, XmlDoctypeToken doctype)
        {
            while (c != q)
            {
                if (c == EndOfFile)
                {
                    throw XmlParseError.EOF.At(GetCurrentPosition());
                }

                _stringBuffer.Append(c);
                c = GetNext();
            }

            doctype.SystemIdentifier = _stringBuffer.ToString();
            _stringBuffer.Clear();
            return DoctypeSystemIdentifierAfter(GetNext(), doctype);
        }

        /// <summary>
        /// After the system identifier: optional whitespace, optional '[', then the end of the doctype.
        /// </summary>
        private XmlToken DoctypeSystemIdentifierAfter(char c, XmlDoctypeToken doctype)
        {
            while (c.IsSpaceCharacter())
            {
                c = GetNext();
            }

            if (c == '[')
            {
                Advance();
                c = GetNext();
            }

            return DoctypeAfter(c, doctype);
        }

        /// <summary>
        /// Expects optional whitespace followed by the closing '&gt;' of the doctype.
        /// </summary>
        private XmlToken DoctypeAfter(char c, XmlDoctypeToken doctype)
        {
            while (c.IsSpaceCharacter())
            {
                c = GetNext();
            }

            if (c == '>')
            {
                return doctype;
            }

            throw XmlParseError.DoctypeInvalid.At(GetCurrentPosition());
        }

        #endregion

        #region Attributes

        /// <summary>
        /// Inside a start tag, before an attribute name: skips whitespace and
        /// handles '/', '&gt;' or the first character of an attribute name.
        /// </summary>
        private XmlToken AttributeBeforeName(char c, XmlTagToken tag)
        {
            while (c.IsSpaceCharacter())
            {
                c = GetNext();
            }

            switch (c)
            {
                case '/':
                    return TagSelfClosing(GetNext(), tag);
                case '>':
                    return tag;
                case EndOfFile:
                    throw XmlParseError.EOF.At(GetCurrentPosition());
            }

            if (c.IsXmlNameStart())
            {
                _stringBuffer.Clear();
                _stringBuffer.Append(c);
                return AttributeName(GetNext(), tag);
            }

            throw XmlParseError.XmlInvalidAttribute.At(GetCurrentPosition());
        }

        /// <summary>
        /// Reads the remainder of an attribute name, registers the attribute
        /// on the tag and expects '=' (optionally preceded by whitespace).
        /// </summary>
        private XmlToken AttributeName(char c, XmlTagToken tag)
        {
            while (c.IsXmlName())
            {
                _stringBuffer.Append(c);
                c = GetNext();
            }

            string name = _stringBuffer.ToString();

            // NOTE(review): the duplicate check relies on the existing value
            // being non-empty; an earlier attribute with an empty value is
            // presumably not detected - confirm against XmlTagToken.GetAttribute.
            if (!string.IsNullOrEmpty(tag.GetAttribute(name)))
            {
                throw XmlParseError.XmlUniqueAttribute.At(GetCurrentPosition());
            }

            tag.AddAttribute(name);

            while (c.IsSpaceCharacter())
            {
                c = GetNext();
            }

            if (c == '=')
            {
                return AttributeBeforeValue(GetNext(), tag);
            }

            throw XmlParseError.XmlInvalidAttribute.At(GetCurrentPosition());
        }

        /// <summary>
        /// Expects optional whitespace followed by the opening quote of the attribute value.
        /// </summary>
        private XmlToken AttributeBeforeValue(char c, XmlTagToken tag)
        {
            while (c.IsSpaceCharacter())
            {
                c = GetNext();
            }

            if (c == '"' || c == '\'')
            {
                _stringBuffer.Clear();
                return AttributeValue(GetNext(), c, tag);
            }

            throw XmlParseError.XmlInvalidAttribute.At(GetCurrentPosition());
        }

        /// <summary>
        /// Reads an attribute value up to the closing quote
        /// <paramref name="q"/>, resolving references and rejecting '&lt;'.
        /// </summary>
        private XmlToken AttributeValue(char c, char q, XmlTagToken tag)
        {
            while (c != q)
            {
                switch (c)
                {
                    case EndOfFile:
                        throw XmlParseError.EOF.At(GetCurrentPosition());

                    case '&':
                        _stringBuffer.Append(CharacterReference(GetNext()).GetEntity());
                        break;

                    case '<':
                        throw XmlParseError.XmlLtInAttributeValue.At(GetCurrentPosition());

                    default:
                        _stringBuffer.Append(c);
                        break;
                }

                c = GetNext();
            }

            tag.SetAttributeValue(_stringBuffer.ToString());
            return AttributeAfterValue(GetNext(), tag);
        }

        /// <summary>
        /// After an attribute value: whitespace (more attributes may follow), '/' or '&gt;'.
        /// </summary>
        private XmlToken AttributeAfterValue(char c, XmlTagToken tag)
        {
            if (c.IsSpaceCharacter())
            {
                return AttributeBeforeName(GetNext(), tag);
            }

            switch (c)
            {
                case '/':
                    return TagSelfClosing(GetNext(), tag);
                case '>':
                    return tag;
                default:
                    throw XmlParseError.XmlInvalidAttribute.At(GetCurrentPosition());
            }
        }

        #endregion

        #region Processing Instruction

        /// <summary>
        /// Called after "&lt;?" when no XML declaration follows; expects the
        /// first character of the processing instruction target.
        /// </summary>
        private XmlToken ProcessingStart(char c)
        {
            if (c.IsXmlNameStart())
            {
                _stringBuffer.Clear();
                _stringBuffer.Append(c);
                return ProcessingTarget(GetNext(), NewProcessing());
            }

            throw XmlParseError.XmlInvalidPI.At(GetCurrentPosition());
        }

        /// <summary>
        /// Reads the remainder of the target name. A target equal to "xml"
        /// in any casing is rejected.
        /// </summary>
        private XmlToken ProcessingTarget(char c, XmlPIToken pi)
        {
            while (c.IsXmlName())
            {
                _stringBuffer.Append(c);
                c = GetNext();
            }

            pi.Target = _stringBuffer.ToString();
            _stringBuffer.Clear();

            if (string.Equals(pi.Target, Tags.Xml, StringComparison.OrdinalIgnoreCase))
            {
                throw XmlParseError.XmlInvalidPI.At(GetCurrentPosition());
            }

            if (c == '?')
            {
                c = GetNext();

                if (c == '>')
                {
                    return pi;
                }
            }
            else if (c.IsSpaceCharacter())
            {
                return ProcessingContent(GetNext(), pi);
            }

            throw XmlParseError.XmlInvalidPI.At(GetCurrentPosition());
        }

        /// <summary>
        /// Collects the content of a processing instruction up to "?&gt;".
        /// </summary>
        private XmlToken ProcessingContent(char c, XmlPIToken pi)
        {
            while (true)
            {
                switch (c)
                {
                    case EndOfFile:
                        throw XmlParseError.EOF.At(GetCurrentPosition());

                    case '?':
                        c = GetNext();

                        if (c == '>')
                        {
                            pi.Content = _stringBuffer.ToString();
                            return pi;
                        }

                        // A lone '?' belongs to the content; the character
                        // after it is examined in the next iteration.
                        _stringBuffer.Append('?');
                        break;

                    default:
                        _stringBuffer.Append(c);
                        c = GetNext();
                        break;
                }
            }
        }

        #endregion

        #region Comments

        /// <summary>
        /// Called after "&lt;!--"; resets the buffer and reads the comment.
        /// </summary>
        private XmlToken CommentStart(char c)
        {
            _stringBuffer.Clear();
            return Comment(c);
        }

        /// <summary>
        /// Collects comment text. A single '-' followed by a non-dash
        /// character is part of the text; "--" must be followed by '&gt;'.
        /// Implemented as one loop so the stack depth does not depend on
        /// the number of dashes in the comment.
        /// </summary>
        private XmlToken Comment(char c)
        {
            while (c.IsXmlChar())
            {
                if (c != '-')
                {
                    _stringBuffer.Append(c);
                    c = GetNext();
                    continue;
                }

                c = GetNext();

                if (c == '-')
                {
                    return CommentEnd(GetNext());
                }

                // Keep the lone dash; the character after it is examined
                // by the next iteration (it may be invalid or the end).
                _stringBuffer.Append('-');
            }

            throw XmlParseError.XmlInvalidComment.At(GetCurrentPosition());
        }

        /// <summary>
        /// Called after "--" inside a comment; only '&gt;' may follow.
        /// </summary>
        private XmlToken CommentEnd(char c)
        {
            if (c == '>')
            {
                return NewComment(_stringBuffer.ToString());
            }

            throw XmlParseError.XmlInvalidComment.At(GetCurrentPosition());
        }

        #endregion

        #region Token factories

        /// <summary>Creates an end-of-file token at the current position.</summary>
        private XmlEndOfFileToken NewEof()
        {
            return new XmlEndOfFileToken(GetCurrentPosition());
        }

        /// <summary>Creates a character token at the token start position.</summary>
        private XmlCharacterToken NewCharacter(char c)
        {
            return new XmlCharacterToken(_position, c);
        }

        /// <summary>Creates a comment token with the given text.</summary>
        private XmlCommentToken NewComment(string s)
        {
            return new XmlCommentToken(_position, s);
        }

        /// <summary>Creates an empty processing instruction token.</summary>
        private XmlPIToken NewProcessing()
        {
            return new XmlPIToken(_position);
        }

        /// <summary>Creates an empty doctype token.</summary>
        private XmlDoctypeToken NewDoctype()
        {
            return new XmlDoctypeToken(_position);
        }

        /// <summary>Creates an empty XML declaration token.</summary>
        private XmlDeclarationToken NewDeclaration()
        {
            return new XmlDeclarationToken(_position);
        }

        /// <summary>Creates a start tag token.</summary>
        private XmlTagToken NewOpenTag()
        {
            return new XmlTagToken(XmlTokenType.StartTag, _position);
        }

        /// <summary>Creates an end tag token.</summary>
        private XmlTagToken NewCloseTag()
        {
            return new XmlTagToken(XmlTokenType.EndTag, _position);
        }

        /// <summary>Creates a CDATA token with the given content.</summary>
        private XmlCDataToken NewCData(string s)
        {
            return new XmlCDataToken(_position, s);
        }

        /// <summary>
        /// Creates an entity token.
        /// </summary>
        /// <param name="value">The reference text without '&amp;', '#', 'x' and ';'.</param>
        /// <param name="numeric">True for a numeric character reference.</param>
        /// <param name="hex">True if the numeric reference is hexadecimal.</param>
        private XmlEntityToken NewEntity(string value, bool numeric = false, bool hex = false)
        {
            return new XmlEntityToken(_position, value, numeric, hex);
        }

        #endregion
    }
}