AngleSharp by Florian Rappl

<PackageReference Include="AngleSharp" Version="0.8.3" />

 XmlTokenizer

sealed class XmlTokenizer : BaseTokenizer
Performs the tokenization of the source code. Most of the information is taken from http://www.w3.org/TR/REC-xml/.
using AngleSharp.Events;
using AngleSharp.Extensions;
using AngleSharp.Html;
using System;
using System.Diagnostics;
using System.Text;

namespace AngleSharp.Parser.Xml
{
    /// <summary>
    /// Reads characters from a text source and groups them into XML tokens
    /// (character data, entities, tags, declarations, doctypes, processing
    /// instructions, comments and CDATA sections). Each state of the
    /// tokenizer is one private method that receives the current character.
    /// </summary>
    [DebuggerStepThrough]
    internal sealed class XmlTokenizer : BaseTokenizer
    {
        // NOTE(review): the listing this class was restored from contained an
        // empty character literal ('') at every end-of-file check, which does
        // not compile. char.MaxValue is assumed to be the sentinel that
        // GetNext() yields once the source is exhausted - confirm against
        // BaseTokenizer before relying on it.
        private const char EndOfFile = char.MaxValue;

        private static readonly string CDataOpening = "[CDATA[";
        private static readonly string PublicIdentifier = "PUBLIC";
        private static readonly string SystemIdentifier = "SYSTEM";
        private static readonly string YesIdentifier = "yes";
        private static readonly string NoIdentifier = "no";

        /// <summary>
        /// Creates a tokenizer over the given source.
        /// </summary>
        /// <param name="source">The text source, forwarded to the base tokenizer.</param>
        /// <param name="events">The event aggregator, forwarded to the base tokenizer.</param>
        public XmlTokenizer(TextSource source, IEventAggregator events)
            : base(source, events)
        {
        }

        /// <summary>
        /// Reads the next token from the source.
        /// </summary>
        /// <returns>The next token, or <c>XmlToken.EOF</c> once the source has ended.</returns>
        /// <exception cref="InvalidOperationException">The input is not well-formed.</exception>
        public XmlToken Get()
        {
            char current = GetNext();

            if (IsEnded)
            {
                return XmlToken.EOF;
            }

            return Data(current);
        }

        /// <summary>
        /// Builds the exception thrown for a tokenization failure. The error
        /// code is placed in the message so that failures can be told apart.
        /// </summary>
        /// <param name="code">The kind of error that was detected.</param>
        /// <returns>The exception to throw.</returns>
        private static Exception XmlError(XmlParseError code)
        {
            return new InvalidOperationException("XML parse error: " + code);
        }

        /// <summary>
        /// Advances past consecutive space characters.
        /// </summary>
        /// <param name="c">The current character.</param>
        /// <returns>The first character that is not a space character (possibly <paramref name="c"/> itself).</returns>
        private char SkipSpaces(char c)
        {
            while (c.IsSpaceCharacter())
            {
                c = GetNext();
            }

            return c;
        }

        #region General

        /// <summary>
        /// Dispatches on the first character of a token in character data.
        /// </summary>
        private XmlToken Data(char c)
        {
            switch (c)
            {
                case '&':
                    return CharacterReference(GetNext());
                case '<':
                    return TagOpen(GetNext());
                case EndOfFile:
                    return XmlToken.EOF;
                case ']':
                    return CheckCharacter(GetNext());
                default:
                    return XmlToken.Character(c);
            }
        }

        /// <summary>
        /// Called after a ']' in character data. Throws if the input continues
        /// with "]>" (i.e. "]]>" in total); otherwise rewinds the look-ahead
        /// and emits the ']' as a character token.
        /// </summary>
        private XmlToken CheckCharacter(char ch)
        {
            if (ch == ']')
            {
                if (GetNext() == '>')
                {
                    throw XmlError(XmlParseError.XmlInvalidCharData);
                }

                // Undo the second look-ahead character.
                Back();
            }

            // Undo the first look-ahead character.
            Back();
            return XmlToken.Character(']');
        }

        /// <summary>
        /// Collects the content of a CDATA section until "]]>" is found.
        /// </summary>
        private XmlCDataToken CData(char c)
        {
            _stringBuffer.Clear();

            while (true)
            {
                if (c == EndOfFile)
                {
                    throw XmlError(XmlParseError.EOF);
                }

                if (c == ']' && ContinuesWith("]]>", true))
                {
                    Advance(2);
                    return XmlToken.CData(_stringBuffer.ToString());
                }

                _stringBuffer.Append(c);
                c = GetNext();
            }
        }

        /// <summary>
        /// Reads an entity or character reference after the '&amp;'. Accepts
        /// decimal ("#" digits), hexadecimal ("#x" hex digits) and named
        /// references; each must be terminated by ';'.
        /// </summary>
        private XmlEntityToken CharacterReference(char c)
        {
            StringBuilder buffer = Pool.NewStringBuilder();

            if (c == '#')
            {
                c = GetNext();
                bool isHex = c == 'x' || c == 'X';

                if (isHex)
                {
                    c = GetNext();

                    while (c.IsHex())
                    {
                        buffer.Append(c);
                        c = GetNext();
                    }
                }
                else
                {
                    while (c.IsDigit())
                    {
                        buffer.Append(c);
                        c = GetNext();
                    }
                }

                if (buffer.Length > 0 && c == ';')
                {
                    return new XmlEntityToken
                    {
                        Value = buffer.ToPool(),
                        IsNumeric = true,
                        IsHex = isHex
                    };
                }
            }
            else if (c.IsXmlNameStart())
            {
                do
                {
                    buffer.Append(c);
                    c = GetNext();
                }
                while (c.IsXmlName());

                if (c == ';')
                {
                    return new XmlEntityToken { Value = buffer.ToPool() };
                }
            }

            // Hand the builder back to the pool before reporting the error.
            buffer.ToPool();
            throw XmlError(XmlParseError.CharacterReferenceNotTerminated);
        }

        #endregion

        #region Tags

        /// <summary>
        /// Dispatches on the character following a '&lt;'.
        /// </summary>
        private XmlToken TagOpen(char c)
        {
            switch (c)
            {
                case '!':
                    return MarkupDeclaration(GetNext());

                case '?':
                    c = GetNext();

                    if (ContinuesWith(Tags.Xml, false))
                    {
                        Advance(2);
                        return DeclarationStart(GetNext());
                    }

                    return ProcessingStart(c);

                case '/':
                    return TagEnd(GetNext());

                default:
                    if (c.IsXmlNameStart())
                    {
                        _stringBuffer.Clear();
                        _stringBuffer.Append(c);
                        return TagName(GetNext(), XmlToken.OpenTag());
                    }

                    throw XmlError(XmlParseError.XmlInvalidStartTag);
            }
        }

        /// <summary>
        /// Reads a closing tag: a name, optional spaces and '>'.
        /// </summary>
        private XmlToken TagEnd(char c)
        {
            if (c.IsXmlNameStart())
            {
                _stringBuffer.Clear();

                do
                {
                    _stringBuffer.Append(c);
                    c = GetNext();
                }
                while (c.IsXmlName());

                c = SkipSpaces(c);

                if (c == '>')
                {
                    XmlTagToken tag = XmlToken.CloseTag();
                    tag.Name = _stringBuffer.ToString();
                    return tag;
                }
            }

            if (c == EndOfFile)
            {
                throw XmlError(XmlParseError.EOF);
            }

            throw XmlError(XmlParseError.XmlInvalidEndTag);
        }

        /// <summary>
        /// Reads the remainder of an opening tag's name and continues with
        /// whatever follows it.
        /// </summary>
        private XmlToken TagName(char c, XmlTagToken tag)
        {
            while (c.IsXmlName())
            {
                _stringBuffer.Append(c);
                c = GetNext();
            }

            tag.Name = _stringBuffer.ToString();

            if (c == EndOfFile)
            {
                throw XmlError(XmlParseError.EOF);
            }

            if (c == '>')
            {
                return tag;
            }

            if (c.IsSpaceCharacter())
            {
                return AttributeBeforeName(GetNext(), tag);
            }

            if (c == '/')
            {
                return TagSelfClosing(GetNext(), tag);
            }

            throw XmlError(XmlParseError.XmlInvalidName);
        }

        /// <summary>
        /// Called after a '/' inside a tag; marks the tag as self-closing and
        /// requires '>' as the next character.
        /// </summary>
        private XmlToken TagSelfClosing(char c, XmlTagToken tag)
        {
            tag.IsSelfClosing = true;

            if (c == '>')
            {
                return tag;
            }

            if (c == EndOfFile)
            {
                throw XmlError(XmlParseError.EOF);
            }

            throw XmlError(XmlParseError.XmlInvalidName);
        }

        /// <summary>
        /// Called after "&lt;!"; recognizes a comment, a doctype or a CDATA
        /// section.
        /// </summary>
        private XmlToken MarkupDeclaration(char c)
        {
            if (ContinuesWith("--", true))
            {
                Advance();
                return CommentStart(GetNext());
            }

            if (ContinuesWith(Tags.Doctype, false))
            {
                Advance(6);
                return Doctype(GetNext());
            }

            if (ContinuesWith(CDataOpening, false))
            {
                Advance(6);
                return CData(GetNext());
            }

            throw XmlError(XmlParseError.UndefinedMarkupDeclaration);
        }

        #endregion

        #region XML declaration

        /// <summary>
        /// Called after "&lt;?" followed by Tags.Xml. If no space follows, the
        /// construct is treated as a processing instruction whose target
        /// starts with Tags.Xml; otherwise the version attribute is required.
        /// </summary>
        private XmlToken DeclarationStart(char c)
        {
            if (!c.IsSpaceCharacter())
            {
                _stringBuffer.Clear();
                _stringBuffer.Append(Tags.Xml);
                return ProcessingTarget(c, XmlToken.Processing());
            }

            do
            {
                c = GetNext();
            }
            while (c.IsSpaceCharacter());

            if (ContinuesWith(AttributeNames.Version, false))
            {
                Advance(6);
                return DeclarationVersionAfterName(GetNext(), XmlToken.Declaration());
            }

            throw XmlError(XmlParseError.XmlDeclarationInvalid);
        }

        /// <summary>
        /// Expects optional spaces followed by '=' after the version name.
        /// </summary>
        private XmlToken DeclarationVersionAfterName(char c, XmlDeclarationToken decl)
        {
            c = SkipSpaces(c);

            if (c == '=')
            {
                return DeclarationVersionBeforeValue(GetNext(), decl);
            }

            throw XmlError(XmlParseError.XmlDeclarationInvalid);
        }

        /// <summary>
        /// Expects optional spaces followed by an opening quote for the
        /// version value.
        /// </summary>
        private XmlToken DeclarationVersionBeforeValue(char c, XmlDeclarationToken decl)
        {
            c = SkipSpaces(c);

            if (c == '"' || c == '\'')
            {
                _stringBuffer.Clear();
                return DeclarationVersionValue(GetNext(), c, decl);
            }

            throw XmlError(XmlParseError.XmlDeclarationInvalid);
        }

        /// <summary>
        /// Reads the version value up to the closing quote <paramref name="q"/>.
        /// </summary>
        private XmlToken DeclarationVersionValue(char c, char q, XmlDeclarationToken decl)
        {
            while (c != q)
            {
                if (c == EndOfFile)
                {
                    throw XmlError(XmlParseError.EOF);
                }

                _stringBuffer.Append(c);
                c = GetNext();
            }

            decl.Version = _stringBuffer.ToString();
            c = GetNext();

            if (c.IsSpaceCharacter())
            {
                return DeclarationAfterVersion(c, decl);
            }

            return DeclarationEnd(c, decl);
        }

        /// <summary>
        /// After the version value: optionally continues with the encoding or
        /// standalone attribute, otherwise expects the end of the declaration.
        /// </summary>
        private XmlToken DeclarationAfterVersion(char c, XmlDeclarationToken decl)
        {
            c = SkipSpaces(c);

            if (ContinuesWith(AttributeNames.Encoding, false))
            {
                Advance(7);
                return DeclarationEncodingAfterName(GetNext(), decl);
            }

            if (ContinuesWith(AttributeNames.Standalone, false))
            {
                Advance(9);
                return DeclarationStandaloneAfterName(GetNext(), decl);
            }

            return DeclarationEnd(c, decl);
        }

        /// <summary>
        /// Expects optional spaces followed by '=' after the encoding name.
        /// </summary>
        private XmlToken DeclarationEncodingAfterName(char c, XmlDeclarationToken decl)
        {
            c = SkipSpaces(c);

            if (c == '=')
            {
                return DeclarationEncodingBeforeValue(GetNext(), decl);
            }

            throw XmlError(XmlParseError.XmlDeclarationInvalid);
        }

        /// <summary>
        /// Expects optional spaces, an opening quote and a letter as the first
        /// character of the encoding value.
        /// </summary>
        private XmlToken DeclarationEncodingBeforeValue(char c, XmlDeclarationToken decl)
        {
            c = SkipSpaces(c);

            if (c == '"' || c == '\'')
            {
                char quote = c;
                _stringBuffer.Clear();
                c = GetNext();

                if (c.IsLetter())
                {
                    return DeclarationEncodingValue(c, quote, decl);
                }
            }

            throw XmlError(XmlParseError.XmlDeclarationInvalid);
        }

        /// <summary>
        /// Reads the encoding value up to the closing quote <paramref name="q"/>.
        /// Only ASCII alphanumerics, '.', '_' and '-' are accepted, so any
        /// other character (including the end of the input) is an error.
        /// </summary>
        private XmlToken DeclarationEncodingValue(char c, char q, XmlDeclarationToken decl)
        {
            do
            {
                bool allowed = c.IsAlphanumericAscii() || c == '.' || c == '_' || c == '-';

                if (!allowed)
                {
                    throw XmlError(XmlParseError.XmlDeclarationInvalid);
                }

                _stringBuffer.Append(c);
                c = GetNext();
            }
            while (c != q);

            decl.Encoding = _stringBuffer.ToString();
            c = GetNext();

            if (c.IsSpaceCharacter())
            {
                return DeclarationAfterEncoding(c, decl);
            }

            return DeclarationEnd(c, decl);
        }

        /// <summary>
        /// After the encoding value: optionally continues with the standalone
        /// attribute, otherwise expects the end of the declaration.
        /// </summary>
        private XmlToken DeclarationAfterEncoding(char c, XmlDeclarationToken decl)
        {
            c = SkipSpaces(c);

            if (ContinuesWith(AttributeNames.Standalone, false))
            {
                Advance(9);
                return DeclarationStandaloneAfterName(GetNext(), decl);
            }

            return DeclarationEnd(c, decl);
        }

        /// <summary>
        /// Expects optional spaces followed by '=' after the standalone name.
        /// </summary>
        private XmlToken DeclarationStandaloneAfterName(char c, XmlDeclarationToken decl)
        {
            c = SkipSpaces(c);

            if (c == '=')
            {
                return DeclarationStandaloneBeforeValue(GetNext(), decl);
            }

            throw XmlError(XmlParseError.XmlDeclarationInvalid);
        }

        /// <summary>
        /// Expects optional spaces followed by an opening quote for the
        /// standalone value.
        /// </summary>
        private XmlToken DeclarationStandaloneBeforeValue(char c, XmlDeclarationToken decl)
        {
            c = SkipSpaces(c);

            if (c == '"' || c == '\'')
            {
                _stringBuffer.Clear();
                return DeclarationStandaloneValue(GetNext(), c, decl);
            }

            throw XmlError(XmlParseError.XmlDeclarationInvalid);
        }

        /// <summary>
        /// Reads the standalone value up to the closing quote
        /// <paramref name="q"/>; only "yes" and "no" are accepted.
        /// </summary>
        private XmlToken DeclarationStandaloneValue(char c, char q, XmlDeclarationToken decl)
        {
            while (c != q)
            {
                if (c == EndOfFile)
                {
                    throw XmlError(XmlParseError.EOF);
                }

                _stringBuffer.Append(c);
                c = GetNext();
            }

            string value = _stringBuffer.ToString();

            if (value.Equals(YesIdentifier))
            {
                decl.Standalone = true;
            }
            else if (value.Equals(NoIdentifier))
            {
                decl.Standalone = false;
            }
            else
            {
                throw XmlError(XmlParseError.XmlDeclarationInvalid);
            }

            return DeclarationEnd(GetNext(), decl);
        }

        /// <summary>
        /// Expects optional spaces followed by "?>" and returns the finished
        /// declaration token.
        /// </summary>
        private XmlDeclarationToken DeclarationEnd(char c, XmlDeclarationToken decl)
        {
            c = SkipSpaces(c);

            if (c != '?' || GetNext() != '>')
            {
                throw XmlError(XmlParseError.XmlDeclarationInvalid);
            }

            return decl;
        }

        #endregion

        #region Doctype

        /// <summary>
        /// Called after the doctype keyword; at least one space is required.
        /// </summary>
        private XmlToken Doctype(char c)
        {
            if (c.IsSpaceCharacter())
            {
                return DoctypeNameBefore(GetNext());
            }

            throw XmlError(XmlParseError.DoctypeInvalid);
        }

        /// <summary>
        /// Skips spaces and expects the first character of the doctype name.
        /// </summary>
        private XmlToken DoctypeNameBefore(char c)
        {
            c = SkipSpaces(c);

            if (c.IsXmlNameStart())
            {
                _stringBuffer.Clear();
                _stringBuffer.Append(c);
                return DoctypeName(GetNext(), XmlToken.Doctype());
            }

            throw XmlError(XmlParseError.DoctypeInvalid);
        }

        /// <summary>
        /// Reads the remainder of the doctype name.
        /// </summary>
        private XmlToken DoctypeName(char c, XmlDoctypeToken doctype)
        {
            while (c.IsXmlName())
            {
                _stringBuffer.Append(c);
                c = GetNext();
            }

            doctype.Name = _stringBuffer.ToString();
            // The buffer is reused for the identifiers that may follow.
            _stringBuffer.Clear();

            if (c == '>')
            {
                return doctype;
            }

            if (c.IsSpaceCharacter())
            {
                return DoctypeNameAfter(GetNext(), doctype);
            }

            throw XmlError(XmlParseError.DoctypeInvalid);
        }

        /// <summary>
        /// After the doctype name: accepts the end of the doctype, a PUBLIC or
        /// SYSTEM identifier, or a '['.
        /// </summary>
        private XmlToken DoctypeNameAfter(char c, XmlDoctypeToken doctype)
        {
            c = SkipSpaces(c);

            if (c == '>')
            {
                return doctype;
            }

            if (ContinuesWith(PublicIdentifier, false))
            {
                Advance(5);
                return DoctypePublic(GetNext(), doctype);
            }

            if (ContinuesWith(SystemIdentifier, false))
            {
                Advance(5);
                return DoctypeSystem(GetNext(), doctype);
            }

            if (c == '[')
            {
                Advance();
                return DoctypeAfter(GetNext(), doctype);
            }

            throw XmlError(XmlParseError.DoctypeInvalid);
        }

        /// <summary>
        /// After the PUBLIC keyword: requires at least one space and an
        /// opening quote.
        /// </summary>
        private XmlToken DoctypePublic(char c, XmlDoctypeToken doctype)
        {
            if (c.IsSpaceCharacter())
            {
                c = SkipSpaces(c);

                if (c == '"' || c == '\'')
                {
                    doctype.PublicIdentifier = string.Empty;
                    return DoctypePublicIdentifierValue(GetNext(), c, doctype);
                }
            }

            throw XmlError(XmlParseError.DoctypeInvalid);
        }

        /// <summary>
        /// Reads the public identifier up to the closing quote
        /// <paramref name="q"/>; every character must satisfy IsPubidChar.
        /// </summary>
        private XmlToken DoctypePublicIdentifierValue(char c, char q, XmlDoctypeToken doctype)
        {
            while (c != q)
            {
                if (!c.IsPubidChar())
                {
                    throw XmlError(XmlParseError.XmlInvalidPubId);
                }

                _stringBuffer.Append(c);
                c = GetNext();
            }

            doctype.PublicIdentifier = _stringBuffer.ToString();
            _stringBuffer.Clear();
            return DoctypePublicIdentifierAfter(GetNext(), doctype);
        }

        /// <summary>
        /// After the public identifier: accepts the end of the doctype or a
        /// space leading to the system identifier.
        /// </summary>
        private XmlToken DoctypePublicIdentifierAfter(char c, XmlDoctypeToken doctype)
        {
            if (c == '>')
            {
                return doctype;
            }

            if (c.IsSpaceCharacter())
            {
                return DoctypeBetween(GetNext(), doctype);
            }

            throw XmlError(XmlParseError.DoctypeInvalid);
        }

        /// <summary>
        /// Between the public and system identifiers: accepts the end of the
        /// doctype or an opening quote.
        /// </summary>
        private XmlToken DoctypeBetween(char c, XmlDoctypeToken doctype)
        {
            c = SkipSpaces(c);

            if (c == '>')
            {
                return doctype;
            }

            if (c == '"' || c == '\'')
            {
                doctype.SystemIdentifier = string.Empty;
                return DoctypeSystemIdentifierValue(GetNext(), c, doctype);
            }

            throw XmlError(XmlParseError.DoctypeInvalid);
        }

        /// <summary>
        /// After the SYSTEM keyword: requires at least one space and an
        /// opening quote.
        /// </summary>
        private XmlToken DoctypeSystem(char c, XmlDoctypeToken doctype)
        {
            if (c.IsSpaceCharacter())
            {
                c = SkipSpaces(c);

                if (c == '"' || c == '\'')
                {
                    doctype.SystemIdentifier = string.Empty;
                    return DoctypeSystemIdentifierValue(GetNext(), c, doctype);
                }
            }

            throw XmlError(XmlParseError.DoctypeInvalid);
        }

        /// <summary>
        /// Reads the system identifier up to the closing quote
        /// <paramref name="q"/>.
        /// </summary>
        private XmlToken DoctypeSystemIdentifierValue(char c, char q, XmlDoctypeToken doctype)
        {
            while (c != q)
            {
                if (c == EndOfFile)
                {
                    throw XmlError(XmlParseError.EOF);
                }

                _stringBuffer.Append(c);
                c = GetNext();
            }

            doctype.SystemIdentifier = _stringBuffer.ToString();
            _stringBuffer.Clear();
            return DoctypeSystemIdentifierAfter(GetNext(), doctype);
        }

        /// <summary>
        /// After the system identifier: skips spaces and an optional '['
        /// before expecting the end of the doctype.
        /// </summary>
        private XmlToken DoctypeSystemIdentifierAfter(char c, XmlDoctypeToken doctype)
        {
            c = SkipSpaces(c);

            if (c == '[')
            {
                Advance();
                c = GetNext();
            }

            return DoctypeAfter(c, doctype);
        }

        /// <summary>
        /// Expects optional spaces followed by '>' and returns the finished
        /// doctype token.
        /// </summary>
        private XmlToken DoctypeAfter(char c, XmlDoctypeToken doctype)
        {
            c = SkipSpaces(c);

            if (c == '>')
            {
                return doctype;
            }

            throw XmlError(XmlParseError.DoctypeInvalid);
        }

        #endregion

        #region Attributes

        /// <summary>
        /// Inside a tag, before an attribute name: accepts the end of the tag
        /// or the first character of an attribute name.
        /// </summary>
        private XmlToken AttributeBeforeName(char c, XmlTagToken tag)
        {
            c = SkipSpaces(c);

            if (c == '/')
            {
                return TagSelfClosing(GetNext(), tag);
            }

            if (c == '>')
            {
                return tag;
            }

            if (c == EndOfFile)
            {
                throw XmlError(XmlParseError.EOF);
            }

            if (c.IsXmlNameStart())
            {
                _stringBuffer.Clear();
                _stringBuffer.Append(c);
                return AttributeName(GetNext(), tag);
            }

            throw XmlError(XmlParseError.XmlInvalidAttribute);
        }

        /// <summary>
        /// Reads the remainder of an attribute name, registers it on the tag
        /// and expects '=' (optionally preceded by spaces).
        /// </summary>
        private XmlToken AttributeName(char c, XmlTagToken tag)
        {
            while (c.IsXmlName())
            {
                _stringBuffer.Append(c);
                c = GetNext();
            }

            string name = _stringBuffer.ToString();

            // NOTE(review): this only detects a repeated attribute when the
            // earlier occurrence has a non-empty value; a duplicate of an
            // attribute with an empty value presumably goes unnoticed -
            // verify against XmlTagToken.GetAttribute.
            if (!string.IsNullOrEmpty(tag.GetAttribute(name)))
            {
                throw XmlError(XmlParseError.XmlUniqueAttribute);
            }

            tag.AddAttribute(name);
            c = SkipSpaces(c);

            if (c == '=')
            {
                return AttributeBeforeValue(GetNext(), tag);
            }

            throw XmlError(XmlParseError.XmlInvalidAttribute);
        }

        /// <summary>
        /// Expects optional spaces followed by an opening quote for the
        /// attribute value.
        /// </summary>
        private XmlToken AttributeBeforeValue(char c, XmlTagToken tag)
        {
            c = SkipSpaces(c);

            if (c == '"' || c == '\'')
            {
                _stringBuffer.Clear();
                return AttributeValue(GetNext(), c, tag);
            }

            throw XmlError(XmlParseError.XmlInvalidAttribute);
        }

        /// <summary>
        /// Reads an attribute value up to the closing quote
        /// <paramref name="q"/>, resolving references and rejecting '&lt;'.
        /// </summary>
        private XmlToken AttributeValue(char c, char q, XmlTagToken tag)
        {
            while (c != q)
            {
                switch (c)
                {
                    case EndOfFile:
                        throw XmlError(XmlParseError.EOF);

                    case '&':
                        _stringBuffer.Append(CharacterReference(GetNext()).GetEntity());
                        break;

                    case '<':
                        throw XmlError(XmlParseError.XmlLtInAttributeValue);

                    default:
                        _stringBuffer.Append(c);
                        break;
                }

                c = GetNext();
            }

            tag.SetAttributeValue(_stringBuffer.ToString());
            return AttributeAfterValue(GetNext(), tag);
        }

        /// <summary>
        /// After an attribute value: accepts a space (more attributes may
        /// follow) or the end of the tag.
        /// </summary>
        private XmlToken AttributeAfterValue(char c, XmlTagToken tag)
        {
            if (c.IsSpaceCharacter())
            {
                return AttributeBeforeName(GetNext(), tag);
            }

            if (c == '/')
            {
                return TagSelfClosing(GetNext(), tag);
            }

            if (c == '>')
            {
                return tag;
            }

            throw XmlError(XmlParseError.XmlInvalidAttribute);
        }

        #endregion

        #region Processing instructions

        /// <summary>
        /// Called after "&lt;?"; expects the first character of the target name.
        /// </summary>
        private XmlToken ProcessingStart(char c)
        {
            if (c.IsXmlNameStart())
            {
                _stringBuffer.Clear();
                _stringBuffer.Append(c);
                return ProcessingTarget(GetNext(), XmlToken.Processing());
            }

            throw XmlError(XmlParseError.XmlInvalidPI);
        }

        /// <summary>
        /// Reads the remainder of the target name. A target equal to Tags.Xml
        /// (ignoring case) is rejected.
        /// </summary>
        private XmlToken ProcessingTarget(char c, XmlPIToken pi)
        {
            while (c.IsXmlName())
            {
                _stringBuffer.Append(c);
                c = GetNext();
            }

            pi.Target = _stringBuffer.ToString();
            _stringBuffer.Clear();

            if (string.Equals(pi.Target, Tags.Xml, StringComparison.OrdinalIgnoreCase))
            {
                throw XmlError(XmlParseError.XmlInvalidPI);
            }

            if (c == '?')
            {
                c = GetNext();

                if (c == '>')
                {
                    return pi;
                }
            }
            else if (c.IsSpaceCharacter())
            {
                return ProcessingContent(GetNext(), pi);
            }

            throw XmlError(XmlParseError.XmlInvalidPI);
        }

        /// <summary>
        /// Collects the content of a processing instruction until "?>".
        /// </summary>
        private XmlToken ProcessingContent(char c, XmlPIToken pi)
        {
            while (true)
            {
                switch (c)
                {
                    case EndOfFile:
                        throw XmlError(XmlParseError.EOF);

                    case '?':
                        c = GetNext();

                        if (c == '>')
                        {
                            pi.Content = _stringBuffer.ToString();
                            return pi;
                        }

                        // A lone '?' is ordinary content; the character that
                        // followed it is examined by the next iteration.
                        _stringBuffer.Append('?');
                        break;

                    default:
                        _stringBuffer.Append(c);
                        c = GetNext();
                        break;
                }
            }
        }

        #endregion

        #region Comments

        /// <summary>
        /// Called after the comment opening; resets the buffer and starts
        /// collecting the comment text.
        /// </summary>
        private XmlToken CommentStart(char c)
        {
            _stringBuffer.Clear();
            return Comment(c);
        }

        /// <summary>
        /// Collects comment text until a '-' is met. A character that fails
        /// IsXmlChar (which is how the end of the input is expected to
        /// surface here) makes the comment invalid.
        /// </summary>
        private XmlToken Comment(char c)
        {
            while (c.IsXmlChar())
            {
                if (c == '-')
                {
                    return CommentDash(GetNext());
                }

                _stringBuffer.Append(c);
                c = GetNext();
            }

            throw XmlError(XmlParseError.XmlInvalidComment);
        }

        /// <summary>
        /// Called after a single '-' inside a comment. A second '-' must
        /// close the comment; otherwise the dash is ordinary comment text.
        /// </summary>
        private XmlToken CommentDash(char c)
        {
            if (c == '-')
            {
                return CommentEnd(GetNext());
            }

            // Keep the lone dash: previously it was silently dropped, so
            // "a-b" was reported as "ab".
            _stringBuffer.Append('-');
            return Comment(c);
        }

        /// <summary>
        /// Called after "--" inside a comment; only '>' may follow.
        /// </summary>
        private XmlToken CommentEnd(char c)
        {
            if (c == '>')
            {
                return XmlToken.Comment(_stringBuffer.ToString());
            }

            throw XmlError(XmlParseError.XmlInvalidComment);
        }

        #endregion
    }
}