AngleSharp by Florian Rappl

<PackageReference Include="AngleSharp" Version="0.8.8" />

 HtmlTokenizer

sealed class HtmlTokenizer : BaseTokenizer
Performs the tokenization of the source code. Follows the tokenization algorithm at: http://www.w3.org/html/wg/drafts/html/master/syntax.html
// HtmlTokenizer: turns the characters supplied by the base tokenizer (GetNext / Back / Advance)
// into HtmlToken instances, one token per call to Get().
//
// Get() dispatches on the current HtmlParseMode held in _state:
//   PCData -> Data, RCData -> RCData, Plaintext -> Plaintext, Rawtext -> Rawtext, Script -> ScriptData.
// Each state method consumes characters, accumulates text in _stringBuffer and either returns a
// finished token or tail-calls the next state method. Parse errors are reported through
// RaiseErrorOccurred and do not stop tokenization in the code shown here.
//
// Fields: _acceptsCharacterData gates CDATA handling in MarkupDeclaration; _lastStartTag is the name
// that ScriptDataNameEndTag compares candidate end tags against; _position is the position recorded
// at the start of Get() and stamped onto Character tokens by NewCharacter.
//
// NOTE(review): in this listing the character literals for NUL and for the end-of-input sentinel
// both render as an empty character literal. Which one a given case label means has to be read from
// its branch body (HtmlParseError.Null versus HtmlParseError.EOF, Back() or NewEof()). Presumably the
// original assembly uses two distinct characters; confirm before editing any switch.
// NOTE(review): several methods return null (CommentDashEnd, CommentEnd, CommentBangEnd); the callers
// shown here treat null as: no token yet, keep reading the comment.
using AngleSharp.Events; using AngleSharp.Extensions; using AngleSharp.Html; using System; using System.Collections.Generic; using System.Diagnostics; namespace AngleSharp.Parser.Html { [DebuggerStepThrough] internal sealed class HtmlTokenizer : BaseTokenizer { private bool _acceptsCharacterData; private string _lastStartTag; private HtmlParseMode _state; private TextPosition _position; /* Whether MarkupDeclaration may tokenize a [CDATA[ opener as character data. */ public bool IsAcceptingCharacterData { get { return _acceptsCharacterData; } set { _acceptsCharacterData = value; } } /* Parse mode that Get() dispatches on. */ public HtmlParseMode State { get { return _state; } set { _state = value; } } /* Starts in PCData mode with CDATA handling off and an empty last start tag. */ public HtmlTokenizer(TextSource source, IEventAggregator events) : base(source, events) { _state = HtmlParseMode.PCData; _acceptsCharacterData = false; _lastStartTag = string.Empty; } /* Publishes an HtmlParseErrorEvent for the error at the given position; does nothing when _events is null. */ public void RaiseErrorOccurred(HtmlParseError error, TextPosition position) { if (_events != null) { HtmlParseErrorEvent data = new HtmlParseErrorEvent(error.GetCode(), error.GetMessage(), position); _events.Publish(data); } } /* Reports the error at the current position. */ public void RaiseErrorOccurred(HtmlParseError code) { RaiseErrorOccurred(code, GetCurrentPosition()); } /* Reads one character, records its position in _position and runs the handler for the current _state; returns NewEof() for the end-of-input character or for a mode not listed in the switch. */ public HtmlToken Get() { char next = GetNext(); _position = GetCurrentPosition(); if (next != '') { switch (_state) { case HtmlParseMode.PCData: return Data(next); case HtmlParseMode.RCData: return RCData(next); case HtmlParseMode.Plaintext: return Plaintext(next); case HtmlParseMode.Rawtext: return Rawtext(next); case HtmlParseMode.Script: return ScriptData(next); } } return NewEof(); } /* PCData entry point: < opens a tag, anything else starts a text run. */ private HtmlToken Data(char c) { if (c != '<') return DataText(c); return TagOpen(GetNext()); } /* Buffers text until < or end of input (pushed back for the next Get), resolving & references; the Null-error branch appends nothing. */ private HtmlToken DataText(char c) { while (true) { switch (c) { case '': case '<': Back(); return NewCharacter(); case '&': AppendCharacterReference(GetNext(), ''); break; case '': RaiseErrorOccurred(HtmlParseError.Null); break; default: _stringBuffer.Append(c); break; } c = GetNext(); } } /* Buffers everything up to end of input; the Null-error branch substitutes the replacement character. */ private HtmlToken Plaintext(char c) { while (true) { switch (c) { case '': RaiseErrorOccurred(HtmlParseError.Null); 
_stringBuffer.Append('�'); break; case '': Back(); return NewCharacter(); default: _stringBuffer.Append(c); break; } c = GetNext(); } } /* RCData entry point: < may start an end tag, anything else is text. */ private HtmlToken RCData(char c) { if (c != '<') return RCDataText(c); return RCDataLt(GetNext()); } /* Like DataText, except that the Null-error branch appends the replacement character. */ private HtmlToken RCDataText(char c) { while (true) { switch (c) { case '&': AppendCharacterReference(GetNext(), ''); break; case '': case '<': Back(); return NewCharacter(); case '': RaiseErrorOccurred(HtmlParseError.Null); _stringBuffer.Append('�'); break; default: _stringBuffer.Append(c); break; } c = GetNext(); } } /* After < in RCData: a slash followed by an ASCII letter starts a candidate end-tag name (lower-cased into the buffer); otherwise the consumed < or </ is appended as text. */ private HtmlToken RCDataLt(char c) { if (c == '/') { c = GetNext(); if (c.IsUppercaseAscii()) { _stringBuffer.Append(char.ToLower(c)); return RCDataNameEndTag(GetNext()); } if (c.IsLowercaseAscii()) { _stringBuffer.Append(c); return RCDataNameEndTag(GetNext()); } _stringBuffer.Append('<').Append('/'); return RCDataText(c); } _stringBuffer.Append('<'); return RCDataText(c); } /* Grows the candidate end-tag name; returns the result of CreateIfAppropriate (not shown in this chunk) when it is non-null, otherwise on a non-letter re-inserts </ before the buffered name and resumes as text. */ private HtmlToken RCDataNameEndTag(char c) { while (true) { HtmlToken htmlToken = CreateIfAppropriate(c); if (htmlToken != null) return htmlToken; if (c.IsUppercaseAscii()) _stringBuffer.Append(char.ToLower(c)); else { if (!c.IsLowercaseAscii()) break; _stringBuffer.Append(c); } c = GetNext(); } _stringBuffer.Insert(0, '<').Insert(1, '/'); return RCDataText(c); } /* Rawtext entry point; same shape as RCData but RawtextText resolves no character references. */ private HtmlToken Rawtext(char c) { if (c != '<') return RawtextText(c); return RawtextLT(GetNext()); } private HtmlToken RawtextText(char c) { while (true) { switch (c) { case '': case '<': Back(); return NewCharacter(); case '': RaiseErrorOccurred(HtmlParseError.Null); _stringBuffer.Append('�'); break; default: _stringBuffer.Append(c); break; } c = GetNext(); } } /* Rawtext counterpart of RCDataLt. */ private HtmlToken RawtextLT(char c) { if (c == '/') { c = GetNext(); if (c.IsUppercaseAscii()) { _stringBuffer.Append(char.ToLower(c)); return RawtextNameEndTag(GetNext()); } if (c.IsLowercaseAscii()) { _stringBuffer.Append(c); return RawtextNameEndTag(GetNext()); } _stringBuffer.Append('<').Append('/'); return RawtextText(c); 
} _stringBuffer.Append('<'); return RawtextText(c); } /* Rawtext counterpart of RCDataNameEndTag. */ private HtmlToken RawtextNameEndTag(char c) { while (true) { HtmlToken htmlToken = CreateIfAppropriate(c); if (htmlToken != null) return htmlToken; if (c.IsUppercaseAscii()) _stringBuffer.Append(char.ToLower(c)); else { if (!c.IsLowercaseAscii()) break; _stringBuffer.Append(c); } c = GetNext(); } _stringBuffer.Insert(0, '<').Insert(1, '/'); return RawtextText(c); } /* CDATA section body: buffers characters until ]]> is found (the remaining two characters are skipped with Advance(2)) or input ends, then emits the buffered text. */ private HtmlToken CharacterData(char c) { while (true) { switch (c) { case '': Back(); goto IL_0040; case ']': { if (!ContinuesWith("]]>", true)) break; Advance(2); goto IL_0040; } IL_0040: return NewCharacter(); } _stringBuffer.Append(c); c = GetNext(); } } /* Resolves a character reference and appends the result to _stringBuffer; c is the character following the ampersand. A plain ampersand is appended instead when c is whitespace, <, &, end of input or allowedCharacter, or when nothing can be resolved. Callers in attribute values pass the value delimiter as allowedCharacter. */ private void AppendCharacterReference(char c, char allowedCharacter = '') { if (c.IsSpaceCharacter() || c == '<' || c == '' || c == '&' || c == allowedCharacter) { Back(); _stringBuffer.Append('&'); } else { string text = null; if (c == '#') { /* Numeric reference: collect the digits, then fold them into num3 using base num (10, or 16 after x / X). NOTE(review): num3 and num2 are plain ints with no overflow guard, so a very long digit run may wrap; confirm whether the Entities checks below are expected to cover that. */ int num = 10; int num2 = 1; int num3 = 0; List<int> list = new List<int>(); c = GetNext(); bool flag = c == 'x' || c == 'X'; if (!flag) { while (c.IsDigit()) { list.Add(c.FromHex()); c = GetNext(); } } else { num = 16; while ((c = GetNext()).IsHex()) { list.Add(c.FromHex()); } } for (int num4 = list.Count - 1; num4 >= 0; num4--) { num3 += list[num4] * num2; num2 *= num; } if (list.Count == 0) { Back(2); if (flag) Back(); RaiseErrorOccurred(HtmlParseError.CharacterReferenceWrongNumber); _stringBuffer.Append('&'); return; } if (c != ';') { RaiseErrorOccurred(HtmlParseError.CharacterReferenceSemicolonMissing); Back(); } if (Entities.IsInCharacterTable(num3)) { RaiseErrorOccurred(HtmlParseError.CharacterReferenceInvalidCode); text = Entities.GetSymbolFromTable(num3); } else if (Entities.IsInvalidNumber(num3)) { RaiseErrorOccurred(HtmlParseError.CharacterReferenceInvalidNumber); text = '�'.ToString(); } else { if (Entities.IsInInvalidRange(num3)) RaiseErrorOccurred(HtmlParseError.CharacterReferenceInvalidRange); text = Entities.Convert(num3); } } else { /* Named reference: try successively longer names (at most 31 characters); text keeps the most recent match and num5 counts the characters read since that match so they can be handed back with Back(num5). */ int num5 
= 0; int insertionPoint = base.InsertionPoint - 1; char[] array = new char[31]; int num6 = 0; char c2 = base.Current; while (c2 != ';' && c2.IsName()) { array[num6++] = c2; string name = new string(array, 0, num6); c2 = GetNext(); num5++; name = ((c2 == ';') ? Entities.GetSymbol(name) : Entities.GetSymbolWithoutSemicolon(name)); if (name != null) { num5 = 0; text = name; } if (c2 == '' || num6 >= 31) break; } Back(num5); c2 = base.Current; if (c2 != ';') { if (allowedCharacter != 0 && (c2 == '=' || c2.IsAlphanumericAscii())) { if (c2 == '=') RaiseErrorOccurred(HtmlParseError.CharacterReferenceAttributeEqualsFound); base.InsertionPoint = insertionPoint; _stringBuffer.Append('&'); return; } Back(); RaiseErrorOccurred(HtmlParseError.CharacterReferenceNotTerminated); } if (text == null) { _stringBuffer.Append('&'); return; } } _stringBuffer.Append(text); } } /* After <: slash starts an end tag, an ASCII letter starts a start-tag name (lower-cased), ! starts a markup declaration, ? starts a bogus comment; anything else reports AmbiguousOpenTag and treats the < as text. */ private HtmlToken TagOpen(char c) { if (c == '/') return TagEnd(GetNext()); if (c.IsLowercaseAscii()) { _stringBuffer.Append(c); return TagName(NewTagOpen()); } if (!c.IsUppercaseAscii()) { switch (c) { case '!': return MarkupDeclaration(GetNext()); default: _state = HtmlParseMode.PCData; RaiseErrorOccurred(HtmlParseError.AmbiguousOpenTag); _stringBuffer.Append('<'); return DataText(c); case '?': RaiseErrorOccurred(HtmlParseError.BogusComment); return BogusComment(c); } } _stringBuffer.Append(char.ToLower(c)); return TagName(NewTagOpen()); } /* After </: an ASCII letter starts an end-tag name; > reports TagClosedWrong and resumes in Data; end of input emits </ as text; anything else becomes a bogus comment. */ private HtmlToken TagEnd(char c) { if (c.IsLowercaseAscii()) { _stringBuffer.Append(c); return TagName(NewTagClose()); } if (!c.IsUppercaseAscii()) { switch (c) { case '>': _state = HtmlParseMode.PCData; RaiseErrorOccurred(HtmlParseError.TagClosedWrong); return Data(GetNext()); case '': Back(); RaiseErrorOccurred(HtmlParseError.EOF); _stringBuffer.Append('<').Append('/'); return NewCharacter(); default: RaiseErrorOccurred(HtmlParseError.BogusComment); return BogusComment(c); } } _stringBuffer.Append(char.ToLower(c)); return TagName(NewTagClose()); } /* Accumulates the lower-cased tag name until whitespace (continue with attributes), slash (self-closing check) or > (emit the tag); the EOF-error branch returns NewEof(). */ private HtmlToken 
TagName(HtmlTagToken tag) { while (true) { char next = GetNext(); if (next == '>') { tag.Name = _stringBuffer.ToString(); _stringBuffer.Clear(); return EmitTag(tag); } if (next.IsSpaceCharacter()) { tag.Name = _stringBuffer.ToString(); _stringBuffer.Clear(); return AttributeBeforeName(tag); } if (next == '/') break; if (next.IsUppercaseAscii()) _stringBuffer.Append(char.ToLower(next)); else { switch (next) { case '': RaiseErrorOccurred(HtmlParseError.Null); _stringBuffer.Append('�'); break; default: _stringBuffer.Append(next); break; case '': RaiseErrorOccurred(HtmlParseError.EOF); return NewEof(); } } } tag.Name = _stringBuffer.ToString(); _stringBuffer.Clear(); return TagSelfClosing(tag); } /* After a slash inside a tag: > marks the tag self-closing and emits it; any other character reports ClosingSlashMisplaced, is pushed back and is then read as the start of an attribute. */ private HtmlToken TagSelfClosing(HtmlTagToken tag) { switch (GetNext()) { case '>': tag.IsSelfClosing = true; return EmitTag(tag); case '': RaiseErrorOccurred(HtmlParseError.EOF); return NewEof(); default: RaiseErrorOccurred(HtmlParseError.ClosingSlashMisplaced); Back(); return AttributeBeforeName(tag); } } /* After <!: recognises -- (comment), Tags.Doctype and, only when _acceptsCharacterData is set, [CDATA[; anything else reports UndefinedMarkupDeclaration and becomes a bogus comment. */ private HtmlToken MarkupDeclaration(char c) { if (ContinuesWith("--", true)) { Advance(); return CommentStart(GetNext()); } if (ContinuesWith(Tags.Doctype, true)) { Advance(6); return Doctype(GetNext()); } if (_acceptsCharacterData && ContinuesWith("[CDATA[", false)) { Advance(6); return CharacterData(GetNext()); } RaiseErrorOccurred(HtmlParseError.UndefinedMarkupDeclaration); return BogusComment(c); } /* Clears the buffer, then buffers everything up to > or end of input and emits it as a comment. */ private HtmlToken BogusComment(char c) { _stringBuffer.Clear(); while (true) { switch (c) { case '': Back(); goto case '>'; case '': _stringBuffer.Append('�'); c = GetNext(); break; default: _stringBuffer.Append(c); c = GetNext(); break; case '>': _state = HtmlParseMode.PCData; return NewComment(); } } } /* First character after the comment opener: clears the buffer; > ends the comment immediately with TagClosedWrong; a dash goes to CommentDashStart and falls back to Comment when that returns null. */ private HtmlToken CommentStart(char c) { _stringBuffer.Clear(); switch (c) { case '-': return CommentDashStart(GetNext()) ?? 
Comment(GetNext()); case '': RaiseErrorOccurred(HtmlParseError.Null); _stringBuffer.Append('�'); return Comment(GetNext()); case '>': _state = HtmlParseMode.PCData; RaiseErrorOccurred(HtmlParseError.TagClosedWrong); break; case '': RaiseErrorOccurred(HtmlParseError.EOF); Back(); break; default: _stringBuffer.Append(c); return Comment(GetNext()); } return NewComment(); } /* After a dash directly following the comment opener: a second dash moves to CommentEnd (whose result may be null); other characters are buffered after a dash and reading continues in Comment. */ private HtmlToken CommentDashStart(char c) { switch (c) { case '-': return CommentEnd(GetNext()); case '': RaiseErrorOccurred(HtmlParseError.Null); _stringBuffer.Append('-').Append('�'); return Comment(GetNext()); case '>': _state = HtmlParseMode.PCData; RaiseErrorOccurred(HtmlParseError.TagClosedWrong); break; case '': RaiseErrorOccurred(HtmlParseError.EOF); Back(); break; default: _stringBuffer.Append('-').Append(c); return Comment(GetNext()); } return NewComment(); } /* Comment body: buffers characters; each dash is handed to CommentDashEnd, and a null result there means the comment continues. */ private HtmlToken Comment(char c) { while (true) { switch (c) { case '-': { HtmlToken htmlToken = CommentDashEnd(GetNext()); if (htmlToken != null) return htmlToken; break; } case '': RaiseErrorOccurred(HtmlParseError.EOF); Back(); return NewComment(); case '': RaiseErrorOccurred(HtmlParseError.Null); _stringBuffer.Append('�'); break; default: _stringBuffer.Append(c); break; } c = GetNext(); } } /* After one dash inside a comment: a second dash goes to CommentEnd; otherwise the dash and the character are buffered and null is returned. */ private HtmlToken CommentDashEnd(char c) { switch (c) { case '-': return CommentEnd(GetNext()); case '': RaiseErrorOccurred(HtmlParseError.EOF); Back(); return NewComment(); case '': RaiseErrorOccurred(HtmlParseError.Null); c = '�'; break; } _stringBuffer.Append('-').Append(c); return null; } /* After two dashes inside a comment: > emits the comment; each further dash is buffered with CommentEndedWithDash; ! goes to CommentBangEnd; other characters are buffered after the dashes and null is returned. NOTE(review): the Null-error branch buffers one dash before the replacement character while the default branch buffers two; confirm this is intended. */ private HtmlToken CommentEnd(char c) { while (true) { switch (c) { case '>': _state = HtmlParseMode.PCData; return NewComment(); case '': RaiseErrorOccurred(HtmlParseError.Null); _stringBuffer.Append('-').Append('�'); return null; case '!': RaiseErrorOccurred(HtmlParseError.CommentEndedWithEM); return CommentBangEnd(GetNext()); case '-': break; case '': RaiseErrorOccurred(HtmlParseError.EOF); Back(); return NewComment(); default: 
RaiseErrorOccurred(HtmlParseError.CommentEndedUnexpected); _stringBuffer.Append('-').Append('-').Append(c); return null; } RaiseErrorOccurred(HtmlParseError.CommentEndedWithDash); _stringBuffer.Append('-'); c = GetNext(); } } /* After --! inside a comment: > emits the comment; a dash buffers --! and re-enters CommentDashEnd; anything else is buffered after --! and null is returned. */ private HtmlToken CommentBangEnd(char c) { switch (c) { case '-': _stringBuffer.Append('-').Append('-').Append('!'); return CommentDashEnd(GetNext()); case '>': _state = HtmlParseMode.PCData; break; case '': RaiseErrorOccurred(HtmlParseError.Null); _stringBuffer.Append('-').Append('-').Append('!') .Append('�'); return null; case '': RaiseErrorOccurred(HtmlParseError.EOF); Back(); break; default: _stringBuffer.Append('-').Append('-').Append('!') .Append(c); return null; } return NewComment(); } /* After the doctype keyword: whitespace is expected before the name; a missing space reports DoctypeUnexpected. NewDoctype is not shown here; its bool argument presumably forces quirks mode (verify). */ private HtmlToken Doctype(char c) { if (c.IsSpaceCharacter()) return DoctypeNameBefore(GetNext()); if (c == '') { RaiseErrorOccurred(HtmlParseError.EOF); Back(); return NewDoctype(true); } RaiseErrorOccurred(HtmlParseError.DoctypeUnexpected); return DoctypeNameBefore(c); } /* Skips whitespace, creates the doctype token and buffers the first name character (lower-cased when it is an upper-case ASCII letter). */ private HtmlToken DoctypeNameBefore(char c) { while (c.IsSpaceCharacter()) { c = GetNext(); } if (!c.IsUppercaseAscii()) { switch (c) { case '': { HtmlDoctypeToken doctype2 = NewDoctype(false); RaiseErrorOccurred(HtmlParseError.Null); _stringBuffer.Append('�'); return DoctypeName(doctype2); } case '>': { HtmlDoctypeToken result2 = NewDoctype(true); _state = HtmlParseMode.PCData; RaiseErrorOccurred(HtmlParseError.TagClosedWrong); return result2; } case '': { HtmlDoctypeToken result = NewDoctype(true); RaiseErrorOccurred(HtmlParseError.EOF); Back(); return result; } default: { HtmlDoctypeToken doctype = NewDoctype(false); _stringBuffer.Append(c); return DoctypeName(doctype); } } } HtmlDoctypeToken doctype3 = NewDoctype(false); _stringBuffer.Append(char.ToLower(c)); return DoctypeName(doctype3); } /* Buffers the doctype name (upper-case ASCII lower-cased) until whitespace or >; the EOF-error branch sets IsQuirksForced and returns the doctype with the name read so far. */ private HtmlToken DoctypeName(HtmlDoctypeToken doctype) { while (true) { char next = GetNext(); if (next.IsSpaceCharacter()) { doctype.Name = _stringBuffer.ToString(); _stringBuffer.Clear(); 
return DoctypeNameAfter(doctype); } if (next == '>') { _state = HtmlParseMode.PCData; doctype.Name = _stringBuffer.ToString(); _stringBuffer.Clear(); break; } if (next.IsUppercaseAscii()) _stringBuffer.Append(char.ToLower(next)); else { switch (next) { case '': break; case '': goto IL_00a3; default: goto IL_00d6; } RaiseErrorOccurred(HtmlParseError.Null); _stringBuffer.Append('�'); } continue; IL_00d6: _stringBuffer.Append(next); continue; IL_00a3: RaiseErrorOccurred(HtmlParseError.EOF); Back(); doctype.IsQuirksForced = true; doctype.Name = _stringBuffer.ToString(); _stringBuffer.Clear(); break; } return doctype; } /* After the doctype name: accepts >, or the keywords public / system (the remaining characters are skipped with Advance(5)); anything else sets IsQuirksForced and continues in BogusDoctype. */ private HtmlToken DoctypeNameAfter(HtmlDoctypeToken doctype) { switch (SkipSpaces()) { case '>': _state = HtmlParseMode.PCData; break; case '': RaiseErrorOccurred(HtmlParseError.EOF); Back(); doctype.IsQuirksForced = true; break; default: if (ContinuesWith("public", true)) { Advance(5); return DoctypePublic(doctype); } if (ContinuesWith("system", true)) { Advance(5); return DoctypeSystem(doctype); } RaiseErrorOccurred(HtmlParseError.DoctypeUnexpectedAfterName); doctype.IsQuirksForced = true; return BogusDoctype(doctype); } return doctype; } /* Directly after the public keyword: whitespace is expected; a quote here is accepted with an error and starts the public identifier. */ private HtmlToken DoctypePublic(HtmlDoctypeToken doctype) { char next = GetNext(); if (next.IsSpaceCharacter()) return DoctypePublicIdentifierBefore(doctype); switch (next) { case '"': RaiseErrorOccurred(HtmlParseError.DoubleQuotationMarkUnexpected); doctype.PublicIdentifier = string.Empty; return DoctypePublicIdentifierDoubleQuoted(doctype); case '\'': RaiseErrorOccurred(HtmlParseError.SingleQuotationMarkUnexpected); doctype.PublicIdentifier = string.Empty; return DoctypePublicIdentifierSingleQuoted(doctype); case '>': _state = HtmlParseMode.PCData; RaiseErrorOccurred(HtmlParseError.TagClosedWrong); doctype.IsQuirksForced = true; break; case '': RaiseErrorOccurred(HtmlParseError.EOF); doctype.IsQuirksForced = true; Back(); break; default: RaiseErrorOccurred(HtmlParseError.DoctypePublicInvalid); doctype.IsQuirksForced 
= true; return BogusDoctype(doctype); } return doctype; } /* After public plus whitespace: a quote starts the public identifier; anything else forces quirks. */ private HtmlToken DoctypePublicIdentifierBefore(HtmlDoctypeToken doctype) { switch (SkipSpaces()) { case '"': doctype.PublicIdentifier = string.Empty; return DoctypePublicIdentifierDoubleQuoted(doctype); case '\'': doctype.PublicIdentifier = string.Empty; return DoctypePublicIdentifierSingleQuoted(doctype); case '>': _state = HtmlParseMode.PCData; RaiseErrorOccurred(HtmlParseError.TagClosedWrong); doctype.IsQuirksForced = true; break; case '': RaiseErrorOccurred(HtmlParseError.EOF); doctype.IsQuirksForced = true; Back(); break; default: RaiseErrorOccurred(HtmlParseError.DoctypePublicInvalid); doctype.IsQuirksForced = true; return BogusDoctype(doctype); } return doctype; } /* Buffers the public identifier up to the closing double quote; > or end of input ends the doctype early with IsQuirksForced set and the text read so far stored as the identifier. */ private HtmlToken DoctypePublicIdentifierDoubleQuoted(HtmlDoctypeToken doctype) { while (true) { char next = GetNext(); switch (next) { case '"': doctype.PublicIdentifier = _stringBuffer.ToString(); _stringBuffer.Clear(); return DoctypePublicIdentifierAfter(doctype); case '': RaiseErrorOccurred(HtmlParseError.Null); _stringBuffer.Append('�'); break; case '>': _state = HtmlParseMode.PCData; RaiseErrorOccurred(HtmlParseError.TagClosedWrong); doctype.IsQuirksForced = true; doctype.PublicIdentifier = _stringBuffer.ToString(); _stringBuffer.Clear(); goto IL_00d2; case '': RaiseErrorOccurred(HtmlParseError.EOF); Back(); doctype.IsQuirksForced = true; doctype.PublicIdentifier = _stringBuffer.ToString(); _stringBuffer.Clear(); goto IL_00d2; default: { _stringBuffer.Append(next); break; } IL_00d2: return doctype; } } } /* Single-quoted counterpart of DoctypePublicIdentifierDoubleQuoted. */ private HtmlToken DoctypePublicIdentifierSingleQuoted(HtmlDoctypeToken doctype) { while (true) { char next = GetNext(); switch (next) { case '\'': doctype.PublicIdentifier = _stringBuffer.ToString(); _stringBuffer.Clear(); return DoctypePublicIdentifierAfter(doctype); case '': RaiseErrorOccurred(HtmlParseError.Null); _stringBuffer.Append('�'); break; case '>': _state = HtmlParseMode.PCData; 
RaiseErrorOccurred(HtmlParseError.TagClosedWrong); doctype.IsQuirksForced = true; doctype.PublicIdentifier = _stringBuffer.ToString(); _stringBuffer.Clear(); goto IL_00d2; case '': RaiseErrorOccurred(HtmlParseError.EOF); doctype.IsQuirksForced = true; doctype.PublicIdentifier = _stringBuffer.ToString(); _stringBuffer.Clear(); Back(); goto IL_00d2; default: { _stringBuffer.Append(next); break; } IL_00d2: return doctype; } } } /* Directly after the public identifier: whitespace leads to DoctypeBetween, > ends the doctype, a quote (accepted with an error) starts the system identifier. */ private HtmlToken DoctypePublicIdentifierAfter(HtmlDoctypeToken doctype) { char next = GetNext(); if (next.IsSpaceCharacter()) return DoctypeBetween(doctype); switch (next) { case '>': _state = HtmlParseMode.PCData; break; case '"': RaiseErrorOccurred(HtmlParseError.DoubleQuotationMarkUnexpected); doctype.SystemIdentifier = string.Empty; return DoctypeSystemIdentifierDoubleQuoted(doctype); case '\'': RaiseErrorOccurred(HtmlParseError.SingleQuotationMarkUnexpected); doctype.SystemIdentifier = string.Empty; return DoctypeSystemIdentifierSingleQuoted(doctype); case '': RaiseErrorOccurred(HtmlParseError.EOF); doctype.IsQuirksForced = true; Back(); break; default: RaiseErrorOccurred(HtmlParseError.DoctypeInvalidCharacter); doctype.IsQuirksForced = true; return BogusDoctype(doctype); } return doctype; } /* Between the public and system identifiers (after whitespace): > ends the doctype, a quote starts the system identifier. */ private HtmlToken DoctypeBetween(HtmlDoctypeToken doctype) { switch (SkipSpaces()) { case '>': _state = HtmlParseMode.PCData; break; case '"': doctype.SystemIdentifier = string.Empty; return DoctypeSystemIdentifierDoubleQuoted(doctype); case '\'': doctype.SystemIdentifier = string.Empty; return DoctypeSystemIdentifierSingleQuoted(doctype); case '': RaiseErrorOccurred(HtmlParseError.EOF); doctype.IsQuirksForced = true; Back(); break; default: RaiseErrorOccurred(HtmlParseError.DoctypeInvalidCharacter); doctype.IsQuirksForced = true; return BogusDoctype(doctype); } return doctype; } /* Directly after the system keyword. NOTE(review): here _state is assigned in the whitespace branch and not in the > branch, the reverse of DoctypePublic; confirm whether that asymmetry is intended. */ private HtmlToken DoctypeSystem(HtmlDoctypeToken doctype) { char next = GetNext(); if (next.IsSpaceCharacter()) { _state = HtmlParseMode.PCData; return 
DoctypeSystemIdentifierBefore(doctype); } switch (next) { case '"': RaiseErrorOccurred(HtmlParseError.DoubleQuotationMarkUnexpected); doctype.SystemIdentifier = string.Empty; return DoctypeSystemIdentifierDoubleQuoted(doctype); case '\'': RaiseErrorOccurred(HtmlParseError.SingleQuotationMarkUnexpected); doctype.SystemIdentifier = string.Empty; return DoctypeSystemIdentifierSingleQuoted(doctype); case '>': RaiseErrorOccurred(HtmlParseError.TagClosedWrong); doctype.SystemIdentifier = _stringBuffer.ToString(); _stringBuffer.Clear(); doctype.IsQuirksForced = true; break; case '': RaiseErrorOccurred(HtmlParseError.EOF); doctype.IsQuirksForced = true; Back(); break; default: RaiseErrorOccurred(HtmlParseError.DoctypeSystemInvalid); doctype.IsQuirksForced = true; return BogusDoctype(doctype); } return doctype; } /* After system plus whitespace: a quote starts the system identifier; anything else forces quirks. */ private HtmlToken DoctypeSystemIdentifierBefore(HtmlDoctypeToken doctype) { switch (SkipSpaces()) { case '"': doctype.SystemIdentifier = string.Empty; return DoctypeSystemIdentifierDoubleQuoted(doctype); case '\'': doctype.SystemIdentifier = string.Empty; return DoctypeSystemIdentifierSingleQuoted(doctype); case '>': _state = HtmlParseMode.PCData; RaiseErrorOccurred(HtmlParseError.TagClosedWrong); doctype.IsQuirksForced = true; doctype.SystemIdentifier = _stringBuffer.ToString(); _stringBuffer.Clear(); break; case '': RaiseErrorOccurred(HtmlParseError.EOF); doctype.IsQuirksForced = true; doctype.SystemIdentifier = _stringBuffer.ToString(); _stringBuffer.Clear(); Back(); break; default: RaiseErrorOccurred(HtmlParseError.DoctypeInvalidCharacter); doctype.IsQuirksForced = true; return BogusDoctype(doctype); } return doctype; } /* Buffers the system identifier up to the closing double quote; > or end of input ends the doctype early with IsQuirksForced set. */ private HtmlToken DoctypeSystemIdentifierDoubleQuoted(HtmlDoctypeToken doctype) { while (true) { char next = GetNext(); switch (next) { case '"': doctype.SystemIdentifier = _stringBuffer.ToString(); _stringBuffer.Clear(); return DoctypeSystemIdentifierAfter(doctype); case '': RaiseErrorOccurred(HtmlParseError.Null); _stringBuffer.Append('�'); 
break; case '>': _state = HtmlParseMode.PCData; RaiseErrorOccurred(HtmlParseError.TagClosedWrong); doctype.IsQuirksForced = true; doctype.SystemIdentifier = _stringBuffer.ToString(); _stringBuffer.Clear(); goto IL_00d2; case '': RaiseErrorOccurred(HtmlParseError.EOF); doctype.IsQuirksForced = true; doctype.SystemIdentifier = _stringBuffer.ToString(); _stringBuffer.Clear(); Back(); goto IL_00d2; default: { _stringBuffer.Append(next); break; } IL_00d2: return doctype; } } } /* Single-quoted counterpart of DoctypeSystemIdentifierDoubleQuoted. */ private HtmlToken DoctypeSystemIdentifierSingleQuoted(HtmlDoctypeToken doctype) { while (true) { char next = GetNext(); switch (next) { case '\'': doctype.SystemIdentifier = _stringBuffer.ToString(); _stringBuffer.Clear(); return DoctypeSystemIdentifierAfter(doctype); case '': RaiseErrorOccurred(HtmlParseError.Null); _stringBuffer.Append('�'); break; case '>': _state = HtmlParseMode.PCData; RaiseErrorOccurred(HtmlParseError.TagClosedWrong); doctype.IsQuirksForced = true; doctype.SystemIdentifier = _stringBuffer.ToString(); _stringBuffer.Clear(); goto IL_00e7; case '': RaiseErrorOccurred(HtmlParseError.EOF); doctype.IsQuirksForced = true; doctype.SystemIdentifier = _stringBuffer.ToString(); _stringBuffer.Clear(); Back(); goto IL_00e7; default: { _stringBuffer.Append(next); break; } IL_00e7: return doctype; } } } /* After the system identifier: only > (or end of input) is expected; other characters report DoctypeInvalidCharacter and continue in BogusDoctype without setting IsQuirksForced. */ private HtmlToken DoctypeSystemIdentifierAfter(HtmlDoctypeToken doctype) { switch (SkipSpaces()) { case '>': _state = HtmlParseMode.PCData; break; case '': RaiseErrorOccurred(HtmlParseError.EOF); doctype.IsQuirksForced = true; Back(); break; default: RaiseErrorOccurred(HtmlParseError.DoctypeInvalidCharacter); return BogusDoctype(doctype); } return doctype; } /* Discards characters up to > or end of input and returns the doctype as it stands. */ private HtmlToken BogusDoctype(HtmlDoctypeToken doctype) { while (true) { switch (GetNext()) { case '>': _state = HtmlParseMode.PCData; goto IL_0020; case '': { Back(); goto IL_0020; } IL_0020: return doctype; } } } /* Before an attribute name: skips whitespace; slash goes to TagSelfClosing, > emits the tag, otherwise the first name character is buffered (lower-cased) and AttributeName takes over. */ private HtmlToken AttributeBeforeName(HtmlTagToken tag) { char c = SkipSpaces(); switch (c) { case '/': return 
TagSelfClosing(tag); case '>': return EmitTag(tag); default: if (!c.IsUppercaseAscii()) { switch (c) { case '': RaiseErrorOccurred(HtmlParseError.Null); _stringBuffer.Append('�'); return AttributeName(tag); case '"': case '\'': case '<': case '=': RaiseErrorOccurred(HtmlParseError.AttributeNameInvalid); _stringBuffer.Append(c); return AttributeName(tag); default: _stringBuffer.Append(c); return AttributeName(tag); case '': return NewEof(); } } _stringBuffer.Append(char.ToLower(c)); return AttributeName(tag); } } /* Buffers the lower-cased attribute name; =, >, whitespace or slash ends it, at which point the name is added to the tag via AddAttribute and the buffer is cleared. */ private HtmlToken AttributeName(HtmlTagToken tag) { while (true) { char next = GetNext(); switch (next) { case '=': tag.AddAttribute(_stringBuffer.ToString()); _stringBuffer.Clear(); return AttributeBeforeValue(tag); case '>': tag.AddAttribute(_stringBuffer.ToString()); _stringBuffer.Clear(); return EmitTag(tag); } if (next.IsSpaceCharacter()) { tag.AddAttribute(_stringBuffer.ToString()); _stringBuffer.Clear(); return AttributeAfterName(tag); } if (next == '/') break; if (next.IsUppercaseAscii()) _stringBuffer.Append(char.ToLower(next)); else { switch (next) { case '"': case '\'': case '<': RaiseErrorOccurred(HtmlParseError.AttributeNameInvalid); _stringBuffer.Append(next); break; case '': RaiseErrorOccurred(HtmlParseError.Null); _stringBuffer.Append('�'); break; default: _stringBuffer.Append(next); break; case '': return NewEof(); } } } tag.AddAttribute(_stringBuffer.ToString()); _stringBuffer.Clear(); return TagSelfClosing(tag); } /* After an attribute name and whitespace: = introduces a value, > emits the tag, slash checks for self-closing, any other character starts the next attribute name. */ private HtmlToken AttributeAfterName(HtmlTagToken tag) { char c = SkipSpaces(); switch (c) { case '>': return EmitTag(tag); case '=': return AttributeBeforeValue(tag); case '/': return TagSelfClosing(tag); default: if (!c.IsUppercaseAscii()) { switch (c) { case '"': case '\'': case '<': RaiseErrorOccurred(HtmlParseError.AttributeNameInvalid); _stringBuffer.Append(c); return AttributeName(tag); case '': RaiseErrorOccurred(HtmlParseError.Null); _stringBuffer.Append('�'); return AttributeName(tag); default: _stringBuffer.Append(c); return AttributeName(tag); case '': return NewEof(); } } _stringBuffer.Append(char.ToLower(c)); return AttributeName(tag); } } /* After =: skips whitespace and selects the double-quoted, single-quoted or unquoted value state; > here reports TagClosedWrong and emits the tag. */ private HtmlToken AttributeBeforeValue(HtmlTagToken tag) { char c = SkipSpaces(); switch (c) { case '"': return AttributeDoubleQuotedValue(tag); case '\'': return AttributeSingleQuotedValue(tag); case '&': return AttributeUnquotedValue(c, tag); case '>': RaiseErrorOccurred(HtmlParseError.TagClosedWrong); return EmitTag(tag); case '<': case '=': case '`': RaiseErrorOccurred(HtmlParseError.AttributeValueInvalid); _stringBuffer.Append(c); return AttributeUnquotedValue(GetNext(), tag); case '': RaiseErrorOccurred(HtmlParseError.Null); _stringBuffer.Append('�'); return AttributeUnquotedValue(GetNext(), tag); default: _stringBuffer.Append(c); return AttributeUnquotedValue(GetNext(), tag); case '': return NewEof(); } } /* Buffers the value up to the closing double quote, resolving & references with the double quote as the extra allowed character, then stores it with SetAttributeValue. */ private HtmlToken AttributeDoubleQuotedValue(HtmlTagToken tag) { while (true) { char next = GetNext(); switch (next) { case '"': tag.SetAttributeValue(_stringBuffer.ToString()); _stringBuffer.Clear(); return AttributeAfterValue(tag); case '&': AppendCharacterReference(GetNext(), '"'); break; case '': RaiseErrorOccurred(HtmlParseError.Null); _stringBuffer.Append('�'); break; default: _stringBuffer.Append(next); break; case '': return NewEof(); } } } /* Single-quoted counterpart of AttributeDoubleQuotedValue. */ private HtmlToken AttributeSingleQuotedValue(HtmlTagToken tag) { while (true) { char next = GetNext(); switch (next) { case '\'': tag.SetAttributeValue(_stringBuffer.ToString()); _stringBuffer.Clear(); return AttributeAfterValue(tag); case '&': AppendCharacterReference(GetNext(), '\''); break; case '': RaiseErrorOccurred(HtmlParseError.Null); _stringBuffer.Append('�'); break; default: _stringBuffer.Append(next); break; case '': return NewEof(); } } } /* Unquoted value: ends at > (emit the tag) or whitespace (next attribute); quote, <, = and backtick characters are kept but reported as AttributeValueInvalid. */ private HtmlToken AttributeUnquotedValue(char c, HtmlTagToken tag) { while (true) { if (c == '>') { tag.SetAttributeValue(_stringBuffer.ToString()); _stringBuffer.Clear(); return EmitTag(tag); } if (c.IsSpaceCharacter()) break; switch (c) { 
case '&': AppendCharacterReference(GetNext(), '>'); break; case '': RaiseErrorOccurred(HtmlParseError.Null); _stringBuffer.Append('�'); break; case '"': case '\'': case '<': case '=': case '`': RaiseErrorOccurred(HtmlParseError.AttributeValueInvalid); _stringBuffer.Append(c); break; default: _stringBuffer.Append(c); break; case '': return NewEof(); } c = GetNext(); } tag.SetAttributeValue(_stringBuffer.ToString()); _stringBuffer.Clear(); return AttributeBeforeName(tag); } /* After a quoted value: > emits the tag, whitespace or slash continue normally; any other character reports AttributeNameExpected, is pushed back and is read as a new attribute. */ private HtmlToken AttributeAfterValue(HtmlTagToken tag) { char next = GetNext(); if (next == '>') return EmitTag(tag); if (!next.IsSpaceCharacter()) { switch (next) { case '/': return TagSelfClosing(tag); case '': return NewEof(); default: RaiseErrorOccurred(HtmlParseError.AttributeNameExpected); Back(); return AttributeBeforeName(tag); } } return AttributeBeforeName(tag); } /* Script entry point: buffers script text until < (handed to ScriptDataLt) or end of input. */ private HtmlToken ScriptData(char c) { while (true) { switch (c) { case '': RaiseErrorOccurred(HtmlParseError.Null); _stringBuffer.Append('�'); break; case '<': return ScriptDataLt(GetNext()); case '': Back(); return NewCharacter(); default: _stringBuffer.Append(c); break; } c = GetNext(); } } /* After < in script data: buffers the <; a slash plus a letter starts a candidate end tag, and ! followed by a dash starts the escape sequence; otherwise script text continues. */ private HtmlToken ScriptDataLt(char c) { _stringBuffer.Append('<'); switch (c) { case '/': { c = GetNext(); int length = _stringBuffer.Append('/').Length; if (c.IsLetter()) { _stringBuffer.Append(c); return ScriptDataNameEndTag(NewTagClose(), length); } break; } case '!': _stringBuffer.Append('!'); c = GetNext(); if (c == '-') return ScriptDataEscapeDashLt(GetNext()); break; } return ScriptData(c); } /* offset is where the candidate name starts in _stringBuffer. When the name equals _lastStartTag (ordinal, ignoring case) and is followed by whitespace, slash or >: if text precedes the </ (offset > 2) the input is rewound and that text is emitted first, otherwise the buffer is cleared and the end tag is continued. */ private HtmlToken ScriptDataNameEndTag(HtmlTagToken tag, int offset) { int length = _lastStartTag.Length; char next; while (true) { next = GetNext(); bool flag = next.IsSpaceCharacter(); bool flag2 = next == '>'; bool flag3 = next == '/'; if (_stringBuffer.Length - offset == length && (flag || flag2 || flag3)) { string text = _stringBuffer.ToString(offset, length); if (text.Equals(_lastStartTag, StringComparison.OrdinalIgnoreCase)) { if 
(offset > 2) { Back(3 + length); _stringBuffer.Remove(offset - 2, length + 2); return NewCharacter(); } _stringBuffer.Clear(); if (flag) { tag.Name = _lastStartTag; return AttributeBeforeName(tag); } if (flag3) { tag.Name = _lastStartTag; return TagSelfClosing(tag); } if (flag2) { tag.Name = _lastStartTag; return EmitTag(tag); } } } if (!next.IsLetter()) break; _stringBuffer.Append(next); } return ScriptData(next); } /* After <!- in script data: buffers the dash; a second dash enters the escaped dash-dash state, anything else returns to ScriptData. */ private HtmlToken ScriptDataEscapeDashLt(char c) { _stringBuffer.Append('-'); if (c == '-') { _stringBuffer.Append('-'); return ScriptDataEscapedDashDash(); } return ScriptData(c); } /* Escaped script data: dash and < have their own states; the default branch hands the character to ScriptData. */ private HtmlToken ScriptDataEscaped(char c) { while (true) { switch (c) { case '-': _stringBuffer.Append('-'); return ScriptDataEscapedDash(GetNext()); case '<': return ScriptDataEscapedLT(GetNext()); case '': break; case '': Back(); return NewCharacter(); default: return ScriptData(c); } RaiseErrorOccurred(HtmlParseError.Null); _stringBuffer.Append('�'); c = GetNext(); } } /* After one dash in escaped script data. */ private HtmlToken ScriptDataEscapedDash(char c) { switch (c) { case '-': _stringBuffer.Append('-'); return ScriptDataEscapedDashDash(); case '<': return ScriptDataEscapedLT(GetNext()); case '': RaiseErrorOccurred(HtmlParseError.Null); _stringBuffer.Append('�'); break; case '': Back(); return NewCharacter(); default: _stringBuffer.Append(c); break; } return ScriptDataEscaped(GetNext()); } /* After two dashes in escaped script data: further dashes are buffered, > returns to ScriptData. NOTE(review): the end-of-input branch returns NewCharacter() without the Back() call that sibling states make; confirm this is intended. */ private HtmlToken ScriptDataEscapedDashDash() { while (true) { char next = GetNext(); switch (next) { case '-': break; case '<': return ScriptDataEscapedLT(GetNext()); case '>': _stringBuffer.Append('>'); return ScriptData(GetNext()); case '': RaiseErrorOccurred(HtmlParseError.Null); _stringBuffer.Append('�'); return ScriptDataEscaped(GetNext()); case '': return NewCharacter(); default: _stringBuffer.Append(next); return ScriptDataEscaped(GetNext()); } _stringBuffer.Append('-'); } } /* After < in escaped script data: slash starts a candidate end tag, a letter starts a candidate double-escape start tag; otherwise the < is buffered as text. */ private HtmlToken ScriptDataEscapedLT(char c) { if (c == '/') return ScriptDataEscapedEndTag(GetNext()); if (c.IsLetter()) { int length = 
_stringBuffer.Append('<').Length; _stringBuffer.Append(c); return ScriptDataStartDoubleEscape(length); } _stringBuffer.Append('<'); return ScriptDataEscaped(c); } /* After </ in escaped script data: buffers </ and, when a letter follows, starts matching an end-tag name. */ private HtmlToken ScriptDataEscapedEndTag(char c) { int length = _stringBuffer.Append('<').Append('/').Length; if (c.IsLetter()) { _stringBuffer.Append(c); return ScriptDataEscapedNameEndTag(NewTagClose(), length); } return ScriptDataEscaped(c); } /* When the buffered name equals Tags.Script (ordinal, ignoring case) and is followed by slash, > or whitespace, rewinds the input past the end tag, removes it from the buffer and emits the preceding text; the tag argument is not used in the code shown. */ private HtmlToken ScriptDataEscapedNameEndTag(HtmlTagToken tag, int offset) { int length = Tags.Script.Length; char next; while (true) { next = GetNext(); if (_stringBuffer.Length - offset == length && (next == '/' || next == '>' || next.IsSpaceCharacter()) && _stringBuffer.ToString(offset, length).Equals(Tags.Script, StringComparison.OrdinalIgnoreCase)) { Back(length + 3); _stringBuffer.Remove(offset - 2, length + 2); return NewCharacter(); } if (!next.IsLetter()) break; _stringBuffer.Append(next); } return ScriptDataEscaped(next); } /* Matches a start-tag name inside escaped script data; a name equal to Tags.Script switches to the double-escaped state, any other name stays escaped. */ private HtmlToken ScriptDataStartDoubleEscape(int offset) { int length = Tags.Script.Length; char next; while (true) { next = GetNext(); if (_stringBuffer.Length - offset == length && (next == '/' || next == '>' || next.IsSpaceCharacter())) { bool flag = _stringBuffer.ToString(offset, length).Equals(Tags.Script, StringComparison.OrdinalIgnoreCase); _stringBuffer.Append(next); if (!flag) return ScriptDataEscaped(GetNext()); return ScriptDataEscapedDouble(GetNext()); } if (!next.IsLetter()) break; _stringBuffer.Append(next); } return ScriptDataEscaped(next); } /* Double-escaped script data. NOTE(review): the Null-error branch appends the replacement character and then falls through to the Append(c) after the switch, so the original character is buffered as well; confirm whether that is intended. */ private HtmlToken ScriptDataEscapedDouble(char c) { while (true) { switch (c) { case '-': _stringBuffer.Append('-'); return ScriptDataEscapedDoubleDash(GetNext()); case '<': _stringBuffer.Append('<'); return ScriptDataEscapedDoubleLT(GetNext()); case '': RaiseErrorOccurred(HtmlParseError.Null); _stringBuffer.Append('�'); break; case '': RaiseErrorOccurred(HtmlParseError.EOF); Back(); return NewCharacter(); } _stringBuffer.Append(c); c = GetNext(); } } /* After one dash in double-escaped script data; the Null-error branch substitutes the replacement character before handing it to ScriptDataEscapedDouble. */ private HtmlToken 
ScriptDataEscapedDoubleDash(char c) { switch (c) { case '-': _stringBuffer.Append('-'); return ScriptDataEscapedDoubleDashDash(); case '<': _stringBuffer.Append('<'); return ScriptDataEscapedDoubleLT(GetNext()); case '': RaiseErrorOccurred(HtmlParseError.Null); c = '�'; break; case '': RaiseErrorOccurred(HtmlParseError.EOF); Back(); return NewCharacter(); } return ScriptDataEscapedDouble(c); } /* After two dashes in double-escaped script data: further dashes are buffered, > returns to ScriptData. */ private HtmlToken ScriptDataEscapedDoubleDashDash() { while (true) { char next = GetNext(); switch (next) { case '-': break; case '<': _stringBuffer.Append('<'); return ScriptDataEscapedDoubleLT(GetNext()); case '>': _stringBuffer.Append('>'); return ScriptData(GetNext()); case '': RaiseErrorOccurred(HtmlParseError.Null); _stringBuffer.Append('�'); return ScriptDataEscapedDouble(GetNext()); case '': RaiseErrorOccurred(HtmlParseError.EOF); Back(); return NewCharacter(); default: _stringBuffer.Append(next); return ScriptDataEscapedDouble(GetNext()); } _stringBuffer.Append('-'); } } /* After < in double-escaped script data: a slash starts matching the name that ends the double escape. */ private HtmlToken ScriptDataEscapedDoubleLT(char c) { if (c == '/') { int length = _stringBuffer.Append('/').Length; return ScriptDataEndDoubleEscape(length); } return ScriptDataEscapedDouble(c); } /* Matches an end-tag name in double-escaped script data; a name equal to Tags.Script drops back to the escaped state, any other name stays double-escaped. */ private HtmlToken ScriptDataEndDoubleEscape(int offset) { int length = Tags.Script.Length; char next; while (true) { next = GetNext(); if (_stringBuffer.Length - offset == length && (next.IsSpaceCharacter() || next == '/' || next == '>')) { bool flag = _stringBuffer.ToString(offset, length).Equals(Tags.Script, StringComparison.OrdinalIgnoreCase); _stringBuffer.Append(next); if (!flag) return ScriptDataEscapedDouble(GetNext()); return ScriptDataEscaped(GetNext()); } if (!next.IsLetter()) break; _stringBuffer.Append(next); } return ScriptDataEscapedDouble(next); } /* Emits the buffered text as a Character token stamped with _position and clears the buffer. */ private HtmlToken NewCharacter() { string name = _stringBuffer.ToString(); _stringBuffer.Clear(); return new HtmlToken(HtmlTokenType.Character, _position, name); } private HtmlToken NewComment() { string name = _stringBuffer.ToString(); 
_stringBuffer.Clear(); return new HtmlToken(HtmlTokenType.Comment, _position, name); } private HtmlToken NewEof() { return new HtmlToken(HtmlTokenType.EndOfFile, _position); } private HtmlDoctypeToken NewDoctype(bool quirksForced) { return new HtmlDoctypeToken(quirksForced, _position); } private HtmlTagToken NewTagOpen() { return new HtmlTagToken(HtmlTokenType.StartTag, _position); } private HtmlTagToken NewTagClose() { return new HtmlTagToken(HtmlTokenType.EndTag, _position); } private HtmlToken CreateIfAppropriate(char c) { bool flag = c.IsSpaceCharacter(); bool flag2 = c == '>'; bool flag3 = c == '/'; if (_stringBuffer.Length == _lastStartTag.Length && (flag || flag2 || flag3) && _stringBuffer.ToString().Equals(_lastStartTag, StringComparison.Ordinal)) { HtmlTagToken htmlTagToken = NewTagClose(); _stringBuffer.Clear(); if (flag) { htmlTagToken.Name = _lastStartTag; return AttributeBeforeName(htmlTagToken); } if (flag3) { htmlTagToken.Name = _lastStartTag; return TagSelfClosing(htmlTagToken); } if (flag2) { htmlTagToken.Name = _lastStartTag; return EmitTag(htmlTagToken); } } return null; } private HtmlToken EmitTag(HtmlTagToken tag) { List<KeyValuePair<string, string>> attributes = tag.Attributes; _state = HtmlParseMode.PCData; switch (tag.Type) { case HtmlTokenType.StartTag: for (int num = attributes.Count - 1; num > 0; num--) { for (int num2 = num - 1; num2 >= 0; num2--) { if (attributes[num2].Key == attributes[num].Key) { attributes.RemoveAt(num); RaiseErrorOccurred(HtmlParseError.AttributeDuplicateOmitted, tag.Position); break; } } } _lastStartTag = tag.Name; break; case HtmlTokenType.EndTag: if (tag.IsSelfClosing) RaiseErrorOccurred(HtmlParseError.EndTagCannotBeSelfClosed, tag.Position); if (attributes.Count != 0) RaiseErrorOccurred(HtmlParseError.EndTagCannotHaveAttributes, tag.Position); break; } return tag; } } }