AngleSharp by AngleSharp

<PackageReference Include="AngleSharp" Version="0.9.9.1" />

 HtmlTokenizer

sealed class HtmlTokenizer : BaseTokenizer
Performs the tokenization of the source code. Follows the tokenization algorithm at: http://www.w3.org/html/wg/drafts/html/master/syntax.html
using AngleSharp.Dom.Events;
using AngleSharp.Extensions;
using AngleSharp.Html;
using AngleSharp.Services;
using System;
using System.Collections.Generic;

namespace AngleSharp.Parser.Html
{
    /// <summary>
    /// Tokenizer for HTML source text. Each call to <see cref="Get"/> yields one token,
    /// dispatching on the current <see cref="State"/>.
    /// </summary>
    internal sealed class HtmlTokenizer : BaseTokenizer
    {
        /// <summary>States of the attribute loop inside ParseAttributes.</summary>
        private enum AttributeState : byte
        {
            BeforeName,
            Name,
            AfterName,
            BeforeValue,
            QuotedValue,
            AfterValue,
            UnquotedValue,
            SelfClose
        }

        /// <summary>States of the script-data loop inside ScriptData.</summary>
        private enum ScriptState : byte
        {
            Normal,
            OpenTag,
            EndTag,
            StartEscape,
            Escaped,
            StartEscapeDash,
            EscapedDash,
            EscapedDashDash,
            EscapedOpenTag,
            EscapedEndTag,
            EscapedNameEndTag,
            StartDoubleEscape,
            EscapedDouble,
            EscapedDoubleDash,
            EscapedDoubleDashDash,
            EscapedDoubleOpenTag,
            EndDoubleEscape
        }

        private readonly IEntityProvider _resolver;

        private string _lastStartTag;

        private TextPosition _position;

        /// <summary>Gets or sets whether a CDATA section is recognised in a markup declaration.</summary>
        public bool IsAcceptingCharacterData { get; set; }

        /// <summary>Gets or sets the parse mode that selects the entry routine used by <see cref="Get"/>.</summary>
        public HtmlParseMode State { get; set; }

        /// <summary>Gets or sets whether a parse error throws instead of raising <see cref="Error"/>.</summary>
        public bool IsStrictMode { get; set; }

        /// <summary>Raised for every reported parse error when strict mode is off.</summary>
        public event EventHandler<HtmlErrorEvent> Error;

        /// <summary>
        /// Creates a tokenizer over <paramref name="source"/> that resolves named
        /// character references through <paramref name="resolver"/>.
        /// </summary>
        public HtmlTokenizer(TextSource source, IEntityProvider resolver)
            : base(source)
        {
            State = HtmlParseMode.PCData;
            IsAcceptingCharacterData = false;
            IsStrictMode = false;
            _lastStartTag = string.Empty;
            _resolver = resolver;
        }

        /// <summary>
        /// Reads the next token. Returns an end-of-file token when the next character is
        /// U+FFFF or when <see cref="State"/> matches none of the handled modes.
        /// </summary>
        public HtmlToken Get()
        {
            char current = GetNext();
            _position = GetCurrentPosition();

            if (current == '\uffff')
            {
                return NewEof(true);
            }

            switch (State)
            {
                case HtmlParseMode.PCData:
                    return Data(current);
                case HtmlParseMode.RCData:
                    return RCData(current);
                case HtmlParseMode.Plaintext:
                    return Plaintext(current);
                case HtmlParseMode.Rawtext:
                    return Rawtext(current);
                case HtmlParseMode.Script:
                    return ScriptData(current);
                default:
                    return NewEof(true);
            }
        }

        /// <summary>
        /// Reports a parse error: throws <see cref="HtmlParseException"/> in strict mode,
        /// otherwise notifies the subscribers of <see cref="Error"/> (if any).
        /// </summary>
        internal void RaiseErrorOccurred(HtmlParseError code, TextPosition position)
        {
            EventHandler<HtmlErrorEvent> handler = this.Error;

            if (IsStrictMode)
            {
                throw new HtmlParseException(code.GetCode(), "Error while parsing the provided HTML document.", position);
            }

            if (handler == null)
            {
                return;
            }

            handler(this, new HtmlErrorEvent(code, position));
        }

        /// <summary>Entry routine for PCData mode: plain text unless the character is '&lt;'.</summary>
        private HtmlToken Data(char c)
        {
            if (c != '<')
// Remainder of Data(char): the first return is the body of the preceding `if (c != '<')`.
return DataText(c);

            return TagOpen(GetNext());
        }

        /// <summary>
        /// Buffers PCData text until '&lt;' or end of input (U+FFFF), then emits a character token.
        /// '&amp;' starts a character reference; a NUL is reported as an error and not buffered.
        /// </summary>
        private HtmlToken DataText(char c)
        {
            while (true)
            {
                switch (c)
                {
                    case '<':
                    case '\uffff':
                        Back();
                        return NewCharacter();
                    case '&':
                        AppendCharacterReference(GetNext(), '\0');
                        break;
                    case '\0':
                        RaiseErrorOccurred(HtmlParseError.Null);
                        break;
                    default:
                        base.StringBuffer.Append(c);
                        break;
                }

                c = GetNext();
            }
        }

        /// <summary>
        /// Buffers everything up to end of input as text; NUL goes through AppendReplacement().
        /// </summary>
        private HtmlToken Plaintext(char c)
        {
            while (true)
            {
                switch (c)
                {
                    case '\0':
                        AppendReplacement();
                        break;
                    case '\uffff':
                        Back();
                        return NewCharacter();
                    default:
                        base.StringBuffer.Append(c);
                        break;
                }

                c = GetNext();
            }
        }

        /// <summary>Entry routine for RCData mode: plain text unless the character is '&lt;'.</summary>
        private HtmlToken RCData(char c)
        {
            if (c != '<')
            {
                return RCDataText(c);
            }

            return RCDataLt(GetNext());
        }

        /// <summary>
        /// Buffers RCData text until '&lt;' or end of input. '&amp;' starts a character
        /// reference; NUL goes through AppendReplacement().
        /// </summary>
        private HtmlToken RCDataText(char c)
        {
            while (true)
            {
                switch (c)
                {
                    case '&':
                        AppendCharacterReference(GetNext(), '\0');
                        break;
                    case '<':
                    case '\uffff':
                        Back();
                        return NewCharacter();
                    case '\0':
                        AppendReplacement();
                        break;
                    default:
                        base.StringBuffer.Append(c);
                        break;
                }

                c = GetNext();
            }
        }

        /// <summary>
        /// Handles the character after '&lt;' in RCData mode. Only "&lt;/" followed by an ASCII
        /// letter may begin an end tag; anything else is put back into the text.
        /// </summary>
        private HtmlToken RCDataLt(char c)
        {
            if (c == '/')
            {
                c = GetNext();

                if (c.IsUppercaseAscii())
                {
                    base.StringBuffer.Append(char.ToLowerInvariant(c));
                    return RCDataNameEndTag(GetNext());
                }

                if (c.IsLowercaseAscii())
                {
                    base.StringBuffer.Append(c);
                    return RCDataNameEndTag(GetNext());
                }

                base.StringBuffer.Append('<').Append('/');
                return RCDataText(c);
            }

            base.StringBuffer.Append('<');
            return RCDataText(c);
        }

        /// <summary>
        /// Collects a lower-cased end-tag name in RCData mode. Returns whatever
        /// CreateIfAppropriate yields once it is non-null; on a non-letter the collected
        /// name is turned back into text by prefixing "&lt;/".
        /// </summary>
        private HtmlToken RCDataNameEndTag(char c)
        {
            while (true)
            {
                HtmlToken token = CreateIfAppropriate(c);

                if (token != null)
                {
                    return token;
                }

                if (c.IsUppercaseAscii())
                {
                    base.StringBuffer.Append(char.ToLowerInvariant(c));
                }
                else if (c.IsLowercaseAscii())
                {
                    base.StringBuffer.Append(c);
                }
                else
                {
                    break;
                }

                c = GetNext();
            }

            base.StringBuffer.Insert(0, '<').Insert(1, '/');
            return RCDataText(c);
        }

        /// <summary>Entry routine for Rawtext mode: plain text unless the character is '&lt;'.</summary>
        private HtmlToken Rawtext(char c)
        {
            if (c != '<')
            {
                return RawtextText(c);
            }

            return RawtextLT(GetNext());
        }

        /// <summary>
        /// Buffers Rawtext until '&lt;' or end of input; NUL goes through AppendReplacement().
        /// </summary>
        private HtmlToken RawtextText(char c)
        {
            while (true)
            {
                switch (c)
                {
                    case '<':
                    case '\uffff':
                        Back();
                        return NewCharacter();
                    case '\0':
                        AppendReplacement();
                        break;
                    default:
base.StringBuffer.Append(c);
                        break;
                }

                c = GetNext();
            }
        }

        /// <summary>
        /// Handles the character after '&lt;' in Rawtext mode. Only "&lt;/" followed by an ASCII
        /// letter may begin an end tag; anything else is put back into the text.
        /// </summary>
        private HtmlToken RawtextLT(char c)
        {
            if (c == '/')
            {
                c = GetNext();

                if (c.IsUppercaseAscii())
                {
                    base.StringBuffer.Append(char.ToLowerInvariant(c));
                    return RawtextNameEndTag(GetNext());
                }

                if (c.IsLowercaseAscii())
                {
                    base.StringBuffer.Append(c);
                    return RawtextNameEndTag(GetNext());
                }

                base.StringBuffer.Append('<').Append('/');
                return RawtextText(c);
            }

            base.StringBuffer.Append('<');
            return RawtextText(c);
        }

        /// <summary>
        /// Collects a lower-cased end-tag name in Rawtext mode. Returns whatever
        /// CreateIfAppropriate yields once it is non-null; on a non-letter the collected
        /// name is turned back into text by prefixing "&lt;/".
        /// </summary>
        private HtmlToken RawtextNameEndTag(char c)
        {
            while (true)
            {
                HtmlToken token = CreateIfAppropriate(c);

                if (token != null)
                {
                    return token;
                }

                if (c.IsUppercaseAscii())
                {
                    base.StringBuffer.Append(char.ToLowerInvariant(c));
                }
                else if (c.IsLowercaseAscii())
                {
                    base.StringBuffer.Append(c);
                }
                else
                {
                    break;
                }

                c = GetNext();
            }

            base.StringBuffer.Insert(0, '<').Insert(1, '/');
            return RawtextText(c);
        }

        /// <summary>
        /// Buffers characters until "]]&gt;" or end of input, then emits a character token.
        /// </summary>
        private HtmlToken CharacterData(char c)
        {
            while (true)
            {
                if (c == '\uffff')
                {
                    Back();
                    return NewCharacter();
                }

                if (c == ']' && ContinuesWithSensitive("]]>"))
                {
                    // Skip the remaining two characters of the terminator.
                    Advance(2);
                    return NewCharacter();
                }

                base.StringBuffer.Append(c);
                c = GetNext();
            }
        }

        /// <summary>
        /// Appends the expansion of a character reference whose first character after '&amp;'
        /// is <paramref name="c"/>, or a literal '&amp;' when no reference can be resolved.
        /// </summary>
        /// <param name="c">The character following '&amp;'.</param>
        /// <param name="allowedCharacter">
        /// An extra character that, like whitespace, '&lt;', '&amp;' and end of input, means
        /// "not a reference". The default NUL means no extra character.
        /// </param>
        private void AppendCharacterReference(char c, char allowedCharacter = '\0')
        {
            if (c.IsSpaceCharacter() || c == '<' || c == '\uffff' || c == '&' || c == allowedCharacter)
            {
                Back();
                base.StringBuffer.Append('&');
            }
            else
            {
                string text = null;
                text = ((c != '#') ?
GetLookupCharacterReference(c, allowedCharacter) : GetNumericCharacterReference(GetNext()));

                if (text == null)
                {
                    base.StringBuffer.Append('&');
                }
                else
                {
                    base.StringBuffer.Append(text);
                }
            }
        }

        /// <summary>
        /// Reads a numeric character reference; <paramref name="c"/> is the character after "&amp;#".
        /// </summary>
        /// <returns>
        /// The replacement text, or null when no digit was found (the consumed characters
        /// are given back in that case).
        /// </returns>
        /// <remarks>
        /// The value is accumulated left to right and stops growing once it exceeds
        /// U+10FFFF, so an arbitrarily long digit run can no longer wrap around
        /// <see cref="int"/> and be mistaken for a valid code point. The frozen value is
        /// still above U+10FFFF and is passed to the same HtmlEntityService checks as before.
        /// NOTE(review): this relies on IsInvalidNumber rejecting values above U+10FFFF, as it
        /// already had to for inputs such as "&amp;#x110000;" - confirm.
        /// </remarks>
        private string GetNumericCharacterReference(char c)
        {
            const int MaxCodePoint = 0x10FFFF;

            bool isHex = c == 'x' || c == 'X';
            int radix = isHex ? 16 : 10;
            int digitCount = 0;
            int codePoint = 0;

            if (isHex)
            {
                // Skip the 'x' / 'X' marker.
                c = GetNext();
            }

            while (isHex ? c.IsHex() : c.IsDigit())
            {
                if (codePoint <= MaxCodePoint)
                {
                    // At most 0x10FFFF * 16 + 15 here, which always fits into an int.
                    codePoint = codePoint * radix + c.FromHex();
                }

                digitCount++;
                c = GetNext();
            }

            if (digitCount == 0)
            {
                Back(2);

                if (isHex)
                {
                    Back();
                }

                RaiseErrorOccurred(HtmlParseError.CharacterReferenceWrongNumber);
                return null;
            }

            if (c != ';')
            {
                RaiseErrorOccurred(HtmlParseError.CharacterReferenceSemicolonMissing);
                Back();
            }

            if (HtmlEntityService.IsInCharacterTable(codePoint))
            {
                RaiseErrorOccurred(HtmlParseError.CharacterReferenceInvalidCode);
                return HtmlEntityService.GetSymbolFromTable(codePoint);
            }

            if (HtmlEntityService.IsInvalidNumber(codePoint))
            {
                RaiseErrorOccurred(HtmlParseError.CharacterReferenceInvalidNumber);
                return '\ufffd'.ToString();
            }

            if (HtmlEntityService.IsInInvalidRange(codePoint))
            {
                RaiseErrorOccurred(HtmlParseError.CharacterReferenceInvalidRange);
            }

            return codePoint.ConvertFromUtf32();
        }

        /// <summary>
        /// Resolves a named character reference through the entity provider, trying the
        /// longest collected name first and then shorter prefixes.
        /// </summary>
        private string GetLookupCharacterReference(char c, char allowedCharacter)
        {
            string text = null;
            int insertionPoint = base.InsertionPoint - 1;
            char[] array = new char[32];
            int num = 0;
            char c2 = base.Current;

            while (c2 != ';' && c2.IsName())
            {
                array[num++] = c2;
                c2 = GetNext();

                if (c2 == '\uffff' || num >= 31)
                {
                    break;
                }
            }

            if (c2 == ';')
            {
                array[num] = ';';
                string name = new string(array, 0, num + 1);
                text = _resolver.GetSymbol(name);
            }

            while (text == null && num > 0)
            {
                string name2 = new string(array, 0, num--);
                text = _resolver.GetSymbol(name2);

                if (text == null)
                {
                    Back();
                }
            }

            c2 = base.Current;

            if (c2 != ';')
            {
                if (allowedCharacter != 0 && (c2 == '=' || c2.IsAlphanumericAscii()))
                {
                    if (c2 == '=')
RaiseErrorOccurred(HtmlParseError.CharacterReferenceAttributeEqualsFound);

                    base.InsertionPoint = insertionPoint;
                    return null;
                }

                Back();
                RaiseErrorOccurred(HtmlParseError.CharacterReferenceNotTerminated);
            }

            return text;
        }

        /// <summary>
        /// Handles the character after '&lt;' in PCData mode: end tag, start tag, markup
        /// declaration, bogus comment ('?'), or - with an error - literal text.
        /// </summary>
        private HtmlToken TagOpen(char c)
        {
            if (c == '/')
            {
                return TagEnd(GetNext());
            }

            if (c.IsLowercaseAscii())
            {
                base.StringBuffer.Append(c);
                return TagName(NewTagOpen());
            }

            if (c.IsUppercaseAscii())
            {
                base.StringBuffer.Append(char.ToLowerInvariant(c));
                return TagName(NewTagOpen());
            }

            if (c == '!')
            {
                return MarkupDeclaration(GetNext());
            }

            if (c == '?')
            {
                RaiseErrorOccurred(HtmlParseError.BogusComment);
                return BogusComment(c);
            }

            // Not a tag after all: keep the '<' as text.
            State = HtmlParseMode.PCData;
            RaiseErrorOccurred(HtmlParseError.AmbiguousOpenTag);
            base.StringBuffer.Append('<');
            return DataText(c);
        }

        /// <summary>Handles the character after "&lt;/".</summary>
        private HtmlToken TagEnd(char c)
        {
            if (c.IsLowercaseAscii())
            {
                base.StringBuffer.Append(c);
                return TagName(NewTagClose());
            }

            if (c.IsUppercaseAscii())
            {
                base.StringBuffer.Append(char.ToLowerInvariant(c));
                return TagName(NewTagClose());
            }

            switch (c)
            {
                case '>':
                    State = HtmlParseMode.PCData;
                    RaiseErrorOccurred(HtmlParseError.TagClosedWrong);
                    return Data(GetNext());
                case '\uffff':
                    Back();
                    RaiseErrorOccurred(HtmlParseError.EOF);
                    base.StringBuffer.Append('<').Append('/');
                    return NewCharacter();
                default:
                    RaiseErrorOccurred(HtmlParseError.BogusComment);
                    return BogusComment(c);
            }
        }

        /// <summary>
        /// Collects the lower-cased tag name for <paramref name="tag"/> and continues with
        /// attribute parsing, self-closing handling or tag emission.
        /// </summary>
        private HtmlToken TagName(HtmlTagToken tag)
        {
            while (true)
            {
                char next = GetNext();

                if (next == '>')
                {
                    tag.Name = FlushBuffer();
                    return EmitTag(tag);
                }

                if (next.IsSpaceCharacter())
                {
                    tag.Name = FlushBuffer();
                    return ParseAttributes(tag);
                }

                if (next == '/')
                {
                    break;
                }

                if (next.IsUppercaseAscii())
                {
                    base.StringBuffer.Append(char.ToLowerInvariant(next));
                }
                else if (next == '\0')
                {
                    AppendReplacement();
                }
                else if (next == '\uffff')
                {
                    return NewEof(false);
                }
                else
                {
                    base.StringBuffer.Append(next);
                }
            }

            tag.Name = FlushBuffer();
            return TagSelfClosing(tag);
        }

        /// <summary>
        /// Handles a '/' inside a tag; falls back to attribute parsing when the tag is not
        /// closed right after it.
        /// </summary>
        private HtmlToken TagSelfClosing(HtmlTagToken tag)
        {
            return
TagSelfClosingInner(tag) ?? ParseAttributes(tag);
        }

        /// <summary>
        /// Consumes characters after a '/' in a tag. Returns the emitted self-closing tag on
        /// '&gt;', an end-of-file token on U+FFFF, or null (after giving the character back)
        /// when attribute parsing should continue. Every misplaced '/' is reported.
        /// </summary>
        private HtmlToken TagSelfClosingInner(HtmlTagToken tag)
        {
            while (true)
            {
                switch (GetNext())
                {
                    case '>':
                        tag.IsSelfClosing = true;
                        return EmitTag(tag);
                    case '\uffff':
                        return NewEof(false);
                    case '/':
                        // Another slash: report it and keep looking for '>'.
                        RaiseErrorOccurred(HtmlParseError.ClosingSlashMisplaced);
                        break;
                    default:
                        RaiseErrorOccurred(HtmlParseError.ClosingSlashMisplaced);
                        Back();
                        return null;
                }
            }
        }

        /// <summary>
        /// Handles "&lt;!": comment, DOCTYPE, CDATA section (only when
        /// IsAcceptingCharacterData is set) or, with an error, a bogus comment.
        /// </summary>
        private HtmlToken MarkupDeclaration(char c)
        {
            if (ContinuesWithSensitive("--"))
            {
                Advance();
                return CommentStart(GetNext());
            }

            if (ContinuesWithInsensitive(TagNames.Doctype))
            {
                Advance(6);
                return Doctype(GetNext());
            }

            if (IsAcceptingCharacterData && ContinuesWithSensitive(Keywords.CData))
            {
                Advance(6);
                return CharacterData(GetNext());
            }

            RaiseErrorOccurred(HtmlParseError.UndefinedMarkupDeclaration);
            return BogusComment(c);
        }

        /// <summary>
        /// Reads a bogus comment up to '&gt;' or end of input, storing NUL as U+FFFD.
        /// </summary>
        private HtmlToken BogusComment(char c)
        {
            base.StringBuffer.Clear();

            while (true)
            {
                if (c == '\uffff')
                {
                    Back();
                    State = HtmlParseMode.PCData;
                    return NewComment();
                }

                if (c == '>')
                {
                    State = HtmlParseMode.PCData;
                    return NewComment();
                }

                if (c == '\0')
                {
                    c = '\ufffd';
                }

                base.StringBuffer.Append(c);
                c = GetNext();
            }
        }

        /// <summary>Handles the first character after "&lt;!--".</summary>
        private HtmlToken CommentStart(char c)
        {
            base.StringBuffer.Clear();

            switch (c)
            {
                case '-':
                    return CommentDashStart(GetNext()) ??
Comment(GetNext());
                case '\0':
                    AppendReplacement();
                    return Comment(GetNext());
                case '>':
                    State = HtmlParseMode.PCData;
                    RaiseErrorOccurred(HtmlParseError.TagClosedWrong);
                    break;
                case '\uffff':
                    RaiseErrorOccurred(HtmlParseError.EOF);
                    Back();
                    break;
                default:
                    base.StringBuffer.Append(c);
                    return Comment(GetNext());
            }

            return NewComment();
        }

        /// <summary>Handles the character after "&lt;!---" (comment start followed by a dash).</summary>
        private HtmlToken CommentDashStart(char c)
        {
            switch (c)
            {
                case '-':
                    return CommentEnd(GetNext());
                case '\0':
                    RaiseErrorOccurred(HtmlParseError.Null);
                    base.StringBuffer.Append('-').Append('\ufffd');
                    return Comment(GetNext());
                case '>':
                    State = HtmlParseMode.PCData;
                    RaiseErrorOccurred(HtmlParseError.TagClosedWrong);
                    return NewComment();
                case '\uffff':
                    RaiseErrorOccurred(HtmlParseError.EOF);
                    Back();
                    return NewComment();
                default:
                    base.StringBuffer.Append('-').Append(c);
                    return Comment(GetNext());
            }
        }

        /// <summary>
        /// Reads comment content until a closing sequence produces a token or the input ends.
        /// </summary>
        private HtmlToken Comment(char c)
        {
            while (true)
            {
                if (c == '-')
                {
                    // A non-null result means the comment has been completed.
                    HtmlToken finished = CommentDashEnd(GetNext());

                    if (finished != null)
                    {
                        return finished;
                    }
                }
                else if (c == '\uffff')
                {
                    RaiseErrorOccurred(HtmlParseError.EOF);
                    Back();
                    return NewComment();
                }
                else if (c == '\0')
                {
                    AppendReplacement();
                }
                else
                {
                    base.StringBuffer.Append(c);
                }

                c = GetNext();
            }
        }

        /// <summary>
        /// Handles the character after a single '-' inside a comment. Returns null when the
        /// comment continues (the dash and the character are buffered).
        /// </summary>
        private HtmlToken CommentDashEnd(char c)
        {
            switch (c)
            {
                case '-':
                    return CommentEnd(GetNext());
                case '\uffff':
                    RaiseErrorOccurred(HtmlParseError.EOF);
                    Back();
                    return NewComment();
                case '\0':
                    RaiseErrorOccurred(HtmlParseError.Null);
                    c = '\ufffd';
                    break;
            }

            base.StringBuffer.Append('-').Append(c);
            return null;
        }

        /// <summary>
        /// Handles the characters after "--" inside a comment. Returns null when the
        /// comment continues.
        /// </summary>
        private HtmlToken CommentEnd(char c)
        {
            while (true)
            {
                switch (c)
                {
                    case '>':
                        State = HtmlParseMode.PCData;
                        return NewComment();
                    case '\0':
                        RaiseErrorOccurred(HtmlParseError.Null);
                        base.StringBuffer.Append('-').Append('\ufffd');
                        return null;
                    case '!':
                        RaiseErrorOccurred(HtmlParseError.CommentEndedWithEM);
                        return CommentBangEnd(GetNext());
                    case '-':
                        break;
                    case '\uffff':
                        RaiseErrorOccurred(HtmlParseError.EOF);
                        Back();
                        return NewComment();
                    default:
                        RaiseErrorOccurred(HtmlParseError.CommentEndedUnexpected);
base.StringBuffer.Append('-').Append('-').Append(c);
                        return null;
                }

                // Only reached for an additional '-': report it, keep it, and look again.
                RaiseErrorOccurred(HtmlParseError.CommentEndedWithDash);
                base.StringBuffer.Append('-');
                c = GetNext();
            }
        }

        /// <summary>
        /// Handles the character after "--!" inside a comment. Returns null when the
        /// comment continues.
        /// </summary>
        private HtmlToken CommentBangEnd(char c)
        {
            switch (c)
            {
                case '-':
                    base.StringBuffer.Append('-').Append('-').Append('!');
                    return CommentDashEnd(GetNext());
                case '>':
                    State = HtmlParseMode.PCData;
                    return NewComment();
                case '\0':
                    RaiseErrorOccurred(HtmlParseError.Null);
                    base.StringBuffer.Append('-').Append('-').Append('!').Append('\ufffd');
                    return null;
                case '\uffff':
                    RaiseErrorOccurred(HtmlParseError.EOF);
                    Back();
                    return NewComment();
                default:
                    base.StringBuffer.Append('-').Append('-').Append('!').Append(c);
                    return null;
            }
        }

        /// <summary>Handles the character right after the DOCTYPE keyword.</summary>
        private HtmlToken Doctype(char c)
        {
            if (c.IsSpaceCharacter())
            {
                return DoctypeNameBefore(GetNext());
            }

            if (c == '\uffff')
            {
                RaiseErrorOccurred(HtmlParseError.EOF);
                Back();
                return NewDoctype(true);
            }

            RaiseErrorOccurred(HtmlParseError.DoctypeUnexpected);
            return DoctypeNameBefore(c);
        }

        /// <summary>
        /// Skips whitespace before the DOCTYPE name and creates the doctype token. The
        /// token is created with the flag passed to NewDoctype set to true when the
        /// declaration ends ('&gt;' or end of input) before any name character.
        /// </summary>
        private HtmlToken DoctypeNameBefore(char c)
        {
            while (c.IsSpaceCharacter())
            {
                c = GetNext();
            }

            if (c.IsUppercaseAscii())
            {
                HtmlDoctypeToken named = NewDoctype(false);
                base.StringBuffer.Append(char.ToLowerInvariant(c));
                return DoctypeName(named);
            }

            switch (c)
            {
                case '\0':
                {
                    HtmlDoctypeToken replaced = NewDoctype(false);
                    AppendReplacement();
                    return DoctypeName(replaced);
                }
                case '>':
                {
                    HtmlDoctypeToken closed = NewDoctype(true);
                    State = HtmlParseMode.PCData;
                    RaiseErrorOccurred(HtmlParseError.TagClosedWrong);
                    return closed;
                }
                case '\uffff':
                {
                    HtmlDoctypeToken truncated = NewDoctype(true);
                    RaiseErrorOccurred(HtmlParseError.EOF);
                    Back();
                    return truncated;
                }
                default:
                {
                    HtmlDoctypeToken plain = NewDoctype(false);
                    base.StringBuffer.Append(c);
                    return DoctypeName(plain);
                }
            }
        }

        /// <summary>Collects the lower-cased DOCTYPE name.</summary>
        private HtmlToken DoctypeName(HtmlDoctypeToken doctype)
        {
            while (true)
            {
                char next = GetNext();

                if (next.IsSpaceCharacter())
                {
                    doctype.Name = FlushBuffer();
                    return DoctypeNameAfter(doctype);
                }

                if (next == '>')
                {
                    State = HtmlParseMode.PCData;
                    doctype.Name =
FlushBuffer();
                    break;
                }

                if (next.IsUppercaseAscii())
                {
                    base.StringBuffer.Append(char.ToLowerInvariant(next));
                }
                else if (next == '\0')
                {
                    AppendReplacement();
                }
                else if (next == '\uffff')
                {
                    RaiseErrorOccurred(HtmlParseError.EOF);
                    Back();
                    doctype.IsQuirksForced = true;
                    doctype.Name = FlushBuffer();
                    break;
                }
                else
                {
                    base.StringBuffer.Append(next);
                }
            }

            return doctype;
        }

        /// <summary>
        /// Handles what follows the DOCTYPE name: '&gt;', end of input, the PUBLIC or SYSTEM
        /// keyword, or - with an error - a bogus doctype.
        /// </summary>
        private HtmlToken DoctypeNameAfter(HtmlDoctypeToken doctype)
        {
            switch (SkipSpaces())
            {
                case '>':
                    State = HtmlParseMode.PCData;
                    return doctype;
                case '\uffff':
                    RaiseErrorOccurred(HtmlParseError.EOF);
                    Back();
                    doctype.IsQuirksForced = true;
                    return doctype;
            }

            if (ContinuesWithInsensitive(Keywords.Public))
            {
                Advance(5);
                return DoctypePublic(doctype);
            }

            if (ContinuesWithInsensitive(Keywords.System))
            {
                Advance(5);
                return DoctypeSystem(doctype);
            }

            RaiseErrorOccurred(HtmlParseError.DoctypeUnexpectedAfterName);
            doctype.IsQuirksForced = true;
            return BogusDoctype(doctype);
        }

        /// <summary>Handles the character right after the PUBLIC keyword.</summary>
        private HtmlToken DoctypePublic(HtmlDoctypeToken doctype)
        {
            char next = GetNext();

            if (next.IsSpaceCharacter())
            {
                return DoctypePublicIdentifierBefore(doctype);
            }

            switch (next)
            {
                case '"':
                    RaiseErrorOccurred(HtmlParseError.DoubleQuotationMarkUnexpected);
                    doctype.PublicIdentifier = string.Empty;
                    return DoctypePublicIdentifierDoubleQuoted(doctype);
                case '\'':
                    RaiseErrorOccurred(HtmlParseError.SingleQuotationMarkUnexpected);
                    doctype.PublicIdentifier = string.Empty;
                    return DoctypePublicIdentifierSingleQuoted(doctype);
                case '>':
                    State = HtmlParseMode.PCData;
                    RaiseErrorOccurred(HtmlParseError.TagClosedWrong);
                    doctype.IsQuirksForced = true;
                    return doctype;
                case '\uffff':
                    RaiseErrorOccurred(HtmlParseError.EOF);
                    doctype.IsQuirksForced = true;
                    Back();
                    return doctype;
                default:
                    RaiseErrorOccurred(HtmlParseError.DoctypePublicInvalid);
                    doctype.IsQuirksForced = true;
                    return BogusDoctype(doctype);
            }
        }

        /// <summary>Skips whitespace and handles the start of the public identifier.</summary>
        private HtmlToken DoctypePublicIdentifierBefore(HtmlDoctypeToken doctype)
        {
            switch (SkipSpaces())
            {
                case '"':
doctype.PublicIdentifier = string.Empty;
                    return DoctypePublicIdentifierDoubleQuoted(doctype);
                case '\'':
                    doctype.PublicIdentifier = string.Empty;
                    return DoctypePublicIdentifierSingleQuoted(doctype);
                case '>':
                    State = HtmlParseMode.PCData;
                    RaiseErrorOccurred(HtmlParseError.TagClosedWrong);
                    doctype.IsQuirksForced = true;
                    break;
                case '\uffff':
                    RaiseErrorOccurred(HtmlParseError.EOF);
                    doctype.IsQuirksForced = true;
                    Back();
                    break;
                default:
                    RaiseErrorOccurred(HtmlParseError.DoctypePublicInvalid);
                    doctype.IsQuirksForced = true;
                    return BogusDoctype(doctype);
            }

            return doctype;
        }

        /// <summary>
        /// Reads a double-quoted public identifier. An early '&gt;' or end of input stores
        /// what was collected and forces quirks mode.
        /// </summary>
        private HtmlToken DoctypePublicIdentifierDoubleQuoted(HtmlDoctypeToken doctype)
        {
            while (true)
            {
                char next = GetNext();

                switch (next)
                {
                    case '"':
                        doctype.PublicIdentifier = FlushBuffer();
                        return DoctypePublicIdentifierAfter(doctype);
                    case '\0':
                        AppendReplacement();
                        break;
                    case '>':
                        State = HtmlParseMode.PCData;
                        RaiseErrorOccurred(HtmlParseError.TagClosedWrong);
                        doctype.IsQuirksForced = true;
                        doctype.PublicIdentifier = FlushBuffer();
                        return doctype;
                    case '\uffff':
                        RaiseErrorOccurred(HtmlParseError.EOF);
                        Back();
                        doctype.IsQuirksForced = true;
                        doctype.PublicIdentifier = FlushBuffer();
                        return doctype;
                    default:
                        base.StringBuffer.Append(next);
                        break;
                }
            }
        }

        /// <summary>
        /// Reads a single-quoted public identifier. An early '&gt;' or end of input stores
        /// what was collected and forces quirks mode.
        /// </summary>
        private HtmlToken DoctypePublicIdentifierSingleQuoted(HtmlDoctypeToken doctype)
        {
            while (true)
            {
                char next = GetNext();

                switch (next)
                {
                    case '\'':
                        doctype.PublicIdentifier = FlushBuffer();
                        return DoctypePublicIdentifierAfter(doctype);
                    case '\0':
                        AppendReplacement();
                        break;
                    case '>':
                        State = HtmlParseMode.PCData;
                        RaiseErrorOccurred(HtmlParseError.TagClosedWrong);
                        doctype.IsQuirksForced = true;
                        doctype.PublicIdentifier = FlushBuffer();
                        return doctype;
                    case '\uffff':
                        RaiseErrorOccurred(HtmlParseError.EOF);
                        doctype.IsQuirksForced = true;
                        doctype.PublicIdentifier = FlushBuffer();
                        Back();
                        return doctype;
                    default:
                        base.StringBuffer.Append(next);
                        break;
                }
            }
        }

        /// <summary>Handles the character right after the closing quote of the public identifier.</summary>
        private HtmlToken DoctypePublicIdentifierAfter(HtmlDoctypeToken doctype)
{
            // Body of DoctypePublicIdentifierAfter: whitespace leads to the gap between the
            // public and system identifiers; a quote directly here is accepted with an error.
            char following = GetNext();

            if (following.IsSpaceCharacter())
            {
                return DoctypeBetween(doctype);
            }

            switch (following)
            {
                case '>':
                    State = HtmlParseMode.PCData;
                    return doctype;
                case '"':
                    RaiseErrorOccurred(HtmlParseError.DoubleQuotationMarkUnexpected);
                    doctype.SystemIdentifier = string.Empty;
                    return DoctypeSystemIdentifierDoubleQuoted(doctype);
                case '\'':
                    RaiseErrorOccurred(HtmlParseError.SingleQuotationMarkUnexpected);
                    doctype.SystemIdentifier = string.Empty;
                    return DoctypeSystemIdentifierSingleQuoted(doctype);
                case '\uffff':
                    RaiseErrorOccurred(HtmlParseError.EOF);
                    doctype.IsQuirksForced = true;
                    Back();
                    return doctype;
                default:
                    RaiseErrorOccurred(HtmlParseError.DoctypeInvalidCharacter);
                    doctype.IsQuirksForced = true;
                    return BogusDoctype(doctype);
            }
        }

        /// <summary>
        /// Skips whitespace between the public and system identifiers and handles the
        /// start of the system identifier.
        /// </summary>
        private HtmlToken DoctypeBetween(HtmlDoctypeToken doctype)
        {
            char following = SkipSpaces();

            switch (following)
            {
                case '>':
                    State = HtmlParseMode.PCData;
                    return doctype;
                case '"':
                    doctype.SystemIdentifier = string.Empty;
                    return DoctypeSystemIdentifierDoubleQuoted(doctype);
                case '\'':
                    doctype.SystemIdentifier = string.Empty;
                    return DoctypeSystemIdentifierSingleQuoted(doctype);
                case '\uffff':
                    RaiseErrorOccurred(HtmlParseError.EOF);
                    doctype.IsQuirksForced = true;
                    Back();
                    return doctype;
                default:
                    RaiseErrorOccurred(HtmlParseError.DoctypeInvalidCharacter);
                    doctype.IsQuirksForced = true;
                    return BogusDoctype(doctype);
            }
        }

        /// <summary>Handles the character right after the SYSTEM keyword.</summary>
        private HtmlToken DoctypeSystem(HtmlDoctypeToken doctype)
        {
            char next = GetNext();

            if (next.IsSpaceCharacter())
            {
                State = HtmlParseMode.PCData;
                return DoctypeSystemIdentifierBefore(doctype);
            }

            switch (next)
            {
                case '"':
                    RaiseErrorOccurred(HtmlParseError.DoubleQuotationMarkUnexpected);
                    doctype.SystemIdentifier = string.Empty;
                    return DoctypeSystemIdentifierDoubleQuoted(doctype);
                case '\'':
                    RaiseErrorOccurred(HtmlParseError.SingleQuotationMarkUnexpected);
                    doctype.SystemIdentifier = string.Empty;
                    return DoctypeSystemIdentifierSingleQuoted(doctype);
                case '>':
                    RaiseErrorOccurred(HtmlParseError.TagClosedWrong);
                    doctype.SystemIdentifier = FlushBuffer();
                    doctype.IsQuirksForced
= true;
                    break;
                case '\uffff':
                    RaiseErrorOccurred(HtmlParseError.EOF);
                    doctype.IsQuirksForced = true;
                    Back();
                    break;
                default:
                    RaiseErrorOccurred(HtmlParseError.DoctypeSystemInvalid);
                    doctype.IsQuirksForced = true;
                    return BogusDoctype(doctype);
            }

            return doctype;
        }

        /// <summary>Skips whitespace and handles the start of the system identifier.</summary>
        private HtmlToken DoctypeSystemIdentifierBefore(HtmlDoctypeToken doctype)
        {
            switch (SkipSpaces())
            {
                case '"':
                    doctype.SystemIdentifier = string.Empty;
                    return DoctypeSystemIdentifierDoubleQuoted(doctype);
                case '\'':
                    doctype.SystemIdentifier = string.Empty;
                    return DoctypeSystemIdentifierSingleQuoted(doctype);
                case '>':
                    State = HtmlParseMode.PCData;
                    RaiseErrorOccurred(HtmlParseError.TagClosedWrong);
                    doctype.IsQuirksForced = true;
                    doctype.SystemIdentifier = FlushBuffer();
                    return doctype;
                case '\uffff':
                    RaiseErrorOccurred(HtmlParseError.EOF);
                    doctype.IsQuirksForced = true;
                    doctype.SystemIdentifier = FlushBuffer();
                    Back();
                    return doctype;
                default:
                    RaiseErrorOccurred(HtmlParseError.DoctypeInvalidCharacter);
                    doctype.IsQuirksForced = true;
                    return BogusDoctype(doctype);
            }
        }

        /// <summary>
        /// Reads a double-quoted system identifier. An early '&gt;' or end of input stores
        /// what was collected and forces quirks mode.
        /// </summary>
        private HtmlToken DoctypeSystemIdentifierDoubleQuoted(HtmlDoctypeToken doctype)
        {
            while (true)
            {
                char next = GetNext();

                switch (next)
                {
                    case '"':
                        doctype.SystemIdentifier = FlushBuffer();
                        return DoctypeSystemIdentifierAfter(doctype);
                    case '\0':
                        AppendReplacement();
                        break;
                    case '>':
                        State = HtmlParseMode.PCData;
                        RaiseErrorOccurred(HtmlParseError.TagClosedWrong);
                        doctype.IsQuirksForced = true;
                        doctype.SystemIdentifier = FlushBuffer();
                        return doctype;
                    case '\uffff':
                        RaiseErrorOccurred(HtmlParseError.EOF);
                        doctype.IsQuirksForced = true;
                        doctype.SystemIdentifier = FlushBuffer();
                        Back();
                        return doctype;
                    default:
                        base.StringBuffer.Append(next);
                        break;
                }
            }
        }

        /// <summary>
        /// Reads a single-quoted system identifier. An early '&gt;' or end of input stores
        /// what was collected and forces quirks mode.
        /// </summary>
        private HtmlToken DoctypeSystemIdentifierSingleQuoted(HtmlDoctypeToken doctype)
        {
            while (true)
            {
                char next = GetNext();

                switch (next)
                {
                    case '\'':
                        doctype.SystemIdentifier = FlushBuffer();
                        return DoctypeSystemIdentifierAfter(doctype);
                    case '\0':
                        AppendReplacement();
                        break;
                    case '>':
                        State =
HtmlParseMode.PCData;
                        RaiseErrorOccurred(HtmlParseError.TagClosedWrong);
                        doctype.IsQuirksForced = true;
                        doctype.SystemIdentifier = FlushBuffer();
                        return doctype;
                    case '\uffff':
                        RaiseErrorOccurred(HtmlParseError.EOF);
                        doctype.IsQuirksForced = true;
                        doctype.SystemIdentifier = FlushBuffer();
                        Back();
                        return doctype;
                    default:
                        base.StringBuffer.Append(next);
                        break;
                }
            }
        }

        /// <summary>Handles what follows the closing quote of the system identifier.</summary>
        private HtmlToken DoctypeSystemIdentifierAfter(HtmlDoctypeToken doctype)
        {
            switch (SkipSpaces())
            {
                case '>':
                    State = HtmlParseMode.PCData;
                    return doctype;
                case '\uffff':
                    RaiseErrorOccurred(HtmlParseError.EOF);
                    doctype.IsQuirksForced = true;
                    Back();
                    return doctype;
                default:
                    // Unlike the other doctype error paths, quirks mode is not forced here.
                    RaiseErrorOccurred(HtmlParseError.DoctypeInvalidCharacter);
                    return BogusDoctype(doctype);
            }
        }

        /// <summary>Discards characters up to '&gt;' or end of input and returns the doctype.</summary>
        private HtmlToken BogusDoctype(HtmlDoctypeToken doctype)
        {
            while (true)
            {
                char next = GetNext();

                if (next == '>')
                {
                    State = HtmlParseMode.PCData;
                    return doctype;
                }

                if (next == '\uffff')
                {
                    Back();
                    return doctype;
                }
            }
        }

        /// <summary>
        /// Reads the attributes of <paramref name="tag"/> with a small state machine and
        /// returns the emitted tag, or an end-of-file token when the input ends inside the tag.
        /// </summary>
        /// <remarks>
        /// Attribute names are lower-cased for ASCII letters. The Unquoted-value state does
        /// not read a character itself: every transition into it, and every step inside it,
        /// leaves the next character to examine in <c>current</c>.
        /// </remarks>
        private HtmlToken ParseAttributes(HtmlTagToken tag)
        {
            AttributeState state = AttributeState.BeforeName;
            char quote = '"';
            char current = '\0';

            while (true)
            {
                switch (state)
                {
                    case AttributeState.BeforeName:
                        current = SkipSpaces();

                        switch (current)
                        {
                            case '/':
                                state = AttributeState.SelfClose;
                                break;
                            case '>':
                                return EmitTag(tag);
                            default:
                                if (!current.IsUppercaseAscii())
                                {
                                    switch (current)
                                    {
                                        case '\0':
                                            AppendReplacement();
                                            state = AttributeState.Name;
                                            break;
                                        case '"':
                                        case '\'':
                                        case '<':
                                        case '=':
                                            RaiseErrorOccurred(HtmlParseError.AttributeNameInvalid);
                                            base.StringBuffer.Append(current);
                                            state = AttributeState.Name;
                                            break;
                                        default:
                                            base.StringBuffer.Append(current);
                                            state = AttributeState.Name;
                                            break;
                                        case '\uffff':
                                            return NewEof(false);
                                    }
                                }
                                else
                                {
                                    base.StringBuffer.Append(char.ToLowerInvariant(current));
                                    state = AttributeState.Name;
                                }

                                break;
                        }

                        break;

                    case AttributeState.Name:
                        current = GetNext();

                        switch (current)
                        {
                            case '=':
                                tag.AddAttribute(FlushBuffer());
                                state = AttributeState.BeforeValue;
                                break;
                            case '>':
                                tag.AddAttribute(FlushBuffer());
                                return EmitTag(tag);
                            default:
                                if (current.IsSpaceCharacter())
                                {
                                    tag.AddAttribute(FlushBuffer());
                                    state = AttributeState.AfterName;
                                }
                                else if (current == '/')
                                {
                                    tag.AddAttribute(FlushBuffer());
                                    state = AttributeState.SelfClose;
                                }
                                else if (!current.IsUppercaseAscii())
                                {
                                    switch (current)
                                    {
                                        case '"':
                                        case '\'':
                                        case '<':
                                            RaiseErrorOccurred(HtmlParseError.AttributeNameInvalid);
                                            base.StringBuffer.Append(current);
                                            break;
                                        case '\0':
                                            AppendReplacement();
                                            break;
                                        default:
                                            base.StringBuffer.Append(current);
                                            break;
                                        case '\uffff':
                                            return NewEof(false);
                                    }
                                }
                                else
                                {
                                    base.StringBuffer.Append(char.ToLowerInvariant(current));
                                }

                                break;
                        }

                        break;

                    case AttributeState.AfterName:
                        current = SkipSpaces();

                        switch (current)
                        {
                            case '>':
                                return EmitTag(tag);
                            case '=':
                                state = AttributeState.BeforeValue;
                                break;
                            case '/':
                                state = AttributeState.SelfClose;
                                break;
                            default:
                                if (!current.IsUppercaseAscii())
                                {
                                    switch (current)
                                    {
                                        case '"':
                                        case '\'':
                                        case '<':
                                            RaiseErrorOccurred(HtmlParseError.AttributeNameInvalid);
                                            base.StringBuffer.Append(current);
                                            state = AttributeState.Name;
                                            break;
                                        case '\0':
                                            AppendReplacement();
                                            state = AttributeState.Name;
                                            break;
                                        default:
                                            base.StringBuffer.Append(current);
                                            state = AttributeState.Name;
                                            break;
                                        case '\uffff':
                                            return NewEof(false);
                                    }
                                }
                                else
                                {
                                    base.StringBuffer.Append(char.ToLowerInvariant(current));
                                    state = AttributeState.Name;
                                }

                                break;
                        }

                        break;

                    case AttributeState.BeforeValue:
                        current = SkipSpaces();

                        switch (current)
                        {
                            case '"':
                            case '\'':
                                state = AttributeState.QuotedValue;
                                quote = current;
                                break;
                            case '&':
                                // Left in `current` so the unquoted state starts the reference.
                                state = AttributeState.UnquotedValue;
                                break;
                            case '>':
                                RaiseErrorOccurred(HtmlParseError.TagClosedWrong);
                                return EmitTag(tag);
                            case '<':
                            case '=':
                            case '`':
                                RaiseErrorOccurred(HtmlParseError.AttributeValueInvalid);
                                base.StringBuffer.Append(current);
                                state = AttributeState.UnquotedValue;
                                current = GetNext();
                                break;
                            case '\0':
                                AppendReplacement();
                                state = AttributeState.UnquotedValue;
                                current = GetNext();
                                break;
                            default:
                                base.StringBuffer.Append(current);
                                state = AttributeState.UnquotedValue;
                                current = GetNext();
                                break;
                            case '\uffff':
                                return NewEof(false);
                        }

                        break;

                    case AttributeState.QuotedValue:
                        current = GetNext();

                        if (current != quote)
                        {
                            switch (current)
                            {
                                case '&':
                                    AppendCharacterReference(GetNext(), quote);
                                    break;
                                case '\0':
                                    AppendReplacement();
                                    break;
                                default:
                                    base.StringBuffer.Append(current);
                                    break;
                                case '\uffff':
                                    return NewEof(false);
                            }
                        }
                        else
                        {
                            tag.SetAttributeValue(FlushBuffer());
                            state = AttributeState.AfterValue;
                        }

                        break;

                    case AttributeState.UnquotedValue:
                        if (current == '>')
                        {
                            tag.SetAttributeValue(FlushBuffer());
                            return EmitTag(tag);
                        }

                        if (!current.IsSpaceCharacter())
                        {
                            switch (current)
                            {
                                case '&':
                                    AppendCharacterReference(GetNext(), '>');
                                    current = GetNext();
                                    break;
                                case '\0':
                                    AppendReplacement();
                                    current = GetNext();
                                    break;
                                case '"':
                                case '\'':
                                case '<':
                                case '=':
                                case '`':
                                    RaiseErrorOccurred(HtmlParseError.AttributeValueInvalid);
                                    base.StringBuffer.Append(current);
                                    current = GetNext();
                                    break;
                                default:
                                    base.StringBuffer.Append(current);
                                    current = GetNext();
                                    break;
                                case '\uffff':
                                    return NewEof(false);
                            }
                        }
                        else
                        {
                            tag.SetAttributeValue(FlushBuffer());
                            state = AttributeState.BeforeName;
                        }

                        break;

                    case AttributeState.AfterValue:
                        current = GetNext();

                        if (current == '>')
                        {
                            return EmitTag(tag);
                        }

                        if (!current.IsSpaceCharacter())
                        {
                            switch (current)
                            {
                                case '/':
                                    state = AttributeState.SelfClose;
                                    break;
                                case '\uffff':
                                    return NewEof(false);
                                default:
                                    RaiseErrorOccurred(HtmlParseError.AttributeNameExpected);
                                    Back();
                                    state = AttributeState.BeforeName;
                                    break;
                            }
                        }
                        else
                        {
                            state = AttributeState.BeforeName;
                        }

                        break;

                    case AttributeState.SelfClose:
                    {
                        HtmlToken closed = TagSelfClosingInner(tag);

                        if (closed != null)
                        {
                            return closed;
                        }

                        state = AttributeState.BeforeName;
                        break;
                    }
                }
            }
        }

        private HtmlToken ScriptData(char c)
        {
            int length = _lastStartTag.Length;
            int length2 = TagNames.Script.Length;
            ScriptState scriptState = ScriptState.Normal;
            int num = 0;

            while (true)
            {
                switch (scriptState)
                {
                    case ScriptState.Normal:
                        switch (c)
                        {
                            case '\0':
                                AppendReplacement();
goto IL_00ad; case '<': base.StringBuffer.Append('<'); scriptState = ScriptState.OpenTag; break; case '￿': Back(); return NewCharacter(); default: { base.StringBuffer.Append(c); goto IL_00ad; } IL_00ad: c = GetNext(); break; } break; case ScriptState.OpenTag: c = GetNext(); switch (c) { case '/': scriptState = ScriptState.EndTag; break; case '!': scriptState = ScriptState.StartEscape; break; default: scriptState = ScriptState.Normal; break; } break; case ScriptState.StartEscape: base.StringBuffer.Append('!'); c = GetNext(); scriptState = ((c == '-') ? ScriptState.StartEscapeDash : ScriptState.Normal); break; case ScriptState.StartEscapeDash: c = GetNext(); base.StringBuffer.Append('-'); if (c == '-') { base.StringBuffer.Append('-'); scriptState = ScriptState.EscapedDashDash; } else scriptState = ScriptState.Normal; break; case ScriptState.EndTag: { c = GetNext(); num = base.StringBuffer.Append('/').Length; HtmlTagToken htmlTagToken = NewTagClose(); while (c.IsLetter()) { base.StringBuffer.Append(c); c = GetNext(); bool flag = c.IsSpaceCharacter(); bool flag2 = c == '>'; bool flag3 = c == '/'; if (base.StringBuffer.Length - num == length && (flag | flag2 | flag3) && base.StringBuffer.ToString(num, length).Isi(_lastStartTag)) { if (num > 2) { Back(3 + length); base.StringBuffer.Remove(num - 2, length + 2); return NewCharacter(); } base.StringBuffer.Clear(); if (flag) { htmlTagToken.Name = _lastStartTag; return ParseAttributes(htmlTagToken); } if (flag3) { htmlTagToken.Name = _lastStartTag; return TagSelfClosing(htmlTagToken); } if (flag2) { htmlTagToken.Name = _lastStartTag; return EmitTag(htmlTagToken); } } } scriptState = ScriptState.Normal; break; } case ScriptState.Escaped: switch (c) { case '-': base.StringBuffer.Append('-'); c = GetNext(); scriptState = ScriptState.EscapedDash; break; case '<': c = GetNext(); scriptState = ScriptState.EscapedOpenTag; break; case '': AppendReplacement(); c = GetNext(); break; case '￿': Back(); return NewCharacter(); default: 
scriptState = ScriptState.Normal; break; } break; case ScriptState.EscapedDash: switch (c) { case '-': base.StringBuffer.Append('-'); scriptState = ScriptState.EscapedDashDash; break; case '<': c = GetNext(); scriptState = ScriptState.EscapedOpenTag; break; case '': AppendReplacement(); goto IL_033c; case '￿': Back(); return NewCharacter(); default: { base.StringBuffer.Append(c); goto IL_033c; } IL_033c: c = GetNext(); scriptState = ScriptState.Escaped; break; } break; case ScriptState.EscapedDashDash: c = GetNext(); switch (c) { case '-': base.StringBuffer.Append('-'); break; case '<': c = GetNext(); scriptState = ScriptState.EscapedOpenTag; break; case '>': base.StringBuffer.Append('>'); c = GetNext(); scriptState = ScriptState.Normal; break; case '': AppendReplacement(); c = GetNext(); scriptState = ScriptState.Escaped; break; case '￿': return NewCharacter(); default: base.StringBuffer.Append(c); c = GetNext(); scriptState = ScriptState.Escaped; break; } break; case ScriptState.EscapedOpenTag: if (c == '/') { c = GetNext(); scriptState = ScriptState.EscapedEndTag; } else if (c.IsLetter()) { num = base.StringBuffer.Append('<').Length; base.StringBuffer.Append(c); scriptState = ScriptState.StartDoubleEscape; } else { base.StringBuffer.Append('<'); scriptState = ScriptState.Escaped; } break; case ScriptState.EscapedEndTag: num = base.StringBuffer.Append('<').Append('/').Length; if (c.IsLetter()) { base.StringBuffer.Append(c); scriptState = ScriptState.EscapedNameEndTag; } else scriptState = ScriptState.Escaped; break; case ScriptState.EscapedNameEndTag: c = GetNext(); if (base.StringBuffer.Length - num == length2 && (c == '/' || c == '>' || c.IsSpaceCharacter()) && base.StringBuffer.ToString(num, length2).Isi(TagNames.Script)) { Back(length2 + 3); base.StringBuffer.Remove(num - 2, length2 + 2); return NewCharacter(); } if (!c.IsLetter()) scriptState = ScriptState.Escaped; else base.StringBuffer.Append(c); break; case ScriptState.StartDoubleEscape: c = GetNext(); if 
(base.StringBuffer.Length - num == length2 && (c == '/' || c == '>' || c.IsSpaceCharacter())) { bool num3 = base.StringBuffer.ToString(num, length2).Isi(TagNames.Script); base.StringBuffer.Append(c); c = GetNext(); scriptState = (num3 ? ScriptState.EscapedDouble : ScriptState.Escaped); } else if (c.IsLetter()) { base.StringBuffer.Append(c); } else { scriptState = ScriptState.Escaped; } break; case ScriptState.EscapedDouble: switch (c) { case '-': base.StringBuffer.Append('-'); c = GetNext(); scriptState = ScriptState.EscapedDoubleDash; break; case '<': base.StringBuffer.Append('<'); c = GetNext(); scriptState = ScriptState.EscapedDoubleOpenTag; break; case '': AppendReplacement(); goto default; case '￿': RaiseErrorOccurred(HtmlParseError.EOF); Back(); return NewCharacter(); default: base.StringBuffer.Append(c); c = GetNext(); break; } break; case ScriptState.EscapedDoubleDash: switch (c) { case '-': base.StringBuffer.Append('-'); scriptState = ScriptState.EscapedDoubleDashDash; break; case '<': base.StringBuffer.Append('<'); c = GetNext(); scriptState = ScriptState.EscapedDoubleOpenTag; break; case '': RaiseErrorOccurred(HtmlParseError.Null); c = '�'; goto default; case '￿': RaiseErrorOccurred(HtmlParseError.EOF); Back(); return NewCharacter(); default: scriptState = ScriptState.EscapedDouble; break; } break; case ScriptState.EscapedDoubleDashDash: c = GetNext(); switch (c) { case '-': base.StringBuffer.Append('-'); break; case '<': base.StringBuffer.Append('<'); c = GetNext(); scriptState = ScriptState.EscapedDoubleOpenTag; break; case '>': base.StringBuffer.Append('>'); c = GetNext(); scriptState = ScriptState.Normal; break; case '': AppendReplacement(); c = GetNext(); scriptState = ScriptState.EscapedDouble; break; case '￿': RaiseErrorOccurred(HtmlParseError.EOF); Back(); return NewCharacter(); default: base.StringBuffer.Append(c); c = GetNext(); scriptState = ScriptState.EscapedDouble; break; } break; case ScriptState.EscapedDoubleOpenTag: if (c == '/') { num = 
base.StringBuffer.Append('/').Length; scriptState = ScriptState.EndDoubleEscape; } else scriptState = ScriptState.EscapedDouble; break; case ScriptState.EndDoubleEscape: c = GetNext(); if (base.StringBuffer.Length - num == length2 && (c.IsSpaceCharacter() || c == '/' || c == '>')) { bool num2 = base.StringBuffer.ToString(num, length2).Isi(TagNames.Script); base.StringBuffer.Append(c); c = GetNext(); scriptState = (num2 ? ScriptState.Escaped : ScriptState.EscapedDouble); } else if (c.IsLetter()) { base.StringBuffer.Append(c); } else { scriptState = ScriptState.EscapedDouble; } break; } } }

        /// <summary>
        /// Builds a character token from the string returned by
        /// <c>FlushBuffer()</c>, stamped with the stored token position.
        /// </summary>
        /// <returns>The new character token.</returns>
        private HtmlToken NewCharacter()
        {
            string text = FlushBuffer();
            return new HtmlToken(HtmlTokenType.Character, _position, text);
        }

        /// <summary>
        /// Builds a comment token from the string returned by
        /// <c>FlushBuffer()</c>, stamped with the stored token position.
        /// </summary>
        /// <returns>The new comment token.</returns>
        private HtmlToken NewComment()
        {
            string text = FlushBuffer();
            return new HtmlToken(HtmlTokenType.Comment, _position, text);
        }

        /// <summary>
        /// Builds an end-of-file token at the stored token position.
        /// </summary>
        /// <param name="acceptable">
        /// When false (the default) an <c>HtmlParseError.EOF</c> error is
        /// reported before the token is created.
        /// </param>
        /// <returns>The new end-of-file token.</returns>
        private HtmlToken NewEof(bool acceptable = false)
        {
            if (!acceptable)
            {
                RaiseErrorOccurred(HtmlParseError.EOF);
            }

            return new HtmlToken(HtmlTokenType.EndOfFile, _position);
        }

        /// <summary>
        /// Builds a doctype token at the stored token position.
        /// </summary>
        /// <param name="quirksForced">Value handed to the doctype token constructor.</param>
        /// <returns>The new doctype token.</returns>
        private HtmlDoctypeToken NewDoctype(bool quirksForced)
        {
            return new HtmlDoctypeToken(quirksForced, _position);
        }

        /// <summary>
        /// Builds a tag token of type <c>StartTag</c> at the stored token position.
        /// </summary>
        /// <returns>The new start tag token.</returns>
        private HtmlTagToken NewTagOpen()
        {
            return new HtmlTagToken(HtmlTokenType.StartTag, _position);
        }

        /// <summary>
        /// Builds a tag token of type <c>EndTag</c> at the stored token position.
        /// </summary>
        /// <returns>The new end tag token.</returns>
        private HtmlTagToken NewTagClose()
        {
            return new HtmlTagToken(HtmlTokenType.EndTag, _position);
        }

        /// <summary>
        /// Reports a parse error located at the position returned by
        /// <c>GetCurrentPosition()</c>.
        /// </summary>
        /// <param name="code">The error to report.</param>
        private void RaiseErrorOccurred(HtmlParseError code)
        {
            TextPosition current = GetCurrentPosition();
            RaiseErrorOccurred(code, current);
        }

        /// <summary>
        /// Reports an <c>HtmlParseError.Null</c> error and then appends the
        /// replacement character to the string buffer.
        /// </summary>
        private void AppendReplacement()
        {
            // The error is raised first; in strict mode that call throws, so
            // nothing is appended in that case.
            RaiseErrorOccurred(HtmlParseError.Null);
            base.StringBuffer.Append('�');
        }

        /// <summary>
        /// Tries to turn the buffered name into an end tag token for the most
        /// recently emitted start tag.
        /// </summary>
        /// <param name="c">
        /// The character following the buffered name; only a space character,
        /// '/' or '&gt;' can produce a token.
        /// </param>
        /// <returns>
        /// The token produced by continuing with the end tag, or null when the
        /// buffer does not match the last start tag name or <paramref name="c"/>
        /// is not one of the accepted characters.
        /// </returns>
        private HtmlToken CreateIfAppropriate(char c)
        {
            bool isSpace = c.IsSpaceCharacter();
            bool isGreaterThan = c == '>';
            bool isSolidus = c == '/';

            // Cheap length comparison first; the buffer is only materialised
            // as a string when the lengths agree and the character qualifies.
            if (base.StringBuffer.Length != _lastStartTag.Length)
            {
                return null;
            }

            if (!(isSpace || isGreaterThan || isSolidus))
            {
                return null;
            }

            if (!base.StringBuffer.ToString().Is(_lastStartTag))
            {
                return null;
            }

            HtmlTagToken endTag = NewTagClose();
            base.StringBuffer.Clear();
            endTag.Name = _lastStartTag;

            if (isSpace)
            {
                return ParseAttributes(endTag);
            }

            if (isSolidus)
            {
                return TagSelfClosing(endTag);
            }

            // Neither a space nor '/', so the character must be '>' here.
            return EmitTag(endTag);
        }

        /// <summary>
        /// Finalises a tag token before it is handed out and switches the
        /// tokenizer state back to <c>HtmlParseMode.PCData</c>.
        /// </summary>
        /// <remarks>
        /// Start tags: every attribute whose name already occurs earlier in
        /// the list is removed, one <c>AttributeDuplicateOmitted</c> error is
        /// reported per removal, and the tag name is remembered as the last
        /// start tag. End tags: an error is reported if the tag is
        /// self-closing and another if it carries attributes.
        /// </remarks>
        /// <param name="tag">The tag token to finalise.</param>
        /// <returns>The same <paramref name="tag"/> instance.</returns>
        private HtmlToken EmitTag(HtmlTagToken tag)
        {
            List<KeyValuePair<string, string>> attributes = tag.Attributes;
            State = HtmlParseMode.PCData;
            var type = tag.Type;

            if (type == HtmlTokenType.StartTag)
            {
                // Walk from the back so that removing the later duplicate
                // never shifts an index that still has to be visited.
                for (int later = attributes.Count - 1; later > 0; later--)
                {
                    string candidate = attributes[later].Key;

                    for (int earlier = later - 1; earlier >= 0; earlier--)
                    {
                        if (attributes[earlier].Key == candidate)
                        {
                            attributes.RemoveAt(later);
                            RaiseErrorOccurred(HtmlParseError.AttributeDuplicateOmitted, tag.Position);
                            break;
                        }
                    }
                }

                _lastStartTag = tag.Name;
            }
            else if (type == HtmlTokenType.EndTag)
            {
                if (tag.IsSelfClosing)
                {
                    RaiseErrorOccurred(HtmlParseError.EndTagCannotBeSelfClosed, tag.Position);
                }

                if (attributes.Count != 0)
                {
                    RaiseErrorOccurred(HtmlParseError.EndTagCannotHaveAttributes, tag.Position);
                }
            }

            return tag;
        }
    }
}