AngleSharp by Florian Rappl

<PackageReference Include="AngleSharp" Version="0.8.2" />

 HtmlTokenizer

sealed class HtmlTokenizer : BaseTokenizer
Performs the tokenization of the source code. Follows the tokenization algorithm at: http://www.w3.org/html/wg/drafts/html/master/syntax.html
using AngleSharp.Events; using AngleSharp.Extensions; using AngleSharp.Html; using System; using System.Collections.Generic; using System.Diagnostics; namespace AngleSharp.Parser.Html {
// NOTE(review): char literals that appear as '' in this copy look like control-character literals lost in extraction
// (presumably NUL where HtmlParseError.Null is raised, an end-of-input marker elsewhere) - confirm against the original source.
// Tokenizer that turns the characters supplied by BaseTokenizer into HtmlToken instances.
// Get() is the entry point; it dispatches on the current HtmlParseMode.
[DebuggerStepThrough] internal sealed class HtmlTokenizer : BaseTokenizer {
// _acceptsCharacterData: checked by MarkupDeclaration before "[CDATA[" is treated as a CDATA section.
// _lastStartTag: compared with candidate end-tag names in the RCDATA / RAWTEXT / script end-tag states; it is not assigned in the visible part of the file.
// _state: parse mode that selects the state function in Get().
// _buffered: token held back by Get() while pending text is returned first.
private bool _acceptsCharacterData; private string _lastStartTag; private HtmlParseMode _state; private HtmlToken _buffered;
// Gets or sets the flag stored in _acceptsCharacterData.
public bool IsAcceptingCharacterData { get { return _acceptsCharacterData; } set { _acceptsCharacterData = value; } }
// Gets or sets the parse mode used by Get() to pick the state function for the next token.
public HtmlParseMode State { get { return _state; } set { _state = value; } }
// Creates the tokenizer over the given source and event aggregator; starts in PCData mode with _acceptsCharacterData switched off.
public HtmlTokenizer(TextSource source, IEventAggregator events) : base(source, events) { _state = HtmlParseMode.PCData; _acceptsCharacterData = false; }
// Publishes an HtmlParseErrorEvent carrying the error code and the current text position; does nothing when _events is null.
public void RaiseErrorOccurred(HtmlParseError code) { if (_events != null) { TextPosition currentPosition = GetCurrentPosition(); HtmlParseErrorEvent data = new HtmlParseErrorEvent(code, currentPosition); _events.Publish(data); } }
// Returns the next token. A token held in _buffered is handed out first. Otherwise one character is read,
// HtmlToken.EndOfFile is returned when the source has ended, and the state function matching _state is run.
// If that run left text in _textBuffer, the text is returned as a Character token and the produced token is
// kept in _buffered for the following call.
public HtmlToken Get() { HtmlToken htmlToken = _buffered; if (htmlToken != null) { _buffered = null; return htmlToken; } char next = GetNext(); if (base.IsEnded) return HtmlToken.EndOfFile; switch (_state) { case HtmlParseMode.PCData: htmlToken = Data(next); break; case HtmlParseMode.RCData: htmlToken = RCData(next); break; case HtmlParseMode.Plaintext: htmlToken = Plaintext(next); break; case HtmlParseMode.Rawtext: htmlToken = Rawtext(next); break; case HtmlParseMode.Script: htmlToken = ScriptData(next); break; } if (_textBuffer.Length > 0) { _buffered = htmlToken; htmlToken = HtmlToken.Character(_textBuffer.ToString()); _textBuffer.Clear(); } return htmlToken; }
// Plaintext state: appends every character to _textBuffer until the branch that returns HtmlToken.EndOfFile;
// the branch raising HtmlParseError.Null appends U+FFFD instead of the character.
private HtmlToken Plaintext(char c) { while (true) { switch (c) { case '': RaiseErrorOccurred(HtmlParseError.Null); _textBuffer.Append('�'); break; case '': return HtmlToken.EndOfFile; default: _textBuffer.Append(c); break; } c = GetNext(); } }
// Data state: '&' attempts a character reference (a literal '&' is appended when it resolves to null),
// '<' hands over to TagOpen, the HtmlParseError.Null branch reports the error and restarts with the
// following character, anything else is appended to _textBuffer.
private HtmlToken Data(char c) { while (true) { switch (c) { case '&': { string text = 
CharacterReference(GetNext(), ''); if (text == null) _textBuffer.Append('&'); _textBuffer.Append(text); break; } case '<': return TagOpen(); case '': RaiseErrorOccurred(HtmlParseError.Null); return Data(GetNext()); case '': return HtmlToken.EndOfFile; default: _textBuffer.Append(c); break; } c = GetNext(); } }
// NOTE(review): char literals that appear as '' in this copy look like control-character literals lost in extraction
// (presumably NUL where HtmlParseError.Null is raised, an end-of-input marker elsewhere) - confirm against the original source.
// RCDATA state: '&' is resolved like in Data. On '<' the next character is read; when it is '/', _stringBuffer is
// cleared and RCDataEndTag takes over, otherwise '<' is appended and the character already read is examined by the
// next loop pass. The HtmlParseError.Null branch appends U+FFFD; other characters are appended as-is.
private HtmlToken RCData(char c) { while (true) { switch (c) { case '&': { string text = CharacterReference(GetNext(), ''); if (text == null) _textBuffer.Append('&'); _textBuffer.Append(text); goto IL_00b2; } case '<': c = GetNext(); if (c == '/') { _stringBuffer.Clear(); return RCDataEndTag(); } _textBuffer.Append('<'); break; case '': RaiseErrorOccurred(HtmlParseError.Null); _textBuffer.Append('�'); goto IL_00b2; case '': return HtmlToken.EndOfFile; default: { _textBuffer.Append(c); goto IL_00b2; } IL_00b2: c = GetNext(); break; } } }
// First character after "</" in RCDATA: an ASCII letter (lower-cased) seeds _stringBuffer and continues in
// RCDataNameEndTag with a new closing tag; anything else appends "</" as text and resumes RCData with that character.
private HtmlToken RCDataEndTag() { char next = GetNext(); if (next.IsUppercaseAscii()) _stringBuffer.Clear().Append(char.ToLower(next)); else { if (!next.IsLowercaseAscii()) { _textBuffer.Append('<').Append('/'); return RCData(next); } _stringBuffer.Clear().Append(next); } return RCDataNameEndTag(HtmlTagToken.Close()); }
// Collects the end-tag name in _stringBuffer. When the name collected so far equals _lastStartTag and the next
// character is whitespace, '/' or '>', the tag receives that name and tokenizing continues with AttributeBeforeName,
// TagSelfClosing or EmitTag. Otherwise a non-letter flushes "</" plus the collected name into _textBuffer and
// resumes RCData with that character.
private HtmlToken RCDataNameEndTag(HtmlTagToken tag) { char next; while (true) { next = GetNext(); string text = _stringBuffer.ToString(); bool flag = text == _lastStartTag; if (flag && next.IsSpaceCharacter()) { tag.Name = text; return AttributeBeforeName(tag); } if (flag && next == '/') { tag.Name = text; return TagSelfClosing(tag); } if (flag && next == '>') { tag.Name = text; return EmitTag(tag); } if (next.IsUppercaseAscii()) _stringBuffer.Append(char.ToLower(next)); else { if (!next.IsLowercaseAscii()) break; _stringBuffer.Append(next); } } _textBuffer.Append('<').Append('/').Append(_stringBuffer.ToString()); return RCData(next); }
// RAWTEXT state: '<' hands over to RawtextLT; the HtmlParseError.Null branch appends U+FFFD; other characters are
// appended to _textBuffer until the branch that returns HtmlToken.EndOfFile.
private HtmlToken Rawtext(char c) { while (true) { switch (c) { case '<': return RawtextLT(); case '': RaiseErrorOccurred(HtmlParseError.Null); _textBuffer.Append('�'); break; 
case '': return HtmlToken.EndOfFile; default: _textBuffer.Append(c); break; } c = GetNext(); } }
// NOTE(review): char literals that appear as '' in this copy look like control-character literals lost in extraction
// (presumably NUL where HtmlParseError.Null is raised, an end-of-input marker elsewhere) - confirm against the original source.
// After '<' in RAWTEXT: '/' clears _stringBuffer and goes to RawtextEndTag; otherwise '<' is appended as text and
// Rawtext resumes with the character just read.
private HtmlToken RawtextLT() { char next = GetNext(); if (next == '/') { _stringBuffer.Clear(); return RawtextEndTag(); } _textBuffer.Append('<'); return Rawtext(next); }
// First character after "</" in RAWTEXT: an ASCII letter (lower-cased) seeds _stringBuffer and continues in
// RawtextNameEndTag with a new closing tag; anything else appends "</" as text and resumes Rawtext.
private HtmlToken RawtextEndTag() { char next = GetNext(); if (next.IsUppercaseAscii()) { _stringBuffer.Clear().Append(char.ToLower(next)); return RawtextNameEndTag(HtmlTagToken.Close()); } if (next.IsLowercaseAscii()) { _stringBuffer.Clear().Append(next); return RawtextNameEndTag(HtmlTagToken.Close()); } _textBuffer.Append('<').Append('/'); return Rawtext(next); }
// Collects the end-tag name in _stringBuffer. When the name collected so far equals _lastStartTag and the next
// character is whitespace, '/' or '>', the tag receives that name and tokenizing continues with AttributeBeforeName,
// TagSelfClosing or EmitTag. Otherwise a non-letter flushes "</" plus the collected name into _textBuffer and
// resumes Rawtext with that character.
private HtmlToken RawtextNameEndTag(HtmlTagToken tag) { char next; while (true) { next = GetNext(); string text = _stringBuffer.ToString(); bool flag = text == _lastStartTag; if (flag && next.IsSpaceCharacter()) { tag.Name = text; return AttributeBeforeName(tag); } if (flag && next == '/') { tag.Name = text; return TagSelfClosing(tag); } if (flag && next == '>') { tag.Name = text; return EmitTag(tag); } if (next.IsUppercaseAscii()) _stringBuffer.Append(char.ToLower(next)); else { if (!next.IsLowercaseAscii()) break; _stringBuffer.Append(next); } } _textBuffer.Append('<').Append('/').Append(_stringBuffer.ToString()); return Rawtext(next); }
// Collects characters into _stringBuffer until either the branch that calls Back() is hit or the text "]]>" is
// found (ContinuesWith, then Advance(2) to skip the rest of the terminator); returns the collected text as a
// Character token.
private HtmlToken CData() { _stringBuffer.Clear(); while (true) { char next = GetNext(); switch (next) { case '': Back(); goto IL_004b; case ']': { if (!ContinuesWith("]]>", true)) break; Advance(2); goto IL_004b; } IL_004b: return HtmlToken.Character(_stringBuffer.ToString()); } _stringBuffer.Append(next); } }
// Tries to resolve a character reference; c is the character following '&'. Returns null after stepping back one
// character when c is whitespace, '<', '&', allowedCharacter or the value of the remaining '' literal.
// '#' starts a numeric reference: decimal digits, or hex digits after 'x' / 'X', are collected as digit values in a
// list (c.FromHex() is used for both). The conversion and the named-entity lookup follow on the next lines.
private string CharacterReference(char c, char allowedCharacter = '') { if (c.IsSpaceCharacter() || c == '<' || c == '' || c == '&' || c == allowedCharacter) { Back(); return null; } if (c == '#') { int num = 10; int num2 = 1; int num3 = 0; List<int> list = new List<int>(); c = GetNext(); bool flag = c == 'x' || c == 'X'; if (!flag) { while (c.IsDigit()) { list.Add(c.FromHex()); c = GetNext(); } } 
else
{
    // Hexadecimal form: switch the radix and collect hex digit values.
    num = 16;

    while ((c = GetNext()).IsHex())
    {
        list.Add(c.FromHex());
    }
}

// Fold the digits into the code point, most significant digit first.
// The value stops growing once it has passed the largest Unicode code point (0x10FFFF),
// so an arbitrarily long digit run can no longer wrap around Int32 and come back as a
// small or negative number. Values up to 0x10FFFF are computed exactly as before.
// (The place-value local num2 declared earlier is no longer needed by this loop.)
for (int num4 = 0; num4 < list.Count; num4++)
{
    if (num3 <= 0x10FFFF)
    {
        num3 = num3 * num + list[num4];
    }
}

if (list.Count == 0)
{
    // No digits at all: rewind over what was read after '&' (one more step for the 'x' / 'X')
    // and report the malformed numeric reference.
    Back(2);

    if (flag)
    {
        Back();
    }

    RaiseErrorOccurred(HtmlParseError.CharacterReferenceWrongNumber);
    return null;
}

if (c != ';')
{
    // The terminating semicolon is missing; the character just read is put back.
    RaiseErrorOccurred(HtmlParseError.CharacterReferenceSemicolonMissing);
    Back();
}

if (Entities.IsInCharacterTable(num3))
{
    RaiseErrorOccurred(HtmlParseError.CharacterReferenceInvalidCode);
    return Entities.GetSymbolFromTable(num3);
}

// Anything above the Unicode range is replaced by U+FFFD, independent of how
// Entities.IsInvalidNumber classifies such values.
if (num3 > 0x10FFFF || Entities.IsInvalidNumber(num3))
{
    RaiseErrorOccurred(HtmlParseError.CharacterReferenceInvalidNumber);
    return '�'.ToString();
}

if (Entities.IsInInvalidRange(num3))
{
    RaiseErrorOccurred(HtmlParseError.CharacterReferenceInvalidRange);
}

return Entities.Convert(num3);
}

// Named reference: grow the candidate name one character at a time (at most 31 characters)
// and remember the last name that Entities resolved. num5 counts the characters read since
// that last match.
string result = null;
int num5 = 0;
int insertionPoint = base.InsertionPoint - 1;
char[] array = new char[31];
int num6 = 0;
char c2 = base.Current;

while (c2 != ';' && c2.IsName())
{
    array[num6++] = c2;
    string name = new string(array, 0, num6);
    c2 = GetNext();
    num5++;
    name = ((c2 == ';') ? 
Entities.GetSymbol(name) : Entities.GetSymbolWithoutSemicolon(name)); if (name != null) { num5 = 0; result = name; } if (base.IsEnded || num6 >= 31) break; } Back(num5); c2 = base.Current; if (c2 != ';') { if (allowedCharacter != 0 && (c2 == '=' || c2.IsAlphanumericAscii())) { if (c2 == '=') RaiseErrorOccurred(HtmlParseError.CharacterReferenceAttributeEqualsFound); base.InsertionPoint = insertionPoint; return null; } Back(); RaiseErrorOccurred(HtmlParseError.CharacterReferenceNotTerminated); } return result; }
// NOTE(review): char literals that appear as '' in this copy look like control-character literals lost in extraction
// (presumably NUL where HtmlParseError.Null is raised, an end-of-input marker elsewhere) - confirm against the original source.
// After '<' in Data: '/' goes to TagEnd with the following character; an ASCII letter (lower-cased) seeds
// _stringBuffer and continues in TagName with a new opening tag; '!' goes to MarkupDeclaration; '?' reports
// BogusComment and continues in BogusComment; anything else reports AmbiguousOpenTag, sets PCData mode, appends
// '<' as text and resumes Data with that character.
private HtmlToken TagOpen() { char next = GetNext(); if (next == '/') return TagEnd(GetNext()); if (next.IsLowercaseAscii()) { HtmlTagToken tag = HtmlTagToken.Open(); _stringBuffer.Clear().Append(next); return TagName(tag); } if (!next.IsUppercaseAscii()) { switch (next) { case '!': return MarkupDeclaration(); case '?': RaiseErrorOccurred(HtmlParseError.BogusComment); return BogusComment(next); default: _state = HtmlParseMode.PCData; RaiseErrorOccurred(HtmlParseError.AmbiguousOpenTag); _textBuffer.Append('<'); return Data(next); } } HtmlTagToken tag2 = HtmlTagToken.Open(); _stringBuffer.Clear().Append(char.ToLower(next)); return TagName(tag2); }
// After "</": an ASCII letter (lower-cased) seeds _stringBuffer and continues in TagName with a new closing tag;
// '>' sets PCData mode, reports TagClosedWrong and resumes Data; the branch raising HtmlParseError.EOF steps back,
// appends "</" as text and returns HtmlToken.EndOfFile; anything else is read as a bogus comment.
private HtmlToken TagEnd(char c) { if (c.IsLowercaseAscii()) { HtmlTagToken tag = HtmlTagToken.Close(); _stringBuffer.Clear().Append(c); return TagName(tag); } if (!c.IsUppercaseAscii()) { switch (c) { case '>': _state = HtmlParseMode.PCData; RaiseErrorOccurred(HtmlParseError.TagClosedWrong); return Data(GetNext()); case '': Back(); RaiseErrorOccurred(HtmlParseError.EOF); _textBuffer.Append('<').Append('/'); return HtmlToken.EndOfFile; default: RaiseErrorOccurred(HtmlParseError.BogusComment); return BogusComment(c); } } HtmlTagToken tag2 = HtmlTagToken.Close(); _stringBuffer.Clear().Append(char.ToLower(c)); return TagName(tag2); }
// Collects the tag name in _stringBuffer (upper-case ASCII letters are lower-cased). Whitespace ends the name and
// continues in AttributeBeforeName, '/' goes to TagSelfClosing, '>' emits the tag; the HtmlParseError.Null branch
// appends U+FFFD and the HtmlParseError.EOF branch returns HtmlToken.EndOfFile.
private HtmlToken TagName(HtmlTagToken tag) { while (true) { char next = GetNext(); if (next.IsSpaceCharacter()) break; switch (next) { case '/': tag.Name = _stringBuffer.ToString(); return 
TagSelfClosing(tag); case '>': tag.Name = _stringBuffer.ToString(); return EmitTag(tag); case '': RaiseErrorOccurred(HtmlParseError.Null); _stringBuffer.Append('�'); break; case '': RaiseErrorOccurred(HtmlParseError.EOF); return HtmlToken.EndOfFile; default: if (next.IsUppercaseAscii()) _stringBuffer.Append(char.ToLower(next)); else _stringBuffer.Append(next); break; } } tag.Name = _stringBuffer.ToString(); return AttributeBeforeName(tag); }
// NOTE(review): char literals that appear as '' in this copy look like control-character literals lost in extraction
// (presumably NUL where HtmlParseError.Null is raised, an end-of-input marker elsewhere) - confirm against the original source.
// After '/' inside a tag: '>' marks the tag as self-closing and emits it; the HtmlParseError.EOF branch returns
// HtmlToken.EndOfFile; anything else reports ClosingSlashMisplaced, steps back one character and continues in
// AttributeBeforeName.
private HtmlToken TagSelfClosing(HtmlTagToken tag) { switch (GetNext()) { case '>': tag.IsSelfClosing = true; return EmitTag(tag); case '': RaiseErrorOccurred(HtmlParseError.EOF); return HtmlToken.EndOfFile; default: RaiseErrorOccurred(HtmlParseError.ClosingSlashMisplaced); Back(); return AttributeBeforeName(tag); } }
// After "<!": "--" starts a comment, Tags.Doctype starts a doctype, and "[CDATA[" starts a CDATA section when
// _acceptsCharacterData is set; the matched text is skipped with Advance. Anything else reports
// UndefinedMarkupDeclaration and is read as a bogus comment.
private HtmlToken MarkupDeclaration() { char next = GetNext(); if (ContinuesWith("--", true)) { Advance(); return CommentStart(); } if (ContinuesWith(Tags.Doctype, true)) { Advance(6); return Doctype(); } if (_acceptsCharacterData && ContinuesWith("[CDATA[", false)) { Advance(6); return CData(); } RaiseErrorOccurred(HtmlParseError.UndefinedMarkupDeclaration); return BogusComment(next); }
// Collects characters into _stringBuffer until '>' (or the branch that steps back and jumps to the '>' case),
// then sets PCData mode and emits the comment. One '' branch appends U+FFFD instead of the character.
private HtmlToken BogusComment(char c) { _stringBuffer.Clear(); while (true) { switch (c) { case '': Back(); goto case '>'; case '': _stringBuffer.Append('�'); c = GetNext(); break; default: _stringBuffer.Append(c); c = GetNext(); break; case '>': _state = HtmlParseMode.PCData; return EmitComment(); } } }
// First character after "<!--" (with _stringBuffer cleared): '-' goes to CommentDashStart; '>' sets PCData mode,
// reports TagClosedWrong and emits the comment; the HtmlParseError.EOF branch steps back and emits; the
// HtmlParseError.Null branch appends U+FFFD and any other character is appended before continuing in Comment.
private HtmlToken CommentStart() { char next = GetNext(); _stringBuffer.Clear(); switch (next) { case '-': return CommentDashStart(); case '': RaiseErrorOccurred(HtmlParseError.Null); _stringBuffer.Append('�'); return Comment(); case '>': _state = HtmlParseMode.PCData; RaiseErrorOccurred(HtmlParseError.TagClosedWrong); break; case '': RaiseErrorOccurred(HtmlParseError.EOF); Back(); break; default: _stringBuffer.Append(next); return Comment(); } return EmitComment(); }
// Handles the character after "<!--" followed by one '-': a second '-' goes to CommentEnd; '>' sets PCData mode,
// reports TagClosedWrong and emits the comment; the HtmlParseError.EOF branch steps back and emits; otherwise '-'
// plus the character (U+FFFD on the HtmlParseError.Null branch) is appended and Comment continues.
private HtmlToken CommentDashStart() { char next = GetNext(); 
switch (next)
{
    case '-':
        // CommentEnd returns null when the comment is not finished yet (it has then
        // already appended the pending dashes); in that case reading has to go on in
        // the comment state instead of handing the null to the caller.
        return CommentEnd() ?? Comment();
    // NOTE(review): the '' literals in this file look like control-character literals lost in
    // extraction (presumably NUL on the HtmlParseError.Null branches, an end-of-input marker on
    // the HtmlParseError.EOF branches) - confirm against the original source.
    case '':
        RaiseErrorOccurred(HtmlParseError.Null);
        _stringBuffer.Append('-').Append('�');
        return Comment();
    case '>':
        _state = HtmlParseMode.PCData;
        RaiseErrorOccurred(HtmlParseError.TagClosedWrong);
        break;
    case '':
        RaiseErrorOccurred(HtmlParseError.EOF);
        Back();
        break;
    default:
        _stringBuffer.Append('-').Append(next);
        return Comment();
}

return EmitComment();
}

// Comment state: appends characters to _stringBuffer until a dash sequence ends the comment
// (CommentDashEnd returns a token) or the HtmlParseError.EOF branch emits what was collected.
// The HtmlParseError.Null branch appends U+FFFD instead of the character.
private HtmlToken Comment()
{
    while (true)
    {
        char next = GetNext();

        switch (next)
        {
            case '-':
            {
                // A null result means "still inside the comment": keep looping.
                HtmlToken htmlToken = CommentDashEnd();

                if (htmlToken != null)
                {
                    return htmlToken;
                }

                break;
            }
            case '':
                RaiseErrorOccurred(HtmlParseError.EOF);
                Back();
                return EmitComment();
            case '':
                RaiseErrorOccurred(HtmlParseError.Null);
                next = '�';
                _stringBuffer.Append(next);
                break;
            default:
                _stringBuffer.Append(next);
                break;
        }
    }
}

// After one '-' inside a comment: a second '-' goes to CommentEnd; the HtmlParseError.EOF
// branch steps back and emits the comment. Otherwise '-' plus the character (U+FFFD on the
// HtmlParseError.Null branch) is appended and null is returned so that the caller continues
// in the comment state.
private HtmlToken CommentDashEnd()
{
    char c = GetNext();

    switch (c)
    {
        case '-':
            return CommentEnd();
        case '':
            RaiseErrorOccurred(HtmlParseError.EOF);
            Back();
            return EmitComment();
        case '':
            RaiseErrorOccurred(HtmlParseError.Null);
            c = '�';
            break;
    }

    _stringBuffer.Append('-').Append(c);
    return null;
}

// After "--" inside a comment: '>' sets PCData mode and emits the comment; '!' reports
// CommentEndedWithEM and continues in CommentBangEnd; a further '-' reports
// CommentEndedWithDash, appends one '-' and stays here; the HtmlParseError.EOF branch steps
// back and emits. Any other character means the comment goes on: the two pending dashes and
// the character (U+FFFD on the HtmlParseError.Null branch) are appended and null is returned
// so that the caller continues in the comment state.
private HtmlToken CommentEnd()
{
    while (true)
    {
        char next = GetNext();

        switch (next)
        {
            case '>':
                _state = HtmlParseMode.PCData;
                return EmitComment();
            case '':
                RaiseErrorOccurred(HtmlParseError.Null);
                // Both pending dashes belong to the comment text, as in the default branch.
                _stringBuffer.Append('-').Append('-').Append('�');
                return null;
            case '!':
                RaiseErrorOccurred(HtmlParseError.CommentEndedWithEM);
                return CommentBangEnd();
            case '-':
                break;
            case '':
                RaiseErrorOccurred(HtmlParseError.EOF);
                Back();
                return EmitComment();
            default:
                RaiseErrorOccurred(HtmlParseError.CommentEndedUnexpected);
                _stringBuffer.Append('-').Append('-').Append(next);
                return null;
        }

        RaiseErrorOccurred(HtmlParseError.CommentEndedWithDash);
        _stringBuffer.Append('-');
    }
}

// After "--!" inside a comment: '-' appends "--!" and continues in CommentDashEnd; '>' sets
// PCData mode and emits the comment; the remaining branches follow on the next line.
private HtmlToken CommentBangEnd()
{
    char next = GetNext();

    switch (next)
    {
        case '-':
            _stringBuffer.Append('-').Append('-').Append('!');
            return CommentDashEnd();
        case '>':
            _state = 
HtmlParseMode.PCData; break; case '': RaiseErrorOccurred(HtmlParseError.Null); _stringBuffer.Append('-').Append('-').Append('!') .Append('�'); return null; case '': RaiseErrorOccurred(HtmlParseError.EOF); Back(); break; default: _stringBuffer.Append('-').Append('-').Append('!') .Append(next); return null; } return EmitComment(); }
// NOTE(review): char literals that appear as '' in this copy look like control-character literals lost in extraction
// (presumably NUL where HtmlParseError.Null is raised, an end-of-input marker elsewhere) - confirm against the original source.
// After the DOCTYPE keyword: whitespace continues in DoctypeNameBefore with the following character; the '' branch
// reports HtmlParseError.EOF, steps back and emits a token created with HtmlToken.Doctype(true) (presumably
// "quirks forced" - confirm); anything else reports DoctypeUnexpected and continues in DoctypeNameBefore with
// that character.
private HtmlToken Doctype() { char next = GetNext(); if (next.IsSpaceCharacter()) return DoctypeNameBefore(GetNext()); if (next == '') { RaiseErrorOccurred(HtmlParseError.EOF); Back(); HtmlDoctypeToken token = HtmlToken.Doctype(true); return Emit(token); } RaiseErrorOccurred(HtmlParseError.DoctypeUnexpected); return DoctypeNameBefore(next); }
// Skips whitespace, then starts the doctype name in _stringBuffer: upper-case ASCII letters are lower-cased, the
// HtmlParseError.Null branch starts the name with U+FFFD, any other character starts it as-is; all of these
// continue in DoctypeName with a token from HtmlToken.Doctype(false). '>' (TagClosedWrong, PCData mode) and the
// HtmlParseError.EOF branch (steps back) emit a token from HtmlToken.Doctype(true) instead.
private HtmlToken DoctypeNameBefore(char c) { while (c.IsSpaceCharacter()) { c = GetNext(); } if (!c.IsUppercaseAscii()) { switch (c) { case '': { HtmlDoctypeToken doctype2 = HtmlToken.Doctype(false); RaiseErrorOccurred(HtmlParseError.Null); _stringBuffer.Clear().Append('�'); return DoctypeName(doctype2); } case '>': { HtmlDoctypeToken token2 = HtmlToken.Doctype(true); _state = HtmlParseMode.PCData; RaiseErrorOccurred(HtmlParseError.TagClosedWrong); return Emit(token2); } case '': { HtmlDoctypeToken token = HtmlToken.Doctype(true); RaiseErrorOccurred(HtmlParseError.EOF); Back(); return Emit(token); } default: { HtmlDoctypeToken doctype = HtmlToken.Doctype(false); _stringBuffer.Clear().Append(c); return DoctypeName(doctype); } } } HtmlDoctypeToken doctype3 = HtmlToken.Doctype(false); _stringBuffer.Clear().Append(char.ToLower(c)); return DoctypeName(doctype3); }
// Collects the doctype name in _stringBuffer (upper-case ASCII lower-cased, U+FFFD on the HtmlParseError.Null
// branch). Whitespace stores the name, clears the buffer and continues in DoctypeNameAfter; '>' sets PCData mode,
// stores the name and emits; the HtmlParseError.EOF branch steps back, forces quirks, stores the name and emits.
private HtmlToken DoctypeName(HtmlDoctypeToken doctype) { while (true) { char next = GetNext(); if (next.IsSpaceCharacter()) { doctype.Name = _stringBuffer.ToString(); _stringBuffer.Clear(); return DoctypeNameAfter(doctype); } if (next == '>') { _state = HtmlParseMode.PCData; doctype.Name = _stringBuffer.ToString(); break; } if (next.IsUppercaseAscii()) _stringBuffer.Append(char.ToLower(next)); else { switch (next) { case '': break; case '': 
goto IL_0094; default: goto IL_00bb; } RaiseErrorOccurred(HtmlParseError.Null); _stringBuffer.Append('�'); } continue; IL_00bb: _stringBuffer.Append(next); continue; IL_0094: RaiseErrorOccurred(HtmlParseError.EOF); Back(); doctype.IsQuirksForced = true; doctype.Name = _stringBuffer.ToString(); break; } return Emit(doctype); }
// NOTE(review): char literals that appear as '' in this copy look like control-character literals lost in extraction
// (presumably NUL where HtmlParseError.Null is raised, an end-of-input marker elsewhere) - confirm against the original source.
// After the doctype name and any whitespace: '>' sets PCData mode and emits; the HtmlParseError.EOF branch steps
// back, forces quirks and emits; "public" / "system" (matched with ContinuesWith, remainder skipped with Advance(5))
// continue in DoctypePublic / DoctypeSystem; anything else reports DoctypeUnexpectedAfterName, forces quirks and
// continues in BogusDoctype.
private HtmlToken DoctypeNameAfter(HtmlDoctypeToken doctype) { switch (SkipSpaces()) { case '>': _state = HtmlParseMode.PCData; break; case '': RaiseErrorOccurred(HtmlParseError.EOF); Back(); doctype.IsQuirksForced = true; break; default: if (ContinuesWith("public", true)) { Advance(5); return DoctypePublic(doctype); } if (ContinuesWith("system", true)) { Advance(5); return DoctypeSystem(doctype); } RaiseErrorOccurred(HtmlParseError.DoctypeUnexpectedAfterName); doctype.IsQuirksForced = true; return BogusDoctype(doctype); } return Emit(doctype); }
// Directly after the "public" keyword: whitespace continues in DoctypePublicIdentifierBefore; a quote reports the
// matching ...QuotationMarkUnexpected error, sets an empty public identifier and continues in the matching
// quoted-identifier state; '>' (TagClosedWrong, PCData mode) and the HtmlParseError.EOF branch force quirks and
// emit; anything else reports DoctypePublicInvalid, forces quirks and continues in BogusDoctype.
private HtmlToken DoctypePublic(HtmlDoctypeToken doctype) { char next = GetNext(); if (next.IsSpaceCharacter()) return DoctypePublicIdentifierBefore(doctype); switch (next) { case '"': RaiseErrorOccurred(HtmlParseError.DoubleQuotationMarkUnexpected); doctype.PublicIdentifier = string.Empty; return DoctypePublicIdentifierDoubleQuoted(doctype); case '\'': RaiseErrorOccurred(HtmlParseError.SingleQuotationMarkUnexpected); doctype.PublicIdentifier = string.Empty; return DoctypePublicIdentifierSingleQuoted(doctype); case '>': _state = HtmlParseMode.PCData; RaiseErrorOccurred(HtmlParseError.TagClosedWrong); doctype.IsQuirksForced = true; break; case '': RaiseErrorOccurred(HtmlParseError.EOF); doctype.IsQuirksForced = true; Back(); break; default: RaiseErrorOccurred(HtmlParseError.DoctypePublicInvalid); doctype.IsQuirksForced = true; return BogusDoctype(doctype); } return Emit(doctype); }
// Skips whitespace before the public identifier: a quote clears _stringBuffer, sets an empty public identifier and
// continues in the matching quoted-identifier state; '>' (TagClosedWrong, PCData mode) and the HtmlParseError.EOF
// branch force quirks and emit; anything else reports DoctypePublicInvalid, forces quirks and continues in
// BogusDoctype.
private HtmlToken DoctypePublicIdentifierBefore(HtmlDoctypeToken doctype) { switch (SkipSpaces()) { case '"': _stringBuffer.Clear(); doctype.PublicIdentifier = string.Empty; return 
DoctypePublicIdentifierDoubleQuoted(doctype); case '\'': _stringBuffer.Clear(); doctype.PublicIdentifier = string.Empty; return DoctypePublicIdentifierSingleQuoted(doctype); case '>': _state = HtmlParseMode.PCData; RaiseErrorOccurred(HtmlParseError.TagClosedWrong); doctype.IsQuirksForced = true; break; case '': RaiseErrorOccurred(HtmlParseError.EOF); doctype.IsQuirksForced = true; Back(); break; default: RaiseErrorOccurred(HtmlParseError.DoctypePublicInvalid); doctype.IsQuirksForced = true; return BogusDoctype(doctype); } return Emit(doctype); }
// NOTE(review): char literals that appear as '' in this copy look like control-character literals lost in extraction
// (presumably NUL where HtmlParseError.Null is raised, an end-of-input marker elsewhere) - confirm against the original source.
// Collects the double-quoted public identifier in _stringBuffer. The closing '"' stores it, clears the buffer and
// continues in DoctypePublicIdentifierAfter; '>' (TagClosedWrong, PCData mode) and the HtmlParseError.EOF branch
// (steps back) force quirks, store what was collected and emit; the HtmlParseError.Null branch appends U+FFFD.
private HtmlToken DoctypePublicIdentifierDoubleQuoted(HtmlDoctypeToken doctype) { while (true) { char next = GetNext(); switch (next) { case '"': doctype.PublicIdentifier = _stringBuffer.ToString(); _stringBuffer.Clear(); return DoctypePublicIdentifierAfter(doctype); case '': RaiseErrorOccurred(HtmlParseError.Null); _stringBuffer.Append('�'); break; case '>': _state = HtmlParseMode.PCData; RaiseErrorOccurred(HtmlParseError.TagClosedWrong); doctype.IsQuirksForced = true; doctype.PublicIdentifier = _stringBuffer.ToString(); goto IL_00ba; case '': RaiseErrorOccurred(HtmlParseError.EOF); Back(); doctype.IsQuirksForced = true; doctype.PublicIdentifier = _stringBuffer.ToString(); goto IL_00ba; default: { _stringBuffer.Append(next); break; } IL_00ba: return Emit(doctype); } } }
// Collects the single-quoted public identifier in _stringBuffer. The closing '\'' stores it, clears the buffer and
// continues in DoctypePublicIdentifierAfter; '>' (TagClosedWrong, PCData mode) and the HtmlParseError.EOF branch
// (steps back) force quirks, store what was collected and emit; the HtmlParseError.Null branch appends U+FFFD.
private HtmlToken DoctypePublicIdentifierSingleQuoted(HtmlDoctypeToken doctype) { while (true) { char next = GetNext(); switch (next) { case '\'': doctype.PublicIdentifier = _stringBuffer.ToString(); _stringBuffer.Clear(); return DoctypePublicIdentifierAfter(doctype); case '': RaiseErrorOccurred(HtmlParseError.Null); _stringBuffer.Append('�'); break; case '>': _state = HtmlParseMode.PCData; RaiseErrorOccurred(HtmlParseError.TagClosedWrong); doctype.IsQuirksForced = true; doctype.PublicIdentifier = _stringBuffer.ToString(); goto IL_00ba; case '': RaiseErrorOccurred(HtmlParseError.EOF); doctype.IsQuirksForced = true; doctype.PublicIdentifier = 
_stringBuffer.ToString(); Back(); goto IL_00ba; default: { _stringBuffer.Append(next); break; } IL_00ba: return Emit(doctype); } } }
// NOTE(review): char literals that appear as '' in this copy look like control-character literals lost in extraction
// (presumably NUL where HtmlParseError.Null is raised, an end-of-input marker elsewhere) - confirm against the original source.
// Directly after the public identifier: whitespace clears _stringBuffer and continues in DoctypeBetween; '>' sets
// PCData mode and emits; a quote reports the matching ...QuotationMarkUnexpected error, sets an empty system
// identifier and continues in the matching system-identifier state; the HtmlParseError.EOF branch forces quirks,
// steps back and emits; anything else reports DoctypeInvalidCharacter, forces quirks and continues in BogusDoctype.
private HtmlToken DoctypePublicIdentifierAfter(HtmlDoctypeToken doctype) { char next = GetNext(); if (next.IsSpaceCharacter()) { _stringBuffer.Clear(); return DoctypeBetween(doctype); } switch (next) { case '>': _state = HtmlParseMode.PCData; break; case '"': RaiseErrorOccurred(HtmlParseError.DoubleQuotationMarkUnexpected); doctype.SystemIdentifier = string.Empty; return DoctypeSystemIdentifierDoubleQuoted(doctype); case '\'': RaiseErrorOccurred(HtmlParseError.SingleQuotationMarkUnexpected); doctype.SystemIdentifier = string.Empty; return DoctypeSystemIdentifierSingleQuoted(doctype); case '': RaiseErrorOccurred(HtmlParseError.EOF); doctype.IsQuirksForced = true; Back(); break; default: RaiseErrorOccurred(HtmlParseError.DoctypeInvalidCharacter); doctype.IsQuirksForced = true; return BogusDoctype(doctype); } return Emit(doctype); }
// Between public and system identifier, after skipping whitespace: '>' sets PCData mode and emits; a quote sets an
// empty system identifier and continues in the matching system-identifier state (no error is reported here); the
// HtmlParseError.EOF branch forces quirks, steps back and emits; anything else reports DoctypeInvalidCharacter,
// forces quirks and continues in BogusDoctype.
private HtmlToken DoctypeBetween(HtmlDoctypeToken doctype) { switch (SkipSpaces()) { case '>': _state = HtmlParseMode.PCData; break; case '"': doctype.SystemIdentifier = string.Empty; return DoctypeSystemIdentifierDoubleQuoted(doctype); case '\'': doctype.SystemIdentifier = string.Empty; return DoctypeSystemIdentifierSingleQuoted(doctype); case '': RaiseErrorOccurred(HtmlParseError.EOF); doctype.IsQuirksForced = true; Back(); break; default: RaiseErrorOccurred(HtmlParseError.DoctypeInvalidCharacter); doctype.IsQuirksForced = true; return BogusDoctype(doctype); } return Emit(doctype); }
// Directly after the "system" keyword: whitespace sets PCData mode and continues in DoctypeSystemIdentifierBefore;
// a quote reports the matching ...QuotationMarkUnexpected error, sets an empty system identifier and continues in
// the matching quoted state; '>' reports TagClosedWrong, stores the current _stringBuffer content as system
// identifier, forces quirks and emits; the HtmlParseError.EOF branch forces quirks, steps back and emits; anything
// else reports DoctypeSystemInvalid, forces quirks and continues in BogusDoctype.
private HtmlToken DoctypeSystem(HtmlDoctypeToken doctype) { char next = GetNext(); if (next.IsSpaceCharacter()) { _state = HtmlParseMode.PCData; return DoctypeSystemIdentifierBefore(doctype); } switch (next) { case '"': RaiseErrorOccurred(HtmlParseError.DoubleQuotationMarkUnexpected); doctype.SystemIdentifier = string.Empty; return DoctypeSystemIdentifierDoubleQuoted(doctype); case '\'': 
RaiseErrorOccurred(HtmlParseError.SingleQuotationMarkUnexpected); doctype.SystemIdentifier = string.Empty; return DoctypeSystemIdentifierSingleQuoted(doctype); case '>': RaiseErrorOccurred(HtmlParseError.TagClosedWrong); doctype.SystemIdentifier = _stringBuffer.ToString(); doctype.IsQuirksForced = true; break; case '': RaiseErrorOccurred(HtmlParseError.EOF); doctype.IsQuirksForced = true; Back(); break; default: RaiseErrorOccurred(HtmlParseError.DoctypeSystemInvalid); doctype.IsQuirksForced = true; return BogusDoctype(doctype); } return Emit(doctype); }
// NOTE(review): char literals that appear as '' in this copy look like control-character literals lost in extraction
// (presumably NUL where HtmlParseError.Null is raised, an end-of-input marker elsewhere) - confirm against the original source.
// Skips whitespace before the system identifier: a quote sets an empty system identifier and continues in the
// matching quoted state; '>' (TagClosedWrong, PCData mode) and the HtmlParseError.EOF branch (steps back) force
// quirks, store the current _stringBuffer content as system identifier and emit; anything else reports
// DoctypeInvalidCharacter, forces quirks and continues in BogusDoctype.
private HtmlToken DoctypeSystemIdentifierBefore(HtmlDoctypeToken doctype) { switch (SkipSpaces()) { case '"': doctype.SystemIdentifier = string.Empty; return DoctypeSystemIdentifierDoubleQuoted(doctype); case '\'': doctype.SystemIdentifier = string.Empty; return DoctypeSystemIdentifierSingleQuoted(doctype); case '>': _state = HtmlParseMode.PCData; RaiseErrorOccurred(HtmlParseError.TagClosedWrong); doctype.IsQuirksForced = true; doctype.SystemIdentifier = _stringBuffer.ToString(); break; case '': RaiseErrorOccurred(HtmlParseError.EOF); doctype.IsQuirksForced = true; doctype.SystemIdentifier = _stringBuffer.ToString(); Back(); break; default: RaiseErrorOccurred(HtmlParseError.DoctypeInvalidCharacter); doctype.IsQuirksForced = true; return BogusDoctype(doctype); } return Emit(doctype); }
// Collects the double-quoted system identifier in _stringBuffer. The closing '"' stores it, clears the buffer and
// continues in DoctypeSystemIdentifierAfter; '>' (TagClosedWrong, PCData mode) and the HtmlParseError.EOF branch
// (steps back) force quirks, store what was collected and emit; the HtmlParseError.Null branch appends U+FFFD.
private HtmlToken DoctypeSystemIdentifierDoubleQuoted(HtmlDoctypeToken doctype) { while (true) { char next = GetNext(); switch (next) { case '"': doctype.SystemIdentifier = _stringBuffer.ToString(); _stringBuffer.Clear(); return DoctypeSystemIdentifierAfter(doctype); case '': RaiseErrorOccurred(HtmlParseError.Null); _stringBuffer.Append('�'); break; case '>': _state = HtmlParseMode.PCData; RaiseErrorOccurred(HtmlParseError.TagClosedWrong); doctype.IsQuirksForced = true; doctype.SystemIdentifier = _stringBuffer.ToString(); goto IL_00ba; case '': RaiseErrorOccurred(HtmlParseError.EOF); doctype.IsQuirksForced = true; 
doctype.SystemIdentifier = _stringBuffer.ToString(); Back(); goto IL_00ba; default: { _stringBuffer.Append(next); break; } IL_00ba: return Emit(doctype); } } }
// NOTE(review): char literals that appear as '' in this copy look like control-character literals lost in extraction
// (presumably NUL where HtmlParseError.Null is raised, an end-of-input marker elsewhere) - confirm against the original source.
// Collects the single-quoted system identifier in _stringBuffer. The closing '\'' stores it, clears the buffer and
// continues in DoctypeSystemIdentifierAfter; '>' (TagClosedWrong, PCData mode) and the HtmlParseError.EOF branch
// (steps back) force quirks, store what was collected and emit; the HtmlParseError.Null branch appends U+FFFD.
private HtmlToken DoctypeSystemIdentifierSingleQuoted(HtmlDoctypeToken doctype) { while (true) { char next = GetNext(); switch (next) { case '\'': doctype.SystemIdentifier = _stringBuffer.ToString(); _stringBuffer.Clear(); return DoctypeSystemIdentifierAfter(doctype); case '': RaiseErrorOccurred(HtmlParseError.Null); _stringBuffer.Append('�'); break; case '>': _state = HtmlParseMode.PCData; RaiseErrorOccurred(HtmlParseError.TagClosedWrong); doctype.IsQuirksForced = true; doctype.SystemIdentifier = _stringBuffer.ToString(); goto IL_00c9; case '': RaiseErrorOccurred(HtmlParseError.EOF); doctype.IsQuirksForced = true; doctype.SystemIdentifier = _stringBuffer.ToString(); Back(); goto IL_00c9; default: { _stringBuffer.Append(next); break; } IL_00c9: return Emit(doctype); } } }
// After the system identifier and any whitespace: '>' sets PCData mode and emits; the HtmlParseError.EOF branch
// forces quirks, steps back and emits; anything else reports DoctypeInvalidCharacter and continues in BogusDoctype
// without forcing quirks.
private HtmlToken DoctypeSystemIdentifierAfter(HtmlDoctypeToken doctype) { switch (SkipSpaces()) { case '>': _state = HtmlParseMode.PCData; break; case '': RaiseErrorOccurred(HtmlParseError.EOF); doctype.IsQuirksForced = true; Back(); break; default: RaiseErrorOccurred(HtmlParseError.DoctypeInvalidCharacter); return BogusDoctype(doctype); } return Emit(doctype); }
// Discards characters until '>' (which sets PCData mode) or the '' branch that steps back, then emits the doctype token.
private HtmlToken BogusDoctype(HtmlDoctypeToken doctype) { while (true) { switch (GetNext()) { case '>': _state = HtmlParseMode.PCData; goto IL_0020; case '': { Back(); goto IL_0020; } IL_0020: return Emit(doctype); } } }
// Skips whitespace before an attribute name: '/' goes to TagSelfClosing and '>' emits the tag. Otherwise the
// character (lower-cased if upper-case ASCII, U+FFFD on the HtmlParseError.Null branch) starts a new name in
// _stringBuffer and AttributeName continues; '"', '\'', '<' and '=' additionally report AttributeNameInvalid.
// One '' branch returns HtmlToken.EndOfFile.
private HtmlToken AttributeBeforeName(HtmlTagToken tag) { char c = SkipSpaces(); switch (c) { case '/': return TagSelfClosing(tag); case '>': return EmitTag(tag); default: if (!c.IsUppercaseAscii()) { switch (c) { case '': RaiseErrorOccurred(HtmlParseError.Null); _stringBuffer.Clear().Append('�'); return AttributeName(tag); case '"': case '\'': case '<': case '=': RaiseErrorOccurred(HtmlParseError.AttributeNameInvalid); _stringBuffer.Clear().Append(c); return 
AttributeName(tag); case '': return HtmlToken.EndOfFile; default: _stringBuffer.Clear().Append(c); return AttributeName(tag); } } _stringBuffer.Clear().Append(char.ToLower(c)); return AttributeName(tag); } }
// NOTE(review): char literals that appear as '' in this copy look like control-character literals lost in extraction
// (presumably NUL where HtmlParseError.Null is raised, an end-of-input marker elsewhere) - confirm against the original source.
// Collects the attribute name in _stringBuffer (upper-case ASCII lower-cased, U+FFFD on the HtmlParseError.Null
// branch; '"', '\'' and '<' are kept but reported as AttributeNameInvalid). Whitespace, '/', '=' and '>' add the
// attribute to the tag and continue in AttributeAfterName, TagSelfClosing, AttributeBeforeValue or EmitTag
// respectively. One '' branch returns HtmlToken.EndOfFile without adding the attribute.
private HtmlToken AttributeName(HtmlTagToken tag) { while (true) { char next = GetNext(); if (next.IsSpaceCharacter()) break; switch (next) { case '/': tag.AddAttribute(_stringBuffer.ToString()); return TagSelfClosing(tag); case '=': tag.AddAttribute(_stringBuffer.ToString()); return AttributeBeforeValue(tag); case '>': tag.AddAttribute(_stringBuffer.ToString()); return EmitTag(tag); case '': return HtmlToken.EndOfFile; case '': RaiseErrorOccurred(HtmlParseError.Null); _stringBuffer.Append('�'); break; default: if (next.IsUppercaseAscii()) _stringBuffer.Append(char.ToLower(next)); else if (next == '"' || next == '\'' || next == '<') { RaiseErrorOccurred(HtmlParseError.AttributeNameInvalid); _stringBuffer.Append(next); } else { _stringBuffer.Append(next); } break; } } tag.AddAttribute(_stringBuffer.ToString()); return AttributeAfterName(tag); }
// After an attribute name and whitespace: '/' goes to TagSelfClosing, '=' to AttributeBeforeValue, '>' emits the
// tag. Any other character (lower-cased if upper-case ASCII, U+FFFD on the HtmlParseError.Null branch) starts the
// next attribute name in _stringBuffer and continues in AttributeName; '"', '\'' and '<' additionally report
// AttributeNameInvalid. One '' branch returns HtmlToken.EndOfFile.
private HtmlToken AttributeAfterName(HtmlTagToken tag) { char c = SkipSpaces(); switch (c) { case '/': return TagSelfClosing(tag); case '=': return AttributeBeforeValue(tag); case '>': return EmitTag(tag); default: if (!c.IsUppercaseAscii()) { switch (c) { case '': RaiseErrorOccurred(HtmlParseError.Null); _stringBuffer.Clear().Append('�'); return AttributeName(tag); case '"': case '\'': case '<': RaiseErrorOccurred(HtmlParseError.AttributeNameInvalid); _stringBuffer.Clear().Append(c); return AttributeName(tag); case '': return HtmlToken.EndOfFile; default: _stringBuffer.Clear().Append(c); return AttributeName(tag); } } _stringBuffer.Clear().Append(char.ToLower(c)); return AttributeName(tag); } }
// Skips whitespace before an attribute value: '"' and '\'' clear _stringBuffer and continue in the matching
// quoted-value state; '&' clears it and lets AttributeUnquotedValue process the '&'; '>' reports TagClosedWrong and
// emits the tag; '<', '=' and '`' report AttributeValueInvalid and, like any other character, start an unquoted
// value. The remaining branches follow on the next line.
private HtmlToken AttributeBeforeValue(HtmlTagToken tag) { char c = SkipSpaces(); switch (c) { case '"': _stringBuffer.Clear(); return AttributeDoubleQuotedValue(tag); case '&': _stringBuffer.Clear(); return 
AttributeUnquotedValue(c, tag);
    case '\'':
        _stringBuffer.Clear();
        return AttributeSingleQuotedValue(tag);
    // NOTE(review): the '' literals in this file look like control-character literals lost in
    // extraction (presumably NUL on the HtmlParseError.Null branches, an end-of-input marker on
    // the branches that return HtmlToken.EndOfFile) - confirm against the original source.
    case '':
        RaiseErrorOccurred(HtmlParseError.Null);
        // _stringBuffer still holds the attribute name collected by AttributeName, so it has
        // to be cleared before the value is started - exactly like every other branch here.
        _stringBuffer.Clear().Append('�');
        return AttributeUnquotedValue(GetNext(), tag);
    case '>':
        RaiseErrorOccurred(HtmlParseError.TagClosedWrong);
        return EmitTag(tag);
    case '<':
    case '=':
    case '`':
        RaiseErrorOccurred(HtmlParseError.AttributeValueInvalid);
        _stringBuffer.Clear().Append(c);
        return AttributeUnquotedValue(GetNext(), tag);
    case '':
        return HtmlToken.EndOfFile;
    default:
        _stringBuffer.Clear().Append(c);
        return AttributeUnquotedValue(GetNext(), tag);
}
}

// Collects a double-quoted attribute value in _stringBuffer. The closing '"' stores the value
// on the tag and continues in AttributeAfterValue; '&' attempts a character reference with '"'
// as the additionally allowed character (a literal '&' is appended when it resolves to null);
// the HtmlParseError.Null branch appends U+FFFD; one '' branch returns HtmlToken.EndOfFile.
private HtmlToken AttributeDoubleQuotedValue(HtmlTagToken tag)
{
    while (true)
    {
        char next = GetNext();

        switch (next)
        {
            case '"':
                tag.SetAttributeValue(_stringBuffer.ToString());
                return AttributeAfterValue(tag);
            case '&':
            {
                string text = CharacterReference(GetNext(), '"');

                if (text == null)
                {
                    _stringBuffer.Append('&');
                }
                else
                {
                    _stringBuffer.Append(text);
                }

                break;
            }
            case '':
                RaiseErrorOccurred(HtmlParseError.Null);
                _stringBuffer.Append('�');
                break;
            case '':
                return HtmlToken.EndOfFile;
            default:
                _stringBuffer.Append(next);
                break;
        }
    }
}

// Collects a single-quoted attribute value in _stringBuffer. The closing '\'' stores the value
// on the tag and continues in AttributeAfterValue; '&' attempts a character reference with '\''
// as the additionally allowed character (a literal '&' is appended when it resolves to null);
// the HtmlParseError.Null branch appends U+FFFD; one '' branch returns HtmlToken.EndOfFile.
private HtmlToken AttributeSingleQuotedValue(HtmlTagToken tag)
{
    while (true)
    {
        char next = GetNext();

        switch (next)
        {
            case '\'':
                tag.SetAttributeValue(_stringBuffer.ToString());
                return AttributeAfterValue(tag);
            case '&':
            {
                string text = CharacterReference(GetNext(), '\'');

                if (text == null)
                {
                    _stringBuffer.Append('&');
                }
                else
                {
                    _stringBuffer.Append(text);
                }

                break;
            }
            case '':
                RaiseErrorOccurred(HtmlParseError.Null);
                _stringBuffer.Append('�');
                break;
            case '':
                return HtmlToken.EndOfFile;
            default:
                _stringBuffer.Append(next);
                break;
        }
    }
}

// Collects an unquoted attribute value in _stringBuffer, starting with c. Whitespace ends the
// value and continues in AttributeBeforeName; '>' stores the value and emits the tag; '&'
// attempts a character reference with '>' as the additionally allowed character. The remaining
// branches follow on the next line.
private HtmlToken AttributeUnquotedValue(char c, HtmlTagToken tag)
{
    while (!c.IsSpaceCharacter())
    {
        switch (c)
        {
            case '&':
            {
                string text = CharacterReference(GetNext(), '>');

                if (text == null)
                {
                    _stringBuffer.Append('&');
                }
                else
                {
                    _stringBuffer.Append(text);
                }

                break;
            }
            case '>':
                tag.SetAttributeValue(_stringBuffer.ToString());
return EmitTag(tag); case '': RaiseErrorOccurred(HtmlParseError.Null); _stringBuffer.Append('�'); break; case '"': case '\'': case '<': case '=': case '`': RaiseErrorOccurred(HtmlParseError.AttributeValueInvalid); _stringBuffer.Append(c); break; case '': return HtmlToken.EndOfFile; default: _stringBuffer.Append(c); break; } c = GetNext(); } tag.SetAttributeValue(_stringBuffer.ToString()); return AttributeBeforeName(tag); }
// NOTE(review): char literals that appear as '' in this copy look like control-character literals lost in extraction
// (presumably NUL where HtmlParseError.Null is raised, an end-of-input marker elsewhere) - confirm against the original source.
// Directly after a quoted attribute value: whitespace continues in AttributeBeforeName, '/' goes to TagSelfClosing,
// '>' emits the tag, the '' branch returns HtmlToken.EndOfFile; anything else reports AttributeNameExpected, steps
// back one character and continues in AttributeBeforeName.
private HtmlToken AttributeAfterValue(HtmlTagToken tag) { char next = GetNext(); if (!next.IsSpaceCharacter()) { switch (next) { case '/': return TagSelfClosing(tag); case '>': return EmitTag(tag); case '': return HtmlToken.EndOfFile; default: RaiseErrorOccurred(HtmlParseError.AttributeNameExpected); Back(); return AttributeBeforeName(tag); } } return AttributeBeforeName(tag); }
// Script data state. "</" followed by a letter seeds _stringBuffer and continues in ScriptDataNameEndTag with a new
// closing tag; "<!--" is copied to _textBuffer and continues in ScriptDataEscapedDashDash; for partial matches the
// characters matched so far are copied to _textBuffer and the last character read is examined again by the loop.
// The HtmlParseError.Null branch appends U+FFFD; other characters are appended as-is.
private HtmlToken ScriptData(char c) { while (true) { switch (c) { case '<': c = GetNext(); if (c == '/') { c = GetNext(); if (c.IsLetter()) { HtmlTagToken tag = HtmlTagToken.Close(); _stringBuffer.Clear().Append(c); return ScriptDataNameEndTag(tag); } _textBuffer.Append('<').Append('/'); } else { _textBuffer.Append('<'); if (c == '!') { c = GetNext(); _textBuffer.Append('!'); if (c == '-') { c = GetNext(); _textBuffer.Append('-'); if (c == '-') { _textBuffer.Append('-'); return ScriptDataEscapedDashDash(); } } } } break; case '': RaiseErrorOccurred(HtmlParseError.Null); _textBuffer.Append('�'); goto IL_00fb; case '': return HtmlToken.EndOfFile; default: { _textBuffer.Append(c); goto IL_00fb; } IL_00fb: c = GetNext(); break; } } }
// Collects the letters of a script end-tag name in _stringBuffer. When the lower-cased name equals _lastStartTag
// and whitespace, '/' or '>' follows, the tag receives that name and tokenizing continues with AttributeBeforeName,
// TagSelfClosing or EmitTag. Otherwise a non-letter flushes "</" plus the collected name into _textBuffer and
// resumes ScriptData with that character.
private HtmlToken ScriptDataNameEndTag(HtmlTagToken tag) { char next; while (true) { next = GetNext(); string text = _stringBuffer.ToString().ToLowerInvariant(); if (text == _lastStartTag) { if (next.IsSpaceCharacter()) { tag.Name = text; return AttributeBeforeName(tag); } switch (next) { case '/': tag.Name = text; return TagSelfClosing(tag); case '>': tag.Name = text; return EmitTag(tag); } } if (!next.IsLetter()) break; 
_stringBuffer.Append(next); } _textBuffer.Append('<').Append('/').Append(_stringBuffer.ToString()); return ScriptData(next); }
// NOTE(review): char literals that appear as '' in this copy look like control-character literals lost in extraction
// (presumably NUL where HtmlParseError.Null is raised, an end-of-input marker elsewhere) - confirm against the original source.
// Script data escaped state. '-' is appended and the following character decides: a second '-' is appended and
// continues in ScriptDataEscapedDashDash, '<' goes to ScriptDataEscapedLT, other characters are appended (U+FFFD on
// the HtmlParseError.Null branch). A leading '<' goes to ScriptDataEscapedLT; a leading character that matches no
// special case hands over to ScriptData(c).
private HtmlToken ScriptDataEscaped(char c) { while (true) { switch (c) { case '-': _textBuffer.Append('-'); c = GetNext(); switch (c) { case '-': _textBuffer.Append('-'); return ScriptDataEscapedDashDash(); case '<': return ScriptDataEscapedLT(); case '': RaiseErrorOccurred(HtmlParseError.Null); _textBuffer.Append('�'); break; case '': return HtmlToken.EndOfFile; default: _textBuffer.Append(c); break; } break; case '<': return ScriptDataEscapedLT(); case '': RaiseErrorOccurred(HtmlParseError.Null); _textBuffer.Append('�'); break; case '': return HtmlToken.EndOfFile; default: return ScriptData(c); } c = GetNext(); } }
// After "--" in escaped script data: further '-' characters are appended; '<' goes to ScriptDataEscapedLT; '>' is
// appended and ScriptData resumes with the following character; anything else (U+FFFD on the HtmlParseError.Null
// branch) is appended and ScriptDataEscaped resumes with the following character.
private HtmlToken ScriptDataEscapedDashDash() { while (true) { char next = GetNext(); switch (next) { case '-': break; case '<': return ScriptDataEscapedLT(); case '>': _textBuffer.Append('>'); return ScriptData(GetNext()); case '': RaiseErrorOccurred(HtmlParseError.Null); _textBuffer.Append('�'); return ScriptDataEscaped(GetNext()); case '': return HtmlToken.EndOfFile; default: _textBuffer.Append(next); return ScriptDataEscaped(GetNext()); } _textBuffer.Append('-'); } }
// After '<' in escaped script data: '/' goes to ScriptDataEscapedEndTag; a letter seeds _stringBuffer, is copied
// together with the '<' to _textBuffer and continues in ScriptDataStartDoubleEscape; otherwise '<' is appended and
// ScriptDataEscaped resumes with the character read.
private HtmlToken ScriptDataEscapedLT() { char next = GetNext(); if (next == '/') return ScriptDataEscapedEndTag(); if (next.IsLetter()) { _stringBuffer.Clear().Append(next); _textBuffer.Append('<').Append(next); return ScriptDataStartDoubleEscape(); } _textBuffer.Append('<'); return ScriptDataEscaped(next); }
// After "</" in escaped script data: a letter seeds _stringBuffer and continues in ScriptDataEscapedNameTag with a
// new closing tag; otherwise "</" is appended as text and ScriptDataEscaped resumes with the character read.
private HtmlToken ScriptDataEscapedEndTag() { char next = GetNext(); if (next.IsLetter()) { HtmlTagToken tag = HtmlTagToken.Close(); _stringBuffer.Clear().Append(next); return ScriptDataEscapedNameTag(tag); } _textBuffer.Append('<').Append('/'); return ScriptDataEscaped(next); }
// Collects the letters of an end-tag name in _stringBuffer. When the lower-cased name equals _lastStartTag and
// whitespace, '/' or '>' follows, the tag receives that name and tokenizing continues with AttributeBeforeName,
// TagSelfClosing or EmitTag. Otherwise a non-letter flushes "</" plus the collected name into _textBuffer and
// resumes ScriptDataEscaped with that character.
private HtmlToken ScriptDataEscapedNameTag(HtmlTagToken tag) { char next; while (true) { next = GetNext(); string text = _stringBuffer.ToString().ToLowerInvariant(); if (text == 
_lastStartTag) { if (next.IsSpaceCharacter()) { tag.Name = text; return AttributeBeforeName(tag); } switch (next) { case '/': tag.Name = text; return TagSelfClosing(tag); case '>': tag.Name = text; return EmitTag(tag); } } if (!next.IsLetter()) break; _stringBuffer.Append(next); } _textBuffer.Append('<').Append('/').Append(_stringBuffer.ToString()); return ScriptDataEscaped(next); } private HtmlToken ScriptDataStartDoubleEscape() { char next; while (true) { next = GetNext(); if (next == '/' || next == '>' || next.IsSpaceCharacter()) { _textBuffer.Append(next); if (_stringBuffer.ToString().Equals(Tags.Script, StringComparison.OrdinalIgnoreCase)) return ScriptDataEscapedDouble(GetNext()); return ScriptDataEscaped(GetNext()); } if (!next.IsLetter()) break; _stringBuffer.Append(next); _textBuffer.Append(next); } return ScriptDataEscaped(next); } private HtmlToken ScriptDataEscapedDouble(char c) { while (true) { switch (c) { case '-': _textBuffer.Append('-'); c = GetNext(); switch (c) { case '-': _textBuffer.Append('-'); return ScriptDataEscapedDoubleDashDash(); case '<': _textBuffer.Append('<'); return ScriptDataEscapedDoubleLT(); case '': RaiseErrorOccurred(HtmlParseError.Null); c = '�'; break; case '': RaiseErrorOccurred(HtmlParseError.EOF); return HtmlToken.EndOfFile; } break; case '<': _textBuffer.Append('<'); return ScriptDataEscapedDoubleLT(); case '': RaiseErrorOccurred(HtmlParseError.Null); _textBuffer.Append('�'); break; case '': RaiseErrorOccurred(HtmlParseError.EOF); return HtmlToken.EndOfFile; } _textBuffer.Append(c); c = GetNext(); } } private HtmlToken ScriptDataEscapedDoubleDashDash() { while (true) { char next = GetNext(); switch (next) { case '-': break; case '<': _textBuffer.Append('<'); return ScriptDataEscapedDoubleLT(); case '>': _textBuffer.Append('>'); return ScriptData(GetNext()); case '': RaiseErrorOccurred(HtmlParseError.Null); _textBuffer.Append('�'); return ScriptDataEscapedDouble(GetNext()); case '': RaiseErrorOccurred(HtmlParseError.EOF); 
return HtmlToken.EndOfFile; default: _textBuffer.Append(next); return ScriptDataEscapedDouble(GetNext()); } _textBuffer.Append('-'); } } private HtmlToken ScriptDataEscapedDoubleLT() { char next = GetNext(); if (next == '/') { _stringBuffer.Clear(); _textBuffer.Append('/'); return ScriptDataEndDoubleEscape(); } return ScriptDataEscapedDouble(next); } private HtmlToken ScriptDataEndDoubleEscape() { char next; while (true) { next = GetNext(); if (next.IsSpaceCharacter() || next == '/' || next == '>') { _textBuffer.Append(next); if (_stringBuffer.ToString().Equals(Tags.Script, StringComparison.OrdinalIgnoreCase)) return ScriptDataEscaped(GetNext()); return ScriptDataEscapedDouble(GetNext()); } if (!next.IsLetter()) break; _stringBuffer.Append(next); _textBuffer.Append(next); } return ScriptDataEscapedDouble(next); } private HtmlToken Emit(HtmlToken token) { return token; } private HtmlToken EmitComment() { HtmlToken token = HtmlToken.Comment(_stringBuffer.ToString()); return Emit(token); } private HtmlToken EmitTag(HtmlTagToken tag) { _state = HtmlParseMode.PCData; List<KeyValuePair<string, string>> attributes = tag.Attributes; if (tag.Type == HtmlTokenType.StartTag) { for (int num = attributes.Count - 1; num > 0; num--) { for (int num2 = num - 1; num2 >= 0; num2--) { KeyValuePair<string, string> keyValuePair = attributes[num2]; string key = keyValuePair.Key; keyValuePair = attributes[num]; if (key == keyValuePair.Key) { attributes.RemoveAt(num); RaiseErrorOccurred(HtmlParseError.AttributeDuplicateOmitted); break; } } } _lastStartTag = tag.Name; } else { if (tag.IsSelfClosing) RaiseErrorOccurred(HtmlParseError.EndTagCannotBeSelfClosed); if (attributes.Count != 0) RaiseErrorOccurred(HtmlParseError.EndTagCannotHaveAttributes); } return Emit(tag); } } }