AngleSharp by Florian Rappl

<PackageReference Include="AngleSharp" Version="0.5.1" />

 HtmlTokenizer

sealed class HtmlTokenizer : BaseTokenizer
Performs the tokenization of the source code. Follows the tokenization algorithm at: http://www.w3.org/html/wg/drafts/html/master/syntax.html
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Text;

namespace AngleSharp.Parser.Html {
    /// <summary>
    /// Splits the characters delivered by a <see cref="SourceManager"/> into HTML tokens.
    /// Each call to <see cref="Get"/> produces the next token for the currently selected parse mode.
    /// </summary>
    /// <remarks>
    /// NOTE(review): several case labels and arguments in this class read as an empty char literal
    /// (<c>''</c>). The original characters were presumably non-printable and lost in extraction; the
    /// ErrorCode.NULL / HtmlToken.EOF handling next to them suggests NUL and an end-of-input sentinel.
    /// They are kept exactly as found - confirm against the original source before compiling.
    /// </remarks>
    [DebuggerStepThrough]
    internal sealed class HtmlTokenizer : BaseTokenizer {
        // Backing field of AcceptsCharacterData.
        private bool _acceptsCharacterData;

        // Compared against candidate end-tag names while scanning RCData / Rawtext / script content.
        private string _lastStartTag;

        // Parse mode that decides which state Get() starts scanning in.
        private HtmlParseMode _model;

        // Character data collected during a scan; Get() emits it as one Character token.
        private StringBuilder _buffer;

        // Token held back by Get() while the collected character data is returned first.
        private HtmlToken _buffered;

        /// <summary>Gets or sets the flag that allows "[CDATA[" markup declarations to be read as character data.</summary>
        public bool AcceptsCharacterData {
            get { return _acceptsCharacterData; }
            set { _acceptsCharacterData = value; }
        }

        /// <summary>Gets the underlying source manager.</summary>
        public SourceManager Stream => _src;

        /// <summary>Creates a tokenizer over <paramref name="source"/>, starting in PCData mode.</summary>
        /// <param name="source">The source manager handed to the base tokenizer.</param>
        public HtmlTokenizer(SourceManager source) : base(source) {
            _model = HtmlParseMode.PCData;
            _acceptsCharacterData = false;
            _buffer = new StringBuilder();
        }

        /// <summary>
        /// Returns the next token. When a scan collected character data, that text is returned first as a
        /// Character token and the token found by the scan is delivered by the following call.
        /// </summary>
        /// <returns>The next token, or <c>HtmlToken.EOF</c> once the source has ended.</returns>
        public HtmlToken Get() {
            HtmlToken token = _buffered;

            if (token != null) {
                _buffered = null;
                return token;
            }

            if (_src.IsEnded) {
                return HtmlToken.EOF;
            }

            switch (_model) {
                case HtmlParseMode.PCData:
                    token = Data(_src.Current);
                    break;
                case HtmlParseMode.RCData:
                    token = RCData(_src.Current);
                    break;
                case HtmlParseMode.Plaintext:
                    token = Plaintext(_src.Current);
                    break;
                case HtmlParseMode.Rawtext:
                    token = Rawtext(_src.Current);
                    break;
                case HtmlParseMode.Script:
                    token = ScriptData(_src.Current);
                    break;
            }

            if (_buffer.Length > 0) {
                // Text precedes the token that ended the scan: emit the text now, the token next time.
                _buffered = token;
                token = HtmlToken.Character(_buffer.ToString());
                _buffer.Clear();
            }

            _src.Advance();
            return token;
        }

        /// <summary>Selects the parse mode used by subsequent calls to <see cref="Get"/>.</summary>
        /// <param name="state">The new parse mode.</param>
        public void Switch(HtmlParseMode state) {
            _model = state;
        }

        /// <summary>Collects every remaining character as text until the end-of-input case is hit.</summary>
        /// <param name="c">The current character.</param>
        private HtmlToken Plaintext(char c) {
            while (true) {
                switch (c) {
                    case '':
                        RaiseErrorOccurred(ErrorCode.NULL);
                        _buffer.Append('�');
                        break;
                    case '':
                        return HtmlToken.EOF;
                    default:
                        _buffer.Append(c);
                        break;
                }

                c = _src.Next;
            }
        }

        /// <summary>
        /// Collects ordinary character data: resolves '&amp;' references, hands '&lt;' to TagOpen and
        /// reports (and drops) the character of the ErrorCode.NULL case.
        /// </summary>
        /// <param name="c">The current character.</param>
        private HtmlToken Data(char c) {
            while (true) {
                switch (c) {
                    case '&': {
                        string text = CharacterReference(_src.Next, '');

                        // A null result means "no reference here": keep the ampersand literally.
                        if (text == null) {
                            _buffer.Append('&');
                        } else {
                            _buffer.Append(text);
                        }

                        break;
                    }
                    case '<':
                        return TagOpen(_src.Next);
                    case '':
                        RaiseErrorOccurred(ErrorCode.NULL);
                        // Previously `return Data(_src.Next);` - one stack frame per dropped character.
                        // Leaving the switch reaches the loop's own `c = _src.Next;`, which is equivalent.
                        break;
                    case '':
                        return HtmlToken.EOF;
                    default:
                        _buffer.Append(c);
                        // (this switch section, the loop and the method are closed on the following line)
break; } c = _src.Next; } } // closes the switch, loop and body of Data(char), which begins on the preceding line

        // NOTE(review): the empty char literals ('') below are kept exactly as found; the characters were
        // presumably non-printable (NUL / end-of-input sentinel, judging by the case bodies) - confirm.

        /// <summary>
        /// Collects RCData text: resolves '&amp;' references, hands '&lt;' to RCDataLT and substitutes '�'
        /// in the ErrorCode.NULL case.
        /// </summary>
        /// <param name="c">The current character.</param>
        private HtmlToken RCData(char c) {
            while (true) {
                switch (c) {
                    case '&': {
                        string text = CharacterReference(_src.Next, '');

                        // A null result means "no reference here": keep the ampersand literally.
                        if (text == null) {
                            _buffer.Append('&');
                        } else {
                            _buffer.Append(text);
                        }

                        break;
                    }
                    case '<':
                        return RCDataLT(_src.Next);
                    case '':
                        RaiseErrorOccurred(ErrorCode.NULL);
                        _buffer.Append('�');
                        break;
                    case '':
                        return HtmlToken.EOF;
                    default:
                        _buffer.Append(c);
                        break;
                }

                c = _src.Next;
            }
        }

        /// <summary>Handles the character after '&lt;' in RCData: "/" may start an end tag, anything else is text.</summary>
        /// <param name="c">The character following '&lt;'.</param>
        private HtmlToken RCDataLT(char c) {
            if (c == '/') {
                _stringBuffer.Clear();
                return RCDataEndTag(_src.Next);
            }

            _buffer.Append('<');
            return RCData(c);
        }

        /// <summary>Handles the character after "&lt;/" in RCData; an ASCII letter starts an end-tag name.</summary>
        /// <param name="c">The character following "&lt;/".</param>
        private HtmlToken RCDataEndTag(char c) {
            if (c.IsUppercaseAscii()) {
                _stringBuffer.Clear();
                // Invariant casing: char.ToLower follows the current culture, which maps 'I' to a
                // dotless 'ı' under Turkish cultures and would corrupt the tag name.
                _stringBuffer.Append(char.ToLowerInvariant(c));
                return RCDataNameEndTag(_src.Next, HtmlToken.CloseTag());
            }

            if (c.IsLowercaseAscii()) {
                _stringBuffer.Clear();
                _stringBuffer.Append(c);
                return RCDataNameEndTag(_src.Next, HtmlToken.CloseTag());
            }

            _buffer.Append('<').Append('/');
            return RCData(c);
        }

        /// <summary>
        /// Reads the rest of an end-tag name in RCData. The tag is accepted only when the name equals
        /// <c>_lastStartTag</c> and is followed by whitespace, '/' or '&gt;'; otherwise "&lt;/" plus the
        /// collected name is emitted as text and RCData scanning resumes.
        /// </summary>
        /// <param name="c">The current character.</param>
        /// <param name="tag">The end-tag token being built.</param>
        private HtmlToken RCDataNameEndTag(char c, HtmlTagToken tag) {
            // Iterative on purpose: the previous version called itself once per name character.
            while (true) {
                string name = _stringBuffer.ToString();
                bool matches = name == _lastStartTag;

                if (matches && c.IsSpaceCharacter()) {
                    tag.Name = name;
                    return AttributeBeforeName(_src.Next, tag);
                }

                if (matches && c == '/') {
                    tag.Name = name;
                    return TagSelfClosing(_src.Next, tag);
                }

                if (matches && c == '>') {
                    tag.Name = name;
                    return EmitTag(tag);
                }

                if (c.IsUppercaseAscii()) {
                    _stringBuffer.Append(char.ToLowerInvariant(c));
                } else if (c.IsLowercaseAscii()) {
                    _stringBuffer.Append(c);
                } else {
                    // Not an end tag after all: give the consumed characters back as text.
                    _buffer.Append('<').Append('/');
                    _buffer.Append(name);
                    return RCData(c);
                }

                c = _src.Next;
            }
        }

        /// <summary>Collects Rawtext: hands '&lt;' to RawtextLT and substitutes '�' in the ErrorCode.NULL case.</summary>
        /// <param name="c">The current character.</param>
        private HtmlToken Rawtext(char c) {
            while (true) {
                switch (c) {
                    case '<':
                        return RawtextLT(_src.Next);
                    case '':
                        RaiseErrorOccurred(ErrorCode.NULL);
                        _buffer.Append('�');
                        break;
                    case '':
                        return HtmlToken.EOF;
                    default:
                        _buffer.Append(c);
                        break;
                }

                c = _src.Next;
            }
        }

        /// <summary>Handles the character after '&lt;' in Rawtext: "/" may start an end tag, anything else is text.</summary>
        /// <param name="c">The character following '&lt;'.</param>
        private HtmlToken RawtextLT(char c) {
            if (c == '/') {
                _stringBuffer.Clear();
                // (the operand of this return statement is on the following line)
                return
RawtextEndTag(_src.Next); // completes the return statement of RawtextLT begun on the preceding line
            }

            _buffer.Append('<');
            return Rawtext(c);
        }

        // NOTE(review): the empty char literals ('') below are kept exactly as found; the characters were
        // presumably non-printable (NUL / end-of-input sentinel, judging by their use) - confirm.

        /// <summary>Handles the character after "&lt;/" in Rawtext; an ASCII letter starts an end-tag name.</summary>
        /// <param name="c">The character following "&lt;/".</param>
        private HtmlToken RawtextEndTag(char c) {
            if (c.IsUppercaseAscii()) {
                _stringBuffer.Clear();
                // Invariant casing: char.ToLower follows the current culture (Turkish 'I' -> 'ı').
                _stringBuffer.Append(char.ToLowerInvariant(c));
                return RawtextNameEndTag(_src.Next, HtmlToken.CloseTag());
            }

            if (c.IsLowercaseAscii()) {
                _stringBuffer.Clear();
                _stringBuffer.Append(c);
                return RawtextNameEndTag(_src.Next, HtmlToken.CloseTag());
            }

            _buffer.Append('<').Append('/');
            return Rawtext(c);
        }

        /// <summary>
        /// Reads the rest of an end-tag name in Rawtext. The tag is accepted only when the name equals
        /// <c>_lastStartTag</c> and is followed by whitespace, '/' or '&gt;'; otherwise "&lt;/" plus the
        /// collected name is emitted as text and Rawtext scanning resumes.
        /// </summary>
        /// <param name="c">The current character.</param>
        /// <param name="tag">The end-tag token being built.</param>
        private HtmlToken RawtextNameEndTag(char c, HtmlTagToken tag) {
            // Iterative on purpose: the previous version called itself once per name character.
            while (true) {
                string name = _stringBuffer.ToString();
                bool matches = name == _lastStartTag;

                if (matches && c.IsSpaceCharacter()) {
                    tag.Name = name;
                    return AttributeBeforeName(_src.Next, tag);
                }

                if (matches && c == '/') {
                    tag.Name = name;
                    return TagSelfClosing(_src.Next, tag);
                }

                if (matches && c == '>') {
                    tag.Name = name;
                    return EmitTag(tag);
                }

                if (c.IsUppercaseAscii()) {
                    _stringBuffer.Append(char.ToLowerInvariant(c));
                } else if (c.IsLowercaseAscii()) {
                    _stringBuffer.Append(c);
                } else {
                    // Not an end tag after all: give the consumed characters back as text.
                    _buffer.Append('<').Append('/');
                    _buffer.Append(name);
                    return Rawtext(c);
                }

                c = _src.Next;
            }
        }

        /// <summary>
        /// Collects the content of a CDATA section into one Character token, stopping at "]]&gt;" or at
        /// the character of the first case below (in which case the source is stepped back by one).
        /// </summary>
        /// <param name="c">The first character of the section's content.</param>
        private HtmlToken CData(char c) {
            _stringBuffer.Clear();

            while (true) {
                if (c == '') {
                    _src.Back();
                    break;
                }

                if (c == ']' && _src.ContinuesWith("]]>", true)) {
                    _src.Advance(2);
                    break;
                }

                _stringBuffer.Append(c);
                c = _src.Next;
            }

            return HtmlToken.Character(_stringBuffer.ToString());
        }

        /// <summary>
        /// Tries to resolve a character reference that follows an already consumed '&amp;'.
        /// </summary>
        /// <param name="c">The character following the '&amp;'.</param>
        /// <param name="allowedCharacter">
        /// An extra character that, like whitespace, '&lt;' and '&amp;', means "not a reference". A non-zero
        /// value also enables the special handling of '=' / alphanumerics after an unterminated name.
        /// </param>
        /// <returns>The replacement text, or <c>null</c> when the '&amp;' does not start a reference.</returns>
        private string CharacterReference(char c, char allowedCharacter = '') {
            if (c.IsSpaceCharacter() || c == '<' || c == '' || c == '&' || c == allowedCharacter) {
                _src.Back();
                return null;
            }

            if (c == '#') {
                int radix = 10;
                int digitCount = 0;
                // Accumulated in a long and clamped to int.MaxValue. The previous int arithmetic wrapped
                // around silently, so an absurdly long number could alias an unrelated code point.
                long value = 0;
                c = _src.Next;
                bool isHex = c == 'x' || c == 'X';

                if (isHex) {
                    radix = 16;

                    while ((c = _src.Next).IsHex()) {
                        value = Math.Min(value * radix + c.FromHex(), int.MaxValue);
                        digitCount++;
                    }
                } else {
                    while (c.IsDigit()) {
                        value = Math.Min(value * radix + c.FromHex(), int.MaxValue);
                        digitCount++;
                        c = _src.Next;
                    }
                }

                if (digitCount == 0) {
                    // Rewind over "#" (and the "x") so the caller sees the text unchanged.
                    _src.Back(2);

                    if (isHex) {
                        _src.Back();
                    }

                    RaiseErrorOccurred(ErrorCode.CharacterReferenceWrongNumber);
                    return null;
                }

                int code = (int)value;

                if (c != ';') {
                    RaiseErrorOccurred(ErrorCode.CharacterReferenceSemicolonMissing);
                    _src.Back();
                }

                if (Entities.IsInCharacterTable(code)) {
                    RaiseErrorOccurred(ErrorCode.CharacterReferenceInvalidCode);
                    return Entities.GetSymbolFromTable(code);
                }

                if (Entities.IsInvalidNumber(code)) {
                    RaiseErrorOccurred(ErrorCode.CharacterReferenceInvalidNumber);
                    return '�'.ToString();
                }

                if (Entities.IsInInvalidRange(code)) {
                    RaiseErrorOccurred(ErrorCode.CharacterReferenceInvalidRange);
                }

                return Entities.Convert(code);
            }

            string result = string.Empty;
            // Characters read since the last successful lookup; they are handed back afterwards.
            int unmatched = 0;
            int insertionPoint = _src.InsertionPoint - 1;
            // NOTE(review): 31 presumably equals the longest known reference name - confirm.
            char[] name = new char[31];
            int length = 0;
            char chr = _src.Current;

            // The length bound is the fix: without it a run of 32+ name characters after '&' indexed
            // past the end of the array and threw IndexOutOfRangeException.
            while (length < name.Length && chr != ';' && chr.IsName()) {
                name[length++] = chr;
                string candidate = new string(name, 0, length);
                chr = _src.Next;
                unmatched++;
                candidate = (chr == ';') ? Entities.GetSymbol(candidate) : Entities.GetSymbolWithoutSemicolon(candidate);

                if (candidate != null) {
                    unmatched = 0;
                    result = candidate;
                }

                if (_src.IsEnded) {
                    break;
                }
            }

            _src.Back(unmatched);
            chr = _src.Current;

            if (chr != ';') {
                if (allowedCharacter != 0 && (chr == '=' || chr.IsAlphanumericAscii())) {
                    if (chr == '=') {
                        RaiseErrorOccurred(ErrorCode.CharacterReferenceAttributeEqualsFound);
                    }

                    _src.InsertionPoint = insertionPoint;
                    return null;
                }

                _src.Back();
                RaiseErrorOccurred(ErrorCode.CharacterReferenceNotTerminated);
            }

            return result;
        }

        /// <summary>
        /// Handles the character after '&lt;' in ordinary data: '!' starts a markup declaration, '/' an
        /// end tag, an ASCII letter a start tag, '?' a bogus comment; anything else is text.
        /// </summary>
        /// <param name="c">The character following '&lt;'.</param>
        private HtmlToken TagOpen(char c) {
            switch (c) {
                case '!':
                    return MarkupDeclaration(_src.Next);
                case '/':
                    return TagEnd(_src.Next);
                default:
                    if (c.IsUppercaseAscii()) {
                        _stringBuffer.Clear();
                        // Invariant casing: char.ToLower follows the current culture (Turkish 'I' -> 'ı').
                        _stringBuffer.Append(char.ToLowerInvariant(c));
                        return TagName(_src.Next, HtmlToken.OpenTag());
                    }

                    if (c.IsLowercaseAscii()) {
                        _stringBuffer.Clear();
                        _stringBuffer.Append(c);
                        return TagName(_src.Next, HtmlToken.OpenTag());
                    }

                    if (c == '?') {
                        RaiseErrorOccurred(ErrorCode.BogusComment);
                        return BogusComment(c);
                    }

                    // (the right-hand side of this assignment is on the following line)
                    _model =
HtmlParseMode.PCData; // completes the `_model =` assignment of TagOpen begun on the preceding line
                    RaiseErrorOccurred(ErrorCode.AmbiguousOpenTag);
                    _buffer.Append('<');
                    return Data(c);
            }
        }

        // NOTE(review): the empty char literals ('') below are kept exactly as found; the characters were
        // presumably non-printable (NUL / end-of-input sentinel, judging by the case bodies) - confirm.

        /// <summary>
        /// Handles the character after "&lt;/": an ASCII letter starts an end-tag name, '&gt;' is reported
        /// and skipped, and any other character starts a bogus comment.
        /// </summary>
        /// <param name="c">The character following "&lt;/".</param>
        private HtmlToken TagEnd(char c) {
            if (c.IsUppercaseAscii()) {
                _stringBuffer.Clear();
                // Invariant casing: char.ToLower follows the current culture, which maps 'I' to a
                // dotless 'ı' under Turkish cultures and would corrupt the tag name.
                _stringBuffer.Append(char.ToLowerInvariant(c));
                return TagName(_src.Next, HtmlToken.CloseTag());
            }

            if (!c.IsLowercaseAscii()) {
                switch (c) {
                    case '>':
                        _model = HtmlParseMode.PCData;
                        RaiseErrorOccurred(ErrorCode.TagClosedWrong);
                        return Data(_src.Next);
                    case '':
                        _src.Back();
                        RaiseErrorOccurred(ErrorCode.EOF);
                        _buffer.Append('<').Append('/');
                        return HtmlToken.EOF;
                    default:
                        RaiseErrorOccurred(ErrorCode.BogusComment);
                        return BogusComment(c);
                }
            }

            _stringBuffer.Clear();
            _stringBuffer.Append(c);
            return TagName(_src.Next, HtmlToken.CloseTag());
        }

        /// <summary>
        /// Reads a tag name (lower-casing ASCII capitals) until whitespace, '/' or '&gt;' and continues
        /// with the attributes, the self-closing check or the emission of the tag respectively.
        /// </summary>
        /// <param name="c">The current character.</param>
        /// <param name="tag">The tag token being built.</param>
        private HtmlToken TagName(char c, HtmlTagToken tag) {
            while (!c.IsSpaceCharacter()) {
                switch (c) {
                    case '/':
                        tag.Name = _stringBuffer.ToString();
                        return TagSelfClosing(_src.Next, tag);
                    case '>':
                        tag.Name = _stringBuffer.ToString();
                        return EmitTag(tag);
                    case '':
                        RaiseErrorOccurred(ErrorCode.NULL);
                        _stringBuffer.Append('�');
                        break;
                    case '':
                        RaiseErrorOccurred(ErrorCode.EOF);
                        return HtmlToken.EOF;
                    default:
                        if (c.IsUppercaseAscii()) {
                            _stringBuffer.Append(char.ToLowerInvariant(c));
                        } else {
                            _stringBuffer.Append(c);
                        }

                        break;
                }

                c = _src.Next;
            }

            tag.Name = _stringBuffer.ToString();
            return AttributeBeforeName(_src.Next, tag);
        }

        /// <summary>Handles the character after '/' inside a tag: '&gt;' completes a self-closing tag.</summary>
        /// <param name="c">The character following '/'.</param>
        /// <param name="tag">The tag token being built.</param>
        private HtmlToken TagSelfClosing(char c, HtmlTagToken tag) {
            switch (c) {
                case '>':
                    tag.IsSelfClosing = true;
                    return EmitTag(tag);
                case '':
                    RaiseErrorOccurred(ErrorCode.EOF);
                    return HtmlToken.EOF;
                default:
                    RaiseErrorOccurred(ErrorCode.ClosingSlashMisplaced);
                    return AttributeBeforeName(c, tag);
            }
        }

        /// <summary>
        /// Handles the text after "&lt;!": "--" starts a comment, "DOCTYPE" a doctype and, when
        /// <c>_acceptsCharacterData</c> is set, "[CDATA[" a CDATA section; anything else is a bogus comment.
        /// </summary>
        /// <param name="c">The character following "&lt;!".</param>
        private HtmlToken MarkupDeclaration(char c) {
            if (_src.ContinuesWith("--", true)) {
                _src.Advance();
                return CommentStart(_src.Next);
            }

            if (_src.ContinuesWith("DOCTYPE", true)) {
                _src.Advance(6);
                return Doctype(_src.Next);
            }

            if (_acceptsCharacterData && _src.ContinuesWith("[CDATA[", false)) {
                _src.Advance(6);
                return CData(_src.Next);
            }

            // (the fallback statements and the closing brace of this method are on the following line)
RaiseErrorOccurred(ErrorCode.UndefinedMarkupDeclaration); // fallback of MarkupDeclaration, begun on the preceding line
            return BogusComment(c);
        }

        // NOTE(review): the empty char literals ('') below are kept exactly as found; the characters were
        // presumably non-printable (NUL / end-of-input sentinel, judging by the case bodies) - confirm.
        //
        // Convention used by the comment states: CommentDashEnd, CommentEnd and CommentBangEnd return
        // null to mean "no token yet - keep reading comment content from the next character".

        /// <summary>Collects everything up to the next '&gt;' (or the first case below) into a comment token.</summary>
        /// <param name="c">The first character of the bogus comment.</param>
        private HtmlToken BogusComment(char c) {
            _stringBuffer.Clear();

            while (true) {
                switch (c) {
                    case '':
                        _src.Back();
                        goto case '>';
                    case '':
                        _stringBuffer.Append('�');
                        break;
                    default:
                        _stringBuffer.Append(c);
                        break;
                    case '>':
                        _model = HtmlParseMode.PCData;
                        return HtmlToken.Comment(_stringBuffer.ToString());
                }

                c = _src.Next;
            }
        }

        /// <summary>Handles the first character after "&lt;!--".</summary>
        /// <param name="c">The character following "&lt;!--".</param>
        private HtmlCommentToken CommentStart(char c) {
            _stringBuffer.Clear();

            switch (c) {
                case '-':
                    return CommentDashStart(_src.Next);
                case '':
                    RaiseErrorOccurred(ErrorCode.NULL);
                    _stringBuffer.Append('�');
                    return Comment(_src.Next);
                case '>':
                    _model = HtmlParseMode.PCData;
                    RaiseErrorOccurred(ErrorCode.TagClosedWrong);
                    return HtmlToken.Comment(_stringBuffer.ToString());
                case '':
                    RaiseErrorOccurred(ErrorCode.EOF);
                    _src.Back();
                    return HtmlToken.Comment(_stringBuffer.ToString());
                default:
                    _stringBuffer.Append(c);
                    return Comment(_src.Next);
            }
        }

        /// <summary>Handles the character after "&lt;!---" (a comment whose content starts with a dash).</summary>
        /// <param name="c">The character following the dash.</param>
        private HtmlCommentToken CommentDashStart(char c) {
            switch (c) {
                case '-':
                    // CommentEnd returns null when the comment is not finished. That null used to be
                    // returned to the caller, so input such as "<!----x-->" produced a null token.
                    // Continue with the comment content instead, as the loop in Comment does.
                    return CommentEnd(_src.Next) ?? Comment(_src.Next);
                case '':
                    RaiseErrorOccurred(ErrorCode.NULL);
                    _stringBuffer.Append('-');
                    _stringBuffer.Append('�');
                    return Comment(_src.Next);
                case '>':
                    _model = HtmlParseMode.PCData;
                    RaiseErrorOccurred(ErrorCode.TagClosedWrong);
                    return HtmlToken.Comment(_stringBuffer.ToString());
                case '':
                    RaiseErrorOccurred(ErrorCode.EOF);
                    _src.Back();
                    return HtmlToken.Comment(_stringBuffer.ToString());
                default:
                    _stringBuffer.Append('-');
                    _stringBuffer.Append(c);
                    return Comment(_src.Next);
            }
        }

        /// <summary>Collects comment content until one of the dash states produces the comment token.</summary>
        /// <param name="c">The current character.</param>
        private HtmlCommentToken Comment(char c) {
            while (true) {
                switch (c) {
                    case '-': {
                        HtmlCommentToken comment = CommentDashEnd(_src.Next);

                        if (comment != null) {
                            return comment;
                        }

                        break;
                    }
                    case '':
                        RaiseErrorOccurred(ErrorCode.EOF);
                        _src.Back();
                        return HtmlToken.Comment(_stringBuffer.ToString());
                    case '':
                        RaiseErrorOccurred(ErrorCode.NULL);
                        c = '�';
                        _stringBuffer.Append(c);
                        break;
                    default:
                        _stringBuffer.Append(c);
                        break;
                }

                c = _src.Next;
            }
        }

        /// <summary>Handles the character after a single '-' inside a comment.</summary>
        /// <param name="c">The character following the dash.</param>
        /// <returns>The finished comment, or <c>null</c> when the dash was ordinary content.</returns>
        private HtmlCommentToken CommentDashEnd(char c) {
            switch (c) {
                case '-':
                    return CommentEnd(_src.Next);
                case '':
                    RaiseErrorOccurred(ErrorCode.EOF);
                    _src.Back();
                    return HtmlToken.Comment(_stringBuffer.ToString());
                case '':
                    RaiseErrorOccurred(ErrorCode.NULL);
                    c = '�';
                    break;
            }

            _stringBuffer.Append('-');
            _stringBuffer.Append(c);
            return null;
        }

        /// <summary>Handles the characters after "--" inside a comment.</summary>
        /// <param name="c">The character following "--".</param>
        /// <returns>The finished comment, or <c>null</c> when the dashes were ordinary content.</returns>
        private HtmlCommentToken CommentEnd(char c) {
            while (true) {
                switch (c) {
                    case '>':
                        _model = HtmlParseMode.PCData;
                        return HtmlToken.Comment(_stringBuffer.ToString());
                    case '':
                        RaiseErrorOccurred(ErrorCode.NULL);
                        // Both pending dashes belong to the content, exactly as in the default branch
                        // below; the previous code appended only one of them here.
                        _stringBuffer.Append('-');
                        _stringBuffer.Append('-');
                        _stringBuffer.Append('�');
                        return null;
                    case '!':
                        RaiseErrorOccurred(ErrorCode.CommentEndedWithEM);
                        return CommentBangEnd(_src.Next);
                    case '-':
                        break;
                    case '':
                        RaiseErrorOccurred(ErrorCode.EOF);
                        _src.Back();
                        return HtmlToken.Comment(_stringBuffer.ToString());
                    default:
                        RaiseErrorOccurred(ErrorCode.CommentEndedUnexpected);
                        _stringBuffer.Append('-');
                        _stringBuffer.Append('-');
                        _stringBuffer.Append(c);
                        return null;
                }

                // A further dash: one dash becomes content, two dashes remain pending.
                RaiseErrorOccurred(ErrorCode.CommentEndedWithDash);
                _stringBuffer.Append('-');
                c = _src.Next;
            }
        }

        /// <summary>Handles the character after "--!" inside a comment.</summary>
        /// <param name="c">The character following "--!".</param>
        /// <returns>The finished comment, or <c>null</c> when "--!" was ordinary content.</returns>
        private HtmlCommentToken CommentBangEnd(char c) {
            switch (c) {
                case '-':
                    _stringBuffer.Append('-');
                    _stringBuffer.Append('-');
                    _stringBuffer.Append('!');
                    return CommentDashEnd(_src.Next);
                case '>':
                    _model = HtmlParseMode.PCData;
                    return HtmlToken.Comment(_stringBuffer.ToString());
                case '':
                    RaiseErrorOccurred(ErrorCode.NULL);
                    _stringBuffer.Append('-');
                    _stringBuffer.Append('-');
                    _stringBuffer.Append('!');
                    _stringBuffer.Append('�');
                    return null;
                case '':
                    RaiseErrorOccurred(ErrorCode.EOF);
                    _src.Back();
                    return HtmlToken.Comment(_stringBuffer.ToString());
                default:
                    _stringBuffer.Append('-');
                    _stringBuffer.Append('-');
                    _stringBuffer.Append('!');
                    _stringBuffer.Append(c);
                    return null;
            }
        }

        /// <summary>Handles the character after "&lt;!DOCTYPE".</summary>
        /// <param name="c">The character following the keyword.</param>
        private HtmlToken Doctype(char c) {
            if (c.IsSpaceCharacter()) {
                return DoctypeNameBefore(_src.Next);
            }

            if (c == '') {
                RaiseErrorOccurred(ErrorCode.EOF);
                _src.Back();
                return HtmlToken.Doctype(true);
            }

            // (the fallback statements and the closing brace of this method are on the following line)
RaiseErrorOccurred(ErrorCode.DoctypeUnexpected); // fallback of Doctype(char), begun on the preceding line
            return DoctypeNameBefore(c);
        }

        // NOTE(review): the empty char literals ('') below are kept exactly as found; the characters were
        // presumably non-printable (NUL / end-of-input sentinel, judging by the case bodies) - confirm.

        /// <summary>Skips whitespace and starts the doctype name with the first other character.</summary>
        /// <param name="c">The current character.</param>
        private HtmlToken DoctypeNameBefore(char c) {
            while (c.IsSpaceCharacter()) {
                c = _src.Next;
            }

            if (!c.IsUppercaseAscii()) {
                switch (c) {
                    case '':
                        RaiseErrorOccurred(ErrorCode.NULL);
                        _stringBuffer.Clear();
                        _stringBuffer.Append('�');
                        return DoctypeName(_src.Next, HtmlToken.Doctype(false));
                    case '>':
                        _model = HtmlParseMode.PCData;
                        RaiseErrorOccurred(ErrorCode.TagClosedWrong);
                        return HtmlToken.Doctype(true);
                    case '':
                        RaiseErrorOccurred(ErrorCode.EOF);
                        _src.Back();
                        return HtmlToken.Doctype(true);
                    default:
                        _stringBuffer.Clear();
                        _stringBuffer.Append(c);
                        return DoctypeName(_src.Next, HtmlToken.Doctype(false));
                }
            }

            _stringBuffer.Clear();
            // Invariant casing: char.ToLower follows the current culture, which maps 'I' to a dotless
            // 'ı' under Turkish cultures and would corrupt the doctype name.
            _stringBuffer.Append(char.ToLowerInvariant(c));
            return DoctypeName(_src.Next, HtmlToken.Doctype(false));
        }

        /// <summary>Reads the doctype name (lower-casing ASCII capitals) until whitespace or '&gt;'.</summary>
        /// <param name="c">The current character.</param>
        /// <param name="doctype">The doctype token being built.</param>
        private HtmlToken DoctypeName(char c, HtmlDoctypeToken doctype) {
            while (true) {
                if (c.IsSpaceCharacter()) {
                    doctype.Name = _stringBuffer.ToString();
                    _stringBuffer.Clear();
                    return DoctypeNameAfter(_src.Next, doctype);
                }

                if (c == '>') {
                    break;
                }

                if (c.IsUppercaseAscii()) {
                    _stringBuffer.Append(char.ToLowerInvariant(c));
                } else {
                    switch (c) {
                        case '':
                            RaiseErrorOccurred(ErrorCode.NULL);
                            _stringBuffer.Append('�');
                            break;
                        case '':
                            RaiseErrorOccurred(ErrorCode.EOF);
                            _src.Back();
                            doctype.IsQuirksForced = true;
                            doctype.Name = _stringBuffer.ToString();
                            return doctype;
                        default:
                            _stringBuffer.Append(c);
                            break;
                    }
                }

                c = _src.Next;
            }

            _model = HtmlParseMode.PCData;
            doctype.Name = _stringBuffer.ToString();
            return doctype;
        }

        /// <summary>
        /// Skips whitespace after the doctype name and looks for '&gt;' or the "public" / "system" keywords;
        /// anything else forces quirks mode and is skipped as a bogus doctype.
        /// </summary>
        /// <param name="c">The current character.</param>
        /// <param name="doctype">The doctype token being built.</param>
        private HtmlToken DoctypeNameAfter(char c, HtmlDoctypeToken doctype) {
            while (c.IsSpaceCharacter()) {
                c = _src.Next;
            }

            switch (c) {
                case '>':
                    _model = HtmlParseMode.PCData;
                    return doctype;
                case '':
                    RaiseErrorOccurred(ErrorCode.EOF);
                    _src.Back();
                    doctype.IsQuirksForced = true;
                    return doctype;
                default:
                    if (_src.ContinuesWith("public", true)) {
                        _src.Advance(5);
                        return DoctypePublic(_src.Next, doctype);
                    }

                    if (_src.ContinuesWith("system", true)) {
                        _src.Advance(5);
                        // (the operand of this return statement is on the following line)
                        return
DoctypeSystem(_src.Next, doctype); // completes the return statement of DoctypeNameAfter begun on the preceding line
                    }
                    RaiseErrorOccurred(ErrorCode.DoctypeUnexpectedAfterName);
                    doctype.IsQuirksForced = true;
                    return BogusDoctype(_src.Next, doctype);
            }
        }

        // NOTE(review): the empty char literals ('') below are kept exactly as found; the characters were
        // presumably non-printable (NUL / end-of-input sentinel, judging by the case bodies) - confirm.

        /// <summary>
        /// Handles the character after the "public" keyword. Whitespace leads to the identifier; a quote
        /// directly after the keyword is reported but still starts the identifier.
        /// </summary>
        /// <param name="c">The character following the keyword.</param>
        /// <param name="doctype">The doctype token being built.</param>
        private HtmlToken DoctypePublic(char c, HtmlDoctypeToken doctype) {
            if (!c.IsSpaceCharacter()) {
                switch (c) {
                    case '"':
                        RaiseErrorOccurred(ErrorCode.DoubleQuotationMarkUnexpected);
                        doctype.PublicIdentifier = string.Empty;
                        return DoctypePublicIdentifierDoubleQuoted(_src.Next, doctype);
                    case '\'':
                        RaiseErrorOccurred(ErrorCode.SingleQuotationMarkUnexpected);
                        doctype.PublicIdentifier = string.Empty;
                        return DoctypePublicIdentifierSingleQuoted(_src.Next, doctype);
                    case '>':
                        _model = HtmlParseMode.PCData;
                        RaiseErrorOccurred(ErrorCode.TagClosedWrong);
                        doctype.IsQuirksForced = true;
                        return doctype;
                    case '':
                        RaiseErrorOccurred(ErrorCode.EOF);
                        doctype.IsQuirksForced = true;
                        _src.Back();
                        return doctype;
                    default:
                        RaiseErrorOccurred(ErrorCode.DoctypePublicInvalid);
                        doctype.IsQuirksForced = true;
                        return BogusDoctype(_src.Next, doctype);
                }
            }
            return DoctypePublicIdentifierBefore(_src.Next, doctype);
        }

        /// <summary>Skips whitespace and expects the opening quote of the public identifier.</summary>
        /// <param name="c">The current character.</param>
        /// <param name="doctype">The doctype token being built.</param>
        private HtmlToken DoctypePublicIdentifierBefore(char c, HtmlDoctypeToken doctype) {
            while (c.IsSpaceCharacter()) {
                c = _src.Next;
            }
            switch (c) {
                case '"':
                    _stringBuffer.Clear();
                    doctype.PublicIdentifier = string.Empty;
                    return DoctypePublicIdentifierDoubleQuoted(_src.Next, doctype);
                case '\'':
                    _stringBuffer.Clear();
                    doctype.PublicIdentifier = string.Empty;
                    return DoctypePublicIdentifierSingleQuoted(_src.Next, doctype);
                case '>':
                    _model = HtmlParseMode.PCData;
                    RaiseErrorOccurred(ErrorCode.TagClosedWrong);
                    doctype.IsQuirksForced = true;
                    return doctype;
                case '':
                    RaiseErrorOccurred(ErrorCode.EOF);
                    doctype.IsQuirksForced = true;
                    _src.Back();
                    return doctype;
                default:
                    RaiseErrorOccurred(ErrorCode.DoctypePublicInvalid);
                    doctype.IsQuirksForced = true;
                    return BogusDoctype(_src.Next, doctype);
            }
        }

        /// <summary>Reads a double-quoted public identifier up to the closing quote.</summary>
        /// <param name="c">The current character.</param>
        /// <param name="doctype">The doctype token being built.</param>
        private HtmlToken DoctypePublicIdentifierDoubleQuoted(char c, HtmlDoctypeToken doctype) {
            while (true) {
                switch (c) {
                    case '"':
                        doctype.PublicIdentifier = _stringBuffer.ToString();
                        _stringBuffer.Clear();
                        return DoctypePublicIdentifierAfter(_src.Next, doctype);
                    case '':
                        RaiseErrorOccurred(ErrorCode.NULL);
                        _stringBuffer.Append('�');
                        break;
                    case '>':
                        // The tag ended inside the identifier: keep what was read and force quirks mode.
                        _model = HtmlParseMode.PCData;
                        RaiseErrorOccurred(ErrorCode.TagClosedWrong);
                        doctype.IsQuirksForced = true;
                        doctype.PublicIdentifier = _stringBuffer.ToString();
                        return doctype;
                    case '':
                        RaiseErrorOccurred(ErrorCode.EOF);
                        _src.Back();
                        doctype.IsQuirksForced = true;
                        doctype.PublicIdentifier = _stringBuffer.ToString();
                        return doctype;
                    default:
                        _stringBuffer.Append(c);
                        break;
                }
                c = _src.Next;
            }
        }

        /// <summary>Reads a single-quoted public identifier up to the closing quote.</summary>
        /// <param name="c">The current character.</param>
        /// <param name="doctype">The doctype token being built.</param>
        private HtmlToken DoctypePublicIdentifierSingleQuoted(char c, HtmlDoctypeToken doctype) {
            while (true) {
                switch (c) {
                    case '\'':
                        doctype.PublicIdentifier = _stringBuffer.ToString();
                        _stringBuffer.Clear();
                        return DoctypePublicIdentifierAfter(_src.Next, doctype);
                    case '':
                        RaiseErrorOccurred(ErrorCode.NULL);
                        _stringBuffer.Append('�');
                        break;
                    case '>':
                        // The tag ended inside the identifier: keep what was read and force quirks mode.
                        _model = HtmlParseMode.PCData;
                        RaiseErrorOccurred(ErrorCode.TagClosedWrong);
                        doctype.IsQuirksForced = true;
                        doctype.PublicIdentifier = _stringBuffer.ToString();
                        return doctype;
                    case '':
                        RaiseErrorOccurred(ErrorCode.EOF);
                        doctype.IsQuirksForced = true;
                        doctype.PublicIdentifier = _stringBuffer.ToString();
                        _src.Back();
                        return doctype;
                    default:
                        _stringBuffer.Append(c);
                        break;
                }
                c = _src.Next;
            }
        }

        /// <summary>
        /// Handles the character after the closing quote of the public identifier. A quote directly
        /// after it is reported but still starts the system identifier.
        /// </summary>
        /// <param name="c">The character following the closing quote.</param>
        /// <param name="doctype">The doctype token being built.</param>
        private HtmlToken DoctypePublicIdentifierAfter(char c, HtmlDoctypeToken doctype) {
            if (!c.IsSpaceCharacter()) {
                switch (c) {
                    case '>':
                        _model = HtmlParseMode.PCData;
                        return doctype;
                    case '"':
                        RaiseErrorOccurred(ErrorCode.DoubleQuotationMarkUnexpected);
                        doctype.SystemIdentifier = string.Empty;
                        return DoctypeSystemIdentifierDoubleQuoted(_src.Next, doctype);
                    case '\'':
                        RaiseErrorOccurred(ErrorCode.SingleQuotationMarkUnexpected);
                        doctype.SystemIdentifier = string.Empty;
                        return DoctypeSystemIdentifierSingleQuoted(_src.Next, doctype);
                    case '':
                        RaiseErrorOccurred(ErrorCode.EOF);
                        doctype.IsQuirksForced = true;
                        _src.Back();
                        return doctype;
                    default:
                        RaiseErrorOccurred(ErrorCode.DoctypeInvalidCharacter);
                        doctype.IsQuirksForced = true;
                        return BogusDoctype(_src.Next, doctype);
                }
            }
            _stringBuffer.Clear();
            return DoctypeBetween(_src.Next, doctype);
        }

        /// <summary>Skips whitespace between the public and the system identifier.</summary>
        /// <param name="c">The current character.</param>
        /// <param name="doctype">The doctype token being built.</param>
        private HtmlToken DoctypeBetween(char c, HtmlDoctypeToken doctype) {
            while (c.IsSpaceCharacter()) {
                c = _src.Next;
            }
            switch (c) {
                case '>':
                    _model = HtmlParseMode.PCData;
                    return doctype;
                case '"':
                    doctype.SystemIdentifier = string.Empty;
                    return DoctypeSystemIdentifierDoubleQuoted(_src.Next, doctype);
                case '\'':
                    doctype.SystemIdentifier = string.Empty;
                    return DoctypeSystemIdentifierSingleQuoted(_src.Next, doctype);
                case '':
                    RaiseErrorOccurred(ErrorCode.EOF);
                    doctype.IsQuirksForced = true;
                    _src.Back();
                    return doctype;
                default:
                    RaiseErrorOccurred(ErrorCode.DoctypeInvalidCharacter);
                    doctype.IsQuirksForced = true;
                    return BogusDoctype(_src.Next, doctype);
            }
        }

        /// <summary>
        /// Handles the character after the "system" keyword. Whitespace leads to the identifier; a quote
        /// directly after the keyword is reported but still starts the identifier.
        /// </summary>
        /// <param name="c">The character following the keyword.</param>
        /// <param name="doctype">The doctype token being built.</param>
        private HtmlToken DoctypeSystem(char c, HtmlDoctypeToken doctype) {
            if (!c.IsSpaceCharacter()) {
                switch (c) {
                    case '"':
                        RaiseErrorOccurred(ErrorCode.DoubleQuotationMarkUnexpected);
                        doctype.SystemIdentifier = string.Empty;
                        return DoctypeSystemIdentifierDoubleQuoted(_src.Next, doctype);
                    case '\'':
                        RaiseErrorOccurred(ErrorCode.SingleQuotationMarkUnexpected);
                        doctype.SystemIdentifier = string.Empty;
                        return DoctypeSystemIdentifierSingleQuoted(_src.Next, doctype);
                    case '>':
                        // NOTE(review): unlike the other '>' branches in the doctype states, this one
                        // does not assign _model; the assignment sits on the whitespace path at the end
                        // of this method instead. Looks misplaced - confirm against the original source.
                        RaiseErrorOccurred(ErrorCode.TagClosedWrong);
                        doctype.SystemIdentifier = _stringBuffer.ToString();
                        doctype.IsQuirksForced = true;
                        return doctype;
                    case '':
                        RaiseErrorOccurred(ErrorCode.EOF);
                        doctype.IsQuirksForced = true;
                        _src.Back();
                        return doctype;
                    default:
                        RaiseErrorOccurred(ErrorCode.DoctypeSystemInvalid);
                        doctype.IsQuirksForced = true;
                        return BogusDoctype(_src.Next, doctype);
                }
            }
            _model = HtmlParseMode.PCData;
            return DoctypeSystemIdentifierBefore(_src.Next, doctype);
        }

        /// <summary>Skips whitespace and expects the opening quote of the system identifier.</summary>
        /// <param name="c">The current character.</param>
        /// <param name="doctype">The doctype token being built.</param>
        private HtmlToken DoctypeSystemIdentifierBefore(char c, HtmlDoctypeToken doctype) {
            while (c.IsSpaceCharacter()) {
                c = _src.Next;
            }
            switch (c) {
                case '"':
                    doctype.SystemIdentifier = string.Empty;
                    return DoctypeSystemIdentifierDoubleQuoted(_src.Next, doctype);
                case '\'':
                    doctype.SystemIdentifier = string.Empty;
                    return DoctypeSystemIdentifierSingleQuoted(_src.Next, doctype);
                case '>':
                    _model = HtmlParseMode.PCData;
                    RaiseErrorOccurred(ErrorCode.TagClosedWrong);
                    doctype.IsQuirksForced = true;
                    doctype.SystemIdentifier = _stringBuffer.ToString();
                    return doctype;
                case '':
                    RaiseErrorOccurred(ErrorCode.EOF);
                    doctype.IsQuirksForced = true;
                    doctype.SystemIdentifier = _stringBuffer.ToString();
                    _src.Back();
                    return doctype;
                default:
                    RaiseErrorOccurred(ErrorCode.DoctypeInvalidCharacter);
                    doctype.IsQuirksForced = true;
                    return BogusDoctype(_src.Next, doctype);
            }
        }

        /// <summary>Reads a double-quoted system identifier up to the closing quote.</summary>
        /// <param name="c">The current character.</param>
        /// <param name="doctype">The doctype token being built.</param>
        private HtmlToken DoctypeSystemIdentifierDoubleQuoted(char c, HtmlDoctypeToken doctype) {
            while (true) {
                switch (c) {
                    case '"':
                        doctype.SystemIdentifier = _stringBuffer.ToString();
                        _stringBuffer.Clear();
                        return DoctypeSystemIdentifierAfter(_src.Next, doctype);
                    case '':
                        RaiseErrorOccurred(ErrorCode.NULL);
                        _stringBuffer.Append('�');
                        break;
                    case '>':
                        // The tag ended inside the identifier: keep what was read and force quirks mode.
                        _model = HtmlParseMode.PCData;
                        RaiseErrorOccurred(ErrorCode.TagClosedWrong);
                        doctype.IsQuirksForced = true;
                        doctype.SystemIdentifier = _stringBuffer.ToString();
                        return doctype;
                    case '':
                        RaiseErrorOccurred(ErrorCode.EOF);
                        doctype.IsQuirksForced = true;
                        doctype.SystemIdentifier = _stringBuffer.ToString();
                        _src.Back();
                        return doctype;
                    default:
                        _stringBuffer.Append(c);
                        break;
                }
                c = _src.Next;
            }
        }

        /// <summary>Reads a single-quoted system identifier up to the closing quote.</summary>
        /// <param name="c">The current character.</param>
        /// <param name="doctype">The doctype token being built.</param>
        private HtmlToken DoctypeSystemIdentifierSingleQuoted(char c, HtmlDoctypeToken doctype) {
            while (true) {
                switch (c) {
                    case '\'':
                        doctype.SystemIdentifier = _stringBuffer.ToString();
                        _stringBuffer.Clear();
                        return DoctypeSystemIdentifierAfter(_src.Next, doctype);
                    case '':
                        RaiseErrorOccurred(ErrorCode.NULL);
                        _stringBuffer.Append('�');
                        break;
                    case '>':
                        // The tag ended inside the identifier: keep what was read and force quirks mode.
                        _model = HtmlParseMode.PCData;
                        RaiseErrorOccurred(ErrorCode.TagClosedWrong);
                        doctype.IsQuirksForced = true;
                        doctype.SystemIdentifier = _stringBuffer.ToString();
                        return doctype;
                    // (the statements of the following case, the rest of the switch and the end of this
                    // method are on the following line)
                    case '':
RaiseErrorOccurred(ErrorCode.EOF); doctype.IsQuirksForced = true; doctype.SystemIdentifier = _stringBuffer.ToString(); _src.Back(); return doctype; default: _stringBuffer.Append(c); break; } c = _src.Next; } } // tail of DoctypeSystemIdentifierSingleQuoted, which begins on the preceding line

        // NOTE(review): the empty char literals ('') below are kept exactly as found, in their original
        // order; the characters were presumably non-printable (NUL / end-of-input sentinel, judging by
        // the case bodies) - confirm.

        /// <summary>Skips whitespace after the system identifier and expects '&gt;'.</summary>
        /// <param name="c">The current character.</param>
        /// <param name="doctype">The doctype token being built.</param>
        private HtmlToken DoctypeSystemIdentifierAfter(char c, HtmlDoctypeToken doctype) {
            while (c.IsSpaceCharacter()) {
                c = _src.Next;
            }

            switch (c) {
                case '>':
                    _model = HtmlParseMode.PCData;
                    return doctype;
                case '':
                    RaiseErrorOccurred(ErrorCode.EOF);
                    doctype.IsQuirksForced = true;
                    _src.Back();
                    return doctype;
                default:
                    RaiseErrorOccurred(ErrorCode.DoctypeInvalidCharacter);
                    return BogusDoctype(_src.Next, doctype);
            }
        }

        /// <summary>Skips the rest of a malformed doctype up to '&gt;' (or the first case below).</summary>
        /// <param name="c">The current character.</param>
        /// <param name="doctype">The doctype token to return.</param>
        private HtmlToken BogusDoctype(char c, HtmlDoctypeToken doctype) {
            while (true) {
                switch (c) {
                    case '':
                        _src.Back();
                        return doctype;
                    case '>':
                        _model = HtmlParseMode.PCData;
                        return doctype;
                }

                c = _src.Next;
            }
        }

        /// <summary>Skips whitespace inside a tag and starts the next attribute name.</summary>
        /// <param name="c">The current character.</param>
        /// <param name="tag">The tag token being built.</param>
        private HtmlToken AttributeBeforeName(char c, HtmlTagToken tag) {
            while (c.IsSpaceCharacter()) {
                c = _src.Next;
            }

            switch (c) {
                case '/':
                    return TagSelfClosing(_src.Next, tag);
                case '>':
                    return EmitTag(tag);
                default:
                    if (!c.IsUppercaseAscii()) {
                        switch (c) {
                            case '':
                                RaiseErrorOccurred(ErrorCode.NULL);
                                _stringBuffer.Clear();
                                _stringBuffer.Append('�');
                                return AttributeName(_src.Next, tag);
                            case '"':
                            case '\'':
                            case '<':
                            case '=':
                                RaiseErrorOccurred(ErrorCode.AttributeNameInvalid);
                                _stringBuffer.Clear();
                                _stringBuffer.Append(c);
                                return AttributeName(_src.Next, tag);
                            case '':
                                return HtmlToken.EOF;
                            default:
                                _stringBuffer.Clear();
                                _stringBuffer.Append(c);
                                return AttributeName(_src.Next, tag);
                        }
                    }

                    _stringBuffer.Clear();
                    // Invariant casing: char.ToLower follows the current culture, which maps 'I' to a
                    // dotless 'ı' under Turkish cultures and would corrupt the attribute name.
                    _stringBuffer.Append(char.ToLowerInvariant(c));
                    return AttributeName(_src.Next, tag);
            }
        }

        /// <summary>Reads an attribute name (lower-casing ASCII capitals) and registers it on the tag.</summary>
        /// <param name="c">The current character.</param>
        /// <param name="tag">The tag token being built.</param>
        private HtmlToken AttributeName(char c, HtmlTagToken tag) {
            while (!c.IsSpaceCharacter()) {
                switch (c) {
                    case '/':
                        tag.AddAttribute(_stringBuffer.ToString());
                        return TagSelfClosing(_src.Next, tag);
                    case '=':
                        tag.AddAttribute(_stringBuffer.ToString());
                        return AttributeBeforeValue(_src.Next, tag);
                    case '>':
                        tag.AddAttribute(_stringBuffer.ToString());
                        return EmitTag(tag);
                    case '':
                        return HtmlToken.EOF;
                    case '':
                        RaiseErrorOccurred(ErrorCode.NULL);
                        _stringBuffer.Append('�');
                        break;
                    default:
                        if (c.IsUppercaseAscii()) {
                            _stringBuffer.Append(char.ToLowerInvariant(c));
                        } else if (c == '"' || c == '\'' || c == '<') {
                            // Reported, but the character still becomes part of the name.
                            RaiseErrorOccurred(ErrorCode.AttributeNameInvalid);
                            _stringBuffer.Append(c);
                        } else {
                            _stringBuffer.Append(c);
                        }

                        break;
                }

                c = _src.Next;
            }

            tag.AddAttribute(_stringBuffer.ToString());
            return AttributeAfterName(_src.Next, tag);
        }

        /// <summary>Skips whitespace after an attribute name; '=' leads to the value, other text starts a new name.</summary>
        /// <param name="c">The current character.</param>
        /// <param name="tag">The tag token being built.</param>
        private HtmlToken AttributeAfterName(char c, HtmlTagToken tag) {
            while (c.IsSpaceCharacter()) {
                c = _src.Next;
            }

            switch (c) {
                case '/':
                    return TagSelfClosing(_src.Next, tag);
                case '=':
                    return AttributeBeforeValue(_src.Next, tag);
                case '>':
                    return EmitTag(tag);
                default:
                    if (!c.IsUppercaseAscii()) {
                        switch (c) {
                            case '':
                                RaiseErrorOccurred(ErrorCode.NULL);
                                _stringBuffer.Clear();
                                _stringBuffer.Append('�');
                                return AttributeName(_src.Next, tag);
                            case '"':
                            case '\'':
                            case '<':
                                RaiseErrorOccurred(ErrorCode.AttributeNameInvalid);
                                _stringBuffer.Clear();
                                _stringBuffer.Append(c);
                                return AttributeName(_src.Next, tag);
                            case '':
                                return HtmlToken.EOF;
                            default:
                                _stringBuffer.Clear();
                                _stringBuffer.Append(c);
                                return AttributeName(_src.Next, tag);
                        }
                    }

                    _stringBuffer.Clear();
                    _stringBuffer.Append(char.ToLowerInvariant(c));
                    return AttributeName(_src.Next, tag);
            }
        }

        /// <summary>Skips whitespace after '=' and selects the quoted or unquoted value state.</summary>
        /// <param name="c">The current character.</param>
        /// <param name="tag">The tag token being built.</param>
        private HtmlToken AttributeBeforeValue(char c, HtmlTagToken tag) {
            while (c.IsSpaceCharacter()) {
                c = _src.Next;
            }

            switch (c) {
                case '"':
                    _stringBuffer.Clear();
                    return AttributeDoubleQuotedValue(_src.Next, tag);
                case '&':
                    _stringBuffer.Clear();
                    return AttributeUnquotedValue(c, tag);
                case '\'':
                    _stringBuffer.Clear();
                    return AttributeSingleQuotedValue(_src.Next, tag);
                case '':
                    RaiseErrorOccurred(ErrorCode.NULL);
                    // The buffer still holds the attribute name at this point. Every other branch
                    // clears it before the value starts; this one did not, so the name leaked into
                    // the value.
                    _stringBuffer.Clear().Append('�');
                    return AttributeUnquotedValue(_src.Next, tag);
                case '>':
                    RaiseErrorOccurred(ErrorCode.TagClosedWrong);
                    return EmitTag(tag);
                case '<':
                case '=':
                case '`':
                    RaiseErrorOccurred(ErrorCode.AttributeValueInvalid);
                    _stringBuffer.Clear().Append(c);
                    return AttributeUnquotedValue(_src.Next, tag);
                case '':
                    return HtmlToken.EOF;
                default:
                    _stringBuffer.Clear().Append(c);
                    return AttributeUnquotedValue(_src.Next, tag);
            }
        }

        /// <summary>Reads a double-quoted attribute value, resolving '&amp;' references.</summary>
        /// <param name="c">The current character.</param>
        /// <param name="tag">The tag token being built.</param>
        private HtmlToken AttributeDoubleQuotedValue(char c, HtmlTagToken tag) {
            while (true) {
                switch (c) {
                    case '"':
                        tag.SetAttributeValue(_stringBuffer.ToString());
                        return AttributeAfterValue(_src.Next, tag);
                    case '&': {
                        string text = CharacterReference(_src.Next, '"');

                        if (text == null) {
                            _stringBuffer.Append('&');
                        } else {
                            _stringBuffer.Append(text);
                        }

                        break;
                    }
                    case '':
                        RaiseErrorOccurred(ErrorCode.NULL);
                        _stringBuffer.Append('�');
                        break;
                    case '':
                        return HtmlToken.EOF;
                    default:
                        _stringBuffer.Append(c);
                        break;
                }

                c = _src.Next;
            }
        }

        /// <summary>Reads a single-quoted attribute value, resolving '&amp;' references.</summary>
        /// <param name="c">The current character.</param>
        /// <param name="tag">The tag token being built.</param>
        private HtmlToken AttributeSingleQuotedValue(char c, HtmlTagToken tag) {
            while (true) {
                switch (c) {
                    case '\'':
                        tag.SetAttributeValue(_stringBuffer.ToString());
                        return AttributeAfterValue(_src.Next, tag);
                    case '&': {
                        string text = CharacterReference(_src.Next, '\'');

                        if (text == null) {
                            _stringBuffer.Append('&');
                        } else {
                            _stringBuffer.Append(text);
                        }

                        break;
                    }
                    case '':
                        RaiseErrorOccurred(ErrorCode.NULL);
                        _stringBuffer.Append('�');
                        break;
                    case '':
                        return HtmlToken.EOF;
                    default:
                        _stringBuffer.Append(c);
                        break;
                }

                c = _src.Next;
            }
        }

        /// <summary>Reads an unquoted attribute value up to whitespace or '&gt;', resolving '&amp;' references.</summary>
        /// <param name="c">The current character.</param>
        /// <param name="tag">The tag token being built.</param>
        private HtmlToken AttributeUnquotedValue(char c, HtmlTagToken tag) {
            while (!c.IsSpaceCharacter()) {
                switch (c) {
                    case '&': {
                        string text = CharacterReference(_src.Next, '>');

                        if (text == null) {
                            _stringBuffer.Append('&');
                        } else {
                            _stringBuffer.Append(text);
                        }

                        break;
                    }
                    case '>':
                        tag.SetAttributeValue(_stringBuffer.ToString());
                        return EmitTag(tag);
                    case '':
                        RaiseErrorOccurred(ErrorCode.NULL);
                        _stringBuffer.Append('�');
                        break;
                    case '"':
                    case '\'':
                    case '<':
                    case '=':
                    case '`':
                        // Reported, but the character still becomes part of the value.
                        RaiseErrorOccurred(ErrorCode.AttributeValueInvalid);
                        _stringBuffer.Append(c);
                        break;
                    case '':
                        return HtmlToken.EOF;
                    default:
                        _stringBuffer.Append(c);
                        break;
                }

                c = _src.Next;
            }

            tag.SetAttributeValue(_stringBuffer.ToString());
            return AttributeBeforeName(_src.Next, tag);
        }
// NOTE(review): throughout this section the NUL and end-of-input character literals were
// unreadable (empty) in the reviewed text. They are written as '\0' and '\u001a' here;
// the end-of-input value is an assumption - confirm against the source manager.

/// <summary>
/// Decides what follows a finished attribute value: another attribute, a self-closing
/// slash, or the end of the tag.
/// </summary>
/// <param name="c">The current input character.</param>
/// <param name="tag">The tag token being built.</param>
/// <returns>The token produced by the follow-up state, or <c>HtmlToken.EOF</c>.</returns>
private HtmlToken AttributeAfterValue(char c, HtmlTagToken tag)
{
    if (c.IsSpaceCharacter())
    {
        return AttributeBeforeName(_src.Next, tag);
    }

    switch (c)
    {
        case '/':
            return TagSelfClosing(_src.Next, tag);
        case '>':
            return EmitTag(tag);
        case '\u001a':
            return HtmlToken.EOF;
        default:
            // Report the problem, then reprocess the same character as the start of an attribute.
            RaiseErrorOccurred(ErrorCode.AttributeNameExpected);
            return AttributeBeforeName(c, tag);
    }
}

/// <summary>
/// Script data state: copies characters into <c>_buffer</c> until a '&lt;' or the end of
/// the input is reached.
/// </summary>
/// <param name="c">The current input character.</param>
/// <returns>The token produced by a follow-up state, or <c>HtmlToken.EOF</c>.</returns>
private HtmlToken ScriptData(char c)
{
    while (true)
    {
        switch (c)
        {
            case '<':
                return ScriptDataLT(_src.Next);
            case '\0':
                RaiseErrorOccurred(ErrorCode.NULL);
                _buffer.Append('\uFFFD');
                break;
            case '\u001a':
                return HtmlToken.EOF;
            default:
                _buffer.Append(c);
                break;
        }

        c = _src.Next;
    }
}

/// <summary>
/// Script data less-than sign state: a '&lt;' was seen inside script data.
/// </summary>
/// <param name="c">The character following the '&lt;'.</param>
/// <returns>The token produced by the follow-up state.</returns>
private HtmlToken ScriptDataLT(char c)
{
    switch (c)
    {
        case '/':
            return ScriptDataEndTag(_src.Next);
        case '!':
            _buffer.Append('<').Append('!');
            return ScriptDataStartEscape(_src.Next);
        default:
            // Not markup after all: the '<' is plain text and c is reprocessed.
            _buffer.Append('<');
            return ScriptData(c);
    }
}

/// <summary>
/// Script data end tag open state: "&lt;/" was seen inside script data.
/// </summary>
/// <param name="c">The character following the "&lt;/".</param>
/// <returns>The token produced by the follow-up state.</returns>
private HtmlToken ScriptDataEndTag(char c)
{
    if (c.IsLetter())
    {
        _stringBuffer.Clear();
        _stringBuffer.Append(c);
        return ScriptDataNameEndTag(_src.Next, HtmlToken.CloseTag());
    }

    _buffer.Append('<').Append('/');
    return ScriptData(c);
}

/// <summary>
/// Script data end tag name state: collects the letters of a potential end tag name and
/// accepts it only when it equals the name of the last emitted start tag.
/// </summary>
/// <param name="c">The current input character.</param>
/// <param name="tag">The end tag token that receives the name on a match.</param>
/// <returns>The token produced by the follow-up state.</returns>
private HtmlToken ScriptDataNameEndTag(char c, HtmlTagToken tag)
{
    // Gather the whole run of letters first; the comparison only matters at its end.
    while (c.IsLetter())
    {
        _stringBuffer.Append(c);
        c = _src.Next;
    }

    // Tag names are not linguistic text, so lower-casing must not depend on the current culture.
    string name = _stringBuffer.ToString().ToLowerInvariant();

    if (name == _lastStartTag)
    {
        if (c.IsSpaceCharacter())
        {
            tag.Name = name;
            return AttributeBeforeName(_src.Next, tag);
        }

        if (c == '/')
        {
            tag.Name = name;
            return TagSelfClosing(_src.Next, tag);
        }

        if (c == '>')
        {
            tag.Name = name;
            return EmitTag(tag);
        }
    }

    // No appropriate end tag: everything consumed so far is ordinary script text.
    _buffer.Append('<').Append('/');
    _buffer.Append(_stringBuffer.ToString());
    return ScriptData(c);
}

/// <summary>
/// Script data escape start state: "&lt;!" was seen, a '-' may begin an escaped section.
/// </summary>
/// <param name="c">The current input character.</param>
/// <returns>The token produced by the follow-up state.</returns>
private HtmlToken ScriptDataStartEscape(char c)
{
    if (c == '-')
    {
        _buffer.Append('-');
        return ScriptDataStartEscapeDash(_src.Next);
    }

    return ScriptData(c);
}

/// <summary>
/// Script data escaped state: copies characters into <c>_buffer</c> while remaining in
/// the escaped section.
/// </summary>
/// <remarks>
/// The previous implementation handed every ordinary character back to ScriptData and
/// thereby left the escaped section after the first such character. The state is now
/// kept, and the per-character work is done in a loop.
/// </remarks>
/// <param name="c">The current input character.</param>
/// <returns>The token produced by a follow-up state, or <c>HtmlToken.EOF</c>.</returns>
private HtmlToken ScriptDataEscaped(char c)
{
    while (true)
    {
        switch (c)
        {
            case '-':
                _buffer.Append('-');
                return ScriptDataEscapedDash(_src.Next);
            case '<':
                return ScriptDataEscapedLT(_src.Next);
            case '\0':
                RaiseErrorOccurred(ErrorCode.NULL);
                _buffer.Append('\uFFFD');
                break;
            case '\u001a':
                // Reported the same way the double escaped states report it.
                RaiseErrorOccurred(ErrorCode.EOF);
                return HtmlToken.EOF;
            default:
                _buffer.Append(c);
                break;
        }

        c = _src.Next;
    }
}

/// <summary>
/// Script data escape start dash state: "&lt;!-" was seen, a second '-' opens the
/// escaped section.
/// </summary>
/// <param name="c">The current input character.</param>
/// <returns>The token produced by the follow-up state.</returns>
private HtmlToken ScriptDataStartEscapeDash(char c)
{
    if (c == '-')
    {
        _buffer.Append('-');
        return ScriptDataEscapedDashDash(_src.Next);
    }

    return ScriptData(c);
}

/// <summary>
/// Script data escaped dash state: one '-' was seen inside an escaped section.
/// </summary>
/// <param name="c">The current input character.</param>
/// <returns>The token produced by the follow-up state, or <c>HtmlToken.EOF</c>.</returns>
private HtmlToken ScriptDataEscapedDash(char c)
{
    switch (c)
    {
        case '-':
            _buffer.Append('-');
            return ScriptDataEscapedDashDash(_src.Next);
        case '<':
            return ScriptDataEscapedLT(_src.Next);
        case '\0':
            RaiseErrorOccurred(ErrorCode.NULL);
            _buffer.Append('\uFFFD');
            return ScriptDataEscaped(_src.Next);
        case '\u001a':
            RaiseErrorOccurred(ErrorCode.EOF);
            return HtmlToken.EOF;
        default:
            _buffer.Append(c);
            return ScriptDataEscaped(_src.Next);
    }
}

/// <summary>
/// Script data escaped dash dash state: at least two '-' in a row were seen inside an
/// escaped section; a following '&gt;' ends the section.
/// </summary>
/// <param name="c">The current input character.</param>
/// <returns>The token produced by the follow-up state, or <c>HtmlToken.EOF</c>.</returns>
private HtmlToken ScriptDataEscapedDashDash(char c)
{
    // Further dashes keep the state; consume them without recursing.
    while (c == '-')
    {
        _buffer.Append('-');
        c = _src.Next;
    }

    switch (c)
    {
        case '<':
            return ScriptDataEscapedLT(_src.Next);
        case '>':
            _buffer.Append('>');
            return ScriptData(_src.Next);
        case '\0':
            RaiseErrorOccurred(ErrorCode.NULL);
            _buffer.Append('\uFFFD');
            return ScriptDataEscaped(_src.Next);
        case '\u001a':
            RaiseErrorOccurred(ErrorCode.EOF);
            return HtmlToken.EOF;
        default:
            _buffer.Append(c);
            return ScriptDataEscaped(_src.Next);
    }
}

/// <summary>
/// Script data escaped less-than sign state: a '&lt;' was seen inside an escaped section.
/// </summary>
/// <remarks>
/// A '/' now leads to the escaped end tag states, so a non-matching end tag falls back
/// to the escaped section instead of plain script data.
/// </remarks>
/// <param name="c">The character following the '&lt;'.</param>
/// <returns>The token produced by the follow-up state.</returns>
private HtmlToken ScriptDataEscapedLT(char c)
{
    if (c == '/')
    {
        return ScriptDataEscapedEndTag(_src.Next, HtmlToken.CloseTag());
    }

    if (c.IsLetter())
    {
        _stringBuffer.Clear();
        _stringBuffer.Append(c);
        _buffer.Append('<');
        _buffer.Append(c);
        return ScriptDataStartDoubleEscape(_src.Next);
    }

    _buffer.Append('<');
    return ScriptDataEscaped(c);
}

/// <summary>
/// Script data escaped end tag open state: "&lt;/" was seen inside an escaped section.
/// </summary>
/// <remarks>
/// The previous implementation called itself for every letter, clearing the name buffer
/// each time and never reaching ScriptDataEscapedNameTag.
/// </remarks>
/// <param name="c">The character following the "&lt;/".</param>
/// <param name="tag">The end tag token to fill if a matching name follows.</param>
/// <returns>The token produced by the follow-up state.</returns>
private HtmlToken ScriptDataEscapedEndTag(char c, HtmlTagToken tag)
{
    if (c.IsLetter())
    {
        _stringBuffer.Clear();
        _stringBuffer.Append(c);
        return ScriptDataEscapedNameTag(_src.Next, tag);
    }

    _buffer.Append('<').Append('/');
    return ScriptDataEscaped(c);
}

/// <summary>
/// Script data escaped end tag name state: collects the letters of a potential end tag
/// name and accepts it only when it equals the name of the last emitted start tag.
/// </summary>
/// <param name="c">The current input character.</param>
/// <param name="tag">The end tag token that receives the name on a match.</param>
/// <returns>The token produced by the follow-up state.</returns>
private HtmlToken ScriptDataEscapedNameTag(char c, HtmlTagToken tag)
{
    // Gather the whole run of letters first; the comparison only matters at its end.
    while (c.IsLetter())
    {
        _stringBuffer.Append(c);
        c = _src.Next;
    }

    // Tag names are not linguistic text, so lower-casing must not depend on the current culture.
    string name = _stringBuffer.ToString().ToLowerInvariant();

    if (name == _lastStartTag)
    {
        if (c.IsSpaceCharacter())
        {
            tag.Name = name;
            return AttributeBeforeName(_src.Next, tag);
        }

        if (c == '/')
        {
            tag.Name = name;
            return TagSelfClosing(_src.Next, tag);
        }

        if (c == '>')
        {
            tag.Name = name;
            return EmitTag(tag);
        }
    }

    // No appropriate end tag: the consumed characters are text of the escaped section.
    _buffer.Append('<').Append('/');
    _buffer.Append(_stringBuffer.ToString());
    return ScriptDataEscaped(c);
}

/// <summary>
/// Script data double escape start state: reads the letters after a '&lt;' inside an
/// escaped section and enters the double escaped section when they spell "script".
/// </summary>
/// <param name="c">The current input character.</param>
/// <returns>The token produced by the follow-up state.</returns>
private HtmlToken ScriptDataStartDoubleEscape(char c)
{
    while (c.IsLetter())
    {
        _stringBuffer.Append(c);
        _buffer.Append(c);
        c = _src.Next;
    }

    if (c.IsSpaceCharacter() || c == '/' || c == '>')
    {
        _buffer.Append(c);

        if (string.Equals(_stringBuffer.ToString(), "script", StringComparison.OrdinalIgnoreCase))
        {
            return ScriptDataEscapedDouble(_src.Next);
        }

        return ScriptDataEscaped(_src.Next);
    }

    return ScriptDataEscaped(c);
}

/// <summary>
/// Script data double escaped state: copies characters into <c>_buffer</c> while inside
/// a double escaped section.
/// </summary>
/// <remarks>
/// The previous implementation appended the NUL character itself right after its
/// replacement character, because the NUL case fell through to the shared append.
/// </remarks>
/// <param name="c">The current input character.</param>
/// <returns>The token produced by a follow-up state, or <c>HtmlToken.EOF</c>.</returns>
private HtmlToken ScriptDataEscapedDouble(char c)
{
    while (true)
    {
        switch (c)
        {
            case '-':
                _buffer.Append('-');
                return ScriptDataEscapedDoubleDash(_src.Next);
            case '<':
                _buffer.Append('<');
                return ScriptDataEscapedDoubleLT(_src.Next);
            case '\0':
                // Only the replacement character is emitted for a NUL.
                RaiseErrorOccurred(ErrorCode.NULL);
                _buffer.Append('\uFFFD');
                break;
            case '\u001a':
                RaiseErrorOccurred(ErrorCode.EOF);
                return HtmlToken.EOF;
            default:
                _buffer.Append(c);
                break;
        }

        c = _src.Next;
    }
}

/// <summary>
/// Script data double escaped dash state: one '-' was seen inside a double escaped section.
/// </summary>
/// <param name="c">The current input character.</param>
/// <returns>The token produced by the follow-up state, or <c>HtmlToken.EOF</c>.</returns>
private HtmlToken ScriptDataEscapedDoubleDash(char c)
{
    switch (c)
    {
        case '-':
            _buffer.Append('-');
            return ScriptDataEscapedDoubleDashDash(_src.Next);
        case '<':
            _buffer.Append('<');
            return ScriptDataEscapedDoubleLT(_src.Next);
        case '\0':
            RaiseErrorOccurred(ErrorCode.NULL);
            _buffer.Append('\uFFFD');
            return ScriptDataEscapedDouble(_src.Next);
        case '\u001a':
            RaiseErrorOccurred(ErrorCode.EOF);
            return HtmlToken.EOF;
        default:
            _buffer.Append(c);
            return ScriptDataEscapedDouble(_src.Next);
    }
}

/// <summary>
/// Script data double escaped dash dash state: at least two '-' in a row were seen
/// inside a double escaped section; a following '&gt;' returns to plain script data.
/// </summary>
/// <param name="c">The current input character.</param>
/// <returns>The token produced by the follow-up state, or <c>HtmlToken.EOF</c>.</returns>
private HtmlToken ScriptDataEscapedDoubleDashDash(char c)
{
    // Further dashes keep the state; consume them without recursing.
    while (c == '-')
    {
        _buffer.Append('-');
        c = _src.Next;
    }

    switch (c)
    {
        case '<':
            _buffer.Append('<');
            return ScriptDataEscapedDoubleLT(_src.Next);
        case '>':
            _buffer.Append('>');
            return ScriptData(_src.Next);
        case '\0':
            RaiseErrorOccurred(ErrorCode.NULL);
            _buffer.Append('\uFFFD');
            return ScriptDataEscapedDouble(_src.Next);
        case '\u001a':
            RaiseErrorOccurred(ErrorCode.EOF);
            return HtmlToken.EOF;
        default:
            _buffer.Append(c);
            return ScriptDataEscapedDouble(_src.Next);
    }
}

/// <summary>
/// Script data double escaped less-than sign state: a '&lt;' was seen inside a double
/// escaped section (it has already been appended by the caller).
/// </summary>
/// <param name="c">The character following the '&lt;'.</param>
/// <returns>The token produced by the follow-up state.</returns>
private HtmlToken ScriptDataEscapedDoubleLT(char c)
{
    if (c == '/')
    {
        _stringBuffer.Clear();
        _buffer.Append('/');
        return ScriptDataEndDoubleEscape(_src.Next);
    }

    return ScriptDataEscapedDouble(c);
}

/// <summary>
/// Script data double escape end state: reads the letters after "&lt;/" inside a double
/// escaped section and returns to the escaped section when they spell "script".
/// </summary>
/// <param name="c">The current input character.</param>
/// <returns>The token produced by the follow-up state.</returns>
private HtmlToken ScriptDataEndDoubleEscape(char c)
{
    while (c.IsLetter())
    {
        _stringBuffer.Append(c);
        _buffer.Append(c);
        c = _src.Next;
    }

    if (c.IsSpaceCharacter() || c == '/' || c == '>')
    {
        _buffer.Append(c);

        if (string.Equals(_stringBuffer.ToString(), "script", StringComparison.OrdinalIgnoreCase))
        {
            return ScriptDataEscaped(_src.Next);
        }

        return ScriptDataEscapedDouble(_src.Next);
    }

    return ScriptDataEscapedDouble(c);
}

/// <summary>
/// Finalizes a tag token before it is handed out. The parse mode is reset to PCData.
/// For start tags, later duplicates of an attribute key are removed and the tag name is
/// remembered as the last start tag; for other tags, a self-closing flag or any
/// attribute is reported as an error.
/// </summary>
/// <param name="tag">The tag token to finalize.</param>
/// <returns>The same tag token.</returns>
private HtmlTagToken EmitTag(HtmlTagToken tag)
{
    _model = HtmlParseMode.PCData;

    if (tag.Type != HtmlTokenType.StartTag)
    {
        if (tag.IsSelfClosing)
        {
            RaiseErrorOccurred(ErrorCode.EndTagCannotBeSelfClosed);
        }

        if (tag.Attributes.Count != 0)
        {
            RaiseErrorOccurred(ErrorCode.EndTagCannotHaveAttributes);
        }

        return tag;
    }

    // Walk from the back so that removing an entry never shifts an index still to be visited;
    // the first occurrence of every key is the one that survives.
    for (int later = tag.Attributes.Count - 1; later > 0; later--)
    {
        for (int earlier = later - 1; earlier >= 0; earlier--)
        {
            if (tag.Attributes[earlier].Key == tag.Attributes[later].Key)
            {
                tag.Attributes.RemoveAt(later);
                RaiseErrorOccurred(ErrorCode.AttributeDuplicateOmitted);
                break;
            }
        }
    }

    _lastStartTag = tag.Name;
    return tag;
}
} // class HtmlTokenizer
} // namespace AngleSharp.Parser.Html