AngleSharp by Florian Rappl

<PackageReference Include="AngleSharp" Version="0.6.0" />

 HtmlTokenizer

sealed class HtmlTokenizer : BaseTokenizer
Performs the tokenization of HTML source text. Follows the tokenization algorithm specified at: http://www.w3.org/html/wg/drafts/html/master/syntax.html
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Text;

namespace AngleSharp.Parser.Html
{
    // NOTE(review): this listing lost the characters inside several char literals
    // (they appear as ''). Where the branch raises ErrorCode.Null the literal has
    // been restored as '\0'. The remaining '' literals are the end-of-file
    // sentinel; its value is not visible in this file and must be restored from
    // the tokenizer base class before the file compiles.

    /// <summary>
    /// Produces HTML tokens from the characters delivered by the base tokenizer.
    /// Character data is collected in <c>_buffer</c>; markup is assembled in the
    /// inherited <c>_stringBuffer</c>.
    /// </summary>
    [DebuggerStepThrough]
    internal sealed class HtmlTokenizer : BaseTokenizer
    {
        private bool _acceptsCharacterData;
        private string _lastStartTag;
        private HtmlParseMode _model;
        private StringBuilder _buffer;
        private HtmlToken _buffered;

        /// <summary>
        /// Gets or sets whether a <c>[CDATA[</c> markup declaration is accepted.
        /// </summary>
        public bool AcceptsCharacterData
        {
            get { return _acceptsCharacterData; }
            set { _acceptsCharacterData = value; }
        }

        /// <summary>
        /// Creates a tokenizer over <paramref name="source"/>, starting in
        /// PCData mode with CDATA sections disabled.
        /// </summary>
        /// <param name="source">The text source handed to the base tokenizer.</param>
        public HtmlTokenizer(ITextSource source)
            : base(source)
        {
            _model = HtmlParseMode.PCData;
            _acceptsCharacterData = false;
            _buffer = new StringBuilder();
        }

        /// <summary>
        /// Returns the next token.
        /// </summary>
        /// <returns>
        /// The token held back by the previous call if there is one; otherwise
        /// <c>HtmlToken.EOF</c> when the source has ended, or the token produced
        /// by the state that matches the current parse mode.
        /// </returns>
        public HtmlToken Get()
        {
            HtmlToken token = _buffered;

            if (token != null)
            {
                _buffered = null;
                return token;
            }

            char next = base.Next;

            if (base.IsEnded)
            {
                return HtmlToken.EOF;
            }

            switch (_model)
            {
                case HtmlParseMode.PCData:
                    token = Data(next);
                    break;
                case HtmlParseMode.RCData:
                    token = RCData(next);
                    break;
                case HtmlParseMode.Plaintext:
                    token = Plaintext(next);
                    break;
                case HtmlParseMode.Rawtext:
                    token = Rawtext(next);
                    break;
                case HtmlParseMode.Script:
                    token = ScriptData(next);
                    break;
            }

            // Text collected while producing the token is emitted first; the
            // token itself is held back for the following call.
            if (_buffer.Length > 0)
            {
                _buffered = token;
                token = HtmlToken.Character(_buffer.ToString());
                _buffer.Clear();
            }

            return token;
        }

        /// <summary>
        /// Sets the parse mode used by subsequent calls to <see cref="Get"/>.
        /// </summary>
        /// <param name="state">The new parse mode.</param>
        public void Switch(HtmlParseMode state)
        {
            _model = state;
        }

        /// <summary>
        /// Plaintext state: every character is text until the end of the source.
        /// </summary>
        /// <param name="c">The current character.</param>
        private HtmlToken Plaintext(char c)
        {
            while (true)
            {
                switch (c)
                {
                    case '\0':
                        RaiseErrorOccurred(ErrorCode.Null);
                        _buffer.Append('�');
                        break;
                    case '': // NOTE(review): end-of-file sentinel, literal lost in extraction
                        return HtmlToken.EOF;
                    default:
                        _buffer.Append(c);
                        break;
                }

                c = base.Next;
            }
        }

        /// <summary>
        /// Data state: collects text, resolves character references and hands
        /// over to <c>TagOpen</c> on '&lt;'.
        /// </summary>
        /// <param name="c">The current character.</param>
        private HtmlToken Data(char c)
        {
            while (true)
            {
                switch (c)
                {
                    case '&':
                    {
                        // A null result means no reference was consumed, so the
                        // ampersand is kept as literal text.
                        string text = CharacterReference(base.Next, '\0');
                        _buffer.Append(text ?? "&");
                        break;
                    }
                    case '<':
                        return TagOpen(base.Next);
                    case '\0':
                        // Reported and skipped. Continuing the loop (instead of
                        // re-entering Data recursively) keeps the stack flat for
                        // long runs of NUL characters.
                        RaiseErrorOccurred(ErrorCode.Null);
                        break;
                    case '': // NOTE(review): end-of-file sentinel, literal lost in extraction
                        return HtmlToken.EOF;
                    default:
                        _buffer.Append(c);
                        break;
                }

                c = base.Next;
            }
        }

        /// <summary>
        /// RCData state: like the data state, but '&lt;' only starts a
        /// potential end tag and NUL is replaced by U+FFFD.
        /// </summary>
        /// <param name="c">The current character.</param>
        private HtmlToken RCData(char c)
        {
            while (true)
            {
                switch (c)
                {
                    case '&':
                    {
                        string text = CharacterReference(base.Next, '\0');
                        _buffer.Append(text ?? "&");
                        break;
                    }
                    case '<':
                        return RCDataLT(base.Next);
                    case '\0':
                        RaiseErrorOccurred(ErrorCode.Null);
                        _buffer.Append('�');
                        break;
                    case '': // NOTE(review): end-of-file sentinel, literal lost in extraction
                        return HtmlToken.EOF;
                    default:
                        _buffer.Append(c);
                        break;
                }

                c = base.Next;
            }
        }

        /// <summary>
        /// RCData state after '&lt;': only "&lt;/" can begin an end tag;
        /// anything else is text.
        /// </summary>
        /// <param name="c">The character following '&lt;'.</param>
        private HtmlToken RCDataLT(char c)
        {
            if (c == '/')
            {
                _stringBuffer.Clear();
                return RCDataEndTag(base.Next);
            }

            _buffer.Append('<');
            return RCData(c);
        }

        /// <summary>
        /// RCData state after "&lt;/": an ASCII letter starts an end tag name;
        /// anything else turns "&lt;/" into text.
        /// </summary>
        /// <param name="c">The character following "&lt;/".</param>
        private HtmlToken RCDataEndTag(char c)
        {
            if (c.IsUppercaseAscii() || c.IsLowercaseAscii())
            {
                _stringBuffer.Clear();
                // Invariant lowering: culture-sensitive char.ToLower maps 'I' to
                // a dotless i under Turkish cultures and would break matching.
                _stringBuffer.Append(char.ToLowerInvariant(c));
                return RCDataNameEndTag(base.Next, HtmlToken.CloseTag());
            }

            _buffer.Append('<').Append('/');
            return RCData(c);
        }

        /// <summary>
        /// Collects the end tag name in RCData. The tag is only accepted when
        /// the name equals <c>_lastStartTag</c>; otherwise the consumed
        /// characters are emitted as text.
        /// </summary>
        /// <param name="c">The current character.</param>
        /// <param name="tag">The close tag token being built.</param>
        private HtmlToken RCDataNameEndTag(char c, HtmlTagToken tag)
        {
            // Iterative on purpose: one stack frame per letter would let a very
            // long run of letters after "</" overflow the stack.
            while (true)
            {
                string name = _stringBuffer.ToString();
                bool matchesStartTag = name == _lastStartTag;

                if (matchesStartTag && c.IsSpaceCharacter())
                {
                    tag.Name = name;
                    return AttributeBeforeName(base.Next, tag);
                }

                if (matchesStartTag && c == '/')
                {
                    tag.Name = name;
                    return TagSelfClosing(base.Next, tag);
                }

                if (matchesStartTag && c == '>')
                {
                    tag.Name = name;
                    return EmitTag(tag);
                }

                if (!c.IsUppercaseAscii() && !c.IsLowercaseAscii())
                {
                    _buffer.Append('<').Append('/');
                    _buffer.Append(name);
                    return RCData(c);
                }

                _stringBuffer.Append(char.ToLowerInvariant(c));
                c = base.Next;
            }
        }

        /// <summary>
        /// Rawtext state: collects text; '&lt;' may start an end tag and NUL is
        /// replaced by U+FFFD.
        /// </summary>
        /// <param name="c">The current character.</param>
        private HtmlToken Rawtext(char c)
        {
            while (true)
            {
                switch (c)
                {
                    case '<':
                        return RawtextLT(base.Next);
                    case '\0':
                        RaiseErrorOccurred(ErrorCode.Null);
                        _buffer.Append('�');
                        break;
                    case '': // NOTE(review): end-of-file sentinel, literal lost in extraction
                        return HtmlToken.EOF;
                    default:
                        _buffer.Append(c);
                        break;
                }

                c = base.Next;
            }
        }

        /// <summary>
        /// Rawtext state after '&lt;': only "&lt;/" can begin an end tag;
        /// anything else is text.
        /// </summary>
        /// <param name="c">The character following '&lt;'.</param>
        private HtmlToken RawtextLT(char c)
        {
            if (c == '/')
            {
                _stringBuffer.Clear();
                return RawtextEndTag(base.Next);
            }

            _buffer.Append('<');
            return Rawtext(c);
        }

        // Modifier of the next member; its declaration continues on the following line.
        private
// Rawtext state after "</" (the `private` modifier of this method ends the previous line).
// An ASCII letter starts an end tag name; anything else turns "</" into text.
// NOTE(review): '' literals below are the end-of-file sentinel, whose value was
// lost in this listing; literals restored as '\0' are the NUL cases.
        HtmlToken RawtextEndTag(char c)
        {
            if (c.IsUppercaseAscii() || c.IsLowercaseAscii())
            {
                _stringBuffer.Clear();
                // Invariant lowering: culture-sensitive char.ToLower maps 'I' to
                // a dotless i under Turkish cultures and would break matching.
                _stringBuffer.Append(char.ToLowerInvariant(c));
                return RawtextNameEndTag(base.Next, HtmlToken.CloseTag());
            }

            _buffer.Append('<').Append('/');
            return Rawtext(c);
        }

        /// <summary>
        /// Collects the end tag name in Rawtext. The tag is only accepted when
        /// the name equals <c>_lastStartTag</c>; otherwise the consumed
        /// characters are emitted as text.
        /// </summary>
        /// <param name="c">The current character.</param>
        /// <param name="tag">The close tag token being built.</param>
        private HtmlToken RawtextNameEndTag(char c, HtmlTagToken tag)
        {
            // Iterative on purpose: one stack frame per letter would let a very
            // long run of letters after "</" overflow the stack.
            while (true)
            {
                string name = _stringBuffer.ToString();
                bool matchesStartTag = name == _lastStartTag;

                if (matchesStartTag && c.IsSpaceCharacter())
                {
                    tag.Name = name;
                    return AttributeBeforeName(base.Next, tag);
                }

                if (matchesStartTag && c == '/')
                {
                    tag.Name = name;
                    return TagSelfClosing(base.Next, tag);
                }

                if (matchesStartTag && c == '>')
                {
                    tag.Name = name;
                    return EmitTag(tag);
                }

                if (!c.IsUppercaseAscii() && !c.IsLowercaseAscii())
                {
                    _buffer.Append('<').Append('/');
                    _buffer.Append(name);
                    return Rawtext(c);
                }

                _stringBuffer.Append(char.ToLowerInvariant(c));
                c = base.Next;
            }
        }

        /// <summary>
        /// Reads a CDATA section and returns its content as a character token.
        /// Reading stops at "]]&gt;" or at the end of the source.
        /// </summary>
        /// <param name="c">The first character of the section content.</param>
        private HtmlToken CData(char c)
        {
            _stringBuffer.Clear();

            while (true)
            {
                if (c == '') // NOTE(review): end-of-file sentinel, literal lost in extraction
                {
                    Back();
                    break;
                }

                if (c == ']' && ContinuesWith("]]>", true))
                {
                    Advance(2);
                    break;
                }

                _stringBuffer.Append(c);
                c = base.Next;
            }

            return HtmlToken.Character(_stringBuffer.ToString());
        }

        /// <summary>
        /// Tries to consume a character reference following an ampersand.
        /// </summary>
        /// <param name="c">The character following the ampersand.</param>
        /// <param name="allowedCharacter">
        /// An additional character that ends the attempt immediately; a non-NUL
        /// value also enables the attribute-specific rule for named references
        /// that are not terminated by ';'.
        /// </param>
        /// <returns>
        /// The text the reference stands for, or <c>null</c> when nothing was
        /// recognised (callers then keep the ampersand as text).
        /// </returns>
        private string CharacterReference(char c, char allowedCharacter = '\0')
        {
            // NOTE(review): the '' literal in this condition is the end-of-file
            // sentinel, lost in extraction.
            if (c.IsSpaceCharacter() || c == '<' || c == '' || c == '&' || c == allowedCharacter)
            {
                Back();
                return null;
            }

            if (c == '#')
            {
                int radix = 10;
                int digitCount = 0;
                // Accumulated in a long and saturated at int.MaxValue: plain int
                // arithmetic wrapped silently, so an oversized reference such as
                // "&#x100000041;" was decoded as a small, valid code point.
                long code = 0;

                c = base.Next;
                bool isHex = c == 'x' || c == 'X';

                if (isHex)
                {
                    radix = 16;

                    while ((c = base.Next).IsHex())
                    {
                        code = Math.Min(code * radix + c.FromHex(), int.MaxValue);
                        digitCount++;
                    }
                }
                else
                {
                    while (c.IsDigit())
                    {
                        code = Math.Min(code * radix + c.FromHex(), int.MaxValue);
                        digitCount++;
                        c = base.Next;
                    }
                }

                if (digitCount == 0)
                {
                    Back(2);

                    if (isHex)
                    {
                        Back();
                    }

                    RaiseErrorOccurred(ErrorCode.CharacterReferenceWrongNumber);
                    return null;
                }

                if (c != ';')
                {
                    RaiseErrorOccurred(ErrorCode.CharacterReferenceSemicolonMissing);
                    Back();
                }

                int number = (int)code;

                if (Entities.IsInCharacterTable(number))
                {
                    RaiseErrorOccurred(ErrorCode.CharacterReferenceInvalidCode);
                    return Entities.GetSymbolFromTable(number);
                }

                if (Entities.IsInvalidNumber(number))
                {
                    RaiseErrorOccurred(ErrorCode.CharacterReferenceInvalidNumber);
                    return '�'.ToString();
                }

                if (Entities.IsInInvalidRange(number))
                {
                    RaiseErrorOccurred(ErrorCode.CharacterReferenceInvalidRange);
                }

                return Entities.Convert(number);
            }

            const int maxNameLength = 31;

            string result = null;
            // Characters read since the last successful lookup; handed to Back below.
            int unmatched = 0;
            int restorePoint = base.InsertionPoint - 1;
            char[] nameChars = new char[maxNameLength];
            int length = 0;
            char current = base.Current;

            while (current != ';' && current.IsName())
            {
                nameChars[length++] = current;
                string candidate = new string(nameChars, 0, length);
                current = base.Next;
                unmatched++;

                string symbol = current == ';'
                    ? Entities.GetSymbol(candidate)
                    : Entities.GetSymbolWithoutSemicolon(candidate);

                if (symbol != null)
                {
                    unmatched = 0;
                    result = symbol;
                }

                if (base.IsEnded || length >= maxNameLength)
                {
                    break;
                }
            }

            Back(unmatched);
            current = base.Current;

            if (current != ';')
            {
                if (allowedCharacter != 0 && (current == '=' || current.IsAlphanumericAscii()))
                {
                    if (current == '=')
                    {
                        RaiseErrorOccurred(ErrorCode.CharacterReferenceAttributeEqualsFound);
                    }

                    base.InsertionPoint = restorePoint;
                    return null;
                }

                Back();
                RaiseErrorOccurred(ErrorCode.CharacterReferenceNotTerminated);
            }

            return result;
        }

        /// <summary>
        /// Tag open state: decides what follows '&lt;' — a markup declaration,
        /// an end tag, a start tag, a bogus comment, or plain text.
        /// </summary>
        /// <param name="c">The character following '&lt;'.</param>
        private HtmlToken TagOpen(char c)
        {
            switch (c)
            {
                case '!':
                    return MarkupDeclaration(base.Next);
                case '/':
                    return TagEnd(base.Next);
            }

            if (c.IsUppercaseAscii() || c.IsLowercaseAscii())
            {
                _stringBuffer.Clear();
                _stringBuffer.Append(char.ToLowerInvariant(c));
                return TagName(base.Next, HtmlToken.OpenTag());
            }

            if (c == '?')
            {
                RaiseErrorOccurred(ErrorCode.BogusComment);
                return BogusComment(c);
            }

            // Not a tag after all: the '<' is emitted as text.
            _model = HtmlParseMode.PCData;
            RaiseErrorOccurred(ErrorCode.AmbiguousOpenTag);
            _buffer.Append('<');
            return Data(c);
        }

        // Modifier of the next member; its declaration continues on the following line.
        private
HtmlToken TagEnd(char c) { if (c.IsUppercaseAscii()) { _stringBuffer.Clear(); _stringBuffer.Append(char.ToLower(c)); return TagName(base.Next, HtmlToken.CloseTag()); } if (!c.IsLowercaseAscii()) { switch (c) { case '>': _model = HtmlParseMode.PCData; RaiseErrorOccurred(ErrorCode.TagClosedWrong); return Data(base.Next); case '': Back(); RaiseErrorOccurred(ErrorCode.EOF); _buffer.Append('<').Append('/'); return HtmlToken.EOF; default: RaiseErrorOccurred(ErrorCode.BogusComment); return BogusComment(c); } } _stringBuffer.Clear(); _stringBuffer.Append(c); return TagName(base.Next, HtmlToken.CloseTag()); } private HtmlToken TagName(char c, HtmlTagToken tag) { while (!c.IsSpaceCharacter()) { switch (c) { case '/': tag.Name = _stringBuffer.ToString(); return TagSelfClosing(base.Next, tag); case '>': tag.Name = _stringBuffer.ToString(); return EmitTag(tag); case '': RaiseErrorOccurred(ErrorCode.Null); _stringBuffer.Append('�'); break; case '': RaiseErrorOccurred(ErrorCode.EOF); return HtmlToken.EOF; default: if (c.IsUppercaseAscii()) _stringBuffer.Append(char.ToLower(c)); else _stringBuffer.Append(c); break; } c = base.Next; } tag.Name = _stringBuffer.ToString(); return AttributeBeforeName(base.Next, tag); } private HtmlToken TagSelfClosing(char c, HtmlTagToken tag) { switch (c) { case '>': tag.IsSelfClosing = true; return EmitTag(tag); case '': RaiseErrorOccurred(ErrorCode.EOF); return HtmlToken.EOF; default: RaiseErrorOccurred(ErrorCode.ClosingSlashMisplaced); return AttributeBeforeName(c, tag); } } private HtmlToken MarkupDeclaration(char c) { if (ContinuesWith("--", true)) { Advance(); return CommentStart(base.Next); } if (ContinuesWith(Tags.Doctype, true)) { Advance(6); return Doctype(base.Next); } if (_acceptsCharacterData && ContinuesWith("[CDATA[", false)) { Advance(6); return CData(base.Next); } RaiseErrorOccurred(ErrorCode.UndefinedMarkupDeclaration); return BogusComment(c); } private HtmlToken BogusComment(char c) { _stringBuffer.Clear(); while (true) { switch 
(c) { case '': Back(); goto case '>'; case '': _stringBuffer.Append('�'); break; default: _stringBuffer.Append(c); break; case '>': _model = HtmlParseMode.PCData; return HtmlToken.Comment(_stringBuffer.ToString()); } c = base.Next; } } private HtmlCommentToken CommentStart(char c) { _stringBuffer.Clear(); switch (c) { case '-': return CommentDashStart(base.Next); case '': RaiseErrorOccurred(ErrorCode.Null); _stringBuffer.Append('�'); return Comment(base.Next); case '>': _model = HtmlParseMode.PCData; RaiseErrorOccurred(ErrorCode.TagClosedWrong); return HtmlToken.Comment(_stringBuffer.ToString()); case '': RaiseErrorOccurred(ErrorCode.EOF); Back(); return HtmlToken.Comment(_stringBuffer.ToString()); default: _stringBuffer.Append(c); return Comment(base.Next); } } private HtmlCommentToken CommentDashStart(char c) { switch (c) { case '-': return CommentEnd(base.Next); case '': RaiseErrorOccurred(ErrorCode.Null); _stringBuffer.Append('-'); _stringBuffer.Append('�'); return Comment(base.Next); case '>': _model = HtmlParseMode.PCData; RaiseErrorOccurred(ErrorCode.TagClosedWrong); return HtmlToken.Comment(_stringBuffer.ToString()); case '': RaiseErrorOccurred(ErrorCode.EOF); Back(); return HtmlToken.Comment(_stringBuffer.ToString()); default: _stringBuffer.Append('-'); _stringBuffer.Append(c); return Comment(base.Next); } } private HtmlCommentToken Comment(char c) { while (true) { switch (c) { case '-': { HtmlCommentToken htmlCommentToken = CommentDashEnd(base.Next); if (htmlCommentToken != null) return htmlCommentToken; break; } case '': RaiseErrorOccurred(ErrorCode.EOF); Back(); return HtmlToken.Comment(_stringBuffer.ToString()); case '': RaiseErrorOccurred(ErrorCode.Null); c = '�'; _stringBuffer.Append(c); break; default: _stringBuffer.Append(c); break; } c = base.Next; } } private HtmlCommentToken CommentDashEnd(char c) { switch (c) { case '-': return CommentEnd(base.Next); case '': RaiseErrorOccurred(ErrorCode.EOF); Back(); return 
HtmlToken.Comment(_stringBuffer.ToString()); case '': RaiseErrorOccurred(ErrorCode.Null); c = '�'; break; } _stringBuffer.Append('-'); _stringBuffer.Append(c); return null; } private HtmlCommentToken CommentEnd(char c) { while (true) { switch (c) { case '>': _model = HtmlParseMode.PCData; return HtmlToken.Comment(_stringBuffer.ToString()); case '': RaiseErrorOccurred(ErrorCode.Null); _stringBuffer.Append('-'); _stringBuffer.Append('�'); return null; case '!': RaiseErrorOccurred(ErrorCode.CommentEndedWithEM); return CommentBangEnd(base.Next); case '-': break; case '': RaiseErrorOccurred(ErrorCode.EOF); Back(); return HtmlToken.Comment(_stringBuffer.ToString()); default: RaiseErrorOccurred(ErrorCode.CommentEndedUnexpected); _stringBuffer.Append('-'); _stringBuffer.Append('-'); _stringBuffer.Append(c); return null; } RaiseErrorOccurred(ErrorCode.CommentEndedWithDash); _stringBuffer.Append('-'); c = base.Next; } } private HtmlCommentToken CommentBangEnd(char c) { switch (c) { case '-': _stringBuffer.Append('-'); _stringBuffer.Append('-'); _stringBuffer.Append('!'); return CommentDashEnd(base.Next); case '>': _model = HtmlParseMode.PCData; return HtmlToken.Comment(_stringBuffer.ToString()); case '': RaiseErrorOccurred(ErrorCode.Null); _stringBuffer.Append('-'); _stringBuffer.Append('-'); _stringBuffer.Append('!'); _stringBuffer.Append('�'); return null; case '': RaiseErrorOccurred(ErrorCode.EOF); Back(); return HtmlToken.Comment(_stringBuffer.ToString()); default: _stringBuffer.Append('-'); _stringBuffer.Append('-'); _stringBuffer.Append('!'); _stringBuffer.Append(c); return null; } } private HtmlToken Doctype(char c) { if (c.IsSpaceCharacter()) return DoctypeNameBefore(base.Next); if (c == '') { RaiseErrorOccurred(ErrorCode.EOF); Back(); return HtmlToken.Doctype(true); } RaiseErrorOccurred(ErrorCode.DoctypeUnexpected); return DoctypeNameBefore(c); } private HtmlToken DoctypeNameBefore(char c) { while (c.IsSpaceCharacter()) { c = base.Next; } if (!c.IsUppercaseAscii()) 
{ switch (c) { case '': RaiseErrorOccurred(ErrorCode.Null); _stringBuffer.Clear(); _stringBuffer.Append('�'); return DoctypeName(base.Next, HtmlToken.Doctype(false)); case '>': _model = HtmlParseMode.PCData; RaiseErrorOccurred(ErrorCode.TagClosedWrong); return HtmlToken.Doctype(true); case '': RaiseErrorOccurred(ErrorCode.EOF); Back(); return HtmlToken.Doctype(true); default: _stringBuffer.Clear(); _stringBuffer.Append(c); return DoctypeName(base.Next, HtmlToken.Doctype(false)); } } _stringBuffer.Clear(); _stringBuffer.Append(char.ToLower(c)); return DoctypeName(base.Next, HtmlToken.Doctype(false)); } private HtmlToken DoctypeName(char c, HtmlDoctypeToken doctype) { while (true) { if (c.IsSpaceCharacter()) { doctype.Name = _stringBuffer.ToString(); _stringBuffer.Clear(); return DoctypeNameAfter(base.Next, doctype); } if (c == '>') break; if (c.IsUppercaseAscii()) _stringBuffer.Append(char.ToLower(c)); else { switch (c) { case '': RaiseErrorOccurred(ErrorCode.Null); _stringBuffer.Append('�'); break; case '': RaiseErrorOccurred(ErrorCode.EOF); Back(); doctype.IsQuirksForced = true; doctype.Name = _stringBuffer.ToString(); return doctype; default: _stringBuffer.Append(c); break; } } c = base.Next; } _model = HtmlParseMode.PCData; doctype.Name = _stringBuffer.ToString(); return doctype; } private HtmlToken DoctypeNameAfter(char c, HtmlDoctypeToken doctype) { while (c.IsSpaceCharacter()) { c = base.Next; } switch (c) { case '>': _model = HtmlParseMode.PCData; return doctype; case '': RaiseErrorOccurred(ErrorCode.EOF); Back(); doctype.IsQuirksForced = true; return doctype; default: if (ContinuesWith("public", true)) { Advance(5); return DoctypePublic(base.Next, doctype); } if (ContinuesWith("system", true)) { Advance(5); return DoctypeSystem(base.Next, doctype); } RaiseErrorOccurred(ErrorCode.DoctypeUnexpectedAfterName); doctype.IsQuirksForced = true; return BogusDoctype(base.Next, doctype); } } private HtmlToken DoctypePublic(char c, HtmlDoctypeToken doctype) { if 
(!c.IsSpaceCharacter()) { switch (c) { case '"': RaiseErrorOccurred(ErrorCode.DoubleQuotationMarkUnexpected); doctype.PublicIdentifier = string.Empty; return DoctypePublicIdentifierDoubleQuoted(base.Next, doctype); case '\'': RaiseErrorOccurred(ErrorCode.SingleQuotationMarkUnexpected); doctype.PublicIdentifier = string.Empty; return DoctypePublicIdentifierSingleQuoted(base.Next, doctype); case '>': _model = HtmlParseMode.PCData; RaiseErrorOccurred(ErrorCode.TagClosedWrong); doctype.IsQuirksForced = true; return doctype; case '': RaiseErrorOccurred(ErrorCode.EOF); doctype.IsQuirksForced = true; Back(); return doctype; default: RaiseErrorOccurred(ErrorCode.DoctypePublicInvalid); doctype.IsQuirksForced = true; return BogusDoctype(base.Next, doctype); } } return DoctypePublicIdentifierBefore(base.Next, doctype); } private HtmlToken DoctypePublicIdentifierBefore(char c, HtmlDoctypeToken doctype) { while (c.IsSpaceCharacter()) { c = base.Next; } switch (c) { case '"': _stringBuffer.Clear(); doctype.PublicIdentifier = string.Empty; return DoctypePublicIdentifierDoubleQuoted(base.Next, doctype); case '\'': _stringBuffer.Clear(); doctype.PublicIdentifier = string.Empty; return DoctypePublicIdentifierSingleQuoted(base.Next, doctype); case '>': _model = HtmlParseMode.PCData; RaiseErrorOccurred(ErrorCode.TagClosedWrong); doctype.IsQuirksForced = true; return doctype; case '': RaiseErrorOccurred(ErrorCode.EOF); doctype.IsQuirksForced = true; Back(); return doctype; default: RaiseErrorOccurred(ErrorCode.DoctypePublicInvalid); doctype.IsQuirksForced = true; return BogusDoctype(base.Next, doctype); } } private HtmlToken DoctypePublicIdentifierDoubleQuoted(char c, HtmlDoctypeToken doctype) { while (true) { switch (c) { case '"': doctype.PublicIdentifier = _stringBuffer.ToString(); _stringBuffer.Clear(); return DoctypePublicIdentifierAfter(base.Next, doctype); case '': RaiseErrorOccurred(ErrorCode.Null); _stringBuffer.Append('�'); break; case '>': _model = HtmlParseMode.PCData; 
RaiseErrorOccurred(ErrorCode.TagClosedWrong); doctype.IsQuirksForced = true; doctype.PublicIdentifier = _stringBuffer.ToString(); return doctype; case '': RaiseErrorOccurred(ErrorCode.EOF); Back(); doctype.IsQuirksForced = true; doctype.PublicIdentifier = _stringBuffer.ToString(); return doctype; default: _stringBuffer.Append(c); break; } c = base.Next; } } private HtmlToken DoctypePublicIdentifierSingleQuoted(char c, HtmlDoctypeToken doctype) { while (true) { switch (c) { case '\'': doctype.PublicIdentifier = _stringBuffer.ToString(); _stringBuffer.Clear(); return DoctypePublicIdentifierAfter(base.Next, doctype); case '': RaiseErrorOccurred(ErrorCode.Null); _stringBuffer.Append('�'); break; case '>': _model = HtmlParseMode.PCData; RaiseErrorOccurred(ErrorCode.TagClosedWrong); doctype.IsQuirksForced = true; doctype.PublicIdentifier = _stringBuffer.ToString(); return doctype; case '': RaiseErrorOccurred(ErrorCode.EOF); doctype.IsQuirksForced = true; doctype.PublicIdentifier = _stringBuffer.ToString(); Back(); return doctype; default: _stringBuffer.Append(c); break; } c = base.Next; } } private HtmlToken DoctypePublicIdentifierAfter(char c, HtmlDoctypeToken doctype) { if (!c.IsSpaceCharacter()) { switch (c) { case '>': _model = HtmlParseMode.PCData; return doctype; case '"': RaiseErrorOccurred(ErrorCode.DoubleQuotationMarkUnexpected); doctype.SystemIdentifier = string.Empty; return DoctypeSystemIdentifierDoubleQuoted(base.Next, doctype); case '\'': RaiseErrorOccurred(ErrorCode.SingleQuotationMarkUnexpected); doctype.SystemIdentifier = string.Empty; return DoctypeSystemIdentifierSingleQuoted(base.Next, doctype); case '': RaiseErrorOccurred(ErrorCode.EOF); doctype.IsQuirksForced = true; Back(); return doctype; default: RaiseErrorOccurred(ErrorCode.DoctypeInvalidCharacter); doctype.IsQuirksForced = true; return BogusDoctype(base.Next, doctype); } } _stringBuffer.Clear(); return DoctypeBetween(base.Next, doctype); } private HtmlToken DoctypeBetween(char c, 
HtmlDoctypeToken doctype) { while (c.IsSpaceCharacter()) { c = base.Next; } switch (c) { case '>': _model = HtmlParseMode.PCData; return doctype; case '"': doctype.SystemIdentifier = string.Empty; return DoctypeSystemIdentifierDoubleQuoted(base.Next, doctype); case '\'': doctype.SystemIdentifier = string.Empty; return DoctypeSystemIdentifierSingleQuoted(base.Next, doctype); case '': RaiseErrorOccurred(ErrorCode.EOF); doctype.IsQuirksForced = true; Back(); return doctype; default: RaiseErrorOccurred(ErrorCode.DoctypeInvalidCharacter); doctype.IsQuirksForced = true; return BogusDoctype(base.Next, doctype); } } private HtmlToken DoctypeSystem(char c, HtmlDoctypeToken doctype) { if (!c.IsSpaceCharacter()) { switch (c) { case '"': RaiseErrorOccurred(ErrorCode.DoubleQuotationMarkUnexpected); doctype.SystemIdentifier = string.Empty; return DoctypeSystemIdentifierDoubleQuoted(base.Next, doctype); case '\'': RaiseErrorOccurred(ErrorCode.SingleQuotationMarkUnexpected); doctype.SystemIdentifier = string.Empty; return DoctypeSystemIdentifierSingleQuoted(base.Next, doctype); case '>': RaiseErrorOccurred(ErrorCode.TagClosedWrong); doctype.SystemIdentifier = _stringBuffer.ToString(); doctype.IsQuirksForced = true; return doctype; case '': RaiseErrorOccurred(ErrorCode.EOF); doctype.IsQuirksForced = true; Back(); return doctype; default: RaiseErrorOccurred(ErrorCode.DoctypeSystemInvalid); doctype.IsQuirksForced = true; return BogusDoctype(base.Next, doctype); } } _model = HtmlParseMode.PCData; return DoctypeSystemIdentifierBefore(base.Next, doctype); } private HtmlToken DoctypeSystemIdentifierBefore(char c, HtmlDoctypeToken doctype) { while (c.IsSpaceCharacter()) { c = base.Next; } switch (c) { case '"': doctype.SystemIdentifier = string.Empty; return DoctypeSystemIdentifierDoubleQuoted(base.Next, doctype); case '\'': doctype.SystemIdentifier = string.Empty; return DoctypeSystemIdentifierSingleQuoted(base.Next, doctype); case '>': _model = HtmlParseMode.PCData; 
RaiseErrorOccurred(ErrorCode.TagClosedWrong); doctype.IsQuirksForced = true; doctype.SystemIdentifier = _stringBuffer.ToString(); return doctype; case '': RaiseErrorOccurred(ErrorCode.EOF); doctype.IsQuirksForced = true; doctype.SystemIdentifier = _stringBuffer.ToString(); Back(); return doctype; default: RaiseErrorOccurred(ErrorCode.DoctypeInvalidCharacter); doctype.IsQuirksForced = true; return BogusDoctype(base.Next, doctype); } } private HtmlToken DoctypeSystemIdentifierDoubleQuoted(char c, HtmlDoctypeToken doctype) { while (true) { switch (c) { case '"': doctype.SystemIdentifier = _stringBuffer.ToString(); _stringBuffer.Clear(); return DoctypeSystemIdentifierAfter(base.Next, doctype); case '': RaiseErrorOccurred(ErrorCode.Null); _stringBuffer.Append('�'); break; case '>': _model = HtmlParseMode.PCData; RaiseErrorOccurred(ErrorCode.TagClosedWrong); doctype.IsQuirksForced = true; doctype.SystemIdentifier = _stringBuffer.ToString(); return doctype; case '': RaiseErrorOccurred(ErrorCode.EOF); doctype.IsQuirksForced = true; doctype.SystemIdentifier = _stringBuffer.ToString(); Back(); return doctype; default: _stringBuffer.Append(c); break; } c = base.Next; } } private HtmlToken DoctypeSystemIdentifierSingleQuoted(char c, HtmlDoctypeToken doctype) { while (true) { switch (c) { case '\'': doctype.SystemIdentifier = _stringBuffer.ToString(); _stringBuffer.Clear(); return DoctypeSystemIdentifierAfter(base.Next, doctype); case '': RaiseErrorOccurred(ErrorCode.Null); _stringBuffer.Append('�'); break; case '>': _model = HtmlParseMode.PCData; RaiseErrorOccurred(ErrorCode.TagClosedWrong); doctype.IsQuirksForced = true; doctype.SystemIdentifier = _stringBuffer.ToString(); return doctype; case '': RaiseErrorOccurred(ErrorCode.EOF); doctype.IsQuirksForced = true; doctype.SystemIdentifier = _stringBuffer.ToString(); Back(); return doctype; default: _stringBuffer.Append(c); break; } c = base.Next; } } private HtmlToken DoctypeSystemIdentifierAfter(char c, HtmlDoctypeToken 
doctype) { while (c.IsSpaceCharacter()) { c = base.Next; } switch (c) { case '>': _model = HtmlParseMode.PCData; return doctype; case '': RaiseErrorOccurred(ErrorCode.EOF); doctype.IsQuirksForced = true; Back(); return doctype; default: RaiseErrorOccurred(ErrorCode.DoctypeInvalidCharacter); return BogusDoctype(base.Next, doctype); } } private HtmlToken BogusDoctype(char c, HtmlDoctypeToken doctype) { while (true) { switch (c) { case '': Back(); return doctype; case '>': _model = HtmlParseMode.PCData; return doctype; } c = base.Next; } } private HtmlToken AttributeBeforeName(char c, HtmlTagToken tag) { while (c.IsSpaceCharacter()) { c = base.Next; } switch (c) { case '/': return TagSelfClosing(base.Next, tag); case '>': return EmitTag(tag); default: if (!c.IsUppercaseAscii()) { switch (c) { case '': RaiseErrorOccurred(ErrorCode.Null); _stringBuffer.Clear(); _stringBuffer.Append('�'); return AttributeName(base.Next, tag); case '"': case '\'': case '<': case '=': RaiseErrorOccurred(ErrorCode.AttributeNameInvalid); _stringBuffer.Clear(); _stringBuffer.Append(c); return AttributeName(base.Next, tag); case '': return HtmlToken.EOF; default: _stringBuffer.Clear(); _stringBuffer.Append(c); return AttributeName(base.Next, tag); } } _stringBuffer.Clear(); _stringBuffer.Append(char.ToLower(c)); return AttributeName(base.Next, tag); } } private HtmlToken AttributeName(char c, HtmlTagToken tag) { while (!c.IsSpaceCharacter()) { switch (c) { case '/': tag.AddAttribute(_stringBuffer.ToString()); return TagSelfClosing(base.Next, tag); case '=': tag.AddAttribute(_stringBuffer.ToString()); return AttributeBeforeValue(base.Next, tag); case '>': tag.AddAttribute(_stringBuffer.ToString()); return EmitTag(tag); case '': return HtmlToken.EOF; case '': RaiseErrorOccurred(ErrorCode.Null); _stringBuffer.Append('�'); break; default: if (c.IsUppercaseAscii()) _stringBuffer.Append(char.ToLower(c)); else if (c == '"' || c == '\'' || c == '<') { 
RaiseErrorOccurred(ErrorCode.AttributeNameInvalid); _stringBuffer.Append(c); } else { _stringBuffer.Append(c); } break; } c = base.Next; } tag.AddAttribute(_stringBuffer.ToString()); return AttributeAfterName(base.Next, tag); } private HtmlToken AttributeAfterName(char c, HtmlTagToken tag) { while (c.IsSpaceCharacter()) { c = base.Next; } switch (c) { case '/': return TagSelfClosing(base.Next, tag); case '=': return AttributeBeforeValue(base.Next, tag); case '>': return EmitTag(tag); default: if (!c.IsUppercaseAscii()) { switch (c) { case '': RaiseErrorOccurred(ErrorCode.Null); _stringBuffer.Clear(); _stringBuffer.Append('�'); return AttributeName(base.Next, tag); case '"': case '\'': case '<': RaiseErrorOccurred(ErrorCode.AttributeNameInvalid); _stringBuffer.Clear(); _stringBuffer.Append(c); return AttributeName(base.Next, tag); case '': return HtmlToken.EOF; default: _stringBuffer.Clear(); _stringBuffer.Append(c); return AttributeName(base.Next, tag); } } _stringBuffer.Clear(); _stringBuffer.Append(char.ToLower(c)); return AttributeName(base.Next, tag); } } private HtmlToken AttributeBeforeValue(char c, HtmlTagToken tag) { while (c.IsSpaceCharacter()) { c = base.Next; } switch (c) { case '"': _stringBuffer.Clear(); return AttributeDoubleQuotedValue(base.Next, tag); case '&': _stringBuffer.Clear(); return AttributeUnquotedValue(c, tag); case '\'': _stringBuffer.Clear(); return AttributeSingleQuotedValue(base.Next, tag); case '': RaiseErrorOccurred(ErrorCode.Null); _stringBuffer.Append('�'); return AttributeUnquotedValue(base.Next, tag); case '>': RaiseErrorOccurred(ErrorCode.TagClosedWrong); return EmitTag(tag); case '<': case '=': case '`': RaiseErrorOccurred(ErrorCode.AttributeValueInvalid); _stringBuffer.Clear().Append(c); return AttributeUnquotedValue(base.Next, tag); case '': return HtmlToken.EOF; default: _stringBuffer.Clear().Append(c); return AttributeUnquotedValue(base.Next, tag); } } private HtmlToken AttributeDoubleQuotedValue(char c, HtmlTagToken tag) 
{
            // Body of the method whose signature precedes this chunk. It collects an
            // attribute value into _stringBuffer until a closing '"' is read.
            //
            // NOTE(review): throughout this file the two `case ''` labels show up with
            // empty character literals. The one that raises ErrorCode.Null is presumably
            // the NUL character and the one that returns HtmlToken.EOF is presumably the
            // end-of-input sentinel - restore the original literals before compiling.
            while (true)
            {
                switch (c)
                {
                    case '"':
                        tag.SetAttributeValue(_stringBuffer.ToString());
                        return AttributeAfterValue(base.Next, tag);
                    case '&':
                    {
                        string text = CharacterReference(base.Next, '"');
                        if (text == null)
                        {
                            _stringBuffer.Append('&');
                        }
                        else
                        {
                            _stringBuffer.Append(text);
                        }
                        break;
                    }
                    case '':
                        RaiseErrorOccurred(ErrorCode.Null);
                        _stringBuffer.Append('�');
                        break;
                    case '':
                        return HtmlToken.EOF;
                    default:
                        _stringBuffer.Append(c);
                        break;
                }

                c = base.Next;
            }
        }

        /// <summary>
        /// Collects a single-quoted attribute value into <c>_stringBuffer</c> until the
        /// closing apostrophe is read, then stores it on the tag.
        /// </summary>
        /// <param name="c">The current character.</param>
        /// <param name="tag">The tag token receiving the attribute value.</param>
        /// <returns>The token produced by the follow-up state, or <c>HtmlToken.EOF</c>.</returns>
        private HtmlToken AttributeSingleQuotedValue(char c, HtmlTagToken tag)
        {
            while (true)
            {
                switch (c)
                {
                    case '\'':
                        tag.SetAttributeValue(_stringBuffer.ToString());
                        return AttributeAfterValue(base.Next, tag);
                    case '&':
                    {
                        // A null result means no reference was resolved; keep the literal '&'.
                        string text = CharacterReference(base.Next, '\'');
                        if (text == null)
                        {
                            _stringBuffer.Append('&');
                        }
                        else
                        {
                            _stringBuffer.Append(text);
                        }
                        break;
                    }
                    case '':
                        RaiseErrorOccurred(ErrorCode.Null);
                        _stringBuffer.Append('�');
                        break;
                    case '':
                        return HtmlToken.EOF;
                    default:
                        _stringBuffer.Append(c);
                        break;
                }

                c = base.Next;
            }
        }

        /// <summary>
        /// Collects an unquoted attribute value. The value ends at a space character
        /// (continue with the next attribute) or at '&gt;' (emit the tag).
        /// </summary>
        /// <param name="c">The current character.</param>
        /// <param name="tag">The tag token receiving the attribute value.</param>
        /// <returns>The emitted tag, the token of the follow-up state, or <c>HtmlToken.EOF</c>.</returns>
        private HtmlToken AttributeUnquotedValue(char c, HtmlTagToken tag)
        {
            while (!c.IsSpaceCharacter())
            {
                switch (c)
                {
                    case '&':
                    {
                        // A null result means no reference was resolved; keep the literal '&'.
                        string text = CharacterReference(base.Next, '>');
                        if (text == null)
                        {
                            _stringBuffer.Append('&');
                        }
                        else
                        {
                            _stringBuffer.Append(text);
                        }
                        break;
                    }
                    case '>':
                        tag.SetAttributeValue(_stringBuffer.ToString());
                        return EmitTag(tag);
                    case '':
                        RaiseErrorOccurred(ErrorCode.Null);
                        _stringBuffer.Append('�');
                        break;
                    case '"':
                    case '\'':
                    case '<':
                    case '=':
                    case '`':
                        // Reported as an error, but the character still becomes part of the value.
                        RaiseErrorOccurred(ErrorCode.AttributeValueInvalid);
                        _stringBuffer.Append(c);
                        break;
                    case '':
                        return HtmlToken.EOF;
                    default:
                        _stringBuffer.Append(c);
                        break;
                }

                c = base.Next;
            }

            tag.SetAttributeValue(_stringBuffer.ToString());
            return AttributeBeforeName(base.Next, tag);
        }

        /// <summary>
        /// Handles the character directly following a quoted attribute value.
        /// </summary>
        /// <param name="c">The current character.</param>
        /// <param name="tag">The tag token being built.</param>
        /// <returns>The emitted tag, the token of the follow-up state, or <c>HtmlToken.EOF</c>.</returns>
        private HtmlToken AttributeAfterValue(char c, HtmlTagToken tag)
        {
            if (!c.IsSpaceCharacter())
            {
                switch (c)
                {
                    case '/':
                        return TagSelfClosing(base.Next, tag);
                    case '>':
                        return EmitTag(tag);
                    case '':
                        return HtmlToken.EOF;
                    default:
                        // Anything else is reported and then re-processed (same character)
                        // as the start of the next attribute.
                        RaiseErrorOccurred(ErrorCode.AttributeNameExpected);
                        return AttributeBeforeName(c, tag);
                }
            }

            return AttributeBeforeName(base.Next, tag);
        }

        /// <summary>
        /// Consumes script content into <c>_buffer</c> until a '&lt;' or the end of input.
        /// </summary>
        /// <param name="c">The current character.</param>
        /// <returns>The token produced by a follow-up state, or <c>HtmlToken.EOF</c>.</returns>
        private HtmlToken ScriptData(char c)
        {
            while (true)
            {
                switch (c)
                {
                    case '<':
                        return ScriptDataLT(base.Next);
                    case '':
                        RaiseErrorOccurred(ErrorCode.Null);
                        _buffer.Append('�');
                        break;
                    case '':
                        return HtmlToken.EOF;
                    default:
                        _buffer.Append(c);
                        break;
                }

                c = base.Next;
            }
        }

        /// <summary>
        /// Handles the character after a '&lt;' inside script content.
        /// </summary>
        /// <param name="c">The character following the '&lt;'.</param>
        /// <returns>The token produced by the follow-up state.</returns>
        private HtmlToken ScriptDataLT(char c)
        {
            switch (c)
            {
                case '/':
                    return ScriptDataEndTag(base.Next);
                case '!':
                    _buffer.Append('<').Append('!');
                    return ScriptDataStartEscape(base.Next);
                default:
                    // Not a tag or escape start: the '<' is ordinary script text.
                    _buffer.Append('<');
                    return ScriptData(c);
            }
        }

        /// <summary>
        /// Handles the character after "&lt;/" inside script content. A letter starts a
        /// candidate end tag name; anything else turns "&lt;/" into ordinary text.
        /// </summary>
        /// <param name="c">The character following "&lt;/".</param>
        /// <returns>The token produced by the follow-up state.</returns>
        private HtmlToken ScriptDataEndTag(char c)
        {
            if (c.IsLetter())
            {
                _stringBuffer.Clear();
                _stringBuffer.Append(c);
                return ScriptDataNameEndTag(base.Next, HtmlToken.CloseTag());
            }

            _buffer.Append('<').Append('/');
            return ScriptData(c);
        }

        /// <summary>
        /// Accumulates a candidate end tag name in <c>_stringBuffer</c>. The end tag is only
        /// accepted when the lower-cased name equals <c>_lastStartTag</c>; otherwise the
        /// consumed text is written to <c>_buffer</c> and script content continues.
        /// </summary>
        /// <param name="c">The current character.</param>
        /// <param name="tag">The end tag token being built.</param>
        /// <returns>The emitted tag or the token produced by the follow-up state.</returns>
        private HtmlToken ScriptDataNameEndTag(char c, HtmlTagToken tag)
        {
            // Invariant casing: the culture-sensitive ToLower() maps 'I' to a dotless 'i'
            // under Turkish cultures, which would make the ordinal comparison below fail.
            string text = _stringBuffer.ToString().ToLowerInvariant();
            bool flag = text == _lastStartTag;

            if (flag && c.IsSpaceCharacter())
            {
                tag.Name = text;
                return AttributeBeforeName(base.Next, tag);
            }

            if (flag && c == '/')
            {
                tag.Name = text;
                return TagSelfClosing(base.Next, tag);
            }

            if (flag && c == '>')
            {
                tag.Name = text;
                return EmitTag(tag);
            }

            if (c.IsLetter())
            {
                _stringBuffer.Append(c);
                return ScriptDataNameEndTag(base.Next, tag);
            }

            // Not an appropriate end tag: replay "</" plus the collected name as text.
            _buffer.Append('<').Append('/');
            _buffer.Append(_stringBuffer.ToString());
            return ScriptData(c);
        }

        /// <summary>
        /// Handles the character after "&lt;!" inside script content.
        /// </summary>
        /// <param name="c">The character following "&lt;!".</param>
        /// <returns>The token produced by the follow-up state.</returns>
        private HtmlToken ScriptDataStartEscape(char c)
        {
            if (c == '-')
            {
                _buffer.Append('-');
                return ScriptDataStartEscapeDash(base.Next);
            }

            return ScriptData(c);
        }

        /// <summary>
        /// Handles one character of escaped script content.
        /// </summary>
        /// <param name="c">The current character.</param>
        /// <returns>The token produced by the follow-up state, or <c>HtmlToken.EOF</c>.</returns>
        private HtmlToken ScriptDataEscaped(char c)
        {
            switch (c)
            {
                case '-':
                    _buffer.Append('-');
                    return ScriptDataEscapedDash(base.Next);
                case '<':
                    return ScriptDataEscapedLT(base.Next);
                case '':
                    RaiseErrorOccurred(ErrorCode.Null);
                    _buffer.Append('�');
                    return ScriptDataEscaped(base.Next);
                case '':
                    return HtmlToken.EOF;
                default:
                    // NOTE(review): any other character is handed to ScriptData, i.e. the
                    // escaped handling is left here. The HTML tokenization algorithm's
                    // "script data escaped state" appears to stay in the escaped state
                    // instead - confirm whether this shortcut is intentional.
                    return ScriptData(c);
            }
        }

        /// <summary>
        /// Handles the character after "&lt;!-" inside script content.
        /// </summary>
        /// <param name="c">The character following "&lt;!-".</param>
        /// <returns>The token produced by the follow-up state.</returns>
        private HtmlToken ScriptDataStartEscapeDash(char c)
        {
            if (c == '-')
            {
                _buffer.Append('-');
                return ScriptDataEscapedDashDash(base.Next);
            }

            return ScriptData(c);
        }

        /// <summary>
        /// Handles the character after a single '-' in escaped script content.
        /// </summary>
        /// <param name="c">The current character.</param>
        /// <returns>The token produced by the follow-up state, or <c>HtmlToken.EOF</c>.</returns>
        private HtmlToken ScriptDataEscapedDash(char c)
        {
            switch (c)
            {
                case '-':
                    _buffer.Append('-');
                    return ScriptDataEscapedDashDash(base.Next);
                case '<':
                    return ScriptDataEscapedLT(base.Next);
                case '':
                    RaiseErrorOccurred(ErrorCode.Null);
                    _buffer.Append('�');
                    return ScriptDataEscaped(base.Next);
                case '':
                    return HtmlToken.EOF;
                default:
                    _buffer.Append(c);
                    return ScriptDataEscaped(base.Next);
            }
        }

        /// <summary>
        /// Handles the character after "--" in escaped script content. A '&gt;' ends the
        /// escaped section and returns to <see cref="ScriptData"/>.
        /// </summary>
        /// <param name="c">The current character.</param>
        /// <returns>The token produced by the follow-up state, or <c>HtmlToken.EOF</c>.</returns>
        private HtmlToken ScriptDataEscapedDashDash(char c)
        {
            switch (c)
            {
                case '-':
                    _buffer.Append('-');
                    return ScriptDataEscapedDashDash(base.Next);
                case '<':
                    return ScriptDataEscapedLT(base.Next);
                case '>':
                    _buffer.Append('>');
                    return ScriptData(base.Next);
                case '':
                    RaiseErrorOccurred(ErrorCode.Null);
                    _buffer.Append('�');
                    return ScriptDataEscaped(base.Next);
                case '':
                    return HtmlToken.EOF;
                default:
                    _buffer.Append(c);
                    return ScriptDataEscaped(base.Next);
            }
        }

        /// <summary>
        /// Handles the character after a '&lt;' in escaped script content.
        /// </summary>
        /// <param name="c">The character following the '&lt;'.</param>
        /// <returns>The token produced by the follow-up state.</returns>
        private HtmlToken ScriptDataEscapedLT(char c)
        {
            if (c == '/')
            {
                // NOTE(review): this goes to ScriptDataEndTag, whose fallback is ScriptData;
                // ScriptDataEscapedEndTag is not called from any code visible in this
                // chunk - confirm which of the two was intended here.
                return ScriptDataEndTag(base.Next);
            }

            if (c.IsLetter())
            {
                // Start collecting a possible nested "script" name; the text is echoed to
                // _buffer as it is read.
                _stringBuffer.Clear();
                _stringBuffer.Append(c);
                _buffer.Append('<');
                _buffer.Append(c);
                return ScriptDataStartDoubleEscape(base.Next);
            }

            _buffer.Append('<');
            return ScriptDataEscaped(c);
        }

        /// <summary>
        /// Handles the character after "&lt;/" in escaped script content. A letter starts a
        /// candidate end tag name; anything else turns "&lt;/" into ordinary text.
        /// </summary>
        /// <param name="c">The character following "&lt;/".</param>
        /// <param name="tag">The end tag token being built.</param>
        /// <returns>The token produced by the follow-up state.</returns>
        private HtmlToken ScriptDataEscapedEndTag(char c, HtmlTagToken tag)
        {
            if (c.IsLetter())
            {
                _stringBuffer.Clear();
                _stringBuffer.Append(c);

                // Hand over to the name state (same shape as ScriptDataEndTag ->
                // ScriptDataNameEndTag). Re-entering this method instead would clear the
                // buffer on every letter, so the name could never accumulate.
                return ScriptDataEscapedNameTag(base.Next, tag);
            }

            _buffer.Append('<').Append('/');
            return ScriptDataEscaped(c);
        }

        /// <summary>
        /// Accumulates a candidate end tag name in <c>_stringBuffer</c> while in escaped
        /// script content. The end tag is only accepted when the lower-cased name equals
        /// <c>_lastStartTag</c>; otherwise the consumed text is written to <c>_buffer</c>.
        /// </summary>
        /// <param name="c">The current character.</param>
        /// <param name="tag">The end tag token being built.</param>
        /// <returns>The emitted tag or the token produced by the follow-up state.</returns>
        private HtmlToken ScriptDataEscapedNameTag(char c, HtmlTagToken tag)
        {
            // Invariant casing: the culture-sensitive ToLower() maps 'I' to a dotless 'i'
            // under Turkish cultures, which would make the ordinal comparison below fail.
            string text = _stringBuffer.ToString().ToLowerInvariant();
            bool flag = text == _lastStartTag;

            if (flag && c.IsSpaceCharacter())
            {
                tag.Name = text;
                return AttributeBeforeName(base.Next, tag);
            }

            if (flag && c == '/')
            {
                tag.Name = text;
                return TagSelfClosing(base.Next, tag);
            }

            if (flag && c == '>')
            {
                tag.Name = text;
                return EmitTag(tag);
            }

            if (c.IsLetter())
            {
                _stringBuffer.Append(c);
                return ScriptDataEscapedNameTag(base.Next, tag);
            }

            // Not an appropriate end tag: replay "</" plus the collected name as text.
            _buffer.Append('<').Append('/');
            _buffer.Append(_stringBuffer.ToString());
            return ScriptDataEscaped(c);
        }

        /// <summary>
        /// Collects letters after a '&lt;' in escaped script content. When the name is
        /// terminated by a space character, '/' or '&gt;' and equals "script" (ignoring
        /// case), the double-escaped handling is entered.
        /// </summary>
        /// <param name="c">The current character.</param>
        /// <returns>The token produced by the follow-up state.</returns>
        private HtmlToken ScriptDataStartDoubleEscape(char c)
        {
            if (c.IsSpaceCharacter() || c == '/' || c == '>')
            {
                _buffer.Append(c);

                if (string.Compare(_stringBuffer.ToString(), "script", StringComparison.OrdinalIgnoreCase) == 0)
                {
                    return ScriptDataEscapedDouble(base.Next);
                }

                return ScriptDataEscaped(base.Next);
            }

            if (c.IsLetter())
            {
                _stringBuffer.Append(c);
                _buffer.Append(c);
                return ScriptDataStartDoubleEscape(base.Next);
            }

            return ScriptDataEscaped(c);
        }

        /// <summary>
        /// Consumes double-escaped script content into <c>_buffer</c> until a '-', a '&lt;'
        /// or the end of input.
        /// </summary>
        /// <param name="c">The current character.</param>
        /// <returns>The token produced by the follow-up state, or <c>HtmlToken.EOF</c>.</returns>
        private HtmlToken ScriptDataEscapedDouble(char c)
        {
            // Implemented as a loop (like ScriptData) so that a long run of ordinary
            // characters does not add one stack frame per character.
            while (true)
            {
                switch (c)
                {
                    case '-':
                        _buffer.Append('-');
                        return ScriptDataEscapedDoubleDash(base.Next);
                    case '<':
                        _buffer.Append('<');
                        return ScriptDataEscapedDoubleLT(base.Next);
                    case '':
                        // Only the replacement character is emitted; the offending
                        // character itself must not be appended as well.
                        RaiseErrorOccurred(ErrorCode.Null);
                        _buffer.Append('�');
                        break;
                    case '':
                        RaiseErrorOccurred(ErrorCode.EOF);
                        return HtmlToken.EOF;
                    default:
                        _buffer.Append(c);
                        break;
                }

                c = base.Next;
            }
        }

        /// <summary>
        /// Handles the character after a single '-' in double-escaped script content.
        /// </summary>
        /// <param name="c">The current character.</param>
        /// <returns>The token produced by the follow-up state, or <c>HtmlToken.EOF</c>.</returns>
        private HtmlToken ScriptDataEscapedDoubleDash(char c)
        {
            switch (c)
            {
                case '-':
                    _buffer.Append('-');
                    return ScriptDataEscapedDoubleDashDash(base.Next);
                case '<':
                    _buffer.Append('<');
                    return ScriptDataEscapedDoubleLT(base.Next);
                case '':
                    RaiseErrorOccurred(ErrorCode.Null);
                    _buffer.Append('�');
                    return ScriptDataEscapedDouble(base.Next);
                case '':
                    RaiseErrorOccurred(ErrorCode.EOF);
                    return HtmlToken.EOF;
                default:
                    _buffer.Append(c);
                    return ScriptDataEscapedDouble(base.Next);
            }
        }

        /// <summary>
        /// Handles the character after "--" in double-escaped script content. A '&gt;'
        /// returns to <see cref="ScriptData"/>.
        /// </summary>
        /// <param name="c">The current character.</param>
        /// <returns>The token produced by the follow-up state, or <c>HtmlToken.EOF</c>.</returns>
        private HtmlToken ScriptDataEscapedDoubleDashDash(char c)
        {
            switch (c)
            {
                case '-':
                    _buffer.Append('-');
                    return ScriptDataEscapedDoubleDashDash(base.Next);
                case '<':
                    _buffer.Append('<');
                    return ScriptDataEscapedDoubleLT(base.Next);
                case '>':
                    _buffer.Append('>');
                    return ScriptData(base.Next);
                case '':
                    RaiseErrorOccurred(ErrorCode.Null);
                    _buffer.Append('�');
                    return ScriptDataEscapedDouble(base.Next);
                case '':
                    RaiseErrorOccurred(ErrorCode.EOF);
                    return HtmlToken.EOF;
                default:
                    _buffer.Append(c);
                    return ScriptDataEscapedDouble(base.Next);
            }
        }

        /// <summary>
        /// Handles the character after a '&lt;' in double-escaped script content. The
        /// '&lt;' itself has already been appended to <c>_buffer</c> by the caller.
        /// </summary>
        /// <param name="c">The character following the '&lt;'.</param>
        /// <returns>The token produced by the follow-up state.</returns>
        private HtmlToken ScriptDataEscapedDoubleLT(char c)
        {
            if (c == '/')
            {
                _stringBuffer.Clear();
                _buffer.Append('/');
                return ScriptDataEndDoubleEscape(base.Next);
            }

            return ScriptDataEscapedDouble(c);
        }

        /// <summary>
        /// Collects letters after "&lt;/" in double-escaped script content. When the name
        /// is terminated by a space character, '/' or '&gt;' and equals "script" (ignoring
        /// case), handling drops back to <see cref="ScriptDataEscaped"/>.
        /// </summary>
        /// <param name="c">The current character.</param>
        /// <returns>The token produced by the follow-up state.</returns>
        private HtmlToken ScriptDataEndDoubleEscape(char c)
        {
            if (c.IsSpaceCharacter() || c == '/' || c == '>')
            {
                _buffer.Append(c);

                if (string.Compare(_stringBuffer.ToString(), "script", StringComparison.OrdinalIgnoreCase) == 0)
                {
                    return ScriptDataEscaped(base.Next);
                }

                return ScriptDataEscapedDouble(base.Next);
            }

            if (c.IsLetter())
            {
                _stringBuffer.Append(c);
                _buffer.Append(c);
                return ScriptDataEndDoubleEscape(base.Next);
            }

            return ScriptDataEscapedDouble(c);
        }

        /// <summary>
        /// Finalizes a tag token and resets the tokenizer mode to
        /// <c>HtmlParseMode.PCData</c>. For start tags, later attributes that repeat an
        /// earlier key are removed (each removal is reported) and the tag name is stored
        /// in <c>_lastStartTag</c>. For any other tag type, a self-closing flag or the
        /// presence of attributes is reported as an error.
        /// </summary>
        /// <param name="tag">The tag token to finalize.</param>
        /// <returns>The same <paramref name="tag"/> instance.</returns>
        private HtmlTagToken EmitTag(HtmlTagToken tag)
        {
            _model = HtmlParseMode.PCData;

            if (tag.Type == HtmlTokenType.StartTag)
            {
                // Walk backwards so RemoveAt(num) never shifts an index that is still to
                // be visited; the earliest occurrence of each key is the one that stays.
                for (int num = tag.Attributes.Count - 1; num > 0; num--)
                {
                    for (int num2 = num - 1; num2 >= 0; num2--)
                    {
                        if (tag.Attributes[num2].Key == tag.Attributes[num].Key)
                        {
                            tag.Attributes.RemoveAt(num);
                            RaiseErrorOccurred(ErrorCode.AttributeDuplicateOmitted);
                            break;
                        }
                    }
                }

                _lastStartTag = tag.Name;
            }
            else
            {
                if (tag.IsSelfClosing)
                {
                    RaiseErrorOccurred(ErrorCode.EndTagCannotBeSelfClosed);
                }

                if (tag.Attributes.Count != 0)
                {
                    RaiseErrorOccurred(ErrorCode.EndTagCannotHaveAttributes);
                }
            }

            return tag;
        }
    }
}