AngleSharp by Florian Rappl

<PackageReference Include="AngleSharp" Version="0.7.0" />

 HtmlTokenizer

sealed class HtmlTokenizer : BaseTokenizer
Performs the tokenization of the source code. Follows the tokenization algorithm at: http://www.w3.org/html/wg/drafts/html/master/syntax.html
using AngleSharp.Extensions;
using AngleSharp.Html;
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Text;

namespace AngleSharp.Parser.Html
{
    // NOTE(review): several character literals in this listing were empty ('') because a
    // non-printable character had been lost. Literals that belong to ErrorCode.Null handling
    // are written as '\0' below (consistent with the `allowedCharacter != 0` check in
    // CharacterReference). The remaining empty literals are the text source's end-of-file
    // sentinel, whose value is not visible in this file; they are marked "EOF sentinel" and
    // must be restored from the original sources before this file compiles.

    /// <summary>
    /// Performs the tokenization of the source code. Follows the tokenization algorithm at:
    /// http://www.w3.org/html/wg/drafts/html/master/syntax.html
    /// </summary>
    [DebuggerStepThrough]
    internal sealed class HtmlTokenizer : BaseTokenizer
    {
        // Character data collected while scanning; flushed as a character token by Get().
        private readonly StringBuilder _buffer;
        private bool _acceptsCharacterData;
        // Compared against end tag names in the RCData / Rawtext states. It is not assigned
        // anywhere in this part of the file.
        private string _lastStartTag;
        private HtmlParseMode _state;
        // Token held back by Get() while pending character data is returned first.
        private HtmlToken _buffered;

        /// <summary>
        /// Gets or sets whether a "[CDATA[" markup declaration is tokenized as character data
        /// (see MarkupDeclaration).
        /// </summary>
        public bool IsAcceptingCharacterData
        {
            get { return _acceptsCharacterData; }
            set { _acceptsCharacterData = value; }
        }

        /// <summary>
        /// Gets or sets the parse mode that selects the entry state used by <see cref="Get"/>.
        /// </summary>
        public HtmlParseMode State
        {
            get { return _state; }
            set { _state = value; }
        }

        /// <summary>
        /// Creates a tokenizer over the given source, starting in PCData mode with CDATA
        /// sections disabled.
        /// </summary>
        /// <param name="source">The text source handed to the base tokenizer.</param>
        public HtmlTokenizer(ITextSource source)
            : base(source)
        {
            _state = HtmlParseMode.PCData;
            _acceptsCharacterData = false;
            _buffer = new StringBuilder();
        }

        /// <summary>
        /// Returns the next token. A token held back by the previous call is returned first.
        /// When scanning produced character data, that data is returned as a character token
        /// and the token that ended it is held back for the following call.
        /// </summary>
        /// <returns>The next token, or <c>HtmlToken.EOF</c> once the source has ended.</returns>
        public HtmlToken Get()
        {
            HtmlToken token = _buffered;

            if (token != null)
            {
                _buffered = null;
                return token;
            }

            char current = base.Next;

            if (base.IsEnded)
            {
                return HtmlToken.EOF;
            }

            switch (_state)
            {
                case HtmlParseMode.PCData:
                    token = Data(current);
                    break;
                case HtmlParseMode.RCData:
                    token = RCData(current);
                    break;
                case HtmlParseMode.Plaintext:
                    token = Plaintext(current);
                    break;
                case HtmlParseMode.Rawtext:
                    token = Rawtext(current);
                    break;
                case HtmlParseMode.Script:
                    token = ScriptData(current);
                    break;
            }

            if (_buffer.Length > 0)
            {
                // Text comes first; the token that terminated it is delivered next time.
                _buffered = token;
                token = HtmlToken.Character(_buffer.ToString());
                _buffer.Clear();
            }

            return token;
        }

        /// <summary>
        /// Disposes the base tokenizer and hands the character buffer to the pool.
        /// </summary>
        public override void Dispose()
        {
            base.Dispose();
            _buffer.ToPool();
        }

        /// <summary>
        /// Plaintext state: buffers every character until the end of the source.
        /// </summary>
        private HtmlToken Plaintext(char c)
        {
            while (true)
            {
                switch (c)
                {
                    case '\0':
                        RaiseErrorOccurred(ErrorCode.Null);
                        _buffer.Append('�');
                        break;
                    case '': // EOF sentinel (see class note)
                        return HtmlToken.EOF;
                    default:
                        _buffer.Append(c);
                        break;
                }

                c = base.Next;
            }
        }

        /// <summary>
        /// Data state: buffers text, resolves character references and switches to tag
        /// handling on '&lt;'.
        /// </summary>
        private HtmlToken Data(char c)
        {
            while (true)
            {
                switch (c)
                {
                    case '&':
                    {
                        string reference = CharacterReference(base.Next, '\0');

                        if (reference == null)
                        {
                            _buffer.Append('&');
                        }
                        else
                        {
                            _buffer.Append(reference);
                        }

                        break;
                    }
                    case '<':
                        return TagOpen();
                    case '\0':
                        // The NUL is reported and dropped. Continuing the loop replaces the
                        // former self-recursion, which deepened the stack per NUL character.
                        RaiseErrorOccurred(ErrorCode.Null);
                        break;
                    case '': // EOF sentinel (see class note)
                        return HtmlToken.EOF;
                    default:
                        _buffer.Append(c);
                        break;
                }

                c = base.Next;
            }
        }

        /// <summary>
        /// RCData state: like Data, but '&lt;' is only checked for a matching end tag and
        /// NUL is replaced instead of dropped.
        /// </summary>
        private HtmlToken RCData(char c)
        {
            while (true)
            {
                switch (c)
                {
                    case '&':
                    {
                        string reference = CharacterReference(base.Next, '\0');

                        if (reference == null)
                        {
                            _buffer.Append('&');
                        }
                        else
                        {
                            _buffer.Append(reference);
                        }

                        break;
                    }
                    case '<':
                        return RCDataLT();
                    case '\0':
                        RaiseErrorOccurred(ErrorCode.Null);
                        _buffer.Append('�');
                        break;
                    case '': // EOF sentinel (see class note)
                        return HtmlToken.EOF;
                    default:
                        _buffer.Append(c);
                        break;
                }

                c = base.Next;
            }
        }

        /// <summary>
        /// Handles '&lt;' inside RCData: only "&lt;/" can start an end tag.
        /// </summary>
        private HtmlToken RCDataLT()
        {
            TextPosition start = GetCurrentPosition();
            char next = base.Next;

            if (next == '/')
            {
                _stringBuffer.Clear();
                return RCDataEndTag(start);
            }

            _buffer.Append('<');
            return RCData(next);
        }

        /// <summary>
        /// Handles "&lt;/" inside RCData: an ASCII letter starts an end tag name, anything
        /// else turns "&lt;/" back into text.
        /// </summary>
        private HtmlToken RCDataEndTag(TextPosition position)
        {
            char next = base.Next;

            if (!next.IsUppercaseAscii() && !next.IsLowercaseAscii())
            {
                _buffer.Append('<').Append('/');
                return RCData(next);
            }

            // Invariant lower-casing keeps ASCII names stable under every culture.
            _stringBuffer.Clear().Append(char.ToLowerInvariant(next));

            HtmlTagToken tag = HtmlToken.CloseTag();
            tag.Start = position;
            return RCDataNameEndTag(tag);
        }

        /// <summary>
        /// Reads an end tag name inside RCData. The tag is only accepted when its name equals
        /// the last start tag; otherwise everything read so far becomes text.
        /// </summary>
        private HtmlToken RCDataNameEndTag(HtmlTagToken tag)
        {
            char next;

            while (true)
            {
                next = base.Next;
                string name = _stringBuffer.ToString();

                if (name == _lastStartTag)
                {
                    if (next.IsSpaceCharacter())
                    {
                        tag.Name = name;
                        return AttributeBeforeName(tag);
                    }

                    if (next == '/')
                    {
                        tag.Name = name;
                        return TagSelfClosing(tag);
                    }

                    if (next == '>')
                    {
                        tag.Name = name;
                        return EmitTag(tag);
                    }
                }

                if (!next.IsUppercaseAscii() && !next.IsLowercaseAscii())
                {
                    break;
                }

                _stringBuffer.Append(char.ToLowerInvariant(next));
            }

            _buffer.Append('<').Append('/').Append(_stringBuffer.ToString());
            return RCData(next);
        }

        /// <summary>
        /// Rawtext state: buffers text without resolving character references.
        /// </summary>
        private HtmlToken Rawtext(char c)
        {
            while (true)
            {
                switch (c)
                {
                    case '<':
                        return RawtextLT();
                    case '\0':
                        RaiseErrorOccurred(ErrorCode.Null);
                        _buffer.Append('�');
                        break;
                    case '': // EOF sentinel (see class note)
                        return HtmlToken.EOF;
                    default:
                        _buffer.Append(c);
                        break;
                }

                c = base.Next;
            }
        }

        /// <summary>
        /// Handles '&lt;' inside Rawtext: only "&lt;/" can start an end tag.
        /// </summary>
        private HtmlToken RawtextLT()
        {
            TextPosition start = GetCurrentPosition();
            char next = base.Next;

            if (next == '/')
            {
                _stringBuffer.Clear();
                return RawtextEndTag(start);
            }

            _buffer.Append('<');
            return Rawtext(next);
        }

        /// <summary>
        /// Handles "&lt;/" inside Rawtext: an ASCII letter starts an end tag name, anything
        /// else turns "&lt;/" back into text.
        /// </summary>
        private HtmlToken RawtextEndTag(TextPosition position)
        {
            char next = base.Next;

            if (!next.IsUppercaseAscii() && !next.IsLowercaseAscii())
            {
                _buffer.Append('<').Append('/');
                return Rawtext(next);
            }

            _stringBuffer.Clear().Append(char.ToLowerInvariant(next));

            HtmlTagToken tag = HtmlToken.CloseTag();
            tag.Start = position;
            return RawtextNameEndTag(tag);
        }

        /// <summary>
        /// Reads an end tag name inside Rawtext. The tag is only accepted when its name equals
        /// the last start tag; otherwise everything read so far becomes text.
        /// </summary>
        private HtmlToken RawtextNameEndTag(HtmlTagToken tag)
        {
            char next;

            while (true)
            {
                next = base.Next;
                string name = _stringBuffer.ToString();

                if (name == _lastStartTag)
                {
                    if (next.IsSpaceCharacter())
                    {
                        tag.Name = name;
                        return AttributeBeforeName(tag);
                    }

                    if (next == '/')
                    {
                        tag.Name = name;
                        return TagSelfClosing(tag);
                    }

                    if (next == '>')
                    {
                        tag.Name = name;
                        return EmitTag(tag);
                    }
                }

                if (!next.IsUppercaseAscii() && !next.IsLowercaseAscii())
                {
                    break;
                }

                _stringBuffer.Append(char.ToLowerInvariant(next));
            }

            _buffer.Append('<').Append('/').Append(_stringBuffer.ToString());
            return Rawtext(next);
        }

        /// <summary>
        /// Reads a CDATA section up to "]]&gt;" (or the end of the source) and returns its
        /// content as a character token.
        /// </summary>
        private HtmlToken CData()
        {
            char c = base.Next;
            _stringBuffer.Clear();

            while (true)
            {
                if (c == '') // EOF sentinel (see class note)
                {
                    Back();
                    break;
                }

                if (c == ']' && ContinuesWith("]]>", true))
                {
                    Advance(2);
                    break;
                }

                _stringBuffer.Append(c);
                c = base.Next;
            }

            return HtmlToken.Character(_stringBuffer.ToString());
        }

        /// <summary>
        /// Resolves the character reference that follows an already consumed '&amp;'.
        /// </summary>
        /// <param name="c">The first character after the '&amp;'.</param>
        /// <param name="allowedCharacter">
        /// Additional character that ends the reference attempt; a non-zero value also enables
        /// the attribute-specific handling of unterminated named references.
        /// </param>
        /// <returns>The replacement text, or <c>null</c> when no reference was recognised.</returns>
        private string CharacterReference(char c, char allowedCharacter = '\0')
        {
            // The empty literal below is the EOF sentinel (see class note).
            if (c.IsSpaceCharacter() || c == '<' || c == '' || c == '&' || c == allowedCharacter)
            {
                Back();
                return null;
            }

            if (c == '#')
            {
                int radix = 10;
                int digitCount = 0;
                // Accumulated in a long and saturated at int.MaxValue: an overlong digit run
                // must stay "too large" instead of wrapping around to some valid code point.
                long value = 0;

                c = base.Next;
                bool isHex = c == 'x' || c == 'X';

                if (isHex)
                {
                    radix = 16;

                    while ((c = base.Next).IsHex())
                    {
                        value = Math.Min(value * radix + c.FromHex(), int.MaxValue);
                        digitCount++;
                    }
                }
                else
                {
                    while (c.IsDigit())
                    {
                        value = Math.Min(value * radix + c.FromHex(), int.MaxValue);
                        digitCount++;
                        c = base.Next;
                    }
                }

                if (digitCount == 0)
                {
                    // Rewind over '#' (and the 'x') so the text is re-read as plain data.
                    Back(2);

                    if (isHex)
                    {
                        Back();
                    }

                    RaiseErrorOccurred(ErrorCode.CharacterReferenceWrongNumber);
                    return null;
                }

                if (c != ';')
                {
                    RaiseErrorOccurred(ErrorCode.CharacterReferenceSemicolonMissing);
                    Back();
                }

                int code = (int)value;

                if (Entities.IsInCharacterTable(code))
                {
                    RaiseErrorOccurred(ErrorCode.CharacterReferenceInvalidCode);
                    return Entities.GetSymbolFromTable(code);
                }

                if (Entities.IsInvalidNumber(code))
                {
                    RaiseErrorOccurred(ErrorCode.CharacterReferenceInvalidNumber);
                    return '�'.ToString();
                }

                if (Entities.IsInInvalidRange(code))
                {
                    RaiseErrorOccurred(ErrorCode.CharacterReferenceInvalidRange);
                }

                return Entities.Convert(code);
            }

            const int maxNameLength = 31;

            string result = null;
            // Characters consumed since the last successful lookup; rewound afterwards.
            int unmatched = 0;
            int start = base.InsertionPoint - 1;
            char[] nameBuffer = new char[maxNameLength];
            int length = 0;
            char current = base.Current;

            while (current != ';' && current.IsName())
            {
                nameBuffer[length++] = current;
                string name = new string(nameBuffer, 0, length);
                current = base.Next;
                unmatched++;

                string symbol = current == ';'
                    ? Entities.GetSymbol(name)
                    : Entities.GetSymbolWithoutSemicolon(name);

                if (symbol != null)
                {
                    unmatched = 0;
                    result = symbol;
                }

                if (base.IsEnded || length >= maxNameLength)
                {
                    break;
                }
            }

            Back(unmatched);
            current = base.Current;

            if (current != ';')
            {
                if (allowedCharacter != 0 && (current == '=' || current.IsAlphanumericAscii()))
                {
                    if (current == '=')
                    {
                        RaiseErrorOccurred(ErrorCode.CharacterReferenceAttributeEqualsFound);
                    }

                    base.InsertionPoint = start;
                    return null;
                }

                Back();
                RaiseErrorOccurred(ErrorCode.CharacterReferenceNotTerminated);
            }

            return result;
        }

        /// <summary>
        /// Handles the character after '&lt;' in the Data state.
        /// </summary>
        private HtmlToken TagOpen()
        {
            TextPosition start = GetCurrentPosition();
            char next = base.Next;

            if (next == '/')
            {
                return TagEnd(base.Next, start);
            }

            if (next.IsLowercaseAscii() || next.IsUppercaseAscii())
            {
                HtmlTagToken tag = HtmlToken.OpenTag();
                tag.Start = start;
                _stringBuffer.Clear().Append(char.ToLowerInvariant(next));
                return TagName(tag);
            }

            switch (next)
            {
                case '!':
                    return MarkupDeclaration(start);
                case '?':
                    RaiseErrorOccurred(ErrorCode.BogusComment);
                    return BogusComment(next, start);
                default:
                    // Not a tag after all: the '<' is ordinary text.
                    _state = HtmlParseMode.PCData;
                    RaiseErrorOccurred(ErrorCode.AmbiguousOpenTag);
                    _buffer.Append('<');
                    return Data(next);
            }
        }

        /// <summary>
        /// Handles the character after "&lt;/" in the Data state.
        /// </summary>
        private HtmlToken TagEnd(char c, TextPosition position)
        {
            if (c.IsLowercaseAscii() || c.IsUppercaseAscii())
            {
                HtmlTagToken tag = HtmlToken.CloseTag();
                tag.Start = position;
                _stringBuffer.Clear().Append(char.ToLowerInvariant(c));
                return TagName(tag);
            }

            switch (c)
            {
                case '>':
                    _state = HtmlParseMode.PCData;
                    RaiseErrorOccurred(ErrorCode.TagClosedWrong);
                    return Data(base.Next);
                case '': // EOF sentinel (see class note)
                    Back();
                    RaiseErrorOccurred(ErrorCode.EOF);
                    _buffer.Append('<').Append('/');
                    return HtmlToken.EOF;
                default:
                    RaiseErrorOccurred(ErrorCode.BogusComment);
                    return BogusComment(c, position);
            }
        }

        /// <summary>
        /// Reads the rest of a tag name; the first character is already in the string buffer.
        /// </summary>
        private HtmlToken TagName(HtmlTagToken tag)
        {
            while (true)
            {
                char next = base.Next;

                if (next.IsSpaceCharacter())
                {
                    break;
                }

                switch (next)
                {
                    case '/':
                        tag.Name = _stringBuffer.ToString();
                        return TagSelfClosing(tag);
                    case '>':
                        tag.Name = _stringBuffer.ToString();
                        return EmitTag(tag);
                    case '\0':
                        RaiseErrorOccurred(ErrorCode.Null);
                        _stringBuffer.Append('�');
                        break;
                    case '': // EOF sentinel (see class note)
                        RaiseErrorOccurred(ErrorCode.EOF);
                        return HtmlToken.EOF;
                    default:
                        _stringBuffer.Append(next.IsUppercaseAscii() ? char.ToLowerInvariant(next) : next);
                        break;
                }
            }

            tag.Name = _stringBuffer.ToString();
            return AttributeBeforeName(tag);
        }

        /// <summary>
        /// Handles the character after a '/' inside a tag.
        /// </summary>
        private HtmlToken TagSelfClosing(HtmlTagToken tag)
        {
            switch (base.Next)
            {
                case '>':
                    tag.IsSelfClosing = true;
                    return EmitTag(tag);
                case '': // EOF sentinel (see class note)
                    RaiseErrorOccurred(ErrorCode.EOF);
                    return HtmlToken.EOF;
                default:
                    RaiseErrorOccurred(ErrorCode.ClosingSlashMisplaced);
                    Back();
                    return AttributeBeforeName(tag);
            }
        }

        /// <summary>
        /// Handles "&lt;!": dispatches to comment, doctype or (when enabled) CDATA handling,
        /// falling back to a bogus comment.
        /// </summary>
        private HtmlToken MarkupDeclaration(TextPosition position)
        {
            char next = base.Next;

            if (ContinuesWith("--", true))
            {
                Advance();
                return CommentStart(position);
            }

            if (ContinuesWith(Tags.Doctype, true))
            {
                Advance(6);
                return Doctype(position);
            }

            if (_acceptsCharacterData && ContinuesWith("[CDATA[", false))
            {
                Advance(6);
                return CData();
            }

            RaiseErrorOccurred(ErrorCode.UndefinedMarkupDeclaration);
            return BogusComment(next, position);
        }

        /// <summary>
        /// Collects everything up to the next '&gt;' (or the end of the source) as a comment.
        /// </summary>
        private HtmlToken BogusComment(char c, TextPosition position)
        {
            _stringBuffer.Clear();

            while (c != '>')
            {
                if (c == '') // EOF sentinel (see class note)
                {
                    Back();
                    break;
                }

                _stringBuffer.Append(c == '\0' ? '�' : c);
                c = base.Next;
            }

            _state = HtmlParseMode.PCData;
            return EmitComment(position);
        }

        /// <summary>
        /// Handles the first character after "&lt;!--".
        /// </summary>
        private HtmlCommentToken CommentStart(TextPosition position)
        {
            char next = base.Next;
            _stringBuffer.Clear();

            switch (next)
            {
                case '-':
                    return CommentDashStart(position);
                case '\0':
                    RaiseErrorOccurred(ErrorCode.Null);
                    _stringBuffer.Append('�');
                    return Comment(position);
                case '>':
                    _state = HtmlParseMode.PCData;
                    RaiseErrorOccurred(ErrorCode.TagClosedWrong);
                    break;
                case '': // EOF sentinel (see class note)
                    RaiseErrorOccurred(ErrorCode.EOF);
                    Back();
                    break;
                default:
                    _stringBuffer.Append(next);
                    return Comment(position);
            }

            return EmitComment(position);
        }

        /// <summary>
        /// Handles the character after "&lt;!---".
        /// </summary>
        private HtmlCommentToken CommentDashStart(TextPosition position)
        {
            char next = base.Next;

            switch (next)
            {
                case '-':
                    // CommentEnd returns null when the comment goes on (e.g. "<!----x-->").
                    // Resume the comment loop instead of leaking that null to Get().
                    return CommentEnd(position) ?? Comment(position);
                case '\0':
                    RaiseErrorOccurred(ErrorCode.Null);
                    _stringBuffer.Append('-').Append('�');
                    return Comment(position);
                case '>':
                    _state = HtmlParseMode.PCData;
                    RaiseErrorOccurred(ErrorCode.TagClosedWrong);
                    break;
                case '': // EOF sentinel (see class note)
                    RaiseErrorOccurred(ErrorCode.EOF);
                    Back();
                    break;
                default:
                    _stringBuffer.Append('-').Append(next);
                    return Comment(position);
            }

            return EmitComment(position);
        }

        /// <summary>
        /// Comment state: collects comment text until the comment is closed or the source ends.
        /// </summary>
        private HtmlCommentToken Comment(TextPosition position)
        {
            while (true)
            {
                char next = base.Next;

                switch (next)
                {
                    case '-':
                    {
                        // null means the dash did not end the comment; keep collecting.
                        HtmlCommentToken comment = CommentDashEnd(position);

                        if (comment != null)
                        {
                            return comment;
                        }

                        break;
                    }
                    case '': // EOF sentinel (see class note)
                        RaiseErrorOccurred(ErrorCode.EOF);
                        Back();
                        return EmitComment(position);
                    case '\0':
                        RaiseErrorOccurred(ErrorCode.Null);
                        _stringBuffer.Append('�');
                        break;
                    default:
                        _stringBuffer.Append(next);
                        break;
                }
            }
        }

        /// <summary>
        /// Handles the character after a '-' inside a comment.
        /// </summary>
        /// <returns>The finished comment, or <c>null</c> when the comment continues.</returns>
        private HtmlCommentToken CommentDashEnd(TextPosition position)
        {
            char c = base.Next;

            switch (c)
            {
                case '-':
                    return CommentEnd(position);
                case '': // EOF sentinel (see class note)
                    RaiseErrorOccurred(ErrorCode.EOF);
                    Back();
                    return EmitComment(position);
                case '\0':
                    RaiseErrorOccurred(ErrorCode.Null);
                    c = '�';
                    break;
            }

            _stringBuffer.Append('-').Append(c);
            return null;
        }

        /// <summary>
        /// Handles the characters after "--" inside a comment.
        /// </summary>
        /// <returns>The finished comment, or <c>null</c> when the comment continues.</returns>
        private HtmlCommentToken CommentEnd(TextPosition position)
        {
            while (true)
            {
                char next = base.Next;

                switch (next)
                {
                    case '>':
                        _state = HtmlParseMode.PCData;
                        return EmitComment(position);
                    case '\0':
                        RaiseErrorOccurred(ErrorCode.Null);
                        _stringBuffer.Append('-').Append('�');
                        return null;
                    case '!':
                        RaiseErrorOccurred(ErrorCode.CommentEndedWithEM);
                        return CommentBangEnd(position);
                    case '-':
                        // A further dash: keep one as text and stay in this state.
                        RaiseErrorOccurred(ErrorCode.CommentEndedWithDash);
                        _stringBuffer.Append('-');
                        break;
                    case '': // EOF sentinel (see class note)
                        RaiseErrorOccurred(ErrorCode.EOF);
                        Back();
                        return EmitComment(position);
                    default:
                        RaiseErrorOccurred(ErrorCode.CommentEndedUnexpected);
                        _stringBuffer.Append('-').Append('-').Append(next);
                        return null;
                }
            }
        }

        /// <summary>
        /// Handles the character after "--!" inside a comment.
        /// </summary>
        /// <returns>The finished comment, or <c>null</c> when the comment continues.</returns>
        private HtmlCommentToken CommentBangEnd(TextPosition position)
        {
            char next = base.Next;

            switch (next)
            {
                case '-':
                    _stringBuffer.Append('-').Append('-').Append('!');
                    return CommentDashEnd(position);
                case '>':
                    _state = HtmlParseMode.PCData;
                    break;
                case '\0':
                    RaiseErrorOccurred(ErrorCode.Null);
                    _stringBuffer.Append('-').Append('-').Append('!').Append('�');
                    return null;
                case '': // EOF sentinel (see class note)
                    RaiseErrorOccurred(ErrorCode.EOF);
                    Back();
                    break;
                default:
                    _stringBuffer.Append('-').Append('-').Append('!').Append(next);
                    return null;
            }

            return EmitComment(position);
        }

        /// <summary>
        /// Handles the character after the doctype keyword.
        /// </summary>
        private HtmlToken Doctype(TextPosition position)
        {
            char next = base.Next;

            if (next.IsSpaceCharacter())
            {
                return DoctypeNameBefore(base.Next, position);
            }

            if (next == '') // EOF sentinel (see class note)
            {
                RaiseErrorOccurred(ErrorCode.EOF);
                Back();

                HtmlDoctypeToken doctype = HtmlToken.Doctype(true);
                doctype.Start = position;
                doctype.End = GetCurrentPosition();
                return doctype;
            }

            RaiseErrorOccurred(ErrorCode.DoctypeUnexpected);
            return DoctypeNameBefore(next, position);
        }

        /// <summary>
        /// Skips whitespace before the doctype name and starts the name with the first
        /// non-space character.
        /// </summary>
        private HtmlToken DoctypeNameBefore(char c, TextPosition position)
        {
            while (c.IsSpaceCharacter())
            {
                c = base.Next;
            }

            HtmlDoctypeToken doctype;

            if (c.IsUppercaseAscii())
            {
                doctype = HtmlToken.Doctype(false);
                doctype.Start = position;
                _stringBuffer.Clear().Append(char.ToLowerInvariant(c));
                return DoctypeName(doctype);
            }

            switch (c)
            {
                case '\0':
                    doctype = HtmlToken.Doctype(false);
                    doctype.Start = position;
                    RaiseErrorOccurred(ErrorCode.Null);
                    _stringBuffer.Clear().Append('�');
                    return DoctypeName(doctype);
                case '>':
                    doctype = HtmlToken.Doctype(true);
                    doctype.Start = position;
                    _state = HtmlParseMode.PCData;
                    RaiseErrorOccurred(ErrorCode.TagClosedWrong);
                    doctype.End = GetCurrentPosition();
                    return doctype;
                case '': // EOF sentinel (see class note)
                    doctype = HtmlToken.Doctype(true);
                    doctype.Start = position;
                    RaiseErrorOccurred(ErrorCode.EOF);
                    Back();
                    doctype.End = GetCurrentPosition();
                    return doctype;
                default:
                    doctype = HtmlToken.Doctype(false);
                    doctype.Start = position;
                    _stringBuffer.Clear().Append(c);
                    return DoctypeName(doctype);
            }
        }

        /// <summary>
        /// Reads the rest of the doctype name; the first character is already in the string
        /// buffer.
        /// </summary>
        private HtmlToken DoctypeName(HtmlDoctypeToken doctype)
        {
            while (true)
            {
                char next = base.Next;

                if (next.IsSpaceCharacter())
                {
                    doctype.Name = _stringBuffer.ToString();
                    _stringBuffer.Clear();
                    return DoctypeNameAfter(doctype);
                }

                if (next == '>')
                {
                    _state = HtmlParseMode.PCData;
                    doctype.Name = _stringBuffer.ToString();
                    break;
                }

                if (next == '') // EOF sentinel (see class note)
                {
                    RaiseErrorOccurred(ErrorCode.EOF);
                    Back();
                    doctype.IsQuirksForced = true;
                    doctype.Name = _stringBuffer.ToString();
                    break;
                }

                if (next == '\0')
                {
                    RaiseErrorOccurred(ErrorCode.Null);
                    _stringBuffer.Append('�');
                }
                else
                {
                    _stringBuffer.Append(next.IsUppercaseAscii() ? char.ToLowerInvariant(next) : next);
                }
            }

            doctype.End = GetCurrentPosition();
            return doctype;
        }

        /// <summary>
        /// Handles what follows the doctype name: the end of the doctype or a "public" /
        /// "system" keyword.
        /// </summary>
        private HtmlToken DoctypeNameAfter(HtmlDoctypeToken doctype)
        {
            switch (SkipSpaces())
            {
                case '>':
                    _state = HtmlParseMode.PCData;
                    break;
                case '': // EOF sentinel (see class note)
                    RaiseErrorOccurred(ErrorCode.EOF);
                    Back();
                    doctype.IsQuirksForced = true;
                    break;
                default:
                    if (ContinuesWith("public", true))
                    {
                        Advance(5);
                        return DoctypePublic(doctype);
                    }

                    if (ContinuesWith("system", true))
                    {
                        Advance(5);
                        return DoctypeSystem(doctype);
                    }

                    RaiseErrorOccurred(ErrorCode.DoctypeUnexpectedAfterName);
                    doctype.IsQuirksForced = true;
                    return BogusDoctype(doctype);
            }

            doctype.End = GetCurrentPosition();
            return doctype;
        }

private HtmlToken DoctypePublic(HtmlDoctypeToken doctype) { char next = base.Next; if (next.IsSpaceCharacter()) return DoctypePublicIdentifierBefore(doctype); switch (next) { case '"': RaiseErrorOccurred(ErrorCode.DoubleQuotationMarkUnexpected); doctype.PublicIdentifier = string.Empty; return DoctypePublicIdentifierDoubleQuoted(doctype); case '\'': RaiseErrorOccurred(ErrorCode.SingleQuotationMarkUnexpected); doctype.PublicIdentifier = string.Empty; return DoctypePublicIdentifierSingleQuoted(doctype); case '>': _state = HtmlParseMode.PCData; RaiseErrorOccurred(ErrorCode.TagClosedWrong); doctype.IsQuirksForced = true; break; case '': RaiseErrorOccurred(ErrorCode.EOF); doctype.IsQuirksForced = true; Back(); break; default: RaiseErrorOccurred(ErrorCode.DoctypePublicInvalid); doctype.IsQuirksForced = true; return BogusDoctype(doctype); } doctype.End = GetCurrentPosition(); return doctype; } private HtmlToken DoctypePublicIdentifierBefore(HtmlDoctypeToken doctype) { switch (SkipSpaces()) { case '"': _stringBuffer.Clear(); doctype.PublicIdentifier = string.Empty; return DoctypePublicIdentifierDoubleQuoted(doctype); case '\'': _stringBuffer.Clear(); doctype.PublicIdentifier = string.Empty; return DoctypePublicIdentifierSingleQuoted(doctype); case '>': _state = HtmlParseMode.PCData; RaiseErrorOccurred(ErrorCode.TagClosedWrong); doctype.IsQuirksForced = true; break; case '': RaiseErrorOccurred(ErrorCode.EOF); doctype.IsQuirksForced = true; Back(); break; default: RaiseErrorOccurred(ErrorCode.DoctypePublicInvalid); doctype.IsQuirksForced = true; return BogusDoctype(doctype); } doctype.End = GetCurrentPosition(); return doctype; } private HtmlToken DoctypePublicIdentifierDoubleQuoted(HtmlDoctypeToken doctype) { while (true) { char next = base.Next; switch (next) { case '"': doctype.PublicIdentifier = _stringBuffer.ToString(); _stringBuffer.Clear(); return DoctypePublicIdentifierAfter(doctype); case '': RaiseErrorOccurred(ErrorCode.Null); _stringBuffer.Append('�'); break; case 
'>': _state = HtmlParseMode.PCData; RaiseErrorOccurred(ErrorCode.TagClosedWrong); doctype.IsQuirksForced = true; doctype.PublicIdentifier = _stringBuffer.ToString(); goto IL_00be; case '': RaiseErrorOccurred(ErrorCode.EOF); Back(); doctype.IsQuirksForced = true; doctype.PublicIdentifier = _stringBuffer.ToString(); goto IL_00be; default: { _stringBuffer.Append(next); break; } IL_00be: doctype.End = GetCurrentPosition(); return doctype; } } } private HtmlToken DoctypePublicIdentifierSingleQuoted(HtmlDoctypeToken doctype) { while (true) { char next = base.Next; switch (next) { case '\'': doctype.PublicIdentifier = _stringBuffer.ToString(); _stringBuffer.Clear(); return DoctypePublicIdentifierAfter(doctype); case '': RaiseErrorOccurred(ErrorCode.Null); _stringBuffer.Append('�'); break; case '>': _state = HtmlParseMode.PCData; RaiseErrorOccurred(ErrorCode.TagClosedWrong); doctype.IsQuirksForced = true; doctype.PublicIdentifier = _stringBuffer.ToString(); goto IL_00be; case '': RaiseErrorOccurred(ErrorCode.EOF); doctype.IsQuirksForced = true; doctype.PublicIdentifier = _stringBuffer.ToString(); Back(); goto IL_00be; default: { _stringBuffer.Append(next); break; } IL_00be: doctype.End = GetCurrentPosition(); return doctype; } } } private HtmlToken DoctypePublicIdentifierAfter(HtmlDoctypeToken doctype) { char next = base.Next; if (next.IsSpaceCharacter()) { _stringBuffer.Clear(); return DoctypeBetween(doctype); } switch (next) { case '>': _state = HtmlParseMode.PCData; break; case '"': RaiseErrorOccurred(ErrorCode.DoubleQuotationMarkUnexpected); doctype.SystemIdentifier = string.Empty; return DoctypeSystemIdentifierDoubleQuoted(doctype); case '\'': RaiseErrorOccurred(ErrorCode.SingleQuotationMarkUnexpected); doctype.SystemIdentifier = string.Empty; return DoctypeSystemIdentifierSingleQuoted(doctype); case '': RaiseErrorOccurred(ErrorCode.EOF); doctype.IsQuirksForced = true; Back(); break; default: RaiseErrorOccurred(ErrorCode.DoctypeInvalidCharacter); 
doctype.IsQuirksForced = true; return BogusDoctype(doctype); } doctype.End = GetCurrentPosition(); return doctype; } private HtmlToken DoctypeBetween(HtmlDoctypeToken doctype) { switch (SkipSpaces()) { case '>': _state = HtmlParseMode.PCData; break; case '"': doctype.SystemIdentifier = string.Empty; return DoctypeSystemIdentifierDoubleQuoted(doctype); case '\'': doctype.SystemIdentifier = string.Empty; return DoctypeSystemIdentifierSingleQuoted(doctype); case '': RaiseErrorOccurred(ErrorCode.EOF); doctype.IsQuirksForced = true; Back(); break; default: RaiseErrorOccurred(ErrorCode.DoctypeInvalidCharacter); doctype.IsQuirksForced = true; return BogusDoctype(doctype); } doctype.End = GetCurrentPosition(); return doctype; } private HtmlToken DoctypeSystem(HtmlDoctypeToken doctype) { char next = base.Next; if (next.IsSpaceCharacter()) { _state = HtmlParseMode.PCData; return DoctypeSystemIdentifierBefore(doctype); } switch (next) { case '"': RaiseErrorOccurred(ErrorCode.DoubleQuotationMarkUnexpected); doctype.SystemIdentifier = string.Empty; return DoctypeSystemIdentifierDoubleQuoted(doctype); case '\'': RaiseErrorOccurred(ErrorCode.SingleQuotationMarkUnexpected); doctype.SystemIdentifier = string.Empty; return DoctypeSystemIdentifierSingleQuoted(doctype); case '>': RaiseErrorOccurred(ErrorCode.TagClosedWrong); doctype.SystemIdentifier = _stringBuffer.ToString(); doctype.IsQuirksForced = true; break; case '': RaiseErrorOccurred(ErrorCode.EOF); doctype.IsQuirksForced = true; Back(); break; default: RaiseErrorOccurred(ErrorCode.DoctypeSystemInvalid); doctype.IsQuirksForced = true; return BogusDoctype(doctype); } doctype.End = GetCurrentPosition(); return doctype; } private HtmlToken DoctypeSystemIdentifierBefore(HtmlDoctypeToken doctype) { switch (SkipSpaces()) { case '"': doctype.SystemIdentifier = string.Empty; return DoctypeSystemIdentifierDoubleQuoted(doctype); case '\'': doctype.SystemIdentifier = string.Empty; return DoctypeSystemIdentifierSingleQuoted(doctype); case 
'>': _state = HtmlParseMode.PCData; RaiseErrorOccurred(ErrorCode.TagClosedWrong); doctype.IsQuirksForced = true; doctype.SystemIdentifier = _stringBuffer.ToString(); break; case '': RaiseErrorOccurred(ErrorCode.EOF); doctype.IsQuirksForced = true; doctype.SystemIdentifier = _stringBuffer.ToString(); Back(); break; default: RaiseErrorOccurred(ErrorCode.DoctypeInvalidCharacter); doctype.IsQuirksForced = true; return BogusDoctype(doctype); } doctype.End = GetCurrentPosition(); return doctype; } private HtmlToken DoctypeSystemIdentifierDoubleQuoted(HtmlDoctypeToken doctype) { while (true) { char next = base.Next; switch (next) { case '"': doctype.SystemIdentifier = _stringBuffer.ToString(); _stringBuffer.Clear(); return DoctypeSystemIdentifierAfter(doctype); case '': RaiseErrorOccurred(ErrorCode.Null); _stringBuffer.Append('�'); break; case '>': _state = HtmlParseMode.PCData; RaiseErrorOccurred(ErrorCode.TagClosedWrong); doctype.IsQuirksForced = true; doctype.SystemIdentifier = _stringBuffer.ToString(); goto IL_00be; case '': RaiseErrorOccurred(ErrorCode.EOF); doctype.IsQuirksForced = true; doctype.SystemIdentifier = _stringBuffer.ToString(); Back(); goto IL_00be; default: { _stringBuffer.Append(next); break; } IL_00be: doctype.End = GetCurrentPosition(); return doctype; } } } private HtmlToken DoctypeSystemIdentifierSingleQuoted(HtmlDoctypeToken doctype) { while (true) { char next = base.Next; switch (next) { case '\'': doctype.SystemIdentifier = _stringBuffer.ToString(); _stringBuffer.Clear(); return DoctypeSystemIdentifierAfter(doctype); case '': RaiseErrorOccurred(ErrorCode.Null); _stringBuffer.Append('�'); break; case '>': _state = HtmlParseMode.PCData; RaiseErrorOccurred(ErrorCode.TagClosedWrong); doctype.IsQuirksForced = true; doctype.SystemIdentifier = _stringBuffer.ToString(); goto IL_00d3; case '': RaiseErrorOccurred(ErrorCode.EOF); doctype.IsQuirksForced = true; doctype.SystemIdentifier = _stringBuffer.ToString(); Back(); goto IL_00d3; default: { 
_stringBuffer.Append(next); break; } IL_00d3: doctype.End = GetCurrentPosition(); return doctype; } } } private HtmlToken DoctypeSystemIdentifierAfter(HtmlDoctypeToken doctype) { switch (SkipSpaces()) { case '>': _state = HtmlParseMode.PCData; break; case '': RaiseErrorOccurred(ErrorCode.EOF); doctype.IsQuirksForced = true; Back(); break; default: RaiseErrorOccurred(ErrorCode.DoctypeInvalidCharacter); return BogusDoctype(doctype); } doctype.End = GetCurrentPosition(); return doctype; } private HtmlToken BogusDoctype(HtmlDoctypeToken doctype) { while (true) { switch (base.Next) { case '>': _state = HtmlParseMode.PCData; goto IL_0020; case '': { Back(); goto IL_0020; } IL_0020: doctype.End = GetCurrentPosition(); return doctype; } } } private HtmlToken AttributeBeforeName(HtmlTagToken tag) { char c = SkipSpaces(); switch (c) { case '/': return TagSelfClosing(tag); case '>': return EmitTag(tag); default: if (!c.IsUppercaseAscii()) { switch (c) { case '': RaiseErrorOccurred(ErrorCode.Null); _stringBuffer.Clear().Append('�'); return AttributeName(tag); case '"': case '\'': case '<': case '=': RaiseErrorOccurred(ErrorCode.AttributeNameInvalid); _stringBuffer.Clear().Append(c); return AttributeName(tag); case '': return HtmlToken.EOF; default: _stringBuffer.Clear().Append(c); return AttributeName(tag); } } _stringBuffer.Clear().Append(char.ToLower(c)); return AttributeName(tag); } } private HtmlToken AttributeName(HtmlTagToken tag) { while (true) { char next = base.Next; if (next.IsSpaceCharacter()) break; switch (next) { case '/': tag.AddAttribute(_stringBuffer.ToString()); return TagSelfClosing(tag); case '=': tag.AddAttribute(_stringBuffer.ToString()); return AttributeBeforeValue(tag); case '>': tag.AddAttribute(_stringBuffer.ToString()); return EmitTag(tag); case '': return HtmlToken.EOF; case '': RaiseErrorOccurred(ErrorCode.Null); _stringBuffer.Append('�'); break; default: if (next.IsUppercaseAscii()) _stringBuffer.Append(char.ToLower(next)); else if (next == '"' || 
next == '\'' || next == '<') { RaiseErrorOccurred(ErrorCode.AttributeNameInvalid); _stringBuffer.Append(next); } else { _stringBuffer.Append(next); } break; } } tag.AddAttribute(_stringBuffer.ToString()); return AttributeAfterName(tag); } private HtmlToken AttributeAfterName(HtmlTagToken tag) { char c = SkipSpaces(); switch (c) { case '/': return TagSelfClosing(tag); case '=': return AttributeBeforeValue(tag); case '>': return EmitTag(tag); default: if (!c.IsUppercaseAscii()) { switch (c) { case '': RaiseErrorOccurred(ErrorCode.Null); _stringBuffer.Clear().Append('�'); return AttributeName(tag); case '"': case '\'': case '<': RaiseErrorOccurred(ErrorCode.AttributeNameInvalid); _stringBuffer.Clear().Append(c); return AttributeName(tag); case '': return HtmlToken.EOF; default: _stringBuffer.Clear().Append(c); return AttributeName(tag); } } _stringBuffer.Clear().Append(char.ToLower(c)); return AttributeName(tag); } } private HtmlToken AttributeBeforeValue(HtmlTagToken tag) { char c = SkipSpaces(); switch (c) { case '"': _stringBuffer.Clear(); return AttributeDoubleQuotedValue(tag); case '&': _stringBuffer.Clear(); return AttributeUnquotedValue(c, tag); case '\'': _stringBuffer.Clear(); return AttributeSingleQuotedValue(tag); case '': RaiseErrorOccurred(ErrorCode.Null); _stringBuffer.Append('�'); return AttributeUnquotedValue(base.Next, tag); case '>': RaiseErrorOccurred(ErrorCode.TagClosedWrong); return EmitTag(tag); case '<': case '=': case '`': RaiseErrorOccurred(ErrorCode.AttributeValueInvalid); _stringBuffer.Clear().Append(c); return AttributeUnquotedValue(base.Next, tag); case '': return HtmlToken.EOF; default: _stringBuffer.Clear().Append(c); return AttributeUnquotedValue(base.Next, tag); } } private HtmlToken AttributeDoubleQuotedValue(HtmlTagToken tag) { while (true) { char next = base.Next; switch (next) { case '"': tag.SetAttributeValue(_stringBuffer.ToString()); return AttributeAfterValue(tag); case '&': { string text = CharacterReference(base.Next, '"'); 
if (text == null) _stringBuffer.Append('&'); else _stringBuffer.Append(text); break; } case '': RaiseErrorOccurred(ErrorCode.Null); _stringBuffer.Append('�'); break; case '': return HtmlToken.EOF; default: _stringBuffer.Append(next); break; } } } private HtmlToken AttributeSingleQuotedValue(HtmlTagToken tag) { while (true) { char next = base.Next; switch (next) { case '\'': tag.SetAttributeValue(_stringBuffer.ToString()); return AttributeAfterValue(tag); case '&': { string text = CharacterReference(base.Next, '\''); if (text == null) _stringBuffer.Append('&'); else _stringBuffer.Append(text); break; } case '': RaiseErrorOccurred(ErrorCode.Null); _stringBuffer.Append('�'); break; case '': return HtmlToken.EOF; default: _stringBuffer.Append(next); break; } } } private HtmlToken AttributeUnquotedValue(char c, HtmlTagToken tag) { while (!c.IsSpaceCharacter()) { switch (c) { case '&': { string text = CharacterReference(base.Next, '>'); if (text == null) _stringBuffer.Append('&'); else _stringBuffer.Append(text); break; } case '>': tag.SetAttributeValue(_stringBuffer.ToString()); return EmitTag(tag); case '': RaiseErrorOccurred(ErrorCode.Null); _stringBuffer.Append('�'); break; case '"': case '\'': case '<': case '=': case '`': RaiseErrorOccurred(ErrorCode.AttributeValueInvalid); _stringBuffer.Append(c); break; case '': return HtmlToken.EOF; default: _stringBuffer.Append(c); break; } c = base.Next; } tag.SetAttributeValue(_stringBuffer.ToString()); return AttributeBeforeName(tag); } private HtmlToken AttributeAfterValue(HtmlTagToken tag) { char next = base.Next; if (!next.IsSpaceCharacter()) { switch (next) { case '/': return TagSelfClosing(tag); case '>': return EmitTag(tag); case '': return HtmlToken.EOF; default: RaiseErrorOccurred(ErrorCode.AttributeNameExpected); Back(); return AttributeBeforeName(tag); } } return AttributeBeforeName(tag); } private HtmlToken ScriptData(char c) { while (true) { switch (c) { case '<': return ScriptDataLT(); case '': 
RaiseErrorOccurred(ErrorCode.Null); _buffer.Append('�'); break; case '': return HtmlToken.EOF; default: _buffer.Append(c); break; } c = base.Next; } }

        // NOTE(review): several case labels in the methods below are written '' with
        // no visible character between the quotes. Judging by the branches they guard
        // (ErrorCode.Null / HtmlToken.EOF) they presumably hold the NUL and the
        // end-of-input characters - confirm the literals are intact in the real file.

        /// <summary>
        /// Handles the character that follows a less-than sign inside script data.
        /// A solidus starts a possible end tag, an exclamation mark starts a possible
        /// escape sequence, and anything else turns the less-than sign into text.
        /// </summary>
        /// <returns>The token produced by the state that takes over.</returns>
        private HtmlToken ScriptDataLT()
        {
            // The position is captured before the next character is consumed.
            TextPosition start = GetCurrentPosition();
            char next = base.Next;

            if (next == '/')
            {
                return ScriptDataEndTag(start);
            }

            if (next == '!')
            {
                _buffer.Append('<').Append('!');
                return ScriptDataStartEscape(base.Next);
            }

            _buffer.Append('<');
            return ScriptData(next);
        }

        /// <summary>
        /// Handles the character after the two-character end tag opener in script data.
        /// A letter starts an end tag name; otherwise the opener is buffered as text.
        /// </summary>
        /// <param name="position">Position stored as the start of the end tag token.</param>
        /// <returns>The token produced by the state that takes over.</returns>
        private HtmlToken ScriptDataEndTag(TextPosition position)
        {
            char next = base.Next;

            if (!next.IsLetter())
            {
                _buffer.Append('<').Append('/');
                return ScriptData(next);
            }

            HtmlTagToken endTag = HtmlToken.CloseTag();
            endTag.Start = position;
            _stringBuffer.Clear().Append(next);
            return ScriptDataNameEndTag(endTag);
        }

        /// <summary>
        /// Collects the name of a possible end tag inside script data. The tag is only
        /// accepted when the lower-cased name equals the last emitted start tag name and
        /// is followed by a space character, a solidus or a greater-than sign.
        /// Otherwise everything read so far is buffered as text.
        /// </summary>
        /// <param name="tag">The end tag token to complete.</param>
        /// <returns>The token produced by the state that takes over.</returns>
        private HtmlToken ScriptDataNameEndTag(HtmlTagToken tag)
        {
            while (true)
            {
                char next = base.Next;
                string name = _stringBuffer.ToString().ToLowerInvariant();

                if (name == _lastStartTag)
                {
                    if (next.IsSpaceCharacter())
                    {
                        tag.Name = name;
                        return AttributeBeforeName(tag);
                    }

                    if (next == '/')
                    {
                        tag.Name = name;
                        return TagSelfClosing(tag);
                    }

                    if (next == '>')
                    {
                        tag.Name = name;
                        return EmitTag(tag);
                    }
                }

                if (!next.IsLetter())
                {
                    // Not an end tag after all: replay the consumed text.
                    _buffer.Append('<').Append('/').Append(_stringBuffer.ToString());
                    return ScriptData(next);
                }

                _stringBuffer.Append(next);
            }
        }

        /// <summary>
        /// First step of a possible escape sequence: a dash continues the sequence,
        /// any other character is handed back to the script data state.
        /// </summary>
        /// <param name="c">The current character.</param>
        /// <returns>The token produced by the state that takes over.</returns>
        private HtmlToken ScriptDataStartEscape(char c)
        {
            if (c != '-')
            {
                return ScriptData(c);
            }

            _buffer.Append('-');
            return ScriptDataStartEscapeDash(base.Next);
        }

        /// <summary>
        /// The escaped script data state. Ordinary characters are buffered and the
        /// tokenizer stays in this state until a dash, a less-than sign or the end
        /// of the input is met.
        /// </summary>
        /// <param name="c">The current character.</param>
        /// <returns>The token produced by the state that takes over.</returns>
        private HtmlToken ScriptDataEscaped(char c)
        {
            while (true)
            {
                switch (c)
                {
                    case '-':
                        _buffer.Append('-');
                        return ScriptDataEscapedDash();
                    case '<':
                        return ScriptDataEscapedLT();
                    case '':
                        RaiseErrorOccurred(ErrorCode.Null);
                        _buffer.Append('�');
                        break;
                    case '':
                        return HtmlToken.EOF;
                    default:
                        // Stay in the escaped state. Handing the character to
                        // ScriptData would forget that an escaped section is open.
                        _buffer.Append(c);
                        break;
                }

                c = base.Next;
            }
        }

        /// <summary>
        /// Second step of a possible escape sequence: a second dash enters the
        /// escaped dash-dash state, any other character goes back to script data.
        /// </summary>
        /// <param name="c">The current character.</param>
        /// <returns>The token produced by the state that takes over.</returns>
        private HtmlToken ScriptDataStartEscapeDash(char c)
        {
            if (c != '-')
            {
                return ScriptData(c);
            }

            _buffer.Append('-');
            return ScriptDataEscapedDashDash();
        }

        /// <summary>
        /// Escaped script data after a single dash.
        /// </summary>
        /// <returns>The token produced by the state that takes over.</returns>
        private HtmlToken ScriptDataEscapedDash()
        {
            char next = base.Next;

            switch (next)
            {
                case '-':
                    _buffer.Append('-');
                    return ScriptDataEscapedDashDash();
                case '<':
                    return ScriptDataEscapedLT();
                case '':
                    RaiseErrorOccurred(ErrorCode.Null);
                    _buffer.Append('�');
                    return ScriptDataEscaped(base.Next);
                case '':
                    return HtmlToken.EOF;
                default:
                    _buffer.Append(next);
                    return ScriptDataEscaped(base.Next);
            }
        }

        /// <summary>
        /// Escaped script data after two or more dashes. Further dashes are buffered
        /// in place; a greater-than sign closes the escaped section and returns to
        /// the script data state.
        /// </summary>
        /// <returns>The token produced by the state that takes over.</returns>
        private HtmlToken ScriptDataEscapedDashDash()
        {
            while (true)
            {
                char next = base.Next;

                switch (next)
                {
                    case '-':
                        _buffer.Append('-');
                        continue;
                    case '<':
                        return ScriptDataEscapedLT();
                    case '>':
                        _buffer.Append('>');
                        return ScriptData(base.Next);
                    case '':
                        RaiseErrorOccurred(ErrorCode.Null);
                        _buffer.Append('�');
                        return ScriptDataEscaped(base.Next);
                    case '':
                        return HtmlToken.EOF;
                    default:
                        _buffer.Append(next);
                        return ScriptDataEscaped(base.Next);
                }
            }
        }

        /// <summary>
        /// Handles the character that follows a less-than sign inside escaped script
        /// data. A solidus starts a possible end tag, a letter starts a possible
        /// double escape, anything else turns the less-than sign into text.
        /// </summary>
        /// <returns>The token produced by the state that takes over.</returns>
        private HtmlToken ScriptDataEscapedLT()
        {
            TextPosition start = GetCurrentPosition();
            char next = base.Next;

            if (next == '/')
            {
                // Use the escaped end tag path so that a failed match falls back
                // to the escaped state instead of plain script data.
                HtmlTagToken endTag = HtmlToken.CloseTag();
                endTag.Start = start;
                return ScriptDataEscapedEndTag(endTag);
            }

            if (next.IsLetter())
            {
                _stringBuffer.Clear().Append(next);
                _buffer.Append('<').Append(next);
                return ScriptDataStartDoubleEscape();
            }

            _buffer.Append('<');
            return ScriptDataEscaped(next);
        }

        /// <summary>
        /// Handles the character after the end tag opener in escaped script data.
        /// A letter starts an end tag name; otherwise the opener is buffered as text
        /// and the escaped state continues.
        /// </summary>
        /// <param name="tag">The end tag token to complete.</param>
        /// <returns>The token produced by the state that takes over.</returns>
        private HtmlToken ScriptDataEscapedEndTag(HtmlTagToken tag)
        {
            char next = base.Next;

            if (!next.IsLetter())
            {
                _buffer.Append('<').Append('/');
                return ScriptDataEscaped(next);
            }

            _stringBuffer.Clear().Append(next);
            return ScriptDataEscapedNameTag(tag);
        }

        /// <summary>
        /// Collects the name of a possible end tag inside escaped script data. Same
        /// acceptance rule as <see cref="ScriptDataNameEndTag"/>, but a failed match
        /// continues in the escaped state.
        /// </summary>
        /// <param name="tag">The end tag token to complete.</param>
        /// <returns>The token produced by the state that takes over.</returns>
        private HtmlToken ScriptDataEscapedNameTag(HtmlTagToken tag)
        {
            while (true)
            {
                char next = base.Next;
                string name = _stringBuffer.ToString().ToLowerInvariant();

                if (name == _lastStartTag)
                {
                    if (next.IsSpaceCharacter())
                    {
                        tag.Name = name;
                        return AttributeBeforeName(tag);
                    }

                    if (next == '/')
                    {
                        tag.Name = name;
                        return TagSelfClosing(tag);
                    }

                    if (next == '>')
                    {
                        tag.Name = name;
                        return EmitTag(tag);
                    }
                }

                if (!next.IsLetter())
                {
                    // Not an end tag after all: replay the consumed text.
                    _buffer.Append('<').Append('/').Append(_stringBuffer.ToString());
                    return ScriptDataEscaped(next);
                }

                _stringBuffer.Append(next);
            }
        }

        /// <summary>
        /// Collects letters after a less-than sign in escaped script data. When the
        /// collected name equals <c>Tags.Script</c> (ignoring case) and is terminated
        /// by a space character, a solidus or a greater-than sign, the double escaped
        /// state is entered; otherwise the escaped state continues.
        /// </summary>
        /// <returns>The token produced by the state that takes over.</returns>
        private HtmlToken ScriptDataStartDoubleEscape()
        {
            while (true)
            {
                char next = base.Next;

                if (next == '/' || next == '>' || next.IsSpaceCharacter())
                {
                    _buffer.Append(next);

                    if (_stringBuffer.ToString().Equals(Tags.Script, StringComparison.OrdinalIgnoreCase))
                    {
                        return ScriptDataEscapedDouble(base.Next);
                    }

                    return ScriptDataEscaped(base.Next);
                }

                if (!next.IsLetter())
                {
                    return ScriptDataEscaped(next);
                }

                _stringBuffer.Append(next);
                _buffer.Append(next);
            }
        }

        /// <summary>
        /// The double escaped script data state. Ordinary characters are buffered in
        /// a loop, so the call depth does not grow with the length of the input.
        /// </summary>
        /// <param name="c">The current character.</param>
        /// <returns>The token produced by the state that takes over.</returns>
        private HtmlToken ScriptDataEscapedDouble(char c)
        {
            while (true)
            {
                switch (c)
                {
                    case '-':
                        _buffer.Append('-');
                        return ScriptDataEscapedDoubleDash();
                    case '<':
                        _buffer.Append('<');
                        return ScriptDataEscapedDoubleLT();
                    case '':
                        // Only the replacement character is buffered; the offending
                        // character itself is dropped.
                        RaiseErrorOccurred(ErrorCode.Null);
                        _buffer.Append('�');
                        break;
                    case '':
                        RaiseErrorOccurred(ErrorCode.EOF);
                        return HtmlToken.EOF;
                    default:
                        _buffer.Append(c);
                        break;
                }

                c = base.Next;
            }
        }

        /// <summary>
        /// Double escaped script data after a single dash.
        /// </summary>
        /// <returns>The token produced by the state that takes over.</returns>
        private HtmlToken ScriptDataEscapedDoubleDash()
        {
            char next = base.Next;

            switch (next)
            {
                case '-':
                    _buffer.Append('-');
                    return ScriptDataEscapedDoubleDashDash();
                case '<':
                    _buffer.Append('<');
                    return ScriptDataEscapedDoubleLT();
                case '':
                    RaiseErrorOccurred(ErrorCode.Null);
                    _buffer.Append('�');
                    return ScriptDataEscapedDouble(base.Next);
                case '':
                    RaiseErrorOccurred(ErrorCode.EOF);
                    return HtmlToken.EOF;
                default:
                    _buffer.Append(next);
                    return ScriptDataEscapedDouble(base.Next);
            }
        }

        /// <summary>
        /// Double escaped script data after two or more dashes. Further dashes are
        /// buffered in place; a greater-than sign returns to the script data state.
        /// </summary>
        /// <returns>The token produced by the state that takes over.</returns>
        private HtmlToken ScriptDataEscapedDoubleDashDash()
        {
            while (true)
            {
                char next = base.Next;

                switch (next)
                {
                    case '-':
                        _buffer.Append('-');
                        continue;
                    case '<':
                        _buffer.Append('<');
                        return ScriptDataEscapedDoubleLT();
                    case '>':
                        _buffer.Append('>');
                        return ScriptData(base.Next);
                    case '':
                        RaiseErrorOccurred(ErrorCode.Null);
                        _buffer.Append('�');
                        return ScriptDataEscapedDouble(base.Next);
                    case '':
                        RaiseErrorOccurred(ErrorCode.EOF);
                        return HtmlToken.EOF;
                    default:
                        _buffer.Append(next);
                        return ScriptDataEscapedDouble(base.Next);
                }
            }
        }

        /// <summary>
        /// Handles the character that follows a less-than sign inside double escaped
        /// script data. A solidus starts a possible end of the double escape.
        /// </summary>
        /// <returns>The token produced by the state that takes over.</returns>
        private HtmlToken ScriptDataEscapedDoubleLT()
        {
            char next = base.Next;

            if (next != '/')
            {
                return ScriptDataEscapedDouble(next);
            }

            _stringBuffer.Clear();
            _buffer.Append('/');
            return ScriptDataEndDoubleEscape();
        }

        /// <summary>
        /// Collects letters after the end tag opener in double escaped script data.
        /// When the collected name equals <c>Tags.Script</c> (ignoring case) and is
        /// terminated by a space character, a solidus or a greater-than sign, the
        /// tokenizer drops back to the escaped state; otherwise it stays double
        /// escaped.
        /// </summary>
        /// <returns>The token produced by the state that takes over.</returns>
        private HtmlToken ScriptDataEndDoubleEscape()
        {
            while (true)
            {
                char next = base.Next;

                if (next.IsSpaceCharacter() || next == '/' || next == '>')
                {
                    _buffer.Append(next);

                    if (_stringBuffer.ToString().Equals(Tags.Script, StringComparison.OrdinalIgnoreCase))
                    {
                        return ScriptDataEscaped(base.Next);
                    }

                    return ScriptDataEscapedDouble(base.Next);
                }

                if (!next.IsLetter())
                {
                    return ScriptDataEscapedDouble(next);
                }

                _stringBuffer.Append(next);
                _buffer.Append(next);
            }
        }

        /// <summary>
        /// Creates a comment token from the current content of the string buffer.
        /// </summary>
        /// <param name="position">Position stored as the start of the comment.</param>
        /// <returns>The comment token, ending at the current position.</returns>
        private HtmlCommentToken EmitComment(TextPosition position)
        {
            HtmlCommentToken comment = HtmlToken.Comment(_stringBuffer.ToString());
            comment.Start = position;
            comment.End = GetCurrentPosition();
            return comment;
        }

        /// <summary>
        /// Finishes a tag token and resets the parse mode to PCData. For start tags,
        /// attributes whose key repeats an earlier attribute are removed (raising one
        /// error per removal) and the tag name is remembered as the last start tag.
        /// For other tags, errors are raised if the tag is self-closing or carries
        /// attributes.
        /// </summary>
        /// <param name="tag">The tag token to finish.</param>
        /// <returns>The same token, with its end position set.</returns>
        private HtmlTagToken EmitTag(HtmlTagToken tag)
        {
            _state = HtmlParseMode.PCData;

            if (tag.Type == HtmlTokenType.StartTag)
            {
                // Walk from the last attribute backwards so that removing an entry
                // never shifts an index that still has to be visited.
                for (int later = tag.Attributes.Count - 1; later > 0; later--)
                {
                    for (int earlier = later - 1; earlier >= 0; earlier--)
                    {
                        if (tag.Attributes[earlier].Key == tag.Attributes[later].Key)
                        {
                            tag.Attributes.RemoveAt(later);
                            RaiseErrorOccurred(ErrorCode.AttributeDuplicateOmitted);
                            break;
                        }
                    }
                }

                _lastStartTag = tag.Name;
            }
            else
            {
                if (tag.IsSelfClosing)
                {
                    RaiseErrorOccurred(ErrorCode.EndTagCannotBeSelfClosed);
                }

                if (tag.Attributes.Count != 0)
                {
                    RaiseErrorOccurred(ErrorCode.EndTagCannotHaveAttributes);
                }
            }

            tag.End = GetCurrentPosition();
            return tag;
        }
    }
}