// HtmlTokenizer
// Performs the tokenization of the source code. Follows the tokenization algorithm at:
// http://www.w3.org/html/wg/drafts/html/master/syntax.html
using AngleSharp.Dom.Events;
using AngleSharp.Extensions;
using AngleSharp.Html;
using AngleSharp.Services;
using System;
using System.Collections.Generic;
namespace AngleSharp.Parser.Html
{
internal sealed class HtmlTokenizer : BaseTokenizer
{
// States of the attribute-parsing loop in ParseAttributes.
private enum AttributeState : byte
{
// Skipping whitespace before an attribute name starts.
BeforeName,
// Collecting the characters of an attribute name.
Name,
// Whitespace was seen after an attribute name.
AfterName,
// An '=' was seen; waiting for the value to start.
BeforeValue,
// Inside a value delimited by single or double quotes.
QuotedValue,
// The closing quote of a quoted value was just consumed.
AfterValue,
// Inside a value that is not delimited by quotes.
UnquotedValue
}
// States of the state machine that ScriptData runs over script content.
// The loop starts in Normal; the remaining members track tag openings,
// escaped sections and double-escaped sections.
private enum ScriptState : byte
{
Normal,
OpenTag,
EndTag,
StartEscape,
Escaped,
StartEscapeDash,
EscapedDash,
EscapedDashDash,
EscapedOpenTag,
EscapedEndTag,
EscapedNameEndTag,
StartDoubleEscape,
EscapedDouble,
EscapedDoubleDash,
EscapedDoubleDashDash,
EscapedDoubleOpenTag,
EndDoubleEscape
}
// Provider queried (via GetSymbol) to resolve named character references.
private readonly IEntityProvider _resolver;
// Tag name that ScriptData compares candidate end tags against.
// Initialised to the empty string; where it is updated is not visible in this part of the file.
private string _lastStartTag;
// Position recorded by Get() right after it reads the first character of a token.
private TextPosition _position;
// When true, MarkupDeclaration accepts a CDATA section opener.
public bool IsAcceptingCharacterData { get; set; }
// Parse mode that Get() dispatches on (PCData, RCData, Plaintext, Rawtext, Script).
public HtmlParseMode State { get; set; }
// When true, RaiseErrorOccurred throws an HtmlParseException instead of raising Error.
public bool IsStrictMode { get; set; }
// Raised by RaiseErrorOccurred for each parse error when strict mode is off.
public event EventHandler<HtmlErrorEvent> Error;
/// <summary>
/// Creates a tokenizer over the given source. Named character references
/// are resolved through the supplied entity provider.
/// </summary>
/// <param name="source">The text source passed on to the base tokenizer.</param>
/// <param name="resolver">The provider used to look up entity names.</param>
public HtmlTokenizer(TextSource source, IEntityProvider resolver)
    : base(source)
{
    _resolver = resolver;
    _lastStartTag = string.Empty;

    // Defaults: lenient error handling, no CDATA sections, regular data mode.
    IsStrictMode = false;
    IsAcceptingCharacterData = false;
    State = HtmlParseMode.PCData;
}
/// <summary>
/// Reads the next token from the source, dispatching on the current
/// parse mode held in State.
/// </summary>
/// <returns>
/// The token produced by the handler for the current mode, or the result
/// of NewEof(true) when the first character read is the '' sentinel or
/// State matches none of the handled modes.
/// </returns>
public HtmlToken Get()
{
char next = GetNext();
// Record the position right after reading the first character of the token.
_position = GetCurrentPosition();
// NOTE(review): '' presumably is the end-of-input sentinel whose literal was lost in extraction - confirm.
if (next != '') {
switch (State) {
case HtmlParseMode.PCData:
return Data(next);
case HtmlParseMode.RCData:
return RCData(next);
case HtmlParseMode.Plaintext:
return Plaintext(next);
case HtmlParseMode.Rawtext:
return Rawtext(next);
case HtmlParseMode.Script:
return ScriptData(next);
}
}
return NewEof(true);
}
/// <summary>
/// Reports a parse error. In strict mode the error is thrown as an
/// exception; otherwise subscribers of the Error event are notified.
/// </summary>
/// <param name="code">The parse error that was detected.</param>
/// <param name="position">The source position associated with the error.</param>
/// <exception cref="HtmlParseException">Thrown when IsStrictMode is set.</exception>
internal void RaiseErrorOccurred(HtmlParseError code, TextPosition position)
{
    // Take a snapshot of the subscribers before doing anything else.
    EventHandler<HtmlErrorEvent> handler = this.Error;

    if (IsStrictMode) {
        throw new HtmlParseException(code.GetCode(), "Error while parsing the provided HTML document.", position);
    }

    if (handler == null)
        return;

    handler(this, new HtmlErrorEvent(code, position));
}
/// <summary>
/// Entry point for regular data: a less-than sign opens a tag, anything
/// else is collected as text.
/// </summary>
/// <param name="c">The current character.</param>
private HtmlToken Data(char c)
{
    return c == '<' ? TagOpen(GetNext()) : DataText(c);
}
/// <summary>
/// Collects text in regular data mode until a less-than sign or the ''
/// sentinel is reached, then emits the buffered characters.
/// </summary>
/// <param name="c">The first character to process.</param>
/// <returns>The result of NewCharacter() for the collected text.</returns>
private HtmlToken DataText(char c)
{
// NOTE(review): '' presumably is the end-of-input sentinel and ' ' the NUL character
// (it raises HtmlParseError.Null); the literals look damaged by extraction - confirm.
while (true) {
switch (c) {
case '<':
case '':
// Step back so the terminating character is seen again by the next read.
Back();
return NewCharacter();
case '&':
AppendCharacterReference(GetNext(), ' ');
break;
case ' ':
// Reported as an error; nothing is appended to the buffer for this character.
RaiseErrorOccurred(HtmlParseError.Null);
break;
default:
base.StringBuffer.Append(c);
break;
}
c = GetNext();
}
}
/// <summary>
/// Collects text in plaintext mode. Only the '' sentinel ends the run;
/// every other character, including markup characters, is buffered.
/// </summary>
/// <param name="c">The first character to process.</param>
/// <returns>The result of NewCharacter() for the collected text.</returns>
private HtmlToken Plaintext(char c)
{
// NOTE(review): ' ' presumably is the NUL character and '' the end-of-input sentinel - confirm.
while (true) {
switch (c) {
case ' ':
AppendReplacement();
break;
case '':
Back();
return NewCharacter();
default:
base.StringBuffer.Append(c);
break;
}
c = GetNext();
}
}
/// <summary>
/// Entry point for RCDATA content: a less-than sign may start an end
/// tag, anything else is collected as text.
/// </summary>
/// <param name="c">The current character.</param>
private HtmlToken RCData(char c)
{
    return c == '<' ? RCDataLt(GetNext()) : RCDataText(c);
}
/// <summary>
/// Collects RCDATA text, resolving character references, until a
/// less-than sign or the '' sentinel is reached.
/// </summary>
/// <param name="c">The first character to process.</param>
/// <returns>The result of NewCharacter() for the collected text.</returns>
private HtmlToken RCDataText(char c)
{
// NOTE(review): ' ' presumably is the NUL character and '' the end-of-input sentinel - confirm.
while (true) {
switch (c) {
case '&':
AppendCharacterReference(GetNext(), ' ');
break;
case '<':
case '':
// Step back so the terminating character is seen again by the next read.
Back();
return NewCharacter();
case ' ':
AppendReplacement();
break;
default:
base.StringBuffer.Append(c);
break;
}
c = GetNext();
}
}
/// <summary>
/// Handles the character after a less-than sign in RCDATA content.
/// A slash followed by an ASCII letter starts an end-tag name; anything
/// else puts the consumed markup characters back into the text buffer.
/// </summary>
/// <param name="c">The character following the less-than sign.</param>
private HtmlToken RCDataLt(char c)
{
    if (c != '/') {
        // Not an end tag: the less-than sign is ordinary text.
        StringBuffer.Append('<');
        return RCDataText(c);
    }

    char first = GetNext();
    bool isUpper = first.IsUppercaseAscii();

    if (isUpper || first.IsLowercaseAscii()) {
        // Names are buffered in lowercase.
        StringBuffer.Append(isUpper ? char.ToLowerInvariant(first) : first);
        return RCDataNameEndTag(GetNext());
    }

    StringBuffer.Append('<').Append('/');
    return RCDataText(first);
}
/// <summary>
/// Reads the name of a potential end tag in RCDATA content. Letters are
/// buffered in lowercase; the first non-letter that does not yield a
/// token turns the buffered name back into text.
/// </summary>
/// <param name="c">The character after the first letter of the name.</param>
private HtmlToken RCDataNameEndTag(char c)
{
    for (;;) {
        // NOTE(review): CreateIfAppropriate is defined outside the visible part of the
        // file; presumably it emits the end tag when the buffered name fits - confirm.
        HtmlToken endTag = CreateIfAppropriate(c);
        if (endTag != null)
            return endTag;

        if (c.IsUppercaseAscii()) {
            StringBuffer.Append(char.ToLowerInvariant(c));
        } else if (c.IsLowercaseAscii()) {
            StringBuffer.Append(c);
        } else {
            // Not an end tag after all: restore the consumed prefix and continue as text.
            StringBuffer.Insert(0, '<').Insert(1, '/');
            return RCDataText(c);
        }

        c = GetNext();
    }
}
/// <summary>
/// Entry point for raw text content: a less-than sign may start an end
/// tag, anything else is collected as text.
/// </summary>
/// <param name="c">The current character.</param>
private HtmlToken Rawtext(char c)
{
    return c == '<' ? RawtextLT(GetNext()) : RawtextText(c);
}
/// <summary>
/// Collects raw text until a less-than sign or the '' sentinel is
/// reached. Character references are not resolved here.
/// </summary>
/// <param name="c">The first character to process.</param>
/// <returns>The result of NewCharacter() for the collected text.</returns>
private HtmlToken RawtextText(char c)
{
// NOTE(review): ' ' presumably is the NUL character and '' the end-of-input sentinel - confirm.
while (true) {
switch (c) {
case '<':
case '':
// Step back so the terminating character is seen again by the next read.
Back();
return NewCharacter();
case ' ':
AppendReplacement();
break;
default:
base.StringBuffer.Append(c);
break;
}
c = GetNext();
}
}
/// <summary>
/// Handles the character after a less-than sign in raw text content.
/// A slash followed by an ASCII letter starts an end-tag name; anything
/// else puts the consumed markup characters back into the text buffer.
/// </summary>
/// <param name="c">The character following the less-than sign.</param>
private HtmlToken RawtextLT(char c)
{
    if (c != '/') {
        // Not an end tag: the less-than sign is ordinary text.
        StringBuffer.Append('<');
        return RawtextText(c);
    }

    char first = GetNext();
    bool isUpper = first.IsUppercaseAscii();

    if (isUpper || first.IsLowercaseAscii()) {
        // Names are buffered in lowercase.
        StringBuffer.Append(isUpper ? char.ToLowerInvariant(first) : first);
        return RawtextNameEndTag(GetNext());
    }

    StringBuffer.Append('<').Append('/');
    return RawtextText(first);
}
/// <summary>
/// Reads the name of a potential end tag in raw text content. Letters
/// are buffered in lowercase; the first non-letter that does not yield
/// a token turns the buffered name back into text.
/// </summary>
/// <param name="c">The character after the first letter of the name.</param>
private HtmlToken RawtextNameEndTag(char c)
{
    for (;;) {
        // NOTE(review): CreateIfAppropriate is defined outside the visible part of the
        // file; presumably it emits the end tag when the buffered name fits - confirm.
        HtmlToken endTag = CreateIfAppropriate(c);
        if (endTag != null)
            return endTag;

        if (c.IsUppercaseAscii()) {
            StringBuffer.Append(char.ToLowerInvariant(c));
        } else if (c.IsLowercaseAscii()) {
            StringBuffer.Append(c);
        } else {
            // Not an end tag after all: restore the consumed prefix and continue as text.
            StringBuffer.Insert(0, '<').Insert(1, '/');
            return RawtextText(c);
        }

        c = GetNext();
    }
}
/// <summary>
/// Collects the content of a CDATA section until the closing "]]&gt;"
/// sequence or the '' sentinel is reached.
/// </summary>
/// <param name="c">The first character of the section content.</param>
/// <returns>The result of NewCharacter() for the collected content.</returns>
private HtmlToken CharacterData(char c)
{
// NOTE(review): '' presumably is the end-of-input sentinel - confirm.
while (true) {
switch (c) {
case '':
Back();
goto IL_0042;
case ']':
{
// A lone ']' that does not start the terminator is ordinary content.
if (!ContinuesWithSensitive("]]>"))
break;
// Skip the remaining two characters of the terminator.
Advance(2);
goto IL_0042;
}
IL_0042:
return NewCharacter();
}
base.StringBuffer.Append(c);
c = GetNext();
}
}
/// <summary>
/// Handles the character after an ampersand: appends either the
/// resolved reference text or a literal ampersand to the buffer.
/// </summary>
/// <param name="c">The character following the ampersand.</param>
/// <param name="allowedCharacter">
/// An extra character that, like whitespace, '&lt;', '&amp;' and the ''
/// sentinel, means no reference follows. ParseAttributes passes the quote
/// character or '&gt;' here.
/// </param>
private void AppendCharacterReference(char c, char allowedCharacter = ' ')
{
// NOTE(review): the ' ' default presumably is the NUL character and '' the end-of-input sentinel - confirm.
if (c.IsSpaceCharacter() || c == '<' || c == '' || c == '&' || c == allowedCharacter) {
// Not a reference: leave c for the caller and keep the ampersand as text.
Back();
base.StringBuffer.Append('&');
} else {
string text = null;
// '#' introduces a numeric reference; anything else is looked up by name.
text = ((c != '#') ? GetLookupCharacterReference(c, allowedCharacter) : GetNumericCharacterReference(GetNext()));
if (text == null)
base.StringBuffer.Append('&');
else
base.StringBuffer.Append(text);
}
}
/// <summary>
/// Consumes a numeric character reference (the part after the '#') and
/// returns the text it stands for.
/// </summary>
/// <param name="c">
/// The character following the '#'. An 'x' or 'X' selects hexadecimal
/// digits; otherwise decimal digits are read starting with this character.
/// </param>
/// <returns>
/// The text for the referenced code point, or null when no digits were
/// found. In the null case the input is rewound so that the caller can
/// emit the ampersand literally.
/// </returns>
/// <remarks>
/// The value saturates at 0x110000 (one past the largest code point)
/// instead of being accumulated in wrapping 32-bit arithmetic. Without
/// this, a long digit run such as "&amp;#4294967361;" wraps around to a
/// small, valid-looking code point, and a negative wrap can reach the
/// UTF-32 conversion. Every value above 0x10FFFF is therefore handled the
/// same way as 0x110000.
/// </remarks>
private string GetNumericCharacterReference(char c)
{
    const int OutOfRange = 0x110000;

    int radix = 10;
    int digitCount = 0;
    int code = 0;
    bool isHex = c == 'x' || c == 'X';

    if (isHex) {
        radix = 16;
        // The hex marker itself is not a digit; start with the character after it.
        c = GetNext();
    }

    while (isHex ? c.IsHex() : c.IsDigit()) {
        // code < OutOfRange keeps code * radix + digit far below int.MaxValue,
        // so this arithmetic can never overflow.
        if (code < OutOfRange)
            code = Math.Min(code * radix + c.FromHex(), OutOfRange);

        digitCount++;
        c = GetNext();
    }

    if (digitCount == 0) {
        // Rewind over the non-digit and the '#' (and the hex marker, if any).
        Back(2);
        if (isHex)
            Back();
        RaiseErrorOccurred(HtmlParseError.CharacterReferenceWrongNumber);
        return null;
    }

    if (c != ';') {
        // The terminator is missing; leave the current character for the caller.
        RaiseErrorOccurred(HtmlParseError.CharacterReferenceSemicolonMissing);
        Back();
    }

    if (HtmlEntityService.IsInCharacterTable(code)) {
        RaiseErrorOccurred(HtmlParseError.CharacterReferenceInvalidCode);
        return HtmlEntityService.GetSymbolFromTable(code);
    }

    if (HtmlEntityService.IsInvalidNumber(code)) {
        RaiseErrorOccurred(HtmlParseError.CharacterReferenceInvalidNumber);
        // U+FFFD REPLACEMENT CHARACTER.
        return "\uFFFD";
    }

    if (HtmlEntityService.IsInInvalidRange(code))
        RaiseErrorOccurred(HtmlParseError.CharacterReferenceInvalidRange);

    return code.ConvertFromUtf32();
}
/// <summary>
/// Consumes a named character reference and returns the text it
/// resolves to, trying progressively shorter names when the full name
/// is unknown.
/// </summary>
/// <param name="c">
/// The character after the ampersand. The body does not read this
/// parameter; it starts from base.Current instead.
/// </param>
/// <param name="allowedCharacter">
/// The additional allowed character passed by the caller. A non-zero
/// value enables the check for '=' or an alphanumeric character after an
/// unterminated name (ParseAttributes passes the quote character or '&gt;').
/// </param>
/// <returns>The resolved text, or null when nothing could be resolved.</returns>
private string GetLookupCharacterReference(char c, char allowedCharacter)
{
string text = null;
// Remember where to rewind to if the reference must be treated as plain text.
int insertionPoint = base.InsertionPoint - 1;
// At most 31 name characters are collected, leaving room for a trailing ';'.
char[] array = new char[32];
int num = 0;
char c2 = base.Current;
// NOTE(review): '' presumably is the end-of-input sentinel - confirm.
while (c2 != ';' && c2.IsName()) {
array[num++] = c2;
c2 = GetNext();
if (c2 == '' || num >= 31)
break;
}
// First try the full name including the terminating ';'.
if (c2 == ';') {
array[num] = ';';
string name = new string(array, 0, num + 1);
text = _resolver.GetSymbol(name);
}
// Otherwise try shorter and shorter prefixes, stepping the input back one character per failure.
while (text == null && num > 0) {
string name2 = new string(array, 0, num--);
text = _resolver.GetSymbol(name2);
if (text == null)
Back();
}
c2 = base.Current;
if (c2 != ';') {
if (allowedCharacter != 0 && (c2 == '=' || c2.IsAlphanumericAscii())) {
if (c2 == '=')
RaiseErrorOccurred(HtmlParseError.CharacterReferenceAttributeEqualsFound);
// Rewind to the saved point and report no match, so the caller appends a literal ampersand.
base.InsertionPoint = insertionPoint;
return null;
}
Back();
RaiseErrorOccurred(HtmlParseError.CharacterReferenceNotTerminated);
}
return text;
}
/// <summary>
/// Handles the character after a less-than sign in regular data and
/// decides between an end tag, a start tag, a markup declaration, a
/// bogus comment, or plain text.
/// </summary>
/// <param name="c">The character following the less-than sign.</param>
private HtmlToken TagOpen(char c)
{
    if (c == '/')
        return TagEnd(GetNext());

    bool isLower = c.IsLowercaseAscii();

    if (isLower || c.IsUppercaseAscii()) {
        // Tag names are buffered in lowercase.
        StringBuffer.Append(isLower ? c : char.ToLowerInvariant(c));
        return TagName(NewTagOpen());
    }

    if (c == '!')
        return MarkupDeclaration(GetNext());

    if (c == '?') {
        RaiseErrorOccurred(HtmlParseError.BogusComment);
        return BogusComment(c);
    }

    // Anything else: the less-than sign was just text.
    State = HtmlParseMode.PCData;
    RaiseErrorOccurred(HtmlParseError.AmbiguousOpenTag);
    StringBuffer.Append('<');
    return DataText(c);
}
/// <summary>
/// Handles the character that follows a less-than sign and a slash.
/// An ASCII letter starts an end-tag name; other characters are errors.
/// </summary>
/// <param name="c">The character following the slash.</param>
private HtmlToken TagEnd(char c)
{
if (c.IsLowercaseAscii()) {
base.StringBuffer.Append(c);
return TagName(NewTagClose());
}
if (!c.IsUppercaseAscii()) {
switch (c) {
case '>':
// An end tag without a name: report it and carry on with regular data.
State = HtmlParseMode.PCData;
RaiseErrorOccurred(HtmlParseError.TagClosedWrong);
return Data(GetNext());
// NOTE(review): '' presumably is the end-of-input sentinel (HtmlParseError.EOF is raised) - confirm.
case '':
Back();
RaiseErrorOccurred(HtmlParseError.EOF);
// The two consumed markup characters are emitted as text.
base.StringBuffer.Append('<').Append('/');
return NewCharacter();
default:
RaiseErrorOccurred(HtmlParseError.BogusComment);
return BogusComment(c);
}
}
// Uppercase letter: tag names are buffered in lowercase.
base.StringBuffer.Append(char.ToLowerInvariant(c));
return TagName(NewTagClose());
}
/// <summary>
/// Reads the remainder of a tag name into the buffer and hands the tag
/// to the appropriate follow-up step.
/// </summary>
/// <param name="tag">The tag token whose name is being read.</param>
/// <returns>
/// The emitted tag, the result of attribute or self-closing handling,
/// or NewEof(false) when the '' sentinel is reached inside the name.
/// </returns>
private HtmlToken TagName(HtmlTagToken tag)
{
// NOTE(review): ' ' presumably is the NUL character and '' the end-of-input sentinel - confirm.
while (true) {
char next = GetNext();
if (next == '>') {
tag.Name = FlushBuffer();
return EmitTag(tag);
}
if (next.IsSpaceCharacter()) {
tag.Name = FlushBuffer();
return ParseAttributes(tag);
}
// A slash ends the name; handled after the loop.
if (next == '/')
break;
if (next.IsUppercaseAscii())
base.StringBuffer.Append(char.ToLowerInvariant(next));
else {
switch (next) {
case ' ':
AppendReplacement();
break;
default:
base.StringBuffer.Append(next);
break;
case '':
return NewEof(false);
}
}
}
tag.Name = FlushBuffer();
return TagSelfClosing(tag);
}
/// <summary>
/// Handles the character after a slash inside a tag. A following '&gt;'
/// marks the tag as self-closing; anything else is a misplaced slash.
/// </summary>
/// <param name="tag">The tag token being built.</param>
private HtmlToken TagSelfClosing(HtmlTagToken tag)
{
switch (GetNext()) {
case '>':
tag.IsSelfClosing = true;
return EmitTag(tag);
// NOTE(review): '' presumably is the end-of-input sentinel - confirm.
case '':
return NewEof(false);
default:
RaiseErrorOccurred(HtmlParseError.ClosingSlashMisplaced);
// Re-read the character as the start of an attribute.
Back();
return ParseAttributes(tag);
}
}
/// <summary>
/// Handles the input after a less-than sign and an exclamation mark:
/// a comment, a doctype, a CDATA section (only when accepted), or
/// otherwise a bogus comment.
/// </summary>
/// <param name="c">The character following the exclamation mark.</param>
private HtmlToken MarkupDeclaration(char c)
{
    if (ContinuesWithSensitive("--")) {
        Advance();
        char afterDashes = GetNext();
        return CommentStart(afterDashes);
    }

    if (ContinuesWithInsensitive(TagNames.Doctype)) {
        Advance(6);
        char afterKeyword = GetNext();
        return Doctype(afterKeyword);
    }

    // The CDATA opener is only probed when character data is accepted.
    bool isCharacterData = IsAcceptingCharacterData && ContinuesWithSensitive(Keywords.CData);

    if (!isCharacterData) {
        RaiseErrorOccurred(HtmlParseError.UndefinedMarkupDeclaration);
        return BogusComment(c);
    }

    Advance(6);
    return CharacterData(GetNext());
}
/// <summary>
/// Collects everything up to the next '&gt;' or the '' sentinel as the
/// content of a comment token.
/// </summary>
/// <param name="c">The first character of the comment content.</param>
/// <returns>The result of NewComment() for the collected content.</returns>
private HtmlToken BogusComment(char c)
{
// Discard whatever was buffered before the bogus comment started.
base.StringBuffer.Clear();
// NOTE(review): '' presumably is the end-of-input sentinel and ' ' the NUL character - confirm.
while (true) {
switch (c) {
case '':
Back();
goto case '>';
case ' ':
// Substituted with U+FFFD, which is appended below.
c = '�';
break;
case '>':
State = HtmlParseMode.PCData;
return NewComment();
}
base.StringBuffer.Append(c);
c = GetNext();
}
}
/// <summary>
/// Handles the first character after the opening two dashes of a comment.
/// </summary>
/// <param name="c">The character following the opening dashes.</param>
/// <returns>The comment token, possibly empty when it ends immediately.</returns>
private HtmlToken CommentStart(char c)
{
base.StringBuffer.Clear();
// NOTE(review): ' ' presumably is the NUL character and '' the end-of-input sentinel - confirm.
switch (c) {
case '-':
// CommentDashStart can return null (via CommentEnd); then reading continues as regular comment content.
return CommentDashStart(GetNext()) ?? Comment(GetNext());
case ' ':
AppendReplacement();
return Comment(GetNext());
case '>':
State = HtmlParseMode.PCData;
RaiseErrorOccurred(HtmlParseError.TagClosedWrong);
break;
case '':
RaiseErrorOccurred(HtmlParseError.EOF);
Back();
break;
default:
base.StringBuffer.Append(c);
return Comment(GetNext());
}
return NewComment();
}
/// <summary>
/// Handles the character after a dash that directly follows the
/// comment opener.
/// </summary>
/// <param name="c">The character following that dash.</param>
/// <returns>
/// The comment token, or null when CommentEnd returns null, meaning the
/// comment content continues.
/// </returns>
private HtmlToken CommentDashStart(char c)
{
// NOTE(review): ' ' presumably is the NUL character and '' the end-of-input sentinel - confirm.
switch (c) {
case '-':
return CommentEnd(GetNext());
case ' ':
RaiseErrorOccurred(HtmlParseError.Null);
base.StringBuffer.Append('-').Append('�');
return Comment(GetNext());
case '>':
State = HtmlParseMode.PCData;
RaiseErrorOccurred(HtmlParseError.TagClosedWrong);
break;
case '':
RaiseErrorOccurred(HtmlParseError.EOF);
Back();
break;
default:
// The dash was ordinary content; keep it together with the current character.
base.StringBuffer.Append('-').Append(c);
return Comment(GetNext());
}
return NewComment();
}
/// <summary>
/// Collects comment content until the comment is closed or the ''
/// sentinel is reached.
/// </summary>
/// <param name="c">The first character of the content to process.</param>
/// <returns>The result of NewComment() for the collected content.</returns>
private HtmlToken Comment(char c)
{
// NOTE(review): ' ' presumably is the NUL character and '' the end-of-input sentinel - confirm.
while (true) {
switch (c) {
case '-': {
// A null result means the dash did not close the comment; keep collecting.
HtmlToken htmlToken = CommentDashEnd(GetNext());
if (htmlToken != null)
return htmlToken;
break;
}
case '':
RaiseErrorOccurred(HtmlParseError.EOF);
Back();
return NewComment();
case ' ':
AppendReplacement();
break;
default:
base.StringBuffer.Append(c);
break;
}
c = GetNext();
}
}
/// <summary>
/// Handles the character after a single dash inside comment content.
/// </summary>
/// <param name="c">The character following the dash.</param>
/// <returns>
/// A comment token when the comment ends here, or null when the dash
/// was ordinary content. In the null case the dash and the current
/// character have been appended to the buffer.
/// </returns>
private HtmlToken CommentDashEnd(char c)
{
// NOTE(review): ' ' presumably is the NUL character and '' the end-of-input sentinel - confirm.
switch (c) {
case '-':
return CommentEnd(GetNext());
case '':
RaiseErrorOccurred(HtmlParseError.EOF);
Back();
return NewComment();
case ' ':
RaiseErrorOccurred(HtmlParseError.Null);
c = '�';
break;
}
base.StringBuffer.Append('-').Append(c);
return null;
}
/// <summary>
/// Handles the input after two consecutive dashes inside a comment.
/// </summary>
/// <param name="c">The character following the two dashes.</param>
/// <returns>
/// A comment token when the comment ends, or null when the dashes were
/// ordinary content that has been appended to the buffer.
/// </returns>
private HtmlToken CommentEnd(char c)
{
// NOTE(review): ' ' presumably is the NUL character and '' the end-of-input sentinel - confirm.
while (true) {
switch (c) {
case '>':
State = HtmlParseMode.PCData;
return NewComment();
case ' ':
RaiseErrorOccurred(HtmlParseError.Null);
base.StringBuffer.Append('-').Append('�');
return null;
case '!':
RaiseErrorOccurred(HtmlParseError.CommentEndedWithEM);
return CommentBangEnd(GetNext());
case '-':
// Another dash: handled below the switch, then the loop continues.
break;
case '':
RaiseErrorOccurred(HtmlParseError.EOF);
Back();
return NewComment();
default:
RaiseErrorOccurred(HtmlParseError.CommentEndedUnexpected);
base.StringBuffer.Append('-').Append('-').Append(c);
return null;
}
// Reached only for an additional dash: keep one dash as content and look at the next character.
RaiseErrorOccurred(HtmlParseError.CommentEndedWithDash);
base.StringBuffer.Append('-');
c = GetNext();
}
}
/// <summary>
/// Handles the character after two dashes and an exclamation mark
/// inside a comment.
/// </summary>
/// <param name="c">The character following the exclamation mark.</param>
/// <returns>
/// A comment token when the comment ends, or null when the sequence was
/// ordinary content that has been appended to the buffer.
/// </returns>
private HtmlToken CommentBangEnd(char c)
{
// NOTE(review): ' ' presumably is the NUL character and '' the end-of-input sentinel - confirm.
switch (c) {
case '-':
base.StringBuffer.Append('-').Append('-').Append('!');
return CommentDashEnd(GetNext());
case '>':
State = HtmlParseMode.PCData;
break;
case ' ':
RaiseErrorOccurred(HtmlParseError.Null);
base.StringBuffer.Append('-').Append('-').Append('!')
.Append('�');
return null;
case '':
RaiseErrorOccurred(HtmlParseError.EOF);
Back();
break;
default:
base.StringBuffer.Append('-').Append('-').Append('!')
.Append(c);
return null;
}
return NewComment();
}
/// <summary>
/// Handles the character directly after the doctype keyword.
/// </summary>
/// <param name="c">The character following the keyword.</param>
private HtmlToken Doctype(char c)
{
if (c.IsSpaceCharacter())
return DoctypeNameBefore(GetNext());
// NOTE(review): '' presumably is the end-of-input sentinel - confirm.
if (c == '') {
RaiseErrorOccurred(HtmlParseError.EOF);
Back();
return NewDoctype(true);
}
// No whitespace after the keyword: report it and treat c as the start of the name part.
RaiseErrorOccurred(HtmlParseError.DoctypeUnexpected);
return DoctypeNameBefore(c);
}
/// <summary>
/// Skips whitespace before the doctype name and starts reading the name.
/// </summary>
/// <param name="c">The first character to inspect.</param>
/// <returns>
/// The doctype token. NewDoctype is called with true when the doctype
/// ends before a name was found, and with false otherwise.
/// </returns>
private HtmlToken DoctypeNameBefore(char c)
{
while (c.IsSpaceCharacter()) {
c = GetNext();
}
// NOTE(review): ' ' presumably is the NUL character and '' the end-of-input sentinel - confirm.
if (!c.IsUppercaseAscii()) {
switch (c) {
case ' ': {
HtmlDoctypeToken doctype2 = NewDoctype(false);
AppendReplacement();
return DoctypeName(doctype2);
}
case '>': {
HtmlDoctypeToken result2 = NewDoctype(true);
State = HtmlParseMode.PCData;
RaiseErrorOccurred(HtmlParseError.TagClosedWrong);
return result2;
}
case '': {
HtmlDoctypeToken result = NewDoctype(true);
RaiseErrorOccurred(HtmlParseError.EOF);
Back();
return result;
}
default: {
HtmlDoctypeToken doctype = NewDoctype(false);
base.StringBuffer.Append(c);
return DoctypeName(doctype);
}
}
}
// Uppercase first letter: the name is buffered in lowercase.
HtmlDoctypeToken doctype3 = NewDoctype(false);
base.StringBuffer.Append(char.ToLowerInvariant(c));
return DoctypeName(doctype3);
}
/// <summary>
/// Reads the remainder of the doctype name into the buffer and assigns
/// it to the token once the name ends.
/// </summary>
/// <param name="doctype">The doctype token being built.</param>
/// <returns>The doctype token, or the result of DoctypeNameAfter when whitespace follows the name.</returns>
private HtmlToken DoctypeName(HtmlDoctypeToken doctype)
{
// NOTE(review): ' ' presumably is the NUL character and '' the end-of-input sentinel - confirm.
while (true) {
char next = GetNext();
if (next.IsSpaceCharacter()) {
doctype.Name = FlushBuffer();
return DoctypeNameAfter(doctype);
}
if (next == '>') {
State = HtmlParseMode.PCData;
doctype.Name = FlushBuffer();
break;
}
if (next.IsUppercaseAscii())
base.StringBuffer.Append(char.ToLowerInvariant(next));
else {
switch (next) {
case ' ':
// Falls out of the switch to AppendReplacement() below.
break;
case '':
goto IL_006c;
default:
goto IL_008e;
}
AppendReplacement();
}
continue;
IL_008e:
// Ordinary name character.
base.StringBuffer.Append(next);
continue;
IL_006c:
// Input ended inside the name: force quirks and keep what was read.
RaiseErrorOccurred(HtmlParseError.EOF);
Back();
doctype.IsQuirksForced = true;
doctype.Name = FlushBuffer();
break;
}
return doctype;
}
/// <summary>
/// Handles the input after the doctype name: the end of the doctype, or
/// one of the keywords held in Keywords.Public and Keywords.System.
/// </summary>
/// <param name="doctype">The doctype token being built.</param>
private HtmlToken DoctypeNameAfter(HtmlDoctypeToken doctype)
{
switch (SkipSpaces()) {
case '>':
State = HtmlParseMode.PCData;
break;
// NOTE(review): '' presumably is the end-of-input sentinel - confirm.
case '':
RaiseErrorOccurred(HtmlParseError.EOF);
Back();
doctype.IsQuirksForced = true;
break;
default:
// NOTE(review): Advance(5) presumably moves to the last character of a six-letter keyword - confirm.
if (ContinuesWithInsensitive(Keywords.Public)) {
Advance(5);
return DoctypePublic(doctype);
}
if (ContinuesWithInsensitive(Keywords.System)) {
Advance(5);
return DoctypeSystem(doctype);
}
RaiseErrorOccurred(HtmlParseError.DoctypeUnexpectedAfterName);
doctype.IsQuirksForced = true;
return BogusDoctype(doctype);
}
return doctype;
}
/// <summary>
/// Handles the character directly after the public keyword of a doctype.
/// </summary>
/// <param name="doctype">The doctype token being built.</param>
private HtmlToken DoctypePublic(HtmlDoctypeToken doctype)
{
char next = GetNext();
if (next.IsSpaceCharacter())
return DoctypePublicIdentifierBefore(doctype);
switch (next) {
// A quote directly after the keyword (no whitespace) is reported but still starts the identifier.
case '"':
RaiseErrorOccurred(HtmlParseError.DoubleQuotationMarkUnexpected);
doctype.PublicIdentifier = string.Empty;
return DoctypePublicIdentifierDoubleQuoted(doctype);
case '\'':
RaiseErrorOccurred(HtmlParseError.SingleQuotationMarkUnexpected);
doctype.PublicIdentifier = string.Empty;
return DoctypePublicIdentifierSingleQuoted(doctype);
case '>':
State = HtmlParseMode.PCData;
RaiseErrorOccurred(HtmlParseError.TagClosedWrong);
doctype.IsQuirksForced = true;
break;
// NOTE(review): '' presumably is the end-of-input sentinel - confirm.
case '':
RaiseErrorOccurred(HtmlParseError.EOF);
doctype.IsQuirksForced = true;
Back();
break;
default:
RaiseErrorOccurred(HtmlParseError.DoctypePublicInvalid);
doctype.IsQuirksForced = true;
return BogusDoctype(doctype);
}
return doctype;
}
/// <summary>
/// Skips whitespace after the public keyword and expects the opening
/// quote of the public identifier.
/// </summary>
/// <param name="doctype">The doctype token being built.</param>
private HtmlToken DoctypePublicIdentifierBefore(HtmlDoctypeToken doctype)
{
switch (SkipSpaces()) {
case '"':
doctype.PublicIdentifier = string.Empty;
return DoctypePublicIdentifierDoubleQuoted(doctype);
case '\'':
doctype.PublicIdentifier = string.Empty;
return DoctypePublicIdentifierSingleQuoted(doctype);
case '>':
State = HtmlParseMode.PCData;
RaiseErrorOccurred(HtmlParseError.TagClosedWrong);
doctype.IsQuirksForced = true;
break;
// NOTE(review): '' presumably is the end-of-input sentinel - confirm.
case '':
RaiseErrorOccurred(HtmlParseError.EOF);
doctype.IsQuirksForced = true;
Back();
break;
default:
RaiseErrorOccurred(HtmlParseError.DoctypePublicInvalid);
doctype.IsQuirksForced = true;
return BogusDoctype(doctype);
}
return doctype;
}
/// <summary>
/// Reads a public identifier delimited by double quotes.
/// </summary>
/// <param name="doctype">The doctype token being built.</param>
/// <returns>
/// The doctype token (with quirks forced when the identifier is cut
/// short by '&gt;' or the '' sentinel), or the result of
/// DoctypePublicIdentifierAfter once the closing quote is found.
/// </returns>
private HtmlToken DoctypePublicIdentifierDoubleQuoted(HtmlDoctypeToken doctype)
{
// NOTE(review): ' ' presumably is the NUL character and '' the end-of-input sentinel - confirm.
while (true) {
char next = GetNext();
switch (next) {
case '"':
doctype.PublicIdentifier = FlushBuffer();
return DoctypePublicIdentifierAfter(doctype);
case ' ':
AppendReplacement();
break;
case '>':
State = HtmlParseMode.PCData;
RaiseErrorOccurred(HtmlParseError.TagClosedWrong);
doctype.IsQuirksForced = true;
doctype.PublicIdentifier = FlushBuffer();
goto IL_0090;
case '':
RaiseErrorOccurred(HtmlParseError.EOF);
Back();
doctype.IsQuirksForced = true;
doctype.PublicIdentifier = FlushBuffer();
goto IL_0090;
default:
{
base.StringBuffer.Append(next);
break;
}
IL_0090:
return doctype;
}
}
}
/// <summary>
/// Reads a public identifier delimited by single quotes.
/// </summary>
/// <param name="doctype">The doctype token being built.</param>
/// <returns>
/// The doctype token (with quirks forced when the identifier is cut
/// short by '&gt;' or the '' sentinel), or the result of
/// DoctypePublicIdentifierAfter once the closing quote is found.
/// </returns>
private HtmlToken DoctypePublicIdentifierSingleQuoted(HtmlDoctypeToken doctype)
{
// NOTE(review): ' ' presumably is the NUL character and '' the end-of-input sentinel - confirm.
while (true) {
char next = GetNext();
switch (next) {
case '\'':
doctype.PublicIdentifier = FlushBuffer();
return DoctypePublicIdentifierAfter(doctype);
case ' ':
AppendReplacement();
break;
case '>':
State = HtmlParseMode.PCData;
RaiseErrorOccurred(HtmlParseError.TagClosedWrong);
doctype.IsQuirksForced = true;
doctype.PublicIdentifier = FlushBuffer();
goto IL_0090;
case '':
RaiseErrorOccurred(HtmlParseError.EOF);
doctype.IsQuirksForced = true;
doctype.PublicIdentifier = FlushBuffer();
Back();
goto IL_0090;
default:
{
base.StringBuffer.Append(next);
break;
}
IL_0090:
return doctype;
}
}
}
/// <summary>
/// Handles the character directly after the closing quote of the
/// public identifier.
/// </summary>
/// <param name="doctype">The doctype token being built.</param>
private HtmlToken DoctypePublicIdentifierAfter(HtmlDoctypeToken doctype)
{
char next = GetNext();
if (next.IsSpaceCharacter())
return DoctypeBetween(doctype);
switch (next) {
case '>':
State = HtmlParseMode.PCData;
break;
// A quote without separating whitespace is reported but still starts the system identifier.
case '"':
RaiseErrorOccurred(HtmlParseError.DoubleQuotationMarkUnexpected);
doctype.SystemIdentifier = string.Empty;
return DoctypeSystemIdentifierDoubleQuoted(doctype);
case '\'':
RaiseErrorOccurred(HtmlParseError.SingleQuotationMarkUnexpected);
doctype.SystemIdentifier = string.Empty;
return DoctypeSystemIdentifierSingleQuoted(doctype);
// NOTE(review): '' presumably is the end-of-input sentinel - confirm.
case '':
RaiseErrorOccurred(HtmlParseError.EOF);
doctype.IsQuirksForced = true;
Back();
break;
default:
RaiseErrorOccurred(HtmlParseError.DoctypeInvalidCharacter);
doctype.IsQuirksForced = true;
return BogusDoctype(doctype);
}
return doctype;
}
/// <summary>
/// Skips whitespace between the public and system identifiers and
/// expects either the end of the doctype or the opening quote of the
/// system identifier.
/// </summary>
/// <param name="doctype">The doctype token being built.</param>
private HtmlToken DoctypeBetween(HtmlDoctypeToken doctype)
{
switch (SkipSpaces()) {
case '>':
State = HtmlParseMode.PCData;
break;
case '"':
doctype.SystemIdentifier = string.Empty;
return DoctypeSystemIdentifierDoubleQuoted(doctype);
case '\'':
doctype.SystemIdentifier = string.Empty;
return DoctypeSystemIdentifierSingleQuoted(doctype);
// NOTE(review): '' presumably is the end-of-input sentinel - confirm.
case '':
RaiseErrorOccurred(HtmlParseError.EOF);
doctype.IsQuirksForced = true;
Back();
break;
default:
RaiseErrorOccurred(HtmlParseError.DoctypeInvalidCharacter);
doctype.IsQuirksForced = true;
return BogusDoctype(doctype);
}
return doctype;
}
/// <summary>
/// Handles the character directly after the system keyword of a doctype.
/// </summary>
/// <param name="doctype">The doctype token being built.</param>
private HtmlToken DoctypeSystem(HtmlDoctypeToken doctype)
{
char next = GetNext();
if (next.IsSpaceCharacter()) {
// NOTE(review): unlike DoctypePublic, State is set in this whitespace branch and
// not in the '>' branch below - confirm this placement is intended.
State = HtmlParseMode.PCData;
return DoctypeSystemIdentifierBefore(doctype);
}
switch (next) {
// A quote directly after the keyword (no whitespace) is reported but still starts the identifier.
case '"':
RaiseErrorOccurred(HtmlParseError.DoubleQuotationMarkUnexpected);
doctype.SystemIdentifier = string.Empty;
return DoctypeSystemIdentifierDoubleQuoted(doctype);
case '\'':
RaiseErrorOccurred(HtmlParseError.SingleQuotationMarkUnexpected);
doctype.SystemIdentifier = string.Empty;
return DoctypeSystemIdentifierSingleQuoted(doctype);
case '>':
RaiseErrorOccurred(HtmlParseError.TagClosedWrong);
// The identifier is taken from whatever the buffer currently holds.
doctype.SystemIdentifier = FlushBuffer();
doctype.IsQuirksForced = true;
break;
// NOTE(review): '' presumably is the end-of-input sentinel - confirm.
case '':
RaiseErrorOccurred(HtmlParseError.EOF);
doctype.IsQuirksForced = true;
Back();
break;
default:
RaiseErrorOccurred(HtmlParseError.DoctypeSystemInvalid);
doctype.IsQuirksForced = true;
return BogusDoctype(doctype);
}
return doctype;
}
/// <summary>
/// Skips whitespace after the system keyword and expects the opening
/// quote of the system identifier.
/// </summary>
/// <param name="doctype">The doctype token being built.</param>
private HtmlToken DoctypeSystemIdentifierBefore(HtmlDoctypeToken doctype)
{
switch (SkipSpaces()) {
case '"':
doctype.SystemIdentifier = string.Empty;
return DoctypeSystemIdentifierDoubleQuoted(doctype);
case '\'':
doctype.SystemIdentifier = string.Empty;
return DoctypeSystemIdentifierSingleQuoted(doctype);
case '>':
State = HtmlParseMode.PCData;
RaiseErrorOccurred(HtmlParseError.TagClosedWrong);
doctype.IsQuirksForced = true;
// The identifier is taken from whatever the buffer currently holds.
doctype.SystemIdentifier = FlushBuffer();
break;
// NOTE(review): '' presumably is the end-of-input sentinel - confirm.
case '':
RaiseErrorOccurred(HtmlParseError.EOF);
doctype.IsQuirksForced = true;
doctype.SystemIdentifier = FlushBuffer();
Back();
break;
default:
RaiseErrorOccurred(HtmlParseError.DoctypeInvalidCharacter);
doctype.IsQuirksForced = true;
return BogusDoctype(doctype);
}
return doctype;
}
/// <summary>
/// Reads a system identifier delimited by double quotes.
/// </summary>
/// <param name="doctype">The doctype token being built.</param>
/// <returns>
/// The doctype token (with quirks forced when the identifier is cut
/// short by '&gt;' or the '' sentinel), or the result of
/// DoctypeSystemIdentifierAfter once the closing quote is found.
/// </returns>
private HtmlToken DoctypeSystemIdentifierDoubleQuoted(HtmlDoctypeToken doctype)
{
// NOTE(review): ' ' presumably is the NUL character and '' the end-of-input sentinel - confirm.
while (true) {
char next = GetNext();
switch (next) {
case '"':
doctype.SystemIdentifier = FlushBuffer();
return DoctypeSystemIdentifierAfter(doctype);
case ' ':
AppendReplacement();
break;
case '>':
State = HtmlParseMode.PCData;
RaiseErrorOccurred(HtmlParseError.TagClosedWrong);
doctype.IsQuirksForced = true;
doctype.SystemIdentifier = FlushBuffer();
goto IL_0090;
case '':
RaiseErrorOccurred(HtmlParseError.EOF);
doctype.IsQuirksForced = true;
doctype.SystemIdentifier = FlushBuffer();
Back();
goto IL_0090;
default:
{
base.StringBuffer.Append(next);
break;
}
IL_0090:
return doctype;
}
}
}
/// <summary>
/// Reads a system identifier delimited by single quotes.
/// </summary>
/// <param name="doctype">The doctype token being built.</param>
/// <returns>
/// The doctype token (with quirks forced when the identifier is cut
/// short by '&gt;' or the '' sentinel), or the result of
/// DoctypeSystemIdentifierAfter once the closing quote is found.
/// </returns>
private HtmlToken DoctypeSystemIdentifierSingleQuoted(HtmlDoctypeToken doctype)
{
// NOTE(review): ' ' presumably is the NUL character and '' the end-of-input sentinel - confirm.
while (true) {
char next = GetNext();
switch (next) {
case '\'':
doctype.SystemIdentifier = FlushBuffer();
return DoctypeSystemIdentifierAfter(doctype);
case ' ':
AppendReplacement();
break;
case '>':
State = HtmlParseMode.PCData;
RaiseErrorOccurred(HtmlParseError.TagClosedWrong);
doctype.IsQuirksForced = true;
doctype.SystemIdentifier = FlushBuffer();
goto IL_0099;
case '':
RaiseErrorOccurred(HtmlParseError.EOF);
doctype.IsQuirksForced = true;
doctype.SystemIdentifier = FlushBuffer();
Back();
goto IL_0099;
default:
{
base.StringBuffer.Append(next);
break;
}
IL_0099:
return doctype;
}
}
}
/// <summary>
/// Skips whitespace after the system identifier and expects the end of
/// the doctype.
/// </summary>
/// <param name="doctype">The doctype token being built.</param>
private HtmlToken DoctypeSystemIdentifierAfter(HtmlDoctypeToken doctype)
{
switch (SkipSpaces()) {
case '>':
State = HtmlParseMode.PCData;
break;
// NOTE(review): '' presumably is the end-of-input sentinel - confirm.
case '':
RaiseErrorOccurred(HtmlParseError.EOF);
doctype.IsQuirksForced = true;
Back();
break;
default:
// Trailing junk is reported, but quirks mode is not forced in this branch.
RaiseErrorOccurred(HtmlParseError.DoctypeInvalidCharacter);
return BogusDoctype(doctype);
}
return doctype;
}
/// <summary>
/// Discards input up to and including the next '&gt;' (or up to the ''
/// sentinel) and returns the doctype token unchanged.
/// </summary>
/// <param name="doctype">The doctype token being built.</param>
/// <returns>The same doctype token that was passed in.</returns>
private HtmlToken BogusDoctype(HtmlDoctypeToken doctype)
{
// NOTE(review): '' presumably is the end-of-input sentinel - confirm.
while (true) {
switch (GetNext()) {
case '>':
State = HtmlParseMode.PCData;
goto IL_0025;
case '':
{
Back();
goto IL_0025;
}
IL_0025:
return doctype;
}
}
}
/// <summary>
/// Reads the attributes of a tag by running a small state machine over
/// the input until the tag ends.
/// </summary>
/// <param name="tag">The tag token that receives the attributes.</param>
/// <returns>
/// The emitted tag, the result of TagSelfClosing when a slash is found,
/// or NewEof(false) when the '' sentinel is reached inside the tag.
/// </returns>
private HtmlToken ParseAttributes(HtmlTagToken tag)
{
// NOTE(review): throughout this method ' ' presumably is the NUL character and ''
// the end-of-input sentinel; the literals look damaged by extraction - confirm.
AttributeState attributeState = AttributeState.BeforeName;
// c holds the quote character that opened the current quoted value.
char c = '"';
// c2 holds the character currently being examined.
char c2 = ' ';
while (true) {
switch (attributeState) {
case AttributeState.BeforeName:
c2 = SkipSpaces();
switch (c2) {
case '/':
return TagSelfClosing(tag);
case '>':
return EmitTag(tag);
}
if (!c2.IsUppercaseAscii()) {
switch (c2) {
case ' ':
AppendReplacement();
attributeState = AttributeState.Name;
break;
case '"':
case '\'':
case '<':
case '=':
// Reported as invalid, but still taken as the first character of the name.
RaiseErrorOccurred(HtmlParseError.AttributeNameInvalid);
base.StringBuffer.Append(c2);
attributeState = AttributeState.Name;
break;
default:
base.StringBuffer.Append(c2);
attributeState = AttributeState.Name;
break;
case '':
return NewEof(false);
}
} else {
// Attribute names are buffered in lowercase.
base.StringBuffer.Append(char.ToLowerInvariant(c2));
attributeState = AttributeState.Name;
}
break;
case AttributeState.Name:
c2 = GetNext();
switch (c2) {
case '=':
tag.AddAttribute(FlushBuffer());
attributeState = AttributeState.BeforeValue;
break;
case '>':
tag.AddAttribute(FlushBuffer());
return EmitTag(tag);
default:
if (c2.IsSpaceCharacter()) {
tag.AddAttribute(FlushBuffer());
attributeState = AttributeState.AfterName;
} else {
if (c2 == '/') {
tag.AddAttribute(FlushBuffer());
return TagSelfClosing(tag);
}
if (!c2.IsUppercaseAscii()) {
switch (c2) {
case '"':
case '\'':
case '<':
RaiseErrorOccurred(HtmlParseError.AttributeNameInvalid);
base.StringBuffer.Append(c2);
break;
case ' ':
AppendReplacement();
break;
default:
base.StringBuffer.Append(c2);
break;
case '':
return NewEof(false);
}
} else
base.StringBuffer.Append(char.ToLowerInvariant(c2));
}
break;
}
break;
case AttributeState.AfterName:
c2 = SkipSpaces();
switch (c2) {
case '>':
return EmitTag(tag);
case '=':
attributeState = AttributeState.BeforeValue;
break;
case '/':
return TagSelfClosing(tag);
default:
// Anything else starts the name of the next attribute.
if (!c2.IsUppercaseAscii()) {
switch (c2) {
case '"':
case '\'':
case '<':
RaiseErrorOccurred(HtmlParseError.AttributeNameInvalid);
base.StringBuffer.Append(c2);
attributeState = AttributeState.Name;
break;
case ' ':
AppendReplacement();
attributeState = AttributeState.Name;
break;
default:
base.StringBuffer.Append(c2);
attributeState = AttributeState.Name;
break;
case '':
return NewEof(false);
}
} else {
base.StringBuffer.Append(char.ToLowerInvariant(c2));
attributeState = AttributeState.Name;
}
break;
}
break;
case AttributeState.BeforeValue:
c2 = SkipSpaces();
switch (c2) {
case '"':
case '\'':
attributeState = AttributeState.QuotedValue;
// Remember which quote has to close the value.
c = c2;
break;
case '&':
// c2 is left untouched so the UnquotedValue state handles the ampersand itself.
attributeState = AttributeState.UnquotedValue;
break;
case '>':
RaiseErrorOccurred(HtmlParseError.TagClosedWrong);
return EmitTag(tag);
case '<':
case '=':
case '`':
RaiseErrorOccurred(HtmlParseError.AttributeValueInvalid);
base.StringBuffer.Append(c2);
attributeState = AttributeState.UnquotedValue;
c2 = GetNext();
break;
case ' ':
AppendReplacement();
attributeState = AttributeState.UnquotedValue;
c2 = GetNext();
break;
default:
base.StringBuffer.Append(c2);
attributeState = AttributeState.UnquotedValue;
c2 = GetNext();
break;
case '':
return NewEof(false);
}
break;
case AttributeState.QuotedValue:
c2 = GetNext();
if (c2 != c) {
switch (c2) {
case '&':
// The quote character is passed as the additional allowed character.
AppendCharacterReference(GetNext(), c);
break;
case ' ':
AppendReplacement();
break;
default:
base.StringBuffer.Append(c2);
break;
case '':
return NewEof(false);
}
} else {
// Closing quote: the buffered text becomes the attribute value.
tag.SetAttributeValue(FlushBuffer());
attributeState = AttributeState.AfterValue;
}
break;
case AttributeState.UnquotedValue:
// This state examines the c2 read by the previous step instead of reading first.
if (c2 == '>') {
tag.SetAttributeValue(FlushBuffer());
return EmitTag(tag);
}
if (!c2.IsSpaceCharacter()) {
switch (c2) {
case '&':
AppendCharacterReference(GetNext(), '>');
c2 = GetNext();
break;
case ' ':
AppendReplacement();
c2 = GetNext();
break;
case '"':
case '\'':
case '<':
case '=':
case '`':
RaiseErrorOccurred(HtmlParseError.AttributeValueInvalid);
base.StringBuffer.Append(c2);
c2 = GetNext();
break;
default:
base.StringBuffer.Append(c2);
c2 = GetNext();
break;
case '':
return NewEof(false);
}
} else {
// Whitespace ends an unquoted value.
tag.SetAttributeValue(FlushBuffer());
attributeState = AttributeState.BeforeName;
}
break;
case AttributeState.AfterValue:
c2 = GetNext();
if (c2 == '>')
return EmitTag(tag);
if (c2.IsSpaceCharacter())
attributeState = AttributeState.BeforeName;
else {
switch (c2) {
case '/':
return TagSelfClosing(tag);
case '':
return NewEof(false);
}
// No whitespace after the value: report it and re-read the character as a name start.
RaiseErrorOccurred(HtmlParseError.AttributeNameExpected);
Back();
attributeState = AttributeState.BeforeName;
}
break;
}
}
}
private HtmlToken ScriptData(char c)
{
int length = _lastStartTag.Length;
int length2 = TagNames.Script.Length;
ScriptState scriptState = ScriptState.Normal;
int num = 0;
while (true) {
switch (scriptState) {
case ScriptState.Normal:
switch (c) {
case ' ':
AppendReplacement();
goto IL_00ad;
case '<':
base.StringBuffer.Append('<');
scriptState = ScriptState.OpenTag;
break;
case '':
Back();
return NewCharacter();
default:
{
base.StringBuffer.Append(c);
goto IL_00ad;
}
IL_00ad:
c = GetNext();
break;
}
break;
case ScriptState.OpenTag:
c = GetNext();
switch (c) {
case '/':
scriptState = ScriptState.EndTag;
break;
case '!':
scriptState = ScriptState.StartEscape;
break;
default:
scriptState = ScriptState.Normal;
break;
}
break;
case ScriptState.StartEscape:
base.StringBuffer.Append('!');
c = GetNext();
scriptState = ((c == '-') ? ScriptState.StartEscapeDash : ScriptState.Normal);
break;
case ScriptState.StartEscapeDash:
c = GetNext();
base.StringBuffer.Append('-');
if (c == '-') {
base.StringBuffer.Append('-');
scriptState = ScriptState.EscapedDashDash;
} else
scriptState = ScriptState.Normal;
break;
case ScriptState.EndTag: {
c = GetNext();
num = base.StringBuffer.Append('/').Length;
HtmlTagToken htmlTagToken = NewTagClose();
while (c.IsLetter()) {
base.StringBuffer.Append(c);
c = GetNext();
bool flag = c.IsSpaceCharacter();
bool flag2 = c == '>';
bool flag3 = c == '/';
if (base.StringBuffer.Length - num == length && (flag | flag2 | flag3) && base.StringBuffer.ToString(num, length).Isi(_lastStartTag)) {
if (num > 2) {
Back(3 + length);
base.StringBuffer.Remove(num - 2, length + 2);
return NewCharacter();
}
base.StringBuffer.Clear();
if (flag) {
htmlTagToken.Name = _lastStartTag;
return ParseAttributes(htmlTagToken);
}
if (flag3) {
htmlTagToken.Name = _lastStartTag;
return TagSelfClosing(htmlTagToken);
}
if (flag2) {
htmlTagToken.Name = _lastStartTag;
return EmitTag(htmlTagToken);
}
}
}
scriptState = ScriptState.Normal;
break;
}
case ScriptState.Escaped:
switch (c) {
case '-':
base.StringBuffer.Append('-');
c = GetNext();
scriptState = ScriptState.EscapedDash;
break;
case '<':
c = GetNext();
scriptState = ScriptState.EscapedOpenTag;
break;
case ' ':
AppendReplacement();
c = GetNext();
break;
case '':
Back();
return NewCharacter();
default:
scriptState = ScriptState.Normal;
break;
}
break;
case ScriptState.EscapedDash:
switch (c) {
case '-':
base.StringBuffer.Append('-');
scriptState = ScriptState.EscapedDashDash;
break;
case '<':
c = GetNext();
scriptState = ScriptState.EscapedOpenTag;
break;
case ' ':
AppendReplacement();
goto IL_033c;
case '':
Back();
return NewCharacter();
default:
{
base.StringBuffer.Append(c);
goto IL_033c;
}
IL_033c:
c = GetNext();
scriptState = ScriptState.Escaped;
break;
}
break;
case ScriptState.EscapedDashDash:
c = GetNext();
switch (c) {
case '-':
base.StringBuffer.Append('-');
break;
case '<':
c = GetNext();
scriptState = ScriptState.EscapedOpenTag;
break;
case '>':
base.StringBuffer.Append('>');
c = GetNext();
scriptState = ScriptState.Normal;
break;
case ' ':
AppendReplacement();
c = GetNext();
scriptState = ScriptState.Escaped;
break;
case '':
return NewCharacter();
default:
base.StringBuffer.Append(c);
c = GetNext();
scriptState = ScriptState.Escaped;
break;
}
break;
case ScriptState.EscapedOpenTag:
if (c == '/') {
c = GetNext();
scriptState = ScriptState.EscapedEndTag;
} else if (c.IsLetter()) {
num = base.StringBuffer.Append('<').Length;
base.StringBuffer.Append(c);
scriptState = ScriptState.StartDoubleEscape;
} else {
base.StringBuffer.Append('<');
scriptState = ScriptState.Escaped;
}
break;
case ScriptState.EscapedEndTag:
num = base.StringBuffer.Append('<').Append('/').Length;
if (c.IsLetter()) {
base.StringBuffer.Append(c);
scriptState = ScriptState.EscapedNameEndTag;
} else
scriptState = ScriptState.Escaped;
break;
case ScriptState.EscapedNameEndTag:
c = GetNext();
if (base.StringBuffer.Length - num == length2 && (c == '/' || c == '>' || c.IsSpaceCharacter()) && base.StringBuffer.ToString(num, length2).Isi(TagNames.Script)) {
Back(length2 + 3);
base.StringBuffer.Remove(num - 2, length2 + 2);
return NewCharacter();
}
if (!c.IsLetter())
scriptState = ScriptState.Escaped;
else
base.StringBuffer.Append(c);
break;
case ScriptState.StartDoubleEscape:
c = GetNext();
if (base.StringBuffer.Length - num == length2 && (c == '/' || c == '>' || c.IsSpaceCharacter())) {
bool num3 = base.StringBuffer.ToString(num, length2).Isi(TagNames.Script);
base.StringBuffer.Append(c);
c = GetNext();
scriptState = (num3 ? ScriptState.EscapedDouble : ScriptState.Escaped);
} else if (c.IsLetter()) {
base.StringBuffer.Append(c);
} else {
scriptState = ScriptState.Escaped;
}
break;
case ScriptState.EscapedDouble:
switch (c) {
case '-':
base.StringBuffer.Append('-');
c = GetNext();
scriptState = ScriptState.EscapedDoubleDash;
break;
case '<':
base.StringBuffer.Append('<');
c = GetNext();
scriptState = ScriptState.EscapedDoubleOpenTag;
break;
case ' ':
AppendReplacement();
goto default;
case '':
RaiseErrorOccurred(HtmlParseError.EOF);
Back();
return NewCharacter();
default:
base.StringBuffer.Append(c);
c = GetNext();
break;
}
break;
case ScriptState.EscapedDoubleDash:
switch (c) {
case '-':
base.StringBuffer.Append('-');
scriptState = ScriptState.EscapedDoubleDashDash;
break;
case '<':
base.StringBuffer.Append('<');
c = GetNext();
scriptState = ScriptState.EscapedDoubleOpenTag;
break;
case ' ':
RaiseErrorOccurred(HtmlParseError.Null);
c = '�';
goto default;
case '':
RaiseErrorOccurred(HtmlParseError.EOF);
Back();
return NewCharacter();
default:
scriptState = ScriptState.EscapedDouble;
break;
}
break;
case ScriptState.EscapedDoubleDashDash:
c = GetNext();
switch (c) {
case '-':
base.StringBuffer.Append('-');
break;
case '<':
base.StringBuffer.Append('<');
c = GetNext();
scriptState = ScriptState.EscapedDoubleOpenTag;
break;
case '>':
base.StringBuffer.Append('>');
c = GetNext();
scriptState = ScriptState.Normal;
break;
case ' ':
AppendReplacement();
c = GetNext();
scriptState = ScriptState.EscapedDouble;
break;
case '':
RaiseErrorOccurred(HtmlParseError.EOF);
Back();
return NewCharacter();
default:
base.StringBuffer.Append(c);
c = GetNext();
scriptState = ScriptState.EscapedDouble;
break;
}
break;
case ScriptState.EscapedDoubleOpenTag:
if (c == '/') {
num = base.StringBuffer.Append('/').Length;
scriptState = ScriptState.EndDoubleEscape;
} else
scriptState = ScriptState.EscapedDouble;
break;
case ScriptState.EndDoubleEscape:
c = GetNext();
if (base.StringBuffer.Length - num == length2 && (c.IsSpaceCharacter() || c == '/' || c == '>')) {
bool num2 = base.StringBuffer.ToString(num, length2).Isi(TagNames.Script);
base.StringBuffer.Append(c);
c = GetNext();
scriptState = (num2 ? ScriptState.Escaped : ScriptState.EscapedDouble);
} else if (c.IsLetter()) {
base.StringBuffer.Append(c);
} else {
scriptState = ScriptState.EscapedDouble;
}
break;
}
}
}
/// <summary>
/// Creates a character token from the text returned by FlushBuffer(),
/// located at the position stored in _position.
/// </summary>
/// <returns>The new token of type HtmlTokenType.Character.</returns>
private HtmlToken NewCharacter()
{
    string text = FlushBuffer();
    HtmlToken token = new HtmlToken(HtmlTokenType.Character, _position, text);
    return token;
}
/// <summary>
/// Creates a comment token from the text returned by FlushBuffer(),
/// located at the position stored in _position.
/// </summary>
/// <returns>The new token of type HtmlTokenType.Comment.</returns>
private HtmlToken NewComment()
{
    string text = FlushBuffer();
    HtmlToken token = new HtmlToken(HtmlTokenType.Comment, _position, text);
    return token;
}
/// <summary>
/// Creates an end-of-file token located at the position stored in _position.
/// </summary>
/// <param name="acceptable">
/// When false (the default), an HtmlParseError.EOF error is raised before
/// the token is created; when true no error is raised.
/// </param>
/// <returns>The new token of type HtmlTokenType.EndOfFile.</returns>
private HtmlToken NewEof(bool acceptable = false)
{
    if (!acceptable) {
        // The end of the input was not expected here, so report it.
        RaiseErrorOccurred(HtmlParseError.EOF);
    }
    HtmlToken token = new HtmlToken(HtmlTokenType.EndOfFile, _position);
    return token;
}
/// <summary>
/// Creates a doctype token located at the position stored in _position.
/// </summary>
/// <param name="quirksForced">Value handed to the token constructor as its quirks flag.</param>
/// <returns>The new doctype token.</returns>
private HtmlDoctypeToken NewDoctype(bool quirksForced)
{
    HtmlDoctypeToken doctype = new HtmlDoctypeToken(quirksForced, _position);
    return doctype;
}
/// <summary>
/// Creates a start tag token located at the position stored in _position.
/// </summary>
/// <returns>The new tag token of type HtmlTokenType.StartTag.</returns>
private HtmlTagToken NewTagOpen()
{
    HtmlTagToken startTag = new HtmlTagToken(HtmlTokenType.StartTag, _position);
    return startTag;
}
/// <summary>
/// Creates an end tag token located at the position stored in _position.
/// </summary>
/// <returns>The new tag token of type HtmlTokenType.EndTag.</returns>
private HtmlTagToken NewTagClose()
{
    HtmlTagToken endTag = new HtmlTagToken(HtmlTokenType.EndTag, _position);
    return endTag;
}
/// <summary>
/// Reports a parse error at the position returned by GetCurrentPosition()
/// by forwarding to the overload that takes an explicit position.
/// </summary>
/// <param name="code">The parse error to report.</param>
private void RaiseErrorOccurred(HtmlParseError code)
{
    TextPosition current = GetCurrentPosition();
    RaiseErrorOccurred(code, current);
}
/// <summary>
/// Reports an HtmlParseError.Null error and then appends the '�'
/// replacement character to the string buffer.
/// </summary>
private void AppendReplacement()
{
    // The error is raised first; the buffer is only touched afterwards.
    RaiseErrorOccurred(HtmlParseError.Null);

    base.StringBuffer.Append('�');
}
/// <summary>
/// Checks whether the string buffer holds exactly the name of the last
/// emitted start tag and the given character terminates a tag name. If so,
/// an end tag token with that name is created, the buffer is cleared and
/// tokenization of the end tag continues.
/// </summary>
/// <param name="c">The character that follows the buffered name.</param>
/// <returns>
/// The result of ParseAttributes (space character), TagSelfClosing ('/')
/// or EmitTag ('>') for the new end tag; null when the buffer does not
/// match the last start tag or the character is none of those.
/// </returns>
private HtmlToken CreateIfAppropriate(char c)
{
    bool isSpace = c.IsSpaceCharacter();
    bool isClosing = c == '>';
    bool isSolidus = c == '/';

    // Cheap length comparison first, so the buffer is only materialized
    // as a string when it can actually match.
    if (base.StringBuffer.Length != _lastStartTag.Length) {
        return null;
    }
    if (!isSpace && !isClosing && !isSolidus) {
        return null;
    }
    if (!base.StringBuffer.ToString().Is(_lastStartTag)) {
        return null;
    }

    HtmlTagToken endTag = NewTagClose();
    base.StringBuffer.Clear();
    // Exactly one of the three flags is set here, so the name is needed
    // whichever continuation is chosen below.
    endTag.Name = _lastStartTag;

    if (isSpace) {
        return ParseAttributes(endTag);
    }
    if (isSolidus) {
        return TagSelfClosing(endTag);
    }
    // Only '>' remains.
    return EmitTag(endTag);
}
/// <summary>
/// Finalizes a tag token before it is handed out. The tokenizer state is
/// reset to HtmlParseMode.PCData. For start tags, every attribute whose
/// name already occurs earlier in the list is removed (raising
/// AttributeDuplicateOmitted for each) and the tag name is stored in
/// _lastStartTag. For end tags, errors are raised if the tag is
/// self-closing or carries attributes.
/// </summary>
/// <param name="tag">The tag token to finalize.</param>
/// <returns>The same tag token.</returns>
private HtmlToken EmitTag(HtmlTagToken tag)
{
    List<KeyValuePair<string, string>> attrs = tag.Attributes;
    State = HtmlParseMode.PCData;
    HtmlTokenType type = tag.Type;

    if (type == HtmlTokenType.StartTag) {
        // Walk from the last attribute towards the front; an attribute is
        // dropped as soon as any earlier attribute shares its name, so the
        // first occurrence of each name is the one that survives.
        int current = attrs.Count - 1;
        while (current > 0) {
            string name = attrs[current].Key;
            int earlier = current - 1;
            while (earlier >= 0) {
                if (string.Equals(attrs[earlier].Key, name)) {
                    attrs.RemoveAt(current);
                    RaiseErrorOccurred(HtmlParseError.AttributeDuplicateOmitted, tag.Position);
                    break;
                }
                earlier--;
            }
            current--;
        }
        _lastStartTag = tag.Name;
    } else if (type == HtmlTokenType.EndTag) {
        if (tag.IsSelfClosing) {
            RaiseErrorOccurred(HtmlParseError.EndTagCannotBeSelfClosed, tag.Position);
        }
        if (attrs.Count != 0) {
            RaiseErrorOccurred(HtmlParseError.EndTagCannotHaveAttributes, tag.Position);
        }
    }

    return tag;
}
}
}