// HtmlTokenizer
// Performs the tokenization of the source code. Follows the tokenization algorithm at:
// http://www.w3.org/html/wg/drafts/html/master/syntax.html
using AngleSharp.Events;
using AngleSharp.Extensions;
using AngleSharp.Html;
using System;
using System.Collections.Generic;
using System.Diagnostics;
namespace AngleSharp.Parser.Html
{
[DebuggerStepThrough]
internal sealed class HtmlTokenizer : BaseTokenizer
{
private bool _acceptsCharacterData;
private string _lastStartTag;
private HtmlParseMode _state;
private TextPosition _position;
/// <summary>
/// Gets or sets the flag that MarkupDeclaration checks before tokenizing
/// a "[CDATA[" declaration as character data.
/// </summary>
public bool IsAcceptingCharacterData
{
    get { return _acceptsCharacterData; }
    set { _acceptsCharacterData = value; }
}
/// <summary>
/// Gets or sets the parse mode that Get() uses to choose the tokenization routine.
/// </summary>
public HtmlParseMode State
{
    get { return _state; }
    set { _state = value; }
}
/// <summary>
/// Creates a tokenizer over the given source, starting in PCData mode with
/// character data sections disabled and no remembered start tag.
/// </summary>
/// <param name="source">The text source handed to the base tokenizer.</param>
/// <param name="events">The event aggregator handed to the base tokenizer.</param>
public HtmlTokenizer(TextSource source, IEventAggregator events)
    : base(source, events)
{
    _lastStartTag = string.Empty;
    _acceptsCharacterData = false;
    _state = HtmlParseMode.PCData;
}
/// <summary>
/// Publishes a parse error event for the given position, if an event aggregator is available.
/// </summary>
/// <param name="error">The parse error whose code and message are reported.</param>
/// <param name="position">The text position attached to the event.</param>
public void RaiseErrorOccurred(HtmlParseError error, TextPosition position)
{
    // Without an aggregator there is nobody to notify.
    if (_events == null)
    {
        return;
    }

    HtmlParseErrorEvent errorEvent = new HtmlParseErrorEvent(error.GetCode(), error.GetMessage(), position);
    _events.Publish(errorEvent);
}
/// <summary>
/// Publishes a parse error event located at the tokenizer's current position.
/// </summary>
/// <param name="code">The parse error to report.</param>
public void RaiseErrorOccurred(HtmlParseError code)
{
    TextPosition here = GetCurrentPosition();
    RaiseErrorOccurred(code, here);
}
/// <summary>
/// Reads the next token from the source, dispatching on the current parse mode.
/// </summary>
/// <returns>
/// The next token; an EOF token when the end-of-input sentinel is read or
/// when the current mode has no dedicated routine.
/// </returns>
public HtmlToken Get()
{
    char current = GetNext();
    _position = GetCurrentPosition();

    if (current == '')
    {
        return NewEof();
    }

    switch (_state)
    {
        case HtmlParseMode.PCData:
            return Data(current);
        case HtmlParseMode.RCData:
            return RCData(current);
        case HtmlParseMode.Plaintext:
            return Plaintext(current);
        case HtmlParseMode.Rawtext:
            return Rawtext(current);
        case HtmlParseMode.Script:
            return ScriptData(current);
        default:
            return NewEof();
    }
}
/// <summary>
/// Data state entry: a '&lt;' opens a tag, anything else starts a run of text.
/// </summary>
private HtmlToken Data(char c)
{
    return c == '<' ? TagOpen(GetNext()) : DataText(c);
}
/// <summary>
/// Collects text in the data state until '&lt;' or the end-of-input sentinel,
/// resolving character references along the way, then emits a character token.
/// </summary>
private HtmlToken DataText(char c)
{
    for (; ; c = GetNext())
    {
        if (c == '' || c == '<')
        {
            // Leave the terminator for the next call to Get().
            Back();
            return NewCharacter();
        }

        if (c == '&')
        {
            AppendCharacterReference(GetNext(), ' ');
        }
        else if (c == ' ')
        {
            // Reported as a Null error; nothing is appended for it here.
            RaiseErrorOccurred(HtmlParseError.Null);
        }
        else
        {
            _stringBuffer.Append(c);
        }
    }
}
/// <summary>
/// Plaintext state: buffers every character up to the end-of-input sentinel,
/// substituting the replacement character for characters reported as Null.
/// </summary>
private HtmlToken Plaintext(char c)
{
    for (; c != ''; c = GetNext())
    {
        if (c == ' ')
        {
            RaiseErrorOccurred(HtmlParseError.Null);
            _stringBuffer.Append('�');
        }
        else
        {
            _stringBuffer.Append(c);
        }
    }

    Back();
    return NewCharacter();
}
/// <summary>
/// RCDATA state entry: a '&lt;' may start an end tag, anything else starts a run of text.
/// </summary>
private HtmlToken RCData(char c)
{
    return c == '<' ? RCDataLt(GetNext()) : RCDataText(c);
}
/// <summary>
/// Collects RCDATA text until '&lt;' or the end-of-input sentinel, resolving
/// character references and replacing characters reported as Null.
/// </summary>
private HtmlToken RCDataText(char c)
{
    for (; ; c = GetNext())
    {
        if (c == '' || c == '<')
        {
            Back();
            return NewCharacter();
        }

        if (c == '&')
        {
            AppendCharacterReference(GetNext(), ' ');
        }
        else if (c == ' ')
        {
            RaiseErrorOccurred(HtmlParseError.Null);
            _stringBuffer.Append('�');
        }
        else
        {
            _stringBuffer.Append(c);
        }
    }
}
/// <summary>
/// Handles the character after '&lt;' in RCDATA: "&lt;/" followed by an ASCII letter
/// starts an end tag name; otherwise the consumed characters are buffered as text.
/// </summary>
private HtmlToken RCDataLt(char c)
{
    if (c != '/')
    {
        _stringBuffer.Append('<');
        return RCDataText(c);
    }

    c = GetNext();

    if (c.IsUppercaseAscii())
    {
        _stringBuffer.Append(char.ToLower(c));
    }
    else if (c.IsLowercaseAscii())
    {
        _stringBuffer.Append(c);
    }
    else
    {
        // Not a tag name: "</" is ordinary text.
        _stringBuffer.Append('<').Append('/');
        return RCDataText(c);
    }

    return RCDataNameEndTag(GetNext());
}
/// <summary>
/// Accumulates a lower-cased end tag name in RCDATA. Returns the token produced by
/// CreateIfAppropriate as soon as it yields one; on a non-letter the buffered name
/// is turned back into text by prefixing "&lt;/".
/// </summary>
private HtmlToken RCDataNameEndTag(char c)
{
    for (; ; c = GetNext())
    {
        HtmlToken endTag = CreateIfAppropriate(c);

        if (endTag != null)
        {
            return endTag;
        }

        if (c.IsUppercaseAscii())
        {
            _stringBuffer.Append(char.ToLower(c));
        }
        else if (c.IsLowercaseAscii())
        {
            _stringBuffer.Append(c);
        }
        else
        {
            _stringBuffer.Insert(0, '<').Insert(1, '/');
            return RCDataText(c);
        }
    }
}
/// <summary>
/// RAWTEXT state entry: a '&lt;' may start an end tag, anything else starts a run of text.
/// </summary>
private HtmlToken Rawtext(char c)
{
    return c == '<' ? RawtextLT(GetNext()) : RawtextText(c);
}
/// <summary>
/// Collects RAWTEXT until '&lt;' or the end-of-input sentinel, replacing
/// characters reported as Null, then emits a character token.
/// </summary>
private HtmlToken RawtextText(char c)
{
    for (; c != '' && c != '<'; c = GetNext())
    {
        if (c == ' ')
        {
            RaiseErrorOccurred(HtmlParseError.Null);
            _stringBuffer.Append('�');
        }
        else
        {
            _stringBuffer.Append(c);
        }
    }

    Back();
    return NewCharacter();
}
/// <summary>
/// Handles the character after '&lt;' in RAWTEXT: "&lt;/" followed by an ASCII letter
/// starts an end tag name; otherwise the consumed characters are buffered as text.
/// </summary>
private HtmlToken RawtextLT(char c)
{
    if (c != '/')
    {
        _stringBuffer.Append('<');
        return RawtextText(c);
    }

    c = GetNext();

    if (c.IsUppercaseAscii())
    {
        _stringBuffer.Append(char.ToLower(c));
    }
    else if (c.IsLowercaseAscii())
    {
        _stringBuffer.Append(c);
    }
    else
    {
        // Not a tag name: "</" is ordinary text.
        _stringBuffer.Append('<').Append('/');
        return RawtextText(c);
    }

    return RawtextNameEndTag(GetNext());
}
/// <summary>
/// Accumulates a lower-cased end tag name in RAWTEXT. Returns the token produced by
/// CreateIfAppropriate as soon as it yields one; on a non-letter the buffered name
/// is turned back into text by prefixing "&lt;/".
/// </summary>
private HtmlToken RawtextNameEndTag(char c)
{
    for (; ; c = GetNext())
    {
        HtmlToken endTag = CreateIfAppropriate(c);

        if (endTag != null)
        {
            return endTag;
        }

        if (c.IsUppercaseAscii())
        {
            _stringBuffer.Append(char.ToLower(c));
        }
        else if (c.IsLowercaseAscii())
        {
            _stringBuffer.Append(c);
        }
        else
        {
            _stringBuffer.Insert(0, '<').Insert(1, '/');
            return RawtextText(c);
        }
    }
}
/// <summary>
/// Buffers the contents of a CDATA section until "]]&gt;" or the end-of-input
/// sentinel and emits them as a character token.
/// </summary>
private HtmlToken CharacterData(char c)
{
    while (true)
    {
        if (c == '')
        {
            Back();
            break;
        }

        if (c == ']' && ContinuesWith("]]>", true))
        {
            // Skip the remaining two characters of the terminator.
            Advance(2);
            break;
        }

        _stringBuffer.Append(c);
        c = GetNext();
    }

    return NewCharacter();
}
/// <summary>
/// Consumes a character reference whose leading '&amp;' has already been read and
/// appends either the resolved text or a literal '&amp;' to the string buffer.
/// </summary>
/// <param name="c">The character following the ampersand.</param>
/// <param name="allowedCharacter">
/// An additional character that, when it directly follows the ampersand, means
/// "not a reference". Callers in attribute values pass the closing delimiter.
/// </param>
private void AppendCharacterReference(char c, char allowedCharacter = ' ')
{
    if (c.IsSpaceCharacter() || c == '<' || c == '' || c == '&' || c == allowedCharacter)
    {
        // Not a reference at all: hand the character back and keep the ampersand.
        Back();
        _stringBuffer.Append('&');
        return;
    }

    string text = null;

    if (c == '#')
    {
        int digitCount = 0;
        int codePoint = 0;
        c = GetNext();
        bool isHex = c == 'x' || c == 'X';

        // The value is accumulated in a long and clamped to 0x110000 (one past the
        // last Unicode code point) so that very long digit runs cannot wrap around
        // to an unrelated valid code point. Values up to 0x10FFFF are unchanged.
        // NOTE(review): assumes Entities.IsInvalidNumber rejects 0x110000 - confirm.
        if (!isHex)
        {
            while (c.IsDigit())
            {
                long wide = (long)codePoint * 10 + c.FromHex();
                codePoint = wide > 0x10FFFF ? 0x110000 : (int)wide;
                digitCount++;
                c = GetNext();
            }
        }
        else
        {
            while ((c = GetNext()).IsHex())
            {
                long wide = (long)codePoint * 16 + c.FromHex();
                codePoint = wide > 0x10FFFF ? 0x110000 : (int)wide;
                digitCount++;
            }
        }

        if (digitCount == 0)
        {
            // No digits: rewind over what was consumed (one extra for the 'x').
            Back(2);

            if (isHex)
            {
                Back();
            }

            RaiseErrorOccurred(HtmlParseError.CharacterReferenceWrongNumber);
            _stringBuffer.Append('&');
            return;
        }

        if (c != ';')
        {
            RaiseErrorOccurred(HtmlParseError.CharacterReferenceSemicolonMissing);
            Back();
        }

        if (Entities.IsInCharacterTable(codePoint))
        {
            RaiseErrorOccurred(HtmlParseError.CharacterReferenceInvalidCode);
            text = Entities.GetSymbolFromTable(codePoint);
        }
        else if (Entities.IsInvalidNumber(codePoint))
        {
            RaiseErrorOccurred(HtmlParseError.CharacterReferenceInvalidNumber);
            text = '�'.ToString();
        }
        else
        {
            if (Entities.IsInInvalidRange(codePoint))
            {
                RaiseErrorOccurred(HtmlParseError.CharacterReferenceInvalidRange);
            }

            text = Entities.Convert(codePoint);
        }
    }
    else
    {
        // Named reference: grow the candidate name one character at a time and
        // remember the last prefix that resolved to a symbol.
        int unmatched = 0;
        int insertionPoint = base.InsertionPoint - 1;
        char[] nameBuffer = new char[31];
        int nameLength = 0;
        char ch = base.Current;

        while (ch != ';' && ch.IsName())
        {
            nameBuffer[nameLength++] = ch;
            string name = new string(nameBuffer, 0, nameLength);
            ch = GetNext();
            unmatched++;
            name = (ch == ';') ? Entities.GetSymbol(name) : Entities.GetSymbolWithoutSemicolon(name);

            if (name != null)
            {
                unmatched = 0;
                text = name;
            }

            if (ch == '' || nameLength >= 31)
            {
                break;
            }
        }

        // Give back everything read after the last successful match.
        Back(unmatched);
        ch = base.Current;

        if (ch != ';')
        {
            if (allowedCharacter != 0 && (ch == '=' || ch.IsAlphanumericAscii()))
            {
                if (ch == '=')
                {
                    RaiseErrorOccurred(HtmlParseError.CharacterReferenceAttributeEqualsFound);
                }

                // Restore the position saved on entry and keep a literal '&'.
                base.InsertionPoint = insertionPoint;
                _stringBuffer.Append('&');
                return;
            }

            Back();
            RaiseErrorOccurred(HtmlParseError.CharacterReferenceNotTerminated);
        }

        if (text == null)
        {
            _stringBuffer.Append('&');
            return;
        }
    }

    _stringBuffer.Append(text);
}
/// <summary>
/// Tag open state: decides between an end tag, a start tag, a markup declaration,
/// a bogus comment, or plain text beginning with '&lt;'.
/// </summary>
private HtmlToken TagOpen(char c)
{
    if (c == '/')
    {
        return TagEnd(GetNext());
    }

    if (c.IsLowercaseAscii())
    {
        _stringBuffer.Append(c);
        return TagName(NewTagOpen());
    }

    if (c.IsUppercaseAscii())
    {
        _stringBuffer.Append(char.ToLower(c));
        return TagName(NewTagOpen());
    }

    if (c == '!')
    {
        return MarkupDeclaration(GetNext());
    }

    if (c == '?')
    {
        RaiseErrorOccurred(HtmlParseError.BogusComment);
        return BogusComment(c);
    }

    // Anything else: the '<' was just text.
    _state = HtmlParseMode.PCData;
    RaiseErrorOccurred(HtmlParseError.AmbiguousOpenTag);
    _stringBuffer.Append('<');
    return DataText(c);
}
/// <summary>
/// End tag open state: an ASCII letter starts the tag name; '&gt;', the end-of-input
/// sentinel and any other character are handled as error cases.
/// </summary>
private HtmlToken TagEnd(char c)
{
    if (c.IsLowercaseAscii())
    {
        _stringBuffer.Append(c);
        return TagName(NewTagClose());
    }

    if (c.IsUppercaseAscii())
    {
        _stringBuffer.Append(char.ToLower(c));
        return TagName(NewTagClose());
    }

    if (c == '>')
    {
        // "</>" produces no token; continue with the data that follows.
        _state = HtmlParseMode.PCData;
        RaiseErrorOccurred(HtmlParseError.TagClosedWrong);
        return Data(GetNext());
    }

    if (c == '')
    {
        Back();
        RaiseErrorOccurred(HtmlParseError.EOF);
        _stringBuffer.Append('<').Append('/');
        return NewCharacter();
    }

    RaiseErrorOccurred(HtmlParseError.BogusComment);
    return BogusComment(c);
}
/// <summary>
/// Tag name state: extends the lower-cased name held in the string buffer until
/// '&gt;', whitespace or '/' ends it, then assigns it to the tag.
/// </summary>
/// <param name="tag">The tag token being built.</param>
private HtmlToken TagName(HtmlTagToken tag)
{
    while (true)
    {
        char current = GetNext();

        if (current == '>')
        {
            tag.Name = _stringBuffer.ToString();
            _stringBuffer.Clear();
            return EmitTag(tag);
        }

        if (current.IsSpaceCharacter())
        {
            tag.Name = _stringBuffer.ToString();
            _stringBuffer.Clear();
            return AttributeBeforeName(tag);
        }

        if (current == '/')
        {
            tag.Name = _stringBuffer.ToString();
            _stringBuffer.Clear();
            return TagSelfClosing(tag);
        }

        if (current.IsUppercaseAscii())
        {
            _stringBuffer.Append(char.ToLower(current));
        }
        else if (current == ' ')
        {
            RaiseErrorOccurred(HtmlParseError.Null);
            _stringBuffer.Append('�');
        }
        else if (current == '')
        {
            RaiseErrorOccurred(HtmlParseError.EOF);
            return NewEof();
        }
        else
        {
            _stringBuffer.Append(current);
        }
    }
}
/// <summary>
/// Self-closing start tag state: "/&gt;" marks the tag self-closing and emits it;
/// a stray '/' is reported and attribute parsing resumes.
/// </summary>
/// <param name="tag">The tag token being built.</param>
private HtmlToken TagSelfClosing(HtmlTagToken tag)
{
    char current = GetNext();

    if (current == '>')
    {
        tag.IsSelfClosing = true;
        return EmitTag(tag);
    }

    if (current == '')
    {
        RaiseErrorOccurred(HtmlParseError.EOF);
        return NewEof();
    }

    RaiseErrorOccurred(HtmlParseError.ClosingSlashMisplaced);
    Back();
    return AttributeBeforeName(tag);
}
/// <summary>
/// Markup declaration open state (after "&lt;!"): recognizes comments, doctypes and,
/// when enabled, CDATA sections; everything else becomes a bogus comment.
/// </summary>
private HtmlToken MarkupDeclaration(char c)
{
    if (ContinuesWith("--", true))
    {
        Advance();
        return CommentStart(GetNext());
    }
    else if (ContinuesWith(Tags.Doctype, true))
    {
        Advance(6);
        return Doctype(GetNext());
    }
    else if (_acceptsCharacterData && ContinuesWith("[CDATA[", false))
    {
        Advance(6);
        return CharacterData(GetNext());
    }
    else
    {
        RaiseErrorOccurred(HtmlParseError.UndefinedMarkupDeclaration);
        return BogusComment(c);
    }
}
/// <summary>
/// Bogus comment state: everything up to '&gt;' or the end-of-input sentinel becomes
/// the comment data, with the replacement character substituted for ' ' literals.
/// </summary>
private HtmlToken BogusComment(char c)
{
    _stringBuffer.Clear();

    for (; c != '>'; c = GetNext())
    {
        if (c == '')
        {
            Back();
            break;
        }

        _stringBuffer.Append(c == ' ' ? '�' : c);
    }

    _state = HtmlParseMode.PCData;
    return NewComment();
}
/// <summary>
/// Comment start state (after "&lt;!--"): clears the buffer and dispatches on the
/// first character of the comment.
/// </summary>
private HtmlToken CommentStart(char c)
{
    _stringBuffer.Clear();

    if (c == '-')
    {
        // A null result means the comment has not ended yet.
        return CommentDashStart(GetNext()) ?? Comment(GetNext());
    }

    if (c == ' ')
    {
        RaiseErrorOccurred(HtmlParseError.Null);
        _stringBuffer.Append('�');
        return Comment(GetNext());
    }

    if (c == '>')
    {
        _state = HtmlParseMode.PCData;
        RaiseErrorOccurred(HtmlParseError.TagClosedWrong);
        return NewComment();
    }

    if (c == '')
    {
        RaiseErrorOccurred(HtmlParseError.EOF);
        Back();
        return NewComment();
    }

    _stringBuffer.Append(c);
    return Comment(GetNext());
}
/// <summary>
/// Comment start dash state (after "&lt;!---"): a second dash may end the comment,
/// otherwise the pending dash is buffered and the comment continues.
/// </summary>
private HtmlToken CommentDashStart(char c)
{
    if (c == '-')
    {
        return CommentEnd(GetNext());
    }

    if (c == ' ')
    {
        RaiseErrorOccurred(HtmlParseError.Null);
        _stringBuffer.Append('-').Append('�');
        return Comment(GetNext());
    }

    if (c == '>')
    {
        _state = HtmlParseMode.PCData;
        RaiseErrorOccurred(HtmlParseError.TagClosedWrong);
        return NewComment();
    }

    if (c == '')
    {
        RaiseErrorOccurred(HtmlParseError.EOF);
        Back();
        return NewComment();
    }

    _stringBuffer.Append('-').Append(c);
    return Comment(GetNext());
}
/// <summary>
/// Comment state: buffers comment data until a dash sequence ends the comment
/// or the end-of-input sentinel is reached.
/// </summary>
private HtmlToken Comment(char c)
{
    for (; ; c = GetNext())
    {
        if (c == '-')
        {
            HtmlToken finished = CommentDashEnd(GetNext());

            // Null means the dash was just data; keep collecting.
            if (finished != null)
            {
                return finished;
            }
        }
        else if (c == '')
        {
            RaiseErrorOccurred(HtmlParseError.EOF);
            Back();
            return NewComment();
        }
        else if (c == ' ')
        {
            RaiseErrorOccurred(HtmlParseError.Null);
            _stringBuffer.Append('�');
        }
        else
        {
            _stringBuffer.Append(c);
        }
    }
}
/// <summary>
/// Comment end dash state: a second dash moves on to the comment end state;
/// otherwise the dash and the character are buffered.
/// </summary>
/// <returns>A token when the comment ended, or null when the comment continues.</returns>
private HtmlToken CommentDashEnd(char c)
{
    if (c == '-')
    {
        return CommentEnd(GetNext());
    }

    if (c == '')
    {
        RaiseErrorOccurred(HtmlParseError.EOF);
        Back();
        return NewComment();
    }

    if (c == ' ')
    {
        RaiseErrorOccurred(HtmlParseError.Null);
        c = '�';
    }

    _stringBuffer.Append('-').Append(c);
    return null;
}
/// <summary>
/// Comment end state (after "--"): '&gt;' closes the comment; extra dashes are
/// buffered with an error; other characters put the dashes back into the data.
/// </summary>
/// <returns>A token when the comment ended, or null when the comment continues.</returns>
private HtmlToken CommentEnd(char c)
{
    // Each additional dash is reported and kept as comment data.
    while (c == '-')
    {
        RaiseErrorOccurred(HtmlParseError.CommentEndedWithDash);
        _stringBuffer.Append('-');
        c = GetNext();
    }

    if (c == '>')
    {
        _state = HtmlParseMode.PCData;
        return NewComment();
    }

    if (c == ' ')
    {
        RaiseErrorOccurred(HtmlParseError.Null);
        _stringBuffer.Append('-').Append('�');
        return null;
    }

    if (c == '!')
    {
        RaiseErrorOccurred(HtmlParseError.CommentEndedWithEM);
        return CommentBangEnd(GetNext());
    }

    if (c == '')
    {
        RaiseErrorOccurred(HtmlParseError.EOF);
        Back();
        return NewComment();
    }

    RaiseErrorOccurred(HtmlParseError.CommentEndedUnexpected);
    _stringBuffer.Append('-').Append('-').Append(c);
    return null;
}
/// <summary>
/// Comment end bang state (after "--!"): '&gt;' closes the comment; otherwise "--!"
/// is buffered as data and the comment continues.
/// </summary>
/// <returns>A token when the comment ended, or null when the comment continues.</returns>
private HtmlToken CommentBangEnd(char c)
{
    if (c == '-')
    {
        _stringBuffer.Append('-').Append('-').Append('!');
        return CommentDashEnd(GetNext());
    }

    if (c == '>')
    {
        _state = HtmlParseMode.PCData;
        return NewComment();
    }

    if (c == ' ')
    {
        RaiseErrorOccurred(HtmlParseError.Null);
        _stringBuffer.Append('-').Append('-').Append('!').Append('�');
        return null;
    }

    if (c == '')
    {
        RaiseErrorOccurred(HtmlParseError.EOF);
        Back();
        return NewComment();
    }

    _stringBuffer.Append('-').Append('-').Append('!').Append(c);
    return null;
}
/// <summary>
/// DOCTYPE state: expects whitespace after the keyword; the end-of-input sentinel
/// yields a doctype created with NewDoctype(true), anything else is reported
/// and processed as the start of the name.
/// </summary>
private HtmlToken Doctype(char c)
{
    if (c.IsSpaceCharacter())
    {
        return DoctypeNameBefore(GetNext());
    }
    else if (c == '')
    {
        RaiseErrorOccurred(HtmlParseError.EOF);
        Back();
        return NewDoctype(true);
    }
    else
    {
        RaiseErrorOccurred(HtmlParseError.DoctypeUnexpected);
        return DoctypeNameBefore(c);
    }
}
/// <summary>
/// Before DOCTYPE name state: skips whitespace, then creates the doctype token and
/// buffers the first (lower-cased) character of its name.
/// </summary>
private HtmlToken DoctypeNameBefore(char c)
{
    while (c.IsSpaceCharacter())
    {
        c = GetNext();
    }

    if (c.IsUppercaseAscii())
    {
        HtmlDoctypeToken upperNamed = NewDoctype(false);
        _stringBuffer.Append(char.ToLower(c));
        return DoctypeName(upperNamed);
    }

    if (c == '>')
    {
        // Doctype without a name.
        HtmlDoctypeToken unnamed = NewDoctype(true);
        _state = HtmlParseMode.PCData;
        RaiseErrorOccurred(HtmlParseError.TagClosedWrong);
        return unnamed;
    }

    if (c == '')
    {
        HtmlDoctypeToken truncated = NewDoctype(true);
        RaiseErrorOccurred(HtmlParseError.EOF);
        Back();
        return truncated;
    }

    HtmlDoctypeToken named = NewDoctype(false);

    if (c == ' ')
    {
        RaiseErrorOccurred(HtmlParseError.Null);
        _stringBuffer.Append('�');
    }
    else
    {
        _stringBuffer.Append(c);
    }

    return DoctypeName(named);
}
/// <summary>
/// DOCTYPE name state: extends the lower-cased name in the string buffer until
/// whitespace, '&gt;' or the end-of-input sentinel ends it.
/// </summary>
/// <param name="doctype">The doctype token being built.</param>
private HtmlToken DoctypeName(HtmlDoctypeToken doctype)
{
    while (true)
    {
        char current = GetNext();

        if (current.IsSpaceCharacter())
        {
            doctype.Name = _stringBuffer.ToString();
            _stringBuffer.Clear();
            return DoctypeNameAfter(doctype);
        }

        if (current == '>')
        {
            _state = HtmlParseMode.PCData;
            doctype.Name = _stringBuffer.ToString();
            _stringBuffer.Clear();
            return doctype;
        }

        if (current.IsUppercaseAscii())
        {
            _stringBuffer.Append(char.ToLower(current));
        }
        else if (current == ' ')
        {
            RaiseErrorOccurred(HtmlParseError.Null);
            _stringBuffer.Append('�');
        }
        else if (current == '')
        {
            RaiseErrorOccurred(HtmlParseError.EOF);
            Back();
            doctype.IsQuirksForced = true;
            doctype.Name = _stringBuffer.ToString();
            _stringBuffer.Clear();
            return doctype;
        }
        else
        {
            _stringBuffer.Append(current);
        }
    }
}
/// <summary>
/// After DOCTYPE name state: skips whitespace and looks for '&gt;', the "public"
/// keyword or the "system" keyword; anything else forces quirks mode.
/// </summary>
/// <param name="doctype">The doctype token being built.</param>
private HtmlToken DoctypeNameAfter(HtmlDoctypeToken doctype)
{
    char current = SkipSpaces();

    if (current == '>')
    {
        _state = HtmlParseMode.PCData;
        return doctype;
    }

    if (current == '')
    {
        RaiseErrorOccurred(HtmlParseError.EOF);
        Back();
        doctype.IsQuirksForced = true;
        return doctype;
    }

    if (ContinuesWith("public", true))
    {
        Advance(5);
        return DoctypePublic(doctype);
    }

    if (ContinuesWith("system", true))
    {
        Advance(5);
        return DoctypeSystem(doctype);
    }

    RaiseErrorOccurred(HtmlParseError.DoctypeUnexpectedAfterName);
    doctype.IsQuirksForced = true;
    return BogusDoctype(doctype);
}
/// <summary>
/// After DOCTYPE public keyword state: whitespace is expected; a quote directly
/// after the keyword is reported but still starts the public identifier.
/// </summary>
/// <param name="doctype">The doctype token being built.</param>
private HtmlToken DoctypePublic(HtmlDoctypeToken doctype)
{
    char current = GetNext();

    if (current.IsSpaceCharacter())
    {
        return DoctypePublicIdentifierBefore(doctype);
    }

    if (current == '"')
    {
        RaiseErrorOccurred(HtmlParseError.DoubleQuotationMarkUnexpected);
        doctype.PublicIdentifier = string.Empty;
        return DoctypePublicIdentifierDoubleQuoted(doctype);
    }

    if (current == '\'')
    {
        RaiseErrorOccurred(HtmlParseError.SingleQuotationMarkUnexpected);
        doctype.PublicIdentifier = string.Empty;
        return DoctypePublicIdentifierSingleQuoted(doctype);
    }

    if (current == '>')
    {
        _state = HtmlParseMode.PCData;
        RaiseErrorOccurred(HtmlParseError.TagClosedWrong);
        doctype.IsQuirksForced = true;
        return doctype;
    }

    if (current == '')
    {
        RaiseErrorOccurred(HtmlParseError.EOF);
        doctype.IsQuirksForced = true;
        Back();
        return doctype;
    }

    RaiseErrorOccurred(HtmlParseError.DoctypePublicInvalid);
    doctype.IsQuirksForced = true;
    return BogusDoctype(doctype);
}
/// <summary>
/// Before DOCTYPE public identifier state: skips whitespace and expects the
/// opening quote of the public identifier.
/// </summary>
/// <param name="doctype">The doctype token being built.</param>
private HtmlToken DoctypePublicIdentifierBefore(HtmlDoctypeToken doctype)
{
    char current = SkipSpaces();

    if (current == '"')
    {
        doctype.PublicIdentifier = string.Empty;
        return DoctypePublicIdentifierDoubleQuoted(doctype);
    }

    if (current == '\'')
    {
        doctype.PublicIdentifier = string.Empty;
        return DoctypePublicIdentifierSingleQuoted(doctype);
    }

    if (current == '>')
    {
        _state = HtmlParseMode.PCData;
        RaiseErrorOccurred(HtmlParseError.TagClosedWrong);
        doctype.IsQuirksForced = true;
        return doctype;
    }

    if (current == '')
    {
        RaiseErrorOccurred(HtmlParseError.EOF);
        doctype.IsQuirksForced = true;
        Back();
        return doctype;
    }

    RaiseErrorOccurred(HtmlParseError.DoctypePublicInvalid);
    doctype.IsQuirksForced = true;
    return BogusDoctype(doctype);
}
/// <summary>
/// DOCTYPE public identifier (double-quoted) state: buffers characters up to the
/// closing '"'; '&gt;' and the end-of-input sentinel end the doctype in quirks mode.
/// </summary>
/// <param name="doctype">The doctype token being built.</param>
private HtmlToken DoctypePublicIdentifierDoubleQuoted(HtmlDoctypeToken doctype)
{
    while (true)
    {
        char current = GetNext();

        if (current == '"')
        {
            doctype.PublicIdentifier = _stringBuffer.ToString();
            _stringBuffer.Clear();
            return DoctypePublicIdentifierAfter(doctype);
        }

        if (current == ' ')
        {
            RaiseErrorOccurred(HtmlParseError.Null);
            _stringBuffer.Append('�');
        }
        else if (current == '>')
        {
            _state = HtmlParseMode.PCData;
            RaiseErrorOccurred(HtmlParseError.TagClosedWrong);
            doctype.IsQuirksForced = true;
            doctype.PublicIdentifier = _stringBuffer.ToString();
            _stringBuffer.Clear();
            return doctype;
        }
        else if (current == '')
        {
            RaiseErrorOccurred(HtmlParseError.EOF);
            Back();
            doctype.IsQuirksForced = true;
            doctype.PublicIdentifier = _stringBuffer.ToString();
            _stringBuffer.Clear();
            return doctype;
        }
        else
        {
            _stringBuffer.Append(current);
        }
    }
}
/// <summary>
/// DOCTYPE public identifier (single-quoted) state: buffers characters up to the
/// closing '\''; '&gt;' and the end-of-input sentinel end the doctype in quirks mode.
/// </summary>
/// <param name="doctype">The doctype token being built.</param>
private HtmlToken DoctypePublicIdentifierSingleQuoted(HtmlDoctypeToken doctype)
{
    while (true)
    {
        char current = GetNext();

        if (current == '\'')
        {
            doctype.PublicIdentifier = _stringBuffer.ToString();
            _stringBuffer.Clear();
            return DoctypePublicIdentifierAfter(doctype);
        }

        if (current == ' ')
        {
            RaiseErrorOccurred(HtmlParseError.Null);
            _stringBuffer.Append('�');
        }
        else if (current == '>')
        {
            _state = HtmlParseMode.PCData;
            RaiseErrorOccurred(HtmlParseError.TagClosedWrong);
            doctype.IsQuirksForced = true;
            doctype.PublicIdentifier = _stringBuffer.ToString();
            _stringBuffer.Clear();
            return doctype;
        }
        else if (current == '')
        {
            RaiseErrorOccurred(HtmlParseError.EOF);
            doctype.IsQuirksForced = true;
            doctype.PublicIdentifier = _stringBuffer.ToString();
            _stringBuffer.Clear();
            Back();
            return doctype;
        }
        else
        {
            _stringBuffer.Append(current);
        }
    }
}
/// <summary>
/// After DOCTYPE public identifier state: expects whitespace or '&gt;'; a quote
/// directly after the identifier is reported but still starts the system identifier.
/// </summary>
/// <param name="doctype">The doctype token being built.</param>
private HtmlToken DoctypePublicIdentifierAfter(HtmlDoctypeToken doctype)
{
    char current = GetNext();

    if (current.IsSpaceCharacter())
    {
        return DoctypeBetween(doctype);
    }

    if (current == '>')
    {
        _state = HtmlParseMode.PCData;
        return doctype;
    }

    if (current == '"')
    {
        RaiseErrorOccurred(HtmlParseError.DoubleQuotationMarkUnexpected);
        doctype.SystemIdentifier = string.Empty;
        return DoctypeSystemIdentifierDoubleQuoted(doctype);
    }

    if (current == '\'')
    {
        RaiseErrorOccurred(HtmlParseError.SingleQuotationMarkUnexpected);
        doctype.SystemIdentifier = string.Empty;
        return DoctypeSystemIdentifierSingleQuoted(doctype);
    }

    if (current == '')
    {
        RaiseErrorOccurred(HtmlParseError.EOF);
        doctype.IsQuirksForced = true;
        Back();
        return doctype;
    }

    RaiseErrorOccurred(HtmlParseError.DoctypeInvalidCharacter);
    doctype.IsQuirksForced = true;
    return BogusDoctype(doctype);
}
/// <summary>
/// Between DOCTYPE public and system identifiers state: skips whitespace and
/// expects '&gt;' or the opening quote of the system identifier.
/// </summary>
/// <param name="doctype">The doctype token being built.</param>
private HtmlToken DoctypeBetween(HtmlDoctypeToken doctype)
{
    char current = SkipSpaces();

    if (current == '>')
    {
        _state = HtmlParseMode.PCData;
        return doctype;
    }

    if (current == '"')
    {
        doctype.SystemIdentifier = string.Empty;
        return DoctypeSystemIdentifierDoubleQuoted(doctype);
    }

    if (current == '\'')
    {
        doctype.SystemIdentifier = string.Empty;
        return DoctypeSystemIdentifierSingleQuoted(doctype);
    }

    if (current == '')
    {
        RaiseErrorOccurred(HtmlParseError.EOF);
        doctype.IsQuirksForced = true;
        Back();
        return doctype;
    }

    RaiseErrorOccurred(HtmlParseError.DoctypeInvalidCharacter);
    doctype.IsQuirksForced = true;
    return BogusDoctype(doctype);
}
/// <summary>
/// After DOCTYPE system keyword state: whitespace is expected; a quote directly
/// after the keyword is reported but still starts the system identifier.
/// </summary>
/// <param name="doctype">The doctype token being built.</param>
private HtmlToken DoctypeSystem(HtmlDoctypeToken doctype)
{
    char current = GetNext();

    if (current.IsSpaceCharacter())
    {
        _state = HtmlParseMode.PCData;
        return DoctypeSystemIdentifierBefore(doctype);
    }

    if (current == '"')
    {
        RaiseErrorOccurred(HtmlParseError.DoubleQuotationMarkUnexpected);
        doctype.SystemIdentifier = string.Empty;
        return DoctypeSystemIdentifierDoubleQuoted(doctype);
    }

    if (current == '\'')
    {
        RaiseErrorOccurred(HtmlParseError.SingleQuotationMarkUnexpected);
        doctype.SystemIdentifier = string.Empty;
        return DoctypeSystemIdentifierSingleQuoted(doctype);
    }

    if (current == '>')
    {
        RaiseErrorOccurred(HtmlParseError.TagClosedWrong);
        doctype.SystemIdentifier = _stringBuffer.ToString();
        _stringBuffer.Clear();
        doctype.IsQuirksForced = true;
        return doctype;
    }

    if (current == '')
    {
        RaiseErrorOccurred(HtmlParseError.EOF);
        doctype.IsQuirksForced = true;
        Back();
        return doctype;
    }

    RaiseErrorOccurred(HtmlParseError.DoctypeSystemInvalid);
    doctype.IsQuirksForced = true;
    return BogusDoctype(doctype);
}
/// <summary>
/// Before DOCTYPE system identifier state: skips whitespace and expects the
/// opening quote of the system identifier.
/// </summary>
/// <param name="doctype">The doctype token being built.</param>
private HtmlToken DoctypeSystemIdentifierBefore(HtmlDoctypeToken doctype)
{
    char current = SkipSpaces();

    if (current == '"')
    {
        doctype.SystemIdentifier = string.Empty;
        return DoctypeSystemIdentifierDoubleQuoted(doctype);
    }

    if (current == '\'')
    {
        doctype.SystemIdentifier = string.Empty;
        return DoctypeSystemIdentifierSingleQuoted(doctype);
    }

    if (current == '>')
    {
        _state = HtmlParseMode.PCData;
        RaiseErrorOccurred(HtmlParseError.TagClosedWrong);
        doctype.IsQuirksForced = true;
        doctype.SystemIdentifier = _stringBuffer.ToString();
        _stringBuffer.Clear();
        return doctype;
    }

    if (current == '')
    {
        RaiseErrorOccurred(HtmlParseError.EOF);
        doctype.IsQuirksForced = true;
        doctype.SystemIdentifier = _stringBuffer.ToString();
        _stringBuffer.Clear();
        Back();
        return doctype;
    }

    RaiseErrorOccurred(HtmlParseError.DoctypeInvalidCharacter);
    doctype.IsQuirksForced = true;
    return BogusDoctype(doctype);
}
/// <summary>
/// DOCTYPE system identifier (double-quoted) state: buffers characters up to the
/// closing '"'; '&gt;' and the end-of-input sentinel end the doctype in quirks mode.
/// </summary>
/// <param name="doctype">The doctype token being built.</param>
private HtmlToken DoctypeSystemIdentifierDoubleQuoted(HtmlDoctypeToken doctype)
{
    while (true)
    {
        char current = GetNext();

        if (current == '"')
        {
            doctype.SystemIdentifier = _stringBuffer.ToString();
            _stringBuffer.Clear();
            return DoctypeSystemIdentifierAfter(doctype);
        }

        if (current == ' ')
        {
            RaiseErrorOccurred(HtmlParseError.Null);
            _stringBuffer.Append('�');
        }
        else if (current == '>')
        {
            _state = HtmlParseMode.PCData;
            RaiseErrorOccurred(HtmlParseError.TagClosedWrong);
            doctype.IsQuirksForced = true;
            doctype.SystemIdentifier = _stringBuffer.ToString();
            _stringBuffer.Clear();
            return doctype;
        }
        else if (current == '')
        {
            RaiseErrorOccurred(HtmlParseError.EOF);
            doctype.IsQuirksForced = true;
            doctype.SystemIdentifier = _stringBuffer.ToString();
            _stringBuffer.Clear();
            Back();
            return doctype;
        }
        else
        {
            _stringBuffer.Append(current);
        }
    }
}
/// <summary>
/// DOCTYPE system identifier (single-quoted) state: buffers characters up to the
/// closing '\''; '&gt;' and the end-of-input sentinel end the doctype in quirks mode.
/// </summary>
/// <param name="doctype">The doctype token being built.</param>
private HtmlToken DoctypeSystemIdentifierSingleQuoted(HtmlDoctypeToken doctype)
{
    while (true)
    {
        char current = GetNext();

        if (current == '\'')
        {
            doctype.SystemIdentifier = _stringBuffer.ToString();
            _stringBuffer.Clear();
            return DoctypeSystemIdentifierAfter(doctype);
        }

        if (current == ' ')
        {
            RaiseErrorOccurred(HtmlParseError.Null);
            _stringBuffer.Append('�');
        }
        else if (current == '>')
        {
            _state = HtmlParseMode.PCData;
            RaiseErrorOccurred(HtmlParseError.TagClosedWrong);
            doctype.IsQuirksForced = true;
            doctype.SystemIdentifier = _stringBuffer.ToString();
            _stringBuffer.Clear();
            return doctype;
        }
        else if (current == '')
        {
            RaiseErrorOccurred(HtmlParseError.EOF);
            doctype.IsQuirksForced = true;
            doctype.SystemIdentifier = _stringBuffer.ToString();
            _stringBuffer.Clear();
            Back();
            return doctype;
        }
        else
        {
            _stringBuffer.Append(current);
        }
    }
}
/// <summary>
/// After DOCTYPE system identifier state: skips whitespace and expects '&gt;'.
/// Unexpected characters are reported and skipped via BogusDoctype without
/// forcing quirks mode.
/// </summary>
/// <param name="doctype">The doctype token being built.</param>
private HtmlToken DoctypeSystemIdentifierAfter(HtmlDoctypeToken doctype)
{
    char current = SkipSpaces();

    if (current == '>')
    {
        _state = HtmlParseMode.PCData;
        return doctype;
    }

    if (current == '')
    {
        RaiseErrorOccurred(HtmlParseError.EOF);
        doctype.IsQuirksForced = true;
        Back();
        return doctype;
    }

    RaiseErrorOccurred(HtmlParseError.DoctypeInvalidCharacter);
    return BogusDoctype(doctype);
}
/// <summary>
/// Bogus DOCTYPE state: discards input up to '&gt;' or the end-of-input sentinel
/// and returns the doctype as built so far.
/// </summary>
/// <param name="doctype">The doctype token being built.</param>
private HtmlToken BogusDoctype(HtmlDoctypeToken doctype)
{
    while (true)
    {
        char current = GetNext();

        if (current == '>')
        {
            _state = HtmlParseMode.PCData;
            return doctype;
        }

        if (current == '')
        {
            Back();
            return doctype;
        }
    }
}
/// <summary>
/// Before attribute name state: skips whitespace, then either finishes the tag or
/// buffers the first character of an attribute name.
/// </summary>
/// <param name="tag">The tag token being built.</param>
private HtmlToken AttributeBeforeName(HtmlTagToken tag)
{
    char current = SkipSpaces();

    if (current == '/')
    {
        return TagSelfClosing(tag);
    }

    if (current == '>')
    {
        return EmitTag(tag);
    }

    if (current.IsUppercaseAscii())
    {
        _stringBuffer.Append(char.ToLower(current));
    }
    else if (current == ' ')
    {
        RaiseErrorOccurred(HtmlParseError.Null);
        _stringBuffer.Append('�');
    }
    else if (current == '')
    {
        return NewEof();
    }
    else
    {
        // Quotes, '<' and '=' are reported but still become part of the name.
        if (current == '"' || current == '\'' || current == '<' || current == '=')
        {
            RaiseErrorOccurred(HtmlParseError.AttributeNameInvalid);
        }

        _stringBuffer.Append(current);
    }

    return AttributeName(tag);
}
/// <summary>
/// Attribute name state: extends the lower-cased name in the string buffer until
/// '=', '&gt;', whitespace or '/' ends it, then adds the attribute to the tag.
/// </summary>
/// <param name="tag">The tag token being built.</param>
private HtmlToken AttributeName(HtmlTagToken tag)
{
    while (true)
    {
        char current = GetNext();

        if (current == '=')
        {
            tag.AddAttribute(_stringBuffer.ToString());
            _stringBuffer.Clear();
            return AttributeBeforeValue(tag);
        }

        if (current == '>')
        {
            tag.AddAttribute(_stringBuffer.ToString());
            _stringBuffer.Clear();
            return EmitTag(tag);
        }

        if (current.IsSpaceCharacter())
        {
            tag.AddAttribute(_stringBuffer.ToString());
            _stringBuffer.Clear();
            return AttributeAfterName(tag);
        }

        if (current == '/')
        {
            tag.AddAttribute(_stringBuffer.ToString());
            _stringBuffer.Clear();
            return TagSelfClosing(tag);
        }

        if (current.IsUppercaseAscii())
        {
            _stringBuffer.Append(char.ToLower(current));
        }
        else if (current == ' ')
        {
            RaiseErrorOccurred(HtmlParseError.Null);
            _stringBuffer.Append('�');
        }
        else if (current == '')
        {
            return NewEof();
        }
        else
        {
            // Quotes and '<' are reported but still become part of the name.
            if (current == '"' || current == '\'' || current == '<')
            {
                RaiseErrorOccurred(HtmlParseError.AttributeNameInvalid);
            }

            _stringBuffer.Append(current);
        }
    }
}
/// <summary>
/// After attribute name state: skips whitespace, then finishes the tag, starts a
/// value after '=', or begins the next attribute name.
/// </summary>
/// <param name="tag">The tag token being built.</param>
private HtmlToken AttributeAfterName(HtmlTagToken tag)
{
    char current = SkipSpaces();

    if (current == '>')
    {
        return EmitTag(tag);
    }

    if (current == '=')
    {
        return AttributeBeforeValue(tag);
    }

    if (current == '/')
    {
        return TagSelfClosing(tag);
    }

    if (current.IsUppercaseAscii())
    {
        _stringBuffer.Append(char.ToLower(current));
    }
    else if (current == ' ')
    {
        RaiseErrorOccurred(HtmlParseError.Null);
        _stringBuffer.Append('�');
    }
    else if (current == '')
    {
        return NewEof();
    }
    else
    {
        // Quotes and '<' are reported but still become part of the name.
        if (current == '"' || current == '\'' || current == '<')
        {
            RaiseErrorOccurred(HtmlParseError.AttributeNameInvalid);
        }

        _stringBuffer.Append(current);
    }

    return AttributeName(tag);
}
/// <summary>
/// Before attribute value state: skips whitespace and selects the quoted or
/// unquoted value routine based on the first character.
/// </summary>
/// <param name="tag">The tag token being built.</param>
private HtmlToken AttributeBeforeValue(HtmlTagToken tag)
{
    char current = SkipSpaces();

    if (current == '"')
    {
        return AttributeDoubleQuotedValue(tag);
    }

    if (current == '\'')
    {
        return AttributeSingleQuotedValue(tag);
    }

    if (current == '&')
    {
        // The unquoted routine processes the ampersand itself.
        return AttributeUnquotedValue(current, tag);
    }

    if (current == '>')
    {
        RaiseErrorOccurred(HtmlParseError.TagClosedWrong);
        return EmitTag(tag);
    }

    if (current == '')
    {
        return NewEof();
    }

    if (current == ' ')
    {
        RaiseErrorOccurred(HtmlParseError.Null);
        _stringBuffer.Append('�');
    }
    else
    {
        // '<', '=' and '`' are reported but still become part of the value.
        if (current == '<' || current == '=' || current == '`')
        {
            RaiseErrorOccurred(HtmlParseError.AttributeValueInvalid);
        }

        _stringBuffer.Append(current);
    }

    return AttributeUnquotedValue(GetNext(), tag);
}
/// <summary>
/// Attribute value (double-quoted) state: buffers the value up to the closing '"',
/// resolving character references, then stores it on the tag.
/// </summary>
/// <param name="tag">The tag token being built.</param>
private HtmlToken AttributeDoubleQuotedValue(HtmlTagToken tag)
{
    while (true)
    {
        char current = GetNext();

        if (current == '"')
        {
            tag.SetAttributeValue(_stringBuffer.ToString());
            _stringBuffer.Clear();
            return AttributeAfterValue(tag);
        }

        if (current == '')
        {
            return NewEof();
        }

        if (current == '&')
        {
            AppendCharacterReference(GetNext(), '"');
        }
        else if (current == ' ')
        {
            RaiseErrorOccurred(HtmlParseError.Null);
            _stringBuffer.Append('�');
        }
        else
        {
            _stringBuffer.Append(current);
        }
    }
}
/// <summary>
/// Attribute value (single-quoted) state: buffers the value up to the closing '\'',
/// resolving character references, then stores it on the tag.
/// </summary>
/// <param name="tag">The tag token being built.</param>
private HtmlToken AttributeSingleQuotedValue(HtmlTagToken tag)
{
    while (true)
    {
        char current = GetNext();

        if (current == '\'')
        {
            tag.SetAttributeValue(_stringBuffer.ToString());
            _stringBuffer.Clear();
            return AttributeAfterValue(tag);
        }

        if (current == '')
        {
            return NewEof();
        }

        if (current == '&')
        {
            AppendCharacterReference(GetNext(), '\'');
        }
        else if (current == ' ')
        {
            RaiseErrorOccurred(HtmlParseError.Null);
            _stringBuffer.Append('�');
        }
        else
        {
            _stringBuffer.Append(current);
        }
    }
}
/// <summary>
/// Attribute value (unquoted) state: buffers the value until whitespace or '&gt;',
/// resolving character references, then stores it on the tag.
/// </summary>
/// <param name="c">The first character to examine.</param>
/// <param name="tag">The tag token being built.</param>
private HtmlToken AttributeUnquotedValue(char c, HtmlTagToken tag)
{
    for (; ; c = GetNext())
    {
        if (c == '>')
        {
            tag.SetAttributeValue(_stringBuffer.ToString());
            _stringBuffer.Clear();
            return EmitTag(tag);
        }

        if (c.IsSpaceCharacter())
        {
            tag.SetAttributeValue(_stringBuffer.ToString());
            _stringBuffer.Clear();
            return AttributeBeforeName(tag);
        }

        if (c == '&')
        {
            AppendCharacterReference(GetNext(), '>');
        }
        else if (c == ' ')
        {
            RaiseErrorOccurred(HtmlParseError.Null);
            _stringBuffer.Append('�');
        }
        else if (c == '')
        {
            return NewEof();
        }
        else
        {
            // Quotes, '<', '=' and '`' are reported but still become part of the value.
            if (c == '"' || c == '\'' || c == '<' || c == '=' || c == '`')
            {
                RaiseErrorOccurred(HtmlParseError.AttributeValueInvalid);
            }

            _stringBuffer.Append(c);
        }
    }
}
/// <summary>
/// After attribute value (quoted) state: expects '&gt;', whitespace or '/'; any other
/// character is reported and reprocessed as the start of a new attribute.
/// </summary>
/// <param name="tag">The tag token being built.</param>
private HtmlToken AttributeAfterValue(HtmlTagToken tag)
{
    char current = GetNext();

    if (current == '>')
    {
        return EmitTag(tag);
    }

    if (current.IsSpaceCharacter())
    {
        return AttributeBeforeName(tag);
    }

    if (current == '/')
    {
        return TagSelfClosing(tag);
    }

    if (current == '')
    {
        return NewEof();
    }

    RaiseErrorOccurred(HtmlParseError.AttributeNameExpected);
    Back();
    return AttributeBeforeName(tag);
}
/// <summary>
/// Script data state: buffers script text until '&lt;' or the end-of-input sentinel,
/// replacing characters reported as Null.
/// </summary>
private HtmlToken ScriptData(char c)
{
    for (; ; c = GetNext())
    {
        if (c == ' ')
        {
            RaiseErrorOccurred(HtmlParseError.Null);
            _stringBuffer.Append('�');
        }
        else if (c == '<')
        {
            return ScriptDataLt(GetNext());
        }
        else if (c == '')
        {
            Back();
            return NewCharacter();
        }
        else
        {
            _stringBuffer.Append(c);
        }
    }
}
/// <summary>
/// Script data less-than sign state: buffers the '&lt;' and checks for an end tag
/// ("&lt;/" plus a letter) or the start of an escape sequence ("&lt;!-").
/// </summary>
private HtmlToken ScriptDataLt(char c)
{
    _stringBuffer.Append('<');

    if (c == '/')
    {
        c = GetNext();

        // Buffer length right after "</": where the candidate tag name begins.
        int nameStart = _stringBuffer.Append('/').Length;

        if (c.IsLetter())
        {
            _stringBuffer.Append(c);
            return ScriptDataNameEndTag(NewTagClose(), nameStart);
        }
    }
    else if (c == '!')
    {
        _stringBuffer.Append('!');
        c = GetNext();

        if (c == '-')
        {
            return ScriptDataEscapeDashLt(GetNext());
        }
    }

    return ScriptData(c);
}
/// <summary>
/// Script data end tag name state: reads letters after "&lt;/" and checks whether they
/// spell the remembered start tag name (case-insensitively) followed by whitespace,
/// '/' or '&gt;'. Otherwise the characters stay in the buffer as script text.
/// </summary>
/// <param name="tag">The end tag token to complete when the name matches.</param>
/// <param name="offset">Index in the string buffer where the candidate name begins.</param>
private HtmlToken ScriptDataNameEndTag(HtmlTagToken tag, int offset)
{
int length = _lastStartTag.Length;
char next;
while (true) {
next = GetNext();
bool flag = next.IsSpaceCharacter();
bool flag2 = next == '>';
bool flag3 = next == '/';
// Only compare once exactly as many characters as the start tag name are buffered
// and the current character can terminate a tag name.
if (_stringBuffer.Length - offset == length && (flag || flag2 || flag3)) {
string text = _stringBuffer.ToString(offset, length);
if (text.Equals(_lastStartTag, StringComparison.OrdinalIgnoreCase)) {
// offset > 2 means text precedes the "</" in the buffer: rewind the source,
// drop "</name" from the buffer and emit the preceding text first.
// NOTE(review): Back(3 + length) presumably repositions before the '<' so the
// end tag is tokenized again on the next call - confirm against BaseTokenizer.
if (offset > 2) {
Back(3 + length);
_stringBuffer.Remove(offset - 2, length + 2);
return NewCharacter();
}
// Nothing but "</name" was buffered: continue as a real end tag.
_stringBuffer.Clear();
if (flag) {
tag.Name = _lastStartTag;
return AttributeBeforeName(tag);
}
if (flag3) {
tag.Name = _lastStartTag;
return TagSelfClosing(tag);
}
if (flag2) {
tag.Name = _lastStartTag;
return EmitTag(tag);
}
}
}
if (!next.IsLetter())
break;
_stringBuffer.Append(next);
}
// Not a matching end tag: resume script data with the current character.
return ScriptData(next);
}
/// <summary>
/// Handles the character after "&lt;!-" in script data: a second dash enters the
/// escaped dash-dash state, anything else returns to plain script data.
/// </summary>
private HtmlToken ScriptDataEscapeDashLt(char c)
{
    _stringBuffer.Append('-');

    if (c != '-')
    {
        return ScriptData(c);
    }

    _stringBuffer.Append('-');
    return ScriptDataEscapedDashDash();
}
/// <summary>
/// Script data escaped state: replaces characters reported as Null, then dispatches
/// on '-', '&lt;' or the end-of-input sentinel; any other character is handed to ScriptData.
/// </summary>
private HtmlToken ScriptDataEscaped(char c)
{
    while (c == ' ')
    {
        RaiseErrorOccurred(HtmlParseError.Null);
        _stringBuffer.Append('�');
        c = GetNext();
    }

    if (c == '-')
    {
        _stringBuffer.Append('-');
        return ScriptDataEscapedDash(GetNext());
    }

    if (c == '<')
    {
        return ScriptDataEscapedLT(GetNext());
    }

    if (c == '')
    {
        Back();
        return NewCharacter();
    }

    return ScriptData(c);
}
/// <summary>
/// Script data escaped dash state: a second dash enters the dash-dash state, '&lt;'
/// may start a tag, and other characters are buffered before returning to the
/// escaped state.
/// </summary>
private HtmlToken ScriptDataEscapedDash(char c)
{
    if (c == '-')
    {
        _stringBuffer.Append('-');
        return ScriptDataEscapedDashDash();
    }

    if (c == '<')
    {
        return ScriptDataEscapedLT(GetNext());
    }

    if (c == '')
    {
        Back();
        return NewCharacter();
    }

    if (c == ' ')
    {
        RaiseErrorOccurred(HtmlParseError.Null);
        _stringBuffer.Append('�');
    }
    else
    {
        _stringBuffer.Append(c);
    }

    return ScriptDataEscaped(GetNext());
}
/// <summary>
/// Script data escaped dash-dash state: buffers further dashes, leaves the escape on
/// '&gt;', and otherwise returns to the escaped state.
/// </summary>
private HtmlToken ScriptDataEscapedDashDash()
{
    char current = GetNext();

    // Any run of additional dashes is plain data.
    while (current == '-')
    {
        _stringBuffer.Append('-');
        current = GetNext();
    }

    if (current == '<')
    {
        return ScriptDataEscapedLT(GetNext());
    }

    if (current == '>')
    {
        _stringBuffer.Append('>');
        return ScriptData(GetNext());
    }

    if (current == '')
    {
        return NewCharacter();
    }

    if (current == ' ')
    {
        RaiseErrorOccurred(HtmlParseError.Null);
        _stringBuffer.Append('�');
    }
    else
    {
        _stringBuffer.Append(current);
    }

    return ScriptDataEscaped(GetNext());
}
/// <summary>
/// Script data escaped less-than sign state: '/' may start an end tag, a letter may
/// start a nested script tag (double escape), anything else is buffered as text.
/// </summary>
private HtmlToken ScriptDataEscapedLT(char c)
{
    if (c == '/')
    {
        return ScriptDataEscapedEndTag(GetNext());
    }

    if (!c.IsLetter())
    {
        _stringBuffer.Append('<');
        return ScriptDataEscaped(c);
    }

    // Buffer length right after '<': where the candidate tag name begins.
    int nameStart = _stringBuffer.Append('<').Length;
    _stringBuffer.Append(c);
    return ScriptDataStartDoubleEscape(nameStart);
}
/// <summary>
/// Handles the character following "&lt;/" inside escaped script data. Both characters
/// are buffered as text; a letter starts the end tag name check.
/// </summary>
/// <param name="c">The character right after the "&lt;/".</param>
/// <returns>The token produced by the handler the character is dispatched to.</returns>
private HtmlToken ScriptDataEscapedEndTag(char c)
{
    _stringBuffer.Append('<');
    _stringBuffer.Append('/');

    // Buffer index at which a tag name would begin.
    int nameStart = _stringBuffer.Length;

    if (!c.IsLetter())
        return ScriptDataEscaped(c);

    _stringBuffer.Append(c);
    return ScriptDataEscapedNameEndTag(NewTagClose(), nameStart);
}
/// <summary>
/// Collects the letters of a potential end tag name inside escaped script data and
/// checks whether the name equals <c>Tags.Script</c> (ignoring case).
/// </summary>
/// <param name="tag">The end tag token created by the caller; not used by this method's body.</param>
/// <param name="offset">Index in the text buffer at which the tag name starts.</param>
/// <returns>
/// A character token with the text before the "&lt;/script" when the name matches,
/// otherwise the token produced by ScriptDataEscaped for the first non-letter.
/// </returns>
private HtmlToken ScriptDataEscapedNameEndTag(HtmlTagToken tag, int offset)
{
    int nameLength = Tags.Script.Length;
    char current = GetNext();

    while (true) {
        bool endsName = current == '/' || current == '>' || current.IsSpaceCharacter();

        // Only inspect the buffered name when it has exactly the expected length.
        if (endsName && _stringBuffer.Length - offset == nameLength) {
            string name = _stringBuffer.ToString(offset, nameLength);

            if (name.Equals(Tags.Script, StringComparison.OrdinalIgnoreCase)) {
                // Presumably rewinds the input to just before the "</" (two characters,
                // the name and the terminator) — verify against Back() in the base class.
                Back(nameLength + 3);
                // Drop "</" plus the name from the buffered text.
                _stringBuffer.Remove(offset - 2, nameLength + 2);
                return NewCharacter();
            }
        }

        if (!current.IsLetter())
            return ScriptDataEscaped(current);

        _stringBuffer.Append(current);
        current = GetNext();
    }
}
/// <summary>
/// Collects the letters of a potential start tag name inside escaped script data.
/// When the name is terminated and equals <c>Tags.Script</c> (ignoring case),
/// processing continues in the double-escaped handler.
/// </summary>
/// <param name="offset">Index in the text buffer at which the tag name starts.</param>
/// <returns>The token produced by the handler that takes over.</returns>
private HtmlToken ScriptDataStartDoubleEscape(int offset)
{
    int nameLength = Tags.Script.Length;
    char current = GetNext();

    while (true) {
        bool endsName = current == '/' || current == '>' || current.IsSpaceCharacter();

        if (endsName && _stringBuffer.Length - offset == nameLength) {
            // Compare before the terminator is appended to the buffer.
            string name = _stringBuffer.ToString(offset, nameLength);
            bool isScript = name.Equals(Tags.Script, StringComparison.OrdinalIgnoreCase);
            _stringBuffer.Append(current);
            char following = GetNext();

            if (isScript)
                return ScriptDataEscapedDouble(following);

            return ScriptDataEscaped(following);
        }

        // The first non-letter ends the attempt; it is handed back to the escaped handler.
        if (!current.IsLetter())
            return ScriptDataEscaped(current);

        _stringBuffer.Append(current);
        current = GetNext();
    }
}
/// <summary>
/// Script data double escaped state: buffers characters until a dash, a '&lt;'
/// or the end of the input is seen.
/// </summary>
/// <param name="c">The current input character.</param>
/// <returns>
/// The token produced by the handler that takes over, or a character token with the
/// buffered text when the end-of-input sentinel is reached.
/// </returns>
private HtmlToken ScriptDataEscapedDouble(char c)
{
    while (true) {
        switch (c) {
            case '-':
                _stringBuffer.Append('-');
                return ScriptDataEscapedDoubleDash(GetNext());
            case '<':
                _stringBuffer.Append('<');
                return ScriptDataEscapedDoubleLT(GetNext());
            case ' ':
                RaiseErrorOccurred(HtmlParseError.Null);
                // Substitute the character instead of appending here: the shared append
                // below then writes only the replacement character. Appending it here and
                // falling through would also buffer the offending character itself.
                c = '�';
                break;
            case '':
                // End-of-input sentinel (the value Get() checks before dispatching).
                RaiseErrorOccurred(HtmlParseError.EOF);
                Back();
                return NewCharacter();
        }
        _stringBuffer.Append(c);
        c = GetNext();
    }
}
/// <summary>
/// Examines one character inside double-escaped script data and either dispatches
/// to a more specific handler or passes it on to ScriptDataEscapedDouble.
/// </summary>
/// <param name="c">The character to examine.</param>
/// <returns>The token produced by the handler the character is dispatched to.</returns>
private HtmlToken ScriptDataEscapedDoubleDash(char c)
{
switch (c) {
case '-':
// Another dash: buffer it and continue in the dash-dash handler.
_stringBuffer.Append('-');
return ScriptDataEscapedDoubleDashDash();
case '<':
_stringBuffer.Append('<');
return ScriptDataEscapedDoubleLT(GetNext());
case ' ':
// Reported as HtmlParseError.Null; the replacement character is passed on in its place.
RaiseErrorOccurred(HtmlParseError.Null);
c = '�';
break;
case '':
// End-of-input sentinel (the value Get() checks before dispatching):
// report it, call Back() and flush the buffered text.
RaiseErrorOccurred(HtmlParseError.EOF);
Back();
return NewCharacter();
}
// Nothing was buffered for this character yet; ScriptDataEscapedDouble receives it as is.
return ScriptDataEscapedDouble(c);
}
/// <summary>
/// Reads characters after two consecutive dashes in double-escaped script data.
/// Further dashes are buffered in a loop; any other character ends the loop by
/// dispatching to another handler or returning a token.
/// </summary>
/// <returns>The token produced by the handler that takes over, or a character token at the end of the input.</returns>
private HtmlToken ScriptDataEscapedDoubleDashDash()
{
while (true) {
char next = GetNext();
switch (next) {
case '-':
// Leaves the switch only; the dash is appended at the bottom of the loop.
break;
case '<':
_stringBuffer.Append('<');
return ScriptDataEscapedDoubleLT(GetNext());
case '>':
// "-->" is complete: buffer '>' and continue with ScriptData.
_stringBuffer.Append('>');
return ScriptData(GetNext());
case ' ':
// Reported as HtmlParseError.Null; the replacement character is buffered instead.
RaiseErrorOccurred(HtmlParseError.Null);
_stringBuffer.Append('�');
return ScriptDataEscapedDouble(GetNext());
case '':
// End-of-input sentinel: report it, call Back() and flush the buffered text.
RaiseErrorOccurred(HtmlParseError.EOF);
Back();
return NewCharacter();
default:
_stringBuffer.Append(next);
return ScriptDataEscapedDouble(GetNext());
}
// Only reached for '-': buffer it and read the next character.
_stringBuffer.Append('-');
}
}
/// <summary>
/// Handles the character following a '&lt;' inside double-escaped script data.
/// Only a '/' starts the check for a closing "script" name.
/// </summary>
/// <param name="c">The character right after the '&lt;'.</param>
/// <returns>The token produced by the handler the character is dispatched to.</returns>
private HtmlToken ScriptDataEscapedDoubleLT(char c)
{
    if (c != '/')
        return ScriptDataEscapedDouble(c);

    // Keep the slash as text; the name to compare begins right after it.
    _stringBuffer.Append('/');
    int nameStart = _stringBuffer.Length;
    return ScriptDataEndDoubleEscape(nameStart);
}
/// <summary>
/// Collects the letters of a potential closing tag name inside double-escaped script
/// data. When the name is terminated and equals <c>Tags.Script</c> (ignoring case),
/// processing falls back to the (single) escaped handler.
/// </summary>
/// <param name="offset">Index in the text buffer at which the tag name starts.</param>
/// <returns>The token produced by the handler that takes over.</returns>
private HtmlToken ScriptDataEndDoubleEscape(int offset)
{
    int nameLength = Tags.Script.Length;
    char current = GetNext();

    while (true) {
        bool endsName = current.IsSpaceCharacter() || current == '/' || current == '>';

        if (endsName && _stringBuffer.Length - offset == nameLength) {
            // Compare before the terminator is appended to the buffer.
            string name = _stringBuffer.ToString(offset, nameLength);
            bool isScript = name.Equals(Tags.Script, StringComparison.OrdinalIgnoreCase);
            _stringBuffer.Append(current);
            char following = GetNext();

            if (isScript)
                return ScriptDataEscaped(following);

            return ScriptDataEscapedDouble(following);
        }

        // The first non-letter ends the attempt; it is handed back to the double-escaped handler.
        if (!current.IsLetter())
            return ScriptDataEscapedDouble(current);

        _stringBuffer.Append(current);
        current = GetNext();
    }
}
/// <summary>
/// Creates a character token from the current content of the text buffer and
/// empties the buffer afterwards.
/// </summary>
/// <returns>A Character token carrying the buffered text and the stored position.</returns>
private HtmlToken NewCharacter()
{
    string content = _stringBuffer.ToString();
    HtmlToken token = new HtmlToken(HtmlTokenType.Character, _position, content);
    _stringBuffer.Clear();
    return token;
}
/// <summary>
/// Creates a comment token from the current content of the text buffer and
/// empties the buffer afterwards.
/// </summary>
/// <returns>A Comment token carrying the buffered text and the stored position.</returns>
private HtmlToken NewComment()
{
    string content = _stringBuffer.ToString();
    HtmlToken token = new HtmlToken(HtmlTokenType.Comment, _position, content);
    _stringBuffer.Clear();
    return token;
}
/// <summary>
/// Creates the end-of-file token at the stored position.
/// </summary>
/// <returns>A token of type EndOfFile.</returns>
private HtmlToken NewEof()
{
    HtmlToken eof = new HtmlToken(HtmlTokenType.EndOfFile, _position);
    return eof;
}
/// <summary>
/// Creates a doctype token at the stored position.
/// </summary>
/// <param name="quirksForced">Value handed to the token's constructor as its quirks flag.</param>
/// <returns>The newly created doctype token.</returns>
private HtmlDoctypeToken NewDoctype(bool quirksForced)
{
    HtmlDoctypeToken doctype = new HtmlDoctypeToken(quirksForced, _position);
    return doctype;
}
/// <summary>
/// Creates an empty start tag token at the stored position.
/// </summary>
/// <returns>A tag token of type StartTag.</returns>
private HtmlTagToken NewTagOpen()
{
    HtmlTagToken startTag = new HtmlTagToken(HtmlTokenType.StartTag, _position);
    return startTag;
}
/// <summary>
/// Creates an empty end tag token at the stored position.
/// </summary>
/// <returns>A tag token of type EndTag.</returns>
private HtmlTagToken NewTagClose()
{
    HtmlTagToken endTag = new HtmlTagToken(HtmlTokenType.EndTag, _position);
    return endTag;
}
/// <summary>
/// Checks whether the buffered text is exactly the name of the last emitted start tag
/// and <paramref name="c"/> terminates a tag name. If so, an end tag token with that
/// name is created and processing continues according to the terminator.
/// </summary>
/// <param name="c">The character that follows the buffered name.</param>
/// <returns>
/// The token produced by the follow-up handler, or <c>null</c> when the buffered text
/// does not form a matching end tag.
/// </returns>
private HtmlToken CreateIfAppropriate(char c)
{
    bool isSpace = c.IsSpaceCharacter();
    bool isClosing = c == '>';
    bool isSlash = c == '/';

    if (_stringBuffer.Length != _lastStartTag.Length)
        return null;

    if (!isSpace && !isClosing && !isSlash)
        return null;

    // Case-sensitive comparison against the remembered start tag name.
    if (!_stringBuffer.ToString().Equals(_lastStartTag, StringComparison.Ordinal))
        return null;

    HtmlTagToken endTag = NewTagClose();
    _stringBuffer.Clear();
    endTag.Name = _lastStartTag;

    if (isSpace)
        return AttributeBeforeName(endTag);

    if (isSlash)
        return TagSelfClosing(endTag);

    // Neither whitespace nor '/': the terminator can only be '>'.
    return EmitTag(endTag);
}
/// <summary>
/// Finishes a tag token before it is handed out and switches the tokenizer back to
/// <c>HtmlParseMode.PCData</c>.
/// For start tags, every attribute whose key already occurs earlier in the list is
/// removed (one AttributeDuplicateOmitted error each) and the tag name is remembered
/// as the last start tag. For end tags, errors are raised if the tag is self-closing
/// or carries attributes.
/// </summary>
/// <param name="tag">The tag token to finish.</param>
/// <returns>The same <paramref name="tag"/> instance.</returns>
private HtmlToken EmitTag(HtmlTagToken tag)
{
    List<KeyValuePair<string, string>> attrs = tag.Attributes;
    _state = HtmlParseMode.PCData;
    HtmlTokenType kind = tag.Type;

    if (kind == HtmlTokenType.StartTag) {
        // Walk from the back so removing an entry never shifts the ones still to be checked.
        int index = attrs.Count - 1;

        while (index > 0) {
            string key = attrs[index].Key;
            bool seenEarlier = false;

            for (int earlier = index - 1; earlier >= 0 && !seenEarlier; earlier--)
                seenEarlier = attrs[earlier].Key == key;

            if (seenEarlier) {
                attrs.RemoveAt(index);
                RaiseErrorOccurred(HtmlParseError.AttributeDuplicateOmitted, tag.Position);
            }

            index--;
        }

        _lastStartTag = tag.Name;
    }
    else if (kind == HtmlTokenType.EndTag) {
        if (tag.IsSelfClosing)
            RaiseErrorOccurred(HtmlParseError.EndTagCannotBeSelfClosed, tag.Position);

        if (attrs.Count != 0)
            RaiseErrorOccurred(HtmlParseError.EndTagCannotHaveAttributes, tag.Position);
    }

    return tag;
}
}
}