// HtmlTokenizer
// Performs the tokenization of the source code. Follows the tokenization algorithm at:
// http://www.w3.org/html/wg/drafts/html/master/syntax.html
using AngleSharp.Events;
using AngleSharp.Extensions;
using AngleSharp.Html;
using System;
using System.Collections.Generic;
using System.Diagnostics;
namespace AngleSharp.Parser.Html
{
[DebuggerStepThrough]
internal sealed class HtmlTokenizer : BaseTokenizer
{
// Backing field for IsAcceptingCharacterData; checked by MarkupDeclaration before a CDATA section is recognised.
private bool _acceptsCharacterData;
// Name that candidate end tags are compared against in the RCDATA, RAWTEXT and script states.
// NOTE(review): not assigned in the visible part of the file - presumably set when a start tag is emitted; confirm.
private string _lastStartTag;
// Backing field for State; selects the entry state used by Get().
private HtmlParseMode _state;
// Token held back by Get() while pending character data is returned first.
private HtmlToken _buffered;
/// <summary>
/// Gets or sets the flag that MarkupDeclaration consults before treating
/// "[CDATA[" as the start of a CDATA section.
/// </summary>
public bool IsAcceptingCharacterData
{
    get { return _acceptsCharacterData; }
    set { _acceptsCharacterData = value; }
}
/// <summary>
/// Gets or sets the parse mode that decides which state Get() starts in
/// for the next token.
/// </summary>
public HtmlParseMode State
{
    get { return _state; }
    set { _state = value; }
}
/// <summary>
/// Creates a tokenizer over the given source. Starts in PCData mode with
/// CDATA section recognition switched off.
/// </summary>
/// <param name="source">The text source handed to the base tokenizer.</param>
/// <param name="events">The event aggregator handed to the base tokenizer.</param>
public HtmlTokenizer(TextSource source, IEventAggregator events)
    : base(source, events)
{
    _acceptsCharacterData = false;
    _state = HtmlParseMode.PCData;
}
/// <summary>
/// Publishes a parse error event for the given code at the current text
/// position. Does nothing when no event aggregator is attached.
/// </summary>
/// <param name="code">The parse error to report.</param>
public void RaiseErrorOccurred(HtmlParseError code)
{
    if (_events == null)
        return;

    _events.Publish(new HtmlParseErrorEvent(code, GetCurrentPosition()));
}
/// <summary>
/// Produces the next token. A token that was held back on the previous
/// call is returned first; otherwise the state selected by the current
/// parse mode is run. If that run collected character data, the data is
/// returned as a character token and the produced token is held back for
/// the next call.
/// </summary>
/// <returns>The next token, or the end-of-file token once the source has ended.</returns>
public HtmlToken Get()
{
    if (_buffered != null) {
        HtmlToken pending = _buffered;
        _buffered = null;
        return pending;
    }

    char current = GetNext();
    if (base.IsEnded)
        return HtmlToken.EndOfFile;

    HtmlToken token = null;
    switch (_state) {
        case HtmlParseMode.PCData:
            token = Data(current);
            break;
        case HtmlParseMode.RCData:
            token = RCData(current);
            break;
        case HtmlParseMode.Plaintext:
            token = Plaintext(current);
            break;
        case HtmlParseMode.Rawtext:
            token = Rawtext(current);
            break;
        case HtmlParseMode.Script:
            token = ScriptData(current);
            break;
    }

    if (_textBuffer.Length == 0)
        return token;

    // Text gathered before the token must be delivered first.
    _buffered = token;
    HtmlToken characters = HtmlToken.Character(_textBuffer.ToString());
    _textBuffer.Clear();
    return characters;
}
/// <summary>
/// Plaintext state: copies every character into the text buffer until the
/// end of the source. The branch that reports HtmlParseError.Null stores
/// U+FFFD instead of the offending character.
/// </summary>
/// <param name="c">The first character to process.</param>
/// <returns>The end-of-file token.</returns>
private HtmlToken Plaintext(char c)
{
    for (; ; c = GetNext()) {
        if (c == '')
            return HtmlToken.EndOfFile;

        if (c == ' ') {
            RaiseErrorOccurred(HtmlParseError.Null);
            _textBuffer.Append('�');
        } else {
            _textBuffer.Append(c);
        }
    }
}
/// <summary>
/// Data state: collects text into the text buffer, resolves character
/// references and hands over to TagOpen when a '&lt;' is found.
/// </summary>
/// <param name="c">The first character to process.</param>
/// <returns>The token produced by TagOpen, or the end-of-file token.</returns>
private HtmlToken Data(char c)
{
    while (true) {
        switch (c) {
            case '&': {
                string text = CharacterReference(GetNext(), ' ');
                // A null result means "no reference": keep the ampersand literally.
                // Appending a null string afterwards is a no-op.
                if (text == null)
                    _textBuffer.Append('&');
                _textBuffer.Append(text);
                break;
            }
            case '<':
                return TagOpen();
            case ' ':
                // Report the error and drop the character. Continuing the loop
                // (instead of recursing into Data) keeps the stack depth constant
                // no matter how many of these characters the input contains.
                RaiseErrorOccurred(HtmlParseError.Null);
                break;
            case '':
                return HtmlToken.EndOfFile;
            default:
                _textBuffer.Append(c);
                break;
        }
        c = GetNext();
    }
}
/// <summary>
/// RCDATA state: collects text, resolves character references and switches
/// to end tag detection when "&lt;/" is seen. A '&lt;' followed by anything
/// else is kept as text and the following character is processed again.
/// </summary>
/// <param name="c">The first character to process.</param>
/// <returns>The token produced by RCDataEndTag, or the end-of-file token.</returns>
private HtmlToken RCData(char c)
{
    while (true) {
        if (c == '')
            return HtmlToken.EndOfFile;

        if (c == '<') {
            c = GetNext();
            if (c == '/') {
                _stringBuffer.Clear();
                return RCDataEndTag();
            }
            _textBuffer.Append('<');
            // The character after '<' has not been handled yet: loop without advancing.
            continue;
        }

        if (c == '&') {
            string reference = CharacterReference(GetNext(), ' ');
            if (reference == null)
                _textBuffer.Append('&');
            else
                _textBuffer.Append(reference);
        } else if (c == ' ') {
            RaiseErrorOccurred(HtmlParseError.Null);
            _textBuffer.Append('�');
        } else {
            _textBuffer.Append(c);
        }

        c = GetNext();
    }
}
/// <summary>
/// RCDATA end tag open state: an ASCII letter starts an end tag name
/// (stored lower-cased); anything else puts "&lt;/" back into the text and
/// returns to RCData with that character.
/// </summary>
private HtmlToken RCDataEndTag()
{
    char next = GetNext();
    bool isUpper = next.IsUppercaseAscii();

    if (!isUpper && !next.IsLowercaseAscii()) {
        _textBuffer.Append('<').Append('/');
        return RCData(next);
    }

    _stringBuffer.Clear().Append(isUpper ? char.ToLower(next) : next);
    return RCDataNameEndTag(HtmlTagToken.Close());
}
/// <summary>
/// RCDATA end tag name state: accumulates ASCII letters (lower-cased) and
/// completes the end tag only when the name collected so far equals
/// _lastStartTag and is followed by whitespace, '/' or '>'. Any other
/// character turns the collected input back into text.
/// </summary>
/// <param name="tag">The end tag token being built.</param>
private HtmlToken RCDataNameEndTag(HtmlTagToken tag)
{
    while (true) {
        char next = GetNext();
        string name = _stringBuffer.ToString();

        if (name == _lastStartTag) {
            if (next.IsSpaceCharacter()) {
                tag.Name = name;
                return AttributeBeforeName(tag);
            }
            if (next == '/') {
                tag.Name = name;
                return TagSelfClosing(tag);
            }
            if (next == '>') {
                tag.Name = name;
                return EmitTag(tag);
            }
        }

        if (next.IsUppercaseAscii()) {
            _stringBuffer.Append(char.ToLower(next));
        } else if (next.IsLowercaseAscii()) {
            _stringBuffer.Append(next);
        } else {
            // Not an appropriate end tag: everything consumed so far is plain text.
            _textBuffer.Append('<').Append('/').Append(_stringBuffer.ToString());
            return RCData(next);
        }
    }
}
/// <summary>
/// RAWTEXT state: copies characters into the text buffer until a '&lt;'
/// (handled by RawtextLT) or the end of the source. The branch that reports
/// HtmlParseError.Null stores U+FFFD instead.
/// </summary>
/// <param name="c">The first character to process.</param>
private HtmlToken Rawtext(char c)
{
    for (; ; c = GetNext()) {
        if (c == '<')
            return RawtextLT();

        if (c == '')
            return HtmlToken.EndOfFile;

        if (c == ' ') {
            RaiseErrorOccurred(HtmlParseError.Null);
            _textBuffer.Append('�');
        } else {
            _textBuffer.Append(c);
        }
    }
}
/// <summary>
/// RAWTEXT less-than state: "&lt;/" starts end tag detection; otherwise the
/// '&lt;' is kept as text and the following character goes back to Rawtext.
/// </summary>
private HtmlToken RawtextLT()
{
    char next = GetNext();

    if (next != '/') {
        _textBuffer.Append('<');
        return Rawtext(next);
    }

    _stringBuffer.Clear();
    return RawtextEndTag();
}
/// <summary>
/// RAWTEXT end tag open state: an ASCII letter starts an end tag name
/// (stored lower-cased); anything else puts "&lt;/" back into the text and
/// returns to Rawtext with that character.
/// </summary>
private HtmlToken RawtextEndTag()
{
    char next = GetNext();
    bool isUpper = next.IsUppercaseAscii();

    if (!isUpper && !next.IsLowercaseAscii()) {
        _textBuffer.Append('<').Append('/');
        return Rawtext(next);
    }

    _stringBuffer.Clear().Append(isUpper ? char.ToLower(next) : next);
    return RawtextNameEndTag(HtmlTagToken.Close());
}
/// <summary>
/// RAWTEXT end tag name state: accumulates ASCII letters (lower-cased) and
/// completes the end tag only when the name collected so far equals
/// _lastStartTag and is followed by whitespace, '/' or '>'. Any other
/// character turns the collected input back into text.
/// </summary>
/// <param name="tag">The end tag token being built.</param>
private HtmlToken RawtextNameEndTag(HtmlTagToken tag)
{
    while (true) {
        char next = GetNext();
        string name = _stringBuffer.ToString();

        if (name == _lastStartTag) {
            if (next.IsSpaceCharacter()) {
                tag.Name = name;
                return AttributeBeforeName(tag);
            }
            if (next == '/') {
                tag.Name = name;
                return TagSelfClosing(tag);
            }
            if (next == '>') {
                tag.Name = name;
                return EmitTag(tag);
            }
        }

        if (next.IsUppercaseAscii()) {
            _stringBuffer.Append(char.ToLower(next));
        } else if (next.IsLowercaseAscii()) {
            _stringBuffer.Append(next);
        } else {
            // Not an appropriate end tag: everything consumed so far is plain text.
            _textBuffer.Append('<').Append('/').Append(_stringBuffer.ToString());
            return Rawtext(next);
        }
    }
}
/// <summary>
/// CDATA section: collects characters into the string buffer until the
/// terminating "]]>" or the end of the source, then returns them as a
/// character token.
/// </summary>
private HtmlToken CData()
{
    _stringBuffer.Clear();

    for (char next = GetNext(); ; next = GetNext()) {
        if (next == '') {
            Back();
            break;
        }

        if (next == ']' && ContinuesWith("]]>", true)) {
            // Skip the remaining two characters of the terminator.
            Advance(2);
            break;
        }

        _stringBuffer.Append(next);
    }

    return HtmlToken.Character(_stringBuffer.ToString());
}
/// <summary>
/// Tries to consume a character reference; the '&amp;' has already been read.
/// </summary>
/// <param name="c">The character following the ampersand.</param>
/// <param name="allowedCharacter">
/// An additional character that ends the attempt immediately. Inside
/// attribute values a value other than the default also enables the
/// "followed by '=' or alphanumeric" bail-out for named references.
/// </param>
/// <returns>
/// The replacement text, or null when no reference was consumed (the caller
/// then emits the ampersand literally).
/// </returns>
private string CharacterReference(char c, char allowedCharacter = ' ')
{
    if (c.IsSpaceCharacter() || c == '<' || c == '' || c == '&' || c == allowedCharacter) {
        Back();
        return null;
    }

    if (c == '#') {
        int radix = 10;
        int code = 0;
        List<int> digits = new List<int>();
        c = GetNext();
        bool isHex = c == 'x' || c == 'X';

        if (!isHex) {
            while (c.IsDigit()) {
                digits.Add(c.FromHex());
                c = GetNext();
            }
        } else {
            radix = 16;
            while ((c = GetNext()).IsHex()) {
                digits.Add(c.FromHex());
            }
        }

        // Accumulate most-significant digit first and saturate just above the
        // Unicode maximum. Long digit runs therefore cannot wrap around Int32
        // and alias a valid code point or become negative.
        // NOTE(review): relies on Entities.IsInvalidNumber rejecting values
        // above 0x10FFFF - confirm against Entities.
        for (int i = 0; i < digits.Count; i++) {
            code = code * radix + digits[i];
            if (code > 0x10FFFF) {
                code = 0x110000;
                break;
            }
        }

        if (digits.Count == 0) {
            // Rewind so the caller sees "&#" (or "&#x") again as plain text.
            Back(2);
            if (isHex)
                Back();
            RaiseErrorOccurred(HtmlParseError.CharacterReferenceWrongNumber);
            return null;
        }

        if (c != ';') {
            RaiseErrorOccurred(HtmlParseError.CharacterReferenceSemicolonMissing);
            Back();
        }

        if (Entities.IsInCharacterTable(code)) {
            RaiseErrorOccurred(HtmlParseError.CharacterReferenceInvalidCode);
            return Entities.GetSymbolFromTable(code);
        }

        if (Entities.IsInvalidNumber(code)) {
            RaiseErrorOccurred(HtmlParseError.CharacterReferenceInvalidNumber);
            return '�'.ToString();
        }

        if (Entities.IsInInvalidRange(code))
            RaiseErrorOccurred(HtmlParseError.CharacterReferenceInvalidRange);

        return Entities.Convert(code);
    }

    // Named reference: extend the candidate one character at a time and
    // remember the last candidate that resolved to a symbol.
    const int maxNameLength = 31;
    string result = null;
    int rewind = 0;
    int insertionPoint = base.InsertionPoint - 1;
    char[] nameBuffer = new char[maxNameLength];
    int length = 0;
    char current = base.Current;

    while (current != ';' && current.IsName()) {
        nameBuffer[length++] = current;
        string name = new string(nameBuffer, 0, length);
        current = GetNext();
        rewind++;
        name = (current == ';') ? Entities.GetSymbol(name) : Entities.GetSymbolWithoutSemicolon(name);

        if (name != null) {
            // Characters up to here belong to a match; nothing to rewind yet.
            rewind = 0;
            result = name;
        }

        if (base.IsEnded || length >= maxNameLength)
            break;
    }

    // Give back everything read after the last successful match.
    Back(rewind);
    current = base.Current;

    if (current != ';') {
        if (allowedCharacter != 0 && (current == '=' || current.IsAlphanumericAscii())) {
            if (current == '=')
                RaiseErrorOccurred(HtmlParseError.CharacterReferenceAttributeEqualsFound);
            base.InsertionPoint = insertionPoint;
            return null;
        }

        Back();
        RaiseErrorOccurred(HtmlParseError.CharacterReferenceNotTerminated);
    }

    return result;
}
/// <summary>
/// Tag open state, entered after '&lt;': dispatches to end tag, start tag
/// name, markup declaration or bogus comment handling. Anything else is
/// reported as an ambiguous open tag and the '&lt;' is kept as text.
/// </summary>
private HtmlToken TagOpen()
{
    char next = GetNext();

    if (next == '/')
        return TagEnd(GetNext());

    bool isLower = next.IsLowercaseAscii();
    if (isLower || next.IsUppercaseAscii()) {
        HtmlTagToken tag = HtmlTagToken.Open();
        _stringBuffer.Clear().Append(isLower ? next : char.ToLower(next));
        return TagName(tag);
    }

    if (next == '!')
        return MarkupDeclaration();

    if (next == '?') {
        RaiseErrorOccurred(HtmlParseError.BogusComment);
        return BogusComment(next);
    }

    _state = HtmlParseMode.PCData;
    RaiseErrorOccurred(HtmlParseError.AmbiguousOpenTag);
    _textBuffer.Append('<');
    return Data(next);
}
/// <summary>
/// End tag open state, entered after "&lt;/": an ASCII letter starts an end
/// tag name (stored lower-cased), '>' is reported and skipped, the end of
/// the source keeps "&lt;/" as text, anything else becomes a bogus comment.
/// </summary>
/// <param name="c">The character following "&lt;/".</param>
private HtmlToken TagEnd(char c)
{
    bool isLower = c.IsLowercaseAscii();
    if (isLower || c.IsUppercaseAscii()) {
        HtmlTagToken tag = HtmlTagToken.Close();
        _stringBuffer.Clear().Append(isLower ? c : char.ToLower(c));
        return TagName(tag);
    }

    if (c == '>') {
        _state = HtmlParseMode.PCData;
        RaiseErrorOccurred(HtmlParseError.TagClosedWrong);
        return Data(GetNext());
    }

    if (c == '') {
        Back();
        RaiseErrorOccurred(HtmlParseError.EOF);
        _textBuffer.Append('<').Append('/');
        return HtmlToken.EndOfFile;
    }

    RaiseErrorOccurred(HtmlParseError.BogusComment);
    return BogusComment(c);
}
/// <summary>
/// Tag name state: appends characters (ASCII upper case is lower-cased) to
/// the string buffer until whitespace, '/', '>' or the end of the source.
/// </summary>
/// <param name="tag">The tag token whose name is being read.</param>
private HtmlToken TagName(HtmlTagToken tag)
{
    while (true) {
        char next = GetNext();

        if (next.IsSpaceCharacter()) {
            tag.Name = _stringBuffer.ToString();
            return AttributeBeforeName(tag);
        }

        if (next == '/') {
            tag.Name = _stringBuffer.ToString();
            return TagSelfClosing(tag);
        }

        if (next == '>') {
            tag.Name = _stringBuffer.ToString();
            return EmitTag(tag);
        }

        if (next == ' ') {
            RaiseErrorOccurred(HtmlParseError.Null);
            _stringBuffer.Append('�');
        } else if (next == '') {
            RaiseErrorOccurred(HtmlParseError.EOF);
            return HtmlToken.EndOfFile;
        } else {
            _stringBuffer.Append(next.IsUppercaseAscii() ? char.ToLower(next) : next);
        }
    }
}
/// <summary>
/// Self-closing start tag state, entered after '/': a following '>' marks
/// the tag as self-closing and emits it; any other character is reported,
/// pushed back and handled as the start of an attribute.
/// </summary>
/// <param name="tag">The tag token being built.</param>
private HtmlToken TagSelfClosing(HtmlTagToken tag)
{
    char next = GetNext();

    if (next == '>') {
        tag.IsSelfClosing = true;
        return EmitTag(tag);
    }

    if (next == '') {
        RaiseErrorOccurred(HtmlParseError.EOF);
        return HtmlToken.EndOfFile;
    }

    RaiseErrorOccurred(HtmlParseError.ClosingSlashMisplaced);
    Back();
    return AttributeBeforeName(tag);
}
/// <summary>
/// Markup declaration open state, entered after "&lt;!": recognises comments,
/// doctypes and (only when IsAcceptingCharacterData is set) CDATA sections.
/// Everything else becomes a bogus comment. Each Advance call skips the
/// rest of the keyword that ContinuesWith just matched.
/// </summary>
private HtmlToken MarkupDeclaration()
{
    char first = GetNext();
    HtmlToken token;

    if (ContinuesWith("--", true)) {
        Advance();
        token = CommentStart();
    } else if (ContinuesWith(Tags.Doctype, true)) {
        Advance(6);
        token = Doctype();
    } else if (_acceptsCharacterData && ContinuesWith("[CDATA[", false)) {
        Advance(6);
        token = CData();
    } else {
        RaiseErrorOccurred(HtmlParseError.UndefinedMarkupDeclaration);
        token = BogusComment(first);
    }

    return token;
}
/// <summary>
/// Bogus comment state: collects everything up to the next '>' or the end
/// of the source into the string buffer and emits it as a comment. One
/// character is stored as U+FFFD instead of itself.
/// </summary>
/// <param name="c">The first character of the comment.</param>
private HtmlToken BogusComment(char c)
{
    _stringBuffer.Clear();

    while (c != '>') {
        if (c == '') {
            Back();
            break;
        }

        _stringBuffer.Append(c == ' ' ? '�' : c);
        c = GetNext();
    }

    _state = HtmlParseMode.PCData;
    return EmitComment();
}
/// <summary>
/// Comment start state, entered after "&lt;!--": clears the string buffer,
/// then handles an immediate dash, an immediately closed comment and the
/// end of the source; any other character begins the comment data.
/// </summary>
private HtmlToken CommentStart()
{
    char next = GetNext();
    _stringBuffer.Clear();

    if (next == '-')
        return CommentDashStart();

    if (next == '>') {
        _state = HtmlParseMode.PCData;
        RaiseErrorOccurred(HtmlParseError.TagClosedWrong);
        return EmitComment();
    }

    if (next == '') {
        RaiseErrorOccurred(HtmlParseError.EOF);
        Back();
        return EmitComment();
    }

    if (next == ' ') {
        RaiseErrorOccurred(HtmlParseError.Null);
        _stringBuffer.Append('�');
    } else {
        _stringBuffer.Append(next);
    }

    return Comment();
}
/// <summary>
/// Comment start dash state, entered after "&lt;!---": a second dash moves to
/// the comment end state, '>' and the end of the source emit the comment,
/// anything else is appended after a dash and the comment continues.
/// </summary>
/// <returns>The emitted comment token; never null.</returns>
private HtmlToken CommentDashStart()
{
    char next = GetNext();
    switch (next) {
        case '-':
            // CommentEnd returns null when the comment data simply continues
            // (for example "<!----x"). Resume in Comment() so a null token
            // never travels up to Get().
            return CommentEnd() ?? Comment();
        case ' ':
            RaiseErrorOccurred(HtmlParseError.Null);
            _stringBuffer.Append('-').Append('�');
            return Comment();
        case '>':
            _state = HtmlParseMode.PCData;
            RaiseErrorOccurred(HtmlParseError.TagClosedWrong);
            break;
        case '':
            RaiseErrorOccurred(HtmlParseError.EOF);
            Back();
            break;
        default:
            _stringBuffer.Append('-').Append(next);
            return Comment();
    }
    return EmitComment();
}
/// <summary>
/// Comment state: appends characters to the comment data until a dash
/// sequence handled by CommentDashEnd yields a token, or the source ends.
/// </summary>
private HtmlToken Comment()
{
    while (true) {
        char next = GetNext();

        if (next == '-') {
            // A null result means the dash did not end the comment.
            HtmlToken finished = CommentDashEnd();
            if (finished != null)
                return finished;
            continue;
        }

        if (next == '') {
            RaiseErrorOccurred(HtmlParseError.EOF);
            Back();
            return EmitComment();
        }

        if (next == ' ') {
            RaiseErrorOccurred(HtmlParseError.Null);
            _stringBuffer.Append('�');
            continue;
        }

        _stringBuffer.Append(next);
    }
}
/// <summary>
/// Comment end dash state, entered after a single '-' inside a comment.
/// </summary>
/// <returns>
/// The token produced by CommentEnd or EmitComment, or null when the dash
/// was ordinary comment data (it is then appended together with the
/// following character).
/// </returns>
private HtmlToken CommentDashEnd()
{
    char c = GetNext();

    if (c == '-')
        return CommentEnd();

    if (c == '') {
        RaiseErrorOccurred(HtmlParseError.EOF);
        Back();
        return EmitComment();
    }

    if (c == ' ') {
        RaiseErrorOccurred(HtmlParseError.Null);
        c = '�';
    }

    _stringBuffer.Append('-').Append(c);
    return null;
}
/// <summary>
/// Comment end state, entered after "--" inside a comment.
/// </summary>
/// <returns>
/// The emitted comment token, the result of CommentBangEnd, or null when
/// the two dashes turned out to be ordinary comment data and the caller
/// should continue reading the comment.
/// </returns>
private HtmlToken CommentEnd()
{
    while (true) {
        char next = GetNext();
        switch (next) {
            case '>':
                _state = HtmlParseMode.PCData;
                return EmitComment();
            case ' ':
                RaiseErrorOccurred(HtmlParseError.Null);
                // Both pending dashes belong to the comment data, exactly as in
                // the default branch below; only the character is replaced.
                _stringBuffer.Append('-').Append('-').Append('�');
                return null;
            case '!':
                RaiseErrorOccurred(HtmlParseError.CommentEndedWithEM);
                return CommentBangEnd();
            case '-':
                // A third dash: one dash becomes data, two remain pending.
                RaiseErrorOccurred(HtmlParseError.CommentEndedWithDash);
                _stringBuffer.Append('-');
                break;
            case '':
                RaiseErrorOccurred(HtmlParseError.EOF);
                Back();
                return EmitComment();
            default:
                RaiseErrorOccurred(HtmlParseError.CommentEndedUnexpected);
                _stringBuffer.Append('-').Append('-').Append(next);
                return null;
        }
    }
}
/// <summary>
/// Comment end bang state, entered after "--!" inside a comment.
/// </summary>
/// <returns>
/// The emitted comment token, the result of CommentDashEnd, or null when
/// "--!" plus the following character were appended as comment data.
/// </returns>
private HtmlToken CommentBangEnd()
{
    char next = GetNext();

    if (next == '-') {
        _stringBuffer.Append('-').Append('-').Append('!');
        return CommentDashEnd();
    }

    if (next == '>') {
        _state = HtmlParseMode.PCData;
        return EmitComment();
    }

    if (next == '') {
        RaiseErrorOccurred(HtmlParseError.EOF);
        Back();
        return EmitComment();
    }

    if (next == ' ') {
        RaiseErrorOccurred(HtmlParseError.Null);
        next = '�';
    }

    _stringBuffer.Append('-').Append('-').Append('!').Append(next);
    return null;
}
/// <summary>
/// DOCTYPE state, entered after the doctype keyword: whitespace is expected
/// next. The end of the source emits a doctype created with
/// HtmlToken.Doctype(true); any other character is reported and treated as
/// the start of the name.
/// </summary>
private HtmlToken Doctype()
{
    char next = GetNext();

    if (next.IsSpaceCharacter())
        return DoctypeNameBefore(GetNext());

    if (next != '') {
        RaiseErrorOccurred(HtmlParseError.DoctypeUnexpected);
        return DoctypeNameBefore(next);
    }

    RaiseErrorOccurred(HtmlParseError.EOF);
    Back();
    return Emit(HtmlToken.Doctype(true));
}
/// <summary>
/// Before DOCTYPE name state: skips whitespace, then either starts the name
/// in the string buffer (ASCII upper case is lower-cased) or, on '>' / end
/// of the source, emits a doctype created with HtmlToken.Doctype(true).
/// </summary>
/// <param name="c">The first character to inspect.</param>
private HtmlToken DoctypeNameBefore(char c)
{
    while (c.IsSpaceCharacter()) {
        c = GetNext();
    }

    if (c == '>') {
        HtmlDoctypeToken closed = HtmlToken.Doctype(true);
        _state = HtmlParseMode.PCData;
        RaiseErrorOccurred(HtmlParseError.TagClosedWrong);
        return Emit(closed);
    }

    if (c == '') {
        HtmlDoctypeToken truncated = HtmlToken.Doctype(true);
        RaiseErrorOccurred(HtmlParseError.EOF);
        Back();
        return Emit(truncated);
    }

    HtmlDoctypeToken doctype = HtmlToken.Doctype(false);

    if (c.IsUppercaseAscii()) {
        _stringBuffer.Clear().Append(char.ToLower(c));
    } else if (c == ' ') {
        RaiseErrorOccurred(HtmlParseError.Null);
        _stringBuffer.Clear().Append('�');
    } else {
        _stringBuffer.Clear().Append(c);
    }

    return DoctypeName(doctype);
}
/// <summary>
/// DOCTYPE name state: accumulates the name (ASCII upper case is
/// lower-cased) until whitespace, '>' or the end of the source.
/// </summary>
/// <param name="doctype">The doctype token being built.</param>
private HtmlToken DoctypeName(HtmlDoctypeToken doctype)
{
    while (true) {
        char next = GetNext();

        if (next.IsSpaceCharacter()) {
            doctype.Name = _stringBuffer.ToString();
            _stringBuffer.Clear();
            return DoctypeNameAfter(doctype);
        }

        if (next == '>') {
            _state = HtmlParseMode.PCData;
            doctype.Name = _stringBuffer.ToString();
            return Emit(doctype);
        }

        if (next.IsUppercaseAscii()) {
            _stringBuffer.Append(char.ToLower(next));
        } else if (next == ' ') {
            RaiseErrorOccurred(HtmlParseError.Null);
            _stringBuffer.Append('�');
        } else if (next == '') {
            RaiseErrorOccurred(HtmlParseError.EOF);
            Back();
            doctype.IsQuirksForced = true;
            doctype.Name = _stringBuffer.ToString();
            return Emit(doctype);
        } else {
            _stringBuffer.Append(next);
        }
    }
}
/// <summary>
/// After DOCTYPE name state: skips whitespace, then expects '>', the end of
/// the source, or one of the keywords "public" / "system". Anything else
/// forces quirks and continues as a bogus doctype.
/// </summary>
/// <param name="doctype">The doctype token being built.</param>
private HtmlToken DoctypeNameAfter(HtmlDoctypeToken doctype)
{
    char c = SkipSpaces();

    if (c == '>') {
        _state = HtmlParseMode.PCData;
        return Emit(doctype);
    }

    if (c == '') {
        RaiseErrorOccurred(HtmlParseError.EOF);
        Back();
        doctype.IsQuirksForced = true;
        return Emit(doctype);
    }

    if (ContinuesWith("public", true)) {
        // Skip the rest of the matched keyword.
        Advance(5);
        return DoctypePublic(doctype);
    }

    if (ContinuesWith("system", true)) {
        Advance(5);
        return DoctypeSystem(doctype);
    }

    RaiseErrorOccurred(HtmlParseError.DoctypeUnexpectedAfterName);
    doctype.IsQuirksForced = true;
    return BogusDoctype(doctype);
}
/// <summary>
/// After DOCTYPE public keyword state: whitespace is expected; a quote
/// directly after the keyword is reported but still starts the public
/// identifier. Every other outcome forces quirks.
/// </summary>
/// <param name="doctype">The doctype token being built.</param>
private HtmlToken DoctypePublic(HtmlDoctypeToken doctype)
{
    char next = GetNext();

    if (next.IsSpaceCharacter())
        return DoctypePublicIdentifierBefore(doctype);

    if (next == '"') {
        RaiseErrorOccurred(HtmlParseError.DoubleQuotationMarkUnexpected);
        doctype.PublicIdentifier = string.Empty;
        return DoctypePublicIdentifierDoubleQuoted(doctype);
    }

    if (next == '\'') {
        RaiseErrorOccurred(HtmlParseError.SingleQuotationMarkUnexpected);
        doctype.PublicIdentifier = string.Empty;
        return DoctypePublicIdentifierSingleQuoted(doctype);
    }

    if (next == '>') {
        _state = HtmlParseMode.PCData;
        RaiseErrorOccurred(HtmlParseError.TagClosedWrong);
        doctype.IsQuirksForced = true;
        return Emit(doctype);
    }

    if (next == '') {
        RaiseErrorOccurred(HtmlParseError.EOF);
        doctype.IsQuirksForced = true;
        Back();
        return Emit(doctype);
    }

    RaiseErrorOccurred(HtmlParseError.DoctypePublicInvalid);
    doctype.IsQuirksForced = true;
    return BogusDoctype(doctype);
}
/// <summary>
/// Before DOCTYPE public identifier state: skips whitespace and expects an
/// opening quote. Every other outcome forces quirks.
/// </summary>
/// <param name="doctype">The doctype token being built.</param>
private HtmlToken DoctypePublicIdentifierBefore(HtmlDoctypeToken doctype)
{
    char c = SkipSpaces();

    if (c == '"') {
        _stringBuffer.Clear();
        doctype.PublicIdentifier = string.Empty;
        return DoctypePublicIdentifierDoubleQuoted(doctype);
    }

    if (c == '\'') {
        _stringBuffer.Clear();
        doctype.PublicIdentifier = string.Empty;
        return DoctypePublicIdentifierSingleQuoted(doctype);
    }

    if (c == '>') {
        _state = HtmlParseMode.PCData;
        RaiseErrorOccurred(HtmlParseError.TagClosedWrong);
        doctype.IsQuirksForced = true;
        return Emit(doctype);
    }

    if (c == '') {
        RaiseErrorOccurred(HtmlParseError.EOF);
        doctype.IsQuirksForced = true;
        Back();
        return Emit(doctype);
    }

    RaiseErrorOccurred(HtmlParseError.DoctypePublicInvalid);
    doctype.IsQuirksForced = true;
    return BogusDoctype(doctype);
}
/// <summary>
/// DOCTYPE public identifier (double-quoted) state: collects the identifier
/// up to the closing '"'. A '>' or the end of the source stores what was
/// collected, forces quirks and emits the doctype.
/// </summary>
/// <param name="doctype">The doctype token being built.</param>
private HtmlToken DoctypePublicIdentifierDoubleQuoted(HtmlDoctypeToken doctype)
{
    while (true) {
        char next = GetNext();

        if (next == '"') {
            doctype.PublicIdentifier = _stringBuffer.ToString();
            _stringBuffer.Clear();
            return DoctypePublicIdentifierAfter(doctype);
        }

        if (next == '>') {
            _state = HtmlParseMode.PCData;
            RaiseErrorOccurred(HtmlParseError.TagClosedWrong);
            doctype.IsQuirksForced = true;
            doctype.PublicIdentifier = _stringBuffer.ToString();
            return Emit(doctype);
        }

        if (next == '') {
            RaiseErrorOccurred(HtmlParseError.EOF);
            Back();
            doctype.IsQuirksForced = true;
            doctype.PublicIdentifier = _stringBuffer.ToString();
            return Emit(doctype);
        }

        if (next == ' ') {
            RaiseErrorOccurred(HtmlParseError.Null);
            _stringBuffer.Append('�');
        } else {
            _stringBuffer.Append(next);
        }
    }
}
/// <summary>
/// DOCTYPE public identifier (single-quoted) state: collects the identifier
/// up to the closing '\''. A '>' or the end of the source stores what was
/// collected, forces quirks and emits the doctype.
/// </summary>
/// <param name="doctype">The doctype token being built.</param>
private HtmlToken DoctypePublicIdentifierSingleQuoted(HtmlDoctypeToken doctype)
{
    while (true) {
        char next = GetNext();

        if (next == '\'') {
            doctype.PublicIdentifier = _stringBuffer.ToString();
            _stringBuffer.Clear();
            return DoctypePublicIdentifierAfter(doctype);
        }

        if (next == '>') {
            _state = HtmlParseMode.PCData;
            RaiseErrorOccurred(HtmlParseError.TagClosedWrong);
            doctype.IsQuirksForced = true;
            doctype.PublicIdentifier = _stringBuffer.ToString();
            return Emit(doctype);
        }

        if (next == '') {
            RaiseErrorOccurred(HtmlParseError.EOF);
            doctype.IsQuirksForced = true;
            doctype.PublicIdentifier = _stringBuffer.ToString();
            Back();
            return Emit(doctype);
        }

        if (next == ' ') {
            RaiseErrorOccurred(HtmlParseError.Null);
            _stringBuffer.Append('�');
        } else {
            _stringBuffer.Append(next);
        }
    }
}
/// <summary>
/// After DOCTYPE public identifier state: whitespace leads to the state
/// between the identifiers, '>' emits the doctype, and a quote directly
/// after the public identifier is reported but still starts the system
/// identifier.
/// </summary>
/// <param name="doctype">The doctype token being built.</param>
private HtmlToken DoctypePublicIdentifierAfter(HtmlDoctypeToken doctype)
{
    char next = GetNext();

    if (next.IsSpaceCharacter()) {
        _stringBuffer.Clear();
        return DoctypeBetween(doctype);
    }

    if (next == '>') {
        _state = HtmlParseMode.PCData;
        return Emit(doctype);
    }

    if (next == '"') {
        RaiseErrorOccurred(HtmlParseError.DoubleQuotationMarkUnexpected);
        doctype.SystemIdentifier = string.Empty;
        return DoctypeSystemIdentifierDoubleQuoted(doctype);
    }

    if (next == '\'') {
        RaiseErrorOccurred(HtmlParseError.SingleQuotationMarkUnexpected);
        doctype.SystemIdentifier = string.Empty;
        return DoctypeSystemIdentifierSingleQuoted(doctype);
    }

    if (next == '') {
        RaiseErrorOccurred(HtmlParseError.EOF);
        doctype.IsQuirksForced = true;
        Back();
        return Emit(doctype);
    }

    RaiseErrorOccurred(HtmlParseError.DoctypeInvalidCharacter);
    doctype.IsQuirksForced = true;
    return BogusDoctype(doctype);
}
/// <summary>
/// Between DOCTYPE public and system identifiers state: skips whitespace,
/// then expects '>' or the opening quote of the system identifier.
/// </summary>
/// <param name="doctype">The doctype token being built.</param>
private HtmlToken DoctypeBetween(HtmlDoctypeToken doctype)
{
    char c = SkipSpaces();

    if (c == '>') {
        _state = HtmlParseMode.PCData;
        return Emit(doctype);
    }

    if (c == '"') {
        doctype.SystemIdentifier = string.Empty;
        return DoctypeSystemIdentifierDoubleQuoted(doctype);
    }

    if (c == '\'') {
        doctype.SystemIdentifier = string.Empty;
        return DoctypeSystemIdentifierSingleQuoted(doctype);
    }

    if (c == '') {
        RaiseErrorOccurred(HtmlParseError.EOF);
        doctype.IsQuirksForced = true;
        Back();
        return Emit(doctype);
    }

    RaiseErrorOccurred(HtmlParseError.DoctypeInvalidCharacter);
    doctype.IsQuirksForced = true;
    return BogusDoctype(doctype);
}
/// <summary>
/// After DOCTYPE system keyword state: whitespace is expected; a quote
/// directly after the keyword is reported but still starts the system
/// identifier. Every other outcome forces quirks.
/// </summary>
/// <param name="doctype">The doctype token being built.</param>
private HtmlToken DoctypeSystem(HtmlDoctypeToken doctype)
{
    char next = GetNext();
    if (next.IsSpaceCharacter()) {
        // NOTE(review): no sibling state resets the mode on whitespace; the
        // assignment is kept so existing behaviour on this path is unchanged.
        _state = HtmlParseMode.PCData;
        return DoctypeSystemIdentifierBefore(doctype);
    }
    switch (next) {
        case '"':
            RaiseErrorOccurred(HtmlParseError.DoubleQuotationMarkUnexpected);
            doctype.SystemIdentifier = string.Empty;
            return DoctypeSystemIdentifierDoubleQuoted(doctype);
        case '\'':
            RaiseErrorOccurred(HtmlParseError.SingleQuotationMarkUnexpected);
            doctype.SystemIdentifier = string.Empty;
            return DoctypeSystemIdentifierSingleQuoted(doctype);
        case '>':
            // The doctype ends here, so return to PCData like every other
            // doctype state does when it sees '>'.
            _state = HtmlParseMode.PCData;
            RaiseErrorOccurred(HtmlParseError.TagClosedWrong);
            doctype.SystemIdentifier = _stringBuffer.ToString();
            doctype.IsQuirksForced = true;
            break;
        case '':
            RaiseErrorOccurred(HtmlParseError.EOF);
            doctype.IsQuirksForced = true;
            Back();
            break;
        default:
            RaiseErrorOccurred(HtmlParseError.DoctypeSystemInvalid);
            doctype.IsQuirksForced = true;
            return BogusDoctype(doctype);
    }
    return Emit(doctype);
}
/// <summary>
/// Before DOCTYPE system identifier state: skips whitespace and expects an
/// opening quote. Every other outcome forces quirks; '>' and the end of the
/// source also copy the current string buffer into the system identifier.
/// </summary>
/// <param name="doctype">The doctype token being built.</param>
private HtmlToken DoctypeSystemIdentifierBefore(HtmlDoctypeToken doctype)
{
    char c = SkipSpaces();

    if (c == '"') {
        doctype.SystemIdentifier = string.Empty;
        return DoctypeSystemIdentifierDoubleQuoted(doctype);
    }

    if (c == '\'') {
        doctype.SystemIdentifier = string.Empty;
        return DoctypeSystemIdentifierSingleQuoted(doctype);
    }

    if (c == '>') {
        _state = HtmlParseMode.PCData;
        RaiseErrorOccurred(HtmlParseError.TagClosedWrong);
        doctype.IsQuirksForced = true;
        doctype.SystemIdentifier = _stringBuffer.ToString();
        return Emit(doctype);
    }

    if (c == '') {
        RaiseErrorOccurred(HtmlParseError.EOF);
        doctype.IsQuirksForced = true;
        doctype.SystemIdentifier = _stringBuffer.ToString();
        Back();
        return Emit(doctype);
    }

    RaiseErrorOccurred(HtmlParseError.DoctypeInvalidCharacter);
    doctype.IsQuirksForced = true;
    return BogusDoctype(doctype);
}
/// <summary>
/// DOCTYPE system identifier (double-quoted) state: collects the identifier
/// up to the closing '"'. A '>' or the end of the source stores what was
/// collected, forces quirks and emits the doctype.
/// </summary>
/// <param name="doctype">The doctype token being built.</param>
private HtmlToken DoctypeSystemIdentifierDoubleQuoted(HtmlDoctypeToken doctype)
{
    while (true) {
        char next = GetNext();

        if (next == '"') {
            doctype.SystemIdentifier = _stringBuffer.ToString();
            _stringBuffer.Clear();
            return DoctypeSystemIdentifierAfter(doctype);
        }

        if (next == '>') {
            _state = HtmlParseMode.PCData;
            RaiseErrorOccurred(HtmlParseError.TagClosedWrong);
            doctype.IsQuirksForced = true;
            doctype.SystemIdentifier = _stringBuffer.ToString();
            return Emit(doctype);
        }

        if (next == '') {
            RaiseErrorOccurred(HtmlParseError.EOF);
            doctype.IsQuirksForced = true;
            doctype.SystemIdentifier = _stringBuffer.ToString();
            Back();
            return Emit(doctype);
        }

        if (next == ' ') {
            RaiseErrorOccurred(HtmlParseError.Null);
            _stringBuffer.Append('�');
        } else {
            _stringBuffer.Append(next);
        }
    }
}
/// <summary>
/// DOCTYPE system identifier (single-quoted) state: collects the identifier
/// up to the closing '\''. A '>' or the end of the source stores what was
/// collected, forces quirks and emits the doctype.
/// </summary>
/// <param name="doctype">The doctype token being built.</param>
private HtmlToken DoctypeSystemIdentifierSingleQuoted(HtmlDoctypeToken doctype)
{
    while (true) {
        char next = GetNext();

        if (next == '\'') {
            doctype.SystemIdentifier = _stringBuffer.ToString();
            _stringBuffer.Clear();
            return DoctypeSystemIdentifierAfter(doctype);
        }

        if (next == '>') {
            _state = HtmlParseMode.PCData;
            RaiseErrorOccurred(HtmlParseError.TagClosedWrong);
            doctype.IsQuirksForced = true;
            doctype.SystemIdentifier = _stringBuffer.ToString();
            return Emit(doctype);
        }

        if (next == '') {
            RaiseErrorOccurred(HtmlParseError.EOF);
            doctype.IsQuirksForced = true;
            doctype.SystemIdentifier = _stringBuffer.ToString();
            Back();
            return Emit(doctype);
        }

        if (next == ' ') {
            RaiseErrorOccurred(HtmlParseError.Null);
            _stringBuffer.Append('�');
        } else {
            _stringBuffer.Append(next);
        }
    }
}
/// <summary>
/// After DOCTYPE system identifier state: skips whitespace and expects '>'.
/// An unexpected character is reported and continues as a bogus doctype;
/// this branch does not set the quirks flag.
/// </summary>
/// <param name="doctype">The doctype token being built.</param>
private HtmlToken DoctypeSystemIdentifierAfter(HtmlDoctypeToken doctype)
{
    char c = SkipSpaces();

    if (c == '>') {
        _state = HtmlParseMode.PCData;
    } else if (c == '') {
        RaiseErrorOccurred(HtmlParseError.EOF);
        doctype.IsQuirksForced = true;
        Back();
    } else {
        RaiseErrorOccurred(HtmlParseError.DoctypeInvalidCharacter);
        return BogusDoctype(doctype);
    }

    return Emit(doctype);
}
/// <summary>
/// Bogus DOCTYPE state: discards characters up to the next '>' or the end
/// of the source and then emits the doctype as it stands.
/// </summary>
/// <param name="doctype">The doctype token to emit.</param>
private HtmlToken BogusDoctype(HtmlDoctypeToken doctype)
{
    while (true) {
        char next = GetNext();

        if (next == '>') {
            _state = HtmlParseMode.PCData;
            break;
        }

        if (next == '') {
            Back();
            break;
        }
    }

    return Emit(doctype);
}
/// <summary>
/// Before attribute name state: skips whitespace, then either finishes the
/// tag ('/', '>'), stops at the end of the source, or starts a new
/// attribute name in the string buffer (ASCII upper case is lower-cased).
/// </summary>
/// <param name="tag">The tag token being built.</param>
private HtmlToken AttributeBeforeName(HtmlTagToken tag)
{
    char c = SkipSpaces();

    if (c == '/')
        return TagSelfClosing(tag);

    if (c == '>')
        return EmitTag(tag);

    if (c == '')
        return HtmlToken.EndOfFile;

    if (c.IsUppercaseAscii()) {
        _stringBuffer.Clear().Append(char.ToLower(c));
    } else if (c == ' ') {
        RaiseErrorOccurred(HtmlParseError.Null);
        _stringBuffer.Clear().Append('�');
    } else {
        // These characters are accepted into the name but reported.
        if (c == '"' || c == '\'' || c == '<' || c == '=')
            RaiseErrorOccurred(HtmlParseError.AttributeNameInvalid);
        _stringBuffer.Clear().Append(c);
    }

    return AttributeName(tag);
}
/// <summary>
/// Attribute name state: accumulates the name (ASCII upper case is
/// lower-cased) and registers it on the tag once whitespace, '/', '=' or
/// '>' is reached. At the end of the source the pending name is dropped.
/// </summary>
/// <param name="tag">The tag token being built.</param>
private HtmlToken AttributeName(HtmlTagToken tag)
{
    while (true) {
        char next = GetNext();

        if (next.IsSpaceCharacter()) {
            tag.AddAttribute(_stringBuffer.ToString());
            return AttributeAfterName(tag);
        }

        if (next == '/') {
            tag.AddAttribute(_stringBuffer.ToString());
            return TagSelfClosing(tag);
        }

        if (next == '=') {
            tag.AddAttribute(_stringBuffer.ToString());
            return AttributeBeforeValue(tag);
        }

        if (next == '>') {
            tag.AddAttribute(_stringBuffer.ToString());
            return EmitTag(tag);
        }

        if (next == '')
            return HtmlToken.EndOfFile;

        if (next == ' ') {
            RaiseErrorOccurred(HtmlParseError.Null);
            _stringBuffer.Append('�');
            continue;
        }

        if (next.IsUppercaseAscii()) {
            _stringBuffer.Append(char.ToLower(next));
            continue;
        }

        // These characters are accepted into the name but reported.
        if (next == '"' || next == '\'' || next == '<')
            RaiseErrorOccurred(HtmlParseError.AttributeNameInvalid);

        _stringBuffer.Append(next);
    }
}
/// <summary>
/// After attribute name state: skips whitespace, then either finishes the
/// tag ('/', '>'), moves on to the value ('='), stops at the end of the
/// source, or starts the next attribute name in the string buffer.
/// </summary>
/// <param name="tag">The tag token being built.</param>
private HtmlToken AttributeAfterName(HtmlTagToken tag)
{
    char c = SkipSpaces();

    if (c == '/')
        return TagSelfClosing(tag);

    if (c == '=')
        return AttributeBeforeValue(tag);

    if (c == '>')
        return EmitTag(tag);

    if (c == '')
        return HtmlToken.EndOfFile;

    if (c.IsUppercaseAscii()) {
        _stringBuffer.Clear().Append(char.ToLower(c));
    } else if (c == ' ') {
        RaiseErrorOccurred(HtmlParseError.Null);
        _stringBuffer.Clear().Append('�');
    } else {
        // These characters are accepted into the name but reported.
        if (c == '"' || c == '\'' || c == '<')
            RaiseErrorOccurred(HtmlParseError.AttributeNameInvalid);
        _stringBuffer.Clear().Append(c);
    }

    return AttributeName(tag);
}
/// <summary>
/// Before attribute value state, entered after '=': skips whitespace and
/// dispatches to the quoted or unquoted value states. The string buffer is
/// reset on every path that starts a value, because it still holds the
/// attribute name at this point.
/// </summary>
/// <param name="tag">The tag token being built.</param>
private HtmlToken AttributeBeforeValue(HtmlTagToken tag)
{
    char c = SkipSpaces();
    switch (c) {
        case '"':
            _stringBuffer.Clear();
            return AttributeDoubleQuotedValue(tag);
        case '&':
            // Hand the '&' itself to the unquoted state so it can resolve the reference.
            _stringBuffer.Clear();
            return AttributeUnquotedValue(c, tag);
        case '\'':
            _stringBuffer.Clear();
            return AttributeSingleQuotedValue(tag);
        case ' ':
            RaiseErrorOccurred(HtmlParseError.Null);
            // Clear first: without it the attribute name left in the buffer
            // would become a prefix of the attribute value.
            _stringBuffer.Clear().Append('�');
            return AttributeUnquotedValue(GetNext(), tag);
        case '>':
            RaiseErrorOccurred(HtmlParseError.TagClosedWrong);
            return EmitTag(tag);
        case '<':
        case '=':
        case '`':
            RaiseErrorOccurred(HtmlParseError.AttributeValueInvalid);
            _stringBuffer.Clear().Append(c);
            return AttributeUnquotedValue(GetNext(), tag);
        case '':
            return HtmlToken.EndOfFile;
        default:
            _stringBuffer.Clear().Append(c);
            return AttributeUnquotedValue(GetNext(), tag);
    }
}
/// <summary>
/// Attribute value (double-quoted) state: collects the value up to the
/// closing '"', resolving character references along the way. An
/// unresolved reference keeps its ampersand literally.
/// </summary>
/// <param name="tag">The tag token being built.</param>
private HtmlToken AttributeDoubleQuotedValue(HtmlTagToken tag)
{
    while (true) {
        char next = GetNext();

        if (next == '"') {
            tag.SetAttributeValue(_stringBuffer.ToString());
            return AttributeAfterValue(tag);
        }

        if (next == '')
            return HtmlToken.EndOfFile;

        if (next == '&') {
            string reference = CharacterReference(GetNext(), '"');
            if (reference != null)
                _stringBuffer.Append(reference);
            else
                _stringBuffer.Append('&');
        } else if (next == ' ') {
            RaiseErrorOccurred(HtmlParseError.Null);
            _stringBuffer.Append('�');
        } else {
            _stringBuffer.Append(next);
        }
    }
}
/// <summary>
/// Attribute value (single-quoted) state: collects the value up to the
/// closing '\'', resolving character references along the way. An
/// unresolved reference keeps its ampersand literally.
/// </summary>
/// <param name="tag">The tag token being built.</param>
private HtmlToken AttributeSingleQuotedValue(HtmlTagToken tag)
{
    while (true) {
        char next = GetNext();

        if (next == '\'') {
            tag.SetAttributeValue(_stringBuffer.ToString());
            return AttributeAfterValue(tag);
        }

        if (next == '')
            return HtmlToken.EndOfFile;

        if (next == '&') {
            string reference = CharacterReference(GetNext(), '\'');
            if (reference != null)
                _stringBuffer.Append(reference);
            else
                _stringBuffer.Append('&');
        } else if (next == ' ') {
            RaiseErrorOccurred(HtmlParseError.Null);
            _stringBuffer.Append('�');
        } else {
            _stringBuffer.Append(next);
        }
    }
}
/// <summary>
/// Attribute value (unquoted) state: collects the value until whitespace
/// or '>', resolving character references along the way. Quotes, '&lt;',
/// '=' and '`' are accepted into the value but reported.
/// </summary>
/// <param name="c">The first character of the value.</param>
/// <param name="tag">The tag token being built.</param>
private HtmlToken AttributeUnquotedValue(char c, HtmlTagToken tag)
{
    for (; !c.IsSpaceCharacter(); c = GetNext()) {
        if (c == '>') {
            tag.SetAttributeValue(_stringBuffer.ToString());
            return EmitTag(tag);
        }

        if (c == '')
            return HtmlToken.EndOfFile;

        if (c == '&') {
            string reference = CharacterReference(GetNext(), '>');
            if (reference != null)
                _stringBuffer.Append(reference);
            else
                _stringBuffer.Append('&');
            continue;
        }

        if (c == ' ') {
            RaiseErrorOccurred(HtmlParseError.Null);
            _stringBuffer.Append('�');
            continue;
        }

        if (c == '"' || c == '\'' || c == '<' || c == '=' || c == '`')
            RaiseErrorOccurred(HtmlParseError.AttributeValueInvalid);

        _stringBuffer.Append(c);
    }

    tag.SetAttributeValue(_stringBuffer.ToString());
    return AttributeBeforeName(tag);
}
/// <summary>
/// After attribute value (quoted) state: whitespace, '/' or '>' is
/// expected. Any other character is reported, pushed back and handled as
/// the start of the next attribute.
/// </summary>
/// <param name="tag">The tag token being built.</param>
private HtmlToken AttributeAfterValue(HtmlTagToken tag)
{
    char next = GetNext();

    if (next.IsSpaceCharacter())
        return AttributeBeforeName(tag);

    if (next == '/')
        return TagSelfClosing(tag);

    if (next == '>')
        return EmitTag(tag);

    if (next == '')
        return HtmlToken.EndOfFile;

    RaiseErrorOccurred(HtmlParseError.AttributeNameExpected);
    Back();
    return AttributeBeforeName(tag);
}
// Script data state: copies characters into the text buffer and looks ahead after '<'
// for either an end tag ("</" + letter) or the "<!--" sequence that enters the escaped states.
// c is the first character to process.
private HtmlToken ScriptData(char c)
{
while (true) {
switch (c) {
case '<':
c = GetNext();
if (c == '/') {
c = GetNext();
if (c.IsLetter()) {
// "</" followed by a letter: possible end tag, continue in the name state.
HtmlTagToken tag = HtmlTagToken.Close();
_stringBuffer.Clear().Append(c);
return ScriptDataNameEndTag(tag);
}
// Not an end tag: keep "</" as text; c is processed again by the loop.
_textBuffer.Append('<').Append('/');
} else {
_textBuffer.Append('<');
// Step through "!--" one character at a time, appending each matched character.
if (c == '!') {
c = GetNext();
_textBuffer.Append('!');
if (c == '-') {
c = GetNext();
_textBuffer.Append('-');
if (c == '-') {
_textBuffer.Append('-');
return ScriptDataEscapedDashDash();
}
}
}
}
// No GetNext() here: c already holds the next unprocessed character.
break;
case ' ':
// Reported as HtmlParseError.Null and stored as U+FFFD.
RaiseErrorOccurred(HtmlParseError.Null);
_textBuffer.Append('�');
goto IL_00fb;
case '':
return HtmlToken.EndOfFile;
default:
{
_textBuffer.Append(c);
goto IL_00fb;
}
// Shared tail of the branches that consumed c: advance to the next character.
IL_00fb:
c = GetNext();
break;
}
}
}
/// <summary>
/// Script data end tag name state: accumulates letters and completes the
/// end tag only when the lower-cased name collected so far equals
/// _lastStartTag and is followed by whitespace, '/' or '>'. Any other
/// non-letter turns the collected input back into text.
/// </summary>
/// <param name="tag">The end tag token being built.</param>
private HtmlToken ScriptDataNameEndTag(HtmlTagToken tag)
{
    while (true) {
        char next = GetNext();
        string name = _stringBuffer.ToString().ToLowerInvariant();

        if (name == _lastStartTag) {
            if (next.IsSpaceCharacter()) {
                tag.Name = name;
                return AttributeBeforeName(tag);
            }
            if (next == '/') {
                tag.Name = name;
                return TagSelfClosing(tag);
            }
            if (next == '>') {
                tag.Name = name;
                return EmitTag(tag);
            }
        }

        if (!next.IsLetter()) {
            _textBuffer.Append('<').Append('/').Append(_stringBuffer.ToString());
            return ScriptData(next);
        }

        _stringBuffer.Append(next);
    }
}
// Script data escaped state (inside "<!--" within script content).
// c is the first character to process.
private HtmlToken ScriptDataEscaped(char c)
{
while (true) {
switch (c) {
case '-':
// A dash: append it and inspect the following character right away.
_textBuffer.Append('-');
c = GetNext();
switch (c) {
case '-':
_textBuffer.Append('-');
return ScriptDataEscapedDashDash();
case '<':
return ScriptDataEscapedLT();
case ' ':
RaiseErrorOccurred(HtmlParseError.Null);
_textBuffer.Append('�');
break;
case '':
return HtmlToken.EndOfFile;
default:
_textBuffer.Append(c);
break;
}
break;
case '<':
return ScriptDataEscapedLT();
case ' ':
// Reported as HtmlParseError.Null and stored as U+FFFD.
RaiseErrorOccurred(HtmlParseError.Null);
_textBuffer.Append('�');
break;
case '':
return HtmlToken.EndOfFile;
default:
// NOTE(review): any other character hands control back to the non-escaped ScriptData state,
// which leaves the escaped state; the HTML tokenization spec appends the character and stays
// in the escaped state - confirm this is intended.
return ScriptData(c);
}
c = GetNext();
}
}
/// <summary>
/// Script data escaped dash dash state, entered after "--" in escaped
/// script content: further dashes are appended, '>' returns to ScriptData,
/// '&lt;' goes to ScriptDataEscapedLT and anything else goes back to
/// ScriptDataEscaped.
/// </summary>
private HtmlToken ScriptDataEscapedDashDash()
{
    while (true) {
        char next = GetNext();

        if (next == '-') {
            _textBuffer.Append('-');
            continue;
        }

        if (next == '<')
            return ScriptDataEscapedLT();

        if (next == '>') {
            _textBuffer.Append('>');
            return ScriptData(GetNext());
        }

        if (next == '')
            return HtmlToken.EndOfFile;

        if (next == ' ') {
            RaiseErrorOccurred(HtmlParseError.Null);
            _textBuffer.Append('�');
        } else {
            _textBuffer.Append(next);
        }

        return ScriptDataEscaped(GetNext());
    }
}
/// <summary>
/// Handles the character following a less-than sign in escaped script
/// data: '/' starts a possible end tag, a letter starts a possible double
/// escape, anything else makes the less-than sign plain text.
/// </summary>
/// <returns>The token produced by the follow-up state.</returns>
private HtmlToken ScriptDataEscapedLT()
{
    char current = GetNext();
    if (current == '/')
        return ScriptDataEscapedEndTag();
    if (!current.IsLetter()) {
        // Reconsume the character in the escaped state.
        _textBuffer.Append('<');
        return ScriptDataEscaped(current);
    }
    // The name buffer is restarted with the first letter, and both
    // characters are also kept as text.
    _stringBuffer.Clear().Append(current);
    _textBuffer.Append('<').Append(current);
    return ScriptDataStartDoubleEscape();
}
/// <summary>
/// Handles the character following "&lt;/" in escaped script data. A
/// letter opens an end tag name; anything else turns "&lt;/" into text.
/// </summary>
/// <returns>The token produced by the follow-up state.</returns>
private HtmlToken ScriptDataEscapedEndTag()
{
    char current = GetNext();
    if (!current.IsLetter()) {
        _textBuffer.Append('<').Append('/');
        return ScriptDataEscaped(current);
    }
    HtmlTagToken closing = HtmlTagToken.Close();
    _stringBuffer.Clear().Append(current);
    return ScriptDataEscapedNameTag(closing);
}
/// <summary>
/// Reads the name of a possible end tag inside escaped script data. The
/// tag is only accepted when its lower-cased name equals the last emitted
/// start tag name and it is followed by a space character, '/' or '&gt;';
/// otherwise everything read so far is flushed as text and tokenization
/// continues in the script data escaped state.
/// </summary>
/// <param name="tag">The closing tag token that receives the name.</param>
/// <returns>The token produced by the follow-up state.</returns>
private HtmlToken ScriptDataEscapedNameTag(HtmlTagToken tag)
{
    char next;
    while (true) {
        next = GetNext();
        // Only these characters can complete the end tag, so the name is
        // materialized just for them instead of once per consumed letter
        // (which made long names quadratic in their length).
        bool endsName = next == '/' || next == '>' || next.IsSpaceCharacter();
        if (endsName) {
            string name = _stringBuffer.ToString().ToLowerInvariant();
            if (name == _lastStartTag) {
                tag.Name = name;
                if (next.IsSpaceCharacter())
                    return AttributeBeforeName(tag);
                if (next == '/')
                    return TagSelfClosing(tag);
                return EmitTag(tag);
            }
        }
        if (!next.IsLetter())
            break;
        _stringBuffer.Append(next);
    }
    // Not an appropriate end tag: emit "</" and the collected name as text.
    _textBuffer.Append('<').Append('/').Append(_stringBuffer.ToString());
    return ScriptDataEscaped(next);
}
/// <summary>
/// Collects letters after "&lt;" in escaped script data. When the run is
/// terminated by '/', '&gt;' or a space character, the collected name
/// decides the next state: "script" (ignoring case) enters the double
/// escaped state, any other name stays in the escaped state.
/// </summary>
/// <returns>The token produced by the follow-up state.</returns>
private HtmlToken ScriptDataStartDoubleEscape()
{
    for (char current = GetNext(); ; current = GetNext()) {
        bool terminates = current == '/' || current == '>' || current.IsSpaceCharacter();
        if (terminates) {
            _textBuffer.Append(current);
            bool isScript = _stringBuffer.ToString().Equals(Tags.Script, StringComparison.OrdinalIgnoreCase);
            char following = GetNext();
            return isScript ? ScriptDataEscapedDouble(following) : ScriptDataEscaped(following);
        }
        // A non-letter that is not a terminator is reconsumed in the
        // escaped state without being appended here.
        if (!current.IsLetter())
            return ScriptDataEscaped(current);
        _stringBuffer.Append(current);
        _textBuffer.Append(current);
    }
}
/// <summary>
/// Script data double escaped state (including the single-dash sub-state)
/// of the tokenization algorithm. Every character that does not leave the
/// state is appended to the text buffer by the shared append at the end of
/// the loop body.
/// </summary>
/// <param name="c">The current input character.</param>
/// <returns>The token produced by the state that finally emits.</returns>
private HtmlToken ScriptDataEscapedDouble(char c)
{
    while (true) {
        switch (c) {
        case '-':
            _textBuffer.Append('-');
            c = GetNext();
            switch (c) {
            case '-':
                _textBuffer.Append('-');
                return ScriptDataEscapedDoubleDashDash();
            case '<':
                _textBuffer.Append('<');
                return ScriptDataEscapedDoubleLT();
            case '\0':
                // U+0000 NULL (the label used to match a space): swap in
                // U+FFFD and let the shared append below emit it.
                RaiseErrorOccurred(HtmlParseError.Null);
                c = '\ufffd';
                break;
            // NOTE(review): this label should hold the end-of-input
            // sentinel produced by GetNext(); the literal appears empty
            // here - verify that the file encoding preserved it.
            case '':
                RaiseErrorOccurred(HtmlParseError.EOF);
                return HtmlToken.EndOfFile;
            }
            break;
        case '<':
            _textBuffer.Append('<');
            return ScriptDataEscapedDoubleLT();
        case '\0':
            // Previously U+FFFD was appended here AND the original
            // character was appended again by the shared append below, so
            // it was duplicated instead of replaced. Substituting c keeps
            // exactly one replacement character in the output.
            RaiseErrorOccurred(HtmlParseError.Null);
            c = '\ufffd';
            break;
        // NOTE(review): end-of-input sentinel, see the note above.
        case '':
            RaiseErrorOccurred(HtmlParseError.EOF);
            return HtmlToken.EndOfFile;
        }
        _textBuffer.Append(c);
        c = GetNext();
    }
}
/// <summary>
/// Script data double escaped dash dash state: entered after two
/// consecutive dashes inside double escaped script data. Further dashes
/// are collected here; '&gt;' returns to the script data state.
/// </summary>
/// <returns>The token produced by the state that finally emits.</returns>
private HtmlToken ScriptDataEscapedDoubleDashDash()
{
    while (true) {
        char next = GetNext();
        switch (next) {
        case '-':
            _textBuffer.Append('-');
            continue;
        case '<':
            _textBuffer.Append('<');
            return ScriptDataEscapedDoubleLT();
        case '>':
            _textBuffer.Append('>');
            return ScriptData(GetNext());
        case '\0':
            // U+0000 NULL is a parse error and becomes U+FFFD. The label
            // used to match U+0020 SPACE instead.
            RaiseErrorOccurred(HtmlParseError.Null);
            _textBuffer.Append('\ufffd');
            return ScriptDataEscapedDouble(GetNext());
        // NOTE(review): this label should hold the end-of-input sentinel
        // produced by GetNext(); the literal appears empty here - verify
        // that the file encoding preserved the character.
        case '':
            RaiseErrorOccurred(HtmlParseError.EOF);
            return HtmlToken.EndOfFile;
        default:
            _textBuffer.Append(next);
            return ScriptDataEscapedDouble(GetNext());
        }
    }
}
/// <summary>
/// Handles the character following a less-than sign in double escaped
/// script data. Only '/' is special: it resets the name buffer and starts
/// looking for the end of the double escape.
/// </summary>
/// <returns>The token produced by the follow-up state.</returns>
private HtmlToken ScriptDataEscapedDoubleLT()
{
    char current = GetNext();
    if (current != '/')
        return ScriptDataEscapedDouble(current);
    _stringBuffer.Clear();
    _textBuffer.Append('/');
    return ScriptDataEndDoubleEscape();
}
/// <summary>
/// Collects letters after "&lt;/" in double escaped script data. When the
/// run is terminated by a space character, '/' or '&gt;', the collected
/// name decides the next state: "script" (ignoring case) drops back to the
/// escaped state, any other name stays in the double escaped state.
/// </summary>
/// <returns>The token produced by the follow-up state.</returns>
private HtmlToken ScriptDataEndDoubleEscape()
{
    for (char current = GetNext(); ; current = GetNext()) {
        bool terminates = current.IsSpaceCharacter() || current == '/' || current == '>';
        if (terminates) {
            _textBuffer.Append(current);
            bool isScript = _stringBuffer.ToString().Equals(Tags.Script, StringComparison.OrdinalIgnoreCase);
            char following = GetNext();
            return isScript ? ScriptDataEscaped(following) : ScriptDataEscapedDouble(following);
        }
        // A non-letter that is not a terminator is reconsumed in the
        // double escaped state without being appended here.
        if (!current.IsLetter())
            return ScriptDataEscapedDouble(current);
        _stringBuffer.Append(current);
        _textBuffer.Append(current);
    }
}
/// <summary>
/// Hands the given token back to the caller unchanged. The emit helpers
/// in this class return their result through this method.
/// </summary>
/// <param name="token">The token to emit.</param>
/// <returns>The same token that was passed in.</returns>
private HtmlToken Emit(HtmlToken token)
{
return token;
}
/// <summary>
/// Builds a comment token from the current content of the string buffer
/// and emits it.
/// </summary>
/// <returns>The emitted comment token.</returns>
private HtmlToken EmitComment()
{
    string content = _stringBuffer.ToString();
    return Emit(HtmlToken.Comment(content));
}
/// <summary>
/// Finishes a tag token and emits it. The tokenizer state is reset to
/// PCData. For start tags, repeated attribute names are dropped (the first
/// occurrence wins, one AttributeDuplicateOmitted error per dropped
/// attribute) and the tag name is remembered as the last start tag. For
/// other tags, a self-closing flag or any attributes are reported as
/// errors.
/// </summary>
/// <param name="tag">The tag token to finish.</param>
/// <returns>The emitted tag token.</returns>
private HtmlToken EmitTag(HtmlTagToken tag)
{
    _state = HtmlParseMode.PCData;
    List<KeyValuePair<string, string>> attributes = tag.Attributes;
    if (tag.Type == HtmlTokenType.StartTag) {
        if (attributes.Count > 1) {
            // Single forward pass with a set of seen names instead of the
            // former nested scan with RemoveAt, which was quadratic in the
            // number of attributes. Survivors are compacted to the front
            // in their original order and the tail is cut off afterwards.
            HashSet<string> seen = new HashSet<string>();
            int kept = 0;
            for (int i = 0; i < attributes.Count; i++) {
                KeyValuePair<string, string> attribute = attributes[i];
                if (seen.Add(attribute.Key))
                    attributes[kept++] = attribute;
                else
                    RaiseErrorOccurred(HtmlParseError.AttributeDuplicateOmitted);
            }
            attributes.RemoveRange(kept, attributes.Count - kept);
        }
        _lastStartTag = tag.Name;
    } else {
        if (tag.IsSelfClosing)
            RaiseErrorOccurred(HtmlParseError.EndTagCannotBeSelfClosed);
        if (attributes.Count != 0)
            RaiseErrorOccurred(HtmlParseError.EndTagCannotHaveAttributes);
    }
    return Emit(tag);
}
}
}