// XmlTokenizer
// Performs the tokenization of the source code. Most of
// the information is taken from http://www.w3.org/TR/REC-xml/.
using AngleSharp.Events;
using AngleSharp.Extensions;
using AngleSharp.Html;
using AngleSharp.Services;
using System.Diagnostics;
namespace AngleSharp.Parser.Xml
{
/// <summary>
/// Tokenizer for XML source text. Each call to <see cref="Get"/> yields
/// one token; malformed input is reported by throwing the matching
/// XmlParseError at the current (or token start) position.
/// </summary>
[DebuggerStepThrough]
internal sealed class XmlTokenizer : BaseTokenizer
{
    // Sentinel that the characters read via GetNext() are compared against
    // to detect the end of the input.
    // NOTE(review): the literal was lost from this file's text (it showed up
    // as an empty char literal, which does not compile). char.MaxValue
    // (U+FFFF) is assumed to be the end-of-input marker used by
    // BaseTokenizer - confirm against the base class.
    private const char EndOfFile = char.MaxValue;

    // Resolves named entity references (the text between '&' and ';').
    private readonly IEntityService _resolver;

    // Position recorded at the start of the token currently being built.
    private TextPosition _position;

    /// <summary>
    /// Creates a new tokenizer.
    /// </summary>
    /// <param name="source">The text source, passed to the base tokenizer.</param>
    /// <param name="events">The event aggregator, passed to the base tokenizer.</param>
    /// <param name="resolver">The service used to look up named entities.</param>
    public XmlTokenizer(TextSource source, IEventAggregator events, IEntityService resolver)
        : base(source, events)
    {
        _resolver = resolver;
    }

    /// <summary>
    /// Reads the next token.
    /// </summary>
    /// <returns>The next token, or an end-of-file token once the input is exhausted.</returns>
    public XmlToken Get()
    {
        char current = GetNext();

        if (current == EndOfFile)
        {
            return NewEof();
        }

        // Remember where this token begins; the New* factories use it.
        _position = GetCurrentPosition();
        return Data(current);
    }

    /// <summary>
    /// Dispatches on the first character of a token: markup, end of input or text.
    /// </summary>
    private XmlToken Data(char c)
    {
        switch (c)
        {
            case '<':
                return TagOpen(GetNext());
            case EndOfFile:
                return NewEof();
            default:
                return DataText(c);
        }
    }

    /// <summary>
    /// Collects character data up to the next '&lt;' or the end of input,
    /// expanding references introduced by '&amp;' along the way.
    /// </summary>
    private XmlToken DataText(char c)
    {
        while (true)
        {
            switch (c)
            {
                case EndOfFile:
                case '<':
                    // Step back so the terminating character is not consumed by this token.
                    Back();
                    return NewCharacters();
                case '&':
                    _stringBuffer.Append(CharacterReference(GetNext()));
                    c = GetNext();
                    break;
                case ']':
                    _stringBuffer.Append(c);
                    // Rejects the sequence "]]>" inside character data.
                    c = CheckCharacter(GetNext());
                    break;
                default:
                    _stringBuffer.Append(c);
                    c = GetNext();
                    break;
            }
        }
    }

    /// <summary>
    /// Called with the character that follows a ']' in character data.
    /// Throws XmlInvalidCharData if that character is another ']' which is
    /// directly followed by '&gt;'; otherwise returns the character unchanged.
    /// </summary>
    private char CheckCharacter(char ch)
    {
        if (ch == ']')
        {
            if (GetNext() == '>')
            {
                throw XmlParseError.XmlInvalidCharData.At(GetCurrentPosition());
            }

            // Undo the look-ahead.
            Back();
        }

        return ch;
    }

    /// <summary>
    /// Reads the content of a CDATA section up to the closing "]]&gt;".
    /// Throws an EOF error if the input ends first.
    /// </summary>
    private XmlCDataToken CData(char c)
    {
        while (true)
        {
            if (c == EndOfFile)
            {
                throw XmlParseError.EOF.At(GetCurrentPosition());
            }

            if (c == ']' && ContinuesWithSensitive("]]>"))
            {
                // Skip the remaining two characters of the terminator.
                Advance(2);
                return NewCharacterData();
            }

            _stringBuffer.Append(c);
            c = GetNext();
        }
    }

    /// <summary>
    /// Reads a character or entity reference; <paramref name="c"/> is the
    /// character right after the '&amp;'.
    /// </summary>
    /// <returns>The replacement text of the reference.</returns>
    /// <remarks>
    /// Numeric references are decoded as hexadecimal when they start with
    /// "#x" / "#X" and as decimal otherwise; named references are looked up
    /// through the entity service. Throws CharacterReferenceNotTerminated if
    /// the reference is empty or not closed by ';', CharacterReferenceInvalidNumber
    /// for a number rejected by IsValidAsCharRef, and CharacterReferenceInvalidCode
    /// when the entity service yields nothing for the name.
    /// </remarks>
    private string CharacterReference(char c)
    {
        // The shared buffer may already hold text of the enclosing token, so
        // the reference is collected behind that and cut out again afterwards.
        int start = _stringBuffer.Length;
        bool isNumeric = c == '#';
        bool isHex = false;

        if (isNumeric)
        {
            c = GetNext();
            isHex = c == 'x' || c == 'X';

            if (isHex)
            {
                c = GetNext();

                while (c.IsHex())
                {
                    _stringBuffer.Append(c);
                    c = GetNext();
                }
            }
            else
            {
                while (c.IsDigit())
                {
                    _stringBuffer.Append(c);
                    c = GetNext();
                }
            }
        }
        else if (c.IsXmlNameStart())
        {
            do
            {
                _stringBuffer.Append(c);
                c = GetNext();
            }
            while (c.IsXmlName());
        }

        if (c != ';' || _stringBuffer.Length <= start)
        {
            throw XmlParseError.CharacterReferenceNotTerminated.At(GetCurrentPosition());
        }

        int count = _stringBuffer.Length - start;
        string content = _stringBuffer.ToString(start, count);
        _stringBuffer.Remove(start, count);

        if (isNumeric)
        {
            // Pick the radix from the prefix that was actually seen.
            int code = isHex ? content.FromHex() : content.FromDec();

            if (!code.IsValidAsCharRef())
            {
                throw XmlParseError.CharacterReferenceInvalidNumber.At(_position);
            }

            return code.ConvertFromUtf32();
        }

        string symbol = _resolver.GetSymbol(content);

        if (string.IsNullOrEmpty(symbol))
        {
            throw XmlParseError.CharacterReferenceInvalidCode.At(_position);
        }

        return symbol;
    }

    /// <summary>
    /// Handles the character after '&lt;': markup declaration, processing
    /// instruction / XML declaration, end tag or start tag.
    /// </summary>
    private XmlToken TagOpen(char c)
    {
        switch (c)
        {
            case '!':
                return MarkupDeclaration(GetNext());
            case '?':
                c = GetNext();

                if (ContinuesWithSensitive(TagNames.Xml))
                {
                    // Move to the last character of the matched keyword.
                    Advance(2);
                    return DeclarationStart(GetNext());
                }

                return ProcessingStart(c);
            case '/':
                return TagEnd(GetNext());
            default:
                if (c.IsXmlNameStart())
                {
                    _stringBuffer.Append(c);
                    return TagName(GetNext(), NewOpenTag());
                }

                throw XmlParseError.XmlInvalidStartTag.At(GetCurrentPosition());
        }
    }

    /// <summary>
    /// Reads an end tag: a name, optional white space and '&gt;'.
    /// Throws EOF if the input ended, XmlInvalidEndTag otherwise.
    /// </summary>
    private XmlToken TagEnd(char c)
    {
        if (c.IsXmlNameStart())
        {
            do
            {
                _stringBuffer.Append(c);
                c = GetNext();
            }
            while (c.IsXmlName());

            while (c.IsSpaceCharacter())
            {
                c = GetNext();
            }

            if (c == '>')
            {
                XmlTagToken closeTag = NewCloseTag();
                closeTag.Name = FlushBuffer();
                return closeTag;
            }
        }

        if (c == EndOfFile)
        {
            throw XmlParseError.EOF.At(GetCurrentPosition());
        }

        throw XmlParseError.XmlInvalidEndTag.At(GetCurrentPosition());
    }

    /// <summary>
    /// Reads the remainder of a start tag name and continues with whatever
    /// follows it (end of tag, attributes or self-closing marker).
    /// </summary>
    private XmlToken TagName(char c, XmlTagToken tag)
    {
        while (c.IsXmlName())
        {
            _stringBuffer.Append(c);
            c = GetNext();
        }

        tag.Name = FlushBuffer();

        if (c == EndOfFile)
        {
            throw XmlParseError.EOF.At(GetCurrentPosition());
        }

        if (c == '>')
        {
            return tag;
        }

        if (c.IsSpaceCharacter())
        {
            return AttributeBeforeName(GetNext(), tag);
        }

        if (c == '/')
        {
            return TagSelfClosing(GetNext(), tag);
        }

        throw XmlParseError.XmlInvalidName.At(GetCurrentPosition());
    }

    /// <summary>
    /// Marks the tag as self-closing and expects the '&gt;' that must follow the '/'.
    /// </summary>
    private XmlToken TagSelfClosing(char c, XmlTagToken tag)
    {
        tag.IsSelfClosing = true;

        if (c == '>')
        {
            return tag;
        }

        if (c == EndOfFile)
        {
            throw XmlParseError.EOF.At(GetCurrentPosition());
        }

        throw XmlParseError.XmlInvalidName.At(GetCurrentPosition());
    }

    /// <summary>
    /// Handles what follows "&lt;!": a comment, a doctype or a CDATA section.
    /// The decision is made by looking ahead in the source, not from
    /// <paramref name="c"/>.
    /// </summary>
    private XmlToken MarkupDeclaration(char c)
    {
        if (ContinuesWithSensitive("--"))
        {
            Advance();
            return CommentStart(GetNext());
        }

        if (ContinuesWithSensitive(TagNames.Doctype))
        {
            Advance(6);
            return Doctype(GetNext());
        }

        if (ContinuesWithSensitive(Keywords.CData))
        {
            Advance(6);
            return CData(GetNext());
        }

        throw XmlParseError.UndefinedMarkupDeclaration.At(GetCurrentPosition());
    }

    /// <summary>
    /// Entered after "&lt;?xml". Without following white space this is a
    /// processing instruction whose target merely starts with "xml";
    /// otherwise the version pseudo-attribute has to come next.
    /// </summary>
    private XmlToken DeclarationStart(char c)
    {
        if (!c.IsSpaceCharacter())
        {
            // Put the already consumed keyword back in front of the target name.
            _stringBuffer.Append(TagNames.Xml);
            return ProcessingTarget(c, NewProcessing());
        }

        do
        {
            c = GetNext();
        }
        while (c.IsSpaceCharacter());

        if (ContinuesWithSensitive(AttributeNames.Version))
        {
            Advance(6);
            return DeclarationVersionAfterName(GetNext(), NewDeclaration());
        }

        throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
    }

    /// <summary>
    /// Expects the '=' after the version name (white space allowed before it).
    /// </summary>
    private XmlToken DeclarationVersionAfterName(char c, XmlDeclarationToken decl)
    {
        while (c.IsSpaceCharacter())
        {
            c = GetNext();
        }

        if (c != '=')
        {
            throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
        }

        return DeclarationVersionBeforeValue(GetNext(), decl);
    }

    /// <summary>
    /// Expects the opening quote of the version value.
    /// </summary>
    private XmlToken DeclarationVersionBeforeValue(char c, XmlDeclarationToken decl)
    {
        while (c.IsSpaceCharacter())
        {
            c = GetNext();
        }

        if (c != '"' && c != '\'')
        {
            throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
        }

        return DeclarationVersionValue(GetNext(), c, decl);
    }

    /// <summary>
    /// Reads the version value up to the closing quote <paramref name="q"/>.
    /// </summary>
    private XmlToken DeclarationVersionValue(char c, char q, XmlDeclarationToken decl)
    {
        while (c != q)
        {
            if (c == EndOfFile)
            {
                throw XmlParseError.EOF.At(GetCurrentPosition());
            }

            _stringBuffer.Append(c);
            c = GetNext();
        }

        decl.Version = FlushBuffer();
        c = GetNext();

        if (c.IsSpaceCharacter())
        {
            return DeclarationAfterVersion(c, decl);
        }

        return DeclarationEnd(c, decl);
    }

    /// <summary>
    /// After the version: optionally encoding or standalone, else the end of the declaration.
    /// </summary>
    private XmlToken DeclarationAfterVersion(char c, XmlDeclarationToken decl)
    {
        while (c.IsSpaceCharacter())
        {
            c = GetNext();
        }

        if (ContinuesWithSensitive(AttributeNames.Encoding))
        {
            Advance(7);
            return DeclarationEncodingAfterName(GetNext(), decl);
        }

        if (ContinuesWithSensitive(AttributeNames.Standalone))
        {
            Advance(9);
            return DeclarationStandaloneAfterName(GetNext(), decl);
        }

        return DeclarationEnd(c, decl);
    }

    /// <summary>
    /// Expects the '=' after the encoding name (white space allowed before it).
    /// </summary>
    private XmlToken DeclarationEncodingAfterName(char c, XmlDeclarationToken decl)
    {
        while (c.IsSpaceCharacter())
        {
            c = GetNext();
        }

        if (c != '=')
        {
            throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
        }

        return DeclarationEncodingBeforeValue(GetNext(), decl);
    }

    /// <summary>
    /// Expects the opening quote of the encoding value, directly followed by a letter.
    /// </summary>
    private XmlToken DeclarationEncodingBeforeValue(char c, XmlDeclarationToken decl)
    {
        while (c.IsSpaceCharacter())
        {
            c = GetNext();
        }

        if (c == '"' || c == '\'')
        {
            char quote = c;
            c = GetNext();

            if (c.IsLetter())
            {
                return DeclarationEncodingValue(c, quote, decl);
            }
        }

        throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
    }

    /// <summary>
    /// Reads the encoding value up to the closing quote <paramref name="q"/>.
    /// Only ASCII alphanumerics, '.', '_' and '-' are accepted.
    /// </summary>
    private XmlToken DeclarationEncodingValue(char c, char q, XmlDeclarationToken decl)
    {
        do
        {
            bool allowed = c.IsAlphanumericAscii() || c == '.' || c == '_' || c == '-';

            if (!allowed)
            {
                throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
            }

            _stringBuffer.Append(c);
            c = GetNext();
        }
        while (c != q);

        decl.Encoding = FlushBuffer();
        c = GetNext();

        if (c.IsSpaceCharacter())
        {
            return DeclarationAfterEncoding(c, decl);
        }

        return DeclarationEnd(c, decl);
    }

    /// <summary>
    /// After the encoding: optionally standalone, else the end of the declaration.
    /// </summary>
    private XmlToken DeclarationAfterEncoding(char c, XmlDeclarationToken decl)
    {
        while (c.IsSpaceCharacter())
        {
            c = GetNext();
        }

        if (ContinuesWithSensitive(AttributeNames.Standalone))
        {
            Advance(9);
            return DeclarationStandaloneAfterName(GetNext(), decl);
        }

        return DeclarationEnd(c, decl);
    }

    /// <summary>
    /// Expects the '=' after the standalone name (white space allowed before it).
    /// </summary>
    private XmlToken DeclarationStandaloneAfterName(char c, XmlDeclarationToken decl)
    {
        while (c.IsSpaceCharacter())
        {
            c = GetNext();
        }

        if (c != '=')
        {
            throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
        }

        return DeclarationStandaloneBeforeValue(GetNext(), decl);
    }

    /// <summary>
    /// Expects the opening quote of the standalone value.
    /// </summary>
    private XmlToken DeclarationStandaloneBeforeValue(char c, XmlDeclarationToken decl)
    {
        while (c.IsSpaceCharacter())
        {
            c = GetNext();
        }

        if (c != '"' && c != '\'')
        {
            throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
        }

        return DeclarationStandaloneValue(GetNext(), c, decl);
    }

    /// <summary>
    /// Reads the standalone value up to the closing quote <paramref name="q"/>;
    /// it has to match Keywords.Yes or Keywords.No.
    /// </summary>
    private XmlToken DeclarationStandaloneValue(char c, char q, XmlDeclarationToken decl)
    {
        while (c != q)
        {
            if (c == EndOfFile)
            {
                throw XmlParseError.EOF.At(GetCurrentPosition());
            }

            _stringBuffer.Append(c);
            c = GetNext();
        }

        string value = FlushBuffer();

        if (value.Is(Keywords.Yes))
        {
            decl.Standalone = true;
        }
        else if (value.Is(Keywords.No))
        {
            decl.Standalone = false;
        }
        else
        {
            throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
        }

        return DeclarationEnd(GetNext(), decl);
    }

    /// <summary>
    /// Expects the closing "?&gt;" of the declaration (white space allowed before it).
    /// </summary>
    private XmlDeclarationToken DeclarationEnd(char c, XmlDeclarationToken decl)
    {
        while (c.IsSpaceCharacter())
        {
            c = GetNext();
        }

        if (c != '?' || GetNext() != '>')
        {
            throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
        }

        return decl;
    }

    /// <summary>
    /// Entered after the doctype keyword; white space has to follow.
    /// </summary>
    private XmlToken Doctype(char c)
    {
        if (!c.IsSpaceCharacter())
        {
            throw XmlParseError.DoctypeInvalid.At(GetCurrentPosition());
        }

        return DoctypeNameBefore(GetNext());
    }

    /// <summary>
    /// Skips white space and expects the first character of the doctype name.
    /// </summary>
    private XmlToken DoctypeNameBefore(char c)
    {
        while (c.IsSpaceCharacter())
        {
            c = GetNext();
        }

        if (!c.IsXmlNameStart())
        {
            throw XmlParseError.DoctypeInvalid.At(GetCurrentPosition());
        }

        _stringBuffer.Append(c);
        return DoctypeName(GetNext(), NewDoctype());
    }

    /// <summary>
    /// Reads the remainder of the doctype name.
    /// </summary>
    private XmlToken DoctypeName(char c, XmlDoctypeToken doctype)
    {
        while (c.IsXmlName())
        {
            _stringBuffer.Append(c);
            c = GetNext();
        }

        doctype.Name = FlushBuffer();

        if (c == '>')
        {
            return doctype;
        }

        if (c.IsSpaceCharacter())
        {
            return DoctypeNameAfter(GetNext(), doctype);
        }

        throw XmlParseError.DoctypeInvalid.At(GetCurrentPosition());
    }

    /// <summary>
    /// After the doctype name: end of the doctype, a public or system
    /// identifier keyword, or an opening '['.
    /// </summary>
    private XmlToken DoctypeNameAfter(char c, XmlDoctypeToken doctype)
    {
        while (c.IsSpaceCharacter())
        {
            c = GetNext();
        }

        if (c == '>')
        {
            return doctype;
        }

        if (ContinuesWithSensitive(Keywords.Public))
        {
            Advance(5);
            return DoctypePublic(GetNext(), doctype);
        }

        if (ContinuesWithSensitive(Keywords.System))
        {
            Advance(5);
            return DoctypeSystem(GetNext(), doctype);
        }

        if (c == '[')
        {
            Advance();
            return DoctypeAfter(GetNext(), doctype);
        }

        throw XmlParseError.DoctypeInvalid.At(GetCurrentPosition());
    }

    /// <summary>
    /// After the public keyword: white space, then the quoted public identifier.
    /// </summary>
    private XmlToken DoctypePublic(char c, XmlDoctypeToken doctype)
    {
        if (c.IsSpaceCharacter())
        {
            do
            {
                c = GetNext();
            }
            while (c.IsSpaceCharacter());

            if (c == '"' || c == '\'')
            {
                doctype.PublicIdentifier = string.Empty;
                return DoctypePublicIdentifierValue(GetNext(), c, doctype);
            }
        }

        throw XmlParseError.DoctypeInvalid.At(GetCurrentPosition());
    }

    /// <summary>
    /// Reads the public identifier up to the closing quote <paramref name="q"/>;
    /// every character has to satisfy IsPubidChar.
    /// </summary>
    private XmlToken DoctypePublicIdentifierValue(char c, char q, XmlDoctypeToken doctype)
    {
        while (c != q)
        {
            if (!c.IsPubidChar())
            {
                throw XmlParseError.XmlInvalidPubId.At(GetCurrentPosition());
            }

            _stringBuffer.Append(c);
            c = GetNext();
        }

        doctype.PublicIdentifier = FlushBuffer();
        return DoctypePublicIdentifierAfter(GetNext(), doctype);
    }

    /// <summary>
    /// After the public identifier: end of the doctype, or white space
    /// leading to an optional system identifier.
    /// </summary>
    private XmlToken DoctypePublicIdentifierAfter(char c, XmlDoctypeToken doctype)
    {
        if (c == '>')
        {
            return doctype;
        }

        if (c.IsSpaceCharacter())
        {
            return DoctypeBetween(GetNext(), doctype);
        }

        throw XmlParseError.DoctypeInvalid.At(GetCurrentPosition());
    }

    /// <summary>
    /// Between public and system identifier: end of the doctype or the
    /// opening quote of the system identifier.
    /// </summary>
    private XmlToken DoctypeBetween(char c, XmlDoctypeToken doctype)
    {
        while (c.IsSpaceCharacter())
        {
            c = GetNext();
        }

        if (c == '>')
        {
            return doctype;
        }

        if (c == '"' || c == '\'')
        {
            doctype.SystemIdentifier = string.Empty;
            return DoctypeSystemIdentifierValue(GetNext(), c, doctype);
        }

        throw XmlParseError.DoctypeInvalid.At(GetCurrentPosition());
    }

    /// <summary>
    /// After the system keyword: white space, then the quoted system identifier.
    /// </summary>
    private XmlToken DoctypeSystem(char c, XmlDoctypeToken doctype)
    {
        if (c.IsSpaceCharacter())
        {
            do
            {
                c = GetNext();
            }
            while (c.IsSpaceCharacter());

            if (c == '"' || c == '\'')
            {
                doctype.SystemIdentifier = string.Empty;
                return DoctypeSystemIdentifierValue(GetNext(), c, doctype);
            }
        }

        throw XmlParseError.DoctypeInvalid.At(GetCurrentPosition());
    }

    /// <summary>
    /// Reads the system identifier up to the closing quote <paramref name="q"/>.
    /// </summary>
    private XmlToken DoctypeSystemIdentifierValue(char c, char q, XmlDoctypeToken doctype)
    {
        while (c != q)
        {
            if (c == EndOfFile)
            {
                throw XmlParseError.EOF.At(GetCurrentPosition());
            }

            _stringBuffer.Append(c);
            c = GetNext();
        }

        doctype.SystemIdentifier = FlushBuffer();
        return DoctypeSystemIdentifierAfter(GetNext(), doctype);
    }

    /// <summary>
    /// After the system identifier: skips white space and, if a '[' follows,
    /// advances past it before checking for the end of the doctype.
    /// </summary>
    private XmlToken DoctypeSystemIdentifierAfter(char c, XmlDoctypeToken doctype)
    {
        while (c.IsSpaceCharacter())
        {
            c = GetNext();
        }

        if (c == '[')
        {
            Advance();
            c = GetNext();
        }

        return DoctypeAfter(c, doctype);
    }

    /// <summary>
    /// Expects the closing '&gt;' of the doctype (white space allowed before it).
    /// </summary>
    private XmlToken DoctypeAfter(char c, XmlDoctypeToken doctype)
    {
        while (c.IsSpaceCharacter())
        {
            c = GetNext();
        }

        if (c != '>')
        {
            throw XmlParseError.DoctypeInvalid.At(GetCurrentPosition());
        }

        return doctype;
    }

    /// <summary>
    /// Inside a start tag, before an attribute name: end of tag,
    /// self-closing marker or the first character of an attribute name.
    /// </summary>
    private XmlToken AttributeBeforeName(char c, XmlTagToken tag)
    {
        while (c.IsSpaceCharacter())
        {
            c = GetNext();
        }

        if (c == '/')
        {
            return TagSelfClosing(GetNext(), tag);
        }

        if (c == '>')
        {
            return tag;
        }

        if (c == EndOfFile)
        {
            throw XmlParseError.EOF.At(GetCurrentPosition());
        }

        if (c.IsXmlNameStart())
        {
            _stringBuffer.Append(c);
            return AttributeName(GetNext(), tag);
        }

        throw XmlParseError.XmlInvalidAttribute.At(GetCurrentPosition());
    }

    /// <summary>
    /// Reads the remainder of an attribute name, registers it on the tag and
    /// expects the '=' that has to follow (white space allowed before it).
    /// Throws XmlUniqueAttribute when the tag already reports a non-empty
    /// value for that name.
    /// </summary>
    private XmlToken AttributeName(char c, XmlTagToken tag)
    {
        while (c.IsXmlName())
        {
            _stringBuffer.Append(c);
            c = GetNext();
        }

        string name = FlushBuffer();

        if (!string.IsNullOrEmpty(tag.GetAttribute(name)))
        {
            throw XmlParseError.XmlUniqueAttribute.At(GetCurrentPosition());
        }

        tag.AddAttribute(name);

        while (c.IsSpaceCharacter())
        {
            c = GetNext();
        }

        if (c != '=')
        {
            throw XmlParseError.XmlInvalidAttribute.At(GetCurrentPosition());
        }

        return AttributeBeforeValue(GetNext(), tag);
    }

    /// <summary>
    /// Expects the opening quote of an attribute value.
    /// </summary>
    private XmlToken AttributeBeforeValue(char c, XmlTagToken tag)
    {
        while (c.IsSpaceCharacter())
        {
            c = GetNext();
        }

        if (c != '"' && c != '\'')
        {
            throw XmlParseError.XmlInvalidAttribute.At(GetCurrentPosition());
        }

        return AttributeValue(GetNext(), c, tag);
    }

    /// <summary>
    /// Reads an attribute value up to the closing quote <paramref name="q"/>,
    /// expanding references and rejecting a literal '&lt;'.
    /// </summary>
    private XmlToken AttributeValue(char c, char q, XmlTagToken tag)
    {
        while (c != q)
        {
            switch (c)
            {
                case EndOfFile:
                    throw XmlParseError.EOF.At(GetCurrentPosition());
                case '&':
                    _stringBuffer.Append(CharacterReference(GetNext()));
                    break;
                case '<':
                    throw XmlParseError.XmlLtInAttributeValue.At(GetCurrentPosition());
                default:
                    _stringBuffer.Append(c);
                    break;
            }

            c = GetNext();
        }

        tag.SetAttributeValue(FlushBuffer());
        return AttributeAfterValue(GetNext(), tag);
    }

    /// <summary>
    /// After an attribute value: white space (more attributes may follow),
    /// the self-closing marker or the end of the tag.
    /// </summary>
    private XmlToken AttributeAfterValue(char c, XmlTagToken tag)
    {
        if (c.IsSpaceCharacter())
        {
            return AttributeBeforeName(GetNext(), tag);
        }

        if (c == '/')
        {
            return TagSelfClosing(GetNext(), tag);
        }

        if (c == '>')
        {
            return tag;
        }

        throw XmlParseError.XmlInvalidAttribute.At(GetCurrentPosition());
    }

    /// <summary>
    /// Entered after "&lt;?" when no XML declaration follows; expects the
    /// first character of the processing instruction target.
    /// </summary>
    private XmlToken ProcessingStart(char c)
    {
        if (!c.IsXmlNameStart())
        {
            throw XmlParseError.XmlInvalidPI.At(GetCurrentPosition());
        }

        _stringBuffer.Append(c);
        return ProcessingTarget(GetNext(), NewProcessing());
    }

    /// <summary>
    /// Reads the remainder of the processing instruction target. A target
    /// for which Isi(TagNames.Xml) holds is rejected with XmlInvalidPI.
    /// </summary>
    private XmlToken ProcessingTarget(char c, XmlPIToken pi)
    {
        while (c.IsXmlName())
        {
            _stringBuffer.Append(c);
            c = GetNext();
        }

        pi.Target = FlushBuffer();

        if (pi.Target.Isi(TagNames.Xml))
        {
            throw XmlParseError.XmlInvalidPI.At(GetCurrentPosition());
        }

        if (c == '?')
        {
            c = GetNext();

            if (c == '>')
            {
                return pi;
            }
        }
        else if (c.IsSpaceCharacter())
        {
            return ProcessingContent(GetNext(), pi);
        }

        throw XmlParseError.XmlInvalidPI.At(GetCurrentPosition());
    }

    /// <summary>
    /// Reads the content of a processing instruction up to the closing "?&gt;".
    /// </summary>
    private XmlToken ProcessingContent(char c, XmlPIToken pi)
    {
        while (true)
        {
            if (c == EndOfFile)
            {
                throw XmlParseError.EOF.At(GetCurrentPosition());
            }

            if (c == '?')
            {
                c = GetNext();

                if (c == '>')
                {
                    pi.Content = FlushBuffer();
                    return pi;
                }

                // Not the terminator: the '?' is content and the character
                // that followed it is examined in the next iteration.
                _stringBuffer.Append('?');
            }
            else
            {
                _stringBuffer.Append(c);
                c = GetNext();
            }
        }
    }

    /// <summary>
    /// Entered after "&lt;!--"; forwards to <see cref="Comment"/>.
    /// </summary>
    private XmlToken CommentStart(char c)
    {
        return Comment(c);
    }

    /// <summary>
    /// Reads comment content. Two consecutive dashes must be followed by
    /// '&gt;' (see <see cref="CommentEnd"/>); a single dash is kept as part
    /// of the content. A character failing IsXmlChar raises XmlInvalidComment.
    /// </summary>
    private XmlToken Comment(char c)
    {
        // Handled in one loop (instead of mutually recursive state methods)
        // so that the call depth does not grow with the number of dashes.
        while (c.IsXmlChar())
        {
            if (c == '-')
            {
                c = GetNext();

                if (c == '-')
                {
                    return CommentEnd(GetNext());
                }

                // A lone dash belongs to the comment text; the character
                // after it is examined by the next iteration.
                _stringBuffer.Append('-');
                continue;
            }

            _stringBuffer.Append(c);
            c = GetNext();
        }

        throw XmlParseError.XmlInvalidComment.At(GetCurrentPosition());
    }

    /// <summary>
    /// Entered after "--" inside a comment; only '&gt;' may follow.
    /// </summary>
    private XmlToken CommentEnd(char c)
    {
        if (c != '>')
        {
            throw XmlParseError.XmlInvalidComment.At(GetCurrentPosition());
        }

        return NewComment();
    }

    /// <summary>
    /// Creates an end-of-file token at the current position.
    /// </summary>
    private XmlEndOfFileToken NewEof()
    {
        return new XmlEndOfFileToken(GetCurrentPosition());
    }

    /// <summary>
    /// Creates a character token from the flushed buffer content.
    /// </summary>
    private XmlCharacterToken NewCharacters()
    {
        string content = FlushBuffer();
        return new XmlCharacterToken(_position, content);
    }

    /// <summary>
    /// Creates a comment token from the flushed buffer content.
    /// </summary>
    private XmlCommentToken NewComment()
    {
        string content = FlushBuffer();
        return new XmlCommentToken(_position, content);
    }

    /// <summary>
    /// Creates an empty processing instruction token at the token start position.
    /// </summary>
    private XmlPIToken NewProcessing()
    {
        return new XmlPIToken(_position);
    }

    /// <summary>
    /// Creates an empty doctype token at the token start position.
    /// </summary>
    private XmlDoctypeToken NewDoctype()
    {
        return new XmlDoctypeToken(_position);
    }

    /// <summary>
    /// Creates an empty XML declaration token at the token start position.
    /// </summary>
    private XmlDeclarationToken NewDeclaration()
    {
        return new XmlDeclarationToken(_position);
    }

    /// <summary>
    /// Creates a start tag token at the token start position.
    /// </summary>
    private XmlTagToken NewOpenTag()
    {
        return new XmlTagToken(XmlTokenType.StartTag, _position);
    }

    /// <summary>
    /// Creates an end tag token at the token start position.
    /// </summary>
    private XmlTagToken NewCloseTag()
    {
        return new XmlTagToken(XmlTokenType.EndTag, _position);
    }

    /// <summary>
    /// Creates a CDATA token from the flushed buffer content.
    /// </summary>
    private XmlCDataToken NewCharacterData()
    {
        string content = FlushBuffer();
        return new XmlCDataToken(_position, content);
    }
}
}