// XmlTokenizer
// Performs the tokenization of the XML source code. Most of
// the information is taken from http://www.w3.org/TR/REC-xml/.
using AngleSharp.Extensions;
using AngleSharp.Html;
using AngleSharp.Services;
namespace AngleSharp.Parser.Xml
{
internal sealed class XmlTokenizer : BaseTokenizer
{
// Entity lookup; CharacterReference() calls GetSymbol on it to resolve named references.
private readonly IEntityProvider _resolver;
// Position captured by Get() before a token is read; handed to the New* token
// factories and used for the invalid-character-reference error.
private TextPosition _position;
/// <summary>
/// Gets or sets whether CharacterReference() keeps a malformed reference as
/// literal text instead of throwing a parse error.
/// </summary>
public bool IsSuppressingErrors { get; set; }
/// <summary>
/// Creates a new tokenizer for XML input.
/// </summary>
/// <param name="source">The text source, passed on to the base tokenizer.</param>
/// <param name="resolver">The provider used to look up named entity references.</param>
public XmlTokenizer(TextSource source, IEntityProvider resolver)
    : base(source)
{
    this._resolver = resolver;
}
/// <summary>
/// Reads the next token from the source.
/// </summary>
/// <returns>The next token, or an end-of-file token when the input is exhausted.</returns>
public XmlToken Get()
{
    // End-of-input sentinel. The literal was lost in this copy of the file (an
    // empty '' was left behind); restored as U+FFFF.
    // NOTE(review): confirm this matches what BaseTokenizer.GetNext() returns at EOF.
    const char EndOfFile = char.MaxValue;

    char current = GetNext();

    if (current == EndOfFile) {
        return NewEof();
    }

    // Remember where the token starts so the New* factories can stamp it.
    _position = GetCurrentPosition();
    return Data(current);
}
/// <summary>
/// Dispatches on the first character of a token: markup, end of input or text.
/// </summary>
/// <param name="c">The character that starts the token.</param>
/// <returns>The token produced by the matching state.</returns>
private XmlToken Data(char c)
{
    // End-of-input sentinel. The literal was lost in this copy of the file (an
    // empty '' was left behind); restored as U+FFFF.
    // NOTE(review): confirm this matches what BaseTokenizer.GetNext() returns at EOF.
    const char EndOfFile = char.MaxValue;

    switch (c) {
        case '<':
            return TagOpen();
        case EndOfFile:
            return NewEof();
        default:
            return DataText(c);
    }
}
/// <summary>
/// Collects character data until the next '&lt;' or the end of the input.
/// References introduced by '&amp;' are replaced via CharacterReference().
/// </summary>
/// <param name="c">The first character of the text run.</param>
/// <returns>A character token holding the buffered text.</returns>
private XmlToken DataText(char c)
{
    // End-of-input sentinel. The literal was lost in this copy of the file (an
    // empty '' was left behind); restored as U+FFFF.
    // NOTE(review): confirm this matches what BaseTokenizer.GetNext() returns at EOF.
    const char EndOfFile = char.MaxValue;

    while (true) {
        switch (c) {
            case '<':
            case EndOfFile:
                // Step back so the terminator is read again by the next Get().
                Back();
                return NewCharacters();
            case '&':
                base.StringBuffer.Append(CharacterReference());
                c = GetNext();
                break;
            case ']':
                base.StringBuffer.Append(c);
                // Throws when the text contains the forbidden "]]>" sequence.
                c = CheckNextCharacter();
                break;
            default:
                base.StringBuffer.Append(c);
                c = GetNext();
                break;
        }
    }
}
/// <summary>
/// Called after a ']' in character data. Reads the following character and
/// throws if the input continues with "]&gt;", i.e. the text contains "]]&gt;".
/// </summary>
/// <returns>The character that follows the ']' already consumed by the caller.</returns>
private char CheckNextCharacter()
{
    char following = GetNext();

    if (following != ']') {
        return following;
    }

    char afterBrackets = GetNext();

    if (afterBrackets == '>') {
        throw XmlParseError.XmlInvalidCharData.At(GetCurrentPosition());
    }

    // Only peeked one character ahead; undo that read.
    Back();
    return following;
}
/// <summary>
/// Reads the content of a CDATA section up to the closing "]]&gt;".
/// </summary>
/// <returns>A CDATA token holding the buffered content.</returns>
private XmlCDataToken CData()
{
    // End-of-input sentinel. The literal was lost in this copy of the file (an
    // empty '' was left behind); restored as U+FFFF.
    // NOTE(review): confirm this matches what BaseTokenizer.GetNext() returns at EOF.
    const char EndOfFile = char.MaxValue;

    char next = GetNext();

    while (true) {
        if (next == EndOfFile) {
            throw XmlParseError.EOF.At(GetCurrentPosition());
        }

        if (next == ']' && ContinuesWithSensitive("]]>")) {
            // Skip the remaining two characters of the terminator.
            Advance(2);
            return NewCharacterData();
        }

        base.StringBuffer.Append(next);
        next = GetNext();
    }
}
/// <summary>
/// Reads a character or entity reference. The leading '&amp;' has already been
/// consumed by the caller. The reference body is staged at the end of the
/// string buffer and removed again once it has been resolved.
/// </summary>
/// <returns>
/// The replacement text of the reference. When the reference is malformed and
/// errors are suppressed, an empty string is returned and the raw text
/// (prefixed with '&amp;') is left in the string buffer instead.
/// </returns>
private string CharacterReference()
{
    char next = GetNext();
    int start = base.StringBuffer.Length;
    bool isNumeric = next == '#';
    bool isHex = false;

    if (isNumeric) {
        next = GetNext();
        isHex = next == 'x' || next == 'X';

        if (isHex) {
            next = GetNext();

            while (next.IsHex()) {
                base.StringBuffer.Append(next);
                next = GetNext();
            }
        } else {
            while (next.IsDigit()) {
                base.StringBuffer.Append(next);
                next = GetNext();
            }
        }
    } else if (next.IsXmlNameStart()) {
        do {
            base.StringBuffer.Append(next);
            next = GetNext();
        } while (next.IsXmlName());
    }

    // Tracks whether the terminating character has already been stored in the buffer.
    bool terminatorBuffered = false;

    if (next == ';' && base.StringBuffer.Length > start) {
        int length = base.StringBuffer.Length - start;
        string content = base.StringBuffer.ToString(start, length);

        if (isNumeric) {
            // Decimal references (&#N;) must not be parsed as hexadecimal;
            // only the &#xN; form is hex.
            int code = isHex ? content.FromHex() : content.FromDec();

            if (code.IsValidAsCharRef()) {
                base.StringBuffer.Remove(start, length);
                return code.ConvertFromUtf32();
            }
        } else {
            string symbol = _resolver.GetSymbol(content);

            if (!string.IsNullOrEmpty(symbol)) {
                base.StringBuffer.Remove(start, length);
                return symbol;
            }
        }

        if (!IsSuppressingErrors) {
            throw XmlParseError.CharacterReferenceInvalidCode.At(_position);
        }

        // Keep the unresolved reference, including its ';', as literal text.
        base.StringBuffer.Append(next);
        terminatorBuffered = true;
    }

    if (!IsSuppressingErrors) {
        throw XmlParseError.CharacterReferenceNotTerminated.At(GetCurrentPosition());
    }

    if (!terminatorBuffered) {
        // The character that ended the scan ('<', a quote, whitespace, ...) is not
        // part of the reference. Hand it back so the caller reads it again instead
        // of silently dropping it.
        Back();
    }

    // NOTE(review): a '#' / 'x' prefix is never buffered, so it is not part of
    // the literal text kept here.
    base.StringBuffer.Insert(start, '&');
    return string.Empty;
}
/// <summary>
/// Handles the character after '&lt;' and routes to the markup declaration,
/// processing instruction / XML declaration, end tag or start tag state.
/// </summary>
/// <returns>The token produced by the selected state.</returns>
private XmlToken TagOpen()
{
    char current = GetNext();

    if (current == '!') {
        return MarkupDeclaration();
    }

    if (current == '?') {
        current = GetNext();

        if (!ContinuesWithSensitive(TagNames.Xml)) {
            return ProcessingStart(current);
        }

        Advance(2);
        return DeclarationStart();
    }

    if (current == '/') {
        return TagEnd();
    }

    if (!current.IsXmlNameStart()) {
        throw XmlParseError.XmlInvalidStartTag.At(GetCurrentPosition());
    }

    base.StringBuffer.Append(current);
    return TagName(NewOpenTag());
}
/// <summary>
/// Reads an end tag after "&lt;/": a name, optional whitespace and '&gt;'.
/// </summary>
/// <returns>The end tag token carrying the tag name.</returns>
private XmlToken TagEnd()
{
    // End-of-input sentinel. The literal was lost in this copy of the file (an
    // empty '' was left behind); restored as U+FFFF.
    // NOTE(review): confirm this matches what BaseTokenizer.GetNext() returns at EOF.
    const char EndOfFile = char.MaxValue;

    char next = GetNext();

    if (next.IsXmlNameStart()) {
        do {
            base.StringBuffer.Append(next);
            next = GetNext();
        } while (next.IsXmlName());

        while (next.IsSpaceCharacter()) {
            next = GetNext();
        }

        if (next == '>') {
            XmlTagToken tag = NewCloseTag();
            tag.Name = FlushBuffer();
            return tag;
        }
    }

    if (next == EndOfFile) {
        throw XmlParseError.EOF.At(GetCurrentPosition());
    }

    throw XmlParseError.XmlInvalidEndTag.At(GetCurrentPosition());
}
/// <summary>
/// Reads the remainder of a start tag name (its first character is already
/// buffered) and continues with whatever follows the name.
/// </summary>
/// <param name="tag">The start tag token being filled.</param>
/// <returns>The completed tag token.</returns>
private XmlToken TagName(XmlTagToken tag)
{
    // End-of-input sentinel. The literal was lost in this copy of the file (an
    // empty '' was left behind); restored as U+FFFF.
    // NOTE(review): confirm this matches what BaseTokenizer.GetNext() returns at EOF.
    const char EndOfFile = char.MaxValue;

    char next = GetNext();

    while (next.IsXmlName()) {
        base.StringBuffer.Append(next);
        next = GetNext();
    }

    tag.Name = FlushBuffer();

    switch (next) {
        case EndOfFile:
            throw XmlParseError.EOF.At(GetCurrentPosition());
        case '>':
            return tag;
        default:
            if (next.IsSpaceCharacter()) {
                return AttributeBeforeName(tag);
            }

            if (next == '/') {
                return TagSelfClosing(tag);
            }

            throw XmlParseError.XmlInvalidName.At(GetCurrentPosition());
    }
}
/// <summary>
/// Handles the character after '/' in a start tag; only '&gt;' is accepted.
/// </summary>
/// <param name="tag">The tag token, marked as self-closing.</param>
/// <returns>The self-closing tag token.</returns>
private XmlToken TagSelfClosing(XmlTagToken tag)
{
    // End-of-input sentinel. The literal was lost in this copy of the file (an
    // empty '' was left behind); restored as U+FFFF.
    // NOTE(review): confirm this matches what BaseTokenizer.GetNext() returns at EOF.
    const char EndOfFile = char.MaxValue;

    char next = GetNext();
    tag.IsSelfClosing = true;

    switch (next) {
        case '>':
            return tag;
        case EndOfFile:
            throw XmlParseError.EOF.At(GetCurrentPosition());
        default:
            throw XmlParseError.XmlInvalidName.At(GetCurrentPosition());
    }
}
/// <summary>
/// Handles the input after "&lt;!": a comment, a DOCTYPE or a CDATA section.
/// </summary>
/// <returns>The token produced by the matching state.</returns>
private XmlToken MarkupDeclaration()
{
    // Move onto the first character after '!'; its value is not needed here.
    GetNext();

    bool isComment = ContinuesWithSensitive("--");

    if (isComment) {
        Advance();
        return CommentStart();
    }

    bool isDoctype = ContinuesWithSensitive(TagNames.Doctype);

    if (isDoctype) {
        Advance(6);
        return Doctype();
    }

    bool isCData = ContinuesWithSensitive(Keywords.CData);

    if (isCData) {
        Advance(6);
        return CData();
    }

    throw XmlParseError.UndefinedMarkupDeclaration.At(GetCurrentPosition());
}
/// <summary>
/// Entered after "&lt;?xml". If whitespace follows, an XML declaration starting
/// with the version attribute is expected; otherwise "xml" is treated as the
/// beginning of a processing instruction target.
/// </summary>
/// <returns>The declaration or processing instruction token.</returns>
private XmlToken DeclarationStart()
{
    char current = GetNext();

    if (current.IsSpaceCharacter()) {
        // Skip the whole run of whitespace.
        current = GetNext();

        while (current.IsSpaceCharacter()) {
            current = GetNext();
        }

        if (!ContinuesWithSensitive(AttributeNames.Version)) {
            throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
        }

        Advance(6);
        return DeclarationVersionAfterName(NewDeclaration());
    }

    base.StringBuffer.Append(TagNames.Xml);
    return ProcessingTarget(current, NewProcessing());
}
/// <summary>
/// Expects '=' (after optional whitespace) following the version attribute name.
/// </summary>
/// <param name="decl">The declaration token being filled.</param>
/// <returns>The completed declaration token.</returns>
private XmlToken DeclarationVersionAfterName(XmlDeclarationToken decl)
{
    char current = SkipSpaces();

    if (current == '=') {
        return DeclarationVersionBeforeValue(decl);
    }

    throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
}
/// <summary>
/// Expects the opening quote (single or double) of the version value.
/// </summary>
/// <param name="decl">The declaration token being filled.</param>
/// <returns>The completed declaration token.</returns>
private XmlToken DeclarationVersionBeforeValue(XmlDeclarationToken decl)
{
    char quote = SkipSpaces();
    bool isQuote = quote == '"' || quote == '\'';

    if (isQuote) {
        return DeclarationVersionValue(decl, quote);
    }

    throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
}
/// <summary>
/// Reads the quoted version value and stores it on the declaration.
/// </summary>
/// <param name="decl">The declaration token being filled.</param>
/// <param name="quote">The quote character that opened the value.</param>
/// <returns>The completed declaration token.</returns>
private XmlToken DeclarationVersionValue(XmlDeclarationToken decl, char quote)
{
    // End-of-input sentinel. The literal was lost in this copy of the file (an
    // empty '' was left behind); restored as U+FFFF.
    // NOTE(review): confirm this matches what BaseTokenizer.GetNext() returns at EOF.
    const char EndOfFile = char.MaxValue;

    char next = GetNext();

    while (next != quote) {
        if (next == EndOfFile) {
            throw XmlParseError.EOF.At(GetCurrentPosition());
        }

        base.StringBuffer.Append(next);
        next = GetNext();
    }

    decl.Version = FlushBuffer();
    next = GetNext();

    if (next.IsSpaceCharacter()) {
        return DeclarationAfterVersion(decl);
    }

    return DeclarationEnd(next, decl);
}
/// <summary>
/// After the version value: continues with the encoding attribute, the
/// standalone attribute, or the end of the declaration.
/// </summary>
/// <param name="decl">The declaration token being filled.</param>
/// <returns>The completed declaration token.</returns>
private XmlToken DeclarationAfterVersion(XmlDeclarationToken decl)
{
    char current = SkipSpaces();

    if (ContinuesWithSensitive(AttributeNames.Encoding)) {
        Advance(7);
        return DeclarationEncodingAfterName(decl);
    } else if (ContinuesWithSensitive(AttributeNames.Standalone)) {
        Advance(9);
        return DeclarationStandaloneAfterName(decl);
    } else {
        return DeclarationEnd(current, decl);
    }
}
/// <summary>
/// Expects '=' (after optional whitespace) following the encoding attribute name.
/// </summary>
/// <param name="decl">The declaration token being filled.</param>
/// <returns>The completed declaration token.</returns>
private XmlToken DeclarationEncodingAfterName(XmlDeclarationToken decl)
{
    char current = SkipSpaces();

    if (current == '=') {
        return DeclarationEncodingBeforeValue(decl);
    }

    throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
}
/// <summary>
/// Expects the opening quote of the encoding value followed by a letter,
/// which becomes the first character of the encoding name.
/// </summary>
/// <param name="decl">The declaration token being filled.</param>
/// <returns>The completed declaration token.</returns>
private XmlToken DeclarationEncodingBeforeValue(XmlDeclarationToken decl)
{
    char quote = SkipSpaces();

    if (quote != '"' && quote != '\'') {
        throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
    }

    char first = GetNext();

    if (!first.IsLetter()) {
        throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
    }

    base.StringBuffer.Append(first);
    return DeclarationEncodingValue(decl, quote);
}
/// <summary>
/// Reads the rest of the encoding name up to the closing quote. Only ASCII
/// alphanumerics, '.', '_' and '-' are accepted.
/// </summary>
/// <param name="decl">The declaration token being filled.</param>
/// <param name="quote">The quote character that opened the value.</param>
/// <returns>The completed declaration token.</returns>
private XmlToken DeclarationEncodingValue(XmlDeclarationToken decl, char quote)
{
    char current = GetNext();

    while (current != quote) {
        bool isAllowed = current.IsAlphanumericAscii() || current == '.' || current == '_' || current == '-';

        if (!isAllowed) {
            throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
        }

        base.StringBuffer.Append(current);
        current = GetNext();
    }

    decl.Encoding = FlushBuffer();
    current = GetNext();

    if (!current.IsSpaceCharacter()) {
        return DeclarationEnd(current, decl);
    }

    return DeclarationAfterEncoding(decl);
}
/// <summary>
/// After the encoding value: continues with the standalone attribute or the
/// end of the declaration.
/// </summary>
/// <param name="decl">The declaration token being filled.</param>
/// <returns>The completed declaration token.</returns>
private XmlToken DeclarationAfterEncoding(XmlDeclarationToken decl)
{
    char current = SkipSpaces();

    if (!ContinuesWithSensitive(AttributeNames.Standalone)) {
        return DeclarationEnd(current, decl);
    }

    Advance(9);
    return DeclarationStandaloneAfterName(decl);
}
/// <summary>
/// Expects '=' (after optional whitespace) following the standalone attribute name.
/// </summary>
/// <param name="decl">The declaration token being filled.</param>
/// <returns>The completed declaration token.</returns>
private XmlToken DeclarationStandaloneAfterName(XmlDeclarationToken decl)
{
    char current = SkipSpaces();

    if (current == '=') {
        return DeclarationStandaloneBeforeValue(decl);
    }

    throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
}
/// <summary>
/// Expects the opening quote (single or double) of the standalone value.
/// </summary>
/// <param name="decl">The declaration token being filled.</param>
/// <returns>The completed declaration token.</returns>
private XmlToken DeclarationStandaloneBeforeValue(XmlDeclarationToken decl)
{
    char quote = SkipSpaces();
    bool isQuote = quote == '"' || quote == '\'';

    if (isQuote) {
        return DeclarationStandaloneValue(decl, quote);
    }

    throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
}
/// <summary>
/// Reads the quoted standalone value; it must match Keywords.Yes or Keywords.No.
/// </summary>
/// <param name="decl">The declaration token being filled.</param>
/// <param name="quote">The quote character that opened the value.</param>
/// <returns>The completed declaration token.</returns>
private XmlToken DeclarationStandaloneValue(XmlDeclarationToken decl, char quote)
{
    // End-of-input sentinel. The literal was lost in this copy of the file (an
    // empty '' was left behind); restored as U+FFFF.
    // NOTE(review): confirm this matches what BaseTokenizer.GetNext() returns at EOF.
    const char EndOfFile = char.MaxValue;

    for (char next = GetNext(); next != quote; next = GetNext()) {
        if (next == EndOfFile) {
            throw XmlParseError.EOF.At(GetCurrentPosition());
        }

        base.StringBuffer.Append(next);
    }

    string value = FlushBuffer();

    if (value.Is(Keywords.Yes)) {
        decl.Standalone = true;
    } else if (value.Is(Keywords.No)) {
        decl.Standalone = false;
    } else {
        throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
    }

    return DeclarationEnd(GetNext(), decl);
}
/// <summary>
/// Skips whitespace and expects the closing "?&gt;" of the XML declaration.
/// </summary>
/// <param name="c">The current character.</param>
/// <param name="decl">The declaration token to return.</param>
/// <returns>The finished declaration token.</returns>
private XmlDeclarationToken DeclarationEnd(char c, XmlDeclarationToken decl)
{
    char current = c;

    while (current.IsSpaceCharacter()) {
        current = GetNext();
    }

    // The second read only happens when the first character is '?'.
    if (current == '?' && GetNext() == '>') {
        return decl;
    }

    throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
}
/// <summary>
/// Entered after the DOCTYPE keyword; a whitespace character must follow.
/// </summary>
/// <returns>The doctype token.</returns>
private XmlToken Doctype()
{
    char next = GetNext();

    if (!next.IsSpaceCharacter()) {
        throw XmlParseError.DoctypeInvalid.At(GetCurrentPosition());
    }

    return DoctypeNameBefore();
}
/// <summary>
/// Skips whitespace and expects the first character of the doctype name.
/// </summary>
/// <returns>The doctype token.</returns>
private XmlToken DoctypeNameBefore()
{
    char first = SkipSpaces();

    if (!first.IsXmlNameStart()) {
        throw XmlParseError.DoctypeInvalid.At(GetCurrentPosition());
    }

    base.StringBuffer.Append(first);
    return DoctypeName(NewDoctype());
}
/// <summary>
/// Reads the remainder of the doctype name and stores it on the token.
/// </summary>
/// <param name="doctype">The doctype token being filled.</param>
/// <returns>The doctype token.</returns>
private XmlToken DoctypeName(XmlDoctypeToken doctype)
{
    char current = GetNext();

    for (; current.IsXmlName(); current = GetNext()) {
        base.StringBuffer.Append(current);
    }

    doctype.Name = FlushBuffer();

    if (current == '>') {
        return doctype;
    } else if (current.IsSpaceCharacter()) {
        return DoctypeNameAfter(doctype);
    }

    throw XmlParseError.DoctypeInvalid.At(GetCurrentPosition());
}
/// <summary>
/// After the doctype name: the declaration ends, or continues with a PUBLIC
/// identifier, a SYSTEM identifier, or a '['.
/// </summary>
/// <param name="doctype">The doctype token being filled.</param>
/// <returns>The doctype token.</returns>
private XmlToken DoctypeNameAfter(XmlDoctypeToken doctype)
{
    char current = SkipSpaces();

    if (current == '>') {
        return doctype;
    } else if (ContinuesWithSensitive(Keywords.Public)) {
        Advance(5);
        return DoctypePublic(doctype);
    } else if (ContinuesWithSensitive(Keywords.System)) {
        Advance(5);
        return DoctypeSystem(doctype);
    } else if (current == '[') {
        Advance();
        char following = GetNext();
        return DoctypeAfter(following, doctype);
    }

    throw XmlParseError.DoctypeInvalid.At(GetCurrentPosition());
}
/// <summary>
/// After the PUBLIC keyword: requires whitespace and then the opening quote
/// of the public identifier.
/// </summary>
/// <param name="doctype">The doctype token being filled.</param>
/// <returns>The doctype token.</returns>
private XmlToken DoctypePublic(XmlDoctypeToken doctype)
{
    char separator = GetNext();

    if (!separator.IsSpaceCharacter()) {
        throw XmlParseError.DoctypeInvalid.At(GetCurrentPosition());
    }

    char quote = SkipSpaces();

    if (quote != '"' && quote != '\'') {
        throw XmlParseError.DoctypeInvalid.At(GetCurrentPosition());
    }

    doctype.PublicIdentifier = string.Empty;
    return DoctypePublicIdentifierValue(doctype, quote);
}
/// <summary>
/// Reads the public identifier up to the closing quote; every character must
/// satisfy IsPubidChar.
/// </summary>
/// <param name="doctype">The doctype token being filled.</param>
/// <param name="quote">The quote character that opened the value.</param>
/// <returns>The doctype token.</returns>
private XmlToken DoctypePublicIdentifierValue(XmlDoctypeToken doctype, char quote)
{
    char current = GetNext();

    while (current != quote) {
        if (!current.IsPubidChar()) {
            throw XmlParseError.XmlInvalidPubId.At(GetCurrentPosition());
        }

        base.StringBuffer.Append(current);
        current = GetNext();
    }

    doctype.PublicIdentifier = FlushBuffer();
    return DoctypePublicIdentifierAfter(doctype);
}
/// <summary>
/// After the public identifier: the declaration ends with '&gt;' or whitespace
/// leads on to an optional system identifier.
/// </summary>
/// <param name="doctype">The doctype token being filled.</param>
/// <returns>The doctype token.</returns>
private XmlToken DoctypePublicIdentifierAfter(XmlDoctypeToken doctype)
{
    char current = GetNext();

    if (current == '>') {
        return doctype;
    } else if (current.IsSpaceCharacter()) {
        return DoctypeBetween(doctype);
    }

    throw XmlParseError.DoctypeInvalid.At(GetCurrentPosition());
}
/// <summary>
/// Between the public and system identifiers: accepts '&gt;' or the opening
/// quote of the system identifier.
/// </summary>
/// <param name="doctype">The doctype token being filled.</param>
/// <returns>The doctype token.</returns>
private XmlToken DoctypeBetween(XmlDoctypeToken doctype)
{
    char current = SkipSpaces();

    if (current == '>') {
        return doctype;
    }

    if (current == '"' || current == '\'') {
        doctype.SystemIdentifier = string.Empty;
        return DoctypeSystemIdentifierValue(doctype, current);
    }

    throw XmlParseError.DoctypeInvalid.At(GetCurrentPosition());
}
/// <summary>
/// After the SYSTEM keyword: requires whitespace and then the opening quote
/// of the system identifier.
/// </summary>
/// <param name="doctype">The doctype token being filled.</param>
/// <returns>The doctype token.</returns>
private XmlToken DoctypeSystem(XmlDoctypeToken doctype)
{
    char separator = GetNext();

    if (!separator.IsSpaceCharacter()) {
        throw XmlParseError.DoctypeInvalid.At(GetCurrentPosition());
    }

    char quote = SkipSpaces();

    if (quote != '"' && quote != '\'') {
        throw XmlParseError.DoctypeInvalid.At(GetCurrentPosition());
    }

    doctype.SystemIdentifier = string.Empty;
    return DoctypeSystemIdentifierValue(doctype, quote);
}
/// <summary>
/// Reads the system identifier up to the closing quote.
/// </summary>
/// <param name="doctype">The doctype token being filled.</param>
/// <param name="quote">The quote character that opened the value.</param>
/// <returns>The doctype token.</returns>
private XmlToken DoctypeSystemIdentifierValue(XmlDoctypeToken doctype, char quote)
{
    // End-of-input sentinel. The literal was lost in this copy of the file (an
    // empty '' was left behind); restored as U+FFFF.
    // NOTE(review): confirm this matches what BaseTokenizer.GetNext() returns at EOF.
    const char EndOfFile = char.MaxValue;

    for (char next = GetNext(); next != quote; next = GetNext()) {
        if (next == EndOfFile) {
            throw XmlParseError.EOF.At(GetCurrentPosition());
        }

        base.StringBuffer.Append(next);
    }

    doctype.SystemIdentifier = FlushBuffer();
    return DoctypeSystemIdentifierAfter(doctype);
}
/// <summary>
/// After the system identifier: skips whitespace and, if a '[' follows,
/// advances past it before finishing the doctype.
/// </summary>
/// <param name="doctype">The doctype token being filled.</param>
/// <returns>The doctype token.</returns>
private XmlToken DoctypeSystemIdentifierAfter(XmlDoctypeToken doctype)
{
    char current = SkipSpaces();

    if (current != '[') {
        return DoctypeAfter(current, doctype);
    }

    Advance();
    char following = GetNext();
    return DoctypeAfter(following, doctype);
}
/// <summary>
/// Skips whitespace and expects the closing '&gt;' of the doctype.
/// </summary>
/// <param name="c">The current character.</param>
/// <param name="doctype">The doctype token to return.</param>
/// <returns>The finished doctype token.</returns>
private XmlToken DoctypeAfter(char c, XmlDoctypeToken doctype)
{
    char current = c;

    while (current.IsSpaceCharacter()) {
        current = GetNext();
    }

    if (current != '>') {
        throw XmlParseError.DoctypeInvalid.At(GetCurrentPosition());
    }

    return doctype;
}
/// <summary>
/// Skips whitespace inside a start tag and expects an attribute name, '/' or '&gt;'.
/// </summary>
/// <param name="tag">The tag token being filled.</param>
/// <returns>The completed tag token.</returns>
private XmlToken AttributeBeforeName(XmlTagToken tag)
{
    // End-of-input sentinel. The literal was lost in this copy of the file (an
    // empty '' was left behind); restored as U+FFFF.
    // NOTE(review): confirm this matches what BaseTokenizer.GetNext() returns at EOF.
    const char EndOfFile = char.MaxValue;

    char c = SkipSpaces();

    switch (c) {
        case '/':
            return TagSelfClosing(tag);
        case '>':
            return tag;
        case EndOfFile:
            throw XmlParseError.EOF.At(GetCurrentPosition());
        default:
            if (c.IsXmlNameStart()) {
                base.StringBuffer.Append(c);
                return AttributeName(tag);
            }

            throw XmlParseError.XmlInvalidAttribute.At(GetCurrentPosition());
    }
}
/// <summary>
/// Reads the remainder of an attribute name, registers the attribute on the
/// tag and expects '=' after optional whitespace.
/// </summary>
/// <param name="tag">The tag token being filled.</param>
/// <returns>The completed tag token.</returns>
private XmlToken AttributeName(XmlTagToken tag)
{
    char current = GetNext();

    for (; current.IsXmlName(); current = GetNext()) {
        base.StringBuffer.Append(current);
    }

    string attributeName = FlushBuffer();
    string existingValue = tag.GetAttribute(attributeName);

    // NOTE(review): only a non-empty existing value counts as a duplicate here, so a
    // repeated name whose earlier value was empty would presumably go unreported —
    // confirm against XmlTagToken.GetAttribute.
    if (!string.IsNullOrEmpty(existingValue)) {
        throw XmlParseError.XmlUniqueAttribute.At(GetCurrentPosition());
    }

    tag.AddAttribute(attributeName);

    while (current.IsSpaceCharacter()) {
        current = GetNext();
    }

    if (current == '=') {
        return AttributeBeforeValue(tag);
    }

    throw XmlParseError.XmlInvalidAttribute.At(GetCurrentPosition());
}
/// <summary>
/// Expects the opening quote (single or double) of an attribute value.
/// </summary>
/// <param name="tag">The tag token being filled.</param>
/// <returns>The completed tag token.</returns>
private XmlToken AttributeBeforeValue(XmlTagToken tag)
{
    char quote = SkipSpaces();
    bool isQuote = quote == '"' || quote == '\'';

    if (isQuote) {
        return AttributeValue(tag, quote);
    }

    throw XmlParseError.XmlInvalidAttribute.At(GetCurrentPosition());
}
/// <summary>
/// Reads an attribute value up to the closing quote. References are replaced
/// via CharacterReference(); a literal '&lt;' is rejected.
/// </summary>
/// <param name="tag">The tag token being filled.</param>
/// <param name="quote">The quote character that opened the value.</param>
/// <returns>The completed tag token.</returns>
private XmlToken AttributeValue(XmlTagToken tag, char quote)
{
    // End-of-input sentinel. The literal was lost in this copy of the file (an
    // empty '' was left behind); restored as U+FFFF.
    // NOTE(review): confirm this matches what BaseTokenizer.GetNext() returns at EOF.
    const char EndOfFile = char.MaxValue;

    for (char next = GetNext(); next != quote; next = GetNext()) {
        switch (next) {
            case EndOfFile:
                throw XmlParseError.EOF.At(GetCurrentPosition());
            case '<':
                throw XmlParseError.XmlLtInAttributeValue.At(GetCurrentPosition());
            case '&':
                base.StringBuffer.Append(CharacterReference());
                break;
            default:
                base.StringBuffer.Append(next);
                break;
        }
    }

    tag.SetAttributeValue(FlushBuffer());
    return AttributeAfterValue(tag);
}
/// <summary>
/// After an attribute value: whitespace leads to the next attribute, '/'
/// to a self-closing tag and '&gt;' ends the tag.
/// </summary>
/// <param name="tag">The tag token being filled.</param>
/// <returns>The completed tag token.</returns>
private XmlToken AttributeAfterValue(XmlTagToken tag)
{
    char current = GetNext();

    if (current.IsSpaceCharacter()) {
        return AttributeBeforeName(tag);
    }

    if (current == '/') {
        return TagSelfClosing(tag);
    }

    if (current == '>') {
        return tag;
    }

    throw XmlParseError.XmlInvalidAttribute.At(GetCurrentPosition());
}
/// <summary>
/// Starts a processing instruction; the target must begin with a name start character.
/// </summary>
/// <param name="c">The first character of the target.</param>
/// <returns>The processing instruction token.</returns>
private XmlToken ProcessingStart(char c)
{
    if (!c.IsXmlNameStart()) {
        throw XmlParseError.XmlInvalidPI.At(GetCurrentPosition());
    }

    base.StringBuffer.Append(c);
    char following = GetNext();
    return ProcessingTarget(following, NewProcessing());
}
/// <summary>
/// Reads the remainder of the processing instruction target. A target equal
/// to "xml" (compared with Isi) is rejected. Afterwards either "?&gt;" closes the
/// instruction or whitespace introduces its content.
/// </summary>
/// <param name="c">The current character.</param>
/// <param name="pi">The processing instruction token being filled.</param>
/// <returns>The processing instruction token.</returns>
private XmlToken ProcessingTarget(char c, XmlPIToken pi)
{
    char current = c;

    for (; current.IsXmlName(); current = GetNext()) {
        base.StringBuffer.Append(current);
    }

    pi.Target = FlushBuffer();

    if (pi.Target.Isi(TagNames.Xml)) {
        throw XmlParseError.XmlInvalidPI.At(GetCurrentPosition());
    }

    if (current == '?') {
        // The read only happens on this branch, exactly as in the nested original.
        if (GetNext() == '>') {
            return pi;
        }
    } else if (current.IsSpaceCharacter()) {
        return ProcessingContent(pi);
    }

    throw XmlParseError.XmlInvalidPI.At(GetCurrentPosition());
}
/// <summary>
/// Reads the content of a processing instruction up to the closing "?&gt;".
/// </summary>
/// <param name="pi">The processing instruction token being filled.</param>
/// <returns>The processing instruction token.</returns>
private XmlToken ProcessingContent(XmlPIToken pi)
{
    // End-of-input sentinel. The literal was lost in this copy of the file (an
    // empty '' was left behind); restored as U+FFFF.
    // NOTE(review): confirm this matches what BaseTokenizer.GetNext() returns at EOF.
    const char EndOfFile = char.MaxValue;

    char next = GetNext();

    while (true) {
        switch (next) {
            case '?':
                next = GetNext();

                if (next == '>') {
                    pi.Content = FlushBuffer();
                    return pi;
                }

                // A lone '?' belongs to the content; the character after it is
                // examined by the next loop iteration.
                base.StringBuffer.Append('?');
                break;
            case EndOfFile:
                throw XmlParseError.EOF.At(GetCurrentPosition());
            default:
                base.StringBuffer.Append(next);
                next = GetNext();
                break;
        }
    }
}
/// <summary>
/// Entered after "&lt;!--"; reads the first comment character and continues
/// in the comment state.
/// </summary>
/// <returns>The comment token.</returns>
private XmlToken CommentStart()
{
    char first = GetNext();
    return Comment(first);
}
/// <summary>
/// Buffers comment characters until a '-' is seen; a character that fails
/// IsXmlChar makes the comment invalid.
/// </summary>
/// <param name="c">The current character.</param>
/// <returns>The comment token.</returns>
private XmlToken Comment(char c)
{
    for (char current = c; current.IsXmlChar(); current = GetNext()) {
        if (current == '-') {
            return CommentDash();
        }

        base.StringBuffer.Append(current);
    }

    throw XmlParseError.XmlInvalidComment.At(GetCurrentPosition());
}
/// <summary>
/// Entered after a single '-' inside a comment. A second '-' begins the
/// comment terminator; anything else means the dash was ordinary content.
/// </summary>
/// <returns>The comment token.</returns>
private XmlToken CommentDash()
{
    char next = GetNext();

    if (next == '-') {
        return CommentEnd();
    }

    // The '-' that led here was consumed by Comment() without being buffered.
    // It is part of the comment text, so store it before resuming.
    base.StringBuffer.Append('-');
    return Comment(next);
}
/// <summary>
/// Entered after "--" inside a comment; only '&gt;' may follow.
/// </summary>
/// <returns>The comment token built from the buffered text.</returns>
private XmlToken CommentEnd()
{
    char next = GetNext();

    if (next != '>') {
        throw XmlParseError.XmlInvalidComment.At(GetCurrentPosition());
    }

    return NewComment();
}
/// <summary>
/// Creates an end-of-file token at the current position.
/// </summary>
private XmlEndOfFileToken NewEof()
{
    TextPosition position = GetCurrentPosition();
    return new XmlEndOfFileToken(position);
}
/// <summary>
/// Creates a character token from the buffered text, flushing the buffer.
/// </summary>
private XmlCharacterToken NewCharacters()
{
    string text = FlushBuffer();
    XmlCharacterToken token = new XmlCharacterToken(_position, text);
    return token;
}
/// <summary>
/// Creates a comment token from the buffered text, flushing the buffer.
/// </summary>
private XmlCommentToken NewComment()
{
    string text = FlushBuffer();
    XmlCommentToken token = new XmlCommentToken(_position, text);
    return token;
}
/// <summary>
/// Creates an empty processing instruction token at the token start position.
/// </summary>
private XmlPIToken NewProcessing()
{
    XmlPIToken token = new XmlPIToken(_position);
    return token;
}
/// <summary>
/// Creates an empty doctype token at the token start position.
/// </summary>
private XmlDoctypeToken NewDoctype()
{
    XmlDoctypeToken token = new XmlDoctypeToken(_position);
    return token;
}
/// <summary>
/// Creates an empty XML declaration token at the token start position.
/// </summary>
private XmlDeclarationToken NewDeclaration()
{
    XmlDeclarationToken token = new XmlDeclarationToken(_position);
    return token;
}
/// <summary>
/// Creates a start tag token at the token start position.
/// </summary>
private XmlTagToken NewOpenTag()
{
    XmlTagToken token = new XmlTagToken(XmlTokenType.StartTag, _position);
    return token;
}
/// <summary>
/// Creates an end tag token at the token start position.
/// </summary>
private XmlTagToken NewCloseTag()
{
    XmlTagToken token = new XmlTagToken(XmlTokenType.EndTag, _position);
    return token;
}
/// <summary>
/// Creates a CDATA token from the buffered text, flushing the buffer.
/// </summary>
private XmlCDataToken NewCharacterData()
{
    string text = FlushBuffer();
    XmlCDataToken token = new XmlCDataToken(_position, text);
    return token;
}
}
}