// XmlTokenizer
// Performs the tokenization of the source code. Most of
// the information is taken from http://www.w3.org/TR/REC-xml/.
using AngleSharp.Events;
using AngleSharp.Extensions;
using AngleSharp.Html;
using System;
using System.Diagnostics;
using System.Text;
namespace AngleSharp.Parser.Xml
{
[DebuggerStepThrough]
internal sealed class XmlTokenizer : BaseTokenizer
{
// Text matched by MarkupDeclaration to recognise the start of a CDATA section.
private static readonly string CDataOpening = "[CDATA[";
// Keyword matched by DoctypeNameAfter before a public identifier is read.
private static readonly string PublicIdentifier = "PUBLIC";
// Keyword matched by DoctypeNameAfter before a system identifier is read.
private static readonly string SystemIdentifier = "SYSTEM";
// The two values accepted by DeclarationStandaloneValue; anything else is rejected.
private static readonly string YesIdentifier = "yes";
private static readonly string NoIdentifier = "no";
// Position recorded by Data() when a token starts; handed to the token
// constructors in the New* factory methods (NewEof uses the current position instead).
private TextPosition _position;
/// <summary>
/// Creates a new XML tokenizer. Both arguments are forwarded unchanged
/// to the base tokenizer; no additional state is initialised here.
/// </summary>
/// <param name="source">The text source passed on to the base class.</param>
/// <param name="events">The event aggregator passed on to the base class.</param>
public XmlTokenizer(TextSource source, IEventAggregator events) : base(source, events)
{
}
/// <summary>
/// Reads the next character and produces the token that begins there.
/// </summary>
/// <returns>
/// An end-of-file token when the sentinel character is read, otherwise
/// the token produced by the data state.
/// </returns>
public XmlToken Get()
{
    var current = GetNext();

    // NOTE(review): the literal compared here is the sentinel this class
    // treats as end of input — confirm its exact value in the original file.
    return current == '' ? NewEof() : Data(current);
}
/// <summary>
/// The data state: remembers where the token starts and dispatches on
/// the given character.
/// </summary>
/// <param name="c">The character that starts the token.</param>
/// <returns>The token produced by the state selected for <paramref name="c"/>.</returns>
private XmlToken Data(char c)
{
    // Every New* factory except NewEof stamps tokens with this position.
    _position = GetCurrentPosition();

    if (c == '&')
    {
        return CharacterReference(GetNext());
    }

    if (c == '<')
    {
        return TagOpen(GetNext());
    }

    if (c == '')
    {
        return NewEof();
    }

    if (c == ']')
    {
        // A closing bracket needs look-ahead before it may be emitted.
        return CheckCharacter(GetNext());
    }

    return NewCharacter(c);
}
/// <summary>
/// Called after a ']' has been read in the data state. Looks ahead for
/// the sequence "]]&gt;", which raises XmlInvalidCharData; otherwise all
/// look-ahead reads are undone and a ']' character token is returned.
/// </summary>
/// <param name="ch">The character following the first ']'.</param>
/// <returns>A character token for ']'.</returns>
private XmlToken CheckCharacter(char ch)
{
    var isSecondBracket = ch == ']';

    if (isSecondBracket)
    {
        var afterBrackets = GetNext();

        if (afterBrackets == '>')
        {
            throw XmlParseError.XmlInvalidCharData.At(GetCurrentPosition());
        }

        // Undo the read of the character after the second bracket.
        Back();
    }

    // Undo the read of the character that was passed in as ch.
    Back();
    return NewCharacter(']');
}
/// <summary>
/// Collects the content of a CDATA section into the string buffer until
/// "]]&gt;" is found.
/// </summary>
/// <param name="c">The first character of the section content.</param>
/// <returns>A CDATA token carrying the collected text.</returns>
/// <remarks>Raises the EOF parse error when the sentinel character is read first.</remarks>
private XmlCDataToken CData(char c)
{
    _stringBuffer.Clear();

    for (; ; c = GetNext())
    {
        if (c == '')
        {
            throw XmlParseError.EOF.At(GetCurrentPosition());
        }

        if (c == ']' && ContinuesWith("]]>", true))
        {
            // presumably moves onto the final '>' of the terminator — confirm against BaseTokenizer
            Advance(2);
            return NewCData(_stringBuffer.ToString());
        }

        // Any other character, including a ']' that does not start the terminator, is content.
        _stringBuffer.Append(c);
    }
}
/// <summary>
/// Reads a reference that follows an ampersand: a decimal reference
/// ("#" digits), a hexadecimal reference ("#x" or "#X" plus hex digits)
/// or a named reference. Each form must end with ';'.
/// </summary>
/// <param name="c">The character following the ampersand.</param>
/// <returns>An entity token holding the collected digits or name.</returns>
/// <remarks>
/// Raises CharacterReferenceNotTerminated when no valid, ';'-terminated
/// reference is found.
/// </remarks>
private XmlEntityToken CharacterReference(char c)
{
    var buffer = Pool.NewStringBuilder();

    if (c == '#')
    {
        c = GetNext();
        var isHex = c == 'x' || c == 'X';

        if (isHex)
        {
            // Skip the x marker, then gather hexadecimal digits.
            for (c = GetNext(); c.IsHex(); c = GetNext())
            {
                buffer.Append(c);
            }
        }
        else
        {
            for (; c.IsDigit(); c = GetNext())
            {
                buffer.Append(c);
            }
        }

        // At least one digit is required before the terminator.
        if (buffer.Length > 0 && c == ';')
        {
            return NewEntity(buffer.ToPool(), true, isHex);
        }
    }
    else if (c.IsXmlNameStart())
    {
        buffer.Append(c);

        for (c = GetNext(); c.IsXmlName(); c = GetNext())
        {
            buffer.Append(c);
        }

        if (c == ';')
        {
            return NewEntity(buffer.ToPool(), false, false);
        }
    }

    // The result is discarded; presumably this hands the builder back to the pool — confirm.
    buffer.ToPool();
    throw XmlParseError.CharacterReferenceNotTerminated.At(GetCurrentPosition());
}
/// <summary>
/// Dispatches on the character that follows a less-than sign: markup
/// declaration ('!'), declaration or processing instruction ('?'),
/// end tag ('/') or start tag (a name start character).
/// </summary>
/// <param name="c">The character following the less-than sign.</param>
/// <returns>The token produced by the selected state.</returns>
/// <remarks>Raises XmlInvalidStartTag for any other character.</remarks>
private XmlToken TagOpen(char c)
{
    if (c == '!')
    {
        return MarkupDeclaration(GetNext());
    }

    if (c == '?')
    {
        c = GetNext();

        if (!ContinuesWith(Tags.Xml, false))
        {
            return ProcessingStart(c);
        }

        // presumably skips the rest of the matched keyword — confirm against BaseTokenizer
        Advance(2);
        return DeclarationStart(GetNext());
    }

    if (c == '/')
    {
        return TagEnd(GetNext());
    }

    if (!c.IsXmlNameStart())
    {
        throw XmlParseError.XmlInvalidStartTag.At(GetCurrentPosition());
    }

    // Seed the buffer with the first name character; TagName reads the rest.
    _stringBuffer.Clear();
    _stringBuffer.Append(c);
    return TagName(GetNext(), NewOpenTag());
}
/// <summary>
/// Reads an end tag: a name, optional trailing space characters and the
/// closing greater-than sign.
/// </summary>
/// <param name="c">The character following "&lt;/".</param>
/// <returns>An end-tag token carrying the collected name.</returns>
/// <remarks>
/// Raises the EOF error when the offending character is the sentinel,
/// otherwise XmlInvalidEndTag.
/// </remarks>
private XmlToken TagEnd(char c)
{
    if (c.IsXmlNameStart())
    {
        _stringBuffer.Clear();
        _stringBuffer.Append(c);
        c = GetNext();

        while (c.IsXmlName())
        {
            _stringBuffer.Append(c);
            c = GetNext();
        }

        // Space characters are permitted between the name and '>'.
        while (c.IsSpaceCharacter())
        {
            c = GetNext();
        }

        if (c == '>')
        {
            var closing = NewCloseTag();
            closing.Name = _stringBuffer.ToString();
            return closing;
        }
    }

    if (c == '')
    {
        throw XmlParseError.EOF.At(GetCurrentPosition());
    }

    throw XmlParseError.XmlInvalidEndTag.At(GetCurrentPosition());
}
/// <summary>
/// Finishes reading a start-tag name into the string buffer, stores it
/// on the tag and decides what follows.
/// </summary>
/// <param name="c">The character after the first name character.</param>
/// <param name="tag">The start tag being built.</param>
/// <returns>The completed tag, or the result of the attribute / self-closing states.</returns>
/// <remarks>Raises EOF on the sentinel character and XmlInvalidName on any other unexpected character.</remarks>
private XmlToken TagName(char c, XmlTagToken tag)
{
    for (; c.IsXmlName(); c = GetNext())
    {
        _stringBuffer.Append(c);
    }

    tag.Name = _stringBuffer.ToString();

    if (c == '')
    {
        throw XmlParseError.EOF.At(GetCurrentPosition());
    }

    if (c == '>')
    {
        return tag;
    }

    if (c.IsSpaceCharacter())
    {
        return AttributeBeforeName(GetNext(), tag);
    }

    if (c == '/')
    {
        return TagSelfClosing(GetNext(), tag);
    }

    throw XmlParseError.XmlInvalidName.At(GetCurrentPosition());
}
/// <summary>
/// Handles the character after the '/' of a self-closing tag. The tag is
/// flagged as self-closing before the character is validated.
/// </summary>
/// <param name="c">The character following the '/'.</param>
/// <param name="tag">The tag being built.</param>
/// <returns>The tag when <paramref name="c"/> is '&gt;'.</returns>
/// <remarks>Raises EOF on the sentinel character, XmlInvalidName otherwise.</remarks>
private XmlToken TagSelfClosing(char c, XmlTagToken tag)
{
    tag.IsSelfClosing = true;

    if (c == '>')
    {
        return tag;
    }

    if (c == '')
    {
        throw XmlParseError.EOF.At(GetCurrentPosition());
    }

    throw XmlParseError.XmlInvalidName.At(GetCurrentPosition());
}
/// <summary>
/// Decides which construct follows "&lt;!": a comment, a doctype or a
/// CDATA section. The decision is made with ContinuesWith; the
/// parameter itself is not inspected.
/// </summary>
/// <param name="c">The character following "&lt;!" (unused).</param>
/// <returns>The token produced by the matching state.</returns>
/// <remarks>Raises UndefinedMarkupDeclaration when none of the three openers matches.</remarks>
private XmlToken MarkupDeclaration(char c)
{
    // The Advance counts are taken over unchanged; presumably each skips
    // the rest of the matched opener — confirm against BaseTokenizer.
    if (ContinuesWith("--", true))
    {
        Advance();
        return CommentStart(GetNext());
    }
    else if (ContinuesWith(Tags.Doctype, false))
    {
        Advance(6);
        return Doctype(GetNext());
    }
    else if (ContinuesWith(CDataOpening, false))
    {
        Advance(6);
        return CData(GetNext());
    }

    throw XmlParseError.UndefinedMarkupDeclaration.At(GetCurrentPosition());
}
/// <summary>
/// Entered after "&lt;?" followed by the xml keyword. Without a following
/// space character the input is treated as a processing instruction
/// whose target begins with that keyword; otherwise the version
/// attribute must follow.
/// </summary>
/// <param name="c">The character after the xml keyword.</param>
/// <returns>A processing-instruction token or the declaration token.</returns>
/// <remarks>Raises XmlDeclarationInvalid when the version attribute is missing.</remarks>
private XmlToken DeclarationStart(char c)
{
    if (!c.IsSpaceCharacter())
    {
        // The keyword already consumed becomes the start of the target name.
        _stringBuffer.Clear();
        _stringBuffer.Append(Tags.Xml);
        return ProcessingTarget(c, NewProcessing());
    }

    // c is known to be a space character here, so at least one read happens.
    c = GetNext();

    while (c.IsSpaceCharacter())
    {
        c = GetNext();
    }

    if (!ContinuesWith(AttributeNames.Version, false))
    {
        throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
    }

    Advance(6);
    return DeclarationVersionAfterName(GetNext(), NewDeclaration());
}
/// <summary>
/// Skips space characters after the version attribute name and expects '='.
/// </summary>
/// <param name="c">The character after the attribute name.</param>
/// <param name="decl">The declaration being built.</param>
/// <returns>The result of the version value states.</returns>
/// <remarks>Raises XmlDeclarationInvalid when '=' is missing.</remarks>
private XmlToken DeclarationVersionAfterName(char c, XmlDeclarationToken decl)
{
    for (; c.IsSpaceCharacter(); c = GetNext())
    {
    }

    if (c != '=')
    {
        throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
    }

    return DeclarationVersionBeforeValue(GetNext(), decl);
}
/// <summary>
/// Skips space characters after '=' and expects the opening quote of the
/// version value (single or double).
/// </summary>
/// <param name="c">The character after '='.</param>
/// <param name="decl">The declaration being built.</param>
/// <returns>The result of reading the version value.</returns>
/// <remarks>Raises XmlDeclarationInvalid when no quote is found.</remarks>
private XmlToken DeclarationVersionBeforeValue(char c, XmlDeclarationToken decl)
{
    for (; c.IsSpaceCharacter(); c = GetNext())
    {
    }

    var isQuote = c == '"' || c == '\'';

    if (!isQuote)
    {
        throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
    }

    _stringBuffer.Clear();
    // The opening quote is passed along so the value ends at the same kind of quote.
    return DeclarationVersionValue(GetNext(), c, decl);
}
/// <summary>
/// Collects the version value up to the closing quote and stores it on
/// the declaration.
/// </summary>
/// <param name="c">The first character after the opening quote.</param>
/// <param name="q">The quote character that terminates the value.</param>
/// <param name="decl">The declaration being built.</param>
/// <returns>The result of the state that follows the version attribute.</returns>
/// <remarks>Raises EOF when the sentinel character appears inside the value.</remarks>
private XmlToken DeclarationVersionValue(char c, char q, XmlDeclarationToken decl)
{
    for (; c != q; c = GetNext())
    {
        if (c == '')
        {
            throw XmlParseError.EOF.At(GetCurrentPosition());
        }

        _stringBuffer.Append(c);
    }

    decl.Version = _stringBuffer.ToString();
    c = GetNext();

    // A space character allows further attributes; anything else must end the declaration.
    return c.IsSpaceCharacter()
        ? DeclarationAfterVersion(c, decl)
        : DeclarationEnd(c, decl);
}
/// <summary>
/// After the version attribute: skips space characters, then looks for
/// the encoding attribute, then the standalone attribute, and otherwise
/// expects the end of the declaration.
/// </summary>
/// <param name="c">The character after the version value.</param>
/// <param name="decl">The declaration being built.</param>
/// <returns>The result of the selected follow-up state.</returns>
private XmlToken DeclarationAfterVersion(char c, XmlDeclarationToken decl)
{
    for (; c.IsSpaceCharacter(); c = GetNext())
    {
    }

    if (ContinuesWith(AttributeNames.Encoding, false))
    {
        Advance(7);
        return DeclarationEncodingAfterName(GetNext(), decl);
    }
    else if (ContinuesWith(AttributeNames.Standalone, false))
    {
        Advance(9);
        return DeclarationStandaloneAfterName(GetNext(), decl);
    }

    return DeclarationEnd(c, decl);
}
/// <summary>
/// Skips space characters after the encoding attribute name and expects '='.
/// </summary>
/// <param name="c">The character after the attribute name.</param>
/// <param name="decl">The declaration being built.</param>
/// <returns>The result of the encoding value states.</returns>
/// <remarks>Raises XmlDeclarationInvalid when '=' is missing.</remarks>
private XmlToken DeclarationEncodingAfterName(char c, XmlDeclarationToken decl)
{
    for (; c.IsSpaceCharacter(); c = GetNext())
    {
    }

    if (c != '=')
    {
        throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
    }

    return DeclarationEncodingBeforeValue(GetNext(), decl);
}
/// <summary>
/// Skips space characters after '=' and expects an opening quote
/// followed by a letter, which starts the encoding name.
/// </summary>
/// <param name="c">The character after '='.</param>
/// <param name="decl">The declaration being built.</param>
/// <returns>The result of reading the encoding value.</returns>
/// <remarks>Raises XmlDeclarationInvalid when the quote or the leading letter is missing.</remarks>
private XmlToken DeclarationEncodingBeforeValue(char c, XmlDeclarationToken decl)
{
    for (; c.IsSpaceCharacter(); c = GetNext())
    {
    }

    if (c != '"' && c != '\'')
    {
        throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
    }

    var quote = c;
    _stringBuffer.Clear();
    var first = GetNext();

    if (!first.IsLetter())
    {
        throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
    }

    return DeclarationEncodingValue(first, quote, decl);
}
/// <summary>
/// Collects the encoding name up to the closing quote. Only ASCII
/// alphanumerics, '.', '_' and '-' are accepted.
/// </summary>
/// <param name="c">The first character of the encoding name.</param>
/// <param name="q">The quote character that terminates the value.</param>
/// <param name="decl">The declaration being built.</param>
/// <returns>The result of the state that follows the encoding attribute.</returns>
/// <remarks>Raises XmlDeclarationInvalid on any other character.</remarks>
private XmlToken DeclarationEncodingValue(char c, char q, XmlDeclarationToken decl)
{
    // The first character is always checked and appended, even if it equals q.
    do
    {
        var allowed = c.IsAlphanumericAscii() || c == '.' || c == '_' || c == '-';

        if (!allowed)
        {
            throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
        }

        _stringBuffer.Append(c);
        c = GetNext();
    }
    while (c != q);

    decl.Encoding = _stringBuffer.ToString();
    c = GetNext();

    return c.IsSpaceCharacter()
        ? DeclarationAfterEncoding(c, decl)
        : DeclarationEnd(c, decl);
}
/// <summary>
/// After the encoding attribute: skips space characters, then either
/// reads the standalone attribute or expects the end of the declaration.
/// </summary>
/// <param name="c">The character after the encoding value.</param>
/// <param name="decl">The declaration being built.</param>
/// <returns>The result of the selected follow-up state.</returns>
private XmlToken DeclarationAfterEncoding(char c, XmlDeclarationToken decl)
{
    for (; c.IsSpaceCharacter(); c = GetNext())
    {
    }

    if (!ContinuesWith(AttributeNames.Standalone, false))
    {
        return DeclarationEnd(c, decl);
    }

    Advance(9);
    return DeclarationStandaloneAfterName(GetNext(), decl);
}
/// <summary>
/// Skips space characters after the standalone attribute name and expects '='.
/// </summary>
/// <param name="c">The character after the attribute name.</param>
/// <param name="decl">The declaration being built.</param>
/// <returns>The result of the standalone value states.</returns>
/// <remarks>Raises XmlDeclarationInvalid when '=' is missing.</remarks>
private XmlToken DeclarationStandaloneAfterName(char c, XmlDeclarationToken decl)
{
    for (; c.IsSpaceCharacter(); c = GetNext())
    {
    }

    if (c != '=')
    {
        throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
    }

    return DeclarationStandaloneBeforeValue(GetNext(), decl);
}
/// <summary>
/// Skips space characters after '=' and expects the opening quote of the
/// standalone value (single or double).
/// </summary>
/// <param name="c">The character after '='.</param>
/// <param name="decl">The declaration being built.</param>
/// <returns>The result of reading the standalone value.</returns>
/// <remarks>Raises XmlDeclarationInvalid when no quote is found.</remarks>
private XmlToken DeclarationStandaloneBeforeValue(char c, XmlDeclarationToken decl)
{
    for (; c.IsSpaceCharacter(); c = GetNext())
    {
    }

    var isQuote = c == '"' || c == '\'';

    if (!isQuote)
    {
        throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
    }

    _stringBuffer.Clear();
    return DeclarationStandaloneValue(GetNext(), c, decl);
}
/// <summary>
/// Collects the standalone value up to the closing quote. Only the
/// exact texts "yes" and "no" are accepted.
/// </summary>
/// <param name="c">The first character after the opening quote.</param>
/// <param name="q">The quote character that terminates the value.</param>
/// <param name="decl">The declaration being built.</param>
/// <returns>The declaration once its end has been read.</returns>
/// <remarks>
/// Raises EOF when the sentinel character appears inside the value and
/// XmlDeclarationInvalid for any value other than "yes" or "no".
/// </remarks>
private XmlToken DeclarationStandaloneValue(char c, char q, XmlDeclarationToken decl)
{
    for (; c != q; c = GetNext())
    {
        if (c == '')
        {
            throw XmlParseError.EOF.At(GetCurrentPosition());
        }

        _stringBuffer.Append(c);
    }

    var value = _stringBuffer.ToString();

    if (value.Equals(YesIdentifier))
    {
        decl.Standalone = true;
    }
    else if (value.Equals(NoIdentifier))
    {
        decl.Standalone = false;
    }
    else
    {
        throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
    }

    return DeclarationEnd(GetNext(), decl);
}
/// <summary>
/// Skips space characters and expects the closing "?&gt;" of the declaration.
/// </summary>
/// <param name="c">The current character.</param>
/// <param name="decl">The declaration being built.</param>
/// <returns>The finished declaration token.</returns>
/// <remarks>Raises XmlDeclarationInvalid when "?&gt;" does not follow.</remarks>
private XmlDeclarationToken DeclarationEnd(char c, XmlDeclarationToken decl)
{
    for (; c.IsSpaceCharacter(); c = GetNext())
    {
    }

    // The next character is only read once a '?' has been seen.
    if (c == '?' && GetNext() == '>')
    {
        return decl;
    }

    throw XmlParseError.XmlDeclarationInvalid.At(GetCurrentPosition());
}
/// <summary>
/// Entered after the doctype keyword; a space character must follow it.
/// </summary>
/// <param name="c">The character after the doctype keyword.</param>
/// <returns>The result of reading the doctype name.</returns>
/// <remarks>Raises DoctypeInvalid when no space character follows.</remarks>
private XmlToken Doctype(char c)
{
    if (!c.IsSpaceCharacter())
    {
        throw XmlParseError.DoctypeInvalid.At(GetCurrentPosition());
    }

    return DoctypeNameBefore(GetNext());
}
/// <summary>
/// Skips space characters before the doctype name and starts collecting
/// the name with its first character.
/// </summary>
/// <param name="c">The current character.</param>
/// <returns>The result of reading the rest of the doctype.</returns>
/// <remarks>Raises DoctypeInvalid when no name start character is found.</remarks>
private XmlToken DoctypeNameBefore(char c)
{
    for (; c.IsSpaceCharacter(); c = GetNext())
    {
    }

    if (!c.IsXmlNameStart())
    {
        throw XmlParseError.DoctypeInvalid.At(GetCurrentPosition());
    }

    _stringBuffer.Clear();
    _stringBuffer.Append(c);
    return DoctypeName(GetNext(), NewDoctype());
}
/// <summary>
/// Finishes reading the doctype name, stores it on the token and clears
/// the buffer for the identifiers that may follow.
/// </summary>
/// <param name="c">The character after the first name character.</param>
/// <param name="doctype">The doctype being built.</param>
/// <returns>The doctype, or the result of the state after the name.</returns>
/// <remarks>Raises DoctypeInvalid when neither '&gt;' nor a space character follows the name.</remarks>
private XmlToken DoctypeName(char c, XmlDoctypeToken doctype)
{
    for (; c.IsXmlName(); c = GetNext())
    {
        _stringBuffer.Append(c);
    }

    doctype.Name = _stringBuffer.ToString();
    _stringBuffer.Clear();

    if (c == '>')
    {
        return doctype;
    }

    if (!c.IsSpaceCharacter())
    {
        throw XmlParseError.DoctypeInvalid.At(GetCurrentPosition());
    }

    return DoctypeNameAfter(GetNext(), doctype);
}
/// <summary>
/// After the doctype name: skips space characters, then accepts the end
/// of the doctype, a PUBLIC or SYSTEM keyword, or an opening square bracket.
/// </summary>
/// <param name="c">The current character.</param>
/// <param name="doctype">The doctype being built.</param>
/// <returns>The doctype, or the result of the selected follow-up state.</returns>
/// <remarks>Raises DoctypeInvalid when none of the alternatives matches.</remarks>
private XmlToken DoctypeNameAfter(char c, XmlDoctypeToken doctype)
{
    for (; c.IsSpaceCharacter(); c = GetNext())
    {
    }

    if (c == '>')
    {
        return doctype;
    }
    else if (ContinuesWith(PublicIdentifier, false))
    {
        Advance(5);
        return DoctypePublic(GetNext(), doctype);
    }
    else if (ContinuesWith(SystemIdentifier, false))
    {
        Advance(5);
        return DoctypeSystem(GetNext(), doctype);
    }

    if (c != '[')
    {
        throw XmlParseError.DoctypeInvalid.At(GetCurrentPosition());
    }

    // NOTE(review): one Advance plus GetNext is all that happens after '[';
    // no internal subset content is interpreted here — confirm this is intended.
    Advance();
    return DoctypeAfter(GetNext(), doctype);
}
/// <summary>
/// After the PUBLIC keyword: requires at least one space character and
/// then the opening quote of the public identifier.
/// </summary>
/// <param name="c">The character after the PUBLIC keyword.</param>
/// <param name="doctype">The doctype being built.</param>
/// <returns>The result of reading the public identifier.</returns>
/// <remarks>Raises DoctypeInvalid when the space character or the quote is missing.</remarks>
private XmlToken DoctypePublic(char c, XmlDoctypeToken doctype)
{
    if (!c.IsSpaceCharacter())
    {
        throw XmlParseError.DoctypeInvalid.At(GetCurrentPosition());
    }

    // c is a space character here, so the first read is unconditional.
    do
    {
        c = GetNext();
    }
    while (c.IsSpaceCharacter());

    if (c != '"' && c != '\'')
    {
        throw XmlParseError.DoctypeInvalid.At(GetCurrentPosition());
    }

    doctype.PublicIdentifier = string.Empty;
    return DoctypePublicIdentifierValue(GetNext(), c, doctype);
}
/// <summary>
/// Collects the public identifier up to the closing quote, stores it on
/// the doctype and clears the buffer.
/// </summary>
/// <param name="c">The first character after the opening quote.</param>
/// <param name="q">The quote character that terminates the value.</param>
/// <param name="doctype">The doctype being built.</param>
/// <returns>The result of the state after the public identifier.</returns>
/// <remarks>Raises XmlInvalidPubId on a character rejected by IsPubidChar.</remarks>
private XmlToken DoctypePublicIdentifierValue(char c, char q, XmlDoctypeToken doctype)
{
    for (; c != q; c = GetNext())
    {
        if (!c.IsPubidChar())
        {
            throw XmlParseError.XmlInvalidPubId.At(GetCurrentPosition());
        }

        _stringBuffer.Append(c);
    }

    doctype.PublicIdentifier = _stringBuffer.ToString();
    _stringBuffer.Clear();
    return DoctypePublicIdentifierAfter(GetNext(), doctype);
}
/// <summary>
/// After the public identifier: either the doctype ends or a space
/// character leads on to an optional system identifier.
/// </summary>
/// <param name="c">The character after the closing quote.</param>
/// <param name="doctype">The doctype being built.</param>
/// <returns>The doctype, or the result of the in-between state.</returns>
/// <remarks>Raises DoctypeInvalid on any other character.</remarks>
private XmlToken DoctypePublicIdentifierAfter(char c, XmlDoctypeToken doctype)
{
    if (c == '>')
    {
        return doctype;
    }

    if (!c.IsSpaceCharacter())
    {
        throw XmlParseError.DoctypeInvalid.At(GetCurrentPosition());
    }

    return DoctypeBetween(GetNext(), doctype);
}
/// <summary>
/// Between the public and the system identifier: skips space characters,
/// then accepts the end of the doctype or the opening quote of the
/// system identifier.
/// </summary>
/// <param name="c">The current character.</param>
/// <param name="doctype">The doctype being built.</param>
/// <returns>The doctype, or the result of reading the system identifier.</returns>
/// <remarks>Raises DoctypeInvalid on any other character.</remarks>
private XmlToken DoctypeBetween(char c, XmlDoctypeToken doctype)
{
    for (; c.IsSpaceCharacter(); c = GetNext())
    {
    }

    if (c == '>')
    {
        return doctype;
    }

    if (c == '"' || c == '\'')
    {
        doctype.SystemIdentifier = string.Empty;
        return DoctypeSystemIdentifierValue(GetNext(), c, doctype);
    }

    throw XmlParseError.DoctypeInvalid.At(GetCurrentPosition());
}
/// <summary>
/// After the SYSTEM keyword: requires at least one space character and
/// then the opening quote of the system identifier.
/// </summary>
/// <param name="c">The character after the SYSTEM keyword.</param>
/// <param name="doctype">The doctype being built.</param>
/// <returns>The result of reading the system identifier.</returns>
/// <remarks>Raises DoctypeInvalid when the space character or the quote is missing.</remarks>
private XmlToken DoctypeSystem(char c, XmlDoctypeToken doctype)
{
    if (!c.IsSpaceCharacter())
    {
        throw XmlParseError.DoctypeInvalid.At(GetCurrentPosition());
    }

    // c is a space character here, so the first read is unconditional.
    do
    {
        c = GetNext();
    }
    while (c.IsSpaceCharacter());

    if (c != '"' && c != '\'')
    {
        throw XmlParseError.DoctypeInvalid.At(GetCurrentPosition());
    }

    doctype.SystemIdentifier = string.Empty;
    return DoctypeSystemIdentifierValue(GetNext(), c, doctype);
}
/// <summary>
/// Collects the system identifier up to the closing quote, stores it on
/// the doctype and clears the buffer.
/// </summary>
/// <param name="c">The first character after the opening quote.</param>
/// <param name="q">The quote character that terminates the value.</param>
/// <param name="doctype">The doctype being built.</param>
/// <returns>The result of the state after the system identifier.</returns>
/// <remarks>Raises EOF when the sentinel character appears inside the value.</remarks>
private XmlToken DoctypeSystemIdentifierValue(char c, char q, XmlDoctypeToken doctype)
{
    for (; c != q; c = GetNext())
    {
        if (c == '')
        {
            throw XmlParseError.EOF.At(GetCurrentPosition());
        }

        _stringBuffer.Append(c);
    }

    doctype.SystemIdentifier = _stringBuffer.ToString();
    _stringBuffer.Clear();
    return DoctypeSystemIdentifierAfter(GetNext(), doctype);
}
/// <summary>
/// After the system identifier: skips space characters and, when an
/// opening square bracket follows, advances past it before handing over
/// to the final doctype state.
/// </summary>
/// <param name="c">The character after the closing quote.</param>
/// <param name="doctype">The doctype being built.</param>
/// <returns>The result of the final doctype state.</returns>
private XmlToken DoctypeSystemIdentifierAfter(char c, XmlDoctypeToken doctype)
{
    for (; c.IsSpaceCharacter(); c = GetNext())
    {
    }

    if (c != '[')
    {
        return DoctypeAfter(c, doctype);
    }

    Advance();
    return DoctypeAfter(GetNext(), doctype);
}
/// <summary>
/// Final doctype state: skips space characters and expects the closing '&gt;'.
/// </summary>
/// <param name="c">The current character.</param>
/// <param name="doctype">The doctype being built.</param>
/// <returns>The finished doctype token.</returns>
/// <remarks>Raises DoctypeInvalid when '&gt;' does not follow.</remarks>
private XmlToken DoctypeAfter(char c, XmlDoctypeToken doctype)
{
    for (; c.IsSpaceCharacter(); c = GetNext())
    {
    }

    if (c != '>')
    {
        throw XmlParseError.DoctypeInvalid.At(GetCurrentPosition());
    }

    return doctype;
}
/// <summary>
/// Before an attribute name: skips space characters, then accepts the
/// end of the tag, a self-closing slash or the first character of an
/// attribute name.
/// </summary>
/// <param name="c">The current character.</param>
/// <param name="tag">The tag being built.</param>
/// <returns>The tag, or the result of the selected follow-up state.</returns>
/// <remarks>Raises EOF on the sentinel character, XmlInvalidAttribute on any other unexpected character.</remarks>
private XmlToken AttributeBeforeName(char c, XmlTagToken tag)
{
    for (; c.IsSpaceCharacter(); c = GetNext())
    {
    }

    if (c == '/')
    {
        return TagSelfClosing(GetNext(), tag);
    }

    if (c == '>')
    {
        return tag;
    }

    if (c == '')
    {
        throw XmlParseError.EOF.At(GetCurrentPosition());
    }

    if (!c.IsXmlNameStart())
    {
        throw XmlParseError.XmlInvalidAttribute.At(GetCurrentPosition());
    }

    _stringBuffer.Clear();
    _stringBuffer.Append(c);
    return AttributeName(GetNext(), tag);
}
/// <summary>
/// Finishes reading an attribute name, registers it on the tag and
/// expects '=' (optionally preceded by space characters).
/// </summary>
/// <param name="c">The character after the first name character.</param>
/// <param name="tag">The tag being built.</param>
/// <returns>The result of reading the attribute value.</returns>
/// <remarks>
/// Raises XmlUniqueAttribute when the tag already reports a non-empty
/// value for the name, and XmlInvalidAttribute when '=' is missing.
/// </remarks>
private XmlToken AttributeName(char c, XmlTagToken tag)
{
    for (; c.IsXmlName(); c = GetNext())
    {
        _stringBuffer.Append(c);
    }

    var attributeName = _stringBuffer.ToString();
    var existing = tag.GetAttribute(attributeName);

    // NOTE(review): an earlier attribute of the same name with an empty value
    // would pass this check — confirm against XmlTagToken.GetAttribute.
    if (!string.IsNullOrEmpty(existing))
    {
        throw XmlParseError.XmlUniqueAttribute.At(GetCurrentPosition());
    }

    tag.AddAttribute(attributeName);

    while (c.IsSpaceCharacter())
    {
        c = GetNext();
    }

    if (c != '=')
    {
        throw XmlParseError.XmlInvalidAttribute.At(GetCurrentPosition());
    }

    return AttributeBeforeValue(GetNext(), tag);
}
/// <summary>
/// Skips space characters after '=' and expects the opening quote of the
/// attribute value (single or double).
/// </summary>
/// <param name="c">The character after '='.</param>
/// <param name="tag">The tag being built.</param>
/// <returns>The result of reading the attribute value.</returns>
/// <remarks>Raises XmlInvalidAttribute when no quote is found.</remarks>
private XmlToken AttributeBeforeValue(char c, XmlTagToken tag)
{
    for (; c.IsSpaceCharacter(); c = GetNext())
    {
    }

    var isQuote = c == '"' || c == '\'';

    if (!isQuote)
    {
        throw XmlParseError.XmlInvalidAttribute.At(GetCurrentPosition());
    }

    _stringBuffer.Clear();
    return AttributeValue(GetNext(), c, tag);
}
/// <summary>
/// Collects an attribute value up to the closing quote. References that
/// start with an ampersand are resolved through CharacterReference and
/// their entity text is appended in place.
/// </summary>
/// <param name="c">The first character after the opening quote.</param>
/// <param name="q">The quote character that terminates the value.</param>
/// <param name="tag">The tag being built.</param>
/// <returns>The result of the state after the attribute value.</returns>
/// <remarks>
/// Raises EOF on the sentinel character and XmlLtInAttributeValue when a
/// less-than sign appears inside the value.
/// </remarks>
private XmlToken AttributeValue(char c, char q, XmlTagToken tag)
{
    for (; c != q; c = GetNext())
    {
        if (c == '')
        {
            throw XmlParseError.EOF.At(GetCurrentPosition());
        }

        if (c == '<')
        {
            throw XmlParseError.XmlLtInAttributeValue.At(GetCurrentPosition());
        }

        if (c == '&')
        {
            _stringBuffer.Append(CharacterReference(GetNext()).GetEntity());
        }
        else
        {
            _stringBuffer.Append(c);
        }
    }

    tag.SetAttributeValue(_stringBuffer.ToString());
    return AttributeAfterValue(GetNext(), tag);
}
/// <summary>
/// After an attribute value: a space character leads to the next
/// attribute, '/' to the self-closing state and '&gt;' ends the tag.
/// </summary>
/// <param name="c">The character after the closing quote.</param>
/// <param name="tag">The tag being built.</param>
/// <returns>The tag, or the result of the selected follow-up state.</returns>
/// <remarks>Raises XmlInvalidAttribute on any other character.</remarks>
private XmlToken AttributeAfterValue(char c, XmlTagToken tag)
{
    if (c.IsSpaceCharacter())
    {
        return AttributeBeforeName(GetNext(), tag);
    }

    if (c == '/')
    {
        return TagSelfClosing(GetNext(), tag);
    }

    if (c == '>')
    {
        return tag;
    }

    throw XmlParseError.XmlInvalidAttribute.At(GetCurrentPosition());
}
/// <summary>
/// Starts a processing instruction; its target must begin with a name
/// start character.
/// </summary>
/// <param name="c">The character after "&lt;?".</param>
/// <returns>The result of reading the processing instruction.</returns>
/// <remarks>Raises XmlInvalidPI when the target does not start with a name start character.</remarks>
private XmlToken ProcessingStart(char c)
{
    if (!c.IsXmlNameStart())
    {
        throw XmlParseError.XmlInvalidPI.At(GetCurrentPosition());
    }

    _stringBuffer.Clear();
    _stringBuffer.Append(c);
    return ProcessingTarget(GetNext(), NewProcessing());
}
/// <summary>
/// Finishes reading the target of a processing instruction, stores it on
/// the token and clears the buffer for the content.
/// </summary>
/// <param name="c">The next character of the target.</param>
/// <param name="pi">The processing instruction being built.</param>
/// <returns>The instruction, or the result of reading its content.</returns>
/// <remarks>
/// Raises XmlInvalidPI when the target equals the xml keyword in any
/// letter case, or when the target is followed by neither "?&gt;" nor a
/// space character.
/// </remarks>
private XmlToken ProcessingTarget(char c, XmlPIToken pi)
{
    for (; c.IsXmlName(); c = GetNext())
    {
        _stringBuffer.Append(c);
    }

    pi.Target = _stringBuffer.ToString();
    _stringBuffer.Clear();

    var isReservedTarget = string.Equals(pi.Target, Tags.Xml, StringComparison.OrdinalIgnoreCase);

    if (isReservedTarget)
    {
        throw XmlParseError.XmlInvalidPI.At(GetCurrentPosition());
    }

    if (c == '?')
    {
        // A '?' must be followed directly by '>' to close the instruction.
        if (GetNext() == '>')
        {
            return pi;
        }
    }
    else if (c.IsSpaceCharacter())
    {
        return ProcessingContent(GetNext(), pi);
    }

    throw XmlParseError.XmlInvalidPI.At(GetCurrentPosition());
}
/// <summary>
/// Collects the content of a processing instruction until "?&gt;" is found.
/// A '?' that is not followed by '&gt;' is kept as content.
/// </summary>
/// <param name="c">The first content character.</param>
/// <param name="pi">The processing instruction being built.</param>
/// <returns>The processing instruction with its content set.</returns>
/// <remarks>Raises EOF when the sentinel character is read before the terminator.</remarks>
private XmlToken ProcessingContent(char c, XmlPIToken pi)
{
    while (true)
    {
        if (c == '')
        {
            throw XmlParseError.EOF.At(GetCurrentPosition());
        }

        if (c != '?')
        {
            _stringBuffer.Append(c);
            c = GetNext();
            continue;
        }

        c = GetNext();

        if (c == '>')
        {
            pi.Content = _stringBuffer.ToString();
            return pi;
        }

        // Not a terminator: keep the '?' and re-examine the character just read.
        _stringBuffer.Append('?');
    }
}
/// <summary>
/// Starts a comment: empties the string buffer and enters the comment state.
/// </summary>
/// <param name="c">The first character of the comment content.</param>
/// <returns>The token produced by the comment states.</returns>
private XmlToken CommentStart(char c)
{
    _stringBuffer.Clear();

    var token = Comment(c);
    return token;
}
/// <summary>
/// Collects comment content. A '-' hands control to CommentDash together
/// with the following character; every other character accepted by
/// IsXmlChar is appended to the buffer.
/// </summary>
/// <param name="c">The current comment character.</param>
/// <returns>The token produced once the comment has been closed.</returns>
/// <remarks>Raises XmlInvalidComment on a character rejected by IsXmlChar.</remarks>
private XmlToken Comment(char c)
{
    for (; c.IsXmlChar(); c = GetNext())
    {
        if (c == '-')
        {
            return CommentDash(GetNext());
        }

        _stringBuffer.Append(c);
    }

    throw XmlParseError.XmlInvalidComment.At(GetCurrentPosition());
}
/// <summary>
/// Entered after a single '-' inside a comment. A second '-' leads to the
/// comment end state; anything else means the first '-' was ordinary
/// comment content.
/// </summary>
/// <param name="c">The character following the '-'.</param>
/// <returns>The token produced once the comment has been closed.</returns>
private XmlToken CommentDash(char c)
{
    if (c == '-')
    {
        return CommentEnd(GetNext());
    }

    // Comment() consumed the '-' without buffering it. A lone hyphen is
    // legal comment content in XML, so it is restored here before
    // resuming; otherwise "a-b" would be reported as "ab".
    _stringBuffer.Append('-');
    return Comment(c);
}
/// <summary>
/// Entered after "--" inside a comment; only '&gt;' may follow.
/// </summary>
/// <param name="c">The character following "--".</param>
/// <returns>A comment token carrying the buffered text.</returns>
/// <remarks>Raises XmlInvalidComment when "--" is not followed by '&gt;'.</remarks>
private XmlToken CommentEnd(char c)
{
    if (c != '>')
    {
        throw XmlParseError.XmlInvalidComment.At(GetCurrentPosition());
    }

    return NewComment(_stringBuffer.ToString());
}
/// <summary>
/// Creates an end-of-file token located at the current position (not at
/// the recorded token start).
/// </summary>
/// <returns>The new end-of-file token.</returns>
private XmlEndOfFileToken NewEof()
{
    var here = GetCurrentPosition();
    return new XmlEndOfFileToken(here);
}
/// <summary>
/// Creates a character token at the recorded token start.
/// </summary>
/// <param name="c">The character carried by the token.</param>
/// <returns>The new character token.</returns>
private XmlCharacterToken NewCharacter(char c)
{
    var token = new XmlCharacterToken(_position, c);
    return token;
}
/// <summary>
/// Creates a comment token at the recorded token start.
/// </summary>
/// <param name="s">The comment text.</param>
/// <returns>The new comment token.</returns>
private XmlCommentToken NewComment(string s)
{
    var token = new XmlCommentToken(_position, s);
    return token;
}
/// <summary>
/// Creates an empty processing-instruction token at the recorded token start.
/// </summary>
/// <returns>The new processing-instruction token.</returns>
private XmlPIToken NewProcessing()
{
    var token = new XmlPIToken(_position);
    return token;
}
/// <summary>
/// Creates an empty doctype token at the recorded token start.
/// </summary>
/// <returns>The new doctype token.</returns>
private XmlDoctypeToken NewDoctype()
{
    var token = new XmlDoctypeToken(_position);
    return token;
}
/// <summary>
/// Creates an empty XML declaration token at the recorded token start.
/// </summary>
/// <returns>The new declaration token.</returns>
private XmlDeclarationToken NewDeclaration()
{
    var token = new XmlDeclarationToken(_position);
    return token;
}
/// <summary>
/// Creates a tag token of type StartTag at the recorded token start.
/// </summary>
/// <returns>The new start-tag token.</returns>
private XmlTagToken NewOpenTag()
{
    var token = new XmlTagToken(XmlTokenType.StartTag, _position);
    return token;
}
/// <summary>
/// Creates a tag token of type EndTag at the recorded token start.
/// </summary>
/// <returns>The new end-tag token.</returns>
private XmlTagToken NewCloseTag()
{
    var token = new XmlTagToken(XmlTokenType.EndTag, _position);
    return token;
}
/// <summary>
/// Creates a CDATA token at the recorded token start.
/// </summary>
/// <param name="s">The text of the CDATA section.</param>
/// <returns>The new CDATA token.</returns>
private XmlCDataToken NewCData(string s)
{
    var token = new XmlCDataToken(_position, s);
    return token;
}
/// <summary>
/// Creates an entity token at the recorded token start.
/// </summary>
/// <param name="value">The collected digits or entity name.</param>
/// <param name="numeric">Whether the reference was written in numeric form.</param>
/// <param name="hex">Whether the numeric reference used hexadecimal digits.</param>
/// <returns>The new entity token.</returns>
private XmlEntityToken NewEntity(string value, bool numeric = false, bool hex = false)
{
    var token = new XmlEntityToken(_position, value, numeric, hex);
    return token;
}
}
}