// XmlTokenizer
// Performs the tokenization of the source code. Most of
// the information is taken from http://www.w3.org/TR/REC-xml/.
using AngleSharp.Events;
using AngleSharp.Extensions;
using AngleSharp.Html;
using System;
using System.Diagnostics;
using System.Text;
namespace AngleSharp.Parser.Xml
{
/// <summary>
/// Splits XML source text into <see cref="XmlToken"/> instances. The tokenizer is
/// written as a set of small state methods; each one receives the current character,
/// consumes what belongs to its state and hands over to the next state.
/// Grammar reference: http://www.w3.org/TR/REC-xml/.
/// </summary>
[DebuggerStepThrough]
internal sealed class XmlTokenizer : BaseTokenizer
{
    // NOTE(review): the end-of-input character literal was unprintable/empty in the
    // reviewed source. U+001A (SUBSTITUTE) is assumed here - confirm that it matches
    // the sentinel the text source / base tokenizer returns at the end of the input.
    private const char EndOfFile = '\u001a';

    private const string CDataOpening = "[CDATA[";
    private const string PublicIdentifier = "PUBLIC";
    private const string SystemIdentifier = "SYSTEM";
    private const string YesIdentifier = "yes";
    private const string NoIdentifier = "no";

    /// <summary>
    /// Creates a new tokenizer on top of the given source.
    /// </summary>
    /// <param name="source">The text source that is passed to the base tokenizer.</param>
    /// <param name="events">The event aggregator that is passed to the base tokenizer.</param>
    public XmlTokenizer(TextSource source, IEventAggregator events)
        : base(source, events)
    {
    }

    /// <summary>
    /// Reads the next token from the source.
    /// </summary>
    /// <returns>The next token, or <c>XmlToken.EOF</c> once the source has ended.</returns>
    /// <exception cref="InvalidOperationException">The input is not well-formed.</exception>
    public XmlToken Get()
    {
        // The character has to be fetched before the end-check, as in the original flow.
        char current = GetNext();

        if (IsEnded) {
            return XmlToken.EOF;
        }

        return Data(current);
    }

    /// <summary>
    /// Builds the exception that is thrown for a tokenizer error.
    /// </summary>
    /// <param name="code">The error that has been detected.</param>
    /// <returns>An <see cref="InvalidOperationException"/> whose message names the error code.</returns>
    private static Exception XmlError(XmlParseError code)
    {
        return new InvalidOperationException("XML parse error: " + code);
    }

    /// <summary>
    /// Consumes consecutive space characters starting with the given one.
    /// </summary>
    /// <param name="c">The current character.</param>
    /// <returns>The first character that is not a space character.</returns>
    private char SkipSpaces(char c)
    {
        while (c.IsSpaceCharacter()) {
            c = GetNext();
        }

        return c;
    }

    /// <summary>
    /// Data state: dispatches on the first character of a token.
    /// </summary>
    private XmlToken Data(char c)
    {
        switch (c) {
            case '&':
                return CharacterReference(GetNext());
            case '<':
                return TagOpen(GetNext());
            case EndOfFile:
                return XmlToken.EOF;
            case ']':
                return CheckCharacter(GetNext());
            default:
                return XmlToken.Character(c);
        }
    }

    /// <summary>
    /// Called after a ']' in character data; throws if the input continues with "]&gt;",
    /// otherwise steps back and emits the ']' as an ordinary character.
    /// </summary>
    /// <param name="ch">The character following the ']'.</param>
    private XmlToken CheckCharacter(char ch)
    {
        if (ch == ']') {
            if (GetNext() == '>') {
                throw XmlError(XmlParseError.XmlInvalidCharData);
            }

            // Undo the look-ahead behind the second ']'.
            Back();
        }

        // Undo the look-ahead behind the first ']'.
        Back();
        return XmlToken.Character(']');
    }

    /// <summary>
    /// Collects the content of a CDATA section up to the closing "]]&gt;".
    /// </summary>
    /// <param name="c">The first character after the CDATA opening.</param>
    private XmlCDataToken CData(char c)
    {
        _stringBuffer.Clear();

        while (true) {
            if (c == EndOfFile) {
                throw XmlError(XmlParseError.EOF);
            }

            if (c == ']' && ContinuesWith("]]>", true)) {
                Advance(2);
                return XmlToken.CData(_stringBuffer.ToString());
            }

            _stringBuffer.Append(c);
            c = GetNext();
        }
    }

    /// <summary>
    /// Reads a numeric ("#..." / "#x...") or named reference that must end with ';'.
    /// </summary>
    /// <param name="c">The character following the '&amp;'.</param>
    /// <returns>The entity token holding the digits or the name of the reference.</returns>
    private XmlEntityToken CharacterReference(char c)
    {
        StringBuilder buffer = Pool.NewStringBuilder();

        if (c == '#') {
            c = GetNext();
            bool isHex = c == 'x' || c == 'X';

            if (isHex) {
                c = GetNext();

                while (c.IsHex()) {
                    buffer.Append(c);
                    c = GetNext();
                }
            } else {
                while (c.IsDigit()) {
                    buffer.Append(c);
                    c = GetNext();
                }
            }

            if (buffer.Length > 0 && c == ';') {
                return new XmlEntityToken {
                    Value = buffer.ToPool(),
                    IsNumeric = true,
                    IsHex = isHex
                };
            }
        } else if (c.IsXmlNameStart()) {
            do {
                buffer.Append(c);
                c = GetNext();
            } while (c.IsXmlName());

            if (c == ';') {
                return new XmlEntityToken {
                    Value = buffer.ToPool()
                };
            }
        }

        // Hand the pooled builder back before reporting the failure.
        buffer.ToPool();
        throw XmlError(XmlParseError.CharacterReferenceNotTerminated);
    }

    /// <summary>
    /// Tag open state: decides between markup declaration, processing instruction /
    /// XML declaration, closing tag and opening tag.
    /// </summary>
    /// <param name="c">The character following the '&lt;'.</param>
    private XmlToken TagOpen(char c)
    {
        switch (c) {
            case '!':
                return MarkupDeclaration(GetNext());
            case '?':
                c = GetNext();

                if (ContinuesWith(Tags.Xml, false)) {
                    Advance(2);
                    return DeclarationStart(GetNext());
                }

                return ProcessingStart(c);
            case '/':
                return TagEnd(GetNext());
            default:
                if (c.IsXmlNameStart()) {
                    _stringBuffer.Clear();
                    _stringBuffer.Append(c);
                    return TagName(GetNext(), XmlToken.OpenTag());
                }

                throw XmlError(XmlParseError.XmlInvalidStartTag);
        }
    }

    /// <summary>
    /// Reads the name of a closing tag, optional trailing spaces and the final '&gt;'.
    /// </summary>
    /// <param name="c">The character following "&lt;/".</param>
    private XmlToken TagEnd(char c)
    {
        if (c.IsXmlNameStart()) {
            _stringBuffer.Clear();

            do {
                _stringBuffer.Append(c);
                c = GetNext();
            } while (c.IsXmlName());

            c = SkipSpaces(c);

            if (c == '>') {
                XmlTagToken closeTag = XmlToken.CloseTag();
                closeTag.Name = _stringBuffer.ToString();
                return closeTag;
            }
        }

        if (c == EndOfFile) {
            throw XmlError(XmlParseError.EOF);
        }

        throw XmlError(XmlParseError.XmlInvalidEndTag);
    }

    /// <summary>
    /// Completes the name of an opening tag and continues with what follows it.
    /// </summary>
    /// <param name="c">The character after the first name character.</param>
    /// <param name="tag">The tag token to fill.</param>
    private XmlToken TagName(char c, XmlTagToken tag)
    {
        while (c.IsXmlName()) {
            _stringBuffer.Append(c);
            c = GetNext();
        }

        tag.Name = _stringBuffer.ToString();

        if (c == EndOfFile) {
            throw XmlError(XmlParseError.EOF);
        }

        if (c == '>') {
            return tag;
        }

        if (c.IsSpaceCharacter()) {
            return AttributeBeforeName(GetNext(), tag);
        }

        if (c == '/') {
            return TagSelfClosing(GetNext(), tag);
        }

        throw XmlError(XmlParseError.XmlInvalidName);
    }

    /// <summary>
    /// Marks the tag as self-closing and expects the closing '&gt;'.
    /// </summary>
    /// <param name="c">The character following the '/'.</param>
    /// <param name="tag">The tag token to complete.</param>
    private XmlToken TagSelfClosing(char c, XmlTagToken tag)
    {
        tag.IsSelfClosing = true;

        if (c == '>') {
            return tag;
        }

        if (c == EndOfFile) {
            throw XmlError(XmlParseError.EOF);
        }

        throw XmlError(XmlParseError.XmlInvalidName);
    }

    /// <summary>
    /// Markup declaration state ("&lt;!"): comment, doctype or CDATA section.
    /// </summary>
    /// <param name="c">The character following "&lt;!" (not inspected directly).</param>
    private XmlToken MarkupDeclaration(char c)
    {
        if (ContinuesWith("--", true)) {
            Advance();
            return CommentStart(GetNext());
        }

        if (ContinuesWith(Tags.Doctype, false)) {
            Advance(6);
            return Doctype(GetNext());
        }

        if (ContinuesWith(CDataOpening, false)) {
            Advance(6);
            return CData(GetNext());
        }

        throw XmlError(XmlParseError.UndefinedMarkupDeclaration);
    }

    /// <summary>
    /// Entered after "&lt;?" followed by the xml tag name. Without a following space the
    /// input is treated as a processing instruction whose target starts with that name;
    /// otherwise the version attribute of an XML declaration is required.
    /// </summary>
    /// <param name="c">The character following the xml name.</param>
    private XmlToken DeclarationStart(char c)
    {
        if (!c.IsSpaceCharacter()) {
            _stringBuffer.Clear();
            _stringBuffer.Append(Tags.Xml);
            return ProcessingTarget(c, XmlToken.Processing());
        }

        // c is a space here, so at least one character is consumed.
        c = SkipSpaces(c);

        if (ContinuesWith(AttributeNames.Version, false)) {
            Advance(6);
            return DeclarationVersionAfterName(GetNext(), XmlToken.Declaration());
        }

        throw XmlError(XmlParseError.XmlDeclarationInvalid);
    }

    /// <summary>
    /// Expects (after optional spaces) the '=' behind the version attribute name.
    /// </summary>
    private XmlToken DeclarationVersionAfterName(char c, XmlDeclarationToken decl)
    {
        c = SkipSpaces(c);

        if (c == '=') {
            return DeclarationVersionBeforeValue(GetNext(), decl);
        }

        throw XmlError(XmlParseError.XmlDeclarationInvalid);
    }

    /// <summary>
    /// Expects (after optional spaces) the opening quote of the version value.
    /// </summary>
    private XmlToken DeclarationVersionBeforeValue(char c, XmlDeclarationToken decl)
    {
        c = SkipSpaces(c);

        if (c == '"' || c == '\'') {
            _stringBuffer.Clear();
            return DeclarationVersionValue(GetNext(), c, decl);
        }

        throw XmlError(XmlParseError.XmlDeclarationInvalid);
    }

    /// <summary>
    /// Reads the version value up to the closing quote.
    /// </summary>
    /// <param name="c">The first character of the value.</param>
    /// <param name="q">The quote character that opened the value.</param>
    /// <param name="decl">The declaration token to fill.</param>
    private XmlToken DeclarationVersionValue(char c, char q, XmlDeclarationToken decl)
    {
        while (c != q) {
            if (c == EndOfFile) {
                throw XmlError(XmlParseError.EOF);
            }

            _stringBuffer.Append(c);
            c = GetNext();
        }

        decl.Version = _stringBuffer.ToString();
        c = GetNext();

        if (c.IsSpaceCharacter()) {
            return DeclarationAfterVersion(c, decl);
        }

        return DeclarationEnd(c, decl);
    }

    /// <summary>
    /// After the version value: optional encoding or standalone attribute, else the end.
    /// </summary>
    private XmlToken DeclarationAfterVersion(char c, XmlDeclarationToken decl)
    {
        c = SkipSpaces(c);

        if (ContinuesWith(AttributeNames.Encoding, false)) {
            Advance(7);
            return DeclarationEncodingAfterName(GetNext(), decl);
        }

        if (ContinuesWith(AttributeNames.Standalone, false)) {
            Advance(9);
            return DeclarationStandaloneAfterName(GetNext(), decl);
        }

        return DeclarationEnd(c, decl);
    }

    /// <summary>
    /// Expects (after optional spaces) the '=' behind the encoding attribute name.
    /// </summary>
    private XmlToken DeclarationEncodingAfterName(char c, XmlDeclarationToken decl)
    {
        c = SkipSpaces(c);

        if (c == '=') {
            return DeclarationEncodingBeforeValue(GetNext(), decl);
        }

        throw XmlError(XmlParseError.XmlDeclarationInvalid);
    }

    /// <summary>
    /// Expects the opening quote of the encoding value followed by a letter.
    /// </summary>
    private XmlToken DeclarationEncodingBeforeValue(char c, XmlDeclarationToken decl)
    {
        c = SkipSpaces(c);

        if (c == '"' || c == '\'') {
            char quote = c;
            _stringBuffer.Clear();
            c = GetNext();

            if (c.IsLetter()) {
                return DeclarationEncodingValue(c, quote, decl);
            }
        }

        throw XmlError(XmlParseError.XmlDeclarationInvalid);
    }

    /// <summary>
    /// Reads the encoding value; only ASCII alphanumerics, '.', '_' and '-' are accepted
    /// before the closing quote.
    /// </summary>
    /// <param name="c">The first character of the value.</param>
    /// <param name="q">The quote character that opened the value.</param>
    /// <param name="decl">The declaration token to fill.</param>
    private XmlToken DeclarationEncodingValue(char c, char q, XmlDeclarationToken decl)
    {
        do {
            bool allowed = c.IsAlphanumericAscii() || c == '.' || c == '_' || c == '-';

            if (!allowed) {
                throw XmlError(XmlParseError.XmlDeclarationInvalid);
            }

            _stringBuffer.Append(c);
            c = GetNext();
        } while (c != q);

        decl.Encoding = _stringBuffer.ToString();
        c = GetNext();

        if (c.IsSpaceCharacter()) {
            return DeclarationAfterEncoding(c, decl);
        }

        return DeclarationEnd(c, decl);
    }

    /// <summary>
    /// After the encoding value: optional standalone attribute, else the end.
    /// </summary>
    private XmlToken DeclarationAfterEncoding(char c, XmlDeclarationToken decl)
    {
        c = SkipSpaces(c);

        if (ContinuesWith(AttributeNames.Standalone, false)) {
            Advance(9);
            return DeclarationStandaloneAfterName(GetNext(), decl);
        }

        return DeclarationEnd(c, decl);
    }

    /// <summary>
    /// Expects (after optional spaces) the '=' behind the standalone attribute name.
    /// </summary>
    private XmlToken DeclarationStandaloneAfterName(char c, XmlDeclarationToken decl)
    {
        c = SkipSpaces(c);

        if (c == '=') {
            return DeclarationStandaloneBeforeValue(GetNext(), decl);
        }

        throw XmlError(XmlParseError.XmlDeclarationInvalid);
    }

    /// <summary>
    /// Expects (after optional spaces) the opening quote of the standalone value.
    /// </summary>
    private XmlToken DeclarationStandaloneBeforeValue(char c, XmlDeclarationToken decl)
    {
        c = SkipSpaces(c);

        if (c == '"' || c == '\'') {
            _stringBuffer.Clear();
            return DeclarationStandaloneValue(GetNext(), c, decl);
        }

        throw XmlError(XmlParseError.XmlDeclarationInvalid);
    }

    /// <summary>
    /// Reads the standalone value, which has to be exactly "yes" or "no".
    /// </summary>
    /// <param name="c">The first character of the value.</param>
    /// <param name="q">The quote character that opened the value.</param>
    /// <param name="decl">The declaration token to fill.</param>
    private XmlToken DeclarationStandaloneValue(char c, char q, XmlDeclarationToken decl)
    {
        while (c != q) {
            if (c == EndOfFile) {
                throw XmlError(XmlParseError.EOF);
            }

            _stringBuffer.Append(c);
            c = GetNext();
        }

        string value = _stringBuffer.ToString();

        if (value.Equals(YesIdentifier, StringComparison.Ordinal)) {
            decl.Standalone = true;
        } else if (value.Equals(NoIdentifier, StringComparison.Ordinal)) {
            decl.Standalone = false;
        } else {
            throw XmlError(XmlParseError.XmlDeclarationInvalid);
        }

        return DeclarationEnd(GetNext(), decl);
    }

    /// <summary>
    /// Expects (after optional spaces) the closing "?&gt;" of the XML declaration.
    /// </summary>
    private XmlDeclarationToken DeclarationEnd(char c, XmlDeclarationToken decl)
    {
        c = SkipSpaces(c);

        if (c != '?' || GetNext() != '>') {
            throw XmlError(XmlParseError.XmlDeclarationInvalid);
        }

        return decl;
    }

    /// <summary>
    /// Doctype state: the doctype keyword has to be followed by a space character.
    /// </summary>
    private XmlToken Doctype(char c)
    {
        if (c.IsSpaceCharacter()) {
            return DoctypeNameBefore(GetNext());
        }

        throw XmlError(XmlParseError.DoctypeInvalid);
    }

    /// <summary>
    /// Skips spaces and expects the first character of the doctype name.
    /// </summary>
    private XmlToken DoctypeNameBefore(char c)
    {
        c = SkipSpaces(c);

        if (c.IsXmlNameStart()) {
            _stringBuffer.Clear();
            _stringBuffer.Append(c);
            return DoctypeName(GetNext(), XmlToken.Doctype());
        }

        throw XmlError(XmlParseError.DoctypeInvalid);
    }

    /// <summary>
    /// Completes the doctype name; the shared buffer is left empty for the identifiers.
    /// </summary>
    private XmlToken DoctypeName(char c, XmlDoctypeToken doctype)
    {
        while (c.IsXmlName()) {
            _stringBuffer.Append(c);
            c = GetNext();
        }

        doctype.Name = _stringBuffer.ToString();
        _stringBuffer.Clear();

        if (c == '>') {
            return doctype;
        }

        if (c.IsSpaceCharacter()) {
            return DoctypeNameAfter(GetNext(), doctype);
        }

        throw XmlError(XmlParseError.DoctypeInvalid);
    }

    /// <summary>
    /// After the doctype name: end of the doctype, PUBLIC / SYSTEM keyword or '['.
    /// </summary>
    private XmlToken DoctypeNameAfter(char c, XmlDoctypeToken doctype)
    {
        c = SkipSpaces(c);

        if (c == '>') {
            return doctype;
        }

        if (ContinuesWith(PublicIdentifier, false)) {
            Advance(5);
            return DoctypePublic(GetNext(), doctype);
        }

        if (ContinuesWith(SystemIdentifier, false)) {
            Advance(5);
            return DoctypeSystem(GetNext(), doctype);
        }

        if (c == '[') {
            // NOTE(review): nothing between '[' and the end of the doctype is scanned
            // here; behavior kept exactly as found.
            Advance();
            return DoctypeAfter(GetNext(), doctype);
        }

        throw XmlError(XmlParseError.DoctypeInvalid);
    }

    /// <summary>
    /// After the PUBLIC keyword: requires a space and then a quoted public identifier.
    /// </summary>
    private XmlToken DoctypePublic(char c, XmlDoctypeToken doctype)
    {
        if (c.IsSpaceCharacter()) {
            c = SkipSpaces(c);

            if (c == '"' || c == '\'') {
                doctype.PublicIdentifier = string.Empty;
                return DoctypePublicIdentifierValue(GetNext(), c, doctype);
            }
        }

        throw XmlError(XmlParseError.DoctypeInvalid);
    }

    /// <summary>
    /// Reads the public identifier up to the closing quote; every character has to be
    /// a valid public identifier character.
    /// </summary>
    /// <param name="c">The first character of the identifier.</param>
    /// <param name="q">The quote character that opened the identifier.</param>
    /// <param name="doctype">The doctype token to fill.</param>
    private XmlToken DoctypePublicIdentifierValue(char c, char q, XmlDoctypeToken doctype)
    {
        while (c != q) {
            if (!c.IsPubidChar()) {
                throw XmlError(XmlParseError.XmlInvalidPubId);
            }

            _stringBuffer.Append(c);
            c = GetNext();
        }

        doctype.PublicIdentifier = _stringBuffer.ToString();
        _stringBuffer.Clear();
        return DoctypePublicIdentifierAfter(GetNext(), doctype);
    }

    /// <summary>
    /// After the public identifier: end of the doctype or a space before the system identifier.
    /// </summary>
    private XmlToken DoctypePublicIdentifierAfter(char c, XmlDoctypeToken doctype)
    {
        if (c == '>') {
            return doctype;
        }

        if (c.IsSpaceCharacter()) {
            return DoctypeBetween(GetNext(), doctype);
        }

        throw XmlError(XmlParseError.DoctypeInvalid);
    }

    /// <summary>
    /// Between public and system identifier: end of the doctype or a quoted system identifier.
    /// </summary>
    private XmlToken DoctypeBetween(char c, XmlDoctypeToken doctype)
    {
        c = SkipSpaces(c);

        if (c == '>') {
            return doctype;
        }

        if (c == '"' || c == '\'') {
            doctype.SystemIdentifier = string.Empty;
            return DoctypeSystemIdentifierValue(GetNext(), c, doctype);
        }

        throw XmlError(XmlParseError.DoctypeInvalid);
    }

    /// <summary>
    /// After the SYSTEM keyword: requires a space and then a quoted system identifier.
    /// </summary>
    private XmlToken DoctypeSystem(char c, XmlDoctypeToken doctype)
    {
        if (c.IsSpaceCharacter()) {
            c = SkipSpaces(c);

            if (c == '"' || c == '\'') {
                doctype.SystemIdentifier = string.Empty;
                return DoctypeSystemIdentifierValue(GetNext(), c, doctype);
            }
        }

        throw XmlError(XmlParseError.DoctypeInvalid);
    }

    /// <summary>
    /// Reads the system identifier up to the closing quote.
    /// </summary>
    /// <param name="c">The first character of the identifier.</param>
    /// <param name="q">The quote character that opened the identifier.</param>
    /// <param name="doctype">The doctype token to fill.</param>
    private XmlToken DoctypeSystemIdentifierValue(char c, char q, XmlDoctypeToken doctype)
    {
        while (c != q) {
            if (c == EndOfFile) {
                throw XmlError(XmlParseError.EOF);
            }

            _stringBuffer.Append(c);
            c = GetNext();
        }

        doctype.SystemIdentifier = _stringBuffer.ToString();
        _stringBuffer.Clear();
        return DoctypeSystemIdentifierAfter(GetNext(), doctype);
    }

    /// <summary>
    /// After the system identifier: optional '[' handling, then the end of the doctype.
    /// </summary>
    private XmlToken DoctypeSystemIdentifierAfter(char c, XmlDoctypeToken doctype)
    {
        c = SkipSpaces(c);

        if (c == '[') {
            // NOTE(review): same '[' handling as in DoctypeNameAfter; kept as found.
            Advance();
            c = GetNext();
        }

        return DoctypeAfter(c, doctype);
    }

    /// <summary>
    /// Expects (after optional spaces) the closing '&gt;' of the doctype.
    /// </summary>
    private XmlToken DoctypeAfter(char c, XmlDoctypeToken doctype)
    {
        c = SkipSpaces(c);

        if (c == '>') {
            return doctype;
        }

        throw XmlError(XmlParseError.DoctypeInvalid);
    }

    /// <summary>
    /// Before an attribute name: end of the tag, self-closing slash or the name start.
    /// </summary>
    private XmlToken AttributeBeforeName(char c, XmlTagToken tag)
    {
        c = SkipSpaces(c);

        switch (c) {
            case '/':
                return TagSelfClosing(GetNext(), tag);
            case '>':
                return tag;
            case EndOfFile:
                throw XmlError(XmlParseError.EOF);
        }

        if (c.IsXmlNameStart()) {
            _stringBuffer.Clear();
            _stringBuffer.Append(c);
            return AttributeName(GetNext(), tag);
        }

        throw XmlError(XmlParseError.XmlInvalidAttribute);
    }

    /// <summary>
    /// Completes an attribute name, registers it on the tag and expects the '='.
    /// </summary>
    private XmlToken AttributeName(char c, XmlTagToken tag)
    {
        while (c.IsXmlName()) {
            _stringBuffer.Append(c);
            c = GetNext();
        }

        string name = _stringBuffer.ToString();

        // NOTE(review): the uniqueness check only fires when the earlier attribute has
        // a non-empty value - presumably GetAttribute returns the stored value; verify
        // whether a duplicate of an attribute with an empty value should be rejected too.
        if (!string.IsNullOrEmpty(tag.GetAttribute(name))) {
            throw XmlError(XmlParseError.XmlUniqueAttribute);
        }

        tag.AddAttribute(name);
        c = SkipSpaces(c);

        if (c == '=') {
            return AttributeBeforeValue(GetNext(), tag);
        }

        throw XmlError(XmlParseError.XmlInvalidAttribute);
    }

    /// <summary>
    /// Expects (after optional spaces) the opening quote of the attribute value.
    /// </summary>
    private XmlToken AttributeBeforeValue(char c, XmlTagToken tag)
    {
        c = SkipSpaces(c);

        if (c == '"' || c == '\'') {
            _stringBuffer.Clear();
            return AttributeValue(GetNext(), c, tag);
        }

        throw XmlError(XmlParseError.XmlInvalidAttribute);
    }

    /// <summary>
    /// Reads the attribute value up to the closing quote; references are resolved via
    /// <c>GetEntity</c> and a literal '&lt;' is rejected.
    /// </summary>
    /// <param name="c">The first character of the value.</param>
    /// <param name="q">The quote character that opened the value.</param>
    /// <param name="tag">The tag token that receives the value.</param>
    private XmlToken AttributeValue(char c, char q, XmlTagToken tag)
    {
        while (c != q) {
            switch (c) {
                case EndOfFile:
                    throw XmlError(XmlParseError.EOF);
                case '&':
                    _stringBuffer.Append(CharacterReference(GetNext()).GetEntity());
                    break;
                case '<':
                    throw XmlError(XmlParseError.XmlLtInAttributeValue);
                default:
                    _stringBuffer.Append(c);
                    break;
            }

            c = GetNext();
        }

        tag.SetAttributeValue(_stringBuffer.ToString());
        return AttributeAfterValue(GetNext(), tag);
    }

    /// <summary>
    /// After an attribute value: a space (more attributes may follow), '/' or '&gt;'.
    /// </summary>
    private XmlToken AttributeAfterValue(char c, XmlTagToken tag)
    {
        if (c.IsSpaceCharacter()) {
            return AttributeBeforeName(GetNext(), tag);
        }

        if (c == '/') {
            return TagSelfClosing(GetNext(), tag);
        }

        if (c == '>') {
            return tag;
        }

        throw XmlError(XmlParseError.XmlInvalidAttribute);
    }

    /// <summary>
    /// Starts a processing instruction; the target has to begin with a name start character.
    /// </summary>
    private XmlToken ProcessingStart(char c)
    {
        if (c.IsXmlNameStart()) {
            _stringBuffer.Clear();
            _stringBuffer.Append(c);
            return ProcessingTarget(GetNext(), XmlToken.Processing());
        }

        throw XmlError(XmlParseError.XmlInvalidPI);
    }

    /// <summary>
    /// Completes the target of a processing instruction. A target equal to the xml tag
    /// name (ignoring case) is rejected.
    /// </summary>
    /// <param name="c">The current character.</param>
    /// <param name="pi">The processing instruction token to fill.</param>
    private XmlToken ProcessingTarget(char c, XmlPIToken pi)
    {
        while (c.IsXmlName()) {
            _stringBuffer.Append(c);
            c = GetNext();
        }

        pi.Target = _stringBuffer.ToString();
        _stringBuffer.Clear();

        if (string.Equals(pi.Target, Tags.Xml, StringComparison.OrdinalIgnoreCase)) {
            throw XmlError(XmlParseError.XmlInvalidPI);
        }

        if (c == '?') {
            c = GetNext();

            if (c == '>') {
                return pi;
            }
        } else if (c.IsSpaceCharacter()) {
            return ProcessingContent(GetNext(), pi);
        }

        throw XmlError(XmlParseError.XmlInvalidPI);
    }

    /// <summary>
    /// Collects the content of a processing instruction up to the closing "?&gt;".
    /// </summary>
    /// <param name="c">The first character of the content.</param>
    /// <param name="pi">The processing instruction token to fill.</param>
    private XmlToken ProcessingContent(char c, XmlPIToken pi)
    {
        while (true) {
            if (c == EndOfFile) {
                throw XmlError(XmlParseError.EOF);
            }

            if (c == '?') {
                c = GetNext();

                if (c == '>') {
                    pi.Content = _stringBuffer.ToString();
                    return pi;
                }

                // A lone '?' belongs to the content; the character behind it is
                // examined by the next iteration.
                _stringBuffer.Append('?');
            } else {
                _stringBuffer.Append(c);
                c = GetNext();
            }
        }
    }

    /// <summary>
    /// Starts a comment with an empty buffer.
    /// </summary>
    /// <param name="c">The first character after "&lt;!--".</param>
    private XmlToken CommentStart(char c)
    {
        _stringBuffer.Clear();
        return Comment(c);
    }

    /// <summary>
    /// Collects the comment content. A single '-' is part of the content, whereas "--"
    /// has to be followed directly by '&gt;'.
    /// </summary>
    /// <param name="c">The current character.</param>
    private XmlToken Comment(char c)
    {
        while (c.IsXmlChar()) {
            if (c == '-') {
                c = GetNext();

                if (c == '-') {
                    return CommentEnd(GetNext());
                }

                // Keep the single hyphen (previously it was silently dropped); the
                // character behind it is validated by the next iteration.
                _stringBuffer.Append('-');
                continue;
            }

            _stringBuffer.Append(c);
            c = GetNext();
        }

        throw XmlError(XmlParseError.XmlInvalidComment);
    }

    /// <summary>
    /// After "--" inside a comment: only '&gt;' is allowed.
    /// </summary>
    private XmlToken CommentEnd(char c)
    {
        if (c == '>') {
            return XmlToken.Comment(_stringBuffer.ToString());
        }

        throw XmlError(XmlParseError.XmlInvalidComment);
    }
}
}