// XmlParser
// For more details, see the W3C Recommendation:
// http://www.w3.org/TR/REC-xml/
// and, for a little bit about the XML parser in the XHTML context:
// http://www.w3.org/html/wg/drafts/html/master/the-xhtml-syntax.html#xml-parser
using AngleSharp.Dom;
using AngleSharp.Dom.Xml;
using AngleSharp.Extensions;
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Globalization;
using System.IO;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
namespace AngleSharp.Parser.Xml
{
[DebuggerStepThrough]
public sealed class XmlParser
{
// Tokenizer that produces the XML tokens consumed by the tree construction.
private readonly XmlTokenizer _tokenizer;
// Document that is being populated; also exposed through Result.
private readonly Document _document;
// Elements that have been opened but not yet closed; the last entry is the innermost one.
private readonly List<Element> _openElements;
// Lock object guarding the one-time start of the parsing process.
private readonly object _syncGuard;
// Set to true once Parse or ParseAsync has started the parsing process.
private bool _started;
// Current tree construction mode; selects the handler used by Consume.
private XmlTreeMode _currentMode;
// Task of the asynchronous parse; remains null unless ParseAsync started the parsing.
private Task<IDocument> _parsing;
// Standalone flag copied from the XML declaration token (false until one is seen).
private bool _standalone;
/// <summary>
/// Gets the node new content is appended to: the innermost open element,
/// or the document itself when no element is currently open.
/// </summary>
internal Node CurrentNode {
    get {
        int innermost = _openElements.Count - 1;
        if (innermost < 0)
            return _document;
        return _openElements[innermost];
    }
}
/// <summary>
/// Gets the document that is being populated by this parser.
/// </summary>
public IDocument Result => _document;
/// <summary>
/// Gets the standalone flag read from the XML declaration; false if no
/// declaration has been processed.
/// </summary>
public bool IsStandalone => _standalone;
/// <summary>
/// Gets whether an asynchronous parsing task exists, i.e. whether the
/// parsing was started through ParseAsync.
/// </summary>
public bool IsAsync => _parsing != null;
/// <summary>
/// Creates a new XML parser that reads from the given source code string.
/// A new XmlDocument is created for the result.
/// </summary>
/// <param name="source">The XML source code to parse.</param>
/// <param name="configuration">[Optional] The configuration passed on to BrowsingContext.New.</param>
public XmlParser(string source, IConfiguration configuration = null)
: this(new XmlDocument(BrowsingContext.New(configuration), new TextSource(source)))
{
}
/// <summary>
/// Creates a new XML parser that reads from the given stream, decoded
/// initially with the configuration's default encoding.
/// A new XmlDocument is created for the result.
/// </summary>
/// <param name="stream">The stream providing the XML source.</param>
/// <param name="configuration">[Optional] The configuration passed on to BrowsingContext.New.</param>
// NOTE(review): configuration defaults to null, yet DefaultEncoding() is invoked on it here.
// Presumably DefaultEncoding is an extension method that tolerates null - confirm.
public XmlParser(Stream stream, IConfiguration configuration = null)
: this(new XmlDocument(BrowsingContext.New(configuration), new TextSource(stream, configuration.DefaultEncoding())))
{
}
/// <summary>
/// Creates a new XML parser that populates the given document, reading
/// the tokens from the document's own text source.
/// </summary>
/// <param name="document">The document to fill with the parsed content.</param>
internal XmlParser(Document document)
{
    // Collaborators: the tokenizer reads from the document's source and
    // reports through the events of the document's options.
    _tokenizer = new XmlTokenizer(document.Source, document.Options.Events);
    _document = document;
    _openElements = new List<Element>();
    _syncGuard = new object();

    // Initial state: nothing started, no declaration seen yet.
    _currentMode = XmlTreeMode.Initial;
    _standalone = false;
    _started = false;
}
/// <summary>
/// Parses the source asynchronously without any possibility of
/// cancellation; shorthand for ParseAsync(CancellationToken.None).
/// </summary>
/// <returns>The task yielding the constructed document.</returns>
public Task<IDocument> ParseAsync() => ParseAsync(CancellationToken.None);
/// <summary>
/// Parses the source asynchronously and creates the document.
/// The parsing is started at most once: repeated calls return the task
/// of the first asynchronous run.
/// </summary>
/// <param name="cancelToken">The token passed on to the asynchronous kernel.</param>
/// <returns>
/// The task yielding the constructed document. Never null: if the
/// parsing was already performed by the synchronous Parse method, an
/// already completed task wrapping the document is returned.
/// </returns>
public Task<IDocument> ParseAsync(CancellationToken cancelToken)
{
    lock (_syncGuard) {
        if (!_started) {
            _started = true;
            _parsing = KernelAsync(cancelToken);
        }

        // _parsing is still null when Parse() was the one that started the
        // work. Parse() holds the same lock while its kernel runs, so once
        // we are in here that synchronous run is over; hand out a completed
        // task instead of a null reference. _parsing itself stays untouched
        // so that IsAsync keeps reporting how the parsing was started.
        return _parsing ?? Task.FromResult<IDocument>(_document);
    }
}
/// <summary>
/// Parses the source synchronously and returns the document.
/// The kernel only runs if neither Parse nor ParseAsync has started the
/// parsing before; otherwise the document is returned as it currently is.
/// </summary>
/// <returns>The document populated by this parser.</returns>
public IDocument Parse()
{
    lock (_syncGuard) {
        bool firstRun = !_started;

        if (firstRun) {
            // Flag first, so that the parsing is never started twice.
            _started = true;
            Kernel();
        }
    }

    return _document;
}
/// <summary>
/// Dispatches the token to the handler of the current tree mode.
/// </summary>
/// <param name="token">The token to consume.</param>
private void Consume(XmlToken token)
{
    // Snapshot the mode: handlers may switch it, but exactly one handler
    // is picked per token.
    var mode = _currentMode;

    if (mode == XmlTreeMode.Initial) {
        Initial(token);
    } else if (mode == XmlTreeMode.Prolog) {
        BeforeDoctype(token);
    } else if (mode == XmlTreeMode.Misc) {
        InMisc(token);
    } else if (mode == XmlTreeMode.Body) {
        InBody(token);
    } else if (mode == XmlTreeMode.After) {
        AfterBody(token);
    }
}
/// <summary>
/// Handles the very first token(s): an XML declaration is evaluated
/// (standalone flag, encoding, version); anything else is passed on to
/// the prolog handling.
/// </summary>
/// <param name="token">The token to process.</param>
/// <remarks>
/// Raises the XmlDeclarationVersionUnsupported parse error if the
/// declared version is not accepted by CheckVersion.
/// </remarks>
private void Initial(XmlToken token)
{
    // The initial mode is left with the first token in every case. This
    // guarantees that at most one XML declaration is accepted; a further
    // declaration ends up in the prolog handling and is rejected there.
    // The mode is advanced before SetEncoding is called, because
    // SetEncoding resets the mode back to Initial when the encoding switch
    // is refused - that reset must not be overwritten.
    _currentMode = XmlTreeMode.Prolog;

    if (token.Type != XmlTokenType.Declaration) {
        BeforeDoctype(token);
        return;
    }

    XmlDeclarationToken declaration = (XmlDeclarationToken)token;
    _standalone = declaration.Standalone;

    if (!declaration.IsEncodingMissing)
        SetEncoding(declaration.Encoding);

    if (!CheckVersion(declaration.Version))
        throw XmlParseError.XmlDeclarationVersionUnsupported.At(token.Position);
}
/// <summary>
/// Handles tokens of the prolog: a doctype token is turned into a
/// document type node appended to the document and switches to the misc
/// mode; every other token is treated by the misc handling.
/// </summary>
/// <param name="token">The token to process.</param>
private void BeforeDoctype(XmlToken token)
{
    if (token.Type != XmlTokenType.Doctype) {
        InMisc(token);
        return;
    }

    XmlDoctypeToken doctype = (XmlDoctypeToken)token;
    DocumentType node = new DocumentType(_document, doctype.Name);
    node.SystemIdentifier = doctype.SystemIdentifier;
    node.PublicIdentifier = doctype.PublicIdentifier;
    _document.AppendChild(node);
    _currentMode = XmlTreeMode.Misc;
}
/// <summary>
/// Handles miscellaneous content: comments and processing instructions
/// are appended to the current node, a start tag switches to the body
/// mode, and any other non-ignorable token raises the XmlMissingRoot
/// parse error.
/// </summary>
/// <param name="token">The token to process.</param>
private void InMisc(XmlToken token)
{
    var type = token.Type;

    if (type == XmlTokenType.Comment) {
        XmlCommentToken comment = (XmlCommentToken)token;
        IComment commentNode = _document.CreateComment(comment.Data);
        CurrentNode.AppendChild(commentNode);
    } else if (type == XmlTokenType.ProcessingInstruction) {
        XmlPIToken instruction = (XmlPIToken)token;
        IProcessingInstruction instructionNode = _document.CreateProcessingInstruction(instruction.Target, instruction.Content);
        CurrentNode.AppendChild(instructionNode);
    } else if (type == XmlTokenType.StartTag) {
        // First element: from here on the body handling takes over.
        _currentMode = XmlTreeMode.Body;
        InBody(token);
    } else if (!token.IsIgnorable) {
        throw XmlParseError.XmlMissingRoot.At(token.Position);
    }
}
/// <summary>
/// Handles tokens inside the root element: builds up the element tree,
/// appends text content and rejects tokens that are not allowed here
/// (end of file, doctype, XML declaration).
/// </summary>
/// <param name="token">The token to process.</param>
private void InBody(XmlToken token)
{
    switch (token.Type) {
        // Tokens that must not show up while elements are still open.
        case XmlTokenType.EndOfFile:
            throw XmlParseError.EOF.At(token.Position);
        case XmlTokenType.Doctype:
            throw XmlParseError.XmlDoctypeAfterContent.At(token.Position);
        case XmlTokenType.Declaration:
            throw XmlParseError.XmlDeclarationMisplaced.At(token.Position);

        // Tree structure.
        case XmlTokenType.StartTag:
            OpenElement((XmlTagToken)token);
            break;
        case XmlTokenType.EndTag:
            CloseElement((XmlTagToken)token);
            break;

        // Comments and processing instructions share the misc handling.
        case XmlTokenType.Comment:
        case XmlTokenType.ProcessingInstruction:
            InMisc(token);
            break;

        // Text content.
        case XmlTokenType.Entity: {
            XmlEntityToken entityToken = (XmlEntityToken)token;
            string resolved = entityToken.GetEntity();
            CurrentNode.AppendText(resolved);
            break;
        }
        case XmlTokenType.CData: {
            XmlCDataToken cdata = (XmlCDataToken)token;
            CurrentNode.AppendText(cdata.Data);
            break;
        }
        case XmlTokenType.Character: {
            XmlCharacterToken character = (XmlCharacterToken)token;
            CurrentNode.AppendText(character.Data.ToString());
            break;
        }

        // Character reference tokens are deliberately left without effect.
        case XmlTokenType.CharacterReference:
            break;
    }
}

/// <summary>
/// Creates the element for a start tag, appends it to the current node
/// and copies the (trimmed) attribute values. A self-closing tag without
/// any open element ends the body.
/// </summary>
private void OpenElement(XmlTagToken tag)
{
    XmlElement element = new XmlElement(_document, tag.Name, null);
    CurrentNode.AppendChild(element);

    if (!tag.IsSelfClosing) {
        _openElements.Add(element);
    } else if (_openElements.Count == 0) {
        // Self-closing root element: the body is already complete.
        _currentMode = XmlTreeMode.After;
    }

    for (int index = 0; index < tag.Attributes.Count; index++) {
        element.SetAttribute(tag.Attributes[index].Key, tag.Attributes[index].Value.Trim());
    }
}

/// <summary>
/// Closes the innermost open element for an end tag; raises the
/// TagClosingMismatch parse error if the names do not match. Closing the
/// last open element ends the body.
/// </summary>
private void CloseElement(XmlTagToken tag)
{
    if (CurrentNode.NodeName != tag.Name)
        throw XmlParseError.TagClosingMismatch.At(tag.Position);

    _openElements.RemoveAt(_openElements.Count - 1);

    if (_openElements.Count == 0)
        _currentMode = XmlTreeMode.After;
}
/// <summary>
/// Handles tokens after the root element has been closed: the end of
/// file is accepted, comments and processing instructions go through the
/// misc handling, and any other non-ignorable token raises the
/// XmlMissingRoot parse error.
/// </summary>
/// <param name="token">The token to process.</param>
private void AfterBody(XmlToken token)
{
    var type = token.Type;

    if (type == XmlTokenType.EndOfFile)
        return;

    if (type == XmlTokenType.Comment || type == XmlTokenType.ProcessingInstruction) {
        InMisc(token);
        return;
    }

    if (!token.IsIgnorable)
        throw XmlParseError.XmlMissingRoot.At(token.Position);
}
/// <summary>
/// Checks whether the given version string denotes a supported XML
/// version, i.e. a number (invariant culture) in the range [1, 2).
/// </summary>
/// <param name="ver">The version string of the XML declaration.</param>
/// <returns>True if the version parses and is at least 1 but below 2, otherwise false.</returns>
private bool CheckVersion(string ver)
{
    double version;
    bool parsed = double.TryParse(ver, NumberStyles.Any, CultureInfo.InvariantCulture, out version);
    return parsed && version >= 1 && version < 2;
}
/// <summary>
/// The synchronous kernel: pulls tokens from the tokenizer and consumes
/// them until the end-of-file token has been consumed.
/// </summary>
private void Kernel()
{
    while (true) {
        XmlToken current = _tokenizer.Get();
        Consume(current);

        // The end-of-file token is consumed as well before stopping.
        if (current.Type == XmlTokenType.EndOfFile)
            break;
    }
}
/// <summary>
/// The asynchronous kernel: same loop as the synchronous one, but asks
/// the source to prefetch more content whenever fewer than 1024
/// characters remain between the current index and the known length.
/// </summary>
/// <param name="cancelToken">The token passed on to the prefetching.</param>
/// <returns>The task yielding the populated document.</returns>
private async Task<IDocument> KernelAsync(CancellationToken cancelToken)
{
    TextSource source = _document.Source;
    bool reachedEnd = false;

    while (!reachedEnd) {
        var remaining = source.Length - source.Index;

        if (remaining < 1024)
            await source.Prefetch(8192, cancelToken).ConfigureAwait(false);

        XmlToken current = _tokenizer.Get();
        Consume(current);
        reachedEnd = current.Type == XmlTokenType.EndOfFile;
    }

    return _document;
}
/// <summary>
/// Applies the encoding named in the XML declaration to the document's
/// source. Unsupported or unresolvable names are ignored. If the source
/// refuses the switch with a NotSupportedException, the parser state is
/// reset so that the tree construction starts over.
/// </summary>
/// <param name="charSet">The name of the encoding to apply.</param>
private void SetEncoding(string charSet)
{
    if (!TextEncoding.IsSupported(charSet))
        return;

    Encoding resolved = TextEncoding.Resolve(charSet);

    if (resolved == null)
        return;

    try {
        _document.Source.CurrentEncoding = resolved;
    } catch (NotSupportedException) {
        // Back to square one: initial mode, no open elements, and the
        // document content replaced (presumably emptied by
        // ReplaceAll(null, true) - confirm against Document).
        _currentMode = XmlTreeMode.Initial;
        _document.ReplaceAll(null, true);
        _openElements.Clear();
    }
}
}
}