AngleSharp by Florian Rappl

<PackageReference Include="AngleSharp" Version="0.8.1" />

 CssTokenizer

sealed class CssTokenizer : BaseTokenizer
The CSS tokenizer. See http://dev.w3.org/csswg/css-syntax/#tokenization for more details.
using AngleSharp.Css;
using AngleSharp.Extensions;
using System.Collections.Generic;
using System.Diagnostics;
using System.Globalization;

namespace AngleSharp.Parser.Css
{
    /// <summary>
    /// The CSS tokenizer. Turns the characters of a text source into a
    /// sequence of <see cref="CssToken"/> instances.
    /// See http://dev.w3.org/csswg/css-syntax/#tokenization for more details.
    /// </summary>
    /// <remarks>
    /// Cursor convention used throughout this class (derived from how the
    /// inherited GetNext / GetPrevious / Back / Advance helpers are used
    /// here): the cursor rests on the last consumed character, GetNext()
    /// moves forward and returns the new character, GetPrevious() and
    /// Back() move backward. Every method therefore returns with the cursor
    /// on the final character that belongs to the token it produced.
    /// </remarks>
    [DebuggerStepThrough]
    internal sealed class CssTokenizer : BaseTokenizer
    {
        /// <summary>
        /// Sentinel signalling that the input is exhausted.
        /// NOTE(review): the literal was lost in the decompiled text (it
        /// appeared as an empty char literal, which does not compile). It is
        /// restored here as char.MaxValue; this MUST match the value that
        /// BaseTokenizer.GetNext() yields at the end of the source - confirm
        /// against the project's end-of-file constant.
        /// </summary>
        private const char EndOfFile = char.MaxValue;

        /// <summary>Maximum number of hex digits in an escape or a unicode-range segment.</summary>
        private const int MaxHexDigits = 6;

        /// <summary>U+FFFD, substituted for escapes that are not valid Unicode scalar values.</summary>
        private const int ReplacementCodePoint = 0xFFFD;

        private bool _ignoreWs;
        private bool _ignoreCs;

        /// <summary>
        /// Gets or sets whether whitespace runs are skipped instead of being
        /// reported as whitespace tokens.
        /// </summary>
        public bool IgnoreWhitespace
        {
            get { return _ignoreWs; }
            set { _ignoreWs = value; }
        }

        /// <summary>
        /// Gets or sets whether the CDO / CDC markers (&lt;!-- and --&gt;) are
        /// skipped instead of being reported as tokens. Slash-star comments
        /// are always skipped regardless of this flag.
        /// </summary>
        public bool IgnoreComments
        {
            get { return _ignoreCs; }
            set { _ignoreCs = value; }
        }

        /// <summary>
        /// Gets the lazily produced token stream. Enumeration stops once the
        /// end of the input is reached.
        /// </summary>
        public IEnumerable<CssToken> Tokens
        {
            get
            {
                CssToken token = Data(GetNext());

                while (token != null)
                {
                    yield return token;
                    token = Data(GetNext());
                }
            }
        }

        /// <summary>
        /// Creates a new tokenizer reading from the given source.
        /// </summary>
        /// <param name="source">The source that is handed to the base tokenizer.</param>
        public CssTokenizer(TextSource source)
            : base(source)
        {
        }

        /// <summary>
        /// The entry state: dispatches on the current character.
        /// </summary>
        /// <param name="current">The character the cursor rests on.</param>
        /// <returns>The next token, or null when the input is exhausted.</returns>
        private CssToken Data(char current)
        {
            switch (current)
            {
                case '\t':
                case '\n':
                case '\r':
                case ' ':
                    do
                    {
                        current = GetNext();
                    }
                    while (current.IsSpaceCharacter());

                    if (_ignoreWs)
                    {
                        return Data(current);
                    }

                    Back();
                    return CssSpecialCharacter.Whitespace;

                case '"':
                    return StringDQ();

                case '#':
                    return HashStart();

                case '$':
                    return MatchOrDelim(CssMatchToken.Suffix);

                case '\'':
                    return StringSQ();

                case '(':
                    return CssBracketToken.OpenRound;

                case ')':
                    return CssBracketToken.CloseRound;

                case '*':
                    return MatchOrDelim(CssMatchToken.Substring);

                case '+':
                {
                    char first = GetNext();

                    if (first != EndOfFile)
                    {
                        // Peek two characters ahead, then rewind onto the sign.
                        char second = GetNext();
                        Back(2);

                        if (first.IsDigit() || (first == '.' && second.IsDigit()))
                        {
                            return NumberStart(current);
                        }
                    }
                    else
                    {
                        Back();
                    }

                    return CssToken.Delim(current);
                }

                case ',':
                    return CssSpecialCharacter.Comma;

                case '.':
                    if (GetNext().IsDigit())
                    {
                        return NumberStart(GetPrevious());
                    }

                    return CssToken.Delim(GetPrevious());

                case '-':
                {
                    char first = GetNext();

                    if (first != EndOfFile)
                    {
                        // Peek two characters ahead, then rewind onto the minus.
                        char second = GetNext();
                        Back(2);

                        if (first.IsDigit() || (first == '.' && second.IsDigit()))
                        {
                            return NumberStart(current);
                        }

                        if (first.IsNameStart())
                        {
                            return IdentStart(current);
                        }

                        if (first == '\\' && !second.IsLineBreak() && second != EndOfFile)
                        {
                            return IdentStart(current);
                        }

                        if (first == '-' && second == '>')
                        {
                            // The CDC marker "-->".
                            Advance(2);

                            if (_ignoreCs)
                            {
                                return Data(GetNext());
                            }

                            return CssCommentToken.Close;
                        }
                    }
                    else
                    {
                        Back();
                    }

                    return CssToken.Delim(current);
                }

                case '/':
                    current = GetNext();

                    if (current == '*')
                    {
                        return Comment();
                    }

                    return CssToken.Delim(GetPrevious());

                case '\\':
                    current = GetNext();

                    if (current.IsLineBreak() || current == EndOfFile)
                    {
                        RaiseErrorOccurred(current != EndOfFile ? ErrorCode.LineBreakUnexpected : ErrorCode.EOF);
                        return CssToken.Delim(GetPrevious());
                    }

                    return IdentStart(GetPrevious());

                case ':':
                    return CssSpecialCharacter.Colon;

                case ';':
                    return CssSpecialCharacter.Semicolon;

                case '<':
                    // Only the full CDO marker "<!--" is special; on any
                    // mismatch every look-ahead step is undone one by one.
                    current = GetNext();

                    if (current == '!')
                    {
                        current = GetNext();

                        if (current == '-')
                        {
                            current = GetNext();

                            if (current == '-')
                            {
                                if (_ignoreCs)
                                {
                                    return Data(GetNext());
                                }

                                return CssCommentToken.Open;
                            }

                            current = GetPrevious();
                        }

                        current = GetPrevious();
                    }

                    return CssToken.Delim(GetPrevious());

                case '@':
                    return AtKeywordStart();

                case '[':
                    return CssBracketToken.OpenSquare;

                case ']':
                    return CssBracketToken.CloseSquare;

                case '^':
                    return MatchOrDelim(CssMatchToken.Prefix);

                case '{':
                    return CssBracketToken.OpenCurly;

                case '}':
                    return CssBracketToken.CloseCurly;

                case '0':
                case '1':
                case '2':
                case '3':
                case '4':
                case '5':
                case '6':
                case '7':
                case '8':
                case '9':
                    return NumberStart(current);

                case 'U':
                case 'u':
                    current = GetNext();

                    if (current == '+')
                    {
                        current = GetNext();

                        if (current.IsHex() || current == '?')
                        {
                            return UnicodeRange(current);
                        }

                        current = GetPrevious();
                    }

                    return IdentStart(GetPrevious());

                case '|':
                    current = GetNext();

                    switch (current)
                    {
                        case '=':
                            return CssMatchToken.Dash;
                        case '|':
                            return CssColumnToken.Instance;
                        default:
                            return CssToken.Delim(GetPrevious());
                    }

                case '~':
                    return MatchOrDelim(CssMatchToken.Include);

                case EndOfFile:
                    return null;

                case '!':
                    return MatchOrDelim(CssMatchToken.Not);

                default:
                    if (current.IsNameStart())
                    {
                        return IdentStart(current);
                    }

                    return CssToken.Delim(current);
            }
        }

        /// <summary>
        /// Handles the two-character match operators ($=, *=, ^=, ~=, !=):
        /// yields <paramref name="match"/> if the next character is '=',
        /// otherwise rewinds and yields the current character as delimiter.
        /// </summary>
        private CssToken MatchOrDelim(CssToken match)
        {
            if (GetNext() == '=')
            {
                return match;
            }

            return CssToken.Delim(GetPrevious());
        }

        /// <summary>
        /// Reads a string that was opened with a double quotation mark.
        /// </summary>
        private CssToken StringDQ()
        {
            return StringLiteral('"');
        }

        /// <summary>
        /// Reads a string that was opened with a single quotation mark.
        /// </summary>
        private CssToken StringSQ()
        {
            return StringLiteral('\'');
        }

        /// <summary>
        /// Reads a quoted string up to the matching quote or the end of the
        /// input. An unescaped line feed or form feed ends the string early
        /// and flags it as bad.
        /// </summary>
        /// <param name="quote">The quote character that terminates the string.</param>
        private CssToken StringLiteral(char quote)
        {
            while (true)
            {
                char next = GetNext();

                if (next == EndOfFile || next == quote)
                {
                    return CssStringToken.Plain(FlushBuffer(), false);
                }

                // Fix: the second line-break case was a plain space, which
                // broke every string containing a blank. The form feed is
                // the other newline code point of CSS.
                if (next == '\n' || next == '\f')
                {
                    RaiseErrorOccurred(ErrorCode.LineBreakUnexpected);
                    Back();
                    return CssStringToken.Plain(FlushBuffer(), true);
                }

                if (next != '\\')
                {
                    _stringBuffer.Append(next);
                    continue;
                }

                next = GetNext();

                if (next.IsLineBreak())
                {
                    _stringBuffer.AppendLine();
                }
                else if (next == EndOfFile)
                {
                    RaiseErrorOccurred(ErrorCode.EOF);
                    Back();
                    return CssStringToken.Plain(FlushBuffer(), true);
                }
                else
                {
                    _stringBuffer.Append(ConsumeEscape(next));
                }
            }
        }

        /// <summary>
        /// Called after '#': decides between a hash token and a plain delimiter.
        /// </summary>
        private CssToken HashStart()
        {
            char next = GetNext();

            if (next.IsNameStart())
            {
                _stringBuffer.Append(next);
                return HashRest();
            }

            if (IsValidEscape(next))
            {
                next = GetNext();
                _stringBuffer.Append(ConsumeEscape(next));
                return HashRest();
            }

            if (next == '\\')
            {
                // A backslash that does not start a valid escape.
                RaiseErrorOccurred(ErrorCode.InvalidCharacter);
            }

            Back();
            return CssToken.Delim('#');
        }

        /// <summary>
        /// Reads the remaining name characters of a hash token.
        /// </summary>
        private CssToken HashRest()
        {
            char next = GetNext();

            while (true)
            {
                if (next.IsName())
                {
                    _stringBuffer.Append(next);
                }
                else if (IsValidEscape(next))
                {
                    next = GetNext();
                    _stringBuffer.Append(ConsumeEscape(next));
                }
                else
                {
                    break;
                }

                next = GetNext();
            }

            if (next == '\\')
            {
                RaiseErrorOccurred(ErrorCode.InvalidCharacter);
            }

            Back();
            return CssKeywordToken.Hash(FlushBuffer());
        }

        /// <summary>
        /// Skips a slash-star comment and continues with the token after it.
        /// Reaching the end of the input inside the comment raises an error.
        /// </summary>
        private CssToken Comment()
        {
            char current = GetNext();

            while (true)
            {
                if (current == EndOfFile)
                {
                    RaiseErrorOccurred(ErrorCode.EOF);
                    return Data(current);
                }

                if (current == '*')
                {
                    current = GetNext();

                    if (current == '/')
                    {
                        return Data(GetNext());
                    }

                    // Fix: re-examine the character after '*'. Previously it
                    // was dropped, so a comment ending in "**/" never closed.
                    continue;
                }

                current = GetNext();
            }
        }

        /// <summary>
        /// Called after '@': decides between an at-keyword and a plain delimiter.
        /// </summary>
        private CssToken AtKeywordStart()
        {
            char next = GetNext();

            if (next == '-')
            {
                next = GetNext();

                if (next.IsNameStart() || IsValidEscape(next))
                {
                    _stringBuffer.Append('-');
                    return AtKeywordRest(next);
                }

                Back(2);
                return CssToken.Delim('@');
            }

            if (next.IsNameStart())
            {
                _stringBuffer.Append(next);
                return AtKeywordRest(GetNext());
            }

            if (IsValidEscape(next))
            {
                next = GetNext();
                _stringBuffer.Append(ConsumeEscape(next));
                return AtKeywordRest(GetNext());
            }

            Back();
            return CssToken.Delim('@');
        }

        /// <summary>
        /// Reads the remaining name characters of an at-keyword.
        /// </summary>
        /// <param name="current">The first character to examine.</param>
        private CssToken AtKeywordRest(char current)
        {
            while (true)
            {
                if (current.IsName())
                {
                    _stringBuffer.Append(current);
                }
                else if (IsValidEscape(current))
                {
                    current = GetNext();
                    _stringBuffer.Append(ConsumeEscape(current));
                }
                else
                {
                    break;
                }

                current = GetNext();
            }

            Back();
            return CssKeywordToken.At(FlushBuffer());
        }

        /// <summary>
        /// Starts an identifier at the given character (a minus, a name-start
        /// character or a backslash). Falls back to <see cref="Data"/> if the
        /// character cannot start an identifier.
        /// </summary>
        /// <param name="current">The character the cursor rests on.</param>
        private CssToken IdentStart(char current)
        {
            if (current == '-')
            {
                current = GetNext();

                if (current.IsNameStart() || IsValidEscape(current))
                {
                    _stringBuffer.Append('-');
                    return IdentRest(current);
                }

                Back();
                return CssToken.Delim('-');
            }

            if (current.IsNameStart())
            {
                _stringBuffer.Append(current);
                return IdentRest(GetNext());
            }

            if (current == '\\' && IsValidEscape(current))
            {
                current = GetNext();
                _stringBuffer.Append(ConsumeEscape(current));
                return IdentRest(GetNext());
            }

            return Data(current);
        }

        /// <summary>
        /// Reads the remaining name characters of an identifier. A directly
        /// following '(' turns it into a function token, or into a URL-like
        /// token for the url, domain and url-prefix function names.
        /// </summary>
        /// <param name="current">The first character to examine.</param>
        private CssToken IdentRest(char current)
        {
            while (true)
            {
                if (current.IsName())
                {
                    _stringBuffer.Append(current);
                }
                else if (IsValidEscape(current))
                {
                    current = GetNext();
                    _stringBuffer.Append(ConsumeEscape(current));
                }
                else
                {
                    break;
                }

                current = GetNext();
            }

            if (current != '(')
            {
                Back();
                return CssKeywordToken.Ident(FlushBuffer());
            }

            string name = _stringBuffer.ToString().ToLowerInvariant();

            if (name == FunctionNames.Url)
            {
                _stringBuffer.Clear();
                return UrlStart(CssTokenType.Url);
            }

            if (name == FunctionNames.Domain)
            {
                _stringBuffer.Clear();
                return UrlStart(CssTokenType.Domain);
            }

            if (name == FunctionNames.Url_Prefix)
            {
                _stringBuffer.Clear();
                return UrlStart(CssTokenType.UrlPrefix);
            }

            return CssKeywordToken.Function(FlushBuffer());
        }

        /// <summary>
        /// Skips whitespace after a name and yields a function token if a
        /// '(' follows, otherwise an identifier token.
        /// NOTE(review): nothing inside this class calls this method.
        /// </summary>
        /// <param name="current">Overwritten before first use.</param>
        private CssToken TransformFunctionWhitespace(char current)
        {
            do
            {
                current = GetNext();

                if (current == '(')
                {
                    Back();
                    return CssKeywordToken.Function(FlushBuffer());
                }
            }
            while (current.IsSpaceCharacter());

            Back(2);
            return CssKeywordToken.Ident(FlushBuffer());
        }

        /// <summary>
        /// Starts a number at a sign, a dot or a digit. The callers in this
        /// class only invoke it after look-ahead confirmed that a digit
        /// follows a sign or dot.
        /// </summary>
        /// <param name="current">The character the cursor rests on.</param>
        private CssToken NumberStart(char current)
        {
            while (true)
            {
                if (current == '+' || current == '-')
                {
                    _stringBuffer.Append(current);
                    current = GetNext();

                    if (current == '.')
                    {
                        _stringBuffer.Append(current);
                        _stringBuffer.Append(GetNext());
                        return NumberFraction();
                    }

                    _stringBuffer.Append(current);
                    return NumberRest();
                }

                if (current == '.')
                {
                    _stringBuffer.Append(current);
                    _stringBuffer.Append(GetNext());
                    return NumberFraction();
                }

                if (current.IsDigit())
                {
                    _stringBuffer.Append(current);
                    return NumberRest();
                }

                current = GetNext();
            }
        }

        /// <summary>
        /// Reads the integer digits of a number and whatever follows them
        /// (fraction, exponent, percent sign or unit).
        /// </summary>
        private CssToken NumberRest()
        {
            char next = GetNext();

            while (next.IsDigit())
            {
                _stringBuffer.Append(next);
                next = GetNext();
            }

            if (next == '.')
            {
                next = GetNext();

                if (next.IsDigit())
                {
                    _stringBuffer.Append('.').Append(next);
                    return NumberFraction();
                }

                // Fix: rewind past BOTH the look-ahead character and the dot
                // (as NumberDash does); a single step swallowed the dot.
                Back(2);
                return new CssNumberToken(FlushBuffer());
            }

            return NumberSuffix(next);
        }

        /// <summary>
        /// Reads the fractional digits of a number and whatever follows them
        /// (exponent, percent sign or unit).
        /// </summary>
        private CssToken NumberFraction()
        {
            char next = GetNext();

            while (next.IsDigit())
            {
                _stringBuffer.Append(next);
                next = GetNext();
            }

            return NumberSuffix(next);
        }

        /// <summary>
        /// Decides how a number ends, given the first non-digit character.
        /// </summary>
        /// <param name="next">The character right after the digits; the cursor rests on it.</param>
        private CssToken NumberSuffix(char next)
        {
            // Fix: the exponent check has to come before the name-start
            // check. 'e' and 'E' are name-start characters, so the exponent
            // branch was unreachable. NumberExponential falls back to a
            // dimension when no exponent digits follow (e.g. "em", "ex").
            if (next == 'e' || next == 'E')
            {
                return NumberExponential();
            }

            if (next.IsNameStart())
            {
                string number = FlushBuffer();
                _stringBuffer.Append(next);
                return Dimension(number);
            }

            if (IsValidEscape(next))
            {
                next = GetNext();
                string number = FlushBuffer();
                _stringBuffer.Append(ConsumeEscape(next));
                return Dimension(number);
            }

            switch (next)
            {
                case '%':
                    return CssUnitToken.Percentage(FlushBuffer());
                case '-':
                    return NumberDash();
                default:
                    Back();
                    return new CssNumberToken(FlushBuffer());
            }
        }

        /// <summary>
        /// Reads the unit of a dimension. The unit's first character is
        /// already in the buffer.
        /// </summary>
        /// <param name="number">The numeric part of the dimension.</param>
        private CssToken Dimension(string number)
        {
            while (true)
            {
                char next = GetNext();

                if (next.IsLetter())
                {
                    _stringBuffer.Append(next);
                }
                else if (IsValidEscape(next))
                {
                    next = GetNext();
                    _stringBuffer.Append(ConsumeEscape(next));
                }
                else
                {
                    break;
                }
            }

            Back();
            return CssUnitToken.Dimension(number, FlushBuffer());
        }

        /// <summary>
        /// Reads the remaining exponent digits of a number in scientific notation.
        /// </summary>
        private CssToken SciNotation()
        {
            char next = GetNext();

            while (next.IsDigit())
            {
                _stringBuffer.Append(next);
                next = GetNext();
            }

            Back();
            return new CssNumberToken(FlushBuffer());
        }

        /// <summary>
        /// Called after the opening parenthesis of a URL-like function:
        /// skips leading spaces and picks the quoted or unquoted reader.
        /// </summary>
        /// <param name="type">The token type to produce.</param>
        private CssToken UrlStart(CssTokenType type)
        {
            char current = SkipSpaces();

            switch (current)
            {
                case EndOfFile:
                    RaiseErrorOccurred(ErrorCode.EOF);
                    return CssStringToken.Url(type, string.Empty, true);
                case '"':
                    return UrlDQ(type);
                case '\'':
                    return UrlSQ(type);
                case ')':
                    return CssStringToken.Url(type, string.Empty, false);
                default:
                    return UrlUQ(current, type);
            }
        }

        /// <summary>
        /// Reads a URL value enclosed in double quotation marks.
        /// </summary>
        private CssToken UrlDQ(CssTokenType type)
        {
            return UrlQuoted(type, '"');
        }

        /// <summary>
        /// Reads a URL value enclosed in single quotation marks.
        /// </summary>
        private CssToken UrlSQ(CssTokenType type)
        {
            return UrlQuoted(type, '\'');
        }

        /// <summary>
        /// Reads a quoted URL value up to the closing quote. A raw line
        /// break switches to bad-URL recovery.
        /// </summary>
        /// <param name="type">The token type to produce.</param>
        /// <param name="quote">The quote character that terminates the value.</param>
        private CssToken UrlQuoted(CssTokenType type, char quote)
        {
            while (true)
            {
                char next = GetNext();

                if (next.IsLineBreak())
                {
                    RaiseErrorOccurred(ErrorCode.LineBreakUnexpected);
                    return UrlBad(type);
                }

                if (next == EndOfFile)
                {
                    return CssStringToken.Url(type, FlushBuffer(), false);
                }

                if (next == quote)
                {
                    return UrlEnd(type);
                }

                if (next != '\\')
                {
                    _stringBuffer.Append(next);
                    continue;
                }

                next = GetNext();

                if (next == EndOfFile)
                {
                    Back(2);
                    RaiseErrorOccurred(ErrorCode.EOF);
                    return CssStringToken.Url(type, FlushBuffer(), true);
                }

                if (next.IsLineBreak())
                {
                    _stringBuffer.AppendLine();
                }
                else
                {
                    _stringBuffer.Append(ConsumeEscape(next));
                }
            }
        }

        /// <summary>
        /// Reads an unquoted URL value. Quotes, an opening parenthesis,
        /// non-printable characters and invalid escapes switch to bad-URL
        /// recovery.
        /// </summary>
        /// <param name="current">The first character of the value.</param>
        /// <param name="type">The token type to produce.</param>
        private CssToken UrlUQ(char current, CssTokenType type)
        {
            while (true)
            {
                if (current.IsSpaceCharacter())
                {
                    return UrlEnd(type);
                }

                if (current == EndOfFile || current == ')')
                {
                    return CssStringToken.Url(type, FlushBuffer(), false);
                }

                if (current == '"' || current == '\'' || current == '(' || current.IsNonPrintable())
                {
                    RaiseErrorOccurred(ErrorCode.InvalidCharacter);
                    return UrlBad(type);
                }

                if (current == '\\')
                {
                    if (!IsValidEscape(current))
                    {
                        RaiseErrorOccurred(ErrorCode.InvalidCharacter);
                        return UrlBad(type);
                    }

                    current = GetNext();
                    _stringBuffer.Append(ConsumeEscape(current));
                }
                else
                {
                    _stringBuffer.Append(current);
                }

                current = GetNext();
            }
        }

        /// <summary>
        /// Expects optional spaces followed by the closing parenthesis of a
        /// URL; anything else switches to bad-URL recovery.
        /// </summary>
        private CssToken UrlEnd(CssTokenType type)
        {
            char next;

            do
            {
                next = GetNext();

                if (next == ')')
                {
                    return CssStringToken.Url(type, FlushBuffer(), false);
                }
            }
            while (next.IsSpaceCharacter());

            RaiseErrorOccurred(ErrorCode.InvalidCharacter);
            Back();
            return UrlBad(type);
        }

        /// <summary>
        /// Recovery state for a malformed URL: consumes input up to the
        /// closing parenthesis or the end of the input and yields a token
        /// flagged as bad.
        /// </summary>
        private CssToken UrlBad(CssTokenType type)
        {
            while (true)
            {
                char next = GetNext();

                if (next == EndOfFile)
                {
                    RaiseErrorOccurred(ErrorCode.EOF);
                    return CssStringToken.Url(type, FlushBuffer(), true);
                }

                if (next == ')')
                {
                    return CssStringToken.Url(type, FlushBuffer(), true);
                }

                if (IsValidEscape(next))
                {
                    next = GetNext();
                    _stringBuffer.Append(ConsumeEscape(next));
                }
            }
        }

        /// <summary>
        /// Reads a unicode-range after "U+": up to six hex digits, optionally
        /// padded with '?' wildcards to six characters in total, or followed
        /// by '-' and up to six hex digits for an explicit end.
        /// </summary>
        /// <remarks>
        /// Fixes over the previous version: the wildcard loop re-evaluated
        /// its bound against the growing buffer and therefore stopped after
        /// half of the permitted '?' characters; loops that ran to completion
        /// left one extra character consumed; and an explicit "-end" part was
        /// only recognised after exactly six start digits, so common values
        /// such as U+0025-00FF lost their end.
        /// </remarks>
        /// <param name="current">The first hex digit or '?' of the range.</param>
        private CssToken UnicodeRange(char current)
        {
            while (_stringBuffer.Length < MaxHexDigits && current.IsHex())
            {
                _stringBuffer.Append(current);
                current = GetNext();
            }

            int hexDigits = _stringBuffer.Length;
            bool hasWildcard = false;

            while (_stringBuffer.Length < MaxHexDigits && current == '?')
            {
                hasWildcard = true;
                _stringBuffer.Append(current);
                current = GetNext();
            }

            // The cursor now rests on the first character that does not
            // belong to the first segment.
            if (hasWildcard)
            {
                Back();
                string pattern = FlushBuffer();
                return new CssRangeToken(pattern.Replace('?', '0'), pattern.Replace('?', 'F'));
            }

            if (current == '-')
            {
                current = GetNext();

                if (current.IsHex())
                {
                    string start = FlushBuffer();

                    while (_stringBuffer.Length < MaxHexDigits && current.IsHex())
                    {
                        _stringBuffer.Append(current);
                        current = GetNext();
                    }

                    Back();
                    return new CssRangeToken(start, FlushBuffer());
                }

                // The minus does not introduce an end value: rewind past the
                // look-ahead character and the minus itself.
                Back(2);
            }
            else
            {
                Back();
            }

            string single = FlushBuffer();

            // Kept from the previous version for compatibility: a complete
            // six-digit value is reported with a null end, a shorter one
            // with an end equal to its start.
            return new CssRangeToken(single, hexDigits == MaxHexDigits ? null : single);
        }

        /// <summary>
        /// Returns the content of the shared string buffer and empties it.
        /// </summary>
        private string FlushBuffer()
        {
            string content = _stringBuffer.ToString();
            _stringBuffer.Clear();
            return content;
        }

        /// <summary>
        /// Called with the cursor on an 'e' or 'E' that follows a number.
        /// Reads an exponent if digits (optionally signed) follow; otherwise
        /// treats the letter as the beginning of a unit.
        /// </summary>
        private CssToken NumberExponential()
        {
            char next = GetNext();

            if (next.IsDigit())
            {
                _stringBuffer.Append('e').Append(next);
                return SciNotation();
            }

            if (next == '+' || next == '-')
            {
                char sign = next;
                next = GetNext();

                if (next.IsDigit())
                {
                    _stringBuffer.Append('e').Append(sign).Append(next);
                    return SciNotation();
                }

                Back();
            }

            // No exponent: step back onto the letter and use it as the
            // first character of the unit.
            next = GetPrevious();
            string number = FlushBuffer();
            _stringBuffer.Append(next);
            return Dimension(number);
        }

        /// <summary>
        /// Called with the cursor on a '-' that follows a number. The minus
        /// starts a unit if a name-start character or a valid escape
        /// follows; otherwise it is left for the next token.
        /// </summary>
        private CssToken NumberDash()
        {
            char next = GetNext();

            if (next.IsNameStart())
            {
                string number = FlushBuffer();
                _stringBuffer.Append('-').Append(next);
                return Dimension(number);
            }

            if (IsValidEscape(next))
            {
                next = GetNext();
                string number = FlushBuffer();
                _stringBuffer.Append('-').Append(ConsumeEscape(next));
                return Dimension(number);
            }

            Back(2);
            return new CssNumberToken(FlushBuffer());
        }

        /// <summary>
        /// Resolves the escape whose first character after the backslash is
        /// <paramref name="current"/>. Up to six hex digits form a code
        /// point (one directly following space is consumed as terminator);
        /// any other character stands for itself.
        /// </summary>
        /// <param name="current">The character right after the backslash.</param>
        /// <returns>The text the escape stands for.</returns>
        private string ConsumeEscape(char current)
        {
            if (!current.IsHex())
            {
                return current.ToString();
            }

            char[] digits = new char[MaxHexDigits];
            int count = 0;

            do
            {
                digits[count++] = current;
                current = GetNext();
            }
            while (count < MaxHexDigits && current.IsHex());

            if (current != ' ')
            {
                Back();
            }

            int codePoint = int.Parse(new string(digits, 0, count), NumberStyles.HexNumber, CultureInfo.InvariantCulture);

            // Fix: char.ConvertFromUtf32 throws for surrogate code points and
            // for values beyond U+10FFFF, which let input such as "\D800" or
            // "\110000" crash the tokenizer. Substitute U+FFFD instead.
            bool isSurrogate = codePoint >= 0xD800 && codePoint <= 0xDFFF;

            if (isSurrogate || codePoint > 0x10FFFF)
            {
                codePoint = ReplacementCodePoint;
            }

            return char.ConvertFromUtf32(codePoint);
        }

        /// <summary>
        /// Checks whether <paramref name="current"/> is a backslash that is
        /// followed by neither a line break nor the end of the input. The
        /// cursor is left where it was.
        /// </summary>
        /// <param name="current">The character the cursor rests on.</param>
        private bool IsValidEscape(char current)
        {
            if (current != '\\')
            {
                return false;
            }

            char following = GetNext();
            Back();

            return following != EndOfFile && !following.IsLineBreak();
        }
    }
}