AngleSharp by Florian Rappl

<PackageReference Include="AngleSharp" Version="0.7.0" />

 CssTokenizer

sealed class CssTokenizer : BaseTokenizer
The CSS tokenizer. See http://dev.w3.org/csswg/css-syntax/#tokenization for more details.
using AngleSharp.Css;
using AngleSharp.Extensions;
using System.Collections.Generic;
using System.Diagnostics;
using System.Globalization;

namespace AngleSharp.Parser.Css
{
    /// <summary>
    /// The CSS tokenizer: reads characters from the underlying text source and
    /// groups them into CSS tokens.
    /// See http://dev.w3.org/csswg/css-syntax/#tokenization for more details.
    /// </summary>
    /// <remarks>
    /// Position convention used by every state method below (inferred from how the
    /// methods use the inherited Next / Previous / Back members): when a token is
    /// returned, the reader rests on the LAST character that belongs to that token,
    /// so the following read of <c>base.Next</c> yields the first unconsumed character.
    /// Look-ahead characters are therefore always undone with Back / Previous.
    /// </remarks>
    [DebuggerStepThrough]
    internal sealed class CssTokenizer : BaseTokenizer
    {
        // NOTE(review): in the reviewed text the end-of-input sentinel showed up as an
        // empty character literal (''), which does not compile - the character itself
        // was evidently lost. 0x1A (SUB) is ASSUMED here; confirm that this is the value
        // BaseTokenizer.Next yields once the source is exhausted before relying on it.
        private const char EndOfFile = (char)0x1a;

        // NOTE(review): the string states originally listed a plain space next to '\n'
        // as an "unexpected line break", which would turn every string containing a
        // space into a bad string. Form feed is assumed to be the intended character.
        private const char FormFeed = '\f';

        // Maximum number of hex digits in an escape and in each half of a unicode range.
        private const int MaxHexDigits = 6;

        // U+FFFD, substituted for escapes that do not denote a usable code point.
        private const string ReplacementCharacter = "\uFFFD";

        private bool _ignoreWs;

        private bool _ignoreCs;

        /// <summary>
        /// Gets or sets whether whitespace tokens are skipped instead of returned.
        /// </summary>
        public bool IgnoreWhitespace
        {
            get { return _ignoreWs; }
            set { _ignoreWs = value; }
        }

        /// <summary>
        /// Gets or sets whether the SGML comment delimiters (&lt;!-- and --&gt;) are
        /// skipped instead of returned as tokens. (CSS comments of the form
        /// /* ... */ are always skipped, independent of this flag.)
        /// </summary>
        public bool IgnoreComments
        {
            get { return _ignoreCs; }
            set { _ignoreCs = value; }
        }

        /// <summary>
        /// Gets the lazily evaluated token stream. The enumeration ends when the
        /// data state returns null, i.e. when the end of the input is reached.
        /// </summary>
        public IEnumerable<CssToken> Tokens
        {
            get
            {
                while (true)
                {
                    char current = base.Next;
                    CssToken token = Data(current);

                    if (token == null)
                    {
                        break;
                    }

                    yield return token;
                }
            }
        }

        /// <summary>
        /// Creates a new tokenizer reading from the given source.
        /// </summary>
        /// <param name="source">The text source handed to the base tokenizer.</param>
        public CssTokenizer(ITextSource source)
            : base(source)
        {
        }

        /// <summary>
        /// The data state: dispatches on the first character of a token.
        /// </summary>
        /// <param name="current">The character the reader currently rests on.</param>
        /// <returns>The next token, or null at the end of the input.</returns>
        private CssToken Data(char current)
        {
            switch (current)
            {
                case '\t':
                case '\n':
                case '\r':
                case ' ':
                    // Collapse a whole run of whitespace into one token.
                    do
                    {
                        current = base.Next;
                    }
                    while (current.IsSpaceCharacter());

                    if (_ignoreWs)
                    {
                        return Data(current);
                    }

                    Back();
                    return CssSpecialCharacter.Whitespace;

                case '"':
                    return StringDQ(base.Next);

                case '#':
                    return HashStart(base.Next);

                case '$':
                    current = base.Next;

                    if (current == '=')
                    {
                        return CssMatchToken.Suffix;
                    }

                    return CssToken.Delim(base.Previous);

                case '\'':
                    return StringSQ(base.Next);

                case '(':
                    return CssBracketToken.OpenRound;

                case ')':
                    return CssBracketToken.CloseRound;

                case '*':
                    current = base.Next;

                    if (current == '=')
                    {
                        return CssMatchToken.Substring;
                    }

                    return CssToken.Delim(base.Previous);

                case '+':
                {
                    char first = base.Next;

                    if (first == EndOfFile)
                    {
                        Back();
                    }
                    else
                    {
                        // Two characters of look-ahead, undone before deciding.
                        char second = base.Next;
                        Back(2);

                        if (first.IsDigit() || (first == '.' && second.IsDigit()))
                        {
                            return NumberStart(current);
                        }
                    }

                    return CssToken.Delim(current);
                }

                case ',':
                    return CssSpecialCharacter.Comma;

                case '.':
                {
                    char next = base.Next;

                    if (next.IsDigit())
                    {
                        return NumberStart(base.Previous);
                    }

                    return CssToken.Delim(base.Previous);
                }

                case '-':
                {
                    char first = base.Next;

                    if (first == EndOfFile)
                    {
                        Back();
                    }
                    else
                    {
                        // Two characters of look-ahead, undone before deciding.
                        char second = base.Next;
                        Back(2);

                        if (first.IsDigit() || (first == '.' && second.IsDigit()))
                        {
                            return NumberStart(current);
                        }

                        if (first.IsNameStart())
                        {
                            return IdentStart(current);
                        }

                        if (first == '\\' && !second.IsLineBreak() && second != EndOfFile)
                        {
                            return IdentStart(current);
                        }

                        if (first == '-' && second == '>')
                        {
                            // "-->" : closing SGML comment delimiter.
                            Advance(2);

                            if (_ignoreCs)
                            {
                                return Data(base.Next);
                            }

                            return CssCommentToken.Close;
                        }
                    }

                    return CssToken.Delim(current);
                }

                case '/':
                    current = base.Next;

                    if (current == '*')
                    {
                        return Comment(base.Next);
                    }

                    return CssToken.Delim(base.Previous);

                case '\\':
                    current = base.Next;

                    if (current.IsLineBreak() || current == EndOfFile)
                    {
                        RaiseErrorOccurred(current == EndOfFile ? ErrorCode.EOF : ErrorCode.LineBreakUnexpected);
                        return CssToken.Delim(base.Previous);
                    }

                    return IdentStart(base.Previous);

                case ':':
                    return CssSpecialCharacter.Colon;

                case ';':
                    return CssSpecialCharacter.Semicolon;

                case '<':
                    // "<!--" : opening SGML comment delimiter. Each failed step walks
                    // the reader back by one so that it ends up on '<' again.
                    current = base.Next;

                    if (current == '!')
                    {
                        current = base.Next;

                        if (current == '-')
                        {
                            current = base.Next;

                            if (current == '-')
                            {
                                if (_ignoreCs)
                                {
                                    return Data(base.Next);
                                }

                                return CssCommentToken.Open;
                            }

                            current = base.Previous;
                        }

                        current = base.Previous;
                    }

                    return CssToken.Delim(base.Previous);

                case '@':
                    return AtKeywordStart(base.Next);

                case '[':
                    return CssBracketToken.OpenSquare;

                case ']':
                    return CssBracketToken.CloseSquare;

                case '^':
                    current = base.Next;

                    if (current == '=')
                    {
                        return CssMatchToken.Prefix;
                    }

                    return CssToken.Delim(base.Previous);

                case '{':
                    return CssBracketToken.OpenCurly;

                case '}':
                    return CssBracketToken.CloseCurly;

                case '0':
                case '1':
                case '2':
                case '3':
                case '4':
                case '5':
                case '6':
                case '7':
                case '8':
                case '9':
                    return NumberStart(current);

                case 'U':
                case 'u':
                    // "U+" followed by a hex digit or '?' starts a unicode range,
                    // anything else is an ordinary identifier starting with u / U.
                    current = base.Next;

                    if (current == '+')
                    {
                        current = base.Next;

                        if (current.IsHex() || current == '?')
                        {
                            return UnicodeRange(current);
                        }

                        current = base.Previous;
                    }

                    return IdentStart(base.Previous);

                case '|':
                    current = base.Next;

                    switch (current)
                    {
                        case '=':
                            return CssMatchToken.Dash;
                        case '|':
                            return CssToken.Column;
                        default:
                            return CssToken.Delim(base.Previous);
                    }

                case '~':
                    current = base.Next;

                    if (current == '=')
                    {
                        return CssMatchToken.Include;
                    }

                    return CssToken.Delim(base.Previous);

                case EndOfFile:
                    return null;

                case '!':
                    current = base.Next;

                    if (current == '=')
                    {
                        return CssMatchToken.Not;
                    }

                    return CssToken.Delim(base.Previous);

                default:
                    if (current.IsNameStart())
                    {
                        return IdentStart(current);
                    }

                    return CssToken.Delim(current);
            }
        }

        /// <summary>
        /// The double quoted string state.
        /// </summary>
        /// <param name="current">The first character after the opening quote.</param>
        /// <returns>A plain string token (flagged as bad on an unexpected line break or EOF after a backslash).</returns>
        private CssToken StringDQ(char current)
        {
            return QuotedString(current, '"');
        }

        /// <summary>
        /// The single quoted string state.
        /// </summary>
        /// <param name="current">The first character after the opening quote.</param>
        /// <returns>A plain string token (flagged as bad on an unexpected line break or EOF after a backslash).</returns>
        private CssToken StringSQ(char current)
        {
            return QuotedString(current, '\'');
        }

        /// <summary>
        /// Shared implementation of both string states; only the closing quote differs.
        /// </summary>
        private CssToken QuotedString(char current, char quote)
        {
            while (true)
            {
                if (current == EndOfFile || current == quote)
                {
                    return CssStringToken.Plain(FlushBuffer(), false);
                }

                if (current == '\n' || current == FormFeed)
                {
                    // An unescaped line break ends the string as a bad string; the
                    // line break itself is left for the next token.
                    RaiseErrorOccurred(ErrorCode.LineBreakUnexpected);
                    Back();
                    return CssStringToken.Plain(FlushBuffer(), true);
                }

                if (current == '\\')
                {
                    current = base.Next;

                    if (current.IsLineBreak())
                    {
                        // NOTE(review): CSS Syntax drops an escaped line break from the
                        // string value; the existing behavior (append a line break) is
                        // kept here - confirm whether that is intended.
                        _stringBuffer.AppendLine();
                    }
                    else if (current == EndOfFile)
                    {
                        RaiseErrorOccurred(ErrorCode.EOF);
                        Back();
                        return CssStringToken.Plain(FlushBuffer(), true);
                    }
                    else
                    {
                        _stringBuffer.Append(ConsumeEscape(current));
                    }
                }
                else
                {
                    _stringBuffer.Append(current);
                }

                current = base.Next;
            }
        }

        /// <summary>
        /// State after a '#': decides between a hash token and a plain '#' delimiter.
        /// </summary>
        /// <param name="current">The character following the '#'.</param>
        private CssToken HashStart(char current)
        {
            if (current.IsNameStart())
            {
                _stringBuffer.Append(current);
                return HashRest(base.Next);
            }

            if (IsValidEscape(current))
            {
                current = base.Next;
                _stringBuffer.Append(ConsumeEscape(current));
                return HashRest(base.Next);
            }

            if (current == '\\')
            {
                // A backslash that does not start a valid escape.
                RaiseErrorOccurred(ErrorCode.InvalidCharacter);
            }

            Back();
            return CssToken.Delim('#');
        }

        /// <summary>
        /// Consumes the remainder of a hash token.
        /// </summary>
        /// <param name="current">The character following the first name character.</param>
        private CssToken HashRest(char current)
        {
            current = ConsumeName(current);

            if (current == '\\')
            {
                // A backslash that does not start a valid escape.
                RaiseErrorOccurred(ErrorCode.InvalidCharacter);
            }

            Back();
            return CssKeywordToken.Hash(FlushBuffer());
        }

        /// <summary>
        /// The comment state: skips everything up to and including the closing "*/"
        /// and then continues with the data state.
        /// </summary>
        /// <param name="current">The first character after the opening "/*".</param>
        /// <returns>The token following the comment, or null at the end of the input.</returns>
        private CssToken Comment(char current)
        {
            while (true)
            {
                if (current == EndOfFile)
                {
                    RaiseErrorOccurred(ErrorCode.EOF);
                    return Data(current);
                }

                if (current == '*')
                {
                    current = base.Next;

                    if (current == '/')
                    {
                        return Data(base.Next);
                    }

                    // The look-ahead character must be examined itself: it may be
                    // another '*' (as in "**/") or the end of the input. Skipping it
                    // made comments ending in "**/" run to the end of the file.
                    continue;
                }

                current = base.Next;
            }
        }

        /// <summary>
        /// State after an '@': decides between an at-keyword and a plain '@' delimiter.
        /// </summary>
        /// <param name="current">The character following the '@'.</param>
        private CssToken AtKeywordStart(char current)
        {
            if (current == '-')
            {
                current = base.Next;

                if (current.IsNameStart() || IsValidEscape(current))
                {
                    _stringBuffer.Append('-');
                    return AtKeywordRest(current);
                }

                // Undo both look-ahead characters so the '-' is tokenized on its own.
                Back(2);
                return CssToken.Delim('@');
            }

            if (current.IsNameStart())
            {
                _stringBuffer.Append(current);
                return AtKeywordRest(base.Next);
            }

            if (IsValidEscape(current))
            {
                current = base.Next;
                _stringBuffer.Append(ConsumeEscape(current));
                return AtKeywordRest(base.Next);
            }

            Back();
            return CssToken.Delim('@');
        }

        /// <summary>
        /// Consumes the remainder of an at-keyword.
        /// </summary>
        /// <param name="current">The next character of the keyword name.</param>
        private CssToken AtKeywordRest(char current)
        {
            ConsumeName(current);
            Back();
            return CssKeywordToken.At(FlushBuffer());
        }

        /// <summary>
        /// State at the (possible) start of an identifier.
        /// </summary>
        /// <param name="current">The first character of the potential identifier.</param>
        private CssToken IdentStart(char current)
        {
            if (current == '-')
            {
                current = base.Next;

                if (current.IsNameStart() || IsValidEscape(current))
                {
                    _stringBuffer.Append('-');
                    return IdentRest(current);
                }

                Back();
                return CssToken.Delim('-');
            }

            if (current.IsNameStart())
            {
                _stringBuffer.Append(current);
                return IdentRest(base.Next);
            }

            // IsValidEscape already requires the character to be a backslash.
            if (IsValidEscape(current))
            {
                current = base.Next;
                _stringBuffer.Append(ConsumeEscape(current));
                return IdentRest(base.Next);
            }

            return Data(current);
        }

        /// <summary>
        /// Consumes the remainder of an identifier and classifies it as identifier,
        /// function, or one of the url-like functions (url, domain, url-prefix).
        /// </summary>
        /// <param name="current">The next character of the identifier.</param>
        private CssToken IdentRest(char current)
        {
            current = ConsumeName(current);

            if (current == '(')
            {
                string name = _stringBuffer.ToString().ToLowerInvariant();

                if (name == FunctionNames.Url)
                {
                    _stringBuffer.Clear();
                    return UrlStart(base.Next, CssTokenType.Url);
                }

                if (name == FunctionNames.Domain)
                {
                    _stringBuffer.Clear();
                    return UrlStart(base.Next, CssTokenType.Domain);
                }

                if (name == FunctionNames.Url_Prefix)
                {
                    _stringBuffer.Clear();
                    return UrlStart(base.Next, CssTokenType.UrlPrefix);
                }

                return CssKeywordToken.Function(FlushBuffer());
            }

            Back();
            return CssKeywordToken.Ident(FlushBuffer());
        }

        /// <summary>
        /// Skips whitespace after a name and yields a function token if a '(' follows,
        /// otherwise an identifier token. Not called from within this class.
        /// </summary>
        /// <param name="current">Overwritten by the first read; its incoming value is not used.</param>
        private CssToken TransformFunctionWhitespace(char current)
        {
            do
            {
                current = base.Next;

                if (current == '(')
                {
                    Back();
                    return CssKeywordToken.Function(FlushBuffer());
                }
            }
            while (current.IsSpaceCharacter());

            Back(2);
            return CssKeywordToken.Ident(FlushBuffer());
        }

        /// <summary>
        /// State at the start of a number: handles an optional sign and a leading dot.
        /// </summary>
        /// <param name="current">A sign, a dot or a digit (the data state only enters with these).</param>
        private CssToken NumberStart(char current)
        {
            while (true)
            {
                if (current == '+' || current == '-')
                {
                    _stringBuffer.Append(current);
                    current = base.Next;

                    if (current == '.')
                    {
                        // Sign, dot, and the digit the data state already verified.
                        _stringBuffer.Append(current);
                        _stringBuffer.Append(base.Next);
                        return NumberFraction(base.Next);
                    }

                    _stringBuffer.Append(current);
                    return NumberRest(base.Next);
                }

                if (current == '.')
                {
                    _stringBuffer.Append(current);
                    _stringBuffer.Append(base.Next);
                    return NumberFraction(base.Next);
                }

                if (current.IsDigit())
                {
                    _stringBuffer.Append(current);
                    return NumberRest(base.Next);
                }

                // Any other character is skipped until a number start shows up.
                current = base.Next;
            }
        }

        /// <summary>
        /// Consumes the integer part of a number and decides how the token continues
        /// (fraction, exponent, percentage, dimension or plain number).
        /// </summary>
        /// <param name="current">The character following the digits consumed so far.</param>
        private CssToken NumberRest(char current)
        {
            while (current.IsDigit())
            {
                _stringBuffer.Append(current);
                current = base.Next;
            }

            if (current.IsNameStart())
            {
                string number = FlushBuffer();
                _stringBuffer.Append(current);
                return Dimension(base.Next, number);
            }

            if (IsValidEscape(current))
            {
                // The unit starts with an escape.
                current = base.Next;
                string number = FlushBuffer();
                _stringBuffer.Append(ConsumeEscape(current));
                return Dimension(base.Next, number);
            }

            switch (current)
            {
                case '.':
                    current = base.Next;

                    if (current.IsDigit())
                    {
                        _stringBuffer.Append('.').Append(current);
                        return NumberFraction(base.Next);
                    }

                    // Both the '.' and the character after it are look-ahead; stepping
                    // back only once silently dropped the '.' from the token stream
                    // (compare NumberDash, which undoes two characters as well).
                    Back(2);
                    return CssToken.Number(FlushBuffer());

                case '%':
                    return CssUnitToken.Percentage(FlushBuffer());

                case 'E':
                case 'e':
                    return NumberExponential(current);

                case '-':
                    return NumberDash(current);

                default:
                    Back();
                    return CssToken.Number(FlushBuffer());
            }
        }

        /// <summary>
        /// Consumes the fractional part of a number and decides how the token
        /// continues (exponent, percentage, dimension or plain number).
        /// </summary>
        /// <param name="current">The character following the digits consumed so far.</param>
        private CssToken NumberFraction(char current)
        {
            while (current.IsDigit())
            {
                _stringBuffer.Append(current);
                current = base.Next;
            }

            if (current.IsNameStart())
            {
                string number = FlushBuffer();
                _stringBuffer.Append(current);
                return Dimension(base.Next, number);
            }

            if (IsValidEscape(current))
            {
                // The unit starts with an escape.
                current = base.Next;
                string number = FlushBuffer();
                _stringBuffer.Append(ConsumeEscape(current));
                return Dimension(base.Next, number);
            }

            switch (current)
            {
                case 'E':
                case 'e':
                    return NumberExponential(current);

                case '%':
                    return CssUnitToken.Percentage(FlushBuffer());

                case '-':
                    return NumberDash(current);

                default:
                    Back();
                    return CssToken.Number(FlushBuffer());
            }
        }

        /// <summary>
        /// Consumes the unit of a dimension token.
        /// </summary>
        /// <param name="current">The next character of the unit name.</param>
        /// <param name="number">The textual number that precedes the unit.</param>
        private CssToken Dimension(char current, string number)
        {
            ConsumeName(current);
            Back();
            return CssUnitToken.Dimension(number, FlushBuffer());
        }

        /// <summary>
        /// Consumes the remaining digits of an exponent and yields the number token.
        /// </summary>
        /// <param name="current">The character following the first exponent digit.</param>
        private CssToken SciNotation(char current)
        {
            while (current.IsDigit())
            {
                _stringBuffer.Append(current);
                current = base.Next;
            }

            Back();
            return CssToken.Number(FlushBuffer());
        }

        /// <summary>
        /// State after "url(" (or domain( / url-prefix(): skips leading whitespace and
        /// dispatches on the kind of argument.
        /// </summary>
        /// <param name="current">The first character after the opening parenthesis.</param>
        /// <param name="type">The token type to create.</param>
        private CssToken UrlStart(char current, CssTokenType type)
        {
            while (current.IsSpaceCharacter())
            {
                current = base.Next;
            }

            switch (current)
            {
                case EndOfFile:
                    RaiseErrorOccurred(ErrorCode.EOF);
                    return CssStringToken.Url(type, string.Empty, true);

                case '"':
                    return UrlDQ(base.Next, type);

                case '\'':
                    return UrlSQ(base.Next, type);

                case ')':
                    return CssStringToken.Url(type, string.Empty, false);

                default:
                    return UrlUQ(current, type);
            }
        }

        /// <summary>
        /// The double quoted url state.
        /// </summary>
        /// <param name="current">The first character after the opening quote.</param>
        /// <param name="type">The token type to create.</param>
        private CssToken UrlDQ(char current, CssTokenType type)
        {
            return QuotedUrl(current, type, '"');
        }

        /// <summary>
        /// The single quoted url state.
        /// </summary>
        /// <param name="current">The first character after the opening quote.</param>
        /// <param name="type">The token type to create.</param>
        private CssToken UrlSQ(char current, CssTokenType type)
        {
            return QuotedUrl(current, type, '\'');
        }

        /// <summary>
        /// Shared implementation of both quoted url states; only the closing quote differs.
        /// </summary>
        private CssToken QuotedUrl(char current, CssTokenType type, char quote)
        {
            while (true)
            {
                if (current.IsLineBreak())
                {
                    RaiseErrorOccurred(ErrorCode.LineBreakUnexpected);
                    return UrlBad(base.Next, type);
                }

                if (current == EndOfFile)
                {
                    return CssStringToken.Url(type, FlushBuffer(), false);
                }

                if (current == quote)
                {
                    return UrlEnd(base.Next, type);
                }

                if (current == '\\')
                {
                    current = base.Next;

                    if (current == EndOfFile)
                    {
                        Back(2);
                        RaiseErrorOccurred(ErrorCode.EOF);
                        return CssStringToken.Url(type, FlushBuffer(), true);
                    }

                    if (current.IsLineBreak())
                    {
                        _stringBuffer.AppendLine();
                    }
                    else
                    {
                        _stringBuffer.Append(ConsumeEscape(current));
                    }
                }
                else
                {
                    _stringBuffer.Append(current);
                }

                current = base.Next;
            }
        }

        /// <summary>
        /// The unquoted url state.
        /// </summary>
        /// <param name="current">The first character of the unquoted url.</param>
        /// <param name="type">The token type to create.</param>
        private CssToken UrlUQ(char current, CssTokenType type)
        {
            while (true)
            {
                if (current.IsSpaceCharacter())
                {
                    return UrlEnd(base.Next, type);
                }

                if (current == EndOfFile || current == ')')
                {
                    return CssStringToken.Url(type, FlushBuffer(), false);
                }

                bool forbidden = current == '"' || current == '\'' || current == '(' || current.IsNonPrintable();

                // Quotes, '(' and non-printable characters are not allowed in an
                // unquoted url, and neither is a backslash that starts no valid escape.
                if (forbidden || (current == '\\' && !IsValidEscape(current)))
                {
                    RaiseErrorOccurred(ErrorCode.InvalidCharacter);
                    return UrlBad(base.Next, type);
                }

                if (current == '\\')
                {
                    current = base.Next;
                    _stringBuffer.Append(ConsumeEscape(current));
                }
                else
                {
                    _stringBuffer.Append(current);
                }

                current = base.Next;
            }
        }

        /// <summary>
        /// State after the url content: only whitespace may precede the closing ')'.
        /// </summary>
        /// <param name="current">The first character after the url content.</param>
        /// <param name="type">The token type to create.</param>
        private CssToken UrlEnd(char current, CssTokenType type)
        {
            while (true)
            {
                if (current == ')')
                {
                    return CssStringToken.Url(type, FlushBuffer(), false);
                }

                if (!current.IsSpaceCharacter())
                {
                    break;
                }

                current = base.Next;
            }

            RaiseErrorOccurred(ErrorCode.InvalidCharacter);
            return UrlBad(current, type);
        }

        /// <summary>
        /// The bad url state: consumes everything up to the closing ')' or the end of
        /// the input and yields a url token flagged as bad.
        /// </summary>
        /// <param name="current">The character at which the url turned bad (or the one after it).</param>
        /// <param name="type">The token type to create.</param>
        private CssToken UrlBad(char current, CssTokenType type)
        {
            while (true)
            {
                if (current == EndOfFile)
                {
                    RaiseErrorOccurred(ErrorCode.EOF);
                    return CssStringToken.Url(type, FlushBuffer(), true);
                }

                if (current == ')')
                {
                    return CssStringToken.Url(type, FlushBuffer(), true);
                }

                if (IsValidEscape(current))
                {
                    // An escaped ')' must not end the url.
                    current = base.Next;
                    _stringBuffer.Append(ConsumeEscape(current));
                }

                current = base.Next;
            }
        }

        /// <summary>
        /// The unicode range state (entered after "U+"). Accepts up to six hex digits,
        /// optionally padded with '?' wildcards to at most six characters, or - when
        /// no wildcard was used - an explicit "-end" part of up to six hex digits.
        /// </summary>
        /// <param name="current">The first hex digit or '?' after "U+".</param>
        private CssToken UnicodeRange(char current)
        {
            int digits = 0;

            while (digits < MaxHexDigits && current.IsHex())
            {
                _stringBuffer.Append(current);
                current = base.Next;
                digits++;
            }

            // The wildcard budget is fixed up front. Re-evaluating it against the
            // growing buffer (as before) accepted only half of the allowed '?'.
            int wildcards = 0;

            while (digits + wildcards < MaxHexDigits && current == '?')
            {
                _stringBuffer.Append(current);
                current = base.Next;
                wildcards++;
            }

            if (wildcards > 0)
            {
                // current is look-ahead in every case - always hand it back.
                Back();
                string pattern = FlushBuffer();
                return CssToken.Range(pattern.Replace('?', '0'), pattern.Replace('?', 'F'));
            }

            if (current == '-')
            {
                current = base.Next;

                if (current.IsHex())
                {
                    // Explicit range; valid for any start length, not only six digits.
                    string start = FlushBuffer();

                    for (int i = 0; i < MaxHexDigits && current.IsHex(); i++)
                    {
                        _stringBuffer.Append(current);
                        current = base.Next;
                    }

                    Back();
                    return CssToken.Range(start, FlushBuffer());
                }

                // Neither the '-' nor the character after it belongs to the range.
                Back(2);
            }
            else
            {
                Back();
            }

            // Single code point. The end argument is passed exactly as before: null
            // for a six digit value, a copy of the start value otherwise.
            string single = FlushBuffer();
            return CssToken.Range(single, digits == MaxHexDigits ? null : single);
        }

        /// <summary>
        /// Returns the content of the string buffer and empties the buffer.
        /// </summary>
        private string FlushBuffer()
        {
            string content = _stringBuffer.ToString();
            _stringBuffer.Clear();
            return content;
        }

        /// <summary>
        /// State after an 'e' / 'E' following a number: either an exponent follows
        /// or the letter is the start of a unit.
        /// </summary>
        /// <param name="current">The 'e' / 'E' that was read; overwritten by the first read.</param>
        private CssToken NumberExponential(char current)
        {
            current = base.Next;

            if (current.IsDigit())
            {
                _stringBuffer.Append('e').Append(current);
                return SciNotation(base.Next);
            }

            if (current == '+' || current == '-')
            {
                char sign = current;
                current = base.Next;

                if (current.IsDigit())
                {
                    _stringBuffer.Append('e').Append(sign).Append(current);
                    return SciNotation(base.Next);
                }

                Back();
            }

            // No exponent: step back onto the 'e' / 'E' and treat it as the first
            // character of a unit.
            current = base.Previous;
            string number = FlushBuffer();
            _stringBuffer.Append(current);
            return Dimension(base.Next, number);
        }

        /// <summary>
        /// State after a '-' following a number: the dash may start a unit
        /// (e.g. "-x"), otherwise the number ends before it.
        /// </summary>
        /// <param name="current">The '-' that was read; overwritten by the first read.</param>
        private CssToken NumberDash(char current)
        {
            current = base.Next;

            if (current.IsNameStart())
            {
                string number = FlushBuffer();
                _stringBuffer.Append('-').Append(current);
                return Dimension(base.Next, number);
            }

            if (IsValidEscape(current))
            {
                current = base.Next;
                string number = FlushBuffer();
                _stringBuffer.Append('-').Append(ConsumeEscape(current));
                return Dimension(base.Next, number);
            }

            // Undo the '-' and the character after it.
            Back(2);
            return CssToken.Number(FlushBuffer());
        }

        /// <summary>
        /// Appends name characters and valid escapes to the string buffer for as long
        /// as they continue.
        /// </summary>
        /// <param name="current">The first candidate character.</param>
        /// <returns>The first character that is not part of the name; the reader rests on it.</returns>
        private char ConsumeName(char current)
        {
            while (true)
            {
                if (current.IsName())
                {
                    _stringBuffer.Append(current);
                }
                else if (IsValidEscape(current))
                {
                    current = base.Next;
                    _stringBuffer.Append(ConsumeEscape(current));
                }
                else
                {
                    return current;
                }

                current = base.Next;
            }
        }

        /// <summary>
        /// Consumes an escaped code point.
        /// </summary>
        /// <param name="current">The character following the backslash.</param>
        /// <returns>
        /// The escaped character itself for a non-hex escape; for a hex escape (one to
        /// six digits) the denoted code point as a string, or U+FFFD if the value is
        /// zero, a surrogate or larger than 0x10FFFF.
        /// </returns>
        private string ConsumeEscape(char current)
        {
            if (!current.IsHex())
            {
                return current.ToString();
            }

            char[] digits = new char[MaxHexDigits];
            int count = 0;

            do
            {
                digits[count++] = current;
                current = base.Next;
            }
            while (count < digits.Length && current.IsHex());

            // A single whitespace character directly after a hex escape terminates the
            // escape and belongs to it (CSS Syntax, "consume an escaped code point").
            // Any other character is look-ahead and is handed back.
            // NOTE(review): IsSpaceCharacter is the predicate the data state uses for
            // whitespace runs; confirm it also covers line breaks.
            if (!current.IsSpaceCharacter())
            {
                current = base.Previous;
            }

            int codePoint = int.Parse(new string(digits, 0, count), NumberStyles.HexNumber, CultureInfo.InvariantCulture);
            bool isSurrogate = codePoint >= 0xD800 && codePoint <= 0xDFFF;

            // char.ConvertFromUtf32 throws for surrogates and for values above 0x10FFFF;
            // style sheet content must not be able to crash the tokenizer.
            if (codePoint == 0 || codePoint > 0x10FFFF || isSurrogate)
            {
                return ReplacementCharacter;
            }

            return char.ConvertFromUtf32(codePoint);
        }

        /// <summary>
        /// Checks whether the given character starts a valid escape: a backslash that
        /// is followed by neither a line break nor the end of the input. The reader
        /// position is unchanged afterwards.
        /// </summary>
        /// <param name="current">The character the reader currently rests on.</param>
        private bool IsValidEscape(char current)
        {
            if (current != '\\')
            {
                return false;
            }

            // Peek one character ahead.
            current = base.Next;
            Back();

            return current != EndOfFile && !current.IsLineBreak();
        }
    }
}