// CssTokenizer
// The CSS tokenizer.
// See http://dev.w3.org/csswg/css-syntax/#tokenization for more details.
using AngleSharp.Css;
using AngleSharp.Extensions;
using System.Collections.Generic;
using System.Diagnostics;
using System.Globalization;
namespace AngleSharp.Parser.Css
{
[DebuggerStepThrough]
internal sealed class CssTokenizer : BaseTokenizer
{
// Backing field of IgnoreWhitespace.
private bool _ignoreWs;

// Backing field of IgnoreComments.
private bool _ignoreCs;

/// <summary>
/// Gets or sets whether runs of whitespace are dropped instead of being
/// reported as a whitespace token.
/// </summary>
public bool IgnoreWhitespace
{
    get { return _ignoreWs; }
    set { _ignoreWs = value; }
}

/// <summary>
/// Gets or sets whether the "&lt;!--" and "--&gt;" markers are dropped instead
/// of being reported as comment open / close tokens.
/// </summary>
public bool IgnoreComments
{
    get { return _ignoreCs; }
    set { _ignoreCs = value; }
}
/// <summary>
/// Lazily enumerates the tokens of the source. The sequence ends as soon as
/// the data state yields no token (null).
/// </summary>
public IEnumerable<CssToken> Tokens
{
    get
    {
        CssToken token;

        while ((token = Data(GetNext())) != null)
        {
            yield return token;
        }
    }
}
/// <summary>
/// Creates a CSS tokenizer; the source is handed to the base tokenizer.
/// </summary>
/// <param name="source">The text source to tokenize.</param>
public CssTokenizer(TextSource source) : base(source)
{
}
/// <summary>
/// The data state: dispatches on the current character and returns the next
/// token, or null when the end-of-input label is hit (which ends Tokens).
/// </summary>
/// <param name="current">The character that was just read.</param>
/// <returns>The token that was produced, or null.</returns>
// NOTE(review): the empty character literals ('') in this method are not valid
// C#; presumably a control character (the end-of-input sentinel) was lost when
// this file was extracted - confirm against the upstream source.
private CssToken Data(char current)
{
switch (current) {
// Whitespace: swallow the whole run first.
case '\t':
case '\n':
case '\r':
case ' ':
do {
current = GetNext();
} while (current.IsSpaceCharacter());
// When whitespace is ignored, continue directly with the first non-space character.
if (_ignoreWs)
return Data(current);
// Otherwise un-read the non-space character and report a single whitespace token.
Back();
return CssSpecialCharacter.Whitespace;
case '"':
return StringDQ();
case '#':
return HashStart();
// "$=" is the suffix match; a lone '$' is a delimiter.
case '$':
current = GetNext();
if (current == '=')
return CssMatchToken.Suffix;
return CssToken.Delim(GetPrevious());
case '\'':
return StringSQ();
case '(':
return CssBracketToken.OpenRound;
case ')':
return CssBracketToken.CloseRound;
// "*=" is the substring match; a lone '*' is a delimiter.
case '*':
current = GetNext();
if (current == '=')
return CssMatchToken.Substring;
return CssToken.Delim(GetPrevious());
// '+' starts a number when followed by a digit or by ".digit".
case '+': {
char next = GetNext();
if (next != '') {
char next2 = GetNext();
// Two characters were peeked; rewind both before deciding.
Back(2);
if (next.IsDigit() || (next == '.' && next2.IsDigit()))
return NumberStart(current);
} else
Back();
return CssToken.Delim(current);
}
case ',':
return CssSpecialCharacter.Comma;
// '.' starts a number only when a digit follows.
case '.':
if (GetNext().IsDigit())
return NumberStart(GetPrevious());
return CssToken.Delim(GetPrevious());
// '-' may start a number, an identifier, or the "-->" marker.
case '-': {
char next3 = GetNext();
if (next3 != '') {
char next4 = GetNext();
// Two characters were peeked; rewind both before deciding.
Back(2);
if (next3.IsDigit() || (next3 == '.' && next4.IsDigit()))
return NumberStart(current);
if (next3.IsNameStart())
return IdentStart(current);
// A backslash that is followed by neither a line break nor '' begins an escaped identifier.
if (next3 == '\\' && !next4.IsLineBreak() && next4 != '')
return IdentStart(current);
if (next3 == '-' && next4 == '>') {
// Skip over the remaining "->" of the marker.
Advance(2);
if (_ignoreCs)
return Data(GetNext());
return CssCommentToken.Close;
}
} else
Back();
return CssToken.Delim(current);
}
// "/*" opens a comment; a lone '/' is a delimiter.
case '/':
current = GetNext();
if (current == '*')
return Comment();
return CssToken.Delim(GetPrevious());
// A backslash starts an identifier unless a line break or '' follows, which is reported as an error.
case '\\':
current = GetNext();
if (current.IsLineBreak() || current == '') {
RaiseErrorOccurred((current != '') ? ErrorCode.LineBreakUnexpected : ErrorCode.EOF);
return CssToken.Delim(GetPrevious());
}
return IdentStart(GetPrevious());
case ':':
return CssSpecialCharacter.Colon;
case ';':
return CssSpecialCharacter.Semicolon;
// "<!--" is the comment-open marker; otherwise step back to '<' and emit a delimiter.
case '<':
current = GetNext();
if (current == '!') {
current = GetNext();
if (current == '-') {
current = GetNext();
if (current == '-') {
if (_ignoreCs)
return Data(GetNext());
return CssCommentToken.Open;
}
current = GetPrevious();
}
current = GetPrevious();
}
return CssToken.Delim(GetPrevious());
case '@':
return AtKeywordStart();
case '[':
return CssBracketToken.OpenSquare;
case ']':
return CssBracketToken.CloseSquare;
// "^=" is the prefix match; a lone '^' is a delimiter.
case '^':
current = GetNext();
if (current == '=')
return CssMatchToken.Prefix;
return CssToken.Delim(GetPrevious());
case '{':
return CssBracketToken.OpenCurly;
case '}':
return CssBracketToken.CloseCurly;
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
return NumberStart(current);
// "U+" followed by a hex digit or '?' is a unicode range; otherwise 'u'/'U' starts an identifier.
case 'U':
case 'u':
current = GetNext();
if (current == '+') {
current = GetNext();
if (current.IsHex() || current == '?')
return UnicodeRange(current);
current = GetPrevious();
}
return IdentStart(GetPrevious());
// "|=" is the dash match, "||" the column token; a lone '|' is a delimiter.
case '|':
current = GetNext();
switch (current) {
case '=':
return CssMatchToken.Dash;
case '|':
return CssColumnToken.Instance;
default:
return CssToken.Delim(GetPrevious());
}
// "~=" is the include match; a lone '~' is a delimiter.
case '~':
current = GetNext();
if (current == '=')
return CssMatchToken.Include;
return CssToken.Delim(GetPrevious());
// No token: this is what terminates the Tokens enumeration.
case '':
return null;
// "!=" is the not match; a lone '!' is a delimiter.
case '!':
current = GetNext();
if (current == '=')
return CssMatchToken.Not;
return CssToken.Delim(GetPrevious());
// Anything else is an identifier if it can start a name, otherwise a delimiter.
default:
if (current.IsNameStart())
return IdentStart(current);
return CssToken.Delim(current);
}
}
/// <summary>
/// Reads a double-quoted string; the opening quote has already been consumed.
/// </summary>
/// <returns>
/// A string token with the buffered text. The second argument of
/// CssStringToken.Plain is true when the string was cut short by a line break
/// or by the input ending inside an escape.
/// </returns>
// NOTE(review): the empty literal '' appears twice in this switch (once beside
// '"' and once beside '\n'). As written that is a duplicate label; presumably two
// different control characters were lost in extraction (likely end-of-input and
// form feed) - confirm against the upstream source.
private CssToken StringDQ()
{
while (true) {
char next = GetNext();
switch (next) {
// Closing quote (or the first '' character): the string is complete.
case '':
case '"':
return CssStringToken.Plain(FlushBuffer(), false);
// Unescaped line break: report it, un-read it and return what was read so far.
case '\n':
case '':
RaiseErrorOccurred(ErrorCode.LineBreakUnexpected);
Back();
return CssStringToken.Plain(FlushBuffer(), true);
case '\\':
next = GetNext();
// An escaped line break is stored as a line terminator in the buffer.
if (next.IsLineBreak())
_stringBuffer.AppendLine();
else {
if (next == '') {
RaiseErrorOccurred(ErrorCode.EOF);
Back();
return CssStringToken.Plain(FlushBuffer(), true);
}
_stringBuffer.Append(ConsumeEscape(next));
}
break;
default:
_stringBuffer.Append(next);
break;
}
}
}
/// <summary>
/// Reads a single-quoted string; the opening quote has already been consumed.
/// </summary>
/// <returns>
/// A string token with the buffered text. The second argument of
/// CssStringToken.Plain is true when the string was cut short by a line break
/// or by the input ending inside an escape.
/// </returns>
// NOTE(review): the empty literal '' appears twice in this switch (once beside
// '\'' and once beside '\n'). As written that is a duplicate label; presumably two
// different control characters were lost in extraction (likely end-of-input and
// form feed) - confirm against the upstream source.
private CssToken StringSQ()
{
while (true) {
char next = GetNext();
switch (next) {
// Closing quote (or the first '' character): the string is complete.
case '':
case '\'':
return CssStringToken.Plain(FlushBuffer(), false);
// Unescaped line break: report it, un-read it and return what was read so far.
case '\n':
case '':
RaiseErrorOccurred(ErrorCode.LineBreakUnexpected);
Back();
return CssStringToken.Plain(FlushBuffer(), true);
case '\\':
next = GetNext();
// An escaped line break is stored as a line terminator in the buffer.
if (next.IsLineBreak())
_stringBuffer.AppendLine();
else {
if (next == '') {
RaiseErrorOccurred(ErrorCode.EOF);
Back();
return CssStringToken.Plain(FlushBuffer(), true);
}
_stringBuffer.Append(ConsumeEscape(next));
}
break;
default:
_stringBuffer.Append(next);
break;
}
}
}
/// <summary>
/// Entered after a '#'. Produces a hash token when a name-start character or a
/// valid escape follows; otherwise the '#' is reported as a delimiter.
/// </summary>
private CssToken HashStart()
{
    char first = GetNext();

    if (first.IsNameStart())
    {
        _stringBuffer.Append(first);
        return HashRest();
    }

    if (IsValidEscape(first))
    {
        char escaped = GetNext();
        _stringBuffer.Append(ConsumeEscape(escaped));
        return HashRest();
    }

    // A backslash that does not form a valid escape is additionally reported.
    if (first == '\\')
    {
        RaiseErrorOccurred(ErrorCode.InvalidCharacter);
    }

    // Either way the look-ahead is un-read and '#' stands alone.
    Back();
    return CssToken.Delim('#');
}
/// <summary>
/// Collects the remaining name characters and escapes of a hash token and
/// returns it.
/// </summary>
private CssToken HashRest()
{
    char next = GetNext();

    for (;;)
    {
        if (next.IsName())
        {
            _stringBuffer.Append(next);
        }
        else if (IsValidEscape(next))
        {
            next = GetNext();
            _stringBuffer.Append(ConsumeEscape(next));
        }
        else
        {
            break;
        }

        next = GetNext();
    }

    // A trailing backslash that is not a valid escape is reported as an error.
    if (next == '\\')
    {
        RaiseErrorOccurred(ErrorCode.InvalidCharacter);
    }

    // The terminating character belongs to the next token.
    Back();
    return CssKeywordToken.Hash(FlushBuffer());
}
/// <summary>
/// Skips a "/* ... */" comment (the opening "/*" has already been consumed)
/// and continues tokenizing with the character after the closing "*/".
/// An unterminated comment raises <c>ErrorCode.EOF</c>.
/// </summary>
/// <returns>The token following the comment, as produced by <c>Data</c>.</returns>
// NOTE(review): the empty character literal '' below is reproduced from the
// original; presumably the end-of-input sentinel was lost in extraction - confirm.
private CssToken Comment()
{
    while (true)
    {
        char next = GetNext();

        switch (next)
        {
            case '*':
                next = GetNext();

                if (next == '/')
                {
                    return Data(GetNext());
                }

                // Fix: un-read the look-ahead so it is examined again by the loop.
                // Previously it was swallowed, so the second '*' of "**/" was never
                // seen as a potential terminator and the comment ran on.
                Back();
                break;

            case '':
                RaiseErrorOccurred(ErrorCode.EOF);
                return Data(next);
        }
    }
}
/// <summary>
/// Entered after an '@'. Produces an at-keyword when an identifier follows
/// (optionally starting with '-'); otherwise the '@' is reported as a delimiter.
/// </summary>
private CssToken AtKeywordStart()
{
    char first = GetNext();

    if (first != '-')
    {
        if (first.IsNameStart())
        {
            _stringBuffer.Append(first);
            return AtKeywordRest(GetNext());
        }

        if (IsValidEscape(first))
        {
            char escaped = GetNext();
            _stringBuffer.Append(ConsumeEscape(escaped));
            return AtKeywordRest(GetNext());
        }

        Back();
        return CssToken.Delim('@');
    }

    // "@-": the character after the dash has to be able to start a name.
    char second = GetNext();

    if (!second.IsNameStart() && !IsValidEscape(second))
    {
        // Un-read both the dash and the character after it.
        Back(2);
        return CssToken.Delim('@');
    }

    _stringBuffer.Append('-');
    return AtKeywordRest(second);
}
/// <summary>
/// Collects the remaining name characters and escapes of an at-keyword.
/// </summary>
/// <param name="current">The first character to examine.</param>
private CssToken AtKeywordRest(char current)
{
    for (;;)
    {
        if (current.IsName())
        {
            _stringBuffer.Append(current);
        }
        else if (IsValidEscape(current))
        {
            current = GetNext();
            _stringBuffer.Append(ConsumeEscape(current));
        }
        else
        {
            // The terminating character belongs to the next token.
            Back();
            return CssKeywordToken.At(FlushBuffer());
        }

        current = GetNext();
    }
}
/// <summary>
/// Starts an identifier at <paramref name="current"/>, which may be a '-', a
/// name-start character or a backslash that begins a valid escape. Anything
/// else is handed back to the data state.
/// </summary>
/// <param name="current">The first character of the would-be identifier.</param>
private CssToken IdentStart(char current)
{
    if (current == '-')
    {
        char afterDash = GetNext();

        if (!afterDash.IsNameStart() && !IsValidEscape(afterDash))
        {
            Back();
            return CssToken.Delim('-');
        }

        _stringBuffer.Append('-');
        return IdentRest(afterDash);
    }

    if (current.IsNameStart())
    {
        _stringBuffer.Append(current);
        return IdentRest(GetNext());
    }

    // IsValidEscape itself rejects anything that is not a backslash.
    if (IsValidEscape(current))
    {
        char escaped = GetNext();
        _stringBuffer.Append(ConsumeEscape(escaped));
        return IdentRest(GetNext());
    }

    return Data(current);
}
/// <summary>
/// Collects the remainder of an identifier. When the identifier is directly
/// followed by '(' it becomes a function token, or a URL-like token for the
/// names in FunctionNames.Url / Domain / Url_Prefix (compared in lower case).
/// </summary>
/// <param name="current">The first character to examine.</param>
private CssToken IdentRest(char current)
{
    for (;;)
    {
        if (current.IsName())
        {
            _stringBuffer.Append(current);
        }
        else if (IsValidEscape(current))
        {
            current = GetNext();
            _stringBuffer.Append(ConsumeEscape(current));
        }
        else
        {
            break;
        }

        current = GetNext();
    }

    if (current != '(')
    {
        // Plain identifier; the terminating character belongs to the next token.
        Back();
        return CssKeywordToken.Ident(FlushBuffer());
    }

    string functionName = _stringBuffer.ToString().ToLowerInvariant();

    if (functionName == FunctionNames.Url)
    {
        _stringBuffer.Clear();
        return UrlStart(CssTokenType.Url);
    }
    else if (functionName == FunctionNames.Domain)
    {
        _stringBuffer.Clear();
        return UrlStart(CssTokenType.Domain);
    }
    else if (functionName == FunctionNames.Url_Prefix)
    {
        _stringBuffer.Clear();
        return UrlStart(CssTokenType.UrlPrefix);
    }

    // Any other name: the '(' stays consumed and the name is kept as written.
    return CssKeywordToken.Function(FlushBuffer());
}
/// <summary>
/// Skips space characters after a buffered name: if a '(' is reached the name
/// becomes a function token, otherwise an identifier.
/// </summary>
/// <param name="current">Overwritten before use; the incoming value is not read.</param>
private CssToken TransformFunctionWhitespace(char current)
{
    for (;;)
    {
        current = GetNext();

        if (current == '(')
        {
            Back();
            return CssKeywordToken.Function(FlushBuffer());
        }

        if (!current.IsSpaceCharacter())
        {
            break;
        }
    }

    Back(2);
    return CssKeywordToken.Ident(FlushBuffer());
}
/// <summary>
/// Starts a number at a sign, a dot or a digit and hands over to the integer
/// or the fraction state. Any other characters are skipped until one of those
/// is found.
/// </summary>
/// <param name="current">The first character of the number.</param>
private CssToken NumberStart(char current)
{
    // Advance to the first character that can begin a number.
    while (current != '+' && current != '-' && current != '.' && !current.IsDigit())
    {
        current = GetNext();
    }

    _stringBuffer.Append(current);

    if (current.IsDigit())
    {
        return NumberRest();
    }

    if (current != '.')
    {
        // A sign: what follows decides between integer and fraction part.
        current = GetNext();
        _stringBuffer.Append(current);

        if (current != '.')
        {
            return NumberRest();
        }
    }

    // The character right after the dot is buffered without inspection.
    _stringBuffer.Append(GetNext());
    return NumberFraction();
}
/// <summary>
/// Reads the remaining integer digits of a number and decides what follows:
/// a unit (dimension), a fraction, a percentage, an exponent, a dash or
/// nothing at all (plain number).
/// </summary>
/// <returns>The number, percentage or dimension token.</returns>
// NOTE(review): if IsNameStart (defined outside this file) accepts ASCII letters,
// the 'E'/'e' branch below can never be reached because the name-start check
// comes first - confirm whether scientific notation is meant to work here.
private CssToken NumberRest()
{
    char next = GetNext();

    while (next.IsDigit())
    {
        _stringBuffer.Append(next);
        next = GetNext();
    }

    if (next.IsNameStart())
    {
        string number = FlushBuffer();
        _stringBuffer.Append(next);
        return Dimension(number);
    }

    if (IsValidEscape(next))
    {
        next = GetNext();
        string escapedNumber = FlushBuffer();
        _stringBuffer.Append(ConsumeEscape(next));
        return Dimension(escapedNumber);
    }

    switch (next)
    {
        case '.':
            next = GetNext();

            if (next.IsDigit())
            {
                _stringBuffer.Append('.').Append(next);
                return NumberFraction();
            }

            // Fix: un-read both the look-ahead AND the '.', so the dot is
            // tokenized on its own afterwards. Previously only one character
            // was un-read and the dot was lost (NumberDash already uses
            // Back(2) for the analogous '-' case).
            Back(2);
            return new CssNumberToken(FlushBuffer());

        case '%':
            return CssUnitToken.Percentage(FlushBuffer());

        case 'E':
        case 'e':
            return NumberExponential();

        case '-':
            return NumberDash();

        default:
            // The terminating character belongs to the next token.
            Back();
            return new CssNumberToken(FlushBuffer());
    }
}
/// <summary>
/// Reads the remaining fraction digits of a number and decides what follows:
/// a unit (dimension), an exponent, a percentage, a dash or nothing at all.
/// </summary>
// NOTE(review): if IsNameStart (defined outside this file) accepts ASCII letters,
// the 'E'/'e' test below is never reached - confirm.
private CssToken NumberFraction()
{
    char next;

    for (next = GetNext(); next.IsDigit(); next = GetNext())
    {
        _stringBuffer.Append(next);
    }

    if (next.IsNameStart())
    {
        string value = FlushBuffer();
        _stringBuffer.Append(next);
        return Dimension(value);
    }

    if (IsValidEscape(next))
    {
        next = GetNext();
        string value = FlushBuffer();
        _stringBuffer.Append(ConsumeEscape(next));
        return Dimension(value);
    }

    if (next == 'E' || next == 'e')
    {
        return NumberExponential();
    }

    if (next == '%')
    {
        return CssUnitToken.Percentage(FlushBuffer());
    }

    if (next == '-')
    {
        return NumberDash();
    }

    // The terminating character belongs to the next token.
    Back();
    return new CssNumberToken(FlushBuffer());
}
/// <summary>
/// Collects the rest of a unit (letters and escapes) and combines it with the
/// already extracted number into a dimension token.
/// </summary>
/// <param name="number">The textual number that precedes the unit.</param>
private CssToken Dimension(string number)
{
    for (char next = GetNext(); ; next = GetNext())
    {
        if (next.IsLetter())
        {
            _stringBuffer.Append(next);
            continue;
        }

        if (!IsValidEscape(next))
        {
            break;
        }

        next = GetNext();
        _stringBuffer.Append(ConsumeEscape(next));
    }

    // The terminating character belongs to the next token.
    Back();
    return CssUnitToken.Dimension(number, FlushBuffer());
}
/// <summary>
/// Reads the remaining exponent digits and returns the number token built
/// from everything buffered so far.
/// </summary>
private CssToken SciNotation()
{
    for (char digit = GetNext(); digit.IsDigit(); digit = GetNext())
    {
        _stringBuffer.Append(digit);
    }

    // The first non-digit belongs to the next token.
    Back();
    return new CssNumberToken(FlushBuffer());
}
/// <summary>
/// Entered right after "url(" (or a sibling function). Skips spaces and picks
/// the quoted or unquoted URL reader depending on the first character.
/// </summary>
/// <param name="type">The token type to produce.</param>
// NOTE(review): the empty character literal '' is reproduced from the original;
// presumably the end-of-input sentinel was lost in extraction - confirm.
private CssToken UrlStart(CssTokenType type)
{
    char first = SkipSpaces();

    if (first == '')
    {
        RaiseErrorOccurred(ErrorCode.EOF);
        return CssStringToken.Url(type, string.Empty, true);
    }

    if (first == '"')
    {
        return UrlDQ(type);
    }

    if (first == '\'')
    {
        return UrlSQ(type);
    }

    if (first == ')')
    {
        // "url()" - an empty, well-formed URL.
        return CssStringToken.Url(type, string.Empty, false);
    }

    return UrlUQ(first, type);
}
/// <summary>
/// Reads a double-quoted URL; the opening quote has already been consumed.
/// </summary>
/// <param name="type">The token type to produce.</param>
// NOTE(review): the empty character literal '' is reproduced from the original;
// presumably the end-of-input sentinel was lost in extraction - confirm.
private CssToken UrlDQ(CssTokenType type)
{
    for (;;)
    {
        char next = GetNext();

        if (next.IsLineBreak())
        {
            RaiseErrorOccurred(ErrorCode.LineBreakUnexpected);
            return UrlBad(type);
        }

        if ('' == next)
        {
            // Input ended inside the quotes: return what was collected.
            return CssStringToken.Url(type, FlushBuffer(), false);
        }

        if (next == '"')
        {
            return UrlEnd(type);
        }

        if (next != '\\')
        {
            _stringBuffer.Append(next);
            continue;
        }

        // Backslash: look at the escaped character.
        next = GetNext();

        if (next == '')
        {
            Back(2);
            RaiseErrorOccurred(ErrorCode.EOF);
            return CssStringToken.Url(type, FlushBuffer(), true);
        }

        if (next.IsLineBreak())
        {
            _stringBuffer.AppendLine();
        }
        else
        {
            _stringBuffer.Append(ConsumeEscape(next));
        }
    }
}
/// <summary>
/// Reads a single-quoted URL; the opening quote has already been consumed.
/// </summary>
/// <param name="type">The token type to produce.</param>
// NOTE(review): the empty character literal '' is reproduced from the original;
// presumably the end-of-input sentinel was lost in extraction - confirm.
private CssToken UrlSQ(CssTokenType type)
{
    for (;;)
    {
        char next = GetNext();

        if (next.IsLineBreak())
        {
            RaiseErrorOccurred(ErrorCode.LineBreakUnexpected);
            return UrlBad(type);
        }

        if ('' == next)
        {
            // Input ended inside the quotes: return what was collected.
            return CssStringToken.Url(type, FlushBuffer(), false);
        }

        if (next == '\'')
        {
            return UrlEnd(type);
        }

        if (next != '\\')
        {
            _stringBuffer.Append(next);
            continue;
        }

        // Backslash: look at the escaped character.
        next = GetNext();

        if (next == '')
        {
            Back(2);
            RaiseErrorOccurred(ErrorCode.EOF);
            return CssStringToken.Url(type, FlushBuffer(), true);
        }

        if (next.IsLineBreak())
        {
            _stringBuffer.AppendLine();
        }
        else
        {
            _stringBuffer.Append(ConsumeEscape(next));
        }
    }
}
/// <summary>
/// Reads an unquoted URL starting at <paramref name="current"/>.
/// </summary>
/// <param name="current">The first character of the URL.</param>
/// <param name="type">The token type to produce.</param>
// NOTE(review): the empty character literal '' is reproduced from the original;
// presumably the end-of-input sentinel was lost in extraction - confirm.
private CssToken UrlUQ(char current, CssTokenType type)
{
    for (;;)
    {
        if (current.IsSpaceCharacter())
        {
            // Only spaces and the closing parenthesis may follow now.
            return UrlEnd(type);
        }

        if (current == '' || current == ')')
        {
            return CssStringToken.Url(type, FlushBuffer(), false);
        }

        // Quotes, an opening parenthesis and non-printable characters are not
        // allowed in an unquoted URL.
        if (current == '"' || current == '\'' || current == '(' || current.IsNonPrintable())
        {
            RaiseErrorOccurred(ErrorCode.InvalidCharacter);
            return UrlBad(type);
        }

        if (current != '\\')
        {
            _stringBuffer.Append(current);
        }
        else if (IsValidEscape(current))
        {
            current = GetNext();
            _stringBuffer.Append(ConsumeEscape(current));
        }
        else
        {
            // A backslash that does not start a valid escape.
            RaiseErrorOccurred(ErrorCode.InvalidCharacter);
            return UrlBad(type);
        }

        current = GetNext();
    }
}
/// <summary>
/// Expects optional spaces followed by ')' to close a URL. Anything else is
/// reported and turns the token into a bad URL.
/// </summary>
/// <param name="type">The token type to produce.</param>
private CssToken UrlEnd(CssTokenType type)
{
    for (;;)
    {
        char next = GetNext();

        if (next == ')')
        {
            return CssStringToken.Url(type, FlushBuffer(), false);
        }

        if (!next.IsSpaceCharacter())
        {
            break;
        }
    }

    RaiseErrorOccurred(ErrorCode.InvalidCharacter);
    Back();
    return UrlBad(type);
}
/// <summary>
/// Consumes the remainder of a malformed URL up to the closing ')' or the end
/// of the input and returns a URL token flagged as bad. Valid escapes met on
/// the way are still decoded into the buffer.
/// </summary>
/// <param name="type">The token type to produce.</param>
// NOTE(review): the empty character literal '' is reproduced from the original;
// presumably the end-of-input sentinel was lost in extraction - confirm.
private CssToken UrlBad(CssTokenType type)
{
    for (char next = GetNext(); next != ')'; next = GetNext())
    {
        if (next == '')
        {
            RaiseErrorOccurred(ErrorCode.EOF);
            break;
        }

        if (IsValidEscape(next))
        {
            _stringBuffer.Append(ConsumeEscape(GetNext()));
        }
    }

    return CssStringToken.Url(type, FlushBuffer(), true);
}
/// <summary>
/// Reads a unicode range; "U+" has already been consumed and
/// <paramref name="current"/> is the first hex digit or '?'.
/// Accepted forms: up to six hex digits, optionally padded with '?' wildcards
/// to at most six characters in total (the wildcards are expanded to '0' for
/// the start and 'F' for the end), or hex digits followed by '-' and up to six
/// further hex digits.
/// </summary>
/// <param name="current">The first character of the range.</param>
/// <returns>The range token.</returns>
private CssToken UnicodeRange(char current)
{
    // Leading hex digits, six at most.
    for (int i = 0; i < 6 && current.IsHex(); i++)
    {
        _stringBuffer.Append(current);
        current = GetNext();
    }

    // Fix: wildcards may fill up to six characters in total. The old loop
    // compared a growing index with a shrinking bound and therefore stopped
    // after roughly half of the allowed '?' characters.
    bool hasWildcard = false;

    while (_stringBuffer.Length < 6 && current == '?')
    {
        hasWildcard = true;
        _stringBuffer.Append(current);
        current = GetNext();
    }

    if (hasWildcard)
    {
        // Fix: always un-read the look-ahead character; it was swallowed
        // whenever the wildcard loop ran to its limit.
        Back();
        string pattern = FlushBuffer();
        return new CssRangeToken(pattern.Replace('?', '0'), pattern.Replace('?', 'F'));
    }

    // Fix: "start-end" is recognised for any start length, not only when the
    // start happens to have exactly six digits.
    if (current == '-')
    {
        current = GetNext();

        if (current.IsHex())
        {
            string start = FlushBuffer();

            for (int k = 0; k < 6 && current.IsHex(); k++)
            {
                _stringBuffer.Append(current);
                current = GetNext();
            }

            // Fix: un-read the look-ahead also when all six digits were used.
            Back();
            return new CssRangeToken(start, FlushBuffer());
        }

        // Not a range after all: un-read the look-ahead and the '-'.
        Back(2);
    }
    else
    {
        Back();
    }

    string single = FlushBuffer();

    // Kept exactly as before: a short value passes itself as the end, a full
    // six-digit value passes null. How CssRangeToken treats a null end is not
    // visible here, so the two cases are deliberately not unified.
    return new CssRangeToken(single, single.Length != 6 ? single : null);
}
/// <summary>
/// Returns the current content of the string buffer and empties the buffer.
/// </summary>
private string FlushBuffer()
{
    string content = _stringBuffer.ToString();

    _stringBuffer.Clear();

    return content;
}
/// <summary>
/// Entered after an 'e' / 'E' that follows a number. When an (optionally
/// signed) digit follows, the exponent is read; otherwise the letter is taken
/// as the start of a unit.
/// </summary>
private CssToken NumberExponential()
{
    char follower = GetNext();

    if (follower.IsDigit())
    {
        // The exponent marker is always stored in lower case.
        _stringBuffer.Append('e');
        _stringBuffer.Append(follower);
        return SciNotation();
    }

    bool hasSign = follower == '+' || follower == '-';

    if (hasSign)
    {
        char afterSign = GetNext();

        if (afterSign.IsDigit())
        {
            _stringBuffer.Append('e');
            _stringBuffer.Append(follower);
            _stringBuffer.Append(afterSign);
            return SciNotation();
        }

        Back();
    }

    // No exponent: the character returned by GetPrevious() (presumably the
    // 'e' / 'E' itself) becomes the first character of the unit.
    char unitStart = GetPrevious();
    string value = FlushBuffer();
    _stringBuffer.Append(unitStart);
    return Dimension(value);
}
/// <summary>
/// Entered after a '-' that follows a number. When a name-start character or
/// a valid escape follows, the dash opens the unit of a dimension; otherwise
/// the number ends before the dash.
/// </summary>
private CssToken NumberDash()
{
    char lookahead = GetNext();

    if (lookahead.IsNameStart())
    {
        string value = FlushBuffer();
        _stringBuffer.Append('-');
        _stringBuffer.Append(lookahead);
        return Dimension(value);
    }

    if (!IsValidEscape(lookahead))
    {
        // Un-read the look-ahead and the dash.
        Back(2);
        return new CssNumberToken(FlushBuffer());
    }

    char escaped = GetNext();
    string magnitude = FlushBuffer();
    _stringBuffer.Append('-');
    _stringBuffer.Append(ConsumeEscape(escaped));
    return Dimension(magnitude);
}
/// <summary>
/// Decodes an escape sequence; the backslash has already been consumed and
/// <paramref name="current"/> is the character after it.
/// A hex escape takes one to six hex digits plus one optional trailing space.
/// Any other character stands for itself.
/// </summary>
/// <param name="current">The first character after the backslash.</param>
/// <returns>
/// The decoded text. U+FFFD is returned for hex escapes that name zero, a
/// surrogate or a value above U+10FFFF, as the CSS Syntax specification
/// prescribes.
/// </returns>
private string ConsumeEscape(char current)
{
    if (!current.IsHex())
    {
        return current.ToString();
    }

    char[] digits = new char[6];
    int count = 0;

    do
    {
        digits[count++] = current;
        current = GetNext();
    }
    while (count < 6 && current.IsHex());

    // A single space after the digits is part of the escape; anything else
    // belongs to whatever follows.
    if (current != ' ')
    {
        Back();
    }

    // Invariant culture: the digits are machine-readable, not user text.
    int codePoint = int.Parse(new string(digits, 0, count), NumberStyles.HexNumber, CultureInfo.InvariantCulture);

    // Fix: char.ConvertFromUtf32 throws for surrogates and for values beyond
    // U+10FFFF, so input such as "\FFFFFF" used to crash the tokenizer.
    bool isSurrogate = codePoint >= 0xD800 && codePoint <= 0xDFFF;

    if (codePoint == 0 || codePoint > 0x10FFFF || isSurrogate)
    {
        return "\uFFFD";
    }

    return char.ConvertFromUtf32(codePoint);
}
/// <summary>
/// Checks whether <paramref name="current"/> is a backslash that starts a
/// valid escape, i.e. is followed by neither '' nor a line break. The
/// following character is only peeked at, not consumed.
/// </summary>
/// <param name="current">The character to test.</param>
// NOTE(review): the empty character literal '' is reproduced from the original;
// presumably the end-of-input sentinel was lost in extraction - confirm.
private bool IsValidEscape(char current)
{
    if (current != '\\')
    {
        return false;
    }

    char following = GetNext();
    Back();

    return following != '' && !following.IsLineBreak();
}
}
}