// CssTokenizer
// The CSS tokenizer.
// See http://dev.w3.org/csswg/css-syntax/#tokenization for more details.
using AngleSharp.Events;
using AngleSharp.Extensions;
using System.Diagnostics;
using System.Globalization;
namespace AngleSharp.Parser.Css
{
[DebuggerStepThrough]
internal sealed class CssTokenizer : BaseTokenizer
{
// Value-mode flag exposed through IsInValue; when set, Data() tokenizes '#' as a color literal instead of a hash.
private bool _valueMode;
// Position captured in Get()/Data() and passed to every token built by the New* helpers.
private TextPosition _position;
/// <summary>
/// Gets or sets the value-mode flag. While it is set, a '#' is tokenized as a
/// color literal; otherwise it starts a hash token.
/// </summary>
public bool IsInValue
{
    get { return _valueMode; }
    set { _valueMode = value; }
}
/// <summary>
/// Creates a tokenizer over the given source. Value mode starts switched off.
/// </summary>
/// <param name="source">Text source forwarded to the base tokenizer.</param>
/// <param name="events">Event aggregator forwarded to the base tokenizer.</param>
public CssTokenizer(TextSource source, IEventAggregator events)
    : base(source, events)
{
    IsInValue = false;
}
/// <summary>
/// Reads the next character, records the current position and returns the
/// token that starts at that character.
/// </summary>
/// <returns>The next token produced by <c>Data</c>.</returns>
public CssToken Get()
{
    char first = GetNext();
    _position = GetCurrentPosition();
    CssToken token = Data(first);
    return token;
}
/// <summary>
/// Publishes a parse-error event for the given error at the given position.
/// Does nothing when no event aggregator is attached.
/// </summary>
/// <param name="error">The error whose code and message are published.</param>
/// <param name="position">The position reported with the error.</param>
public void RaiseErrorOccurred(CssParseError error, TextPosition position)
{
    if (_events == null) {
        return;
    }
    _events.Publish(new CssParseErrorEvent(error.GetCode(), error.GetMessage(), position));
}
/// <summary>
/// Publishes a parse-error event for the given error at the tokenizer's current position.
/// </summary>
/// <param name="code">The error to report.</param>
public void RaiseErrorOccurred(CssParseError code)
{
    TextPosition here = GetCurrentPosition();
    RaiseErrorOccurred(code, here);
}
// Dispatches on the first character of a token and returns the token it starts.
// Multi-character tokens are recognized by reading ahead with GetNext() and rewinding
// with GetPrevious()/Back() when the look-ahead does not match; the amount of rewinding
// in each branch is tied to how many characters that branch has read.
private CssToken Data(char current)
{
// Remember where this token starts; the New* helpers attach it to the token.
_position = GetCurrentPosition();
switch (current) {
case '\t':
case '\n':
// NOTE(review): this literal renders empty in the listing; presumably the form-feed character - confirm.
case '':
case '\r':
case ' ':
return NewWhitespace(current);
case '"':
return StringDQ();
case '#':
// Outside value mode '#' starts a hash; in value mode it starts a color literal.
if (!_valueMode)
return HashStart();
return ColorLiteral();
case '$':
// "$=" is the suffix-match token; a lone '$' is a delimiter.
current = GetNext();
if (current == '=')
return NewSuffix();
return NewDelimiter(GetPrevious());
case '\'':
return StringSQ();
case '(':
return NewOpenRound();
case ')':
return NewCloseRound();
case '*':
// "*=" is the substring-match token; a lone '*' is a delimiter.
current = GetNext();
if (current == '=')
return NewSubstring();
return NewDelimiter(GetPrevious());
case '+': {
// Peek at up to two following characters, rewind, then decide whether the sign starts a number.
char next4 = GetNext();
// NOTE(review): empty-looking literal; presumably the end-of-file sentinel - confirm.
if (next4 != '') {
char next5 = GetNext();
Back(2);
if (next4.IsDigit() || (next4 == '.' && next5.IsDigit()))
return NumberStart(current);
} else
Back();
return NewDelimiter(current);
}
case ',':
return NewComma();
case '.': {
// A dot directly followed by a digit starts a number; otherwise it is a delimiter.
char next = GetNext();
if (next.IsDigit())
return NumberStart(GetPrevious());
return NewDelimiter(GetPrevious());
}
case '-': {
// Same two-character peek as for '+', with extra outcomes: identifier start or "-->".
char next2 = GetNext();
// NOTE(review): empty-looking literal; presumably the end-of-file sentinel - confirm.
if (next2 != '') {
char next3 = GetNext();
Back(2);
if (next2.IsDigit() || (next2 == '.' && next3.IsDigit()))
return NumberStart(current);
if (next2.IsNameStart())
return IdentStart(current);
// A backslash not followed by a line break (or the empty-looking literal) also starts an identifier.
if (next2 == '\\' && !next3.IsLineBreak() && next3 != '')
return IdentStart(current);
if (next2 == '-' && next3 == '>') {
// Consume the "->" that was only peeked at so far.
Advance(2);
return NewCloseComment();
}
} else
Back();
return NewDelimiter(current);
}
case '/':
// "/*" opens a comment; a lone '/' is a delimiter.
current = GetNext();
if (current == '*')
return Comment();
return NewDelimiter(GetPrevious());
case '\\':
// A backslash starts an identifier unless it is followed by a line break or the
// empty-looking literal below, in which case an error is raised and it becomes a delimiter.
current = GetNext();
if (current.IsLineBreak()) {
RaiseErrorOccurred(CssParseError.LineBreakUnexpected);
return NewDelimiter(GetPrevious());
}
if (current == '') {
RaiseErrorOccurred(CssParseError.EOF);
return NewDelimiter(GetPrevious());
}
return IdentStart(GetPrevious());
case ':':
return NewColon();
case ';':
return NewSemicolon();
case '<':
// Try to match "<!--"; each failed step is undone by one GetPrevious() so that
// the final GetPrevious() below lands back on the '<' itself.
current = GetNext();
if (current == '!') {
current = GetNext();
if (current == '-') {
current = GetNext();
if (current == '-')
return NewOpenComment();
current = GetPrevious();
}
current = GetPrevious();
}
return NewDelimiter(GetPrevious());
case '@':
return AtKeywordStart();
case '[':
return NewOpenSquare();
case ']':
return NewCloseSquare();
case '^':
// "^=" is the prefix-match token; a lone '^' is a delimiter.
current = GetNext();
if (current == '=')
return NewPrefix();
return NewDelimiter(GetPrevious());
case '{':
return NewOpenCurly();
case '}':
return NewCloseCurly();
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
return NumberStart(current);
case 'U':
case 'u':
// "U+" followed by a hex digit or '?' starts a unicode range; otherwise rewind and read an identifier.
current = GetNext();
if (current == '+') {
current = GetNext();
if (current.IsHex() || current == '?')
return UnicodeRange(current);
current = GetPrevious();
}
return IdentStart(GetPrevious());
case '|':
// "|=" is the dash-match token, "||" the column token; a lone '|' is a delimiter.
current = GetNext();
switch (current) {
case '=':
return NewDash();
case '|':
return NewColumn();
default:
return NewDelimiter(GetPrevious());
}
case '~':
// "~=" is the include-match token; a lone '~' is a delimiter.
current = GetNext();
if (current == '=')
return NewInclude();
return NewDelimiter(GetPrevious());
// This (empty-looking) literal is the character that produces the end-of-file token.
case '':
return NewEof();
case '!':
// "!=" is the not-match token; a lone '!' is a delimiter.
current = GetNext();
if (current == '=')
return NewNot();
return NewDelimiter(GetPrevious());
default:
// Any other name-start character begins an identifier; everything else is a delimiter.
if (current.IsNameStart())
return IdentStart(current);
return NewDelimiter(current);
}
}
/// <summary>
/// Reads the remainder of a double-quoted string into the buffer and returns a string token.
/// </summary>
/// <returns>
/// A string token; it is flagged bad when a raw line break is met or when a backslash
/// is followed by the character that triggers the EOF error.
/// </returns>
private CssToken StringDQ()
{
    for (;;) {
        char c = GetNext();
        switch (c) {
            // NOTE(review): the empty-looking literal is reproduced as listed; presumably end of file - confirm.
            case '':
            case '"':
                return NewString(FlushBuffer(), false);
            case '\n':
            // NOTE(review): empty-looking literal; presumably the form-feed character - confirm.
            case '':
                // Raw line break inside the string: report it, un-read it, emit a bad string.
                RaiseErrorOccurred(CssParseError.LineBreakUnexpected);
                Back();
                return NewString(FlushBuffer(), true);
            case '\\':
                c = GetNext();
                if (c.IsLineBreak()) {
                    // Escaped line break: the buffer receives a line terminator.
                    _stringBuffer.AppendLine();
                } else if (c == '') {
                    RaiseErrorOccurred(CssParseError.EOF);
                    Back();
                    return NewString(FlushBuffer(), true);
                } else {
                    _stringBuffer.Append(ConsumeEscape(c));
                }
                continue;
            default:
                _stringBuffer.Append(c);
                continue;
        }
    }
}
/// <summary>
/// Reads the remainder of a single-quoted string into the buffer and returns a string token.
/// </summary>
/// <returns>
/// A string token; it is flagged bad when a raw line break is met or when a backslash
/// is followed by the character that triggers the EOF error.
/// </returns>
private CssToken StringSQ()
{
    for (;;) {
        char c = GetNext();
        switch (c) {
            // NOTE(review): the empty-looking literal is reproduced as listed; presumably end of file - confirm.
            case '':
            case '\'':
                return NewString(FlushBuffer(), false);
            case '\n':
            // NOTE(review): empty-looking literal; presumably the form-feed character - confirm.
            case '':
                // Raw line break inside the string: report it, un-read it, emit a bad string.
                RaiseErrorOccurred(CssParseError.LineBreakUnexpected);
                Back();
                return NewString(FlushBuffer(), true);
            case '\\':
                c = GetNext();
                if (c.IsLineBreak()) {
                    // Escaped line break: the buffer receives a line terminator.
                    _stringBuffer.AppendLine();
                } else if (c == '') {
                    RaiseErrorOccurred(CssParseError.EOF);
                    Back();
                    return NewString(FlushBuffer(), true);
                } else {
                    _stringBuffer.Append(ConsumeEscape(c));
                }
                continue;
            default:
                _stringBuffer.Append(c);
                continue;
        }
    }
}
/// <summary>
/// Collects the run of hex digits that follows a '#' and returns it as a color token.
/// </summary>
private CssToken ColorLiteral()
{
    char c;
    while ((c = GetNext()).IsHex()) {
        _stringBuffer.Append(c);
    }
    // The first non-hex character belongs to the next token.
    Back();
    return NewColor(FlushBuffer());
}
/// <summary>
/// Decides whether a '#' starts a hash token. It does when the next character is a
/// name-start character or begins a valid escape; otherwise '#' is returned as a delimiter.
/// </summary>
private CssToken HashStart()
{
    char first = GetNext();
    if (first.IsNameStart()) {
        _stringBuffer.Append(first);
        return HashRest();
    }
    if (IsValidEscape(first)) {
        _stringBuffer.Append(ConsumeEscape(GetNext()));
        return HashRest();
    }
    // A backslash that is not a valid escape is reported before falling back to a delimiter.
    if (first == '\\') {
        RaiseErrorOccurred(CssParseError.InvalidCharacter);
    }
    Back();
    return NewDelimiter('#');
}
/// <summary>
/// Reads the remaining name characters and escapes of a hash and returns the hash token.
/// </summary>
private CssToken HashRest()
{
    char c = GetNext();
    while (true) {
        if (c.IsName()) {
            _stringBuffer.Append(c);
        } else if (IsValidEscape(c)) {
            _stringBuffer.Append(ConsumeEscape(GetNext()));
        } else {
            break;
        }
        c = GetNext();
    }
    // The loop ended on a backslash only if that backslash is not a valid escape.
    if (c == '\\') {
        RaiseErrorOccurred(CssParseError.InvalidCharacter);
    }
    Back();
    return NewHash(FlushBuffer());
}
/// <summary>
/// Reads a comment body up to the closing "*/" and returns a comment token.
/// </summary>
/// <returns>
/// The comment token; it is flagged bad, after an EOF error, when the terminating
/// character below is reached before "*/".
/// </returns>
private CssToken Comment()
{
    char c = GetNext();
    for (;;) {
        if (c == '*') {
            c = GetNext();
            if (c == '/') {
                return NewComment(FlushBuffer(), false);
            }
            // The star was part of the text; the character after it is examined next.
            _stringBuffer.Append('*');
        } else if (c == '') {
            // NOTE(review): empty-looking literal reproduced as listed; presumably end of file - confirm.
            RaiseErrorOccurred(CssParseError.EOF);
            return NewComment(FlushBuffer(), true);
        } else {
            _stringBuffer.Append(c);
            c = GetNext();
        }
    }
}
/// <summary>
/// Decides whether an '@' starts an at-keyword. It does when followed by a name-start
/// character or a valid escape, optionally preceded by '-'; otherwise the input is
/// rewound and '@' is returned as a delimiter.
/// </summary>
private CssToken AtKeywordStart()
{
    char c = GetNext();
    if (c == '-') {
        c = GetNext();
        if (!c.IsNameStart() && !IsValidEscape(c)) {
            // Undo both reads so the '-' is tokenized on its own.
            Back(2);
            return NewDelimiter('@');
        }
        _stringBuffer.Append('-');
        return AtKeywordRest(c);
    }
    if (c.IsNameStart()) {
        _stringBuffer.Append(c);
        return AtKeywordRest(GetNext());
    }
    if (IsValidEscape(c)) {
        c = GetNext();
        _stringBuffer.Append(ConsumeEscape(c));
        return AtKeywordRest(GetNext());
    }
    Back();
    return NewDelimiter('@');
}
/// <summary>
/// Appends name characters and escapes, starting with <paramref name="current"/>,
/// and returns the at-keyword token once a character fits neither.
/// </summary>
/// <param name="current">The first character to examine.</param>
private CssToken AtKeywordRest(char current)
{
    for (; ; current = GetNext()) {
        if (current.IsName()) {
            _stringBuffer.Append(current);
            continue;
        }
        if (!IsValidEscape(current)) {
            break;
        }
        current = GetNext();
        _stringBuffer.Append(ConsumeEscape(current));
    }
    // The terminating character belongs to the next token.
    Back();
    return NewAtKeyword(FlushBuffer());
}
/// <summary>
/// Starts an identifier at <paramref name="current"/>, which may be '-', a name-start
/// character or a backslash that begins a valid escape. Any other character is handed
/// back to <c>Data</c>.
/// </summary>
/// <param name="current">The character the identifier would start with.</param>
private CssToken IdentStart(char current)
{
    if (current == '-') {
        current = GetNext();
        if (!current.IsNameStart() && !IsValidEscape(current)) {
            Back();
            return NewDelimiter('-');
        }
        _stringBuffer.Append('-');
        return IdentRest(current);
    }
    if (current.IsNameStart()) {
        _stringBuffer.Append(current);
        return IdentRest(GetNext());
    }
    if (current != '\\' || !IsValidEscape(current)) {
        return Data(current);
    }
    current = GetNext();
    _stringBuffer.Append(ConsumeEscape(current));
    return IdentRest(GetNext());
}
/// <summary>
/// Appends the remaining name characters and escapes of an identifier. When the name is
/// directly followed by '(' the result is a function token or, for names whose type is
/// not <c>Function</c>, whatever <c>UrlStart</c> produces; otherwise an identifier token.
/// </summary>
/// <param name="current">The first character to examine.</param>
private CssToken IdentRest(char current)
{
    for (; ; current = GetNext()) {
        if (current.IsName()) {
            _stringBuffer.Append(current);
            continue;
        }
        if (!IsValidEscape(current)) {
            break;
        }
        current = GetNext();
        _stringBuffer.Append(ConsumeEscape(current));
    }
    if (current != '(') {
        Back();
        return NewIdent(FlushBuffer());
    }
    string name = FlushBuffer();
    CssTokenType kind = name.GetTypeFromName();
    return kind == CssTokenType.Function ? NewFunction(name) : UrlStart(kind);
}
/// <summary>
/// Skips space characters looking for '('. If one is found the buffered name becomes a
/// function token; otherwise the input is rewound by two and an identifier token is returned.
/// </summary>
/// <param name="current">Overwritten by the first read; its incoming value is not used.</param>
private CssToken TransformFunctionWhitespace(char current)
{
    while (true) {
        current = GetNext();
        if (current == '(') {
            Back();
            return NewFunction(FlushBuffer());
        }
        if (!current.IsSpaceCharacter()) {
            break;
        }
    }
    Back(2);
    return NewIdent(FlushBuffer());
}
/// <summary>
/// Begins a number at <paramref name="current"/>: a sign, a dot or a digit. Characters
/// that are none of these are skipped until one is found.
/// </summary>
/// <param name="current">The first character of the number.</param>
private CssToken NumberStart(char current)
{
    for (; ; current = GetNext()) {
        if (current == '+' || current == '-') {
            _stringBuffer.Append(current);
            current = GetNext();
            if (current != '.') {
                _stringBuffer.Append(current);
                return NumberRest();
            }
            // Sign followed by a dot: take the dot and the character after it, then read the fraction.
            _stringBuffer.Append(current).Append(GetNext());
            return NumberFraction();
        }
        if (current == '.') {
            _stringBuffer.Append(current).Append(GetNext());
            return NumberFraction();
        }
        if (current.IsDigit()) {
            _stringBuffer.Append(current);
            return NumberRest();
        }
    }
}
/// <summary>
/// Continues a number after its first character: reads further digits and then decides
/// between a plain number, a fraction, a percentage, an exponent form or a dimension.
/// </summary>
/// <returns>The number, percentage or dimension token that was recognized.</returns>
private CssToken NumberRest()
{
    char next = GetNext();
    while (next.IsDigit()) {
        _stringBuffer.Append(next);
        next = GetNext();
    }
    // The exponent marker has to be tested before the generic name-start check. Assuming
    // IsNameStart() accepts ASCII letters (as the CSS name-start definition does), 'e'/'E'
    // would otherwise always be taken as the start of a unit and "1e3" could never be read
    // as a number. NumberExponential falls back to a dimension itself when no exponent
    // digits follow (e.g. "2em"), leaving the buffer and input position exactly as the
    // name-start branch below would.
    if (next == 'e' || next == 'E')
        return NumberExponential(next);
    if (next.IsNameStart()) {
        string number = FlushBuffer();
        _stringBuffer.Append(next);
        return Dimension(number);
    }
    if (IsValidEscape(next)) {
        // An escape directly after the digits starts the unit of a dimension.
        next = GetNext();
        string number2 = FlushBuffer();
        _stringBuffer.Append(ConsumeEscape(next));
        return Dimension(number2);
    }
    switch (next) {
        case '.':
            next = GetNext();
            if (next.IsDigit()) {
                _stringBuffer.Append('.').Append(next);
                return NumberFraction();
            }
            // A dot without a digit after it is not part of the number.
            Back();
            return NewNumber(FlushBuffer());
        case '%':
            return NewPercentage(FlushBuffer());
        case '-':
            return NumberDash();
        default:
            Back();
            return NewNumber(FlushBuffer());
    }
}
/// <summary>
/// Continues a number after its decimal point: reads further digits and then decides
/// between a plain number, a percentage, an exponent form or a dimension.
/// </summary>
/// <returns>The number, percentage or dimension token that was recognized.</returns>
private CssToken NumberFraction()
{
    char next = GetNext();
    while (next.IsDigit()) {
        _stringBuffer.Append(next);
        next = GetNext();
    }
    // The exponent marker has to be tested before the generic name-start check. Assuming
    // IsNameStart() accepts ASCII letters (as the CSS name-start definition does), 'e'/'E'
    // would otherwise always be taken as the start of a unit and "1.5e3" could never be
    // read as a number. NumberExponential falls back to a dimension itself when no
    // exponent digits follow (e.g. "1.5em").
    if (next == 'e' || next == 'E')
        return NumberExponential(next);
    if (next.IsNameStart()) {
        string number = FlushBuffer();
        _stringBuffer.Append(next);
        return Dimension(number);
    }
    if (IsValidEscape(next)) {
        // An escape directly after the digits starts the unit of a dimension.
        next = GetNext();
        string number2 = FlushBuffer();
        _stringBuffer.Append(ConsumeEscape(next));
        return Dimension(number2);
    }
    switch (next) {
        case '%':
            return NewPercentage(FlushBuffer());
        case '-':
            return NumberDash();
        default:
            Back();
            return NewNumber(FlushBuffer());
    }
}
/// <summary>
/// Reads the rest of a unit (letters and escapes) into the buffer and returns a
/// dimension token combining it with <paramref name="number"/>.
/// </summary>
/// <param name="number">The numeric text that precedes the unit.</param>
private CssToken Dimension(string number)
{
    for (;;) {
        char c = GetNext();
        if (c.IsLetter()) {
            _stringBuffer.Append(c);
            continue;
        }
        if (!IsValidEscape(c)) {
            break;
        }
        _stringBuffer.Append(ConsumeEscape(GetNext()));
    }
    Back();
    return NewDimension(number, FlushBuffer());
}
/// <summary>
/// Appends the remaining exponent digits to the buffer and returns the number token.
/// </summary>
private CssToken SciNotation()
{
    char c = GetNext();
    while (c.IsDigit()) {
        _stringBuffer.Append(c);
        c = GetNext();
    }
    Back();
    return NewNumber(FlushBuffer());
}
/// <summary>
/// Skips leading spaces after the opening parenthesis and dispatches on the first
/// significant character: quoted form, unquoted form, empty url, or premature end.
/// </summary>
/// <param name="type">The token type to give the resulting url token.</param>
private CssToken UrlStart(CssTokenType type)
{
    char first = SkipSpaces();
    // NOTE(review): empty-looking literal reproduced as listed; presumably end of file - confirm.
    if (first == '') {
        RaiseErrorOccurred(CssParseError.EOF);
        return NewUrl(type, string.Empty, true);
    }
    if (first == '"') {
        return UrlDQ(type);
    }
    if (first == '\'') {
        return UrlSQ(type);
    }
    if (first == ')') {
        return NewUrl(type, string.Empty, false);
    }
    return UrlUQ(first, type);
}
/// <summary>
/// Reads a double-quoted url body. A raw line break switches to bad-url recovery; the
/// closing quote hands over to <c>UrlEnd</c>.
/// </summary>
/// <param name="type">The token type to give the resulting url token.</param>
private CssToken UrlDQ(CssTokenType type)
{
    for (;;) {
        char c = GetNext();
        if (c.IsLineBreak()) {
            RaiseErrorOccurred(CssParseError.LineBreakUnexpected);
            return UrlBad(type);
        }
        // NOTE(review): empty-looking literal reproduced as listed; presumably end of file - confirm.
        if (c == '') {
            return NewUrl(type, FlushBuffer(), false);
        }
        if (c == '"') {
            return UrlEnd(type);
        }
        if (c != '\\') {
            _stringBuffer.Append(c);
            continue;
        }
        c = GetNext();
        if (c == '') {
            // Rewind before reporting so the error is raised at the rewound position.
            Back(2);
            RaiseErrorOccurred(CssParseError.EOF);
            return NewUrl(type, FlushBuffer(), true);
        }
        if (c.IsLineBreak()) {
            _stringBuffer.AppendLine();
        } else {
            _stringBuffer.Append(ConsumeEscape(c));
        }
    }
}
/// <summary>
/// Reads a single-quoted url body. A raw line break switches to bad-url recovery; the
/// closing quote hands over to <c>UrlEnd</c>.
/// </summary>
/// <param name="type">The token type to give the resulting url token.</param>
private CssToken UrlSQ(CssTokenType type)
{
    for (;;) {
        char c = GetNext();
        if (c.IsLineBreak()) {
            RaiseErrorOccurred(CssParseError.LineBreakUnexpected);
            return UrlBad(type);
        }
        // NOTE(review): empty-looking literal reproduced as listed; presumably end of file - confirm.
        if (c == '') {
            return NewUrl(type, FlushBuffer(), false);
        }
        if (c == '\'') {
            return UrlEnd(type);
        }
        if (c != '\\') {
            _stringBuffer.Append(c);
            continue;
        }
        c = GetNext();
        if (c == '') {
            // Rewind before reporting so the error is raised at the rewound position.
            Back(2);
            RaiseErrorOccurred(CssParseError.EOF);
            return NewUrl(type, FlushBuffer(), true);
        }
        if (c.IsLineBreak()) {
            _stringBuffer.AppendLine();
        } else {
            _stringBuffer.Append(ConsumeEscape(c));
        }
    }
}
/// <summary>
/// Reads an unquoted url body starting at <paramref name="current"/>. Quotes, '(',
/// non-printable characters and invalid escapes raise an error and switch to bad-url recovery.
/// </summary>
/// <param name="current">The first character of the url body.</param>
/// <param name="type">The token type to give the resulting url token.</param>
private CssToken UrlUQ(char current, CssTokenType type)
{
    for (; ; current = GetNext()) {
        if (current.IsSpaceCharacter()) {
            return UrlEnd(type);
        }
        // NOTE(review): empty-looking literal reproduced as listed; presumably end of file - confirm.
        if (current == '' || current == ')') {
            return NewUrl(type, FlushBuffer(), false);
        }
        if (current == '"' || current == '\'' || current == '(' || current.IsNonPrintable()) {
            RaiseErrorOccurred(CssParseError.InvalidCharacter);
            return UrlBad(type);
        }
        if (current != '\\') {
            _stringBuffer.Append(current);
            continue;
        }
        if (!IsValidEscape(current)) {
            RaiseErrorOccurred(CssParseError.InvalidCharacter);
            return UrlBad(type);
        }
        current = GetNext();
        _stringBuffer.Append(ConsumeEscape(current));
    }
}
/// <summary>
/// After the url body, skips space characters up to the closing ')'. Any other
/// character raises an error and switches to bad-url recovery.
/// </summary>
/// <param name="type">The token type to give the resulting url token.</param>
private CssToken UrlEnd(CssTokenType type)
{
    while (true) {
        char c = GetNext();
        if (c == ')') {
            return NewUrl(type, FlushBuffer(), false);
        }
        if (!c.IsSpaceCharacter()) {
            break;
        }
    }
    RaiseErrorOccurred(CssParseError.InvalidCharacter);
    // Step back so that UrlBad starts on the offending character.
    Back();
    return UrlBad(type);
}
/// <summary>
/// Recovers from a malformed url: consumes input, starting at the current character,
/// until the url's closing ')' is reached, or until a ';' or an unmatched '}' is met
/// (which is left for the next token), and returns a url token flagged as bad.
/// </summary>
/// <param name="type">The token type to give the resulting url token.</param>
/// <returns>A bad url token holding the text consumed so far.</returns>
private CssToken UrlBad(CssTokenType type)
{
    char c = base.Current;
    // Nesting depth of '{' ... '}' seen inside the bad url.
    int curlyDepth = 0;
    // Nesting depth of '(' ... ')'; starts at 1 for the url's own opening parenthesis.
    int roundDepth = 1;
    while (true) {
        switch (c) {
            case ';':
                Back();
                return NewUrl(type, FlushBuffer(), true);
            case '}':
                // A '}' without a matching '{' inside the url closes an outer block: stop before it.
                if (--curlyDepth == -1) {
                    Back();
                    return NewUrl(type, FlushBuffer(), true);
                }
                break;
            // NOTE(review): empty-looking literal reproduced as listed; presumably end of file - confirm.
            case '':
                RaiseErrorOccurred(CssParseError.EOF);
                return NewUrl(type, FlushBuffer(), true);
        }
        if (c == ')' && --roundDepth == 0)
            break;
        if (IsValidEscape(c)) {
            c = GetNext();
            _stringBuffer.Append(ConsumeEscape(c));
        } else {
            if (c == '(')
                roundDepth++;
            else if (c == '{')
                // Previously the depth counter itself was compared with 123 (the code of '{'),
                // so an opening brace never increased the depth.
                curlyDepth++;
            _stringBuffer.Append(c);
        }
        c = GetNext();
    }
    return NewUrl(type, FlushBuffer(), true);
}
/// <summary>
/// Reads a unicode range after "U+": up to six hex digits, optionally padded with '?'
/// wildcards to six characters in total, or followed by '-' and up to six hex digits
/// for the end of the range.
/// </summary>
/// <param name="current">The first character after "U+" (a hex digit or '?').</param>
/// <returns>The range token; the input is left on the last character of the range.</returns>
private CssToken UnicodeRange(char current)
{
    for (int i = 0; i < 6; i++) {
        if (!current.IsHex())
            break;
        _stringBuffer.Append(current);
        current = GetNext();
    }
    if (_stringBuffer.Length < 6 && current == '?') {
        // Compute the wildcard budget once: the buffer grows while wildcards are appended,
        // so re-evaluating "6 - Length" on every iteration would end the loop too early.
        int remaining = 6 - _stringBuffer.Length;
        while (remaining > 0 && current == '?') {
            _stringBuffer.Append(current);
            current = GetNext();
            remaining--;
        }
        // 'current' is always one character past the range here, however the loop ended.
        Back();
        return NewRange(FlushBuffer());
    }
    // An explicit end may follow a start of any length, not only a six-digit one.
    if (current == '-') {
        current = GetNext();
        if (current.IsHex()) {
            string start = FlushBuffer();
            for (int k = 0; k < 6; k++) {
                if (!current.IsHex())
                    break;
                _stringBuffer.Append(current);
                current = GetNext();
            }
            // Rewind unconditionally: also when all six digits were read, the character
            // fetched after the last digit belongs to the next token.
            Back();
            string end = FlushBuffer();
            return NewRange(start, end);
        }
        // The '-' is not part of a range: give back both it and the character after it.
        Back(2);
        return NewRange(FlushBuffer());
    }
    Back();
    return NewRange(FlushBuffer());
}
/// <summary>Creates the not-match token ("!=") at the recorded position.</summary>
private CssToken NewNot()
{
    const string text = "!=";
    return new CssToken(CssTokenType.NotMatch, text, _position);
}
/// <summary>Creates the include-match token ("~=") at the recorded position.</summary>
private CssToken NewInclude()
{
    const string text = "~=";
    return new CssToken(CssTokenType.IncludeMatch, text, _position);
}
/// <summary>Creates the column token ("||") at the recorded position.</summary>
private CssToken NewColumn()
{
    const string text = "||";
    return new CssToken(CssTokenType.Column, text, _position);
}
/// <summary>Creates the dash-match token ("|=") at the recorded position.</summary>
private CssToken NewDash()
{
    const string text = "|=";
    return new CssToken(CssTokenType.DashMatch, text, _position);
}
/// <summary>Creates the closing curly bracket token at the recorded position.</summary>
private CssToken NewCloseCurly()
{
    const string text = "}";
    return new CssToken(CssTokenType.CurlyBracketClose, text, _position);
}
/// <summary>Creates the opening curly bracket token at the recorded position.</summary>
private CssToken NewOpenCurly()
{
    const string text = "{";
    return new CssToken(CssTokenType.CurlyBracketOpen, text, _position);
}
/// <summary>Creates the prefix-match token ("^=") at the recorded position.</summary>
private CssToken NewPrefix()
{
    const string text = "^=";
    return new CssToken(CssTokenType.PrefixMatch, text, _position);
}
/// <summary>Creates the closing square bracket token at the recorded position.</summary>
private CssToken NewCloseSquare()
{
    const string text = "]";
    return new CssToken(CssTokenType.SquareBracketClose, text, _position);
}
/// <summary>Creates the opening square bracket token at the recorded position.</summary>
private CssToken NewOpenSquare()
{
    const string text = "[";
    return new CssToken(CssTokenType.SquareBracketOpen, text, _position);
}
/// <summary>Creates the CDO token (the four-character comment opener) at the recorded position.</summary>
private CssToken NewOpenComment()
{
    const string text = "<!--";
    return new CssToken(CssTokenType.Cdo, text, _position);
}
/// <summary>Creates the semicolon token at the recorded position.</summary>
private CssToken NewSemicolon()
{
    const string text = ";";
    return new CssToken(CssTokenType.Semicolon, text, _position);
}
/// <summary>Creates the colon token at the recorded position.</summary>
private CssToken NewColon()
{
    const string text = ":";
    return new CssToken(CssTokenType.Colon, text, _position);
}
/// <summary>Creates the CDC token (the three-character comment closer) at the recorded position.</summary>
private CssToken NewCloseComment()
{
    const string text = "-->";
    return new CssToken(CssTokenType.Cdc, text, _position);
}
/// <summary>Creates the comma token at the recorded position.</summary>
private CssToken NewComma()
{
    const string text = ",";
    return new CssToken(CssTokenType.Comma, text, _position);
}
/// <summary>Creates the substring-match token ("*=") at the recorded position.</summary>
private CssToken NewSubstring()
{
    const string text = "*=";
    return new CssToken(CssTokenType.SubstringMatch, text, _position);
}
/// <summary>Creates the closing round bracket token at the recorded position.</summary>
private CssToken NewCloseRound()
{
    const string text = ")";
    return new CssToken(CssTokenType.RoundBracketClose, text, _position);
}
/// <summary>Creates the opening round bracket token at the recorded position.</summary>
private CssToken NewOpenRound()
{
    const string text = "(";
    return new CssToken(CssTokenType.RoundBracketOpen, text, _position);
}
/// <summary>Creates the suffix-match token ("$=") at the recorded position.</summary>
private CssToken NewSuffix()
{
    const string text = "$=";
    return new CssToken(CssTokenType.SuffixMatch, text, _position);
}
/// <summary>Creates a string token at the recorded position.</summary>
/// <param name="value">The string contents.</param>
/// <param name="bad">Whether the string is flagged as bad.</param>
private CssToken NewString(string value, bool bad = false)
{
    CssToken token = new CssStringToken(CssTokenType.String, value, bad, _position);
    return token;
}
/// <summary>Creates a hash token at the recorded position.</summary>
/// <param name="value">The name that followed the '#'.</param>
private CssToken NewHash(string value)
{
    CssToken token = new CssKeywordToken(CssTokenType.Hash, value, _position);
    return token;
}
/// <summary>Creates a comment token at the recorded position.</summary>
/// <param name="value">The comment text.</param>
/// <param name="bad">Whether the comment is flagged as bad.</param>
private CssToken NewComment(string value, bool bad = false)
{
    CssToken token = new CssStringToken(CssTokenType.Comment, value, bad, _position);
    return token;
}
/// <summary>Creates an at-keyword token at the recorded position.</summary>
/// <param name="value">The keyword name that followed the '@'.</param>
private CssToken NewAtKeyword(string value)
{
    CssToken token = new CssKeywordToken(CssTokenType.AtKeyword, value, _position);
    return token;
}
/// <summary>Creates an identifier token at the recorded position.</summary>
/// <param name="value">The identifier text.</param>
private CssToken NewIdent(string value)
{
    CssToken token = new CssKeywordToken(CssTokenType.Ident, value, _position);
    return token;
}
/// <summary>
/// Creates a function token and feeds it every following token until a closing round
/// bracket (passed to <c>Close</c>) or the end-of-file token is reached.
/// </summary>
/// <param name="value">The function name.</param>
private CssToken NewFunction(string value)
{
    CssFunctionToken function = new CssFunctionToken(value, _position);
    for (CssToken argument = Get(); argument.Type != CssTokenType.Eof; argument = Get()) {
        if (argument.Type == CssTokenType.RoundBracketClose) {
            function.Close(argument);
            break;
        }
        function.With(argument);
    }
    return function;
}
/// <summary>Creates a percentage token (unit "%") at the recorded position.</summary>
/// <param name="value">The numeric text.</param>
private CssToken NewPercentage(string value)
{
    const string unit = "%";
    return new CssUnitToken(CssTokenType.Percentage, value, unit, _position);
}
/// <summary>Creates a dimension token at the recorded position.</summary>
/// <param name="value">The numeric text.</param>
/// <param name="unit">The unit text.</param>
private CssToken NewDimension(string value, string unit)
{
    CssToken token = new CssUnitToken(CssTokenType.Dimension, value, unit, _position);
    return token;
}
/// <summary>Creates a url-like string token of the given type at the recorded position.</summary>
/// <param name="type">The token type to assign.</param>
/// <param name="data">The url text.</param>
/// <param name="bad">Whether the url is flagged as bad.</param>
private CssToken NewUrl(CssTokenType type, string data, bool bad = false)
{
    CssToken token = new CssStringToken(type, data, bad, _position);
    return token;
}
/// <summary>Creates a range token from a single range text at the recorded position.</summary>
/// <param name="range">The range text (hex digits and/or '?' wildcards).</param>
private CssToken NewRange(string range)
{
    CssToken token = new CssRangeToken(range, _position);
    return token;
}
/// <summary>Creates a range token from explicit start and end texts at the recorded position.</summary>
/// <param name="start">The hex text of the range start.</param>
/// <param name="end">The hex text of the range end.</param>
private CssToken NewRange(string start, string end)
{
    CssToken token = new CssRangeToken(start, end, _position);
    return token;
}
/// <summary>Creates a whitespace token holding the given character at the recorded position.</summary>
/// <param name="c">The whitespace character.</param>
private CssToken NewWhitespace(char c)
{
    string text = c.ToString();
    return new CssToken(CssTokenType.Whitespace, text, _position);
}
/// <summary>Creates a number token at the recorded position.</summary>
/// <param name="number">The numeric text.</param>
private CssToken NewNumber(string number)
{
    CssToken token = new CssNumberToken(number, _position);
    return token;
}
/// <summary>Creates a delimiter token for the given character at the recorded position.</summary>
/// <param name="c">The delimiter character.</param>
private CssToken NewDelimiter(char c)
{
    CssToken token = new CssToken(CssTokenType.Delim, c, _position);
    return token;
}
/// <summary>
/// Creates a color token at the recorded position. The token is flagged bad unless the
/// text is exactly 3 or 6 characters long.
/// </summary>
/// <param name="text">The hex text that followed the '#'.</param>
private CssToken NewColor(string text)
{
    int length = text.Length;
    bool wellFormed = length == 3 || length == 6;
    return new CssStringToken(CssTokenType.Color, text, !wellFormed, _position);
}
/// <summary>Creates the end-of-file token (with empty text) at the recorded position.</summary>
private CssToken NewEof()
{
    CssToken eof = new CssToken(CssTokenType.Eof, string.Empty, _position);
    return eof;
}
/// <summary>
/// Handles an exponent marker after a number. If digits follow (optionally after a sign)
/// the exponent is appended and read as part of the number; otherwise the input is rewound
/// to the marker and it becomes the first character of a dimension unit.
/// </summary>
/// <param name="letter">The exponent marker that was read.</param>
private CssToken NumberExponential(char letter)
{
    char following = GetNext();
    if (following.IsDigit()) {
        _stringBuffer.Append(letter).Append(following);
        return SciNotation();
    }
    if (following == '+' || following == '-') {
        char sign = following;
        char afterSign = GetNext();
        if (afterSign.IsDigit()) {
            _stringBuffer.Append(letter).Append(sign).Append(afterSign);
            return SciNotation();
        }
        // Give back the character after the sign.
        Back();
    }
    string number = FlushBuffer();
    _stringBuffer.Append(letter);
    // Give back the character after the marker; Dimension continues reading from there.
    Back();
    return Dimension(number);
}
/// <summary>
/// Handles a '-' directly after a number. If a name-start character or a valid escape
/// follows, the dash begins a dimension unit; otherwise the input is rewound by two and
/// the plain number is returned.
/// </summary>
private CssToken NumberDash()
{
    char c = GetNext();
    bool startsName = c.IsNameStart();
    if (!startsName && !IsValidEscape(c)) {
        Back(2);
        return NewNumber(FlushBuffer());
    }
    if (startsName) {
        string value = FlushBuffer();
        _stringBuffer.Append('-').Append(c);
        return Dimension(value);
    }
    c = GetNext();
    string escapedValue = FlushBuffer();
    _stringBuffer.Append('-').Append(ConsumeEscape(c));
    return Dimension(escapedValue);
}
/// <summary>
/// Resolves an escape whose first character after the backslash is <paramref name="current"/>.
/// Up to six hex digits are read as a code point; a single space character after them is
/// swallowed. A non-hex character is returned unchanged.
/// </summary>
/// <param name="current">The character that follows the backslash.</param>
/// <returns>The text the escape stands for.</returns>
private string ConsumeEscape(char current)
{
    if (!current.IsHex()) {
        return current.ToString();
    }
    char[] digits = new char[6];
    int count = 0;
    do {
        digits[count++] = current;
        current = GetNext();
    } while (current.IsHex() && count < digits.Length);
    // Anything other than a space character after the digits belongs to the next token.
    if (!current.IsSpaceCharacter()) {
        Back();
    }
    int codePoint = int.Parse(new string(digits, 0, count), NumberStyles.HexNumber);
    if (!codePoint.IsInvalid()) {
        return codePoint.ConvertFromUtf32();
    }
    // Code points rejected by IsInvalid() are replaced by this character.
    current = '�';
    return current.ToString();
}
/// <summary>
/// Tells whether <paramref name="current"/> is a backslash that starts a valid escape,
/// by peeking at the following character and rewinding again.
/// </summary>
/// <param name="current">The character to test.</param>
/// <returns>
/// False when <paramref name="current"/> is not a backslash, or when the following
/// character is a line break or the terminating character tested below; true otherwise.
/// </returns>
private bool IsValidEscape(char current)
{
    if (current != '\\') {
        return false;
    }
    char following = GetNext();
    Back();
    // NOTE(review): empty-looking literal reproduced as listed; presumably end of file - confirm.
    return following != '' && !following.IsLineBreak();
}
}
}