// CssTokenizer
// The CSS tokenizer.
// See http://dev.w3.org/csswg/css-syntax/#tokenization for more details.
using System.Collections.Generic;
using System.Diagnostics;
using System.Globalization;
using System.Text;
namespace AngleSharp.Parser.Css
{
[DebuggerStepThrough]
internal sealed class CssTokenizer : BaseTokenizer
{
private bool _ignoreWs;
private bool _ignoreCs;
/// <summary>
/// Gets or sets whether runs of whitespace are skipped instead of being
/// reported as whitespace tokens.
/// </summary>
public bool IgnoreWhitespace
{
    get { return _ignoreWs; }
    set { _ignoreWs = value; }
}
/// <summary>
/// Gets or sets whether the <c>&lt;!--</c> and <c>--&gt;</c> markers are skipped
/// instead of being reported as comment open / close tokens.
/// </summary>
public bool IgnoreComments
{
    get { return _ignoreCs; }
    set { _ignoreCs = value; }
}
/// <summary>
/// Lazily yields the tokens of the source, one per character sequence
/// recognized by <c>Data</c>. Enumeration ends as soon as <c>Data</c>
/// returns <c>null</c>.
/// </summary>
public IEnumerable<CssToken> Tokens
{
    get
    {
        // Each iteration reads one character and lets Data consume the rest of the token.
        for (CssToken token = Data(base.Next); token != null; token = Data(base.Next)) {
            yield return token;
        }
    }
}
/// <summary>
/// Creates a tokenizer over the given text source.
/// </summary>
/// <param name="source">The text source, passed on to the base tokenizer.</param>
public CssTokenizer(ITextSource source) : base(source)
{
    // Scratch buffer in which the text of multi-character tokens is collected.
    _stringBuffer = new StringBuilder();
}
/// <summary>
/// Entry state of the tokenizer. Inspects <paramref name="current"/> and either returns
/// the matching single-character token directly or hands over to the state that
/// consumes a longer token (string, number, identifier, URL, ...).
/// </summary>
/// <param name="current">The character that was just read from the source.</param>
/// <returns>The next token, or <c>null</c> when the end-of-file marker is read.</returns>
private CssToken Data(char current)
{
    // NOTE(review): in the text this method was recovered from, the end-of-file marker
    // appeared as an empty character literal (''), which is not valid C# - the control
    // character was evidently lost. U+001A is assumed here; confirm against the value
    // the base tokenizer returns once the source is exhausted.
    const char EndOfFile = '\u001a';

    switch (current) {
    case '\t':
    case '\n':
    case '\r':
    case ' ':
        // Collapse a whole run of whitespace into (at most) one token.
        do {
            current = base.Next;
        } while (current.IsSpaceCharacter());
        if (_ignoreWs)
            return Data(current);
        Back();
        return CssSpecialCharacter.Whitespace;
    case '"':
        return StringDQ(base.Next);
    case '#':
        return HashStart(base.Next);
    case '$':
        // "$=" is the suffix match operator; a lone '$' is a delimiter.
        current = base.Next;
        if (current == '=')
            return CssMatchToken.Suffix;
        return CssToken.Delim(base.Previous);
    case '\'':
        return StringSQ(base.Next);
    case '(':
        return CssBracketToken.OpenRound;
    case ')':
        return CssBracketToken.CloseRound;
    case '*':
        // "*=" is the substring match operator.
        current = base.Next;
        if (current == '=')
            return CssMatchToken.Substring;
        return CssToken.Delim(base.Previous);
    case '+': {
        // Look ahead (up to two characters) to see whether a number follows the sign.
        char next2 = base.Next;
        if (next2 == EndOfFile)
            Back();
        else {
            char next3 = base.Next;
            Back(2);
            if (next2.IsDigit() || (next2 == '.' && next3.IsDigit()))
                return NumberStart(current);
        }
        return CssToken.Delim(current);
    }
    case ',':
        return CssSpecialCharacter.Comma;
    case '.': {
        // ".5" style number; otherwise a plain delimiter.
        char next = base.Next;
        if (next.IsDigit())
            return NumberStart(base.Previous);
        return CssToken.Delim(base.Previous);
    }
    case '-': {
        // '-' may start a number, an identifier, or the "-->" marker.
        char next4 = base.Next;
        if (next4 == EndOfFile)
            Back();
        else {
            char next5 = base.Next;
            Back(2);
            if (next4.IsDigit() || (next4 == '.' && next5.IsDigit()))
                return NumberStart(current);
            if (next4.IsNameStart())
                return IdentStart(current);
            if (next4 == '\\' && !next5.IsLineBreak() && next5 != EndOfFile)
                return IdentStart(current);
            if (next4 == '-' && next5 == '>') {
                Advance(2);
                if (_ignoreCs)
                    return Data(base.Next);
                return CssCommentToken.Close;
            }
        }
        return CssToken.Delim(current);
    }
    case '/':
        // "/*" opens a comment; a lone '/' is a delimiter.
        current = base.Next;
        if (current == '*')
            return Comment(base.Next);
        return CssToken.Delim(base.Previous);
    case '\\':
        // A backslash starts an identifier unless it is followed by a line break or EOF.
        current = base.Next;
        if (current.IsLineBreak() || current == EndOfFile) {
            RaiseErrorOccurred((current != EndOfFile) ? ErrorCode.LineBreakUnexpected : ErrorCode.EOF);
            return CssToken.Delim(base.Previous);
        }
        return IdentStart(base.Previous);
    case ':':
        return CssSpecialCharacter.Colon;
    case ';':
        return CssSpecialCharacter.Semicolon;
    case '<':
        // Recognize "<!--"; on a partial match every character read is stepped back over
        // so that only '<' is consumed.
        current = base.Next;
        if (current == '!') {
            current = base.Next;
            if (current == '-') {
                current = base.Next;
                if (current == '-') {
                    if (_ignoreCs)
                        return Data(base.Next);
                    return CssCommentToken.Open;
                }
                current = base.Previous;
            }
            current = base.Previous;
        }
        return CssToken.Delim(base.Previous);
    case '@':
        return AtKeywordStart(base.Next);
    case '[':
        return CssBracketToken.OpenSquare;
    case ']':
        return CssBracketToken.CloseSquare;
    case '^':
        // "^=" is the prefix match operator.
        current = base.Next;
        if (current == '=')
            return CssMatchToken.Prefix;
        return CssToken.Delim(base.Previous);
    case '{':
        return CssBracketToken.OpenCurly;
    case '}':
        return CssBracketToken.CloseCurly;
    case '0':
    case '1':
    case '2':
    case '3':
    case '4':
    case '5':
    case '6':
    case '7':
    case '8':
    case '9':
        return NumberStart(current);
    case 'U':
    case 'u':
        // "U+" followed by a hex digit or '?' is a unicode range; otherwise an identifier.
        current = base.Next;
        if (current == '+') {
            current = base.Next;
            if (current.IsHex() || current == '?')
                return UnicodeRange(current);
            current = base.Previous;
        }
        return IdentStart(base.Previous);
    case '|':
        // "|=" is the dash match operator, "||" the column token.
        current = base.Next;
        switch (current) {
        case '=':
            return CssMatchToken.Dash;
        case '|':
            return CssToken.Column;
        default:
            return CssToken.Delim(base.Previous);
        }
    case '~':
        // "~=" is the include match operator.
        current = base.Next;
        if (current == '=')
            return CssMatchToken.Include;
        return CssToken.Delim(base.Previous);
    case EndOfFile:
        // Signals the Tokens enumeration to stop.
        return null;
    case '!':
        // "!=" is the not-match operator.
        current = base.Next;
        if (current == '=')
            return CssMatchToken.Not;
        return CssToken.Delim(base.Previous);
    default:
        if (current.IsNameStart())
            return IdentStart(current);
        return CssToken.Delim(current);
    }
}
/// <summary>
/// Consumes the body of a double-quoted string. The opening quote has already been read.
/// </summary>
/// <param name="current">The first character after the opening quote.</param>
/// <returns>
/// A string token holding the collected text; it is flagged as bad (last argument
/// <c>true</c>) when the string is cut short by a line break or by an escape at end of file.
/// </returns>
private CssToken StringDQ(char current)
{
    // NOTE(review): both constants below were empty character literals ('') in the text this
    // method was recovered from (invalid C#, and duplicate case labels). They must be two
    // different characters; one ends the string like a quote does (assumed: the end-of-file
    // marker U+001A), the other is reported as a line break next to '\n' (assumed: form
    // feed). Confirm both against the base tokenizer.
    const char EndOfFile = '\u001a';
    const char FormFeed = '\f';

    while (true) {
        switch (current) {
        case EndOfFile:
        case '"':
            return CssStringToken.Plain(FlushBuffer(), false);
        case '\n':
        case FormFeed:
            // Unescaped line break: report it and leave the character for the next token.
            RaiseErrorOccurred(ErrorCode.LineBreakUnexpected);
            Back();
            return CssStringToken.Plain(FlushBuffer(), true);
        case '\\':
            current = base.Next;
            if (current.IsLineBreak())
                _stringBuffer.AppendLine();
            else {
                if (current == EndOfFile) {
                    RaiseErrorOccurred(ErrorCode.EOF);
                    Back();
                    return CssStringToken.Plain(FlushBuffer(), true);
                }
                _stringBuffer.Append(ConsumeEscape(current));
            }
            break;
        default:
            _stringBuffer.Append(current);
            break;
        }
        current = base.Next;
    }
}
/// <summary>
/// Consumes the body of a single-quoted string. The opening quote has already been read.
/// </summary>
/// <param name="current">The first character after the opening quote.</param>
/// <returns>
/// A string token holding the collected text; it is flagged as bad (last argument
/// <c>true</c>) when the string is cut short by a line break or by an escape at end of file.
/// </returns>
private CssToken StringSQ(char current)
{
    // NOTE(review): both constants below were empty character literals ('') in the text this
    // method was recovered from (invalid C#, and duplicate case labels). They must be two
    // different characters; one ends the string like a quote does (assumed: the end-of-file
    // marker U+001A), the other is reported as a line break next to '\n' (assumed: form
    // feed). Confirm both against the base tokenizer.
    const char EndOfFile = '\u001a';
    const char FormFeed = '\f';

    while (true) {
        switch (current) {
        case EndOfFile:
        case '\'':
            return CssStringToken.Plain(FlushBuffer(), false);
        case '\n':
        case FormFeed:
            // Unescaped line break: report it and leave the character for the next token.
            RaiseErrorOccurred(ErrorCode.LineBreakUnexpected);
            Back();
            return CssStringToken.Plain(FlushBuffer(), true);
        case '\\':
            current = base.Next;
            if (current.IsLineBreak())
                _stringBuffer.AppendLine();
            else {
                if (current == EndOfFile) {
                    RaiseErrorOccurred(ErrorCode.EOF);
                    Back();
                    return CssStringToken.Plain(FlushBuffer(), true);
                }
                _stringBuffer.Append(ConsumeEscape(current));
            }
            break;
        default:
            _stringBuffer.Append(current);
            break;
        }
        current = base.Next;
    }
}
/// <summary>
/// Decides whether a '#' starts a hash token. The '#' itself has already been read.
/// </summary>
/// <param name="current">The character following the '#'.</param>
/// <returns>
/// The token produced by <c>HashRest</c> when a name start or valid escape follows,
/// otherwise a '#' delimiter (with the read character handed back).
/// </returns>
private CssToken HashStart(char current)
{
    if (current.IsNameStart()) {
        _stringBuffer.Append(current);
    } else if (IsValidEscape(current)) {
        // Move onto the escaped character and store its decoded form.
        char escaped = base.Next;
        _stringBuffer.Append(ConsumeEscape(escaped));
    } else {
        // A backslash that does not form a valid escape is reported before giving up.
        if (current == '\\') {
            RaiseErrorOccurred(ErrorCode.InvalidCharacter);
        }
        Back();
        return CssToken.Delim('#');
    }
    return HashRest(base.Next);
}
/// <summary>
/// Consumes the remaining name characters and escapes of a hash token.
/// </summary>
/// <param name="current">The character after the part already buffered.</param>
/// <returns>A hash keyword token built from the buffered text.</returns>
private CssToken HashRest(char current)
{
    for (; ; current = base.Next) {
        if (current.IsName()) {
            _stringBuffer.Append(current);
            continue;
        }
        if (!IsValidEscape(current)) {
            break;
        }
        // Step onto the escaped character and append its decoded form.
        current = base.Next;
        _stringBuffer.Append(ConsumeEscape(current));
    }
    // The name ended on a backslash that is not a valid escape: report it.
    if (current == '\\') {
        RaiseErrorOccurred(ErrorCode.InvalidCharacter);
    }
    // Hand the terminating character back to the caller.
    Back();
    return CssKeywordToken.Hash(FlushBuffer());
}
/// <summary>
/// Skips the body of a <c>/* ... */</c> comment. The opening "/*" has already been read.
/// No token is produced for the comment itself; tokenizing continues in <c>Data</c>.
/// </summary>
/// <param name="current">The first character after "/*".</param>
/// <returns>The token that follows the comment (whatever <c>Data</c> returns).</returns>
private CssToken Comment(char current)
{
    // NOTE(review): the end-of-file marker was an empty character literal ('') in the text
    // this method was recovered from, which is not valid C#. U+001A is assumed; confirm
    // against the base tokenizer.
    const char EndOfFile = '\u001a';

    while (true) {
        if (current == '*') {
            current = base.Next;
            if (current == '/')
                return Data(base.Next);
            // Re-examine the character after the '*' instead of skipping it: it may itself
            // be the '*' of the closing "*/" (as in "/***/") or the end-of-file marker.
            // Previously it was stepped over, so such comments were never closed.
            continue;
        }
        if (current == EndOfFile) {
            // Unterminated comment.
            RaiseErrorOccurred(ErrorCode.EOF);
            return Data(current);
        }
        current = base.Next;
    }
}
/// <summary>
/// Decides whether an '@' starts an at-keyword. The '@' itself has already been read.
/// </summary>
/// <param name="current">The character following the '@'.</param>
/// <returns>
/// The token produced by <c>AtKeywordRest</c> when a name follows, otherwise an '@'
/// delimiter with every look-ahead character handed back.
/// </returns>
private CssToken AtKeywordStart(char current)
{
    if (current == '-') {
        // A leading dash is only part of the name if a name start or escape follows it.
        char following = base.Next;
        if (!following.IsNameStart() && !IsValidEscape(following)) {
            Back(2);
            return CssToken.Delim('@');
        }
        _stringBuffer.Append('-');
        return AtKeywordRest(following);
    }

    if (current.IsNameStart()) {
        _stringBuffer.Append(current);
    } else if (IsValidEscape(current)) {
        // Step onto the escaped character and store its decoded form.
        char escaped = base.Next;
        _stringBuffer.Append(ConsumeEscape(escaped));
    } else {
        Back();
        return CssToken.Delim('@');
    }
    return AtKeywordRest(base.Next);
}
/// <summary>
/// Consumes the remaining name characters and escapes of an at-keyword.
/// </summary>
/// <param name="current">The character after the part already buffered.</param>
/// <returns>An at-keyword token built from the buffered text.</returns>
private CssToken AtKeywordRest(char current)
{
    for (; ; current = base.Next) {
        if (current.IsName()) {
            _stringBuffer.Append(current);
            continue;
        }
        if (!IsValidEscape(current)) {
            break;
        }
        // Step onto the escaped character and append its decoded form.
        current = base.Next;
        _stringBuffer.Append(ConsumeEscape(current));
    }
    // Hand the terminating character back to the caller.
    Back();
    return CssKeywordToken.At(FlushBuffer());
}
/// <summary>
/// Begins an identifier at <paramref name="current"/>, which may be a dash, a name-start
/// character or a backslash opening an escape.
/// </summary>
/// <param name="current">The first character of the prospective identifier.</param>
/// <returns>
/// The token produced by <c>IdentRest</c>; a '-' delimiter when a dash is not followed by
/// a name; or whatever <c>Data</c> returns when nothing identifier-like is found.
/// </returns>
private CssToken IdentStart(char current)
{
    if (current == '-') {
        char following = base.Next;
        if (!following.IsNameStart() && !IsValidEscape(following)) {
            // Lone dash: give the look-ahead character back.
            Back();
            return CssToken.Delim('-');
        }
        _stringBuffer.Append('-');
        return IdentRest(following);
    }

    if (current.IsNameStart()) {
        _stringBuffer.Append(current);
        return IdentRest(base.Next);
    }

    if (current == '\\' && IsValidEscape(current)) {
        // Step onto the escaped character and store its decoded form.
        char escaped = base.Next;
        _stringBuffer.Append(ConsumeEscape(escaped));
        return IdentRest(base.Next);
    }

    return Data(current);
}
/// <summary>
/// Consumes the remaining name characters and escapes of an identifier and classifies
/// the result: a URL-like function (handed to <c>UrlStart</c>), a function token when
/// '(' follows directly, or a plain identifier.
/// </summary>
/// <param name="current">The character after the part already buffered.</param>
/// <returns>The URL, function or identifier token.</returns>
private CssToken IdentRest(char current)
{
    while (true) {
        if (current.IsName())
            _stringBuffer.Append(current);
        else {
            if (!IsValidEscape(current))
                break;
            current = base.Next;
            _stringBuffer.Append(ConsumeEscape(current));
        }
        current = base.Next;
    }
    if (current == '(') {
        // Lower-case with the invariant culture: the culture-sensitive ToLower() maps 'I'
        // to a dotless 'ı' under e.g. Turkish settings, so "URL(" would stop matching.
        string name = _stringBuffer.ToString().ToLowerInvariant();
        if (name == FunctionNames.Url) {
            _stringBuffer.Clear();
            return UrlStart(base.Next, CssTokenType.Url);
        }
        if (name == FunctionNames.Domain) {
            _stringBuffer.Clear();
            return UrlStart(base.Next, CssTokenType.Domain);
        }
        if (name == FunctionNames.Url_Prefix) {
            _stringBuffer.Clear();
            return UrlStart(base.Next, CssTokenType.UrlPrefix);
        }
        // Any other name followed by '(' is a function; the name keeps its original casing.
        return CssKeywordToken.Function(FlushBuffer());
    }
    // Hand the terminating character back to the caller.
    Back();
    return CssKeywordToken.Ident(FlushBuffer());
}
/// <summary>
/// Reads ahead over whitespace looking for '('. If one is found the buffered name becomes
/// a function token, otherwise an identifier token.
/// </summary>
/// <param name="current">Overwritten by the first read; its incoming value is not used.</param>
/// <returns>A function token or an identifier token built from the buffered text.</returns>
private CssToken TransformFunctionWhitespace(char current)
{
    while (true) {
        current = base.Next;
        if (current == '(') {
            Back();
            return CssKeywordToken.Function(FlushBuffer());
        }
        if (!current.IsSpaceCharacter()) {
            break;
        }
    }
    // No '(' found: step back two characters before emitting the identifier.
    Back(2);
    return CssKeywordToken.Ident(FlushBuffer());
}
/// <summary>
/// Begins a number at a sign, a decimal point or a digit and hands over to
/// <c>NumberRest</c> (integer part) or <c>NumberFraction</c> (after the decimal point).
/// </summary>
/// <param name="current">The first character of the number.</param>
/// <returns>The numeric token produced by the follow-up state.</returns>
private CssToken NumberStart(char current)
{
    while (true) {
        if (current == '+' || current == '-') {
            _stringBuffer.Append(current);
            current = base.Next;
            if (current == '.') {
                // Sign, point, then the first fraction digit.
                _stringBuffer.Append(current);
                _stringBuffer.Append(base.Next);
                return NumberFraction(base.Next);
            }
            _stringBuffer.Append(current);
            return NumberRest(base.Next);
        }

        if (current == '.') {
            // Point followed by the first fraction digit.
            _stringBuffer.Append(current);
            _stringBuffer.Append(base.Next);
            return NumberFraction(base.Next);
        }

        if (current.IsDigit()) {
            break;
        }
        current = base.Next;
    }

    _stringBuffer.Append(current);
    return NumberRest(base.Next);
}
/// <summary>
/// Consumes the integer digits of a number and decides what follows: a unit (dimension),
/// a fraction, a percent sign, an exponent, a dash, or the end of the number.
/// </summary>
/// <param name="current">The character after the part already buffered.</param>
/// <returns>A number, percentage or dimension token.</returns>
private CssToken NumberRest(char current)
{
    for (; current.IsDigit(); current = base.Next) {
        _stringBuffer.Append(current);
    }

    if (current.IsNameStart()) {
        // A name directly after the digits is the unit of a dimension.
        string value = FlushBuffer();
        _stringBuffer.Append(current);
        return Dimension(base.Next, value);
    }

    if (IsValidEscape(current)) {
        // The unit starts with an escape sequence.
        current = base.Next;
        string value = FlushBuffer();
        _stringBuffer.Append(ConsumeEscape(current));
        return Dimension(base.Next, value);
    }

    if (current == '.') {
        current = base.Next;
        if (current.IsDigit()) {
            _stringBuffer.Append('.').Append(current);
            return NumberFraction(base.Next);
        }
        Back();
        return CssToken.Number(FlushBuffer());
    }

    if (current == '%') {
        return CssUnitToken.Percentage(FlushBuffer());
    }

    // NOTE(review): this branch only runs if IsNameStart() rejects 'e'/'E' (helper not
    // visible here); if it accepts them, the name-start branch above wins - confirm.
    if (current == 'E' || current == 'e') {
        return NumberExponential(current);
    }

    if (current == '-') {
        return NumberDash(current);
    }

    // Anything else ends the number; hand the character back.
    Back();
    return CssToken.Number(FlushBuffer());
}
/// <summary>
/// Consumes the fraction digits of a number and decides what follows: a unit (dimension),
/// an exponent, a percent sign, a dash, or the end of the number.
/// </summary>
/// <param name="current">The character after the part already buffered.</param>
/// <returns>A number, percentage or dimension token.</returns>
private CssToken NumberFraction(char current)
{
    for (; current.IsDigit(); current = base.Next) {
        _stringBuffer.Append(current);
    }

    if (current.IsNameStart()) {
        // A name directly after the digits is the unit of a dimension.
        string value = FlushBuffer();
        _stringBuffer.Append(current);
        return Dimension(base.Next, value);
    }

    if (IsValidEscape(current)) {
        // The unit starts with an escape sequence.
        current = base.Next;
        string value = FlushBuffer();
        _stringBuffer.Append(ConsumeEscape(current));
        return Dimension(base.Next, value);
    }

    // NOTE(review): this branch only runs if IsNameStart() rejects 'e'/'E' (helper not
    // visible here); if it accepts them, the name-start branch above wins - confirm.
    if (current == 'E' || current == 'e') {
        return NumberExponential(current);
    }

    if (current == '%') {
        return CssUnitToken.Percentage(FlushBuffer());
    }

    if (current == '-') {
        return NumberDash(current);
    }

    // Anything else ends the number; hand the character back.
    Back();
    return CssToken.Number(FlushBuffer());
}
/// <summary>
/// Consumes the remaining name characters and escapes of a unit and combines it with the
/// already extracted number.
/// </summary>
/// <param name="current">The character after the part of the unit already buffered.</param>
/// <param name="number">The textual number that precedes the unit.</param>
/// <returns>A dimension token made of <paramref name="number"/> and the buffered unit.</returns>
private CssToken Dimension(char current, string number)
{
    for (; ; current = base.Next) {
        if (current.IsName()) {
            _stringBuffer.Append(current);
            continue;
        }
        if (!IsValidEscape(current)) {
            break;
        }
        // Step onto the escaped character and append its decoded form.
        current = base.Next;
        _stringBuffer.Append(ConsumeEscape(current));
    }
    // Hand the terminating character back to the caller.
    Back();
    return CssUnitToken.Dimension(number, FlushBuffer());
}
/// <summary>
/// Consumes the remaining exponent digits of a number in scientific notation.
/// </summary>
/// <param name="current">The character after the part already buffered.</param>
/// <returns>A number token built from the buffered text.</returns>
private CssToken SciNotation(char current)
{
    for (; current.IsDigit(); current = base.Next) {
        _stringBuffer.Append(current);
    }
    // Hand the first non-digit back to the caller.
    Back();
    return CssToken.Number(FlushBuffer());
}
/// <summary>
/// Starts a URL-like token after its opening parenthesis: skips leading whitespace and
/// dispatches on whether the content is double-quoted, single-quoted, unquoted or empty.
/// </summary>
/// <param name="current">The first character after the '('.</param>
/// <param name="type">The token type to create (url, domain or url-prefix).</param>
/// <returns>The URL token; flagged as bad when the source ends right away.</returns>
private CssToken UrlStart(char current, CssTokenType type)
{
    // NOTE(review): the end-of-file marker was an empty character literal ('') in the text
    // this method was recovered from, which is not valid C#. U+001A is assumed; confirm
    // against the base tokenizer.
    const char EndOfFile = '\u001a';

    while (current.IsSpaceCharacter()) {
        current = base.Next;
    }
    switch (current) {
    case EndOfFile:
        RaiseErrorOccurred(ErrorCode.EOF);
        return CssStringToken.Url(type, string.Empty, true);
    case '"':
        return UrlDQ(base.Next, type);
    case '\'':
        return UrlSQ(base.Next, type);
    case ')':
        // Empty argument list.
        return CssStringToken.Url(type, string.Empty, false);
    default:
        return UrlUQ(current, type);
    }
}
/// <summary>
/// Consumes the body of a double-quoted URL. The opening quote has already been read.
/// </summary>
/// <param name="current">The first character after the opening quote.</param>
/// <param name="type">The token type to create.</param>
/// <returns>
/// The URL token; an unescaped line break switches to <c>UrlBad</c>, and an escape cut
/// off by the end of the source yields a token flagged as bad.
/// </returns>
private CssToken UrlDQ(char current, CssTokenType type)
{
    // NOTE(review): the end-of-file marker was an empty character literal ('') in the text
    // this method was recovered from, which is not valid C#. U+001A is assumed; confirm
    // against the base tokenizer.
    const char EndOfFile = '\u001a';

    while (true) {
        if (current.IsLineBreak()) {
            RaiseErrorOccurred(ErrorCode.LineBreakUnexpected);
            return UrlBad(base.Next, type);
        }
        if (current == EndOfFile)
            break;
        switch (current) {
        case '"':
            // Closing quote: only whitespace and ')' may follow.
            return UrlEnd(base.Next, type);
        case '\\':
            current = base.Next;
            if (current == EndOfFile) {
                Back(2);
                RaiseErrorOccurred(ErrorCode.EOF);
                return CssStringToken.Url(type, FlushBuffer(), true);
            }
            if (current.IsLineBreak())
                _stringBuffer.AppendLine();
            else
                _stringBuffer.Append(ConsumeEscape(current));
            break;
        default:
            _stringBuffer.Append(current);
            break;
        }
        current = base.Next;
    }
    // The source ended inside the quotes: emit what was collected.
    return CssStringToken.Url(type, FlushBuffer(), false);
}
/// <summary>
/// Consumes the body of a single-quoted URL. The opening quote has already been read.
/// </summary>
/// <param name="current">The first character after the opening quote.</param>
/// <param name="type">The token type to create.</param>
/// <returns>
/// The URL token; an unescaped line break switches to <c>UrlBad</c>, and an escape cut
/// off by the end of the source yields a token flagged as bad.
/// </returns>
private CssToken UrlSQ(char current, CssTokenType type)
{
    // NOTE(review): the end-of-file marker was an empty character literal ('') in the text
    // this method was recovered from, which is not valid C#. U+001A is assumed; confirm
    // against the base tokenizer.
    const char EndOfFile = '\u001a';

    while (true) {
        if (current.IsLineBreak()) {
            RaiseErrorOccurred(ErrorCode.LineBreakUnexpected);
            return UrlBad(base.Next, type);
        }
        if (current == EndOfFile)
            break;
        switch (current) {
        case '\'':
            // Closing quote: only whitespace and ')' may follow.
            return UrlEnd(base.Next, type);
        case '\\':
            current = base.Next;
            if (current == EndOfFile) {
                Back(2);
                RaiseErrorOccurred(ErrorCode.EOF);
                return CssStringToken.Url(type, FlushBuffer(), true);
            }
            if (current.IsLineBreak())
                _stringBuffer.AppendLine();
            else
                _stringBuffer.Append(ConsumeEscape(current));
            break;
        default:
            _stringBuffer.Append(current);
            break;
        }
        current = base.Next;
    }
    // The source ended inside the quotes: emit what was collected.
    return CssStringToken.Url(type, FlushBuffer(), false);
}
/// <summary>
/// Consumes the body of an unquoted URL.
/// </summary>
/// <param name="current">The first character of the URL.</param>
/// <param name="type">The token type to create.</param>
/// <returns>
/// The URL token. Quotes, '(', non-printable characters and invalid escapes are reported
/// as invalid characters and switch to <c>UrlBad</c>.
/// </returns>
private CssToken UrlUQ(char current, CssTokenType type)
{
    // NOTE(review): the end-of-file marker was an empty character literal ('') in the text
    // this method was recovered from, which is not valid C#. U+001A is assumed; confirm
    // against the base tokenizer.
    const char EndOfFile = '\u001a';

    while (true) {
        // Whitespace ends the URL; only more whitespace and ')' may follow.
        if (current.IsSpaceCharacter())
            return UrlEnd(base.Next, type);

        if (current == EndOfFile || current == ')')
            return CssStringToken.Url(type, FlushBuffer(), false);

        if (current == '"' || current == '\'' || current == '(' || current.IsNonPrintable()) {
            RaiseErrorOccurred(ErrorCode.InvalidCharacter);
            return UrlBad(base.Next, type);
        }

        if (current == '\\') {
            if (!IsValidEscape(current)) {
                RaiseErrorOccurred(ErrorCode.InvalidCharacter);
                return UrlBad(base.Next, type);
            }
            // Step onto the escaped character and append its decoded form.
            current = base.Next;
            _stringBuffer.Append(ConsumeEscape(current));
        } else
            _stringBuffer.Append(current);

        current = base.Next;
    }
}
/// <summary>
/// Consumes the trailing whitespace of a URL up to the closing ')'.
/// </summary>
/// <param name="current">The first character after the URL content.</param>
/// <param name="type">The token type to create.</param>
/// <returns>
/// The URL token, or the result of <c>UrlBad</c> when anything other than whitespace
/// precedes the ')'.
/// </returns>
private CssToken UrlEnd(char current, CssTokenType type)
{
    while (current != ')') {
        if (!current.IsSpaceCharacter()) {
            RaiseErrorOccurred(ErrorCode.InvalidCharacter);
            return UrlBad(current, type);
        }
        current = base.Next;
    }
    return CssStringToken.Url(type, FlushBuffer(), false);
}
/// <summary>
/// Recovery state for a malformed URL: reads on until the closing ')' or the end of the
/// source and emits a URL token flagged as bad.
/// </summary>
/// <param name="current">The character at which recovery starts.</param>
/// <param name="type">The token type to create.</param>
/// <returns>A URL token flagged as bad, built from the buffered text.</returns>
private CssToken UrlBad(char current, CssTokenType type)
{
    // NOTE(review): the end-of-file marker was an empty character literal ('') in the text
    // this method was recovered from, which is not valid C#. U+001A is assumed; confirm
    // against the base tokenizer.
    const char EndOfFile = '\u001a';

    while (true) {
        switch (current) {
        case EndOfFile:
            RaiseErrorOccurred(ErrorCode.EOF);
            return CssStringToken.Url(type, FlushBuffer(), true);
        case ')':
            return CssStringToken.Url(type, FlushBuffer(), true);
        }
        // Escapes are consumed as a unit so that an escaped ')' does not end the token.
        if (IsValidEscape(current)) {
            current = base.Next;
            _stringBuffer.Append(ConsumeEscape(current));
        }
        current = base.Next;
    }
}
/// <summary>
/// Consumes a unicode range. The "U+" prefix has already been read.
/// </summary>
/// <param name="current">The first hex digit or '?' after "U+".</param>
/// <returns>
/// A range token: for fewer than six hex digits the text (padded with any '?' wildcards
/// that follow) with '?' replaced by '0' for the start and 'F' for the end; for six hex
/// digits either an explicit "start-end" pair or a start with a <c>null</c> end.
/// </returns>
private CssToken UnicodeRange(char current)
{
    // Leading hex digits, six at most.
    for (int i = 0; i < 6 && current.IsHex(); i++) {
        _stringBuffer.Append(current);
        current = base.Next;
    }

    if (_stringBuffer.Length != 6) {
        // Fill up to six characters with '?' wildcards. The number of missing characters
        // is fixed up front: comparing the counter against a bound that shrinks while the
        // buffer grows stopped after about half of the wildcards.
        int missing = 6 - _stringBuffer.Length;
        for (int j = 0; j < missing && current == '?'; j++) {
            _stringBuffer.Append(current);
            current = base.Next;
        }
        // 'current' is the first character that is not part of the token. Always hand it
        // back - also when all six positions were filled, which previously dropped it.
        Back();
        string text = FlushBuffer();
        string start = text.Replace('?', '0');
        string end = text.Replace('?', 'F');
        return CssToken.Range(start, end);
    }

    // NOTE(review): an explicit "start-end" range is only recognized after exactly six
    // hex digits; shorter forms such as "U+0025-00FF" end up in the branch above. Left
    // as is - confirm whether that restriction is intended.
    if (current == '-') {
        current = base.Next;
        if (current.IsHex()) {
            string rangeStart = FlushBuffer();
            for (int k = 0; k < 6 && current.IsHex(); k++) {
                _stringBuffer.Append(current);
                current = base.Next;
            }
            // Hand back the character after the end value (also after a full six digits).
            Back();
            return CssToken.Range(rangeStart, FlushBuffer());
        }
        // The dash is not part of the range: give back the dash and the look-ahead.
        Back(2);
        return CssToken.Range(FlushBuffer(), null);
    }

    Back();
    return CssToken.Range(FlushBuffer(), null);
}
/// <summary>
/// Returns the text collected in the string buffer and empties the buffer.
/// </summary>
/// <returns>The buffered text as it was before clearing.</returns>
private string FlushBuffer()
{
    string collected = _stringBuffer.ToString();
    _stringBuffer.Clear();
    return collected;
}
/// <summary>
/// Handles an 'e'/'E' after a number: if digits (optionally signed) follow, the exponent
/// is appended and consumed; otherwise the letter is treated as the start of a unit.
/// </summary>
/// <param name="current">The exponent letter; its value is not used, the reader is re-read.</param>
/// <returns>A number token with exponent, or a dimension token.</returns>
private CssToken NumberExponential(char current)
{
    char following = base.Next;

    if (following.IsDigit()) {
        _stringBuffer.Append('e').Append(following);
        return SciNotation(base.Next);
    }

    if (following == '+' || following == '-') {
        char afterSign = base.Next;
        if (afterSign.IsDigit()) {
            _stringBuffer.Append('e').Append(following).Append(afterSign);
            return SciNotation(base.Next);
        }
        // Not an exponent: step back over the character read after the sign.
        Back();
    }

    // Step back once more and start a unit with the character found there.
    char unitStart = base.Previous;
    string number = FlushBuffer();
    _stringBuffer.Append(unitStart);
    return Dimension(base.Next, number);
}
/// <summary>
/// Handles a '-' directly after a number: if a name start or escape follows, the dash
/// begins the unit of a dimension; otherwise the number ends before the dash.
/// </summary>
/// <param name="current">The dash; its value is not used, the reader is re-read.</param>
/// <returns>A dimension token, or a plain number token.</returns>
private CssToken NumberDash(char current)
{
    char following = base.Next;

    if (following.IsNameStart()) {
        string value = FlushBuffer();
        _stringBuffer.Append('-').Append(following);
        return Dimension(base.Next, value);
    }

    if (IsValidEscape(following)) {
        // Step onto the escaped character; the unit is the dash plus its decoded form.
        char escaped = base.Next;
        string value = FlushBuffer();
        _stringBuffer.Append('-').Append(ConsumeEscape(escaped));
        return Dimension(base.Next, value);
    }

    // Neither: give back the look-ahead and the dash, and emit the number alone.
    Back(2);
    return CssToken.Number(FlushBuffer());
}
/// <summary>
/// Decodes an escape sequence. The backslash has already been consumed.
/// </summary>
/// <param name="current">The first character after the backslash.</param>
/// <returns>
/// For one to six hex digits, the character(s) of that code point (U+FFFD when the value
/// is a surrogate or beyond U+10FFFF); otherwise <paramref name="current"/> itself.
/// </returns>
private string ConsumeEscape(char current)
{
    if (!current.IsHex())
        return current.ToString();

    StringBuilder digits = new StringBuilder(6);
    for (int i = 0; i < 6; i++) {
        digits.Append(current);
        current = base.Next;
        if (!current.IsHex())
            break;
    }
    // One character too many was read; step back onto the last hex digit.
    current = base.Previous;

    int codePoint = int.Parse(digits.ToString(), NumberStyles.HexNumber, CultureInfo.InvariantCulture);

    // char.ConvertFromUtf32 throws ArgumentOutOfRangeException for surrogates and for
    // values above U+10FFFF, so input such as "\FFFFFF" would abort tokenizing. Map those
    // to the replacement character instead.
    bool isSurrogate = codePoint >= 0xD800 && codePoint <= 0xDFFF;
    if (isSurrogate || codePoint > 0x10FFFF)
        return "\uFFFD";

    return char.ConvertFromUtf32(codePoint);
}
/// <summary>
/// Checks whether <paramref name="current"/> is a backslash that starts a valid escape,
/// i.e. one that is followed by neither a line break nor the end of the source.
/// The reader position is unchanged afterwards.
/// </summary>
/// <param name="current">The character to test.</param>
/// <returns><c>true</c> if a valid escape starts here; otherwise <c>false</c>.</returns>
private bool IsValidEscape(char current)
{
    // NOTE(review): the end-of-file marker was an empty character literal ('') in the text
    // this method was recovered from, which is not valid C#. U+001A is assumed; confirm
    // against the base tokenizer.
    const char EndOfFile = '\u001a';

    if (current != '\\')
        return false;

    // Peek at the character after the backslash, then restore the position.
    current = base.Next;
    Back();

    return current != EndOfFile && !current.IsLineBreak();
}
}
}