TextSource
A stream abstraction to handle encoding and more.
using AngleSharp.Extensions;
using System;
using System.Diagnostics;
using System.IO;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
namespace AngleSharp
{
/// <summary>
/// Exposes a byte stream (or an in-memory string) as a growable character
/// buffer that is read sequentially through <see cref="Index"/>. While the
/// encoding is still tentative it can be replaced via
/// <see cref="CurrentEncoding"/>; the bytes read so far are then re-decoded.
/// </summary>
[DebuggerStepThrough]
internal sealed class TextSource : IDisposable
{
    /// <summary>
    /// How much the currently used encoding can be trusted.
    /// </summary>
    private enum EncodingConfidence
    {
        /// <summary>The encoding may still be replaced through <see cref="CurrentEncoding"/>.</summary>
        Tentative,

        /// <summary>The encoding is fixed; raw bytes are no longer recorded.</summary>
        Certain,

        /// <summary>Set when the source was created from a string.</summary>
        Irrelevant
    }

    /// <summary>Number of bytes requested from the base stream per read.</summary>
    private const int BufferSize = 4096;

    // NOTE(review): the end-of-input literal was unreadable in the reviewed
    // source (an empty character literal). U+FFFF is assumed to be the intended
    // sentinel - confirm against the tokenizer's end-of-file symbol.
    private const char EndOfFile = char.MaxValue;

    private readonly Stream _baseStream;

    // Copy of every byte read while the encoding is not yet certain, so the
    // content can be decoded again after an encoding switch.
    private readonly MemoryStream _raw;
    private readonly byte[] _buffer;
    private readonly char[] _chars;
    private StringBuilder _content;
    private EncodingConfidence _confidence;
    private bool _finished;
    private Encoding _encoding;
    private Decoder _decoder;
    private int _index;

    /// <summary>
    /// Gets everything decoded so far as a single string.
    /// </summary>
    public string Text => _content.ToString();

    /// <summary>
    /// Gets the already decoded character at the given position.
    /// </summary>
    /// <param name="index">Zero-based position within the decoded content.</param>
    public char this[int index] => _content[index];

    /// <summary>
    /// Gets or sets the encoding used to decode the base stream.
    /// </summary>
    /// <remarks>
    /// Setting has an effect only while the confidence is tentative. If the
    /// current encoding is already a Unicode one, or the new value equals the
    /// current one, the encoding is merely marked as certain. Otherwise the
    /// recorded raw bytes are decoded again with the new encoding.
    /// </remarks>
    /// <exception cref="NotSupportedException">
    /// The part of the content that was already consumed decodes differently
    /// with the new encoding. The content has been replaced and
    /// <see cref="Index"/> reset to zero before the exception is thrown, so the
    /// caller can start over.
    /// </exception>
    public Encoding CurrentEncoding
    {
        get
        {
            return _encoding;
        }

        set
        {
            if (_confidence != EncodingConfidence.Tentative)
            {
                return;
            }

            if (_encoding.IsUnicode())
            {
                _confidence = EncodingConfidence.Certain;
                return;
            }

            if (value.IsUnicode())
            {
                value = TextEncoding.Utf8;
            }

            if (value == _encoding)
            {
                _confidence = EncodingConfidence.Certain;
                return;
            }

            _encoding = value;
            _decoder = value.GetDecoder();

            // Decode everything read so far with the new decoder. The decoder
            // is not flushed, so a trailing partial sequence stays pending and
            // is completed by the next chunk read from the stream.
            byte[] rawBytes = _raw.ToArray();
            char[] decodedChars = new char[_encoding.GetMaxCharCount(rawBytes.Length)];
            int decodedLength = _decoder.GetChars(rawBytes, 0, rawBytes.Length, decodedChars, 0);
            string decoded = new string(decodedChars, 0, decodedLength);

            // _index may point past the content (it is advanced even when the
            // end-of-input sentinel is returned), hence the clamp to the
            // content length; without it the prefix extraction below throws.
            int consumed = Math.Min(Math.Min(_index, decoded.Length), _content.Length);

            if (!decoded.Substring(0, consumed).Is(_content.ToString(0, consumed)))
            {
                // Already consumed characters differ: swap in the new content
                // and signal that reading has to restart from the beginning.
                _index = 0;
                _content.Clear().Append(decoded);
                throw new NotSupportedException();
            }

            // The consumed prefix is unaffected; only replace what follows it.
            _confidence = EncodingConfidence.Certain;
            _content.Remove(consumed, _content.Length - consumed);
            _content.Append(decoded.Substring(consumed));
        }
    }

    /// <summary>
    /// Gets or sets the position of the next character to read.
    /// </summary>
    public int Index
    {
        get
        {
            return _index;
        }

        set
        {
            _index = value;
        }
    }

    /// <summary>
    /// Gets the number of characters decoded so far.
    /// </summary>
    public int Length => _content.Length;

    /// <summary>
    /// Allocates the buffers and the decoder shared by all sources.
    /// </summary>
    /// <param name="encoding">Initial encoding; UTF-8 is used when null.</param>
    private TextSource(Encoding encoding)
    {
        _buffer = new byte[BufferSize];

        // One slot more than the byte buffer - presumably room for a character
        // completed from bytes the decoder carried over from the previous chunk.
        _chars = new char[BufferSize + 1];
        _raw = new MemoryStream();
        _index = 0;
        _encoding = encoding ?? TextEncoding.Utf8;
        _decoder = _encoding.GetDecoder();
    }

    /// <summary>
    /// Creates a source over an in-memory string. No stream is attached and
    /// the source is marked as finished right away.
    /// </summary>
    /// <param name="source">The complete text.</param>
    public TextSource(string source)
        : this(null, TextEncoding.Utf8)
    {
        _finished = true;
        _content.Append(source);
        _confidence = EncodingConfidence.Irrelevant;
    }

    /// <summary>
    /// Creates a source that decodes the given stream on demand.
    /// </summary>
    /// <param name="baseStream">The stream to read bytes from.</param>
    /// <param name="encoding">Initial (tentative) encoding; UTF-8 is used when null.</param>
    public TextSource(Stream baseStream, Encoding encoding = null)
        : this(encoding)
    {
        _baseStream = baseStream;
        _content = Pool.NewStringBuilder();
        _confidence = EncodingConfidence.Tentative;
    }

    /// <summary>
    /// Disposes the raw byte copy and hands the content builder back to the
    /// pool. The base stream is not disposed here. Calling this more than
    /// once is harmless.
    /// </summary>
    public void Dispose()
    {
        if (_content == null)
        {
            return;
        }

        _raw.Dispose();
        _content.Clear().ToPool();
        _content = null;
    }

    /// <summary>
    /// Reads the next character, pulling more data from the stream if needed.
    /// </summary>
    /// <returns>
    /// The next character, or the end-of-input sentinel when nothing is left.
    /// The index is advanced in either case.
    /// </returns>
    public char ReadCharacter()
    {
        if (_index < _content.Length)
        {
            return _content[_index++];
        }

        ExpandBuffer(BufferSize);
        int position = _index++;
        return position < _content.Length ? _content[position] : EndOfFile;
    }

    /// <summary>
    /// Reads up to the given number of characters starting at the current index.
    /// </summary>
    /// <param name="characters">The number of characters requested.</param>
    /// <returns>
    /// The characters read; shorter than requested when the input ends first.
    /// The index is advanced by the requested amount regardless.
    /// </returns>
    public string ReadCharacters(int characters)
    {
        int start = _index;

        if (start + characters > _content.Length)
        {
            ExpandBuffer(Math.Max(BufferSize, characters));
        }

        _index += characters;
        int available = Math.Min(characters, _content.Length - start);
        return _content.ToString(start, available);
    }

    /// <summary>
    /// Asynchronous counterpart of <see cref="ReadCharacter"/>.
    /// </summary>
    /// <param name="cancellationToken">Token used to cancel pending stream reads.</param>
    /// <returns>
    /// The next character, or the end-of-input sentinel when nothing is left.
    /// </returns>
    public async Task<char> ReadCharacterAsync(CancellationToken cancellationToken)
    {
        if (_index < _content.Length)
        {
            return _content[_index++];
        }

        await ExpandBufferAsync(BufferSize, cancellationToken).ConfigureAwait(false);
        int position = _index++;
        return position < _content.Length ? _content[position] : EndOfFile;
    }

    /// <summary>
    /// Asynchronous counterpart of <see cref="ReadCharacters"/>.
    /// </summary>
    /// <param name="characters">The number of characters requested.</param>
    /// <param name="cancellationToken">Token used to cancel pending stream reads.</param>
    /// <returns>
    /// The characters read; shorter than requested when the input ends first.
    /// </returns>
    public async Task<string> ReadCharactersAsync(int characters, CancellationToken cancellationToken)
    {
        int start = _index;

        if (start + characters > _content.Length)
        {
            await ExpandBufferAsync(Math.Max(BufferSize, characters), cancellationToken).ConfigureAwait(false);
        }

        _index += characters;
        int available = Math.Min(characters, _content.Length - start);
        return _content.ToString(start, available);
    }

    /// <summary>
    /// Reads ahead until at least <paramref name="length"/> characters beyond
    /// the current index are decoded, or the input ends.
    /// </summary>
    /// <param name="length">The number of characters to make available.</param>
    /// <param name="cancellationToken">Token used to cancel pending stream reads.</param>
    public Task PrefetchAsync(int length, CancellationToken cancellationToken)
    {
        return ExpandBufferAsync(length, cancellationToken);
    }

    /// <summary>
    /// Reads and decodes the remainder of the input.
    /// </summary>
    /// <param name="cancellationToken">Token used to cancel pending stream reads.</param>
    public async Task PrefetchAllAsync(CancellationToken cancellationToken)
    {
        // A finished source (e.g. one created from an empty string) has no
        // stream to sniff, so the byte order mark check must be skipped.
        if (!_finished && _content.Length == 0)
        {
            await DetectByteOrderMarkAsync(cancellationToken).ConfigureAwait(false);
        }

        while (!_finished)
        {
            await ReadIntoBufferAsync(cancellationToken).ConfigureAwait(false);
        }
    }

    /// <summary>
    /// Inserts text at the current index (or appends it when the index lies
    /// outside the content) and moves the index past the inserted text.
    /// </summary>
    /// <param name="content">The text to insert.</param>
    public void InsertText(string content)
    {
        if (_index >= 0 && _index < _content.Length)
        {
            _content.Insert(_index, content);
        }
        else
        {
            _content.Append(content);
        }

        _index += content.Length;
    }

    /// <summary>
    /// Reads the first chunk asynchronously and checks it for a byte order mark.
    /// </summary>
    /// <param name="cancellationToken">Token used to cancel the stream read.</param>
    private async Task DetectByteOrderMarkAsync(CancellationToken cancellationToken)
    {
        int count = await _baseStream.ReadAsync(_buffer, 0, BufferSize, cancellationToken).ConfigureAwait(false);
        ProcessFirstChunk(count);
    }

    /// <summary>
    /// Reads the first chunk synchronously and checks it for a byte order mark.
    /// </summary>
    private void DetectByteOrderMark()
    {
        int count = _baseStream.Read(_buffer, 0, BufferSize);
        ProcessFirstChunk(count);
    }

    /// <summary>
    /// Looks for a known signature at the start of the buffer. When one is
    /// found, the encoding is switched and marked certain and the signature
    /// bytes are dropped; the (remaining) bytes are then decoded.
    /// </summary>
    /// <param name="count">Number of valid bytes in the buffer.</param>
    private void ProcessFirstChunk(int count)
    {
        int offset = 0;

        // The UTF-32 LE signature starts with the UTF-16 LE one, so the
        // four-byte signatures have to be tested before the two-byte ones.
        if (BufferStartsWith(count, 0xEF, 0xBB, 0xBF))
        {
            _encoding = TextEncoding.Utf8;
            offset = 3;
        }
        else if (BufferStartsWith(count, 0xFF, 0xFE, 0x00, 0x00))
        {
            _encoding = TextEncoding.Utf32Le;
            offset = 4;
        }
        else if (BufferStartsWith(count, 0x00, 0x00, 0xFE, 0xFF))
        {
            _encoding = TextEncoding.Utf32Be;
            offset = 4;
        }
        else if (BufferStartsWith(count, 0xFE, 0xFF))
        {
            _encoding = TextEncoding.Utf16Be;
            offset = 2;
        }
        else if (BufferStartsWith(count, 0xFF, 0xFE))
        {
            _encoding = TextEncoding.Utf16Le;
            offset = 2;
        }
        else if (BufferStartsWith(count, 0x84, 0x31, 0x95, 0x33))
        {
            _encoding = TextEncoding.Gb18030;
            offset = 4;
        }

        if (offset > 0)
        {
            // Shift the payload to the front so the signature is not decoded.
            count -= offset;
            Array.Copy(_buffer, offset, _buffer, 0, count);
            _decoder = _encoding.GetDecoder();
            _confidence = EncodingConfidence.Certain;
        }

        AppendContentFromBuffer(count);
    }

    /// <summary>
    /// Checks whether the first bytes of the buffer equal the given signature.
    /// </summary>
    /// <param name="count">Number of valid bytes in the buffer.</param>
    /// <param name="signature">The bytes to look for.</param>
    /// <returns>True if the buffer holds enough bytes and they all match.</returns>
    private bool BufferStartsWith(int count, params byte[] signature)
    {
        if (count < signature.Length)
        {
            return false;
        }

        for (int i = 0; i < signature.Length; i++)
        {
            if (_buffer[i] != signature[i])
            {
                return false;
            }
        }

        return true;
    }

    /// <summary>
    /// Reads asynchronously until <paramref name="size"/> characters beyond the
    /// current index are available or the input ends.
    /// </summary>
    /// <param name="size">The number of characters required past the index.</param>
    /// <param name="cancellationToken">Token used to cancel pending stream reads.</param>
    private async Task ExpandBufferAsync(long size, CancellationToken cancellationToken)
    {
        if (!_finished && _content.Length == 0)
        {
            await DetectByteOrderMarkAsync(cancellationToken).ConfigureAwait(false);
        }

        while (size + _index > _content.Length && !_finished)
        {
            await ReadIntoBufferAsync(cancellationToken).ConfigureAwait(false);
        }
    }

    /// <summary>
    /// Reads one chunk from the base stream asynchronously and decodes it.
    /// </summary>
    /// <param name="cancellationToken">Token used to cancel the stream read.</param>
    private async Task ReadIntoBufferAsync(CancellationToken cancellationToken)
    {
        int size = await _baseStream.ReadAsync(_buffer, 0, BufferSize, cancellationToken).ConfigureAwait(false);
        AppendContentFromBuffer(size);
    }

    /// <summary>
    /// Reads synchronously until <paramref name="size"/> characters beyond the
    /// current index are available or the input ends.
    /// </summary>
    /// <param name="size">The number of characters required past the index.</param>
    private void ExpandBuffer(long size)
    {
        // Uses a synchronous read instead of blocking on the asynchronous
        // detector, so I/O errors surface unwrapped and no thread is blocked
        // on a task.
        if (!_finished && _content.Length == 0)
        {
            DetectByteOrderMark();
        }

        while (size + _index > _content.Length && !_finished)
        {
            ReadIntoBuffer();
        }
    }

    /// <summary>
    /// Reads one chunk from the base stream synchronously and decodes it.
    /// </summary>
    private void ReadIntoBuffer()
    {
        int size = _baseStream.Read(_buffer, 0, BufferSize);
        AppendContentFromBuffer(size);
    }

    /// <summary>
    /// Decodes the first <paramref name="size"/> bytes of the buffer and
    /// appends the result to the content. A size of zero marks the source as
    /// finished.
    /// </summary>
    /// <param name="size">Number of valid bytes in the buffer.</param>
    private void AppendContentFromBuffer(int size)
    {
        _finished = size == 0;
        int charCount = _decoder.GetChars(_buffer, 0, size, _chars, 0);

        // Keep the raw bytes only while a later encoding switch is possible.
        if (_confidence != EncodingConfidence.Certain)
        {
            _raw.Write(_buffer, 0, size);
        }

        _content.Append(_chars, 0, charCount);
    }
}
}