Merge pull request #14531 from heejaechang/sourcetextreader
Reduce string allocation from TextFactory
heejaechang committed Oct 17, 2016
2 parents b7d2e49 + f73dd86 commit 779bc9a
Showing 10 changed files with 284 additions and 102 deletions.
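The headline addition is a new SourceText.From(TextReader reader, int length, ...) overload (see PublicAPI.Unshipped.txt below), which lets a caller that holds text in something other than a System.String, for example a StringBuilder inside a text factory, produce a SourceText without first allocating the full string. A minimal sketch of that usage follows; the StringBuilderReader helper is hypothetical and not part of this commit.

using System;
using System.IO;
using System.Text;
using Microsoft.CodeAnalysis.Text;

// Hypothetical helper, not part of this commit: a TextReader over a
// StringBuilder, so the builder's contents never have to be copied
// into a single large string.
internal sealed class StringBuilderReader : TextReader
{
    private readonly StringBuilder _builder;
    private int _position;

    public StringBuilderReader(StringBuilder builder)
    {
        _builder = builder;
    }

    public override int Peek()
    {
        return _position < _builder.Length ? _builder[_position] : -1;
    }

    public override int Read()
    {
        return _position < _builder.Length ? _builder[_position++] : -1;
    }

    public override int Read(char[] buffer, int index, int count)
    {
        int charsToCopy = Math.Min(count, _builder.Length - _position);
        _builder.CopyTo(_position, buffer, index, charsToCopy);
        _position += charsToCopy;
        return charsToCopy;
    }
}

internal static class SourceTextFromReaderSketch
{
    internal static SourceText Create(StringBuilder builder, Encoding encoding)
    {
        // The new overload takes the character count up front; for large
        // inputs the text is decoded into LargeText chunks rather than
        // one string on the large object heap (see the tests below).
        return SourceText.From(new StringBuilderReader(builder), builder.Length, encoding);
    }
}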
32 changes: 30 additions & 2 deletions src/Compilers/Core/CodeAnalysisTest/EmbeddedTextTests.cs
@@ -10,8 +10,6 @@
using System.IO.Compression;
using Roslyn.Test.Utilities;
using System.Linq;
using System.Collections.Immutable;
using System.Reflection;

namespace Microsoft.CodeAnalysis.UnitTests
{
@@ -194,6 +192,36 @@ public void FromSource_Large()
AssertEx.Equal(Encoding.Unicode.GetPreamble().Concat(Encoding.Unicode.GetBytes(LargeSource)), Decompress(text.Blob.Skip(4)));
}

[Fact]
public void FromTextReader_Small()
{
var expected = SourceText.From(SmallSource, Encoding.UTF8, SourceHashAlgorithm.Sha1);
var expectedEmbeded = EmbeddedText.FromSource("pathToSmall", expected);

var actual = SourceText.From(new StringReader(SmallSource), SmallSource.Length, Encoding.UTF8, SourceHashAlgorithm.Sha1);
var actualEmbeded = EmbeddedText.FromSource(expectedEmbeded.FilePath, actual);

Assert.Equal(expectedEmbeded.FilePath, actualEmbeded.FilePath);
Assert.Equal(expectedEmbeded.ChecksumAlgorithm, actualEmbeded.ChecksumAlgorithm);
AssertEx.Equal(expectedEmbeded.Checksum, actualEmbeded.Checksum);
AssertEx.Equal(expectedEmbeded.Blob, actualEmbeded.Blob);
}

[Fact]
public void FromTextReader_Large()
{
var expected = SourceText.From(LargeSource, Encoding.UTF8, SourceHashAlgorithm.Sha1);
var expectedEmbeded = EmbeddedText.FromSource("pathToSmall", expected);

var actual = SourceText.From(new StringReader(LargeSource), LargeSource.Length, Encoding.UTF8, SourceHashAlgorithm.Sha1);
var actualEmbeded = EmbeddedText.FromSource(expectedEmbeded.FilePath, actual);

Assert.Equal(expectedEmbeded.FilePath, actualEmbeded.FilePath);
Assert.Equal(expectedEmbeded.ChecksumAlgorithm, actualEmbeded.ChecksumAlgorithm);
AssertEx.Equal(expectedEmbeded.Checksum, actualEmbeded.Checksum);
AssertEx.Equal(expectedEmbeded.Blob, actualEmbeded.Blob);
}

[Fact]
public void FromSource_Precomputed()
{
18 changes: 18 additions & 0 deletions src/Compilers/Core/CodeAnalysisTest/Text/LargeTextTests.cs
@@ -29,6 +29,11 @@ private static SourceText CreateSourceText(Stream stream, Encoding encoding = nu
return LargeText.Decode(stream, encoding ?? Encoding.UTF8, SourceHashAlgorithm.Sha1, throwIfBinaryDetected: true, canBeEmbedded: false);
}

private static SourceText CreateSourceText(TextReader reader, int length, Encoding encoding = null)
{
return LargeText.Decode(reader, length, encoding ?? Encoding.UTF8, SourceHashAlgorithm.Sha1);
}

private const string HelloWorld = "Hello, world!";

[Fact]
@@ -313,5 +318,18 @@ public void LinesGetText2()
var data = CreateSourceText(text);
Assert.Equal("foo", data.Lines[0].ToString());
}

[Fact]
public void FromTextReader()
{
var expected = "foo";
var expectedSourceText = CreateSourceText(expected);

var actual = new StringReader(expected);
var actualSourceText = CreateSourceText(actual, expected.Length);

Assert.Equal("foo", actualSourceText.Lines[0].ToString());
Assert.Equal<byte>(expectedSourceText.GetChecksum(), actualSourceText.GetChecksum());
}
}
}
35 changes: 35 additions & 0 deletions src/Compilers/Core/CodeAnalysisTest/Text/SourceTextTests.cs
@@ -229,6 +229,41 @@ public void FromThrowsIfBinary()
Assert.Throws<InvalidDataException>(() => SourceText.From(stream, throwIfBinaryDetected: true));
}

[Fact]
public void FromTextReader()
{
var expected = "Text reader source text test";
var expectedSourceText = SourceText.From(expected);

var actual = new StringReader(expected);
var actualSourceText = SourceText.From(actual, expected.Length);

Assert.Equal<byte>(expectedSourceText.GetChecksum(), actualSourceText.GetChecksum());

Assert.Same(s_utf8, SourceText.From(actual, expected.Length, s_utf8).Encoding);
Assert.Same(s_unicode, SourceText.From(actual, expected.Length, s_unicode).Encoding);
Assert.Null(SourceText.From(actual, expected.Length, null).Encoding);
}

[Fact]
public void FromTextReader_Large()
{
var expected = new string('l', SourceText.LargeObjectHeapLimitInChars);
var expectedSourceText = SourceText.From(expected);

var actual = new StringReader(expected);
var actualSourceText = SourceText.From(actual, expected.Length);

Assert.IsType<LargeText>(actualSourceText);
Assert.Equal<byte>(expectedSourceText.GetChecksum(), actualSourceText.GetChecksum());

var utf8NoBOM = new UTF8Encoding(encoderShouldEmitUTF8Identifier: false);

Assert.Same(s_utf8, SourceText.From(actual, expected.Length, s_utf8).Encoding);
Assert.Same(s_unicode, SourceText.From(actual, expected.Length, s_unicode).Encoding);
Assert.Null(SourceText.From(actual, expected.Length, null).Encoding);
}

private static void TestTryReadByteOrderMark(Encoding expectedEncoding, int expectedPreambleLength, byte[] data)
{
TestTryReadByteOrderMark(expectedEncoding, expectedPreambleLength, data, data == null ? 0 : data.Length);
1 change: 1 addition & 0 deletions src/Compilers/Core/Portable/GlobalSuppressions.cs
@@ -11,5 +11,6 @@
// required parameter type is different for all overloads and so there is no ambiguity.
[assembly: System.Diagnostics.CodeAnalysis.SuppressMessage("ApiDesign", "RS0026:Do not add multiple public overloads with optional parameters", Justification = "<Pending>", Scope = "member", Target = "~M:Microsoft.CodeAnalysis.Text.SourceText.From(System.IO.Stream,System.Text.Encoding,Microsoft.CodeAnalysis.Text.SourceHashAlgorithm,System.Boolean,System.Boolean)~Microsoft.CodeAnalysis.Text.SourceText")]
[assembly: System.Diagnostics.CodeAnalysis.SuppressMessage("ApiDesign", "RS0026:Do not add multiple public overloads with optional parameters", Justification = "<Pending>", Scope = "member", Target = "~M:Microsoft.CodeAnalysis.Text.SourceText.From(System.Byte[],System.Int32,System.Text.Encoding,Microsoft.CodeAnalysis.Text.SourceHashAlgorithm,System.Boolean,System.Boolean)~Microsoft.CodeAnalysis.Text.SourceText")]
[assembly: System.Diagnostics.CodeAnalysis.SuppressMessage("ApiDesign", "RS0026:Do not add multiple public overloads with optional parameters", Justification = "<Pending>", Scope = "member", Target = "~M:Microsoft.CodeAnalysis.Text.SourceText.From(System.IO.TextReader,System.Int32,System.Text.Encoding,Microsoft.CodeAnalysis.Text.SourceHashAlgorithm)~Microsoft.CodeAnalysis.Text.SourceText")]
[assembly: System.Diagnostics.CodeAnalysis.SuppressMessage("ApiDesign", "RS0027:Public API with optional parameter(s) should have the most parameters amongst its public overloads.", Justification = "<Pending>", Scope = "member", Target = "~M:Microsoft.CodeAnalysis.Text.SourceText.From(System.String,System.Text.Encoding,Microsoft.CodeAnalysis.Text.SourceHashAlgorithm)~Microsoft.CodeAnalysis.Text.SourceText")]

@@ -19,11 +19,29 @@ internal static int GetMaxCharCountOrThrowIfHuge(this Encoding encoding, Stream
Debug.Assert(stream.CanSeek);
long length = stream.Length;

int maxCharCount;
if (encoding.TryGetMaxCharCount(length, out maxCharCount))
{
return maxCharCount;
}

#if WORKSPACE_DESKTOP
throw new IOException(WorkspacesResources.Stream_is_too_long);
#else
throw new IOException(CodeAnalysisResources.StreamIsTooLong);
#endif
}

internal static bool TryGetMaxCharCount(this Encoding encoding, long length, out int maxCharCount)
{
maxCharCount = 0;

if (length <= int.MaxValue)
{
try
{
return encoding.GetMaxCharCount((int)length);
maxCharCount = encoding.GetMaxCharCount((int)length);
return true;
}
catch (ArgumentOutOfRangeException)
{
@@ -33,11 +51,7 @@ internal static int GetMaxCharCountOrThrowIfHuge(this Encoding encoding, Stream
}
}

#if WORKSPACE_DESKTOP
throw new IOException(WorkspacesResources.Stream_is_too_long);
#else
throw new IOException(CodeAnalysisResources.StreamIsTooLong);
#endif
return false;
}
}
}
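The extension above is now split so that the size check can be probed without catching an exception. A minimal sketch of a caller follows; it assumes it compiles in the same assembly (the extension is internal), and the method name and buffer-sizing policy are illustrative rather than taken from this commit.

using System;
using System.IO;
using System.Text;

internal static class TryGetMaxCharCountSketch
{
    // Illustrative caller: given the byte length of an input, ask whether
    // the decoded character count fits in an int, then size a decode buffer
    // from the bound; otherwise fail the same way the stream path does.
    internal static char[] AllocateDecodeBuffer(Encoding encoding, long byteLength)
    {
        int maxCharCount;
        if (!encoding.TryGetMaxCharCount(byteLength, out maxCharCount))
        {
            throw new IOException("Input is too long to decode into a single text.");
        }

        return new char[Math.Min(maxCharCount, 4096)];
    }
}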
1 change: 1 addition & 0 deletions src/Compilers/Core/Portable/PublicAPI.Unshipped.txt
@@ -815,6 +815,7 @@ static Microsoft.CodeAnalysis.SeparatedSyntaxList<TNode>.implicit operator Micro
static Microsoft.CodeAnalysis.SeparatedSyntaxList<TNode>.implicit operator Microsoft.CodeAnalysis.SeparatedSyntaxList<TNode>(Microsoft.CodeAnalysis.SeparatedSyntaxList<Microsoft.CodeAnalysis.SyntaxNode> nodes) -> Microsoft.CodeAnalysis.SeparatedSyntaxList<TNode>
static Microsoft.CodeAnalysis.Text.SourceText.From(System.IO.Stream stream, System.Text.Encoding encoding = null, Microsoft.CodeAnalysis.Text.SourceHashAlgorithm checksumAlgorithm = Microsoft.CodeAnalysis.Text.SourceHashAlgorithm.Sha1, bool throwIfBinaryDetected = false, bool canBeEmbedded = false) -> Microsoft.CodeAnalysis.Text.SourceText
static Microsoft.CodeAnalysis.Text.SourceText.From(System.IO.Stream stream, System.Text.Encoding encoding, Microsoft.CodeAnalysis.Text.SourceHashAlgorithm checksumAlgorithm, bool throwIfBinaryDetected) -> Microsoft.CodeAnalysis.Text.SourceText
static Microsoft.CodeAnalysis.Text.SourceText.From(System.IO.TextReader reader, int length, System.Text.Encoding encoding = null, Microsoft.CodeAnalysis.Text.SourceHashAlgorithm checksumAlgorithm = Microsoft.CodeAnalysis.Text.SourceHashAlgorithm.Sha1) -> Microsoft.CodeAnalysis.Text.SourceText
static Microsoft.CodeAnalysis.Text.SourceText.From(byte[] buffer, int length, System.Text.Encoding encoding = null, Microsoft.CodeAnalysis.Text.SourceHashAlgorithm checksumAlgorithm = Microsoft.CodeAnalysis.Text.SourceHashAlgorithm.Sha1, bool throwIfBinaryDetected = false, bool canBeEmbedded = false) -> Microsoft.CodeAnalysis.Text.SourceText
static Microsoft.CodeAnalysis.Text.SourceText.From(byte[] buffer, int length, System.Text.Encoding encoding, Microsoft.CodeAnalysis.Text.SourceHashAlgorithm checksumAlgorithm, bool throwIfBinaryDetected) -> Microsoft.CodeAnalysis.Text.SourceText
virtual Microsoft.CodeAnalysis.Diagnostics.AnalysisContext.RegisterOperationAction(System.Action<Microsoft.CodeAnalysis.Diagnostics.OperationAnalysisContext> action, System.Collections.Immutable.ImmutableArray<Microsoft.CodeAnalysis.OperationKind> operationKinds) -> void
103 changes: 65 additions & 38 deletions src/Compilers/Core/Portable/Text/LargeText.cs
@@ -24,14 +24,15 @@ internal sealed class LargeText : SourceText
private readonly ImmutableArray<char[]> _chunks;
private readonly int[] _chunkStartOffsets;
private readonly int _length;
private readonly Encoding _encoding;
private readonly Encoding _encodingOpt;

internal LargeText(ImmutableArray<char[]> chunks, Encoding encoding, ImmutableArray<byte> checksum, SourceHashAlgorithm checksumAlgorithm, ImmutableArray<byte> embeddedTextBlob)
internal LargeText(ImmutableArray<char[]> chunks, Encoding encodingOpt, ImmutableArray<byte> checksum, SourceHashAlgorithm checksumAlgorithm, ImmutableArray<byte> embeddedTextBlob)
: base(checksum, checksumAlgorithm, embeddedTextBlob)
{
_chunks = chunks;
_encoding = encoding;
_encodingOpt = encodingOpt;
_chunkStartOffsets = new int[chunks.Length];

int offset = 0;
for (int i = 0; i < chunks.Length; i++)
{
@@ -42,6 +43,11 @@ internal LargeText(ImmutableArray<char[]> chunks, Encoding encoding, ImmutableAr
_length = offset;
}

internal LargeText(ImmutableArray<char[]> chunks, Encoding encodingOpt, SourceHashAlgorithm checksumAlgorithm)
: this(chunks, encodingOpt, default(ImmutableArray<byte>), checksumAlgorithm, default(ImmutableArray<byte>))
{
}

internal static SourceText Decode(Stream stream, Encoding encoding, SourceHashAlgorithm checksumAlgorithm, bool throwIfBinaryDetected, bool canBeEmbedded)
{
stream.Seek(0, SeekOrigin.Begin);
@@ -58,48 +64,69 @@ internal static SourceText Decode(Stream stream, Encoding encoding, SourceHashAl

using (var reader = new StreamReader(stream, encoding, detectEncodingFromByteOrderMarks: true, bufferSize: Math.Min(length, 4096), leaveOpen: true))
{
ArrayBuilder<char[]> chunks = ArrayBuilder<char[]>.GetInstance(1 + maxCharRemainingGuess / ChunkSize);
while (!reader.EndOfStream)
{
var nextChunkSize = ChunkSize;
if (maxCharRemainingGuess < ChunkSize)
{
// maxCharRemainingGuess typically overestimates a little
// so we will first fill a slightly smaller (maxCharRemainingGuess - 64) chunk
// and then use a 64 char tail, which is likely to be resized.
nextChunkSize = Math.Max(maxCharRemainingGuess - 64, 64);
}
var chunks = ReadChunksFromTextReader(reader, maxCharRemainingGuess, throwIfBinaryDetected);

char[] chunk = new char[nextChunkSize];
// We must compute the checksum and embedded text blob now while we still have the original bytes in hand.
// We cannot re-encode to obtain checksum and blob as the encoding is not guaranteed to round-trip.
var checksum = CalculateChecksum(stream, checksumAlgorithm);
var embeddedTextBlob = canBeEmbedded ? EmbeddedText.CreateBlob(stream) : default(ImmutableArray<byte>);
return new LargeText(chunks, reader.CurrentEncoding, checksum, checksumAlgorithm, embeddedTextBlob);
}
}

int charsRead = reader.ReadBlock(chunk, 0, chunk.Length);
if (charsRead == 0)
{
break;
}
internal static SourceText Decode(TextReader reader, int length, Encoding encodingOpt, SourceHashAlgorithm checksumAlgorithm)
{
if (length == 0)
{
return SourceText.From(string.Empty, encodingOpt, checksumAlgorithm);
}

maxCharRemainingGuess -= charsRead;
// throwIfBinaryDetected == false since we are given a text reader from the beginning
var chunks = ReadChunksFromTextReader(reader, length, throwIfBinaryDetected: false);

if (charsRead < chunk.Length)
{
Array.Resize(ref chunk, charsRead);
}
return new LargeText(chunks, encodingOpt, checksumAlgorithm);
}

// Check for binary files
if (throwIfBinaryDetected && IsBinary(chunk))
{
throw new InvalidDataException();
}
private static ImmutableArray<char[]> ReadChunksFromTextReader(TextReader reader, int maxCharRemainingGuess, bool throwIfBinaryDetected)
{
var chunks = ArrayBuilder<char[]>.GetInstance(1 + maxCharRemainingGuess / ChunkSize);

chunks.Add(chunk);
while (reader.Peek() != -1)
{
var nextChunkSize = ChunkSize;
if (maxCharRemainingGuess < ChunkSize)
{
// maxCharRemainingGuess typically overestimates a little
// so we will first fill a slightly smaller (maxCharRemainingGuess - 64) chunk
// and then use a 64 char tail, which is likely to be resized.
nextChunkSize = Math.Max(maxCharRemainingGuess - 64, 64);
}

// We must compute the checksum and embedded text blob now while we still have the original bytes in hand.
// We cannot re-encode to obtain checksum and blob as the encoding is not guaranteed to round-trip.
var checksum = CalculateChecksum(stream, checksumAlgorithm);
var embeddedTextBlob = canBeEmbedded ? EmbeddedText.CreateBlob(stream) : default(ImmutableArray<byte>);
return new LargeText(chunks.ToImmutableAndFree(), reader.CurrentEncoding, checksum, checksumAlgorithm, embeddedTextBlob);
char[] chunk = new char[nextChunkSize];

int charsRead = reader.ReadBlock(chunk, 0, chunk.Length);
if (charsRead == 0)
{
break;
}

maxCharRemainingGuess -= charsRead;

if (charsRead < chunk.Length)
{
Array.Resize(ref chunk, charsRead);
}

// Check for binary files
if (throwIfBinaryDetected && IsBinary(chunk))
{
throw new InvalidDataException();
}

chunks.Add(chunk);
}

return chunks.ToImmutableAndFree();
}

/// <summary>
Expand Down Expand Up @@ -146,7 +173,7 @@ public override char this[int position]
}
}

public override Encoding Encoding => _encoding;
public override Encoding Encoding => _encodingOpt;

public override int Length => _length;

@@ -272,7 +299,7 @@ private int[] ParseLineStarts()
case '\u0085':
case '\u2028':
case '\u2029':
line_break:
line_break:
arrayBuilder.Add(position);
position = index;
break;
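For readers unfamiliar with the chunked layout: the constructor shown above builds a _chunkStartOffsets array recording where each chunk begins in the overall text, which makes position lookups a simple search. The sketch below shows one plausible way such offsets can serve the this[int position] indexer; it is illustrative only and is not the lookup code from LargeText.cs.

using System;

internal static class ChunkLookupSketch
{
    // Map an absolute character position to (chunk, offset within chunk)
    // using the sorted chunk start offsets, then read the character.
    internal static char CharAt(char[][] chunks, int[] chunkStartOffsets, int position)
    {
        int index = Array.BinarySearch(chunkStartOffsets, position);
        if (index < 0)
        {
            // Not an exact chunk boundary: BinarySearch returns the complement
            // of the insertion point, so the containing chunk is one earlier.
            index = ~index - 1;
        }

        return chunks[index][position - chunkStartOffsets[index]];
    }
}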
(The remaining changed files are not shown here.)
