
Reduce string allocation from TextFactory #14531

Merged: 4 commits, Oct 17, 2016
32 changes: 30 additions & 2 deletions src/Compilers/Core/CodeAnalysisTest/EmbeddedTextTests.cs
@@ -10,8 +10,6 @@
using System.IO.Compression;
using Roslyn.Test.Utilities;
using System.Linq;
using System.Collections.Immutable;
using System.Reflection;

namespace Microsoft.CodeAnalysis.UnitTests
{
@@ -194,6 +192,36 @@ public void FromSource_Large()
AssertEx.Equal(Encoding.Unicode.GetPreamble().Concat(Encoding.Unicode.GetBytes(LargeSource)), Decompress(text.Blob.Skip(4)));
}

[Fact]
public void FromTextReader_Small()
{
var expected = SourceText.From(SmallSource, Encoding.UTF8, SourceHashAlgorithm.Sha1);
var expectedEmbedded = EmbeddedText.FromSource("pathToSmall", expected);

var actual = SourceText.From(new StringReader(SmallSource), SmallSource.Length, Encoding.UTF8, SourceHashAlgorithm.Sha1);
var actualEmbedded = EmbeddedText.FromSource(expectedEmbedded.FilePath, actual);

Assert.Equal(expectedEmbedded.FilePath, actualEmbedded.FilePath);
Assert.Equal(expectedEmbedded.ChecksumAlgorithm, actualEmbedded.ChecksumAlgorithm);
AssertEx.Equal(expectedEmbedded.Checksum, actualEmbedded.Checksum);
AssertEx.Equal(expectedEmbedded.Blob, actualEmbedded.Blob);
}

[Fact]
public void FromTextReader_Large()
{
var expected = SourceText.From(LargeSource, Encoding.UTF8, SourceHashAlgorithm.Sha1);
var expectedEmbedded = EmbeddedText.FromSource("pathToLarge", expected);

var actual = SourceText.From(new StringReader(LargeSource), LargeSource.Length, Encoding.UTF8, SourceHashAlgorithm.Sha1);
var actualEmbedded = EmbeddedText.FromSource(expectedEmbedded.FilePath, actual);

Assert.Equal(expectedEmbedded.FilePath, actualEmbedded.FilePath);
Assert.Equal(expectedEmbedded.ChecksumAlgorithm, actualEmbedded.ChecksumAlgorithm);
AssertEx.Equal(expectedEmbedded.Checksum, actualEmbedded.Checksum);
AssertEx.Equal(expectedEmbedded.Blob, actualEmbedded.Blob);
}

[Fact]
public void FromSource_Precomputed()
{
18 changes: 18 additions & 0 deletions src/Compilers/Core/CodeAnalysisTest/Text/LargeTextTests.cs
@@ -29,6 +29,11 @@ private static SourceText CreateSourceText(Stream stream, Encoding encoding = nu
return LargeText.Decode(stream, encoding ?? Encoding.UTF8, SourceHashAlgorithm.Sha1, throwIfBinaryDetected: true, canBeEmbedded: false);
}

private static SourceText CreateSourceText(TextReader reader, int length, Encoding encoding = null)
{
return LargeText.Decode(reader, length, encoding ?? Encoding.UTF8, SourceHashAlgorithm.Sha1);
}

private const string HelloWorld = "Hello, world!";

[Fact]
@@ -313,5 +318,18 @@ public void LinesGetText2()
var data = CreateSourceText(text);
Assert.Equal("foo", data.Lines[0].ToString());
}

[Fact]
public void FromTextReader()
{
var expected = "foo";
var expectedSourceText = CreateSourceText(expected);

var actual = new StringReader(expected);
var actualSourceText = CreateSourceText(actual, expected.Length);

Assert.Equal("foo", actualSourceText.Lines[0].ToString());
Assert.Equal<byte>(expectedSourceText.GetChecksum(), actualSourceText.GetChecksum());
}
}
}
35 changes: 35 additions & 0 deletions src/Compilers/Core/CodeAnalysisTest/Text/SourceTextTests.cs
@@ -229,6 +229,41 @@ public void FromThrowsIfBinary()
Assert.Throws<InvalidDataException>(() => SourceText.From(stream, throwIfBinaryDetected: true));
}

[Fact]
public void FromTextReader()
{
var expected = "Text reader source text test";
var expectedSourceText = SourceText.From(expected);

var actual = new StringReader(expected);
var actualSourceText = SourceText.From(actual, expected.Length);

Assert.Equal<byte>(expectedSourceText.GetChecksum(), actualSourceText.GetChecksum());

Assert.Same(s_utf8, SourceText.From(actual, expected.Length, s_utf8).Encoding);
Assert.Same(s_unicode, SourceText.From(actual, expected.Length, s_unicode).Encoding);
Assert.Null(SourceText.From(actual, expected.Length, null).Encoding);
}

[Fact]
public void FromTextReader_Large()
{
var expected = new string('l', SourceText.LargeObjectHeapLimitInChars);
var expectedSourceText = SourceText.From(expected);

var actual = new StringReader(expected);
var actualSourceText = SourceText.From(actual, expected.Length);

Assert.IsType<LargeText>(actualSourceText);
Assert.Equal<byte>(expectedSourceText.GetChecksum(), actualSourceText.GetChecksum());

var utf8NoBOM = new UTF8Encoding(encoderShouldEmitUTF8Identifier: false);

Assert.Same(s_utf8, SourceText.From(actual, expected.Length, s_utf8).Encoding);
Assert.Same(s_unicode, SourceText.From(actual, expected.Length, s_unicode).Encoding);
Assert.Null(SourceText.From(actual, expected.Length, null).Encoding);
}

private static void TestTryReadByteOrderMark(Encoding expectedEncoding, int expectedPreambleLength, byte[] data)
{
TestTryReadByteOrderMark(expectedEncoding, expectedPreambleLength, data, data == null ? 0 : data.Length);
1 change: 1 addition & 0 deletions src/Compilers/Core/Portable/GlobalSuppressions.cs
@@ -11,5 +11,6 @@
// required parameter type is different for all overloads and so there is no ambiguity.
[assembly: System.Diagnostics.CodeAnalysis.SuppressMessage("ApiDesign", "RS0026:Do not add multiple public overloads with optional parameters", Justification = "<Pending>", Scope = "member", Target = "~M:Microsoft.CodeAnalysis.Text.SourceText.From(System.IO.Stream,System.Text.Encoding,Microsoft.CodeAnalysis.Text.SourceHashAlgorithm,System.Boolean,System.Boolean)~Microsoft.CodeAnalysis.Text.SourceText")]
[assembly: System.Diagnostics.CodeAnalysis.SuppressMessage("ApiDesign", "RS0026:Do not add multiple public overloads with optional parameters", Justification = "<Pending>", Scope = "member", Target = "~M:Microsoft.CodeAnalysis.Text.SourceText.From(System.Byte[],System.Int32,System.Text.Encoding,Microsoft.CodeAnalysis.Text.SourceHashAlgorithm,System.Boolean,System.Boolean)~Microsoft.CodeAnalysis.Text.SourceText")]
[assembly: System.Diagnostics.CodeAnalysis.SuppressMessage("ApiDesign", "RS0026:Do not add multiple public overloads with optional parameters", Justification = "<Pending>", Scope = "member", Target = "~M:Microsoft.CodeAnalysis.Text.SourceText.From(System.IO.TextReader,System.Int32,System.Text.Encoding,Microsoft.CodeAnalysis.Text.SourceHashAlgorithm)~Microsoft.CodeAnalysis.Text.SourceText")]
[assembly: System.Diagnostics.CodeAnalysis.SuppressMessage("ApiDesign", "RS0027:Public API with optional parameter(s) should have the most parameters amongst its public overloads.", Justification = "<Pending>", Scope = "member", Target = "~M:Microsoft.CodeAnalysis.Text.SourceText.From(System.String,System.Text.Encoding,Microsoft.CodeAnalysis.Text.SourceHashAlgorithm)~Microsoft.CodeAnalysis.Text.SourceText")]

@@ -19,11 +19,29 @@ internal static int GetMaxCharCountOrThrowIfHuge(this Encoding encoding, Stream
Debug.Assert(stream.CanSeek);
long length = stream.Length;

int maxCharCount;
if (encoding.TryGetMaxCharCount(length, out maxCharCount))
{
return maxCharCount;
}

#if WORKSPACE_DESKTOP
throw new IOException(WorkspacesResources.Stream_is_too_long);
#else
throw new IOException(CodeAnalysisResources.StreamIsTooLong);
#endif
}

internal static bool TryGetMaxCharCount(this Encoding encoding, long length, out int maxCharCount)
{
maxCharCount = 0;

if (length <= int.MaxValue)
{
try
{
return encoding.GetMaxCharCount((int)length);
maxCharCount = encoding.GetMaxCharCount((int)length);
return true;
}
catch (ArgumentOutOfRangeException)
{
@@ -33,11 +51,7 @@ internal static int GetMaxCharCountOrThrowIfHuge(this Encoding encoding, Stream
}
}

#if WORKSPACE_DESKTOP
throw new IOException(WorkspacesResources.Stream_is_too_long);
#else
throw new IOException(CodeAnalysisResources.StreamIsTooLong);
#endif
return false;
}
}
}
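The refactoring above splits the old throw-on-overflow helper into a non-throwing TryGetMaxCharCount plus a thin wrapper that still throws for streams. As a rough illustration only (a hypothetical caller in the same assembly, not code from this PR), the Try-pattern lets a caller pick a decoding strategy without using exceptions for control flow:

// Hypothetical caller (illustrative only): choose between a single decode buffer
// and chunked decoding, based on whether the worst-case char count fits in an int.
static char[] AllocateDecodeBuffer(Encoding encoding, long byteLength)
{
    int maxCharCount;
    if (encoding.TryGetMaxCharCount(byteLength, out maxCharCount))
    {
        // Worst case fits; one buffer is enough.
        return new char[maxCharCount];
    }

    // Too large for a single allocation; fall back to chunked reading.
    return new char[4096];
}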
1 change: 1 addition & 0 deletions src/Compilers/Core/Portable/PublicAPI.Unshipped.txt
@@ -813,6 +813,7 @@ static Microsoft.CodeAnalysis.SeparatedSyntaxList<TNode>.implicit operator Micro
static Microsoft.CodeAnalysis.SeparatedSyntaxList<TNode>.implicit operator Microsoft.CodeAnalysis.SeparatedSyntaxList<TNode>(Microsoft.CodeAnalysis.SeparatedSyntaxList<Microsoft.CodeAnalysis.SyntaxNode> nodes) -> Microsoft.CodeAnalysis.SeparatedSyntaxList<TNode>
static Microsoft.CodeAnalysis.Text.SourceText.From(System.IO.Stream stream, System.Text.Encoding encoding = null, Microsoft.CodeAnalysis.Text.SourceHashAlgorithm checksumAlgorithm = Microsoft.CodeAnalysis.Text.SourceHashAlgorithm.Sha1, bool throwIfBinaryDetected = false, bool canBeEmbedded = false) -> Microsoft.CodeAnalysis.Text.SourceText
static Microsoft.CodeAnalysis.Text.SourceText.From(System.IO.Stream stream, System.Text.Encoding encoding, Microsoft.CodeAnalysis.Text.SourceHashAlgorithm checksumAlgorithm, bool throwIfBinaryDetected) -> Microsoft.CodeAnalysis.Text.SourceText
static Microsoft.CodeAnalysis.Text.SourceText.From(System.IO.TextReader reader, int length, System.Text.Encoding encoding = null, Microsoft.CodeAnalysis.Text.SourceHashAlgorithm checksumAlgorithm = Microsoft.CodeAnalysis.Text.SourceHashAlgorithm.Sha1) -> Microsoft.CodeAnalysis.Text.SourceText

Review comment (Contributor):
CC @AnthonyDGreen, @jaredpar, @gafter For new public API.

static Microsoft.CodeAnalysis.Text.SourceText.From(byte[] buffer, int length, System.Text.Encoding encoding = null, Microsoft.CodeAnalysis.Text.SourceHashAlgorithm checksumAlgorithm = Microsoft.CodeAnalysis.Text.SourceHashAlgorithm.Sha1, bool throwIfBinaryDetected = false, bool canBeEmbedded = false) -> Microsoft.CodeAnalysis.Text.SourceText
static Microsoft.CodeAnalysis.Text.SourceText.From(byte[] buffer, int length, System.Text.Encoding encoding, Microsoft.CodeAnalysis.Text.SourceHashAlgorithm checksumAlgorithm, bool throwIfBinaryDetected) -> Microsoft.CodeAnalysis.Text.SourceText
virtual Microsoft.CodeAnalysis.Diagnostics.AnalysisContext.RegisterOperationAction(System.Action<Microsoft.CodeAnalysis.Diagnostics.OperationAnalysisContext> action, System.Collections.Immutable.ImmutableArray<Microsoft.CodeAnalysis.OperationKind> operationKinds) -> void
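The TextReader overload listed above is the public surface this PR adds. A minimal usage sketch may help; it is illustrative only — the StringReader and sizes are placeholders, and real callers (for example editor hosts) would supply their own TextReader so that no intermediate string has to be allocated:

using System;
using System.IO;
using System.Text;
using Microsoft.CodeAnalysis.Text;

static class TextReaderSourceTextSample
{
    static void Main()
    {
        // Stands in for text a host already holds behind a TextReader.
        string existingText = new string('x', 8 * 1024 * 1024);

        using (TextReader reader = new StringReader(existingText))
        {
            SourceText sourceText = SourceText.From(
                reader,
                existingText.Length,
                Encoding.UTF8,
                SourceHashAlgorithm.Sha1);

            // Large inputs are decoded in chunks (LargeText) rather than copied
            // into one big string; see the LargeText changes below.
            Console.WriteLine(sourceText.Length);
        }
    }
}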
103 changes: 65 additions & 38 deletions src/Compilers/Core/Portable/Text/LargeText.cs
@@ -24,14 +24,15 @@ internal sealed class LargeText : SourceText
private readonly ImmutableArray<char[]> _chunks;
private readonly int[] _chunkStartOffsets;
private readonly int _length;
private readonly Encoding _encoding;
private readonly Encoding _encodingOpt;

internal LargeText(ImmutableArray<char[]> chunks, Encoding encoding, ImmutableArray<byte> checksum, SourceHashAlgorithm checksumAlgorithm, ImmutableArray<byte> embeddedTextBlob)
internal LargeText(ImmutableArray<char[]> chunks, Encoding encodingOpt, ImmutableArray<byte> checksum, SourceHashAlgorithm checksumAlgorithm, ImmutableArray<byte> embeddedTextBlob)
: base(checksum, checksumAlgorithm, embeddedTextBlob)
{
_chunks = chunks;
_encoding = encoding;
_encodingOpt = encodingOpt;
_chunkStartOffsets = new int[chunks.Length];

int offset = 0;
for (int i = 0; i < chunks.Length; i++)
{
@@ -42,6 +43,11 @@ internal LargeText(ImmutableArray<char[]> chunks, Encoding encoding, ImmutableAr
_length = offset;
}

internal LargeText(ImmutableArray<char[]> chunks, Encoding encodingOpt, SourceHashAlgorithm checksumAlgorithm)
: this(chunks, encodingOpt, default(ImmutableArray<byte>), checksumAlgorithm, default(ImmutableArray<byte>))
{
}

internal static SourceText Decode(Stream stream, Encoding encoding, SourceHashAlgorithm checksumAlgorithm, bool throwIfBinaryDetected, bool canBeEmbedded)
{
stream.Seek(0, SeekOrigin.Begin);
Expand All @@ -58,48 +64,69 @@ internal static SourceText Decode(Stream stream, Encoding encoding, SourceHashAl

using (var reader = new StreamReader(stream, encoding, detectEncodingFromByteOrderMarks: true, bufferSize: Math.Min(length, 4096), leaveOpen: true))
{
ArrayBuilder<char[]> chunks = ArrayBuilder<char[]>.GetInstance(1 + maxCharRemainingGuess / ChunkSize);
while (!reader.EndOfStream)
{
var nextChunkSize = ChunkSize;
if (maxCharRemainingGuess < ChunkSize)
{
// maxCharRemainingGuess typically overestimates a little
// so we will first fill a slightly smaller (maxCharRemainingGuess - 64) chunk
// and then use 64 char tail, which is likely to be resized.
nextChunkSize = Math.Max(maxCharRemainingGuess - 64, 64);
}
var chunks = ReadChunksFromTextReader(reader, maxCharRemainingGuess, throwIfBinaryDetected);

char[] chunk = new char[nextChunkSize];
// We must compute the checksum and embedded text blob now while we still have the original bytes in hand.
// We cannot re-encode to obtain checksum and blob as the encoding is not guaranteed to round-trip.
var checksum = CalculateChecksum(stream, checksumAlgorithm);
var embeddedTextBlob = canBeEmbedded ? EmbeddedText.CreateBlob(stream) : default(ImmutableArray<byte>);

Review comment (Member):
What is the 'embeddedTextBlob'?

Reply (Contributor, author):
@nguerrera is probably a better person to answer.

Reply (Contributor):
It is the blob that encodes (optionally) embedded source in a PDB. See #12625 for design info.

return new LargeText(chunks, reader.CurrentEncoding, checksum, checksumAlgorithm, embeddedTextBlob);
}
}

int charsRead = reader.ReadBlock(chunk, 0, chunk.Length);
if (charsRead == 0)
{
break;
}
internal static SourceText Decode(TextReader reader, int length, Encoding encodingOpt, SourceHashAlgorithm checksumAlgorithm)
{
if (length == 0)

Review comment (Member):
Can you put a comment mentioning why this optimization is necessary? i.e. what percentage of readers have length=0 and how much this saves us to optimize. Thanks!

Reply (Member):
I'm surprised we have any callers calling LargeText.Decode with a length of 0.

Reply (Contributor, author):
The @dotnet/roslyn-compiler team is probably the one that should answer.

{
return SourceText.From(string.Empty, encodingOpt, checksumAlgorithm);
}

maxCharRemainingGuess -= charsRead;
// throwIfBinaryDetected == false since we are given text reader from the beginning
var chunks = ReadChunksFromTextReader(reader, length, throwIfBinaryDetected: false);

if (charsRead < chunk.Length)
{
Array.Resize(ref chunk, charsRead);
}
return new LargeText(chunks, encodingOpt, checksumAlgorithm);

Review comment (Member):
Why 'LargeText'? 'Large' implies that it is used for, well, large inputs. But what if the length was only '1', would we still make a "Large" text?

Or, if LargeText is for texts of any size, perhaps its name should be changed. Maybe ChunkedSourceText?

Reply (Member):
Ah, n/m. This is LargeText.Decode, not SourceText.Decode. So the length check was already done before.

Reply (Contributor, author):
I am following the existing decode pattern. @dotnet/roslyn-compiler are probably the better people to answer.

}
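To make the thread above concrete: the reviewers conclude that the size check happens before LargeText.Decode is ever reached. The dispatch itself is not part of this diff, so the following is only an inferred sketch of its likely shape, consistent with the FromTextReader_Large test asserting that an input of LargeObjectHeapLimitInChars characters produces a LargeText:

// Inferred sketch only (not the actual Roslyn source): how SourceText.From(TextReader, ...)
// plausibly dispatches between the chunked LargeText path and the ordinary string path.
public static SourceText From(TextReader reader, int length, Encoding encoding = null,
    SourceHashAlgorithm checksumAlgorithm = SourceHashAlgorithm.Sha1)
{
    if (length >= LargeObjectHeapLimitInChars)
    {
        // Large input: decode straight into char[] chunks, never building one large string.
        return LargeText.Decode(reader, length, encoding, checksumAlgorithm);
    }

    // Small input: materializing a single string is cheap enough.
    return From(reader.ReadToEnd(), encoding, checksumAlgorithm);
}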

// Check for binary files
if (throwIfBinaryDetected && IsBinary(chunk))
{
throw new InvalidDataException();
}
private static ImmutableArray<char[]> ReadChunksFromTextReader(TextReader reader, int maxCharRemainingGuess, bool throwIfBinaryDetected)
{
var chunks = ArrayBuilder<char[]>.GetInstance(1 + maxCharRemainingGuess / ChunkSize);

chunks.Add(chunk);
while (reader.Peek() != -1)
{
var nextChunkSize = ChunkSize;
if (maxCharRemainingGuess < ChunkSize)
{
// maxCharRemainingGuess typically overestimates a little
// so we will first fill a slightly smaller (maxCharRemainingGuess - 64) chunk
// and then use 64 char tail, which is likely to be resized.
nextChunkSize = Math.Max(maxCharRemainingGuess - 64, 64);
}

// We must compute the checksum and embedded text blob now while we still have the original bytes in hand.
// We cannot re-encode to obtain checksum and blob as the encoding is not guaranteed to round-trip.
var checksum = CalculateChecksum(stream, checksumAlgorithm);
var embeddedTextBlob = canBeEmbedded ? EmbeddedText.CreateBlob(stream) : default(ImmutableArray<byte>);
return new LargeText(chunks.ToImmutableAndFree(), reader.CurrentEncoding, checksum, checksumAlgorithm, embeddedTextBlob);
char[] chunk = new char[nextChunkSize];

int charsRead = reader.ReadBlock(chunk, 0, chunk.Length);
if (charsRead == 0)
{
break;
}

maxCharRemainingGuess -= charsRead;

if (charsRead < chunk.Length)
{
Array.Resize(ref chunk, charsRead);

Review comment (Member):
This resizing will not actually change 'chunk'; it will allocate another array, copy the data into it, then point 'ref chunk' at that new array. Given that, do we want to pool chunks so that we don't leak them in the case where we need to resize?

Reply (Contributor, author):
I don't believe there is any pool here for chunk.

Reply (Member):
You could add one :)

Reply (Contributor, author):
Only if there is data that shows it helps, but I doubt it will: the chunks live as long as the source text lives, the pool would stay statically alive in memory, and the chunk size would have to be big, assuming it is used for large text.

}

// Check for binary files
if (throwIfBinaryDetected && IsBinary(chunk))
{
throw new InvalidDataException();
}

chunks.Add(chunk);
}

return chunks.ToImmutableAndFree();
}
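A small worked example of the tail-sizing heuristic above, using made-up numbers (ChunkSize's real value is not shown in this diff):

// Illustrative arithmetic only; assume ChunkSize = 4096 and a guess of 4100 chars remaining.
const int ChunkSize = 4096;
int maxCharRemainingGuess = 4100;

int nextChunkSize = maxCharRemainingGuess < ChunkSize
    ? Math.Max(maxCharRemainingGuess - 64, 64)   // just under the guess, with a 64-char floor
    : ChunkSize;

// Pass 1: guess (4100) >= ChunkSize, so a full 4096-char chunk is allocated and filled.
// Pass 2: guess is now 4, so Math.Max(4 - 64, 64) = 64 chars are allocated; only the
//         few remaining characters are read and the chunk is resized down to fit.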

/// <summary>
@@ -146,7 +173,7 @@ public override char this[int position]
}
}

public override Encoding Encoding => _encoding;
public override Encoding Encoding => _encodingOpt;

public override int Length => _length;

@@ -272,7 +299,7 @@ private int[] ParseLineStarts()
case '\u0085':
case '\u2028':
case '\u2029':
line_break:
line_break:
arrayBuilder.Add(position);
position = index;
break;