Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[RegexDiff X64] MihaZupan/runtime/regex-fewerUintCasts #625

Open
MihuBot opened this issue Sep 5, 2024 · 1 comment
Open

[RegexDiff X64] MihaZupan/runtime/regex-fewerUintCasts #625

MihuBot opened this issue Sep 5, 2024 · 1 comment

Comments

@MihuBot
Copy link
Owner

MihuBot commented Sep 5, 2024

Job completed in 20 minutes 42 seconds.

Using arguments: regexdiff -NoPRLink

Examples of GeneratedRegex source diffs

8625 out of 18885 patterns have generated source code changes.

"[^'\",]+'[^^']+'|[^'\",]+\"[^\"]+\"|[^,]+" (21563 uses)
[GeneratedRegex("[^'\",]+'[^^']+'|[^'\",]+\"[^\"]+\"|[^,]+", RegexOptions.None)]
  }
  
  // Match '\''.
-   if ((uint)slice.Length < 2 || slice[1] != '\'')
+   if (slice.Length < 2 || slice[1] != '\'')
  {
      goto AlternationBranch;
  }
  }
  
  // Match '"'.
-   if ((uint)slice.Length < 2 || slice[1] != '"')
+   if (slice.Length < 2 || slice[1] != '"')
  {
      goto AlternationBranch1;
  }
"(?:,\"|^\")(\"\"|[\\w\\W]*?)(?=\",|\"$)|(?:, ..." (18474 uses)
[GeneratedRegex("(?:,\"|^\")(\"\"|[\\w\\W]*?)(?=\",|\"$)|(?:,(?!\")|^(?!\"))([^,]*?)(?=$|,)|(\r\n|\n)", RegexOptions.Compiled)]
  // Branch 0
  {
      // Match ','.
-       if ((uint)slice.Length < 2 || slice[1] != ',')
+       if (slice.Length < 2 || slice[1] != ',')
      {
          goto AlternationBranch3;
      }
      }
      
      // Match '"'.
-       if ((uint)slice.Length < 2 || slice[1] != '"')
+       if (slice.Length < 2 || slice[1] != '"')
      {
          goto NegativeLookaroundMatch;
      }
  {
      case '\r':
          // Match '\n'.
-           if ((uint)slice.Length < 2 || slice[1] != '\n')
+           if (slice.Length < 2 || slice[1] != '\n')
          {
              UncaptureUntil(0);
              return false; // The input didn't match.
"^\\s*(((?<ORIGIN>(((\\d+>)?[a-zA-Z]?:[^:]*)| ..." (7826 uses)
[GeneratedRegex("^\\s*(((?<ORIGIN>(((\\d+>)?[a-zA-Z]?:[^:]*)|([^:]*))):)|())(?<SUBCATEGORY>(()|([^:]*? )))(?<CATEGORY>(error|warning))( \\s*(?<CODE>[^: ]*))?\\s*:(?<TEXT>.*)$", RegexOptions.IgnoreCase | RegexOptions.Compiled)]
  {
      case 'E' or 'e':
          
-           if ((uint)slice.Length < 5 ||
+           if (slice.Length < 5 ||
              !slice.Slice(1).StartsWith("rror", StringComparison.OrdinalIgnoreCase)) // Match the string "rror" (ordinal case-insensitive)
          {
              goto CaptureBacktrack7;
          
      case 'W' or 'w':
          
-           if ((uint)slice.Length < 7 ||
+           if (slice.Length < 7 ||
              !slice.Slice(1).StartsWith("arning", StringComparison.OrdinalIgnoreCase)) // Match the string "arning" (ordinal case-insensitive)
          {
              goto CaptureBacktrack7;
"^\\s*(?<ORIGIN>(?<FILENAME>.*):(?<LOCATION>( ..." (7826 uses)
[GeneratedRegex("^\\s*(?<ORIGIN>(?<FILENAME>.*):(?<LOCATION>(?<LINE>[0-9]*):(?<COLUMN>[0-9]*))):(?<CATEGORY> error| warning):(?<TEXT>.*)", RegexOptions.IgnoreCase | RegexOptions.Compiled)]
  // Match with 2 alternative expressions.
  //{
-       if ((uint)slice.Length < 2)
+       if (slice.Length < 2)
      {
          goto CaptureBacktrack1;
      }
      {
          case 'E' or 'e':
              
-               if ((uint)slice.Length < 6 ||
+               if (slice.Length < 6 ||
                  !slice.Slice(2).StartsWith("rror", StringComparison.OrdinalIgnoreCase)) // Match the string "rror" (ordinal case-insensitive)
              {
                  goto CaptureBacktrack1;
              
          case 'W' or 'w':
              
-               if ((uint)slice.Length < 8 ||
+               if (slice.Length < 8 ||
                  !slice.Slice(2).StartsWith("arning", StringComparison.OrdinalIgnoreCase)) // Match the string "arning" (ordinal case-insensitive)
              {
                  goto CaptureBacktrack1;
"^[A-Za-z]:" (5756 uses)
[GeneratedRegex("^[A-Za-z]:", RegexOptions.Compiled)]
      return false; // The input didn't match.
  }
  
-   if ((uint)slice.Length < 2 ||
+   if (slice.Length < 2 ||
      !char.IsAsciiLetter(slice[0]) || // Match a character in the set [A-Za-z].
      slice[1] != ':') // Match ':'.
  {
"[A-z-[dDfFiIoOqQuUwWzZ]]\\d[A-z-[dDfFiIoOqQu ..." (5703 uses)
[GeneratedRegex("[A-z-[dDfFiIoOqQuUwWzZ]]\\d[A-z-[dDfFiIoOqQuU]] *\\d[A-z-[dDfFiIoOqQuU]]\\d\\b", RegexOptions.IgnoreCase | RegexOptions.Compiled | RegexOptions.Singleline | RegexOptions.CultureInvariant)]
  char ch;
  ReadOnlySpan<char> slice = inputSpan.Slice(pos);
  
-   if ((uint)slice.Length < 3 ||
+   if (slice.Length < 3 ||
      ((ch = slice[0]) < 128 ? ("\0\0\0\0綮ﭝ綯͝"[ch >> 4] & (1 << (ch & 0xF))) == 0 : !RegexRunner.CharInClass((char)ch, "\0\u0004\0A{KÅ\0 \0DEFGIJOPQRUVWXZ[defgijopqruvwxz{")) || // Match a character in the set [A-z\u212A-[DFIOQUWZdfioquwz]].
      !char.IsDigit(slice[1]) || // Match a Unicode digit.
      ((ch = slice[2]) < 128 ? ("\0\0\0\0綮\uffdd綯ߝ"[ch >> 4] & (1 << (ch & 0xF))) == 0 : !RegexRunner.CharInClass((char)ch, "\0\u0004\0A{KÅ\0\u0018\0DEFGIJOPQRUVdefgijopqruv"))) // Match a character in the set [A-z\u212A-[DFIOQUdfioqu]].
      pos += iteration;
  }
  
-   if ((uint)slice.Length < 6 ||
+   if (slice.Length < 6 ||
      !char.IsDigit(slice[3]) || // Match a Unicode digit.
      ((ch = slice[4]) < 128 ? ("\0\0\0\0綮\uffdd綯ߝ"[ch >> 4] & (1 << (ch & 0xF))) == 0 : !RegexRunner.CharInClass((char)ch, "\0\u0004\0A{KÅ\0\u0018\0DEFGIJOPQRUVdefgijopqruv")) || // Match a character in the set [A-z\u212A-[DFIOQUdfioqu]].
      !char.IsDigit(slice[5])) // Match a Unicode digit.
"\\d{5}$|\\d{5}-\\d{4}$" (5703 uses)
[GeneratedRegex("\\d{5}$|\\d{5}-\\d{4}$", RegexOptions.IgnoreCase | RegexOptions.Compiled | RegexOptions.Singleline | RegexOptions.CultureInvariant)]
  // Match a Unicode digit exactly 5 times.
  {
-       if ((uint)slice.Length < 5 ||
+       if (slice.Length < 5 ||
          !char.IsDigit(slice[0]) ||
          !char.IsDigit(slice[1]) ||
          !char.IsDigit(slice[2]) ||
      
      // Branch 1
      {
-           if ((uint)slice.Length < 10 ||
+           if (slice.Length < 10 ||
              slice[5] != '-' || // Match '-'.
              !char.IsDigit(slice[6]) || // Match a Unicode digit exactly 4 times.
              !char.IsDigit(slice[7]) ||
"^[a-f0-9]{32}$" (4920 uses)
[GeneratedRegex("^[a-f0-9]{32}$", RegexOptions.Compiled)]
  // Match a character in the set [0-9a-f] exactly 32 times.
  {
-       if ((uint)slice.Length < 32)
+       if (slice.Length < 32)
      {
          return false; // The input didn't match.
      }
"^((([a-z]|\\d|[!#\\$%&'\\*\\+\\-\\/=\\?\\^_` ..." (4566 uses)
[GeneratedRegex("^((([a-z]|\\d|[!#\\$%&'\\*\\+\\-\\/=\\?\\^_`{\\|}~]|[\\u00A0-\\uD7FF\\uF900-\\uFDCF\\uFDF0-\\uFFEF])+(\\.([a-z]|\\d|[!#\\$%&'\\*\\+\\-\\/=\\?\\^_`{\\|}~]|[\\u00A0-\\uD7FF\\uF900-\\uFDCF\\uFDF0-\\uFFEF])+)*)|((\\x22)((((\\x20|\\x09)*(\\x0d\\x0a))?(\\x20|\\x09)+)?(([\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x7f]|\\x21|[\\x23-\\x5b]|[\\x5d-\\x7e]|[\\u00A0-\\uD7FF\\uF900-\\uFDCF\\uFDF0-\\uFFEF])|(\\\\([\\x01-\\x09\\x0b\\x0c\\x0d-\\x7f]|[\\u00A0-\\uD7FF\\uF900-\\uFDCF\\uFDF0-\\uFFEF]))))*(((\\x20|\\x09)*(\\x0d\\x0a))?(\\x20|\\x09)+)?(\\x22)))@((([a-z]|\\d|[\\u00A0-\\uD7FF\\uF900-\\uFDCF\\uFDF0-\\uFFEF])|(([a-z]|\\d|[\\u00A0-\\uD7FF\\uF900-\\uFDCF\\uFDF0-\\uFFEF])([a-z]|\\d|-|\\.|_|~|[\\u00A0-\\uD7FF\\uF900-\\uFDCF\\uFDF0-\\uFFEF])*([a-z]|\\d|[\\u00A0-\\uD7FF\\uF900-\\uFDCF\\uFDF0-\\uFFEF])))\\.)+(([a-z]|[\\u00A0-\\uD7FF\\uF900-\\uFDCF\\uFDF0-\\uFFEF])|(([a-z]|[\\u00A0-\\uD7FF\\uF900-\\uFDCF\\uFDF0-\\uFFEF])([a-z]|\\d|-|\\.|_|~|[\\u00A0-\\uD7FF\\uF900-\\uFDCF\\uFDF0-\\uFFEF])*([a-z]|[\\u00A0-\\uD7FF\\uF900-\\uFDCF\\uFDF0-\\uFFEF])))\\.?$", RegexOptions.IgnoreCase | RegexOptions.ExplicitCapture | RegexOptions.Compiled)]
  // Branch 1
  //{
-       if ((uint)slice.Length < 2 ||
+       if (slice.Length < 2 ||
          slice[0] != '\\' || // Match '\\'.
          ((ch = slice[1]) < 128 ? ("ﯾ\uffff\uffff\uffff\uffff\uffff\uffff\uffff"[ch >> 4] & (1 << (ch & 0xF))) == 0 : !RegexRunner.CharInClass((char)ch, "\0\n\0\u0001\n\v\u0080 \ud800豈\ufdd0ﷰ\ufff0"))) // Match a character in the set [\u0001-\t\v-\u007F\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF].
      {
"xmlns=\"[^\"]*\"\\s*" (4166 uses)
[GeneratedRegex("xmlns=\"[^\"]*\"\\s*", RegexOptions.None)]
  }
  
  // Match '"'.
-   if ((uint)slice.Length < 8 || slice[7] != '"')
+   if (slice.Length < 8 || slice[7] != '"')
  {
      return false; // The input didn't match.
  }

For more diff examples, see https://gist.github.com/MihuBot/075613d3f28b6a1867e2d865fbc96b4b

Total bytes of base: 54137458
Total bytes of diff: 54153509
Total bytes of delta: 16051 (0.03 % of base)
Total relative delta: -58.17
    diff is a regression.
    relative diff is an improvement.

For a list of JIT diff regressions, see Regressions.md
For a list of JIT diff improvements, see Improvements.md

Sample source code for further analysis
// Relative path of the locally cached results file (resolved against the current directory).
const string JsonPath = "RegexResults-625.json";

// Download and extract Results.json only when no local copy exists from an earlier run.
if (!File.Exists(JsonPath))
{
    // Own the client explicitly so it is disposed deterministically once the download is done.
    using var httpClient = new HttpClient();
    await using var archiveStream = await httpClient.GetStreamAsync("https://runtimeutils.blob.core.windows.net/artifacts/EcdZFMOAAAE/Results.zip");
    using var archive = new ZipArchive(archiveStream, ZipArchiveMode.Read);

    // First(...) throws if the archive has no "Results.json" entry, which surfaces a bad download early.
    archive.Entries.First(e => e.Name == "Results.json").ExtractToFile(JsonPath);
}

// RegexEntry declares value-tuple arrays. Tuple elements are public fields, and System.Text.Json
// skips fields unless IncludeFields is enabled, which would leave every tuple at its default value.
var jsonOptions = new JsonSerializerOptions { IncludeFields = true };

using FileStream jsonFileStream = File.OpenRead(JsonPath);
RegexEntry[] entries = JsonSerializer.Deserialize<RegexEntry[]>(jsonFileStream, jsonOptions)!;
Console.WriteLine($"Working with {entries.Length} patterns");



/// <summary>
/// Describes one regex: its pattern text, the <see cref="RegexOptions"/> it is constructed with, and a count.
/// </summary>
/// <param name="Pattern">The regular expression pattern string.</param>
/// <param name="Options">The options the pattern is used with.</param>
/// <param name="Count">Presumably the number of known uses of this pattern/options pair — TODO confirm against the data producer.</param>
record KnownPattern(string Pattern, RegexOptions Options, int Count);

/// <summary>
/// One element of the <c>RegexEntry[]</c> array that is deserialized from the results JSON file.
/// </summary>
sealed class RegexEntry
{
    /// <summary>The pattern, options and count this entry belongs to.</summary>
    public required KnownPattern Regex { get; set; }

    /// <summary>Presumably the generated source for the pattern on the main branch — TODO confirm.</summary>
    public required string MainSource { get; set; }

    /// <summary>Presumably the generated source for the pattern on the PR branch — TODO confirm.</summary>
    public required string PrSource { get; set; }

    /// <summary>Optional diff text; <c>null</c> when absent from the JSON. Presumably the complete diff between the two sources.</summary>
    public string? FullDiff { get; set; }

    /// <summary>Optional diff text; <c>null</c> when absent from the JSON. Presumably a shortened form of <see cref="FullDiff"/>.</summary>
    public string? ShortDiff { get; set; }

    // NOTE(review): the two properties below use value tuples, whose elements are fields.
    // System.Text.Json only reads and writes fields when JsonSerializerOptions.IncludeFields is true.

    /// <summary>Optional list of (name, values) pairs; judging by the name, character search-value sets used by the generated code — verify.</summary>
    public (string Name, string Values)[]? SearchValuesOfChar { get; set; }

    /// <summary>Optional list of (values, comparison type) pairs; judging by the name, string search-value sets used by the generated code — verify.</summary>
    public (string[] Values, StringComparison ComparisonType)[]? SearchValuesOfString { get; set; }
}

Artifacts:

@MihuBot
Copy link
Owner Author

MihuBot commented Sep 5, 2024

@MihaZupan

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant