dotnet · adamsitnik · Oct 20, 2023 · Aug 9, 2023 · Oct 19, 2023
diff --git a/...ibraries/System.Collections.Immutable/src/System/Collections/Frozen/String/KeyAnalyzer.cs b/...ibraries/System.Collections.Immutable/src/System/Collections/Frozen/String/KeyAnalyzer.cs
@@ -46,29 +46,35 @@ public static AnalysisResults Analyze(
         /// <summary>Try to find the minimal unique substring index/length to use for comparisons.</summary>
         private static bool TryUseSubstring(ReadOnlySpan<string> uniqueStrings, bool ignoreCase, int minLength, int maxLength, out AnalysisResults results)
         {
-            const int MaxSubstringLengthLimit = 8; // arbitrary small-ish limit... t's not worth the increase in algorithmic complexity to analyze longer substrings
+            const int MaxSubstringLengthLimit = 8; // arbitrary small-ish limit... it's not worth the increase in algorithmic complexity to analyze longer substrings
+            int uniqueStringsLength = uniqueStrings.Length;
+
+            // Sufficient uniqueness factor of 95% is good enough.
+            // Instead of ensuring that 95% of data is good, we stop as soon as we know that more than 5% is bad.
+            int acceptableNonUniqueCount = uniqueStringsLength / 20;
 
             SubstringComparer comparer = ignoreCase ? new JustifiedCaseInsensitiveSubstringComparer() : new JustifiedSubstringComparer();
             HashSet<string> set = new HashSet<string>(
 #if NET6_0_OR_GREATER
-                uniqueStrings.Length,
+                uniqueStringsLength,
 #endif
                 comparer);
 
-            // For each substring length...
+            // For each substring length, preferring the shortest length that provides
+            // enough uniqueness...
             int maxSubstringLength = Math.Min(minLength, MaxSubstringLengthLimit);
             for (int count = 1; count <= maxSubstringLength; count++)
             {
                 comparer.IsLeft = true;
                 comparer.Count = count;
 
-                // For each index, get a uniqueness factor for the left-justified substrings.
+                // For each index, get a uniqueness factor for the left-justified substrings.
                 // If any is above our threshold, we're done.
                 for (int index = 0; index <= minLength - count; index++)
                 {
                     comparer.Index = index;
 
-                    if (HasSufficientUniquenessFactor(set, uniqueStrings))
+                    if (HasSufficientUniquenessFactor(set, uniqueStrings, acceptableNonUniqueCount))
                     {
                         results = CreateAnalysisResults(
                             uniqueStrings, ignoreCase, minLength, maxLength, index, count,
@@ -90,10 +96,9 @@ private static bool TryUseSubstring(ReadOnlySpan<string> uniqueStrings, bool ign
                     // If any is above our threshold, we're done.
                     for (int index = 0; index <= minLength - count; index++)
                     {
-                        // Get a uniqueness factor for the right-justified substrings.
-                        // If it's above our threshold, we're done.
                         comparer.Index = -index - count;
-                        if (HasSufficientUniquenessFactor(set, uniqueStrings))
+
+                        if (HasSufficientUniquenessFactor(set, uniqueStrings, acceptableNonUniqueCount))
                         {
                             results = CreateAnalysisResults(
                                 uniqueStrings, ignoreCase, minLength, maxLength, comparer.Index, count,
@@ -202,7 +207,7 @@ internal static unsafe bool IsAllAscii(ReadOnlySpan<char> s)
 #if NET8_0_OR_GREATER
         private static readonly SearchValues<char> s_asciiLetters = SearchValues.Create("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
 #endif
-        private static bool ContainsAnyLetters(ReadOnlySpan<char> s)
+        internal static bool ContainsAnyLetters(ReadOnlySpan<char> s)
         {
             Debug.Assert(IsAllAscii(s));
 
@@ -221,14 +226,10 @@ private static bool ContainsAnyLetters(ReadOnlySpan<char> s)
 #endif
         }
 
-        private static bool HasSufficientUniquenessFactor(HashSet<string> set, ReadOnlySpan<string> uniqueStrings)
+        internal static bool HasSufficientUniquenessFactor(HashSet<string> set, ReadOnlySpan<string> uniqueStrings, int acceptableNonUniqueCount)
         {
             set.Clear();
 
-            // Sufficient uniqueness factor of 95% is good enough.
-            // Instead of ensuring that 95% of data is good, we stop when we know that at least 5% is bad.
-            int acceptableNonUniqueCount = uniqueStrings.Length / 20;
-
             foreach (string s in uniqueStrings)
             {
                 if (!set.Add(s) && --acceptableNonUniqueCount < 0)

diff --git a/src/libraries/System.Collections.Immutable/tests/Frozen/KeyAnalyzerTests.cs b/src/libraries/System.Collections.Immutable/tests/Frozen/KeyAnalyzerTests.cs
@@ -2,6 +2,8 @@
 // The .NET Foundation licenses this file to you under the MIT license.
 
 using System;
+using System.Collections.Generic;
+using System.Linq;
 using Xunit;
 
 namespace System.Collections.Frozen.Tests
@@ -212,5 +214,75 @@ public static void IsAllAscii()
             Assert.True(KeyAnalyzer.IsAllAscii("abcdefghij".AsSpan()));
             Assert.False(KeyAnalyzer.IsAllAscii("abcdéfghij".AsSpan()));
         }
+
+        [Fact]
+        public static void ContainsAnyLetters()
+        {
+            Assert.True(KeyAnalyzer.ContainsAnyLetters("abc".AsSpan()));
+            Assert.True(KeyAnalyzer.ContainsAnyLetters("ABC".AsSpan()));
+            Assert.False(KeyAnalyzer.ContainsAnyLetters("123".AsSpan()));
+            // Note: only ASCII input may be passed to ContainsAnyLetters; anything else trips its
+            // Debug.Assert, and the actual implementation would never call it with such input.
+        }
+
+        [Fact]
+        public static void HasSufficientUniquenessFactor()
+        {
+            HashSet<string> set = new HashSet<string>(StringComparer.Ordinal);
+
+            Assert.True(KeyAnalyzer.HasSufficientUniquenessFactor(set, new[] { "a", "b", "c" }, 0));
+            Assert.Equal(3, set.Count);
+
+            Assert.True(KeyAnalyzer.HasSufficientUniquenessFactor(set, new[] { "a", "b", "a" }, 1));
+            Assert.Equal(2, set.Count); // set should only have the non-collided ones
+
+            Assert.False(KeyAnalyzer.HasSufficientUniquenessFactor(set, new[] { "aa", "ab", "aa" }, 0));
+            Assert.Equal(2, set.Count);
+        }
+
+        [Fact]
+        public static void HasSufficientUniquenessFactorInsensitive()
+        {
+            HashSet<string> set = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
+
+            Assert.True(KeyAnalyzer.HasSufficientUniquenessFactor(set, new[] { "a", "B", "c" }, 0));
+            Assert.Equal(3, set.Count);
+
+            Assert.True(KeyAnalyzer.HasSufficientUniquenessFactor(set, new[] { "aa", "AA" }, 1));
+            Assert.Equal(1, set.Count); // set should only have the non-collided ones
+
+            Assert.False(KeyAnalyzer.HasSufficientUniquenessFactor(set, new[] { "aa", "AA" }, 0));
+        }
+
+        [Fact]
+        public static void HasSufficientUniquenessFactorLarge()
+        {
+            HashSet<string> set = new HashSet<string>(StringComparer.Ordinal);
+
+            Assert.True(KeyAnalyzer.HasSufficientUniquenessFactor(set, new[] { "a", "b", "c" }, 1));
+            Assert.Equal(3, set.Count);
+
+            Assert.True(KeyAnalyzer.HasSufficientUniquenessFactor(set, new[] { "a", "b", "a" }, 1));
+            Assert.Equal(2, set.Count); // set should only have the non-collided ones
+
+            Assert.False(KeyAnalyzer.HasSufficientUniquenessFactor(set, new[] { "a", "a", "a" }, 1));
+        }
+
+        // reuse the typical data declared in the FrozenFromKnownValuesTests
+        public static IEnumerable<object[]> TypicalData() => FrozenFromKnownValuesTests.StringStringData();
+
+        [Theory]
+        [MemberData(nameof(TypicalData))]
+        public static void HasSufficientUniquenessKnownData(Dictionary<string, string> source)
+        {
+            string[] keys = source.Keys.ToArray();
+            HashSet<string> set = new HashSet<string>(source.Comparer);
+
+            int allowedCollisions = keys.Length / 20;
+            bool passable = KeyAnalyzer.HasSufficientUniquenessFactor(set, keys.AsSpan(), allowedCollisions);
+
+            if (passable)
+                Assert.InRange(set.Count, keys.Length - allowedCollisions, keys.Length);
+        }
     }
 }