KeyAnalyzer tests. #90301

Merged 2 commits on Oct 20, 2023.
Changes from 1 commit.

KeyAnalyzer.cs
@@ -46,29 +46,35 @@ public static AnalysisResults Analyze(
/// <summary>Try to find the minimal unique substring index/length to use for comparisons.</summary>
private static bool TryUseSubstring(ReadOnlySpan<string> uniqueStrings, bool ignoreCase, int minLength, int maxLength, out AnalysisResults results)
{
-const int MaxSubstringLengthLimit = 8; // arbitrary small-ish limit... t's not worth the increase in algorithmic complexity to analyze longer substrings
+const int MaxSubstringLengthLimit = 8; // arbitrary small-ish limit... it's not worth the increase in algorithmic complexity to analyze longer substrings
+int uniqueStringsLength = uniqueStrings.Length;
+
+// Sufficient uniqueness factor of 95% is good enough.
+// Instead of ensuring that 95% of data is good, we stop when we know that at least 5% is bad.
+int acceptableNonUniqueCount = uniqueStringsLength / 20;

SubstringComparer comparer = ignoreCase ? new JustifiedCaseInsensitiveSubstringComparer() : new JustifiedSubstringComparer();
HashSet<string> set = new HashSet<string>(
#if NET6_0_OR_GREATER
-uniqueStrings.Length,
+uniqueStringsLength,
#endif
comparer);

-// For each substring length...
+// For each substring length...preferring the shortest length that provides
+// enough uniqueness
int maxSubstringLength = Math.Min(minLength, MaxSubstringLengthLimit);
for (int count = 1; count <= maxSubstringLength; count++)
{
comparer.IsLeft = true;
comparer.Count = count;

-// For each index, get a uniqueness factor for the left-justified substrings.
+// For each index from, get a uniqueness factor for the left-justified substrings.
// If any is above our threshold, we're done.
for (int index = 0; index <= minLength - count; index++)
{
comparer.Index = index;

-if (HasSufficientUniquenessFactor(set, uniqueStrings))
+if (HasSufficientUniquenessFactor(set, uniqueStrings, acceptableNonUniqueCount))
{
results = CreateAnalysisResults(
uniqueStrings, ignoreCase, minLength, maxLength, index, count,
@@ -90,10 +96,9 @@ private static bool TryUseSubstring(ReadOnlySpan<string> uniqueStrings, bool ign
// If any is above our threshold, we're done.
for (int index = 0; index <= minLength - count; index++)
{
-// Get a uniqueness factor for the right-justified substrings.
-// If it's above our threshold, we're done.
comparer.Index = -index - count;
-if (HasSufficientUniquenessFactor(set, uniqueStrings))
+
+if (HasSufficientUniquenessFactor(set, uniqueStrings, acceptableNonUniqueCount))
{
results = CreateAnalysisResults(
uniqueStrings, ignoreCase, minLength, maxLength, comparer.Index, count,
@@ -202,7 +207,7 @@ internal static unsafe bool IsAllAscii(ReadOnlySpan<char> s)
#if NET8_0_OR_GREATER
private static readonly SearchValues<char> s_asciiLetters = SearchValues.Create("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
#endif
-private static bool ContainsAnyLetters(ReadOnlySpan<char> s)
+internal static bool ContainsAnyLetters(ReadOnlySpan<char> s)
{
Debug.Assert(IsAllAscii(s));

@@ -221,14 +226,10 @@ private static bool ContainsAnyLetters(ReadOnlySpan<char> s)
#endif
}

-private static bool HasSufficientUniquenessFactor(HashSet<string> set, ReadOnlySpan<string> uniqueStrings)
+internal static bool HasSufficientUniquenessFactor(HashSet<string> set, ReadOnlySpan<string> uniqueStrings, int acceptableNonUniqueCount)
{
set.Clear();

-// Sufficient uniqueness factor of 95% is good enough.
-// Instead of ensuring that 95% of data is good, we stop when we know that at least 5% is bad.
-int acceptableNonUniqueCount = uniqueStrings.Length / 20;
-
foreach (string s in uniqueStrings)
{
if (!set.Add(s) && --acceptableNonUniqueCount < 0)
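
The net effect of these KeyAnalyzer.cs changes is that the caller now computes the collision budget once (5% of the key count, i.e. a 95% uniqueness requirement) and passes it into every probe instead of recomputing it per call. Below is a minimal standalone sketch of the check after this refactoring; the clear-on-failure tail is not shown in the hunk above and is assumed from the new tests further down, so treat this as an illustration rather than the verbatim runtime source:

// The caller computes the budget once, e.g.:
//     int acceptableNonUniqueCount = uniqueStringsLength / 20;   // tolerate up to ~5% collisions
internal static bool HasSufficientUniquenessFactor(HashSet<string> set, ReadOnlySpan<string> uniqueStrings, int acceptableNonUniqueCount)
{
    set.Clear();

    foreach (string s in uniqueStrings)
    {
        // Every duplicate under the set's comparer spends one unit of the budget;
        // exhausting the budget rejects this candidate substring index/length.
        if (!set.Add(s) && --acceptableNonUniqueCount < 0)
        {
            set.Clear();   // assumed from the tests: a failed probe leaves the set empty
            return false;
        }
    }

    return true;
}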

KeyAnalyzerTests.cs
@@ -2,6 +2,8 @@
// The .NET Foundation licenses this file to you under the MIT license.

using System;
+using System.Collections.Generic;
+using System.Linq;
using Xunit;

namespace System.Collections.Frozen.Tests
@@ -212,5 +214,75 @@ public static void IsAllAscii()
Assert.True(KeyAnalyzer.IsAllAscii("abcdefghij".AsSpan()));
Assert.False(KeyAnalyzer.IsAllAscii("abcdéfghij".AsSpan()));
}

+[Fact]
+public static void ContainsAnyLetters()
+{
+Assert.True(KeyAnalyzer.ContainsAnyLetters("abc".AsSpan()));
+Assert.True(KeyAnalyzer.ContainsAnyLetters("ABC".AsSpan()));
+Assert.False(KeyAnalyzer.ContainsAnyLetters("123".AsSpan()));
+// note, must only pass ASCII to ContainsAnyLetters, anything else is a
+// Debug.Assert and would not have been called in the actual implementation
+}
+
+[Fact]
+public static void HasSufficientUniquenessFactor()
+{
+HashSet<string> set = new HashSet<string>(StringComparer.Ordinal);
+
+Assert.True(KeyAnalyzer.HasSufficientUniquenessFactor(set, new[] { "a", "b", "c" }, 0));
+Assert.Equal(3, set.Count);
+
+Assert.True(KeyAnalyzer.HasSufficientUniquenessFactor(set, new[] { "a", "b", "a" }, 1));
+Assert.Equal(2, set.Count); // set should only have the non-collided ones
+
+Assert.False(KeyAnalyzer.HasSufficientUniquenessFactor(set, new[] { "aa", "ab", "aa" }, 0));
+Assert.Equal(0, set.Count); // if we failed it should empty the set
+}
+
+[Fact]
+public static void HasSufficientUniquenessFactorInsensitive()
+{
+HashSet<string> set = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
+
+Assert.True(KeyAnalyzer.HasSufficientUniquenessFactor(set, new[] { "a", "B", "c" }, 0));
+Assert.Equal(3, set.Count);
+
+Assert.True(KeyAnalyzer.HasSufficientUniquenessFactor(set, new[] { "aa", "AA" }, 1));
+Assert.Equal(1, set.Count); // set should only have the non-collided ones
+
+Assert.False(KeyAnalyzer.HasSufficientUniquenessFactor(set, new[] { "aa", "AA" }, 0));
+}
+
+[Fact]
+public static void HasSufficientUniquenessFactorLarge()
+{
+HashSet<string> set = new HashSet<string>(StringComparer.Ordinal);
+
+Assert.True(KeyAnalyzer.HasSufficientUniquenessFactor(set, new[] { "a", "b", "c" }, 1));
+Assert.Equal(3, set.Count);
+
+Assert.True(KeyAnalyzer.HasSufficientUniquenessFactor(set, new[] { "a", "b", "a" }, 1));
+Assert.Equal(2, set.Count); // set should only have the non-collided ones
+
+Assert.False(KeyAnalyzer.HasSufficientUniquenessFactor(set, new[] { "a", "a", "a" }, 1));
+}
+
+// reuse the typical data declared in the FrozenFromKnownValuesTests
+public static IEnumerable<object[]> TypicalData() => FrozenFromKnownValuesTests.StringStringData();
+
+[Theory]
+[MemberData(nameof(TypicalData))]
+public static void HasSufficientUniquenessKnownData(Dictionary<string, string> source)
+{
+string[] keys = source.Keys.ToArray();
+HashSet<string> set = new HashSet<string>(source.Comparer);
+
+int allowedCollisions = keys.Length / 20;
+bool passable = KeyAnalyzer.HasSufficientUniquenessFactor(set, keys.AsSpan(), allowedCollisions);
+
+if (passable)
+Assert.InRange(set.Count, keys.Length - allowedCollisions, keys.Length);
+}
}
}
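
As a worked example of the 5% rule these tests exercise: with 60 keys the budget is 60 / 20 == 3 collisions, so a probe producing 3 duplicates still passes, while a 4th duplicate fails it and (per the tests above) clears the set. A small illustrative call follows; the key values are invented, but the call shape matches the tests:

// 60 keys total, 4 of which collide under the ordinal comparer.
string[] keys = Enumerable.Range(0, 56).Select(i => "k" + i)
    .Concat(new[] { "k0", "k1", "k2", "k3" })
    .ToArray();

var set = new HashSet<string>(StringComparer.Ordinal);
int allowedCollisions = keys.Length / 20;   // 60 / 20 == 3

// 4 duplicates exceed the budget of 3, so the probe fails and empties the set.
bool ok = KeyAnalyzer.HasSufficientUniquenessFactor(set, keys.AsSpan(), allowedCollisions);
// ok == false, set.Count == 0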