Skip to content

Commit

Permalink
SARIF has per-line rolling (partial) hash support (#2605)
Browse files Browse the repository at this point in the history
* starting rolling hash implementation: added computeFirstMod and data structures

* starting rolling hash implementation: added computeFirstMod and data structures

* [wip] added hashing algorithm to file regions

* [wip] Fixes to the rolling hash computation

* [wip] fixes to the hashing algorithm.
(await) discussion with codeql dev

* [wip] Moving hash computation to HashUtilities

* Porting tests from CodeQL repo

* Adding unit tests for rolling hash

* Adding comments

* [wip] added hashing algorithm to file regions

* [wip] Fixes to the rolling hash computation

* [wip] fixes to the hashing algorithm.
(await) discussion with codeql dev

* [wip] Moving hash computation to HashUtilities

* Porting tests from CodeQL repo

* Adding unit tests for rolling hash

* Adding comments

* removing generics in file regions cache

* updating Release History

* incorporating PR feedback

* format fixing Long
  • Loading branch information
suvamM authored Jan 26, 2023
1 parent d457aab commit e35f895
Show file tree
Hide file tree
Showing 5 changed files with 958 additions and 1 deletion.
2 changes: 1 addition & 1 deletion NuGet.Config
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<?xml version="1.0" encoding="utf-8"?>
<configuration>
<packageSources>
<clear />
Expand Down
1 change: 1 addition & 0 deletions src/ReleaseHistory.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

## **v3.2.0** (UNRELEASED)

* FEATURE: Allow per-line rolling (partial) hash computation for a file. [#2605](https://github.com/microsoft/sarif-sdk/pull/2605)
* BREAKING: Rename `--normalize-for-github` argument to `--normalize-for-ghas` for `convert` command and mark `--normalize-for-github` as obsolete. [#2581](https://github.com/microsoft/sarif-sdk/pull/2581)
* BREAKING: Update `IAnalysisContext.LogToolNotification` method to add `ReportingDescriptor` parameter. This is required in order to populate `AssociatedRule` data in `Notification` instances. The new method has a default value of null for the `associatedRule` parameter to maximize build compatibility. [#2604](https://github.com/microsoft/sarif-sdk/pull/2604)
* BREAKING: Correct casing of `LogMissingreportingConfiguration` helper to `LogMissingReportingConfiguration`. [#2599](https://github.com/microsoft/sarif-sdk/pull/2599)
Expand Down
139 changes: 139 additions & 0 deletions src/Sarif/HashUtilities.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,22 @@
using System.Text;
using System.Threading.Tasks;

using Microsoft.CodeAnalysis.Sarif.Numeric;

namespace Microsoft.CodeAnalysis.Sarif
{
public static class HashUtilities
{
/// <summary>
/// Type initializer: assigns <c>Sarif.FileSystem.Instance</c> to the
/// <see cref="FileSystem"/> property.
/// </summary>
static HashUtilities()
{
    FileSystem = Sarif.FileSystem.Instance;
}

// Character codes the rolling hash treats specially: RollingHash skips SPACE and TAB,
// rewrites CR as LF, and skips an LF that directly follows a CR.
private static readonly int TAB = '\t';
private static readonly int SPACE = ' ';
private static readonly int LF = '\n';
private static readonly int CR = '\r';
// Sentinel value that RollingHash feeds into the hash once, after the last character of the text.
private static readonly int EOF = 65535;
// Length of the rolling window (counted in processed characters) used by RollingHash.
private static readonly int BLOCK_SIZE = 100;
// Multiplier applied to the running hash for every processed character.
// NOTE(review): the Long constructor arguments are presumably (low, high, unsigned),
// i.e. the value 37 - confirm against Numeric.Long.
private static readonly Long MOD = new Long(37, 0, false);

private static IFileSystem _fileSystem;
internal static IFileSystem FileSystem
{
Expand Down Expand Up @@ -206,5 +216,134 @@ public static string ComputeMD5Hash(string fileName)
catch (UnauthorizedAccessException) { }
return md5;
}

/// <summary>
/// Computes a rolling (partial) hash for every line of <paramref name="fileText"/>.
/// </summary>
/// <remarks>
/// Spaces and tabs are ignored, a carriage return is hashed as a line feed, and a
/// line feed that immediately follows a carriage return is ignored. The hash
/// reported for a line is the value of the running hash after BLOCK_SIZE processed
/// characters starting at the first processed character of that line; the text is
/// followed by one EOF marker and then padded with zeros so trailing lines are
/// reported as well.
/// </remarks>
/// <param name="fileText">The text to hash. May be null.</param>
/// <returns>
/// A dictionary keyed by line number (the first line is 1) whose values have the
/// form "hash:count", where count is how many times that same hash string has been
/// produced so far. The dictionary is empty when <paramref name="fileText"/> is null.
/// </returns>
public static Dictionary<int, string> RollingHash(string fileText)
{
    var hashesByLine = new Dictionary<int, string>();

    // Nothing to hash: hand back the empty map.
    if (fileText == null)
    {
        return hashesByLine;
    }

    // Circular buffer holding the last BLOCK_SIZE processed characters.
    int[] ring = new int[BLOCK_SIZE];

    // Parallel to 'ring': the line number when the character stored in that slot
    // was the first one of its line, otherwise -1.
    int[] pendingLines = new int[BLOCK_SIZE];
    for (int slot = 0; slot < pendingLines.Length; slot++)
    {
        pendingLines[slot] = -1;
    }

    Long runningHash = new Long(0, 0, false);
    Long expiryFactor = ComputeFirstMod();

    // Position of the next slot to overwrite; wraps to zero at BLOCK_SIZE.
    int cursor = 0;
    // Number of the line the most recently processed character belongs to.
    int currentLine = 0;
    // True when the next processed character opens a new line.
    bool atLineStart = true;
    // True when the previously seen character was a carriage return.
    bool afterCarriageReturn = false;

    // How often each hash string has been produced, used to tell identical hashes apart.
    var occurrences = new Dictionary<string, int>();

    // Records the current hash for the line whose first character sits at 'cursor'.
    void EmitPendingLine()
    {
        string hex = runningHash.ToUnsigned().ToString(16);

        // A hash not seen before leaves 'seen' at zero, so it is reported with count 1.
        occurrences.TryGetValue(hex, out int seen);
        seen++;
        occurrences[hex] = seen;

        hashesByLine[pendingLines[cursor]] = $"{hex}:{seen}";
        pendingLines[cursor] = -1;
    }

    // Pushes 'incoming' into the window, drops the character it replaces from the
    // hash, and advances the cursor.
    void Roll(int incoming)
    {
        int outgoing = ring[cursor];
        ring[cursor] = incoming;

        Long shifted = MOD.Multiply(runningHash);
        Long extended = shifted.Add(Long.FromInt(incoming));
        Long expired = expiryFactor.Multiply(Long.FromInt(outgoing));
        runningHash = extended.Subtract(expired);

        cursor = (cursor + 1) % BLOCK_SIZE;
    }

    // Handles one input character. Arriving back at a slot that still carries a
    // line number means BLOCK_SIZE characters have been processed since that line
    // began, so its hash is emitted before the slot is reused.
    void Consume(int ch)
    {
        // Whitespace, and the LF half of a CR LF pair, never reach the hash.
        bool lineFeedAfterCarriageReturn = afterCarriageReturn && ch == LF;
        if (ch == SPACE || ch == TAB || lineFeedAfterCarriageReturn)
        {
            afterCarriageReturn = false;
            return;
        }

        // A carriage return is hashed as if it were a line feed.
        afterCarriageReturn = ch == CR;
        if (afterCarriageReturn)
        {
            ch = LF;
        }

        if (pendingLines[cursor] != -1)
        {
            EmitPendingLine();
        }

        if (atLineStart)
        {
            currentLine++;
            pendingLines[cursor] = currentLine;
        }

        // The next character starts a line exactly when this one ended a line.
        atLineStart = ch == LF;

        Roll(ch);
    }

    foreach (char c in fileText)
    {
        Consume(c);
    }

    Consume(EOF);

    // Push zeros through the whole window so every line still waiting gets emitted.
    for (int step = 0; step < BLOCK_SIZE; step++)
    {
        if (pendingLines[cursor] != -1)
        {
            EmitPendingLine();
        }

        Roll(0);
    }

    return hashesByLine;
}

/// <summary>
/// Computes the factor RollingHash uses to remove the character leaving the
/// window from the running hash.
/// </summary>
/// <returns>
/// The starting value <c>new Long(1, 0, false)</c> multiplied by MOD once for
/// every slot in the rolling window (BLOCK_SIZE times).
/// </returns>
private static Long ComputeFirstMod()
{
    Long firstMod = new Long(1, 0, false);

    // The iteration count must equal the window length used by RollingHash;
    // tie it to BLOCK_SIZE instead of repeating the literal so the two cannot drift apart.
    for (int i = 0; i < BLOCK_SIZE; i++)
    {
        firstMod = firstMod.Multiply(MOD);
    }

    return firstMod;
}
}
}
Loading

0 comments on commit e35f895

Please sign in to comment.