Switch Content Hashing from CRC32 to XXHash64 (#198)
- CRC32's collision probability is too high for comfort, since it is only 32 bits.
- Switching to XXHash64, since it's a high-quality, 64-bit hash that is extremely fast (faster than CRC32 for medium-size inputs: https://lz4.github.io/lz4-java/1.3.0/xxhash-benchmark/).
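
For context, a rough birthday-bound sketch of why 32 bits is uncomfortable (not part of the commit; the ~100k-file corpus size is an assumed illustration):

  // Probability of at least one collision among n random b-bit hashes,
  // by the birthday approximation: 1 - exp(-n(n-1) / 2^(b+1)).
  def collisionProbability(n: Double, bits: Int): Double =
    1 - math.exp(-n * (n - 1) / math.pow(2, bits + 1))

  collisionProbability(1e5, 32)  // ~0.69: with 100k files, a CRC32 collision is more likely than not
  collisionProbability(1e5, 64)  // ~2.7e-10: negligible for a 64-bit hash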
ahirreddy authored Jan 11, 2024
1 parent 9778b1a commit f03a895
Showing 8 changed files with 89 additions and 27 deletions.
1 change: 1 addition & 0 deletions build.sbt
@@ -16,6 +16,7 @@ lazy val main = (project in file("sjsonnet"))
     "com.lihaoyi" %% "scalatags" % "0.9.3",
     "com.lihaoyi" %% "os-lib" % "0.7.2",
     "com.lihaoyi" %% "mainargs" % "0.2.0",
+    "org.lz4" % "lz4-java" % "1.8.0",
     "org.json" % "json" % "20211205",
     "org.scala-lang.modules" %% "scala-collection-compat" % "2.4.0",
     "org.tukaani" % "xz" % "1.8",
1 change: 1 addition & 0 deletions build.sc
@@ -107,6 +107,7 @@ class SjsonnetModule(val crossScalaVersion: String) extends Module {
   def ivyDeps = super.ivyDeps() ++ Agg(
     ivy"org.json:json:20211205",
     ivy"org.tukaani:xz::1.8",
+    ivy"org.lz4:lz4-java::1.8.0",
     ivy"org.yaml:snakeyaml::1.30"
   )
   def scalacOptions = Seq("-opt:l:inline", "-opt-inline-from:sjsonnet.**")
4 changes: 4 additions & 0 deletions sjsonnet/src-js/sjsonnet/Platform.scala
@@ -1,4 +1,5 @@
 package sjsonnet
+import java.io.File
 object Platform {
   def gzipBytes(s: Array[Byte]): String = {
     throw new Exception("GZip not implemented in Scala.js")
@@ -18,4 +19,7 @@ object Platform {
   def md5(s: String): String = {
     throw new Exception("MD5 not implemented in Scala.js")
   }
+  def hashFile(file: File): String = {
+    throw new Exception("hashFile not implemented in Scala.js")
+  }
 }
32 changes: 7 additions & 25 deletions sjsonnet/src-jvm-native/sjsonnet/CachedResolvedFile.scala
@@ -3,7 +3,7 @@ package sjsonnet
 import java.io.{BufferedInputStream, File, FileInputStream}
 import java.nio.charset.StandardCharsets
 import java.nio.file.Files
-import java.util.zip.CRC32
+
 import fastparse.ParserInput
 
@@ -15,8 +15,10 @@ import fastparse.ParserInput
  * @param memoryLimitBytes The maximum size of a file that we will resolve. This is not the size of
  * the buffer, but a mechanism to fail when being asked to resolve (and downstream parse) a file
  * that is beyond this limit.
+ * @param cacheThresholdBytes The maximum size of a file that we will cache in memory. If the file
+ * is larger than this, then we will serve it from disk
  */
-class CachedResolvedFile(val resolvedImportPath: OsPath, memoryLimitBytes: Long) extends ResolvedFile {
+class CachedResolvedFile(val resolvedImportPath: OsPath, memoryLimitBytes: Long, cacheThresholdBytes: Long = 1024 * 1024) extends ResolvedFile {
 
   private val jFile: File = resolvedImportPath.p.toIO
 
@@ -25,7 +27,7 @@ class CachedResolvedFile(val resolvedImportPath: OsPath, memoryLimitBytes: Long)
   assert(jFile.length() <= memoryLimitBytes, s"Resolved import path ${resolvedImportPath} is too large: ${jFile.length()} bytes > ${memoryLimitBytes} bytes")
 
   private[this] val resolvedImportContent: StaticResolvedFile = {
-    if (jFile.length() > 1024 * 1024) {
+    if (jFile.length() > cacheThresholdBytes) {
       // If the file is too large, then we will just read it from disk
       null
     } else {
@@ -59,33 +61,13 @@ class CachedResolvedFile(val resolvedImportPath: OsPath, memoryLimitBytes: Long)
     }
   }
 
-  private def crcHashFile(file: File): Long = {
-    val buffer = new Array[Byte](8192)
-    val crc = new CRC32()
-
-    val fis = new FileInputStream(file)
-    val bis = new BufferedInputStream(fis)
-
-    try {
-      var bytesRead = bis.read(buffer)
-      while (bytesRead != -1) {
-        crc.update(buffer, 0, bytesRead)
-        bytesRead = bis.read(buffer)
-      }
-    } finally {
-      bis.close()
-      fis.close()
-    }
-
-    crc.getValue()
-  }
-
   override lazy val contentHash: String = {
     if (resolvedImportContent == null) {
       // If the file is too large, then we will just read it from disk
-      crcHashFile(jFile).toString
+      Platform.hashFile(jFile)
     } else {
       resolvedImportContent.contentHash
     }
   }
 }
26 changes: 25 additions & 1 deletion sjsonnet/src-jvm/sjsonnet/Platform.scala
@@ -2,9 +2,10 @@ package sjsonnet
 
 import org.json.JSONObject
 
-import java.io.ByteArrayOutputStream
+import java.io.{ByteArrayOutputStream, BufferedInputStream, File, FileInputStream}
 import java.util.Base64
 import java.util.zip.GZIPOutputStream
+import net.jpountz.xxhash.{StreamingXXHash64, XXHashFactory, XXHash64}
 import org.tukaani.xz.LZMA2Options
 import org.tukaani.xz.XZOutputStream
 import org.yaml.snakeyaml.Yaml
@@ -53,4 +54,27 @@ object Platform {
       .map{ b => String.format("%02x", new java.lang.Integer(b & 0xff))}
       .mkString
   }
+
+  private[this] val xxHashFactory = XXHashFactory.fastestInstance()
+
+  def hashFile(file: File): String = {
+    val buffer = new Array[Byte](8192)
+    val hash: StreamingXXHash64 = xxHashFactory.newStreamingHash64(0)
+
+    val fis = new FileInputStream(file)
+    val bis = new BufferedInputStream(fis)
+
+    try {
+      var bytesRead = bis.read(buffer)
+      while (bytesRead != -1) {
+        hash.update(buffer, 0, bytesRead)
+        bytesRead = bis.read(buffer)
+      }
+    } finally {
+      bis.close()
+      fis.close()
+    }
+
+    hash.getValue().toString
+  }
 }
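
A quick usage sketch of the new helper (using a throwaway temp file, not from the commit): it hashes the file in 8 KiB chunks, so even imports far above the cache threshold never have to be fully resident in memory.

  val tmp = java.io.File.createTempFile("example", ".jsonnet")
  java.nio.file.Files.write(tmp.toPath, "{ a: 1 }".getBytes("UTF-8"))
  Platform.hashFile(tmp)  // decimal string of the signed 64-bit XXHash64 value (seed 0)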
6 changes: 6 additions & 0 deletions sjsonnet/src-native/sjsonnet/Platform.scala
@@ -1,4 +1,5 @@
 package sjsonnet
+import java.io.File
 object Platform {
   def gzipBytes(s: Array[Byte]): String = {
     throw new Exception("GZip not implemented in Scala Native")
@@ -18,4 +19,9 @@ object Platform {
   def md5(s: String): String = {
     throw new Exception("MD5 not implemented in Scala Native")
   }
+
+  def hashFile(file: File): String = {
+    // File hashes in Scala Native are just the file content
+    scala.io.Source.fromFile(file).mkString
+  }
 }
1 change: 0 additions & 1 deletion sjsonnet/src/sjsonnet/Importer.scala
@@ -2,7 +2,6 @@ package sjsonnet
 
 import java.io.{BufferedInputStream, BufferedReader, ByteArrayInputStream, File, FileInputStream, FileReader, InputStream, RandomAccessFile, Reader, StringReader}
 import java.nio.file.Files
-import java.util.zip.CRC32
 import java.security.MessageDigest
 import scala.collection.mutable
 import fastparse.{IndexedParserInput, Parsed, ParserInput}
45 changes: 45 additions & 0 deletions sjsonnet/test/src-jvm/sjsonnet/XxHash64Tests.scala
@@ -0,0 +1,45 @@
+package sjsonnet
+
+import java.nio.file.{Files, Path => JavaPath}
+
+import scala.util.Random
+
+import net.jpountz.xxhash.{StreamingXXHash64, XXHashFactory, XXHash64}
+
+import utest._
+import TestUtils.eval
+
+object XxHash64Tests extends TestSuite {
+  val tests = Tests {
+
+    test("xxhash") {
+      for (sizeInKb <- List(1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024)) {
+        val (randomContent, tempFilePath) = generateRandomContentAndSaveToFile(sizeInKb)
+        val xxHash64 = XXHashFactory.fastestInstance().hash64()
+        // Use the non-streaming version of xxHash64 to hash the whole byte array
+        val xxHash64Result = xxHash64.hash(randomContent, 0, randomContent.length, 0).toString
+        // Then use the streaming version of xxHash64 to hash the file in chunks
+        val cachedFile = new CachedResolvedFile(
+          OsPath(os.Path(tempFilePath)),
+          memoryLimitBytes = Int.MaxValue,
+          cacheThresholdBytes = 0)
+        // They should agree
+        val hash = cachedFile.contentHash
+        assert(xxHash64Result == hash)
+      }
+    }
+  }
+
+  private def generateRandomContentAndSaveToFile(sizeInKb: Int): (Array[Byte], JavaPath) = {
+    val random = new Random()
+    val byteArraySize = 1024 * sizeInKb
+    val randomContent = new Array[Byte](byteArraySize)
+    random.nextBytes(randomContent)
+
+    val tempFilePath = Files.createTempFile("randomContent", ".tmp")
+    Files.write(tempFilePath, randomContent)
+
+    (randomContent, tempFilePath)
+  }
+}
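
The property this test relies on, shown in isolation (a standalone sketch, not part of the commit): feeding the same bytes to a StreamingXXHash64 in arbitrary chunks, with the same seed, yields exactly the one-shot XXHash64 value.

  val factory = XXHashFactory.fastestInstance()
  val data = Array.fill[Byte](3000)(42.toByte)
  val oneShot = factory.hash64().hash(data, 0, data.length, 0)
  val streaming = factory.newStreamingHash64(0)  // same seed (0) as the one-shot call
  streaming.update(data, 0, 1000)                // first 1000 bytes
  streaming.update(data, 1000, 2000)             // remaining 2000 bytes
  assert(streaming.getValue() == oneShot)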
