fix tests using hashBiggestIntVM vm callback

nim-lang · Jul 18, 2019 · 7847592 · 7847592
1 parent 93b46c4
commit 7847592
Show file tree

Hide file tree

Showing 5 changed files with 75 additions and 32 deletions.
diff --git a/compiler/vmops.nim b/compiler/vmops.nim
@@ -16,6 +16,7 @@ from math import sqrt, ln, log10, log2, exp, round, arccos, arcsin,
 from os import getEnv, existsEnv, dirExists, fileExists, putEnv, walkDir, getAppFilename
 from md5 import getMD5
 from sighashes import symBodyDigest
+from std/hashes import hashBiggestInt
 
 template mathop(op) {.dirty.} =
   registerCallback(c, "stdlib.math." & astToStr(op), `op Wrapper`)
@@ -144,3 +145,6 @@ proc registerAdditionalOps*(c: PCtx) =
     let n = getNode(a, 0)
     if n.kind != nkSym: raise newException(ValueError, "node is not a symbol")
     setResult(a, $symBodyDigest(c.graph, n.sym))
+
+  registerCallback c, "stdlib.hashes.hashBiggestIntVM", proc (a: VmArgs) {.nimcall.} =
+    a.setResult hashBiggestInt(getInt(a, 0))
diff --git a/lib/pure/hashes.nim b/lib/pure/hashes.nim
@@ -56,38 +56,11 @@ type
 const
   IntSize = sizeof(int)
 
-proc preferStringHash*(T: typedesc): bool =
-  ## whether hashing is more efficient using `hash($x)`
-  # when string hashing is more efficient, see #11764
-  # exported so user defined hash can use this too
-  T.sizeof >= 4
-
-proc hash*(x: string): Hash
-
-proc hash*[T: SomeNumber | Ordinal | char](x: T): Hash {.inline.} =
-  ## Efficient hashing of numbers, ordinals (eg enum), char.
-  when preferStringHash(T): # fix #11764
-    when T is SomeFloat:
-      # 0.0 vs -0.0 should map to same hash to avoid weird behavior.
-      # the only non nan value that can cause clash is 0 according to
-      # https://stackoverflow.com/questions/31087915/are-there-denormalized-floats-that-evaluate-to-the-same-value-apart-from-0-0
-      # bugfix: the previous code was using `x = x + 1.0` (presumably for
-      # handling negative 0), however this doesn't work well for small inputs
-      # because `x+1.0` can become 0 with floating point accuracy, which
-      # leads to hash collisions.
-      # Note: this hit this bug: #11775:
-      # `let x = if x == 0.0: 0.0 else: x`
-      var x = x
-      if x == 0: x = 0
-    hashData(cast[pointer](unsafeAddr x), T.sizeof)
-  else:
-    # more efficient for small types
-    ord(x)
-
 proc `!&`*(h: Hash, val: int): Hash {.inline.} =
   ## Mixes a hash value `h` with `val` to produce a new hash value.
   ##
   ## This is only needed if you need to implement a hash proc for a new datatype.
+  ## Uses Jenkins hash: https://en.wikipedia.org/wiki/Jenkins_hash_function
   let h = cast[uint](h)
   let val = cast[uint](val)
   var res = h + val
@@ -124,6 +97,45 @@ proc hashData*(data: pointer, size: int): Hash =
     dec(s)
   result = !$h
 
+proc hashBiggestIntVM(x: BiggestInt): Hash = discard # in vmops
+
+proc hashBiggestInt*(x: BiggestInt): Hash {.inline.} =
+  # ## for internal use; user code should prefer `hash` overloads
+  toU32(x)
+  # when nimvm: hashBiggestIntVM(x)
+  # else: hashData(cast[pointer](unsafeAddr x), type(x).sizeof)
+
+proc hash*[T: SomeNumber | Ordinal | char](x: T): Hash {.inline.} =
+  # when T is int: if true: return toU32(x)
+  ## Efficient hashing of numbers, ordinals (eg enum), char.
+  when T.sizeof >= 4:
+    # fix #11764: `ord(x)`, `toU32(x)` or similar are up to 4X faster to compute
+    # than jenkins `hashData` but result in very poor hashes, leading to
+    # collisions; this can cause slowdowns of several orders of magnitude (e.g.
+    # 1e3) when used in hash tables, so we prefer good hashes here even though
+    # they are slower to compute. Murmur3 would improve hash computation speed.
+    when T is SomeFloat:
+      # 0.0 vs -0.0 should map to same hash to avoid weird behavior.
+      # the only non nan value that can cause clash is 0 according to
+      # https://stackoverflow.com/questions/31087915/are-there-denormalized-floats-that-evaluate-to-the-same-value-apart-from-0-0
+      # bugfix: the previous code was using `x = x + 1.0` (presumably for
+      # handling negative 0), however this leads to collisions for small x due
+      # to finite floating-point precision.
+      let x: BiggestInt =
+        if x == 0: 0.BiggestInt
+        else:
+          when sizeof(BiggestInt) == sizeof(T):
+            cast[BiggestInt](x)
+          else: # for nimvm
+            cast[int32](x).BiggestInt
+    else:
+      let x = x.BiggestInt
+    hashBiggestInt(x)
+  else:
+    # empirically better for small types, the collision risk is limited anyway
+    # due to cardinality of at most 2^16=65536
+    ord(x)
+
 when defined(js):
   var objectID = 0
 

diff --git a/tests/collections/ttables.nim b/tests/collections/ttables.nim
@@ -165,9 +165,10 @@ block tableconstr:
 block ttables2:
   proc TestHashIntInt() =
     var tab = initTable[int,int]()
-    for i in 1..1_000_000:
+    const n = 1_000_000 # bottleneck: 50 seconds on OSX in debug mode
+    for i in 1..n:
       tab[i] = i
-    for i in 1..1_000_000:
+    for i in 1..n:
       var x = tab[i]
       if x != i : echo "not found ", i
 
@@ -233,7 +234,7 @@ block tablesref:
       for y in 0..1:
         assert t[(x,y)] == $x & $y
     assert($t ==
-      "{(x: 0, y: 1): \"01\", (x: 0, y: 0): \"00\", (x: 1, y: 0): \"10\", (x: 1, y: 1): \"11\"}")
+      """{(x: 0, y: 1): "01", (x: 1, y: 0): "10", (x: 0, y: 0): "00", (x: 1, y: 1): "11"}""")
 
   block tableTest2:
     var t = newTable[string, float]()

diff --git a/tests/collections/ttablesthreads.nim b/tests/collections/ttablesthreads.nim
@@ -48,7 +48,7 @@ block tableTest1:
     for y in 0..1:
       assert t[(x,y)] == $x & $y
   assert($t ==
-    "{(x: 0, y: 1): \"01\", (x: 0, y: 0): \"00\", (x: 1, y: 0): \"10\", (x: 1, y: 1): \"11\"}")
+    """{(x: 0, y: 0): "00", (x: 1, y: 0): "10", (x: 0, y: 1): "01", (x: 1, y: 1): "11"}""")
 
 block tableTest2:
   var t = initTable[string, float]()

diff --git a/tests/vm/tcompiletimetable.nim b/tests/vm/tcompiletimetable.nim
@@ -47,3 +47,29 @@ addStuff("Hey"): echo "Hey"
 addStuff("Hi"): echo "Hi"
 dump()
 
+import std/hashes
+block:
+  # check CT vs RT produces same results for Table
+  template callFun(T) =
+    block:
+      proc fun(): string =
+        var t: Table[T, string]
+        let n = 10
+        for i in 0..<n:
+          let i2 = when T.sizeof == type(i).sizeof: i else: i.int32
+          let k = cast[T](i2)
+            # cast intentional for regression testing,
+            # producing small values
+          doAssert k notin t
+          t[k] = $(i, k)
+          doAssert k in t
+        $t
+      const s1 = fun()
+      let s2 = fun()
+      # echo s1 # for debugging
+      doAssert s1 == s2
+      doAssert s1 == s2
+      doAssert hash(0.0) == hash(-0.0)
+  callFun(float)
+  callFun(float32)
+  callFun(int64)