Reimplementations of immutable HashSet and HashMap.
The reimplementations are based upon Compressed Hash-Array Mapped Prefix-trees (CHAMP); see the paper "Optimizing Hash-Array Mapped Tries for Fast and Lean Immutable JVM Collections" by Steindorfer and Vinju (OOPSLA '15) for details and for descriptions of the low-level performance optimizations (a pre-print of the paper is available at https://michael.steindorfer.name/publications/oopsla15.pdf). This commit closes #192.
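
For readers unfamiliar with CHAMP, the following is a minimal sketch of the node layout the paper describes: each node carries two 32-bit bitmaps, with payload elements compacted at the front of a single array and sub-nodes at the back. The names (BitmapNode, dataMap, nodeMap) and the code itself are illustrative only and are not the API introduced by this commit.

// Minimal CHAMP lookup sketch (illustrative, not this commit's code).
// Each 5-bit hash prefix maps to one bit position: set in dataMap means
// the slot holds a payload, set in nodeMap means it holds a sub-node.
final class BitmapNode[A](dataMap: Int, nodeMap: Int, content: Array[Any]) {
  def contains(elem: A, hash: Int, shift: Int): Boolean = {
    val bit = 1 << ((hash >>> shift) & 0x1f)
    if ((dataMap & bit) != 0) {
      // compressed index: count the payload bits below `bit`
      val idx = Integer.bitCount(dataMap & (bit - 1))
      content(idx) == elem
    } else if ((nodeMap & bit) != 0) {
      // sub-nodes are stored right-to-left at the end of the array
      val idx = content.length - 1 - Integer.bitCount(nodeMap & (bit - 1))
      content(idx).asInstanceOf[BitmapNode[A]].contains(elem, hash, shift + 5)
    } else false
  }
}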

The new implementations (i.e., ChampHashSet and ChampHashMap) currently exist alongside the previous HashMap and HashSet. By default, immutable.Map and immutable.Set now pick up the CHAMP data structures. A JVM flag (-Dstrawman.collection.immutable.useBaseline=true) allows switching back to the previous HashSet and HashMap implementations for testing. Note that the flag and the previous HashSet and HashMap implementations will be removed in the final version of collection-strawman; for the time being they remain in place to support comparing the trade-offs and performance characteristics of the current and the new data structures.
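
The switch is an ordinary JVM system property. As a usage sketch (the property name comes from this commit; the demo object and the dispatch shown are illustrative, not the library's actual mechanism):

// Illustrative demo: reading the baseline switch as a JVM system property.
object UseBaselineDemo {
  def main(args: Array[String]): Unit = {
    val useBaseline =
      java.lang.Boolean.getBoolean("strawman.collection.immutable.useBaseline")
    println(if (useBaseline) "baseline HashSet/HashMap" else "CHAMP data structures")
  }
}
// Run with: java -Dstrawman.collection.immutable.useBaseline=true UseBaselineDemo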

Preliminary performance numbers for the new CHAMP data structures were presented in issue #192. Overall, the CHAMP data structures significantly lower memory footprints and significantly improve all iteration-based operations and equality checks, while basic operations such as lookup, insertion, and deletion may slow down. The current state of the reimplementation does not yet optimize for hash collisions.

Note that the CHAMP design/implementation differs from the previous immutable hashed data structures in that it does not memoize the hash codes of the individual elements (which may change the performance of certain workloads). If necessary, CHAMP's design allows memoized hash codes of the individual elements to be added modularly, at the expense of some of the memory savings. Details are discussed in the paper mentioned above.
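
A sketch of what such modular memoization could look like, assuming it amounts to caching each element's hash code next to the element; the Memoized wrapper is hypothetical and not part of this commit:

// Hypothetical wrapper: trades one extra Int per element for never
// re-computing elem.## during lookup, iteration, or equality checks.
final class Memoized[A](val elem: A, val cachedHash: Int) {
  override def hashCode: Int = cachedHash
  override def equals(that: Any): Boolean = that match {
    case m: Memoized[_] => cachedHash == m.cachedHash && elem == m.elem
    case _              => false
  }
}
object Memoized {
  def apply[A](elem: A): Memoized[A] = new Memoized(elem, elem.##)
}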
msteindorfer committed Feb 3, 2018
1 parent 2eb7f23 commit 566b143
Showing 15 changed files with 2,429 additions and 20 deletions.
@@ -0,0 +1,167 @@
package strawman.collection.immutable

import java.util.concurrent.TimeUnit

import org.openjdk.jmh.annotations._
import org.openjdk.jmh.infra.Blackhole

@BenchmarkMode(scala.Array(Mode.AverageTime))
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@Fork(1)
@Warmup(iterations = 8)
@Measurement(iterations = 8)
@State(Scope.Benchmark)
class ChampHashSetBenchmark {
  @Param(scala.Array("0", "1", "2", "3", "4", "7", "8", "15", "16", "17", "39", "282", "4096", "131070", "7312102"))
  var size: Int = _

  var xs: ChampHashSet[Long] = _
  var ys: ChampHashSet[Long] = _
  var zs: ChampHashSet[Long] = _
  var zipped: ChampHashSet[(Long, Long)] = _
  var randomIndices: scala.Array[Int] = _
  def fresh(n: Int) = ChampHashSet((1 to n).map(_.toLong): _*)

  @Setup(Level.Trial)
  def initTrial(): Unit = {
    xs = fresh(size)
    ys = fresh(size)
    zs = fresh((size / 1000) max 2).map(-_)
    zipped = xs.map(x => (x, x))
    if (size > 0) {
      randomIndices = scala.Array.fill(1000)(scala.util.Random.nextInt(size))
    }
  }

  @Benchmark
  def create(bh: Blackhole): Unit = bh.consume(fresh(size))

  @Benchmark
  @OperationsPerInvocation(1000)
  def expand_incl(bh: Blackhole): Unit = {
    var ys = xs
    var i = 0L
    while (i < 1000) {
      ys += -i
      i += 1
    }
    bh.consume(ys)
  }

  @Benchmark
  def expand_concat(bh: Blackhole): Unit = bh.consume(xs ++ zs)

  @Benchmark
  def traverse_foreach(bh: Blackhole): Unit = xs.foreach(x => bh.consume(x))

  @Benchmark
  def traverse_headTail(bh: Blackhole): Unit = {
    var ys = xs
    while (ys.nonEmpty) {
      bh.consume(ys.head)
      ys = ys.tail
    }
  }

  @Benchmark
  def traverse_initLast(bh: Blackhole): Unit = {
    var ys = xs
    while (ys.nonEmpty) {
      bh.consume(ys.last)
      ys = ys.init
    }
  }

  @Benchmark
  def traverse_iterator(bh: Blackhole): Unit = {
    val it = xs.iterator()
    while (it.hasNext) {
      bh.consume(it.next())
    }
  }

  @Benchmark
  def traverse_foldLeft(bh: Blackhole): Unit = bh.consume(xs.foldLeft(0) {
    case (acc, n) =>
      bh.consume(n)
      acc + 1
  })

  @Benchmark
  def traverse_foldRight(bh: Blackhole): Unit = bh.consume(xs.foldRight(0) {
    case (n, acc) =>
      bh.consume(n)
      acc - 1
  })

  @Benchmark
  def access_tail(bh: Blackhole): Unit = bh.consume(xs.tail)

  @Benchmark
  def access_init(bh: Blackhole): Unit = bh.consume(xs.init)

  @Benchmark
  @OperationsPerInvocation(100)
  def access_slice(bh: Blackhole): Unit = {
    var i = 0
    while (i < 100) {
      bh.consume(xs.slice(size - size / (i + 1), size))
      i += 1
    }
  }

  @Benchmark
  @OperationsPerInvocation(1000)
  def access_contains(bh: Blackhole): Unit = {
    var i = 0
    while (i < 1000) {
      bh.consume(xs.contains(i))
      i += 1
    }
  }

  @Benchmark
  def transform_map(bh: Blackhole): Unit = bh.consume(xs.map(x => x + 1))

  @Benchmark
  @OperationsPerInvocation(100)
  def transform_span(bh: Blackhole): Unit = {
    var i = 0
    while (i < 100) {
      val (xs1, xs2) = xs.span(x => x < randomIndices(i))
      bh.consume(xs1)
      bh.consume(xs2)
      i += 1
    }
  }

  @Benchmark
  def transform_zip(bh: Blackhole): Unit = bh.consume(xs.zip(xs))

  @Benchmark
  def transform_zipMapTupled(bh: Blackhole): Unit = {
    val f = (a: Long, b: Long) => (a, b)
    bh.consume(xs.zip(xs).map(f.tupled))
  }

  @Benchmark
  def transform_zipWithIndex(bh: Blackhole): Unit = bh.consume(xs.zipWithIndex)

  @Benchmark
  def transform_lazyZip(bh: Blackhole): Unit = bh.consume(xs.lazyZip(xs).map((_, _)))

  @Benchmark
  def transform_unzip(bh: Blackhole): Unit = bh.consume(zipped.unzip)

  @Benchmark
  def transform_groupBy(bh: Blackhole): Unit = {
    val result = xs.groupBy(_ % 5)
    bh.consume(result)
  }

  @Benchmark
  def traverse_subsetOf(bh: Blackhole): Unit = bh.consume(ys.subsetOf(xs))

  @Benchmark
  def traverse_equals(bh: Blackhole): Unit = bh.consume(xs == ys)

}
@@ -19,6 +19,7 @@ class HashSetBenchmark {
  var size: Int = _

  var xs: HashSet[Long] = _
  var ys: HashSet[Long] = _
  var zs: HashSet[Long] = _
  var zipped: HashSet[(Long, Long)] = _
  var randomIndices: scala.Array[Int] = _
@@ -27,6 +28,7 @@ class HashSetBenchmark {
  @Setup(Level.Trial)
  def initTrial(): Unit = {
    xs = fresh(size)
    ys = fresh(size)
    zs = fresh((size / 1000) max 2).map(-_)
    zipped = xs.map(x => (x, x))
    if (size > 0) {
@@ -64,14 +66,15 @@ class HashSetBenchmark {
    }
  }

  @Benchmark
  def traverse_initLast(bh: Blackhole): Unit = {
    var ys = xs
    while (ys.nonEmpty) {
      bh.consume(ys.last)
      ys = ys.init
    }
  }
  // // TODO: currently disabled, since it does not finish
  // @Benchmark
  // def traverse_initLast(bh: Blackhole): Unit = {
  // var ys = xs
  // while (ys.nonEmpty) {
  // bh.consume(ys.last)
  // ys = ys.init
  // }
  // }

  @Benchmark
  def traverse_iterator(bh: Blackhole): Unit = {
@@ -158,4 +161,11 @@ class HashSetBenchmark {
    val result = xs.groupBy(_ % 5)
    bh.consume(result)
  }

  @Benchmark
  def traverse_subsetOf(bh: Blackhole): Unit = bh.consume(ys.subsetOf(xs))

  @Benchmark
  def traverse_equals(bh: Blackhole): Unit = bh.consume(xs == ys)

}
@@ -19,6 +19,7 @@ class ScalaHashSetBenchmark {
  var size: Int = _

  var xs: scala.collection.immutable.HashSet[Long] = _
  var ys: scala.collection.immutable.HashSet[Long] = _
  var zs: scala.collection.immutable.HashSet[Long] = _
  var zipped: scala.collection.immutable.HashSet[(Long, Long)] = _
  var randomIndices: scala.Array[Int] = _
@@ -27,6 +28,7 @@ class ScalaHashSetBenchmark {
  @Setup(Level.Trial)
  def initTrial(): Unit = {
    xs = fresh(size)
    ys = fresh(size)
    zs = fresh((size / 1000) max 2).map(-_)
    zipped = xs.map(x => (x, x))
    if (size > 0) {
@@ -64,14 +66,15 @@ class ScalaHashSetBenchmark {
    }
  }

  @Benchmark
  def traverse_initLast(bh: Blackhole): Unit = {
    var ys = xs
    while (ys.nonEmpty) {
      bh.consume(ys.last)
      ys = ys.init
    }
  }
  // // TODO: currently disabled, since it does not finish
  // @Benchmark
  // def traverse_initLast(bh: Blackhole): Unit = {
  // var ys = xs
  // while (ys.nonEmpty) {
  // bh.consume(ys.last)
  // ys = ys.init
  // }
  // }

  @Benchmark
  def traverse_iterator(bh: Blackhole): Unit = {
@@ -158,4 +161,11 @@ class ScalaHashSetBenchmark {
    val result = xs.groupBy(_ % 5)
    bh.consume(result)
  }

  @Benchmark
  def traverse_subsetOf(bh: Blackhole): Unit = bh.consume(ys.subsetOf(xs))

  @Benchmark
  def traverse_equals(bh: Blackhole): Unit = bh.consume(xs == ys)

}
1 change: 1 addition & 0 deletions build.sbt
@@ -39,6 +39,7 @@ val commonSettings = Seq(
      <developer><id>odersky</id><name>Martin Odersky</name></developer>
      <developer><id>julienrf</id><name>Julien Richard-Foy</name></developer>
      <developer><id>szeiger</id><name>Stefan Zeiger</name></developer>
      <developer><id>msteindorfer</id><name>Michael J. Steindorfer</name></developer>
    </developers>,
  // For publishing snapshots
  credentials ++= (
