Skip to content

Commit

Permalink
Count words using interning
Browse files Browse the repository at this point in the history
To illustrate usage of the new "unique" package, store only handles in our set.

Unexpectedly, this performs very poorly in terms of both memory usage and running time.
This is probably not the right use case for the package.
  • Loading branch information
benoitmasson committed Nov 8, 2024
1 parent 17c26e3 commit 07c7482
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 0 deletions.
11 changes: 11 additions & 0 deletions count.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package main
import (
"io"
"math/rand/v2"
"unique"
)

func countExactDistinctWords(r io.Reader) (int, int) {
Expand All @@ -15,6 +16,16 @@ func countExactDistinctWords(r io.Reader) (int, int) {
return count, len(words)
}

// countExactDistinctWordsInterned counts the words produced by Words(r),
// tracking the distinct ones as interned unique.Handle[string] values rather
// than as the strings themselves.
//
// It returns the total number of words seen followed by the number of
// distinct words.
func countExactDistinctWordsInterned(r io.Reader) (int, int) {
	count := 0
	// The map is only ever used as a set, so the values are zero-width.
	words := make(map[unique.Handle[string]]struct{})
	for w := range Words(r) {
		// Equal strings yield equal handles, so repeated words share one key.
		words[unique.Make(w)] = struct{}{}
		count++
	}
	return count, len(words)
}

// countApproxDistinctWords implements the CVM algorithm.
// See https://www.quantamagazine.org/computer-scientists-invent-an-efficient-new-way-to-count-20240516/
func countApproxDistinctWords(r io.Reader, memSize int) (int, int, int) {
Expand Down
7 changes: 7 additions & 0 deletions main3.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,13 @@ func main3(inputFile string, memSize int) {
m = clearMemory(f)
fmt.Printf("[ExactCount] Used %dkB heap memory in %dms\n\n", m-m0, time.Since(start).Milliseconds())

// // interning
// start = time.Now()
// totalCount, distinctCount = countExactDistinctWordsInterned(f)
// fmt.Printf("[ExactCountInterned] Found %d distinct words (out of %d total words)\n", distinctCount, totalCount)
// m = clearMemory(f)
// fmt.Printf("[ExactCountInterned] Used %dkB heap memory in %dms\n\n", m-m0, time.Since(start).Milliseconds())

// CVM
start = time.Now()
totalCount, distinctCount, rounds = countApproxDistinctWords(f, memSize)
Expand Down

0 comments on commit 07c7482

Please sign in to comment.