-
Notifications
You must be signed in to change notification settings - Fork 0
/
count.go
64 lines (56 loc) · 1.39 KB
/
count.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
package main
import (
"io"
"math/rand/v2"
"unique"
)
func countExactDistinctWords(r io.Reader) (int, int) {
count := 0
words := make(map[string]bool)
for w := range Words(r) {
words[w] = true
count++
}
return count, len(words)
}
func countExactDistinctWordsInterned(r io.Reader) (int, int) {
count := 0
words := make(map[unique.Handle[string]]bool)
for w := range Words(r) {
words[unique.Make(w)] = true
count++
}
return count, len(words)
}
// countApproxDistinctWords implements CVM algorithm
// See https://www.quantamagazine.org/computer-scientists-invent-an-efficient-new-way-to-count-20240516/
func countApproxDistinctWords(r io.Reader, memSize int) (int, int, int) {
count := 0
words := make(map[string]bool, memSize)
currentRound := 0
for w := range Words(r) {
count++
// fmt.Println(w, currentRound, len(words))
if rand.Uint64N(1<<currentRound) > 0 {
// randomly clear memory
// keeping/storing words becomes more and more difficult as rounds go by (two times harder by round)
delete(words, w)
} else {
words[w] = true
}
if len(words) >= memSize {
// memory full: cleanup half of it and move to next round
cleanup(words)
currentRound++
}
}
return count, len(words) << currentRound, currentRound + 1
}
// cleanup randomly clears half of the keys
func cleanup(m map[string]bool) {
for k := range m {
if rand.IntN(2) == 0 {
delete(m, k)
}
}
}