From 04bc3a423c8ea9e7c945b15dffb83d674bab3a68 Mon Sep 17 00:00:00 2001
From: Owen Diehl
Date: Mon, 1 Jul 2024 16:13:48 -0700
Subject: [PATCH] fix(blooms): ensure tokenizer cache is reset between series
 (#13370)

---
 pkg/storage/bloom/v1/bloom_tokenizer.go      |  3 ++
 pkg/storage/bloom/v1/bloom_tokenizer_test.go | 39 ++++++++++++++++++++
 2 files changed, 42 insertions(+)

diff --git a/pkg/storage/bloom/v1/bloom_tokenizer.go b/pkg/storage/bloom/v1/bloom_tokenizer.go
index 7d2ba41b7f49..f0365f7cc78d 100644
--- a/pkg/storage/bloom/v1/bloom_tokenizer.go
+++ b/pkg/storage/bloom/v1/bloom_tokenizer.go
@@ -97,11 +97,14 @@ func (bt *BloomTokenizer) newBloom() *Bloom {
 	}
 }
 
+// Populates a bloom filter(s) with the tokens from the given chunks.
+// Called once per series
 func (bt *BloomTokenizer) Populate(
 	blooms SizedIterator[*Bloom],
 	chks Iterator[ChunkRefWithIter],
 	ch chan *BloomCreation,
 ) {
+	clear(bt.cache) // MUST always clear the cache before starting a new series
 	var next bool
 
 	// All but the last bloom are considered full -- send back unaltered
diff --git a/pkg/storage/bloom/v1/bloom_tokenizer_test.go b/pkg/storage/bloom/v1/bloom_tokenizer_test.go
index 9bef1ab2ca20..0f837fbee27b 100644
--- a/pkg/storage/bloom/v1/bloom_tokenizer_test.go
+++ b/pkg/storage/bloom/v1/bloom_tokenizer_test.go
@@ -288,6 +288,45 @@ func BenchmarkPopulateSeriesWithBloom(b *testing.B) {
 	}
 }
 
+func TestTokenizerClearsCacheBetweenPopulateCalls(t *testing.T) {
+	bt := NewBloomTokenizer(DefaultNGramLength, DefaultNGramSkip, 0, NewMetrics(nil))
+	line := "foobarbazz"
+	var blooms []*Bloom
+
+	for i := 0; i < 2; i++ {
+		ch := make(chan *BloomCreation)
+		itr, err := chunkRefItrFromLines(line)
+		require.NoError(t, err)
+		go bt.Populate(
+			NewEmptyIter[*Bloom](),
+			NewSliceIter([]ChunkRefWithIter{
+				{
+					Ref: ChunkRef{},
+					Itr: itr,
+				},
+			}),
+			ch,
+		)
+		var ct int
+		for created := range ch {
+			blooms = append(blooms, created.Bloom)
+			ct++
+		}
+		// ensure we created one bloom for each call
+		require.Equal(t, 1, ct)
+
+	}
+
+	for _, bloom := range blooms {
+		toks := bt.lineTokenizer.Tokens(line)
+		for toks.Next() {
+			token := toks.At()
+			require.True(t, bloom.Test(token))
+		}
+		require.NoError(t, toks.Err())
+	}
+}
+
 func BenchmarkMapClear(b *testing.B) {
 	bt := NewBloomTokenizer(DefaultNGramLength, DefaultNGramSkip, 0, metrics)
 	for i := 0; i < b.N; i++ {
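
Why the one-line clear(bt.cache) matters: the tokenizer's cache deduplicates
tokens so the same n-gram is not re-inserted into a bloom repeatedly within a
single series. If that cache survives across Populate calls, every token
already recorded for an earlier series is silently skipped for the next one,
so later blooms are missing entries and queries against them see false
negatives. The sketch below reproduces the failure mode with hypothetical
stand-in types (tokenizer, bloom) rather than Loki's actual BloomTokenizer;
it assumes only the Go 1.21 clear builtin, which the patch itself uses.

package main

import "fmt"

// bloom stands in for a bloom filter: a simple set of tokens.
// (Hypothetical type for illustration; not Loki's *Bloom.)
type bloom map[string]bool

// tokenizer mimics the dedup cache: tokens present in the cache are
// assumed to already be in the current bloom and are skipped.
type tokenizer struct {
	cache map[string]struct{}
}

// populate adds each token to the bloom unless the cache says it was
// already added. resetCache toggles the behavior this patch fixes.
func (t *tokenizer) populate(b bloom, tokens []string, resetCache bool) {
	if resetCache {
		clear(t.cache) // the fix: never carry dedup state across series
	}
	for _, tok := range tokens {
		if _, seen := t.cache[tok]; seen {
			continue // skipped: the cache claims the bloom already holds it
		}
		t.cache[tok] = struct{}{}
		b[tok] = true
	}
}

func main() {
	tokens := []string{"foo", "oob", "oba"} // sample 3-grams of one line

	// Buggy behavior: the cache persists across two series, so the
	// second bloom never receives any tokens at all.
	t := &tokenizer{cache: map[string]struct{}{}}
	b1, b2 := bloom{}, bloom{}
	t.populate(b1, tokens, false)
	t.populate(b2, tokens, false)
	fmt.Println("without reset:", len(b1), len(b2)) // without reset: 3 0

	// Fixed behavior: clearing the cache at the start of each call, as
	// Populate now does, gives every series a complete bloom.
	t = &tokenizer{cache: map[string]struct{}{}}
	b1, b2 = bloom{}, bloom{}
	t.populate(b1, tokens, true)
	t.populate(b2, tokens, true)
	fmt.Println("with reset:   ", len(b1), len(b2)) // with reset: 3 3
}

Running the sketch prints "without reset: 3 0" versus "with reset: 3 3",
which mirrors what the new test asserts: after two Populate calls on the
same line, every token must pass bloom.Test in both resulting blooms.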