-
Notifications
You must be signed in to change notification settings - Fork 33
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Implement adaptive BloomFilter algorithm (#251)
* Add basic adaptive BF and factory Signed-off-by: Chen Dai <daichen@amazon.com> * Refactor BloomFilter factory Signed-off-by: Chen Dai <daichen@amazon.com> * Add number of candidate param Signed-off-by: Chen Dai <daichen@amazon.com> * Add BloomFilter candidate class Signed-off-by: Chen Dai <daichen@amazon.com> * Change default NDV to 1024 Signed-off-by: Chen Dai <daichen@amazon.com> * Fix broken IT Signed-off-by: Chen Dai <daichen@amazon.com> * Add UT for bloom filter factory Signed-off-by: Chen Dai <daichen@amazon.com> * Refactor adaptive BF and UT Signed-off-by: Chen Dai <daichen@amazon.com> * Add comment and more UT for adaptive BloomFilter Signed-off-by: Chen Dai <daichen@amazon.com> --------- Signed-off-by: Chen Dai <daichen@amazon.com>
- Loading branch information
Showing
8 changed files
with
616 additions
and
52 deletions.
There are no files selected for viewing
154 changes: 154 additions & 0 deletions
154
flint-core/src/main/java/org/opensearch/flint/core/field/bloomfilter/BloomFilterFactory.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,154 @@ | ||
/* | ||
* Copyright OpenSearch Contributors | ||
* SPDX-License-Identifier: Apache-2.0 | ||
*/ | ||
|
||
package org.opensearch.flint.core.field.bloomfilter; | ||
|
||
import java.io.InputStream; | ||
import java.io.Serializable; | ||
import java.util.Map; | ||
import java.util.Optional; | ||
import org.opensearch.flint.core.field.bloomfilter.adaptive.AdaptiveBloomFilter; | ||
import org.opensearch.flint.core.field.bloomfilter.classic.ClassicBloomFilter; | ||
|
||
/** | ||
* Bloom filter factory that builds bloom filter based on algorithm parameters. | ||
*/ | ||
public abstract class BloomFilterFactory implements Serializable { | ||
|
||
/** | ||
* Bloom filter adaptive key and default value. | ||
*/ | ||
public static final String BLOOM_FILTER_ADAPTIVE_KEY = "adaptive"; | ||
public static final boolean DEFAULT_BLOOM_FILTER_ADAPTIVE = true; | ||
|
||
/** | ||
* Expected number of unique items key and default value. | ||
*/ | ||
public static final String CLASSIC_BLOOM_FILTER_NUM_ITEMS_KEY = "num_items"; | ||
public static final int DEFAULT_CLASSIC_BLOOM_FILTER_NUM_ITEMS = 10000; | ||
|
||
/** | ||
* False positive probability (FPP) key and default value. | ||
*/ | ||
public static final String CLASSIC_BLOOM_FILTER_FPP_KEY = "fpp"; | ||
public static final double DEFAULT_CLASSIC_BLOOM_FILTER_FPP = 0.03; | ||
|
||
/** | ||
* Number of candidate key and default value. | ||
*/ | ||
public static final String ADAPTIVE_NUMBER_CANDIDATE_KEY = "num_candidates"; | ||
public static final int DEFAULT_ADAPTIVE_NUMBER_CANDIDATE = 10; | ||
|
||
/** | ||
* Bloom filter algorithm parameters. | ||
*/ | ||
private final Map<String, String> parameters; | ||
|
||
protected BloomFilterFactory(Map<String, String> parameters) { | ||
this.parameters = parameters; | ||
} | ||
|
||
/** | ||
* @return all parameters including the default ones. | ||
*/ | ||
public abstract Map<String, String> getParameters(); | ||
|
||
/** | ||
* Create specific BloomFilter instance. | ||
* | ||
* @return BloomFilter instance | ||
*/ | ||
public abstract BloomFilter create(); | ||
|
||
/** | ||
* Create specific BloomFilter instance by deserialization. | ||
* | ||
* @param in input stream | ||
* @return BloomFilter instance | ||
*/ | ||
public abstract BloomFilter deserialize(InputStream in); | ||
|
||
/** | ||
* Create specific BloomFilter factory given the parameters. | ||
* | ||
* @param parameters BloomFilter parameters | ||
* @return BloomFilter factory instance | ||
*/ | ||
public static BloomFilterFactory of(Map<String, String> parameters) { | ||
if (isAdaptiveEnabled(parameters)) { | ||
return createAdaptiveBloomFilterFactory(parameters); | ||
} else { | ||
return createClassicBloomFilterFactory(parameters); | ||
} | ||
} | ||
|
||
private static BloomFilterFactory createAdaptiveBloomFilterFactory(Map<String, String> parameters) { | ||
return new BloomFilterFactory(parameters) { | ||
@Override | ||
public Map<String, String> getParameters() { | ||
return Map.of( | ||
BLOOM_FILTER_ADAPTIVE_KEY, "true", | ||
ADAPTIVE_NUMBER_CANDIDATE_KEY, Integer.toString(numCandidates()), | ||
CLASSIC_BLOOM_FILTER_FPP_KEY, Double.toString(fpp())); | ||
} | ||
|
||
@Override | ||
public BloomFilter create() { | ||
return new AdaptiveBloomFilter(numCandidates(), fpp()); | ||
} | ||
|
||
@Override | ||
public BloomFilter deserialize(InputStream in) { | ||
return AdaptiveBloomFilter.readFrom(numCandidates(), in); | ||
} | ||
}; | ||
} | ||
|
||
private static BloomFilterFactory createClassicBloomFilterFactory(Map<String, String> parameters) { | ||
return new BloomFilterFactory(parameters) { | ||
@Override | ||
public Map<String, String> getParameters() { | ||
return Map.of( | ||
BLOOM_FILTER_ADAPTIVE_KEY, "false", | ||
CLASSIC_BLOOM_FILTER_NUM_ITEMS_KEY, Integer.toString(expectedNumItems()), | ||
CLASSIC_BLOOM_FILTER_FPP_KEY, Double.toString(fpp())); | ||
} | ||
|
||
@Override | ||
public BloomFilter create() { | ||
return new ClassicBloomFilter(expectedNumItems(), fpp()); | ||
} | ||
|
||
@Override | ||
public BloomFilter deserialize(InputStream in) { | ||
return ClassicBloomFilter.readFrom(in); | ||
} | ||
}; | ||
} | ||
|
||
private static boolean isAdaptiveEnabled(Map<String, String> params) { | ||
return Optional.ofNullable(params.get(BLOOM_FILTER_ADAPTIVE_KEY)) | ||
.map(Boolean::parseBoolean) | ||
.orElse(DEFAULT_BLOOM_FILTER_ADAPTIVE); | ||
} | ||
|
||
protected int expectedNumItems() { | ||
return Optional.ofNullable(parameters.get(CLASSIC_BLOOM_FILTER_NUM_ITEMS_KEY)) | ||
.map(Integer::parseInt) | ||
.orElse(DEFAULT_CLASSIC_BLOOM_FILTER_NUM_ITEMS); | ||
} | ||
|
||
protected double fpp() { | ||
return Optional.ofNullable(parameters.get(CLASSIC_BLOOM_FILTER_FPP_KEY)) | ||
.map(Double::parseDouble) | ||
.orElse(DEFAULT_CLASSIC_BLOOM_FILTER_FPP); | ||
} | ||
|
||
protected int numCandidates() { | ||
return Optional.ofNullable(parameters.get(ADAPTIVE_NUMBER_CANDIDATE_KEY)) | ||
.map(Integer::parseInt) | ||
.orElse(DEFAULT_ADAPTIVE_NUMBER_CANDIDATE); | ||
} | ||
} |
230 changes: 230 additions & 0 deletions
230
...c/main/java/org/opensearch/flint/core/field/bloomfilter/adaptive/AdaptiveBloomFilter.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,230 @@ | ||
/* | ||
* Copyright OpenSearch Contributors | ||
* SPDX-License-Identifier: Apache-2.0 | ||
*/ | ||
|
||
package org.opensearch.flint.core.field.bloomfilter.adaptive; | ||
|
||
import java.io.DataInputStream; | ||
import java.io.DataOutputStream; | ||
import java.io.IOException; | ||
import java.io.InputStream; | ||
import java.io.OutputStream; | ||
import java.util.Arrays; | ||
import java.util.Iterator; | ||
import java.util.Objects; | ||
import java.util.function.Function; | ||
import org.opensearch.flint.core.field.bloomfilter.BloomFilter; | ||
import org.opensearch.flint.core.field.bloomfilter.classic.ClassicBloomFilter; | ||
|
||
/** | ||
* Adaptive BloomFilter implementation that generates a series of bloom filter candidate | ||
* with different expected number of item (NDV) and at last choose the best one. | ||
*/ | ||
public class AdaptiveBloomFilter implements BloomFilter { | ||
|
||
/** | ||
* Initial expected number of items for the first candidate. | ||
*/ | ||
public static final int INITIAL_EXPECTED_NUM_ITEMS = 1024; | ||
|
||
/** | ||
* Total number of distinct items seen so far. | ||
*/ | ||
private int cardinality = 0; | ||
|
||
/** | ||
* BloomFilter candidates. | ||
*/ | ||
private final BloomFilterCandidate[] candidates; | ||
|
||
/** | ||
* Construct adaptive BloomFilter instance with the given algorithm parameters. | ||
* | ||
* @param numCandidates number of candidate | ||
* @param fpp false positive probability | ||
*/ | ||
public AdaptiveBloomFilter(int numCandidates, double fpp) { | ||
this.candidates = initializeCandidates(numCandidates, | ||
expectedNumItems -> new ClassicBloomFilter(expectedNumItems, fpp)); | ||
} | ||
|
||
/** | ||
* Construct adaptive BloomFilter instance from BloomFilter array deserialized from input stream. | ||
* | ||
* @param cardinality total number of distinct items | ||
* @param candidates BloomFilter candidates | ||
*/ | ||
AdaptiveBloomFilter(int cardinality, BloomFilter[] candidates) { | ||
this.cardinality = cardinality; | ||
Iterator<BloomFilter> it = Arrays.stream(candidates).iterator(); | ||
this.candidates = initializeCandidates(candidates.length, expectedNumItems -> it.next()); | ||
} | ||
|
||
/** | ||
* Deserialize adaptive BloomFilter instance from input stream. | ||
* | ||
* @param numCandidates number of candidates | ||
* @param in input stream of serialized adaptive BloomFilter instance | ||
* @return adaptive BloomFilter instance | ||
*/ | ||
public static BloomFilter readFrom(int numCandidates, InputStream in) { | ||
try { | ||
// Read total distinct counter | ||
int cardinality = new DataInputStream(in).readInt(); | ||
|
||
// Read BloomFilter candidate array | ||
BloomFilter[] candidates = new BloomFilter[numCandidates]; | ||
for (int i = 0; i < numCandidates; i++) { | ||
candidates[i] = ClassicBloomFilter.readFrom(in); | ||
} | ||
return new AdaptiveBloomFilter(cardinality, candidates); | ||
} catch (IOException e) { | ||
throw new RuntimeException("Failed to deserialize adaptive BloomFilter", e); | ||
} | ||
} | ||
|
||
/** | ||
* @return best BloomFilter candidate which has expected number of item right above total distinct counter. | ||
*/ | ||
public BloomFilterCandidate bestCandidate() { | ||
return candidates[bestCandidateIndex()]; | ||
} | ||
|
||
@Override | ||
public long bitSize() { | ||
return Arrays.stream(candidates) | ||
.mapToLong(c -> c.bloomFilter.bitSize()) | ||
.sum(); | ||
} | ||
|
||
@Override | ||
public boolean put(long item) { | ||
// Only insert into candidate with larger expectedNumItems for efficiency | ||
boolean bitChanged = false; | ||
for (int i = bestCandidateIndex(); i < candidates.length; i++) { | ||
bitChanged = candidates[i].bloomFilter.put(item); | ||
} | ||
|
||
// Use the last candidate's put result which is the most accurate | ||
if (bitChanged) { | ||
cardinality++; | ||
} | ||
return bitChanged; | ||
} | ||
|
||
@Override | ||
public BloomFilter merge(BloomFilter other) { | ||
AdaptiveBloomFilter otherBf = (AdaptiveBloomFilter) other; | ||
cardinality += otherBf.cardinality; | ||
|
||
for (int i = bestCandidateIndex(); i < candidates.length; i++) { | ||
candidates[i].bloomFilter.merge(otherBf.candidates[i].bloomFilter); | ||
} | ||
return this; | ||
} | ||
|
||
@Override | ||
public boolean mightContain(long item) { | ||
// Use the last candidate which is the most accurate | ||
return candidates[candidates.length - 1].bloomFilter.mightContain(item); | ||
} | ||
|
||
@Override | ||
public void writeTo(OutputStream out) throws IOException { | ||
// Serialized cardinality counter first | ||
new DataOutputStream(out).writeInt(cardinality); | ||
|
||
// Serialize classic BloomFilter array | ||
for (BloomFilterCandidate candidate : candidates) { | ||
candidate.bloomFilter.writeTo(out); | ||
} | ||
} | ||
|
||
private BloomFilterCandidate[] initializeCandidates(int numCandidates, | ||
Function<Integer, BloomFilter> initializer) { | ||
BloomFilterCandidate[] candidates = new BloomFilterCandidate[numCandidates]; | ||
int ndv = INITIAL_EXPECTED_NUM_ITEMS; | ||
|
||
// Initialize candidate with NDV doubled in each iteration | ||
for (int i = 0; i < numCandidates; i++, ndv *= 2) { | ||
candidates[i] = new BloomFilterCandidate(ndv, initializer.apply(ndv)); | ||
} | ||
return candidates; | ||
} | ||
|
||
private int bestCandidateIndex() { | ||
int index = Arrays.binarySearch(candidates, new BloomFilterCandidate(cardinality, null)); | ||
if (index < 0) { | ||
index = -(index + 1); | ||
} | ||
|
||
/* | ||
* Now 'index' represents the position where the current cardinality should be inserted, | ||
* indicating the best candidate to choose based on its expected number of distinct values. | ||
* The last one is chosen if cardinality exceeds each candidate's expected number. | ||
*/ | ||
return Math.min(index, candidates.length - 1); | ||
} | ||
|
||
/** | ||
* BloomFilter candidate that records expected number of items for each candidate. | ||
*/ | ||
public static class BloomFilterCandidate implements Comparable<BloomFilterCandidate> { | ||
/** | ||
* Expected number of items associated with this candidate. | ||
*/ | ||
private final int expectedNumItems; | ||
|
||
/** | ||
* BloomFilter instance. | ||
*/ | ||
private final BloomFilter bloomFilter; | ||
|
||
BloomFilterCandidate(int expectedNumItems, BloomFilter bloomFilter) { | ||
this.expectedNumItems = expectedNumItems; | ||
this.bloomFilter = bloomFilter; | ||
} | ||
|
||
public int getExpectedNumItems() { | ||
return expectedNumItems; | ||
} | ||
|
||
public BloomFilter getBloomFilter() { | ||
return bloomFilter; | ||
} | ||
|
||
@Override | ||
public int compareTo(BloomFilterCandidate other) { | ||
return Integer.compare(expectedNumItems, other.expectedNumItems); | ||
} | ||
|
||
@Override | ||
public boolean equals(Object o) { | ||
if (this == o) return true; | ||
if (o == null || getClass() != o.getClass()) return false; | ||
BloomFilterCandidate that = (BloomFilterCandidate) o; | ||
return expectedNumItems == that.expectedNumItems && Objects.equals(bloomFilter, that.bloomFilter); | ||
} | ||
|
||
@Override | ||
public int hashCode() { | ||
return Objects.hash(expectedNumItems, bloomFilter); | ||
} | ||
} | ||
|
||
@Override | ||
public boolean equals(Object o) { | ||
if (this == o) return true; | ||
if (o == null || getClass() != o.getClass()) return false; | ||
AdaptiveBloomFilter that = (AdaptiveBloomFilter) o; | ||
return cardinality == that.cardinality && Arrays.equals(candidates, that.candidates); | ||
} | ||
|
||
@Override | ||
public int hashCode() { | ||
int result = Objects.hash(cardinality); | ||
result = 31 * result + Arrays.hashCode(candidates); | ||
return result; | ||
} | ||
} |
Oops, something went wrong.