diff --git a/server/build.gradle b/server/build.gradle index 5818ccc769b4b..b6ec22a3c4d5e 100644 --- a/server/build.gradle +++ b/server/build.gradle @@ -123,8 +123,6 @@ dependencies { api "org.apache.lucene:lucene-spatial-extras:${versions.lucene}" api "org.apache.lucene:lucene-spatial3d:${versions.lucene}" api "org.apache.lucene:lucene-suggest:${versions.lucene}" - api 'me.k11i:xor-filter:0.1.1' - api 'io.github.fastfilter:fastfilter_java:1.0.2' // utilities api project(":libs:opensearch-cli") diff --git a/server/src/main/java/org/opensearch/index/IndexSettings.java b/server/src/main/java/org/opensearch/index/IndexSettings.java index ae59c656d769c..b7a581c5cc4c2 100644 --- a/server/src/main/java/org/opensearch/index/IndexSettings.java +++ b/server/src/main/java/org/opensearch/index/IndexSettings.java @@ -625,7 +625,7 @@ public final class IndexSettings { ); public static final Setting DOC_ID_FUZZY_SET_TYPE_SETTING = Setting.simpleString( - "index.doc_id.fuzzy_set_type", + "index.doc_id_fuzzy_set.type", FuzzySet.SetType.BLOOM_FILTER_V1.getAliases().get(0), Property.IndexScope, Property.Dynamic @@ -673,9 +673,11 @@ public final class IndexSettings { private volatile String defaultSearchPipeline; - private volatile boolean useBloomFilterForDocIds; + private volatile boolean useFuzzySetForDocIds; - private volatile double bloomFilterForDocIdFalsePositiveProbability; + private volatile double fuzzySetForDocIdFalsePositiveProbability; + + private volatile String fuzzySetType; /** * The maximum age of a retention lease before it is considered expired. @@ -873,8 +875,9 @@ public IndexSettings(final IndexMetadata indexMetadata, final Settings nodeSetti setMergeOnFlushPolicy(scopedSettings.get(INDEX_MERGE_ON_FLUSH_POLICY)); defaultSearchPipeline = scopedSettings.get(DEFAULT_SEARCH_PIPELINE); - useBloomFilterForDocIds = scopedSettings.get(DOC_ID_FUZZY_SET_ENABLED_SETTING); - bloomFilterForDocIdFalsePositiveProbability = scopedSettings.get(DOC_ID_FUZZY_SET_FALSE_POSITIVE_PROBABILITY_SETTING); + useFuzzySetForDocIds = scopedSettings.get(DOC_ID_FUZZY_SET_ENABLED_SETTING); + fuzzySetForDocIdFalsePositiveProbability = scopedSettings.get(DOC_ID_FUZZY_SET_FALSE_POSITIVE_PROBABILITY_SETTING); + fuzzySetType = scopedSettings.get(DOC_ID_FUZZY_SET_TYPE_SETTING); scopedSettings.addSettingsUpdateConsumer(MergePolicyConfig.INDEX_COMPOUND_FORMAT_SETTING, mergePolicyConfig::setNoCFSRatio); scopedSettings.addSettingsUpdateConsumer( @@ -957,6 +960,8 @@ public IndexSettings(final IndexMetadata indexMetadata, final Settings nodeSetti this::setUseFuzzySetForDocIdSetting); scopedSettings.addSettingsUpdateConsumer(DOC_ID_FUZZY_SET_FALSE_POSITIVE_PROBABILITY_SETTING, this::setFuzzySetForDocIdFalsePositiveProbability); + scopedSettings.addSettingsUpdateConsumer(DOC_ID_FUZZY_SET_TYPE_SETTING, + this::setFuzzySetType); } private void setSearchSegmentOrderReversed(boolean reversed) { @@ -1661,12 +1666,12 @@ public void setDefaultSearchPipeline(String defaultSearchPipeline) { } public boolean useFuzzySetForDocIds() { - return useBloomFilterForDocIds; + return useFuzzySetForDocIds; } public void setUseFuzzySetForDocIdSetting(boolean useBloomFilterForDocIds) { if (FeatureFlags.isEnabled(FeatureFlags.BLOOM_FILTER_FOR_DOC_IDS)) { - this.useBloomFilterForDocIds = useBloomFilterForDocIds; + this.useFuzzySetForDocIds = useBloomFilterForDocIds; } else { throw new IllegalArgumentException("Setting cannot be updated as feature flag: " + FeatureFlags.BLOOM_FILTER_FOR_DOC_IDS + " is not enabled."); @@ -1674,12 +1679,26 @@ public void setUseFuzzySetForDocIdSetting(boolean useBloomFilterForDocIds) { } public double getFuzzySetForDocIdFalsePositiveProbability() { - return bloomFilterForDocIdFalsePositiveProbability; + return fuzzySetForDocIdFalsePositiveProbability; } public void setFuzzySetForDocIdFalsePositiveProbability(double bloomFilterForDocIdFalsePositiveProbability) { if (FeatureFlags.isEnabled(FeatureFlags.BLOOM_FILTER_FOR_DOC_IDS)) { - this.bloomFilterForDocIdFalsePositiveProbability = bloomFilterForDocIdFalsePositiveProbability; + this.fuzzySetForDocIdFalsePositiveProbability = bloomFilterForDocIdFalsePositiveProbability; + } else { + throw new IllegalArgumentException("Setting cannot be updated as feature flag: " + FeatureFlags.BLOOM_FILTER_FOR_DOC_IDS + + " is not enabled."); + } + } + + public String getFuzzySetType() { + return fuzzySetType; + } + + public void setFuzzySetType(String fuzzySetType) { + if (FeatureFlags.isEnabled(FeatureFlags.BLOOM_FILTER_FOR_DOC_IDS)) { + FuzzySet.SetType.fromAlias(fuzzySetType); + this.fuzzySetType = fuzzySetType; } else { throw new IllegalArgumentException("Setting cannot be updated as feature flag: " + FeatureFlags.BLOOM_FILTER_FOR_DOC_IDS + " is not enabled."); diff --git a/server/src/main/java/org/opensearch/index/codec/PerFieldMappingPostingFormatCodec.java b/server/src/main/java/org/opensearch/index/codec/PerFieldMappingPostingFormatCodec.java index 5cc8d6a3abcec..9e3596e4801b2 100644 --- a/server/src/main/java/org/opensearch/index/codec/PerFieldMappingPostingFormatCodec.java +++ b/server/src/main/java/org/opensearch/index/codec/PerFieldMappingPostingFormatCodec.java @@ -97,7 +97,7 @@ private PostingsFormat getFuzzyFilterPostingsFormat(String field) { if (fuzzyFilterPostingsFormat == null) { fuzzyFilterPostingsFormat = new FuzzyFilterPostingsFormat(super.getPostingsFormatForField(field), new FuzzySetFactory(Map.of( IdFieldMapper.NAME, new FuzzySetParameters(mapperService.getIndexSettings().getFuzzySetForDocIdFalsePositiveProbability(), - FuzzySet.SetType.fromAlias(mapperService.getIndexSettings().getFuzzySetForDocIdFalsePositiveProbability())) // This can be replaced with a setting/mapping type + FuzzySet.SetType.fromAlias(mapperService.getIndexSettings().getFuzzySetType())) // This can be replaced with a setting/mapping type ))); } return fuzzyFilterPostingsFormat; diff --git a/server/src/main/java/org/opensearch/index/codec/fuzzy/BloomFilter.java b/server/src/main/java/org/opensearch/index/codec/fuzzy/BloomFilter.java index 209e2e03960e2..c99422c2553c0 100644 --- a/server/src/main/java/org/opensearch/index/codec/fuzzy/BloomFilter.java +++ b/server/src/main/java/org/opensearch/index/codec/fuzzy/BloomFilter.java @@ -13,8 +13,10 @@ import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.RamUsageEstimator; +import org.opensearch.common.CheckedSupplier; import java.io.IOException; +import java.util.Iterator; import java.util.Optional; /** @@ -58,13 +60,7 @@ public class BloomFilter implements FuzzySet { this.hashCount = hashCount; } - private BloomFilter(FixedBitSet filter, int setSize, int hashCount) { - this.filter = filter; - this.setSize = setSize; - this.hashCount = hashCount; - } - - public BloomFilter(int maxDocs, double maxFpp) { + public BloomFilter(int maxDocs, double maxFpp, CheckedSupplier, IOException> fieldIteratorProvider) throws IOException { int setSize = (int) Math.ceil( @@ -76,6 +72,8 @@ public BloomFilter(int maxDocs, double maxFpp) { this.filter = new FixedBitSet(setSize + 1); this.setSize = setSize; this.hashCount = optimalK; + + addAll(fieldIteratorProvider); } static int getNearestSetSize(int maxNumberOfBits) { @@ -152,4 +150,14 @@ private boolean mayContainValue(int aHash) { int pos = aHash & setSize; return filter.get(pos); } + + public static void main(String[] args) { + double[] d = new double[]{0.0511, 0.1023, 0.2047}; + for (double fpp: d) { + int setSize = + (int) + Math.ceil((120000000 * Math.log(fpp)) / Math.log(1 / Math.pow(2, Math.log(2)))); + System.out.println(fpp + " -> " + getNearestSetSize(2 * setSize)); + } + } } diff --git a/server/src/main/java/org/opensearch/index/codec/fuzzy/CuckooFilter.java b/server/src/main/java/org/opensearch/index/codec/fuzzy/CuckooFilter.java index e296a6893c0a9..392fca5292c1c 100644 --- a/server/src/main/java/org/opensearch/index/codec/fuzzy/CuckooFilter.java +++ b/server/src/main/java/org/opensearch/index/codec/fuzzy/CuckooFilter.java @@ -8,28 +8,46 @@ package org.opensearch.index.codec.fuzzy; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.ByteArrayDataOutput; import org.apache.lucene.store.DataInput; import org.apache.lucene.store.DataOutput; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.packed.PackedInts; +import org.apache.lucene.util.packed.XPackedInts; +import org.opensearch.common.CheckedSupplier; import org.opensearch.common.hash.MurmurHash3; +import org.opensearch.common.io.stream.BytesStreamInput; +import org.opensearch.common.io.stream.BytesStreamOutput; +import org.opensearch.common.io.stream.StreamInput; +import org.opensearch.common.io.stream.StreamOutput; import java.io.IOException; +import java.util.Iterator; +import java.util.List; import java.util.Optional; import java.util.Random; public class CuckooFilter implements FuzzySet { + private static final Logger logger = LogManager.getLogger(CuckooFilter.class); + private final org.opensearch.common.util.CuckooFilter delegatingFilter; private MurmurHash3.Hash128 scratchHash = new MurmurHash3.Hash128(); - private static final Random RNG = new Random(0xFFFFF); + private final long seed; - public CuckooFilter(int maxDocs, double fpp) { - this.delegatingFilter = new org.opensearch.common.util.CuckooFilter(maxDocs, fpp, RNG); + public CuckooFilter(int maxDocs, double fpp, CheckedSupplier, IOException> fieldIteratorProvider) throws IOException { + this.seed = System.nanoTime(); + this.delegatingFilter = new org.opensearch.common.util.CuckooFilter(maxDocs, fpp, new Random(seed)); + addAll(fieldIteratorProvider); } CuckooFilter(DataInput in) throws IOException { - this.delegatingFilter = new org.opensearch.common.util.CuckooFilter(StreamWrapper.fromDataInput(in), RNG); + this.seed = in.readLong(); + this.delegatingFilter = new org.opensearch.common.util.CuckooFilter(StreamWrapper.fromDataInput(in), new Random(seed)); } @Override @@ -46,7 +64,11 @@ public Result contains(BytesRef value) { @Override public void add(BytesRef value) { MurmurHash3.Hash128 hash = MurmurHash3.hash128(value.bytes, value.offset, value.length, 0, scratchHash); - delegatingFilter.add(hash.h1); + boolean added = delegatingFilter.add(hash.h1); + if(!added) { + logger.error("Failed to insert into the cuckoo filter: " + value); + throw new IllegalStateException("Failed to insert into the cuckoo filter: " + value); + } } @Override @@ -61,6 +83,7 @@ public Optional maybeDownsize() { @Override public void writeTo(DataOutput out) throws IOException { + out.writeLong(seed); delegatingFilter.writeTo(StreamWrapper.wrap(out)); } @@ -68,4 +91,48 @@ public void writeTo(DataOutput out) throws IOException { public long ramBytesUsed() { return delegatingFilter.getSizeInBytes(); } + + @Override + public String toString() { + return "CuckooFilter{" + + "delegatingFilter=" + delegatingFilter + + ", scratchHash=" + scratchHash + + ", seed=" + seed + + '}'; + } + + public static void main(String[] args) throws IOException { +// CuckooFilter filter = new CuckooFilter(100, 0.2047, () ->List.of(new BytesRef("item1"), new BytesRef("item2"), new BytesRef("item3")).iterator()); +// System.out.println(filter.contains(new BytesRef("item1"))); +// System.out.println(filter.contains(new BytesRef("item2"))); +// System.out.println(filter.contains(new BytesRef("item3"))); +// System.out.println(filter.contains(new BytesRef("item4"))); +// System.out.println(filter.contains(new BytesRef("item5"))); +// byte[] b = new byte[1000000]; +// DataOutput output = new ByteArrayDataOutput(b); +// filter.writeTo(output); +// +// for (int i = 0; i < 100; i ++) { +// System.out.print(b[i] + " "); +// } +// System.out.println(); +// +// CuckooFilter filter2 = new CuckooFilter(new ByteArrayDataInput(b)); +// System.out.println(filter2.contains(new BytesRef("item1"))); +// System.out.println(filter2.contains(new BytesRef("item2"))); +// System.out.println(filter2.contains(new BytesRef("item3"))); +// System.out.println(filter2.contains(new BytesRef("item4"))); +// System.out.println(filter2.contains(new BytesRef("item5"))); + + XPackedInts.Mutable data = XPackedInts.getMutable(323, 32, PackedInts.COMPACT); + byte[] b = new byte[100000]; + for (int i = 0; i < 323; i ++) { + data.set(i, i); + } + data.save(new ByteArrayDataOutput(b)); + XPackedInts.Mutable data2 = (XPackedInts.Mutable)XPackedInts.getReader(new ByteArrayDataInput(b)); + for (int i = 0; i < 323; i ++) { + assert data.get(i) == data2.get(i); + } + } } diff --git a/server/src/main/java/org/opensearch/index/codec/fuzzy/FuzzyFilterPostingsFormat.java b/server/src/main/java/org/opensearch/index/codec/fuzzy/FuzzyFilterPostingsFormat.java index e3ab7a7ae7b11..d1c587028dcbe 100644 --- a/server/src/main/java/org/opensearch/index/codec/fuzzy/FuzzyFilterPostingsFormat.java +++ b/server/src/main/java/org/opensearch/index/codec/fuzzy/FuzzyFilterPostingsFormat.java @@ -350,33 +350,73 @@ public void write(Fields fields, NormsProducer norms) throws IOException { continue; } FieldInfo fieldInfo = state.fieldInfos.fieldInfo(field); - TermsEnum termsEnum = terms.iterator(); +// TermsEnum termsEnum = terms.iterator(); FuzzySet fuzzySet = null; PostingsEnum postingsEnum = null; - while (true) { - BytesRef term = termsEnum.next(); - if (term == null) { - break; - } +// while (true) { +// BytesRef term = termsEnum.next(); +// if (term == null) { +// break; +// } +// if (fuzzySet == null) { +// fuzzySet = fuzzySetFactory.createFuzzySet(state.segmentInfo.maxDoc(), fieldInfo.name); +// if (fuzzySet == null) { +// break; +// } +// assert fuzzySets.containsKey(fieldInfo) == false; +// fuzzySets.put(fieldInfo, fuzzySet); +// } +// // Make sure there's at least one doc for this term: +// postingsEnum = termsEnum.postings(postingsEnum, 0); +// if (postingsEnum.nextDoc() != PostingsEnum.NO_MORE_DOCS) { +// fuzzySet.add(term); +// } +// } + + if (fuzzySet == null) { + fuzzySet = fuzzySetFactory.createFuzzySet(state.segmentInfo.maxDoc(), fieldInfo.name, () -> iterator(terms)); if (fuzzySet == null) { - fuzzySet = fuzzySetFactory.createFuzzySet(state.segmentInfo.maxDoc(), fieldInfo.name); - if (fuzzySet == null) { - break; - } - assert fuzzySets.containsKey(fieldInfo) == false; - fuzzySets.put(fieldInfo, fuzzySet); - } - // Make sure there's at least one doc for this term: - postingsEnum = termsEnum.postings(postingsEnum, 0); - if (postingsEnum.nextDoc() != PostingsEnum.NO_MORE_DOCS) { - fuzzySet.add(term); + break; } + assert fuzzySets.containsKey(fieldInfo) == false; + fuzzySets.put(fieldInfo, fuzzySet); } } } + private Iterator iterator(Terms terms) throws IOException { + TermsEnum termIterator = terms.iterator(); + return new Iterator<>() { + + private BytesRef currentTerm; + private PostingsEnum postingsEnum; + @Override + public boolean hasNext() { + try { + do { + currentTerm = termIterator.next(); + if (currentTerm == null) { + return false; + } + postingsEnum = termIterator.postings(postingsEnum, 0); + if (postingsEnum.nextDoc() != PostingsEnum.NO_MORE_DOCS) { + return true; + } + } while (true); + } catch (IOException ex) { + throw new IllegalStateException("Cannot read terms: " + termIterator.attributes()); + } + } + + @Override + public BytesRef next() { + return currentTerm; + } + }; + } + private boolean closed; @Override diff --git a/server/src/main/java/org/opensearch/index/codec/fuzzy/FuzzySet.java b/server/src/main/java/org/opensearch/index/codec/fuzzy/FuzzySet.java index d0a084175585a..4ffe5d23bf9bb 100644 --- a/server/src/main/java/org/opensearch/index/codec/fuzzy/FuzzySet.java +++ b/server/src/main/java/org/opensearch/index/codec/fuzzy/FuzzySet.java @@ -13,10 +13,13 @@ import org.apache.lucene.util.Accountable; import org.apache.lucene.util.BytesRef; import org.opensearch.common.CheckedFunction; +import org.opensearch.common.CheckedSupplier; import java.io.IOException; +import java.util.Iterator; import java.util.List; import java.util.Optional; +import java.util.function.Supplier; /** * Fuzzy Filter interface @@ -43,11 +46,12 @@ public interface FuzzySet extends Accountable { /** * Add all items to the underlying set. * Implementations can choose to perform this using an optimized strategy based on the type of set. - * @param values All values which should be added to the set. + * @param valuesIteratorProvider Supplier for an iterator over All values which should be added to the set. */ - default void addAll(Iterable values) { - for (BytesRef val: values) { - add(val); + default void addAll(CheckedSupplier, IOException> valuesIteratorProvider) throws IOException { + Iterator values = valuesIteratorProvider.get(); + while (values.hasNext()) { + add(values.next()); } } @@ -73,7 +77,8 @@ enum Result { enum SetType { BLOOM_FILTER_V1("bloom_filter_v1", BloomFilter::new, List.of("bloom_filter")), - CUCKOO_FILTER_V1("cuckoo_filter_v1", CuckooFilter::new, List.of("cuckoo_filter")); + CUCKOO_FILTER_V1("cuckoo_filter_v1", CuckooFilter::new, List.of("cuckoo_filter")), + XOR_FILTER_V1("xor_filter_v1", XORFilter::new, List.of("xor_filter")); private final String setName; private final CheckedFunction deserializer; @@ -121,7 +126,10 @@ public static SetType fromAlias(String alias) { } } } - throw new IllegalArgumentException("There is no implementation for fuzzy set for alias: " + alias); + if (toReturn == null) { + throw new IllegalArgumentException("There is no implementation for fuzzy set for alias: " + alias); + } + return toReturn; } } } diff --git a/server/src/main/java/org/opensearch/index/codec/fuzzy/FuzzySetFactory.java b/server/src/main/java/org/opensearch/index/codec/fuzzy/FuzzySetFactory.java index 74a7a552e25e5..bb387a248da50 100644 --- a/server/src/main/java/org/opensearch/index/codec/fuzzy/FuzzySetFactory.java +++ b/server/src/main/java/org/opensearch/index/codec/fuzzy/FuzzySetFactory.java @@ -9,8 +9,11 @@ package org.opensearch.index.codec.fuzzy; import org.apache.lucene.store.DataInput; +import org.apache.lucene.util.BytesRef; +import org.opensearch.common.CheckedSupplier; import java.io.IOException; +import java.util.Iterator; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; @@ -32,16 +35,18 @@ public FuzzySetFactory(Map setTypeForField) { // setTypeForField.compute(fieldName, (k, v) -> new FuzzySetParameters(maxFalsePositiveProbability, v.getSetType())); // } - public FuzzySet createFuzzySet(int maxDocs, String fieldName) { + public FuzzySet createFuzzySet(int maxDocs, String fieldName, CheckedSupplier, IOException> iteratorProvider) throws IOException { FuzzySetParameters params = setTypeForField.get(fieldName); if (params == null) { throw new IllegalArgumentException("No fuzzy set defined for field: " + fieldName); } switch (params.getSetType()) { case BLOOM_FILTER_V1: - return new BloomFilter(maxDocs, params.getFalsePositiveProbability()); + return new BloomFilter(maxDocs, params.getFalsePositiveProbability(), iteratorProvider); case CUCKOO_FILTER_V1: - return new CuckooFilter(maxDocs, params.getFalsePositiveProbability()); + return new CuckooFilter(maxDocs, params.getFalsePositiveProbability(), iteratorProvider); + case XOR_FILTER_V1: + return new XORFilter(iteratorProvider); default: throw new IllegalArgumentException("No Implementation for set type: " + params.getSetType()); } diff --git a/server/src/main/java/org/opensearch/index/codec/fuzzy/XORFilter.java b/server/src/main/java/org/opensearch/index/codec/fuzzy/XORFilter.java new file mode 100644 index 0000000000000..bd47021dbcc01 --- /dev/null +++ b/server/src/main/java/org/opensearch/index/codec/fuzzy/XORFilter.java @@ -0,0 +1,300 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.index.codec.fuzzy; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.ByteArrayDataOutput; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.util.BytesRef; +import org.opensearch.common.CheckedSupplier; +import org.opensearch.common.hash.MurmurHash3; + +import java.io.IOException; +import java.util.Iterator; +import java.util.List; +import java.util.Optional; +import java.util.Random; +import java.util.concurrent.atomic.AtomicBoolean; + +public class XORFilter implements FuzzySet { + + private static final Logger logger = LogManager.getLogger(XORFilter.class); + + private static final int BITS_PER_FINGERPRINT = 8; + private static final int HASHES = 3; + private static final int OFFSET = 32; + private static final int FACTOR_TIMES_100 = 123; + private static final MurmurHash3.Hash128 scratchHash = new MurmurHash3.Hash128(); + + private final int size; + private final int arrayLength; + private final int blockLength; + private long seed; + private byte[] fingerprints; + private final int bitCount; + private final AtomicBoolean frozen = new AtomicBoolean(); + + public XORFilter(CheckedSupplier, IOException> iteratorProvider) throws IOException { + this.size = count(iteratorProvider.get()); + arrayLength = getArrayLength(size); + bitCount = arrayLength * BITS_PER_FINGERPRINT; + blockLength = arrayLength / HASHES; + addAll(iteratorProvider); + } + + private static int count(Iterator iterator) { + int cnt = 0; + while (iterator.hasNext()) { + cnt ++; + iterator.next(); + } + return cnt; + } + + public XORFilter(DataInput in) throws IOException { + this.size = in.readInt(); + arrayLength = getArrayLength(this.size); + bitCount = arrayLength * BITS_PER_FINGERPRINT; + blockLength = arrayLength / HASHES; + seed = in.readLong(); + fingerprints = new byte[arrayLength]; + in.readBytes(fingerprints, 0, fingerprints.length); + } + + @Override + public SetType setType() { + return SetType.XOR_FILTER_V1; + } + + @Override + public Result contains(BytesRef value) { + MurmurHash3.Hash128 hashed = MurmurHash3.hash128(value.bytes, value.offset, value.length, 0, scratchHash); + long hash = Hash.hash64(hashed.h1, seed); + int f = fingerprint(hash); + int r0 = (int) hash; + int r1 = (int) Long.rotateLeft(hash, 21); + int r2 = (int) Long.rotateLeft(hash, 42); + + int h0 = Hash.reduce(r0, blockLength); + int h1 = Hash.reduce(r1, blockLength) + blockLength; + int h2 = Hash.reduce(r2, blockLength) + 2 * blockLength; + + f ^= fingerprints[h0] ^ fingerprints[h1] ^ fingerprints[h2]; + return (f & 0xff) == 0 ? Result.MAYBE : Result.NO; + } + + @Override + public void add(BytesRef value) { + throw new UnsupportedOperationException("Cannot add a single element to xor filter!"); + } + + @Override + public void addAll(CheckedSupplier, IOException> valuesProvider) throws IOException { + if (!frozen.compareAndSet(false, true)) { + throw new IllegalStateException("Filter is frozen and cannot be updated"); + } + + int m = arrayLength; + long[] reverseOrder = new long[size]; + byte[] reverseH = new byte[size]; + int reverseOrderPos; + long seed; + boolean isFirstIteration = true; + int cnt = 0; + + mainloop: + do { + if (isFirstIteration) cnt = 0; + + seed = Hash.randomSeed(); + reverseOrderPos = 0; + byte[] t2count = new byte[m]; + long[] t2 = new long[m]; + Iterator iterator = valuesProvider.get(); + + while (iterator.hasNext()) { + if (isFirstIteration) cnt += 1; + + BytesRef value = iterator.next(); + MurmurHash3.Hash128 hashed = MurmurHash3.hash128(value.bytes, value.offset, value.length, 0, scratchHash); + long k = hashed.h1; + for (int hi = 0; hi < HASHES; hi++) { + int h = getHash(k, seed, hi); + t2[h] ^= k; + if (t2count[h] > 120) { + // probably something wrong with the hash function + continue mainloop; + } + t2count[h]++; + } + } + isFirstIteration = false; + + int[][] alone = new int[HASHES][blockLength]; + int[] alonePos = new int[HASHES]; + for (int nextAlone = 0; nextAlone < HASHES; nextAlone++) { + for (int i = 0; i < blockLength; i++) { + if (t2count[nextAlone * blockLength + i] == 1) { + alone[nextAlone][alonePos[nextAlone]++] = nextAlone * blockLength + i; + } + } + } + int found = -1; + while (true) { + int i = -1; + for (int hi = 0; hi < HASHES; hi++) { + if (alonePos[hi] > 0) { + i = alone[hi][--alonePos[hi]]; + found = hi; + break; + } + } + if (i == -1) { + // no entry found + break; + } + if (t2count[i] <= 0) { + continue; + } + long k = t2[i]; + if (t2count[i] != 1) { + throw new AssertionError(); + } + --t2count[i]; + for (int hi = 0; hi < HASHES; hi++) { + if (hi != found) { + int h = getHash(k, seed, hi); + int newCount = --t2count[h]; + if (newCount == 1) { + alone[hi][alonePos[hi]++] = h; + } + t2[h] ^= k; + } + } + reverseOrder[reverseOrderPos] = k; + reverseH[reverseOrderPos] = (byte) found; + reverseOrderPos++; + } + } while (reverseOrderPos != size); // This was size in actual but changed it to use actual count of docs in iterator. + + this.seed = seed; + byte[] fp = new byte[m]; + for (int i = reverseOrderPos - 1; i >= 0; i--) { + long k = reverseOrder[i]; + int found = reverseH[i]; + int change = -1; + long hash = Hash.hash64(k, seed); + int xor = fingerprint(hash); + for (int hi = 0; hi < HASHES; hi++) { + int h = getHash(k, seed, hi); + if (found == hi) { + change = h; + } else { + xor ^= fp[h]; + } + } + fp[change] = (byte) xor; + } + fingerprints = new byte[m]; + System.arraycopy(fp, 0, fingerprints, 0, fp.length); + } + + @Override + public boolean isSaturated() { + return false; + } + + @Override + public Optional maybeDownsize() { + return Optional.empty(); + } + + @Override + public void writeTo(DataOutput out) throws IOException { + out.writeInt(size); + out.writeLong(seed); + out.writeBytes(fingerprints, fingerprints.length); + } + + @Override + public long ramBytesUsed() { + return 0; + } + + private int getHash(long key, long seed, int index) { + long r = Long.rotateLeft(Hash.hash64(key, seed), 21 * index); + r = Hash.reduce((int) r, blockLength); + r = r + index * blockLength; + return (int) r; + } + + private int fingerprint(long hash) { + return (int) (hash & ((1 << BITS_PER_FINGERPRINT) - 1)); + } + + private static int getArrayLength(int size) { + return (int) (OFFSET + (long) FACTOR_TIMES_100 * size / 100); + } + + private static class Hash { + + private static Random random = new Random(); + + public static void setSeed(long seed) { + random.setSeed(seed); + } + + public static long hash64(long x, long seed) { + x += seed; + x = (x ^ (x >>> 33)) * 0xff51afd7ed558ccdL; + x = (x ^ (x >>> 33)) * 0xc4ceb9fe1a85ec53L; + x = x ^ (x >>> 33); + return x; + } + + public static long randomSeed() { + return random.nextLong(); + } + + /** + * Shrink the hash to a value 0..n. Kind of like modulo, but using + * multiplication and shift, which are faster to compute. + * + * @param hash the hash + * @param n the maximum of the result + * @return the reduced value + */ + public static int reduce(int hash, int n) { + // http://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/ + return (int) (((hash & 0xffffffffL) * n) >>> 32); + } + } + + public static void main(String[] args) throws Exception { + XORFilter filter = new XORFilter(() -> List.of(new BytesRef("item1"), new BytesRef("item2"), new BytesRef("item3")).iterator()); + System.out.println(filter.contains(new BytesRef("item1"))); + System.out.println(filter.contains(new BytesRef("item2"))); + System.out.println(filter.contains(new BytesRef("item3"))); + System.out.println(filter.contains(new BytesRef("item5"))); + + byte[] b = new byte[1000000]; + DataOutput output = new ByteArrayDataOutput(b); + + filter.writeTo(output); + + XORFilter filter1 = new XORFilter(new ByteArrayDataInput(b)); + System.out.println(filter1.contains(new BytesRef("item1"))); + System.out.println(filter1.contains(new BytesRef("item2"))); + System.out.println(filter1.contains(new BytesRef("item3"))); + System.out.println(filter1.contains(new BytesRef("item5"))); + } +}