From beccd4efff6ea882ab468164cc51694915e4757f Mon Sep 17 00:00:00 2001 From: lintool Date: Wed, 13 Dec 2023 04:52:15 -0500 Subject: [PATCH 01/12] Upgrade to Lucene 9.9 --- pom.xml | 2 +- src/main/java/io/anserini/index/IndexCollection.java | 3 ++- .../java/io/anserini/index/IndexHnswDenseVectors.java | 8 ++++---- .../java/io/anserini/index/IndexInvertedDenseVectors.java | 4 ++-- .../io/anserini/search/SearchHnswDenseVectorsTest.java | 4 ++-- 5 files changed, 11 insertions(+), 10 deletions(-) diff --git a/pom.xml b/pom.xml index 8027e32204..e4cd61b282 100644 --- a/pom.xml +++ b/pom.xml @@ -26,7 +26,7 @@ - 9.8.0 + 9.9.0 UTF-8 diff --git a/src/main/java/io/anserini/index/IndexCollection.java b/src/main/java/io/anserini/index/IndexCollection.java index f8775d92f0..f6148880b0 100644 --- a/src/main/java/io/anserini/index/IndexCollection.java +++ b/src/main/java/io/anserini/index/IndexCollection.java @@ -31,6 +31,7 @@ import org.apache.logging.log4j.Logger; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.core.WhitespaceAnalyzer; +import org.apache.lucene.codecs.lucene99.Lucene99Codec; import org.apache.lucene.index.ConcurrentMergeScheduler; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; @@ -190,7 +191,7 @@ public IndexCollection(Args args) throws Exception { } final Directory dir = FSDirectory.open(Paths.get(args.index)); - final IndexWriterConfig config = new IndexWriterConfig(getAnalyzer()); + final IndexWriterConfig config = new IndexWriterConfig(getAnalyzer()).setCodec(new Lucene99Codec()); if (args.bm25Accurate) { // Necessary during indexing as the norm used in BM25 is already determined at index time. diff --git a/src/main/java/io/anserini/index/IndexHnswDenseVectors.java b/src/main/java/io/anserini/index/IndexHnswDenseVectors.java index 10e7c15640..c66d9db5cf 100644 --- a/src/main/java/io/anserini/index/IndexHnswDenseVectors.java +++ b/src/main/java/io/anserini/index/IndexHnswDenseVectors.java @@ -23,8 +23,8 @@ import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.KnnVectorsReader; import org.apache.lucene.codecs.KnnVectorsWriter; -import org.apache.lucene.codecs.lucene95.Lucene95Codec; -import org.apache.lucene.codecs.lucene95.Lucene95HnswVectorsFormat; +import org.apache.lucene.codecs.lucene99.Lucene99Codec; +import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat; import org.apache.lucene.index.ConcurrentMergeScheduler; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; @@ -80,11 +80,11 @@ public IndexHnswDenseVectors(Args args) throws Exception { try { final Directory dir = FSDirectory.open(Paths.get(args.index)); final IndexWriterConfig config = new IndexWriterConfig().setCodec( - new Lucene95Codec() { + new Lucene99Codec() { @Override public KnnVectorsFormat getKnnVectorsFormatForField(String field) { return new DelegatingKnnVectorsFormat( - new Lucene95HnswVectorsFormat(args.M, args.efC), 4096); + new Lucene99HnswVectorsFormat(args.M, args.efC), 4096); } }); diff --git a/src/main/java/io/anserini/index/IndexInvertedDenseVectors.java b/src/main/java/io/anserini/index/IndexInvertedDenseVectors.java index e3a0c8e810..608a773c6b 100644 --- a/src/main/java/io/anserini/index/IndexInvertedDenseVectors.java +++ b/src/main/java/io/anserini/index/IndexInvertedDenseVectors.java @@ -25,7 +25,7 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper; import 
org.apache.lucene.analysis.standard.StandardAnalyzer; -import org.apache.lucene.codecs.lucene95.Lucene95Codec; +import org.apache.lucene.codecs.lucene99.Lucene99Codec; import org.apache.lucene.index.ConcurrentMergeScheduler; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; @@ -104,7 +104,7 @@ public IndexInvertedDenseVectors(Args args) { try { final Directory dir = FSDirectory.open(Paths.get(args.index)); - final IndexWriterConfig config = new IndexWriterConfig(analyzer).setCodec(new Lucene95Codec()); + final IndexWriterConfig config = new IndexWriterConfig(analyzer).setCodec(new Lucene99Codec()); config.setOpenMode(IndexWriterConfig.OpenMode.CREATE); config.setRAMBufferSizeMB(args.memoryBuffer); config.setUseCompoundFile(false); diff --git a/src/test/java/io/anserini/search/SearchHnswDenseVectorsTest.java b/src/test/java/io/anserini/search/SearchHnswDenseVectorsTest.java index 2f9bb98032..5e27246346 100644 --- a/src/test/java/io/anserini/search/SearchHnswDenseVectorsTest.java +++ b/src/test/java/io/anserini/search/SearchHnswDenseVectorsTest.java @@ -308,8 +308,8 @@ public void test1() throws Exception { "160885 Q0 44 2 0.861596 Anserini", "160885 Q0 40 3 0.858651 Anserini", "160885 Q0 48 4 0.858514 Anserini", - "160885 Q0 41 5 0.856264 Anserini", - "867490 Q0 10 1 0.850332 Anserini", + "160885 Q0 41 5 0.856265 Anserini", + "867490 Q0 10 1 0.850331 Anserini", "867490 Q0 45 2 0.846281 Anserini", "867490 Q0 44 3 0.845236 Anserini", "867490 Q0 95 4 0.845013 Anserini", From 4d6ae03e18405b6ee6bcffe1215749453d6728a1 Mon Sep 17 00:00:00 2001 From: lintool Date: Wed, 13 Dec 2023 05:34:43 -0500 Subject: [PATCH 02/12] Add int8 option for hnsw. --- .../anserini/index/IndexHnswDenseVectors.java | 33 ++++++++++++++----- .../index/IndexHnswDenseVectorsTest.java | 22 +++++++++++++ 2 files changed, 47 insertions(+), 8 deletions(-) diff --git a/src/main/java/io/anserini/index/IndexHnswDenseVectors.java b/src/main/java/io/anserini/index/IndexHnswDenseVectors.java index c66d9db5cf..7b0b55b86e 100644 --- a/src/main/java/io/anserini/index/IndexHnswDenseVectors.java +++ b/src/main/java/io/anserini/index/IndexHnswDenseVectors.java @@ -24,6 +24,7 @@ import org.apache.lucene.codecs.KnnVectorsReader; import org.apache.lucene.codecs.KnnVectorsWriter; import org.apache.lucene.codecs.lucene99.Lucene99Codec; +import org.apache.lucene.codecs.lucene99.Lucene99HnswScalarQuantizedVectorsFormat; import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat; import org.apache.lucene.index.ConcurrentMergeScheduler; import org.apache.lucene.index.IndexWriter; @@ -56,6 +57,9 @@ public static final class Args extends AbstractIndexer.Args { @Option(name = "-efC", metaVar = "[num]", usage = "HNSW parameters ef Construction") public int efC = 100; + @Option(name = "-quantize.int8", usage = "Quantize vectors into int8.") + public boolean quantizeInt8 = false; + @Option(name = "-storeVectors", usage = "Boolean switch to store raw raw vectors.") public boolean storeVectors = false; } @@ -79,14 +83,27 @@ public IndexHnswDenseVectors(Args args) throws Exception { try { final Directory dir = FSDirectory.open(Paths.get(args.index)); - final IndexWriterConfig config = new IndexWriterConfig().setCodec( - new Lucene99Codec() { - @Override - public KnnVectorsFormat getKnnVectorsFormatForField(String field) { - return new DelegatingKnnVectorsFormat( - new Lucene99HnswVectorsFormat(args.M, args.efC), 4096); - } - }); + final IndexWriterConfig config; + + if (args.quantizeInt8) { + config = new 
IndexWriterConfig().setCodec( + new Lucene99Codec() { + @Override + public KnnVectorsFormat getKnnVectorsFormatForField(String field) { + return new DelegatingKnnVectorsFormat( + new Lucene99HnswScalarQuantizedVectorsFormat(args.M, args.efC), 4096); + } + }); + } else { + config = new IndexWriterConfig().setCodec( + new Lucene99Codec() { + @Override + public KnnVectorsFormat getKnnVectorsFormatForField(String field) { + return new DelegatingKnnVectorsFormat( + new Lucene99HnswVectorsFormat(args.M, args.efC), 4096); + } + }); + } config.setOpenMode(IndexWriterConfig.OpenMode.CREATE); config.setRAMBufferSizeMB(args.memoryBuffer); diff --git a/src/test/java/io/anserini/index/IndexHnswDenseVectorsTest.java b/src/test/java/io/anserini/index/IndexHnswDenseVectorsTest.java index bade9e0366..535475aee5 100644 --- a/src/test/java/io/anserini/index/IndexHnswDenseVectorsTest.java +++ b/src/test/java/io/anserini/index/IndexHnswDenseVectorsTest.java @@ -152,4 +152,26 @@ public void test1() throws Exception { assertNotNull(results); assertEquals(100, results.get("documents")); } + + @Test + public void testQuantizedInt8() throws Exception { + String indexPath = "target/idx-sample-hnsw" + System.currentTimeMillis(); + String[] indexArgs = new String[] { + "-collection", "JsonDenseVectorCollection", + "-input", "src/test/resources/sample_docs/openai_ada2/json_vector", + "-index", indexPath, + "-generator", "HnswDenseVectorDocumentGenerator", + "-threads", "1", + "-M", "16", "-efC", "100", "-quantize.int8" + }; + + IndexHnswDenseVectors.main(indexArgs); + + IndexReader reader = IndexReaderUtils.getReader(indexPath); + assertNotNull(reader); + + Map results = IndexReaderUtils.getIndexStats(reader, Constants.VECTOR); + assertNotNull(results); + assertEquals(100, results.get("documents")); + } } \ No newline at end of file From 4473ba6ed8cecb8a58316188db28dadd9183c84c Mon Sep 17 00:00:00 2001 From: lintool Date: Wed, 13 Dec 2023 07:36:28 -0500 Subject: [PATCH 03/12] Add int8 regression. --- .../io/anserini/index/IndexCollection.java | 29 +++++---- .../anserini/index/IndexHnswDenseVectors.java | 13 ++-- .../index/IndexInvertedDenseVectors.java | 9 +-- ...arco-passage-cos-dpr-distil-hnsw-int8.yaml | 65 +++++++++++++++++++ 4 files changed, 92 insertions(+), 24 deletions(-) create mode 100644 src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-int8.yaml diff --git a/src/main/java/io/anserini/index/IndexCollection.java b/src/main/java/io/anserini/index/IndexCollection.java index f6148880b0..f80c778cd3 100644 --- a/src/main/java/io/anserini/index/IndexCollection.java +++ b/src/main/java/io/anserini/index/IndexCollection.java @@ -162,20 +162,6 @@ public static class Args extends AbstractIndexer.Args { public IndexCollection(Args args) throws Exception { super(args); - LOG.info("IndexCollection settings:"); - LOG.info(" + Generator: " + args.generatorClass); - LOG.info(" + Language: " + args.language); - LOG.info(" + Stemmer: " + args.stemmer); - LOG.info(" + Keep stopwords? " + args.keepStopwords); - LOG.info(" + Stopwords: " + args.stopwords); - LOG.info(" + Store positions? " + args.storePositions); - LOG.info(" + Store docvectors? " + args.storeDocvectors); - LOG.info(" + Store document \"contents\" field? " + args.storeContents); - LOG.info(" + Store document \"raw\" field? 
" + args.storeRaw); - LOG.info(" + Additional fields to index: " + Arrays.toString(args.fields)); - LOG.info(" + Whitelist: " + args.whitelist); - LOG.info(" + Pretokenized?: " + args.pretokenized); - try { super.generatorClass = (Class>) Class.forName("io.anserini.index.generator." + args.generatorClass); @@ -207,6 +193,21 @@ public IndexCollection(Args args) throws Exception { config.setMergeScheduler(new ConcurrentMergeScheduler()); super.writer = new IndexWriter(dir, config); + + LOG.info("IndexCollection settings:"); + LOG.info(" + Generator: " + args.generatorClass); + LOG.info(" + Language: " + args.language); + LOG.info(" + Stemmer: " + args.stemmer); + LOG.info(" + Keep stopwords? " + args.keepStopwords); + LOG.info(" + Stopwords: " + args.stopwords); + LOG.info(" + Store positions? " + args.storePositions); + LOG.info(" + Store docvectors? " + args.storeDocvectors); + LOG.info(" + Store document \"contents\" field? " + args.storeContents); + LOG.info(" + Store document \"raw\" field? " + args.storeRaw); + LOG.info(" + Additional fields to index: " + Arrays.toString(args.fields)); + LOG.info(" + Whitelist: " + args.whitelist); + LOG.info(" + Pretokenized?: " + args.pretokenized); + LOG.info(" + Codec: " + this.writer.getConfig().getCodec()); } private Analyzer getAnalyzer() { diff --git a/src/main/java/io/anserini/index/IndexHnswDenseVectors.java b/src/main/java/io/anserini/index/IndexHnswDenseVectors.java index 7b0b55b86e..1834baa9a9 100644 --- a/src/main/java/io/anserini/index/IndexHnswDenseVectors.java +++ b/src/main/java/io/anserini/index/IndexHnswDenseVectors.java @@ -68,12 +68,6 @@ public static final class Args extends AbstractIndexer.Args { public IndexHnswDenseVectors(Args args) throws Exception { super(args); - LOG.info("HnswIndexer settings:"); - LOG.info(" + Generator: " + args.generatorClass); - LOG.info(" + M: " + args.M); - LOG.info(" + efC: " + args.efC); - LOG.info(" + Store document vectors? " + args.storeVectors); - try { super.generatorClass = (Class>) Class.forName("io.anserini.index.generator." + args.generatorClass); @@ -123,6 +117,13 @@ public KnnVectorsFormat getKnnVectorsFormatForField(String field) { } catch (Exception e) { throw new IllegalArgumentException(String.format("Unable to create IndexWriter: %s.", e.getMessage())); } + + LOG.info("HnswIndexer settings:"); + LOG.info(" + Generator: " + args.generatorClass); + LOG.info(" + M: " + args.M); + LOG.info(" + efC: " + args.efC); + LOG.info(" + Store document vectors? " + args.storeVectors); + LOG.info(" + Codec: " + this.writer.getConfig().getCodec()); } // Solution provided by Solr, see https://www.mail-archive.com/java-user@lucene.apache.org/msg52149.html diff --git a/src/main/java/io/anserini/index/IndexInvertedDenseVectors.java b/src/main/java/io/anserini/index/IndexInvertedDenseVectors.java index 608a773c6b..b415bf8efa 100644 --- a/src/main/java/io/anserini/index/IndexInvertedDenseVectors.java +++ b/src/main/java/io/anserini/index/IndexInvertedDenseVectors.java @@ -78,10 +78,6 @@ public static final class Args extends AbstractIndexer.Args { public IndexInvertedDenseVectors(Args args) { super(args); - LOG.info("InvertedDenseIndexer settings:"); - LOG.info(" + Generator: " + args.generatorClass); - LOG.info(" + Encoding: " + args.encoding); - try { super.generatorClass = (Class>) Class.forName("io.anserini.index.generator." 
+ args.generatorClass); @@ -113,6 +109,11 @@ public IndexInvertedDenseVectors(Args args) { } catch (Exception e) { throw new IllegalArgumentException(String.format("Unable to create IndexWriter: %s.", e.getMessage())); } + + LOG.info("InvertedDenseIndexer settings:"); + LOG.info(" + Generator: " + args.generatorClass); + LOG.info(" + Encoding: " + args.encoding); + LOG.info(" + Codec: " + this.writer.getConfig().getCodec()); } public static void main(String[] args) throws Exception { diff --git a/src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-int8.yaml b/src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-int8.yaml new file mode 100644 index 0000000000..7390028fcb --- /dev/null +++ b/src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-int8.yaml @@ -0,0 +1,65 @@ +--- +corpus: msmarco-passage-cos-dpr-distil +corpus_path: collections/msmarco/msmarco-passage-cos-dpr-distil/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.tar +download_checksum: e20ffbc8b5e7f760af31298aefeaebbd + +index_path: indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil-int8/ +index_type: hnsw +collection_class: JsonDenseVectorCollection +generator_class: HnswDenseVectorDocumentGenerator +index_threads: 16 +index_options: -M 16 -efC 100 -memoryBuffer 65536 -quantize.int8 + +metrics: + - metric: AP@1000 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -c -m map + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: RR@10 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -c -M 10 -m recip_rank + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -c -m recall.100 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -c -m recall.1000 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking)" + id: dev + path: topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.gz + qrel: qrels.msmarco-passage.dev-subset.txt + +models: + - name: cos-dpr-distil-hnsw + display: cosDPR-distil + type: hnsw + params: -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000 + results: + AP@1000: + - 0.393 + RR@10: + - 0.388 + R@100: + - 0.903 + R@1000: + - 0.974 From 0ab46f875f9af32aaffd122baf935e80389076d1 Mon Sep 17 00:00:00 2001 From: lintool Date: Wed, 13 Dec 2023 12:30:38 -0500 Subject: [PATCH 04/12] Tweak to HNSW iwc. 
--- .../java/io/anserini/index/IndexHnswDenseVectors.java | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/main/java/io/anserini/index/IndexHnswDenseVectors.java b/src/main/java/io/anserini/index/IndexHnswDenseVectors.java index 1834baa9a9..5aebf37f10 100644 --- a/src/main/java/io/anserini/index/IndexHnswDenseVectors.java +++ b/src/main/java/io/anserini/index/IndexHnswDenseVectors.java @@ -101,17 +101,23 @@ public KnnVectorsFormat getKnnVectorsFormatForField(String field) { config.setOpenMode(IndexWriterConfig.OpenMode.CREATE); config.setRAMBufferSizeMB(args.memoryBuffer); + config.setRAMPerThreadHardLimitMB(2047); config.setUseCompoundFile(false); config.setMergeScheduler(new ConcurrentMergeScheduler()); + TieredMergePolicy mergePolicy = new TieredMergePolicy(); if (args.optimize) { // If we're going to merge down into a single segment at the end, skip intermediate merges, // since they are a waste of time. - TieredMergePolicy mergePolicy = new TieredMergePolicy(); mergePolicy.setMaxMergeAtOnce(256); mergePolicy.setSegmentsPerTier(256); - config.setMergePolicy(mergePolicy); + } else { + mergePolicy.setMaxMergedSegmentMB(1024 * 16); + mergePolicy.setFloorSegmentMB(1024); + mergePolicy.setSegmentsPerTier(16); + mergePolicy.setMaxMergeAtOnce(16); } + config.setMergePolicy(mergePolicy); this.writer = new IndexWriter(dir, config); } catch (Exception e) { @@ -124,6 +130,7 @@ public KnnVectorsFormat getKnnVectorsFormatForField(String field) { LOG.info(" + efC: " + args.efC); LOG.info(" + Store document vectors? " + args.storeVectors); LOG.info(" + Codec: " + this.writer.getConfig().getCodec()); + LOG.info(" + MemoryBuffer: " + args.memoryBuffer); } // Solution provided by Solr, see https://www.mail-archive.com/java-user@lucene.apache.org/msg52149.html From 04043cad802d9c07782944b47d2a5fc582769062 Mon Sep 17 00:00:00 2001 From: lintool Date: Wed, 13 Dec 2023 13:22:10 -0500 Subject: [PATCH 05/12] Add config. --- .../anserini/index/IndexHnswDenseVectors.java | 20 +++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/src/main/java/io/anserini/index/IndexHnswDenseVectors.java b/src/main/java/io/anserini/index/IndexHnswDenseVectors.java index 5aebf37f10..d84f2429b2 100644 --- a/src/main/java/io/anserini/index/IndexHnswDenseVectors.java +++ b/src/main/java/io/anserini/index/IndexHnswDenseVectors.java @@ -62,6 +62,15 @@ public static final class Args extends AbstractIndexer.Args { @Option(name = "-storeVectors", usage = "Boolean switch to store raw raw vectors.") public boolean storeVectors = false; + + @Option(name = "-maxMergedSegmentSize", metaVar = "[num]", usage = "Maximum sized segment to produce during normal merging (in MB).") + public int maxMergedSegmentSize = 1024 * 16; + + @Option(name = "-segmentsPerTier", metaVar = "[num]", usage = "Allowed number of segments per tier.") + public int segmentsPerTier = 10; + + @Option(name = "-maxMergeAtOnce", metaVar = "[num]", usage = "Maximum number of segments to be merged at a time during \"normal\" merging.") + public int maxMergeAtOnce = 10; } @SuppressWarnings("unchecked") @@ -101,7 +110,7 @@ public KnnVectorsFormat getKnnVectorsFormatForField(String field) { config.setOpenMode(IndexWriterConfig.OpenMode.CREATE); config.setRAMBufferSizeMB(args.memoryBuffer); - config.setRAMPerThreadHardLimitMB(2047); + config.setRAMPerThreadHardLimitMB(2047); // Max possible value. 
config.setUseCompoundFile(false); config.setMergeScheduler(new ConcurrentMergeScheduler()); @@ -112,10 +121,10 @@ public KnnVectorsFormat getKnnVectorsFormatForField(String field) { mergePolicy.setMaxMergeAtOnce(256); mergePolicy.setSegmentsPerTier(256); } else { - mergePolicy.setMaxMergedSegmentMB(1024 * 16); mergePolicy.setFloorSegmentMB(1024); - mergePolicy.setSegmentsPerTier(16); - mergePolicy.setMaxMergeAtOnce(16); + mergePolicy.setMaxMergedSegmentMB(args.maxMergedSegmentSize); + mergePolicy.setSegmentsPerTier(args.segmentsPerTier); + mergePolicy.setMaxMergeAtOnce(args.maxMergeAtOnce); } config.setMergePolicy(mergePolicy); @@ -131,6 +140,9 @@ public KnnVectorsFormat getKnnVectorsFormatForField(String field) { LOG.info(" + Store document vectors? " + args.storeVectors); LOG.info(" + Codec: " + this.writer.getConfig().getCodec()); LOG.info(" + MemoryBuffer: " + args.memoryBuffer); + LOG.info(" + MaxMergedSegmentSize: " + args.maxMergedSegmentSize); + LOG.info(" + SegmentsPerTier: " + args.segmentsPerTier); + LOG.info(" + MaxMergeAtOnce: " + args.maxMergeAtOnce); } // Solution provided by Solr, see https://www.mail-archive.com/java-user@lucene.apache.org/msg52149.html From 0b9f79e7b0a1695ae5abfb1c9621e52b3f827b2c Mon Sep 17 00:00:00 2001 From: lintool Date: Wed, 13 Dec 2023 16:35:05 -0500 Subject: [PATCH 06/12] Added -noMerge option. --- ...ons-msmarco-passage-cos-dpr-distil-hnsw.md | 2 +- .../anserini/index/IndexHnswDenseVectors.java | 43 +++++++++++++------ ...arco-passage-cos-dpr-distil-hnsw-int8.yaml | 2 +- .../msmarco-passage-cos-dpr-distil-hnsw.yaml | 2 +- 4 files changed, 32 insertions(+), 17 deletions(-) diff --git a/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw.md b/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw.md index cf41e9645a..641106d14d 100644 --- a/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw.md +++ b/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw.md @@ -54,7 +54,7 @@ target/appassembler/bin/IndexHnswDenseVectors \ -input /path/to/msmarco-passage-cos-dpr-distil \ -generator HnswDenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil/ \ - -threads 16 -M 16 -efC 100 -memoryBuffer 65536 \ + -threads 16 -M 16 -efC 100 -memoryBuffer 65536 -noMerge \ >& logs/log.msmarco-passage-cos-dpr-distil & ``` diff --git a/src/main/java/io/anserini/index/IndexHnswDenseVectors.java b/src/main/java/io/anserini/index/IndexHnswDenseVectors.java index d84f2429b2..4046d87009 100644 --- a/src/main/java/io/anserini/index/IndexHnswDenseVectors.java +++ b/src/main/java/io/anserini/index/IndexHnswDenseVectors.java @@ -29,6 +29,8 @@ import org.apache.lucene.index.ConcurrentMergeScheduler; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.MergePolicy; +import org.apache.lucene.index.NoMergePolicy; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.TieredMergePolicy; @@ -63,6 +65,9 @@ public static final class Args extends AbstractIndexer.Args { @Option(name = "-storeVectors", usage = "Boolean switch to store raw raw vectors.") public boolean storeVectors = false; + @Option(name = "-noMerge", usage = "Do not merge segments (fast indexing, slow retrieval).") + public boolean noMerge = false; + @Option(name = "-maxMergedSegmentSize", metaVar = "[num]", usage = "Maximum sized segment to produce during normal merging (in MB).") public int 
maxMergedSegmentSize = 1024 * 16; @@ -114,19 +119,23 @@ public KnnVectorsFormat getKnnVectorsFormatForField(String field) { config.setUseCompoundFile(false); config.setMergeScheduler(new ConcurrentMergeScheduler()); - TieredMergePolicy mergePolicy = new TieredMergePolicy(); - if (args.optimize) { - // If we're going to merge down into a single segment at the end, skip intermediate merges, - // since they are a waste of time. - mergePolicy.setMaxMergeAtOnce(256); - mergePolicy.setSegmentsPerTier(256); + if (args.noMerge) { + config.setMergePolicy(NoMergePolicy.INSTANCE); } else { - mergePolicy.setFloorSegmentMB(1024); - mergePolicy.setMaxMergedSegmentMB(args.maxMergedSegmentSize); - mergePolicy.setSegmentsPerTier(args.segmentsPerTier); - mergePolicy.setMaxMergeAtOnce(args.maxMergeAtOnce); + TieredMergePolicy mergePolicy = new TieredMergePolicy(); + if (args.optimize) { + // If we're going to merge down into a single segment at the end, skip intermediate merges, + // since they are a waste of time. + mergePolicy.setMaxMergeAtOnce(256); + mergePolicy.setSegmentsPerTier(256); + } else { + mergePolicy.setFloorSegmentMB(1024); + mergePolicy.setMaxMergedSegmentMB(args.maxMergedSegmentSize); + mergePolicy.setSegmentsPerTier(args.segmentsPerTier); + mergePolicy.setMaxMergeAtOnce(args.maxMergeAtOnce); + } + config.setMergePolicy(mergePolicy); } - config.setMergePolicy(mergePolicy); this.writer = new IndexWriter(dir, config); } catch (Exception e) { @@ -140,9 +149,15 @@ public KnnVectorsFormat getKnnVectorsFormatForField(String field) { LOG.info(" + Store document vectors? " + args.storeVectors); LOG.info(" + Codec: " + this.writer.getConfig().getCodec()); LOG.info(" + MemoryBuffer: " + args.memoryBuffer); - LOG.info(" + MaxMergedSegmentSize: " + args.maxMergedSegmentSize); - LOG.info(" + SegmentsPerTier: " + args.segmentsPerTier); - LOG.info(" + MaxMergeAtOnce: " + args.maxMergeAtOnce); + + if (args.noMerge) { + LOG.info(" + MergePolicy: NoMerge"); + } else { + LOG.info(" + MergePolicy: TieredMergePolicy"); + LOG.info(" + MaxMergedSegmentSize: " + args.maxMergedSegmentSize); + LOG.info(" + SegmentsPerTier: " + args.segmentsPerTier); + LOG.info(" + MaxMergeAtOnce: " + args.maxMergeAtOnce); + } } // Solution provided by Solr, see https://www.mail-archive.com/java-user@lucene.apache.org/msg52149.html diff --git a/src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-int8.yaml b/src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-int8.yaml index 7390028fcb..6f93e3b017 100644 --- a/src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-int8.yaml +++ b/src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-int8.yaml @@ -10,7 +10,7 @@ index_type: hnsw collection_class: JsonDenseVectorCollection generator_class: HnswDenseVectorDocumentGenerator index_threads: 16 -index_options: -M 16 -efC 100 -memoryBuffer 65536 -quantize.int8 +index_options: -M 16 -efC 100 -memoryBuffer 65536 -noMerge -quantize.int8 metrics: - metric: AP@1000 diff --git a/src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw.yaml b/src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw.yaml index c63c3b7972..a235e7823e 100644 --- a/src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw.yaml +++ b/src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw.yaml @@ -10,7 +10,7 @@ index_type: hnsw collection_class: JsonDenseVectorCollection generator_class: HnswDenseVectorDocumentGenerator index_threads: 16 -index_options: -M 16 -efC 100 -memoryBuffer 
65536 +index_options: -M 16 -efC 100 -memoryBuffer 65536 -noMerge metrics: - metric: AP@1000 From 93014741c93d2456576025bbb5d87ac836345362 Mon Sep 17 00:00:00 2001 From: lintool Date: Thu, 14 Dec 2023 11:04:44 -0500 Subject: [PATCH 07/12] Updated regressions yaml files. --- ...s-dl19-passage-cos-dpr-distil-hnsw-onnx.md | 2 +- ...ssions-dl19-passage-cos-dpr-distil-hnsw.md | 2 +- .../regressions-dl19-passage-openai-ada2.md | 2 +- ...s-dl20-passage-cos-dpr-distil-hnsw-onnx.md | 2 +- ...ssions-dl20-passage-cos-dpr-distil-hnsw.md | 2 +- .../regressions-dl20-passage-openai-ada2.md | 2 +- ...smarco-passage-cos-dpr-distil-hnsw-onnx.md | 2 +- ...regressions-msmarco-passage-openai-ada2.md | 2 +- ...passage-cos-dpr-distil-hnsw-int8-onnx.yaml | 65 +++++++++++++++++++ ...dl19-passage-cos-dpr-distil-hnsw-int8.yaml | 65 +++++++++++++++++++ ...dl19-passage-cos-dpr-distil-hnsw-onnx.yaml | 2 +- .../dl19-passage-cos-dpr-distil-hnsw.yaml | 2 +- .../dl19-passage-openai-ada2-int8.yaml | 65 +++++++++++++++++++ .../regression/dl19-passage-openai-ada2.yaml | 2 +- ...passage-cos-dpr-distil-hnsw-int8-onnx.yaml | 65 +++++++++++++++++++ ...dl20-passage-cos-dpr-distil-hnsw-int8.yaml | 65 +++++++++++++++++++ ...dl20-passage-cos-dpr-distil-hnsw-onnx.yaml | 2 +- .../dl20-passage-cos-dpr-distil-hnsw.yaml | 2 +- .../dl20-passage-openai-ada2-int8.yaml | 65 +++++++++++++++++++ .../regression/dl20-passage-openai-ada2.yaml | 2 +- ...passage-cos-dpr-distil-hnsw-int8-onnx.yaml | 65 +++++++++++++++++++ ...arco-passage-cos-dpr-distil-hnsw-onnx.yaml | 2 +- .../msmarco-passage-openai-ada2-int8.yaml | 65 +++++++++++++++++++ .../msmarco-passage-openai-ada2.yaml | 2 +- 24 files changed, 536 insertions(+), 16 deletions(-) create mode 100644 src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw-int8-onnx.yaml create mode 100644 src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw-int8.yaml create mode 100644 src/main/resources/regression/dl19-passage-openai-ada2-int8.yaml create mode 100644 src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw-int8-onnx.yaml create mode 100644 src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw-int8.yaml create mode 100644 src/main/resources/regression/dl20-passage-openai-ada2-int8.yaml create mode 100644 src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-int8-onnx.yaml create mode 100644 src/main/resources/regression/msmarco-passage-openai-ada2-int8.yaml diff --git a/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw-onnx.md b/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw-onnx.md index b08012b307..5c3cb3c775 100644 --- a/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw-onnx.md +++ b/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw-onnx.md @@ -57,7 +57,7 @@ target/appassembler/bin/IndexHnswDenseVectors \ -input /path/to/msmarco-passage-cos-dpr-distil \ -generator HnswDenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil/ \ - -threads 16 -M 16 -efC 100 -memoryBuffer 65536 \ + -threads 16 -M 16 -efC 100 -memoryBuffer 65536 -noMerge \ >& logs/log.msmarco-passage-cos-dpr-distil & ``` diff --git a/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw.md b/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw.md index dc625e14d0..8b9b91d211 100644 --- a/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw.md +++ b/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw.md @@ -57,7 +57,7 @@ target/appassembler/bin/IndexHnswDenseVectors \ 
-input /path/to/msmarco-passage-cos-dpr-distil \ -generator HnswDenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil/ \ - -threads 16 -M 16 -efC 100 -memoryBuffer 65536 \ + -threads 16 -M 16 -efC 100 -memoryBuffer 65536 -noMerge \ >& logs/log.msmarco-passage-cos-dpr-distil & ``` diff --git a/docs/regressions/regressions-dl19-passage-openai-ada2.md b/docs/regressions/regressions-dl19-passage-openai-ada2.md index c5382dc3c5..7df4357e6e 100644 --- a/docs/regressions/regressions-dl19-passage-openai-ada2.md +++ b/docs/regressions/regressions-dl19-passage-openai-ada2.md @@ -57,7 +57,7 @@ target/appassembler/bin/IndexHnswDenseVectors \ -input /path/to/msmarco-passage-openai-ada2 \ -generator HnswDenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.msmarco-passage-openai-ada2/ \ - -threads 16 -M 16 -efC 100 -memoryBuffer 65536 \ + -threads 16 -M 16 -efC 100 -memoryBuffer 65536 -noMerge \ >& logs/log.msmarco-passage-openai-ada2 & ``` diff --git a/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw-onnx.md b/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw-onnx.md index f040a9ce41..bb268c9d5b 100644 --- a/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw-onnx.md +++ b/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw-onnx.md @@ -57,7 +57,7 @@ target/appassembler/bin/IndexHnswDenseVectors \ -input /path/to/msmarco-passage-cos-dpr-distil \ -generator HnswDenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil/ \ - -threads 16 -M 16 -efC 100 -memoryBuffer 65536 \ + -threads 16 -M 16 -efC 100 -memoryBuffer 65536 -noMerge \ >& logs/log.msmarco-passage-cos-dpr-distil & ``` diff --git a/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw.md b/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw.md index c2f46b422c..e5fdd01e05 100644 --- a/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw.md +++ b/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw.md @@ -57,7 +57,7 @@ target/appassembler/bin/IndexHnswDenseVectors \ -input /path/to/msmarco-passage-cos-dpr-distil \ -generator HnswDenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil/ \ - -threads 16 -M 16 -efC 100 -memoryBuffer 65536 \ + -threads 16 -M 16 -efC 100 -memoryBuffer 65536 -noMerge \ >& logs/log.msmarco-passage-cos-dpr-distil & ``` diff --git a/docs/regressions/regressions-dl20-passage-openai-ada2.md b/docs/regressions/regressions-dl20-passage-openai-ada2.md index f3e93c63ef..2a03bf25b3 100644 --- a/docs/regressions/regressions-dl20-passage-openai-ada2.md +++ b/docs/regressions/regressions-dl20-passage-openai-ada2.md @@ -57,7 +57,7 @@ target/appassembler/bin/IndexHnswDenseVectors \ -input /path/to/msmarco-passage-openai-ada2 \ -generator HnswDenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.msmarco-passage-openai-ada2/ \ - -threads 16 -M 16 -efC 100 -memoryBuffer 65536 \ + -threads 16 -M 16 -efC 100 -memoryBuffer 65536 -noMerge \ >& logs/log.msmarco-passage-openai-ada2 & ``` diff --git a/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw-onnx.md b/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw-onnx.md index 6df98970f1..0e395c8575 100644 --- a/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw-onnx.md +++ b/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw-onnx.md @@ -54,7 +54,7 @@ target/appassembler/bin/IndexHnswDenseVectors \ -input /path/to/msmarco-passage-cos-dpr-distil \ -generator 
HnswDenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil/ \ - -threads 16 -M 16 -efC 100 -memoryBuffer 65536 \ + -threads 16 -M 16 -efC 100 -memoryBuffer 65536 -noMerge \ >& logs/log.msmarco-passage-cos-dpr-distil & ``` diff --git a/docs/regressions/regressions-msmarco-passage-openai-ada2.md b/docs/regressions/regressions-msmarco-passage-openai-ada2.md index c6ca60e2bc..c6e7be50ec 100644 --- a/docs/regressions/regressions-msmarco-passage-openai-ada2.md +++ b/docs/regressions/regressions-msmarco-passage-openai-ada2.md @@ -54,7 +54,7 @@ target/appassembler/bin/IndexHnswDenseVectors \ -input /path/to/msmarco-passage-openai-ada2 \ -generator HnswDenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.msmarco-passage-openai-ada2/ \ - -threads 16 -M 16 -efC 100 -memoryBuffer 65536 \ + -threads 16 -M 16 -efC 100 -memoryBuffer 65536 -noMerge \ >& logs/log.msmarco-passage-openai-ada2 & ``` diff --git a/src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw-int8-onnx.yaml b/src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw-int8-onnx.yaml new file mode 100644 index 0000000000..cae9fc745e --- /dev/null +++ b/src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw-int8-onnx.yaml @@ -0,0 +1,65 @@ +--- +corpus: msmarco-passage-cos-dpr-distil +corpus_path: collections/msmarco/msmarco-passage-cos-dpr-distil/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.tar +download_checksum: e20ffbc8b5e7f760af31298aefeaebbd + +index_path: indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil-int8/ +index_type: hnsw +collection_class: JsonDenseVectorCollection +generator_class: HnswDenseVectorDocumentGenerator +index_threads: 16 +index_options: -M 16 -efC 100 -memoryBuffer 65536 -noMerge -quantize.int8 + +metrics: + - metric: AP@1000 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: TsvInt +topics: + - name: "[DL19 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl19 + path: topics.dl19-passage.txt + qrel: qrels.dl19-passage.txt + +models: + - name: cos-dpr-distil-hnsw + display: cosDPR-distil + type: hnsw + params: -generator VectorQueryGenerator -topicField title -threads 16 -hits 1000 -efSearch 1000 -encoder CosDprDistil + results: + AP@1000: + - 0.458 + nDCG@10: + - 0.717 + R@100: + - 0.605 + R@1000: + - 0.805 diff --git a/src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw-int8.yaml b/src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw-int8.yaml new file mode 100644 index 0000000000..596ff276ed --- /dev/null +++ b/src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw-int8.yaml @@ -0,0 +1,65 @@ +--- +corpus: msmarco-passage-cos-dpr-distil +corpus_path: collections/msmarco/msmarco-passage-cos-dpr-distil/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.tar +download_checksum: 
e20ffbc8b5e7f760af31298aefeaebbd + +index_path: indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil-int8/ +index_type: hnsw +collection_class: JsonDenseVectorCollection +generator_class: HnswDenseVectorDocumentGenerator +index_threads: 16 +index_options: -M 16 -efC 100 -memoryBuffer 65536 -noMerge -quantize.int8 + +metrics: + - metric: AP@1000 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[DL19 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl19 + path: topics.dl19-passage.cos-dpr-distil.jsonl.gz + qrel: qrels.dl19-passage.txt + +models: + - name: cos-dpr-distil-hnsw + display: cosDPR-distil + type: hnsw + params: -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000 + results: + AP@1000: + - 0.458 + nDCG@10: + - 0.717 + R@100: + - 0.605 + R@1000: + - 0.805 diff --git a/src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw-onnx.yaml b/src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw-onnx.yaml index 31966a5457..8422854271 100644 --- a/src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw-onnx.yaml +++ b/src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw-onnx.yaml @@ -10,7 +10,7 @@ index_type: hnsw collection_class: JsonDenseVectorCollection generator_class: HnswDenseVectorDocumentGenerator index_threads: 16 -index_options: -M 16 -efC 100 -memoryBuffer 65536 +index_options: -M 16 -efC 100 -memoryBuffer 65536 -noMerge metrics: - metric: AP@1000 diff --git a/src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw.yaml b/src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw.yaml index 8b38c5c1b2..fa55081708 100644 --- a/src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw.yaml +++ b/src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw.yaml @@ -10,7 +10,7 @@ index_type: hnsw collection_class: JsonDenseVectorCollection generator_class: HnswDenseVectorDocumentGenerator index_threads: 16 -index_options: -M 16 -efC 100 -memoryBuffer 65536 +index_options: -M 16 -efC 100 -memoryBuffer 65536 -noMerge metrics: - metric: AP@1000 diff --git a/src/main/resources/regression/dl19-passage-openai-ada2-int8.yaml b/src/main/resources/regression/dl19-passage-openai-ada2-int8.yaml new file mode 100644 index 0000000000..f05d1f2290 --- /dev/null +++ b/src/main/resources/regression/dl19-passage-openai-ada2-int8.yaml @@ -0,0 +1,65 @@ +--- +corpus: msmarco-passage-openai-ada2 +corpus_path: collections/msmarco/msmarco-passage-openai-ada2/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-openai-ada2.tar +download_checksum: a4d843d522ff3a3af7edbee789a63402 + +index_path: indexes/lucene-hnsw.msmarco-passage-openai-ada2-int8/ +index_type: hnsw +collection_class: JsonDenseVectorCollection +generator_class: HnswDenseVectorDocumentGenerator +index_threads: 16 +index_options: -M 16 -efC 100 -memoryBuffer 
65536 -noMerge -quantize.int8 + +metrics: + - metric: AP@1000 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[DL19 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl19 + path: topics.dl19-passage.openai-ada2.jsonl.gz + qrel: qrels.dl19-passage.txt + +models: + - name: openai-ada2 + display: OpenAI-ada2 + type: hnsw + params: -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000 + results: + AP@1000: + - 0.479 + nDCG@10: + - 0.704 + R@100: + - 0.624 + R@1000: + - 0.857 diff --git a/src/main/resources/regression/dl19-passage-openai-ada2.yaml b/src/main/resources/regression/dl19-passage-openai-ada2.yaml index 9667d0907c..2c4b0796f7 100644 --- a/src/main/resources/regression/dl19-passage-openai-ada2.yaml +++ b/src/main/resources/regression/dl19-passage-openai-ada2.yaml @@ -10,7 +10,7 @@ index_type: hnsw collection_class: JsonDenseVectorCollection generator_class: HnswDenseVectorDocumentGenerator index_threads: 16 -index_options: -M 16 -efC 100 -memoryBuffer 65536 +index_options: -M 16 -efC 100 -memoryBuffer 65536 -noMerge metrics: - metric: AP@1000 diff --git a/src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw-int8-onnx.yaml b/src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw-int8-onnx.yaml new file mode 100644 index 0000000000..4e64494b59 --- /dev/null +++ b/src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw-int8-onnx.yaml @@ -0,0 +1,65 @@ +--- +corpus: msmarco-passage-cos-dpr-distil +corpus_path: collections/msmarco/msmarco-passage-cos-dpr-distil/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.tar +download_checksum: e20ffbc8b5e7f760af31298aefeaebbd + +index_path: indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil-int8/ +index_type: hnsw +collection_class: JsonDenseVectorCollection +generator_class: HnswDenseVectorDocumentGenerator +index_threads: 16 +index_options: -M 16 -efC 100 -memoryBuffer 65536 -noMerge -quantize.int8 + +metrics: + - metric: AP@1000 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: TsvInt +topics: + - name: "[DL20 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl20 + path: topics.dl20.txt + qrel: qrels.dl20-passage.txt + 
+models: + - name: cos-dpr-distil-hnsw + display: cosDPR-distil + type: hnsw + params: -generator VectorQueryGenerator -topicField title -threads 16 -hits 1000 -efSearch 1000 -encoder CosDprDistil + results: + AP@1000: + - 0.482 + nDCG@10: + - 0.701 + R@100: + - 0.712 + R@1000: + - 0.843 diff --git a/src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw-int8.yaml b/src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw-int8.yaml new file mode 100644 index 0000000000..98968a983f --- /dev/null +++ b/src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw-int8.yaml @@ -0,0 +1,65 @@ +--- +corpus: msmarco-passage-cos-dpr-distil +corpus_path: collections/msmarco/msmarco-passage-cos-dpr-distil/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.tar +download_checksum: e20ffbc8b5e7f760af31298aefeaebbd + +index_path: indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil-int8/ +index_type: hnsw +collection_class: JsonDenseVectorCollection +generator_class: HnswDenseVectorDocumentGenerator +index_threads: 16 +index_options: -M 16 -efC 100 -memoryBuffer 65536 -noMerge -quantize.int8 + +metrics: + - metric: AP@1000 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[DL20 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl20 + path: topics.dl20.cos-dpr-distil.jsonl.gz + qrel: qrels.dl20-passage.txt + +models: + - name: cos-dpr-distil-hnsw + display: cosDPR-distil + type: hnsw + params: -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000 + results: + AP@1000: + - 0.482 + nDCG@10: + - 0.701 + R@100: + - 0.712 + R@1000: + - 0.843 diff --git a/src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw-onnx.yaml b/src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw-onnx.yaml index d2e0f89991..055abefa4b 100644 --- a/src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw-onnx.yaml +++ b/src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw-onnx.yaml @@ -10,7 +10,7 @@ index_type: hnsw collection_class: JsonDenseVectorCollection generator_class: HnswDenseVectorDocumentGenerator index_threads: 16 -index_options: -M 16 -efC 100 -memoryBuffer 65536 +index_options: -M 16 -efC 100 -memoryBuffer 65536 -noMerge metrics: - metric: AP@1000 diff --git a/src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw.yaml b/src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw.yaml index f120eadd61..f149679351 100644 --- a/src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw.yaml +++ b/src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw.yaml @@ -10,7 +10,7 @@ index_type: hnsw collection_class: JsonDenseVectorCollection generator_class: HnswDenseVectorDocumentGenerator index_threads: 16 -index_options: -M 16 -efC 100 -memoryBuffer 65536 +index_options: -M 16 -efC 100 -memoryBuffer 65536 
-noMerge metrics: - metric: AP@1000 diff --git a/src/main/resources/regression/dl20-passage-openai-ada2-int8.yaml b/src/main/resources/regression/dl20-passage-openai-ada2-int8.yaml new file mode 100644 index 0000000000..6f26a14fe7 --- /dev/null +++ b/src/main/resources/regression/dl20-passage-openai-ada2-int8.yaml @@ -0,0 +1,65 @@ +--- +corpus: msmarco-passage-openai-ada2 +corpus_path: collections/msmarco/msmarco-passage-openai-ada2/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-openai-ada2.tar +download_checksum: a4d843d522ff3a3af7edbee789a63402 + +index_path: indexes/lucene-hnsw.msmarco-passage-openai-ada2-int8/ +index_type: hnsw +collection_class: JsonDenseVectorCollection +generator_class: HnswDenseVectorDocumentGenerator +index_threads: 16 +index_options: -M 16 -efC 100 -memoryBuffer 65536 -noMerge -quantize.int8 + +metrics: + - metric: AP@1000 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[DL20 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl20 + path: topics.dl20-passage.openai-ada2.jsonl.gz + qrel: qrels.dl20-passage.txt + +models: + - name: openai-ada2 + display: OpenAI-ada2 + type: hnsw + params: -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000 + results: + AP@1000: + - 0.477 + nDCG@10: + - 0.676 + R@100: + - 0.723 + R@1000: + - 0.867 diff --git a/src/main/resources/regression/dl20-passage-openai-ada2.yaml b/src/main/resources/regression/dl20-passage-openai-ada2.yaml index 152d18765c..ff7d16aa64 100644 --- a/src/main/resources/regression/dl20-passage-openai-ada2.yaml +++ b/src/main/resources/regression/dl20-passage-openai-ada2.yaml @@ -10,7 +10,7 @@ index_type: hnsw collection_class: JsonDenseVectorCollection generator_class: HnswDenseVectorDocumentGenerator index_threads: 16 -index_options: -M 16 -efC 100 -memoryBuffer 65536 +index_options: -M 16 -efC 100 -memoryBuffer 65536 -noMerge metrics: - metric: AP@1000 diff --git a/src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-int8-onnx.yaml b/src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-int8-onnx.yaml new file mode 100644 index 0000000000..1691c197c8 --- /dev/null +++ b/src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-int8-onnx.yaml @@ -0,0 +1,65 @@ +--- +corpus: msmarco-passage-cos-dpr-distil +corpus_path: collections/msmarco/msmarco-passage-cos-dpr-distil/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.tar +download_checksum: e20ffbc8b5e7f760af31298aefeaebbd + +index_path: indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil-int8/ +index_type: hnsw +collection_class: JsonDenseVectorCollection +generator_class: HnswDenseVectorDocumentGenerator +index_threads: 16 +index_options: -M 16 -efC 100 -memoryBuffer 65536 -noMerge -quantize.int8 + +metrics: + - metric: AP@1000 + command: 
tools/eval/trec_eval.9.0.4/trec_eval + params: -c -m map + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: RR@10 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -c -M 10 -m recip_rank + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -c -m recall.100 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -c -m recall.1000 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: TsvInt +topics: + - name: "[MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking)" + id: dev + path: topics.msmarco-passage.dev-subset.txt + qrel: qrels.msmarco-passage.dev-subset.txt + +models: + - name: cos-dpr-distil-hnsw + display: cosDPR-distil + type: hnsw + params: -generator VectorQueryGenerator -topicField title -threads 16 -hits 1000 -efSearch 1000 -encoder CosDprDistil + results: + AP@1000: + - 0.393 + RR@10: + - 0.388 + R@100: + - 0.903 + R@1000: + - 0.974 diff --git a/src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-onnx.yaml b/src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-onnx.yaml index 4746f389f3..372d40a67b 100644 --- a/src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-onnx.yaml +++ b/src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-onnx.yaml @@ -10,7 +10,7 @@ index_type: hnsw collection_class: JsonDenseVectorCollection generator_class: HnswDenseVectorDocumentGenerator index_threads: 16 -index_options: -M 16 -efC 100 -memoryBuffer 65536 +index_options: -M 16 -efC 100 -memoryBuffer 65536 -noMerge metrics: - metric: AP@1000 diff --git a/src/main/resources/regression/msmarco-passage-openai-ada2-int8.yaml b/src/main/resources/regression/msmarco-passage-openai-ada2-int8.yaml new file mode 100644 index 0000000000..9332504916 --- /dev/null +++ b/src/main/resources/regression/msmarco-passage-openai-ada2-int8.yaml @@ -0,0 +1,65 @@ +--- +corpus: msmarco-passage-openai-ada2 +corpus_path: collections/msmarco/msmarco-passage-openai-ada2/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-openai-ada2.tar +download_checksum: a4d843d522ff3a3af7edbee789a63402 + +index_path: indexes/lucene-hnsw.msmarco-passage-openai-ada2-int8/ +index_type: hnsw +collection_class: JsonDenseVectorCollection +generator_class: HnswDenseVectorDocumentGenerator +index_threads: 16 +index_options: -M 16 -efC 100 -memoryBuffer 65536 -noMerge -quantize.int8 + +metrics: + - metric: AP@1000 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -c -m map + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: RR@10 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -c -M 10 -m recip_rank + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -c -m recall.100 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -c -m recall.1000 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking)" + id: dev + path: topics.msmarco-passage.dev-subset.openai-ada2.jsonl.gz 
+ qrel: qrels.msmarco-passage.dev-subset.txt + +models: + - name: openai-ada2 + display: OpenAI-ada2 + type: hnsw + params: -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000 + results: + AP@1000: + - 0.350 + RR@10: + - 0.343 + R@100: + - 0.898 + R@1000: + - 0.985 diff --git a/src/main/resources/regression/msmarco-passage-openai-ada2.yaml b/src/main/resources/regression/msmarco-passage-openai-ada2.yaml index 08289ce2c0..5bd13b6d28 100644 --- a/src/main/resources/regression/msmarco-passage-openai-ada2.yaml +++ b/src/main/resources/regression/msmarco-passage-openai-ada2.yaml @@ -10,7 +10,7 @@ index_type: hnsw collection_class: JsonDenseVectorCollection generator_class: HnswDenseVectorDocumentGenerator index_threads: 16 -index_options: -M 16 -efC 100 -memoryBuffer 65536 +index_options: -M 16 -efC 100 -memoryBuffer 65536 -noMerge metrics: - metric: AP@1000 From cef6d3645a81fc6954e48d94ebc4bb66eeee8fac Mon Sep 17 00:00:00 2001 From: lintool Date: Thu, 14 Dec 2023 17:14:06 -0500 Subject: [PATCH 08/12] Added docs. --- README.md | 9 +- ...9-passage-cos-dpr-distil-hnsw-int8-onnx.md | 123 ++++++++++++++++++ ...s-dl19-passage-cos-dpr-distil-hnsw-int8.md | 121 +++++++++++++++++ ...s-dl19-passage-cos-dpr-distil-hnsw-onnx.md | 8 +- ...ssions-dl19-passage-cos-dpr-distil-hnsw.md | 4 +- ...gressions-dl19-passage-openai-ada2-int8.md | 121 +++++++++++++++++ .../regressions-dl19-passage-openai-ada2.md | 4 +- ...0-passage-cos-dpr-distil-hnsw-int8-onnx.md | 123 ++++++++++++++++++ ...s-dl20-passage-cos-dpr-distil-hnsw-int8.md | 121 +++++++++++++++++ ...s-dl20-passage-cos-dpr-distil-hnsw-onnx.md | 8 +- ...ssions-dl20-passage-cos-dpr-distil-hnsw.md | 4 +- ...gressions-dl20-passage-openai-ada2-int8.md | 121 +++++++++++++++++ .../regressions-dl20-passage-openai-ada2.md | 4 +- ...o-passage-cos-dpr-distil-hnsw-int8-onnx.md | 115 ++++++++++++++++ ...smarco-passage-cos-dpr-distil-hnsw-int8.md | 115 ++++++++++++++++ ...smarco-passage-cos-dpr-distil-hnsw-onnx.md | 8 +- ...ons-msmarco-passage-cos-dpr-distil-hnsw.md | 4 +- ...ssions-msmarco-passage-openai-ada2-int8.md | 114 ++++++++++++++++ ...regressions-msmarco-passage-openai-ada2.md | 4 +- ...age-cos-dpr-distil-hnsw-int8-onnx.template | 101 ++++++++++++++ ...-passage-cos-dpr-distil-hnsw-int8.template | 99 ++++++++++++++ ...-passage-cos-dpr-distil-hnsw-onnx.template | 8 +- .../dl19-passage-cos-dpr-distil-hnsw.template | 4 +- .../dl19-passage-openai-ada2-int8.template | 99 ++++++++++++++ .../dl19-passage-openai-ada2.template | 4 +- ...age-cos-dpr-distil-hnsw-int8-onnx.template | 101 ++++++++++++++ ...-passage-cos-dpr-distil-hnsw-int8.template | 99 ++++++++++++++ ...-passage-cos-dpr-distil-hnsw-onnx.template | 8 +- .../dl20-passage-cos-dpr-distil-hnsw.template | 4 +- .../dl20-passage-openai-ada2-int8.template | 99 ++++++++++++++ .../dl20-passage-openai-ada2.template | 4 +- ...age-cos-dpr-distil-hnsw-int8-onnx.template | 93 +++++++++++++ ...-passage-cos-dpr-distil-hnsw-int8.template | 93 +++++++++++++ ...-passage-cos-dpr-distil-hnsw-onnx.template | 8 +- ...marco-passage-cos-dpr-distil-hnsw.template | 4 +- .../msmarco-passage-openai-ada2-int8.template | 92 +++++++++++++ .../msmarco-passage-openai-ada2.template | 4 +- 37 files changed, 2028 insertions(+), 27 deletions(-) create mode 100644 docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw-int8-onnx.md create mode 100644 docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw-int8.md create mode 100644 docs/regressions/regressions-dl19-passage-openai-ada2-int8.md create 
mode 100644 docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw-int8-onnx.md create mode 100644 docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw-int8.md create mode 100644 docs/regressions/regressions-dl20-passage-openai-ada2-int8.md create mode 100644 docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw-int8-onnx.md create mode 100644 docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw-int8.md create mode 100644 docs/regressions/regressions-msmarco-passage-openai-ada2-int8.md create mode 100644 src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw-int8-onnx.template create mode 100644 src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw-int8.template create mode 100644 src/main/resources/docgen/templates/dl19-passage-openai-ada2-int8.template create mode 100644 src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw-int8-onnx.template create mode 100644 src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw-int8.template create mode 100644 src/main/resources/docgen/templates/dl20-passage-openai-ada2-int8.template create mode 100644 src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw-int8-onnx.template create mode 100644 src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw-int8.template create mode 100644 src/main/resources/docgen/templates/msmarco-passage-openai-ada2-int8.template diff --git a/README.md b/README.md index 93c1b1a925..ca9361b895 100644 --- a/README.md +++ b/README.md @@ -89,9 +89,12 @@ See individual pages for details! | SPLADE++ CoCondenser-SelfDistil | [✓](docs/regressions/regressions-msmarco-passage-splade-pp-sd.md) | [✓](docs/regressions/regressions-dl19-passage-splade-pp-sd.md) | [✓](docs/regressions/regressions-dl20-passage-splade-pp-sd.md) | | SPLADE++ CoCondenser-SelfDistil (ONNX) | [✓](docs/regressions/regressions-msmarco-passage-splade-pp-sd-onnx.md) | [✓](docs/regressions/regressions-dl19-passage-splade-pp-sd-onnx.md) | [✓](docs/regressions/regressions-dl20-passage-splade-pp-sd-onnx.md) | | **Learned Dense** (HNSW) | | | | -| cosDPR-distil w/ HNSW | [✓](docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw.md) | [✓](docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw.md) | [✓](docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw.md) | -| cosDPR-distil w/ HSNW (ONNX) | [✓](docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw-onnx.md) | [✓](docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw-onnx.md) | [✓](docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw-onnx.md) | -| OpenAI-ada2 w/ HNSW | [✓](docs/regressions/regressions-msmarco-passage-openai-ada2.md) | [✓](docs/regressions/regressions-dl19-passage-openai-ada2.md) | [✓](docs/regressions/regressions-dl20-passage-openai-ada2.md) | +| cosDPR-distil w/ HNSW fp32 | [✓](docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw.md) | [✓](docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw.md) | [✓](docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw.md) | +| cosDPR-distil w/ HNSW int8 | [✓](docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw-int8.md) | [✓](docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw-int8.md) | [✓](docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw-int8.md) | +| cosDPR-distil w/ HSNW fp32 (ONNX) | [✓](docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw-onnx.md) | 
[✓](docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw-onnx.md) | [✓](docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw-onnx.md) | +| cosDPR-distil w/ HSNW int8 (ONNX) | [✓](docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw-int8-onnx.md) | [✓](docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw-int8-onnx.md) | [✓](docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw-int8-onnx.md) | +| OpenAI Ada2 w/ HNSW fp32 | [✓](docs/regressions/regressions-msmarco-passage-openai-ada2.md) | [✓](docs/regressions/regressions-dl19-passage-openai-ada2.md) | [✓](docs/regressions/regressions-dl20-passage-openai-ada2.md) | +| OpenAI Ada2 w/ HNSW int8 | [✓](docs/regressions/regressions-msmarco-passage-openai-ada2-int8.md) | [✓](docs/regressions/regressions-dl19-passage-openai-ada2-int8.md) | [✓](docs/regressions/regressions-dl20-passage-openai-ada2-int8.md) | | **Learned Dense** (Inverted; experimental) | | | | | cosDPR-distil w/ "fake words" | [✓](docs/regressions/regressions-msmarco-passage-cos-dpr-distil-fw.md) | [✓](docs/regressions/regressions-dl19-passage-cos-dpr-distil-fw.md) | [✓](docs/regressions/regressions-dl20-passage-cos-dpr-distil-fw.md) | | cosDPR-distil w/ "LexLSH" | [✓](docs/regressions/regressions-msmarco-passage-cos-dpr-distil-lexlsh.md) | [✓](docs/regressions/regressions-dl19-passage-cos-dpr-distil-lexlsh.md) | [✓](docs/regressions/regressions-dl20-passage-cos-dpr-distil-lexlsh.md) | diff --git a/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw-int8-onnx.md b/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw-int8-onnx.md new file mode 100644 index 0000000000..24f529ef99 --- /dev/null +++ b/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw-int8-onnx.md @@ -0,0 +1,123 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: cosDPR-distil with HNSW indexes (using ONNX for on-the-fly query encoding) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are performing query inference "on-the-fly" with ONNX. + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw-int8-onnx.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw-int8-onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
+
+From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-cos-dpr-distil-hnsw-int8-onnx
+```
+
+We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil.
+
+From any machine, the following command will download the corpus and perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --download --index --verify --search --regression dl19-passage-cos-dpr-distil-hnsw-int8-onnx
+```
+
+The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results.
+
+## Corpus Download
+
+Download the corpus and unpack into `collections/`:
+
+```bash
+wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.tar -P collections/
+tar xvf collections/msmarco-passage-cos-dpr-distil.tar -C collections/
+```
+
+To confirm, `msmarco-passage-cos-dpr-distil.tar` is 57 GB and has MD5 checksum `e20ffbc8b5e7f760af31298aefeaebbd`.
+With the corpus downloaded, the following command will perform the remaining steps below:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-cos-dpr-distil-hnsw-int8-onnx \
+  --corpus-path collections/msmarco-passage-cos-dpr-distil
+```
+
+## Indexing
+
+Sample indexing command, building HNSW indexes:
+
+```bash
+target/appassembler/bin/IndexHnswDenseVectors \
+  -collection JsonDenseVectorCollection \
+  -input /path/to/msmarco-passage-cos-dpr-distil \
+  -generator HnswDenseVectorDocumentGenerator \
+  -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil-int8/ \
+  -threads 16 -M 16 -efC 100 -memoryBuffer 65536 -noMerge -quantize.int8 \
+  >& logs/log.msmarco-passage-cos-dpr-distil &
+```
+
+The path `/path/to/msmarco-passage-cos-dpr-distil/` should point to the corpus downloaded above.
+Upon completion, we should have an index with 8,841,823 documents.
+
+Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments.
+This is because merging index segments is a costly operation and not worthwhile given our query set.
+Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increases the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory.
+See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact.
+
+## Retrieval
+
+Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
+The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track.
+The original data can be found [here](https://trec.nist.gov/data/deep2019.html).
+ +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +target/appassembler/bin/SearchHnswDenseVectors \ + -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil-int8/ \ + -topics tools/topics-and-qrels/topics.dl19-passage.txt \ + -topicReader TsvInt \ + -output runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl19-passage.txt \ + -generator VectorQueryGenerator -topicField title -threads 16 -hits 1000 -efSearch 1000 -encoder CosDprDistil & +``` + +Note that we are performing query inference "on-the-fly" with ONNX in these experiments. + +Evaluation can be performed using `trec_eval`: + +```bash +tools/eval/trec_eval.9.0.4/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl19-passage.txt +tools/eval/trec_eval.9.0.4/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl19-passage.txt +tools/eval/trec_eval.9.0.4/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl19-passage.txt +tools/eval/trec_eval.9.0.4/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl19-passage.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **cosDPR-distil**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.458 | +| **nDCG@10** | **cosDPR-distil**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.717 | +| **R@100** | **cosDPR-distil**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.605 | +| **R@1000** | **cosDPR-distil**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.805 | + +Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. +Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw-int8-onnx.yaml). + +Also note that retrieval metrics are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +Also, for computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw-int8-onnx.template) and run `bin/build.sh` to rebuild the documentation. 
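For readers who want to see what the `-noMerge` and `-quantize.int8` indexing options described in the regression guide above roughly correspond to at the Lucene level, the following is a minimal, self-contained sketch. It is not Anserini's actual indexer (Anserini wires these options through its own codec wrapper); the class name, field names, and toy vector are illustrative only. It builds a tiny index that selects Lucene 9.9's scalar-quantized HNSW vectors format and disables segment merging:

```java
import java.nio.file.Files;

import org.apache.lucene.codecs.KnnVectorsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99Codec;
import org.apache.lucene.codecs.lucene99.Lucene99HnswScalarQuantizedVectorsFormat;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.KnnFloatVectorField;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.NoMergePolicy;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class QuantizedHnswIndexSketch {
  public static void main(String[] args) throws Exception {
    final int M = 16;    // HNSW max connections per node (-M)
    final int efC = 100; // HNSW beam width at construction time (-efC)

    IndexWriterConfig config = new IndexWriterConfig()
        // Roughly what -quantize.int8 selects: the int8 scalar-quantized HNSW vectors format.
        .setCodec(new Lucene99Codec() {
          @Override
          public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
            return new Lucene99HnswScalarQuantizedVectorsFormat(M, efC);
          }
        })
        // Roughly what -noMerge selects: suppress all background segment merges.
        .setMergePolicy(NoMergePolicy.INSTANCE);

    try (Directory dir = FSDirectory.open(Files.createTempDirectory("hnsw-int8-sketch"));
         IndexWriter writer = new IndexWriter(dir, config)) {
      Document doc = new Document();
      doc.add(new StoredField("docid", "toy-doc-0"));
      // Toy 4-dimensional vector; the real corpora use 768-d (cosDPR-distil) or 1536-d (ada2) vectors.
      doc.add(new KnnFloatVectorField("vector",
          new float[] {0.1f, 0.2f, 0.3f, 0.4f}, VectorSimilarityFunction.DOT_PRODUCT));
      writer.addDocument(doc);
      writer.commit();
    }
  }
}
```

With this format, the float32 vectors are still written alongside the int8 copies, which is why the on-disk footprint grows even though only the quantized copies need to be memory-resident at search time.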
diff --git a/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw-int8.md b/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw-int8.md new file mode 100644 index 0000000000..bbad60de68 --- /dev/null +++ b/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw-int8.md @@ -0,0 +1,121 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: cosDPR-distil with HNSW indexes (using pre-encoded queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw-int8.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw-int8.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-cos-dpr-distil-hnsw-int8 +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl19-passage-cos-dpr-distil-hnsw-int8 +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.tar -P collections/ +tar xvf collections/msmarco-passage-cos-dpr-distil.tar -C collections/ +``` + +To confirm, `msmarco-passage-cos-dpr-distil.tar` is 57 GB and has MD5 checksum `e20ffbc8b5e7f760af31298aefeaebbd`. 
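If you prefer to verify the download programmatically rather than with a command-line `md5sum`, here is a throwaway Java sketch (not part of Anserini; the class name is made up) that streams the tarball through an MD5 digest. For an intact copy it should print the checksum listed above:

```java
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.security.MessageDigest;

public class Md5Check {
  public static void main(String[] args) throws Exception {
    Path tarball = Path.of("collections/msmarco-passage-cos-dpr-distil.tar");
    MessageDigest md5 = MessageDigest.getInstance("MD5");

    // Stream the (large) file through the digest in 1 MiB chunks.
    byte[] buffer = new byte[1 << 20];
    try (InputStream in = Files.newInputStream(tarball)) {
      for (int n = in.read(buffer); n != -1; n = in.read(buffer)) {
        md5.update(buffer, 0, n);
      }
    }

    StringBuilder hex = new StringBuilder();
    for (byte b : md5.digest()) {
      hex.append(String.format("%02x", b));
    }
    // Expect e20ffbc8b5e7f760af31298aefeaebbd for an intact download.
    System.out.println(hex);
  }
}
```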
+With the corpus downloaded, the following command will perform the remaining steps below:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-cos-dpr-distil-hnsw-int8 \
+  --corpus-path collections/msmarco-passage-cos-dpr-distil
+```
+
+## Indexing
+
+Sample indexing command, building HNSW indexes:
+
+```bash
+target/appassembler/bin/IndexHnswDenseVectors \
+  -collection JsonDenseVectorCollection \
+  -input /path/to/msmarco-passage-cos-dpr-distil \
+  -generator HnswDenseVectorDocumentGenerator \
+  -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil-int8/ \
+  -threads 16 -M 16 -efC 100 -memoryBuffer 65536 -noMerge -quantize.int8 \
+  >& logs/log.msmarco-passage-cos-dpr-distil &
+```
+
+The path `/path/to/msmarco-passage-cos-dpr-distil/` should point to the corpus downloaded above.
+Upon completion, we should have an index with 8,841,823 documents.
+
+Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments.
+This is because merging index segments is a costly operation and not worthwhile given our query set.
+Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increases the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory.
+See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact.
+
+## Retrieval
+
+Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
+The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track.
+The original data can be found [here](https://trec.nist.gov/data/deep2019.html).
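As background for the qrels referenced here: they follow the standard four-column TREC format (`topic-id iteration doc-id grade`), with graded judgments, and the `-l 2` flag passed to `trec_eval` further below treats grade 1 as not relevant. The following small Java sketch (illustrative only; the local path is just where the anserini-tools submodule places the file) counts judged-relevant passages per topic under that cutoff:

```java
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.Map;

public class QrelsRelevantCounts {
  public static void main(String[] args) throws Exception {
    Path qrels = Path.of("tools/topics-and-qrels/qrels.dl19-passage.txt");
    int minRelevanceGrade = 2;  // mirrors trec_eval's -l 2 cutoff used for AP and recall

    Map<String, Integer> relevantPerTopic = new HashMap<>();
    for (String line : Files.readAllLines(qrels)) {
      // Format: <topic-id> <iteration> <doc-id> <grade>
      String[] cols = line.trim().split("\\s+");
      int grade = Integer.parseInt(cols[3]);
      if (grade >= minRelevanceGrade) {
        relevantPerTopic.merge(cols[0], 1, Integer::sum);
      }
    }
    relevantPerTopic.forEach((topic, count) ->
        System.out.println("topic " + topic + ": " + count + " relevant passages"));
  }
}
```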
+ +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +target/appassembler/bin/SearchHnswDenseVectors \ + -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil-int8/ \ + -topics tools/topics-and-qrels/topics.dl19-passage.cos-dpr-distil.jsonl.gz \ + -topicReader JsonIntVector \ + -output runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl19-passage.cos-dpr-distil.jsonl.txt \ + -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +tools/eval/trec_eval.9.0.4/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl19-passage.cos-dpr-distil.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl19-passage.cos-dpr-distil.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl19-passage.cos-dpr-distil.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl19-passage.cos-dpr-distil.jsonl.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **cosDPR-distil**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.458 | +| **nDCG@10** | **cosDPR-distil**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.717 | +| **R@100** | **cosDPR-distil**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.605 | +| **R@1000** | **cosDPR-distil**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.805 | + +Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. +Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw-int8.yaml). + +Also note that retrieval metrics are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +Also, for computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw-int8.template) and run `bin/build.sh` to rebuild the documentation. 
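To make the role of `-efSearch` and `-hits` in the retrieval commands above a bit more concrete, here is a minimal Lucene sketch of k-NN search against an HNSW vector field like the one built earlier. It is not Anserini's `SearchHnswDenseVectors` implementation; the index path, the toy query vector, and the assumption that the ef-search budget is passed as the `k` of `KnnFloatVectorQuery` (with the final ranking then cut to the requested number of hits) are illustrative only:

```java
import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.KnnFloatVectorQuery;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;

public class HnswSearchSketch {
  public static void main(String[] args) throws Exception {
    float[] queryVector = {0.1f, 0.2f, 0.3f, 0.4f};  // a pre-encoded query vector
    int efSearch = 1000;  // size of the candidate pool explored in the HNSW graph
    int hits = 10;        // how many results we actually keep

    try (FSDirectory dir = FSDirectory.open(Paths.get("indexes/toy-hnsw-int8"));
         DirectoryReader reader = DirectoryReader.open(dir)) {
      IndexSearcher searcher = new IndexSearcher(reader);
      TopDocs top = searcher.search(
          new KnnFloatVectorQuery("vector", queryVector, efSearch), hits);
      for (ScoreDoc sd : top.scoreDocs) {
        // sd.doc is Lucene's internal document id, not the collection docid.
        System.out.printf("lucene-doc=%d score=%.6f%n", sd.doc, sd.score);
      }
    }
  }
}
```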
diff --git a/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw-onnx.md b/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw-onnx.md index 5c3cb3c775..5335899358 100644 --- a/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw-onnx.md +++ b/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw-onnx.md @@ -6,7 +6,7 @@ This page describes regression experiments, integrated into Anserini's regressio > Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. -In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding). +In these experiments, we are performing query inference "on-the-fly" with ONNX. Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). @@ -62,9 +62,11 @@ target/appassembler/bin/IndexHnswDenseVectors \ ``` The path `/path/to/msmarco-passage-cos-dpr-distil/` should point to the corpus downloaded above. - Upon completion, we should have an index with 8,841,823 documents. +Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments. +This is because merging index segments is a costly operation and not worthwhile given our query set. + ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. @@ -82,6 +84,8 @@ target/appassembler/bin/SearchHnswDenseVectors \ -generator VectorQueryGenerator -topicField title -threads 16 -hits 1000 -efSearch 1000 -encoder CosDprDistil & ``` +Note that we are performing query inference "on-the-fly" with ONNX in these experiments. + Evaluation can be performed using `trec_eval`: ```bash diff --git a/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw.md b/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw.md index 8b9b91d211..9b533ff518 100644 --- a/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw.md +++ b/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw.md @@ -62,9 +62,11 @@ target/appassembler/bin/IndexHnswDenseVectors \ ``` The path `/path/to/msmarco-passage-cos-dpr-distil/` should point to the corpus downloaded above. - Upon completion, we should have an index with 8,841,823 documents. +Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments. +This is because merging index segments is a costly operation and not worthwhile given our query set. + ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. 
diff --git a/docs/regressions/regressions-dl19-passage-openai-ada2-int8.md b/docs/regressions/regressions-dl19-passage-openai-ada2-int8.md new file mode 100644 index 0000000000..761fa8bcc4 --- /dev/null +++ b/docs/regressions/regressions-dl19-passage-openai-ada2-int8.md @@ -0,0 +1,121 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: OpenAI-ada2 embeddings (using pre-encoded queries) with HNSW indexes + +This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Jimmy Lin, Ronak Pradeep, Tommaso Teofili, and Jasper Xian. [Vector Search with OpenAI Embeddings: Lucene Is All You Need.](https://arxiv.org/abs/2308.14963) _arXiv:2308.14963_, 2023. + +In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl19-passage-openai-ada2-int8.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl19-passage-openai-ada2-int8.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-openai-ada2-int8 +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with the OpenAI-ada2 embedding model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl19-passage-openai-ada2-int8 +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-openai-ada2.tar -P collections/ +tar xvf collections/msmarco-passage-openai-ada2.tar -C collections/ +``` + +To confirm, `msmarco-passage-openai-ada2.tar` is 109 GB and has MD5 checksum `a4d843d522ff3a3af7edbee789a63402`. 
+With the corpus downloaded, the following command will perform the remaining steps below:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-openai-ada2-int8 \
+  --corpus-path collections/msmarco-passage-openai-ada2
+```
+
+## Indexing
+
+Sample indexing command, building HNSW indexes:
+
+```bash
+target/appassembler/bin/IndexHnswDenseVectors \
+  -collection JsonDenseVectorCollection \
+  -input /path/to/msmarco-passage-openai-ada2 \
+  -generator HnswDenseVectorDocumentGenerator \
+  -index indexes/lucene-hnsw.msmarco-passage-openai-ada2-int8/ \
+  -threads 16 -M 16 -efC 100 -memoryBuffer 65536 -noMerge -quantize.int8 \
+  >& logs/log.msmarco-passage-openai-ada2 &
+```
+
+The path `/path/to/msmarco-passage-openai-ada2/` should point to the corpus downloaded above.
+Upon completion, we should have an index with 8,841,823 documents.
+
+Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments.
+This is because merging index segments is a costly operation and not worthwhile given our query set.
+Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increases the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory.
+See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact.
+
+## Retrieval
+
+Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
+The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track.
+The original data can be found [here](https://trec.nist.gov/data/deep2019.html).
+ +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +target/appassembler/bin/SearchHnswDenseVectors \ + -index indexes/lucene-hnsw.msmarco-passage-openai-ada2-int8/ \ + -topics tools/topics-and-qrels/topics.dl19-passage.openai-ada2.jsonl.gz \ + -topicReader JsonIntVector \ + -output runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.dl19-passage.openai-ada2.jsonl.txt \ + -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +tools/eval/trec_eval.9.0.4/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.dl19-passage.openai-ada2.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.dl19-passage.openai-ada2.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.dl19-passage.openai-ada2.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.dl19-passage.openai-ada2.jsonl.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **OpenAI-ada2**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.479 | +| **nDCG@10** | **OpenAI-ada2**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.704 | +| **R@100** | **OpenAI-ada2**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.624 | +| **R@1000** | **OpenAI-ada2**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.857 | + +Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. +Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/dl19-passage-openai-ada2-int8.yaml). + +Also note that retrieval metrics are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +Also, for computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl19-passage-openai-ada2-int8.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-dl19-passage-openai-ada2.md b/docs/regressions/regressions-dl19-passage-openai-ada2.md index 7df4357e6e..57f5ab8932 100644 --- a/docs/regressions/regressions-dl19-passage-openai-ada2.md +++ b/docs/regressions/regressions-dl19-passage-openai-ada2.md @@ -62,9 +62,11 @@ target/appassembler/bin/IndexHnswDenseVectors \ ``` The path `/path/to/msmarco-passage-openai-ada2/` should point to the corpus downloaded above. 
- Upon completion, we should have an index with 8,841,823 documents. +Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments. +This is because merging index segments is a costly operation and not worthwhile given our query set. + ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw-int8-onnx.md b/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw-int8-onnx.md new file mode 100644 index 0000000000..21c2f8cd12 --- /dev/null +++ b/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw-int8-onnx.md @@ -0,0 +1,123 @@ +# Anserini Regressions: TREC 2020 Deep Learning Track (Passage) + +**Model**: cosDPR-distil with HNSW indexes (using ONNX for on-the-fly query encoding) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are performing query inference "on-the-fly" with ONNX. + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw-int8-onnx.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw-int8-onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-cos-dpr-distil-hnsw-int8-onnx +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl20-passage-cos-dpr-distil-hnsw-int8-onnx +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. 
+
+## Corpus Download
+
+Download the corpus and unpack into `collections/`:
+
+```bash
+wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.tar -P collections/
+tar xvf collections/msmarco-passage-cos-dpr-distil.tar -C collections/
+```
+
+To confirm, `msmarco-passage-cos-dpr-distil.tar` is 57 GB and has MD5 checksum `e20ffbc8b5e7f760af31298aefeaebbd`.
+With the corpus downloaded, the following command will perform the remaining steps below:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-cos-dpr-distil-hnsw-int8-onnx \
+  --corpus-path collections/msmarco-passage-cos-dpr-distil
+```
+
+## Indexing
+
+Sample indexing command, building HNSW indexes:
+
+```bash
+target/appassembler/bin/IndexHnswDenseVectors \
+  -collection JsonDenseVectorCollection \
+  -input /path/to/msmarco-passage-cos-dpr-distil \
+  -generator HnswDenseVectorDocumentGenerator \
+  -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil-int8/ \
+  -threads 16 -M 16 -efC 100 -memoryBuffer 65536 -noMerge -quantize.int8 \
+  >& logs/log.msmarco-passage-cos-dpr-distil &
+```
+
+The path `/path/to/msmarco-passage-cos-dpr-distil/` should point to the corpus downloaded above.
+Upon completion, we should have an index with 8,841,823 documents.
+
+Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments.
+This is because merging index segments is a costly operation and not worthwhile given our query set.
+Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increases the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory.
+See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact.
+
+## Retrieval
+
+Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
+The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track.
+The original data can be found [here](https://trec.nist.gov/data/deep2020.html).
+
+After indexing has completed, you should be able to perform retrieval as follows:
+
+```bash
+target/appassembler/bin/SearchHnswDenseVectors \
+  -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil-int8/ \
+  -topics tools/topics-and-qrels/topics.dl20.txt \
+  -topicReader TsvInt \
+  -output runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl20.txt \
+  -generator VectorQueryGenerator -topicField title -threads 16 -hits 1000 -efSearch 1000 -encoder CosDprDistil &
+```
+
+Note that we are performing query inference "on-the-fly" with ONNX in these experiments.
+ +Evaluation can be performed using `trec_eval`: + +```bash +tools/eval/trec_eval.9.0.4/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl20.txt +tools/eval/trec_eval.9.0.4/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl20.txt +tools/eval/trec_eval.9.0.4/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl20.txt +tools/eval/trec_eval.9.0.4/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl20.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **cosDPR-distil**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.482 | +| **nDCG@10** | **cosDPR-distil**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.701 | +| **R@100** | **cosDPR-distil**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.712 | +| **R@1000** | **cosDPR-distil**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.843 | + +Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. +Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw-int8-onnx.yaml). + +Also note that retrieval metrics are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +Also, for computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw-int8-onnx.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw-int8.md b/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw-int8.md new file mode 100644 index 0000000000..cc9c2f14b6 --- /dev/null +++ b/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw-int8.md @@ -0,0 +1,121 @@ +# Anserini Regressions: TREC 2020 Deep Learning Track (Passage) + +**Model**: cosDPR-distil with HNSW indexes (using pre-encoded queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. 
[Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw-int8.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw-int8.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-cos-dpr-distil-hnsw-int8 +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl20-passage-cos-dpr-distil-hnsw-int8 +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.tar -P collections/ +tar xvf collections/msmarco-passage-cos-dpr-distil.tar -C collections/ +``` + +To confirm, `msmarco-passage-cos-dpr-distil.tar` is 57 GB and has MD5 checksum `e20ffbc8b5e7f760af31298aefeaebbd`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-cos-dpr-distil-hnsw-int8 \ + --corpus-path collections/msmarco-passage-cos-dpr-distil +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +target/appassembler/bin/IndexHnswDenseVectors \ + -collection JsonDenseVectorCollection \ + -input /path/to/msmarco-passage-cos-dpr-distil \ + -generator HnswDenseVectorDocumentGenerator \ + -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil-int8/ \ + -threads 16 -M 16 -efC 100 -memoryBuffer 65536 -noMerge -quantize.int8 \ + >& logs/log.msmarco-passage-cos-dpr-distil & +``` + +The path `/path/to/msmarco-passage-cos-dpr-distil/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments. 
+This is because merging index segments is a costly operation and not worthwhile given our query set.
+Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increases the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory.
+See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact.
+
+## Retrieval
+
+Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
+The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track.
+The original data can be found [here](https://trec.nist.gov/data/deep2020.html).
+
+After indexing has completed, you should be able to perform retrieval as follows:
+
+```bash
+target/appassembler/bin/SearchHnswDenseVectors \
+  -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil-int8/ \
+  -topics tools/topics-and-qrels/topics.dl20.cos-dpr-distil.jsonl.gz \
+  -topicReader JsonIntVector \
+  -output runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl20.cos-dpr-distil.jsonl.txt \
+  -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000 &
+```
+
+Evaluation can be performed using `trec_eval`:
+
+```bash
+tools/eval/trec_eval.9.0.4/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl20.cos-dpr-distil.jsonl.txt
+tools/eval/trec_eval.9.0.4/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl20.cos-dpr-distil.jsonl.txt
+tools/eval/trec_eval.9.0.4/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl20.cos-dpr-distil.jsonl.txt
+tools/eval/trec_eval.9.0.4/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl20.cos-dpr-distil.jsonl.txt
+```
+
+## Effectiveness
+
+With the above commands, you should be able to reproduce the following results:
+
+| **AP@1000** | **cosDPR-distil**|
+|:-------------------------------------------------------------------------------------------------------------|-----------|
+| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.482 |
+| **nDCG@10** | **cosDPR-distil**|
+| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.701 |
+| **R@100** | **cosDPR-distil**|
+| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.712 |
+| **R@1000** | **cosDPR-distil**|
+| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.843 |
+
+Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run.
+Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw-int8.yaml).
+
+Also note that retrieval metrics are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking).
+Also, for computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw-int8.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw-onnx.md b/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw-onnx.md index bb268c9d5b..a802d3370e 100644 --- a/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw-onnx.md +++ b/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw-onnx.md @@ -6,7 +6,7 @@ This page describes regression experiments, integrated into Anserini's regressio > Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. -In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding). +In these experiments, we are performing query inference "on-the-fly" with ONNX. Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). @@ -62,9 +62,11 @@ target/appassembler/bin/IndexHnswDenseVectors \ ``` The path `/path/to/msmarco-passage-cos-dpr-distil/` should point to the corpus downloaded above. - Upon completion, we should have an index with 8,841,823 documents. +Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments. +This is because merging index segments is a costly operation and not worthwhile given our query set. + ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. @@ -82,6 +84,8 @@ target/appassembler/bin/SearchHnswDenseVectors \ -generator VectorQueryGenerator -topicField title -threads 16 -hits 1000 -efSearch 1000 -encoder CosDprDistil & ``` +Note that we are performing query inference "on-the-fly" with ONNX in these experiments. + Evaluation can be performed using `trec_eval`: ```bash diff --git a/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw.md b/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw.md index e5fdd01e05..d67487d1e4 100644 --- a/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw.md +++ b/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw.md @@ -62,9 +62,11 @@ target/appassembler/bin/IndexHnswDenseVectors \ ``` The path `/path/to/msmarco-passage-cos-dpr-distil/` should point to the corpus downloaded above. - Upon completion, we should have an index with 8,841,823 documents. 
+Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments. +This is because merging index segments is a costly operation and not worthwhile given our query set. + ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/docs/regressions/regressions-dl20-passage-openai-ada2-int8.md b/docs/regressions/regressions-dl20-passage-openai-ada2-int8.md new file mode 100644 index 0000000000..46073ce731 --- /dev/null +++ b/docs/regressions/regressions-dl20-passage-openai-ada2-int8.md @@ -0,0 +1,121 @@ +# Anserini Regressions: TREC 2020 Deep Learning Track (Passage) + +**Model**: OpenAI-ada2 embeddings (using pre-encoded queries) with HNSW indexes + +This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Jimmy Lin, Ronak Pradeep, Tommaso Teofili, and Jasper Xian. [Vector Search with OpenAI Embeddings: Lucene Is All You Need.](https://arxiv.org/abs/2308.14963) _arXiv:2308.14963_, 2023. + +In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl20-passage-openai-ada2-int8.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl20-passage-openai-ada2-int8.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-openai-ada2-int8 +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with the OpenAI-ada2 embedding model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl20-passage-openai-ada2-int8 +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-openai-ada2.tar -P collections/ +tar xvf collections/msmarco-passage-openai-ada2.tar -C collections/ +``` + +To confirm, `msmarco-passage-openai-ada2.tar` is 109 GB and has MD5 checksum `a4d843d522ff3a3af7edbee789a63402`. 
+With the corpus downloaded, the following command will perform the remaining steps below:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-openai-ada2-int8 \
+  --corpus-path collections/msmarco-passage-openai-ada2
+```
+
+## Indexing
+
+Sample indexing command, building HNSW indexes:
+
+```bash
+target/appassembler/bin/IndexHnswDenseVectors \
+  -collection JsonDenseVectorCollection \
+  -input /path/to/msmarco-passage-openai-ada2 \
+  -generator HnswDenseVectorDocumentGenerator \
+  -index indexes/lucene-hnsw.msmarco-passage-openai-ada2-int8/ \
+  -threads 16 -M 16 -efC 100 -memoryBuffer 65536 -noMerge -quantize.int8 \
+  >& logs/log.msmarco-passage-openai-ada2 &
+```
+
+The path `/path/to/msmarco-passage-openai-ada2/` should point to the corpus downloaded above.
+Upon completion, we should have an index with 8,841,823 documents.
+
+Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments.
+This is because merging index segments is a costly operation and not worthwhile given our query set.
+Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increases the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory.
+See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact.
+
+## Retrieval
+
+Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
+The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track.
+The original data can be found [here](https://trec.nist.gov/data/deep2020.html).
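+
+If you want to spot-check the judgments themselves, the qrels file referenced in the evaluation commands below follows the standard TREC qrels layout (topic id, an unused iteration column, passage id, relevance grade); for example:
+
+```bash
+# peek at a few judgments; the fourth column is the graded relevance label
+head -5 tools/topics-and-qrels/qrels.dl20-passage.txt
+```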
+ +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +target/appassembler/bin/SearchHnswDenseVectors \ + -index indexes/lucene-hnsw.msmarco-passage-openai-ada2-int8/ \ + -topics tools/topics-and-qrels/topics.dl20-passage.openai-ada2.jsonl.gz \ + -topicReader JsonIntVector \ + -output runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.dl20-passage.openai-ada2.jsonl.txt \ + -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +tools/eval/trec_eval.9.0.4/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.dl20-passage.openai-ada2.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.dl20-passage.openai-ada2.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.dl20-passage.openai-ada2.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.dl20-passage.openai-ada2.jsonl.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **OpenAI-ada2**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.477 | +| **nDCG@10** | **OpenAI-ada2**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.676 | +| **R@100** | **OpenAI-ada2**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.723 | +| **R@1000** | **OpenAI-ada2**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.867 | + +Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. +Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/dl20-passage-openai-ada2-int8.yaml). + +Also note that retrieval metrics are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +Also, for computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl20-passage-openai-ada2-int8.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-dl20-passage-openai-ada2.md b/docs/regressions/regressions-dl20-passage-openai-ada2.md index 2a03bf25b3..ed3f4be6d7 100644 --- a/docs/regressions/regressions-dl20-passage-openai-ada2.md +++ b/docs/regressions/regressions-dl20-passage-openai-ada2.md @@ -62,9 +62,11 @@ target/appassembler/bin/IndexHnswDenseVectors \ ``` The path `/path/to/msmarco-passage-openai-ada2/` should point to the corpus downloaded above. 
- Upon completion, we should have an index with 8,841,823 documents. +Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments. +This is because merging index segments is a costly operation and not worthwhile given our query set. + ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw-int8-onnx.md b/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw-int8-onnx.md new file mode 100644 index 0000000000..f130af6512 --- /dev/null +++ b/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw-int8-onnx.md @@ -0,0 +1,115 @@ +# Anserini Regressions: MS MARCO Passage Ranking + +**Model**: cosDPR-distil with HNSW indexes (using ONNX for on-the-fly query encoding) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are performing query inference "on-the-fly" with ONNX. + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-int8-onnx.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw-int8-onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil-hnsw-int8-onnx +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-passage-cos-dpr-distil-hnsw-int8-onnx +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.tar -P collections/ +tar xvf collections/msmarco-passage-cos-dpr-distil.tar -C collections/ +``` + +To confirm, `msmarco-passage-cos-dpr-distil.tar` is 57 GB and has MD5 checksum `e20ffbc8b5e7f760af31298aefeaebbd`. 
+With the corpus downloaded, the following command will perform the remaining steps below:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil-hnsw-int8-onnx \
+  --corpus-path collections/msmarco-passage-cos-dpr-distil
+```
+
+## Indexing
+
+Sample indexing command, building HNSW indexes:
+
+```bash
+target/appassembler/bin/IndexHnswDenseVectors \
+  -collection JsonDenseVectorCollection \
+  -input /path/to/msmarco-passage-cos-dpr-distil \
+  -generator HnswDenseVectorDocumentGenerator \
+  -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil-int8/ \
+  -threads 16 -M 16 -efC 100 -memoryBuffer 65536 -noMerge -quantize.int8 \
+  >& logs/log.msmarco-passage-cos-dpr-distil &
+```
+
+The path `/path/to/msmarco-passage-cos-dpr-distil/` should point to the corpus downloaded above.
+Upon completion, we should have an index with 8,841,823 documents.
+
+Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments.
+This is because merging index segments is a costly operation and not worthwhile given our query set.
+Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increases the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory.
+See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact.
+
+## Retrieval
+
+Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
+The regression experiments here evaluate on the 6980 dev set questions; see [this page](../../docs/experiments-msmarco-passage.md) for more details.
+
+After indexing has completed, you should be able to perform retrieval as follows using HNSW indexes:
+
+```bash
+target/appassembler/bin/SearchHnswDenseVectors \
+  -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil-int8/ \
+  -topics tools/topics-and-qrels/topics.msmarco-passage.dev-subset.txt \
+  -topicReader TsvInt \
+  -output runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.msmarco-passage.dev-subset.txt \
+  -generator VectorQueryGenerator -topicField title -threads 16 -hits 1000 -efSearch 1000 -encoder CosDprDistil &
+```
+
+Note that we are performing query inference "on-the-fly" with ONNX in these experiments.
+ +Evaluation can be performed using `trec_eval`: + +```bash +tools/eval/trec_eval.9.0.4/trec_eval -c -m map tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.msmarco-passage.dev-subset.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -M 10 -m recip_rank tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.msmarco-passage.dev-subset.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.msmarco-passage.dev-subset.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.msmarco-passage.dev-subset.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **cosDPR-distil**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.393 | +| **RR@10** | **cosDPR-distil**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.388 | +| **R@100** | **cosDPR-distil**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.903 | +| **R@1000** | **cosDPR-distil**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.974 | + +Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. +Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-int8-onnx.yaml). + +## Reproduction Log[*](../../docs/reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw-int8-onnx.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw-int8.md b/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw-int8.md new file mode 100644 index 0000000000..e97fb49c17 --- /dev/null +++ b/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw-int8.md @@ -0,0 +1,115 @@ +# Anserini Regressions: MS MARCO Passage Ranking + +**Model**: cosDPR-distil with HNSW indexes (using pre-encoded queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding). 
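+
+Concretely, "pre-encoded" means the dev queries have already been run through the cosDPR-distil encoder offline, and retrieval simply reads the cached vectors; as a rough illustration (the exact JSON schema is whatever the `JsonIntVector` topic reader expects, so treat the details as an assumption), you can peek at one cached record:
+
+```bash
+# each line of the topics file is a JSON record holding a query id and its dense query vector,
+# consumed below via -topicReader JsonIntVector -topicField vector
+gunzip -c tools/topics-and-qrels/topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.gz | head -n 1
+```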
+
+The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-int8.yaml).
+Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw-int8.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation.
+
+From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil-hnsw-int8
+```
+
+We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil.
+
+From any machine, the following command will download the corpus and perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-passage-cos-dpr-distil-hnsw-int8
+```
+
+The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results.
+
+## Corpus Download
+
+Download the corpus and unpack into `collections/`:
+
+```bash
+wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.tar -P collections/
+tar xvf collections/msmarco-passage-cos-dpr-distil.tar -C collections/
+```
+
+To confirm, `msmarco-passage-cos-dpr-distil.tar` is 57 GB and has MD5 checksum `e20ffbc8b5e7f760af31298aefeaebbd`.
+With the corpus downloaded, the following command will perform the remaining steps below:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil-hnsw-int8 \
+  --corpus-path collections/msmarco-passage-cos-dpr-distil
+```
+
+## Indexing
+
+Sample indexing command, building HNSW indexes:
+
+```bash
+target/appassembler/bin/IndexHnswDenseVectors \
+  -collection JsonDenseVectorCollection \
+  -input /path/to/msmarco-passage-cos-dpr-distil \
+  -generator HnswDenseVectorDocumentGenerator \
+  -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil-int8/ \
+  -threads 16 -M 16 -efC 100 -memoryBuffer 65536 -noMerge -quantize.int8 \
+  >& logs/log.msmarco-passage-cos-dpr-distil &
+```
+
+The path `/path/to/msmarco-passage-cos-dpr-distil/` should point to the corpus downloaded above.
+Upon completion, we should have an index with 8,841,823 documents.
+
+Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments.
+This is because merging index segments is a costly operation and not worthwhile given our query set.
+Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increases the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory.
+See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact.
+
+## Retrieval
+
+Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
+The regression experiments here evaluate on the 6980 dev set questions; see [this page](../../docs/experiments-msmarco-passage.md) for more details. + +After indexing has completed, you should be able to perform retrieval as follows using HNSW indexes: + +```bash +target/appassembler/bin/SearchHnswDenseVectors \ + -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil-int8/ \ + -topics tools/topics-and-qrels/topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.gz \ + -topicReader JsonIntVector \ + -output runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt \ + -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +tools/eval/trec_eval.9.0.4/trec_eval -c -m map tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -M 10 -m recip_rank tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **cosDPR-distil**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.393 | +| **RR@10** | **cosDPR-distil**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.388 | +| **R@100** | **cosDPR-distil**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.903 | +| **R@1000** | **cosDPR-distil**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.974 | + +Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. +Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-int8.yaml). + +## Reproduction Log[*](../../docs/reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw-int8.template) and run `bin/build.sh` to rebuild the documentation. 
+ ++ Results reproduced by [@yilinjz](https://github.com/yilinjz) on 2023-09-01 (commit [`4ae518b`](https://github.com/castorini/anserini/commit/4ae518bb284ebcba0b273a473bc8774735cb7d19)) \ No newline at end of file diff --git a/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw-onnx.md b/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw-onnx.md index 0e395c8575..a8f41de6ee 100644 --- a/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw-onnx.md +++ b/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw-onnx.md @@ -6,7 +6,7 @@ This page describes regression experiments, integrated into Anserini's regressio > Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. -In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding). +In these experiments, we are performing query inference "on-the-fly" with ONNX. The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-onnx.yaml). Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw-onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. @@ -59,9 +59,11 @@ target/appassembler/bin/IndexHnswDenseVectors \ ``` The path `/path/to/msmarco-passage-cos-dpr-distil/` should point to the corpus downloaded above. - Upon completion, we should have an index with 8,841,823 documents. +Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments. +This is because merging index segments is a costly operation and not worthwhile given our query set. + ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. @@ -78,6 +80,8 @@ target/appassembler/bin/SearchHnswDenseVectors \ -generator VectorQueryGenerator -topicField title -threads 16 -hits 1000 -efSearch 1000 -encoder CosDprDistil & ``` +Note that we are performing query inference "on-the-fly" with ONNX in these experiments. + Evaluation can be performed using `trec_eval`: ```bash diff --git a/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw.md b/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw.md index 641106d14d..0d98114e80 100644 --- a/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw.md +++ b/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw.md @@ -59,9 +59,11 @@ target/appassembler/bin/IndexHnswDenseVectors \ ``` The path `/path/to/msmarco-passage-cos-dpr-distil/` should point to the corpus downloaded above. - Upon completion, we should have an index with 8,841,823 documents. +Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments. +This is because merging index segments is a costly operation and not worthwhile given our query set. 
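+
+As a rough, optional sanity check (not part of the regression itself), you can see the effect of disabling merges by inspecting the index directory once indexing finishes: with `NoMergePolicy`, it will hold the files of many small segments rather than a few large ones. The path below is an assumption based on the naming used elsewhere in these docs; adjust it to the `-index` path used in the indexing command above.
+
+```bash
+# total on-disk footprint of the HNSW index
+du -sh indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil/
+# count the per-segment files; expect a large number when merging is disabled
+ls indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil/ | wc -l
+```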
+ ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/docs/regressions/regressions-msmarco-passage-openai-ada2-int8.md b/docs/regressions/regressions-msmarco-passage-openai-ada2-int8.md new file mode 100644 index 0000000000..d5b0729069 --- /dev/null +++ b/docs/regressions/regressions-msmarco-passage-openai-ada2-int8.md @@ -0,0 +1,114 @@ +# Anserini Regressions: MS MARCO Passage Ranking + +**Model**: OpenAI-ada2 embeddings (using pre-encoded queries) with HNSW indexes + +This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper: + +> Jimmy Lin, Ronak Pradeep, Tommaso Teofili, and Jasper Xian. [Vector Search with OpenAI Embeddings: Lucene Is All You Need.](https://arxiv.org/abs/2308.14963) _arXiv:2308.14963_, 2023. + +In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/msmarco-passage-openai-ada2-int8.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/msmarco-passage-openai-ada2-int8.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-openai-ada2-int8 +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with the OpenAI-ada2 embedding model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-passage-openai-ada2-int8 +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-openai-ada2.tar -P collections/ +tar xvf collections/msmarco-passage-openai-ada2.tar -C collections/ +``` + +To confirm, `msmarco-passage-openai-ada2.tar` is 109 GB and has MD5 checksum `a4d843d522ff3a3af7edbee789a63402`. 
+With the corpus downloaded, the following command will perform the remaining steps below:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-openai-ada2-int8 \
+  --corpus-path collections/msmarco-passage-openai-ada2
+```
+
+## Indexing
+
+Sample indexing command, building HNSW indexes:
+
+```bash
+target/appassembler/bin/IndexHnswDenseVectors \
+  -collection JsonDenseVectorCollection \
+  -input /path/to/msmarco-passage-openai-ada2 \
+  -generator HnswDenseVectorDocumentGenerator \
+  -index indexes/lucene-hnsw.msmarco-passage-openai-ada2-int8/ \
+  -threads 16 -M 16 -efC 100 -memoryBuffer 65536 -noMerge -quantize.int8 \
+  >& logs/log.msmarco-passage-openai-ada2 &
+```
+
+The path `/path/to/msmarco-passage-openai-ada2/` should point to the corpus downloaded above.
+Upon completion, we should have an index with 8,841,823 documents.
+
+Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments.
+This is because merging index segments is a costly operation and not worthwhile given our query set.
+Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increases the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory.
+See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact.
+
+## Retrieval
+
+Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
+The regression experiments here evaluate on the 6980 dev set questions; see [this page](../../docs/experiments-msmarco-passage.md) for more details.
+ +After indexing has completed, you should be able to perform retrieval as follows using HNSW indexes: + +```bash +target/appassembler/bin/SearchHnswDenseVectors \ + -index indexes/lucene-hnsw.msmarco-passage-openai-ada2-int8/ \ + -topics tools/topics-and-qrels/topics.msmarco-passage.dev-subset.openai-ada2.jsonl.gz \ + -topicReader JsonIntVector \ + -output runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.msmarco-passage.dev-subset.openai-ada2.jsonl.txt \ + -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +tools/eval/trec_eval.9.0.4/trec_eval -c -m map tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.msmarco-passage.dev-subset.openai-ada2.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -M 10 -m recip_rank tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.msmarco-passage.dev-subset.openai-ada2.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.msmarco-passage.dev-subset.openai-ada2.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.msmarco-passage.dev-subset.openai-ada2.jsonl.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **OpenAI-ada2**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.350 | +| **RR@10** | **OpenAI-ada2**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.343 | +| **R@100** | **OpenAI-ada2**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.898 | +| **R@1000** | **OpenAI-ada2**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.985 | + +Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. +Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/msmarco-passage-openai-ada2-int8.yaml). + +## Reproduction Log[*](../../docs/reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/msmarco-passage-openai-ada2-int8.template) and run `bin/build.sh` to rebuild the documentation. + diff --git a/docs/regressions/regressions-msmarco-passage-openai-ada2.md b/docs/regressions/regressions-msmarco-passage-openai-ada2.md index c6e7be50ec..8f58da3521 100644 --- a/docs/regressions/regressions-msmarco-passage-openai-ada2.md +++ b/docs/regressions/regressions-msmarco-passage-openai-ada2.md @@ -59,9 +59,11 @@ target/appassembler/bin/IndexHnswDenseVectors \ ``` The path `/path/to/msmarco-passage-openai-ada2/` should point to the corpus downloaded above. - Upon completion, we should have an index with 8,841,823 documents. +Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments. 
+This is because merging index segments is a costly operation and not worthwhile given our query set. + ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw-int8-onnx.template b/src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw-int8-onnx.template new file mode 100644 index 0000000000..fd1fa8fa91 --- /dev/null +++ b/src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw-int8-onnx.template @@ -0,0 +1,101 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: cosDPR-distil with HNSW indexes (using ONNX for on-the-fly query encoding) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are performing query inference "on-the-fly" with ONNX. + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 57 GB and has MD5 checksum `${download_checksum}`. 
+With the corpus downloaded, the following command will perform the remaining steps below:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \
+  --corpus-path collections/${corpus}
+```
+
+## Indexing
+
+Sample indexing command, building HNSW indexes:
+
+```bash
+${index_cmds}
+```
+
+The path `/path/to/${corpus}/` should point to the corpus downloaded above.
+Upon completion, we should have an index with 8,841,823 documents.
+
+Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments.
+This is because merging index segments is a costly operation and not worthwhile given our query set.
+Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increases the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory.
+See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact.
+
+## Retrieval
+
+Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
+The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track.
+The original data can be found [here](https://trec.nist.gov/data/deep2019.html).
+
+After indexing has completed, you should be able to perform retrieval as follows:
+
+```bash
+${ranking_cmds}
+```
+
+Note that we are performing query inference "on-the-fly" with ONNX in these experiments.
+
+Evaluation can be performed using `trec_eval`:
+
+```bash
+${eval_cmds}
+```
+
+## Effectiveness
+
+With the above commands, you should be able to reproduce the following results:
+
+${effectiveness}
+
+Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run.
+Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}).
+
+Also note that retrieval metrics are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking).
+Also, for computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`).
+The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820).
+
+## Reproduction Log[*](reproducibility.md)
+
+To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation.
diff --git a/src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw-int8.template b/src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw-int8.template new file mode 100644 index 0000000000..0900647c40 --- /dev/null +++ b/src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw-int8.template @@ -0,0 +1,99 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: cosDPR-distil with HNSW indexes (using pre-encoded queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 57 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments. 
+This is because merging index segments is a costly operation and not worthwhile given our query set.
+Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increases the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory.
+See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact.
+
+## Retrieval
+
+Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
+The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track.
+The original data can be found [here](https://trec.nist.gov/data/deep2019.html).
+
+After indexing has completed, you should be able to perform retrieval as follows:
+
+```bash
+${ranking_cmds}
+```
+
+Evaluation can be performed using `trec_eval`:
+
+```bash
+${eval_cmds}
+```
+
+## Effectiveness
+
+With the above commands, you should be able to reproduce the following results:
+
+${effectiveness}
+
+Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run.
+Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}).
+
+Also note that retrieval metrics are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking).
+Also, for computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`).
+The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820).
+
+## Reproduction Log[*](reproducibility.md)
+
+To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation.
diff --git a/src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw-onnx.template b/src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw-onnx.template
index 355d54348f..07322676fc 100644
--- a/src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw-onnx.template
+++ b/src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw-onnx.template
@@ -6,7 +6,7 @@ This page describes regression experiments, integrated into Anserini's regressio
 
 > Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom.
 
-In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding).
+In these experiments, we are performing query inference "on-the-fly" with ONNX.
 
 Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast).
For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). @@ -56,9 +56,11 @@ ${index_cmds} ``` The path `/path/to/${corpus}/` should point to the corpus downloaded above. - Upon completion, we should have an index with 8,841,823 documents. +Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments. +This is because merging index segments is a costly operation and not worthwhile given our query set. + ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. @@ -71,6 +73,8 @@ After indexing has completed, you should be able to perform retrieval as follows ${ranking_cmds} ``` +Note that we are performing query inference "on-the-fly" with ONNX in these experiments. + Evaluation can be performed using `trec_eval`: ```bash diff --git a/src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw.template b/src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw.template index 81c68f7ec2..a1839cf6df 100644 --- a/src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw.template +++ b/src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw.template @@ -56,9 +56,11 @@ ${index_cmds} ``` The path `/path/to/${corpus}/` should point to the corpus downloaded above. - Upon completion, we should have an index with 8,841,823 documents. +Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments. +This is because merging index segments is a costly operation and not worthwhile given our query set. + ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/src/main/resources/docgen/templates/dl19-passage-openai-ada2-int8.template b/src/main/resources/docgen/templates/dl19-passage-openai-ada2-int8.template new file mode 100644 index 0000000000..f06a8132ea --- /dev/null +++ b/src/main/resources/docgen/templates/dl19-passage-openai-ada2-int8.template @@ -0,0 +1,99 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: OpenAI-ada2 embeddings (using pre-encoded queries) with HNSW indexes + +This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Jimmy Lin, Ronak Pradeep, Tommaso Teofili, and Jasper Xian. [Vector Search with OpenAI Embeddings: Lucene Is All You Need.](https://arxiv.org/abs/2308.14963) _arXiv:2308.14963_, 2023. + +In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). 
+Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation.
+
+From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression ${test_name}
+```
+
+We make available a version of the MS MARCO Passage Corpus that has already been encoded with the OpenAI-ada2 embedding model.
+
+From any machine, the following command will download the corpus and perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name}
+```
+
+The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results.
+
+## Corpus Download
+
+Download the corpus and unpack into `collections/`:
+
+```bash
+wget ${download_url} -P collections/
+tar xvf collections/${corpus}.tar -C collections/
+```
+
+To confirm, `${corpus}.tar` is 109 GB and has MD5 checksum `${download_checksum}`.
+With the corpus downloaded, the following command will perform the remaining steps below:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \
+  --corpus-path collections/${corpus}
+```
+
+## Indexing
+
+Sample indexing command, building HNSW indexes:
+
+```bash
+${index_cmds}
+```
+
+The path `/path/to/${corpus}/` should point to the corpus downloaded above.
+Upon completion, we should have an index with 8,841,823 documents.
+
+Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments.
+This is because merging index segments is a costly operation and not worthwhile given our query set.
+Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increases the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory.
+See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact.
+
+## Retrieval
+
+Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
+The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track.
+The original data can be found [here](https://trec.nist.gov/data/deep2019.html).
+
+After indexing has completed, you should be able to perform retrieval as follows:
+
+```bash
+${ranking_cmds}
+```
+
+Evaluation can be performed using `trec_eval`:
+
+```bash
+${eval_cmds}
+```
+
+## Effectiveness
+
+With the above commands, you should be able to reproduce the following results:
+
+${effectiveness}
+
+Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run.
+Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}).
+
+Also note that retrieval metrics are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking).
+Also, for computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`).
+The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820).
+
+## Reproduction Log[*](reproducibility.md)
+
+To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation.
diff --git a/src/main/resources/docgen/templates/dl19-passage-openai-ada2.template b/src/main/resources/docgen/templates/dl19-passage-openai-ada2.template
index 15fe5a605a..f84f7d2d1d 100644
--- a/src/main/resources/docgen/templates/dl19-passage-openai-ada2.template
+++ b/src/main/resources/docgen/templates/dl19-passage-openai-ada2.template
@@ -56,9 +56,11 @@ ${index_cmds}
 ```
 
 The path `/path/to/${corpus}/` should point to the corpus downloaded above.
-
 Upon completion, we should have an index with 8,841,823 documents.
 
+Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments.
+This is because merging index segments is a costly operation and not worthwhile given our query set.
+
 ## Retrieval
 
 Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
diff --git a/src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw-int8-onnx.template b/src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw-int8-onnx.template
new file mode 100644
index 0000000000..9179919a51
--- /dev/null
+++ b/src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw-int8-onnx.template
@@ -0,0 +1,101 @@
+# Anserini Regressions: TREC 2020 Deep Learning Track (Passage)
+
+**Model**: cosDPR-distil with HNSW indexes (using ONNX for on-the-fly query encoding)
+
+This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2020.html), as described in the following paper:
+
+> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom.
+
+In these experiments, we are performing query inference "on-the-fly" with ONNX.
+
+Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast).
+For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md).
+
+The exact configurations for these regressions are stored in [this YAML file](${yaml}).
+Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation.
+
+From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression ${test_name}
+```
+
+We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil.
+
+From any machine, the following command will download the corpus and perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name}
+```
+
+The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results.
+
+## Corpus Download
+
+Download the corpus and unpack into `collections/`:
+
+```bash
+wget ${download_url} -P collections/
+tar xvf collections/${corpus}.tar -C collections/
+```
+
+To confirm, `${corpus}.tar` is 57 GB and has MD5 checksum `${download_checksum}`.
+With the corpus downloaded, the following command will perform the remaining steps below:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \
+  --corpus-path collections/${corpus}
+```
+
+## Indexing
+
+Sample indexing command, building HNSW indexes:
+
+```bash
+${index_cmds}
+```
+
+The path `/path/to/${corpus}/` should point to the corpus downloaded above.
+Upon completion, we should have an index with 8,841,823 documents.
+
+Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments.
+This is because merging index segments is a costly operation and not worthwhile given our query set.
+Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increases the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory.
+See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact.
+
+## Retrieval
+
+Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
+The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track.
+The original data can be found [here](https://trec.nist.gov/data/deep2020.html).
+
+After indexing has completed, you should be able to perform retrieval as follows:
+
+```bash
+${ranking_cmds}
+```
+
+Note that we are performing query inference "on-the-fly" with ONNX in these experiments.
+
+Evaluation can be performed using `trec_eval`:
+
+```bash
+${eval_cmds}
+```
+
+## Effectiveness
+
+With the above commands, you should be able to reproduce the following results:
+
+${effectiveness}
+
+Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run.
+Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}).
+
+Also note that retrieval metrics are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking).
+Also, for computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`).
+The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820).
+
+## Reproduction Log[*](reproducibility.md)
+
+To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation.
diff --git a/src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw-int8.template b/src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw-int8.template
new file mode 100644
index 0000000000..e9e46d5fd7
--- /dev/null
+++ b/src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw-int8.template
@@ -0,0 +1,99 @@
+# Anserini Regressions: TREC 2020 Deep Learning Track (Passage)
+
+**Model**: cosDPR-distil with HNSW indexes (using pre-encoded queries)
+
+This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2020.html), as described in the following paper:
+
+> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom.
+
+In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding).
+
+Note that the NIST relevance judgments provide far more relevant passages per topic than the "sparse" judgments provided by Microsoft (the NIST judgments are sometimes called "dense" judgments to emphasize this contrast).
+For additional instructions on working with the MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md).
+
+The exact configurations for these regressions are stored in [this YAML file](${yaml}).
+Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation.
+
+From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression ${test_name}
+```
+
+We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil.
+
+From any machine, the following command will download the corpus and perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name}
+```
+
+The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results.
+
+## Corpus Download
+
+Download the corpus and unpack into `collections/`:
+
+```bash
+wget ${download_url} -P collections/
+tar xvf collections/${corpus}.tar -C collections/
+```
+
+To confirm, `${corpus}.tar` is 57 GB and has MD5 checksum `${download_checksum}`.
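+
+A quick way to double-check the download before unpacking, using standard Unix tools (shown here as a sketch; on macOS, use `md5` instead of `md5sum`):
+
+```bash
+# Should print the MD5 checksum listed above, i.e., ${download_checksum}.
+md5sum collections/${corpus}.tar
+# Should report a size of roughly 57 GB.
+du -h collections/${corpus}.tar
+```
+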
+With the corpus downloaded, the following command will perform the remaining steps below:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \
+  --corpus-path collections/${corpus}
+```
+
+## Indexing
+
+Sample indexing command, building HNSW indexes:
+
+```bash
+${index_cmds}
+```
+
+The path `/path/to/${corpus}/` should point to the corpus downloaded above.
+Upon completion, we should have an index with 8,841,823 documents.
+
+Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments.
+This is because merging index segments is a costly operation and not worthwhile given our query set.
+Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increases the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory.
+See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact.
+
+## Retrieval
+
+Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
+The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track.
+The original data can be found [here](https://trec.nist.gov/data/deep2020.html).
+
+After indexing has completed, you should be able to perform retrieval as follows:
+
+```bash
+${ranking_cmds}
+```
+
+Evaluation can be performed using `trec_eval`:
+
+```bash
+${eval_cmds}
+```
+
+## Effectiveness
+
+With the above commands, you should be able to reproduce the following results:
+
+${effectiveness}
+
+Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run.
+Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}).
+
+Also note that retrieval metrics are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking).
+Also, for computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`).
+The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820).
+
+## Reproduction Log[*](reproducibility.md)
+
+To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation.
diff --git a/src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw-onnx.template b/src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw-onnx.template
index 36fee4a8b0..e5b80bf511 100644
--- a/src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw-onnx.template
+++ b/src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw-onnx.template
@@ -6,7 +6,7 @@ This page describes regression experiments, integrated into Anserini's regressio
> Xueguang Ma, Tommaso Teofili, and Jimmy Lin.
[Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. -In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding). +In these experiments, we are performing query inference "on-the-fly" with ONNX. Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). @@ -56,9 +56,11 @@ ${index_cmds} ``` The path `/path/to/${corpus}/` should point to the corpus downloaded above. - Upon completion, we should have an index with 8,841,823 documents. +Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments. +This is because merging index segments is a costly operation and not worthwhile given our query set. + ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. @@ -71,6 +73,8 @@ After indexing has completed, you should be able to perform retrieval as follows ${ranking_cmds} ``` +Note that we are performing query inference "on-the-fly" with ONNX in these experiments. + Evaluation can be performed using `trec_eval`: ```bash diff --git a/src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw.template b/src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw.template index e3f6969a9f..2ec64f9a41 100644 --- a/src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw.template +++ b/src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw.template @@ -56,9 +56,11 @@ ${index_cmds} ``` The path `/path/to/${corpus}/` should point to the corpus downloaded above. - Upon completion, we should have an index with 8,841,823 documents. +Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments. +This is because merging index segments is a costly operation and not worthwhile given our query set. + ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/src/main/resources/docgen/templates/dl20-passage-openai-ada2-int8.template b/src/main/resources/docgen/templates/dl20-passage-openai-ada2-int8.template new file mode 100644 index 0000000000..033930bc14 --- /dev/null +++ b/src/main/resources/docgen/templates/dl20-passage-openai-ada2-int8.template @@ -0,0 +1,99 @@ +# Anserini Regressions: TREC 2020 Deep Learning Track (Passage) + +**Model**: OpenAI-ada2 embeddings (using pre-encoded queries) with HNSW indexes + +This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Jimmy Lin, Ronak Pradeep, Tommaso Teofili, and Jasper Xian. 
[Vector Search with OpenAI Embeddings: Lucene Is All You Need.](https://arxiv.org/abs/2308.14963) _arXiv:2308.14963_, 2023. + +In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with the OpenAI-ada2 embedding model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 109 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments. +This is because merging index segments is a costly operation and not worthwhile given our query set. +Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increase the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory. +See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track. 
+The original data can be found [here](https://trec.nist.gov/data/deep2020.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. +Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}). + +Also note that retrieval metrics are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +Also, for computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. diff --git a/src/main/resources/docgen/templates/dl20-passage-openai-ada2.template b/src/main/resources/docgen/templates/dl20-passage-openai-ada2.template index 4cd598d31b..069c3d41fe 100644 --- a/src/main/resources/docgen/templates/dl20-passage-openai-ada2.template +++ b/src/main/resources/docgen/templates/dl20-passage-openai-ada2.template @@ -56,9 +56,11 @@ ${index_cmds} ``` The path `/path/to/${corpus}/` should point to the corpus downloaded above. - Upon completion, we should have an index with 8,841,823 documents. +Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments. +This is because merging index segments is a costly operation and not worthwhile given our query set. + ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw-int8-onnx.template b/src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw-int8-onnx.template new file mode 100644 index 0000000000..a7e21103f1 --- /dev/null +++ b/src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw-int8-onnx.template @@ -0,0 +1,93 @@ +# Anserini Regressions: MS MARCO Passage Ranking + +**Model**: cosDPR-distil with HNSW indexes (using ONNX for on-the-fly query encoding) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are performing query inference "on-the-fly" with ONNX. + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). 
+Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 57 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments. +This is because merging index segments is a costly operation and not worthwhile given our query set. +Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increase the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory. +See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 6980 dev set questions; see [this page](${root_path}/docs/experiments-msmarco-passage.md) for more details. + +After indexing has completed, you should be able to perform retrieval as follows using HNSW indexes: + +```bash +${ranking_cmds} +``` + +Note that we are performing query inference "on-the-fly" with ONNX in these experiments. + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. +Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}). 
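+
+If you want to spot-check a run of your own against those reference values, the evaluation boils down to something like the following sketch (the `trec_eval` and qrels paths assume the standard Anserini layout with the `tools/` submodule checked out; the run-file name is just a placeholder for the output of your retrieval run):
+
+```bash
+# Compute MRR@10 on the dev subset; substitute the name of your own run file.
+tools/eval/trec_eval.9.0.4/trec_eval -c -M 10 -m recip_rank \
+  tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt \
+  runs/run.msmarco-passage.cos-dpr-distil-hnsw-int8-onnx.txt
+```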
+ +## Reproduction Log[*](${root_path}/docs/reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. diff --git a/src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw-int8.template b/src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw-int8.template new file mode 100644 index 0000000000..ae04c97479 --- /dev/null +++ b/src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw-int8.template @@ -0,0 +1,93 @@ +# Anserini Regressions: MS MARCO Passage Ranking + +**Model**: cosDPR-distil with HNSW indexes (using pre-encoded queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 57 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments. +This is because merging index segments is a costly operation and not worthwhile given our query set. 
+Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increase the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory. +See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 6980 dev set questions; see [this page](${root_path}/docs/experiments-msmarco-passage.md) for more details. + +After indexing has completed, you should be able to perform retrieval as follows using HNSW indexes: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. +Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}). + +## Reproduction Log[*](${root_path}/docs/reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. + ++ Results reproduced by [@yilinjz](https://github.com/yilinjz) on 2023-09-01 (commit [`4ae518b`](https://github.com/castorini/anserini/commit/4ae518bb284ebcba0b273a473bc8774735cb7d19)) \ No newline at end of file diff --git a/src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw-onnx.template b/src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw-onnx.template index bbc6f5a298..cad852f311 100644 --- a/src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw-onnx.template +++ b/src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw-onnx.template @@ -6,7 +6,7 @@ This page describes regression experiments, integrated into Anserini's regressio > Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. -In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding). +In these experiments, we are performing query inference "on-the-fly" with ONNX. The exact configurations for these regressions are stored in [this YAML file](${yaml}). Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. @@ -53,9 +53,11 @@ ${index_cmds} ``` The path `/path/to/${corpus}/` should point to the corpus downloaded above. - Upon completion, we should have an index with 8,841,823 documents. +Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments. 
+This is because merging index segments is a costly operation and not worthwhile given our query set. + ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. @@ -67,6 +69,8 @@ After indexing has completed, you should be able to perform retrieval as follows ${ranking_cmds} ``` +Note that we are performing query inference "on-the-fly" with ONNX in these experiments. + Evaluation can be performed using `trec_eval`: ```bash diff --git a/src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw.template b/src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw.template index 98b1ed5b42..f5d7400267 100644 --- a/src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw.template +++ b/src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw.template @@ -53,9 +53,11 @@ ${index_cmds} ``` The path `/path/to/${corpus}/` should point to the corpus downloaded above. - Upon completion, we should have an index with 8,841,823 documents. +Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments. +This is because merging index segments is a costly operation and not worthwhile given our query set. + ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/src/main/resources/docgen/templates/msmarco-passage-openai-ada2-int8.template b/src/main/resources/docgen/templates/msmarco-passage-openai-ada2-int8.template new file mode 100644 index 0000000000..d95af519b9 --- /dev/null +++ b/src/main/resources/docgen/templates/msmarco-passage-openai-ada2-int8.template @@ -0,0 +1,92 @@ +# Anserini Regressions: MS MARCO Passage Ranking + +**Model**: OpenAI-ada2 embeddings (using pre-encoded queries) with HNSW indexes + +This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper: + +> Jimmy Lin, Ronak Pradeep, Tommaso Teofili, and Jasper Xian. [Vector Search with OpenAI Embeddings: Lucene Is All You Need.](https://arxiv.org/abs/2308.14963) _arXiv:2308.14963_, 2023. + +In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with the OpenAI-ada2 embedding model. 
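+
+Since the pre-encoded corpus is large (the tarball alone is around 109 GB; see below), it may be worth confirming that the volume holding `collections/` has enough free space for both the tarball and the unpacked files before starting the download, e.g.:
+
+```bash
+# Check available disk space where the corpus will be downloaded and unpacked.
+df -h collections/
+```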
+ +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 109 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments. +This is because merging index segments is a costly operation and not worthwhile given our query set. +Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increase the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory. +See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 6980 dev set questions; see [this page](${root_path}/docs/experiments-msmarco-passage.md) for more details. + +After indexing has completed, you should be able to perform retrieval as follows using HNSW indexes: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. +Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}). + +## Reproduction Log[*](${root_path}/docs/reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. + diff --git a/src/main/resources/docgen/templates/msmarco-passage-openai-ada2.template b/src/main/resources/docgen/templates/msmarco-passage-openai-ada2.template index c1b658c309..7bf567dda7 100644 --- a/src/main/resources/docgen/templates/msmarco-passage-openai-ada2.template +++ b/src/main/resources/docgen/templates/msmarco-passage-openai-ada2.template @@ -53,9 +53,11 @@ ${index_cmds} ``` The path `/path/to/${corpus}/` should point to the corpus downloaded above. 
- Upon completion, we should have an index with 8,841,823 documents. +Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments. +This is because merging index segments is a costly operation and not worthwhile given our query set. + ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. From a1da98d783b739cf94dd73536b047fe15d70aa6a Mon Sep 17 00:00:00 2001 From: lintool Date: Sun, 17 Dec 2023 08:46:34 -0500 Subject: [PATCH 09/12] Upgrade to 9.9.1; made setRAMPerThreadHardLimitMB settable. --- pom.xml | 2 +- .../java/io/anserini/index/IndexHnswDenseVectors.java | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/pom.xml b/pom.xml index e4cd61b282..3e570aa2fe 100644 --- a/pom.xml +++ b/pom.xml @@ -26,7 +26,7 @@ - 9.9.0 + 9.9.1 UTF-8 diff --git a/src/main/java/io/anserini/index/IndexHnswDenseVectors.java b/src/main/java/io/anserini/index/IndexHnswDenseVectors.java index 4046d87009..161fbc4fac 100644 --- a/src/main/java/io/anserini/index/IndexHnswDenseVectors.java +++ b/src/main/java/io/anserini/index/IndexHnswDenseVectors.java @@ -29,7 +29,6 @@ import org.apache.lucene.index.ConcurrentMergeScheduler; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; -import org.apache.lucene.index.MergePolicy; import org.apache.lucene.index.NoMergePolicy; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; @@ -68,6 +67,11 @@ public static final class Args extends AbstractIndexer.Args { @Option(name = "-noMerge", usage = "Do not merge segments (fast indexing, slow retrieval).") public boolean noMerge = false; + @Option(name = "-maxThreadMemoryBeforeFlush", metaVar = "[num]", usage = "Maximum memory consumption per thread before triggering a forced flush (in MB); must be smaller than 2048.") + public int maxThreadMemoryBeforeFlush = 2047; + // This is the most aggressive possible setting; default is 1945. + // If the setting is too aggressive, may result in GCLocker issues. + @Option(name = "-maxMergedSegmentSize", metaVar = "[num]", usage = "Maximum sized segment to produce during normal merging (in MB).") public int maxMergedSegmentSize = 1024 * 16; @@ -115,7 +119,7 @@ public KnnVectorsFormat getKnnVectorsFormatForField(String field) { config.setOpenMode(IndexWriterConfig.OpenMode.CREATE); config.setRAMBufferSizeMB(args.memoryBuffer); - config.setRAMPerThreadHardLimitMB(2047); // Max possible value. + config.setRAMPerThreadHardLimitMB(args.maxThreadMemoryBeforeFlush); config.setUseCompoundFile(false); config.setMergeScheduler(new ConcurrentMergeScheduler()); @@ -149,6 +153,7 @@ public KnnVectorsFormat getKnnVectorsFormatForField(String field) { LOG.info(" + Store document vectors? " + args.storeVectors); LOG.info(" + Codec: " + this.writer.getConfig().getCodec()); LOG.info(" + MemoryBuffer: " + args.memoryBuffer); + LOG.info(" + MaxThreadMemoryBeforeFlush: " + args.maxThreadMemoryBeforeFlush); if (args.noMerge) { LOG.info(" + MergePolicy: NoMerge"); From ae8c68d58675cc9e1f9e6517ee0a689d6d09b110 Mon Sep 17 00:00:00 2001 From: lintool Date: Sun, 17 Dec 2023 10:25:58 -0500 Subject: [PATCH 10/12] Less strict score matching on HNSW tests. 
--- .../io/anserini/search/SearchHnswDenseVectorsTest.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/test/java/io/anserini/search/SearchHnswDenseVectorsTest.java b/src/test/java/io/anserini/search/SearchHnswDenseVectorsTest.java index 8e44e90761..2a9e2a9fab 100644 --- a/src/test/java/io/anserini/search/SearchHnswDenseVectorsTest.java +++ b/src/test/java/io/anserini/search/SearchHnswDenseVectorsTest.java @@ -305,7 +305,7 @@ public void testBasicAda2() throws Exception { "-hits", "5"}; SearchHnswDenseVectors.main(searchArgs); - TestUtils.checkFile(runfile, new String[] { + TestUtils.checkRunFileApproximate(runfile, new String[] { "160885 Q0 45 1 0.863064 Anserini", "160885 Q0 44 2 0.861596 Anserini", "160885 Q0 40 3 0.858651 Anserini", @@ -393,7 +393,7 @@ public void testBasicWithOnnx() throws Exception { SearchHnswDenseVectors.main(searchArgs); // Note output is slightly different from pre-encoded query vectors. - TestUtils.checkFile(runfile, new String[] { + TestUtils.checkRunFileApproximate(runfile, new String[] { "2 Q0 208 1 0.578723 Anserini", "2 Q0 224 2 0.578716 Anserini", "2 Q0 384 3 0.573913 Anserini", @@ -437,7 +437,7 @@ public void testRemoveQuery() throws Exception { "-removeQuery"}; SearchHnswDenseVectors.main(searchArgs); - TestUtils.checkFile(runfile, new String[] { + TestUtils.checkRunFileApproximate(runfile, new String[] { "10 Q0 45 1 0.846281 Anserini", "10 Q0 44 2 0.845236 Anserini", "10 Q0 95 3 0.845013 Anserini", @@ -480,7 +480,7 @@ public void testPassage() throws Exception { "-hits", "10"}; SearchHnswDenseVectors.main(searchArgs); - TestUtils.checkFile(runfile, new String[] { + TestUtils.checkRunFileApproximate(runfile, new String[] { "160885 Q0 44 1 0.863064 Anserini", "160885 Q0 40 2 0.858651 Anserini", "160885 Q0 48 3 0.858514 Anserini", From e6253e76d3ed44aa4a4b31b815c31db6fe3a5a69 Mon Sep 17 00:00:00 2001 From: lintool Date: Sun, 17 Dec 2023 15:01:29 -0500 Subject: [PATCH 11/12] added script/doc bindings for regressions; no openai-int8 --- docs/regressions.md | 24 ++- src/main/python/regressions-batch03.txt | 213 ++++++++++++------------ 2 files changed, 124 insertions(+), 113 deletions(-) diff --git a/docs/regressions.md b/docs/regressions.md index dec59541f6..36f7eb59f6 100644 --- a/docs/regressions.md +++ b/docs/regressions.md @@ -51,13 +51,15 @@ nohup python src/main/python/run_regression.py --index --verify --search --regre nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-splade-pp-ed >& logs/log.msmarco-passage-splade-pp-ed & nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-splade-pp-sd >& logs/log.msmarco-passage-splade-pp-sd & nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil-hnsw >& logs/log.msmarco-passage-cos-dpr-distil-hnsw & +nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil-hnsw-int8 >& logs/log.msmarco-passage-cos-dpr-distil-hnsw-int8 & nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil-fw >& logs/log.msmarco-passage-cos-dpr-distil-fw & nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil-lexlsh >& logs/log.msmarco-passage-cos-dpr-distil-lexlsh & nohup python src/main/python/run_regression.py --index --verify --search --regression 
msmarco-passage-openai-ada2 >& logs/log.msmarco-passage-openai-ada2 & -nohup python src/main/python/run_regression.py --search-pool 1 --index --verify --search --regression msmarco-passage-splade-pp-ed-onnx >& logs/log.msmarco-passage-splade-pp-ed-onnx & -nohup python src/main/python/run_regression.py --search-pool 1 --index --verify --search --regression msmarco-passage-splade-pp-sd-onnx >& logs/log.msmarco-passage-splade-pp-sd-onnx & -nohup python src/main/python/run_regression.py --search-pool 1 --index --verify --search --regression msmarco-passage-cos-dpr-distil-hnsw-onnx >& logs/log.msmarco-passage-cos-dpr-distil-hnsw-onnx & +nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-splade-pp-ed-onnx >& logs/log.msmarco-passage-splade-pp-ed-onnx & +nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-splade-pp-sd-onnx >& logs/log.msmarco-passage-splade-pp-sd-onnx & +nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil-hnsw-onnx >& logs/log.msmarco-passage-cos-dpr-distil-hnsw-onnx & +nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil-hnsw-int8-onnx >& logs/log.msmarco-passage-cos-dpr-distil-hnsw-int8-onnx & nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-doc >& logs/log.msmarco-doc & nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-doc-wp >& logs/log.msmarco-doc-wp & @@ -83,13 +85,15 @@ nohup python src/main/python/run_regression.py --index --verify --search --regre nohup python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-splade-pp-ed >& logs/log.dl19-passage-splade-pp-ed & nohup python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-splade-pp-sd >& logs/log.dl19-passage-splade-pp-sd & nohup python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-cos-dpr-distil-hnsw >& logs/log.dl19-passage-cos-dpr-distil-hnsw & +nohup python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-cos-dpr-distil-hnsw-int8 >& logs/log.dl19-passage-cos-dpr-distil-hnsw-int8 & nohup python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-cos-dpr-distil-fw >& logs/log.dl19-passage-cos-dpr-distil-fw & nohup python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-cos-dpr-distil-lexlsh >& logs/log.dl19-passage-cos-dpr-distil-lexlsh & nohup python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-openai-ada2 >& logs/log.dl19-passage-openai-ada2 & -nohup python src/main/python/run_regression.py --search-pool 1 --index --verify --search --regression dl19-passage-splade-pp-ed-onnx >& logs/log.dl19-passage-splade-pp-ed-onnx & -nohup python src/main/python/run_regression.py --search-pool 1 --index --verify --search --regression dl19-passage-splade-pp-sd-onnx >& logs/log.dl19-passage-splade-pp-sd-onnx & -nohup python src/main/python/run_regression.py --search-pool 1 --index --verify --search --regression dl19-passage-cos-dpr-distil-hnsw-onnx >& logs/log.dl19-passage-cos-dpr-distil-hnsw-onnx & +nohup python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-splade-pp-ed-onnx >& logs/log.dl19-passage-splade-pp-ed-onnx & 
+nohup python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-splade-pp-sd-onnx >& logs/log.dl19-passage-splade-pp-sd-onnx & +nohup python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-cos-dpr-distil-hnsw-onnx >& logs/log.dl19-passage-cos-dpr-distil-hnsw-onnx & +nohup python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-cos-dpr-distil-hnsw-int8-onnx >& logs/log.dl19-passage-cos-dpr-distil-hnsw-int8-onnx & nohup python src/main/python/run_regression.py --index --verify --search --regression dl19-doc >& logs/log.dl19-doc & nohup python src/main/python/run_regression.py --index --verify --search --regression dl19-doc-wp >& logs/log.dl19-doc-wp & @@ -115,13 +119,15 @@ nohup python src/main/python/run_regression.py --index --verify --search --regre nohup python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-splade-pp-ed >& logs/log.dl20-passage-splade-pp-ed & nohup python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-splade-pp-sd >& logs/log.dl20-passage-splade-pp-sd & nohup python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-cos-dpr-distil-hnsw >& logs/log.dl20-passage-cos-dpr-distil-hnsw & +nohup python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-cos-dpr-distil-hnsw-int8 >& logs/log.dl20-passage-cos-dpr-distil-hnsw-int8 & nohup python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-cos-dpr-distil-fw >& logs/log.dl20-passage-cos-dpr-distil-fw & nohup python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-cos-dpr-distil-lexlsh >& logs/log.dl20-passage-cos-dpr-distil-lexlsh & nohup python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-openai-ada2 >& logs/log.dl20-passage-openai-ada2 & -nohup python src/main/python/run_regression.py --search-pool 1 --index --verify --search --regression dl20-passage-splade-pp-ed-onnx >& logs/log.dl20-passage-splade-pp-ed-onnx & -nohup python src/main/python/run_regression.py --search-pool 1 --index --verify --search --regression dl20-passage-splade-pp-sd-onnx >& logs/log.dl20-passage-splade-pp-sd-onnx & -nohup python src/main/python/run_regression.py --search-pool 1 --index --verify --search --regression dl20-passage-cos-dpr-distil-hnsw-onnx >& logs/log.dl20-passage-cos-dpr-distil-hnsw-onnx & +nohup python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-splade-pp-ed-onnx >& logs/log.dl20-passage-splade-pp-ed-onnx & +nohup python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-splade-pp-sd-onnx >& logs/log.dl20-passage-splade-pp-sd-onnx & +nohup python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-cos-dpr-distil-hnsw-onnx >& logs/log.dl20-passage-cos-dpr-distil-hnsw-onnx & +nohup python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-cos-dpr-distil-hnsw-int8-onnx >& logs/log.dl20-passage-cos-dpr-distil-hnsw-int8-onnx & nohup python src/main/python/run_regression.py --index --verify --search --regression dl20-doc >& logs/log.dl20-doc & nohup python src/main/python/run_regression.py --index --verify --search --regression dl20-doc-wp >& logs/log.dl20-doc-wp & diff --git a/src/main/python/regressions-batch03.txt 
b/src/main/python/regressions-batch03.txt index baab7fc298..4a38ff5dfc 100644 --- a/src/main/python/regressions-batch03.txt +++ b/src/main/python/regressions-batch03.txt @@ -1,8 +1,10 @@ +python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil-hnsw > logs/log.msmarco-passage-cos-dpr-distil-hnsw 2>&1 +python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil-hnsw-int8 > logs/log.msmarco-passage-cos-dpr-distil-hnsw-int8 2>&1 +python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-openai-ada2 > logs/log.msmarco-passage-openai-ada2 2>&1 + # MS MARCO V1 passage python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-splade-pp-ed > logs/log.msmarco-passage-splade-pp-ed 2>&1 python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-splade-pp-sd > logs/log.msmarco-passage-splade-pp-sd 2>&1 -python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-openai-ada2 > logs/log.msmarco-passage-openai-ada2 2>&1 -python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil-hnsw > logs/log.msmarco-passage-cos-dpr-distil-hnsw 2>&1 python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil-fw > logs/log.msmarco-passage-cos-dpr-distil-fw 2>&1 python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil-lexlsh > logs/log.msmarco-passage-cos-dpr-distil-lexlsh 2>&1 @@ -20,6 +22,10 @@ python src/main/python/run_regression.py --index --verify --search --regression python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-distill-splade-max > logs/log.msmarco-passage-distill-splade-max 2>&1 python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-splade-distil-cocodenser-medium > logs/log.msmarco-passage-splade-distil-cocodenser-medium 2>&1 +# HNSW search-only +python src/main/python/run_regression.py --search --regression msmarco-passage-cos-dpr-distil-hnsw-onnx > logs/log.msmarco-passage-cos-dpr-distil-hnsw-onnx 2>&1 +python src/main/python/run_regression.py --search --regression msmarco-passage-cos-dpr-distil-hnsw-int8-onnx > logs/log.msmarco-passage-cos-dpr-distil-hnsw-int8-onnx 2>&1 + # MS MARCO V1 doc python src/main/python/run_regression.py --index --verify --search --regression msmarco-doc > logs/log.msmarco-doc 2>&1 python src/main/python/run_regression.py --index --verify --search --regression msmarco-doc-wp > logs/log.msmarco-doc-wp 2>&1 @@ -34,9 +40,8 @@ python src/main/python/run_regression.py --index --verify --search --regression python src/main/python/run_regression.py --index --verify --search --regression msmarco-doc-segmented-unicoil-noexp > logs/log.msmarco-doc-segmented-unicoil-noexp 2>&1 # MS MARCO V1 passage ONNX runs - uses same index, so need to make sure previous runs finish -python src/main/python/run_regression.py --search-pool 1 --verify --search --regression msmarco-passage-splade-pp-ed-onnx > logs/log.msmarco-passage-splade-pp-ed-onnx 2>&1 -python src/main/python/run_regression.py --search-pool 1 --verify --search --regression msmarco-passage-splade-pp-sd-onnx > logs/log.msmarco-passage-splade-pp-sd-onnx 2>&1 -python src/main/python/run_regression.py --search-pool 1 --verify --search --regression 
msmarco-passage-cos-dpr-distil-hnsw-onnx > logs/log.msmarco-passage-cos-dpr-distil-hnsw-onnx 2>&1 +python src/main/python/run_regression.py --search --regression msmarco-passage-splade-pp-ed-onnx > logs/log.msmarco-passage-splade-pp-ed-onnx 2>&1 +python src/main/python/run_regression.py --search --regression msmarco-passage-splade-pp-sd-onnx > logs/log.msmarco-passage-splade-pp-sd-onnx 2>&1 # MIRACL python src/main/python/run_regression.py --index --verify --search --regression miracl-v1.0-ar > logs/log.miracl-v1.0-ar 2>&1 @@ -121,107 +126,107 @@ python src/main/python/run_regression.py --index --verify --search --regression python src/main/python/run_regression.py --index --verify --search --regression mrtydi-v1.1-te-aca > logs/log.mrtydi-v1.1-te-aca 2>&1 python src/main/python/run_regression.py --index --verify --search --regression mrtydi-v1.1-th-aca > logs/log.mrtydi-v1.1-th-aca 2>&1 -# DL19 - ONNX -python src/main/python/run_regression.py --search-pool 1 --verify --search --regression dl19-passage-splade-pp-ed-onnx > logs/log.dl19-passage-splade-pp-ed-onnx 2>&1 -python src/main/python/run_regression.py --search-pool 1 --verify --search --regression dl19-passage-splade-pp-sd-onnx > logs/log.dl19-passage-splade-pp-sd-onnx 2>&1 -python src/main/python/run_regression.py --search-pool 1 --verify --search --regression dl19-passage-cos-dpr-distil-hnsw-onnx > logs/log.dl19-passage-cos-dpr-distil-hnsw-onnx 2>&1 - -# Other DL19 -python src/main/python/run_regression.py --verify --search --regression dl19-passage > logs/log.dl19-passage 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl19-passage-bm25-b8 > logs/log.dl19-passage-bm25-b8 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl19-passage-ca > logs/log.dl19-passage-ca 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl19-passage-wp > logs/log.dl19-passage-wp 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl19-passage-hgf-wp > logs/log.dl19-passage-hgf-wp 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl19-passage-docTTTTTquery > logs/log.dl19-passage-docTTTTTquery 2>&1 - -python src/main/python/run_regression.py --verify --search --regression dl19-passage-unicoil > logs/log.dl19-passage-unicoil 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl19-passage-unicoil-noexp > logs/log.dl19-passage-unicoil-noexp 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl19-passage-splade-distil-cocodenser-medium > logs/log.dl19-passage-splade-distil-cocodenser-medium 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl19-passage-splade-pp-ed > logs/log.dl19-passage-splade-pp-ed 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl19-passage-splade-pp-sd > logs/log.dl19-passage-splade-pp-sd 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl19-passage-cos-dpr-distil-hnsw > logs/log.dl19-passage-cos-dpr-distil-hnsw 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl19-passage-cos-dpr-distil-fw > logs/log.dl19-passage-cos-dpr-distil-fw 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl19-passage-cos-dpr-distil-lexlsh > logs/log.dl19-passage-cos-dpr-distil-lexlsh 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl19-passage-openai-ada2 > logs/log.dl19-passage-openai-ada2 2>&1 - 
-python src/main/python/run_regression.py --verify --search --regression dl19-doc > logs/log.dl19-doc 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl19-doc-ca > logs/log.dl19-doc-ca 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl19-doc-wp > logs/log.dl19-doc-wp 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl19-doc-hgf-wp > logs/log.dl19-doc-hgf-wp 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl19-doc-segmented-wp > logs/log.dl19-doc-segmented-wp 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl19-doc-docTTTTTquery > logs/log.dl19-doc-docTTTTTquery 2>&1 - -python src/main/python/run_regression.py --verify --search --regression dl19-doc-segmented > logs/log.dl19-doc-segmented 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl19-doc-segmented-ca > logs/log.dl19-doc-segmented-ca 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl19-doc-segmented-docTTTTTquery > logs/log.dl19-doc-segmented-docTTTTTquery 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl19-doc-segmented-unicoil > logs/log.dl19-doc-segmented-unicoil 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl19-doc-segmented-unicoil-noexp > logs/log.dl19-doc-segmented-unicoil-noexp 2>&1 - -# DL20 - ONNX -python src/main/python/run_regression.py --search-pool 1 --verify --search --regression dl20-passage-splade-pp-ed-onnx > logs/log.dl20-passage-splade-pp-ed-onnx 2>&1 -python src/main/python/run_regression.py --search-pool 1 --verify --search --regression dl20-passage-splade-pp-sd-onnx > logs/log.dl20-passage-splade-pp-sd-onnx 2>&1 -python src/main/python/run_regression.py --search-pool 1 --verify --search --regression dl20-passage-cos-dpr-distil-hnsw-onnx > logs/log.dl20-passage-cos-dpr-distil-hnsw-onnx 2>&1 - -# Other DL20 -python src/main/python/run_regression.py --verify --search --regression dl20-passage > logs/log.dl20-passage 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl20-passage-bm25-b8 > logs/log.dl20-passage-bm25-b8 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl20-passage-ca > logs/log.dl20-passage-ca 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl20-passage-wp > logs/log.dl20-passage-wp 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl20-passage-hgf-wp > logs/log.dl20-passage-hgf-wp 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl20-passage-docTTTTTquery > logs/log.dl20-passage-docTTTTTquery 2>&1 - -python src/main/python/run_regression.py --verify --search --regression dl20-passage-unicoil > logs/log.dl20-passage-unicoil 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl20-passage-unicoil-noexp > logs/log.dl20-passage-unicoil-noexp 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl20-passage-splade-distil-cocodenser-medium > logs/log.dl20-passage-splade-distil-cocodenser-medium 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl20-passage-splade-pp-ed > logs/log.dl20-passage-splade-pp-ed 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl20-passage-splade-pp-sd > logs/log.dl20-passage-splade-pp-sd 2>&1 -python src/main/python/run_regression.py --verify --search --regression 
dl20-passage-cos-dpr-distil-hnsw > logs/log.dl20-passage-cos-dpr-distil-hnsw 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl20-passage-cos-dpr-distil-fw > logs/log.dl20-passage-cos-dpr-distil-fw 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl20-passage-cos-dpr-distil-lexlsh > logs/log.dl20-passage-cos-dpr-distil-lexlsh 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl20-passage-openai-ada2 > logs/log.dl20-passage-openai-ada2 2>&1 - -python src/main/python/run_regression.py --verify --search --regression dl20-doc > logs/log.dl20-doc 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl20-doc-ca > logs/log.dl20-doc-ca 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl20-doc-wp > logs/log.dl20-doc-wp 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl20-doc-hgf-wp > logs/log.dl20-doc-hgf-wp 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl20-doc-docTTTTTquery > logs/log.dl20-doc-docTTTTTquery 2>&1 - -python src/main/python/run_regression.py --verify --search --regression dl20-doc-segmented > logs/log.dl20-doc-segmented 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl20-doc-segmented-ca > logs/log.dl20-doc-segmented-ca 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl20-doc-segmented-wp > logs/log.dl20-doc-segmented-wp 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl20-doc-segmented-docTTTTTquery > logs/log.dl20-doc-segmented-docTTTTTquery 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl20-doc-segmented-unicoil > logs/log.dl20-doc-segmented-unicoil 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl20-doc-segmented-unicoil-noexp > logs/log.dl20-doc-segmented-unicoil-noexp 2>&1 +# DL19 +python src/main/python/run_regression.py --search --regression dl19-passage > logs/log.dl19-passage 2>&1 +python src/main/python/run_regression.py --search --regression dl19-passage-bm25-b8 > logs/log.dl19-passage-bm25-b8 2>&1 +python src/main/python/run_regression.py --search --regression dl19-passage-ca > logs/log.dl19-passage-ca 2>&1 +python src/main/python/run_regression.py --search --regression dl19-passage-wp > logs/log.dl19-passage-wp 2>&1 +python src/main/python/run_regression.py --search --regression dl19-passage-hgf-wp > logs/log.dl19-passage-hgf-wp 2>&1 +python src/main/python/run_regression.py --search --regression dl19-passage-docTTTTTquery > logs/log.dl19-passage-docTTTTTquery 2>&1 +python src/main/python/run_regression.py --search --regression dl19-passage-splade-pp-ed-onnx > logs/log.dl19-passage-splade-pp-ed-onnx 2>&1 +python src/main/python/run_regression.py --search --regression dl19-passage-splade-pp-sd-onnx > logs/log.dl19-passage-splade-pp-sd-onnx 2>&1 + +python src/main/python/run_regression.py --search --regression dl19-passage-unicoil > logs/log.dl19-passage-unicoil 2>&1 +python src/main/python/run_regression.py --search --regression dl19-passage-unicoil-noexp > logs/log.dl19-passage-unicoil-noexp 2>&1 +python src/main/python/run_regression.py --search --regression dl19-passage-splade-distil-cocodenser-medium > logs/log.dl19-passage-splade-distil-cocodenser-medium 2>&1 +python src/main/python/run_regression.py --search --regression dl19-passage-splade-pp-ed > logs/log.dl19-passage-splade-pp-ed 2>&1 +python 
src/main/python/run_regression.py --search --regression dl19-passage-splade-pp-sd > logs/log.dl19-passage-splade-pp-sd 2>&1 +python src/main/python/run_regression.py --search --regression dl19-passage-cos-dpr-distil-hnsw > logs/log.dl19-passage-cos-dpr-distil-hnsw 2>&1 +python src/main/python/run_regression.py --search --regression dl19-passage-cos-dpr-distil-hnsw-int8 > logs/log.dl19-passage-cos-dpr-distil-hnsw-int8 2>&1 +python src/main/python/run_regression.py --search --regression dl19-passage-cos-dpr-distil-hnsw-onnx > logs/log.dl19-passage-cos-dpr-distil-hnsw-onnx 2>&1 +python src/main/python/run_regression.py --search --regression dl19-passage-cos-dpr-distil-hnsw-int8-onnx > logs/log.dl19-passage-cos-dpr-distil-hnsw-int8-onnx 2>&1 +python src/main/python/run_regression.py --search --regression dl19-passage-cos-dpr-distil-fw > logs/log.dl19-passage-cos-dpr-distil-fw 2>&1 +python src/main/python/run_regression.py --search --regression dl19-passage-cos-dpr-distil-lexlsh > logs/log.dl19-passage-cos-dpr-distil-lexlsh 2>&1 +python src/main/python/run_regression.py --search --regression dl19-passage-openai-ada2 > logs/log.dl19-passage-openai-ada2 2>&1 + +python src/main/python/run_regression.py --search --regression dl19-doc > logs/log.dl19-doc 2>&1 +python src/main/python/run_regression.py --search --regression dl19-doc-ca > logs/log.dl19-doc-ca 2>&1 +python src/main/python/run_regression.py --search --regression dl19-doc-wp > logs/log.dl19-doc-wp 2>&1 +python src/main/python/run_regression.py --search --regression dl19-doc-hgf-wp > logs/log.dl19-doc-hgf-wp 2>&1 +python src/main/python/run_regression.py --search --regression dl19-doc-segmented-wp > logs/log.dl19-doc-segmented-wp 2>&1 +python src/main/python/run_regression.py --search --regression dl19-doc-docTTTTTquery > logs/log.dl19-doc-docTTTTTquery 2>&1 + +python src/main/python/run_regression.py --search --regression dl19-doc-segmented > logs/log.dl19-doc-segmented 2>&1 +python src/main/python/run_regression.py --search --regression dl19-doc-segmented-ca > logs/log.dl19-doc-segmented-ca 2>&1 +python src/main/python/run_regression.py --search --regression dl19-doc-segmented-docTTTTTquery > logs/log.dl19-doc-segmented-docTTTTTquery 2>&1 +python src/main/python/run_regression.py --search --regression dl19-doc-segmented-unicoil > logs/log.dl19-doc-segmented-unicoil 2>&1 +python src/main/python/run_regression.py --search --regression dl19-doc-segmented-unicoil-noexp > logs/log.dl19-doc-segmented-unicoil-noexp 2>&1 + +# DL20 +python src/main/python/run_regression.py --search --regression dl20-passage > logs/log.dl20-passage 2>&1 +python src/main/python/run_regression.py --search --regression dl20-passage-bm25-b8 > logs/log.dl20-passage-bm25-b8 2>&1 +python src/main/python/run_regression.py --search --regression dl20-passage-ca > logs/log.dl20-passage-ca 2>&1 +python src/main/python/run_regression.py --search --regression dl20-passage-wp > logs/log.dl20-passage-wp 2>&1 +python src/main/python/run_regression.py --search --regression dl20-passage-hgf-wp > logs/log.dl20-passage-hgf-wp 2>&1 +python src/main/python/run_regression.py --search --regression dl20-passage-docTTTTTquery > logs/log.dl20-passage-docTTTTTquery 2>&1 +python src/main/python/run_regression.py --search --regression dl20-passage-splade-pp-ed-onnx > logs/log.dl20-passage-splade-pp-ed-onnx 2>&1 +python src/main/python/run_regression.py --search --regression dl20-passage-splade-pp-sd-onnx > logs/log.dl20-passage-splade-pp-sd-onnx 2>&1 + +python src/main/python/run_regression.py 
--search --regression dl20-passage-unicoil > logs/log.dl20-passage-unicoil 2>&1 +python src/main/python/run_regression.py --search --regression dl20-passage-unicoil-noexp > logs/log.dl20-passage-unicoil-noexp 2>&1 +python src/main/python/run_regression.py --search --regression dl20-passage-splade-distil-cocodenser-medium > logs/log.dl20-passage-splade-distil-cocodenser-medium 2>&1 +python src/main/python/run_regression.py --search --regression dl20-passage-splade-pp-ed > logs/log.dl20-passage-splade-pp-ed 2>&1 +python src/main/python/run_regression.py --search --regression dl20-passage-splade-pp-sd > logs/log.dl20-passage-splade-pp-sd 2>&1 +python src/main/python/run_regression.py --search --regression dl20-passage-cos-dpr-distil-hnsw > logs/log.dl20-passage-cos-dpr-distil-hnsw 2>&1 +python src/main/python/run_regression.py --search --regression dl20-passage-cos-dpr-distil-hnsw-int8 > logs/log.dl20-passage-cos-dpr-distil-hnsw-int8 2>&1 +python src/main/python/run_regression.py --search --regression dl20-passage-cos-dpr-distil-hnsw-onnx > logs/log.dl20-passage-cos-dpr-distil-hnsw-onnx 2>&1 +python src/main/python/run_regression.py --search --regression dl20-passage-cos-dpr-distil-hnsw-int8-onnx > logs/log.dl20-passage-cos-dpr-distil-hnsw-int8-onnx 2>&1 +python src/main/python/run_regression.py --search --regression dl20-passage-cos-dpr-distil-fw > logs/log.dl20-passage-cos-dpr-distil-fw 2>&1 +python src/main/python/run_regression.py --search --regression dl20-passage-cos-dpr-distil-lexlsh > logs/log.dl20-passage-cos-dpr-distil-lexlsh 2>&1 +python src/main/python/run_regression.py --search --regression dl20-passage-openai-ada2 > logs/log.dl20-passage-openai-ada2 2>&1 + +python src/main/python/run_regression.py --search --regression dl20-doc > logs/log.dl20-doc 2>&1 +python src/main/python/run_regression.py --search --regression dl20-doc-ca > logs/log.dl20-doc-ca 2>&1 +python src/main/python/run_regression.py --search --regression dl20-doc-wp > logs/log.dl20-doc-wp 2>&1 +python src/main/python/run_regression.py --search --regression dl20-doc-hgf-wp > logs/log.dl20-doc-hgf-wp 2>&1 +python src/main/python/run_regression.py --search --regression dl20-doc-docTTTTTquery > logs/log.dl20-doc-docTTTTTquery 2>&1 + +python src/main/python/run_regression.py --search --regression dl20-doc-segmented > logs/log.dl20-doc-segmented 2>&1 +python src/main/python/run_regression.py --search --regression dl20-doc-segmented-ca > logs/log.dl20-doc-segmented-ca 2>&1 +python src/main/python/run_regression.py --search --regression dl20-doc-segmented-wp > logs/log.dl20-doc-segmented-wp 2>&1 +python src/main/python/run_regression.py --search --regression dl20-doc-segmented-docTTTTTquery > logs/log.dl20-doc-segmented-docTTTTTquery 2>&1 +python src/main/python/run_regression.py --search --regression dl20-doc-segmented-unicoil > logs/log.dl20-doc-segmented-unicoil 2>&1 +python src/main/python/run_regression.py --search --regression dl20-doc-segmented-unicoil-noexp > logs/log.dl20-doc-segmented-unicoil-noexp 2>&1 # DL21/22 -python src/main/python/run_regression.py --verify --search --regression dl21-passage > logs/log.dl21-passage 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl21-passage-d2q-t5 > logs/log.dl21-passage-d2q-t5 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl21-passage-augmented > logs/log.dl21-passage-augmented 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl21-passage-augmented-d2q-t5 > 
logs/log.dl21-passage-augmented-d2q-t5 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl21-passage-unicoil-noexp-0shot > logs/log.dl21-passage-unicoil-noexp-0shot 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl21-passage-unicoil-0shot > logs/log.dl21-passage-unicoil-0shot 2>&1 - -python src/main/python/run_regression.py --verify --search --regression dl21-passage-splade-pp-ed > logs/log.dl21-passage-splade-pp-ed 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl21-passage-splade-pp-sd > logs/log.dl21-passage-splade-pp-sd 2>&1 - -python src/main/python/run_regression.py --verify --search --regression dl21-doc > logs/log.dl21-doc 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl21-doc-d2q-t5 > logs/log.dl21-doc-d2q-t5 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl21-doc-segmented > logs/log.dl21-doc-segmented 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl21-doc-segmented-d2q-t5 > logs/log.dl21-doc-segmented-d2q-t5 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl21-doc-segmented-unicoil-noexp-0shot > logs/log.dl21-doc-segmented-unicoil-noexp-0shot 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl21-doc-segmented-unicoil-noexp-0shot-v2 > logs/log.dl21-doc-segmented-unicoil-noexp-0shot-v2 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl21-doc-segmented-unicoil-0shot > logs/log.dl21-doc-segmented-unicoil-0shot 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl21-doc-segmented-unicoil-0shot-v2 > logs/log.dl21-doc-segmented-unicoil-0shot-v2 2>&1 - -python src/main/python/run_regression.py --verify --search --regression dl22-passage > logs/log.dl22-passage 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl22-passage-d2q-t5 > logs/log.dl22-passage-d2q-t5 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl22-passage-augmented > logs/log.dl22-passage-augmented 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl22-passage-augmented-d2q-t5 > logs/log.dl22-passage-augmented-d2q-t5 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl22-passage-unicoil-noexp-0shot > logs/log.dl22-passage-unicoil-noexp-0shot 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl22-passage-unicoil-0shot > logs/log.dl22-passage-unicoil-0shot 2>&1 - -python src/main/python/run_regression.py --verify --search --regression dl22-passage-splade-pp-ed > logs/log.dl22-passage-splade-pp-ed 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl22-passage-splade-pp-sd > logs/log.dl22-passage-splade-pp-sd 2>&1 +python src/main/python/run_regression.py --search --regression dl21-passage > logs/log.dl21-passage 2>&1 +python src/main/python/run_regression.py --search --regression dl21-passage-d2q-t5 > logs/log.dl21-passage-d2q-t5 2>&1 +python src/main/python/run_regression.py --search --regression dl21-passage-augmented > logs/log.dl21-passage-augmented 2>&1 +python src/main/python/run_regression.py --search --regression dl21-passage-augmented-d2q-t5 > logs/log.dl21-passage-augmented-d2q-t5 2>&1 +python src/main/python/run_regression.py --search --regression dl21-passage-unicoil-noexp-0shot > logs/log.dl21-passage-unicoil-noexp-0shot 2>&1 +python 
src/main/python/run_regression.py --search --regression dl21-passage-unicoil-0shot > logs/log.dl21-passage-unicoil-0shot 2>&1
+
+python src/main/python/run_regression.py --search --regression dl21-passage-splade-pp-ed > logs/log.dl21-passage-splade-pp-ed 2>&1
+python src/main/python/run_regression.py --search --regression dl21-passage-splade-pp-sd > logs/log.dl21-passage-splade-pp-sd 2>&1
+
+python src/main/python/run_regression.py --search --regression dl21-doc > logs/log.dl21-doc 2>&1
+python src/main/python/run_regression.py --search --regression dl21-doc-d2q-t5 > logs/log.dl21-doc-d2q-t5 2>&1
+python src/main/python/run_regression.py --search --regression dl21-doc-segmented > logs/log.dl21-doc-segmented 2>&1
+python src/main/python/run_regression.py --search --regression dl21-doc-segmented-d2q-t5 > logs/log.dl21-doc-segmented-d2q-t5 2>&1
+python src/main/python/run_regression.py --search --regression dl21-doc-segmented-unicoil-noexp-0shot > logs/log.dl21-doc-segmented-unicoil-noexp-0shot 2>&1
+python src/main/python/run_regression.py --search --regression dl21-doc-segmented-unicoil-noexp-0shot-v2 > logs/log.dl21-doc-segmented-unicoil-noexp-0shot-v2 2>&1
+python src/main/python/run_regression.py --search --regression dl21-doc-segmented-unicoil-0shot > logs/log.dl21-doc-segmented-unicoil-0shot 2>&1
+python src/main/python/run_regression.py --search --regression dl21-doc-segmented-unicoil-0shot-v2 > logs/log.dl21-doc-segmented-unicoil-0shot-v2 2>&1
+
+python src/main/python/run_regression.py --search --regression dl22-passage > logs/log.dl22-passage 2>&1
+python src/main/python/run_regression.py --search --regression dl22-passage-d2q-t5 > logs/log.dl22-passage-d2q-t5 2>&1
+python src/main/python/run_regression.py --search --regression dl22-passage-augmented > logs/log.dl22-passage-augmented 2>&1
+python src/main/python/run_regression.py --search --regression dl22-passage-augmented-d2q-t5 > logs/log.dl22-passage-augmented-d2q-t5 2>&1
+python src/main/python/run_regression.py --search --regression dl22-passage-unicoil-noexp-0shot > logs/log.dl22-passage-unicoil-noexp-0shot 2>&1
+python src/main/python/run_regression.py --search --regression dl22-passage-unicoil-0shot > logs/log.dl22-passage-unicoil-0shot 2>&1
+
+python src/main/python/run_regression.py --search --regression dl22-passage-splade-pp-ed > logs/log.dl22-passage-splade-pp-ed 2>&1
+python src/main/python/run_regression.py --search --regression dl22-passage-splade-pp-sd > logs/log.dl22-passage-splade-pp-sd 2>&1

 # CIRAL
 python src/main/python/run_regression.py --index --verify --search --regression ciral-v1.0-ha > logs/log.ciral-v1.0-ha 2>&1

From 2b6e14acd7a59af96b43b6e2e770eaa8bc350e09 Mon Sep 17 00:00:00 2001
From: lintool
Date: Tue, 19 Dec 2023 13:53:26 -0500
Subject: [PATCH 12/12] Addressed CR; added note about errors to OpenAI int8 indexes.
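
A note on the codec revert below: with no explicit `setCodec(...)`, `IndexWriterConfig` falls back to `Codec.getDefault()`, which already resolves to the Lucene99 codec on a Lucene 9.9 classpath, so dropping the explicit `new Lucene99Codec()` from the inverted-index writers should be behavior-preserving. The snippet is a minimal sanity-check sketch, not code from this patch; the class name and the choice of `StandardAnalyzer` are illustrative assumptions.

```java
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.index.IndexWriterConfig;

public class DefaultCodecCheck {
  public static void main(String[] args) {
    // No setCodec(...) call: the config inherits Codec.getDefault().
    IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer());

    // On a Lucene 9.9 classpath both lines should print "Lucene99".
    System.out.println("IndexWriterConfig codec: " + config.getCodec().getName());
    System.out.println("Codec.getDefault():      " + Codec.getDefault().getName());
  }
}
```

The HNSW indexer is a different case, since it overrides the codec to control the per-field KNN vectors format, and it is not touched by this patch.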
--- docs/regressions/regressions-dl19-passage-openai-ada2-int8.md | 2 ++ docs/regressions/regressions-dl20-passage-openai-ada2-int8.md | 2 ++ .../regressions/regressions-msmarco-passage-openai-ada2-int8.md | 2 ++ src/main/java/io/anserini/index/IndexCollection.java | 2 +- src/main/java/io/anserini/index/IndexInvertedDenseVectors.java | 2 +- .../docgen/templates/dl19-passage-openai-ada2-int8.template | 2 ++ .../docgen/templates/dl20-passage-openai-ada2-int8.template | 2 ++ .../docgen/templates/msmarco-passage-openai-ada2-int8.template | 2 ++ 8 files changed, 14 insertions(+), 2 deletions(-) diff --git a/docs/regressions/regressions-dl19-passage-openai-ada2-int8.md b/docs/regressions/regressions-dl19-passage-openai-ada2-int8.md index 761fa8bcc4..e71b7c03b8 100644 --- a/docs/regressions/regressions-dl19-passage-openai-ada2-int8.md +++ b/docs/regressions/regressions-dl19-passage-openai-ada2-int8.md @@ -1,5 +1,7 @@ # Anserini Regressions: TREC 2019 Deep Learning Track (Passage) +**NOTE:** We're currently having issues with this regression, which throws "Retried waiting for GCLocker too often" errors. + **Model**: OpenAI-ada2 embeddings (using pre-encoded queries) with HNSW indexes This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: diff --git a/docs/regressions/regressions-dl20-passage-openai-ada2-int8.md b/docs/regressions/regressions-dl20-passage-openai-ada2-int8.md index 46073ce731..beb86feb50 100644 --- a/docs/regressions/regressions-dl20-passage-openai-ada2-int8.md +++ b/docs/regressions/regressions-dl20-passage-openai-ada2-int8.md @@ -1,5 +1,7 @@ # Anserini Regressions: TREC 2020 Deep Learning Track (Passage) +**NOTE:** We're currently having issues with this regression, which throws "Retried waiting for GCLocker too often" errors. + **Model**: OpenAI-ada2 embeddings (using pre-encoded queries) with HNSW indexes This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: diff --git a/docs/regressions/regressions-msmarco-passage-openai-ada2-int8.md b/docs/regressions/regressions-msmarco-passage-openai-ada2-int8.md index d5b0729069..7b2053dec9 100644 --- a/docs/regressions/regressions-msmarco-passage-openai-ada2-int8.md +++ b/docs/regressions/regressions-msmarco-passage-openai-ada2-int8.md @@ -1,5 +1,7 @@ # Anserini Regressions: MS MARCO Passage Ranking +**NOTE:** We're currently having issues with this regression, which throws "Retried waiting for GCLocker too often" errors. 
+ **Model**: OpenAI-ada2 embeddings (using pre-encoded queries) with HNSW indexes This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper: diff --git a/src/main/java/io/anserini/index/IndexCollection.java b/src/main/java/io/anserini/index/IndexCollection.java index f80c778cd3..2242837027 100644 --- a/src/main/java/io/anserini/index/IndexCollection.java +++ b/src/main/java/io/anserini/index/IndexCollection.java @@ -177,7 +177,7 @@ public IndexCollection(Args args) throws Exception { } final Directory dir = FSDirectory.open(Paths.get(args.index)); - final IndexWriterConfig config = new IndexWriterConfig(getAnalyzer()).setCodec(new Lucene99Codec()); + final IndexWriterConfig config = new IndexWriterConfig(getAnalyzer()); if (args.bm25Accurate) { // Necessary during indexing as the norm used in BM25 is already determined at index time. diff --git a/src/main/java/io/anserini/index/IndexInvertedDenseVectors.java b/src/main/java/io/anserini/index/IndexInvertedDenseVectors.java index b415bf8efa..249c626a97 100644 --- a/src/main/java/io/anserini/index/IndexInvertedDenseVectors.java +++ b/src/main/java/io/anserini/index/IndexInvertedDenseVectors.java @@ -100,7 +100,7 @@ public IndexInvertedDenseVectors(Args args) { try { final Directory dir = FSDirectory.open(Paths.get(args.index)); - final IndexWriterConfig config = new IndexWriterConfig(analyzer).setCodec(new Lucene99Codec()); + final IndexWriterConfig config = new IndexWriterConfig(analyzer); config.setOpenMode(IndexWriterConfig.OpenMode.CREATE); config.setRAMBufferSizeMB(args.memoryBuffer); config.setUseCompoundFile(false); diff --git a/src/main/resources/docgen/templates/dl19-passage-openai-ada2-int8.template b/src/main/resources/docgen/templates/dl19-passage-openai-ada2-int8.template index f06a8132ea..99454b9d31 100644 --- a/src/main/resources/docgen/templates/dl19-passage-openai-ada2-int8.template +++ b/src/main/resources/docgen/templates/dl19-passage-openai-ada2-int8.template @@ -1,5 +1,7 @@ # Anserini Regressions: TREC 2019 Deep Learning Track (Passage) +**NOTE:** We're currently having issues with this regression, which throws "Retried waiting for GCLocker too often" errors. + **Model**: OpenAI-ada2 embeddings (using pre-encoded queries) with HNSW indexes This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: diff --git a/src/main/resources/docgen/templates/dl20-passage-openai-ada2-int8.template b/src/main/resources/docgen/templates/dl20-passage-openai-ada2-int8.template index 033930bc14..eea224f03e 100644 --- a/src/main/resources/docgen/templates/dl20-passage-openai-ada2-int8.template +++ b/src/main/resources/docgen/templates/dl20-passage-openai-ada2-int8.template @@ -1,5 +1,7 @@ # Anserini Regressions: TREC 2020 Deep Learning Track (Passage) +**NOTE:** We're currently having issues with this regression, which throws "Retried waiting for GCLocker too often" errors. 
+ **Model**: OpenAI-ada2 embeddings (using pre-encoded queries) with HNSW indexes This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: diff --git a/src/main/resources/docgen/templates/msmarco-passage-openai-ada2-int8.template b/src/main/resources/docgen/templates/msmarco-passage-openai-ada2-int8.template index d95af519b9..b9e3a3c5e5 100644 --- a/src/main/resources/docgen/templates/msmarco-passage-openai-ada2-int8.template +++ b/src/main/resources/docgen/templates/msmarco-passage-openai-ada2-int8.template @@ -1,5 +1,7 @@ # Anserini Regressions: MS MARCO Passage Ranking +**NOTE:** We're currently having issues with this regression, which throws "Retried waiting for GCLocker too often" errors. + **Model**: OpenAI-ada2 embeddings (using pre-encoded queries) with HNSW indexes This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper:
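
For readers landing on the int8 regression pages annotated above: at the Lucene 9.9 level, "int8" corresponds to building the HNSW graph over scalar-quantized vectors via `Lucene99HnswScalarQuantizedVectorsFormat`. The sketch below shows that wiring against a throwaway in-memory index; the field name, the HNSW parameters, and the toy 4-dimensional vectors are placeholder assumptions (real OpenAI-ada2 embeddings are 1536-dimensional and need a vectors format with a raised max-dimension limit), and this is not Anserini's indexing path.

```java
import org.apache.lucene.codecs.KnnVectorsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99Codec;
import org.apache.lucene.codecs.lucene99.Lucene99HnswScalarQuantizedVectorsFormat;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.KnnFloatVectorField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;

public class Int8HnswSketch {
  public static void main(String[] args) throws Exception {
    final int m = 16;    // HNSW max connections per node (placeholder value)
    final int efC = 100; // HNSW beam width at construction time (placeholder value)

    IndexWriterConfig config = new IndexWriterConfig().setCodec(
        new Lucene99Codec() {
          @Override
          public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
            // HNSW graph built over int8 scalar-quantized vectors.
            return new Lucene99HnswScalarQuantizedVectorsFormat(m, efC);
          }
        });

    try (Directory dir = new ByteBuffersDirectory();
         IndexWriter writer = new IndexWriter(dir, config)) {
      for (int i = 0; i < 8; i++) {
        Document doc = new Document();
        doc.add(new StringField("id", Integer.toString(i), Field.Store.YES));
        // Toy 4-dimensional vectors; cosine similarity avoids any normalization requirement.
        float[] vector = new float[] {1.0f + i, 0.5f * i, 0.25f, 1.0f};
        doc.add(new KnnFloatVectorField("vector", vector, VectorSimilarityFunction.COSINE));
        writer.addDocument(doc);
      }
      writer.commit();
    }
  }
}
```

As for the "Retried waiting for GCLocker too often" failures flagged in the notes above: that message is a HotSpot `OutOfMemoryError` raised when an allocation repeatedly stalls while the garbage collector waits for threads to leave JNI critical regions, which is consistent with the heavy memory pressure of indexing these large embedding collections.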