From 2b6e14acd7a59af96b43b6e2e770eaa8bc350e09 Mon Sep 17 00:00:00 2001 From: lintool Date: Tue, 19 Dec 2023 13:53:26 -0500 Subject: [PATCH] Addressed CR. added note of errors to openai int8 indexes. --- docs/regressions/regressions-dl19-passage-openai-ada2-int8.md | 2 ++ docs/regressions/regressions-dl20-passage-openai-ada2-int8.md | 2 ++ .../regressions/regressions-msmarco-passage-openai-ada2-int8.md | 2 ++ src/main/java/io/anserini/index/IndexCollection.java | 2 +- src/main/java/io/anserini/index/IndexInvertedDenseVectors.java | 2 +- .../docgen/templates/dl19-passage-openai-ada2-int8.template | 2 ++ .../docgen/templates/dl20-passage-openai-ada2-int8.template | 2 ++ .../docgen/templates/msmarco-passage-openai-ada2-int8.template | 2 ++ 8 files changed, 14 insertions(+), 2 deletions(-) diff --git a/docs/regressions/regressions-dl19-passage-openai-ada2-int8.md b/docs/regressions/regressions-dl19-passage-openai-ada2-int8.md index 761fa8bcc4..e71b7c03b8 100644 --- a/docs/regressions/regressions-dl19-passage-openai-ada2-int8.md +++ b/docs/regressions/regressions-dl19-passage-openai-ada2-int8.md @@ -1,5 +1,7 @@ # Anserini Regressions: TREC 2019 Deep Learning Track (Passage) +**NOTE:** We're currently having issues with this regression, which throws "Retried waiting for GCLocker too often" errors. 
+ **Model**: OpenAI-ada2 embeddings (using pre-encoded queries) with HNSW indexes This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: diff --git a/docs/regressions/regressions-dl20-passage-openai-ada2-int8.md b/docs/regressions/regressions-dl20-passage-openai-ada2-int8.md index 46073ce731..beb86feb50 100644 --- a/docs/regressions/regressions-dl20-passage-openai-ada2-int8.md +++ b/docs/regressions/regressions-dl20-passage-openai-ada2-int8.md @@ -1,5 +1,7 @@ # Anserini Regressions: TREC 2020 Deep Learning Track (Passage) +**NOTE:** We're currently having issues with this regression, which throws "Retried waiting for GCLocker too often" errors. + **Model**: OpenAI-ada2 embeddings (using pre-encoded queries) with HNSW indexes This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2020.html), as described in the following paper: diff --git a/docs/regressions/regressions-msmarco-passage-openai-ada2-int8.md b/docs/regressions/regressions-msmarco-passage-openai-ada2-int8.md index d5b0729069..7b2053dec9 100644 --- a/docs/regressions/regressions-msmarco-passage-openai-ada2-int8.md +++ b/docs/regressions/regressions-msmarco-passage-openai-ada2-int8.md @@ -1,5 +1,7 @@ # Anserini Regressions: MS MARCO Passage Ranking +**NOTE:** We're currently having issues with this regression, which throws "Retried waiting for GCLocker too often" errors. 
+ **Model**: OpenAI-ada2 embeddings (using pre-encoded queries) with HNSW indexes This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper: diff --git a/src/main/java/io/anserini/index/IndexCollection.java b/src/main/java/io/anserini/index/IndexCollection.java index f80c778cd3..2242837027 100644 --- a/src/main/java/io/anserini/index/IndexCollection.java +++ b/src/main/java/io/anserini/index/IndexCollection.java @@ -177,7 +177,7 @@ public IndexCollection(Args args) throws Exception { } final Directory dir = FSDirectory.open(Paths.get(args.index)); - final IndexWriterConfig config = new IndexWriterConfig(getAnalyzer()).setCodec(new Lucene99Codec()); + final IndexWriterConfig config = new IndexWriterConfig(getAnalyzer()); if (args.bm25Accurate) { // Necessary during indexing as the norm used in BM25 is already determined at index time. 
diff --git a/src/main/java/io/anserini/index/IndexInvertedDenseVectors.java b/src/main/java/io/anserini/index/IndexInvertedDenseVectors.java index b415bf8efa..249c626a97 100644 --- a/src/main/java/io/anserini/index/IndexInvertedDenseVectors.java +++ b/src/main/java/io/anserini/index/IndexInvertedDenseVectors.java @@ -100,7 +100,7 @@ public IndexInvertedDenseVectors(Args args) { try { final Directory dir = FSDirectory.open(Paths.get(args.index)); - final IndexWriterConfig config = new IndexWriterConfig(analyzer).setCodec(new Lucene99Codec()); + final IndexWriterConfig config = new IndexWriterConfig(analyzer); config.setOpenMode(IndexWriterConfig.OpenMode.CREATE); config.setRAMBufferSizeMB(args.memoryBuffer); config.setUseCompoundFile(false); diff --git a/src/main/resources/docgen/templates/dl19-passage-openai-ada2-int8.template b/src/main/resources/docgen/templates/dl19-passage-openai-ada2-int8.template index f06a8132ea..99454b9d31 100644 --- a/src/main/resources/docgen/templates/dl19-passage-openai-ada2-int8.template +++ b/src/main/resources/docgen/templates/dl19-passage-openai-ada2-int8.template @@ -1,5 +1,7 @@ # Anserini Regressions: TREC 2019 Deep Learning Track (Passage) +**NOTE:** We're currently having issues with this regression, which throws "Retried waiting for GCLocker too often" errors. 
+ **Model**: OpenAI-ada2 embeddings (using pre-encoded queries) with HNSW indexes This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: diff --git a/src/main/resources/docgen/templates/dl20-passage-openai-ada2-int8.template b/src/main/resources/docgen/templates/dl20-passage-openai-ada2-int8.template index 033930bc14..eea224f03e 100644 --- a/src/main/resources/docgen/templates/dl20-passage-openai-ada2-int8.template +++ b/src/main/resources/docgen/templates/dl20-passage-openai-ada2-int8.template @@ -1,5 +1,7 @@ # Anserini Regressions: TREC 2020 Deep Learning Track (Passage) +**NOTE:** We're currently having issues with this regression, which throws "Retried waiting for GCLocker too often" errors. + **Model**: OpenAI-ada2 embeddings (using pre-encoded queries) with HNSW indexes This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2020.html), as described in the following paper: diff --git a/src/main/resources/docgen/templates/msmarco-passage-openai-ada2-int8.template b/src/main/resources/docgen/templates/msmarco-passage-openai-ada2-int8.template index d95af519b9..b9e3a3c5e5 100644 --- a/src/main/resources/docgen/templates/msmarco-passage-openai-ada2-int8.template +++ b/src/main/resources/docgen/templates/msmarco-passage-openai-ada2-int8.template @@ -1,5 +1,7 @@ # Anserini Regressions: MS MARCO Passage Ranking +**NOTE:** We're currently having issues with this regression, which throws "Retried waiting for GCLocker too often" errors. 
+ **Model**: OpenAI-ada2 embeddings (using pre-encoded queries) with HNSW indexes This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper: