From 3885b5c25178d2a88fc3b953d572b518ef0d1da6 Mon Sep 17 00:00:00 2001 From: Jimmy Lin Date: Mon, 8 Jul 2024 10:08:58 -0400 Subject: [PATCH] Refactor tolerance settings for MS MARCO dense vector regressions (#2541) Continuation of #2538 - refactor tolerance values for HNSW indexes, calibrate wrt flat index scores. --- ...ssage.bge-base-en-v1.5.hnsw-int8.cached.md | 13 +- ...passage.bge-base-en-v1.5.hnsw-int8.onnx.md | 11 +- ...19-passage.bge-base-en-v1.5.hnsw.cached.md | 11 +- ...dl19-passage.bge-base-en-v1.5.hnsw.onnx.md | 13 +- ...ere-embed-english-v3.0.hnsw-int8.cached.md | 13 +- ...e.cohere-embed-english-v3.0.hnsw.cached.md | 13 +- ...passage.cos-dpr-distil.hnsw-int8.cached.md | 13 +- ...9-passage.cos-dpr-distil.hnsw-int8.onnx.md | 13 +- ...dl19-passage.cos-dpr-distil.hnsw.cached.md | 13 +- ...s-dl19-passage.cos-dpr-distil.hnsw.onnx.md | 13 +- ...19-passage.openai-ada2.hnsw-int8.cached.md | 13 +- ...ns-dl19-passage.openai-ada2.hnsw.cached.md | 11 +- ...ssage.bge-base-en-v1.5.hnsw-int8.cached.md | 13 +- ...passage.bge-base-en-v1.5.hnsw-int8.onnx.md | 13 +- ...20-passage.bge-base-en-v1.5.hnsw.cached.md | 13 +- ...dl20-passage.bge-base-en-v1.5.hnsw.onnx.md | 13 +- ...ere-embed-english-v3.0.hnsw-int8.cached.md | 13 +- ...e.cohere-embed-english-v3.0.hnsw.cached.md | 11 +- ...passage.cos-dpr-distil.hnsw-int8.cached.md | 13 +- ...0-passage.cos-dpr-distil.hnsw-int8.onnx.md | 13 +- ...dl20-passage.cos-dpr-distil.hnsw.cached.md | 13 +- ...s-dl20-passage.cos-dpr-distil.hnsw.onnx.md | 13 +- ...20-passage.openai-ada2.hnsw-int8.cached.md | 11 +- ...ns-dl20-passage.openai-ada2.hnsw.cached.md | 9 +- ...ssage.bge-base-en-v1.5.hnsw-int8.cached.md | 13 +- ...passage.bge-base-en-v1.5.hnsw-int8.onnx.md | 13 +- ...v1-passage.bge-base-en-v1.5.hnsw.cached.md | 11 +- ...o-v1-passage.bge-base-en-v1.5.hnsw.onnx.md | 11 +- ...ere-embed-english-v3.0.hnsw-int8.cached.md | 13 +- ...e.cohere-embed-english-v3.0.hnsw.cached.md | 13 +- ...passage.cos-dpr-distil.hnsw-int8.cached.md | 13 +- ...1-passage.cos-dpr-distil.hnsw-int8.onnx.md | 13 +- ...o-v1-passage.cos-dpr-distil.hnsw.cached.md | 13 +- ...rco-v1-passage.cos-dpr-distil.hnsw.onnx.md | 13 +- ...v1-passage.openai-ada2.hnsw-int8.cached.md | 13 +- ...arco-v1-passage.openai-ada2.hnsw.cached.md | 9 +- src/main/python/run_regression.py | 192 +++++++++++------- ...bge-base-en-v1.5.hnsw-int8.cached.template | 5 +- ...e.bge-base-en-v1.5.hnsw-int8.onnx.template | 5 +- ...sage.bge-base-en-v1.5.hnsw.cached.template | 5 +- ...assage.bge-base-en-v1.5.hnsw.onnx.template | 5 +- ...bed-english-v3.0.hnsw-int8.cached.template | 5 +- ...re-embed-english-v3.0.hnsw.cached.template | 5 +- ...e.cos-dpr-distil.hnsw-int8.cached.template | 5 +- ...age.cos-dpr-distil.hnsw-int8.onnx.template | 5 +- ...assage.cos-dpr-distil.hnsw.cached.template | 5 +- ...-passage.cos-dpr-distil.hnsw.onnx.template | 5 +- ...sage.openai-ada2.hnsw-int8.cached.template | 5 +- ...9-passage.openai-ada2.hnsw.cached.template | 5 +- ...bge-base-en-v1.5.hnsw-int8.cached.template | 5 +- ...e.bge-base-en-v1.5.hnsw-int8.onnx.template | 5 +- ...sage.bge-base-en-v1.5.hnsw.cached.template | 5 +- ...assage.bge-base-en-v1.5.hnsw.onnx.template | 5 +- ...bed-english-v3.0.hnsw-int8.cached.template | 5 +- ...re-embed-english-v3.0.hnsw.cached.template | 5 +- ...e.cos-dpr-distil.hnsw-int8.cached.template | 5 +- ...age.cos-dpr-distil.hnsw-int8.onnx.template | 5 +- ...assage.cos-dpr-distil.hnsw.cached.template | 5 +- ...-passage.cos-dpr-distil.hnsw.onnx.template | 5 +- ...sage.openai-ada2.hnsw-int8.cached.template | 5 +- ...0-passage.openai-ada2.hnsw.cached.template | 5 +- ...bge-base-en-v1.5.hnsw-int8.cached.template | 5 +- ...e.bge-base-en-v1.5.hnsw-int8.onnx.template | 5 +- ...sage.bge-base-en-v1.5.hnsw.cached.template | 5 +- ...assage.bge-base-en-v1.5.hnsw.onnx.template | 5 +- ...bed-english-v3.0.hnsw-int8.cached.template | 5 +- ...re-embed-english-v3.0.hnsw.cached.template | 5 +- ...e.cos-dpr-distil.hnsw-int8.cached.template | 5 +- ...age.cos-dpr-distil.hnsw-int8.onnx.template | 5 +- ...assage.cos-dpr-distil.hnsw.cached.template | 5 +- ...-passage.cos-dpr-distil.hnsw.onnx.template | 5 +- ...sage.openai-ada2.hnsw-int8.cached.template | 5 +- ...1-passage.openai-ada2.hnsw.cached.template | 5 +- ...age.bge-base-en-v1.5.hnsw-int8.cached.yaml | 8 +- ...ssage.bge-base-en-v1.5.hnsw-int8.onnx.yaml | 8 +- ...-passage.bge-base-en-v1.5.hnsw.cached.yaml | 8 +- ...19-passage.bge-base-en-v1.5.hnsw.onnx.yaml | 8 +- ...e-embed-english-v3.0.hnsw-int8.cached.yaml | 8 +- ...cohere-embed-english-v3.0.hnsw.cached.yaml | 8 +- ...ssage.cos-dpr-distil.hnsw-int8.cached.yaml | 8 +- ...passage.cos-dpr-distil.hnsw-int8.onnx.yaml | 8 +- ...19-passage.cos-dpr-distil.hnsw.cached.yaml | 8 +- ...dl19-passage.cos-dpr-distil.hnsw.onnx.yaml | 8 +- ...-passage.openai-ada2.hnsw-int8.cached.yaml | 8 +- .../dl19-passage.openai-ada2.hnsw.cached.yaml | 8 +- ...age.bge-base-en-v1.5.hnsw-int8.cached.yaml | 8 +- ...ssage.bge-base-en-v1.5.hnsw-int8.onnx.yaml | 8 +- ...-passage.bge-base-en-v1.5.hnsw.cached.yaml | 8 +- ...20-passage.bge-base-en-v1.5.hnsw.onnx.yaml | 8 +- ...e-embed-english-v3.0.hnsw-int8.cached.yaml | 8 +- ...cohere-embed-english-v3.0.hnsw.cached.yaml | 8 +- ...ssage.cos-dpr-distil.hnsw-int8.cached.yaml | 8 +- ...passage.cos-dpr-distil.hnsw-int8.onnx.yaml | 8 +- ...20-passage.cos-dpr-distil.hnsw.cached.yaml | 8 +- ...dl20-passage.cos-dpr-distil.hnsw.onnx.yaml | 8 +- ...-passage.openai-ada2.hnsw-int8.cached.yaml | 8 +- .../dl20-passage.openai-ada2.hnsw.cached.yaml | 8 +- ...age.bge-base-en-v1.5.hnsw-int8.cached.yaml | 8 +- ...ssage.bge-base-en-v1.5.hnsw-int8.onnx.yaml | 8 +- ...-passage.bge-base-en-v1.5.hnsw.cached.yaml | 8 +- ...v1-passage.bge-base-en-v1.5.hnsw.onnx.yaml | 8 +- ...e-embed-english-v3.0.hnsw-int8.cached.yaml | 8 +- ...cohere-embed-english-v3.0.hnsw.cached.yaml | 8 +- ...ssage.cos-dpr-distil.hnsw-int8.cached.yaml | 8 +- ...passage.cos-dpr-distil.hnsw-int8.onnx.yaml | 8 +- ...v1-passage.cos-dpr-distil.hnsw.cached.yaml | 8 +- ...o-v1-passage.cos-dpr-distil.hnsw.onnx.yaml | 8 +- ...-passage.openai-ada2.hnsw-int8.cached.yaml | 8 +- ...co-v1-passage.openai-ada2.hnsw.cached.yaml | 8 +- 109 files changed, 616 insertions(+), 490 deletions(-) diff --git a/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.hnsw-int8.cached.md b/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.hnsw-int8.cached.md index dfb916cdd2..be636c039b 100644 --- a/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.hnsw-int8.cached.md +++ b/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.hnsw-int8.cached.md @@ -101,16 +101,17 @@ With the above commands, you should be able to reproduce the following results: | **AP@1000** | **BGE-base-en-v1.5**| |:-------------------------------------------------------------------------------------------------------------|-----------| -| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.443 | +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.444 | | **nDCG@10** | **BGE-base-en-v1.5**| -| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.708 | +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.706 | | **R@100** | **BGE-base-en-v1.5**| -| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.614 | +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.617 | | **R@1000** | **BGE-base-en-v1.5**| -| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.843 | +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.847 | -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/dl19-passage.bge-base-en-v1.5.hnsw-int8.cached.yaml). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). ❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). diff --git a/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.hnsw-int8.onnx.md b/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.hnsw-int8.onnx.md index b6b0689ea5..b93694b000 100644 --- a/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.hnsw-int8.onnx.md +++ b/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.hnsw-int8.onnx.md @@ -103,14 +103,15 @@ With the above commands, you should be able to reproduce the following results: |:-------------------------------------------------------------------------------------------------------------|-----------| | [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.444 | | **nDCG@10** | **BGE-base-en-v1.5**| -| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.702 | +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.706 | | **R@100** | **BGE-base-en-v1.5**| -| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.609 | +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.617 | | **R@1000** | **BGE-base-en-v1.5**| -| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.836 | +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.847 | -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/dl19-passage.bge-base-en-v1.5.hnsw-int8.onnx.yaml). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With ONNX query encoding on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). ❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). diff --git a/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.hnsw.cached.md b/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.hnsw.cached.md index 742744f7e5..513212316f 100644 --- a/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.hnsw.cached.md +++ b/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.hnsw.cached.md @@ -99,16 +99,17 @@ With the above commands, you should be able to reproduce the following results: | **AP@1000** | **BGE-base-en-v1.5**| |:-------------------------------------------------------------------------------------------------------------|-----------| -| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.442 | +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.444 | | **nDCG@10** | **BGE-base-en-v1.5**| | [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.706 | | **R@100** | **BGE-base-en-v1.5**| -| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.616 | +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.617 | | **R@1000** | **BGE-base-en-v1.5**| -| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.842 | +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.847 | -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/dl19-passage.bge-base-en-v1.5.hnsw.cached.yaml). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). ❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). diff --git a/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.hnsw.onnx.md b/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.hnsw.onnx.md index 77537a951d..b51406b4ea 100644 --- a/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.hnsw.onnx.md +++ b/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.hnsw.onnx.md @@ -99,16 +99,17 @@ With the above commands, you should be able to reproduce the following results: | **AP@1000** | **BGE-base-en-v1.5**| |:-------------------------------------------------------------------------------------------------------------|-----------| -| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.447 | +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.444 | | **nDCG@10** | **BGE-base-en-v1.5**| -| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.701 | +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.706 | | **R@100** | **BGE-base-en-v1.5**| -| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.607 | +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.617 | | **R@1000** | **BGE-base-en-v1.5**| -| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.837 | +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.847 | -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/dl19-passage.bge-base-en-v1.5.hnsw.onnx.yaml). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With ONNX query encoding on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). ❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). diff --git a/docs/regressions/regressions-dl19-passage.cohere-embed-english-v3.0.hnsw-int8.cached.md b/docs/regressions/regressions-dl19-passage.cohere-embed-english-v3.0.hnsw-int8.cached.md index 2c3985fd07..d8facc9287 100644 --- a/docs/regressions/regressions-dl19-passage.cohere-embed-english-v3.0.hnsw-int8.cached.md +++ b/docs/regressions/regressions-dl19-passage.cohere-embed-english-v3.0.hnsw-int8.cached.md @@ -94,16 +94,17 @@ With the above commands, you should be able to reproduce the following results: | **AP@1000** | **cohere-embed-english-v3.0**| |:-------------------------------------------------------------------------------------------------------------|-----------| -| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.487 | +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.488 | | **nDCG@10** | **cohere-embed-english-v3.0**| -| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.690 | +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.696 | | **R@100** | **cohere-embed-english-v3.0**| -| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.647 | +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.648 | | **R@1000** | **cohere-embed-english-v3.0**| -| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.850 | +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.863 | -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/dl19-passage.cohere-embed-english-v3.0.hnsw-int8.cached.yaml). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). ❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). diff --git a/docs/regressions/regressions-dl19-passage.cohere-embed-english-v3.0.hnsw.cached.md b/docs/regressions/regressions-dl19-passage.cohere-embed-english-v3.0.hnsw.cached.md index b8c6825dd5..5d1df7ef3c 100644 --- a/docs/regressions/regressions-dl19-passage.cohere-embed-english-v3.0.hnsw.cached.md +++ b/docs/regressions/regressions-dl19-passage.cohere-embed-english-v3.0.hnsw.cached.md @@ -94,16 +94,17 @@ With the above commands, you should be able to reproduce the following results: | **AP@1000** | **cohere-embed-english-v3.0**| |:-------------------------------------------------------------------------------------------------------------|-----------| -| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.486 | +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.488 | | **nDCG@10** | **cohere-embed-english-v3.0**| -| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.690 | +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.696 | | **R@100** | **cohere-embed-english-v3.0**| -| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.645 | +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.648 | | **R@1000** | **cohere-embed-english-v3.0**| -| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.851 | +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.863 | -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/dl19-passage.cohere-embed-english-v3.0.hnsw.cached.yaml). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). ❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). diff --git a/docs/regressions/regressions-dl19-passage.cos-dpr-distil.hnsw-int8.cached.md b/docs/regressions/regressions-dl19-passage.cos-dpr-distil.hnsw-int8.cached.md index 70a66a7981..b93e2554eb 100644 --- a/docs/regressions/regressions-dl19-passage.cos-dpr-distil.hnsw-int8.cached.md +++ b/docs/regressions/regressions-dl19-passage.cos-dpr-distil.hnsw-int8.cached.md @@ -101,16 +101,17 @@ With the above commands, you should be able to reproduce the following results: | **AP@1000** | **cosDPR-distil**| |:-------------------------------------------------------------------------------------------------------------|-----------| -| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.458 | +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.466 | | **nDCG@10** | **cosDPR-distil**| -| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.717 | +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.725 | | **R@100** | **cosDPR-distil**| -| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.605 | +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.617 | | **R@1000** | **cosDPR-distil**| -| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.805 | +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.820 | -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/dl19-passage.cos-dpr-distil.hnsw-int8.cached.yaml). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). ❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). diff --git a/docs/regressions/regressions-dl19-passage.cos-dpr-distil.hnsw-int8.onnx.md b/docs/regressions/regressions-dl19-passage.cos-dpr-distil.hnsw-int8.onnx.md index 9aa8383ef9..cdd25400ad 100644 --- a/docs/regressions/regressions-dl19-passage.cos-dpr-distil.hnsw-int8.onnx.md +++ b/docs/regressions/regressions-dl19-passage.cos-dpr-distil.hnsw-int8.onnx.md @@ -103,16 +103,17 @@ With the above commands, you should be able to reproduce the following results: | **AP@1000** | **cosDPR-distil**| |:-------------------------------------------------------------------------------------------------------------|-----------| -| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.458 | +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.466 | | **nDCG@10** | **cosDPR-distil**| -| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.717 | +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.725 | | **R@100** | **cosDPR-distil**| -| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.605 | +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.617 | | **R@1000** | **cosDPR-distil**| -| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.805 | +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.820 | -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/dl19-passage.cos-dpr-distil.hnsw-int8.onnx.yaml). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With ONNX query encoding on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). ❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). diff --git a/docs/regressions/regressions-dl19-passage.cos-dpr-distil.hnsw.cached.md b/docs/regressions/regressions-dl19-passage.cos-dpr-distil.hnsw.cached.md index c4eb2dcb3b..dd9deb1995 100644 --- a/docs/regressions/regressions-dl19-passage.cos-dpr-distil.hnsw.cached.md +++ b/docs/regressions/regressions-dl19-passage.cos-dpr-distil.hnsw.cached.md @@ -99,16 +99,17 @@ With the above commands, you should be able to reproduce the following results: | **AP@1000** | **cosDPR-distil**| |:-------------------------------------------------------------------------------------------------------------|-----------| -| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.458 | +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.466 | | **nDCG@10** | **cosDPR-distil**| -| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.717 | +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.725 | | **R@100** | **cosDPR-distil**| -| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.605 | +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.617 | | **R@1000** | **cosDPR-distil**| -| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.805 | +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.820 | -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/dl19-passage.cos-dpr-distil.hnsw.cached.yaml). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). ❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). diff --git a/docs/regressions/regressions-dl19-passage.cos-dpr-distil.hnsw.onnx.md b/docs/regressions/regressions-dl19-passage.cos-dpr-distil.hnsw.onnx.md index 44b66615f0..0be142e4a7 100644 --- a/docs/regressions/regressions-dl19-passage.cos-dpr-distil.hnsw.onnx.md +++ b/docs/regressions/regressions-dl19-passage.cos-dpr-distil.hnsw.onnx.md @@ -101,16 +101,17 @@ With the above commands, you should be able to reproduce the following results: | **AP@1000** | **cosDPR-distil**| |:-------------------------------------------------------------------------------------------------------------|-----------| -| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.458 | +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.466 | | **nDCG@10** | **cosDPR-distil**| -| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.717 | +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.725 | | **R@100** | **cosDPR-distil**| -| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.605 | +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.617 | | **R@1000** | **cosDPR-distil**| -| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.805 | +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.820 | -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/dl19-passage.cos-dpr-distil.hnsw.onnx.yaml). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With ONNX query encoding on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). ❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). diff --git a/docs/regressions/regressions-dl19-passage.openai-ada2.hnsw-int8.cached.md b/docs/regressions/regressions-dl19-passage.openai-ada2.hnsw-int8.cached.md index c241c421b3..f007aa8380 100644 --- a/docs/regressions/regressions-dl19-passage.openai-ada2.hnsw-int8.cached.md +++ b/docs/regressions/regressions-dl19-passage.openai-ada2.hnsw-int8.cached.md @@ -101,16 +101,17 @@ With the above commands, you should be able to reproduce the following results: | **AP@1000** | **OpenAI-ada2**| |:-------------------------------------------------------------------------------------------------------------|-----------| -| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.478 | +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.479 | | **nDCG@10** | **OpenAI-ada2**| -| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.707 | +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.703 | | **R@100** | **OpenAI-ada2**| -| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.617 | +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.623 | | **R@1000** | **OpenAI-ada2**| -| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.853 | +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.863 | -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/dl19-passage.openai-ada2.hnsw-int8.cached.yaml). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). ❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). diff --git a/docs/regressions/regressions-dl19-passage.openai-ada2.hnsw.cached.md b/docs/regressions/regressions-dl19-passage.openai-ada2.hnsw.cached.md index 6101c5d241..61cef1094d 100644 --- a/docs/regressions/regressions-dl19-passage.openai-ada2.hnsw.cached.md +++ b/docs/regressions/regressions-dl19-passage.openai-ada2.hnsw.cached.md @@ -101,14 +101,15 @@ With the above commands, you should be able to reproduce the following results: |:-------------------------------------------------------------------------------------------------------------|-----------| | [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.479 | | **nDCG@10** | **OpenAI-ada2**| -| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.704 | +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.703 | | **R@100** | **OpenAI-ada2**| -| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.624 | +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.623 | | **R@1000** | **OpenAI-ada2**| -| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.857 | +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.863 | -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/dl19-passage.openai-ada2.hnsw.cached.yaml). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). ❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). diff --git a/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.hnsw-int8.cached.md b/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.hnsw-int8.cached.md index 4f964bbf15..5368960d8a 100644 --- a/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.hnsw-int8.cached.md +++ b/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.hnsw-int8.cached.md @@ -101,16 +101,17 @@ With the above commands, you should be able to reproduce the following results: | **AP@1000** | **BGE-base-en-v1.5**| |:-------------------------------------------------------------------------------------------------------------|-----------| -| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.463 | +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.465 | | **nDCG@10** | **BGE-base-en-v1.5**| -| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.674 | +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.678 | | **R@100** | **BGE-base-en-v1.5**| -| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.712 | +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.717 | | **R@1000** | **BGE-base-en-v1.5**| -| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.840 | +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.850 | -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/dl20-passage.bge-base-en-v1.5.hnsw-int8.cached.yaml). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). ❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). diff --git a/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.hnsw-int8.onnx.md b/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.hnsw-int8.onnx.md index a4a0e59285..ce8df58074 100644 --- a/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.hnsw-int8.onnx.md +++ b/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.hnsw-int8.onnx.md @@ -101,16 +101,17 @@ With the above commands, you should be able to reproduce the following results: | **AP@1000** | **BGE-base-en-v1.5**| |:-------------------------------------------------------------------------------------------------------------|-----------| -| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.462 | +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.465 | | **nDCG@10** | **BGE-base-en-v1.5**| -| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.677 | +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.678 | | **R@100** | **BGE-base-en-v1.5**| -| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.711 | +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.717 | | **R@1000** | **BGE-base-en-v1.5**| -| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.848 | +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.850 | -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/dl20-passage.bge-base-en-v1.5.hnsw-int8.onnx.yaml). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With ONNX query encoding on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). ❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). diff --git a/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.hnsw.cached.md b/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.hnsw.cached.md index 82ef436897..b5e4477cef 100644 --- a/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.hnsw.cached.md +++ b/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.hnsw.cached.md @@ -99,16 +99,17 @@ With the above commands, you should be able to reproduce the following results: | **AP@1000** | **BGE-base-en-v1.5**| |:-------------------------------------------------------------------------------------------------------------|-----------| -| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.464 | +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.465 | | **nDCG@10** | **BGE-base-en-v1.5**| -| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.677 | +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.678 | | **R@100** | **BGE-base-en-v1.5**| -| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.714 | +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.717 | | **R@1000** | **BGE-base-en-v1.5**| -| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.840 | +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.850 | -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/dl20-passage.bge-base-en-v1.5.hnsw.cached.yaml). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). ❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). diff --git a/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.hnsw.onnx.md b/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.hnsw.onnx.md index df2686e60a..a34df3fc40 100644 --- a/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.hnsw.onnx.md +++ b/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.hnsw.onnx.md @@ -99,16 +99,17 @@ With the above commands, you should be able to reproduce the following results: | **AP@1000** | **BGE-base-en-v1.5**| |:-------------------------------------------------------------------------------------------------------------|-----------| -| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.462 | +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.465 | | **nDCG@10** | **BGE-base-en-v1.5**| -| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.677 | +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.678 | | **R@100** | **BGE-base-en-v1.5**| -| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.712 | +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.717 | | **R@1000** | **BGE-base-en-v1.5**| -| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.849 | +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.850 | -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/dl20-passage.bge-base-en-v1.5.hnsw.onnx.yaml). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With ONNX query encoding on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). ❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). diff --git a/docs/regressions/regressions-dl20-passage.cohere-embed-english-v3.0.hnsw-int8.cached.md b/docs/regressions/regressions-dl20-passage.cohere-embed-english-v3.0.hnsw-int8.cached.md index a81cc2311a..5110a75e6d 100644 --- a/docs/regressions/regressions-dl20-passage.cohere-embed-english-v3.0.hnsw-int8.cached.md +++ b/docs/regressions/regressions-dl20-passage.cohere-embed-english-v3.0.hnsw-int8.cached.md @@ -94,16 +94,17 @@ With the above commands, you should be able to reproduce the following results: | **AP@1000** | **cohere-embed-english-v3.0**| |:-------------------------------------------------------------------------------------------------------------|-----------| -| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.505 | +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.507 | | **nDCG@10** | **cohere-embed-english-v3.0**| -| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.722 | +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.725 | | **R@100** | **cohere-embed-english-v3.0**| -| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.720 | +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.728 | | **R@1000** | **cohere-embed-english-v3.0**| -| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.858 | +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.868 | -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/dl20-passage.cohere-embed-english-v3.0.hnsw-int8.cached.yaml). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). ❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). diff --git a/docs/regressions/regressions-dl20-passage.cohere-embed-english-v3.0.hnsw.cached.md b/docs/regressions/regressions-dl20-passage.cohere-embed-english-v3.0.hnsw.cached.md index aa28c99e1c..da47afbfae 100644 --- a/docs/regressions/regressions-dl20-passage.cohere-embed-english-v3.0.hnsw.cached.md +++ b/docs/regressions/regressions-dl20-passage.cohere-embed-english-v3.0.hnsw.cached.md @@ -94,16 +94,17 @@ With the above commands, you should be able to reproduce the following results: | **AP@1000** | **cohere-embed-english-v3.0**| |:-------------------------------------------------------------------------------------------------------------|-----------| -| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.505 | +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.507 | | **nDCG@10** | **cohere-embed-english-v3.0**| | [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.725 | | **R@100** | **cohere-embed-english-v3.0**| -| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.724 | +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.728 | | **R@1000** | **cohere-embed-english-v3.0**| -| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.864 | +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.868 | -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/dl20-passage.cohere-embed-english-v3.0.hnsw.cached.yaml). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). ❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). diff --git a/docs/regressions/regressions-dl20-passage.cos-dpr-distil.hnsw-int8.cached.md b/docs/regressions/regressions-dl20-passage.cos-dpr-distil.hnsw-int8.cached.md index da4b118033..8148593918 100644 --- a/docs/regressions/regressions-dl20-passage.cos-dpr-distil.hnsw-int8.cached.md +++ b/docs/regressions/regressions-dl20-passage.cos-dpr-distil.hnsw-int8.cached.md @@ -101,16 +101,17 @@ With the above commands, you should be able to reproduce the following results: | **AP@1000** | **cosDPR-distil**| |:-------------------------------------------------------------------------------------------------------------|-----------| -| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.482 | +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.488 | | **nDCG@10** | **cosDPR-distil**| -| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.701 | +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.702 | | **R@100** | **cosDPR-distil**| -| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.712 | +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.720 | | **R@1000** | **cosDPR-distil**| -| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.843 | +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.853 | -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/dl20-passage.cos-dpr-distil.hnsw-int8.cached.yaml). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). ❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). diff --git a/docs/regressions/regressions-dl20-passage.cos-dpr-distil.hnsw-int8.onnx.md b/docs/regressions/regressions-dl20-passage.cos-dpr-distil.hnsw-int8.onnx.md index 1aca72eb64..2f9d5c0898 100644 --- a/docs/regressions/regressions-dl20-passage.cos-dpr-distil.hnsw-int8.onnx.md +++ b/docs/regressions/regressions-dl20-passage.cos-dpr-distil.hnsw-int8.onnx.md @@ -103,16 +103,17 @@ With the above commands, you should be able to reproduce the following results: | **AP@1000** | **cosDPR-distil**| |:-------------------------------------------------------------------------------------------------------------|-----------| -| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.482 | +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.488 | | **nDCG@10** | **cosDPR-distil**| -| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.701 | +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.702 | | **R@100** | **cosDPR-distil**| -| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.712 | +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.720 | | **R@1000** | **cosDPR-distil**| -| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.843 | +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.853 | -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/dl20-passage.cos-dpr-distil.hnsw-int8.onnx.yaml). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With ONNX query encoding on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). ❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). diff --git a/docs/regressions/regressions-dl20-passage.cos-dpr-distil.hnsw.cached.md b/docs/regressions/regressions-dl20-passage.cos-dpr-distil.hnsw.cached.md index 2e60059bab..c39fd6afdf 100644 --- a/docs/regressions/regressions-dl20-passage.cos-dpr-distil.hnsw.cached.md +++ b/docs/regressions/regressions-dl20-passage.cos-dpr-distil.hnsw.cached.md @@ -99,16 +99,17 @@ With the above commands, you should be able to reproduce the following results: | **AP@1000** | **cosDPR-distil**| |:-------------------------------------------------------------------------------------------------------------|-----------| -| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.482 | +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.488 | | **nDCG@10** | **cosDPR-distil**| -| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.701 | +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.702 | | **R@100** | **cosDPR-distil**| -| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.712 | +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.720 | | **R@1000** | **cosDPR-distil**| -| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.843 | +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.853 | -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/dl20-passage.cos-dpr-distil.hnsw.cached.yaml). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). ❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). diff --git a/docs/regressions/regressions-dl20-passage.cos-dpr-distil.hnsw.onnx.md b/docs/regressions/regressions-dl20-passage.cos-dpr-distil.hnsw.onnx.md index ce2bdd0834..095aa1728f 100644 --- a/docs/regressions/regressions-dl20-passage.cos-dpr-distil.hnsw.onnx.md +++ b/docs/regressions/regressions-dl20-passage.cos-dpr-distil.hnsw.onnx.md @@ -101,16 +101,17 @@ With the above commands, you should be able to reproduce the following results: | **AP@1000** | **cosDPR-distil**| |:-------------------------------------------------------------------------------------------------------------|-----------| -| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.482 | +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.488 | | **nDCG@10** | **cosDPR-distil**| -| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.701 | +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.702 | | **R@100** | **cosDPR-distil**| -| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.712 | +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.720 | | **R@1000** | **cosDPR-distil**| -| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.843 | +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.853 | -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/dl20-passage.cos-dpr-distil.hnsw.onnx.yaml). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With ONNX query encoding on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). ❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). diff --git a/docs/regressions/regressions-dl20-passage.openai-ada2.hnsw-int8.cached.md b/docs/regressions/regressions-dl20-passage.openai-ada2.hnsw-int8.cached.md index 5f7f9c2a1a..b88f3ee759 100644 --- a/docs/regressions/regressions-dl20-passage.openai-ada2.hnsw-int8.cached.md +++ b/docs/regressions/regressions-dl20-passage.openai-ada2.hnsw-int8.cached.md @@ -103,14 +103,15 @@ With the above commands, you should be able to reproduce the following results: |:-------------------------------------------------------------------------------------------------------------|-----------| | [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.477 | | **nDCG@10** | **OpenAI-ada2**| -| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.675 | +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.676 | | **R@100** | **OpenAI-ada2**| -| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.727 | +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.724 | | **R@1000** | **OpenAI-ada2**| -| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.866 | +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.871 | -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/dl20-passage.openai-ada2.hnsw-int8.cached.yaml). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). ❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). diff --git a/docs/regressions/regressions-dl20-passage.openai-ada2.hnsw.cached.md b/docs/regressions/regressions-dl20-passage.openai-ada2.hnsw.cached.md index f9f9b70fb5..8be06c47ef 100644 --- a/docs/regressions/regressions-dl20-passage.openai-ada2.hnsw.cached.md +++ b/docs/regressions/regressions-dl20-passage.openai-ada2.hnsw.cached.md @@ -103,12 +103,13 @@ With the above commands, you should be able to reproduce the following results: | **nDCG@10** | **OpenAI-ada2**| | [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.676 | | **R@100** | **OpenAI-ada2**| -| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.723 | +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.724 | | **R@1000** | **OpenAI-ada2**| -| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.867 | +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.871 | -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/dl20-passage.openai-ada2.hnsw.cached.yaml). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). ❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). diff --git a/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.hnsw-int8.cached.md b/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.hnsw-int8.cached.md index 78e2f4b56c..db144b9dee 100644 --- a/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.hnsw-int8.cached.md +++ b/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.hnsw-int8.cached.md @@ -97,16 +97,17 @@ With the above commands, you should be able to reproduce the following results: | **AP@1000** | **BGE-base-en-v1.5**| |:-------------------------------------------------------------------------------------------------------------|-----------| -| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.362 | +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.364 | | **RR@10** | **BGE-base-en-v1.5**| -| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.356 | +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.358 | | **R@100** | **BGE-base-en-v1.5**| -| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.897 | +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.901 | | **R@1000** | **BGE-base-en-v1.5**| -| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.977 | +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.981 | -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.hnsw-int8.cached.yaml). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). ## Reproduction Log[*](../../docs/reproducibility.md) diff --git a/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.hnsw-int8.onnx.md b/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.hnsw-int8.onnx.md index ff8d251335..fb494b97b1 100644 --- a/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.hnsw-int8.onnx.md +++ b/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.hnsw-int8.onnx.md @@ -97,16 +97,17 @@ With the above commands, you should be able to reproduce the following results: | **AP@1000** | **BGE-base-en-v1.5**| |:-------------------------------------------------------------------------------------------------------------|-----------| -| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.362 | +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.364 | | **RR@10** | **BGE-base-en-v1.5**| -| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.356 | +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.358 | | **R@100** | **BGE-base-en-v1.5**| -| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.897 | +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.901 | | **R@1000** | **BGE-base-en-v1.5**| -| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.977 | +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.981 | -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.hnsw-int8.onnx.yaml). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With ONNX query encoding on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). ## Reproduction Log[*](../../docs/reproducibility.md) diff --git a/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.hnsw.cached.md b/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.hnsw.cached.md index 01f458764b..404c821997 100644 --- a/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.hnsw.cached.md +++ b/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.hnsw.cached.md @@ -95,16 +95,17 @@ With the above commands, you should be able to reproduce the following results: | **AP@1000** | **BGE-base-en-v1.5**| |:-------------------------------------------------------------------------------------------------------------|-----------| -| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.363 | +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.364 | | **RR@10** | **BGE-base-en-v1.5**| | [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.358 | | **R@100** | **BGE-base-en-v1.5**| -| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.897 | +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.901 | | **R@1000** | **BGE-base-en-v1.5**| -| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.977 | +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.981 | -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.hnsw.cached.yaml). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). ## Reproduction Log[*](../../docs/reproducibility.md) diff --git a/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.hnsw.onnx.md b/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.hnsw.onnx.md index 080fe67cbe..9e026f78a6 100644 --- a/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.hnsw.onnx.md +++ b/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.hnsw.onnx.md @@ -95,16 +95,17 @@ With the above commands, you should be able to reproduce the following results: | **AP@1000** | **BGE-base-en-v1.5**| |:-------------------------------------------------------------------------------------------------------------|-----------| -| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.363 | +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.364 | | **RR@10** | **BGE-base-en-v1.5**| | [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.358 | | **R@100** | **BGE-base-en-v1.5**| -| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.897 | +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.901 | | **R@1000** | **BGE-base-en-v1.5**| -| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.977 | +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.981 | -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.hnsw.onnx.yaml). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With ONNX query encoding on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). ## Reproduction Log[*](../../docs/reproducibility.md) diff --git a/docs/regressions/regressions-msmarco-v1-passage.cohere-embed-english-v3.0.hnsw-int8.cached.md b/docs/regressions/regressions-msmarco-v1-passage.cohere-embed-english-v3.0.hnsw-int8.cached.md index 5cf1a11852..641496596d 100644 --- a/docs/regressions/regressions-msmarco-v1-passage.cohere-embed-english-v3.0.hnsw-int8.cached.md +++ b/docs/regressions/regressions-msmarco-v1-passage.cohere-embed-english-v3.0.hnsw-int8.cached.md @@ -95,16 +95,17 @@ With the above commands, you should be able to reproduce the following results: | **nDCG@10** | **cohere-embed-english-v3.0**| |:-------------------------------------------------------------------------------------------------------------|-----------| -| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.427 | +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.429 | | **AP@1000** | **cohere-embed-english-v3.0**| -| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.371 | +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.372 | | **RR@10** | **cohere-embed-english-v3.0**| -| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.365 | +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.366 | | **R@1000** | **cohere-embed-english-v3.0**| -| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.974 | +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.979 | -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/msmarco-v1-passage.cohere-embed-english-v3.0.hnsw-int8.cached.yaml). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). ## Reproduction Log[*](../../docs/reproducibility.md) diff --git a/docs/regressions/regressions-msmarco-v1-passage.cohere-embed-english-v3.0.hnsw.cached.md b/docs/regressions/regressions-msmarco-v1-passage.cohere-embed-english-v3.0.hnsw.cached.md index 98741acdd8..4e90bcbb05 100644 --- a/docs/regressions/regressions-msmarco-v1-passage.cohere-embed-english-v3.0.hnsw.cached.md +++ b/docs/regressions/regressions-msmarco-v1-passage.cohere-embed-english-v3.0.hnsw.cached.md @@ -93,16 +93,17 @@ With the above commands, you should be able to reproduce the following results: | **nDCG@10** | **cohere-embed-english-v3.0**| |:-------------------------------------------------------------------------------------------------------------|-----------| -| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.428 | +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.429 | | **AP@1000** | **cohere-embed-english-v3.0**| -| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.371 | +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.372 | | **RR@10** | **cohere-embed-english-v3.0**| -| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.365 | +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.366 | | **R@1000** | **cohere-embed-english-v3.0**| -| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.974 | +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.979 | -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/msmarco-v1-passage.cohere-embed-english-v3.0.hnsw.cached.yaml). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). ## Reproduction Log[*](../../docs/reproducibility.md) diff --git a/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.hnsw-int8.cached.md b/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.hnsw-int8.cached.md index eca31bdbb9..b4befc6d38 100644 --- a/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.hnsw-int8.cached.md +++ b/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.hnsw-int8.cached.md @@ -97,16 +97,17 @@ With the above commands, you should be able to reproduce the following results: | **AP@1000** | **cosDPR-distil**| |:-------------------------------------------------------------------------------------------------------------|-----------| -| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.393 | +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.394 | | **RR@10** | **cosDPR-distil**| -| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.388 | +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.390 | | **R@100** | **cosDPR-distil**| -| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.903 | +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.908 | | **R@1000** | **cosDPR-distil**| -| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.974 | +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.980 | -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.hnsw-int8.cached.yaml). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). ## Reproduction Log[*](../../docs/reproducibility.md) diff --git a/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.hnsw-int8.onnx.md b/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.hnsw-int8.onnx.md index a72f7ec16e..30b090de99 100644 --- a/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.hnsw-int8.onnx.md +++ b/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.hnsw-int8.onnx.md @@ -99,16 +99,17 @@ With the above commands, you should be able to reproduce the following results: | **AP@1000** | **cosDPR-distil**| |:-------------------------------------------------------------------------------------------------------------|-----------| -| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.393 | +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.394 | | **RR@10** | **cosDPR-distil**| -| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.388 | +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.390 | | **R@100** | **cosDPR-distil**| -| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.903 | +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.908 | | **R@1000** | **cosDPR-distil**| -| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.974 | +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.980 | -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.hnsw-int8.onnx.yaml). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With ONNX query encoding on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). ## Reproduction Log[*](../../docs/reproducibility.md) diff --git a/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.hnsw.cached.md b/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.hnsw.cached.md index 4f908615ef..d793851efd 100644 --- a/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.hnsw.cached.md +++ b/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.hnsw.cached.md @@ -95,16 +95,17 @@ With the above commands, you should be able to reproduce the following results: | **AP@1000** | **cosDPR-distil**| |:-------------------------------------------------------------------------------------------------------------|-----------| -| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.393 | +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.394 | | **RR@10** | **cosDPR-distil**| -| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.388 | +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.390 | | **R@100** | **cosDPR-distil**| -| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.903 | +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.908 | | **R@1000** | **cosDPR-distil**| -| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.974 | +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.980 | -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.hnsw.cached.yaml). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). ## Reproduction Log[*](../../docs/reproducibility.md) diff --git a/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.hnsw.onnx.md b/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.hnsw.onnx.md index 4aad9f8895..40335af5a9 100644 --- a/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.hnsw.onnx.md +++ b/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.hnsw.onnx.md @@ -97,16 +97,17 @@ With the above commands, you should be able to reproduce the following results: | **AP@1000** | **cosDPR-distil**| |:-------------------------------------------------------------------------------------------------------------|-----------| -| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.393 | +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.394 | | **RR@10** | **cosDPR-distil**| -| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.388 | +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.390 | | **R@100** | **cosDPR-distil**| -| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.903 | +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.908 | | **R@1000** | **cosDPR-distil**| -| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.974 | +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.980 | -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.hnsw.onnx.yaml). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With ONNX query encoding on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). ## Reproduction Log[*](../../docs/reproducibility.md) diff --git a/docs/regressions/regressions-msmarco-v1-passage.openai-ada2.hnsw-int8.cached.md b/docs/regressions/regressions-msmarco-v1-passage.openai-ada2.hnsw-int8.cached.md index a1b562ed84..271e5c99ca 100644 --- a/docs/regressions/regressions-msmarco-v1-passage.openai-ada2.hnsw-int8.cached.md +++ b/docs/regressions/regressions-msmarco-v1-passage.openai-ada2.hnsw-int8.cached.md @@ -97,16 +97,17 @@ With the above commands, you should be able to reproduce the following results: | **AP@1000** | **OpenAI-ada2**| |:-------------------------------------------------------------------------------------------------------------|-----------| -| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.343 | +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.350 | | **RR@10** | **OpenAI-ada2**| -| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.336 | +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.343 | | **R@100** | **OpenAI-ada2**| -| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.894 | +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.900 | | **R@1000** | **OpenAI-ada2**| -| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.983 | +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.986 | -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/msmarco-v1-passage.openai-ada2.hnsw-int8.cached.yaml). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). ## Reproduction Log[*](../../docs/reproducibility.md) diff --git a/docs/regressions/regressions-msmarco-v1-passage.openai-ada2.hnsw.cached.md b/docs/regressions/regressions-msmarco-v1-passage.openai-ada2.hnsw.cached.md index 383c26e5a5..564fdc0447 100644 --- a/docs/regressions/regressions-msmarco-v1-passage.openai-ada2.hnsw.cached.md +++ b/docs/regressions/regressions-msmarco-v1-passage.openai-ada2.hnsw.cached.md @@ -99,12 +99,13 @@ With the above commands, you should be able to reproduce the following results: | **RR@10** | **OpenAI-ada2**| | [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.343 | | **R@100** | **OpenAI-ada2**| -| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.898 | +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.900 | | **R@1000** | **OpenAI-ada2**| -| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.985 | +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.986 | -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/msmarco-v1-passage.openai-ada2.hnsw.cached.yaml). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). ## Reproduction Log[*](../../docs/reproducibility.md) diff --git a/src/main/python/run_regression.py b/src/main/python/run_regression.py index 72741f06ac..335f071eaf 100644 --- a/src/main/python/run_regression.py +++ b/src/main/python/run_regression.py @@ -263,6 +263,104 @@ def construct_convert_commands(yaml_data): beir_dataset_pattern = re.compile(r'BEIR \(v1.0.0\): (.*)$') +msmarco_v1_flat_int8_onnx = defaultdict(lambda: 0.002) +msmarco_v1_flat_int8_cached = defaultdict(lambda: 0.002) +msmarco_v1_flat_int8_cached['openai-ada2-flat-int8-cached'] = 0.008 +msmarco_v1_flat_onnx = defaultdict(lambda: 0.0001) +msmarco_v1_flat_cached = defaultdict(lambda: 1e-9) + +msmarco_v1_flat_tolerance = { + 'flat-int8-onnx': msmarco_v1_flat_int8_onnx, + 'flat-int8-cached': msmarco_v1_flat_int8_cached, + 'flat-onnx': msmarco_v1_flat_onnx, + 'flat-cached': msmarco_v1_flat_cached, +} + +dl19_flat_int8_onnx = defaultdict(lambda: 0.002) +dl19_flat_int8_onnx['bge-flat-int8-onnx'] = 0.008 +dl19_flat_int8_cached = defaultdict(lambda: 0.002) +dl19_flat_int8_cached['bge-flat-int8-cached'] = 0.005 +dl19_flat_int8_cached['openai-ada2-flat-int8-cached'] = 0.008 +dl19_flat_onnx = defaultdict(lambda: 0.0001) +dl19_flat_onnx['bge-flat-onnx'] = 0.008 +dl19_flat_cached = defaultdict(lambda: 1e-9) + +dl19_flat_tolerance = { + 'flat-int8-onnx': dl19_flat_int8_onnx, + 'flat-int8-cached': dl19_flat_int8_cached, + 'flat-onnx': dl19_flat_onnx, + 'flat-cached': dl19_flat_cached, +} + +dl20_flat_int8_onnx = defaultdict(lambda: 0.002) +dl20_flat_int8_onnx['bge-flat-int8-onnx'] = 0.004 +dl20_flat_int8_cached = defaultdict(lambda: 0.002) +dl20_flat_int8_cached['bge-flat-int8-cached'] = 0.005 +dl20_flat_int8_cached['cos-dpr-distil-flat-int8-cached'] = 0.004 +dl20_flat_int8_cached['cohere-embed-english-v3.0-flat-int8-cached'] = 0.004 +dl20_flat_onnx = defaultdict(lambda: 0.0001) +dl20_flat_onnx['bge-flat-onnx'] = 0.005 +dl20_flat_cached = defaultdict(lambda: 1e-9) + +dl20_flat_tolerance = { + 'flat-int8-onnx': dl20_flat_int8_onnx, + 'flat-int8-cached': dl20_flat_int8_cached, + 'flat-onnx': dl20_flat_onnx, + 'flat-cached': dl20_flat_cached, +} + +msmarco_v1_hnsw_int8_onnx = defaultdict(lambda: 0.01) +msmarco_v1_hnsw_int8_cached = defaultdict(lambda: 0.01) +msmarco_v1_hnsw_onnx = defaultdict(lambda: 0.01) +msmarco_v1_hnsw_onnx['cos-dpr-distil-hnsw-onnx'] = 0.015 +msmarco_v1_hnsw_cached = defaultdict(lambda: 0.01) +msmarco_v1_hnsw_cached['cos-dpr-distil-hnsw-cached'] = 0.015 + +msmarco_v1_hnsw_tolerance = { + 'hnsw-int8-onnx': msmarco_v1_hnsw_int8_onnx, + 'hnsw-int8-cached': msmarco_v1_hnsw_int8_cached, + 'hnsw-onnx': msmarco_v1_hnsw_onnx, + 'hnsw-cached': msmarco_v1_hnsw_cached, +} + +dl19_hnsw_int8_onnx = defaultdict(lambda: 0.01) +dl19_hnsw_int8_onnx['bge-hnsw-int8-onnx'] = 0.02 +dl19_hnsw_int8_onnx['cos-dpr-distil-hnsw-int8-onnx'] = 0.025 +dl19_hnsw_int8_cached = defaultdict(lambda: 0.01) +dl19_hnsw_int8_cached['bge-hnsw-int8-cached'] = 0.015 +dl19_hnsw_int8_cached['cohere-embed-english-v3.0-hnsw-int8-cached'] = 0.015 +dl19_hnsw_int8_cached['cos-dpr-distil-hnsw-int8-cached'] = 0.025 +dl19_hnsw_int8_cached['openai-ada2-hnsw-int8-cached'] = 0.015 +dl19_hnsw_onnx = defaultdict(lambda: 0.015) +dl19_hnsw_onnx['bge-hnsw-onnx'] = 0.02 +dl19_hnsw_cached = defaultdict(lambda: 0.01) +dl19_hnsw_cached['cohere-embed-english-v3.0-hnsw-cached'] = 0.02 +dl19_hnsw_cached['cos-dpr-distil-hnsw-cached'] = 0.015 + +dl19_hnsw_tolerance = { + 'hnsw-int8-onnx': dl19_hnsw_int8_onnx, + 'hnsw-int8-cached': dl19_hnsw_int8_cached, + 'hnsw-onnx': dl19_hnsw_onnx, + 'hnsw-cached': dl19_hnsw_cached, +} + +dl20_hnsw_int8_onnx = defaultdict(lambda: 0.01) +dl20_hnsw_int8_cached = defaultdict(lambda: 0.01) +dl20_hnsw_int8_cached['bge-hnsw-int8-cached'] = 0.015 +dl20_hnsw_int8_cached['cohere-embed-english-v3.0-hnsw-int8-cached'] = 0.012 +dl20_hnsw_onnx = defaultdict(lambda: 0.015) +dl20_hnsw_cached = defaultdict(lambda: 0.01) +dl20_hnsw_cached['bge-hnsw-cached'] = 0.015 +dl20_hnsw_cached['cohere-embed-english-v3.0-hnsw-cached'] = 0.025 +dl20_hnsw_cached['cos-dpr-distil-hnsw-cached'] = 0.015 + +dl20_hnsw_tolerance = { + 'hnsw-int8-onnx': dl20_hnsw_int8_onnx, + 'hnsw-int8-cached': dl20_hnsw_int8_cached, + 'hnsw-onnx': dl20_hnsw_onnx, + 'hnsw-cached': dl20_hnsw_cached, +} + def evaluate_and_verify(yaml_data, dry_run): fail_str = '\033[91m[FAIL]\033[0m ' @@ -295,85 +393,41 @@ def evaluate_and_verify(yaml_data, dry_run): using_hnsw = True if 'type' in model and model['type'] == 'hnsw' else False using_flat = True if 'type' in model and model['type'] == 'flat' else False - if using_flat and 'BEIR' in topic_set['name']: - # Extract BEIR dataset - match = beir_dataset_pattern.search(topic_set['name']) - beir_dataset = match.group(1) - + if using_flat: # Extract model match = flat_model_type_pattern.search(model['name']) model_type = match.group(1) - # Lookup tolerance - tolerance_ok = beir_flat_tolerance[model_type][beir_dataset] - elif using_flat and 'MS MARCO Passage' in topic_set['name']: - if model['name'].endswith('-flat-int8-onnx'): - tolerance_ok = 0.002 - elif model['name'].endswith('-flat-int8-cached'): - if model['name'] == 'openai-ada2-flat-int8-cached': - tolerance_ok = 0.008 - else: - tolerance_ok = 0.002 - elif model['name'].endswith('-flat-onnx'): - tolerance_ok = 0.0001 - else: - tolerance_ok = 1e-9 - elif using_flat and 'DL19' in topic_set['name']: - if model['name'].endswith('-flat-int8-onnx'): - if model['name'] == 'bge-flat-int8-onnx': - tolerance_ok = 0.007 - elif model['name'] == 'cos-dpr-distil-flat-int8-onnx': - tolerance_ok = 0.004 - else: - tolerance_ok = 0.002 - elif model['name'].endswith('-flat-int8-cached'): - if model['name'] == 'openai-ada2-flat-int8-cached': - tolerance_ok = 0.008 - else: - tolerance_ok = 0.002 - elif model['name'].endswith('-flat-onnx'): - if model['name'] == 'bge-flat-onnx': - tolerance_ok = 0.008 - else: - tolerance_ok = 0.0001 - else: - tolerance_ok = 1e-9 - elif using_flat and 'DL20' in topic_set['name']: - if model['name'].endswith('-flat-int8-onnx'): - if model['name'] == 'bge-flat-int8-onnx': - tolerance_ok = 0.004 - elif model['name'] == 'cos-dpr-distil-flat-int8-onnx': - tolerance_ok = 0.004 - else: - tolerance_ok = 0.002 - elif model['name'].endswith('-flat-int8-cached'): - if model['name'] == 'bge-flat-int8-cached': - tolerance_ok = 0.005 - elif model['name'] == 'cos-dpr-distil-flat-int8-cached': - tolerance_ok = 0.004 - else: - tolerance_ok = 0.002 - elif model['name'].endswith('-flat-onnx'): - if model['name'] == 'bge-flat-onnx': - tolerance_ok = 0.005 - else: - tolerance_ok = 0.0001 - else: - tolerance_ok = 1e-9 - else: - tolerance_ok = 1e-9 + if 'BEIR' in topic_set['name']: + # Extract BEIR dataset + match = beir_dataset_pattern.search(topic_set['name']) + beir_dataset = match.group(1) - if using_hnsw and 'BEIR' in topic_set['name']: - # Extract BEIR dataset - match = beir_dataset_pattern.search(topic_set['name']) - beir_dataset = match.group(1) + tolerance_ok = beir_flat_tolerance[model_type][beir_dataset] + elif 'MS MARCO Passage' in topic_set['name']: + tolerance_ok = msmarco_v1_flat_tolerance[model_type][model['name']] + elif 'DL19' in topic_set['name']: + tolerance_ok = dl19_flat_tolerance[model_type][model['name']] + elif using_flat and 'DL20' in topic_set['name']: + tolerance_ok = dl20_flat_tolerance[model_type][model['name']] + if using_hnsw: # Extract model match = hnsw_model_type_pattern.search(model['name']) model_type = match.group(1) - # Lookup tolerance - tolerance_ok = beir_hnsw_tolerance[model_type][beir_dataset] + if 'BEIR' in topic_set['name']: + # Extract BEIR dataset + match = beir_dataset_pattern.search(topic_set['name']) + beir_dataset = match.group(1) + + tolerance_ok = beir_hnsw_tolerance[model_type][beir_dataset] + elif 'MS MARCO Passage' in topic_set['name']: + tolerance_ok = msmarco_v1_hnsw_tolerance[model_type][model['name']] + elif 'DL19' in topic_set['name']: + tolerance_ok = dl19_hnsw_tolerance[model_type][model['name']] + elif 'DL20' in topic_set['name']: + tolerance_ok = dl20_hnsw_tolerance[model_type][model['name']] if using_flat or using_hnsw: result_str = (f'expected: {expected:.4f} actual: {actual:.4f} ' diff --git a/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.hnsw-int8.cached.template b/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.hnsw-int8.cached.template index 684d6a68a8..63fdad31d4 100644 --- a/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.hnsw-int8.cached.template +++ b/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.hnsw-int8.cached.template @@ -87,8 +87,9 @@ With the above commands, you should be able to reproduce the following results: ${effectiveness} -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). ❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). diff --git a/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.hnsw-int8.onnx.template b/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.hnsw-int8.onnx.template index 07f64b2993..69afbfc033 100644 --- a/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.hnsw-int8.onnx.template +++ b/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.hnsw-int8.onnx.template @@ -87,8 +87,9 @@ With the above commands, you should be able to reproduce the following results: ${effectiveness} -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With ONNX query encoding on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). ❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). diff --git a/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.hnsw.cached.template b/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.hnsw.cached.template index 1684a43b43..d011773dd3 100644 --- a/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.hnsw.cached.template +++ b/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.hnsw.cached.template @@ -85,8 +85,9 @@ With the above commands, you should be able to reproduce the following results: ${effectiveness} -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). ❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). diff --git a/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.hnsw.onnx.template b/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.hnsw.onnx.template index 5c7f62f36e..89c9d8ee76 100644 --- a/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.hnsw.onnx.template +++ b/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.hnsw.onnx.template @@ -85,8 +85,9 @@ With the above commands, you should be able to reproduce the following results: ${effectiveness} -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With ONNX query encoding on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). ❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). diff --git a/src/main/resources/docgen/templates/dl19-passage.cohere-embed-english-v3.0.hnsw-int8.cached.template b/src/main/resources/docgen/templates/dl19-passage.cohere-embed-english-v3.0.hnsw-int8.cached.template index c3b3877000..565c13fea7 100644 --- a/src/main/resources/docgen/templates/dl19-passage.cohere-embed-english-v3.0.hnsw-int8.cached.template +++ b/src/main/resources/docgen/templates/dl19-passage.cohere-embed-english-v3.0.hnsw-int8.cached.template @@ -80,8 +80,9 @@ With the above commands, you should be able to reproduce the following results: ${effectiveness} -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). ❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). diff --git a/src/main/resources/docgen/templates/dl19-passage.cohere-embed-english-v3.0.hnsw.cached.template b/src/main/resources/docgen/templates/dl19-passage.cohere-embed-english-v3.0.hnsw.cached.template index 789f388206..a47174b5f5 100644 --- a/src/main/resources/docgen/templates/dl19-passage.cohere-embed-english-v3.0.hnsw.cached.template +++ b/src/main/resources/docgen/templates/dl19-passage.cohere-embed-english-v3.0.hnsw.cached.template @@ -80,8 +80,9 @@ With the above commands, you should be able to reproduce the following results: ${effectiveness} -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). ❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). diff --git a/src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.hnsw-int8.cached.template b/src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.hnsw-int8.cached.template index 751379ca58..9406fe3822 100644 --- a/src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.hnsw-int8.cached.template +++ b/src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.hnsw-int8.cached.template @@ -87,8 +87,9 @@ With the above commands, you should be able to reproduce the following results: ${effectiveness} -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). ❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). diff --git a/src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.hnsw-int8.onnx.template b/src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.hnsw-int8.onnx.template index e66a46e480..ff348cdbc9 100644 --- a/src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.hnsw-int8.onnx.template +++ b/src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.hnsw-int8.onnx.template @@ -89,8 +89,9 @@ With the above commands, you should be able to reproduce the following results: ${effectiveness} -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With ONNX query encoding on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). ❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). diff --git a/src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.hnsw.cached.template b/src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.hnsw.cached.template index 681c0bcd6d..e387bf030e 100644 --- a/src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.hnsw.cached.template +++ b/src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.hnsw.cached.template @@ -85,8 +85,9 @@ With the above commands, you should be able to reproduce the following results: ${effectiveness} -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). ❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). diff --git a/src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.hnsw.onnx.template b/src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.hnsw.onnx.template index 65bc136073..c6d7c0e522 100644 --- a/src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.hnsw.onnx.template +++ b/src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.hnsw.onnx.template @@ -87,8 +87,9 @@ With the above commands, you should be able to reproduce the following results: ${effectiveness} -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With ONNX query encoding on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). ❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). diff --git a/src/main/resources/docgen/templates/dl19-passage.openai-ada2.hnsw-int8.cached.template b/src/main/resources/docgen/templates/dl19-passage.openai-ada2.hnsw-int8.cached.template index 7e71728323..991983ff4f 100644 --- a/src/main/resources/docgen/templates/dl19-passage.openai-ada2.hnsw-int8.cached.template +++ b/src/main/resources/docgen/templates/dl19-passage.openai-ada2.hnsw-int8.cached.template @@ -87,8 +87,9 @@ With the above commands, you should be able to reproduce the following results: ${effectiveness} -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). ❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). diff --git a/src/main/resources/docgen/templates/dl19-passage.openai-ada2.hnsw.cached.template b/src/main/resources/docgen/templates/dl19-passage.openai-ada2.hnsw.cached.template index 0ffa09bb80..24ec176d91 100644 --- a/src/main/resources/docgen/templates/dl19-passage.openai-ada2.hnsw.cached.template +++ b/src/main/resources/docgen/templates/dl19-passage.openai-ada2.hnsw.cached.template @@ -85,8 +85,9 @@ With the above commands, you should be able to reproduce the following results: ${effectiveness} -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). ❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). diff --git a/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.hnsw-int8.cached.template b/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.hnsw-int8.cached.template index ec86f6078b..a67e150823 100644 --- a/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.hnsw-int8.cached.template +++ b/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.hnsw-int8.cached.template @@ -87,8 +87,9 @@ With the above commands, you should be able to reproduce the following results: ${effectiveness} -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). ❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). diff --git a/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.hnsw-int8.onnx.template b/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.hnsw-int8.onnx.template index 5500041509..957322a5bf 100644 --- a/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.hnsw-int8.onnx.template +++ b/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.hnsw-int8.onnx.template @@ -87,8 +87,9 @@ With the above commands, you should be able to reproduce the following results: ${effectiveness} -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With ONNX query encoding on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). ❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). diff --git a/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.hnsw.cached.template b/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.hnsw.cached.template index 324a9c2a42..9197b856be 100644 --- a/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.hnsw.cached.template +++ b/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.hnsw.cached.template @@ -85,8 +85,9 @@ With the above commands, you should be able to reproduce the following results: ${effectiveness} -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). ❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). diff --git a/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.hnsw.onnx.template b/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.hnsw.onnx.template index 7bf9c62844..0585024953 100644 --- a/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.hnsw.onnx.template +++ b/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.hnsw.onnx.template @@ -85,8 +85,9 @@ With the above commands, you should be able to reproduce the following results: ${effectiveness} -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With ONNX query encoding on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). ❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). diff --git a/src/main/resources/docgen/templates/dl20-passage.cohere-embed-english-v3.0.hnsw-int8.cached.template b/src/main/resources/docgen/templates/dl20-passage.cohere-embed-english-v3.0.hnsw-int8.cached.template index 6afb3a9c6a..fcc92a8dc7 100644 --- a/src/main/resources/docgen/templates/dl20-passage.cohere-embed-english-v3.0.hnsw-int8.cached.template +++ b/src/main/resources/docgen/templates/dl20-passage.cohere-embed-english-v3.0.hnsw-int8.cached.template @@ -80,8 +80,9 @@ With the above commands, you should be able to reproduce the following results: ${effectiveness} -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). ❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). diff --git a/src/main/resources/docgen/templates/dl20-passage.cohere-embed-english-v3.0.hnsw.cached.template b/src/main/resources/docgen/templates/dl20-passage.cohere-embed-english-v3.0.hnsw.cached.template index ac85f4fae7..1bc8727a09 100644 --- a/src/main/resources/docgen/templates/dl20-passage.cohere-embed-english-v3.0.hnsw.cached.template +++ b/src/main/resources/docgen/templates/dl20-passage.cohere-embed-english-v3.0.hnsw.cached.template @@ -80,8 +80,9 @@ With the above commands, you should be able to reproduce the following results: ${effectiveness} -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). ❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). diff --git a/src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.hnsw-int8.cached.template b/src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.hnsw-int8.cached.template index b9622009f8..4c9517f5c5 100644 --- a/src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.hnsw-int8.cached.template +++ b/src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.hnsw-int8.cached.template @@ -87,8 +87,9 @@ With the above commands, you should be able to reproduce the following results: ${effectiveness} -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). ❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). diff --git a/src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.hnsw-int8.onnx.template b/src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.hnsw-int8.onnx.template index 55826ab0be..28dc4082c8 100644 --- a/src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.hnsw-int8.onnx.template +++ b/src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.hnsw-int8.onnx.template @@ -89,8 +89,9 @@ With the above commands, you should be able to reproduce the following results: ${effectiveness} -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With ONNX query encoding on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). ❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). diff --git a/src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.hnsw.cached.template b/src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.hnsw.cached.template index fafa6ebc2d..fd89cd5bdf 100644 --- a/src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.hnsw.cached.template +++ b/src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.hnsw.cached.template @@ -85,8 +85,9 @@ With the above commands, you should be able to reproduce the following results: ${effectiveness} -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). ❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). diff --git a/src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.hnsw.onnx.template b/src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.hnsw.onnx.template index 42a2906370..582cc46099 100644 --- a/src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.hnsw.onnx.template +++ b/src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.hnsw.onnx.template @@ -87,8 +87,9 @@ With the above commands, you should be able to reproduce the following results: ${effectiveness} -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With ONNX query encoding on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). ❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). diff --git a/src/main/resources/docgen/templates/dl20-passage.openai-ada2.hnsw-int8.cached.template b/src/main/resources/docgen/templates/dl20-passage.openai-ada2.hnsw-int8.cached.template index 3633e0aa96..ad2259f779 100644 --- a/src/main/resources/docgen/templates/dl20-passage.openai-ada2.hnsw-int8.cached.template +++ b/src/main/resources/docgen/templates/dl20-passage.openai-ada2.hnsw-int8.cached.template @@ -87,8 +87,9 @@ With the above commands, you should be able to reproduce the following results: ${effectiveness} -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). ❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). diff --git a/src/main/resources/docgen/templates/dl20-passage.openai-ada2.hnsw.cached.template b/src/main/resources/docgen/templates/dl20-passage.openai-ada2.hnsw.cached.template index c7ee57e2ff..361c8bb695 100644 --- a/src/main/resources/docgen/templates/dl20-passage.openai-ada2.hnsw.cached.template +++ b/src/main/resources/docgen/templates/dl20-passage.openai-ada2.hnsw.cached.template @@ -85,8 +85,9 @@ With the above commands, you should be able to reproduce the following results: ${effectiveness} -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). ❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). diff --git a/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.hnsw-int8.cached.template b/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.hnsw-int8.cached.template index af69251204..486b4bc69d 100644 --- a/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.hnsw-int8.cached.template +++ b/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.hnsw-int8.cached.template @@ -83,8 +83,9 @@ With the above commands, you should be able to reproduce the following results: ${effectiveness} -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). ## Reproduction Log[*](${root_path}/docs/reproducibility.md) diff --git a/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.hnsw-int8.onnx.template b/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.hnsw-int8.onnx.template index 94075818a8..5dc942633e 100644 --- a/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.hnsw-int8.onnx.template +++ b/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.hnsw-int8.onnx.template @@ -83,8 +83,9 @@ With the above commands, you should be able to reproduce the following results: ${effectiveness} -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With ONNX query encoding on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). ## Reproduction Log[*](${root_path}/docs/reproducibility.md) diff --git a/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.hnsw.cached.template b/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.hnsw.cached.template index e75f7be10e..aba6e3262f 100644 --- a/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.hnsw.cached.template +++ b/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.hnsw.cached.template @@ -81,8 +81,9 @@ With the above commands, you should be able to reproduce the following results: ${effectiveness} -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). ## Reproduction Log[*](${root_path}/docs/reproducibility.md) diff --git a/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.hnsw.onnx.template b/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.hnsw.onnx.template index d8180d75c4..fb6a697224 100644 --- a/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.hnsw.onnx.template +++ b/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.hnsw.onnx.template @@ -81,8 +81,9 @@ With the above commands, you should be able to reproduce the following results: ${effectiveness} -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With ONNX query encoding on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). ## Reproduction Log[*](${root_path}/docs/reproducibility.md) diff --git a/src/main/resources/docgen/templates/msmarco-v1-passage.cohere-embed-english-v3.0.hnsw-int8.cached.template b/src/main/resources/docgen/templates/msmarco-v1-passage.cohere-embed-english-v3.0.hnsw-int8.cached.template index c809996e13..924906b796 100644 --- a/src/main/resources/docgen/templates/msmarco-v1-passage.cohere-embed-english-v3.0.hnsw-int8.cached.template +++ b/src/main/resources/docgen/templates/msmarco-v1-passage.cohere-embed-english-v3.0.hnsw-int8.cached.template @@ -81,8 +81,9 @@ With the above commands, you should be able to reproduce the following results: ${effectiveness} -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). ## Reproduction Log[*](${root_path}/docs/reproducibility.md) diff --git a/src/main/resources/docgen/templates/msmarco-v1-passage.cohere-embed-english-v3.0.hnsw.cached.template b/src/main/resources/docgen/templates/msmarco-v1-passage.cohere-embed-english-v3.0.hnsw.cached.template index f159ced864..5f51caffa4 100644 --- a/src/main/resources/docgen/templates/msmarco-v1-passage.cohere-embed-english-v3.0.hnsw.cached.template +++ b/src/main/resources/docgen/templates/msmarco-v1-passage.cohere-embed-english-v3.0.hnsw.cached.template @@ -79,8 +79,9 @@ With the above commands, you should be able to reproduce the following results: ${effectiveness} -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). ## Reproduction Log[*](${root_path}/docs/reproducibility.md) diff --git a/src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.hnsw-int8.cached.template b/src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.hnsw-int8.cached.template index 5f85acc05b..89099d71fe 100644 --- a/src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.hnsw-int8.cached.template +++ b/src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.hnsw-int8.cached.template @@ -83,8 +83,9 @@ With the above commands, you should be able to reproduce the following results: ${effectiveness} -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). ## Reproduction Log[*](${root_path}/docs/reproducibility.md) diff --git a/src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.hnsw-int8.onnx.template b/src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.hnsw-int8.onnx.template index aa88ab7d67..b3737d5459 100644 --- a/src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.hnsw-int8.onnx.template +++ b/src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.hnsw-int8.onnx.template @@ -85,8 +85,9 @@ With the above commands, you should be able to reproduce the following results: ${effectiveness} -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With ONNX query encoding on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). ## Reproduction Log[*](${root_path}/docs/reproducibility.md) diff --git a/src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.hnsw.cached.template b/src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.hnsw.cached.template index affa039161..5173b9b09b 100644 --- a/src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.hnsw.cached.template +++ b/src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.hnsw.cached.template @@ -81,8 +81,9 @@ With the above commands, you should be able to reproduce the following results: ${effectiveness} -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). ## Reproduction Log[*](${root_path}/docs/reproducibility.md) diff --git a/src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.hnsw.onnx.template b/src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.hnsw.onnx.template index cad852f311..3defd04a81 100644 --- a/src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.hnsw.onnx.template +++ b/src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.hnsw.onnx.template @@ -83,8 +83,9 @@ With the above commands, you should be able to reproduce the following results: ${effectiveness} -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With ONNX query encoding on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). ## Reproduction Log[*](${root_path}/docs/reproducibility.md) diff --git a/src/main/resources/docgen/templates/msmarco-v1-passage.openai-ada2.hnsw-int8.cached.template b/src/main/resources/docgen/templates/msmarco-v1-passage.openai-ada2.hnsw-int8.cached.template index cdf4ff2c98..b82567babb 100644 --- a/src/main/resources/docgen/templates/msmarco-v1-passage.openai-ada2.hnsw-int8.cached.template +++ b/src/main/resources/docgen/templates/msmarco-v1-passage.openai-ada2.hnsw-int8.cached.template @@ -83,8 +83,9 @@ With the above commands, you should be able to reproduce the following results: ${effectiveness} -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). ## Reproduction Log[*](${root_path}/docs/reproducibility.md) diff --git a/src/main/resources/docgen/templates/msmarco-v1-passage.openai-ada2.hnsw.cached.template b/src/main/resources/docgen/templates/msmarco-v1-passage.openai-ada2.hnsw.cached.template index 6b705c906e..a1827f765c 100644 --- a/src/main/resources/docgen/templates/msmarco-v1-passage.openai-ada2.hnsw.cached.template +++ b/src/main/resources/docgen/templates/msmarco-v1-passage.openai-ada2.hnsw.cached.template @@ -81,8 +81,9 @@ With the above commands, you should be able to reproduce the following results: ${effectiveness} -Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}). +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). ## Reproduction Log[*](${root_path}/docs/reproducibility.md) diff --git a/src/main/resources/regression/dl19-passage.bge-base-en-v1.5.hnsw-int8.cached.yaml b/src/main/resources/regression/dl19-passage.bge-base-en-v1.5.hnsw-int8.cached.yaml index af877e2c0b..eb8604232f 100644 --- a/src/main/resources/regression/dl19-passage.bge-base-en-v1.5.hnsw-int8.cached.yaml +++ b/src/main/resources/regression/dl19-passage.bge-base-en-v1.5.hnsw-int8.cached.yaml @@ -56,10 +56,10 @@ models: params: -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000 results: AP@1000: - - 0.443 + - 0.4435 nDCG@10: - - 0.708 + - 0.7065 R@100: - - 0.614 + - 0.6171 R@1000: - - 0.843 + - 0.8472 diff --git a/src/main/resources/regression/dl19-passage.bge-base-en-v1.5.hnsw-int8.onnx.yaml b/src/main/resources/regression/dl19-passage.bge-base-en-v1.5.hnsw-int8.onnx.yaml index 14d4e1438e..fb2b2c1cc2 100644 --- a/src/main/resources/regression/dl19-passage.bge-base-en-v1.5.hnsw-int8.onnx.yaml +++ b/src/main/resources/regression/dl19-passage.bge-base-en-v1.5.hnsw-int8.onnx.yaml @@ -56,10 +56,10 @@ models: params: -generator VectorQueryGenerator -topicField title -threads 16 -hits 1000 -efSearch 1000 -encoder BgeBaseEn15 results: AP@1000: - - 0.444 + - 0.4435 nDCG@10: - - 0.702 + - 0.7065 R@100: - - 0.609 + - 0.6171 R@1000: - - 0.836 + - 0.8472 diff --git a/src/main/resources/regression/dl19-passage.bge-base-en-v1.5.hnsw.cached.yaml b/src/main/resources/regression/dl19-passage.bge-base-en-v1.5.hnsw.cached.yaml index 4eeb1942e6..f47a2977e0 100644 --- a/src/main/resources/regression/dl19-passage.bge-base-en-v1.5.hnsw.cached.yaml +++ b/src/main/resources/regression/dl19-passage.bge-base-en-v1.5.hnsw.cached.yaml @@ -56,10 +56,10 @@ models: params: -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000 results: AP@1000: - - 0.442 + - 0.4435 nDCG@10: - - 0.706 + - 0.7065 R@100: - - 0.616 + - 0.6171 R@1000: - - 0.842 + - 0.8472 diff --git a/src/main/resources/regression/dl19-passage.bge-base-en-v1.5.hnsw.onnx.yaml b/src/main/resources/regression/dl19-passage.bge-base-en-v1.5.hnsw.onnx.yaml index 73796aa95a..d5103d9d6b 100644 --- a/src/main/resources/regression/dl19-passage.bge-base-en-v1.5.hnsw.onnx.yaml +++ b/src/main/resources/regression/dl19-passage.bge-base-en-v1.5.hnsw.onnx.yaml @@ -56,10 +56,10 @@ models: params: -generator VectorQueryGenerator -topicField title -threads 16 -hits 1000 -efSearch 1000 -encoder BgeBaseEn15 results: AP@1000: - - 0.447 + - 0.4435 nDCG@10: - - 0.701 + - 0.7065 R@100: - - 0.607 + - 0.6171 R@1000: - - 0.837 + - 0.8472 diff --git a/src/main/resources/regression/dl19-passage.cohere-embed-english-v3.0.hnsw-int8.cached.yaml b/src/main/resources/regression/dl19-passage.cohere-embed-english-v3.0.hnsw-int8.cached.yaml index aa91abdb7f..50e305c663 100644 --- a/src/main/resources/regression/dl19-passage.cohere-embed-english-v3.0.hnsw-int8.cached.yaml +++ b/src/main/resources/regression/dl19-passage.cohere-embed-english-v3.0.hnsw-int8.cached.yaml @@ -56,10 +56,10 @@ models: params: -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000 results: AP@1000: - - 0.487 + - 0.4884 nDCG@10: - - 0.690 + - 0.6956 R@100: - - 0.647 + - 0.6484 R@1000: - - 0.850 \ No newline at end of file + - 0.8630 \ No newline at end of file diff --git a/src/main/resources/regression/dl19-passage.cohere-embed-english-v3.0.hnsw.cached.yaml b/src/main/resources/regression/dl19-passage.cohere-embed-english-v3.0.hnsw.cached.yaml index 99a9b9652c..029bc8cedb 100644 --- a/src/main/resources/regression/dl19-passage.cohere-embed-english-v3.0.hnsw.cached.yaml +++ b/src/main/resources/regression/dl19-passage.cohere-embed-english-v3.0.hnsw.cached.yaml @@ -56,10 +56,10 @@ models: params: -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000 results: AP@1000: - - 0.486 + - 0.4884 nDCG@10: - - 0.690 + - 0.6956 R@100: - - 0.645 + - 0.6484 R@1000: - - 0.851 \ No newline at end of file + - 0.8630 \ No newline at end of file diff --git a/src/main/resources/regression/dl19-passage.cos-dpr-distil.hnsw-int8.cached.yaml b/src/main/resources/regression/dl19-passage.cos-dpr-distil.hnsw-int8.cached.yaml index bfb7bf2930..a6100f94a9 100644 --- a/src/main/resources/regression/dl19-passage.cos-dpr-distil.hnsw-int8.cached.yaml +++ b/src/main/resources/regression/dl19-passage.cos-dpr-distil.hnsw-int8.cached.yaml @@ -56,10 +56,10 @@ models: params: -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000 results: AP@1000: - - 0.458 + - 0.4656 nDCG@10: - - 0.717 + - 0.7250 R@100: - - 0.605 + - 0.6173 R@1000: - - 0.805 + - 0.8201 diff --git a/src/main/resources/regression/dl19-passage.cos-dpr-distil.hnsw-int8.onnx.yaml b/src/main/resources/regression/dl19-passage.cos-dpr-distil.hnsw-int8.onnx.yaml index a70061e7ae..2fc1199b0d 100644 --- a/src/main/resources/regression/dl19-passage.cos-dpr-distil.hnsw-int8.onnx.yaml +++ b/src/main/resources/regression/dl19-passage.cos-dpr-distil.hnsw-int8.onnx.yaml @@ -56,10 +56,10 @@ models: params: -generator VectorQueryGenerator -topicField title -threads 16 -hits 1000 -efSearch 1000 -encoder CosDprDistil results: AP@1000: - - 0.458 + - 0.4656 nDCG@10: - - 0.717 + - 0.7250 R@100: - - 0.605 + - 0.6173 R@1000: - - 0.805 + - 0.8201 diff --git a/src/main/resources/regression/dl19-passage.cos-dpr-distil.hnsw.cached.yaml b/src/main/resources/regression/dl19-passage.cos-dpr-distil.hnsw.cached.yaml index 73576b9798..8c215bf0a2 100644 --- a/src/main/resources/regression/dl19-passage.cos-dpr-distil.hnsw.cached.yaml +++ b/src/main/resources/regression/dl19-passage.cos-dpr-distil.hnsw.cached.yaml @@ -56,10 +56,10 @@ models: params: -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000 results: AP@1000: - - 0.458 + - 0.4656 nDCG@10: - - 0.717 + - 0.7250 R@100: - - 0.605 + - 0.6173 R@1000: - - 0.805 + - 0.8201 diff --git a/src/main/resources/regression/dl19-passage.cos-dpr-distil.hnsw.onnx.yaml b/src/main/resources/regression/dl19-passage.cos-dpr-distil.hnsw.onnx.yaml index 8455574323..65010dbabb 100644 --- a/src/main/resources/regression/dl19-passage.cos-dpr-distil.hnsw.onnx.yaml +++ b/src/main/resources/regression/dl19-passage.cos-dpr-distil.hnsw.onnx.yaml @@ -56,10 +56,10 @@ models: params: -generator VectorQueryGenerator -topicField title -threads 16 -hits 1000 -efSearch 1000 -encoder CosDprDistil results: AP@1000: - - 0.458 + - 0.4656 nDCG@10: - - 0.717 + - 0.7250 R@100: - - 0.605 + - 0.6173 R@1000: - - 0.805 + - 0.8201 diff --git a/src/main/resources/regression/dl19-passage.openai-ada2.hnsw-int8.cached.yaml b/src/main/resources/regression/dl19-passage.openai-ada2.hnsw-int8.cached.yaml index a0b05f6cc5..ff8a1c5d26 100644 --- a/src/main/resources/regression/dl19-passage.openai-ada2.hnsw-int8.cached.yaml +++ b/src/main/resources/regression/dl19-passage.openai-ada2.hnsw-int8.cached.yaml @@ -56,10 +56,10 @@ models: params: -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000 results: AP@1000: - - 0.478 + - 0.4788 nDCG@10: - - 0.707 + - 0.7035 R@100: - - 0.617 + - 0.6235 R@1000: - - 0.853 + - 0.8629 diff --git a/src/main/resources/regression/dl19-passage.openai-ada2.hnsw.cached.yaml b/src/main/resources/regression/dl19-passage.openai-ada2.hnsw.cached.yaml index 21ac8f5c9d..8aeade94f9 100644 --- a/src/main/resources/regression/dl19-passage.openai-ada2.hnsw.cached.yaml +++ b/src/main/resources/regression/dl19-passage.openai-ada2.hnsw.cached.yaml @@ -56,10 +56,10 @@ models: params: -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000 results: AP@1000: - - 0.479 + - 0.4788 nDCG@10: - - 0.704 + - 0.7035 R@100: - - 0.624 + - 0.6235 R@1000: - - 0.857 + - 0.8629 diff --git a/src/main/resources/regression/dl20-passage.bge-base-en-v1.5.hnsw-int8.cached.yaml b/src/main/resources/regression/dl20-passage.bge-base-en-v1.5.hnsw-int8.cached.yaml index e092a742bb..65f7cbfc88 100644 --- a/src/main/resources/regression/dl20-passage.bge-base-en-v1.5.hnsw-int8.cached.yaml +++ b/src/main/resources/regression/dl20-passage.bge-base-en-v1.5.hnsw-int8.cached.yaml @@ -56,10 +56,10 @@ models: params: -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000 results: AP@1000: - - 0.463 + - 0.4650 nDCG@10: - - 0.674 + - 0.6780 R@100: - - 0.712 + - 0.7169 R@1000: - - 0.840 + - 0.8503 diff --git a/src/main/resources/regression/dl20-passage.bge-base-en-v1.5.hnsw-int8.onnx.yaml b/src/main/resources/regression/dl20-passage.bge-base-en-v1.5.hnsw-int8.onnx.yaml index 78a5495329..cb04b68b38 100644 --- a/src/main/resources/regression/dl20-passage.bge-base-en-v1.5.hnsw-int8.onnx.yaml +++ b/src/main/resources/regression/dl20-passage.bge-base-en-v1.5.hnsw-int8.onnx.yaml @@ -56,10 +56,10 @@ models: params: -generator VectorQueryGenerator -topicField title -threads 16 -hits 1000 -efSearch 1000 -encoder BgeBaseEn15 results: AP@1000: - - 0.462 + - 0.4650 nDCG@10: - - 0.677 + - 0.6780 R@100: - - 0.711 + - 0.7169 R@1000: - - 0.848 + - 0.8503 diff --git a/src/main/resources/regression/dl20-passage.bge-base-en-v1.5.hnsw.cached.yaml b/src/main/resources/regression/dl20-passage.bge-base-en-v1.5.hnsw.cached.yaml index f08b12e116..af7c099584 100644 --- a/src/main/resources/regression/dl20-passage.bge-base-en-v1.5.hnsw.cached.yaml +++ b/src/main/resources/regression/dl20-passage.bge-base-en-v1.5.hnsw.cached.yaml @@ -56,10 +56,10 @@ models: params: -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000 results: AP@1000: - - 0.464 + - 0.4650 nDCG@10: - - 0.677 + - 0.6780 R@100: - - 0.714 + - 0.7169 R@1000: - - 0.840 + - 0.8503 diff --git a/src/main/resources/regression/dl20-passage.bge-base-en-v1.5.hnsw.onnx.yaml b/src/main/resources/regression/dl20-passage.bge-base-en-v1.5.hnsw.onnx.yaml index 526e313788..bf5892e910 100644 --- a/src/main/resources/regression/dl20-passage.bge-base-en-v1.5.hnsw.onnx.yaml +++ b/src/main/resources/regression/dl20-passage.bge-base-en-v1.5.hnsw.onnx.yaml @@ -56,10 +56,10 @@ models: params: -generator VectorQueryGenerator -topicField title -threads 16 -hits 1000 -efSearch 1000 -encoder BgeBaseEn15 results: AP@1000: - - 0.462 + - 0.4650 nDCG@10: - - 0.677 + - 0.6780 R@100: - - 0.712 + - 0.7169 R@1000: - - 0.849 + - 0.8503 diff --git a/src/main/resources/regression/dl20-passage.cohere-embed-english-v3.0.hnsw-int8.cached.yaml b/src/main/resources/regression/dl20-passage.cohere-embed-english-v3.0.hnsw-int8.cached.yaml index 5b3f8b4056..1fc331b7a2 100644 --- a/src/main/resources/regression/dl20-passage.cohere-embed-english-v3.0.hnsw-int8.cached.yaml +++ b/src/main/resources/regression/dl20-passage.cohere-embed-english-v3.0.hnsw-int8.cached.yaml @@ -56,10 +56,10 @@ models: params: -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000 results: AP@1000: - - 0.505 + - 0.5067 nDCG@10: - - 0.722 + - 0.7245 R@100: - - 0.720 + - 0.7279 R@1000: - - 0.858 \ No newline at end of file + - 0.8682 \ No newline at end of file diff --git a/src/main/resources/regression/dl20-passage.cohere-embed-english-v3.0.hnsw.cached.yaml b/src/main/resources/regression/dl20-passage.cohere-embed-english-v3.0.hnsw.cached.yaml index 9fda129765..5c4529fa7e 100644 --- a/src/main/resources/regression/dl20-passage.cohere-embed-english-v3.0.hnsw.cached.yaml +++ b/src/main/resources/regression/dl20-passage.cohere-embed-english-v3.0.hnsw.cached.yaml @@ -56,10 +56,10 @@ models: params: -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000 results: AP@1000: - - 0.505 + - 0.5067 nDCG@10: - - 0.725 + - 0.7245 R@100: - - 0.724 + - 0.7279 R@1000: - - 0.864 \ No newline at end of file + - 0.8682 \ No newline at end of file diff --git a/src/main/resources/regression/dl20-passage.cos-dpr-distil.hnsw-int8.cached.yaml b/src/main/resources/regression/dl20-passage.cos-dpr-distil.hnsw-int8.cached.yaml index 5dce72b873..20873dfe04 100644 --- a/src/main/resources/regression/dl20-passage.cos-dpr-distil.hnsw-int8.cached.yaml +++ b/src/main/resources/regression/dl20-passage.cos-dpr-distil.hnsw-int8.cached.yaml @@ -56,10 +56,10 @@ models: params: -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000 results: AP@1000: - - 0.482 + - 0.4876 nDCG@10: - - 0.701 + - 0.7025 R@100: - - 0.712 + - 0.7204 R@1000: - - 0.843 + - 0.8533 diff --git a/src/main/resources/regression/dl20-passage.cos-dpr-distil.hnsw-int8.onnx.yaml b/src/main/resources/regression/dl20-passage.cos-dpr-distil.hnsw-int8.onnx.yaml index b20ad4affd..51049d8555 100644 --- a/src/main/resources/regression/dl20-passage.cos-dpr-distil.hnsw-int8.onnx.yaml +++ b/src/main/resources/regression/dl20-passage.cos-dpr-distil.hnsw-int8.onnx.yaml @@ -56,10 +56,10 @@ models: params: -generator VectorQueryGenerator -topicField title -threads 16 -hits 1000 -efSearch 1000 -encoder CosDprDistil results: AP@1000: - - 0.482 + - 0.4876 nDCG@10: - - 0.701 + - 0.7025 R@100: - - 0.712 + - 0.7204 R@1000: - - 0.843 + - 0.8533 diff --git a/src/main/resources/regression/dl20-passage.cos-dpr-distil.hnsw.cached.yaml b/src/main/resources/regression/dl20-passage.cos-dpr-distil.hnsw.cached.yaml index ff5c51375f..b1ac3947a2 100644 --- a/src/main/resources/regression/dl20-passage.cos-dpr-distil.hnsw.cached.yaml +++ b/src/main/resources/regression/dl20-passage.cos-dpr-distil.hnsw.cached.yaml @@ -56,10 +56,10 @@ models: params: -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000 results: AP@1000: - - 0.482 + - 0.4876 nDCG@10: - - 0.701 + - 0.7025 R@100: - - 0.712 + - 0.7204 R@1000: - - 0.843 + - 0.8533 diff --git a/src/main/resources/regression/dl20-passage.cos-dpr-distil.hnsw.onnx.yaml b/src/main/resources/regression/dl20-passage.cos-dpr-distil.hnsw.onnx.yaml index a7f57293c2..0e1c03d3c1 100644 --- a/src/main/resources/regression/dl20-passage.cos-dpr-distil.hnsw.onnx.yaml +++ b/src/main/resources/regression/dl20-passage.cos-dpr-distil.hnsw.onnx.yaml @@ -56,10 +56,10 @@ models: params: -generator VectorQueryGenerator -topicField title -threads 16 -hits 1000 -efSearch 1000 -encoder CosDprDistil results: AP@1000: - - 0.482 + - 0.4876 nDCG@10: - - 0.701 + - 0.7025 R@100: - - 0.712 + - 0.7204 R@1000: - - 0.843 + - 0.8533 diff --git a/src/main/resources/regression/dl20-passage.openai-ada2.hnsw-int8.cached.yaml b/src/main/resources/regression/dl20-passage.openai-ada2.hnsw-int8.cached.yaml index 787a2fb507..a76e57f1b5 100644 --- a/src/main/resources/regression/dl20-passage.openai-ada2.hnsw-int8.cached.yaml +++ b/src/main/resources/regression/dl20-passage.openai-ada2.hnsw-int8.cached.yaml @@ -56,10 +56,10 @@ models: params: -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000 results: AP@1000: - - 0.477 + - 0.4771 nDCG@10: - - 0.675 + - 0.6759 R@100: - - 0.727 + - 0.7237 R@1000: - - 0.866 + - 0.8705 diff --git a/src/main/resources/regression/dl20-passage.openai-ada2.hnsw.cached.yaml b/src/main/resources/regression/dl20-passage.openai-ada2.hnsw.cached.yaml index 13b1ac8a14..e0f50391ed 100644 --- a/src/main/resources/regression/dl20-passage.openai-ada2.hnsw.cached.yaml +++ b/src/main/resources/regression/dl20-passage.openai-ada2.hnsw.cached.yaml @@ -56,10 +56,10 @@ models: params: -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000 results: AP@1000: - - 0.477 + - 0.4771 nDCG@10: - - 0.676 + - 0.6759 R@100: - - 0.723 + - 0.7237 R@1000: - - 0.867 + - 0.8705 diff --git a/src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.hnsw-int8.cached.yaml b/src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.hnsw-int8.cached.yaml index 0008962a82..fff856cbb5 100644 --- a/src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.hnsw-int8.cached.yaml +++ b/src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.hnsw-int8.cached.yaml @@ -56,10 +56,10 @@ models: params: -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000 results: AP@1000: - - 0.362 + - 0.3641 RR@10: - - 0.356 + - 0.3583 R@100: - - 0.897 + - 0.9006 R@1000: - - 0.977 + - 0.9811 diff --git a/src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.hnsw-int8.onnx.yaml b/src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.hnsw-int8.onnx.yaml index 61a07240de..ae45b57f8a 100644 --- a/src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.hnsw-int8.onnx.yaml +++ b/src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.hnsw-int8.onnx.yaml @@ -56,10 +56,10 @@ models: params: -generator VectorQueryGenerator -topicField title -threads 16 -hits 1000 -efSearch 1000 -encoder BgeBaseEn15 results: AP@1000: - - 0.362 + - 0.3641 RR@10: - - 0.356 + - 0.3583 R@100: - - 0.897 + - 0.9006 R@1000: - - 0.977 + - 0.9811 diff --git a/src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.hnsw.cached.yaml b/src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.hnsw.cached.yaml index 61e0188f74..da3bae3244 100644 --- a/src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.hnsw.cached.yaml +++ b/src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.hnsw.cached.yaml @@ -56,10 +56,10 @@ models: params: -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000 results: AP@1000: - - 0.363 + - 0.3641 RR@10: - - 0.358 + - 0.3583 R@100: - - 0.897 + - 0.9006 R@1000: - - 0.977 + - 0.9811 diff --git a/src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.hnsw.onnx.yaml b/src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.hnsw.onnx.yaml index 996c2daafb..41af6b0138 100644 --- a/src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.hnsw.onnx.yaml +++ b/src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.hnsw.onnx.yaml @@ -56,10 +56,10 @@ models: params: -generator VectorQueryGenerator -topicField title -threads 16 -hits 1000 -efSearch 1000 -encoder BgeBaseEn15 results: AP@1000: - - 0.363 + - 0.3641 RR@10: - - 0.358 + - 0.3583 R@100: - - 0.897 + - 0.9006 R@1000: - - 0.977 + - 0.9811 diff --git a/src/main/resources/regression/msmarco-v1-passage.cohere-embed-english-v3.0.hnsw-int8.cached.yaml b/src/main/resources/regression/msmarco-v1-passage.cohere-embed-english-v3.0.hnsw-int8.cached.yaml index 53fd269549..dc50962369 100644 --- a/src/main/resources/regression/msmarco-v1-passage.cohere-embed-english-v3.0.hnsw-int8.cached.yaml +++ b/src/main/resources/regression/msmarco-v1-passage.cohere-embed-english-v3.0.hnsw-int8.cached.yaml @@ -56,10 +56,10 @@ models: params: -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000 results: nDCG@10: - - 0.427 + - 0.4287 AP@1000: - - 0.371 + - 0.3716 RR@10: - - 0.365 + - 0.3658 R@1000: - - 0.974 + - 0.9786 diff --git a/src/main/resources/regression/msmarco-v1-passage.cohere-embed-english-v3.0.hnsw.cached.yaml b/src/main/resources/regression/msmarco-v1-passage.cohere-embed-english-v3.0.hnsw.cached.yaml index bd45211b5d..433c39ba24 100644 --- a/src/main/resources/regression/msmarco-v1-passage.cohere-embed-english-v3.0.hnsw.cached.yaml +++ b/src/main/resources/regression/msmarco-v1-passage.cohere-embed-english-v3.0.hnsw.cached.yaml @@ -56,10 +56,10 @@ models: params: -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000 results: nDCG@10: - - 0.428 + - 0.4287 AP@1000: - - 0.371 + - 0.3716 RR@10: - - 0.365 + - 0.3658 R@1000: - - 0.974 + - 0.9786 diff --git a/src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.hnsw-int8.cached.yaml b/src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.hnsw-int8.cached.yaml index 7b1dd87fb1..1aae195713 100644 --- a/src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.hnsw-int8.cached.yaml +++ b/src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.hnsw-int8.cached.yaml @@ -56,10 +56,10 @@ models: params: -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000 results: AP@1000: - - 0.393 + - 0.3942 RR@10: - - 0.388 + - 0.3896 R@100: - - 0.903 + - 0.9075 R@1000: - - 0.974 + - 0.9796 diff --git a/src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.hnsw-int8.onnx.yaml b/src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.hnsw-int8.onnx.yaml index b5347ae6e0..51f76139ea 100644 --- a/src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.hnsw-int8.onnx.yaml +++ b/src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.hnsw-int8.onnx.yaml @@ -56,10 +56,10 @@ models: params: -generator VectorQueryGenerator -topicField title -threads 16 -hits 1000 -efSearch 1000 -encoder CosDprDistil results: AP@1000: - - 0.393 + - 0.3942 RR@10: - - 0.388 + - 0.3896 R@100: - - 0.903 + - 0.9075 R@1000: - - 0.974 + - 0.9796 diff --git a/src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.hnsw.cached.yaml b/src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.hnsw.cached.yaml index 5fa7e7c702..adb737abb1 100644 --- a/src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.hnsw.cached.yaml +++ b/src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.hnsw.cached.yaml @@ -56,10 +56,10 @@ models: params: -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000 results: AP@1000: - - 0.393 + - 0.3942 RR@10: - - 0.388 + - 0.3896 R@100: - - 0.903 + - 0.9075 R@1000: - - 0.974 + - 0.9796 diff --git a/src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.hnsw.onnx.yaml b/src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.hnsw.onnx.yaml index 0c79b27540..27b10e26b2 100644 --- a/src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.hnsw.onnx.yaml +++ b/src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.hnsw.onnx.yaml @@ -56,10 +56,10 @@ models: params: -generator VectorQueryGenerator -topicField title -threads 16 -hits 1000 -efSearch 1000 -encoder CosDprDistil results: AP@1000: - - 0.393 + - 0.3942 RR@10: - - 0.388 + - 0.3896 R@100: - - 0.903 + - 0.9075 R@1000: - - 0.974 + - 0.9796 diff --git a/src/main/resources/regression/msmarco-v1-passage.openai-ada2.hnsw-int8.cached.yaml b/src/main/resources/regression/msmarco-v1-passage.openai-ada2.hnsw-int8.cached.yaml index 66b1bb8cde..8309dc34dc 100644 --- a/src/main/resources/regression/msmarco-v1-passage.openai-ada2.hnsw-int8.cached.yaml +++ b/src/main/resources/regression/msmarco-v1-passage.openai-ada2.hnsw-int8.cached.yaml @@ -56,10 +56,10 @@ models: params: -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000 results: AP@1000: - - 0.343 + - 0.3505 RR@10: - - 0.336 + - 0.3434 R@100: - - 0.894 + - 0.8996 R@1000: - - 0.983 + - 0.9858 diff --git a/src/main/resources/regression/msmarco-v1-passage.openai-ada2.hnsw.cached.yaml b/src/main/resources/regression/msmarco-v1-passage.openai-ada2.hnsw.cached.yaml index 10cb1a48f3..388193244a 100644 --- a/src/main/resources/regression/msmarco-v1-passage.openai-ada2.hnsw.cached.yaml +++ b/src/main/resources/regression/msmarco-v1-passage.openai-ada2.hnsw.cached.yaml @@ -56,10 +56,10 @@ models: params: -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000 results: AP@1000: - - 0.350 + - 0.3505 RR@10: - - 0.343 + - 0.3434 R@100: - - 0.898 + - 0.8996 R@1000: - - 0.985 + - 0.9858