Update DPR index names in ODQA regressions; minor tweak in MS MARCO test tolerance (#1508)
lintool authored May 2, 2023
1 parent ca5a2be commit dcc0ba0
Showing 4 changed files with 27 additions and 27 deletions.
7 changes: 4 additions & 3 deletions pyserini/2cr/msmarco.py
@@ -466,9 +466,10 @@ def run_conditions(args):
                                            runfile))
             if math.isclose(score, float(expected[metric])):
                 result_str = ok_str
-            # Flaky test: small difference on my iMac Studio
-            elif args.collection == 'v1-passage' and topic_key == 'msmarco-passage-dev-subset' and \
-                    name == 'ance-otf' and math.isclose(score, float(expected[metric]), abs_tol=2e-4):
+            # Flaky tests
+            elif args.collection == 'msmarco-v1-passage' \
+                    and topic_key == 'msmarco-passage-dev-subset' and name == 'ance-otf' \
+                    and metric == 'MRR@10' and abs(score-float(expected[metric])) <= 0.0001:
                 result_str = okish_str
             else:
                 result_str = fail_str + f' expected {expected[metric]:.4f}'
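
The substantive change above is the tolerance check. `math.isclose` with `abs_tol=2e-4` passes when either the absolute bound or the default relative tolerance (`rel_tol=1e-09`) is satisfied, and the old branch applied to every metric; the new branch uses a plain absolute difference of at most `1e-4` and fires only for MRR@10. A minimal sketch of the two checks, with hypothetical scores chosen to show where they diverge:

    import math

    score, expected = 0.3303, 0.3301  # hypothetical MRR@10 values, 2e-4 apart

    # Old check: passes, since abs(score - expected) = 2e-4 <= abs_tol of 2e-4.
    old_ok = math.isclose(score, expected, abs_tol=2e-4)

    # New check: fails, since the window is tightened to 1e-4 (and the branch
    # now also requires metric == 'MRR@10').
    new_ok = abs(score - expected) <= 0.0001

    print(old_ok, new_ok)  # True False
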
16 changes: 8 additions & 8 deletions pyserini/resources/naturalquestion.yaml
@@ -1,7 +1,7 @@
 conditions:
   - model_name: BM25-k1_0.9_b_0.4
     command:
-      - python -m pyserini.search.lucene --threads 72 --batch-size 128 --index wikipedia-dpr --topics nq-test --output $output --bm25 --k1 0.9 --b 0.4
+      - python -m pyserini.search.lucene --threads 72 --batch-size 128 --index wikipedia-dpr-100w --topics nq-test --output $output --bm25 --k1 0.9 --b 0.4
     scores:
       - Top5: 44.82
         Top20: 64.02
@@ -10,7 +10,7 @@ conditions:
         Top1000: 88.95
   - model_name: BM25-k1_0.9_b_0.4_dpr-topics
     command:
-      - python -m pyserini.search.lucene --threads 72 --batch-size 128 --index wikipedia-dpr --topics dpr-nq-test --output $output --bm25 --k1 0.9 --b 0.4
+      - python -m pyserini.search.lucene --threads 72 --batch-size 128 --index wikipedia-dpr-100w --topics dpr-nq-test --output $output --bm25 --k1 0.9 --b 0.4
     scores:
       - Top5: 43.77
         Top20: 62.99
@@ -19,9 +19,9 @@ conditions:
         Top1000: 88.01
   - model_name: GarT5-RRF
     command:
-      - python -m pyserini.search.lucene --threads 72 --batch-size 128 --index wikipedia-dpr --topics nq-test-gar-t5-answers --output $output --bm25 --k1 0.9 --b 0.4
-      - python -m pyserini.search.lucene --threads 72 --batch-size 128 --index wikipedia-dpr --topics nq-test-gar-t5-titles --output $output --bm25 --k1 0.9 --b 0.4
-      - python -m pyserini.search.lucene --threads 72 --batch-size 128 --index wikipedia-dpr --topics nq-test-gar-t5-sentences --output $output --bm25 --k1 0.9 --b 0.4
+      - python -m pyserini.search.lucene --threads 72 --batch-size 128 --index wikipedia-dpr-100w --topics nq-test-gar-t5-answers --output $output --bm25 --k1 0.9 --b 0.4
+      - python -m pyserini.search.lucene --threads 72 --batch-size 128 --index wikipedia-dpr-100w --topics nq-test-gar-t5-titles --output $output --bm25 --k1 0.9 --b 0.4
+      - python -m pyserini.search.lucene --threads 72 --batch-size 128 --index wikipedia-dpr-100w --topics nq-test-gar-t5-sentences --output $output --bm25 --k1 0.9 --b 0.4
     scores:
       - Top5: 64.62
         Top20: 77.17
@@ -30,7 +30,7 @@ conditions:
         Top1000: 92.91
   - model_name: DPR
     command:
-      - python -m pyserini.search.faiss --threads 72 --batch-size 128 --index wikipedia-dpr-single-nq-bf --encoder facebook/dpr-question_encoder-single-nq-base --topics nq-test --output $output
+      - python -m pyserini.search.faiss --threads 72 --batch-size 128 --index wikipedia-dpr-100w.dpr-single-nq --encoder facebook/dpr-question_encoder-single-nq-base --topics nq-test --output $output
     scores:
       - Top5: 68.61
         Top20: 80.58
@@ -39,7 +39,7 @@ conditions:
         Top1000: 91.83
   - model_name: DPR-DKRR
     command:
-      - 'python -m pyserini.search.faiss --threads 72 --batch-size 128 --index wikipedia-dpr-dkrr-nq --encoder castorini/dkrr-dpr-nq-retriever --topics nq-test --output $output --query-prefix question: '
+      - 'python -m pyserini.search.faiss --threads 72 --batch-size 128 --index wikipedia-dpr-100w.dkrr-nq --encoder castorini/dkrr-dpr-nq-retriever --topics nq-test --output $output --query-prefix question: '
     scores:
       - Top5: 73.80
         Top20: 84.27
@@ -48,7 +48,7 @@ conditions:
         Top1000: 93.43
   - model_name: DPR-Hybrid
     command:
-      - python -m pyserini.search.hybrid dense --index wikipedia-dpr-single-nq-bf --encoder facebook/dpr-question_encoder-single-nq-base sparse --index wikipedia-dpr fusion --alpha 1.2 run --topics nq-test --output $output --threads 72 --batch-size 128
+      - python -m pyserini.search.hybrid dense --index wikipedia-dpr-100w.dpr-single-nq --encoder facebook/dpr-question_encoder-single-nq-base sparse --index wikipedia-dpr-100w fusion --alpha 1.2 run --topics nq-test --output $output --threads 72 --batch-size 128
     scores:
       - Top5: 72.52
         Top20: 83.43
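
For context, each condition in these YAML files names a model, the retrieval command(s) to run, and the expected scores. A rough sketch of how such a file can be consumed (illustrative only; `run_all_odqa.py` below is the actual consumer, and the loop here is not Pyserini's API):

    import yaml

    with open('pyserini/resources/naturalquestion.yaml') as f:
        conditions = yaml.safe_load(f)['conditions']

    for condition in conditions:
        print(condition['model_name'])
        # 'command' is a list: multi-command conditions (e.g., GarT5-RRF) run
        # several retrieval passes whose results are later fused.
        for cmd in condition['command']:
            print(f'  run: {cmd}')
        # 'scores' is a list of {metric: expected} mappings.
        for expected in condition['scores']:
            for metric, expected_score in expected.items():
                print(f'  expect {metric} = {expected_score}')
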
16 changes: 8 additions & 8 deletions pyserini/resources/triviaqa.yaml
@@ -1,7 +1,7 @@
 conditions:
   - model_name: BM25-k1_0.9_b_0.4
     command:
-      - python -m pyserini.search.lucene --threads 72 --batch-size 128 --index wikipedia-dpr --topics dpr-trivia-test --output $output --bm25 --k1 0.9 --b 0.4
+      - python -m pyserini.search.lucene --threads 72 --batch-size 128 --index wikipedia-dpr-100w --topics dpr-trivia-test --output $output --bm25 --k1 0.9 --b 0.4
     scores:
       - Top5: 66.29
         Top20: 76.41
@@ -10,7 +10,7 @@ conditions:
         Top1000: 88.50
   - model_name: BM25-k1_0.9_b_0.4_dpr-topics
     command:
-      - python -m pyserini.search.lucene --threads 72 --batch-size 128 --index wikipedia-dpr --topics dpr-trivia-test --output $output --bm25 --k1 0.9 --b 0.4
+      - python -m pyserini.search.lucene --threads 72 --batch-size 128 --index wikipedia-dpr-100w --topics dpr-trivia-test --output $output --bm25 --k1 0.9 --b 0.4
     scores:
       - Top5: 66.29
         Top20: 76.41
@@ -19,9 +19,9 @@ conditions:
         Top1000: 88.50
   - model_name: GarT5-RRF
     command:
-      - python -m pyserini.search.lucene --threads 72 --batch-size 128 --index wikipedia-dpr --topics dpr-trivia-test-gar-t5-answers --output $output --bm25 --k1 0.9 --b 0.4
-      - python -m pyserini.search.lucene --threads 72 --batch-size 128 --index wikipedia-dpr --topics dpr-trivia-test-gar-t5-titles --output $output --bm25 --k1 0.9 --b 0.4
-      - python -m pyserini.search.lucene --threads 72 --batch-size 128 --index wikipedia-dpr --topics dpr-trivia-test-gar-t5-sentences --output $output --bm25 --k1 0.9 --b 0.4
+      - python -m pyserini.search.lucene --threads 72 --batch-size 128 --index wikipedia-dpr-100w --topics dpr-trivia-test-gar-t5-answers --output $output --bm25 --k1 0.9 --b 0.4
+      - python -m pyserini.search.lucene --threads 72 --batch-size 128 --index wikipedia-dpr-100w --topics dpr-trivia-test-gar-t5-titles --output $output --bm25 --k1 0.9 --b 0.4
+      - python -m pyserini.search.lucene --threads 72 --batch-size 128 --index wikipedia-dpr-100w --topics dpr-trivia-test-gar-t5-sentences --output $output --bm25 --k1 0.9 --b 0.4
     scores:
       - Top5: 72.82
         Top20: 80.66
@@ -30,7 +30,7 @@ conditions:
         Top1000: 90.06
   - model_name: DPR
     command:
-      - python -m pyserini.search.faiss --threads 72 --batch-size 128 --index wikipedia-dpr-multi-bf --encoder facebook/dpr-question_encoder-multiset-base --topics dpr-trivia-test --output $output
+      - python -m pyserini.search.faiss --threads 72 --batch-size 128 --index wikipedia-dpr-100w.dpr-multi --encoder facebook/dpr-question_encoder-multiset-base --topics dpr-trivia-test --output $output
     scores:
       - Top5: 69.80
         Top20: 78.87
@@ -39,7 +39,7 @@ conditions:
         Top1000: 89.30
   - model_name: DPR-DKRR
     command:
-      - 'python -m pyserini.search.faiss --threads 72 --batch-size 128 --index wikipedia-dpr-dkrr-tqa --encoder castorini/dkrr-dpr-tqa-retriever --topics dpr-trivia-test --output $output --query-prefix question: '
+      - 'python -m pyserini.search.faiss --threads 72 --batch-size 128 --index wikipedia-dpr-100w.dkrr-tqa --encoder castorini/dkrr-dpr-tqa-retriever --topics dpr-trivia-test --output $output --query-prefix question: '
     scores:
       - Top5: 77.23
         Top20: 83.74
@@ -48,7 +48,7 @@ conditions:
         Top1000: 90.63
   - model_name: DPR-Hybrid
     command:
-      - python -m pyserini.search.hybrid dense --index wikipedia-dpr-multi-bf --encoder facebook/dpr-question_encoder-multiset-base sparse --index wikipedia-dpr fusion --alpha 0.95 run --topics dpr-trivia-test --output $output --threads 72 --batch-size 128
+      - python -m pyserini.search.hybrid dense --index wikipedia-dpr-100w.dpr-multi --encoder facebook/dpr-question_encoder-multiset-base sparse --index wikipedia-dpr-100w fusion --alpha 0.95 run --topics dpr-trivia-test --output $output --threads 72 --batch-size 128
     scores:
       - Top5: 76.01
         Top20: 82.64
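
Taken together, the two YAML files apply one consistent renaming of the prebuilt Wikipedia/DPR indexes. A small helper capturing the old-to-new mapping visible in this diff, in case downstream scripts hard-code the old names (the helper itself is hypothetical, not part of Pyserini):

    # Old -> new prebuilt index names, as updated in this commit.
    INDEX_RENAMES = {
        'wikipedia-dpr': 'wikipedia-dpr-100w',
        'wikipedia-dpr-single-nq-bf': 'wikipedia-dpr-100w.dpr-single-nq',
        'wikipedia-dpr-multi-bf': 'wikipedia-dpr-100w.dpr-multi',
        'wikipedia-dpr-dkrr-nq': 'wikipedia-dpr-100w.dkrr-nq',
        'wikipedia-dpr-dkrr-tqa': 'wikipedia-dpr-100w.dkrr-tqa',
    }

    def migrate_command(cmd: str) -> str:
        # Replace whole whitespace-delimited tokens only, so that the short
        # name 'wikipedia-dpr' does not clobber its longer variants.
        return ' '.join(INDEX_RENAMES.get(tok, tok) for tok in cmd.split())

For example, migrate_command('--index wikipedia-dpr --topics nq-test') yields '--index wikipedia-dpr-100w --topics nq-test'.
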
15 changes: 7 additions & 8 deletions scripts/repro_matrix/run_all_odqa.py
@@ -29,6 +29,7 @@
 GARRRF_LS = ['answers','titles','sentences']
 HITS_1K = set(['GarT5-RRF', 'DPR-DKRR', 'DPR-Hybrid'])
 
+
 def print_results(metric, topics):
     print(f'Metric = {metric}, Topics = {topics}')
     for model in models['models']:
@@ -112,16 +113,14 @@ def print_results(metric, topics):
                     raise RuntimeError('fusion failed')
                 runfile = [output]
 
-
             # trec conversion + evaluation
             if not args.skip_eval:
                 jsonfile = runfile[0].replace('.txt', '.json')
                 runfile = jsonfile.replace('.json','.txt')
                 if not os.path.exists(jsonfile):
-                    status = convert_trec_run_to_dpr_retrieval_json(
-                        topics, 'wikipedia-dpr', runfile, jsonfile)
+                    status = convert_trec_run_to_dpr_retrieval_json(topics, 'wikipedia-dpr-100w', runfile, jsonfile)
                     if status != 0:
-                        raise RuntimeError("dpr retrieval convertion failed")
+                        raise RuntimeError("dpr retrieval conversion failed")
                 topk_defs = evaluate_dpr_retrieval_metric_definitions['Top5-100']
                 if args.full_topk:
                     topk_defs = evaluate_dpr_retrieval_metric_definitions['Top5-1000']
@@ -130,13 +129,13 @@ def print_results(metric, topics):
             # comparing ground truth scores with the generated ones
             for expected in condition['scores']:
                 for metric, expected_score in expected.items():
-                    if metric not in score.keys(): continue
+                    if metric not in score.keys():
+                        continue
                    if not args.skip_eval:
-                        if math.isclose(score[metric], float(expected_score),abs_tol=2e-2):
+                        if math.isclose(score[metric], float(expected_score), abs_tol=2e-2):
                            result_str = ok_str
                        else:
-                            result_str = fail_str + \
-                                f' expected {expected[metric]:.4f}'
+                            result_str = fail_str + f' expected {expected[metric]:.4f}'
                        print(f'    {metric:7}: {score[metric]:.2f} {result_str}')
                        table[name][metric] = score[metric]
                    else:
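
The comparison loop at the end is easiest to read in isolation. A self-contained rendition with hypothetical scores (the real script fills `score` from the evaluation step; the `2e-2` tolerance is for Top-k percentages, a looser window than the MRR@10 check in msmarco.py above):

    import math

    condition = {'scores': [{'Top5': 73.80, 'Top20': 84.27}]}  # hypothetical
    score = {'Top5': 73.79, 'Top20': 84.30}                    # hypothetical

    for expected in condition['scores']:
        for metric, expected_score in expected.items():
            if metric not in score.keys():
                continue
            # Within 0.02 of the expected value counts as a pass.
            if math.isclose(score[metric], float(expected_score), abs_tol=2e-2):
                result_str = 'OK'
            else:
                result_str = f'FAIL expected {expected[metric]:.4f}'
            print(f'    {metric:7}: {score[metric]:.2f} {result_str}')

Run as-is, Top5 passes (off by 0.01) and Top20 fails (off by 0.03).
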
