Skip to content

Commit

Permalink
Fix TREC-COVID regressions and fine-tuning experiments (#2431)
Browse files Browse the repository at this point in the history
The SearchCollection parameters were renamed a while ago, but the Python scripts for the
TREC-COVID regressions and fine-tuning experiments were never updated to match.
  • Loading branch information
lintool committed Mar 29, 2024
1 parent 5fb697a commit b91df2f
Show file tree
Hide file tree
Showing 8 changed files with 92 additions and 88 deletions.
13 changes: 9 additions & 4 deletions src/main/java/io/anserini/search/ScoredDocs.java
Original file line number Diff line number Diff line change
Expand Up @@ -86,12 +86,17 @@ public static ScoredDocs fromQrels(Map<String, Integer> qrels, IndexReader reade
String externalDocid = qrelsDocScorePair.getKey();
Query q = new TermQuery(new Term(Constants.ID, externalDocid));
TopDocs rs = searcher.search(q, 1);
lucene_documents.add(storedFields.document(rs.scoreDocs[0].doc));
lucene_docids.add(rs.scoreDocs[0].doc);
score.add(Float.valueOf(qrelsDocScorePair.getValue().floatValue()));
docids.add(storedFields.document(rs.scoreDocs[0].doc).get(Constants.ID));

// If for whatever reason we can't find the doc, then skip.
if (rs.totalHits.value > 0) {
lucene_documents.add(storedFields.document(rs.scoreDocs[0].doc));
lucene_docids.add(rs.scoreDocs[0].doc);
score.add(Float.valueOf(qrelsDocScorePair.getValue().floatValue()));
docids.add(storedFields.document(rs.scoreDocs[0].doc).get(Constants.ID));
}
}
} catch (IOException | ArrayIndexOutOfBoundsException | NullPointerException e) {
e.printStackTrace();
throw new RuntimeException("Error loading qrels.");
}

Expand Down
3 changes: 1 addition & 2 deletions src/main/python/fine_tuning/run_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,6 @@ def batch_retrieval(collection_yaml, models_yaml, output_root):
for para in model_params:
this_para = (
program,
'-searchtweets' if 'mb' in collection_yaml['name'] else '',
'-topicReader', collection_yaml['topic_reader'],
'-index', index_path,
'-topics', os.path.join(collection_yaml['anserini_root'], collection_yaml['topic_root'], collection_yaml['topic']),
Expand Down Expand Up @@ -194,7 +193,7 @@ def verify_effectiveness(collection_yaml, models_yaml, output_root, fold_setting
parser.add_argument('--run', action='store_true', help='Generate the runs files and evaluate them. Otherwise we only output the evaluation results (based on the existing eval files)')
parser.add_argument('--collection', required=True, help='the collection key in yaml')
parser.add_argument('--model', required=True, help='model')
parser.add_argument('--threads', dest='parallelism', type=int, default=16, help='number of parallel threads for retrieval and evaluation')
parser.add_argument('--parallelism', dest='parallelism', type=int, default=8, help='number of parallel threads for retrieval and evaluation')
parser.add_argument('--output_root', default='fine_tuning_results', help='output directory of all results')
parser.add_argument('--fold_settings', default='', help='JSON file holding fold definitions, see src/main/resources/fine_tuning/robust04-paper1-folds.json for an example')
parser.add_argument('--verbose', action='store_true', help='if specified print out model parameters and per fold scores')
Expand Down
4 changes: 2 additions & 2 deletions src/main/python/fine_tuning/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,10 @@ def gen_batch_retrieval_params(self, model_yaml, output_root, parallelism=4):
all_params = []
if not os.path.exists(os.path.join(output_root, self.run_files_root)):
os.makedirs(os.path.join(output_root, self.run_files_root))
para_str = '-threads %d %s' % (parallelism, model_yaml['fixed_params'])
para_str = '-parallelism %d %s' % (parallelism, model_yaml['fixed_params'])
results_fn = os.path.join(output_root, self.run_files_root, model_yaml['name'])
for param_name, params in model_yaml['params'].items():
para_str += ' -%s' % (param_name)
para_str += ' -%s' % param_name
for p in self.drange(params['lower'], params['upper']+1e-8, params['pace']):
is_float = True if params['type'] == 'float' else False
para_str += ' %.2f' % (p) if is_float else ' %d' % (p)
Expand Down
24 changes: 12 additions & 12 deletions src/main/python/trec-covid/covid_baseline_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#

import math
import os
import re
import subprocess

import pyserini.util


def perform_runs(round_number, indexes):
base_topics = f'tools/topics-and-qrels/topics.covid-round{round_number}.xml'
udel_topics = f'tools/topics-and-qrels/topics.covid-round{round_number}-udel.xml'
Expand All @@ -35,17 +35,17 @@ def perform_runs(round_number, indexes):
abstract_index = indexes[0]
abstract_prefix = f'anserini.covid-r{round_number}.abstract'
os.system(f'target/appassembler/bin/SearchCollection -index {abstract_index} ' +
f'-topicreader Covid -topics {base_topics} -topicfield query+question ' +
f'-removedups -bm25 -hits 10000 ' +
f'-topicReader Covid -topics {base_topics} -topicField query+question ' +
f'-removeDuplicates -bm25 -hits 10000 ' +
f'-output runs/{abstract_prefix}.qq.bm25.txt -runtag {abstract_prefix}.qq.bm25.txt')

os.system(f'target/appassembler/bin/SearchCollection -index {abstract_index} ' +
f'-topicreader Covid -topics {udel_topics} -topicfield query ' +
f'-removedups -bm25 -hits 10000 ' +
f'-topicReader Covid -topics {udel_topics} -topicField query ' +
f'-removeDuplicates -bm25 -hits 10000 ' +
f'-output runs/{abstract_prefix}.qdel.bm25.txt -runtag {abstract_prefix}.qdel.bm25.txt')

os.system(f'target/appassembler/bin/SearchCollection -index {abstract_index} ' +
f'-topicreader Covid -topics {udel_topics} -topicfield query -removedups ' +
f'-topicReader Covid -topics {udel_topics} -topicField query -removeDuplicates ' +
f'-bm25 -rm3 -rm3.fbTerms 100 -hits 10000 ' +
f'-rf.qrels {cumulative_qrels} ' +
f'-output runs/{abstract_prefix}.qdel.bm25+rm3Rf.txt -runtag {abstract_prefix}.qdel.bm25+rm3Rf.txt')
Expand All @@ -57,13 +57,13 @@ def perform_runs(round_number, indexes):
full_text_index = indexes[1]
full_text_prefix = f'anserini.covid-r{round_number}.full-text'
os.system(f'target/appassembler/bin/SearchCollection -index {full_text_index} ' +
f'-topicreader Covid -topics {base_topics} -topicfield query+question ' +
f'-removedups -bm25 -hits 10000 ' +
f'-topicReader Covid -topics {base_topics} -topicField query+question ' +
f'-removeDuplicates -bm25 -hits 10000 ' +
f'-output runs/{full_text_prefix}.qq.bm25.txt -runtag {full_text_prefix}.qq.bm25.txt')

os.system(f'target/appassembler/bin/SearchCollection -index {full_text_index} ' +
f'-topicreader Covid -topics {udel_topics} -topicfield query ' +
f'-removedups -bm25 -hits 10000 ' +
f'-topicReader Covid -topics {udel_topics} -topicField query ' +
f'-removeDuplicates -bm25 -hits 10000 ' +
f'-output runs/{full_text_prefix}.qdel.bm25.txt -runtag {full_text_prefix}.qdel.bm25.txt')

print('')
Expand All @@ -73,12 +73,12 @@ def perform_runs(round_number, indexes):
paragraph_index = indexes[2]
paragraph_prefix = f'anserini.covid-r{round_number}.paragraph'
os.system(f'target/appassembler/bin/SearchCollection -index {paragraph_index} ' +
f'-topicreader Covid -topics {base_topics} -topicfield query+question ' +
f'-topicReader Covid -topics {base_topics} -topicField query+question ' +
f'-selectMaxPassage -bm25 -hits 50000 ' +
f'-output runs/{paragraph_prefix}.qq.bm25.txt -runtag {paragraph_prefix}.qq.bm25.txt')

os.system(f'target/appassembler/bin/SearchCollection -index {paragraph_index} ' +
f'-topicreader Covid -topics {udel_topics} -topicfield query ' +
f'-topicReader Covid -topics {udel_topics} -topicField query ' +
f'-selectMaxPassage -bm25 -hits 50000 ' +
f'-output runs/{paragraph_prefix}.qdel.bm25.txt -runtag {paragraph_prefix}.qdel.bm25.txt')

Expand Down
72 changes: 36 additions & 36 deletions src/main/python/trec-covid/generate_round1_baselines.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,28 +63,28 @@ def perform_runs():

abstract_index = indexes[0]
os.system(f'target/appassembler/bin/SearchCollection -index {abstract_index} ' +
f'-topicreader Covid -topics {base_topics} -topicfield query ' +
f'-removedups -bm25 ' +
f'-topicReader Covid -topics {base_topics} -topicField query ' +
f'-removeDuplicates -bm25 ' +
f'-output runs/anserini.covid-r1.abstract.query.bm25.txt')
os.system(f'target/appassembler/bin/SearchCollection -index {abstract_index} ' +
f'-topicreader Covid -topics {base_topics} -topicfield question ' +
f'-removedups -bm25 ' +
f'-topicReader Covid -topics {base_topics} -topicField question ' +
f'-removeDuplicates -bm25 ' +
f'-output runs/anserini.covid-r1.abstract.question.bm25.txt')
os.system(f'target/appassembler/bin/SearchCollection -index {abstract_index} ' +
f'-topicreader Covid -topics {base_topics} -topicfield query+question ' +
f'-removedups -bm25 ' +
f'-topicReader Covid -topics {base_topics} -topicField query+question ' +
f'-removeDuplicates -bm25 ' +
f'-output runs/anserini.covid-r1.abstract.query+question.bm25.txt')
os.system(f'target/appassembler/bin/SearchCollection -index {abstract_index} ' +
f'-topicreader Covid -topics {base_topics} -topicfield query+question+narrative ' +
f'-removedups -bm25 ' +
f'-topicReader Covid -topics {base_topics} -topicField query+question+narrative ' +
f'-removeDuplicates -bm25 ' +
f'-output runs/anserini.covid-r1.abstract.query+question+narrative.bm25.txt')
os.system(f'target/appassembler/bin/SearchCollection -index {abstract_index} ' +
f'-topicreader Covid -topics {udel_topics} -topicfield query ' +
f'-removedups -bm25 ' +
f'-topicReader Covid -topics {udel_topics} -topicField query ' +
f'-removeDuplicates -bm25 ' +
f'-output runs/anserini.covid-r1.abstract.query-udel.bm25.txt')
os.system(f'target/appassembler/bin/SearchCollection -index {abstract_index} ' +
f'-topicreader Covid -topics {base_topics} -topicfield query -querygenerator Covid19QueryGenerator ' +
f'-removedups -bm25 ' +
f'-topicReader Covid -topics {base_topics} -topicField query -generator Covid19QueryGenerator ' +
f'-removeDuplicates -bm25 ' +
f'-output runs/anserini.covid-r1.abstract.query-covid19.bm25.txt')

print('')
Expand All @@ -93,28 +93,28 @@ def perform_runs():

full_text_index = indexes[1]
os.system(f'target/appassembler/bin/SearchCollection -index {full_text_index} ' +
f'-topicreader Covid -topics {base_topics} -topicfield query ' +
f'-removedups -bm25 ' +
f'-topicReader Covid -topics {base_topics} -topicField query ' +
f'-removeDuplicates -bm25 ' +
f'-output runs/anserini.covid-r1.full-text.query.bm25.txt')
os.system(f'target/appassembler/bin/SearchCollection -index {full_text_index} ' +
f'-topicreader Covid -topics {base_topics} -topicfield question ' +
f'-removedups -bm25 ' +
f'-topicReader Covid -topics {base_topics} -topicField question ' +
f'-removeDuplicates -bm25 ' +
f'-output runs/anserini.covid-r1.full-text.question.bm25.txt')
os.system(f'target/appassembler/bin/SearchCollection -index {full_text_index} ' +
f'-topicreader Covid -topics {base_topics} -topicfield query+question ' +
f'-removedups -bm25 ' +
f'-topicReader Covid -topics {base_topics} -topicField query+question ' +
f'-removeDuplicates -bm25 ' +
f'-output runs/anserini.covid-r1.full-text.query+question.bm25.txt')
os.system(f'target/appassembler/bin/SearchCollection -index {full_text_index} ' +
f'-topicreader Covid -topics {base_topics} -topicfield query+question+narrative ' +
f'-removedups -bm25 ' +
f'-topicReader Covid -topics {base_topics} -topicField query+question+narrative ' +
f'-removeDuplicates -bm25 ' +
f'-output runs/anserini.covid-r1.full-text.query+question+narrative.bm25.txt')
os.system(f'target/appassembler/bin/SearchCollection -index {full_text_index} ' +
f'-topicreader Covid -topics {udel_topics} -topicfield query ' +
f'-removedups -bm25 ' +
f'-topicReader Covid -topics {udel_topics} -topicField query ' +
f'-removeDuplicates -bm25 ' +
f'-output runs/anserini.covid-r1.full-text.query-udel.bm25.txt')
os.system(f'target/appassembler/bin/SearchCollection -index {full_text_index} ' +
f'-topicreader Covid -topics {base_topics} -topicfield query -querygenerator Covid19QueryGenerator ' +
f'-removedups -bm25 ' +
f'-topicReader Covid -topics {base_topics} -topicField query -generator Covid19QueryGenerator ' +
f'-removeDuplicates -bm25 ' +
f'-output runs/anserini.covid-r1.full-text.query-covid19.bm25.txt')

print('')
Expand All @@ -123,28 +123,28 @@ def perform_runs():

paragraph_index = indexes[2]
os.system(f'target/appassembler/bin/SearchCollection -index {paragraph_index} ' +
f'-topicreader Covid -topics {base_topics} -topicfield query ' +
f'-removedups -bm25 -selectMaxPassage ' +
f'-topicReader Covid -topics {base_topics} -topicField query ' +
f'-removeDuplicates -bm25 -selectMaxPassage ' +
f'-output runs/anserini.covid-r1.paragraph.query.bm25.txt')
os.system(f'target/appassembler/bin/SearchCollection -index {paragraph_index} ' +
f'-topicreader Covid -topics {base_topics} -topicfield question ' +
f'-removedups -bm25 -selectMaxPassage ' +
f'-topicReader Covid -topics {base_topics} -topicField question ' +
f'-removeDuplicates -bm25 -selectMaxPassage ' +
f'-output runs/anserini.covid-r1.paragraph.question.bm25.txt')
os.system(f'target/appassembler/bin/SearchCollection -index {paragraph_index} ' +
f'-topicreader Covid -topics {base_topics} -topicfield query+question ' +
f'-removedups -bm25 -selectMaxPassage ' +
f'-topicReader Covid -topics {base_topics} -topicField query+question ' +
f'-removeDuplicates -bm25 -selectMaxPassage ' +
f'-output runs/anserini.covid-r1.paragraph.query+question.bm25.txt')
os.system(f'target/appassembler/bin/SearchCollection -index {paragraph_index} ' +
f'-topicreader Covid -topics {base_topics} -topicfield query+question+narrative ' +
f'-removedups -bm25 -selectMaxPassage ' +
f'-topicReader Covid -topics {base_topics} -topicField query+question+narrative ' +
f'-removeDuplicates -bm25 -selectMaxPassage ' +
f'-output runs/anserini.covid-r1.paragraph.query+question+narrative.bm25.txt')
os.system(f'target/appassembler/bin/SearchCollection -index {paragraph_index} ' +
f'-topicreader Covid -topics {udel_topics} -topicfield query ' +
f'-removedups -bm25 -selectMaxPassage ' +
f'-topicReader Covid -topics {udel_topics} -topicField query ' +
f'-removeDuplicates -bm25 -selectMaxPassage ' +
f'-output runs/anserini.covid-r1.paragraph.query-udel.bm25.txt')
os.system(f'target/appassembler/bin/SearchCollection -index {paragraph_index} ' +
f'-topicreader Covid -topics {base_topics} -topicfield query -querygenerator Covid19QueryGenerator ' +
f'-removedups -bm25 -selectMaxPassage ' +
f'-topicReader Covid -topics {base_topics} -topicField query -generator Covid19QueryGenerator ' +
f'-removeDuplicates -bm25 -selectMaxPassage ' +
f'-output runs/anserini.covid-r1.paragraph.query-covid19.bm25.txt')


Expand Down
20 changes: 10 additions & 10 deletions src/main/python/trec-covid/generate_round2_baselines.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,13 +59,13 @@ def perform_runs():

abstract_index = indexes[0]
os.system(f'target/appassembler/bin/SearchCollection -index {abstract_index} ' +
f'-topicreader Covid -topics {base_topics} -topicfield query+question ' +
f'-removedups -bm25 -hits 10000 ' +
f'-topicReader Covid -topics {base_topics} -topicField query+question ' +
f'-removeDuplicates -bm25 -hits 10000 ' +
f'-output runs/anserini.covid-r2.abstract.qq.bm25.txt -runtag anserini.covid-r2.abstract.qq.bm25.txt')

os.system(f'target/appassembler/bin/SearchCollection -index {abstract_index} ' +
f'-topicreader Covid -topics {udel_topics} -topicfield query ' +
f'-removedups -bm25 -hits 10000 ' +
f'-topicReader Covid -topics {udel_topics} -topicField query ' +
f'-removeDuplicates -bm25 -hits 10000 ' +
f'-output runs/anserini.covid-r2.abstract.qdel.bm25.txt -runtag anserini.covid-r2.abstract.qdel.bm25.txt')

print('')
Expand All @@ -74,13 +74,13 @@ def perform_runs():

full_text_index = indexes[1]
os.system(f'target/appassembler/bin/SearchCollection -index {full_text_index} ' +
f'-topicreader Covid -topics {base_topics} -topicfield query+question ' +
f'-removedups -bm25 -hits 10000 ' +
f'-topicReader Covid -topics {base_topics} -topicField query+question ' +
f'-removeDuplicates -bm25 -hits 10000 ' +
f'-output runs/anserini.covid-r2.full-text.qq.bm25.txt -runtag anserini.covid-r2.full-text.qq.bm25.txt')

os.system(f'target/appassembler/bin/SearchCollection -index {full_text_index} ' +
f'-topicreader Covid -topics {udel_topics} -topicfield query ' +
f'-removedups -bm25 -hits 10000 ' +
f'-topicReader Covid -topics {udel_topics} -topicField query ' +
f'-removeDuplicates -bm25 -hits 10000 ' +
f'-output runs/anserini.covid-r2.full-text.qdel.bm25.txt -runtag anserini.covid-r2.full-text.qdel.bm25.txt')

print('')
Expand All @@ -89,12 +89,12 @@ def perform_runs():

paragraph_index = indexes[2]
os.system(f'target/appassembler/bin/SearchCollection -index {paragraph_index} ' +
f'-topicreader Covid -topics {base_topics} -topicfield query+question ' +
f'-topicReader Covid -topics {base_topics} -topicField query+question ' +
f'-selectMaxPassage -bm25 -hits 10000 ' +
f'-output runs/anserini.covid-r2.paragraph.qq.bm25.txt -runtag anserini.covid-r2.paragraph.qq.bm25.txt')

os.system(f'target/appassembler/bin/SearchCollection -index {paragraph_index} ' +
f'-topicreader Covid -topics {udel_topics} -topicfield query ' +
f'-topicReader Covid -topics {udel_topics} -topicField query ' +
f'-selectMaxPassage -bm25 -hits 10000 ' +
f'-output runs/anserini.covid-r2.paragraph.qdel.bm25.txt -runtag anserini.covid-r2.paragraph.qdel.bm25.txt')

Expand Down
Loading

0 comments on commit b91df2f

Please sign in to comment.