Skip to content

Commit

Permalink
Add bindings for MS MARCO V2.1 prebuilt indexes + qrels (#2459)
Browse files Browse the repository at this point in the history
  • Loading branch information
lintool authored Apr 20, 2024
1 parent 4f96751 commit 9863611
Show file tree
Hide file tree
Showing 5 changed files with 142 additions and 18 deletions.
5 changes: 5 additions & 0 deletions src/main/java/io/anserini/eval/Qrels.java
Original file line number Diff line number Diff line change
Expand Up @@ -50,12 +50,17 @@ public enum Qrels {
TREC2022_DL_PASSAGE("qrels.dl22-passage.txt"),
TREC2023_DL_DOC("qrels.dl23-doc.txt"),
TREC2023_DL_PASSAGE("qrels.dl23-passage.txt"),
TREC2021_DL_DOC_MSMARCO_V21("qrels.dl21-doc-msmarco-v2.1.txt"),
TREC2022_DL_DOC_MSMARCO_V21("qrels.dl22-doc-msmarco-v2.1.txt"),
TREC2023_DL_DOC_MSMARCO_V21("qrels.dl23-doc-msmarco-v2.1.txt"),
MSMARCO_DOC_DEV("qrels.msmarco-doc.dev.txt"),
MSMARCO_PASSAGE_DEV_SUBSET("qrels.msmarco-passage.dev-subset.txt"),
MSMARCO_V2_DOC_DEV("qrels.msmarco-v2-doc.dev.txt"),
MSMARCO_V2_DOC_DEV2("qrels.msmarco-v2-doc.dev2.txt"),
MSMARCO_V2_PASSAGE_DEV("qrels.msmarco-v2-passage.dev.txt"),
MSMARCO_V2_PASSAGE_DEV2("qrels.msmarco-v2-passage.dev2.txt"),
MSMARCO_V21_DOC_DEV("qrels.msmarco-v2.1-doc.dev.txt"),
MSMARCO_V21_DOC_DEV2("qrels.msmarco-v2.1-doc.dev2.txt"),
NTCIR8_ZH("qrels.ntcir8.eval.txt"),
CLEF2006_FR("qrels.clef06fr.txt"),
TREC2002_AR("qrels.trec02ar.txt"),
Expand Down
14 changes: 14 additions & 0 deletions src/main/java/io/anserini/index/IndexInfo.java
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,20 @@ public enum IndexInfo {
"https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v2-doc-segmented.20220808.4d6d2a.tar.gz" },
"8a5f444fa5a63cc5d4ddc3e6dd15faa0"),

// Prebuilt index entry for the MS MARCO V2.1 document corpus, registered under the name "msmarco-v2.1-doc".
// NOTE(review): the arguments appear to be (name, description, tarball filename, download URLs, checksum);
// the final 32-hex-digit string looks like an MD5 digest of the tarball — confirm against the IndexInfo constructor.
MSMARCO_V21_DOC("msmarco-v2.1-doc",
"Lucene index of the MS MARCO V2.1 document corpus.",
"lucene-inverted.msmarco-v2.1-doc.20240418.4f9675.tar.gz",
new String[] {
"https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-inverted.msmarco-v2.1-doc.20240418.4f9675.tar.gz" },
"cecd55856c34afa82f1a499705c9df02"),

// Prebuilt index entry for the MS MARCO V2.1 segmented document corpus, registered under the name
// "msmarco-v2.1-doc-segmented".
// NOTE(review): the arguments appear to be (name, description, tarball filename, download URLs, checksum);
// the final 32-hex-digit string looks like an MD5 digest of the tarball — confirm against the IndexInfo constructor.
MSMARCO_V21_DOC_SEGMENTED("msmarco-v2.1-doc-segmented",
"Lucene index of the MS MARCO V2.1 segmented document corpus.",
"lucene-inverted.msmarco-v2.1-doc-segmented.20240418.4f9675.tar.gz",
new String[] {
"https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene/lucene-inverted.msmarco-v2.1-doc-segmented.20240418.4f9675.tar.gz" },
"6ec4cd595c9fe1ad91b43eabb39a637c"),

// BEIR: flat
BEIR_V1_0_0_TREC_COVID_FLAT("beir-v1.0.0-trec-covid.flat",
"Lucene inverted 'flat' index of BEIR collection 'trec-covid'.",
Expand Down
137 changes: 121 additions & 16 deletions src/test/java/io/anserini/eval/RelevanceJudgmentsTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,11 @@ public int getQrelsCount(RelevanceJudgments qrels) throws IOException{
return count;
}

@Test
public void testTotalCount() {
  // Pins the number of Qrels enum constants so that adding or removing a
  // binding forces this expectation to be updated deliberately.
  final int actualBindings = Qrels.values().length;
  assertEquals(169, actualBindings);
}

@Test(expected = IOException.class)
public void testFileNotFound() throws IOException{
// Purposely read non-existent file.
Expand Down Expand Up @@ -205,6 +210,28 @@ public void testTrec21DLPassage() throws IOException{
assertEquals(1, qrels.getRelevanceGrade("1129560", "msmarco_passage_67_937656589"));
}

@Test
public void testTrec21DLDocMsMarcoV21() throws IOException{
  // The expected counts below come from inspecting the qrels file directly:
  // % cut -f 1 -d ' ' tools/topics-and-qrels/qrels.dl21-doc-msmarco-v2.1.txt | sort | uniq | wc
  // 57 57 412
  // % wc tools/topics-and-qrels/qrels.dl21-doc-msmarco-v2.1.txt
  // 10973 43892 456277 tools/topics-and-qrels/qrels.dl21-doc-msmarco-v2.1.txt

  // Pass 0 loads the judgments by file path; pass 1 loads them through the
  // Qrels enum binding. Both routes must yield the same contents.
  for (int pass = 0; pass < 2; pass++) {
    RelevanceJudgments judgments = (pass == 0)
        ? new RelevanceJudgments("tools/topics-and-qrels/qrels.dl21-doc-msmarco-v2.1.txt")
        : RelevanceJudgments.fromQrels(Qrels.TREC2021_DL_DOC_MSMARCO_V21);

    assertNotNull(judgments);
    assertEquals(57, judgments.getQids().size());
    assertEquals(10973, getQrelsCount(judgments));
    assertEquals(2, judgments.getRelevanceGrade("2082", "msmarco_v2.1_doc_01_1281570012"));
    assertEquals(2, judgments.getRelevanceGrade("1128632", "msmarco_v2.1_doc_17_481617788"));
  }
}

@Test
public void testTrec22DLDoc() throws IOException{
// % cut -f 1 -d ' ' tools/topics-and-qrels/qrels.dl22-doc.txt | uniq | wc
Expand Down Expand Up @@ -249,6 +276,28 @@ public void testTrec22DLPassage() throws IOException{
assertEquals(1, qrels.getRelevanceGrade("2056323", "msmarco_passage_68_715747739"));
}

@Test
public void testTrec22DLDocMsMarcoV21() throws IOException{
  // The expected counts below come from inspecting the qrels file directly:
  // % cut -f 1 -d ' ' tools/topics-and-qrels/qrels.dl22-doc-msmarco-v2.1.txt | sort | uniq | wc
  // 76 76 608
  // % wc tools/topics-and-qrels/qrels.dl22-doc-msmarco-v2.1.txt
  // 349541 1398164 14786970 tools/topics-and-qrels/qrels.dl22-doc-msmarco-v2.1.txt

  // Pass 0 loads the judgments by file path; pass 1 loads them through the
  // Qrels enum binding. Both routes must yield the same contents.
  for (int pass = 0; pass < 2; pass++) {
    RelevanceJudgments judgments = (pass == 0)
        ? new RelevanceJudgments("tools/topics-and-qrels/qrels.dl22-doc-msmarco-v2.1.txt")
        : RelevanceJudgments.fromQrels(Qrels.TREC2022_DL_DOC_MSMARCO_V21);

    assertNotNull(judgments);
    assertEquals(76, judgments.getQids().size());
    assertEquals(349541, getQrelsCount(judgments));
    assertEquals(1, judgments.getRelevanceGrade("2000511", "msmarco_v2.1_doc_00_896525856"));
    assertEquals(2, judgments.getRelevanceGrade("2056158", "msmarco_v2.1_doc_06_934688453"));
  }
}

@Test
public void testTrec23DLDoc() throws IOException{
// % cut -f 1 -d ' ' tools/topics-and-qrels/qrels.dl23-doc.txt | uniq | wc
Expand Down Expand Up @@ -293,6 +342,28 @@ public void testTrec23DLPassage() throws IOException{
assertEquals(2, qrels.getRelevanceGrade("3100922", "msmarco_passage_22_487548813"));
}

@Test
public void testTrec23DLDocMsMarcoV21() throws IOException{
  // The expected counts below come from inspecting the qrels file directly:
  // % cut -f 1 -d ' ' tools/topics-and-qrels/qrels.dl23-doc-msmarco-v2.1.txt | uniq | wc
  // 82 82 656
  // % wc tools/topics-and-qrels/qrels.dl23-doc-msmarco-v2.1.txt
  // 15995 63980 677618 tools/topics-and-qrels/qrels.dl23-doc-msmarco-v2.1.txt

  // Pass 0 loads the judgments by file path; pass 1 loads them through the
  // Qrels enum binding. Both routes must yield the same contents.
  for (int pass = 0; pass < 2; pass++) {
    RelevanceJudgments judgments = (pass == 0)
        ? new RelevanceJudgments("tools/topics-and-qrels/qrels.dl23-doc-msmarco-v2.1.txt")
        : RelevanceJudgments.fromQrels(Qrels.TREC2023_DL_DOC_MSMARCO_V21);

    assertNotNull(judgments);
    assertEquals(82, judgments.getQids().size());
    assertEquals(15995, getQrelsCount(judgments));
    assertEquals(1, judgments.getRelevanceGrade("2001010", "msmarco_v2.1_doc_00_1372241967"));
    assertEquals(2, judgments.getRelevanceGrade("3100922", "msmarco_v2.1_doc_19_1982402861"));
  }
}

@Test
public void testMsmarcoDocDev() throws IOException{
RelevanceJudgments qrels = new RelevanceJudgments("tools/topics-and-qrels/qrels.msmarco-doc.dev.txt");
Expand Down Expand Up @@ -328,37 +399,37 @@ public void testMsmarcoPassageDevSubset() throws IOException{
}

@Test
public void testMsmarcoV2DocDev() throws IOException{
RelevanceJudgments qrels = new RelevanceJudgments("tools/topics-and-qrels/qrels.msmarco-v2-doc.dev.txt");
public void testMsmarcoV2DocDevMsMarcoV21() throws IOException{
RelevanceJudgments qrels = new RelevanceJudgments("tools/topics-and-qrels/qrels.msmarco-v2.1-doc.dev.txt");
assertNotNull(qrels);
assertEquals(4552, qrels.getQids().size());
assertEquals(4702, getQrelsCount(qrels));
assertEquals(1, qrels.getRelevanceGrade("1000000", "msmarco_doc_17_2560009121"));
assertEquals(1, qrels.getRelevanceGrade("999942", "msmarco_doc_06_956348348"));
assertEquals(1, qrels.getRelevanceGrade("1000000", "msmarco_v2.1_doc_17_1968189952"));
assertEquals(1, qrels.getRelevanceGrade("999897", "msmarco_v2.1_doc_46_191673440"));

qrels = RelevanceJudgments.fromQrels(Qrels.MSMARCO_V2_DOC_DEV);
qrels = RelevanceJudgments.fromQrels(Qrels.MSMARCO_V21_DOC_DEV);
assertNotNull(qrels);
assertEquals(4552, qrels.getQids().size());
assertEquals(4702, getQrelsCount(qrels));
assertEquals(1, qrels.getRelevanceGrade("1000000", "msmarco_doc_17_2560009121"));
assertEquals(1, qrels.getRelevanceGrade("999942", "msmarco_doc_06_956348348"));
assertEquals(1, qrels.getRelevanceGrade("1000000", "msmarco_v2.1_doc_17_1968189952"));
assertEquals(1, qrels.getRelevanceGrade("999897", "msmarco_v2.1_doc_46_191673440"));
}

@Test
public void testMsmarcoV2DocDev2() throws IOException{
RelevanceJudgments qrels = new RelevanceJudgments("tools/topics-and-qrels/qrels.msmarco-v2-doc.dev2.txt");
public void testMsmarcoV2DocDev2MsMarcoV21() throws IOException{
RelevanceJudgments qrels = new RelevanceJudgments("tools/topics-and-qrels/qrels.msmarco-v2.1-doc.dev2.txt");
assertNotNull(qrels);
assertEquals(5000, qrels.getQids().size());
assertEquals(5178, getQrelsCount(qrels));
assertEquals(1, qrels.getRelevanceGrade("1000202", "msmarco_doc_08_73026062"));
assertEquals(1, qrels.getRelevanceGrade("999937", "msmarco_doc_05_319743607"));
assertEquals(5177, getQrelsCount(qrels));
assertEquals(1, qrels.getRelevanceGrade("1000202", "msmarco_v2.1_doc_08_69146701"));
assertEquals(1, qrels.getRelevanceGrade("999659", "msmarco_v2.1_doc_08_1247437925"));

qrels = RelevanceJudgments.fromQrels(Qrels.MSMARCO_V2_DOC_DEV2);
qrels = RelevanceJudgments.fromQrels(Qrels.MSMARCO_V21_DOC_DEV2);
assertNotNull(qrels);
assertEquals(5000, qrels.getQids().size());
assertEquals(5178, getQrelsCount(qrels));
assertEquals(1, qrels.getRelevanceGrade("1000202", "msmarco_doc_08_73026062"));
assertEquals(1, qrels.getRelevanceGrade("999937", "msmarco_doc_05_319743607"));
assertEquals(5177, getQrelsCount(qrels));
assertEquals(1, qrels.getRelevanceGrade("1000202", "msmarco_v2.1_doc_08_69146701"));
assertEquals(1, qrels.getRelevanceGrade("999659", "msmarco_v2.1_doc_08_1247437925"));
}

@Test
Expand Down Expand Up @@ -395,6 +466,40 @@ public void testMsmarcoV2DocPassage2() throws IOException{
assertEquals(1, qrels.getRelevanceGrade("961297", "msmarco_passage_18_858458289"));
}

@Test
public void testMsmarcoV2DocDev() throws IOException{
  // Checks the MS MARCO V2 doc "dev" qrels via two loading routes:
  // pass 0 reads the file path directly, pass 1 goes through the Qrels enum
  // binding. Both must expose the same counts and grades.
  for (int pass = 0; pass < 2; pass++) {
    RelevanceJudgments judgments = (pass == 0)
        ? new RelevanceJudgments("tools/topics-and-qrels/qrels.msmarco-v2-doc.dev.txt")
        : RelevanceJudgments.fromQrels(Qrels.MSMARCO_V2_DOC_DEV);

    assertNotNull(judgments);
    assertEquals(4552, judgments.getQids().size());
    assertEquals(4702, getQrelsCount(judgments));
    assertEquals(1, judgments.getRelevanceGrade("1000000", "msmarco_doc_17_2560009121"));
    assertEquals(1, judgments.getRelevanceGrade("999942", "msmarco_doc_06_956348348"));
  }
}

@Test
public void testMsmarcoV2DocDev2() throws IOException{
  // Checks the MS MARCO V2 doc "dev2" qrels via two loading routes:
  // pass 0 reads the file path directly, pass 1 goes through the Qrels enum
  // binding. Both must expose the same counts and grades.
  for (int pass = 0; pass < 2; pass++) {
    RelevanceJudgments judgments = (pass == 0)
        ? new RelevanceJudgments("tools/topics-and-qrels/qrels.msmarco-v2-doc.dev2.txt")
        : RelevanceJudgments.fromQrels(Qrels.MSMARCO_V2_DOC_DEV2);

    assertNotNull(judgments);
    assertEquals(5000, judgments.getQids().size());
    assertEquals(5178, getQrelsCount(judgments));
    assertEquals(1, judgments.getRelevanceGrade("1000202", "msmarco_doc_08_73026062"));
    assertEquals(1, judgments.getRelevanceGrade("999937", "msmarco_doc_05_319743607"));
  }
}

@Test
public void testCore17() throws IOException{
RelevanceJudgments qrels = new RelevanceJudgments("tools/topics-and-qrels/qrels.core17.txt");
Expand Down
2 changes: 1 addition & 1 deletion src/test/java/io/anserini/index/PrebuiltIndexTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,6 @@ public void testUrls() {
// test number of prebuilt-indexes
@Test
public void testNumPrebuiltIndexes() {
  // Pins the number of IndexInfo constants. The stale expectation of 128 was
  // removed: with the MSMARCO_V21_DOC and MSMARCO_V21_DOC_SEGMENTED entries
  // added, the enum now has 130 constants and the old assertion would fail
  // before this one was reached.
  assertEquals(130, IndexInfo.values().length);
}
}

0 comments on commit 9863611

Please sign in to comment.