Skip to content

Commit

Permalink
Refactoring TopicReader, exposing bg20 topics (#1502)
Browse files Browse the repository at this point in the history
Better way of building the TOPIC_FILE_TO_TYPE mapping so we don't have to enter the same information twice.
  • Loading branch information
lintool committed Apr 4, 2021
1 parent 346cb9b commit f110d36
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 54 deletions.
63 changes: 9 additions & 54 deletions src/main/java/io/anserini/search/topicreader/TopicReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -39,60 +39,15 @@
public abstract class TopicReader<K> {
protected Path topicFile;

// Holds mappings from known topic files to corresponding TopicReader class.
static private final Map<String, Class<? extends TopicReader>> TOPIC_FILE_TO_TYPE = Map.ofEntries(
Map.entry("topics.adhoc.51-100.txt", TrecTopicReader.class),
Map.entry("topics.adhoc.101-150.txt", TrecTopicReader.class),
Map.entry("topics.adhoc.151-200.txt", TrecTopicReader.class),
Map.entry("topics.robust04.txt", TrecTopicReader.class),
Map.entry("topics.robust05.txt", TrecTopicReader.class),
Map.entry("topics.core17.txt", TrecTopicReader.class),
Map.entry("topics.core18.txt", TrecTopicReader.class),
Map.entry("topics.adhoc.451-550.txt", TrecTopicReader.class),
Map.entry("topics.terabyte04.701-750.txt", TrecTopicReader.class),
Map.entry("topics.terabyte05.751-800.txt", TrecTopicReader.class),
Map.entry("topics.terabyte06.801-850.txt", TrecTopicReader.class),
Map.entry("topics.web.51-100.txt", WebxmlTopicReader.class),
Map.entry("topics.web.101-150.txt", WebxmlTopicReader.class),
Map.entry("topics.web.151-200.txt", WebxmlTopicReader.class),
Map.entry("topics.web.201-250.txt", WebxmlTopicReader.class),
Map.entry("topics.web.251-300.txt", WebxmlTopicReader.class),
Map.entry("topics.microblog2011.txt", MicroblogTopicReader.class),
Map.entry("topics.microblog2012.txt", MicroblogTopicReader.class),
Map.entry("topics.microblog2013.txt", MicroblogTopicReader.class),
Map.entry("topics.microblog2014.txt", MicroblogTopicReader.class),
Map.entry("topics.car17v1.5.benchmarkY1test.txt", CarTopicReader.class),
Map.entry("topics.car17v2.0.benchmarkY1test.txt", CarTopicReader.class),
Map.entry("topics.dl19-doc.txt", TsvIntTopicReader.class),
Map.entry("topics.dl19-passage.txt", TsvIntTopicReader.class),
Map.entry("topics.msmarco-doc.dev.txt", TsvIntTopicReader.class),
Map.entry("topics.msmarco-passage.dev-subset.txt", TsvIntTopicReader.class),
Map.entry("topics.ntcir8zh.eval.txt", TsvStringTopicReader.class),
Map.entry("topics.clef06fr.mono.fr.txt", TsvStringTopicReader.class),
Map.entry("topics.trec02ar-ar.txt", TrecTopicReader.class),
Map.entry("topics.fire12bn.176-225.txt", TrecTopicReader.class),
Map.entry("topics.fire12hi.176-225.txt", TrecTopicReader.class),
Map.entry("topics.fire12en.176-225.txt", TrecTopicReader.class),
Map.entry("topics.covid-round1.xml", CovidTopicReader.class),
Map.entry("topics.covid-round1-udel.xml", CovidTopicReader.class),
Map.entry("topics.covid-round2.xml", CovidTopicReader.class),
Map.entry("topics.covid-round2-udel.xml", CovidTopicReader.class),
Map.entry("topics.covid-round3.xml", CovidTopicReader.class),
Map.entry("topics.covid-round3-udel.xml", CovidTopicReader.class),
Map.entry("topics.covid-round4.xml", CovidTopicReader.class),
Map.entry("topics.covid-round4-udel.xml", CovidTopicReader.class),
Map.entry("topics.covid-round5.xml", CovidTopicReader.class),
Map.entry("topics.covid-round5-udel.xml", CovidTopicReader.class),
Map.entry("topics.backgroundlinking18.txt", BackgroundLinkingTopicReader.class),
Map.entry("topics.backgroundlinking19.txt", BackgroundLinkingTopicReader.class),
Map.entry("topics.dpr.nq.dev.txt", DprNqTopicReader.class),
Map.entry("topics.dpr.nq.test.txt", DprNqTopicReader.class),
Map.entry("topics.dpr.trivia.dev.txt", DprNqTopicReader.class),
Map.entry("topics.dpr.trivia.test.txt", DprNqTopicReader.class),
Map.entry("topics.dpr.wq.test.txt", DprJsonlTopicReader.class),
Map.entry("topics.dpr.squad.test.txt", DprJsonlTopicReader.class),
Map.entry("topics.dpr.curated.test.txt", DprJsonlTopicReader.class)
);
// Populated once, during class initialization, from the Topics enum so that each topics file
// only has to be registered in a single place.
private static final Map<String, Class<? extends TopicReader>> TOPIC_FILE_TO_TYPE = new HashMap<>();

static {
  // Inverts the Topics enum: every constant contributes one (topics filename -> reader class) entry.
  for (Topics topic : Topics.values()) {
    // Key on the final path component, e.g. "topics-and-qrels/topics.backgroundlinking20.txt"
    // becomes "topics.backgroundlinking20.txt". Taking everything after the last '/' also copes
    // with a path that has no directory prefix (lastIndexOf returns -1, so the whole path is used)
    // or more than one directory level, where indexing split(...)[1] would throw or pick a directory name.
    String fileName = topic.path.substring(topic.path.lastIndexOf('/') + 1);
    TOPIC_FILE_TO_TYPE.put(fileName, topic.readerClass);
  }
}

/**
* Returns the {@link TopicReader} class corresponding to a known topics file, or <code>null</code> if unknown.
Expand Down
1 change: 1 addition & 0 deletions src/main/java/io/anserini/search/topicreader/Topics.java
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ public enum Topics {
COVID_ROUND5_UDEL(CovidTopicReader.class, "topics-and-qrels/topics.covid-round5-udel.xml"),
TREC2018_BL(BackgroundLinkingTopicReader.class, "topics-and-qrels/topics.backgroundlinking18.txt"),
TREC2019_BL(BackgroundLinkingTopicReader.class, "topics-and-qrels/topics.backgroundlinking19.txt"),
TREC2020_BL(BackgroundLinkingTopicReader.class, "topics-and-qrels/topics.backgroundlinking20.txt"),
EPIDEMIC_QA_EXPERT_PRELIM(EpidemicQATopicReader.class, "topics-and-qrels/topics.epidemic-qa.expert.prelim.json"),
EPIDEMIC_QA_CONSUMER_PRELIM(EpidemicQATopicReader.class, "topics-and-qrels/topics.epidemic-qa.consumer.prelim.json"),
DPR_NQ_DEV(DprNqTopicReader.class, "topics-and-qrels/topics.dpr.nq.dev.txt"),
Expand Down
28 changes: 28 additions & 0 deletions src/test/java/io/anserini/search/topicreader/TopicReaderTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,19 @@

public class TopicReaderTest {

@Test
public void testIterateThroughAllEnums() {
  Topics[] allTopics = Topics.values();

  // Each topics file must resolve, by filename alone, to the reader class declared on its enum constant.
  for (int i = 0; i < allTopics.length; i++) {
    Topics current = allTopics[i];
    String fileName = current.path.split("\\/")[1];
    assertEquals(current.readerClass, TopicReader.getTopicReaderClassByFile(fileName));
  }

  // Guard on the total number of enum constants visited above.
  assertEquals(60, allTopics.length);
}

@Test
public void testTopicReaderClassLookup() {
assertEquals(TrecTopicReader.class,
Expand Down Expand Up @@ -983,6 +996,21 @@ public void testBackgroundLinkingTopics() {
assertEquals("https://www.washingtonpost.com/news/capital-weather-gang/wp/2017/07/14/" +
"sun-erupts-to-mark-another-bastille-day-aurora-possible-in-new-england-sunday-night/",
topics.get(topics.lastKey()).get("url"));

topics = TopicReader.getTopics(Topics.TREC2020_BL);

assertEquals(50, topics.keySet().size());
assertEquals(886, (int) topics.firstKey());
assertEquals("AEQZNZSVT5BGPPUTTJO7SNMOLE", topics.get(topics.firstKey()).get("title"));
assertEquals("https://www.washingtonpost.com/politics/2019/06/05/" +
"trump-says-transgender-troops-cant-serve-because-troops-cant-take-any-drugs-hes-wrong-many-ways/",
topics.get(topics.firstKey()).get("url"));

assertEquals(935, (int) topics.lastKey());
assertEquals("CCUJNXOJNFEJFBL57GD27EHMWI", topics.get(topics.lastKey()).get("title"));
assertEquals("https://www.washingtonpost.com/news/to-your-health/wp/2018/05/30/" +
"this-mock-pandemic-killed-150-million-people-next-time-it-might-not-be-a-drill/",
topics.get(topics.lastKey()).get("url"));
}

@Test
Expand Down

0 comments on commit f110d36

Please sign in to comment.