From 6b24bbd28631c94577327923e37da71259c84425 Mon Sep 17 00:00:00 2001 From: stephaniewhoo Date: Thu, 15 Apr 2021 16:38:34 -0400 Subject: [PATCH] add pretokenized option in index and search collection (#1514) add pretokenized option in IndexCollection and SearchCollection with test cases --- .../java/io/anserini/index/IndexArgs.java | 4 + .../io/anserini/index/IndexCollection.java | 3 +- .../java/io/anserini/search/SearchArgs.java | 3 + .../io/anserini/search/SearchCollection.java | 13 ++- .../io/anserini/integration/EndToEndTest.java | 27 ++++++ .../PretokenizedIndexEndToEndTest.java | 92 +++++++++++++++++++ .../json/collection_tokenized/segment1.json | 10 ++ .../resources/sample_topics/json_topics.tsv | 1 + 8 files changed, 149 insertions(+), 4 deletions(-) create mode 100644 src/test/java/io/anserini/integration/PretokenizedIndexEndToEndTest.java create mode 100644 src/test/resources/sample_docs/json/collection_tokenized/segment1.json create mode 100644 src/test/resources/sample_topics/json_topics.tsv diff --git a/src/main/java/io/anserini/index/IndexArgs.java b/src/main/java/io/anserini/index/IndexArgs.java index a74e9d6253..5af561a2d7 100644 --- a/src/main/java/io/anserini/index/IndexArgs.java +++ b/src/main/java/io/anserini/index/IndexArgs.java @@ -122,6 +122,10 @@ public class IndexArgs { usage = "Analyzer language (ISO 3166 two-letter code).") public String language= "en"; + @Option(name = "-pretokenized", + usage = "index pre-tokenized collections without any additional stemming, stopword processing") + public boolean pretokenized = false; + // Tweet options @Option(name = "-tweet.keepRetweets", diff --git a/src/main/java/io/anserini/index/IndexCollection.java b/src/main/java/io/anserini/index/IndexCollection.java index 1c9e6cffbc..1dea9a27c4 100644 --- a/src/main/java/io/anserini/index/IndexCollection.java +++ b/src/main/java/io/anserini/index/IndexCollection.java @@ -646,6 +646,7 @@ public IndexCollection(IndexArgs args) throws Exception { 
LOG.info("Store document \"raw\" field? " + args.storeRaw); LOG.info("Optimize (merge segments)? " + args.optimize); LOG.info("Whitelist: " + args.whitelist); + LOG.info("Pretokenized?: " + args.pretokenized); if (args.solr) { LOG.info("Indexing into Solr..."); @@ -753,7 +754,7 @@ public Counters run() throws IOException { config = new IndexWriterConfig(germanAnalyzer); } else if (args.language.equals("es")) { config = new IndexWriterConfig(spanishAnalyzer); - } else if (args.language.equals("en_ws")) { + } else if (args.pretokenized) { config = new IndexWriterConfig(whitespaceAnalyzer); } else { config = new IndexWriterConfig(analyzer); diff --git a/src/main/java/io/anserini/search/SearchArgs.java b/src/main/java/io/anserini/search/SearchArgs.java index fd975d3898..f1ce46c869 100644 --- a/src/main/java/io/anserini/search/SearchArgs.java +++ b/src/main/java/io/anserini/search/SearchArgs.java @@ -84,6 +84,9 @@ public class SearchArgs { usage = "Path to file with stopwords.") public String stopwords = null; + @Option(name = "-pretokenized", usage = "Boolean switch to accept pre tokenized jsonl.") + public boolean pretokenized = false; + @Option(name = "-arbitraryScoreTieBreak", usage = "Break score ties arbitrarily (not recommended)") public boolean arbitraryScoreTieBreak = false; diff --git a/src/main/java/io/anserini/search/SearchCollection.java b/src/main/java/io/anserini/search/SearchCollection.java index 834825eeba..1be425e8d1 100644 --- a/src/main/java/io/anserini/search/SearchCollection.java +++ b/src/main/java/io/anserini/search/SearchCollection.java @@ -139,7 +139,8 @@ public final class SearchCollection implements Closeable { private List cascades; private final boolean isRerank; private Map qrels; - private Set queriesWithRel; + private Set queriesWithRel; + private Map> queries = new HashMap<>(); // let query tokens get exposed to the test (with analyzer) private final class SearcherThread extends Thread { final private IndexReader reader; @@ -302,9 
+303,9 @@ public SearchCollection(SearchArgs args) throws IOException { } else if (args.language.equals("es")) { analyzer = new SpanishAnalyzer(); LOG.info("Language: es"); - } else if (args.language.equals("en_ws")) { + } else if (args.pretokenized) { analyzer = new WhitespaceAnalyzer(); - LOG.info("Language: en_ws"); + LOG.info("Pretokenized"); } else { // Default to English analyzer = DefaultEnglishAnalyzer.fromArguments(args.stemmer, args.keepstop, args.stopwords); @@ -587,6 +588,8 @@ public ScoredDocuments search(IndexSearcher searcher, K qid, String queryStr List queryTokens = AnalyzerUtils.analyze(analyzer, queryString); + queries.put(qid.toString(), queryTokens); + RerankerContext context = new RerankerContext<>(searcher, qid, query, null, queryString, queryTokens, null, args); ScoredDocuments scoredFbDocs; if ( isRerank && args.rf_qrels != null) { @@ -700,6 +703,10 @@ public ScoredDocuments searchTweets(IndexSearcher searcher, K qid, String qu return cascade.run(scoredFbDocs, context); } + public Map> getQueries(){ + return queries; + } + public static void main(String[] args) throws Exception { SearchArgs searchArgs = new SearchArgs(); CmdLineParser parser = new CmdLineParser(searchArgs, ParserProperties.defaults().withUsageWidth(100)); diff --git a/src/test/java/io/anserini/integration/EndToEndTest.java b/src/test/java/io/anserini/integration/EndToEndTest.java index 857e7c5270..fe396b18ff 100644 --- a/src/test/java/io/anserini/integration/EndToEndTest.java +++ b/src/test/java/io/anserini/integration/EndToEndTest.java @@ -19,6 +19,7 @@ import io.anserini.index.IndexArgs; import io.anserini.index.IndexCollection; import io.anserini.index.IndexReaderUtils; +import io.anserini.index.NotStoredException; import io.anserini.search.SearchArgs; import io.anserini.search.SearchCollection; import org.apache.commons.io.FileUtils; @@ -46,6 +47,7 @@ import java.util.List; import java.util.Map; import java.util.Random; +import java.util.Iterator; // This automatically 
tests indexing, retrieval, and evaluation from end to end. // Subclasses inherit and special to different collections. @@ -62,6 +64,8 @@ public abstract class EndToEndTest extends LuceneTestCase { protected String searchOutputPrefix = "e2eTestSearch"; protected Map referenceRunOutput = new HashMap<>(); protected Map> documents = new HashMap<>(); + protected Map>> tokens = new HashMap<>(); + protected Map> queryTokens = new HashMap<>(); // These are the sources of truth protected int fieldNormStatusTotalFields; @@ -145,6 +149,10 @@ public void setUp() throws Exception { args.add(Integer.toString(indexArgs.shardCurrent)); } + if (indexArgs.pretokenized){ + args.add("-pretokenized"); + } + IndexCollection.main(args.toArray(new String[args.size()])); } @@ -194,6 +202,20 @@ public void checkIndex() throws IOException { IndexReaderUtils.documentRaw(reader, collectionDocid)); assertEquals(documents.get(collectionDocid).get("contents"), IndexReaderUtils.documentContents(reader, collectionDocid)); + // check list of tokens by calling document vector + if(!tokens.isEmpty()){ + try { + Map actualToken = IndexReaderUtils.getDocumentVector(reader, collectionDocid); + Iterator it = actualToken.entrySet().iterator(); + while (it.hasNext()) { + Map.Entry pair = (Map.Entry)it.next(); + assertEquals(tokens.get(collectionDocid).get("contents").get(pair.getKey()), pair.getValue()); + it.remove(); + } + } catch (NotStoredException e) { + e.printStackTrace(); + } + } } reader.close(); @@ -265,7 +287,12 @@ public void testSearching() { for (Map.Entry entry : testQueries.entrySet()) { SearchCollection searcher = new SearchCollection(entry.getValue()); searcher.runTopics(); + Map> actualQuery = searcher.getQueries(); searcher.close(); + //check query tokens + if(!queryTokens.isEmpty()){ + assertEquals(queryTokens, actualQuery); + } checkRankingResults(entry.getKey(), entry.getValue().output); // Remember to cleanup run files. 
cleanup.add(new File(entry.getValue().output)); diff --git a/src/test/java/io/anserini/integration/PretokenizedIndexEndToEndTest.java b/src/test/java/io/anserini/integration/PretokenizedIndexEndToEndTest.java new file mode 100644 index 0000000000..c36ca0c706 --- /dev/null +++ b/src/test/java/io/anserini/integration/PretokenizedIndexEndToEndTest.java @@ -0,0 +1,92 @@ +/* + * Anserini: A Lucene toolkit for replicable information retrieval research + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.anserini.integration; + +import io.anserini.collection.DocumentCollection; +import io.anserini.collection.JsonCollection; +import io.anserini.collection.TrecCollection; +import io.anserini.index.IndexArgs; +import io.anserini.index.IndexCollection; +import io.anserini.index.generator.DefaultLuceneDocumentGenerator; +import io.anserini.search.SearchArgs; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + + +public class PretokenizedIndexEndToEndTest extends EndToEndTest { + @Override + IndexArgs getIndexArgs() { + IndexArgs indexArgs = createDefaultIndexArgs(); + indexArgs.input = "src/test/resources/sample_docs/json/collection_tokenized"; + indexArgs.collectionClass = JsonCollection.class.getSimpleName(); + indexArgs.generatorClass = DefaultLuceneDocumentGenerator.class.getSimpleName(); + indexArgs.pretokenized = true; + indexArgs.storeRaw = true; + + return indexArgs; + } + + @Override + protected void setCheckIndexGroundTruth() { + docCount = 2; + documents.put("2000000", Map.of( + "contents", "this was ##a simple pretokenized test", + "raw","{\n" + + " \"id\" : \"2000000\",\n" + + " \"contents\" : \"this was ##a simple pretokenized test\"\n" + + "}")); + documents.put("2000001", Map.of( + "contents", "some time extra ##vert ##ing and some time intro ##vert ##ing", + "raw","{\n" + + " \"id\" : \"2000001\",\n" + + " \"contents\" : \"some time extra ##vert ##ing and some time intro ##vert ##ing\"\n" + + "}" + )); + tokens.put("2000000", Map.of( + "contents", Map.of( + "this", 1L, "was", 1L, "##a", 1L, "simple", 1L, "pretokenized", 1L, "test", 1L))); + tokens.put("2000001",Map.of( + "contents", Map.of( + "some", 2L, "time", 2L, "extra", 1L, "##vert", 2L, "##ing", 2L, "and", 1L, "intro", 1L))); + + fieldNormStatusTotalFields = 1; + // whitespace analyzer keeps everything, includes docid + // this is ##a simple pretokenized test some time extra ##vert ##ing and intro 2000000 2000001 + 
termIndexStatusTermCount = 15; + termIndexStatusTotFreq = 15; + storedFieldStatusTotalDocCounts = 2; + termIndexStatusTotPos = 17 + storedFieldStatusTotalDocCounts; + storedFieldStatusTotFields = 6; // 2 docs * (1 id + 1 contents + 1 raw) + } + + @Override + protected void setSearchGroundTruth() { + topicReader = "TsvInt"; + topicFile = "src/test/resources/sample_topics/json_topics.tsv"; + SearchArgs searchArg = createDefaultSearchArgs().bm25(); + searchArg.pretokenized = true; + testQueries.put("bm25", searchArg); + queryTokens.put("1", new ArrayList<>()); + queryTokens.get("1").add("##ing"); + queryTokens.get("1").add("##vert"); + referenceRunOutput.put("bm25", new String[]{ + "1 Q0 2000001 1 0.922400 Anserini"}); + } + +} diff --git a/src/test/resources/sample_docs/json/collection_tokenized/segment1.json b/src/test/resources/sample_docs/json/collection_tokenized/segment1.json new file mode 100644 index 0000000000..25553e4498 --- /dev/null +++ b/src/test/resources/sample_docs/json/collection_tokenized/segment1.json @@ -0,0 +1,10 @@ +[ + { + "id": "2000000", + "contents": "this was ##a simple pretokenized test" + }, + { + "id": "2000001", + "contents": "some time extra ##vert ##ing and some time intro ##vert ##ing" + } +] \ No newline at end of file diff --git a/src/test/resources/sample_topics/json_topics.tsv b/src/test/resources/sample_topics/json_topics.tsv new file mode 100644 index 0000000000..7c3178a914 --- /dev/null +++ b/src/test/resources/sample_topics/json_topics.tsv @@ -0,0 +1 @@ +1 ##ing ##vert \ No newline at end of file