add pretokenized option in index and search collection (#1514)
add pretokenized option in IndexCollection and SearchCollection with test cases
stephaniewhoo committed Apr 15, 2021
1 parent a708652 commit 6b24bbd
Showing 8 changed files with 149 additions and 4 deletions.
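For orientation, here is a minimal sketch of an index-then-search round trip that exercises the new flag, driven programmatically the same way the test harness below drives IndexCollection.main and SearchCollection.main. All paths are placeholders, and every flag except -pretokenized is a pre-existing Anserini option:

import io.anserini.index.IndexCollection;
import io.anserini.search.SearchCollection;

public class PretokenizedRoundTrip {
  public static void main(String[] args) throws Exception {
    // Index a pre-tokenized JSON collection; -pretokenized swaps in a
    // WhitespaceAnalyzer (see the IndexCollection diff below).
    IndexCollection.main(new String[] {
        "-collection", "JsonCollection",
        "-generator", "DefaultLuceneDocumentGenerator",
        "-input", "path/to/pretokenized-docs",   // placeholder
        "-index", "indexes/pretokenized",        // placeholder
        "-threads", "1",
        "-pretokenized"
    });

    // Search with the same flag so query strings are whitespace-tokenized
    // too (see the SearchCollection diff below).
    SearchCollection.main(new String[] {
        "-index", "indexes/pretokenized",        // placeholder
        "-topics", "path/to/topics.tsv",         // placeholder
        "-topicreader", "TsvInt",
        "-output", "run.pretokenized.txt",       // placeholder
        "-bm25",
        "-pretokenized"
    });
  }
}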
4 changes: 4 additions & 0 deletions src/main/java/io/anserini/index/IndexArgs.java
@@ -122,6 +122,10 @@ public class IndexArgs {
       usage = "Analyzer language (ISO 3166 two-letter code).")
   public String language= "en";
 
+  @Option(name = "-pretokenized",
+      usage = "index pre-tokenized collections without any additional stemming, stopword processing")
+  public boolean pretokenized = false;
+
   // Tweet options
 
   @Option(name = "-tweet.keepRetweets",
3 changes: 2 additions & 1 deletion src/main/java/io/anserini/index/IndexCollection.java
@@ -646,6 +646,7 @@ public IndexCollection(IndexArgs args) throws Exception {
     LOG.info("Store document \"raw\" field? " + args.storeRaw);
     LOG.info("Optimize (merge segments)? " + args.optimize);
     LOG.info("Whitelist: " + args.whitelist);
+    LOG.info("Pretokenized?: " + args.pretokenized);
 
     if (args.solr) {
       LOG.info("Indexing into Solr...");
@@ -753,7 +754,7 @@ public Counters run() throws IOException {
       config = new IndexWriterConfig(germanAnalyzer);
     } else if (args.language.equals("es")) {
       config = new IndexWriterConfig(spanishAnalyzer);
-    } else if (args.language.equals("en_ws")) {
+    } else if (args.pretokenized) {
       config = new IndexWriterConfig(whitespaceAnalyzer);
     } else {
       config = new IndexWriterConfig(analyzer);
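The hunk above is the core of the indexing change: with -pretokenized set, the IndexWriter uses Lucene's WhitespaceAnalyzer, so pre-tokenized units such as "##vert" are indexed verbatim instead of being stemmed or stripped. A small sketch of the difference, using Anserini's AnalyzerUtils.analyze (the same helper that appears in the SearchCollection diff below); the English-analyzer output shown in the comment is indicative only, since its tokenizer drops the "##" markers:

import io.anserini.analysis.AnalyzerUtils;
import io.anserini.analysis.DefaultEnglishAnalyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;

import java.util.List;

public class AnalyzerComparison {
  public static void main(String[] args) throws Exception {
    String contents = "some time extra ##vert ##ing";  // from the sample docs in this commit

    // Whitespace tokenization only: tokens pass through unchanged.
    List<String> whitespace = AnalyzerUtils.analyze(new WhitespaceAnalyzer(), contents);
    System.out.println(whitespace);  // [some, time, extra, ##vert, ##ing]

    // Default English analysis (Porter stemming, no extra stopwords), built
    // via the fromArguments call visible in the SearchCollection diff below.
    List<String> english = AnalyzerUtils.analyze(
        DefaultEnglishAnalyzer.fromArguments("porter", false, null), contents);
    System.out.println(english);  // "##" markers stripped, e.g. [some, time, extra, vert, ing]
  }
}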
3 changes: 3 additions & 0 deletions src/main/java/io/anserini/search/SearchArgs.java
@@ -84,6 +84,9 @@ public class SearchArgs {
       usage = "Path to file with stopwords.")
   public String stopwords = null;
 
+  @Option(name = "-pretokenized", usage = "Boolean switch to accept pre tokenized jsonl.")
+  public boolean pretokenized = false;
+
   @Option(name = "-arbitraryScoreTieBreak", usage = "Break score ties arbitrarily (not recommended)")
   public boolean arbitraryScoreTieBreak = false;
 
13 changes: 10 additions & 3 deletions src/main/java/io/anserini/search/SearchCollection.java
@@ -139,7 +139,8 @@ public final class SearchCollection implements Closeable {
   private List<RerankerCascade> cascades;
   private final boolean isRerank;
   private Map<String, ScoredDocuments> qrels;
-  private Set<String> queriesWithRel;
+  private Set<String> queriesWithRel;
+  private Map<String, List<String>> queries = new HashMap<>(); // let query tokens get exposed to the test (with analyzer)
 
   private final class SearcherThread<K> extends Thread {
     final private IndexReader reader;
@@ -302,9 +303,9 @@ public SearchCollection(SearchArgs args) throws IOException {
     } else if (args.language.equals("es")) {
       analyzer = new SpanishAnalyzer();
       LOG.info("Language: es");
-    } else if (args.language.equals("en_ws")) {
+    } else if (args.pretokenized) {
       analyzer = new WhitespaceAnalyzer();
-      LOG.info("Language: en_ws");
+      LOG.info("Pretokenized");
     } else {
       // Default to English
       analyzer = DefaultEnglishAnalyzer.fromArguments(args.stemmer, args.keepstop, args.stopwords);
@@ -587,6 +588,8 @@ public <K> ScoredDocuments search(IndexSearcher searcher, K qid, String queryStr
 
     List<String> queryTokens = AnalyzerUtils.analyze(analyzer, queryString);
 
+    queries.put(qid.toString(), queryTokens);
+
     RerankerContext context = new RerankerContext<>(searcher, qid, query, null, queryString, queryTokens, null, args);
     ScoredDocuments scoredFbDocs;
     if ( isRerank && args.rf_qrels != null) {
@@ -700,6 +703,10 @@ public <K> ScoredDocuments searchTweets(IndexSearcher searcher, K qid, String qu
     return cascade.run(scoredFbDocs, context);
   }
 
+  public Map<String, List<String>> getQueries(){
+    return queries;
+  }
+
   public static void main(String[] args) throws Exception {
     SearchArgs searchArgs = new SearchArgs();
     CmdLineParser parser = new CmdLineParser(searchArgs, ParserProperties.defaults().withUsageWidth(100));
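The new getQueries() accessor exposes the per-topic analyzed query tokens, which is what the end-to-end test below asserts against. A minimal usage sketch, assuming a SearchArgs instance configured elsewhere (index, topics, topic reader, ranking model) the way the test configures it; every call here appears in this commit's diff:

import io.anserini.search.SearchArgs;
import io.anserini.search.SearchCollection;

import java.util.List;
import java.util.Map;

public class QueryTokenPeek {
  // searchArgs is assumed to be fully configured, with
  // searchArgs.pretokenized = true so a WhitespaceAnalyzer is used.
  static Map<String, List<String>> analyzedQueries(SearchArgs searchArgs) throws Exception {
    SearchCollection searcher = new SearchCollection(searchArgs);
    searcher.runTopics();  // populates the queries map as a side effect
    Map<String, List<String>> queryTokens = searcher.getQueries();
    searcher.close();
    return queryTokens;  // e.g. {"1": [##ing, ##vert]} for the sample topics in this commit
  }
}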
27 changes: 27 additions & 0 deletions src/test/java/io/anserini/integration/EndToEndTest.java
@@ -19,6 +19,7 @@
 import io.anserini.index.IndexArgs;
 import io.anserini.index.IndexCollection;
 import io.anserini.index.IndexReaderUtils;
+import io.anserini.index.NotStoredException;
 import io.anserini.search.SearchArgs;
 import io.anserini.search.SearchCollection;
 import org.apache.commons.io.FileUtils;
@@ -46,6 +47,7 @@
 import java.util.List;
 import java.util.Map;
 import java.util.Random;
+import java.util.Iterator;
 
 // This automatically tests indexing, retrieval, and evaluation from end to end.
 // Subclasses inherit and special to different collections.
@@ -62,6 +64,8 @@ public abstract class EndToEndTest extends LuceneTestCase {
   protected String searchOutputPrefix = "e2eTestSearch";
   protected Map<String, String[]> referenceRunOutput = new HashMap<>();
   protected Map<String, Map<String, String>> documents = new HashMap<>();
+  protected Map<String, Map<String, Map<String, Long>>> tokens = new HashMap<>();
+  protected Map<String, List<String>> queryTokens = new HashMap<>();
 
   // These are the sources of truth
   protected int fieldNormStatusTotalFields;
@@ -145,6 +149,10 @@ public void setUp() throws Exception {
       args.add(Integer.toString(indexArgs.shardCurrent));
     }
 
+    if (indexArgs.pretokenized){
+      args.add("-pretokenized");
+    }
+
     IndexCollection.main(args.toArray(new String[args.size()]));
   }
 
@@ -194,6 +202,20 @@ public void checkIndex() throws IOException {
           IndexReaderUtils.documentRaw(reader, collectionDocid));
       assertEquals(documents.get(collectionDocid).get("contents"),
           IndexReaderUtils.documentContents(reader, collectionDocid));
+      // check list of tokens by calling document vector
+      if(!tokens.isEmpty()){
+        try {
+          Map<String, Long> actualToken = IndexReaderUtils.getDocumentVector(reader, collectionDocid);
+          Iterator it = actualToken.entrySet().iterator();
+          while (it.hasNext()) {
+            Map.Entry pair = (Map.Entry)it.next();
+            assertEquals(tokens.get(collectionDocid).get("contents").get(pair.getKey()), pair.getValue());
+            it.remove();
+          }
+        } catch (NotStoredException e) {
+          e.printStackTrace();
+        }
+      }
     }
     reader.close();
 
@@ -265,7 +287,12 @@ public void testSearching() {
     for (Map.Entry<String, SearchArgs> entry : testQueries.entrySet()) {
       SearchCollection searcher = new SearchCollection(entry.getValue());
       searcher.runTopics();
+      Map<String, List<String>> actualQuery = searcher.getQueries();
       searcher.close();
+      //check query tokens
+      if(!queryTokens.isEmpty()){
+        assertEquals(queryTokens, actualQuery);
+      }
       checkRankingResults(entry.getKey(), entry.getValue().output);
       // Remember to cleanup run files.
       cleanup.add(new File(entry.getValue().output));
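The document-vector check above relies on IndexReaderUtils.getDocumentVector, which throws NotStoredException when the index was built without document vectors. Outside the test harness, the same inspection might look like the sketch below; the index path is a placeholder:

import io.anserini.index.IndexReaderUtils;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;

import java.nio.file.Paths;
import java.util.Map;

public class DocVectorPeek {
  public static void main(String[] args) throws Exception {
    IndexReader reader = DirectoryReader.open(
        FSDirectory.open(Paths.get("indexes/pretokenized")));  // placeholder path
    // Same helper the test uses: term -> frequency for one document.
    Map<String, Long> vector = IndexReaderUtils.getDocumentVector(reader, "2000001");
    vector.forEach((term, freq) -> System.out.println(term + " -> " + freq));
    // With -pretokenized, expect entries like "##vert" -> 2 and "##ing" -> 2.
    reader.close();
  }
}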
92 changes: 92 additions & 0 deletions src/test/java/io/anserini/integration/PretokenizedIndexEndToEndTest.java
@@ -0,0 +1,92 @@
/*
* Anserini: A Lucene toolkit for replicable information retrieval research
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.anserini.integration;

import io.anserini.collection.DocumentCollection;
import io.anserini.collection.JsonCollection;
import io.anserini.collection.TrecCollection;
import io.anserini.index.IndexArgs;
import io.anserini.index.IndexCollection;
import io.anserini.index.generator.DefaultLuceneDocumentGenerator;
import io.anserini.search.SearchArgs;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;


public class PretokenizedIndexEndToEndTest extends EndToEndTest {
  @Override
  IndexArgs getIndexArgs() {
    IndexArgs indexArgs = createDefaultIndexArgs();
    indexArgs.input = "src/test/resources/sample_docs/json/collection_tokenized";
    indexArgs.collectionClass = JsonCollection.class.getSimpleName();
    indexArgs.generatorClass = DefaultLuceneDocumentGenerator.class.getSimpleName();
    indexArgs.pretokenized = true;
    indexArgs.storeRaw = true;

    return indexArgs;
  }

  @Override
  protected void setCheckIndexGroundTruth() {
    docCount = 2;
    documents.put("2000000", Map.of(
        "contents", "this was ##a simple pretokenized test",
        "raw","{\n" +
            " \"id\" : \"2000000\",\n" +
            " \"contents\" : \"this was ##a simple pretokenized test\"\n" +
            "}"));
    documents.put("2000001", Map.of(
        "contents", "some time extra ##vert ##ing and some time intro ##vert ##ing",
        "raw","{\n" +
            " \"id\" : \"2000001\",\n" +
            " \"contents\" : \"some time extra ##vert ##ing and some time intro ##vert ##ing\"\n" +
            "}"
    ));
    tokens.put("2000000", Map.of(
        "contents", Map.of(
            "this", 1L, "was", 1L, "##a", 1L, "simple", 1L, "pretokenized", 1L, "test", 1L)));
    tokens.put("2000001",Map.of(
        "contents", Map.of(
            "some", 2L, "time", 2L, "extra", 1L, "##vert", 2L, "##ing", 2L, "and", 1L, "intro", 1L)));

    fieldNormStatusTotalFields = 1;
    // whitespace analyzer keeps everything, includes docid
    // this is ##a simple pretokenized test some time extra ##vert ##ing and intro 2000000 2000001
    termIndexStatusTermCount = 15;
    termIndexStatusTotFreq = 15;
    storedFieldStatusTotalDocCounts = 2;
    termIndexStatusTotPos = 17 + storedFieldStatusTotalDocCounts;
    storedFieldStatusTotFields = 6; // 1 docs * (1 id + 1 contents + 1 raw) *2
  }

  @Override
  protected void setSearchGroundTruth() {
    topicReader = "TsvInt";
    topicFile = "src/test/resources/sample_topics/json_topics.tsv";
    SearchArgs searchArg = createDefaultSearchArgs().bm25();
    searchArg.pretokenized = true;
    testQueries.put("bm25", searchArg);
    queryTokens.put("1", new ArrayList<>());
    queryTokens.get("1").add("##ing");
    queryTokens.get("1").add("##vert");
    referenceRunOutput.put("bm25", new String[]{
        "1 Q0 2000001 1 0.922400 Anserini"});
  }

}
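As a cross-check, the ground-truth constants follow directly from the two sample documents: their contents fields contribute 6 and 7 distinct terms with no overlap, and since the whitespace analyzer also indexes the id field, the two docids bring the unique-term count to 15. The 17 in termIndexStatusTotPos is the 6 + 11 token occurrences in the contents fields, with one id position per document added via storedFieldStatusTotalDocCounts, and storedFieldStatusTotFields = 6 is 2 documents times 3 stored fields each (id, contents, raw).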
10 changes: 10 additions & 0 deletions (new file under src/test/resources/sample_docs/json/collection_tokenized)
@@ -0,0 +1,10 @@
[
  {
    "id": "2000000",
    "contents": "this was ##a simple pretokenized test"
  },
  {
    "id": "2000001",
    "contents": "some time extra ##vert ##ing and some time intro ##vert ##ing"
  }
]
1 change: 1 addition & 0 deletions src/test/resources/sample_topics/json_topics.tsv
@@ -0,0 +1 @@
1 ##ing ##vert
