From 5781c871db12f0e36139982fbf1c805cfec189ee Mon Sep 17 00:00:00 2001
From: Jimmy Lin <jimmylin@uwaterloo.ca>
Date: Thu, 15 Apr 2021 18:57:03 -0400
Subject: [PATCH] Improved e2e testing: implemented getDocumentTokens in
 IndexReaderUtils (#1521)

---
 .../io/anserini/index/IndexReaderUtils.java   | 65 +++++++++++++++++++
 .../integration/AclAnthologyEndToEndTest.java |  6 +-
 .../integration/BibtexEndToEndTest.java       |  6 +-
 .../anserini/integration/C4EndToEndTest.java  |  6 +-
 .../integration/CoreEndToEndTest.java         |  6 +-
 .../io/anserini/integration/EndToEndTest.java | 24 +++----
 .../integration/MultiThreadingSearchTest.java |  6 +-
 .../PretokenizedIndexEndToEndTest.java        | 22 ++-----
 .../TrecEndToEndExternalStopwordsTest.java    | 15 ++++-
 .../integration/TrecEndToEndPassageTest.java  | 15 ++++-
 .../integration/TrecEndToEndTest.java         | 15 ++++-
 .../TrecEndToEndWhitelistTest.java            |  7 +-
 .../integration/TweetEndToEndTest.java        |  8 +--
 13 files changed, 141 insertions(+), 60 deletions(-)
diff --git a/src/main/java/io/anserini/index/IndexReaderUtils.java b/src/main/java/io/anserini/index/IndexReaderUtils.java
index 9e9d06e0c8..213ac6f114 100755
--- a/src/main/java/io/anserini/index/IndexReaderUtils.java
+++ b/src/main/java/io/anserini/index/IndexReaderUtils.java
@@ -55,6 +55,7 @@
 import java.io.IOException;
 import java.nio.file.Paths;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.HashMap;
 import java.util.Iterator;
 import java.util.List;
@@ -349,6 +350,70 @@ public static List<Posting> getPostingsListWithAnalyzer(IndexReader reader, Stri
     return getPostingsList(reader, term, analyzer);
   }
 
+  /**
+   * Returns the tokens of a document in positional order, reconstructed from the stored term vector of the
+   * {@code contents} field. Positions for which the term vector holds no term (e.g., tokens dropped by the analyzer,
+   * such as stopwords) are {@code null} in the returned list. The list is fixed-size. Note that this method explicitly
+   * returns {@code null} if the document does not exist (as opposed to an empty list), so that the caller is forced to
+   * handle this case.
+   *
+   * @param reader index reader
+   * @param docid collection docid
+   * @return tokens indexed by position (with {@code null} gaps), or {@code null} if the document does not exist
+   * @throws IOException if error encountered during query
+   * @throws NotStoredException if the term vector is not stored or positions are not stored
+   */
+  public static List<String> getDocumentTokens(IndexReader reader, String docid) throws IOException, NotStoredException {
+    int ldocid = convertDocidToLuceneDocid(reader, docid);
+    if (ldocid == -1) {
+      return null;
+    }
+    Terms terms = reader.getTermVector(ldocid, IndexArgs.CONTENTS);
+    if (terms == null) {
+      throw new NotStoredException("Document vector not stored!");
+    }
+    if (!terms.hasPositions()) {
+      throw new NotStoredException("Document vector not stored with positions!");
+    }
+
+    // We need to first find out how long the document vector is so we can allocate an array for it.
+    // The temptation is to just call terms.getSumTotalTermFreq(), but we can't - since this value will not include stopwords!
+    // The only sure way is to iterate through all the terms once to find the max position.
+    // Position is zero-based; starting at -1 makes a term vector without any terms yield an empty list.
+    TermsEnum te = terms.iterator();
+    PostingsEnum postingsEnum = null;
+    int maxPos = -1;
+    while (te.next() != null) {
+      // Explicitly request positions: the single-argument postings() overload only asks for frequencies.
+      postingsEnum = te.postings(postingsEnum, PostingsEnum.POSITIONS);
+      postingsEnum.nextDoc();
+
+      int freq = postingsEnum.freq();
+      for (int j = 0; j < freq; j++) {
+        maxPos = Math.max(maxPos, postingsEnum.nextPosition());
+      }
+    }
+
+    // We now know how long to make the array; slots never assigned below stay null.
+    String[] tokens = new String[maxPos + 1];
+
+    // Go through the terms again, this time to actually build the list of tokens. The Terms instance fetched
+    // above is reused for a fresh iterator rather than reading the term vector from the index a second time.
+    te = terms.iterator();
+    while (te.next() != null) {
+      postingsEnum = te.postings(postingsEnum, PostingsEnum.POSITIONS);
+      postingsEnum.nextDoc();
+
+      String token = te.term().utf8ToString();
+      int freq = postingsEnum.freq();
+      for (int j = 0; j < freq; j++) {
+        tokens[postingsEnum.nextPosition()] = token;
+      }
+    }
+
+    return Arrays.asList(tokens);
+  }
+
   /**
    * Returns the document vector for a particular document as a map of terms to term frequencies. Note that this
    * method explicitly returns {@code null} if the document does not exist (as opposed to an empty map), so that the
diff --git a/src/test/java/io/anserini/integration/AclAnthologyEndToEndTest.java b/src/test/java/io/anserini/integration/AclAnthologyEndToEndTest.java
index 915a7968bc..3fd5da0b8d 100644
--- a/src/test/java/io/anserini/integration/AclAnthologyEndToEndTest.java
+++ b/src/test/java/io/anserini/integration/AclAnthologyEndToEndTest.java
@@ -37,7 +37,7 @@ protected IndexArgs getIndexArgs() {
   @Override
   protected void setCheckIndexGroundTruth() {
     docCount = 3;
-    documents.put("E17-1003", Map.of(
+    referenceDocs.put("E17-1003", Map.of(
         "contents",
         "Exploring Different Dimensions of Attention for Uncertainty Detection Neural networks with attention " +
             "have proven effective for many natural language processing tasks. In this paper, we develop attention " +
@@ -58,12 +58,12 @@ protected void setCheckIndexGroundTruth() {
             "Our novel architectures set the new state of the art on a Wikipedia benchmark dataset and perform " +
             "similar to the state-of-the-art model on a biomedical benchmark which uses a large set of linguistic " +
             "features."));
-    documents.put("C00-1003", Map.of(
+    referenceDocs.put("C00-1003", Map.of(
         "contents",
         "Selectional Restrictions in HPSG ",
         "raw",
         "Selectional Restrictions in HPSG "));
-    documents.put("C00-1007", Map.of(
+    referenceDocs.put("C00-1007", Map.of(
         "contents",
         "Exploiting a Probabilistic Hierarchical Model for Generation ",
         "raw",
diff --git a/src/test/java/io/anserini/integration/BibtexEndToEndTest.java b/src/test/java/io/anserini/integration/BibtexEndToEndTest.java
index 0a2ed05787..e8461eeec0 100644
--- a/src/test/java/io/anserini/integration/BibtexEndToEndTest.java
+++ b/src/test/java/io/anserini/integration/BibtexEndToEndTest.java
@@ -37,13 +37,13 @@ protected IndexArgs getIndexArgs() {
   @Override
   protected void setCheckIndexGroundTruth() {
     docCount = 3;
-    documents.put("article-id", Map.of(
+    referenceDocs.put("article-id", Map.of(
         "contents", "this is the title. ",
         "raw", "this is the title. "));
-    documents.put("inproceedings-id", Map.of(
+    referenceDocs.put("inproceedings-id", Map.of(
         "contents", "this is the title. this is the abstract",
         "raw", "this is the title. this is the abstract"));
-    documents.put("proceedings-id", Map.of(
+    referenceDocs.put("proceedings-id", Map.of(
         "contents", "this is the title. ",
         "raw", "this is the title. "));
 
diff --git a/src/test/java/io/anserini/integration/C4EndToEndTest.java b/src/test/java/io/anserini/integration/C4EndToEndTest.java
index 04f36e2641..0f57814d1f 100644
--- a/src/test/java/io/anserini/integration/C4EndToEndTest.java
+++ b/src/test/java/io/anserini/integration/C4EndToEndTest.java
@@ -16,10 +16,8 @@
 
 package io.anserini.integration;
 
-import io.anserini.collection.BibtexCollection;
 import io.anserini.collection.C4Collection;
 import io.anserini.index.IndexArgs;
-import io.anserini.index.generator.BibtexGenerator;
 import io.anserini.index.generator.C4Generator;
 
 import java.util.Map;
@@ -41,14 +39,14 @@ protected IndexArgs getIndexArgs() {
   @Override
   protected void setCheckIndexGroundTruth() {
     docCount = 2;
-    documents.put("c4-0001-000000", Map.of(
+    referenceDocs.put("c4-0001-000000", Map.of(
             "contents", "test text",
             "raw", "{\n" +
                     "  \"text\" : \"test text\",\n" +
                     "  \"timestamp\" : \"2019-04-23T08:26:47Z\",\n" +
                     "  \"url\" : \"http://www.test.com\"\n" +
                     "}"));
-    documents.put("c4-0001-000001", Map.of(
+    referenceDocs.put("c4-0001-000001", Map.of(
             "contents", "test text2",
             "raw", "{\n" +
                     "  \"text\" : \"test text2\",\n" +
diff --git a/src/test/java/io/anserini/integration/CoreEndToEndTest.java b/src/test/java/io/anserini/integration/CoreEndToEndTest.java
index d6a1ba6381..0a0284d8f5 100644
--- a/src/test/java/io/anserini/integration/CoreEndToEndTest.java
+++ b/src/test/java/io/anserini/integration/CoreEndToEndTest.java
@@ -37,13 +37,13 @@ protected IndexArgs getIndexArgs() {
   @Override
   protected void setCheckIndexGroundTruth() {
     docCount = 3;
-    documents.put("fullCoreDoc", Map.of(
+    referenceDocs.put("fullCoreDoc", Map.of(
         "contents", "Full CORE doc ",
         "raw", "Full CORE doc "));
-    documents.put("coreDoc1", Map.of(
+    referenceDocs.put("coreDoc1", Map.of(
         "contents", "this is the title 1 this is the abstract 1",
         "raw", "this is the title 1 this is the abstract 1"));
-    documents.put("doi2", Map.of(
+    referenceDocs.put("doi2", Map.of(
         "contents", "this is the title 2 this is the abstract 2",
         "raw", "this is the title 2 this is the abstract 2"));
 
diff --git a/src/test/java/io/anserini/integration/EndToEndTest.java b/src/test/java/io/anserini/integration/EndToEndTest.java
index fe396b18ff..60baf15252 100644
--- a/src/test/java/io/anserini/integration/EndToEndTest.java
+++ b/src/test/java/io/anserini/integration/EndToEndTest.java
@@ -43,11 +43,11 @@
 import java.io.PrintStream;
 import java.nio.file.Paths;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.Random;
-import java.util.Iterator;
 
 // This automatically tests indexing, retrieval, and evaluation from end to end.
 // Subclasses inherit and special to different collections.
@@ -63,8 +63,8 @@ public abstract class EndToEndTest extends LuceneTestCase {
   protected String topicFile;
   protected String searchOutputPrefix = "e2eTestSearch";
   protected Map<String, String[]> referenceRunOutput = new HashMap<>();
-  protected Map<String, Map<String, String>> documents = new HashMap<>();
-  protected Map<String, Map<String, Map<String, Long>>> tokens = new HashMap<>();
+  protected Map<String, Map<String, String>> referenceDocs = new HashMap<>();
+  protected Map<String, Map<String, List<String>>> referenceDocTokens = new HashMap<>();
   protected Map<String, List<String>>  queryTokens = new HashMap<>();
 
   // These are the sources of truth
@@ -198,20 +198,14 @@ public void checkIndex() throws IOException {
 
     for (int i=0; i<reader.maxDoc(); i++) {
       String collectionDocid = IndexReaderUtils.convertLuceneDocidToDocid(reader, i);
-      assertEquals(documents.get(collectionDocid).get("raw"),
-          IndexReaderUtils.documentRaw(reader, collectionDocid));
-      assertEquals(documents.get(collectionDocid).get("contents"),
-          IndexReaderUtils.documentContents(reader, collectionDocid));
+      assertEquals(referenceDocs.get(collectionDocid).get("raw"), IndexReaderUtils.documentRaw(reader, collectionDocid));
+      assertEquals(referenceDocs.get(collectionDocid).get("contents"), IndexReaderUtils.documentContents(reader, collectionDocid));
+
       // check list of tokens by calling document vector
-      if(!tokens.isEmpty()){
+      if (!referenceDocTokens.isEmpty()){
         try {
-          Map<String, Long> actualToken = IndexReaderUtils.getDocumentVector(reader, collectionDocid);
-          Iterator it = actualToken.entrySet().iterator();
-          while (it.hasNext()) {
-            Map.Entry pair = (Map.Entry)it.next();
-            assertEquals(tokens.get(collectionDocid).get("contents").get(pair.getKey()), pair.getValue());
-            it.remove();
-          }
+          List<String> docTokens = IndexReaderUtils.getDocumentTokens(reader, collectionDocid);
+          assertEquals(referenceDocTokens.get(collectionDocid).get("contents"), docTokens);
         } catch (NotStoredException e) {
           e.printStackTrace();
         }
diff --git a/src/test/java/io/anserini/integration/MultiThreadingSearchTest.java b/src/test/java/io/anserini/integration/MultiThreadingSearchTest.java
index 0cfe1ba967..86388417f0 100644
--- a/src/test/java/io/anserini/integration/MultiThreadingSearchTest.java
+++ b/src/test/java/io/anserini/integration/MultiThreadingSearchTest.java
@@ -45,14 +45,14 @@ protected IndexArgs getIndexArgs() {
   @Override
   protected void setCheckIndexGroundTruth() {
     docCount = 3;
-    documents.put("TREC_DOC_1", Map.of(
+    referenceDocs.put("TREC_DOC_1", Map.of(
         "contents", "This is head very simple text",
         "raw", "<HEAD>This is head</HEAD>\n" +
             "<TEXT>\n" +
             "very simple\n" +
             "text\n" +
             "</TEXT>"));
-    documents.put("WSJ_1", Map.of(
+    referenceDocs.put("WSJ_1", Map.of(
         "contents", "head text 01/30/03 content",
         "raw", "<HL>\n" +
             "head text\n" +
@@ -65,7 +65,7 @@ protected void setCheckIndexGroundTruth() {
             "</LP>\n" +
             "<TEXT>\n" +
             "</TEXT>"));
-    documents.put("DOC222", Map.of(
+    referenceDocs.put("DOC222", Map.of(
         "contents", "HEAD simple enough text text text",
         "raw", "<HEAD>HEAD</HEAD>\n" +
             "<TEXT>\n" +
diff --git a/src/test/java/io/anserini/integration/PretokenizedIndexEndToEndTest.java b/src/test/java/io/anserini/integration/PretokenizedIndexEndToEndTest.java
index c36ca0c706..cfb6034e59 100644
--- a/src/test/java/io/anserini/integration/PretokenizedIndexEndToEndTest.java
+++ b/src/test/java/io/anserini/integration/PretokenizedIndexEndToEndTest.java
@@ -15,15 +15,11 @@
  */
 package io.anserini.integration;
 
-import io.anserini.collection.DocumentCollection;
 import io.anserini.collection.JsonCollection;
-import io.anserini.collection.TrecCollection;
 import io.anserini.index.IndexArgs;
-import io.anserini.index.IndexCollection;
 import io.anserini.index.generator.DefaultLuceneDocumentGenerator;
 import io.anserini.search.SearchArgs;
 
-import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Map;
@@ -45,34 +41,30 @@ IndexArgs getIndexArgs() {
   @Override
   protected void setCheckIndexGroundTruth() {
     docCount = 2;
-    documents.put("2000000", Map.of(
+    referenceDocs.put("2000000", Map.of(
       "contents", "this was ##a simple pretokenized test",
       "raw","{\n" +
       "  \"id\" : \"2000000\",\n" +
       "  \"contents\" : \"this was ##a simple pretokenized test\"\n" +
       "}"));
-    documents.put("2000001", Map.of(
+    referenceDocs.put("2000001", Map.of(
       "contents", "some time extra ##vert ##ing and some time intro ##vert ##ing",
       "raw","{\n" +
       "  \"id\" : \"2000001\",\n" +
       "  \"contents\" : \"some time extra ##vert ##ing and some time intro ##vert ##ing\"\n" +
       "}"
     ));
-    tokens.put("2000000", Map.of(
-      "contents", Map.of(
-      "this", 1L, "was", 1L, "##a", 1L, "simple", 1L, "pretokenized", 1L, "test", 1L)));
-    tokens.put("2000001",Map.of(
-      "contents", Map.of(
-      "some", 2L, "time", 2L, "extra", 1L, "##vert", 2L, "##ing", 2L, "and", 1L, "intro", 1L)));
+    referenceDocTokens.put("2000000", Map.of(
+      "contents", List.of("this", "was", "##a", "simple", "pretokenized", "test")));
+    referenceDocTokens.put("2000001", Map.of(
+      "contents", List.of("some", "time", "extra", "##vert", "##ing", "and", "some", "time", "intro", "##vert", "##ing")));
 
     fieldNormStatusTotalFields = 1;
-    // whitespace analyzer keeps everything, includes docid
-    // this is ##a simple pretokenized test some time extra ##vert ##ing and intro 2000000 2000001
     termIndexStatusTermCount = 15;
     termIndexStatusTotFreq = 15;
     storedFieldStatusTotalDocCounts = 2;
     termIndexStatusTotPos = 17 + storedFieldStatusTotalDocCounts;
-    storedFieldStatusTotFields = 6; // 1 docs * (1 id + 1 contents + 1 raw) *2
+    storedFieldStatusTotFields = 6;
   }
 
   @Override
diff --git a/src/test/java/io/anserini/integration/TrecEndToEndExternalStopwordsTest.java b/src/test/java/io/anserini/integration/TrecEndToEndExternalStopwordsTest.java
index 50b6a1fc49..c50c76d4bc 100644
--- a/src/test/java/io/anserini/integration/TrecEndToEndExternalStopwordsTest.java
+++ b/src/test/java/io/anserini/integration/TrecEndToEndExternalStopwordsTest.java
@@ -3,6 +3,8 @@
 import io.anserini.collection.TrecCollection;
 import io.anserini.index.IndexArgs;
 
+import java.util.Arrays;
+import java.util.List;
 import java.util.Map;
 
 public class TrecEndToEndExternalStopwordsTest extends EndToEndTest {
@@ -20,14 +22,14 @@ protected IndexArgs getIndexArgs() {
   @Override
   protected void setCheckIndexGroundTruth() {
     docCount = 3;
-    documents.put("TREC_DOC_1", Map.of(
+    referenceDocs.put("TREC_DOC_1", Map.of(
             "contents", "This is head very simple text",
             "raw", "<HEAD>This is head</HEAD>\n" +
                     "<TEXT>\n" +
                     "very simple\n" +
                     "text\n" +
                     "</TEXT>"));
-    documents.put("WSJ_1", Map.of(
+    referenceDocs.put("WSJ_1", Map.of(
             "contents", "head text 01/30/03 content",
             "raw", "<HL>\n" +
                     "head text\n" +
@@ -40,7 +42,7 @@ protected void setCheckIndexGroundTruth() {
                     "</LP>\n" +
                     "<TEXT>\n" +
                     "</TEXT>"));
-    documents.put("DOC222", Map.of(
+    referenceDocs.put("DOC222", Map.of(
             "contents", "HEAD simple enough text text text",
             "raw", "<HEAD>HEAD</HEAD>\n" +
                     "<TEXT>\n" +
@@ -51,6 +53,13 @@ protected void setCheckIndexGroundTruth() {
                     "text\n" +
                     "</TEXT>"));
 
+    referenceDocTokens.put("TREC_DOC_1", Map.of(
+        "contents", Arrays.asList(new String[]{"thi", "is", "head", "veri", null, "text"})));
+    referenceDocTokens.put("WSJ_1", Map.of(
+        "contents", List.of("head", "text", "01", "30", "03", "content")));
+    referenceDocTokens.put("DOC222", Map.of(
+        "contents", Arrays.asList(new String[]{"head", null, null, "text", "text", "text"})));
+
     // Terms per document:
     // d1: TREC_DOC_1 this is head very simple text
     // d2: DOC222 head simple enough text
diff --git a/src/test/java/io/anserini/integration/TrecEndToEndPassageTest.java b/src/test/java/io/anserini/integration/TrecEndToEndPassageTest.java
index c25fdbe183..dfa9fc268d 100644
--- a/src/test/java/io/anserini/integration/TrecEndToEndPassageTest.java
+++ b/src/test/java/io/anserini/integration/TrecEndToEndPassageTest.java
@@ -20,6 +20,8 @@
 import io.anserini.index.IndexArgs;
 import io.anserini.search.SearchArgs;
 
+import java.util.Arrays;
+import java.util.List;
 import java.util.Map;
 
 public class TrecEndToEndPassageTest extends EndToEndTest {
@@ -36,14 +38,14 @@ protected IndexArgs getIndexArgs() {
   @Override
   protected void setCheckIndexGroundTruth() {
     docCount = 3;
-    documents.put("TREC_DOC_1.00001", Map.of(
+    referenceDocs.put("TREC_DOC_1.00001", Map.of(
         "contents", "This is head very simple text",
         "raw", "<HEAD>This is head</HEAD>\n" +
             "<TEXT>\n" +
             "very simple\n" +
             "text\n" +
             "</TEXT>"));
-    documents.put("WSJ_1", Map.of(
+    referenceDocs.put("WSJ_1", Map.of(
         "contents", "head text 01/30/03 content",
         "raw", "<HL>\n" +
             "head text\n" +
@@ -56,7 +58,7 @@ protected void setCheckIndexGroundTruth() {
             "</LP>\n" +
             "<TEXT>\n" +
             "</TEXT>"));
-    documents.put("TREC_DOC_1.00002", Map.of(
+    referenceDocs.put("TREC_DOC_1.00002", Map.of(
         "contents", "HEAD simple enough text text text",
         "raw", "<HEAD>HEAD</HEAD>\n" +
             "<TEXT>\n" +
@@ -67,6 +69,13 @@ protected void setCheckIndexGroundTruth() {
             "text\n" +
             "</TEXT>"));
 
+    referenceDocTokens.put("TREC_DOC_1.00001", Map.of(
+        "contents", Arrays.asList(new String[]{null, null, "head", "veri", "simpl", "text"})));
+    referenceDocTokens.put("WSJ_1", Map.of(
+        "contents", List.of("head", "text", "01", "30", "03", "content")));
+    referenceDocTokens.put("TREC_DOC_1.00002", Map.of(
+        "contents", List.of("head", "simpl", "enough", "text", "text", "text")));
+
     fieldNormStatusTotalFields = 1;  // text
     termIndexStatusTermCount = 12;   // Note that standard analyzer ignores stopwords; includes docids.
     termIndexStatusTotFreq = 17;
diff --git a/src/test/java/io/anserini/integration/TrecEndToEndTest.java b/src/test/java/io/anserini/integration/TrecEndToEndTest.java
index f876e0cf79..efce8d9c15 100644
--- a/src/test/java/io/anserini/integration/TrecEndToEndTest.java
+++ b/src/test/java/io/anserini/integration/TrecEndToEndTest.java
@@ -19,6 +19,8 @@
 import io.anserini.collection.TrecCollection;
 import io.anserini.index.IndexArgs;
 
+import java.util.Arrays;
+import java.util.List;
 import java.util.Map;
 
 public class TrecEndToEndTest extends EndToEndTest {
@@ -35,14 +37,14 @@ protected IndexArgs getIndexArgs() {
   @Override
   protected void setCheckIndexGroundTruth() {
     docCount = 3;
-    documents.put("TREC_DOC_1", Map.of(
+    referenceDocs.put("TREC_DOC_1", Map.of(
         "contents", "This is head very simple text",
         "raw", "<HEAD>This is head</HEAD>\n" +
             "<TEXT>\n" +
             "very simple\n" +
             "text\n" +
             "</TEXT>"));
-    documents.put("WSJ_1", Map.of(
+    referenceDocs.put("WSJ_1", Map.of(
         "contents", "head text 01/30/03 content",
         "raw", "<HL>\n" +
             "head text\n" +
@@ -55,7 +57,7 @@ protected void setCheckIndexGroundTruth() {
             "</LP>\n" +
             "<TEXT>\n" +
             "</TEXT>"));
-    documents.put("DOC222", Map.of(
+    referenceDocs.put("DOC222", Map.of(
         "contents", "HEAD simple enough text text text",
         "raw", "<HEAD>HEAD</HEAD>\n" +
             "<TEXT>\n" +
@@ -66,6 +68,13 @@ protected void setCheckIndexGroundTruth() {
             "text\n" +
             "</TEXT>"));
 
+    referenceDocTokens.put("TREC_DOC_1", Map.of(
+        "contents", Arrays.asList(new String[]{null, null, "head", "veri", "simpl", "text"})));
+    referenceDocTokens.put("WSJ_1", Map.of(
+        "contents", List.of("head", "text", "01", "30", "03", "content")));
+    referenceDocTokens.put("DOC222", Map.of(
+        "contents", List.of("head", "simpl", "enough", "text", "text", "text")));
+
     fieldNormStatusTotalFields = 1;  // text
     termIndexStatusTermCount = 12;   // Note that standard analyzer ignores stopwords; includes docids.
     termIndexStatusTotFreq = 17;
diff --git a/src/test/java/io/anserini/integration/TrecEndToEndWhitelistTest.java b/src/test/java/io/anserini/integration/TrecEndToEndWhitelistTest.java
index c57089b7ad..4aa8a76281 100644
--- a/src/test/java/io/anserini/integration/TrecEndToEndWhitelistTest.java
+++ b/src/test/java/io/anserini/integration/TrecEndToEndWhitelistTest.java
@@ -19,6 +19,8 @@
 import io.anserini.collection.TrecCollection;
 import io.anserini.index.IndexArgs;
 
+import java.util.Arrays;
+import java.util.List;
 import java.util.Map;
 
 public class TrecEndToEndWhitelistTest extends EndToEndTest {
@@ -37,7 +39,7 @@ protected IndexArgs getIndexArgs() {
   @Override
   protected void setCheckIndexGroundTruth() {
     docCount = 1;
-    documents.put("DOC222", Map.of(
+    referenceDocs.put("DOC222", Map.of(
         "contents", "HEAD simple enough text text text",
         "raw", "<HEAD>HEAD</HEAD>\n" +
             "<TEXT>\n" +
@@ -48,6 +50,9 @@ protected void setCheckIndexGroundTruth() {
             "text\n" +
             "</TEXT>"));
 
+    referenceDocTokens.put("DOC222", Map.of(
+        "contents", List.of("head", "simpl", "enough", "text", "text", "text")));
+
     fieldNormStatusTotalFields = 1;  // text
     termIndexStatusTermCount = 5;   // Note that standard analyzer ignores stopwords; includes docids.
     termIndexStatusTotFreq = 5;
diff --git a/src/test/java/io/anserini/integration/TweetEndToEndTest.java b/src/test/java/io/anserini/integration/TweetEndToEndTest.java
index 2907d5a2f3..f8d14077ea 100644
--- a/src/test/java/io/anserini/integration/TweetEndToEndTest.java
+++ b/src/test/java/io/anserini/integration/TweetEndToEndTest.java
@@ -48,16 +48,16 @@ protected void setCheckIndexGroundTruth() {
     // Note that based on our settings, retweets and tweets with id > 9 will not be indexed.
 
     docCount = 4;
-    documents.put("3", Map.of(
+    referenceDocs.put("3", Map.of(
         "contents", "This tweet will be indexed thanks",
         "raw", "{\"created_at\":\"Thu Aug 11 22:57:52 +0000 2016\",\"id\":3,\"id_str\":\"3\",\"text\":\"This tweet will be indexed thanks.\",\"source\":\"\\u003ca href=\\\"http:\\/\\/twitter.com\\/download\\/android\\\" rel=\\\"nofollow\\\"\\u003eTwitter for Android\\u003c\\/a\\u003e\",\"truncated\":false,\"in_reply_to_status_id\":null,\"in_reply_to_status_id_str\":null,\"in_reply_to_user_id\":null,\"in_reply_to_user_id_str\":null,\"in_reply_to_screen_name\":null,\"user\":{\"id\":3358015773,\"id_str\":\"3358015773\",\"name\":\"Cami\",\"screen_name\":\"B\",\"location\":\"Ciudad Aut\\u00f3noma de Buenos Aire\",\"url\":null,\"description\":\"15.Geminiana\\u264a Ig: CamiiMariana15 Snap: camilaracabutto\",\"protected\":false,\"verified\":false,\"followers_count\":392,\"friends_count\":307,\"listed_count\":0,\"favourites_count\":11254,\"statuses_count\":21876,\"created_at\":\"Sat Jul 04 04:32:40 +0000 2015\",\"utc_offset\":-25200,\"time_zone\":\"Pacific Time (US & Canada)\",\"geo_enabled\":false,\"lang\":\"es\",\"contributors_enabled\":false,\"is_translator\":false,\"profile_background_color\":\"000000\",\"profile_background_image_url\":\"http:\\/\\/abs.twimg.com\\/images\\/themes\\/theme1\\/bg.png\",\"profile_background_image_url_https\":\"https:\\/\\/abs.twimg.com\\/images\\/themes\\/theme1\\/bg.png\",\"profile_background_tile\":false,\"profile_link_color\":\"9266CC\",\"profile_sidebar_border_color\":\"000000\",\"profile_sidebar_fill_color\":\"000000\",\"profile_text_color\":\"000000\",\"profile_use_background_image\":false,\"profile_image_url\":\"http:\\/\\/pbs.twimg.com\\/profile_images\\/742940112527429636\\/2EcOpkFu_normal.jpg\",\"profile_image_url_https\":\"https:\\/\\/pbs.twimg.com\\/profile_images\\/742940112527429636\\/2EcOpkFu_normal.jpg\",\"profile_banner_url\":\"https:\\/\\/pbs.twimg.com\\/profile_banners\\/3358015773\\/1470945786\",\"default_profile\":false,\"default_profile_image\":false,\"following\":null,\"follow_request_sent\":null,\"notifications\":nul
l},\"geo\":null,\"coordinates\":null,\"place\":null,\"contributors\":null,\"is_quote_status\":false,\"retweet_count\":0,\"favorite_count\":0,\"entities\":{\"hashtags\":[],\"urls\":[],\"user_mentions\":[{\"screen_name\":\"Jul1et4wizz\",\"name\":\"Julieta\",\"id\":1599099673,\"id_str\":\"1599099673\",\"indices\":[3,15]}],\"symbols\":[]},\"favorited\":false,\"retweeted\":false,\"filter_level\":\"low\",\"lang\":\"en\",\"timestamp_ms\":\"1470956272659\"}"));
-    documents.put("5", Map.of(
+    referenceDocs.put("5", Map.of(
         "contents", "Can you think of more interesting contents",
         "raw", "{\"created_at\":\"Thu Aug 11 23:57:52 +0000 2016\",\"id\":5,\"id_str\":\"5\",\"text\":\"Can you think of more interesting contents?\",\"source\":\"\\u003ca href=\\\"http:\\/\\/twitter.com\\/download\\/android\\\" rel=\\\"nofollow\\\"\\u003eTwitter for Android\\u003c\\/a\\u003e\",\"truncated\":false,\"in_reply_to_status_id\":null,\"in_reply_to_status_id_str\":null,\"in_reply_to_user_id\":null,\"in_reply_to_user_id_str\":null,\"in_reply_to_screen_name\":null,\"user\":{\"id\":3358015773,\"id_str\":\"3358015773\",\"name\":\"Cami\",\"screen_name\":\"C\",\"location\":\"Ciudad Aut\\u00f3noma de Buenos Aire\",\"url\":null,\"description\":\"15.Geminiana\\u264a Ig: CamiiMariana15 Snap: camilaracabutto\",\"protected\":false,\"verified\":false,\"followers_count\":392,\"friends_count\":307,\"listed_count\":0,\"favourites_count\":11254,\"statuses_count\":21876,\"created_at\":\"Sat Jul 04 04:32:40 +0000 2015\",\"utc_offset\":-25200,\"time_zone\":\"Pacific Time (US & Canada)\",\"geo_enabled\":false,\"lang\":\"es\",\"contributors_enabled\":false,\"is_translator\":false,\"profile_background_color\":\"000000\",\"profile_background_image_url\":\"http:\\/\\/abs.twimg.com\\/images\\/themes\\/theme1\\/bg.png\",\"profile_background_image_url_https\":\"https:\\/\\/abs.twimg.com\\/images\\/themes\\/theme1\\/bg.png\",\"profile_background_tile\":false,\"profile_link_color\":\"9266CC\",\"profile_sidebar_border_color\":\"000000\",\"profile_sidebar_fill_color\":\"000000\",\"profile_text_color\":\"000000\",\"profile_use_background_image\":false,\"profile_image_url\":\"http:\\/\\/pbs.twimg.com\\/profile_images\\/742940112527429636\\/2EcOpkFu_normal.jpg\",\"profile_image_url_https\":\"https:\\/\\/pbs.twimg.com\\/profile_images\\/742940112527429636\\/2EcOpkFu_normal.jpg\",\"profile_banner_url\":\"https:\\/\\/pbs.twimg.com\\/profile_banners\\/3358015773\\/1470945786\",\"default_profile\":false,\"default_profile_image\":false,\"following\":null,\"follow_request_sent\":null,\"notificati
ons\":null},\"geo\":null,\"coordinates\":null,\"place\":null,\"contributors\":null,\"is_quote_status\":false,\"retweet_count\":0,\"favorite_count\":0,\"entities\":{\"hashtags\":[],\"urls\":[],\"user_mentions\":[{\"screen_name\":\"Jul1et4wizz\",\"name\":\"Julieta\",\"id\":1599099673,\"id_str\":\"1599099673\",\"indices\":[3,15]}],\"symbols\":[]},\"favorited\":false,\"retweeted\":false,\"filter_level\":\"low\",\"lang\":\"cn\",\"timestamp_ms\":\"1470956272659\"}"));
-    documents.put("6", Map.of(
+    referenceDocs.put("6", Map.of(
         "contents", "We have some real contents here thanks",
         "raw", "{\"created_at\":\"Thu Aug 11 21:57:50 +0000 2016\",\"id\":6,\"id_str\":\"6\",\"text\":\"We have some real contents here thanks https:\\/\\/t.co\\/1a2b3c\",\"source\":\"\\u003ca href=\\\"http:\\/\\/twitter.com\\/download\\/android\\\" rel=\\\"nofollow\\\"\\u003eTwitter for Android\\u003c\\/a\\u003e\",\"truncated\":false,\"in_reply_to_status_id\":null,\"in_reply_to_status_id_str\":null,\"in_reply_to_user_id\":null,\"in_reply_to_user_id_str\":null,\"in_reply_to_screen_name\":null,\"user\":{\"id\":763875115104960516,\"id_str\":\"763875115104960516\",\"name\":\"Esequiel Manson\",\"screen_name\":\"X\",\"location\":\"San Miguel, Argentina\",\"url\":null,\"description\":null,\"protected\":false,\"verified\":false,\"followers_count\":0,\"friends_count\":2,\"listed_count\":0,\"favourites_count\":2,\"statuses_count\":2,\"created_at\":\"Thu Aug 11 23:09:54 +0000 2016\",\"utc_offset\":null,\"time_zone\":null,\"geo_enabled\":false,\"lang\":\"es\",\"contributors_enabled\":false,\"is_translator\":false,\"profile_background_color\":\"F5F8FA\",\"profile_background_image_url\":\"\",\"profile_background_image_url_https\":\"\",\"profile_background_tile\":false,\"profile_link_color\":\"2B7BB9\",\"profile_sidebar_border_color\":\"C0DEED\",\"profile_sidebar_fill_color\":\"DDEEF6\",\"profile_text_color\":\"333333\",\"profile_use_background_image\":true,\"profile_image_url\":\"http:\\/\\/pbs.twimg.com\\/profile_images\\/763877709353193472\\/Nhe0IMQI_normal.jpg\",\"profile_image_url_https\":\"https:\\/\\/pbs.twimg.com\\/profile_images\\/763877709353193472\\/Nhe0IMQI_normal.jpg\",\"profile_banner_url\":\"https:\\/\\/pbs.twimg.com\\/profile_banners\\/763875115104960516\\/1470957611\",\"default_profile\":true,\"default_profile_image\":false,\"following\":null,\"follow_request_sent\":null,\"notifications\":null},\"geo\":null,\"coordinates\":null,\"place\":null,\"contributors\":null,\"is_quote_status\":false,\"retweet_count\":0,\"favorite_count\":0,\"entities\":{\"hashtags\":[{\"t
ext\":\"perfil\",\"indices\":[0,7]},{\"text\":\"tattoo\",\"indices\":[8,15]},{\"text\":\"feo\",\"indices\":[16,20]},{\"text\":\"paisaje\",\"indices\":[21,29]}],\"urls\":[],\"user_mentions\":[],\"symbols\":[],\"media\":[{\"id\":763887130565287937,\"id_str\":\"763887130565287937\",\"indices\":[30,53],\"media_url\":\"http:\\/\\/pbs.twimg.com\\/media\\/CpnfTEnWIAERJmq.jpg\",\"media_url_https\":\"https:\\/\\/pbs.twimg.com\\/media\\/CpnfTEnWIAERJmq.jpg\",\"url\":\"https:\\/\\/t.co\\/au5Gk5k4Wd\",\"display_url\":\"pic.twitter.com\\/au5Gk5k4Wd\",\"expanded_url\":\"http:\\/\\/twitter.com\\/EsequielManson\\/status\\/763887179798020096\\/photo\\/1\",\"type\":\"photo\",\"sizes\":{\"medium\":{\"w\":1200,\"h\":1200,\"resize\":\"fit\"},\"thumb\":{\"w\":150,\"h\":150,\"resize\":\"crop\"},\"small\":{\"w\":680,\"h\":680,\"resize\":\"fit\"},\"large\":{\"w\":2048,\"h\":2048,\"resize\":\"fit\"}}}]},\"extended_entities\":{\"media\":[{\"id\":763887130565287937,\"id_str\":\"763887130565287937\",\"indices\":[30,53],\"media_url\":\"http:\\/\\/pbs.twimg.com\\/media\\/CpnfTEnWIAERJmq.jpg\",\"media_url_https\":\"https:\\/\\/pbs.twimg.com\\/media\\/CpnfTEnWIAERJmq.jpg\",\"url\":\"https:\\/\\/t.co\\/au5Gk5k4Wd\",\"display_url\":\"pic.twitter.com\\/au5Gk5k4Wd\",\"expanded_url\":\"http:\\/\\/twitter.com\\/EsequielManson\\/status\\/763887179798020096\\/photo\\/1\",\"type\":\"photo\",\"sizes\":{\"medium\":{\"w\":1200,\"h\":1200,\"resize\":\"fit\"},\"thumb\":{\"w\":150,\"h\":150,\"resize\":\"crop\"},\"small\":{\"w\":680,\"h\":680,\"resize\":\"fit\"},\"large\":{\"w\":2048,\"h\":2048,\"resize\":\"fit\"}}}]},\"favorited\":false,\"retweeted\":false,\"possibly_sensitive\":false,\"filter_level\":\"low\",\"lang\":\"und\",\"timestamp_ms\":\"1470959870658\"}"));
-    documents.put("8", Map.of(
+    referenceDocs.put("8", Map.of(
         "contents", "test adding more tweet",
         "raw", "{\"created_at\":\"Thu Aug 11 22:57:50 +0000 2016\",\"id\":8,\"id_str\":\"8\",\"text\":\"test adding more tweets\",\"source\":\"\\u003ca href=\\\"http:\\/\\/twitter.com\\/download\\/android\\\" rel=\\\"nofollow\\\"\\u003eTwitter for Android\\u003c\\/a\\u003e\",\"truncated\":false,\"in_reply_to_status_id\":null,\"in_reply_to_status_id_str\":null,\"in_reply_to_user_id\":null,\"in_reply_to_user_id_str\":null,\"in_reply_to_screen_name\":null,\"user\":{\"id\":763875115104960516,\"id_str\":\"763875115104960516\",\"name\":\"Esequiel Manson\",\"screen_name\":\"Y\",\"location\":\"San Miguel, Argentina\",\"url\":null,\"description\":null,\"protected\":false,\"verified\":false,\"followers_count\":0,\"friends_count\":2,\"listed_count\":0,\"favourites_count\":2,\"statuses_count\":2,\"created_at\":\"Thu Aug 11 23:09:54 +0000 2016\",\"utc_offset\":null,\"time_zone\":null,\"geo_enabled\":false,\"lang\":\"es\",\"contributors_enabled\":false,\"is_translator\":false,\"profile_background_color\":\"F5F8FA\",\"profile_background_image_url\":\"\",\"profile_background_image_url_https\":\"\",\"profile_background_tile\":false,\"profile_link_color\":\"2B7BB9\",\"profile_sidebar_border_color\":\"C0DEED\",\"profile_sidebar_fill_color\":\"DDEEF6\",\"profile_text_color\":\"333333\",\"profile_use_background_image\":true,\"profile_image_url\":\"http:\\/\\/pbs.twimg.com\\/profile_images\\/763877709353193472\\/Nhe0IMQI_normal.jpg\",\"profile_image_url_https\":\"https:\\/\\/pbs.twimg.com\\/profile_images\\/763877709353193472\\/Nhe0IMQI_normal.jpg\",\"profile_banner_url\":\"https:\\/\\/pbs.twimg.com\\/profile_banners\\/763875115104960516\\/1470957611\",\"default_profile\":true,\"default_profile_image\":false,\"following\":null,\"follow_request_sent\":null,\"notifications\":null},\"geo\":null,\"coordinates\":null,\"place\":null,\"contributors\":null,\"is_quote_status\":false,\"retweet_count\":0,\"favorite_count\":0,\"entities\":{\"hashtags\":[{\"text\":\"perfil\",\"indices\":[0,7]},{\"te
xt\":\"tattoo\",\"indices\":[8,15]},{\"text\":\"feo\",\"indices\":[16,20]},{\"text\":\"paisaje\",\"indices\":[21,29]}],\"urls\":[],\"user_mentions\":[],\"symbols\":[],\"media\":[{\"id\":763887130565287937,\"id_str\":\"763887130565287937\",\"indices\":[30,53],\"media_url\":\"http:\\/\\/pbs.twimg.com\\/media\\/CpnfTEnWIAERJmq.jpg\",\"media_url_https\":\"https:\\/\\/pbs.twimg.com\\/media\\/CpnfTEnWIAERJmq.jpg\",\"url\":\"https:\\/\\/t.co\\/au5Gk5k4Wd\",\"display_url\":\"pic.twitter.com\\/au5Gk5k4Wd\",\"expanded_url\":\"http:\\/\\/twitter.com\\/EsequielManson\\/status\\/763887179798020096\\/photo\\/1\",\"type\":\"photo\",\"sizes\":{\"medium\":{\"w\":1200,\"h\":1200,\"resize\":\"fit\"},\"thumb\":{\"w\":150,\"h\":150,\"resize\":\"crop\"},\"small\":{\"w\":680,\"h\":680,\"resize\":\"fit\"},\"large\":{\"w\":2048,\"h\":2048,\"resize\":\"fit\"}}}]},\"extended_entities\":{\"media\":[{\"id\":763887130565287937,\"id_str\":\"763887130565287937\",\"indices\":[30,53],\"media_url\":\"http:\\/\\/pbs.twimg.com\\/media\\/CpnfTEnWIAERJmq.jpg\",\"media_url_https\":\"https:\\/\\/pbs.twimg.com\\/media\\/CpnfTEnWIAERJmq.jpg\",\"url\":\"https:\\/\\/t.co\\/au5Gk5k4Wd\",\"display_url\":\"pic.twitter.com\\/au5Gk5k4Wd\",\"expanded_url\":\"http:\\/\\/twitter.com\\/EsequielManson\\/status\\/763887179798020096\\/photo\\/1\",\"type\":\"photo\",\"sizes\":{\"medium\":{\"w\":1200,\"h\":1200,\"resize\":\"fit\"},\"thumb\":{\"w\":150,\"h\":150,\"resize\":\"crop\"},\"small\":{\"w\":680,\"h\":680,\"resize\":\"fit\"},\"large\":{\"w\":2048,\"h\":2048,\"resize\":\"fit\"}}}]},\"favorited\":false,\"retweeted\":false,\"possibly_sensitive\":false,\"filter_level\":\"low\",\"lang\":\"ab\",\"timestamp_ms\":\"1470959870658\"}"));