Improved e2e testing: implemented getDocumentTokens in IndexReaderUtils (#1521)
lintool committed Apr 15, 2021
1 parent 6b24bbd commit 5781c87
Showing 13 changed files with 141 additions and 60 deletions.
65 changes: 65 additions & 0 deletions src/main/java/io/anserini/index/IndexReaderUtils.java
@@ -55,6 +55,7 @@
import java.io.IOException;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
@@ -349,6 +350,70 @@ public static List<Posting> getPostingsListWithAnalyzer(IndexReader reader, Stri
return getPostingsList(reader, term, analyzer);
}

/**
* Returns the tokens of a particular document as a list, reconstructed from its stored document vector. Positions
* at which the analyzer discarded a token (e.g., stopwords) appear as {@code null} entries. Note that this method
* explicitly returns {@code null} if the document does not exist (as opposed to an empty list), so that the
* caller is forced to handle this case.
*
* @param reader index reader
* @param docid collection docid
* @return the tokens of the document as a list, or {@code null} if the document does not exist
* @throws IOException if error encountered during query
* @throws NotStoredException if the term vector is not stored or positions are not stored
*/
public static List<String> getDocumentTokens(IndexReader reader, String docid) throws IOException, NotStoredException {
int ldocid = convertDocidToLuceneDocid(reader, docid);
if (ldocid == -1) {
return null;
}
Terms terms = reader.getTermVector(ldocid, IndexArgs.CONTENTS);
if (terms == null) {
throw new NotStoredException("Document vector not stored!");
}
if (!terms.hasPositions()) {
throw new NotStoredException("Document vector not stored!");
}
TermsEnum te = terms.iterator();
if (te == null) {
throw new NotStoredException("Document vector not stored!");
}

// We first need to find out how long the document vector is so we can allocate an array for it.
// The temptation is to just call terms.getSumTotalTermFreq(), but that doesn't work: tokens discarded by the
// analyzer (e.g., stopwords) leave gaps in the positions, so the highest position can exceed the token count.
// The only sure way is to iterate through all the terms once to find the max position.
// Note that positions are zero-based.
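// For example, in the external-stopwords test in this commit, "This is head very simple text" is stored
// (after stemming, with "simple" dropped) at positions {0: "thi", 1: "is", 2: "head", 3: "veri", 5: "text"},
// so the array must have length maxPos + 1 = 6 and position 4 is left null.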
PostingsEnum postingsEnum = null;
int maxPos = 0;
while ((te.next()) != null) {
postingsEnum = te.postings(postingsEnum);
postingsEnum.nextDoc();

for (int j=0; j<postingsEnum.freq(); j++) {
int pos = postingsEnum.nextPosition();
if (pos > maxPos) {
maxPos = pos;
}
}
}

// We now know how long to make the array.
String[] tokens = new String[maxPos + 1];

// Go through the terms again, this time to actually build the list of tokens.
te = reader.getTermVector(ldocid, IndexArgs.CONTENTS).iterator();
while ((te.next()) != null) {
postingsEnum = te.postings(postingsEnum);
postingsEnum.nextDoc();

for (int j=0; j<postingsEnum.freq(); j++) {
int pos = postingsEnum.nextPosition();
tokens[pos] = te.term().utf8ToString();
}
}

return Arrays.asList(tokens);
}

/**
* Returns the document vector for a particular document as a map of terms to term frequencies. Note that this
* method explicitly returns {@code null} if the document does not exist (as opposed to an empty map), so that the
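For context, a minimal sketch (not part of the commit) of how the new helper might be called from client code. It assumes an index built with stored document vectors (e.g., via -storeDocvectors); the index path and docid are placeholders, and the import location of NotStoredException is assumed.

import io.anserini.index.IndexReaderUtils;
import io.anserini.index.NotStoredException;  // assumed location
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;

import java.io.IOException;
import java.nio.file.Paths;
import java.util.List;

public class DocumentTokensExample {
  public static void main(String[] args) throws IOException {
    // Placeholder path to an Anserini index that stores document vectors with positions.
    IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("indexes/sample-index")));
    try {
      List<String> tokens = IndexReaderUtils.getDocumentTokens(reader, "TREC_DOC_1");
      if (tokens == null) {
        System.out.println("No such document.");
      } else {
        // Positions removed by the analyzer (e.g., stopwords) show up as null entries.
        System.out.println(tokens);
      }
    } catch (NotStoredException e) {
      // Thrown when the term vector (or its positions) was not stored.
      e.printStackTrace();
    } finally {
      reader.close();
    }
  }
}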
@@ -37,7 +37,7 @@ protected IndexArgs getIndexArgs() {
@Override
protected void setCheckIndexGroundTruth() {
docCount = 3;
documents.put("E17-1003", Map.of(
referenceDocs.put("E17-1003", Map.of(
"contents",
"Exploring Different Dimensions of Attention for Uncertainty Detection Neural networks with attention " +
"have proven effective for many natural language processing tasks. In this paper, we develop attention " +
@@ -58,12 +58,12 @@
"Our novel architectures set the new state of the art on a Wikipedia benchmark dataset and perform " +
"similar to the state-of-the-art model on a biomedical benchmark which uses a large set of linguistic " +
"features."));
documents.put("C00-1003", Map.of(
referenceDocs.put("C00-1003", Map.of(
"contents",
"Selectional Restrictions in HPSG ",
"raw",
"Selectional Restrictions in HPSG "));
documents.put("C00-1007", Map.of(
referenceDocs.put("C00-1007", Map.of(
"contents",
"Exploiting a Probabilistic Hierarchical Model for Generation ",
"raw",
6 changes: 3 additions & 3 deletions src/test/java/io/anserini/integration/BibtexEndToEndTest.java
@@ -37,13 +37,13 @@ protected IndexArgs getIndexArgs() {
@Override
protected void setCheckIndexGroundTruth() {
docCount = 3;
documents.put("article-id", Map.of(
referenceDocs.put("article-id", Map.of(
"contents", "this is the title. ",
"raw", "this is the title. "));
documents.put("inproceedings-id", Map.of(
referenceDocs.put("inproceedings-id", Map.of(
"contents", "this is the title. this is the abstract",
"raw", "this is the title. this is the abstract"));
documents.put("proceedings-id", Map.of(
referenceDocs.put("proceedings-id", Map.of(
"contents", "this is the title. ",
"raw", "this is the title. "));

6 changes: 2 additions & 4 deletions src/test/java/io/anserini/integration/C4EndToEndTest.java
@@ -16,10 +16,8 @@

package io.anserini.integration;

import io.anserini.collection.BibtexCollection;
import io.anserini.collection.C4Collection;
import io.anserini.index.IndexArgs;
import io.anserini.index.generator.BibtexGenerator;
import io.anserini.index.generator.C4Generator;

import java.util.Map;
@@ -41,14 +39,14 @@ protected IndexArgs getIndexArgs() {
@Override
protected void setCheckIndexGroundTruth() {
docCount = 2;
documents.put("c4-0001-000000", Map.of(
referenceDocs.put("c4-0001-000000", Map.of(
"contents", "test text",
"raw", "{\n" +
" \"text\" : \"test text\",\n" +
" \"timestamp\" : \"2019-04-23T08:26:47Z\",\n" +
" \"url\" : \"http://www.test.com\"\n" +
"}"));
documents.put("c4-0001-000001", Map.of(
referenceDocs.put("c4-0001-000001", Map.of(
"contents", "test text2",
"raw", "{\n" +
" \"text\" : \"test text2\",\n" +
6 changes: 3 additions & 3 deletions src/test/java/io/anserini/integration/CoreEndToEndTest.java
@@ -37,13 +37,13 @@ protected IndexArgs getIndexArgs() {
@Override
protected void setCheckIndexGroundTruth() {
docCount = 3;
documents.put("fullCoreDoc", Map.of(
referenceDocs.put("fullCoreDoc", Map.of(
"contents", "Full CORE doc ",
"raw", "Full CORE doc "));
documents.put("coreDoc1", Map.of(
referenceDocs.put("coreDoc1", Map.of(
"contents", "this is the title 1 this is the abstract 1",
"raw", "this is the title 1 this is the abstract 1"));
documents.put("doi2", Map.of(
referenceDocs.put("doi2", Map.of(
"contents", "this is the title 2 this is the abstract 2",
"raw", "this is the title 2 this is the abstract 2"));

Expand Down
24 changes: 9 additions & 15 deletions src/test/java/io/anserini/integration/EndToEndTest.java
@@ -43,11 +43,11 @@
import java.io.PrintStream;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Iterator;

// This automatically tests indexing, retrieval, and evaluation from end to end.
// Subclasses inherit from it and specialize it to different collections.
@@ -63,8 +63,8 @@ public abstract class EndToEndTest extends LuceneTestCase {
protected String topicFile;
protected String searchOutputPrefix = "e2eTestSearch";
protected Map<String, String[]> referenceRunOutput = new HashMap<>();
protected Map<String, Map<String, String>> documents = new HashMap<>();
protected Map<String, Map<String, Map<String, Long>>> tokens = new HashMap<>();
protected Map<String, Map<String, String>> referenceDocs = new HashMap<>();
protected Map<String, Map<String, List<String>>> referenceDocTokens = new HashMap<>();
protected Map<String, List<String>> queryTokens = new HashMap<>();

// These are the sources of truth
@@ -198,20 +198,14 @@ public void checkIndex() throws IOException {

for (int i=0; i<reader.maxDoc(); i++) {
String collectionDocid = IndexReaderUtils.convertLuceneDocidToDocid(reader, i);
assertEquals(documents.get(collectionDocid).get("raw"),
IndexReaderUtils.documentRaw(reader, collectionDocid));
assertEquals(documents.get(collectionDocid).get("contents"),
IndexReaderUtils.documentContents(reader, collectionDocid));
assertEquals(referenceDocs.get(collectionDocid).get("raw"), IndexReaderUtils.documentRaw(reader, collectionDocid));
assertEquals(referenceDocs.get(collectionDocid).get("contents"), IndexReaderUtils.documentContents(reader, collectionDocid));

// check list of tokens by calling document vector
if(!tokens.isEmpty()){
if (!referenceDocTokens.isEmpty()){
try {
Map<String, Long> actualToken = IndexReaderUtils.getDocumentVector(reader, collectionDocid);
Iterator it = actualToken.entrySet().iterator();
while (it.hasNext()) {
Map.Entry pair = (Map.Entry)it.next();
assertEquals(tokens.get(collectionDocid).get("contents").get(pair.getKey()), pair.getValue());
it.remove();
}
List<String> docTokens = IndexReaderUtils.getDocumentTokens(reader, collectionDocid);
assertEquals(referenceDocTokens.get(collectionDocid).get("contents"), docTokens);
} catch (NotStoredException e) {
e.printStackTrace();
}
@@ -45,14 +45,14 @@ protected IndexArgs getIndexArgs() {
@Override
protected void setCheckIndexGroundTruth() {
docCount = 3;
documents.put("TREC_DOC_1", Map.of(
referenceDocs.put("TREC_DOC_1", Map.of(
"contents", "This is head very simple text",
"raw", "<HEAD>This is head</HEAD>\n" +
"<TEXT>\n" +
"very simple\n" +
"text\n" +
"</TEXT>"));
documents.put("WSJ_1", Map.of(
referenceDocs.put("WSJ_1", Map.of(
"contents", "head text 01/30/03 content",
"raw", "<HL>\n" +
"head text\n" +
@@ -65,7 +65,7 @@ protected void setCheckIndexGroundTruth() {
"</LP>\n" +
"<TEXT>\n" +
"</TEXT>"));
documents.put("DOC222", Map.of(
referenceDocs.put("DOC222", Map.of(
"contents", "HEAD simple enough text text text",
"raw", "<HEAD>HEAD</HEAD>\n" +
"<TEXT>\n" +
@@ -15,15 +15,11 @@
*/
package io.anserini.integration;

import io.anserini.collection.DocumentCollection;
import io.anserini.collection.JsonCollection;
import io.anserini.collection.TrecCollection;
import io.anserini.index.IndexArgs;
import io.anserini.index.IndexCollection;
import io.anserini.index.generator.DefaultLuceneDocumentGenerator;
import io.anserini.search.SearchArgs;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
@@ -45,34 +41,30 @@ IndexArgs getIndexArgs() {
@Override
protected void setCheckIndexGroundTruth() {
docCount = 2;
documents.put("2000000", Map.of(
referenceDocs.put("2000000", Map.of(
"contents", "this was ##a simple pretokenized test",
"raw","{\n" +
" \"id\" : \"2000000\",\n" +
" \"contents\" : \"this was ##a simple pretokenized test\"\n" +
"}"));
documents.put("2000001", Map.of(
referenceDocs.put("2000001", Map.of(
"contents", "some time extra ##vert ##ing and some time intro ##vert ##ing",
"raw","{\n" +
" \"id\" : \"2000001\",\n" +
" \"contents\" : \"some time extra ##vert ##ing and some time intro ##vert ##ing\"\n" +
"}"
));
tokens.put("2000000", Map.of(
"contents", Map.of(
"this", 1L, "was", 1L, "##a", 1L, "simple", 1L, "pretokenized", 1L, "test", 1L)));
tokens.put("2000001",Map.of(
"contents", Map.of(
"some", 2L, "time", 2L, "extra", 1L, "##vert", 2L, "##ing", 2L, "and", 1L, "intro", 1L)));
referenceDocTokens.put("2000000", Map.of(
"contents", List.of("this", "was", "##a", "simple", "pretokenized", "test")));
referenceDocTokens.put("2000001", Map.of(
"contents", List.of("some", "time", "extra", "##vert", "##ing", "and", "some", "time", "intro", "##vert", "##ing")));

fieldNormStatusTotalFields = 1;
// The whitespace analyzer keeps everything, including docids:
// this was ##a simple pretokenized test some time extra ##vert ##ing and intro 2000000 2000001
termIndexStatusTermCount = 15;
termIndexStatusTotFreq = 15;
storedFieldStatusTotalDocCounts = 2;
termIndexStatusTotPos = 17 + storedFieldStatusTotalDocCounts;
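// 17 = 6 token positions in doc 2000000 + 11 in doc 2000001; the docid field presumably adds one more
// position per document, hence the + storedFieldStatusTotalDocCounts.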
storedFieldStatusTotFields = 6; // 1 docs * (1 id + 1 contents + 1 raw) *2
storedFieldStatusTotFields = 6;
}

@Override
src/test/java/io/anserini/integration/TrecEndToEndExternalStopwordsTest.java
@@ -3,6 +3,8 @@
import io.anserini.collection.TrecCollection;
import io.anserini.index.IndexArgs;

import java.util.Arrays;
import java.util.List;
import java.util.Map;

public class TrecEndToEndExternalStopwordsTest extends EndToEndTest {
@@ -20,14 +22,14 @@ protected IndexArgs getIndexArgs() {
@Override
protected void setCheckIndexGroundTruth() {
docCount = 3;
documents.put("TREC_DOC_1", Map.of(
referenceDocs.put("TREC_DOC_1", Map.of(
"contents", "This is head very simple text",
"raw", "<HEAD>This is head</HEAD>\n" +
"<TEXT>\n" +
"very simple\n" +
"text\n" +
"</TEXT>"));
documents.put("WSJ_1", Map.of(
referenceDocs.put("WSJ_1", Map.of(
"contents", "head text 01/30/03 content",
"raw", "<HL>\n" +
"head text\n" +
@@ -40,7 +42,7 @@ protected void setCheckIndexGroundTruth() {
"</LP>\n" +
"<TEXT>\n" +
"</TEXT>"));
documents.put("DOC222", Map.of(
referenceDocs.put("DOC222", Map.of(
"contents", "HEAD simple enough text text text",
"raw", "<HEAD>HEAD</HEAD>\n" +
"<TEXT>\n" +
@@ -51,6 +53,13 @@
"text\n" +
"</TEXT>"));

referenceDocTokens.put("TREC_DOC_1", Map.of(
"contents", Arrays.asList(new String[]{"thi", "is", "head", "veri", null, "text"})));
referenceDocTokens.put("WSJ_1", Map.of(
"contents", List.of("head", "text", "01", "30", "03", "content")));
referenceDocTokens.put("DOC222", Map.of(
"contents", Arrays.asList(new String[]{"head", null, null, "text", "text", "text"})));

// Terms per document:
// d1: TREC_DOC_1 this is head very simple text
// d2: DOC222 head simple enough text
15 changes: 12 additions & 3 deletions src/test/java/io/anserini/integration/TrecEndToEndPassageTest.java
@@ -20,6 +20,8 @@
import io.anserini.index.IndexArgs;
import io.anserini.search.SearchArgs;

import java.util.Arrays;
import java.util.List;
import java.util.Map;

public class TrecEndToEndPassageTest extends EndToEndTest {
@@ -36,14 +38,14 @@ protected IndexArgs getIndexArgs() {
@Override
protected void setCheckIndexGroundTruth() {
docCount = 3;
documents.put("TREC_DOC_1.00001", Map.of(
referenceDocs.put("TREC_DOC_1.00001", Map.of(
"contents", "This is head very simple text",
"raw", "<HEAD>This is head</HEAD>\n" +
"<TEXT>\n" +
"very simple\n" +
"text\n" +
"</TEXT>"));
documents.put("WSJ_1", Map.of(
referenceDocs.put("WSJ_1", Map.of(
"contents", "head text 01/30/03 content",
"raw", "<HL>\n" +
"head text\n" +
@@ -56,7 +58,7 @@ protected void setCheckIndexGroundTruth() {
"</LP>\n" +
"<TEXT>\n" +
"</TEXT>"));
documents.put("TREC_DOC_1.00002", Map.of(
referenceDocs.put("TREC_DOC_1.00002", Map.of(
"contents", "HEAD simple enough text text text",
"raw", "<HEAD>HEAD</HEAD>\n" +
"<TEXT>\n" +
@@ -67,6 +69,13 @@
"text\n" +
"</TEXT>"));

referenceDocTokens.put("TREC_DOC_1.00001", Map.of(
"contents", Arrays.asList(new String[]{null, null, "head", "veri", "simpl", "text"})));
referenceDocTokens.put("WSJ_1", Map.of(
"contents", List.of("head", "text", "01", "30", "03", "content")));
referenceDocTokens.put("TREC_DOC_1.00002", Map.of(
"contents", List.of("head", "simpl", "enough", "text", "text", "text")));

fieldNormStatusTotalFields = 1; // text
termIndexStatusTermCount = 12; // Note that standard analyzer ignores stopwords; includes docids.
termIndexStatusTotFreq = 17;