Improved e2e testing: implemented getDocumentTokens in IndexReaderUtils (#1521)
lintool committed Apr 15, 2021
1 parent 6b24bbd commit 5781c87
Showing 13 changed files with 141 additions and 60 deletions.
65 changes: 65 additions & 0 deletions src/main/java/io/anserini/index/IndexReaderUtils.java
@@ -55,6 +55,7 @@
import java.io.IOException;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
@@ -349,6 +350,70 @@ public static List<Posting> getPostingsListWithAnalyzer(IndexReader reader, Stri
return getPostingsList(reader, term, analyzer);
}

/**
* Returns the tokens of a particular document as a list, reconstructed from its stored document vector. Positions
* at which the analyzer discarded a token (e.g., stopwords) appear as {@code null} entries. Note that this method
* explicitly returns {@code null} if the document does not exist (as opposed to an empty list), so that the
* caller is forced to handle this case.
*
* @param reader index reader
* @param docid collection docid
* @return the tokens of the document as a list, or {@code null} if the document does not exist
* @throws IOException if error encountered during query
* @throws NotStoredException if the term vector is not stored or positions are not stored
*/
public static List<String> getDocumentTokens(IndexReader reader, String docid) throws IOException, NotStoredException {
int ldocid = convertDocidToLuceneDocid(reader, docid);
if (ldocid == -1) {
return null;
}
Terms terms = reader.getTermVector(ldocid, IndexArgs.CONTENTS);
if (terms == null) {
throw new NotStoredException("Document vector not stored!");
}
if (!terms.hasPositions()) {
throw new NotStoredException("Document vector not stored!");
}
TermsEnum te = terms.iterator();
if (te == null) {
throw new NotStoredException("Document vector not stored!");
}

// We first need to find out how long the document vector is so we can allocate an array for it.
// The temptation is to just call terms.getSumTotalTermFreq(), but that doesn't work: tokens discarded by the
// analyzer (e.g., stopwords) leave gaps in the positions, so the highest position can exceed the token count.
// The only sure way is to iterate through all the terms once to find the max position.
// Note that positions are zero-based.
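// For example, in the external-stopwords test in this commit, "This is head very simple text" is stored
// (after stemming, with "simple" dropped) at positions {0: "thi", 1: "is", 2: "head", 3: "veri", 5: "text"},
// so the array must have length maxPos + 1 = 6 and position 4 is left null.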
PostingsEnum postingsEnum = null;
int maxPos = 0;
while ((te.next()) != null) {
postingsEnum = te.postings(postingsEnum);
postingsEnum.nextDoc();

for (int j=0; j<postingsEnum.freq(); j++) {
int pos = postingsEnum.nextPosition();
if (pos > maxPos) {
maxPos = pos;
}
}
}

// We now know how long to make the array.
String[] tokens = new String[maxPos + 1];

// Go through the terms again, this time to actually build the list of tokens.
te = reader.getTermVector(ldocid, IndexArgs.CONTENTS).iterator();
while ((te.next()) != null) {
postingsEnum = te.postings(postingsEnum);
postingsEnum.nextDoc();

for (int j=0; j<postingsEnum.freq(); j++) {
int pos = postingsEnum.nextPosition();
tokens[pos] = te.term().utf8ToString();
}
}

return Arrays.asList(tokens);
}

/**
* Returns the document vector for a particular document as a map of terms to term frequencies. Note that this
* method explicitly returns {@code null} if the document does not exist (as opposed to an empty map), so that the
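For context, a minimal sketch (not part of the commit) of how the new helper might be called from client code. It assumes an index built with stored document vectors (e.g., via -storeDocvectors); the index path and docid are placeholders, and the import location of NotStoredException is assumed.

import io.anserini.index.IndexReaderUtils;
import io.anserini.index.NotStoredException;  // assumed location
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;

import java.io.IOException;
import java.nio.file.Paths;
import java.util.List;

public class DocumentTokensExample {
  public static void main(String[] args) throws IOException {
    // Placeholder path to an Anserini index that stores document vectors with positions.
    IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("indexes/sample-index")));
    try {
      List<String> tokens = IndexReaderUtils.getDocumentTokens(reader, "TREC_DOC_1");
      if (tokens == null) {
        System.out.println("No such document.");
      } else {
        // Positions removed by the analyzer (e.g., stopwords) show up as null entries.
        System.out.println(tokens);
      }
    } catch (NotStoredException e) {
      // Thrown when the term vector (or its positions) was not stored.
      e.printStackTrace();
    } finally {
      reader.close();
    }
  }
}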
@@ -37,7 +37,7 @@ protected IndexArgs getIndexArgs() {
@Override
protected void setCheckIndexGroundTruth() {
docCount = 3;
documents.put("E17-1003", Map.of(
referenceDocs.put("E17-1003", Map.of(
"contents",
"Exploring Different Dimensions of Attention for Uncertainty Detection Neural networks with attention " +
"have proven effective for many natural language processing tasks. In this paper, we develop attention " +
@@ -58,12 +58,12 @@
"Our novel architectures set the new state of the art on a Wikipedia benchmark dataset and perform " +
"similar to the state-of-the-art model on a biomedical benchmark which uses a large set of linguistic " +
"features."));
documents.put("C00-1003", Map.of(
referenceDocs.put("C00-1003", Map.of(
"contents",
"Selectional Restrictions in HPSG ",
"raw",
"Selectional Restrictions in HPSG "));
documents.put("C00-1007", Map.of(
referenceDocs.put("C00-1007", Map.of(
"contents",
"Exploiting a Probabilistic Hierarchical Model for Generation ",
"raw",
6 changes: 3 additions & 3 deletions src/test/java/io/anserini/integration/BibtexEndToEndTest.java
@@ -37,13 +37,13 @@ protected IndexArgs getIndexArgs() {
@Override
protected void setCheckIndexGroundTruth() {
docCount = 3;
documents.put("article-id", Map.of(
referenceDocs.put("article-id", Map.of(
"contents", "this is the title. ",
"raw", "this is the title. "));
documents.put("inproceedings-id", Map.of(
referenceDocs.put("inproceedings-id", Map.of(
"contents", "this is the title. this is the abstract",
"raw", "this is the title. this is the abstract"));
documents.put("proceedings-id", Map.of(
referenceDocs.put("proceedings-id", Map.of(
"contents", "this is the title. ",
"raw", "this is the title. "));

6 changes: 2 additions & 4 deletions src/test/java/io/anserini/integration/C4EndToEndTest.java
@@ -16,10 +16,8 @@

package io.anserini.integration;

import io.anserini.collection.BibtexCollection;
import io.anserini.collection.C4Collection;
import io.anserini.index.IndexArgs;
import io.anserini.index.generator.BibtexGenerator;
import io.anserini.index.generator.C4Generator;

import java.util.Map;
@@ -41,14 +39,14 @@ protected IndexArgs getIndexArgs() {
@Override
protected void setCheckIndexGroundTruth() {
docCount = 2;
documents.put("c4-0001-000000", Map.of(
referenceDocs.put("c4-0001-000000", Map.of(
"contents", "test text",
"raw", "{\n" +
" \"text\" : \"test text\",\n" +
" \"timestamp\" : \"2019-04-23T08:26:47Z\",\n" +
" \"url\" : \"http://www.test.com\"\n" +
"}"));
documents.put("c4-0001-000001", Map.of(
referenceDocs.put("c4-0001-000001", Map.of(
"contents", "test text2",
"raw", "{\n" +
" \"text\" : \"test text2\",\n" +
6 changes: 3 additions & 3 deletions src/test/java/io/anserini/integration/CoreEndToEndTest.java
@@ -37,13 +37,13 @@ protected IndexArgs getIndexArgs() {
@Override
protected void setCheckIndexGroundTruth() {
docCount = 3;
documents.put("fullCoreDoc", Map.of(
referenceDocs.put("fullCoreDoc", Map.of(
"contents", "Full CORE doc ",
"raw", "Full CORE doc "));
documents.put("coreDoc1", Map.of(
referenceDocs.put("coreDoc1", Map.of(
"contents", "this is the title 1 this is the abstract 1",
"raw", "this is the title 1 this is the abstract 1"));
documents.put("doi2", Map.of(
referenceDocs.put("doi2", Map.of(
"contents", "this is the title 2 this is the abstract 2",
"raw", "this is the title 2 this is the abstract 2"));

Expand Down
24 changes: 9 additions & 15 deletions src/test/java/io/anserini/integration/EndToEndTest.java
@@ -43,11 +43,11 @@
import java.io.PrintStream;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Iterator;

// This automatically tests indexing, retrieval, and evaluation from end to end.
// Subclasses inherit from it and specialize it to different collections.
@@ -63,8 +63,8 @@ public abstract class EndToEndTest extends LuceneTestCase {
protected String topicFile;
protected String searchOutputPrefix = "e2eTestSearch";
protected Map<String, String[]> referenceRunOutput = new HashMap<>();
protected Map<String, Map<String, String>> documents = new HashMap<>();
protected Map<String, Map<String, Map<String, Long>>> tokens = new HashMap<>();
protected Map<String, Map<String, String>> referenceDocs = new HashMap<>();
protected Map<String, Map<String, List<String>>> referenceDocTokens = new HashMap<>();
protected Map<String, List<String>> queryTokens = new HashMap<>();

// These are the sources of truth
@@ -198,20 +198,14 @@ public void checkIndex() throws IOException {

for (int i=0; i<reader.maxDoc(); i++) {
String collectionDocid = IndexReaderUtils.convertLuceneDocidToDocid(reader, i);
assertEquals(documents.get(collectionDocid).get("raw"),
IndexReaderUtils.documentRaw(reader, collectionDocid));
assertEquals(documents.get(collectionDocid).get("contents"),
IndexReaderUtils.documentContents(reader, collectionDocid));
assertEquals(referenceDocs.get(collectionDocid).get("raw"), IndexReaderUtils.documentRaw(reader, collectionDocid));
assertEquals(referenceDocs.get(collectionDocid).get("contents"), IndexReaderUtils.documentContents(reader, collectionDocid));

// check list of tokens by calling document vector
if(!tokens.isEmpty()){
if (!referenceDocTokens.isEmpty()){
try {
Map<String, Long> actualToken = IndexReaderUtils.getDocumentVector(reader, collectionDocid);
Iterator it = actualToken.entrySet().iterator();
while (it.hasNext()) {
Map.Entry pair = (Map.Entry)it.next();
assertEquals(tokens.get(collectionDocid).get("contents").get(pair.getKey()), pair.getValue());
it.remove();
}
List<String> docTokens = IndexReaderUtils.getDocumentTokens(reader, collectionDocid);
assertEquals(referenceDocTokens.get(collectionDocid).get("contents"), docTokens);
} catch (NotStoredException e) {
e.printStackTrace();
}
@@ -45,14 +45,14 @@ protected IndexArgs getIndexArgs() {
@Override
protected void setCheckIndexGroundTruth() {
docCount = 3;
documents.put("TREC_DOC_1", Map.of(
referenceDocs.put("TREC_DOC_1", Map.of(
"contents", "This is head very simple text",
"raw", "<HEAD>This is head</HEAD>\n" +
"<TEXT>\n" +
"very simple\n" +
"text\n" +
"</TEXT>"));
documents.put("WSJ_1", Map.of(
referenceDocs.put("WSJ_1", Map.of(
"contents", "head text 01/30/03 content",
"raw", "<HL>\n" +
"head text\n" +
@@ -65,7 +65,7 @@ protected void setCheckIndexGroundTruth() {
"</LP>\n" +
"<TEXT>\n" +
"</TEXT>"));
documents.put("DOC222", Map.of(
referenceDocs.put("DOC222", Map.of(
"contents", "HEAD simple enough text text text",
"raw", "<HEAD>HEAD</HEAD>\n" +
"<TEXT>\n" +
@@ -15,15 +15,11 @@
*/
package io.anserini.integration;

import io.anserini.collection.DocumentCollection;
import io.anserini.collection.JsonCollection;
import io.anserini.collection.TrecCollection;
import io.anserini.index.IndexArgs;
import io.anserini.index.IndexCollection;
import io.anserini.index.generator.DefaultLuceneDocumentGenerator;
import io.anserini.search.SearchArgs;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
@@ -45,34 +41,30 @@ IndexArgs getIndexArgs() {
@Override
protected void setCheckIndexGroundTruth() {
docCount = 2;
documents.put("2000000", Map.of(
referenceDocs.put("2000000", Map.of(
"contents", "this was ##a simple pretokenized test",
"raw","{\n" +
" \"id\" : \"2000000\",\n" +
" \"contents\" : \"this was ##a simple pretokenized test\"\n" +
"}"));
documents.put("2000001", Map.of(
referenceDocs.put("2000001", Map.of(
"contents", "some time extra ##vert ##ing and some time intro ##vert ##ing",
"raw","{\n" +
" \"id\" : \"2000001\",\n" +
" \"contents\" : \"some time extra ##vert ##ing and some time intro ##vert ##ing\"\n" +
"}"
));
tokens.put("2000000", Map.of(
"contents", Map.of(
"this", 1L, "was", 1L, "##a", 1L, "simple", 1L, "pretokenized", 1L, "test", 1L)));
tokens.put("2000001",Map.of(
"contents", Map.of(
"some", 2L, "time", 2L, "extra", 1L, "##vert", 2L, "##ing", 2L, "and", 1L, "intro", 1L)));
referenceDocTokens.put("2000000", Map.of(
"contents", List.of("this", "was", "##a", "simple", "pretokenized", "test")));
referenceDocTokens.put("2000001", Map.of(
"contents", List.of("some", "time", "extra", "##vert", "##ing", "and", "some", "time", "intro", "##vert", "##ing")));

fieldNormStatusTotalFields = 1;
// The whitespace analyzer keeps everything, including docids:
// this was ##a simple pretokenized test some time extra ##vert ##ing and intro 2000000 2000001
termIndexStatusTermCount = 15;
termIndexStatusTotFreq = 15;
storedFieldStatusTotalDocCounts = 2;
termIndexStatusTotPos = 17 + storedFieldStatusTotalDocCounts;
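// 17 = 6 token positions in doc 2000000 + 11 in doc 2000001; the docid field presumably adds one more
// position per document, hence the + storedFieldStatusTotalDocCounts.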
storedFieldStatusTotFields = 6; // 1 docs * (1 id + 1 contents + 1 raw) *2
storedFieldStatusTotFields = 6;
}

@Override
src/test/java/io/anserini/integration/TrecEndToEndExternalStopwordsTest.java
@@ -3,6 +3,8 @@
import io.anserini.collection.TrecCollection;
import io.anserini.index.IndexArgs;

import java.util.Arrays;
import java.util.List;
import java.util.Map;

public class TrecEndToEndExternalStopwordsTest extends EndToEndTest {
@@ -20,14 +22,14 @@ protected IndexArgs getIndexArgs() {
@Override
protected void setCheckIndexGroundTruth() {
docCount = 3;
documents.put("TREC_DOC_1", Map.of(
referenceDocs.put("TREC_DOC_1", Map.of(
"contents", "This is head very simple text",
"raw", "<HEAD>This is head</HEAD>\n" +
"<TEXT>\n" +
"very simple\n" +
"text\n" +
"</TEXT>"));
documents.put("WSJ_1", Map.of(
referenceDocs.put("WSJ_1", Map.of(
"contents", "head text 01/30/03 content",
"raw", "<HL>\n" +
"head text\n" +
@@ -40,7 +42,7 @@ protected void setCheckIndexGroundTruth() {
"</LP>\n" +
"<TEXT>\n" +
"</TEXT>"));
documents.put("DOC222", Map.of(
referenceDocs.put("DOC222", Map.of(
"contents", "HEAD simple enough text text text",
"raw", "<HEAD>HEAD</HEAD>\n" +
"<TEXT>\n" +
@@ -51,6 +53,13 @@
"text\n" +
"</TEXT>"));

referenceDocTokens.put("TREC_DOC_1", Map.of(
"contents", Arrays.asList(new String[]{"thi", "is", "head", "veri", null, "text"})));
referenceDocTokens.put("WSJ_1", Map.of(
"contents", List.of("head", "text", "01", "30", "03", "content")));
referenceDocTokens.put("DOC222", Map.of(
"contents", Arrays.asList(new String[]{"head", null, null, "text", "text", "text"})));

// Terms per document:
// d1: TREC_DOC_1 this is head very simple text
// d2: DOC222 head simple enough text
15 changes: 12 additions & 3 deletions src/test/java/io/anserini/integration/TrecEndToEndPassageTest.java
@@ -20,6 +20,8 @@
import io.anserini.index.IndexArgs;
import io.anserini.search.SearchArgs;

import java.util.Arrays;
import java.util.List;
import java.util.Map;

public class TrecEndToEndPassageTest extends EndToEndTest {
@@ -36,14 +38,14 @@ protected IndexArgs getIndexArgs() {
@Override
protected void setCheckIndexGroundTruth() {
docCount = 3;
documents.put("TREC_DOC_1.00001", Map.of(
referenceDocs.put("TREC_DOC_1.00001", Map.of(
"contents", "This is head very simple text",
"raw", "<HEAD>This is head</HEAD>\n" +
"<TEXT>\n" +
"very simple\n" +
"text\n" +
"</TEXT>"));
documents.put("WSJ_1", Map.of(
referenceDocs.put("WSJ_1", Map.of(
"contents", "head text 01/30/03 content",
"raw", "<HL>\n" +
"head text\n" +
@@ -56,7 +58,7 @@ protected void setCheckIndexGroundTruth() {
"</LP>\n" +
"<TEXT>\n" +
"</TEXT>"));
documents.put("TREC_DOC_1.00002", Map.of(
referenceDocs.put("TREC_DOC_1.00002", Map.of(
"contents", "HEAD simple enough text text text",
"raw", "<HEAD>HEAD</HEAD>\n" +
"<TEXT>\n" +
@@ -67,6 +69,13 @@
"text\n" +
"</TEXT>"));

referenceDocTokens.put("TREC_DOC_1.00001", Map.of(
"contents", Arrays.asList(new String[]{null, null, "head", "veri", "simpl", "text"})));
referenceDocTokens.put("WSJ_1", Map.of(
"contents", List.of("head", "text", "01", "30", "03", "content")));
referenceDocTokens.put("TREC_DOC_1.00002", Map.of(
"contents", List.of("head", "simpl", "enough", "text", "text", "text")));

fieldNormStatusTotalFields = 1; // text
termIndexStatusTermCount = 12; // Note that standard analyzer ignores stopwords; includes docids.
termIndexStatusTotFreq = 17;