add pretokenized option in index and search collection (#1514)
add pretokenized option in IndexCollection and SearchCollection with test cases
stephaniewhoo committed Apr 15, 2021
1 parent a708652 commit 6b24bbd
Showing 8 changed files with 149 additions and 4 deletions.
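For orientation, here is a minimal sketch of an index-then-search round trip that exercises the new flag, driven programmatically the same way the test harness below drives IndexCollection.main and SearchCollection.main. All paths are placeholders, and every flag except -pretokenized is a pre-existing Anserini option:

import io.anserini.index.IndexCollection;
import io.anserini.search.SearchCollection;

public class PretokenizedRoundTrip {
  public static void main(String[] args) throws Exception {
    // Index a pre-tokenized JSON collection; -pretokenized swaps in a
    // WhitespaceAnalyzer (see the IndexCollection diff below).
    IndexCollection.main(new String[] {
        "-collection", "JsonCollection",
        "-generator", "DefaultLuceneDocumentGenerator",
        "-input", "path/to/pretokenized-docs",   // placeholder
        "-index", "indexes/pretokenized",        // placeholder
        "-threads", "1",
        "-pretokenized"
    });

    // Search with the same flag so query strings are whitespace-tokenized
    // too (see the SearchCollection diff below).
    SearchCollection.main(new String[] {
        "-index", "indexes/pretokenized",        // placeholder
        "-topics", "path/to/topics.tsv",         // placeholder
        "-topicreader", "TsvInt",
        "-output", "run.pretokenized.txt",       // placeholder
        "-bm25",
        "-pretokenized"
    });
  }
}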
4 changes: 4 additions & 0 deletions src/main/java/io/anserini/index/IndexArgs.java
@@ -122,6 +122,10 @@ public class IndexArgs {
       usage = "Analyzer language (ISO 3166 two-letter code).")
   public String language= "en";
 
+  @Option(name = "-pretokenized",
+      usage = "index pre-tokenized collections without any additional stemming, stopword processing")
+  public boolean pretokenized = false;
+
   // Tweet options
 
   @Option(name = "-tweet.keepRetweets",
3 changes: 2 additions & 1 deletion src/main/java/io/anserini/index/IndexCollection.java
@@ -646,6 +646,7 @@ public IndexCollection(IndexArgs args) throws Exception {
     LOG.info("Store document \"raw\" field? " + args.storeRaw);
     LOG.info("Optimize (merge segments)? " + args.optimize);
     LOG.info("Whitelist: " + args.whitelist);
+    LOG.info("Pretokenized?: " + args.pretokenized);
 
     if (args.solr) {
       LOG.info("Indexing into Solr...");
@@ -753,7 +754,7 @@ public Counters run() throws IOException {
       config = new IndexWriterConfig(germanAnalyzer);
     } else if (args.language.equals("es")) {
       config = new IndexWriterConfig(spanishAnalyzer);
-    } else if (args.language.equals("en_ws")) {
+    } else if (args.pretokenized) {
       config = new IndexWriterConfig(whitespaceAnalyzer);
     } else {
       config = new IndexWriterConfig(analyzer);
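The hunk above is the core of the indexing change: with -pretokenized set, the IndexWriter uses Lucene's WhitespaceAnalyzer, so pre-tokenized units such as "##vert" are indexed verbatim instead of being stemmed or stripped. A small sketch of the difference, using Anserini's AnalyzerUtils.analyze (the same helper that appears in the SearchCollection diff below); the English-analyzer output shown in the comment is indicative only, since its tokenizer drops the "##" markers:

import io.anserini.analysis.AnalyzerUtils;
import io.anserini.analysis.DefaultEnglishAnalyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;

import java.util.List;

public class AnalyzerComparison {
  public static void main(String[] args) throws Exception {
    String contents = "some time extra ##vert ##ing";  // from the sample docs in this commit

    // Whitespace tokenization only: tokens pass through unchanged.
    List<String> whitespace = AnalyzerUtils.analyze(new WhitespaceAnalyzer(), contents);
    System.out.println(whitespace);  // [some, time, extra, ##vert, ##ing]

    // Default English analysis (Porter stemming, no extra stopwords), built
    // via the fromArguments call visible in the SearchCollection diff below.
    List<String> english = AnalyzerUtils.analyze(
        DefaultEnglishAnalyzer.fromArguments("porter", false, null), contents);
    System.out.println(english);  // "##" markers stripped, e.g. [some, time, extra, vert, ing]
  }
}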
3 changes: 3 additions & 0 deletions src/main/java/io/anserini/search/SearchArgs.java
@@ -84,6 +84,9 @@ public class SearchArgs {
       usage = "Path to file with stopwords.")
   public String stopwords = null;
 
+  @Option(name = "-pretokenized", usage = "Boolean switch to accept pre tokenized jsonl.")
+  public boolean pretokenized = false;
+
   @Option(name = "-arbitraryScoreTieBreak", usage = "Break score ties arbitrarily (not recommended)")
   public boolean arbitraryScoreTieBreak = false;
 
13 changes: 10 additions & 3 deletions src/main/java/io/anserini/search/SearchCollection.java
@@ -139,7 +139,8 @@ public final class SearchCollection implements Closeable {
   private List<RerankerCascade> cascades;
   private final boolean isRerank;
   private Map<String, ScoredDocuments> qrels;
-  private Set<String> queriesWithRel;
+  private Set<String> queriesWithRel;
+  private Map<String, List<String>> queries = new HashMap<>(); // let query tokens get exposed to the test (with analyzer)
 
   private final class SearcherThread<K> extends Thread {
     final private IndexReader reader;
@@ -302,9 +303,9 @@ public SearchCollection(SearchArgs args) throws IOException {
     } else if (args.language.equals("es")) {
       analyzer = new SpanishAnalyzer();
       LOG.info("Language: es");
-    } else if (args.language.equals("en_ws")) {
+    } else if (args.pretokenized) {
       analyzer = new WhitespaceAnalyzer();
-      LOG.info("Language: en_ws");
+      LOG.info("Pretokenized");
     } else {
       // Default to English
       analyzer = DefaultEnglishAnalyzer.fromArguments(args.stemmer, args.keepstop, args.stopwords);
@@ -587,6 +588,8 @@ public <K> ScoredDocuments search(IndexSearcher searcher, K qid, String queryStr
 
     List<String> queryTokens = AnalyzerUtils.analyze(analyzer, queryString);
 
+    queries.put(qid.toString(), queryTokens);
+
     RerankerContext context = new RerankerContext<>(searcher, qid, query, null, queryString, queryTokens, null, args);
     ScoredDocuments scoredFbDocs;
     if ( isRerank && args.rf_qrels != null) {
@@ -700,6 +703,10 @@ public <K> ScoredDocuments searchTweets(IndexSearcher searcher, K qid, String qu
     return cascade.run(scoredFbDocs, context);
   }
 
+  public Map<String, List<String>> getQueries(){
+    return queries;
+  }
+
   public static void main(String[] args) throws Exception {
     SearchArgs searchArgs = new SearchArgs();
     CmdLineParser parser = new CmdLineParser(searchArgs, ParserProperties.defaults().withUsageWidth(100));
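The new getQueries() accessor exposes the per-topic analyzed query tokens, which is what the end-to-end test below asserts against. A minimal usage sketch, assuming a SearchArgs instance configured elsewhere (index, topics, topic reader, ranking model) the way the test configures it; every call here appears in this commit's diff:

import io.anserini.search.SearchArgs;
import io.anserini.search.SearchCollection;

import java.util.List;
import java.util.Map;

public class QueryTokenPeek {
  // searchArgs is assumed to be fully configured, with
  // searchArgs.pretokenized = true so a WhitespaceAnalyzer is used.
  static Map<String, List<String>> analyzedQueries(SearchArgs searchArgs) throws Exception {
    SearchCollection searcher = new SearchCollection(searchArgs);
    searcher.runTopics();  // populates the queries map as a side effect
    Map<String, List<String>> queryTokens = searcher.getQueries();
    searcher.close();
    return queryTokens;  // e.g. {"1": [##ing, ##vert]} for the sample topics in this commit
  }
}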
27 changes: 27 additions & 0 deletions src/test/java/io/anserini/integration/EndToEndTest.java
@@ -19,6 +19,7 @@
 import io.anserini.index.IndexArgs;
 import io.anserini.index.IndexCollection;
 import io.anserini.index.IndexReaderUtils;
+import io.anserini.index.NotStoredException;
 import io.anserini.search.SearchArgs;
 import io.anserini.search.SearchCollection;
 import org.apache.commons.io.FileUtils;
@@ -46,6 +47,7 @@
 import java.util.List;
 import java.util.Map;
 import java.util.Random;
+import java.util.Iterator;
 
 // This automatically tests indexing, retrieval, and evaluation from end to end.
 // Subclasses inherit and special to different collections.
@@ -62,6 +64,8 @@ public abstract class EndToEndTest extends LuceneTestCase {
   protected String searchOutputPrefix = "e2eTestSearch";
   protected Map<String, String[]> referenceRunOutput = new HashMap<>();
   protected Map<String, Map<String, String>> documents = new HashMap<>();
+  protected Map<String, Map<String, Map<String, Long>>> tokens = new HashMap<>();
+  protected Map<String, List<String>> queryTokens = new HashMap<>();
 
   // These are the sources of truth
   protected int fieldNormStatusTotalFields;
@@ -145,6 +149,10 @@ public void setUp() throws Exception {
       args.add(Integer.toString(indexArgs.shardCurrent));
     }
 
+    if (indexArgs.pretokenized){
+      args.add("-pretokenized");
+    }
+
     IndexCollection.main(args.toArray(new String[args.size()]));
   }
 
@@ -194,6 +202,20 @@ public void checkIndex() throws IOException {
           IndexReaderUtils.documentRaw(reader, collectionDocid));
       assertEquals(documents.get(collectionDocid).get("contents"),
           IndexReaderUtils.documentContents(reader, collectionDocid));
+      // check list of tokens by calling document vector
+      if(!tokens.isEmpty()){
+        try {
+          Map<String, Long> actualToken = IndexReaderUtils.getDocumentVector(reader, collectionDocid);
+          Iterator it = actualToken.entrySet().iterator();
+          while (it.hasNext()) {
+            Map.Entry pair = (Map.Entry)it.next();
+            assertEquals(tokens.get(collectionDocid).get("contents").get(pair.getKey()), pair.getValue());
+            it.remove();
+          }
+        } catch (NotStoredException e) {
+          e.printStackTrace();
+        }
+      }
     }
     reader.close();
 
@@ -265,7 +287,12 @@ public void testSearching() {
     for (Map.Entry<String, SearchArgs> entry : testQueries.entrySet()) {
       SearchCollection searcher = new SearchCollection(entry.getValue());
       searcher.runTopics();
+      Map<String, List<String>> actualQuery = searcher.getQueries();
       searcher.close();
+      //check query tokens
+      if(!queryTokens.isEmpty()){
+        assertEquals(queryTokens, actualQuery);
+      }
       checkRankingResults(entry.getKey(), entry.getValue().output);
       // Remember to cleanup run files.
       cleanup.add(new File(entry.getValue().output));
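The document-vector check above relies on IndexReaderUtils.getDocumentVector, which throws NotStoredException when the index was built without document vectors. Outside the test harness, the same inspection might look like the sketch below; the index path is a placeholder:

import io.anserini.index.IndexReaderUtils;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;

import java.nio.file.Paths;
import java.util.Map;

public class DocVectorPeek {
  public static void main(String[] args) throws Exception {
    IndexReader reader = DirectoryReader.open(
        FSDirectory.open(Paths.get("indexes/pretokenized")));  // placeholder path
    // Same helper the test uses: term -> frequency for one document.
    Map<String, Long> vector = IndexReaderUtils.getDocumentVector(reader, "2000001");
    vector.forEach((term, freq) -> System.out.println(term + " -> " + freq));
    // With -pretokenized, expect entries like "##vert" -> 2 and "##ing" -> 2.
    reader.close();
  }
}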
92 changes: 92 additions & 0 deletions src/test/java/io/anserini/integration/PretokenizedIndexEndToEndTest.java
@@ -0,0 +1,92 @@
/*
* Anserini: A Lucene toolkit for replicable information retrieval research
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.anserini.integration;

import io.anserini.collection.DocumentCollection;
import io.anserini.collection.JsonCollection;
import io.anserini.collection.TrecCollection;
import io.anserini.index.IndexArgs;
import io.anserini.index.IndexCollection;
import io.anserini.index.generator.DefaultLuceneDocumentGenerator;
import io.anserini.search.SearchArgs;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;


public class PretokenizedIndexEndToEndTest extends EndToEndTest {
  @Override
  IndexArgs getIndexArgs() {
    IndexArgs indexArgs = createDefaultIndexArgs();
    indexArgs.input = "src/test/resources/sample_docs/json/collection_tokenized";
    indexArgs.collectionClass = JsonCollection.class.getSimpleName();
    indexArgs.generatorClass = DefaultLuceneDocumentGenerator.class.getSimpleName();
    indexArgs.pretokenized = true;
    indexArgs.storeRaw = true;

    return indexArgs;
  }

  @Override
  protected void setCheckIndexGroundTruth() {
    docCount = 2;
    documents.put("2000000", Map.of(
        "contents", "this was ##a simple pretokenized test",
        "raw","{\n" +
            " \"id\" : \"2000000\",\n" +
            " \"contents\" : \"this was ##a simple pretokenized test\"\n" +
            "}"));
    documents.put("2000001", Map.of(
        "contents", "some time extra ##vert ##ing and some time intro ##vert ##ing",
        "raw","{\n" +
            " \"id\" : \"2000001\",\n" +
            " \"contents\" : \"some time extra ##vert ##ing and some time intro ##vert ##ing\"\n" +
            "}"
    ));
    tokens.put("2000000", Map.of(
        "contents", Map.of(
            "this", 1L, "was", 1L, "##a", 1L, "simple", 1L, "pretokenized", 1L, "test", 1L)));
    tokens.put("2000001",Map.of(
        "contents", Map.of(
            "some", 2L, "time", 2L, "extra", 1L, "##vert", 2L, "##ing", 2L, "and", 1L, "intro", 1L)));

    fieldNormStatusTotalFields = 1;
    // whitespace analyzer keeps everything, includes docid
    // this is ##a simple pretokenized test some time extra ##vert ##ing and intro 2000000 2000001
    termIndexStatusTermCount = 15;
    termIndexStatusTotFreq = 15;
    storedFieldStatusTotalDocCounts = 2;
    termIndexStatusTotPos = 17 + storedFieldStatusTotalDocCounts;
    storedFieldStatusTotFields = 6; // 1 docs * (1 id + 1 contents + 1 raw) *2
  }

  @Override
  protected void setSearchGroundTruth() {
    topicReader = "TsvInt";
    topicFile = "src/test/resources/sample_topics/json_topics.tsv";
    SearchArgs searchArg = createDefaultSearchArgs().bm25();
    searchArg.pretokenized = true;
    testQueries.put("bm25", searchArg);
    queryTokens.put("1", new ArrayList<>());
    queryTokens.get("1").add("##ing");
    queryTokens.get("1").add("##vert");
    referenceRunOutput.put("bm25", new String[]{
        "1 Q0 2000001 1 0.922400 Anserini"});
  }

}
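As a cross-check, the ground-truth constants follow directly from the two sample documents: their contents fields contribute 6 and 7 distinct terms with no overlap, and since the whitespace analyzer also indexes the id field, the two docids bring the unique-term count to 15. The 17 in termIndexStatusTotPos is the 6 + 11 token occurrences in the contents fields, with one id position per document added via storedFieldStatusTotalDocCounts, and storedFieldStatusTotFields = 6 is 2 documents times 3 stored fields each (id, contents, raw).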
10 changes: 10 additions & 0 deletions (new file under src/test/resources/sample_docs/json/collection_tokenized)
@@ -0,0 +1,10 @@
[
  {
    "id": "2000000",
    "contents": "this was ##a simple pretokenized test"
  },
  {
    "id": "2000001",
    "contents": "some time extra ##vert ##ing and some time intro ##vert ##ing"
  }
]
1 change: 1 addition & 0 deletions src/test/resources/sample_topics/json_topics.tsv
@@ -0,0 +1 @@
1 ##ing ##vert
