From 3aeed9180642b83b619779ec9667432acf8c57a1 Mon Sep 17 00:00:00 2001 From: lintool Date: Mon, 14 Jun 2021 20:23:19 -0400 Subject: [PATCH 1/2] Updated regression log, fixed unit tests --- docs/regressions-log.md | 10 ++++++++++ docs/regressions.md | 4 ++-- .../java/io/anserini/collection/JsonCollection.java | 2 ++ .../java/io/anserini/integration/EndToEndTest.java | 3 +-- .../anserini/integration/JsonVectorEndToEndTest.java | 8 ++++---- 5 files changed, 19 insertions(+), 8 deletions(-) diff --git a/docs/regressions-log.md b/docs/regressions-log.md index 63d33d6a48..268e3d72c9 100644 --- a/docs/regressions-log.md +++ b/docs/regressions-log.md @@ -3,6 +3,16 @@ The following change log details commits to regression tests that alter effectiveness and the addition of new regression tests. This documentation is useful for figuring why results may have changed over time. +### June 14, 2021 + ++ commit [`b58c85`](https://github.com/castorini/anserini/commit/b58c8559b4fc473e857b9ce5ca73523d8d017b41) (06/14/2021) + +Overhaul of regressions for MS MARCO {passage, doc} and DL {19, 20}: ++ MS MARCO passage + {doc2query, docTTTTTquery} ++ MS MARCO doc {per-doc, per-passage} x {doc2query, docTTTTTquery} ++ {DL19, DL20} passage + {doc2query, docTTTTTquery} ++ {DL19, DL20} doc {per-doc, per-passage} x {doc2query, docTTTTTquery} + ### April 13, 2021 + commit [`868afe`](https://github.com/castorini/anserini/commit/868afe9ec07fa477ce817d7a43dd5723cb4c8a86) (04/13/2021) diff --git a/docs/regressions.md b/docs/regressions.md index 67812b2657..f7859a1f6c 100644 --- a/docs/regressions.md +++ b/docs/regressions.md @@ -69,7 +69,7 @@ nohup python src/main/python/run_regression.py --collection dl19-doc-docTTTTTque nohup python src/main/python/run_regression.py --collection dl19-doc-docTTTTTquery-per-passage >& logs/log.dl19-doc-docTTTTTquery-per-passage & nohup python src/main/python/run_regression.py --collection dl20-passage >& logs/log.dl20-passage & -nohup python
src/main/python/run_regression.py --collection dl20-passage-docTTTTTquery >& logs/dl20-passage-docTTTTTquery & +nohup python src/main/python/run_regression.py --collection dl20-passage-docTTTTTquery >& logs/log.dl20-passage-docTTTTTquery & nohup python src/main/python/run_regression.py --collection dl20-doc >& logs/log.dl20-doc & nohup python src/main/python/run_regression.py --collection dl20-doc-per-passage >& logs/log.dl20-doc-per-passage & nohup python src/main/python/run_regression.py --collection dl20-doc-docTTTTTquery-per-doc >& logs/log.dl20-doc-docTTTTTquery-per-doc & @@ -128,7 +128,7 @@ nohup python src/main/python/run_regression.py --index --collection dl19-doc-doc nohup python src/main/python/run_regression.py --index --collection dl19-doc-docTTTTTquery-per-passage >& logs/log.dl19-doc-docTTTTTquery-per-passage & nohup python src/main/python/run_regression.py --index --collection dl20-passage >& logs/log.dl20-passage & -nohup python src/main/python/run_regression.py --index --collection dl20-passage-docTTTTTquery >& logs/dl20-passage-docTTTTTquery & +nohup python src/main/python/run_regression.py --index --collection dl20-passage-docTTTTTquery >& logs/log.dl20-passage-docTTTTTquery & nohup python src/main/python/run_regression.py --index --collection dl20-doc >& logs/log.dl20-doc & nohup python src/main/python/run_regression.py --index --collection dl20-doc-per-passage >& logs/log.dl20-doc-per-passage & nohup python src/main/python/run_regression.py --index --collection dl20-doc-docTTTTTquery-per-doc >& logs/log.dl20-doc-docTTTTTquery-per-doc & diff --git a/src/main/java/io/anserini/collection/JsonCollection.java b/src/main/java/io/anserini/collection/JsonCollection.java index e8e3625df8..3c4776fb3e 100644 --- a/src/main/java/io/anserini/collection/JsonCollection.java +++ b/src/main/java/io/anserini/collection/JsonCollection.java @@ -73,6 +73,7 @@ public JsonCollection(Path path){ this.allowedFileSuffix = new HashSet<>(Arrays.asList(".json", ".jsonl")); 
} + @SuppressWarnings("unchecked") @Override public FileSegment createFileSegment(Path p) throws IOException { return new Segment(p); @@ -99,6 +100,7 @@ public Segment(Path path) throws IOException { } } + @SuppressWarnings("unchecked") @Override public void readNext() throws NoSuchElementException { if (node == null) { diff --git a/src/test/java/io/anserini/integration/EndToEndTest.java b/src/test/java/io/anserini/integration/EndToEndTest.java index 4ce5d037a2..42cf13e154 100644 --- a/src/test/java/io/anserini/integration/EndToEndTest.java +++ b/src/test/java/io/anserini/integration/EndToEndTest.java @@ -312,8 +312,7 @@ protected void checkRankingResults(String key, String output) throws IOException int cnt = 0; String s; while ((s = br.readLine()) != null) { - //assertEquals(ref[cnt], s); - System.out.println(s); + assertEquals(ref[cnt], s); cnt++; } diff --git a/src/test/java/io/anserini/integration/JsonVectorEndToEndTest.java b/src/test/java/io/anserini/integration/JsonVectorEndToEndTest.java index ef3565911a..bc2581f7ef 100644 --- a/src/test/java/io/anserini/integration/JsonVectorEndToEndTest.java +++ b/src/test/java/io/anserini/integration/JsonVectorEndToEndTest.java @@ -68,10 +68,10 @@ protected void setSearchGroundTruth() { queryTokens.get("3").add("f4"); referenceRunOutput.put("impact", new String[]{ - "1 Q0 doc2 1 0.613600 Anserini", - "2 Q0 doc1 1 0.393100 Anserini", - "3 Q0 doc1 1 0.153100 Anserini", - "3 Q0 doc2 2 0.135500 Anserini"}); + "1 Q0 doc2 1 8.000000 Anserini", + "2 Q0 doc1 1 1.000000 Anserini", + "3 Q0 doc1 1 4.000000 Anserini", + "3 Q0 doc2 2 3.000000 Anserini"}); } } From 95ea4fccfc5c517df4edd1a17bcb6f6057464fc0 Mon Sep 17 00:00:00 2001 From: lintool Date: Mon, 14 Jun 2021 22:34:13 -0400 Subject: [PATCH 2/2] cleanup. 
--- .../anserini/collection/JsonCollection.java | 7 --- ...JsonVectorCollectionDocumentArrayTest.java | 53 ++++++++++++++++++ ...sonVectorCollectionDocumentObjectTest.java | 55 +++++++++++++++++++ .../JsonVectorCollectionLineObjectTest.java | 11 +--- .../collection/JsonVectorCollectionTest.java | 6 +- .../json_vector/collection1/doc1.json | 5 ++ .../json_vector/collection1/doc2.json | 5 ++ .../json_vector/collection2/segment1.json | 12 ++++ 8 files changed, 136 insertions(+), 18 deletions(-) create mode 100644 src/test/java/io/anserini/collection/JsonVectorCollectionDocumentArrayTest.java create mode 100644 src/test/java/io/anserini/collection/JsonVectorCollectionDocumentObjectTest.java create mode 100644 src/test/resources/sample_docs/json_vector/collection1/doc1.json create mode 100644 src/test/resources/sample_docs/json_vector/collection1/doc2.json create mode 100644 src/test/resources/sample_docs/json_vector/collection2/segment1.json diff --git a/src/main/java/io/anserini/collection/JsonCollection.java b/src/main/java/io/anserini/collection/JsonCollection.java index 3c4776fb3e..7104b6607f 100644 --- a/src/main/java/io/anserini/collection/JsonCollection.java +++ b/src/main/java/io/anserini/collection/JsonCollection.java @@ -112,13 +112,6 @@ public void readNext() throws NoSuchElementException { } else { atEOF = true; // there is no more JSON object in the bufferedReader } - } else if (node.isArray()) { - if (iter != null && iter.hasNext()) { - JsonNode json = iter.next(); - bufferedRecord = (T) createNewDocument(node); - } else { - throw new NoSuchElementException("Reached end of JsonNode iterator"); - } } else { LOG.error("Error: invalid JsonNode type"); throw new NoSuchElementException("Invalid JsonNode type"); diff --git a/src/test/java/io/anserini/collection/JsonVectorCollectionDocumentArrayTest.java b/src/test/java/io/anserini/collection/JsonVectorCollectionDocumentArrayTest.java new file mode 100644 index 0000000000..f626875b55 --- /dev/null +++ 
b/src/test/java/io/anserini/collection/JsonVectorCollectionDocumentArrayTest.java @@ -0,0 +1,53 @@ +/* + * Anserini: A Lucene toolkit for reproducible information retrieval research + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.anserini.collection; + +import org.junit.Before; + +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.Map; + +// A file in a JsonVectorCollection can either be: +// (1) A single JSON object (i.e., a single document) +// (2) An array of JSON objects +// (3) JSON Lines (i.e., one JSON object per line) +// +// This is the test case for (2) +public class JsonVectorCollectionDocumentArrayTest extends JsonVectorCollectionTest { + + @Before + public void setUp() throws Exception { + super.setUp(); + + collectionPath = Paths.get("src/test/resources/sample_docs/json_vector/collection2"); + collection = new JsonVectorCollection(collectionPath); + + Path segment1 = Paths.get("src/test/resources/sample_docs/json_vector/collection2/segment1.json"); + + segmentPaths.add(segment1); + segmentDocCounts.put(segment1, 2); + + totalSegments = 1; + totalDocs = 2; + + expected.put("doc1", Map.of("id", "doc1", + "content", "f1 f2 f2 f3 f4 f4 f4 f4 f5 ")); + expected.put("doc2", Map.of("id", "doc2", + "content", "f4 f4 f4 f5 f9 f9 f22 f22 f22 f22 f22 f22 f35 f35 f35 f35 f35 f35 f35 f35 ")); + } +} diff --git a/src/test/java/io/anserini/collection/JsonVectorCollectionDocumentObjectTest.java 
b/src/test/java/io/anserini/collection/JsonVectorCollectionDocumentObjectTest.java new file mode 100644 index 0000000000..ded9429cd9 --- /dev/null +++ b/src/test/java/io/anserini/collection/JsonVectorCollectionDocumentObjectTest.java @@ -0,0 +1,55 @@ +/* + * Anserini: A Lucene toolkit for reproducible information retrieval research + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.anserini.collection; + +import org.junit.Before; + +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.Map; + +// A file in a JsonVectorCollection can either be: +// (1) A single JSON object (i.e., a single document) +// (2) An array of JSON objects +// (3) JSON Lines (i.e., one JSON object per line) +// +// This is the test case for (1) +public class JsonVectorCollectionDocumentObjectTest extends JsonVectorCollectionTest { + @Before + public void setUp() throws Exception { + super.setUp(); + + collectionPath = Paths.get("src/test/resources/sample_docs/json_vector/collection1"); + collection = new JsonVectorCollection(collectionPath); + + Path segment1 = Paths.get("src/test/resources/sample_docs/json_vector/collection1/doc1.json"); + Path segment2 = Paths.get("src/test/resources/sample_docs/json_vector/collection1/doc2.json"); + + segmentPaths.add(segment1); + segmentDocCounts.put(segment1, 1); + segmentPaths.add(segment2); + segmentDocCounts.put(segment2, 1); + + totalSegments = 2; + totalDocs = 2; + + expected.put("doc1", Map.of("id", 
"doc1", + "content", "f1 f2 f2 f3 f4 f4 f4 f4 f5 ")); + expected.put("doc2", Map.of("id", "doc2", + "content", "f4 f4 f4 f5 f9 f9 f22 f22 f22 f22 f22 f22 f35 f35 f35 f35 f35 f35 f35 f35 ")); + } +} diff --git a/src/test/java/io/anserini/collection/JsonVectorCollectionLineObjectTest.java b/src/test/java/io/anserini/collection/JsonVectorCollectionLineObjectTest.java index 592cf8f51c..d03ecab7c1 100644 --- a/src/test/java/io/anserini/collection/JsonVectorCollectionLineObjectTest.java +++ b/src/test/java/io/anserini/collection/JsonVectorCollectionLineObjectTest.java @@ -22,14 +22,12 @@ import java.nio.file.Paths; import java.util.Map; -// A file in a JsonCollection can either be: +// A file in a JsonVectorCollection can either be: // (1) A single JSON object (i.e., a single document) // (2) An array of JSON objects // (3) JSON Lines (i.e., one JSON object per line) // // This is the test case for (3) -// -// Note that we're testing the multifield capability here and only here, since the codepath is shared. 
public class JsonVectorCollectionLineObjectTest extends JsonVectorCollectionTest { @Before @@ -52,11 +50,4 @@ public void setUp() throws Exception { expected.put("doc2", Map.of("id", "doc2", "content", "f4 f4 f4 f5 f9 f9 f22 f22 f22 f22 f22 f22 f35 f35 f35 f35 f35 f35 f35 f35 ")); } - - @Override - void checkDocument(SourceDocument doc, Map expected) { - assertTrue(doc.indexable()); - assertEquals(expected.get("id"), doc.id()); - assertEquals(expected.get("content"), doc.contents()); - } } diff --git a/src/test/java/io/anserini/collection/JsonVectorCollectionTest.java b/src/test/java/io/anserini/collection/JsonVectorCollectionTest.java index 51faedb684..57a0c7e3a5 100644 --- a/src/test/java/io/anserini/collection/JsonVectorCollectionTest.java +++ b/src/test/java/io/anserini/collection/JsonVectorCollectionTest.java @@ -24,6 +24,10 @@ void checkDocument(SourceDocument doc, Map expected) { assertTrue(doc.indexable()); assertEquals(expected.get("id"), doc.id()); assertEquals(expected.get("content"), doc.contents()); - assertEquals(expected.get("raw"), doc.raw()); + + // Checking raw is optional + if (expected.get("raw") != null) { + assertEquals(expected.get("raw"), doc.raw()); + } } } \ No newline at end of file diff --git a/src/test/resources/sample_docs/json_vector/collection1/doc1.json b/src/test/resources/sample_docs/json_vector/collection1/doc1.json new file mode 100644 index 0000000000..c750976edc --- /dev/null +++ b/src/test/resources/sample_docs/json_vector/collection1/doc1.json @@ -0,0 +1,5 @@ +{ + "id": "doc1", + "contents": "this is the contents 1.", + "vector": {"f1": 1, "f2": 2, "f3": 1, "f4": 4, "f5": 1} +} \ No newline at end of file diff --git a/src/test/resources/sample_docs/json_vector/collection1/doc2.json b/src/test/resources/sample_docs/json_vector/collection1/doc2.json new file mode 100644 index 0000000000..41c3f5a7e8 --- /dev/null +++ b/src/test/resources/sample_docs/json_vector/collection1/doc2.json @@ -0,0 +1,5 @@ +{ + "id": "doc2", + 
"contents": "this is the contents 2.", + "vector": {"f4": 3, "f5": 1, "f9": 2, "f22": 6, "f35": 8} +} \ No newline at end of file diff --git a/src/test/resources/sample_docs/json_vector/collection2/segment1.json b/src/test/resources/sample_docs/json_vector/collection2/segment1.json new file mode 100644 index 0000000000..a02b97f964 --- /dev/null +++ b/src/test/resources/sample_docs/json_vector/collection2/segment1.json @@ -0,0 +1,12 @@ +[ + { + "id": "doc1", + "contents": "this is the contents 1.", + "vector": {"f1": 1, "f2": 2, "f3": 1, "f4": 4, "f5": 1} + }, + { + "id": "doc2", + "contents": "this is the contents 2.", + "vector": {"f4": 3, "f5": 1, "f9": 2, "f22": 6, "f35": 8} + } +]