diff --git a/src/main/java/io/anserini/collection/JsonCollection.java b/src/main/java/io/anserini/collection/JsonCollection.java index 3c4776fb3e..7104b6607f 100644 --- a/src/main/java/io/anserini/collection/JsonCollection.java +++ b/src/main/java/io/anserini/collection/JsonCollection.java @@ -112,13 +112,6 @@ public void readNext() throws NoSuchElementException { } else { atEOF = true; // there is no more JSON object in the bufferedReader } - } else if (node.isArray()) { - if (iter != null && iter.hasNext()) { - JsonNode json = iter.next(); - bufferedRecord = (T) createNewDocument(node); - } else { - throw new NoSuchElementException("Reached end of JsonNode iterator"); - } } else { LOG.error("Error: invalid JsonNode type"); throw new NoSuchElementException("Invalid JsonNode type"); diff --git a/src/test/java/io/anserini/collection/JsonVectorCollectionDocumentArrayTest.java b/src/test/java/io/anserini/collection/JsonVectorCollectionDocumentArrayTest.java new file mode 100644 index 0000000000..f626875b55 --- /dev/null +++ b/src/test/java/io/anserini/collection/JsonVectorCollectionDocumentArrayTest.java @@ -0,0 +1,53 @@ +/* + * Anserini: A Lucene toolkit for reproducible information retrieval research + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.anserini.collection; + +import org.junit.Before; + +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.Map; + +// A file in a JsonVectorCollection can either be: +// (1) A single JSON object (i.e., a single document) +// (2) An array of JSON objects +// (3) JSON Lines (i.e., one JSON object per line) +// +// This is the test case for (2) +public class JsonVectorCollectionDocumentArrayTest extends JsonVectorCollectionTest { + + @Before + public void setUp() throws Exception { + super.setUp(); + + collectionPath = Paths.get("src/test/resources/sample_docs/json_vector/collection2"); + collection = new JsonVectorCollection(collectionPath); + + Path segment1 = Paths.get("src/test/resources/sample_docs/json_vector/collection2/segment1.json"); + + segmentPaths.add(segment1); + segmentDocCounts.put(segment1, 2); + + totalSegments = 1; + totalDocs = 2; + + expected.put("doc1", Map.of("id", "doc1", + "content", "f1 f2 f2 f3 f4 f4 f4 f4 f5 ")); + expected.put("doc2", Map.of("id", "doc2", + "content", "f4 f4 f4 f5 f9 f9 f22 f22 f22 f22 f22 f22 f35 f35 f35 f35 f35 f35 f35 f35 ")); + } +} diff --git a/src/test/java/io/anserini/collection/JsonVectorCollectionDocumentObjectTest.java b/src/test/java/io/anserini/collection/JsonVectorCollectionDocumentObjectTest.java new file mode 100644 index 0000000000..ded9429cd9 --- /dev/null +++ b/src/test/java/io/anserini/collection/JsonVectorCollectionDocumentObjectTest.java @@ -0,0 +1,55 @@ +/* + * Anserini: A Lucene toolkit for reproducible information retrieval research + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.anserini.collection; + +import org.junit.Before; + +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.Map; + +// A file in a JsonVectorCollection can either be: +// (1) A single JSON object (i.e., a single document) +// (2) An array of JSON objects +// (3) JSON Lines (i.e., one JSON object per line) +// +// This is the test case for (1) +public class JsonVectorCollectionDocumentObjectTest extends JsonVectorCollectionTest { + @Before + public void setUp() throws Exception { + super.setUp(); + + collectionPath = Paths.get("src/test/resources/sample_docs/json_vector/collection1"); + collection = new JsonVectorCollection(collectionPath); + + Path segment1 = Paths.get("src/test/resources/sample_docs/json_vector/collection1/doc1.json"); + Path segment2 = Paths.get("src/test/resources/sample_docs/json_vector/collection1/doc2.json"); + + segmentPaths.add(segment1); + segmentDocCounts.put(segment1, 1); + segmentPaths.add(segment2); + segmentDocCounts.put(segment2, 1); + + totalSegments = 2; + totalDocs = 2; + + expected.put("doc1", Map.of("id", "doc1", + "content", "f1 f2 f2 f3 f4 f4 f4 f4 f5 ")); + expected.put("doc2", Map.of("id", "doc2", + "content", "f4 f4 f4 f5 f9 f9 f22 f22 f22 f22 f22 f22 f35 f35 f35 f35 f35 f35 f35 f35 ")); + } +} diff --git a/src/test/java/io/anserini/collection/JsonVectorCollectionLineObjectTest.java b/src/test/java/io/anserini/collection/JsonVectorCollectionLineObjectTest.java index 592cf8f51c..d03ecab7c1 100644 --- a/src/test/java/io/anserini/collection/JsonVectorCollectionLineObjectTest.java +++ b/src/test/java/io/anserini/collection/JsonVectorCollectionLineObjectTest.java @@ -22,14 +22,12 @@ import java.nio.file.Paths; import java.util.Map; -// A file in a JsonCollection can either be: +// A file in a JsonVectorCollection can either be: // (1) A single JSON object (i.e., a single document) // (2) An array of JSON objects // (3) JSON Lines (i.e., one JSON object per line) // // This is the test case for (3) -// -// Note that we're testing the multifield capability here and only here, since the codepath is shared. public class JsonVectorCollectionLineObjectTest extends JsonVectorCollectionTest { @Before @@ -52,11 +50,4 @@ public void setUp() throws Exception { expected.put("doc2", Map.of("id", "doc2", "content", "f4 f4 f4 f5 f9 f9 f22 f22 f22 f22 f22 f22 f35 f35 f35 f35 f35 f35 f35 f35 ")); } - - @Override - void checkDocument(SourceDocument doc, Map expected) { - assertTrue(doc.indexable()); - assertEquals(expected.get("id"), doc.id()); - assertEquals(expected.get("content"), doc.contents()); - } } diff --git a/src/test/java/io/anserini/collection/JsonVectorCollectionTest.java b/src/test/java/io/anserini/collection/JsonVectorCollectionTest.java index 51faedb684..57a0c7e3a5 100644 --- a/src/test/java/io/anserini/collection/JsonVectorCollectionTest.java +++ b/src/test/java/io/anserini/collection/JsonVectorCollectionTest.java @@ -24,6 +24,10 @@ void checkDocument(SourceDocument doc, Map expected) { assertTrue(doc.indexable()); assertEquals(expected.get("id"), doc.id()); assertEquals(expected.get("content"), doc.contents()); - assertEquals(expected.get("raw"), doc.raw()); + + // Checking raw is optional + if (expected.get("raw") != null) { + assertEquals(expected.get("raw"), doc.raw()); + } } } \ No newline at end of file diff --git a/src/test/resources/sample_docs/json_vector/collection1/doc1.json b/src/test/resources/sample_docs/json_vector/collection1/doc1.json new file mode 100644 index 0000000000..c750976edc --- /dev/null +++ b/src/test/resources/sample_docs/json_vector/collection1/doc1.json @@ -0,0 +1,5 @@ +{ + "id": "doc1", + "contents": "this is the contents 1.", + "vector": {"f1": 1, "f2": 2, "f3": 1, "f4": 4, "f5": 1} +} \ No newline at end of file diff --git a/src/test/resources/sample_docs/json_vector/collection1/doc2.json b/src/test/resources/sample_docs/json_vector/collection1/doc2.json new file mode 100644 index 0000000000..41c3f5a7e8 --- /dev/null +++ b/src/test/resources/sample_docs/json_vector/collection1/doc2.json @@ -0,0 +1,5 @@ +{ + "id": "doc2", + "contents": "this is the contents 2.", + "vector": {"f4": 3, "f5": 1, "f9": 2, "f22": 6, "f35": 8} +} \ No newline at end of file diff --git a/src/test/resources/sample_docs/json_vector/collection2/segment1.json b/src/test/resources/sample_docs/json_vector/collection2/segment1.json new file mode 100644 index 0000000000..a02b97f964 --- /dev/null +++ b/src/test/resources/sample_docs/json_vector/collection2/segment1.json @@ -0,0 +1,12 @@ +[ + { + "id": "doc1", + "contents": "this is the contents 1.", + "vector": {"f1": 1, "f2": 2, "f3": 1, "f4": 4, "f5": 1} + }, + { + "id": "doc2", + "contents": "this is the contents 2.", + "vector": {"f4": 3, "f5": 1, "f9": 2, "f22": 6, "f35": 8} + } +]