From f63cd2275fa5a9d4da2d17e5f983a3308e8b50ce Mon Sep 17 00:00:00 2001
From: Zeynep Akkalyoncu Yilmaz
Date: Tue, 7 Jan 2020 10:40:49 -0500
Subject: [PATCH] Exposes method in IndexReaderUtils to fetch raw document (#937)

---
 .../io/anserini/index/IndexReaderUtils.java   | 22 ++++++++++++++++++++++
 .../anserini/index/IndexReaderUtilsTest.java  | 11 +++++++++++
 2 files changed, 33 insertions(+)

diff --git a/src/main/java/io/anserini/index/IndexReaderUtils.java b/src/main/java/io/anserini/index/IndexReaderUtils.java
index a5c176600f..e59ba0514c 100644
--- a/src/main/java/io/anserini/index/IndexReaderUtils.java
+++ b/src/main/java/io/anserini/index/IndexReaderUtils.java
@@ -332,6 +332,28 @@ public static Map getDocumentVector(IndexReader reader, String doc
     return docVector;
   }
 
+  /**
+   * Returns the raw document given its collection docid.
+   * @param reader index reader
+   * @param docid collection docid
+   * @return the raw document given its collection docid, or null if the docid cannot be resolved
+   *         or the document cannot be read.
+   */
+  public static String getRawDocument(IndexReader reader, String docid) {
+    try {
+      int luceneDocid = convertDocidToLuceneDocid(reader, docid);
+      if (luceneDocid < 0) {
+        // A negative value cannot be a Lucene docid; handing it to reader.document() would throw
+        // instead of producing the documented null result.
+        return null;
+      }
+      return reader.document(luceneDocid).get(LuceneDocumentGenerator.FIELD_RAW);
+    } catch (IOException e) {
+      // Deliberate best effort: an unreadable document is reported the same way as a missing one.
+      return null;
+    }
+  }
+
   /**
    * Computes the BM25 weight of a term (prior to analysis) in a particular document.
    * @param reader index reader
diff --git a/src/test/java/io/anserini/index/IndexReaderUtilsTest.java b/src/test/java/io/anserini/index/IndexReaderUtilsTest.java
index 3ceea82e22..985156f1c5 100644
--- a/src/test/java/io/anserini/index/IndexReaderUtilsTest.java
+++ b/src/test/java/io/anserini/index/IndexReaderUtilsTest.java
@@ -249,6 +249,17 @@ public void testDocumentVector() throws Exception {
     assertEquals(Long.valueOf(1), documentVector.get("test"));
   }
 
+  @Test
+  public void testRawDoc() throws Exception {
+    // try-with-resources closes the reader and the directory even if an assertion fails.
+    try (Directory dir = FSDirectory.open(tempDir1);
+         IndexReader reader = DirectoryReader.open(dir)) {
+      assertEquals("here is some text here is some more text", IndexReaderUtils.getRawDocument(reader, "doc1"));
+      assertEquals("more texts", IndexReaderUtils.getRawDocument(reader, "doc2"));
+      assertEquals("here is a test", IndexReaderUtils.getRawDocument(reader, "doc3"));
+    }
+  }
+
   @Test
   public void testDocidConversion() throws Exception {
     Directory dir = FSDirectory.open(tempDir1);