-
Notifications
You must be signed in to change notification settings - Fork 458
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add HydroRIVERS index feature (#1776)
* finished GeoGenerator and have a start on GeoJsonCollection
* removed GeoJsonCollection
* removed files no longer used
* reverted old GeoIndexerTestBase
* added test for Json collection on Rivers
* updated to pass tests now
* removed debug code in GeoIndexerTestBase
* pedantic changes
* Update comment syntax
* fixed comment grammar
- Loading branch information
Showing
4 changed files
with
340 additions
and
0 deletions.
There are no files selected for viewing
85 changes: 85 additions & 0 deletions
85
src/main/java/io/anserini/index/generator/GeoGenerator.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
package io.anserini.index.generator; | ||
|
||
import io.anserini.collection.JsonCollection; | ||
import io.anserini.index.IndexArgs; | ||
import org.apache.logging.log4j.LogManager; | ||
import org.apache.logging.log4j.Logger; | ||
import org.apache.lucene.document.*; | ||
import org.apache.lucene.geo.Line; | ||
import org.apache.lucene.geo.Polygon; | ||
import org.apache.lucene.geo.SimpleWKTShapeParser; | ||
|
||
import java.io.IOException; | ||
import java.text.ParseException; | ||
|
||
public class GeoGenerator implements LuceneDocumentGenerator<JsonCollection.Document> { | ||
private static final Logger LOG = LogManager.getLogger(GeoGenerator.class); | ||
|
||
protected IndexArgs args; | ||
|
||
public GeoGenerator(IndexArgs args) { | ||
this.args = args; | ||
} | ||
|
||
@Override | ||
public Document createDocument(JsonCollection.Document geoDoc) { | ||
Document doc = new Document(); | ||
|
||
// Store the raw JSON | ||
if (args.storeRaw) { | ||
doc.add(new StoredField(IndexArgs.RAW, geoDoc.raw())); | ||
} | ||
|
||
geoDoc.fields().forEach((k, v) -> { | ||
if ("geometry".equals(k)) { | ||
// parse the geometry fields using SimpleWKTParser and index them | ||
try { | ||
Object shape = SimpleWKTShapeParser.parse(v); | ||
|
||
Field[] fields = new Field[0]; | ||
if (shape instanceof Line) { | ||
fields = LatLonShape.createIndexableFields("geometry", (Line) shape); | ||
} else if (shape instanceof Polygon) { | ||
fields = LatLonShape.createIndexableFields("geometry", (Polygon) shape); | ||
} else if (shape instanceof Line[]) { | ||
for (Line line: (Line[]) shape) { | ||
fields = LatLonShape.createIndexableFields("geometry", line); | ||
} | ||
} else if (shape instanceof Polygon[]) { | ||
for (Polygon polygon: (Polygon[]) shape) { | ||
fields = LatLonShape.createIndexableFields("geometry", polygon); | ||
} | ||
} else { | ||
throw new IllegalArgumentException("unknown shape"); | ||
} | ||
|
||
for (Field f: fields) { | ||
doc.add(f); | ||
} | ||
} catch (ParseException | IOException e) { | ||
LOG.error("Error parsing unknown shape using SimpleWKTShapeParser: " + v); | ||
} catch (IllegalArgumentException e) { | ||
LOG.error("Error casting shape to any of the types Line, Line[], Polygon, Polygon[]: " + v); | ||
} | ||
|
||
} else { | ||
// go through all the non-geometry fields and try to index them as int or long if possible | ||
try { | ||
long vLong = Long.parseLong(v); | ||
doc.add(new LongPoint(k, vLong)); | ||
doc.add(new StoredField(k, v)); | ||
} catch (NumberFormatException e1) { | ||
try { | ||
double vDouble = Double.parseDouble(v); | ||
doc.add(new DoublePoint(k, vDouble)); | ||
doc.add(new StoredField(k, v)); | ||
} catch (NumberFormatException e2) { | ||
doc.add(new StringField(k, v, Field.Store.YES)); | ||
} | ||
} | ||
} | ||
}); | ||
|
||
return doc; | ||
} | ||
} |
104 changes: 104 additions & 0 deletions
104
src/test/java/io/anserini/collection/JsonCollectionGeoRiverTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,104 @@ | ||
package io.anserini.collection; | ||
|
||
import org.junit.Before; | ||
|
||
import java.nio.file.Path; | ||
import java.nio.file.Paths; | ||
import java.util.Map; | ||
|
||
public class JsonCollectionGeoRiverTest extends JsonCollectionTest { | ||
@Before | ||
public void setUp() throws Exception { | ||
super.setUp(); | ||
|
||
collectionPath = Paths.get("src/test/resources/sample_docs/json/collection_geo"); | ||
collection = new JsonCollection(collectionPath); | ||
|
||
Path segment = Paths.get("src/test/resources/sample_docs/json/collection_geo/rivers.json"); | ||
|
||
segmentPaths.add(segment); | ||
segmentDocCounts.put(segment, 3); | ||
|
||
totalSegments = 1; | ||
totalDocs = 3; | ||
|
||
expected.put("90000001", Map.ofEntries( | ||
Map.entry("HYRIV_ID", "90000001"), | ||
Map.entry("NEXT_DOWN", "0"), | ||
Map.entry("MAIN_RIV", "90000001"), | ||
Map.entry("LENGTH_KM", "1.16"), | ||
Map.entry("DIST_DN_KM", "0.0"), | ||
Map.entry("DIST_UP_KM", "8.4"), | ||
Map.entry("CATCH_SKM", "15.06"), | ||
Map.entry("UPLAND_SKM", "15.0"), | ||
Map.entry("ENDORHEIC", "0"), | ||
Map.entry("DIS_AV_CMS", "0.089"), | ||
Map.entry("ORD_STRA", "1"), | ||
Map.entry("ORD_CLAS", "1"), | ||
Map.entry("ORD_FLOW", "8"), | ||
Map.entry("HYBAS_L12", "9120016560"), | ||
Map.entry("geometry", "LINESTRING (-32.235416666667334 83.57916666666631, -32.235416666667334 83.589583333333)"), | ||
Map.entry("id", "90000001") | ||
)); | ||
|
||
expected.put("90000002", Map.ofEntries( | ||
Map.entry("HYRIV_ID", "90000002"), | ||
Map.entry("NEXT_DOWN", "0"), | ||
Map.entry("MAIN_RIV", "90000002"), | ||
Map.entry("LENGTH_KM", "1.16"), | ||
Map.entry("DIST_DN_KM", "0.0"), | ||
Map.entry("DIST_UP_KM", "69.1"), | ||
Map.entry("CATCH_SKM", "10.08"), | ||
Map.entry("UPLAND_SKM", "10.1"), | ||
Map.entry("ENDORHEIC", "0"), | ||
Map.entry("DIS_AV_CMS", "0.0"), | ||
Map.entry("ORD_STRA", "1"), | ||
Map.entry("ORD_CLAS", "1"), | ||
Map.entry("ORD_FLOW", "10"), | ||
Map.entry("HYBAS_L12", "9120016500"), | ||
Map.entry("geometry", "LINESTRING (-36.03541666666723 83.54583333333295, -36.03541666666723 83.55624999999961)"), | ||
Map.entry("id", "90000002") | ||
)); | ||
|
||
expected.put("90000003", Map.ofEntries( | ||
Map.entry("HYRIV_ID", "90000003"), | ||
Map.entry("NEXT_DOWN", "0"), | ||
Map.entry("MAIN_RIV", "90000003"), | ||
Map.entry("LENGTH_KM", "3.02"), | ||
Map.entry("DIST_DN_KM", "0.0"), | ||
Map.entry("DIST_UP_KM", "35.3"), | ||
Map.entry("CATCH_SKM", "12.24"), | ||
Map.entry("UPLAND_SKM", "12.2"), | ||
Map.entry("ENDORHEIC", "0"), | ||
Map.entry("DIS_AV_CMS", "0.03"), | ||
Map.entry("ORD_STRA", "1"), | ||
Map.entry("ORD_CLAS", "1"), | ||
Map.entry("ORD_FLOW", "8"), | ||
Map.entry("HYBAS_L12", "9120016580"), | ||
Map.entry("geometry", "LINESTRING (-29.737500000000722 83.54583333333295, -29.731250000000642 83.55208333333294, -29.731250000000642 83.57291666666629)"), | ||
Map.entry("id", "90000003") | ||
)); | ||
} | ||
|
||
@Override | ||
void checkDocument(SourceDocument doc, Map<String, String> expected) { | ||
// Note that we need an id in addition to HYRIV_ID to distinguish between different docs | ||
assertTrue(doc.indexable()); | ||
assertEquals(expected.get("HYRIV_ID"), ((JsonCollection.Document) doc).fields().get("HYRIV_ID")); | ||
assertEquals(expected.get("NEXT_DOWN"), ((JsonCollection.Document) doc).fields().get("NEXT_DOWN")); | ||
assertEquals(expected.get("MAIN_RIV"), ((JsonCollection.Document) doc).fields().get("MAIN_RIV")); | ||
assertEquals(expected.get("LENGTH_KM"), ((JsonCollection.Document) doc).fields().get("LENGTH_KM")); | ||
assertEquals(expected.get("DIST_DN_KM"), ((JsonCollection.Document) doc).fields().get("DIST_DN_KM")); | ||
assertEquals(expected.get("DIST_UP_KM"), ((JsonCollection.Document) doc).fields().get("DIST_UP_KM")); | ||
assertEquals(expected.get("CATCH_SKM"), ((JsonCollection.Document) doc).fields().get("CATCH_SKM")); | ||
assertEquals(expected.get("UPLAND_SKM"), ((JsonCollection.Document) doc).fields().get("UPLAND_SKM")); | ||
assertEquals(expected.get("ENDORHEIC"), ((JsonCollection.Document) doc).fields().get("ENDORHEIC")); | ||
assertEquals(expected.get("DIS_AV_CMS"), ((JsonCollection.Document) doc).fields().get("DIS_AV_CMS")); | ||
assertEquals(expected.get("ORD_STRA"), ((JsonCollection.Document) doc).fields().get("ORD_STRA")); | ||
assertEquals(expected.get("ORD_CLAS"), ((JsonCollection.Document) doc).fields().get("ORD_CLAS")); | ||
assertEquals(expected.get("ORD_FLOW"), ((JsonCollection.Document) doc).fields().get("ORD_FLOW")); | ||
assertEquals(expected.get("HYBAS_L12"), ((JsonCollection.Document) doc).fields().get("HYBAS_L12")); | ||
assertEquals(expected.get("geometry"), ((JsonCollection.Document) doc).fields().get("geometry")); | ||
assertEquals(expected.get("id"), doc.id()); | ||
} | ||
} |
97 changes: 97 additions & 0 deletions
97
src/test/java/io/anserini/index/generator/GeoGeneratorTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
package io.anserini.index.generator; | ||
|
||
import com.fasterxml.jackson.databind.ObjectMapper; | ||
import com.fasterxml.jackson.databind.node.ObjectNode; | ||
import com.fasterxml.jackson.databind.node.TextNode; | ||
import io.anserini.collection.JsonCollection; | ||
import io.anserini.index.IndexArgs; | ||
import org.apache.lucene.document.Document; | ||
import org.apache.lucene.document.DoublePoint; | ||
import org.apache.lucene.document.LongPoint; | ||
import org.apache.lucene.document.ShapeField; | ||
import org.apache.lucene.index.IndexableField; | ||
import org.junit.Before; | ||
import org.junit.Test; | ||
import static org.junit.Assert.assertEquals; | ||
|
||
public class GeoGeneratorTest { | ||
private JsonCollection.Document geoDoc; | ||
private Document doc; | ||
|
||
@Before | ||
public void riverSetUp() { | ||
ObjectMapper mapper = new ObjectMapper(); | ||
ObjectNode jsonObj = mapper.createObjectNode(); | ||
jsonObj.set("HYRIV_ID", TextNode.valueOf("90000003")); | ||
jsonObj.set("NEXT_DOWN", TextNode.valueOf("0")); | ||
jsonObj.set("MAIN_RIV", TextNode.valueOf("90000003")); | ||
jsonObj.set("LENGTH_KM", TextNode.valueOf("3.02")); | ||
jsonObj.set("DIST_DN_KM", TextNode.valueOf("0.0")); | ||
jsonObj.set("DIST_UP_KM", TextNode.valueOf("35.3")); | ||
jsonObj.set("CATCH_SKM", TextNode.valueOf("12.24")); | ||
jsonObj.set("UPLAND_SKM", TextNode.valueOf("12.2")); | ||
jsonObj.set("ENDORHEIC", TextNode.valueOf("0")); | ||
jsonObj.set("DIS_AV_CMS", TextNode.valueOf("0.03")); | ||
jsonObj.set("ORD_STRA", TextNode.valueOf("1")); | ||
jsonObj.set("ORD_CLAS", TextNode.valueOf("1")); | ||
jsonObj.set("ORD_FLOW", TextNode.valueOf("8")); | ||
jsonObj.set("HYBAS_L12", TextNode.valueOf("9120016580")); | ||
jsonObj.set("geometry", TextNode.valueOf("LINESTRING (-29.737500000000722 83.54583333333295, -29.731250000000642 83.55208333333294, -29.731250000000642 83.57291666666629)")); | ||
jsonObj.set("id", TextNode.valueOf("90000003")); | ||
|
||
geoDoc = new JsonCollection.Document(jsonObj); | ||
|
||
GeoGenerator generator = new GeoGenerator(new IndexArgs()); | ||
doc = generator.createDocument(geoDoc); | ||
} | ||
|
||
@Test | ||
public void testRiverDocumentFields() { | ||
// Check if the field types were inferred correctly, id field is omitted since it's a repeat of HYRIV_ID | ||
assertEquals(LongPoint.class, doc.getField("HYRIV_ID").getClass()); | ||
assertEquals(90000003L, doc.getField("HYRIV_ID").numericValue()); | ||
|
||
assertEquals(LongPoint.class, doc.getField("NEXT_DOWN").getClass()); | ||
assertEquals(0L, doc.getField("NEXT_DOWN").numericValue()); | ||
|
||
assertEquals(LongPoint.class, doc.getField("MAIN_RIV").getClass()); | ||
assertEquals(90000003L, doc.getField("MAIN_RIV").numericValue()); | ||
|
||
assertEquals(DoublePoint.class, doc.getField("LENGTH_KM").getClass()); | ||
assertEquals(3.02, doc.getField("LENGTH_KM").numericValue()); | ||
|
||
assertEquals(DoublePoint.class, doc.getField("DIST_DN_KM").getClass()); | ||
assertEquals(0.0, doc.getField("DIST_DN_KM").numericValue()); | ||
|
||
assertEquals(DoublePoint.class, doc.getField("DIST_UP_KM").getClass()); | ||
assertEquals(35.3, doc.getField("DIST_UP_KM").numericValue()); | ||
|
||
assertEquals(DoublePoint.class, doc.getField("CATCH_SKM").getClass()); | ||
assertEquals(12.24, doc.getField("CATCH_SKM").numericValue()); | ||
|
||
assertEquals(DoublePoint.class, doc.getField("UPLAND_SKM").getClass()); | ||
assertEquals(12.2, doc.getField("UPLAND_SKM").numericValue()); | ||
|
||
assertEquals(LongPoint.class, doc.getField("ENDORHEIC").getClass()); | ||
assertEquals(0L, doc.getField("ENDORHEIC").numericValue()); | ||
|
||
assertEquals(DoublePoint.class, doc.getField("DIS_AV_CMS").getClass()); | ||
assertEquals(0.03, doc.getField("DIS_AV_CMS").numericValue()); | ||
|
||
assertEquals(LongPoint.class, doc.getField("ORD_STRA").getClass()); | ||
assertEquals(1L, doc.getField("ORD_STRA").numericValue()); | ||
|
||
assertEquals(LongPoint.class, doc.getField("ORD_CLAS").getClass()); | ||
assertEquals(1L, doc.getField("ORD_CLAS").numericValue()); | ||
|
||
assertEquals(LongPoint.class, doc.getField("ORD_FLOW").getClass()); | ||
assertEquals(8L, doc.getField("ORD_FLOW").numericValue()); | ||
|
||
assertEquals(LongPoint.class, doc.getField("HYBAS_L12").getClass()); | ||
assertEquals(9120016580L, doc.getField("HYBAS_L12").numericValue()); | ||
|
||
for (IndexableField f: doc.getFields("geometry")) { | ||
assertEquals(ShapeField.Triangle.class, f.getClass()); | ||
} | ||
} | ||
} |
54 changes: 54 additions & 0 deletions
54
src/test/resources/sample_docs/json/collection_geo/rivers.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
{ | ||
"HYRIV_ID": 90000001, | ||
"NEXT_DOWN": 0, | ||
"MAIN_RIV": 90000001, | ||
"LENGTH_KM": 1.16, | ||
"DIST_DN_KM": 0.0, | ||
"DIST_UP_KM": 8.4, | ||
"CATCH_SKM": 15.06, | ||
"UPLAND_SKM": 15.0, | ||
"ENDORHEIC": 0, | ||
"DIS_AV_CMS": 0.089, | ||
"ORD_STRA": 1, | ||
"ORD_CLAS": 1, | ||
"ORD_FLOW": 8, | ||
"HYBAS_L12": 9120016560, | ||
"geometry": "LINESTRING (-32.235416666667334 83.57916666666631, -32.235416666667334 83.589583333333)", | ||
"id": 90000001 | ||
} | ||
{ | ||
"HYRIV_ID": 90000002, | ||
"NEXT_DOWN": 0, | ||
"MAIN_RIV": 90000002, | ||
"LENGTH_KM": 1.16, | ||
"DIST_DN_KM": 0.0, | ||
"DIST_UP_KM": 69.1, | ||
"CATCH_SKM": 10.08, | ||
"UPLAND_SKM": 10.1, | ||
"ENDORHEIC": 0, | ||
"DIS_AV_CMS": 0.0, | ||
"ORD_STRA": 1, | ||
"ORD_CLAS": 1, | ||
"ORD_FLOW": 10, | ||
"HYBAS_L12": 9120016500, | ||
"geometry": "LINESTRING (-36.03541666666723 83.54583333333295, -36.03541666666723 83.55624999999961)", | ||
"id": 90000002 | ||
} | ||
{ | ||
"HYRIV_ID": 90000003, | ||
"NEXT_DOWN": 0, | ||
"MAIN_RIV": 90000003, | ||
"LENGTH_KM": 3.02, | ||
"DIST_DN_KM": 0.0, | ||
"DIST_UP_KM": 35.3, | ||
"CATCH_SKM": 12.24, | ||
"UPLAND_SKM": 12.2, | ||
"ENDORHEIC": 0, | ||
"DIS_AV_CMS": 0.03, | ||
"ORD_STRA": 1, | ||
"ORD_CLAS": 1, | ||
"ORD_FLOW": 8, | ||
"HYBAS_L12": 9120016580, | ||
"geometry": "LINESTRING (-29.737500000000722 83.54583333333295, -29.731250000000642 83.55208333333294, -29.731250000000642 83.57291666666629)", | ||
"id": 90000003 | ||
} |