Skip to content

Commit

Permalink
Fixed indexing error: Inconsistency of field data structures across documents for field [organism]
Browse files Browse the repository at this point in the history
  • Loading branch information
IgorRodchenkov committed Apr 26, 2024
1 parent f2dfee4 commit a34ac2a
Show file tree
Hide file tree
Showing 4 changed files with 59 additions and 19 deletions.
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
<paxtools.version>6.0.0-SNAPSHOT</paxtools.version>
<validator.version>6.0.0-SNAPSHOT</validator.version>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<lucene.version>9.7.0</lucene.version>
<lucene.version>9.10.0</lucene.version>
<jvm.options>-Xmx3g -Dfile.encoding=UTF-8 -ea --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens java.base/sun.nio.ch=ALL-UNNAMED --add-opens java.base/java.io=ALL-UNNAMED</jvm.options>
<agent>${settings.localRepository}/org/springframework/spring-instrument/${spring-framework.version}/spring-instrument-${spring-framework.version}.jar</agent>
<!-- this copy is created by maven-dependency-plugin -->
Expand Down
16 changes: 6 additions & 10 deletions src/main/java/cpath/service/IndexImpl.java
Original file line number Diff line number Diff line change
Expand Up @@ -524,13 +524,8 @@ public long count(String queryString) {
private void addDatasources(Set<Provenance> set, Document doc) {
for (Provenance p : set) {
//store but do not index/tokenize the URI
doc.add(new StoredField(FIELD_DATASOURCE, p.getUri()));

//index the last/local (collection prefix) part of the Provenance uri
String u = p.getUri();
if (u.endsWith("/")) u = u.substring(0, u.length() - 1);
u = u.replaceAll(".*[/#:]", "");
doc.add(new TextField(FIELD_DATASOURCE, u.toLowerCase(), Field.Store.NO));
// doc.add(new StoredField(FIELD_DATASOURCE, p.getUri()));
doc.add(new TextField(FIELD_DATASOURCE, p.getUri(), Field.Store.YES));

//index names (including the datasource identifier from metadata json config; see premerge/merge)
//different data sources can have the same name e.g. 'intact'; tokenized - to search by partial name
Expand All @@ -542,8 +537,8 @@ private void addDatasources(Set<Provenance> set, Document doc) {

private void addOrganisms(Set<BioSource> set, Document doc) {
for(BioSource bs : set) {
// store but do not index URI (see transform method above, where the organism URIs are added to search hits)
doc.add(new StoredField(FIELD_ORGANISM, bs.getUri()));
//doc.add(new StoredField(FIELD_ORGANISM, bs.getUri()));
doc.add(new TextField(FIELD_ORGANISM, bs.getUri(), Field.Store.YES));

// add organism names
for(String s : bs.getName()) {
Expand All @@ -558,8 +553,9 @@ private void addOrganisms(Set<BioSource> set, Document doc) {
}
// include tissue type terms
if (bs.getTissue() != null) {
for (String s : bs.getTissue().getTerm())
for (String s : bs.getTissue().getTerm()) {
doc.add(new TextField(FIELD_ORGANISM, s.toLowerCase(), Field.Store.NO));
}
}
// include cell type terms
if (bs.getCellType() != null) {
Expand Down
9 changes: 1 addition & 8 deletions src/test/java/cpath/service/IndexIT.java
Original file line number Diff line number Diff line change
Expand Up @@ -95,19 +95,12 @@ public final void search() throws IOException {
assertEquals(2, response.getSearchHit().size());
response = index.search("*", 0, Provenance.class, new String[] {"kegg"}, null);
assertEquals(1, response.getSearchHit().size());
//datasource filter using Provenance absolute URI - not needed anymore - still stored but not indexed anymore
assertTrue(index.search("*", 0, Pathway.class, new String[] {"http://identifiers.org/reactome/"}, null).isEmpty());
assertTrue(index.search("*", 0, Pathway.class, new String[] {"test:kegg_test"}, null).isEmpty());
//using the local/last part of the URI (standard bio collection prefix/name)
response = index.search("*", 0, Pathway.class, new String[] {"kegg_test"}, null);
assertFalse(response.isEmpty());
assertEquals(1, response.getSearchHit().size());
assertTrue(response.getSearchHit().stream().anyMatch(h -> h.getDataSource().contains("test:kegg_test")));

//find by partial name of a datasource - "pathway" of "KEGG Pathway"...
response = index.search("*", 0, Pathway.class, new String[] {"pathway"}, null);
assertFalse(response.isEmpty());
assertEquals(1, response.getSearchHit().size());
assertTrue(response.getSearchHit().stream().anyMatch(h -> h.getDataSource().contains("test:kegg_test")));

response = index.search("pathway:glycolysis", 0, SmallMoleculeReference.class, null, null);
assertEquals(5, response.getSearchHit().size());
Expand Down
51 changes: 51 additions & 0 deletions src/test/resources/test-index-it.owl
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
<?xml version="1.0" encoding="UTF-8"?>
<rdf:RDF
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:owl="http://www.w3.org/2002/07/owl#"
xmlns:xsd="http://www.w3.org/2001/XMLSchema#"
xmlns:bp="http://www.biopax.org/release/biopax-level3.owl#"
xml:base="pc14:">
<owl:Ontology rdf:about="">
<owl:imports rdf:resource="http://www.biopax.org/release/biopax-level3.owl#" />
</owl:Ontology>

<bp:Provenance rdf:ID="bind">
<bp:standardName rdf:datatype = "xsd:string">BIND</bp:standardName>
<bp:displayName rdf:datatype = "xsd:string">BIND</bp:displayName>
<bp:name rdf:datatype = "xsd:string">Biomolecular Interaction Network Database</bp:name>
<bp:name rdf:datatype = "xsd:string">bind</bp:name>
<bp:comment rdf:datatype = "xsd:string">Source http://download.baderlab.org/BINDTranslation/release1_0/PSIMI25_XML/taxid9606_PSIMI25.xml type: PSI_MI, BIND (human), 15-Dec-2010</bp:comment>
</bp:Provenance>

<bp:Protein rdf:about="bind:Protein_rcsb_pdb_1ZDT_see-also_1713944180538">
<bp:displayName rdf:datatype = "xsd:string">1ZDT_B</bp:displayName>
<bp:entityReference rdf:resource="bind:ProteinReference_rcsb_pdb_1ZDT_see-also" />
<bp:comment rdf:datatype = "xsd:string">Protein Chain B, NR5A1[221-461]. This residue range is taken from the PDB file and may not match the GI given.</bp:comment>
<bp:comment rdf:datatype = "xsd:string">experimental form entity</bp:comment>
<bp:dataSource rdf:resource="#bind" />
</bp:Protein>

<bp:ProteinReference rdf:about="bind:ProteinReference_rcsb_pdb_1ZDT_see-also">
<bp:xref rdf:resource="#RX_genbank_indentifier_67463979_see-also" />
<bp:xref rdf:resource="#RX_pdb_1ZDT_see-also" />
<bp:xref rdf:resource="#RX_hgnc_symbol_NCOA2_see-also" />
<bp:xref rdf:resource="#RX_uniprot_Q15596_see-also" />
<bp:xref rdf:resource="#RX_hgnc_symbol_NR5A1_see-also" />
<bp:xref rdf:resource="#RX_uniprot_Q13285_see-also" />
<bp:organism rdf:resource="bind:BIO_ncbitaxon_0" />
<bp:displayName rdf:datatype = "xsd:string">Nuclear Receptor Coactivator 2</bp:displayName>
<bp:name rdf:datatype = "xsd:string">Chain P; The Crystal Structure Of Human Steroidogenic Factor-1</bp:name>
</bp:ProteinReference>

<bp:BioSource rdf:about="bind:BIO_ncbitaxon_0">
</bp:BioSource>

<bp:RelationshipXref rdf:ID="RX_genbank_indentifier_67463979_see-also">
<bp:relationshipType rdf:resource="http://bioregistry.io/mi:0361" />
<bp:id rdf:datatype = "xsd:string">67463979</bp:id>
<bp:db rdf:datatype = "xsd:string">genbank indentifier</bp:db><!-- kept the original (mis)spelling (BIND) -->
</bp:RelationshipXref>



</rdf:RDF>

0 comments on commit a34ac2a

Please sign in to comment.