Skip to content

Commit

Permalink
Bumped Paxtools and Validator versions, added Analysis class and cons…
Browse files Browse the repository at this point in the history
…ole app command "-m" to fix bad/invalid URIs issue #319 (e.g. netpath "S 312" and all intact_complex URIs)
  • Loading branch information
IgorRodchenkov committed Jun 21, 2024
1 parent 6d8311c commit abdebe4
Show file tree
Hide file tree
Showing 18 changed files with 20,179 additions and 5,253 deletions.
4 changes: 2 additions & 2 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@
<start-class>cpath.Application</start-class>
<license.licenseName>MIT</license.licenseName>
<github.global.server>github</github.global.server>
<paxtools.version>6.0.0</paxtools.version>
<validator.version>6.0.0</validator.version>
<paxtools.version>6.0.1-SNAPSHOT</paxtools.version>
<validator.version>6.0.1-SNAPSHOT</validator.version>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<lucene.version>9.10.0</lucene.version>
<jvm.options>-Xmx3g -Dfile.encoding=UTF-8 -ea -Dpaxtools.CollectionProvider=org.biopax.paxtools.trove.TProvider --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens java.base/sun.nio.ch=ALL-UNNAMED --add-opens java.base/java.io=ALL-UNNAMED</jvm.options>
Expand Down
40 changes: 40 additions & 0 deletions src/main/java/cpath/analysis/Fix319.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
package cpath.analysis;

import cpath.service.api.Analysis;
import org.apache.commons.lang3.RegExUtils;
import org.apache.commons.lang3.StringUtils;
import org.biopax.paxtools.controller.ModelUtils;
import org.biopax.paxtools.model.BioPAXElement;
import org.biopax.paxtools.model.Model;
import org.biopax.paxtools.model.level3.SimplePhysicalEntity;
import org.biopax.paxtools.model.level3.UtilityClass;

import java.util.HashSet;

/**
 * Data patch for issue #319, found in the PC v14 data (a beta release as of June 2024),
 * and for similar potential Linked Data problems that stem from invalid BioPAX URIs.
 * <p>
 * In addition, every dangling SimplePhysicalEntity (i.e. not Complex) individual gets removed,
 * if there are any; such objects are useless and most likely come from mistakes
 * or duplicates in the original data (e.g. in NetPath).
 */
public class Fix319 implements Analysis<Model> {

  /**
   * Cleans up the given model in place: drops dangling objects, then repairs the URIs.
   *
   * @param model the BioPAX model to modify
   */
  public void execute(Model model) {
    // 1) dangling SPEs go first: molecules that are neither participants
    //    nor components are of no use for pathway analyses
    ModelUtils.removeObjectsIfDangling(model, SimplePhysicalEntity.class);

    // 2) then dangling xrefs, CVs and other utility class individuals
    ModelUtils.removeObjectsIfDangling(model, UtilityClass.class);

    // 3) rename the bad "intact_complex" URI part to "intact.complex"
    //    (this covers "pc14:intact_complex" as well);
    //    loop over a copy of the objects - presumably because updateUri alters the model's own collection
    HashSet<BioPAXElement> snapshot = new HashSet<>(model.getObjects());
    for (BioPAXElement element : snapshot) {
      String oldUri = element.getUri();
      if (!StringUtils.contains(oldUri, "intact_complex")) {
        continue; // nothing to rename here
      }
      String newUri = RegExUtils.replaceFirst(oldUri, "intact_complex", "intact.complex");
      ModelUtils.updateUri(model, element, newUri);
    }

    // 4) finally, repair the remaining invalid URIs (a few contained a space,
    //    e.g. "netpath:S 312", which broke the conversion to JSONLD, etc.)
    ModelUtils.fixInvalidUris(model);
  }
}
3 changes: 1 addition & 2 deletions src/main/java/cpath/analysis/TraverseAnalysis.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ public void execute(Model model) {
final String propertyPath = callback.getPropertyPath();
callback.getTraverseEntry().clear();

PathAccessor pathAccessor = null;
PathAccessor pathAccessor;
try {
pathAccessor = new PathAccessor(propertyPath, model.getLevel());
} catch (Exception e) {
Expand All @@ -43,7 +43,6 @@ public void execute(Model model) {
TraverseEntry entry = new TraverseEntry();
entry.setUri(uri);
if(!pathAccessor.isUnknown(v)) {
// entry.getValue().addAll(v);
for(Object o : v) {
if(o instanceof BioPAXElement)
entry.getValue().add(((BioPAXElement) o).getUri());
Expand Down
69 changes: 32 additions & 37 deletions src/main/java/cpath/cleaner/NetPathCleaner.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,7 @@
import org.biopax.paxtools.io.SimpleIOHandler;
import org.biopax.paxtools.model.BioPAXLevel;
import org.biopax.paxtools.model.Model;
import org.biopax.paxtools.model.level3.ControlledVocabulary;
import org.biopax.paxtools.model.level3.RelationshipXref;
import org.biopax.paxtools.model.level3.SequenceModificationVocabulary;
import org.biopax.paxtools.model.level3.UnificationXref;
import org.biopax.paxtools.model.level3.UtilityClass;
import org.biopax.paxtools.model.level3.XReferrable;
import org.biopax.paxtools.model.level3.*;
import org.biopax.paxtools.util.ClassFilterSet;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Expand All @@ -25,29 +20,29 @@
* Implementation of Cleaner interface for NetPath data.
*/
final class NetPathCleaner implements Cleaner {

// logger
private static Logger log = LoggerFactory.getLogger(NetPathCleaner.class);
private static Logger log = LoggerFactory.getLogger(NetPathCleaner.class);

public void clean(InputStream data, OutputStream cleanedData)
{
public void clean(InputStream data, OutputStream cleanedData) {
// create bp model from dataFile
SimpleIOHandler simpleReader = new SimpleIOHandler(BioPAXLevel.L3);
Model model = simpleReader.convertFromOWL(data);
log.info("Cleaning NetPath data, please be patient...");

//fix bad/invalid URIs (there are several with spaces in them...)
ModelUtils.fixInvalidUris(model);

// Fix some CV xrefs
// a CV must have one unification xref;
// if there are also relationship and publication xrefs, it's a biopax error, but we'll keep as is (not critical);
// So, if there is no unification xref but rel. xrefs (in fact, one or none in NetPath), we convert rel. to unif. xref.
Set<ControlledVocabulary> cvs = new HashSet<>(model.getObjects(ControlledVocabulary.class));
for(ControlledVocabulary cv : cvs) {
log.info("Processing " + cv.toString() + "; xrefs: " + cv.getXref());
for (ControlledVocabulary cv : cvs) {
log.info("Processing " + cv.toString() + "; xrefs: " + cv.getXref());

//insert "L-" after "phospho-" in MFV terms (if it does not contain "phospho-L-" already)
if(cv instanceof SequenceModificationVocabulary) {
for(String t: new HashSet<>(cv.getTerm())) {
if(t.contains("phospho-") && !t.contains("phospho-L-")) {
if (cv instanceof SequenceModificationVocabulary) {
for (String t : new HashSet<>(cv.getTerm())) {
if (t.contains("phospho-") && !t.contains("phospho-L-")) {
//insert "L-", replace term
cv.removeTerm(t);
t = t.replace("phospho-", "phospho-L-");
Expand All @@ -56,54 +51,54 @@ public void clean(InputStream data, OutputStream cleanedData)
}
}
}

Set<UnificationXref> urefs = new ClassFilterSet<>(new HashSet<>(cv.getXref()), UnificationXref.class);
//skip if there is a unification xref
if(!urefs.isEmpty()) {
log.info("(skip) there are unif.xref: " + urefs);
if (!urefs.isEmpty()) {
continue; //perhaps, will never happen (I manually checked a couple of orig. files)
}

Set<RelationshipXref> rxrefs = new ClassFilterSet<>(new HashSet<>(cv.getXref()), RelationshipXref.class);
for(RelationshipXref x : rxrefs) {
for (RelationshipXref x : rxrefs) {
//remove and skip for bad xref (just in case there are any)
if(x.getDb()==null || x.getId()==null) {
if (x.getDb() == null || x.getId() == null) {
cv.removeXref(x);
model.remove(x);
continue;
}

String id = x.getId();
String uri = "UX_" + BaseCleaner.encode(x.getDb() + "_"+ id);
String uri = "UX_" + BaseCleaner.encode(x.getDb() + "_" + id);
UnificationXref ux = (UnificationXref) model.getByID(uri);
if(ux == null) {
if (ux == null) {
ux = model.addNew(UnificationXref.class, uri);
ux.setDb(x.getDb());
ux.setId(id);
}
}
cv.removeXref(x);
cv.addXref(ux);
}
}

//convert shared UnificationXrefs into RelationshipXrefs (in fact, some of those are just invalid db/id)
Set<UnificationXref> uxrefs = new HashSet<>(model.getObjects(UnificationXref.class));
for(UnificationXref x : uxrefs) {
if(x.getXrefOf().size() > 1) {
for (UnificationXref x : new HashSet<>(model.getObjects(UnificationXref.class))) {
if (x.getXrefOf().size() > 1) {
//convert to RX, re-associate
RelationshipXref rx = BaseCleaner.getOrCreateRx(x, model);
for(XReferrable owner : new HashSet<>(x.getXrefOf())) {
if(owner instanceof ControlledVocabulary)
for (XReferrable owner : new HashSet<>(x.getXrefOf())) {
if (owner instanceof ControlledVocabulary) {
continue; //CVs can use same UX, but that means they are to merge...
}
owner.removeXref(x);
owner.addXref(rx);
}
log.info("replaced UX {} with RX {}", x, rx);
}
}
}


//SPEs that are not component/participant are not needed
ModelUtils.removeObjectsIfDangling(model, SimplePhysicalEntity.class);
//xrefs, CVs et al. utility class individuals are not interesting for pathway analysis
ModelUtils.removeObjectsIfDangling(model, UtilityClass.class);

// convert model back to OutputStream for return
try {
simpleReader.convertToOWL(model, cleanedData);
Expand Down
22 changes: 5 additions & 17 deletions src/main/java/cpath/service/CPathUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import java.io.InputStream;
import java.io.OutputStream;
import java.lang.reflect.Constructor;
import java.lang.reflect.Method;
import java.nio.file.*;
import java.util.*;
import java.util.stream.Stream;
Expand All @@ -21,7 +20,6 @@
import org.biopax.paxtools.controller.Fetcher;
import org.biopax.paxtools.controller.ModelUtils;
import org.biopax.paxtools.controller.SimpleEditorMap;
import org.biopax.paxtools.impl.BioPAXElementImpl;
import org.biopax.paxtools.io.SimpleIOHandler;
import org.biopax.paxtools.model.BioPAXElement;
import org.biopax.paxtools.model.BioPAXLevel;
Expand Down Expand Up @@ -89,27 +87,17 @@ static void saveMetadata(Metadata metadata, String path) {
}

/**
* Replaces the URI of a BioPAX object
* using java reflection. Normally, one should avoid this;
* please use when absolutely necessary and with great care.
* Replaces the URI of a BioPAX object using java reflection.
* Please use when absolutely necessary and with great care.
*
* @param model model
* @param el biopax object from the model
* @param newUri new URI
*/
public static void replaceUri(Model model, BioPAXElement el, String newUri) {
if (el.getUri().equals(newUri)) {
return; // no action required
if (!el.getUri().equals(newUri)) {
ModelUtils.updateUri(model, el, newUri);
}
model.remove(el);
try {
Method m = BioPAXElementImpl.class.getDeclaredMethod("setUri", String.class);
m.setAccessible(true);
m.invoke(el, newUri);
} catch (Exception e) {
throw new RuntimeException(e);
}
model.add(el);
}

/**
Expand Down Expand Up @@ -154,7 +142,7 @@ static String rebaseUri(String absoluteUri, String fromBase, String toBase) {
* Replaces xml:base for the normalized model and updates the URIs of all non-normalized objects
* (mostly Entity, Evidence, etc.)
* The model is already normalized, which means the URIs of many xrefs, CVs, entity reference start with
* bioregistry.io/ or are CURIEs like e.g. chebi:1234, pubmed:1234556.
* http://bioregistry.io/ or are CURIEs like e.g. chebi:1234, pubmed:1234556.
*/
public static void rebaseUris(Model model, String fromBase, String toBase) {
Assert.hasText(toBase, "Blank/null value is not allowed for xmlBase");
Expand Down
59 changes: 40 additions & 19 deletions src/main/java/cpath/service/ConsoleApplication.java
Original file line number Diff line number Diff line change
Expand Up @@ -78,10 +78,15 @@ public void run(String... args) throws Exception {
.hasArg().argName("from-stage").optionalArg(true).type(Stage.class).build();
options.addOption(o);
o = Option.builder("a").longOpt("analyze")
.desc("use a class that implements cpath.service.api.Analysis<Model> interface to analyse the integrated " +
"BioPAX model (the class and its dependencies are expected to be found on the classpath)")
.desc("use a class that implements cpath.service.api.Analysis<Model> interface to analyse the " +
"BioPAX model (the class and its dependencies are expected to be on the classpath)")
.hasArg().argName("class").build();
options.addOption(o);
o = Option.builder("m").longOpt("modify")
.desc("use a class that implements cpath.service.api.Analysis<Model> interface to modify the " +
"BioPAX model and re-index (the class and its dependencies are expected to be on the classpath)")
.hasArg().argName("class").build();
options.addOption(o);
o = Option.builder("e").longOpt("export")
.desc("export the main BioPAX model or sub-model defined by additional filters (see: -F)")
.hasArg().argName("filename").build();
Expand Down Expand Up @@ -134,43 +139,59 @@ else if (cmd.hasOption("export")) {
exportData(cmd.getOptionValue("export"), uris, datasources, types);
}
else if (cmd.hasOption("analyze")) {
executeAnalysis(cmd.getOptionValue("analyze"), true);
analyzeModel(cmd.getOptionValue("analyze"));
}
else if (cmd.hasOption("modify")) {
modifyModel(cmd.getOptionValue("modify"));
}
else {
new HelpFormatter().printHelp("cPath2", options);
}
}

/**
* Runs a class that analyses or modifies the main BioPAX model.
/*
* Runs a class that analyses the main BioPAX model.
*
* @param analysisClass a class that implements {@link Analysis}
* @param readOnly whether this is to modify and replace the BioPAX Model or not
*/
private void executeAnalysis(String analysisClass, boolean readOnly) {
private void analyzeModel(String analysisClass) {
  // Loads the named Analysis implementation by reflection and runs it on the main BioPAX model
  // (nothing is saved back to the archive here).
  Analysis<Model> analysis;
  try {
    // wildcard instead of the raw Class type; the class must have a no-argument constructor
    Class<?> c = Class.forName(analysisClass);
    // unchecked cast: the type argument cannot be verified at runtime
    analysis = (Analysis<Model>) c.getDeclaredConstructor().newInstance();
  } catch (Exception e) {
    // keep the cause and say which class could not be loaded/instantiated
    throw new RuntimeException("Failed to instantiate the analysis class: " + analysisClass, e);
  }
  Model model = CPathUtils.importFromTheArchive(service.settings().mainModelFile());
  analysis.execute(model);
}

if (!readOnly) { //replace the main BioPAX model archive
try {
new SimpleIOHandler(BioPAXLevel.L3).convertToOWL(model,
new GZIPOutputStream(new FileOutputStream(service.settings().mainModelFile())));
} catch (Exception e) {
throw new RuntimeException("Failed updating the main BioPAX archive!", e);
}

LOG.warn("The main BioPAX model was modified; "
+ "do not forget to re-index, update counts, re-export other files, etc.");
/**
 * Runs a class that analyses and modifies the main BioPAX model,
 * then over-writes the main model archive and re-indexes the model.
 *
 * @param analysisClass name of a class that implements {@link Analysis} and can edit the data
 *                      (loaded with {@link Class#forName(String)}, instantiated via its no-argument constructor)
 * @throws IOException when opening, writing or closing the main model archive fails
 * @throws RuntimeException when the class cannot be loaded or instantiated
 */
private void modifyModel(String analysisClass) throws IOException {
  Analysis<Model> analysis;
  try {
    // wildcard instead of the raw Class type
    Class<?> c = Class.forName(analysisClass);
    // unchecked cast: the type argument cannot be verified at runtime
    analysis = (Analysis<Model>) c.getDeclaredConstructor().newInstance();
  } catch (Exception e) {
    // keep the cause and say which class could not be loaded/instantiated
    throw new RuntimeException("Failed to instantiate the analysis class: " + analysisClass, e);
  }

  //load current model from the file
  Model model = CPathUtils.importFromTheArchive(service.settings().mainModelFile());
  // and apply the changes
  LOG.info("Running class: {}...", analysisClass);
  analysis.execute(model);
  // export the modified model to the file;
  // try-with-resources closes the streams even if the export throws
  LOG.info("Over-writing model: {}...", service.settings().mainModelFile());
  try (FileOutputStream fileOut = new FileOutputStream(service.settings().mainModelFile());
       GZIPOutputStream gzipOut = new GZIPOutputStream(fileOut)) {
    new SimpleIOHandler(BioPAXLevel.L3).convertToOWL(model, gzipOut);
  }
  //init the lucene index as read-write
  service.initIndex(model, service.settings().indexDir(), false);
  //re-index the model
  service.index().save(model);
}

/*
Expand Down
6 changes: 6 additions & 0 deletions src/main/java/cpath/service/Merger.java
Original file line number Diff line number Diff line change
Expand Up @@ -72,14 +72,20 @@ public void merge() {
}
Model providerModel = merge(datasource); //uses lucene index, via service.mapping() repo, for id-mapping
log.info("Replacing xml:base of non-generated/normalized URIs in {}", datasource.getIdentifier());
//todo: new URI must be valid (e.g. base/prefix cannot contain '_' or '-'; or start with a standard URI scheme, e.g. 'urn:' or 'http://')
CPathUtils.rebaseUris(providerModel, null, datasource.getIdentifier()+":");
log.info("Replacing conflicting URIs in {} before merging into Main...", datasource.getIdentifier());
replaceConflictingUris(providerModel, m);
save(providerModel, datasource);
log.info("Merging '{}' model into the Main BioPAX model...", datasource.getIdentifier());
simpleMerger.merge(m, providerModel);
}

//remove dangling SPEs (molecules that are neither participants nor components are not useful for pathway analyses...)
ModelUtils.removeObjectsIfDangling(m, SimplePhysicalEntity.class);
//now, remove dangling xrefs, CV et al. utility type individuals
ModelUtils.removeObjectsIfDangling(m, UtilityClass.class);

//m.repair(); //todo: check if we really need this call (unlikely)
save(m); //save the main model as rdfxml file
log.info("Merged, saved.");
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/cpath/web/PagesController.java
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ String robots() {
// but allow - to web page resources (css, js, images)
return "User-agent: *\n" +
"Disallow: /v2\n" +
"Disallow: /fetch\n" +
"Disallow: /get\n" +
"Disallow: /search\n" +
"Disallow: /graph\n" +
"Disallow: /top_pathways\n" +
Expand Down
2 changes: 1 addition & 1 deletion src/main/resources/metadata.json
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
},
{
"dataUrl": "classpath:test_mapping.zip",
"identifier": "TEST_MAPPING",
"identifier": "TESTMAPPING",
"homepageUrl": "https://www.ebi.ac.uk/unichem/",
"name": [
"UniChem"
Expand Down
Loading

0 comments on commit abdebe4

Please sign in to comment.