From 93b5640a2438e0f8f0e9bdfe503b630468d0e4a1 Mon Sep 17 00:00:00 2001 From: Jose Duarte Date: Sun, 15 Apr 2018 12:40:09 -0700 Subject: [PATCH] Fix for #205 and tests --- .../src/main/java/eppic/DataModelAdaptor.java | 101 ++++++++++-- eppic-cli/src/main/java/eppic/Main.java | 6 +- .../test/java/eppic/TestLargeStructures.java | 152 ++++++++++++++++++ 3 files changed, 243 insertions(+), 16 deletions(-) diff --git a/eppic-cli/src/main/java/eppic/DataModelAdaptor.java b/eppic-cli/src/main/java/eppic/DataModelAdaptor.java index 39f4030f9..59e5556a5 100644 --- a/eppic-cli/src/main/java/eppic/DataModelAdaptor.java +++ b/eppic-cli/src/main/java/eppic/DataModelAdaptor.java @@ -88,7 +88,7 @@ public class DataModelAdaptor { public static final int INVALID_ASSEMBLY_ID = 0; private PdbInfoDB pdbInfo; - + private EppicParams params; private RunParametersDB runParameters; @@ -170,7 +170,10 @@ public void setPdbMetadata(Structure pdb) { pdbInfo.setCellBeta(cc.getBeta()); pdbInfo.setCellGamma(cc.getGamma()); } - + + } + + public void setChainClustersData(Structure pdb, Map chainOrigNames) { List chainClusterDBs = new ArrayList(); for (EntityInfo compound:pdb.getEntityInfos()) { @@ -180,12 +183,12 @@ public void setPdbMetadata(Structure pdb) { // in mmCIF files some sugars are annotated as compounds with no chains linked to them, e.g. 3s26 if (compound.getChains().isEmpty()) continue; - chainClusterDBs.add(createChainCluster(compound)); + chainClusterDBs.add(createChainCluster(compound, chainOrigNames)); } } pdbInfo.setNumChainClusters(chainClusterDBs.size()); pdbInfo.setChainClusters(chainClusterDBs); - + initAsymIds2chainIdsMap(pdb); } @@ -196,7 +199,8 @@ public void setPdbMetadata(Structure pdb) { *

* Note that the map should work in most cases, but it's not guaranteed because there is a one-to-many * relationship between author chain ids and asym ids (internal ids). This is the best we can do - * with the data available from Biojava 4.2 + * with the data available from Biojava 4.2 + * TODO check if we still need with BioJava 5 * @param pdb the structure */ private void initAsymIds2chainIdsMap(Structure pdb) { @@ -206,15 +210,15 @@ private void initAsymIds2chainIdsMap(Structure pdb) { asymIds2chainIds.put(c.getId(), c.getName()); } } - - private ChainClusterDB createChainCluster(EntityInfo compound) { + + private ChainClusterDB createChainCluster(EntityInfo compound, Map chainOrigNames) { ChainClusterDB chainClusterDB = new ChainClusterDB(); chainClusterDB.setPdbCode(pdbInfo.getPdbCode()); chainClusterDB.setRepChain(compound.getRepresentative().getName()); - chainClusterDB.setMemberChains(getMemberChainsString(compound)); - chainClusterDB.setNumMembers(compound.getChainIds().size()); + chainClusterDB.setMemberChains(getMemberChainsString(compound, chainOrigNames)); + chainClusterDB.setNumMembers(getUniqueChainNames(compound, chainOrigNames).size()); chainClusterDB.setProtein(compound.getRepresentative().isProtein()); chainClusterDB.setPdbInfo(pdbInfo); @@ -299,9 +303,9 @@ private List getGroups(EntityInfo compound) { } public void setInterfaces(StructureInterfaceList interfaces) { - - List interfaceClusters = interfaces.getClusters(EppicParams.CLUSTERING_CONTACT_OVERLAP_SCORE_CUTOFF); + List interfaceClusters = reduceToNcsUnique(interfaces); + List icDBs = new ArrayList(); for (StructureInterfaceCluster ic:interfaceClusters) { InterfaceClusterDB icDB = new InterfaceClusterDB(); @@ -452,6 +456,56 @@ public int compare(ContactDB first, ContactDB second) { pdbInfo.setMaxNumClashesAnyInterface(Collections.max(numClashesPerInterface)); } + + private List reduceToNcsUnique(StructureInterfaceList interfaces) { + List clusters = interfaces.getClusters(EppicParams.CLUSTERING_CONTACT_OVERLAP_SCORE_CUTOFF); + + if (!pdbInfo.isNcsOpsPresent()) { + // no NCS case (normal case), return clusters as is + return clusters; + } + + // NCS case. We need to reduce to the unique-to-NCS set + List interfaceClustersNcs = interfaces.getClustersNcs(); + + List reduced = new ArrayList<>(); + for (StructureInterfaceCluster cluster : clusters) { + Set indices = new TreeSet<>(); + for (StructureInterface interf : cluster.getMembers()) { + indices.add(getCorrespondingClustersIndex(interf, interfaceClustersNcs)); + } + + StructureInterfaceCluster reducedCluster = new StructureInterfaceCluster(); + reducedCluster.setId(cluster.getId()); + reducedCluster.setAverageScore(cluster.getAverageScore()); + for (int i : indices) { + // we add one interface per NCS interface cluster + StructureInterface interf = interfaceClustersNcs.get(i).getMembers().get(0); + if (interf.getCluster().getId() != reducedCluster.getId()) { + LOGGER.warn("Interface {} belongs to cluster {}. It should not be added to cluster id {}", + interf.getId(), interf.getCluster().getId(), reducedCluster.getId()); + } + reducedCluster.addMember(interf); + // we add also the new back-reference to the parent + interf.setCluster(reducedCluster); + } + + reduced.add(reducedCluster); + } + + return reduced; + } + + private static int getCorrespondingClustersIndex(StructureInterface interf, List interfaceClustersNcs) { + for (int i = 0; i< interfaceClustersNcs.size(); i++) { + for (StructureInterface s : interfaceClustersNcs.get(i).getMembers()) { + if (s.getId() == interf.getId()) { + return i; + } + } + } + return -1; + } public void setAssemblies(CrystalAssemblies validAssemblies) { @@ -792,6 +846,10 @@ public void setGeometryScores(List gps, List iril = new ArrayList(); @@ -1323,13 +1385,24 @@ public static String getChainClusterString(EntityInfo compound) { return sb.toString(); } - public static String getMemberChainsString(EntityInfo compound) { - + private Set getUniqueChainNames(EntityInfo compound, Map chainOrigNames) { List chains = compound.getChains(); Set uniqChainNames = new TreeSet<>(); for (Chain c : chains) { - uniqChainNames.add(c.getName()); + String chainName; + if(chainOrigNames!=null) { // will only be not null in cases with NCS ops + chainName = chainOrigNames.get(c.getName()); + } else { + chainName = c.getName(); + } + uniqChainNames.add(chainName); } + return uniqChainNames; + } + + private String getMemberChainsString(EntityInfo compound, Map chainOrigNames) { + + Set uniqChainNames = getUniqueChainNames(compound, chainOrigNames); StringBuilder sb = new StringBuilder(); int i = 0; diff --git a/eppic-cli/src/main/java/eppic/Main.java b/eppic-cli/src/main/java/eppic/Main.java index f1d6457d0..3672768a6 100644 --- a/eppic-cli/src/main/java/eppic/Main.java +++ b/eppic-cli/src/main/java/eppic/Main.java @@ -264,16 +264,18 @@ public void doFindInterfaces() throws EppicException { LOGGER.info("Calculating possible interfaces"); CrystalBuilder interfFinder; + Map chainOrigNames = null; if (modelAdaptor.getPdbInfo().isNcsOpsPresent()) { - Map chainOrigNames = new HashMap<>(); + chainOrigNames = new HashMap<>(); Map chainNcsOps = new HashMap<>(); CrystalBuilder.expandNcsOps(pdb,chainOrigNames,chainNcsOps); - modelAdaptor.setPdbMetadata(pdb); interfFinder = new CrystalBuilder(pdb,chainOrigNames,chainNcsOps); } else { interfFinder = new CrystalBuilder(pdb); } + modelAdaptor.setChainClustersData(pdb, chainOrigNames); + interfaces = interfFinder.getUniqueInterfaces(EppicParams.INTERFACE_DIST_CUTOFF); LOGGER.info("Calculating ASAs"); interfaces.calcAsas(params.getnSpherePointsASAcalc(), params.getNumThreads(), params.getMinSizeCofactorForAsa()); diff --git a/eppic-cli/src/test/java/eppic/TestLargeStructures.java b/eppic-cli/src/test/java/eppic/TestLargeStructures.java index c9ca099c8..1c28fa95a 100644 --- a/eppic-cli/src/test/java/eppic/TestLargeStructures.java +++ b/eppic-cli/src/test/java/eppic/TestLargeStructures.java @@ -1,6 +1,12 @@ package eppic; //import org.junit.Ignore; +import eppic.assembly.TestLatticeGraph; +import org.biojava.nbio.structure.Structure; +import org.biojava.nbio.structure.contact.StructureInterface; +import org.biojava.nbio.structure.contact.StructureInterfaceCluster; +import org.biojava.nbio.structure.contact.StructureInterfaceList; +import org.biojava.nbio.structure.xtal.CrystalBuilder; import org.junit.Test; import eppic.model.ChainClusterDB; @@ -8,10 +14,15 @@ import eppic.model.InterfaceDB; import eppic.model.PdbInfoDB; +import javax.vecmath.Matrix4d; + import static org.junit.Assert.*; import java.io.File; import java.io.IOException; +import java.util.HashMap; +import java.util.List; +import java.util.Map; /** * An integration test that makes sure that large structures are correctly handled @@ -51,6 +62,10 @@ public void test4v9e() throws IOException { m.run(params); PdbInfoDB pdbInfo = m.getDataModelAdaptor().getPdbInfo(); + + // the title should be set, this checks that DataModelAdaptor.setPdbMetadata worked + assertNotNull(pdbInfo.getTitle()); + assertTrue(pdbInfo.getTitle().length()>2); ChainClusterDB cc = pdbInfo.getChainCluster("AA"); assertEquals(36, cc.getNumMembers()); @@ -78,4 +93,141 @@ public void test4v9e() throws IOException { } + /** + * NCS output needs to be less redundant. + * Issue https://github.com/eppic-team/eppic/issues/205 + * @throws IOException + */ + @Test + public void test1auy() throws IOException { + + File outDir = new File(TMPDIR, "eppicTestLargeStructures"); + + outDir.mkdir(); + + assertTrue(outDir.isDirectory()); + + + String pdbId = "1auy"; + EppicParams params = Utils.generateEppicParams(pdbId, outDir); + + Main m = new Main(); + + m.run(params); + + PdbInfoDB pdbInfo = m.getDataModelAdaptor().getPdbInfo(); + + assertTrue(pdbInfo.isNcsOpsPresent()); + + assertEquals(1, pdbInfo.getNumChainClusters()); + ChainClusterDB ccdb = pdbInfo.getChainClusters().get(0); + assertEquals(3, ccdb.getNumMembers()); + + assertEquals(10, pdbInfo.getInterfaceClusters().size()); + + assertEquals(4, pdbInfo.getAssemblies().size()); + + // the cluster members should be reduced to NCS equivalents: it should be a low number + int count = 0; + for (InterfaceClusterDB interfCluster : pdbInfo.getInterfaceClusters()) { + assertTrue(interfCluster.size()<10); + assertTrue(interfCluster.getAvgContactOverlapScore() > 0); + for (InterfaceDB idb : interfCluster.getInterfaces()) { + // can't assert this, the n chains are still in some interfaces + //assertFalse(idb.getChain1().endsWith("n")); + assertEquals(interfCluster.getClusterId(), idb.getClusterId()); + count++; + } + } + + assertTrue(count<20); + + outDir.delete(); + + } + + /** + * As an extra test for NCS: some sanity checks that the grouping by NCS and clustering by contact + * overlap score are consistent with each other. + * @throws Exception + */ + @Test + public void testInterfaceNcsGrouping() throws Exception { + Structure s = TestLatticeGraph.getStructure("1auy"); + + Map chainOrigNames = new HashMap<>(); + Map chainNcsOps = new HashMap<>(); + CrystalBuilder.expandNcsOps(s,chainOrigNames,chainNcsOps); + CrystalBuilder cb = new CrystalBuilder(s,chainOrigNames,chainNcsOps); + + StructureInterfaceList interfaces = cb.getUniqueInterfaces(); + int spherePoints = StructureInterfaceList.DEFAULT_ASA_SPHERE_POINTS / 10; + interfaces.calcAsas(spherePoints, + Runtime.getRuntime().availableProcessors(), + StructureInterfaceList.DEFAULT_MIN_COFACTOR_SIZE); + interfaces.removeInterfacesBelowArea(); + + List full = interfaces.getClusters(EppicParams.CLUSTERING_CONTACT_OVERLAP_SCORE_CUTOFF); + List ncs = interfaces.getClustersNcs(); + + int idx = 0; + for (StructureInterfaceCluster c : ncs) { + int refId = 0; + int jdx = 0; + for (StructureInterface i : c.getMembers()) { + // it seems that ncs list does not filter for area (bug in biojava 5.0.0), this is a workaround + if (i.getTotalArea() clusters) { + for (StructureInterfaceCluster c : clusters) { + for (StructureInterface i : c.getMembers()) { + if (interf.getId() == i.getId()) return c; + } + } + return null; + } + +// private StructureInterface findCorrespondingInterf(StructureInterface interf, List clusters) { +// for (StructureInterfaceCluster c : clusters) { +// for (StructureInterface i : c.getMembers()) { +// if (interf.getId() == i.getId()) return i; +// } +// } +// return null; +// } }