Merge pull request #203 from julianu/main

Updating jmzIdentML and adding Comet TSV parser
medbioinf · Dec 15, 2023 · 8817dfb · 8817dfb
2 parents e890b41 + e041584
commit 8817dfb
Show file tree

Hide file tree

Showing 7 changed files with 5,738 additions and 13 deletions.
diff --git a/pom.xml b/pom.xml
@@ -4,7 +4,7 @@
 
 	<groupId>de.mpc.pia</groupId>
 	<artifactId>pia</artifactId>
-	<version>1.4.10</version>
+	<version>1.5.0</version>
 	<name>PIA - Protein Inference Algorithms</name>
 	<url>https://github.com/mpc-bioinformatics/pia</url>
 
@@ -43,9 +43,9 @@
 		<junit.version>4.13.2</junit.version>
 		<commons-collections.version>4.4</commons-collections.version>
 		<commons-text.version>1.11.0</commons-text.version>
-		<jmzidentml.version>1.2.11</jmzidentml.version>
+		<jmzidentml.version>1.2.13</jmzidentml.version>
 		<jmztab.version>3.0.8</jmztab.version>
-		<pride-mod.version>2.1.8</pride-mod.version>
+		<pride-mod.version>2.1.12</pride-mod.version>
 		<pride-jaxb.version>1.0.22</pride-jaxb.version>
 		<xxindex.version>0.23</xxindex.version>
 		<mascotdatfile.version>3.6.1</mascotdatfile.version>
@@ -147,6 +147,12 @@
 			<artifactId>pride-mod</artifactId>
 			<version>${pride-mod.version}</version>
 		</dependency>
+
+		<dependency>
+			<groupId>it.unimi.dsi</groupId>
+			<artifactId>fastutil</artifactId>
+			<version>8.5.12</version>
+		</dependency>
 		<!-- End pride mod dependency -->
 
 		<!-- mzTab dependencies -->
@@ -327,6 +333,9 @@
 						<exclude>src/test/*.class</exclude>
 					</excludes>
 					<archive>
+						<manifestEntries>
+							<Add-Opens>java.base/sun.reflect.annotation</Add-Opens>
+						</manifestEntries>
 						<index>true</index>
 						<manifest>
 							<!-- Adds the classpath to the created manifest -->
@@ -384,7 +393,7 @@
 				<artifactId>maven-surefire-plugin</artifactId>
 				<version>3.2.2</version>
 				<configuration>
-					<argLine>${argLine} -Xmx2560m</argLine>
+					<argLine>${argLine} -Xmx2560m --add-opens java.base/sun.reflect.annotation=ALL-UNNAMED</argLine>
 				</configuration>
 			</plugin>
 		</plugins>

diff --git a/src/main/java/de/mpc/pia/intermediate/compiler/parser/InputFileParserFactory.java b/src/main/java/de/mpc/pia/intermediate/compiler/parser/InputFileParserFactory.java
@@ -9,6 +9,7 @@
 import org.apache.logging.log4j.Logger;
 
 import de.mpc.pia.intermediate.compiler.PIACompiler;
+import de.mpc.pia.intermediate.compiler.parser.searchengines.CometTSVFileParser;
 import de.mpc.pia.intermediate.compiler.parser.searchengines.MascotDatFileParser;
 import de.mpc.pia.intermediate.compiler.parser.searchengines.TandemFileParser;
 import de.mpc.pia.intermediate.compiler.parser.searchengines.ThermoMSFFileParser;
@@ -20,6 +21,38 @@ public class InputFileParserFactory {
 	private static final Logger LOGGER = LogManager.getLogger();
 
     public enum InputFileTypes {
+
+        /**
+         * the input file is a Comet TSV file; type check and parsing are delegated to {@link CometTSVFileParser}
+         */
+        COMET_TSV_INPUT {
+            @Override
+            public String getFileSuffix() {
+                return "txt";
+            }
+
+            @Override
+            public String getFileTypeName() {
+                return "Comet TSV";
+            }
+
+            @Override
+            public String getFileTypeShort() {
+                return "comet";
+            }
+
+            @Override
+            public boolean checkFileType(String fileName) {
+                return CometTSVFileParser.checkFileType(fileName); // the decision is made entirely by the parser class
+            }
+
+            @Override
+            public boolean parseFile(String name, String fileName,
+                    PIACompiler compiler, String additionalInfoFileName) {
+                return CometTSVFileParser.getDataFromCometTSVFile(name, fileName, compiler); // additionalInfoFileName is not forwarded
+            }
+        },
+
         /**
          * the input file is a FASTA database file
          */

diff --git a/src/main/java/de/mpc/pia/intermediate/compiler/parser/MzIdentMLFileParser.java b/src/main/java/de/mpc/pia/intermediate/compiler/parser/MzIdentMLFileParser.java
@@ -135,6 +135,12 @@ private boolean parseFile(String name, String fileName) {
 
         // get the AnalysisCollection:SpectrumIdentification for the SpectrumIdentificationLists
         AnalysisCollection analysisCollection = unmarshaller.unmarshal(AnalysisCollection.class);
+
+        LOGGER.debug("scanning analysisCollection: " + analysisCollection
+        		+ "\n\tgetSpectrumIdentification " + analysisCollection.getSpectrumIdentification()
+        		+ "\n\tgetProteinDetection " + analysisCollection.getProteinDetection()
+        		);
+
         for (SpectrumIdentification si : analysisCollection.getSpectrumIdentification()) {
             if (specIdLists.keySet().contains(si.getSpectrumIdentificationListRef())) {
                 // if the SpectrumIdentification's SpectrumIdentificationList is in the file, we need the SpectrumIdentification
@@ -165,8 +171,6 @@ private boolean parseFile(String name, String fileName) {
                     spectraDataRefs.put(ref, sd);
                 });
 
-        LOGGER.debug("Number of spectraData in inputs: " + inputs.getSpectraData().size());
-
         // get the necessary inputs:SearchDBs
         inputs.getSearchDatabase().stream()
                 .filter(searchDB -> neededSearchDatabases.contains(searchDB.getId()))
@@ -189,23 +193,37 @@ private boolean parseFile(String name, String fileName) {
         // update the PIAFile's references for SpectraData, SearchDBs and AnalysisSoftwares
         file.updateReferences(spectraDataRefs, searchDBRefs, analysisSoftwareRefs);
 
-        // get/hash the SequenceCollection:PeptideEvidences
         SequenceCollection sc = unmarshaller.unmarshal(SequenceCollection.class);
-        peptideEvidences = new HashMap<>();
-        for (PeptideEvidence pepEvidence : sc.getPeptideEvidence()) {
-            peptideEvidences.put(pepEvidence.getId(), pepEvidence);
-        }
 
         // get/hash the SequenceCollection:DBSequences
         dbSequences = new HashMap<>();
         for (DBSequence dbSeq : sc.getDBSequence()) {
             dbSequences.put(dbSeq.getId(), dbSeq);
+
+            LOGGER.debug("added dbSequence: " + dbSeq.getId() + " -> " + dbSequences.get(dbSeq.getId()));
         }
 
         // get/hash the SequenceCollection:Peptides
         peptides = new HashMap<>();
         for (uk.ac.ebi.jmzidml.model.mzidml.Peptide peptide: sc.getPeptide()) {
             peptides.put(peptide.getId(), peptide);
+
+            LOGGER.debug("added peptide: " + peptide.getId()
+            		+ " -> " + peptides.get(peptide.getId())
+            		+ "\n\tpeptideSequence " + peptide.getPeptideSequence()
+    		);
+        }
+
+        // get/hash the SequenceCollection:PeptideEvidences
+        peptideEvidences = new HashMap<>();
+        for (PeptideEvidence pepEvidence : sc.getPeptideEvidence()) {
+            peptideEvidences.put(pepEvidence.getId(), pepEvidence);
+
+            LOGGER.debug("added pepEvidence: " + pepEvidence.getId()
+            		+ " -> " + peptideEvidences.get(pepEvidence.getId())
+            		+ "\n\tdbSequenceRef " + pepEvidence.getDBSequenceRef()
+            		+ "\n\tdbSequence " + pepEvidence.getDBSequence()
+            		);
         }
 
 
@@ -667,7 +685,8 @@ private Peptide parseSIIPeptideEvidences(List<PeptideEvidenceRef> peptideEvidenc
 
             DBSequence dbSeq = dbSequences.get(pepEvidence.getDBSequenceRef());
             if (dbSeq == null) {
-                LOGGER.error("DBSequence " + pepEvidence.getDBSequenceRef() + " not found!");
+                LOGGER.error("DBSequence " + pepEvidence.getDBSequenceRef()
+                		+ " for pepEvidence " + pepEvidence.getId() + " not found!");
                 return null;
             }