Deploy for release 0.3.3 from 4fef684

czbiohub-sf · Jan 30, 2023 · 62c2f10 · 62c2f10
1 parent 1c67f17
commit 62c2f10
Show file tree

Hide file tree

Showing 55 changed files with 1,756 additions and 1,033 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,23 @@
+# mspipeline 0.3.3
+
+## Minor changes
+
+* `convert/maxquant_to_h5ad`: Converted script to Python and implemented a (preliminary) unit test.
+
+* `maxquant/maxquant`: Added parameters:
+
+  - `--input_experiment`: Set the experiment label for a given input. Must be undefined or of the same length as `--input`.
+  - `--input_fraction`: Set the fraction for a given input. Must be undefined or of the same length as `--input`.
+  - `--input_ptm`: Set the PTM value for a given input. Must be undefined or of the same length as `--input`.
+  - `--dia_library_type`: Which type of DIA library to use. Must be "MaxQuant" or "tsv".
+  - `--dia_library`: Path to the DIA library.
+
+
+## Bug fixes
+
+* `maxquant/maxquant`: Fix issue where `lfq_mode` was not being passed to the jinja template.
+
+
 # mspipeline 0.3.2
 
 ## New functionality

diff --git a/src/convert/maxquant_to_h5ad/config.vsh.yaml b/src/convert/maxquant_to_h5ad/config.vsh.yaml
@@ -7,6 +7,10 @@ functionality:
       email: rcannood@gmail.com
       roles: [ maintainer ]
       props: { github: rcannood, orcid: "0000-0003-3641-729X" }
+    - name: Kenneth Verheggen
+      email: kenneth@data-intuitive.com
+      roles: [ maintainer ]
+      props: { github: KennethVerheggen }
   arguments:
     - name: "--input"
       type: file
@@ -20,8 +24,12 @@ functionality:
       example: output.h5ad
       description: The dataset as an AnnData file.
   resources:
-    - type: r_script
-      path: script.R
+    - type: python_script
+      path: script.py
+  test_resources:
+    - type: python_script
+      path: test.py
+    - path: ../../../resources_test/zenodo_4274987
 platforms:
   - type: docker
     image: dataintuitive/randpy:r4.0_py3.8_bioc3.12

diff --git a/src/convert/maxquant_to_h5ad/script.R b/src/convert/maxquant_to_h5ad/script.R
diff --git a/src/convert/maxquant_to_h5ad/script.py b/src/convert/maxquant_to_h5ad/script.py
@@ -0,0 +1,84 @@
+"""This Module is used to convert MaxQuant output to anndata"""
+import pandas as pd
+import anndata as ad
+
+## VIASH START
+# Default parameters used when this script is run as-is; both paths point
+# into the resources_test/zenodo_4274987 folder.
+# NOTE(review): presumably viash replaces everything between the START/END
+# markers with the real command-line arguments at build time — confirm.
+par = {
+    "input": "resources_test/zenodo_4274987/maxquant_out",
+    "output": "resources_test/zenodo_4274987/maxquant_out/output.h5ad",
+}
+## VIASH END
+
+
+# helper function for transforming column names in proteingroups
+# to snakecase
+def fix_headers(dataframe_old: pd.DataFrame) -> pd.DataFrame:
+    """Return a copy of the frame with sanitised headers and integer booleans.
+
+    Column names are lower-cased, every "+", "%" and space is replaced by
+    the literal text "and", and any remaining character outside
+    ``[a-z0-9_]`` is dropped. Afterwards ``False``/``True`` values are
+    mapped to ``0``/``1``.
+
+    Args:
+        dataframe_old: Table to clean up. It is deep-copied, so the caller's
+            object is left untouched.
+
+    Returns:
+        The cleaned copy of ``dataframe_old``.
+    """
+    dataframe = dataframe_old.copy(deep=True)
+
+    dataframe.columns = dataframe.columns.str.lower()
+
+    # (pattern, replacement, is_regex), applied in this order; the final
+    # regex strips whatever the literal replacements left behind.
+    replaces = (
+        ("+", "and", False),
+        ("%", "and", False),
+        (" ", "and", False),
+        ("[^a-z0-9_]*", "", True),
+    )
+    for old, new, use_regex in replaces:
+        dataframe.columns = dataframe.columns.str.replace(old, new, regex=use_regex)
+
+    #TODO figure out which are causing the issues
+    for column_name in dataframe.columns:
+        print(f"Removing booleans for : {column_name} ")
+    # replace() returns a new object instead of editing in place, so its
+    # result must be kept. Replacing on the whole frame (rather than
+    # assigning column by column) also stays correct if two headers were
+    # collapsed to the same name above.
+    return dataframe.replace([False, True], [0, 1])
+
+# helper function to collate layer data from proteingroups
+def get_layer_data(_protein_groups:pd.DataFrame, template:str,
+                    sample_ids:pd.DataFrame)->pd.DataFrame:
+    """Collect one per-sample column family into a samples-by-rows table.
+
+    Args:
+        _protein_groups: Table holding one column per sample for the
+            requested quantity.
+        template: Column-name pattern containing a ``{sample_id}``
+            placeholder, e.g. ``"Intensity {sample_id}"``.
+        sample_ids: Iterable of sample identifiers; iterating it must yield
+            the identifiers themselves (the caller passes a column of the
+            summary table).
+
+    Returns:
+        A frame with one row per entry of ``sample_ids`` (in that order) and
+        one column per row of ``_protein_groups``.
+
+    Raises:
+        KeyError: If a column built from ``template`` is missing.
+    """
+    headers = [template.format(sample_id=sample_id) for sample_id in sample_ids]
+    # Select by label in sample order. A boolean ``isin`` mask would keep the
+    # table's own column order, so relabelling with ``sample_ids`` could pin
+    # values to the wrong sample whenever the two orders differ.
+    dataframe = _protein_groups.loc[:, headers]
+    dataframe.columns = pd.Index(sample_ids)
+    return dataframe.transpose()
+
+# Load the summary table from the MaxQuant output folder
+summary = pd.read_table(f"{par['input']}/combined/txt/summary.txt")
+# Keep only the rows whose "Raw file" value does not contain "Total"
+# (the comparison against False is deliberate: it is the confirmed working form)
+summary_nt = summary[summary["Raw file"].str.contains("Total").eq(False)]
+# Load the protein group table
+protein_groups = pd.read_table(f"{par['input']}/combined/txt/proteinGroups.txt")
+
+# Layer name -> column-name pattern in proteinGroups.txt (hardcoded)
+#TODO evaluate strategy (alternative = dynamically load column headers)
+templates = {
+    "peptides": "Peptides {sample_id}",
+    "razor_and_unique_peptides": "Razor + unique peptides {sample_id}",
+    "unique_peptides": "Unique peptides {sample_id}",
+    "sequence_coverage": "Sequence coverage {sample_id} [%]",
+    "intensity": "Intensity {sample_id}",
+}
+
+# Sample identifiers come from the "Experiment" column of the summary
+sampleIDs = summary_nt.loc[:, "Experiment"]
+
+# One samples-by-proteins table per template
+layers = {
+    layer_name: get_layer_data(protein_groups, pattern, sampleIDs)
+    for layer_name, pattern in templates.items()
+}
+
+# Sample metadata becomes the observations ...
+obs = fix_headers(summary_nt)
+# ... and the protein group table becomes the variable metadata
+var = fix_headers(protein_groups)
+
+# Build the AnnData container without a main matrix, then attach the layers
+adata = ad.AnnData(X=None, obs=obs, var=var)
+for layer_name, layer_data in layers.items():
+    adata.layers[layer_name] = layer_data
+
+# Write the result to the requested output path
+adata.write_h5ad(par["output"])
diff --git a/src/convert/maxquant_to_h5ad/test.py b/src/convert/maxquant_to_h5ad/test.py
@@ -0,0 +1,41 @@
+from unittest import main, TestCase
+import subprocess
+from pathlib import Path
+import anndata as ad
+
+## VIASH START
+# Defaults for running this test file directly.
+# NOTE(review): presumably viash replaces this section when it runs the
+# test — confirm.
+meta = {
+    'executable': './target/docker/convert/maxquant_to_h5ad',
+    'resources_dir': './resources_test/zenodo_4274987',
+}
+## VIASH END
+
+# File name the component is asked to write its AnnData output to
+target = "output.h5ad"
+resources_dir, executable = meta["resources_dir"], meta["executable"]
+# Path of the output file inside the resources directory. The original
+# f-string ended in a stray "]", which yielded ".../output.h5ad]".
+conversion_output = f"{resources_dir}/{target}"
+
+class TestMaxQuantToHAD(TestCase):
+    """Runs the component executable and inspects the h5ad file it writes."""
+
+    def _run_and_check_output(self, args_as_list, expected_raise=False):
+        """Call the executable with ``args_as_list`` appended.
+
+        A failing call always re-raises ``CalledProcessError``; the captured
+        output (stdout and stderr combined) is printed first unless the
+        failure was announced via ``expected_raise``.
+        """
+        command = [meta['executable']] + args_as_list
+        try:
+            subprocess.check_output(command, stderr=subprocess.STDOUT)
+        except subprocess.CalledProcessError as e:
+            if not expected_raise:
+                print(e.stdout.decode("utf-8"))
+            raise e
+
+    def test_maxquant_convert(self):
+        """Convert the example MaxQuant output and check the resulting file."""
+        cli_args = [
+            "--input", "zenodo_4274987/maxquant_out",
+            "--output", target,
+        ]
+        self._run_and_check_output(cli_args)
+        self.assertTrue(Path(target).is_file())
+        converted_data = ad.read_h5ad(target)
+
+        # All configured data layers must be present in the output
+        expected_layers = [
+            'intensity',
+            'peptides',
+            'razor_and_unique_peptides',
+            'sequence_coverage',
+            'unique_peptides',
+        ]
+        self.assertListEqual(list(converted_data.layers), expected_layers)
+        # Two observations are expected: Sample1 & Sample2
+        self.assertEqual(len(converted_data.obs), 2)
+        # The example data should yield 270 entries in var (protein groups)
+        self.assertEqual(len(converted_data.var), 270)
+        #TODO is it worth testing further in depth?
+
+# Start the unittest runner when this file is executed as a script.
+if __name__ == "__main__":
+    main()
diff --git a/src/maxquant/maxquant/config.vsh.yaml b/src/maxquant/maxquant/config.vsh.yaml
@@ -19,6 +19,41 @@ functionality:
           description: One or more Thermo Raw files.
           # todo: also support mzml and other input formats supported by MaxQuant?
           example: input.raw
+        - name: "--input_experiment"
+          type: string
+          required: false
+          multiple: true
+          multiple_sep: ";"
+          description: Experiment labels. Must be undefined or of the same length as `--input`.
+          example: input
+        - name: "--input_fraction"
+          type: integer
+          required: false
+          multiple: true
+          multiple_sep: ";"
+          description: Input fractions. Must be undefined or of the same length as `--input`.
+        - name: "--input_ptm"
+          type: boolean
+          required: false
+          multiple: true
+          multiple_sep: ";"
+          description: Input PTMs. Must be undefined or of the same length as `--input`.
+        # different parameter groups are not allowed (for now).
+        # if needed, do multiple separate runs.
+        # if still needed, change code
+        # - name: "--input_group_indices"
+        #   type: boolean
+        #   required: false
+        #   multiple: true
+        #   multiple_sep: ";"
+        #   description: Parameter group index.
+        # specifying reference channels not supported (for now)
+        # - name: "--input_reference_channel"
+        #   type: boolean
+        #   required: false
+        #   multiple: true
+        #   multiple_sep: ";"
+        #   description: Reference channel.
     - name: Outputs
       arguments:
         - name: "--output"
@@ -113,6 +148,17 @@ functionality:
             
             The use of an experimental design so specify which LC-MS runs or groups of LC-MS runs correspond to the different samples is obligatory here. 
             The output of the label free algorithm can be found in the proteinGroups table in the columns starting with 'LFQ Intensity'.
+        - name: "--dia_library_type"
+          type: string
+          choices: ["MaxQuant", "tsv"]
+          default: "tsv"
+          description: Which type of DIA library to use.
+        - name: "--dia_library"
+          type: file
+          multiple: true
+          multiple_sep: ";"
+          example: "path/to/library.tsv"
+          description: Which DIA library to use.
     - name: Identification
       description: Arguments listed in the MaxQuant GUI under "Global parameters > Identifications"
       arguments: