added functionality to download functional subsets

johanneswerner · Jun 29, 2019 · 00ea8f7 · 00ea8f7
1 parent d4ec2d3
commit 00ea8f7
Show file tree

Hide file tree

Showing 7 changed files with 156 additions and 13 deletions.
diff --git a/README.md b/README.md
@@ -8,14 +8,14 @@ unassembled-derived to build a consensus of these databases and increase the map
 
 ## Installation
 
-The easiest way is to use bioconda and create a new environment. 
+The easiest way is to use bioconda and create a new environment.
 
 ```bash
 conda create -n mpies --file conda_env.conf
 source activate mpies
 ```
 
-SingleM has been packaged by AppImage (due to the Python 2 dependency).  Download 
+SingleM has been packaged by AppImage (due to the Python 2 dependency).  Download
 [AppImage](https://github.com/AppImage/AppImageKit/releases) and build the image with
 
 ```bash
@@ -51,6 +51,27 @@ then a text file with the taxon names (one per line) is used for downloading the
 amplicon data is available, you can set the option `config["otu_table"]["run_singlem"]` to `true` and a taxon file is
 created with SingleM (this tool detects OTU abundances based on metagenome shotgun sequencing data).
 
+##### Functional-derived subset
+
+It is also possible to create a subset derived from UniProt based not only on taxonomy but to also restrict the
+gene and functional names instead of downloading the entire proteomes for the taxa of interest. To do so, a TOML file
+should be created (see example below)
+
+```toml
+Taxonomy = [
+    "Bacteria"
+]
+Gene_names = [
+     "dnaK",
+     "soxA"
+]
+Protein_names = [
+    "Heat shock protein 70", # something commented
+]
+```
+
+and the path needs to be set in the snakemake configuration (`config["functional_subset"]["toml_file"]`).
+
 ##### Assembled-derived proteome file
 
 If only raw data is available, it is possible to run an assembly with MEGAHIT or metaSPAdes (set
@@ -111,7 +132,7 @@ diamond makedb --threads <number_of_threads> --in nr.gz --db nr.dmnd
 
 4. Now you can set `config["taxonomy"]["run_taxonomy"]` to `true` and run `snakemake`. Remember to set the paths for the
 diamond database, the binary of `blast2lca` and the path to the file `prot_acc2tax-Jun2018X1.abin`. Please note that
-`diamond blastp` takes a very long time to execute. 
+`diamond blastp` takes a very long time to execute.
 
 ##### Functional annotation
 
@@ -179,4 +200,3 @@ Please note that input and output files must be/are compressed with gzip.
 The test data set is a subset from the Ocean Sampling Day (first 18,000 lines for each read file), Accession number
 ERR770958 obtained from https://www.ebi.ac.uk/ena/data/view/ERR770958). The data is deposited in the test_data
 directory of this repository.
-
diff --git a/database_creation.json b/database_creation.json
@@ -44,6 +44,10 @@
             "mode": "amplicon"
         }
     },
+    "functional_subset": {
+        "toml_file": "functional_subset.toml",
+        "mode": "function_subset"
+    },
     "postprocessing": {
         "remove_short_sequences": {
             "min_length": 2000
@@ -54,4 +58,3 @@
         }
     }
 }
-
diff --git a/database_creation.smk b/database_creation.smk
@@ -10,6 +10,10 @@ include:
     "rules/otu_table.smk"
 inputs.append("checkpoints/amplicon_proteome.done")
 
+include:
+    "rules/functional_subset.smk"
+inputs.append("checkpoints/functional_subset_proteome.done")
+
 include:
     "rules/assembled.smk"
 inputs.append("checkpoints/assembled_proteome.done")

diff --git a/main.py b/main.py
@@ -6,8 +6,8 @@
 import logging.config
 import os
 import sys
-from mptk import general_functions, hash_headers, parse_singlem, use_amplicon, subset_sequences, parse_taxonomy, \
-  parse_functions_cog, parse_functions_uniprot
+from mptk import general_functions, hash_headers, parse_singlem, use_amplicon, use_functional_subset, \
+  subset_sequences, parse_taxonomy, parse_functions_cog, parse_functions_uniprot
 
 
 def configure_logger(name, log_file, level="DEBUG"):
@@ -73,6 +73,8 @@ def main():
     subparser_singlem = subparsers.add_parser("parse_singlem", help="build genus list from singlem OTU table")
     subparser_amplicon = subparsers.add_parser("amplicon",
                                                help="use genus list (amplicons) or singlem (metagenome reads)")
+    subparser_functionsubset = subparser.add_parser("function_subset",
+                                                    help="use gene, protein and taxonomy name subset")
     subparser_hashing = subparsers.add_parser("hashing", help="hash fasta headers")
     subparser_subset_sequences = subparsers.add_parser("subset_sequences",
                                                        help="subsets sequences (only keeps identified proteins)")
@@ -112,6 +114,13 @@ def main():
     subparser_amplicon.add_argument("-t", "--taxonomy", action="store_true", dest="taxonomy", required=False,
                                     help="add taxonomic lineage to fasta header")
 
+    subparser_functionsubset.add_argument("-t", "--toml_file", action="store", dest="toml_file", required=True,
+                                          help="toml file with taxonomy, gene and protein names")
+    subparser_functionsubset.add_argument("-r", "--reviewed", action="store_true", dest="reviewed", required=False,
+                                          help="use unreviewed TrEMBL hits (default) or only reviewed SwissProt")
+    subparser_functionsubset.add_argument("-p", "--proteome_file", action="store", dest="proteome_file", required=True,
+                                    help="proteome file")
+
     subparser_hashing.add_argument("-p", "--proteome_file", action="store", dest="proteome_file", required=True,
                                    help="proteome input file")
     subparser_hashing.add_argument("-s", "--hashed_proteome_file", action="store", dest="hashed_file", required=True,
@@ -216,6 +225,11 @@ def main():
         use_amplicon.get_protein_sequences(tax_list=taxids, output_file=args.proteome_file, ncbi_tax_dict=tax_dict,
                                            reviewed=args.reviewed, add_taxonomy=args.taxonomy)
 
+    elif args.mode == "function_subset":
+        logger.info("creating functional subsets")
+        query_command = use_functional_subset.search_lists_to_query_url(args.toml_file)
+        use_amplicon.get_protein_sequences(tax_list=False, query=query_command, output_file=args.proteome_file, reviewed=args.reviewed, add_taxonomy=args.taxonomy)
+
     elif args.mode == "hashing":
         logger.info("hashing protein headers")
         hash_headers.write_hashed_protein_header_fasta_file(input_file=args.proteome_file, output_file=args.hashed_file,

diff --git a/mptk/use_amplicon.py b/mptk/use_amplicon.py
@@ -90,8 +90,8 @@ def add_taxonomy_to_fasta(fasta_file, ncbi_tax_dict):
     return None
 
 
-def get_protein_sequences(tax_list, output_file, ncbi_tax_dict, reviewed=False,
-                          add_taxonomy=True, remove_backup=True):
+def get_protein_sequences(tax_list, output_file, ncbi_tax_dict, query=None, reviewed=False,
+                          add_taxonomy=True):
     """
     Fetch the proteomes for all tax IDs.
 
@@ -102,6 +102,7 @@ def get_protein_sequences(tax_list, output_file, ncbi_tax_dict, reviewed=False,
     ----------
       tax_list: unique list with tax IDs
       output_file: output file for the downloaded protein sequences
+      query: query can be passed from the use_functional_subset module (if None, download entire proteome of taxa of interest)
       reviewed: use TrEMBL (False) or SwissProt (True)
 
     Returns
@@ -112,15 +113,18 @@ def get_protein_sequences(tax_list, output_file, ncbi_tax_dict, reviewed=False,
     logger.info("fetching protein sequences ...")
     filename = output_file
 
-    taxon_queries = ['taxonomy:"%s"' % tid for tid in tax_list]
-    taxon_query = ' OR '.join(taxon_queries)
     rev = " reviewed:%s" % reviewed if reviewed else ''
+    if not tax_list:
+        taxon_queries = ['taxonomy:"%s"' % tid for tid in tax_list]
+        taxon_query = ' OR '.join(taxon_queries)
+        query = "%s%s" % (taxon_query, rev)
 
     url = 'https://www.uniprot.org/uniprot/'
-    query = "%s%s" % (taxon_query, rev)
+
     params = {'query': query, 'force': 'yes', 'format': 'fasta'}
     data = urllib.parse.urlencode(params).encode("utf-8")
-    logger.info("Taxid: " + str(tax_list))
+    if not tax_list:
+        logger.info("Taxid: " + str(tax_list))
     msg = urllib.request.urlretrieve(url=url, filename=filename, data=data)[1]
     headers = {j[0]: j[1].strip() for j in [i.split(':', 1)
                                                 for i in str(msg).strip().splitlines()]}

diff --git a/mptk/use_functional_subset.py b/mptk/use_functional_subset.py
@@ -0,0 +1,64 @@
+#!/usr/bin/env python
+
+"""
+Get the gene names and protein names for certain taxonomic groups.
+
+The module uses a text file in TOML format to fetch all sequences from UniProt with certain gene or protein names and a certain taxonomy.
+"""
+
+import logging
+import toml
+
+logger = logging.getLogger("pies.use_functional_subset")
+
+
+def search_lists_to_query_url(toml_file, search_prots, search_genes, tax_limits):
+    """
+    Return the query to download the sequences of interest from UniProt.
+
+    Based on the list of protein, gene and taxonomic restrictions, the function creates a query that gets passed
+    to the UniProt API.
+
+    Parameters
+    ----------
+      search_prots: protein name subset
+      search_genes: exact gene name subset
+      tax_limits: taxonomy subset
+      toml_file: the TOML file passed by the user
+
+    Returns
+    -------
+      the query to pass to UniProt
+
+    """
+    toml_parsed = toml.load(toml_file)
+    tax_limits = toml_parsed["Taxonomy"]
+    search_prots = toml_parsed["Protein_names"]
+    search_genes = toml_parsed["Gene_names"]
+
+    return '( {taxlimit} ) AND ( {protquery} )'.format(
+        taxlimit=' OR '.join([ 'taxonomy:"%s"' % t for t in tax_limits ]),
+        protquery=' OR '.join(
+           [ 'name:"%s"' % n for n in search_prots ]
+           + [ 'gene_exact:"%s"' % g for g in search_genes ]
+        ),
+      )
+
+
+# mPies (metaProteomics in environmental sciences) creates annotated databases for metaproteomics analysis.
+# Copyright 2018 Johannes Werner (Leibniz-Institute for Baltic Sea Research)
+# Copyright 2018 Augustin Geron (University of Mons, University of Stirling)
+# Copyright 2018 Sabine Matallana Surget (University of Stirling)
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
diff --git a/rules/functional_subset.smk b/rules/functional_subset.smk
@@ -0,0 +1,34 @@
+rule obtain_functional_subset:
+    input:
+        expand("{sample}/functional_subset/functional_subset.toml", sample=config["sample"])
+    output:
+        expand("{sample}/functional_subset/functional_subset.faa", sample=config["sample"])
+    params:
+        mode=config["functional_subset"]["mode"]
+    shell:
+        "./main.py -v {params.mode} -t {input} -p {output}"
+
+rule get_functional_subset_done:
+    input:
+        expand("{sample}/functional_subset/functional_subset.faa", sample=config["sample"])
+    output:
+        touch("checkpoints/functional_subset.done")
+
+
+# mPies (metaProteomics in environmental sciences) creates annotated databases for metaproteomics analysis.
+# Copyright 2018 Johannes Werner (Leibniz-Institute for Baltic Sea Research)
+# Copyright 2018 Augustin Geron (University of Mons, University of Stirling)
+# Copyright 2018 Sabine Matallana Surget (University of Stirling)
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.