diff --git a/README.md b/README.md index 6e174e8..fa7c289 100644 --- a/README.md +++ b/README.md @@ -8,14 +8,14 @@ unassembled-derived to build a consensus of these databases and increase the map ## Installation -The easiest way is to use bioconda and create a new environment. +The easiest way is to use bioconda and create a new environment. ```bash conda create -n mpies --file conda_env.conf source activate mpies ``` -SingleM has been packaged by AppImage (due to the Python 2 dependency). Download +SingleM has been packaged by AppImage (due to the Python 2 dependency). Download [AppImage](https://github.com/AppImage/AppImageKit/releases) and build the image with ```bash @@ -51,6 +51,27 @@ then a text file with the taxon names (one per line) is used for downloading the amplicon data is available, you can set the option `config["otu_table"]["run_singlem"]` to `true` and a taxon file is created with SingleM (this tool detects OTU abundances based on metagenome shotgun sequencing data). +##### Functional-derived subset + +It is also possible to create a subset derived from UniProt based not only on taxonomy but to also restrict the +gene and functional names instead of downloading the entire proteomes for the taxa of interest. To do so, a TOML file +should be created (see example below) + +```toml +Taxonomy = [ + "Bacteria" +] +Gene_names = [ + "dnaK", + "soxA" +] +Protein_names = [ + "Heat shock protein 70", # something commented +] +``` + +and the path needs to be set in the snakemake configuration (`config["functional_subset"]["toml_file"]`). + ##### Assembled-derived proteome file If only raw data is available, it is possible to run an assembly with MEGAHIT or metaSPAdes (set @@ -111,7 +132,7 @@ diamond makedb --threads --in nr.gz --db nr.dmnd 4. Now you can set `config["taxonomy"]["run_taxonomy"]` to `true` and run `snakemake`. Remember to set the paths for the diamond database, the binary of `blast2lca` and the path to the file `prot_acc2tax-Jun2018X1.abin`. Please note that -`diamond blastp` takes a very long time to execute. +`diamond blastp` takes a very long time to execute. ##### Functional annotation @@ -179,4 +200,3 @@ Please note that input and output files must be/are compressed with gzip. The test data set is a subset from the Ocean Sampling Day (first 18,000 lines for each read file), Accession number ERR770958 obtained from https://www.ebi.ac.uk/ena/data/view/ERR770958). The data is deposited in the test_data directory of this repository. - diff --git a/database_creation.json b/database_creation.json index 96d1be2..0c39b30 100644 --- a/database_creation.json +++ b/database_creation.json @@ -44,6 +44,10 @@ "mode": "amplicon" } }, + "functional_subset": { + "toml_file": "functional_subset.toml", + "mode": "function_subset" + }, "postprocessing": { "remove_short_sequences": { "min_length": 2000 @@ -54,4 +58,3 @@ } } } - diff --git a/database_creation.smk b/database_creation.smk index 7385f16..516b350 100644 --- a/database_creation.smk +++ b/database_creation.smk @@ -10,6 +10,10 @@ include: "rules/otu_table.smk" inputs.append("checkpoints/amplicon_proteome.done") +include: + "rules/functional_subset.smk" +inputs.append("checkpoints/functional_subset_proteome.done") + include: "rules/assembled.smk" inputs.append("checkpoints/assembled_proteome.done") diff --git a/main.py b/main.py index 734559a..ffa5460 100755 --- a/main.py +++ b/main.py @@ -6,8 +6,8 @@ import logging.config import os import sys -from mptk import general_functions, hash_headers, parse_singlem, use_amplicon, subset_sequences, parse_taxonomy, \ - parse_functions_cog, parse_functions_uniprot +from mptk import general_functions, hash_headers, parse_singlem, use_amplicon, use_functional_subset, \ + subset_sequences, parse_taxonomy, parse_functions_cog, parse_functions_uniprot def configure_logger(name, log_file, level="DEBUG"): @@ -73,6 +73,8 @@ def main(): subparser_singlem = subparsers.add_parser("parse_singlem", help="build genus list from singlem OTU table") subparser_amplicon = subparsers.add_parser("amplicon", help="use genus list (amplicons) or singlem (metagenome reads)") + subparser_functionsubset = subparser.add_parser("function_subset", + help="use gene, protein and taxonomy name subset") subparser_hashing = subparsers.add_parser("hashing", help="hash fasta headers") subparser_subset_sequences = subparsers.add_parser("subset_sequences", help="subsets sequences (only keeps identified proteins)") @@ -112,6 +114,13 @@ def main(): subparser_amplicon.add_argument("-t", "--taxonomy", action="store_true", dest="taxonomy", required=False, help="add taxonomic lineage to fasta header") + subparser_functionsubset.add_argument("-t", "--toml_file", action="store", dest="toml_file", required=True, + help="toml file with taxonomy, gene and protein names") + subparser_functionsubset.add_argument("-r", "--reviewed", action="store_true", dest="reviewed", required=False, + help="use unreviewed TrEMBL hits (default) or only reviewed SwissProt") + subparser_functionsubset.add_argument("-p", "--proteome_file", action="store", dest="proteome_file", required=True, + help="proteome file") + subparser_hashing.add_argument("-p", "--proteome_file", action="store", dest="proteome_file", required=True, help="proteome input file") subparser_hashing.add_argument("-s", "--hashed_proteome_file", action="store", dest="hashed_file", required=True, @@ -216,6 +225,11 @@ def main(): use_amplicon.get_protein_sequences(tax_list=taxids, output_file=args.proteome_file, ncbi_tax_dict=tax_dict, reviewed=args.reviewed, add_taxonomy=args.taxonomy) + elif args.mode == "function_subset": + logger.info("creating functional subsets") + query_command = use_functional_subset.search_lists_to_query_url(args.toml_file) + use_amplicon.get_protein_sequences(tax_list=False, query=query_command, output_file=args.proteome_file, reviewed=args.reviewed, add_taxonomy=args.taxonomy) + elif args.mode == "hashing": logger.info("hashing protein headers") hash_headers.write_hashed_protein_header_fasta_file(input_file=args.proteome_file, output_file=args.hashed_file, diff --git a/mptk/use_amplicon.py b/mptk/use_amplicon.py index 2bc8c09..e58f81f 100644 --- a/mptk/use_amplicon.py +++ b/mptk/use_amplicon.py @@ -90,8 +90,8 @@ def add_taxonomy_to_fasta(fasta_file, ncbi_tax_dict): return None -def get_protein_sequences(tax_list, output_file, ncbi_tax_dict, reviewed=False, - add_taxonomy=True, remove_backup=True): +def get_protein_sequences(tax_list, output_file, ncbi_tax_dict, query=None, reviewed=False, + add_taxonomy=True): """ Fetch the proteomes for all tax IDs. @@ -102,6 +102,7 @@ def get_protein_sequences(tax_list, output_file, ncbi_tax_dict, reviewed=False, ---------- tax_list: unique list with tax IDs output_file: output file for the downloaded protein sequences + query: query can be passed from the use_functional_subset module (if None, download entire proteome of taxa of interest) reviewed: use TrEMBL (False) or SwissProt (True) Returns @@ -112,15 +113,18 @@ def get_protein_sequences(tax_list, output_file, ncbi_tax_dict, reviewed=False, logger.info("fetching protein sequences ...") filename = output_file - taxon_queries = ['taxonomy:"%s"' % tid for tid in tax_list] - taxon_query = ' OR '.join(taxon_queries) rev = " reviewed:%s" % reviewed if reviewed else '' + if not tax_list: + taxon_queries = ['taxonomy:"%s"' % tid for tid in tax_list] + taxon_query = ' OR '.join(taxon_queries) + query = "%s%s" % (taxon_query, rev) url = 'https://www.uniprot.org/uniprot/' - query = "%s%s" % (taxon_query, rev) + params = {'query': query, 'force': 'yes', 'format': 'fasta'} data = urllib.parse.urlencode(params).encode("utf-8") - logger.info("Taxid: " + str(tax_list)) + if not tax_list: + logger.info("Taxid: " + str(tax_list)) msg = urllib.request.urlretrieve(url=url, filename=filename, data=data)[1] headers = {j[0]: j[1].strip() for j in [i.split(':', 1) for i in str(msg).strip().splitlines()]} diff --git a/mptk/use_functional_subset.py b/mptk/use_functional_subset.py new file mode 100644 index 0000000..4ab56f8 --- /dev/null +++ b/mptk/use_functional_subset.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python + +""" +Get the gene names and protein names for certain taxonomic groups. + +The module uses a text file in TOML format to fetch all sequences from UniProt with certain gene or protein names and a certain taxonomy. +""" + +import logging +import toml + +logger = logging.getLogger("pies.use_functional_subset") + + +def search_lists_to_query_url(toml_file, search_prots, search_genes, tax_limits): + """ + Return the query to download the sequences of interest from UniProt. + + Based on the list of protein, gene and taxonomic restrictions, the function creates a query that gets passed + to the UniProt API. + + Parameters + ---------- + search_prots: protein name subset + search_genes: exact gene name subset + tax_limits: taxonomy subset + toml_file: the TOML file passed by the user + + Returns + ------- + the query to pass to UniProt + + """ + toml_parsed = toml.load(toml_file) + tax_limits = toml_parsed["Taxonomy"] + search_prots = toml_parsed["Protein_names"] + search_genes = toml_parsed["Gene_names"] + + return '( {taxlimit} ) AND ( {protquery} )'.format( + taxlimit=' OR '.join([ 'taxonomy:"%s"' % t for t in tax_limits ]), + protquery=' OR '.join( + [ 'name:"%s"' % n for n in search_prots ] + + [ 'gene_exact:"%s"' % g for g in search_genes ] + ), + ) + + +# mPies (metaProteomics in environmental sciences) creates annotated databases for metaproteomics analysis. +# Copyright 2018 Johannes Werner (Leibniz-Institute for Baltic Sea Research) +# Copyright 2018 Augustin Geron (University of Mons, University of Stirling) +# Copyright 2018 Sabine Matallana Surget (University of Stirling) +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . diff --git a/rules/functional_subset.smk b/rules/functional_subset.smk new file mode 100644 index 0000000..60e457a --- /dev/null +++ b/rules/functional_subset.smk @@ -0,0 +1,34 @@ +rule obtain_functional_subset: + input: + expand("{sample}/functional_subset/functional_subset.toml", sample=config["sample"]) + output: + expand("{sample}/functional_subset/functional_subset.faa", sample=config["sample"]) + params: + mode=config["functional_subset"]["mode"] + shell: + "./main.py -v {params.mode} -t {input} -p {output}" + +rule get_functional_subset_done: + input: + expand("{sample}/functional_subset/functional_subset.faa", sample=config["sample"]) + output: + touch("checkpoints/functional_subset.done") + + +# mPies (metaProteomics in environmental sciences) creates annotated databases for metaproteomics analysis. +# Copyright 2018 Johannes Werner (Leibniz-Institute for Baltic Sea Research) +# Copyright 2018 Augustin Geron (University of Mons, University of Stirling) +# Copyright 2018 Sabine Matallana Surget (University of Stirling) +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see .