diff --git a/config/mgmp_config.json b/config/mgmp_config.json index e4da6f7..55e07f8 100644 --- a/config/mgmp_config.json +++ b/config/mgmp_config.json @@ -1,5 +1,4 @@ { - "version": "2.0.0", "output": "output", "resources_directory": "resources_directory", "threads": 15, @@ -40,77 +39,31 @@ "Name": "" } ], - "minimum_read_length": 100, + "minimum_mg_read_length": 100, "minimum_read_average_quality": 20, + "rrna_database": "default", "do_assembly": true, - "max_memory": 10, + "max_memory": 100, "assembler": "megahit", - "error_model": "illumina_5", "markerset": "40", "do_binning": false, "do_iterative_binning": false, "upimapi_search_mode": "fast", - "upimapi_database": "uniprot", + "upimapi_database": "taxids", "upimapi_max_target_seqs": 1, - "upimapi_taxids": "196137,143067,2159,213465,213422,213421,68298", - "upimapi_check_db": false, - "uniprot_columns": [ - "Entry", - "Entry name", - "Gene names", - "Protein names", - "EC number", - "Function [CC]", - "Pathway", - "Keywords", - "Protein existence", - "Gene ontology (GO)", - "Protein families", - "Taxonomic lineage (SUPERKINGDOM)", - "Taxonomic lineage (PHYLUM)", - "Taxonomic lineage (CLASS)", - "Taxonomic lineage (ORDER)", - "Taxonomic lineage (FAMILY)", - "Taxonomic lineage (GENUS)", - "Taxonomic lineage IDs (GENUS)", - "Taxonomic lineage (SPECIES)", - "Taxonomic lineage IDs (SPECIES)", - "BioCyc", - "BRENDA", - "CDD", - "eggNOG", - "Ensembl", - "InterPro", - "KEGG", - "Pfam", - "Reactome", - "RefSeq", - "UniPathway" - ], - "download_cdd_resources": true, - "recognizer_databases": [ - "CDD", - "Pfam", - "NCBIfam", - "Protein_Clusters", - "Smart", - "TIGRFAM", - "COG", - "KOG" - ], + "upimapi_taxids": "2159,213465", "normalization_method": "TMM", - "minimum_differential_expression": 1, - "proteomics_workflow": "compomics", - "use_crap": true, - "proteomics_contaminants_database": "crap.fasta", + "minimum_differential_expression": 2, + "metaproteomics_add_reference_proteomes": true, "reference_proteomes_taxa_level": "SPECIES", "protease": "Trypsin", - "protease_file": "", "significance_threshold": 0.05, - "keggcharter_taxa_level": "SPECIES", + "imputation_method": "LLS", + "keggcharter_taxa_level": "GENUS", "keggcharter_number_of_taxa": 10, "keggcharter_maps": [ "00680" ], - "inside_container": false -} \ No newline at end of file + "inside_container": true +} + diff --git a/config/mgmt_config.json b/config/mgmt_config.json index 9b39727..3614372 100644 --- a/config/mgmt_config.json +++ b/config/mgmt_config.json @@ -1,5 +1,4 @@ { - "version": "2.0.0", "output": "output", "resources_directory": "resources_directory", "threads": 15, @@ -12,105 +11,54 @@ "Name": "" }, { - "Files": "MOSCARDO/input/mt1_R1.fq,MOSCARDO/input/mt1_R2.fq", + "Files": "MOSCARDO/input/mt1_R1.fq.gz,MOSCARDO/input/mt1_R2.fq.gz", "Sample": "sample", "Data type": "mrna", "Condition": "c1", "Name": "" }, { - "Files": "MOSCARDO/input/mt2_R1.fq,MOSCARDO/input/mt2_R2.fq", + "Files": "MOSCARDO/input/mt2_R1.fq.gz,MOSCARDO/input/mt2_R2.fq.gz", "Sample": "sample", "Data type": "mrna", "Condition": "c1", "Name": "" }, { - "Files": "MOSCARDO/input/mt3_R1.fq,MOSCARDO/input/mt3_R2.fq", + "Files": "MOSCARDO/input/mt3_R1.fq.gz,MOSCARDO/input/mt3_R2.fq.gz", "Sample": "sample", "Data type": "mrna", "Condition": "c2", "Name": "" }, { - "Files": "MOSCARDO/input/mt4_R1.fq,MOSCARDO/input/mt4_R2.fq", + "Files": "MOSCARDO/input/mt4_R1.fq.gz,MOSCARDO/input/mt4_R2.fq.gz", "Sample": "sample", "Data type": "mrna", "Condition": "c2", "Name": "" } ], - "minimum_read_length": 50, + "minimum_mg_read_length": 100, + "minimum_mt_read_length": 50, "minimum_read_average_quality": 20, + "rrna_database": "fast", "do_assembly": true, "max_memory": 100, "assembler": "metaspades", - "error_model": "illumina_5", "markerset": "40", "do_binning": false, "do_iterative_binning": false, "upimapi_search_mode": "fast", - "upimapi_database": "uniprot", + "upimapi_database": "taxids", "upimapi_max_target_seqs": 1, - "upimapi_taxids": "196137,143067,2159,213465,213422,213421,68298", - "upimapi_check_db": false, - "uniprot_columns": [ - "Entry", - "Entry name", - "Gene names", - "Protein names", - "EC number", - "Function [CC]", - "Pathway", - "Keywords", - "Protein existence", - "Gene ontology (GO)", - "Protein families", - "Taxonomic lineage (SUPERKINGDOM)", - "Taxonomic lineage (PHYLUM)", - "Taxonomic lineage (CLASS)", - "Taxonomic lineage (ORDER)", - "Taxonomic lineage (FAMILY)", - "Taxonomic lineage (GENUS)", - "Taxonomic lineage IDs (GENUS)", - "Taxonomic lineage (SPECIES)", - "Taxonomic lineage IDs (SPECIES)", - "BioCyc", - "BRENDA", - "CDD", - "eggNOG", - "Ensembl", - "InterPro", - "KEGG", - "Pfam", - "Reactome", - "RefSeq", - "UniPathway" - ], - "download_cdd_resources": true, - "recognizer_databases": [ - "CDD", - "Pfam", - "NCBIfam", - "Protein_Clusters", - "Smart", - "TIGRFAM", - "COG", - "KOG" - ], + "upimapi_taxids": "2159,213465", "normalization_method": "TMM", "minimum_differential_expression": 1, - "proteomics_workflow": "compomics", - "use_crap": true, - "proteomics_contaminants_database": "crap.fasta", - "reference_proteomes_taxa_level": "SPECIES", - "protease": "Trypsin", - "protease_file": "", - "significance_threshold": 0.05, - "keggcharter_taxa_level": "SPECIES", + "keggcharter_taxa_level": "GENUS", "keggcharter_number_of_taxa": 10, "keggcharter_maps": [ "00680" ], - "inside_container": false -} \ No newline at end of file + "inside_container": true +} diff --git a/resources/default_config.json b/resources/default_config.json index 9f0ccdd..a1da8b8 100644 --- a/resources/default_config.json +++ b/resources/default_config.json @@ -1,10 +1,12 @@ { - "version": "2.2.0", + "version": "2.3", "output": "output", "resources_directory": "resources_directory", "threads": 15, "experiments": [], - "minimum_read_length": 100, + "sortmerna_database": "default", + "minimum_mg_read_length": 100, + "minimum_mt_read_length": 100, "minimum_read_average_quality": 20, "do_assembly": true, "max_memory": 100, diff --git a/resources/wiki/.README b/resources/wiki/.README new file mode 100644 index 0000000..0516b34 --- /dev/null +++ b/resources/wiki/.README @@ -0,0 +1 @@ +Files for [MOSCA's wiki](https://github.com/iquasere/MOSCA/wiki). \ No newline at end of file diff --git a/workflow/rules/preprocess.smk b/workflow/rules/preprocess.smk index 70e502b..3d08e11 100644 --- a/workflow/rules/preprocess.smk +++ b/workflow/rules/preprocess.smk @@ -12,7 +12,9 @@ rule preprocess: reads = lambda wildcards: EXPS.loc[EXPS['Name'] == wildcards.name, 'Files'].iloc[0], resources_directory = config["resources_directory"], data_type = lambda wildcards: EXPS.loc[EXPS['Name'] == wildcards.name, 'Data type'].iloc[0], - minlen = config["minimum_read_length"], + rrna_db = config["sortmerna_database"], + mg_minlen = config["minimum_mg_read_length"], + mt_minlen= config["minimum_mt_read_length"], avgqual = config["minimum_read_average_quality"] conda: "../envs/preprocess.yaml" diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml index fbe2186..e5e8c49 100644 --- a/workflow/schemas/config.schema.yaml +++ b/workflow/schemas/config.schema.yaml @@ -35,7 +35,10 @@ experiments: required: [Files, Sample, "Data type"] minItems: 1 -minimum_read_length: +minimum_mg_read_length: + type: integer + +minimum_mt_read_length: type: integer minimum_read_average_quality: diff --git a/workflow/scripts/preprocess.py b/workflow/scripts/preprocess.py index a924558..ae82392 100644 --- a/workflow/scripts/preprocess.py +++ b/workflow/scripts/preprocess.py @@ -60,10 +60,9 @@ def download_resources(self, resources_directory): with open(f'{resources_directory}/downloaded_timestamp.txt', 'w') as f: f.write(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime())) - def get_adapters_dir(self): - return glob(f"{'/'.join(check_output('which trimmomatic', shell=True).decode('utf8').split('/')[:-2])}/share/" - f"trimmomatic-*/adapters")[0] + return glob(f"{'/'.join(check_output('which trimmomatic', shell=True).decode('utf8').split('/')[:-2])}" + f"/share/trimmomatic-*/adapters")[0] def remove_adapters(self, files, out_dir, name, adapters, threads='12'): adapter_contaminated = False @@ -100,11 +99,11 @@ def remove_adapters(self, files, out_dir, name, adapters, threads='12'): return 'Failed' # SortMeRNA - rRNA removal - def rrna_removal(self, reads, out_dir, name, databases, indexes_dir, tmp_dir, threads=12): + def rrna_removal(self, reads, out_dir, name, database, indexes_dir, tmp_dir, threads=12): if os.path.isdir(tmp_dir): shutil.rmtree(tmp_dir) run_pipe_command( - f"sortmerna -ref {' -ref '.join(databases)} --reads {' --reads '.join(reads)} --idx-dir {indexes_dir} " + f"sortmerna -ref {database} --reads {' --reads '.join(reads)} --idx-dir {indexes_dir} " f"--workdir {tmp_dir} --aligned {out_dir}/rrna_{name} --other {out_dir}/norrna_{name} -out2 --fastx " f"--paired_in --threads {threads} 1>{out_dir}/{name}_sortmerna.log 2>{out_dir}/{name}_sortmerna.err") shutil.rmtree(tmp_dir) @@ -221,16 +220,14 @@ def run(self): # rRNA removal if snakemake.params.data_type == 'mrna': + rrna_db = snakemake.params.rrna_db rrna_dbs_options = ['fast', 'default', 'sensitive', 'sensitive_with_rfam'] - lista = [ - 'smr_v4.3_default_db.fasta', 'smr_v4.3_sensitive_db_rfam_seeds.fasta', 'smr_v4.3_fast_db.fasta', - 'smr_v4.3_sensitive_db.fasta'] - if not snakemake.params.rrna_db in rrna_dbs_options: + if rrna_db not in rrna_dbs_options: exit(f'rrna_db must be one of {rrna_dbs_options}') - rrna_db_name = snakemake.params.rrna_db.replace("sensitive_with_rfam", "sensitive_db_rfam_seeds") - rrna_databases = [f'{rrna_databases_dir}/smr_v4.3_{rrna_db_name}_db.fasta'] + rrna_database = 'sensitive_db_rfam_seeds' if rrna_db == 'sensitive_with_rfam' else f'{rrna_db}_db' + rrna_database = f'{rrna_databases_dir}/smr_v4.3_{rrna_database}.fasta' self.rrna_removal( - reads, f'{snakemake.params.output}/SortMeRNA', name, rrna_databases, rrna_databases_dir, + reads, f'{snakemake.params.output}/SortMeRNA', name, rrna_database, rrna_databases_dir, tmp_dir=f'{snakemake.params.output}/SortMeRNA/tmp', threads=snakemake.threads) reads = ([f'{snakemake.params.output}/SortMeRNA/norrna_{name}_{fr}.fq.gz' for fr in ['fwd', 'rev']] if @@ -240,7 +237,8 @@ def run(self): self.quality_trimming( reads, snakemake.params.output, name, threads=snakemake.threads, avgqual=snakemake.params.avgqual, - minlen=snakemake.params.minlen, type_of_data=snakemake.params.data_type) + minlen=snakemake.params.mt_minlen if snakemake.params.data_type == 'mrna' else snakemake.params.mg_minlen, + type_of_data=snakemake.params.data_type) reads = ([f'{snakemake.params.output}/Trimmomatic/quality_trimmed_{name}_{fr}_paired.fq' for fr in ['forward', 'reverse']]