Skip to content

Commit

Permalink
Split the minimum_length
Browse files Browse the repository at this point in the history
between minimum_length for mg and for mt
Added sortmerna_database parameter
  • Loading branch information
iquasere committed Jan 24, 2024
1 parent a52b5cf commit 4c707d2
Show file tree
Hide file tree
Showing 7 changed files with 47 additions and 140 deletions.
71 changes: 12 additions & 59 deletions config/mgmp_config.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
{
"version": "2.0.0",
"output": "output",
"resources_directory": "resources_directory",
"threads": 15,
Expand Down Expand Up @@ -40,77 +39,31 @@
"Name": ""
}
],
"minimum_read_length": 100,
"minimum_mg_read_length": 100,
"minimum_read_average_quality": 20,
"rrna_database": "default",
"do_assembly": true,
"max_memory": 10,
"max_memory": 100,
"assembler": "megahit",
"error_model": "illumina_5",
"markerset": "40",
"do_binning": false,
"do_iterative_binning": false,
"upimapi_search_mode": "fast",
"upimapi_database": "uniprot",
"upimapi_database": "taxids",
"upimapi_max_target_seqs": 1,
"upimapi_taxids": "196137,143067,2159,213465,213422,213421,68298",
"upimapi_check_db": false,
"uniprot_columns": [
"Entry",
"Entry name",
"Gene names",
"Protein names",
"EC number",
"Function [CC]",
"Pathway",
"Keywords",
"Protein existence",
"Gene ontology (GO)",
"Protein families",
"Taxonomic lineage (SUPERKINGDOM)",
"Taxonomic lineage (PHYLUM)",
"Taxonomic lineage (CLASS)",
"Taxonomic lineage (ORDER)",
"Taxonomic lineage (FAMILY)",
"Taxonomic lineage (GENUS)",
"Taxonomic lineage IDs (GENUS)",
"Taxonomic lineage (SPECIES)",
"Taxonomic lineage IDs (SPECIES)",
"BioCyc",
"BRENDA",
"CDD",
"eggNOG",
"Ensembl",
"InterPro",
"KEGG",
"Pfam",
"Reactome",
"RefSeq",
"UniPathway"
],
"download_cdd_resources": true,
"recognizer_databases": [
"CDD",
"Pfam",
"NCBIfam",
"Protein_Clusters",
"Smart",
"TIGRFAM",
"COG",
"KOG"
],
"upimapi_taxids": "2159,213465",
"normalization_method": "TMM",
"minimum_differential_expression": 1,
"proteomics_workflow": "compomics",
"use_crap": true,
"proteomics_contaminants_database": "crap.fasta",
"minimum_differential_expression": 2,
"metaproteomics_add_reference_proteomes": true,
"reference_proteomes_taxa_level": "SPECIES",
"protease": "Trypsin",
"protease_file": "",
"significance_threshold": 0.05,
"keggcharter_taxa_level": "SPECIES",
"imputation_method": "LLS",
"keggcharter_taxa_level": "GENUS",
"keggcharter_number_of_taxa": 10,
"keggcharter_maps": [
"00680"
],
"inside_container": false
}
"inside_container": true
}

76 changes: 12 additions & 64 deletions config/mgmt_config.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
{
"version": "2.0.0",
"output": "output",
"resources_directory": "resources_directory",
"threads": 15,
Expand All @@ -12,105 +11,54 @@
"Name": ""
},
{
"Files": "MOSCARDO/input/mt1_R1.fq,MOSCARDO/input/mt1_R2.fq",
"Files": "MOSCARDO/input/mt1_R1.fq.gz,MOSCARDO/input/mt1_R2.fq.gz",
"Sample": "sample",
"Data type": "mrna",
"Condition": "c1",
"Name": ""
},
{
"Files": "MOSCARDO/input/mt2_R1.fq,MOSCARDO/input/mt2_R2.fq",
"Files": "MOSCARDO/input/mt2_R1.fq.gz,MOSCARDO/input/mt2_R2.fq.gz",
"Sample": "sample",
"Data type": "mrna",
"Condition": "c1",
"Name": ""
},
{
"Files": "MOSCARDO/input/mt3_R1.fq,MOSCARDO/input/mt3_R2.fq",
"Files": "MOSCARDO/input/mt3_R1.fq.gz,MOSCARDO/input/mt3_R2.fq.gz",
"Sample": "sample",
"Data type": "mrna",
"Condition": "c2",
"Name": ""
},
{
"Files": "MOSCARDO/input/mt4_R1.fq,MOSCARDO/input/mt4_R2.fq",
"Files": "MOSCARDO/input/mt4_R1.fq.gz,MOSCARDO/input/mt4_R2.fq.gz",
"Sample": "sample",
"Data type": "mrna",
"Condition": "c2",
"Name": ""
}
],
"minimum_read_length": 50,
"minimum_mg_read_length": 100,
"minimum_mt_read_length": 50,
"minimum_read_average_quality": 20,
"rrna_database": "fast",
"do_assembly": true,
"max_memory": 100,
"assembler": "metaspades",
"error_model": "illumina_5",
"markerset": "40",
"do_binning": false,
"do_iterative_binning": false,
"upimapi_search_mode": "fast",
"upimapi_database": "uniprot",
"upimapi_database": "taxids",
"upimapi_max_target_seqs": 1,
"upimapi_taxids": "196137,143067,2159,213465,213422,213421,68298",
"upimapi_check_db": false,
"uniprot_columns": [
"Entry",
"Entry name",
"Gene names",
"Protein names",
"EC number",
"Function [CC]",
"Pathway",
"Keywords",
"Protein existence",
"Gene ontology (GO)",
"Protein families",
"Taxonomic lineage (SUPERKINGDOM)",
"Taxonomic lineage (PHYLUM)",
"Taxonomic lineage (CLASS)",
"Taxonomic lineage (ORDER)",
"Taxonomic lineage (FAMILY)",
"Taxonomic lineage (GENUS)",
"Taxonomic lineage IDs (GENUS)",
"Taxonomic lineage (SPECIES)",
"Taxonomic lineage IDs (SPECIES)",
"BioCyc",
"BRENDA",
"CDD",
"eggNOG",
"Ensembl",
"InterPro",
"KEGG",
"Pfam",
"Reactome",
"RefSeq",
"UniPathway"
],
"download_cdd_resources": true,
"recognizer_databases": [
"CDD",
"Pfam",
"NCBIfam",
"Protein_Clusters",
"Smart",
"TIGRFAM",
"COG",
"KOG"
],
"upimapi_taxids": "2159,213465",
"normalization_method": "TMM",
"minimum_differential_expression": 1,
"proteomics_workflow": "compomics",
"use_crap": true,
"proteomics_contaminants_database": "crap.fasta",
"reference_proteomes_taxa_level": "SPECIES",
"protease": "Trypsin",
"protease_file": "",
"significance_threshold": 0.05,
"keggcharter_taxa_level": "SPECIES",
"keggcharter_taxa_level": "GENUS",
"keggcharter_number_of_taxa": 10,
"keggcharter_maps": [
"00680"
],
"inside_container": false
}
"inside_container": true
}
6 changes: 4 additions & 2 deletions resources/default_config.json
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
{
"version": "2.2.0",
"version": "2.3",
"output": "output",
"resources_directory": "resources_directory",
"threads": 15,
"experiments": [],
"minimum_read_length": 100,
"sortmerna_database": "default",
"minimum_mg_read_length": 100,
"minimum_mt_read_length": 100,
"minimum_read_average_quality": 20,
"do_assembly": true,
"max_memory": 100,
Expand Down
1 change: 1 addition & 0 deletions resources/wiki/.README
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Files for [MOSCA's wiki](https://github.com/iquasere/MOSCA/wiki).
4 changes: 3 additions & 1 deletion workflow/rules/preprocess.smk
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@ rule preprocess:
reads = lambda wildcards: EXPS.loc[EXPS['Name'] == wildcards.name, 'Files'].iloc[0],
resources_directory = config["resources_directory"],
data_type = lambda wildcards: EXPS.loc[EXPS['Name'] == wildcards.name, 'Data type'].iloc[0],
minlen = config["minimum_read_length"],
rrna_db = config["sortmerna_database"],
mg_minlen = config["minimum_mg_read_length"],
mt_minlen= config["minimum_mt_read_length"],
avgqual = config["minimum_read_average_quality"]
conda:
"../envs/preprocess.yaml"
Expand Down
5 changes: 4 additions & 1 deletion workflow/schemas/config.schema.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,10 @@ experiments:
required: [Files, Sample, "Data type"]
minItems: 1

minimum_read_length:
minimum_mg_read_length:
type: integer

minimum_mt_read_length:
type: integer

minimum_read_average_quality:
Expand Down
24 changes: 11 additions & 13 deletions workflow/scripts/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,10 +60,9 @@ def download_resources(self, resources_directory):
with open(f'{resources_directory}/downloaded_timestamp.txt', 'w') as f:
f.write(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))


def get_adapters_dir(self):
return glob(f"{'/'.join(check_output('which trimmomatic', shell=True).decode('utf8').split('/')[:-2])}/share/"
f"trimmomatic-*/adapters")[0]
return glob(f"{'/'.join(check_output('which trimmomatic', shell=True).decode('utf8').split('/')[:-2])}"
f"/share/trimmomatic-*/adapters")[0]

def remove_adapters(self, files, out_dir, name, adapters, threads='12'):
adapter_contaminated = False
Expand Down Expand Up @@ -100,11 +99,11 @@ def remove_adapters(self, files, out_dir, name, adapters, threads='12'):
return 'Failed'

# SortMeRNA - rRNA removal
def rrna_removal(self, reads, out_dir, name, databases, indexes_dir, tmp_dir, threads=12):
def rrna_removal(self, reads, out_dir, name, database, indexes_dir, tmp_dir, threads=12):
if os.path.isdir(tmp_dir):
shutil.rmtree(tmp_dir)
run_pipe_command(
f"sortmerna -ref {' -ref '.join(databases)} --reads {' --reads '.join(reads)} --idx-dir {indexes_dir} "
f"sortmerna -ref {database} --reads {' --reads '.join(reads)} --idx-dir {indexes_dir} "
f"--workdir {tmp_dir} --aligned {out_dir}/rrna_{name} --other {out_dir}/norrna_{name} -out2 --fastx "
f"--paired_in --threads {threads} 1>{out_dir}/{name}_sortmerna.log 2>{out_dir}/{name}_sortmerna.err")
shutil.rmtree(tmp_dir)
Expand Down Expand Up @@ -221,16 +220,14 @@ def run(self):

# rRNA removal
if snakemake.params.data_type == 'mrna':
rrna_db = snakemake.params.rrna_db
rrna_dbs_options = ['fast', 'default', 'sensitive', 'sensitive_with_rfam']
lista = [
'smr_v4.3_default_db.fasta', 'smr_v4.3_sensitive_db_rfam_seeds.fasta', 'smr_v4.3_fast_db.fasta',
'smr_v4.3_sensitive_db.fasta']
if not snakemake.params.rrna_db in rrna_dbs_options:
if rrna_db not in rrna_dbs_options:
exit(f'rrna_db must be one of {rrna_dbs_options}')
rrna_db_name = snakemake.params.rrna_db.replace("sensitive_with_rfam", "sensitive_db_rfam_seeds")
rrna_databases = [f'{rrna_databases_dir}/smr_v4.3_{rrna_db_name}_db.fasta']
rrna_database = 'sensitive_db_rfam_seeds' if rrna_db == 'sensitive_with_rfam' else f'{rrna_db}_db'
rrna_database = f'{rrna_databases_dir}/smr_v4.3_{rrna_database}.fasta'
self.rrna_removal(
reads, f'{snakemake.params.output}/SortMeRNA', name, rrna_databases, rrna_databases_dir,
reads, f'{snakemake.params.output}/SortMeRNA', name, rrna_database, rrna_databases_dir,
tmp_dir=f'{snakemake.params.output}/SortMeRNA/tmp', threads=snakemake.threads)

reads = ([f'{snakemake.params.output}/SortMeRNA/norrna_{name}_{fr}.fq.gz' for fr in ['fwd', 'rev']] if
Expand All @@ -240,7 +237,8 @@ def run(self):

self.quality_trimming(
reads, snakemake.params.output, name, threads=snakemake.threads, avgqual=snakemake.params.avgqual,
minlen=snakemake.params.minlen, type_of_data=snakemake.params.data_type)
minlen=snakemake.params.mt_minlen if snakemake.params.data_type == 'mrna' else snakemake.params.mg_minlen,
type_of_data=snakemake.params.data_type)

reads = ([f'{snakemake.params.output}/Trimmomatic/quality_trimmed_{name}_{fr}_paired.fq'
for fr in ['forward', 'reverse']]
Expand Down

0 comments on commit 4c707d2

Please sign in to comment.