diff --git a/.test/config/config_mitogenomes.yaml b/.test/config/config_mitogenomes.yaml index 5019bd3..be5989b 100644 --- a/.test/config/config_mitogenomes.yaml +++ b/.test/config/config_mitogenomes.yaml @@ -494,12 +494,16 @@ gerp_ref_path: "" # Full path to phylogenetic tree of all species included in the analysis # (including the target species) in NEWICK format and including divergence # time estimates. -# Divergence time estimates must be in billions of years for correct scaling +# Divergence time estimates must be in millions of years for correct scaling # of GERP scores (see dated phylogenetic trees from www.timetree.org). # Species names in the tree must be identical to the FASTA file names # without ".fa.gz", ".fasta.gz" or ".fna.gz". tree: "" +# Tree scaling factor for GERP++ ("-s") to re-scale tree. If a tree from +# www.timetree.org is provided (in millions of years), set to 0.001. +tree_scaling_factor: 0.001 + # Minimum and maximum GERP score for a site to be included into calculations # of relative mutational load. # Positive values indicate purifying selection. diff --git a/.test/config/config_mlRho_options.yaml b/.test/config/config_mlRho_options.yaml index 6876861..774f94b 100644 --- a/.test/config/config_mlRho_options.yaml +++ b/.test/config/config_mlRho_options.yaml @@ -494,12 +494,16 @@ gerp_ref_path: "" # Full path to phylogenetic tree of all species included in the analysis # (including the target species) in NEWICK format and including divergence # time estimates. -# Divergence time estimates must be in billions of years for correct scaling +# Divergence time estimates must be in millions of years for correct scaling # of GERP scores (see dated phylogenetic trees from www.timetree.org). # Species names in the tree must be identical to the FASTA file names # without ".fa.gz", ".fasta.gz" or ".fna.gz". tree: "" +# Tree scaling factor for GERP++ ("-s") to re-scale tree. If a tree from +# www.timetree.org is provided (in millions of years), set to 0.001. +tree_scaling_factor: 0.001 + # Minimum and maximum GERP score for a site to be included into calculations # of relative mutational load. # Positive values indicate purifying selection. diff --git a/.test/config/config_pca_roh.yaml b/.test/config/config_pca_roh.yaml index 40e2097..08f6c6e 100644 --- a/.test/config/config_pca_roh.yaml +++ b/.test/config/config_pca_roh.yaml @@ -494,12 +494,16 @@ gerp_ref_path: "" # Full path to phylogenetic tree of all species included in the analysis # (including the target species) in NEWICK format and including divergence # time estimates. -# Divergence time estimates must be in billions of years for correct scaling +# Divergence time estimates must be in millions of years for correct scaling # of GERP scores (see dated phylogenetic trees from www.timetree.org). # Species names in the tree must be identical to the FASTA file names # without ".fa.gz", ".fasta.gz" or ".fna.gz". tree: "" +# Tree scaling factor for GERP++ ("-s") to re-scale tree. If a tree from +# www.timetree.org is provided (in millions of years), set to 0.001. +tree_scaling_factor: 0.001 + # Minimum and maximum GERP score for a site to be included into calculations # of relative mutational load. # Positive values indicate purifying selection. diff --git a/.test/config/config_snpeff_gerp.yaml b/.test/config/config_snpeff_gerp.yaml index 0c8de76..a8db6e3 100644 --- a/.test/config/config_snpeff_gerp.yaml +++ b/.test/config/config_snpeff_gerp.yaml @@ -494,7 +494,7 @@ gerp_ref_path: ".test/data/gerp_data" # Full path to phylogenetic tree of all species included in the analysis # (including the target species) in NEWICK format and including divergence # time estimates. -# Divergence time estimates must be in billions of years for correct scaling +# Divergence time estimates must be in millions of years for correct scaling # of GERP scores (see dated phylogenetic trees from www.timetree.org). # Species names in the tree must be identical to the FASTA file names # without ".fa.gz", ".fasta.gz" or ".fna.gz". @@ -506,6 +506,10 @@ tree: ".test/data/gerp_data/gerp_tree.nwk" min_gerp: 0 max_gerp: 1000 +# Tree scaling factor for GERP++ ("-s") to re-scale tree. If a tree from +# www.timetree.org is provided (in millions of years), set to 0.001. +tree_scaling_factor: 0.001 + ##### # NOTE: # The GERP step produces a large number of large intermediate files, diff --git a/config/config.yaml b/config/config.yaml index ee1c0b1..fcbc17b 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -496,8 +496,9 @@ gerp_ref_path: "" # Full path to phylogenetic tree of all species included in the analysis # (including the target species) in NEWICK format and including divergence # time estimates. -# Divergence time estimates must be in billions of years for correct scaling -# of GERP scores (see dated phylogenetic trees from www.timetree.org). +# Divergence time estimates must be in millions of years for consistent scaling +# of GERP scores across GenErode runs. Dated phylogenetic trees in millions of years +# are e.g. available from www.timetree.org. # Species names in the tree must be identical to the FASTA file names # without ".fa.gz", ".fasta.gz" or ".fna.gz". tree: "" diff --git a/workflow/rules/13_GERP.smk b/workflow/rules/13_GERP.smk index cdf9c17..1d8a606 100644 --- a/workflow/rules/13_GERP.smk +++ b/workflow/rules/13_GERP.smk @@ -585,6 +585,12 @@ rule concatenate_fasta_per_contig: rule compute_gerp: """ Compute GERP++ scores. + '-s 0.001': tree scaling factor to re-scale tree. Set to 0.001 to scale + a tree from millions of years (such as trees from www.timetree.org) to + billions of years to ensure consistently scaled GERP scores across GenErode + runs. GERP++ gerpcol default: 1.0 + '-v': verbose mode + '-a': alignment in mfa format Output only includes positions, no contig names. This analysis is run as one job per genome chunk, but is internally run per contig. """ @@ -608,7 +614,7 @@ rule compute_gerp: fi for contig in $(awk -F'\t' '{{print $1}}' {input.chunk_bed}) # run the analysis per contig do - gerpcol -v -f {input.concatenated_fasta_dir}/${{contig}}.fasta -t {input.tree} -a -e {params.name} 2> {log} && + gerpcol -v -f {input.concatenated_fasta_dir}/${{contig}}.fasta -t {input.tree} -a -s 0.001 -e {params.name} 2> {log} && mv {input.concatenated_fasta_dir}/${{contig}}.fasta.rates {output.gerp_dir} 2>> {log} && echo "Computed GERP++ scores for" $contig >> {log} done @@ -650,32 +656,6 @@ rule gerp2coords: """.format(contig=contig)) -rule rescale_gerp: - """ - Re-scale GERP scores to correct time scale. - This analysis is run as one job per genome chunk, but is internally run per contig. - """ - input: - gerp_coords_dir=rules.gerp2coords.output.gerp_coords_dir, - chunk_bed=REF_DIR + "/gerp/" + REF_NAME + "/split_bed_files/{chunk}.bed", - output: - gerp_rescaled_dir=temp(directory("results/gerp/chunks/" + REF_NAME + "/gerp/{chunk}_gerp_rescaled/")), - log: - "results/logs/13_GERP/chunks/" + REF_NAME + "/gerp/{chunk}_rescale_gerp.log", - shell: - """ - if [ ! -d {output.gerp_rescaled_dir} ]; then - mkdir -p {output.gerp_rescaled_dir}; - fi - for contig in $(awk -F'\t' '{{print $1}}' {input.chunk_bed}) # run the analysis per contig - do - awk -F'\t' '{{ if($1 ~ /[0-9]+/ && $1 != 0) {{print $1/1000}} else {{print $1}} }}' OFS='\t' \ - {input.gerp_coords_dir}/${{contig}}.fasta.rates.parsed > {output.gerp_rescaled_dir}/${{contig}}.fasta.rates.parsed.rescaled 2>> {log} && - echo "GERP scores rescaled for" $contig >> {log} - done - """ - - rule get_ancestral_state: """Get the ancestral state of each position in the focal reference genome.""" input: @@ -709,7 +689,7 @@ rule produce_contig_out: """Merge the ancestral allele and gerp-scores into one file per contig.""" input: fasta_ancestral_dir=rules.get_ancestral_state.output.fasta_ancestral_dir, - gerp_rescaled_dir=rules.rescale_gerp.output.gerp_rescaled_dir, + gerp_coords_dir=rules.gerp2coords.output.gerp_coords_dir, chunk_bed=REF_DIR + "/gerp/" + REF_NAME + "/split_bed_files/{chunk}.bed", output: gerp_merged_dir=temp(directory("results/gerp/chunks/" + REF_NAME + "/gerp/{chunk}_gerp_merged/")), @@ -727,9 +707,9 @@ rule produce_contig_out: if [ ! -d {{output.gerp_merged_dir}} ]; then mkdir -p {{output.gerp_merged_dir}}; fi - paste {{input.fasta_ancestral_dir}}/{contig}.fasta.parsed {{input.gerp_rescaled_dir}}/{contig}.fasta.rates.parsed.rescaled | \ + paste {{input.fasta_ancestral_dir}}/{contig}.fasta.parsed {{input.gerp_coords_dir}}/{contig}.fasta.rates.parsed | \ sed "s/^/{contig}\t/g" > {{output.gerp_merged_dir}}/{contig}.fasta.parsed.rates 2>> {{log}} && - echo "Rescaled GERP-scores and ancestral states merged for {contig}" >> {{log}} + echo "GERP-scores and ancestral states merged for {contig}" >> {{log}} """.format(contig=contig)) @@ -772,7 +752,7 @@ rule merge_gerp_gz: rule plot_gerp_hist: - """Plot the rescaled GERP scores as histogram""" + """Plot the GERP scores as histogram""" input: gerp_out=rules.merge_gerp_gz.output.gerp_out, output: