Skip to content

Commit

Permalink
Merge pull request #59 from NBISweden/gerp
Browse files Browse the repository at this point in the history
Move rescaling to gerp rule and update documentation
  • Loading branch information
verku authored Oct 6, 2023
2 parents 81e24df + 79d3384 commit be1fd3b
Show file tree
Hide file tree
Showing 6 changed files with 34 additions and 37 deletions.
6 changes: 5 additions & 1 deletion .test/config/config_mitogenomes.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -494,12 +494,16 @@ gerp_ref_path: ""
# Full path to phylogenetic tree of all species included in the analysis
# (including the target species) in NEWICK format and including divergence
# time estimates.
# Divergence time estimates must be in billions of years for correct scaling
# Divergence time estimates must be in millions of years for correct scaling
# of GERP scores (see dated phylogenetic trees from www.timetree.org).
# Species names in the tree must be identical to the FASTA file names
# without ".fa.gz", ".fasta.gz" or ".fna.gz".
tree: ""

# Tree scaling factor for GERP++ ("-s") to re-scale tree. If a tree from
# www.timetree.org is provided (in millions of years), set to 0.001.
tree_scaling_factor: 0.001

# Minimum and maximum GERP score for a site to be included into calculations
# of relative mutational load.
# Positive values indicate purifying selection.
Expand Down
6 changes: 5 additions & 1 deletion .test/config/config_mlRho_options.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -494,12 +494,16 @@ gerp_ref_path: ""
# Full path to phylogenetic tree of all species included in the analysis
# (including the target species) in NEWICK format and including divergence
# time estimates.
# Divergence time estimates must be in billions of years for correct scaling
# Divergence time estimates must be in millions of years for correct scaling
# of GERP scores (see dated phylogenetic trees from www.timetree.org).
# Species names in the tree must be identical to the FASTA file names
# without ".fa.gz", ".fasta.gz" or ".fna.gz".
tree: ""

# Tree scaling factor for GERP++ ("-s") to re-scale tree. If a tree from
# www.timetree.org is provided (in millions of years), set to 0.001.
tree_scaling_factor: 0.001

# Minimum and maximum GERP score for a site to be included into calculations
# of relative mutational load.
# Positive values indicate purifying selection.
Expand Down
6 changes: 5 additions & 1 deletion .test/config/config_pca_roh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -494,12 +494,16 @@ gerp_ref_path: ""
# Full path to phylogenetic tree of all species included in the analysis
# (including the target species) in NEWICK format and including divergence
# time estimates.
# Divergence time estimates must be in billions of years for correct scaling
# Divergence time estimates must be in millions of years for correct scaling
# of GERP scores (see dated phylogenetic trees from www.timetree.org).
# Species names in the tree must be identical to the FASTA file names
# without ".fa.gz", ".fasta.gz" or ".fna.gz".
tree: ""

# Tree scaling factor for GERP++ ("-s") to re-scale tree. If a tree from
# www.timetree.org is provided (in millions of years), set to 0.001.
tree_scaling_factor: 0.001

# Minimum and maximum GERP score for a site to be included into calculations
# of relative mutational load.
# Positive values indicate purifying selection.
Expand Down
6 changes: 5 additions & 1 deletion .test/config/config_snpeff_gerp.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -494,7 +494,7 @@ gerp_ref_path: ".test/data/gerp_data"
# Full path to phylogenetic tree of all species included in the analysis
# (including the target species) in NEWICK format and including divergence
# time estimates.
# Divergence time estimates must be in billions of years for correct scaling
# Divergence time estimates must be in millions of years for correct scaling
# of GERP scores (see dated phylogenetic trees from www.timetree.org).
# Species names in the tree must be identical to the FASTA file names
# without ".fa.gz", ".fasta.gz" or ".fna.gz".
Expand All @@ -506,6 +506,10 @@ tree: ".test/data/gerp_data/gerp_tree.nwk"
min_gerp: 0
max_gerp: 1000

# Tree scaling factor for GERP++ ("-s") to re-scale tree. If a tree from
# www.timetree.org is provided (in millions of years), set to 0.001.
tree_scaling_factor: 0.001

#####
# NOTE:
# The GERP step produces a large number of large intermediate files,
Expand Down
5 changes: 3 additions & 2 deletions config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -496,8 +496,9 @@ gerp_ref_path: ""
# Full path to phylogenetic tree of all species included in the analysis
# (including the target species) in NEWICK format and including divergence
# time estimates.
# Divergence time estimates must be in billions of years for correct scaling
# of GERP scores (see dated phylogenetic trees from www.timetree.org).
# Divergence time estimates must be in millions of years for consistent scaling
# of GERP scores across GenErode runs. Dated phylogenetic trees in millions of years
# are e.g. available from www.timetree.org.
# Species names in the tree must be identical to the FASTA file names
# without ".fa.gz", ".fasta.gz" or ".fna.gz".
tree: ""
Expand Down
42 changes: 11 additions & 31 deletions workflow/rules/13_GERP.smk
Original file line number Diff line number Diff line change
Expand Up @@ -585,6 +585,12 @@ rule concatenate_fasta_per_contig:
rule compute_gerp:
"""
Compute GERP++ scores.
'-s 0.001': tree scaling factor to re-scale tree. Set to 0.001 to scale
a tree from millions of years (such as trees from www.timetree.org) to
billions of years to ensure consistently scaled GERP scores across GenErode
runs. GERP++ gerpcol default: 1.0
'-v': verbose mode
'-a': alignment in mfa format
Output only includes positions, no contig names.
This analysis is run as one job per genome chunk, but is internally run per contig.
"""
Expand All @@ -608,7 +614,7 @@ rule compute_gerp:
fi
for contig in $(awk -F'\t' '{{print $1}}' {input.chunk_bed}) # run the analysis per contig
do
gerpcol -v -f {input.concatenated_fasta_dir}/${{contig}}.fasta -t {input.tree} -a -e {params.name} 2> {log} &&
gerpcol -v -f {input.concatenated_fasta_dir}/${{contig}}.fasta -t {input.tree} -a -s 0.001 -e {params.name} 2> {log} &&
mv {input.concatenated_fasta_dir}/${{contig}}.fasta.rates {output.gerp_dir} 2>> {log} &&
echo "Computed GERP++ scores for" $contig >> {log}
done
Expand Down Expand Up @@ -650,32 +656,6 @@ rule gerp2coords:
""".format(contig=contig))


rule rescale_gerp:
"""
Re-scale GERP scores to correct time scale.
This analysis is run as one job per genome chunk, but is internally run per contig.
"""
input:
gerp_coords_dir=rules.gerp2coords.output.gerp_coords_dir,
chunk_bed=REF_DIR + "/gerp/" + REF_NAME + "/split_bed_files/{chunk}.bed",
output:
gerp_rescaled_dir=temp(directory("results/gerp/chunks/" + REF_NAME + "/gerp/{chunk}_gerp_rescaled/")),
log:
"results/logs/13_GERP/chunks/" + REF_NAME + "/gerp/{chunk}_rescale_gerp.log",
shell:
"""
if [ ! -d {output.gerp_rescaled_dir} ]; then
mkdir -p {output.gerp_rescaled_dir};
fi
for contig in $(awk -F'\t' '{{print $1}}' {input.chunk_bed}) # run the analysis per contig
do
awk -F'\t' '{{ if($1 ~ /[0-9]+/ && $1 != 0) {{print $1/1000}} else {{print $1}} }}' OFS='\t' \
{input.gerp_coords_dir}/${{contig}}.fasta.rates.parsed > {output.gerp_rescaled_dir}/${{contig}}.fasta.rates.parsed.rescaled 2>> {log} &&
echo "GERP scores rescaled for" $contig >> {log}
done
"""


rule get_ancestral_state:
"""Get the ancestral state of each position in the focal reference genome."""
input:
Expand Down Expand Up @@ -709,7 +689,7 @@ rule produce_contig_out:
"""Merge the ancestral allele and gerp-scores into one file per contig."""
input:
fasta_ancestral_dir=rules.get_ancestral_state.output.fasta_ancestral_dir,
gerp_rescaled_dir=rules.rescale_gerp.output.gerp_rescaled_dir,
gerp_coords_dir=rules.gerp2coords.output.gerp_coords_dir,
chunk_bed=REF_DIR + "/gerp/" + REF_NAME + "/split_bed_files/{chunk}.bed",
output:
gerp_merged_dir=temp(directory("results/gerp/chunks/" + REF_NAME + "/gerp/{chunk}_gerp_merged/")),
Expand All @@ -727,9 +707,9 @@ rule produce_contig_out:
if [ ! -d {{output.gerp_merged_dir}} ]; then
mkdir -p {{output.gerp_merged_dir}};
fi
paste {{input.fasta_ancestral_dir}}/{contig}.fasta.parsed {{input.gerp_rescaled_dir}}/{contig}.fasta.rates.parsed.rescaled | \
paste {{input.fasta_ancestral_dir}}/{contig}.fasta.parsed {{input.gerp_coords_dir}}/{contig}.fasta.rates.parsed | \
sed "s/^/{contig}\t/g" > {{output.gerp_merged_dir}}/{contig}.fasta.parsed.rates 2>> {{log}} &&
echo "Rescaled GERP-scores and ancestral states merged for {contig}" >> {{log}}
echo "GERP-scores and ancestral states merged for {contig}" >> {{log}}
""".format(contig=contig))


Expand Down Expand Up @@ -772,7 +752,7 @@ rule merge_gerp_gz:


rule plot_gerp_hist:
"""Plot the rescaled GERP scores as histogram"""
"""Plot the GERP scores as histogram"""
input:
gerp_out=rules.merge_gerp_gz.output.gerp_out,
output:
Expand Down

0 comments on commit be1fd3b

Please sign in to comment.