Merge pull request #59 from NBISweden/gerp

Move rescaling to gerp rule and update documentation
NBISweden · Oct 6, 2023 · be1fd3b · be1fd3b
2 parents 81e24df + 79d3384
commit be1fd3b
Show file tree

Hide file tree

Showing 6 changed files with 34 additions and 37 deletions.
diff --git a/.test/config/config_mitogenomes.yaml b/.test/config/config_mitogenomes.yaml
@@ -494,12 +494,16 @@ gerp_ref_path: ""
 # Full path to phylogenetic tree of all species included in the analysis 
 # (including the target species) in NEWICK format and including divergence 
 # time estimates.
-# Divergence time estimates must be in billions of years for correct scaling 
+# Divergence time estimates must be in millions of years for correct scaling 
 # of GERP scores (see dated phylogenetic trees from www.timetree.org).
 # Species names in the tree must be identical to the FASTA file names 
 # without ".fa.gz", ".fasta.gz" or ".fna.gz".
 tree: ""
 
+# Tree scaling factor for GERP++ ("-s") to re-scale tree. If a tree from
+# www.timetree.org is provided (in millions of years), set to 0.001.
+tree_scaling_factor: 0.001
+
 # Minimum and maximum GERP score for a site to be included into calculations 
 # of relative mutational load.
 # Positive values indicate purifying selection.

diff --git a/.test/config/config_mlRho_options.yaml b/.test/config/config_mlRho_options.yaml
@@ -494,12 +494,16 @@ gerp_ref_path: ""
 # Full path to phylogenetic tree of all species included in the analysis 
 # (including the target species) in NEWICK format and including divergence 
 # time estimates.
-# Divergence time estimates must be in billions of years for correct scaling 
+# Divergence time estimates must be in millions of years for correct scaling 
 # of GERP scores (see dated phylogenetic trees from www.timetree.org).
 # Species names in the tree must be identical to the FASTA file names 
 # without ".fa.gz", ".fasta.gz" or ".fna.gz".
 tree: ""
 
+# Tree scaling factor for GERP++ ("-s") to re-scale tree. If a tree from
+# www.timetree.org is provided (in millions of years), set to 0.001.
+tree_scaling_factor: 0.001
+
 # Minimum and maximum GERP score for a site to be included into calculations 
 # of relative mutational load.
 # Positive values indicate purifying selection.

diff --git a/.test/config/config_pca_roh.yaml b/.test/config/config_pca_roh.yaml
@@ -494,12 +494,16 @@ gerp_ref_path: ""
 # Full path to phylogenetic tree of all species included in the analysis 
 # (including the target species) in NEWICK format and including divergence 
 # time estimates.
-# Divergence time estimates must be in billions of years for correct scaling 
+# Divergence time estimates must be in millions of years for correct scaling 
 # of GERP scores (see dated phylogenetic trees from www.timetree.org).
 # Species names in the tree must be identical to the FASTA file names 
 # without ".fa.gz", ".fasta.gz" or ".fna.gz".
 tree: ""
 
+# Tree scaling factor for GERP++ ("-s") to re-scale tree. If a tree from
+# www.timetree.org is provided (in millions of years), set to 0.001.
+tree_scaling_factor: 0.001
+
 # Minimum and maximum GERP score for a site to be included into calculations 
 # of relative mutational load.
 # Positive values indicate purifying selection.

diff --git a/.test/config/config_snpeff_gerp.yaml b/.test/config/config_snpeff_gerp.yaml
@@ -494,7 +494,7 @@ gerp_ref_path: ".test/data/gerp_data"
 # Full path to phylogenetic tree of all species included in the analysis 
 # (including the target species) in NEWICK format and including divergence 
 # time estimates.
-# Divergence time estimates must be in billions of years for correct scaling 
+# Divergence time estimates must be in millions of years for correct scaling 
 # of GERP scores (see dated phylogenetic trees from www.timetree.org).
 # Species names in the tree must be identical to the FASTA file names 
 # without ".fa.gz", ".fasta.gz" or ".fna.gz".
@@ -506,6 +506,10 @@ tree: ".test/data/gerp_data/gerp_tree.nwk"
 min_gerp: 0
 max_gerp: 1000
 
+# Tree scaling factor for GERP++ ("-s") to re-scale tree. If a tree from
+# www.timetree.org is provided (in millions of years), set to 0.001.
+tree_scaling_factor: 0.001
+
 #####
 # NOTE:
 # The GERP step produces a large number of large intermediate files,

diff --git a/config/config.yaml b/config/config.yaml
@@ -496,8 +496,9 @@ gerp_ref_path: ""
 # Full path to phylogenetic tree of all species included in the analysis 
 # (including the target species) in NEWICK format and including divergence 
 # time estimates.
-# Divergence time estimates must be in billions of years for correct scaling 
-# of GERP scores (see dated phylogenetic trees from www.timetree.org).
+# Divergence time estimates must be in millions of years for consistent scaling 
+# of GERP scores across GenErode runs. Dated phylogenetic trees in millions of years 
+# are e.g. available from www.timetree.org.
 # Species names in the tree must be identical to the FASTA file names 
 # without ".fa.gz", ".fasta.gz" or ".fna.gz".
 tree: ""

diff --git a/workflow/rules/13_GERP.smk b/workflow/rules/13_GERP.smk
@@ -585,6 +585,12 @@ rule concatenate_fasta_per_contig:
 rule compute_gerp:
     """
     Compute GERP++ scores.
+    '-s 0.001': tree scaling factor to re-scale tree. Set to 0.001 to scale
+    a tree from millions of years (such as trees from www.timetree.org) to 
+    billions of years to ensure consistently scaled GERP scores across GenErode
+    runs. GERP++ gerpcol default: 1.0
+    '-v': verbose mode
+    '-a': alignment in mfa format
     Output only includes positions, no contig names.
     This analysis is run as one job per genome chunk, but is internally run per contig.
     """
@@ -608,7 +614,7 @@ rule compute_gerp:
         fi
         for contig in $(awk -F'\t' '{{print $1}}' {input.chunk_bed}) # run the analysis per contig
         do
-          gerpcol -v -f {input.concatenated_fasta_dir}/${{contig}}.fasta -t {input.tree} -a -e {params.name} 2> {log} &&
+          gerpcol -v -f {input.concatenated_fasta_dir}/${{contig}}.fasta -t {input.tree} -a -s 0.001 -e {params.name} 2> {log} &&
           mv {input.concatenated_fasta_dir}/${{contig}}.fasta.rates {output.gerp_dir} 2>> {log} &&
           echo "Computed GERP++ scores for" $contig >> {log} 
         done
@@ -650,32 +656,6 @@ rule gerp2coords:
                 """.format(contig=contig))
 
 
-rule rescale_gerp:
-    """
-    Re-scale GERP scores to correct time scale. 
-    This analysis is run as one job per genome chunk, but is internally run per contig.
-    """
-    input:
-        gerp_coords_dir=rules.gerp2coords.output.gerp_coords_dir,
-        chunk_bed=REF_DIR + "/gerp/" + REF_NAME + "/split_bed_files/{chunk}.bed",
-    output:
-        gerp_rescaled_dir=temp(directory("results/gerp/chunks/" + REF_NAME + "/gerp/{chunk}_gerp_rescaled/")),
-    log:
-        "results/logs/13_GERP/chunks/" + REF_NAME + "/gerp/{chunk}_rescale_gerp.log",
-    shell:
-        """
-        if [ ! -d {output.gerp_rescaled_dir} ]; then 
-            mkdir -p {output.gerp_rescaled_dir}; 
-        fi
-        for contig in $(awk -F'\t' '{{print $1}}' {input.chunk_bed}) # run the analysis per contig
-        do
-          awk -F'\t' '{{ if($1 ~ /[0-9]+/ && $1 != 0) {{print $1/1000}} else {{print $1}} }}' OFS='\t' \
-          {input.gerp_coords_dir}/${{contig}}.fasta.rates.parsed > {output.gerp_rescaled_dir}/${{contig}}.fasta.rates.parsed.rescaled 2>> {log} &&
-          echo "GERP scores rescaled for" $contig >> {log}
-        done
-        """
-
-
 rule get_ancestral_state:
     """Get the ancestral state of each position in the focal reference genome."""
     input:
@@ -709,7 +689,7 @@ rule produce_contig_out:
     """Merge the ancestral allele and gerp-scores into one file per contig."""
     input:
         fasta_ancestral_dir=rules.get_ancestral_state.output.fasta_ancestral_dir,
-        gerp_rescaled_dir=rules.rescale_gerp.output.gerp_rescaled_dir,
+        gerp_coords_dir=rules.gerp2coords.output.gerp_coords_dir,
         chunk_bed=REF_DIR + "/gerp/" + REF_NAME + "/split_bed_files/{chunk}.bed",
     output:
         gerp_merged_dir=temp(directory("results/gerp/chunks/" + REF_NAME + "/gerp/{chunk}_gerp_merged/")),
@@ -727,9 +707,9 @@ rule produce_contig_out:
                 if [ ! -d {{output.gerp_merged_dir}} ]; then 
                     mkdir -p {{output.gerp_merged_dir}}; 
                 fi
-                paste {{input.fasta_ancestral_dir}}/{contig}.fasta.parsed {{input.gerp_rescaled_dir}}/{contig}.fasta.rates.parsed.rescaled | \
+                paste {{input.fasta_ancestral_dir}}/{contig}.fasta.parsed {{input.gerp_coords_dir}}/{contig}.fasta.rates.parsed | \
                 sed "s/^/{contig}\t/g" > {{output.gerp_merged_dir}}/{contig}.fasta.parsed.rates 2>> {{log}} &&
-                echo "Rescaled GERP-scores and ancestral states merged for {contig}" >> {{log}}
+                echo "GERP-scores and ancestral states merged for {contig}" >> {{log}}
                 """.format(contig=contig))
 
 
@@ -772,7 +752,7 @@ rule merge_gerp_gz:
 
 
 rule plot_gerp_hist:
-    """Plot the rescaled GERP scores as histogram"""
+    """Plot the GERP scores as histogram"""
     input:
         gerp_out=rules.merge_gerp_gz.output.gerp_out,
     output: