Skip to content

Commit

Permalink
Add prM-E build [#15]
Browse files Browse the repository at this point in the history
  • Loading branch information
genehack committed Aug 29, 2024
1 parent ebff7bb commit d64c507
Show file tree
Hide file tree
Showing 16 changed files with 191 additions and 38 deletions.
4 changes: 2 additions & 2 deletions phylogenetic/Snakefile
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
configfile: "defaults/config.yaml"

# we will likely add gene-specific builds in the future
gene = ["genome"]
gene = ["genome", "prM-E"]

rule all:
input:
auspice_json="auspice/yellow-fever-virus_genome.json",
auspice_json=expand("auspice/yellow-fever-virus_{gene}.json", gene=gene),


include: "rules/prepare_sequences.smk"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"title": "Real-time tracking of yellow fever virus full genome virus evolution",
"title": "Real-time tracking of yellow fever virus full genome evolution",
"maintainers": [
{"name": "John SJ Anderson", "url": "https://bedford.io/team/john-sj-anderson/"},
{"name": "the Nextstrain team", "url": "https://nextstrain.org/team"}
Expand Down
64 changes: 64 additions & 0 deletions phylogenetic/defaults/auspice_config_prM-E.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
{
"title": "Real-time tracking of yellow fever virus prM-E region evolution",
"maintainers": [
{"name": "John SJ Anderson", "url": "https://bedford.io/team/john-sj-anderson/"},
{"name": "the Nextstrain team", "url": "https://nextstrain.org/team"}
],
"data_provenance": [
{
"name": "GenBank",
"url": "https://www.ncbi.nlm.nih.gov/genbank/"
}
],
"build_url": "https://github.com/nextstrain/yellow-fever",
"colorings": [
{
"key": "gt",
"title": "Genotype",
"type": "categorical"
},
{
"key": "clade",
"title": "Genotype (via Nextclade tree)",
"type": "categorical"
},
{
"key": "num_date",
"title": "Date",
"type": "continuous"
},
{
"key": "region",
"title": "Region",
"type": "categorical"
},
{
"key": "country",
"title": "Country",
"type": "categorical"
},
{
"key": "host",
"title": "Host",
"type": "categorical"
}
],
"geo_resolutions": [
"country",
"region"
],
"display_defaults": {
"map_triplicate": true,
"color_by": "region"
},
"filters": [
"clade",
"region",
"country",
"author",
"host"
],
"metadata_columns": [
"author"
]
}
26 changes: 18 additions & 8 deletions phylogenetic/defaults/config.yaml
Original file line number Diff line number Diff line change
@@ -1,18 +1,28 @@
strain_id_field: "accession"
files:
auspice_config: "defaults/auspice_config.json"
auspice_config_genome: "defaults/auspice_config_genome.json"
auspice_config_prM-E: "defaults/auspice_config_prM-E.json"
colors: "defaults/colors.tsv"
description: "defaults/description.md"
exclude: "defaults/dropped_strains.txt"
genemap: "defaults/genemap.gff"
include_genome: "defaults/include_strains_genome.txt"
reference_gb: "defaults/reference.gb"
reference_fasta: "defaults/reference.fasta"
genome:
exclude: "defaults/dropped_strains_genome.txt"
genemap: "defaults/genemap_genome.gff"
include: "defaults/include_strains_genome.txt"
reference: "defaults/reference_genome.fasta"
prM-E:
exclude: "defaults/dropped_strains_prM-E.txt"
genemap: "defaults/genemap_prM-E.gff"
include: "defaults/include_strains_prM-E.txt"
reference: "defaults/reference_prM-E.fasta"
filter:
group_by: "country year"
min_date: "1900-01-01"
min_length: 9000
sequences_per_group: 20
genome:
min_length: 9000
sequences_per_group: 20
prM-E:
min_length: 600
sequences_per_group: 200
refine:
coalescent: "opt"
date_inference: "marginal"
Expand Down
File renamed without changes.
Empty file.
File renamed without changes.
5 changes: 5 additions & 0 deletions phylogenetic/defaults/genemap_prM-E.gff
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
##sequence-region prM-E 1 672
NC_002031.1 feature source 1 672 . + . gene=nuc
NC_002031.1 feature gene 1 333 . + . gene_name=prM
NC_002031.1 feature gene 109 333 . + . gene_name=M
NC_002031.1 feature gene 334 672 . + . gene_name=E
Empty file.
File renamed without changes.
File renamed without changes.
13 changes: 13 additions & 0 deletions phylogenetic/defaults/reference_prM-E.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
> prM-E region (genome 641-1312, 672 nt)
CCAAGAGAGGAGCCAGATGACATTGATTGCTGGTGCTATGGGGTGGAAAACGTTAGAGTC
GCATATGGTAAGTGTGACTCAGCAGGCAGGTCTAGGAGGTCAAGAAGGGCCATTGACTTG
CCTACGCATGAAAACCATGGTTTGAAGACCCGGCAAGAAAAATGGATGACTGGAAGAATG
GGTGAAAGGCAACTCCAAAAGATTGAGAGATGGCTCGTGAGGAACCCCTTTTTTGCAGTG
ACAGCTCTGACCATTGCCTACCTTGTGGGAAGCAACATGACGCAACGAGTCGTGATTGCC
CTACTGGTCTTGGCTGTTGGTCCGGCCTACTCAGCTCACTGCATTGGAATTACTGACAGG
GATTTCATTGAGGGGGTGCATGGAGGAACTTGGGTTTCAGCTACCCTGGAGCAAGACAAG
TGTGTCACTGTTATGGCCCCTGACAAGCCTTCATTGGACATCTCACTAGAGACAGTAGCC
ATTGATGGACCTGCTGAGGCGAGGAAAGTGTGTTACAATGCAGTTCTCACTCATGTGAAG
ATTAATGACAAGTGCCCCAGCACTGGAGAGGCCCACCTAGCTGAAGAGAACGAAGGGGAC
AATGCGTGCAAGCGCACTTATTCTGATAGAGGCTGGGGCAATGGCTGTGGCCTATTTGGG
AAAGGGAGCATT
4 changes: 2 additions & 2 deletions phylogenetic/rules/annotate_phylogeny.smk
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ rule ancestral:
"""Reconstructing ancestral sequences and mutations"""
input:
tree = "results/{gene}/tree.nwk",
alignment = "results/{gene}/aligned.fasta"
alignment = "results/{gene}/aligned_and_filtered.fasta"
output:
node_data = "results/{gene}/nt_muts.json"
params:
Expand All @@ -31,7 +31,7 @@ rule translate:
input:
tree = "results/{gene}/tree.nwk",
node_data = "results/{gene}/nt_muts.json",
genemap = "defaults/genemap.gff"
genemap = "defaults/genemap_{gene}.gff"
output:
node_data = "results/{gene}/aa_muts.json"
log:
Expand Down
5 changes: 2 additions & 3 deletions phylogenetic/rules/construct_phylogeny.smk
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ This part of the workflow constructs the phylogenetic tree.
rule tree:
"""Building tree"""
input:
alignment = "results/{gene}/aligned.fasta"
alignment = "results/{gene}/aligned_and_filtered.fasta"
output:
tree = "results/{gene}/tree_raw.nwk"
log:
Expand All @@ -29,7 +29,7 @@ rule refine:
"""
input:
tree = "results/{gene}/tree_raw.nwk",
alignment = "results/{gene}/aligned.fasta",
alignment = "results/{gene}/aligned_and_filtered.fasta",
metadata = "../ingest/results/metadata.tsv"
output:
tree = "results/{gene}/tree.nwk",
Expand All @@ -54,7 +54,6 @@ rule refine:
--output-tree {output.tree:q} \
--output-node-data {output.node_data:q} \
--coalescent {params.coalescent:q} \
--timetree \
--date-confidence \
--date-inference {params.date_inference:q} \
--clock-filter-iqd {params.clock_filter_iqd:q} \
Expand Down
4 changes: 2 additions & 2 deletions phylogenetic/rules/export.smk
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ rule export:
aa_muts = "results/{gene}/aa_muts.json",
traits = "results/{gene}/traits.json",
colors = config["files"]["colors"],
auspice_config = lambda wildcard: "defaults/auspice_config.json" if wildcard.gene in ["genome"] else "defaults/auspice_config_N450.json",
description=config["files"]["description"]
auspice_config = lambda w: config["files"]["auspice_config_genome"] if w.gene == "genome" else config["files"]["auspice_config_prM-E"],
description=config["files"]["description"],
output:
auspice_json = "auspice/yellow-fever-virus_{gene}.json"
params:
Expand Down
102 changes: 82 additions & 20 deletions phylogenetic/rules/prepare_sequences.smk
Original file line number Diff line number Diff line change
Expand Up @@ -3,33 +3,27 @@ This part of the workflow prepares sequences for constructing the
phylogenetic tree.
"""

rule filter:
message: """
Filtering to
- {params.sequences_per_group} sequence(s) per {params.group_by!s}
- excluding strains in {input.exclude}
- minimum genome length of {params.min_length}
"""
rule filter_genome:
input:
exclude = config["files"]["exclude"],
include = config["files"]["include_genome"],
exclude = config["files"]["genome"]["exclude"],
include = config["files"]["genome"]["include"],
# TODO once this repo is fully automated and uploading data to
# S3, this step should download data from there instead of
# depending on the ingest build
metadata = "../ingest/results/metadata.tsv",
sequences = "../ingest/results/sequences.fasta"
output:
sequences = "results/{gene}/filtered.fasta"
sequences = "results/genome/filtered.fasta"
params:
group_by = config["filter"]["group_by"],
min_date = config["filter"]["min_date"],
min_length = config["filter"]["min_length"],
sequences_per_group = config["filter"]["sequences_per_group"],
min_length = config["filter"]["genome"]["min_length"],
sequences_per_group = config["filter"]["genome"]["sequences_per_group"],
strain_id = config["strain_id_field"]
log:
"logs/{gene}/filter.txt",
"logs/genome/filter_genome.txt",
benchmark:
"benchmarks/{gene}/filter.txt"
"benchmarks/genome/filter_genome.txt"
shell:
r"""
augur filter \
Expand All @@ -46,17 +40,16 @@ rule filter:
2> {log:q}
"""

rule align:
rule align_genome:
input:
sequences="results/genome/filtered.fasta",
reference=config["files"]["reference_gb"],
genemap=config["files"]["genemap"],
reference=config["files"]["genome"]["reference"],
output:
alignment="results/{gene}/aligned.fasta",
alignment="results/genome/aligned_and_filtered.fasta",
log:
"logs/{gene}/align.txt",
"logs/genome/align_genome.txt",
benchmark:
"benchmarks/{gene}/align.txt"
"benchmarks/genome/align_genome.txt"
shell:
r"""
augur align \
Expand All @@ -67,3 +60,72 @@ rule align:
--remove-reference \
2> {log:q}
"""


rule align_and_extract_prME:
input:
reference=config["files"]["prM-E"]["reference"],
# TODO once this repo is fully automated and uploading data to
# S3, this step should download data from there instead of
# depending on the ingest build
sequences = "../ingest/results/sequences.fasta",
output:
alignment = "results/prM-E/aligned.fasta"
params:
group_by = config["filter"]["group_by"],
min_date = config["filter"]["min_date"],
min_length = config["filter"]["prM-E"]["min_length"],
sequences_per_group = config["filter"]["prM-E"]["sequences_per_group"],
strain_id = config["strain_id_field"]
log:
"logs/genome/filter_and_extract_prM-E.txt",
benchmark:
"benchmarks/genome/filter_and_extract_prM-E.txt"
shell:
r"""
augur align \
--sequences {input.sequences} \
--reference-sequence {input.reference} \
--output {output.alignment} \
--fill-gaps \
--remove-reference \
2> {log:q}
"""


rule filter_prME:
input:
exclude = config["files"]["prM-E"]["exclude"],
include = config["files"]["prM-E"]["include"],
# TODO once this repo is fully automated and uploading data to
# S3, this step should download data from there instead of
# depending on the ingest build
metadata = "../ingest/results/metadata.tsv",
sequences = "results/prM-E/aligned.fasta"
output:
sequences = "results/prM-E/aligned_and_filtered.fasta"
params:
group_by = config["filter"]["group_by"],
min_date = config["filter"]["min_date"],
min_length = config["filter"]["prM-E"]["min_length"],
sequences_per_group = config["filter"]["prM-E"]["sequences_per_group"],
strain_id = config["strain_id_field"]
log:
"logs/genome/filter_prM-E.txt",
benchmark:
"benchmarks/genome/filter_prM-E.txt"
shell:
r"""
augur filter \
--sequences {input.sequences:q} \
--metadata {input.metadata:q} \
--metadata-id-columns {params.strain_id:q} \
--exclude {input.exclude:q} \
--include {input.include:q} \
--output {output.sequences:q} \
--group-by {params.group_by} \
--sequences-per-group {params.sequences_per_group:q} \
--min-date {params.min_date:q} \
--min-length {params.min_length:q} \
2> {log:q}
"""

0 comments on commit d64c507

Please sign in to comment.