Add prM-E build [#15]

nextstrain · Aug 29, 2024 · d64c507 · d64c507
1 parent ebff7bb
commit d64c507
Show file tree

Hide file tree

Showing 16 changed files with 191 additions and 38 deletions.
diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile
@@ -1,11 +1,11 @@
 configfile: "defaults/config.yaml"
 
 # we will likely add gene-specific builds in the future
-gene = ["genome"]
+gene = ["genome", "prM-E"]
 
 rule all:
     input:
-        auspice_json="auspice/yellow-fever-virus_genome.json",
+        auspice_json=expand("auspice/yellow-fever-virus_{gene}.json", gene=gene),
 
 
 include: "rules/prepare_sequences.smk"

diff --git a/phylogenetic/defaults/auspice_config.json → ...netic/defaults/auspice_config_genome.json b/phylogenetic/defaults/auspice_config.json → ...netic/defaults/auspice_config_genome.json
@@ -1,5 +1,5 @@
 {
-  "title": "Real-time tracking of yellow fever virus full genome virus evolution",
+  "title": "Real-time tracking of yellow fever virus full genome evolution",
   "maintainers": [
     {"name": "John SJ Anderson", "url": "https://bedford.io/team/john-sj-anderson/"},
     {"name": "the Nextstrain team", "url": "https://nextstrain.org/team"}

diff --git a/phylogenetic/defaults/auspice_config_prM-E.json b/phylogenetic/defaults/auspice_config_prM-E.json
@@ -0,0 +1,64 @@
+{
+  "title": "Real-time tracking of yellow fever virus prM-E region evolution",
+  "maintainers": [
+    {"name": "John SJ Anderson", "url": "https://bedford.io/team/john-sj-anderson/"},
+    {"name": "the Nextstrain team", "url": "https://nextstrain.org/team"}
+  ],
+  "data_provenance": [
+    {
+      "name": "GenBank",
+      "url": "https://www.ncbi.nlm.nih.gov/genbank/"
+    }
+  ],
+  "build_url": "https://github.com/nextstrain/yellow-fever",
+  "colorings": [
+    {
+      "key": "gt",
+      "title": "Genotype",
+      "type": "categorical"
+    },
+    {
+      "key": "clade",
+      "title": "Genotype (via Nextclade tree)",
+      "type": "categorical"
+    },
+    {
+      "key": "num_date",
+      "title": "Date",
+      "type": "continuous"
+    },
+    {
+      "key": "region",
+      "title": "Region",
+      "type": "categorical"
+    },
+    {
+      "key": "country",
+      "title": "Country",
+      "type": "categorical"
+    },
+    {
+      "key": "host",
+      "title": "Host",
+      "type": "categorical"
+    }
+  ],
+  "geo_resolutions": [
+    "country",
+    "region"
+  ],
+  "display_defaults": {
+    "map_triplicate": true,
+    "color_by": "region"
+  },
+  "filters": [
+    "clade",
+    "region",
+    "country",
+    "author",
+    "host"
+  ],
+  "metadata_columns": [
+    "author"
+  ]
+}
diff --git a/phylogenetic/defaults/config.yaml b/phylogenetic/defaults/config.yaml
@@ -1,18 +1,28 @@
 strain_id_field: "accession"
 files:
-    auspice_config: "defaults/auspice_config.json"
+    auspice_config_genome: "defaults/auspice_config_genome.json"
+    auspice_config_prM-E: "defaults/auspice_config_prM-E.json"
     colors: "defaults/colors.tsv"
     description: "defaults/description.md"
-    exclude: "defaults/dropped_strains.txt"
-    genemap: "defaults/genemap.gff"
-    include_genome: "defaults/include_strains_genome.txt"
-    reference_gb: "defaults/reference.gb"
-    reference_fasta: "defaults/reference.fasta"
+    genome:
+        exclude: "defaults/dropped_strains_genome.txt"
+        genemap: "defaults/genemap_genome.gff"
+        include: "defaults/include_strains_genome.txt"
+        reference: "defaults/reference_genome.fasta"
+    prM-E:
+        exclude: "defaults/dropped_strains_prM-E.txt"
+        genemap: "defaults/genemap_prM-E.gff"
+        include: "defaults/include_strains_prM-E.txt"
+        reference: "defaults/reference_prM-E.fasta"
 filter:
     group_by: "country year"
     min_date: "1900-01-01"
-    min_length: 9000
-    sequences_per_group: 20
+    genome:
+        min_length: 9000
+        sequences_per_group: 20
+    prM-E:
+        min_length: 600
+        sequences_per_group: 200
 refine:
     coalescent: "opt"
     date_inference: "marginal"

diff --git a/phylogenetic/defaults/dropped_strains.txt → ...netic/defaults/dropped_strains_genome.txt b/phylogenetic/defaults/dropped_strains.txt → ...netic/defaults/dropped_strains_genome.txt
diff --git a/phylogenetic/defaults/dropped_strains_prM-E.txt b/phylogenetic/defaults/dropped_strains_prM-E.txt
diff --git a/phylogenetic/defaults/genemap.gff → phylogenetic/defaults/genemap_genome.gff b/phylogenetic/defaults/genemap.gff → phylogenetic/defaults/genemap_genome.gff
diff --git a/phylogenetic/defaults/genemap_prM-E.gff b/phylogenetic/defaults/genemap_prM-E.gff
@@ -0,0 +1,5 @@
+##sequence-region prM-E 1 672
+NC_002031.1	feature	source	1	672	.	+	.	gene=nuc
+NC_002031.1	feature	gene	1	333	.	+	.	gene_name=prM
+NC_002031.1	feature	gene	109	333	.	+	.	gene_name=M
+NC_002031.1	feature	gene	334	672	.	+	.	gene_name=E
diff --git a/phylogenetic/defaults/include_strains_prM-E.txt b/phylogenetic/defaults/include_strains_prM-E.txt
diff --git a/phylogenetic/defaults/reference.fasta → phylogenetic/defaults/reference_genome.fasta b/phylogenetic/defaults/reference.fasta → phylogenetic/defaults/reference_genome.fasta
diff --git a/phylogenetic/defaults/reference.gb → phylogenetic/defaults/reference_genome.gb b/phylogenetic/defaults/reference.gb → phylogenetic/defaults/reference_genome.gb
diff --git a/phylogenetic/defaults/reference_prM-E.fasta b/phylogenetic/defaults/reference_prM-E.fasta
@@ -0,0 +1,13 @@
+> prM-E region (genome 641-1312, 672 nt)
+CCAAGAGAGGAGCCAGATGACATTGATTGCTGGTGCTATGGGGTGGAAAACGTTAGAGTC
+GCATATGGTAAGTGTGACTCAGCAGGCAGGTCTAGGAGGTCAAGAAGGGCCATTGACTTG
+CCTACGCATGAAAACCATGGTTTGAAGACCCGGCAAGAAAAATGGATGACTGGAAGAATG
+GGTGAAAGGCAACTCCAAAAGATTGAGAGATGGCTCGTGAGGAACCCCTTTTTTGCAGTG
+ACAGCTCTGACCATTGCCTACCTTGTGGGAAGCAACATGACGCAACGAGTCGTGATTGCC
+CTACTGGTCTTGGCTGTTGGTCCGGCCTACTCAGCTCACTGCATTGGAATTACTGACAGG
+GATTTCATTGAGGGGGTGCATGGAGGAACTTGGGTTTCAGCTACCCTGGAGCAAGACAAG
+TGTGTCACTGTTATGGCCCCTGACAAGCCTTCATTGGACATCTCACTAGAGACAGTAGCC
+ATTGATGGACCTGCTGAGGCGAGGAAAGTGTGTTACAATGCAGTTCTCACTCATGTGAAG
+ATTAATGACAAGTGCCCCAGCACTGGAGAGGCCCACCTAGCTGAAGAGAACGAAGGGGAC
+AATGCGTGCAAGCGCACTTATTCTGATAGAGGCTGGGGCAATGGCTGTGGCCTATTTGGG
+AAAGGGAGCATT
diff --git a/phylogenetic/rules/annotate_phylogeny.smk b/phylogenetic/rules/annotate_phylogeny.smk
@@ -7,7 +7,7 @@ rule ancestral:
     """Reconstructing ancestral sequences and mutations"""
     input:
         tree = "results/{gene}/tree.nwk",
-        alignment = "results/{gene}/aligned.fasta"
+        alignment = "results/{gene}/aligned_and_filtered.fasta"
     output:
         node_data = "results/{gene}/nt_muts.json"
     params:
@@ -31,7 +31,7 @@ rule translate:
     input:
         tree = "results/{gene}/tree.nwk",
         node_data = "results/{gene}/nt_muts.json",
-        genemap = "defaults/genemap.gff"
+        genemap = "defaults/genemap_{gene}.gff"
     output:
         node_data = "results/{gene}/aa_muts.json"
     log:

diff --git a/phylogenetic/rules/construct_phylogeny.smk b/phylogenetic/rules/construct_phylogeny.smk
@@ -5,7 +5,7 @@ This part of the workflow constructs the phylogenetic tree.
 rule tree:
     """Building tree"""
     input:
-        alignment = "results/{gene}/aligned.fasta"
+        alignment = "results/{gene}/aligned_and_filtered.fasta"
     output:
         tree = "results/{gene}/tree_raw.nwk"
     log:
@@ -29,7 +29,7 @@ rule refine:
     """
     input:
         tree = "results/{gene}/tree_raw.nwk",
-        alignment = "results/{gene}/aligned.fasta",
+        alignment = "results/{gene}/aligned_and_filtered.fasta",
         metadata = "../ingest/results/metadata.tsv"
     output:
         tree = "results/{gene}/tree.nwk",
@@ -54,7 +54,6 @@ rule refine:
             --output-tree {output.tree:q} \
             --output-node-data {output.node_data:q} \
             --coalescent {params.coalescent:q} \
-            --timetree \
             --date-confidence \
             --date-inference {params.date_inference:q} \
             --clock-filter-iqd {params.clock_filter_iqd:q} \

diff --git a/phylogenetic/rules/export.smk b/phylogenetic/rules/export.smk
@@ -13,8 +13,8 @@ rule export:
         aa_muts = "results/{gene}/aa_muts.json",
         traits = "results/{gene}/traits.json",
         colors = config["files"]["colors"],
-        auspice_config = lambda wildcard: "defaults/auspice_config.json" if wildcard.gene in ["genome"] else "defaults/auspice_config_N450.json",
-        description=config["files"]["description"]
+        auspice_config = lambda w: config["files"]["auspice_config_genome"] if w.gene == "genome" else config["files"]["auspice_config_prM-E"],
+        description=config["files"]["description"],
     output:
         auspice_json = "auspice/yellow-fever-virus_{gene}.json"
     params:

diff --git a/phylogenetic/rules/prepare_sequences.smk b/phylogenetic/rules/prepare_sequences.smk
@@ -3,33 +3,27 @@ This part of the workflow prepares sequences for constructing the
 phylogenetic tree.
 """
 
-rule filter:
-    message: """
-    Filtering to
-      - {params.sequences_per_group} sequence(s) per {params.group_by!s}
-      - excluding strains in {input.exclude}
-      - minimum genome length of {params.min_length}
-    """
+rule filter_genome:
     input:
-        exclude = config["files"]["exclude"],
-        include = config["files"]["include_genome"],
+        exclude = config["files"]["genome"]["exclude"],
+        include = config["files"]["genome"]["include"],
         # TODO once this repo is fully automated and uploading data to
         # S3, this step should download data from there instead of
         # depending on the ingest build
         metadata = "../ingest/results/metadata.tsv",
         sequences = "../ingest/results/sequences.fasta"
     output:
-        sequences = "results/{gene}/filtered.fasta"
+        sequences = "results/genome/filtered.fasta"
     params:
         group_by = config["filter"]["group_by"],
         min_date = config["filter"]["min_date"],
-        min_length = config["filter"]["min_length"],
-        sequences_per_group = config["filter"]["sequences_per_group"],
+        min_length = config["filter"]["genome"]["min_length"],
+        sequences_per_group = config["filter"]["genome"]["sequences_per_group"],
         strain_id = config["strain_id_field"]
     log:
-        "logs/{gene}/filter.txt",
+        "logs/genome/filter_genome.txt",
     benchmark:
-        "benchmarks/{gene}/filter.txt"
+        "benchmarks/genome/filter_genome.txt"
     shell:
         r"""
         augur filter \
@@ -46,17 +40,16 @@ rule filter:
           2> {log:q}
         """
 
-rule align:
+rule align_genome:
     input:
         sequences="results/genome/filtered.fasta",
-        reference=config["files"]["reference_gb"],
-        genemap=config["files"]["genemap"],
+        reference=config["files"]["genome"]["reference"],
     output:
-        alignment="results/{gene}/aligned.fasta",
+        alignment="results/genome/aligned_and_filtered.fasta",
     log:
-        "logs/{gene}/align.txt",
+        "logs/genome/align_genome.txt",
     benchmark:
-        "benchmarks/{gene}/align.txt"
+        "benchmarks/genome/align_genome.txt"
     shell:
         r"""
         augur align \
@@ -67,3 +60,72 @@ rule align:
             --remove-reference \
           2> {log:q}
         """
+
+
+rule align_and_extract_prME:
+    input:
+        reference=config["files"]["prM-E"]["reference"],
+        # TODO once this repo is fully automated and uploading data to
+        # S3, this step should download data from there instead of
+        # depending on the ingest build
+        sequences = "../ingest/results/sequences.fasta",
+    output:
+        alignment = "results/prM-E/aligned.fasta"
+    params:
+        group_by = config["filter"]["group_by"],
+        min_date = config["filter"]["min_date"],
+        min_length = config["filter"]["prM-E"]["min_length"],
+        sequences_per_group = config["filter"]["prM-E"]["sequences_per_group"],
+        strain_id = config["strain_id_field"]
+    log:
+        "logs/genome/filter_and_extract_prM-E.txt",
+    benchmark:
+        "benchmarks/genome/filter_and_extract_prM-E.txt"
+    shell:
+        r"""
+        augur align \
+            --sequences {input.sequences} \
+            --reference-sequence {input.reference} \
+            --output {output.alignment} \
+            --fill-gaps \
+            --remove-reference \
+          2> {log:q}
+        """
+
+
+rule filter_prME:
+    input:
+        exclude = config["files"]["prM-E"]["exclude"],
+        include = config["files"]["prM-E"]["include"],
+        # TODO once this repo is fully automated and uploading data to
+        # S3, this step should download data from there instead of
+        # depending on the ingest build
+        metadata = "../ingest/results/metadata.tsv",
+        sequences = "results/prM-E/aligned.fasta"
+    output:
+        sequences = "results/prM-E/aligned_and_filtered.fasta"
+    params:
+        group_by = config["filter"]["group_by"],
+        min_date = config["filter"]["min_date"],
+        min_length = config["filter"]["prM-E"]["min_length"],
+        sequences_per_group = config["filter"]["prM-E"]["sequences_per_group"],
+        strain_id = config["strain_id_field"]
+    log:
+        "logs/genome/filter_prM-E.txt",
+    benchmark:
+        "benchmarks/genome/filter_prM-E.txt"
+    shell:
+        r"""
+        augur filter \
+            --sequences {input.sequences:q} \
+            --metadata {input.metadata:q} \
+            --metadata-id-columns {params.strain_id:q} \
+            --exclude {input.exclude:q} \
+            --include {input.include:q} \
+            --output {output.sequences:q} \
+            --group-by {params.group_by} \
+            --sequences-per-group {params.sequences_per_group:q} \
+            --min-date {params.min_date:q} \
+            --min-length {params.min_length:q} \
+          2> {log:q}
+        """