Nextclade assignment into subtypes #16

nextstrain · Feb 24, 2024 · 36a0bb1 · 36a0bb1
2 parents 9f54ad5 + b5dc665
commit 36a0bb1
Show file tree

Hide file tree

Showing 34 changed files with 496,467 additions and 20 deletions.
diff --git a/ingest/README.md b/ingest/README.md
@@ -8,30 +8,43 @@ Follow the [standard installation instructions](https://docs.nextstrain.org/en/l
 
 ## Usage
 
-> NOTE: All command examples assume you are within the `ingest` directory.
-> If running commands from the outer `dengue` directory, please replace the `.` with `ingest`
+All workflows are expected to be run from the top-level pathogen repo directory.
+The default ingest workflow should be run with
 
 Fetch sequences with
 
 ```sh
-nextstrain build . data/sequences.ndjson
+nextstrain build ingest data/sequences.ndjson
 ```
 
 Run the complete ingest pipeline with
 
 ```sh
-nextstrain build .
+nextstrain build ingest
 ```
 
-This will produce two files (within the `ingest` directory):
+This will produce 10 files (within the `ingest` directory):
 
-- `results/metadata.tsv`
-- `results/sequences.fasta`
+A pair of files with all the dengue sequences:
+
+- `ingest/results/metadata_all.tsv`
+- `ingest/results/sequences_all.fasta`
+
+A pair of files for each dengue serotype (denv1 - denv4):
+
+- `ingest/results/metadata_denv1.tsv`
+- `ingest/results/sequences_denv1.fasta`
+- `ingest/results/metadata_denv2.tsv`
+- `ingest/results/sequences_denv2.fasta`
+- `ingest/results/metadata_denv3.tsv`
+- `ingest/results/sequences_denv3.fasta`
+- `ingest/results/metadata_denv4.tsv`
+- `ingest/results/sequences_denv4.fasta`
 
 Run the complete ingest pipeline and upload results to AWS S3 with
 
 ```sh
-nextstrain build . --configfiles config/config.yaml config/optional.yaml
+nextstrain build ingest --configfiles config/config.yaml config/optional.yaml
 ```
 
 ### Adding new sequences not from GenBank

diff --git a/ingest/Snakefile b/ingest/Snakefile
@@ -60,6 +60,7 @@ rule all:
 include: "workflow/snakemake_rules/fetch_sequences.smk"
 include: "workflow/snakemake_rules/transform.smk"
 include: "workflow/snakemake_rules/split_serotypes.smk"
+include: "workflow/snakemake_rules/nextclade.smk"
 
 
 if config.get("upload", False):

diff --git a/ingest/config/config.yaml b/ingest/config/config.yaml
@@ -106,3 +106,8 @@ transform:
     'authors',
     'institution'
   ]
+
+nextclade:
+  min_length: 1000 # E gene length is approximately 1400nt
+  min_seed_cover: 0.1
+  nextclade_field: 'nextclade_subtype'
diff --git a/ingest/config/optional.yaml b/ingest/config/optional.yaml
@@ -10,11 +10,16 @@ upload:
     files_to_upload:
       genbank.ndjson.xz: data/genbank.ndjson
       all_sequences.ndjson.xz: data/sequences.ndjson
-      metadata.tsv.gz: results/metadata.tsv
-      sequences.fasta.xz: results/sequences.fasta
-      alignment.fasta.xz: data/alignment.fasta
-      insertions.csv.gz: data/insertions.csv
-      translations.zip: data/translations.zip
+      metadata_all.tsv.zst: results/metadata_all.tsv
+      sequences_all.fasta.zst: results/sequences_all.fasta
+      metadata_denv1.tsv.zst: results/metadata_denv1.tsv
+      sequences_denv1.fasta.zst: results/sequences_denv1.fasta
+      metadata_denv2.tsv.zst: results/metadata_denv2.tsv
+      sequences_denv2.fasta.zst: results/sequences_denv2.fasta
+      metadata_denv3.tsv.zst: results/metadata_denv3.tsv
+      sequences_denv3.fasta.zst: results/sequences_denv3.fasta
+      metadata_denv4.tsv.zst: results/metadata_denv4.tsv
+      sequences_denv4.fasta.zst: results/sequences_denv4.fasta
 
     cloudfront_domain: 'data.nextstrain.org'
 

diff --git a/ingest/workflow/snakemake_rules/nextclade.smk b/ingest/workflow/snakemake_rules/nextclade.smk
@@ -0,0 +1,104 @@
+"""
+This part of the workflow handles running Nextclade on the curated metadata
+and sequences.
+REQUIRED INPUTS:
+    metadata    = data/metadata_all.tsv
+    sequences   = results/sequences_{serotype}.fasta
+    nextclade_datasets = ../nextclade_data/{serotype}
+OUTPUTS:
+    metadata_all    = results/metadata_all.tsv
+    metadata        = results/metadata_{serotype}.tsv
+    nextclade       = results/nextclade_subtypes.tsv
+See Nextclade docs for more details on usage, inputs, and outputs if you would
+like to customize the rules:
+https://docs.nextstrain.org/projects/nextclade/page/user/nextclade-cli.html
+"""
+
+# Serotypes for which Nextclade is run; each one is looked up as a dataset
+# directory at ../nextclade_data/{serotype}.
+SUPPORTED_NEXTCLADE_SEROTYPES = ['denv1', 'denv2', 'denv3', 'denv4']
+# Regex alternation ("denv1|denv2|...") used as the `serotype` wildcard
+# constraint so the wildcard only matches the supported serotypes.
+SEROTYPE_CONSTRAINTS = '|'.join(SUPPORTED_NEXTCLADE_SEROTYPES)
+
+rule nextclade_denvX:
+    """
+    Classify one serotype's sequences into subtypes with Nextclade, using that serotype's dataset.
+    """
+    wildcard_constraints:
+        serotype=SEROTYPE_CONSTRAINTS
+    input:
+        sequences="results/sequences_{serotype}.fasta",
+        dataset="../nextclade_data/{serotype}",
+    output:
+        nextclade_denvX="data/nextclade_results/nextclade_{serotype}.tsv",
+    params:
+        min_length=config["nextclade"]["min_length"],
+        min_seed_cover=config["nextclade"]["min_seed_cover"],
+    threads: 4
+    shell:
+        """
+        nextclade run \
+          --silent \
+          --jobs {threads} \
+          --input-dataset {input.dataset} \
+          --min-length {params.min_length} \
+          --min-seed-cover {params.min_seed_cover} \
+          --output-tsv {output.nextclade_denvX} \
+          {input.sequences}
+        """
+
+rule concat_nextclade_subtype_results:
+    """
+    Concatenate all the nextclade results for dengue subtype classification
+    into a single two-column TSV.
+
+    A header line built from the configured id field and nextclade field names
+    is written first. The seqName and clade columns of the per-serotype
+    results are then appended, minus the seqName/clade header line.
+    """
+    input:
+        expand("data/nextclade_results/nextclade_{serotype}.tsv", serotype=SUPPORTED_NEXTCLADE_SEROTYPES),
+    output:
+        nextclade_subtypes="results/nextclade_subtypes.tsv",
+    params:
+        id_field=config["transform"]["id_field"],
+        nextclade_field=config["nextclade"]["nextclade_field"],
+    shell:
+        """
+        echo "{params.id_field},{params.nextclade_field}" \
+        | tr ',' '\t' \
+        > {output.nextclade_subtypes}
+
+        tsv-select -H -f "seqName,clade" {input} \
+        | awk 'NR>1 {{print}}' \
+        >> {output.nextclade_subtypes}
+        """
+
+rule append_nextclade_columns:
+    """
+    Append the nextclade results to the metadata.
+
+    Joins the subtype table onto the metadata by the configured id field and
+    adds the configured nextclade field as a new column. Every metadata record
+    is kept; records with no match in the subtype table get "?" as the value.
+    """
+    input:
+        metadata="data/metadata_all.tsv",
+        nextclade_subtypes="results/nextclade_subtypes.tsv",
+    output:
+        metadata_all="results/metadata_all.tsv",
+    params:
+        id_field=config["transform"]["id_field"],
+        nextclade_field=config["nextclade"]["nextclade_field"],
+    shell:
+        # The fill value is quoted: a bare ? is a shell glob and would expand
+        # to any single-character filename present in the working directory.
+        """
+        tsv-join -H \
+            --filter-file {input.nextclade_subtypes} \
+            --key-fields {params.id_field} \
+            --append-fields {params.nextclade_field} \
+            --write-all "?" \
+            {input.metadata} \
+        > {output.metadata_all}
+        """
+
+rule split_metadata_by_serotype:
+    """
+    Write one metadata table per serotype, keeping only the rows of the
+    combined metadata whose ncbi_serotype equals the requested serotype.
+    """
+    wildcard_constraints:
+        serotype=SEROTYPE_CONSTRAINTS
+    input:
+        metadata="results/metadata_all.tsv",
+    output:
+        serotype_metadata="results/metadata_{serotype}.tsv"
+    shell:
+        """
+        tsv-filter -H \
+            --str-eq ncbi_serotype:{wildcards.serotype} \
+            {input.metadata} \
+        > {output.serotype_metadata}
+        """
diff --git a/ingest/workflow/snakemake_rules/split_serotypes.smk b/ingest/workflow/snakemake_rules/split_serotypes.smk
@@ -2,12 +2,11 @@
 This part of the workflow handles splitting the data by serotype either based on the 
 NCBI metadata or Nextclade dataset. Could use both if necessary to cross-validate.
 
-    metadata = "results/metadata_all.tsv"
+    metadata = "data/metadata_all.tsv"
     sequences = "results/sequences_all.fasta"
 
 This will produce output files as
 
-    metadata_{serotype} = "results/metadata_{serotype}.tsv"
     sequences_{serotype} = "results/sequences_{serotype}.fasta"
 
 Parameters are expected to be defined in `config.transform`.
@@ -18,10 +17,9 @@ rule split_by_ncbi_serotype:
     Split the data by serotype based on the NCBI metadata.
     """
     input:
-        metadata = "results/metadata_all.tsv",
+        metadata = "data/metadata_all.tsv",
         sequences = "results/sequences_all.fasta"
     output:
-        metadata = "results/metadata_{serotype}.tsv",
         sequences = "results/sequences_{serotype}.fasta"
     params:
         id_field = config["transform"]["id_field"]
@@ -32,6 +30,5 @@ rule split_by_ncbi_serotype:
           --metadata {input.metadata} \
           --metadata-id-columns {params.id_field} \
           --query "ncbi_serotype=='{wildcards.serotype}'" \
-          --output-sequences {output.sequences} \
-          --output-metadata {output.metadata}
+          --output-sequences {output.sequences}
         """
diff --git a/ingest/workflow/snakemake_rules/transform.smk b/ingest/workflow/snakemake_rules/transform.smk
@@ -42,7 +42,7 @@ rule transform:
         all_geolocation_rules="data/all-geolocation-rules.tsv",
         annotations=config["transform"]["annotations"],
     output:
-        metadata="results/metadata_all.tsv",
+        metadata="data/metadata_all.tsv",
         sequences="results/sequences_all.fasta",
     log:
         "logs/transform.txt",

diff --git a/nextclade_data/README.md b/nextclade_data/README.md
@@ -0,0 +1,8 @@
+# Nextclade v3 Dataset
+
+| Serotype | Reference | Nextclade link |
+|:--|:--|:--|
+| denv1 | [NC_001477.1](https://www.ncbi.nlm.nih.gov/nuccore/NC_001477.1) | [nextclade denv1](https://clades.nextstrain.org/?dataset-url=https://github.com/nextstrain/dengue/tree/main/nextclade_data/denv1) |
+| denv2 | [NC_001474.2](https://www.ncbi.nlm.nih.gov/nuccore/NC_001474.2) | [nextclade denv2](https://clades.nextstrain.org/?dataset-url=https://github.com/nextstrain/dengue/tree/main/nextclade_data/denv2) |
+| denv3 | [NC_001475.2](https://www.ncbi.nlm.nih.gov/nuccore/NC_001475.2) | [nextclade denv3](https://clades.nextstrain.org/?dataset-url=https://github.com/nextstrain/dengue/tree/main/nextclade_data/denv3) |
+| denv4 | [NC_002640.1](https://www.ncbi.nlm.nih.gov/nuccore/NC_002640.1) | [nextclade denv4](https://clades.nextstrain.org/?dataset-url=https://github.com/nextstrain/dengue/tree/main/nextclade_data/denv4) |
diff --git a/nextclade_data/denv1/CHANGELOG.md b/nextclade_data/denv1/CHANGELOG.md
@@ -0,0 +1,5 @@
+## Unreleased
+
+Initial release for Nextclade v3!
+
+Read more about Nextclade datasets in the documentation: https://docs.nextstrain.org/projects/nextclade/en/stable/user/datasets.html
diff --git a/nextclade_data/denv1/README.md b/nextclade_data/denv1/README.md
@@ -0,0 +1,8 @@
+# Nextclade dataset for "Dengue virus DENV1" (nextclade_data/denv1)
+
+
+## Dataset attributes
+
+Nextclade dataset
+
+Read more about Nextclade datasets in Nextclade documentation: https://docs.nextstrain.org/projects/nextclade/en/stable/user/datasets.html
diff --git a/nextclade_data/denv1/genome_annotation.gff3 b/nextclade_data/denv1/genome_annotation.gff3
@@ -0,0 +1,14 @@
+##gff-version 3
+##sequence-region NC_001477.1 1 10735
+NC_001477.1	feature	gene	95	394	.	+	.	codon_start=1;gene=C;gene_name=C;
+NC_001477.1	feature	gene	437	709	.	+	.	codon_start=1;gene=pr;gene_name=pr;
+NC_001477.1	feature	gene	710	934	.	+	.	codon_start=1;gene=M;gene_name=M;
+NC_001477.1	feature	gene	935	2419	.	+	.	codon_start=1;gene=E;gene_name=E;
+NC_001477.1	feature	gene	2420	3475	.	+	.	codon_start=1;gene=NS1;gene_name=NS1;
+NC_001477.1	feature	gene	3476	4129	.	+	.	codon_start=1;gene=NS2A;gene_name=NS2A;
+NC_001477.1	feature	gene	4130	4519	.	+	.	codon_start=1;gene=NS2B;gene_name=NS2B;
+NC_001477.1	feature	gene	4520	6376	.	+	.	codon_start=1;gene=NS3;gene_name=NS3;
+NC_001477.1	feature	gene	6377	6757	.	+	.	codon_start=1;gene=NS4A;gene_name=NS4A;
+NC_001477.1	feature	gene	6758	6826	.	+	.	codon_start=1;gene=2K;gene_name=2K;
+NC_001477.1	feature	gene	6827	7573	.	+	.	codon_start=1;gene=NS4B;gene_name=NS4B;
+NC_001477.1	feature	gene	7574	10270	.	+	.	codon_start=1;gene=NS5;gene_name=NS5;
diff --git a/nextclade_data/denv1/pathogen.json b/nextclade_data/denv1/pathogen.json
@@ -0,0 +1,65 @@
+{
+  "alignmentParams": {
+    "minSeedCover": 0.1,
+    "minLength": 1000
+  },
+  "attributes": {
+    "name": "Dengue virus DENV1 dataset",
+    "reference accession": "NC_001477",
+    "reference name": "NC_001477"
+  },
+  "compatibility": {
+    "cli": "3.0.0-alpha.0",
+    "web": "3.0.0-alpha.0"
+  },
+  "deprecated": false,
+  "enabled": true,
+  "experimental": true,
+  "files": {
+    "changelog": "CHANGELOG.md",
+    "genomeAnnotation": "genome_annotation.gff3",
+    "pathogenJson": "pathogen.json",
+    "readme": "README.md",
+    "reference": "reference.fasta",
+    "treeJson": "tree.json"
+  },
+  "meta": {
+    "bugs": "https://github.com/nextstrain/nextclade_data/issues",
+    "source code": "https://github.com/nextstrain/nextclade_data"
+  },
+  "qc": {
+    "frameShifts": {
+      "enabled": false
+    },
+    "missingData": {
+      "enabled": false,
+      "missingDataThreshold": 1000,
+      "scoreBias": 100
+    },
+    "mixedSites": {
+      "enabled": false,
+      "mixedSitesThreshold": 8
+    },
+    "privateMutations": {
+      "cutoff": 25,
+      "enabled": false,
+      "typical": 10,
+      "weightLabeledSubstitutions": 2,
+      "weightReversionSubstitutions": 1,
+      "weightUnlabeledSubstitutions": 1
+    },
+    "snpClusters": {
+      "clusterCutOff": 5,
+      "enabled": false,
+      "scoreWeight": 50,
+      "windowSize": 100
+    },
+    "stopCodons": {
+      "enabled": false
+    }
+  },
+  "schemaVersion": "3.0.0",
+  "version": {
+    "tag": "unreleased"
+  }
+}