Merge pull request #281 from nextstrain/ingest-nextclade-v3

Migrate to Nextclade v3
nextstrain · Oct 30, 2024 · 4ce0b5b · 4ce0b5b
2 parents b453693 + dab86ff
commit 4ce0b5b
Show file tree

Hide file tree

Showing 5 changed files with 110 additions and 79 deletions.
diff --git a/ingest/build-configs/nextstrain-automation/config.yaml b/ingest/build-configs/nextstrain-automation/config.yaml
@@ -18,9 +18,9 @@ upload:
       all_sequences.ndjson.xz: data/sequences.ndjson
       metadata.tsv.gz: results/metadata.tsv
       sequences.fasta.xz: results/sequences.fasta
-      alignment.fasta.xz: data/alignment.fasta
-      insertions.csv.gz: data/insertions.csv
-      translations.zip: data/translations.zip
+      nextclade.tsv.zst: results/nextclade.tsv
+      alignment.fasta.xz: results/alignment.fasta
+      translations.zip: results/translations.zip
 
     cloudfront_domain: 'data.nextstrain.org'
 

diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml
@@ -80,5 +80,23 @@ curate:
 nextclade:
   # Field to use as the sequence ID in the Nextclade file
   id_field: 'seqName'
-  # Fields from a Nextclade file to be renamed (if desired) and appended to a metadata file
-  field_map: 'defaults/nextclade-field-map.tsv'
+  # The first column should be the original column name of the Nextclade TSV
+  # The second column should be the new column name to use in the final metadata TSV
+  # Nextclade can have pathogen specific output columns so make sure to check which
+  # columns would be useful for your downstream phylogenetic analysis.
+  field_map:
+    seqName: "seqName"
+    clade: "clade"
+    outbreak: "outbreak"
+    lineage: "lineage"
+    coverage: "coverage"
+    totalMissing: "missing_data"
+    totalSubstitutions: "divergence"
+    totalNonACGTNs: "nonACGTN"
+    qc.missingData.status: "QC_missing_data"
+    qc.mixedSites.status: "QC_mixed_sites"
+    qc.privateMutations.status: "QC_rare_mutations"
+    qc.frameShifts.status: "QC_frame_shifts"
+    qc.stopCodons.status: "QC_stop_codons"
+    frameShifts: "frame_shifts"
+    isReverseComplement: "is_reverse_complement"
diff --git a/ingest/defaults/nextclade-field-map.tsv b/ingest/defaults/nextclade-field-map.tsv
diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk
@@ -1,86 +1,115 @@
-
-rule nextclade_dataset:
-    output:
-        temp("mpxv.zip"),
-    shell:
-        """
-        nextclade2 dataset get --name MPXV --output-zip {output}
-        """
+import sys
 
 
-rule nextclade_dataset_hMPXV:
+rule get_nextclade_dataset:
     output:
-        temp("hmpxv.zip"),
+        temp("data/mpxv.zip"),
+    params:
+        dataset_name="MPXV",
+    log:
+        "logs/get_nextclade_dataset.txt",
+    benchmark:
+        "benchmarks/get_nextclade_dataset.txt"
     shell:
-        """
-        nextclade2 dataset get --name hMPXV --output-zip {output}
+        r"""
+        nextclade3 dataset get \
+            --name {params.dataset_name:q} \
+            --output-zip {output:q} 2>&1 | tee {log}
         """
 
 
-rule align:
+rule run_nextclade:
     input:
         sequences="results/sequences.fasta",
-        dataset="hmpxv.zip",
+        dataset="data/mpxv.zip",
     output:
-        alignment="data/alignment.fasta",
-        insertions="data/insertions.csv",
-        translations="data/translations.zip",
+        nextclade="results/nextclade.tsv",
+        alignment="results/alignment.fasta",
+        translations="results/translations.zip",
     params:
         # The lambda is used to deactivate automatic wildcard expansion.
         # https://github.com/snakemake/snakemake/blob/384d0066c512b0429719085f2cf886fdb97fd80a/snakemake/rules.py#L997-L1000
-        translations=lambda w: "data/translations/{gene}.fasta",
-    threads: 4
+        translations=lambda w: "results/translations/{cds}.fasta",
+    threads: workflow.cores
+    log:
+        "logs/run_nextclade.txt",
+    benchmark:
+        "benchmarks/run_nextclade.txt"
     shell:
-        """
-        nextclade2 run -D {input.dataset} -j {threads}   --retry-reverse-complement \
-                  --output-fasta {output.alignment}  --output-translations {params.translations} \
-                  --output-insertions {output.insertions} {input.sequences}
-        zip -rj {output.translations} data/translations
+        r"""
+        nextclade3 run \
+            {input.sequences:q} \
+            --jobs {threads:q} \
+            --retry-reverse-complement \
+            --input-dataset {input.dataset:q} \
+            --output-tsv {output.nextclade:q} \
+            --output-fasta {output.alignment:q} \
+            --output-translations {params.translations:q} 2>&1 | tee {log}
+
+        zip -rj {output.translations:q} results/translations
         """
 
 
-rule nextclade:
+if isinstance(config["nextclade"]["field_map"], str):
+    print(
+        f"Converting config['nextclade']['field_map'] from TSV file ({config['nextclade']['field_map']}) to dictionary; "
+        f"consider putting the field map directly in the config file.",
+        file=sys.stderr,
+    )
+    with open(config["nextclade"]["field_map"], "r") as f:
+        config["nextclade"]["field_map"] = dict(
+            line.rstrip("\n").split("\t", 1) for line in f if not line.startswith("#")
+        )
+
+
+rule nextclade_metadata:
     input:
-        sequences="results/sequences.fasta",
-        dataset="mpxv.zip",
+        nextclade="results/nextclade.tsv",
     output:
-        "data/nextclade.tsv",
-    threads: 4
+        nextclade_metadata=temp("results/nextclade_metadata.tsv"),
+    params:
+        nextclade_id_field=config["nextclade"]["id_field"],
+        nextclade_field_map=[
+            f"{old}={new}" for old, new in config["nextclade"]["field_map"].items()
+        ],
+        nextclade_fields=",".join(config["nextclade"]["field_map"].keys()),
+    log:
+        "logs/nextclade_metadata.txt",
+    benchmark:
+        "benchmarks/nextclade_metadata.txt"
     shell:
-        """
-        nextclade2 run -D {input.dataset} -j {threads} --output-tsv {output}  {input.sequences}  --retry-reverse-complement
+        r"""
+        (tsv-select --header --fields {params.nextclade_fields:q} {input.nextclade} \
+        | augur curate rename \
+            --metadata - \
+            --id-column {params.nextclade_id_field:q} \
+            --field-map {params.nextclade_field_map:q} \
+            --output-metadata {output.nextclade_metadata:q} ) 2>&1 | tee {log}
         """
 
 
-rule join_metadata_clades:
+rule join_metadata_and_nextclade:
     input:
-        nextclade="data/nextclade.tsv",
         metadata="data/subset_metadata.tsv",
-        nextclade_field_map=config["nextclade"]["field_map"],
+        nextclade_metadata="results/nextclade_metadata.tsv",
     output:
         metadata="results/metadata.tsv",
     params:
-        id_field=config["curate"]["id_field"],
+        metadata_id_field=config["curate"]["id_field"],
         nextclade_id_field=config["nextclade"]["id_field"],
+    log:
+        "logs/join_metadata_and_nextclade.txt",
+    benchmark:
+        "benchmarks/join_metadata_and_nextclade.txt"
     shell:
-        """
-        export SUBSET_FIELDS=`awk 'NR>1 {{print $1}}' {input.nextclade_field_map} | tr '\n' ',' | sed 's/,$//g'`
-
-        csvtk -tl cut -f $SUBSET_FIELDS \
-            {input.nextclade} \
-        | csvtk -tl rename2 \
-            -F \
-            -f '*' \
-            -p '(.+)' \
-            -r '{{kv}}' \
-            -k {input.nextclade_field_map} \
-        | tsv-join -H \
-            --filter-file - \
-            --key-fields {params.nextclade_id_field} \
-            --data-fields {params.id_field} \
-            --append-fields '*' \
-            --write-all ? \
-            {input.metadata} \
-        | tsv-select -H --exclude {params.nextclade_id_field} \
-            > {output.metadata}
+        r"""
+        augur merge \
+            --metadata \
+                metadata={input.metadata:q} \
+                nextclade={input.nextclade_metadata:q} \
+            --metadata-id-columns \
+                metadata={params.metadata_id_field:q} \
+                nextclade={params.nextclade_id_field:q} \
+            --output-metadata {output.metadata:q} \
+            --no-source-columns 2>&1 | tee {log}
         """
diff --git a/phylogenetic/defaults/description.md b/phylogenetic/defaults/description.md
@@ -12,7 +12,7 @@ The fourth is [`mpox/clade-I`](https://nextstrain.org/mpox/clade-I), which focus
 
 #### Analysis
 Our bioinformatic processing workflow can be found at [github.com/nextstrain/mpox](https://github.com/nextstrain/mpox) and includes:
-- sequence alignment by [nextalign](https://docs.nextstrain.org/projects/nextclade/en/stable/user/nextalign-cli.html)
+- sequence alignment by [Nextclade](https://docs.nextstrain.org/projects/nextclade/en/stable/user/nextclade-cli/index.html)
 - masking several regions of the genome, including the first 1350 and last 6422 base pairs and multiple repetitive regions of variable length
 - phylogenetic reconstruction using [IQTREE-2](http://www.iqtree.org/)
 - ancestral state reconstruction and temporal inference using [TreeTime](https://github.com/neherlab/treetime)
@@ -26,9 +26,9 @@ Curated sequences and metadata are available as flat files at:
 - [data.nextstrain.org/files/workflows/mpox/sequences.fasta.xz](https://data.nextstrain.org/files/workflows/mpox/sequences.fasta.xz)
 - [data.nextstrain.org/files/workflows/mpox/metadata.tsv.gz](https://data.nextstrain.org/files/workflows/mpox/metadata.tsv.gz)
 
-Pairwise alignments with [Nextclade](https://clades.nextstrain.org/) against the [reference sequence MPXV-M5312_HM12_Rivers](https://www.ncbi.nlm.nih.gov/nuccore/NC_063383), insertions relative to the reference, and translated ORFs are available at
+Pairwise alignments with [Nextclade](https://clades.nextstrain.org/) against the [reference sequence MPXV-M5312_HM12_Rivers](https://www.ncbi.nlm.nih.gov/nuccore/NC_063383), Nextclade analysis results, and translated ORFs are available at
 - [data.nextstrain.org/files/workflows/mpox/alignment.fasta.xz](https://data.nextstrain.org/files/workflows/mpox/alignment.fasta.xz)
-- [data.nextstrain.org/files/workflows/mpox/insertions.csv.gz](https://data.nextstrain.org/files/workflows/mpox/insertions.csv.gz)
+- [data.nextstrain.org/files/workflows/mpox/nextclade.tsv.zst](https://data.nextstrain.org/files/workflows/mpox/nextclade.tsv.zst)
 - [data.nextstrain.org/files/workflows/mpox/translations.zip](https://data.nextstrain.org/files/workflows/mpox/translations.zip)
 
 ---