Skip to content

Commit

Permalink
Nextclade assignment into subtypes #16
Browse files Browse the repository at this point in the history
  • Loading branch information
j23414 authored Feb 24, 2024
2 parents 9f54ad5 + b5dc665 commit 36a0bb1
Show file tree
Hide file tree
Showing 34 changed files with 496,467 additions and 20 deletions.
29 changes: 21 additions & 8 deletions ingest/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,30 +8,43 @@ Follow the [standard installation instructions](https://docs.nextstrain.org/en/l

## Usage

> NOTE: All command examples assume you are within the `ingest` directory.
> If running commands from the outer `dengue` directory, please replace the `.` with `ingest`
All workflows are expected to be run from the top-level pathogen repo directory.
The default ingest workflow should be run with

Fetch sequences with

```sh
nextstrain build . data/sequences.ndjson
nextstrain build ingest data/sequences.ndjson
```

Run the complete ingest pipeline with

```sh
nextstrain build .
nextstrain build ingest
```

This will produce two files (within the `ingest` directory):
This will produce 10 files (within the `ingest` directory):

- `results/metadata.tsv`
- `results/sequences.fasta`
A pair of files with all the dengue sequences:

- `ingest/results/metadata_all.tsv`
- `ingest/results/sequences_all.fasta`

A pair of files for each dengue serotype (denv1 - denv4):

- `ingest/results/metadata_denv1.tsv`
- `ingest/results/sequences_denv1.fasta`
- `ingest/results/metadata_denv2.tsv`
- `ingest/results/sequences_denv2.fasta`
- `ingest/results/metadata_denv3.tsv`
- `ingest/results/sequences_denv3.fasta`
- `ingest/results/metadata_denv4.tsv`
- `ingest/results/sequences_denv4.fasta`

Run the complete ingest pipeline and upload results to AWS S3 with

```sh
nextstrain build . --configfiles config/config.yaml config/optional.yaml
nextstrain build ingest --configfiles config/config.yaml config/optional.yaml
```

### Adding new sequences not from GenBank
Expand Down
1 change: 1 addition & 0 deletions ingest/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ rule all:
include: "workflow/snakemake_rules/fetch_sequences.smk"
include: "workflow/snakemake_rules/transform.smk"
include: "workflow/snakemake_rules/split_serotypes.smk"
include: "workflow/snakemake_rules/nextclade.smk"


if config.get("upload", False):
Expand Down
5 changes: 5 additions & 0 deletions ingest/config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -106,3 +106,8 @@ transform:
'authors',
'institution'
]

nextclade:
min_length: 1000 # E gene length is approximately 1500nt
min_seed_cover: 0.1
nextclade_field: 'nextclade_subtype'
15 changes: 10 additions & 5 deletions ingest/config/optional.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,16 @@ upload:
files_to_upload:
genbank.ndjson.xz: data/genbank.ndjson
all_sequences.ndjson.xz: data/sequences.ndjson
metadata.tsv.gz: results/metadata.tsv
sequences.fasta.xz: results/sequences.fasta
alignment.fasta.xz: data/alignment.fasta
insertions.csv.gz: data/insertions.csv
translations.zip: data/translations.zip
metadata_all.tsv.zst: results/metadata_all.tsv
sequences_all.fasta.zst: results/sequences_all.fasta
metadata_denv1.tsv.zst: results/metadata_denv1.tsv
sequences_denv1.fasta.zst: results/sequences_denv1.fasta
metadata_denv2.tsv.zst: results/metadata_denv2.tsv
sequences_denv2.fasta.zst: results/sequences_denv2.fasta
metadata_denv3.tsv.zst: results/metadata_denv3.tsv
sequences_denv3.fasta.zst: results/sequences_denv3.fasta
metadata_denv4.tsv.zst: results/metadata_denv4.tsv
sequences_denv4.fasta.zst: results/sequences_denv4.fasta

cloudfront_domain: 'data.nextstrain.org'

Expand Down
104 changes: 104 additions & 0 deletions ingest/workflow/snakemake_rules/nextclade.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
"""
This part of the workflow handles running Nextclade on the curated metadata
and sequences.
REQUIRED INPUTS:
metadata = data/metadata_all.tsv
sequences = results/sequences_{serotype}.fasta
nextclade_datasets = ../nextclade_data/{serotype}
OUTPUTS:
metadata = results/metadata_{serotype}.tsv
nextclade = results/nextclade_subtypes.tsv
See Nextclade docs for more details on usage, inputs, and outputs if you would
like to customize the rules:
https://docs.nextstrain.org/projects/nextclade/page/user/nextclade-cli.html
"""

# Serotypes that the Nextclade rules in this file are run for; each name is
# substituted for the {serotype} wildcard in the rule input/output paths.
SUPPORTED_NEXTCLADE_SEROTYPES = ['denv1', 'denv2', 'denv3', 'denv4']
# The same names joined with "|" so they can be used as a regex alternation in
# the `wildcard_constraints` of the rules below.
SEROTYPE_CONSTRAINTS = '|'.join(SUPPORTED_NEXTCLADE_SEROTYPES)

rule nextclade_denvX:
    """
    Run Nextclade once per serotype: the serotype's sequences are analysed
    against the dataset directory of the same name and the tabular report is
    written to data/nextclade_results/.

    The minimum length and minimum seed cover passed to Nextclade are read
    from the `nextclade` section of the workflow config.
    """
    wildcard_constraints:
        serotype=SEROTYPE_CONSTRAINTS
    input:
        sequences="results/sequences_{serotype}.fasta",
        dataset="../nextclade_data/{serotype}",
    output:
        nextclade_denvX="data/nextclade_results/nextclade_{serotype}.tsv",
    params:
        min_length=config["nextclade"]["min_length"],
        min_seed_cover=config["nextclade"]["min_seed_cover"],
    threads: 4
    shell:
        """
        nextclade run \
            --silent \
            -j {threads} \
            --input-dataset {input.dataset} \
            --min-length {params.min_length} \
            --min-seed-cover {params.min_seed_cover} \
            --output-tsv {output.nextclade_denvX} \
            {input.sequences}
        """

rule concat_nextclade_subtype_results:
    """
    Stack the per-serotype Nextclade reports into one two-column TSV of
    subtype calls.

    The output starts with a header made of the configured id field and
    Nextclade field names, followed by the `seqName` and `clade` columns of
    every per-serotype report.
    """
    input:
        expand(
            "data/nextclade_results/nextclade_{serotype}.tsv",
            serotype=SUPPORTED_NEXTCLADE_SEROTYPES,
        ),
    output:
        nextclade_subtypes="results/nextclade_subtypes.tsv",
    params:
        id_field=config["transform"]["id_field"],
        nextclade_field=config["nextclade"]["nextclade_field"],
    shell:
        """
        (
            # Header row using the configured column names.
            echo "{params.id_field},{params.nextclade_field}" | tr ',' '\t'
            # Data rows only: skip the header line that tsv-select writes first.
            tsv-select -H -f "seqName,clade" {input} | tail -n +2
        ) > {output.nextclade_subtypes}
        """

rule append_nextclade_columns:
    """
    Append the Nextclade subtype calls to the curated metadata.

    Inputs:
        metadata: curated metadata table (data/metadata_all.tsv)
        nextclade_subtypes: subtype table keyed on the configured id field
    Output:
        metadata_all: the input metadata plus the column named by
            config["nextclade"]["nextclade_field"]

    Every metadata record is written (--write-all); records without a
    matching row in the subtype table receive the placeholder "?" in the
    appended column. The placeholder is single-quoted because a bare ? is a
    shell pathname-expansion pattern and would be replaced by any
    single-character file name present in the working directory.
    """
    input:
        metadata="data/metadata_all.tsv",
        nextclade_subtypes="results/nextclade_subtypes.tsv",
    output:
        metadata_all="results/metadata_all.tsv",
    params:
        id_field=config["transform"]["id_field"],
        nextclade_field=config["nextclade"]["nextclade_field"],
    shell:
        """
        tsv-join -H \
            --filter-file {input.nextclade_subtypes} \
            --key-fields {params.id_field} \
            --append-fields {params.nextclade_field} \
            --write-all '?' \
            {input.metadata} \
        > {output.metadata_all}
        """

rule split_metadata_by_serotype:
    """
    Write one metadata table per serotype by keeping only the rows of the
    combined metadata whose `ncbi_serotype` column equals the requested
    serotype wildcard.
    """
    wildcard_constraints:
        serotype=SEROTYPE_CONSTRAINTS
    input:
        metadata="results/metadata_all.tsv",
    output:
        serotype_metadata="results/metadata_{serotype}.tsv"
    shell:
        """
        tsv-filter -H \
            --str-eq ncbi_serotype:{wildcards.serotype} \
            {input.metadata} \
        > {output.serotype_metadata}
        """
9 changes: 3 additions & 6 deletions ingest/workflow/snakemake_rules/split_serotypes.smk
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,11 @@
This part of the workflow handles splitting the data by serotype either based on the
NCBI metadata or Nextclade dataset. Could use both if necessary to cross-validate.
metadata = "results/metadata_all.tsv"
metadata = "data/metadata_all.tsv"
sequences = "results/sequences_all.fasta"
This will produce output files as
metadata_{serotype} = "results/metadata_{serotype}.tsv"
sequences_{serotype} = "results/sequences_{serotype}.fasta"
Parameters are expected to be defined in `config.transform`.
Expand All @@ -18,10 +17,9 @@ rule split_by_ncbi_serotype:
Split the data by serotype based on the NCBI metadata.
"""
input:
metadata = "results/metadata_all.tsv",
metadata = "data/metadata_all.tsv",
sequences = "results/sequences_all.fasta"
output:
metadata = "results/metadata_{serotype}.tsv",
sequences = "results/sequences_{serotype}.fasta"
params:
id_field = config["transform"]["id_field"]
Expand All @@ -32,6 +30,5 @@ rule split_by_ncbi_serotype:
--metadata {input.metadata} \
--metadata-id-columns {params.id_field} \
--query "ncbi_serotype=='{wildcards.serotype}'" \
--output-sequences {output.sequences} \
--output-metadata {output.metadata}
--output-sequences {output.sequences}
"""
2 changes: 1 addition & 1 deletion ingest/workflow/snakemake_rules/transform.smk
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ rule transform:
all_geolocation_rules="data/all-geolocation-rules.tsv",
annotations=config["transform"]["annotations"],
output:
metadata="results/metadata_all.tsv",
metadata="data/metadata_all.tsv",
sequences="results/sequences_all.fasta",
log:
"logs/transform.txt",
Expand Down
8 changes: 8 additions & 0 deletions nextclade_data/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Nextclade v3 Dataset

| Serotype | Reference | Nextclade link |
|:--|:--|:--|
| denv1 | [NC_001477.1](https://www.ncbi.nlm.nih.gov/nuccore/NC_001477.1) | [nextclade denv1](https://clades.nextstrain.org/?dataset-url=https://github.com/nextstrain/dengue/tree/main/nextclade_data/denv1) |
| denv2 | [NC_001474.2](https://www.ncbi.nlm.nih.gov/nuccore/NC_001474.2) | [nextclade denv2](https://clades.nextstrain.org/?dataset-url=https://github.com/nextstrain/dengue/tree/main/nextclade_data/denv2) |
| denv3 | [NC_001475.2](https://www.ncbi.nlm.nih.gov/nuccore/NC_001475.2) | [nextclade denv3](https://clades.nextstrain.org/?dataset-url=https://github.com/nextstrain/dengue/tree/main/nextclade_data/denv3) |
| denv4 | [NC_002640.1](https://www.ncbi.nlm.nih.gov/nuccore/NC_002640.1) | [nextclade denv4](https://clades.nextstrain.org/?dataset-url=https://github.com/nextstrain/dengue/tree/main/nextclade_data/denv4) |
5 changes: 5 additions & 0 deletions nextclade_data/denv1/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
## Unreleased

Initial release for Nextclade v3!

Read more about Nextclade datasets in the documentation: https://docs.nextstrain.org/projects/nextclade/en/stable/user/datasets.html
8 changes: 8 additions & 0 deletions nextclade_data/denv1/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Nextclade dataset for "Dengue virus DENV1"


## Dataset attributes

Nextclade dataset

Read more about Nextclade datasets in Nextclade documentation: https://docs.nextstrain.org/projects/nextclade/en/stable/user/datasets.html
14 changes: 14 additions & 0 deletions nextclade_data/denv1/genome_annotation.gff3
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
##gff-version 3
##sequence-region NC_001477.1 1 10735
NC_001477.1 feature gene 95 394 . + . codon_start=1;gene=C;gene_name=C;
NC_001477.1 feature gene 437 709 . + . codon_start=1;gene=pr;gene_name=pr;
NC_001477.1 feature gene 710 934 . + . codon_start=1;gene=M;gene_name=M;
NC_001477.1 feature gene 935 2419 . + . codon_start=1;gene=E;gene_name=E;
NC_001477.1 feature gene 2420 3475 . + . codon_start=1;gene=NS1;gene_name=NS1;
NC_001477.1 feature gene 3476 4129 . + . codon_start=1;gene=NS2A;gene_name=NS2A;
NC_001477.1 feature gene 4130 4519 . + . codon_start=1;gene=NS2B;gene_name=NS2B;
NC_001477.1 feature gene 4520 6376 . + . codon_start=1;gene=NS3;gene_name=NS3;
NC_001477.1 feature gene 6377 6757 . + . codon_start=1;gene=NS4A;gene_name=NS4A;
NC_001477.1 feature gene 6758 6826 . + . codon_start=1;gene=2K;gene_name=2K;
NC_001477.1 feature gene 6827 7573 . + . codon_start=1;gene=NS4B;gene_name=NS4B;
NC_001477.1 feature gene 7574 10270 . + . codon_start=1;gene=NS5;gene_name=NS5;
65 changes: 65 additions & 0 deletions nextclade_data/denv1/pathogen.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
{
"alignmentParams": {
"minSeedCover": 0.1,
"minLength": 1000
},
"attributes": {
"name": "Dengue virus DENV1 dataset",
"reference accession": "NC_001477",
"reference name": "NC_001477"
},
"compatibility": {
"cli": "3.0.0-alpha.0",
"web": "3.0.0-alpha.0"
},
"deprecated": false,
"enabled": true,
"experimental": true,
"files": {
"changelog": "CHANGELOG.md",
"genomeAnnotation": "genome_annotation.gff3",
"pathogenJson": "pathogen.json",
"readme": "README.md",
"reference": "reference.fasta",
"treeJson": "tree.json"
},
"meta": {
"bugs": "https://github.com/nextstrain/nextclade_data/issues",
"source code": "https://github.com/nextstrain/nextclade_data"
},
"qc": {
"frameShifts": {
"enabled": false
},
"missingData": {
"enabled": false,
"missingDataThreshold": 1000,
"scoreBias": 100
},
"mixedSites": {
"enabled": false,
"mixedSitesThreshold": 8
},
"privateMutations": {
"cutoff": 25,
"enabled": false,
"typical": 10,
"weightLabeledSubstitutions": 2,
"weightReversionSubstitutions": 1,
"weightUnlabeledSubstitutions": 1
},
"snpClusters": {
"clusterCutOff": 5,
"enabled": false,
"scoreWeight": 50,
"windowSize": 100
},
"stopCodons": {
"enabled": false
}
},
"schemaVersion": "3.0.0",
"version": {
"tag": "unreleased"
}
}
Loading

0 comments on commit 36a0bb1

Please sign in to comment.