-
Notifications
You must be signed in to change notification settings - Fork 11
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Nextclade assignment into subtypes #16
- Loading branch information
Showing
34 changed files
with
496,467 additions
and
20 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,104 @@ | ||
""" | ||
This part of the workflow runs Nextclade on the curated sequences and appends | ||
the resulting subtype assignments to the curated metadata. | ||
REQUIRED INPUTS: | ||
metadata = data/metadata_all.tsv | ||
sequences = results/sequences_{serotype}.fasta | ||
nextclade_datasets = ../nextclade_data/{serotype} | ||
OUTPUTS: | ||
metadata = results/metadata_{serotype}.tsv | ||
nextclade = results/nextclade_subtypes.tsv | ||
See Nextclade docs for more details on usage, inputs, and outputs if you would | ||
like to customize the rules: | ||
https://docs.nextstrain.org/projects/nextclade/page/user/nextclade-cli.html | ||
""" | ||
|
||
# Serotypes that have a Nextclade dataset; expanded over when collecting the per-serotype results. | ||
SUPPORTED_NEXTCLADE_SEROTYPES = ['denv1', 'denv2', 'denv3', 'denv4'] | ||
# The serotypes joined with '|', passed as the `serotype` wildcard constraint in the rules below. | ||
SEROTYPE_CONSTRAINTS = '|'.join(SUPPORTED_NEXTCLADE_SEROTYPES) | ||
|
||
rule nextclade_denvX: | ||
""" | ||
Run Nextclade on the sequences of a single serotype, using that serotype's | ||
dataset, and write the results for that serotype as a TSV. | ||
""" | ||
wildcard_constraints: | ||
serotype=SEROTYPE_CONSTRAINTS | ||
input: | ||
dataset="../nextclade_data/{serotype}", | ||
sequences="results/sequences_{serotype}.fasta", | ||
output: | ||
nextclade_denvX="data/nextclade_results/nextclade_{serotype}.tsv", | ||
params: | ||
min_seed_cover=config["nextclade"]["min_seed_cover"], | ||
min_length=config["nextclade"]["min_length"], | ||
threads: 4 | ||
shell: | ||
""" | ||
nextclade run \ | ||
--input-dataset {input.dataset} \ | ||
-j {threads} \ | ||
--output-tsv {output.nextclade_denvX} \ | ||
--min-length {params.min_length} \ | ||
--min-seed-cover {params.min_seed_cover} \ | ||
--silent \ | ||
{input.sequences} | ||
""" | ||
|
||
rule concat_nextclade_subtype_results: | ||
""" | ||
Merge the per-serotype Nextclade results into one two-column TSV. | ||
The header row is built from the configured id and nextclade field names; | ||
the seqName and clade columns of every per-serotype result are appended | ||
below it, with their own header row dropped. | ||
""" | ||
input: | ||
expand("data/nextclade_results/nextclade_{serotype}.tsv", serotype=SUPPORTED_NEXTCLADE_SEROTYPES), | ||
output: | ||
nextclade_subtypes="results/nextclade_subtypes.tsv", | ||
params: | ||
nextclade_field=config["nextclade"]["nextclade_field"], | ||
id_field=config["transform"]["id_field"], | ||
shell: | ||
""" | ||
echo "{params.id_field},{params.nextclade_field}" \ | ||
| tr ',' '\t' \ | ||
> {output.nextclade_subtypes} | ||
tsv-select -H -f "seqName,clade" {input} \ | ||
| awk 'NR>1 {{print}}' \ | ||
>> {output.nextclade_subtypes} | ||
""" | ||
|
||
rule append_nextclade_columns: | ||
""" | ||
Append the nextclade results to the metadata. | ||
Metadata records are joined to the Nextclade subtype table on the configured | ||
id field and gain the configured nextclade field as a new column. Every | ||
metadata record is written; records without a match get "?" in the new column. | ||
The "?" is quoted so the shell passes it literally instead of expanding it | ||
as a glob against single-character filenames in the working directory. | ||
""" | ||
input: | ||
metadata="data/metadata_all.tsv", | ||
nextclade_subtypes="results/nextclade_subtypes.tsv", | ||
output: | ||
metadata_all="results/metadata_all.tsv", | ||
params: | ||
id_field=config["transform"]["id_field"], | ||
nextclade_field=config["nextclade"]["nextclade_field"], | ||
shell: | ||
""" | ||
tsv-join -H \ | ||
--filter-file {input.nextclade_subtypes} \ | ||
--key-fields {params.id_field} \ | ||
--append-fields {params.nextclade_field} \ | ||
--write-all "?" \ | ||
{input.metadata} \ | ||
> {output.metadata_all} | ||
""" | ||
|
||
rule split_metadata_by_serotype: | ||
""" | ||
Write the rows of the combined metadata whose ncbi_serotype column equals | ||
the requested serotype to a per-serotype metadata file. | ||
""" | ||
wildcard_constraints: | ||
serotype=SEROTYPE_CONSTRAINTS | ||
input: | ||
metadata="results/metadata_all.tsv", | ||
output: | ||
serotype_metadata="results/metadata_{serotype}.tsv" | ||
shell: | ||
""" | ||
tsv-filter -H --str-eq ncbi_serotype:{wildcards.serotype} {input.metadata} > {output.serotype_metadata} | ||
""" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
# Nextclade v3 Dataset | ||
|
||
| Serotype | Reference | Nextclade link | | ||
|:--|:--|:--| | ||
| denv1 | [NC_001477.1](https://www.ncbi.nlm.nih.gov/nuccore/NC_001477.1) | [nextclade denv1](https://clades.nextstrain.org/?dataset-url=https://github.com/nextstrain/dengue/tree/main/nextclade_data/denv1) | | ||
| denv2 | [NC_001474.2](https://www.ncbi.nlm.nih.gov/nuccore/NC_001474.2) | [nextclade denv2](https://clades.nextstrain.org/?dataset-url=https://github.com/nextstrain/dengue/tree/main/nextclade_data/denv2) | | ||
| denv3 | [NC_001475.2](https://www.ncbi.nlm.nih.gov/nuccore/NC_001475.2) | [nextclade denv3](https://clades.nextstrain.org/?dataset-url=https://github.com/nextstrain/dengue/tree/main/nextclade_data/denv3) | | ||
| denv4 | [NC_002640.1](https://www.ncbi.nlm.nih.gov/nuccore/NC_002640.1) | [nextclade denv4](https://clades.nextstrain.org/?dataset-url=https://github.com/nextstrain/dengue/tree/main/nextclade_data/denv4) | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
## Unreleased | ||
|
||
Initial release for Nextclade v3! | ||
|
||
Read more about Nextclade datasets in the documentation: https://docs.nextstrain.org/projects/nextclade/en/stable/user/datasets.html |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
# Nextclade dataset for "Dengue virus DENV1" | ||
|
||
|
||
## Dataset attributes | ||
|
||
Nextclade dataset | ||
|
||
Read more about Nextclade datasets in Nextclade documentation: https://docs.nextstrain.org/projects/nextclade/en/stable/user/datasets.html |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
##gff-version 3 | ||
##sequence-region NC_001477.1 1 10735 | ||
NC_001477.1 feature gene 95 394 . + . codon_start=1;gene=C;gene_name=C; | ||
NC_001477.1 feature gene 437 709 . + . codon_start=1;gene=pr;gene_name=pr; | ||
NC_001477.1 feature gene 710 934 . + . codon_start=1;gene=M;gene_name=M; | ||
NC_001477.1 feature gene 935 2419 . + . codon_start=1;gene=E;gene_name=E; | ||
NC_001477.1 feature gene 2420 3475 . + . codon_start=1;gene=NS1;gene_name=NS1; | ||
NC_001477.1 feature gene 3476 4129 . + . codon_start=1;gene=NS2A;gene_name=NS2A; | ||
NC_001477.1 feature gene 4130 4519 . + . codon_start=1;gene=NS2B;gene_name=NS2B; | ||
NC_001477.1 feature gene 4520 6376 . + . codon_start=1;gene=NS3;gene_name=NS3; | ||
NC_001477.1 feature gene 6377 6757 . + . codon_start=1;gene=NS4A;gene_name=NS4A; | ||
NC_001477.1 feature gene 6758 6826 . + . codon_start=1;gene=2K;gene_name=2K; | ||
NC_001477.1 feature gene 6827 7573 . + . codon_start=1;gene=NS4B;gene_name=NS4B; | ||
NC_001477.1 feature gene 7574 10270 . + . codon_start=1;gene=NS5;gene_name=NS5; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
{ | ||
"alignmentParams": { | ||
"minSeedCover": 0.1, | ||
"minLength": 1000 | ||
}, | ||
"attributes": { | ||
"name": "Dengue virus DENV1 dataset", | ||
"reference accession": "NC_001477", | ||
"reference name": "NC_001477" | ||
}, | ||
"compatibility": { | ||
"cli": "3.0.0-alpha.0", | ||
"web": "3.0.0-alpha.0" | ||
}, | ||
"deprecated": false, | ||
"enabled": true, | ||
"experimental": true, | ||
"files": { | ||
"changelog": "CHANGELOG.md", | ||
"genomeAnnotation": "genome_annotation.gff3", | ||
"pathogenJson": "pathogen.json", | ||
"readme": "README.md", | ||
"reference": "reference.fasta", | ||
"treeJson": "tree.json" | ||
}, | ||
"meta": { | ||
"bugs": "https://github.com/nextstrain/nextclade_data/issues", | ||
"source code": "https://github.com/nextstrain/nextclade_data" | ||
}, | ||
"qc": { | ||
"frameShifts": { | ||
"enabled": false | ||
}, | ||
"missingData": { | ||
"enabled": false, | ||
"missingDataThreshold": 1000, | ||
"scoreBias": 100 | ||
}, | ||
"mixedSites": { | ||
"enabled": false, | ||
"mixedSitesThreshold": 8 | ||
}, | ||
"privateMutations": { | ||
"cutoff": 25, | ||
"enabled": false, | ||
"typical": 10, | ||
"weightLabeledSubstitutions": 2, | ||
"weightReversionSubstitutions": 1, | ||
"weightUnlabeledSubstitutions": 1 | ||
}, | ||
"snpClusters": { | ||
"clusterCutOff": 5, | ||
"enabled": false, | ||
"scoreWeight": 50, | ||
"windowSize": 100 | ||
}, | ||
"stopCodons": { | ||
"enabled": false | ||
} | ||
}, | ||
"schemaVersion": "3.0.0", | ||
"version": { | ||
"tag": "unreleased" | ||
} | ||
} |
Oops, something went wrong.