Skip to content

Commit

Permalink
Split out the join_nextclade_clades rule into 3 separate rules for cl…
Browse files Browse the repository at this point in the history
…arity

This rule can be simplified by splitting into 3 different rules.

1. Aggregates all serotype Nextclade outputs to create a single nextclade_subtypes.tsv mapping file
2. Joins metadata with nextclade_subtypes.tsv into the final metadata_all.tsv file
3. Splits the combined metadata + Nextclade file (metadata_all.tsv) into individual serotype metadata.tsv files

The reasoning is that the metadata_all.tsv has all dengue metadata, including
sequences that are not assigned serotypes. Sequences without serotype are not listed
in any of the subtype results.

Therefore the Nextclade subtype calls are added as a new field (nextclade_subtype)
in metadata_all.tsv, which is then split back out into individual per-serotype
metadata files to make them available via data.nextstrain.org.
  • Loading branch information
j23414 committed Feb 16, 2024
1 parent a9b4d01 commit 11a5cbc
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 26 deletions.
3 changes: 2 additions & 1 deletion ingest/config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -109,4 +109,5 @@ transform:

nextclade:
  min_length: 1000 # E gene length is approximately 1400nt
  min_seed_cover: 0.1
  # Column name used for the Nextclade clade call: it is the header of the
  # second column of the concatenated Nextclade table and the field appended
  # to the metadata.
  nextclade_field: 'nextclade_subtype'
77 changes: 52 additions & 25 deletions ingest/workflow/snakemake_rules/nextclade.smk
Original file line number Diff line number Diff line change
Expand Up @@ -23,51 +23,78 @@ rule nextclade_denvX:
{input.sequences}
"""

rule concat_nextclade_subtype_results:
    """
    Concatenate all the nextclade results for dengue subtype classification

    Writes a single two-column TSV: a header row built from the configured
    ID field and Nextclade field names, followed by the `seqName` and `clade`
    columns of each per-serotype Nextclade result table (header rows dropped).
    """
    input:
        nextclade_denv1="data/nextclade_results/nextclade_denv1.tsv",
        nextclade_denv2="data/nextclade_results/nextclade_denv2.tsv",
        nextclade_denv3="data/nextclade_results/nextclade_denv3.tsv",
        nextclade_denv4="data/nextclade_results/nextclade_denv4.tsv",
    output:
        nextclade_subtypes="results/nextclade_subtypes.tsv",
    params:
        # Column names for the header row of the combined table.
        # NOTE(review): assumes Nextclade's `seqName` values are the same
        # identifiers as the metadata `id_field` column -- confirm upstream.
        id_field=config["transform"]["id_field"],
        nextclade_field=config["nextclade"]["nextclade_field"],
    shell:
        """
        echo "{params.id_field},{params.nextclade_field}" \
        | tr ',' '\t' \
        > {output.nextclade_subtypes}
        tsv-select -H -f "seqName,clade" {input.nextclade_denv1} \
        | awk 'NR>1 {{print}}' \
        >> {output.nextclade_subtypes}
        tsv-select -H -f "seqName,clade" {input.nextclade_denv2} \
        | awk 'NR>1 {{print}}' \
        >> {output.nextclade_subtypes}
        tsv-select -H -f "seqName,clade" {input.nextclade_denv3} \
        | awk 'NR>1 {{print}}' \
        >> {output.nextclade_subtypes}
        tsv-select -H -f "seqName,clade" {input.nextclade_denv4} \
        | awk 'NR>1 {{print}}' \
        >> {output.nextclade_subtypes}
        """

rule append_nextclade_columns:
    """
    Append the nextclade results to the metadata

    Joins the concatenated Nextclade subtype table onto the full metadata
    using the configured ID field as the key, adding the configured Nextclade
    field as a new column. `--write-all` keeps metadata records that have no
    match in the Nextclade table and fills their appended field with `?`.
    The `?` is quoted so the shell passes it literally instead of treating it
    as a single-character filename glob.
    """
    input:
        metadata="data/metadata_all.tsv",
        nextclade_subtypes="results/nextclade_subtypes.tsv",
    output:
        metadata_all="results/metadata_all.tsv",
    params:
        # Join key and the name of the column to append.
        id_field=config["transform"]["id_field"],
        nextclade_field=config["nextclade"]["nextclade_field"],
    shell:
        """
        tsv-join -H \
            --filter-file {input.nextclade_subtypes} \
            --key-fields {params.id_field} \
            --append-fields {params.nextclade_field} \
            --write-all '?' \
            {input.metadata} \
        > {output.metadata_all}
        """

tsv-filter -H --str-eq ncbi_serotype:denv1 {output.metadata_all} > {output.metadata_denv1}
tsv-filter -H --str-eq ncbi_serotype:denv2 {output.metadata_all} > {output.metadata_denv2}
tsv-filter -H --str-eq ncbi_serotype:denv3 {output.metadata_all} > {output.metadata_denv3}
tsv-filter -H --str-eq ncbi_serotype:denv4 {output.metadata_all} > {output.metadata_denv4}
rule split_metadata_by_serotype:
    """
    Split the metadata by serotype

    Produces one metadata TSV per serotype (denv1-denv4) by keeping the rows
    of the combined metadata whose `ncbi_serotype` column equals that
    serotype. Rows with any other `ncbi_serotype` value are written to none
    of the per-serotype files.
    """
    input:
        metadata="results/metadata_all.tsv",
    output:
        metadata_denv1="results/metadata_denv1.tsv",
        metadata_denv2="results/metadata_denv2.tsv",
        metadata_denv3="results/metadata_denv3.tsv",
        metadata_denv4="results/metadata_denv4.tsv",
    # One exact-match filter per serotype; each command reads the same input.
    shell:
        """
        tsv-filter -H --str-eq ncbi_serotype:denv1 {input.metadata} > {output.metadata_denv1}
        tsv-filter -H --str-eq ncbi_serotype:denv2 {input.metadata} > {output.metadata_denv2}
        tsv-filter -H --str-eq ncbi_serotype:denv3 {input.metadata} > {output.metadata_denv3}
        tsv-filter -H --str-eq ncbi_serotype:denv4 {input.metadata} > {output.metadata_denv4}
        """

0 comments on commit 11a5cbc

Please sign in to comment.