Skip to content

Commit

Permalink
[build] Update zika build to pull new ingest data
Browse files Browse the repository at this point in the history
Since Ingest pushes data to a different endpoint, update zika build to pull
from the new endpoint. Other modifications from the original described below:

Since strains (or isolates) may be re-sequenced resulting in duplicate strain
names breaking Nextstrain builds, other pathogen repos (e.g. Monkeypox) had
switched to indexing records by GenBank ID instead (`rule wrangle_metadata`)
and swapping in the final strain name at the end (`rule final_strain_name`)
and Zika was updated accordingly.

Other changes include updating the list of dropped strain names to GenBank IDs
(where the original strain names were moved to comments) and updating the example
sequence fasta headers. In addition, added the strain name as a Tip Label
dropdown item as discussed in:

  #25 (comment)
  • Loading branch information
j23414 committed May 4, 2023
1 parent 31f044f commit 9f4f694
Show file tree
Hide file tree
Showing 6 changed files with 211 additions and 128 deletions.
61 changes: 50 additions & 11 deletions Snakefile
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
if not config:
configfile: "config/config_zika.yaml"

rule all:
input:
auspice_json = "auspice/zika.json",
Expand All @@ -17,10 +20,10 @@ rule download:
message: "Downloading sequences and metadata from data.nextstrain.org"
output:
sequences = "data/sequences.fasta.zst",
metadata = "data/metadata.tsv.zst"
metadata = "data/metadata.tsv.zst",
params:
sequences_url = "https://data.nextstrain.org/files/zika/sequences.fasta.zst",
metadata_url = "https://data.nextstrain.org/files/zika/metadata.tsv.zst"
sequences_url = "https://data.nextstrain.org/files/workflows/zika/sequences_all.fasta.zst",
metadata_url = "https://data.nextstrain.org/files/workflows/zika/metadata_all.tsv.zst"
shell:
"""
curl -fsSL --compressed {params.sequences_url:q} --output {output.sequences}
Expand All @@ -30,8 +33,8 @@ rule download:
rule decompress:
message: "Decompressing sequences and metadata"
input:
sequences = "data/sequences.fasta.zst",
metadata = "data/metadata.tsv.zst"
sequences = "data/sequences_all.fasta.zst",
metadata = "data/metadata_all.tsv.zst"
output:
sequences = "data/sequences.fasta",
metadata = "data/metadata.tsv"
Expand All @@ -41,6 +44,20 @@ rule decompress:
zstd -d -c {input.metadata} > {output.metadata}
"""

# Re-key the decompressed metadata so that downstream rules index records by
# the configured ID column rather than by the original strain name.
# The existing `strain` column is renamed to `strain_original`, then a new
# `strain` column is created from the column named by `strain_id_field`.
rule wrangle_metadata:
input:
metadata="data/metadata.tsv"
output:
metadata="results/wrangled_metadata.tsv",
params:
# Column used to build the new `strain` column; read from the workflow
# config key `strain_id_field`, falling back to "strain" when unset.
# NOTE(review): with the fallback "strain", the column has already been
# renamed to `strain_original` by the first csvtk call — confirm that the
# fallback is ever intended to be used.
strain_id=lambda w: config.get("strain_id_field", "strain"),
shell:
"""
csvtk -t rename -f strain -n strain_original {input.metadata} \
| csvtk -t mutate -f {params.strain_id} -n strain > {output.metadata}
"""


rule filter:
message:
"""
Expand All @@ -51,8 +68,8 @@ rule filter:
- minimum genome length of {params.min_length} (50% of Zika virus genome)
"""
input:
sequences = rules.decompress.output.sequences,
metadata = rules.decompress.output.metadata,
sequences = "data/sequences.fasta",
metadata = rules.wrangle_metadata.output.metadata,
exclude = files.dropped_strains
output:
sequences = "results/filtered.fasta"
Expand Down Expand Up @@ -120,7 +137,7 @@ rule refine:
input:
tree = rules.tree.output.tree,
alignment = rules.align.output,
metadata = rules.decompress.output.metadata
metadata = rules.wrangle_metadata.output.metadata
output:
tree = "results/tree.nwk",
node_data = "results/branch_lengths.json"
Expand Down Expand Up @@ -186,7 +203,7 @@ rule traits:
"""
input:
tree = rules.refine.output.tree,
metadata = rules.decompress.output.metadata
metadata = rules.wrangle_metadata.output.metadata
output:
node_data = "results/traits.json",
params:
Expand All @@ -207,7 +224,7 @@ rule export:
message: "Exporting data files for for auspice"
input:
tree = rules.refine.output.tree,
metadata = rules.decompress.output.metadata,
metadata = rules.wrangle_metadata.output.metadata,
branch_lengths = rules.refine.output.node_data,
traits = rules.traits.output.node_data,
nt_muts = rules.ancestral.output.node_data,
Expand All @@ -216,7 +233,8 @@ rule export:
auspice_config = files.auspice_config,
description = files.description
output:
auspice_json = rules.all.input.auspice_json
auspice_json = "results/raw_zika.json",
root_sequence = "results/raw_zika_root-sequence.json",
shell:
"""
augur export v2 \
Expand All @@ -230,6 +248,27 @@ rule export:
--output {output.auspice_json}
"""

# Produce the final Auspice outputs: run bin/set_final_strain_name.py on the
# JSON written by `rule export` to swap node names for the configured display
# field, and copy the root-sequence JSON alongside it unchanged.
rule final_strain_name:
input:
auspice_json=rules.export.output.auspice_json,
metadata=rules.wrangle_metadata.output.metadata,
root_sequence=rules.export.output.root_sequence,
output:
# The main output is the same path that `rule all` requests.
auspice_json=rules.all.input.auspice_json,
root_sequence="auspice/zika_root-sequence.json",
params:
# Metadata column whose values become the displayed node names; read from
# the workflow config key `display_strain_field`, default "strain".
display_strain_field=lambda w: config.get("display_strain_field", "strain"),
shell:
"""
python3 bin/set_final_strain_name.py \
--metadata {input.metadata} \
--input-auspice-json {input.auspice_json} \
--display-strain-name {params.display_strain_field} \
--output {output.auspice_json}
cp {input.root_sequence} {output.root_sequence}
"""

rule clean:
message: "Removing directories: {params}"
params:
Expand Down
36 changes: 36 additions & 0 deletions bin/set_final_strain_name.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import pandas as pd
import json, argparse

def replace_name_recursive(node, lookup):
    """Rename every node of an Auspice tree in place using *lookup*.

    Args:
        node: Root node of the (sub)tree: a dict with a "name" key and an
            optional "children" list of nodes of the same shape.
        lookup: Mapping from current node name to replacement name. Nodes
            whose name is not a key of *lookup* are left untouched.

    Returns:
        None. The tree is modified in place.

    Raises:
        KeyError: If a visited node has no "name" key.
    """
    # Walk the tree with an explicit stack instead of Python recursion:
    # ladder-like phylogenies can be deeper than the interpreter's recursion
    # limit, which would abort the rename with a RecursionError.
    pending = [node]
    while pending:
        current = pending.pop()
        name = current["name"]
        if name in lookup:
            current["name"] = lookup[name]
        if "children" in current:
            pending.extend(current["children"])

def build_name_lookup(metadata, display_field):
    """Map each record's strain id to the name that should be displayed.

    Args:
        metadata: DataFrame with a "strain" column (the id used as node name
            in the Auspice JSON) and a *display_field* column.
        display_field: Name of the column holding the display name.

    Returns:
        dict mapping strain id -> display name. When the display value is
        missing, the strain id itself is used so that every node keeps a
        distinct name (rather than all such nodes sharing the column name).
        If a strain id occurs more than once, the last row wins.
    """
    return {
        strain_id: strain_id if pd.isna(display_name) else display_name
        for strain_id, display_name in zip(metadata["strain"], metadata[display_field])
    }


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Swaps out the strain names in the Auspice JSON with the final strain name",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument('--input-auspice-json', type=str, required=True, help="input auspice_json")
    parser.add_argument('--metadata', type=str, required=True, help="input data")
    parser.add_argument('--display-strain-name', type=str, required=True, help="field to use as strain name in auspice")
    parser.add_argument('--output', type=str, metavar="JSON", required=True, help="output Auspice JSON")
    args = parser.parse_args()

    # Read every column as text: node names in the Auspice JSON are strings,
    # so ids that merely look numeric must not be parsed into numbers or they
    # would never match a node name.
    metadata = pd.read_csv(args.metadata, sep='\t', dtype=str)
    name_lookup = build_name_lookup(metadata, args.display_strain_name)

    with open(args.input_auspice_json, 'r', encoding='utf-8') as fh:
        data = json.load(fh)

    # Rename the tree nodes in place, then write the whole dataset back out.
    replace_name_recursive(data['tree'], name_lookup)

    with open(args.output, 'w', encoding='utf-8') as fh:
        json.dump(data, fh)
5 changes: 5 additions & 0 deletions config/auspice_config.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,11 @@
],
"build_url": "https://github.com/nextstrain/zika",
"colorings": [
{
"key": "strain_original",
"title": "strain name",
"type": "categorical"
},
{
"key": "gt",
"title": "genotype",
Expand Down
2 changes: 2 additions & 0 deletions config/config_zika.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Metadata column that `rule wrangle_metadata` uses to build the `strain`
# column that records are indexed by during the build.
strain_id_field: "accession"
# Metadata column that `rule final_strain_name` passes to
# bin/set_final_strain_name.py as the name to display in the final Auspice JSON.
display_strain_field: "strain_original"
167 changes: 84 additions & 83 deletions config/dropped_strains.txt
Original file line number Diff line number Diff line change
@@ -1,86 +1,87 @@
PF13/251013_18 # reference included in config/zika_reference.gb
AFMC_U # too basal
AFMC_S # too basal
Boracay/16423 # too basal
JMB_185 # too basal
PHL/2012/CPC_0740 # too basal
MG827392
KX369547 # PF13/251013_18 # reference included in config/zika_reference.gb
KY553111 # AFMC_U # too basal
KY962729 # AFMC_S # too basal
KY120353 # Boracay/16423 # too basal
KU179098 # JMB_185 # too basal
KU681082 # PHL/2012/CPC_0740 # too basal
VIE/Bra/2016 # too basal
Dominican_Republic/2016/PD2 # duplicate of other strain in dataset
GD01 # duplicate of other strain in dataset
GDZ16001 # duplicate of other strain in dataset
VEN/UF_2/2016 # duplicate of other strain in dataset
ZZ_1 # duplicate of other strain in dataset
VR10599/Pavia/2016 # export with unknown origin
34997/Pavia/2016 # export with unknown origin
COL/FLR_00001/2015 # duplicate of COL/FLR/2015
COL/FLR_00002/2015 # duplicate of COL/FLR/2015
COL/FLR_00003/2015 # duplicate of COL/FLR/2015
COL/FLR_00004/2015 # duplicate of COL/FLR/2015
COL/FLR_00005/2015 # duplicate of COL/FLR/2015
COL/FLR_00006/2015 # duplicate of COL/FLR/2015
COL/FLR_00007/2015 # duplicate of COL/FLR/2015
COL/FLR_00008/2015 # duplicate of COL/FLR/2015
COL/FLR_00009/2015 # duplicate of COL/FLR/2015
COL/FLR_00010/2015 # duplicate of COL/FLR/2015
COL/FLR_00011/2015 # duplicate of COL/FLR/2015
COL/FLR_00012/2015 # duplicate of COL/FLR/2015
COL/FLR_00013/2015 # duplicate of COL/FLR/2015
COL/FLR_00014/2015 # duplicate of COL/FLR/2015
COL/FLR_00015/2015 # duplicate of COL/FLR/2015
COL/FLR_00016/2015 # duplicate of COL/FLR/2015
COL/FLR_00017/2015 # duplicate of COL/FLR/2015
COL/FLR_00018/2015 # duplicate of COL/FLR/2015
COL/FLR_00019/2015 # duplicate of COL/FLR/2015
COL/FLR_00020/2015 # duplicate of COL/FLR/2015
COL/FLR_00021/2015 # duplicate of COL/FLR/2015
COL/FLR_00022/2015 # duplicate of COL/FLR/2015
COL/FLR_00023/2015 # duplicate of COL/FLR/2015
COL/FLR_00024/2015 # duplicate of COL/FLR/2015
COL/FLR_00025/2015 # duplicate of COL/FLR/2015
COL/FLR_00026/2015 # duplicate of COL/FLR/2015
COL/FLR_00034/2015 # duplicate of COL/FLR/2015
COL/FLR_00035/2015 # duplicate of COL/FLR/2015
COL/FLR_00036/2015 # duplicate of COL/FLR/2015
COL/FLR_00038/2015 # duplicate of COL/FLR/2015
COL/FLR_00040/2015 # duplicate of COL/FLR/2015
COL/FLR_00041/2015 # duplicate of COL/FLR/2015
COL/FLR_00042/2015 # duplicate of COL/FLR/2015
COL/PRV_00027/2015 # misdated
COL/PRV_00028/2015 # misdated
COL/PAN_00029/2015 # misdated
COL/PAN_00030/2015 # misdated
BRA/2016/FC_DQ12D1 # large indel
Brazil/2016/ZBRX8 # large indel
Brazil/2016/ZBRX11 # large indel
CX17 # large indel
MEX/2016/mex27 # large indel
MEX/2016/mex50 # large indel
SLV/2016/ElSalvador_1055 # large indel
USVI/20/2016 # large indel
KU853013 # Dominican_Republic/2016/PD2 # duplicate of other strain in dataset
KU740184 # GD01 # duplicate of other strain in dataset
KU761564 # GDZ16001 # duplicate of other strain in dataset
KX893855 # VEN/UF_2/2016 # duplicate of other strain in dataset
KY927808 # ZZ_1 # duplicate of other strain in dataset
KY003154 # VR10599/Pavia/2016 # export with unknown origin
KY003153 # 34997/Pavia/2016 # export with unknown origin
MF574552 # COL/FLR_00001/2015 # duplicate of COL/FLR/2015
MF574559 # COL/FLR_00002/2015 # duplicate of COL/FLR/2015
MF574560 # COL/FLR_00003/2015 # duplicate of COL/FLR/2015
MF574561 # COL/FLR_00004/2015 # duplicate of COL/FLR/2015
MF574571 # COL/FLR_00005/2015 # duplicate of COL/FLR/2015
MF574555 # COL/FLR_00006/2015 # duplicate of COL/FLR/2015
MF574557 # COL/FLR_00007/2015 # duplicate of COL/FLR/2015
MF574562 # COL/FLR_00008/2015 # duplicate of COL/FLR/2015
MF574572 # COL/FLR_00009/2015 # duplicate of COL/FLR/2015
MF574570 # COL/FLR_00010/2015 # duplicate of COL/FLR/2015
MF574565 # COL/FLR_00011/2015 # duplicate of COL/FLR/2015
MF574568 # COL/FLR_00012/2015 # duplicate of COL/FLR/2015
MF574558 # COL/FLR_00013/2015 # duplicate of COL/FLR/2015
MF574576 # COL/FLR_00014/2015 # duplicate of COL/FLR/2015
MF574567 # COL/FLR_00015/2015 # duplicate of COL/FLR/2015
MF574575 # COL/FLR_00016/2015 # duplicate of COL/FLR/2015
MF574553 # COL/FLR_00017/2015 # duplicate of COL/FLR/2015
MF574573 # COL/FLR_00018/2015 # duplicate of COL/FLR/2015
MF574574 # COL/FLR_00019/2015 # duplicate of COL/FLR/2015
MF574577 # COL/FLR_00020/2015 # duplicate of COL/FLR/2015
MF574556 # COL/FLR_00021/2015 # duplicate of COL/FLR/2015
MF574554 # COL/FLR_00022/2015 # duplicate of COL/FLR/2015
MF574566 # COL/FLR_00023/2015 # duplicate of COL/FLR/2015
MF574569 # COL/FLR_00024/2015 # duplicate of COL/FLR/2015
MF574563 # COL/FLR_00025/2015 # duplicate of COL/FLR/2015
MF574564 # COL/FLR_00026/2015 # duplicate of COL/FLR/2015
MF574581 # COL/FLR_00034/2015 # duplicate of COL/FLR/2015
MF574588 # COL/FLR_00035/2015 # duplicate of COL/FLR/2015
MF574582 # COL/FLR_00036/2015 # duplicate of COL/FLR/2015
MF574586 # COL/FLR_00038/2015 # duplicate of COL/FLR/2015
MF574584 # COL/FLR_00040/2015 # duplicate of COL/FLR/2015
MF574583 # COL/FLR_00041/2015 # duplicate of COL/FLR/2015
MF574580 # COL/FLR_00042/2015 # duplicate of COL/FLR/2015
MF574579 # COL/PRV_00027/2015 # misdated
MF574578 # COL/PRV_00028/2015 # misdated
MF574585 # COL/PAN_00029/2015 # misdated
MF574587 # COL/PAN_00030/2015 # misdated
KY785436 # BRA/2016/FC_DQ12D1 # large indel
KY559010 # Brazil/2016/ZBRX8 # large indel
KY559011 # Brazil/2016/ZBRX11 # large indel
KX986761 # CX17 # large indel
MF801405 # MEX/2016/mex27 # large indel
MF801424 # MEX/2016/mex50 # large indel
MF801377 # SLV/2016/ElSalvador_1055 # large indel
VI20_12plex # USVI/20/2016 # large indel
USVI/21/2016 # large indel
USVI/23/2016 # large indel
USVI/27/2016 # large indel
USVI/30/2016 # large indel
USVI/32/2016 # large indel
Thailand/1605aTw # excess divergence
VE_Ganxian # excess divergence
ZK_YN001 # excess divergence
Haiti/0029/2014 # contamination present
Haiti/0033/2014 # contamination present
Haiti/0036/2014 # contamination present
Haiti/0054/2014 # contamination present
Haiti/0074/2014 # contamination present
Haiti/0097/2014 # contamination present
mosquito/Haiti/1682/2016 # contamination present
VI23_12plex # USVI/23/2016 # large indel
VI27_1d # USVI/27/2016 # large indel
VI30_1d # USVI/30/2016 # large indel
VI32_12plex # USVI/32/2016 # large indel
KY126351 # Thailand/1605aTw # excess divergence
KU744693 # VE_Ganxian # excess divergence
KY328290 # ZK_YN001 # excess divergence
KY415986 # Haiti/0029/2014 # contamination present
KY415987 # Haiti/0033/2014 # contamination present
KY415990 # Haiti/0036/2014 # contamination present
KY415988 # Haiti/0054/2014 # contamination present
KY415989 # Haiti/0074/2014 # contamination present
KY415991 # Haiti/0097/2014 # contamination present
MF384325 # mosquito/Haiti/1682/2016 # contamination present
ZF36_36S # contamination present
MR766 # lab strain
Aedes_sp/MEX_I_44/2016 # duplicate of Aedes_aegypti/MEX/MEX_I_44/2016
Puerto_Rico/2015/PRVABC59 # duplicate of PRVABC59
V15555 # highly diverged
DK # lab strain
DK23 # lab strain
rGZ02a/2018 # highly diverged
rGZ02p/2018 # highly diverged
V211784 # highly diverged
LMM/AG5643
Faranah/18
MK105975 # MR766 # lab strain
KX856011 # Aedes_sp/MEX_I_44/2016 # duplicate of Aedes_aegypti/MEX/MEX_I_44/2016
MK028857 # Puerto_Rico/2015/PRVABC59 # duplicate of PRVABC59
MN025403 # V15555 # highly diverged
MT505349 # DK # lab strain
MT505350 # DK23 # lab strain
MW680969 # rGZ02a/2018 # highly diverged
MW680970 # rGZ02p/2018 # highly diverged
OK054351 # V211784 # highly diverged
MT478034 # LMM/AG5643
OL414716 # Faranah/18
Loading

0 comments on commit 9f4f694

Please sign in to comment.