diff --git a/CHANGELOG.md b/CHANGELOG.md index 0b55d04..ab3f754 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,5 @@ # CHANGELOG +* 20 August 2024: Assign host taxa to taxonomic groupings that are relevant to rabies for coloring in auspice * 12 August 2024: Create a full genome phylogeny for rabies [PR#3](https://github.com/nextstrain/rabies/pull/3) * 25 July 2024: Add CI GH Action workflow to test the ingest workflow [PR#6](https://github.com/nextstrain/rabies/pull/6) * 15 July 2024: Make rabies-specific modifications to the ingest directory (which originated from the pathogen-repo-guide) [PR#2](https://github.com/nextstrain/rabies/pull/2) diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml index 6a48c07..7051f81 100644 --- a/ingest/defaults/config.yaml +++ b/ingest/defaults/config.yaml @@ -23,6 +23,7 @@ ncbi_datasets_fields: - update-date - length - host-name + - host-tax-id - isolate-lineage-source - biosample-acc - submitter-names @@ -53,12 +54,18 @@ curate: release-date: date_released update-date: date_updated length: length - host-name: host + host-name: host_latin_name + host-tax-id: host_tax_id isolate-lineage-source: sample_type biosample-acc: biosample_accessions submitter-names: authors submitter-affiliation: institution submitter-country: submitter_country + Group name: host_group + Curator common name: host_common_name + Family name: host_family + Genus name: host_genus + # Standardized strain name regex # Currently accepts any characters because we do not have a clear standard for strain names across pathogens strain_regex: "^.+$" @@ -109,6 +116,11 @@ curate: "location", "length", "host", + "host_latin_name", + "host_family", + "host_genus", + "host_group", + "host_common_name", "date_released", "date_updated", "sra_accessions", diff --git a/ingest/rules/curate.smk b/ingest/rules/curate.smk index bdf3211..b02c27e 100644 --- a/ingest/rules/curate.smk +++ b/ingest/rules/curate.smk @@ -107,6 +107,9 @@ rule curate: 
--abbr-authors-field {params.abbr_authors_field} \ | augur curate apply-geolocation-rules \ --geolocation-rules {input.all_geolocation_rules} \ + | scripts/add-host-categories.py \ + --latin-field host_latin_name --family-field host_family \ + --genus-field host_genus --group-field host_group \ | augur curate apply-record-annotations \ --annotations {input.annotations} \ --id-field {params.annotations_id} \ diff --git a/ingest/rules/fetch_from_ncbi.smk b/ingest/rules/fetch_from_ncbi.smk index c019ca0..eec5214 100644 --- a/ingest/rules/fetch_from_ncbi.smk +++ b/ingest/rules/fetch_from_ncbi.smk @@ -95,6 +95,67 @@ rule format_ncbi_dataset_report: > {output.ncbi_dataset_tsv} """ +rule extract_ncbi_dataset_hosttaxid: + input: + ncbi_dataset_tsv="data/ncbi_dataset_report.tsv", + output: + ncbi_dataset_hosttaxid="data/ncbi_dataset_hosttaxid.tsv", + log: + "logs/extract_ncbi_dataset_hosttaxid.txt", + benchmark: + "benchmarks/extract_ncbi_dataset_hosttaxid.txt" + shell: + """ + tsv-select {input.ncbi_dataset_tsv} -H -f 'host\-tax\-id' | \ + tsv-filter --is-numeric 1 | \ + tsv-uniq \ + 2> {log} > {output.ncbi_dataset_hosttaxid} + """ + +rule get_ncbi_hosttax_info: + input: + ncbi_dataset_hosttaxid="data/ncbi_dataset_hosttaxid.tsv", + output: + ncbi_hosttax_info="data/hosttax_info.zip", + # Allow retries in case of network errors + retries: 5 + log: + "logs/get_ncbi_hosttax_info.txt", + benchmark: + "benchmarks/get_ncbi_hosttax_info.txt" + shell: + """ + datasets download taxonomy taxon \ + --inputfile {input.ncbi_dataset_hosttaxid} \ + --filename {output.ncbi_hosttax_info} \ + 2>&1 | tee {log} + """ + +rule join_metadata_and_hostinfo: + input: + ncbi_hosttax_info="data/hosttax_info.zip", + ncbi_dataset_tsv="data/ncbi_dataset_report.tsv", + output: + metadata = "data/metadata_with_taxinfo.tsv", + log: + "logs/join_metadata_and_hostinfo.txt", + benchmark: + "benchmarks/join_metadata_and_hostinfo.txt" + params: + ncbi_hosttax_columns = "Query,'Group\ name','Curator\ common\ 
name','Family\ name','Genus\ name'" + shell: + """ + unzip -p {input.ncbi_hosttax_info} ncbi_dataset/data/taxonomy_summary.tsv \ + | tsv-select -H -f {params.ncbi_hosttax_columns} \ + | tsv-join -H \ + --filter-file - \ + --key-fields Query \ + --data-fields 'host\-tax\-id' \ + --append-fields '*' \ + --write-all ? \ + {input.ncbi_dataset_tsv} \ + 2> {log} > {output.metadata} + """ # Technically you can bypass this step and directly provide FASTA and TSV files # as input files for the curate pipeline. @@ -103,7 +164,7 @@ rule format_ncbi_dataset_report: rule format_ncbi_datasets_ndjson: input: ncbi_dataset_sequences="data/ncbi_dataset_sequences.fasta", - ncbi_dataset_tsv="data/ncbi_dataset_report.tsv", + ncbi_dataset_tsv="data/metadata_with_taxinfo.tsv", output: ndjson="data/ncbi.ndjson", log: diff --git a/ingest/scripts/add-host-categories.py b/ingest/scripts/add-host-categories.py new file mode 100755 index 0000000..2041cee --- /dev/null +++ b/ingest/scripts/add-host-categories.py @@ -0,0 +1,70 @@ +#! /usr/bin/env python3 +""" +From stdin, generates host names using info from the NCBI taxonomy output of the NDJSON record, with output to 'host' + +Outputs the modified record to stdout. 
import argparse
import json
from sys import stdin, stdout

# The lookup tables below are module-level constants so they are built once
# per run instead of once per record.

# Exact host latin names that map directly to a display category.
LATIN_REPLACEMENTS = {
    "Canis lupus familiaris": "Domestic Dog",
    "Homo sapiens": "Human",
    "Bos taurus": "Cattle",
    "Didelphis albiventris": "Other Mammal",
    "Elephas maximus": "Other Mammal",
    "Dasypus novemcinctus": "Other Mammal",
}

# Host families that map to a display category regardless of genus.
FAMILY_REPLACEMENTS = {"Mephitidae": "Skunk"}

# Fallback categories keyed on the host taxonomic group name.
GROUP_REPLACEMENTS = {
    "odd-toed ungulates": "Other Ungulate",
    "even-toed ungulates & whales": "Other Ungulate",
    "carnivores": "Other Carnivore",
    "bats": "Bat",
    "birds": "Bird",
    "primates": "Other Mammal",
    "rodents": "Other Mammal",
    "mammals": "Other Mammal",
}


def parse_args():
    """Parse the command-line options naming the host fields of each record.

    Returns:
        argparse.Namespace with ``latin_field``, ``family_field``,
        ``genus_field`` and ``group_field`` attributes. Each one holds the
        record key to read and defaults to ``host_latin_name``,
        ``host_family``, ``host_genus`` and ``host_group`` respectively.
    """
    parser = argparse.ArgumentParser(
        description="Generate host names and output to 'host'.")
    parser.add_argument("--latin-field", default='host_latin_name',
        help="Field from the records to use as the host latin name.")
    parser.add_argument("--family-field", default='host_family',
        help="Field from the records to use as the host Family name.")
    parser.add_argument("--genus-field", default='host_genus',
        help="Field from the records to use as the host genus name.")
    parser.add_argument("--group-field", default='host_group',
        help="Field from the records to use as the host group.")
    return parser.parse_args()


def _set_host_name_transformed(record, args):
    """Return the host category for one record.

    Rules are applied in order and the first match wins:

    1. family ``Canidae`` with genus ``Vulpes`` -> ``"Fox (Vulpes sp.)"``
    2. family ``Procyonidae`` with genus ``Procyon`` -> ``"Raccoon"``
    3. latin name found in ``LATIN_REPLACEMENTS``
    4. family found in ``FAMILY_REPLACEMENTS``
    5. group found in ``GROUP_REPLACEMENTS``
    6. otherwise the group value is returned unchanged

    Args:
        record: Mapping holding the host fields of one record.
        args: Object whose ``latin_field``, ``family_field``, ``genus_field``
            and ``group_field`` attributes name the keys to read.

    Returns:
        The category from the first matching rule, or the record's own group
        value when no rule matches.

    Raises:
        KeyError: If the latin, family or group key is missing. The genus key
            is only read when the family is ``Canidae`` or ``Procyonidae``.
    """
    latin = record[args.latin_field]
    family = record[args.family_field]
    group = record[args.group_field]

    # Genus is looked up lazily (short-circuit `and`), so records of any other
    # family do not need to carry a genus field.
    if family == "Canidae" and record[args.genus_field] == "Vulpes":
        return "Fox (Vulpes sp.)"
    if family == "Procyonidae" and record[args.genus_field] == "Procyon":
        return "Raccoon"
    if latin in LATIN_REPLACEMENTS:
        return LATIN_REPLACEMENTS[latin]
    if family in FAMILY_REPLACEMENTS:
        return FAMILY_REPLACEMENTS[family]
    # No specific rule matched: map the group, or pass it through untouched.
    return GROUP_REPLACEMENTS.get(group, group)


def main():
    """Read NDJSON records from stdin, set 'host' on each, write to stdout."""
    args = parse_args()

    for line in stdin:
        # A blank or trailing empty line is not a record; json.loads would
        # raise on it, so skip it.
        if not line.strip():
            continue
        record = json.loads(line)
        record['host'] = _set_host_name_transformed(record, args)
        stdout.write(json.dumps(record) + "\n")
parse_args() + + for index, record in enumerate(stdin): + record = json.loads(record) + record['host'] = _set_host_name_transformed(record, args) + stdout.write(json.dumps(record) + "\n") + +if __name__ == "__main__": + main() diff --git a/phylogenetic/defaults/auspice_config.json b/phylogenetic/defaults/auspice_config.json index 208858f..ea4eea1 100644 --- a/phylogenetic/defaults/auspice_config.json +++ b/phylogenetic/defaults/auspice_config.json @@ -31,6 +31,16 @@ "key": "host", "title": "Host", "type": "categorical" + }, + { + "key": "host_latin_name", + "title": "Host latin name", + "type": "categorical" + }, + { + "key": "host_common_name", + "title": "Host common name", + "type": "categorical" } ], "geo_resolutions": [ @@ -39,7 +49,7 @@ ], "display_defaults": { "map_triplicate": true, - "color_by": "region" + "color_by": "host" }, "filters": [ "region", diff --git a/phylogenetic/defaults/colors.tsv b/phylogenetic/defaults/colors.tsv index db1f906..c8bf721 100644 --- a/phylogenetic/defaults/colors.tsv +++ b/phylogenetic/defaults/colors.tsv @@ -5,3 +5,16 @@ region Africa #8ABB6A region Europe #BEBB48 region South America #E29E39 region North America #E2562B +# +# Host taxa +host Bat #3F47C9 +host Domestic Dog #4274CE +host Fox (Vulpes sp.) #4F97BB +host Raccoon #64AC99 +host Skunk #7EB976 +host Other Carnivore #9EBE5A +host Cattle #BEBB48 +host Other Ungulate #D9AE3E +host Human #E69036 +host Other Mammal #E35F2D +host Bird #DB2823