From f139941f50a6f0bf086acd4c344457b26054702e Mon Sep 17 00:00:00 2001 From: John SJ Anderson Date: Thu, 23 May 2024 10:13:18 -0700 Subject: [PATCH 1/7] Archive previous contents under _LEGACY [#2] Once the repo modernization is complete, these files will be deleted; for the moment this allows them to be easily accessed. --- Snakefile => _LEGACY/Snakefile | 0 {auspice => _LEGACY/auspice}/yellow-fever_meta.json | 0 {auspice => _LEGACY/auspice}/yellow-fever_tree.json | 0 {config => _LEGACY/config}/YFV112.gb | 0 {config => _LEGACY/config}/auspice_config.json | 0 {config => _LEGACY/config}/genome_annotation_file.json | 0 {scripts => _LEGACY/scripts}/parseMetadata.py | 0 7 files changed, 0 insertions(+), 0 deletions(-) rename Snakefile => _LEGACY/Snakefile (100%) rename {auspice => _LEGACY/auspice}/yellow-fever_meta.json (100%) rename {auspice => _LEGACY/auspice}/yellow-fever_tree.json (100%) rename {config => _LEGACY/config}/YFV112.gb (100%) rename {config => _LEGACY/config}/auspice_config.json (100%) rename {config => _LEGACY/config}/genome_annotation_file.json (100%) rename {scripts => _LEGACY/scripts}/parseMetadata.py (100%) diff --git a/Snakefile b/_LEGACY/Snakefile similarity index 100% rename from Snakefile rename to _LEGACY/Snakefile diff --git a/auspice/yellow-fever_meta.json b/_LEGACY/auspice/yellow-fever_meta.json similarity index 100% rename from auspice/yellow-fever_meta.json rename to _LEGACY/auspice/yellow-fever_meta.json diff --git a/auspice/yellow-fever_tree.json b/_LEGACY/auspice/yellow-fever_tree.json similarity index 100% rename from auspice/yellow-fever_tree.json rename to _LEGACY/auspice/yellow-fever_tree.json diff --git a/config/YFV112.gb b/_LEGACY/config/YFV112.gb similarity index 100% rename from config/YFV112.gb rename to _LEGACY/config/YFV112.gb diff --git a/config/auspice_config.json b/_LEGACY/config/auspice_config.json similarity index 100% rename from config/auspice_config.json rename to _LEGACY/config/auspice_config.json diff --git a/config/genome_annotation_file.json b/_LEGACY/config/genome_annotation_file.json similarity index 100% rename from config/genome_annotation_file.json rename to _LEGACY/config/genome_annotation_file.json diff --git a/scripts/parseMetadata.py b/_LEGACY/scripts/parseMetadata.py similarity index 100% rename from scripts/parseMetadata.py rename to _LEGACY/scripts/parseMetadata.py From 2a8574c591eaba2a0ba037f8520c2969dde36f6c Mon Sep 17 00:00:00 2001 From: John SJ Anderson Date: Thu, 23 May 2024 10:14:37 -0700 Subject: [PATCH 2/7] Strip out old README; scaffold new one [#2] --- README.md | 88 ++++++------------------------------------------------- 1 file changed, 9 insertions(+), 79 deletions(-) diff --git a/README.md b/README.md index a9e08a0..4546527 100644 --- a/README.md +++ b/README.md @@ -1,82 +1,12 @@ -# Nextstrain build for Yellow Fever - -This build is currently designed for in-the-field running and is not yet generalised for a stable, updated nextstrain.org page. - -## Install augur + auspice using conda - -``` -curl http://data.nextstrain.org/nextstrain.yml --compressed -o nextstrain.yml -conda env create -f nextstrain.yml -conda activate nextstrain -npm install --global auspice -``` - -When you're inside the "nextstrain" environment (via `conda activate nextstrain`) you should have both `augur` & `auspice` installed. -You can test this by running `augur --version` and `auspice --version`. -Currently, augur is around `v5.1.1` and auspice is around `v1.36.6`. 
- - -## Clone this repo - -``` -git clone https://github.com/nextstrain/yellow-fever.git -cd yellow-fever -mkdir data results auspice -``` - - -## Make input files available - -The bioinformatics "pipeline" for YFV starts with 4 main files as input: -* `./data/genbankReleased.fasta` -* `./data/genbankReleased.csv` -* `./data/newSequences.fasta` -* `./data/newSequences.csv` - -These are not committed to the github repo (they're "gitignored"), so you'll have to put them there. - -These files are specified in the first few lines of the Snakemake file, which contains all the commands necessary to run the pipeline. -It's possible to use >2 sets of input files, or different file names, but you'll have to add / change them in the snakemake file. - - -Additionally, there are 2 other input-like files, also defined in the snakemake file: -* `./config/auspice_config.json` which contains options -- such as what traits to display as the color-by's on the tree -- which are used to control how auspice will visualise the data. -* `./config/YFV112.gb` the YFV reference used here -- currently [YF112](https://www.ncbi.nlm.nih.gov/nuccore/1269012770). Please replace this if needed & update the snakemake file accordingly. - - -## Run the pipeline - -``` -snakemake --printshellcmds -``` - -This will run all the steps defined in the Snakefile 🎉 - -These steps are (roughly): -1. __parse__ convert the (potentially multiple) CSV + FASTA files into the correct format for augur (TSV + FASTA). Also performs some field manipulation, such as extracting "country" from the "Sequence_name", extracting collection year, storing which file a sequence came from etcetera. -2. __align__ Using mafft -3. __tree__ Using IQ-TREE (cahn change this to RAxML or FastTree if needed) -4. __refine__ Normally this is where we date the internal nodes, but I haven't enabled this here. It is needed however to label the internal nodes & reroot the tree (see below). -5. __ancestral__ Infer ancestral mutations on the tree. This step could easily be dropped if desired! -6. __traits__ Use DTA to infer some traits across the tree. Currently used for "country" only. You can easily add fields to the snakemake file which will perform this for additional traits. -7. __export__ Create the final JSON for auspice to visualise. - -Steps 1-6 produce output in `./results`, while step 7 (export) produces the JSONs in `./auspice`. Both of these directories are gitignored (as well as `./data`) so that files here won't be pushed up to GitHub. - -## Visualise the data - -``` -auspice view --datasetDir auspice -``` -Then open a browser at http://localhost:4000 - -Current color-bys include most of the metadata provided, as well as which file the samples came from, year of collection. -The GPS co-ordinates are per-strain, so that is the geographic-resolution available. We could also aggregate municipalities if desired (we'd need GPS coordinates for each one if so). - - -## To-Do -* The tree is rooted on the oldest available sequence ("JF912179", 1980), but there may be a better choice? This is defined in the Snakefile and is really easy to change. -* Reference sequence used may not be ideal. +# Nextstrain repository for yellow fever virus +##TODO## finish updating this +This repository is in the process of being upgraded to follow the +[pathogen repo +guide](https://github.com/nextstrain/pathogen-repo-guide/). 
+## Installation +Follow the [standard installation +instructions](https://docs.nextstrain.org/en/latest/install.html) for +Nextstrain's suite of software tools. From df1a6dd993cdd006a99330fcf7a138fc70643916 Mon Sep 17 00:00:00 2001 From: John SJ Anderson Date: Thu, 23 May 2024 10:14:56 -0700 Subject: [PATCH 3/7] Add pre-commit-config.yaml and supporting files [#2] Exclude the _LEGACY tree because there's no point in cleaning that stuff up; leave a comment so I remember to go back and remove it when `_LEGACY` goes away. --- .pre-commit-config.yaml | 55 +++++++++++++++++++++++++++++++++++++++++ .yamlfmt | 5 ++++ pyproject.toml | 8 ++++++ 3 files changed, 68 insertions(+) create mode 100644 .pre-commit-config.yaml create mode 100644 .yamlfmt create mode 100644 pyproject.toml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..110d5ae --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,55 @@ +default_language_version: + python: python3 +# TODO remove _LEGACY once it's gone +exclude: '\.(tsv|fasta|gb)$|^ingest/vendored/|^_LEGACY' +repos: + - repo: https://github.com/snakemake/snakefmt + rev: v0.10.1 + hooks: + - id: snakefmt + language_version: python3 + - repo: https://github.com/rhysd/actionlint + rev: v1.6.27 + hooks: + - id: actionlint + entry: env SHELLCHECK_OPTS='--exclude=SC2027' actionlint + - repo: https://github.com/codespell-project/codespell + rev: v2.2.6 + hooks: + - id: codespell + additional_dependencies: + - tomli + - repo: https://github.com/google/yamlfmt + rev: v0.12.1 + hooks: + - id: yamlfmt + - repo: https://github.com/pappasam/toml-sort + rev: v0.23.1 + hooks: + - id: toml-sort-fix + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.6.0 + hooks: + - id: trailing-whitespace + - id: check-ast + - id: check-case-conflict + - id: check-docstring-first + - id: check-json + - id: check-executables-have-shebangs + - id: check-merge-conflict + - id: check-shebang-scripts-are-executable + - id: check-symlinks + - id: check-toml + - id: check-yaml + - id: destroyed-symlinks + - id: detect-private-key + - id: end-of-file-fixer + - id: fix-byte-order-marker + - repo: https://github.com/pre-commit/sync-pre-commit-deps + rev: v0.0.1 + hooks: + - id: sync-pre-commit-deps + - repo: https://github.com/shellcheck-py/shellcheck-py + rev: v0.10.0.1 + hooks: + - id: shellcheck diff --git a/.yamlfmt b/.yamlfmt new file mode 100644 index 0000000..354963d --- /dev/null +++ b/.yamlfmt @@ -0,0 +1,5 @@ +formatter: + type: basic + line_ending: lf + retain_line_breaks: true + max_line_length: 120 diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..2c839bd --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,8 @@ +[tool.codespell] +write = '' +skip = '*.gff,*.gff3' + +[tool.snakefmt] +include = '\.smk$|^Snakefile' +exclude = '' +line_length = 120 From 3a2acd289683daf7450b6c9d9535df723438bc55 Mon Sep 17 00:00:00 2001 From: John SJ Anderson Date: Thu, 23 May 2024 10:18:03 -0700 Subject: [PATCH 4/7] Pull ingest and phylogenetic workflows from guide repo [#2] Also apply pre-commit cleanup --- ingest/README.md | 105 +++++++++++ ingest/Snakefile | 86 +++++++++ .../nextstrain-automation/README.md | 38 ++++ .../nextstrain-automation/config.yaml | 24 +++ .../nextstrain-automation/upload.smk | 43 +++++ ingest/defaults/annotations.tsv | 6 + ingest/defaults/config.yaml | 134 ++++++++++++++ ingest/defaults/geolocation_rules.tsv | 7 + ingest/defaults/nextclade_config.yaml | 12 ++ ingest/defaults/nextclade_field_map.tsv | 18 ++ 
ingest/rules/curate.smk | 131 +++++++++++++ ingest/rules/fetch_from_ncbi.smk | 173 ++++++++++++++++++ ingest/rules/nextclade.smk | 97 ++++++++++ nextstrain-pathogen.yaml | 5 + phylogenetic/README.md | 59 ++++++ phylogenetic/Snakefile | 55 ++++++ phylogenetic/build-configs/ci/config.yaml | 7 + .../build-configs/ci/copy_example_data.smk | 17 ++ phylogenetic/defaults/config.yaml | 5 + phylogenetic/example_data/metadata.tsv | 0 phylogenetic/example_data/sequences.fasta | 0 phylogenetic/rules/annotate_phylogeny.smk | 32 ++++ phylogenetic/rules/construct_phylogeny.smk | 20 ++ phylogenetic/rules/export.smk | 26 +++ phylogenetic/rules/prepare_sequences.smk | 22 +++ 25 files changed, 1122 insertions(+) create mode 100644 ingest/README.md create mode 100644 ingest/Snakefile create mode 100644 ingest/build-configs/nextstrain-automation/README.md create mode 100644 ingest/build-configs/nextstrain-automation/config.yaml create mode 100644 ingest/build-configs/nextstrain-automation/upload.smk create mode 100644 ingest/defaults/annotations.tsv create mode 100644 ingest/defaults/config.yaml create mode 100644 ingest/defaults/geolocation_rules.tsv create mode 100644 ingest/defaults/nextclade_config.yaml create mode 100644 ingest/defaults/nextclade_field_map.tsv create mode 100644 ingest/rules/curate.smk create mode 100644 ingest/rules/fetch_from_ncbi.smk create mode 100644 ingest/rules/nextclade.smk create mode 100644 nextstrain-pathogen.yaml create mode 100644 phylogenetic/README.md create mode 100644 phylogenetic/Snakefile create mode 100644 phylogenetic/build-configs/ci/config.yaml create mode 100644 phylogenetic/build-configs/ci/copy_example_data.smk create mode 100644 phylogenetic/defaults/config.yaml create mode 100644 phylogenetic/example_data/metadata.tsv create mode 100644 phylogenetic/example_data/sequences.fasta create mode 100644 phylogenetic/rules/annotate_phylogeny.smk create mode 100644 phylogenetic/rules/construct_phylogeny.smk create mode 100644 phylogenetic/rules/export.smk create mode 100644 phylogenetic/rules/prepare_sequences.smk diff --git a/ingest/README.md b/ingest/README.md new file mode 100644 index 0000000..c3b3a16 --- /dev/null +++ b/ingest/README.md @@ -0,0 +1,105 @@ +# Ingest + +This workflow ingests public data from NCBI and outputs curated +metadata and sequences that can be used as input for the phylogenetic +workflow. + +## Workflow Usage + +The workflow can be run from the top level pathogen repo directory: + +```bash +nextstrain build ingest +``` + +Alternatively, the workflow can also be run from within the ingest +directory: + +```bash +cd ingest +nextstrain build . +``` + +This produces the default outputs of the ingest workflow: + +- metadata = results/metadata.tsv +- sequences = results/sequences.fasta + +### Dumping the full raw metadata from NCBI Datasets + +The workflow has a target for dumping the full raw metadata from NCBI +Datasets. + +```bash +nextstrain build ingest dump_ncbi_dataset_report +``` + +This will produce the file `ingest/data/ncbi_dataset_report_raw.tsv`, +which you can inspect to determine what fields and data to use if you +want to configure the workflow for your pathogen. + +## Defaults + +The defaults directory contains all of the default configurations for +the ingest workflow. + +[defaults/config.yaml](defaults/config.yaml) contains all of the +default configuration parameters used for the ingest workflow. Use +Snakemake's `--configfile`/`--config` options to override these +default values. 
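+
+For example, a single parameter can be overridden on the command line
+(the taxon ID below is shown only for illustration):
+
+```bash
+nextstrain build ingest --config ncbi_taxon_id=11089
+```
+
+Larger sets of overrides can be kept in a separate YAML file and passed
+with `--configfile`, as shown for the Nextclade config below.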
+ +## Snakefile and rules + +The rules directory contains separate Snakefiles (`*.smk`) as modules +of the core ingest workflow. The modules of the workflow are in +separate files to keep the main ingest [Snakefile](Snakefile) succinct +and organized. + +The `workdir` is hardcoded to be the ingest directory so all filepaths +for inputs/outputs should be relative to the ingest directory. + +Modules are all +[included](https://snakemake.readthedocs.io/en/stable/snakefiles/modularization.html#includes) +in the main Snakefile in the order that they are expected to run. + +### Nextclade + +Nextstrain is pushing to standardize ingest workflows with Nextclade +runs to include Nextclade outputs in our publicly hosted data. +However, if a Nextclade dataset does not already exist, it requires +curated data as input, so we are making Nextclade steps optional here. + +If Nextclade config values are included, the Nextclade rules will +create the final metadata TSV by joining the Nextclade output with the +metadata. If Nextclade configs are not included, we rename the subset +metadata TSV to the final metadata TSV. + +To run Nextclade rules, include the `defaults/nextclade_config.yaml` +config file with: + +```bash +nextstrain build ingest --configfile defaults/nextclade_config.yaml +``` + +> [!TIP] +> If the Nextclade dataset is stable and you always want to run the +> Nextclade rules as part of ingest, we recommend moving the Nextclade +> related config parameters from the `defaults/nextclade_config.yaml` +> file to the default config file `defaults/config.yaml`. + +## Build configs + +The build-configs directory contains custom configs and rules that +override and/or extend the default workflow. + +- [nextstrain-automation](build-configs/nextstrain-automation/) - automated internal Nextstrain builds. + +## Vendored + +This repository uses +[`git subrepo`](https://github.com/ingydotnet/git-subrepo) to manage copies +of ingest scripts in [vendored](vendored), from +[nextstrain/ingest](https://github.com/nextstrain/ingest). + +See [vendored/README.md](vendored/README.md#vendoring) for +instructions on how to update the vendored scripts. diff --git a/ingest/Snakefile b/ingest/Snakefile new file mode 100644 index 0000000..98b14a2 --- /dev/null +++ b/ingest/Snakefile @@ -0,0 +1,86 @@ +""" +This is the main ingest Snakefile that orchestrates the full ingest workflow +and defines its default outputs. +""" + + +# The workflow filepaths are written relative to this Snakefile's base +# directory +workdir: workflow.current_basedir + + +# Use default configuration values. Override with Snakemake's +# --configfile/--config options. +configfile: "defaults/config.yaml" + + +# This is the default rule that Snakemake will run when there are no +# specified targets. The default output of the ingest workflow is +# usually the curated metadata and sequences. Nextstrain-maintained +# ingest workflows will produce metadata files with the standard +# Nextstrain fields and additional fields that are pathogen specific. +# We recommend using these standard fields in custom ingests as well +# to minimize the customizations you will need for the downstream +# phylogenetic workflow. + + +# TODO: Add link to centralized docs on standard Nextstrain metadata fields +rule all: + input: + "results/sequences.fasta", + "results/metadata.tsv", + + +# Note that only PATHOGEN-level customizations should be added to +# these core steps, meaning they are custom rules necessary for all +# builds of the pathogen. 
If there are build-specific customizations, +# they should be added with the custom_rules imported below to ensure +# that the core workflow is not complicated by build-specific rules. +include: "rules/fetch_from_ncbi.smk" +include: "rules/curate.smk" + + +# We are pushing to standardize ingest workflows with Nextclade runs +# to include Nextclade outputs in our publicly hosted data. However, +# if a Nextclade dataset does not already exist, creating one requires +# curated data as input, so we are making Nextclade steps optional +# here. +# +# If Nextclade config values are included, the nextclade rules will +# create the final metadata TSV by joining the Nextclade output with +# the metadata. If Nextclade configs are not included, we rename the +# subset metadata TSV to the final metadata TSV. To run nextclade.smk +# rules, include the `defaults/nextclade_config.yaml` config file with +# `nextstrain build ingest --configfile +# defaults/nextclade_config.yaml`. +if "nextclade" in config: + + include: "rules/nextclade.smk" + +else: + + rule create_final_metadata: + input: + metadata="data/subset_metadata.tsv", + output: + metadata="results/metadata.tsv", + shell: + """ + mv {input.metadata} {output.metadata} + """ + + +# Allow users to import custom rules provided via the config. +# This allows users to run custom rules that can extend or override +# the workflow. A concrete example of using custom rules is the +# extension of the workflow with rules to support the Nextstrain +# automation that uploads files and sends internal Slack +# notifications. For extensions, the user will have to specify the +# custom rule targets when running the workflow. For overrides, the +# custom Snakefile will have to use the `ruleorder` directive to allow +# Snakemake to handle ambiguous rules +# https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#handling-ambiguous-rules +if "custom_rules" in config: + for rule_file in config["custom_rules"]: + + include: rule_file diff --git a/ingest/build-configs/nextstrain-automation/README.md b/ingest/build-configs/nextstrain-automation/README.md new file mode 100644 index 0000000..8eff7dd --- /dev/null +++ b/ingest/build-configs/nextstrain-automation/README.md @@ -0,0 +1,38 @@ +# Nextstrain automation + +> [!NOTE] +> External users can ignore this directory! +> This build config/customization is tailored for the internal Nextstrain team +> to extend the core ingest workflow for automated workflows. + +## Update the config + +Update the [config.yaml](config.yaml) for your pathogen: + +1. Edit the `s3_dst` param to add the pathogen repository name. +2. Edit the `files_to_upload` param to a mapping of files you need to upload for your pathogen. +The default includes suggested files for uploading curated data and Nextclade outputs. + +## Run the workflow + +Provide the additional config file to the Snakemake options in order to +include the custom rules from [upload.smk](upload.smk) in the workflow. +Specify the `upload_all` target in order to run the additional upload rules. + +The upload rules will require AWS credentials for a user that has permissions +to upload to the Nextstrain data bucket. 
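+
+One way to provide them (assuming you keep the credentials in environment
+variables) is to export them in your shell before invoking the build, so the
+`--env` options below can pass them through to the build runtime:
+
+```
+export AWS_ACCESS_KEY_ID=...
+export AWS_SECRET_ACCESS_KEY=...
+```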
+ +The customized workflow can be run from the top level pathogen repo directory with: +``` +nextstrain build \ + --env AWS_ACCESS_KEY_ID \ + --env AWS_SECRET_ACCESS_KEY \ + ingest \ + upload_all \ + --configfile build-configs/nextstrain-automation/config.yaml +``` + +## Automated GitHub Action workflows + +Additional instructions on how to use this with the shared `pathogen-repo-build` +GitHub Action workflow to come! diff --git a/ingest/build-configs/nextstrain-automation/config.yaml b/ingest/build-configs/nextstrain-automation/config.yaml new file mode 100644 index 0000000..411399d --- /dev/null +++ b/ingest/build-configs/nextstrain-automation/config.yaml @@ -0,0 +1,24 @@ +# This configuration file should contain all required configuration parameters +# for the ingest workflow to run with additional Nextstrain automation rules. + +# Custom rules to run as part of the Nextstrain automated workflow The +# paths should be relative to the ingest directory. +custom_rules: + - build-configs/nextstrain-automation/upload.smk + +# Nextstrain CloudFront domain to ensure that we invalidate CloudFront +# after the S3 uploads This is required as long as we are using the +# AWS CLI for uploads +cloudfront_domain: "data.nextstrain.org" + +# Nextstrain AWS S3 Bucket with pathogen prefix +s3_dst: "s3://nextstrain-data/files/workflows/seasonal-cov" + +# Mapping of files to upload +## TODO verify +files_to_upload: + ncbi.ndjson.zst: data/ncbi.ndjson + metadata.tsv.zst: results/metadata.tsv + sequences.fasta.zst: results/sequences.fasta + alignments.fasta.zst: results/alignment.fasta + translations.zip: results/translations.zip diff --git a/ingest/build-configs/nextstrain-automation/upload.smk b/ingest/build-configs/nextstrain-automation/upload.smk new file mode 100644 index 0000000..6954966 --- /dev/null +++ b/ingest/build-configs/nextstrain-automation/upload.smk @@ -0,0 +1,43 @@ +""" +This part of the workflow handles uploading files to AWS S3. + +Files to upload must be defined in the `files_to_upload` config param, where +the keys are the remote files and the values are the local filepaths +relative to the ingest directory. + +Produces a single file for each uploaded file: + "results/upload/{remote_file}.upload" + +The rule `upload_all` can be used as a target to upload all files. 
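+
+For example, given a config entry such as the one in
+build-configs/nextstrain-automation/config.yaml:
+
+    files_to_upload:
+      metadata.tsv.zst: results/metadata.tsv
+
+the local file results/metadata.tsv is uploaded to <s3_dst>/metadata.tsv.zst
+and the marker file results/upload/metadata.tsv.zst.upload is produced.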
+""" + +import os + +slack_envvars_defined = "SLACK_CHANNELS" in os.environ and "SLACK_TOKEN" in os.environ +send_notifications = config.get("send_slack_notifications", False) and slack_envvars_defined + + +rule upload_to_s3: + input: + file_to_upload=lambda wildcards: config["files_to_upload"][wildcards.remote_file], + output: + "results/upload/{remote_file}.upload", + params: + quiet="" if send_notifications else "--quiet", + s3_dst=config["s3_dst"], + cloudfront_domain=config["cloudfront_domain"], + shell: + """ + ./vendored/upload-to-s3 \ + {params.quiet} \ + {input.file_to_upload:q} \ + {params.s3_dst:q}/{wildcards.remote_file:q} \ + {params.cloudfront_domain} 2>&1 | tee {output} + """ + + +rule upload_all: + input: + uploads=[f"results/upload/{remote_file}.upload" for remote_file in config["files_to_upload"].keys()], + output: + touch("results/upload_all.done"), diff --git a/ingest/defaults/annotations.tsv b/ingest/defaults/annotations.tsv new file mode 100644 index 0000000..89c0059 --- /dev/null +++ b/ingest/defaults/annotations.tsv @@ -0,0 +1,6 @@ +# Manually curated annotations TSV file +# The TSV should not have a header and should have exactly three columns: +# id to match existing metadata, field name, and field value +# If there are multiple annotations for the same id and field, then the last value is used +# Lines starting with '#' are treated as comments +# Any '#' after the field value are treated as comments. diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml new file mode 100644 index 0000000..e9af859 --- /dev/null +++ b/ingest/defaults/config.yaml @@ -0,0 +1,134 @@ +# This configuration file should contain all required configuration parameters +# for the ingest workflow to run to completion. +# +# Define optional config parameters with their default values here so that users +# do not have to dig through the workflows to figure out the default values + +# Required to fetch from Entrez +entrez_search_term: "" + +# Required to fetch from NCBI Datasets +ncbi_taxon_id: "" + +# The list of NCBI Datasets fields to include from NCBI Datasets output +# These need to be the "mnemonics" of the NCBI Datasets fields, see docs for full list of fields +# https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields +# Note: the "accession" field MUST be provided to match with the sequences +ncbi_datasets_fields: + - accession + - sourcedb + - sra-accs + - isolate-lineage + - geo-region + - geo-location + - isolate-collection-date + - release-date + - update-date + - length + - host-name + - isolate-lineage-source + - biosample-acc + - submitter-names + - submitter-affiliation + - submitter-country + +# Config parameters related to the curate pipeline +curate: + # URL pointed to public generalized geolocation rules + # For the Nextstrain team, this is currently + # "https://raw.githubusercontent.com/nextstrain/ncov-ingest/master/source-data/gisaid_geoLocationRules.tsv" + geolocation_rules_url: "https://raw.githubusercontent.com/nextstrain/ncov-ingest/master/source-data/gisaid_geoLocationRules.tsv" + # The path to the local geolocation rules within the pathogen repo + # The path should be relative to the ingest directory. + local_geolocation_rules: "defaults/geolocation_rules.tsv" + # List of field names to change where the key is the original field name and the value is the new field name + # The original field names should match the ncbi_datasets_fields provided above. 
+ # This is the first step in the pipeline, so any references to field names in the configs below should use the new field names + field_map: + accession: accession + accession_version: accession_version + sourcedb: database + sra-accs: sra_accessions + isolate-lineage: strain + geo-region: region + geo-location: location + isolate-collection-date: date + release-date: date_released + update-date: date_updated + length: length + host-name: host + isolate-lineage-source: sample_type + biosample-acc: biosample_accessions + submitter-names: authors + submitter-affiliation: institution + submitter-country: submitter_country + # Standardized strain name regex + # Currently accepts any characters because we do not have a clear standard for strain names across pathogens + strain_regex: "^.+$" + # Back up strain name field to use if "strain" doesn"t match regex above + strain_backup_fields: ["accession"] + # List of date fields to standardize to ISO format YYYY-MM-DD + date_fields: ["date", "date_released", "date_updated"] + # List of expected date formats that are present in the date fields provided above + # These date formats should use directives expected by datetime + # See https://docs.python.org/3.9/library/datetime.html#strftime-and-strptime-format-codes + expected_date_formats: ["%Y", "%Y-%m", "%Y-%m-%d", "%Y-%m-%dT%H:%M:%SZ"] + titlecase: + # List of string fields to titlecase + fields: ["region", "country", "division", "location"] + # List of abbreviations not cast to titlecase, keeps uppercase + abbreviations: ["USA"] + # Articles that should not be cast to titlecase + articles: + - and + - d + - de + - del + - des + - di + - do + - en + - l + - la + - las + - le + - los + - nad + - of + - op + - sur + - the + - y + # Metadata field that contains the list of authors associated with the sequence + authors_field: "authors" + # Default value to use if the authors field is empty + authors_default_value: "?" + # Name to use for the generated abbreviated authors field + abbr_authors_field: "abbr_authors" + # Path to the manual annotations file + # The path should be relative to the ingest directory + annotations: "defaults/annotations.tsv" + # The ID field in the metadata to use to merge the manual annotations + annotations_id: "accession" + # The ID field in the metadata to use as the sequence id in the output FASTA file + output_id_field: "accession" + # The field in the NDJSON record that contains the actual genomic sequence + output_sequence_field: "sequence" + # The list of metadata columns to keep in the final output of the curation pipeline. + metadata_columns: + - accession + - accession_version + - strain + - date + - region + - country + - division + - location + - length + - host + - date_released + - date_updated + - sra_accessions + - authors + - abbr_authors + - institution diff --git a/ingest/defaults/geolocation_rules.tsv b/ingest/defaults/geolocation_rules.tsv new file mode 100644 index 0000000..bb5f7d2 --- /dev/null +++ b/ingest/defaults/geolocation_rules.tsv @@ -0,0 +1,7 @@ +# TSV file of geolocation rules with the format: +# '' where the raw and annotated geolocations +# are formatted as '///' +# If creating a general rule, then the raw field value can be substituted with '*' +# Lines starting with '#' will be ignored as comments. +# Trailing '#' will be ignored as comments. +# TODO example line? 
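+# For example, a (hypothetical, commented-out) rule to normalize a common
+# alternate spelling of a country might look like:
+# South America/Brasil/*/*	South America/Brazil/*/*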
diff --git a/ingest/defaults/nextclade_config.yaml b/ingest/defaults/nextclade_config.yaml new file mode 100644 index 0000000..3c48bc8 --- /dev/null +++ b/ingest/defaults/nextclade_config.yaml @@ -0,0 +1,12 @@ +# Nextclade parameters to include if you are running Nextclade as a part of your ingest workflow +# Note that this requires a Nextclade dataset to already exist for your pathogen. +nextclade: + # The name of the Nextclade dataset to use for running nextclade. + # Run `nextclade dataset list` to get a full list of available Nextclade datasets + dataset_name: "" + # Path to the mapping for renaming Nextclade output columns + # The path should be relative to the ingest directory + field_map: "config/nextclade_field_map.tsv" + # This is the ID field you would use to match the Nextclade output with the record metadata. + # This should be the new name that you have defined in your field map. + id_field: "seqName" diff --git a/ingest/defaults/nextclade_field_map.tsv b/ingest/defaults/nextclade_field_map.tsv new file mode 100644 index 0000000..513b0fd --- /dev/null +++ b/ingest/defaults/nextclade_field_map.tsv @@ -0,0 +1,18 @@ +# TSV file that is a mapping of column names for Nextclade output TSV +# The first column should be the original column name of the Nextclade TSV +# The second column should be the new column name to use in the final metadata TSV +# Nextclade can have pathogen specific output columns so make sure to check which +# columns would be useful for your downstream phylogenetic analysis. +seqName seqName +clade clade +lineage lineage +coverage coverage +totalMissing missing_data +totalSubstitutions divergence +totalNonACGTNs nonACGTN +qc.missingData.status QC_missing_data +qc.mixedSites.status QC_mixed_sites +qc.privateMutations.status QC_rare_mutations +qc.frameShifts.status QC_frame_shifts +qc.stopCodons.status QC_stop_codons +frameShifts frame_shifts diff --git a/ingest/rules/curate.smk b/ingest/rules/curate.smk new file mode 100644 index 0000000..112eb34 --- /dev/null +++ b/ingest/rules/curate.smk @@ -0,0 +1,131 @@ +""" +This part of the workflow handles the curation of data from NCBI + +REQUIRED INPUTS: + + ndjson = data/ncbi.ndjson + +OUTPUTS: + + metadata = data/subset_metadata.tsv + seuqences = results/sequences.fasta + +""" + + +# The following two rules can be ignored if you choose not to use the +# generalized geolocation rules that are shared across pathogens. +# The Nextstrain team will try to maintain a generalized set of geolocation +# rules that can then be overridden by local geolocation rules per pathogen repo. +rule fetch_general_geolocation_rules: + output: + general_geolocation_rules="data/general-geolocation-rules.tsv", + params: + geolocation_rules_url=config["curate"]["geolocation_rules_url"], + shell: + """ + curl {params.geolocation_rules_url} > {output.general_geolocation_rules} + """ + + +rule concat_geolocation_rules: + input: + general_geolocation_rules="data/general-geolocation-rules.tsv", + local_geolocation_rules=config["curate"]["local_geolocation_rules"], + output: + all_geolocation_rules="data/all-geolocation-rules.tsv", + shell: + # why is this `>>` and not `>` + """ + cat {input.general_geolocation_rules} {input.local_geolocation_rules} >> {output.all_geolocation_rules} + """ + + +def format_field_map(field_map: dict[str, str]) -> str: + """ + Format dict to `"key1"="value1" "key2"="value2"...` for use in shell commands. 
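+
+    For example:
+
+        >>> format_field_map({"geo-region": "region", "geo-location": "location"})
+        '"geo-region"="region" "geo-location"="location"'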
+ """ + return " ".join([f'"{key}"="{value}"' for key, value in field_map.items()]) + + +# This curate pipeline is based on existing pipelines for pathogen repos using NCBI data. +# You may want to add and/or remove steps from the pipeline for custom metadata +# curation for your pathogen. Note that the curate pipeline is streaming NDJSON +# records between scripts, so any custom scripts added to the pipeline should expect +# the input as NDJSON records from stdin and output NDJSON records to stdout. +# The final step of the pipeline should convert the NDJSON records to two +# separate files: a metadata TSV and a sequences FASTA. +rule curate: + input: + sequences_ndjson="data/ncbi.ndjson", + # Change the geolocation_rules input path if you are removing the above two rules + all_geolocation_rules="data/all-geolocation-rules.tsv", + annotations=config["curate"]["annotations"], + output: + metadata="data/all_metadata.tsv", + sequences="results/sequences.fasta", + log: + "logs/curate.txt", + benchmark: + "benchmarks/curate.txt" + params: + field_map=format_field_map(config["curate"]["field_map"]), + strain_regex=config["curate"]["strain_regex"], + strain_backup_fields=config["curate"]["strain_backup_fields"], + date_fields=config["curate"]["date_fields"], + expected_date_formats=config["curate"]["expected_date_formats"], + articles=config["curate"]["titlecase"]["articles"], + abbreviations=config["curate"]["titlecase"]["abbreviations"], + titlecase_fields=config["curate"]["titlecase"]["fields"], + authors_field=config["curate"]["authors_field"], + authors_default_value=config["curate"]["authors_default_value"], + abbr_authors_field=config["curate"]["abbr_authors_field"], + annotations_id=config["curate"]["annotations_id"], + id_field=config["curate"]["output_id_field"], + sequence_field=config["curate"]["output_sequence_field"], + shell: + """ + (cat {input.sequences_ndjson} \ + | ./vendored/transform-field-names \ + --field-map {params.field_map} \ + | augur curate normalize-strings \ + | ./vendored/transform-strain-names \ + --strain-regex {params.strain_regex} \ + --backup-fields {params.strain_backup_fields} \ + | augur curate format-dates \ + --date-fields {params.date_fields} \ + --expected-date-formats {params.expected_date_formats} \ + | ./vendored/transform-genbank-location \ + | augur curate titlecase \ + --titlecase-fields {params.titlecase_fields} \ + --articles {params.articles} \ + --abbreviations {params.abbreviations} \ + | ./vendored/transform-authors \ + --authors-field {params.authors_field} \ + --default-value {params.authors_default_value} \ + --abbr-authors-field {params.abbr_authors_field} \ + | ./vendored/apply-geolocation-rules \ + --geolocation-rules {input.all_geolocation_rules} \ + | ./vendored/merge-user-metadata \ + --annotations {input.annotations} \ + --id-field {params.annotations_id} \ + | augur curate passthru \ + --output-metadata {output.metadata} \ + --output-fasta {output.sequences} \ + --output-id-field {params.id_field} \ + --output-seq-field {params.sequence_field} ) 2>> {log} + """ + + +rule subset_metadata: + input: + metadata="data/all_metadata.tsv", + output: + subset_metadata="data/subset_metadata.tsv", + params: + metadata_fields=",".join(config["curate"]["metadata_columns"]), + shell: + """ + tsv-select -H -f {params.metadata_fields} \ + {input.metadata} > {output.subset_metadata} + """ diff --git a/ingest/rules/fetch_from_ncbi.smk b/ingest/rules/fetch_from_ncbi.smk new file mode 100644 index 0000000..ed350ce --- /dev/null +++ 
b/ingest/rules/fetch_from_ncbi.smk @@ -0,0 +1,173 @@ +""" +This part of the workflow handles fetching sequences and metadata from NCBI. + +REQUIRED INPUTS: + + None + +OUTPUTS: + + ndjson = data/ncbi.ndjson + +There are two different approaches for fetching data from NCBI. +Choose the one that works best for the pathogen data and edit the workflow config +to provide the correct parameter. + +1. Fetch with NCBI Datasets (https://www.ncbi.nlm.nih.gov/datasets/) + - requires `ncbi_taxon_id` config + - Directly returns NDJSON without custom parsing + - Fastest option for large datasets (e.g. SARS-CoV-2) + - Only returns metadata fields that are available through NCBI Datasets + - Only works for viral genomes + +2. Fetch from Entrez (https://www.ncbi.nlm.nih.gov/books/NBK25501/) + - requires `entrez_search_term` config + - Returns all available data via a GenBank file + - Requires a custom script to parse the necessary fields from the GenBank file +""" + + +# This ruleorder determines which rule to use to produce the final NCBI NDJSON file. +# The default is set to use NCBI Datasets since it does not require a custom script. +# Switch the rule order if you plan to use Entrez +ruleorder: format_ncbi_datasets_ndjson > parse_genbank_to_ndjson + + +########################################################################### +####################### 1. Fetch from NCBI Datasets ####################### +########################################################################### + + +rule fetch_ncbi_dataset_package: + params: + ncbi_taxon_id=config["ncbi_taxon_id"], + output: + dataset_package=temp("data/ncbi_dataset.zip"), + # Allow retries in case of network errors + retries: 5 + benchmark: + "benchmarks/fetch_ncbi_dataset_package.txt" + shell: + # what's the `:q` mean + """ + datasets download virus genome taxon {params.ncbi_taxon_id:q} \ + --no-progressbar \ + --filename {output.dataset_package} + """ + + +# Note: This rule is not part of the default workflow! +# It is intended to be used as a specific target for users to be able +# to inspect and explore the full raw metadata from NCBI Datasets. +rule dump_ncbi_dataset_report: + input: + dataset_package="data/ncbi_dataset.zip", + output: + ncbi_dataset_tsv="data/ncbi_dataset_report_raw.tsv", + shell: + """ + dataformat tsv virus-genome \ + --package {input.dataset_package} > {output.ncbi_dataset_tsv} + """ + + +rule extract_ncbi_dataset_sequences: + input: + dataset_package="data/ncbi_dataset.zip", + output: + ncbi_dataset_sequences=temp("data/ncbi_dataset_sequences.fasta"), + # why benchmarks here but not elsewhere + benchmark: + "benchmarks/extract_ncbi_dataset_sequences.txt" + shell: + """ + unzip -jp {input.dataset_package} \ + ncbi_dataset/data/genomic.fna > {output.ncbi_dataset_sequences} + """ + + +rule format_ncbi_dataset_report: + input: + dataset_package="data/ncbi_dataset.zip", + output: + ncbi_dataset_tsv=temp("data/ncbi_dataset_report.tsv"), + params: + ncbi_datasets_fields=",".join(config["ncbi_datasets_fields"]), + benchmark: + "benchmarks/format_ncbi_dataset_report.txt" + shell: + """ + dataformat tsv virus-genome \ + --package {input.dataset_package} \ + --fields {params.ncbi_datasets_fields:q} \ + --elide-header \ + | csvtk fix-quotes -Ht \ + | csvtk add-header -t -l -n {params.ncbi_datasets_fields:q} \ + | csvtk rename -t -f accession -n accession_version \ + | csvtk -t mutate -f accession_version -n accession -p "^(.+?)\." 
\ + | csvtk del-quotes -t \ + | tsv-select -H -f accession --rest last \ + > {output.ncbi_dataset_tsv} + """ + + +# Technically you can bypass this step and directly provide FASTA and TSV files +# as input files for the curate pipeline. +# We do the formatting here to have a uniform NDJSON file format for the raw +# data that we host on data.nextstrain.org +rule format_ncbi_datasets_ndjson: + input: + ncbi_dataset_sequences="data/ncbi_dataset_sequences.fasta", + ncbi_dataset_tsv="data/ncbi_dataset_report.tsv", + output: + ndjson="data/ncbi.ndjson", + log: + "logs/format_ncbi_datasets_ndjson.txt", + benchmark: + "benchmarks/format_ncbi_datasets_ndjson.txt" + shell: + """ + augur curate passthru \ + --metadata {input.ncbi_dataset_tsv} \ + --fasta {input.ncbi_dataset_sequences} \ + --seq-id-column accession_version \ + --seq-field sequence \ + --unmatched-reporting warn \ + --duplicate-reporting warn \ + 2> {log} > {output.ndjson} + """ + + +########################################################################### +########################## 2. Fetch from Entrez ########################### +########################################################################### + + +rule fetch_from_ncbi_entrez: + params: + term=config["entrez_search_term"], + output: + genbank="data/genbank.gb", + # Allow retries in case of network errors + retries: 5 + benchmark: + "benchmarks/fetch_from_ncbi_entrez.txt" + shell: + """ + vendored/fetch-from-ncbi-entrez \ + --term {params.term:q} \ + --output {output.genbank} + """ + + +rule parse_genbank_to_ndjson: + input: + genbank="data/genbank.gb", + output: + ndjson="data/ncbi.ndjson", + benchmark: + "benchmarks/parse_genbank_to_ndjson.txt" + shell: + """ + # Add in custom script to parse needed fields from GenBank file to NDJSON file + """ diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk new file mode 100644 index 0000000..ffbeab8 --- /dev/null +++ b/ingest/rules/nextclade.smk @@ -0,0 +1,97 @@ +""" +This part of the workflow handles running Nextclade on the curated metadata +and sequences. + +REQUIRED INPUTS: + + metadata = data/subset_metadata.tsv + sequences = results/sequences.fasta + +OUTPUTS: + + metadata = results/metadata.tsv + nextclade = results/nextclade.tsv + alignment = results/alignment.fasta + translations = results/translations.zip + +See Nextclade docs for more details on usage, inputs, and outputs if you would +like to customize the rules: +https://docs.nextstrain.org/projects/nextclade/page/user/nextclade-cli.html +""" + +DATASET_NAME = config["nextclade"]["dataset_name"] + + +rule get_nextclade_dataset: + """Download Nextclade dataset""" + output: + dataset=f"data/nextclade_data/{DATASET_NAME}.zip", + params: + dataset_name=DATASET_NAME, + shell: + # should this get updated to `nextclade3`? + """ + nextclade2 dataset get \ + --name={params.dataset_name:q} \ + --output-zip={output.dataset} \ + --verbose + """ + + +rule run_nextclade: + input: + dataset=f"data/nextclade_data/{DATASET_NAME}.zip", + sequences="results/sequences.fasta", + output: + nextclade="results/nextclade.tsv", + alignment="results/alignment.fasta", + translations="results/translations.zip", + params: + # The lambda is used to deactivate automatic wildcard expansion. 
+ # https://github.com/snakemake/snakemake/blob/384d0066c512b0429719085f2cf886fdb97fd80a/snakemake/rules.py#L997-L1000 + translations=lambda w: "results/translations/{gene}.fasta", + shell: + """ + nextclade2 run \ + {input.sequences} \ + --input-dataset {input.dataset} \ + --output-tsv {output.nextclade} \ + --output-fasta {output.alignment} \ + --output-translations {params.translations} + + zip -rj {output.translations} results/translations + """ + + +rule join_metadata_and_nextclade: + input: + nextclade="results/nextclade.tsv", + metadata="data/subset_metadata.tsv", + nextclade_field_map=config["nextclade"]["field_map"], + output: + metadata="results/metadata.tsv", + params: + metadata_id_field=config["curate"]["output_id_field"], + nextclade_id_field=config["nextclade"]["id_field"], + shell: + """ + export SUBSET_FIELDS=`grep -v '^#' {input.nextclade_field_map} | awk '{{print $1}}' | tr '\n' ',' | sed 's/,$//g'` + + csvtk -tl cut -f $SUBSET_FIELDS \ + {input.nextclade} \ + | csvtk -tl rename2 \ + -F \ + -f '*' \ + -p '(.+)' \ + -r '{{kv}}' \ + -k {input.nextclade_field_map} \ + | tsv-join -H \ + --filter-file - \ + --key-fields {params.nextclade_id_field} \ + --data-fields {params.metadata_id_field} \ + --append-fields '*' \ + --write-all ? \ + {input.metadata} \ + | tsv-select -H --exclude {params.nextclade_id_field} \ + > {output.metadata} + """ diff --git a/nextstrain-pathogen.yaml b/nextstrain-pathogen.yaml new file mode 100644 index 0000000..b74c50d --- /dev/null +++ b/nextstrain-pathogen.yaml @@ -0,0 +1,5 @@ +# This is currently an empty file to indicate the top level pathogen repo. +# The inclusion of this file allows the Nextstrain CLI to run the +# `nextstrain build` from any directory regardless of runtime. +# +# See https://github.com/nextstrain/cli/releases/tag/8.2.0 for more details. diff --git a/phylogenetic/README.md b/phylogenetic/README.md new file mode 100644 index 0000000..7e961c8 --- /dev/null +++ b/phylogenetic/README.md @@ -0,0 +1,59 @@ +# Phylogenetic + +This workflow uses metadata and sequences to produce one or multiple [Nextstrain datasets][] +that can be visualized in Auspice. + +## Workflow Usage + +The workflow can be run from the top level pathogen repo directory: +``` +nextstrain build phylogenetic +``` + +Alternatively, the workflow can also be run from within the phylogenetic directory: +``` +cd phylogenetic +nextstrain build . +``` + +This produces the default outputs of the phylogenetic workflow: + +- auspice_json(s) = auspice/*.json + +## Data Requirements + +The core phylogenetic workflow will use metadata values as-is, so please do any +desired data formatting and curations as part of the [ingest](../ingest/) workflow. + +1. The metadata must include an ID column that can be used as as exact match for + the sequence ID present in the FASTA headers. +2. The `date` column in the metadata must be in ISO 8601 date format (i.e. YYYY-MM-DD). +3. Ambiguous dates should be masked with `XX` (e.g. 2023-01-XX). + +## Defaults + +The defaults directory contains all of the default configurations for the phylogenetic workflow. + +[defaults/config.yaml](defaults/config.yaml) contains all of the default configuration parameters +used for the phylogenetic workflow. Use Snakemake's `--configfile`/`--config` +options to override these default values. + +## Snakefile and rules + +The rules directory contains separate Snakefiles (`*.smk`) as modules of the core phylogenetic workflow. 
+The modules of the workflow are in separate files to keep the main phylogenetic [Snakefile](Snakefile) succinct and organized. + +The `workdir` is hardcoded to be the phylogenetic directory so all filepaths for +inputs/outputs should be relative to the phylogenetic directory. + +Modules are all [included](https://snakemake.readthedocs.io/en/stable/snakefiles/modularization.html#includes) +in the main Snakefile in the order that they are expected to run. + +## Build configs + +The build-configs directory contains custom configs and rules that override and/or +extend the default workflow. + +- [ci](build-configs/ci/) - CI build that runs with example data + +[Nextstrain datasets]: https://docs.nextstrain.org/en/latest/reference/glossary.html#term-dataset diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile new file mode 100644 index 0000000..53eb471 --- /dev/null +++ b/phylogenetic/Snakefile @@ -0,0 +1,55 @@ +""" +This is the main phylogenetic Snakefile that orchestrates the full phylogenetic +workflow and defines its default output(s). +""" + + +# The workflow filepaths are written relative to this Snakefile's base directory +workdir: workflow.current_basedir + + +# Use default configuration values. Override with Snakemake's --configfile/--config options. +configfile: "defaults/config.yaml" + + +# This is the default rule that Snakemake will run when there are no specified targets. +# The default output of the phylogenetic workflow is usually the final +# Nexstrain dataset(s) or Auspice JSON(s) that are output from `rules/export.smk` +# See Nextstrain docs on expected naming conventions of dataset files +# https://docs.nextstrain.org/page/reference/data-formats.html +rule all: + input: + # Fill in path(s) to the final exported Auspice JSON(s) + auspice_json="", + + +# These rules are imported in the order that they are expected to run. +# Each Snakefile will have documented inputs and outputs that should be kept as +# consistent interfaces across pathogen repos. This allows us to define typical +# steps that are required for a phylogenetic workflow, but still allow pathogen +# specific customizations within each step. +# Note that only PATHOGEN level customizations should be added to these +# core steps, meaning they are custom rules necessary for all builds of the pathogen. +# If there are build specific customizations, they should be added with the +# custom_rules imported below to ensure that the core workflow is not complicated +# by build specific rules. +include: "rules/prepare_sequences.smk" +include: "rules/construct_phylogeny.smk" +include: "rules/annotate_phylogeny.smk" +include: "rules/export.smk" + + +# Allow users to import custom rules provided via the config. +# This allows users to run custom rules that can extend or override the workflow. +# A concrete example of using custom rules is the extension of the workflow with +# rules to support the Nextstrain automation that upload files and send internal +# Slack notifications. +# For extensions, the user will have to specify the custom rule targets when +# running the workflow. 
+# For overrides, the custom Snakefile will have to use the `ruleorder` directive +# to allow Snakemake to handle ambiguous rules +# https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#handling-ambiguous-rules +if "custom_rules" in config: + for rule_file in config["custom_rules"]: + + include: rule_file diff --git a/phylogenetic/build-configs/ci/config.yaml b/phylogenetic/build-configs/ci/config.yaml new file mode 100644 index 0000000..de89c67 --- /dev/null +++ b/phylogenetic/build-configs/ci/config.yaml @@ -0,0 +1,7 @@ +# This configuration file contains the custom configurations parameters +# for the CI workflow to run with the example data. + +# Custom rules to run as part of the CI automated workflow +# The paths should be relative to the phylogenetic directory. +custom_rules: + - build-configs/ci/copy_example_data.smk diff --git a/phylogenetic/build-configs/ci/copy_example_data.smk b/phylogenetic/build-configs/ci/copy_example_data.smk new file mode 100644 index 0000000..8634488 --- /dev/null +++ b/phylogenetic/build-configs/ci/copy_example_data.smk @@ -0,0 +1,17 @@ +rule copy_example_data: + input: + sequences="example_data/sequences.fasta", + metadata="example_data/metadata.tsv", + output: + sequences="data/sequences.fasta", + metadata="data/metadata.tsv", + shell: + """ + cp -f {input.sequences} {output.sequences} + cp -f {input.metadata} {output.metadata} + """ + + +# Add a Snakemake ruleorder directive here if you need to resolve ambiguous rules +# that have the same output as the copy_example_data rule. +# ruleorder: copy_example_data > ... diff --git a/phylogenetic/defaults/config.yaml b/phylogenetic/defaults/config.yaml new file mode 100644 index 0000000..234fc62 --- /dev/null +++ b/phylogenetic/defaults/config.yaml @@ -0,0 +1,5 @@ +# This configuration file should contain all required configuration parameters +# for the phylogenetic workflow to run to completion. +# +# Define optional config parameters with their default values here so that users +# do not have to dig through the workflows to figure out the default values diff --git a/phylogenetic/example_data/metadata.tsv b/phylogenetic/example_data/metadata.tsv new file mode 100644 index 0000000..e69de29 diff --git a/phylogenetic/example_data/sequences.fasta b/phylogenetic/example_data/sequences.fasta new file mode 100644 index 0000000..e69de29 diff --git a/phylogenetic/rules/annotate_phylogeny.smk b/phylogenetic/rules/annotate_phylogeny.smk new file mode 100644 index 0000000..9b9843e --- /dev/null +++ b/phylogenetic/rules/annotate_phylogeny.smk @@ -0,0 +1,32 @@ +""" +This part of the workflow creates additional annotations for the phylogenetic tree. + +REQUIRED INPUTS: + + metadata = data/metadata.tsv + prepared_sequences = results/prepared_sequences.fasta + tree = results/tree.nwk + +OUTPUTS: + + node_data = results/*.json + + There are no required outputs for this part of the workflow as it depends + on which annotations are created. All outputs are expected to be node data + JSON files that can be fed into `augur export`. + + See Nextstrain's data format docs for more details on node data JSONs: + https://docs.nextstrain.org/page/reference/data-formats.html + +This part of the workflow usually includes the following steps: + + - augur traits + - augur ancestral + - augur translate + - augur clades + +See Augur's usage docs for these commands for more details. + +Custom node data files can also be produced by build-specific scripts in addition +to the ones produced by Augur commands. 
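+
+As an illustrative sketch only (this workflow does not define the rule yet,
+and the column choice is a placeholder), an `augur traits` call producing
+such a node data JSON might look like:
+
+    augur traits \
+        --tree results/tree.nwk \
+        --metadata data/metadata.tsv \
+        --columns country \
+        --confidence \
+        --output-node-data results/traits.json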
+""" diff --git a/phylogenetic/rules/construct_phylogeny.smk b/phylogenetic/rules/construct_phylogeny.smk new file mode 100644 index 0000000..7652005 --- /dev/null +++ b/phylogenetic/rules/construct_phylogeny.smk @@ -0,0 +1,20 @@ +""" +This part of the workflow constructs the phylogenetic tree. + +REQUIRED INPUTS: + + metadata = data/metadata.tsv + prepared_sequences = results/prepared_sequences.fasta + +OUTPUTS: + + tree = results/tree.nwk + branch_lengths = results/branch_lengths.json + +This part of the workflow usually includes the following steps: + + - augur tree + - augur refine + +See Augur's usage docs for these commands for more details. +""" diff --git a/phylogenetic/rules/export.smk b/phylogenetic/rules/export.smk new file mode 100644 index 0000000..a273659 --- /dev/null +++ b/phylogenetic/rules/export.smk @@ -0,0 +1,26 @@ +""" +This part of the workflow collects the phylogenetic tree and annotations to +export a Nextstrain dataset. + +REQUIRED INPUTS: + + metadata = data/metadata.tsv + tree = results/tree.nwk + branch_lengths = results/branch_lengths.json + node_data = results/*.json + +OUTPUTS: + + auspice_json = auspice/${build_name}.json + + There are optional sidecar JSON files that can be exported as part of the dataset. + See Nextstrain's data format docs for more details on sidecar files: + https://docs.nextstrain.org/page/reference/data-formats.html + +This part of the workflow usually includes the following steps: + + - augur export v2 + - augur frequencies + +See Augur's usage docs for these commands for more details. +""" diff --git a/phylogenetic/rules/prepare_sequences.smk b/phylogenetic/rules/prepare_sequences.smk new file mode 100644 index 0000000..c1c9e22 --- /dev/null +++ b/phylogenetic/rules/prepare_sequences.smk @@ -0,0 +1,22 @@ +""" +This part of the workflow prepares sequences for constructing the phylogenetic tree. + +REQUIRED INPUTS: + + metadata = data/metadata.tsv + sequences = data/sequences.fasta + reference = ../shared/reference.fasta + +OUTPUTS: + + prepared_sequences = results/prepared_sequences.fasta + +This part of the workflow usually includes the following steps: + + - augur index + - augur filter + - augur align + - augur mask + +See Augur's usage docs for these commands for more details. 
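+
+As an illustrative sketch only (the filtering criteria are placeholders), an
+`augur filter` call in this part of the workflow might look like:
+
+    augur filter \
+        --sequences data/sequences.fasta \
+        --metadata data/metadata.tsv \
+        --min-length 5000 \
+        --output-sequences results/filtered.fasta
+
+with the filtered sequences then aligned and masked to produce
+results/prepared_sequences.fasta.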
+""" From faf3bd876a2077f24dfab5677aecd3283543e8fb Mon Sep 17 00:00:00 2001 From: John SJ Anderson Date: Thu, 23 May 2024 10:22:58 -0700 Subject: [PATCH 5/7] git subrepo clone (merge) https://github.com/nextstrain/ingest ingest/vendored subrepo: subdir: "ingest/vendored" merged: "5b74edc" upstream: origin: "https://github.com/nextstrain/ingest" branch: "main" commit: "5b74edc" git-subrepo: version: "0.4.6" origin: "https://github.com/ingydotnet/git-subrepo/" commit: "73a0129" --- ingest/vendored/.cramrc | 3 + ingest/vendored/.github/dependabot.yml | 17 ++ .../vendored/.github/pull_request_template.md | 16 ++ ingest/vendored/.github/workflows/ci.yaml | 23 ++ ingest/vendored/.gitrepo | 12 + ingest/vendored/.shellcheckrc | 6 + ingest/vendored/README.md | 140 +++++++++++ ingest/vendored/apply-geolocation-rules | 234 ++++++++++++++++++ ingest/vendored/cloudfront-invalidate | 42 ++++ ingest/vendored/download-from-s3 | 48 ++++ ingest/vendored/fetch-from-ncbi-entrez | 70 ++++++ ingest/vendored/merge-user-metadata | 55 ++++ ingest/vendored/notify-on-diff | 35 +++ ingest/vendored/notify-on-job-fail | 23 ++ ingest/vendored/notify-on-job-start | 27 ++ ingest/vendored/notify-on-record-change | 53 ++++ ingest/vendored/notify-slack | 56 +++++ ingest/vendored/s3-object-exists | 8 + ingest/vendored/sha256sum | 15 ++ .../transform-genbank-location.t | 30 +++ .../transform-strain-names.t | 17 ++ ingest/vendored/transform-authors | 66 +++++ ingest/vendored/transform-field-names | 48 ++++ ingest/vendored/transform-genbank-location | 59 +++++ ingest/vendored/transform-strain-names | 50 ++++ ingest/vendored/trigger | 56 +++++ ingest/vendored/trigger-on-new-data | 32 +++ ingest/vendored/upload-to-s3 | 78 ++++++ 28 files changed, 1319 insertions(+) create mode 100644 ingest/vendored/.cramrc create mode 100644 ingest/vendored/.github/dependabot.yml create mode 100644 ingest/vendored/.github/pull_request_template.md create mode 100644 ingest/vendored/.github/workflows/ci.yaml create mode 100644 ingest/vendored/.gitrepo create mode 100644 ingest/vendored/.shellcheckrc create mode 100644 ingest/vendored/README.md create mode 100755 ingest/vendored/apply-geolocation-rules create mode 100755 ingest/vendored/cloudfront-invalidate create mode 100755 ingest/vendored/download-from-s3 create mode 100755 ingest/vendored/fetch-from-ncbi-entrez create mode 100755 ingest/vendored/merge-user-metadata create mode 100755 ingest/vendored/notify-on-diff create mode 100755 ingest/vendored/notify-on-job-fail create mode 100755 ingest/vendored/notify-on-job-start create mode 100755 ingest/vendored/notify-on-record-change create mode 100755 ingest/vendored/notify-slack create mode 100755 ingest/vendored/s3-object-exists create mode 100755 ingest/vendored/sha256sum create mode 100644 ingest/vendored/tests/transform-genbank-location/transform-genbank-location.t create mode 100644 ingest/vendored/tests/transform-strain-names/transform-strain-names.t create mode 100755 ingest/vendored/transform-authors create mode 100755 ingest/vendored/transform-field-names create mode 100755 ingest/vendored/transform-genbank-location create mode 100755 ingest/vendored/transform-strain-names create mode 100755 ingest/vendored/trigger create mode 100755 ingest/vendored/trigger-on-new-data create mode 100755 ingest/vendored/upload-to-s3 diff --git a/ingest/vendored/.cramrc b/ingest/vendored/.cramrc new file mode 100644 index 0000000..153d20f --- /dev/null +++ b/ingest/vendored/.cramrc @@ -0,0 +1,3 @@ +[cram] +shell = /bin/bash +indent = 2 diff --git 
a/ingest/vendored/.github/dependabot.yml b/ingest/vendored/.github/dependabot.yml new file mode 100644 index 0000000..89bd084 --- /dev/null +++ b/ingest/vendored/.github/dependabot.yml @@ -0,0 +1,17 @@ +# Dependabot configuration file +# +# +# Each ecosystem is checked on a scheduled interval defined below. To trigger +# a check manually, go to +# +# https://github.com/nextstrain/ingest/network/updates +# +# and look for a "Check for updates" button. You may need to click around a +# bit first. +--- +version: 2 +updates: + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "weekly" diff --git a/ingest/vendored/.github/pull_request_template.md b/ingest/vendored/.github/pull_request_template.md new file mode 100644 index 0000000..ed4a5b2 --- /dev/null +++ b/ingest/vendored/.github/pull_request_template.md @@ -0,0 +1,16 @@ +### Description of proposed changes + + + +### Related issue(s) + + + +### Checklist + + + +- [ ] Checks pass +- [ ] If adding a script, add an entry for it in the README. + + diff --git a/ingest/vendored/.github/workflows/ci.yaml b/ingest/vendored/.github/workflows/ci.yaml new file mode 100644 index 0000000..e75d828 --- /dev/null +++ b/ingest/vendored/.github/workflows/ci.yaml @@ -0,0 +1,23 @@ +name: CI + +on: + push: + branches: + - main + pull_request: + workflow_dispatch: + +jobs: + shellcheck: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: nextstrain/.github/actions/shellcheck@master + + cram: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + - run: pip install cram + - run: cram tests/ \ No newline at end of file diff --git a/ingest/vendored/.gitrepo b/ingest/vendored/.gitrepo new file mode 100644 index 0000000..82df6d7 --- /dev/null +++ b/ingest/vendored/.gitrepo @@ -0,0 +1,12 @@ +; DO NOT EDIT (unless you know what you are doing) +; +; This subdirectory is a git "subrepo", and this file is maintained by the +; git-subrepo command. See https://github.com/ingydotnet/git-subrepo#readme +; +[subrepo] + remote = https://github.com/nextstrain/ingest + branch = main + commit = 5b74edc5f79896d02697d6066c6e31dd7cdc7802 + parent = 3a2acd289683daf7450b6c9d9535df723438bc55 + method = merge + cmdver = 0.4.6 diff --git a/ingest/vendored/.shellcheckrc b/ingest/vendored/.shellcheckrc new file mode 100644 index 0000000..ebed438 --- /dev/null +++ b/ingest/vendored/.shellcheckrc @@ -0,0 +1,6 @@ +# Use of this file requires Shellcheck v0.7.0 or newer. +# +# SC2064 - We intentionally want variables to expand immediately within traps +# so the trap can not fail due to variable interpolation later. +# +disable=SC2064 diff --git a/ingest/vendored/README.md b/ingest/vendored/README.md new file mode 100644 index 0000000..2c14573 --- /dev/null +++ b/ingest/vendored/README.md @@ -0,0 +1,140 @@ +# ingest + +Shared internal tooling for pathogen data ingest. Used by our individual +pathogen repos which produce Nextstrain builds. Expected to be vendored by +each pathogen repo using `git subrepo`. + +Some tools may only live here temporarily before finding a permanent home in +`augur curate` or Nextstrain CLI. Others may happily live out their days here. + +## Vendoring + +Nextstrain maintained pathogen repos will use [`git subrepo`](https://github.com/ingydotnet/git-subrepo) to vendor ingest scripts. 
+(See discussion on this decision in https://github.com/nextstrain/ingest/issues/3) + +For a list of Nextstrain repos that are currently using this method, use [this +GitHub code search](https://github.com/search?type=code&q=org%3Anextstrain+subrepo+%22remote+%3D+https%3A%2F%2Fgithub.com%2Fnextstrain%2Fingest%22). + +If you don't already have `git subrepo` installed, follow the [git subrepo installation instructions](https://github.com/ingydotnet/git-subrepo#installation). +Then add the latest ingest scripts to the pathogen repo by running: + +``` +git subrepo clone https://github.com/nextstrain/ingest ingest/vendored +``` + +Any future updates of ingest scripts can be pulled in with: + +``` +git subrepo pull ingest/vendored +``` + +If you run into merge conflicts and would like to pull in a fresh copy of the +latest ingest scripts, pull with the `--force` flag: + +``` +git subrepo pull ingest/vendored --force +``` + +> **Warning** +> Beware of rebasing/dropping the parent commit of a `git subrepo` update + +`git subrepo` relies on metadata in the `ingest/vendored/.gitrepo` file, +which includes the hash for the parent commit in the pathogen repos. +If this hash no longer exists in the commit history, there will be errors when +running future `git subrepo pull` commands. + +If you run into an error similar to the following: +``` +$ git subrepo pull ingest/vendored +git-subrepo: Command failed: 'git branch subrepo/ingest/vendored '. +fatal: not a valid object name: '' +``` +Check the parent commit hash in the `ingest/vendored/.gitrepo` file and make +sure the commit exists in the commit history. Update to the appropriate parent +commit hash if needed. + +## History + +Much of this tooling originated in +[ncov-ingest](https://github.com/nextstrain/ncov-ingest) and was passaged thru +[mpox's ingest/](https://github.com/nextstrain/mpox/tree/@/ingest/). It +subsequently proliferated from [mpox][] to other pathogen repos ([rsv][], +[zika][], [dengue][], [hepatitisB][], [forecasts-ncov][]) primarily thru +copying. To [counter that +proliferation](https://bedfordlab.slack.com/archives/C7SDVPBLZ/p1688577879947079), +this repo was made. + +[mpox]: https://github.com/nextstrain/mpox +[rsv]: https://github.com/nextstrain/rsv +[zika]: https://github.com/nextstrain/zika/pull/24 +[dengue]: https://github.com/nextstrain/dengue/pull/10 +[hepatitisB]: https://github.com/nextstrain/hepatitisB +[forecasts-ncov]: https://github.com/nextstrain/forecasts-ncov + +## Elsewhere + +The creation of this repo, in both the abstract and concrete, and the general +approach to "ingest" has been discussed in various internal places, including: + +- https://github.com/nextstrain/private/issues/59 +- @joverlee521's [workflows document](https://docs.google.com/document/d/1rLWPvEuj0Ayc8MR0O1lfRJZfj9av53xU38f20g8nU_E/edit#heading=h.4g0d3mjvb89i) +- [5 July 2023 Slack thread](https://bedfordlab.slack.com/archives/C7SDVPBLZ/p1688577879947079) +- [6 July 2023 team meeting](https://docs.google.com/document/d/1FPfx-ON5RdqL2wyvODhkrCcjgOVX3nlXgBwCPhIEsco/edit) +- _…many others_ + +## Scripts + +Scripts for supporting ingest workflow automation that don’t really belong in any of our existing tools. 
+ +- [notify-on-diff](notify-on-diff) - Send Slack message with diff of a local file and an S3 object +- [notify-on-job-fail](notify-on-job-fail) - Send Slack message with details about failed workflow job on GitHub Actions and/or AWS Batch +- [notify-on-job-start](notify-on-job-start) - Send Slack message with details about workflow job on GitHub Actions and/or AWS Batch +- [notify-on-record-change](notify-on-recod-change) - Send Slack message with details about line count changes for a file compared to an S3 object's metadata `recordcount`. + If the S3 object's metadata does not have `recordcount`, then will attempt to download S3 object to count lines locally, which only supports `xz` compressed S3 objects. +- [notify-slack](notify-slack) - Send message or file to Slack +- [s3-object-exists](s3-object-exists) - Used to prevent 404 errors during S3 file comparisons in the notify-* scripts +- [trigger](trigger) - Triggers downstream GitHub Actions via the GitHub API using repository_dispatch events. +- [trigger-on-new-data](trigger-on-new-data) - Triggers downstream GitHub Actions if the provided `upload-to-s3` outputs do not contain the `identical_file_message` + A hacky way to ensure that we only trigger downstream phylogenetic builds if the S3 objects have been updated. + +NCBI interaction scripts that are useful for fetching public metadata and sequences. + +- [fetch-from-ncbi-entrez](fetch-from-ncbi-entrez) - Fetch metadata and nucleotide sequences from [NCBI Entrez](https://www.ncbi.nlm.nih.gov/books/NBK25501/) and output to a GenBank file. + Useful for pathogens with metadata and annotations in custom fields that are not part of the standard [NCBI Datasets](https://www.ncbi.nlm.nih.gov/datasets/) outputs. + +Historically, some pathogen repos used the undocumented NCBI Virus API through [fetch-from-ncbi-virus](https://github.com/nextstrain/ingest/blob/c97df238518171c2b1574bec0349a55855d1e7a7/fetch-from-ncbi-virus) to fetch data. However we've opted to drop the NCBI Virus scripts due to https://github.com/nextstrain/ingest/issues/18. + +Potential Nextstrain CLI scripts + +- [sha256sum](sha256sum) - Used to check if files are identical in upload-to-s3 and download-from-s3 scripts. +- [cloudfront-invalidate](cloudfront-invalidate) - CloudFront invalidation is already supported in the [nextstrain remote command for S3 files](https://github.com/nextstrain/cli/blob/a5dda9c0579ece7acbd8e2c32a4bbe95df7c0bce/nextstrain/cli/remote/s3.py#L104). + This exists as a separate script to support CloudFront invalidation when using the upload-to-s3 script. +- [upload-to-s3](upload-to-s3) - Upload file to AWS S3 bucket with compression based on file extension in S3 URL. + Skips upload if the local file's hash is identical to the S3 object's metadata `sha256sum`. + Adds the following user defined metadata to uploaded S3 object: + - `sha256sum` - hash of the file generated by [sha256sum](sha256sum) + - `recordcount` - the line count of the file +- [download-from-s3](download-from-s3) - Download file from AWS S3 bucket with decompression based on file extension in S3 URL. + Skips download if the local file already exists and has a hash identical to the S3 object's metadata `sha256sum`. 
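Both halves of that pair hinge on the `sha256sum` metadata comparison. Stripped of the compression handling, record counting, and Slack notifications, the skip logic of `upload-to-s3`/`download-from-s3` reduces to roughly the following (placeholder bucket, key, and file names; coreutils `sha256sum` stands in for the vendored wrapper of the same name):

```
src="data/metadata.tsv"        # placeholder local file
bucket="example-bucket"        # placeholder S3 bucket
key="files/metadata.tsv"       # placeholder S3 key

# Hash the local file and fetch the hash stored as S3 object metadata.
src_hash="$(sha256sum < "$src" | cut -d' ' -f1)"
dst_hash="$(aws s3api head-object --bucket "$bucket" --key "$key" \
              --query Metadata.sha256sum --output text 2>/dev/null || echo none)"

if [[ "$src_hash" == "$dst_hash" ]]; then
    echo "Files are identical, skipping upload"
else
    # Store the hash (and a record count) as object metadata for the next comparison.
    aws s3 cp --no-progress "$src" "s3://$bucket/$key" \
        --metadata sha256sum="$src_hash",recordcount="$(wc -l < "$src")"
fi
```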
+ +Potential augur curate scripts + +- [apply-geolocation-rules](apply-geolocation-rules) - Applies user curated geolocation rules to NDJSON records +- [merge-user-metadata](merge-user-metadata) - Merges user annotations with NDJSON records +- [transform-authors](transform-authors) - Abbreviates full author lists to ' et al.' +- [transform-field-names](transform-field-names) - Rename fields of NDJSON records +- [transform-genbank-location](transform-genbank-location) - Parses `location` field with the expected pattern `"[:][, ]"` based on [GenBank's country field](https://www.ncbi.nlm.nih.gov/genbank/collab/country/) +- [transform-strain-names](transform-strain-names) - Ordered search for strain names across several fields. + +## Software requirements + +Some scripts may require Bash ≥4. If you are running these scripts on macOS, the builtin Bash (`/bin/bash`) does not meet this requirement. You can install [Homebrew's Bash](https://formulae.brew.sh/formula/bash) which is more up to date. + +## Testing + +Most scripts are untested within this repo, relying on "testing in production". That is the only practical testing option for some scripts such as the ones interacting with S3 and Slack. + +For more locally testable scripts, Cram-style functional tests live in `tests` and are run as part of CI. To run these locally, + +1. Download Cram: `pip install cram` +2. Run the tests: `cram tests/` diff --git a/ingest/vendored/apply-geolocation-rules b/ingest/vendored/apply-geolocation-rules new file mode 100755 index 0000000..776cf16 --- /dev/null +++ b/ingest/vendored/apply-geolocation-rules @@ -0,0 +1,234 @@ +#!/usr/bin/env python3 +""" +Applies user curated geolocation rules to the geolocation fields in the NDJSON +records from stdin. The modified records are output to stdout. This does not do +any additional transformations on top of the user curations. +""" +import argparse +import json +from collections import defaultdict +from sys import exit, stderr, stdin, stdout + + +class CyclicGeolocationRulesError(Exception): + pass + + +def load_geolocation_rules(geolocation_rules_file): + """ + Loads the geolocation rules from the provided *geolocation_rules_file*. 
+ Returns the rules as a dict: + { + regions: { + countries: { + divisions: { + locations: corrected_geolocations_tuple + } + } + } + } + """ + geolocation_rules = defaultdict(lambda: defaultdict(lambda: defaultdict(dict))) + with open(geolocation_rules_file, 'r') as rules_fh: + for line in rules_fh: + # ignore comments + if line.strip()=="" or line.lstrip()[0] == '#': + continue + + row = line.strip('\n').split('\t') + # Skip lines that cannot be split into raw and annotated geolocations + if len(row) != 2: + print( + f"WARNING: Could not decode geolocation rule {line!r}.", + "Please make sure rules are formatted as", + "'region/country/division/locationregion/country/division/location'.", + file=stderr) + continue + + # remove trailing comments + row[-1] = row[-1].partition('#')[0].rstrip() + raw , annot = tuple( row[0].split('/') ) , tuple( row[1].split('/') ) + + # Skip lines where raw or annotated geolocations cannot be split into 4 fields + if len(raw) != 4: + print( + f"WARNING: Could not decode the raw geolocation {row[0]!r}.", + "Please make sure it is formatted as 'region/country/division/location'.", + file=stderr + ) + continue + + if len(annot) != 4: + print( + f"WARNING: Could not decode the annotated geolocation {row[1]!r}.", + "Please make sure it is formatted as 'region/country/division/location'.", + file=stderr + ) + continue + + + geolocation_rules[raw[0]][raw[1]][raw[2]][raw[3]] = annot + + return geolocation_rules + + +def get_annotated_geolocation(geolocation_rules, raw_geolocation, rule_traversal = None): + """ + Gets the annotated geolocation for the *raw_geolocation* in the provided + *geolocation_rules*. + + Recursively traverses the *geolocation_rules* until we get the annotated + geolocation, which must be a Tuple. Returns `None` if there are no + applicable rules for the provided *raw_geolocation*. + + Rules are applied in the order of region, country, division, location. + First checks the provided raw values for geolocation fields, then if there + are not matches, tries to use general rules marked with '*'. + """ + # Always instantiate the rule traversal as an empty list if not provided, + # e.g. 
the first call of this recursive function + if rule_traversal is None: + rule_traversal = [] + + current_rules = geolocation_rules + # Traverse the geolocation rules based using the rule_traversal values + for field_value in rule_traversal: + current_rules = current_rules.get(field_value) + # If we hit `None`, then we know there are no matching rules, so stop the rule traversal + if current_rules is None: + break + + # We've found the tuple of the annotated geolocation + if isinstance(current_rules, tuple): + return current_rules + + # We've reach the next level of geolocation rules, + # so try to traverse the rules with the next target in raw_geolocation + if isinstance(current_rules, dict): + next_traversal_target = raw_geolocation[len(rule_traversal)] + rule_traversal.append(next_traversal_target) + return get_annotated_geolocation(geolocation_rules, raw_geolocation, rule_traversal) + + # We did not find any matching rule for the last traversal target + if current_rules is None: + # If we've used all general rules and we still haven't found a match, + # then there are no applicable rules for this geolocation + if all(value == '*' for value in rule_traversal): + return None + + # If we failed to find matching rule with a general rule as the last + # traversal target, then delete all trailing '*'s to reset rule_traversal + # to end with the last index that is currently NOT a '*' + # [A, *, B, *] => [A, *, B] + # [A, B, *, *] => [A, B] + # [A, *, *, *] => [A] + if rule_traversal[-1] == '*': + # Find the index of the first of the consecutive '*' from the + # end of the rule_traversal + # [A, *, B, *] => first_consecutive_general_rule_index = 3 + # [A, B, *, *] => first_consecutive_general_rule_index = 2 + # [A, *, *, *] => first_consecutive_general_rule_index = 1 + for index, field_value in reversed(list(enumerate(rule_traversal))): + if field_value == '*': + first_consecutive_general_rule_index = index + else: + break + + rule_traversal = rule_traversal[:first_consecutive_general_rule_index] + + # Set the final value to '*' in hopes that by moving to a general rule, + # we can find a matching rule. + rule_traversal[-1] = '*' + + return get_annotated_geolocation(geolocation_rules, raw_geolocation, rule_traversal) + + +def transform_geolocations(geolocation_rules, geolocation): + """ + Transform the provided *geolocation* by looking it up in the provided + *geolocation_rules*. + + This will use all rules that apply to the geolocation and rules will + be applied in the order of region, country, division, location. + + Returns the original geolocation if no geolocation rules apply. + + Raises a `CyclicGeolocationRulesError` if more than 1000 rules have + been applied to the raw geolocation. + """ + transformed_values = geolocation + rules_applied = 0 + continue_to_apply = True + + while continue_to_apply: + annotated_values = get_annotated_geolocation(geolocation_rules, transformed_values) + + # Stop applying rules if no annotated values were found + if annotated_values is None: + continue_to_apply = False + else: + rules_applied += 1 + + if rules_applied > 1000: + raise CyclicGeolocationRulesError( + "ERROR: More than 1000 geolocation rules applied on the same entry {geolocation!r}." 
+ ) + + # Create a new list of values for comparison to previous values + new_values = list(transformed_values) + for index, value in enumerate(annotated_values): + # Keep original value if annotated value is '*' + if value != '*': + new_values[index] = value + + # Stop applying rules if this rule did not change the values, + # since this means we've reach rules with '*' that no longer change values + if new_values == transformed_values: + continue_to_apply = False + + transformed_values = new_values + + return transformed_values + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + parser.add_argument("--region-field", default="region", + help="Field that contains regions in NDJSON records.") + parser.add_argument("--country-field", default="country", + help="Field that contains countries in NDJSON records.") + parser.add_argument("--division-field", default="division", + help="Field that contains divisions in NDJSON records.") + parser.add_argument("--location-field", default="location", + help="Field that contains location in NDJSON records.") + parser.add_argument("--geolocation-rules", metavar="TSV", required=True, + help="TSV file of geolocation rules with the format: " + + "'' where the raw and annotated geolocations " + + "are formatted as '///'. " + + "If creating a general rule, then the raw field value can be substituted with '*'." + + "Lines starting with '#' will be ignored as comments." + + "Trailing '#' will be ignored as comments.") + + args = parser.parse_args() + + location_fields = [args.region_field, args.country_field, args.division_field, args.location_field] + + geolocation_rules = load_geolocation_rules(args.geolocation_rules) + + for record in stdin: + record = json.loads(record) + + try: + annotated_values = transform_geolocations(geolocation_rules, [record.get(field, '') for field in location_fields]) + except CyclicGeolocationRulesError as e: + print(e, file=stderr) + exit(1) + + for index, field in enumerate(location_fields): + record[field] = annotated_values[index] + + json.dump(record, stdout, allow_nan=False, indent=None, separators=',:') + print() diff --git a/ingest/vendored/cloudfront-invalidate b/ingest/vendored/cloudfront-invalidate new file mode 100755 index 0000000..dbea398 --- /dev/null +++ b/ingest/vendored/cloudfront-invalidate @@ -0,0 +1,42 @@ +#!/usr/bin/env bash +# Originally from @tsibley's gist: https://gist.github.com/tsibley/a66262d341dedbea39b02f27e2837ea8 +set -euo pipefail + +main() { + local domain="$1" + shift + local paths=("$@") + local distribution invalidation + + echo "-> Finding CloudFront distribution" + distribution=$( + aws cloudfront list-distributions \ + --query "DistributionList.Items[?contains(Aliases.Items, \`$domain\`)] | [0].Id" \ + --output text + ) + + if [[ -z $distribution || $distribution == None ]]; then + exec >&2 + echo "Unable to find CloudFront distribution id for $domain" + echo + echo "Are your AWS CLI credentials for the right account?" + exit 1 + fi + + echo "-> Creating CloudFront invalidation for distribution $distribution" + invalidation=$( + aws cloudfront create-invalidation \ + --distribution-id "$distribution" \ + --paths "${paths[@]}" \ + --query Invalidation.Id \ + --output text + ) + + echo "-> Waiting for CloudFront invalidation $invalidation to complete" + echo " Ctrl-C to stop waiting." 
+ aws cloudfront wait invalidation-completed \ + --distribution-id "$distribution" \ + --id "$invalidation" +} + +main "$@" diff --git a/ingest/vendored/download-from-s3 b/ingest/vendored/download-from-s3 new file mode 100755 index 0000000..4981186 --- /dev/null +++ b/ingest/vendored/download-from-s3 @@ -0,0 +1,48 @@ +#!/usr/bin/env bash +set -euo pipefail + +bin="$(dirname "$0")" + +main() { + local src="${1:?A source s3:// URL is required as the first argument.}" + local dst="${2:?A destination file path is required as the second argument.}" + # How many lines to subsample to. 0 means no subsampling. Optional. + # It is not advised to use this for actual subsampling! This is intended to be + # used for debugging workflows with large datasets such as ncov-ingest as + # described in https://github.com/nextstrain/ncov-ingest/pull/367 + + # Uses `tsv-sample` to subsample, so it will not work as expected with files + # that have a single record split across multiple lines (i.e. FASTA sequences) + local n="${3:-0}" + + local s3path="${src#s3://}" + local bucket="${s3path%%/*}" + local key="${s3path#*/}" + + local src_hash dst_hash no_hash=0000000000000000000000000000000000000000000000000000000000000000 + dst_hash="$("$bin/sha256sum" < "$dst" || true)" + src_hash="$(aws s3api head-object --bucket "$bucket" --key "$key" --query Metadata.sha256sum --output text 2>/dev/null || echo "$no_hash")" + + echo "[ INFO] Downloading $src → $dst" + if [[ $src_hash != "$dst_hash" ]]; then + aws s3 cp --no-progress "$src" - | + if [[ "$src" == *.gz ]]; then + gunzip -cfq + elif [[ "$src" == *.xz ]]; then + xz -T0 -dcq + elif [[ "$src" == *.zst ]]; then + zstd -T0 -dcq + else + cat + fi | + if [[ "$n" -gt 0 ]]; then + tsv-sample -H -i -n "$n" + else + cat + fi >"$dst" + else + echo "[ INFO] Files are identical, skipping download" + fi +} + +main "$@" diff --git a/ingest/vendored/fetch-from-ncbi-entrez b/ingest/vendored/fetch-from-ncbi-entrez new file mode 100755 index 0000000..194a0c8 --- /dev/null +++ b/ingest/vendored/fetch-from-ncbi-entrez @@ -0,0 +1,70 @@ +#!/usr/bin/env python3 +""" +Fetch metadata and nucleotide sequences from NCBI Entrez and output to a GenBank file. +""" +import json +import argparse +from Bio import SeqIO, Entrez + +# To use the efetch API, the docs indicate only around 10,000 records should be fetched per request +# https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch +# However, in my testing with HepB, the max records returned was 9,999 +# - Jover, 16 August 2023 +BATCH_SIZE = 9999 + +Entrez.email = "hello@nextstrain.org" + +def parse_args(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument('--term', required=True, type=str, + help='Genbank search term. Replace spaces with "+", e.g. "Hepatitis+B+virus[All+Fields]complete+genome[All+Fields]"') + parser.add_argument('--output', required=True, type=str, help='Output file (Genbank)') + return parser.parse_args() + + +def get_esearch_history(term): + """ + Search for the provided *term* via ESearch and store the results using the + Entrez history server.¹ + + Returns the total count of returned records, query key, and web env needed + to access the records from the server. 
+ + ¹ https://www.ncbi.nlm.nih.gov/books/NBK25497/#chapter2.Using_the_Entrez_History_Server + """ + handle = Entrez.esearch(db="nucleotide", term=term, retmode="json", usehistory="y", retmax=0) + esearch_result = json.loads(handle.read())['esearchresult'] + print(f"Search term {term!r} returned {esearch_result['count']} IDs.") + return { + "count": int(esearch_result["count"]), + "query_key": esearch_result["querykey"], + "web_env": esearch_result["webenv"] + } + + +def fetch_from_esearch_history(count, query_key, web_env): + """ + Fetch records in batches from Entrez history server using the provided + *query_key* and *web_env* and yields them as a BioPython SeqRecord iterator. + """ + print(f"Fetching GenBank records in batches of n={BATCH_SIZE}") + + for start in range(0, count, BATCH_SIZE): + handle = Entrez.efetch( + db="nucleotide", + query_key=query_key, + webenv=web_env, + retstart=start, + retmax=BATCH_SIZE, + rettype="gb", + retmode="text") + + yield SeqIO.parse(handle, "genbank") + + +if __name__=="__main__": + args = parse_args() + + with open(args.output, "w") as output_handle: + for batch_results in fetch_from_esearch_history(**get_esearch_history(args.term)): + SeqIO.write(batch_results, output_handle, "genbank") diff --git a/ingest/vendored/merge-user-metadata b/ingest/vendored/merge-user-metadata new file mode 100755 index 0000000..341c2df --- /dev/null +++ b/ingest/vendored/merge-user-metadata @@ -0,0 +1,55 @@ +#!/usr/bin/env python3 +""" +Merges user curated annotations with the NDJSON records from stdin, with the user +curations overwriting the existing fields. The modified records are output +to stdout. This does not do any additional transformations on top of the user +curations. +""" +import argparse +import csv +import json +from collections import defaultdict +from sys import exit, stdin, stderr, stdout + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + parser.add_argument("--annotations", metavar="TSV", required=True, + help="Manually curated annotations TSV file. " + + "The TSV should not have a header and should have exactly three columns: " + + "id to match existing metadata, field name, and field value. " + + "If there are multiple annotations for the same id and field, then the last value is used. " + + "Lines starting with '#' are treated as comments. 
" + + "Any '#' after the field value are treated as comments.") + parser.add_argument("--id-field", default="accession", + help="The ID field in the metadata to use to merge with the annotations.") + + args = parser.parse_args() + + annotations = defaultdict(dict) + with open(args.annotations, 'r') as annotations_fh: + csv_reader = csv.reader(annotations_fh, delimiter='\t') + for row in csv_reader: + if not row or row[0].lstrip()[0] == '#': + continue + elif len(row) != 3: + print("WARNING: Could not decode annotation line " + "\t".join(row), file=stderr) + continue + id, field, value = row + annotations[id][field] = value.partition('#')[0].rstrip() + + for record in stdin: + record = json.loads(record) + + record_id = record.get(args.id_field) + if record_id is None: + print(f"ERROR: ID field {args.id_field!r} does not exist in record", file=stderr) + exit(1) + + record.update(annotations.get(record_id, {})) + + json.dump(record, stdout, allow_nan=False, indent=None, separators=',:') + print() diff --git a/ingest/vendored/notify-on-diff b/ingest/vendored/notify-on-diff new file mode 100755 index 0000000..ddbe7da --- /dev/null +++ b/ingest/vendored/notify-on-diff @@ -0,0 +1,35 @@ +#!/usr/bin/env bash + +set -euo pipefail + +: "${SLACK_TOKEN:?The SLACK_TOKEN environment variable is required.}" +: "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" + +bin="$(dirname "$0")" + +src="${1:?A source file is required as the first argument.}" +dst="${2:?A destination s3:// URL is required as the second argument.}" + +dst_local="$(mktemp -t s3-file-XXXXXX)" +diff="$(mktemp -t diff-XXXXXX)" + +trap "rm -f '$dst_local' '$diff'" EXIT + +# if the file is not already present, just exit +"$bin"/s3-object-exists "$dst" || exit 0 + +"$bin"/download-from-s3 "$dst" "$dst_local" + +# diff's exit code is 0 for no differences, 1 for differences found, and >1 for errors +diff_exit_code=0 +diff "$dst_local" "$src" > "$diff" || diff_exit_code=$? + +if [[ "$diff_exit_code" -eq 1 ]]; then + echo "Notifying Slack about diff." + "$bin"/notify-slack --upload "$src.diff" < "$diff" +elif [[ "$diff_exit_code" -gt 1 ]]; then + echo "Notifying Slack about diff failure" + "$bin"/notify-slack "Diff failed for $src" +else + echo "No change in $src." +fi diff --git a/ingest/vendored/notify-on-job-fail b/ingest/vendored/notify-on-job-fail new file mode 100755 index 0000000..7dd2409 --- /dev/null +++ b/ingest/vendored/notify-on-job-fail @@ -0,0 +1,23 @@ +#!/usr/bin/env bash +set -euo pipefail + +: "${SLACK_TOKEN:?The SLACK_TOKEN environment variable is required.}" +: "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" + +: "${AWS_BATCH_JOB_ID:=}" +: "${GITHUB_RUN_ID:=}" + +bin="$(dirname "$0")" +job_name="${1:?A job name is required as the first argument}" +github_repo="${2:?A GitHub repository with owner and repository name is required as the second argument}" + +echo "Notifying Slack about failed ${job_name} job." +message="❌ ${job_name} job has FAILED 😞 " + +if [[ -n "${AWS_BATCH_JOB_ID}" ]]; then + message+="See AWS Batch job \`${AWS_BATCH_JOB_ID}\` () for error details. " +elif [[ -n "${GITHUB_RUN_ID}" ]]; then + message+="See GitHub Action for error details. 
" +fi + +"$bin"/notify-slack "$message" diff --git a/ingest/vendored/notify-on-job-start b/ingest/vendored/notify-on-job-start new file mode 100755 index 0000000..1c8ce7d --- /dev/null +++ b/ingest/vendored/notify-on-job-start @@ -0,0 +1,27 @@ +#!/usr/bin/env bash +set -euo pipefail + +: "${SLACK_TOKEN:?The SLACK_TOKEN environment variable is required.}" +: "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" + +: "${AWS_BATCH_JOB_ID:=}" +: "${GITHUB_RUN_ID:=}" + +bin="$(dirname "$0")" +job_name="${1:?A job name is required as the first argument}" +github_repo="${2:?A GitHub repository with owner and repository name is required as the second argument}" +build_dir="${3:-ingest}" + +echo "Notifying Slack about started ${job_name} job." +message="${job_name} job has started." + +if [[ -n "${GITHUB_RUN_ID}" ]]; then + message+=" The job was submitted by GitHub Action ." +fi + +if [[ -n "${AWS_BATCH_JOB_ID}" ]]; then + message+=" The job was launched as AWS Batch job \`${AWS_BATCH_JOB_ID}\` ()." + message+=" Follow along in your local clone of ${github_repo} with: "'```'"nextstrain build --aws-batch --no-download --attach ${AWS_BATCH_JOB_ID} ${build_dir}"'```' +fi + +"$bin"/notify-slack "$message" diff --git a/ingest/vendored/notify-on-record-change b/ingest/vendored/notify-on-record-change new file mode 100755 index 0000000..f424252 --- /dev/null +++ b/ingest/vendored/notify-on-record-change @@ -0,0 +1,53 @@ +#!/usr/bin/env bash +set -euo pipefail + +: "${SLACK_TOKEN:?The SLACK_TOKEN environment variable is required.}" +: "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" + +bin="$(dirname "$0")" + +src="${1:?A source ndjson file is required as the first argument.}" +dst="${2:?A destination ndjson s3:// URL is required as the second argument.}" +source_name=${3:?A record source name is required as the third argument.} + +# if the file is not already present, just exit +"$bin"/s3-object-exists "$dst" || exit 0 + +s3path="${dst#s3://}" +bucket="${s3path%%/*}" +key="${s3path#*/}" + +src_record_count="$(wc -l < "$src")" + +# Try getting record count from S3 object metadata +dst_record_count="$(aws s3api head-object --bucket "$bucket" --key "$key" --query "Metadata.recordcount || ''" --output text 2>/dev/null || true)" +if [[ -z "$dst_record_count" ]]; then + # This object doesn't have the record count stored as metadata + # We have to download it and count the lines locally + dst_record_count="$(wc -l < <(aws s3 cp --no-progress "$dst" - | xz -T0 -dcfq))" +fi + +added_records="$(( src_record_count - dst_record_count ))" + +printf "%'4d %s\n" "$src_record_count" "$src" +printf "%'4d %s\n" "$dst_record_count" "$dst" +printf "%'4d added records\n" "$added_records" + +slack_message="" + +if [[ $added_records -gt 0 ]]; then + echo "Notifying Slack about added records (n=$added_records)" + slack_message="📈 New records (n=$added_records) found on $source_name." + +elif [[ $added_records -lt 0 ]]; then + echo "Notifying Slack about fewer records (n=$added_records)" + slack_message="📉 Fewer records (n=$added_records) found on $source_name." + +else + echo "Notifying Slack about same number of records" + slack_message="⛔ No new records found on $source_name." 
+fi + +slack_message+=" (Total record count: $src_record_count)" + +"$bin"/notify-slack "$slack_message" diff --git a/ingest/vendored/notify-slack b/ingest/vendored/notify-slack new file mode 100755 index 0000000..a343435 --- /dev/null +++ b/ingest/vendored/notify-slack @@ -0,0 +1,56 @@ +#!/usr/bin/env bash +set -euo pipefail + +: "${SLACK_TOKEN:?The SLACK_TOKEN environment variable is required.}" +: "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" + +upload=0 +output=/dev/null +thread_ts="" +broadcast=0 +args=() + +for arg; do + case "$arg" in + --upload) + upload=1;; + --output=*) + output="${arg#*=}";; + --thread-ts=*) + thread_ts="${arg#*=}";; + --broadcast) + broadcast=1;; + *) + args+=("$arg");; + esac +done + +set -- "${args[@]}" + +text="${1:?Some message text is required.}" + +if [[ "$upload" == 1 ]]; then + echo "Uploading data to Slack with the message: $text" + curl https://slack.com/api/files.upload \ + --header "Authorization: Bearer $SLACK_TOKEN" \ + --form-string channels="$SLACK_CHANNELS" \ + --form-string title="$text" \ + --form-string filename="$text" \ + --form-string thread_ts="$thread_ts" \ + --form file=@/dev/stdin \ + --form filetype=text \ + --fail --silent --show-error \ + --http1.1 \ + --output "$output" +else + echo "Posting Slack message: $text" + curl https://slack.com/api/chat.postMessage \ + --header "Authorization: Bearer $SLACK_TOKEN" \ + --form-string channel="$SLACK_CHANNELS" \ + --form-string text="$text" \ + --form-string thread_ts="$thread_ts" \ + --form-string reply_broadcast="$broadcast" \ + --fail --silent --show-error \ + --http1.1 \ + --output "$output" +fi diff --git a/ingest/vendored/s3-object-exists b/ingest/vendored/s3-object-exists new file mode 100755 index 0000000..679c20a --- /dev/null +++ b/ingest/vendored/s3-object-exists @@ -0,0 +1,8 @@ +#!/usr/bin/env bash +set -euo pipefail + +url="${1#s3://}" +bucket="${url%%/*}" +key="${url#*/}" + +aws s3api head-object --bucket "$bucket" --key "$key" &>/dev/null diff --git a/ingest/vendored/sha256sum b/ingest/vendored/sha256sum new file mode 100755 index 0000000..32d7ef8 --- /dev/null +++ b/ingest/vendored/sha256sum @@ -0,0 +1,15 @@ +#!/usr/bin/env python3 +""" +Portable sha256sum utility. +""" +from hashlib import sha256 +from sys import stdin + +chunk_size = 5 * 1024**2 # 5 MiB + +h = sha256() + +for chunk in iter(lambda: stdin.buffer.read(chunk_size), b""): + h.update(chunk) + +print(h.hexdigest()) diff --git a/ingest/vendored/tests/transform-genbank-location/transform-genbank-location.t b/ingest/vendored/tests/transform-genbank-location/transform-genbank-location.t new file mode 100644 index 0000000..a835455 --- /dev/null +++ b/ingest/vendored/tests/transform-genbank-location/transform-genbank-location.t @@ -0,0 +1,30 @@ +Verify behavior of `transform-genbank-location` around prescence/abscence of +`database` and `location` fields. + +If `location` field is present, transform it. + + $ echo '{"database":"GenBank", "location": "USA:Oregon, Salem" }' \ + > | $TESTDIR/../../transform-genbank-location + {"database":"GenBank","location":"Salem","country":"USA","division":"Oregon"} + +If `database` field is absent, complain. + + $ echo '{"location": "USA:Oregon, Salem" }' \ + > | $TESTDIR/../../transform-genbank-location + Record must contain `database` field to use `transform-genbank-location.` + {"location":"USA:Oregon, Salem"} + +If `database` field has unsupported value, complain. 
+ + $ echo '{"database": "unsupported", "location": "USA:Oregon, Salem" }' \ + > | $TESTDIR/../../transform-genbank-location + Database value of unsupported not supported for `transform-genbank-location`; must be "GenBank" or "RefSeq". + {"database":"unsupported","location":"USA:Oregon, Salem"} + + +If `location` field is absent, complain. + + $ echo '{"database": "GenBank" }' \ + > | $TESTDIR/../../transform-genbank-location + `transform-genbank-location` requires a `location` field; this record does not have one. + {"database":"GenBank"} diff --git a/ingest/vendored/tests/transform-strain-names/transform-strain-names.t b/ingest/vendored/tests/transform-strain-names/transform-strain-names.t new file mode 100644 index 0000000..1c05df7 --- /dev/null +++ b/ingest/vendored/tests/transform-strain-names/transform-strain-names.t @@ -0,0 +1,17 @@ +Look for strain name in "strain" or a list of backup fields. + +If strain entry exists, do not do anything. + + $ echo '{"strain": "i/am/a/strain", "strain_s": "other"}' \ + > | $TESTDIR/../../transform-strain-names \ + > --strain-regex '^.+$' \ + > --backup-fields strain_s accession + {"strain":"i/am/a/strain","strain_s":"other"} + +If strain entry does not exists, search the backup fields + + $ echo '{"strain_s": "other"}' \ + > | $TESTDIR/../../transform-strain-names \ + > --strain-regex '^.+$' \ + > --backup-fields accession strain_s + {"strain_s":"other","strain":"other"} \ No newline at end of file diff --git a/ingest/vendored/transform-authors b/ingest/vendored/transform-authors new file mode 100755 index 0000000..0bade20 --- /dev/null +++ b/ingest/vendored/transform-authors @@ -0,0 +1,66 @@ +#!/usr/bin/env python3 +""" +Abbreviates a full list of authors to be ' et al.' of the NDJSON +record from stdin and outputs modified records to stdout. + +Note: This is a "best effort" approach and can potentially mangle the author name. +""" +import argparse +import json +import re +from sys import stderr, stdin, stdout + + +def parse_authors(record: dict, authors_field: str, default_value: str, + index: int, abbr_authors_field: str = None) -> dict: + # Strip and normalize whitespace + new_authors = re.sub(r'\s+', ' ', record[authors_field]) + + if new_authors == "": + new_authors = default_value + else: + # Split authors list on comma/semicolon + # OR "and"/"&" with at least one space before and after + new_authors = re.split(r'(?:\s*[,,;;]\s*|\s+(?:and|&)\s+)', new_authors)[0] + + # if it does not already end with " et al.", add it + if not new_authors.strip('. ').endswith(" et al"): + new_authors += ' et al' + + if abbr_authors_field: + if record.get(abbr_authors_field): + print( + f"WARNING: the {abbr_authors_field!r} field already exists", + f"in record {index} and will be overwritten!", + file=stderr + ) + + record[abbr_authors_field] = new_authors + else: + record[authors_field] = new_authors + + return record + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + parser.add_argument("--authors-field", default="authors", + help="The field containing list of authors.") + parser.add_argument("--default-value", default="?", + help="Default value to use if authors list is empty.") + parser.add_argument("--abbr-authors-field", + help="The field for the generated abbreviated authors. 
" + + "If not provided, the original authors field will be modified.") + + args = parser.parse_args() + + for index, record in enumerate(stdin): + record = json.loads(record) + + parse_authors(record, args.authors_field, args.default_value, index, args.abbr_authors_field) + + json.dump(record, stdout, allow_nan=False, indent=None, separators=',:') + print() diff --git a/ingest/vendored/transform-field-names b/ingest/vendored/transform-field-names new file mode 100755 index 0000000..fde223f --- /dev/null +++ b/ingest/vendored/transform-field-names @@ -0,0 +1,48 @@ +#!/usr/bin/env python3 +""" +Renames fields of the NDJSON record from stdin and outputs modified records +to stdout. +""" +import argparse +import json +from sys import stderr, stdin, stdout + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + parser.add_argument("--field-map", nargs="+", + help="Fields names in the NDJSON record mapped to new field names, " + + "formatted as '{old_field_name}={new_field_name}'. " + + "If the old field does not exist in record, the new field will be added with an empty string value." + + "If the new field already exists in record, then the renaming of the old field will be skipped.") + parser.add_argument("--force", action="store_true", + help="Force renaming of old field even if the new field already exists. " + + "Please keep in mind this will overwrite the value of the new field.") + + args = parser.parse_args() + + field_map = {} + for field in args.field_map: + old_name, new_name = field.split('=') + field_map[old_name] = new_name + + for record in stdin: + record = json.loads(record) + + for old_field, new_field in field_map.items(): + + if record.get(new_field) and not args.force: + print( + f"WARNING: skipping rename of {old_field} because record", + f"already has a field named {new_field}.", + file=stderr + ) + continue + + record[new_field] = record.pop(old_field, '') + + json.dump(record, stdout, allow_nan=False, indent=None, separators=',:') + print() diff --git a/ingest/vendored/transform-genbank-location b/ingest/vendored/transform-genbank-location new file mode 100755 index 0000000..010955a --- /dev/null +++ b/ingest/vendored/transform-genbank-location @@ -0,0 +1,59 @@ +#!/usr/bin/env python3 +""" +Parses GenBank's 'location' field of the NDJSON record from stdin to 3 separate +fields: 'country', 'division', and 'location'. Checks that a record is from +GenBank by verifying that the 'database' field has a value of "GenBank" or "RefSeq". + +Outputs the modified record to stdout. 
+""" +import json +from sys import stdin, stderr, stdout + + +def parse_location(record: dict) -> dict: + # Expected pattern for the location field is "[:][, ]" + # See GenBank docs for their "country" field: + # https://www.ncbi.nlm.nih.gov/genbank/collab/country/ + location_field = record.get("location", "") + if not location_field: + print( + "`transform-genbank-location` requires a `location` field; this record does not have one.", + file=stderr, + ) + # bail early because we're not gonna make any changes + return record + + geographic_data = location_field.split(':') + + country = geographic_data[0] + division = '' + location = '' + + if len(geographic_data) == 2: + division , _ , location = geographic_data[1].partition(',') + + record['country'] = country.strip() + record['division'] = division.strip() + record['location'] = location.strip() + + return record + + +if __name__ == '__main__': + + for record in stdin: + record = json.loads(record) + + database = record.get('database', '') + if database in {'GenBank', 'RefSeq'}: + parse_location(record) + else: + if database: + error_msg = f"""Database value of {database} not supported for `transform-genbank-location`; must be "GenBank" or "RefSeq".""" + else: + error_msg = "Record must contain `database` field to use `transform-genbank-location.`" + + print(error_msg, file=stderr) + + json.dump(record, stdout, allow_nan=False, indent=None, separators=',:') + print() diff --git a/ingest/vendored/transform-strain-names b/ingest/vendored/transform-strain-names new file mode 100755 index 0000000..d86c0e4 --- /dev/null +++ b/ingest/vendored/transform-strain-names @@ -0,0 +1,50 @@ +#!/usr/bin/env python3 +""" +Verifies strain name pattern in the 'strain' field of the NDJSON record from +stdin. Adds a 'strain' field to the record if it does not already exist. + +Outputs the modified records to stdout. +""" +import argparse +import json +import re +from sys import stderr, stdin, stdout + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + parser.add_argument("--strain-regex", default="^.+$", + help="Regex pattern for strain names. " + + "Strain names that do not match the pattern will be dropped.") + parser.add_argument("--backup-fields", nargs="*", + help="List of backup fields to use as strain name if the value in 'strain' " + + "does not match the strain regex pattern. 
" + + "If multiple fields are provided, will use the first field that has a non-empty string.") + + args = parser.parse_args() + + strain_name_pattern = re.compile(args.strain_regex) + + for index, record in enumerate(stdin): + record = json.loads(record) + + # Verify strain name matches the strain regex pattern + if strain_name_pattern.match(record.get('strain', '')) is None: + # Default to empty string if not matching pattern + record['strain'] = '' + # Use non-empty value of backup fields if provided + if args.backup_fields: + for field in args.backup_fields: + if record.get(field): + record['strain'] = str(record[field]) + break + + if record['strain'] == '': + print(f"WARNING: Record number {index} has an empty string as the strain name.", file=stderr) + + + json.dump(record, stdout, allow_nan=False, indent=None, separators=',:') + print() diff --git a/ingest/vendored/trigger b/ingest/vendored/trigger new file mode 100755 index 0000000..586f9cc --- /dev/null +++ b/ingest/vendored/trigger @@ -0,0 +1,56 @@ +#!/usr/bin/env bash +set -euo pipefail + +: "${PAT_GITHUB_DISPATCH:=}" + +github_repo="${1:?A GitHub repository with owner and repository name is required as the first argument.}" +event_type="${2:?An event type is required as the second argument.}" +shift 2 + +if [[ $# -eq 0 && -z $PAT_GITHUB_DISPATCH ]]; then + cat >&2 <<. +You must specify options to curl for your GitHub credentials. For example, you +can specify your GitHub username, and will be prompted for your password: + + $0 $github_repo $event_type --user + +Be sure to enter a personal access token¹ as your password since GitHub has +discontinued password authentication to the API starting on November 13, 2020². + +You can also store your credentials or a personal access token in a netrc +file³: + + machine api.github.com + login + password + +and then tell curl to use it: + + $0 $github_repo $event_type --netrc + +which will then not require you to type your password every time. + +¹ https://help.github.com/en/github/authenticating-to-github/creating-a-personal-access-token-for-the-command-line +² https://docs.github.com/en/rest/overview/other-authentication-methods#via-username-and-password +³ https://ec.haxx.se/usingcurl/usingcurl-netrc +. + exit 1 +fi + +auth=':' +if [[ -n $PAT_GITHUB_DISPATCH ]]; then + auth="Authorization: Bearer ${PAT_GITHUB_DISPATCH}" +fi + +if curl -fsS "https://api.github.com/repos/${github_repo}/dispatches" \ + -H 'Accept: application/vnd.github.v3+json' \ + -H 'Content-Type: application/json' \ + -H "$auth" \ + -d '{"event_type":"'"$event_type"'"}' \ + "$@" +then + echo "Successfully triggered $event_type" +else + echo "Request failed" >&2 + exit 1 +fi diff --git a/ingest/vendored/trigger-on-new-data b/ingest/vendored/trigger-on-new-data new file mode 100755 index 0000000..470d2f4 --- /dev/null +++ b/ingest/vendored/trigger-on-new-data @@ -0,0 +1,32 @@ +#!/usr/bin/env bash +set -euo pipefail + +: "${PAT_GITHUB_DISPATCH:?The PAT_GITHUB_DISPATCH environment variable is required.}" + +bin="$(dirname "$0")" + +github_repo="${1:?A GitHub repository with owner and repository name is required as the first argument.}" +event_type="${2:?An event type is required as the second argument.}" +metadata="${3:?A metadata upload output file is required as the third argument.}" +sequences="${4:?An sequence FASTA upload output file is required as the fourth argument.}" +identical_file_message="${5:-files are identical}" + +new_metadata=$(grep "$identical_file_message" "$metadata" >/dev/null; echo $?) 
+new_sequences=$(grep "$identical_file_message" "$sequences" >/dev/null; echo $?) + +slack_message="" + +# grep exit status 0 for found match, 1 for no match, 2 if an error occurred +if [[ $new_metadata -eq 1 || $new_sequences -eq 1 ]]; then + slack_message="Triggering new builds due to updated metadata and/or sequences" + "$bin"/trigger "$github_repo" "$event_type" +elif [[ $new_metadata -eq 0 && $new_sequences -eq 0 ]]; then + slack_message="Skipping trigger of rebuild: Both metadata TSV and sequences FASTA are identical to S3 files." +else + slack_message="Skipping trigger of rebuild: Unable to determine if data has been updated." +fi + + +if ! "$bin"/notify-slack "$slack_message"; then + echo "Notifying Slack failed, but exiting with success anyway." +fi diff --git a/ingest/vendored/upload-to-s3 b/ingest/vendored/upload-to-s3 new file mode 100755 index 0000000..36d171c --- /dev/null +++ b/ingest/vendored/upload-to-s3 @@ -0,0 +1,78 @@ +#!/usr/bin/env bash +set -euo pipefail + +bin="$(dirname "$0")" + +main() { + local quiet=0 + + for arg; do + case "$arg" in + --quiet) + quiet=1 + shift;; + *) + break;; + esac + done + + local src="${1:?A source file is required as the first argument.}" + local dst="${2:?A destination s3:// URL is required as the second argument.}" + local cloudfront_domain="${3:-}" + + local s3path="${dst#s3://}" + local bucket="${s3path%%/*}" + local key="${s3path#*/}" + + local src_hash dst_hash no_hash=0000000000000000000000000000000000000000000000000000000000000000 + src_hash="$("$bin/sha256sum" < "$src")" + dst_hash="$(aws s3api head-object --bucket "$bucket" --key "$key" --query Metadata.sha256sum --output text 2>/dev/null || echo "$no_hash")" + + if [[ $src_hash != "$dst_hash" ]]; then + # The record count may have changed + src_record_count="$(wc -l < "$src")" + + echo "Uploading $src → $dst" + if [[ "$dst" == *.gz ]]; then + gzip -c "$src" + elif [[ "$dst" == *.xz ]]; then + xz -2 -T0 -c "$src" + elif [[ "$dst" == *.zst ]]; then + zstd -T0 -c "$src" + else + cat "$src" + fi | aws s3 cp --no-progress - "$dst" --metadata sha256sum="$src_hash",recordcount="$src_record_count" "$(content-type "$dst")" + + if [[ -n $cloudfront_domain ]]; then + echo "Creating CloudFront invalidation for $cloudfront_domain/$key" + if ! "$bin"/cloudfront-invalidate "$cloudfront_domain" "/$key"; then + echo "CloudFront invalidation failed, but exiting with success anyway." + fi + fi + + if [[ $quiet == 1 ]]; then + echo "Quiet mode. No Slack notification sent." + exit 0 + fi + + if ! "$bin"/notify-slack "Updated $dst available."; then + echo "Notifying Slack failed, but exiting with success anyway." 
+ fi + else + echo "Uploading $src → $dst: files are identical, skipping upload" + fi +} + +content-type() { + case "$1" in + *.tsv) echo --content-type=text/tab-separated-values;; + *.csv) echo --content-type=text/comma-separated-values;; + *.ndjson) echo --content-type=application/x-ndjson;; + *.gz) echo --content-type=application/gzip;; + *.xz) echo --content-type=application/x-xz;; + *.zst) echo --content-type=application/zstd;; + *) echo --content-type=text/plain;; + esac +} + +main "$@" From c25d87804666602703c8c9daa085cb52c4421736 Mon Sep 17 00:00:00 2001 From: John SJ Anderson Date: Thu, 23 May 2024 17:32:33 -0700 Subject: [PATCH 6/7] Update snakefmt pre-commit action [#2] --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 110d5ae..8b6627f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -4,10 +4,10 @@ default_language_version: exclude: '\.(tsv|fasta|gb)$|^ingest/vendored/|^_LEGACY' repos: - repo: https://github.com/snakemake/snakefmt - rev: v0.10.1 + rev: v0.10.2 hooks: - id: snakefmt - language_version: python3 + language_version: python3.11 - repo: https://github.com/rhysd/actionlint rev: v1.6.27 hooks: From d8239175c325c4d6429f39bad1c6dde4c3953e10 Mon Sep 17 00:00:00 2001 From: John SJ Anderson Date: Wed, 29 May 2024 10:20:12 -0700 Subject: [PATCH 7/7] Sync with "standard" pre-commit-config [#2] Remove unused configuration files --- .pre-commit-config.yaml | 37 ++++++++++++------------------------- .yamlfmt | 5 ----- pyproject.toml | 8 -------- 3 files changed, 12 insertions(+), 38 deletions(-) delete mode 100644 .yamlfmt delete mode 100644 pyproject.toml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8b6627f..66a4f62 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,30 +3,19 @@ default_language_version: # TODO remove _LEGACY once it's gone exclude: '\.(tsv|fasta|gb)$|^ingest/vendored/|^_LEGACY' repos: - - repo: https://github.com/snakemake/snakefmt - rev: v0.10.2 + - repo: https://github.com/pre-commit/sync-pre-commit-deps + rev: v0.0.1 hooks: - - id: snakefmt - language_version: python3.11 + - id: sync-pre-commit-deps + - repo: https://github.com/shellcheck-py/shellcheck-py + rev: v0.10.0.1 + hooks: + - id: shellcheck - repo: https://github.com/rhysd/actionlint rev: v1.6.27 hooks: - id: actionlint entry: env SHELLCHECK_OPTS='--exclude=SC2027' actionlint - - repo: https://github.com/codespell-project/codespell - rev: v2.2.6 - hooks: - - id: codespell - additional_dependencies: - - tomli - - repo: https://github.com/google/yamlfmt - rev: v0.12.1 - hooks: - - id: yamlfmt - - repo: https://github.com/pappasam/toml-sort - rev: v0.23.1 - hooks: - - id: toml-sort-fix - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.6.0 hooks: @@ -45,11 +34,9 @@ repos: - id: detect-private-key - id: end-of-file-fixer - id: fix-byte-order-marker - - repo: https://github.com/pre-commit/sync-pre-commit-deps - rev: v0.0.1 - hooks: - - id: sync-pre-commit-deps - - repo: https://github.com/shellcheck-py/shellcheck-py - rev: v0.10.0.1 + - repo: https://github.com/astral-sh/ruff-pre-commit + # Ruff version. + rev: v0.4.6 hooks: - - id: shellcheck + # Run the linter. 
+ - id: ruff diff --git a/.yamlfmt b/.yamlfmt deleted file mode 100644 index 354963d..0000000 --- a/.yamlfmt +++ /dev/null @@ -1,5 +0,0 @@ -formatter: - type: basic - line_ending: lf - retain_line_breaks: true - max_line_length: 120 diff --git a/pyproject.toml b/pyproject.toml deleted file mode 100644 index 2c839bd..0000000 --- a/pyproject.toml +++ /dev/null @@ -1,8 +0,0 @@ -[tool.codespell] -write = '' -skip = '*.gff,*.gff3' - -[tool.snakefmt] -include = '\.smk$|^Snakefile' -exclude = '' -line_length = 120
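With a configuration like the final one above in place, the hooks are exercised through the standard pre-commit CLI; nothing repo-specific is assumed here beyond the config file itself:

```
pre-commit install          # run the hooks automatically on every commit
pre-commit run --all-files  # run the full suite (shellcheck, actionlint, ruff, ...) once across the repo
```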