add ingest from monkeypox repo

nextstrain · Mar 25, 2023 · 645acb2 · 645acb2
1 parent b2a6cfa
commit 645acb2
Show file tree

Hide file tree

Showing 39 changed files with 2,461 additions and 0 deletions.
diff --git a/ingest/README.md b/ingest/README.md
@@ -0,0 +1,87 @@
+# nextstrain.org/monkeypox/ingest
+
+This is the ingest pipeline for Monkeypox virus sequences.
+
+## Usage
+
+> NOTE: All command examples assume you are within the `ingest` directory.
+> If running commands from the outer `monkeypox` directory, please replace the `.` with `ingest`.
+
+Fetch sequences with
+
+```sh
+nextstrain build --cpus 1 . data/sequences.ndjson
+```
+
+Run the complete ingest pipeline with
+
+```sh
+nextstrain build --cpus 1 .
+```
+
+This will produce two files (within the `ingest` directory):
+
+- data/metadata.tsv
+- data/sequences.fasta
+
+Run the complete ingest pipeline and upload results to AWS S3 with
+
+```sh
+nextstrain build . --configfiles config/config.yaml config/optional.yaml
+```
+
+### Adding new sequences not from GenBank
+
+#### Static Files
+
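+Do the following to include sequences from static FASTA files. Unlike the commands above, the paths in these steps are relative to the outer `monkeypox` directory.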
+
+1. Convert the FASTA files to NDJSON files with:
+
+    ```sh
+    ./ingest/bin/fasta-to-ndjson \
+        --fasta {path-to-fasta-file} \
+        --fields {fasta-header-field-names} \
+        --separator {field-separator-in-header} \
+        --exclude {fields-to-exclude-in-output} \
+        > ingest/data/{file-name}.ndjson
+    ```
+
+2. Add the following to the `.gitignore` to allow the file to be included in the repo:
+
+    ```gitignore
+    !ingest/data/{file-name}.ndjson
+    ```
+
+3. Add the `file-name` (without the `.ndjson` extension) as a source to `ingest/config/config.yaml`. This will tell the ingest pipeline to concatenate the records to the GenBank sequences and run them through the same transform pipeline.
+
+## Configuration
+
+Configuration takes place in `config/config.yaml` by default.
+Optional configs for uploading files and Slack notifications are in `config/optional.yaml`.
+
+### Environment Variables
+
+The complete ingest pipeline with AWS S3 uploads and Slack notifications uses the following environment variables:
+
+#### Required
+
+- `AWS_DEFAULT_REGION`
+- `AWS_ACCESS_KEY_ID`
+- `AWS_SECRET_ACCESS_KEY`
+- `SLACK_TOKEN`
+- `SLACK_CHANNELS`
+
+#### Optional
+
+These are optional environment variables used in our automated pipeline for providing detailed Slack notifications.
+
+- `GITHUB_RUN_ID` - provided via [`github.run_id` in a GitHub Action workflow](https://docs.github.com/en/actions/learn-github-actions/contexts#github-context)
+- `AWS_BATCH_JOB_ID` - provided via [AWS Batch Job environment variables](https://docs.aws.amazon.com/batch/latest/userguide/job_env_vars.html)
+
+## Input data
+
+### GenBank data
+
+GenBank sequences and metadata are fetched via NCBI Virus.
+The exact URL used to fetch data is constructed in `bin/genbank-url`.
diff --git a/ingest/Snakefile b/ingest/Snakefile
@@ -0,0 +1,82 @@
+if not config:
+
+    configfile: "config/config.yaml"
+
+
+send_slack_notifications = config.get("send_slack_notifications", False)
+
+
+def _get_all_targets(wildcards):
+    # Default targets are the metadata TSV and sequences FASTA files
+    all_targets = ["data/sequences.fasta", "data/metadata.tsv"]
+
+    # Add additional targets based on upload config
+    upload_config = config.get("upload", {})
+
+    for target, params in upload_config.items():
+        files_to_upload = params.get("files_to_upload", [])
+        remote_file_names = params.get("remote_file_names", [])
+
+        if len(files_to_upload) != len(remote_file_names):
+            print(
+                f"Skipping file upload for {target!r} because the number of",
+                "files to upload does not match the number of remote file names.",
+            )
+        elif len(remote_file_names) != len(set(remote_file_names)):
+            print(
+                f"Skipping file upload for {target!r} because there are duplicate remote file names."
+            )
+        elif not params.get("dst"):
+            print(
+                f"Skipping file upload for {target!r} because the destination was not defined."
+            )
+        else:
+            all_targets.extend(
+                expand(
+                    [
+                        f"data/upload/{target}/{{file_to_upload}}-to-{{remote_file_name}}.done"
+                    ],
+                    zip,
+                    file_to_upload=files_to_upload,
+                    remote_file_name=remote_file_names,
+                )
+            )
+
+    # Add additional targets for Nextstrain's internal Slack notifications
+    if send_slack_notifications:
+        all_targets.extend(
+            [
+                "data/notify/genbank-record-change.done",
+                "data/notify/metadata-diff.done",
+            ]
+        )
+
+    if config.get("trigger_rebuild"):
+        all_targets.append("data/trigger/rebuild.done")
+
+    return all_targets
+
+
+rule all:
+    input:
+        _get_all_targets,
+
+
+include: "workflow/snakemake_rules/fetch_sequences.smk"
+include: "workflow/snakemake_rules/transform.smk"
+include: "workflow/snakemake_rules/nextclade.smk"
+
+
+if config.get("upload"):
+
+    include: "workflow/snakemake_rules/upload.smk"
+
+
+if send_slack_notifications:
+
+    include: "workflow/snakemake_rules/slack_notifications.smk"
+
+
+if config.get("trigger_rebuild"):
+
+    include: "workflow/snakemake_rules/trigger_rebuild.smk"