[ingest] Copy ingest from dengue repo
Future commits will change this to work with zika data
j23414 committed May 3, 2023
1 parent 98d7d65 commit 342ead8
Showing 40 changed files with 2,576 additions and 0 deletions.
4 changes: 4 additions & 0 deletions .gitignore
@@ -5,6 +5,10 @@ results/
auspice/
build/

ingest/.snakemake
ingest/data
ingest/logs

# Sensitive environment variables
environment*

87 changes: 87 additions & 0 deletions ingest/README.md
@@ -0,0 +1,87 @@
# nextstrain.org/dengue/ingest

This is the ingest pipeline for Dengue virus sequences.

## Usage

> NOTE: All command examples assume you are within the `ingest` directory.
> If running commands from the outer `dengue` directory, please replace the `.` with `ingest`.

Fetch sequences with

```sh
nextstrain build --cpus 1 . data/sequences_all.ndjson
```

Run the complete ingest pipeline with

```sh
nextstrain build --cpus 1 .
```

This will produce two files (within the `ingest` directory):

- data/metadata_all.tsv
- data/sequences_all.fasta

Run the complete ingest pipeline and upload results to AWS S3 with

```sh
nextstrain build . --configfiles config/config.yaml config/optional.yaml
```

### Adding new sequences not from GenBank

#### Static Files

Do the following to include sequences from static FASTA files.

1. Convert the FASTA files to NDJSON files with:

```sh
./ingest/bin/fasta-to-ndjson \
--fasta {path-to-fasta-file} \
--fields {fasta-header-field-names} \
--separator {field-separator-in-header} \
--exclude {fields-to-exclude-in-output} \
> ingest/data/{file-name}.ndjson
```

2. Add the following to the `.gitignore` to allow the file to be included in the repo:

```gitignore
!ingest/data/{file-name}.ndjson
```

3. Add the `file-name` (without the `.ndjson` extension) as a source to `ingest/config/config.yaml`. This will tell the ingest pipeline to concatenate the records to the GenBank sequences and run them through the same transform pipeline.
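
As a sketch, assuming the sources are listed under a YAML sequence in `config/config.yaml` (the key name and entries below are illustrative, not taken from the actual config file):

```yaml
# config/config.yaml (hypothetical fragment)
sources:
  - genbank
  - my-local-sequences   # matches ingest/data/my-local-sequences.ndjson
```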

## Configuration

Configuration takes place in `config/config.yaml` by default.
Optional configs for uploading files and Slack notifications are in `config/optional.yaml`.
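Based on the keys the `Snakefile` reads from the upload config (`files_to_upload`, `remote_file_names`, and `dst`), an upload entry in `config/optional.yaml` might look like this sketch (the target name, bucket, and file names are illustrative):

```yaml
# config/optional.yaml (hypothetical fragment)
upload:
  s3:
    dst: s3://example-bucket/dengue
    files_to_upload:
      - data/metadata_all.tsv
      - data/sequences_all.fasta
    remote_file_names:
      - metadata_all.tsv.zst
      - sequences_all.fasta.zst

send_slack_notifications: true
```

Note that `files_to_upload` and `remote_file_names` must be the same length and the remote names must be unique; otherwise the `Snakefile` skips that upload target with a warning.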

### Environment Variables

The complete ingest pipeline with AWS S3 uploads and Slack notifications uses the following environment variables:

#### Required

- `AWS_DEFAULT_REGION`
- `AWS_ACCESS_KEY_ID`
- `AWS_SECRET_ACCESS_KEY`
- `SLACK_TOKEN`
- `SLACK_CHANNELS`

#### Optional

These are optional environment variables used in our automated pipeline for providing detailed Slack notifications.

- `GITHUB_RUN_ID` - provided via [`github.run_id` in a GitHub Action workflow](https://docs.github.com/en/actions/learn-github-actions/contexts#github-context)
- `AWS_BATCH_JOB_ID` - provided via [AWS Batch Job environment variables](https://docs.aws.amazon.com/batch/latest/userguide/job_env_vars.html)
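
One common pattern, suggested by the `environment*` entry in `.gitignore`, is to keep these variables in a local, git-ignored env file and source it before running the pipeline (the file name and region below are illustrative; the `...` values are placeholders for your own secrets):

```sh
# environment.sh (hypothetical; git-ignored via the environment* pattern)
export AWS_DEFAULT_REGION=us-east-1
export AWS_ACCESS_KEY_ID=...
export AWS_SECRET_ACCESS_KEY=...
export SLACK_TOKEN=...
export SLACK_CHANNELS=...
```

Then `source environment.sh` before invoking `nextstrain build`.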

## Input data

### GenBank data

GenBank sequences and metadata are fetched via NCBI Virus.
The exact URL used to fetch data is constructed in `bin/genbank-url`.
86 changes: 86 additions & 0 deletions ingest/Snakefile
@@ -0,0 +1,86 @@
if not config:

configfile: "config/config.yaml"


send_slack_notifications = config.get("send_slack_notifications", False)

serotypes = ["all", "denv1", "denv2", "denv3", "denv4"]


def _get_all_targets(wildcards):
# Default targets are the metadata TSV and sequences FASTA files
all_targets = expand(
["data/sequences_{serotype}.fasta.zst", "data/metadata_{serotype}.tsv.zst"],
serotype=serotypes,
)

# Add additional targets based on upload config
upload_config = config.get("upload", {})

for target, params in upload_config.items():
files_to_upload = params.get("files_to_upload", [])
remote_file_names = params.get("remote_file_names", [])

if len(files_to_upload) != len(remote_file_names):
print(
f"Skipping file upload for {target!r} because the number of",
"files to upload does not match the number of remote file names.",
)
elif len(remote_file_names) != len(set(remote_file_names)):
print(
f"Skipping file upload for {target!r} because there are duplicate remote file names."
)
        elif not params.get("dst"):
            print(
                f"Skipping file upload for {target!r} because the destination was not defined."
            )
else:
all_targets.extend(
expand(
[
f"data/upload/{target}/{{file_to_upload}}-to-{{remote_file_name}}.done"
],
zip,
file_to_upload=files_to_upload,
remote_file_name=remote_file_names,
)
)

# Add additional targets for Nextstrain's internal Slack notifications
if send_slack_notifications:
all_targets.extend(
[
"data/notify/genbank-record-change.done",
"data/notify/metadata-diff.done",
]
)

if config.get("trigger_rebuild"):
all_targets.append("data/trigger/rebuild.done")

return all_targets


rule all:
input:
_get_all_targets,


include: "workflow/snakemake_rules/fetch_sequences.smk"
include: "workflow/snakemake_rules/transform.smk"


if config.get("upload"):

include: "workflow/snakemake_rules/upload.smk"


if send_slack_notifications:

include: "workflow/snakemake_rules/slack_notifications.smk"


if config.get("trigger_rebuild"):

include: "workflow/snakemake_rules/trigger_rebuild.smk"