From 9d7d3cca8292a66591cbd884711ce753fd2f3d7c Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Fri, 11 Aug 2023 08:23:38 -0400 Subject: [PATCH 1/6] git subrepo clone (merge) https://github.com/nextstrain/ingest vendored subrepo: subdir: "vendored" merged: "1eb8b30" upstream: origin: "https://github.com/nextstrain/ingest" branch: "main" commit: "1eb8b30" git-subrepo: version: "0.4.6" origin: "https://github.com/ingydotnet/git-subrepo" commit: "110b9eb" --- vendored/.github/pull_request_template.md | 16 ++ vendored/.github/workflows/ci.yaml | 13 ++ vendored/.gitrepo | 12 ++ vendored/.shellcheckrc | 6 + vendored/README.md | 91 +++++++++ vendored/apply-geolocation-rules | 234 ++++++++++++++++++++++ vendored/cloudfront-invalidate | 42 ++++ vendored/download-from-s3 | 48 +++++ vendored/merge-user-metadata | 55 +++++ vendored/notify-on-diff | 35 ++++ vendored/notify-on-job-fail | 23 +++ vendored/notify-on-job-start | 27 +++ vendored/notify-on-record-change | 53 +++++ vendored/notify-slack | 56 ++++++ vendored/s3-object-exists | 8 + vendored/sha256sum | 15 ++ vendored/transform-authors | 66 ++++++ vendored/transform-field-names | 48 +++++ vendored/transform-genbank-location | 43 ++++ vendored/trigger | 56 ++++++ vendored/trigger-on-new-data | 32 +++ vendored/upload-to-s3 | 78 ++++++++ 22 files changed, 1057 insertions(+) create mode 100644 vendored/.github/pull_request_template.md create mode 100644 vendored/.github/workflows/ci.yaml create mode 100644 vendored/.gitrepo create mode 100644 vendored/.shellcheckrc create mode 100644 vendored/README.md create mode 100755 vendored/apply-geolocation-rules create mode 100755 vendored/cloudfront-invalidate create mode 100755 vendored/download-from-s3 create mode 100755 vendored/merge-user-metadata create mode 100755 vendored/notify-on-diff create mode 100755 vendored/notify-on-job-fail create mode 100755 vendored/notify-on-job-start create mode 100755 vendored/notify-on-record-change create mode 100755 vendored/notify-slack create mode 100755 vendored/s3-object-exists create mode 100755 vendored/sha256sum create mode 100755 vendored/transform-authors create mode 100755 vendored/transform-field-names create mode 100755 vendored/transform-genbank-location create mode 100755 vendored/trigger create mode 100755 vendored/trigger-on-new-data create mode 100755 vendored/upload-to-s3 diff --git a/vendored/.github/pull_request_template.md b/vendored/.github/pull_request_template.md new file mode 100644 index 00000000..ed4a5b27 --- /dev/null +++ b/vendored/.github/pull_request_template.md @@ -0,0 +1,16 @@ +### Description of proposed changes + + + +### Related issue(s) + + + +### Checklist + + + +- [ ] Checks pass +- [ ] If adding a script, add an entry for it in the README. + + diff --git a/vendored/.github/workflows/ci.yaml b/vendored/.github/workflows/ci.yaml new file mode 100644 index 00000000..dcb3b898 --- /dev/null +++ b/vendored/.github/workflows/ci.yaml @@ -0,0 +1,13 @@ +name: CI + +on: + - push + - pull_request + - workflow_dispatch + +jobs: + shellcheck: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: nextstrain/.github/actions/shellcheck@master diff --git a/vendored/.gitrepo b/vendored/.gitrepo new file mode 100644 index 00000000..3fe78c05 --- /dev/null +++ b/vendored/.gitrepo @@ -0,0 +1,12 @@ +; DO NOT EDIT (unless you know what you are doing) +; +; This subdirectory is a git "subrepo", and this file is maintained by the +; git-subrepo command. 
See https://github.com/ingydotnet/git-subrepo#readme +; +[subrepo] + remote = https://github.com/nextstrain/ingest + branch = main + commit = 1eb8b30428d5f66adac201f0a246a7ab4bdc9792 + parent = 6fd5a9b1d87e59fab35173dbedf376632154943b + method = merge + cmdver = 0.4.6 diff --git a/vendored/.shellcheckrc b/vendored/.shellcheckrc new file mode 100644 index 00000000..ebed438c --- /dev/null +++ b/vendored/.shellcheckrc @@ -0,0 +1,6 @@ +# Use of this file requires Shellcheck v0.7.0 or newer. +# +# SC2064 - We intentionally want variables to expand immediately within traps +# so the trap can not fail due to variable interpolation later. +# +disable=SC2064 diff --git a/vendored/README.md b/vendored/README.md new file mode 100644 index 00000000..0311a551 --- /dev/null +++ b/vendored/README.md @@ -0,0 +1,91 @@ +# ingest + +Shared internal tooling for pathogen data ingest. Used by our individual +pathogen repos which produce Nextstrain builds. Expected to be vendored by +each pathogen repo using `git subtree`. + +Some tools may only live here temporarily before finding a permanent home in +`augur curate` or Nextstrain CLI. Others may happily live out their days here. + +## Vendoring + +Nextstrain maintained pathogen repos will use [`git subrepo`](https://github.com/ingydotnet/git-subrepo) to vendor ingest scripts. +(See discussion on this decision in https://github.com/nextstrain/ingest/issues/3) + +If you don't already have `git subrepo` installed, follow the [git subrepo installation instructions](https://github.com/ingydotnet/git-subrepo#installation). +Then add the latest ingest scripts to the pathogen repo by running: + +``` +git subrepo clone https://github.com/nextstrain/ingest ingest/vendored +``` + +Any future updates of ingest scripts can be pulled in with: + +``` +git subrepo pull ingest/vendored +``` + +## History + +Much of this tooling originated in +[ncov-ingest](https://github.com/nextstrain/ncov-ingest) and was passaged thru +[monkeypox's ingest/](https://github.com/nextstrain/monkeypox/tree/@/ingest/). +It subsequently proliferated from [monkeypox][] to other pathogen repos +([rsv][], [zika][], [dengue][], [hepatitisB][], [forecasts-ncov][]) primarily +thru copying. To [counter that +proliferation](https://bedfordlab.slack.com/archives/C7SDVPBLZ/p1688577879947079), +this repo was made. + +[monkeypox]: https://github.com/nextstrain/monkeypox +[rsv]: https://github.com/nextstrain/rsv +[zika]: https://github.com/nextstrain/zika/pull/24 +[dengue]: https://github.com/nextstrain/dengue/pull/10 +[hepatitisB]: https://github.com/nextstrain/hepatitisB +[forecasts-ncov]: https://github.com/nextstrain/forecasts-ncov + +## Elsewhere + +The creation of this repo, in both the abstract and concrete, and the general +approach to "ingest" has been discussed in various internal places, including: + +- https://github.com/nextstrain/private/issues/59 +- @joverlee521's [workflows document](https://docs.google.com/document/d/1rLWPvEuj0Ayc8MR0O1lfRJZfj9av53xU38f20g8nU_E/edit#heading=h.4g0d3mjvb89i) +- [5 July 2023 Slack thread](https://bedfordlab.slack.com/archives/C7SDVPBLZ/p1688577879947079) +- [6 July 2023 team meeting](https://docs.google.com/document/d/1FPfx-ON5RdqL2wyvODhkrCcjgOVX3nlXgBwCPhIEsco/edit) +- _…many others_ + +## Scripts + +Scripts for supporting ingest workflow automation that don’t really belong in any of our existing tools. 
+ +- [notify-on-diff](notify-on-diff) - Send Slack message with diff of a local file and an S3 object +- [notify-on-job-fail](notify-on-job-fail) - Send Slack message with details about failed workflow job on GitHub Actions and/or AWS Batch +- [notify-on-job-start](notify-on-job-start) - Send Slack message with details about workflow job on GitHub Actions and/or AWS Batch +- [notify-on-record-change](notify-on-recod-change) - Send Slack message with details about line count changes for a file compared to an S3 object's metadata `recordcount`. + If the S3 object's metadata does not have `recordcount`, then will attempt to download S3 object to count lines locally, which only supports `xz` compressed S3 objects. +- [notify-slack](notify-slack) - Send message or file to Slack +- [s3-object-exists](s3-object-exists) - Used to prevent 404 errors during S3 file comparisons in the notify-* scripts +- [trigger](trigger) - Triggers downstream GitHub Actions via the GitHub API using repository_dispatch events. +- [trigger-on-new-data](trigger-on-new-data) - Triggers downstream GitHub Actions if the provided `upload-to-s3` outputs do not contain the `identical_file_message` + A hacky way to ensure that we only trigger downstream phylogenetic builds if the S3 objects have been updated. + +Potential Nextstrain CLI scripts + +- [sha256sum](sha256sum) - Used to check if files are identical in upload-to-s3 and download-from-s3 scripts. +- [cloudfront-invalidate](cloudfront-invalidate) - CloudFront invalidation is already supported in the [nextstrain remote command for S3 files](https://github.com/nextstrain/cli/blob/a5dda9c0579ece7acbd8e2c32a4bbe95df7c0bce/nextstrain/cli/remote/s3.py#L104). + This exists as a separate script to support CloudFront invalidation when using the upload-to-s3 script. +- [upload-to-s3](upload-to-s3) - Upload file to AWS S3 bucket with compression based on file extension in S3 URL. + Skips upload if the local file's hash is identical to the S3 object's metadata `sha256sum`. + Adds the following user defined metadata to uploaded S3 object: + - `sha256sum` - hash of the file generated by [sha256sum](sha256sum) + - `recordcount` - the line count of the file +- [download-from-s3](download-from-s3) - Download file from AWS S3 bucket with decompression based on file extension in S3 URL. + Skips download if the local file already exists and has a hash identical to the S3 object's metadata `sha256sum`. + +Potential augur curate scripts + +- [apply-geolocation-rules](apply-geolocation-rules) - Applies user curated geolocation rules to NDJSON records +- [merge-user-metadata](merge-user-metadata) - Merges user annotations with NDJSON records +- [transform-authors](transform-authors) - Abbreviates full author lists to ' et al.' +- [transform-field-names](transform-field-names) - Rename fields of NDJSON records +- [transform-genbank-location](transform-genbank-location) - Parses `location` field with the expected pattern `"[:][, ]"` based on [GenBank's country field](https://www.ncbi.nlm.nih.gov/genbank/collab/country/) diff --git a/vendored/apply-geolocation-rules b/vendored/apply-geolocation-rules new file mode 100755 index 00000000..776cf16a --- /dev/null +++ b/vendored/apply-geolocation-rules @@ -0,0 +1,234 @@ +#!/usr/bin/env python3 +""" +Applies user curated geolocation rules to the geolocation fields in the NDJSON +records from stdin. The modified records are output to stdout. This does not do +any additional transformations on top of the user curations. 
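+
+Rules are read from a TSV file where each line pairs a raw geolocation with its
+annotated replacement, both formatted as 'region/country/division/location' and
+separated by a tab; '*' acts as a wildcard field and '#' starts a comment.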
+""" +import argparse +import json +from collections import defaultdict +from sys import exit, stderr, stdin, stdout + + +class CyclicGeolocationRulesError(Exception): + pass + + +def load_geolocation_rules(geolocation_rules_file): + """ + Loads the geolocation rules from the provided *geolocation_rules_file*. + Returns the rules as a dict: + { + regions: { + countries: { + divisions: { + locations: corrected_geolocations_tuple + } + } + } + } + """ + geolocation_rules = defaultdict(lambda: defaultdict(lambda: defaultdict(dict))) + with open(geolocation_rules_file, 'r') as rules_fh: + for line in rules_fh: + # ignore comments + if line.strip()=="" or line.lstrip()[0] == '#': + continue + + row = line.strip('\n').split('\t') + # Skip lines that cannot be split into raw and annotated geolocations + if len(row) != 2: + print( + f"WARNING: Could not decode geolocation rule {line!r}.", + "Please make sure rules are formatted as", + "'region/country/division/locationregion/country/division/location'.", + file=stderr) + continue + + # remove trailing comments + row[-1] = row[-1].partition('#')[0].rstrip() + raw , annot = tuple( row[0].split('/') ) , tuple( row[1].split('/') ) + + # Skip lines where raw or annotated geolocations cannot be split into 4 fields + if len(raw) != 4: + print( + f"WARNING: Could not decode the raw geolocation {row[0]!r}.", + "Please make sure it is formatted as 'region/country/division/location'.", + file=stderr + ) + continue + + if len(annot) != 4: + print( + f"WARNING: Could not decode the annotated geolocation {row[1]!r}.", + "Please make sure it is formatted as 'region/country/division/location'.", + file=stderr + ) + continue + + + geolocation_rules[raw[0]][raw[1]][raw[2]][raw[3]] = annot + + return geolocation_rules + + +def get_annotated_geolocation(geolocation_rules, raw_geolocation, rule_traversal = None): + """ + Gets the annotated geolocation for the *raw_geolocation* in the provided + *geolocation_rules*. + + Recursively traverses the *geolocation_rules* until we get the annotated + geolocation, which must be a Tuple. Returns `None` if there are no + applicable rules for the provided *raw_geolocation*. + + Rules are applied in the order of region, country, division, location. + First checks the provided raw values for geolocation fields, then if there + are not matches, tries to use general rules marked with '*'. + """ + # Always instantiate the rule traversal as an empty list if not provided, + # e.g. 
the first call of this recursive function + if rule_traversal is None: + rule_traversal = [] + + current_rules = geolocation_rules + # Traverse the geolocation rules based using the rule_traversal values + for field_value in rule_traversal: + current_rules = current_rules.get(field_value) + # If we hit `None`, then we know there are no matching rules, so stop the rule traversal + if current_rules is None: + break + + # We've found the tuple of the annotated geolocation + if isinstance(current_rules, tuple): + return current_rules + + # We've reach the next level of geolocation rules, + # so try to traverse the rules with the next target in raw_geolocation + if isinstance(current_rules, dict): + next_traversal_target = raw_geolocation[len(rule_traversal)] + rule_traversal.append(next_traversal_target) + return get_annotated_geolocation(geolocation_rules, raw_geolocation, rule_traversal) + + # We did not find any matching rule for the last traversal target + if current_rules is None: + # If we've used all general rules and we still haven't found a match, + # then there are no applicable rules for this geolocation + if all(value == '*' for value in rule_traversal): + return None + + # If we failed to find matching rule with a general rule as the last + # traversal target, then delete all trailing '*'s to reset rule_traversal + # to end with the last index that is currently NOT a '*' + # [A, *, B, *] => [A, *, B] + # [A, B, *, *] => [A, B] + # [A, *, *, *] => [A] + if rule_traversal[-1] == '*': + # Find the index of the first of the consecutive '*' from the + # end of the rule_traversal + # [A, *, B, *] => first_consecutive_general_rule_index = 3 + # [A, B, *, *] => first_consecutive_general_rule_index = 2 + # [A, *, *, *] => first_consecutive_general_rule_index = 1 + for index, field_value in reversed(list(enumerate(rule_traversal))): + if field_value == '*': + first_consecutive_general_rule_index = index + else: + break + + rule_traversal = rule_traversal[:first_consecutive_general_rule_index] + + # Set the final value to '*' in hopes that by moving to a general rule, + # we can find a matching rule. + rule_traversal[-1] = '*' + + return get_annotated_geolocation(geolocation_rules, raw_geolocation, rule_traversal) + + +def transform_geolocations(geolocation_rules, geolocation): + """ + Transform the provided *geolocation* by looking it up in the provided + *geolocation_rules*. + + This will use all rules that apply to the geolocation and rules will + be applied in the order of region, country, division, location. + + Returns the original geolocation if no geolocation rules apply. + + Raises a `CyclicGeolocationRulesError` if more than 1000 rules have + been applied to the raw geolocation. + """ + transformed_values = geolocation + rules_applied = 0 + continue_to_apply = True + + while continue_to_apply: + annotated_values = get_annotated_geolocation(geolocation_rules, transformed_values) + + # Stop applying rules if no annotated values were found + if annotated_values is None: + continue_to_apply = False + else: + rules_applied += 1 + + if rules_applied > 1000: + raise CyclicGeolocationRulesError( + "ERROR: More than 1000 geolocation rules applied on the same entry {geolocation!r}." 
+ ) + + # Create a new list of values for comparison to previous values + new_values = list(transformed_values) + for index, value in enumerate(annotated_values): + # Keep original value if annotated value is '*' + if value != '*': + new_values[index] = value + + # Stop applying rules if this rule did not change the values, + # since this means we've reach rules with '*' that no longer change values + if new_values == transformed_values: + continue_to_apply = False + + transformed_values = new_values + + return transformed_values + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + parser.add_argument("--region-field", default="region", + help="Field that contains regions in NDJSON records.") + parser.add_argument("--country-field", default="country", + help="Field that contains countries in NDJSON records.") + parser.add_argument("--division-field", default="division", + help="Field that contains divisions in NDJSON records.") + parser.add_argument("--location-field", default="location", + help="Field that contains location in NDJSON records.") + parser.add_argument("--geolocation-rules", metavar="TSV", required=True, + help="TSV file of geolocation rules with the format: " + + "'' where the raw and annotated geolocations " + + "are formatted as '///'. " + + "If creating a general rule, then the raw field value can be substituted with '*'." + + "Lines starting with '#' will be ignored as comments." + + "Trailing '#' will be ignored as comments.") + + args = parser.parse_args() + + location_fields = [args.region_field, args.country_field, args.division_field, args.location_field] + + geolocation_rules = load_geolocation_rules(args.geolocation_rules) + + for record in stdin: + record = json.loads(record) + + try: + annotated_values = transform_geolocations(geolocation_rules, [record.get(field, '') for field in location_fields]) + except CyclicGeolocationRulesError as e: + print(e, file=stderr) + exit(1) + + for index, field in enumerate(location_fields): + record[field] = annotated_values[index] + + json.dump(record, stdout, allow_nan=False, indent=None, separators=',:') + print() diff --git a/vendored/cloudfront-invalidate b/vendored/cloudfront-invalidate new file mode 100755 index 00000000..dec48529 --- /dev/null +++ b/vendored/cloudfront-invalidate @@ -0,0 +1,42 @@ +#!/bin/bash +# Originally from @tsibley's gist: https://gist.github.com/tsibley/a66262d341dedbea39b02f27e2837ea8 +set -euo pipefail + +main() { + local domain="$1" + shift + local paths=("$@") + local distribution invalidation + + echo "-> Finding CloudFront distribution" + distribution=$( + aws cloudfront list-distributions \ + --query "DistributionList.Items[?contains(Aliases.Items, \`$domain\`)] | [0].Id" \ + --output text + ) + + if [[ -z $distribution || $distribution == None ]]; then + exec >&2 + echo "Unable to find CloudFront distribution id for $domain" + echo + echo "Are your AWS CLI credentials for the right account?" + exit 1 + fi + + echo "-> Creating CloudFront invalidation for distribution $distribution" + invalidation=$( + aws cloudfront create-invalidation \ + --distribution-id "$distribution" \ + --paths "${paths[@]}" \ + --query Invalidation.Id \ + --output text + ) + + echo "-> Waiting for CloudFront invalidation $invalidation to complete" + echo " Ctrl-C to stop waiting." 
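+    # Blocks until AWS reports the invalidation as completed. Interrupting the
+    # wait does not cancel the invalidation; it continues server-side.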
+ aws cloudfront wait invalidation-completed \ + --distribution-id "$distribution" \ + --id "$invalidation" +} + +main "$@" diff --git a/vendored/download-from-s3 b/vendored/download-from-s3 new file mode 100755 index 00000000..44f7ff34 --- /dev/null +++ b/vendored/download-from-s3 @@ -0,0 +1,48 @@ +#!/bin/bash +set -euo pipefail + +bin="$(dirname "$0")" + +main() { + local src="${1:?A source s3:// URL is required as the first argument.}" + local dst="${2:?A destination file path is required as the second argument.}" + # How many lines to subsample to. 0 means no subsampling. Optional. + # It is not advised to use this for actual subsampling! This is intended to be + # used for debugging workflows with large datasets such as ncov-ingest as + # described in https://github.com/nextstrain/ncov-ingest/pull/367 + + # Uses `tsv-sample` to subsample, so it will not work as expected with files + # that have a single record split across multiple lines (i.e. FASTA sequences) + local n="${3:-0}" + + local s3path="${src#s3://}" + local bucket="${s3path%%/*}" + local key="${s3path#*/}" + + local src_hash dst_hash no_hash=0000000000000000000000000000000000000000000000000000000000000000 + dst_hash="$("$bin/sha256sum" < "$dst" || true)" + src_hash="$(aws s3api head-object --bucket "$bucket" --key "$key" --query Metadata.sha256sum --output text 2>/dev/null || echo "$no_hash")" + + echo "[ INFO] Downloading $src → $dst" + if [[ $src_hash != "$dst_hash" ]]; then + aws s3 cp --no-progress "$src" - | + if [[ "$src" == *.gz ]]; then + gunzip -cfq + elif [[ "$src" == *.xz ]]; then + xz -T0 -dcq + elif [[ "$src" == *.zst ]]; then + zstd -T0 -dcq + else + cat + fi | + if [[ "$n" -gt 0 ]]; then + tsv-sample -H -i -n "$n" + else + cat + fi >"$dst" + else + echo "[ INFO] Files are identical, skipping download" + fi +} + +main "$@" diff --git a/vendored/merge-user-metadata b/vendored/merge-user-metadata new file mode 100755 index 00000000..341c2dfa --- /dev/null +++ b/vendored/merge-user-metadata @@ -0,0 +1,55 @@ +#!/usr/bin/env python3 +""" +Merges user curated annotations with the NDJSON records from stdin, with the user +curations overwriting the existing fields. The modified records are output +to stdout. This does not do any additional transformations on top of the user +curations. +""" +import argparse +import csv +import json +from collections import defaultdict +from sys import exit, stdin, stderr, stdout + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + parser.add_argument("--annotations", metavar="TSV", required=True, + help="Manually curated annotations TSV file. " + + "The TSV should not have a header and should have exactly three columns: " + + "id to match existing metadata, field name, and field value. " + + "If there are multiple annotations for the same id and field, then the last value is used. " + + "Lines starting with '#' are treated as comments. 
" + + "Any '#' after the field value are treated as comments.") + parser.add_argument("--id-field", default="accession", + help="The ID field in the metadata to use to merge with the annotations.") + + args = parser.parse_args() + + annotations = defaultdict(dict) + with open(args.annotations, 'r') as annotations_fh: + csv_reader = csv.reader(annotations_fh, delimiter='\t') + for row in csv_reader: + if not row or row[0].lstrip()[0] == '#': + continue + elif len(row) != 3: + print("WARNING: Could not decode annotation line " + "\t".join(row), file=stderr) + continue + id, field, value = row + annotations[id][field] = value.partition('#')[0].rstrip() + + for record in stdin: + record = json.loads(record) + + record_id = record.get(args.id_field) + if record_id is None: + print(f"ERROR: ID field {args.id_field!r} does not exist in record", file=stderr) + exit(1) + + record.update(annotations.get(record_id, {})) + + json.dump(record, stdout, allow_nan=False, indent=None, separators=',:') + print() diff --git a/vendored/notify-on-diff b/vendored/notify-on-diff new file mode 100755 index 00000000..c304d6b5 --- /dev/null +++ b/vendored/notify-on-diff @@ -0,0 +1,35 @@ +#!/bin/bash + +set -euo pipefail + +: "${SLACK_TOKEN:?The SLACK_TOKEN environment variable is required.}" +: "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" + +bin="$(dirname "$0")" + +src="${1:?A source file is required as the first argument.}" +dst="${2:?A destination s3:// URL is required as the second argument.}" + +dst_local="$(mktemp -t s3-file-XXXXXX)" +diff="$(mktemp -t diff-XXXXXX)" + +trap "rm -f '$dst_local' '$diff'" EXIT + +# if the file is not already present, just exit +"$bin"/s3-object-exists "$dst" || exit 0 + +"$bin"/download-from-s3 "$dst" "$dst_local" + +# diff's exit code is 0 for no differences, 1 for differences found, and >1 for errors +diff_exit_code=0 +diff "$dst_local" "$src" > "$diff" || diff_exit_code=$? + +if [[ "$diff_exit_code" -eq 1 ]]; then + echo "Notifying Slack about diff." + "$bin"/notify-slack --upload "$src.diff" < "$diff" +elif [[ "$diff_exit_code" -gt 1 ]]; then + echo "Notifying Slack about diff failure" + "$bin"/notify-slack "Diff failed for $src" +else + echo "No change in $src." +fi diff --git a/vendored/notify-on-job-fail b/vendored/notify-on-job-fail new file mode 100755 index 00000000..02cb6bad --- /dev/null +++ b/vendored/notify-on-job-fail @@ -0,0 +1,23 @@ +#!/bin/bash +set -euo pipefail + +: "${SLACK_TOKEN:?The SLACK_TOKEN environment variable is required.}" +: "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" + +: "${AWS_BATCH_JOB_ID:=}" +: "${GITHUB_RUN_ID:=}" + +bin="$(dirname "$0")" +job_name="${1:?A job name is required as the first argument}" +github_repo="${2:?A GitHub repository with owner and repository name is required as the second argument}" + +echo "Notifying Slack about failed ${job_name} job." +message="❌ ${job_name} job has FAILED 😞 " + +if [[ -n "${AWS_BATCH_JOB_ID}" ]]; then + message+="See AWS Batch job \`${AWS_BATCH_JOB_ID}\` () for error details. " +elif [[ -n "${GITHUB_RUN_ID}" ]]; then + message+="See GitHub Action for error details. 
" +fi + +"$bin"/notify-slack "$message" diff --git a/vendored/notify-on-job-start b/vendored/notify-on-job-start new file mode 100755 index 00000000..3e44bb09 --- /dev/null +++ b/vendored/notify-on-job-start @@ -0,0 +1,27 @@ +#!/bin/bash +set -euo pipefail + +: "${SLACK_TOKEN:?The SLACK_TOKEN environment variable is required.}" +: "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" + +: "${AWS_BATCH_JOB_ID:=}" +: "${GITHUB_RUN_ID:=}" + +bin="$(dirname "$0")" +job_name="${1:?A job name is required as the first argument}" +github_repo="${2:?A GitHub repository with owner and repository name is required as the second argument}" +build_dir="${3:-ingest}" + +echo "Notifying Slack about started ${job_name} job." +message="${job_name} job has started." + +if [[ -n "${GITHUB_RUN_ID}" ]]; then + message+=" The job was submitted by GitHub Action ." +fi + +if [[ -n "${AWS_BATCH_JOB_ID}" ]]; then + message+=" The job was launched as AWS Batch job \`${AWS_BATCH_JOB_ID}\` ()." + message+=" Follow along in your local clone of ${github_repo} with: "'```'"nextstrain build --aws-batch --no-download --attach ${AWS_BATCH_JOB_ID} ${build_dir}"'```' +fi + +"$bin"/notify-slack "$message" diff --git a/vendored/notify-on-record-change b/vendored/notify-on-record-change new file mode 100755 index 00000000..c0bf8f7e --- /dev/null +++ b/vendored/notify-on-record-change @@ -0,0 +1,53 @@ +#!/bin/bash +set -euo pipefail + +: "${SLACK_TOKEN:?The SLACK_TOKEN environment variable is required.}" +: "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" + +bin="$(dirname "$0")" + +src="${1:?A source ndjson file is required as the first argument.}" +dst="${2:?A destination ndjson s3:// URL is required as the second argument.}" +source_name=${3:?A record source name is required as the third argument.} + +# if the file is not already present, just exit +"$bin"/s3-object-exists "$dst" || exit 0 + +s3path="${dst#s3://}" +bucket="${s3path%%/*}" +key="${s3path#*/}" + +src_record_count="$(wc -l < "$src")" + +# Try getting record count from S3 object metadata +dst_record_count="$(aws s3api head-object --bucket "$bucket" --key "$key" --query "Metadata.recordcount || ''" --output text 2>/dev/null || true)" +if [[ -z "$dst_record_count" ]]; then + # This object doesn't have the record count stored as metadata + # We have to download it and count the lines locally + dst_record_count="$(wc -l < <(aws s3 cp --no-progress "$dst" - | xz -T0 -dcfq))" +fi + +added_records="$(( src_record_count - dst_record_count ))" + +printf "%'4d %s\n" "$src_record_count" "$src" +printf "%'4d %s\n" "$dst_record_count" "$dst" +printf "%'4d added records\n" "$added_records" + +slack_message="" + +if [[ $added_records -gt 0 ]]; then + echo "Notifying Slack about added records (n=$added_records)" + slack_message="📈 New records (n=$added_records) found on $source_name." + +elif [[ $added_records -lt 0 ]]; then + echo "Notifying Slack about fewer records (n=$added_records)" + slack_message="📉 Fewer records (n=$added_records) found on $source_name." + +else + echo "Notifying Slack about same number of records" + slack_message="⛔ No new records found on $source_name." 
+fi + +slack_message+=" (Total record count: $src_record_count)" + +"$bin"/notify-slack "$slack_message" diff --git a/vendored/notify-slack b/vendored/notify-slack new file mode 100755 index 00000000..db98bfb8 --- /dev/null +++ b/vendored/notify-slack @@ -0,0 +1,56 @@ +#!/bin/bash +set -euo pipefail + +: "${SLACK_TOKEN:?The SLACK_TOKEN environment variable is required.}" +: "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" + +upload=0 +output=/dev/null +thread_ts="" +broadcast=0 +args=() + +for arg; do + case "$arg" in + --upload) + upload=1;; + --output=*) + output="${arg#*=}";; + --thread-ts=*) + thread_ts="${arg#*=}";; + --broadcast) + broadcast=1;; + *) + args+=("$arg");; + esac +done + +set -- "${args[@]}" + +text="${1:?Some message text is required.}" + +if [[ "$upload" == 1 ]]; then + echo "Uploading data to Slack with the message: $text" + curl https://slack.com/api/files.upload \ + --header "Authorization: Bearer $SLACK_TOKEN" \ + --form-string channels="$SLACK_CHANNELS" \ + --form-string title="$text" \ + --form-string filename="$text" \ + --form-string thread_ts="$thread_ts" \ + --form file=@/dev/stdin \ + --form filetype=text \ + --fail --silent --show-error \ + --http1.1 \ + --output "$output" +else + echo "Posting Slack message: $text" + curl https://slack.com/api/chat.postMessage \ + --header "Authorization: Bearer $SLACK_TOKEN" \ + --form-string channel="$SLACK_CHANNELS" \ + --form-string text="$text" \ + --form-string thread_ts="$thread_ts" \ + --form-string reply_broadcast="$broadcast" \ + --fail --silent --show-error \ + --http1.1 \ + --output "$output" +fi diff --git a/vendored/s3-object-exists b/vendored/s3-object-exists new file mode 100755 index 00000000..faac4219 --- /dev/null +++ b/vendored/s3-object-exists @@ -0,0 +1,8 @@ +#!/bin/bash +set -euo pipefail + +url="${1#s3://}" +bucket="${url%%/*}" +key="${url#*/}" + +aws s3api head-object --bucket "$bucket" --key "$key" &>/dev/null diff --git a/vendored/sha256sum b/vendored/sha256sum new file mode 100755 index 00000000..32d7ef82 --- /dev/null +++ b/vendored/sha256sum @@ -0,0 +1,15 @@ +#!/usr/bin/env python3 +""" +Portable sha256sum utility. +""" +from hashlib import sha256 +from sys import stdin + +chunk_size = 5 * 1024**2 # 5 MiB + +h = sha256() + +for chunk in iter(lambda: stdin.buffer.read(chunk_size), b""): + h.update(chunk) + +print(h.hexdigest()) diff --git a/vendored/transform-authors b/vendored/transform-authors new file mode 100755 index 00000000..0bade20e --- /dev/null +++ b/vendored/transform-authors @@ -0,0 +1,66 @@ +#!/usr/bin/env python3 +""" +Abbreviates a full list of authors to be ' et al.' of the NDJSON +record from stdin and outputs modified records to stdout. + +Note: This is a "best effort" approach and can potentially mangle the author name. +""" +import argparse +import json +import re +from sys import stderr, stdin, stdout + + +def parse_authors(record: dict, authors_field: str, default_value: str, + index: int, abbr_authors_field: str = None) -> dict: + # Strip and normalize whitespace + new_authors = re.sub(r'\s+', ' ', record[authors_field]) + + if new_authors == "": + new_authors = default_value + else: + # Split authors list on comma/semicolon + # OR "and"/"&" with at least one space before and after + new_authors = re.split(r'(?:\s*[,,;;]\s*|\s+(?:and|&)\s+)', new_authors)[0] + + # if it does not already end with " et al.", add it + if not new_authors.strip('. 
').endswith(" et al"): + new_authors += ' et al' + + if abbr_authors_field: + if record.get(abbr_authors_field): + print( + f"WARNING: the {abbr_authors_field!r} field already exists", + f"in record {index} and will be overwritten!", + file=stderr + ) + + record[abbr_authors_field] = new_authors + else: + record[authors_field] = new_authors + + return record + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + parser.add_argument("--authors-field", default="authors", + help="The field containing list of authors.") + parser.add_argument("--default-value", default="?", + help="Default value to use if authors list is empty.") + parser.add_argument("--abbr-authors-field", + help="The field for the generated abbreviated authors. " + + "If not provided, the original authors field will be modified.") + + args = parser.parse_args() + + for index, record in enumerate(stdin): + record = json.loads(record) + + parse_authors(record, args.authors_field, args.default_value, index, args.abbr_authors_field) + + json.dump(record, stdout, allow_nan=False, indent=None, separators=',:') + print() diff --git a/vendored/transform-field-names b/vendored/transform-field-names new file mode 100755 index 00000000..fde223fc --- /dev/null +++ b/vendored/transform-field-names @@ -0,0 +1,48 @@ +#!/usr/bin/env python3 +""" +Renames fields of the NDJSON record from stdin and outputs modified records +to stdout. +""" +import argparse +import json +from sys import stderr, stdin, stdout + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + parser.add_argument("--field-map", nargs="+", + help="Fields names in the NDJSON record mapped to new field names, " + + "formatted as '{old_field_name}={new_field_name}'. " + + "If the old field does not exist in record, the new field will be added with an empty string value." + + "If the new field already exists in record, then the renaming of the old field will be skipped.") + parser.add_argument("--force", action="store_true", + help="Force renaming of old field even if the new field already exists. " + + "Please keep in mind this will overwrite the value of the new field.") + + args = parser.parse_args() + + field_map = {} + for field in args.field_map: + old_name, new_name = field.split('=') + field_map[old_name] = new_name + + for record in stdin: + record = json.loads(record) + + for old_field, new_field in field_map.items(): + + if record.get(new_field) and not args.force: + print( + f"WARNING: skipping rename of {old_field} because record", + f"already has a field named {new_field}.", + file=stderr + ) + continue + + record[new_field] = record.pop(old_field, '') + + json.dump(record, stdout, allow_nan=False, indent=None, separators=',:') + print() diff --git a/vendored/transform-genbank-location b/vendored/transform-genbank-location new file mode 100755 index 00000000..70ba56fb --- /dev/null +++ b/vendored/transform-genbank-location @@ -0,0 +1,43 @@ +#!/usr/bin/env python3 +""" +Parses GenBank's 'location' field of the NDJSON record from stdin to 3 separate +fields: 'country', 'division', and 'location'. Checks that a record is from +GenBank by verifying that the 'database' field has a value of "GenBank" or "RefSeq". + +Outputs the modified record to stdout. 
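+
+For example (illustrative value, not from real data), a location field of
+"USA: California, San Diego" would yield country "USA", division "California",
+and location "San Diego".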
+""" +import json +from sys import stdin, stdout + + +def parse_location(record: dict) -> dict: + # Expected pattern for the location field is "[:][, ]" + # See GenBank docs for their "country" field: + # https://www.ncbi.nlm.nih.gov/genbank/collab/country/ + geographic_data = record['location'].split(':') + + country = geographic_data[0] + division = '' + location = '' + + if len(geographic_data) == 2: + division , _ , location = geographic_data[1].partition(',') + + record['country'] = country.strip() + record['division'] = division.strip() + record['location'] = location.strip() + + return record + + +if __name__ == '__main__': + + for record in stdin: + record = json.loads(record) + + database = record.get('database', '') + if database in {'GenBank', 'RefSeq'}: + parse_location(record) + + json.dump(record, stdout, allow_nan=False, indent=None, separators=',:') + print() diff --git a/vendored/trigger b/vendored/trigger new file mode 100755 index 00000000..11d1b635 --- /dev/null +++ b/vendored/trigger @@ -0,0 +1,56 @@ +#!/bin/bash +set -euo pipefail + +: "${PAT_GITHUB_DISPATCH:=}" + +github_repo="${1:?A GitHub repository with owner and repository name is required as the first argument.}" +event_type="${2:?An event type is required as the second argument.}" +shift 2 + +if [[ $# -eq 0 && -z $PAT_GITHUB_DISPATCH ]]; then + cat >&2 <<. +You must specify options to curl for your GitHub credentials. For example, you +can specify your GitHub username, and will be prompted for your password: + + $0 $github_repo $event_type --user + +Be sure to enter a personal access token¹ as your password since GitHub has +discontinued password authentication to the API starting on November 13, 2020². + +You can also store your credentials or a personal access token in a netrc +file³: + + machine api.github.com + login + password + +and then tell curl to use it: + + $0 $github_repo $event_type --netrc + +which will then not require you to type your password every time. + +¹ https://help.github.com/en/github/authenticating-to-github/creating-a-personal-access-token-for-the-command-line +² https://docs.github.com/en/rest/overview/other-authentication-methods#via-username-and-password +³ https://ec.haxx.se/usingcurl/usingcurl-netrc +. + exit 1 +fi + +auth=':' +if [[ -n $PAT_GITHUB_DISPATCH ]]; then + auth="Authorization: Bearer ${PAT_GITHUB_DISPATCH}" +fi + +if curl -fsS "https://api.github.com/repos/${github_repo}/dispatches" \ + -H 'Accept: application/vnd.github.v3+json' \ + -H 'Content-Type: application/json' \ + -H "$auth" \ + -d '{"event_type":"'"$event_type"'"}' \ + "$@" +then + echo "Successfully triggered $event_type" +else + echo "Request failed" >&2 + exit 1 +fi diff --git a/vendored/trigger-on-new-data b/vendored/trigger-on-new-data new file mode 100755 index 00000000..ef71d88d --- /dev/null +++ b/vendored/trigger-on-new-data @@ -0,0 +1,32 @@ +#!/bin/bash +set -euo pipefail + +: "${PAT_GITHUB_DISPATCH:?The PAT_GITHUB_DISPATCH environment variable is required.}" + +bin="$(dirname "$0")" + +github_repo="${1:?A GitHub repository with owner and repository name is required as the first argument.}" +event_type="${2:?An event type is required as the second argument.}" +metadata="${3:?A metadata upload output file is required as the third argument.}" +sequences="${4:?An sequence FASTA upload output file is required as the fourth argument.}" +identical_file_message="${5:-files are identical}" + +new_metadata=$(grep "$identical_file_message" "$metadata" >/dev/null; echo $?) 
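+# Capturing grep's exit status via `echo $?` inside the command substitution lets a
+# non-match (no identical-file message, i.e. new data) be recorded without tripping
+# `set -euo pipefail`.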
+new_sequences=$(grep "$identical_file_message" "$sequences" >/dev/null; echo $?) + +slack_message="" + +# grep exit status 0 for found match, 1 for no match, 2 if an error occurred +if [[ $new_metadata -eq 1 || $new_sequences -eq 1 ]]; then + slack_message="Triggering new builds due to updated metadata and/or sequences" + "$bin"/trigger "$github_repo" "$event_type" +elif [[ $new_metadata -eq 0 && $new_sequences -eq 0 ]]; then + slack_message="Skipping trigger of rebuild: Both metadata TSV and sequences FASTA are identical to S3 files." +else + slack_message="Skipping trigger of rebuild: Unable to determine if data has been updated." +fi + + +if ! "$bin"/notify-slack "$slack_message"; then + echo "Notifying Slack failed, but exiting with success anyway." +fi diff --git a/vendored/upload-to-s3 b/vendored/upload-to-s3 new file mode 100755 index 00000000..31cd49bf --- /dev/null +++ b/vendored/upload-to-s3 @@ -0,0 +1,78 @@ +#!/bin/bash +set -euo pipefail + +bin="$(dirname "$0")" + +main() { + local quiet=0 + + for arg; do + case "$arg" in + --quiet) + quiet=1 + shift;; + *) + break;; + esac + done + + local src="${1:?A source file is required as the first argument.}" + local dst="${2:?A destination s3:// URL is required as the second argument.}" + local cloudfront_domain="${3:-}" + + local s3path="${dst#s3://}" + local bucket="${s3path%%/*}" + local key="${s3path#*/}" + + local src_hash dst_hash no_hash=0000000000000000000000000000000000000000000000000000000000000000 + src_hash="$("$bin/sha256sum" < "$src")" + dst_hash="$(aws s3api head-object --bucket "$bucket" --key "$key" --query Metadata.sha256sum --output text 2>/dev/null || echo "$no_hash")" + + if [[ $src_hash != "$dst_hash" ]]; then + # The record count may have changed + src_record_count="$(wc -l < "$src")" + + echo "Uploading $src → $dst" + if [[ "$dst" == *.gz ]]; then + gzip -c "$src" + elif [[ "$dst" == *.xz ]]; then + xz -2 -T0 -c "$src" + elif [[ "$dst" == *.zst ]]; then + zstd -T0 -c "$src" + else + cat "$src" + fi | aws s3 cp --no-progress - "$dst" --metadata sha256sum="$src_hash",recordcount="$src_record_count" "$(content-type "$dst")" + + if [[ -n $cloudfront_domain ]]; then + echo "Creating CloudFront invalidation for $cloudfront_domain/$key" + if ! "$bin"/cloudfront-invalidate "$cloudfront_domain" "/$key"; then + echo "CloudFront invalidation failed, but exiting with success anyway." + fi + fi + + if [[ $quiet == 1 ]]; then + echo "Quiet mode. No Slack notification sent." + exit 0 + fi + + if ! "$bin"/notify-slack "Updated $dst available."; then + echo "Notifying Slack failed, but exiting with success anyway." + fi + else + echo "Uploading $src → $dst: files are identical, skipping upload" + fi +} + +content-type() { + case "$1" in + *.tsv) echo --content-type=text/tab-separated-values;; + *.csv) echo --content-type=text/comma-separated-values;; + *.ndjson) echo --content-type=application/x-ndjson;; + *.gz) echo --content-type=application/gzip;; + *.xz) echo --content-type=application/x-xz;; + *.zst) echo --content-type=application/zstd;; + *) echo --content-type=text/plain;; + esac +} + +main "$@" From 56335d2ff7ae945ac5d5260f72ffdba39da0e5aa Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Mon, 31 Jul 2023 15:18:14 -0700 Subject: [PATCH 2/6] Describe subrepo setup The previous commit was created by the following command: git subrepo clone https://github.com/nextstrain/ingest vendored Add a section in the README on how to use this directory in the future. 
--- README.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/README.md b/README.md index af89f31c..445af45a 100644 --- a/README.md +++ b/README.md @@ -150,3 +150,16 @@ aws s3 cp - s3://nextstrain-data/files/ncov/open/nextclade_21L.tsv.zst.renew < / - `AWS_SECRET_ACCESS_KEY` - `SLACK_TOKEN` - `SLACK_CHANNELS` + +## `vendored` + +This repository uses [`git subrepo`](https://github.com/ingydotnet/git-subrepo) to manage copies of ingest scripts in `vendored`, from [nextstrain/ingest](https://github.com/nextstrain/ingest). To pull new changes from the central ingest repository, first install `git subrepo`, then run: + +```sh +git subrepo pull vendored +``` + +Changes should not be pushed using `git subrepo push`. + +1. For pathogen-specific changes, make them in this repository via a pull request. +2. For pathogen-agnostic changes, make them on [nextstrain/ingest](https://github.com/nextstrain/ingest) via pull request there, then use `git subrepo pull` to add those changes to this repository. From 4ba0375d50d00b73bbcc0788e3b903091a0705b0 Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Fri, 11 Aug 2023 08:57:35 -0400 Subject: [PATCH 3/6] Use centralized scripts that are functionally identical Remove the copies in this repo and update references. --- bin/cloudfront-invalidate | 42 ---------- bin/download-from-s3 | 41 ---------- bin/local-ingest-gisaid | 20 ++--- bin/notify-on-additional-info-change | 3 +- bin/notify-on-duplicate-biosample-change | 5 +- bin/notify-on-flagged-metadata-change | 5 +- bin/notify-on-record-change | 53 ------------- bin/s3-object-exists | 8 -- bin/sha256sum | 15 ---- bin/upload-to-s3 | 78 ------------------- workflow/snakemake_rules/fetch_sequences.smk | 20 ++--- workflow/snakemake_rules/nextclade.smk | 16 ++-- .../snakemake_rules/slack_notifications.smk | 2 +- workflow/snakemake_rules/upload.smk | 2 +- 14 files changed, 38 insertions(+), 272 deletions(-) delete mode 100755 bin/cloudfront-invalidate delete mode 100755 bin/download-from-s3 delete mode 100755 bin/notify-on-record-change delete mode 100755 bin/s3-object-exists delete mode 100755 bin/sha256sum delete mode 100755 bin/upload-to-s3 diff --git a/bin/cloudfront-invalidate b/bin/cloudfront-invalidate deleted file mode 100755 index dec48529..00000000 --- a/bin/cloudfront-invalidate +++ /dev/null @@ -1,42 +0,0 @@ -#!/bin/bash -# Originally from @tsibley's gist: https://gist.github.com/tsibley/a66262d341dedbea39b02f27e2837ea8 -set -euo pipefail - -main() { - local domain="$1" - shift - local paths=("$@") - local distribution invalidation - - echo "-> Finding CloudFront distribution" - distribution=$( - aws cloudfront list-distributions \ - --query "DistributionList.Items[?contains(Aliases.Items, \`$domain\`)] | [0].Id" \ - --output text - ) - - if [[ -z $distribution || $distribution == None ]]; then - exec >&2 - echo "Unable to find CloudFront distribution id for $domain" - echo - echo "Are your AWS CLI credentials for the right account?" - exit 1 - fi - - echo "-> Creating CloudFront invalidation for distribution $distribution" - invalidation=$( - aws cloudfront create-invalidation \ - --distribution-id "$distribution" \ - --paths "${paths[@]}" \ - --query Invalidation.Id \ - --output text - ) - - echo "-> Waiting for CloudFront invalidation $invalidation to complete" - echo " Ctrl-C to stop waiting." 
- aws cloudfront wait invalidation-completed \ - --distribution-id "$distribution" \ - --id "$invalidation" -} - -main "$@" diff --git a/bin/download-from-s3 b/bin/download-from-s3 deleted file mode 100755 index 281bf227..00000000 --- a/bin/download-from-s3 +++ /dev/null @@ -1,41 +0,0 @@ -#!/bin/bash -set -euo pipefail - -bin="$(dirname "$0")" - -main() { - local src="${1:?A source s3:// URL is required as the first argument.}" - local dst="${2:?A destination file path is required as the second argument.}" - local n="${3:-0}" # How many lines to subsample to. 0 means no subsampling. Optional. - - local s3path="${src#s3://}" - local bucket="${s3path%%/*}" - local key="${s3path#*/}" - - local src_hash dst_hash no_hash=0000000000000000000000000000000000000000000000000000000000000000 - dst_hash="$("$bin/sha256sum" < "$dst" || true)" - src_hash="$(aws s3api head-object --bucket "$bucket" --key "$key" --query Metadata.sha256sum --output text 2>/dev/null || echo "$no_hash")" - - echo "[ INFO] Downloading $src → $dst" - if [[ $src_hash != "$dst_hash" ]]; then - aws s3 cp --no-progress "$src" - | - if [[ "$src" == *.gz ]]; then - gunzip -cfq - elif [[ "$src" == *.xz ]]; then - xz -T0 -dcq - elif [[ "$src" == *.zst ]]; then - zstd -T0 -dcq - else - cat - fi | - if [[ "$n" -gt 0 ]]; then - tsv-sample -H -i -n "$n" - else - cat - fi >"$dst" - else - echo "[ INFO] Files are identical, skipping download" - fi -} - -main "$@" diff --git a/bin/local-ingest-gisaid b/bin/local-ingest-gisaid index 7f6d8b07..892be334 100755 --- a/bin/local-ingest-gisaid +++ b/bin/local-ingest-gisaid @@ -79,8 +79,8 @@ main() { download-inputs() { mkdir -p "${INPUT_DIR}" - ./bin/download-from-s3 "${S3_BUCKET}/additional_info.tsv.gz" "data/gisaid/inputs/additional_info.tsv" - ./bin/download-from-s3 "${S3_BUCKET}/metadata.tsv.gz" "data/gisaid/inputs/metadata.tsv" + ./vendored/download-from-s3 "${S3_BUCKET}/additional_info.tsv.gz" "data/gisaid/inputs/additional_info.tsv" + ./vendored/download-from-s3 "${S3_BUCKET}/metadata.tsv.gz" "data/gisaid/inputs/metadata.tsv" } download-gisaid() { @@ -160,16 +160,16 @@ ingest() { } upload-outputs() { - ./bin/upload-to-s3 "${OUTPUT_DIR}/metadata.tsv" "${S3_BUCKET}/metadata.tsv.gz" - ./bin/upload-to-s3 "${OUTPUT_DIR}/additional_info.tsv" "${S3_BUCKET}/additional_info.tsv.gz" - ./bin/upload-to-s3 "${OUTPUT_DIR}/flagged_metadata.txt" "${S3_BUCKET}/flagged_metadata.txt.gz" - ./bin/upload-to-s3 "${OUTPUT_DIR}/sequences.fasta" "${S3_BUCKET}/sequences.fasta.xz" + ./vendored/upload-to-s3 "${OUTPUT_DIR}/metadata.tsv" "${S3_BUCKET}/metadata.tsv.gz" + ./vendored/upload-to-s3 "${OUTPUT_DIR}/additional_info.tsv" "${S3_BUCKET}/additional_info.tsv.gz" + ./vendored/upload-to-s3 "${OUTPUT_DIR}/flagged_metadata.txt" "${S3_BUCKET}/flagged_metadata.txt.gz" + ./vendored/upload-to-s3 "${OUTPUT_DIR}/sequences.fasta" "${S3_BUCKET}/sequences.fasta.xz" # Parallel uploads of zstd compressed files to slowly transition to this format - ./bin/upload-to-s3 "${OUTPUT_DIR}/metadata.tsv" "${S3_BUCKET}/metadata.tsv.zst" - ./bin/upload-to-s3 "${OUTPUT_DIR}/additional_info.tsv" "${S3_BUCKET}/additional_info.tsv.zst" - ./bin/upload-to-s3 "${OUTPUT_DIR}/flagged_metadata.txt" "${S3_BUCKET}/flagged_metadata.txt.zst" - ./bin/upload-to-s3 "${OUTPUT_DIR}/sequences.fasta" "${S3_BUCKET}/sequences.fasta.zst" + ./vendored/upload-to-s3 "${OUTPUT_DIR}/metadata.tsv" "${S3_BUCKET}/metadata.tsv.zst" + ./vendored/upload-to-s3 "${OUTPUT_DIR}/additional_info.tsv" "${S3_BUCKET}/additional_info.tsv.zst" + ./vendored/upload-to-s3 
"${OUTPUT_DIR}/flagged_metadata.txt" "${S3_BUCKET}/flagged_metadata.txt.zst" + ./vendored/upload-to-s3 "${OUTPUT_DIR}/sequences.fasta" "${S3_BUCKET}/sequences.fasta.zst" } print-help() { diff --git a/bin/notify-on-additional-info-change b/bin/notify-on-additional-info-change index 95190617..185589a5 100755 --- a/bin/notify-on-additional-info-change +++ b/bin/notify-on-additional-info-change @@ -5,12 +5,13 @@ set -euo pipefail : "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" bin="$(dirname "$0")" +vendored="$(dirname "$0")"/../vendored src="${1:?A source additional info TSV file is required as the first argument.}" dst="${2:?A destination additional info TSV s3:// URL is required as the second argument.}" # if the file is not already present, just exit -"$bin"/s3-object-exists "$dst" || exit 0 +"$vendored"/s3-object-exists "$dst" || exit 0 # Remove rows where columns 3 (additional_host_info) and 4 (additional_location_info) are empty. # Compare the S3 version with the local version. diff --git a/bin/notify-on-duplicate-biosample-change b/bin/notify-on-duplicate-biosample-change index 6e94ac93..5e7983ca 100755 --- a/bin/notify-on-duplicate-biosample-change +++ b/bin/notify-on-duplicate-biosample-change @@ -5,6 +5,7 @@ set -euo pipefail : "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" bin="$(dirname "$0")" +vendored="$(dirname "$0")"/../vendored src="${1:?A source duplicate BioSample txt file is required as the first argument.}" dst="${2:?A destination duplicate BioSample txt s3:// URL is required as the second argument.}" @@ -16,9 +17,9 @@ diff="$(mktemp -t duplicate-biosample-additions-XXXXXX)" trap "rm -f '$dst_local' '$diff'" EXIT # if the file is not already present, just exit -"$bin"/s3-object-exists "$dst" || exit 0 +"$vendored"/s3-object-exists "$dst" || exit 0 -"$bin"/download-from-s3 "$dst" "$dst_local" +"$vendored"/download-from-s3 "$dst" "$dst_local" comm -13 \ <(sort "$dst_local") \ diff --git a/bin/notify-on-flagged-metadata-change b/bin/notify-on-flagged-metadata-change index 55069f49..0b93784d 100755 --- a/bin/notify-on-flagged-metadata-change +++ b/bin/notify-on-flagged-metadata-change @@ -5,6 +5,7 @@ set -euo pipefail : "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" bin="$(dirname "$0")" +vendored="$(dirname "$0")"/../vendored src="${1:?A source flagged metadata txt file is required as the first argument.}" dst="${2:?A destination flagged metadata txt s3:// URL is required as the second argument.}" @@ -16,9 +17,9 @@ diff="$(mktemp -t flagged-metadata-additions-XXXXXX)" trap "rm -f '$dst_local' '$diff'" EXIT # if the file is not already present, just exit -"$bin"/s3-object-exists "$dst" || exit 0 +"$vendored"/s3-object-exists "$dst" || exit 0 -"$bin"/download-from-s3 "$dst" "$dst_local" +"$vendored"/download-from-s3 "$dst" "$dst_local" comm -13 \ <(sort "$dst_local") \ diff --git a/bin/notify-on-record-change b/bin/notify-on-record-change deleted file mode 100755 index 7c8afd07..00000000 --- a/bin/notify-on-record-change +++ /dev/null @@ -1,53 +0,0 @@ -#!/bin/bash -set -euo pipefail - -: "${SLACK_TOKEN:?The SLACK_TOKEN environment variable is required.}" -: "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" - -bin="$(dirname "$0")" - -src="${1:?A source GenBank or GISAID ndjson file is required as the first argument.}" -dst="${2:?A destination GenBank or GISAID ndjson s3:// URL is required as the second argument.}" -source_name=${3:?A record source name is required as the 
third argument.} - -# if the file is not already present, just exit -"$bin"/s3-object-exists "$dst" || exit 0 - -s3path="${dst#s3://}" -bucket="${s3path%%/*}" -key="${s3path#*/}" - -src_record_count="$(wc -l < "$src")" - -# Try getting record count from S3 object metadata -dst_record_count="$(aws s3api head-object --bucket "$bucket" --key "$key" --query "Metadata.recordcount || ''" --output text 2>/dev/null || true)" -if [[ -z "$dst_record_count" ]]; then - # This object doesn't have the record count stored as metadata - # We have to download it and count the lines locally - dst_record_count="$(wc -l < <(aws s3 cp --no-progress "$dst" - | xz -T0 -dcfq))" -fi - -added_records="$(( src_record_count - dst_record_count ))" - -printf "%'4d %s\n" "$src_record_count" "$src" -printf "%'4d %s\n" "$dst_record_count" "$dst" -printf "%'4d added records\n" "$added_records" - -slack_message="" - -if [[ $added_records -gt 0 ]]; then - echo "Notifying Slack about added records (n=$added_records)" - slack_message="📈 New nCoV records (n=$added_records) found on $source_name." - -elif [[ $added_records -lt 0 ]]; then - echo "Notifying Slack about fewer records (n=$added_records)" - slack_message="📉 Fewer nCoV records (n=$added_records) found on $source_name." - -else - echo "Notifying Slack about same number of records" - slack_message="⛔ No new nCoV records found on $source_name." -fi - -slack_message+=" (Total record count: $src_record_count)" - -"$bin"/notify-slack "$slack_message" diff --git a/bin/s3-object-exists b/bin/s3-object-exists deleted file mode 100755 index faac4219..00000000 --- a/bin/s3-object-exists +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash -set -euo pipefail - -url="${1#s3://}" -bucket="${url%%/*}" -key="${url#*/}" - -aws s3api head-object --bucket "$bucket" --key "$key" &>/dev/null diff --git a/bin/sha256sum b/bin/sha256sum deleted file mode 100755 index 32d7ef82..00000000 --- a/bin/sha256sum +++ /dev/null @@ -1,15 +0,0 @@ -#!/usr/bin/env python3 -""" -Portable sha256sum utility. 
-""" -from hashlib import sha256 -from sys import stdin - -chunk_size = 5 * 1024**2 # 5 MiB - -h = sha256() - -for chunk in iter(lambda: stdin.buffer.read(chunk_size), b""): - h.update(chunk) - -print(h.hexdigest()) diff --git a/bin/upload-to-s3 b/bin/upload-to-s3 deleted file mode 100755 index 31cd49bf..00000000 --- a/bin/upload-to-s3 +++ /dev/null @@ -1,78 +0,0 @@ -#!/bin/bash -set -euo pipefail - -bin="$(dirname "$0")" - -main() { - local quiet=0 - - for arg; do - case "$arg" in - --quiet) - quiet=1 - shift;; - *) - break;; - esac - done - - local src="${1:?A source file is required as the first argument.}" - local dst="${2:?A destination s3:// URL is required as the second argument.}" - local cloudfront_domain="${3:-}" - - local s3path="${dst#s3://}" - local bucket="${s3path%%/*}" - local key="${s3path#*/}" - - local src_hash dst_hash no_hash=0000000000000000000000000000000000000000000000000000000000000000 - src_hash="$("$bin/sha256sum" < "$src")" - dst_hash="$(aws s3api head-object --bucket "$bucket" --key "$key" --query Metadata.sha256sum --output text 2>/dev/null || echo "$no_hash")" - - if [[ $src_hash != "$dst_hash" ]]; then - # The record count may have changed - src_record_count="$(wc -l < "$src")" - - echo "Uploading $src → $dst" - if [[ "$dst" == *.gz ]]; then - gzip -c "$src" - elif [[ "$dst" == *.xz ]]; then - xz -2 -T0 -c "$src" - elif [[ "$dst" == *.zst ]]; then - zstd -T0 -c "$src" - else - cat "$src" - fi | aws s3 cp --no-progress - "$dst" --metadata sha256sum="$src_hash",recordcount="$src_record_count" "$(content-type "$dst")" - - if [[ -n $cloudfront_domain ]]; then - echo "Creating CloudFront invalidation for $cloudfront_domain/$key" - if ! "$bin"/cloudfront-invalidate "$cloudfront_domain" "/$key"; then - echo "CloudFront invalidation failed, but exiting with success anyway." - fi - fi - - if [[ $quiet == 1 ]]; then - echo "Quiet mode. No Slack notification sent." - exit 0 - fi - - if ! "$bin"/notify-slack "Updated $dst available."; then - echo "Notifying Slack failed, but exiting with success anyway." 
- fi - else - echo "Uploading $src → $dst: files are identical, skipping upload" - fi -} - -content-type() { - case "$1" in - *.tsv) echo --content-type=text/tab-separated-values;; - *.csv) echo --content-type=text/comma-separated-values;; - *.ndjson) echo --content-type=application/x-ndjson;; - *.gz) echo --content-type=application/gzip;; - *.xz) echo --content-type=application/x-xz;; - *.zst) echo --content-type=application/zstd;; - *) echo --content-type=text/plain;; - esac -} - -main "$@" diff --git a/workflow/snakemake_rules/fetch_sequences.smk b/workflow/snakemake_rules/fetch_sequences.smk index 0e03d7f9..7d1402d4 100644 --- a/workflow/snakemake_rules/fetch_sequences.smk +++ b/workflow/snakemake_rules/fetch_sequences.smk @@ -253,8 +253,8 @@ if config.get("s3_dst") and config.get("s3_src"): ndjson = temp(f"data/{database}.ndjson") shell: """ - ./bin/download-from-s3 {params.file_on_s3_dst} {output.ndjson} {params.lines} || \ - ./bin/download-from-s3 {params.file_on_s3_src} {output.ndjson} {params.lines} + ./vendored/download-from-s3 {params.file_on_s3_dst} {output.ndjson} {params.lines} || \ + ./vendored/download-from-s3 {params.file_on_s3_src} {output.ndjson} {params.lines} """ rule fetch_biosample_from_s3: @@ -268,8 +268,8 @@ if config.get("s3_dst") and config.get("s3_src"): biosample = temp("data/biosample.ndjson") shell: """ - ./bin/download-from-s3 {params.file_on_s3_dst} {output.biosample} {params.lines} || \ - ./bin/download-from-s3 {params.file_on_s3_src} {output.biosample} {params.lines} + ./vendored/download-from-s3 {params.file_on_s3_dst} {output.biosample} {params.lines} || \ + ./vendored/download-from-s3 {params.file_on_s3_src} {output.biosample} {params.lines} """ rule fetch_rki_ndjson_from_s3: @@ -281,8 +281,8 @@ if config.get("s3_dst") and config.get("s3_src"): rki_ndjson = temp("data/rki.ndjson") shell: """ - ./bin/download-from-s3 {params.file_on_s3_dst} {output.rki_ndjson} {params.lines} || \ - ./bin/download-from-s3 {params.file_on_s3_src} {output.rki_ndjson} {params.lines} + ./vendored/download-from-s3 {params.file_on_s3_dst} {output.rki_ndjson} {params.lines} || \ + ./vendored/download-from-s3 {params.file_on_s3_src} {output.rki_ndjson} {params.lines} """ rule fetch_cog_uk_accessions_from_s3: params: @@ -293,8 +293,8 @@ if config.get("s3_dst") and config.get("s3_src"): biosample = "data/cog_uk_accessions.tsv" if config.get("keep_temp",False) else temp("data/cog_uk_accessions.tsv") shell: """ - ./bin/download-from-s3 {params.file_on_s3_dst} {output.biosample} {params.lines} || \ - ./bin/download-from-s3 {params.file_on_s3_src} {output.biosample} {params.lines} + ./vendored/download-from-s3 {params.file_on_s3_dst} {output.biosample} {params.lines} || \ + ./vendored/download-from-s3 {params.file_on_s3_src} {output.biosample} {params.lines} """ rule fetch_cog_uk_metadata_from_s3: @@ -306,8 +306,8 @@ if config.get("s3_dst") and config.get("s3_src"): biosample = temp("data/cog_uk_metadata.csv") shell: """ - ./bin/download-from-s3 {params.file_on_s3_dst} {output.biosample} {params.lines} || \ - ./bin/download-from-s3 {params.file_on_s3_src} {output.biosample} {params.lines} + ./vendored/download-from-s3 {params.file_on_s3_dst} {output.biosample} {params.lines} || \ + ./vendored/download-from-s3 {params.file_on_s3_src} {output.biosample} {params.lines} """ rule compress_cog_uk_metadata: diff --git a/workflow/snakemake_rules/nextclade.smk b/workflow/snakemake_rules/nextclade.smk index 264d3734..921a4679 100644 --- a/workflow/snakemake_rules/nextclade.smk +++ 
b/workflow/snakemake_rules/nextclade.smk @@ -75,10 +75,10 @@ if config.get("s3_dst") and config.get("s3_src"): nextclade=f"data/{database}/nextclade{{reference}}_old.tsv", shell: """ - ./bin/download-from-s3 {params.dst_rerun_touchfile} {output.nextclade} 0 || \ - ./bin/download-from-s3 {params.src_rerun_touchfile} {output.nextclade} 0 || \ - ./bin/download-from-s3 {params.dst_source} {output.nextclade} {params.lines} || \ - ./bin/download-from-s3 {params.src_source} {output.nextclade} {params.lines} || \ + ./vendored/download-from-s3 {params.dst_rerun_touchfile} {output.nextclade} 0 || \ + ./vendored/download-from-s3 {params.src_rerun_touchfile} {output.nextclade} 0 || \ + ./vendored/download-from-s3 {params.dst_source} {output.nextclade} {params.lines} || \ + ./vendored/download-from-s3 {params.src_source} {output.nextclade} {params.lines} || \ touch {output.nextclade} """ @@ -96,10 +96,10 @@ if config.get("s3_dst") and config.get("s3_src"): alignment=temp(f"data/{database}/nextclade.{{seqtype}}.old.fasta"), shell: """ - ./bin/download-from-s3 {params.dst_rerun_touchfile} {output.alignment} 0 || \ - ./bin/download-from-s3 {params.src_rerun_touchfile} {output.alignment} 0 || \ - ./bin/download-from-s3 {params.dst_source} {output.alignment} {params.lines} || \ - ./bin/download-from-s3 {params.src_source} {output.alignment} {params.lines} || \ + ./vendored/download-from-s3 {params.dst_rerun_touchfile} {output.alignment} 0 || \ + ./vendored/download-from-s3 {params.src_rerun_touchfile} {output.alignment} 0 || \ + ./vendored/download-from-s3 {params.dst_source} {output.alignment} {params.lines} || \ + ./vendored/download-from-s3 {params.src_source} {output.alignment} {params.lines} || \ touch {output.alignment} """ diff --git a/workflow/snakemake_rules/slack_notifications.smk b/workflow/snakemake_rules/slack_notifications.smk index 5add860f..cd9c4463 100644 --- a/workflow/snakemake_rules/slack_notifications.smk +++ b/workflow/snakemake_rules/slack_notifications.smk @@ -32,7 +32,7 @@ rule notify_on_record_change: touch(f"data/{database}/notify-on-record-change.done") shell: """ - ./bin/notify-on-record-change {input.ndjson} {params.ndjson_on_s3} {database} + ./vendored/notify-on-record-change {input.ndjson} {params.ndjson_on_s3} {database} """ diff --git a/workflow/snakemake_rules/upload.smk b/workflow/snakemake_rules/upload.smk index 55103a62..7f139037 100644 --- a/workflow/snakemake_rules/upload.smk +++ b/workflow/snakemake_rules/upload.smk @@ -87,7 +87,7 @@ rule upload_single: cloudfront_domain = config.get("cloudfront_domain", ""), shell: """ - ./bin/upload-to-s3 \ + ./vendored/upload-to-s3 \ {params.quiet} \ {input:q} \ {params.s3_bucket:q}/{wildcards.remote_filename:q} \ From abd812e616e6db14b417a254a5d245443f221399 Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Fri, 11 Aug 2023 09:00:10 -0400 Subject: [PATCH 4/6] Use centralized trigger scripts Both the centralized `trigger` and `trigger-on-new-data` take an owner/repo pair as the first argument. 
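For example, a dispatch that was previously triggered with `./bin/trigger ncov-ingest gisaid/ingest`
is now invoked with the owning repo named explicitly (illustrative invocation, matching the README
changes below):

    ./vendored/trigger nextstrain/ncov-ingest gisaid/ingest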
--- .../fetch-and-ingest-genbank-master.yml | 2 +- .../fetch-and-ingest-gisaid-master.yml | 2 +- .github/workflows/ingest-genbank-master.yml | 2 +- .github/workflows/ingest-gisaid-master.yml | 2 +- .github/workflows/update-image.yml | 2 +- README.md | 12 ++-- bin/trigger | 56 ------------------- bin/trigger-on-new-data | 32 ----------- workflow/snakemake_rules/trigger.smk | 6 +- 9 files changed, 14 insertions(+), 102 deletions(-) delete mode 100755 bin/trigger delete mode 100755 bin/trigger-on-new-data diff --git a/.github/workflows/fetch-and-ingest-genbank-master.yml b/.github/workflows/fetch-and-ingest-genbank-master.yml index c2b341a6..4034cd91 100644 --- a/.github/workflows/fetch-and-ingest-genbank-master.yml +++ b/.github/workflows/fetch-and-ingest-genbank-master.yml @@ -22,7 +22,7 @@ on: # sister GISAID job, so that we don't need to keep two schedules in our heads. - cron: '7 18 * * *' - # Manually triggered using `./bin/trigger ncov-ingest genbank/fetch-and-ingest` (or `fetch-and-ingest`, which + # Manually triggered using `./vendored/trigger nextstrain/ncov-ingest genbank/fetch-and-ingest` (or `fetch-and-ingest`, which # includes GISAID) repository_dispatch: types: diff --git a/.github/workflows/fetch-and-ingest-gisaid-master.yml b/.github/workflows/fetch-and-ingest-gisaid-master.yml index d2459dd9..cc4c512d 100644 --- a/.github/workflows/fetch-and-ingest-gisaid-master.yml +++ b/.github/workflows/fetch-and-ingest-gisaid-master.yml @@ -22,7 +22,7 @@ on: # sister GenBank job, so that we don't need to keep two schedules in our heads. - cron: '7 18 * * *' - # Manually triggered using `./bin/trigger ncov-ingest gisaid/fetch-and-ingest` + # Manually triggered using `./vendored/trigger nextstrain/ncov-ingest gisaid/fetch-and-ingest` repository_dispatch: types: - gisaid/fetch-and-ingest diff --git a/.github/workflows/ingest-genbank-master.yml b/.github/workflows/ingest-genbank-master.yml index a8ddbd50..bdc2a413 100644 --- a/.github/workflows/ingest-genbank-master.yml +++ b/.github/workflows/ingest-genbank-master.yml @@ -1,7 +1,7 @@ name: GenBank ingest on: - # Manually triggered using `./bin/trigger ncov-ingest genbank/ingest` (or `ingest`, which + # Manually triggered using `./vendored/trigger nextstrain/ncov-ingest genbank/ingest` (or `ingest`, which # includes GISAID) repository_dispatch: types: diff --git a/.github/workflows/ingest-gisaid-master.yml b/.github/workflows/ingest-gisaid-master.yml index 7da212d9..3fc2e727 100644 --- a/.github/workflows/ingest-gisaid-master.yml +++ b/.github/workflows/ingest-gisaid-master.yml @@ -1,7 +1,7 @@ name: GISAID ingest on: - # Manually triggered using `./bin/trigger ncov-ingest gisaid/ingest` (or `ingest`, which + # Manually triggered using `./vendored/trigger nextstrain/ncov-ingest gisaid/ingest` (or `ingest`, which # includes GenBank) repository_dispatch: types: diff --git a/.github/workflows/update-image.yml b/.github/workflows/update-image.yml index c4042505..0251c30c 100644 --- a/.github/workflows/update-image.yml +++ b/.github/workflows/update-image.yml @@ -15,7 +15,7 @@ on: - yarn.lock - .github/workflows/update-image.yml - # Manually triggered using `./bin/trigger ncov-ingest update-image` + # Manually triggered using `./vendored/trigger nextstrain/ncov-ingest update-image` repository_dispatch: types: update-image diff --git a/README.md b/README.md index 445af45a..6c7b33e9 100644 --- a/README.md +++ b/README.md @@ -76,18 +76,18 @@ AWS credentials are stored in this repository's secrets and are associated with A full run is now done in 3 
steps via manual triggers: -1. Fetch new sequences and ingest them by running `./bin/trigger ncov-ingest gisaid/fetch-and-ingest --user `. +1. Fetch new sequences and ingest them by running `./vendored/trigger nextstrain/ncov-ingest gisaid/fetch-and-ingest --user `. 2. Add manual annotations, update location hierarchy as needed, and run ingest without fetching new sequences. - Pushes of `source-data/*-annotations.tsv` to the master branch will automatically trigger a run of ingest. - - You can also run ingest manually by running `./bin/trigger ncov-ingest gisaid/ingest --user `. -3. Once all manual fixes are complete, trigger a rebuild of [nextstrain/ncov](https://github.com/nextstrain/ncov) by running `./bin/trigger ncov gisaid/rebuild --user `. + - You can also run ingest manually by running `./vendored/trigger nextstrain/ncov-ingest gisaid/ingest --user `. +3. Once all manual fixes are complete, trigger a rebuild of [nextstrain/ncov](https://github.com/nextstrain/ncov) by running `./vendored/trigger ncov gisaid/rebuild --user `. -See the output of `./bin/trigger ncov-ingest gisaid/fetch-and-ingest --user `, `./bin/trigger ncov-ingest gisaid/ingest` or `./bin/trigger ncov-ingest rebuild` for more information about authentication with GitHub. +See the output of `./vendored/trigger nextstrain/ncov-ingest gisaid/fetch-and-ingest --user `, `./vendored/trigger nextstrain/ncov-ingest gisaid/ingest` or `./vendored/trigger nextstrain/ncov-ingest rebuild` for more information about authentication with GitHub. -Note: running `./bin/trigger ncov-ingest` posts a GitHub `repository_dispatch`. +Note: running `./vendored/trigger nextstrain/ncov-ingest` posts a GitHub `repository_dispatch`. Regardless of which branch you are on, it will trigger the specified action on the master branch. -Valid dispatch types for `./bin/trigger ncov-ingest` are: +Valid dispatch types for `./vendored/trigger nextstrain/ncov-ingest` are: - `ingest` (both GISAID and GenBank) - `gisaid/ingest` diff --git a/bin/trigger b/bin/trigger deleted file mode 100755 index d40553b6..00000000 --- a/bin/trigger +++ /dev/null @@ -1,56 +0,0 @@ -#!/bin/bash -set -euo pipefail - -: "${PAT_GITHUB_DISPATCH:=}" - -repo="${1:?A repository name is required as the first argument.}" -event_type="${2:?An event type is required as the second argument.}" -shift 2 - -if [[ $# -eq 0 && -z $PAT_GITHUB_DISPATCH ]]; then - cat >&2 <<. -You must specify options to curl for your GitHub credentials. For example, you -can specify your GitHub username, and will be prompted for your password: - - $0 $repo $event_type --user - -Be sure to enter a personal access token¹ as your password since GitHub has -discontinued password authentication to the API starting on November 13, 2020². - -You can also store your credentials or a personal access token in a netrc -file³: - - machine api.github.com - login - password - -and then tell curl to use it: - - $0 $repo $event_type --netrc - -which will then not require you to type your password every time. - -¹ https://help.github.com/en/github/authenticating-to-github/creating-a-personal-access-token-for-the-command-line -² https://docs.github.com/en/rest/overview/other-authentication-methods#via-username-and-password -³ https://ec.haxx.se/usingcurl/usingcurl-netrc -. 
- exit 1 -fi - -auth=':' -if [[ -n $PAT_GITHUB_DISPATCH ]]; then - auth="Authorization: Bearer ${PAT_GITHUB_DISPATCH}" -fi - -if curl -fsS "https://api.github.com/repos/nextstrain/${repo}/dispatches" \ - -H 'Accept: application/vnd.github.v3+json' \ - -H 'Content-Type: application/json' \ - -H "$auth" \ - -d '{"event_type":"'"$event_type"'"}' \ - "$@" -then - echo "Successfully triggered $event_type" -else - echo "Request failed" >&2 - exit 1 -fi diff --git a/bin/trigger-on-new-data b/bin/trigger-on-new-data deleted file mode 100755 index 1af77988..00000000 --- a/bin/trigger-on-new-data +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash -set -euo pipefail - -: "${PAT_GITHUB_DISPATCH:?The PAT_GITHUB_DISPATCH environment variable is required.}" - -bin="$(dirname "$0")" - -repo="${1:?A repository name is required as the first argument.}" -event_type="${2:?An event type is required as the second argument.}" -metadata="${3:?A metadata upload output file is required as the third argument.}" -sequences="${4:?An sequence FASTA upload output file is required as the fourth argument.}" -identical_file_message="${5:-files are identical}" - -new_metadata=$(grep "$identical_file_message" "$metadata" >/dev/null; echo $?) -new_sequences=$(grep "$identical_file_message" "$sequences" >/dev/null; echo $?) - -slack_message="" - -# grep exit status 0 for found match, 1 for no match, 2 if an error occurred -if [[ $new_metadata -eq 1 || $new_sequences -eq 1 ]]; then - slack_message="Triggering new builds due to updated metadata and/or sequences" - "$bin"/trigger "$repo" "$event_type" -elif [[ $new_metadata -eq 0 && $new_sequences -eq 0 ]]; then - slack_message="Skipping trigger of rebuild: Both metadata TSV and sequences FASTA are identical to S3 files." -else - slack_message="Skipping trigger of rebuild: Unable to determine if data has been updated." -fi - - -if ! "$bin"/notify-slack "$slack_message"; then - echo "Notifying Slack failed, but exiting with success anyway." -fi diff --git a/workflow/snakemake_rules/trigger.smk b/workflow/snakemake_rules/trigger.smk index d53d6bbb..09af7689 100644 --- a/workflow/snakemake_rules/trigger.smk +++ b/workflow/snakemake_rules/trigger.smk @@ -22,8 +22,8 @@ rule trigger_rebuild_pipeline: dispatch_type = f"{database}/rebuild" shell: """ - ./bin/trigger-on-new-data \ - ncov \ + ./vendored/trigger-on-new-data \ + nextstrain/ncov \ {params.dispatch_type} \ {input.metadata_upload} \ {input.fasta_upload} @@ -39,5 +39,5 @@ rule trigger_counts_pipeline: dispatch_type = f"{database}/clade-counts" shell: """ - ./bin/trigger forecasts-ncov {params.dispatch_type} + ./vendored/trigger nextstrain/forecasts-ncov {params.dispatch_type} """ From a82898bde9c79fa6f66f509e7595739132e2b2e9 Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Mon, 31 Jul 2023 15:40:38 -0700 Subject: [PATCH 5/6] Use centralized Slack notification scripts Remove the copies in this repo and update references. Add new positional arguments required by the centralized scripts. 
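For example (as reflected in the workflow and Snakefile changes below), the failure notification
now takes the job name and repo as positional arguments:

    ./vendored/notify-on-job-fail Ingest nextstrain/ncov-ingest

and the start notification passes the repo as its second argument:

    ./vendored/notify-on-job-start "<job name>" nextstrain/ncov-ingest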
--- .../fetch-and-ingest-genbank-master.yml | 2 +- .../fetch-and-ingest-gisaid-master.yml | 2 +- .github/workflows/ingest-genbank-master.yml | 2 +- .github/workflows/ingest-gisaid-master.yml | 2 +- Snakefile | 4 +- bin/notify-on-additional-info-change | 2 +- bin/notify-on-duplicate-biosample-change | 4 +- bin/notify-on-flagged-metadata-change | 4 +- bin/notify-on-job-fail | 21 --------- bin/notify-on-job-start | 26 ----------- bin/notify-on-problem-data | 3 +- bin/notify-slack | 44 ------------------- .../snakemake_rules/slack_notifications.smk | 4 +- 13 files changed, 15 insertions(+), 105 deletions(-) delete mode 100755 bin/notify-on-job-fail delete mode 100755 bin/notify-on-job-start delete mode 100755 bin/notify-slack diff --git a/.github/workflows/fetch-and-ingest-genbank-master.yml b/.github/workflows/fetch-and-ingest-genbank-master.yml index 4034cd91..fffd83fd 100644 --- a/.github/workflows/fetch-and-ingest-genbank-master.yml +++ b/.github/workflows/fetch-and-ingest-genbank-master.yml @@ -82,4 +82,4 @@ jobs: - name: notify_pipeline_failed if: ${{ failure() }} - run: ./bin/notify-on-job-fail + run: ./vendored/notify-on-job-fail Ingest nextstrain/ncov-ingest diff --git a/.github/workflows/fetch-and-ingest-gisaid-master.yml b/.github/workflows/fetch-and-ingest-gisaid-master.yml index cc4c512d..c4280389 100644 --- a/.github/workflows/fetch-and-ingest-gisaid-master.yml +++ b/.github/workflows/fetch-and-ingest-gisaid-master.yml @@ -85,4 +85,4 @@ jobs: - name: notify_pipeline_failed if: ${{ failure() }} - run: ./bin/notify-on-job-fail + run: ./vendored/notify-on-job-fail Ingest nextstrain/ncov-ingest diff --git a/.github/workflows/ingest-genbank-master.yml b/.github/workflows/ingest-genbank-master.yml index bdc2a413..6734f14a 100644 --- a/.github/workflows/ingest-genbank-master.yml +++ b/.github/workflows/ingest-genbank-master.yml @@ -51,4 +51,4 @@ jobs: - name: notify_pipeline_failed if: ${{ failure() }} - run: ./bin/notify-on-job-fail + run: ./vendored/notify-on-job-fail Ingest nextstrain/ncov-ingest diff --git a/.github/workflows/ingest-gisaid-master.yml b/.github/workflows/ingest-gisaid-master.yml index 3fc2e727..2d275f84 100644 --- a/.github/workflows/ingest-gisaid-master.yml +++ b/.github/workflows/ingest-gisaid-master.yml @@ -51,4 +51,4 @@ jobs: - name: notify_pipeline_failed if: ${{ failure() }} - run: ./bin/notify-on-job-fail + run: ./vendored/notify-on-job-fail Ingest nextstrain/ncov-ingest diff --git a/Snakefile b/Snakefile index 7904c267..d8e527ac 100644 --- a/Snakefile +++ b/Snakefile @@ -92,7 +92,7 @@ onstart: print(f"\t${{{var}}}: " + ("YES" if os.environ.get(var, "") else "NO") + f"({description})") if send_notifications: message="🥗 GISAID ingest" if database=="gisaid" else "🥣 GenBank ingest" - shell(f"./bin/notify-on-job-start \"{message}\"") + shell(f"./vendored/notify-on-job-start \"{message}\" nextstrain/ncov-ingest") onsuccess: message = "✅ This pipeline has successfully finished 🎉" @@ -104,7 +104,7 @@ onsuccess: onerror: print("Pipeline failed.") if send_notifications: - shell("./bin/notify-on-job-fail") + shell("./vendored/notify-on-job-fail Ingest nextstrain/ncov-ingest") if not config.get("keep_all_files", False): print("Removing intermediate files (set config option keep_all_files to skip this)") shell("./bin/clean") diff --git a/bin/notify-on-additional-info-change b/bin/notify-on-additional-info-change index 185589a5..c6554c65 100755 --- a/bin/notify-on-additional-info-change +++ b/bin/notify-on-additional-info-change @@ -27,7 +27,7 @@ diff="$( if [[ -n "$diff" 
]]; then echo "Notifying Slack about additional info change." - "$bin"/notify-slack --upload "additional-info-changes.txt" <<<"$diff" + "$vendored"/notify-slack --upload "additional-info-changes.txt" <<<"$diff" else echo "No additional info change." fi diff --git a/bin/notify-on-duplicate-biosample-change b/bin/notify-on-duplicate-biosample-change index 5e7983ca..52a372b8 100755 --- a/bin/notify-on-duplicate-biosample-change +++ b/bin/notify-on-duplicate-biosample-change @@ -29,8 +29,8 @@ comm -13 \ if [[ -s "$diff" ]]; then echo echo "Notifying Slack about duplicate BioSample additions." - "$bin"/notify-slack ":warning: Newly flagged duplicate BioSample strains" - "$bin"/notify-slack --upload "duplicate-biosample-additions.txt" < "$diff" + "$vendored"/notify-slack ":warning: Newly flagged duplicate BioSample strains" + "$vendored"/notify-slack --upload "duplicate-biosample-additions.txt" < "$diff" else echo "No flagged duplicate BioSample additions." fi diff --git a/bin/notify-on-flagged-metadata-change b/bin/notify-on-flagged-metadata-change index 0b93784d..b10f2f22 100755 --- a/bin/notify-on-flagged-metadata-change +++ b/bin/notify-on-flagged-metadata-change @@ -29,8 +29,8 @@ comm -13 \ if [[ -s "$diff" ]]; then echo echo "Notifying Slack about flagged metadata additions." - "$bin"/notify-slack ":waving_black_flag: Newly flagged metadata" - "$bin"/notify-slack --upload "flagged-metadata-additions.txt" < "$diff" + "$vendored"/notify-slack ":waving_black_flag: Newly flagged metadata" + "$vendored"/notify-slack --upload "flagged-metadata-additions.txt" < "$diff" else echo "No flagged metadata additions." fi diff --git a/bin/notify-on-job-fail b/bin/notify-on-job-fail deleted file mode 100755 index 3d49b934..00000000 --- a/bin/notify-on-job-fail +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash -set -euo pipefail - -: "${SLACK_TOKEN:?The SLACK_TOKEN environment variable is required.}" -: "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" - -bin="$(dirname "$0")" - -aws_batch_job_id="${AWS_BATCH_JOB_ID:-}" -github_run_id="${GITHUB_RUN_ID:-}" - -echo "Notifying Slack about failed ingest job." -message="❌ Ingest job has FAILED 😞 " - -if [ -n "${aws_batch_job_id}" ]; then - message+="See AWS Batch job \`${aws_batch_job_id}\` () for error details. " -elif [ -n "${github_run_id}" ]; then - message+="See GitHub Action for error details. " -fi - -"$bin"/notify-slack "$message" diff --git a/bin/notify-on-job-start b/bin/notify-on-job-start deleted file mode 100755 index 5d2f239c..00000000 --- a/bin/notify-on-job-start +++ /dev/null @@ -1,26 +0,0 @@ -#!/bin/bash -set -euo pipefail - -bin="$(dirname "$0")" - -export JOB_NAME="${1:?Job name is required as the first argument.}" -export ADDITIONAL_INFO="${2:-}" -export AWS_BATCH_JOB_ID="${AWS_BATCH_JOB_ID:-}" -export GITHUB_RUN_ID="${GITHUB_RUN_ID:-}" -export SLACK_TOKEN="${SLACK_TOKEN:-}" -export SLACK_CHANNELS="${SLACK_CHANNELS:-}" - -if [ -n "${SLACK_TOKEN}" ] && [ -n "${SLACK_CHANNELS}" ] && [ -n "${AWS_BATCH_JOB_ID}" ]; then - echo "Notifying Slack about started AWS Batch job. AWS_BATCH_JOB_ID=\"${AWS_BATCH_JOB_ID}\"." - - msg_job_id="${JOB_NAME} from AWS Batch job \`${AWS_BATCH_JOB_ID}\` () started." - - msg_action="" - if [ -n "${GITHUB_RUN_ID}" ]; then - msg_action="The job was submitted by GitHub Action ." 
- fi - - msg_command="Follow along in your local \`ncov-ingest\` repo with: "'```'"nextstrain build --aws-batch --no-download --attach ${AWS_BATCH_JOB_ID} ."'```' - - "$bin"/notify-slack "${msg_job_id} ${msg_action} ${msg_command} ${ADDITIONAL_INFO}" -fi diff --git a/bin/notify-on-problem-data b/bin/notify-on-problem-data index 41507378..e172330d 100755 --- a/bin/notify-on-problem-data +++ b/bin/notify-on-problem-data @@ -5,12 +5,13 @@ set -euo pipefail : "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" bin="$(dirname "$0")" +vendored="$(dirname "$0")"/../vendored problem_data="${1:?A problem data TSV file is required as the first argument.}" if [[ -s "$problem_data" ]]; then echo "Notifying Slack about problem data." - "$bin"/notify-slack --upload "genbank-problem-data.tsv" < "$problem_data" + "$vendored"/notify-slack --upload "genbank-problem-data.tsv" < "$problem_data" else echo "No problem data found." fi diff --git a/bin/notify-slack b/bin/notify-slack deleted file mode 100755 index 6695d83f..00000000 --- a/bin/notify-slack +++ /dev/null @@ -1,44 +0,0 @@ -#!/bin/bash -set -euo pipefail - -: "${SLACK_TOKEN:?The SLACK_TOKEN environment variable is required.}" -: "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" - -upload=0 -args=() - -for arg; do - case "$arg" in - --upload) - upload=1;; - *) - args+=("$arg");; - esac -done - -set -- "${args[@]}" - -text="${1:?Some message text is required.}" - -if [[ "$upload" == 1 ]]; then - echo "Uploading data to Slack with the message: $text" - curl https://slack.com/api/files.upload \ - --header "Authorization: Bearer $SLACK_TOKEN" \ - --form-string channels="$SLACK_CHANNELS" \ - --form-string title="$text" \ - --form-string filename="$text" \ - --form file=@/dev/stdin \ - --form filetype=text \ - --fail --silent --show-error \ - --http1.1 \ - --output /dev/null -else - echo "Posting Slack message: $text" - curl https://slack.com/api/chat.postMessage \ - --header "Authorization: Bearer $SLACK_TOKEN" \ - --form-string channel="$SLACK_CHANNELS" \ - --form-string text="$text" \ - --fail --silent --show-error \ - --http1.1 \ - --output /dev/null -fi diff --git a/workflow/snakemake_rules/slack_notifications.smk b/workflow/snakemake_rules/slack_notifications.smk index cd9c4463..b3f50970 100644 --- a/workflow/snakemake_rules/slack_notifications.smk +++ b/workflow/snakemake_rules/slack_notifications.smk @@ -46,7 +46,7 @@ rule notify_gisaid: output: touch("data/gisaid/notify.done") run: - shell("./bin/notify-slack --upload flagged-annotations < {input.flagged_annotations}") + shell("./vendored/notify-slack --upload flagged-annotations < {input.flagged_annotations}") shell("./bin/notify-on-additional-info-change {input.additional_info} {params.s3_bucket}/additional_info.tsv.gz") shell("./bin/notify-on-flagged-metadata-change {input.flagged_metadata} {params.s3_bucket}/flagged_metadata.txt.gz") @@ -59,7 +59,7 @@ rule notify_genbank: output: touch("data/genbank/notify.done") run: - shell("./bin/notify-slack --upload flagged-annotations < {input.flagged_annotations}") + shell("./vendored/notify-slack --upload flagged-annotations < {input.flagged_annotations}") # TODO - which rule produces data/genbank/problem_data.tsv? 
(was not explicit in `ingest-genbank` bash script) shell("./bin/notify-on-problem-data data/genbank/problem_data.tsv") shell("./bin/notify-on-duplicate-biosample-change {input.duplicate_biosample} {params.s3_bucket}/duplicate_biosample.txt.gz") From 6ce947ef304df900b58e28db60887b98e1ebbfee Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Mon, 31 Jul 2023 15:43:52 -0700 Subject: [PATCH 6/6] Remove unused bin variable --- bin/notify-on-additional-info-change | 1 - bin/notify-on-duplicate-biosample-change | 1 - bin/notify-on-flagged-metadata-change | 1 - bin/notify-on-problem-data | 1 - 4 files changed, 4 deletions(-) diff --git a/bin/notify-on-additional-info-change b/bin/notify-on-additional-info-change index c6554c65..332db612 100755 --- a/bin/notify-on-additional-info-change +++ b/bin/notify-on-additional-info-change @@ -4,7 +4,6 @@ set -euo pipefail : "${SLACK_TOKEN:?The SLACK_TOKEN environment variable is required.}" : "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" -bin="$(dirname "$0")" vendored="$(dirname "$0")"/../vendored src="${1:?A source additional info TSV file is required as the first argument.}" diff --git a/bin/notify-on-duplicate-biosample-change b/bin/notify-on-duplicate-biosample-change index 52a372b8..33e3be1a 100755 --- a/bin/notify-on-duplicate-biosample-change +++ b/bin/notify-on-duplicate-biosample-change @@ -4,7 +4,6 @@ set -euo pipefail : "${SLACK_TOKEN:?The SLACK_TOKEN environment variable is required.}" : "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" -bin="$(dirname "$0")" vendored="$(dirname "$0")"/../vendored src="${1:?A source duplicate BioSample txt file is required as the first argument.}" diff --git a/bin/notify-on-flagged-metadata-change b/bin/notify-on-flagged-metadata-change index b10f2f22..834d684c 100755 --- a/bin/notify-on-flagged-metadata-change +++ b/bin/notify-on-flagged-metadata-change @@ -4,7 +4,6 @@ set -euo pipefail : "${SLACK_TOKEN:?The SLACK_TOKEN environment variable is required.}" : "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" -bin="$(dirname "$0")" vendored="$(dirname "$0")"/../vendored src="${1:?A source flagged metadata txt file is required as the first argument.}" diff --git a/bin/notify-on-problem-data b/bin/notify-on-problem-data index e172330d..dc9d1ef9 100755 --- a/bin/notify-on-problem-data +++ b/bin/notify-on-problem-data @@ -4,7 +4,6 @@ set -euo pipefail : "${SLACK_TOKEN:?The SLACK_TOKEN environment variable is required.}" : "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" -bin="$(dirname "$0")" vendored="$(dirname "$0")"/../vendored problem_data="${1:?A problem data TSV file is required as the first argument.}"