From 9d7d3cca8292a66591cbd884711ce753fd2f3d7c Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Fri, 11 Aug 2023 08:23:38 -0400 Subject: [PATCH 1/6] git subrepo clone (merge) https://github.com/nextstrain/ingest vendored subrepo: subdir: "vendored" merged: "1eb8b30" upstream: origin: "https://github.com/nextstrain/ingest" branch: "main" commit: "1eb8b30" git-subrepo: version: "0.4.6" origin: "https://github.com/ingydotnet/git-subrepo" commit: "110b9eb" --- vendored/.github/pull_request_template.md | 16 ++ vendored/.github/workflows/ci.yaml | 13 ++ vendored/.gitrepo | 12 ++ vendored/.shellcheckrc | 6 + vendored/README.md | 91 +++++++++ vendored/apply-geolocation-rules | 234 ++++++++++++++++++++++ vendored/cloudfront-invalidate | 42 ++++ vendored/download-from-s3 | 48 +++++ vendored/merge-user-metadata | 55 +++++ vendored/notify-on-diff | 35 ++++ vendored/notify-on-job-fail | 23 +++ vendored/notify-on-job-start | 27 +++ vendored/notify-on-record-change | 53 +++++ vendored/notify-slack | 56 ++++++ vendored/s3-object-exists | 8 + vendored/sha256sum | 15 ++ vendored/transform-authors | 66 ++++++ vendored/transform-field-names | 48 +++++ vendored/transform-genbank-location | 43 ++++ vendored/trigger | 56 ++++++ vendored/trigger-on-new-data | 32 +++ vendored/upload-to-s3 | 78 ++++++++ 22 files changed, 1057 insertions(+) create mode 100644 vendored/.github/pull_request_template.md create mode 100644 vendored/.github/workflows/ci.yaml create mode 100644 vendored/.gitrepo create mode 100644 vendored/.shellcheckrc create mode 100644 vendored/README.md create mode 100755 vendored/apply-geolocation-rules create mode 100755 vendored/cloudfront-invalidate create mode 100755 vendored/download-from-s3 create mode 100755 vendored/merge-user-metadata create mode 100755 vendored/notify-on-diff create mode 100755 vendored/notify-on-job-fail create mode 100755 vendored/notify-on-job-start create mode 100755 vendored/notify-on-record-change create mode 100755 vendored/notify-slack create mode 100755 vendored/s3-object-exists create mode 100755 vendored/sha256sum create mode 100755 vendored/transform-authors create mode 100755 vendored/transform-field-names create mode 100755 vendored/transform-genbank-location create mode 100755 vendored/trigger create mode 100755 vendored/trigger-on-new-data create mode 100755 vendored/upload-to-s3 diff --git a/vendored/.github/pull_request_template.md b/vendored/.github/pull_request_template.md new file mode 100644 index 00000000..ed4a5b27 --- /dev/null +++ b/vendored/.github/pull_request_template.md @@ -0,0 +1,16 @@ +### Description of proposed changes + + + +### Related issue(s) + + + +### Checklist + + + +- [ ] Checks pass +- [ ] If adding a script, add an entry for it in the README. + + diff --git a/vendored/.github/workflows/ci.yaml b/vendored/.github/workflows/ci.yaml new file mode 100644 index 00000000..dcb3b898 --- /dev/null +++ b/vendored/.github/workflows/ci.yaml @@ -0,0 +1,13 @@ +name: CI + +on: + - push + - pull_request + - workflow_dispatch + +jobs: + shellcheck: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: nextstrain/.github/actions/shellcheck@master diff --git a/vendored/.gitrepo b/vendored/.gitrepo new file mode 100644 index 00000000..3fe78c05 --- /dev/null +++ b/vendored/.gitrepo @@ -0,0 +1,12 @@ +; DO NOT EDIT (unless you know what you are doing) +; +; This subdirectory is a git "subrepo", and this file is maintained by the +; git-subrepo command. 
See https://github.com/ingydotnet/git-subrepo#readme +; +[subrepo] + remote = https://github.com/nextstrain/ingest + branch = main + commit = 1eb8b30428d5f66adac201f0a246a7ab4bdc9792 + parent = 6fd5a9b1d87e59fab35173dbedf376632154943b + method = merge + cmdver = 0.4.6 diff --git a/vendored/.shellcheckrc b/vendored/.shellcheckrc new file mode 100644 index 00000000..ebed438c --- /dev/null +++ b/vendored/.shellcheckrc @@ -0,0 +1,6 @@ +# Use of this file requires Shellcheck v0.7.0 or newer. +# +# SC2064 - We intentionally want variables to expand immediately within traps +# so the trap can not fail due to variable interpolation later. +# +disable=SC2064 diff --git a/vendored/README.md b/vendored/README.md new file mode 100644 index 00000000..0311a551 --- /dev/null +++ b/vendored/README.md @@ -0,0 +1,91 @@ +# ingest + +Shared internal tooling for pathogen data ingest. Used by our individual +pathogen repos which produce Nextstrain builds. Expected to be vendored by +each pathogen repo using `git subtree`. + +Some tools may only live here temporarily before finding a permanent home in +`augur curate` or Nextstrain CLI. Others may happily live out their days here. + +## Vendoring + +Nextstrain maintained pathogen repos will use [`git subrepo`](https://github.com/ingydotnet/git-subrepo) to vendor ingest scripts. +(See discussion on this decision in https://github.com/nextstrain/ingest/issues/3) + +If you don't already have `git subrepo` installed, follow the [git subrepo installation instructions](https://github.com/ingydotnet/git-subrepo#installation). +Then add the latest ingest scripts to the pathogen repo by running: + +``` +git subrepo clone https://github.com/nextstrain/ingest ingest/vendored +``` + +Any future updates of ingest scripts can be pulled in with: + +``` +git subrepo pull ingest/vendored +``` + +## History + +Much of this tooling originated in +[ncov-ingest](https://github.com/nextstrain/ncov-ingest) and was passaged thru +[monkeypox's ingest/](https://github.com/nextstrain/monkeypox/tree/@/ingest/). +It subsequently proliferated from [monkeypox][] to other pathogen repos +([rsv][], [zika][], [dengue][], [hepatitisB][], [forecasts-ncov][]) primarily +thru copying. To [counter that +proliferation](https://bedfordlab.slack.com/archives/C7SDVPBLZ/p1688577879947079), +this repo was made. + +[monkeypox]: https://github.com/nextstrain/monkeypox +[rsv]: https://github.com/nextstrain/rsv +[zika]: https://github.com/nextstrain/zika/pull/24 +[dengue]: https://github.com/nextstrain/dengue/pull/10 +[hepatitisB]: https://github.com/nextstrain/hepatitisB +[forecasts-ncov]: https://github.com/nextstrain/forecasts-ncov + +## Elsewhere + +The creation of this repo, in both the abstract and concrete, and the general +approach to "ingest" has been discussed in various internal places, including: + +- https://github.com/nextstrain/private/issues/59 +- @joverlee521's [workflows document](https://docs.google.com/document/d/1rLWPvEuj0Ayc8MR0O1lfRJZfj9av53xU38f20g8nU_E/edit#heading=h.4g0d3mjvb89i) +- [5 July 2023 Slack thread](https://bedfordlab.slack.com/archives/C7SDVPBLZ/p1688577879947079) +- [6 July 2023 team meeting](https://docs.google.com/document/d/1FPfx-ON5RdqL2wyvODhkrCcjgOVX3nlXgBwCPhIEsco/edit) +- _…many others_ + +## Scripts + +Scripts for supporting ingest workflow automation that don’t really belong in any of our existing tools. 
+ +- [notify-on-diff](notify-on-diff) - Send Slack message with diff of a local file and an S3 object +- [notify-on-job-fail](notify-on-job-fail) - Send Slack message with details about failed workflow job on GitHub Actions and/or AWS Batch +- [notify-on-job-start](notify-on-job-start) - Send Slack message with details about workflow job on GitHub Actions and/or AWS Batch +- [notify-on-record-change](notify-on-recod-change) - Send Slack message with details about line count changes for a file compared to an S3 object's metadata `recordcount`. + If the S3 object's metadata does not have `recordcount`, then will attempt to download S3 object to count lines locally, which only supports `xz` compressed S3 objects. +- [notify-slack](notify-slack) - Send message or file to Slack +- [s3-object-exists](s3-object-exists) - Used to prevent 404 errors during S3 file comparisons in the notify-* scripts +- [trigger](trigger) - Triggers downstream GitHub Actions via the GitHub API using repository_dispatch events. +- [trigger-on-new-data](trigger-on-new-data) - Triggers downstream GitHub Actions if the provided `upload-to-s3` outputs do not contain the `identical_file_message` + A hacky way to ensure that we only trigger downstream phylogenetic builds if the S3 objects have been updated. + +Potential Nextstrain CLI scripts + +- [sha256sum](sha256sum) - Used to check if files are identical in upload-to-s3 and download-from-s3 scripts. +- [cloudfront-invalidate](cloudfront-invalidate) - CloudFront invalidation is already supported in the [nextstrain remote command for S3 files](https://github.com/nextstrain/cli/blob/a5dda9c0579ece7acbd8e2c32a4bbe95df7c0bce/nextstrain/cli/remote/s3.py#L104). + This exists as a separate script to support CloudFront invalidation when using the upload-to-s3 script. +- [upload-to-s3](upload-to-s3) - Upload file to AWS S3 bucket with compression based on file extension in S3 URL. + Skips upload if the local file's hash is identical to the S3 object's metadata `sha256sum`. + Adds the following user defined metadata to uploaded S3 object: + - `sha256sum` - hash of the file generated by [sha256sum](sha256sum) + - `recordcount` - the line count of the file +- [download-from-s3](download-from-s3) - Download file from AWS S3 bucket with decompression based on file extension in S3 URL. + Skips download if the local file already exists and has a hash identical to the S3 object's metadata `sha256sum`. + +Potential augur curate scripts + +- [apply-geolocation-rules](apply-geolocation-rules) - Applies user curated geolocation rules to NDJSON records +- [merge-user-metadata](merge-user-metadata) - Merges user annotations with NDJSON records +- [transform-authors](transform-authors) - Abbreviates full author lists to ' et al.' +- [transform-field-names](transform-field-names) - Rename fields of NDJSON records +- [transform-genbank-location](transform-genbank-location) - Parses `location` field with the expected pattern `"[:][, ]"` based on [GenBank's country field](https://www.ncbi.nlm.nih.gov/genbank/collab/country/) diff --git a/vendored/apply-geolocation-rules b/vendored/apply-geolocation-rules new file mode 100755 index 00000000..776cf16a --- /dev/null +++ b/vendored/apply-geolocation-rules @@ -0,0 +1,234 @@ +#!/usr/bin/env python3 +""" +Applies user curated geolocation rules to the geolocation fields in the NDJSON +records from stdin. The modified records are output to stdout. This does not do +any additional transformations on top of the user curations. 
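+
+Rules are read from a TSV file where each line pairs a raw geolocation with its
+annotated replacement, both formatted as 'region/country/division/location' and
+separated by a tab; '*' acts as a wildcard field and '#' starts a comment.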
+""" +import argparse +import json +from collections import defaultdict +from sys import exit, stderr, stdin, stdout + + +class CyclicGeolocationRulesError(Exception): + pass + + +def load_geolocation_rules(geolocation_rules_file): + """ + Loads the geolocation rules from the provided *geolocation_rules_file*. + Returns the rules as a dict: + { + regions: { + countries: { + divisions: { + locations: corrected_geolocations_tuple + } + } + } + } + """ + geolocation_rules = defaultdict(lambda: defaultdict(lambda: defaultdict(dict))) + with open(geolocation_rules_file, 'r') as rules_fh: + for line in rules_fh: + # ignore comments + if line.strip()=="" or line.lstrip()[0] == '#': + continue + + row = line.strip('\n').split('\t') + # Skip lines that cannot be split into raw and annotated geolocations + if len(row) != 2: + print( + f"WARNING: Could not decode geolocation rule {line!r}.", + "Please make sure rules are formatted as", + "'region/country/division/locationregion/country/division/location'.", + file=stderr) + continue + + # remove trailing comments + row[-1] = row[-1].partition('#')[0].rstrip() + raw , annot = tuple( row[0].split('/') ) , tuple( row[1].split('/') ) + + # Skip lines where raw or annotated geolocations cannot be split into 4 fields + if len(raw) != 4: + print( + f"WARNING: Could not decode the raw geolocation {row[0]!r}.", + "Please make sure it is formatted as 'region/country/division/location'.", + file=stderr + ) + continue + + if len(annot) != 4: + print( + f"WARNING: Could not decode the annotated geolocation {row[1]!r}.", + "Please make sure it is formatted as 'region/country/division/location'.", + file=stderr + ) + continue + + + geolocation_rules[raw[0]][raw[1]][raw[2]][raw[3]] = annot + + return geolocation_rules + + +def get_annotated_geolocation(geolocation_rules, raw_geolocation, rule_traversal = None): + """ + Gets the annotated geolocation for the *raw_geolocation* in the provided + *geolocation_rules*. + + Recursively traverses the *geolocation_rules* until we get the annotated + geolocation, which must be a Tuple. Returns `None` if there are no + applicable rules for the provided *raw_geolocation*. + + Rules are applied in the order of region, country, division, location. + First checks the provided raw values for geolocation fields, then if there + are not matches, tries to use general rules marked with '*'. + """ + # Always instantiate the rule traversal as an empty list if not provided, + # e.g. 
the first call of this recursive function + if rule_traversal is None: + rule_traversal = [] + + current_rules = geolocation_rules + # Traverse the geolocation rules based using the rule_traversal values + for field_value in rule_traversal: + current_rules = current_rules.get(field_value) + # If we hit `None`, then we know there are no matching rules, so stop the rule traversal + if current_rules is None: + break + + # We've found the tuple of the annotated geolocation + if isinstance(current_rules, tuple): + return current_rules + + # We've reach the next level of geolocation rules, + # so try to traverse the rules with the next target in raw_geolocation + if isinstance(current_rules, dict): + next_traversal_target = raw_geolocation[len(rule_traversal)] + rule_traversal.append(next_traversal_target) + return get_annotated_geolocation(geolocation_rules, raw_geolocation, rule_traversal) + + # We did not find any matching rule for the last traversal target + if current_rules is None: + # If we've used all general rules and we still haven't found a match, + # then there are no applicable rules for this geolocation + if all(value == '*' for value in rule_traversal): + return None + + # If we failed to find matching rule with a general rule as the last + # traversal target, then delete all trailing '*'s to reset rule_traversal + # to end with the last index that is currently NOT a '*' + # [A, *, B, *] => [A, *, B] + # [A, B, *, *] => [A, B] + # [A, *, *, *] => [A] + if rule_traversal[-1] == '*': + # Find the index of the first of the consecutive '*' from the + # end of the rule_traversal + # [A, *, B, *] => first_consecutive_general_rule_index = 3 + # [A, B, *, *] => first_consecutive_general_rule_index = 2 + # [A, *, *, *] => first_consecutive_general_rule_index = 1 + for index, field_value in reversed(list(enumerate(rule_traversal))): + if field_value == '*': + first_consecutive_general_rule_index = index + else: + break + + rule_traversal = rule_traversal[:first_consecutive_general_rule_index] + + # Set the final value to '*' in hopes that by moving to a general rule, + # we can find a matching rule. + rule_traversal[-1] = '*' + + return get_annotated_geolocation(geolocation_rules, raw_geolocation, rule_traversal) + + +def transform_geolocations(geolocation_rules, geolocation): + """ + Transform the provided *geolocation* by looking it up in the provided + *geolocation_rules*. + + This will use all rules that apply to the geolocation and rules will + be applied in the order of region, country, division, location. + + Returns the original geolocation if no geolocation rules apply. + + Raises a `CyclicGeolocationRulesError` if more than 1000 rules have + been applied to the raw geolocation. + """ + transformed_values = geolocation + rules_applied = 0 + continue_to_apply = True + + while continue_to_apply: + annotated_values = get_annotated_geolocation(geolocation_rules, transformed_values) + + # Stop applying rules if no annotated values were found + if annotated_values is None: + continue_to_apply = False + else: + rules_applied += 1 + + if rules_applied > 1000: + raise CyclicGeolocationRulesError( + "ERROR: More than 1000 geolocation rules applied on the same entry {geolocation!r}." 
+ ) + + # Create a new list of values for comparison to previous values + new_values = list(transformed_values) + for index, value in enumerate(annotated_values): + # Keep original value if annotated value is '*' + if value != '*': + new_values[index] = value + + # Stop applying rules if this rule did not change the values, + # since this means we've reach rules with '*' that no longer change values + if new_values == transformed_values: + continue_to_apply = False + + transformed_values = new_values + + return transformed_values + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + parser.add_argument("--region-field", default="region", + help="Field that contains regions in NDJSON records.") + parser.add_argument("--country-field", default="country", + help="Field that contains countries in NDJSON records.") + parser.add_argument("--division-field", default="division", + help="Field that contains divisions in NDJSON records.") + parser.add_argument("--location-field", default="location", + help="Field that contains location in NDJSON records.") + parser.add_argument("--geolocation-rules", metavar="TSV", required=True, + help="TSV file of geolocation rules with the format: " + + "'' where the raw and annotated geolocations " + + "are formatted as '///'. " + + "If creating a general rule, then the raw field value can be substituted with '*'." + + "Lines starting with '#' will be ignored as comments." + + "Trailing '#' will be ignored as comments.") + + args = parser.parse_args() + + location_fields = [args.region_field, args.country_field, args.division_field, args.location_field] + + geolocation_rules = load_geolocation_rules(args.geolocation_rules) + + for record in stdin: + record = json.loads(record) + + try: + annotated_values = transform_geolocations(geolocation_rules, [record.get(field, '') for field in location_fields]) + except CyclicGeolocationRulesError as e: + print(e, file=stderr) + exit(1) + + for index, field in enumerate(location_fields): + record[field] = annotated_values[index] + + json.dump(record, stdout, allow_nan=False, indent=None, separators=',:') + print() diff --git a/vendored/cloudfront-invalidate b/vendored/cloudfront-invalidate new file mode 100755 index 00000000..dec48529 --- /dev/null +++ b/vendored/cloudfront-invalidate @@ -0,0 +1,42 @@ +#!/bin/bash +# Originally from @tsibley's gist: https://gist.github.com/tsibley/a66262d341dedbea39b02f27e2837ea8 +set -euo pipefail + +main() { + local domain="$1" + shift + local paths=("$@") + local distribution invalidation + + echo "-> Finding CloudFront distribution" + distribution=$( + aws cloudfront list-distributions \ + --query "DistributionList.Items[?contains(Aliases.Items, \`$domain\`)] | [0].Id" \ + --output text + ) + + if [[ -z $distribution || $distribution == None ]]; then + exec >&2 + echo "Unable to find CloudFront distribution id for $domain" + echo + echo "Are your AWS CLI credentials for the right account?" + exit 1 + fi + + echo "-> Creating CloudFront invalidation for distribution $distribution" + invalidation=$( + aws cloudfront create-invalidation \ + --distribution-id "$distribution" \ + --paths "${paths[@]}" \ + --query Invalidation.Id \ + --output text + ) + + echo "-> Waiting for CloudFront invalidation $invalidation to complete" + echo " Ctrl-C to stop waiting." 
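+    # Blocks until AWS reports the invalidation as completed. Interrupting the
+    # wait does not cancel the invalidation; it continues server-side.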
+ aws cloudfront wait invalidation-completed \ + --distribution-id "$distribution" \ + --id "$invalidation" +} + +main "$@" diff --git a/vendored/download-from-s3 b/vendored/download-from-s3 new file mode 100755 index 00000000..44f7ff34 --- /dev/null +++ b/vendored/download-from-s3 @@ -0,0 +1,48 @@ +#!/bin/bash +set -euo pipefail + +bin="$(dirname "$0")" + +main() { + local src="${1:?A source s3:// URL is required as the first argument.}" + local dst="${2:?A destination file path is required as the second argument.}" + # How many lines to subsample to. 0 means no subsampling. Optional. + # It is not advised to use this for actual subsampling! This is intended to be + # used for debugging workflows with large datasets such as ncov-ingest as + # described in https://github.com/nextstrain/ncov-ingest/pull/367 + + # Uses `tsv-sample` to subsample, so it will not work as expected with files + # that have a single record split across multiple lines (i.e. FASTA sequences) + local n="${3:-0}" + + local s3path="${src#s3://}" + local bucket="${s3path%%/*}" + local key="${s3path#*/}" + + local src_hash dst_hash no_hash=0000000000000000000000000000000000000000000000000000000000000000 + dst_hash="$("$bin/sha256sum" < "$dst" || true)" + src_hash="$(aws s3api head-object --bucket "$bucket" --key "$key" --query Metadata.sha256sum --output text 2>/dev/null || echo "$no_hash")" + + echo "[ INFO] Downloading $src → $dst" + if [[ $src_hash != "$dst_hash" ]]; then + aws s3 cp --no-progress "$src" - | + if [[ "$src" == *.gz ]]; then + gunzip -cfq + elif [[ "$src" == *.xz ]]; then + xz -T0 -dcq + elif [[ "$src" == *.zst ]]; then + zstd -T0 -dcq + else + cat + fi | + if [[ "$n" -gt 0 ]]; then + tsv-sample -H -i -n "$n" + else + cat + fi >"$dst" + else + echo "[ INFO] Files are identical, skipping download" + fi +} + +main "$@" diff --git a/vendored/merge-user-metadata b/vendored/merge-user-metadata new file mode 100755 index 00000000..341c2dfa --- /dev/null +++ b/vendored/merge-user-metadata @@ -0,0 +1,55 @@ +#!/usr/bin/env python3 +""" +Merges user curated annotations with the NDJSON records from stdin, with the user +curations overwriting the existing fields. The modified records are output +to stdout. This does not do any additional transformations on top of the user +curations. +""" +import argparse +import csv +import json +from collections import defaultdict +from sys import exit, stdin, stderr, stdout + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + parser.add_argument("--annotations", metavar="TSV", required=True, + help="Manually curated annotations TSV file. " + + "The TSV should not have a header and should have exactly three columns: " + + "id to match existing metadata, field name, and field value. " + + "If there are multiple annotations for the same id and field, then the last value is used. " + + "Lines starting with '#' are treated as comments. 
" + + "Any '#' after the field value are treated as comments.") + parser.add_argument("--id-field", default="accession", + help="The ID field in the metadata to use to merge with the annotations.") + + args = parser.parse_args() + + annotations = defaultdict(dict) + with open(args.annotations, 'r') as annotations_fh: + csv_reader = csv.reader(annotations_fh, delimiter='\t') + for row in csv_reader: + if not row or row[0].lstrip()[0] == '#': + continue + elif len(row) != 3: + print("WARNING: Could not decode annotation line " + "\t".join(row), file=stderr) + continue + id, field, value = row + annotations[id][field] = value.partition('#')[0].rstrip() + + for record in stdin: + record = json.loads(record) + + record_id = record.get(args.id_field) + if record_id is None: + print(f"ERROR: ID field {args.id_field!r} does not exist in record", file=stderr) + exit(1) + + record.update(annotations.get(record_id, {})) + + json.dump(record, stdout, allow_nan=False, indent=None, separators=',:') + print() diff --git a/vendored/notify-on-diff b/vendored/notify-on-diff new file mode 100755 index 00000000..c304d6b5 --- /dev/null +++ b/vendored/notify-on-diff @@ -0,0 +1,35 @@ +#!/bin/bash + +set -euo pipefail + +: "${SLACK_TOKEN:?The SLACK_TOKEN environment variable is required.}" +: "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" + +bin="$(dirname "$0")" + +src="${1:?A source file is required as the first argument.}" +dst="${2:?A destination s3:// URL is required as the second argument.}" + +dst_local="$(mktemp -t s3-file-XXXXXX)" +diff="$(mktemp -t diff-XXXXXX)" + +trap "rm -f '$dst_local' '$diff'" EXIT + +# if the file is not already present, just exit +"$bin"/s3-object-exists "$dst" || exit 0 + +"$bin"/download-from-s3 "$dst" "$dst_local" + +# diff's exit code is 0 for no differences, 1 for differences found, and >1 for errors +diff_exit_code=0 +diff "$dst_local" "$src" > "$diff" || diff_exit_code=$? + +if [[ "$diff_exit_code" -eq 1 ]]; then + echo "Notifying Slack about diff." + "$bin"/notify-slack --upload "$src.diff" < "$diff" +elif [[ "$diff_exit_code" -gt 1 ]]; then + echo "Notifying Slack about diff failure" + "$bin"/notify-slack "Diff failed for $src" +else + echo "No change in $src." +fi diff --git a/vendored/notify-on-job-fail b/vendored/notify-on-job-fail new file mode 100755 index 00000000..02cb6bad --- /dev/null +++ b/vendored/notify-on-job-fail @@ -0,0 +1,23 @@ +#!/bin/bash +set -euo pipefail + +: "${SLACK_TOKEN:?The SLACK_TOKEN environment variable is required.}" +: "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" + +: "${AWS_BATCH_JOB_ID:=}" +: "${GITHUB_RUN_ID:=}" + +bin="$(dirname "$0")" +job_name="${1:?A job name is required as the first argument}" +github_repo="${2:?A GitHub repository with owner and repository name is required as the second argument}" + +echo "Notifying Slack about failed ${job_name} job." +message="❌ ${job_name} job has FAILED 😞 " + +if [[ -n "${AWS_BATCH_JOB_ID}" ]]; then + message+="See AWS Batch job \`${AWS_BATCH_JOB_ID}\` () for error details. " +elif [[ -n "${GITHUB_RUN_ID}" ]]; then + message+="See GitHub Action for error details. 
" +fi + +"$bin"/notify-slack "$message" diff --git a/vendored/notify-on-job-start b/vendored/notify-on-job-start new file mode 100755 index 00000000..3e44bb09 --- /dev/null +++ b/vendored/notify-on-job-start @@ -0,0 +1,27 @@ +#!/bin/bash +set -euo pipefail + +: "${SLACK_TOKEN:?The SLACK_TOKEN environment variable is required.}" +: "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" + +: "${AWS_BATCH_JOB_ID:=}" +: "${GITHUB_RUN_ID:=}" + +bin="$(dirname "$0")" +job_name="${1:?A job name is required as the first argument}" +github_repo="${2:?A GitHub repository with owner and repository name is required as the second argument}" +build_dir="${3:-ingest}" + +echo "Notifying Slack about started ${job_name} job." +message="${job_name} job has started." + +if [[ -n "${GITHUB_RUN_ID}" ]]; then + message+=" The job was submitted by GitHub Action ." +fi + +if [[ -n "${AWS_BATCH_JOB_ID}" ]]; then + message+=" The job was launched as AWS Batch job \`${AWS_BATCH_JOB_ID}\` ()." + message+=" Follow along in your local clone of ${github_repo} with: "'```'"nextstrain build --aws-batch --no-download --attach ${AWS_BATCH_JOB_ID} ${build_dir}"'```' +fi + +"$bin"/notify-slack "$message" diff --git a/vendored/notify-on-record-change b/vendored/notify-on-record-change new file mode 100755 index 00000000..c0bf8f7e --- /dev/null +++ b/vendored/notify-on-record-change @@ -0,0 +1,53 @@ +#!/bin/bash +set -euo pipefail + +: "${SLACK_TOKEN:?The SLACK_TOKEN environment variable is required.}" +: "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" + +bin="$(dirname "$0")" + +src="${1:?A source ndjson file is required as the first argument.}" +dst="${2:?A destination ndjson s3:// URL is required as the second argument.}" +source_name=${3:?A record source name is required as the third argument.} + +# if the file is not already present, just exit +"$bin"/s3-object-exists "$dst" || exit 0 + +s3path="${dst#s3://}" +bucket="${s3path%%/*}" +key="${s3path#*/}" + +src_record_count="$(wc -l < "$src")" + +# Try getting record count from S3 object metadata +dst_record_count="$(aws s3api head-object --bucket "$bucket" --key "$key" --query "Metadata.recordcount || ''" --output text 2>/dev/null || true)" +if [[ -z "$dst_record_count" ]]; then + # This object doesn't have the record count stored as metadata + # We have to download it and count the lines locally + dst_record_count="$(wc -l < <(aws s3 cp --no-progress "$dst" - | xz -T0 -dcfq))" +fi + +added_records="$(( src_record_count - dst_record_count ))" + +printf "%'4d %s\n" "$src_record_count" "$src" +printf "%'4d %s\n" "$dst_record_count" "$dst" +printf "%'4d added records\n" "$added_records" + +slack_message="" + +if [[ $added_records -gt 0 ]]; then + echo "Notifying Slack about added records (n=$added_records)" + slack_message="📈 New records (n=$added_records) found on $source_name." + +elif [[ $added_records -lt 0 ]]; then + echo "Notifying Slack about fewer records (n=$added_records)" + slack_message="📉 Fewer records (n=$added_records) found on $source_name." + +else + echo "Notifying Slack about same number of records" + slack_message="⛔ No new records found on $source_name." 
+fi + +slack_message+=" (Total record count: $src_record_count)" + +"$bin"/notify-slack "$slack_message" diff --git a/vendored/notify-slack b/vendored/notify-slack new file mode 100755 index 00000000..db98bfb8 --- /dev/null +++ b/vendored/notify-slack @@ -0,0 +1,56 @@ +#!/bin/bash +set -euo pipefail + +: "${SLACK_TOKEN:?The SLACK_TOKEN environment variable is required.}" +: "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" + +upload=0 +output=/dev/null +thread_ts="" +broadcast=0 +args=() + +for arg; do + case "$arg" in + --upload) + upload=1;; + --output=*) + output="${arg#*=}";; + --thread-ts=*) + thread_ts="${arg#*=}";; + --broadcast) + broadcast=1;; + *) + args+=("$arg");; + esac +done + +set -- "${args[@]}" + +text="${1:?Some message text is required.}" + +if [[ "$upload" == 1 ]]; then + echo "Uploading data to Slack with the message: $text" + curl https://slack.com/api/files.upload \ + --header "Authorization: Bearer $SLACK_TOKEN" \ + --form-string channels="$SLACK_CHANNELS" \ + --form-string title="$text" \ + --form-string filename="$text" \ + --form-string thread_ts="$thread_ts" \ + --form file=@/dev/stdin \ + --form filetype=text \ + --fail --silent --show-error \ + --http1.1 \ + --output "$output" +else + echo "Posting Slack message: $text" + curl https://slack.com/api/chat.postMessage \ + --header "Authorization: Bearer $SLACK_TOKEN" \ + --form-string channel="$SLACK_CHANNELS" \ + --form-string text="$text" \ + --form-string thread_ts="$thread_ts" \ + --form-string reply_broadcast="$broadcast" \ + --fail --silent --show-error \ + --http1.1 \ + --output "$output" +fi diff --git a/vendored/s3-object-exists b/vendored/s3-object-exists new file mode 100755 index 00000000..faac4219 --- /dev/null +++ b/vendored/s3-object-exists @@ -0,0 +1,8 @@ +#!/bin/bash +set -euo pipefail + +url="${1#s3://}" +bucket="${url%%/*}" +key="${url#*/}" + +aws s3api head-object --bucket "$bucket" --key "$key" &>/dev/null diff --git a/vendored/sha256sum b/vendored/sha256sum new file mode 100755 index 00000000..32d7ef82 --- /dev/null +++ b/vendored/sha256sum @@ -0,0 +1,15 @@ +#!/usr/bin/env python3 +""" +Portable sha256sum utility. +""" +from hashlib import sha256 +from sys import stdin + +chunk_size = 5 * 1024**2 # 5 MiB + +h = sha256() + +for chunk in iter(lambda: stdin.buffer.read(chunk_size), b""): + h.update(chunk) + +print(h.hexdigest()) diff --git a/vendored/transform-authors b/vendored/transform-authors new file mode 100755 index 00000000..0bade20e --- /dev/null +++ b/vendored/transform-authors @@ -0,0 +1,66 @@ +#!/usr/bin/env python3 +""" +Abbreviates a full list of authors to be ' et al.' of the NDJSON +record from stdin and outputs modified records to stdout. + +Note: This is a "best effort" approach and can potentially mangle the author name. +""" +import argparse +import json +import re +from sys import stderr, stdin, stdout + + +def parse_authors(record: dict, authors_field: str, default_value: str, + index: int, abbr_authors_field: str = None) -> dict: + # Strip and normalize whitespace + new_authors = re.sub(r'\s+', ' ', record[authors_field]) + + if new_authors == "": + new_authors = default_value + else: + # Split authors list on comma/semicolon + # OR "and"/"&" with at least one space before and after + new_authors = re.split(r'(?:\s*[,,;;]\s*|\s+(?:and|&)\s+)', new_authors)[0] + + # if it does not already end with " et al.", add it + if not new_authors.strip('. 
').endswith(" et al"): + new_authors += ' et al' + + if abbr_authors_field: + if record.get(abbr_authors_field): + print( + f"WARNING: the {abbr_authors_field!r} field already exists", + f"in record {index} and will be overwritten!", + file=stderr + ) + + record[abbr_authors_field] = new_authors + else: + record[authors_field] = new_authors + + return record + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + parser.add_argument("--authors-field", default="authors", + help="The field containing list of authors.") + parser.add_argument("--default-value", default="?", + help="Default value to use if authors list is empty.") + parser.add_argument("--abbr-authors-field", + help="The field for the generated abbreviated authors. " + + "If not provided, the original authors field will be modified.") + + args = parser.parse_args() + + for index, record in enumerate(stdin): + record = json.loads(record) + + parse_authors(record, args.authors_field, args.default_value, index, args.abbr_authors_field) + + json.dump(record, stdout, allow_nan=False, indent=None, separators=',:') + print() diff --git a/vendored/transform-field-names b/vendored/transform-field-names new file mode 100755 index 00000000..fde223fc --- /dev/null +++ b/vendored/transform-field-names @@ -0,0 +1,48 @@ +#!/usr/bin/env python3 +""" +Renames fields of the NDJSON record from stdin and outputs modified records +to stdout. +""" +import argparse +import json +from sys import stderr, stdin, stdout + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + parser.add_argument("--field-map", nargs="+", + help="Fields names in the NDJSON record mapped to new field names, " + + "formatted as '{old_field_name}={new_field_name}'. " + + "If the old field does not exist in record, the new field will be added with an empty string value." + + "If the new field already exists in record, then the renaming of the old field will be skipped.") + parser.add_argument("--force", action="store_true", + help="Force renaming of old field even if the new field already exists. " + + "Please keep in mind this will overwrite the value of the new field.") + + args = parser.parse_args() + + field_map = {} + for field in args.field_map: + old_name, new_name = field.split('=') + field_map[old_name] = new_name + + for record in stdin: + record = json.loads(record) + + for old_field, new_field in field_map.items(): + + if record.get(new_field) and not args.force: + print( + f"WARNING: skipping rename of {old_field} because record", + f"already has a field named {new_field}.", + file=stderr + ) + continue + + record[new_field] = record.pop(old_field, '') + + json.dump(record, stdout, allow_nan=False, indent=None, separators=',:') + print() diff --git a/vendored/transform-genbank-location b/vendored/transform-genbank-location new file mode 100755 index 00000000..70ba56fb --- /dev/null +++ b/vendored/transform-genbank-location @@ -0,0 +1,43 @@ +#!/usr/bin/env python3 +""" +Parses GenBank's 'location' field of the NDJSON record from stdin to 3 separate +fields: 'country', 'division', and 'location'. Checks that a record is from +GenBank by verifying that the 'database' field has a value of "GenBank" or "RefSeq". + +Outputs the modified record to stdout. 
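+
+For example (illustrative value, not from real data), a location field of
+"USA: California, San Diego" would yield country "USA", division "California",
+and location "San Diego".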
+""" +import json +from sys import stdin, stdout + + +def parse_location(record: dict) -> dict: + # Expected pattern for the location field is "[:][, ]" + # See GenBank docs for their "country" field: + # https://www.ncbi.nlm.nih.gov/genbank/collab/country/ + geographic_data = record['location'].split(':') + + country = geographic_data[0] + division = '' + location = '' + + if len(geographic_data) == 2: + division , _ , location = geographic_data[1].partition(',') + + record['country'] = country.strip() + record['division'] = division.strip() + record['location'] = location.strip() + + return record + + +if __name__ == '__main__': + + for record in stdin: + record = json.loads(record) + + database = record.get('database', '') + if database in {'GenBank', 'RefSeq'}: + parse_location(record) + + json.dump(record, stdout, allow_nan=False, indent=None, separators=',:') + print() diff --git a/vendored/trigger b/vendored/trigger new file mode 100755 index 00000000..11d1b635 --- /dev/null +++ b/vendored/trigger @@ -0,0 +1,56 @@ +#!/bin/bash +set -euo pipefail + +: "${PAT_GITHUB_DISPATCH:=}" + +github_repo="${1:?A GitHub repository with owner and repository name is required as the first argument.}" +event_type="${2:?An event type is required as the second argument.}" +shift 2 + +if [[ $# -eq 0 && -z $PAT_GITHUB_DISPATCH ]]; then + cat >&2 <<. +You must specify options to curl for your GitHub credentials. For example, you +can specify your GitHub username, and will be prompted for your password: + + $0 $github_repo $event_type --user + +Be sure to enter a personal access token¹ as your password since GitHub has +discontinued password authentication to the API starting on November 13, 2020². + +You can also store your credentials or a personal access token in a netrc +file³: + + machine api.github.com + login + password + +and then tell curl to use it: + + $0 $github_repo $event_type --netrc + +which will then not require you to type your password every time. + +¹ https://help.github.com/en/github/authenticating-to-github/creating-a-personal-access-token-for-the-command-line +² https://docs.github.com/en/rest/overview/other-authentication-methods#via-username-and-password +³ https://ec.haxx.se/usingcurl/usingcurl-netrc +. + exit 1 +fi + +auth=':' +if [[ -n $PAT_GITHUB_DISPATCH ]]; then + auth="Authorization: Bearer ${PAT_GITHUB_DISPATCH}" +fi + +if curl -fsS "https://api.github.com/repos/${github_repo}/dispatches" \ + -H 'Accept: application/vnd.github.v3+json' \ + -H 'Content-Type: application/json' \ + -H "$auth" \ + -d '{"event_type":"'"$event_type"'"}' \ + "$@" +then + echo "Successfully triggered $event_type" +else + echo "Request failed" >&2 + exit 1 +fi diff --git a/vendored/trigger-on-new-data b/vendored/trigger-on-new-data new file mode 100755 index 00000000..ef71d88d --- /dev/null +++ b/vendored/trigger-on-new-data @@ -0,0 +1,32 @@ +#!/bin/bash +set -euo pipefail + +: "${PAT_GITHUB_DISPATCH:?The PAT_GITHUB_DISPATCH environment variable is required.}" + +bin="$(dirname "$0")" + +github_repo="${1:?A GitHub repository with owner and repository name is required as the first argument.}" +event_type="${2:?An event type is required as the second argument.}" +metadata="${3:?A metadata upload output file is required as the third argument.}" +sequences="${4:?An sequence FASTA upload output file is required as the fourth argument.}" +identical_file_message="${5:-files are identical}" + +new_metadata=$(grep "$identical_file_message" "$metadata" >/dev/null; echo $?) 
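+# Capturing grep's exit status via `echo $?` inside the command substitution lets a
+# non-match (no identical-file message, i.e. new data) be recorded without tripping
+# `set -euo pipefail`.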
+new_sequences=$(grep "$identical_file_message" "$sequences" >/dev/null; echo $?) + +slack_message="" + +# grep exit status 0 for found match, 1 for no match, 2 if an error occurred +if [[ $new_metadata -eq 1 || $new_sequences -eq 1 ]]; then + slack_message="Triggering new builds due to updated metadata and/or sequences" + "$bin"/trigger "$github_repo" "$event_type" +elif [[ $new_metadata -eq 0 && $new_sequences -eq 0 ]]; then + slack_message="Skipping trigger of rebuild: Both metadata TSV and sequences FASTA are identical to S3 files." +else + slack_message="Skipping trigger of rebuild: Unable to determine if data has been updated." +fi + + +if ! "$bin"/notify-slack "$slack_message"; then + echo "Notifying Slack failed, but exiting with success anyway." +fi diff --git a/vendored/upload-to-s3 b/vendored/upload-to-s3 new file mode 100755 index 00000000..31cd49bf --- /dev/null +++ b/vendored/upload-to-s3 @@ -0,0 +1,78 @@ +#!/bin/bash +set -euo pipefail + +bin="$(dirname "$0")" + +main() { + local quiet=0 + + for arg; do + case "$arg" in + --quiet) + quiet=1 + shift;; + *) + break;; + esac + done + + local src="${1:?A source file is required as the first argument.}" + local dst="${2:?A destination s3:// URL is required as the second argument.}" + local cloudfront_domain="${3:-}" + + local s3path="${dst#s3://}" + local bucket="${s3path%%/*}" + local key="${s3path#*/}" + + local src_hash dst_hash no_hash=0000000000000000000000000000000000000000000000000000000000000000 + src_hash="$("$bin/sha256sum" < "$src")" + dst_hash="$(aws s3api head-object --bucket "$bucket" --key "$key" --query Metadata.sha256sum --output text 2>/dev/null || echo "$no_hash")" + + if [[ $src_hash != "$dst_hash" ]]; then + # The record count may have changed + src_record_count="$(wc -l < "$src")" + + echo "Uploading $src → $dst" + if [[ "$dst" == *.gz ]]; then + gzip -c "$src" + elif [[ "$dst" == *.xz ]]; then + xz -2 -T0 -c "$src" + elif [[ "$dst" == *.zst ]]; then + zstd -T0 -c "$src" + else + cat "$src" + fi | aws s3 cp --no-progress - "$dst" --metadata sha256sum="$src_hash",recordcount="$src_record_count" "$(content-type "$dst")" + + if [[ -n $cloudfront_domain ]]; then + echo "Creating CloudFront invalidation for $cloudfront_domain/$key" + if ! "$bin"/cloudfront-invalidate "$cloudfront_domain" "/$key"; then + echo "CloudFront invalidation failed, but exiting with success anyway." + fi + fi + + if [[ $quiet == 1 ]]; then + echo "Quiet mode. No Slack notification sent." + exit 0 + fi + + if ! "$bin"/notify-slack "Updated $dst available."; then + echo "Notifying Slack failed, but exiting with success anyway." + fi + else + echo "Uploading $src → $dst: files are identical, skipping upload" + fi +} + +content-type() { + case "$1" in + *.tsv) echo --content-type=text/tab-separated-values;; + *.csv) echo --content-type=text/comma-separated-values;; + *.ndjson) echo --content-type=application/x-ndjson;; + *.gz) echo --content-type=application/gzip;; + *.xz) echo --content-type=application/x-xz;; + *.zst) echo --content-type=application/zstd;; + *) echo --content-type=text/plain;; + esac +} + +main "$@" From 56335d2ff7ae945ac5d5260f72ffdba39da0e5aa Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Mon, 31 Jul 2023 15:18:14 -0700 Subject: [PATCH 2/6] Describe subrepo setup The previous commit was created by the following command: git subrepo clone https://github.com/nextstrain/ingest vendored Add a section in the README on how to use this directory in the future. 
--- README.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/README.md b/README.md index af89f31c..445af45a 100644 --- a/README.md +++ b/README.md @@ -150,3 +150,16 @@ aws s3 cp - s3://nextstrain-data/files/ncov/open/nextclade_21L.tsv.zst.renew < / - `AWS_SECRET_ACCESS_KEY` - `SLACK_TOKEN` - `SLACK_CHANNELS` + +## `vendored` + +This repository uses [`git subrepo`](https://github.com/ingydotnet/git-subrepo) to manage copies of ingest scripts in `vendored`, from [nextstrain/ingest](https://github.com/nextstrain/ingest). To pull new changes from the central ingest repository, first install `git subrepo`, then run: + +```sh +git subrepo pull vendored +``` + +Changes should not be pushed using `git subrepo push`. + +1. For pathogen-specific changes, make them in this repository via a pull request. +2. For pathogen-agnostic changes, make them on [nextstrain/ingest](https://github.com/nextstrain/ingest) via pull request there, then use `git subrepo pull` to add those changes to this repository. From 4ba0375d50d00b73bbcc0788e3b903091a0705b0 Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Fri, 11 Aug 2023 08:57:35 -0400 Subject: [PATCH 3/6] Use centralized scripts that are functionally identical Remove the copies in this repo and update references. --- bin/cloudfront-invalidate | 42 ---------- bin/download-from-s3 | 41 ---------- bin/local-ingest-gisaid | 20 ++--- bin/notify-on-additional-info-change | 3 +- bin/notify-on-duplicate-biosample-change | 5 +- bin/notify-on-flagged-metadata-change | 5 +- bin/notify-on-record-change | 53 ------------- bin/s3-object-exists | 8 -- bin/sha256sum | 15 ---- bin/upload-to-s3 | 78 ------------------- workflow/snakemake_rules/fetch_sequences.smk | 20 ++--- workflow/snakemake_rules/nextclade.smk | 16 ++-- .../snakemake_rules/slack_notifications.smk | 2 +- workflow/snakemake_rules/upload.smk | 2 +- 14 files changed, 38 insertions(+), 272 deletions(-) delete mode 100755 bin/cloudfront-invalidate delete mode 100755 bin/download-from-s3 delete mode 100755 bin/notify-on-record-change delete mode 100755 bin/s3-object-exists delete mode 100755 bin/sha256sum delete mode 100755 bin/upload-to-s3 diff --git a/bin/cloudfront-invalidate b/bin/cloudfront-invalidate deleted file mode 100755 index dec48529..00000000 --- a/bin/cloudfront-invalidate +++ /dev/null @@ -1,42 +0,0 @@ -#!/bin/bash -# Originally from @tsibley's gist: https://gist.github.com/tsibley/a66262d341dedbea39b02f27e2837ea8 -set -euo pipefail - -main() { - local domain="$1" - shift - local paths=("$@") - local distribution invalidation - - echo "-> Finding CloudFront distribution" - distribution=$( - aws cloudfront list-distributions \ - --query "DistributionList.Items[?contains(Aliases.Items, \`$domain\`)] | [0].Id" \ - --output text - ) - - if [[ -z $distribution || $distribution == None ]]; then - exec >&2 - echo "Unable to find CloudFront distribution id for $domain" - echo - echo "Are your AWS CLI credentials for the right account?" - exit 1 - fi - - echo "-> Creating CloudFront invalidation for distribution $distribution" - invalidation=$( - aws cloudfront create-invalidation \ - --distribution-id "$distribution" \ - --paths "${paths[@]}" \ - --query Invalidation.Id \ - --output text - ) - - echo "-> Waiting for CloudFront invalidation $invalidation to complete" - echo " Ctrl-C to stop waiting." 
- aws cloudfront wait invalidation-completed \ - --distribution-id "$distribution" \ - --id "$invalidation" -} - -main "$@" diff --git a/bin/download-from-s3 b/bin/download-from-s3 deleted file mode 100755 index 281bf227..00000000 --- a/bin/download-from-s3 +++ /dev/null @@ -1,41 +0,0 @@ -#!/bin/bash -set -euo pipefail - -bin="$(dirname "$0")" - -main() { - local src="${1:?A source s3:// URL is required as the first argument.}" - local dst="${2:?A destination file path is required as the second argument.}" - local n="${3:-0}" # How many lines to subsample to. 0 means no subsampling. Optional. - - local s3path="${src#s3://}" - local bucket="${s3path%%/*}" - local key="${s3path#*/}" - - local src_hash dst_hash no_hash=0000000000000000000000000000000000000000000000000000000000000000 - dst_hash="$("$bin/sha256sum" < "$dst" || true)" - src_hash="$(aws s3api head-object --bucket "$bucket" --key "$key" --query Metadata.sha256sum --output text 2>/dev/null || echo "$no_hash")" - - echo "[ INFO] Downloading $src → $dst" - if [[ $src_hash != "$dst_hash" ]]; then - aws s3 cp --no-progress "$src" - | - if [[ "$src" == *.gz ]]; then - gunzip -cfq - elif [[ "$src" == *.xz ]]; then - xz -T0 -dcq - elif [[ "$src" == *.zst ]]; then - zstd -T0 -dcq - else - cat - fi | - if [[ "$n" -gt 0 ]]; then - tsv-sample -H -i -n "$n" - else - cat - fi >"$dst" - else - echo "[ INFO] Files are identical, skipping download" - fi -} - -main "$@" diff --git a/bin/local-ingest-gisaid b/bin/local-ingest-gisaid index 7f6d8b07..892be334 100755 --- a/bin/local-ingest-gisaid +++ b/bin/local-ingest-gisaid @@ -79,8 +79,8 @@ main() { download-inputs() { mkdir -p "${INPUT_DIR}" - ./bin/download-from-s3 "${S3_BUCKET}/additional_info.tsv.gz" "data/gisaid/inputs/additional_info.tsv" - ./bin/download-from-s3 "${S3_BUCKET}/metadata.tsv.gz" "data/gisaid/inputs/metadata.tsv" + ./vendored/download-from-s3 "${S3_BUCKET}/additional_info.tsv.gz" "data/gisaid/inputs/additional_info.tsv" + ./vendored/download-from-s3 "${S3_BUCKET}/metadata.tsv.gz" "data/gisaid/inputs/metadata.tsv" } download-gisaid() { @@ -160,16 +160,16 @@ ingest() { } upload-outputs() { - ./bin/upload-to-s3 "${OUTPUT_DIR}/metadata.tsv" "${S3_BUCKET}/metadata.tsv.gz" - ./bin/upload-to-s3 "${OUTPUT_DIR}/additional_info.tsv" "${S3_BUCKET}/additional_info.tsv.gz" - ./bin/upload-to-s3 "${OUTPUT_DIR}/flagged_metadata.txt" "${S3_BUCKET}/flagged_metadata.txt.gz" - ./bin/upload-to-s3 "${OUTPUT_DIR}/sequences.fasta" "${S3_BUCKET}/sequences.fasta.xz" + ./vendored/upload-to-s3 "${OUTPUT_DIR}/metadata.tsv" "${S3_BUCKET}/metadata.tsv.gz" + ./vendored/upload-to-s3 "${OUTPUT_DIR}/additional_info.tsv" "${S3_BUCKET}/additional_info.tsv.gz" + ./vendored/upload-to-s3 "${OUTPUT_DIR}/flagged_metadata.txt" "${S3_BUCKET}/flagged_metadata.txt.gz" + ./vendored/upload-to-s3 "${OUTPUT_DIR}/sequences.fasta" "${S3_BUCKET}/sequences.fasta.xz" # Parallel uploads of zstd compressed files to slowly transition to this format - ./bin/upload-to-s3 "${OUTPUT_DIR}/metadata.tsv" "${S3_BUCKET}/metadata.tsv.zst" - ./bin/upload-to-s3 "${OUTPUT_DIR}/additional_info.tsv" "${S3_BUCKET}/additional_info.tsv.zst" - ./bin/upload-to-s3 "${OUTPUT_DIR}/flagged_metadata.txt" "${S3_BUCKET}/flagged_metadata.txt.zst" - ./bin/upload-to-s3 "${OUTPUT_DIR}/sequences.fasta" "${S3_BUCKET}/sequences.fasta.zst" + ./vendored/upload-to-s3 "${OUTPUT_DIR}/metadata.tsv" "${S3_BUCKET}/metadata.tsv.zst" + ./vendored/upload-to-s3 "${OUTPUT_DIR}/additional_info.tsv" "${S3_BUCKET}/additional_info.tsv.zst" + ./vendored/upload-to-s3 
"${OUTPUT_DIR}/flagged_metadata.txt" "${S3_BUCKET}/flagged_metadata.txt.zst" + ./vendored/upload-to-s3 "${OUTPUT_DIR}/sequences.fasta" "${S3_BUCKET}/sequences.fasta.zst" } print-help() { diff --git a/bin/notify-on-additional-info-change b/bin/notify-on-additional-info-change index 95190617..185589a5 100755 --- a/bin/notify-on-additional-info-change +++ b/bin/notify-on-additional-info-change @@ -5,12 +5,13 @@ set -euo pipefail : "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" bin="$(dirname "$0")" +vendored="$(dirname "$0")"/../vendored src="${1:?A source additional info TSV file is required as the first argument.}" dst="${2:?A destination additional info TSV s3:// URL is required as the second argument.}" # if the file is not already present, just exit -"$bin"/s3-object-exists "$dst" || exit 0 +"$vendored"/s3-object-exists "$dst" || exit 0 # Remove rows where columns 3 (additional_host_info) and 4 (additional_location_info) are empty. # Compare the S3 version with the local version. diff --git a/bin/notify-on-duplicate-biosample-change b/bin/notify-on-duplicate-biosample-change index 6e94ac93..5e7983ca 100755 --- a/bin/notify-on-duplicate-biosample-change +++ b/bin/notify-on-duplicate-biosample-change @@ -5,6 +5,7 @@ set -euo pipefail : "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" bin="$(dirname "$0")" +vendored="$(dirname "$0")"/../vendored src="${1:?A source duplicate BioSample txt file is required as the first argument.}" dst="${2:?A destination duplicate BioSample txt s3:// URL is required as the second argument.}" @@ -16,9 +17,9 @@ diff="$(mktemp -t duplicate-biosample-additions-XXXXXX)" trap "rm -f '$dst_local' '$diff'" EXIT # if the file is not already present, just exit -"$bin"/s3-object-exists "$dst" || exit 0 +"$vendored"/s3-object-exists "$dst" || exit 0 -"$bin"/download-from-s3 "$dst" "$dst_local" +"$vendored"/download-from-s3 "$dst" "$dst_local" comm -13 \ <(sort "$dst_local") \ diff --git a/bin/notify-on-flagged-metadata-change b/bin/notify-on-flagged-metadata-change index 55069f49..0b93784d 100755 --- a/bin/notify-on-flagged-metadata-change +++ b/bin/notify-on-flagged-metadata-change @@ -5,6 +5,7 @@ set -euo pipefail : "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" bin="$(dirname "$0")" +vendored="$(dirname "$0")"/../vendored src="${1:?A source flagged metadata txt file is required as the first argument.}" dst="${2:?A destination flagged metadata txt s3:// URL is required as the second argument.}" @@ -16,9 +17,9 @@ diff="$(mktemp -t flagged-metadata-additions-XXXXXX)" trap "rm -f '$dst_local' '$diff'" EXIT # if the file is not already present, just exit -"$bin"/s3-object-exists "$dst" || exit 0 +"$vendored"/s3-object-exists "$dst" || exit 0 -"$bin"/download-from-s3 "$dst" "$dst_local" +"$vendored"/download-from-s3 "$dst" "$dst_local" comm -13 \ <(sort "$dst_local") \ diff --git a/bin/notify-on-record-change b/bin/notify-on-record-change deleted file mode 100755 index 7c8afd07..00000000 --- a/bin/notify-on-record-change +++ /dev/null @@ -1,53 +0,0 @@ -#!/bin/bash -set -euo pipefail - -: "${SLACK_TOKEN:?The SLACK_TOKEN environment variable is required.}" -: "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" - -bin="$(dirname "$0")" - -src="${1:?A source GenBank or GISAID ndjson file is required as the first argument.}" -dst="${2:?A destination GenBank or GISAID ndjson s3:// URL is required as the second argument.}" -source_name=${3:?A record source name is required as the 
third argument.} - -# if the file is not already present, just exit -"$bin"/s3-object-exists "$dst" || exit 0 - -s3path="${dst#s3://}" -bucket="${s3path%%/*}" -key="${s3path#*/}" - -src_record_count="$(wc -l < "$src")" - -# Try getting record count from S3 object metadata -dst_record_count="$(aws s3api head-object --bucket "$bucket" --key "$key" --query "Metadata.recordcount || ''" --output text 2>/dev/null || true)" -if [[ -z "$dst_record_count" ]]; then - # This object doesn't have the record count stored as metadata - # We have to download it and count the lines locally - dst_record_count="$(wc -l < <(aws s3 cp --no-progress "$dst" - | xz -T0 -dcfq))" -fi - -added_records="$(( src_record_count - dst_record_count ))" - -printf "%'4d %s\n" "$src_record_count" "$src" -printf "%'4d %s\n" "$dst_record_count" "$dst" -printf "%'4d added records\n" "$added_records" - -slack_message="" - -if [[ $added_records -gt 0 ]]; then - echo "Notifying Slack about added records (n=$added_records)" - slack_message="📈 New nCoV records (n=$added_records) found on $source_name." - -elif [[ $added_records -lt 0 ]]; then - echo "Notifying Slack about fewer records (n=$added_records)" - slack_message="📉 Fewer nCoV records (n=$added_records) found on $source_name." - -else - echo "Notifying Slack about same number of records" - slack_message="⛔ No new nCoV records found on $source_name." -fi - -slack_message+=" (Total record count: $src_record_count)" - -"$bin"/notify-slack "$slack_message" diff --git a/bin/s3-object-exists b/bin/s3-object-exists deleted file mode 100755 index faac4219..00000000 --- a/bin/s3-object-exists +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash -set -euo pipefail - -url="${1#s3://}" -bucket="${url%%/*}" -key="${url#*/}" - -aws s3api head-object --bucket "$bucket" --key "$key" &>/dev/null diff --git a/bin/sha256sum b/bin/sha256sum deleted file mode 100755 index 32d7ef82..00000000 --- a/bin/sha256sum +++ /dev/null @@ -1,15 +0,0 @@ -#!/usr/bin/env python3 -""" -Portable sha256sum utility. 
-""" -from hashlib import sha256 -from sys import stdin - -chunk_size = 5 * 1024**2 # 5 MiB - -h = sha256() - -for chunk in iter(lambda: stdin.buffer.read(chunk_size), b""): - h.update(chunk) - -print(h.hexdigest()) diff --git a/bin/upload-to-s3 b/bin/upload-to-s3 deleted file mode 100755 index 31cd49bf..00000000 --- a/bin/upload-to-s3 +++ /dev/null @@ -1,78 +0,0 @@ -#!/bin/bash -set -euo pipefail - -bin="$(dirname "$0")" - -main() { - local quiet=0 - - for arg; do - case "$arg" in - --quiet) - quiet=1 - shift;; - *) - break;; - esac - done - - local src="${1:?A source file is required as the first argument.}" - local dst="${2:?A destination s3:// URL is required as the second argument.}" - local cloudfront_domain="${3:-}" - - local s3path="${dst#s3://}" - local bucket="${s3path%%/*}" - local key="${s3path#*/}" - - local src_hash dst_hash no_hash=0000000000000000000000000000000000000000000000000000000000000000 - src_hash="$("$bin/sha256sum" < "$src")" - dst_hash="$(aws s3api head-object --bucket "$bucket" --key "$key" --query Metadata.sha256sum --output text 2>/dev/null || echo "$no_hash")" - - if [[ $src_hash != "$dst_hash" ]]; then - # The record count may have changed - src_record_count="$(wc -l < "$src")" - - echo "Uploading $src → $dst" - if [[ "$dst" == *.gz ]]; then - gzip -c "$src" - elif [[ "$dst" == *.xz ]]; then - xz -2 -T0 -c "$src" - elif [[ "$dst" == *.zst ]]; then - zstd -T0 -c "$src" - else - cat "$src" - fi | aws s3 cp --no-progress - "$dst" --metadata sha256sum="$src_hash",recordcount="$src_record_count" "$(content-type "$dst")" - - if [[ -n $cloudfront_domain ]]; then - echo "Creating CloudFront invalidation for $cloudfront_domain/$key" - if ! "$bin"/cloudfront-invalidate "$cloudfront_domain" "/$key"; then - echo "CloudFront invalidation failed, but exiting with success anyway." - fi - fi - - if [[ $quiet == 1 ]]; then - echo "Quiet mode. No Slack notification sent." - exit 0 - fi - - if ! "$bin"/notify-slack "Updated $dst available."; then - echo "Notifying Slack failed, but exiting with success anyway." 
- fi - else - echo "Uploading $src → $dst: files are identical, skipping upload" - fi -} - -content-type() { - case "$1" in - *.tsv) echo --content-type=text/tab-separated-values;; - *.csv) echo --content-type=text/comma-separated-values;; - *.ndjson) echo --content-type=application/x-ndjson;; - *.gz) echo --content-type=application/gzip;; - *.xz) echo --content-type=application/x-xz;; - *.zst) echo --content-type=application/zstd;; - *) echo --content-type=text/plain;; - esac -} - -main "$@" diff --git a/workflow/snakemake_rules/fetch_sequences.smk b/workflow/snakemake_rules/fetch_sequences.smk index 0e03d7f9..7d1402d4 100644 --- a/workflow/snakemake_rules/fetch_sequences.smk +++ b/workflow/snakemake_rules/fetch_sequences.smk @@ -253,8 +253,8 @@ if config.get("s3_dst") and config.get("s3_src"): ndjson = temp(f"data/{database}.ndjson") shell: """ - ./bin/download-from-s3 {params.file_on_s3_dst} {output.ndjson} {params.lines} || \ - ./bin/download-from-s3 {params.file_on_s3_src} {output.ndjson} {params.lines} + ./vendored/download-from-s3 {params.file_on_s3_dst} {output.ndjson} {params.lines} || \ + ./vendored/download-from-s3 {params.file_on_s3_src} {output.ndjson} {params.lines} """ rule fetch_biosample_from_s3: @@ -268,8 +268,8 @@ if config.get("s3_dst") and config.get("s3_src"): biosample = temp("data/biosample.ndjson") shell: """ - ./bin/download-from-s3 {params.file_on_s3_dst} {output.biosample} {params.lines} || \ - ./bin/download-from-s3 {params.file_on_s3_src} {output.biosample} {params.lines} + ./vendored/download-from-s3 {params.file_on_s3_dst} {output.biosample} {params.lines} || \ + ./vendored/download-from-s3 {params.file_on_s3_src} {output.biosample} {params.lines} """ rule fetch_rki_ndjson_from_s3: @@ -281,8 +281,8 @@ if config.get("s3_dst") and config.get("s3_src"): rki_ndjson = temp("data/rki.ndjson") shell: """ - ./bin/download-from-s3 {params.file_on_s3_dst} {output.rki_ndjson} {params.lines} || \ - ./bin/download-from-s3 {params.file_on_s3_src} {output.rki_ndjson} {params.lines} + ./vendored/download-from-s3 {params.file_on_s3_dst} {output.rki_ndjson} {params.lines} || \ + ./vendored/download-from-s3 {params.file_on_s3_src} {output.rki_ndjson} {params.lines} """ rule fetch_cog_uk_accessions_from_s3: params: @@ -293,8 +293,8 @@ if config.get("s3_dst") and config.get("s3_src"): biosample = "data/cog_uk_accessions.tsv" if config.get("keep_temp",False) else temp("data/cog_uk_accessions.tsv") shell: """ - ./bin/download-from-s3 {params.file_on_s3_dst} {output.biosample} {params.lines} || \ - ./bin/download-from-s3 {params.file_on_s3_src} {output.biosample} {params.lines} + ./vendored/download-from-s3 {params.file_on_s3_dst} {output.biosample} {params.lines} || \ + ./vendored/download-from-s3 {params.file_on_s3_src} {output.biosample} {params.lines} """ rule fetch_cog_uk_metadata_from_s3: @@ -306,8 +306,8 @@ if config.get("s3_dst") and config.get("s3_src"): biosample = temp("data/cog_uk_metadata.csv") shell: """ - ./bin/download-from-s3 {params.file_on_s3_dst} {output.biosample} {params.lines} || \ - ./bin/download-from-s3 {params.file_on_s3_src} {output.biosample} {params.lines} + ./vendored/download-from-s3 {params.file_on_s3_dst} {output.biosample} {params.lines} || \ + ./vendored/download-from-s3 {params.file_on_s3_src} {output.biosample} {params.lines} """ rule compress_cog_uk_metadata: diff --git a/workflow/snakemake_rules/nextclade.smk b/workflow/snakemake_rules/nextclade.smk index 264d3734..921a4679 100644 --- a/workflow/snakemake_rules/nextclade.smk +++ 
b/workflow/snakemake_rules/nextclade.smk @@ -75,10 +75,10 @@ if config.get("s3_dst") and config.get("s3_src"): nextclade=f"data/{database}/nextclade{{reference}}_old.tsv", shell: """ - ./bin/download-from-s3 {params.dst_rerun_touchfile} {output.nextclade} 0 || \ - ./bin/download-from-s3 {params.src_rerun_touchfile} {output.nextclade} 0 || \ - ./bin/download-from-s3 {params.dst_source} {output.nextclade} {params.lines} || \ - ./bin/download-from-s3 {params.src_source} {output.nextclade} {params.lines} || \ + ./vendored/download-from-s3 {params.dst_rerun_touchfile} {output.nextclade} 0 || \ + ./vendored/download-from-s3 {params.src_rerun_touchfile} {output.nextclade} 0 || \ + ./vendored/download-from-s3 {params.dst_source} {output.nextclade} {params.lines} || \ + ./vendored/download-from-s3 {params.src_source} {output.nextclade} {params.lines} || \ touch {output.nextclade} """ @@ -96,10 +96,10 @@ if config.get("s3_dst") and config.get("s3_src"): alignment=temp(f"data/{database}/nextclade.{{seqtype}}.old.fasta"), shell: """ - ./bin/download-from-s3 {params.dst_rerun_touchfile} {output.alignment} 0 || \ - ./bin/download-from-s3 {params.src_rerun_touchfile} {output.alignment} 0 || \ - ./bin/download-from-s3 {params.dst_source} {output.alignment} {params.lines} || \ - ./bin/download-from-s3 {params.src_source} {output.alignment} {params.lines} || \ + ./vendored/download-from-s3 {params.dst_rerun_touchfile} {output.alignment} 0 || \ + ./vendored/download-from-s3 {params.src_rerun_touchfile} {output.alignment} 0 || \ + ./vendored/download-from-s3 {params.dst_source} {output.alignment} {params.lines} || \ + ./vendored/download-from-s3 {params.src_source} {output.alignment} {params.lines} || \ touch {output.alignment} """ diff --git a/workflow/snakemake_rules/slack_notifications.smk b/workflow/snakemake_rules/slack_notifications.smk index 5add860f..cd9c4463 100644 --- a/workflow/snakemake_rules/slack_notifications.smk +++ b/workflow/snakemake_rules/slack_notifications.smk @@ -32,7 +32,7 @@ rule notify_on_record_change: touch(f"data/{database}/notify-on-record-change.done") shell: """ - ./bin/notify-on-record-change {input.ndjson} {params.ndjson_on_s3} {database} + ./vendored/notify-on-record-change {input.ndjson} {params.ndjson_on_s3} {database} """ diff --git a/workflow/snakemake_rules/upload.smk b/workflow/snakemake_rules/upload.smk index 55103a62..7f139037 100644 --- a/workflow/snakemake_rules/upload.smk +++ b/workflow/snakemake_rules/upload.smk @@ -87,7 +87,7 @@ rule upload_single: cloudfront_domain = config.get("cloudfront_domain", ""), shell: """ - ./bin/upload-to-s3 \ + ./vendored/upload-to-s3 \ {params.quiet} \ {input:q} \ {params.s3_bucket:q}/{wildcards.remote_filename:q} \ From abd812e616e6db14b417a254a5d245443f221399 Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Fri, 11 Aug 2023 09:00:10 -0400 Subject: [PATCH 4/6] Use centralized trigger scripts Both the centralized `trigger` and `trigger-on-new-data` take an owner/repo pair as the first argument. 
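For example, a dispatch that was previously triggered with `./bin/trigger ncov-ingest gisaid/ingest`
is now invoked with the owning repo named explicitly (illustrative invocation, matching the README
changes below):

    ./vendored/trigger nextstrain/ncov-ingest gisaid/ingest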
--- .../fetch-and-ingest-genbank-master.yml | 2 +- .../fetch-and-ingest-gisaid-master.yml | 2 +- .github/workflows/ingest-genbank-master.yml | 2 +- .github/workflows/ingest-gisaid-master.yml | 2 +- .github/workflows/update-image.yml | 2 +- README.md | 12 ++-- bin/trigger | 56 ------------------- bin/trigger-on-new-data | 32 ----------- workflow/snakemake_rules/trigger.smk | 6 +- 9 files changed, 14 insertions(+), 102 deletions(-) delete mode 100755 bin/trigger delete mode 100755 bin/trigger-on-new-data diff --git a/.github/workflows/fetch-and-ingest-genbank-master.yml b/.github/workflows/fetch-and-ingest-genbank-master.yml index c2b341a6..4034cd91 100644 --- a/.github/workflows/fetch-and-ingest-genbank-master.yml +++ b/.github/workflows/fetch-and-ingest-genbank-master.yml @@ -22,7 +22,7 @@ on: # sister GISAID job, so that we don't need to keep two schedules in our heads. - cron: '7 18 * * *' - # Manually triggered using `./bin/trigger ncov-ingest genbank/fetch-and-ingest` (or `fetch-and-ingest`, which + # Manually triggered using `./vendored/trigger nextstrain/ncov-ingest genbank/fetch-and-ingest` (or `fetch-and-ingest`, which # includes GISAID) repository_dispatch: types: diff --git a/.github/workflows/fetch-and-ingest-gisaid-master.yml b/.github/workflows/fetch-and-ingest-gisaid-master.yml index d2459dd9..cc4c512d 100644 --- a/.github/workflows/fetch-and-ingest-gisaid-master.yml +++ b/.github/workflows/fetch-and-ingest-gisaid-master.yml @@ -22,7 +22,7 @@ on: # sister GenBank job, so that we don't need to keep two schedules in our heads. - cron: '7 18 * * *' - # Manually triggered using `./bin/trigger ncov-ingest gisaid/fetch-and-ingest` + # Manually triggered using `./vendored/trigger nextstrain/ncov-ingest gisaid/fetch-and-ingest` repository_dispatch: types: - gisaid/fetch-and-ingest diff --git a/.github/workflows/ingest-genbank-master.yml b/.github/workflows/ingest-genbank-master.yml index a8ddbd50..bdc2a413 100644 --- a/.github/workflows/ingest-genbank-master.yml +++ b/.github/workflows/ingest-genbank-master.yml @@ -1,7 +1,7 @@ name: GenBank ingest on: - # Manually triggered using `./bin/trigger ncov-ingest genbank/ingest` (or `ingest`, which + # Manually triggered using `./vendored/trigger nextstrain/ncov-ingest genbank/ingest` (or `ingest`, which # includes GISAID) repository_dispatch: types: diff --git a/.github/workflows/ingest-gisaid-master.yml b/.github/workflows/ingest-gisaid-master.yml index 7da212d9..3fc2e727 100644 --- a/.github/workflows/ingest-gisaid-master.yml +++ b/.github/workflows/ingest-gisaid-master.yml @@ -1,7 +1,7 @@ name: GISAID ingest on: - # Manually triggered using `./bin/trigger ncov-ingest gisaid/ingest` (or `ingest`, which + # Manually triggered using `./vendored/trigger nextstrain/ncov-ingest gisaid/ingest` (or `ingest`, which # includes GenBank) repository_dispatch: types: diff --git a/.github/workflows/update-image.yml b/.github/workflows/update-image.yml index c4042505..0251c30c 100644 --- a/.github/workflows/update-image.yml +++ b/.github/workflows/update-image.yml @@ -15,7 +15,7 @@ on: - yarn.lock - .github/workflows/update-image.yml - # Manually triggered using `./bin/trigger ncov-ingest update-image` + # Manually triggered using `./vendored/trigger nextstrain/ncov-ingest update-image` repository_dispatch: types: update-image diff --git a/README.md b/README.md index 445af45a..6c7b33e9 100644 --- a/README.md +++ b/README.md @@ -76,18 +76,18 @@ AWS credentials are stored in this repository's secrets and are associated with A full run is now done in 3 
steps via manual triggers: -1. Fetch new sequences and ingest them by running `./bin/trigger ncov-ingest gisaid/fetch-and-ingest --user `. +1. Fetch new sequences and ingest them by running `./vendored/trigger nextstrain/ncov-ingest gisaid/fetch-and-ingest --user `. 2. Add manual annotations, update location hierarchy as needed, and run ingest without fetching new sequences. - Pushes of `source-data/*-annotations.tsv` to the master branch will automatically trigger a run of ingest. - - You can also run ingest manually by running `./bin/trigger ncov-ingest gisaid/ingest --user `. -3. Once all manual fixes are complete, trigger a rebuild of [nextstrain/ncov](https://github.com/nextstrain/ncov) by running `./bin/trigger ncov gisaid/rebuild --user `. + - You can also run ingest manually by running `./vendored/trigger nextstrain/ncov-ingest gisaid/ingest --user `. +3. Once all manual fixes are complete, trigger a rebuild of [nextstrain/ncov](https://github.com/nextstrain/ncov) by running `./vendored/trigger ncov gisaid/rebuild --user `. -See the output of `./bin/trigger ncov-ingest gisaid/fetch-and-ingest --user `, `./bin/trigger ncov-ingest gisaid/ingest` or `./bin/trigger ncov-ingest rebuild` for more information about authentication with GitHub. +See the output of `./vendored/trigger nextstrain/ncov-ingest gisaid/fetch-and-ingest --user `, `./vendored/trigger nextstrain/ncov-ingest gisaid/ingest` or `./vendored/trigger nextstrain/ncov-ingest rebuild` for more information about authentication with GitHub. -Note: running `./bin/trigger ncov-ingest` posts a GitHub `repository_dispatch`. +Note: running `./vendored/trigger nextstrain/ncov-ingest` posts a GitHub `repository_dispatch`. Regardless of which branch you are on, it will trigger the specified action on the master branch. -Valid dispatch types for `./bin/trigger ncov-ingest` are: +Valid dispatch types for `./vendored/trigger nextstrain/ncov-ingest` are: - `ingest` (both GISAID and GenBank) - `gisaid/ingest` diff --git a/bin/trigger b/bin/trigger deleted file mode 100755 index d40553b6..00000000 --- a/bin/trigger +++ /dev/null @@ -1,56 +0,0 @@ -#!/bin/bash -set -euo pipefail - -: "${PAT_GITHUB_DISPATCH:=}" - -repo="${1:?A repository name is required as the first argument.}" -event_type="${2:?An event type is required as the second argument.}" -shift 2 - -if [[ $# -eq 0 && -z $PAT_GITHUB_DISPATCH ]]; then - cat >&2 <<. -You must specify options to curl for your GitHub credentials. For example, you -can specify your GitHub username, and will be prompted for your password: - - $0 $repo $event_type --user - -Be sure to enter a personal access token¹ as your password since GitHub has -discontinued password authentication to the API starting on November 13, 2020². - -You can also store your credentials or a personal access token in a netrc -file³: - - machine api.github.com - login - password - -and then tell curl to use it: - - $0 $repo $event_type --netrc - -which will then not require you to type your password every time. - -¹ https://help.github.com/en/github/authenticating-to-github/creating-a-personal-access-token-for-the-command-line -² https://docs.github.com/en/rest/overview/other-authentication-methods#via-username-and-password -³ https://ec.haxx.se/usingcurl/usingcurl-netrc -. 
- exit 1 -fi - -auth=':' -if [[ -n $PAT_GITHUB_DISPATCH ]]; then - auth="Authorization: Bearer ${PAT_GITHUB_DISPATCH}" -fi - -if curl -fsS "https://api.github.com/repos/nextstrain/${repo}/dispatches" \ - -H 'Accept: application/vnd.github.v3+json' \ - -H 'Content-Type: application/json' \ - -H "$auth" \ - -d '{"event_type":"'"$event_type"'"}' \ - "$@" -then - echo "Successfully triggered $event_type" -else - echo "Request failed" >&2 - exit 1 -fi diff --git a/bin/trigger-on-new-data b/bin/trigger-on-new-data deleted file mode 100755 index 1af77988..00000000 --- a/bin/trigger-on-new-data +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash -set -euo pipefail - -: "${PAT_GITHUB_DISPATCH:?The PAT_GITHUB_DISPATCH environment variable is required.}" - -bin="$(dirname "$0")" - -repo="${1:?A repository name is required as the first argument.}" -event_type="${2:?An event type is required as the second argument.}" -metadata="${3:?A metadata upload output file is required as the third argument.}" -sequences="${4:?An sequence FASTA upload output file is required as the fourth argument.}" -identical_file_message="${5:-files are identical}" - -new_metadata=$(grep "$identical_file_message" "$metadata" >/dev/null; echo $?) -new_sequences=$(grep "$identical_file_message" "$sequences" >/dev/null; echo $?) - -slack_message="" - -# grep exit status 0 for found match, 1 for no match, 2 if an error occurred -if [[ $new_metadata -eq 1 || $new_sequences -eq 1 ]]; then - slack_message="Triggering new builds due to updated metadata and/or sequences" - "$bin"/trigger "$repo" "$event_type" -elif [[ $new_metadata -eq 0 && $new_sequences -eq 0 ]]; then - slack_message="Skipping trigger of rebuild: Both metadata TSV and sequences FASTA are identical to S3 files." -else - slack_message="Skipping trigger of rebuild: Unable to determine if data has been updated." -fi - - -if ! "$bin"/notify-slack "$slack_message"; then - echo "Notifying Slack failed, but exiting with success anyway." -fi diff --git a/workflow/snakemake_rules/trigger.smk b/workflow/snakemake_rules/trigger.smk index d53d6bbb..09af7689 100644 --- a/workflow/snakemake_rules/trigger.smk +++ b/workflow/snakemake_rules/trigger.smk @@ -22,8 +22,8 @@ rule trigger_rebuild_pipeline: dispatch_type = f"{database}/rebuild" shell: """ - ./bin/trigger-on-new-data \ - ncov \ + ./vendored/trigger-on-new-data \ + nextstrain/ncov \ {params.dispatch_type} \ {input.metadata_upload} \ {input.fasta_upload} @@ -39,5 +39,5 @@ rule trigger_counts_pipeline: dispatch_type = f"{database}/clade-counts" shell: """ - ./bin/trigger forecasts-ncov {params.dispatch_type} + ./vendored/trigger nextstrain/forecasts-ncov {params.dispatch_type} """ From a82898bde9c79fa6f66f509e7595739132e2b2e9 Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Mon, 31 Jul 2023 15:40:38 -0700 Subject: [PATCH 5/6] Use centralized Slack notification scripts Remove the copies in this repo and update references. Add new positional arguments required by the centralized scripts. 
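For example (as reflected in the workflow and Snakefile changes below), the failure notification
now takes the job name and repo as positional arguments:

    ./vendored/notify-on-job-fail Ingest nextstrain/ncov-ingest

and the start notification passes the repo as its second argument:

    ./vendored/notify-on-job-start "<job name>" nextstrain/ncov-ingest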
--- .../fetch-and-ingest-genbank-master.yml | 2 +- .../fetch-and-ingest-gisaid-master.yml | 2 +- .github/workflows/ingest-genbank-master.yml | 2 +- .github/workflows/ingest-gisaid-master.yml | 2 +- Snakefile | 4 +- bin/notify-on-additional-info-change | 2 +- bin/notify-on-duplicate-biosample-change | 4 +- bin/notify-on-flagged-metadata-change | 4 +- bin/notify-on-job-fail | 21 --------- bin/notify-on-job-start | 26 ----------- bin/notify-on-problem-data | 3 +- bin/notify-slack | 44 ------------------- .../snakemake_rules/slack_notifications.smk | 4 +- 13 files changed, 15 insertions(+), 105 deletions(-) delete mode 100755 bin/notify-on-job-fail delete mode 100755 bin/notify-on-job-start delete mode 100755 bin/notify-slack diff --git a/.github/workflows/fetch-and-ingest-genbank-master.yml b/.github/workflows/fetch-and-ingest-genbank-master.yml index 4034cd91..fffd83fd 100644 --- a/.github/workflows/fetch-and-ingest-genbank-master.yml +++ b/.github/workflows/fetch-and-ingest-genbank-master.yml @@ -82,4 +82,4 @@ jobs: - name: notify_pipeline_failed if: ${{ failure() }} - run: ./bin/notify-on-job-fail + run: ./vendored/notify-on-job-fail Ingest nextstrain/ncov-ingest diff --git a/.github/workflows/fetch-and-ingest-gisaid-master.yml b/.github/workflows/fetch-and-ingest-gisaid-master.yml index cc4c512d..c4280389 100644 --- a/.github/workflows/fetch-and-ingest-gisaid-master.yml +++ b/.github/workflows/fetch-and-ingest-gisaid-master.yml @@ -85,4 +85,4 @@ jobs: - name: notify_pipeline_failed if: ${{ failure() }} - run: ./bin/notify-on-job-fail + run: ./vendored/notify-on-job-fail Ingest nextstrain/ncov-ingest diff --git a/.github/workflows/ingest-genbank-master.yml b/.github/workflows/ingest-genbank-master.yml index bdc2a413..6734f14a 100644 --- a/.github/workflows/ingest-genbank-master.yml +++ b/.github/workflows/ingest-genbank-master.yml @@ -51,4 +51,4 @@ jobs: - name: notify_pipeline_failed if: ${{ failure() }} - run: ./bin/notify-on-job-fail + run: ./vendored/notify-on-job-fail Ingest nextstrain/ncov-ingest diff --git a/.github/workflows/ingest-gisaid-master.yml b/.github/workflows/ingest-gisaid-master.yml index 3fc2e727..2d275f84 100644 --- a/.github/workflows/ingest-gisaid-master.yml +++ b/.github/workflows/ingest-gisaid-master.yml @@ -51,4 +51,4 @@ jobs: - name: notify_pipeline_failed if: ${{ failure() }} - run: ./bin/notify-on-job-fail + run: ./vendored/notify-on-job-fail Ingest nextstrain/ncov-ingest diff --git a/Snakefile b/Snakefile index 7904c267..d8e527ac 100644 --- a/Snakefile +++ b/Snakefile @@ -92,7 +92,7 @@ onstart: print(f"\t${{{var}}}: " + ("YES" if os.environ.get(var, "") else "NO") + f"({description})") if send_notifications: message="🥗 GISAID ingest" if database=="gisaid" else "🥣 GenBank ingest" - shell(f"./bin/notify-on-job-start \"{message}\"") + shell(f"./vendored/notify-on-job-start \"{message}\" nextstrain/ncov-ingest") onsuccess: message = "✅ This pipeline has successfully finished 🎉" @@ -104,7 +104,7 @@ onsuccess: onerror: print("Pipeline failed.") if send_notifications: - shell("./bin/notify-on-job-fail") + shell("./vendored/notify-on-job-fail Ingest nextstrain/ncov-ingest") if not config.get("keep_all_files", False): print("Removing intermediate files (set config option keep_all_files to skip this)") shell("./bin/clean") diff --git a/bin/notify-on-additional-info-change b/bin/notify-on-additional-info-change index 185589a5..c6554c65 100755 --- a/bin/notify-on-additional-info-change +++ b/bin/notify-on-additional-info-change @@ -27,7 +27,7 @@ diff="$( if [[ -n "$diff" 
]]; then echo "Notifying Slack about additional info change." - "$bin"/notify-slack --upload "additional-info-changes.txt" <<<"$diff" + "$vendored"/notify-slack --upload "additional-info-changes.txt" <<<"$diff" else echo "No additional info change." fi diff --git a/bin/notify-on-duplicate-biosample-change b/bin/notify-on-duplicate-biosample-change index 5e7983ca..52a372b8 100755 --- a/bin/notify-on-duplicate-biosample-change +++ b/bin/notify-on-duplicate-biosample-change @@ -29,8 +29,8 @@ comm -13 \ if [[ -s "$diff" ]]; then echo echo "Notifying Slack about duplicate BioSample additions." - "$bin"/notify-slack ":warning: Newly flagged duplicate BioSample strains" - "$bin"/notify-slack --upload "duplicate-biosample-additions.txt" < "$diff" + "$vendored"/notify-slack ":warning: Newly flagged duplicate BioSample strains" + "$vendored"/notify-slack --upload "duplicate-biosample-additions.txt" < "$diff" else echo "No flagged duplicate BioSample additions." fi diff --git a/bin/notify-on-flagged-metadata-change b/bin/notify-on-flagged-metadata-change index 0b93784d..b10f2f22 100755 --- a/bin/notify-on-flagged-metadata-change +++ b/bin/notify-on-flagged-metadata-change @@ -29,8 +29,8 @@ comm -13 \ if [[ -s "$diff" ]]; then echo echo "Notifying Slack about flagged metadata additions." - "$bin"/notify-slack ":waving_black_flag: Newly flagged metadata" - "$bin"/notify-slack --upload "flagged-metadata-additions.txt" < "$diff" + "$vendored"/notify-slack ":waving_black_flag: Newly flagged metadata" + "$vendored"/notify-slack --upload "flagged-metadata-additions.txt" < "$diff" else echo "No flagged metadata additions." fi diff --git a/bin/notify-on-job-fail b/bin/notify-on-job-fail deleted file mode 100755 index 3d49b934..00000000 --- a/bin/notify-on-job-fail +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash -set -euo pipefail - -: "${SLACK_TOKEN:?The SLACK_TOKEN environment variable is required.}" -: "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" - -bin="$(dirname "$0")" - -aws_batch_job_id="${AWS_BATCH_JOB_ID:-}" -github_run_id="${GITHUB_RUN_ID:-}" - -echo "Notifying Slack about failed ingest job." -message="❌ Ingest job has FAILED 😞 " - -if [ -n "${aws_batch_job_id}" ]; then - message+="See AWS Batch job \`${aws_batch_job_id}\` () for error details. " -elif [ -n "${github_run_id}" ]; then - message+="See GitHub Action for error details. " -fi - -"$bin"/notify-slack "$message" diff --git a/bin/notify-on-job-start b/bin/notify-on-job-start deleted file mode 100755 index 5d2f239c..00000000 --- a/bin/notify-on-job-start +++ /dev/null @@ -1,26 +0,0 @@ -#!/bin/bash -set -euo pipefail - -bin="$(dirname "$0")" - -export JOB_NAME="${1:?Job name is required as the first argument.}" -export ADDITIONAL_INFO="${2:-}" -export AWS_BATCH_JOB_ID="${AWS_BATCH_JOB_ID:-}" -export GITHUB_RUN_ID="${GITHUB_RUN_ID:-}" -export SLACK_TOKEN="${SLACK_TOKEN:-}" -export SLACK_CHANNELS="${SLACK_CHANNELS:-}" - -if [ -n "${SLACK_TOKEN}" ] && [ -n "${SLACK_CHANNELS}" ] && [ -n "${AWS_BATCH_JOB_ID}" ]; then - echo "Notifying Slack about started AWS Batch job. AWS_BATCH_JOB_ID=\"${AWS_BATCH_JOB_ID}\"." - - msg_job_id="${JOB_NAME} from AWS Batch job \`${AWS_BATCH_JOB_ID}\` () started." - - msg_action="" - if [ -n "${GITHUB_RUN_ID}" ]; then - msg_action="The job was submitted by GitHub Action ." 
- fi - - msg_command="Follow along in your local \`ncov-ingest\` repo with: "'```'"nextstrain build --aws-batch --no-download --attach ${AWS_BATCH_JOB_ID} ."'```' - - "$bin"/notify-slack "${msg_job_id} ${msg_action} ${msg_command} ${ADDITIONAL_INFO}" -fi diff --git a/bin/notify-on-problem-data b/bin/notify-on-problem-data index 41507378..e172330d 100755 --- a/bin/notify-on-problem-data +++ b/bin/notify-on-problem-data @@ -5,12 +5,13 @@ set -euo pipefail : "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" bin="$(dirname "$0")" +vendored="$(dirname "$0")"/../vendored problem_data="${1:?A problem data TSV file is required as the first argument.}" if [[ -s "$problem_data" ]]; then echo "Notifying Slack about problem data." - "$bin"/notify-slack --upload "genbank-problem-data.tsv" < "$problem_data" + "$vendored"/notify-slack --upload "genbank-problem-data.tsv" < "$problem_data" else echo "No problem data found." fi diff --git a/bin/notify-slack b/bin/notify-slack deleted file mode 100755 index 6695d83f..00000000 --- a/bin/notify-slack +++ /dev/null @@ -1,44 +0,0 @@ -#!/bin/bash -set -euo pipefail - -: "${SLACK_TOKEN:?The SLACK_TOKEN environment variable is required.}" -: "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" - -upload=0 -args=() - -for arg; do - case "$arg" in - --upload) - upload=1;; - *) - args+=("$arg");; - esac -done - -set -- "${args[@]}" - -text="${1:?Some message text is required.}" - -if [[ "$upload" == 1 ]]; then - echo "Uploading data to Slack with the message: $text" - curl https://slack.com/api/files.upload \ - --header "Authorization: Bearer $SLACK_TOKEN" \ - --form-string channels="$SLACK_CHANNELS" \ - --form-string title="$text" \ - --form-string filename="$text" \ - --form file=@/dev/stdin \ - --form filetype=text \ - --fail --silent --show-error \ - --http1.1 \ - --output /dev/null -else - echo "Posting Slack message: $text" - curl https://slack.com/api/chat.postMessage \ - --header "Authorization: Bearer $SLACK_TOKEN" \ - --form-string channel="$SLACK_CHANNELS" \ - --form-string text="$text" \ - --fail --silent --show-error \ - --http1.1 \ - --output /dev/null -fi diff --git a/workflow/snakemake_rules/slack_notifications.smk b/workflow/snakemake_rules/slack_notifications.smk index cd9c4463..b3f50970 100644 --- a/workflow/snakemake_rules/slack_notifications.smk +++ b/workflow/snakemake_rules/slack_notifications.smk @@ -46,7 +46,7 @@ rule notify_gisaid: output: touch("data/gisaid/notify.done") run: - shell("./bin/notify-slack --upload flagged-annotations < {input.flagged_annotations}") + shell("./vendored/notify-slack --upload flagged-annotations < {input.flagged_annotations}") shell("./bin/notify-on-additional-info-change {input.additional_info} {params.s3_bucket}/additional_info.tsv.gz") shell("./bin/notify-on-flagged-metadata-change {input.flagged_metadata} {params.s3_bucket}/flagged_metadata.txt.gz") @@ -59,7 +59,7 @@ rule notify_genbank: output: touch("data/genbank/notify.done") run: - shell("./bin/notify-slack --upload flagged-annotations < {input.flagged_annotations}") + shell("./vendored/notify-slack --upload flagged-annotations < {input.flagged_annotations}") # TODO - which rule produces data/genbank/problem_data.tsv? 
(was not explicit in `ingest-genbank` bash script) shell("./bin/notify-on-problem-data data/genbank/problem_data.tsv") shell("./bin/notify-on-duplicate-biosample-change {input.duplicate_biosample} {params.s3_bucket}/duplicate_biosample.txt.gz") From 6ce947ef304df900b58e28db60887b98e1ebbfee Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Mon, 31 Jul 2023 15:43:52 -0700 Subject: [PATCH 6/6] Remove unused bin variable --- bin/notify-on-additional-info-change | 1 - bin/notify-on-duplicate-biosample-change | 1 - bin/notify-on-flagged-metadata-change | 1 - bin/notify-on-problem-data | 1 - 4 files changed, 4 deletions(-) diff --git a/bin/notify-on-additional-info-change b/bin/notify-on-additional-info-change index c6554c65..332db612 100755 --- a/bin/notify-on-additional-info-change +++ b/bin/notify-on-additional-info-change @@ -4,7 +4,6 @@ set -euo pipefail : "${SLACK_TOKEN:?The SLACK_TOKEN environment variable is required.}" : "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" -bin="$(dirname "$0")" vendored="$(dirname "$0")"/../vendored src="${1:?A source additional info TSV file is required as the first argument.}" diff --git a/bin/notify-on-duplicate-biosample-change b/bin/notify-on-duplicate-biosample-change index 52a372b8..33e3be1a 100755 --- a/bin/notify-on-duplicate-biosample-change +++ b/bin/notify-on-duplicate-biosample-change @@ -4,7 +4,6 @@ set -euo pipefail : "${SLACK_TOKEN:?The SLACK_TOKEN environment variable is required.}" : "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" -bin="$(dirname "$0")" vendored="$(dirname "$0")"/../vendored src="${1:?A source duplicate BioSample txt file is required as the first argument.}" diff --git a/bin/notify-on-flagged-metadata-change b/bin/notify-on-flagged-metadata-change index b10f2f22..834d684c 100755 --- a/bin/notify-on-flagged-metadata-change +++ b/bin/notify-on-flagged-metadata-change @@ -4,7 +4,6 @@ set -euo pipefail : "${SLACK_TOKEN:?The SLACK_TOKEN environment variable is required.}" : "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" -bin="$(dirname "$0")" vendored="$(dirname "$0")"/../vendored src="${1:?A source flagged metadata txt file is required as the first argument.}" diff --git a/bin/notify-on-problem-data b/bin/notify-on-problem-data index e172330d..dc9d1ef9 100755 --- a/bin/notify-on-problem-data +++ b/bin/notify-on-problem-data @@ -4,7 +4,6 @@ set -euo pipefail : "${SLACK_TOKEN:?The SLACK_TOKEN environment variable is required.}" : "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" -bin="$(dirname "$0")" vendored="$(dirname "$0")"/../vendored problem_data="${1:?A problem data TSV file is required as the first argument.}"