From 2e7b6fc16bb1120d3af4894a6d6a5e57035ab45b Mon Sep 17 00:00:00 2001
From: Jennifer Chang
Date: Fri, 4 Oct 2024 14:39:21 -0700
Subject: [PATCH] Clean out scripts that are either unused or now in vendored

---
 ingest/scripts/cloudfront-invalidate         |  42 --------
 ingest/scripts/csv-to-ndjson                 |  17 ----
 ingest/scripts/download-from-s3              |  34 -------
 ingest/scripts/fasta-to-ndjson               |  86 ----------------
 ingest/scripts/fetch-from-genbank            |  22 ----
 ingest/scripts/genbank-url                   | 100 ------------------
 ingest/scripts/join-metadata-and-clades.py   |  77 --------------
 ingest/scripts/ndjson-to-tsv-and-fasta       |  67 -------------
 ingest/scripts/notify-on-diff                |  35 -------
 ingest/scripts/notify-on-job-fail            |  21 ----
 ingest/scripts/notify-on-job-start           |  24 -----
 ingest/scripts/notify-on-record-change       |  54 ----------
 ingest/scripts/notify-slack                  |  58 -----------
 ingest/scripts/reverse_reversed_sequences.py |  29 ------
 ingest/scripts/s3-object-exists              |   9 --
 ingest/scripts/sha256sum                     |  16 ---
 ingest/scripts/trigger                       |  56 -----------
 ingest/scripts/trigger-on-new-data           |  30 ------
 ingest/scripts/upload-to-s3                  |  76 --------------
 19 files changed, 853 deletions(-)
 delete mode 100755 ingest/scripts/cloudfront-invalidate
 delete mode 100755 ingest/scripts/csv-to-ndjson
 delete mode 100755 ingest/scripts/download-from-s3
 delete mode 100755 ingest/scripts/fasta-to-ndjson
 delete mode 100755 ingest/scripts/fetch-from-genbank
 delete mode 100755 ingest/scripts/genbank-url
 delete mode 100755 ingest/scripts/join-metadata-and-clades.py
 delete mode 100755 ingest/scripts/ndjson-to-tsv-and-fasta
 delete mode 100755 ingest/scripts/notify-on-diff
 delete mode 100755 ingest/scripts/notify-on-job-fail
 delete mode 100755 ingest/scripts/notify-on-job-start
 delete mode 100755 ingest/scripts/notify-on-record-change
 delete mode 100755 ingest/scripts/notify-slack
 delete mode 100755 ingest/scripts/reverse_reversed_sequences.py
 delete mode 100755 ingest/scripts/s3-object-exists
 delete mode 100755 ingest/scripts/sha256sum
 delete mode 100755 ingest/scripts/trigger
 delete mode 100755 ingest/scripts/trigger-on-new-data
 delete mode 100755 ingest/scripts/upload-to-s3

diff --git a/ingest/scripts/cloudfront-invalidate b/ingest/scripts/cloudfront-invalidate
deleted file mode 100755
index dec4852..0000000
--- a/ingest/scripts/cloudfront-invalidate
+++ /dev/null
@@ -1,42 +0,0 @@
-#!/bin/bash
-# Originally from @tsibley's gist: https://gist.github.com/tsibley/a66262d341dedbea39b02f27e2837ea8
-set -euo pipefail
-
-main() {
-    local domain="$1"
-    shift
-    local paths=("$@")
-    local distribution invalidation
-
-    echo "-> Finding CloudFront distribution"
-    distribution=$(
-        aws cloudfront list-distributions \
-            --query "DistributionList.Items[?contains(Aliases.Items, \`$domain\`)] | [0].Id" \
-            --output text
-    )
-
-    if [[ -z $distribution || $distribution == None ]]; then
-        exec >&2
-        echo "Unable to find CloudFront distribution id for $domain"
-        echo
-        echo "Are your AWS CLI credentials for the right account?"
-        exit 1
-    fi
-
-    echo "-> Creating CloudFront invalidation for distribution $distribution"
-    invalidation=$(
-        aws cloudfront create-invalidation \
-            --distribution-id "$distribution" \
-            --paths "${paths[@]}" \
-            --query Invalidation.Id \
-            --output text
-    )
-
-    echo "-> Waiting for CloudFront invalidation $invalidation to complete"
-    echo "   Ctrl-C to stop waiting."
-    aws cloudfront wait invalidation-completed \
-        --distribution-id "$distribution" \
-        --id "$invalidation"
-}
-
-main "$@"
diff --git a/ingest/scripts/csv-to-ndjson b/ingest/scripts/csv-to-ndjson
deleted file mode 100755
index 86e8412..0000000
--- a/ingest/scripts/csv-to-ndjson
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/usr/bin/env python3
-"""
-Copied from "bin/csv-to-ndjson" in nextstrain/ncov-ingest:
-https://github.com/nextstrain/ncov-ingest/blob/2a5f255329ee5bdf0cabc8b8827a700c92becbe4/bin/csv-to-ndjson
-
-Convert CSV on stdin to NDJSON on stdout.
-"""
-import csv
-import json
-from sys import stdin, stdout
-
-# 200 MiB; default is 128 KiB
-csv.field_size_limit(200 * 1024 * 1024)
-
-for row in csv.DictReader(stdin):
-    json.dump(row, stdout, allow_nan = False, indent = None, separators = ',:')
-    print()
diff --git a/ingest/scripts/download-from-s3 b/ingest/scripts/download-from-s3
deleted file mode 100755
index c9dbab5..0000000
--- a/ingest/scripts/download-from-s3
+++ /dev/null
@@ -1,34 +0,0 @@
-#!/bin/bash
-# Originally copied from nextstrain/ncov-ingest repo
-set -euo pipefail
-
-bin="$(dirname "$0")"
-
-main() {
-    local src="${1:?A source s3:// URL is required as the first argument.}"
-    local dst="${2:?A destination file path is required as the second argument.}"
-
-    local s3path="${src#s3://}"
-    local bucket="${s3path%%/*}"
-    local key="${s3path#*/}"
-
-    local src_hash dst_hash no_hash=0000000000000000000000000000000000000000000000000000000000000000
-    dst_hash="$("$bin/sha256sum" < "$dst" || true)"
-    src_hash="$(aws s3api head-object --bucket "$bucket" --key "$key" --query Metadata.sha256sum --output text 2>/dev/null || echo "$no_hash")"
-
-    echo "[ INFO] Downloading $src → $dst"
-    if [[ $src_hash != "$dst_hash" ]]; then
-        aws s3 cp --no-progress "$src" - |
-            if [[ "$src" == *.gz ]]; then
-                gunzip -cfq
-            elif [[ "$src" == *.xz ]]; then
-                xz -T0 -dcq
-            else
-                cat
-            fi > "$dst"
-    else
-        echo "[ INFO] Files are identical, skipping download"
-    fi
-}
-
-main "$@"
diff --git a/ingest/scripts/fasta-to-ndjson b/ingest/scripts/fasta-to-ndjson
deleted file mode 100755
index 1ee9f8f..0000000
--- a/ingest/scripts/fasta-to-ndjson
+++ /dev/null
@@ -1,86 +0,0 @@
-#!/usr/bin/env python3
-"""
-Parse delimited fields from FASTA header into NDJSON format to stdout.
-The output NDJSON records are guaranteed to have at least two fields:
-  1. strain
-  2. sequence
-
-Uses the `augur.io.read_sequences` function to read the FASTA file,
-so `augur` must be installed in the environment running the script.
-"""
-
-import argparse
-import json
-import sys
-
-from augur.io import read_sequences
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(
-        description=__doc__,
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter
-    )
-    parser.add_argument("--fasta", required=True,
-        help="FASTA file to be transformed into NDJSON format")
-    parser.add_argument("--fields", nargs="+",
-        help="Fields in the FASTA header, listed in the same order as the header. " +
-             "These will be used as the keys in the final NDJSON output. " +
-             "One of the fields must be 'strain'. " +
-             "These cannot include the field 'sequence' as this field is reserved for the genomic sequence.")
-    parser.add_argument("--separator", default='|',
-        help="Field separator in the FASTA header")
-    parser.add_argument("--exclude", nargs="*",
-        help="List of fields to exclude from final NDJSON record. "
" - "These cannot include 'strain' or 'sequence'.") - - args = parser.parse_args() - - fasta_fields = [field.lower() for field in args.fields] - - exclude_fields = [] - if args.exclude: - exclude_fields = [field.lower() for field in args.exclude] - - passed_checks = True - - if 'strain' not in fasta_fields: - print("ERROR: FASTA fields must include a 'strain' field.", file=sys.stderr) - passed_checks = False - - if 'sequence' in fasta_fields: - print("ERROR: FASTA fields cannot include a 'sequence' field.", file=sys.stderr) - passed_checks = False - - if 'strain' in exclude_fields: - print("ERROR: The field 'strain' cannot be excluded from the output.", file=sys.stderr) - passed_checks = False - - if 'sequence' in exclude_fields: - print("ERROR: The field 'sequence' cannot be excluded from the output.", file=sys.stderr) - passed_checks = False - - missing_fields = [field for field in exclude_fields if field not in fasta_fields] - if missing_fields: - print(f"ERROR: The following exclude fields do not match any FASTA fields: {missing_fields}", file=sys.stderr) - passed_checks = False - - if not passed_checks: - print("ERROR: Failed to parse FASTA file into NDJSON records.","See detailed errors above.", file=sys.stderr) - sys.exit(1) - - sequences = read_sequences(args.fasta) - - for sequence in sequences: - field_values = [ - value.strip() - for value in sequence.description.split(args.separator) - ] - record = dict(zip(fasta_fields, field_values)) - record['sequence'] = str(sequence.seq).upper() - - for field in exclude_fields: - del record[field] - - json.dump(record, sys.stdout, allow_nan=False, indent=None, separators=',:') - print() diff --git a/ingest/scripts/fetch-from-genbank b/ingest/scripts/fetch-from-genbank deleted file mode 100755 index f66715a..0000000 --- a/ingest/scripts/fetch-from-genbank +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash -# -# Originally copied from "bin/fetch-from-genbank" in nextstrain/ncov-ingest: -# https://github.com/nextstrain/ncov-ingest/blob/2a5f255329ee5bdf0cabc8b8827a700c92becbe4/bin/fetch-from-genbank -# -set -euo pipefail - -bin="$(dirname "$0")" - - -main() { - local ncbi_taxon_id="${1:?NCBI taxon id is required.}" - fetch "$ncbi_taxon_id" | "$bin"/csv-to-ndjson -} - -fetch() { - curl "$("$bin"/genbank-url --ncbi-taxon-id "$1")" \ - --fail --silent --show-error --http1.1 \ - --header 'User-Agent: https://github.com/nextstrain/dengue (hello@nextstrain.org)' -} - -main "$@" diff --git a/ingest/scripts/genbank-url b/ingest/scripts/genbank-url deleted file mode 100755 index 20120be..0000000 --- a/ingest/scripts/genbank-url +++ /dev/null @@ -1,100 +0,0 @@ -#!/usr/bin/env python3 -""" -Generate URL to download all Pathogen sequences and their curated metadata -from GenBank via NCBI Virus. - -The URL this program builds is based on the URL for SARS-CoV-2 constructed with - - https://github.com/nextstrain/ncov-ingest/blob/2a5f255329ee5bdf0cabc8b8827a700c92becbe4/bin/genbank-url - -and observing the network activity at - - https://www.ncbi.nlm.nih.gov/labs/virus/vssi/#/virus?SeqType_s=Nucleotide&VirusLineage_ss=Zika%20virus,%20taxid:64320 -""" -from urllib.parse import urlencode -import argparse -from datetime import date - -def parse_args(): - parser = argparse.ArgumentParser( - description="Given an NCBI taxon ID, generate URL to download " - "all viral sequences and their curated metadata from GenBank via NCBI Virus." 
-    )
-    parser.add_argument(
-        "--ncbi-taxon-id",
-        help="NCBI Taxon ID.",
-        default="11082",
-        required=True
-    )
-    return parser.parse_args()
-
-def build_query_url(ncbi_taxon_id: str):
-    """
-    Generate URL to download all viral sequences and their curated metadata
-    from GenBank via NCBI Virus.
-    """
-    today = str(date.today().strftime("%Y-%m-%d"))
-    endpoint = "https://www.ncbi.nlm.nih.gov/genomes/VirusVariation/vvsearch2/"
-    params = {
-        # Search criteria
-        'fq': [
-            '{!tag=SeqType_s}SeqType_s:("Nucleotide")', # Nucleotide sequences (as opposed to protein)
-            f'VirusLineageId_ss:({ncbi_taxon_id})', # NCBI Taxon id for virus
-            'Division_s:("VRL")', # Restrict to viral sequences, avoid PAT
-            '{!tag=SLen_i}SLen_i:([5000 TO 15000])', # Longer than 5K bp, shorter than 15k bp
-            #'{!tag=CollectionDate_s}CollectionDate_s:([2013-01-01T00:00:00.00Z TO ' + today + 'T00:00:00.00Z ])', # If needed to restrict time period modify this line. Currently after 2013
-            #'{!tag=UpdateDate_dt}UpdateDate_dt:([2022-01-01T00:00:00.00Z TO 2022-12-01T00:00:00.00Z ])', # Activate if only pulling recently modified or added data
-        ],
-
-        # Unclear, but seems necessary.
-        'q': '*:*',
-
-        # Response format
-        'cmd': 'download',
-        'dlfmt': 'csv',
-        'fl': ','.join(
-            ':'.join(names) for names in [
-                # Pairs of (output column name, source data field).
-                ('genbank_accession', 'id'),
-                ('genbank_accession_rev', 'AccVer_s'),
-                ('database', 'SourceDB_s'),
-                ('strain', 'Isolate_s'),
-                ('strain_s', 'Strain_s'), # Derive strain name if Isolate_s is empty
-                ('viruslineage_ids', 'VirusLineageId_ss'), # Allows derivation of subtypes based on lineage ID
-                ('region', 'Region_s'),
-                ('location', 'CountryFull_s'),
-                ('collected', 'CollectionDate_s'),
-                ('submitted', 'CreateDate_dt'),
-                ('updated', 'UpdateDate_dt'), # Check for recent updates to the record
-                ('length', 'SLen_i'),
-                ('host', 'Host_s'),
-                ('isolation_source', 'Isolation_csv'),
-                ('bioproject_accession', 'BioProject_s'),
-                ('biosample_accession', 'BioSample_s'),
-                ('sra_accession', 'SRALink_csv'),
-                ('title', 'Definition_s'), # Should be PubMed title, not Definition
-                ('authors', 'Authors_csv'),
-                ('submitting_organization', 'SubmitterAffilFull_s'),
-                ('publications', 'PubMed_csv'),
-                ('sequence', 'Nucleotide_seq'),
-            ]
-        ),
-
-        # Stable sort with GenBank accessions.
-        # Columns are source data fields, not our output columns.
-        'sort': 'id asc',
-
-        # This isn't Entrez, but include the same email parameter it requires just
-        # to be nice.
-        'email': 'hello@nextstrain.org',
-    }
-    query = urlencode(params, doseq = True, encoding = "utf-8")
-
-    print(f"{endpoint}?{query}")
-
-def main():
-    args = parse_args()
-    build_query_url(args.ncbi_taxon_id)
-
-if __name__ == '__main__':
-    main()
diff --git a/ingest/scripts/join-metadata-and-clades.py b/ingest/scripts/join-metadata-and-clades.py
deleted file mode 100755
index 99ed732..0000000
--- a/ingest/scripts/join-metadata-and-clades.py
+++ /dev/null
@@ -1,77 +0,0 @@
-#!/usr/bin/env python3
-import argparse
-import re
-import sys
-import pandas as pd
-
-NEXTCLADE_JOIN_COLUMN_NAME = 'seqName'
-VALUE_MISSING_DATA = '?'
-
-column_map = {
-    "clade": "clade",
-    "outbreak": "outbreak",
-    "lineage": "lineage",
-    "coverage": "coverage",
-    "totalMissing": "missing_data",
-    "totalSubstitutions": "divergence",
-    "totalNonACGTNs": "nonACGTN",
-    "qc.missingData.status": "QC_missing_data",
-    "qc.mixedSites.status": "QC_mixed_sites",
-    "qc.privateMutations.status": "QC_rare_mutations",
-    "qc.frameShifts.status": "QC_frame_shifts",
-    "qc.stopCodons.status": "QC_stop_codons",
-    "frameShifts": "frame_shifts",
-    "isReverseComplement": "is_reverse_complement",
-#    "deletions": "deletions",
-#    "insertions": "insertions"
-#    "substitutions": "substitutions",
-#    "aaSubstitutions": "aaSubstitutions"
-}
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(
-        description="Joins metadata file with Nextclade clade output",
-    )
-    parser.add_argument("--metadata")
-    parser.add_argument("--nextclade")
-    parser.add_argument("--id-field")
-    parser.add_argument("-o", default=sys.stdout)
-    return parser.parse_args()
-
-def main():
-    args = parse_args()
-
-    metadata = pd.read_csv(args.metadata, index_col=args.id_field,
-                           sep='\t', low_memory=False, na_filter = False)
-
-    # Read and rename clade column to be more descriptive
-    clades = pd.read_csv(args.nextclade, index_col=NEXTCLADE_JOIN_COLUMN_NAME,
-                         sep='\t', low_memory=False, na_filter = False) \
-        .rename(columns=column_map)
-
-    clades.index = clades.index.map(lambda x: re.sub(" \|.*", "", x))
-
-    # Select columns in column map
-    clades = clades[list(column_map.values())]
-
-    # Separate long from short columns
-    short_metadata = metadata.iloc[:,:-2].copy()
-    long_metadata = metadata.iloc[:,-2:].copy()
-
-    # Concatenate on columns
-    result = pd.merge(
-        short_metadata, clades,
-        left_index=True,
-        right_index=True,
-        how='left'
-    )
-
-    # Add long columns to back
-    result = pd.concat([result, long_metadata], axis=1)
-
-    result.to_csv(args.o, index_label=args.id_field, sep='\t')
-
-
-if __name__ == '__main__':
-    main()
diff --git a/ingest/scripts/ndjson-to-tsv-and-fasta b/ingest/scripts/ndjson-to-tsv-and-fasta
deleted file mode 100755
index d9d7331..0000000
--- a/ingest/scripts/ndjson-to-tsv-and-fasta
+++ /dev/null
@@ -1,67 +0,0 @@
-#!/usr/bin/env python3
-"""
-Parses NDJSON records from stdin to two different files: a metadata TSV and a
-sequences FASTA.
-
-Records that do not have an ID or sequence will be excluded from the output files.
-"""
-import argparse
-import csv
-import json
-from sys import stderr, stdin
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(
-        description=__doc__,
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter
-    )
-    parser.add_argument("--metadata", metavar="TSV", default="data/metadata.tsv",
-        help="The output metadata TSV file")
-    parser.add_argument("--fasta", metavar="FASTA", default="data/sequences.fasta",
-        help="The output sequences FASTA file")
-    parser.add_argument("--metadata-columns", nargs="+",
-        help="List of fields from the NDJSON records to include as columns in the metadata TSV. " +
" + - "Metadata TSV columns will be in the order of the columns provided.") - parser.add_argument("--id-field", default='strain', - help="Field from the records to use as the sequence ID in the FASTA file.") - parser.add_argument("--sequence-field", default='sequence', - help="Field from the record that holds the genomic sequence for the FASTA file.") - - args = parser.parse_args() - - with open(args.metadata, 'wt') as metadata_output: - with open(args.fasta, 'wt') as fasta_output: - metadata_csv = csv.DictWriter( - metadata_output, - args.metadata_columns, - restval="", - extrasaction='ignore', - delimiter='\t', - lineterminator='\n', - ) - metadata_csv.writeheader() - - for index, record in enumerate(stdin): - record = json.loads(record) - - sequence_id = str(record.get(args.id_field, '')) - sequence = str(record.get(args.sequence_field, '')) - - if not sequence_id: - print( - f"WARNING: Record number {index} does not have a sequence ID.", - "This record will be excluded from the output files.", - file=stderr - ) - elif not sequence: - print( - f"WARNING: Record number {index} does not have a sequence.", - "This record will be excluded from the output files.", - file=stderr - ) - else: - metadata_csv.writerow(record) - - print(f">{sequence_id}", file=fasta_output) - print(f"{sequence}" , file= fasta_output) diff --git a/ingest/scripts/notify-on-diff b/ingest/scripts/notify-on-diff deleted file mode 100755 index c304d6b..0000000 --- a/ingest/scripts/notify-on-diff +++ /dev/null @@ -1,35 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -: "${SLACK_TOKEN:?The SLACK_TOKEN environment variable is required.}" -: "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" - -bin="$(dirname "$0")" - -src="${1:?A source file is required as the first argument.}" -dst="${2:?A destination s3:// URL is required as the second argument.}" - -dst_local="$(mktemp -t s3-file-XXXXXX)" -diff="$(mktemp -t diff-XXXXXX)" - -trap "rm -f '$dst_local' '$diff'" EXIT - -# if the file is not already present, just exit -"$bin"/s3-object-exists "$dst" || exit 0 - -"$bin"/download-from-s3 "$dst" "$dst_local" - -# diff's exit code is 0 for no differences, 1 for differences found, and >1 for errors -diff_exit_code=0 -diff "$dst_local" "$src" > "$diff" || diff_exit_code=$? - -if [[ "$diff_exit_code" -eq 1 ]]; then - echo "Notifying Slack about diff." - "$bin"/notify-slack --upload "$src.diff" < "$diff" -elif [[ "$diff_exit_code" -gt 1 ]]; then - echo "Notifying Slack about diff failure" - "$bin"/notify-slack "Diff failed for $src" -else - echo "No change in $src." -fi diff --git a/ingest/scripts/notify-on-job-fail b/ingest/scripts/notify-on-job-fail deleted file mode 100755 index 23d3a92..0000000 --- a/ingest/scripts/notify-on-job-fail +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash -set -euo pipefail - -: "${SLACK_TOKEN:?The SLACK_TOKEN environment variable is required.}" -: "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" - -: "${AWS_BATCH_JOB_ID:=}" -: "${GITHUB_RUN_ID:=}" - -bin="$(dirname "$0")" - -echo "Notifying Slack about failed ingest job." -message="❌ Ingest job has FAILED 😞 " - -if [ -n "${AWS_BATCH_JOB_ID}" ]; then - message+="See AWS Batch job \`${AWS_BATCH_JOB_ID}\` () for error details. " -elif [ -n "${GITHUB_RUN_ID}" ]; then - message+="See GitHub Action for error details. 
" -fi - -"$bin"/notify-slack "$message" diff --git a/ingest/scripts/notify-on-job-start b/ingest/scripts/notify-on-job-start deleted file mode 100755 index 9410fa3..0000000 --- a/ingest/scripts/notify-on-job-start +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/bash -set -euo pipefail - -: "${SLACK_TOKEN:?The SLACK_TOKEN environment variable is required.}" -: "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" - -: "${AWS_BATCH_JOB_ID:=}" -: "${GITHUB_RUN_ID:=}" - -bin="$(dirname "$0")" - -echo "Notifying Slack about started ingest job." -message="🐵 Monkeypox ingest job has started." - -if [[ -n "${GITHUB_RUN_ID}" ]]; then - message+=" The job was submitted by GitHub Action ." -fi - -if [[ -n "${AWS_BATCH_JOB_ID}" ]]; then - message+=" The job was launched as AWS Batch job \`${AWS_BATCH_JOB_ID}\` ()." - message+=" Follow along in your local \`monkeypox\` repo with: "'```'"nextstrain build --aws-batch --no-download --attach ${AWS_BATCH_JOB_ID} ingest/"'```' -fi - -"$bin"/notify-slack "$message" diff --git a/ingest/scripts/notify-on-record-change b/ingest/scripts/notify-on-record-change deleted file mode 100755 index 595835b..0000000 --- a/ingest/scripts/notify-on-record-change +++ /dev/null @@ -1,54 +0,0 @@ -#!/bin/bash -# Originally copied from nextstrain/ncov-ingest -set -euo pipefail - -: "${SLACK_TOKEN:?The SLACK_TOKEN environment variable is required.}" -: "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" - -bin="$(dirname "$0")" - -src="${1:?A source ndjson file is required as the first argument.}" -dst="${2:?A destination ndjson s3:// URL is required as the second argument.}" -source_name=${3:?A record source name is required as the third argument.} - -# if the file is not already present, just exit -"$bin"/s3-object-exists "$dst" || exit 0 - -s3path="${dst#s3://}" -bucket="${s3path%%/*}" -key="${s3path#*/}" - -src_record_count="$(wc -l < "$src")" - -# Try getting record count from S3 object metadata -dst_record_count="$(aws s3api head-object --bucket "$bucket" --key "$key" --query "Metadata.recordcount || ''" --output text 2>/dev/null || true)" -if [[ -z "$dst_record_count" ]]; then - # This object doesn't have the record count stored as metadata - # We have to download it and count the lines locally - dst_record_count="$(wc -l < <(aws s3 cp --no-progress "$dst" - | xz -T0 -dcfq))" -fi - -added_records="$(( src_record_count - dst_record_count ))" - -printf "%'4d %s\n" "$src_record_count" "$src" -printf "%'4d %s\n" "$dst_record_count" "$dst" -printf "%'4d added records\n" "$added_records" - -slack_message="" - -if [[ $added_records -gt 0 ]]; then - echo "Notifying Slack about added records (n=$added_records)" - slack_message="📈 New monkeypox records (n=$added_records) found on $source_name." - -elif [[ $added_records -lt 0 ]]; then - echo "Notifying Slack about fewer records (n=$added_records)" - slack_message="📉 Fewer monkeypox records (n=$added_records) found on $source_name." - -else - echo "Notifying Slack about same number of records" - slack_message="⛔ No new monkeypox records found on $source_name." 
-fi
-
-slack_message+=" (Total record count: $src_record_count)"
-
-"$bin"/notify-slack "$slack_message"
diff --git a/ingest/scripts/notify-slack b/ingest/scripts/notify-slack
deleted file mode 100755
index 6ca20de..0000000
--- a/ingest/scripts/notify-slack
+++ /dev/null
@@ -1,58 +0,0 @@
-#!/bin/bash
-# Originally copied from nextstrain/ncov-ingest repo
-set -euo pipefail
-
-: "${SLACK_TOKEN:?The SLACK_TOKEN environment variable is required.}"
-: "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}"
-
-upload=0
-output=/dev/null
-thread_ts=""
-broadcast=0
-args=()
-
-for arg; do
-    case "$arg" in
-        --upload)
-            upload=1;;
-        --output=*)
-            output="${arg#*=}";;
-        --thread-ts=*)
-            thread_ts="${arg#*=}";;
-        --broadcast)
-            broadcast=1;;
-        *)
-            args+=("$arg");;
-    esac
-done
-
-set -- "${args[@]}"
-
-text="${1:?Some message text is required.}"
-
-if [[ "$upload" == 1 ]]; then
-    echo "Uploading data to Slack with the message: $text"
-    curl https://slack.com/api/files.upload \
-        --header "Authorization: Bearer $SLACK_TOKEN" \
-        --form-string channels="$SLACK_CHANNELS" \
-        --form-string title="$text" \
-        --form-string filename="$text" \
-        --form-string thread_ts="$thread_ts" \
-        --form-string reply_broadcast="$broadcast" \
-        --form file=@/dev/stdin \
-        --form filetype=text \
-        --fail --silent --show-error \
-        --http1.1 \
-        --output "$output"
-else
-    echo "Posting Slack message: $text"
-    curl https://slack.com/api/chat.postMessage \
-        --header "Authorization: Bearer $SLACK_TOKEN" \
-        --form-string channel="$SLACK_CHANNELS" \
-        --form-string text="$text" \
-        --form-string thread_ts="$thread_ts" \
-        --form-string reply_broadcast="$broadcast" \
-        --fail --silent --show-error \
-        --http1.1 \
-        --output "$output"
-fi
diff --git a/ingest/scripts/reverse_reversed_sequences.py b/ingest/scripts/reverse_reversed_sequences.py
deleted file mode 100755
index 1ee9be2..0000000
--- a/ingest/scripts/reverse_reversed_sequences.py
+++ /dev/null
@@ -1,29 +0,0 @@
-import pandas as pd
-import argparse
-from Bio import SeqIO
-
-if __name__=="__main__":
-    parser = argparse.ArgumentParser(
-        description="Reverse-complement reverse-complemented sequence",
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter
-    )
-
-    parser.add_argument('--metadata', type=str, required=True, help="input metadata")
-    parser.add_argument('--sequences', type=str, required=True, help="input sequences")
-    parser.add_argument('--output', type=str, required=True, help="output sequences")
-    args = parser.parse_args()
-
-    metadata = pd.read_csv(args.metadata, sep='\t')
-
-    # Read in fasta file
-    with open(args.sequences, 'r') as f_in:
-        with open(args.output, 'w') as f_out:
-            for seq in SeqIO.parse(f_in, 'fasta'):
-                # Check if metadata['reverse'] is True
-                if metadata.loc[metadata['accession'] == seq.id, 'reverse'].values[0] == True:
-                    # Reverse-complement sequence
-                    seq.seq = seq.seq.reverse_complement()
-                    print("Reverse-complementing sequence:", seq.id)
-
-                # Write sequences to file
-                SeqIO.write(seq, f_out, 'fasta')
diff --git a/ingest/scripts/s3-object-exists b/ingest/scripts/s3-object-exists
deleted file mode 100755
index d586d0b..0000000
--- a/ingest/scripts/s3-object-exists
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/bin/bash
-# Originally copied from nextstrain/ncov-ingest
-set -euo pipefail
-
-url="${1#s3://}"
-bucket="${url%%/*}"
-key="${url#*/}"
-
-aws s3api head-object --bucket "$bucket" --key "$key" &>/dev/null
diff --git a/ingest/scripts/sha256sum b/ingest/scripts/sha256sum
deleted file mode 100755
index aa05af0..0000000
--- a/ingest/scripts/sha256sum
+++ /dev/null
@@ -1,16 +0,0 @@
-#!/usr/bin/env python3
-# Originally copied from nextstrain/ncov-ingest repo
-"""
-Portable sha256sum utility.
-"""
-from hashlib import sha256
-from sys import stdin
-
-chunk_size = 5 * 1024**2 # 5 MiB
-
-h = sha256()
-
-for chunk in iter(lambda: stdin.buffer.read(chunk_size), b""):
-    h.update(chunk)
-
-print(h.hexdigest())
diff --git a/ingest/scripts/trigger b/ingest/scripts/trigger
deleted file mode 100755
index d40553b..0000000
--- a/ingest/scripts/trigger
+++ /dev/null
@@ -1,56 +0,0 @@
-#!/bin/bash
-set -euo pipefail
-
-: "${PAT_GITHUB_DISPATCH:=}"
-
-repo="${1:?A repository name is required as the first argument.}"
-event_type="${2:?An event type is required as the second argument.}"
-shift 2
-
-if [[ $# -eq 0 && -z $PAT_GITHUB_DISPATCH ]]; then
-    cat >&2 <<.
-You must specify options to curl for your GitHub credentials. For example, you
-can specify your GitHub username, and will be prompted for your password:
-
-    $0 $repo $event_type --user <your-username>
-
-Be sure to enter a personal access token¹ as your password since GitHub has
-discontinued password authentication to the API starting on November 13, 2020².
-
-You can also store your credentials or a personal access token in a netrc
-file³:
-
-    machine api.github.com
-    login <your-username>
-    password <your-token>
-
-and then tell curl to use it:
-
-    $0 $repo $event_type --netrc
-
-which will then not require you to type your password every time.
-
-¹ https://help.github.com/en/github/authenticating-to-github/creating-a-personal-access-token-for-the-command-line
-² https://docs.github.com/en/rest/overview/other-authentication-methods#via-username-and-password
-³ https://ec.haxx.se/usingcurl/usingcurl-netrc
-.
-    exit 1
-fi
-
-auth=':'
-if [[ -n $PAT_GITHUB_DISPATCH ]]; then
-    auth="Authorization: Bearer ${PAT_GITHUB_DISPATCH}"
-fi
-
-if curl -fsS "https://api.github.com/repos/nextstrain/${repo}/dispatches" \
-    -H 'Accept: application/vnd.github.v3+json' \
-    -H 'Content-Type: application/json' \
-    -H "$auth" \
-    -d '{"event_type":"'"$event_type"'"}' \
-    "$@"
-then
-    echo "Successfully triggered $event_type"
-else
-    echo "Request failed" >&2
-    exit 1
-fi
diff --git a/ingest/scripts/trigger-on-new-data b/ingest/scripts/trigger-on-new-data
deleted file mode 100755
index 760a018..0000000
--- a/ingest/scripts/trigger-on-new-data
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/bin/bash
-set -euo pipefail
-
-: "${PAT_GITHUB_DISPATCH:?The PAT_GITHUB_DISPATCH environment variable is required.}"
-
-bin="$(dirname "$0")"
-
-metadata="${1:?A metadata upload output file is required as the first argument.}"
-sequences="${2:?A sequence FASTA upload output file is required as the second argument.}"
-identical_file_message="${3:-files are identical}"
-
-new_metadata=$(grep "$identical_file_message" "$metadata" >/dev/null; echo $?)
-new_sequences=$(grep "$identical_file_message" "$sequences" >/dev/null; echo $?)
-
-slack_message=""
-
-# grep exit status 0 for found match, 1 for no match, 2 if an error occurred
-if [[ $new_metadata -eq 1 || $new_sequences -eq 1 ]]; then
-    slack_message="Triggering new builds due to updated metadata and/or sequences"
-    "$bin"/trigger "monkeypox" "rebuild"
-elif [[ $new_metadata -eq 0 && $new_sequences -eq 0 ]]; then
-    slack_message="Skipping trigger of rebuild: Both metadata TSV and sequences FASTA are identical to S3 files."
-else
-    slack_message="Skipping trigger of rebuild: Unable to determine if data has been updated."
-fi
-
-
"$bin"/notify-slack "$slack_message"; then - echo "Notifying Slack failed, but exiting with success anyway." -fi diff --git a/ingest/scripts/upload-to-s3 b/ingest/scripts/upload-to-s3 deleted file mode 100755 index b993c3d..0000000 --- a/ingest/scripts/upload-to-s3 +++ /dev/null @@ -1,76 +0,0 @@ -#!/bin/bash -# Originally copied from nextstrain/ncov-ingest repo -set -euo pipefail - -bin="$(dirname "$0")" - -main() { - local quiet=0 - - for arg; do - case "$arg" in - --quiet) - quiet=1 - shift;; - *) - break;; - esac - done - - local src="${1:?A source file is required as the first argument.}" - local dst="${2:?A destination s3:// URL is required as the second argument.}" - local cloudfront_domain="${3:-}" - - local s3path="${dst#s3://}" - local bucket="${s3path%%/*}" - local key="${s3path#*/}" - - local src_hash dst_hash no_hash=0000000000000000000000000000000000000000000000000000000000000000 - src_hash="$("$bin/sha256sum" < "$src")" - dst_hash="$(aws s3api head-object --bucket "$bucket" --key "$key" --query Metadata.sha256sum --output text 2>/dev/null || echo "$no_hash")" - - if [[ $src_hash != "$dst_hash" ]]; then - # The record count may have changed - src_record_count="$(wc -l < "$src")" - - echo "Uploading $src → $dst" - if [[ "$dst" == *.gz ]]; then - gzip -c "$src" - elif [[ "$dst" == *.xz ]]; then - xz -2 -T0 -c "$src" - else - cat "$src" - fi | aws s3 cp --no-progress - "$dst" --metadata sha256sum="$src_hash",recordcount="$src_record_count" "$(content-type "$dst")" - - if [[ -n $cloudfront_domain ]]; then - echo "Creating CloudFront invalidation for $cloudfront_domain/$key" - if ! "$bin"/cloudfront-invalidate "$cloudfront_domain" "/$key"; then - echo "CloudFront invalidation failed, but exiting with success anyway." - fi - fi - - if [[ $quiet == 1 ]]; then - echo "Quiet mode. No Slack notification sent." - exit 0 - fi - - if ! "$bin"/notify-slack "Updated $dst available."; then - echo "Notifying Slack failed, but exiting with success anyway." - fi - else - echo "Uploading $src → $dst: files are identical, skipping upload" - fi -} - -content-type() { - case "$1" in - *.tsv) echo --content-type=text/tab-separated-values;; - *.csv) echo --content-type=text/comma-separated-values;; - *.ndjson) echo --content-type=application/x-ndjson;; - *.gz) echo --content-type=application/gzip;; - *.xz) echo --content-type=application/x-xz;; - *) echo --content-type=text/plain;; - esac -} - -main "$@"