From 2e7b6fc16bb1120d3af4894a6d6a5e57035ab45b Mon Sep 17 00:00:00 2001
From: Jennifer Chang
Date: Fri, 4 Oct 2024 14:39:21 -0700
Subject: [PATCH] Clean out scripts that are either unused or now in vendored

---
 ingest/scripts/cloudfront-invalidate         |  42 --------
 ingest/scripts/csv-to-ndjson                 |  17 ----
 ingest/scripts/download-from-s3              |  34 -------
 ingest/scripts/fasta-to-ndjson               |  86 ----------------
 ingest/scripts/fetch-from-genbank            |  22 ----
 ingest/scripts/genbank-url                   | 100 ------------------
 ingest/scripts/join-metadata-and-clades.py   |  77 --------------
 ingest/scripts/ndjson-to-tsv-and-fasta       |  67 -------------
 ingest/scripts/notify-on-diff                |  35 -------
 ingest/scripts/notify-on-job-fail            |  21 ----
 ingest/scripts/notify-on-job-start           |  24 -----
 ingest/scripts/notify-on-record-change       |  54 ----------
 ingest/scripts/notify-slack                  |  58 -----------
 ingest/scripts/reverse_reversed_sequences.py |  29 ------
 ingest/scripts/s3-object-exists              |   9 --
 ingest/scripts/sha256sum                     |  16 ---
 ingest/scripts/trigger                       |  56 -----------
 ingest/scripts/trigger-on-new-data           |  30 ------
 ingest/scripts/upload-to-s3                  |  76 --------------
 19 files changed, 853 deletions(-)
 delete mode 100755 ingest/scripts/cloudfront-invalidate
 delete mode 100755 ingest/scripts/csv-to-ndjson
 delete mode 100755 ingest/scripts/download-from-s3
 delete mode 100755 ingest/scripts/fasta-to-ndjson
 delete mode 100755 ingest/scripts/fetch-from-genbank
 delete mode 100755 ingest/scripts/genbank-url
 delete mode 100755 ingest/scripts/join-metadata-and-clades.py
 delete mode 100755 ingest/scripts/ndjson-to-tsv-and-fasta
 delete mode 100755 ingest/scripts/notify-on-diff
 delete mode 100755 ingest/scripts/notify-on-job-fail
 delete mode 100755 ingest/scripts/notify-on-job-start
 delete mode 100755 ingest/scripts/notify-on-record-change
 delete mode 100755 ingest/scripts/notify-slack
 delete mode 100755 ingest/scripts/reverse_reversed_sequences.py
 delete mode 100755 ingest/scripts/s3-object-exists
 delete mode 100755 ingest/scripts/sha256sum
 delete mode 100755 ingest/scripts/trigger
 delete mode 100755 ingest/scripts/trigger-on-new-data
 delete mode 100755 ingest/scripts/upload-to-s3

diff --git a/ingest/scripts/cloudfront-invalidate b/ingest/scripts/cloudfront-invalidate
deleted file mode 100755
index dec4852..0000000
--- a/ingest/scripts/cloudfront-invalidate
+++ /dev/null
@@ -1,42 +0,0 @@
-#!/bin/bash
-# Originally from @tsibley's gist: https://gist.github.com/tsibley/a66262d341dedbea39b02f27e2837ea8
-set -euo pipefail
-
-main() {
-    local domain="$1"
-    shift
-    local paths=("$@")
-    local distribution invalidation
-
-    echo "-> Finding CloudFront distribution"
-    distribution=$(
-        aws cloudfront list-distributions \
-            --query "DistributionList.Items[?contains(Aliases.Items, \`$domain\`)] | [0].Id" \
-            --output text
-    )
-
-    if [[ -z $distribution || $distribution == None ]]; then
-        exec >&2
-        echo "Unable to find CloudFront distribution id for $domain"
-        echo
-        echo "Are your AWS CLI credentials for the right account?"
-        exit 1
-    fi
-
-    echo "-> Creating CloudFront invalidation for distribution $distribution"
-    invalidation=$(
-        aws cloudfront create-invalidation \
-            --distribution-id "$distribution" \
-            --paths "${paths[@]}" \
-            --query Invalidation.Id \
-            --output text
-    )
-
-    echo "-> Waiting for CloudFront invalidation $invalidation to complete"
-    echo "   Ctrl-C to stop waiting."
-    aws cloudfront wait invalidation-completed \
-        --distribution-id "$distribution" \
-        --id "$invalidation"
-}
-
-main "$@"
diff --git a/ingest/scripts/csv-to-ndjson b/ingest/scripts/csv-to-ndjson
deleted file mode 100755
index 86e8412..0000000
--- a/ingest/scripts/csv-to-ndjson
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/usr/bin/env python3
-"""
-Copied from "bin/csv-to-ndjson" in nextstrain/ncov-ingest:
-https://github.com/nextstrain/ncov-ingest/blob/2a5f255329ee5bdf0cabc8b8827a700c92becbe4/bin/csv-to-ndjson
-
-Convert CSV on stdin to NDJSON on stdout.
-"""
-import csv
-import json
-from sys import stdin, stdout
-
-# 200 MiB; default is 128 KiB
-csv.field_size_limit(200 * 1024 * 1024)
-
-for row in csv.DictReader(stdin):
-    json.dump(row, stdout, allow_nan = False, indent = None, separators = ',:')
-    print()
diff --git a/ingest/scripts/download-from-s3 b/ingest/scripts/download-from-s3
deleted file mode 100755
index c9dbab5..0000000
--- a/ingest/scripts/download-from-s3
+++ /dev/null
@@ -1,34 +0,0 @@
-#!/bin/bash
-# Originally copied from nextstrain/ncov-ingest repo
-set -euo pipefail
-
-bin="$(dirname "$0")"
-
-main() {
-    local src="${1:?A source s3:// URL is required as the first argument.}"
-    local dst="${2:?A destination file path is required as the second argument.}"
-
-    local s3path="${src#s3://}"
-    local bucket="${s3path%%/*}"
-    local key="${s3path#*/}"
-
-    local src_hash dst_hash no_hash=0000000000000000000000000000000000000000000000000000000000000000
-    dst_hash="$("$bin/sha256sum" < "$dst" || true)"
-    src_hash="$(aws s3api head-object --bucket "$bucket" --key "$key" --query Metadata.sha256sum --output text 2>/dev/null || echo "$no_hash")"
-
-    echo "[ INFO] Downloading $src → $dst"
-    if [[ $src_hash != "$dst_hash" ]]; then
-        aws s3 cp --no-progress "$src" - |
-            if [[ "$src" == *.gz ]]; then
-                gunzip -cfq
-            elif [[ "$src" == *.xz ]]; then
-                xz -T0 -dcq
-            else
-                cat
-            fi > "$dst"
-    else
-        echo "[ INFO] Files are identical, skipping download"
-    fi
-}
-
-main "$@"
diff --git a/ingest/scripts/fasta-to-ndjson b/ingest/scripts/fasta-to-ndjson
deleted file mode 100755
index 1ee9f8f..0000000
--- a/ingest/scripts/fasta-to-ndjson
+++ /dev/null
@@ -1,86 +0,0 @@
-#!/usr/bin/env python3
-"""
-Parse delimited fields from FASTA header into NDJSON format to stdout.
-The output NDJSON records are guaranteed to have at least two fields:
-  1. strain
-  2. sequence
-
-Uses the `augur.io.read_sequences` function to read the FASTA file,
-so `augur` must be installed in the environment running the script.
-"""
-
-import argparse
-import json
-import sys
-
-from augur.io import read_sequences
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(
-        description=__doc__,
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter
-    )
-    parser.add_argument("--fasta", required=True,
-        help="FASTA file to be transformed into NDJSON format")
-    parser.add_argument("--fields", nargs="+",
-        help="Fields in the FASTA header, listed in the same order as the header. " +
-             "These will be used as the keys in the final NDJSON output. " +
-             "One of the fields must be 'strain'. " +
-             "These cannot include the field 'sequence' as this field is reserved for the genomic sequence.")
-    parser.add_argument("--separator", default='|',
-        help="Field separator in the FASTA header")
-    parser.add_argument("--exclude", nargs="*",
-        help="List of fields to exclude from final NDJSON record. "
" - "These cannot include 'strain' or 'sequence'.") - - args = parser.parse_args() - - fasta_fields = [field.lower() for field in args.fields] - - exclude_fields = [] - if args.exclude: - exclude_fields = [field.lower() for field in args.exclude] - - passed_checks = True - - if 'strain' not in fasta_fields: - print("ERROR: FASTA fields must include a 'strain' field.", file=sys.stderr) - passed_checks = False - - if 'sequence' in fasta_fields: - print("ERROR: FASTA fields cannot include a 'sequence' field.", file=sys.stderr) - passed_checks = False - - if 'strain' in exclude_fields: - print("ERROR: The field 'strain' cannot be excluded from the output.", file=sys.stderr) - passed_checks = False - - if 'sequence' in exclude_fields: - print("ERROR: The field 'sequence' cannot be excluded from the output.", file=sys.stderr) - passed_checks = False - - missing_fields = [field for field in exclude_fields if field not in fasta_fields] - if missing_fields: - print(f"ERROR: The following exclude fields do not match any FASTA fields: {missing_fields}", file=sys.stderr) - passed_checks = False - - if not passed_checks: - print("ERROR: Failed to parse FASTA file into NDJSON records.","See detailed errors above.", file=sys.stderr) - sys.exit(1) - - sequences = read_sequences(args.fasta) - - for sequence in sequences: - field_values = [ - value.strip() - for value in sequence.description.split(args.separator) - ] - record = dict(zip(fasta_fields, field_values)) - record['sequence'] = str(sequence.seq).upper() - - for field in exclude_fields: - del record[field] - - json.dump(record, sys.stdout, allow_nan=False, indent=None, separators=',:') - print() diff --git a/ingest/scripts/fetch-from-genbank b/ingest/scripts/fetch-from-genbank deleted file mode 100755 index f66715a..0000000 --- a/ingest/scripts/fetch-from-genbank +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash -# -# Originally copied from "bin/fetch-from-genbank" in nextstrain/ncov-ingest: -# https://github.com/nextstrain/ncov-ingest/blob/2a5f255329ee5bdf0cabc8b8827a700c92becbe4/bin/fetch-from-genbank -# -set -euo pipefail - -bin="$(dirname "$0")" - - -main() { - local ncbi_taxon_id="${1:?NCBI taxon id is required.}" - fetch "$ncbi_taxon_id" | "$bin"/csv-to-ndjson -} - -fetch() { - curl "$("$bin"/genbank-url --ncbi-taxon-id "$1")" \ - --fail --silent --show-error --http1.1 \ - --header 'User-Agent: https://github.com/nextstrain/dengue (hello@nextstrain.org)' -} - -main "$@" diff --git a/ingest/scripts/genbank-url b/ingest/scripts/genbank-url deleted file mode 100755 index 20120be..0000000 --- a/ingest/scripts/genbank-url +++ /dev/null @@ -1,100 +0,0 @@ -#!/usr/bin/env python3 -""" -Generate URL to download all Pathogen sequences and their curated metadata -from GenBank via NCBI Virus. - -The URL this program builds is based on the URL for SARS-CoV-2 constructed with - - https://github.com/nextstrain/ncov-ingest/blob/2a5f255329ee5bdf0cabc8b8827a700c92becbe4/bin/genbank-url - -and observing the network activity at - - https://www.ncbi.nlm.nih.gov/labs/virus/vssi/#/virus?SeqType_s=Nucleotide&VirusLineage_ss=Zika%20virus,%20taxid:64320 -""" -from urllib.parse import urlencode -import argparse -from datetime import date - -def parse_args(): - parser = argparse.ArgumentParser( - description="Given an NCBI taxon ID, generate URL to download " - "all viral sequences and their curated metadata from GenBank via NCBI Virus." 
-    )
-    parser.add_argument(
-        "--ncbi-taxon-id",
-        help="NCBI Taxon ID.",
-        default="11082",
-        required=True
-    )
-    return parser.parse_args()
-
-def build_query_url(ncbi_taxon_id: str):
-    """
-    Generate URL to download all viral sequences and their curated metadata
-    from GenBank via NCBI Virus.
-    """
-    today = str(date.today().strftime("%Y-%m-%d"))
-    endpoint = "https://www.ncbi.nlm.nih.gov/genomes/VirusVariation/vvsearch2/"
-    params = {
-        # Search criteria
-        'fq': [
-            '{!tag=SeqType_s}SeqType_s:("Nucleotide")', # Nucleotide sequences (as opposed to protein)
-            f'VirusLineageId_ss:({ncbi_taxon_id})', # NCBI Taxon id for virus
-            'Division_s:("VRL")', # Restrict to viral sequences, avoid PAT
-            '{!tag=SLen_i}SLen_i:([5000 TO 15000])', # Longer than 5K bp, shorter than 15k bp
-            #'{!tag=CollectionDate_s}CollectionDate_s:([2013-01-01T00:00:00.00Z TO ' + today + 'T00:00:00.00Z ])', # If needed to restrict time period modify this line. Currently after 2013
-            #'{!tag=UpdateDate_dt}UpdateDate_dt:([2022-01-01T00:00:00.00Z TO 2022-12-01T00:00:00.00Z ])', # Activate if only pulling recently modified or added data
-        ],
-
-        # Unclear, but seems necessary.
-        'q': '*:*',
-
-        # Response format
-        'cmd': 'download',
-        'dlfmt': 'csv',
-        'fl': ','.join(
-            ':'.join(names) for names in [
-                # Pairs of (output column name, source data field).
-                ('genbank_accession', 'id'),
-                ('genbank_accession_rev', 'AccVer_s'),
-                ('database', 'SourceDB_s'),
-                ('strain', 'Isolate_s'),
-                ('strain_s', 'Strain_s'), # Derive strain name if Isolate_s is empty
-                ('viruslineage_ids', 'VirusLineageId_ss'), # Allows derivation of subtypes based on lineage ID
-                ('region', 'Region_s'),
-                ('location', 'CountryFull_s'),
-                ('collected', 'CollectionDate_s'),
-                ('submitted', 'CreateDate_dt'),
-                ('updated', 'UpdateDate_dt'), # Check for recent updates to the record
-                ('length', 'SLen_i'),
-                ('host', 'Host_s'),
-                ('isolation_source', 'Isolation_csv'),
-                ('bioproject_accession', 'BioProject_s'),
-                ('biosample_accession', 'BioSample_s'),
-                ('sra_accession', 'SRALink_csv'),
-                ('title', 'Definition_s'), # Should be PubMed title, not Definition
-                ('authors', 'Authors_csv'),
-                ('submitting_organization', 'SubmitterAffilFull_s'),
-                ('publications', 'PubMed_csv'),
-                ('sequence', 'Nucleotide_seq'),
-            ]
-        ),
-
-        # Stable sort with GenBank accessions.
-        # Columns are source data fields, not our output columns.
-        'sort': 'id asc',
-
-        # This isn't Entrez, but include the same email parameter it requires just
-        # to be nice.
-        'email': 'hello@nextstrain.org',
-    }
-    query = urlencode(params, doseq = True, encoding = "utf-8")
-
-    print(f"{endpoint}?{query}")
-
-def main():
-    args = parse_args()
-    build_query_url(args.ncbi_taxon_id)
-
-if __name__ == '__main__':
-    main()
diff --git a/ingest/scripts/join-metadata-and-clades.py b/ingest/scripts/join-metadata-and-clades.py
deleted file mode 100755
index 99ed732..0000000
--- a/ingest/scripts/join-metadata-and-clades.py
+++ /dev/null
@@ -1,77 +0,0 @@
-#!/usr/bin/env python3
-import argparse
-import re
-import sys
-import pandas as pd
-
-NEXTCLADE_JOIN_COLUMN_NAME = 'seqName'
-VALUE_MISSING_DATA = '?'
-
-column_map = {
-    "clade": "clade",
-    "outbreak": "outbreak",
-    "lineage": "lineage",
-    "coverage": "coverage",
-    "totalMissing": "missing_data",
-    "totalSubstitutions": "divergence",
-    "totalNonACGTNs": "nonACGTN",
-    "qc.missingData.status": "QC_missing_data",
-    "qc.mixedSites.status": "QC_mixed_sites",
-    "qc.privateMutations.status": "QC_rare_mutations",
-    "qc.frameShifts.status": "QC_frame_shifts",
-    "qc.stopCodons.status": "QC_stop_codons",
-    "frameShifts": "frame_shifts",
-    "isReverseComplement": "is_reverse_complement",
-#    "deletions": "deletions",
-#    "insertions": "insertions"
-#    "substitutions": "substitutions",
-#    "aaSubstitutions": "aaSubstitutions"
-}
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(
-        description="Joins metadata file with Nextclade clade output",
-    )
-    parser.add_argument("--metadata")
-    parser.add_argument("--nextclade")
-    parser.add_argument("--id-field")
-    parser.add_argument("-o", default=sys.stdout)
-    return parser.parse_args()
-
-def main():
-    args = parse_args()
-
-    metadata = pd.read_csv(args.metadata, index_col=args.id_field,
-                           sep='\t', low_memory=False, na_filter = False)
-
-    # Read and rename clade column to be more descriptive
-    clades = pd.read_csv(args.nextclade, index_col=NEXTCLADE_JOIN_COLUMN_NAME,
-                         sep='\t', low_memory=False, na_filter = False) \
-        .rename(columns=column_map)
-
-    clades.index = clades.index.map(lambda x: re.sub(" \|.*", "", x))
-
-    # Select columns in column map
-    clades = clades[list(column_map.values())]
-
-    # Separate long from short columns
-    short_metadata = metadata.iloc[:,:-2].copy()
-    long_metadata = metadata.iloc[:,-2:].copy()
-
-    # Concatenate on columns
-    result = pd.merge(
-        short_metadata, clades,
-        left_index=True,
-        right_index=True,
-        how='left'
-    )
-
-    # Add long columns to back
-    result = pd.concat([result, long_metadata], axis=1)
-
-    result.to_csv(args.o, index_label=args.id_field, sep='\t')
-
-
-if __name__ == '__main__':
-    main()
diff --git a/ingest/scripts/ndjson-to-tsv-and-fasta b/ingest/scripts/ndjson-to-tsv-and-fasta
deleted file mode 100755
index d9d7331..0000000
--- a/ingest/scripts/ndjson-to-tsv-and-fasta
+++ /dev/null
@@ -1,67 +0,0 @@
-#!/usr/bin/env python3
-"""
-Parses NDJSON records from stdin to two different files: a metadata TSV and a
-sequences FASTA.
-
-Records that do not have an ID or sequence will be excluded from the output files.
-"""
-import argparse
-import csv
-import json
-from sys import stderr, stdin
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(
-        description=__doc__,
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter
-    )
-    parser.add_argument("--metadata", metavar="TSV", default="data/metadata.tsv",
-        help="The output metadata TSV file")
-    parser.add_argument("--fasta", metavar="FASTA", default="data/sequences.fasta",
-        help="The output sequences FASTA file")
-    parser.add_argument("--metadata-columns", nargs="+",
-        help="List of fields from the NDJSON records to include as columns in the metadata TSV. " +
" + - "Metadata TSV columns will be in the order of the columns provided.") - parser.add_argument("--id-field", default='strain', - help="Field from the records to use as the sequence ID in the FASTA file.") - parser.add_argument("--sequence-field", default='sequence', - help="Field from the record that holds the genomic sequence for the FASTA file.") - - args = parser.parse_args() - - with open(args.metadata, 'wt') as metadata_output: - with open(args.fasta, 'wt') as fasta_output: - metadata_csv = csv.DictWriter( - metadata_output, - args.metadata_columns, - restval="", - extrasaction='ignore', - delimiter='\t', - lineterminator='\n', - ) - metadata_csv.writeheader() - - for index, record in enumerate(stdin): - record = json.loads(record) - - sequence_id = str(record.get(args.id_field, '')) - sequence = str(record.get(args.sequence_field, '')) - - if not sequence_id: - print( - f"WARNING: Record number {index} does not have a sequence ID.", - "This record will be excluded from the output files.", - file=stderr - ) - elif not sequence: - print( - f"WARNING: Record number {index} does not have a sequence.", - "This record will be excluded from the output files.", - file=stderr - ) - else: - metadata_csv.writerow(record) - - print(f">{sequence_id}", file=fasta_output) - print(f"{sequence}" , file= fasta_output) diff --git a/ingest/scripts/notify-on-diff b/ingest/scripts/notify-on-diff deleted file mode 100755 index c304d6b..0000000 --- a/ingest/scripts/notify-on-diff +++ /dev/null @@ -1,35 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -: "${SLACK_TOKEN:?The SLACK_TOKEN environment variable is required.}" -: "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" - -bin="$(dirname "$0")" - -src="${1:?A source file is required as the first argument.}" -dst="${2:?A destination s3:// URL is required as the second argument.}" - -dst_local="$(mktemp -t s3-file-XXXXXX)" -diff="$(mktemp -t diff-XXXXXX)" - -trap "rm -f '$dst_local' '$diff'" EXIT - -# if the file is not already present, just exit -"$bin"/s3-object-exists "$dst" || exit 0 - -"$bin"/download-from-s3 "$dst" "$dst_local" - -# diff's exit code is 0 for no differences, 1 for differences found, and >1 for errors -diff_exit_code=0 -diff "$dst_local" "$src" > "$diff" || diff_exit_code=$? - -if [[ "$diff_exit_code" -eq 1 ]]; then - echo "Notifying Slack about diff." - "$bin"/notify-slack --upload "$src.diff" < "$diff" -elif [[ "$diff_exit_code" -gt 1 ]]; then - echo "Notifying Slack about diff failure" - "$bin"/notify-slack "Diff failed for $src" -else - echo "No change in $src." -fi diff --git a/ingest/scripts/notify-on-job-fail b/ingest/scripts/notify-on-job-fail deleted file mode 100755 index 23d3a92..0000000 --- a/ingest/scripts/notify-on-job-fail +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash -set -euo pipefail - -: "${SLACK_TOKEN:?The SLACK_TOKEN environment variable is required.}" -: "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" - -: "${AWS_BATCH_JOB_ID:=}" -: "${GITHUB_RUN_ID:=}" - -bin="$(dirname "$0")" - -echo "Notifying Slack about failed ingest job." -message="❌ Ingest job has FAILED 😞 " - -if [ -n "${AWS_BATCH_JOB_ID}" ]; then - message+="See AWS Batch job \`${AWS_BATCH_JOB_ID}\` () for error details. " -elif [ -n "${GITHUB_RUN_ID}" ]; then - message+="See GitHub Action for error details. 
" -fi - -"$bin"/notify-slack "$message" diff --git a/ingest/scripts/notify-on-job-start b/ingest/scripts/notify-on-job-start deleted file mode 100755 index 9410fa3..0000000 --- a/ingest/scripts/notify-on-job-start +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/bash -set -euo pipefail - -: "${SLACK_TOKEN:?The SLACK_TOKEN environment variable is required.}" -: "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" - -: "${AWS_BATCH_JOB_ID:=}" -: "${GITHUB_RUN_ID:=}" - -bin="$(dirname "$0")" - -echo "Notifying Slack about started ingest job." -message="🐵 Monkeypox ingest job has started." - -if [[ -n "${GITHUB_RUN_ID}" ]]; then - message+=" The job was submitted by GitHub Action ." -fi - -if [[ -n "${AWS_BATCH_JOB_ID}" ]]; then - message+=" The job was launched as AWS Batch job \`${AWS_BATCH_JOB_ID}\` ()." - message+=" Follow along in your local \`monkeypox\` repo with: "'```'"nextstrain build --aws-batch --no-download --attach ${AWS_BATCH_JOB_ID} ingest/"'```' -fi - -"$bin"/notify-slack "$message" diff --git a/ingest/scripts/notify-on-record-change b/ingest/scripts/notify-on-record-change deleted file mode 100755 index 595835b..0000000 --- a/ingest/scripts/notify-on-record-change +++ /dev/null @@ -1,54 +0,0 @@ -#!/bin/bash -# Originally copied from nextstrain/ncov-ingest -set -euo pipefail - -: "${SLACK_TOKEN:?The SLACK_TOKEN environment variable is required.}" -: "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" - -bin="$(dirname "$0")" - -src="${1:?A source ndjson file is required as the first argument.}" -dst="${2:?A destination ndjson s3:// URL is required as the second argument.}" -source_name=${3:?A record source name is required as the third argument.} - -# if the file is not already present, just exit -"$bin"/s3-object-exists "$dst" || exit 0 - -s3path="${dst#s3://}" -bucket="${s3path%%/*}" -key="${s3path#*/}" - -src_record_count="$(wc -l < "$src")" - -# Try getting record count from S3 object metadata -dst_record_count="$(aws s3api head-object --bucket "$bucket" --key "$key" --query "Metadata.recordcount || ''" --output text 2>/dev/null || true)" -if [[ -z "$dst_record_count" ]]; then - # This object doesn't have the record count stored as metadata - # We have to download it and count the lines locally - dst_record_count="$(wc -l < <(aws s3 cp --no-progress "$dst" - | xz -T0 -dcfq))" -fi - -added_records="$(( src_record_count - dst_record_count ))" - -printf "%'4d %s\n" "$src_record_count" "$src" -printf "%'4d %s\n" "$dst_record_count" "$dst" -printf "%'4d added records\n" "$added_records" - -slack_message="" - -if [[ $added_records -gt 0 ]]; then - echo "Notifying Slack about added records (n=$added_records)" - slack_message="📈 New monkeypox records (n=$added_records) found on $source_name." - -elif [[ $added_records -lt 0 ]]; then - echo "Notifying Slack about fewer records (n=$added_records)" - slack_message="📉 Fewer monkeypox records (n=$added_records) found on $source_name." - -else - echo "Notifying Slack about same number of records" - slack_message="⛔ No new monkeypox records found on $source_name." 
-fi
-
-slack_message+=" (Total record count: $src_record_count)"
-
-"$bin"/notify-slack "$slack_message"
diff --git a/ingest/scripts/notify-slack b/ingest/scripts/notify-slack
deleted file mode 100755
index 6ca20de..0000000
--- a/ingest/scripts/notify-slack
+++ /dev/null
@@ -1,58 +0,0 @@
-#!/bin/bash
-# Originally copied from nextstrain/ncov-ingest repo
-set -euo pipefail
-
-: "${SLACK_TOKEN:?The SLACK_TOKEN environment variable is required.}"
-: "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}"
-
-upload=0
-output=/dev/null
-thread_ts=""
-broadcast=0
-args=()
-
-for arg; do
-    case "$arg" in
-        --upload)
-            upload=1;;
-        --output=*)
-            output="${arg#*=}";;
-        --thread-ts=*)
-            thread_ts="${arg#*=}";;
-        --broadcast)
-            broadcast=1;;
-        *)
-            args+=("$arg");;
-    esac
-done
-
-set -- "${args[@]}"
-
-text="${1:?Some message text is required.}"
-
-if [[ "$upload" == 1 ]]; then
-    echo "Uploading data to Slack with the message: $text"
-    curl https://slack.com/api/files.upload \
-        --header "Authorization: Bearer $SLACK_TOKEN" \
-        --form-string channels="$SLACK_CHANNELS" \
-        --form-string title="$text" \
-        --form-string filename="$text" \
-        --form-string thread_ts="$thread_ts" \
-        --form-string reply_broadcast="$broadcast" \
-        --form file=@/dev/stdin \
-        --form filetype=text \
-        --fail --silent --show-error \
-        --http1.1 \
-        --output "$output"
-else
-    echo "Posting Slack message: $text"
-    curl https://slack.com/api/chat.postMessage \
-        --header "Authorization: Bearer $SLACK_TOKEN" \
-        --form-string channel="$SLACK_CHANNELS" \
-        --form-string text="$text" \
-        --form-string thread_ts="$thread_ts" \
-        --form-string reply_broadcast="$broadcast" \
-        --fail --silent --show-error \
-        --http1.1 \
-        --output "$output"
-fi
diff --git a/ingest/scripts/reverse_reversed_sequences.py b/ingest/scripts/reverse_reversed_sequences.py
deleted file mode 100755
index 1ee9be2..0000000
--- a/ingest/scripts/reverse_reversed_sequences.py
+++ /dev/null
@@ -1,29 +0,0 @@
-import pandas as pd
-import argparse
-from Bio import SeqIO
-
-if __name__=="__main__":
-    parser = argparse.ArgumentParser(
-        description="Reverse-complement reverse-complemented sequence",
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter
-    )
-
-    parser.add_argument('--metadata', type=str, required=True, help="input metadata")
-    parser.add_argument('--sequences', type=str, required=True, help="input sequences")
-    parser.add_argument('--output', type=str, required=True, help="output sequences")
-    args = parser.parse_args()
-
-    metadata = pd.read_csv(args.metadata, sep='\t')
-
-    # Read in fasta file
-    with open(args.sequences, 'r') as f_in:
-        with open(args.output, 'w') as f_out:
-            for seq in SeqIO.parse(f_in, 'fasta'):
-                # Check if metadata['reverse'] is True
-                if metadata.loc[metadata['accession'] == seq.id, 'reverse'].values[0] == True:
-                    # Reverse-complement sequence
-                    seq.seq = seq.seq.reverse_complement()
-                    print("Reverse-complementing sequence:", seq.id)
-
-                # Write sequences to file
-                SeqIO.write(seq, f_out, 'fasta')
diff --git a/ingest/scripts/s3-object-exists b/ingest/scripts/s3-object-exists
deleted file mode 100755
index d586d0b..0000000
--- a/ingest/scripts/s3-object-exists
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/bin/bash
-# Originally copied from nextstrain/ncov-ingest
-set -euo pipefail
-
-url="${1#s3://}"
-bucket="${url%%/*}"
-key="${url#*/}"
-
-aws s3api head-object --bucket "$bucket" --key "$key" &>/dev/null
diff --git a/ingest/scripts/sha256sum b/ingest/scripts/sha256sum
deleted file mode 100755
index aa05af0..0000000
--- a/ingest/scripts/sha256sum
+++ /dev/null
@@ -1,16 +0,0 @@
-#!/usr/bin/env python3
-# Originally copied from nextstrain/ncov-ingest repo
-"""
-Portable sha256sum utility.
-"""
-from hashlib import sha256
-from sys import stdin
-
-chunk_size = 5 * 1024**2 # 5 MiB
-
-h = sha256()
-
-for chunk in iter(lambda: stdin.buffer.read(chunk_size), b""):
-    h.update(chunk)
-
-print(h.hexdigest())
diff --git a/ingest/scripts/trigger b/ingest/scripts/trigger
deleted file mode 100755
index d40553b..0000000
--- a/ingest/scripts/trigger
+++ /dev/null
@@ -1,56 +0,0 @@
-#!/bin/bash
-set -euo pipefail
-
-: "${PAT_GITHUB_DISPATCH:=}"
-
-repo="${1:?A repository name is required as the first argument.}"
-event_type="${2:?An event type is required as the second argument.}"
-shift 2
-
-if [[ $# -eq 0 && -z $PAT_GITHUB_DISPATCH ]]; then
-    cat >&2 <<.
-You must specify options to curl for your GitHub credentials. For example, you
-can specify your GitHub username, and will be prompted for your password:
-
-    $0 $repo $event_type --user <your-username>
-
-Be sure to enter a personal access token¹ as your password since GitHub has
-discontinued password authentication to the API starting on November 13, 2020².
-
-You can also store your credentials or a personal access token in a netrc
-file³:
-
-    machine api.github.com
-    login <your-username>
-    password <your-token>
-
-and then tell curl to use it:
-
-    $0 $repo $event_type --netrc
-
-which will then not require you to type your password every time.
-
-¹ https://help.github.com/en/github/authenticating-to-github/creating-a-personal-access-token-for-the-command-line
-² https://docs.github.com/en/rest/overview/other-authentication-methods#via-username-and-password
-³ https://ec.haxx.se/usingcurl/usingcurl-netrc
-.
-    exit 1
-fi
-
-auth=':'
-if [[ -n $PAT_GITHUB_DISPATCH ]]; then
-    auth="Authorization: Bearer ${PAT_GITHUB_DISPATCH}"
-fi
-
-if curl -fsS "https://api.github.com/repos/nextstrain/${repo}/dispatches" \
-    -H 'Accept: application/vnd.github.v3+json' \
-    -H 'Content-Type: application/json' \
-    -H "$auth" \
-    -d '{"event_type":"'"$event_type"'"}' \
-    "$@"
-then
-    echo "Successfully triggered $event_type"
-else
-    echo "Request failed" >&2
-    exit 1
-fi
diff --git a/ingest/scripts/trigger-on-new-data b/ingest/scripts/trigger-on-new-data
deleted file mode 100755
index 760a018..0000000
--- a/ingest/scripts/trigger-on-new-data
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/bin/bash
-set -euo pipefail
-
-: "${PAT_GITHUB_DISPATCH:?The PAT_GITHUB_DISPATCH environment variable is required.}"
-
-bin="$(dirname "$0")"
-
-metadata="${1:?A metadata upload output file is required as the first argument.}"
-sequences="${2:?A sequence FASTA upload output file is required as the second argument.}"
-identical_file_message="${3:-files are identical}"
-
-new_metadata=$(grep "$identical_file_message" "$metadata" >/dev/null; echo $?)
-new_sequences=$(grep "$identical_file_message" "$sequences" >/dev/null; echo $?)
-
-slack_message=""
-
-# grep exit status 0 for found match, 1 for no match, 2 if an error occurred
-if [[ $new_metadata -eq 1 || $new_sequences -eq 1 ]]; then
-    slack_message="Triggering new builds due to updated metadata and/or sequences"
-    "$bin"/trigger "monkeypox" "rebuild"
-elif [[ $new_metadata -eq 0 && $new_sequences -eq 0 ]]; then
-    slack_message="Skipping trigger of rebuild: Both metadata TSV and sequences FASTA are identical to S3 files."
-else
-    slack_message="Skipping trigger of rebuild: Unable to determine if data has been updated."
-fi
-
-
"$bin"/notify-slack "$slack_message"; then - echo "Notifying Slack failed, but exiting with success anyway." -fi diff --git a/ingest/scripts/upload-to-s3 b/ingest/scripts/upload-to-s3 deleted file mode 100755 index b993c3d..0000000 --- a/ingest/scripts/upload-to-s3 +++ /dev/null @@ -1,76 +0,0 @@ -#!/bin/bash -# Originally copied from nextstrain/ncov-ingest repo -set -euo pipefail - -bin="$(dirname "$0")" - -main() { - local quiet=0 - - for arg; do - case "$arg" in - --quiet) - quiet=1 - shift;; - *) - break;; - esac - done - - local src="${1:?A source file is required as the first argument.}" - local dst="${2:?A destination s3:// URL is required as the second argument.}" - local cloudfront_domain="${3:-}" - - local s3path="${dst#s3://}" - local bucket="${s3path%%/*}" - local key="${s3path#*/}" - - local src_hash dst_hash no_hash=0000000000000000000000000000000000000000000000000000000000000000 - src_hash="$("$bin/sha256sum" < "$src")" - dst_hash="$(aws s3api head-object --bucket "$bucket" --key "$key" --query Metadata.sha256sum --output text 2>/dev/null || echo "$no_hash")" - - if [[ $src_hash != "$dst_hash" ]]; then - # The record count may have changed - src_record_count="$(wc -l < "$src")" - - echo "Uploading $src → $dst" - if [[ "$dst" == *.gz ]]; then - gzip -c "$src" - elif [[ "$dst" == *.xz ]]; then - xz -2 -T0 -c "$src" - else - cat "$src" - fi | aws s3 cp --no-progress - "$dst" --metadata sha256sum="$src_hash",recordcount="$src_record_count" "$(content-type "$dst")" - - if [[ -n $cloudfront_domain ]]; then - echo "Creating CloudFront invalidation for $cloudfront_domain/$key" - if ! "$bin"/cloudfront-invalidate "$cloudfront_domain" "/$key"; then - echo "CloudFront invalidation failed, but exiting with success anyway." - fi - fi - - if [[ $quiet == 1 ]]; then - echo "Quiet mode. No Slack notification sent." - exit 0 - fi - - if ! "$bin"/notify-slack "Updated $dst available."; then - echo "Notifying Slack failed, but exiting with success anyway." - fi - else - echo "Uploading $src → $dst: files are identical, skipping upload" - fi -} - -content-type() { - case "$1" in - *.tsv) echo --content-type=text/tab-separated-values;; - *.csv) echo --content-type=text/comma-separated-values;; - *.ndjson) echo --content-type=application/x-ndjson;; - *.gz) echo --content-type=application/gzip;; - *.xz) echo --content-type=application/x-xz;; - *) echo --content-type=text/plain;; - esac -} - -main "$@"