Skip to content

Commit

Permalink
Add automation for public build #13
Browse files Browse the repository at this point in the history
  • Loading branch information
j23414 authored Sep 28, 2024
2 parents 39fab96 + a146a3e commit 10ff0af
Show file tree
Hide file tree
Showing 21 changed files with 12,918 additions and 25 deletions.
17 changes: 17 additions & 0 deletions .github/dependabot.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Dependabot configuration file
# <https://docs.github.com/en/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file>
#
# Each ecosystem is checked on a scheduled interval defined below. To trigger
# a check manually, go to
#
# https://github.com/nextstrain/wnv/network/updates
#
# and look for a "Check for updates" button. You may need to click around a
# bit first.
---
version: 2
updates:
- package-ecosystem: "github-actions"
directory: "/"
schedule:
interval: "weekly"
16 changes: 16 additions & 0 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
name: CI

on:
push:
branches:
- main
pull_request:
workflow_dispatch:
# Routinely check that we continue to work in the face of external changes.
schedule:
# Every day at 18:37 UTC / 10:37 Seattle (winter) / 11:37 Seattle (summer)
- cron: "37 18 * * *"

jobs:
ci:
uses: nextstrain/.github/.github/workflows/pathogen-repo-ci.yaml@master
102 changes: 102 additions & 0 deletions .github/workflows/ingest-to-phylogenetic.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
name: Ingest to phylogenetic

defaults:
run:
# This is the same as GitHub Action's `bash` keyword as of 20 June 2023:
# https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsshell
#
# Completely spelling it out here so that GitHub can't change it out from under us
# and we don't have to refer to the docs to know the expected behavior.
shell: bash --noprofile --norc -eo pipefail {0}

on:
schedule:
# Note times are in UTC, which is 1 or 2 hours behind CET depending on daylight savings.
#
# Note the actual runs might be late.
# Numerous people were confused, about that, including me:
# - https://gh.neting.ccmunity/t/scheduled-action-running-consistently-late/138025/11
# - https://github.com/github/docs/issues/3059
#
# Note, '*' is a special character in YAML, so you have to quote this string.
#
# Docs:
# - https://docs.github.com/en/actions/learn-github-actions/events-that-trigger-workflows#schedule
#
# Tool that deciphers this particular format of crontab string:
# - https://crontab.guru/
#
# Runs at 5pm UTC (1pm EDT/10am PDT) since curation by NCBI happens on the East Coast.
# We were running into invalid zip archive errors at 9am PDT, so hoping an hour
# delay will lower the error frequency
- cron: '0 17 * * *'

workflow_dispatch:
inputs:
ingest_image:
description: 'Specific container image to use for ingest workflow (will override the default of "nextstrain build")'
required: false
phylogenetic_image:
description: 'Specific container image to use for phylogenetic workflow (will override the default of "nextstrain build")'
required: false

jobs:
ingest:
permissions:
id-token: write
uses: ./.github/workflows/ingest.yaml
secrets: inherit
with:
image: ${{ inputs.ingest_image }}

# Check if ingest results include new data by checking for the cache
# of the file with the results' Metadata.sh256sum (which should have been added within upload-to-s3)
# GitHub will remove any cache entries that have not been accessed in over 7 days,
# so if the workflow has not been run over 7 days then it will trigger phylogenetic.
check-new-data:
needs: [ingest]
runs-on: ubuntu-latest
outputs:
cache-hit: ${{ steps.check-cache.outputs.cache-hit }}
steps:
- name: Get sha256sum
id: get-sha256sum
env:
AWS_DEFAULT_REGION: ${{ vars.AWS_DEFAULT_REGION }}
run: |
s3_urls=(
"s3://nextstrain-data/files/workflows/wnv/all/metadata.tsv.zst"
"s3://nextstrain-data/files/workflows/wnv/all/sequences.fasta.zst"
)
# Code below is modified from ingest/upload-to-s3
# https://github.com/nextstrain/ingest/blob/c0b4c6bb5e6ccbba86374d2c09b42077768aac23/upload-to-s3#L23-L29
no_hash=0000000000000000000000000000000000000000000000000000000000000000
for s3_url in "${s3_urls[@]}"; do
s3path="${s3_url#s3://}"
bucket="${s3path%%/*}"
key="${s3path#*/}"
s3_hash="$(aws s3api head-object --no-sign-request --bucket "$bucket" --key "$key" --query Metadata.sha256sum --output text 2>/dev/null || echo "$no_hash")"
echo "${s3_hash}" | tee -a ingest-output-sha256sum
done
- name: Check cache
id: check-cache
uses: actions/cache@v4
with:
path: ingest-output-sha256sum
key: ingest-output-sha256sum-${{ hashFiles('ingest-output-sha256sum') }}
lookup-only: true

phylogenetic:
needs: [check-new-data]
if: ${{ needs.check-new-data.outputs.cache-hit != 'true' }}
permissions:
id-token: write
uses: ./.github/workflows/phylogenetic.yaml
secrets: inherit
with:
image: ${{ inputs.phylogenetic_image }}
80 changes: 80 additions & 0 deletions .github/workflows/ingest.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
name: Ingest

defaults:
run:
# This is the same as GitHub Action's `bash` keyword as of 20 June 2023:
# https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsshell
#
# Completely spelling it out here so that GitHub can't change it out from under us
# and we don't have to refer to the docs to know the expected behavior.
shell: bash --noprofile --norc -eo pipefail {0}

on:
workflow_call:
inputs:
image:
description: 'Specific container image to use for ingest workflow (will override the default of "nextstrain build")'
required: false
type: string

workflow_dispatch:
inputs:
image:
description: 'Specific container image to use for ingest workflow (will override the default of "nextstrain build")'
required: false
type: string
trial_name:
description: |
Trial name for outputs.
If not set, outputs will overwrite files at s3://nextstrain-data/files/workflows/wnv/
If set, outputs will be uploaded to s3://nextstrain-data/files/workflows/wnv/trials/<trial_name>/
required: false
type: string

jobs:
set_config_overrides:
runs-on: ubuntu-latest
steps:
- id: config
name: Set config overrides
env:
TRIAL_NAME: ${{ inputs.trial_name }}
run: |
config=""
if [[ "$TRIAL_NAME" ]]; then
config+="--config"
config+=" s3_dst='s3://nextstrain-data/files/workflows/wnv/trials/"$TRIAL_NAME"'"
fi
echo "config=$config" >> "$GITHUB_OUTPUT"
outputs:
config_overrides: ${{ steps.config.outputs.config }}

ingest:
needs: [set_config_overrides]
permissions:
id-token: write
uses: nextstrain/.github/.github/workflows/pathogen-repo-build.yaml@master
secrets: inherit
with:
# Starting with the default docker runtime
# We can migrate to AWS Batch when/if we need to for more resources or if
# the job runs longer than the GH Action limit of 6 hours.
runtime: docker
env: |
NEXTSTRAIN_DOCKER_IMAGE: ${{ inputs.image }}
CONFIG_OVERRIDES: ${{ needs.set_config_overrides.outputs.config_overrides }}
run: |
nextstrain build \
ingest \
upload_all \
--configfile build-configs/nextstrain-automation/config.yaml \
$CONFIG_OVERRIDES
# Specifying artifact name to differentiate ingest build outputs from
# the phylogenetic build outputs
artifact-name: ingest-build-output
artifact-paths: |
ingest/results/
ingest/benchmarks/
ingest/logs/
ingest/.snakemake/log/
107 changes: 107 additions & 0 deletions .github/workflows/phylogenetic.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
name: Phylogenetic

defaults:
run:
# This is the same as GitHub Action's `bash` keyword as of 20 June 2023:
# https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsshell
#
# Completely spelling it out here so that GitHub can't change it out from under us
# and we don't have to refer to the docs to know the expected behavior.
shell: bash --noprofile --norc -eo pipefail {0}

on:
workflow_call:
inputs:
image:
description: 'Specific container image to use for phylogenetic workflow (will override the default of "nextstrain build")'
required: false
type: string

workflow_dispatch:
inputs:
image:
description: 'Specific container image to use for ingest workflow (will override the default of "nextstrain build")'
required: false
type: string
trial_name:
description: |
Trial name for deploying builds.
If not set, builds will overwrite existing builds at s3://nextstrain-data/wnv*
If set, builds will be deployed to s3://nextstrain-staging/wnv_trials_<trial_name>_*
required: false
type: string
sequences_url:
description: |
URL for the sequences.fasta.zst file
If not provided, will use default sequences_url from phylogenetic/defaults/config.yaml
required: false
type: string
metadata_url:
description: |
URL for the metadata.tsv.zst file
If not provided, will use default metadata_url from phylogenetic/defaults/config.yaml
required: false
type: string

jobs:
set_config_overrides:
runs-on: ubuntu-latest
steps:
- id: config
name: Set config overrides
env:
TRIAL_NAME: ${{ inputs.trial_name }}
SEQUENCES_URL: ${{ inputs.sequences_url }}
METADATA_URL: ${{ inputs.metadata_url }}
run: |
config=""
if [[ "$TRIAL_NAME" ]]; then
config+=" deploy_url='s3://nextstrain-staging/wnv_trials_"$TRIAL_NAME"_'"
fi
if [[ "$SEQUENCES_URL" ]]; then
config+=" sequences_url='"$SEQUENCES_URL"'"
fi
if [[ "$METADATA_URL" ]]; then
config+=" metadata_url='"$METADATA_URL"'"
fi
if [[ $config ]]; then
config="--config $config"
fi
echo "config=$config" >> "$GITHUB_OUTPUT"
outputs:
config_overrides: ${{ steps.config.outputs.config }}

phylogenetic:
needs: [set_config_overrides]
permissions:
id-token: write
uses: nextstrain/.github/.github/workflows/pathogen-repo-build.yaml@master
secrets: inherit
with:
# Starting with the default docker runtime
# We can migrate to AWS Batch when/if we need to for more resources or if
# the job runs longer than the GH Action limit of 6 hours.
runtime: docker
env: |
NEXTSTRAIN_DOCKER_IMAGE: ${{ inputs.image }}
CONFIG_OVERRIDES: ${{ needs.set_config_overrides.outputs.config_overrides }}
run: |
nextstrain build \
phylogenetic \
deploy_all \
--configfile build-configs/nextstrain-automation/config.yaml \
$CONFIG_OVERRIDES
# Specifying artifact name to differentiate ingest build outputs from
# the phylogenetic build outputs
artifact-name: phylogenetic-build-output
artifact-paths: |
phylogenetic/auspice/
phylogenetic/results/
phylogenetic/benchmarks/
phylogenetic/logs/
phylogenetic/.snakemake/log/
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ data/
results/
logs/
benchmarks/
*RAxML*

# Sensitive environment variables
environment*
Expand Down
8 changes: 2 additions & 6 deletions ingest/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,12 @@ workdir: workflow.current_basedir
# Use default configuration values. Override with Snakemake's --configfile/--config options.
configfile: "defaults/config.yaml"

serotypes = ["all"]
wildcard_constraints:
serotype = "|".join(serotypes)

# This is the default rule that Snakemake will run when there are no specified targets.
# The default output of the ingest workflow is usually the curated metadata and sequences.
rule all:
input:
sequences=expand("results/sequences_{serotype}.fasta", serotype=serotypes),
metadata=expand("results/metadata_{serotype}.tsv", serotype=serotypes),
sequences="results/sequences.fasta",
metadata="results/metadata.tsv",

# Include smk files that contain the core steps necessary for building the curated metadata and sequence files.
# If there are build-specific customizations, they should be added with the
Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,9 @@ cloudfront_domain: "data.nextstrain.org"

# Nextstrain AWS S3 Bucket with pathogen prefix
# Replace <pathogen> with the pathogen repo name.
s3_dst: "s3://nextstrain-data/files/workflows/<pathogen>"
s3_dst: "s3://nextstrain-data/files/workflows/wnv"

# Mapping of files to upload
files_to_upload:
ncbi.ndjson.zst: data/ncbi.ndjson
metadata.tsv.zst: results/metadata.tsv
sequences.fasta.zst: results/sequences.fasta
alignments.fasta.zst: results/alignment.fasta
translations.zip: results/translations.zip
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ rule upload_to_s3:
cloudfront_domain=config["cloudfront_domain"],
shell:
"""
./vendored/upload-to-s3 \
./scripts/upload-to-s3 \
{params.quiet} \
{input.file_to_upload:q} \
{params.s3_dst:q}/{wildcards.remote_file:q} \
Expand Down
Loading

0 comments on commit 10ff0af

Please sign in to comment.