diff --git a/.github/workflows/run-batch.yml b/.github/workflows/run-batch.yml index f9ec9ec..728e3aa 100644 --- a/.github/workflows/run-batch.yml +++ b/.github/workflows/run-batch.yml @@ -17,6 +17,13 @@ on: - simulated - scpca - full + output_mode: + description: Workflow output mode + type: choice + default: staging + options: + - staging + - prod permissions: id-token: write # This is required for requesting the JWT @@ -45,10 +52,13 @@ jobs: revision: ${{ github.event_name == 'push' && github.ref_name || inputs.revision }} # default run mode is full for release events, otherwise use the specified mode run_mode: ${{ github.event_name == 'push' && 'full' || inputs.run_mode }} + # default output mode is prod for release events, otherwise use the specified mode + output_mode: ${{ github.event_name == 'push' && 'prod' || inputs.output_mode }} run: | echo '#!/bin/bash' > scripts/tmux_launch.sh echo "export GITHUB_TAG=$revision" >> scripts/tmux_launch.sh echo "export RUN_MODE=$run_mode" >> scripts/tmux_launch.sh + echo "export OUTPUT_MODE=$output_mode" >> scripts/tmux_launch.sh echo 'tmux new-session -d -s nextflow /opt/nextflow/scripts/run_nextflow.sh' >> scripts/tmux_launch.sh chmod +x scripts/tmux_launch.sh diff --git a/README.md b/README.md index a6ab574..f68a7ab 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ The workflow is currently set up to run best via AWS batch, but some testing may You will need to have appropriate AWS credentials set up to run the workflow on AWS and access the data files. In general, you must have `workload` access in an OpenScPCA AWS account to run the workflow. -### Running the workflow from GitHub Actions +### Running the workflow using GitHub Actions The most common way to run the workflow will be to run the GitHub Action (GHA) responsible for running the workflow. The GHA is run automatically when a new release tag is created or by manually triggering the workflow. 
@@ -26,7 +26,8 @@ The GHA workflow will run automatically when a new release tag is created, which 3. Run the main workflow using the real ScPCA data. 4. Upload all Nextflow logs, traces, and html run reports to `s3://openscpca-nf-data/logs/full/`, organized by date. -Alternatively, manual launches of the GHA workflow can be triggered by a [`workflow_dispatch` trigger](https://github.com/AlexsLemonade/OpenScPCA-nf/actions/workflows/run-batch.yml), which will allow you to specify a specific run mode. +Alternatively, manual launches of the GHA workflow can be triggered by a [`workflow_dispatch` trigger](https://github.com/AlexsLemonade/OpenScPCA-nf/actions/workflows/run-batch.yml), which will allow you to choose the run and output modes. + The run modes available are: - `test`: runs only a simple test workflow to check configuration @@ -34,6 +35,17 @@ The run modes available are: - `scpca`: runs the workflow using the current ScPCA data release - `full`: simulates data based on the current ScPCA data release, then runs the workflow using the simulated data and current ScPCA data release (this is same as the behavior of the automatic release workflow) +By default, the output mode will be set to `staging`, so all outputs will be saved to S3 buckets that are not shared with users and cannot overwrite current production data. +With the `prod` output mode, results will be accessible to users. + +The following buckets are used for each output mode. 
+ +| bucket description | `staging` | `prod` | +| -------------------------- | ---------------------------------------------- | ---------------------------------------------------- | +| simulated test data | `s3://openscpca-test-data-release-staging` | `s3://openscpca-test-data-release-public-access` | +| simulated workflow results | `s3://openscpca-test-workflow-results-staging` | `s3://openscpca-test-workflow-results-public-access` | +| scpca workflow results | `s3://openscpca-nf-workflow-results-staging` | `s3://openscpca-nf-workflow-results` | + For each run, all Nextflow logs, traces, and html run reports will be uploaded to `s3://openscpca-nf-data/logs/{run_mode}/`, organized by date of the run. ### Running the workflow manually diff --git a/modules/simulate-sce/main.nf b/modules/simulate-sce/main.nf index c94333a..d91d8a3 100644 --- a/modules/simulate-sce/main.nf +++ b/modules/simulate-sce/main.nf @@ -5,13 +5,12 @@ // module parameters -params.sim_pubdir = 's3://openscpca-test-data-release-public-access/test' params.simulate_sce_container = 'public.ecr.aws/openscpca/simulate-sce:latest' process permute_metadata { container params.simulate_sce_container tag "$project_id" - publishDir "${params.sim_pubdir}/${project_id}", mode: 'copy' + publishDir "${params.sim_bucket}/${params.release_prefix}/${project_id}", mode: 'copy' input: tuple val(project_id), path(metadata_file, stageAs: 'input/*') @@ -35,7 +34,7 @@ process simulate_sample { container params.simulate_sce_container label "mem_8" tag "$project_id-$sample_id" - publishDir "${params.sim_pubdir}/${project_id}", mode: 'copy' + publishDir "${params.sim_bucket}/${params.release_prefix}/${project_id}", mode: 'copy' input: tuple val(project_id), val(sample_id), @@ -57,9 +56,9 @@ process simulate_sample { stub: """ mkdir ${sample_id} - for f in ${rds_files}; do - touch ${sample_id}/\$(basename \$f) - touch ${sample_id}/\$(basename \${f%.rds}.h5ad) + for file in ${rds_files}; do + touch "${sample_id}/\$(basename 
\$file)" + touch "${sample_id}/\$(basename \${file%.rds}.h5ad)" done """ } @@ -67,7 +66,7 @@ process simulate_sample { process permute_bulk{ container params.simulate_sce_container tag "$project_id" - publishDir "${params.sim_pubdir}/${project_id}", mode: 'copy' + publishDir "${params.sim_bucket}/${params.release_prefix}/${project_id}", mode: 'copy' input: tuple val(project_id), path(bulk_quant, stageAs: 'input/*'), diff --git a/nextflow.config b/nextflow.config index b9498d4..bb0087a 100644 --- a/nextflow.config +++ b/nextflow.config @@ -11,11 +11,12 @@ manifest { nextflow.enable.moduleBinaries = true -// global parameters for workflows +// global default parameters for workflows: output buckets are set to staging by default params { release_prefix = "2024-05-01" release_bucket = "s3://openscpca-data-release" - results_bucket = "s3://openscpca-nf-workflow-results" + results_bucket = "s3://openscpca-nf-workflow-results-staging" + sim_bucket = "s3://openscpca-test-data-release-staging" project = "all" } @@ -31,7 +32,20 @@ profiles { docker.enabled = true docker.userEmulation = true } + prod { + params { + results_bucket = "s3://openscpca-nf-workflow-results" + sim_bucket = "s3://openscpca-test-data-release-public-access" + } + } simulated { + params { + release_prefix = "test" + release_bucket = "s3://openscpca-test-data-release-staging" + results_bucket = "s3://openscpca-test-workflow-results-staging" + } + } + prod_simulated { params { release_prefix = "test" release_bucket = "s3://openscpca-test-data-release-public-access" @@ -49,7 +63,7 @@ profiles { release_prefix = "test" release_bucket = "s3://openscpca-test-data-release-public-access" // test bucket results_bucket = "test/results" // local output - sim_pubdir = "test/simulated" // local output + sim_bucket = "test/simulated" // local output project = "SCPCP000012" // a small project } } diff --git a/scripts/run_nextflow.sh b/scripts/run_nextflow.sh index ce9d2bd..6f585cc 100755 --- a/scripts/run_nextflow.sh 
+++ b/scripts/run_nextflow.sh @@ -1,21 +1,33 @@ #!/bin/bash set -u -# Run the OpenScPCA Nextflow pipeline with options to specify the run mode -# Available run modes are: +# Run the OpenScPCA Nextflow pipeline with options to specify the run mode and output mode +# +# Available RUN_MODE values are: # test: run the test workflow only # simulated: run the main workflow with simulated data # scpca: run the main workflow with real data from ScPCA -# full: run the data simulation workflow, -# followed by the main pipeline with both simulated and real data +# full: run the data simulation workflow, followed +# by the main pipeline with both simulated and real data +# +# OUTPUT_MODE is either `staging` or `prod`, and determines which buckets are used for output GITHUB_TAG=${GITHUB_TAG:-main} RUN_MODE=${RUN_MODE:-test} +OUTPUT_MODE=${OUTPUT_MODE:-staging} -profile="batch" date=$(date "+%Y-%m-%d") datetime=$(date "+%Y-%m-%dT%H%M") +profile="batch" +sim_profile="${profile},simulated" +# Add prod profiles if output is set to prod +if [ "$OUTPUT_MODE" == "prod" ]; then + profile="${profile},prod" + sim_profile="${profile},prod_simulated" +fi + + cd /opt/nextflow nextflow pull AlexsLemonade/OpenScPCA-nf -revision $GITHUB_TAG @@ -55,7 +67,7 @@ fi if [ "$RUN_MODE" == "simulated" ] || [ "$RUN_MODE" == "full" ]; then nextflow run AlexsLemonade/OpenScPCA-nf \ -revision $GITHUB_TAG \ - -profile "${profile},simulated" \ + -profile $sim_profile \ -with-report ${datetime}_simulated_report.html \ -with-trace ${datetime}_simulated_trace.txt