AlexsLemonade · jashapiro · Jun 4, 2024 · Jun 4, 2024 · Jun 4, 2024 · Jun 4, 2024
diff --git a/.github/workflows/run-batch.yml b/.github/workflows/run-batch.yml
@@ -17,6 +17,13 @@ on:
           - simulated
           - scpca
           - full
+      output_mode:
+        description: Workflow output mode
+        type: choice
+        default: staging
+        options:
+          - staging
+          - prod
 
 permissions:
   id-token: write # This is required for requesting the JWT
@@ -45,10 +52,13 @@ jobs:
           revision: ${{ github.event_name == 'push' && github.ref_name || inputs.revision }}
           # default run mode is full for release events, otherwise use the specified mode
           run_mode: ${{ github.event_name == 'push' && 'full' || inputs.run_mode }}
+          # default output mode is prod for release events, otherwise use the specified mode
+          output_mode: ${{ github.event_name == 'push' && 'prod' || inputs.output_mode }}
         run: |
           echo '#!/bin/bash' > scripts/tmux_launch.sh
           echo "export GITHUB_TAG=$revision" >> scripts/tmux_launch.sh
           echo "export RUN_MODE=$run_mode" >> scripts/tmux_launch.sh
+          echo "export OUTPUT_MODE=$output_mode" >> scripts/tmux_launch.sh
           echo 'tmux new-session -d -s nextflow /opt/nextflow/scripts/run_nextflow.sh' >> scripts/tmux_launch.sh
           chmod +x scripts/tmux_launch.sh
 

diff --git a/README.md b/README.md
@@ -10,7 +10,7 @@ The workflow is currently set up to run best via AWS batch, but some testing may
 You will need to have appropriate AWS credentials set up to run the workflow on AWS and access the data files.
 In general, you must have `workload` access in an OpenScPCA AWS account to run the workflow.
 
-### Running the workflow from GitHub Actions
+### Running the workflow using GitHub Actions
 
 The most common way to run the workflow will be to run the GitHub Action (GHA) responsible for running the workflow.
 The GHA is run automatically when a new release tag is created or by manually triggering the workflow.
@@ -26,14 +26,26 @@ The GHA workflow will run automatically when a new release tag is created, which
 3. Run the main workflow using the real ScPCA data.
 4. Upload all Nextflow logs, traces, and html run reports to `s3://openscpca-nf-data/logs/full/`, organized by date.
 
-Alternatively, manual launches of the GHA workflow can be triggered by a [`workflow_dispatch` trigger](https://github.com/AlexsLemonade/OpenScPCA-nf/actions/workflows/run-batch.yml), which will allow you to specify a specific run mode.
+Alternatively, manual launches of the GHA workflow can be triggered by a [`workflow_dispatch` trigger](https://github.com/AlexsLemonade/OpenScPCA-nf/actions/workflows/run-batch.yml), which will allow you to specify the run and output modes.
+
 The run modes available are:
 
 - `test`: runs only a simple test workflow to check configuration
 - `simulated`: runs the workflow using simulated data
 - `scpca`: runs the workflow using the current ScPCA data release
 - `full`: simulates data based on the current ScPCA data release, then runs the workflow using the simulated data and current ScPCA data release (this is the same as the behavior of the automatic release workflow)
 
+By default, the output mode will be set to `staging`, so all outputs will be saved to S3 buckets that are not shared with users and cannot overwrite current production data.
+With the `prod` output mode, results will be accessible visible to users.
-With the `prod` output mode, results will be accessible visible to users.
+With the `prod` output mode, results will be accessible to users.
-With the `prod` output mode, results will be accessible visible to users.
+With the `prod` output mode, results will be accessible to users.
+`prod` output mode should be used for versioned releases of the workflow, and when running on new ScPCA data releases.
-With the `prod` output mode, results will be accessible visible to users.
+With the `prod` output mode, results will be accessible to users.
-With the `prod` output mode, results will be accessible visible to users.
+With the `prod` output mode, results will be accessible to users.
+`prod` output mode should be used for versioned releases of the workflow, and when running on new ScPCA data releases.
+
+The following buckets are used for each output mode.
+
+| bucket description         | `staging`                                      | `prod`                                               |
+| -------------------------- | ---------------------------------------------- | ---------------------------------------------------- |
+| simulated test data        | `s3://openscpca-test-data-release-staging`     | `s3://openscpca-test-data-release-public-access`     |
+| simulated workflow results | `s3://openscpca-test-workflow-results-staging` | `s3://openscpca-test-workflow-results-public-access` |
+| scpca workflow results     | `s3://openscpca-nf-workflow-results-staging`   | `s3://openscpca-nf-workflow-results`                 |
+
 For each run, all Nextflow logs, traces, and html run reports will be uploaded to `s3://openscpca-nf-data/logs/{run_mode}/`, organized by date of the run.
 
 ### Running the workflow manually

diff --git a/modules/simulate-sce/main.nf b/modules/simulate-sce/main.nf
@@ -5,13 +5,12 @@
 
 
 // module parameters
-params.sim_pubdir = 's3://openscpca-test-data-release-public-access/test'
 params.simulate_sce_container = 'public.ecr.aws/openscpca/simulate-sce:latest'
 
 process permute_metadata {
   container params.simulate_sce_container
   tag "$project_id"
-  publishDir "${params.sim_pubdir}/${project_id}", mode: 'copy'
+  publishDir "${params.sim_bucket}/${params.release_prefix}/${project_id}", mode: 'copy'
   input:
     tuple val(project_id),
           path(metadata_file, stageAs: 'input/*')
@@ -35,7 +34,7 @@ process simulate_sample {
   container params.simulate_sce_container
   label "mem_8"
   tag "$project_id-$sample_id"
-  publishDir "${params.sim_pubdir}/${project_id}", mode: 'copy'
+  publishDir "${params.sim_bucket}/${params.release_prefix}/${project_id}", mode: 'copy'
   input:
     tuple val(project_id),
           val(sample_id),
@@ -57,17 +56,17 @@ process simulate_sample {
   stub:
     """
     mkdir ${sample_id}
-    for f in ${rds_files}; do
-      touch ${sample_id}/\$(basename \$f)
-      touch ${sample_id}/\$(basename \${f%.rds}.h5ad)
+    for file in ${rds_files}; do
+      touch "${sample_id}/\$(basename \$file)"
+      touch "${sample_id}/\$(basename \${file%.rds}.h5ad)"
     done
     """
 }
 
 process permute_bulk{
   container params.simulate_sce_container
   tag "$project_id"
-  publishDir "${params.sim_pubdir}/${project_id}", mode: 'copy'
+  publishDir "${params.sim_bucket}/${params.release_prefix}/${project_id}", mode: 'copy'
   input:
     tuple val(project_id),
           path(bulk_quant, stageAs: 'input/*'),

diff --git a/nextflow.config b/nextflow.config
@@ -11,11 +11,12 @@ manifest {
 
 nextflow.enable.moduleBinaries = true
 
-// global parameters for workflows
+// global default parameters for workflows: output buckets are set to staging by default
 params {
   release_prefix = "2024-05-01"
   release_bucket = "s3://openscpca-data-release"
-  results_bucket = "s3://openscpca-nf-workflow-results"
+  results_bucket = "s3://openscpca-nf-workflow-results-staging"
+  sim_bucket = "s3://openscpca-test-data-release-staging"
   project = "all"
 }
 
@@ -31,7 +32,20 @@ profiles {
     docker.enabled = true
     docker.userEmulation = true
   }
+  prod {
+    params {
+      results_bucket = "s3://openscpca-nf-workflow-results"
+      sim_bucket = "s3://openscpca-test-data-release-public-access"
+    }
+  }
   simulated {
+    params {
+      release_prefix = "test"
+      release_bucket = "s3://openscpca-test-data-release-staging"
+      results_bucket = "s3://openscpca-test-workflow-results-staging"
+    }
+  }
+  prod_simulated {
     params {
       release_prefix = "test"
       release_bucket = "s3://openscpca-test-data-release-public-access"
@@ -49,7 +63,7 @@ profiles {
       release_prefix = "test"
       release_bucket = "s3://openscpca-test-data-release-public-access" // test bucket
       results_bucket = "test/results" // local output
-      sim_pubdir = "test/simulated" // local output
+      sim_bucket = "test/simulated" // local output
       project = "SCPCP000012" // a small project
     }
   }

diff --git a/scripts/run_nextflow.sh b/scripts/run_nextflow.sh
@@ -1,21 +1,33 @@
 #!/bin/bash
 set -u
 
-# Run the OpenScPCA Nextflow pipeline with options to specify the run mode
-# Available run modes are:
+# Run the OpenScPCA Nextflow pipeline with options to specify the run mode and output
+#
+# Available RUN_MODE values are:
 #   test:      run the test workflow only
 #   simulated: run the main workflow with simulated data
 #   scpca:     run the main workflow with real data from ScPCA
-#   full:      run the data simulation workflow,
-#              followed by the main pipeline with both simulated and real data
+#   full:      run the data simulation workflow, followed
+#              by the main pipeline with both simulated and real data
+#
+# OUTPUT_MODE is either `staging` or `prod`, and determines which buckets are used for output
 
 GITHUB_TAG=${GITHUB_TAG:-main}
 RUN_MODE=${RUN_MODE:-test}
+OUTPUT_MODE=${OUTPUT_MODE:-staging}
 
-profile="batch"
 date=$(date "+%Y-%m-%d")
 datetime=$(date "+%Y-%m-%dT%H%M")
 
+profile="batch"
+sim_profile="${profile},simulated"
+# Add prod profiles if output is set to prod
+if [ "$OUTPUT_MODE" == "prod" ]; then
+  profile="${profile},prod"
+  sim_profile="${profile},prod_simulated"
+fi
+
+
 cd /opt/nextflow
 nextflow pull AlexsLemonade/OpenScPCA-nf -revision $GITHUB_TAG
 
@@ -55,7 +67,7 @@ fi
 if [ "$RUN_MODE" == "simulated" ] || [ "$RUN_MODE" == "full" ]; then
   nextflow run AlexsLemonade/OpenScPCA-nf \
     -revision $GITHUB_TAG \
-    -profile "${profile},simulated" \
+    -profile "$sim_profile" \
     -with-report ${datetime}_simulated_report.html \
     -with-trace  ${datetime}_simulated_trace.txt