Merge pull request #174 from TARGENE/allofus
Allofus - Adapt targene for use on the All of Us Researcher Workbench
roskamsh authored Nov 13, 2024
2 parents 4c3e011 + 29d42bb commit 080867e
Showing 32 changed files with 776 additions and 35 deletions.
1 change: 1 addition & 0 deletions .github/workflows/CI.yml
@@ -19,6 +19,7 @@ jobs:
- "ukb_estimands_file.jl"
- "custom_cohort_flat_config.jl"
- "ukb_interactions_group_config.jl"
- "empty_qq.jl"
- "make_dataset.jl"
- "null_simulation.jl"
- "realistic_simulation.jl"
1 change: 1 addition & 0 deletions .gitignore
@@ -14,6 +14,7 @@ docs/Manifest.toml
*.DS_Store
trace.txt*
trace.txt
trace*txt
simulation/
null_simulation/
causal_gap_test/
61 changes: 61 additions & 0 deletions conf/allofus.config
@@ -0,0 +1,61 @@
includeConfig 'base.config'
includeConfig 'container.config'

// Define the number of CPUs based on the memory requested
def calculateCpus(memory) {
    def mem
    if (memory instanceof nextflow.util.MemoryUnit) {
        mem = memory.toMega()
    } else {
        error "Memory not formatted correctly. Please specify memory as a MemoryUnit."
    }
    // Google Cloud Platform (GCP) allows a maximum of 6 GB (6144 MB) of memory per CPU
    // Compute the number of CPUs accordingly (minimum of 1 CPU)
    def cpus = Math.max(1, Math.ceil(mem / 6144).intValue())

    // GCP requires an even number of CPUs for tasks requesting more than 1 CPU
    if (cpus > 1) {
        cpus = cpus + (cpus % 2)
    }

    return cpus
}

process {
    // Override CPUs based on task memory
    cpus = { calculateCpus(task.memory) }

    withLabel: multithreaded {
        cpus = { calculateCpus(task.memory) }
    }

    withLabel: bigmem {
        cpus = { calculateCpus(task.memory) }
    }

    // Set a Google Cloud-appropriate error strategy: retry on transient failures, otherwise finish
    errorStrategy = { task.exitStatus in [143,137,104,134,139,14,140,151] ? 'retry' : 'finish' }
    maxRetries = 3

    // Base container required when running through Docker only
    container = "us-central1-docker.pkg.dev/all-of-us-rw-prod/aou-rw-gar-remote-repo-docker-prod/${base_image_version}"

    withLabel: simulation_image {
        container = "us-central1-docker.pkg.dev/all-of-us-rw-prod/aou-rw-gar-remote-repo-docker-prod/${simulation_image_version}"
    }
    withLabel: targenecore_image {
        container = "us-central1-docker.pkg.dev/all-of-us-rw-prod/aou-rw-gar-remote-repo-docker-prod/${targenecore_image_version}"
    }
    withLabel: ukb_image {
        container = "us-central1-docker.pkg.dev/all-of-us-rw-prod/aou-rw-gar-remote-repo-docker-prod/${ukb_image_version}"
    }
    withLabel: tmle_image {
        container = "us-central1-docker.pkg.dev/all-of-us-rw-prod/aou-rw-gar-remote-repo-docker-prod/${tmle_image_version}"
    }
    withLabel: plink_image {
        container = "us-central1-docker.pkg.dev/all-of-us-rw-prod/aou-rw-gar-remote-repo-docker-prod/${plink_image_version}"
    }
    withLabel: pca_image {
        container = "us-central1-docker.pkg.dev/all-of-us-rw-prod/aou-rw-gar-remote-repo-docker-prod/${pca_image_version}"
    }
}
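
As an illustration of the rounding behaviour, here is a standalone Groovy sketch of the same logic using plain megabyte values (the config version above takes a `nextflow.util.MemoryUnit`; the sample values below are ours, not from the pipeline):

```groovy
// Standalone sketch of the CPU calculation above: megabytes in, CPU count out
def cpusForMegabytes(int mem) {
    // At most 6144 MB of memory per CPU, with a minimum of 1 CPU
    def cpus = Math.max(1, Math.ceil(mem / 6144).intValue())
    // Round odd counts up to the next even number, as GCP requires
    if (cpus > 1) {
        cpus += (cpus % 2)
    }
    return cpus
}

assert cpusForMegabytes(4096)  == 1 // 4 GB  -> 1 CPU
assert cpusForMegabytes(10240) == 2 // 10 GB -> ceil(1.67) = 2 CPUs
assert cpusForMegabytes(26624) == 6 // 26 GB -> ceil(4.33) = 5, rounded up to 6
```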
22 changes: 15 additions & 7 deletions conf/container.config
@@ -1,20 +1,28 @@
simulation_image_version = "olivierlabayle/targene-simulations:0.0"
targenecore_image_version = "olivierlabayle/tl-core:0.10"
ukb_image_version = "olivierlabayle/ukbmain:0.5"
tmle_image_version = "olivierlabayle/targeted-estimation:0.10"
plink_image_version = "olivierlabayle/plink2:0.1.0"
pca_image_version = "roskamsh/flashpca:0.1.0"
base_image_version = "roskamsh/commandlinetools:0.1.1"

process {
withLabel: simulation_image {
container = "olivierlabayle/targene-simulations:0.0"
container = simulation_image_version
}
withLabel: targenecore_image {
container = "olivierlabayle/tl-core:0.10"
container = targenecore_image_version
}
withLabel: ukb_image {
container = "olivierlabayle/ukbmain:0.5"
container = ukb_image_version
}
withLabel: tmle_image {
container = "olivierlabayle/targeted-estimation:0.10"
container = tmle_image_version
}
withLabel: plink_image {
container = "olivierlabayle/plink2:0.1.0"
container = plink_image_version
}
withLabel: pca_image {
container = "ktetleycampbell/flashpca:1.0"
container = pca_image_version
}
}
}
1 change: 1 addition & 0 deletions docs/make.jl
@@ -20,6 +20,7 @@ makedocs(;
joinpath("examples", "gwas.md"),
joinpath("examples", "phewas.md"),
joinpath("examples", "interactions.md"),
joinpath("examples", "allofus.md")
],
"User Guide" => [
"overview.md",
64 changes: 64 additions & 0 deletions docs/src/examples/allofus.md
@@ -0,0 +1,64 @@
# All of Us

Analyses using genetic data within the All of Us (AoU) Researcher Workbench must be run under `Controlled Tier Access` (see [`Data Access`](https://www.researchallofus.org/data-tools/data-access/)). Workspaces launched within this tier automatically have Nextflow installed and can use TarGene immediately.

Each Workspace is assigned a Google Cloud storage bucket, which can be found on the right-hand panel of the `About` page of your Workspace. Each Workspace also contains Google Cloud-specific credentials for submitting jobs via the Google Life Sciences API. These can be found in your Workspace-specific Nextflow profile, located at `~/.nextflow/config`, and are activated with the flag `-profile gls` when you run Nextflow. TarGene requires some additional configuration to run on the AoU Researcher Workbench; this is built into the `allofus` profile, which can be combined with your Workspace-specific `gls` configuration to batch out jobs when running TarGene on this platform.
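
For orientation, a minimal sketch of what such a Workspace-specific `gls` profile might contain (all values here are placeholders; the Workbench generates the real contents of `~/.nextflow/config`):

```conf
profiles {
    gls {
        // Placeholder values: the Workbench generates the real ones
        process.executor = 'google-lifesciences'
        google.project   = 'your-google-project'
        google.zone      = 'us-central1-a'
        workDir          = 'gs://your-workspace-bucket/nextflow-work'
    }
}
```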

We recommend running this by first entering a `Cloud Analysis Terminal` on your current Workspace, creating a configuration for the analysis you would like to run, and running TarGene in a screen session, as sketched below. See [`Workflows in the All of Us Researcher Workbench`](https://support.researchallofus.org/hc/en-us/articles/4811899197076-Workflows-in-the-All-of-Us-Researcher-Workbench-Nextflow-and-Cromwell) for more information.
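
A minimal sketch of that workflow (the session name is arbitrary):

```bash
screen -S targene   # start a named screen session
# ... launch TarGene inside the session, then detach with Ctrl-A D ...
screen -r targene   # reattach later to check on the run
```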

A minimal run configuration for a flat-config TarGene analysis on the AoU Researcher Workbench looks like the following:

```conf
params {
    COHORT = "ALLOFUS"
    ESTIMANDS_CONFIG = "allofus_config.yaml"
    // All of Us srWGS data
    BGEN_FILE = "gs://fc-aou-datasets-controlled/v7/wgs/short_read/snpindel/clinvar_v7.1/bgen/clinvar.chr{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22}.{bgen,sample,bgen.bgi}"
    BED_FILES = "gs://fc-aou-datasets-controlled/v7/wgs/short_read/snpindel/clinvar_v7.1/plink_bed/clinvar.chr{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22}.{bed,bim,fam}"
    TRAITS_DATASET = "allofus_traits.csv"
}
```

Apart from the data-related parameters, there are two main parameters here: `ESTIMANDS_CONFIG` and `ESTIMATORS_CONFIG`. These describe the estimands (questions of interest) and how to estimate them, respectively.

The `ESTIMANDS_CONFIG` here follows the same format as the `flat` configuration detailed in the [PheWAS](@ref) section. Here we estimate the Average Treatment Effect (ATE) of the FTO variant (encoded as chr16:53767042:T:C in the BGEN data for the AoU cohort) on the traits present in our `allofus_traits.csv` file. We have also added `Sex at birth` as a covariate.

```yaml
type: flat

estimands:
- type: ATE

variants:
- chr16:53767042:T:C

outcome_extra_covariates:
- "Sex at birth"
```

The optional `outcome_extra_covariates` are variables used as extra predictors of the outcome (but not as confounders). This information must be contained in the `allofus_traits.csv` file, along with your outcomes of interest.

The `allofus_traits.csv` might look as follows:

| SAMPLE_ID | Sex at birth | Height (cm) |
|-----------|--------------|-------------|
| 100000 | Male | 180 |
| 100002 | Female | 165 |
| 100004 | Male | 175 |
| 100010 | Female | 160 |
| ... | ... | ... |

Here we have not specified a value for `ESTIMATORS_CONFIG`, so the default, `ESTIMATORS_CONFIG = "wtmle-ose--tunedxgboost"`, will be used. This defines the estimation strategy; specifically, a Targeted Minimum Loss-based Estimator (TMLE) as well as a One-Step Estimator (OSE), with a tuned XGBoost model to learn the outcome models (``Q_Y``) and propensity scores (``G``).
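
If you would rather override the default, set the parameter explicitly in your run configuration, for example with the GLM-based strategy used in the GWAS example:

```conf
params {
    ESTIMATORS_CONFIG = "wtmle--glm"
}
```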

TarGene can then be run on the AoU Researcher Workbench as follows:

```bash
nextflow run https://github.com/TARGENE/targene-pipeline -r v0.11.1 -profile gls,allofus
```

By default, this will generate results in the `results/` directory in your `Cloud Analysis Terminal`. Once complete, you can upload these to your Workspace bucket using the following command:

```bash
gsutil -m -u $GOOGLE_PROJECT cp -r results/* gs://path/to/workspace/bucket
```
2 changes: 1 addition & 1 deletion docs/src/examples/gwas.md
@@ -52,5 +52,5 @@ Finally, the `ESTIMATORS_CONFIG = "wtmle--glm"` defines the estimation strategy.
The GWAS can then be run as follows:

```bash
nextflow run https://github.com/TARGENE/targene-pipeline -r v0.11.0 -profile local
nextflow run https://github.com/TARGENE/targene-pipeline -r v0.11.1 -profile local
```
2 changes: 1 addition & 1 deletion docs/src/examples/interactions.md
@@ -55,5 +55,5 @@ params {
As usual, the pipeline can then be run as follows:

```bash
nextflow run https://github.com/TARGENE/targene-pipeline -r v0.11.0 -profile local
nextflow run https://github.com/TARGENE/targene-pipeline -r v0.11.1 -profile local
```
2 changes: 1 addition & 1 deletion docs/src/examples/phewas.md
@@ -46,5 +46,5 @@ params {
And the command-line to be run:

```bash
nextflow run https://github.com/TARGENE/targene-pipeline -r v0.11.0 -profile local
nextflow run https://github.com/TARGENE/targene-pipeline -r v0.11.1 -profile local
```
2 changes: 1 addition & 1 deletion docs/src/index.md
@@ -70,7 +70,7 @@ nextflow run https://github.com/TARGENE/targene-pipeline/ -r TARGENE_VERSION -c

where:

- `TARGENE_VERSION` is the latest TarGene version, e.g. `v0.11.0`.
- `TARGENE_VERSION` is the latest TarGene version, e.g. `v0.11.1`.
- `CONFIG_FILE` is a plain Nextflow configuration file describing what you want to do. Writing this configuration file is the hard work that this documentation is all about! However, it need not be scary, and could be as simple as:

```conf
2 changes: 1 addition & 1 deletion docs/src/overview.md
@@ -10,7 +10,7 @@ nextflow run https://github.com/TARGENE/targene-pipeline/ -r TARGENE_VERSION -en

where:

- `TARGENE_VERSION` is the latest TarGene version, e.g. `v0.11.0`
- `TARGENE_VERSION` is the latest TarGene version, e.g. `v0.11.1`
- `WORKFLOW_NAME` is any of the [TarGene workflows](@ref "Project Configuration")
- `P` is an optional [Nextflow profile](https://www.nextflow.io/docs/latest/config.html) describing the computing platform (see [Platform Configuration](@ref)).

2 changes: 1 addition & 1 deletion docs/src/simulations/null_simulation.md
@@ -11,7 +11,7 @@ The goal of the Null Generating Process is to result in the theoretical null hypothesis
To run the null simulation, the `NULL_SIMULATION` entry should be added to the Nextflow command-line as follows

```bash
nextflow run https://github.com/TARGENE/targene-pipeline/ -r v0.11.0 -entry NULL_SIMULATION
nextflow run https://github.com/TARGENE/targene-pipeline/ -r v0.11.1 -entry NULL_SIMULATION
```

## Output
2 changes: 1 addition & 1 deletion docs/src/simulations/realistic_simulation.md
@@ -23,7 +23,7 @@ The second requirement for the simulation to be realistic is that the density es
To run the realistic simulation, the `REALISTIC_SIMULATION` entry should be added to the Nextflow command-line as follows

```bash
nextflow run https://github.com/TARGENE/targene-pipeline/ -r v0.11.0 -entry REALISTIC_SIMULATION
nextflow run https://github.com/TARGENE/targene-pipeline/ -r v0.11.1 -entry REALISTIC_SIMULATION
```

## Output
14 changes: 14 additions & 0 deletions docs/src/targene/data_sources.md
@@ -144,3 +144,17 @@ Additional optional UK-Biobank files for preprocessing and filtering are:

- `QC_FILE`: A path to the UK-Biobank SNP quality control [`ukb_snp_qc.txt`](https://biobank.ctsu.ox.ac.uk/crystal/refer.cgi?id=1955) file.
- `UKB_WITHDRAWAL_LIST`: A path to the withdrawal sample list to exclude removed participants from the study.

### All of Us Cohort

TarGene can now be run on the All of Us (AoU) Cohort through the AoU Researcher Workbench. A Workspace must first be created in order to run your analysis of interest (see [`Creating a Workspace`](https://support.researchallofus.org/hc/en-us/articles/30143658322836-Creating-a-Workspace)). Your Traits Dataset can then be built on the AoU Researcher Workbench through the interactive Dataset Builder tool (see [`Dataset Builder`](https://support.researchallofus.org/hc/en-us/articles/4556645124244-Building-a-Dataset-with-the-Dataset-Builder) for more information).

The dataset created using these tools must be tailored to pull the traits, confounders, and/or covariates of interest from the All of Us Cohort, subsetting to samples for which genetic data is available. As TarGene requires both PLINK BED and BGEN files, these must be available for the participants whose trait data is being pulled.

Using the Dataset Builder tool, you can create a `TRAITS_DATASET` that matches the requirements of the `COHORT = "CUSTOM"` mode. This requires your participant IDs (named `person_id` by default in AoU) to be contained within a column named `SAMPLE_ID`, and any covariates or confounders to be included in subsequent columns. Please ensure that there is only one value provided for each confounder or covariate.

Please note that the AoU cohort includes data compiled across Electronic Health Records, and therefore the same patient may have multiple entries for a given measurement (for example, Body Mass Index (BMI) measured across multiple GP appointments). This must be handled when configuring your Traits Dataset; for example, you may choose to pick the most recent measurement for a given participant. The AoU Researcher Workbench provides interactive Jupyter notebooks where Python can be leveraged to perform these kinds of operations.
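
As a sketch of what such a notebook step might look like (the file and column names here are hypothetical):

```python
import pandas as pd

# Hypothetical Dataset Builder export: one row per BMI measurement event
df = pd.read_csv("bmi_measurements.csv")  # columns: person_id, measurement_datetime, bmi

# Keep only the most recent measurement per participant
latest = (df.sort_values("measurement_datetime")
            .groupby("person_id", as_index=False)
            .last())

# TarGene's CUSTOM cohort mode expects participant IDs in a SAMPLE_ID column
latest = latest.rename(columns={"person_id": "SAMPLE_ID"}).drop(columns=["measurement_datetime"])
latest.to_csv("allofus_traits.csv", index=False)
```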

The AoU cohort provides some smaller callsets derived from WGS data, for which both PLINK BED and BGEN files are available (termed `srWGS callsets`). We recommend using these with TarGene if you decide to run analyses using the AoU cohort. For more information about these callsets, see [`Short Read WGS Callsets`](https://support.researchallofus.org/hc/en-us/articles/14929793660948-Smaller-Callsets-for-Analyzing-Short-Read-WGS-SNP-Indel-Data-with-Hail-MT-VCF-and-PLINK) and [`Controlled CDR directory`](https://support.researchallofus.org/hc/en-us/articles/4616869437204-Controlled-CDR-Directory).

Please be aware that the ACAF-thresholded callset is extremely large and may result in longer runtimes. TarGene runs in parallel via the Google Life Sciences API on the AoU Researcher Workbench; since this is a cloud-based platform, each task runs on its own virtual machine (VM), and all data relevant to a task must be copied onto that VM before the task can execute. This results in a large amount of copying overhead, as symbolic links cannot be leveraged between tasks.
2 changes: 1 addition & 1 deletion docs/src/targene/overview.md
@@ -17,7 +17,7 @@ An overview of the workflow is presented in the following diagram.
## Example Run Command

```bash
nextflow run https://github.com/TARGENE/targene-pipeline/ -r v0.11.0 -profile local -resume
nextflow run https://github.com/TARGENE/targene-pipeline/ -r v0.11.1 -profile local -resume
```

We now describe step by step how to setup a TarGene run configuration.
2 changes: 1 addition & 1 deletion modules/confounders.nf
@@ -102,4 +102,4 @@ process FlashPCA {
script:
input_prefix = bedfiles[0].toString().minus('.bed')
"/home/flashpca-user/flashpca/flashpca --bfile ${input_prefix} --ndim ${params.NB_PCS} --numthreads ${task.cpus} --suffix .${genotypes_id}.txt"
}
}
3 changes: 1 addition & 2 deletions modules/estimation.nf
@@ -3,8 +3,7 @@ process TMLE {
label 'tmle_image'

input:
tuple path(dataset), path(estimands_file)
path estimator_file
tuple path(dataset), path(estimands_file), path(estimator_file)

output:
path "${hdf5out}"
27 changes: 26 additions & 1 deletion modules/utils.nf
@@ -24,4 +24,29 @@ def filepath_matches_chr_prefix(fp, chr_prefix){
def leave_chr_out(chr_prefix, bed_files){
    def bed_files_not_matching_chr_prefix = bed_files.findAll{ fp -> !filepath_matches_chr_prefix(fp, chr_prefix) }
    return [chr_prefix, bed_files_not_matching_chr_prefix]
}
}

def CreateEstimatorsConfigChannel(configValue) {
    estimators_ch = Channel.empty()
    // Ensure configValue is a list
    if (!(configValue instanceof List)) {
        configValue = [configValue]
    }

    // Iterate through each value of this list
    for (estimator in configValue) {
        def configFile = file(estimator)
        def alreadyCreatedFile = file("${params.OUTDIR}/${estimator}") // Check if created from previous run
        // If it's not an existing file, create an empty file with this name
        if (!configFile.exists() && !alreadyCreatedFile.exists()) {
            file(params.OUTDIR).mkdirs() // Create OUTDIR if it doesn't exist
            configFile = file("${params.OUTDIR}/${estimator}")
            configFile.text = '' // Create an empty file
        } else if (alreadyCreatedFile.exists()) {
            configFile = alreadyCreatedFile
        }
        estimators_ch = estimators_ch.mix(Channel.value(configFile))
    }

    return estimators_ch
}
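
For context, a sketch of how this helper is wired into the pipeline (the `TMLE` invocation matches the change in `subworkflows/estimation.nf` below; the surrounding channel names are assumptions):

```groovy
// Sketch: build the estimators channel from params and feed it to the TMLE process
estimators_config = CreateEstimatorsConfigChannel(params.ESTIMATORS_CONFIG)
tmle_results = TMLE(dataset_and_estimands.combine(estimators_config)).collect()
```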
4 changes: 4 additions & 0 deletions nextflow.config
@@ -24,4 +24,8 @@ profiles {
    singularity {
        includeConfig 'conf/singularity.config'
    }
    allofus {
        includeConfig 'conf/allofus.config'
    }
}

5 changes: 1 addition & 4 deletions subworkflows/estimation.nf
@@ -7,10 +7,7 @@ workflow EstimationWorkflow {

main:
// Run the estimation process for each estimands configuration
tmle_results = TMLE(
dataset_and_estimands,
estimators_config,
).collect()
tmle_results = TMLE(dataset_and_estimands.combine(estimators_config)).collect()

// Generate TarGene Outputs
GenerateOutputs(tmle_results)
13 changes: 13 additions & 0 deletions test/assets/empty_qq.yaml
@@ -0,0 +1,13 @@
type: flat

estimands:
- type: ATE

variants:
- 1:238411180:T:C

outcome_extra_covariates:
- "Skin colour"

extra_confounders:
- "Number of vehicles in household"