Merge pull request #174 from TARGENE/allofus
Allofus - Adapt targene for use on the All of Us Researcher Workbench
roskamsh authored Nov 13, 2024
2 parents 4c3e011 + 29d42bb commit 080867e
Showing 32 changed files with 776 additions and 35 deletions.
1 change: 1 addition & 0 deletions .github/workflows/CI.yml
@@ -19,6 +19,7 @@ jobs:
- "ukb_estimands_file.jl"
- "custom_cohort_flat_config.jl"
- "ukb_interactions_group_config.jl"
- "empty_qq.jl"
- "make_dataset.jl"
- "null_simulation.jl"
- "realistic_simulation.jl"
1 change: 1 addition & 0 deletions .gitignore
@@ -14,6 +14,7 @@ docs/Manifest.toml
*.DS_Store
trace.txt*
trace.txt
trace*txt
simulation/
null_simulation/
causal_gap_test/
61 changes: 61 additions & 0 deletions conf/allofus.config
@@ -0,0 +1,61 @@
includeConfig 'base.config'
includeConfig 'container.config'

// Define the number of CPUs based on the memory requested
def calculateCpus(memory) {
    def mem
    if (memory instanceof nextflow.util.MemoryUnit) {
        mem = memory.toMega()
    } else {
        error "Memory not formatted correctly. Please specify memory as a MemoryUnit."
    }
    // Google Cloud Platform (GCP) allows a maximum of 6 GB (6144 MB) of memory per CPU
    // Compute the number of CPUs accordingly (minimum of 1 CPU)
    def cpus = Math.max(1, Math.ceil(mem / 6144).intValue())

    // GCP requires an even number of CPUs for tasks requesting more than 1 CPU
    if (cpus > 1) {
        cpus = cpus + (cpus % 2)
    }

    return cpus
}

process {
    // Override CPUs based on task memory
    cpus = { calculateCpus(task.memory) }

    withLabel: multithreaded {
        cpus = { calculateCpus(task.memory) }
    }

    withLabel: bigmem {
        cpus = { calculateCpus(task.memory) }
    }

    // Set a Google Cloud-appropriate error strategy: retry on transient failures, otherwise finish
    errorStrategy = { task.exitStatus in [143,137,104,134,139,14,140,151] ? 'retry' : 'finish' }
    maxRetries = 3

    // Base container required when running through Docker only
    container = "us-central1-docker.pkg.dev/all-of-us-rw-prod/aou-rw-gar-remote-repo-docker-prod/${base_image_version}"

    withLabel: simulation_image {
        container = "us-central1-docker.pkg.dev/all-of-us-rw-prod/aou-rw-gar-remote-repo-docker-prod/${simulation_image_version}"
    }
    withLabel: targenecore_image {
        container = "us-central1-docker.pkg.dev/all-of-us-rw-prod/aou-rw-gar-remote-repo-docker-prod/${targenecore_image_version}"
    }
    withLabel: ukb_image {
        container = "us-central1-docker.pkg.dev/all-of-us-rw-prod/aou-rw-gar-remote-repo-docker-prod/${ukb_image_version}"
    }
    withLabel: tmle_image {
        container = "us-central1-docker.pkg.dev/all-of-us-rw-prod/aou-rw-gar-remote-repo-docker-prod/${tmle_image_version}"
    }
    withLabel: plink_image {
        container = "us-central1-docker.pkg.dev/all-of-us-rw-prod/aou-rw-gar-remote-repo-docker-prod/${plink_image_version}"
    }
    withLabel: pca_image {
        container = "us-central1-docker.pkg.dev/all-of-us-rw-prod/aou-rw-gar-remote-repo-docker-prod/${pca_image_version}"
    }
}
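
As an illustration of the rounding behaviour, here is a standalone Groovy sketch of the same logic using plain megabyte values (the config version above takes a `nextflow.util.MemoryUnit`; the sample values below are ours, not from the pipeline):

```groovy
// Standalone sketch of the CPU calculation above: megabytes in, CPU count out
def cpusForMegabytes(int mem) {
    // At most 6144 MB of memory per CPU, with a minimum of 1 CPU
    def cpus = Math.max(1, Math.ceil(mem / 6144).intValue())
    // Round odd counts up to the next even number, as GCP requires
    if (cpus > 1) {
        cpus += (cpus % 2)
    }
    return cpus
}

assert cpusForMegabytes(4096)  == 1 // 4 GB  -> 1 CPU
assert cpusForMegabytes(10240) == 2 // 10 GB -> ceil(1.67) = 2 CPUs
assert cpusForMegabytes(26624) == 6 // 26 GB -> ceil(4.33) = 5, rounded up to 6
```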
22 changes: 15 additions & 7 deletions conf/container.config
@@ -1,20 +1,28 @@
simulation_image_version = "olivierlabayle/targene-simulations:0.0"
targenecore_image_version = "olivierlabayle/tl-core:0.10"
ukb_image_version = "olivierlabayle/ukbmain:0.5"
tmle_image_version = "olivierlabayle/targeted-estimation:0.10"
plink_image_version = "olivierlabayle/plink2:0.1.0"
pca_image_version = "roskamsh/flashpca:0.1.0"
base_image_version = "roskamsh/commandlinetools:0.1.1"

process {
withLabel: simulation_image {
container = "olivierlabayle/targene-simulations:0.0"
container = simulation_image_version
}
withLabel: targenecore_image {
container = "olivierlabayle/tl-core:0.10"
container = targenecore_image_version
}
withLabel: ukb_image {
container = "olivierlabayle/ukbmain:0.5"
container = ukb_image_version
}
withLabel: tmle_image {
container = "olivierlabayle/targeted-estimation:0.10"
container = tmle_image_version
}
withLabel: plink_image {
container = "olivierlabayle/plink2:0.1.0"
container = plink_image_version
}
withLabel: pca_image {
container = "ktetleycampbell/flashpca:1.0"
container = pca_image_version
}
}
}
1 change: 1 addition & 0 deletions docs/make.jl
@@ -20,6 +20,7 @@ makedocs(;
joinpath("examples", "gwas.md"),
joinpath("examples", "phewas.md"),
joinpath("examples", "interactions.md"),
joinpath("examples", "allofus.md")
],
"User Guide" => [
"overview.md",
64 changes: 64 additions & 0 deletions docs/src/examples/allofus.md
@@ -0,0 +1,64 @@
# All of Us

Analyses using genetic data within the All of Us (AoU) Researcher Workbench must be run under `Controlled Tier Access` (see [`Data Access`](https://www.researchallofus.org/data-tools/data-access/)). Workspaces launched within this tier automatically have Nextflow installed and can use TarGene immediately.

Each Workspace is assigned a Google Cloud storage bucket, which can be found on the right-hand panel of the `About` page of your Workspace. Each Workspace also contains Google Cloud-specific credentials for submitting jobs via the Google Life Sciences API. These can be found in your Workspace-specific Nextflow profile, located at `~/.nextflow/config`, and are activated with the flag `-profile gls` when you run Nextflow. TarGene requires some additional configuration to run on the AoU Researcher Workbench; this is built into the `allofus` profile, which can be combined with your Workspace-specific `gls` configuration to batch out jobs when running TarGene on this platform.
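
For orientation, a minimal sketch of what such a Workspace-specific `gls` profile might contain (all values here are placeholders; the Workbench generates the real contents of `~/.nextflow/config`):

```conf
profiles {
    gls {
        // Placeholder values: the Workbench generates the real ones
        process.executor = 'google-lifesciences'
        google.project   = 'your-google-project'
        google.zone      = 'us-central1-a'
        workDir          = 'gs://your-workspace-bucket/nextflow-work'
    }
}
```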

We recommend running this by first entering a `Cloud Analysis Terminal` on your current Workspace, creating a configuration for the analysis you would like to run, and running TarGene in a screen session, as sketched below. See [`Workflows in the All of Us Researcher Workbench`](https://support.researchallofus.org/hc/en-us/articles/4811899197076-Workflows-in-the-All-of-Us-Researcher-Workbench-Nextflow-and-Cromwell) for more information.
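
A minimal sketch of that workflow (the session name is arbitrary):

```bash
screen -S targene   # start a named screen session
# ... launch TarGene inside the session, then detach with Ctrl-A D ...
screen -r targene   # reattach later to check on the run
```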

A minimal run configuration for a flat-config TarGene analysis on the AoU Researcher Workbench looks like the following:

```conf
params {
    COHORT = "ALLOFUS"
    ESTIMANDS_CONFIG = "allofus_config.yaml"
    // All of Us srWGS data
    BGEN_FILE = "gs://fc-aou-datasets-controlled/v7/wgs/short_read/snpindel/clinvar_v7.1/bgen/clinvar.chr{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22}.{bgen,sample,bgen.bgi}"
    BED_FILES = "gs://fc-aou-datasets-controlled/v7/wgs/short_read/snpindel/clinvar_v7.1/plink_bed/clinvar.chr{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22}.{bed,bim,fam}"
    TRAITS_DATASET = "allofus_traits.csv"
}
```

Apart from the data-related parameters, there are two main parameters here: `ESTIMANDS_CONFIG` and `ESTIMATORS_CONFIG`. These describe the estimands (questions of interest) and how to estimate them, respectively.

The `ESTIMANDS_CONFIG` here follows the same format as the `flat` configuration detailed in the [PheWAS](@ref) section. Here we estimate the Average Treatment Effect (ATE) of the FTO variant (encoded as chr16:53767042:T:C in the BGEN data for the AoU cohort) on the traits present in our `allofus_traits.csv` file. We have also added `Sex at birth` as a covariate.

```yaml
type: flat

estimands:
- type: ATE

variants:
- chr16:53767042:T:C

outcome_extra_covariates:
- "Sex at birth"
```

The optional `outcome_extra_covariates` are variables used as extra predictors of the outcome (but not as confounders). This information must be contained in the `allofus_traits.csv` file, along with your outcomes of interest.

The `allofus_traits.csv` might look as follows:

| SAMPLE_ID | Sex at birth | Height (cm) |
|-----------|--------------|-------------|
| 100000 | Male | 180 |
| 100002 | Female | 165 |
| 100004 | Male | 175 |
| 100010 | Female | 160 |
| ... | ... | ... |

Here we have not specified a value for `ESTIMATORS_CONFIG`, so the default, `ESTIMATORS_CONFIG = "wtmle-ose--tunedxgboost"`, will be used. This defines the estimation strategy; specifically, a Targeted Minimum Loss-based Estimator (TMLE) as well as a One-Step Estimator (OSE), with a tuned XGBoost model to learn the outcome models (``Q_Y``) and propensity scores (``G``).
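
If you would rather override the default, set the parameter explicitly in your run configuration, for example with the GLM-based strategy used in the GWAS example:

```conf
params {
    ESTIMATORS_CONFIG = "wtmle--glm"
}
```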

TarGene can then be run on the AoU Researcher Workbench as follows:

```bash
nextflow run https://github.com/TARGENE/targene-pipeline -r v0.11.1 -profile gls,allofus
```

By default, this will generate results in the `results/` directory in your `Cloud Analysis Terminal`. Once complete, you can upload these to your Workspace bucket using the following command:

```bash
gsutil -m -u $GOOGLE_PROJECT cp -r results/* gs://path/to/workspace/bucket
```
2 changes: 1 addition & 1 deletion docs/src/examples/gwas.md
@@ -52,5 +52,5 @@ Finally, the `ESTIMATORS_CONFIG = "wtmle--glm"` defines the estimation strategy.
The GWAS can then be run as follows:

```bash
nextflow run https://github.com/TARGENE/targene-pipeline -r v0.11.0 -profile local
nextflow run https://github.com/TARGENE/targene-pipeline -r v0.11.1 -profile local
```
2 changes: 1 addition & 1 deletion docs/src/examples/interactions.md
@@ -55,5 +55,5 @@ params {
As usual, the pipeline can then be run as follows:

```bash
nextflow run https://github.com/TARGENE/targene-pipeline -r v0.11.0 -profile local
nextflow run https://github.com/TARGENE/targene-pipeline -r v0.11.1 -profile local
```
2 changes: 1 addition & 1 deletion docs/src/examples/phewas.md
@@ -46,5 +46,5 @@ params {
And the command-line to be run:

```bash
nextflow run https://github.com/TARGENE/targene-pipeline -r v0.11.0 -profile local
nextflow run https://github.com/TARGENE/targene-pipeline -r v0.11.1 -profile local
```
2 changes: 1 addition & 1 deletion docs/src/index.md
@@ -70,7 +70,7 @@ nextflow run https://github.com/TARGENE/targene-pipeline/ -r TARGENE_VERSION -c

where:

- `TARGENE_VERSION` is the latest TarGene version, e.g. `v0.11.0`.
- `TARGENE_VERSION` is the latest TarGene version, e.g. `v0.11.1`.
- `CONFIG_FILE` is a plain Nextflow configuration file describing what you want to do. Writing this configuration file is the hard work that this documentation is all about! However, it need not be scary, and could be as simple as:

```conf
2 changes: 1 addition & 1 deletion docs/src/overview.md
@@ -10,7 +10,7 @@ nextflow run https://github.com/TARGENE/targene-pipeline/ -r TARGENE_VERSION -en

where:

- `TARGENE_VERSION` is the latest TarGene version, e.g. `v0.11.0`
- `TARGENE_VERSION` is the latest TarGene version, e.g. `v0.11.1`
- `WORKFLOW_NAME` is any of the [TarGene workflows](@ref "Project Configuration")
- `P` is an optional [Nextflow profile](https://www.nextflow.io/docs/latest/config.html) describing the computing platform (see [Platform Configuration](@ref)).

2 changes: 1 addition & 1 deletion docs/src/simulations/null_simulation.md
@@ -11,7 +11,7 @@ The goal of the Null Generating Process is to result in the theoretical null hypothesis
To run the null simulation, the `NULL_SIMULATION` entry should be added to the Nextflow command-line as follows

```bash
nextflow run https://github.com/TARGENE/targene-pipeline/ -r v0.11.0 -entry NULL_SIMULATION
nextflow run https://github.com/TARGENE/targene-pipeline/ -r v0.11.1 -entry NULL_SIMULATION
```

## Output
2 changes: 1 addition & 1 deletion docs/src/simulations/realistic_simulation.md
@@ -23,7 +23,7 @@ The second requirement for the simulation to be realistic is that the density es
To run the realistic simulation, the `REALISTIC_SIMULATION` entry should be added to the Nextflow command-line as follows

```bash
nextflow run https://github.com/TARGENE/targene-pipeline/ -r v0.11.0 -entry REALISTIC_SIMULATION
nextflow run https://github.com/TARGENE/targene-pipeline/ -r v0.11.1 -entry REALISTIC_SIMULATION
```

## Output
14 changes: 14 additions & 0 deletions docs/src/targene/data_sources.md
@@ -144,3 +144,17 @@ Additional optional UK-Biobank files for preprocessing and filtering are:

- `QC_FILE`: A path to the UK-Biobank SNP quality control [`ukb_snp_qc.txt`](https://biobank.ctsu.ox.ac.uk/crystal/refer.cgi?id=1955) file.
- `UKB_WITHDRAWAL_LIST`: A path to the withdrawal sample list to exclude removed participants from the study.

### All of Us Cohort

TarGene can now be run on the All of Us (AoU) Cohort through the AoU Researcher Workbench. A Workspace must first be created in order to run your analysis of interest (see [`Creating a Workspace`](https://support.researchallofus.org/hc/en-us/articles/30143658322836-Creating-a-Workspace)). Your Traits Dataset can then be built on the AoU Researcher Workbench through the interactive Dataset Builder tool (see [`Dataset Builder`](https://support.researchallofus.org/hc/en-us/articles/4556645124244-Building-a-Dataset-with-the-Dataset-Builder) for more information).

The dataset created using these tools must be tailored to pull the traits, confounders, and/or covariates of interest from the All of Us Cohort, subsetting to samples for which genetic data is available. As TarGene requires both PLINK BED and BGEN files, these must be available for the participants whose trait data is being pulled.

Using the Dataset Builder tool, you can create a `TRAITS_DATASET` that matches the requirements of the `COHORT = "CUSTOM"` mode. This requires your participant IDs (named `person_id` by default in AoU) to be contained within a column named `SAMPLE_ID`, and any covariates or confounders to be included in subsequent columns. Please ensure that there is only one value provided for each confounder or covariate.

Please note that the AoU cohort includes data compiled across Electronic Health Records, and therefore the same patient may have multiple entries for a given measurement (for example, Body Mass Index (BMI) measured across multiple GP appointments). This must be handled when configuring your Traits Dataset; for example, you may choose to pick the most recent measurement for a given participant. The AoU Researcher Workbench provides interactive Jupyter notebooks where Python can be leveraged to perform these kinds of operations.
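
As a sketch of what such a notebook step might look like (the file and column names here are hypothetical):

```python
import pandas as pd

# Hypothetical Dataset Builder export: one row per BMI measurement event
df = pd.read_csv("bmi_measurements.csv")  # columns: person_id, measurement_datetime, bmi

# Keep only the most recent measurement per participant
latest = (df.sort_values("measurement_datetime")
            .groupby("person_id", as_index=False)
            .last())

# TarGene's CUSTOM cohort mode expects participant IDs in a SAMPLE_ID column
latest = latest.rename(columns={"person_id": "SAMPLE_ID"}).drop(columns=["measurement_datetime"])
latest.to_csv("allofus_traits.csv", index=False)
```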

The AoU cohort provides some smaller callsets derived from WGS data, for which both PLINK BED and BGEN files are available (termed `srWGS callsets`). We recommend using these with TarGene if you decide to run analyses using the AoU cohort. For more information about these callsets, see [`Short Read WGS Callsets`](https://support.researchallofus.org/hc/en-us/articles/14929793660948-Smaller-Callsets-for-Analyzing-Short-Read-WGS-SNP-Indel-Data-with-Hail-MT-VCF-and-PLINK) and [`Controlled CDR directory`](https://support.researchallofus.org/hc/en-us/articles/4616869437204-Controlled-CDR-Directory).

Please be aware that the ACAF-thresholded callset is extremely large and may result in longer runtimes. TarGene runs in parallel via the Google Life Sciences API on the AoU Researcher Workbench; since this is a cloud-based platform, each task runs on its own virtual machine (VM), and all data relevant to a task must be copied onto that VM before the task can execute. This results in a large amount of copying overhead, as symbolic links cannot be leveraged between tasks.
2 changes: 1 addition & 1 deletion docs/src/targene/overview.md
@@ -17,7 +17,7 @@ An overview of the workflow is presented in the following diagram.
## Example Run Command

```bash
nextflow run https://github.com/TARGENE/targene-pipeline/ -r v0.11.0 -profile local -resume
nextflow run https://github.com/TARGENE/targene-pipeline/ -r v0.11.1 -profile local -resume
```

We now describe step by step how to setup a TarGene run configuration.
2 changes: 1 addition & 1 deletion modules/confounders.nf
@@ -102,4 +102,4 @@ process FlashPCA {
script:
input_prefix = bedfiles[0].toString().minus('.bed')
"/home/flashpca-user/flashpca/flashpca --bfile ${input_prefix} --ndim ${params.NB_PCS} --numthreads ${task.cpus} --suffix .${genotypes_id}.txt"
}
}
3 changes: 1 addition & 2 deletions modules/estimation.nf
@@ -3,8 +3,7 @@ process TMLE {
label 'tmle_image'

input:
tuple path(dataset), path(estimands_file)
path estimator_file
tuple path(dataset), path(estimands_file), path(estimator_file)

output:
path "${hdf5out}"
27 changes: 26 additions & 1 deletion modules/utils.nf
@@ -24,4 +24,29 @@ def filepath_matches_chr_prefix(fp, chr_prefix){
def leave_chr_out(chr_prefix, bed_files){
    def bed_files_not_matching_chr_prefix = bed_files.findAll{ fp -> !filepath_matches_chr_prefix(fp, chr_prefix) }
    return [chr_prefix, bed_files_not_matching_chr_prefix]
}
}

def CreateEstimatorsConfigChannel(configValue) {
    estimators_ch = Channel.empty()
    // Ensure configValue is a list
    if (!(configValue instanceof List)) {
        configValue = [configValue]
    }

    // Iterate through each value of this list
    for (estimator in configValue) {
        def configFile = file(estimator)
        def alreadyCreatedFile = file("${params.OUTDIR}/${estimator}") // Check if created from previous run
        // If it's not an existing file, create an empty file with this name
        if (!configFile.exists() && !alreadyCreatedFile.exists()) {
            file(params.OUTDIR).mkdirs() // Create OUTDIR if it doesn't exist
            configFile = file("${params.OUTDIR}/${estimator}")
            configFile.text = '' // Create an empty file
        } else if (alreadyCreatedFile.exists()) {
            configFile = alreadyCreatedFile
        }
        estimators_ch = estimators_ch.mix(Channel.value(configFile))
    }

    return estimators_ch
}
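
For context, a sketch of how this helper is wired into the pipeline (the `TMLE` invocation matches the change in `subworkflows/estimation.nf` below; the surrounding channel names are assumptions):

```groovy
// Sketch: build the estimators channel from params and feed it to the TMLE process
estimators_config = CreateEstimatorsConfigChannel(params.ESTIMATORS_CONFIG)
tmle_results = TMLE(dataset_and_estimands.combine(estimators_config)).collect()
```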
4 changes: 4 additions & 0 deletions nextflow.config
@@ -24,4 +24,8 @@ profiles {
    singularity {
        includeConfig 'conf/singularity.config'
    }
    allofus {
        includeConfig 'conf/allofus.config'
    }
}

5 changes: 1 addition & 4 deletions subworkflows/estimation.nf
@@ -7,10 +7,7 @@ workflow EstimationWorkflow {

main:
// Run the estimation process for each estimands configuration
tmle_results = TMLE(
dataset_and_estimands,
estimators_config,
).collect()
tmle_results = TMLE(dataset_and_estimands.combine(estimators_config)).collect()

// Generate TarGene Outputs
GenerateOutputs(tmle_results)
13 changes: 13 additions & 0 deletions test/assets/empty_qq.yaml
@@ -0,0 +1,13 @@
type: flat

estimands:
- type: ATE

variants:
- 1:238411180:T:C

outcome_extra_covariates:
- "Skin colour"

extra_confounders:
- "Number of vehicles in household"