Skip to content

Commit

Permalink
Merge pull request #58 from TARGENE/brh_data_source
Browse files Browse the repository at this point in the history
Compatibility with custom dataset
  • Loading branch information
olivierlabayle authored Dec 4, 2023
2 parents a126a78 + 2f22712 commit 3f96860
Show file tree
Hide file tree
Showing 21 changed files with 652 additions and 52 deletions.
5 changes: 3 additions & 2 deletions .github/workflows/CI.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,9 @@ jobs:
fail-fast: false
matrix:
testrun:
- "from_actors.jl"
- "from_param_files.jl"
- "ukb_from_actors.jl"
- "ukb_from_param_file.jl"
- "custom_from_actors.jl"
steps:
- uses: actions/checkout@v2
- uses: julia-actions/setup-julia@v1
Expand Down
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,6 @@ LocalPreferences.toml
test/sandbox.jl
docs/Manifest.toml
*.DS_Store
trace.txt*
test/Manifest.toml
trace.txt
trace.txt
22 changes: 22 additions & 0 deletions conf/ci_jobs/custom_from_actors.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
params {
    // Build estimands from actor files (bQTLs / trans-actors) rather than an explicit parameter file.
    PARAMETER_PLAN = "FROM_ACTORS"
    PARAMETER_FILE = "test/data/parameters/parameters.yaml"
    // Trait data for the custom cohort; first column must be SAMPLE_ID (see docs/src/data_sources.md).
    DECRYPTED_DATASET = "test/data/traits.csv"
    // Any value other than "UKBB" skips the UKB-specific trait-extraction step.
    COHORT = "CUSTOM"
    // Per-chromosome PLINK BED and imputed BGEN file-pair globs.
    BED_FILES = "test/data/unphased_bed/ukb_chr{1,2,3}.{bed,bim,fam}"
    BGEN_FILES = "test/data/unphased_bgen/ukb_chr{1,2,3}.{bgen,bgen.bgi,sample}"
    // Number of principal components used as genetic confounders.
    NB_PCS = 6
    TRAITS_CONFIG = "test/data/ukbconfig_small.yaml"
    ESTIMATORFILE = "test/data/estimator.jl"
    NB_VAR_ESTIMATORS = 10
    PVAL_THRESHOLD = 1
    // Actor files carry a TF column because multiple transcription factors are tested in one run.
    BQTLS = "test/data/actors/bqtls_multiple_TFs.csv"
    TRANS_ACTORS = "test/data/actors/eqtls_multiple_TFs.csv"
    ORDERS = "1,2"
    POSITIVITY_CONSTRAINT = 0.01
    GRM_NSPLITS = 10
    BATCH_SIZE = 400
    MAX_PERMUTATION_TESTS = 100
    MAF_MATCHING_RELTOL = 0.9

}
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
params {
PARAMETER_PLAN = "FROM_ACTORS"
DECRYPTED_DATASET = "test/data/dataset.csv"
UKBB_BED_FILES = "test/data/unphased_bed/ukb_chr{1,2,3}.{bed,bim,fam}"
UKBB_BGEN_FILES = "test/data/unphased_bgen/ukb_chr{1,2,3}.{bgen,bgen.bgi,sample}"
COHORT = "UKBB"
BED_FILES = "test/data/unphased_bed/ukb_chr{1,2,3}.{bed,bim,fam}"
BGEN_FILES = "test/data/unphased_bgen/ukb_chr{1,2,3}.{bgen,bgen.bgi,sample}"
QC_FILE = "test/data/qc_file.csv"
LD_BLOCKS = "test/data/ld_blocks.txt"
FLASHPCA_EXCLUSION_REGIONS = "test/data/exclusion_regions_hg19.txt"
WITHDRAWAL_LIST = "test/data/withdrawal_list.txt"
NB_PCS = 6
TRAITS_CONFIG = "test/data/ukbconfig_small.yaml"
Expand All @@ -21,4 +21,4 @@ params {
MAX_PERMUTATION_TESTS = 100
MAF_MATCHING_RELTOL = 0.9
N_RANDOM_VARIANTS = 5
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@ params {
PARAMETER_PLAN = "FROM_PARAM_FILE"
PARAMETER_FILE = "test/data/parameters/parameters.yaml"
DECRYPTED_DATASET = "test/data/dataset.csv"
UKBB_BED_FILES = "test/data/unphased_bed/ukb_chr{1,2,3}.{bed,bim,fam}"
UKBB_BGEN_FILES = "test/data/unphased_bgen/ukb_chr{1,2,3}.{bgen,bgen.bgi,sample}"
COHORT = "UKBB"
BED_FILES = "test/data/unphased_bed/ukb_chr{1,2,3}.{bed,bim,fam}"
BGEN_FILES = "test/data/unphased_bgen/ukb_chr{1,2,3}.{bgen,bgen.bgi,sample}"
QC_FILE = "test/data/qc_file.csv"
LD_BLOCKS = "test/data/ld_blocks.txt"
FLASHPCA_EXCLUSION_REGIONS = "test/data/exclusion_regions_hg19.txt"
WITHDRAWAL_LIST = "test/data/withdrawal_list.txt"
NB_PCS = 6
TRAITS_CONFIG = "test/data/ukbconfig_small.yaml"
Expand All @@ -15,4 +15,4 @@ params {

POSITIVITY_CONSTRAINT = 0.0

}
}
4 changes: 4 additions & 0 deletions data/exclusion_regions_hg19.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
5 44000000 51500000 r1
6 25000000 33500000 r2
8 8000000 12000000 r3
11 45000000 57000000 r4
4 changes: 4 additions & 0 deletions data/exclusion_regions_hg38.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
5 43999898 52204166 r1
6 24999772 33532223 r2
8 8142478 12142491 r3
11 44978449 57232526 r4
12 changes: 3 additions & 9 deletions docs/src/contribution_guide.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,18 +40,12 @@ Currently, all TarGene building blocks (executables) are provided as docker imag

## Note on the pipeline's tests

The pipeline is automatically tested for every push/pull-request made to the github repository. The tests will require that Julia and the container engine of your choice be installed (see below). We currently have a suite of two tests corresponding to the two main usages of the pipeline. They can be run locally as follows:
The pipeline is automatically tested for every push/pull-request made to the github repository. The tests will require that Julia and the container engine of your choice be installed (see below). Each test corresponds to a pipeline run and a file in the `test` directory is associated with it. Each test run can be launched locally as follows:

```bash
julia --project=test --startup-file=no test/from_param_files.jl -profile PROFILE -resume
julia --project=test --startup-file=no test/TESTFILE -profile PROFILE -resume
```

and

```bash
julia --project=test --startup-file=no test/from_actors.jl -profile PROFILE -resume
```

where `PROFILE` is one of: 'local' (local with singularity engine), 'ci' (local with singularity engine), 'eddie' (SGE with singularity engine.).
where `TESTFILE` is the corresponding file and `PROFILE` is one of: 'local' (local with singularity engine), 'ci' (local with singularity engine), 'eddie' (SGE with singularity engine).

The tests can also be run interactively via the Julia REPL.
14 changes: 9 additions & 5 deletions docs/src/data_sources.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
# Setting a data source

Currently, only the UK-Biobank is supported, work is under way to enable custom data sources!
The following section will describe the data sources that are currently supported by TarGene.

## Custom Dataset

TarGene supports custom datasets which must include both genetic data (see [Genetic Data](@ref)) and trait data (a .csv file). Please ensure that the annotation of SNPs in your .bgen files matches the annotation used when specifying the parameters. When running a custom dataset, you must set the `COHORT` parameter in the nextflow configuration file to the name of your cohort or custom dataset (anything **other** than "UKBB"). The trait data for this mode must also be specified in the `DECRYPTED_DATASET` parameter in the nextflow configuration file. Please ensure the sample IDs that map the genetic data to the trait data are included as the first column of your trait data, with the column name `SAMPLE_ID`.

## UK-Biobank

Expand Down Expand Up @@ -83,13 +87,13 @@ To come back to the pipeline specification, if a "typical" main dataset has alre

Note: Since this decrypted dataset is a plain CSV file, one may build and add extra columns to it. Any column in the decrypted dataset which does not correspond to a UK-Biobank field will be considered as such.

### Genetic Data

We are currently using both .bgen and .bed files, furthermore, we assume that the data is **unphased**. Those are respectively provided with the `UKBB_BGEN_FILES` and `UKBB_BED_FILES` parameters. Since the UK-Biobank genotyping data is split in chromosomes, it should be of the form `PREFIX_TO_CHROMOSOMES{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22}.{bgen,sample,bgen.bgi}` and `PREFIX_TO_CHROMOSOMES{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22}.{bed,bim,fam}` respectively.

### Additional Files

Additional UK-Biobank required files for preprocessing and filtering are:

- `QC_FILE`: A path to the UK-Biobank SNP quaility control [`ukb_snp_qc.txt`](https://biobank.ctsu.ox.ac.uk/crystal/refer.cgi?id=1955) file.
- `WITHDRAWAL_LIST`: A path to the withdrawal sample list to exclude removed participants from the study.

## Genetic Data

We are currently using both .bgen and .bed files, furthermore, we assume that the data is **unphased**. Those are respectively provided with the `BGEN_FILES` and `BED_FILES` parameters. It is also assumed that the genotyping data is split in chromosomes, it should be of the form `PREFIX_TO_CHROMOSOMES{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22}.{bgen,sample,bgen.bgi}` and `PREFIX_TO_CHROMOSOMES{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22}.{bed,bim,fam}` respectively.
15 changes: 8 additions & 7 deletions docs/src/nextflow_params.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,19 @@ Here is a list of all the pipeline parameters:

## [Setting a data source](@ref)

- **`COHORT` (required):** Current default for this is UKBB. If set to a value other than UKBB, this will not run UKBB-specific trait extraction. If `COHORT` is not UKBB, you must specify your trait data in the `DECRYPTED_DATASET` parameter.
- **`ENCRYPTED_DATASET` (required unless a `DECRYPTED_DATASET` is given)**: Path to a UK-Biobank encrypted main dataset.
- **`ENCODING_FILE` (required unless a `DECRYPTED_DATASET` is given)**: If an encrypted dataset is provided, an encoding file must accompany it.
- `DECRYPTED_DATASET` (optional): Path to a UK-Biobank decrypted main dataset. If set, `ENCRYPTED_DATASET` is ignored.
- `DECRYPTED_DATASET` (optional): Path to a UK-Biobank decrypted main dataset. If set, `ENCRYPTED_DATASET` is ignored. **If you are running this for a non-UKBB cohort, your sample IDs must be specified in the first column of this CSV file, with the column name `SAMPLE_ID`.**
- **`TRAITS_CONFIG` (required)**: Configuration file describing which traits should be extracted from the main dataset.
- **`WITHDRAWAL_LIST` (required)**: List of participants withdrawn from the study.
- **`QC_FILE` (required)**: Genotyping quality control file from the UK-Biobank study.
- **`UKBB_BED_FILES` (required)**: Path expression to PLINK BED files.
- **`UKBB_BGEN_FILES` (required)**: Path expression to iputed BGEN files.
- `QC_FILE` (optional): Genotyping quality control file from the UK-Biobank study. This is **required** when `COHORT`="UKBB".
- **`BED_FILES` (required)**: Path expression to PLINK BED files.
- **`BGEN_FILES` (required)**: Path expression to imputed BGEN files.

## [Adjusting for confounders](@ref)

- **`LD_BLOCKS` (required)**: A path to pre-identified linkage desequlibrium blocks around the variants that will be queried for causal effect estimation. Those will be removed from the data.
- **`LD_BLOCKS` (optional)**: A path to pre-identified linkage disequilibrium blocks around the variants that will be queried for causal effect estimation. Those will be removed from the data. It is good practice to specify `LD_BLOCKS`, as it will remove SNPs correlated with your variants-of-interest before running PCA.
- **`FLASHPCA_EXCLUSION_REGIONS` (required)**: A path to the flashpca special exclusion regions which is provided in their repository.
- `MAF_THRESHOLD` (optional): Only variants with that minor allele frequency are considered
- `NB_PCS` (optional, default: 6): The number of PCA components to extract.
Expand All @@ -30,8 +31,8 @@ If `PARAMETER_PLAN`="FROM\_PARAM\_FILE":

If `PARAMETER_PLAN`="FROM_ACTORS":

- **`BQTLS` (required)**: A CSV file containing binding quantitative trait loci.
- **`TRANS_ACTORS` (required)**: A prefix to CSV files containing quantitative trait loci potentially interacting with the previous bqtls.
- **`BQTLS` (required)**: A CSV file containing binding quantitative trait loci (bQTLs). If multiple transcription factors (TFs) are included in a single run, you must include a column called `TF`, which specifies the TF associated with each bQTL.
- **`TRANS_ACTORS` (required)**: A prefix to CSV files containing quantitative trait loci potentially interacting with the previous bqtls. If multiple transcription factors (TFs) are included in a single run, you must include a column called `TF`, which specifies the TF associated with each transactor.
- `EXTRA_CONFOUNDERS` (optional, default: nothing): Path to additional confounders file, one per line, no header.
- `EXTRA_COVARIATES` (optional, default: nothing): Path to additional covariates file, one per line, no header.
- `ENVIRONMENTALS` (optional, default: nothing): Path to additional environmental treatments file, one per line, no header.
Expand Down
37 changes: 29 additions & 8 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,12 @@ params.OUTDIR = "${launchDir}/results"
params.RESULTS_FILE = "${params.OUTDIR}/summary.csv"
params.TMLE_INPUT_DATASET = "${params.OUTDIR}/tmle_inputs/final.data.arrow"

params.COHORT = "UKBB"
params.TRAITS_CONFIG = "NO_UKB_TRAIT_CONFIG"
params.WITHDRAWAL_LIST = 'NO_WITHDRAWAL_LIST'
params.QC_FILE = "NO_QC_FILE"
params.LD_BLOCKS = "NO_LD_BLOCKS"
params.FLASHPCA_EXCLUSION_REGIONS = "data/exclusion_regions_hg19.txt"

params.BATCH_SIZE = 400
params.EXTRA_CONFOUNDERS = 'NO_EXTRA_CONFOUNDER'
Expand Down Expand Up @@ -60,10 +64,15 @@ workflow extractTraits {
decrypted_dataset = Channel.value(file("$params.DECRYPTED_DATASET"))
}

TraitsFromUKB(decrypted_dataset, traits_config, withdrawal_list)
if (params.COHORT == "UKBB") {
extracted_traits = TraitsFromUKB(decrypted_dataset, traits_config, withdrawal_list)
}
else {
extracted_traits = decrypted_dataset
}

emit:
TraitsFromUKB.out
extracted_traits
}

workflow generateIIDGenotypes {
Expand All @@ -74,7 +83,7 @@ workflow generateIIDGenotypes {
qc_file = Channel.value(file("$params.QC_FILE"))
flashpca_excl_reg = Channel.value(file("$params.FLASHPCA_EXCLUSION_REGIONS"))
ld_blocks = Channel.value(file("$params.LD_BLOCKS"))
bed_files_ch = Channel.fromFilePairs("$params.UKBB_BED_FILES", size: 3, checkIfExists: true){ file -> file.baseName }
bed_files_ch = Channel.fromFilePairs("$params.BED_FILES", size: 3, checkIfExists: true){ file -> file.baseName }

IIDGenotypes(flashpca_excl_reg, ld_blocks, bed_files_ch, qc_file, traits)

Expand All @@ -95,14 +104,26 @@ workflow geneticConfounders {

}

workflow runPCA {
// Extract traits
extractTraits()

// Generate IID Genotypes
generateIIDGenotypes(extractTraits.out)

// Genetic confounders up to NB_PCS
geneticConfounders(generateIIDGenotypes.out)

}

workflow generateTMLEEstimates {
take:
traits
genetic_confounders

main:
estimator_file = Channel.value(file("$params.ESTIMATORFILE", checkIfExists: true))
bgen_files = Channel.fromPath("$params.UKBB_BGEN_FILES", checkIfExists: true).collect()
bgen_files = Channel.fromPath("$params.BGEN_FILES", checkIfExists: true).collect()

if (params.PARAMETER_PLAN == "FROM_ACTORS") {
bqtls = Channel.value(file("$params.BQTLS"))
Expand Down Expand Up @@ -178,16 +199,16 @@ workflow negativeControl {

// Random Variants parameter files generation
if (params.PARAMETER_PLAN == "FROM_ACTORS") {
bgen_files = Channel.fromPath("$params.UKBB_BGEN_FILES", checkIfExists: true).collect()
bgen_files = Channel.fromPath("$params.BGEN_FILES", checkIfExists: true).collect()
trans_actors = Channel.fromPath("$params.TRANS_ACTORS", checkIfExists: true).collect()
GenerateRandomVariantsTestsData(trans_actors, bgen_files, results_file)
}
}

workflow {
// Extract traits
// Extract traits for UKBB
extractTraits()

// Generate IID Genotypes
generateIIDGenotypes(extractTraits.out)

Expand All @@ -210,4 +231,4 @@ workflow {
}

MergeOutputs(generateTMLEEstimates.out.tmle_csvs.collect(), sieve_csv, "summary.csv")
}
}
9 changes: 6 additions & 3 deletions modules/genotypes.nf
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,15 @@ process filterBED{

script:
prefix = bedfiles[0].toString().minus('.bed')
qc_file = params.QC_FILE !== 'NO_QC_FILE' ? "--qcfile $qcfile" : ''
ld_blocks = params.LD_BLOCKS !== 'NO_LD_BLOCKS' ? "--ld-blocks $ld_blocks" : ''

"""
TEMPD=\$(mktemp -d)
JULIA_DEPOT_PATH=\$TEMPD:/opt julia --project=/TargeneCore.jl --startup-file=no /TargeneCore.jl/bin/prepare_confounders.jl \
--input $prefix --output filtered.$prefix --qcfile $qcfile --maf-threshold $params.MAF_THRESHOLD --ld-blocks $ld_blocks --traits $traits filter
--input $prefix --output filtered.$prefix $qc_file --maf-threshold $params.MAF_THRESHOLD \
$ld_blocks --traits $traits filter
"""

}


Expand Down Expand Up @@ -95,4 +98,4 @@ workflow IIDGenotypes{

emit:
SampleQCFilter.out
}
}
2 changes: 1 addition & 1 deletion modules/negative_control.nf
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ process GeneratePermutationTestsData {
}

process GenerateRandomVariantsTestsData {
container "olivierlabayle/negative-controls:0.1"
container "olivierlabayle/negative-controls:0.2"
publishDir "${params.OUTDIR}", mode: 'symlink'
label "bigmem"

Expand Down
37 changes: 37 additions & 0 deletions test/custom_from_actors.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# "local" profile assumes singularity is installed
# End-to-end CI test: runs the full pipeline with the custom (non-UKBB) cohort
# configuration and checks the produced outputs.
args = length(ARGS) > 0 ? ARGS : ["-profile", "local", "-resume"]

include("utils.jl")

@testset "Test custom_from_actors.config" begin
    cmd = `nextflow run main.nf -c conf/ci_jobs/custom_from_actors.config $args`
    @info string("The following command will be run:\n", cmd)

    # The pipeline run itself must succeed.
    r = run(cmd)
    @test r.exitcode == 0

    output = CSV.read(joinpath("results", "summary.csv"), DataFrame)
    dataset = DataFrame(Arrow.Table(joinpath("results", "tmle_inputs", "final.data.arrow")))
    # NOTE(review): the config uses bqtls_multiple_TFs.csv; confirm bqtls.csv lists the same IDs.
    bQTLs = Symbol.(CSV.read(joinpath("test", "data", "actors", "bqtls.csv"), DataFrame).ID)

    @test names(output) == vcat(SUMMARY_COLUMNS, SIEVE_COLUMNS, ADJUTMENT_COL)
    # 2 bQTLs and 1 trans-actor
    @test Set(unique(output.TREATMENTS)) == Set(["1:238411180:T:C", "3:3502414:T:C", "1:238411180:T:C_&_2:14983:G:A", "3:3502414:T:C_&_2:14983:G:A"])

    test_n_success_more_than_threshold(output, 20)

    # Here we test that the process generateIIDGenotypes has been run once for each chromosome
    n_chr_files = filter(x -> startswith(x, "LDpruned.filtered.ukb_chr"), readdir(joinpath("results", "ld_pruned_chromosomes")))
    # There should be 4 files for each chromosome
    @test length(n_chr_files) == 4 * 3

    ## Checking parameter files correspond to either bQTL only or bQTL/eQTL
    parameters_tf1 = parameters_from_yaml(joinpath("results", "parameters", "final.TF1.param_1.yaml"))
    parameters_tf2 = parameters_from_yaml(joinpath("results", "parameters", "final.TF2.param_1.yaml"))
    @test size(parameters_tf1, 1) == 296
    @test size(parameters_tf2, 1) == 148
    # Every estimand's first treatment variable must be a bQTL.
    # (Fixed: the original line `@test keys.treatment)[1] bQTLs` was garbled —
    # the unicode `Ψ` and `∈` had been dropped, leaving invalid syntax.)
    for Ψ in vcat(parameters_tf1, parameters_tf2)
        @test keys(Ψ.treatment)[1] ∈ bQTLs
    end

end
3 changes: 3 additions & 0 deletions test/data/actors/bqtls_multiple_TFs.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
ID,TF
"1:238411180:T:C",TF1
"3:3502414:T:C",TF2
3 changes: 3 additions & 0 deletions test/data/actors/eqtls_multiple_TFs.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
ID,TF
"2:14983:G:A",TF1
"2:14983:G:A",TF2
2 changes: 1 addition & 1 deletion test/data/exclusion_regions_hg19.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
5 44000000 51500000 r1
6 25000000 33500000 r2
8 8000000 12000000 r3
11 45000000 57000000 r4
11 45000000 57000000 r4
Loading

0 comments on commit 3f96860

Please sign in to comment.