Skip to content

Commit

Permalink
Merge pull request #58 from TARGENE/brh_data_source
Browse files Browse the repository at this point in the history
Compatibility with custom dataset
  • Loading branch information
olivierlabayle authored Dec 4, 2023
2 parents a126a78 + 2f22712 commit 3f96860
Show file tree
Hide file tree
Showing 21 changed files with 652 additions and 52 deletions.
5 changes: 3 additions & 2 deletions .github/workflows/CI.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,9 @@ jobs:
fail-fast: false
matrix:
testrun:
- "from_actors.jl"
- "from_param_files.jl"
- "ukb_from_actors.jl"
- "ukb_from_param_file.jl"
- "custom_from_actors.jl"
steps:
- uses: actions/checkout@v2
- uses: julia-actions/setup-julia@v1
Expand Down
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,6 @@ LocalPreferences.toml
test/sandbox.jl
docs/Manifest.toml
*.DS_Store
trace.txt*
test/Manifest.toml
trace.txt
trace.txt
22 changes: 22 additions & 0 deletions conf/ci_jobs/custom_from_actors.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
params {
    // Build estimands from actor files (bQTLs / trans-actors) rather than an explicit parameter file.
    PARAMETER_PLAN = "FROM_ACTORS"
    PARAMETER_FILE = "test/data/parameters/parameters.yaml"
    // Trait data for the custom cohort; first column must be SAMPLE_ID (see docs/src/data_sources.md).
    DECRYPTED_DATASET = "test/data/traits.csv"
    // Any value other than "UKBB" skips the UKB-specific trait-extraction step.
    COHORT = "CUSTOM"
    // Per-chromosome PLINK BED and imputed BGEN file-pair globs.
    BED_FILES = "test/data/unphased_bed/ukb_chr{1,2,3}.{bed,bim,fam}"
    BGEN_FILES = "test/data/unphased_bgen/ukb_chr{1,2,3}.{bgen,bgen.bgi,sample}"
    // Number of principal components used as genetic confounders.
    NB_PCS = 6
    TRAITS_CONFIG = "test/data/ukbconfig_small.yaml"
    ESTIMATORFILE = "test/data/estimator.jl"
    NB_VAR_ESTIMATORS = 10
    PVAL_THRESHOLD = 1
    // Actor files carry a TF column because multiple transcription factors are tested in one run.
    BQTLS = "test/data/actors/bqtls_multiple_TFs.csv"
    TRANS_ACTORS = "test/data/actors/eqtls_multiple_TFs.csv"
    ORDERS = "1,2"
    POSITIVITY_CONSTRAINT = 0.01
    GRM_NSPLITS = 10
    BATCH_SIZE = 400
    MAX_PERMUTATION_TESTS = 100
    MAF_MATCHING_RELTOL = 0.9

}
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
params {
PARAMETER_PLAN = "FROM_ACTORS"
DECRYPTED_DATASET = "test/data/dataset.csv"
UKBB_BED_FILES = "test/data/unphased_bed/ukb_chr{1,2,3}.{bed,bim,fam}"
UKBB_BGEN_FILES = "test/data/unphased_bgen/ukb_chr{1,2,3}.{bgen,bgen.bgi,sample}"
COHORT = "UKBB"
BED_FILES = "test/data/unphased_bed/ukb_chr{1,2,3}.{bed,bim,fam}"
BGEN_FILES = "test/data/unphased_bgen/ukb_chr{1,2,3}.{bgen,bgen.bgi,sample}"
QC_FILE = "test/data/qc_file.csv"
LD_BLOCKS = "test/data/ld_blocks.txt"
FLASHPCA_EXCLUSION_REGIONS = "test/data/exclusion_regions_hg19.txt"
WITHDRAWAL_LIST = "test/data/withdrawal_list.txt"
NB_PCS = 6
TRAITS_CONFIG = "test/data/ukbconfig_small.yaml"
Expand All @@ -21,4 +21,4 @@ params {
MAX_PERMUTATION_TESTS = 100
MAF_MATCHING_RELTOL = 0.9
N_RANDOM_VARIANTS = 5
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@ params {
PARAMETER_PLAN = "FROM_PARAM_FILE"
PARAMETER_FILE = "test/data/parameters/parameters.yaml"
DECRYPTED_DATASET = "test/data/dataset.csv"
UKBB_BED_FILES = "test/data/unphased_bed/ukb_chr{1,2,3}.{bed,bim,fam}"
UKBB_BGEN_FILES = "test/data/unphased_bgen/ukb_chr{1,2,3}.{bgen,bgen.bgi,sample}"
COHORT = "UKBB"
BED_FILES = "test/data/unphased_bed/ukb_chr{1,2,3}.{bed,bim,fam}"
BGEN_FILES = "test/data/unphased_bgen/ukb_chr{1,2,3}.{bgen,bgen.bgi,sample}"
QC_FILE = "test/data/qc_file.csv"
LD_BLOCKS = "test/data/ld_blocks.txt"
FLASHPCA_EXCLUSION_REGIONS = "test/data/exclusion_regions_hg19.txt"
WITHDRAWAL_LIST = "test/data/withdrawal_list.txt"
NB_PCS = 6
TRAITS_CONFIG = "test/data/ukbconfig_small.yaml"
Expand All @@ -15,4 +15,4 @@ params {

POSITIVITY_CONSTRAINT = 0.0

}
}
4 changes: 4 additions & 0 deletions data/exclusion_regions_hg19.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
5 44000000 51500000 r1
6 25000000 33500000 r2
8 8000000 12000000 r3
11 45000000 57000000 r4
4 changes: 4 additions & 0 deletions data/exclusion_regions_hg38.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
5 43999898 52204166 r1
6 24999772 33532223 r2
8 8142478 12142491 r3
11 44978449 57232526 r4
12 changes: 3 additions & 9 deletions docs/src/contribution_guide.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,18 +40,12 @@ Currently, all TarGene building blocks (executables) are provided as docker imag

## Note on the pipeline's tests

The pipeline is automatically tested for every push/pull-request made to the github repository. The tests will require that Julia and the container engine of your choice be installed (see below). We currently have a suite of two tests corresponding to the two main usages of the pipeline. They can be run locally as follows:
The pipeline is automatically tested for every push/pull-request made to the github repository. The tests will require that Julia and the container engine of your choice be installed (see below). Each test corresponds to a pipeline run and a file in the `test` directory is associated with it. Each test run can be launched locally as follows:

```bash
julia --project=test --startup-file=no test/from_param_files.jl -profile PROFILE -resume
julia --project=test --startup-file=no test/TESTFILE -profile PROFILE -resume
```

and

```bash
julia --project=test --startup-file=no test/from_actors.jl -profile PROFILE -resume
```

where `PROFILE` is one of: 'local' (local with singularity engine), 'ci' (local with singularity engine), 'eddie' (SGE with singularity engine.).
where `TESTFILE` is the corresponding file and `PROFILE` is one of: 'local' (local with singularity engine), 'ci' (local with singularity engine), 'eddie' (SGE with singularity engine).

The tests can also be run interactively via the Julia REPL.
14 changes: 9 additions & 5 deletions docs/src/data_sources.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
# Setting a data source

Currently, only the UK-Biobank is supported, work is under way to enable custom data sources!
The following section will describe the data sources that are currently supported by TarGene.

## Custom Dataset

TarGene supports custom datasets which must include both genetic data (see [Genetic Data](@ref)) and trait data (a .csv file). Please ensure that the annotation of SNPs in your .bgen files matches the annotation used when specifying the parameters. When running a custom dataset, you must set the `COHORT` parameter in the nextflow configuration file to the name of your cohort or custom dataset (anything **other** than "UKBB"). The trait data for this mode must also be specified in the `DECRYPTED_DATASET` parameter in the nextflow configuration file. Please ensure the sample IDs that map the genetic data to the trait data are included as the first column of your trait data, with the column name `SAMPLE_ID`.

## UK-Biobank

Expand Down Expand Up @@ -83,13 +87,13 @@ To come back to the pipeline specification, if a "typical" main dataset has alre

Note: Since this decrypted dataset is a plain CSV file, one may build and add extra columns to it. Any column in the decrypted dataset which does not correspond to a UK-Biobank field will be considered as such.

### Genetic Data

We are currently using both .bgen and .bed files, furthermore, we assume that the data is **unphased**. Those are respectively provided with the `UKBB_BGEN_FILES` and `UKBB_BED_FILES` parameters. Since the UK-Biobank genotyping data is split in chromosomes, it should be of the form `PREFIX_TO_CHROMOSOMES{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22}.{bgen,sample,bgen.bgi}` and `PREFIX_TO_CHROMOSOMES{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22}.{bed,bim,fam}` respectively.

### Additional Files

Additional UK-Biobank required files for preprocessing and filtering are:

- `QC_FILE`: A path to the UK-Biobank SNP quaility control [`ukb_snp_qc.txt`](https://biobank.ctsu.ox.ac.uk/crystal/refer.cgi?id=1955) file.
- `WITHDRAWAL_LIST`: A path to the withdrawal sample list to exclude removed participants from the study.

## Genetic Data

We are currently using both .bgen and .bed files, furthermore, we assume that the data is **unphased**. Those are respectively provided with the `BGEN_FILES` and `BED_FILES` parameters. It is also assumed that the genotyping data is split in chromosomes, it should be of the form `PREFIX_TO_CHROMOSOMES{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22}.{bgen,sample,bgen.bgi}` and `PREFIX_TO_CHROMOSOMES{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22}.{bed,bim,fam}` respectively.
15 changes: 8 additions & 7 deletions docs/src/nextflow_params.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,19 @@ Here is a list of all the pipeline parameters:

## [Setting a data source](@ref)

- **`COHORT` (required):** Current default for this is UKBB. If set to a value other than UKBB, this will not run UKBB-specific trait extraction. If `COHORT` is not UKBB, you must specify your trait data in the `DECRYPTED_DATASET` parameter.
- **`ENCRYPTED_DATASET` (required unless a `DECRYPTED_DATASET` is given)**: Path to a UK-Biobank encrypted main dataset.
- **`ENCODING_FILE` (required unless a `DECRYPTED_DATASET` is given)**: If an encrypted dataset is provided, an encoding file must accompany it.
- `DECRYPTED_DATASET` (optional): Path to a UK-Biobank decrypted main dataset. If set, `ENCRYPTED_DATASET` is ignored.
- `DECRYPTED_DATASET` (optional): Path to a UK-Biobank decrypted main dataset. If set, `ENCRYPTED_DATASET` is ignored. **If you are running this for a non-UKBB cohort, your sample IDs must be specified in the first column of this CSV file, with the column name `SAMPLE_ID`.**
- **`TRAITS_CONFIG` (required)**: Configuration file describing which traits should be extracted from the main dataset.
- **`WITHDRAWAL_LIST` (required)**: List of participants withdrawn from the study.
- **`QC_FILE` (required)**: Genotyping quality control file from the UK-Biobank study.
- **`UKBB_BED_FILES` (required)**: Path expression to PLINK BED files.
- **`UKBB_BGEN_FILES` (required)**: Path expression to iputed BGEN files.
- `QC_FILE` (optional): Genotyping quality control file from the UK-Biobank study. This is **required** when `COHORT`="UKBB".
- **`BED_FILES` (required)**: Path expression to PLINK BED files.
- **`BGEN_FILES` (required)**: Path expression to imputed BGEN files.

## [Adjusting for confounders](@ref)

- **`LD_BLOCKS` (required)**: A path to pre-identified linkage desequlibrium blocks around the variants that will be queried for causal effect estimation. Those will be removed from the data.
- **`LD_BLOCKS` (optional)**: A path to pre-identified linkage disequilibrium blocks around the variants that will be queried for causal effect estimation. Those will be removed from the data. It is good practice to specify `LD_BLOCKS`, as it will remove SNPs correlated with your variants-of-interest before running PCA.
- **`FLASHPCA_EXCLUSION_REGIONS` (required)**: A path to the flashpca special exclusion regions which is provided in their repository.
- `MAF_THRESHOLD` (optional): Only variants with that minor allele frequency are considered
- `NB_PCS` (optional, default: 6): The number of PCA components to extract.
Expand All @@ -30,8 +31,8 @@ If `PARAMETER_PLAN`="FROM\_PARAM\_FILE":

If `PARAMETER_PLAN`="FROM_ACTORS":

- **`BQTLS` (required)**: A CSV file containing binding quantitative trait loci.
- **`TRANS_ACTORS` (required)**: A prefix to CSV files containing quantitative trait loci potentially interacting with the previous bqtls.
- **`BQTLS` (required)**: A CSV file containing binding quantitative trait loci (bQTLs). If multiple transcription factors (TFs) are included in a single run, you must include a column called `TF`, which specifies the TF associated with each bQTL.
- **`TRANS_ACTORS` (required)**: A prefix to CSV files containing quantitative trait loci potentially interacting with the previous bqtls. If multiple transcription factors (TFs) are included in a single run, you must include a column called `TF`, which specifies the TF associated with each transactor.
- `EXTRA_CONFOUNDERS` (optional, default: nothing): Path to additional confounders file, one per line, no header.
- `EXTRA_COVARIATES` (optional, default: nothing): Path to additional covariates file, one per line, no header.
- `ENVIRONMENTALS` (optional, default: nothing): Path to additional environmental treatments file, one per line, no header.
Expand Down
37 changes: 29 additions & 8 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,12 @@ params.OUTDIR = "${launchDir}/results"
params.RESULTS_FILE = "${params.OUTDIR}/summary.csv"
params.TMLE_INPUT_DATASET = "${params.OUTDIR}/tmle_inputs/final.data.arrow"

params.COHORT = "UKBB"
params.TRAITS_CONFIG = "NO_UKB_TRAIT_CONFIG"
params.WITHDRAWAL_LIST = 'NO_WITHDRAWAL_LIST'
params.QC_FILE = "NO_QC_FILE"
params.LD_BLOCKS = "NO_LD_BLOCKS"
params.FLASHPCA_EXCLUSION_REGIONS = "data/exclusion_regions_hg19.txt"

params.BATCH_SIZE = 400
params.EXTRA_CONFOUNDERS = 'NO_EXTRA_CONFOUNDER'
Expand Down Expand Up @@ -60,10 +64,15 @@ workflow extractTraits {
decrypted_dataset = Channel.value(file("$params.DECRYPTED_DATASET"))
}

TraitsFromUKB(decrypted_dataset, traits_config, withdrawal_list)
if (params.COHORT == "UKBB") {
extracted_traits = TraitsFromUKB(decrypted_dataset, traits_config, withdrawal_list)
}
else {
extracted_traits = decrypted_dataset
}

emit:
TraitsFromUKB.out
extracted_traits
}

workflow generateIIDGenotypes {
Expand All @@ -74,7 +83,7 @@ workflow generateIIDGenotypes {
qc_file = Channel.value(file("$params.QC_FILE"))
flashpca_excl_reg = Channel.value(file("$params.FLASHPCA_EXCLUSION_REGIONS"))
ld_blocks = Channel.value(file("$params.LD_BLOCKS"))
bed_files_ch = Channel.fromFilePairs("$params.UKBB_BED_FILES", size: 3, checkIfExists: true){ file -> file.baseName }
bed_files_ch = Channel.fromFilePairs("$params.BED_FILES", size: 3, checkIfExists: true){ file -> file.baseName }

IIDGenotypes(flashpca_excl_reg, ld_blocks, bed_files_ch, qc_file, traits)

Expand All @@ -95,14 +104,26 @@ workflow geneticConfounders {

}

workflow runPCA {
// Extract traits
extractTraits()

// Generate IID Genotypes
generateIIDGenotypes(extractTraits.out)

// Genetic confounders up to NB_PCS
geneticConfounders(generateIIDGenotypes.out)

}

workflow generateTMLEEstimates {
take:
traits
genetic_confounders

main:
estimator_file = Channel.value(file("$params.ESTIMATORFILE", checkIfExists: true))
bgen_files = Channel.fromPath("$params.UKBB_BGEN_FILES", checkIfExists: true).collect()
bgen_files = Channel.fromPath("$params.BGEN_FILES", checkIfExists: true).collect()

if (params.PARAMETER_PLAN == "FROM_ACTORS") {
bqtls = Channel.value(file("$params.BQTLS"))
Expand Down Expand Up @@ -178,16 +199,16 @@ workflow negativeControl {

// Random Variants parameter files generation
if (params.PARAMETER_PLAN == "FROM_ACTORS") {
bgen_files = Channel.fromPath("$params.UKBB_BGEN_FILES", checkIfExists: true).collect()
bgen_files = Channel.fromPath("$params.BGEN_FILES", checkIfExists: true).collect()
trans_actors = Channel.fromPath("$params.TRANS_ACTORS", checkIfExists: true).collect()
GenerateRandomVariantsTestsData(trans_actors, bgen_files, results_file)
}
}

workflow {
// Extract traits
// Extract traits for UKBB
extractTraits()

// Generate IID Genotypes
generateIIDGenotypes(extractTraits.out)

Expand All @@ -210,4 +231,4 @@ workflow {
}

MergeOutputs(generateTMLEEstimates.out.tmle_csvs.collect(), sieve_csv, "summary.csv")
}
}
9 changes: 6 additions & 3 deletions modules/genotypes.nf
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,15 @@ process filterBED{

script:
prefix = bedfiles[0].toString().minus('.bed')
qc_file = params.QC_FILE !== 'NO_QC_FILE' ? "--qcfile $qcfile" : ''
ld_blocks = params.LD_BLOCKS !== 'NO_LD_BLOCKS' ? "--ld-blocks $ld_blocks" : ''

"""
TEMPD=\$(mktemp -d)
JULIA_DEPOT_PATH=\$TEMPD:/opt julia --project=/TargeneCore.jl --startup-file=no /TargeneCore.jl/bin/prepare_confounders.jl \
--input $prefix --output filtered.$prefix --qcfile $qcfile --maf-threshold $params.MAF_THRESHOLD --ld-blocks $ld_blocks --traits $traits filter
--input $prefix --output filtered.$prefix $qc_file --maf-threshold $params.MAF_THRESHOLD \
$ld_blocks --traits $traits filter
"""

}


Expand Down Expand Up @@ -95,4 +98,4 @@ workflow IIDGenotypes{

emit:
SampleQCFilter.out
}
}
2 changes: 1 addition & 1 deletion modules/negative_control.nf
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ process GeneratePermutationTestsData {
}

process GenerateRandomVariantsTestsData {
container "olivierlabayle/negative-controls:0.1"
container "olivierlabayle/negative-controls:0.2"
publishDir "${params.OUTDIR}", mode: 'symlink'
label "bigmem"

Expand Down
37 changes: 37 additions & 0 deletions test/custom_from_actors.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# "local" profile assumes singularity is installed
# End-to-end CI test: runs the full pipeline with the custom (non-UKBB) cohort
# configuration and checks the produced outputs.
args = length(ARGS) > 0 ? ARGS : ["-profile", "local", "-resume"]

include("utils.jl")

@testset "Test custom_from_actors.config" begin
    cmd = `nextflow run main.nf -c conf/ci_jobs/custom_from_actors.config $args`
    @info string("The following command will be run:\n", cmd)

    # The pipeline run itself must succeed.
    r = run(cmd)
    @test r.exitcode == 0

    output = CSV.read(joinpath("results", "summary.csv"), DataFrame)
    dataset = DataFrame(Arrow.Table(joinpath("results", "tmle_inputs", "final.data.arrow")))
    # NOTE(review): the config uses bqtls_multiple_TFs.csv; confirm bqtls.csv lists the same IDs.
    bQTLs = Symbol.(CSV.read(joinpath("test", "data", "actors", "bqtls.csv"), DataFrame).ID)

    @test names(output) == vcat(SUMMARY_COLUMNS, SIEVE_COLUMNS, ADJUTMENT_COL)
    # 2 bQTLs and 1 trans-actor
    @test Set(unique(output.TREATMENTS)) == Set(["1:238411180:T:C", "3:3502414:T:C", "1:238411180:T:C_&_2:14983:G:A", "3:3502414:T:C_&_2:14983:G:A"])

    test_n_success_more_than_threshold(output, 20)

    # Here we test that the process generateIIDGenotypes has been run once for each chromosome
    n_chr_files = filter(x -> startswith(x, "LDpruned.filtered.ukb_chr"), readdir(joinpath("results", "ld_pruned_chromosomes")))
    # There should be 4 files for each chromosome
    @test length(n_chr_files) == 4 * 3

    ## Checking parameter files correspond to either bQTL only or bQTL/eQTL
    parameters_tf1 = parameters_from_yaml(joinpath("results", "parameters", "final.TF1.param_1.yaml"))
    parameters_tf2 = parameters_from_yaml(joinpath("results", "parameters", "final.TF2.param_1.yaml"))
    @test size(parameters_tf1, 1) == 296
    @test size(parameters_tf2, 1) == 148
    # Every estimand's first treatment variable must be a bQTL.
    # (Fixed: the original line `@test keys.treatment)[1] bQTLs` was garbled —
    # the unicode `Ψ` and `∈` had been dropped, leaving invalid syntax.)
    for Ψ in vcat(parameters_tf1, parameters_tf2)
        @test keys(Ψ.treatment)[1] ∈ bQTLs
    end

end
3 changes: 3 additions & 0 deletions test/data/actors/bqtls_multiple_TFs.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
ID,TF
"1:238411180:T:C",TF1
"3:3502414:T:C",TF2
3 changes: 3 additions & 0 deletions test/data/actors/eqtls_multiple_TFs.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
ID,TF
"2:14983:G:A",TF1
"2:14983:G:A",TF2
2 changes: 1 addition & 1 deletion test/data/exclusion_regions_hg19.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
5 44000000 51500000 r1
6 25000000 33500000 r2
8 8000000 12000000 r3
11 45000000 57000000 r4
11 45000000 57000000 r4
Loading

0 comments on commit 3f96860

Please sign in to comment.