fix: fix gene-level p-value adjustment (use Benjamini-Hochberg instead of Bonferroni-Holm) (#64)
snakemake-workflows · Dec 2, 2022 · 6ea1682 · 6ea1682
1 parent bd9c58d
commit 6ea1682
Show file tree

Hide file tree

Showing 16 changed files with 86 additions and 108 deletions.
diff --git a/.test/config/config.yaml b/.test/config/config.yaml
@@ -9,11 +9,6 @@ resources:
     release: "104"
     # genome build
     build: GRCh38
-    # this is the version of the bioconda package `bioconductor-org.{species}`.eg.db` that
-    # you want -- this needs to be compatible with the versions `r-base` and the
-    # bioconductor packages specified e.g. in `envs/` files `fgsea.yaml`, `spia.yaml` and
-    # `ens_gene_to_go.yaml`
-    species_db_version: "3.13"
     # pfam release to use for annotation of domains in differential splicing analysis
     pfam: "33.0"
     representative_transcripts: canonical
@@ -59,7 +54,7 @@ diffsplice:
   remove_noncoding_orfs: false
   # False discovery rate to control for.
   fdr: 1.0
-  # Minimum size of differential isoform usage effect 
+  # Minimum size of differential isoform usage effect
   # (see dIFcutoff, https://rdrr.io/github/kvittingseerup/IsoformSwitchAnalyzeR/man/IsoformSwitchTestDEXSeq.html)
   min_effect_size: 0.0
 

diff --git a/config/config.yaml b/config/config.yaml
@@ -9,18 +9,13 @@ resources:
     release: "104"
     # genome build
     build: GRCh38
-    # this is the version of the bioconda package `bioconductor-org.{species}`.eg.db` that
-    # you want -- this needs to be compatible with the versions `r-base` and the
-    # bioconductor packages specified e.g. in `envs/` files `fgsea.yaml`, `spia.yaml` and
-    # `ens_gene_to_go.yaml`
-    species_db_version: "3.13"
     # pfam release to use for annotation of domains in differential splicing analysis
     pfam: "33.0"
     # Choose strategy for selecting representative transcripts for each gene.
     # Possible values:
     #   - canonical (use the canonical transcript from ensembl, only works for human at the moment)
     #   - mostsignificant (use the most significant transcript)
-    #   - path/to/any/file.txt (a path to a file with ensembl transcript IDs to use; 
+    #   - path/to/any/file.txt (a path to a file with ensembl transcript IDs to use;
     #     the user has to ensure that there is only one ID per gene given)
     representative_transcripts: canonical
   ontology:
@@ -49,7 +44,7 @@ diffexp:
       # Binary valued covariate that shall be used for fold change/effect size
       # based downstream analyses.
       primary_variable: condition
-      # base level of the primary variable (will be considered as denominator 
+      # base level of the primary variable (will be considered as denominator
       # in the fold change/effect size estimation).
       base_level: untreated
   # significance level to use for volcano, ma- and qq-plots
@@ -67,7 +62,7 @@ diffsplice:
   remove_noncoding_orfs: false
   # False discovery rate to control for.
   fdr: 0.05
-  # Minimum size of differential isoform usage effect 
+  # Minimum size of differential isoform usage effect
   # (see dIFcutoff, https://rdrr.io/github/kvittingseerup/IsoformSwitchAnalyzeR/man/IsoformSwitchTestDEXSeq.html)
   min_effect_size: 0.1
 

diff --git a/workflow/envs/enrichment.yaml b/workflow/envs/enrichment.yaml
@@ -0,0 +1,10 @@
+channels:
+  - conda-forge
+  - bioconda
+  - nodefaults
+dependencies:
+  - r-base =4.2
+  - bioconductor-spia =2.50
+  - bioconductor-graphite =1.44
+  - r-tidyverse =1.3
+  - bioconductor-fgsea =1.24
diff --git a/workflow/envs/ens_gene_to_go.yaml b/workflow/envs/ens_gene_to_go.yaml
diff --git a/workflow/envs/fgsea.yaml b/workflow/envs/fgsea.yaml
diff --git a/workflow/envs/spia.yaml b/workflow/envs/spia.yaml
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
@@ -1,6 +1,7 @@
 from snakemake.utils import validate
 import pandas as pd
-
+import yaml
+from pathlib import Path
 
 ##### load config and sample sheets #####
 
@@ -106,16 +107,26 @@ def get_bioc_species_name():
     return first_letter + subspecies
 
 
-def get_bioc_species_pkg(wildcards):
+def get_bioc_species_pkg():
     """Get the bioconductor package name for the species in config.yaml"""
     species_letters = get_bioc_species_name()[0:2].capitalize()
     return "org.{species}.eg.db".format(species=species_letters)
 
 
-def get_bioc_pkg_path(wildcards):
-    return "resources/bioconductor/lib/R/library/{pkg}".format(
-        pkg=get_bioc_species_pkg(wildcards)
-    )
+def render_enrichment_env():
+    species_pkg = f"bioconductor-{get_bioc_species_pkg()}"
+    with open(workflow.source_path("../envs/enrichment.yaml")) as f:
+        env = yaml.load(f, Loader=yaml.SafeLoader)
+    env["dependencies"].append(species_pkg)
+    env_path = Path("resources/envs/enrichment.yaml")
+    env_path.parent.mkdir(parents=True, exist_ok=True)
+    with open(env_path, "w") as f:
+        yaml.dump(env, f)
+    return env_path.absolute()
+
+
+bioc_species_pkg = get_bioc_species_pkg()
+enrichment_env = render_enrichment_env()
 
 
 def kallisto_params(wildcards, input):

diff --git a/workflow/rules/enrichment.smk b/workflow/rules/enrichment.smk
@@ -1,28 +1,15 @@
 from pathlib import Path
 
 
-rule download_bioconductor_species_database:
-    output:
-        directory("resources/bioconductor/lib/R/library/{package}"),  # TODO: encode version in path!
-    params:
-        path=lambda wc, output: Path(output[0]).parents[3],
-        version=config["resources"]["ref"]["species_db_version"],
-    log:
-        "logs/resources/bioconductor/{package}.log",
-    shell:
-        "conda create --quiet --yes -p {params.path} --channel conda-forge --channel bioconda "
-        "bioconductor-{wildcards.package}={params.version} > {log} 2>&1"
-
-
 # topology- and interaction-aware pathway enrichment analysis
 
 
 # TODO consider cellphonedb for receptor ligand interaction (Sarah Teichmann, Nature Methods?)
 rule spia:
     input:
         samples="results/sleuth/samples.tsv",
-        species_anno=get_bioc_pkg_path,
         diffexp="results/tables/diffexp/{model}.genes-representative.diffexp.tsv",
+        spia_db="resources/spia-db.rds",
     output:
         table=report(
             "results/tables/pathways/{model}.pathways.tsv",
@@ -31,13 +18,12 @@ rule spia:
         ),
         plots="results/plots/pathways/{model}.spia-perturbation-plots.pdf",
     params:
-        bioc_pkg=get_bioc_species_pkg,
-        species=get_bioc_species_name(),
+        bioc_species_pkg=bioc_species_pkg,
         pathway_db=config["enrichment"]["spia"]["pathway_database"],
         covariate=lambda w: config["diffexp"]["models"][w.model]["primary_variable"],
         common_src=str(workflow.source_path("../scripts/common.R")),
     conda:
-        "../envs/spia.yaml"
+        enrichment_env
     log:
         "logs/tables/pathways/{model}.spia-pathways.log",
     threads: 16
@@ -52,7 +38,6 @@ rule fgsea:
     input:
         samples="results/sleuth/samples.tsv",
         diffexp="results/tables/diffexp/{model}.genes-representative.diffexp.tsv",
-        species_anno=get_bioc_pkg_path,
         gene_sets=config["enrichment"]["fgsea"]["gene_sets_file"],
     output:
         enrichment=report(
@@ -81,14 +66,14 @@ rule fgsea:
             category="Gene set enrichment analysis",
         ),
     params:
-        bioc_pkg=get_bioc_species_pkg,
+        bioc_species_pkg=bioc_species_pkg,
         model=get_model,
         gene_set_fdr=config["enrichment"]["fgsea"]["fdr_gene_set"],
         eps=config["enrichment"]["fgsea"]["eps"],
         covariate=lambda w: config["diffexp"]["models"][w.model]["primary_variable"],
         common_src=str(workflow.source_path("../scripts/common.R")),
     conda:
-        "../envs/fgsea.yaml"
+        enrichment_env
     log:
         "logs/tables/fgsea/{model}.gene-set-enrichment.log",
     threads: 8
@@ -114,7 +99,7 @@ rule fgsea_plot_gene_sets:
         covariate=lambda w: config["diffexp"]["models"][w.model]["primary_variable"],
         common_src=str(workflow.source_path("../scripts/common.R")),
     conda:
-        "../envs/fgsea.yaml"
+        enrichment_env
     log:
         "logs/plots/fgsea/{model}.plot_fgsea_gene_set.log",
     script:
@@ -125,15 +110,13 @@ rule fgsea_plot_gene_sets:
 
 
 rule ens_gene_to_go:
-    input:
-        species_anno=get_bioc_pkg_path,
     output:
         "resources/ontology/ens_gene_to_go.tsv",
     params:
-        bioc_pkg=get_bioc_species_pkg,
+        bioc_species_pkg=bioc_species_pkg,
         common_src=str(workflow.source_path("../scripts/common.R")),
     conda:
-        "../envs/ens_gene_to_go.yaml"
+        enrichment_env
     log:
         "logs/resources/ens_gene_to_go.log",
     script:

diff --git a/workflow/rules/ref.smk b/workflow/rules/ref.smk
@@ -105,3 +105,21 @@ rule calculate_cpat_logit_model:
     shell:
         "make_logitModel.py --hex={input.hexamers} --cgene={input.cds} "
         "--ngene={input.ncrna} -o {params.prefix} 2> {log}"
+
+
+rule get_spia_db:
+    output:
+        "resources/spia-db.rds",
+    log:
+        "logs/spia-db.log",
+    params:
+        bioc_species_pkg=bioc_species_pkg,
+        species=get_bioc_species_name(),
+        pathway_db=config["enrichment"]["spia"]["pathway_database"],
+        common_src=str(workflow.source_path("../scripts/common.R")),
+    conda:
+        enrichment_env
+    retries: 3
+    cache: True
+    script:
+        "../scripts/get-spia-db.R"
diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml
@@ -22,8 +22,6 @@ properties:
             type: string
           build:
             type: string
-          species_db_version:
-            type: string
           pfam:
             type: string
           representative_transcripts:
@@ -32,7 +30,6 @@ properties:
           - species
           - release
           - build
-          - species_db_version
           - pfam
           - representative_transcripts
       ontology:
@@ -136,7 +133,6 @@ properties:
         required:
           - pathway_database
 
-
   bootstrap_plots:
     type: object
     properties:

diff --git a/workflow/scripts/common.R b/workflow/scripts/common.R
@@ -1,21 +1,5 @@
 library("tidyverse")
 
-load_bioconductor_package <- function(path_to_bioc_pkg, pkg_name) {
-
-    lib <- str_remove(path_to_bioc_pkg, pkg_name)
-
-    # ensure that dependencies of the pkg are also found at same location
-    .libPaths( c( lib , .libPaths() ) )
-
-    library(pkg_name, character.only = TRUE)
-
-    print(str_c("loaded package ", pkg_name))
-
-    # ensure that library() calls outside this function don't go looking in the
-    # location needed here
-    .libPaths( .libPaths()[-1] )
-}
-
 get_prefix_col <- function(prefix, col_names) {
 
     covariate <- snakemake@params[["covariate"]]

diff --git a/workflow/scripts/ens_gene_to_go.R b/workflow/scripts/ens_gene_to_go.R
@@ -2,14 +2,13 @@ log <- file(snakemake@log[[1]], open="wt")
 sink(log)
 sink(log, type="message")
 
+library(snakemake@params[["bioc_species_pkg"]], character.only = TRUE)
+
 # provides `tidyverse` and get_prefix_col()
 source(snakemake@params[["common_src"]])
 
-pkg <- snakemake@params[["bioc_pkg"]]
-load_bioconductor_package(snakemake@input[["species_anno"]], pkg)
-
-ens_gene_to_go <- AnnotationDbi::select(  get(pkg),
-                                          keys=keys(get(pkg), keytype="ENSEMBL"),
+ens_gene_to_go <- AnnotationDbi::select(  get(snakemake@params[["bioc_species_pkg"]]),
+                                          keys=keys(get(snakemake@params[["bioc_species_pkg"]]), keytype="ENSEMBL"),
                                           columns=c("GO"),
                                           keytype="ENSEMBL"
                                           ) %>%

diff --git a/workflow/scripts/fgsea.R b/workflow/scripts/fgsea.R
@@ -3,15 +3,13 @@ sink(log)
 sink(log, type="message")
 
 library("fgsea")
+library(snakemake@params[["bioc_species_pkg"]], character.only = TRUE)
 
 # provides library("tidyverse") and the function get_prefix_col(); the
 # latter requires snakemake@output[["samples"]] and
 # snakemake@params[["covariate"]]
 source(snakemake@params[["common_src"]])
 
-pkg <- snakemake@params[["bioc_pkg"]]
-load_bioconductor_package(snakemake@input[["species_anno"]], pkg)
-
 gene_sets <- gmtPathways(snakemake@input[["gene_sets"]])
 diffexp <- read_tsv(snakemake@input[["diffexp"]]) %>%
                   drop_na(ext_gene) %>%
@@ -128,7 +126,7 @@ height = .7 * (length(selected_gene_sets) + 2)
 
 # table plot of all gene sets
 tg <- plotGseaTable(
-            pathway = selected_gene_sets,
+            pathways = selected_gene_sets,
             stats = ranked_genes,
             fgseaRes = fgsea_res,
             gseaParam = 1,
@@ -143,7 +141,7 @@ height = .7 * (length(selected_gene_sets) + 2)
 
 # table plot of all gene sets
 tg <- plotGseaTable(
-            pathway = selected_gene_sets,
+            pathways = selected_gene_sets,
             stats = ranked_genes,
             fgseaRes = fgsea_res,
             gseaParam = 1,

diff --git a/workflow/scripts/get-spia-db.R b/workflow/scripts/get-spia-db.R
@@ -0,0 +1,19 @@
+log <- file(snakemake@log[[1]], open="wt")
+sink(log)
+sink(log, type="message")
+
+library("SPIA")
+library("graphite")
+library(snakemake@params[["bioc_species_pkg"]], character.only = TRUE)
+
+# provides library("tidyverse") and the function get_prefix_col(); the
+# latter requires snakemake@output[["samples"]] and
+# snakemake@params[["covariate"]]
+source(snakemake@params[["common_src"]])
+
+pw_db <- snakemake@params[["pathway_db"]]
+
+db <- pathways(snakemake@params[["species"]], pw_db)
+db <- convertIdentifiers(db, "ENSEMBL")
+
+saveRDS(db, snakemake@output[[1]])
diff --git a/workflow/scripts/sleuth-diffexp.R b/workflow/scripts/sleuth-diffexp.R
@@ -168,7 +168,7 @@ write_results <- function(so, mode, output, output_all) {
         stop("No canonical transcripts found (does ensembl support canonical transcript annotation for your species?")
       }
       # Control FDR again, because we have fewer tests now.
-      all$qval <- p.adjust(all$pval)
+      all$qval <- p.adjust(all$pval, method = "BH")
     } else if (mode == "custom") {
       # load custom ID list
       id_version_pattern <- "\\.\\d+$"