Skip to content

Commit

Permalink
fix: fix gene-level p-value adjustment (use Benjamini-Hochberg instea…
Browse files Browse the repository at this point in the history
…d of Bonferroni-Holm) (#64)
  • Loading branch information
johanneskoester authored Dec 2, 2022
1 parent bd9c58d commit 6ea1682
Show file tree
Hide file tree
Showing 16 changed files with 86 additions and 108 deletions.
7 changes: 1 addition & 6 deletions .test/config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,6 @@ resources:
release: "104"
# genome build
build: GRCh38
# this is the version of the bioconda package `bioconductor-org.{species}`.eg.db` that
# you want -- this needs to be compatible with the versions `r-base` and the
# bioconductor packages specified e.g. in `envs/` files `fgsea.yaml`, `spia.yaml` and
# `ens_gene_to_go.yaml`
species_db_version: "3.13"
# pfam release to use for annotation of domains in differential splicing analysis
pfam: "33.0"
representative_transcripts: canonical
Expand Down Expand Up @@ -59,7 +54,7 @@ diffsplice:
remove_noncoding_orfs: false
# False discovery rate to control for.
fdr: 1.0
# Minimum size of differential isoform usage effect
# Minimum size of differential isoform usage effect
# (see dIFcutoff, https://rdrr.io/github/kvittingseerup/IsoformSwitchAnalyzeR/man/IsoformSwitchTestDEXSeq.html)
min_effect_size: 0.0

Expand Down
11 changes: 3 additions & 8 deletions config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,18 +9,13 @@ resources:
release: "104"
# genome build
build: GRCh38
# this is the version of the bioconda package `bioconductor-org.{species}`.eg.db` that
# you want -- this needs to be compatible with the versions `r-base` and the
# bioconductor packages specified e.g. in `envs/` files `fgsea.yaml`, `spia.yaml` and
# `ens_gene_to_go.yaml`
species_db_version: "3.13"
# pfam release to use for annotation of domains in differential splicing analysis
pfam: "33.0"
# Choose strategy for selecting representative transcripts for each gene.
# Possible values:
# - canonical (use the canonical transcript from ensembl, only works for human at the moment)
# - mostsignificant (use the most significant transcript)
# - path/to/any/file.txt (a path to a file with ensembl transcript IDs to use;
# - path/to/any/file.txt (a path to a file with ensembl transcript IDs to use;
# the user has to ensure that there is only one ID per gene given)
representative_transcripts: canonical
ontology:
Expand Down Expand Up @@ -49,7 +44,7 @@ diffexp:
# Binary valued covariate that shall be used for fold change/effect size
# based downstream analyses.
primary_variable: condition
# base level of the primary variable (will be considered as denominator
# base level of the primary variable (will be considered as denominator
# in the fold change/effect size estimation).
base_level: untreated
# significance level to use for volcano, ma- and qq-plots
Expand All @@ -67,7 +62,7 @@ diffsplice:
remove_noncoding_orfs: false
# False discovery rate to control for.
fdr: 0.05
# Minimum size of differential isoform usage effect
# Minimum size of differential isoform usage effect
# (see dIFcutoff, https://rdrr.io/github/kvittingseerup/IsoformSwitchAnalyzeR/man/IsoformSwitchTestDEXSeq.html)
min_effect_size: 0.1

Expand Down
10 changes: 10 additions & 0 deletions workflow/envs/enrichment.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
channels:
- conda-forge
- bioconda
- nodefaults
dependencies:
- r-base =4.2
- bioconductor-spia =2.50
- bioconductor-graphite =1.44
- r-tidyverse =1.3
- bioconductor-fgsea =1.24
7 changes: 0 additions & 7 deletions workflow/envs/ens_gene_to_go.yaml

This file was deleted.

8 changes: 0 additions & 8 deletions workflow/envs/fgsea.yaml

This file was deleted.

9 changes: 0 additions & 9 deletions workflow/envs/spia.yaml

This file was deleted.

23 changes: 17 additions & 6 deletions workflow/rules/common.smk
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from snakemake.utils import validate
import pandas as pd

import yaml
from pathlib import Path

##### load config and sample sheets #####

Expand Down Expand Up @@ -106,16 +107,26 @@ def get_bioc_species_name():
return first_letter + subspecies


def get_bioc_species_pkg(wildcards):
def get_bioc_species_pkg():
"""Get the package bioconductor package name for the the species in config.yaml"""
species_letters = get_bioc_species_name()[0:2].capitalize()
return "org.{species}.eg.db".format(species=species_letters)


def get_bioc_pkg_path(wildcards):
return "resources/bioconductor/lib/R/library/{pkg}".format(
pkg=get_bioc_species_pkg(wildcards)
)
def render_enrichment_env():
species_pkg = f"bioconductor-{get_bioc_species_pkg()}"
with open(workflow.source_path("../envs/enrichment.yaml")) as f:
env = yaml.load(f, Loader=yaml.SafeLoader)
env["dependencies"].append(species_pkg)
env_path = Path("resources/envs/enrichment.yaml")
env_path.parent.mkdir(parents=True, exist_ok=True)
with open(env_path, "w") as f:
yaml.dump(env, f)
return env_path.absolute()


bioc_species_pkg = get_bioc_species_pkg()
enrichment_env = render_enrichment_env()


def kallisto_params(wildcards, input):
Expand Down
33 changes: 8 additions & 25 deletions workflow/rules/enrichment.smk
Original file line number Diff line number Diff line change
@@ -1,28 +1,15 @@
from pathlib import Path


rule download_bioconductor_species_database:
output:
directory("resources/bioconductor/lib/R/library/{package}"), # TODO: encode version in path!
params:
path=lambda wc, output: Path(output[0]).parents[3],
version=config["resources"]["ref"]["species_db_version"],
log:
"logs/resources/bioconductor/{package}.log",
shell:
"conda create --quiet --yes -p {params.path} --channel conda-forge --channel bioconda "
"bioconductor-{wildcards.package}={params.version} > {log} 2>&1"


# topology- and interaction-aware pathway enrichment analysis


# TODO consider cellphonedb for receptor ligand interaction (Sarah Teichmann, Nature Methods?)
rule spia:
input:
samples="results/sleuth/samples.tsv",
species_anno=get_bioc_pkg_path,
diffexp="results/tables/diffexp/{model}.genes-representative.diffexp.tsv",
spia_db="resources/spia-db.rds",
output:
table=report(
"results/tables/pathways/{model}.pathways.tsv",
Expand All @@ -31,13 +18,12 @@ rule spia:
),
plots="results/plots/pathways/{model}.spia-perturbation-plots.pdf",
params:
bioc_pkg=get_bioc_species_pkg,
species=get_bioc_species_name(),
bioc_species_pkg=bioc_species_pkg,
pathway_db=config["enrichment"]["spia"]["pathway_database"],
covariate=lambda w: config["diffexp"]["models"][w.model]["primary_variable"],
common_src=str(workflow.source_path("../scripts/common.R")),
conda:
"../envs/spia.yaml"
enrichment_env
log:
"logs/tables/pathways/{model}.spia-pathways.log",
threads: 16
Expand All @@ -52,7 +38,6 @@ rule fgsea:
input:
samples="results/sleuth/samples.tsv",
diffexp="results/tables/diffexp/{model}.genes-representative.diffexp.tsv",
species_anno=get_bioc_pkg_path,
gene_sets=config["enrichment"]["fgsea"]["gene_sets_file"],
output:
enrichment=report(
Expand Down Expand Up @@ -81,14 +66,14 @@ rule fgsea:
category="Gene set enrichment analysis",
),
params:
bioc_pkg=get_bioc_species_pkg,
bioc_species_pkg=bioc_species_pkg,
model=get_model,
gene_set_fdr=config["enrichment"]["fgsea"]["fdr_gene_set"],
eps=config["enrichment"]["fgsea"]["eps"],
covariate=lambda w: config["diffexp"]["models"][w.model]["primary_variable"],
common_src=str(workflow.source_path("../scripts/common.R")),
conda:
"../envs/fgsea.yaml"
enrichment_env
log:
"logs/tables/fgsea/{model}.gene-set-enrichment.log",
threads: 8
Expand All @@ -114,7 +99,7 @@ rule fgsea_plot_gene_sets:
covariate=lambda w: config["diffexp"]["models"][w.model]["primary_variable"],
common_src=str(workflow.source_path("../scripts/common.R")),
conda:
"../envs/fgsea.yaml"
enrichment_env
log:
"logs/plots/fgsea/{model}.plot_fgsea_gene_set.log",
script:
Expand All @@ -125,15 +110,13 @@ rule fgsea_plot_gene_sets:


rule ens_gene_to_go:
input:
species_anno=get_bioc_pkg_path,
output:
"resources/ontology/ens_gene_to_go.tsv",
params:
bioc_pkg=get_bioc_species_pkg,
bioc_species_pkg=bioc_species_pkg,
common_src=str(workflow.source_path("../scripts/common.R")),
conda:
"../envs/ens_gene_to_go.yaml"
enrichment_env
log:
"logs/resources/ens_gene_to_go.log",
script:
Expand Down
18 changes: 18 additions & 0 deletions workflow/rules/ref.smk
Original file line number Diff line number Diff line change
Expand Up @@ -105,3 +105,21 @@ rule calculate_cpat_logit_model:
shell:
"make_logitModel.py --hex={input.hexamers} --cgene={input.cds} "
"--ngene={input.ncrna} -o {params.prefix} 2> {log}"


rule get_spia_db:
output:
"resources/spia-db.rds",
log:
"logs/spia-db.log",
params:
bioc_species_pkg=bioc_species_pkg,
species=get_bioc_species_name(),
pathway_db=config["enrichment"]["spia"]["pathway_database"],
common_src=str(workflow.source_path("../scripts/common.R")),
conda:
enrichment_env
retries: 3
cache: True
script:
"../scripts/get-spia-db.R"
4 changes: 0 additions & 4 deletions workflow/schemas/config.schema.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,6 @@ properties:
type: string
build:
type: string
species_db_version:
type: string
pfam:
type: string
representative_transcripts:
Expand All @@ -32,7 +30,6 @@ properties:
- species
- release
- build
- species_db_version
- pfam
- representative_transcripts
ontology:
Expand Down Expand Up @@ -136,7 +133,6 @@ properties:
required:
- pathway_database


bootstrap_plots:
type: object
properties:
Expand Down
16 changes: 0 additions & 16 deletions workflow/scripts/common.R
Original file line number Diff line number Diff line change
@@ -1,21 +1,5 @@
library("tidyverse")

load_bioconductor_package <- function(path_to_bioc_pkg, pkg_name) {

lib <- str_remove(path_to_bioc_pkg, pkg_name)

# ensure that dependencies of the pkg are also found at same location
.libPaths( c( lib , .libPaths() ) )

library(pkg_name, character.only = TRUE)

print(str_c("loaded package ", pkg_name))

# ensure that library() calls outside this function don't go looking in the
# location needed here
.libPaths( .libPaths()[-1] )
}

get_prefix_col <- function(prefix, col_names) {

covariate <- snakemake@params[["covariate"]]
Expand Down
9 changes: 4 additions & 5 deletions workflow/scripts/ens_gene_to_go.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,13 @@ log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library(snakemake@params[["bioc_species_pkg"]], character.only = TRUE)

# provides `tidyverse` and load_bioconductor_package()
source(snakemake@params[["common_src"]])

pkg <- snakemake@params[["bioc_pkg"]]
load_bioconductor_package(snakemake@input[["species_anno"]], pkg)

ens_gene_to_go <- AnnotationDbi::select( get(pkg),
keys=keys(get(pkg), keytype="ENSEMBL"),
ens_gene_to_go <- AnnotationDbi::select( get(snakemake@params[["bioc_species_pkg"]]),
keys=keys(get(snakemake@params[["bioc_species_pkg"]]), keytype="ENSEMBL"),
columns=c("GO"),
keytype="ENSEMBL"
) %>%
Expand Down
8 changes: 3 additions & 5 deletions workflow/scripts/fgsea.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,13 @@ sink(log)
sink(log, type="message")

library("fgsea")
library(snakemake@params[["bioc_species_pkg"]], character.only = TRUE)

# provides library("tidyverse") and functions load_bioconductor_package() and
# get_prefix_col(), the latter requires snakemake@output[["samples"]] and
# snakemake@params[["covariate"]]
source(snakemake@params[["common_src"]])

pkg <- snakemake@params[["bioc_pkg"]]
load_bioconductor_package(snakemake@input[["species_anno"]], pkg)

gene_sets <- gmtPathways(snakemake@input[["gene_sets"]])
diffexp <- read_tsv(snakemake@input[["diffexp"]]) %>%
drop_na(ext_gene) %>%
Expand Down Expand Up @@ -128,7 +126,7 @@ height = .7 * (length(selected_gene_sets) + 2)

# table plot of all gene sets
tg <- plotGseaTable(
pathway = selected_gene_sets,
pathways = selected_gene_sets,
stats = ranked_genes,
fgseaRes = fgsea_res,
gseaParam = 1,
Expand All @@ -143,7 +141,7 @@ height = .7 * (length(selected_gene_sets) + 2)

# table plot of all gene sets
tg <- plotGseaTable(
pathway = selected_gene_sets,
pathways = selected_gene_sets,
stats = ranked_genes,
fgseaRes = fgsea_res,
gseaParam = 1,
Expand Down
19 changes: 19 additions & 0 deletions workflow/scripts/get-spia-db.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library("SPIA")
library("graphite")
library(snakemake@params[["bioc_species_pkg"]], character.only = TRUE)

# provides library("tidyverse") and functions load_bioconductor_package() and
# get_prefix_col(), the latter requires snakemake@output[["samples"]] and
# snakemake@params[["covariate"]]
source(snakemake@params[["common_src"]])

pw_db <- snakemake@params[["pathway_db"]]

db <- pathways(snakemake@params[["species"]], pw_db)
db <- convertIdentifiers(db, "ENSEMBL")

saveRDS(db, snakemake@output[[1]])
2 changes: 1 addition & 1 deletion workflow/scripts/sleuth-diffexp.R
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ write_results <- function(so, mode, output, output_all) {
stop("No canonical transcripts found (does ensembl support canonical transcript annotation for your species?")
}
# Control FDR again, because we have less tests now.
all$qval <- p.adjust(all$pval)
all$qval <- p.adjust(all$pval, method = "BH")
} else if (mode == "custom") {
# load custom ID list
id_version_pattern <- "\\.\\d+$"
Expand Down
Loading

0 comments on commit 6ea1682

Please sign in to comment.