Skip to content

Commit

Permalink
fix: Updated the get-transcript-info.R file and its dependencies (#73)
Browse files Browse the repository at this point in the history
Co-authored-by: Johannes Köster <johannes.koester@uni-due.de>
Co-authored-by: Manuel Philip <manuel@sleepy.uk-essen.de>
Co-authored-by: Manuel Philip <manuel@bashful.uk-essen.de>
Co-authored-by: Johannes Köster <johannes.koester@tu-dortmund.de>
Co-authored-by: Manuel Philip <mphilip@c41.ikim.uk-essen.de>
  • Loading branch information
6 people authored Jun 14, 2023
1 parent 4704319 commit e44d424
Show file tree
Hide file tree
Showing 5 changed files with 98 additions and 57 deletions.
1 change: 1 addition & 0 deletions workflow/envs/biomart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ channels:
dependencies:
- bioconductor-biomart =2.46
- r-tidyverse =1.3
- r-dplyr =1.0.9
9 changes: 9 additions & 0 deletions workflow/resources/datavzrd/diffexp-template.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,15 @@ views:
domain:
- 0.0
- 1.0
chromosome_name:
optional: true
display-mode: hidden
transcript_mane_select:
optional: true
display-mode: hidden
ensembl_transcript_id_version:
optional: true
display-mode: hidden
test_stat:
display-mode: hidden
rss:
Expand Down
2 changes: 1 addition & 1 deletion workflow/rules/quant_3prime.smk
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ if is_3prime_experiment:

rule kallisto_3prime_index:
input:
fasta="resources/transcriptome.3prime.fasta",
fasta="resources/transcriptome_clean.3prime.fasta",
output:
index="results/kallisto_3prime/transcripts.3prime.idx",
log:
Expand Down
1 change: 1 addition & 0 deletions workflow/rules/ref.smk
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ rule get_transcript_info:
params:
species=get_bioc_species_name(),
version=config["resources"]["ref"]["release"],
three_prime_activated=is_3prime_experiment,
log:
"logs/get_transcript_info.log",
conda:
Expand Down
142 changes: 86 additions & 56 deletions workflow/scripts/get-transcript-info.R
Original file line number Diff line number Diff line change
@@ -1,16 +1,17 @@
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library("biomaRt")
library("tidyverse")
library("dplyr")

# this variable holds a mirror name until
# useEnsembl succeeds ("www" is last, because
# useEnsembl succeeds ("www" is last, because
# of very frequent "Internal Server Error"s)
mart <- "useast"
rounds <- 0
while ( class(mart)[[1]] != "Mart" ) {
while (class(mart)[[1]] != "Mart") {
mart <- tryCatch(
{
# done here, because error function does not
Expand Down Expand Up @@ -39,69 +40,98 @@ while ( class(mart)[[1]] != "Mart" ) {
}
# hop to next mirror
mart <- switch(mart,
useast = "uswest",
uswest = "asia",
asia = "www",
www = {
# wait before starting another round through the mirrors,
# hoping that intermittent problems disappear
Sys.sleep(30)
"useast"
}
)
useast = "uswest",
uswest = "asia",
asia = "www",
www = {
# wait before starting another round through the mirrors,
# hoping that intermittent problems disappear
Sys.sleep(30)
"useast"
}
)
}
)
}
three_prime_activated <- snakemake@params[["three_prime_activated"]]

attributes <- c("ensembl_transcript_id",
"ensembl_gene_id",
"external_gene_name",
"description")

has_canonical <- "transcript_is_canonical" %in% biomaRt::listAttributes(mart=mart)$name

if (has_canonical) {
attributes <- c(attributes, "transcript_is_canonical")
has_canonical <-
"transcript_is_canonical" %in% biomaRt::listAttributes(mart = mart)$name
#Check if three_prime_activated is activated or else if transcipts are cononical
if (has_canonical && three_prime_activated) {
attributes <- c(attributes, "transcript_is_canonical", "chromosome_name",
"transcript_mane_select", "ensembl_transcript_id_version")
has_mane_select <-
"transcript_mane_select" %in% biomaRt::listAttributes(mart = mart)$name
}else if (has_canonical) {
attributes <- c(attributes, "transcript_is_canonical")
}

t2g <- biomaRt::getBM(
attributes = attributes,
mart = mart,
useCache = FALSE
)
if (!has_canonical) {
t2g <- t2g %>% add_column(transcript_is_canonical = NA)
attributes = attributes,
mart = mart,
useCache = FALSE
)
# Set columns as NA if three_prime_activated is set to false or if the transcipts are not canonical
if (!has_canonical || !three_prime_activated) {
t2g <- t2g %>% add_column(chromosome_name = NA, transcript_mane_select = NA,
ensembl_transcript_id_version = NA)
}else if (!has_canonical) {
t2g <- t2g %>% add_column(transcript_is_canonical = NA)
}

t2g <- t2g %>%
rename( target_id = ensembl_transcript_id,
ens_gene = ensembl_gene_id,
ext_gene = external_gene_name,
gene_desc = description,
canonical = transcript_is_canonical
) %>%
mutate_at(
vars(gene_desc),
function(values) { str_trim(map(values, function (v) { str_split(v, r"{\[}")[[1]][1]})) } # remove trailing source annotation (e.g. [Source:HGNC Symbol;Acc:HGNC:5])
) %>%
mutate_at(
vars(canonical),
function(values) {
as_vector(
map(
str_trim(values),
function(v) {
if (is.na(v)) {
NA
} else if (v == "1") {
TRUE
} else if (v == "0") {
FALSE
}
}
)
)
rename(
target_id = ensembl_transcript_id,
ens_gene = ensembl_gene_id,
ext_gene = external_gene_name,
gene_desc = description,
canonical = transcript_is_canonical,
chromosome_name = chromosome_name,
transcript_mane_select = transcript_mane_select,
ensembl_transcript_id_version = ensembl_transcript_id_version,
) %>%
mutate_at(
vars(gene_desc),
function(values) {
str_trim(map(values, function(v) {
str_split(v, r"{\[}")[[1]][1]
}))
} # remove trailing source annotation (e.g. [Source:HGNC Symbol;Acc:HGNC:5])
) %>%
mutate_at(
vars(canonical),
function(values) {
as_vector(
map(
str_trim(values),
function(v) {
if (is.na(v)) {
NA
} else if (v == "1") {
TRUE
} else if (v == "0") {
FALSE
}
}
)

)
}
)
# Check if 3-prime-rna-seq is activated, filter transcipts that are mane selected and filter chromosomes that are defined as "patch"
if (three_prime_activated) {
if (has_mane_select) {
t2g <- t2g %>%
filter(!str_detect(chromosome_name, "patch|PATCH")) %>%
filter(str_detect(transcript_mane_select, ""))
}else {
stop(
str_c(
"needed mane_selected column in biomart if three prime mode is activated"
)
)
}
}
write_rds(t2g, file = snakemake@output[[1]], compress = "gz")

0 comments on commit e44d424

Please sign in to comment.