-
Notifications
You must be signed in to change notification settings - Fork 2
/
classify_SingleR.R
118 lines (99 loc) · 3.7 KB
/
classify_SingleR.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
#!/usr/bin/env Rscript
# This script is used to classify and annotate cells using SingleR
# import libraries
suppressPackageStartupMessages({
library(optparse)
library(SingleCellExperiment)
})
# set up arguments
option_list <- list(
make_option(
opt_str = c("-i", "--input_sce_file"),
type = "character",
help = "path to rds file with input sce object"
),
make_option(
opt_str = c("-o", "--output_sce_file"),
type = "character",
help = "path to output rds file to store processed sce object. Must end in .rds"
),
make_option(
opt_str = c("--singler_models"),
type = "character",
help = "list of models generated for use with SingleR. Each input file contains
a list of models generated from a single reference, one each for each label type:
`label.main`, `label.fine`, and `label.ont`."
),
make_option(
opt_str = c("--seed"),
type = "integer",
help = "A random seed for reproducibility."
),
make_option(
opt_str = c("-t", "--threads"),
type = "integer",
default = 1,
help = "Number of multiprocessing threads to use."
)
)
opt <- parse_args(OptionParser(option_list = option_list))
# Set up -----------------------------------------------------------------------
# set seed
set.seed(opt$random_seed)
# check that input file file exists
if(!file.exists(opt$input_sce_file)){
stop("Missing input SCE file")
}
# check that references all exist
model_files <- unlist(stringr::str_split(opt$singler_models, ","))
if(!all(file.exists(model_files))){
missing_files <- model_files[which(!file.exists(model_files))]
glue::glue("
Missing model file(s): {missing_files}
")
stop("Please make sure that all provided SingleR models exist.")
}
# set up multiprocessing params
if(opt$threads > 1){
bp_param = BiocParallel::MulticoreParam(opt$threads)
} else {
bp_param = BiocParallel::SerialParam()
}
# read in input rds file
sce <- readr::read_rds(opt$input_sce_file)
# read in references as a list of lists
# each file contains a named list of models generated using the same reference dataset
# but unique labels in the reference dataset
model_names <- stringr::str_remove(basename(model_files), "_model.rds")
names(model_files) <- model_names
model_list <- purrr::map(model_files, readr::read_rds) |>
# ensure we have label type before reference name
# example: label.main_HumanPrimaryCellAtlasData
# where `label.main` is the name of the model stored in the file and
# `HumanPrimaryCellAtlasData` is the name of the reference used for each file containing a list of models
purrr::imap(\(model_list, ref_name){
names(model_list) <- glue::glue("{names(model_list)}_{ref_name}")
model_list
}) |>
purrr::flatten()
# SingleR classify -------------------------------------------------------------
# create a partial function for mapping easily
classify_sce <- purrr::partial(SingleR::classifySingleR,
test = sce,
fine.tune=TRUE,
BPPARAM = bp_param)
# run singleR for all provided models
all_singler_results <- model_list |>
purrr::map(classify_sce)
# Annotate sce -----------------------------------------------------------------
# create a dataframe with a single column of annotations for each model used
all_annotations_df <- all_singler_results |>
purrr::map_dfc(\(result) result$pruned.labels ) |>
DataFrame()
colData(sce) <- cbind(colData(sce), all_annotations_df)
# store results in metadata
metadata(sce)$singler_results <- all_singler_results
# export sce with annotations added
readr::write_rds(sce,
opt$output_sce_file,
compress = 'gz')