Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dev gene mismatch #1594

Merged
merged 11 commits into from
Aug 27, 2022
Binary file removed .DS_Store
Binary file not shown.
244 changes: 164 additions & 80 deletions R/MultiModal.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,23 +9,30 @@
#' @param input A character of paths to 10x feature hdf5 file(s). These will traditionally have a suffix similar to "filtered_feature_bc_matrix.h5".
#' @param names A character of sample names associated with each input file.
#' @param strictMatch Only relevant when multiple input files are used. A boolean that indictes whether rows (genes) that do not match perfectly in the matrices
#' should be removed (`strictMatch = TRUE`) or coerced (`strictMatch = FALSE`). CellRanger seems to occassionally use different ensembl ids for the same gene across
#' should be removed (`strictMatch = TRUE`) or coerced (`strictMatch = FALSE`). Cell Ranger seems to occassionally use different ensembl ids for the same gene across
#' different samples. If you are comfortable tolerating such mismatches, you can coerce all matrices to fit together, in which case the gene metadata present in
#' the first listed sample will be applied to all matrices for that particular gene entry. Regardless of what value is used for `strictMatch`, this function
#' cannot tolerate mismatched gene names, only mismatched metadata for the same gene.
#' the first listed sample will be applied to all matrices for that particular gene entry. Sometimes, the h5 files from Cell Ranger have mismatches in the actual gene names.
#' See the `force` paramter for handling mismatched gene names.
#' @param verbose Only relevant when multiple input files are used. A boolean that indicates whether messaging about mismatches should be verbose (`TRUE`) or minimal (`FALSE`)
#' @param featureType The name of the feature to extract from the 10x feature file.
#' See https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/advanced/h5_matrices for more information.
#' @param features A genomic ranges object containing a "name" column to help fill missing 10x intervals for RSE.
#' For example, in hg38 features provided could be using Bioconductors `genes(EnsDb.Hsapiens.v86::EnsDb.Hsapiens.v86)`.
#' @param features A genomic ranges object containing a "name" column to help fill missing 10x intervals for RSE. With the default CellRanger output, mitochondrial
#' genes are not given genomic ranges. If you wish to recover these transcripts, for example for QC-based filtering, you must provide ranges for these genes. The same
#' applies for any genes in the provided `h5` files that are missing ranges. As an example, in hg38 you could use Bioconductors
#' `features = genes(EnsDb.Hsapiens.v86::EnsDb.Hsapiens.v86)`. Note that you must load the corresponding Bioconductor package to have access to the `genes` function.
#' @param genesToExclude A character vector of gene names to be excluded from the returned `SummarizedExperiment` object. This can be useful when dealing with very large
#' datasets because R has an upper limit for how large a given object can be. If you have >1.5 million cells, you cannot create a matrix in R with all ~30,000 genes so you
#' can use this parameter to remove genes upfront.
#' @export
import10xFeatureMatrix <- function(
input = NULL,
names = NULL,
strictMatch = TRUE,
verbose = TRUE,
featureType = "Gene Expression",
features = NULL
features = NULL,
genesToExclude = NULL,
force = FALSE
){

.validInput(input = input, name = "input", valid = c("character"))
Expand All @@ -34,67 +41,113 @@ import10xFeatureMatrix <- function(
.validInput(input = verbose, name = "verbose", valid = c("boolean"))
.validInput(input = featureType, name = "featureType", valid = c("character"))
.validInput(input = features, name = "features", valid = c("GRanges", "NULL"))
.validInput(input = genesToExclude, name = "genesToExclude", valid = c("character","null"))
.validInput(input = force, name = "force", valid = c("boolean"))

if (!all(file.exists(input))) {
stop("Not all input file paths exist!")
}

message("Importing Feature Matrix ", 1, " of ", length(input))
rse_final <- .import10xToSE(
h5 = input[1],
type10x = featureType,
name = names[1],
ranges = features
)
#convert the gene names to exclude to gene indices to exclude. These must be removed from each imported SE upfront
#currently, this assumes that all other samples will follow the same indexing of the first sample which may be a risky
#assumption but I dont want to match each sample individually based on gene names as we already know that some samples
#use different gene names
if(!is.null(genesToExclude)) {
genesToExclude <- which(rownames(rse_final) %in% genesToExclude)
}

if(length(genesToExclude) > 0) {
rse_final <- rse_final[-genesToExclude,]
}

rowsToRemove <- c() #row indices that need to be removed from rse_final because they do not match across samples being merged

#if more than one filtered feature barcode matrix is supplied, then merge the RSE objects
if (length(input) > 1) {
message("Merging individual RNA objects...")

#merge all others into 1st

rowsToRemove <- c() #rows that have previously been removed from rse_final
#merge all others into 1st -- the first sample is the template sample

#for each additional feature matrix (starting with the second), look for mismatches with rse_final and merge accordingly
for (i in seq(2, length(input))){

message(sprintf("\nMerging %s", names[i]))
message("Importing Feature Matrix ", i, " of ", length(input))

#Read RSE
res_i <- .import10xToSE(
rse_i <- .import10xToSE(
h5 = input[i],
type10x = featureType,
name = names[i],
ranges = features
)


if(length(genesToExclude) > 0) {
rse_i <- rse_i[-genesToExclude,]
}

mismatchWarning <- TRUE #a boolean to prevent output of the warning message many times and only output it once

if (!identical(rownames(rse_final), rownames(res_i))) {
stop("Error - rownames (genes) of individual RNA objects are not equivalent.")

if (!identical(rownames(rse_final), rownames(rse_i))) {
if(force) {
#remove rows from rse_i that dont exist in rse_final
iNotFinal <- which(rownames(rse_i) %ni% rownames(rse_final))
geneMismatch <- rownames(rse_i)[iNotFinal]
if(length(iNotFinal) > 0) {
rse_i <- rse_i[-iNotFinal,]
}

#earmark mismatched rows for downstream removal that dont exist in rse_i but do exist in rse_final
finalNotI <- which(rownames(rse_final) %ni% rownames(rse_i))
rowsToRemove <- unique(c(rowsToRemove, finalNotI))
geneMismatch <- unique(c(geneMismatch, rownames(rse_final)[finalNotI]))
#add dummy rows to rse_i with these gene names and rowData. this allows for proper alignment but these rows will eventually be removed
dummySE <- rse_i[1:length(finalNotI)]
rowData(dummySE) <- rowData(rse_final[rownames(rse_final)[finalNotI],])
rowRanges(dummySE) <- rowRanges(rse_final[rownames(rse_final)[finalNotI],])
rownames(dummySE) <- rownames(rse_final)[finalNotI]
rse_i <- SummarizedExperiment::rbind(rse_i, dummySE)
rse_i <- ArchR:::.sortRSE(rse_i)

#check to ensure that the rownames now exactly match. if not something is wrong
if(!identical(rownames(rse_final), rownames(rse_i))) {
stop("Error - Something has gone wrong with attempting to align gene names across your individual h5 files. Please report to GitHub Issues.")
}

message(sprintf("Warning! Some gene names do not match between sample %s and the reference sample (sample #1). \"force = TRUE\" so the following genes will be removed:\n%s\n", i, paste(geneMismatch, collapse = ",")))

} else {
stop("Error - rownames (genes) of individual RNA objects are not equivalent. To bypass this, set \"force = TRUE\" and the offending genes will be removed.")
}
}
if (!identical(colnames(rowData(rse_final)), colnames(rowData(res_i)))) {
stop("Error - rowData (gene metadata) of individual RNA objects have different columns. This is highly unusual and merging has been aborted.")
if (!identical(colnames(rowData(rse_final)), colnames(rowData(rse_i)))) {
stop("Error - rowData (gene metadata) of individual RNA objects have different columns. This is highly unusual and merging has been aborted. Please check your h5 input files..")
}
if (!identical(names(assays(rse_final)), names(assays(res_i)))) {
stop("Error - available assays of individual RNA objects are not equivalent. Each object is expected to only have one assay named 'counts'.")
if (!identical(names(assays(rse_final)), names(assays(rse_i)))) {
stop("Error - available assays of individual RNA objects are not equivalent. Each object is expected to only have one assay named 'counts'. Merging has been aborted. Please check your h5 input files.")
}

#check each column in rowData to check for mismatches that should be thrown as warnings
#occasionally, it seems like 10x is annotating different ensembl IDs to the same gene which seems like a bad way to go
#occasionally, it seems like 10x is annotating different ensembl IDs to the same gene which seems like a bad way to go.
#this is a bit heavy-handed but it seems like the safest thing to do is report any mismatch rather than merge blindly

for (x in seq_len(ncol(rowData(rse_final)))){
if (!identical(rowData(rse_final)[,x], rowData(res_i)[,x])) {
if (!identical(rowData(rse_final)[,x], rowData(rse_i)[,x])) {
if(mismatchWarning) {
message(sprintf("Warning! Some values within column \"%s\" of the rowData (gene metadata) of your objects do not precisely match!", colnames(rowData(rse_final))[x]))
message("This is often caused by slight variations in Ensembl IDs and gene locations used by cellranger across different samples. ArchR will ignore these mismatches and allow merging to proceed but you should check to make sure that these are ok for your data.\n")
message("This is often caused by slight variations in Ensembl IDs and gene locations used by cellranger across different samples. ArchR will ignore these mismatches and allow merging to proceed but you should check to make sure that these are ok for your data. See the strictMatch parameter for how these mismatches will be handled.\n")
mismatchWarning <- FALSE
}

#detect all of the mismatches betwenn rse_final and the current featureMat
mismatch <- which(rowData(rse_final)[,x] != rowData(res_i)[,x])
mismatch <- which(rowData(rse_final)[,x] != rowData(rse_i)[,x])
#for each detected mismatch, handle the mismatch according to the value of strictMatch
for (y in 1:length(mismatch)) {
if (verbose) {
Expand All @@ -106,45 +159,57 @@ import10xFeatureMatrix <- function(
}
rowsToRemove <- unique(c(rowsToRemove, mismatch[y]))
#temporarily force the data to match so that merging can occur easily. Mismatched rows will be removed later
rowData(res_i)[mismatch[y],] <- rowData(rse_final)[mismatch[y],]
rowRanges(res_i)[mismatch[y]] <- rowRanges(rse_final)[mismatch[y]]
rowData(rse_i)[mismatch[y],] <- rowData(rse_final)[mismatch[y],]
rowRanges(rse_i)[mismatch[y]] <- rowRanges(rse_final)[mismatch[y]]
} else {
if (verbose) {
message("strictMatch = FALSE so mismatching information will be coerced to match the first sample provided.")
}
rowData(res_i)[mismatch[y],] <- rowData(rse_final)[mismatch[y],]
rowRanges(res_i)[mismatch[y]] <- rowRanges(rse_final)[mismatch[y]]
rowData(rse_i)[mismatch[y],] <- rowData(rse_final)[mismatch[y],]
rowRanges(rse_i)[mismatch[y]] <- rowRanges(rse_final)[mismatch[y]]
}
}
}
}

rse_final <- SummarizedExperiment::cbind(rse_final, res_i)

#merge rse_i into res_final and garbage collect
rse_final <- SummarizedExperiment::cbind(rse_final, rse_i)
remove(rse_i)
gc()
}


}

if(length(rowsToRemove) > 0){
rse_final <- rse_final[-rowsToRemove,]
}

if (strictMatch) {
if(length(rowsToRemove) > 0) {
rse_final <- rse_final[-rowsToRemove,]
if("rowRanges" %in% slotNames(rse_final)){
idxNA <- which(as.character(seqnames(rse_final)) =="chrNA")
if(length(idxNA) > 0){
namesNA <- paste0(rowRanges(rse_final)$name[idxNA], collapse=",")
warning(
"These features had no interval present and were given a fake GRanges location on `chrNA`:\n\n",
namesNA,
"\n\nEither modify the output SE or see `features` input to use a reference GRanges to add to these features!"
)
}
}

rse_final

}

.import10xToSE <- function(
h5 = NULL,
type10x = NULL,
name = NULL,
ranges = NULL
){

){
#Shape
shape <- h5read(h5, "/matrix/shape")

#Read features10x
features10x <- h5read(h5, "/matrix/features")
features10x <- lapply(seq_along(features10x), function(x){
Expand All @@ -161,7 +226,7 @@ import10xFeatureMatrix <- function(
}
})
features10x <- Reduce("cbind",features10x[!unlist(lapply(features10x,is.null))])

#Determine Idx
if(!is.null(type10x)){
idx <- which(paste0(features10x$feature_type) %in% type10x)
Expand All @@ -177,54 +242,73 @@ import10xFeatureMatrix <- function(
)
)
}

#Subset
features10x <- features10x[idx, , drop=FALSE]

#Interval
if("interval" %in% colnames(features10x)){

idxNA <- which(features10x$interval=="NA")

if(length(idxNA) > 0){

#if ranges = NULL, then interval is ignored(?)
if(!is.null(ranges) & is(ranges, "GRanges")){
#Fix ranges
idx1 <- paste0(seqnames(ranges)) %in% c(1:22, "X", "Y", "MT")
if(length(idx1) > 0){
ranges2 <- GRanges(
seqnames = ifelse(idx1, paste0("chr",seqnames(ranges)), paste0(seqnames(ranges))),
ranges = GenomicRanges::ranges(ranges)
)
mcols(ranges2) <- mcols(ranges)
}

#Try To Use Ranges To Match
features10xNA <- features10x[which(features10x$interval=="NA"),,drop=FALSE]
namesNA <- features10xNA$name
idxFix <- match(namesNA, mcols(ranges2)[, grep("name", colnames(mcols(ranges)), ignore.case=TRUE)])
if(length(idxFix[!is.na(idxFix)]) > 0){
message("Correcting missing intervals...")
idx2 <- which(!is.na(idxFix))
rangesFix <- ranges2[idxFix[idx2]]
strand(rangesFix) <- "*"
features10xNA$interval[idx2] <- paste0(rangesFix)
features10x[which(features10x$interval=="NA"), ] <- features10xNA
}

#NA add Fake Chromosome
features10xNA <- features10x[which(features10x$interval=="NA"),,drop=FALSE]
if(nrow(features10xNA) > 0){
features10xNA$interval <- paste0("chrNA:1-1")
features10x[which(features10x$interval=="NA"), ] <- features10xNA
}

features10x$ranges <- GRanges(paste0(features10x$interval))
features10x$interval <- NULL

#Fix ranges
idx1 <- paste0(seqnames(ranges)) %in% c(1:22, "X", "Y", "MT")
if(length(idx1) > 0){
ranges2 <- GRanges(
seqnames = ifelse(idx1, paste0("chr",seqnames(ranges)), paste0(seqnames(ranges))),
ranges = GenomicRanges::ranges(ranges)
)
mcols(ranges2) <- mcols(ranges)
}
}else{

#Try To Use Ranges To Match
features10xNA <- features10x[which(features10x$interval=="NA"),,drop=FALSE]
namesNA <- features10xNA$name
idxFix <- match(namesNA, mcols(ranges2)[, grep("name", colnames(mcols(ranges)), ignore.case=TRUE)])
if(length(idxFix[!is.na(idxFix)]) > 0){
message("Correcting missing intervals...")
idx2 <- which(!is.na(idxFix))
rangesFix <- ranges2[idxFix[idx2]]
strand(rangesFix) <- "*"
features10xNA$interval[idx2] <- paste0(rangesFix)
features10x[which(features10x$interval=="NA"), ] <- features10xNA
}
#NA add Fake Chromosome
features10xNA <- features10x[which(features10x$interval=="NA"),,drop=FALSE]
if(nrow(features10xNA) > 0){
features10xNA$interval <- paste0("chrNA:1-1")
features10x[which(features10x$interval=="NA"), ] <- features10xNA
}

#NA add Fake Chromosome
features10xNA <- features10x[which(features10x$interval=="NA"),,drop=FALSE]
if(nrow(features10xNA) > 0){
features10xNA$interval <- paste0("chrUNK:1-1")
features10x[which(features10x$interval=="NA"), ] <- features10xNA
}
features10x$ranges <- GRanges(paste0(features10x$interval))
features10x$interval <- NULL

}

}else {
features10x$ranges <- GRanges(paste0(features10x$interval))
features10x$interval <- NULL
}

features10x$ranges <- GRanges(paste0(features10x$interval))
features10x$interval <- NULL


}

#Read Matrix
mat <- sparseMatrix(
i = h5read(h5, "/matrix/indices"),
Expand All @@ -239,11 +323,11 @@ import10xFeatureMatrix <- function(
}else{
colnames(mat) <- barcodes
}

#Subset
mat <- mat[idx, , drop = FALSE]
gc()

#Summarized Experiment
if("ranges" %in% colnames(features10x)){
mat <- SummarizedExperiment(
Expand All @@ -262,10 +346,10 @@ import10xFeatureMatrix <- function(
rowData = features10x
)
}

rownames(mat) <- rowData(mat)$name
.sortRSE(mat)

}

.sortRSE <- function(rse){
Expand Down