diff --git a/DESCRIPTION b/DESCRIPTION index 59c29d6..4d22f32 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: zellkonverter Title: Conversion Between scRNA-seq Objects -Version: 1.1.3 -Date: 2021-01-22 +Version: 1.1.4 +Date: 2021-02-18 Authors@R: c(person(given = "Luke", family = "Zappia", diff --git a/NAMESPACE b/NAMESPACE index 68aeeaa..ccfe92a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -8,7 +8,12 @@ export(writeH5AD) import(SingleCellExperiment) import(SummarizedExperiment) importClassesFrom(Matrix,dgCMatrix) +importFrom(DelayedArray,blockApply) importFrom(DelayedArray,is_sparse) +importFrom(DelayedArray,nzdata) +importFrom(DelayedArray,nzindex) +importFrom(DelayedArray,rowAutoGrid) +importFrom(DelayedArray,type) importFrom(Matrix,sparseMatrix) importFrom(Matrix,t) importFrom(S4Vectors,DataFrame) diff --git a/NEWS.md b/NEWS.md index efb200c..83dfc2c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,6 +2,11 @@ * Bioconductor 3.13, April 2021 +## zellkonverter 1.1.4 (2021-02-18) + +* Handle writing **DelayedArray** assays on the R side in `writeH5AD()` + (PR #35, Fixes #32) + ## zellkonverter 1.1.3 (2021-01-22) * Adjust `SCE2AnnData()` example (Fixes #31) diff --git a/R/write.R b/R/write.R index 1ec05c0..c2c21a2 100644 --- a/R/write.R +++ b/R/write.R @@ -10,15 +10,31 @@ #' be ignored when writing to `file`. #' #' @details +#' +#' ## Environment +#' +#' When first run, this function will instantiate a conda environment +#' containing all of the necessary dependencies. This will not be performed on +#' any subsequent run or if any other **zellkonverter** function has been run +#' prior to this one. +#' +#' ## Skipping assays +#' #' Setting `skip_assays=TRUE` can occasionally be useful if the matrices in #' `sce` are stored in a format that is not amenable for efficient conversion #' to a **numpy**-compatible format. In such cases, it can be better to create #' an empty placeholder dataset in `file` and fill it in R afterwards. #' -#' When first run, this function will instantiate a conda environment -#' containing all of the necessary dependencies. This will not be performed on -#' any subsequent run or if any other -#' **zellkonverter** function has been run prior to this one. +#' ## **DelayedArray** assays +#' +#' If `sce` contains any **DelayedArray** matrices as assays `writeH5AD()` will +#' write them to disk using the **rhdf5** package directly rather than via +#' Python to avoid instantiating them in memory. However there is currently +#' an issue which prevents this being done for sparse **DelayedArray** matrices. +#' +#' ## Known conversion issues +#' +#' ### Coercion to factors #' #' The **anndata** package automatically converts some character vectors to #' factors when saving `.h5ad` files. This can effect columns of `rowData(sce)` @@ -50,13 +66,47 @@ #' #' @export #' @importFrom basilisk basiliskRun +#' @importFrom Matrix sparseMatrix +#' @importFrom DelayedArray is_sparse writeH5AD <- function(sce, file, X_name = NULL, skip_assays = FALSE) { + # Loop over and replace DelayedArrays. + ass_list <- assays(sce) + is_da <- logical(length(ass_list)) + for (a in seq_along(ass_list)) { + # Skip sparse DelayedArrays due to rhdf5 issue + # https://github.com/grimbough/rhdf5/issues/79 + if (is(ass_list[[a]], "DelayedMatrix") && !is_sparse(ass_list[[a]])) { + is_da[a] <- TRUE + assay(sce, a, withDimnames=FALSE) <- .make_fake_mat(dim(sce)) + } + } + file <- path.expand(file) basiliskRun( env = anndata_env, fun = .H5ADwriter, sce = sce, file = file, X_name = X_name, skip_assays = skip_assays ) + + # Going back out and replacing each of them. + if (any(is_da)) { + for (p in which(is_da)) { + if (p == 1L) { + curp <- "X" + } else { + curp <- file.path("layers", assayNames(sce)[p]) + } + rhdf5::h5delete(file, curp) + mat <- ass_list[[p]] + + if (!is_sparse(mat)) { + HDF5Array::writeHDF5Array(mat, file=file, name=curp) + } else { + .write_CSR_matrix(file, name=curp, mat=mat) + } + } + } + invisible(NULL) } @@ -66,3 +116,65 @@ writeH5AD <- function(sce, file, X_name = NULL, skip_assays = FALSE) { adata <- SCE2AnnData(sce, X_name = X_name, skip_assays = skip_assays) adata$write_h5ad(file) } + +# nocov start + +# Skipping code coverage on these function because they aren't used until the +# sparse DelayedArray rhdf5 issue mentioned above is addressed + +#' @importFrom DelayedArray blockApply rowAutoGrid type +.write_CSR_matrix <- function(file, name, mat, chunk_dim=10000) { + handle <- rhdf5::H5Fopen(file) + on.exit(rhdf5::H5Fclose(handle)) + + rhdf5::h5createGroup(handle, name) + ghandle <- rhdf5::H5Gopen(handle, name) + on.exit(rhdf5::H5Gclose(ghandle), add=TRUE, after=FALSE) + rhdf5::h5writeAttribute("csc_matrix", ghandle, "encoding-type") + rhdf5::h5writeAttribute("0.1.0", ghandle, "encoding-version") + rhdf5::h5writeAttribute(rev(dim(mat)), ghandle, "shape") + + rhdf5::h5createDataset(handle, file.path(name, "data"), dims=0, maxdims=rhdf5::H5Sunlimited(), + H5type=if (type(mat)=="integer") "H5T_NATIVE_INT32" else "H5T_NATIVE_DOUBLE", chunk = chunk_dim) + rhdf5::h5createDataset(handle, file.path(name, "indices"), dims=0, maxdims=rhdf5::H5Sunlimited(), + H5type="H5T_NATIVE_UINT32", chunk = chunk_dim) + + env <- new.env() # persist the 'last' counter. + env$last <- 0L + out <- blockApply(mat, grid=rowAutoGrid(mat), FUN=.blockwise_sparse_writer, env=env, + file=handle, name=name, as.sparse=TRUE) + + out <- as.double(unlist(out)) + iname <- file.path(name, "indptr") + rhdf5::h5createDataset(handle, iname, dims=length(out)+1L, H5type="H5T_NATIVE_UINT64") + rhdf5::h5writeDataset(c(0, cumsum(out)), handle, iname) +} + +#' @importFrom DelayedArray nzdata nzindex +.blockwise_sparse_writer <- function(block, env, file, name) { + nzdex <- nzindex(block) + i <- nzdex[,1] + j <- nzdex[,2] + v <- nzdata(block) + + o <- order(i) + i <- i[o] + j <- j[o] + v <- v[o] + + last <- env$last + index <- list(last + seq_along(j)) + + iname <- file.path(name, "indices") + rhdf5::h5set_extent(file, iname, last + length(j)) + rhdf5::h5writeDataset(j - 1L, file, iname, index=index) + + vname <- file.path(name, "data") + rhdf5::h5set_extent(file, vname, last + length(j)) + rhdf5::h5writeDataset(v, file, vname, index=index) + + env$last <- last + length(j) + tabulate(i, nrow(block)) +} + +# nocov end diff --git a/man/writeH5AD.Rd b/man/writeH5AD.Rd index ed01178..65f91de 100644 --- a/man/writeH5AD.Rd +++ b/man/writeH5AD.Rd @@ -24,21 +24,41 @@ A \code{NULL} is invisibly returned. Write a H5AD file from a \linkS4class{SingleCellExperiment} object. } \details{ +\subsection{Environment}{ + +When first run, this function will instantiate a conda environment +containing all of the necessary dependencies. This will not be performed on +any subsequent run or if any other \strong{zellkonverter} function has been run +prior to this one. +} + +\subsection{Skipping assays}{ + Setting \code{skip_assays=TRUE} can occasionally be useful if the matrices in \code{sce} are stored in a format that is not amenable for efficient conversion to a \strong{numpy}-compatible format. In such cases, it can be better to create an empty placeholder dataset in \code{file} and fill it in R afterwards. +} -When first run, this function will instantiate a conda environment -containing all of the necessary dependencies. This will not be performed on -any subsequent run or if any other -\strong{zellkonverter} function has been run prior to this one. +\subsection{\strong{DelayedArray} assays}{ + +If \code{sce} contains any \strong{DelayedArray} matrices as assays \code{writeH5AD()} will +write them to disk using the \strong{rhdf5} package directly rather than via +Python to avoid instantiating them in memory. However there is currently +an issue which prevents this being done for sparse \strong{DelayedArray} matrices. +} + +\subsection{Known conversion issues}{ +\subsection{Coercion to factors}{ The \strong{anndata} package automatically converts some character vectors to factors when saving \code{.h5ad} files. This can effect columns of \code{rowData(sce)} and \code{colData(sce)} which may change type when the \code{.h5ad} file is read back into R. } + +} +} \examples{ # Using the Zeisel brain dataset if (requireNamespace("scRNAseq", quietly = TRUE)) { diff --git a/tests/testthat/test-write.R b/tests/testthat/test-write.R index 1184df5..0d37f5c 100644 --- a/tests/testthat/test-write.R +++ b/tests/testthat/test-write.R @@ -2,6 +2,7 @@ # library(testthat); library(zellkonverter); source("test-write.R") library(scRNAseq) + sce <- ZeiselBrainData() reducedDim(sce, "WHEE") <- matrix(runif(ncol(sce) * 10), ncol = 10) @@ -85,3 +86,53 @@ test_that("writeH5AD works in a separate process", { basilisk::setBasiliskShared(oldshare) basilisk::setBasiliskFork(oldfork) }) + +test_that("writeH5AD DelayedArray X works", { + + delayed_sce <- sce + counts(delayed_sce) <- DelayedArray::DelayedArray(counts(delayed_sce)) + + temp <- tempfile(fileext = '.h5ad') + + writeH5AD(delayed_sce, temp, X_name = "counts") + expect_true(file.exists(temp)) + + out <- readH5AD(temp) + + expect_identical(counts(sce), assay(out, "X")) +}) + +test_that("writeH5AD sparse DelayedArray X works", { + + delayed_sce <- sce + sparse_counts <- as(counts(delayed_sce), "dgCMatrix") + counts(delayed_sce) <- DelayedArray::DelayedArray(sparse_counts) + + temp <- tempfile(fileext = '.h5ad') + + writeH5AD(delayed_sce, temp, X_name = "counts") + expect_true(file.exists(temp)) + + out <- readH5AD(temp) + + # Sparse DelayedArrays are currently coerced into memory + # This expectation will need to be changed once that is fixed + expect_identical(sparse_counts, assay(out, "X")) +}) + +test_that("writeH5AD DelayedArray layer works", { + + delayed_sce <- sce + assay(delayed_sce, "layer") <- DelayedArray::DelayedArray( + counts(delayed_sce) + ) + + temp <- tempfile(fileext = '.h5ad') + + writeH5AD(delayed_sce, temp) + expect_true(file.exists(temp)) + + out <- readH5AD(temp) + + expect_identical(counts(sce), assay(out, "layer")) +})