Merge pull request #774 from AlexsLemonade/jashapiro/array-pca

AlexsLemonade · Jul 30, 2024 · 634e3e5 · 634e3e5
2 parents 574d4b9 + 6a4a59f
commit 634e3e5
Show file tree

Hide file tree

Showing 6 changed files with 153 additions and 65 deletions.
diff --git a/bin/move_counts_anndata.py b/bin/move_counts_anndata.py
diff --git a/bin/predict_cellassign.py b/bin/predict_cellassign.py
@@ -63,7 +63,7 @@
 scvi.settings.num_threads = args.threads
 
 # compile extension regex
-file_ext = re.compile(r"\.h5ad$|\.hdf5$|.h5$", re.IGNORECASE)
+file_ext = re.compile(r"\.(hdf5|h5|h5ad)$", re.IGNORECASE)
 
 # check that input file exists, if it does exist, make sure it's an h5 file
 if not os.path.exists(args.anndata_file):
@@ -78,7 +78,7 @@
     raise FileExistsError("--reference file not found.")
 
 # make sure output file path is tsv file
-if not args.output_predictions.endswith(".tsv"):
+if not args.output_predictions.casefold().endswith(".tsv"):
     raise ValueError("--output_predictions must provide a file path ending in tsv")
 
 # read in references as marker gene tables

diff --git a/bin/reformat_anndata.py b/bin/reformat_anndata.py
@@ -0,0 +1,118 @@
+#!/usr/bin/env python3
+"""
+This script takes an AnnData object and checks for the `logcounts` in layers.
+If present, `logcounts` is moved to `X` and `X` (which has the raw counts) is moved to `raw.X`
+
+In addition, any DataFrames in `obsm` are conerted to ndarrays, highly variable genes are converted to a `var` column.
+If a `pca_meta_file` is provided, PCA variance statistics and standard creation values are in the format expected by scanpy.
+"""
+
+import argparse
+import os
+import re
+
+import anndata
+import pandas as pd
+
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    "-i",
+    "--anndata_file",
+    dest="anndata_file",
+    required=True,
+    help="Path to HDF5 file with processed AnnData object",
+)
+parser.add_argument(
+    "--pca_meta_file",
+    dest="pca_meta_file",
+    required=False,
+    help="Path to file with a table of variance explained by each PCA component",
+)
+parser.add_argument(
+    "--pca_not_centered",
+    dest="pca_centered",
+    action="store_false",
+    help="Indicate that the PCA table is not zero-centered",
+)
+parser.add_argument(
+    "--hvg_name",
+    dest="hvg_name",
+    default="highly_variable_genes",
+    help=(
+        "Indicate the name used to store highly variable genes associated with the PCA in the exported AnnData object."
+        " Use the value 'none' if no highly variable genes were not used."
+    ),
+)
+parser.add_argument(
+    "-u",
+    "--uncompressed",
+    dest="compress",
+    action="store_false",
+    help="Output an uncompressed HDF5 file",
+)
+
+args = parser.parse_args()
+
+# compile extension regex
+file_ext = re.compile(r"\.(hdf5|h5|h5ad)$", re.IGNORECASE)
+
+# check that input file exists, if it does exist, make sure it's an h5 file
+if not os.path.exists(args.anndata_file):
+    raise FileExistsError("`input_anndata` does not exist.")
+elif not file_ext.search(args.anndata_file):
+    raise ValueError(
+        "input_anndata must end in either .hdf5, .h5, or .h5ad, and contain a processed AnnData adata."
+    )
+
+# if there is a pca_meta_file, check that it exists and has a tsv extension
+if args.pca_meta_file:
+    if not os.path.exists(args.pca_meta_file):
+        raise FileExistsError("`pca_meta_file` does not exist.")
+    elif not args.pca_meta_file.casefold().endswith(".tsv"):
+        raise ValueError("`pca_meta_file` must end in .tsv.")
+
+# read in anndata
+adata = anndata.read_h5ad(args.anndata_file)
+
+# if logcounts is present
+if "logcounts" in adata.layers:
+    # move X to raw.X by creating the raw adata
+    adata.raw = adata
+    # move logcounts to X and rename
+    adata.X = adata.layers["logcounts"]
+    adata.uns["X_name"] = "logcounts"
+    del adata.layers["logcounts"]
+
+# convert DataFrames in obsm to ndarrays
+for key, value in adata.obsm.items():
+    if isinstance(value, pd.DataFrame):
+        adata.obsm[key] = value.to_numpy()
+
+# convert highly variable genes to a column if given
+use_hvg = args.hvg_name.casefold() != "none"
+if use_hvg:
+    if args.hvg_name not in adata.uns.keys():
+        raise ValueError("`hvg_name` must be present in the `uns` data for the object")
+    adata.var["highly_variable"] = adata.var.gene_ids.isin(adata.uns[args.hvg_name])
+
+
+# add pca adata to uns if pca_meta_file is provided in the format created by scanpy
+if args.pca_meta_file:
+    pca_meta = pd.read_csv(args.pca_meta_file, sep="\t", index_col=0)
+    if "variance" not in pca_meta.columns or "variance_ratio" not in pca_meta.columns:
+        raise ValueError(
+            "`pca_meta_file` must contain columns `variance` and `variance_ratio`"
+        )
+    pca_object = {
+        "param": {
+            "zero_center": args.pca_centered,
+            "use_highly_variable": use_hvg,
+            "mask_var": ("highly_variable" if use_hvg else None),
+        },
+        "variance": pca_meta["variance"].to_numpy(),
+        "variance_ratio": pca_meta["variance_ratio"].to_numpy(),
+    }
+    adata.uns["pca"] = pca_object
+
+# export adata
+adata.write_h5ad(args.anndata_file, compression="gzip" if args.compress else None)
diff --git a/bin/sce_to_anndata.R b/bin/sce_to_anndata.R
@@ -23,6 +23,12 @@ option_list <- list(
     type = "character",
     help = "path to output hdf5 file to store RNA counts as AnnData object. Must end in .hdf5, .h5ad, or .h5"
   ),
+  make_option(
+    opt_str = c("--output_pca_tsv"),
+    default = NULL,
+    type = "character",
+    help = "path to output a table of variance explained by each principal component. Must end in .tsv"
+  ),
   make_option(
     opt_str = c("--feature_name"),
     type = "character",
@@ -62,12 +68,17 @@ if (!(stringr::str_ends(opt$output_rna_h5, ".hdf5|.h5|.h5ad"))) {
   stop("output rna file name must end in .hdf5, .h5, or .h5ad")
 }
 
+# check that the pca file is a tsv
+if (!is.null(opt$output_pca_tsv) && !stringr::str_ends(opt$output_pca_tsv, ".tsv")) {
+  stop("output pca file name must end in .tsv")
+}
+
 # Merged object function  ------------------------------------------------------
 
 # this function updates merged object formatting for anndata export
 format_merged_sce <- function(sce) {
   # paste X to any present reduced dim names
-  reducedDimNames(sce) <- glue::glue("X_{reducedDimNames(sce)}")
+  reducedDimNames(sce) <- glue::glue("X_{tolower(reducedDimNames(sce))}")
   return(sce)
 }
 
@@ -120,8 +131,8 @@ format_czi <- function(sce) {
   # so everything gets set to false
   rowData(sce)$feature_is_filtered <- FALSE
 
-  # paste X to any present reduced dim names
-  reducedDimNames(sce) <- glue::glue("X_{reducedDimNames(sce)}")
+  # paste X to any present reduced dim names, converting to lower case
+  reducedDimNames(sce) <- glue::glue("X_{tolower(reducedDimNames(sce))}")
 
   return(sce)
 }
@@ -161,6 +172,18 @@ scpcaTools::sce_to_anndata(
   compression = ifelse(opt$compress_output, "gzip", "none")
 )
 
+# Get PCA metadata for AnnData
+if (!is.null(opt$output_pca_tsv) && "X_pca" %in% reducedDimNames(sce)) {
+  pca_meta_df <- data.frame(
+    PC = 1:ncol(reducedDims(sce)$X_pca),
+    variance = attr(reducedDims(sce)$X_pca, "varExplained"),
+    variance_ratio = attr(reducedDims(sce)$X_pca, "percentVar") / 100
+  )
+
+  # write pca to tsv
+  readr::write_tsv(pca_meta_df, opt$output_pca_tsv)
+}
+
 message("Exported RNA")
 
 # AltExp to AnnData -----------------------------------------------------------

diff --git a/merge.nf b/merge.nf
@@ -109,8 +109,8 @@ process export_anndata {
         ${has_adt ? "--feature_name adt" : ''}
 
       # move normalized counts to X in AnnData
-      move_counts_anndata.py --anndata_file ${rna_h5ad_file}
-      ${has_adt ? "move_counts_anndata.py --anndata_file ${feature_h5ad_file}" : ''}
+      reformat_anndata.py --anndata_file ${rna_h5ad_file}
+      ${has_adt ? "reformat_anndata.py --anndata_file ${feature_h5ad_file}" : ''}
       """
     stub:
       rna_h5ad_file = "${merge_group_id}_merged_rna.h5ad"

diff --git a/modules/export-anndata.nf b/modules/export-anndata.nf
@@ -12,21 +12,23 @@ process export_anndata {
     script:
       rna_h5ad_file = "${meta.library_id}_${file_type}_rna.h5ad"
       feature_h5ad_file = "${meta.library_id}_${file_type}_${meta.feature_type}.h5ad"
+      pca_meta_file = "${meta.library_id}_${file_type}_pca.tsv"
       feature_present = meta.feature_type in ["adt"]
       """
       sce_to_anndata.R \
         --input_sce_file ${sce_file} \
         --output_rna_h5 ${rna_h5ad_file} \
         --output_feature_h5 ${feature_h5ad_file} \
+        --output_pca_tsv ${pca_meta_file} \
         ${feature_present ? "--feature_name ${meta.feature_type}" : ''} \
         ${file_type != "processed" ? "--compress_output" : ''}
 
-      # move any normalized counts to X in AnnData
+      # move any normalized counts to X in AnnData, convert matrices, and add PCA metadata
       if [ "${file_type}" = "processed" ]; then
-        move_counts_anndata.py --anndata_file ${rna_h5ad_file}
+        reformat_anndata.py --anndata_file ${rna_h5ad_file} --pca_meta_file ${pca_meta_file}
         # move counts in feature data, if the file exists
         if [ -f "${feature_h5ad_file}" ]; then
-          move_counts_anndata.py --anndata_file ${feature_h5ad_file}
+          reformat_anndata.py --anndata_file ${feature_h5ad_file}
         fi
       fi