From 0c9bdf329087f0751fc4d064f36e788928e514d9 Mon Sep 17 00:00:00 2001 From: Joshua Shapiro Date: Fri, 26 Jul 2024 14:08:42 -0400 Subject: [PATCH 01/10] convert obsm to ndarrays --- bin/move_counts_anndata.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/bin/move_counts_anndata.py b/bin/move_counts_anndata.py index 75e0412d..9db7926a 100755 --- a/bin/move_counts_anndata.py +++ b/bin/move_counts_anndata.py @@ -3,12 +3,15 @@ # This script takes an AnnData object and checks for the `logcounts` # in layers. If present, `logcounts` is moved to `X` and `X` (which has the raw counts) # is moved to `raw.X` +# In addition, any DataFrames in `obsm` are conerted to ndarrays + import argparse import os import re import anndata as adata +import pandas as pd parser = argparse.ArgumentParser() parser.add_argument( @@ -51,5 +54,10 @@ object.uns["X_name"] = "logcounts" del object.layers["logcounts"] - # export object - object.write_h5ad(args.anndata_file, compression="gzip" if args.compress else None) +# convert DataFrames in obsm to ndarrays +for key, value in object.obsm.items(): + if isinstance(value, pd.DataFrame): + object.obsm[key] = value.to_numpy() + +# export object +object.write_h5ad(args.anndata_file, compression="gzip" if args.compress else None) From f31126ffb2e21470f6149a926aee5ae9a084f514 Mon Sep 17 00:00:00 2001 From: Joshua Shapiro Date: Fri, 26 Jul 2024 14:09:08 -0400 Subject: [PATCH 02/10] use lower case for obsm --- bin/sce_to_anndata.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/sce_to_anndata.R b/bin/sce_to_anndata.R index 8c7b3216..6236556c 100755 --- a/bin/sce_to_anndata.R +++ b/bin/sce_to_anndata.R @@ -120,8 +120,8 @@ format_czi <- function(sce) { # so everything gets set to false rowData(sce)$feature_is_filtered <- FALSE - # paste X to any present reduced dim names - reducedDimNames(sce) <- glue::glue("X_{reducedDimNames(sce)}") + # paste X to any present reduced dim names, converting to lower case + reducedDimNames(sce) <- glue::glue("X_{tolower(reducedDimNames(sce))}") return(sce) } From 8b83978037587cb642b6d0782b5bfa11d472f4ca Mon Sep 17 00:00:00 2001 From: Joshua Shapiro Date: Fri, 26 Jul 2024 14:34:41 -0400 Subject: [PATCH 03/10] add PCA variance output --- bin/sce_to_anndata.R | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/bin/sce_to_anndata.R b/bin/sce_to_anndata.R index 6236556c..80af7100 100755 --- a/bin/sce_to_anndata.R +++ b/bin/sce_to_anndata.R @@ -23,6 +23,12 @@ option_list <- list( type = "character", help = "path to output hdf5 file to store RNA counts as AnnData object. Must end in .hdf5, .h5ad, or .h5" ), + make_option( + opt_str = c("--output_pca_tsv"), + default = NULL, + type = "character", + help = "path to output a table of variance explained by each principal component. Must end in .tsv" + ), make_option( opt_str = c("--feature_name"), type = "character", @@ -62,6 +68,11 @@ if (!(stringr::str_ends(opt$output_rna_h5, ".hdf5|.h5|.h5ad"))) { stop("output rna file name must end in .hdf5, .h5, or .h5ad") } +# check that the pca file is a tsv +if (!is.null(opt$output_pca_tsv) && !stringr::str_ends(opt$output_pca_tsv, ".tsv")) { + stop("output pca file name must end in .tsv") +} + # Merged object function ------------------------------------------------------ # this function updates merged object formatting for anndata export @@ -161,6 +172,18 @@ scpcaTools::sce_to_anndata( compression = ifelse(opt$compress_output, "gzip", "none") ) +# Get PCA metadata for AnnData +if (!is.null(opt$output_pca_tsv) && "X_pca" %in% reducedDimNames(sce)) { + pca_meta_df <- data.frame( + PC = 1:ncol(reducedDims(sce)$X_pca), + variance = attr(reducedDims(sce)$X_pca, "varExplained"), + variance_ratio = attr(reducedDims(sce)$X_pca, "percentVar") / 100 + ) + + # write pca to tsv + readr::write_tsv(pca_meta_df, opt$output_pca_tsv) +} + message("Exported RNA") # AltExp to AnnData ----------------------------------------------------------- From 83045190dfc1687a297800e87a06478076a1cf98 Mon Sep 17 00:00:00 2001 From: Joshua Shapiro Date: Fri, 26 Jul 2024 14:35:34 -0400 Subject: [PATCH 04/10] add variable gene conversion and PCA metadata --- bin/move_counts_anndata.py | 73 ++++++++++++++++++++++++++++---------- 1 file changed, 55 insertions(+), 18 deletions(-) diff --git a/bin/move_counts_anndata.py b/bin/move_counts_anndata.py index 9db7926a..d4cd975c 100755 --- a/bin/move_counts_anndata.py +++ b/bin/move_counts_anndata.py @@ -1,16 +1,17 @@ #!/usr/bin/env python3 +""" +This script takes an AnnData object and checks for the `logcounts` in layers. +If present, `logcounts` is moved to `X` and `X` (which has the raw counts) is moved to `raw.X` -# This script takes an AnnData object and checks for the `logcounts` -# in layers. If present, `logcounts` is moved to `X` and `X` (which has the raw counts) -# is moved to `raw.X` -# In addition, any DataFrames in `obsm` are conerted to ndarrays - +In addition, any DataFrames in `obsm` are conerted to ndarrays, highly variable genes are converted to a `var` column. +If a `pca_meta_file` is provided, PCA variance statistics and standard creation values are in the format expected by scanpy. +""" import argparse import os import re -import anndata as adata +import anndata import pandas as pd parser = argparse.ArgumentParser() @@ -28,6 +29,12 @@ action="store_false", help="Output an uncompressed HDF5 file", ) +parser.add_argument( + "--pca_meta_file", + dest="pca_meta_file", + required=False, + help="Path to file with a table of variance explained by each PCA component", +) args = parser.parse_args() @@ -39,25 +46,55 @@ raise FileExistsError("`input_anndata` does not exist.") elif not file_ext.search(args.anndata_file): raise ValueError( - "--input_anndata must end in either .hdf5, .h5, or .h5ad, and contain a processed AnnData object." + "input_anndata must end in either .hdf5, .h5, or .h5ad, and contain a processed AnnData adata." ) +# if there is a pca_meta_file, check that it exists and has a tsv extension +if args.pca_meta_file: + if not os.path.exists(args.pca_meta_file): + raise FileExistsError("`pca_meta_file` does not exist.") + elif not args.pca_meta_file.endswith(".tsv"): + raise ValueError("`pca_meta_file` must end in .tsv.") + # read in anndata -object = adata.read_h5ad(args.anndata_file) +adata = anndata.read_h5ad(args.anndata_file) # if logcounts is present -if "logcounts" in object.layers: - # move X to raw.X by creating the raw object - object.raw = object +if "logcounts" in adata.layers: + # move X to raw.X by creating the raw adata + adata.raw = adata # move logcounts to X and rename - object.X = object.layers["logcounts"] - object.uns["X_name"] = "logcounts" - del object.layers["logcounts"] + adata.X = adata.layers["logcounts"] + adata.uns["X_name"] = "logcounts" + del adata.layers["logcounts"] # convert DataFrames in obsm to ndarrays -for key, value in object.obsm.items(): +for key, value in adata.obsm.items(): if isinstance(value, pd.DataFrame): - object.obsm[key] = value.to_numpy() + adata.obsm[key] = value.to_numpy() + +# convert highly variable genes to a column +adata.var["highly_variable"] = adata.var.gene_ids.isin( + adata.uns["highly_variable_genes"] +) + +# add pca adata to uns if pca_meta_file is provided in the format created by scanpy +if args.pca_meta_file: + pca_meta = pd.read_csv(args.pca_meta_file, sep="\t", index_col=0) + if "variance" not in pca_meta.columns or "variance_ratio" not in pca_meta.columns: + raise ValueError( + "`pca_meta_file` must contain columns `variance` and `variance_ratio`" + ) + pca_object = { + "param": { + "zero_center": True, + "use_highly_variable": True, + "mask_var": "highly_variable", + }, + "variance": pca_meta["variance"].to_numpy(), + "variance_ratio": pca_meta["variance_ratio"].to_numpy(), + } + adata.uns["pca"] = pca_object -# export object -object.write_h5ad(args.anndata_file, compression="gzip" if args.compress else None) +# export adata +adata.write_h5ad(args.anndata_file, compression="gzip" if args.compress else None) From e7d4758543b605a66a8fb941bff6abed21c488de Mon Sep 17 00:00:00 2001 From: Joshua Shapiro Date: Fri, 26 Jul 2024 14:46:48 -0400 Subject: [PATCH 05/10] change script name --- bin/{move_counts_anndata.py => reformat_anndata.py} | 12 ++++++------ merge.nf | 4 ++-- modules/export-anndata.nf | 8 +++++--- 3 files changed, 13 insertions(+), 11 deletions(-) rename bin/{move_counts_anndata.py => reformat_anndata.py} (100%) diff --git a/bin/move_counts_anndata.py b/bin/reformat_anndata.py similarity index 100% rename from bin/move_counts_anndata.py rename to bin/reformat_anndata.py index d4cd975c..ea1d0d26 100755 --- a/bin/move_counts_anndata.py +++ b/bin/reformat_anndata.py @@ -22,6 +22,12 @@ required=True, help="Path to HDF5 file with processed AnnData object", ) +parser.add_argument( + "--pca_meta_file", + dest="pca_meta_file", + required=False, + help="Path to file with a table of variance explained by each PCA component", +) parser.add_argument( "-u", "--uncompressed", @@ -29,12 +35,6 @@ action="store_false", help="Output an uncompressed HDF5 file", ) -parser.add_argument( - "--pca_meta_file", - dest="pca_meta_file", - required=False, - help="Path to file with a table of variance explained by each PCA component", -) args = parser.parse_args() diff --git a/merge.nf b/merge.nf index 19e47125..d72ed3ef 100644 --- a/merge.nf +++ b/merge.nf @@ -109,8 +109,8 @@ process export_anndata { ${has_adt ? "--feature_name adt" : ''} # move normalized counts to X in AnnData - move_counts_anndata.py --anndata_file ${rna_h5ad_file} - ${has_adt ? "move_counts_anndata.py --anndata_file ${feature_h5ad_file}" : ''} + reformat_anndata.py --anndata_file ${rna_h5ad_file} + ${has_adt ? "reformat_anndata.py --anndata_file ${feature_h5ad_file}" : ''} """ stub: rna_h5ad_file = "${merge_group_id}_merged_rna.h5ad" diff --git a/modules/export-anndata.nf b/modules/export-anndata.nf index de42cc62..bb83209d 100644 --- a/modules/export-anndata.nf +++ b/modules/export-anndata.nf @@ -12,21 +12,23 @@ process export_anndata { script: rna_h5ad_file = "${meta.library_id}_${file_type}_rna.h5ad" feature_h5ad_file = "${meta.library_id}_${file_type}_${meta.feature_type}.h5ad" + pca_meta_file = "${meta.library_id}_${file_type}_pca.tsv" feature_present = meta.feature_type in ["adt"] """ sce_to_anndata.R \ --input_sce_file ${sce_file} \ --output_rna_h5 ${rna_h5ad_file} \ --output_feature_h5 ${feature_h5ad_file} \ + --output_pca_tsv ${pca_meta_file} \ ${feature_present ? "--feature_name ${meta.feature_type}" : ''} \ ${file_type != "processed" ? "--compress_output" : ''} - # move any normalized counts to X in AnnData + # move any normalized counts to X in AnnData, convert matrices, and add PCA metadata if [ "${file_type}" = "processed" ]; then - move_counts_anndata.py --anndata_file ${rna_h5ad_file} + reformat_anndata.py --anndata_file ${rna_h5ad_file} --pca_meta_file ${pca_meta_file} # move counts in feature data, if the file exists if [ -f "${feature_h5ad_file}" ]; then - move_counts_anndata.py --anndata_file ${feature_h5ad_file} + reformat_anndata.py --anndata_file ${feature_h5ad_file} fi fi From 588475ba460019db3d8aee4011a85d39a6c2d28b Mon Sep 17 00:00:00 2001 From: Joshua Shapiro Date: Mon, 29 Jul 2024 09:39:10 -0400 Subject: [PATCH 06/10] make tsv case-insensitive --- bin/predict_cellassign.py | 4 ++-- bin/reformat_anndata.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/bin/predict_cellassign.py b/bin/predict_cellassign.py index 22fc6d32..eb9bf431 100755 --- a/bin/predict_cellassign.py +++ b/bin/predict_cellassign.py @@ -63,7 +63,7 @@ scvi.settings.num_threads = args.threads # compile extension regex -file_ext = re.compile(r"\.h5ad$|\.hdf5$|.h5$", re.IGNORECASE) +file_ext = re.compile(r"\.(hdf5|h5|h5ad)$", re.IGNORECASE) # check that input file exists, if it does exist, make sure it's an h5 file if not os.path.exists(args.anndata_file): @@ -78,7 +78,7 @@ raise FileExistsError("--reference file not found.") # make sure output file path is tsv file -if not args.output_predictions.endswith(".tsv"): +if not args.output_predictions.casefold().endswith(".tsv"): raise ValueError("--output_predictions must provide a file path ending in tsv") # read in references as marker gene tables diff --git a/bin/reformat_anndata.py b/bin/reformat_anndata.py index ea1d0d26..f6d09b50 100755 --- a/bin/reformat_anndata.py +++ b/bin/reformat_anndata.py @@ -39,7 +39,7 @@ args = parser.parse_args() # compile extension regex -file_ext = re.compile(r"\.hdf5$|\.h5$|\.h5ad$", re.IGNORECASE) +file_ext = re.compile(r"\.(hdf5|h5|h5ad)$", re.IGNORECASE) # check that input file exists, if it does exist, make sure it's an h5 file if not os.path.exists(args.anndata_file): @@ -53,7 +53,7 @@ if args.pca_meta_file: if not os.path.exists(args.pca_meta_file): raise FileExistsError("`pca_meta_file` does not exist.") - elif not args.pca_meta_file.endswith(".tsv"): + elif not args.pca_meta_file.casefold().endswith(".tsv"): raise ValueError("`pca_meta_file` must end in .tsv.") # read in anndata From 09904b546859e6b9bddf4b797dec465c000e1623 Mon Sep 17 00:00:00 2001 From: Joshua Shapiro Date: Mon, 29 Jul 2024 10:53:55 -0400 Subject: [PATCH 07/10] add args for pca values --- bin/reformat_anndata.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/bin/reformat_anndata.py b/bin/reformat_anndata.py index f6d09b50..435bb573 100755 --- a/bin/reformat_anndata.py +++ b/bin/reformat_anndata.py @@ -28,6 +28,18 @@ required=False, help="Path to file with a table of variance explained by each PCA component", ) +parser.add_argument( + "--pca_not_centered", + dest="pca_centered", + action="store_false", + help="Indicate that the PCA table is not zero-centered", +) +parser.add_argument( + "--mask_var", + dest="mask_var", + default="highly_variable", + help="Indicate that variable used for highly variable genes. Use the value 'None' if no highly variable genes were used.", +) parser.add_argument( "-u", "--uncompressed", @@ -87,9 +99,9 @@ ) pca_object = { "param": { - "zero_center": True, - "use_highly_variable": True, - "mask_var": "highly_variable", + "zero_center": args.pca_centered, + "use_highly_variable": args.highly_variable.casefold() != "none", + "mask_var": args.highly_variable, }, "variance": pca_meta["variance"].to_numpy(), "variance_ratio": pca_meta["variance_ratio"].to_numpy(), From 8dc9701e3e6b4b7dfa7cdecdfc917e00d5a43e97 Mon Sep 17 00:00:00 2001 From: Joshua Shapiro Date: Mon, 29 Jul 2024 16:51:56 -0400 Subject: [PATCH 08/10] Apply suggestions from code review Co-authored-by: Ally Hawkins <54039191+allyhawkins@users.noreply.github.com> --- bin/reformat_anndata.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bin/reformat_anndata.py b/bin/reformat_anndata.py index 435bb573..dba521c6 100755 --- a/bin/reformat_anndata.py +++ b/bin/reformat_anndata.py @@ -38,7 +38,7 @@ "--mask_var", dest="mask_var", default="highly_variable", - help="Indicate that variable used for highly variable genes. Use the value 'None' if no highly variable genes were used.", + help="Indicate the name used to store highly variable genes associated with the PCA present in the exported AnnData object. Use the value 'None' if no highly variable genes were used.", ) parser.add_argument( "-u", @@ -100,8 +100,8 @@ pca_object = { "param": { "zero_center": args.pca_centered, - "use_highly_variable": args.highly_variable.casefold() != "none", - "mask_var": args.highly_variable, + "use_highly_variable": args.mask_var.casefold() != "none", + "mask_var": args.mask_var, }, "variance": pca_meta["variance"].to_numpy(), "variance_ratio": pca_meta["variance_ratio"].to_numpy(), From 734ee95a86292ce0bd65c10ce874ad3dfc0462f0 Mon Sep 17 00:00:00 2001 From: Joshua Shapiro Date: Mon, 29 Jul 2024 17:01:09 -0400 Subject: [PATCH 09/10] Only do hvg copying if hvg is specified --- bin/reformat_anndata.py | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/bin/reformat_anndata.py b/bin/reformat_anndata.py index dba521c6..b6348b64 100755 --- a/bin/reformat_anndata.py +++ b/bin/reformat_anndata.py @@ -35,10 +35,13 @@ help="Indicate that the PCA table is not zero-centered", ) parser.add_argument( - "--mask_var", - dest="mask_var", - default="highly_variable", - help="Indicate the name used to store highly variable genes associated with the PCA present in the exported AnnData object. Use the value 'None' if no highly variable genes were used.", + "--hvg_name", + dest="hvg_name", + default="highly_variable_genes", + help=( + "Indicate the name used to store highly variable genes associated with the PCA in the exported AnnData object." + " Use the value 'none' if no highly variable genes were not used." + ), ) parser.add_argument( "-u", @@ -85,10 +88,13 @@ if isinstance(value, pd.DataFrame): adata.obsm[key] = value.to_numpy() -# convert highly variable genes to a column -adata.var["highly_variable"] = adata.var.gene_ids.isin( - adata.uns["highly_variable_genes"] -) +# convert highly variable genes to a column if given +use_hvg = args.hvg_name.casefold() != "none" +if use_hvg: + if args.hvg_name not in adata.uns.keys(): + raise ValueError("`hvg_name` must be present in the `uns` data for the object") + adata.var["highly_variable"] = adata.var.gene_ids.isin(adata.uns[args.hvg_name]) + # add pca adata to uns if pca_meta_file is provided in the format created by scanpy if args.pca_meta_file: @@ -100,8 +106,8 @@ pca_object = { "param": { "zero_center": args.pca_centered, - "use_highly_variable": args.mask_var.casefold() != "none", - "mask_var": args.mask_var, + "use_highly_variable": use_hvg, + "mask_var": ("highly_variable" if use_hvg else None), }, "variance": pca_meta["variance"].to_numpy(), "variance_ratio": pca_meta["variance_ratio"].to_numpy(), From 6a4a59f8e3b9f6f25af8bf1bcc3cee22734c8fc6 Mon Sep 17 00:00:00 2001 From: Joshua Shapiro Date: Mon, 29 Jul 2024 17:05:33 -0400 Subject: [PATCH 10/10] lower case the merged too --- bin/sce_to_anndata.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/sce_to_anndata.R b/bin/sce_to_anndata.R index 80af7100..db2784cb 100755 --- a/bin/sce_to_anndata.R +++ b/bin/sce_to_anndata.R @@ -78,7 +78,7 @@ if (!is.null(opt$output_pca_tsv) && !stringr::str_ends(opt$output_pca_tsv, ".tsv # this function updates merged object formatting for anndata export format_merged_sce <- function(sce) { # paste X to any present reduced dim names - reducedDimNames(sce) <- glue::glue("X_{reducedDimNames(sce)}") + reducedDimNames(sce) <- glue::glue("X_{tolower(reducedDimNames(sce))}") return(sce) }