diff --git a/bin/move_counts_anndata.py b/bin/move_counts_anndata.py deleted file mode 100755 index 75e0412d..00000000 --- a/bin/move_counts_anndata.py +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env python3 - -# This script takes an AnnData object and checks for the `logcounts` -# in layers. If present, `logcounts` is moved to `X` and `X` (which has the raw counts) -# is moved to `raw.X` - -import argparse -import os -import re - -import anndata as adata - -parser = argparse.ArgumentParser() -parser.add_argument( - "-i", - "--anndata_file", - dest="anndata_file", - required=True, - help="Path to HDF5 file with processed AnnData object", -) -parser.add_argument( - "-u", - "--uncompressed", - dest="compress", - action="store_false", - help="Output an uncompressed HDF5 file", -) - -args = parser.parse_args() - -# compile extension regex -file_ext = re.compile(r"\.hdf5$|\.h5$|\.h5ad$", re.IGNORECASE) - -# check that input file exists, if it does exist, make sure it's an h5 file -if not os.path.exists(args.anndata_file): - raise FileExistsError("`input_anndata` does not exist.") -elif not file_ext.search(args.anndata_file): - raise ValueError( - "--input_anndata must end in either .hdf5, .h5, or .h5ad, and contain a processed AnnData object." - ) - -# read in anndata -object = adata.read_h5ad(args.anndata_file) - -# if logcounts is present -if "logcounts" in object.layers: - # move X to raw.X by creating the raw object - object.raw = object - # move logcounts to X and rename - object.X = object.layers["logcounts"] - object.uns["X_name"] = "logcounts" - del object.layers["logcounts"] - - # export object - object.write_h5ad(args.anndata_file, compression="gzip" if args.compress else None) diff --git a/bin/predict_cellassign.py b/bin/predict_cellassign.py index 22fc6d32..eb9bf431 100755 --- a/bin/predict_cellassign.py +++ b/bin/predict_cellassign.py @@ -63,7 +63,7 @@ scvi.settings.num_threads = args.threads # compile extension regex -file_ext = re.compile(r"\.h5ad$|\.hdf5$|.h5$", re.IGNORECASE) +file_ext = re.compile(r"\.(hdf5|h5|h5ad)$", re.IGNORECASE) # check that input file exists, if it does exist, make sure it's an h5 file if not os.path.exists(args.anndata_file): @@ -78,7 +78,7 @@ raise FileExistsError("--reference file not found.") # make sure output file path is tsv file -if not args.output_predictions.endswith(".tsv"): +if not args.output_predictions.casefold().endswith(".tsv"): raise ValueError("--output_predictions must provide a file path ending in tsv") # read in references as marker gene tables diff --git a/bin/reformat_anndata.py b/bin/reformat_anndata.py new file mode 100755 index 00000000..fdecae1b --- /dev/null +++ b/bin/reformat_anndata.py @@ -0,0 +1,118 @@ +#!/usr/bin/env python3 +""" +This script takes an AnnData object and checks for the `logcounts` in layers. +If present, `logcounts` is moved to `X` and `X` (which has the raw counts) is moved to `raw.X` + +In addition, any DataFrames in `obsm` are conerted to ndarrays, highly variable genes are converted to a `var` column. +If a `pca_meta_file` is provided, PCA variance statistics and standard creation values are in the format expected by scanpy. +""" + +import argparse +import os +import re + +import anndata +import pandas as pd + +parser = argparse.ArgumentParser() +parser.add_argument( + "-i", + "--anndata_file", + dest="anndata_file", + required=True, + help="Path to HDF5 file with processed AnnData object", +) +parser.add_argument( + "--pca_meta_file", + dest="pca_meta_file", + required=False, + help="Path to file with a table of variance explained by each PCA component", +) +parser.add_argument( + "--pca_not_centered", + dest="pca_centered", + action="store_false", + help="Indicate that the PCA table is not zero-centered", +) +parser.add_argument( + "--hvg_name", + dest="hvg_name", + default="highly_variable_genes", + help=( + "Indicate the name used to store highly variable genes associated with the PCA in the exported AnnData object." + "Use the value 'none' if no highly variable genes were used." + ), +) +parser.add_argument( + "-u", + "--uncompressed", + dest="compress", + action="store_false", + help="Output an uncompressed HDF5 file", +) + +args = parser.parse_args() + +# compile extension regex +file_ext = re.compile(r"\.(hdf5|h5|h5ad)$", re.IGNORECASE) + +# check that input file exists, if it does exist, make sure it's an h5 file +if not os.path.exists(args.anndata_file): + raise FileExistsError("`input_anndata` does not exist.") +elif not file_ext.search(args.anndata_file): + raise ValueError( + "input_anndata must end in either .hdf5, .h5, or .h5ad, and contain a processed AnnData adata." + ) + +# if there is a pca_meta_file, check that it exists and has a tsv extension +if args.pca_meta_file: + if not os.path.exists(args.pca_meta_file): + raise FileExistsError("`pca_meta_file` does not exist.") + elif not args.pca_meta_file.casefold().endswith(".tsv"): + raise ValueError("`pca_meta_file` must end in .tsv.") + +# read in anndata +adata = anndata.read_h5ad(args.anndata_file) + +# if logcounts is present +if "logcounts" in adata.layers: + # move X to raw.X by creating the raw adata + adata.raw = adata + # move logcounts to X and rename + adata.X = adata.layers["logcounts"] + adata.uns["X_name"] = "logcounts" + del adata.layers["logcounts"] + +# convert DataFrames in obsm to ndarrays +for key, value in adata.obsm.items(): + if isinstance(value, pd.DataFrame): + adata.obsm[key] = value.to_numpy() + +# convert highly variable genes to a column if given +use_hvg = args.hvg_name.casefold() != "none" +if use_hvg: + if args.hvg_name not in adata.uns.keys(): + raise ValueError("`hvg_name` must be present in the `uns` data for the object") + adata.var["highly_variable"] = adata.var.gene_ids.isin(adata.uns[args.hvg_name]) + + +# add pca adata to uns if pca_meta_file is provided in the format created by scanpy +if args.pca_meta_file: + pca_meta = pd.read_csv(args.pca_meta_file, sep="\t", index_col=0) + if "variance" not in pca_meta.columns or "variance_ratio" not in pca_meta.columns: + raise ValueError( + "`pca_meta_file` must contain columns `variance` and `variance_ratio`" + ) + pca_object = { + "param": { + "zero_center": args.pca_centered, + "use_highly_variable": use_hvg, + "mask_var": ("highly_variable" if use_hvg else None), + }, + "variance": pca_meta["variance"].to_numpy(), + "variance_ratio": pca_meta["variance_ratio"].to_numpy(), + } + adata.uns["pca"] = pca_object + +# export adata +adata.write_h5ad(args.anndata_file, compression="gzip" if args.compress else None) diff --git a/bin/sce_to_anndata.R b/bin/sce_to_anndata.R index 8c7b3216..52b3d058 100755 --- a/bin/sce_to_anndata.R +++ b/bin/sce_to_anndata.R @@ -23,6 +23,12 @@ option_list <- list( type = "character", help = "path to output hdf5 file to store RNA counts as AnnData object. Must end in .hdf5, .h5ad, or .h5" ), + make_option( + opt_str = c("--output_pca_tsv"), + default = NULL, + type = "character", + help = "path to output a table of variance explained by each principal component. Must end in .tsv" + ), make_option( opt_str = c("--feature_name"), type = "character", @@ -62,12 +68,17 @@ if (!(stringr::str_ends(opt$output_rna_h5, ".hdf5|.h5|.h5ad"))) { stop("output rna file name must end in .hdf5, .h5, or .h5ad") } +# check that the pca file is a tsv +if (!is.null(opt$output_pca_tsv) && !stringr::str_ends(opt$output_pca_tsv, ".tsv")) { + stop("output pca file name must end in .tsv") +} + # Merged object function ------------------------------------------------------ # this function updates merged object formatting for anndata export format_merged_sce <- function(sce) { # paste X to any present reduced dim names - reducedDimNames(sce) <- glue::glue("X_{reducedDimNames(sce)}") + reducedDimNames(sce) <- glue::glue("X_{tolower(reducedDimNames(sce))}") return(sce) } @@ -120,8 +131,8 @@ format_czi <- function(sce) { # so everything gets set to false rowData(sce)$feature_is_filtered <- FALSE - # paste X to any present reduced dim names - reducedDimNames(sce) <- glue::glue("X_{reducedDimNames(sce)}") + # paste X to any present reduced dim names, converting to lower case + reducedDimNames(sce) <- glue::glue("X_{tolower(reducedDimNames(sce))}") return(sce) } @@ -161,6 +172,18 @@ scpcaTools::sce_to_anndata( compression = ifelse(opt$compress_output, "gzip", "none") ) +# Get PCA metadata for AnnData +if (!is.null(opt$output_pca_tsv) && "X_pca" %in% reducedDimNames(sce)) { + pca_meta_df <- data.frame( + PC = 1:ncol(reducedDims(sce)$X_pca), + variance = attr(reducedDims(sce)$X_pca, "varExplained"), + variance_ratio = attr(reducedDims(sce)$X_pca, "percentVar") / 100 + ) + + # write pca to tsv + readr::write_tsv(pca_meta_df, opt$output_pca_tsv) +} + message("Exported RNA") # AltExp to AnnData ----------------------------------------------------------- @@ -209,5 +232,6 @@ if (!is.null(opt$feature_name)) { ") ) } + message("Exported alt") } } diff --git a/external-instructions.md b/external-instructions.md index 75bb478d..b017ff0e 100644 --- a/external-instructions.md +++ b/external-instructions.md @@ -86,12 +86,12 @@ Using the above command will run the workflow from the `main` branch of the work To update to the latest released version you can run `nextflow pull AlexsLemonade/scpca-nf` before the `nextflow run` command. To be sure that you are using a consistent version, you can specify use of a release tagged version of the workflow, set below with the `-r` flag. -The command below will pull the `scpca-nf` workflow directly from Github using the `v0.8.2` version. +The command below will pull the `scpca-nf` workflow directly from Github using the `v0.8.3` version. Released versions can be found on the [`scpca-nf` repository releases page](https://github.com/AlexsLemonade/scpca-nf/releases). ```sh nextflow run AlexsLemonade/scpca-nf \ - -r v0.8.2 \ + -r v0.8.3 \ -config \ -profile ``` @@ -325,7 +325,7 @@ If you will be analyzing spatial expression data, you will also need the Cell Ra If your compute nodes do not have internet access, you will likely have to pre-pull the required container images as well. When doing this, it is important to be sure that you also specify the revision (version tag) of the `scpca-nf` workflow that you are using. -For example, if you would run `nextflow run AlexsLemonade/scpca-nf -r v0.8.2`, then you will want to set `-r v0.8.2` for `get_refs.py` as well to be sure you have the correct containers. +For example, if you would run `nextflow run AlexsLemonade/scpca-nf -r v0.8.3`, then you will want to set `-r v0.8.3` for `get_refs.py` as well to be sure you have the correct containers. By default, `get_refs.py` will download files and images associated with the latest release. If your system uses Docker, you can add the `--docker` flag: diff --git a/internal-instructions.md b/internal-instructions.md index e4d049ba..8b757fcb 100644 --- a/internal-instructions.md +++ b/internal-instructions.md @@ -87,7 +87,7 @@ Please refer to our [`CONTRIBUTING.md`](CONTRIBUTING.md#stub-workflows) for more When running the workflow for a project or group of samples that is ready to be released on ScPCA portal, please use the tag for the latest release: ``` -nextflow run AlexsLemonade/scpca-nf -r v0.8.2 -profile ccdl,batch --project SCPCP000000 +nextflow run AlexsLemonade/scpca-nf -r v0.8.3 -profile ccdl,batch --project SCPCP000000 ``` ### Processing example data diff --git a/merge.nf b/merge.nf index 19e47125..bedb77ed 100644 --- a/merge.nf +++ b/merge.nf @@ -109,8 +109,8 @@ process export_anndata { ${has_adt ? "--feature_name adt" : ''} # move normalized counts to X in AnnData - move_counts_anndata.py --anndata_file ${rna_h5ad_file} - ${has_adt ? "move_counts_anndata.py --anndata_file ${feature_h5ad_file}" : ''} + reformat_anndata.py --anndata_file ${rna_h5ad_file} + ${has_adt ? "reformat_anndata.py --anndata_file ${feature_h5ad_file}" : ''} """ stub: rna_h5ad_file = "${merge_group_id}_merged_rna.h5ad" @@ -144,11 +144,11 @@ workflow { || (it.scpca_sample_id in run_ids) } .map{[ - seq_unit: it.seq_unit, - technology: it.technology, project_id: it.scpca_project_id, library_id: it.scpca_library_id, - sample_id: it.scpca_sample_id.split(";").sort().join(",") + sample_id: it.scpca_sample_id.split(";").sort().join(","), + seq_unit: it.seq_unit, + technology: it.technology ]} // get all projects that contain at least one library with CITEseq @@ -162,13 +162,23 @@ workflow { .collect{it.project_id} .map{it.unique()} + oversized_projects = libraries_ch + .map{[ + it.project_id, // pull out project id for grouping + it + ]} + .groupTuple(by: 0) // group by project id + .filter{it[1].size() > params.max_merge_libraries} // get projects with more samples than max merge + .collect{it[0]} // get project id + filtered_libraries_ch = libraries_ch // only include single-cell/single-nuclei which ensures we don't try to merge libraries from spatial or bulk data .filter{it.seq_unit in ['cell', 'nucleus']} - // remove any multiplexed projects + // remove any multiplexed projects or oversized projects // future todo: only filter library ids that are multiplexed, but keep all other non-multiplexed libraries .branch{ multiplexed: it.project_id in multiplex_projects.getVal() + oversized: it.project_id in oversized_projects.getVal() single_sample: true } @@ -178,6 +188,12 @@ workflow { log.warn("Not merging ${it.project_id} because it contains multiplexed libraries.") } + filtered_libraries_ch.oversized + .unique{ it.project_id } + .subscribe{ + log.warn("Not merging ${it.project_id} because it contains too many libraries.") + } + // print out warning message for any libraries not included in merging filtered_libraries_ch.single_sample .map{[ diff --git a/modules/export-anndata.nf b/modules/export-anndata.nf index de42cc62..89d1aa5d 100644 --- a/modules/export-anndata.nf +++ b/modules/export-anndata.nf @@ -12,21 +12,23 @@ process export_anndata { script: rna_h5ad_file = "${meta.library_id}_${file_type}_rna.h5ad" feature_h5ad_file = "${meta.library_id}_${file_type}_${meta.feature_type}.h5ad" + pca_meta_file = "${meta.library_id}_${file_type}_pca.tsv" feature_present = meta.feature_type in ["adt"] """ sce_to_anndata.R \ --input_sce_file ${sce_file} \ --output_rna_h5 ${rna_h5ad_file} \ --output_feature_h5 ${feature_h5ad_file} \ + --output_pca_tsv ${pca_meta_file} \ ${feature_present ? "--feature_name ${meta.feature_type}" : ''} \ ${file_type != "processed" ? "--compress_output" : ''} - # move any normalized counts to X in AnnData + # move any normalized counts to X in AnnData, convert matrices, and add PCA metadata if [ "${file_type}" = "processed" ]; then - move_counts_anndata.py --anndata_file ${rna_h5ad_file} + reformat_anndata.py --anndata_file ${rna_h5ad_file} --pca_meta_file ${pca_meta_file} # move counts in feature data, if the file exists if [ -f "${feature_h5ad_file}" ]; then - move_counts_anndata.py --anndata_file ${feature_h5ad_file} + reformat_anndata.py --anndata_file ${feature_h5ad_file} --hvg_name "none" fi fi diff --git a/nextflow.config b/nextflow.config index dcac34ed..3a6cf39b 100644 --- a/nextflow.config +++ b/nextflow.config @@ -5,7 +5,7 @@ manifest { homePage = 'https://github.com/AlexsLemonade/scpca-nf' mainScript = 'main.nf' defaultBranch = 'main' - version = 'v0.8.2' + version = 'v0.8.3' } // global parameters for workflows @@ -61,7 +61,7 @@ params { // Merge workflow-specfic options params.reuse_merge = false // if later steps fail, you can use `--reuse_merge` reuse the merged RDS object during a rerun - + params.max_merge_libraries = 100 // maximum number of libraries that can be merged // Docker container images includeConfig 'config/containers.config'