From 0c9bdf329087f0751fc4d064f36e788928e514d9 Mon Sep 17 00:00:00 2001
From: Joshua Shapiro <josh.shapiro@ccdatalab.org>
Date: Fri, 26 Jul 2024 14:08:42 -0400
Subject: [PATCH 01/15] convert obsm to ndarrays

---
 bin/move_counts_anndata.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/bin/move_counts_anndata.py b/bin/move_counts_anndata.py
index 75e0412d..9db7926a 100755
--- a/bin/move_counts_anndata.py
+++ b/bin/move_counts_anndata.py
@@ -3,12 +3,15 @@
 # This script takes an AnnData object and checks for the `logcounts`
 # in layers. If present, `logcounts` is moved to `X` and `X` (which has the raw counts)
 # is moved to `raw.X`
+# In addition, any DataFrames in `obsm` are conerted to ndarrays
+
 
 import argparse
 import os
 import re
 
 import anndata as adata
+import pandas as pd
 
 parser = argparse.ArgumentParser()
 parser.add_argument(
@@ -51,5 +54,10 @@
     object.uns["X_name"] = "logcounts"
     del object.layers["logcounts"]
 
-    # export object
-    object.write_h5ad(args.anndata_file, compression="gzip" if args.compress else None)
+# convert DataFrames in obsm to ndarrays
+for key, value in object.obsm.items():
+    if isinstance(value, pd.DataFrame):
+        object.obsm[key] = value.to_numpy()
+
+# export object
+object.write_h5ad(args.anndata_file, compression="gzip" if args.compress else None)

From f31126ffb2e21470f6149a926aee5ae9a084f514 Mon Sep 17 00:00:00 2001
From: Joshua Shapiro <josh.shapiro@ccdatalab.org>
Date: Fri, 26 Jul 2024 14:09:08 -0400
Subject: [PATCH 02/15] use lower case for obsm

---
 bin/sce_to_anndata.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bin/sce_to_anndata.R b/bin/sce_to_anndata.R
index 8c7b3216..6236556c 100755
--- a/bin/sce_to_anndata.R
+++ b/bin/sce_to_anndata.R
@@ -120,8 +120,8 @@ format_czi <- function(sce) {
   # so everything gets set to false
   rowData(sce)$feature_is_filtered <- FALSE
 
-  # paste X to any present reduced dim names
-  reducedDimNames(sce) <- glue::glue("X_{reducedDimNames(sce)}")
+  # paste X to any present reduced dim names, converting to lower case
+  reducedDimNames(sce) <- glue::glue("X_{tolower(reducedDimNames(sce))}")
 
   return(sce)
 }

From 8b83978037587cb642b6d0782b5bfa11d472f4ca Mon Sep 17 00:00:00 2001
From: Joshua Shapiro <josh.shapiro@ccdatalab.org>
Date: Fri, 26 Jul 2024 14:34:41 -0400
Subject: [PATCH 03/15] add PCA variance output

---
 bin/sce_to_anndata.R | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/bin/sce_to_anndata.R b/bin/sce_to_anndata.R
index 6236556c..80af7100 100755
--- a/bin/sce_to_anndata.R
+++ b/bin/sce_to_anndata.R
@@ -23,6 +23,12 @@ option_list <- list(
     type = "character",
     help = "path to output hdf5 file to store RNA counts as AnnData object. Must end in .hdf5, .h5ad, or .h5"
   ),
+  make_option(
+    opt_str = c("--output_pca_tsv"),
+    default = NULL,
+    type = "character",
+    help = "path to output a table of variance explained by each principal component. Must end in .tsv"
+  ),
   make_option(
     opt_str = c("--feature_name"),
     type = "character",
@@ -62,6 +68,11 @@ if (!(stringr::str_ends(opt$output_rna_h5, ".hdf5|.h5|.h5ad"))) {
   stop("output rna file name must end in .hdf5, .h5, or .h5ad")
 }
 
+# check that the pca file is a tsv
+if (!is.null(opt$output_pca_tsv) && !stringr::str_ends(opt$output_pca_tsv, ".tsv")) {
+  stop("output pca file name must end in .tsv")
+}
+
 # Merged object function  ------------------------------------------------------
 
 # this function updates merged object formatting for anndata export
@@ -161,6 +172,18 @@ scpcaTools::sce_to_anndata(
   compression = ifelse(opt$compress_output, "gzip", "none")
 )
 
+# Get PCA metadata for AnnData
+if (!is.null(opt$output_pca_tsv) && "X_pca" %in% reducedDimNames(sce)) {
+  pca_meta_df <- data.frame(
+    PC = 1:ncol(reducedDims(sce)$X_pca),
+    variance = attr(reducedDims(sce)$X_pca, "varExplained"),
+    variance_ratio = attr(reducedDims(sce)$X_pca, "percentVar") / 100
+  )
+
+  # write pca to tsv
+  readr::write_tsv(pca_meta_df, opt$output_pca_tsv)
+}
+
 message("Exported RNA")
 
 # AltExp to AnnData -----------------------------------------------------------

From 83045190dfc1687a297800e87a06478076a1cf98 Mon Sep 17 00:00:00 2001
From: Joshua Shapiro <josh.shapiro@ccdatalab.org>
Date: Fri, 26 Jul 2024 14:35:34 -0400
Subject: [PATCH 04/15] add variable gene conversion and PCA metadata

---
 bin/move_counts_anndata.py | 73 ++++++++++++++++++++++++++++----------
 1 file changed, 55 insertions(+), 18 deletions(-)

diff --git a/bin/move_counts_anndata.py b/bin/move_counts_anndata.py
index 9db7926a..d4cd975c 100755
--- a/bin/move_counts_anndata.py
+++ b/bin/move_counts_anndata.py
@@ -1,16 +1,17 @@
 #!/usr/bin/env python3
+"""
+This script takes an AnnData object and checks for the `logcounts` in layers.
+If present, `logcounts` is moved to `X` and `X` (which has the raw counts) is moved to `raw.X`
 
-# This script takes an AnnData object and checks for the `logcounts`
-# in layers. If present, `logcounts` is moved to `X` and `X` (which has the raw counts)
-# is moved to `raw.X`
-# In addition, any DataFrames in `obsm` are conerted to ndarrays
-
+In addition, any DataFrames in `obsm` are conerted to ndarrays, highly variable genes are converted to a `var` column.
+If a `pca_meta_file` is provided, PCA variance statistics and standard creation values are in the format expected by scanpy.
+"""
 
 import argparse
 import os
 import re
 
-import anndata as adata
+import anndata
 import pandas as pd
 
 parser = argparse.ArgumentParser()
@@ -28,6 +29,12 @@
     action="store_false",
     help="Output an uncompressed HDF5 file",
 )
+parser.add_argument(
+    "--pca_meta_file",
+    dest="pca_meta_file",
+    required=False,
+    help="Path to file with a table of variance explained by each PCA component",
+)
 
 args = parser.parse_args()
 
@@ -39,25 +46,55 @@
     raise FileExistsError("`input_anndata` does not exist.")
 elif not file_ext.search(args.anndata_file):
     raise ValueError(
-        "--input_anndata must end in either .hdf5, .h5, or .h5ad, and contain a processed AnnData object."
+        "input_anndata must end in either .hdf5, .h5, or .h5ad, and contain a processed AnnData adata."
     )
 
+# if there is a pca_meta_file, check that it exists and has a tsv extension
+if args.pca_meta_file:
+    if not os.path.exists(args.pca_meta_file):
+        raise FileExistsError("`pca_meta_file` does not exist.")
+    elif not args.pca_meta_file.endswith(".tsv"):
+        raise ValueError("`pca_meta_file` must end in .tsv.")
+
 # read in anndata
-object = adata.read_h5ad(args.anndata_file)
+adata = anndata.read_h5ad(args.anndata_file)
 
 # if logcounts is present
-if "logcounts" in object.layers:
-    # move X to raw.X by creating the raw object
-    object.raw = object
+if "logcounts" in adata.layers:
+    # move X to raw.X by creating the raw adata
+    adata.raw = adata
     # move logcounts to X and rename
-    object.X = object.layers["logcounts"]
-    object.uns["X_name"] = "logcounts"
-    del object.layers["logcounts"]
+    adata.X = adata.layers["logcounts"]
+    adata.uns["X_name"] = "logcounts"
+    del adata.layers["logcounts"]
 
 # convert DataFrames in obsm to ndarrays
-for key, value in object.obsm.items():
+for key, value in adata.obsm.items():
     if isinstance(value, pd.DataFrame):
-        object.obsm[key] = value.to_numpy()
+        adata.obsm[key] = value.to_numpy()
+
+# convert highly variable genes to a column
+adata.var["highly_variable"] = adata.var.gene_ids.isin(
+    adata.uns["highly_variable_genes"]
+)
+
+# add pca adata to uns if pca_meta_file is provided in the format created by scanpy
+if args.pca_meta_file:
+    pca_meta = pd.read_csv(args.pca_meta_file, sep="\t", index_col=0)
+    if "variance" not in pca_meta.columns or "variance_ratio" not in pca_meta.columns:
+        raise ValueError(
+            "`pca_meta_file` must contain columns `variance` and `variance_ratio`"
+        )
+    pca_object = {
+        "param": {
+            "zero_center": True,
+            "use_highly_variable": True,
+            "mask_var": "highly_variable",
+        },
+        "variance": pca_meta["variance"].to_numpy(),
+        "variance_ratio": pca_meta["variance_ratio"].to_numpy(),
+    }
+    adata.uns["pca"] = pca_object
 
-# export object
-object.write_h5ad(args.anndata_file, compression="gzip" if args.compress else None)
+# export adata
+adata.write_h5ad(args.anndata_file, compression="gzip" if args.compress else None)

From e7d4758543b605a66a8fb941bff6abed21c488de Mon Sep 17 00:00:00 2001
From: Joshua Shapiro <josh.shapiro@ccdatalab.org>
Date: Fri, 26 Jul 2024 14:46:48 -0400
Subject: [PATCH 05/15] change script name

---
 bin/{move_counts_anndata.py => reformat_anndata.py} | 12 ++++++------
 merge.nf                                            |  4 ++--
 modules/export-anndata.nf                           |  8 +++++---
 3 files changed, 13 insertions(+), 11 deletions(-)
 rename bin/{move_counts_anndata.py => reformat_anndata.py} (100%)

diff --git a/bin/move_counts_anndata.py b/bin/reformat_anndata.py
similarity index 100%
rename from bin/move_counts_anndata.py
rename to bin/reformat_anndata.py
index d4cd975c..ea1d0d26 100755
--- a/bin/move_counts_anndata.py
+++ b/bin/reformat_anndata.py
@@ -22,6 +22,12 @@
     required=True,
     help="Path to HDF5 file with processed AnnData object",
 )
+parser.add_argument(
+    "--pca_meta_file",
+    dest="pca_meta_file",
+    required=False,
+    help="Path to file with a table of variance explained by each PCA component",
+)
 parser.add_argument(
     "-u",
     "--uncompressed",
@@ -29,12 +35,6 @@
     action="store_false",
     help="Output an uncompressed HDF5 file",
 )
-parser.add_argument(
-    "--pca_meta_file",
-    dest="pca_meta_file",
-    required=False,
-    help="Path to file with a table of variance explained by each PCA component",
-)
 
 args = parser.parse_args()
 
diff --git a/merge.nf b/merge.nf
index 19e47125..d72ed3ef 100644
--- a/merge.nf
+++ b/merge.nf
@@ -109,8 +109,8 @@ process export_anndata {
         ${has_adt ? "--feature_name adt" : ''}
 
       # move normalized counts to X in AnnData
-      move_counts_anndata.py --anndata_file ${rna_h5ad_file}
-      ${has_adt ? "move_counts_anndata.py --anndata_file ${feature_h5ad_file}" : ''}
+      reformat_anndata.py --anndata_file ${rna_h5ad_file}
+      ${has_adt ? "reformat_anndata.py --anndata_file ${feature_h5ad_file}" : ''}
       """
     stub:
       rna_h5ad_file = "${merge_group_id}_merged_rna.h5ad"
diff --git a/modules/export-anndata.nf b/modules/export-anndata.nf
index de42cc62..bb83209d 100644
--- a/modules/export-anndata.nf
+++ b/modules/export-anndata.nf
@@ -12,21 +12,23 @@ process export_anndata {
     script:
       rna_h5ad_file = "${meta.library_id}_${file_type}_rna.h5ad"
       feature_h5ad_file = "${meta.library_id}_${file_type}_${meta.feature_type}.h5ad"
+      pca_meta_file = "${meta.library_id}_${file_type}_pca.tsv"
       feature_present = meta.feature_type in ["adt"]
       """
       sce_to_anndata.R \
         --input_sce_file ${sce_file} \
         --output_rna_h5 ${rna_h5ad_file} \
         --output_feature_h5 ${feature_h5ad_file} \
+        --output_pca_tsv ${pca_meta_file} \
         ${feature_present ? "--feature_name ${meta.feature_type}" : ''} \
         ${file_type != "processed" ? "--compress_output" : ''}
 
-      # move any normalized counts to X in AnnData
+      # move any normalized counts to X in AnnData, convert matrices, and add PCA metadata
       if [ "${file_type}" = "processed" ]; then
-        move_counts_anndata.py --anndata_file ${rna_h5ad_file}
+        reformat_anndata.py --anndata_file ${rna_h5ad_file} --pca_meta_file ${pca_meta_file}
         # move counts in feature data, if the file exists
         if [ -f "${feature_h5ad_file}" ]; then
-          move_counts_anndata.py --anndata_file ${feature_h5ad_file}
+          reformat_anndata.py --anndata_file ${feature_h5ad_file}
         fi
       fi
 

From 588475ba460019db3d8aee4011a85d39a6c2d28b Mon Sep 17 00:00:00 2001
From: Joshua Shapiro <josh.shapiro@ccdatalab.org>
Date: Mon, 29 Jul 2024 09:39:10 -0400
Subject: [PATCH 06/15] make tsv case-insensitive

---
 bin/predict_cellassign.py | 4 ++--
 bin/reformat_anndata.py   | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/bin/predict_cellassign.py b/bin/predict_cellassign.py
index 22fc6d32..eb9bf431 100755
--- a/bin/predict_cellassign.py
+++ b/bin/predict_cellassign.py
@@ -63,7 +63,7 @@
 scvi.settings.num_threads = args.threads
 
 # compile extension regex
-file_ext = re.compile(r"\.h5ad$|\.hdf5$|.h5$", re.IGNORECASE)
+file_ext = re.compile(r"\.(hdf5|h5|h5ad)$", re.IGNORECASE)
 
 # check that input file exists, if it does exist, make sure it's an h5 file
 if not os.path.exists(args.anndata_file):
@@ -78,7 +78,7 @@
     raise FileExistsError("--reference file not found.")
 
 # make sure output file path is tsv file
-if not args.output_predictions.endswith(".tsv"):
+if not args.output_predictions.casefold().endswith(".tsv"):
     raise ValueError("--output_predictions must provide a file path ending in tsv")
 
 # read in references as marker gene tables
diff --git a/bin/reformat_anndata.py b/bin/reformat_anndata.py
index ea1d0d26..f6d09b50 100755
--- a/bin/reformat_anndata.py
+++ b/bin/reformat_anndata.py
@@ -39,7 +39,7 @@
 args = parser.parse_args()
 
 # compile extension regex
-file_ext = re.compile(r"\.hdf5$|\.h5$|\.h5ad$", re.IGNORECASE)
+file_ext = re.compile(r"\.(hdf5|h5|h5ad)$", re.IGNORECASE)
 
 # check that input file exists, if it does exist, make sure it's an h5 file
 if not os.path.exists(args.anndata_file):
@@ -53,7 +53,7 @@
 if args.pca_meta_file:
     if not os.path.exists(args.pca_meta_file):
         raise FileExistsError("`pca_meta_file` does not exist.")
-    elif not args.pca_meta_file.endswith(".tsv"):
+    elif not args.pca_meta_file.casefold().endswith(".tsv"):
         raise ValueError("`pca_meta_file` must end in .tsv.")
 
 # read in anndata

From 09904b546859e6b9bddf4b797dec465c000e1623 Mon Sep 17 00:00:00 2001
From: Joshua Shapiro <josh.shapiro@ccdatalab.org>
Date: Mon, 29 Jul 2024 10:53:55 -0400
Subject: [PATCH 07/15] add args for pca values

---
 bin/reformat_anndata.py | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/bin/reformat_anndata.py b/bin/reformat_anndata.py
index f6d09b50..435bb573 100755
--- a/bin/reformat_anndata.py
+++ b/bin/reformat_anndata.py
@@ -28,6 +28,18 @@
     required=False,
     help="Path to file with a table of variance explained by each PCA component",
 )
+parser.add_argument(
+    "--pca_not_centered",
+    dest="pca_centered",
+    action="store_false",
+    help="Indicate that the PCA table is not zero-centered",
+)
+parser.add_argument(
+    "--mask_var",
+    dest="mask_var",
+    default="highly_variable",
+    help="Indicate that variable used for highly variable genes. Use the value 'None' if no highly variable genes were used.",
+)
 parser.add_argument(
     "-u",
     "--uncompressed",
@@ -87,9 +99,9 @@
         )
     pca_object = {
         "param": {
-            "zero_center": True,
-            "use_highly_variable": True,
-            "mask_var": "highly_variable",
+            "zero_center": args.pca_centered,
+            "use_highly_variable": args.highly_variable.casefold() != "none",
+            "mask_var": args.highly_variable,
         },
         "variance": pca_meta["variance"].to_numpy(),
         "variance_ratio": pca_meta["variance_ratio"].to_numpy(),

From 8dc9701e3e6b4b7dfa7cdecdfc917e00d5a43e97 Mon Sep 17 00:00:00 2001
From: Joshua Shapiro <josh.shapiro@ccdatalab.org>
Date: Mon, 29 Jul 2024 16:51:56 -0400
Subject: [PATCH 08/15] Apply suggestions from code review

Co-authored-by: Ally Hawkins <54039191+allyhawkins@users.noreply.github.com>
---
 bin/reformat_anndata.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/bin/reformat_anndata.py b/bin/reformat_anndata.py
index 435bb573..dba521c6 100755
--- a/bin/reformat_anndata.py
+++ b/bin/reformat_anndata.py
@@ -38,7 +38,7 @@
     "--mask_var",
     dest="mask_var",
     default="highly_variable",
-    help="Indicate that variable used for highly variable genes. Use the value 'None' if no highly variable genes were used.",
+    help="Indicate the name used to store highly variable genes associated with the PCA present in the exported AnnData object. Use the value 'None' if no highly variable genes were used.",
 )
 parser.add_argument(
     "-u",
@@ -100,8 +100,8 @@
     pca_object = {
         "param": {
             "zero_center": args.pca_centered,
-            "use_highly_variable": args.highly_variable.casefold() != "none",
-            "mask_var": args.highly_variable,
+            "use_highly_variable": args.mask_var.casefold() != "none",
+            "mask_var": args.mask_var,
         },
         "variance": pca_meta["variance"].to_numpy(),
         "variance_ratio": pca_meta["variance_ratio"].to_numpy(),

From 734ee95a86292ce0bd65c10ce874ad3dfc0462f0 Mon Sep 17 00:00:00 2001
From: Joshua Shapiro <josh.shapiro@ccdatalab.org>
Date: Mon, 29 Jul 2024 17:01:09 -0400
Subject: [PATCH 09/15] Only do hvg copying if hvg is specified

---
 bin/reformat_anndata.py | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/bin/reformat_anndata.py b/bin/reformat_anndata.py
index dba521c6..b6348b64 100755
--- a/bin/reformat_anndata.py
+++ b/bin/reformat_anndata.py
@@ -35,10 +35,13 @@
     help="Indicate that the PCA table is not zero-centered",
 )
 parser.add_argument(
-    "--mask_var",
-    dest="mask_var",
-    default="highly_variable",
-    help="Indicate the name used to store highly variable genes associated with the PCA present in the exported AnnData object. Use the value 'None' if no highly variable genes were used.",
+    "--hvg_name",
+    dest="hvg_name",
+    default="highly_variable_genes",
+    help=(
+        "Indicate the name used to store highly variable genes associated with the PCA in the exported AnnData object."
+        " Use the value 'none' if no highly variable genes were not used."
+    ),
 )
 parser.add_argument(
     "-u",
@@ -85,10 +88,13 @@
     if isinstance(value, pd.DataFrame):
         adata.obsm[key] = value.to_numpy()
 
-# convert highly variable genes to a column
-adata.var["highly_variable"] = adata.var.gene_ids.isin(
-    adata.uns["highly_variable_genes"]
-)
+# convert highly variable genes to a column if given
+use_hvg = args.hvg_name.casefold() != "none"
+if use_hvg:
+    if args.hvg_name not in adata.uns.keys():
+        raise ValueError("`hvg_name` must be present in the `uns` data for the object")
+    adata.var["highly_variable"] = adata.var.gene_ids.isin(adata.uns[args.hvg_name])
+
 
 # add pca adata to uns if pca_meta_file is provided in the format created by scanpy
 if args.pca_meta_file:
@@ -100,8 +106,8 @@
     pca_object = {
         "param": {
             "zero_center": args.pca_centered,
-            "use_highly_variable": args.mask_var.casefold() != "none",
-            "mask_var": args.mask_var,
+            "use_highly_variable": use_hvg,
+            "mask_var": ("highly_variable" if use_hvg else None),
         },
         "variance": pca_meta["variance"].to_numpy(),
         "variance_ratio": pca_meta["variance_ratio"].to_numpy(),

From 6a4a59f8e3b9f6f25af8bf1bcc3cee22734c8fc6 Mon Sep 17 00:00:00 2001
From: Joshua Shapiro <josh.shapiro@ccdatalab.org>
Date: Mon, 29 Jul 2024 17:05:33 -0400
Subject: [PATCH 10/15] lower case the merged too

---
 bin/sce_to_anndata.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/sce_to_anndata.R b/bin/sce_to_anndata.R
index 80af7100..db2784cb 100755
--- a/bin/sce_to_anndata.R
+++ b/bin/sce_to_anndata.R
@@ -78,7 +78,7 @@ if (!is.null(opt$output_pca_tsv) && !stringr::str_ends(opt$output_pca_tsv, ".tsv
 # this function updates merged object formatting for anndata export
 format_merged_sce <- function(sce) {
   # paste X to any present reduced dim names
-  reducedDimNames(sce) <- glue::glue("X_{reducedDimNames(sce)}")
+  reducedDimNames(sce) <- glue::glue("X_{tolower(reducedDimNames(sce))}")
   return(sce)
 }
 

From 12224b2b8f7aefd2ea1726f666487980dc1919a1 Mon Sep 17 00:00:00 2001
From: Ally Hawkins <ally.hawkins@ccdatalab.org>
Date: Thu, 1 Aug 2024 14:26:38 -0500
Subject: [PATCH 11/15] add param for max merge

---
 merge.nf        | 24 ++++++++++++++++++++----
 nextflow.config |  2 +-
 2 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/merge.nf b/merge.nf
index d72ed3ef..bedb77ed 100644
--- a/merge.nf
+++ b/merge.nf
@@ -144,11 +144,11 @@ workflow {
         || (it.scpca_sample_id in run_ids)
       }
       .map{[
-        seq_unit: it.seq_unit,
-        technology: it.technology,
         project_id: it.scpca_project_id,
         library_id: it.scpca_library_id,
-        sample_id: it.scpca_sample_id.split(";").sort().join(",")
+        sample_id: it.scpca_sample_id.split(";").sort().join(","),
+        seq_unit: it.seq_unit,
+        technology: it.technology
       ]}
 
     // get all projects that contain at least one library with CITEseq
@@ -162,13 +162,23 @@ workflow {
       .collect{it.project_id}
       .map{it.unique()}
 
+    oversized_projects = libraries_ch
+      .map{[
+        it.project_id, // pull out project id for grouping
+        it
+      ]}
+      .groupTuple(by: 0) // group by project id
+      .filter{it[1].size() > params.max_merge_libraries} // get projects with more samples than max merge
+      .collect{it[0]} // get project id
+
     filtered_libraries_ch = libraries_ch
       // only include single-cell/single-nuclei which ensures we don't try to merge libraries from spatial or bulk data
       .filter{it.seq_unit in ['cell', 'nucleus']}
-      // remove any multiplexed projects
+      // remove any multiplexed projects or oversized projects
       // future todo: only filter library ids that are multiplexed, but keep all other non-multiplexed libraries
       .branch{
         multiplexed: it.project_id in multiplex_projects.getVal()
+        oversized: it.project_id in oversized_projects.getVal()
         single_sample: true
       }
 
@@ -178,6 +188,12 @@ workflow {
         log.warn("Not merging ${it.project_id} because it contains multiplexed libraries.")
       }
 
+    filtered_libraries_ch.oversized
+      .unique{ it.project_id }
+      .subscribe{
+        log.warn("Not merging ${it.project_id} because it contains too many libraries.")
+      }
+
     // print out warning message for any libraries not included in merging
     filtered_libraries_ch.single_sample
       .map{[
diff --git a/nextflow.config b/nextflow.config
index dcac34ed..b6c4afde 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -61,7 +61,7 @@ params {
 
   // Merge workflow-specfic options
   params.reuse_merge = false // if later steps fail, you can use `--reuse_merge` reuse the merged RDS object during a rerun
-
+  params.max_merge_libraries = 100 // maximum number of libraries to merge
 
   // Docker container images
   includeConfig 'config/containers.config'

From 6ceb4419956d81956cf1903befdd3d960298056d Mon Sep 17 00:00:00 2001
From: Ally Hawkins <54039191+allyhawkins@users.noreply.github.com>
Date: Thu, 1 Aug 2024 15:08:08 -0500
Subject: [PATCH 12/15] Update parameter wording

Co-authored-by: Stephanie Spielman <stephanie.spielman@gmail.com>
---
 nextflow.config | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nextflow.config b/nextflow.config
index b6c4afde..a0aad836 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -61,7 +61,7 @@ params {
 
   // Merge workflow-specfic options
   params.reuse_merge = false // if later steps fail, you can use `--reuse_merge` reuse the merged RDS object during a rerun
-  params.max_merge_libraries = 100 // maximum number of libraries to merge
+  params.max_merge_libraries = 100 // maximum number of libraries that can be merged
 
   // Docker container images
   includeConfig 'config/containers.config'

From 89df240646f1582c989d175bee24fd2ec7f51ba3 Mon Sep 17 00:00:00 2001
From: Ally Hawkins <ally.hawkins@ccdatalab.org>
Date: Fri, 2 Aug 2024 13:46:33 -0500
Subject: [PATCH 13/15] no hvgs for alt exps

---
 bin/reformat_anndata.py   | 2 +-
 bin/sce_to_anndata.R      | 1 +
 modules/export-anndata.nf | 2 +-
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/bin/reformat_anndata.py b/bin/reformat_anndata.py
index b6348b64..0354a6cc 100755
--- a/bin/reformat_anndata.py
+++ b/bin/reformat_anndata.py
@@ -40,7 +40,7 @@
     default="highly_variable_genes",
     help=(
         "Indicate the name used to store highly variable genes associated with the PCA in the exported AnnData object."
-        " Use the value 'none' if no highly variable genes were not used."
+        " Use the value 'none' if no highly variable genes were used."
     ),
 )
 parser.add_argument(
diff --git a/bin/sce_to_anndata.R b/bin/sce_to_anndata.R
index db2784cb..52b3d058 100755
--- a/bin/sce_to_anndata.R
+++ b/bin/sce_to_anndata.R
@@ -232,5 +232,6 @@ if (!is.null(opt$feature_name)) {
         ")
       )
     }
+    message("Exported alt")
   }
 }
diff --git a/modules/export-anndata.nf b/modules/export-anndata.nf
index bb83209d..89d1aa5d 100644
--- a/modules/export-anndata.nf
+++ b/modules/export-anndata.nf
@@ -28,7 +28,7 @@ process export_anndata {
         reformat_anndata.py --anndata_file ${rna_h5ad_file} --pca_meta_file ${pca_meta_file}
         # move counts in feature data, if the file exists
         if [ -f "${feature_h5ad_file}" ]; then
-          reformat_anndata.py --anndata_file ${feature_h5ad_file}
+          reformat_anndata.py --anndata_file ${feature_h5ad_file} --hvg_name "none"
         fi
       fi
 

From 5de0a78f6e182206d993de2a2cd25bf472e0dfe0 Mon Sep 17 00:00:00 2001
From: Ally Hawkins <ally.hawkins@ccdatalab.org>
Date: Fri, 2 Aug 2024 13:47:55 -0500
Subject: [PATCH 14/15] update versions

---
 external-instructions.md | 6 +++---
 internal-instructions.md | 2 +-
 nextflow.config          | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/external-instructions.md b/external-instructions.md
index 75bb478d..b017ff0e 100644
--- a/external-instructions.md
+++ b/external-instructions.md
@@ -86,12 +86,12 @@ Using the above command will run the workflow from the `main` branch of the work
 To update to the latest released version you can run `nextflow pull AlexsLemonade/scpca-nf` before the `nextflow run` command.
 
 To be sure that you are using a consistent version, you can specify use of a release tagged version of the workflow, set below with the `-r` flag.
-The command below will pull the `scpca-nf` workflow directly from Github using the `v0.8.2` version.
+The command below will pull the `scpca-nf` workflow directly from Github using the `v0.8.3` version.
 Released versions can be found on the [`scpca-nf` repository releases page](https://github.com/AlexsLemonade/scpca-nf/releases).
 
 ```sh
 nextflow run AlexsLemonade/scpca-nf \
-  -r v0.8.2 \
+  -r v0.8.3 \
   -config <path to config file>  \
   -profile <name of profile>
 ```
@@ -325,7 +325,7 @@ If you will be analyzing spatial expression data, you will also need the Cell Ra
 
 If your compute nodes do not have internet access, you will likely have to pre-pull the required container images as well.
 When doing this, it is important to be sure that you also specify the revision (version tag) of the `scpca-nf` workflow that you are using.
-For example, if you would run `nextflow run AlexsLemonade/scpca-nf -r v0.8.2`, then you will want to set `-r v0.8.2` for `get_refs.py` as well to be sure you have the correct containers.
+For example, if you would run `nextflow run AlexsLemonade/scpca-nf -r v0.8.3`, then you will want to set `-r v0.8.3` for `get_refs.py` as well to be sure you have the correct containers.
 By default, `get_refs.py` will download files and images associated with the latest release.
 
 If your system uses Docker, you can add the `--docker` flag:
diff --git a/internal-instructions.md b/internal-instructions.md
index e4d049ba..8b757fcb 100644
--- a/internal-instructions.md
+++ b/internal-instructions.md
@@ -87,7 +87,7 @@ Please refer to our [`CONTRIBUTING.md`](CONTRIBUTING.md#stub-workflows) for more
 When running the workflow for a project or group of samples that is ready to be released on ScPCA portal, please use the tag for the latest release:
 
 ```
-nextflow run AlexsLemonade/scpca-nf -r v0.8.2 -profile ccdl,batch --project SCPCP000000
+nextflow run AlexsLemonade/scpca-nf -r v0.8.3 -profile ccdl,batch --project SCPCP000000
 ```
 
 ### Processing example data
diff --git a/nextflow.config b/nextflow.config
index a0aad836..3a6cf39b 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -5,7 +5,7 @@ manifest {
   homePage = 'https://github.com/AlexsLemonade/scpca-nf'
   mainScript = 'main.nf'
   defaultBranch = 'main'
-  version = 'v0.8.2'
+  version = 'v0.8.3'
 }
 
 // global parameters for workflows

From a1090b9ffc191af8f6a9900686760a5895d65784 Mon Sep 17 00:00:00 2001
From: Ally Hawkins <54039191+allyhawkins@users.noreply.github.com>
Date: Fri, 2 Aug 2024 14:00:57 -0500
Subject: [PATCH 15/15] spacing

Co-authored-by: Stephanie Spielman <stephanie.spielman@gmail.com>
---
 bin/reformat_anndata.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/reformat_anndata.py b/bin/reformat_anndata.py
index 0354a6cc..fdecae1b 100755
--- a/bin/reformat_anndata.py
+++ b/bin/reformat_anndata.py
@@ -40,7 +40,7 @@
     default="highly_variable_genes",
     help=(
         "Indicate the name used to store highly variable genes associated with the PCA in the exported AnnData object."
-        " Use the value 'none' if no highly variable genes were used."
+        "Use the value 'none' if no highly variable genes were used."
     ),
 )
 parser.add_argument(