Merge pull request #712 from AlexsLemonade/jashapiro/compress-simplify

Reduce SCE file sizes
AlexsLemonade · Mar 5, 2024 · 4246991
2 parents 579f246 + 2ce8f5e
commit 4246991
Show file tree

Hide file tree

Showing 10 changed files with 19 additions and 14 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -32,7 +32,7 @@ repos:
         args: [--update-only, --title=**Table of Contents**]
   - repo: https://github.com/astral-sh/ruff-pre-commit
     # Ruff for linting and formatting python
-    rev: v0.2.1
+    rev: v0.3.0
     hooks:
       # Run the linter.
       - id: ruff

diff --git a/bin/add_celltypes_to_sce.R b/bin/add_celltypes_to_sce.R
@@ -247,4 +247,4 @@ if (!is.null(opt$cellassign_predictions)) {
 }
 
 # export annotated object with cellassign assignments
-readr::write_rds(sce, opt$output_sce_file, compress = "gz")
+readr::write_rds(sce, opt$output_sce_file, compress = "bz2")
diff --git a/bin/add_demux_sce.R b/bin/add_demux_sce.R
@@ -125,4 +125,4 @@ if (length(cellhash_ids) > 1) {
 }
 
 # write filtered sce to output
-readr::write_rds(sce, opt$output_sce_file, compress = "gz")
+readr::write_rds(sce, opt$output_sce_file, compress = "bz2")
diff --git a/bin/add_submitter_annotations.R b/bin/add_submitter_annotations.R
@@ -107,4 +107,4 @@ colData(sce) <- DataFrame(
 metadata(sce)$celltype_methods <- c(metadata(sce)$celltype_methods, "submitter")
 
 # Write SCE back to file
-readr::write_rds(sce, opt$sce_file, compress = "gz")
+readr::write_rds(sce, opt$sce_file, compress = "bz2")
diff --git a/bin/classify_SingleR.R b/bin/classify_SingleR.R
@@ -106,5 +106,5 @@ metadata(singler_results)$cell_ontology_df <- singler_model$cell_ontology_df
 readr::write_rds(
   singler_results,
   opt$output_singler_results_file,
-  compress = "gz"
+  compress = "bz2"
 )
diff --git a/bin/cluster_sce.R b/bin/cluster_sce.R
@@ -101,4 +101,4 @@ if (!opt$pca_name %in% reducedDimNames(sce)) {
 }
 
 # export -------------------
-readr::write_rds(sce, opt$output_sce_file, compress = "gz")
+readr::write_rds(sce, opt$output_sce_file, compress = "bz2")
diff --git a/bin/filter_sce.R b/bin/filter_sce.R
@@ -202,4 +202,4 @@ if (!is.null(ambient_profile)) {
 
 
 # write filtered sce to output
-readr::write_rds(filtered_sce, opt$filtered_file, compress = "gz")
+readr::write_rds(filtered_sce, opt$filtered_file, compress = "bz2")
diff --git a/bin/generate_unfiltered_sce.R b/bin/generate_unfiltered_sce.R
@@ -205,4 +205,4 @@ if (length(sample_type) == 1) {
 metadata(unfiltered_sce)$sample_type <- sample_type
 
 # write to rds
-readr::write_rds(unfiltered_sce, opt$unfiltered_file, compress = "gz")
+readr::write_rds(unfiltered_sce, opt$unfiltered_file, compress = "bz2")
diff --git a/bin/merge_sces.R b/bin/merge_sces.R
@@ -9,12 +9,12 @@ option_list <- list(
   make_option(
     opt_str = c("--input_library_ids"),
     type = "character",
-    help = "Comma separated list of library IDs corresponding to the libraries being integrated."
+    help = "Comma separated list of library IDs corresponding to the libraries being merged."
   ),
   make_option(
     opt_str = c("--input_sce_files"),
     type = "character",
-    help = "Comma separated list of input sce file paths corresponding to the sces being integrated."
+    help = "Comma separated list of input sce file paths corresponding to the sces being merged."
   ),
   make_option(
     opt_str = c("-o", "--output_sce_file"),
@@ -68,7 +68,7 @@ if (is.null(opt$input_sce_files)) {
 }
 
 if (length(input_sce_files) == 1) {
-  stop("Only 1 input file provided, no merging or integration will be performed for this group")
+  stop("Only 1 input file provided, no merging will be performed for this group")
 }
 
 # use library ids to name list of input files
@@ -175,7 +175,7 @@ if ("cellassign" %in% all_celltypes) {
 
 # Update some SCE information  -------------------------------------------------
 # - Add a new colData column with any additional modalities
-# - Remove cluster parameters from metadata
+# - Remove cluster parameters and miQC model from metadata
 sce_list <- sce_list |>
   purrr::map(\(sce){
     additional_modalities <- altExpNames(sce)
@@ -187,6 +187,7 @@ sce_list <- sce_list |>
     metadata(sce)$cluster_algorithm <- NULL
     metadata(sce)$cluster_weighting <- NULL
     metadata(sce)$cluster_nn <- NULL
+    metadata(sce)$miQC_model <- NULL
 
     return(sce)
   })

diff --git a/bin/post_process_sce.R b/bin/post_process_sce.R
@@ -144,6 +144,10 @@ if (alt_exp %in% altExpNames(sce)) {
 # filter sce using criteria in scpca_filter (not adt_scpca_filter)
 processed_sce <- sce[, which(sce$scpca_filter == "Keep")]
 
+# drop miQC model from processed object
+metadata(processed_sce)$miQC_model <- NULL
+
+
 # replace existing stats with recalculated gene stats
 drop_cols <- colnames(rowData(processed_sce, alt)) %in% c("mean", "detected")
 rowData(processed_sce) <- rowData(processed_sce)[!drop_cols]
@@ -273,7 +277,7 @@ if (length(reducedDimNames(processed_sce)) == 0) {
 # Export --------------
 
 # write out  filtered SCE with additional filtering column
-readr::write_rds(sce, opt$out_filtered_sce_file, compress = "gz")
+readr::write_rds(sce, opt$out_filtered_sce_file, compress = "bz2")
 
 # write out processed SCE
-readr::write_rds(processed_sce, opt$out_processed_sce_file, compress = "gz")
+readr::write_rds(processed_sce, opt$out_processed_sce_file, compress = "bz2")