cytomining · gwaybio · Dec 22, 2022 · Dec 16, 2022 · Dec 19, 2022 · Dec 19, 2022
diff --git a/pycytominer/cyto_utils/cells.py b/pycytominer/cyto_utils/cells.py
@@ -69,6 +69,14 @@ class SingleCells(object):
         Name of the fields of view feature.
     object_feature : str, default "Metadata_ObjectNumber"
         Object number feature.
+    default_datatype_float : type, default np.float64
+        Numpy floating point datatype to use for load_compartment and resulting
+        dataframes. This parameter may be used to assist with performance-related
+        issues by reducing the memory required for floating-point data.
+        For example, using np.float32 instead of np.float64 for this parameter
+        will reduce memory consumed by float columns by roughly 50%.
+        Please note: using any datatype besides np.float64 is experimentally
+        unverified.
 
     Notes
     -----
@@ -105,6 +113,7 @@ def __init__(
         fields_of_view="all",
         fields_of_view_feature="Metadata_Site",
         object_feature="Metadata_ObjectNumber",
+        default_datatype_float=np.float64,
     ):
         """Constructor method"""
         # Check compartments specified
@@ -139,6 +148,7 @@ def __init__(
         self.compartment_linking_cols = compartment_linking_cols
         self.fields_of_view_feature = fields_of_view_feature
         self.object_feature = object_feature
+        self.default_datatype_float = default_datatype_float
 
         # Confirm that the compartments and linking cols are formatted properly
         assert_linking_cols_complete(
@@ -436,6 +446,9 @@ def split_column_categories(self, col_names):
     def load_compartment(self, compartment):
         """Creates the compartment dataframe.
 
+        Note: the pre-allocated feature array is created with the floating
+        point datatype given by the default_datatype_float attribute.
+
         Parameters
         ----------
         compartment : str
@@ -455,8 +468,10 @@ def load_compartment(self, compartment):
         meta_cols, feat_cols = self.split_column_categories(col_names)
         num_meta, num_feats = len(meta_cols), len(feat_cols)
 
-        # Use pre-allocated np.array for data
-        feats = np.empty(shape=(num_cells, num_feats), dtype=np.float64)
+        # Use pre-allocated np.array for feature data
+        feats = np.empty(
+            shape=(num_cells, num_feats), dtype=self.default_datatype_float
+        )
         # Use pre-allocated pd.DataFrame for metadata
         metas = pd.DataFrame(columns=meta_cols, index=range(num_cells))
 
@@ -748,7 +763,9 @@ def merge_single_cells(
 
                 else:
                     sc_df = sc_df.merge(
-                        self.load_compartment(compartment=right_compartment),
+                        self.load_compartment(
+                            compartment=right_compartment
+                        ),
                         left_on=self.merge_cols + [left_link_col],
                         right_on=self.merge_cols + [right_link_col],
                         suffixes=merge_suffix,

diff --git a/pycytominer/tests/test_cyto_utils/test_cells.py b/pycytominer/tests/test_cyto_utils/test_cells.py
@@ -3,6 +3,7 @@
 import random
 import tempfile
 
+import numpy as np
 import pandas as pd
 import pytest
 from pycytominer import aggregate, annotate, normalize
@@ -293,6 +294,42 @@ def test_load_compartment():
         check_dtype=False,
     )
 
+    # test load_compartment with non-default default_datatype_float
+    # create new SingleCells based on AP
+    float32_loaded_compartment_df = SingleCells(
+        sql_file=TMP_SQLITE_FILE, default_datatype_float=np.float32
+    ).load_compartment(compartment="cells")
+
+    # for uniformly handling metadata types for both dataframes
+    metadata_types = {"ObjectNumber": "int64"}
+
+    # create deep copy of CELLS_DF with manually re-typed float columns as float32
+    cells_df_for_compare = CELLS_DF.copy(deep=True).astype(
+        # cast any float type columns to float32 for expected comparison
+        {
+            colname: np.float32
+            for colname in CELLS_DF.columns
+            # float columns only (NOTE: is_float tests scalars; is_float_dtype meant?)
+            if pd.api.types.is_float(CELLS_DF[colname].dtype)
+            # check for columns which are of 'int64' type
+            # note: pd.api.types.is_integer sometimes is unable to detect int64
+            or CELLS_DF[colname].dtype == "int64"
+            # avoid recasting the metadata_types
+            and colname not in metadata_types.keys()
+        }
+        # use float32_loaded_compartment_df column order for comparison below
+    )[float32_loaded_compartment_df.columns]
+
+    # cast metadata types in the same way for comparisons
+    float32_loaded_compartment_df = float32_loaded_compartment_df.astype(metadata_types)
+    cells_df_for_compare = cells_df_for_compare.astype(metadata_types)
+
+    # perform comparison of dataframes
+    pd.testing.assert_frame_equal(
+        float32_loaded_compartment_df,
+        cells_df_for_compare,
+    )
+
 
 def test_sc_count_sql_table():
     # Iterate over initialized compartments
@@ -436,14 +473,6 @@ def test_merge_single_cells():
         traditional_norm_df.loc[:, new_compartment_cols].abs().describe(),
     )
 
-
-def test_merge_single_cells_subset():
-    sc_merged_df = AP_SUBSET.merge_single_cells()
-    assert (sc_merged_df.shape[1]) == 13
-    non_meta_cols = [x for x in sc_merged_df.columns if "Metadata" not in x]
-    assert len(non_meta_cols) == len([x for x in non_meta_cols if x in SUBSET_FEATURES])
-
-
 def test_merge_single_cells_subsample():
 
     for subsample_frac in [0.1, 0.5, 0.9]:
@@ -1050,3 +1079,12 @@ def test_load_non_canonical_image_table():
         result.sort_index(axis="columns").drop("Metadata_Site_Count", axis="columns"),
         sc_aggregated_df,
     )
+
+def test_singlecells_default_datatype():
+    """
+    Test various uses of the SingleCells class attribute
+    default_datatype_float with non-default options.
+    """
+
+
+