Adding single cell module #111

Merged
16 commits merged on Dec 16, 2020
296 changes: 15 additions & 281 deletions pycytominer/aggregate.py
@@ -1,301 +1,22 @@
"""
Aggregate single cell data based on given grouping variables.
Aggregation assumes input data is arrayed such that each well has a single perturbation.
Aggregate profiles based on given grouping variables.
"""

import numpy as np
import pandas as pd
from sqlalchemy import create_engine
from pycytominer.cyto_utils import (
output,
check_compartments,
check_aggregate_operation,
infer_cp_features,
)


class AggregateProfiles:
"""
Class to aggregate single cell morphological profiles into per-well measurements.
You can currently choose to do this by either median or mean.
"""

def __init__(
self,
sql_file,
strata=["Metadata_Plate", "Metadata_Well"],
features="infer",
operation="median",
output_file="none",
compartments=["cells", "cytoplasm", "nuclei"],
merge_cols=["TableNumber", "ImageNumber"],
load_image_data=True,
subsample_frac=1,
subsample_n="all",
subsampling_random_state="none",
):
"""
Arguments:
sql_file - string or sqlalchemy connection
strata - [default: ["Metadata_Plate", "Metadata_Well"]] list indicating the columns to groupby and aggregate
        features - [default: "infer"] list of features to aggregate, or "infer" to detect features automatically
operation - [default: "median"] a string indicating how the data is aggregated
currently only supports one of ['mean', 'median']
        output_file - [default: "none"] string; if specified, write output to this location
compartments - list of compartments to process
        merge_cols - list of columns used to merge the image and compartment tables
load_image_data - [default: True] whether or not to load the image table.
        subsample_frac - [default: 1] float (0 < subsample_frac <= 1) indicating the fraction of
                         single cells to select
subsample_n - [default: "all"] int indicating how many samples to include
subsampling_random_state - [default: "none"] the random state to init subsample
"""
# Check compartments specified
check_compartments(compartments)

# Check if correct operation is specified
operation = check_aggregate_operation(operation)

# Check that the subsample_frac is between 0 and 1
assert (
0 < subsample_frac and 1 >= subsample_frac
), "subsample_frac must be between 0 and 1"

self.sql_file = sql_file
self.strata = strata
self.features = features
self.operation = operation.lower()
self.output_file = output_file
self.compartments = compartments
self.merge_cols = merge_cols
self.subsample_frac = subsample_frac
self.subsample_n = subsample_n
self.subset_data_df = "none"
self.subsampling_random_state = subsampling_random_state
self.is_aggregated = False
self.is_subset_computed = False

if self.subsample_n != "all":
self.set_subsample_n(self.subsample_n)

# Connect to sqlite engine
self.engine = create_engine(self.sql_file)
self.conn = self.engine.connect()

        # Throw an error if both subsample_frac and subsample_n are set
self._check_subsampling()

if load_image_data:
self.load_image()

def _check_subsampling(self):
        # Check that the user didn't specify both subsample_frac and subsample_n
assert (
self.subsample_frac == 1 or self.subsample_n == "all"
), "Do not set both subsample_frac and subsample_n"

def set_output_file(self, output_file):
self.output_file = output_file

def set_subsample_frac(self, subsample_frac):
self.subsample_frac = subsample_frac
self._check_subsampling()

def set_subsample_n(self, subsample_n):
try:
self.subsample_n = int(subsample_n)
except ValueError:
raise ValueError("subsample n must be an integer or coercable")
self._check_subsampling()

def set_subsample_random_state(self, random_state):
self.subsampling_random_state = random_state

def load_image(self):
"""
Load image table from sqlite file
"""
# Extract image metadata
image_cols = "TableNumber, ImageNumber, {}".format(", ".join(self.strata))
image_query = "select {} from image".format(image_cols)
self.image_df = pd.read_sql(sql=image_query, con=self.conn)

def count_cells(self, compartment="cells", count_subset=False):
"""
Determine how many objects are present in each well (or subset of each well).

Arguments:
compartment - [default: "cells"] string indicating the compartment name to subset
count_subset - [default: False] count the number of objects in the current subset partition.
If set to True you must have set up a subset with get_subsample beforehand.
"""
check_compartments(compartment)

if count_subset:
assert self.is_aggregated, "Make sure to aggregate_profiles() first!"
assert self.is_subset_computed, "Make sure to get_subsample() first!"
count_df = (
self.subset_data_df.groupby(self.strata)["ObjectNumber"]
.count()
.reset_index()
.rename({"ObjectNumber": "cell_count"}, axis="columns")
)
else:
query_cols = "TableNumber, ImageNumber, ObjectNumber"
query = "select {} from {}".format(query_cols, compartment)
count_df = self.image_df.merge(
pd.read_sql(sql=query, con=self.conn), how="inner", on=self.merge_cols
)
count_df = (
count_df.groupby(self.strata)["ObjectNumber"]
.count()
.reset_index()
.rename({"ObjectNumber": "cell_count"}, axis="columns")
)

return count_df

def subsample_profiles(self, x):
"""
        Sample a pandas DataFrame using either subsample_frac or subsample_n
"""
if self.subsampling_random_state == "none":
random_state = np.random.randint(0, 10000, size=1)[0]
self.set_subsample_random_state(random_state)

if self.subsample_frac == 1:
return pd.DataFrame.sample(
x,
n=self.subsample_n,
replace=True,
random_state=self.subsampling_random_state,
)
else:
return pd.DataFrame.sample(
x, frac=self.subsample_frac, random_state=self.subsampling_random_state
)

def get_subsample(self, compartment="cells"):
"""
Extract subsample from sqlite file

Arguments:
compartment - [default: "cells"] string indicating the compartment to subset
"""
check_compartments(compartment)

query_cols = "TableNumber, ImageNumber, ObjectNumber"
query = "select {} from {}".format(query_cols, compartment)

# Load query and merge with image_df
query_df = self.image_df.merge(
pd.read_sql(sql=query, con=self.conn), how="inner", on=self.merge_cols
)

self.subset_data_df = (
query_df.groupby(self.strata)
.apply(lambda x: self.subsample_profiles(x))
.reset_index(drop=True)
)

self.is_subset_computed = True

def aggregate_compartment(self, compartment, compute_subsample=False):
"""
Aggregate morphological profiles

Arguments:
compartment - str indicating specific compartment to extract

Return:
        the per-strata aggregated DataFrame for the specified compartment
"""
check_compartments(compartment)

compartment_query = "select * from {}".format(compartment)

if (self.subsample_frac < 1 or self.subsample_n != "all") and compute_subsample:
self.get_subsample(compartment=compartment)

population_df = self.image_df.merge(
pd.read_sql(sql=compartment_query, con=self.conn),
how="inner",
on=self.merge_cols,
)

object_df = aggregate(
population_df=population_df,
strata=self.strata,
features=self.features,
operation=self.operation,
subset_data_df=self.subset_data_df,
)

return object_df

def aggregate_profiles(
self,
compute_subsample="False",
output_file="none",
compression=None,
float_format=None,
):
"""
        Aggregate and merge compartments. This is the primary entry point to this class.

Arguments:
        compute_subsample - [default: False] boolean indicating whether to compute the subsample.
                            NOTE: subsampling is applied only when this is True, even if
                            subsampling parameters were set at initialization.
        output_file - [default: "none"] if provided, will write aggregated profiles to file;
                      if not specified, will return the aggregated profiles. We recommend
                      that this output file be suffixed with "_augmented.csv".
compression - [default: None] the mechanism to compress. See cyto_utils/output.py for options.
float_format - [default: None] decimal precision to use in writing output file
For example, use "%.3g" for 3 decimal precision.

Return:
        if output_file is set, write the aggregated profiles to file; otherwise, return them as a DataFrame
"""

if output_file != "none":
self.set_output_file(output_file)

aggregated = (
self.aggregate_compartment(
compartment="cells", compute_subsample=compute_subsample
)
.merge(
self.aggregate_compartment(compartment="cytoplasm"),
on=self.strata,
how="inner",
)
.merge(
self.aggregate_compartment(compartment="nuclei"),
on=self.strata,
how="inner",
)
)

self.is_aggregated = True

if self.output_file != "none":
output(
df=aggregated,
output_filename=self.output_file,
compression=compression,
float_format=float_format,
)
else:
return aggregated


def aggregate(
population_df,
strata=["Metadata_Plate", "Metadata_Well"],
features="infer",
operation="median",
output_file="none",
subset_data_df="none",
):
"""
@@ -307,6 +28,9 @@ def aggregate(
    features - [default: "infer"] list of features to aggregate, or "infer" to detect features automatically
operation - [default: "median"] a string indicating how the data is aggregated
currently only supports one of ['mean', 'median']
    output_file - [default: "none"] if provided, will write aggregated profiles to file;
if not specified, will return the aggregated profiles. We recommend
naming the file based on the plate name.
subset_data_df - [default: "none"] a pandas dataframe indicating how to subset the input

Return:
@@ -349,4 +73,14 @@ def aggregate(
if col in population_df.columns:
population_df = population_df.drop([col], axis="columns")

if output_file != "none":
output(
df=population_df,
output_filename=output_file,
compression=compression,
float_format=float_format,
)
else:
return population_df

return population_df
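
For orientation, here is a minimal usage sketch of the two entry points visible in this diff: the AggregateProfiles class and the module-level aggregate() function. The SQLite URL, plate, well, and feature names below are hypothetical, and the snippet assumes the package state at this PR; it is illustrative, not part of the change.

```python
import pandas as pd
from pycytominer.aggregate import AggregateProfiles, aggregate

# Class entry point: aggregate a single-cell SQLite database into per-well profiles.
# sql_file is passed to sqlalchemy.create_engine, so it is a connection URL
# (the path below is hypothetical).
ap = AggregateProfiles(
    sql_file="sqlite:////tmp/example_plate.sqlite",
    strata=["Metadata_Plate", "Metadata_Well"],
    operation="median",
)
well_profiles = ap.aggregate_profiles()  # returns a DataFrame when output_file is "none"

# Function entry point: aggregate an in-memory single-cell DataFrame.
population_df = pd.DataFrame(
    {
        "Metadata_Plate": ["plate1"] * 4,
        "Metadata_Well": ["A01", "A01", "A02", "A02"],
        "Cells_AreaShape_Area": [100, 120, 90, 95],
    }
)
median_profiles = aggregate(
    population_df=population_df,
    strata=["Metadata_Plate", "Metadata_Well"],
    features=["Cells_AreaShape_Area"],
    operation="median",
)
```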
7 changes: 7 additions & 0 deletions pycytominer/cyto_utils/__init__.py
@@ -1,12 +1,18 @@
from .output import output
from .util import (
check_compartments,
get_default_compartments,
load_known_metadata_dictionary,
check_correlation_method,
check_aggregate_operation,
check_consensus_operation,
get_pairwise_correlation,
)
from .single_cell_ingest_utils import (
get_default_linking_cols,
assert_linking_cols_complete,
provide_linking_cols_feature_name_update,
)
from .load import (
load_profiles,
load_platemap,
@@ -17,6 +23,7 @@
count_na_features,
infer_cp_features,
drop_outlier_features,
convert_compartment_format_to_list,
)
from .write_gct import write_gct
from .modz import modz
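
The __init__.py changes above add several helpers to the package namespace. A short sketch of importing the new exports follows; the import paths come from the diff, but the function signatures are not shown here, so the zero-argument calls are assumptions.

```python
# Import paths follow the updated pycytominer/cyto_utils/__init__.py shown above.
from pycytominer.cyto_utils import (
    get_default_compartments,
    get_default_linking_cols,
    assert_linking_cols_complete,
    provide_linking_cols_feature_name_update,
    convert_compartment_format_to_list,
)

# Signatures are not visible in this diff; the zero-argument calls below are assumptions.
compartments = get_default_compartments()   # e.g. ["cells", "cytoplasm", "nuclei"]
linking_cols = get_default_linking_cols()   # default columns linking single-cell tables
```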