diff --git a/cell2cell/__init__.py b/cell2cell/__init__.py index 64eb7e8..943c458 100644 --- a/cell2cell/__init__.py +++ b/cell2cell/__init__.py @@ -14,4 +14,4 @@ from cell2cell import tensor from cell2cell import utils -__version__ = "0.6.0" \ No newline at end of file +__version__ = "0.6.1" \ No newline at end of file diff --git a/cell2cell/preprocessing/__init__.py b/cell2cell/preprocessing/__init__.py index 487ea6a..95bebd8 100644 --- a/cell2cell/preprocessing/__init__.py +++ b/cell2cell/preprocessing/__init__.py @@ -4,7 +4,7 @@ from cell2cell.preprocessing.cutoffs import (get_constant_cutoff, get_cutoffs, get_global_percentile_cutoffs, get_local_percentile_cutoffs) -from cell2cell.preprocessing.find_elements import (find_duplicates) +from cell2cell.preprocessing.find_elements import (find_duplicates, get_element_abundances, get_elements_over_fraction) from cell2cell.preprocessing.gene_ontology import (find_all_children_of_go_term, find_go_terms_from_keyword, get_genes_from_go_hierarchy, get_genes_from_go_terms) from cell2cell.preprocessing.integrate_data import (get_thresholded_rnaseq, get_modified_rnaseq, get_ppi_dict_from_go_terms, diff --git a/cell2cell/preprocessing/find_elements.py b/cell2cell/preprocessing/find_elements.py index 20a506a..d28d246 100644 --- a/cell2cell/preprocessing/find_elements.py +++ b/cell2cell/preprocessing/find_elements.py @@ -2,7 +2,8 @@ from __future__ import absolute_import -from collections import defaultdict +import itertools +from collections import defaultdict, Counter def find_duplicates(element_list): '''Function based on: https://stackoverflow.com/a/5419576/12032899 @@ -25,4 +26,51 @@ def find_duplicates(element_list): duplicate_dict = {key : locs for key,locs in tally.items() if len(locs)>1} - return duplicate_dict \ No newline at end of file + return duplicate_dict + + +def get_element_abundances(element_lists): + '''Computes the fraction of occurrence of each element + in a list of lists. + + Parameters + ---------- + element_lists : list + List of lists of elements. Elements will be + counted only once in each of the lists. + + Returns + ------- + abundance_dict : dict + Dictionary containing the number of times that an + element was present, divided by the total number of + lists in `element_lists`. + ''' + abundance_dict = Counter(itertools.chain(*map(set, element_lists))) + total = len(element_lists) + abundance_dict = {k : v/total for k, v in abundance_dict.items()} + return abundance_dict + + +def get_elements_over_fraction(abundance_dict, fraction): + '''Obtains a list of elements with the + fraction of occurrence at least the threshold. + + Parameters + ---------- + abundance_dict : dict + Dictionary containing the number of times that an + element was present, divided by the total number of + possible occurrences. + + fraction : float + Threshold to filter the elements. Elements with at least + this threshold will be included. + + Returns + ------- + elements : list + List of elements that met the fraction criteria. + ''' + elements = [k for k, v in abundance_dict.items() if v >= fraction] + return elements \ No newline at end of file diff --git a/cell2cell/stats/permutation.py b/cell2cell/stats/permutation.py index 241bc5b..f7a05af 100644 --- a/cell2cell/stats/permutation.py +++ b/cell2cell/stats/permutation.py @@ -49,12 +49,20 @@ def compute_pvalue_from_dist(obs_value, dist, consider_size=False, comparison='u P-value obtained from comparing the observed value and values in the distribution. ''' + # Omit nan values + dist_ = [x for x in dist if ~np.isnan(x)] + + # All values in dist are NaNs or obs_value is NaN + if (len(dist_) == 0) | np.isnan(obs_value): + return 1.0 + + # No NaN values if comparison == 'lower': - pval = scipy.stats.percentileofscore(dist, obs_value) / 100.0 + pval = scipy.stats.percentileofscore(dist_, obs_value) / 100.0 elif comparison == 'upper': - pval = 1.0 - scipy.stats.percentileofscore(dist, obs_value) / 100.0 + pval = 1.0 - scipy.stats.percentileofscore(dist_, obs_value) / 100.0 elif comparison == 'different': - percentile = scipy.stats.percentileofscore(dist, obs_value) / 100.0 + percentile = scipy.stats.percentileofscore(dist_, obs_value) / 100.0 if percentile <= 0.5: pval = 2.0 * percentile else: @@ -63,7 +71,7 @@ def compute_pvalue_from_dist(obs_value, dist, consider_size=False, comparison='u raise NotImplementedError('Comparison {} is not implemented'.format(comparison)) if (consider_size) & (pval == 0.): - pval = 1./(len(dist) + 1e-6) + pval = 1./(len(dist_) + 1e-6) return pval diff --git a/cell2cell/tensor/external_scores.py b/cell2cell/tensor/external_scores.py index c40e7b5..4ff5f3d 100644 --- a/cell2cell/tensor/external_scores.py +++ b/cell2cell/tensor/external_scores.py @@ -4,12 +4,13 @@ import pandas as pd from collections import defaultdict +from cell2cell.preprocessing.find_elements import get_element_abundances, get_elements_over_fraction from cell2cell.tensor.tensor import PreBuiltTensor def dataframes_to_tensor(context_df_dict, sender_col, receiver_col, ligand_col, receptor_col, score_col, how='inner', - lr_fill=np.nan, cell_fill=np.nan, lr_sep='^', context_order=None, order_labels=None, - sort_elements=True, device=None): + outer_fraction=0.0, lr_fill=np.nan, cell_fill=np.nan, lr_sep='^', context_order=None, + order_labels=None, sort_elements=True, device=None): '''Generates an InteractionTensor from a dictionary containing dataframes for all contexts. @@ -55,6 +56,13 @@ def dataframes_to_tensor(context_df_dict, sender_col, receiver_col, ligand_col, contexts (intersection), while all cell types that are present across contexts (union). + outer_fraction : float, default=0.0 + Threshold to filter the elements when `how` includes any outer option. + Elements with a fraction abundance across contexts (in `context_df_dict`) + at least this threshold will be included. When this value is 0, considers + all elements across the samples. When this value is 1, it acts as using + `how='inner'`. + lr_fill : float, default=numpy.nan Value to fill communication scores when a ligand-receptor pair is not present across all contexts. @@ -123,41 +131,32 @@ def dataframes_to_tensor(context_df_dict, sender_col, receiver_col, ligand_col, receiver_dict[k].update(df[receiver_col].unique().tolist()) # Subset LR pairs, sender and receiver cells given parameter 'how' - for i, k in enumerate(context_order): - if i == 0: - inter_lrs = set(lr_dict[k]) - inter_senders = set(sender_dict[k]) - inter_receivers = set(receiver_dict[k]) - - union_lrs = set(lr_dict[k]) - union_senders = set(sender_dict[k]) - union_receivers = set(receiver_dict[k]) - - else: - inter_lrs = inter_lrs.intersection(set(lr_dict[k])) - inter_senders = inter_senders.intersection(set(sender_dict[k])) - inter_receivers = inter_receivers.intersection(set(receiver_dict[k])) - - union_lrs = union_lrs.union(set(lr_dict[k])) - union_senders = union_senders.union(set(sender_dict[k])) - union_receivers = union_receivers.union(set(receiver_dict[k])) + df_lrs = [list(lr_dict[k]) for k in context_order] + df_senders = [list(sender_dict[k]) for k in context_order] + df_receivers = [list(receiver_dict[k]) for k in context_order] if how == 'inner': - lr_pairs = list(inter_lrs) - sender_cells = list(inter_senders) - receiver_cells = list(inter_receivers) + lr_pairs = list(set.intersection(*map(set, df_lrs))) + sender_cells = list(set.intersection(*map(set, df_senders))) + receiver_cells = list(set.intersection(*map(set, df_receivers))) elif how == 'outer': - lr_pairs = list(union_lrs) - sender_cells = list(union_senders) - receiver_cells = list(union_receivers) + lr_pairs = get_elements_over_fraction(abundance_dict=get_element_abundances(element_lists=df_lrs), + fraction=outer_fraction) + sender_cells = get_elements_over_fraction(abundance_dict=get_element_abundances(element_lists=df_senders), + fraction=outer_fraction) + receiver_cells = get_elements_over_fraction(abundance_dict=get_element_abundances(element_lists=df_receivers), + fraction=outer_fraction) elif how == 'outer_lrs': - lr_pairs = list(union_lrs) - sender_cells = list(inter_senders) - receiver_cells = list(inter_receivers) + lr_pairs = get_elements_over_fraction(abundance_dict=get_element_abundances(element_lists=df_lrs), + fraction=outer_fraction) + sender_cells = list(set.intersection(*map(set, df_senders))) + receiver_cells = list(set.intersection(*map(set, df_receivers))) elif how == 'outer_cells': - lr_pairs = list(inter_lrs) - sender_cells = list(union_senders) - receiver_cells = list(union_receivers) + lr_pairs = list(set.intersection(*map(set, df_lrs))) + sender_cells = get_elements_over_fraction(abundance_dict=get_element_abundances(element_lists=df_senders), + fraction=outer_fraction) + receiver_cells = get_elements_over_fraction(abundance_dict=get_element_abundances(element_lists=df_receivers), + fraction=outer_fraction) else: raise ValueError("Not a valid input for parameter 'how'") diff --git a/cell2cell/tensor/tensor.py b/cell2cell/tensor/tensor.py index faab85a..3172548 100644 --- a/cell2cell/tensor/tensor.py +++ b/cell2cell/tensor/tensor.py @@ -8,6 +8,7 @@ from tqdm.auto import tqdm from cell2cell.core.communication_scores import compute_ccc_matrix, aggregate_ccc_matrices +from cell2cell.preprocessing.find_elements import get_element_abundances, get_elements_over_fraction from cell2cell.preprocessing.ppi import get_genes_from_complexes from cell2cell.preprocessing.rnaseq import add_complexes_to_expression from cell2cell.preprocessing.ppi import filter_ppi_by_proteins @@ -50,6 +51,13 @@ class BaseTensor(): contexts (intersection), while all cell types that are present across contexts (union). + outer_fraction : float + Threshold to filter the elements when `how` includes any outer option. + Elements with a fraction abundance across samples (in `rnaseq_matrices`) + at least this threshold will be included. When this value is 0, considers + all elements across the samples. When this value is 1, it acts as using + `how='inner'`. + tensor : tensorly.tensor Tensor object created with the library tensorly. @@ -114,6 +122,7 @@ def __init__(self): # Save variables for this class self.communication_score = None self.how = None + self.outer_fraction = None self.tensor = None self.genes = None self.cells = None @@ -474,7 +483,7 @@ def export_factor_loadings(self, filename): print('Loadings of the tensor factorization were successfully saved into {}'.format(filename)) def excluded_value_fraction(self): - '''Returns the fraction of missing/excluded values in the tensor, + '''Returns the fraction of excluded values in the tensor, given the values that are masked in tensor.mask Returns @@ -490,6 +499,38 @@ def excluded_value_fraction(self): excluded_fraction = 1.0 - fraction.item() return excluded_fraction + def sparsity_fraction(self): + '''Returns the fraction of values that are zeros in the tensor, + given the values that are in tensor.loc_zeros + + Returns + ------- + sparsity_fraction : float + Fraction of values that are real zeros. + ''' + if self.loc_zeros is None: + print("The interaction tensor does not have zeros") + return 0.0 + else: + sparsity_fraction = tl.sum(self.loc_zeros) / tl.prod(tl.tensor(self.tensor.shape)) + return sparsity_fraction + + def missing_fraction(self): + '''Returns the fraction of values that are missing (NaNs) in the tensor, + given the values that are in tensor.loc_nans + + Returns + ------- + missing_fraction : float + Fraction of values that are real zeros. + ''' + if self.loc_nans is None: + print("The interaction tensor does not have zeros") + return 0.0 + else: + missing_fraction = tl.sum(self.loc_nans) / tl.prod(tl.tensor(self.tensor.shape)) + return missing_fraction + def explained_variance(self): '''Computes the explained variance score for a tensor decomposition. Inspired on the function in sklearn.metrics.explained_variance_score. @@ -560,6 +601,13 @@ class InteractionTensor(BaseTensor): contexts (intersection), while all cell types that are present across contexts (union). + outer_fraction : float, default=0.0 + Threshold to filter the elements when `how` includes any outer option. + Elements with a fraction abundance across samples (in `rnaseq_matrices`) + at least this threshold will be included. When this value is 0, considers + all elements across the samples. When this value is 1, it acts as using + `how='inner'`. + communication_score : str, default='expression_mean' Type of communication score to infer the potential use of a given ligand- receptor pair by a pair of cells/tissues/samples. @@ -620,7 +668,7 @@ class InteractionTensor(BaseTensor): verbose : boolean, default=False Whether printing or not steps of the analysis. ''' - def __init__(self, rnaseq_matrices, ppi_data, order_labels=None, context_names=None, how='inner', + def __init__(self, rnaseq_matrices, ppi_data, order_labels=None, context_names=None, how='inner', outer_fraction=0.0, communication_score='expression_mean', complex_sep=None, complex_agg_method='min', upper_letter_comparison=True, interaction_columns=('A', 'B'), group_ppi_by=None, group_ppi_method='gmean', device=None, verbose=True): @@ -656,6 +704,7 @@ def __init__(self, rnaseq_matrices, ppi_data, order_labels=None, context_names=N tensor, genes, cells, ppi_names, mask = build_context_ccc_tensor(rnaseq_matrices=mod_rnaseq_matrices, ppi_data=ppi_data, how=how, + outer_fraction=outer_fraction, communication_score=communication_score, complex_sep=complex_sep, upper_letter_comparison=upper_letter_comparison, @@ -680,6 +729,7 @@ def __init__(self, rnaseq_matrices, ppi_data, order_labels=None, context_names=N # Save variables for this class self.communication_score = communication_score self.how = how + self.outer_fraction = outer_fraction if device is None: self.tensor = tl.tensor(tensor) self.mask = mask @@ -736,6 +786,15 @@ def __init__(self, tensor, order_names, order_labels=None, mask=None, loc_nans=N # Init BaseTensor BaseTensor.__init__(self) + # Initialize tensor + try: + context = tl.context(tensor) + except: + context = {'dtype': tensor.dtype, 'device' : None} + tensor = tl.to_numpy(tensor) + if mask is not None: + mask = tl.to_numpy(mask) + # Location of NaNs and zeros tmp_nans = (np.isnan(tensor)).astype(int) # Find extra NaNs that were not considered if loc_nans is None: @@ -751,30 +810,34 @@ def __init__(self, tensor, order_names, order_labels=None, mask=None, loc_nans=N # Store tensor tensor_ = np.nan_to_num(tensor) - if device is None: + if device is not None: + context['device'] = device + if 'device' not in context.keys(): self.tensor = tl.tensor(tensor_) - self.mask = mask - else: - if tl.get_backend() == 'pytorch': - self.tensor = tl.tensor(tensor_, device=device) - if mask is not None: - self.mask = tl.tensor(mask, device=device) - else: - self.mask = mask + if mask is None: + self.mask = mask else: - self.tensor = tl.tensor(tensor_) + self.mask = tl.tensor(mask) + else: + self.tensor = tl.tensor(tensor_, device=context['device']) + if mask is None: self.mask = mask + else: + self.mask = tl.tensor(mask, device=context['device']) + + # Potential TODO: make loc_nans and loc_zeros to be a tensor object using the same context. + self.order_names = order_names if order_labels is None: - self.order_labels = ['Dimension-{}'.format(i + 1) for i in range(self.tensor.shape)] + self.order_labels = ['Dimension-{}'.format(i + 1) for i in range(len(self.tensor.shape))] else: self.order_labels = order_labels assert len(self.tensor.shape) == len(self.order_labels), "The length of order_labels must match the number of orders/dimensions in the tensor" -def build_context_ccc_tensor(rnaseq_matrices, ppi_data, how='inner', communication_score='expression_product', - complex_sep=None, upper_letter_comparison=True, interaction_columns=('A', 'B'), - group_ppi_by=None, group_ppi_method='gmean', verbose=True): +def build_context_ccc_tensor(rnaseq_matrices, ppi_data, how='inner', outer_fraction=0.0, + communication_score='expression_product', complex_sep=None, upper_letter_comparison=True, + interaction_columns=('A', 'B'), group_ppi_by=None, group_ppi_method='gmean', verbose=True): '''Builds a 4D-Communication tensor. Takes the gene expression matrices and the list of PPIs to compute the communication scores between the interacting cells for each PPI. @@ -805,6 +868,13 @@ def build_context_ccc_tensor(rnaseq_matrices, ppi_data, how='inner', communicati contexts (intersection), while all cell types that are present across contexts (union). + outer_fraction : float, default=0.0 + Threshold to filter the elements when `how` includes any outer option. + Elements with a fraction abundance across samples (in `rnaseq_matrices`) + at least this threshold will be included. When this value is 0, considers + all elements across the samples. When this value is 1, it acts as using + `how='inner'`. + communication_score : str, default='expression_mean' Type of communication score to infer the potential use of a given ligand- receptor pair by a pair of cells/tissues/samples. @@ -877,23 +947,23 @@ def build_context_ccc_tensor(rnaseq_matrices, ppi_data, how='inner', communicati ''' df_idxs = [list(rnaseq.index) for rnaseq in rnaseq_matrices] df_cols = [list(rnaseq.columns) for rnaseq in rnaseq_matrices] - inter_genes = set.intersection(*map(set, df_idxs)) - inter_cells = set.intersection(*map(set, df_cols)) - union_genes = set.union(*map(set, df_idxs)) - union_cells = set.union(*map(set, df_cols)) if how == 'inner': - genes = inter_genes - cells = inter_cells + genes = set.intersection(*map(set, df_idxs)) + cells = set.intersection(*map(set, df_cols)) elif how == 'outer': - genes = union_genes - cells = union_cells + genes = set(get_elements_over_fraction(abundance_dict=get_element_abundances(element_lists=df_idxs), + fraction=outer_fraction)) + cells = set(get_elements_over_fraction(abundance_dict=get_element_abundances(element_lists=df_cols), + fraction=outer_fraction)) elif how == 'outer_genes': - genes = union_genes - cells = inter_cells + genes = set(get_elements_over_fraction(abundance_dict=get_element_abundances(element_lists=df_idxs), + fraction=outer_fraction)) + cells = set.intersection(*map(set, df_cols)) elif how == 'outer_cells': - genes = inter_genes - cells = union_cells + genes = set.intersection(*map(set, df_idxs)) + cells = set(get_elements_over_fraction(abundance_dict=get_element_abundances(element_lists=df_cols), + fraction=outer_fraction)) else: raise ValueError('Provide a valid way to build the tensor; "how" must be "inner", "outer", "outer_genes" or "outer_cells"') @@ -1110,7 +1180,7 @@ def generate_tensor_metadata(interaction_tensor, metadata_dicts, fill_with_order return metadata -def interactions_to_tensor(interactions, experiment='single_cell', context_names=None, how='inner', +def interactions_to_tensor(interactions, experiment='single_cell', context_names=None, how='inner', outer_fraction=0.0, communication_score='expression_product', upper_letter_comparison=True, verbose=True): '''Takes a list of Interaction pipelines (see classes in cell2cell.analysis.pipelines) and generates a communication @@ -1144,6 +1214,13 @@ def interactions_to_tensor(interactions, experiment='single_cell', context_names contexts (intersection), while all cell types that are present across contexts (union). + outer_fraction : float, default=0.0 + Threshold to filter the elements when `how` includes any outer option. + Elements with a fraction abundance across samples at least this + threshold will be included. When this value is 0, considers + all elements across the samples. When this value is 1, it acts as using + `how='inner'`. + communication_score : str, default='expression_mean' Type of communication score to infer the potential use of a given ligand- receptor pair by a pair of cells/tissues/samples. @@ -1196,6 +1273,7 @@ def interactions_to_tensor(interactions, experiment='single_cell', context_names ppi_data=ppi_data, context_names=context_names, how=how, + outer_fraction=outer_fraction, complex_sep=complex_sep, complex_agg_method=complex_agg_method, interaction_columns=interaction_columns, diff --git a/cell2cell/tensor/tensor_manipulation.py b/cell2cell/tensor/tensor_manipulation.py index 9b977e5..e62ff77 100644 --- a/cell2cell/tensor/tensor_manipulation.py +++ b/cell2cell/tensor/tensor_manipulation.py @@ -58,17 +58,27 @@ def concatenate_interaction_tensors(interaction_tensors, axis, order_labels, rem for tensor in interaction_tensors[1:]: assert elements == tensor.order_names[i], "Tensors must have the same elements in the other axes." + # Initialize tensors into a numpy object for performing subset + # Use the same context as first tensor for everything + try: + context = tl.context(interaction_tensors[0].tensor) + except: + context = {'dtype': interaction_tensors[0].tensor.dtype, 'device' : None} # Concatenate tensors - concat_tensor = tl.concatenate([tensor.tensor for tensor in interaction_tensors], axis=axis) + concat_tensor = tl.concatenate([tensor.tensor.to('cpu') for tensor in interaction_tensors], axis=axis) if mask is not None: assert mask.shape == concat_tensor.shape, "Mask must have the same shape of the concatenated tensor. Here: {}".format(concat_tensor.shape) else: # Generate a new mask from all previous masks if all are not None if all([tensor.mask is not None for tensor in interaction_tensors]): - mask = tl.concatenate([tensor.mask for tensor in interaction_tensors], axis=axis) + mask = tl.concatenate([tensor.mask.to('cpu') for tensor in interaction_tensors], axis=axis) else: mask = None + concat_tensor = tl.tensor(concat_tensor, device=context['device']) + if mask is not None: + mask = tl.tensor(mask, device=context['device']) + # Concatenate names of elements for the given axis but keep the others as in one tensor order_names = [] for i in range(shape): diff --git a/release/0.6.1-notes.md b/release/0.6.1-notes.md new file mode 100644 index 0000000..7bc105e --- /dev/null +++ b/release/0.6.1-notes.md @@ -0,0 +1,20 @@ +# Release Notes - cell2cell v0.6.1 + +## New features +- Implemented the option to filter for cells/genes/lr pairs that are present in a given +fraction of samples/contexts in addition to using the union or intersection to build a +tensor derived from `BaseTensor`. This can be controlled with the parameter `outer_fraction` +in the classes/functions available in `cell2cell.tensor.tensor` and `cell2cell.tensor.external_scores`. +- Added method `sparsity_fraction()` to `cell2cell.tensor.tensor.BaseTensor`, which computes the fraction of +values in the tensor that are real zeros. +- Added method `missing_fraction()` to `cell2cell.tensor.tensor.BaseTensor`, which computes the fraction of +values in the tensor that are missing or NaNs. + +## Feature updates +- `cellcell2.stats.permutation.compute_pvalue_from_dist()` ignores NaN values. + +## Fixed Bugs +- Fixed bug of `cell2cell.tensor.concatenate_interaction_tensors()` that did not allow +concatenating tensors when using a tensorly backend different to numpy. +- Fixed bug to deal with GPU tensors in `cell2cell.tensor.tensor.PreBuiltTensor` +- Fixed bug about dimension labelling in `cell2cell.tensor.tensor.PreBuiltTensor` \ No newline at end of file