diff --git a/cell2cell/__init__.py b/cell2cell/__init__.py index 6315850..64eb7e8 100644 --- a/cell2cell/__init__.py +++ b/cell2cell/__init__.py @@ -14,4 +14,4 @@ from cell2cell import tensor from cell2cell import utils -__version__ = "0.5.11" \ No newline at end of file +__version__ = "0.6.0" \ No newline at end of file diff --git a/cell2cell/analysis/pipelines.py b/cell2cell/analysis/pipelines.py index ba3aa94..1a11720 100644 --- a/cell2cell/analysis/pipelines.py +++ b/cell2cell/analysis/pipelines.py @@ -62,6 +62,7 @@ class BulkInteractions: - 'bray_curtis' : Bray-Curtis-like score. - 'jaccard' : Jaccard-like score. - 'count' : Number of LR pairs that the pair of cells use. + - 'icellnet' : Sum of the L-R expression product of a pair of cells cci_type : str, default='undirected' Specifies whether computing the cci_score in a directed or undirected @@ -93,6 +94,7 @@ class BulkInteractions: - 'min' : Minimum expression value among all genes. - 'mean' : Average expression value among all genes. + - 'gmean' : Geometric mean expression value among all genes. verbose : boolean, default=False Whether printing or not steps of the analysis. @@ -122,13 +124,13 @@ class BulkInteractions: For example, '&' is the complex_sep for a list of ligand-receptor pairs where a protein partner could be "CD74&CD44". - complex_agg_method : str Method to aggregate the expression value of multiple genes in a complex. - 'min' : Minimum expression value among all genes. - 'mean' : Average expression value among all genes. + - 'gmean' : Geometric mean expression value among all genes. ref_ppi : pandas.DataFrame Reference list of protein-protein interactions (or ligand-receptor pairs) used @@ -164,6 +166,7 @@ class BulkInteractions: - 'bray_curtis' - 'jaccard' - 'count' + - 'icellnet' - 'cci_type' : is the type of interaction between two cells. If it is undirected, all ligands and receptors are considered from both cells. @@ -274,6 +277,7 @@ def compute_pairwise_cci_scores(self, cci_score=None, use_ppi_score=False, verbo - 'bray_curtis' : Bray-Curtis-like score. - 'jaccard' : Jaccard-like score. - 'count' : Number of LR pairs that the pair of cells use. + - 'icellnet' : Sum of the L-R expression product of a pair of cells use_ppi_score : boolean, default=False Whether using a weight of LR pairs specified in the ppi_data @@ -422,6 +426,7 @@ class SingleCellInteractions: - 'bray_curtis' : Bray-Curtis-like score. - 'jaccard' : Jaccard-like score. - 'count' : Number of LR pairs that the pair of cells use. + - 'icellnet' : Sum of the L-R expression product of a pair of cells cci_type : str, default='undirected' Specifies whether computing the cci_score in a directed or undirected @@ -466,6 +471,7 @@ class SingleCellInteractions: - 'min' : Minimum expression value among all genes. - 'mean' : Average expression value among all genes. + - 'gmean' : Geometric mean expression value among all genes. verbose : boolean, default=False Whether printing or not steps of the analysis. @@ -503,6 +509,7 @@ class SingleCellInteractions: - 'min' : Minimum expression value among all genes. - 'mean' : Average expression value among all genes. + - 'gmean' : Geometric mean expression value among all genes. ref_ppi : pandas.DataFrame Reference list of protein-protein interactions (or ligand-receptor pairs) used @@ -538,6 +545,7 @@ class SingleCellInteractions: - 'bray_curtis' - 'jaccard' - 'count' + - 'icellnet' - 'cci_type' : is the type of interaction between two cells. If it is undirected, all ligands and receptors are considered from both cells. @@ -638,15 +646,16 @@ def __init__(self, rnaseq_data, ppi_data, metadata, interaction_columns=('A', 'B self.analysis_setup['cci_type'] = cci_type # Initialize PPI - ppi_data_ = ppi.filter_ppi_by_proteins(ppi_data=ppi_data, proteins=genes, complex_sep=complex_sep, upper_letter_comparison=False, interaction_columns=interaction_columns) + self.ppi_data = ppi.remove_ppi_bidirectionality(ppi_data=ppi_data_, interaction_columns=interaction_columns, verbose=verbose) + if self.analysis_setup['cci_type'] == 'undirected': self.ref_ppi = self.ppi_data self.ppi_data = ppi.bidirectional_ppi_for_cci(ppi_data=self.ppi_data, @@ -854,6 +863,7 @@ def initialize_interaction_space(rnaseq_data, ppi_data, cutoff_setup, analysis_s - 'bray_curtis' - 'jaccard' - 'count' + - 'icellnet' - 'cci_type' : is the type of interaction between two cells. If it is undirected, all ligands and receptors are considered from both cells. diff --git a/cell2cell/core/__init__.py b/cell2cell/core/__init__.py index 33f64e0..2bf34d9 100644 --- a/cell2cell/core/__init__.py +++ b/cell2cell/core/__init__.py @@ -2,8 +2,9 @@ from __future__ import absolute_import -from cell2cell.core.cci_scores import (compute_braycurtis_like_cci_score, compute_count_score, compute_jaccard_like_cci_score, - matmul_bray_curtis_like, matmul_count_active, matmul_jaccard_like) +from cell2cell.core.cci_scores import (compute_braycurtis_like_cci_score, compute_count_score, compute_icellnet_score, + compute_jaccard_like_cci_score, matmul_bray_curtis_like, matmul_count_active, + matmul_jaccard_like) from cell2cell.core.cell import (Cell, get_cells_from_rnaseq) from cell2cell.core.communication_scores import (get_binary_scores, get_continuous_scores, compute_ccc_matrix, aggregate_ccc_matrices) from cell2cell.core.interaction_space import (generate_interaction_elements, InteractionSpace) \ No newline at end of file diff --git a/cell2cell/core/cci_scores.py b/cell2cell/core/cci_scores.py index 407853b..64f8c4e 100644 --- a/cell2cell/core/cci_scores.py +++ b/cell2cell/core/cci_scores.py @@ -139,6 +139,45 @@ def compute_count_score(cell1, cell2, ppi_score=None): return cci_score +def compute_icellnet_score(cell1, cell2, ppi_score=None): + '''Calculates the sum of communication scores + for the interaction between two cells. Based on ICELLNET. + + Parameters + ---------- + cell1 : cell2cell.core.cell.Cell + First cell-type/tissue/sample to compute interaction + between a pair of them. In a directed interaction, + this is the sender. + + cell2 : cell2cell.core.cell.Cell + Second cell-type/tissue/sample to compute interaction + between a pair of them. In a directed interaction, + this is the receiver. + + Returns + ------- + cci_score : float + Overall score for the interaction between a pair of + cell-types/tissues/samples. + ''' + c1 = cell1.weighted_ppi['A'].values + c2 = cell2.weighted_ppi['B'].values + + if (len(c1) == 0) or (len(c2) == 0): + return 0.0 + + if ppi_score is None: + ppi_score = np.array([1.0] * len(c1)) + + mult = c1 * c2 * ppi_score + cci_score = np.nansum(mult) + + if cci_score is np.nan: + return 0.0 + return cci_score + + def matmul_jaccard_like(A_scores, B_scores, ppi_score=None): '''Computes Jaccard-like scores using matrices of proteins by cell-types/tissues/samples. diff --git a/cell2cell/core/interaction_space.py b/cell2cell/core/interaction_space.py index b4cdf87..d9c946b 100644 --- a/cell2cell/core/interaction_space.py +++ b/cell2cell/core/interaction_space.py @@ -30,7 +30,7 @@ def generate_pairs(cells, cci_type, self_interaction=True, remove_duplicates=Tru self_interaction : boolean, default=True Whether considering autocrine interactions (pair A-A, B-B, etc). - remove_duplicates : booleanm default=True + remove_duplicates : boolean, default=True Whether removing duplicates when a list of cells is passed and names are duplicated. If False and a list [A, A, B] is passed, pairs could be [A-A, A-A, A-B, A-A, A-A, A-B, B-A, B-A, B-B] when self_interaction is True @@ -110,6 +110,7 @@ def generate_interaction_elements(modified_rnaseq, ppi_data, cci_type='undirecte - 'min' : Minimum expression value among all genes. - 'mean' : Average expression value among all genes. + - 'gmean' : Geometric mean expression value among all genes. interaction_columns : tuple, default=('A', 'B') Contains the names of the columns where to find the partners in a @@ -245,6 +246,7 @@ class InteractionSpace(): - 'bray_curtis' - 'jaccard' - 'count' + - 'icellnet' cci_type : str, default='undirected' Type of interaction between two cells. If it is undirected, all ligands @@ -272,6 +274,7 @@ class InteractionSpace(): - 'min' : Minimum expression value among all genes. - 'mean' : Average expression value among all genes. + - 'gmean' : Geometric mean expression value among all genes. interaction_columns : tuple, default=('A', 'B') Contains the names of the columns where to find the partners in a @@ -303,6 +306,7 @@ class InteractionSpace(): - 'bray_curtis' - 'jaccard' - 'count' + - 'icellnet' cci_type : str Type of interaction between two cells. If it is undirected, all ligands @@ -332,7 +336,7 @@ class InteractionSpace(): to store CCI scores(under key 'cci_matrix'). A communication matrix is also stored in this object when the communication scores are computed in the InteractionSpace class (under key - 'communication_score') + 'communication_matrix') distance_matrix : pandas.DataFrame Contains distances for each pair of cells, computed from @@ -408,6 +412,7 @@ def pair_cci_score(self, cell1, cell2, cci_score='bray_curtis', use_ppi_score=Fa - 'bray_curtis' : Bray-Curtis-like score - 'jaccard' : Jaccard-like score - 'count' : Number of LR pairs that the pair of cells uses + - 'icellnet' : Sum of the L-R expression product of a pair of cells use_ppi_score : boolean, default=False Whether using a weight of LR pairs specified in the ppi_data @@ -438,6 +443,8 @@ def pair_cci_score(self, cell1, cell2, cci_score='bray_curtis', use_ppi_score=Fa cci_value = cci_scores.compute_jaccard_like_cci_score(cell1, cell2, ppi_score=ppi_score) elif cci_score == 'count': cci_value = cci_scores.compute_count_score(cell1, cell2, ppi_score=ppi_score) + elif cci_score == 'icellnet': + cci_value = cci_scores.compute_icellnet_score(cell1, cell2, ppi_score=ppi_score) else: raise NotImplementedError("CCI score {} to compute pairwise cell-interactions is not implemented".format(cci_score)) return cci_value @@ -457,6 +464,7 @@ def compute_pairwise_cci_scores(self, cci_score=None, use_ppi_score=False, verbo - 'bray_curtis' : Bray-Curtis-like score - 'jaccard' : Jaccard-like score - 'count' : Number of LR pairs that the pair of cells uses + - 'icellnet' : Sum of the L-R expression product of a pair of cells use_ppi_score : boolean, default=False Whether using a weight of LR pairs specified in the ppi_data @@ -502,7 +510,7 @@ def compute_pairwise_cci_scores(self, cci_score=None, use_ppi_score=False, verbo # ) # Generate distance matrix - if cci_score != 'count': + if ~(cci_score in ['count', 'icellnet']): self.distance_matrix = self.interaction_elements['cci_matrix'].apply(lambda x: 1 - x) else: #self.distance_matrix = self.interaction_elements['cci_matrix'].div(self.interaction_elements['cci_matrix'].max().max()).apply(lambda x: 1 - x) diff --git a/cell2cell/external/tensorly_nn_cp.py b/cell2cell/external/tensorly_nn_cp.py index d8d942b..3ead97e 100644 --- a/cell2cell/external/tensorly_nn_cp.py +++ b/cell2cell/external/tensorly_nn_cp.py @@ -5,7 +5,7 @@ import warnings import tensorly as tl -from tensorly.random import check_random_state, random_cp +from tensorly.random import random_cp # check_random_state, # check_random_state only available in tensorly 0.5.1 from tensorly.base import unfold from tensorly.cp_tensor import (CPTensor, unfolding_dot_khatri_rao, cp_norm, diff --git a/cell2cell/plotting/cci_plot.py b/cell2cell/plotting/cci_plot.py index 7fa5846..83c5682 100644 --- a/cell2cell/plotting/cci_plot.py +++ b/cell2cell/plotting/cci_plot.py @@ -105,9 +105,9 @@ def clustermap_cci(interaction_space, method='ward', optimal_leaf=True, metadata # Drop excluded cells if excluded_cells is not None: df = distance_matrix.loc[~distance_matrix.index.isin(excluded_cells), - ~distance_matrix.columns.isin(excluded_cells)] + ~distance_matrix.columns.isin(excluded_cells)].copy() else: - df = distance_matrix + df = distance_matrix.copy() # Check symmetry to get linkage symmetric = check_symmetry(df) diff --git a/cell2cell/preprocessing/__init__.py b/cell2cell/preprocessing/__init__.py index af181cc..487ea6a 100644 --- a/cell2cell/preprocessing/__init__.py +++ b/cell2cell/preprocessing/__init__.py @@ -4,6 +4,7 @@ from cell2cell.preprocessing.cutoffs import (get_constant_cutoff, get_cutoffs, get_global_percentile_cutoffs, get_local_percentile_cutoffs) +from cell2cell.preprocessing.find_elements import (find_duplicates) from cell2cell.preprocessing.gene_ontology import (find_all_children_of_go_term, find_go_terms_from_keyword, get_genes_from_go_hierarchy, get_genes_from_go_terms) from cell2cell.preprocessing.integrate_data import (get_thresholded_rnaseq, get_modified_rnaseq, get_ppi_dict_from_go_terms, diff --git a/cell2cell/preprocessing/find_elements.py b/cell2cell/preprocessing/find_elements.py new file mode 100644 index 0000000..20a506a --- /dev/null +++ b/cell2cell/preprocessing/find_elements.py @@ -0,0 +1,28 @@ +# -*- coding: utf-8 -*- + +from __future__ import absolute_import + +from collections import defaultdict + +def find_duplicates(element_list): + '''Function based on: https://stackoverflow.com/a/5419576/12032899 + Finds duplicate items and list their index location. + + Parameters + ---------- + element_list : list + List of elements + + Returns + ------- + duplicate_dict : dict + Dictionary with duplicate items. Keys are the items, and values + are lists with the respective indexes where they are. + ''' + tally = defaultdict(list) + for i,item in enumerate(element_list): + tally[item].append(i) + + duplicate_dict = {key : locs for key,locs in tally.items() + if len(locs)>1} + return duplicate_dict \ No newline at end of file diff --git a/cell2cell/preprocessing/rnaseq.py b/cell2cell/preprocessing/rnaseq.py index 63a08f4..8e1477f 100644 --- a/cell2cell/preprocessing/rnaseq.py +++ b/cell2cell/preprocessing/rnaseq.py @@ -163,6 +163,7 @@ def add_complexes_to_expression(rnaseq_data, complexes, agg_method='min'): - 'min' : Minimum expression value among all genes. - 'mean' : Average expression value among all genes. + - 'gmean' : Geometric mean expression value among all genes. Returns ------- @@ -180,8 +181,8 @@ def add_complexes_to_expression(rnaseq_data, complexes, agg_method='min'): tmp_rna.loc[k] = df.min().values.tolist() elif agg_method == 'mean': tmp_rna.loc[k] = df.mean().values.tolist() - # elif agg_method == 'gmean': - # tmp_rna.loc[k] = df.gmean().values.tolist() # Not implemented + elif agg_method == 'gmean': + tmp_rna.loc[k] = df.apply(lambda x: np.exp(np.mean(np.log(x)))).values.tolist() else: ValueError("{} is not a valid agg_method".format(agg_method)) else: diff --git a/cell2cell/tensor/__init__.py b/cell2cell/tensor/__init__.py index df91324..6676983 100644 --- a/cell2cell/tensor/__init__.py +++ b/cell2cell/tensor/__init__.py @@ -4,4 +4,5 @@ from cell2cell.tensor.metrics import (correlation_index) from cell2cell.tensor.tensor import (InteractionTensor, PreBuiltTensor, build_context_ccc_tensor, generate_tensor_metadata, interactions_to_tensor) +from cell2cell.tensor.tensor_manipulation import (concatenate_interaction_tensors) from cell2cell.tensor.subset import (subset_tensor, subset_metadata) diff --git a/cell2cell/tensor/factorization.py b/cell2cell/tensor/factorization.py index a355801..89e42ec 100644 --- a/cell2cell/tensor/factorization.py +++ b/cell2cell/tensor/factorization.py @@ -4,14 +4,12 @@ from tqdm.auto import tqdm from kneed import KneeLocator -from cell2cell.external import non_negative_parafac -# Replace previous line with line below once the random_state is available for init='random' -#from tensorly.decomposition import non_negative_parafac +from tensorly.decomposition import non_negative_parafac, non_negative_parafac_hals, parafac, constrained_parafac import tensorly as tl -def _compute_tensor_factorization(tensor, rank, tf_type='non_negative_cp', init='svd', random_state=None, mask=None, - verbose=False, **kwargs): +def _compute_tensor_factorization(tensor, rank, tf_type='non_negative_cp', init='svd', svd='numpy_svd', random_state=None, + mask=None, n_iter_max=100, tol=10e-7, verbose=False, **kwargs): '''Performs the Tensor Factorization Parameters @@ -26,12 +24,26 @@ def _compute_tensor_factorization(tensor, rank, tf_type='non_negative_cp', init= tf_type : str, default='non_negative_cp' Type of Tensor Factorization. - - 'non_negative_cp' : Non-negative PARAFAC, as implemented in Tensorly + + - 'non_negative_cp' : Non-negative PARAFAC through the traditional ALS. + - 'non_negative_cp_hals' : Non-negative PARAFAC through the Hierarchical ALS. + It reaches an optimal solution faster than the + traditional ALS, but it does not allow a mask. + - 'parafac' : PARAFAC through the traditional ALS. It allows negative loadings. + - 'constrained_parafac' : PARAFAC through the traditional ALS. It allows + negative loadings. Also, it incorporates L1 and L2 + regularization, includes a 'non_negative' option, and + allows constraining the sparsity of the decomposition. + For more information, see + http://tensorly.org/stable/modules/generated/tensorly.decomposition.constrained_parafac.html#tensorly.decomposition.constrained_parafac init : str, default='svd' Initialization method for computing the Tensor Factorization. {‘svd’, ‘random’} + svd : str, default='numpy_svd' + Function to use to compute the SVD, acceptable values in tensorly.SVD_FUNS + random_state : int, default=None Seed for randomization. @@ -40,6 +52,17 @@ def _compute_tensor_factorization(tensor, rank, tf_type='non_negative_cp', init= a boolean array of the same shape as the original tensor and should be 0 where the values are missing and 1 everywhere else. + tol : float, default=10e-7 + Tolerance for the decomposition algorithm to stop when the variation in + the reconstruction error is less than the tolerance. Lower `tol` helps + to improve the solution obtained from the decomposition, but it takes + longer to run. + + n_iter_max : int, default=100 + Maximum number of iteration to reach an optimal solution with the + decomposition algorithm. Higher `n_iter_max`helps to improve the solution + obtained from the decomposition, but it takes longer to run. + verbose : boolean, default=False Whether printing or not steps of the analysis. @@ -51,21 +74,70 @@ def _compute_tensor_factorization(tensor, rank, tf_type='non_negative_cp', init= cp_tf : CPTensor A tensorly object containing a list of initialized factors of the tensor decomposition where element `i` is of shape (tensor.shape[i], rank). + + errors : list + A list of reconstruction errors at each iteration of the algorithms. + Returned only when `kwargs` contains 'return_errors' : True. ''' - if mask is not None: - init = 'random' + # For returning errors + return_errors = False + if kwargs is not None: + if 'return_errors' in kwargs.keys(): + return_errors = kwargs['return_errors'] + if tf_type == 'non_negative_cp': cp_tf = non_negative_parafac(tensor=tensor, rank=rank, - init=init, + init='random' if mask is not None else init, + svd=svd, random_state=random_state, mask=mask, + n_iter_max=n_iter_max, + tol=tol, verbose=verbose, **kwargs) + + if return_errors: + cp_tf, errors = cp_tf + + elif tf_type == 'non_negative_cp_hals': + cp_tf, errors = non_negative_parafac_hals(tensor=tensor, + rank=rank, + init=init, + svd=svd, + #random_state=random_state, # Not implemented in tensorly 0.7.0 commented for now + n_iter_max=n_iter_max, + tol=tol, + verbose=verbose, + **kwargs) + elif tf_type == 'parafac': + cp_tf, errors = parafac(tensor=tensor, + rank=rank, + init='random' if mask is not None else init, + svd=svd, + random_state=random_state, + mask=mask, + n_iter_max=n_iter_max, + tol=tol, + verbose=verbose, + **kwargs) + + elif tf_type == 'constrained_parafac': + cp_tf, errors = constrained_parafac(tensor=tensor, + rank=rank, + init='random' if mask is not None else init, + svd=svd, + random_state=random_state, + n_iter_max=n_iter_max, + verbose=verbose, + **kwargs) else: raise ValueError('Not a valid tf_type.') - return cp_tf + if return_errors: + return cp_tf, errors + else: + return cp_tf def _compute_elbow(loss): @@ -86,8 +158,8 @@ def _compute_elbow(loss): return rank -def _run_elbow_analysis(tensor, upper_rank=50, tf_type='non_negative_cp', init='svd', random_state=None, mask=None, - verbose=False, disable_pbar=False, **kwargs): +def _run_elbow_analysis(tensor, upper_rank=50, tf_type='non_negative_cp', init='svd', svd='numpy_svd', random_state=None, + mask=None, n_iter_max=100, tol=10e-7, verbose=False, disable_pbar=False, **kwargs): '''Performs an elbow analysis with just one run of a tensor factorization for each rank @@ -108,6 +180,9 @@ def _run_elbow_analysis(tensor, upper_rank=50, tf_type='non_negative_cp', init=' Initialization method for computing the Tensor Factorization. {‘svd’, ‘random’} + svd : str, default='numpy_svd' + Function to use to compute the SVD, acceptable values in tensorly.SVD_FUNS + random_state : int, default=None Seed for randomization. @@ -116,6 +191,17 @@ def _run_elbow_analysis(tensor, upper_rank=50, tf_type='non_negative_cp', init=' a boolean array of the same shape as the original tensor and should be 0 where the values are missing and 1 everywhere else. + tol : float, default=10e-7 + Tolerance for the decomposition algorithm to stop when the variation in + the reconstruction error is less than the tolerance. Lower `tol` helps + to improve the solution obtained from the decomposition, but it takes + longer to run. + + n_iter_max : int, default=100 + Maximum number of iteration to reach an optimal solution with the + decomposition algorithm. Higher `n_iter_max`helps to improve the solution + obtained from the decomposition, but it takes longer to run. + verbose : boolean, default=False Whether printing or not steps of the analysis. @@ -142,8 +228,11 @@ def _run_elbow_analysis(tensor, upper_rank=50, tf_type='non_negative_cp', init=' rank=r, tf_type=tf_type, init=init, + svd=svd, random_state=random_state, mask=mask, + n_iter_max=n_iter_max, + tol=tol, verbose=verbose, **kwargs) # This helps to obtain proper error when the mask is not None. @@ -155,8 +244,8 @@ def _run_elbow_analysis(tensor, upper_rank=50, tf_type='non_negative_cp', init=' return loss -def _multiple_runs_elbow_analysis(tensor, upper_rank=50, runs=10, tf_type='non_negative_cp', init='svd', random_state=None, - mask=None, verbose=False, **kwargs): +def _multiple_runs_elbow_analysis(tensor, upper_rank=50, runs=10, tf_type='non_negative_cp', init='svd', svd='numpy_svd', + random_state=None, mask=None, n_iter_max=100, tol=10e-7, verbose=False, **kwargs): '''Performs an elbow analysis with multiple runs of a tensor factorization for each rank @@ -219,8 +308,11 @@ def _multiple_runs_elbow_analysis(tensor, upper_rank=50, runs=10, tf_type='non_n rank=r, tf_type=tf_type, init=init, + svd=svd, random_state=rs, mask=mask, + n_iter_max=n_iter_max, + tol=tol, verbose=verbose, **kwargs) # This helps to obtain proper error when the mask is not None. diff --git a/cell2cell/tensor/subset.py b/cell2cell/tensor/subset.py index 570052a..fa5c5d3 100644 --- a/cell2cell/tensor/subset.py +++ b/cell2cell/tensor/subset.py @@ -4,8 +4,9 @@ import numpy as np import tensorly as tl +from cell2cell.preprocessing.find_elements import find_duplicates -def find_element_indexes(interaction_tensor, elements, axis=0, remove_duplicates=True, original_order=False): +def find_element_indexes(interaction_tensor, elements, axis=0, remove_duplicates=True, keep='first', original_order=False): '''Finds the location/indexes of a list of elements in one of the axis of an InteractionTensor. @@ -25,8 +26,16 @@ def find_element_indexes(interaction_tensor, elements, axis=0, remove_duplicates remove_duplicates : boolean, default=True Whether removing duplicated names in `elements`. + keep : str, default='first' + Determines which duplicates (if any) to keep. + Options are: + + - first : Drop duplicates except for the first occurrence. + - last : Drop duplicates except for the last occurrence. + - False : Drop all duplicates. + original_order : boolean, default=False - Wheter keeping the original order of the elements in + Whether keeping the original order of the elements in interaction_tensor.order_names[axis] or keeping the new order as indicated in `elements`. @@ -41,20 +50,42 @@ def find_element_indexes(interaction_tensor, elements, axis=0, remove_duplicates assert axis < len \ (interaction_tensor.order_names), "List index out of range. interaction_tensor.order_names must have element names for each axis of the tensor." - if remove_duplicates: - elements = sorted(set(elements), key=elements.index) + elements = sorted(set(elements), key=list(elements).index) if original_order: # Avoids error for considering elements not in the tensor elements = set(elements).intersection(set(interaction_tensor.order_names[axis])) elements = sorted(elements, key=interaction_tensor.order_names[axis].index) + + # Find duplicates if we are removing them + to_exclude = [] + if remove_duplicates: + dup_dict = find_duplicates(interaction_tensor.order_names[axis]) + + if len(dup_dict) > 0: # Only if we have duplicate items + if keep == 'first': + for k, v in dup_dict.items(): + to_exclude.extend(v[1:]) + elif keep == 'last': + for k, v in dup_dict.items(): + to_exclude.extend(v[:-1]) + elif not keep: + for k, v in dup_dict.items(): + to_exclude.extend(v) + else: + raise ValueError("Not a valid option was selected for the parameter `keep`") + + # Find indexes in the tensor indexes = sum \ ([np.where(np.asarray(interaction_tensor.order_names[axis]) == element)[0].tolist() for element in elements], []) + + # Exclude duplicates if any to exclude + indexes = [idx for idx in indexes if idx not in to_exclude] return indexes -def subset_tensor(interaction_tensor, subset_dict, remove_duplicates=True, original_order=False): +def subset_tensor(interaction_tensor, subset_dict, remove_duplicates=True, keep='first', original_order=False): '''Subsets an InteractionTensor to contain only specific elements in respective dimensions. @@ -75,8 +106,16 @@ def subset_tensor(interaction_tensor, subset_dict, remove_duplicates=True, origi remove_duplicates : boolean, default=True Whether removing duplicated names in `elements`. + keep : str, default='first' + Determines which duplicates (if any) to keep. + Options are: + + - first : Drop duplicates except for the first occurrence. + - last : Drop duplicates except for the last occurrence. + - False : Drop all duplicates. + original_order : boolean, default=False - Wheter keeping the original order of the elements in + Whether keeping the original order of the elements in interaction_tensor.order_names or keeping the new order as indicated in the lists in the `subset_dict`. @@ -110,6 +149,7 @@ def subset_tensor(interaction_tensor, subset_dict, remove_duplicates=True, origi elements=v, axis=k, remove_duplicates=remove_duplicates, + keep=keep, original_order=original_order ) if len(idx) == 0: diff --git a/cell2cell/tensor/tensor.py b/cell2cell/tensor/tensor.py index b010d54..faab85a 100644 --- a/cell2cell/tensor/tensor.py +++ b/cell2cell/tensor/tensor.py @@ -129,8 +129,13 @@ def __init__(self): self.loc_nans = None self.loc_zeros = None - def compute_tensor_factorization(self, rank, tf_type='non_negative_cp', init='svd', random_state=None, verbose=False, - runs=1, normalize_loadings=True, var_ordered_factors=True, **kwargs): + def copy(self): + import copy + return copy.deepcopy(self) + + def compute_tensor_factorization(self, rank, tf_type='non_negative_cp', init='svd', svd='numpy_svd', random_state=None, + runs=1, normalize_loadings=True, var_ordered_factors=True, n_iter_max=100, tol=10e-7, + verbose=False, **kwargs): '''Performs a Tensor Factorization. There are no returns, instead the attributes factors and rank of the Tensor class are updated. @@ -150,12 +155,12 @@ def compute_tensor_factorization(self, rank, tf_type='non_negative_cp', init='sv Initialization method for computing the Tensor Factorization. {‘svd’, ‘random’} + svd : str, default='numpy_svd' + Function to use to compute the SVD, acceptable values in tensorly.SVD_FUNS + random_state : int, default=None Seed for randomization. - verbose : boolean, default=False - Whether printing or not steps of the analysis. - runs : int, default=1 Number of models to choose among and find the lowest error. This helps to avoid local minima when using runs > 1. @@ -169,6 +174,20 @@ def compute_tensor_factorization(self, rank, tf_type='non_negative_cp', init='sv highest to lowest variance. `normalize_loadings` must be True. Otherwise, this parameter is ignored. + tol : float, default=10e-7 + Tolerance for the decomposition algorithm to stop when the variation in + the reconstruction error is less than the tolerance. Lower `tol` helps + to improve the solution obtained from the decomposition, but it takes + longer to run. + + n_iter_max : int, default=100 + Maximum number of iteration to reach an optimal solution with the + decomposition algorithm. Higher `n_iter_max`helps to improve the solution + obtained from the decomposition, but it takes longer to run. + + verbose : boolean, default=False + Whether printing or not steps of the analysis. + **kwargs : dict Extra arguments for the tensor factorization according to inputs in tensorly. ''' @@ -190,8 +209,11 @@ def compute_tensor_factorization(self, rank, tf_type='non_negative_cp', init='sv rank=rank, tf_type=tf_type, init=init, + svd=svd, random_state=random_state_, mask=self.mask, + n_iter_max=n_iter_max, + tol=tol, verbose=verbose, **kwargs) # This helps to obtain proper error when the mask is not None. @@ -248,9 +270,9 @@ def compute_tensor_factorization(self, rank, tf_type='non_negative_cp', init='sv [pd.DataFrame(tl.to_numpy(f), index=idx, columns=factor_names) for f, idx in zip(factors, self.order_names)])) self.rank = rank - def elbow_rank_selection(self, upper_rank=50, runs=20, tf_type='non_negative_cp', init='random', random_state=None, - automatic_elbow=True, manual_elbow=None, mask=None, ci='std', figsize=(4, 2.25), fontsize=14, filename=None, - verbose=False, **kwargs): + def elbow_rank_selection(self, upper_rank=50, runs=20, tf_type='non_negative_cp', init='random', svd='numpy_svd', + random_state=None, n_iter_max=100, tol=10e-7, automatic_elbow=True, manual_elbow=None, + mask=None, ci='std', figsize=(4, 2.25), fontsize=14, filename=None, verbose=False, **kwargs): '''Elbow analysis on the error achieved by the Tensor Factorization for selecting the number of factors to use. A plot is made with the results. @@ -265,15 +287,40 @@ def elbow_rank_selection(self, upper_rank=50, runs=20, tf_type='non_negative_cp' tf_type : str, default='non_negative_cp' Type of Tensor Factorization. - - 'non_negative_cp' : Non-negative PARAFAC, as implemented in Tensorly + + - 'non_negative_cp' : Non-negative PARAFAC through the traditional ALS. + - 'non_negative_cp_hals' : Non-negative PARAFAC through the Hierarchical ALS. + It reaches an optimal solution faster than the + traditional ALS, but it does not allow a mask. + - 'parafac' : PARAFAC through the traditional ALS. It allows negative loadings. + - 'constrained_parafac' : PARAFAC through the traditional ALS. It allows + negative loadings. Also, it incorporates L1 and L2 + regularization, includes a 'non_negative' option, and + allows constraining the sparsity of the decomposition. + For more information, see + http://tensorly.org/stable/modules/generated/tensorly.decomposition.constrained_parafac.html#tensorly.decomposition.constrained_parafac init : str, default='svd' Initialization method for computing the Tensor Factorization. {‘svd’, ‘random’} + svd : str, default='numpy_svd' + Function to use to compute the SVD, acceptable values in tensorly.SVD_FUNS + random_state : int, default=None Seed for randomization. + tol : float, default=10e-7 + Tolerance for the decomposition algorithm to stop when the variation in + the reconstruction error is less than the tolerance. Lower `tol` helps + to improve the solution obtained from the decomposition, but it takes + longer to run. + + n_iter_max : int, default=100 + Maximum number of iteration to reach an optimal solution with the + decomposition algorithm. Higher `n_iter_max`helps to improve the solution + obtained from the decomposition, but it takes longer to run. + automatic_elbow : boolean, default=True Whether using an automatic strategy to find the elbow. If True, the method implemented by the package kneed is used. @@ -331,8 +378,11 @@ def elbow_rank_selection(self, upper_rank=50, runs=20, tf_type='non_negative_cp' upper_rank=upper_rank, tf_type=tf_type, init=init, + svd=svd, random_state=random_state, mask=mask, + n_iter_max=n_iter_max, + tol=tol, verbose=verbose, **kwargs ) @@ -351,8 +401,11 @@ def elbow_rank_selection(self, upper_rank=50, runs=20, tf_type='non_negative_cp' runs=runs, tf_type=tf_type, init=init, + svd=svd, random_state=random_state, mask=mask, + n_iter_max=n_iter_max, + tol=tol, verbose=verbose, **kwargs ) @@ -533,6 +586,7 @@ class InteractionTensor(BaseTensor): - 'min' : Minimum expression value among all genes. - 'mean' : Average expression value among all genes. + - 'gmean' : Geometric mean expression value among all genes. upper_letter_comparison : boolean, default=True Whether making uppercase the gene names in the expression matrices and the @@ -1120,6 +1174,7 @@ def interactions_to_tensor(interactions, experiment='single_cell', context_names ppis = [] rnaseq_matrices = [] complex_sep = interactions[0].complex_sep + complex_agg_method = interactions[0].complex_agg_method interaction_columns = interactions[0].interaction_columns for Int_ in interactions: if Int_.analysis_setup['cci_type'] == 'undirected': @@ -1142,6 +1197,7 @@ def interactions_to_tensor(interactions, experiment='single_cell', context_names context_names=context_names, how=how, complex_sep=complex_sep, + complex_agg_method=complex_agg_method, interaction_columns=interaction_columns, communication_score=communication_score, upper_letter_comparison=upper_letter_comparison, diff --git a/cell2cell/tensor/tensor_manipulation.py b/cell2cell/tensor/tensor_manipulation.py new file mode 100644 index 0000000..9b977e5 --- /dev/null +++ b/cell2cell/tensor/tensor_manipulation.py @@ -0,0 +1,98 @@ +# -*- coding: utf-8 -*- + +import tensorly as tl + +from cell2cell.tensor.tensor import PreBuiltTensor +from cell2cell.tensor.subset import subset_tensor + + +def concatenate_interaction_tensors(interaction_tensors, axis, order_labels, remove_duplicates=False, keep='first', + mask=None, device=None): + '''Concatenates interaction tensors in a given tensor dimension or axis. + + Parameters + ---------- + interaction_tensors : list + List of any tensor class in cell2cell.tensor. + + axis : int + The axis along which the arrays will be joined. If axis is None, arrays are flattened before use. + + order_labels : list + List of labels for dimensions or orders in the tensor. + + remove_duplicates : boolean, default=False + Whether removing duplicated names in the concatenated axis. + + keep : str, default='first' + Determines which duplicates (if any) to keep. + Options are: + + - first : Drop duplicates except for the first occurrence. + - last : Drop duplicates except for the last occurrence. + - False : Drop all duplicates. + + mask : ndarray list + Helps avoiding missing values during a tensor factorization. A mask should be + a boolean array of the same shape as the original tensor and should be 0 + where the values are missing and 1 everywhere else. This must be of equal shape + as the concatenated tensor. + + device : str, default=None + Device to use when backend is pytorch. Options are: + {'cpu', 'cuda', None} + + Returns + ------- + concatenated_tensor : cell2cell.tensor.PreBuiltTensor + Final tensor after concatenation. It is a PreBuiltTensor that works + any interaction tensor based on the class BaseTensor. + ''' + # Assert if all other dimensions contains the same elements: + shape = len(interaction_tensors[0].tensor.shape) + assert all(shape == len(tensor.tensor.shape) for tensor in interaction_tensors[1:]), "Tensors must have same number of dimensions" + + for i in range(shape): + if i != axis: + elements = interaction_tensors[0].order_names[i] + for tensor in interaction_tensors[1:]: + assert elements == tensor.order_names[i], "Tensors must have the same elements in the other axes." + + + # Concatenate tensors + concat_tensor = tl.concatenate([tensor.tensor for tensor in interaction_tensors], axis=axis) + if mask is not None: + assert mask.shape == concat_tensor.shape, "Mask must have the same shape of the concatenated tensor. Here: {}".format(concat_tensor.shape) + else: # Generate a new mask from all previous masks if all are not None + if all([tensor.mask is not None for tensor in interaction_tensors]): + mask = tl.concatenate([tensor.mask for tensor in interaction_tensors], axis=axis) + else: + mask = None + + # Concatenate names of elements for the given axis but keep the others as in one tensor + order_names = [] + for i in range(shape): + tmp_names = [] + if i == axis: + for tensor in interaction_tensors: + tmp_names += tensor.order_names[i] + else: + tmp_names = interaction_tensors[0].order_names[i] + order_names.append(tmp_names) + + # Generate final object + concatenated_tensor = PreBuiltTensor(tensor=concat_tensor, + order_names=order_names, + order_labels=order_labels, + mask=mask, # Change if you want to omit values in the decomposition + device=device + ) + + # Remove duplicates + if remove_duplicates: + concatenated_tensor = subset_tensor(interaction_tensor=concatenated_tensor, + subset_dict={axis: order_names[axis]}, + remove_duplicates=remove_duplicates, + keep=keep, + original_order=False) + return concatenated_tensor \ No newline at end of file diff --git a/docs/documentation.md b/docs/documentation.md index beb4f17..1605526 100644 --- a/docs/documentation.md +++ b/docs/documentation.md @@ -1,7 +1,7 @@ # Documentation for *cell2cell* This documentation is for our *cell2cell* suite, which includes the [regular cell2cell](https://www.biorxiv.org/content/10.1101/2020.11.22.392217v3) -and [Tensor-cell2cell](https://doi.org/10.1101/2021.09.20.461129) tools. The former is for inferring cell-cell interactions +and [Tensor-cell2cell](https://doi.org/10.1038/s41467-022-31369-2) tools. The former is for inferring cell-cell interactions and communication in one sample or context, while the latter is for deconvolving complex patterns of cell-cell communication across multiple samples or contexts simultaneously into interpretable factors representing patterns of communication. diff --git a/docs/index.md b/docs/index.md index 024deca..f63d9f6 100644 --- a/docs/index.md +++ b/docs/index.md @@ -5,6 +5,12 @@ [pb]: https://badge.fury.io/py/cell2cell.svg [pypi]: https://pypi.org/project/cell2cell/ +## Getting started +Please refer to the [cell2cell website](https://earmingol.github.io/cell2cell/), +which includes tutorials and documentation + + + ## Installation **First, [install Anaconda following this tutorial](https://docs.anaconda.com/anaconda/install/)** @@ -46,13 +52,19 @@ body of *C. elegans*** is [available here](https://github.com/LewisLabUCSD/Celeg - Jupyter notebooks for reproducing the results in the manuscript of Tensor-cell2cell [are available and can be run online in codeocean.com](https://doi.org/10.24433/CO.0051950.v2). + It specifically contains analyses on datasets of **COVID-19, Autism Spectrum Disorders (ASD) and the embryonic development + of *C. elegans***. These analyses evaluate changes in + cell-cell communication dependent on: + - [Different severities of COVID-19](https://files.codeocean.com/files/verified/bffc457e-caa6-4c39-b869-f52330804db0_v2.0/results.5afea95c-aec4-455d-b06e-b0c12ef10df1/06-BALF-Tensor-Factorization.html) + - [ASD condition of patients](https://files.codeocean.com/files/verified/bffc457e-caa6-4c39-b869-f52330804db0_v2.0/results.5afea95c-aec4-455d-b06e-b0c12ef10df1/11-Brain-ASD-Tensor-Factorization.html) + - [Multiple time points of the *C. elegans* development](https://files.codeocean.com/files/verified/bffc457e-caa6-4c39-b869-f52330804db0_v2.0/results.5afea95c-aec4-455d-b06e-b0c12ef10df1/08-Celegans-Tensor-Factorization.html) - **Detailed tutorials for running Tensor-cell2cell and downstream analyses:** - [Obtaining patterns of cell-cell communication with Tensor-cell2cell](https://earmingol.github.io/cell2cell/tutorials/ASD/01-Tensor-Factorization-ASD/) - [Downstream analysis 1: Factor-specific analyses](https://earmingol.github.io/cell2cell/tutorials/ASD/02-Factor-Specific-ASD/) - [Downstream analysis 2: Gene Set Enrichment Analysis](https://earmingol.github.io/cell2cell/tutorials/ASD/03-GSEA-ASD/) - **Do you have precomputed communication scores?** Re-use them as a prebuilt tensor as [exemplified here](https://github.com/earmingol/cell2cell/blob/master/examples/tensor_cell2cell/Loading-PreBuiltTensor.ipynb). This allows reusing previous tensors you built or even plugging in communication scores from other tools. -- **Run Tensor-cell2cell much faster!** An example to perform the analysis using a **NVIDIA GPU** is [available here](https://github.com/earmingol/cell2cell/blob/master/examples/tensor_cell2cell/GPU-Example.ipynb) +- **Run Tensor-cell2cell much faster!** An example to perform the analysis using a **Nvidia GPU** is [available here](https://github.com/earmingol/cell2cell/blob/master/examples/tensor_cell2cell/GPU-Example.ipynb) --- @@ -74,7 +86,7 @@ associated with Memory. This may happen when the tensor is big enough to make th *bioRxiv*, (2020). **DOI: 10.1101/2020.11.22.392217** -- **Tensor-cell2cell** should be cited using this pre-print in bioRxiv: +- **Tensor-cell2cell** should be cited using this research article: - Armingol E., Baghdassarian H., Martino C., Perez-Lopez A., Aamodt C., Knight R., Lewis N.E. - [Context-aware deconvolution of cell-cell communication with Tensor-cell2cell](https://doi.org/10.1101/2021.09.20.461129) - *bioRxiv*, (2021). **DOI: 10.1101/2021.09.20.461129** \ No newline at end of file + [Context-aware deconvolution of cell-cell communication with Tensor-cell2cell](https://doi.org/10.1038/s41467-022-31369-2) + *Nat. Commun.* **13**, 3665 (2022). **DOI: 10.1038/s41467-022-31369-2** \ No newline at end of file diff --git a/release/0.6.0-notes.md b/release/0.6.0-notes.md new file mode 100644 index 0000000..b849754 --- /dev/null +++ b/release/0.6.0-notes.md @@ -0,0 +1,33 @@ +# Release Notes - cell2cell v0.6.0 + +## New features +- Added 'gmean' as method to compute expression of protein complexes. +It involves function ```cell2cell.preprocessing.rnaseq.add_complexes_to_expression()``` +and all objects calling it. +- Added new parameters for improving robustness of tensor factorization. These are +```n_iter_max``` and ```tol```. Higher n_iter_max and lower tol retrieves better optimal +solutions, but at the expense of more running time. Available in: +```cell2cell.tensor.factorization._compute_tensor_factorization()``` +and in ```cell2cell.tensor.tensor.BaseTensor.compute_tensor_factorization()``` and all heir classes. +- Similar to the previous point, the parameter ```svd``` was added to these functions. This allows to control +the type of svd method to use when using ```init='svd'```. See documentation for more information. +- Added new methods/options for running a tensor decomposition in ```cell2cell.tensor.factorization._compute_tensor_factorization()``` +and in ```cell2cell.tensor.tensor.BaseTensor.compute_tensor_factorization()``` and all heir classes. + This can be controlled with the parameter ```tf_type```. See documentation for +more options. +- Added option to do a deep copy of any tensor of the class ```cell2cell.tensor.tensor.BaseTensor``` and its +heir classes. Available through ```BaseTensor.copy()```. +- Added new CCI score based on ICELLNET (```cell2cell.core.cci_scores```). Available in the functions +of the regular cell2cell tool (```cell2cell.core.interaction_space```, ```cell2cell.analysis.pipelines.BulkInteractions```, +and ```cell2cell.analysis.pipelines.SingleCellInteractions```) +- Added new function to handle duplicate elements ```cell2cell.preprocessing.find_elements.find_duplicates()``` +- Modified functions in ```cell2cell.tensor.subset``` to handle duplicate elements +- Added new function to concatenate InteractionTensors: ```cell2cell.tensor.tensor_manipulation.concatenate_interaction_tensors()``` + +## Feature updates +- Updated dependency version of tensorly to 0.7.0 + +## Fixed Bugs +- Fixed bug of return_errors in tensor decomposition using regular non_negative_parafac. + New version of tensorly returns decomposition and error as a tuple in other decomposition methods. +- Fixed bug of changing diagonal values of the input matrix to zeros when using ```cell2cell.plotting.cci_plot.clustermap_cci``` \ No newline at end of file diff --git a/setup.py b/setup.py index 37d7a4a..5c3a9f8 100644 --- a/setup.py +++ b/setup.py @@ -98,7 +98,7 @@ def run(self): 'tqdm', 'statsmodels', 'statannotations', - 'tensorly == 0.5.1', + 'tensorly == 0.7.0', 'kneed', 'scanpy' ],