Merge pull request #15 from earmingol/new_version

Updates for v0.6.1
earmingol · Oct 28, 2022 · 97f498b · 97f498b
2 parents 14c9e76 + 2656038
commit 97f498b
Show file tree

Hide file tree

Showing 8 changed files with 234 additions and 71 deletions.
diff --git a/cell2cell/__init__.py b/cell2cell/__init__.py
@@ -14,4 +14,4 @@
 from cell2cell import tensor
 from cell2cell import utils
 
-__version__ = "0.6.0"
+__version__ = "0.6.1"
diff --git a/cell2cell/preprocessing/__init__.py b/cell2cell/preprocessing/__init__.py
@@ -4,7 +4,7 @@
 
 from cell2cell.preprocessing.cutoffs import (get_constant_cutoff, get_cutoffs, get_global_percentile_cutoffs,
                                              get_local_percentile_cutoffs)
-from cell2cell.preprocessing.find_elements import (find_duplicates)
+from cell2cell.preprocessing.find_elements import (find_duplicates, get_element_abundances, get_elements_over_fraction)
 from cell2cell.preprocessing.gene_ontology import (find_all_children_of_go_term, find_go_terms_from_keyword,
                                                    get_genes_from_go_hierarchy, get_genes_from_go_terms)
 from cell2cell.preprocessing.integrate_data import (get_thresholded_rnaseq, get_modified_rnaseq, get_ppi_dict_from_go_terms,

diff --git a/cell2cell/preprocessing/find_elements.py b/cell2cell/preprocessing/find_elements.py
@@ -2,7 +2,8 @@
 
 from __future__ import absolute_import
 
-from collections import defaultdict
+import itertools
+from collections import defaultdict, Counter
 
 def find_duplicates(element_list):
     '''Function based on: https://stackoverflow.com/a/5419576/12032899
@@ -25,4 +26,51 @@ def find_duplicates(element_list):
 
     duplicate_dict = {key : locs for key,locs in tally.items()
                             if len(locs)>1}
-    return duplicate_dict
+    return duplicate_dict
+
+
+def get_element_abundances(element_lists):
+    '''Computes the fraction of occurrence of each element
+    in a list of lists.
+
+    Parameters
+    ----------
+    element_lists : list
+        List of lists of elements. Elements will be
+        counted only once in each of the lists.
+
+    Returns
+    -------
+    abundance_dict : dict
+        Dictionary containing the number of times that an
+        element was present, divided by the total number of
+        lists in `element_lists`.
+    '''
+    abundance_dict = Counter(itertools.chain(*map(set, element_lists)))
+    total = len(element_lists)
+    abundance_dict = {k : v/total for k, v in abundance_dict.items()}
+    return abundance_dict
+
+
+def get_elements_over_fraction(abundance_dict, fraction):
+    '''Obtains a list of the elements whose fraction
+    of occurrence is at least the given threshold.
+
+    Parameters
+    ----------
+    abundance_dict : dict
+        Dictionary containing the number of times that an
+        element was present, divided by the total number of
+        possible occurrences.
+
+    fraction : float
+        Threshold to filter the elements. Elements whose abundance
+        is at least this threshold will be included.
+
+    Returns
+    -------
+    elements : list
+        List of elements that met the fraction criteria.
+    '''
+    elements = [k for k, v in abundance_dict.items() if v >= fraction]
+    return elements
diff --git a/cell2cell/stats/permutation.py b/cell2cell/stats/permutation.py
@@ -49,12 +49,20 @@ def compute_pvalue_from_dist(obs_value, dist, consider_size=False, comparison='u
         P-value obtained from comparing the observed value and values in the
         distribution.
     '''
+    # Omit nan values
+    dist_ = [x for x in dist if ~np.isnan(x)]
+
+    # All values in dist are NaNs or obs_value is NaN
+    if (len(dist_) == 0) | np.isnan(obs_value):
+        return 1.0
+
+    # No NaN values
     if comparison == 'lower':
-        pval = scipy.stats.percentileofscore(dist, obs_value) / 100.0
+        pval = scipy.stats.percentileofscore(dist_, obs_value) / 100.0
     elif comparison == 'upper':
-        pval = 1.0 - scipy.stats.percentileofscore(dist, obs_value) / 100.0
+        pval = 1.0 - scipy.stats.percentileofscore(dist_, obs_value) / 100.0
     elif comparison == 'different':
-        percentile = scipy.stats.percentileofscore(dist, obs_value) / 100.0
+        percentile = scipy.stats.percentileofscore(dist_, obs_value) / 100.0
         if percentile <= 0.5:
             pval = 2.0 * percentile
         else:
@@ -63,7 +71,7 @@ def compute_pvalue_from_dist(obs_value, dist, consider_size=False, comparison='u
         raise NotImplementedError('Comparison {} is not implemented'.format(comparison))
 
     if (consider_size) & (pval == 0.):
-        pval = 1./(len(dist) + 1e-6)
+        pval = 1./(len(dist_) + 1e-6)
 
     return pval
 

diff --git a/cell2cell/tensor/external_scores.py b/cell2cell/tensor/external_scores.py
@@ -4,12 +4,13 @@
 import pandas as pd
 
 from collections import defaultdict
+from cell2cell.preprocessing.find_elements import get_element_abundances, get_elements_over_fraction
 from cell2cell.tensor.tensor import PreBuiltTensor
 
 
 def dataframes_to_tensor(context_df_dict, sender_col, receiver_col, ligand_col, receptor_col, score_col, how='inner',
-                         lr_fill=np.nan, cell_fill=np.nan, lr_sep='^', context_order=None, order_labels=None,
-                         sort_elements=True, device=None):
+                         outer_fraction=0.0, lr_fill=np.nan, cell_fill=np.nan, lr_sep='^', context_order=None,
+                         order_labels=None, sort_elements=True, device=None):
     '''Generates an InteractionTensor from a dictionary
     containing dataframes for all contexts.
 
@@ -55,6 +56,13 @@ def dataframes_to_tensor(context_df_dict, sender_col, receiver_col, ligand_col,
                           contexts (intersection), while all cell types that are
                           present across contexts (union).
 
+    outer_fraction : float, default=0.0
+        Threshold to filter the elements when `how` includes any outer option.
+        Elements whose fractional abundance across contexts (in `context_df_dict`)
+        is at least this threshold will be included. When this value is 0, all
+        elements across the contexts are considered. When this value is 1, it
+        acts as using `how='inner'`.
+
     lr_fill : float, default=numpy.nan
         Value to fill communication scores when a ligand-receptor pair is not
         present across all contexts.
@@ -123,41 +131,32 @@ def dataframes_to_tensor(context_df_dict, sender_col, receiver_col, ligand_col,
         receiver_dict[k].update(df[receiver_col].unique().tolist())
 
     # Subset LR pairs, sender and receiver cells given parameter 'how'
-    for i, k in enumerate(context_order):
-        if i == 0:
-            inter_lrs = set(lr_dict[k])
-            inter_senders = set(sender_dict[k])
-            inter_receivers = set(receiver_dict[k])
-
-            union_lrs = set(lr_dict[k])
-            union_senders = set(sender_dict[k])
-            union_receivers = set(receiver_dict[k])
-
-        else:
-            inter_lrs = inter_lrs.intersection(set(lr_dict[k]))
-            inter_senders = inter_senders.intersection(set(sender_dict[k]))
-            inter_receivers = inter_receivers.intersection(set(receiver_dict[k]))
-
-            union_lrs = union_lrs.union(set(lr_dict[k]))
-            union_senders = union_senders.union(set(sender_dict[k]))
-            union_receivers = union_receivers.union(set(receiver_dict[k]))
+    df_lrs = [list(lr_dict[k]) for k in context_order]
+    df_senders = [list(sender_dict[k]) for k in context_order]
+    df_receivers  = [list(receiver_dict[k]) for k in context_order]
 
     if how == 'inner':
-        lr_pairs = list(inter_lrs)
-        sender_cells = list(inter_senders)
-        receiver_cells = list(inter_receivers)
+        lr_pairs = list(set.intersection(*map(set, df_lrs)))
+        sender_cells = list(set.intersection(*map(set, df_senders)))
+        receiver_cells = list(set.intersection(*map(set, df_receivers)))
     elif how == 'outer':
-        lr_pairs = list(union_lrs)
-        sender_cells = list(union_senders)
-        receiver_cells = list(union_receivers)
+        lr_pairs = get_elements_over_fraction(abundance_dict=get_element_abundances(element_lists=df_lrs),
+                                              fraction=outer_fraction)
+        sender_cells = get_elements_over_fraction(abundance_dict=get_element_abundances(element_lists=df_senders),
+                                              fraction=outer_fraction)
+        receiver_cells = get_elements_over_fraction(abundance_dict=get_element_abundances(element_lists=df_receivers),
+                                              fraction=outer_fraction)
     elif how == 'outer_lrs':
-        lr_pairs = list(union_lrs)
-        sender_cells = list(inter_senders)
-        receiver_cells = list(inter_receivers)
+        lr_pairs = get_elements_over_fraction(abundance_dict=get_element_abundances(element_lists=df_lrs),
+                                              fraction=outer_fraction)
+        sender_cells = list(set.intersection(*map(set, df_senders)))
+        receiver_cells = list(set.intersection(*map(set, df_receivers)))
     elif how == 'outer_cells':
-        lr_pairs = list(inter_lrs)
-        sender_cells = list(union_senders)
-        receiver_cells = list(union_receivers)
+        lr_pairs = list(set.intersection(*map(set, df_lrs)))
+        sender_cells = get_elements_over_fraction(abundance_dict=get_element_abundances(element_lists=df_senders),
+                                                  fraction=outer_fraction)
+        receiver_cells = get_elements_over_fraction(abundance_dict=get_element_abundances(element_lists=df_receivers),
+                                                    fraction=outer_fraction)
     else:
         raise ValueError("Not a valid input for parameter 'how'")