Merge pull request #21 from earmingol/new_version

Updates for v0.6.4
earmingol · Jan 21, 2023 · b0c1851 · b0c1851
2 parents efee215 + 254bc19
commit b0c1851
Show file tree

Hide file tree

Showing 14 changed files with 279 additions and 19 deletions.
diff --git a/README.md b/README.md
@@ -64,7 +64,8 @@ body of *C. elegans*** is [available here](https://github.com/LewisLabUCSD/Celeg
     - [Downstream analysis 2: Gene Set Enrichment Analysis](https://earmingol.github.io/cell2cell/tutorials/ASD/03-GSEA-ASD/)
 - **Do you have precomputed communication scores?** Re-use them as a prebuilt tensor as [exemplified here](https://github.com/earmingol/cell2cell/blob/master/examples/tensor_cell2cell/Loading-PreBuiltTensor.ipynb).
   This allows reusing previous tensors you built or even plugging in communication scores from other tools.
-- **Run Tensor-cell2cell much faster!** An example to perform the analysis using a **Nvidia GPU** is [available here](https://github.com/earmingol/cell2cell/blob/master/examples/tensor_cell2cell/GPU-Example.ipynb)
+- **Run Tensor-cell2cell MUCH FASTER and ON THE CLOUD!** An example of how to perform the analysis on
+ **Google Colab while using an NVIDIA GPU** is [available here](https://colab.research.google.com/drive/1xE6Pm1u-XoSWV8a3oYpixUFj64FIDtl0?usp=sharing).
 
 
 ---

diff --git a/cell2cell/__init__.py b/cell2cell/__init__.py
@@ -14,4 +14,4 @@
 from cell2cell import tensor
 from cell2cell import utils
 
-__version__ = "0.6.3"
+__version__ = "0.6.4"
diff --git a/cell2cell/analysis/tensor_pipelines.py b/cell2cell/analysis/tensor_pipelines.py
@@ -8,10 +8,10 @@
 
 
 def run_tensor_cell2cell_pipeline(interaction_tensor, tensor_metadata, copy_tensor=False, rank=None,
-                                  tf_optimization='regular', random_state=None, device=None, elbow_metric='error',
-                                  smooth_elbow=False, upper_rank=25, tf_init='random', tf_svd='numpy_svd', cmaps=None,
-                                  sample_col='Element', group_col='Category', fig_fontsize=14, output_folder=None,
-                                  output_fig=True, fig_format='pdf'):
+                                  tf_optimization='regular', random_state=None, backend=None, device=None,
+                                  elbow_metric='error', smooth_elbow=False, upper_rank=25, tf_init='random',
+                                  tf_svd='numpy_svd', cmaps=None, sample_col='Element', group_col='Category',
+                                  fig_fontsize=14, output_folder=None, output_fig=True, fig_format='pdf'):
     '''
     Runs basic pipeline of Tensor-cell2cell (excluding downstream analyses).
 
@@ -49,8 +49,14 @@ def run_tensor_cell2cell_pipeline(interaction_tensor, tensor_metadata, copy_tens
     random_state : boolean, default=None
         Seed for randomization.
 
+    backend : str, default=None
+        Backend that TensorLy will use to perform calculations
+        on this tensor. When None, the currently active backend
+        is used (usually 'numpy'). Options are:
+        {'cupy', 'jax', 'mxnet', 'numpy', 'pytorch', 'tensorflow'}
+
     device : str, default=None
-        Device to use when backend is pytorch. Options are:
+        Device to use when backend allows multiple devices. Options are:
          {'cpu', 'cuda:0', None}
 
     elbow_metric : str, default='error'
@@ -149,6 +155,9 @@ def run_tensor_cell2cell_pipeline(interaction_tensor, tensor_metadata, copy_tens
     else:
         raise ValueError("`factorization_type` must be either 'robust' or 'regular'.")
 
+    if backend is not None:
+        tl.set_backend(backend)
+
     if device is not None:
         try:
             interaction_tensor.tensor = tl.tensor(interaction_tensor.tensor, device=device)

diff --git a/cell2cell/datasets/__init__.py b/cell2cell/datasets/__init__.py
@@ -2,6 +2,7 @@
 
 from __future__ import absolute_import
 
+from cell2cell.datasets.anndata import (balf_covid)
 from cell2cell.datasets.heuristic_data import (HeuristicGOTerms)
 from cell2cell.datasets.random_data import (generate_random_rnaseq, generate_random_ppi, generate_random_cci_scores,
                                             generate_random_metadata)

diff --git a/cell2cell/datasets/anndata.py b/cell2cell/datasets/anndata.py
@@ -0,0 +1,31 @@
+from scanpy.readwrite import read
+
+
+def balf_covid(filename='BALF-COVID19-Liao_et_al-NatMed-2020.h5ad'):
+    """Loads the BALF COVID-19 single-cell dataset.
+
+    The data consist of 63k immune and epithelial cells from the lungs of
+    3 control, 3 moderate COVID-19, and 6 severe COVID-19 patients.
+
+    This dataset was previously published in [1]; this object contains
+    the raw counts for the annotated cell types available at:
+    https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE145926
+
+    References
+    ----------
+    [1] Liao, M., Liu, Y., Yuan, J. et al.
+        Single-cell landscape of bronchoalveolar immune cells in patients
+        with COVID-19. Nat Med 26, 842–844 (2020).
+        https://doi.org/10.1038/s41591-020-0901-9
+
+    Parameters
+    ----------
+    filename : str, default='BALF-COVID19-Liao_et_al-NatMed-2020.h5ad'
+        Path to the h5ad file in case it was manually downloaded. It is
+        handed to `scanpy.readwrite.read` together with a Zenodo
+        `backup_url` (presumably used to fetch the file when it is not
+        found locally; see the scanpy documentation).
+
+    Returns
+    -------
+    adata
+        Annotated data matrix, as returned by `scanpy.readwrite.read`.
+    """
+    # The reader receives the Zenodo record as its backup location.
+    return read(
+        filename,
+        backup_url='https://zenodo.org/record/7535867/files/BALF-COVID19-Liao_et_al-NatMed-2020.h5ad',
+    )
diff --git a/cell2cell/io/__init__.py b/cell2cell/io/__init__.py
@@ -2,6 +2,8 @@
 
 from __future__ import absolute_import
 
+from cell2cell.io.directories import (create_directory, get_files_from_directory)
 from cell2cell.io.read_data import (load_cutoffs, load_go_annotations, load_go_terms, load_metadata, load_ppi,
-                                    load_rnaseq, load_table, load_variable_with_pickle, load_tensor_factors)
+                                    load_rnaseq, load_table, load_tables_from_directory, load_variable_with_pickle,
+                                    load_tensor, load_tensor_factors)
 from cell2cell.io.save_data import (export_variable_with_pickle)
diff --git a/cell2cell/io/directories.py b/cell2cell/io/directories.py
@@ -0,0 +1,46 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import absolute_import
+
+import os
+
+
+def create_directory(pathname):
+    '''Makes sure a folder exists, creating it when needed.
+
+    When `pathname` is not an existing directory, it is created with
+    `os.makedirs`, which also creates every missing intermediate
+    folder. A short message reporting the outcome is printed in both
+    cases.
+
+    Parameters
+    ----------
+    pathname : str
+        Full path of the folder to create.
+    '''
+    # Guard clause: nothing to create when the folder is already there.
+    if os.path.isdir(pathname):
+        print("{} already exists.".format(pathname))
+        return
+
+    os.makedirs(pathname)
+    print("{} was created successfully.".format(pathname))
+
+
+def get_files_from_directory(pathname, dir_in_filepath=False):
+    '''Obtains a list with the names of the entries in a folder.
+
+    Parameters
+    ----------
+    pathname : str
+        Full path of the folder to explore.
+
+    dir_in_filepath : boolean, default=False
+        Whether adding `pathname` to the filenames. The pieces are
+        joined with `os.path.join`, so a trailing separator in
+        `pathname` does not produce a doubled separator.
+
+    Returns
+    -------
+    filenames : list
+        A list containing the names (strings) of the entries in the
+        folder, in the order given by `os.listdir`. No filtering is
+        applied, so subfolders are listed as well.
+    '''
+    # Normalize the path to str so that os.listdir returns str names.
+    directory = os.fsdecode(pathname)
+    names = os.listdir(directory)
+    if dir_in_filepath:
+        return [os.path.join(directory, name) for name in names]
+    return names
diff --git a/cell2cell/io/read_data.py b/cell2cell/io/read_data.py
@@ -7,6 +7,7 @@
 import pandas as pd
 import numpy as np
 from cell2cell.preprocessing import rnaseq, ppi
+from cell2cell.io.directories import get_files_from_directory
 
 
 def load_table(filename, format='auto', sep='\t', sheet_name=0, compression=None, verbose=True, **kwargs):
@@ -96,6 +97,84 @@ def load_table(filename, format='auto', sep='\t', sheet_name=0, compression=None
     return table
 
 
+def load_tables_from_directory(pathname, extension, sep='\t', sheet_name=0, compression=None, verbose=True, **kwargs):
+    '''Opens all tables with the same extension in a folder.
+
+    Every entry of the folder whose name ends with a suffix matching
+    `extension` (followed by the suffix of `compression`, if any) is
+    loaded with `load_table`.
+
+    Parameters
+    ----------
+    pathname : str
+        Full path of the folder to explore.
+
+    extension : str
+        Extension of the file.
+        Options are:
+
+        - 'excel' : An excel file, either .xls or .xlsx
+        - 'csv' : Comma separated value format
+        - 'tsv' : Tab separated value format
+        - 'txt' : Text file
+
+    sep : str, default='\t'
+        Separation between columns. Examples are: '\t', ' ', ';', ',', etc.
+
+    sheet_name : str, int, list, or None, default=0
+        Strings are used for sheet names. Integers are used in zero-indexed
+        sheet positions. Lists of strings/integers are used to request
+        multiple sheets. Specify None to get all sheets.
+        Available cases:
+
+        - Defaults to 0: 1st sheet as a DataFrame
+        - 1: 2nd sheet as a DataFrame
+        - "Sheet1": Load sheet with name "Sheet1"
+        - [0, 1, "Sheet5"]: Load first, second and sheet named
+            "Sheet5" as a dict of DataFrame
+        - None: All sheets.
+
+    compression : str, or None, default=None
+        Compression of the files to load. It selects which files are
+        picked up and is passed on to `load_table`. Files are matched by
+        suffix: '.gz' or '.gzip' for 'gzip', '.bz2' for 'bz2', '.zip' for
+        'zip' and '.xz' for 'xz'. When None, only files ending exactly
+        with the extension are loaded.
+        Options: {'gzip', 'bz2', 'zip', 'xz', None}
+
+    verbose : boolean, default=True
+        Whether printing or not steps of the analysis.
+
+    **kwargs : dict
+        Extra arguments for loading files with the respective pandas function
+        given the format of the file.
+
+    Returns
+    -------
+    data : dict
+        Dictionary containing the tables (pandas.DataFrame) loaded from the files.
+        Keys are the filenames without the extension (and compression
+        suffix) and values are the dataframes.
+    '''
+    # Suffixes accepted for each `extension` option. The literal '.excel'
+    # is kept so that folders matched by the previous behavior still work.
+    extension_suffixes = {'excel': ('.xls', '.xlsx', '.excel'),
+                          'csv': ('.csv',),
+                          'tsv': ('.tsv',),
+                          'txt': ('.txt',)}
+    # '.gz' is the conventional gzip suffix; '.gzip' is kept for
+    # backward compatibility.
+    compression_suffixes = {'gzip': ('.gz', '.gzip'),
+                            'bz2': ('.bz2',),
+                            'zip': ('.zip',),
+                            'xz': ('.xz',)}
+
+    assert extension in extension_suffixes, "Enter a valid `extension`."
+    if compression is None:
+        comp_suffixes = ('',)
+    else:
+        assert compression in compression_suffixes, "Enter a valid `compression`."
+        comp_suffixes = compression_suffixes[compression]
+    suffixes = tuple(ext + comp
+                     for ext in extension_suffixes[extension]
+                     for comp in comp_suffixes)
+
+    filenames = get_files_from_directory(pathname=pathname,
+                                         dir_in_filepath=True)
+
+    data = dict()
+    for filename in filenames:
+        matched = next((suffix for suffix in suffixes if filename.endswith(suffix)), None)
+        if matched is None:
+            continue
+        if verbose:
+            print('Loading {}'.format(filename))
+        # Strip only the matched trailing suffix, so names that contain the
+        # extension somewhere in the middle are not truncated early.
+        sample = os.path.basename(filename)[:-len(matched)]
+        data[sample] = load_table(filename=filename,
+                                  format=extension,
+                                  sep=sep,
+                                  sheet_name=sheet_name,
+                                  compression=compression,
+                                  verbose=verbose, **kwargs)
+    return data
+
+
 def load_rnaseq(rnaseq_file, gene_column, drop_nangenes=True, log_transformation=False, verbose=True, **kwargs):
     '''
     Loads a gene expression matrix for a RNA-seq experiment. Preprocessing
@@ -414,6 +493,62 @@ def load_variable_with_pickle(filename):
     return variable
 
 
+def load_tensor(filename, backend=None, device=None):
+    '''Imports a communication tensor that could be used
+    with Tensor-cell2cell.
+
+    The object is loaded with `load_variable_with_pickle` and its
+    `tensor`, `loc_nans`, `loc_zeros` and (when not None) `mask`
+    attributes are converted with `tensorly.tensor` under the active
+    backend.
+
+    Parameters
+    ----------
+    filename : str
+        Absolute path to a file storing a communication tensor
+        that was previously saved by using pickle. Only load files
+        from trusted sources, since unpickling can run arbitrary code.
+
+    backend : str, default=None
+        Backend that TensorLy will use to perform calculations
+        on this tensor. When None, the currently active backend
+        is used (usually 'numpy'). Options are:
+        {'cupy', 'jax', 'mxnet', 'numpy', 'pytorch', 'tensorflow'}
+
+    device : str, default=None
+        Device to use when backend allows using multiple devices. It is
+        only used when the active backend is 'pytorch' or 'tensorflow';
+        otherwise it is ignored. Options are:
+         {'cpu', 'cuda:0', None}
+
+    Returns
+    -------
+    interaction_tensor : cell2cell.tensor.BaseTensor
+        A communication tensor generated with any of the tensor class in
+        cell2cell.tensor.
+    '''
+    interaction_tensor = load_variable_with_pickle(filename)
+
+    # Imported unconditionally: an import statement inside a function makes
+    # `tl` a local name, so skipping it whenever a global `tl` exists would
+    # leave the local unbound and raise UnboundLocalError on first use.
+    import tensorly as tl
+
+    if backend is not None:
+        tl.set_backend(backend)
+
+    # `device` is forwarded only to backends known to accept it.
+    # Potential TODO: Include other backends that support different devices
+    if device is not None and tl.get_backend() in ['pytorch', 'tensorflow']:
+        tensor_kwargs = {'device': device}
+    else:
+        tensor_kwargs = {}
+
+    for attribute in ('tensor', 'loc_nans', 'loc_zeros'):
+        converted = tl.tensor(getattr(interaction_tensor, attribute), **tensor_kwargs)
+        setattr(interaction_tensor, attribute, converted)
+    if interaction_tensor.mask is not None:
+        interaction_tensor.mask = tl.tensor(interaction_tensor.mask, **tensor_kwargs)
+    return interaction_tensor
+
+
 def load_tensor_factors(filename):
     '''Imports factors previously exported from a tensor
     decomposition done in a cell2cell.tensor.BaseTensor-like object.

diff --git a/cell2cell/tensor/external_scores.py b/cell2cell/tensor/external_scores.py
@@ -4,13 +4,14 @@
 import pandas as pd
 
 from collections import defaultdict
+from tqdm.auto import tqdm
 from cell2cell.preprocessing.find_elements import get_element_abundances, get_elements_over_fraction
 from cell2cell.tensor.tensor import PreBuiltTensor
 
 
 def dataframes_to_tensor(context_df_dict, sender_col, receiver_col, ligand_col, receptor_col, score_col, how='inner',
-                         outer_fraction=0.0, lr_fill=np.nan, cell_fill=np.nan, lr_sep='^', context_order=None,
-                         order_labels=None, sort_elements=True, device=None):
+                         outer_fraction=0.0, lr_fill=np.nan, cell_fill=np.nan, lr_sep='^', dup_aggregation='max',
+                         context_order=None, order_labels=None, sort_elements=True, device=None):
     '''Generates an InteractionTensor from a dictionary
     containing dataframes for all contexts.
 
@@ -74,6 +75,15 @@ def dataframes_to_tensor(context_df_dict, sender_col, receiver_col, ligand_col,
     lr_sep : str, default='^'
         Separation character to join ligands and receptors into a LR pair name.
 
+    dup_aggregation : str, default='max'
+        Approach to aggregate communication score if there are multiple instances
+        of an LR pair for a specific sender-receiver pair in one of the dataframes.
+
+        - 'max' : Maximum of the multiple instances
+        - 'min' : Minimum of the multiple instances
+        - 'mean' : Average of the multiple instances
+        - 'median' : Median of the multiple instances
+
     context_order : list, default=None
         List used to sort the contexts when building the tensor. Elements must
         be all elements in context_df_dict.keys().
@@ -169,7 +179,7 @@ def dataframes_to_tensor(context_df_dict, sender_col, receiver_col, ligand_col,
 
     # Build temporal tensor to pass to PreBuiltTensor
     tmp_tensor = []
-    for k in context_order:
+    for k in tqdm(context_order):
         v = cont_dict[k]
         # 3D tensor for the context
         tmp_3d_tensor = []
@@ -178,6 +188,9 @@ def dataframes_to_tensor(context_df_dict, sender_col, receiver_col, ligand_col,
             if df.shape[0] == 0:  # TODO: Check behavior when df is empty
                 df = pd.DataFrame(lr_fill, index=sender_cells, columns=receiver_cells)
             else:
+                if df[cols[:-1]].duplicated().any():
+                    assert dup_aggregation in ['max', 'min', 'mean', 'median'], "Please use a valid option for `dup_aggregation`."
+                    df = getattr(df.groupby(cols[:-1]), dup_aggregation)().reset_index()
                 df = df.pivot(index=sender_col, columns=receiver_col, values=score_col)
                 df = df.reindex(sender_cells, fill_value=cell_fill).reindex(receiver_cells, fill_value=cell_fill, axis='columns')