Skip to content

Commit

Permalink
Merge pull request #21 from earmingol/new_version
Browse files Browse the repository at this point in the history
Updates for v0.6.4
  • Loading branch information
earmingol committed Jan 21, 2023
2 parents efee215 + 254bc19 commit b0c1851
Show file tree
Hide file tree
Showing 14 changed files with 279 additions and 19 deletions.
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,8 @@ body of *C. elegans*** is [available here](https://github.com/LewisLabUCSD/Celeg
- [Downstream analysis 2: Gene Set Enrichment Analysis](https://earmingol.github.io/cell2cell/tutorials/ASD/03-GSEA-ASD/)
- **Do you have precomputed communication scores?** Re-use them as a prebuilt tensor as [exemplified here](https://github.com/earmingol/cell2cell/blob/master/examples/tensor_cell2cell/Loading-PreBuiltTensor.ipynb).
This allows reusing previous tensors you built or even plugging in communication scores from other tools.
- **Run Tensor-cell2cell much faster!** An example to perform the analysis using a **Nvidia GPU** is [available here](https://github.com/earmingol/cell2cell/blob/master/examples/tensor_cell2cell/GPU-Example.ipynb)
- **Run Tensor-cell2cell MUCH FASTER and ON THE CLOUD!** An example to perform the analysis on
**Google Colab while using an NVIDIA GPU** is [available here](https://colab.research.google.com/drive/1xE6Pm1u-XoSWV8a3oYpixUFj64FIDtl0?usp=sharing).


---
Expand Down
2 changes: 1 addition & 1 deletion cell2cell/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,4 @@
from cell2cell import tensor
from cell2cell import utils

__version__ = "0.6.3"
__version__ = "0.6.4"
19 changes: 14 additions & 5 deletions cell2cell/analysis/tensor_pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@


def run_tensor_cell2cell_pipeline(interaction_tensor, tensor_metadata, copy_tensor=False, rank=None,
tf_optimization='regular', random_state=None, device=None, elbow_metric='error',
smooth_elbow=False, upper_rank=25, tf_init='random', tf_svd='numpy_svd', cmaps=None,
sample_col='Element', group_col='Category', fig_fontsize=14, output_folder=None,
output_fig=True, fig_format='pdf'):
tf_optimization='regular', random_state=None, backend=None, device=None,
elbow_metric='error', smooth_elbow=False, upper_rank=25, tf_init='random',
tf_svd='numpy_svd', cmaps=None, sample_col='Element', group_col='Category',
fig_fontsize=14, output_folder=None, output_fig=True, fig_format='pdf'):
'''
Runs basic pipeline of Tensor-cell2cell (excluding downstream analyses).
Expand Down Expand Up @@ -49,8 +49,14 @@ def run_tensor_cell2cell_pipeline(interaction_tensor, tensor_metadata, copy_tens
random_state : boolean, default=None
Seed for randomization.
backend : str, default=None
Backend that TensorLy will use to perform calculations
on this tensor. When None, the currently active backend is used
(usually 'numpy'). Options are:
{'cupy', 'jax', 'mxnet', 'numpy', 'pytorch', 'tensorflow'}
device : str, default=None
Device to use when backend is pytorch. Options are:
Device to use when backend allows multiple devices. Options are:
{'cpu', 'cuda:0', None}
elbow_metric : str, default='error'
Expand Down Expand Up @@ -149,6 +155,9 @@ def run_tensor_cell2cell_pipeline(interaction_tensor, tensor_metadata, copy_tens
else:
raise ValueError("`factorization_type` must be either 'robust' or 'regular'.")

if backend is not None:
tl.set_backend(backend)

if device is not None:
try:
interaction_tensor.tensor = tl.tensor(interaction_tensor.tensor, device=device)
Expand Down
1 change: 1 addition & 0 deletions cell2cell/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from __future__ import absolute_import

from cell2cell.datasets.anndata import (balf_covid)
from cell2cell.datasets.heuristic_data import (HeuristicGOTerms)
from cell2cell.datasets.random_data import (generate_random_rnaseq, generate_random_ppi, generate_random_cci_scores,
generate_random_metadata)
Expand Down
31 changes: 31 additions & 0 deletions cell2cell/datasets/anndata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from scanpy.readwrite import read


def balf_covid(filename='BALF-COVID19-Liao_et_al-NatMed-2020.h5ad'):
    """Load the BALF COVID-19 dataset as an annotated data matrix.

    The dataset comprises 63k immune and epithelial cells from the lungs
    of 3 control, 3 moderate COVID-19, and 6 severe COVID-19 patients.
    It was originally published in [1]; this object contains the raw
    counts for the annotated cell types available in:
    https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE145926

    The file is opened with scanpy's ``read``; the Zenodo location of
    the dataset is handed to it as ``backup_url``.

    References:
    [1] Liao, M., Liu, Y., Yuan, J. et al.
        Single-cell landscape of bronchoalveolar immune cells in patients
        with COVID-19. Nat Med 26, 842–844 (2020).
        https://doi.org/10.1038/s41591-020-0901-9

    Parameters
    ----------
    filename : str, default='BALF-COVID19-Liao_et_al-NatMed-2020.h5ad'
        Path to the h5ad file in case it was manually downloaded.

    Returns
    -------
    Annotated data matrix.
    """
    zenodo_url = 'https://zenodo.org/record/7535867/files/BALF-COVID19-Liao_et_al-NatMed-2020.h5ad'
    return read(filename, backup_url=zenodo_url)
4 changes: 3 additions & 1 deletion cell2cell/io/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

from __future__ import absolute_import

from cell2cell.io.directories import (create_directory, get_files_from_directory)
from cell2cell.io.read_data import (load_cutoffs, load_go_annotations, load_go_terms, load_metadata, load_ppi,
load_rnaseq, load_table, load_variable_with_pickle, load_tensor_factors)
load_rnaseq, load_table, load_tables_from_directory, load_variable_with_pickle,
load_tensor, load_tensor_factors)
from cell2cell.io.save_data import (export_variable_with_pickle)
46 changes: 46 additions & 0 deletions cell2cell/io/directories.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# -*- coding: utf-8 -*-

from __future__ import absolute_import

import os


def create_directory(pathname):
    '''Creates a directory.

    Uses a path to create a directory. It creates
    all intermediate folders before creating the
    leaf folder. A message reporting whether the folder
    was created or already existed is printed.

    Parameters
    ----------
    pathname : str
        Full path of the folder to create.

    Raises
    ------
    FileExistsError
        If `pathname` already exists but is not a directory.
    '''
    # Attempt the creation directly instead of checking first: a separate
    # isdir() check followed by makedirs() can fail if the folder is created
    # by someone else in between the two calls.
    try:
        os.makedirs(pathname)
    except FileExistsError:
        # Only an existing *directory* counts as "already exists"; an
        # existing regular file at this path is still an error.
        if not os.path.isdir(pathname):
            raise
        print("{} already exists.".format(pathname))
    else:
        print("{} was created successfully.".format(pathname))


def get_files_from_directory(pathname, dir_in_filepath=False):
    '''Lists the names of the entries found in a folder.

    Parameters
    ----------
    pathname : str
        Full path of the folder to explore.

    dir_in_filepath : boolean, default=False
        Whether to prepend `pathname` (followed by '/') to
        each returned name.

    Returns
    -------
    filenames : list
        A list containing the names (strings) of the entries
        returned by os.listdir for the folder.
    '''
    encoded_dir = os.fsencode(pathname)
    filenames = []
    for entry in os.listdir(encoded_dir):
        name = os.fsdecode(entry)
        if dir_in_filepath:
            name = pathname + '/' + name
        filenames.append(name)
    return filenames
135 changes: 135 additions & 0 deletions cell2cell/io/read_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import pandas as pd
import numpy as np
from cell2cell.preprocessing import rnaseq, ppi
from cell2cell.io.directories import get_files_from_directory


def load_table(filename, format='auto', sep='\t', sheet_name=0, compression=None, verbose=True, **kwargs):
Expand Down Expand Up @@ -96,6 +97,84 @@ def load_table(filename, format='auto', sep='\t', sheet_name=0, compression=None
return table


def load_tables_from_directory(pathname, extension, sep='\t', sheet_name=0, compression=None, verbose=True, **kwargs):
    r'''Opens all tables with the same extension in a folder.

    Parameters
    ----------
    pathname : str
        Full path of the folder to explore.

    extension : str
        Extension of the file.
        Options are:

        - 'excel' : An excel file, either .xls or .xlsx
        - 'csv' : Comma separated value format
        - 'tsv' : Tab separated value format
        - 'txt' : Text file

    sep : str, default='\t'
        Separation between columns. Examples are: '\t', ' ', ';', ',', etc.

    sheet_name : str, int, list, or None, default=0
        Strings are used for sheet names. Integers are used in zero-indexed
        sheet positions. Lists of strings/integers are used to request
        multiple sheets. Specify None to get all sheets.
        Available cases:

        - Defaults to 0: 1st sheet as a DataFrame
        - 1: 2nd sheet as a DataFrame
        - "Sheet1": Load sheet with name "Sheet1"
        - [0, 1, "Sheet5"]: Load first, second and sheet named
          "Sheet5" as a dict of DataFrame
        - None: All sheets.

    compression : str, or None, default=None
        Compression of the files to load. Only files whose name ends with
        the extension followed by the suffix of this compression are loaded
        (e.g. 'gzip' matches '.csv.gz' as well as '.csv.gzip'). Set to None
        to load uncompressed files.
        Options: {'gzip', 'bz2', 'zip', 'xz', None}

    verbose : boolean, default=True
        Whether printing or not steps of the analysis.

    **kwargs : dict
        Extra arguments for loading files with the respective pandas function
        given the format of the file.

    Returns
    -------
    data : dict
        Dictionary containing the tables (pandas.DataFrame) loaded from the files.
        Keys are the filenames without the extension (and compression suffix)
        and values are the dataframes.
    '''
    # Filename suffixes matched for each `extension` option. 'excel' is a
    # format name rather than a real file extension, so it maps to the actual
    # Excel suffixes; '.excel' is kept so previously matched files still load.
    extension_suffixes = {
        'excel': ('.xls', '.xlsx', '.excel'),
        'csv': ('.csv',),
        'tsv': ('.tsv',),
        'txt': ('.txt',),
    }
    # Filename suffixes matched for each `compression` option. gzip files are
    # conventionally named '.gz'; '.gzip' is kept for backward compatibility.
    compression_suffixes = {
        'gzip': ('.gz', '.gzip'),
        'bz2': ('.bz2',),
        'zip': ('.zip',),
        'xz': ('.xz',),
    }

    assert extension in extension_suffixes, "Enter a valid `extension`."

    filenames = get_files_from_directory(pathname=pathname,
                                         dir_in_filepath=True)

    if compression is None:
        comp_suffixes = ('',)
    else:
        assert compression in compression_suffixes, "Enter a valid `compression`."
        comp_suffixes = compression_suffixes[compression]

    # Every accepted full suffix, e.g. ('.csv.gz', '.csv.gzip')
    suffixes = tuple(ext + comp
                     for ext in extension_suffixes[extension]
                     for comp in comp_suffixes)

    data = dict()
    for filename in filenames:
        matched = next((suffix for suffix in suffixes if filename.endswith(suffix)), None)
        if matched is None:
            continue
        if verbose:
            print('Loading {}'.format(filename))
        basename = os.path.basename(filename)
        # Strip only the matched trailing suffix, so names that contain the
        # extension somewhere in the middle are not truncated.
        sample = basename[:-len(matched)]
        data[sample] = load_table(filename=filename,
                                  format=extension,
                                  sep=sep,
                                  sheet_name=sheet_name,
                                  compression=compression,
                                  verbose=verbose, **kwargs)
    return data


def load_rnaseq(rnaseq_file, gene_column, drop_nangenes=True, log_transformation=False, verbose=True, **kwargs):
'''
Loads a gene expression matrix for a RNA-seq experiment. Preprocessing
Expand Down Expand Up @@ -414,6 +493,62 @@ def load_variable_with_pickle(filename):
return variable


def load_tensor(filename, backend=None, device=None):
    '''Imports a communication tensor that could be used
    with Tensor-cell2cell.

    Parameters
    ----------
    filename : str
        Absolute path to a file storing a communication tensor
        that was previously saved by using pickle.

    backend : str, default=None
        Backend that TensorLy will use to perform calculations
        on this tensor. When None, the currently active backend
        is used (usually 'numpy'). When given, it is activated
        through tensorly.set_backend. Options are:
        {'cupy', 'jax', 'mxnet', 'numpy', 'pytorch', 'tensorflow'}

    device : str, default=None
        Device to use when backend allows using multiple devices.
        It is only forwarded when the active backend is 'pytorch'
        or 'tensorflow'; otherwise it is ignored. Options are:
        {'cpu', 'cuda:0', None}

    Returns
    -------
    interaction_tensor : cell2cell.tensor.BaseTensor
        A communication tensor generated with any of the tensor class in
        cell2cell.tensor.
    '''
    # NOTE(review): per the documented contract the file is a pickle; pickles
    # can execute arbitrary code when loaded, so only open trusted files.
    interaction_tensor = load_variable_with_pickle(filename)

    # Import unconditionally: an import statement inside a function makes `tl`
    # a local name, so skipping it (e.g. because a global `tl` exists) would
    # raise UnboundLocalError on the first use below.
    import tensorly as tl

    if backend is not None:
        tl.set_backend(backend)

    # Potential TODO: Include other backends that support different devices
    if device is not None and tl.get_backend() in ['pytorch', 'tensorflow']:
        tensor_kwargs = {'device': device}
    else:
        tensor_kwargs = {}

    # Re-create every array stored in the object with the active backend
    for attribute in ('tensor', 'loc_nans', 'loc_zeros'):
        converted = tl.tensor(getattr(interaction_tensor, attribute), **tensor_kwargs)
        setattr(interaction_tensor, attribute, converted)
    if interaction_tensor.mask is not None:
        interaction_tensor.mask = tl.tensor(interaction_tensor.mask, **tensor_kwargs)
    return interaction_tensor


def load_tensor_factors(filename):
'''Imports factors previously exported from a tensor
decomposition done in a cell2cell.tensor.BaseTensor-like object.
Expand Down
19 changes: 16 additions & 3 deletions cell2cell/tensor/external_scores.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,14 @@
import pandas as pd

from collections import defaultdict
from tqdm.auto import tqdm
from cell2cell.preprocessing.find_elements import get_element_abundances, get_elements_over_fraction
from cell2cell.tensor.tensor import PreBuiltTensor


def dataframes_to_tensor(context_df_dict, sender_col, receiver_col, ligand_col, receptor_col, score_col, how='inner',
outer_fraction=0.0, lr_fill=np.nan, cell_fill=np.nan, lr_sep='^', context_order=None,
order_labels=None, sort_elements=True, device=None):
outer_fraction=0.0, lr_fill=np.nan, cell_fill=np.nan, lr_sep='^', dup_aggregation='max',
context_order=None, order_labels=None, sort_elements=True, device=None):
'''Generates an InteractionTensor from a dictionary
containing dataframes for all contexts.
Expand Down Expand Up @@ -74,6 +75,15 @@ def dataframes_to_tensor(context_df_dict, sender_col, receiver_col, ligand_col,
lr_sep : str, default='^'
Separation character to join ligands and receptors into a LR pair name.
dup_aggregation : str, default='max'
Approach to aggregate communication score if there are multiple instances
of an LR pair for a specific sender-receiver pair in one of the dataframes.
- 'max' : Maximum of the multiple instances
- 'min' : Minimum of the multiple instances
- 'mean' : Average of the multiple instances
- 'median' : Median of the multiple instances
context_order : list, default=None
List used to sort the contexts when building the tensor. Elements must
be all elements in context_df_dict.keys().
Expand Down Expand Up @@ -169,7 +179,7 @@ def dataframes_to_tensor(context_df_dict, sender_col, receiver_col, ligand_col,

# Build temporal tensor to pass to PreBuiltTensor
tmp_tensor = []
for k in context_order:
for k in tqdm(context_order):
v = cont_dict[k]
# 3D tensor for the context
tmp_3d_tensor = []
Expand All @@ -178,6 +188,9 @@ def dataframes_to_tensor(context_df_dict, sender_col, receiver_col, ligand_col,
if df.shape[0] == 0: # TODO: Check behavior when df is empty
df = pd.DataFrame(lr_fill, index=sender_cells, columns=receiver_cells)
else:
if df[cols[:-1]].duplicated().any():
assert dup_aggregation in ['max', 'min', 'mean', 'median'], "Please use a valid option for `dup_aggregation`."
df = getattr(df.groupby(cols[:-1]), dup_aggregation)().reset_index()
df = df.pivot(index=sender_col, columns=receiver_col, values=score_col)
df = df.reindex(sender_cells, fill_value=cell_fill).reindex(receiver_cells, fill_value=cell_fill, axis='columns')

Expand Down
Loading

0 comments on commit b0c1851

Please sign in to comment.