Skip to content

Commit

Permalink
Skip labels before loop (#394)
Browse files Browse the repository at this point in the history
* Skip labels before loop

* Fix lint

---------

Co-authored-by: Michaela Müller <51025211+mumichae@users.noreply.github.com>
  • Loading branch information
johnarevalo and mumichae authored Aug 7, 2024
1 parent 3d982c4 commit 5547cd1
Showing 1 changed file with 58 additions and 58 deletions.
116 changes: 58 additions & 58 deletions scib/metrics/kbet.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,39 +107,72 @@ def kBET(
# set upper bound for k0
size_max = 2**31 - 1

# check if neighborhood size too small or only one batch in subset
counts = adata_tmp.obs.groupby(label_key).agg(
{label_key: "count", batch_key: "nunique"}
)
labels = counts.query(f"{label_key}>=10 and {batch_key} > 1").index
skipped = counts.index.difference(labels)
print(f"{len(skipped)} labels consist of a single batch or is too small. Skip.")
# prepare call of kBET per cluster
kBET_scores = {"cluster": [], "kBET": []}
for clus in adata_tmp.obs[label_key].unique():
kBET_scores = {"cluster": list(skipped), "kBET": [np.nan] * len(skipped)}
for clus in labels:

# subset by label
adata_sub = adata_tmp[adata_tmp.obs[label_key] == clus, :].copy()

# check if neighborhood size too small or only one batch in subset
if np.logical_or(
adata_sub.n_obs < 10, len(adata_sub.obs[batch_key].cat.categories) == 1
):
print(f"{clus} consists of a single batch or is too small. Skip.")
score = np.nan
else:
quarter_mean = np.floor(
np.mean(adata_sub.obs[batch_key].value_counts()) / 4
).astype("int")
k0 = np.min([70, np.max([10, quarter_mean])])
# check k0 for reasonability
if k0 * adata_sub.n_obs >= size_max:
k0 = np.floor(size_max / adata_sub.n_obs).astype("int")
quarter_mean = np.floor(
np.mean(adata_sub.obs[batch_key].value_counts()) / 4
).astype("int")
k0 = np.min([70, np.max([10, quarter_mean])])
# check k0 for reasonability
if k0 * adata_sub.n_obs >= size_max:
k0 = np.floor(size_max / adata_sub.n_obs).astype("int")

matrix = np.zeros(shape=(adata_sub.n_obs, k0 + 1))
matrix = np.zeros(shape=(adata_sub.n_obs, k0 + 1))

if verbose:
print(f"Use {k0} nearest neighbors.")
n_comp, labs = scipy.sparse.csgraph.connected_components(
adata_sub.obsp["connectivities"], connection="strong"
if verbose:
print(f"Use {k0} nearest neighbors.")
n_comp, labs = scipy.sparse.csgraph.connected_components(
adata_sub.obsp["connectivities"], connection="strong"
)

if n_comp == 1: # a single component to compute kBET on
try:
nn_index_tmp = diffusion_nn(adata_sub, k=k0).astype("float")
# call kBET
score = kBET_single(
matrix=matrix,
batch=adata_sub.obs[batch_key],
knn=nn_index_tmp
+ 1, # nn_index in python is 0-based and 1-based in R
verbose=verbose,
k0=k0,
)
except NeighborsError:
print("Not enough neighbours")
score = 1 # i.e. 100% rejection

else:
# check the number of components where kBET can be computed upon
comp_size = pd.value_counts(labs)
# check which components are small
comp_size_thresh = 3 * k0
idx_nonan = np.flatnonzero(
np.in1d(labs, comp_size[comp_size >= comp_size_thresh].index)
)

if n_comp == 1: # a single component to compute kBET on
# check if 75% of all cells can be used for kBET run
if len(idx_nonan) / len(labs) >= 0.75:
# create another subset of components, assume they are not visited in a diffusion process
adata_sub_sub = adata_sub[idx_nonan, :].copy()
nn_index_tmp = np.empty(shape=(adata_sub.n_obs, k0))
nn_index_tmp[:] = np.nan

try:
nn_index_tmp = diffusion_nn(adata_sub, k=k0).astype("float")
nn_index_tmp[idx_nonan] = diffusion_nn(adata_sub_sub, k=k0).astype(
"float"
)
# call kBET
score = kBET_single(
matrix=matrix,
Expand All @@ -152,41 +185,8 @@ def kBET(
except NeighborsError:
print("Not enough neighbours")
score = 1 # i.e. 100% rejection

else:
# check the number of components where kBET can be computed upon
comp_size = pd.value_counts(labs)
# check which components are small
comp_size_thresh = 3 * k0
idx_nonan = np.flatnonzero(
np.in1d(labs, comp_size[comp_size >= comp_size_thresh].index)
)

# check if 75% of all cells can be used for kBET run
if len(idx_nonan) / len(labs) >= 0.75:
# create another subset of components, assume they are not visited in a diffusion process
adata_sub_sub = adata_sub[idx_nonan, :].copy()
nn_index_tmp = np.empty(shape=(adata_sub.n_obs, k0))
nn_index_tmp[:] = np.nan

try:
nn_index_tmp[idx_nonan] = diffusion_nn(
adata_sub_sub, k=k0
).astype("float")
# call kBET
score = kBET_single(
matrix=matrix,
batch=adata_sub.obs[batch_key],
knn=nn_index_tmp
+ 1, # nn_index in python is 0-based and 1-based in R
verbose=verbose,
k0=k0,
)
except NeighborsError:
print("Not enough neighbours")
score = 1 # i.e. 100% rejection
else: # if there are too many too small connected components, set kBET score to 1
score = 1 # i.e. 100% rejection
else: # if there are too many too small connected components, set kBET score to 1
score = 1 # i.e. 100% rejection

kBET_scores["cluster"].append(clus)
kBET_scores["kBET"].append(score)
Expand Down

0 comments on commit 5547cd1

Please sign in to comment.