From 672aca252a0289bdfdcca6fda70f411cba611b33 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Fri, 9 Apr 2021 18:33:34 +0100 Subject: [PATCH 01/22] WIP NBLAST score matrix calculation --- navis/nbl/smat.py | 289 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 289 insertions(+) create mode 100644 navis/nbl/smat.py diff --git a/navis/nbl/smat.py b/navis/nbl/smat.py new file mode 100644 index 00000000..5c0fc167 --- /dev/null +++ b/navis/nbl/smat.py @@ -0,0 +1,289 @@ +from itertools import permutations +import sys +import os +from collections import Counter +from concurrent.futures import ProcessPoolExecutor + +import pandas as pd +import numpy as np + +epsilon = sys.float_info.epsilon +cpu_count = max(1, (os.cpu_count() or 2) - 1) + + +def ensure_inf_bounds(numbers): + numbers[0] = -np.inf + numbers[1] = np.inf + return np.array(numbers) + + +def chunksize(it_len, cpu_count, min_chunk=50): + return max(min_chunk, int(it_len / (cpu_count * 4))) + + +class ScoreMatrixBuilder: + def __init__( + self, + dotprops, + matching_sets, + nonmatching=None, + alpha=False, + seed=1991, + ): + """Class for building a score matrix for NBLAST. + + dist_bins and dot_bins must additionally be given using the set_* or calc_* methods. + + Parameters + ---------- + dotprops : dict or list of Dotprops + An indexable sequence of all neurons which will be used as the training set, + as Dotprops objects. + matching_sets : list of sets of indices into dotprops + Sets of neurons, as indices into dotprops, which should be considered matches. + nonmatching : set of indices into dotprops, optional + Set of neurons, as indices into dotprops, which should not be considered matches. + If not given, all dotprops will be used + (on the assumption that matches are a small subset of possible pairs). + alpha : bool, optional + Whether to multiply dot product by alpha (local colinearity) + when calculating dist-dots. + seed : int, optional + Non-matching pairs are drawn at random using this seed, by default 1991 + """ + self.rng = np.random.default_rng(seed) + self.dotprops = dotprops + self.matching_sets = matching_sets + self.alpha = alpha + self._nonmatching = nonmatching + self.dist_bins = None + self.dot_bins = None + + def _dotprop_keys(self): + try: + return self.dotprops.keys() + except AttributeError: + return range(len(self.dotprops)) + + @property + def nonmatching(self): + """Indices of nonmatching set of neurons""" + if self._nonmatching is None: + return set(self._dotprop_keys()) + return self._nonmatching + + def set_dist_bins(self, bins): + """Set distance bins to use for score matrix. + + The first and last values are set to -inf and inf respectively, + so that the bins explicitly cover the entire domain, + even though negatives are not possible, in principle. + + Parameters + ---------- + bins : list of float + First and last values are effectively ignored, as above. + + Returns + ------- + np.ndarray of float + The modified bin boundaries. + """ + self.dist_bins = ensure_inf_bounds(bins) + return self.dist_bins + + def calc_dist_bins(self, n_bins, base, min_exp, max_exp): + """Calculate distance bins in a logarithmic sequence. + + n_bins values are spread evenly between min_exp and max_exp (inclusive). + These are transformed into boundaries by base**values. + Then inf is added to the end, and the first boundary is replaced by -inf. + + Parameters + ---------- + n_bins : int + Number of bins (i.e. 
number of boundaries - 1) + base : float + Value to be raised to some power to create the boundaries. + min_exp : float + Exponent of the lowest boundary (actual boundary will be replaced by -inf) + max_exp : float + Exponent of the highest boundary (the lower bound of the inf-containing bin) + + Returns + ------- + np.ndarray of float + The modified bin boundaries. + """ + bins = np.array( + list(np.logspace(min_exp, max_exp, n_bins, base=base)) + [np.inf] + ) + return self.set_dist_bins(bins) + + def set_dot_bins(self, bins): + """Set dot product bins to use for score matrix. + + The first and last values are set to -inf and inf respectively, + so that the bins explicitly cover the entire domain, + even though values outside of 0-1 are not possible, in principle. + + Parameters + ---------- + bins : list of float + First and last values are effectively ignored, as above. + + Returns + ------- + np.ndarray of float + The modified bin boundaries. + """ + self.dot_bins = ensure_inf_bounds(bins) + return self.dot_bins + + def calc_dot_bins(self, n_bins): + """Calculate dot product bins in a linear sequence between 0 and 1. + + Parameters + ---------- + n_bins : int + Number of bins (i.e. number of boundaries - 1). + + Returns + ------- + np.ndarray of float + The modified bin boundaries. + """ + bins = np.linspace(0, 1, n_bins + 1) + return self.set_dot_bins(bins) + + def _yield_matching_pairs(self): + for ms in self.matching_sets: + for q, t in permutations(ms, 2): + if q != t: + yield q, t + + def _yield_nonmatching_pairs(self): + for q, t in permutations(self.nonmatching): + if q != t: + yield q, t + + def _query_to_dist_dot_idxs(self, q_idx, t_idx, counts=None): + q = self.dotprops[q_idx] + response = q.dist_dots(self.dotprops[t_idx], use_alpha=self.alpha) + if self.alpha: + dists, dots, alphas = response + dots *= alphas + else: + dists, dots = response + + dist_idxs = np.digitize(dists, self.dist_bins) - 1 + dot_idxs = np.digitize(dots, self.dot_bins) - 1 + if counts is None: + counts = np.zeros( + (len(self.dist_bins), len(self.dot_bins)), + dtype=int, + ) + for dist_idx, dot_idx in zip(dist_idxs, dot_idxs): + counts[dist_idx, dot_idx] += 1 + return counts + + def _counts_array(self, idx_pairs, threads=None): + counts = np.zeros( + (len(self.dist_bins), len(self.dot_bins)), + dtype=int, + ) + if threads is None or threads == 0 and cpu_count == 1: + for q_idx, t_idx in idx_pairs: + counts = self._query_to_dist_dot_idxs(q_idx, t_idx, counts) + return counts + + threads = threads or cpu_count + + with ProcessPoolExecutor(threads) as exe: + for these_counts in exe.map( + self._query_to_dist_dot_idxs, + idx_pairs[:, 0], + idx_pairs[:, 1], + chunksize=chunksize(len(idx_pairs), threads) + ): + counts += these_counts + + return counts + + def build(self, threads=None): + """Build the score matrix. + + All non-identical neuron pairs within all matching sets are selected, + and distdots calculated for those pairs. + Then, the minimum number of non-matching pairs are randomly drawn + so that at least as many distdots can be calculated for non-matching + pairs. + + In each bin of the 2D score matrix, the log2 odds ratio of a distdot + in that bin belonging to a match vs. non-match is calculated. + + Parameters + ---------- + threads : int, optional + If None, act in serial. + If 0, use cpu_count - 1. + Otherwise, use the given value. + + Returns + ------- + pd.DataFrame + Suitable for passing to navis.nbl.ScoringFunction + + Raises + ------ + ValueError + If dist_bins or dot_bins are not set. 
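+
+        Examples
+        --------
+        A hypothetical sketch, assuming ``dotprops`` and ``matching_sets``
+        have been prepared as described above:
+
+        >>> builder = ScoreMatrixBuilder(dotprops, matching_sets)
+        >>> _ = builder.calc_dist_bins(21, base=2, min_exp=-3, max_exp=17)
+        >>> _ = builder.calc_dot_bins(10)
+        >>> smat = builder.build()  # pd.DataFrame of log2 odds ratios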
+ """ + if self.dot_bins is None or self.dist_bins is None: + raise ValueError("dot_bins and dist_bins must be set or calculated") + + matching_pairs = set(self._yield_matching_pairs()) + # need to know the eventual distdot count + # so we know how many non-matching pairs to draw + q_idx_count = Counter(p[0] for p in matching_pairs) + n_matching_dist_dots = sum( + len(self.dotprops[q_idx]) * n_reps for q_idx, n_reps in q_idx_count.items() + ) + + # pre-calculating which pairs we're going to use, + # rather than drawing them as we need them, + # means that we can parallelise the later step more effectively. + # Slowdowns here are basically meaningless + # because of how long distdot calculation will take + all_nonmatching_pairs = list(self._yield_nonmatching_pairs()) + nonmatching_pairs = [] + n_nonmatching_dist_dots = 0 + while n_nonmatching_dist_dots < n_matching_dist_dots: + idx = self.rng.integers(0, len(all_nonmatching_pairs) + 1) + nonmatching_pairs.append(all_nonmatching_pairs.pop(idx)) + n_nonmatching_dist_dots += len(self.dotprops[nonmatching_pairs[-1][0]]) + + match_counts = self._counts_array(matching_pairs, threads) + nonmatch_counts = self._counts_array(nonmatching_pairs, threads) + + # account for there being different total numbers of match/nonmatch dist dots + matching_factor = nonmatch_counts.sum() / match_counts.sum() + + cells = np.log2( + (match_counts * matching_factor + epsilon) / (nonmatch_counts + epsilon) + ) + + index = [ + f"({left},{right}]" + for left, right in zip( + [-np.inf] + list(self.dist_bins), list(self.dist_bins) + [np.inf] + ) + ] + columns = [ + f"({left},{right}]" + for left, right in zip( + [-np.inf] + list(self.dot_bins), list(self.dot_bins) + [np.inf] + ) + ] + + return pd.DataFrame(cells, index, columns) From 11994317e49ee6c727fbf9f07d10121abae332fd Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Fri, 9 Apr 2021 19:12:56 +0100 Subject: [PATCH 02/22] Make bin boundary handling more consistent --- navis/nbl/smat.py | 60 +++++++++++++++++++++++++++++++---------------- 1 file changed, 40 insertions(+), 20 deletions(-) diff --git a/navis/nbl/smat.py b/navis/nbl/smat.py index 5c0fc167..2212f7bd 100644 --- a/navis/nbl/smat.py +++ b/navis/nbl/smat.py @@ -11,10 +11,19 @@ cpu_count = max(1, (os.cpu_count() or 2) - 1) -def ensure_inf_bounds(numbers): - numbers[0] = -np.inf - numbers[1] = np.inf - return np.array(numbers) +def ensure_inf_bounds(numbers, right_one=True): + lst = list(numbers) + if lst[0] <= 0: + lst[0] = -np.inf + else: + lst.insert(0, -np.inf) + + if right_one and lst[-1] >= 1: + lst[-1] = np.inf + elif lst[-1] != np.inf: + lst.append(np.inf) + + return np.array(lst) def chunksize(it_len, cpu_count, min_chunk=50): @@ -72,8 +81,13 @@ def nonmatching(self): return set(self._dotprop_keys()) return self._nonmatching - def set_dist_bins(self, bins): - """Set distance bins to use for score matrix. + def set_dist_boundaries(self, boundaries): + """Set distance bins to use for score matrix by their internal boundaries. + + If the lowest boundary is <=0, it will be changed to -inf. + If the lowest boundary is >0, -inf will be prepended. + + If the highest boundary is not inf, it will be appended. The first and last values are set to -inf and inf respectively, so that the bins explicitly cover the entire domain, @@ -89,15 +103,17 @@ def set_dist_bins(self, bins): np.ndarray of float The modified bin boundaries. 
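+
+        Examples
+        --------
+        A hypothetical call (``builder`` is a ``ScoreMatrixBuilder``;
+        the outer bounds are widened to -inf/inf as described above):
+
+        >>> bins = builder.set_dist_boundaries([1.0, 2.5, 10.0])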
""" - self.dist_bins = ensure_inf_bounds(bins) + self.dist_bins = ensure_inf_bounds(boundaries) return self.dist_bins def calc_dist_bins(self, n_bins, base, min_exp, max_exp): - """Calculate distance bins in a logarithmic sequence. + """Calculate distance boundaries in a logarithmic sequence. - n_bins values are spread evenly between min_exp and max_exp (inclusive). + base**min_exp will be the upper bound of the lowest bin; + base**max_exp will be the lower bound of the highest bin. + + n_bins - 1 values are spread evenly between min_exp and max_exp (inclusive). These are transformed into boundaries by base**values. - Then inf is added to the end, and the first boundary is replaced by -inf. Parameters ---------- @@ -115,17 +131,19 @@ def calc_dist_bins(self, n_bins, base, min_exp, max_exp): np.ndarray of float The modified bin boundaries. """ - bins = np.array( - list(np.logspace(min_exp, max_exp, n_bins, base=base)) + [np.inf] + # n_bins - 1 because inf outer boundaries are added + return self.set_dist_boundaries( + np.logspace(min_exp, max_exp, n_bins - 1, base=base) ) - return self.set_dist_bins(bins) - def set_dot_bins(self, bins): - """Set dot product bins to use for score matrix. + def set_dot_boundaries(self, boundaries): + """Set dot product bins to use for score matrix by their internal boundaries. - The first and last values are set to -inf and inf respectively, - so that the bins explicitly cover the entire domain, - even though values outside of 0-1 are not possible, in principle. + Dot products, even normalised by alpha values, + should be in the range 0,1. + However, due to float imprecision, they sometimes aren't, + so the lowest bound is set to -inf and highest bound set to inf + just in case. Parameters ---------- @@ -137,12 +155,14 @@ def set_dot_bins(self, bins): np.ndarray of float The modified bin boundaries. """ - self.dot_bins = ensure_inf_bounds(bins) + self.dot_bins = ensure_inf_bounds(boundaries, True) return self.dot_bins def calc_dot_bins(self, n_bins): """Calculate dot product bins in a linear sequence between 0 and 1. + Internally, 0 and 1 will be replaced with -inf and inf respectively. + Parameters ---------- n_bins : int @@ -154,7 +174,7 @@ def calc_dot_bins(self, n_bins): The modified bin boundaries. """ bins = np.linspace(0, 1, n_bins + 1) - return self.set_dot_bins(bins) + return self.set_dot_boundaries(bins[1:-1]) def _yield_matching_pairs(self): for ms in self.matching_sets: From c52294c04a015dc7b542ad222d172e29303a75fa Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Tue, 13 Apr 2021 18:10:52 +0100 Subject: [PATCH 03/22] Refactor smat.py for ND case Also include lookup classes. 
--- navis/nbl/smat.py | 518 ++++++++++++++++++++++++++++------------------ 1 file changed, 320 insertions(+), 198 deletions(-) diff --git a/navis/nbl/smat.py b/navis/nbl/smat.py index 2212f7bd..5aa1c4f5 100644 --- a/navis/nbl/smat.py +++ b/navis/nbl/smat.py @@ -1,72 +1,135 @@ +from __future__ import annotations from itertools import permutations import sys import os from collections import Counter from concurrent.futures import ProcessPoolExecutor +from typing import Iterator, Sequence, Callable, List, Iterable, Any, Tuple -import pandas as pd import numpy as np +import pandas as pd + +from ..core.neurons import Dotprops + +DEFAULT_SEED = 1991 epsilon = sys.float_info.epsilon cpu_count = max(1, (os.cpu_count() or 2) - 1) +implicit_interval = "[)" -def ensure_inf_bounds(numbers, right_one=True): - lst = list(numbers) - if lst[0] <= 0: - lst[0] = -np.inf +def chunksize(it_len, cpu_count, min_chunk=50): + return max(min_chunk, int(it_len / (cpu_count * 4))) + + +def wrap_bounds(arr: np.ndarray, left: float = -np.inf, right: float = np.inf): + """Ensures that boundaries cover -inf to inf. + + If the left or rightmost values lie within the interval ``(left, right)``, + ``-inf`` or ``inf`` will be prepended or appended respectively. + Otherwise, the left and right values will be set to + ``-inf`` and ``inf`` respectively. + + Parameters + ---------- + arr : array-like of floats + Array of boundaries + left : float, optional + If the first value is greater than this, prepend -inf; + otherwise replace that value with -inf. + right : float, optional + If the last value is less than this, append inf; + otherwise replace that value with inf. + + Returns + ------- + np.ndarray of floats + Boundaries from -inf to inf + + Raises + ------ + ValueError + If values of ``arr`` are not monotonically increasing. + """ + if not np.all(np.diff(arr) > 0): + raise ValueError("Boundaries are not monotonically increasing") + + items = list(arr) + if items[0] <= left: + items[0] = -np.inf else: - lst.insert(0, -np.inf) + items.insert(0, -np.inf) - if right_one and lst[-1] >= 1: - lst[-1] = np.inf - elif lst[-1] != np.inf: - lst.append(np.inf) + if items[-1] >= right: + items[-1] = np.inf + else: + items.append(np.inf) - return np.array(lst) + return np.array(items) -def chunksize(it_len, cpu_count, min_chunk=50): - return max(min_chunk, int(it_len / (cpu_count * 4))) +def yield_not_same(pairs: Iterable[Tuple[Any, Any]]) -> Iterator[Tuple[Any, Any]]: + for a, b in pairs: + if a != b: + yield a, b -class ScoreMatrixBuilder: +class LookupNdBuilder: def __init__( self, dotprops, matching_sets, + boundaries: Sequence[Sequence[float]], + fn: Callable[[Dotprops, Dotprops], List[np.ndarray]], nonmatching=None, - alpha=False, - seed=1991, - ): - """Class for building a score matrix for NBLAST. - - dist_bins and dot_bins must additionally be given using the set_* or calc_* methods. + seed=DEFAULT_SEED, + ) -> None: + f"""Class for building an N-dimensional score lookup for NBLAST. Parameters ---------- dotprops : dict or list of Dotprops An indexable sequence of all neurons which will be used as the training set, as Dotprops objects. - matching_sets : list of sets of indices into dotprops - Sets of neurons, as indices into dotprops, which should be considered matches. - nonmatching : set of indices into dotprops, optional - Set of neurons, as indices into dotprops, which should not be considered matches. 
- If not given, all dotprops will be used + matching_sets : list of sets of int + Sets of neurons, as indices into ``dotprops``, which should be considered matches. + boundaries : sequence of array-likes of floats + List of lists, where the inner lists are monotonically increasing + from -inf to inf. + The length of the outer list is the dimensionality of the lookup table. + + The inner lists are the boundaries of bins for that dimension, + i.e. an inner list of N items describes N-1 bins. + If an inner list is not inf-bounded, + -inf and inf will be prepended and appended. + + See the ``wrap_bounds`` convenience function. + fn : Callable[[Dotprops, Dotprops], List[np.ndarray[float]]] + Function taking 2 arguments, + both instances of ``navis.core.neurons.Dotprops``, + and returning a list of 1D ``numpy.ndarray``s of floats. + The length of the list must be the same as the length of ``boundaries``. + The length of the ``array``s must be the same + as the number of points in the first argument. + + This function returns values describing the quality of + point matches from a query to a target neuron. + nonmatching : set of int, optional + Set of neurons, as indices into ``dotprops``, + which should not be considered matches. + If not given, all ``dotprops`` will be used (on the assumption that matches are a small subset of possible pairs). - alpha : bool, optional - Whether to multiply dot product by alpha (local colinearity) - when calculating dist-dots. seed : int, optional - Non-matching pairs are drawn at random using this seed, by default 1991 + Non-matching pairs are drawn at random using this seed, + by default {DEFAULT_SEED} """ - self.rng = np.random.default_rng(seed) self.dotprops = dotprops self.matching_sets = matching_sets - self.alpha = alpha self._nonmatching = nonmatching - self.dist_bins = None - self.dot_bins = None + self.fn = fn + self.boundaries = [wrap_bounds(b) for b in boundaries] + + self.rng = np.random.default_rng(seed) def _dotprop_keys(self): try: @@ -81,156 +144,85 @@ def nonmatching(self): return set(self._dotprop_keys()) return self._nonmatching - def set_dist_boundaries(self, boundaries): - """Set distance bins to use for score matrix by their internal boundaries. - - If the lowest boundary is <=0, it will be changed to -inf. - If the lowest boundary is >0, -inf will be prepended. - - If the highest boundary is not inf, it will be appended. - - The first and last values are set to -inf and inf respectively, - so that the bins explicitly cover the entire domain, - even though negatives are not possible, in principle. - - Parameters - ---------- - bins : list of float - First and last values are effectively ignored, as above. - - Returns - ------- - np.ndarray of float - The modified bin boundaries. - """ - self.dist_bins = ensure_inf_bounds(boundaries) - return self.dist_bins - - def calc_dist_bins(self, n_bins, base, min_exp, max_exp): - """Calculate distance boundaries in a logarithmic sequence. - - base**min_exp will be the upper bound of the lowest bin; - base**max_exp will be the lower bound of the highest bin. - - n_bins - 1 values are spread evenly between min_exp and max_exp (inclusive). - These are transformed into boundaries by base**values. - - Parameters - ---------- - n_bins : int - Number of bins (i.e. number of boundaries - 1) - base : float - Value to be raised to some power to create the boundaries. 
- min_exp : float - Exponent of the lowest boundary (actual boundary will be replaced by -inf) - max_exp : float - Exponent of the highest boundary (the lower bound of the inf-containing bin) - - Returns - ------- - np.ndarray of float - The modified bin boundaries. - """ - # n_bins - 1 because inf outer boundaries are added - return self.set_dist_boundaries( - np.logspace(min_exp, max_exp, n_bins - 1, base=base) - ) - - def set_dot_boundaries(self, boundaries): - """Set dot product bins to use for score matrix by their internal boundaries. - - Dot products, even normalised by alpha values, - should be in the range 0,1. - However, due to float imprecision, they sometimes aren't, - so the lowest bound is set to -inf and highest bound set to inf - just in case. - - Parameters - ---------- - bins : list of float - First and last values are effectively ignored, as above. - - Returns - ------- - np.ndarray of float - The modified bin boundaries. - """ - self.dot_bins = ensure_inf_bounds(boundaries, True) - return self.dot_bins - - def calc_dot_bins(self, n_bins): - """Calculate dot product bins in a linear sequence between 0 and 1. - - Internally, 0 and 1 will be replaced with -inf and inf respectively. - - Parameters - ---------- - n_bins : int - Number of bins (i.e. number of boundaries - 1). - - Returns - ------- - np.ndarray of float - The modified bin boundaries. - """ - bins = np.linspace(0, 1, n_bins + 1) - return self.set_dot_boundaries(bins[1:-1]) - def _yield_matching_pairs(self): for ms in self.matching_sets: - for q, t in permutations(ms, 2): - if q != t: - yield q, t + yield from yield_not_same(permutations(ms, 2)) def _yield_nonmatching_pairs(self): - for q, t in permutations(self.nonmatching): - if q != t: - yield q, t - - def _query_to_dist_dot_idxs(self, q_idx, t_idx, counts=None): - q = self.dotprops[q_idx] - response = q.dist_dots(self.dotprops[t_idx], use_alpha=self.alpha) - if self.alpha: - dists, dots, alphas = response - dots *= alphas - else: - dists, dots = response + # todo: this could be much better, use meshgrid or shuffle index arrays + return yield_not_same(permutations(self.nonmatching, 2)) + + def _empty_counts(self): + shape = [len(b) - 1 for b in self.boundaries] + return np.zeros(shape, int) + + def _query_to_idxs(self, q_idx, t_idx, counts=None): + response = self.fn(self.dotprops[q_idx], self.dotprops[t_idx]) + idxs = [np.digitize(r, b) - 1 for r, b in zip(response, self.boundaries)] - dist_idxs = np.digitize(dists, self.dist_bins) - 1 - dot_idxs = np.digitize(dots, self.dot_bins) - 1 if counts is None: - counts = np.zeros( - (len(self.dist_bins), len(self.dot_bins)), - dtype=int, - ) - for dist_idx, dot_idx in zip(dist_idxs, dot_idxs): - counts[dist_idx, dot_idx] += 1 + counts = self._empty_counts() + + for idx in zip(*idxs): + counts[idx] += 1 + return counts def _counts_array(self, idx_pairs, threads=None): - counts = np.zeros( - (len(self.dist_bins), len(self.dot_bins)), - dtype=int, - ) + counts = self._empty_counts() if threads is None or threads == 0 and cpu_count == 1: for q_idx, t_idx in idx_pairs: - counts = self._query_to_dist_dot_idxs(q_idx, t_idx, counts) + counts = self._query_to_idxs(q_idx, t_idx, counts) return counts threads = threads or cpu_count with ProcessPoolExecutor(threads) as exe: for these_counts in exe.map( - self._query_to_dist_dot_idxs, + self._query_to_idxs, idx_pairs[:, 0], idx_pairs[:, 1], - chunksize=chunksize(len(idx_pairs), threads) + chunksize=chunksize(len(idx_pairs), threads), ): counts += these_counts return counts - 
def build(self, threads=None): + def _build(self, threads) -> Tuple[List[np.ndarray], np.ndarray]: + matching_pairs = set(self._yield_matching_pairs()) + # need to know the eventual distdot count + # so we know how many non-matching pairs to draw + q_idx_count = Counter(p[0] for p in matching_pairs) + n_matching_dist_dots = sum( + len(self.dotprops[q_idx]) * n_reps for q_idx, n_reps in q_idx_count.items() + ) + + # pre-calculating which pairs we're going to use, + # rather than drawing them as we need them, + # means that we can parallelise the later step more effectively. + # Slowdowns here are practically meaningless + # because of how long distdot calculation will take + all_nonmatching_pairs = list(self._yield_nonmatching_pairs()) + nonmatching_pairs = [] + n_nonmatching_dist_dots = 0 + while n_nonmatching_dist_dots < n_matching_dist_dots: + idx = self.rng.integers(0, len(all_nonmatching_pairs) + 1) + nonmatching_pairs.append(all_nonmatching_pairs.pop(idx)) + n_nonmatching_dist_dots += len(self.dotprops[nonmatching_pairs[-1][0]]) + + match_counts = self._counts_array(matching_pairs, threads) + nonmatch_counts = self._counts_array(nonmatching_pairs, threads) + + # account for there being different total numbers of match/nonmatch dist dots + matching_factor = nonmatch_counts.sum() / match_counts.sum() + + cells = np.log2( + (match_counts * matching_factor + epsilon) / (nonmatch_counts + epsilon) + ) + + return self.boundaries, cells + + def build(self, threads=None) -> LookupNd: """Build the score matrix. All non-identical neuron pairs within all matching sets are selected, @@ -239,7 +231,7 @@ def build(self, threads=None): so that at least as many distdots can be calculated for non-matching pairs. - In each bin of the 2D score matrix, the log2 odds ratio of a distdot + In each bin of the score matrix, the log2 odds ratio of a distdot in that bin belonging to a match vs. non-match is calculated. Parameters @@ -259,51 +251,181 @@ def build(self, threads=None): ValueError If dist_bins or dot_bins are not set. """ - if self.dot_bins is None or self.dist_bins is None: - raise ValueError("dot_bins and dist_bins must be set or calculated") + return LookupNd(*self._build(threads)) - matching_pairs = set(self._yield_matching_pairs()) - # need to know the eventual distdot count - # so we know how many non-matching pairs to draw - q_idx_count = Counter(p[0] for p in matching_pairs) - n_matching_dist_dots = sum( - len(self.dotprops[q_idx]) * n_reps for q_idx, n_reps in q_idx_count.items() - ) - # pre-calculating which pairs we're going to use, - # rather than drawing them as we need them, - # means that we can parallelise the later step more effectively. 
- # Slowdowns here are basically meaningless - # because of how long distdot calculation will take - all_nonmatching_pairs = list(self._yield_nonmatching_pairs()) - nonmatching_pairs = [] - n_nonmatching_dist_dots = 0 - while n_nonmatching_dist_dots < n_matching_dist_dots: - idx = self.rng.integers(0, len(all_nonmatching_pairs) + 1) - nonmatching_pairs.append(all_nonmatching_pairs.pop(idx)) - n_nonmatching_dist_dots += len(self.dotprops[nonmatching_pairs[-1][0]]) +def dist_dot(q: Dotprops, t: Dotprops): + return list(q.dist_dots(t)) - match_counts = self._counts_array(matching_pairs, threads) - nonmatch_counts = self._counts_array(nonmatching_pairs, threads) - # account for there being different total numbers of match/nonmatch dist dots - matching_factor = nonmatch_counts.sum() / match_counts.sum() +def dist_dot_alpha(q: Dotprops, t: Dotprops): + dist, dot, alpha = q.dist_dots(t, alpha=True) + return [dist, dot * np.sqrt(alpha)] - cells = np.log2( - (match_counts * matching_factor + epsilon) / (nonmatch_counts + epsilon) + +class LookupDistDotBuilder(LookupNdBuilder): + def __init__( + self, + dotprops, + matching_sets, + dist_boundaries, + dot_boundaries, + nonmatching=None, + use_alpha=False, + seed=DEFAULT_SEED, + ): + f"""Class for building a 2-dimensional score lookup for NBLAST. + + The scores are + + 1. The distances between best-matching points + 2. The dot products of direction vectors around those points, + optionally scaled by the colinearity ``alpha``. + + Parameters + ---------- + dotprops : dict or list of Dotprops + An indexable sequence of all neurons which will be used as the training set, + as Dotprops objects. + matching_sets : list of sets of int + Sets of neurons, as indices into ``dotprops``, which should be considered matches. + dist_boundaries : array-like of floats + Monotonically increasing boundaries between distance bins + (i.e. N values makes N-1 bins). + If the first value > 0, -inf will be prepended, + otherwise the first value will be replaced with -inf. + inf will be appended unless the last value is already inf. + + Consider using ``numpy.logspace`` or ``numpy.geomspace`` + to generate the inner boundaries of the bins. + dot_boundaries : array-like of floats + Monotonically increasing boundaries between + (possibly alpha-scaled) dot product bins + (i.e. N values makes N-1 bins). + If the first value > 0, -inf will be prepended, + otherwise the first value will be replaced with -inf. + If the last value < 1, inf will be appended, + otherwise the last value will be replaced with inf. + + Consider using ``numpy.linspace`` between 0 and 1. + nonmatching : set of int, optional + Set of neurons, as indices into ``dotprops``, + which should not be considered matches. + If not given, all ``dotprops`` will be used + (on the assumption that matches are a small subset of possible pairs). + use_alpha : bool, optional + If true, multiply the dot product by the geometric mean + of the matched points' alpha values + (i.e. ``sqrt(alpha1 * alpha2)``). 
+ seed : int, optional + Non-matching pairs are drawn at random using this seed, + by default {DEFAULT_SEED} + """ + fn = dist_dot_alpha if use_alpha else dist_dot + super().__init__( + dotprops, + matching_sets, + [wrap_bounds(dist_boundaries, 0), wrap_bounds(dot_boundaries, 0, 1)], + fn, + nonmatching, + seed, ) - index = [ - f"({left},{right}]" - for left, right in zip( - [-np.inf] + list(self.dist_bins), list(self.dist_bins) + [np.inf] - ) - ] - columns = [ - f"({left},{right}]" - for left, right in zip( - [-np.inf] + list(self.dot_bins), list(self.dot_bins) + [np.inf] + def build(self, threads=None) -> Lookup2d: + return Lookup2d(*self._build(threads)) + + +class LookupNd: + def __init__(self, boundaries: List[np.ndarray], cells: np.ndarray): + if [len(b) - 1 for b in boundaries] != list(cells.shape): + raise ValueError("boundaries and cells have inconsistent bin counts") + self.boundaries = boundaries + self.cells = cells + + def __call__(self, *args): + if len(args) != len(self.boundaries): + raise TypeError( + f"Lookup takes {len(self.boundaries)} arguments but {len(args)} were given" ) + + idxs = np.array( + [np.digitize(r, b) - 1 for r, b in zip(args, self.boundaries)] + ).T + out = self.cells[idxs] + if np.isscalar(args[0]): + return out[0] + else: + return out + + +def format_boundaries(arr): + return [f"[{lower},{upper})" for lower, upper in zip(arr[:-1], arr[1:])] + + +def parse_boundary(item, strict=False): + explicit_interval = item[0] + item[-1] + if strict and item[0] + item[-1] != implicit_interval: + raise ValueError( + f"Enclosing characters {explicit_interval} " + f"do not match implicit interval specifiers {implicit_interval}" + ) + return tuple(float(i) for i in item[1:-1].split(",")) + + +def parse_boundaries(items, strict=False): + last = None + for item in items: + lower, upper = parse_boundary(item, strict) + yield lower + last = upper + yield last + + +class Lookup2d(LookupNd): + """Convenience class inheriting from LookupNd for the common 2D case. + + Provides IO with pandas DataFrames. + """ + + def __init__(self, boundaries: List[np.ndarray], cells: np.ndarray): + if len(boundaries) != 2: + raise ValueError("boundaries must be of length 2; cells must be 2D") + super().__init__(boundaries, cells) + + def to_dataframe(self) -> pd.DataFrame: + # numpy.digitize includes left, excludes right, i.e. "[left,right)" + return pd.DataFrame( + self.cells, + format_boundaries(self.boundaries[0]), + format_boundaries(self.boundaries[1]), + ) + + @classmethod + def from_dataframe(cls, df: pd.DataFrame, strict=False): + f"""Parse score matrix from a dataframe with string index and column labels. + + Expects the index and column labels to specify an interval + like ``f"[{{lower}},{{upper}})"``. + + Will replace the lowermost and uppermost bound with -inf and inf + if they are not already. + + Parameters + ---------- + strict : bool, optional + If falsey (default), ignores parentheses and brackets, + effectively normalising to + ``{implicit_interval[0]}lower,upper{implicit_interval[1]}`` + as an implementation detail. + Otherwise, raises a ValueError if parens/brackets + do not match the implementation. 
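+
+        Examples
+        --------
+        Round trip through a dataframe
+        (``lookup`` is a hypothetical existing instance):
+
+        >>> df = lookup.to_dataframe()
+        >>> lookup2 = Lookup2d.from_dataframe(df)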
+ """ + boundaries = [ + np.array(list(parse_boundaries(arr, strict))) + for arr in (df.index, df.columns) ] + for b in boundaries: + b[0] = -np.inf + b[-1] = np.inf - return pd.DataFrame(cells, index, columns) + return cls(boundaries, df.to_numpy()) From b24ed9bebf6161b104cfbbcd095370c804eebd09 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Wed, 14 Apr 2021 10:51:16 +0100 Subject: [PATCH 04/22] minor: capitalise constant --- navis/nbl/smat.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/navis/nbl/smat.py b/navis/nbl/smat.py index 5aa1c4f5..081921b6 100644 --- a/navis/nbl/smat.py +++ b/navis/nbl/smat.py @@ -15,7 +15,7 @@ epsilon = sys.float_info.epsilon cpu_count = max(1, (os.cpu_count() or 2) - 1) -implicit_interval = "[)" +IMPLICIT_INTERVAL = "[)" def chunksize(it_len, cpu_count, min_chunk=50): @@ -364,10 +364,10 @@ def format_boundaries(arr): def parse_boundary(item, strict=False): explicit_interval = item[0] + item[-1] - if strict and item[0] + item[-1] != implicit_interval: + if strict and item[0] + item[-1] != IMPLICIT_INTERVAL: raise ValueError( f"Enclosing characters {explicit_interval} " - f"do not match implicit interval specifiers {implicit_interval}" + f"do not match implicit interval specifiers {IMPLICIT_INTERVAL}" ) return tuple(float(i) for i in item[1:-1].split(",")) @@ -415,7 +415,7 @@ def from_dataframe(cls, df: pd.DataFrame, strict=False): strict : bool, optional If falsey (default), ignores parentheses and brackets, effectively normalising to - ``{implicit_interval[0]}lower,upper{implicit_interval[1]}`` + ``{IMPLICIT_INTERVAL[0]}lower,upper{IMPLICIT_INTERVAL[1]}`` as an implementation detail. Otherwise, raises a ValueError if parens/brackets do not match the implementation. From 4aefc5a3509270ed20018b9f077c60ee46772288 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Wed, 14 Apr 2021 10:51:48 +0100 Subject: [PATCH 05/22] Tests: add fixtures for loading data --- tests/conftest.py | 54 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 47 insertions(+), 7 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 5bd67dc4..33a1286e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -6,16 +6,16 @@ import navis -@pytest.fixture +@pytest.fixture(scope="session") def data_dir(): return Path(__file__).resolve().parent.parent / "navis" / "data" @pytest.fixture( - params=["Path", "pathstr", "swcstr", "textbuffer", "rawbuffer", "DataFrame"] + params=["Path", "pathstr", "swcstr", "textbuffer", "rawbuffer", "DataFrame"], ) -def swc_source(request, data_dir: Path): - swc_path: Path = data_dir / "swc" / "722817260.swc" +def swc_source(request, swc_paths): + swc_path = swc_paths[0] if request.param == "Path": yield swc_path elif request.param == "pathstr": @@ -39,9 +39,9 @@ def swc_source(request, data_dir: Path): @pytest.fixture( params=["dirstr", "dirpath", "list", "listwithdir"], ) -def swc_source_multi(request, data_dir: Path): - dpath = data_dir / "swc" - fpath = dpath / "722817260.swc" +def swc_source_multi(request, swc_paths): + fpath = swc_paths[0] + dpath = fpath.parent if request.param == "dirstr": yield str(dpath) elif request.param == "dirpath": @@ -52,3 +52,43 @@ def swc_source_multi(request, data_dir: Path): yield [dpath, fpath] else: raise ValueError(f"Unknown parameter '{request.param}'") + + +def data_paths(dpath, glob="*"): + return sorted(dpath.glob(glob)) + + +@pytest.fixture(scope="session") +def swc_paths(data_dir: Path): + return data_paths(data_dir / "swc", "*.swc") + + 
+@pytest.fixture(scope="session") +def gml_paths(data_dir: Path): + return data_paths(data_dir / "gml", "*.gml") + + +@pytest.fixture(scope="session") +def obj_paths(data_dir: Path): + return data_paths(data_dir / "obj", "*.obj") + + +@pytest.fixture(scope="session") +def synapses_paths(data_dir: Path): + return data_paths(data_dir / "synapses", "*.csv") + + +@pytest.fixture(scope="session") +def volumes_paths(data_dir: Path): + return data_paths(data_dir / "volumes", "*.obj") + + +@pytest.fixture +def neurons(swc_paths, synapses_paths): + swc_reader = navis.io.swc_io.SwcReader() + out = [] + for swc_path, syn_path in zip(swc_paths, synapses_paths): + neuron = swc_reader.read_file_path(swc_path) + neuron.connectors = pd.read_csv(syn_path) + out.append(neuron) + return out From da3b6b518d7501aab2fbd782b975ff39313e2541 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Wed, 14 Apr 2021 14:01:20 +0100 Subject: [PATCH 06/22] Dotprops: implement __len__ --- navis/core/neurons.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/navis/core/neurons.py b/navis/core/neurons.py index b8622be9..7e962b4b 100644 --- a/navis/core/neurons.py +++ b/navis/core/neurons.py @@ -2637,3 +2637,6 @@ def to_skeleton(self, scale_vec: float = 1) -> TreeNeuron: tn._soma = self._soma return tn + + def __len__(self): + return len(self.points) From 511f7c19d2454d296866b95d28a8dcfdbbc6239f Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Wed, 14 Apr 2021 14:01:55 +0100 Subject: [PATCH 07/22] smat: fixes for tests --- navis/nbl/smat.py | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/navis/nbl/smat.py b/navis/nbl/smat.py index 081921b6..6beae345 100644 --- a/navis/nbl/smat.py +++ b/navis/nbl/smat.py @@ -5,12 +5,15 @@ from collections import Counter from concurrent.futures import ProcessPoolExecutor from typing import Iterator, Sequence, Callable, List, Iterable, Any, Tuple +import logging import numpy as np import pandas as pd from ..core.neurons import Dotprops +logger = logging.getLogger(__name__) + DEFAULT_SEED = 1991 epsilon = sys.float_info.epsilon @@ -37,9 +40,11 @@ def wrap_bounds(arr: np.ndarray, left: float = -np.inf, right: float = np.inf): left : float, optional If the first value is greater than this, prepend -inf; otherwise replace that value with -inf. + Defaults to -inf if None. right : float, optional If the last value is less than this, append inf; otherwise replace that value with inf. + Defaults to inf if None. Returns ------- @@ -51,6 +56,11 @@ def wrap_bounds(arr: np.ndarray, left: float = -np.inf, right: float = np.inf): ValueError If values of ``arr`` are not monotonically increasing. 
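+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> # hypothetical dot product boundaries:
+    >>> # 0 and 1 are replaced by -inf and inf, 0.5 is kept as-is
+    >>> b = wrap_bounds(np.array([0.0, 0.5, 1.0]), left=0, right=1)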
""" + if left is None: + left = -np.inf + if right is None: + right = np.inf + if not np.all(np.diff(arr) > 0): raise ValueError("Boundaries are not monotonically increasing") @@ -178,6 +188,7 @@ def _counts_array(self, idx_pairs, threads=None): threads = threads or cpu_count with ProcessPoolExecutor(threads) as exe: + idx_pairs = np.asarray(idx_pairs, dtype=int) for these_counts in exe.map( self._query_to_idxs, idx_pairs[:, 0], @@ -189,7 +200,7 @@ def _counts_array(self, idx_pairs, threads=None): return counts def _build(self, threads) -> Tuple[List[np.ndarray], np.ndarray]: - matching_pairs = set(self._yield_matching_pairs()) + matching_pairs = sorted(set(self._yield_matching_pairs())) # need to know the eventual distdot count # so we know how many non-matching pairs to draw q_idx_count = Counter(p[0] for p in matching_pairs) @@ -206,7 +217,7 @@ def _build(self, threads) -> Tuple[List[np.ndarray], np.ndarray]: nonmatching_pairs = [] n_nonmatching_dist_dots = 0 while n_nonmatching_dist_dots < n_matching_dist_dots: - idx = self.rng.integers(0, len(all_nonmatching_pairs) + 1) + idx = self.rng.integers(0, len(all_nonmatching_pairs)) nonmatching_pairs.append(all_nonmatching_pairs.pop(idx)) n_nonmatching_dist_dots += len(self.dotprops[nonmatching_pairs[-1][0]]) @@ -215,6 +226,8 @@ def _build(self, threads) -> Tuple[List[np.ndarray], np.ndarray]: # account for there being different total numbers of match/nonmatch dist dots matching_factor = nonmatch_counts.sum() / match_counts.sum() + if np.any(match_counts + nonmatch_counts == 0): + logger.warning("Some lookup cells have no data in them") cells = np.log2( (match_counts * matching_factor + epsilon) / (nonmatch_counts + epsilon) @@ -348,14 +361,11 @@ def __call__(self, *args): f"Lookup takes {len(self.boundaries)} arguments but {len(args)} were given" ) - idxs = np.array( - [np.digitize(r, b) - 1 for r, b in zip(args, self.boundaries)] - ).T + idxs = tuple( + np.digitize(r, b) - 1 for r, b in zip(args, self.boundaries) + ) out = self.cells[idxs] - if np.isscalar(args[0]): - return out[0] - else: - return out + return out def format_boundaries(arr): From b29a2542bff7da882c05c3165e740fd78707e1ed Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Wed, 14 Apr 2021 14:02:05 +0100 Subject: [PATCH 08/22] Unit tests for smat --- tests/test_nbl/__init__.py | 0 tests/test_nbl/test_smat.py | 163 ++++++++++++++++++++++++++++++++++++ 2 files changed, 163 insertions(+) create mode 100644 tests/test_nbl/__init__.py create mode 100644 tests/test_nbl/test_smat.py diff --git a/tests/test_nbl/__init__.py b/tests/test_nbl/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/test_nbl/test_smat.py b/tests/test_nbl/test_smat.py new file mode 100644 index 00000000..0183a1e1 --- /dev/null +++ b/tests/test_nbl/test_smat.py @@ -0,0 +1,163 @@ +from navis.core.neurons import Dotprops +import pytest + +import numpy as np + +from navis.nbl.smat import ( + wrap_bounds, LookupNd, Lookup2d, LookupDistDotBuilder +) + + +SMALLEST_DIM_SIZE = 3 +SEED = 1991 + + +@pytest.mark.parametrize( + ["arr", "left", "right", "expected"], + [ + ([1, 2], -np.inf, np.inf, [-np.inf, 1, 2, np.inf]), + ([-np.inf, 1, 2, np.inf], -np.inf, np.inf, [-np.inf, 1, 2, np.inf]), + ([0, 1, 2, 3], 0, 3, [-np.inf, 1, 2, np.inf]), + ], +) +def test_wrap_bounds(arr, left, right, expected): + assert np.allclose(wrap_bounds(arr, left, right), expected) + + +def test_wrap_bounds_error(): + with pytest.raises(ValueError): + wrap_bounds([1, 2, 1]) + + +def lookup_args(ndim): + f""" + Create 
arguments for an ND lookup table. + + The first dimension is of size {SMALLEST_DIM_SIZE}, + and subsequent dimensions are 1 longer than the previous. + The data in the cells are the sequence from 0 + to the size of the array. + The boundaries are 0 to the length of the dimension, + with the left and rightmost values replaced with -inf and inf respectively. + + Examples + -------- + >>> lookup_args(2) + ( + [ + array([-inf, 1, 2, inf]), + array([-inf, 1, 2, 3, inf]), + ], + array([ + [ 0, 1, 2, 3 ], + [ 4, 5, 6, 7 ], + [ 8, 9, 10, 11 ], + ]), + ) + """ + shape = tuple(range(SMALLEST_DIM_SIZE, SMALLEST_DIM_SIZE + ndim)) + cells = np.arange(np.product(shape)).reshape(shape) + boundaries = [] + for s in shape: + b = np.arange(s + 1, dtype=float) + b[0] = -np.inf + b[-1] = np.inf + boundaries.append(b) + return boundaries, cells + + +def fmt_array(arg): + if np.isscalar(arg): + return str(arg) + else: + return "[" + ",".join(fmt_array(v) for v in arg) + "]" + + +@pytest.mark.parametrize( + ["ndim"], [[1], [2], [3], [4], [5]], ids=lambda x: f"{x}D" +) +@pytest.mark.parametrize( + ["arg"], + [ + (-1000,), + (0,), + (1,), + (1.5,), + (2,), + (1000,), + ([-1000, 0, 1, 1.5, 2, 1000],), + ], + ids=fmt_array, +) +def test_lookupNd(ndim, arg): + lookup = LookupNd(*lookup_args(ndim)) + + args = [arg for _ in range(ndim)] + expected_arr_idx = np.floor([ + np.clip(arg, 0, dim + SMALLEST_DIM_SIZE - 1) for dim in range(ndim) + ]).astype(int) + expected_val = np.ravel_multi_index( + tuple(expected_arr_idx), lookup.cells.shape + ) + + response = lookup(*args) + assert np.all(response == expected_val) + + +@pytest.mark.parametrize(["strict"], [(False,), (True,)]) +def test_lookup2d_roundtrip(strict): + lookup = Lookup2d(*lookup_args(2)) + df = lookup.to_dataframe() + lookup2 = Lookup2d.from_dataframe(df, strict=strict) + assert np.allclose(lookup.cells, lookup2.cells) + for b1, b2 in zip(lookup.boundaries, lookup2.boundaries): + assert np.allclose(b1, b2) + + +def prepare_lookupdistdotbuilder(neurons, alpha=False, k=5): + k = 5 + dotprops = [Dotprops(n.nodes[["x", "y", "z"]], k) for n in neurons] + n_orig = len(dotprops) + + # make jittered copies of these neurons + rng = np.random.default_rng(SEED) + jitter_sigma = 50 + matching_sets = [] + for idx, dp in enumerate(dotprops[:]): + dotprops.append( + Dotprops( + dp.points + rng.normal(0, jitter_sigma, dp.points.shape), k + ) + ) + # assign each neuron its jittered self as a match + matching_sets.append({idx, idx + n_orig}) + + # original neurons should all not match each other + nonmatching = list(range(n_orig)) + + # max distance between any 2 points in the data + # for calculating dist boundaries + max_dist = np.linalg.norm( + np.ptp( + np.concatenate([dp.points for dp in dotprops], axis=0), axis=0, + ) + ) + + return LookupDistDotBuilder( + dotprops, + matching_sets, + np.geomspace(10, max_dist, 5)[:-1], + np.linspace(0, 1, 5), + nonmatching, + alpha, + seed=SEED + 1, + ) + + +@pytest.mark.parametrize(["threads"], [(None,), (0,), (2,)]) +@pytest.mark.parametrize(["alpha"], [(True,), (False,)]) +def test_lookupdistdotbuilder_builds(neurons, threads, alpha): + builder = prepare_lookupdistdotbuilder(neurons, alpha) + lookup = builder.build(threads) + # `pytest -rP` to see output + print(lookup.to_dataframe()) From 72dfe4cd8da5d0fa16185e4309f53ad6793a66de Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Wed, 14 Apr 2021 14:13:34 +0100 Subject: [PATCH 09/22] Do not sort matched pairs --- navis/nbl/smat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
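(Presumably the sort added in the previous commit was redundant: bin
counting is order-independent, non-matching pairs are drawn with a
separately seeded RNG, and sorting fails for unorderable mixed-type keys.)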
diff --git a/navis/nbl/smat.py b/navis/nbl/smat.py index 6beae345..7e1f2d3b 100644 --- a/navis/nbl/smat.py +++ b/navis/nbl/smat.py @@ -200,7 +200,7 @@ def _counts_array(self, idx_pairs, threads=None): return counts def _build(self, threads) -> Tuple[List[np.ndarray], np.ndarray]: - matching_pairs = sorted(set(self._yield_matching_pairs())) + matching_pairs = list(set(self._yield_matching_pairs())) # need to know the eventual distdot count # so we know how many non-matching pairs to draw q_idx_count = Counter(p[0] for p in matching_pairs) From 41ee0d00130c72418f1f9e2b774029b063f06f4d Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Thu, 15 Apr 2021 13:39:11 +0100 Subject: [PATCH 10/22] FutureWarning ScoringFunction, base Nblaster on Lookup2d --- navis/nbl/nblast_funcs.py | 64 +++++++++++++++++++++++++++++---------- 1 file changed, 48 insertions(+), 16 deletions(-) diff --git a/navis/nbl/nblast_funcs.py b/navis/nbl/nblast_funcs.py index ae02324e..b6bb3ec0 100644 --- a/navis/nbl/nblast_funcs.py +++ b/navis/nbl/nblast_funcs.py @@ -15,6 +15,9 @@ import numbers import os +from warnings import warn +import operator +from pathlib import Path import numpy as np import pandas as pd @@ -23,14 +26,15 @@ from typing import Union, Optional from typing_extensions import Literal +from .smat import Lookup2d from .. import core, utils from ..core import NeuronList, Dotprops, make_dotprops from .. import config __all__ = ['nblast', 'nblast_smart', 'nblast_allbyall', 'sim_to_dist'] -fp = os.path.dirname(__file__) -smat_path = os.path.join(fp, 'score_mats') +fp = Path(__file__).resolve().parent +smat_path = fp / 'score_mats' logger = config.logger @@ -40,6 +44,7 @@ class ScoringFunction: def __init__(self, smat): if isinstance(smat, type(None)): + self._deprecation_warning("operator.mul") self.scoring_function = self.pass_through elif isinstance(smat, (pd.DataFrame, str)): self.parse_matrix(smat) @@ -47,6 +52,12 @@ def __init__(self, smat): else: raise TypeError + def _deprecation_warning(self, alternative: str): + warn( + f"{type(self).__qualname__} is deprecated, use {alternative}.", + FutureWarning, + ) + def __call__(self, dist, dot): return self.scoring_function(dist, dot) @@ -63,10 +74,15 @@ def score_lookup(self, dist, dot): def parse_matrix(self, smat): """Parse matrix.""" if isinstance(smat, str): + self._deprecation_warning( + f"{Lookup2d.__qualname__}.from_dataframe(pandas.read_csv(smat, index_col=0))" + ) smat = pd.read_csv(smat, index_col=0) + else: + self._deprecation_warning(f"{Lookup2d.__qualname__}.from_dataframe(smat)") if not isinstance(smat, pd.DataFrame): - raise TypeError(f'Excepted filepath or DataFrame, got "{type(smat)}"') + raise TypeError(f'Expected filepath or DataFrame, got "{type(smat)}"') self.cells = smat.to_numpy() @@ -98,21 +114,28 @@ class NBlaster: changing parameters (e.g. ``use_alpha``) at a later stage will mess things up! + The highly flexible ``smat`` argument converts raw point match parameters + nto a single score representing how good that match is. + This can be any callable which takes 2 floats or same-length numpy arrays + (distance and dot product; if ``use_alpha`` is truthy, + scale dot product by the geometric mean of the alpha colinearity values) + and returns a float or numpy array. + If a ``pandas.DataFrame``, converts this into a ``navis.Lookup2d`` and uses as above. + If path-like, converts this into a dataframe and uses as above. + If ``None``, uses ``operator.mul``. + If ``'auto'`` (default), uses score matrices from FCWB (like R's nat.nblast). 
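+
+    For example, a minimal custom score function
+    (hypothetical; any callable with this signature works):
+
+    >>> def my_score(dist, dot):
+    ...     return dot / (1.0 + dist)
+    >>> blaster = NBlaster(smat=my_score)
+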
+ Parameters ---------- use_alpha : bool Whether or not to use alpha values for the scoring. If True, the dotproduct of nearest neighbor vectors will be scaled by ``sqrt(alpha1 * alpha2)``. - normalzed : bool + normalized : bool If True, will normalize scores by the best possible score (i.e. self-self) of the query neuron. - smat : str | pd.DataFrame - Score matrix. If 'auto' (default), will use scoring matrices - from FCWB. Same behaviour as in R's nat.nblast - implementation. If ``smat=None`` the scores will be - generated as the product of the distances and the dotproduct - of the vectors of nearest-neighbor pairs. + smat : Callable[[float, float], float] | str | os.PathLike | pd.DataFrame + See above for more details. progress : bool If True, will show a progress bar. @@ -124,15 +147,24 @@ def __init__(self, use_alpha=False, normalized=True, smat='auto', progress=True) self.normalized = normalized self.progress = progress - if smat == 'auto': + if smat is None: + smat = operator.mul + elif smat == 'auto': if self.use_alpha: - smat = pd.read_csv(f'{smat_path}/smat_alpha_fcwb.csv', - index_col=0) + smat = smat_path / 'smat_alpha_fcwb.csv' else: - smat = pd.read_csv(f'{smat_path}/smat_fcwb.csv', - index_col=0) + smat = smat_path / 'smat_fcwb.csv' + + if isinstance(smat, (str, os.PathLike)): + smat = pd.read_csv(smat, index_col=0) + + if isinstance(smat, pd.DataFrame): + smat = Lookup2d.from_dataframe(smat) + + if not callable(smat): + raise ValueError("smat should be a callable, a path, a pandas.DataFrame, or 'auto'") - self.score_fn = ScoringFunction(smat) + self.score_fn = smat self.self_hits = [] self.dotprops = [] From cab7e816a4cd609ca34942a21b24a46d0b59e189 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Thu, 15 Apr 2021 14:19:43 +0100 Subject: [PATCH 11/22] Functionally test for valid score function --- navis/nbl/nblast_funcs.py | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/navis/nbl/nblast_funcs.py b/navis/nbl/nblast_funcs.py index b6bb3ec0..b52fb332 100644 --- a/navis/nbl/nblast_funcs.py +++ b/navis/nbl/nblast_funcs.py @@ -23,7 +23,7 @@ import pandas as pd from concurrent.futures import ProcessPoolExecutor -from typing import Union, Optional +from typing import Union, Optional, Callable from typing_extensions import Literal from .smat import Lookup2d @@ -107,8 +107,27 @@ def parse_interval(self, s): return float(s.strip("([])").split(",")[-1]) +score_func_description = """ +NBLAST score functions take 2 floats or numpy arrays of floats of length N +(for matched dotprop points/tangents, distance and dot product; +the latter possibly scaled by the geometric mean of the alpha colinearity values) +and returns a float or N-length numpy array of floats. +""".strip() + + +def is_score_function(fn: Callable[[float, float], float]): + f"""Ensure that the score function is valid for NBLAST. + + {score_func_description} + """ + test_arr = np.array([0.5] * 3) + out = fn(test_arr, test_arr) + + return isinstance(fn(0.5, 0.5), float) and out.shape == test_arr.shape and out.dtype == test_arr.dtype + + class NBlaster: - """Implements version 2 of the NBLAST algorithm. + f"""Implements version 2 of the NBLAST algorithm. Please note that some properties are computed on initialization and changing parameters (e.g. ``use_alpha``) at a later stage will mess things @@ -116,10 +135,8 @@ class NBlaster: The highly flexible ``smat`` argument converts raw point match parameters nto a single score representing how good that match is. 
- This can be any callable which takes 2 floats or same-length numpy arrays - (distance and dot product; if ``use_alpha`` is truthy, - scale dot product by the geometric mean of the alpha colinearity values) - and returns a float or numpy array. + Most simply, it is an NBLAST score function. + {score_func_description} If a ``pandas.DataFrame``, converts this into a ``navis.Lookup2d`` and uses as above. If path-like, converts this into a dataframe and uses as above. If ``None``, uses ``operator.mul``. @@ -164,6 +181,9 @@ def __init__(self, use_alpha=False, normalized=True, smat='auto', progress=True) if not callable(smat): raise ValueError("smat should be a callable, a path, a pandas.DataFrame, or 'auto'") + if not is_score_function(smat): + raise ValueError("smat is not a valid NBLAST score function, see documentation") + self.score_fn = smat self.self_hits = [] From 78f90a859f3fbb9cb06f5821ed5fd73a48456920 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Thu, 15 Apr 2021 14:24:18 +0100 Subject: [PATCH 12/22] Split smat-parsing into function --- navis/nbl/nblast_funcs.py | 61 +++++++++++++++++++++------------------ 1 file changed, 33 insertions(+), 28 deletions(-) diff --git a/navis/nbl/nblast_funcs.py b/navis/nbl/nblast_funcs.py index b52fb332..660df143 100644 --- a/navis/nbl/nblast_funcs.py +++ b/navis/nbl/nblast_funcs.py @@ -126,13 +126,8 @@ def is_score_function(fn: Callable[[float, float], float]): return isinstance(fn(0.5, 0.5), float) and out.shape == test_arr.shape and out.dtype == test_arr.dtype -class NBlaster: - f"""Implements version 2 of the NBLAST algorithm. - - Please note that some properties are computed on initialization and - changing parameters (e.g. ``use_alpha``) at a later stage will mess things - up! - +def parse_score_fn(smat, use_alpha): + f""" The highly flexible ``smat`` argument converts raw point match parameters nto a single score representing how good that match is. Most simply, it is an NBLAST score function. @@ -141,6 +136,36 @@ class NBlaster: If path-like, converts this into a dataframe and uses as above. If ``None``, uses ``operator.mul``. If ``'auto'`` (default), uses score matrices from FCWB (like R's nat.nblast). + """ + if smat is None: + smat = operator.mul + elif smat == 'auto': + if use_alpha: + smat = smat_path / 'smat_alpha_fcwb.csv' + else: + smat = smat_path / 'smat_fcwb.csv' + + if isinstance(smat, (str, os.PathLike)): + smat = pd.read_csv(smat, index_col=0) + + if isinstance(smat, pd.DataFrame): + smat = Lookup2d.from_dataframe(smat) + + if not callable(smat): + raise ValueError("smat should be a callable, a path, a pandas.DataFrame, or 'auto'") + + if not is_score_function(smat): + raise ValueError("smat is not a valid NBLAST score function, see documentation") + + +class NBlaster: + f"""Implements version 2 of the NBLAST algorithm. + + Please note that some properties are computed on initialization and + changing parameters (e.g. ``use_alpha``) at a later stage will mess things + up! 
+ + {parse_score_fn.__doc__} Parameters ---------- @@ -164,27 +189,7 @@ def __init__(self, use_alpha=False, normalized=True, smat='auto', progress=True) self.normalized = normalized self.progress = progress - if smat is None: - smat = operator.mul - elif smat == 'auto': - if self.use_alpha: - smat = smat_path / 'smat_alpha_fcwb.csv' - else: - smat = smat_path / 'smat_fcwb.csv' - - if isinstance(smat, (str, os.PathLike)): - smat = pd.read_csv(smat, index_col=0) - - if isinstance(smat, pd.DataFrame): - smat = Lookup2d.from_dataframe(smat) - - if not callable(smat): - raise ValueError("smat should be a callable, a path, a pandas.DataFrame, or 'auto'") - - if not is_score_function(smat): - raise ValueError("smat is not a valid NBLAST score function, see documentation") - - self.score_fn = smat + self.score_fn = parse_score_fn(smat, use_alpha) self.self_hits = [] self.dotprops = [] From 3641ac589a9f5739825013ad013d4e66706b073c Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Thu, 15 Apr 2021 14:38:44 +0100 Subject: [PATCH 13/22] Refactor smat-checking --- navis/nbl/nblast_funcs.py | 84 +++++++++++++++++++-------------------- 1 file changed, 41 insertions(+), 43 deletions(-) diff --git a/navis/nbl/nblast_funcs.py b/navis/nbl/nblast_funcs.py index 660df143..b572a3ee 100644 --- a/navis/nbl/nblast_funcs.py +++ b/navis/nbl/nblast_funcs.py @@ -115,57 +115,21 @@ def parse_interval(self, s): """.strip() -def is_score_function(fn: Callable[[float, float], float]): - f"""Ensure that the score function is valid for NBLAST. - - {score_func_description} - """ - test_arr = np.array([0.5] * 3) - out = fn(test_arr, test_arr) - - return isinstance(fn(0.5, 0.5), float) and out.shape == test_arr.shape and out.dtype == test_arr.dtype +class NBlaster: + f"""Implements version 2 of the NBLAST algorithm. + Please note that some properties are computed on initialization and + changing parameters (e.g. ``use_alpha``) at a later stage will mess things + up! -def parse_score_fn(smat, use_alpha): - f""" The highly flexible ``smat`` argument converts raw point match parameters - nto a single score representing how good that match is. + into a single score representing how good that match is. Most simply, it is an NBLAST score function. {score_func_description} If a ``pandas.DataFrame``, converts this into a ``navis.Lookup2d`` and uses as above. If path-like, converts this into a dataframe and uses as above. If ``None``, uses ``operator.mul``. If ``'auto'`` (default), uses score matrices from FCWB (like R's nat.nblast). - """ - if smat is None: - smat = operator.mul - elif smat == 'auto': - if use_alpha: - smat = smat_path / 'smat_alpha_fcwb.csv' - else: - smat = smat_path / 'smat_fcwb.csv' - - if isinstance(smat, (str, os.PathLike)): - smat = pd.read_csv(smat, index_col=0) - - if isinstance(smat, pd.DataFrame): - smat = Lookup2d.from_dataframe(smat) - - if not callable(smat): - raise ValueError("smat should be a callable, a path, a pandas.DataFrame, or 'auto'") - - if not is_score_function(smat): - raise ValueError("smat is not a valid NBLAST score function, see documentation") - - -class NBlaster: - f"""Implements version 2 of the NBLAST algorithm. - - Please note that some properties are computed on initialization and - changing parameters (e.g. ``use_alpha``) at a later stage will mess things - up! 
- - {parse_score_fn.__doc__} Parameters ---------- @@ -189,11 +153,45 @@ def __init__(self, use_alpha=False, normalized=True, smat='auto', progress=True) self.normalized = normalized self.progress = progress - self.score_fn = parse_score_fn(smat, use_alpha) + self.score_fn = self._parse_score_fn(smat) self.self_hits = [] self.dotprops = [] + def _parse_score_fn(self, smat): + if smat is None: + smat = operator.mul + elif smat == 'auto': + if self.use_alpha: + smat = smat_path / 'smat_alpha_fcwb.csv' + else: + smat = smat_path / 'smat_fcwb.csv' + + if isinstance(smat, (str, os.PathLike)): + smat = pd.read_csv(smat, index_col=0) + + if isinstance(smat, pd.DataFrame): + smat = Lookup2d.from_dataframe(smat) + + if not callable(smat): + raise ValueError("smat should be a callable, a path, a pandas.DataFrame, or 'auto'") + + if not isinstance(smat(0.5, 0.5), float): + raise ValueError("smat does not take 2 floats and return a float") + + test_arr = np.array([0.5] * 3) + try: + out = smat(test_arr, test_arr) + except Exception as e: + raise ValueError(f"Failed to use smat with numpy arrays: {e}") + + if out.shape != test_arr.shape: + raise ValueError( + f"smat produced inconsistent shape: input {test_arr.shape}; output {out.shape}" + ) + + return smat + def append(self, dotprops): """Append dotprops.""" if isinstance(dotprops, (NeuronList, list)): From 982c69ce5a997ef429823500a92dd4d3f024717e Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Thu, 15 Apr 2021 18:32:46 +0100 Subject: [PATCH 14/22] Refactor LookupDistDotBuilder - Use composition instead of inheritance to allow boundaries to be defined later - Add (questionable) helper methods for determining boundaries --- navis/nbl/smat.py | 135 ++++++++++++++++++++++++++++++++---- tests/test_nbl/test_smat.py | 42 +++++++---- 2 files changed, 148 insertions(+), 29 deletions(-) diff --git a/navis/nbl/smat.py b/navis/nbl/smat.py index 7e1f2d3b..2f79cc26 100644 --- a/navis/nbl/smat.py +++ b/navis/nbl/smat.py @@ -5,15 +5,13 @@ from collections import Counter from concurrent.futures import ProcessPoolExecutor from typing import Iterator, Sequence, Callable, List, Iterable, Any, Tuple -import logging import numpy as np import pandas as pd +from ..config import logger from ..core.neurons import Dotprops -logger = logging.getLogger(__name__) - DEFAULT_SEED = 1991 epsilon = sys.float_info.epsilon @@ -276,13 +274,11 @@ def dist_dot_alpha(q: Dotprops, t: Dotprops): return [dist, dot * np.sqrt(alpha)] -class LookupDistDotBuilder(LookupNdBuilder): +class LookupDistDotBuilder: def __init__( self, dotprops, matching_sets, - dist_boundaries, - dot_boundaries, nonmatching=None, use_alpha=False, seed=DEFAULT_SEED, @@ -295,6 +291,10 @@ def __init__( 2. The dot products of direction vectors around those points, optionally scaled by the colinearity ``alpha``. + The boundaries for the bins for each of these two values need to be set. + Use the ``.calc_(dist|dot)_boundaries`` methods to estimate values + based on the data, or the ``.set_(dist|dot)_boundaries`` to use your own. 
+ Parameters ---------- dotprops : dict or list of Dotprops @@ -334,18 +334,123 @@ def __init__( Non-matching pairs are drawn at random using this seed, by default {DEFAULT_SEED} """ - fn = dist_dot_alpha if use_alpha else dist_dot - super().__init__( - dotprops, - matching_sets, - [wrap_bounds(dist_boundaries, 0), wrap_bounds(dot_boundaries, 0, 1)], - fn, - nonmatching, - seed, + self.dotprops = dotprops + self.matching_sets = matching_sets + self._dist_boundaries = None + self._dot_boundaries = None + self.fn = dist_dot_alpha if use_alpha else dist_dot + self.nonmatching = nonmatching + self.seed = seed + + @property + def dist_boundaries(self): + return self._dist_boundaries + + def set_dist_boundaries(self, bounds): + """Set the boundaries for point match distance bins. + + ``numpy.geomspace`` is helpful if you know approximately + what the upper bound of the lowest bin and the lower bound of the + highest bin should be. + + Parameters + ---------- + bounds : array-like of float + Bin boundaries to set (may be expanded to cover full domain). + Should not go below 0. + + Returns + ------- + np.ndarray + The (possibly expanded) bin boundaries which will be used. + """ + self._dist_boundaries = wrap_bounds(bounds, 0) + return self._dist_boundaries + + def calc_dist_boundaries(self, n_bins, scale=10): + """Estimate appropriate point match distance bins from the scale of the data. + + The lower bound of the highest bin will be estimated as ``1/scale`` the size + of the diagonal of the axis aligned bounding box of the smallest dotprop: + there is unlikely to be any value in a point match + at a distance the same order of magnitude as a neuron's size. + + Each subsequent lower bound will be half of that bin's upper bound; + therefore the upper bound of the lowest bin will be + ``highest_lower / scale ** (n_bins - 1)``. + + These estimates are quite conservative and may not yield good results for your use case, + but you may want to use them as a guide for your own automatically-determined boundaries. + + Parameters + ---------- + n_bins : int + Number of bins to create. + scale : float, optional + The multiplication from one bound to another, default 10. + + Returns + ------- + np.ndarray + The bin boundaries which will be used (length ``n_bins + 1``). + """ + + smallest_diag = np.inf + for dp in self.dotprops: + smallest_diag = min( + smallest_diag, np.linalg.norm(np.ptp(dp.points, axis=0)) + ) + highest_lower = smallest_diag / scale + return self.set_dist_boundaries( + highest_lower / scale ** np.arange(n_bins - 1)[::-1] ) + @property + def dot_boundaries(self): + return self._dot_boundaries + + def set_dot_boundaries(self, bounds): + """Set the boundaries for point match tangent dot product bins. + + Parameters + ---------- + bounds : array-like of float + Bin boundaries to set (may be expanded to cover full domain). + Should not go below 0 or above 1. + + Returns + ------- + np.ndarray + The (possibly expanded) bin boundaries which will be used. + """ + self._dot_boundaries = wrap_bounds(bounds, 0, 1) + return self._dot_boundaries + + def calc_dot_boundaries(self, n_bins): + """Linearly interpolate bin boundaries between 0 and 1. + + Parameters + ---------- + n_bins : int + Number of bins to use. 
+ + Returns + ------- + np.ndarray + Bin boundaries which will be used (length ``n_bins + 1``) + """ + return self.set_dot_boundaries(np.linspace(0, 1, n_bins + 1)) + def build(self, threads=None) -> Lookup2d: - return Lookup2d(*self._build(threads)) + builder = LookupNdBuilder( + self.dotprops, + self.matching_sets, + [self.dist_boundaries, self.dot_boundaries], + self.fn, + self.nonmatching, + self.seed + ) + return Lookup2d(*builder._build(threads)) class LookupNd: diff --git a/tests/test_nbl/test_smat.py b/tests/test_nbl/test_smat.py index 0183a1e1..daf2951e 100644 --- a/tests/test_nbl/test_smat.py +++ b/tests/test_nbl/test_smat.py @@ -114,8 +114,27 @@ def test_lookup2d_roundtrip(strict): assert np.allclose(b1, b2) -def prepare_lookupdistdotbuilder(neurons, alpha=False, k=5): - k = 5 +def test_calc_dist_boundaries(neurons): + dotprops = [Dotprops(n.nodes[["x", "y", "z"]], k=5) for n in neurons] + builder = LookupDistDotBuilder(dotprops, []) + n_bins = 5 + bounds = builder.calc_dist_boundaries(n_bins) + assert bounds.shape == (n_bins + 1,) + assert bounds[0] == -np.inf + assert bounds[-1] == np.inf + + +def test_calc_dot_boundaries(neurons): + dotprops = [Dotprops(n.nodes[["x", "y", "z"]], k=5) for n in neurons] + builder = LookupDistDotBuilder(dotprops, []) + n_bins = 5 + bounds = builder.calc_dot_boundaries(n_bins) + assert bounds.shape == (n_bins + 1,) + assert bounds[0] == -np.inf + assert bounds[-1] == np.inf + + +def prepare_lookupdistdotbuilder(neurons, alpha=False, k=5, with_bounds=False): dotprops = [Dotprops(n.nodes[["x", "y", "z"]], k) for n in neurons] n_orig = len(dotprops) @@ -135,29 +154,24 @@ def prepare_lookupdistdotbuilder(neurons, alpha=False, k=5): # original neurons should all not match each other nonmatching = list(range(n_orig)) - # max distance between any 2 points in the data - # for calculating dist boundaries - max_dist = np.linalg.norm( - np.ptp( - np.concatenate([dp.points for dp in dotprops], axis=0), axis=0, - ) - ) - - return LookupDistDotBuilder( + builder = LookupDistDotBuilder( dotprops, matching_sets, - np.geomspace(10, max_dist, 5)[:-1], - np.linspace(0, 1, 5), nonmatching, alpha, seed=SEED + 1, ) + if with_bounds: + n_bins = 5 + builder.calc_dist_boundaries(n_bins) + builder.calc_dot_boundaries(n_bins) + return builder @pytest.mark.parametrize(["threads"], [(None,), (0,), (2,)]) @pytest.mark.parametrize(["alpha"], [(True,), (False,)]) def test_lookupdistdotbuilder_builds(neurons, threads, alpha): - builder = prepare_lookupdistdotbuilder(neurons, alpha) + builder = prepare_lookupdistdotbuilder(neurons, alpha, with_bounds=True) lookup = builder.build(threads) # `pytest -rP` to see output print(lookup.to_dataframe()) From 268d81d67ccc66923c21cce980287d2fd743782c Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Fri, 16 Apr 2021 16:02:08 +0100 Subject: [PATCH 15/22] Revert "Refactor LookupDistDotBuilder" This reverts commit 779616f769f8b7fd0360848601926883117a228d. 
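
The restored interface takes both sets of bin boundaries up front
rather than via setter/estimator methods. For illustration only (not
part of this change), a minimal sketch of the restored usage, assuming
`dotprops` and `matching_sets` have been prepared as in the tests, with
boundary values that are purely illustrative:

    import numpy as np
    from navis.nbl.smat import LookupDistDotBuilder

    builder = LookupDistDotBuilder(
        dotprops,                        # indexable sequence of Dotprops
        matching_sets,                   # sets of indices into dotprops
        np.geomspace(10, 1000, 5)[:-1],  # dist boundaries (values illustrative)
        np.linspace(0, 1, 5),            # dot product boundaries
        use_alpha=False,
    )
    lookup = builder.build(threads=None)  # serial; pass an int to parallelise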
--- navis/nbl/smat.py | 135 ++++-------------------------------- tests/test_nbl/test_smat.py | 42 ++++------- 2 files changed, 29 insertions(+), 148 deletions(-) diff --git a/navis/nbl/smat.py b/navis/nbl/smat.py index 2f79cc26..7e1f2d3b 100644 --- a/navis/nbl/smat.py +++ b/navis/nbl/smat.py @@ -5,13 +5,15 @@ from collections import Counter from concurrent.futures import ProcessPoolExecutor from typing import Iterator, Sequence, Callable, List, Iterable, Any, Tuple +import logging import numpy as np import pandas as pd -from ..config import logger from ..core.neurons import Dotprops +logger = logging.getLogger(__name__) + DEFAULT_SEED = 1991 epsilon = sys.float_info.epsilon @@ -274,11 +276,13 @@ def dist_dot_alpha(q: Dotprops, t: Dotprops): return [dist, dot * np.sqrt(alpha)] -class LookupDistDotBuilder: +class LookupDistDotBuilder(LookupNdBuilder): def __init__( self, dotprops, matching_sets, + dist_boundaries, + dot_boundaries, nonmatching=None, use_alpha=False, seed=DEFAULT_SEED, @@ -291,10 +295,6 @@ def __init__( 2. The dot products of direction vectors around those points, optionally scaled by the colinearity ``alpha``. - The boundaries for the bins for each of these two values need to be set. - Use the ``.calc_(dist|dot)_boundaries`` methods to estimate values - based on the data, or the ``.set_(dist|dot)_boundaries`` to use your own. - Parameters ---------- dotprops : dict or list of Dotprops @@ -334,123 +334,18 @@ def __init__( Non-matching pairs are drawn at random using this seed, by default {DEFAULT_SEED} """ - self.dotprops = dotprops - self.matching_sets = matching_sets - self._dist_boundaries = None - self._dot_boundaries = None - self.fn = dist_dot_alpha if use_alpha else dist_dot - self.nonmatching = nonmatching - self.seed = seed - - @property - def dist_boundaries(self): - return self._dist_boundaries - - def set_dist_boundaries(self, bounds): - """Set the boundaries for point match distance bins. - - ``numpy.geomspace`` is helpful if you know approximately - what the upper bound of the lowest bin and the lower bound of the - highest bin should be. - - Parameters - ---------- - bounds : array-like of float - Bin boundaries to set (may be expanded to cover full domain). - Should not go below 0. - - Returns - ------- - np.ndarray - The (possibly expanded) bin boundaries which will be used. - """ - self._dist_boundaries = wrap_bounds(bounds, 0) - return self._dist_boundaries - - def calc_dist_boundaries(self, n_bins, scale=10): - """Estimate appropriate point match distance bins from the scale of the data. - - The lower bound of the highest bin will be estimated as ``1/scale`` the size - of the diagonal of the axis aligned bounding box of the smallest dotprop: - there is unlikely to be any value in a point match - at a distance the same order of magnitude as a neuron's size. - - Each subsequent lower bound will be half of that bin's upper bound; - therefore the upper bound of the lowest bin will be - ``highest_lower / scale ** (n_bins - 1)``. - - These estimates are quite conservative and may not yield good results for your use case, - but you may want to use them as a guide for your own automatically-determined boundaries. - - Parameters - ---------- - n_bins : int - Number of bins to create. - scale : float, optional - The multiplication from one bound to another, default 10. - - Returns - ------- - np.ndarray - The bin boundaries which will be used (length ``n_bins + 1``). 
- """ - - smallest_diag = np.inf - for dp in self.dotprops: - smallest_diag = min( - smallest_diag, np.linalg.norm(np.ptp(dp.points, axis=0)) - ) - highest_lower = smallest_diag / scale - return self.set_dist_boundaries( - highest_lower / scale ** np.arange(n_bins - 1)[::-1] + fn = dist_dot_alpha if use_alpha else dist_dot + super().__init__( + dotprops, + matching_sets, + [wrap_bounds(dist_boundaries, 0), wrap_bounds(dot_boundaries, 0, 1)], + fn, + nonmatching, + seed, ) - @property - def dot_boundaries(self): - return self._dot_boundaries - - def set_dot_boundaries(self, bounds): - """Set the boundaries for point match tangent dot product bins. - - Parameters - ---------- - bounds : array-like of float - Bin boundaries to set (may be expanded to cover full domain). - Should not go below 0 or above 1. - - Returns - ------- - np.ndarray - The (possibly expanded) bin boundaries which will be used. - """ - self._dot_boundaries = wrap_bounds(bounds, 0, 1) - return self._dot_boundaries - - def calc_dot_boundaries(self, n_bins): - """Linearly interpolate bin boundaries between 0 and 1. - - Parameters - ---------- - n_bins : int - Number of bins to use. - - Returns - ------- - np.ndarray - Bin boundaries which will be used (length ``n_bins + 1``) - """ - return self.set_dot_boundaries(np.linspace(0, 1, n_bins + 1)) - def build(self, threads=None) -> Lookup2d: - builder = LookupNdBuilder( - self.dotprops, - self.matching_sets, - [self.dist_boundaries, self.dot_boundaries], - self.fn, - self.nonmatching, - self.seed - ) - return Lookup2d(*builder._build(threads)) + return Lookup2d(*self._build(threads)) class LookupNd: diff --git a/tests/test_nbl/test_smat.py b/tests/test_nbl/test_smat.py index daf2951e..0183a1e1 100644 --- a/tests/test_nbl/test_smat.py +++ b/tests/test_nbl/test_smat.py @@ -114,27 +114,8 @@ def test_lookup2d_roundtrip(strict): assert np.allclose(b1, b2) -def test_calc_dist_boundaries(neurons): - dotprops = [Dotprops(n.nodes[["x", "y", "z"]], k=5) for n in neurons] - builder = LookupDistDotBuilder(dotprops, []) - n_bins = 5 - bounds = builder.calc_dist_boundaries(n_bins) - assert bounds.shape == (n_bins + 1,) - assert bounds[0] == -np.inf - assert bounds[-1] == np.inf - - -def test_calc_dot_boundaries(neurons): - dotprops = [Dotprops(n.nodes[["x", "y", "z"]], k=5) for n in neurons] - builder = LookupDistDotBuilder(dotprops, []) - n_bins = 5 - bounds = builder.calc_dot_boundaries(n_bins) - assert bounds.shape == (n_bins + 1,) - assert bounds[0] == -np.inf - assert bounds[-1] == np.inf - - -def prepare_lookupdistdotbuilder(neurons, alpha=False, k=5, with_bounds=False): +def prepare_lookupdistdotbuilder(neurons, alpha=False, k=5): + k = 5 dotprops = [Dotprops(n.nodes[["x", "y", "z"]], k) for n in neurons] n_orig = len(dotprops) @@ -154,24 +135,29 @@ def prepare_lookupdistdotbuilder(neurons, alpha=False, k=5, with_bounds=False): # original neurons should all not match each other nonmatching = list(range(n_orig)) - builder = LookupDistDotBuilder( + # max distance between any 2 points in the data + # for calculating dist boundaries + max_dist = np.linalg.norm( + np.ptp( + np.concatenate([dp.points for dp in dotprops], axis=0), axis=0, + ) + ) + + return LookupDistDotBuilder( dotprops, matching_sets, + np.geomspace(10, max_dist, 5)[:-1], + np.linspace(0, 1, 5), nonmatching, alpha, seed=SEED + 1, ) - if with_bounds: - n_bins = 5 - builder.calc_dist_boundaries(n_bins) - builder.calc_dot_boundaries(n_bins) - return builder @pytest.mark.parametrize(["threads"], [(None,), (0,), (2,)]) 
@pytest.mark.parametrize(["alpha"], [(True,), (False,)]) def test_lookupdistdotbuilder_builds(neurons, threads, alpha): - builder = prepare_lookupdistdotbuilder(neurons, alpha, with_bounds=True) + builder = prepare_lookupdistdotbuilder(neurons, alpha) lookup = builder.build(threads) # `pytest -rP` to see output print(lookup.to_dataframe()) From 32b756df5990b3eba5d61359f8154248fc02d8ba Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Fri, 16 Apr 2021 16:17:40 +0100 Subject: [PATCH 16/22] smat: minor refactor --- navis/nbl/smat.py | 44 +++++++++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/navis/nbl/smat.py b/navis/nbl/smat.py index 7e1f2d3b..e60a07d4 100644 --- a/navis/nbl/smat.py +++ b/navis/nbl/smat.py @@ -90,7 +90,7 @@ def __init__( dotprops, matching_sets, boundaries: Sequence[Sequence[float]], - fn: Callable[[Dotprops, Dotprops], List[np.ndarray]], + match_fn: Callable[[Dotprops, Dotprops], List[np.ndarray]], nonmatching=None, seed=DEFAULT_SEED, ) -> None: @@ -114,7 +114,7 @@ def __init__( -inf and inf will be prepended and appended. See the ``wrap_bounds`` convenience function. - fn : Callable[[Dotprops, Dotprops], List[np.ndarray[float]]] + match_fn : Callable[[Dotprops, Dotprops], List[np.ndarray[float]]] Function taking 2 arguments, both instances of ``navis.core.neurons.Dotprops``, and returning a list of 1D ``numpy.ndarray``s of floats. @@ -136,7 +136,7 @@ def __init__( self.dotprops = dotprops self.matching_sets = matching_sets self._nonmatching = nonmatching - self.fn = fn + self.match_fn = match_fn self.boundaries = [wrap_bounds(b) for b in boundaries] self.rng = np.random.default_rng(seed) @@ -167,7 +167,7 @@ def _empty_counts(self): return np.zeros(shape, int) def _query_to_idxs(self, q_idx, t_idx, counts=None): - response = self.fn(self.dotprops[q_idx], self.dotprops[t_idx]) + response = self.match_fn(self.dotprops[q_idx], self.dotprops[t_idx]) idxs = [np.digitize(r, b) - 1 for r, b in zip(response, self.boundaries)] if counts is None: @@ -199,15 +199,7 @@ def _counts_array(self, idx_pairs, threads=None): return counts - def _build(self, threads) -> Tuple[List[np.ndarray], np.ndarray]: - matching_pairs = list(set(self._yield_matching_pairs())) - # need to know the eventual distdot count - # so we know how many non-matching pairs to draw - q_idx_count = Counter(p[0] for p in matching_pairs) - n_matching_dist_dots = sum( - len(self.dotprops[q_idx]) * n_reps for q_idx, n_reps in q_idx_count.items() - ) - + def _pick_nonmatching_pairs(self, n_matching_qual_vals): # pre-calculating which pairs we're going to use, # rather than drawing them as we need them, # means that we can parallelise the later step more effectively. 
@@ -215,11 +207,25 @@ def _build(self, threads) -> Tuple[List[np.ndarray], np.ndarray]:
         # because of how long distdot calculation will take
         all_nonmatching_pairs = list(self._yield_nonmatching_pairs())
         nonmatching_pairs = []
-        n_nonmatching_dist_dots = 0
-        while n_nonmatching_dist_dots < n_matching_dist_dots:
+        n_nonmatching_qual_vals = 0
+        while n_nonmatching_qual_vals < n_matching_qual_vals:
             idx = self.rng.integers(0, len(all_nonmatching_pairs))
-            nonmatching_pairs.append(all_nonmatching_pairs.pop(idx))
-            n_nonmatching_dist_dots += len(self.dotprops[nonmatching_pairs[-1][0]])
+            nonmatching_pair = all_nonmatching_pairs.pop(idx)
+            nonmatching_pairs.append(nonmatching_pair)
+            n_nonmatching_qual_vals += len(self.dotprops[nonmatching_pair[0]])
+
+        return nonmatching_pairs
+
+    def _build(self, threads) -> Tuple[List[np.ndarray], np.ndarray]:
+        matching_pairs = list(set(self._yield_matching_pairs()))
+        # need to know the eventual number of match_fn output values
+        # so we know how many non-matching pairs to draw
+        q_idx_count = Counter(p[0] for p in matching_pairs)
+        n_matching_qual_vals = sum(
+            len(self.dotprops[q_idx]) * n_reps for q_idx, n_reps in q_idx_count.items()
+        )
+
+        nonmatching_pairs = self._pick_nonmatching_pairs(n_matching_qual_vals)
 
         match_counts = self._counts_array(matching_pairs, threads)
         nonmatch_counts = self._counts_array(nonmatching_pairs, threads)
@@ -334,12 +340,12 @@ def __init__(
             Non-matching pairs are drawn at random using this seed,
             by default {DEFAULT_SEED}
         """
-        fn = dist_dot_alpha if use_alpha else dist_dot
+        match_fn = dist_dot_alpha if use_alpha else dist_dot
         super().__init__(
             dotprops,
             matching_sets,
             [wrap_bounds(dist_boundaries, 0), wrap_bounds(dot_boundaries, 0, 1)],
-            fn,
+            match_fn,
             nonmatching,
             seed,
         )

From c9b6715a4640aeaca7885695a2b71898285cfa5f Mon Sep 17 00:00:00 2001
From: Chris Barnes
Date: Thu, 29 Apr 2021 11:00:03 +0100
Subject: [PATCH 17/22] Minor refactor

---
 navis/nbl/smat.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/navis/nbl/smat.py b/navis/nbl/smat.py
index e60a07d4..72c3ab69 100644
--- a/navis/nbl/smat.py
+++ b/navis/nbl/smat.py
@@ -186,14 +186,15 @@ def _counts_array(self, idx_pairs, threads=None):
             return counts
 
         threads = threads or cpu_count
+        idx_pairs = np.asarray(idx_pairs, dtype=int)
+        chunks = chunksize(len(idx_pairs), threads)
 
         with ProcessPoolExecutor(threads) as exe:
-            idx_pairs = np.asarray(idx_pairs, dtype=int)
             for these_counts in exe.map(
                 self._query_to_idxs,
                 idx_pairs[:, 0],
                 idx_pairs[:, 1],
-                chunksize=chunksize(len(idx_pairs), threads),
+                chunksize=chunks,
             ):
                 counts += these_counts
 

From cb3a64b0218f63afcda6480dc01b395ebf7d62be Mon Sep 17 00:00:00 2001
From: Chris Barnes
Date: Thu, 29 Apr 2021 11:34:11 +0100
Subject: [PATCH 18/22] Re-order NBLAST builder tests to avoid deadlock

The tests are not run in parallel with each other so there shouldn't be
any OS-level interference. I also tested without session-scoped test
fixtures.

The deadlock occurs on the first parallelised test which runs directly
after a non-parallelised test - by re-ordering the tests, the deadlock
is avoided.
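
This looks like the classic hazard of the default "fork" start method:
a worker forked while some lock happens to be held inherits that lock
in its locked state forever. Should the deadlock resurface, a possible
mitigation (a sketch only, not applied here) would be to build the pool
from a "spawn" context so workers start fresh; `map_counts` is a
hypothetical stand-in for the executor usage in `_counts_array`:

    import multiprocessing as mp
    from concurrent.futures import ProcessPoolExecutor

    def map_counts(fn, queries, targets, threads, chunks):
        # "spawn" starts workers in fresh interpreters, so they cannot
        # inherit a lock that happened to be held at fork time;
        # fn must be importable/picklable for this to work
        ctx = mp.get_context("spawn")
        with ProcessPoolExecutor(threads, mp_context=ctx) as exe:
            return list(exe.map(fn, queries, targets, chunksize=chunks))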
--- tests/test_nbl/test_smat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_nbl/test_smat.py b/tests/test_nbl/test_smat.py index 0183a1e1..f47527f0 100644 --- a/tests/test_nbl/test_smat.py +++ b/tests/test_nbl/test_smat.py @@ -154,8 +154,8 @@ def prepare_lookupdistdotbuilder(neurons, alpha=False, k=5): ) -@pytest.mark.parametrize(["threads"], [(None,), (0,), (2,)]) @pytest.mark.parametrize(["alpha"], [(True,), (False,)]) +@pytest.mark.parametrize(["threads"], [(0,), (2,), (None,)]) def test_lookupdistdotbuilder_builds(neurons, threads, alpha): builder = prepare_lookupdistdotbuilder(neurons, alpha) lookup = builder.build(threads) From e9e837a9341db6aa53a352f20c713c338783882f Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Thu, 29 Apr 2021 13:16:26 +0100 Subject: [PATCH 19/22] Significant refactors --- navis/nbl/nblast_funcs.py | 104 +++++++++-------------------- navis/nbl/smat.py | 135 +++++++++++++++++++++++++++++++++++--- 2 files changed, 157 insertions(+), 82 deletions(-) diff --git a/navis/nbl/nblast_funcs.py b/navis/nbl/nblast_funcs.py index b572a3ee..4eaf2bf4 100644 --- a/navis/nbl/nblast_funcs.py +++ b/navis/nbl/nblast_funcs.py @@ -16,31 +16,38 @@ import numbers import os from warnings import warn -import operator -from pathlib import Path import numpy as np import pandas as pd from concurrent.futures import ProcessPoolExecutor -from typing import Union, Optional, Callable +from typing import Union, Optional from typing_extensions import Literal -from .smat import Lookup2d +from .smat import Lookup2d, parse_score_fn, SCORE_FN_DESCR from .. import core, utils from ..core import NeuronList, Dotprops, make_dotprops from .. import config __all__ = ['nblast', 'nblast_smart', 'nblast_allbyall', 'sim_to_dist'] -fp = Path(__file__).resolve().parent -smat_path = fp / 'score_mats' logger = config.logger class ScoringFunction: - """Class representing scoring function.""" + """[DEPRECATED] Class representing scoring function. + + Use a ``navis.nbl.smat.Lookup2d`` for score functions + like those used in Costa et al. 2016 + (see the ``.from_dataframe()`` class method + to load it from pandas). + The actual score matrices from FCWB can be accessed with + ``navis.nbl.smat.smat_fcwb()``. + + Otherwise, use any compatible callable, + e.g. ``operator.mul`` (used by this class for ``smat=None``). + """ def __init__(self, smat): if isinstance(smat, type(None)): @@ -107,14 +114,6 @@ def parse_interval(self, s): return float(s.strip("([])").split(",")[-1]) -score_func_description = """ -NBLAST score functions take 2 floats or numpy arrays of floats of length N -(for matched dotprop points/tangents, distance and dot product; -the latter possibly scaled by the geometric mean of the alpha colinearity values) -and returns a float or N-length numpy array of floats. -""".strip() - - class NBlaster: f"""Implements version 2 of the NBLAST algorithm. @@ -125,11 +124,7 @@ class NBlaster: The highly flexible ``smat`` argument converts raw point match parameters into a single score representing how good that match is. Most simply, it is an NBLAST score function. - {score_func_description} - If a ``pandas.DataFrame``, converts this into a ``navis.Lookup2d`` and uses as above. - If path-like, converts this into a dataframe and uses as above. - If ``None``, uses ``operator.mul``. - If ``'auto'`` (default), uses score matrices from FCWB (like R's nat.nblast). 
+    {SCORE_FN_DESCR}
 
     Parameters
     ----------
@@ -140,8 +135,8 @@ class NBlaster:
         normalized : bool
             If True, will normalize scores by the best possible score
             (i.e. self-self) of the query neuron.
-        smat : Callable[[float, float], float] | str | os.PathLike | pd.DataFrame
-            See above for more details.
+        smat : Callable[[float, float], float] | str | os.PathLike | pd.DataFrame, default "auto"
+            See ``navis.nbl.smat.parse_score_fn``.
         progress : bool
             If True, will show a progress bar.
 
@@ -153,45 +148,11 @@ def __init__(self, use_alpha=False, normalized=True, smat='auto', progress=True)
         self.normalized = normalized
         self.progress = progress
 
-        self.score_fn = self._parse_score_fn(smat)
+        self.score_fn = parse_score_fn(smat, use_alpha)
 
         self.self_hits = []
         self.dotprops = []
 
-    def _parse_score_fn(self, smat):
-        if smat is None:
-            smat = operator.mul
-        elif smat == 'auto':
-            if self.use_alpha:
-                smat = smat_path / 'smat_alpha_fcwb.csv'
-            else:
-                smat = smat_path / 'smat_fcwb.csv'
-
-        if isinstance(smat, (str, os.PathLike)):
-            smat = pd.read_csv(smat, index_col=0)
-
-        if isinstance(smat, pd.DataFrame):
-            smat = Lookup2d.from_dataframe(smat)
-
-        if not callable(smat):
-            raise ValueError("smat should be a callable, a path, a pandas.DataFrame, or 'auto'")
-
-        if not isinstance(smat(0.5, 0.5), float):
-            raise ValueError("smat does not take 2 floats and return a float")
-
-        test_arr = np.array([0.5] * 3)
-        try:
-            out = smat(test_arr, test_arr)
-        except Exception as e:
-            raise ValueError(f"Failed to use smat with numpy arrays: {e}")
-
-        if out.shape != test_arr.shape:
-            raise ValueError(
-                f"smat produced inconsistent shape: input {test_arr.shape}; output {out.shape}"
-            )
-
-        return smat
-
     def append(self, dotprops):
         """Append dotprops."""
         if isinstance(dotprops, (NeuronList, list)):
@@ -412,12 +373,8 @@ def nblast_smart(query: Union['core.TreeNeuron', 'core.NeuronList', 'core.Dotpro
                         that have lots of branches.
     normalized :        bool, optional
                         Whether to return normalized NBLAST scores.
-    smat :              str | pd.DataFrame
-                        Score matrix. If 'auto' (default), will use scoring matrices
-                        from FCWB. Same behaviour as in R's nat.nblast
-                        implementation. If ``smat=None`` the scores will be
-                        generated as the product of the distances and the dotproduct
-                        of the vectors of nearest-neighbor pairs.
+    smat :              str | pd.DataFrame, default "auto"
+                        Score matrix. See ``navis.nbl.smat.parse_score_fn``.
     progress :          bool
                         Whether to show progress bars.
@@ -732,6 +687,8 @@ def nblast(query: Union['core.TreeNeuron', 'core.NeuronList', 'core.Dotprops'], A synapse-based variant of NBLAST. """ + smat = parse_score_fn(smat, use_alpha) + # Check if query or targets are in microns # Note this test can return `None` if it can't be determined if check_microns(query) is False: @@ -839,11 +796,8 @@ def nblast_allbyall(x: NeuronList, normalized : bool, optional Whether to return normalized NBLAST scores. smat : str | pd.DataFrame, optional - Score matrix. If 'auto' (default), will use scoring matrices - from FCWB. Same behaviour as in R's nat.nblast - implementation. If ``smat=None`` the scores will be - generated as the product of the distances and the dotproduct - of the vectors of nearest-neighbor pairs. + Score matrix. + See ``navis.nbl.smat.parse_score_fn``. progress : bool Whether to show progress bars. @@ -883,6 +837,8 @@ def nblast_allbyall(x: NeuronList, For generic query -> target nblasts. """ + smat = parse_score_fn(smat, use_alpha) + # Check if query or targets are in microns # Note this test can return `None` if it can't be determined if check_microns(x) is False: diff --git a/navis/nbl/smat.py b/navis/nbl/smat.py index 72c3ab69..e61230f4 100644 --- a/navis/nbl/smat.py +++ b/navis/nbl/smat.py @@ -6,6 +6,10 @@ from concurrent.futures import ProcessPoolExecutor from typing import Iterator, Sequence, Callable, List, Iterable, Any, Tuple import logging +from pathlib import Path +from functools import lru_cache +from copy import deepcopy +import operator import numpy as np import pandas as pd @@ -20,6 +24,9 @@ cpu_count = max(1, (os.cpu_count() or 2) - 1) IMPLICIT_INTERVAL = "[)" +fp = Path(__file__).resolve().parent +smat_path = fp / 'score_mats' + def chunksize(it_len, cpu_count, min_chunk=50): return max(min_chunk, int(it_len / (cpu_count * 4))) @@ -390,12 +397,16 @@ def parse_boundary(item, strict=False): def parse_boundaries(items, strict=False): - last = None + # declaring upper first and then checking for None + # pleases the type checker, and handles the case where + # len(items) == 0 + upper = None for item in items: lower, upper = parse_boundary(item, strict) yield lower - last = upper - yield last + if upper is None: + return + yield upper class Lookup2d(LookupNd): @@ -437,12 +448,120 @@ def from_dataframe(cls, df: pd.DataFrame, strict=False): Otherwise, raises a ValueError if parens/brackets do not match the implementation. """ - boundaries = [ - np.array(list(parse_boundaries(arr, strict))) - for arr in (df.index, df.columns) - ] - for b in boundaries: + boundaries = [] + for arr in (df.index, df.columns): + b = np.array(list(parse_boundaries(arr, strict))) b[0] = -np.inf b[-1] = np.inf + boundaries.append(b) return cls(boundaries, df.to_numpy()) + + +@lru_cache +def _smat_fcwb(alpha=False): + # cached private function defers construction + # until needed (speeding startup), + # but avoids repeated reads (speeding later uses) + fname = ("smat_fcwb.csv", "smat_alpha_fcwb.csv")[alpha] + fpath = smat_path / fname + + return Lookup2d.from_dataframe( + pd.read_csv(fpath, index_col=0) + ) + + +def smat_fcwb(alpha=False): + # deepcopied so that mutations do not propagate to cache + return deepcopy(_smat_fcwb(alpha)) + + +def check_score_fn(fn: Callable, nargs=2, scalar=True, array=True): + """Checks functionally that the callable can be used as a score function. + + Parameters + ---------- + nargs : optional int, default 2 + How many positional arguments the score function should have. 
+    scalar : optional bool, default True
+        Check that the function can be used on ``nargs`` scalars.
+    array : optional bool, default True
+        Check that the function can be used on ``nargs`` 1D ``numpy.ndarray``s.
+
+    Raises
+    ------
+    ValueError
+        If the score function is not appropriate.
+    """
+    if scalar:
+        scalars = [0.5] * nargs
+        if not isinstance(fn(*scalars), float):
+            raise ValueError(f"smat does not take {nargs} floats and return a float")
+
+    if array:
+        test_arr = np.array([0.5] * 3)
+        arrs = [test_arr] * nargs
+        try:
+            out = fn(*arrs)
+        except Exception as e:
+            raise ValueError(f"Failed to use smat with numpy arrays: {e}")
+
+        if out.shape != test_arr.shape:
+            raise ValueError(
+                f"smat produced inconsistent shape: input {test_arr.shape}; output {out.shape}"
+            )
+
+
+SCORE_FN_DESCR = """
+NBLAST score functions take 2 floats or numpy arrays of floats of length N
+(for matched dotprop points/tangents, distance and dot product;
+the latter possibly scaled by the geometric mean of the alpha colinearity values)
+and return a float or N-length numpy array of floats.
+""".strip().replace("\n", " ")
+
+
+def parse_score_fn(smat, alpha=False):
+    f"""Interpret ``smat`` as a score function.
+    Primarily for backwards compatibility.
+
+    {SCORE_FN_DESCR}
+
+    Parameters
+    ----------
+    smat : None | "auto" | str | os.PathLike | pandas.DataFrame | Callable[[float, float], float]
+        If ``None``, use ``operator.mul``.
+        If ``"auto"``, use ``navis.nbl.smat.smat_fcwb(alpha)``.
+        If a dataframe, use ``navis.nbl.smat.Lookup2d.from_dataframe(smat)``.
+        If any other string or path-like, load it as a CSV into a dataframe and use as above.
+        If a callable, check its signature and use it directly.
+        Raises an error, probably a ValueError, if it can't be interpreted.
+    alpha : optional bool, default False
+        If ``smat`` is ``"auto"``, choose whether to use the FCWB matrices
+        with or without alpha.
+
+    Returns
+    -------
+    Callable
+
+    Raises
+    ------
+    ValueError
+        If score function cannot be interpreted.
+ """ + if smat is None: + smat = operator.mul + elif smat == 'auto': + smat = smat_fcwb(alpha) + + if isinstance(smat, (str, os.PathLike)): + smat = pd.read_csv(smat, index_col=0) + + if isinstance(smat, pd.DataFrame): + smat = Lookup2d.from_dataframe(smat) + + if not callable(smat): + raise ValueError("smat should be a callable, a path, a pandas.DataFrame, or 'auto'") + + check_score_fn(smat) + + return smat From 92fc6324296aac48c8fed3e52078c546d53145f0 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Thu, 29 Apr 2021 13:16:51 +0100 Subject: [PATCH 20/22] Use parse_score_fn in synblast --- navis/nbl/synblast_funcs.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/navis/nbl/synblast_funcs.py b/navis/nbl/synblast_funcs.py index 2208b49b..17c39807 100644 --- a/navis/nbl/synblast_funcs.py +++ b/navis/nbl/synblast_funcs.py @@ -29,6 +29,7 @@ from ..core import NeuronList, BaseNeuron from .nblast_funcs import check_microns, find_optimal_partition, ScoringFunction +from .smat import parse_score_fn __all__ = ['synblast'] @@ -76,11 +77,7 @@ def __init__(self, normalized=True, by_type=True, self.progress = progress self.by_type = by_type - if smat == 'auto': - smat = pd.read_csv(f'{smat_path}/smat_fcwb.csv', - index_col=0) - - self.score_fn = ScoringFunction(smat) + self.score_fn = parse_score_fn(smat, False) self.self_hits = [] self.trees = [] From eef4b65b70bfb4235a698562dae6b0fff9cc30fc Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Thu, 29 Apr 2021 13:19:59 +0100 Subject: [PATCH 21/22] Fix lru_cache usage --- navis/nbl/smat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/navis/nbl/smat.py b/navis/nbl/smat.py index e61230f4..2dd77473 100644 --- a/navis/nbl/smat.py +++ b/navis/nbl/smat.py @@ -458,7 +458,7 @@ def from_dataframe(cls, df: pd.DataFrame, strict=False): return cls(boundaries, df.to_numpy()) -@lru_cache +@lru_cache(maxsize=None) def _smat_fcwb(alpha=False): # cached private function defers construction # until needed (speeding startup), From e8ecdfb97a31b47ecca0c5aed51937d5f7e181f9 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Thu, 29 Apr 2021 14:22:03 +0100 Subject: [PATCH 22/22] Expose some of smat.py to the nbl module --- navis/nbl/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/navis/nbl/__init__.py b/navis/nbl/__init__.py index 6ce105c8..d1e63f13 100644 --- a/navis/nbl/__init__.py +++ b/navis/nbl/__init__.py @@ -15,5 +15,6 @@ from .nblast_funcs import nblast, nblast_allbyall, nblast_smart from .synblast_funcs import synblast +from .smat import Lookup2d, smat_fcwb, parse_score_fn __all__ = ['nblast', 'nblast_allbyall', 'nblast_smart', 'synblast']