From 672aca252a0289bdfdcca6fda70f411cba611b33 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Fri, 9 Apr 2021 18:33:34 +0100 Subject: [PATCH 01/22] WIP NBLAST score matrix calculation --- navis/nbl/smat.py | 289 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 289 insertions(+) create mode 100644 navis/nbl/smat.py diff --git a/navis/nbl/smat.py b/navis/nbl/smat.py new file mode 100644 index 00000000..5c0fc167 --- /dev/null +++ b/navis/nbl/smat.py @@ -0,0 +1,289 @@ +from itertools import permutations +import sys +import os +from collections import Counter +from concurrent.futures import ProcessPoolExecutor + +import pandas as pd +import numpy as np + +epsilon = sys.float_info.epsilon +cpu_count = max(1, (os.cpu_count() or 2) - 1) + + +def ensure_inf_bounds(numbers): + numbers[0] = -np.inf + numbers[1] = np.inf + return np.array(numbers) + + +def chunksize(it_len, cpu_count, min_chunk=50): + return max(min_chunk, int(it_len / (cpu_count * 4))) + + +class ScoreMatrixBuilder: + def __init__( + self, + dotprops, + matching_sets, + nonmatching=None, + alpha=False, + seed=1991, + ): + """Class for building a score matrix for NBLAST. + + dist_bins and dot_bins must additionally be given using the set_* or calc_* methods. + + Parameters + ---------- + dotprops : dict or list of Dotprops + An indexable sequence of all neurons which will be used as the training set, + as Dotprops objects. + matching_sets : list of sets of indices into dotprops + Sets of neurons, as indices into dotprops, which should be considered matches. + nonmatching : set of indices into dotprops, optional + Set of neurons, as indices into dotprops, which should not be considered matches. + If not given, all dotprops will be used + (on the assumption that matches are a small subset of possible pairs). + alpha : bool, optional + Whether to multiply dot product by alpha (local colinearity) + when calculating dist-dots. + seed : int, optional + Non-matching pairs are drawn at random using this seed, by default 1991 + """ + self.rng = np.random.default_rng(seed) + self.dotprops = dotprops + self.matching_sets = matching_sets + self.alpha = alpha + self._nonmatching = nonmatching + self.dist_bins = None + self.dot_bins = None + + def _dotprop_keys(self): + try: + return self.dotprops.keys() + except AttributeError: + return range(len(self.dotprops)) + + @property + def nonmatching(self): + """Indices of nonmatching set of neurons""" + if self._nonmatching is None: + return set(self._dotprop_keys()) + return self._nonmatching + + def set_dist_bins(self, bins): + """Set distance bins to use for score matrix. + + The first and last values are set to -inf and inf respectively, + so that the bins explicitly cover the entire domain, + even though negatives are not possible, in principle. + + Parameters + ---------- + bins : list of float + First and last values are effectively ignored, as above. + + Returns + ------- + np.ndarray of float + The modified bin boundaries. + """ + self.dist_bins = ensure_inf_bounds(bins) + return self.dist_bins + + def calc_dist_bins(self, n_bins, base, min_exp, max_exp): + """Calculate distance bins in a logarithmic sequence. + + n_bins values are spread evenly between min_exp and max_exp (inclusive). + These are transformed into boundaries by base**values. + Then inf is added to the end, and the first boundary is replaced by -inf. + + Parameters + ---------- + n_bins : int + Number of bins (i.e. 
number of boundaries - 1) + base : float + Value to be raised to some power to create the boundaries. + min_exp : float + Exponent of the lowest boundary (actual boundary will be replaced by -inf) + max_exp : float + Exponent of the highest boundary (the lower bound of the inf-containing bin) + + Returns + ------- + np.ndarray of float + The modified bin boundaries. + """ + bins = np.array( + list(np.logspace(min_exp, max_exp, n_bins, base=base)) + [np.inf] + ) + return self.set_dist_bins(bins) + + def set_dot_bins(self, bins): + """Set dot product bins to use for score matrix. + + The first and last values are set to -inf and inf respectively, + so that the bins explicitly cover the entire domain, + even though values outside of 0-1 are not possible, in principle. + + Parameters + ---------- + bins : list of float + First and last values are effectively ignored, as above. + + Returns + ------- + np.ndarray of float + The modified bin boundaries. + """ + self.dot_bins = ensure_inf_bounds(bins) + return self.dot_bins + + def calc_dot_bins(self, n_bins): + """Calculate dot product bins in a linear sequence between 0 and 1. + + Parameters + ---------- + n_bins : int + Number of bins (i.e. number of boundaries - 1). + + Returns + ------- + np.ndarray of float + The modified bin boundaries. + """ + bins = np.linspace(0, 1, n_bins + 1) + return self.set_dot_bins(bins) + + def _yield_matching_pairs(self): + for ms in self.matching_sets: + for q, t in permutations(ms, 2): + if q != t: + yield q, t + + def _yield_nonmatching_pairs(self): + for q, t in permutations(self.nonmatching): + if q != t: + yield q, t + + def _query_to_dist_dot_idxs(self, q_idx, t_idx, counts=None): + q = self.dotprops[q_idx] + response = q.dist_dots(self.dotprops[t_idx], use_alpha=self.alpha) + if self.alpha: + dists, dots, alphas = response + dots *= alphas + else: + dists, dots = response + + dist_idxs = np.digitize(dists, self.dist_bins) - 1 + dot_idxs = np.digitize(dots, self.dot_bins) - 1 + if counts is None: + counts = np.zeros( + (len(self.dist_bins), len(self.dot_bins)), + dtype=int, + ) + for dist_idx, dot_idx in zip(dist_idxs, dot_idxs): + counts[dist_idx, dot_idx] += 1 + return counts + + def _counts_array(self, idx_pairs, threads=None): + counts = np.zeros( + (len(self.dist_bins), len(self.dot_bins)), + dtype=int, + ) + if threads is None or threads == 0 and cpu_count == 1: + for q_idx, t_idx in idx_pairs: + counts = self._query_to_dist_dot_idxs(q_idx, t_idx, counts) + return counts + + threads = threads or cpu_count + + with ProcessPoolExecutor(threads) as exe: + for these_counts in exe.map( + self._query_to_dist_dot_idxs, + idx_pairs[:, 0], + idx_pairs[:, 1], + chunksize=chunksize(len(idx_pairs), threads) + ): + counts += these_counts + + return counts + + def build(self, threads=None): + """Build the score matrix. + + All non-identical neuron pairs within all matching sets are selected, + and distdots calculated for those pairs. + Then, the minimum number of non-matching pairs are randomly drawn + so that at least as many distdots can be calculated for non-matching + pairs. + + In each bin of the 2D score matrix, the log2 odds ratio of a distdot + in that bin belonging to a match vs. non-match is calculated. + + Parameters + ---------- + threads : int, optional + If None, act in serial. + If 0, use cpu_count - 1. + Otherwise, use the given value. + + Returns + ------- + pd.DataFrame + Suitable for passing to navis.nbl.ScoringFunction + + Raises + ------ + ValueError + If dist_bins or dot_bins are not set. 
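+
+        Examples
+        --------
+        A hypothetical sketch, assuming ``dotprops`` and ``matching_sets``
+        have been prepared as described above:
+
+        >>> builder = ScoreMatrixBuilder(dotprops, matching_sets)
+        >>> _ = builder.calc_dist_bins(21, base=2, min_exp=-3, max_exp=17)
+        >>> _ = builder.calc_dot_bins(10)
+        >>> smat = builder.build()  # pd.DataFrame of log2 odds ratios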
+ """ + if self.dot_bins is None or self.dist_bins is None: + raise ValueError("dot_bins and dist_bins must be set or calculated") + + matching_pairs = set(self._yield_matching_pairs()) + # need to know the eventual distdot count + # so we know how many non-matching pairs to draw + q_idx_count = Counter(p[0] for p in matching_pairs) + n_matching_dist_dots = sum( + len(self.dotprops[q_idx]) * n_reps for q_idx, n_reps in q_idx_count.items() + ) + + # pre-calculating which pairs we're going to use, + # rather than drawing them as we need them, + # means that we can parallelise the later step more effectively. + # Slowdowns here are basically meaningless + # because of how long distdot calculation will take + all_nonmatching_pairs = list(self._yield_nonmatching_pairs()) + nonmatching_pairs = [] + n_nonmatching_dist_dots = 0 + while n_nonmatching_dist_dots < n_matching_dist_dots: + idx = self.rng.integers(0, len(all_nonmatching_pairs) + 1) + nonmatching_pairs.append(all_nonmatching_pairs.pop(idx)) + n_nonmatching_dist_dots += len(self.dotprops[nonmatching_pairs[-1][0]]) + + match_counts = self._counts_array(matching_pairs, threads) + nonmatch_counts = self._counts_array(nonmatching_pairs, threads) + + # account for there being different total numbers of match/nonmatch dist dots + matching_factor = nonmatch_counts.sum() / match_counts.sum() + + cells = np.log2( + (match_counts * matching_factor + epsilon) / (nonmatch_counts + epsilon) + ) + + index = [ + f"({left},{right}]" + for left, right in zip( + [-np.inf] + list(self.dist_bins), list(self.dist_bins) + [np.inf] + ) + ] + columns = [ + f"({left},{right}]" + for left, right in zip( + [-np.inf] + list(self.dot_bins), list(self.dot_bins) + [np.inf] + ) + ] + + return pd.DataFrame(cells, index, columns) From 11994317e49ee6c727fbf9f07d10121abae332fd Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Fri, 9 Apr 2021 19:12:56 +0100 Subject: [PATCH 02/22] Make bin boundary handling more consistent --- navis/nbl/smat.py | 60 +++++++++++++++++++++++++++++++---------------- 1 file changed, 40 insertions(+), 20 deletions(-) diff --git a/navis/nbl/smat.py b/navis/nbl/smat.py index 5c0fc167..2212f7bd 100644 --- a/navis/nbl/smat.py +++ b/navis/nbl/smat.py @@ -11,10 +11,19 @@ cpu_count = max(1, (os.cpu_count() or 2) - 1) -def ensure_inf_bounds(numbers): - numbers[0] = -np.inf - numbers[1] = np.inf - return np.array(numbers) +def ensure_inf_bounds(numbers, right_one=True): + lst = list(numbers) + if lst[0] <= 0: + lst[0] = -np.inf + else: + lst.insert(0, -np.inf) + + if right_one and lst[-1] >= 1: + lst[-1] = np.inf + elif lst[-1] != np.inf: + lst.append(np.inf) + + return np.array(lst) def chunksize(it_len, cpu_count, min_chunk=50): @@ -72,8 +81,13 @@ def nonmatching(self): return set(self._dotprop_keys()) return self._nonmatching - def set_dist_bins(self, bins): - """Set distance bins to use for score matrix. + def set_dist_boundaries(self, boundaries): + """Set distance bins to use for score matrix by their internal boundaries. + + If the lowest boundary is <=0, it will be changed to -inf. + If the lowest boundary is >0, -inf will be prepended. + + If the highest boundary is not inf, it will be appended. The first and last values are set to -inf and inf respectively, so that the bins explicitly cover the entire domain, @@ -89,15 +103,17 @@ def set_dist_bins(self, bins): np.ndarray of float The modified bin boundaries. 
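+
+        Examples
+        --------
+        A hypothetical call (``builder`` is a ``ScoreMatrixBuilder``;
+        the outer bounds are widened to -inf/inf as described above):
+
+        >>> bins = builder.set_dist_boundaries([1.0, 2.5, 10.0])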
""" - self.dist_bins = ensure_inf_bounds(bins) + self.dist_bins = ensure_inf_bounds(boundaries) return self.dist_bins def calc_dist_bins(self, n_bins, base, min_exp, max_exp): - """Calculate distance bins in a logarithmic sequence. + """Calculate distance boundaries in a logarithmic sequence. - n_bins values are spread evenly between min_exp and max_exp (inclusive). + base**min_exp will be the upper bound of the lowest bin; + base**max_exp will be the lower bound of the highest bin. + + n_bins - 1 values are spread evenly between min_exp and max_exp (inclusive). These are transformed into boundaries by base**values. - Then inf is added to the end, and the first boundary is replaced by -inf. Parameters ---------- @@ -115,17 +131,19 @@ def calc_dist_bins(self, n_bins, base, min_exp, max_exp): np.ndarray of float The modified bin boundaries. """ - bins = np.array( - list(np.logspace(min_exp, max_exp, n_bins, base=base)) + [np.inf] + # n_bins - 1 because inf outer boundaries are added + return self.set_dist_boundaries( + np.logspace(min_exp, max_exp, n_bins - 1, base=base) ) - return self.set_dist_bins(bins) - def set_dot_bins(self, bins): - """Set dot product bins to use for score matrix. + def set_dot_boundaries(self, boundaries): + """Set dot product bins to use for score matrix by their internal boundaries. - The first and last values are set to -inf and inf respectively, - so that the bins explicitly cover the entire domain, - even though values outside of 0-1 are not possible, in principle. + Dot products, even normalised by alpha values, + should be in the range 0,1. + However, due to float imprecision, they sometimes aren't, + so the lowest bound is set to -inf and highest bound set to inf + just in case. Parameters ---------- @@ -137,12 +155,14 @@ def set_dot_bins(self, bins): np.ndarray of float The modified bin boundaries. """ - self.dot_bins = ensure_inf_bounds(bins) + self.dot_bins = ensure_inf_bounds(boundaries, True) return self.dot_bins def calc_dot_bins(self, n_bins): """Calculate dot product bins in a linear sequence between 0 and 1. + Internally, 0 and 1 will be replaced with -inf and inf respectively. + Parameters ---------- n_bins : int @@ -154,7 +174,7 @@ def calc_dot_bins(self, n_bins): The modified bin boundaries. """ bins = np.linspace(0, 1, n_bins + 1) - return self.set_dot_bins(bins) + return self.set_dot_boundaries(bins[1:-1]) def _yield_matching_pairs(self): for ms in self.matching_sets: From c52294c04a015dc7b542ad222d172e29303a75fa Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Tue, 13 Apr 2021 18:10:52 +0100 Subject: [PATCH 03/22] Refactor smat.py for ND case Also include lookup classes. 
--- navis/nbl/smat.py | 518 ++++++++++++++++++++++++++++------------------ 1 file changed, 320 insertions(+), 198 deletions(-) diff --git a/navis/nbl/smat.py b/navis/nbl/smat.py index 2212f7bd..5aa1c4f5 100644 --- a/navis/nbl/smat.py +++ b/navis/nbl/smat.py @@ -1,72 +1,135 @@ +from __future__ import annotations from itertools import permutations import sys import os from collections import Counter from concurrent.futures import ProcessPoolExecutor +from typing import Iterator, Sequence, Callable, List, Iterable, Any, Tuple -import pandas as pd import numpy as np +import pandas as pd + +from ..core.neurons import Dotprops + +DEFAULT_SEED = 1991 epsilon = sys.float_info.epsilon cpu_count = max(1, (os.cpu_count() or 2) - 1) +implicit_interval = "[)" -def ensure_inf_bounds(numbers, right_one=True): - lst = list(numbers) - if lst[0] <= 0: - lst[0] = -np.inf +def chunksize(it_len, cpu_count, min_chunk=50): + return max(min_chunk, int(it_len / (cpu_count * 4))) + + +def wrap_bounds(arr: np.ndarray, left: float = -np.inf, right: float = np.inf): + """Ensures that boundaries cover -inf to inf. + + If the left or rightmost values lie within the interval ``(left, right)``, + ``-inf`` or ``inf`` will be prepended or appended respectively. + Otherwise, the left and right values will be set to + ``-inf`` and ``inf`` respectively. + + Parameters + ---------- + arr : array-like of floats + Array of boundaries + left : float, optional + If the first value is greater than this, prepend -inf; + otherwise replace that value with -inf. + right : float, optional + If the last value is less than this, append inf; + otherwise replace that value with inf. + + Returns + ------- + np.ndarray of floats + Boundaries from -inf to inf + + Raises + ------ + ValueError + If values of ``arr`` are not monotonically increasing. + """ + if not np.all(np.diff(arr) > 0): + raise ValueError("Boundaries are not monotonically increasing") + + items = list(arr) + if items[0] <= left: + items[0] = -np.inf else: - lst.insert(0, -np.inf) + items.insert(0, -np.inf) - if right_one and lst[-1] >= 1: - lst[-1] = np.inf - elif lst[-1] != np.inf: - lst.append(np.inf) + if items[-1] >= right: + items[-1] = np.inf + else: + items.append(np.inf) - return np.array(lst) + return np.array(items) -def chunksize(it_len, cpu_count, min_chunk=50): - return max(min_chunk, int(it_len / (cpu_count * 4))) +def yield_not_same(pairs: Iterable[Tuple[Any, Any]]) -> Iterator[Tuple[Any, Any]]: + for a, b in pairs: + if a != b: + yield a, b -class ScoreMatrixBuilder: +class LookupNdBuilder: def __init__( self, dotprops, matching_sets, + boundaries: Sequence[Sequence[float]], + fn: Callable[[Dotprops, Dotprops], List[np.ndarray]], nonmatching=None, - alpha=False, - seed=1991, - ): - """Class for building a score matrix for NBLAST. - - dist_bins and dot_bins must additionally be given using the set_* or calc_* methods. + seed=DEFAULT_SEED, + ) -> None: + f"""Class for building an N-dimensional score lookup for NBLAST. Parameters ---------- dotprops : dict or list of Dotprops An indexable sequence of all neurons which will be used as the training set, as Dotprops objects. - matching_sets : list of sets of indices into dotprops - Sets of neurons, as indices into dotprops, which should be considered matches. - nonmatching : set of indices into dotprops, optional - Set of neurons, as indices into dotprops, which should not be considered matches. 
- If not given, all dotprops will be used + matching_sets : list of sets of int + Sets of neurons, as indices into ``dotprops``, which should be considered matches. + boundaries : sequence of array-likes of floats + List of lists, where the inner lists are monotonically increasing + from -inf to inf. + The length of the outer list is the dimensionality of the lookup table. + + The inner lists are the boundaries of bins for that dimension, + i.e. an inner list of N items describes N-1 bins. + If an inner list is not inf-bounded, + -inf and inf will be prepended and appended. + + See the ``wrap_bounds`` convenience function. + fn : Callable[[Dotprops, Dotprops], List[np.ndarray[float]]] + Function taking 2 arguments, + both instances of ``navis.core.neurons.Dotprops``, + and returning a list of 1D ``numpy.ndarray``s of floats. + The length of the list must be the same as the length of ``boundaries``. + The length of the ``array``s must be the same + as the number of points in the first argument. + + This function returns values describing the quality of + point matches from a query to a target neuron. + nonmatching : set of int, optional + Set of neurons, as indices into ``dotprops``, + which should not be considered matches. + If not given, all ``dotprops`` will be used (on the assumption that matches are a small subset of possible pairs). - alpha : bool, optional - Whether to multiply dot product by alpha (local colinearity) - when calculating dist-dots. seed : int, optional - Non-matching pairs are drawn at random using this seed, by default 1991 + Non-matching pairs are drawn at random using this seed, + by default {DEFAULT_SEED} """ - self.rng = np.random.default_rng(seed) self.dotprops = dotprops self.matching_sets = matching_sets - self.alpha = alpha self._nonmatching = nonmatching - self.dist_bins = None - self.dot_bins = None + self.fn = fn + self.boundaries = [wrap_bounds(b) for b in boundaries] + + self.rng = np.random.default_rng(seed) def _dotprop_keys(self): try: @@ -81,156 +144,85 @@ def nonmatching(self): return set(self._dotprop_keys()) return self._nonmatching - def set_dist_boundaries(self, boundaries): - """Set distance bins to use for score matrix by their internal boundaries. - - If the lowest boundary is <=0, it will be changed to -inf. - If the lowest boundary is >0, -inf will be prepended. - - If the highest boundary is not inf, it will be appended. - - The first and last values are set to -inf and inf respectively, - so that the bins explicitly cover the entire domain, - even though negatives are not possible, in principle. - - Parameters - ---------- - bins : list of float - First and last values are effectively ignored, as above. - - Returns - ------- - np.ndarray of float - The modified bin boundaries. - """ - self.dist_bins = ensure_inf_bounds(boundaries) - return self.dist_bins - - def calc_dist_bins(self, n_bins, base, min_exp, max_exp): - """Calculate distance boundaries in a logarithmic sequence. - - base**min_exp will be the upper bound of the lowest bin; - base**max_exp will be the lower bound of the highest bin. - - n_bins - 1 values are spread evenly between min_exp and max_exp (inclusive). - These are transformed into boundaries by base**values. - - Parameters - ---------- - n_bins : int - Number of bins (i.e. number of boundaries - 1) - base : float - Value to be raised to some power to create the boundaries. 
- min_exp : float - Exponent of the lowest boundary (actual boundary will be replaced by -inf) - max_exp : float - Exponent of the highest boundary (the lower bound of the inf-containing bin) - - Returns - ------- - np.ndarray of float - The modified bin boundaries. - """ - # n_bins - 1 because inf outer boundaries are added - return self.set_dist_boundaries( - np.logspace(min_exp, max_exp, n_bins - 1, base=base) - ) - - def set_dot_boundaries(self, boundaries): - """Set dot product bins to use for score matrix by their internal boundaries. - - Dot products, even normalised by alpha values, - should be in the range 0,1. - However, due to float imprecision, they sometimes aren't, - so the lowest bound is set to -inf and highest bound set to inf - just in case. - - Parameters - ---------- - bins : list of float - First and last values are effectively ignored, as above. - - Returns - ------- - np.ndarray of float - The modified bin boundaries. - """ - self.dot_bins = ensure_inf_bounds(boundaries, True) - return self.dot_bins - - def calc_dot_bins(self, n_bins): - """Calculate dot product bins in a linear sequence between 0 and 1. - - Internally, 0 and 1 will be replaced with -inf and inf respectively. - - Parameters - ---------- - n_bins : int - Number of bins (i.e. number of boundaries - 1). - - Returns - ------- - np.ndarray of float - The modified bin boundaries. - """ - bins = np.linspace(0, 1, n_bins + 1) - return self.set_dot_boundaries(bins[1:-1]) - def _yield_matching_pairs(self): for ms in self.matching_sets: - for q, t in permutations(ms, 2): - if q != t: - yield q, t + yield from yield_not_same(permutations(ms, 2)) def _yield_nonmatching_pairs(self): - for q, t in permutations(self.nonmatching): - if q != t: - yield q, t - - def _query_to_dist_dot_idxs(self, q_idx, t_idx, counts=None): - q = self.dotprops[q_idx] - response = q.dist_dots(self.dotprops[t_idx], use_alpha=self.alpha) - if self.alpha: - dists, dots, alphas = response - dots *= alphas - else: - dists, dots = response + # todo: this could be much better, use meshgrid or shuffle index arrays + return yield_not_same(permutations(self.nonmatching, 2)) + + def _empty_counts(self): + shape = [len(b) - 1 for b in self.boundaries] + return np.zeros(shape, int) + + def _query_to_idxs(self, q_idx, t_idx, counts=None): + response = self.fn(self.dotprops[q_idx], self.dotprops[t_idx]) + idxs = [np.digitize(r, b) - 1 for r, b in zip(response, self.boundaries)] - dist_idxs = np.digitize(dists, self.dist_bins) - 1 - dot_idxs = np.digitize(dots, self.dot_bins) - 1 if counts is None: - counts = np.zeros( - (len(self.dist_bins), len(self.dot_bins)), - dtype=int, - ) - for dist_idx, dot_idx in zip(dist_idxs, dot_idxs): - counts[dist_idx, dot_idx] += 1 + counts = self._empty_counts() + + for idx in zip(*idxs): + counts[idx] += 1 + return counts def _counts_array(self, idx_pairs, threads=None): - counts = np.zeros( - (len(self.dist_bins), len(self.dot_bins)), - dtype=int, - ) + counts = self._empty_counts() if threads is None or threads == 0 and cpu_count == 1: for q_idx, t_idx in idx_pairs: - counts = self._query_to_dist_dot_idxs(q_idx, t_idx, counts) + counts = self._query_to_idxs(q_idx, t_idx, counts) return counts threads = threads or cpu_count with ProcessPoolExecutor(threads) as exe: for these_counts in exe.map( - self._query_to_dist_dot_idxs, + self._query_to_idxs, idx_pairs[:, 0], idx_pairs[:, 1], - chunksize=chunksize(len(idx_pairs), threads) + chunksize=chunksize(len(idx_pairs), threads), ): counts += these_counts return counts - 
def build(self, threads=None): + def _build(self, threads) -> Tuple[List[np.ndarray], np.ndarray]: + matching_pairs = set(self._yield_matching_pairs()) + # need to know the eventual distdot count + # so we know how many non-matching pairs to draw + q_idx_count = Counter(p[0] for p in matching_pairs) + n_matching_dist_dots = sum( + len(self.dotprops[q_idx]) * n_reps for q_idx, n_reps in q_idx_count.items() + ) + + # pre-calculating which pairs we're going to use, + # rather than drawing them as we need them, + # means that we can parallelise the later step more effectively. + # Slowdowns here are practically meaningless + # because of how long distdot calculation will take + all_nonmatching_pairs = list(self._yield_nonmatching_pairs()) + nonmatching_pairs = [] + n_nonmatching_dist_dots = 0 + while n_nonmatching_dist_dots < n_matching_dist_dots: + idx = self.rng.integers(0, len(all_nonmatching_pairs) + 1) + nonmatching_pairs.append(all_nonmatching_pairs.pop(idx)) + n_nonmatching_dist_dots += len(self.dotprops[nonmatching_pairs[-1][0]]) + + match_counts = self._counts_array(matching_pairs, threads) + nonmatch_counts = self._counts_array(nonmatching_pairs, threads) + + # account for there being different total numbers of match/nonmatch dist dots + matching_factor = nonmatch_counts.sum() / match_counts.sum() + + cells = np.log2( + (match_counts * matching_factor + epsilon) / (nonmatch_counts + epsilon) + ) + + return self.boundaries, cells + + def build(self, threads=None) -> LookupNd: """Build the score matrix. All non-identical neuron pairs within all matching sets are selected, @@ -239,7 +231,7 @@ def build(self, threads=None): so that at least as many distdots can be calculated for non-matching pairs. - In each bin of the 2D score matrix, the log2 odds ratio of a distdot + In each bin of the score matrix, the log2 odds ratio of a distdot in that bin belonging to a match vs. non-match is calculated. Parameters @@ -259,51 +251,181 @@ def build(self, threads=None): ValueError If dist_bins or dot_bins are not set. """ - if self.dot_bins is None or self.dist_bins is None: - raise ValueError("dot_bins and dist_bins must be set or calculated") + return LookupNd(*self._build(threads)) - matching_pairs = set(self._yield_matching_pairs()) - # need to know the eventual distdot count - # so we know how many non-matching pairs to draw - q_idx_count = Counter(p[0] for p in matching_pairs) - n_matching_dist_dots = sum( - len(self.dotprops[q_idx]) * n_reps for q_idx, n_reps in q_idx_count.items() - ) - # pre-calculating which pairs we're going to use, - # rather than drawing them as we need them, - # means that we can parallelise the later step more effectively. 
- # Slowdowns here are basically meaningless - # because of how long distdot calculation will take - all_nonmatching_pairs = list(self._yield_nonmatching_pairs()) - nonmatching_pairs = [] - n_nonmatching_dist_dots = 0 - while n_nonmatching_dist_dots < n_matching_dist_dots: - idx = self.rng.integers(0, len(all_nonmatching_pairs) + 1) - nonmatching_pairs.append(all_nonmatching_pairs.pop(idx)) - n_nonmatching_dist_dots += len(self.dotprops[nonmatching_pairs[-1][0]]) +def dist_dot(q: Dotprops, t: Dotprops): + return list(q.dist_dots(t)) - match_counts = self._counts_array(matching_pairs, threads) - nonmatch_counts = self._counts_array(nonmatching_pairs, threads) - # account for there being different total numbers of match/nonmatch dist dots - matching_factor = nonmatch_counts.sum() / match_counts.sum() +def dist_dot_alpha(q: Dotprops, t: Dotprops): + dist, dot, alpha = q.dist_dots(t, alpha=True) + return [dist, dot * np.sqrt(alpha)] - cells = np.log2( - (match_counts * matching_factor + epsilon) / (nonmatch_counts + epsilon) + +class LookupDistDotBuilder(LookupNdBuilder): + def __init__( + self, + dotprops, + matching_sets, + dist_boundaries, + dot_boundaries, + nonmatching=None, + use_alpha=False, + seed=DEFAULT_SEED, + ): + f"""Class for building a 2-dimensional score lookup for NBLAST. + + The scores are + + 1. The distances between best-matching points + 2. The dot products of direction vectors around those points, + optionally scaled by the colinearity ``alpha``. + + Parameters + ---------- + dotprops : dict or list of Dotprops + An indexable sequence of all neurons which will be used as the training set, + as Dotprops objects. + matching_sets : list of sets of int + Sets of neurons, as indices into ``dotprops``, which should be considered matches. + dist_boundaries : array-like of floats + Monotonically increasing boundaries between distance bins + (i.e. N values makes N-1 bins). + If the first value > 0, -inf will be prepended, + otherwise the first value will be replaced with -inf. + inf will be appended unless the last value is already inf. + + Consider using ``numpy.logspace`` or ``numpy.geomspace`` + to generate the inner boundaries of the bins. + dot_boundaries : array-like of floats + Monotonically increasing boundaries between + (possibly alpha-scaled) dot product bins + (i.e. N values makes N-1 bins). + If the first value > 0, -inf will be prepended, + otherwise the first value will be replaced with -inf. + If the last value < 1, inf will be appended, + otherwise the last value will be replaced with inf. + + Consider using ``numpy.linspace`` between 0 and 1. + nonmatching : set of int, optional + Set of neurons, as indices into ``dotprops``, + which should not be considered matches. + If not given, all ``dotprops`` will be used + (on the assumption that matches are a small subset of possible pairs). + use_alpha : bool, optional + If true, multiply the dot product by the geometric mean + of the matched points' alpha values + (i.e. ``sqrt(alpha1 * alpha2)``). 
+ seed : int, optional + Non-matching pairs are drawn at random using this seed, + by default {DEFAULT_SEED} + """ + fn = dist_dot_alpha if use_alpha else dist_dot + super().__init__( + dotprops, + matching_sets, + [wrap_bounds(dist_boundaries, 0), wrap_bounds(dot_boundaries, 0, 1)], + fn, + nonmatching, + seed, ) - index = [ - f"({left},{right}]" - for left, right in zip( - [-np.inf] + list(self.dist_bins), list(self.dist_bins) + [np.inf] - ) - ] - columns = [ - f"({left},{right}]" - for left, right in zip( - [-np.inf] + list(self.dot_bins), list(self.dot_bins) + [np.inf] + def build(self, threads=None) -> Lookup2d: + return Lookup2d(*self._build(threads)) + + +class LookupNd: + def __init__(self, boundaries: List[np.ndarray], cells: np.ndarray): + if [len(b) - 1 for b in boundaries] != list(cells.shape): + raise ValueError("boundaries and cells have inconsistent bin counts") + self.boundaries = boundaries + self.cells = cells + + def __call__(self, *args): + if len(args) != len(self.boundaries): + raise TypeError( + f"Lookup takes {len(self.boundaries)} arguments but {len(args)} were given" ) + + idxs = np.array( + [np.digitize(r, b) - 1 for r, b in zip(args, self.boundaries)] + ).T + out = self.cells[idxs] + if np.isscalar(args[0]): + return out[0] + else: + return out + + +def format_boundaries(arr): + return [f"[{lower},{upper})" for lower, upper in zip(arr[:-1], arr[1:])] + + +def parse_boundary(item, strict=False): + explicit_interval = item[0] + item[-1] + if strict and item[0] + item[-1] != implicit_interval: + raise ValueError( + f"Enclosing characters {explicit_interval} " + f"do not match implicit interval specifiers {implicit_interval}" + ) + return tuple(float(i) for i in item[1:-1].split(",")) + + +def parse_boundaries(items, strict=False): + last = None + for item in items: + lower, upper = parse_boundary(item, strict) + yield lower + last = upper + yield last + + +class Lookup2d(LookupNd): + """Convenience class inheriting from LookupNd for the common 2D case. + + Provides IO with pandas DataFrames. + """ + + def __init__(self, boundaries: List[np.ndarray], cells: np.ndarray): + if len(boundaries) != 2: + raise ValueError("boundaries must be of length 2; cells must be 2D") + super().__init__(boundaries, cells) + + def to_dataframe(self) -> pd.DataFrame: + # numpy.digitize includes left, excludes right, i.e. "[left,right)" + return pd.DataFrame( + self.cells, + format_boundaries(self.boundaries[0]), + format_boundaries(self.boundaries[1]), + ) + + @classmethod + def from_dataframe(cls, df: pd.DataFrame, strict=False): + f"""Parse score matrix from a dataframe with string index and column labels. + + Expects the index and column labels to specify an interval + like ``f"[{{lower}},{{upper}})"``. + + Will replace the lowermost and uppermost bound with -inf and inf + if they are not already. + + Parameters + ---------- + strict : bool, optional + If falsey (default), ignores parentheses and brackets, + effectively normalising to + ``{implicit_interval[0]}lower,upper{implicit_interval[1]}`` + as an implementation detail. + Otherwise, raises a ValueError if parens/brackets + do not match the implementation. 
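+
+        Examples
+        --------
+        Round trip through a dataframe
+        (``lookup`` is a hypothetical existing instance):
+
+        >>> df = lookup.to_dataframe()
+        >>> lookup2 = Lookup2d.from_dataframe(df)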
+ """ + boundaries = [ + np.array(list(parse_boundaries(arr, strict))) + for arr in (df.index, df.columns) ] + for b in boundaries: + b[0] = -np.inf + b[-1] = np.inf - return pd.DataFrame(cells, index, columns) + return cls(boundaries, df.to_numpy()) From b24ed9bebf6161b104cfbbcd095370c804eebd09 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Wed, 14 Apr 2021 10:51:16 +0100 Subject: [PATCH 04/22] minor: capitalise constant --- navis/nbl/smat.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/navis/nbl/smat.py b/navis/nbl/smat.py index 5aa1c4f5..081921b6 100644 --- a/navis/nbl/smat.py +++ b/navis/nbl/smat.py @@ -15,7 +15,7 @@ epsilon = sys.float_info.epsilon cpu_count = max(1, (os.cpu_count() or 2) - 1) -implicit_interval = "[)" +IMPLICIT_INTERVAL = "[)" def chunksize(it_len, cpu_count, min_chunk=50): @@ -364,10 +364,10 @@ def format_boundaries(arr): def parse_boundary(item, strict=False): explicit_interval = item[0] + item[-1] - if strict and item[0] + item[-1] != implicit_interval: + if strict and item[0] + item[-1] != IMPLICIT_INTERVAL: raise ValueError( f"Enclosing characters {explicit_interval} " - f"do not match implicit interval specifiers {implicit_interval}" + f"do not match implicit interval specifiers {IMPLICIT_INTERVAL}" ) return tuple(float(i) for i in item[1:-1].split(",")) @@ -415,7 +415,7 @@ def from_dataframe(cls, df: pd.DataFrame, strict=False): strict : bool, optional If falsey (default), ignores parentheses and brackets, effectively normalising to - ``{implicit_interval[0]}lower,upper{implicit_interval[1]}`` + ``{IMPLICIT_INTERVAL[0]}lower,upper{IMPLICIT_INTERVAL[1]}`` as an implementation detail. Otherwise, raises a ValueError if parens/brackets do not match the implementation. From 4aefc5a3509270ed20018b9f077c60ee46772288 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Wed, 14 Apr 2021 10:51:48 +0100 Subject: [PATCH 05/22] Tests: add fixtures for loading data --- tests/conftest.py | 54 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 47 insertions(+), 7 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 5bd67dc4..33a1286e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -6,16 +6,16 @@ import navis -@pytest.fixture +@pytest.fixture(scope="session") def data_dir(): return Path(__file__).resolve().parent.parent / "navis" / "data" @pytest.fixture( - params=["Path", "pathstr", "swcstr", "textbuffer", "rawbuffer", "DataFrame"] + params=["Path", "pathstr", "swcstr", "textbuffer", "rawbuffer", "DataFrame"], ) -def swc_source(request, data_dir: Path): - swc_path: Path = data_dir / "swc" / "722817260.swc" +def swc_source(request, swc_paths): + swc_path = swc_paths[0] if request.param == "Path": yield swc_path elif request.param == "pathstr": @@ -39,9 +39,9 @@ def swc_source(request, data_dir: Path): @pytest.fixture( params=["dirstr", "dirpath", "list", "listwithdir"], ) -def swc_source_multi(request, data_dir: Path): - dpath = data_dir / "swc" - fpath = dpath / "722817260.swc" +def swc_source_multi(request, swc_paths): + fpath = swc_paths[0] + dpath = fpath.parent if request.param == "dirstr": yield str(dpath) elif request.param == "dirpath": @@ -52,3 +52,43 @@ def swc_source_multi(request, data_dir: Path): yield [dpath, fpath] else: raise ValueError(f"Unknown parameter '{request.param}'") + + +def data_paths(dpath, glob="*"): + return sorted(dpath.glob(glob)) + + +@pytest.fixture(scope="session") +def swc_paths(data_dir: Path): + return data_paths(data_dir / "swc", "*.swc") + + 
+@pytest.fixture(scope="session") +def gml_paths(data_dir: Path): + return data_paths(data_dir / "gml", "*.gml") + + +@pytest.fixture(scope="session") +def obj_paths(data_dir: Path): + return data_paths(data_dir / "obj", "*.obj") + + +@pytest.fixture(scope="session") +def synapses_paths(data_dir: Path): + return data_paths(data_dir / "synapses", "*.csv") + + +@pytest.fixture(scope="session") +def volumes_paths(data_dir: Path): + return data_paths(data_dir / "volumes", "*.obj") + + +@pytest.fixture +def neurons(swc_paths, synapses_paths): + swc_reader = navis.io.swc_io.SwcReader() + out = [] + for swc_path, syn_path in zip(swc_paths, synapses_paths): + neuron = swc_reader.read_file_path(swc_path) + neuron.connectors = pd.read_csv(syn_path) + out.append(neuron) + return out From da3b6b518d7501aab2fbd782b975ff39313e2541 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Wed, 14 Apr 2021 14:01:20 +0100 Subject: [PATCH 06/22] Dotprops: implement __len__ --- navis/core/neurons.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/navis/core/neurons.py b/navis/core/neurons.py index b8622be9..7e962b4b 100644 --- a/navis/core/neurons.py +++ b/navis/core/neurons.py @@ -2637,3 +2637,6 @@ def to_skeleton(self, scale_vec: float = 1) -> TreeNeuron: tn._soma = self._soma return tn + + def __len__(self): + return len(self.points) From 511f7c19d2454d296866b95d28a8dcfdbbc6239f Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Wed, 14 Apr 2021 14:01:55 +0100 Subject: [PATCH 07/22] smat: fixes for tests --- navis/nbl/smat.py | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/navis/nbl/smat.py b/navis/nbl/smat.py index 081921b6..6beae345 100644 --- a/navis/nbl/smat.py +++ b/navis/nbl/smat.py @@ -5,12 +5,15 @@ from collections import Counter from concurrent.futures import ProcessPoolExecutor from typing import Iterator, Sequence, Callable, List, Iterable, Any, Tuple +import logging import numpy as np import pandas as pd from ..core.neurons import Dotprops +logger = logging.getLogger(__name__) + DEFAULT_SEED = 1991 epsilon = sys.float_info.epsilon @@ -37,9 +40,11 @@ def wrap_bounds(arr: np.ndarray, left: float = -np.inf, right: float = np.inf): left : float, optional If the first value is greater than this, prepend -inf; otherwise replace that value with -inf. + Defaults to -inf if None. right : float, optional If the last value is less than this, append inf; otherwise replace that value with inf. + Defaults to inf if None. Returns ------- @@ -51,6 +56,11 @@ def wrap_bounds(arr: np.ndarray, left: float = -np.inf, right: float = np.inf): ValueError If values of ``arr`` are not monotonically increasing. 
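+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> # hypothetical dot product boundaries:
+    >>> # 0 and 1 are replaced by -inf and inf, 0.5 is kept as-is
+    >>> b = wrap_bounds(np.array([0.0, 0.5, 1.0]), left=0, right=1)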
""" + if left is None: + left = -np.inf + if right is None: + right = np.inf + if not np.all(np.diff(arr) > 0): raise ValueError("Boundaries are not monotonically increasing") @@ -178,6 +188,7 @@ def _counts_array(self, idx_pairs, threads=None): threads = threads or cpu_count with ProcessPoolExecutor(threads) as exe: + idx_pairs = np.asarray(idx_pairs, dtype=int) for these_counts in exe.map( self._query_to_idxs, idx_pairs[:, 0], @@ -189,7 +200,7 @@ def _counts_array(self, idx_pairs, threads=None): return counts def _build(self, threads) -> Tuple[List[np.ndarray], np.ndarray]: - matching_pairs = set(self._yield_matching_pairs()) + matching_pairs = sorted(set(self._yield_matching_pairs())) # need to know the eventual distdot count # so we know how many non-matching pairs to draw q_idx_count = Counter(p[0] for p in matching_pairs) @@ -206,7 +217,7 @@ def _build(self, threads) -> Tuple[List[np.ndarray], np.ndarray]: nonmatching_pairs = [] n_nonmatching_dist_dots = 0 while n_nonmatching_dist_dots < n_matching_dist_dots: - idx = self.rng.integers(0, len(all_nonmatching_pairs) + 1) + idx = self.rng.integers(0, len(all_nonmatching_pairs)) nonmatching_pairs.append(all_nonmatching_pairs.pop(idx)) n_nonmatching_dist_dots += len(self.dotprops[nonmatching_pairs[-1][0]]) @@ -215,6 +226,8 @@ def _build(self, threads) -> Tuple[List[np.ndarray], np.ndarray]: # account for there being different total numbers of match/nonmatch dist dots matching_factor = nonmatch_counts.sum() / match_counts.sum() + if np.any(match_counts + nonmatch_counts == 0): + logger.warning("Some lookup cells have no data in them") cells = np.log2( (match_counts * matching_factor + epsilon) / (nonmatch_counts + epsilon) @@ -348,14 +361,11 @@ def __call__(self, *args): f"Lookup takes {len(self.boundaries)} arguments but {len(args)} were given" ) - idxs = np.array( - [np.digitize(r, b) - 1 for r, b in zip(args, self.boundaries)] - ).T + idxs = tuple( + np.digitize(r, b) - 1 for r, b in zip(args, self.boundaries) + ) out = self.cells[idxs] - if np.isscalar(args[0]): - return out[0] - else: - return out + return out def format_boundaries(arr): From b29a2542bff7da882c05c3165e740fd78707e1ed Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Wed, 14 Apr 2021 14:02:05 +0100 Subject: [PATCH 08/22] Unit tests for smat --- tests/test_nbl/__init__.py | 0 tests/test_nbl/test_smat.py | 163 ++++++++++++++++++++++++++++++++++++ 2 files changed, 163 insertions(+) create mode 100644 tests/test_nbl/__init__.py create mode 100644 tests/test_nbl/test_smat.py diff --git a/tests/test_nbl/__init__.py b/tests/test_nbl/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/test_nbl/test_smat.py b/tests/test_nbl/test_smat.py new file mode 100644 index 00000000..0183a1e1 --- /dev/null +++ b/tests/test_nbl/test_smat.py @@ -0,0 +1,163 @@ +from navis.core.neurons import Dotprops +import pytest + +import numpy as np + +from navis.nbl.smat import ( + wrap_bounds, LookupNd, Lookup2d, LookupDistDotBuilder +) + + +SMALLEST_DIM_SIZE = 3 +SEED = 1991 + + +@pytest.mark.parametrize( + ["arr", "left", "right", "expected"], + [ + ([1, 2], -np.inf, np.inf, [-np.inf, 1, 2, np.inf]), + ([-np.inf, 1, 2, np.inf], -np.inf, np.inf, [-np.inf, 1, 2, np.inf]), + ([0, 1, 2, 3], 0, 3, [-np.inf, 1, 2, np.inf]), + ], +) +def test_wrap_bounds(arr, left, right, expected): + assert np.allclose(wrap_bounds(arr, left, right), expected) + + +def test_wrap_bounds_error(): + with pytest.raises(ValueError): + wrap_bounds([1, 2, 1]) + + +def lookup_args(ndim): + f""" + Create 
arguments for an ND lookup table. + + The first dimension is of size {SMALLEST_DIM_SIZE}, + and subsequent dimensions are 1 longer than the previous. + The data in the cells are the sequence from 0 + to the size of the array. + The boundaries are 0 to the length of the dimension, + with the left and rightmost values replaced with -inf and inf respectively. + + Examples + -------- + >>> lookup_args(2) + ( + [ + array([-inf, 1, 2, inf]), + array([-inf, 1, 2, 3, inf]), + ], + array([ + [ 0, 1, 2, 3 ], + [ 4, 5, 6, 7 ], + [ 8, 9, 10, 11 ], + ]), + ) + """ + shape = tuple(range(SMALLEST_DIM_SIZE, SMALLEST_DIM_SIZE + ndim)) + cells = np.arange(np.product(shape)).reshape(shape) + boundaries = [] + for s in shape: + b = np.arange(s + 1, dtype=float) + b[0] = -np.inf + b[-1] = np.inf + boundaries.append(b) + return boundaries, cells + + +def fmt_array(arg): + if np.isscalar(arg): + return str(arg) + else: + return "[" + ",".join(fmt_array(v) for v in arg) + "]" + + +@pytest.mark.parametrize( + ["ndim"], [[1], [2], [3], [4], [5]], ids=lambda x: f"{x}D" +) +@pytest.mark.parametrize( + ["arg"], + [ + (-1000,), + (0,), + (1,), + (1.5,), + (2,), + (1000,), + ([-1000, 0, 1, 1.5, 2, 1000],), + ], + ids=fmt_array, +) +def test_lookupNd(ndim, arg): + lookup = LookupNd(*lookup_args(ndim)) + + args = [arg for _ in range(ndim)] + expected_arr_idx = np.floor([ + np.clip(arg, 0, dim + SMALLEST_DIM_SIZE - 1) for dim in range(ndim) + ]).astype(int) + expected_val = np.ravel_multi_index( + tuple(expected_arr_idx), lookup.cells.shape + ) + + response = lookup(*args) + assert np.all(response == expected_val) + + +@pytest.mark.parametrize(["strict"], [(False,), (True,)]) +def test_lookup2d_roundtrip(strict): + lookup = Lookup2d(*lookup_args(2)) + df = lookup.to_dataframe() + lookup2 = Lookup2d.from_dataframe(df, strict=strict) + assert np.allclose(lookup.cells, lookup2.cells) + for b1, b2 in zip(lookup.boundaries, lookup2.boundaries): + assert np.allclose(b1, b2) + + +def prepare_lookupdistdotbuilder(neurons, alpha=False, k=5): + k = 5 + dotprops = [Dotprops(n.nodes[["x", "y", "z"]], k) for n in neurons] + n_orig = len(dotprops) + + # make jittered copies of these neurons + rng = np.random.default_rng(SEED) + jitter_sigma = 50 + matching_sets = [] + for idx, dp in enumerate(dotprops[:]): + dotprops.append( + Dotprops( + dp.points + rng.normal(0, jitter_sigma, dp.points.shape), k + ) + ) + # assign each neuron its jittered self as a match + matching_sets.append({idx, idx + n_orig}) + + # original neurons should all not match each other + nonmatching = list(range(n_orig)) + + # max distance between any 2 points in the data + # for calculating dist boundaries + max_dist = np.linalg.norm( + np.ptp( + np.concatenate([dp.points for dp in dotprops], axis=0), axis=0, + ) + ) + + return LookupDistDotBuilder( + dotprops, + matching_sets, + np.geomspace(10, max_dist, 5)[:-1], + np.linspace(0, 1, 5), + nonmatching, + alpha, + seed=SEED + 1, + ) + + +@pytest.mark.parametrize(["threads"], [(None,), (0,), (2,)]) +@pytest.mark.parametrize(["alpha"], [(True,), (False,)]) +def test_lookupdistdotbuilder_builds(neurons, threads, alpha): + builder = prepare_lookupdistdotbuilder(neurons, alpha) + lookup = builder.build(threads) + # `pytest -rP` to see output + print(lookup.to_dataframe()) From 72dfe4cd8da5d0fa16185e4309f53ad6793a66de Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Wed, 14 Apr 2021 14:13:34 +0100 Subject: [PATCH 09/22] Do not sort matched pairs --- navis/nbl/smat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
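(Presumably the sort added in the previous commit was redundant: bin
counting is order-independent, non-matching pairs are drawn with a
separately seeded RNG, and sorting fails for unorderable mixed-type keys.)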
diff --git a/navis/nbl/smat.py b/navis/nbl/smat.py index 6beae345..7e1f2d3b 100644 --- a/navis/nbl/smat.py +++ b/navis/nbl/smat.py @@ -200,7 +200,7 @@ def _counts_array(self, idx_pairs, threads=None): return counts def _build(self, threads) -> Tuple[List[np.ndarray], np.ndarray]: - matching_pairs = sorted(set(self._yield_matching_pairs())) + matching_pairs = list(set(self._yield_matching_pairs())) # need to know the eventual distdot count # so we know how many non-matching pairs to draw q_idx_count = Counter(p[0] for p in matching_pairs) From 41ee0d00130c72418f1f9e2b774029b063f06f4d Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Thu, 15 Apr 2021 13:39:11 +0100 Subject: [PATCH 10/22] FutureWarning ScoringFunction, base Nblaster on Lookup2d --- navis/nbl/nblast_funcs.py | 64 +++++++++++++++++++++++++++++---------- 1 file changed, 48 insertions(+), 16 deletions(-) diff --git a/navis/nbl/nblast_funcs.py b/navis/nbl/nblast_funcs.py index ae02324e..b6bb3ec0 100644 --- a/navis/nbl/nblast_funcs.py +++ b/navis/nbl/nblast_funcs.py @@ -15,6 +15,9 @@ import numbers import os +from warnings import warn +import operator +from pathlib import Path import numpy as np import pandas as pd @@ -23,14 +26,15 @@ from typing import Union, Optional from typing_extensions import Literal +from .smat import Lookup2d from .. import core, utils from ..core import NeuronList, Dotprops, make_dotprops from .. import config __all__ = ['nblast', 'nblast_smart', 'nblast_allbyall', 'sim_to_dist'] -fp = os.path.dirname(__file__) -smat_path = os.path.join(fp, 'score_mats') +fp = Path(__file__).resolve().parent +smat_path = fp / 'score_mats' logger = config.logger @@ -40,6 +44,7 @@ class ScoringFunction: def __init__(self, smat): if isinstance(smat, type(None)): + self._deprecation_warning("operator.mul") self.scoring_function = self.pass_through elif isinstance(smat, (pd.DataFrame, str)): self.parse_matrix(smat) @@ -47,6 +52,12 @@ def __init__(self, smat): else: raise TypeError + def _deprecation_warning(self, alternative: str): + warn( + f"{type(self).__qualname__} is deprecated, use {alternative}.", + FutureWarning, + ) + def __call__(self, dist, dot): return self.scoring_function(dist, dot) @@ -63,10 +74,15 @@ def score_lookup(self, dist, dot): def parse_matrix(self, smat): """Parse matrix.""" if isinstance(smat, str): + self._deprecation_warning( + f"{Lookup2d.__qualname__}.from_dataframe(pandas.read_csv(smat, index_col=0))" + ) smat = pd.read_csv(smat, index_col=0) + else: + self._deprecation_warning(f"{Lookup2d.__qualname__}.from_dataframe(smat)") if not isinstance(smat, pd.DataFrame): - raise TypeError(f'Excepted filepath or DataFrame, got "{type(smat)}"') + raise TypeError(f'Expected filepath or DataFrame, got "{type(smat)}"') self.cells = smat.to_numpy() @@ -98,21 +114,28 @@ class NBlaster: changing parameters (e.g. ``use_alpha``) at a later stage will mess things up! + The highly flexible ``smat`` argument converts raw point match parameters + nto a single score representing how good that match is. + This can be any callable which takes 2 floats or same-length numpy arrays + (distance and dot product; if ``use_alpha`` is truthy, + scale dot product by the geometric mean of the alpha colinearity values) + and returns a float or numpy array. + If a ``pandas.DataFrame``, converts this into a ``navis.Lookup2d`` and uses as above. + If path-like, converts this into a dataframe and uses as above. + If ``None``, uses ``operator.mul``. + If ``'auto'`` (default), uses score matrices from FCWB (like R's nat.nblast). 
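+
+    For example, a minimal custom score function
+    (hypothetical; any callable with this signature works):
+
+    >>> def my_score(dist, dot):
+    ...     return dot / (1.0 + dist)
+    >>> blaster = NBlaster(smat=my_score)
+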
+ Parameters ---------- use_alpha : bool Whether or not to use alpha values for the scoring. If True, the dotproduct of nearest neighbor vectors will be scaled by ``sqrt(alpha1 * alpha2)``. - normalzed : bool + normalized : bool If True, will normalize scores by the best possible score (i.e. self-self) of the query neuron. - smat : str | pd.DataFrame - Score matrix. If 'auto' (default), will use scoring matrices - from FCWB. Same behaviour as in R's nat.nblast - implementation. If ``smat=None`` the scores will be - generated as the product of the distances and the dotproduct - of the vectors of nearest-neighbor pairs. + smat : Callable[[float, float], float] | str | os.PathLike | pd.DataFrame + See above for more details. progress : bool If True, will show a progress bar. @@ -124,15 +147,24 @@ def __init__(self, use_alpha=False, normalized=True, smat='auto', progress=True) self.normalized = normalized self.progress = progress - if smat == 'auto': + if smat is None: + smat = operator.mul + elif smat == 'auto': if self.use_alpha: - smat = pd.read_csv(f'{smat_path}/smat_alpha_fcwb.csv', - index_col=0) + smat = smat_path / 'smat_alpha_fcwb.csv' else: - smat = pd.read_csv(f'{smat_path}/smat_fcwb.csv', - index_col=0) + smat = smat_path / 'smat_fcwb.csv' + + if isinstance(smat, (str, os.PathLike)): + smat = pd.read_csv(smat, index_col=0) + + if isinstance(smat, pd.DataFrame): + smat = Lookup2d.from_dataframe(smat) + + if not callable(smat): + raise ValueError("smat should be a callable, a path, a pandas.DataFrame, or 'auto'") - self.score_fn = ScoringFunction(smat) + self.score_fn = smat self.self_hits = [] self.dotprops = [] From cab7e816a4cd609ca34942a21b24a46d0b59e189 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Thu, 15 Apr 2021 14:19:43 +0100 Subject: [PATCH 11/22] Functionally test for valid score function --- navis/nbl/nblast_funcs.py | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/navis/nbl/nblast_funcs.py b/navis/nbl/nblast_funcs.py index b6bb3ec0..b52fb332 100644 --- a/navis/nbl/nblast_funcs.py +++ b/navis/nbl/nblast_funcs.py @@ -23,7 +23,7 @@ import pandas as pd from concurrent.futures import ProcessPoolExecutor -from typing import Union, Optional +from typing import Union, Optional, Callable from typing_extensions import Literal from .smat import Lookup2d @@ -107,8 +107,27 @@ def parse_interval(self, s): return float(s.strip("([])").split(",")[-1]) +score_func_description = """ +NBLAST score functions take 2 floats or numpy arrays of floats of length N +(for matched dotprop points/tangents, distance and dot product; +the latter possibly scaled by the geometric mean of the alpha colinearity values) +and returns a float or N-length numpy array of floats. +""".strip() + + +def is_score_function(fn: Callable[[float, float], float]): + f"""Ensure that the score function is valid for NBLAST. + + {score_func_description} + """ + test_arr = np.array([0.5] * 3) + out = fn(test_arr, test_arr) + + return isinstance(fn(0.5, 0.5), float) and out.shape == test_arr.shape and out.dtype == test_arr.dtype + + class NBlaster: - """Implements version 2 of the NBLAST algorithm. + f"""Implements version 2 of the NBLAST algorithm. Please note that some properties are computed on initialization and changing parameters (e.g. ``use_alpha``) at a later stage will mess things @@ -116,10 +135,8 @@ class NBlaster: The highly flexible ``smat`` argument converts raw point match parameters nto a single score representing how good that match is. 
- This can be any callable which takes 2 floats or same-length numpy arrays - (distance and dot product; if ``use_alpha`` is truthy, - scale dot product by the geometric mean of the alpha colinearity values) - and returns a float or numpy array. + Most simply, it is an NBLAST score function. + {score_func_description} If a ``pandas.DataFrame``, converts this into a ``navis.Lookup2d`` and uses as above. If path-like, converts this into a dataframe and uses as above. If ``None``, uses ``operator.mul``. @@ -164,6 +181,9 @@ def __init__(self, use_alpha=False, normalized=True, smat='auto', progress=True) if not callable(smat): raise ValueError("smat should be a callable, a path, a pandas.DataFrame, or 'auto'") + if not is_score_function(smat): + raise ValueError("smat is not a valid NBLAST score function, see documentation") + self.score_fn = smat self.self_hits = [] From 78f90a859f3fbb9cb06f5821ed5fd73a48456920 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Thu, 15 Apr 2021 14:24:18 +0100 Subject: [PATCH 12/22] Split smat-parsing into function --- navis/nbl/nblast_funcs.py | 61 +++++++++++++++++++++------------------ 1 file changed, 33 insertions(+), 28 deletions(-) diff --git a/navis/nbl/nblast_funcs.py b/navis/nbl/nblast_funcs.py index b52fb332..660df143 100644 --- a/navis/nbl/nblast_funcs.py +++ b/navis/nbl/nblast_funcs.py @@ -126,13 +126,8 @@ def is_score_function(fn: Callable[[float, float], float]): return isinstance(fn(0.5, 0.5), float) and out.shape == test_arr.shape and out.dtype == test_arr.dtype -class NBlaster: - f"""Implements version 2 of the NBLAST algorithm. - - Please note that some properties are computed on initialization and - changing parameters (e.g. ``use_alpha``) at a later stage will mess things - up! - +def parse_score_fn(smat, use_alpha): + f""" The highly flexible ``smat`` argument converts raw point match parameters nto a single score representing how good that match is. Most simply, it is an NBLAST score function. @@ -141,6 +136,36 @@ class NBlaster: If path-like, converts this into a dataframe and uses as above. If ``None``, uses ``operator.mul``. If ``'auto'`` (default), uses score matrices from FCWB (like R's nat.nblast). + """ + if smat is None: + smat = operator.mul + elif smat == 'auto': + if use_alpha: + smat = smat_path / 'smat_alpha_fcwb.csv' + else: + smat = smat_path / 'smat_fcwb.csv' + + if isinstance(smat, (str, os.PathLike)): + smat = pd.read_csv(smat, index_col=0) + + if isinstance(smat, pd.DataFrame): + smat = Lookup2d.from_dataframe(smat) + + if not callable(smat): + raise ValueError("smat should be a callable, a path, a pandas.DataFrame, or 'auto'") + + if not is_score_function(smat): + raise ValueError("smat is not a valid NBLAST score function, see documentation") + + +class NBlaster: + f"""Implements version 2 of the NBLAST algorithm. + + Please note that some properties are computed on initialization and + changing parameters (e.g. ``use_alpha``) at a later stage will mess things + up! 
+ + {parse_score_fn.__doc__} Parameters ---------- @@ -164,27 +189,7 @@ def __init__(self, use_alpha=False, normalized=True, smat='auto', progress=True) self.normalized = normalized self.progress = progress - if smat is None: - smat = operator.mul - elif smat == 'auto': - if self.use_alpha: - smat = smat_path / 'smat_alpha_fcwb.csv' - else: - smat = smat_path / 'smat_fcwb.csv' - - if isinstance(smat, (str, os.PathLike)): - smat = pd.read_csv(smat, index_col=0) - - if isinstance(smat, pd.DataFrame): - smat = Lookup2d.from_dataframe(smat) - - if not callable(smat): - raise ValueError("smat should be a callable, a path, a pandas.DataFrame, or 'auto'") - - if not is_score_function(smat): - raise ValueError("smat is not a valid NBLAST score function, see documentation") - - self.score_fn = smat + self.score_fn = parse_score_fn(smat, use_alpha) self.self_hits = [] self.dotprops = [] From 3641ac589a9f5739825013ad013d4e66706b073c Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Thu, 15 Apr 2021 14:38:44 +0100 Subject: [PATCH 13/22] Refactor smat-checking --- navis/nbl/nblast_funcs.py | 84 +++++++++++++++++++-------------------- 1 file changed, 41 insertions(+), 43 deletions(-) diff --git a/navis/nbl/nblast_funcs.py b/navis/nbl/nblast_funcs.py index 660df143..b572a3ee 100644 --- a/navis/nbl/nblast_funcs.py +++ b/navis/nbl/nblast_funcs.py @@ -115,57 +115,21 @@ def parse_interval(self, s): """.strip() -def is_score_function(fn: Callable[[float, float], float]): - f"""Ensure that the score function is valid for NBLAST. - - {score_func_description} - """ - test_arr = np.array([0.5] * 3) - out = fn(test_arr, test_arr) - - return isinstance(fn(0.5, 0.5), float) and out.shape == test_arr.shape and out.dtype == test_arr.dtype +class NBlaster: + f"""Implements version 2 of the NBLAST algorithm. + Please note that some properties are computed on initialization and + changing parameters (e.g. ``use_alpha``) at a later stage will mess things + up! -def parse_score_fn(smat, use_alpha): - f""" The highly flexible ``smat`` argument converts raw point match parameters - nto a single score representing how good that match is. + into a single score representing how good that match is. Most simply, it is an NBLAST score function. {score_func_description} If a ``pandas.DataFrame``, converts this into a ``navis.Lookup2d`` and uses as above. If path-like, converts this into a dataframe and uses as above. If ``None``, uses ``operator.mul``. If ``'auto'`` (default), uses score matrices from FCWB (like R's nat.nblast). - """ - if smat is None: - smat = operator.mul - elif smat == 'auto': - if use_alpha: - smat = smat_path / 'smat_alpha_fcwb.csv' - else: - smat = smat_path / 'smat_fcwb.csv' - - if isinstance(smat, (str, os.PathLike)): - smat = pd.read_csv(smat, index_col=0) - - if isinstance(smat, pd.DataFrame): - smat = Lookup2d.from_dataframe(smat) - - if not callable(smat): - raise ValueError("smat should be a callable, a path, a pandas.DataFrame, or 'auto'") - - if not is_score_function(smat): - raise ValueError("smat is not a valid NBLAST score function, see documentation") - - -class NBlaster: - f"""Implements version 2 of the NBLAST algorithm. - - Please note that some properties are computed on initialization and - changing parameters (e.g. ``use_alpha``) at a later stage will mess things - up! 
- - {parse_score_fn.__doc__} Parameters ---------- @@ -189,11 +153,45 @@ def __init__(self, use_alpha=False, normalized=True, smat='auto', progress=True) self.normalized = normalized self.progress = progress - self.score_fn = parse_score_fn(smat, use_alpha) + self.score_fn = self._parse_score_fn(smat) self.self_hits = [] self.dotprops = [] + def _parse_score_fn(self, smat): + if smat is None: + smat = operator.mul + elif smat == 'auto': + if self.use_alpha: + smat = smat_path / 'smat_alpha_fcwb.csv' + else: + smat = smat_path / 'smat_fcwb.csv' + + if isinstance(smat, (str, os.PathLike)): + smat = pd.read_csv(smat, index_col=0) + + if isinstance(smat, pd.DataFrame): + smat = Lookup2d.from_dataframe(smat) + + if not callable(smat): + raise ValueError("smat should be a callable, a path, a pandas.DataFrame, or 'auto'") + + if not isinstance(smat(0.5, 0.5), float): + raise ValueError("smat does not take 2 floats and return a float") + + test_arr = np.array([0.5] * 3) + try: + out = smat(test_arr, test_arr) + except Exception as e: + raise ValueError(f"Failed to use smat with numpy arrays: {e}") + + if out.shape != test_arr.shape: + raise ValueError( + f"smat produced inconsistent shape: input {test_arr.shape}; output {out.shape}" + ) + + return smat + def append(self, dotprops): """Append dotprops.""" if isinstance(dotprops, (NeuronList, list)): From 982c69ce5a997ef429823500a92dd4d3f024717e Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Thu, 15 Apr 2021 18:32:46 +0100 Subject: [PATCH 14/22] Refactor LookupDistDotBuilder - Use composition instead of inheritance to allow boundaries to be defined later - Add (questionable) helper methods for determining boundaries --- navis/nbl/smat.py | 135 ++++++++++++++++++++++++++++++++---- tests/test_nbl/test_smat.py | 42 +++++++---- 2 files changed, 148 insertions(+), 29 deletions(-) diff --git a/navis/nbl/smat.py b/navis/nbl/smat.py index 7e1f2d3b..2f79cc26 100644 --- a/navis/nbl/smat.py +++ b/navis/nbl/smat.py @@ -5,15 +5,13 @@ from collections import Counter from concurrent.futures import ProcessPoolExecutor from typing import Iterator, Sequence, Callable, List, Iterable, Any, Tuple -import logging import numpy as np import pandas as pd +from ..config import logger from ..core.neurons import Dotprops -logger = logging.getLogger(__name__) - DEFAULT_SEED = 1991 epsilon = sys.float_info.epsilon @@ -276,13 +274,11 @@ def dist_dot_alpha(q: Dotprops, t: Dotprops): return [dist, dot * np.sqrt(alpha)] -class LookupDistDotBuilder(LookupNdBuilder): +class LookupDistDotBuilder: def __init__( self, dotprops, matching_sets, - dist_boundaries, - dot_boundaries, nonmatching=None, use_alpha=False, seed=DEFAULT_SEED, @@ -295,6 +291,10 @@ def __init__( 2. The dot products of direction vectors around those points, optionally scaled by the colinearity ``alpha``. + The boundaries for the bins for each of these two values need to be set. + Use the ``.calc_(dist|dot)_boundaries`` methods to estimate values + based on the data, or the ``.set_(dist|dot)_boundaries`` to use your own. 
+ Parameters ---------- dotprops : dict or list of Dotprops @@ -334,18 +334,123 @@ def __init__( Non-matching pairs are drawn at random using this seed, by default {DEFAULT_SEED} """ - fn = dist_dot_alpha if use_alpha else dist_dot - super().__init__( - dotprops, - matching_sets, - [wrap_bounds(dist_boundaries, 0), wrap_bounds(dot_boundaries, 0, 1)], - fn, - nonmatching, - seed, + self.dotprops = dotprops + self.matching_sets = matching_sets + self._dist_boundaries = None + self._dot_boundaries = None + self.fn = dist_dot_alpha if use_alpha else dist_dot + self.nonmatching = nonmatching + self.seed = seed + + @property + def dist_boundaries(self): + return self._dist_boundaries + + def set_dist_boundaries(self, bounds): + """Set the boundaries for point match distance bins. + + ``numpy.geomspace`` is helpful if you know approximately + what the upper bound of the lowest bin and the lower bound of the + highest bin should be. + + Parameters + ---------- + bounds : array-like of float + Bin boundaries to set (may be expanded to cover full domain). + Should not go below 0. + + Returns + ------- + np.ndarray + The (possibly expanded) bin boundaries which will be used. + """ + self._dist_boundaries = wrap_bounds(bounds, 0) + return self._dist_boundaries + + def calc_dist_boundaries(self, n_bins, scale=10): + """Estimate appropriate point match distance bins from the scale of the data. + + The lower bound of the highest bin will be estimated as ``1/scale`` the size + of the diagonal of the axis aligned bounding box of the smallest dotprop: + there is unlikely to be any value in a point match + at a distance the same order of magnitude as a neuron's size. + + Each subsequent lower bound will be half of that bin's upper bound; + therefore the upper bound of the lowest bin will be + ``highest_lower / scale ** (n_bins - 1)``. + + These estimates are quite conservative and may not yield good results for your use case, + but you may want to use them as a guide for your own automatically-determined boundaries. + + Parameters + ---------- + n_bins : int + Number of bins to create. + scale : float, optional + The multiplication from one bound to another, default 10. + + Returns + ------- + np.ndarray + The bin boundaries which will be used (length ``n_bins + 1``). + """ + + smallest_diag = np.inf + for dp in self.dotprops: + smallest_diag = min( + smallest_diag, np.linalg.norm(np.ptp(dp.points, axis=0)) + ) + highest_lower = smallest_diag / scale + return self.set_dist_boundaries( + highest_lower / scale ** np.arange(n_bins - 1)[::-1] ) + @property + def dot_boundaries(self): + return self._dot_boundaries + + def set_dot_boundaries(self, bounds): + """Set the boundaries for point match tangent dot product bins. + + Parameters + ---------- + bounds : array-like of float + Bin boundaries to set (may be expanded to cover full domain). + Should not go below 0 or above 1. + + Returns + ------- + np.ndarray + The (possibly expanded) bin boundaries which will be used. + """ + self._dot_boundaries = wrap_bounds(bounds, 0, 1) + return self._dot_boundaries + + def calc_dot_boundaries(self, n_bins): + """Linearly interpolate bin boundaries between 0 and 1. + + Parameters + ---------- + n_bins : int + Number of bins to use. 
+ + Returns + ------- + np.ndarray + Bin boundaries which will be used (length ``n_bins + 1``) + """ + return self.set_dot_boundaries(np.linspace(0, 1, n_bins + 1)) + def build(self, threads=None) -> Lookup2d: - return Lookup2d(*self._build(threads)) + builder = LookupNdBuilder( + self.dotprops, + self.matching_sets, + [self.dist_boundaries, self.dot_boundaries], + self.fn, + self.nonmatching, + self.seed + ) + return Lookup2d(*builder._build(threads)) class LookupNd: diff --git a/tests/test_nbl/test_smat.py b/tests/test_nbl/test_smat.py index 0183a1e1..daf2951e 100644 --- a/tests/test_nbl/test_smat.py +++ b/tests/test_nbl/test_smat.py @@ -114,8 +114,27 @@ def test_lookup2d_roundtrip(strict): assert np.allclose(b1, b2) -def prepare_lookupdistdotbuilder(neurons, alpha=False, k=5): - k = 5 +def test_calc_dist_boundaries(neurons): + dotprops = [Dotprops(n.nodes[["x", "y", "z"]], k=5) for n in neurons] + builder = LookupDistDotBuilder(dotprops, []) + n_bins = 5 + bounds = builder.calc_dist_boundaries(n_bins) + assert bounds.shape == (n_bins + 1,) + assert bounds[0] == -np.inf + assert bounds[-1] == np.inf + + +def test_calc_dot_boundaries(neurons): + dotprops = [Dotprops(n.nodes[["x", "y", "z"]], k=5) for n in neurons] + builder = LookupDistDotBuilder(dotprops, []) + n_bins = 5 + bounds = builder.calc_dot_boundaries(n_bins) + assert bounds.shape == (n_bins + 1,) + assert bounds[0] == -np.inf + assert bounds[-1] == np.inf + + +def prepare_lookupdistdotbuilder(neurons, alpha=False, k=5, with_bounds=False): dotprops = [Dotprops(n.nodes[["x", "y", "z"]], k) for n in neurons] n_orig = len(dotprops) @@ -135,29 +154,24 @@ def prepare_lookupdistdotbuilder(neurons, alpha=False, k=5): # original neurons should all not match each other nonmatching = list(range(n_orig)) - # max distance between any 2 points in the data - # for calculating dist boundaries - max_dist = np.linalg.norm( - np.ptp( - np.concatenate([dp.points for dp in dotprops], axis=0), axis=0, - ) - ) - - return LookupDistDotBuilder( + builder = LookupDistDotBuilder( dotprops, matching_sets, - np.geomspace(10, max_dist, 5)[:-1], - np.linspace(0, 1, 5), nonmatching, alpha, seed=SEED + 1, ) + if with_bounds: + n_bins = 5 + builder.calc_dist_boundaries(n_bins) + builder.calc_dot_boundaries(n_bins) + return builder @pytest.mark.parametrize(["threads"], [(None,), (0,), (2,)]) @pytest.mark.parametrize(["alpha"], [(True,), (False,)]) def test_lookupdistdotbuilder_builds(neurons, threads, alpha): - builder = prepare_lookupdistdotbuilder(neurons, alpha) + builder = prepare_lookupdistdotbuilder(neurons, alpha, with_bounds=True) lookup = builder.build(threads) # `pytest -rP` to see output print(lookup.to_dataframe()) From 268d81d67ccc66923c21cce980287d2fd743782c Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Fri, 16 Apr 2021 16:02:08 +0100 Subject: [PATCH 15/22] Revert "Refactor LookupDistDotBuilder" This reverts commit 779616f769f8b7fd0360848601926883117a228d. 
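
The restored interface takes both sets of bin boundaries up front
rather than via setter/estimator methods. For illustration only (not
part of this change), a minimal sketch of the restored usage, assuming
`dotprops` and `matching_sets` have been prepared as in the tests, with
boundary values that are purely illustrative:

    import numpy as np
    from navis.nbl.smat import LookupDistDotBuilder

    builder = LookupDistDotBuilder(
        dotprops,                        # indexable sequence of Dotprops
        matching_sets,                   # sets of indices into dotprops
        np.geomspace(10, 1000, 5)[:-1],  # dist boundaries (values illustrative)
        np.linspace(0, 1, 5),            # dot product boundaries
        use_alpha=False,
    )
    lookup = builder.build(threads=None)  # serial; pass an int to parallelise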
--- navis/nbl/smat.py | 135 ++++-------------------------------- tests/test_nbl/test_smat.py | 42 ++++------- 2 files changed, 29 insertions(+), 148 deletions(-) diff --git a/navis/nbl/smat.py b/navis/nbl/smat.py index 2f79cc26..7e1f2d3b 100644 --- a/navis/nbl/smat.py +++ b/navis/nbl/smat.py @@ -5,13 +5,15 @@ from collections import Counter from concurrent.futures import ProcessPoolExecutor from typing import Iterator, Sequence, Callable, List, Iterable, Any, Tuple +import logging import numpy as np import pandas as pd -from ..config import logger from ..core.neurons import Dotprops +logger = logging.getLogger(__name__) + DEFAULT_SEED = 1991 epsilon = sys.float_info.epsilon @@ -274,11 +276,13 @@ def dist_dot_alpha(q: Dotprops, t: Dotprops): return [dist, dot * np.sqrt(alpha)] -class LookupDistDotBuilder: +class LookupDistDotBuilder(LookupNdBuilder): def __init__( self, dotprops, matching_sets, + dist_boundaries, + dot_boundaries, nonmatching=None, use_alpha=False, seed=DEFAULT_SEED, @@ -291,10 +295,6 @@ def __init__( 2. The dot products of direction vectors around those points, optionally scaled by the colinearity ``alpha``. - The boundaries for the bins for each of these two values need to be set. - Use the ``.calc_(dist|dot)_boundaries`` methods to estimate values - based on the data, or the ``.set_(dist|dot)_boundaries`` to use your own. - Parameters ---------- dotprops : dict or list of Dotprops @@ -334,123 +334,18 @@ def __init__( Non-matching pairs are drawn at random using this seed, by default {DEFAULT_SEED} """ - self.dotprops = dotprops - self.matching_sets = matching_sets - self._dist_boundaries = None - self._dot_boundaries = None - self.fn = dist_dot_alpha if use_alpha else dist_dot - self.nonmatching = nonmatching - self.seed = seed - - @property - def dist_boundaries(self): - return self._dist_boundaries - - def set_dist_boundaries(self, bounds): - """Set the boundaries for point match distance bins. - - ``numpy.geomspace`` is helpful if you know approximately - what the upper bound of the lowest bin and the lower bound of the - highest bin should be. - - Parameters - ---------- - bounds : array-like of float - Bin boundaries to set (may be expanded to cover full domain). - Should not go below 0. - - Returns - ------- - np.ndarray - The (possibly expanded) bin boundaries which will be used. - """ - self._dist_boundaries = wrap_bounds(bounds, 0) - return self._dist_boundaries - - def calc_dist_boundaries(self, n_bins, scale=10): - """Estimate appropriate point match distance bins from the scale of the data. - - The lower bound of the highest bin will be estimated as ``1/scale`` the size - of the diagonal of the axis aligned bounding box of the smallest dotprop: - there is unlikely to be any value in a point match - at a distance the same order of magnitude as a neuron's size. - - Each subsequent lower bound will be half of that bin's upper bound; - therefore the upper bound of the lowest bin will be - ``highest_lower / scale ** (n_bins - 1)``. - - These estimates are quite conservative and may not yield good results for your use case, - but you may want to use them as a guide for your own automatically-determined boundaries. - - Parameters - ---------- - n_bins : int - Number of bins to create. - scale : float, optional - The multiplication from one bound to another, default 10. - - Returns - ------- - np.ndarray - The bin boundaries which will be used (length ``n_bins + 1``). 
- """ - - smallest_diag = np.inf - for dp in self.dotprops: - smallest_diag = min( - smallest_diag, np.linalg.norm(np.ptp(dp.points, axis=0)) - ) - highest_lower = smallest_diag / scale - return self.set_dist_boundaries( - highest_lower / scale ** np.arange(n_bins - 1)[::-1] + fn = dist_dot_alpha if use_alpha else dist_dot + super().__init__( + dotprops, + matching_sets, + [wrap_bounds(dist_boundaries, 0), wrap_bounds(dot_boundaries, 0, 1)], + fn, + nonmatching, + seed, ) - @property - def dot_boundaries(self): - return self._dot_boundaries - - def set_dot_boundaries(self, bounds): - """Set the boundaries for point match tangent dot product bins. - - Parameters - ---------- - bounds : array-like of float - Bin boundaries to set (may be expanded to cover full domain). - Should not go below 0 or above 1. - - Returns - ------- - np.ndarray - The (possibly expanded) bin boundaries which will be used. - """ - self._dot_boundaries = wrap_bounds(bounds, 0, 1) - return self._dot_boundaries - - def calc_dot_boundaries(self, n_bins): - """Linearly interpolate bin boundaries between 0 and 1. - - Parameters - ---------- - n_bins : int - Number of bins to use. - - Returns - ------- - np.ndarray - Bin boundaries which will be used (length ``n_bins + 1``) - """ - return self.set_dot_boundaries(np.linspace(0, 1, n_bins + 1)) - def build(self, threads=None) -> Lookup2d: - builder = LookupNdBuilder( - self.dotprops, - self.matching_sets, - [self.dist_boundaries, self.dot_boundaries], - self.fn, - self.nonmatching, - self.seed - ) - return Lookup2d(*builder._build(threads)) + return Lookup2d(*self._build(threads)) class LookupNd: diff --git a/tests/test_nbl/test_smat.py b/tests/test_nbl/test_smat.py index daf2951e..0183a1e1 100644 --- a/tests/test_nbl/test_smat.py +++ b/tests/test_nbl/test_smat.py @@ -114,27 +114,8 @@ def test_lookup2d_roundtrip(strict): assert np.allclose(b1, b2) -def test_calc_dist_boundaries(neurons): - dotprops = [Dotprops(n.nodes[["x", "y", "z"]], k=5) for n in neurons] - builder = LookupDistDotBuilder(dotprops, []) - n_bins = 5 - bounds = builder.calc_dist_boundaries(n_bins) - assert bounds.shape == (n_bins + 1,) - assert bounds[0] == -np.inf - assert bounds[-1] == np.inf - - -def test_calc_dot_boundaries(neurons): - dotprops = [Dotprops(n.nodes[["x", "y", "z"]], k=5) for n in neurons] - builder = LookupDistDotBuilder(dotprops, []) - n_bins = 5 - bounds = builder.calc_dot_boundaries(n_bins) - assert bounds.shape == (n_bins + 1,) - assert bounds[0] == -np.inf - assert bounds[-1] == np.inf - - -def prepare_lookupdistdotbuilder(neurons, alpha=False, k=5, with_bounds=False): +def prepare_lookupdistdotbuilder(neurons, alpha=False, k=5): + k = 5 dotprops = [Dotprops(n.nodes[["x", "y", "z"]], k) for n in neurons] n_orig = len(dotprops) @@ -154,24 +135,29 @@ def prepare_lookupdistdotbuilder(neurons, alpha=False, k=5, with_bounds=False): # original neurons should all not match each other nonmatching = list(range(n_orig)) - builder = LookupDistDotBuilder( + # max distance between any 2 points in the data + # for calculating dist boundaries + max_dist = np.linalg.norm( + np.ptp( + np.concatenate([dp.points for dp in dotprops], axis=0), axis=0, + ) + ) + + return LookupDistDotBuilder( dotprops, matching_sets, + np.geomspace(10, max_dist, 5)[:-1], + np.linspace(0, 1, 5), nonmatching, alpha, seed=SEED + 1, ) - if with_bounds: - n_bins = 5 - builder.calc_dist_boundaries(n_bins) - builder.calc_dot_boundaries(n_bins) - return builder @pytest.mark.parametrize(["threads"], [(None,), (0,), (2,)]) 
@pytest.mark.parametrize(["alpha"], [(True,), (False,)]) def test_lookupdistdotbuilder_builds(neurons, threads, alpha): - builder = prepare_lookupdistdotbuilder(neurons, alpha, with_bounds=True) + builder = prepare_lookupdistdotbuilder(neurons, alpha) lookup = builder.build(threads) # `pytest -rP` to see output print(lookup.to_dataframe()) From 32b756df5990b3eba5d61359f8154248fc02d8ba Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Fri, 16 Apr 2021 16:17:40 +0100 Subject: [PATCH 16/22] smat: minor refactor --- navis/nbl/smat.py | 44 +++++++++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/navis/nbl/smat.py b/navis/nbl/smat.py index 7e1f2d3b..e60a07d4 100644 --- a/navis/nbl/smat.py +++ b/navis/nbl/smat.py @@ -90,7 +90,7 @@ def __init__( dotprops, matching_sets, boundaries: Sequence[Sequence[float]], - fn: Callable[[Dotprops, Dotprops], List[np.ndarray]], + match_fn: Callable[[Dotprops, Dotprops], List[np.ndarray]], nonmatching=None, seed=DEFAULT_SEED, ) -> None: @@ -114,7 +114,7 @@ def __init__( -inf and inf will be prepended and appended. See the ``wrap_bounds`` convenience function. - fn : Callable[[Dotprops, Dotprops], List[np.ndarray[float]]] + match_fn : Callable[[Dotprops, Dotprops], List[np.ndarray[float]]] Function taking 2 arguments, both instances of ``navis.core.neurons.Dotprops``, and returning a list of 1D ``numpy.ndarray``s of floats. @@ -136,7 +136,7 @@ def __init__( self.dotprops = dotprops self.matching_sets = matching_sets self._nonmatching = nonmatching - self.fn = fn + self.match_fn = match_fn self.boundaries = [wrap_bounds(b) for b in boundaries] self.rng = np.random.default_rng(seed) @@ -167,7 +167,7 @@ def _empty_counts(self): return np.zeros(shape, int) def _query_to_idxs(self, q_idx, t_idx, counts=None): - response = self.fn(self.dotprops[q_idx], self.dotprops[t_idx]) + response = self.match_fn(self.dotprops[q_idx], self.dotprops[t_idx]) idxs = [np.digitize(r, b) - 1 for r, b in zip(response, self.boundaries)] if counts is None: @@ -199,15 +199,7 @@ def _counts_array(self, idx_pairs, threads=None): return counts - def _build(self, threads) -> Tuple[List[np.ndarray], np.ndarray]: - matching_pairs = list(set(self._yield_matching_pairs())) - # need to know the eventual distdot count - # so we know how many non-matching pairs to draw - q_idx_count = Counter(p[0] for p in matching_pairs) - n_matching_dist_dots = sum( - len(self.dotprops[q_idx]) * n_reps for q_idx, n_reps in q_idx_count.items() - ) - + def _pick_nonmatching_pairs(self, n_matching_qual_vals): # pre-calculating which pairs we're going to use, # rather than drawing them as we need them, # means that we can parallelise the later step more effectively. 
@@ -215,11 +207,25 @@ def _build(self, threads) -> Tuple[List[np.ndarray], np.ndarray]:
         # because of how long distdot calculation will take
         all_nonmatching_pairs = list(self._yield_nonmatching_pairs())
         nonmatching_pairs = []
-        n_nonmatching_dist_dots = 0
-        while n_nonmatching_dist_dots < n_matching_dist_dots:
+        n_nonmatching_qual_vals = 0
+        while n_nonmatching_qual_vals < n_matching_qual_vals:
             idx = self.rng.integers(0, len(all_nonmatching_pairs))
-            nonmatching_pairs.append(all_nonmatching_pairs.pop(idx))
-            n_nonmatching_dist_dots += len(self.dotprops[nonmatching_pairs[-1][0]])
+            nonmatching_pair = all_nonmatching_pairs.pop(idx)
+            nonmatching_pairs.append(nonmatching_pair)
+            n_nonmatching_qual_vals += len(self.dotprops[nonmatching_pair[0]])
+
+        return nonmatching_pairs
+
+    def _build(self, threads) -> Tuple[List[np.ndarray], np.ndarray]:
+        matching_pairs = list(set(self._yield_matching_pairs()))
+        # need to know the eventual number of match_fn output values
+        # so we know how many non-matching pairs to draw
+        q_idx_count = Counter(p[0] for p in matching_pairs)
+        n_matching_qual_vals = sum(
+            len(self.dotprops[q_idx]) * n_reps for q_idx, n_reps in q_idx_count.items()
+        )
+
+        nonmatching_pairs = self._pick_nonmatching_pairs(n_matching_qual_vals)
 
         match_counts = self._counts_array(matching_pairs, threads)
         nonmatch_counts = self._counts_array(nonmatching_pairs, threads)
@@ -334,12 +340,12 @@ def __init__(
             Non-matching pairs are drawn at random using this seed,
             by default {DEFAULT_SEED}
         """
-        fn = dist_dot_alpha if use_alpha else dist_dot
+        match_fn = dist_dot_alpha if use_alpha else dist_dot
         super().__init__(
             dotprops,
             matching_sets,
             [wrap_bounds(dist_boundaries, 0), wrap_bounds(dot_boundaries, 0, 1)],
-            fn,
+            match_fn,
             nonmatching,
             seed,
         )

From c9b6715a4640aeaca7885695a2b71898285cfa5f Mon Sep 17 00:00:00 2001
From: Chris Barnes
Date: Thu, 29 Apr 2021 11:00:03 +0100
Subject: [PATCH 17/22] Minor refactor

---
 navis/nbl/smat.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/navis/nbl/smat.py b/navis/nbl/smat.py
index e60a07d4..72c3ab69 100644
--- a/navis/nbl/smat.py
+++ b/navis/nbl/smat.py
@@ -186,14 +186,15 @@ def _counts_array(self, idx_pairs, threads=None):
             return counts
 
         threads = threads or cpu_count
+        idx_pairs = np.asarray(idx_pairs, dtype=int)
+        chunks = chunksize(len(idx_pairs), threads)
 
         with ProcessPoolExecutor(threads) as exe:
-            idx_pairs = np.asarray(idx_pairs, dtype=int)
             for these_counts in exe.map(
                 self._query_to_idxs,
                 idx_pairs[:, 0],
                 idx_pairs[:, 1],
-                chunksize=chunksize(len(idx_pairs), threads),
+                chunksize=chunks,
             ):
                 counts += these_counts
 

From cb3a64b0218f63afcda6480dc01b395ebf7d62be Mon Sep 17 00:00:00 2001
From: Chris Barnes
Date: Thu, 29 Apr 2021 11:34:11 +0100
Subject: [PATCH 18/22] Re-order NBLAST builder tests to avoid deadlock

The tests are not run in parallel with each other so there shouldn't be
any OS-level interference. I also tested without session-scoped test
fixtures.

The deadlock occurs on the first parallelised test which runs directly
after a non-parallelised test - by re-ordering the tests, the deadlock
is avoided.
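
This looks like the classic hazard of the default "fork" start method:
a worker forked while some lock happens to be held inherits that lock
in its locked state forever. Should the deadlock resurface, a possible
mitigation (a sketch only, not applied here) would be to build the pool
from a "spawn" context so workers start fresh; `map_counts` is a
hypothetical stand-in for the executor usage in `_counts_array`:

    import multiprocessing as mp
    from concurrent.futures import ProcessPoolExecutor

    def map_counts(fn, queries, targets, threads, chunks):
        # "spawn" starts workers in fresh interpreters, so they cannot
        # inherit a lock that happened to be held at fork time;
        # fn must be importable/picklable for this to work
        ctx = mp.get_context("spawn")
        with ProcessPoolExecutor(threads, mp_context=ctx) as exe:
            return list(exe.map(fn, queries, targets, chunksize=chunks))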
--- tests/test_nbl/test_smat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_nbl/test_smat.py b/tests/test_nbl/test_smat.py index 0183a1e1..f47527f0 100644 --- a/tests/test_nbl/test_smat.py +++ b/tests/test_nbl/test_smat.py @@ -154,8 +154,8 @@ def prepare_lookupdistdotbuilder(neurons, alpha=False, k=5): ) -@pytest.mark.parametrize(["threads"], [(None,), (0,), (2,)]) @pytest.mark.parametrize(["alpha"], [(True,), (False,)]) +@pytest.mark.parametrize(["threads"], [(0,), (2,), (None,)]) def test_lookupdistdotbuilder_builds(neurons, threads, alpha): builder = prepare_lookupdistdotbuilder(neurons, alpha) lookup = builder.build(threads) From e9e837a9341db6aa53a352f20c713c338783882f Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Thu, 29 Apr 2021 13:16:26 +0100 Subject: [PATCH 19/22] Significant refactors --- navis/nbl/nblast_funcs.py | 104 +++++++++-------------------- navis/nbl/smat.py | 135 +++++++++++++++++++++++++++++++++++--- 2 files changed, 157 insertions(+), 82 deletions(-) diff --git a/navis/nbl/nblast_funcs.py b/navis/nbl/nblast_funcs.py index b572a3ee..4eaf2bf4 100644 --- a/navis/nbl/nblast_funcs.py +++ b/navis/nbl/nblast_funcs.py @@ -16,31 +16,38 @@ import numbers import os from warnings import warn -import operator -from pathlib import Path import numpy as np import pandas as pd from concurrent.futures import ProcessPoolExecutor -from typing import Union, Optional, Callable +from typing import Union, Optional from typing_extensions import Literal -from .smat import Lookup2d +from .smat import Lookup2d, parse_score_fn, SCORE_FN_DESCR from .. import core, utils from ..core import NeuronList, Dotprops, make_dotprops from .. import config __all__ = ['nblast', 'nblast_smart', 'nblast_allbyall', 'sim_to_dist'] -fp = Path(__file__).resolve().parent -smat_path = fp / 'score_mats' logger = config.logger class ScoringFunction: - """Class representing scoring function.""" + """[DEPRECATED] Class representing scoring function. + + Use a ``navis.nbl.smat.Lookup2d`` for score functions + like those used in Costa et al. 2016 + (see the ``.from_dataframe()`` class method + to load it from pandas). + The actual score matrices from FCWB can be accessed with + ``navis.nbl.smat.smat_fcwb()``. + + Otherwise, use any compatible callable, + e.g. ``operator.mul`` (used by this class for ``smat=None``). + """ def __init__(self, smat): if isinstance(smat, type(None)): @@ -107,14 +114,6 @@ def parse_interval(self, s): return float(s.strip("([])").split(",")[-1]) -score_func_description = """ -NBLAST score functions take 2 floats or numpy arrays of floats of length N -(for matched dotprop points/tangents, distance and dot product; -the latter possibly scaled by the geometric mean of the alpha colinearity values) -and returns a float or N-length numpy array of floats. -""".strip() - - class NBlaster: f"""Implements version 2 of the NBLAST algorithm. @@ -125,11 +124,7 @@ class NBlaster: The highly flexible ``smat`` argument converts raw point match parameters into a single score representing how good that match is. Most simply, it is an NBLAST score function. - {score_func_description} - If a ``pandas.DataFrame``, converts this into a ``navis.Lookup2d`` and uses as above. - If path-like, converts this into a dataframe and uses as above. - If ``None``, uses ``operator.mul``. - If ``'auto'`` (default), uses score matrices from FCWB (like R's nat.nblast). 
+    {SCORE_FN_DESCR}
 
     Parameters
     ----------
@@ -140,8 +135,8 @@ class NBlaster:
         normalized : bool
             If True, will normalize scores by the best possible score
             (i.e. self-self) of the query neuron.
-        smat : Callable[[float, float], float] | str | os.PathLike | pd.DataFrame
-            See above for more details.
+        smat : Callable[[float, float], float] | str | os.PathLike | pd.DataFrame, default "auto"
+            See ``navis.nbl.smat.parse_score_fn``.
         progress : bool
             If True, will show a progress bar.
 
@@ -153,45 +148,11 @@ def __init__(self, use_alpha=False, normalized=True, smat='auto', progress=True)
         self.normalized = normalized
         self.progress = progress
 
-        self.score_fn = self._parse_score_fn(smat)
+        self.score_fn = parse_score_fn(smat, use_alpha)
 
         self.self_hits = []
         self.dotprops = []
 
-    def _parse_score_fn(self, smat):
-        if smat is None:
-            smat = operator.mul
-        elif smat == 'auto':
-            if self.use_alpha:
-                smat = smat_path / 'smat_alpha_fcwb.csv'
-            else:
-                smat = smat_path / 'smat_fcwb.csv'
-
-        if isinstance(smat, (str, os.PathLike)):
-            smat = pd.read_csv(smat, index_col=0)
-
-        if isinstance(smat, pd.DataFrame):
-            smat = Lookup2d.from_dataframe(smat)
-
-        if not callable(smat):
-            raise ValueError("smat should be a callable, a path, a pandas.DataFrame, or 'auto'")
-
-        if not isinstance(smat(0.5, 0.5), float):
-            raise ValueError("smat does not take 2 floats and return a float")
-
-        test_arr = np.array([0.5] * 3)
-        try:
-            out = smat(test_arr, test_arr)
-        except Exception as e:
-            raise ValueError(f"Failed to use smat with numpy arrays: {e}")
-
-        if out.shape != test_arr.shape:
-            raise ValueError(
-                f"smat produced inconsistent shape: input {test_arr.shape}; output {out.shape}"
-            )
-
-        return smat
-
     def append(self, dotprops):
         """Append dotprops."""
         if isinstance(dotprops, (NeuronList, list)):
@@ -412,12 +373,8 @@ def nblast_smart(query: Union['core.TreeNeuron', 'core.NeuronList', 'core.Dotpro
                         that have lots of branches.
     normalized :        bool, optional
                         Whether to return normalized NBLAST scores.
-    smat :              str | pd.DataFrame
-                        Score matrix. If 'auto' (default), will use scoring matrices
-                        from FCWB. Same behaviour as in R's nat.nblast
-                        implementation. If ``smat=None`` the scores will be
-                        generated as the product of the distances and the dotproduct
-                        of the vectors of nearest-neighbor pairs.
+    smat :              str | pd.DataFrame, default "auto"
+                        Score matrix. See ``navis.nbl.smat.parse_score_fn``.
     progress :          bool
                         Whether to show progress bars.
@@ -732,6 +687,8 @@ def nblast(query: Union['core.TreeNeuron', 'core.NeuronList', 'core.Dotprops'], A synapse-based variant of NBLAST. """ + smat = parse_score_fn(smat, use_alpha) + # Check if query or targets are in microns # Note this test can return `None` if it can't be determined if check_microns(query) is False: @@ -839,11 +796,8 @@ def nblast_allbyall(x: NeuronList, normalized : bool, optional Whether to return normalized NBLAST scores. smat : str | pd.DataFrame, optional - Score matrix. If 'auto' (default), will use scoring matrices - from FCWB. Same behaviour as in R's nat.nblast - implementation. If ``smat=None`` the scores will be - generated as the product of the distances and the dotproduct - of the vectors of nearest-neighbor pairs. + Score matrix. + See ``navis.nbl.smat.parse_score_fn``. progress : bool Whether to show progress bars. @@ -883,6 +837,8 @@ def nblast_allbyall(x: NeuronList, For generic query -> target nblasts. """ + smat = parse_score_fn(smat, use_alpha) + # Check if query or targets are in microns # Note this test can return `None` if it can't be determined if check_microns(x) is False: diff --git a/navis/nbl/smat.py b/navis/nbl/smat.py index 72c3ab69..e61230f4 100644 --- a/navis/nbl/smat.py +++ b/navis/nbl/smat.py @@ -6,6 +6,10 @@ from concurrent.futures import ProcessPoolExecutor from typing import Iterator, Sequence, Callable, List, Iterable, Any, Tuple import logging +from pathlib import Path +from functools import lru_cache +from copy import deepcopy +import operator import numpy as np import pandas as pd @@ -20,6 +24,9 @@ cpu_count = max(1, (os.cpu_count() or 2) - 1) IMPLICIT_INTERVAL = "[)" +fp = Path(__file__).resolve().parent +smat_path = fp / 'score_mats' + def chunksize(it_len, cpu_count, min_chunk=50): return max(min_chunk, int(it_len / (cpu_count * 4))) @@ -390,12 +397,16 @@ def parse_boundary(item, strict=False): def parse_boundaries(items, strict=False): - last = None + # declaring upper first and then checking for None + # pleases the type checker, and handles the case where + # len(items) == 0 + upper = None for item in items: lower, upper = parse_boundary(item, strict) yield lower - last = upper - yield last + if upper is None: + return + yield upper class Lookup2d(LookupNd): @@ -437,12 +448,120 @@ def from_dataframe(cls, df: pd.DataFrame, strict=False): Otherwise, raises a ValueError if parens/brackets do not match the implementation. """ - boundaries = [ - np.array(list(parse_boundaries(arr, strict))) - for arr in (df.index, df.columns) - ] - for b in boundaries: + boundaries = [] + for arr in (df.index, df.columns): + b = np.array(list(parse_boundaries(arr, strict))) b[0] = -np.inf b[-1] = np.inf + boundaries.append(b) return cls(boundaries, df.to_numpy()) + + +@lru_cache +def _smat_fcwb(alpha=False): + # cached private function defers construction + # until needed (speeding startup), + # but avoids repeated reads (speeding later uses) + fname = ("smat_fcwb.csv", "smat_alpha_fcwb.csv")[alpha] + fpath = smat_path / fname + + return Lookup2d.from_dataframe( + pd.read_csv(fpath, index_col=0) + ) + + +def smat_fcwb(alpha=False): + # deepcopied so that mutations do not propagate to cache + return deepcopy(_smat_fcwb(alpha)) + + +def check_score_fn(fn: Callable, nargs=2, scalar=True, array=True): + """Checks functionally that the callable can be used as a score function. + + Parameters + ---------- + nargs : optional int, default 2 + How many positional arguments the score function should have. 
+    scalar : optional bool, default True
+        Check that the function can be used on ``nargs`` scalars.
+    array : optional bool, default True
+        Check that the function can be used on ``nargs`` 1D ``numpy.ndarray``s.
+
+    Raises
+    ------
+    ValueError
+        If the score function is not appropriate.
+    """
+    if scalar:
+        scalars = [0.5] * nargs
+        if not isinstance(fn(*scalars), float):
+            raise ValueError(f"smat does not take {nargs} floats and return a float")
+
+    if array:
+        test_arr = np.array([0.5] * 3)
+        arrs = [test_arr] * nargs
+        try:
+            out = fn(*arrs)
+        except Exception as e:
+            raise ValueError(f"Failed to use smat with numpy arrays: {e}")
+
+        if out.shape != test_arr.shape:
+            raise ValueError(
+                f"smat produced inconsistent shape: input {test_arr.shape}; output {out.shape}"
+            )
+
+
+SCORE_FN_DESCR = """
+NBLAST score functions take 2 floats or numpy arrays of floats of length N
+(for matched dotprop points/tangents, distance and dot product;
+the latter possibly scaled by the geometric mean of the alpha colinearity values)
+and return a float or N-length numpy array of floats.
+""".strip().replace("\n", " ")
+
+
+def parse_score_fn(smat, alpha=False):
+    f"""Interpret ``smat`` as a score function.
+    Primarily for backwards compatibility.
+
+    {SCORE_FN_DESCR}
+
+    Parameters
+    ----------
+    smat : None | "auto" | str | os.PathLike | pandas.DataFrame | Callable[[float, float], float]
+        If ``None``, use ``operator.mul``.
+        If ``"auto"``, use ``navis.nbl.smat.smat_fcwb(alpha)``.
+        If a dataframe, use ``navis.nbl.smat.Lookup2d.from_dataframe(smat)``.
+        If any other string or path-like, load it as a CSV into a dataframe and use as above.
+        If a callable, check its signature and use it directly.
+        Raises an error, probably a ValueError, if it can't be interpreted.
+    alpha : optional bool, default False
+        If ``smat`` is ``"auto"``, choose whether to use the FCWB matrices
+        with or without alpha.
+
+    Returns
+    -------
+    Callable
+
+    Raises
+    ------
+    ValueError
+        If score function cannot be interpreted.
+ """ + if smat is None: + smat = operator.mul + elif smat == 'auto': + smat = smat_fcwb(alpha) + + if isinstance(smat, (str, os.PathLike)): + smat = pd.read_csv(smat, index_col=0) + + if isinstance(smat, pd.DataFrame): + smat = Lookup2d.from_dataframe(smat) + + if not callable(smat): + raise ValueError("smat should be a callable, a path, a pandas.DataFrame, or 'auto'") + + check_score_fn(smat) + + return smat From 92fc6324296aac48c8fed3e52078c546d53145f0 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Thu, 29 Apr 2021 13:16:51 +0100 Subject: [PATCH 20/22] Use parse_score_fn in synblast --- navis/nbl/synblast_funcs.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/navis/nbl/synblast_funcs.py b/navis/nbl/synblast_funcs.py index 2208b49b..17c39807 100644 --- a/navis/nbl/synblast_funcs.py +++ b/navis/nbl/synblast_funcs.py @@ -29,6 +29,7 @@ from ..core import NeuronList, BaseNeuron from .nblast_funcs import check_microns, find_optimal_partition, ScoringFunction +from .smat import parse_score_fn __all__ = ['synblast'] @@ -76,11 +77,7 @@ def __init__(self, normalized=True, by_type=True, self.progress = progress self.by_type = by_type - if smat == 'auto': - smat = pd.read_csv(f'{smat_path}/smat_fcwb.csv', - index_col=0) - - self.score_fn = ScoringFunction(smat) + self.score_fn = parse_score_fn(smat, False) self.self_hits = [] self.trees = [] From eef4b65b70bfb4235a698562dae6b0fff9cc30fc Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Thu, 29 Apr 2021 13:19:59 +0100 Subject: [PATCH 21/22] Fix lru_cache usage --- navis/nbl/smat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/navis/nbl/smat.py b/navis/nbl/smat.py index e61230f4..2dd77473 100644 --- a/navis/nbl/smat.py +++ b/navis/nbl/smat.py @@ -458,7 +458,7 @@ def from_dataframe(cls, df: pd.DataFrame, strict=False): return cls(boundaries, df.to_numpy()) -@lru_cache +@lru_cache(maxsize=None) def _smat_fcwb(alpha=False): # cached private function defers construction # until needed (speeding startup), From e8ecdfb97a31b47ecca0c5aed51937d5f7e181f9 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Thu, 29 Apr 2021 14:22:03 +0100 Subject: [PATCH 22/22] Expose some of smat.py to the nbl module --- navis/nbl/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/navis/nbl/__init__.py b/navis/nbl/__init__.py index 6ce105c8..d1e63f13 100644 --- a/navis/nbl/__init__.py +++ b/navis/nbl/__init__.py @@ -15,5 +15,6 @@ from .nblast_funcs import nblast, nblast_allbyall, nblast_smart from .synblast_funcs import synblast +from .smat import Lookup2d, smat_fcwb, parse_score_fn __all__ = ['nblast', 'nblast_allbyall', 'nblast_smart', 'synblast']