From a57b84d188050346d19c17f7d7a4688352d9bc80 Mon Sep 17 00:00:00 2001
From: James Logan <jlogan03@gmail.com>
Date: Sat, 3 Aug 2024 19:45:38 -0400
Subject: [PATCH 01/18] replace cuboid method with recursive

---
 interpn/CHANGELOG.md                   |  12 +-
 interpn/Cargo.toml                     |   2 +-
 interpn/README.md                      |  15 +-
 interpn/src/lib.rs                     |  17 +-
 interpn/src/multicubic/rectilinear.rs  |   2 +-
 interpn/src/multicubic/regular.rs      |   2 +
 interpn/src/multilinear/rectilinear.rs | 339 +++++++------------------
 interpn/src/multilinear/regular.rs     | 337 ++++++++----------------
 8 files changed, 222 insertions(+), 504 deletions(-)

diff --git a/interpn/CHANGELOG.md b/interpn/CHANGELOG.md
index ebe0d72..a1584ca 100644
--- a/interpn/CHANGELOG.md
+++ b/interpn/CHANGELOG.md
@@ -1,12 +1,8 @@
 # Changelog
-All notable changes to this project will be documented in this file.
 
-The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
-and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+## 0.4.3 - 2024-08-03
 
-## [Unreleased]
+## Changed
 
-## [0.1.0](https://github.com/jlogan03/interpn/releases/tag/v0.1.0) - 2023-11-22
-
-### Other
-- Squash early development
+* Use recursive method to evaluate multilinear interpolation instead of hypercube method
+  * This makes extrapolation cost consistent with interpolation cost
diff --git a/interpn/Cargo.toml b/interpn/Cargo.toml
index 16a7980..985306f 100644
--- a/interpn/Cargo.toml
+++ b/interpn/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "interpn"
-version = "0.4.2"
+version = "0.4.3"
 edition = "2021"
 authors = ["James Logan <jlogan03@gmail.com>"]
 license = "MIT OR Apache-2.0"
diff --git a/interpn/README.md b/interpn/README.md
index 5ebb837..adf9308 100644
--- a/interpn/README.md
+++ b/interpn/README.md
@@ -11,12 +11,15 @@ Cubic interpolations require two more degrees of freedom per dimension, which re
 Similar to the linear methods, depending on implementation, the constant term can vary by orders of magnitude,
 as can the RAM usage.
 
-| Method                        | RAM       | Interp. Cost (Best Case) | Interp. Cost (Worst Case)               | Extrap. Cost (Worst Case)                      |
-|-------------------------------|-----------|--------------------------|-----------------------------------------|------------------------------------------------|
-| multilinear::regular          | O(ndims)  | O(2^ndims * ndims)       | O(2^ndims * ndims)                      | O(2^ndims + ndims^2)                           |
-| multilinear::rectilinear      | O(ndims)  | O(2^ndims * ndims)       | O(ndims * (2^ndims + log2(gridsize)))   | O(ndims * (2^ndims + ndims + log2(gridsize)))  |
-| multicubic::regular           | O(ndims)  | O(4^ndims)               | O(4^ndims)                              | O(4^ndims)                                     |
-| multicubic::rectilinear       | O(ndims)  | O(4^ndims)               | O(4^ndims) + ndims * log2(gridsize)     | O(4^ndims) + ndims * log2(gridsize)            |
+Rectilinear methods perform a bisection search to find the relevant grid cell, which takes
+a worst-case number of iterations of log2(number of grid elements).
+
+| Method                        | RAM       | Interp. / Extrap. Cost       |
+|-------------------------------|-----------|------------------------------|
+| multilinear::regular          | O(ndims)  | O(2^ndims)                   |
+| multilinear::rectilinear      | O(ndims)  | O(2^ndims) + log2(gridsize)  |
+| multicubic::regular           | O(ndims)  | O(4^ndims)                   |
+| multicubic::rectilinear       | O(ndims)  | O(4^ndims) + log2(gridsize)  |
 
 # Example: Multilinear and Multicubic w/ Regular Grid
 ```rust
diff --git a/interpn/src/lib.rs b/interpn/src/lib.rs
index fb9edfd..f5a8647 100644
--- a/interpn/src/lib.rs
+++ b/interpn/src/lib.rs
@@ -9,13 +9,16 @@
 //! Cubic interpolations require two more degrees of freedom per dimension, and have a minimal runtime scaling of 4^ndims.
 //! Similar to the linear methods, depending on implementation, the constant term can vary by orders of magnitude,
 //! as can the RAM usage.
-//!
-//! | Method                        | RAM       | Interp. Cost (Best Case) | Interp. Cost (Worst Case)               | Extrap. Cost (Worst Case)                      |
-//! |-------------------------------|-----------|--------------------------|-----------------------------------------|------------------------------------------------|
-//! | multilinear::regular          | O(ndims)  | O(2^ndims * ndims)       | O(2^ndims * ndims)                      | O(2^ndims + ndims^2)                           |
-//! | multilinear::rectilinear      | O(ndims)  | O(2^ndims * ndims)       | O(ndims * (2^ndims + log2(gridsize)))   | O(ndims * (2^ndims + ndims + log2(gridsize)))  |
-//! | multicubic::regular           | O(ndims)  | O(4^ndims)               | O(4^ndims)                              | O(4^ndims)                                     |
-//! | multicubic::rectilinear       | O(ndims)  | O(4^ndims)               | O(4^ndims) + ndims * log2(gridsize)     | O(4^ndims) + ndims * log2(gridsize)            |
+//! 
+//! Rectilinear methods perform a bisection search to find the relevant grid cell, which takes
+//! a worst-case number of iterations of log2(number of grid elements).
+//!
+//! | Method                        | RAM       | Interp. / Extrap. Cost       |
+//! |-------------------------------|-----------|------------------------------|
+//! | multilinear::regular          | O(ndims)  | O(2^ndims)                   |
+//! | multilinear::rectilinear      | O(ndims)  | O(2^ndims) + log2(gridsize)  |
+//! | multicubic::regular           | O(ndims)  | O(4^ndims)                   |
+//! | multicubic::rectilinear       | O(ndims)  | O(4^ndims) + log2(gridsize)  |
 //!
 //! # Example: Multilinear and Multicubic w/ Regular Grid
 //! ```rust
diff --git a/interpn/src/multicubic/rectilinear.rs b/interpn/src/multicubic/rectilinear.rs
index 895b010..81df950 100644
--- a/interpn/src/multicubic/rectilinear.rs
+++ b/interpn/src/multicubic/rectilinear.rs
@@ -576,7 +576,7 @@ pub fn interpn<T: Float>(
 }
 
 // We can use the same rectilinear-grid method again
-pub use crate::multilinear::rectilinear::check_bounds;
+// pub use crate::multilinear::rectilinear::check_bounds;
 
 #[cfg(test)]
 mod test {
diff --git a/interpn/src/multicubic/regular.rs b/interpn/src/multicubic/regular.rs
index f0301ea..d6af543 100644
--- a/interpn/src/multicubic/regular.rs
+++ b/interpn/src/multicubic/regular.rs
@@ -282,6 +282,8 @@ impl<'a, T: Float, const MAXDIMS: usize> MulticubicRegular<'a, T, MAXDIMS> {
         }
 
         // Calculate normalized delta locations
+        // For the cubic method, the normalized coordinate `t` is always relative
+        // to cube index 1 (out of 0-3)
         for i in 0..ndims {
             let index_one_loc = self.starts[i]
                 + self.steps[i]
diff --git a/interpn/src/multilinear/rectilinear.rs b/interpn/src/multilinear/rectilinear.rs
index 0c74925..ed35ba6 100644
--- a/interpn/src/multilinear/rectilinear.rs
+++ b/interpn/src/multilinear/rectilinear.rs
@@ -1,29 +1,17 @@
 //! Multilinear interpolation/extrapolation on a rectilinear grid.
 //!
-//! This is a fairly literal implementation of the geometric interpretation
-//! of multilinear interpolation as an area- or volume- weighted average
-//! on the interior of the grid, and extends this metaphor to include
-//! extrapolation to off-grid points.
-//!
 //! While this method does not fully capitalize on vectorization,
 //! it results in fairly minimal instantaneous memory usage,
 //! and throughput performance is similar to existing methods.
 //!
 //! Operation Complexity
-//! * Interpolating or extrapolating in face regions goes like
-//!   * Best case: O(2^ndims * ndims) when evaluating points in neighboring grid cells.
-//!   * Worst case: O(ndims * (2^ndims + ndims * log2(gridsize))) when evaluating arbitrary points.
-//! * Extrapolating in corner regions goes like O(2^ndims * ndims^2).
+//! * O(2^ndims) for interpolation and extrapolation in all regions.
 //!
 //! Memory Complexity
 //! * Peak stack usage is O(MAXDIMS), which is minimally O(ndims).
 //!
 //! Timing
-//! * Timing determinism is not guaranteed due to the
-//!   difference in complexity between interpolation and extrapolation,
-//!   as well as due to the use of a bisection search for the grid index
-//!   location (which is itself not timing-deterministic) and the various
-//!   methods used to attempt to avoid that bisection search.
+//! * Timing determinism is very tight, but not guaranteed due to the use of a bisection search.
 //!
 //! ```rust
 //! use interpn::multilinear::rectilinear;
@@ -56,34 +44,26 @@ use num_traits::Float;
 
 /// An arbitrary-dimensional multilinear interpolator / extrapolator on a rectilinear grid.
 ///
-/// Unlike `RegularGridInterpolator`, this method does not handle
-/// grids with negative step size; all grids must be monotonically increasing.
-///
 /// Assumes C-style ordering of vals (z(x0, y0), z(x0, y1), ..., z(x0, yn), z(x1, y0), ...).
 /// Assumes grids are monotonically _increasing_. Checking this is expensive, and is
 /// left to the user.
 ///
-/// While the worst-case interpolation runtime for this method is somewhat worse than
-/// for the regular grid method, in the particular case where the sequence of points
-/// being evaluated are within 1 grid cell of each other, it can be significantly
-/// faster than the regular grid method due to bypassing the expensive process of
-/// finding the location of the relevant grid cell.
-///
 /// Operation Complexity
-/// * Interpolating or extrapolating in face regions goes like
-///   * Best case: O(2^ndims * ndims) when evaluating points in neighboring grid cells.
-///   * Worst case: O(ndims * (2^ndims + ndims * log2(gridsize))) when evaluating arbitrary points.
-/// * Extrapolating in corner regions goes like O(2^ndims * ndims^2).
+/// * O(2^ndims) for interpolation and extrapolation in all regions.
 ///
 /// Memory Complexity
 /// * Peak stack usage is O(MAXDIMS), which is minimally O(ndims).
+/// * While evaluation is recursive, the recursion has constant
+///   max depth of MAXDIMS, which provides a guarantee on peak
+///   memory usage.
 ///
 /// Timing
-/// * Timing determinism is not guaranteed due to the
-///   difference in complexity between interpolation and extrapolation,
-///   as well as due to the use of a bisection search for the grid index
-///   location (which is itself not timing-deterministic) and the various
-///   methods used to attempt to avoid that bisection search.
+/// * Timing determinism very tight, but is not exact due to the
+///   differences in calculations (but not complexity) between
+///   interpolation and extrapolation.
+/// * An interpolation-only variant of this algorithm could achieve
+///   near-deterministic timing, but would produce incorrect results
+///   when evaluated at off-grid points.
 pub struct MultilinearRectilinear<'a, T: Float, const MAXDIMS: usize> {
     /// x, y, ... coordinate grids, each entry of size dims[i]
     grids: &'a [&'a [T]],
@@ -98,19 +78,14 @@ pub struct MultilinearRectilinear<'a, T: Float, const MAXDIMS: usize> {
 impl<'a, T: Float, const MAXDIMS: usize> MultilinearRectilinear<'a, T, MAXDIMS> {
     /// Build a new interpolator, using O(MAXDIMS) calculations and storage.
     ///
-    /// This method does not handle degenerate dimensions with only a single
-    /// grid entry; all grids must have at least 2 entries.
+    /// This method does not handle degenerate dimensions; all grids must have at least 4 entries.
     ///
     /// Assumes C-style ordering of vals (z(x0, y0), z(x0, y1), ..., z(x0, yn), z(x1, y0), ...).
-    /// Assumes grids are monotonically _increasing_. Checking this is expensive, and is
-    /// left to the user.
     ///
     /// # Errors
     /// * If any input dimensions do not match
-    /// * If any dimensions have size < 2
-    /// * If any grid's first two entries are not monotonically increasing
-    ///   * This is a courtesy to catch _some_, but not all, cases where a non-monotonic
-    ///     or reversed-order grid is provided.
+    /// * If any dimensions have size < 4
+    /// * If any step sizes have zero or negative magnitude
     #[inline(always)]
     pub fn new(grids: &'a [&'a [T]], vals: &'a [T]) -> Result<Self, &'static str> {
         // Check dimensions
@@ -170,34 +145,29 @@ impl<'a, T: Float, const MAXDIMS: usize> MultilinearRectilinear<'a, T, MAXDIMS>
     /// Interpolate the value at a point,
     /// using fixed-size intermediate storage of O(ndims) and no allocation.
     ///
-    /// `origin` is a required mutable index store of minimum size `ndims`,
-    /// which allows bypassing expensive bisection searches in some cases.
-    /// It should be initialized to zero.
-    ///
     /// Assumes C-style ordering of vals (z(x0, y0), z(x0, y1), ..., z(x0, yn), z(x1, y0), ...).
     ///
     /// # Errors
     ///   * If the dimensionality of the point does not match the data
     ///   * If the dimensionality of either one exceeds the fixed maximum
-    ///   * If values in `origin` are initialized to an index outside the grid
+    ///   * If the index along any dimension exceeds the maximum representable
+    ///     integer value within the value type `T`
     #[inline(always)]
-    fn interp_one(&self, x: &[T]) -> Result<T, &'static str> {
+    pub fn interp_one(&self, x: &[T]) -> Result<T, &'static str> {
         // Check sizes
         let ndims = self.grids.len();
-        if x.len() != ndims {
+        if !(x.len() == ndims && ndims <= MAXDIMS) {
             return Err("Dimension mismatch");
         }
 
         // Initialize fixed-size intermediate storage.
-        // This storage _could_ be initialized with the interpolator struct, but
-        // this would then require that every usage of struct be `mut`, which is
-        // ergonomically unpleasant. Based on testing with the regular grid method,
-        // it would likely also be slower.
-        let origin = &mut [0; MAXDIMS][..ndims]; // Indices of lower corner of grid cell
-        let ioffs = &mut [false; MAXDIMS][..ndims]; // Offset index for selected vertex
-        let sat = &mut [0_u8; MAXDIMS][..ndims]; // Saturated-low flag
-        let dxs = &mut [T::zero(); MAXDIMS][..ndims]; // Sub-cell volume storage
-        let steps = &mut [T::zero(); MAXDIMS][..ndims]; // Step size on each dimension
+        // Maybe counterintuitively, initializing this storage here on every usage
+        // instead of once with the top level struct is a significant speedup
+        // and does not increase peak stack usage.
+        //
+        // Also notably, storing the index offsets as bool instead of usize
+        // reduces memory overhead, but has not effect on throughput rate.
+        let origin = &mut [0_usize; MAXDIMS][..ndims]; // Indices of lower corner of hypercub
         let dimprod = &mut [1_usize; MAXDIMS][..ndims];
 
         // Populate cumulative product of higher dimensions for indexing.
@@ -211,224 +181,101 @@ impl<'a, T: Float, const MAXDIMS: usize> MultilinearRectilinear<'a, T, MAXDIMS>
             acc *= self.dims[ndims - i - 1];
         }
 
-        // Populate lower corner
+        // Populate lower corner and saturation flag for each dimension
         for i in 0..ndims {
-            (origin[i], sat[i]) = self.get_loc(x[i], i)
+            origin[i] = self.get_loc(x[i], i)?;
         }
 
-        // Check if any dimension is saturated.
-        // This gives a ~15% overall speedup for points on the interior.
-        let any_dims_saturated = sat.iter().any(|&x| x != 0);
-
-        // Calculate the total volume of this cell
-        let cell_vol = self.get_cell(origin, steps);
-
-        // Traverse vertices, summing contributions to the interpolated value.
-        //
-        // This visits the 2^ndims elements of the cartesian product
-        // of `[0, 1] x ... x [0, 1]` using O(ndims) storage.
-        let mut interped = T::zero();
-        let nverts = 2_usize.pow(ndims as u32);
-        for i in 0..nverts {
-            let mut k: usize = 0; // index of the value for this vertex in self.vals
-
-            for j in 0..ndims {
-                // Every 2^nth vertex, flip which side of the cube we are examining
-                // in the nth dimension.
-                //
-                // Because i % 2^n has double the period for each sequential n,
-                // and their phase is only aligned once every 2^n for the largest
-                // n in the set, this is guaranteed to produce a path that visits
-                // each vertex exactly once.
-                let flip = i % 2_usize.pow(j as u32) == 0;
-                if flip {
-                    ioffs[j] = !ioffs[j];
-                }
-
-                // Accumulate the index into the value array,
-                // saturating to the bound if the resulting index would be outside.
-                k += dimprod[j] * (origin[j] + ioffs[j] as usize);
-
-                // Find the vector from the opposite vertex to the observation point
-                let iloc = origin[j] + !ioffs[j] as usize; // Index of location of opposite vertex
-                dxs[j] = (x[j] - self.grids[j][iloc]).abs(); // Loc. of opposite vertex
-            }
-
-            // Get the value at this vertex
-            let v = self.vals[k];
-
-            // Accumulate contribution from this vertex
-            // * Interpolating: just take the volume-weighted value and continue on
-            // * Extrapolating
-            //   * With opposite vertex on multiple extrapolated dims: return zero
-            //   * With opposite vertex on exactly one extrapolated dim
-            //     * Negate contribution & clip extrapolated region to maintain linearity
-            //   * Otherwise (meaning, corner regions)
-            //     * O(ndim^2) operation to accumulate only the linear portions of
-            //       the extrapolated volumes.
-            //
-            // While this section looks nearly identical between the regular grid
-            // and rectilinear methods, it is different in a few subtle but important
-            // ways, and separating it into shared functions makes it even harder
-            // to read than it already is.
-            if !any_dims_saturated {
-                // Interpolating
-                let vol = dxs[1..].iter().fold(dxs[0], |acc, x| acc * *x);
-                interped = interped + v * vol;
-            } else {
-                // Extrapolating requires some special attention.
-                let opsat = &mut [false; MAXDIMS][..ndims]; // Whether the opposite vertex is on the saturated bound
-                let thissat = &mut [false; MAXDIMS][..ndims]; // Whether the current vertex is on the saturated bound
-                let extrapdxs = &mut [T::zero(); MAXDIMS][..ndims]; // Extrapolated distances
-
-                let mut opsatcount = 0;
-                for j in 0..ndims {
-                    // For which dimensions is the opposite vertex on a saturated bound?
-                    opsat[j] = (!ioffs[j] && sat[j] == 2) || (ioffs[j] && sat[j] == 1);
-                    // For how many total dimensions is the opposite vertex on a saturated bound?
-                    opsatcount += opsat[j] as usize;
-
-                    // For which dimensions is the current vertex on a saturated bound?
-                    thissat[j] = sat[j] > 0 && !opsat[j];
-                }
-
-                // If the opposite vertex is on _more_ than one saturated bound,
-                // it should be clipped on multiple axes which, if the clipping
-                // were implemented in a general constructive geometry way, would
-                // result in a zero volume. Since we only deal in the difference
-                // in position between vertices and the observation point, our
-                // clipping method would not properly set this volume to zero,
-                // and we need to implement that behavior with explicit logic.
-                let zeroed = opsatcount > 1;
-                if zeroed {
-                    // No contribution from this vertex
-                    continue;
-                }
-
-                // If the opposite vertex is on exactly one saturated bound, negate its contribution
-                // in order to move smoothly from weighted-average on the interior to extrapolation
-                // on the exterior.
-                //
-                // If the opposite vertex is on exactly one saturated bound,
-                // allow the dx on that dimension to be as large as needed,
-                // but clip the dx on other saturated dimensions so that we
-                // don't produce an overlapping partition in outside-corner regions.
-                let neg = opsatcount == 1;
-                if neg {
-                    for j in 0..ndims {
-                        if thissat[j] {
-                            dxs[j] = dxs[j].min(steps[j]);
-                        }
-                    }
-
-                    let vol = dxs[1..].iter().fold(dxs[0], |acc, x| acc * *x).neg();
-                    interped = interped + v * vol;
-                    continue;
-                }
-
-                // See `RegularGridInterpolator` for more details about the rationale
-                // for this section, which handles extrapolation in corner regions.
-
-                // Get the volume that is inside the cell
-                //   Copy forward the original dxs, extrapolated or not,
-                //   and clip to the cell boundary
-                (0..ndims).for_each(|j| extrapdxs[j] = dxs[j].min(steps[j]));
-                //   Get the volume of this region which does not extend outside the cell
-                let vinterior = extrapdxs[1..].iter().fold(extrapdxs[0], |acc, x| acc * *x);
-
-                // Find each linear exterior region by, one at a time, taking the volume
-                // with one extrapolated dimension masked into the extrapdxs
-                // which are otherwise clipped to the interior region.
-                let mut vexterior = T::zero();
-                for j in 0..ndims {
-                    if thissat[j] {
-                        let dx_was = extrapdxs[j];
-                        extrapdxs[j] = dxs[j] - steps[j];
-                        vexterior =
-                            vexterior + extrapdxs[1..].iter().fold(extrapdxs[0], |acc, x| acc * *x);
-                        extrapdxs[j] = dx_was; // Reset extrapdxs to original state for next calc
-                    }
-                }
+        // Recursive interpolation of one dependency tree at a time
+        // let loc = &origin; // Starting location in the tree is the origin
+        let dim = ndims; // Start from the end and recurse back to zero
+        let loc = &mut [0_usize; MAXDIMS][..ndims];
+        loc.copy_from_slice(origin);
+        let interped = self.populate(dim, origin, loc, dimprod, x);
 
-                let vol = vexterior + vinterior;
-                interped = interped + v * vol;
-            }
-        }
-
-        Ok(interped / cell_vol)
+        Ok(interped)
     }
 
-    /// Get the next-lower-or-exact index along this dimension where `x` is found,
-    /// saturating to the bounds at the edges if the point is outside.
+    /// Get the two-lower index along this dimension where `x` is found,
+    /// saturating to the bounds at the edges if necessary.
     ///
-    /// At the high bound of a given dimension, saturates to the next-most-internal
-    /// point in order to capture a full cube, then saturates to 0 if the resulting
-    /// index would be off the grid (meaning, if a dimension has size one).
+    /// At the high bound of a given dimension, saturates to the fourth internal
+    /// point in order to capture a full 4-cube.
     ///
     /// Returned value like (lower_corner_index, saturation_flag).
-    ///
-    /// Saturation flag
-    /// * 0 => inside
-    /// * 1 => low
-    /// * 2 => high
-    ///
-    /// Unfortunately, using a repr(u8) enum for the saturation flag
-    /// causes a significant perf hit.
     #[inline(always)]
-    fn get_loc(&self, v: T, dim: usize) -> (usize, u8) {
+    fn get_loc(&self, v: T, dim: usize) -> Result<usize, &'static str> {
         let grid = self.grids[dim];
-        let saturation: u8; // Saturated low/high/not at all
-                            // Signed integer index location of this point
 
         // Bisection search to find location on the grid.
         //
         // The search will return `0` if the point is outside-low,
         // and will return `self.dims[dim]` if outside-high.
         //
-        // We still need to convert to a signed integer here, because
-        // if the point is extrapolated below the grid,
-        // we may need to (briefly) represent a negative index.
-        //
         // This process accounts for essentially the entire difference in
         // performance between this method and the regular-grid method.
-        let iloc: isize = grid.partition_point(|x| *x < v) as isize - 1;
-        let dimmax = self.dims[dim] - 2; // maximum index for lower corner
-        let loc: usize = (iloc.max(0) as usize).min(dimmax); // unsigned integer loc clipped to interior
+        let iloc: isize = grid.partition_point(|x| *x < v) as isize - 2;
 
-        // Observation point is outside the grid on the low side
-        if iloc < 0 {
-            saturation = 1;
-        }
-        // Observation point is outside the grid on the high side
-        else if iloc > dimmax as isize {
-            saturation = 2;
-        }
-        // Observation point is on the interior
-        else {
-            saturation = 0;
-        }
+        let n = self.dims[dim] as isize; // Number of grid points on this dimension
+        let dimmax = n.saturating_sub(2).max(0); // maximum index for lower corner
+        let loc: usize = iloc.max(0).min(dimmax) as usize; // unsigned integer loc clipped to interior
 
-        (loc, saturation)
+        Ok(loc)
     }
 
-    /// Get the volume of the grid prism with `origin` as its lower corner
-    /// and output the step sizes for this cell as well.
-    #[inline(always)]
-    fn get_cell(&self, origin: &[usize], steps: &mut [T]) -> T {
-        let ndims = self.grids.len();
-        for i in 0..ndims {
-            // Index of upper face (saturating to bounds)
-            let j = origin[i] + 1;
-            steps[i] = self.grids[i][j] - self.grids[i][origin[i]];
+    /// Recursive evaluation of interpolant on each dimension
+    #[inline]
+    fn populate(
+        &self,
+        dim: usize,
+        origin: &[usize],
+        loc: &mut [usize],
+        dimprod: &[usize],
+        x: &[T],
+    ) -> T {
+        // Do the calc for this entry
+        match dim {
+            // If we have arrived at a leaf, index into data
+            0 => index_arr(loc, dimprod, self.vals),
+
+            // Otherwise, continue recursion
+            _ => {
+                // Keep track of where we are in the tree
+                // so that we can index into the value array properly
+                // when we reach the leaves
+                let next_dim = dim - 1;
+
+                // Populate next dim's values
+                let mut vals = [T::zero(); 2];
+                for i in 0..2 {
+                    loc[next_dim] = origin[next_dim] + i;
+                    vals[i] = self.populate(next_dim, origin, loc, dimprod, x);
+                }
+                loc[next_dim] = origin[next_dim]; // Reset for next usage
+
+                // Interpolate on next dim's values to populate an entry in this dim
+                let grid_cell = &self.grids[next_dim][origin[next_dim]..origin[next_dim] + 2];
+                let step = grid_cell[1] - grid_cell[0];
+                let t = (x[next_dim] - grid_cell[0]) / step;
+                let dy = vals[1] - vals[0];
+                vals[0] + t * dy
+            }
         }
+    }
+}
 
-        let cell_vol = steps[1..ndims].iter().fold(steps[0], |acc, x| acc * *x);
-
-        cell_vol
+/// Index a single value from an array
+#[inline(always)]
+fn index_arr<T: Copy>(loc: &[usize], dimprod: &[usize], data: &[T]) -> T {
+    let mut i = 0;
+    for j in 0..dimprod.len() {
+        i += loc[j] * dimprod[j];
     }
+
+    data[i]
 }
 
-/// Evaluate multilinear interpolation on a regular grid in up to 10 dimensions.
+/// Evaluate multicubic interpolation on a regular grid in up to 8 dimensions.
 /// Assumes C-style ordering of vals (z(x0, y0), z(x0, y1), ..., z(x0, yn), z(x1, y0), ...).
 ///
 /// This is a convenience function; best performance will be achieved by using the exact right
diff --git a/interpn/src/multilinear/regular.rs b/interpn/src/multilinear/regular.rs
index 6f9c448..2f98f7e 100644
--- a/interpn/src/multilinear/regular.rs
+++ b/interpn/src/multilinear/regular.rs
@@ -10,18 +10,14 @@
 //! and throughput performance is similar to existing methods.
 //!
 //! Operation Complexity
-//! * Interpolating or extrapolating in face regions goes like O(2^ndims * ndims).
-//! * Extrapolating in corner regions goes like O(2^ndims * ndims^2).
+//! * O(2^ndims) for interpolation and extrapolation in all regions.
 //!
 //! Memory Complexity
 //! * Peak stack usage is O(MAXDIMS), which is minimally O(ndims).
 //!
 //! Timing
-//! * Timing determinism is not guaranteed due to the
-//!   difference in complexity between interpolation and extrapolation.
-//! * An interpolation-only variant of this algorithm could achieve
-//!   near-deterministic timing, but would produce incorrect results
-//!   when evaluated at off-grid points.
+//! * Timing determinism is guaranteed to the extent that floating-point calculation timing is consistent.
+//!   That said, floating-point calculations can take a different number of clock-cycles depending on numerical values.
 //!
 //! ```rust
 //! use interpn::multilinear::regular;
@@ -54,23 +50,27 @@
 //! ```
 //!
 //! References
-//! * https://en.wikipedia.org/wiki/Bilinear_interpolation#Weighted_mean
+//! * https://en.wikipedia.org/wiki/Bilinear_interpolation#Repeated_linear_interpolation
 use num_traits::{Float, NumCast};
 
 /// An arbitrary-dimensional multilinear interpolator / extrapolator on a regular grid.
 ///
 /// Assumes C-style ordering of vals (z(x0, y0), z(x0, y1), ..., z(x0, yn), z(x1, y0), ...).
 ///
+///
 /// Operation Complexity
-/// * Interpolating or extrapolating in face regions goes like O(2^ndims).
-/// * Extrapolating in corner regions goes like O(2^ndims * ndims^2).
+/// * O(2^ndims) for interpolation and extrapolation in all regions.
 ///
 /// Memory Complexity
 /// * Peak stack usage is O(MAXDIMS), which is minimally O(ndims).
+/// * While evaluation is recursive, the recursion has constant
+///   max depth of MAXDIMS, which provides a guarantee on peak
+///   memory usage.
 ///
 /// Timing
-/// * Timing determinism is not guaranteed due to the
-///   difference in complexity between interpolation and extrapolation.
+/// * Timing determinism very tight, but is not exact due to the
+///   differences in calculations (but not complexity) between
+///   interpolation and extrapolation.
 /// * An interpolation-only variant of this algorithm could achieve
 ///   near-deterministic timing, but would produce incorrect results
 ///   when evaluated at off-grid points.
@@ -94,8 +94,7 @@ pub struct MultilinearRegular<'a, T: Float, const MAXDIMS: usize> {
 impl<'a, T: Float, const MAXDIMS: usize> MultilinearRegular<'a, T, MAXDIMS> {
     /// Build a new interpolator, using O(MAXDIMS) calculations and storage.
     ///
-    /// This method does not handle degenerate dimensions with only a single
-    /// grid entry; all grids must have at least 2 entries.
+    /// This method does not handle degenerate dimensions; all grids must have at least 2 entries.
     ///
     /// Assumes C-style ordering of vals (z(x0, y0), z(x0, y1), ..., z(x0, yn), z(x1, y0), ...).
     ///
@@ -108,7 +107,7 @@ impl<'a, T: Float, const MAXDIMS: usize> MultilinearRegular<'a, T, MAXDIMS> {
         dims: &[usize],
         starts: &[T],
         steps: &[T],
-        vals: &'a [T],
+        vals: &'a [T]
     ) -> Result<Self, &'static str> {
         // Check dimensions
         let ndims = dims.len();
@@ -117,7 +116,7 @@ impl<'a, T: Float, const MAXDIMS: usize> MultilinearRegular<'a, T, MAXDIMS> {
             return Err("Dimension mismatch");
         }
 
-        // Make sure all dimensions have at least two entries
+        // Make sure all dimensions have at least four entries
         let degenerate = dims[..ndims].iter().any(|&x| x < 2);
         if degenerate {
             return Err("All grids must have at least two entries");
@@ -204,11 +203,8 @@ impl<'a, T: Float, const MAXDIMS: usize> MultilinearRegular<'a, T, MAXDIMS> {
         //
         // Also notably, storing the index offsets as bool instead of usize
         // reduces memory overhead, but has not effect on throughput rate.
-        let steps = &self.steps[..ndims]; // Step size for each dimension
-        let origin = &mut [0_usize; MAXDIMS][..ndims]; // Indices of lower corner of hypercube
-        let ioffs = &mut [false; MAXDIMS][..ndims]; // Offset index for selected vertex
-        let sat = &mut [0_u8; MAXDIMS][..ndims]; // Saturation none/high/low flags for each dim
-        let dxs = &mut [T::zero(); MAXDIMS][..ndims]; // Sub-cell volume storage
+        let origin = &mut [0_usize; MAXDIMS][..ndims]; // Indices of lower corner of hypercub
+        let dts = &mut [T::zero(); MAXDIMS][..ndims]; // Normalized coordinate storage
         let dimprod = &mut [1_usize; MAXDIMS][..ndims];
 
         // Populate cumulative product of higher dimensions for indexing.
@@ -222,240 +218,104 @@ impl<'a, T: Float, const MAXDIMS: usize> MultilinearRegular<'a, T, MAXDIMS> {
             acc *= self.dims[ndims - i - 1];
         }
 
-        // Compute volume of reference cell.
-        // Maybe counterintuitively, doing this calculation for every call
-        // is as fast or faster than doing it once at struct initialization
-        // then referring to the stored value.
-        let cell_vol = steps[1..].iter().fold(steps[0], |acc, x| acc * *x);
-
         // Populate lower corner and saturation flag for each dimension
         for i in 0..ndims {
-            (origin[i], sat[i]) = self.get_loc(x[i], i)?;
+            origin[i] = self.get_loc(x[i], i)?;
         }
 
-        // Check if any dimension is saturated.
-        let any_dims_saturated = sat.iter().any(|&x| x != 0);
-
-        // Traverse vertices, summing contributions to the interpolated value.
-        //
-        // This visits the 2^ndims elements of the cartesian product
-        // of `[0, 1] x ... x [0, 1]` without simultaneously actualizing them in storage.
-        let mut interped = T::zero();
-        let nverts = 2_usize.pow(ndims as u32);
-        for i in 0..nverts {
-            let mut k: usize = 0; // index of the value for this vertex in self.vals
-
-            for j in 0..ndims {
-                // Every 2^nth vertex, flip which side of the cube we are examining
-                // in the nth dimension.
-                //
-                // Because i % 2^n has double the period for each sequential n,
-                // and their phase is only aligned once every 2^n for the largest
-                // n in the set, this is guaranteed to produce a path that visits
-                // each vertex exactly once.
-                let flip = i % 2_usize.pow(j as u32) == 0;
-                if flip {
-                    ioffs[j] = !ioffs[j];
-                }
-
-                // Accumulate the index into the value array,
-                // saturating to the bound if the resulting index would be outside.
-                k += dimprod[j] * (origin[j] + ioffs[j] as usize);
-
-                // Find the vector from the opposite vertex to the observation point
-                let iloc = origin[j] + !ioffs[j] as usize; // Index of location of opposite vertex
-                let floc = T::from(iloc);
-                match floc {
-                    Some(floc) => {
-                        let loc = self.starts[j] + steps[j] * floc; // Loc. of opposite vertex
-                        dxs[j] = (x[j] - loc).abs(); // Use dxs[j] as storage for float locs
-                    }
-                    None => return Err("Unrepresentable coordinate value"),
-                }
-            }
-
-            // Get the value at this vertex
-            let v = self.vals[k];
-
-            // Accumulate contribution from this vertex
-            // * Interpolating: just take the volume-weighted value and continue on
-            // * Extrapolating
-            //   * With opposite vertex on multiple extrapolated dims: return zero
-            //   * With opposite vertex on exactly one extrapolated dim
-            //     * Negate contribution & clip extrapolated region to maintain linearity
-            //   * Otherwise (meaning, corner regions)
-            //     * O(ndim^2) operation to accumulate only the linear portions of
-            //       the extrapolated volumes.
-            //
-            // While this section looks nearly identical between the regular grid
-            // and rectilinear methods, it is different in a few subtle but important
-            // ways, and separating it into shared functions makes it even harder
-            // to read than it already is.
-            if !any_dims_saturated {
-                // Interpolating
-                let vol = dxs[1..].iter().fold(dxs[0], |acc, x| acc * *x);
-                interped = interped + v * vol;
-            } else {
-                // Extrapolating requires some special attention.
-                let opsat = &mut [false; MAXDIMS][..ndims]; // Whether the opposite vertex is on the saturated bound
-                let thissat = &mut [false; MAXDIMS][..ndims]; // Whether the current vertex is on the saturated bound
-                let extrapdxs = &mut [T::zero(); MAXDIMS][..ndims]; // Extrapolated distances
-
-                let mut opsatcount = 0;
-                for j in 0..ndims {
-                    // For which dimensions is the opposite vertex on a saturated bound?
-                    opsat[j] = (!ioffs[j] && sat[j] == 2) || (ioffs[j] && sat[j] == 1);
-                    // For how many total dimensions is the opposite vertex on a saturated bound?
-                    opsatcount += opsat[j] as usize;
-
-                    // For which dimensions is the current vertex on a saturated bound?
-                    thissat[j] = sat[j] > 0 && !opsat[j];
-                }
-
-                // If the opposite vertex is on _more_ than one saturated bound,
-                // it should be clipped on multiple axes which, if the clipping
-                // were implemented in a general constructive geometry way, would
-                // result in a zero volume. Since we only deal in the difference
-                // in position between vertices and the observation point, our
-                // clipping method would not properly set this volume to zero,
-                // and we need to implement that behavior with explicit logic.
-                let zeroed = opsatcount > 1;
-                if zeroed {
-                    // No contribution from this vertex
-                    continue;
-                }
-
-                // If the opposite vertex is on exactly one saturated bound, negate its contribution
-                // in order to move smoothly from weighted-average on the interior to extrapolation
-                // on the exterior.
-                //
-                // If the opposite vertex is on exactly one saturated bound,
-                // allow the dx on that dimension to be as large as needed,
-                // but clip the dx on other saturated dimensions so that we
-                // don't produce an overlapping partition in outside-corner regions.
-                let neg = opsatcount == 1;
-                if neg {
-                    for j in 0..ndims {
-                        if thissat[j] {
-                            dxs[j] = dxs[j].min(steps[j]);
-                        }
-                    }
-
-                    let vol = dxs[1..].iter().fold(dxs[0], |acc, x| acc * *x).neg();
-                    interped = interped + v * vol;
-                    continue;
-                }
-
-                // If this vertex is on multiple saturated bounds, then the prism formed by the
-                // opposite vertex and the observation point will be extrapolated in more than
-                // one dimension, which will produce some regions with volume that scales
-                // nonlinearly with the position of the observation point.
-                // We need to restore linearity without resorting to using the recursive algorithm
-                // which would drive us to actualize (2^(n-1) * ndims) float values simultaneously.
-                //
-                // To do this, we can subtract the nonlinear regions' volume from the total
-                // volume of the opposite-to-observation prism for this vertex.
-                //
-                // Put differently - find the part of the volume that is scaling non-linearly
-                // in the coordinates, and bookkeep it to be removed entirely later.
-                //
-                // For one dimension, there are no such regions. For two dimensions, only the
-                // corner region contributes. For higher dimensions, there are increasing
-                // numbers of types of regions that appear, so we need a relatively general
-                // way of handling this without knowing all of those types of region.
-                //
-                // One way of circumventing the need to enumerate types of nonlinear region
-                // is to capitalize on the observation that the _linear_ regions are all of the
-                // same form, even in higher dimensions. We can traverse those instead,
-                // subtracting each one from the full extrapolated volume for this vertex
-                // until what's left is only the part that we want to remove. Then, we can
-                // remove that part and keep the rest.
-                //
-                // Continuing from that thought, we can skip evaluating the nonlinear portions
-                // entirely, by evaluating the interior portion and each linear exterior portion
-                // (which we needed to evaluate to remove them from the enclosing volume anyway)
-                // then summing the linear portions together directly. This avoids the loss of
-                // float precision that can come from addressing the nonlinear regions directly,
-                // as this can cause us to add some very large and very small numbers together
-                // in an order that is not necessarily favorable.
-
-                // Get the volume that is inside the cell
-                //   Copy forward the original dxs, extrapolated or not,
-                //   and clip to the cell boundary
-                (0..ndims).for_each(|j| extrapdxs[j] = dxs[j].min(steps[j]));
-                //   Get the volume of this region which does not extend outside the cell
-                let vinterior = extrapdxs[1..].iter().fold(extrapdxs[0], |acc, x| acc * *x);
-
-                // Find each linear exterior region by, one at a time, taking the volume
-                // with one extrapolated dimension masked into the extrapdxs
-                // which are otherwise clipped to the interior region.
-                let mut vexterior = T::zero();
-                for j in 0..ndims {
-                    if thissat[j] {
-                        let dx_was = extrapdxs[j];
-                        extrapdxs[j] = dxs[j] - steps[j];
-                        vexterior =
-                            vexterior + extrapdxs[1..].iter().fold(extrapdxs[0], |acc, x| acc * *x);
-                        extrapdxs[j] = dx_was; // Reset extrapdxs to original state for next calc
-                    }
-                }
-
-                let vol = vexterior + vinterior;
-                interped = interped + v * vol;
-            }
+        // Calculate normalized delta locations
+        for i in 0..ndims {
+            let index_zero_loc = self.starts[i]
+                + self.steps[i]
+                    * <T as NumCast>::from(origin[i])
+                        .ok_or("Unrepresentable coordinate value")?;
+            dts[i] = (x[i] - index_zero_loc) / self.steps[i];
         }
 
-        Ok(interped / cell_vol)
+        // Recursive interpolation of one dependency tree at a time
+        // let loc = &origin; // Starting location in the tree is the origin
+        let dim = ndims; // Start from the end and recurse back to zero
+        let loc = &mut [0_usize; MAXDIMS][..ndims];
+        loc.copy_from_slice(origin);
+        let interped = self.populate(dim, origin, loc, dimprod, dts);
+
+        Ok(interped)
     }
 
-    /// Get the next-lower-or-exact index along this dimension where `x` is found,
-    /// saturating to the bounds at the edges if the point is outside.
+    /// Get the two-lower index along this dimension where `x` is found,
+    /// saturating to the bounds at the edges if necessary.
     ///
-    /// At the high bound of a given dimension, saturates to the next-most-internal
-    /// point in order to capture a full cube, then saturates to 0 if the resulting
-    /// index would be off the grid (meaning, if a dimension has size one).
+    /// At the high bound of a given dimension, saturates to the fourth internal
+    /// point in order to capture a full 4-cube.
     ///
     /// Returned value like (lower_corner_index, saturation_flag).
-    ///
-    /// Saturation flag
-    /// * 0 => inside
-    /// * 1 => low
-    /// * 2 => high
-    ///
-    /// Unfortunately, using a repr(u8) enum for the saturation flag
-    /// causes a significant perf hit.
     #[inline(always)]
-    fn get_loc(&self, v: T, dim: usize) -> Result<(usize, u8), &'static str> {
-        let saturation: u8; // Saturated low/high/not at all
-
+    fn get_loc(&self, v: T, dim: usize) -> Result<usize, &'static str> {
         let floc = ((v - self.starts[dim]) / self.steps[dim]).floor(); // float loc
-        let iloc = <isize as NumCast>::from(floc); // signed integer loc
+                                                                       // Signed integer loc, with the bottom of the cell aligned to place the normalized
+                                                                       // coordinate t=0 at cell index 1
+        let iloc = <isize as NumCast>::from(floc).ok_or("Unrepresentable coordinate value")? - 1;
 
-        match iloc {
-            Some(iloc) => {
-                let dimmax = self.dims[dim] - 2; // maximum index for lower corner
-                let loc: usize = (iloc.max(0) as usize).min(dimmax); // unsigned integer loc clipped to interior
+        let n = self.dims[dim] as isize; // Number of grid points on this dimension
+        let dimmax = n.saturating_sub(2).max(0); // maximum index for lower corner
+        let loc: usize = iloc.max(0).min(dimmax) as usize; // unsigned integer loc clipped to interior
 
-                // Observation point is outside the grid on the low side
-                if iloc < 0 {
-                    saturation = 1;
-                }
-                // Observation point is outside the grid on the high side
-                else if iloc > dimmax as isize {
-                    saturation = 2;
-                }
-                // Observation point is on the interior
-                else {
-                    saturation = 0;
+        Ok(loc)
+    }
+
+    /// Recursive evaluation of interpolant on each dimension
+    #[inline]
+    fn populate(
+        &self,
+        dim: usize,
+        origin: &[usize],
+        loc: &mut [usize],
+        dimprod: &[usize],
+        dts: &[T],
+    ) -> T {
+        // Do the calc for this entry
+        match dim {
+            // If we have arrived at a leaf, index into data
+            0 => index_arr(loc, dimprod, self.vals),
+
+            // Otherwise, continue recursion
+            _ => {
+                // Keep track of where we are in the tree
+                // so that we can index into the value array properly
+                // when we reach the leaves
+                let next_dim = dim - 1;
+
+                // Populate next dim's values
+                let mut vals = [T::zero(); 2];
+                for i in 0..2 {
+                    loc[next_dim] = origin[next_dim] + i;
+                    vals[i] = self.populate(next_dim, origin, loc, dimprod, dts);
                 }
+                loc[next_dim] = origin[next_dim]; // Reset for next usage
 
-                Ok((loc, saturation))
+                // Interpolate on next dim's values to populate an entry in this dim
+                let y0 = vals[0];
+                let dy = vals[1] - vals[0];
+                let t = dts[next_dim];
+                y0 + t * dy
             }
-            None => Err("Unrepresentable coordinate value"),
         }
     }
 }
 
+
+/// Index a single value from an array
+#[inline(always)]
+fn index_arr<T: Copy>(loc: &[usize], dimprod: &[usize], data: &[T]) -> T {
+    let mut i = 0;
+    for j in 0..dimprod.len() {
+        i += loc[j] * dimprod[j];
+    }
+
+    data[i]
+}
+
+
 /// Evaluate multilinear interpolation on a regular grid in up to 8 dimensions.
 /// Assumes C-style ordering of vals (z(x0, y0), z(x0, y1), ..., z(x0, yn), z(x1, y0), ...).
 ///
@@ -479,6 +339,7 @@ pub fn interpn<T: Float>(
     Ok(())
 }
 
+
 /// Check whether a list of observation points are inside the grid within some absolute tolerance.
 /// Assumes the grid is valid for the rectilinear interpolator (monotonically increasing).
 ///
@@ -569,7 +430,13 @@ mod test {
             // Check that interpolated values match expectation,
             // using an absolute difference because some points are very close to or exactly at zero,
             // and do not do well under a check on relative difference.
-            (0..uobs.len()).for_each(|i| assert!((out[i] - uobs[i]).abs() < 1e-12));
+
+            (0..uobs.len()).for_each(|i| {
+                let outi = out[i];
+                let uobsi = uobs[i];
+                println!("{outi} {uobsi}");
+                assert!((out[i] - uobs[i]).abs() < 1e-12)
+            });
         }
     }
-}
+}
\ No newline at end of file

From 03030dd232258a985f4a4ca182fe1b090165beba Mon Sep 17 00:00:00 2001
From: James Logan <jlogan03@gmail.com>
Date: Sat, 3 Aug 2024 19:54:53 -0400
Subject: [PATCH 02/18] backfill changelog

---
 interpn/CHANGELOG.md | 45 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/interpn/CHANGELOG.md b/interpn/CHANGELOG.md
index a1584ca..48c0581 100644
--- a/interpn/CHANGELOG.md
+++ b/interpn/CHANGELOG.md
@@ -6,3 +6,48 @@
 
 * Use recursive method to evaluate multilinear interpolation instead of hypercube method
   * This makes extrapolation cost consistent with interpolation cost
+
+## 0.4.2 - 2024-05-12
+
+## Added
+
+* Implement cubic rectilinear method
+
+## 0.4.1 - 2024-05-06
+
+## Fixed
+
+* Fix grid cell index selection to properly center the grid cell s.t. t=0 corresponds to index 1
+
+## Added
+
+* Add test of cubic method against sine function to capture potential problems not visible when testing against linear and quadratic functions
+
+## 0.4.0 - 2024-05-05
+
+## Changes
+
+* Implement cubic interpolation for regular grid
+    * Continuous first derivative everywhere
+    * Option to clamp to linear extrapolation to prevent unwanted extrapolation of local curvature
+        * Test linearized method against linear function in both interpolation and extrapolation
+        * Test un-linearized method against quadratic function in both interpolation and extrapolation
+
+## 0.3.0 - 2023-12-17
+
+## Changed
+
+* Remove initial guess for cell index from rectilinear method
+* Collapse some loops
+* Remove support for negative step sizes for regular grid in favor of reducing number of abs() calls
+* Remove some saturating sub calls that are not needed now that degenerate grids are not supported
+* Get indexing dimension product in the same way for rectilinear method as for regular grid method
+* Use better initial value for folds
+* Update docs
+* Use optimizations for tests because it's faster overall
+
+## 0.2.0 - 2023-12-06
+
+## Changed
+
+* Propagate Result everywhere that unwrap or assert! was being used

From eda24b40c51f1dc6e8298b3892ed663376988d61 Mon Sep 17 00:00:00 2001
From: James Logan <jlogan03@gmail.com>
Date: Sun, 4 Aug 2024 11:29:06 -0400
Subject: [PATCH 03/18] touch up docs. implement  convenience functions that
 allocate for the output. pull convenience functions to the top of each
 module.

---
 README.md                              |  15 +-
 interpn/CHANGELOG.md                   |   2 +
 interpn/src/multicubic/mod.rs          |   5 +-
 interpn/src/multicubic/rectilinear.rs  | 110 ++++++--------
 interpn/src/multicubic/regular.rs      | 115 ++++++--------
 interpn/src/multilinear/rectilinear.rs | 153 ++++++++++---------
 interpn/src/multilinear/regular.rs     | 201 ++++++++++++-------------
 7 files changed, 272 insertions(+), 329 deletions(-)

diff --git a/README.md b/README.md
index 5ebb837..adf9308 100644
--- a/README.md
+++ b/README.md
@@ -11,12 +11,15 @@ Cubic interpolations require two more degrees of freedom per dimension, which re
 Similar to the linear methods, depending on implementation, the constant term can vary by orders of magnitude,
 as can the RAM usage.
 
-| Method                        | RAM       | Interp. Cost (Best Case) | Interp. Cost (Worst Case)               | Extrap. Cost (Worst Case)                      |
-|-------------------------------|-----------|--------------------------|-----------------------------------------|------------------------------------------------|
-| multilinear::regular          | O(ndims)  | O(2^ndims * ndims)       | O(2^ndims * ndims)                      | O(2^ndims + ndims^2)                           |
-| multilinear::rectilinear      | O(ndims)  | O(2^ndims * ndims)       | O(ndims * (2^ndims + log2(gridsize)))   | O(ndims * (2^ndims + ndims + log2(gridsize)))  |
-| multicubic::regular           | O(ndims)  | O(4^ndims)               | O(4^ndims)                              | O(4^ndims)                                     |
-| multicubic::rectilinear       | O(ndims)  | O(4^ndims)               | O(4^ndims) + ndims * log2(gridsize)     | O(4^ndims) + ndims * log2(gridsize)            |
+Rectilinear methods perform a bisection search to find the relevant grid cell, which takes
+a worst-case number of iterations of log2(number of grid elements).
+
+| Method                        | RAM       | Interp. / Extrap. Cost       |
+|-------------------------------|-----------|------------------------------|
+| multilinear::regular          | O(ndims)  | O(2^ndims)                   |
+| multilinear::rectilinear      | O(ndims)  | O(2^ndims) + log2(gridsize)  |
+| multicubic::regular           | O(ndims)  | O(4^ndims)                   |
+| multicubic::rectilinear       | O(ndims)  | O(4^ndims) + log2(gridsize)  |
 
 # Example: Multilinear and Multicubic w/ Regular Grid
 ```rust
diff --git a/interpn/CHANGELOG.md b/interpn/CHANGELOG.md
index 48c0581..dfe69b3 100644
--- a/interpn/CHANGELOG.md
+++ b/interpn/CHANGELOG.md
@@ -6,6 +6,8 @@
 
 * Use recursive method to evaluate multilinear interpolation instead of hypercube method
   * This makes extrapolation cost consistent with interpolation cost
+  * Shows about 2x slower perf in micro-benchmarks, but about 10x faster in end-to-end benchmarks after the Python bindings
+* Reduce repeated documentation
 
 ## 0.4.2 - 2024-05-12
 
diff --git a/interpn/src/multicubic/mod.rs b/interpn/src/multicubic/mod.rs
index 893ec78..4ac9e2f 100644
--- a/interpn/src/multicubic/mod.rs
+++ b/interpn/src/multicubic/mod.rs
@@ -2,8 +2,9 @@
 //!
 //! For interior points, this method gives the same result as
 //! a Hermite spline interpolation, with continuous first derivatives
-//! across the cell boundaries. Under extrapolation, the higher-order
-//! terms are dropped and linear extrapolation is used.
+//! across the cell boundaries. Under extrapolation, either quadratic
+//! or linear extrapolation is used depending on configuration, and
+//! both maintain continuous first derivatives everywhere.
 //!
 //! The solution on interior points is notably different from B-splines,
 //! where the exact values of the first and second derivatives are taken
diff --git a/interpn/src/multicubic/rectilinear.rs b/interpn/src/multicubic/rectilinear.rs
index 81df950..63a58e4 100644
--- a/interpn/src/multicubic/rectilinear.rs
+++ b/interpn/src/multicubic/rectilinear.rs
@@ -1,47 +1,5 @@
 //! An arbitrary-dimensional multicubic interpolator / extrapolator on a rectilinear grid.
 //!
-//! On interior points, a hermite spline is used, with the derivative at each
-//! grid point matched to a second-order central difference. This allows the
-//! interpolant to reproduce a quadratic function exactly, and to approximate
-//! others with minimal overshoot and wobble.
-//!
-//! At the grid boundary, a natural spline boundary condition is applied,
-//! meaning the third derivative of the interpolant is constrainted to zero
-//! at the last grid point, with the result that the interpolant is quadratic
-//! on the last interval before the boundary.
-//!
-//! With "linearize_extrapolation" set, extrapolation is linear on the extrapolated
-//! dimensions, holding the same derivative as the natural boundary condition produces
-//! at the last grid point. Otherwise, the last grid cell's spline function is continued,
-//! producing a quadratic extrapolation.
-//!
-//! This effectively gives a gradual decrease in the order of the interpolant
-//! as the observation point approaches then leaves the grid:
-//!
-//! out                     out
-//! ---|---|---|---|---|---|--- Grid
-//!  2   2   3   3   3   2   2  Order of interpolant between grid points
-//!  1                       1  Extrapolation with linearize_extrapolation
-//!
-//! Assumes C-style ordering of vals (z(x0, y0), z(x0, y1), ..., z(x0, yn), z(x1, y0), ...).
-//!
-//! Operation Complexity
-//! * O(4^ndims) for interpolation and extrapolation in all regions.
-//!
-//! Memory Complexity
-//! * Peak stack usage is O(MAXDIMS), which is minimally O(ndims).
-//! * While evaluation is recursive, the recursion has constant
-//!   max depth of MAXDIMS, which provides a guarantee on peak
-//!   memory usage.
-//!
-//! Timing
-//! * Timing determinism very tight, but is not exact due to the
-//!   differences in calculations (but not complexity) between
-//!   interpolation and extrapolation.
-//! * An interpolation-only variant of this algorithm could achieve
-//!   near-deterministic timing, but would produce incorrect results
-//!   when evaluated at off-grid points.
-//!
 //! ```rust
 //! use interpn::multicubic::rectilinear;
 //!
@@ -73,6 +31,48 @@
 //!   https://pure.rug.nl/ws/portalfiles/portal/3332271/1992JEngMathVeldman.pdf
 use num_traits::Float;
 
+/// Evaluate multicubic interpolation on a regular grid in up to 8 dimensions.
+/// Assumes C-style ordering of vals (z(x0, y0), z(x0, y1), ..., z(x0, yn), z(x1, y0), ...).
+///
+/// This is a convenience function; best performance will be achieved by using the exact right
+/// number for the MAXDIMS parameter, as this will slightly reduce compute and storage overhead,
+/// and the underlying method can be extended to more than this function's limit of 8 dimensions.
+/// The limit of 8 dimensions was chosen for no more specific reason than to reduce unit test times.
+///
+/// While this method initializes the interpolator struct on every call, the overhead of doing this
+/// is minimal even when using it to evaluate one observation point at a time.
+#[inline(always)]
+pub fn interpn<T: Float>(
+    grids: &[&[T]],
+    vals: &[T],
+    linearize_extrapolation: bool,
+    obs: &[&[T]],
+    out: &mut [T],
+) -> Result<(), &'static str> {
+    MulticubicRectilinear::<'_, T, 8>::new(grids, vals, linearize_extrapolation)?
+        .interp(obs, out)?;
+    Ok(())
+}
+
+/// Evaluate interpolant, allocating a new Vec for the output.
+///
+/// For best results, use the `interpn` function with preallocated output;
+/// allocation has a significant performance cost, and should be used sparingly.
+#[cfg(feature = "std")]
+pub fn interpn_alloc<T: Float>(
+    grids: &[&[T]],
+    vals: &[T],
+    linearize_extrapolation: bool,
+    obs: &[&[T]],
+) -> Result<Vec<T>, &'static str> {
+    let mut out = vec![T::zero(); obs[0].len()];
+    interpn(grids, vals, linearize_extrapolation, obs, &mut out)?;
+    Ok(out)
+}
+
+// We can use the same rectilinear-grid method again
+pub use crate::multilinear::rectilinear::check_bounds;
+
 #[derive(Clone, Copy, PartialEq)]
 enum Saturation {
     None,
@@ -552,32 +552,6 @@ fn index_arr<T: Copy>(loc: &[usize], dimprod: &[usize], data: &[T]) -> T {
     data[i]
 }
 
-/// Evaluate multicubic interpolation on a regular grid in up to 8 dimensions.
-/// Assumes C-style ordering of vals (z(x0, y0), z(x0, y1), ..., z(x0, yn), z(x1, y0), ...).
-///
-/// This is a convenience function; best performance will be achieved by using the exact right
-/// number for the MAXDIMS parameter, as this will slightly reduce compute and storage overhead,
-/// and the underlying method can be extended to more than this function's limit of 8 dimensions.
-/// The limit of 8 dimensions was chosen for no more specific reason than to reduce unit test times.
-///
-/// While this method initializes the interpolator struct on every call, the overhead of doing this
-/// is minimal even when using it to evaluate one observation point at a time.
-#[inline(always)]
-pub fn interpn<T: Float>(
-    grids: &[&[T]],
-    vals: &[T],
-    linearize_extrapolation: bool,
-    obs: &[&[T]],
-    out: &mut [T],
-) -> Result<(), &'static str> {
-    MulticubicRectilinear::<'_, T, 8>::new(grids, vals, linearize_extrapolation)?
-        .interp(obs, out)?;
-    Ok(())
-}
-
-// We can use the same rectilinear-grid method again
-// pub use crate::multilinear::rectilinear::check_bounds;
-
 #[cfg(test)]
 mod test {
     use super::interpn;
diff --git a/interpn/src/multicubic/regular.rs b/interpn/src/multicubic/regular.rs
index d6af543..813d98d 100644
--- a/interpn/src/multicubic/regular.rs
+++ b/interpn/src/multicubic/regular.rs
@@ -1,47 +1,5 @@
 //! An arbitrary-dimensional multicubic interpolator / extrapolator on a regular grid.
 //!
-//! On interior points, a hermite spline is used, with the derivative at each
-//! grid point matched to a second-order central difference. This allows the
-//! interpolant to reproduce a quadratic function exactly, and to approximate
-//! others with minimal overshoot and wobble.
-//!
-//! At the grid boundary, a natural spline boundary condition is applied,
-//! meaning the third derivative of the interpolant is constrainted to zero
-//! at the last grid point, with the result that the interpolant is quadratic
-//! on the last interval before the boundary.
-//!
-//! With "linearize_extrapolation" set, extrapolation is linear on the extrapolated
-//! dimensions, holding the same derivative as the natural boundary condition produces
-//! at the last grid point. Otherwise, the last grid cell's spline function is continued,
-//! producing a quadratic extrapolation.
-//!
-//! This effectively gives a gradual decrease in the order of the interpolant
-//! as the observation point approaches then leaves the grid:
-//!
-//! out                     out
-//! ---|---|---|---|---|---|--- Grid
-//!  2   2   3   3   3   2   2  Order of interpolant between grid points
-//!  1                       1  Extrapolation with linearize_extrapolation
-//!
-//! Assumes C-style ordering of vals (z(x0, y0), z(x0, y1), ..., z(x0, yn), z(x1, y0), ...).
-//!
-//! Operation Complexity
-//! * O(4^ndims) for interpolation and extrapolation in all regions.
-//!
-//! Memory Complexity
-//! * Peak stack usage is O(MAXDIMS), which is minimally O(ndims).
-//! * While evaluation is recursive, the recursion has constant
-//!   max depth of MAXDIMS, which provides a guarantee on peak
-//!   memory usage.
-//!
-//! Timing
-//! * Timing determinism very tight, but is not exact due to the
-//!   differences in calculations (but not complexity) between
-//!   interpolation and extrapolation.
-//! * An interpolation-only variant of this algorithm could achieve
-//!   near-deterministic timing, but would produce incorrect results
-//!   when evaluated at off-grid points.
-//!
 //! ```rust
 //! use interpn::multicubic::regular::interpn;
 //!
@@ -74,6 +32,52 @@
 //! ```
 use num_traits::{Float, NumCast};
 
+/// Evaluate multicubic interpolation on a regular grid in up to 8 dimensions.
+/// Assumes C-style ordering of vals (z(x0, y0), z(x0, y1), ..., z(x0, yn), z(x1, y0), ...).
+///
+/// This is a convenience function; best performance will be achieved by using the exact right
+/// number for the MAXDIMS parameter, as this will slightly reduce compute and storage overhead,
+/// and the underlying method can be extended to more than this function's limit of 8 dimensions.
+/// The limit of 8 dimensions was chosen for no more specific reason than to reduce unit test times.
+///
+/// While this method initializes the interpolator struct on every call, the overhead of doing this
+/// is minimal even when using it to evaluate one observation point at a time.
+#[inline(always)]
+pub fn interpn<T: Float>(
+    dims: &[usize],
+    starts: &[T],
+    steps: &[T],
+    vals: &[T],
+    linearize_extrapolation: bool,
+    obs: &[&[T]],
+    out: &mut [T],
+) -> Result<(), &'static str> {
+    MulticubicRegular::<'_, T, 8>::new(dims, starts, steps, vals, linearize_extrapolation)?
+        .interp(obs, out)?;
+    Ok(())
+}
+
+/// Evaluate interpolant, allocating a new Vec for the output.
+/// 
+/// For best results, use the `interpn` function with preallocated output;
+/// allocation has a significant performance cost, and should be used sparingly.
+#[cfg(feature = "std")]
+pub fn interpn_alloc<T: Float>(
+    dims: &[usize],
+    starts: &[T],
+    steps: &[T],
+    vals: &[T],
+    linearize_extrapolation: bool,
+    obs: &[&[T]],
+) -> Result<Vec<T>, &'static str> {
+    let mut out = vec![T::zero(); obs[0].len()];
+    interpn(dims, starts, steps, vals, linearize_extrapolation, obs, &mut out)?;
+    Ok(out)
+}
+
+// We can use the same regular-grid method again
+pub use crate::multilinear::regular::check_bounds;
+
 #[derive(Clone, Copy, PartialEq)]
 enum Saturation {
     None,
@@ -546,33 +550,6 @@ fn index_arr<T: Copy>(loc: &[usize], dimprod: &[usize], data: &[T]) -> T {
     data[i]
 }
 
-/// Evaluate multicubic interpolation on a regular grid in up to 8 dimensions.
-/// Assumes C-style ordering of vals (z(x0, y0), z(x0, y1), ..., z(x0, yn), z(x1, y0), ...).
-///
-/// This is a convenience function; best performance will be achieved by using the exact right
-/// number for the MAXDIMS parameter, as this will slightly reduce compute and storage overhead,
-/// and the underlying method can be extended to more than this function's limit of 8 dimensions.
-/// The limit of 8 dimensions was chosen for no more specific reason than to reduce unit test times.
-///
-/// While this method initializes the interpolator struct on every call, the overhead of doing this
-/// is minimal even when using it to evaluate one observation point at a time.
-#[inline(always)]
-pub fn interpn<T: Float>(
-    dims: &[usize],
-    starts: &[T],
-    steps: &[T],
-    vals: &[T],
-    linearize_extrapolation: bool,
-    obs: &[&[T]],
-    out: &mut [T],
-) -> Result<(), &'static str> {
-    MulticubicRegular::<'_, T, 8>::new(dims, starts, steps, vals, linearize_extrapolation)?
-        .interp(obs, out)?;
-    Ok(())
-}
-
-// We can use the same regular-grid method again
-pub use crate::multilinear::regular::check_bounds;
 
 #[cfg(test)]
 mod test {
diff --git a/interpn/src/multilinear/rectilinear.rs b/interpn/src/multilinear/rectilinear.rs
index ed35ba6..df73980 100644
--- a/interpn/src/multilinear/rectilinear.rs
+++ b/interpn/src/multilinear/rectilinear.rs
@@ -1,18 +1,5 @@
 //! Multilinear interpolation/extrapolation on a rectilinear grid.
-//!
-//! While this method does not fully capitalize on vectorization,
-//! it results in fairly minimal instantaneous memory usage,
-//! and throughput performance is similar to existing methods.
-//!
-//! Operation Complexity
-//! * O(2^ndims) for interpolation and extrapolation in all regions.
-//!
-//! Memory Complexity
-//! * Peak stack usage is O(MAXDIMS), which is minimally O(ndims).
-//!
-//! Timing
-//! * Timing determinism is very tight, but not guaranteed due to the use of a bisection search.
-//!
+//! 
 //! ```rust
 //! use interpn::multilinear::rectilinear;
 //!
@@ -42,6 +29,80 @@
 //! * https://en.wikipedia.org/wiki/Bilinear_interpolation#Weighted_mean
 use num_traits::Float;
 
+
+/// Evaluate multicubic interpolation on a regular grid in up to 8 dimensions.
+/// Assumes C-style ordering of vals (z(x0, y0), z(x0, y1), ..., z(x0, yn), z(x1, y0), ...).
+///
+/// This is a convenience function; best performance will be achieved by using the exact right
+/// number for the MAXDIMS parameter, as this will slightly reduce compute and storage overhead,
+/// and the underlying method can be extended to more than this function's limit of 8 dimensions.
+/// The limit of 8 dimensions was chosen for no more specific reason than to reduce unit test times.
+///
+/// While this method initializes the interpolator struct on every call, the overhead of doing this
+/// is minimal even when using it to evaluate one observation point at a time.
+#[inline(always)]
+pub fn interpn<T: Float>(
+    grids: &[&[T]],
+    vals: &[T],
+    obs: &[&[T]],
+    out: &mut [T],
+) -> Result<(), &'static str> {
+    MultilinearRectilinear::<'_, T, 8>::new(grids, vals)?.interp(obs, out)?;
+    Ok(())
+}
+
+/// Evaluate interpolant, allocating a new Vec for the output.
+/// 
+/// For best results, use the `interpn` function with preallocated output;
+/// allocation has a significant performance cost, and should be used sparingly.
+#[cfg(feature = "std")]
+pub fn interpn_alloc<T: Float>(
+    grids: &[&[T]],
+    vals: &[T],
+    obs: &[&[T]]
+) -> Result<Vec<T>, &'static str> {
+    let mut out = vec![T::zero(); obs[0].len()];
+    interpn(grids, vals, obs, &mut out)?;
+    Ok(out)
+}
+
+/// Check whether a list of observation points are inside the grid within some absolute tolerance.
+/// Assumes the grid is valid for the rectilinear interpolator (monotonically increasing).
+///
+/// Output slice entry `i` is set to `false` if no points on that dimension are out of bounds,
+/// and set to `true` if there is a bounds violation on that axis.
+///
+/// # Errors
+/// * If the dimensionality of the grid does not match the dimensionality of the observation points
+/// * If the output slice length does not match the dimensionality of the grid
+#[inline(always)]
+pub fn check_bounds<T: Float>(
+    grids: &[&[T]],
+    obs: &[&[T]],
+    atol: T,
+    out: &mut [bool],
+) -> Result<(), &'static str> {
+    let ndims = grids.len();
+    if !(obs.len() == ndims && out.len() == ndims && (0..ndims).all(|i| !grids[i].is_empty())) {
+        return Err("Dimension mismatch");
+    }
+    for i in 0..ndims {
+        let lo = grids[i][0];
+        let hi = grids[i].last();
+        match hi {
+            Some(&hi) => {
+                let bad = obs[i]
+                    .iter()
+                    .any(|&x| (x - lo) <= -atol || (x - hi) >= atol);
+
+                out[i] = bad;
+            }
+            None => return Err("Dimension mismatch"),
+        }
+    }
+    Ok(())
+}
+
 /// An arbitrary-dimensional multilinear interpolator / extrapolator on a rectilinear grid.
 ///
 /// Assumes C-style ordering of vals (z(x0, y0), z(x0, y1), ..., z(x0, yn), z(x1, y0), ...).
@@ -58,12 +119,7 @@ use num_traits::Float;
 ///   memory usage.
 ///
 /// Timing
-/// * Timing determinism very tight, but is not exact due to the
-///   differences in calculations (but not complexity) between
-///   interpolation and extrapolation.
-/// * An interpolation-only variant of this algorithm could achieve
-///   near-deterministic timing, but would produce incorrect results
-///   when evaluated at off-grid points.
+/// * Timing determinism is very tight, but not guaranteed due to the use of a bisection search.
 pub struct MultilinearRectilinear<'a, T: Float, const MAXDIMS: usize> {
     /// x, y, ... coordinate grids, each entry of size dims[i]
     grids: &'a [&'a [T]],
@@ -275,63 +331,6 @@ fn index_arr<T: Copy>(loc: &[usize], dimprod: &[usize], data: &[T]) -> T {
     data[i]
 }
 
-/// Evaluate multicubic interpolation on a regular grid in up to 8 dimensions.
-/// Assumes C-style ordering of vals (z(x0, y0), z(x0, y1), ..., z(x0, yn), z(x1, y0), ...).
-///
-/// This is a convenience function; best performance will be achieved by using the exact right
-/// number for the MAXDIMS parameter, as this will slightly reduce compute and storage overhead,
-/// and the underlying method can be extended to more than this function's limit of 8 dimensions.
-/// The limit of 8 dimensions was chosen for no more specific reason than to reduce unit test times.
-///
-/// While this method initializes the interpolator struct on every call, the overhead of doing this
-/// is minimal even when using it to evaluate one observation point at a time.
-#[inline(always)]
-pub fn interpn<T: Float>(
-    grids: &[&[T]],
-    vals: &[T],
-    obs: &[&[T]],
-    out: &mut [T],
-) -> Result<(), &'static str> {
-    MultilinearRectilinear::<'_, T, 8>::new(grids, vals)?.interp(obs, out)?;
-    Ok(())
-}
-
-/// Check whether a list of observation points are inside the grid within some absolute tolerance.
-/// Assumes the grid is valid for the rectilinear interpolator (monotonically increasing).
-///
-/// Output slice entry `i` is set to `false` if no points on that dimension are out of bounds,
-/// and set to `true` if there is a bounds violation on that axis.
-///
-/// # Errors
-/// * If the dimensionality of the grid does not match the dimensionality of the observation points
-/// * If the output slice length does not match the dimensionality of the grid
-#[inline(always)]
-pub fn check_bounds<T: Float>(
-    grids: &[&[T]],
-    obs: &[&[T]],
-    atol: T,
-    out: &mut [bool],
-) -> Result<(), &'static str> {
-    let ndims = grids.len();
-    if !(obs.len() == ndims && out.len() == ndims && (0..ndims).all(|i| !grids[i].is_empty())) {
-        return Err("Dimension mismatch");
-    }
-    for i in 0..ndims {
-        let lo = grids[i][0];
-        let hi = grids[i].last();
-        match hi {
-            Some(&hi) => {
-                let bad = obs[i]
-                    .iter()
-                    .any(|&x| (x - lo) <= -atol || (x - hi) >= atol);
-
-                out[i] = bad;
-            }
-            None => return Err("Dimension mismatch"),
-        }
-    }
-    Ok(())
-}
 
 #[cfg(test)]
 mod test {
diff --git a/interpn/src/multilinear/regular.rs b/interpn/src/multilinear/regular.rs
index 2f98f7e..93fcc5d 100644
--- a/interpn/src/multilinear/regular.rs
+++ b/interpn/src/multilinear/regular.rs
@@ -1,24 +1,5 @@
 //! Multilinear interpolation/extrapolation on a regular grid.
 //!
-//! This is a fairly literal implementation of the geometric interpretation
-//! of multilinear interpolation as an area- or volume- weighted average
-//! on the interior of the grid, and extends this metaphor to include
-//! extrapolation to off-grid points.
-//!
-//! While this method does not fully capitalize on vectorization,
-//! it results in fairly minimal instantaneous memory usage,
-//! and throughput performance is similar to existing methods.
-//!
-//! Operation Complexity
-//! * O(2^ndims) for interpolation and extrapolation in all regions.
-//!
-//! Memory Complexity
-//! * Peak stack usage is O(MAXDIMS), which is minimally O(ndims).
-//!
-//! Timing
-//! * Timing determinism is guaranteed to the extent that floating-point calculation timing is consistent.
-//!   That said, floating-point calculations can take a different number of clock-cycles depending on numerical values.
-//!
 //! ```rust
 //! use interpn::multilinear::regular;
 //!
@@ -53,11 +34,99 @@
 //! * https://en.wikipedia.org/wiki/Bilinear_interpolation#Repeated_linear_interpolation
 use num_traits::{Float, NumCast};
 
+/// Evaluate multilinear interpolation on a regular grid in up to 8 dimensions.
+/// Assumes C-style ordering of vals (z(x0, y0), z(x0, y1), ..., z(x0, yn), z(x1, y0), ...).
+///
+/// This is a convenience function; best performance will be achieved by using the exact right
+/// number for the MAXDIMS parameter, as this will slightly reduce compute and storage overhead,
+/// and the underlying method can be extended to more than this function's limit of 8 dimensions.
+/// The limit of 8 dimensions was chosen for no more specific reason than to reduce unit test times.
+///
+/// While this method initializes the interpolator struct on every call, the overhead of doing this
+/// is minimal even when using it to evaluate one observation point at a time.
+#[inline(always)]
+pub fn interpn<T: Float>(
+    dims: &[usize],
+    starts: &[T],
+    steps: &[T],
+    vals: &[T],
+    obs: &[&[T]],
+    out: &mut [T],
+) -> Result<(), &'static str> {
+    MultilinearRegular::<'_, T, 8>::new(dims, starts, steps, vals)?.interp(obs, out)?;
+    Ok(())
+}
+
+/// Evaluate interpolant, allocating a new Vec for the output.
+///
+/// For best results, use the `interpn` function with preallocated output;
+/// allocation has a significant performance cost, and should be used sparingly.
+#[cfg(feature = "std")]
+pub fn interpn_alloc<T: Float>(
+    dims: &[usize],
+    starts: &[T],
+    steps: &[T],
+    vals: &[T],
+    obs: &[&[T]],
+) -> Result<Vec<T>, &'static str> {
+    let mut out = vec![T::zero(); obs[0].len()];
+    interpn(dims, starts, steps, vals, obs, &mut out)?;
+    Ok(out)
+}
+
+/// Check whether a list of observation points are inside the grid within some absolute tolerance.
+/// Assumes the grid is valid for the rectilinear interpolator (monotonically increasing).
+///
+/// Output slice entry `i` is set to `false` if no points on that dimension are out of bounds,
+/// and set to `true` if there is a bounds violation on that axis.
+///
+/// # Errors
+/// * If the dimensionality of the grid does not match the dimensionality of the observation points
+/// * If the output slice length does not match the dimensionality of the grid
+#[inline(always)]
+pub fn check_bounds<T: Float>(
+    dims: &[usize],
+    starts: &[T],
+    steps: &[T],
+    obs: &[&[T]],
+    atol: T,
+    out: &mut [bool],
+) -> Result<(), &'static str> {
+    let ndims = dims.len();
+    if !(obs.len() == ndims && out.len() == ndims) {
+        return Err("Dimension mismatch");
+    }
+
+    for i in 0..ndims {
+        let first = starts[i];
+        let last_elem = <T as NumCast>::from(dims[i] - 1); // Last element in grid
+
+        match last_elem {
+            Some(last_elem) => {
+                let last = starts[i] + steps[i] * last_elem;
+                let lo = first.min(last);
+                let hi = first.max(last);
+
+                let bad = obs[i]
+                    .iter()
+                    .any(|&x| (x - lo) <= -atol || (x - hi) >= atol);
+                out[i] = bad;
+            }
+            // Passing an unrepresentable number in isn't, strictly speaking, an error
+            // and since an unrepresentable number can't be on the grid,
+            // we can just flag it for the bounds check like normal
+            None => {
+                out[i] = true;
+            }
+        }
+    }
+    Ok(())
+}
+
 /// An arbitrary-dimensional multilinear interpolator / extrapolator on a regular grid.
 ///
 /// Assumes C-style ordering of vals (z(x0, y0), z(x0, y1), ..., z(x0, yn), z(x1, y0), ...).
 ///
-///
 /// Operation Complexity
 /// * O(2^ndims) for interpolation and extrapolation in all regions.
 ///
@@ -68,12 +137,8 @@ use num_traits::{Float, NumCast};
 ///   memory usage.
 ///
 /// Timing
-/// * Timing determinism very tight, but is not exact due to the
-///   differences in calculations (but not complexity) between
-///   interpolation and extrapolation.
-/// * An interpolation-only variant of this algorithm could achieve
-///   near-deterministic timing, but would produce incorrect results
-///   when evaluated at off-grid points.
+/// * Timing determinism is guaranteed to the extent that floating-point calculation timing is consistent.
+///   That said, floating-point calculations can take a different number of clock-cycles depending on numerical values.
 pub struct MultilinearRegular<'a, T: Float, const MAXDIMS: usize> {
     /// Number of dimensions
     ndims: usize,
@@ -107,7 +172,7 @@ impl<'a, T: Float, const MAXDIMS: usize> MultilinearRegular<'a, T, MAXDIMS> {
         dims: &[usize],
         starts: &[T],
         steps: &[T],
-        vals: &'a [T]
+        vals: &'a [T],
     ) -> Result<Self, &'static str> {
         // Check dimensions
         let ndims = dims.len();
@@ -115,7 +180,6 @@ impl<'a, T: Float, const MAXDIMS: usize> MultilinearRegular<'a, T, MAXDIMS> {
         if !(starts.len() == ndims && steps.len() == ndims && vals.len() == nvals && ndims > 0) {
             return Err("Dimension mismatch");
         }
-
         // Make sure all dimensions have at least four entries
         let degenerate = dims[..ndims].iter().any(|&x| x < 2);
         if degenerate {
@@ -227,13 +291,11 @@ impl<'a, T: Float, const MAXDIMS: usize> MultilinearRegular<'a, T, MAXDIMS> {
         for i in 0..ndims {
             let index_zero_loc = self.starts[i]
                 + self.steps[i]
-                    * <T as NumCast>::from(origin[i])
-                        .ok_or("Unrepresentable coordinate value")?;
+                    * <T as NumCast>::from(origin[i]).ok_or("Unrepresentable coordinate value")?;
             dts[i] = (x[i] - index_zero_loc) / self.steps[i];
         }
 
         // Recursive interpolation of one dependency tree at a time
-        // let loc = &origin; // Starting location in the tree is the origin
         let dim = ndims; // Start from the end and recurse back to zero
         let loc = &mut [0_usize; MAXDIMS][..ndims];
         loc.copy_from_slice(origin);
@@ -303,7 +365,6 @@ impl<'a, T: Float, const MAXDIMS: usize> MultilinearRegular<'a, T, MAXDIMS> {
     }
 }
 
-
 /// Index a single value from an array
 #[inline(always)]
 fn index_arr<T: Copy>(loc: &[usize], dimprod: &[usize], data: &[T]) -> T {
@@ -315,80 +376,6 @@ fn index_arr<T: Copy>(loc: &[usize], dimprod: &[usize], data: &[T]) -> T {
     data[i]
 }
 
-
-/// Evaluate multilinear interpolation on a regular grid in up to 8 dimensions.
-/// Assumes C-style ordering of vals (z(x0, y0), z(x0, y1), ..., z(x0, yn), z(x1, y0), ...).
-///
-/// This is a convenience function; best performance will be achieved by using the exact right
-/// number for the MAXDIMS parameter, as this will slightly reduce compute and storage overhead,
-/// and the underlying method can be extended to more than this function's limit of 8 dimensions.
-/// The limit of 8 dimensions was chosen for no more specific reason than to reduce unit test times.
-///
-/// While this method initializes the interpolator struct on every call, the overhead of doing this
-/// is minimal even when using it to evaluate one observation point at a time.
-#[inline(always)]
-pub fn interpn<T: Float>(
-    dims: &[usize],
-    starts: &[T],
-    steps: &[T],
-    vals: &[T],
-    obs: &[&[T]],
-    out: &mut [T],
-) -> Result<(), &'static str> {
-    MultilinearRegular::<'_, T, 8>::new(dims, starts, steps, vals)?.interp(obs, out)?;
-    Ok(())
-}
-
-
-/// Check whether a list of observation points are inside the grid within some absolute tolerance.
-/// Assumes the grid is valid for the rectilinear interpolator (monotonically increasing).
-///
-/// Output slice entry `i` is set to `false` if no points on that dimension are out of bounds,
-/// and set to `true` if there is a bounds violation on that axis.
-///
-/// # Errors
-/// * If the dimensionality of the grid does not match the dimensionality of the observation points
-/// * If the output slice length does not match the dimensionality of the grid
-#[inline(always)]
-pub fn check_bounds<T: Float>(
-    dims: &[usize],
-    starts: &[T],
-    steps: &[T],
-    obs: &[&[T]],
-    atol: T,
-    out: &mut [bool],
-) -> Result<(), &'static str> {
-    let ndims = dims.len();
-    if !(obs.len() == ndims && out.len() == ndims) {
-        return Err("Dimension mismatch");
-    }
-
-    for i in 0..ndims {
-        let first = starts[i];
-        let last_elem = <T as NumCast>::from(dims[i] - 1); // Last element in grid
-
-        match last_elem {
-            Some(last_elem) => {
-                let last = starts[i] + steps[i] * last_elem;
-                let lo = first.min(last);
-                let hi = first.max(last);
-
-                let bad = obs[i]
-                    .iter()
-                    .any(|&x| (x - lo) <= -atol || (x - hi) >= atol);
-                out[i] = bad;
-            }
-            // Passing an unrepresentable number in isn't, strictly speaking, an error
-            // and since an unrepresentable number can't be on the grid,
-            // we can just flag it for the bounds check like normal
-            None => {
-                out[i] = true;
-            }
-        }
-    }
-    Ok(())
-}
-
 #[cfg(test)]
 mod test {
     use super::interpn;
@@ -439,4 +426,4 @@ mod test {
             });
         }
     }
-}
\ No newline at end of file
+}

From ead1c083016babcfde0e7d371e43f4e56436e7e1 Mon Sep 17 00:00:00 2001
From: James Logan <jlogan03@gmail.com>
Date: Sun, 4 Aug 2024 11:29:33 -0400
Subject: [PATCH 04/18] format

---
 interpn/src/lib.rs                     |  2 +-
 interpn/src/multicubic/regular.rs      | 13 ++++++++++---
 interpn/src/multilinear/rectilinear.rs |  8 +++-----
 3 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/interpn/src/lib.rs b/interpn/src/lib.rs
index f5a8647..5c64270 100644
--- a/interpn/src/lib.rs
+++ b/interpn/src/lib.rs
@@ -9,7 +9,7 @@
 //! Cubic interpolations require two more degrees of freedom per dimension, and have a minimal runtime scaling of 4^ndims.
 //! Similar to the linear methods, depending on implementation, the constant term can vary by orders of magnitude,
 //! as can the RAM usage.
-//! 
+//!
 //! Rectilinear methods perform a bisection search to find the relevant grid cell, which takes
 //! a worst-case number of iterations of log2(number of grid elements).
 //!
diff --git a/interpn/src/multicubic/regular.rs b/interpn/src/multicubic/regular.rs
index 813d98d..6224d9a 100644
--- a/interpn/src/multicubic/regular.rs
+++ b/interpn/src/multicubic/regular.rs
@@ -58,7 +58,7 @@ pub fn interpn<T: Float>(
 }
 
 /// Evaluate interpolant, allocating a new Vec for the output.
-/// 
+///
 /// For best results, use the `interpn` function with preallocated output;
 /// allocation has a significant performance cost, and should be used sparingly.
 #[cfg(feature = "std")]
@@ -71,7 +71,15 @@ pub fn interpn_alloc<T: Float>(
     obs: &[&[T]],
 ) -> Result<Vec<T>, &'static str> {
     let mut out = vec![T::zero(); obs[0].len()];
-    interpn(dims, starts, steps, vals, linearize_extrapolation, obs, &mut out)?;
+    interpn(
+        dims,
+        starts,
+        steps,
+        vals,
+        linearize_extrapolation,
+        obs,
+        &mut out,
+    )?;
     Ok(out)
 }
 
@@ -550,7 +558,6 @@ fn index_arr<T: Copy>(loc: &[usize], dimprod: &[usize], data: &[T]) -> T {
     data[i]
 }
 
-
 #[cfg(test)]
 mod test {
     use super::interpn;
diff --git a/interpn/src/multilinear/rectilinear.rs b/interpn/src/multilinear/rectilinear.rs
index df73980..3036e8e 100644
--- a/interpn/src/multilinear/rectilinear.rs
+++ b/interpn/src/multilinear/rectilinear.rs
@@ -1,5 +1,5 @@
 //! Multilinear interpolation/extrapolation on a rectilinear grid.
-//! 
+//!
 //! ```rust
 //! use interpn::multilinear::rectilinear;
 //!
@@ -29,7 +29,6 @@
 //! * https://en.wikipedia.org/wiki/Bilinear_interpolation#Weighted_mean
 use num_traits::Float;
 
-
 /// Evaluate multicubic interpolation on a regular grid in up to 8 dimensions.
 /// Assumes C-style ordering of vals (z(x0, y0), z(x0, y1), ..., z(x0, yn), z(x1, y0), ...).
 ///
@@ -52,14 +51,14 @@ pub fn interpn<T: Float>(
 }
 
 /// Evaluate interpolant, allocating a new Vec for the output.
-/// 
+///
 /// For best results, use the `interpn` function with preallocated output;
 /// allocation has a significant performance cost, and should be used sparingly.
 #[cfg(feature = "std")]
 pub fn interpn_alloc<T: Float>(
     grids: &[&[T]],
     vals: &[T],
-    obs: &[&[T]]
+    obs: &[&[T]],
 ) -> Result<Vec<T>, &'static str> {
     let mut out = vec![T::zero(); obs[0].len()];
     interpn(grids, vals, obs, &mut out)?;
@@ -331,7 +330,6 @@ fn index_arr<T: Copy>(loc: &[usize], dimprod: &[usize], data: &[T]) -> T {
     data[i]
 }
 
-
 #[cfg(test)]
 mod test {
     use super::{interpn, MultilinearRectilinear};

From c9743a85a79125dc5a7ca373f7e77dfe0a21b153 Mon Sep 17 00:00:00 2001
From: James Logan <jlogan03@gmail.com>
Date: Sun, 4 Aug 2024 11:32:07 -0400
Subject: [PATCH 05/18] use interpn_alloc functions in doctests to get coverage

---
 interpn/src/multicubic/rectilinear.rs  | 4 ++--
 interpn/src/multicubic/regular.rs      | 6 +++---
 interpn/src/multilinear/rectilinear.rs | 4 ++--
 interpn/src/multilinear/regular.rs     | 7 ++-----
 4 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/interpn/src/multicubic/rectilinear.rs b/interpn/src/multicubic/rectilinear.rs
index 63a58e4..d083132 100644
--- a/interpn/src/multicubic/rectilinear.rs
+++ b/interpn/src/multicubic/rectilinear.rs
@@ -21,9 +21,9 @@
 //! // Storage for output
 //! let mut out = [0.0; 2];
 //!
-//! // Do interpolation
+//! // Do interpolation, allocating for the output for convenience
 //! let linearize_extrapolation = false;
-//! rectilinear::interpn(grids, &z, linearize_extrapolation, &obs, &mut out).unwrap();
+//! rectilinear::interpn_alloc(grids, &z, linearize_extrapolation, &obs).unwrap();
 //! ```
 //!
 //! References
diff --git a/interpn/src/multicubic/regular.rs b/interpn/src/multicubic/regular.rs
index 6224d9a..87978af 100644
--- a/interpn/src/multicubic/regular.rs
+++ b/interpn/src/multicubic/regular.rs
@@ -1,7 +1,7 @@
 //! An arbitrary-dimensional multicubic interpolator / extrapolator on a regular grid.
 //!
 //! ```rust
-//! use interpn::multicubic::regular::interpn;
+//! use interpn::multicubic::regular;
 //!
 //! // Define a grid
 //! let x = [1.0_f64, 2.0, 3.0, 4.0];
@@ -26,9 +26,9 @@
 //! // Storage for output
 //! let mut out = [0.0; 2];
 //!
-//! // Do interpolation
+//! // Do interpolation, allocating for the output for convenience
 //! let linearize_extrapolation = false;
-//! interpn(&dims, &starts, &steps, &z, linearize_extrapolation, &obs, &mut out).unwrap();
+//! regular::interpn_alloc(&dims, &starts, &steps, &z, linearize_extrapolation, &obs).unwrap();
 //! ```
 use num_traits::{Float, NumCast};
 
diff --git a/interpn/src/multilinear/rectilinear.rs b/interpn/src/multilinear/rectilinear.rs
index 3036e8e..a02bc38 100644
--- a/interpn/src/multilinear/rectilinear.rs
+++ b/interpn/src/multilinear/rectilinear.rs
@@ -21,8 +21,8 @@
 //! // Storage for output
 //! let mut out = [0.0; 2];
 //!
-//! // Do interpolation
-//! rectilinear::interpn(grids, &z, &obs, &mut out).unwrap();
+//! // Do interpolation, allocating for the output for convenience
+//! rectilinear::interpn_alloc(grids, &z, &obs).unwrap();
 //! ```
 //!
 //! References
diff --git a/interpn/src/multilinear/regular.rs b/interpn/src/multilinear/regular.rs
index 93fcc5d..65a376a 100644
--- a/interpn/src/multilinear/regular.rs
+++ b/interpn/src/multilinear/regular.rs
@@ -23,11 +23,8 @@
 //! let yobs = [-1.0, 3.0];
 //! let obs = [&xobs[..], &yobs[..]];
 //!
-//! // Storage for output
-//! let mut out = [0.0; 2];
-//!
-//! // Do interpolation
-//! regular::interpn(&dims, &starts, &steps, &z, &obs, &mut out).unwrap();
+//! // Do interpolation, allocating for the output for convenience
+//! regular::interpn_alloc(&dims, &starts, &steps, &z, &obs).unwrap();
 //! ```
 //!
 //! References

From 6a9e2d62303bcf9d2b102bfe297e7c98cb251143 Mon Sep 17 00:00:00 2001
From: James Logan <jlogan03@gmail.com>
Date: Sun, 4 Aug 2024 11:35:49 -0400
Subject: [PATCH 06/18] update changelog

---
 interpn/CHANGELOG.md | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/interpn/CHANGELOG.md b/interpn/CHANGELOG.md
index dfe69b3..c773263 100644
--- a/interpn/CHANGELOG.md
+++ b/interpn/CHANGELOG.md
@@ -2,11 +2,16 @@
 
 ## 0.4.3 - 2024-08-03
 
+## Added
+
+* Implement `interpn_alloc` function for each method, which allocates a Vec for the output
+
 ## Changed
 
 * Use recursive method to evaluate multilinear interpolation instead of hypercube method
-  * This makes extrapolation cost consistent with interpolation cost
+  * This makes extrapolation cost consistent with interpolation cost, and reduces nominal perf scaling
   * Shows about 2x slower perf in micro-benchmarks, but about 10x faster in end-to-end benchmarks after the Python bindings
+    * Need to improve benchmarking strategy to better capture perf in real-life usage
 * Reduce repeated documentation
 
 ## 0.4.2 - 2024-05-12

From f19bb411ac6e4517948553815440418850855b8f Mon Sep 17 00:00:00 2001
From: James Logan <jlogan03@gmail.com>
Date: Sun, 4 Aug 2024 11:45:40 -0400
Subject: [PATCH 07/18] update changelog

---
 interpn/CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/interpn/CHANGELOG.md b/interpn/CHANGELOG.md
index c773263..f99a892 100644
--- a/interpn/CHANGELOG.md
+++ b/interpn/CHANGELOG.md
@@ -10,7 +10,7 @@
 
 * Use recursive method to evaluate multilinear interpolation instead of hypercube method
   * This makes extrapolation cost consistent with interpolation cost, and reduces nominal perf scaling
-  * Shows about 2x slower perf in micro-benchmarks, but about 10x faster in end-to-end benchmarks after the Python bindings
+  * Shows about 2x slower perf in micro-benchmarks, but about 2x faster in end-to-end benchmarks after the Python bindings
     * Need to improve benchmarking strategy to better capture perf in real-life usage
 * Reduce repeated documentation
 

From dae90e8d05979579825d7f4910ffd62fa6352a4d Mon Sep 17 00:00:00 2001
From: James Logan <jlogan03@gmail.com>
Date: Tue, 6 Aug 2024 05:31:31 -0400
Subject: [PATCH 08/18] remove all inlining annotations

---
 interpn/src/multicubic/rectilinear.rs  | 9 ---------
 interpn/src/multicubic/regular.rs      | 9 ---------
 interpn/src/multilinear/rectilinear.rs | 8 --------
 interpn/src/multilinear/regular.rs     | 8 --------
 4 files changed, 34 deletions(-)

diff --git a/interpn/src/multicubic/rectilinear.rs b/interpn/src/multicubic/rectilinear.rs
index d083132..785a193 100644
--- a/interpn/src/multicubic/rectilinear.rs
+++ b/interpn/src/multicubic/rectilinear.rs
@@ -41,7 +41,6 @@ use num_traits::Float;
 ///
 /// While this method initializes the interpolator struct on every call, the overhead of doing this
 /// is minimal even when using it to evaluate one observation point at a time.
-#[inline(always)]
 pub fn interpn<T: Float>(
     grids: &[&[T]],
     vals: &[T],
@@ -150,7 +149,6 @@ impl<'a, T: Float, const MAXDIMS: usize> MulticubicRectilinear<'a, T, MAXDIMS> {
     /// * If any input dimensions do not match
     /// * If any dimensions have size < 4
     /// * If any step sizes have zero or negative magnitude
-    #[inline(always)]
     pub fn new(
         grids: &'a [&'a [T]],
         vals: &'a [T],
@@ -190,7 +188,6 @@ impl<'a, T: Float, const MAXDIMS: usize> MulticubicRectilinear<'a, T, MAXDIMS> {
     /// # Errors
     ///   * If the dimensionality of the point does not match the data
     ///   * If the dimensionality of point or data does not match the grid
-    #[inline(always)]
     pub fn interp(&self, x: &[&[T]], out: &mut [T]) -> Result<(), &'static str> {
         let n = out.len();
         let ndims = self.grids.len();
@@ -225,7 +222,6 @@ impl<'a, T: Float, const MAXDIMS: usize> MulticubicRectilinear<'a, T, MAXDIMS> {
     ///   * If the dimensionality of either one exceeds the fixed maximum
     ///   * If the index along any dimension exceeds the maximum representable
     ///     integer value within the value type `T`
-    #[inline(always)]
     pub fn interp_one(&self, x: &[T]) -> Result<T, &'static str> {
         // Check sizes
         let ndims = self.grids.len();
@@ -277,7 +273,6 @@ impl<'a, T: Float, const MAXDIMS: usize> MulticubicRectilinear<'a, T, MAXDIMS> {
     /// point in order to capture a full 4-cube.
     ///
     /// Returned value like (lower_corner_index, saturation_flag).
-    #[inline(always)]
     fn get_loc(&self, v: T, dim: usize) -> Result<(usize, Saturation), &'static str> {
         let saturation: Saturation; // What part of the grid cell are we in?
         let grid = self.grids[dim];
@@ -323,7 +318,6 @@ impl<'a, T: Float, const MAXDIMS: usize> MulticubicRectilinear<'a, T, MAXDIMS> {
     }
 
     /// Recursive evaluation of interpolant on each dimension
-    #[inline]
     fn populate(
         &self,
         dim: usize,
@@ -368,7 +362,6 @@ impl<'a, T: Float, const MAXDIMS: usize> MulticubicRectilinear<'a, T, MAXDIMS> {
 }
 
 /// Calculate slopes and offsets & select evaluation method
-#[inline]
 fn interp_inner<T: Float, const MAXDIMS: usize>(
     vals: [T; 4],
     grid_cell: &[T],
@@ -506,7 +499,6 @@ fn interp_inner<T: Float, const MAXDIMS: usize>(
 /// Evaluate a hermite spline function on an interval from x0 to x1,
 /// with imposed slopes k0 and k1 at the endpoints, and normalized
 /// coordinate t = (x - x0) / (x1 - x0).
-#[inline(always)]
 fn normalized_hermite_spline<T: Float>(t: T, y0: T, dy: T, k0: T, k1: T) -> T {
     // `a` and `b` are the difference between this function and a linear one going
     // forward or backward with the imposed slopes.
@@ -542,7 +534,6 @@ fn centered_difference_nonuniform<T: Float>(y0: T, y1: T, y2: T, h01: T, h12: T)
 }
 
 /// Index a single value from an array
-#[inline(always)]
 fn index_arr<T: Copy>(loc: &[usize], dimprod: &[usize], data: &[T]) -> T {
     let mut i = 0;
     for j in 0..dimprod.len() {
diff --git a/interpn/src/multicubic/regular.rs b/interpn/src/multicubic/regular.rs
index 87978af..5e8e568 100644
--- a/interpn/src/multicubic/regular.rs
+++ b/interpn/src/multicubic/regular.rs
@@ -42,7 +42,6 @@ use num_traits::{Float, NumCast};
 ///
 /// While this method initializes the interpolator struct on every call, the overhead of doing this
 /// is minimal even when using it to evaluate one observation point at a time.
-#[inline(always)]
 pub fn interpn<T: Float>(
     dims: &[usize],
     starts: &[T],
@@ -169,7 +168,6 @@ impl<'a, T: Float, const MAXDIMS: usize> MulticubicRegular<'a, T, MAXDIMS> {
     /// * If any input dimensions do not match
     /// * If any dimensions have size < 4
     /// * If any step sizes have zero or negative magnitude
-    #[inline(always)]
     pub fn new(
         dims: &[usize],
         starts: &[T],
@@ -224,7 +222,6 @@ impl<'a, T: Float, const MAXDIMS: usize> MulticubicRegular<'a, T, MAXDIMS> {
     /// # Errors
     ///   * If the dimensionality of the point does not match the data
     ///   * If the dimensionality of point or data does not match the grid
-    #[inline(always)]
     pub fn interp(&self, x: &[&[T]], out: &mut [T]) -> Result<(), &'static str> {
         let n = out.len();
         let ndims = self.ndims;
@@ -257,7 +254,6 @@ impl<'a, T: Float, const MAXDIMS: usize> MulticubicRegular<'a, T, MAXDIMS> {
     ///   * If the dimensionality of either one exceeds the fixed maximum
     ///   * If the index along any dimension exceeds the maximum representable
     ///     integer value within the value type `T`
-    #[inline(always)]
     pub fn interp_one(&self, x: &[T]) -> Result<T, &'static str> {
         // Check sizes
         let ndims = self.ndims;
@@ -321,7 +317,6 @@ impl<'a, T: Float, const MAXDIMS: usize> MulticubicRegular<'a, T, MAXDIMS> {
     /// point in order to capture a full 4-cube.
     ///
     /// Returned value like (lower_corner_index, saturation_flag).
-    #[inline(always)]
     fn get_loc(&self, v: T, dim: usize) -> Result<(usize, Saturation), &'static str> {
         let saturation: Saturation; // What part of the grid cell are we in?
 
@@ -362,7 +357,6 @@ impl<'a, T: Float, const MAXDIMS: usize> MulticubicRegular<'a, T, MAXDIMS> {
     }
 
     /// Recursive evaluation of interpolant on each dimension
-    #[inline]
     fn populate(
         &self,
         dim: usize,
@@ -405,7 +399,6 @@ impl<'a, T: Float, const MAXDIMS: usize> MulticubicRegular<'a, T, MAXDIMS> {
 }
 
 /// Calculate slopes and offsets & select evaluation method
-#[inline]
 fn interp_inner<T: Float, const MAXDIMS: usize>(
     vals: [T; 4],
     t: T,
@@ -530,7 +523,6 @@ fn interp_inner<T: Float, const MAXDIMS: usize>(
 /// Evaluate a hermite spline function on an interval from x0 to x1,
 /// with imposed slopes k0 and k1 at the endpoints, and normalized
 /// coordinate t = (x - x0) / (x1 - x0).
-#[inline(always)]
 fn normalized_hermite_spline<T: Float>(t: T, y0: T, dy: T, k0: T, k1: T) -> T {
     // `a` and `b` are the difference between this function and a linear one going
     // forward or backward with the imposed slopes.
@@ -548,7 +540,6 @@ fn normalized_hermite_spline<T: Float>(t: T, y0: T, dy: T, k0: T, k1: T) -> T {
 }
 
 /// Index a single value from an array
-#[inline(always)]
 fn index_arr<T: Copy>(loc: &[usize], dimprod: &[usize], data: &[T]) -> T {
     let mut i = 0;
     for j in 0..dimprod.len() {
diff --git a/interpn/src/multilinear/rectilinear.rs b/interpn/src/multilinear/rectilinear.rs
index a02bc38..3af62e4 100644
--- a/interpn/src/multilinear/rectilinear.rs
+++ b/interpn/src/multilinear/rectilinear.rs
@@ -39,7 +39,6 @@ use num_traits::Float;
 ///
 /// While this method initializes the interpolator struct on every call, the overhead of doing this
 /// is minimal even when using it to evaluate one observation point at a time.
-#[inline(always)]
 pub fn interpn<T: Float>(
     grids: &[&[T]],
     vals: &[T],
@@ -74,7 +73,6 @@ pub fn interpn_alloc<T: Float>(
 /// # Errors
 /// * If the dimensionality of the grid does not match the dimensionality of the observation points
 /// * If the output slice length does not match the dimensionality of the grid
-#[inline(always)]
 pub fn check_bounds<T: Float>(
     grids: &[&[T]],
     obs: &[&[T]],
@@ -141,7 +139,6 @@ impl<'a, T: Float, const MAXDIMS: usize> MultilinearRectilinear<'a, T, MAXDIMS>
     /// * If any input dimensions do not match
     /// * If any dimensions have size < 4
     /// * If any step sizes have zero or negative magnitude
-    #[inline(always)]
     pub fn new(grids: &'a [&'a [T]], vals: &'a [T]) -> Result<Self, &'static str> {
         // Check dimensions
         let ndims = grids.len();
@@ -172,7 +169,6 @@ impl<'a, T: Float, const MAXDIMS: usize> MultilinearRectilinear<'a, T, MAXDIMS>
     /// # Errors
     ///   * If the dimensionality of the point does not match the data
     ///   * If the dimensionality of point or data does not match the grid
-    #[inline(always)]
     pub fn interp(&self, x: &[&[T]], out: &mut [T]) -> Result<(), &'static str> {
         let n = out.len();
         let ndims = self.grids.len();
@@ -207,7 +203,6 @@ impl<'a, T: Float, const MAXDIMS: usize> MultilinearRectilinear<'a, T, MAXDIMS>
     ///   * If the dimensionality of either one exceeds the fixed maximum
     ///   * If the index along any dimension exceeds the maximum representable
     ///     integer value within the value type `T`
-    #[inline(always)]
     pub fn interp_one(&self, x: &[T]) -> Result<T, &'static str> {
         // Check sizes
         let ndims = self.grids.len();
@@ -258,7 +253,6 @@ impl<'a, T: Float, const MAXDIMS: usize> MultilinearRectilinear<'a, T, MAXDIMS>
     /// point in order to capture a full 4-cube.
     ///
     /// Returned value like (lower_corner_index, saturation_flag).
-    #[inline(always)]
     fn get_loc(&self, v: T, dim: usize) -> Result<usize, &'static str> {
         let grid = self.grids[dim];
 
@@ -279,7 +273,6 @@ impl<'a, T: Float, const MAXDIMS: usize> MultilinearRectilinear<'a, T, MAXDIMS>
     }
 
     /// Recursive evaluation of interpolant on each dimension
-    #[inline]
     fn populate(
         &self,
         dim: usize,
@@ -320,7 +313,6 @@ impl<'a, T: Float, const MAXDIMS: usize> MultilinearRectilinear<'a, T, MAXDIMS>
 }
 
 /// Index a single value from an array
-#[inline(always)]
 fn index_arr<T: Copy>(loc: &[usize], dimprod: &[usize], data: &[T]) -> T {
     let mut i = 0;
     for j in 0..dimprod.len() {
diff --git a/interpn/src/multilinear/regular.rs b/interpn/src/multilinear/regular.rs
index 65a376a..0995e20 100644
--- a/interpn/src/multilinear/regular.rs
+++ b/interpn/src/multilinear/regular.rs
@@ -41,7 +41,6 @@ use num_traits::{Float, NumCast};
 ///
 /// While this method initializes the interpolator struct on every call, the overhead of doing this
 /// is minimal even when using it to evaluate one observation point at a time.
-#[inline(always)]
 pub fn interpn<T: Float>(
     dims: &[usize],
     starts: &[T],
@@ -80,7 +79,6 @@ pub fn interpn_alloc<T: Float>(
 /// # Errors
 /// * If the dimensionality of the grid does not match the dimensionality of the observation points
 /// * If the output slice length does not match the dimensionality of the grid
-#[inline(always)]
 pub fn check_bounds<T: Float>(
     dims: &[usize],
     starts: &[T],
@@ -164,7 +162,6 @@ impl<'a, T: Float, const MAXDIMS: usize> MultilinearRegular<'a, T, MAXDIMS> {
     /// * If any input dimensions do not match
     /// * If any dimensions have size < 2
     /// * If any step sizes have zero or negative magnitude
-    #[inline(always)]
     pub fn new(
         dims: &[usize],
         starts: &[T],
@@ -216,7 +213,6 @@ impl<'a, T: Float, const MAXDIMS: usize> MultilinearRegular<'a, T, MAXDIMS> {
     /// # Errors
     ///   * If the dimensionality of the point does not match the data
     ///   * If the dimensionality of point or data does not match the grid
-    #[inline(always)]
     pub fn interp(&self, x: &[&[T]], out: &mut [T]) -> Result<(), &'static str> {
         let n = out.len();
         let ndims = self.ndims;
@@ -249,7 +245,6 @@ impl<'a, T: Float, const MAXDIMS: usize> MultilinearRegular<'a, T, MAXDIMS> {
     ///   * If the dimensionality of either one exceeds the fixed maximum
     ///   * If the index along any dimension exceeds the maximum representable
     ///     integer value within the value type `T`
-    #[inline(always)]
     pub fn interp_one(&self, x: &[T]) -> Result<T, &'static str> {
         // Check sizes
         let ndims = self.ndims;
@@ -308,7 +303,6 @@ impl<'a, T: Float, const MAXDIMS: usize> MultilinearRegular<'a, T, MAXDIMS> {
     /// point in order to capture a full 4-cube.
     ///
     /// Returned value like (lower_corner_index, saturation_flag).
-    #[inline(always)]
     fn get_loc(&self, v: T, dim: usize) -> Result<usize, &'static str> {
         let floc = ((v - self.starts[dim]) / self.steps[dim]).floor(); // float loc
                                                                        // Signed integer loc, with the bottom of the cell aligned to place the normalized
@@ -323,7 +317,6 @@ impl<'a, T: Float, const MAXDIMS: usize> MultilinearRegular<'a, T, MAXDIMS> {
     }
 
     /// Recursive evaluation of interpolant on each dimension
-    #[inline]
     fn populate(
         &self,
         dim: usize,
@@ -363,7 +356,6 @@ impl<'a, T: Float, const MAXDIMS: usize> MultilinearRegular<'a, T, MAXDIMS> {
 }
 
 /// Index a single value from an array
-#[inline(always)]
 fn index_arr<T: Copy>(loc: &[usize], dimprod: &[usize], data: &[T]) -> T {
     let mut i = 0;
     for j in 0..dimprod.len() {

From 63e9312833fbb7f7b703ebab015aea7b56a8a639 Mon Sep 17 00:00:00 2001
From: James Logan <jlogan03@gmail.com>
Date: Tue, 6 Aug 2024 05:33:41 -0400
Subject: [PATCH 09/18] update changelog

---
 interpn/CHANGELOG.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/interpn/CHANGELOG.md b/interpn/CHANGELOG.md
index f99a892..e2e11e8 100644
--- a/interpn/CHANGELOG.md
+++ b/interpn/CHANGELOG.md
@@ -13,6 +13,8 @@
   * Shows about 2x slower perf in micro-benchmarks, but about 2x faster in end-to-end benchmarks after the Python bindings
     * Need to improve benchmarking strategy to better capture perf in real-life usage
 * Reduce repeated documentation
+* Remove all inlining annotations
+  * Minimal effect on performance; provides more flexibility to downstream applications, especially opt-level=s builds
 
 ## 0.4.2 - 2024-05-12
 

From 03cc7eca442a8f2bc6a57e988369a4032aa678f8 Mon Sep 17 00:00:00 2001
From: James Logan <jlogan03@gmail.com>
Date: Wed, 7 Aug 2024 20:46:43 -0400
Subject: [PATCH 10/18] sprinkle some #[inline] on the leaves

---
 interpn/src/multicubic/rectilinear.rs  | 6 ++++++
 interpn/src/multicubic/regular.rs      | 5 +++++
 interpn/src/multilinear/rectilinear.rs | 3 +++
 interpn/src/multilinear/regular.rs     | 3 +++
 4 files changed, 17 insertions(+)

diff --git a/interpn/src/multicubic/rectilinear.rs b/interpn/src/multicubic/rectilinear.rs
index 785a193..abe5e2a 100644
--- a/interpn/src/multicubic/rectilinear.rs
+++ b/interpn/src/multicubic/rectilinear.rs
@@ -273,6 +273,7 @@ impl<'a, T: Float, const MAXDIMS: usize> MulticubicRectilinear<'a, T, MAXDIMS> {
     /// point in order to capture a full 4-cube.
     ///
     /// Returned value like (lower_corner_index, saturation_flag).
+    #[inline]
     fn get_loc(&self, v: T, dim: usize) -> Result<(usize, Saturation), &'static str> {
         let saturation: Saturation; // What part of the grid cell are we in?
         let grid = self.grids[dim];
@@ -318,6 +319,7 @@ impl<'a, T: Float, const MAXDIMS: usize> MulticubicRectilinear<'a, T, MAXDIMS> {
     }
 
     /// Recursive evaluation of interpolant on each dimension
+    #[inline]
     fn populate(
         &self,
         dim: usize,
@@ -362,6 +364,7 @@ impl<'a, T: Float, const MAXDIMS: usize> MulticubicRectilinear<'a, T, MAXDIMS> {
 }
 
 /// Calculate slopes and offsets & select evaluation method
+#[inline]
 fn interp_inner<T: Float, const MAXDIMS: usize>(
     vals: [T; 4],
     grid_cell: &[T],
@@ -499,6 +502,7 @@ fn interp_inner<T: Float, const MAXDIMS: usize>(
 /// Evaluate a hermite spline function on an interval from x0 to x1,
 /// with imposed slopes k0 and k1 at the endpoints, and normalized
 /// coordinate t = (x - x0) / (x1 - x0).
+#[inline]
 fn normalized_hermite_spline<T: Float>(t: T, y0: T, dy: T, k0: T, k1: T) -> T {
     // `a` and `b` are the difference between this function and a linear one going
     // forward or backward with the imposed slopes.
@@ -524,6 +528,7 @@ fn normalized_hermite_spline<T: Float>(t: T, y0: T, dy: T, k0: T, k1: T) -> T {
 /// which is essentially a distance-weighted average of the forward and backward
 /// differences s.t. the closer points have more influence on the estimate
 /// of the derivative.
+#[inline]
 fn centered_difference_nonuniform<T: Float>(y0: T, y1: T, y2: T, h01: T, h12: T) -> T {
     let a = h01 / (h01 + h12);
     let b = (y2 - y1) / h12;
@@ -534,6 +539,7 @@ fn centered_difference_nonuniform<T: Float>(y0: T, y1: T, y2: T, h01: T, h12: T)
 }
 
 /// Index a single value from an array
+#[inline]
 fn index_arr<T: Copy>(loc: &[usize], dimprod: &[usize], data: &[T]) -> T {
     let mut i = 0;
     for j in 0..dimprod.len() {
diff --git a/interpn/src/multicubic/regular.rs b/interpn/src/multicubic/regular.rs
index 5e8e568..4e2e57c 100644
--- a/interpn/src/multicubic/regular.rs
+++ b/interpn/src/multicubic/regular.rs
@@ -317,6 +317,7 @@ impl<'a, T: Float, const MAXDIMS: usize> MulticubicRegular<'a, T, MAXDIMS> {
     /// point in order to capture a full 4-cube.
     ///
     /// Returned value like (lower_corner_index, saturation_flag).
+    #[inline]
     fn get_loc(&self, v: T, dim: usize) -> Result<(usize, Saturation), &'static str> {
         let saturation: Saturation; // What part of the grid cell are we in?
 
@@ -357,6 +358,7 @@ impl<'a, T: Float, const MAXDIMS: usize> MulticubicRegular<'a, T, MAXDIMS> {
     }
 
     /// Recursive evaluation of interpolant on each dimension
+    #[inline]
     fn populate(
         &self,
         dim: usize,
@@ -399,6 +401,7 @@ impl<'a, T: Float, const MAXDIMS: usize> MulticubicRegular<'a, T, MAXDIMS> {
 }
 
 /// Calculate slopes and offsets & select evaluation method
+#[inline]
 fn interp_inner<T: Float, const MAXDIMS: usize>(
     vals: [T; 4],
     t: T,
@@ -523,6 +526,7 @@ fn interp_inner<T: Float, const MAXDIMS: usize>(
 /// Evaluate a hermite spline function on an interval from x0 to x1,
 /// with imposed slopes k0 and k1 at the endpoints, and normalized
 /// coordinate t = (x - x0) / (x1 - x0).
+#[inline]
 fn normalized_hermite_spline<T: Float>(t: T, y0: T, dy: T, k0: T, k1: T) -> T {
     // `a` and `b` are the difference between this function and a linear one going
     // forward or backward with the imposed slopes.
@@ -540,6 +544,7 @@ fn normalized_hermite_spline<T: Float>(t: T, y0: T, dy: T, k0: T, k1: T) -> T {
 }
 
 /// Index a single value from an array
+#[inline]
 fn index_arr<T: Copy>(loc: &[usize], dimprod: &[usize], data: &[T]) -> T {
     let mut i = 0;
     for j in 0..dimprod.len() {
diff --git a/interpn/src/multilinear/rectilinear.rs b/interpn/src/multilinear/rectilinear.rs
index 3af62e4..f5dd7ae 100644
--- a/interpn/src/multilinear/rectilinear.rs
+++ b/interpn/src/multilinear/rectilinear.rs
@@ -253,6 +253,7 @@ impl<'a, T: Float, const MAXDIMS: usize> MultilinearRectilinear<'a, T, MAXDIMS>
     /// point in order to capture a full 4-cube.
     ///
     /// Returned value like (lower_corner_index, saturation_flag).
+    #[inline]
     fn get_loc(&self, v: T, dim: usize) -> Result<usize, &'static str> {
         let grid = self.grids[dim];
 
@@ -273,6 +274,7 @@ impl<'a, T: Float, const MAXDIMS: usize> MultilinearRectilinear<'a, T, MAXDIMS>
     }
 
     /// Recursive evaluation of interpolant on each dimension
+    #[inline]
     fn populate(
         &self,
         dim: usize,
@@ -313,6 +315,7 @@ impl<'a, T: Float, const MAXDIMS: usize> MultilinearRectilinear<'a, T, MAXDIMS>
 }
 
 /// Index a single value from an array
+#[inline]
 fn index_arr<T: Copy>(loc: &[usize], dimprod: &[usize], data: &[T]) -> T {
     let mut i = 0;
     for j in 0..dimprod.len() {
diff --git a/interpn/src/multilinear/regular.rs b/interpn/src/multilinear/regular.rs
index 0995e20..d000e42 100644
--- a/interpn/src/multilinear/regular.rs
+++ b/interpn/src/multilinear/regular.rs
@@ -303,6 +303,7 @@ impl<'a, T: Float, const MAXDIMS: usize> MultilinearRegular<'a, T, MAXDIMS> {
     /// point in order to capture a full 4-cube.
     ///
     /// Returned value like (lower_corner_index, saturation_flag).
+    #[inline]
     fn get_loc(&self, v: T, dim: usize) -> Result<usize, &'static str> {
         let floc = ((v - self.starts[dim]) / self.steps[dim]).floor(); // float loc
                                                                        // Signed integer loc, with the bottom of the cell aligned to place the normalized
@@ -317,6 +318,7 @@ impl<'a, T: Float, const MAXDIMS: usize> MultilinearRegular<'a, T, MAXDIMS> {
     }
 
     /// Recursive evaluation of interpolant on each dimension
+    #[inline]
     fn populate(
         &self,
         dim: usize,
@@ -356,6 +358,7 @@ impl<'a, T: Float, const MAXDIMS: usize> MultilinearRegular<'a, T, MAXDIMS> {
 }
 
 /// Index a single value from an array
+#[inline]
 fn index_arr<T: Copy>(loc: &[usize], dimprod: &[usize], data: &[T]) -> T {
     let mut i = 0;
     for j in 0..dimprod.len() {

From 563500e3bbf5a80fe27306777b96c513d1e77661 Mon Sep 17 00:00:00 2001
From: James Logan <jlogan03@gmail.com>
Date: Wed, 7 Aug 2024 20:47:45 -0400
Subject: [PATCH 11/18] fix typo

---
 interpn/src/multicubic/regular.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/interpn/src/multicubic/regular.rs b/interpn/src/multicubic/regular.rs
index 4e2e57c..caf3981 100644
--- a/interpn/src/multicubic/regular.rs
+++ b/interpn/src/multicubic/regular.rs
@@ -268,7 +268,7 @@ impl<'a, T: Float, const MAXDIMS: usize> MulticubicRegular<'a, T, MAXDIMS> {
         //
         // Also notably, storing the index offsets as bool instead of usize
         // reduces memory overhead, but has not effect on throughput rate.
-        let origin = &mut [0_usize; MAXDIMS][..ndims]; // Indices of lower corner of hypercub
+        let origin = &mut [0_usize; MAXDIMS][..ndims]; // Indices of lower corner of hypercube
         let sat = &mut [Saturation::None; MAXDIMS][..ndims]; // Saturation none/high/low flags for each dim
         let dts = &mut [T::zero(); MAXDIMS][..ndims]; // Normalized coordinate storage
         let dimprod = &mut [1_usize; MAXDIMS][..ndims];

From d985fa414adc285b81bd304d3cd5d370d2873f55 Mon Sep 17 00:00:00 2001
From: James Logan <jlogan03@gmail.com>
Date: Wed, 7 Aug 2024 20:49:00 -0400
Subject: [PATCH 12/18] update changelog

---
 interpn/CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/interpn/CHANGELOG.md b/interpn/CHANGELOG.md
index e2e11e8..a611bd3 100644
--- a/interpn/CHANGELOG.md
+++ b/interpn/CHANGELOG.md
@@ -13,7 +13,7 @@
   * Shows about 2x slower perf in micro-benchmarks, but about 2x faster in end-to-end benchmarks after the Python bindings
     * Need to improve benchmarking strategy to better capture perf in real-life usage
 * Reduce repeated documentation
-* Remove all inlining annotations
+* Remove some inlining annotations and all instances of `#[inline(always)]`
   * Minimal effect on performance; provides more flexibility to downstream applications, especially opt-level=s builds
 
 ## 0.4.2 - 2024-05-12

From 5e4e2bb1607f9016f67aeefbf1faeb9326d857cf Mon Sep 17 00:00:00 2001
From: James Logan <jlogan03@gmail.com>
Date: Mon, 19 Aug 2024 22:39:21 -0400
Subject: [PATCH 13/18] add tests of grid cell alignment and fix grid cell
 alignment issues

---
 interpn/src/multilinear/rectilinear.rs | 32 +++++++++++++++++++++-----
 interpn/src/multilinear/regular.rs     | 26 +++++++++++++++++++--
 2 files changed, 50 insertions(+), 8 deletions(-)

diff --git a/interpn/src/multilinear/rectilinear.rs b/interpn/src/multilinear/rectilinear.rs
index f5dd7ae..bbebaf7 100644
--- a/interpn/src/multilinear/rectilinear.rs
+++ b/interpn/src/multilinear/rectilinear.rs
@@ -246,13 +246,10 @@ impl<'a, T: Float, const MAXDIMS: usize> MultilinearRectilinear<'a, T, MAXDIMS>
         Ok(interped)
     }
 
-    /// Get the two-lower index along this dimension where `x` is found,
+    /// Get the lower-corner index along this dimension where `x` is found,
     /// saturating to the bounds at the edges if necessary.
     ///
-    /// At the high bound of a given dimension, saturates to the fourth internal
-    /// point in order to capture a full 4-cube.
-    ///
-    /// Returned value like (lower_corner_index, saturation_flag).
+    /// At the high bound of a given dimension, saturates to the interior.
     #[inline]
     fn get_loc(&self, v: T, dim: usize) -> Result<usize, &'static str> {
         let grid = self.grids[dim];
@@ -264,7 +261,7 @@ impl<'a, T: Float, const MAXDIMS: usize> MultilinearRectilinear<'a, T, MAXDIMS>
         //
         // This process accounts for essentially the entire difference in
         // performance between this method and the regular-grid method.
-        let iloc: isize = grid.partition_point(|x| *x < v) as isize - 2;
+        let iloc: isize = grid.partition_point(|x| *x < v) as isize - 1;
 
         let n = self.dims[dim] as isize; // Number of grid points on this dimension
         let dimmax = n.saturating_sub(2).max(0); // maximum index for lower corner
@@ -409,4 +406,27 @@ mod test {
             (0..uobs.len()).for_each(|i| assert!((out[i] - uobs[i]).abs() < 1e-12));
         }
     }
+
+    /// Interpolate on a hat-shaped function to make sure that the grid cell indexing is aligned properly
+    #[test]
+    fn test_interp_hat_func() {
+        fn hat_func(x: f64) -> f64 {
+            if x <= 1.0 {
+                x
+            } else {
+                2.0 - x
+            }
+        }
+
+        let x = (0..3).map(|x| x as f64).collect::<Vec<f64>>();
+        let grids = [&x[..]];
+        let y = (0..3).map(|x| hat_func(x as f64)).collect::<Vec<f64>>();
+        let obs = (-2..6).map(|x| x as f64 * 0.75).collect::<Vec<f64>>();
+
+        let interpolator: MultilinearRectilinear<f64, 1> = MultilinearRectilinear::new(&grids, &y).unwrap();
+
+        (0..obs.len()).for_each(|i| {
+            assert_eq!(hat_func(obs[i]), interpolator.interp_one(&[obs[i]]).unwrap());
+        })
+    }
 }
diff --git a/interpn/src/multilinear/regular.rs b/interpn/src/multilinear/regular.rs
index d000e42..62b95f4 100644
--- a/interpn/src/multilinear/regular.rs
+++ b/interpn/src/multilinear/regular.rs
@@ -308,7 +308,7 @@ impl<'a, T: Float, const MAXDIMS: usize> MultilinearRegular<'a, T, MAXDIMS> {
         let floc = ((v - self.starts[dim]) / self.steps[dim]).floor(); // float loc
                                                                        // Signed integer loc, with the bottom of the cell aligned to place the normalized
                                                                        // coordinate t=0 at cell index 1
-        let iloc = <isize as NumCast>::from(floc).ok_or("Unrepresentable coordinate value")? - 1;
+        let iloc = <isize as NumCast>::from(floc).ok_or("Unrepresentable coordinate value")?;
 
         let n = self.dims[dim] as isize; // Number of grid points on this dimension
         let dimmax = n.saturating_sub(2).max(0); // maximum index for lower corner
@@ -371,7 +371,7 @@ fn index_arr<T: Copy>(loc: &[usize], dimprod: &[usize], data: &[T]) -> T {
 #[cfg(test)]
 mod test {
     use super::interpn;
-    use crate::utils::*;
+    use crate::{utils::*, MultilinearRegular};
 
     /// Iterate from 1 to 8 dimensions, making a minimum-sized grid for each one
     /// to traverse every combination of interpolating or extrapolating high or low on each dimension.
@@ -418,4 +418,26 @@ mod test {
             });
         }
     }
+
+    /// Interpolate on a hat-shaped function to make sure that the grid cell indexing is aligned properly
+    #[test]
+    fn test_interp_hat_func() {
+        fn hat_func(x: f64) -> f64 {
+            if x <= 1.0 {
+                x
+            } else {
+                2.0 - x
+            }
+        }
+
+        // let x = (0..3).map(|x| x as f64).collect::<Vec<f64>>();
+        let y = (0..3).map(|x| hat_func(x as f64)).collect::<Vec<f64>>();
+        let obs = (-2..6).map(|x| x as f64 * 0.75).collect::<Vec<f64>>();
+
+        let interpolator: MultilinearRegular<f64, 1> = MultilinearRegular::new(&[3], &[0.0], &[1.0], &y).unwrap();
+
+        (0..obs.len()).for_each(|i| {
+            assert_eq!(hat_func(obs[i]), interpolator.interp_one(&[obs[i]]).unwrap());
+        })
+    }
 }

From 8ed4ecb6792713a0ca9c348f1b94983561471245 Mon Sep 17 00:00:00 2001
From: James Logan <jlogan03@gmail.com>
Date: Mon, 19 Aug 2024 22:40:31 -0400
Subject: [PATCH 14/18] format

---
 interpn/src/multilinear/rectilinear.rs | 8 ++++++--
 interpn/src/multilinear/regular.rs     | 8 ++++++--
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/interpn/src/multilinear/rectilinear.rs b/interpn/src/multilinear/rectilinear.rs
index bbebaf7..c0ce7a2 100644
--- a/interpn/src/multilinear/rectilinear.rs
+++ b/interpn/src/multilinear/rectilinear.rs
@@ -423,10 +423,14 @@ mod test {
         let y = (0..3).map(|x| hat_func(x as f64)).collect::<Vec<f64>>();
         let obs = (-2..6).map(|x| x as f64 * 0.75).collect::<Vec<f64>>();
 
-        let interpolator: MultilinearRectilinear<f64, 1> = MultilinearRectilinear::new(&grids, &y).unwrap();
+        let interpolator: MultilinearRectilinear<f64, 1> =
+            MultilinearRectilinear::new(&grids, &y).unwrap();
 
         (0..obs.len()).for_each(|i| {
-            assert_eq!(hat_func(obs[i]), interpolator.interp_one(&[obs[i]]).unwrap());
+            assert_eq!(
+                hat_func(obs[i]),
+                interpolator.interp_one(&[obs[i]]).unwrap()
+            );
         })
     }
 }
diff --git a/interpn/src/multilinear/regular.rs b/interpn/src/multilinear/regular.rs
index 62b95f4..f3d7ce4 100644
--- a/interpn/src/multilinear/regular.rs
+++ b/interpn/src/multilinear/regular.rs
@@ -434,10 +434,14 @@ mod test {
         let y = (0..3).map(|x| hat_func(x as f64)).collect::<Vec<f64>>();
         let obs = (-2..6).map(|x| x as f64 * 0.75).collect::<Vec<f64>>();
 
-        let interpolator: MultilinearRegular<f64, 1> = MultilinearRegular::new(&[3], &[0.0], &[1.0], &y).unwrap();
+        let interpolator: MultilinearRegular<f64, 1> =
+            MultilinearRegular::new(&[3], &[0.0], &[1.0], &y).unwrap();
 
         (0..obs.len()).for_each(|i| {
-            assert_eq!(hat_func(obs[i]), interpolator.interp_one(&[obs[i]]).unwrap());
+            assert_eq!(
+                hat_func(obs[i]),
+                interpolator.interp_one(&[obs[i]]).unwrap()
+            );
         })
     }
 }

From 218377ba1ce9f175453e28ed57b93edd771d9cc1 Mon Sep 17 00:00:00 2001
From: James Logan <jlogan03@gmail.com>
Date: Tue, 20 Aug 2024 19:48:06 -0400
Subject: [PATCH 15/18] update changelog

---
 interpn/CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/interpn/CHANGELOG.md b/interpn/CHANGELOG.md
index a611bd3..9daeee5 100644
--- a/interpn/CHANGELOG.md
+++ b/interpn/CHANGELOG.md
@@ -5,6 +5,7 @@
 ## Added
 
 * Implement `interpn_alloc` function for each method, which allocates a Vec for the output
+* Add test of linear methods using hat function to check grid cell alignment
 
 ## Changed
 

From dd794958a96a410336c68d651b2dab2ed1efbc57 Mon Sep 17 00:00:00 2001
From: James Logan <jlogan03@gmail.com>
Date: Tue, 20 Aug 2024 19:48:51 -0400
Subject: [PATCH 16/18] changelog formatting

---
 interpn/CHANGELOG.md | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/interpn/CHANGELOG.md b/interpn/CHANGELOG.md
index 9daeee5..721f539 100644
--- a/interpn/CHANGELOG.md
+++ b/interpn/CHANGELOG.md
@@ -2,12 +2,12 @@
 
 ## 0.4.3 - 2024-08-03
 
-## Added
+### Added
 
 * Implement `interpn_alloc` function for each method, which allocates a Vec for the output
 * Add test of linear methods using hat function to check grid cell alignment
 
-## Changed
+### Changed
 
 * Use recursive method to evaluate multilinear interpolation instead of hypercube method
   * This makes extrapolation cost consistent with interpolation cost, and reduces nominal perf scaling
@@ -19,23 +19,23 @@
 
 ## 0.4.2 - 2024-05-12
 
-## Added
+### Added
 
 * Implement cubic rectilinear method
 
 ## 0.4.1 - 2024-05-06
 
-## Fixed
+### Fixed
 
 * Fix grid cell index selection to properly center the grid cell s.t. t=0 corresponds to index 1
 
-## Added
+### Added
 
 * Add test of cubic method against sine function to capture potential problems not visible when testing against linear and quadratic functions
 
 ## 0.4.0 - 2024-05-05
 
-## Changes
+### Changed
 
 * Implement cubic interpolation for regular grid
     * Continuous first derivative everywhere
@@ -45,7 +45,7 @@
 
 ## 0.3.0 - 2023-12-17
 
-## Changed
+### Changed
 
 * Remove initial guess for cell index from rectilinear method
 * Collapse some loops
@@ -58,6 +58,6 @@
 
 ## 0.2.0 - 2023-12-06
 
-## Changed
+### Changed
 
 * Propagate Result everywhere that unwrap or assert! was being used

From 8c7abad0730d318e07ea87656c96736316f44ce7 Mon Sep 17 00:00:00 2001
From: James Logan <jlogan03@gmail.com>
Date: Tue, 20 Aug 2024 20:11:34 -0400
Subject: [PATCH 17/18] update readme

---
 README.md          | 1 -
 interpn/README.md  | 1 -
 interpn/src/lib.rs | 1 -
 3 files changed, 3 deletions(-)

diff --git a/README.md b/README.md
index adf9308..318dbfc 100644
--- a/README.md
+++ b/README.md
@@ -81,7 +81,6 @@ multicubic::rectilinear::interpn(grids, &z, false, &obs, &mut out).unwrap();
 ```
 
 # Development Roadmap
-* Recursive multilinear methods (for better extrapolation speed and timing determinism)
 * Methods for unstructured triangular and tetrahedral meshes
 
 # License
diff --git a/interpn/README.md b/interpn/README.md
index adf9308..318dbfc 100644
--- a/interpn/README.md
+++ b/interpn/README.md
@@ -81,7 +81,6 @@ multicubic::rectilinear::interpn(grids, &z, false, &obs, &mut out).unwrap();
 ```
 
 # Development Roadmap
-* Recursive multilinear methods (for better extrapolation speed and timing determinism)
 * Methods for unstructured triangular and tetrahedral meshes
 
 # License
diff --git a/interpn/src/lib.rs b/interpn/src/lib.rs
index 5c64270..bf3a625 100644
--- a/interpn/src/lib.rs
+++ b/interpn/src/lib.rs
@@ -80,7 +80,6 @@
 //! ```
 //!
 //! # Development Roadmap
-//! * Recursive multilinear methods (for better extrapolation speed and timing determinism)
 //! * Methods for unstructured triangular and tetrahedral meshes
 #![cfg_attr(not(feature = "std"), no_std)]
 // These "needless" range loops are a significant speedup

From 39cbcd7f1c51729c4aaee9ae8ab1575fce8e5c37 Mon Sep 17 00:00:00 2001
From: James Logan <jlogan03@gmail.com>
Date: Tue, 20 Aug 2024 20:51:46 -0400
Subject: [PATCH 18/18] test more points in hat func tests

---
 interpn/src/multilinear/rectilinear.rs | 2 +-
 interpn/src/multilinear/regular.rs     | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/interpn/src/multilinear/rectilinear.rs b/interpn/src/multilinear/rectilinear.rs
index c0ce7a2..2419115 100644
--- a/interpn/src/multilinear/rectilinear.rs
+++ b/interpn/src/multilinear/rectilinear.rs
@@ -421,7 +421,7 @@ mod test {
         let x = (0..3).map(|x| x as f64).collect::<Vec<f64>>();
         let grids = [&x[..]];
         let y = (0..3).map(|x| hat_func(x as f64)).collect::<Vec<f64>>();
-        let obs = (-2..6).map(|x| x as f64 * 0.75).collect::<Vec<f64>>();
+        let obs = linspace(-2.0, 4.0, 100);
 
         let interpolator: MultilinearRectilinear<f64, 1> =
             MultilinearRectilinear::new(&grids, &y).unwrap();
diff --git a/interpn/src/multilinear/regular.rs b/interpn/src/multilinear/regular.rs
index f3d7ce4..ac45d20 100644
--- a/interpn/src/multilinear/regular.rs
+++ b/interpn/src/multilinear/regular.rs
@@ -430,9 +430,8 @@ mod test {
             }
         }
 
-        // let x = (0..3).map(|x| x as f64).collect::<Vec<f64>>();
         let y = (0..3).map(|x| hat_func(x as f64)).collect::<Vec<f64>>();
-        let obs = (-2..6).map(|x| x as f64 * 0.75).collect::<Vec<f64>>();
+        let obs = linspace(-2.0, 4.0, 100);
 
         let interpolator: MultilinearRegular<f64, 1> =
             MultilinearRegular::new(&[3], &[0.0], &[1.0], &y).unwrap();