diff --git a/README.md b/README.md
index 45daed7..2c7c03c 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
## A Hierarchical Resampling Package for Python
-Version 1.1.2
+Version 1.1.3
hierarch is a package for hierarchical resampling (bootstrapping, permutation) of datasets in Python. Because for loops are ultimately intrinsic to cluster-aware resampling, hierarch uses Numba to accelerate many of its key functions.
@@ -13,6 +13,7 @@ hierarch has several functions to assist in performing resampling-based (and the
1. [Introduction](#introduction)
2. [Setup](#setup)
3. [Documentation](#documentation)
+4. [Citation](#citation)
@@ -50,4 +51,12 @@ Alternatively, you can install from Anaconda.
## Documentation
-Check out our user guide at [readthedocs](https://hierarch.readthedocs.io/).
\ No newline at end of file
+Check out our user guide at [readthedocs](https://hierarch.readthedocs.io/).
+
+
+## Citation
+If hierarch helps you analyze your data, please consider citing it. The manuscript also
+contains a set of simulations validating hierarchical randomization tests under a variety
+of conditions.
+
+Analyzing Nested Experimental Designs – A User-Friendly Resampling Method to Determine Experimental Significance. Rishikesh U. Kulkarni, Catherine L. Wang, Carolyn R. Bertozzi. bioRxiv 2021.06.29.450439; doi: https://doi.org/10.1101/2021.06.29.450439
\ No newline at end of file
diff --git a/docs/requirements.txt b/docs/requirements.txt
index 10c98e6..001f7d7 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -1,7 +1,7 @@
-sphinx==4.0.2
+sphinx==4.0.3
sphinx_rtd_theme==0.5.2
numpydoc==1.1.0
numba==0.53.1
-numpy==1.20.3
-pandas==1.2.4
-scipy==1.6.3
+numpy==1.21.0
+pandas==1.3.0
+scipy==1.7.0
diff --git a/docs/user/overview.rst b/docs/user/overview.rst
index 7a38038..91f1217 100644
--- a/docs/user/overview.rst
+++ b/docs/user/overview.rst
@@ -54,4 +54,10 @@ The code to perform a hierarchical permutation t-test on this dataset looks like
from hierarch.stats import hypothesis_test
hypothesis_test(data, treatment_col=0,
- bootstraps=1000, permutations='all')
\ No newline at end of file
+ bootstraps=1000, permutations='all')
+
+If you find hierarch useful for analyzing your data, please consider citing it.
+
+Analyzing Nested Experimental Designs – A User-Friendly Resampling Method to Determine Experimental Significance
+Rishikesh U. Kulkarni, Catherine L. Wang, Carolyn R. Bertozzi
+bioRxiv 2021.06.29.450439; doi: https://doi.org/10.1101/2021.06.29.450439
\ No newline at end of file
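
A self-contained version of the snippet above, for readers who want to run it directly. This is a minimal sketch: the dataset is hypothetical (treatment labels in column 0, replicate IDs in column 1, measurements in the last column) and far smaller than a real experiment.

import numpy as np
from hierarch.stats import hypothesis_test

# hypothetical nested dataset: two treatments, two replicates per treatment,
# two measurements per replicate
data = np.array(
    [
        [0, 1, 1.2], [0, 1, 1.4],
        [0, 2, 1.1], [0, 2, 1.5],
        [1, 1, 2.1], [1, 1, 2.0],
        [1, 2, 2.4], [1, 2, 2.2],
    ]
)

# hierarchical permutation t-test against the labels in column 0
p_value = hypothesis_test(data, treatment_col=0,
                          bootstraps=1000, permutations='all')
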
diff --git a/hierarch/internal_functions.py b/hierarch/internal_functions.py
index 1ca6028..8a7c2a5 100644
--- a/hierarch/internal_functions.py
+++ b/hierarch/internal_functions.py
@@ -173,7 +173,7 @@ def bounded_uint(ub):
Returns
-------
int
- """
+ """
x = np.random.randint(low=2 ** 32)
m = ub * x
l = np.uint32(m)
@@ -193,7 +193,7 @@ def bounded_uint(ub):
@nb.jit(nopython=True, cache=True)
def nb_fast_shuffle(arr):
"""Reimplementation of Fisher-Yates shuffle using bounded_uint to generate random numbers.
- """
+ """
i = arr.shape[0] - 1
while i > 0:
j = bounded_uint(i + 1)
@@ -211,7 +211,7 @@ def nb_strat_shuffle(arr, stratification):
Target array.
stratification : 1D array-like
Ranges to shuffle within. Must be sorted.
- """
+ """
for v, w in zip(stratification[:-1], stratification[1:]):
i = w - v - 1
while i > 0:
@@ -248,137 +248,6 @@ def id_cluster_counts(design):
return cluster_dict
-@nb.jit(nopython=True, cache=True)
-def nb_reweighter(
- data, columns_to_resample, clusternum_dict, start: int, shape: int, indexes=True
-):
- """Internal function for bootstrapping a design matrix with integer
- weights.
-
- Parameters
- ----------
- data : 2D array
- Target data to be bootstrapped.
- columns_to_resample : 1D bool array-like
- False for columns to be skipped in the resampling plan.
- clusternum_dict : TypedDict
- Hierarchy dictionary produced by id_cluster_counts
- start : int
- First column of the data matrix to resample
- shape : int
- Last column of the data matrix to resample
- indexes : bool, optional
- If True, returns a reindexed array. If False, returns
- a reweighted array, by default True.
-
- Returns
- -------
- 2D array
- Nonparametrically bootstrapped sample from the input data array.
- """
-
- out = data.astype(np.float64)
- # at the start, everything is weighted equally
- weights = np.array([1 for i in clusternum_dict[start]])
-
- for key in range(start, shape):
- # fetch design matrix info for current column
- to_do = clusternum_dict[key]
- # preallocate the full array for new_weight
- new_weight = np.empty(to_do.sum(), np.int64)
- place = 0
-
- # if not resampling this column, new_weight is all 1
- if not columns_to_resample[key]:
- for idx, v in enumerate(to_do):
- num = np.array([1 for m in range(v.item())])
- # carry over resampled weights from previous columns
- num *= weights[idx]
- for idx_2, w in enumerate(num):
- new_weight[place + idx_2] = w.item()
- place += v
-
- # else do a multinomial experiment to generate new_weight
- else:
- for idx, v in enumerate(to_do):
- num = v.item()
- # num*weights[idx] carries over weights from previous columns
- randos = np.random.multinomial(num * weights[idx], [1 / num] * num)
- for idx_2, w in enumerate(randos):
- new_weight[place + idx_2] = w.item()
- place += v
-
- weights = new_weight
-
- if indexes is False:
- out[:, -1] = out[:, -1] * weights
- return out
- else:
- indexes = weights_to_index(weights)
- return out[indexes]
-
-
-@nb.jit(nopython=True, cache=True)
-def nb_reweighter_real(data, columns_to_resample, clusternum_dict, start, shape):
- """Internal function for bootstrapping a design matrix with real number
- weights.
-
- Parameters
- ----------
- data : 2D array
- Target data to be bootstrapped.
- columns_to_resample : 1D bool array-like
- False for columns to be skipped in the resampling plan.
- clusternum_dict : TypedDict
- Hierarchy dictionary produced by id_cluster_counts
- start : int
- First column of the data matrix to resample
- shape : int
- Last column of the data matrix to resample
-
- Returns
- -------
- 2D array
- Nonparametrically bootstrapped sample from the input data array.
- """
-
- out = data.astype(np.float64)
- # at the start, everything is weighted equally
- # dype is float64 because weights can be any real number
- weights = np.array([1 for i in clusternum_dict[start]], dtype=np.float64)
-
- for key in range(start, shape):
- # fetch design matrix info for current column
- to_do = clusternum_dict[key]
- # preallocate the full array for new_weight
- new_weight = np.empty(to_do.sum(), np.float64)
- place = 0
-
- # if not resampling this column, new_weight is all 1
- if not columns_to_resample[key]:
- for idx, v in enumerate(to_do):
- num = np.array([1 for m in range(v.item())], dtype=np.float64)
- num *= weights[idx]
- for idx_2, w in enumerate(num):
- new_weight[place + idx_2] = w.item()
- place += v
-
- # else do a dirichlet experiment to generate new_weight
- else:
- for idx, v in enumerate(to_do):
- num = [1 for a in range(v.item())]
- # multiplying by weights[idx] carries over prior columns
- randos = np.random.dirichlet(num, size=None) * weights[idx] * v.item()
- for idx_2, w in enumerate(randos):
- new_weight[place + idx_2] = w.item()
- place += v
-
- weights = new_weight
-
- out[:, -1] = out[:, -1] * weights
- return out
-
-
@nb.jit(nopython=True, cache=True)
def weights_to_index(weights):
"""Converts a 1D array of integer weights to indices.
@@ -591,4 +460,3 @@ def class_make_ufunc_list(target, reference, counts):
return ufunc_list
-
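
A note on the import change in resampling.py below: weights_to_index converts a 1D array of integer bootstrap weights into row indices by repeating each index according to its weight. A plain-NumPy sketch of the same semantics (the shipped function is Numba-compiled):

import numpy as np

def weights_to_index_sketch(weights):
    # e.g. weights [2, 0, 1] -> indices [0, 0, 2]
    return np.repeat(np.arange(weights.size), weights)
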
diff --git a/hierarch/resampling.py b/hierarch/resampling.py
index d43fa43..6061467 100644
--- a/hierarch/resampling.py
+++ b/hierarch/resampling.py
@@ -3,8 +3,6 @@
from numba import jit
from functools import lru_cache
from hierarch.internal_functions import (
- nb_reweighter,
- nb_reweighter_real,
nb_unique,
id_cluster_counts,
msp,
@@ -12,6 +10,7 @@
_repeat,
nb_fast_shuffle,
nb_strat_shuffle,
+ weights_to_index,
)
@@ -269,13 +268,21 @@ def fit(self, data, skip=None, y=-1):
else:
skip = []
- self.cluster_dict = id_cluster_counts(data[:, :y])
+ cluster_dict = id_cluster_counts(data[:, :y])
+ cluster_dict = tuple(reversed(list(cluster_dict.values())))
+ cluster_dict = tuple(map(tuple, cluster_dict))
y %= data.shape[1]
- self.shape = y
+ shape = y
- self.columns_to_resample = np.array([True for k in range(self.shape)])
+ columns_to_resample = np.array([True for k in range(shape)])
for key in skip:
- self.columns_to_resample[key] = False
+ columns_to_resample[key] = False
+
+ kind = str(self.kind)
+
+ self.transform = _bootstrapper_factory(
+ tuple(columns_to_resample), cluster_dict, shape, kind
+ )
def transform(self, data, start: int):
"""Generate a bootstrapped sample from target data.
@@ -294,30 +301,140 @@ def transform(self, data, start: int):
according to "kind" argument.
"""
- if self.kind == "weights":
- resampled = nb_reweighter(
- data,
- self.columns_to_resample,
- self.cluster_dict,
- start,
- self.shape,
- indexes=False,
- )
- elif self.kind == "indexes":
- resampled = nb_reweighter(
- data,
- self.columns_to_resample,
- self.cluster_dict,
- start,
- self.shape,
- indexes=True,
- )
+ raise Exception("Use fit() before using transform().")
- elif self.kind == "bayesian":
- resampled = nb_reweighter_real(
- data, self.columns_to_resample, self.cluster_dict, start, self.shape
+@lru_cache()
+def _bootstrapper_factory(columns_to_resample, clusternum_dict, shape, kind):
+ """Factory function that returns the appropriate transform().
+ """
+ clusternum_dict = tuple(map(np.array, clusternum_dict))
+ columns_to_resample = np.array(columns_to_resample)
+
+ if kind == "weights":
+
+ @jit(nopython=True)
+ def _bootstrapper_impl(data, start):
+ out = data.astype(np.float64)
+ # at the start, everything is weighted equally
+ weights = np.array([1 for i in clusternum_dict[start]])
+
+ for key in range(start, shape):
+ # fetch design matrix info for current column
+ to_do = clusternum_dict[key]
+ # preallocate the full array for new_weight
+ new_weight = np.empty(to_do.sum(), np.int64)
+ place = 0
+
+ # if not resampling this column, new_weight is the prior column's weights
+ if not columns_to_resample[key]:
+ for idx, v in enumerate(to_do):
+ new_weight[place : place + v] = np.array(
+ [weights[idx] for m in range(v.item())]
+ )
+ place += v
+
+ # else do a multinomial experiment to generate new_weight
+ else:
+ for idx, v in enumerate(to_do):
+ # v*weights[idx] carries over weights from previous columns
+ new_weight[place : place + v] = np.random.multinomial(
+ v * weights[idx], [1 / v] * v
+ )
+ place += v
+
+ weights = new_weight
+
+ out[:, -1] = out[:, -1] * weights
+ return out
+
+ elif kind == "indexes":
+
+ @jit(nopython=True)
+ def _bootstrapper_impl(data, start):
+ out = data.astype(np.float64)
+ # at the start, everything is weighted equally
+ weights = np.array([1 for i in clusternum_dict[start]])
+
+ for key in range(start, shape):
+ # fetch design matrix info for current column
+ to_do = clusternum_dict[key]
+ # preallocate the full array for new_weight
+ new_weight = np.empty(to_do.sum(), np.int64)
+ place = 0
+
+ # if not resampling this column, new_weight is the prior column's weights
+ if not columns_to_resample[key]:
+ for idx, v in enumerate(to_do):
+ new_weight[place : place + v] = np.array(
+ [weights[idx] for m in range(v.item())]
+ )
+ place += v
+
+ # else do a multinomial experiment to generate new_weight
+ else:
+ for idx, v in enumerate(to_do):
+ # v*weights[idx] carries over weights from previous columns
+ new_weight[place : place + v] = np.random.multinomial(
+ v * weights[idx], [1 / v] * v
+ )
+ place += v
+
+ weights = new_weight
+
+ indexes = weights_to_index(weights)
+ return out[indexes]
+
+ elif kind == "bayesian":
+
+ @jit(nopython=True)
+ def _bootstrapper_impl(data, start):
+ out = data.astype(np.float64)
+ # at the start, everything is weighted equally
+            # dtype is float64 because weights can be any real number
+ weights = np.array(
+ [1 for i in clusternum_dict[start]], dtype=np.float64
)
- return resampled
+
+ for key in range(start, shape):
+ # fetch design matrix info for current column
+ to_do = clusternum_dict[key]
+ # preallocate the full array for new_weight
+ new_weight = np.empty(to_do.sum(), np.float64)
+ place = 0
+
+                # if not resampling this column, new_weight is the prior column's weights
+ if not columns_to_resample[key]:
+ for idx, v in enumerate(to_do):
+ new_weight[place : place + v] = np.array(
+ [weights[idx] for m in range(v.item())],
+ dtype=np.float64,
+ )
+ place += v
+
+ # else do a dirichlet experiment to generate new_weight
+ else:
+ for idx, v in enumerate(to_do):
+ # multiplying by weights[idx] carries over prior columns
+ new_weight[place : place + v] = (
+ np.random.dirichlet(
+ [1 for a in range(v.item())], size=None
+ )
+ * weights[idx]
+ * v.item()
+ )
+ place += v
+
+ weights = new_weight
+
+ out[:, -1] = out[:, -1] * weights
+ return out
+
+ else:
+ raise KeyError(
+ "No such bootstrapping algorithm. kind must be 'weights' or 'indexes' or 'bayesian'"
+ )
+
+ return _bootstrapper_impl
class Permuter:
@@ -441,9 +558,9 @@ def fit(self, data, col_to_permute: int, exact=False):
col_values = values[:, -2].copy()
self.iterator = cycle(msp(col_values))
if len(col_values) == len(data):
- self.transform = self._exact_return(col_to_permute, self.iterator)
+ self.transform = _exact_return(col_to_permute, self.iterator)
else:
- self.transform = self._exact_repeat_return(
+ self.transform = _exact_repeat_return(
col_to_permute, self.iterator, counts
)
@@ -458,13 +575,13 @@ def fit(self, data, col_to_permute: int, exact=False):
keys = tuple(keys.tolist())
if indexes.size == len(data):
- self.transform = self._random_return(col_to_permute, keys)
+ self.transform = _random_return(col_to_permute, keys)
else:
col_values = data[:, col_to_permute][indexes]
col_values = tuple(col_values.tolist())
counts = tuple(counts.tolist())
- self.transform = self._random_repeat_return(
+ self.transform = _random_repeat_return(
col_to_permute, col_values, keys, counts
)
@@ -486,75 +603,71 @@ def transform(self, data):
-        # four static methods defined below
+        # four module-level functions defined below
raise Exception("Use fit() before calling transform.")
- @staticmethod
- def _exact_return(col_to_permute, generator):
- """Transformer when exact is True and permutations are unrestricted.
- """
-
- def _exact_return_impl(data):
- data[:, col_to_permute] = next(generator)
- return data
+def _exact_return(col_to_permute, generator):
+ """Transformer when exact is True and permutations are unrestricted.
+ """
- return _exact_return_impl
+ def _exact_return_impl(data):
+ data[:, col_to_permute] = next(generator)
+ return data
- @staticmethod
- def _exact_repeat_return(col_to_permute, generator, counts):
- """Transformer when exact is True and permutations are restricted by
- repetition of treated entities.
- """
+ return _exact_return_impl
- def _rep_iter_return_impl(data):
- data[:, col_to_permute] = _repeat(tuple(next(generator)), counts)
- return data
+def _exact_repeat_return(col_to_permute, generator, counts):
+ """Transformer when exact is True and permutations are restricted by
+ repetition of treated entities.
+ """
- return _rep_iter_return_impl
+ def _rep_iter_return_impl(data):
+ data[:, col_to_permute] = _repeat(tuple(next(generator)), counts)
+ return data
- @staticmethod
- @lru_cache()
- def _random_return(col_to_permute, keys):
- """Transformer when exact is False and repetition is not required.
- """
+ return _rep_iter_return_impl
- if col_to_permute == 0:
+@lru_cache()
+def _random_return(col_to_permute, keys):
+ """Transformer when exact is False and repetition is not required.
+ """
- @jit(nopython=True)
- def _random_return_impl(data):
- nb_fast_shuffle(data[:, col_to_permute])
- return data
+ if col_to_permute == 0:
- else:
+ @jit(nopython=True)
+ def _random_return_impl(data):
+ nb_fast_shuffle(data[:, col_to_permute])
+ return data
- @jit(nopython=True)
- def _random_return_impl(data):
- nb_strat_shuffle(data[:, col_to_permute], keys)
- return data
+ else:
- return _random_return_impl
+ @jit(nopython=True)
+ def _random_return_impl(data):
+ nb_strat_shuffle(data[:, col_to_permute], keys)
+ return data
- @staticmethod
- @lru_cache()
- def _random_repeat_return(col_to_permute, col_values, keys, counts):
- """Transformer when exact is False and repetition is required.
- """
- col_values = np.array(col_values)
- counts = np.array(counts)
- if col_to_permute == 0:
+ return _random_return_impl
- @jit(nopython=True)
- def _random_repeat_return_impl(data):
- shuffled_col_values = col_values.copy()
- nb_fast_shuffle(shuffled_col_values)
- data[:, col_to_permute] = np.repeat(shuffled_col_values, counts)
- return data
+@lru_cache()
+def _random_repeat_return(col_to_permute, col_values, keys, counts):
+ """Transformer when exact is False and repetition is required.
+ """
+ col_values = np.array(col_values)
+ counts = np.array(counts)
+ if col_to_permute == 0:
+
+ @jit(nopython=True)
+ def _random_repeat_return_impl(data):
+ shuffled_col_values = col_values.copy()
+ nb_fast_shuffle(shuffled_col_values)
+ data[:, col_to_permute] = np.repeat(shuffled_col_values, counts)
+ return data
- else:
+ else:
- @jit(nopython=True)
- def _random_repeat_return_impl(data):
- shuffled_col_values = col_values.copy()
- nb_strat_shuffle(shuffled_col_values, keys)
- data[:, col_to_permute] = np.repeat(shuffled_col_values, counts)
- return data
+ @jit(nopython=True)
+ def _random_repeat_return_impl(data):
+ shuffled_col_values = col_values.copy()
+ nb_strat_shuffle(shuffled_col_values, keys)
+ data[:, col_to_permute] = np.repeat(shuffled_col_values, counts)
+ return data
- return _random_repeat_return_impl
+ return _random_repeat_return_impl
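
With the factory in place, Bootstrapper.fit() replaces the instance's transform with a cached, jit-compiled implementation. A minimal round-trip sketch, assuming the constructor accepts the kind argument that fit() reads from self.kind:

import numpy as np
from hierarch.resampling import Bootstrapper

data = np.array([[0, 1, 1.2], [0, 2, 1.4],
                 [1, 1, 2.1], [1, 2, 2.3]])

boot = Bootstrapper(kind="weights")        # or "indexes" / "bayesian"
boot.fit(data)                             # builds the jitted transform via the factory
resampled = boot.transform(data, start=1)  # resample from column 1 onward
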
diff --git a/hierarch/stats.py b/hierarch/stats.py
index 858b9b1..38613a7 100644
--- a/hierarch/stats.py
+++ b/hierarch/stats.py
@@ -240,7 +240,7 @@ def _grabber(X, y, treatment_labels):
def hypothesis_test(
data_array,
- treatment_col: int,
+ treatment_col,
compare="corr",
alternative="two-sided",
skip=None,
@@ -261,9 +261,10 @@ def hypothesis_test(
Array-like containing both the independent and dependent variables to
be analyzed. It's assumed that the final (rightmost) column
contains the dependent variable values.
- treatment_col : int
+ treatment_col : int or str
The index number of the column containing "two samples" to be compared.
- Indexing starts at 0.
+ Indexing starts at 0. If input data is a pandas DataFrame, this can be
+ the name of the column.
compare : str, optional
The test statistic to use to perform the hypothesis test, by default "corr"
which automatically calls the studentized covariance test statistic.
@@ -354,6 +355,8 @@ def hypothesis_test(
# turns the input array or dataframe into a float64 array
if isinstance(data_array, (np.ndarray, pd.DataFrame)):
+ if isinstance(data_array, pd.DataFrame) and isinstance(treatment_col, str):
+ treatment_col = int(data_array.columns.get_loc(treatment_col))
data = _preprocess_data(data_array)
else:
raise TypeError("Input data must be ndarray or DataFrame.")
@@ -490,7 +493,7 @@ def hypothesis_test(
def multi_sample_test(
data_array,
- treatment_col: int,
+ treatment_col,
hypotheses="all",
correction="fdr",
compare="means",
@@ -511,9 +514,10 @@ def multi_sample_test(
Array-like containing both the independent and dependent variables to
be analyzed. It's assumed that the final (rightmost) column
contains the dependent variable values.
- treatment_col : int
+ treatment_col : int or str
The index number of the column containing labels to be compared.
- Indexing starts at 0.
+ Indexing starts at 0. If input data is a pandas DataFrame, this can
+ be the column name.
hypotheses : list of two-element lists or "all", optional
Hypotheses to be tested. If 'all' every pairwise comparison will be
tested. Can be passed a list of lists to restrict comparisons, which
@@ -667,6 +671,8 @@ def multi_sample_test(
# coerce data into an object array
if isinstance(data_array, pd.DataFrame):
+ if isinstance(treatment_col, str):
+ treatment_col = data_array.columns.get_loc(treatment_col)
data = data_array.to_numpy()
elif isinstance(data_array, np.ndarray):
data = data_array
@@ -870,9 +876,10 @@ def confidence_interval(
Array-like containing both the independent and dependent variables to
be analyzed. It's assumed that the final (rightmost) column
contains the dependent variable values.
- treatment_col : int
+ treatment_col : int or str
The index number of the column containing "two samples" to be compared.
- Indexing starts at 0.
+ Indexing starts at 0. If input data is a pandas DataFrame, this can be
+ the column name.
interval : float, optional
Percentage value indicating the confidence interval's coverage, by default 95
iterations : int, optional
@@ -980,6 +987,8 @@ def confidence_interval(
# turns the input array or dataframe into a float64 array
if isinstance(data_array, (np.ndarray, pd.DataFrame)):
+ if isinstance(data_array, pd.DataFrame) and isinstance(treatment_col, str):
+ treatment_col = int(data_array.columns.get_loc(treatment_col))
data = _preprocess_data(data_array)
else:
raise TypeError("Input data must be ndarray or DataFrame.")
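
With the column-name handling above, DataFrame users no longer need the treatment column's integer position. A brief sketch with hypothetical column names:

import pandas as pd
from hierarch.stats import hypothesis_test

df = pd.DataFrame(
    {
        "treatment": [0, 0, 0, 0, 1, 1, 1, 1],
        "replicate": [1, 1, 2, 2, 1, 1, 2, 2],
        "signal": [1.2, 1.4, 1.1, 1.5, 2.1, 2.0, 2.4, 2.2],
    }
)

# equivalent to treatment_col=0 for this column order
p_value = hypothesis_test(df, treatment_col="treatment",
                          bootstraps=1000, permutations='all')
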
diff --git a/requirements.txt b/requirements.txt
index 4e0f517..15620c8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
numba==0.53.1
-numpy==1.20.2
-pandas==1.2.4
-scipy==1.6.2
+numpy==1.21.0
+pandas==1.3.0
+scipy==1.7.0
diff --git a/setup.py b/setup.py
index b5fe852..a6a60dd 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@
setuptools.setup(
name="hierarch",
- version="1.1.2",
+ version="1.1.3",
author="Rishi Kulkarni",
author_email="rkulk@stanford.edu",
description="Hierarchical hypothesis testing library",