diff --git a/README.md b/README.md index 45daed7..2c7c03c 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ ## A Hierarchical Resampling Package for Python -Version 1.1.2 +Version 1.1.3 hierarch is a package for hierarchical resampling (bootstrapping, permutation) of datasets in Python. Because for loops are ultimately intrinsic to cluster-aware resampling, hierarch uses Numba to accelerate many of its key functions. @@ -13,6 +13,7 @@ hierarch has several functions to assist in performing resampling-based (and the 1. [Introduction](#introduction) 2. [Setup](#setup) 3. [Documentation](#documentation) +4. [Citation](#citation) @@ -50,4 +51,12 @@ Alternatively, you can install from Anaconda. ## Documentation -Check out our user guide at [readthedocs](https://hierarch.readthedocs.io/). \ No newline at end of file +Check out our user guide at [readthedocs](https://hierarch.readthedocs.io/). + + +## Citation +If hierarch helps you analyze your data, please consider citing it. The manuscript also +contains a set of simulations validating hierarchical randomization tests in a variety of +conditions. + +Analyzing Nested Experimental Designs – A User-Friendly Resampling Method to Determine Experimental Significance. Rishikesh U. Kulkarni, Catherine L. Wang, Carolyn R. Bertozzi. bioRxiv 2021.06.29.450439; doi: https://doi.org/10.1101/2021.06.29.450439 \ No newline at end of file diff --git a/docs/requirements.txt b/docs/requirements.txt index 10c98e6..001f7d7 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,7 +1,7 @@ -sphinx==4.0.2 +sphinx==4.0.3 sphinx_rtd_theme==0.5.2 numpydoc==1.1.0 numba==0.53.1 -numpy==1.20.3 -pandas==1.2.4 -scipy==1.6.3 +numpy==1.21.0 +pandas==1.3.0 +scipy==1.7.0 diff --git a/docs/user/overview.rst b/docs/user/overview.rst index 7a38038..91f1217 100644 --- a/docs/user/overview.rst +++ b/docs/user/overview.rst @@ -54,4 +54,10 @@ The code to perform a hierarchical permutation t-test on this dataset looks like from hierarch.stats import hypothesis_test hypothesis_test(data, treatment_col=0, - bootstraps=1000, permutations='all') \ No newline at end of file + bootstraps=1000, permutations='all') + +If you find hierarch useful for analyzing your data, please consider citing it. + +Analyzing Nested Experimental Designs – A User-Friendly Resampling Method to Determine Experimental Significance +Rishikesh U. Kulkarni, Catherine L. Wang, Carolyn R. Bertozzi +bioRxiv 2021.06.29.450439; doi: https://doi.org/10.1101/2021.06.29.450439 \ No newline at end of file diff --git a/hierarch/internal_functions.py b/hierarch/internal_functions.py index 1ca6028..8a7c2a5 100644 --- a/hierarch/internal_functions.py +++ b/hierarch/internal_functions.py @@ -173,7 +173,7 @@ def bounded_uint(ub): Returns ------- int - """ + """ x = np.random.randint(low=2 ** 32) m = ub * x l = np.uint32(m) @@ -193,7 +193,7 @@ def bounded_uint(ub): @nb.jit(nopython=True, cache=True) def nb_fast_shuffle(arr): """Reimplementation of Fisher-Yates shuffle using bounded_uint to generate random numbers. - """ + """ i = arr.shape[0] - 1 while i > 0: j = bounded_uint(i + 1) @@ -211,7 +211,7 @@ def nb_strat_shuffle(arr, stratification): Target array. stratification : 1D array-like Ranges to shuffle within. Must be sorted. - """ + """ for v, w in zip(stratification[:-1], stratification[1:]): i = w - v - 1 while i > 0: @@ -248,137 +248,6 @@ def id_cluster_counts(design): return cluster_dict -@nb.jit(nopython=True, cache=True) -def nb_reweighter( - data, columns_to_resample, clusternum_dict, start: int, shape: int, indexes=True -): - """Internal function for bootstrapping a design matrix with integer - weights. - - Parameters - ---------- - data : 2D array - Target data to be bootstrapped. - columns_to_resample : 1D bool array-like - False for columns to be skipped in the resampling plan. - clusternum_dict : TypedDict - Hierarchy dictionary produced by id_cluster_counts - start : int - First column of the data matrix to resample - shape : int - Last column of the data matrix to resample - indexes : bool, optional - If True, returns a reindexed array. If False, returns - a reweighted array, by default True. - - Returns - ------- - 2D array - Nonparametrically bootstrapped sample from the input data array. - """ - - out = data.astype(np.float64) - # at the start, everything is weighted equally - weights = np.array([1 for i in clusternum_dict[start]]) - - for key in range(start, shape): - # fetch design matrix info for current column - to_do = clusternum_dict[key] - # preallocate the full array for new_weight - new_weight = np.empty(to_do.sum(), np.int64) - place = 0 - - # if not resampling this column, new_weight is all 1 - if not columns_to_resample[key]: - for idx, v in enumerate(to_do): - num = np.array([1 for m in range(v.item())]) - # carry over resampled weights from previous columns - num *= weights[idx] - for idx_2, w in enumerate(num): - new_weight[place + idx_2] = w.item() - place += v - - # else do a multinomial experiment to generate new_weight - else: - for idx, v in enumerate(to_do): - num = v.item() - # num*weights[idx] carries over weights from previous columns - randos = np.random.multinomial(num * weights[idx], [1 / num] * num) - for idx_2, w in enumerate(randos): - new_weight[place + idx_2] = w.item() - place += v - - weights = new_weight - - if indexes is False: - out[:, -1] = out[:, -1] * weights - return out - else: - indexes = weights_to_index(weights) - return out[indexes] - - -@nb.jit(nopython=True, cache=True) -def nb_reweighter_real(data, columns_to_resample, clusternum_dict, start, shape): - """Internal function for bootstrapping a design matrix with real number - weights. - - Parameters - ---------- - data : 2D array - Target data to be bootstrapped. - columns_to_resample : 1D bool array-like - False for columns to be skipped in the resampling plan. - clusternum_dict : TypedDict - Hierarchy dictionary produced by id_cluster_counts - start : int - First column of the data matrix to resample - shape : int - Last column of the data matrix to resample - - Returns - ------- - 2D array - Nonparametrically bootstrapped sample from the input data array. - """ - - out = data.astype(np.float64) - # at the start, everything is weighted equally - # dype is float64 because weights can be any real number - weights = np.array([1 for i in clusternum_dict[start]], dtype=np.float64) - - for key in range(start, shape): - # fetch design matrix info for current column - to_do = clusternum_dict[key] - # preallocate the full array for new_weight - new_weight = np.empty(to_do.sum(), np.float64) - place = 0 - - # if not resampling this column, new_weight is all 1 - if not columns_to_resample[key]: - for idx, v in enumerate(to_do): - num = np.array([1 for m in range(v.item())], dtype=np.float64) - num *= weights[idx] - for idx_2, w in enumerate(num): - new_weight[place + idx_2] = w.item() - place += v - - # else do a dirichlet experiment to generate new_weight - else: - for idx, v in enumerate(to_do): - num = [1 for a in range(v.item())] - # multiplying by weights[idx] carries over prior columns - randos = np.random.dirichlet(num, size=None) * weights[idx] * v.item() - for idx_2, w in enumerate(randos): - new_weight[place + idx_2] = w.item() - place += v - - weights = new_weight - - out[:, -1] = out[:, -1] * weights - return out - - @nb.jit(nopython=True, cache=True) def weights_to_index(weights): """Converts a 1D array of integer weights to indices. @@ -591,4 +460,3 @@ def class_make_ufunc_list(target, reference, counts): return ufunc_list - diff --git a/hierarch/resampling.py b/hierarch/resampling.py index d43fa43..6061467 100644 --- a/hierarch/resampling.py +++ b/hierarch/resampling.py @@ -3,8 +3,6 @@ from numba import jit from functools import lru_cache from hierarch.internal_functions import ( - nb_reweighter, - nb_reweighter_real, nb_unique, id_cluster_counts, msp, @@ -12,6 +10,7 @@ _repeat, nb_fast_shuffle, nb_strat_shuffle, + weights_to_index, ) @@ -269,13 +268,21 @@ def fit(self, data, skip=None, y=-1): else: skip = [] - self.cluster_dict = id_cluster_counts(data[:, :y]) + cluster_dict = id_cluster_counts(data[:, :y]) + cluster_dict = tuple(reversed(list(cluster_dict.values()))) + cluster_dict = tuple(map(tuple, cluster_dict)) y %= data.shape[1] - self.shape = y + shape = y - self.columns_to_resample = np.array([True for k in range(self.shape)]) + columns_to_resample = np.array([True for k in range(shape)]) for key in skip: - self.columns_to_resample[key] = False + columns_to_resample[key] = False + + kind = str(self.kind) + + self.transform = _bootstrapper_factory( + tuple(columns_to_resample), cluster_dict, shape, kind + ) def transform(self, data, start: int): """Generate a bootstrapped sample from target data. @@ -294,30 +301,140 @@ def transform(self, data, start: int): according to "kind" argument. """ - if self.kind == "weights": - resampled = nb_reweighter( - data, - self.columns_to_resample, - self.cluster_dict, - start, - self.shape, - indexes=False, - ) - elif self.kind == "indexes": - resampled = nb_reweighter( - data, - self.columns_to_resample, - self.cluster_dict, - start, - self.shape, - indexes=True, - ) + raise Exception("Use fit() before using transform().") - elif self.kind == "bayesian": - resampled = nb_reweighter_real( - data, self.columns_to_resample, self.cluster_dict, start, self.shape +@lru_cache() +def _bootstrapper_factory(columns_to_resample, clusternum_dict, shape, kind): + """Factory function that returns the appropriate transform(). + """ + clusternum_dict = tuple(map(np.array, clusternum_dict)) + columns_to_resample = np.array(columns_to_resample) + + if kind == "weights": + + @jit(nopython=True) + def _bootstrapper_impl(data, start): + out = data.astype(np.float64) + # at the start, everything is weighted equally + weights = np.array([1 for i in clusternum_dict[start]]) + + for key in range(start, shape): + # fetch design matrix info for current column + to_do = clusternum_dict[key] + # preallocate the full array for new_weight + new_weight = np.empty(to_do.sum(), np.int64) + place = 0 + + # if not resampling this column, new_weight is the prior column's weights + if not columns_to_resample[key]: + for idx, v in enumerate(to_do): + new_weight[place : place + v] = np.array( + [weights[idx] for m in range(v.item())] + ) + place += v + + # else do a multinomial experiment to generate new_weight + else: + for idx, v in enumerate(to_do): + # v*weights[idx] carries over weights from previous columns + new_weight[place : place + v] = np.random.multinomial( + v * weights[idx], [1 / v] * v + ) + place += v + + weights = new_weight + + out[:, -1] = out[:, -1] * weights + return out + + elif kind == "indexes": + + @jit(nopython=True) + def _bootstrapper_impl(data, start): + out = data.astype(np.float64) + # at the start, everything is weighted equally + weights = np.array([1 for i in clusternum_dict[start]]) + + for key in range(start, shape): + # fetch design matrix info for current column + to_do = clusternum_dict[key] + # preallocate the full array for new_weight + new_weight = np.empty(to_do.sum(), np.int64) + place = 0 + + # if not resampling this column, new_weight is the prior column's weights + if not columns_to_resample[key]: + for idx, v in enumerate(to_do): + new_weight[place : place + v] = np.array( + [weights[idx] for m in range(v.item())] + ) + place += v + + # else do a multinomial experiment to generate new_weight + else: + for idx, v in enumerate(to_do): + # v*weights[idx] carries over weights from previous columns + new_weight[place : place + v] = np.random.multinomial( + v * weights[idx], [1 / v] * v + ) + place += v + + weights = new_weight + + indexes = weights_to_index(weights) + return out[indexes] + + elif kind == "bayesian": + + @jit(nopython=True) + def _bootstrapper_impl(data, start): + out = data.astype(np.float64) + # at the start, everything is weighted equally + # dype is float64 because weights can be any real number + weights = np.array( + [1 for i in clusternum_dict[start]], dtype=np.float64 ) - return resampled + + for key in range(start, shape): + # fetch design matrix info for current column + to_do = clusternum_dict[key] + # preallocate the full array for new_weight + new_weight = np.empty(to_do.sum(), np.float64) + place = 0 + + # if not resampling this column, new_weight is all 1 + if not columns_to_resample[key]: + for idx, v in enumerate(to_do): + new_weight[place : place + v] = np.array( + [weights[idx] for m in range(v.item())], + dtype=np.float64, + ) + place += v + + # else do a dirichlet experiment to generate new_weight + else: + for idx, v in enumerate(to_do): + # multiplying by weights[idx] carries over prior columns + new_weight[place : place + v] = ( + np.random.dirichlet( + [1 for a in range(v.item())], size=None + ) + * weights[idx] + * v.item() + ) + place += v + + weights = new_weight + + out[:, -1] = out[:, -1] * weights + return out + + else: + raise KeyError( + "No such bootstrapping algorithm. kind must be 'weights' or 'indexes' or 'bayesian'" + ) + + return _bootstrapper_impl class Permuter: @@ -441,9 +558,9 @@ def fit(self, data, col_to_permute: int, exact=False): col_values = values[:, -2].copy() self.iterator = cycle(msp(col_values)) if len(col_values) == len(data): - self.transform = self._exact_return(col_to_permute, self.iterator) + self.transform = _exact_return(col_to_permute, self.iterator) else: - self.transform = self._exact_repeat_return( + self.transform = _exact_repeat_return( col_to_permute, self.iterator, counts ) @@ -458,13 +575,13 @@ def fit(self, data, col_to_permute: int, exact=False): keys = tuple(keys.tolist()) if indexes.size == len(data): - self.transform = self._random_return(col_to_permute, keys) + self.transform = _random_return(col_to_permute, keys) else: col_values = data[:, col_to_permute][indexes] col_values = tuple(col_values.tolist()) counts = tuple(counts.tolist()) - self.transform = self._random_repeat_return( + self.transform = _random_repeat_return( col_to_permute, col_values, keys, counts ) @@ -486,75 +603,71 @@ def transform(self, data): # four static methods defined below raise Exception("Use fit() before calling transform.") - @staticmethod - def _exact_return(col_to_permute, generator): - """Transformer when exact is True and permutations are unrestricted. - """ - - def _exact_return_impl(data): - data[:, col_to_permute] = next(generator) - return data +def _exact_return(col_to_permute, generator): + """Transformer when exact is True and permutations are unrestricted. + """ - return _exact_return_impl + def _exact_return_impl(data): + data[:, col_to_permute] = next(generator) + return data - @staticmethod - def _exact_repeat_return(col_to_permute, generator, counts): - """Transformer when exact is True and permutations are restricted by - repetition of treated entities. - """ + return _exact_return_impl - def _rep_iter_return_impl(data): - data[:, col_to_permute] = _repeat(tuple(next(generator)), counts) - return data +def _exact_repeat_return(col_to_permute, generator, counts): + """Transformer when exact is True and permutations are restricted by + repetition of treated entities. + """ - return _rep_iter_return_impl + def _rep_iter_return_impl(data): + data[:, col_to_permute] = _repeat(tuple(next(generator)), counts) + return data - @staticmethod - @lru_cache() - def _random_return(col_to_permute, keys): - """Transformer when exact is False and repetition is not required. - """ + return _rep_iter_return_impl - if col_to_permute == 0: +@lru_cache() +def _random_return(col_to_permute, keys): + """Transformer when exact is False and repetition is not required. + """ - @jit(nopython=True) - def _random_return_impl(data): - nb_fast_shuffle(data[:, col_to_permute]) - return data + if col_to_permute == 0: - else: + @jit(nopython=True) + def _random_return_impl(data): + nb_fast_shuffle(data[:, col_to_permute]) + return data - @jit(nopython=True) - def _random_return_impl(data): - nb_strat_shuffle(data[:, col_to_permute], keys) - return data + else: - return _random_return_impl + @jit(nopython=True) + def _random_return_impl(data): + nb_strat_shuffle(data[:, col_to_permute], keys) + return data - @staticmethod - @lru_cache() - def _random_repeat_return(col_to_permute, col_values, keys, counts): - """Transformer when exact is False and repetition is required. - """ - col_values = np.array(col_values) - counts = np.array(counts) - if col_to_permute == 0: + return _random_return_impl - @jit(nopython=True) - def _random_repeat_return_impl(data): - shuffled_col_values = col_values.copy() - nb_fast_shuffle(shuffled_col_values) - data[:, col_to_permute] = np.repeat(shuffled_col_values, counts) - return data +@lru_cache() +def _random_repeat_return(col_to_permute, col_values, keys, counts): + """Transformer when exact is False and repetition is required. + """ + col_values = np.array(col_values) + counts = np.array(counts) + if col_to_permute == 0: + + @jit(nopython=True) + def _random_repeat_return_impl(data): + shuffled_col_values = col_values.copy() + nb_fast_shuffle(shuffled_col_values) + data[:, col_to_permute] = np.repeat(shuffled_col_values, counts) + return data - else: + else: - @jit(nopython=True) - def _random_repeat_return_impl(data): - shuffled_col_values = col_values.copy() - nb_strat_shuffle(shuffled_col_values, keys) - data[:, col_to_permute] = np.repeat(shuffled_col_values, counts) - return data + @jit(nopython=True) + def _random_repeat_return_impl(data): + shuffled_col_values = col_values.copy() + nb_strat_shuffle(shuffled_col_values, keys) + data[:, col_to_permute] = np.repeat(shuffled_col_values, counts) + return data - return _random_repeat_return_impl + return _random_repeat_return_impl diff --git a/hierarch/stats.py b/hierarch/stats.py index 858b9b1..38613a7 100644 --- a/hierarch/stats.py +++ b/hierarch/stats.py @@ -240,7 +240,7 @@ def _grabber(X, y, treatment_labels): def hypothesis_test( data_array, - treatment_col: int, + treatment_col, compare="corr", alternative="two-sided", skip=None, @@ -261,9 +261,10 @@ def hypothesis_test( Array-like containing both the independent and dependent variables to be analyzed. It's assumed that the final (rightmost) column contains the dependent variable values. - treatment_col : int + treatment_col : int or str The index number of the column containing "two samples" to be compared. - Indexing starts at 0. + Indexing starts at 0. If input data is a pandas DataFrame, this can be + the name of the column. compare : str, optional The test statistic to use to perform the hypothesis test, by default "corr" which automatically calls the studentized covariance test statistic. @@ -354,6 +355,8 @@ def hypothesis_test( # turns the input array or dataframe into a float64 array if isinstance(data_array, (np.ndarray, pd.DataFrame)): + if isinstance(data_array, pd.DataFrame) and isinstance(treatment_col, str): + treatment_col = int(data_array.columns.get_loc(treatment_col)) data = _preprocess_data(data_array) else: raise TypeError("Input data must be ndarray or DataFrame.") @@ -490,7 +493,7 @@ def hypothesis_test( def multi_sample_test( data_array, - treatment_col: int, + treatment_col, hypotheses="all", correction="fdr", compare="means", @@ -511,9 +514,10 @@ def multi_sample_test( Array-like containing both the independent and dependent variables to be analyzed. It's assumed that the final (rightmost) column contains the dependent variable values. - treatment_col : int + treatment_col : int or str The index number of the column containing labels to be compared. - Indexing starts at 0. + Indexing starts at 0. If input data is a pandas DataFrame, this can + be the column name. hypotheses : list of two-element lists or "all", optional Hypotheses to be tested. If 'all' every pairwise comparison will be tested. Can be passed a list of lists to restrict comparisons, which @@ -667,6 +671,8 @@ def multi_sample_test( # coerce data into an object array if isinstance(data_array, pd.DataFrame): + if isinstance(treatment_col, str): + treatment_col = data_array.columns.get_loc(treatment_col) data = data_array.to_numpy() elif isinstance(data_array, np.ndarray): data = data_array @@ -870,9 +876,10 @@ def confidence_interval( Array-like containing both the independent and dependent variables to be analyzed. It's assumed that the final (rightmost) column contains the dependent variable values. - treatment_col : int + treatment_col : int or str The index number of the column containing "two samples" to be compared. - Indexing starts at 0. + Indexing starts at 0. If input data is a pandas DataFrame, this can be + the column name. interval : float, optional Percentage value indicating the confidence interval's coverage, by default 95 iterations : int, optional @@ -980,6 +987,8 @@ def confidence_interval( # turns the input array or dataframe into a float64 array if isinstance(data_array, (np.ndarray, pd.DataFrame)): + if isinstance(data_array, pd.DataFrame) and isinstance(treatment_col, str): + treatment_col = int(data_array.columns.get_loc(treatment_col)) data = _preprocess_data(data_array) else: raise TypeError("Input data must be ndarray or DataFrame.") diff --git a/requirements.txt b/requirements.txt index 4e0f517..15620c8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ numba==0.53.1 -numpy==1.20.2 -pandas==1.2.4 -scipy==1.6.2 +numpy==1.21.0 +pandas==1.3.0 +scipy==1.7.0 diff --git a/setup.py b/setup.py index b5fe852..a6a60dd 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setuptools.setup( name="hierarch", - version="1.1.2", + version="1.1.3", author="Rishi Kulkarni", author_email="rkulk@stanford.edu", description="Hierarchical hypothesis testing library",