From c5b79fa498062e06c8fae87b3b0d98d3740e8883 Mon Sep 17 00:00:00 2001 From: Martin Fleischmann Date: Mon, 28 Mar 2022 19:27:42 +0100 Subject: [PATCH] REF: switch to CSR matrix in area binning --- tobler/area_weighted/area_interpolate.py | 48 ++++++++--------- tobler/tests/test_area_interpolators.py | 66 ++++++++++++++++++++++++ 2 files changed, 91 insertions(+), 23 deletions(-) diff --git a/tobler/area_weighted/area_interpolate.py b/tobler/area_weighted/area_interpolate.py index ed3e057..ff5f538 100644 --- a/tobler/area_weighted/area_interpolate.py +++ b/tobler/area_weighted/area_interpolate.py @@ -76,7 +76,7 @@ def _area_tables_binning_parallel(source_df, target_df, n_jobs=-1): Returns ------- - tables : scipy.sparse.dok_matrix + tables : scipy.sparse.csr_matrix """ from joblib import Parallel, delayed, parallel_backend @@ -125,7 +125,7 @@ def _area_tables_binning_parallel(source_df, target_df, n_jobs=-1): ) areas = np.concatenate(worker_out) - # Build DOK table + # Build CSR table table = coo_matrix( ( areas, @@ -134,7 +134,7 @@ def _area_tables_binning_parallel(source_df, target_df, n_jobs=-1): shape=(df1.shape[0], df2.shape[0]), dtype=np.float32, ) - table = table.todok() + table = table.tocsr() return table @@ -161,7 +161,7 @@ def _area_tables_binning(source_df, target_df, spatial_index): Returns ------- - tables : scipy.sparse.dok_matrix + tables : scipy.sparse.csr_matrix """ if _check_crs(source_df, target_df): @@ -199,7 +199,7 @@ def _area_tables_binning(source_df, target_df, spatial_index): dtype=np.float32, ) - table = table.todok() + table = table.tocsr() return table @@ -226,7 +226,7 @@ def _area_interpolate_binning( [Optional. Default=None] Columns in dataframes for extensive variables intensive_variables : list [Optional. Default=None] Columns in dataframes for intensive variables - table : scipy.sparse.dok_matrix + table : scipy.sparse.csr_matrix [Optional. Default=None] Area allocation source-target correspondence table. If not provided, it will be built from `source_df` and `target_df` using `tobler.area_interpolate._area_tables_binning` @@ -305,19 +305,20 @@ def _area_interpolate_binning( else: table = _area_tables_binning_parallel(source_df, target_df, n_jobs=n_jobs) - den = source_df.area.values - if allocate_total: - den = np.asarray(table.sum(axis=1)) - den = den + (den == 0) - den = 1.0 / den - n = den.shape[0] - den = den.reshape((n,)) - den = diags([den], [0]) - weights = den.dot(table) # row standardize table - dfs = [] extensive = [] if extensive_variables: + + den = source_df.area.values + if allocate_total: + den = np.asarray(table.sum(axis=1)) + den = den + (den == 0) + den = 1.0 / den + n = den.shape[0] + den = den.reshape((n,)) + den = diags([den], [0]) + weights = den.dot(table) # row standardize table + for variable in extensive_variables: vals = _nan_check(source_df, variable) vals = _inf_check(source_df, variable) @@ -329,15 +330,16 @@ def _area_interpolate_binning( extensive = np.array(extensive) extensive = pd.DataFrame(extensive.T, columns=extensive_variables) - area = np.asarray(table.sum(axis=0)) - den = 1.0 / (area + (area == 0)) - n, k = den.shape - den = den.reshape((k,)) - den = diags([den], [0]) - weights = table.dot(den) - intensive = [] if intensive_variables: + + area = np.asarray(table.sum(axis=0)) + den = 1.0 / (area + (area == 0)) + n, k = den.shape + den = den.reshape((k,)) + den = diags([den], [0]) + weights = table.dot(den) + for variable in intensive_variables: vals = _nan_check(source_df, variable) vals = _inf_check(source_df, variable) diff --git a/tobler/tests/test_area_interpolators.py b/tobler/tests/test_area_interpolators.py index b80a3d7..27cd829 100644 --- a/tobler/tests/test_area_interpolators.py +++ b/tobler/tests/test_area_interpolators.py @@ -40,6 +40,45 @@ def test_area_interpolate_singlecore(): assert_almost_equal(area.animal_capybara.sum(), 20, decimal=0) +def test_area_interpolate_extensive(): + sac1, sac2 = datasets() + area = area_interpolate( + source_df=sac1, + target_df=sac2, + extensive_variables=["TOT_POP"], + n_jobs=1, + ) + assert_almost_equal(area.TOT_POP.sum(), 1796856, decimal=0) + + +def test_area_interpolate_intensive(): + sac1, sac2 = datasets() + area = area_interpolate( + source_df=sac1, + target_df=sac2, + intensive_variables=["pct_poverty"], + n_jobs=1, + ) + assert_almost_equal(area.pct_poverty.sum(), 2140, decimal=0) + + +def test_area_interpolate_categorical(): + sac1, sac2 = datasets() + area = area_interpolate( + source_df=sac1, + target_df=sac2, + extensive_variables=["TOT_POP"], + intensive_variables=["pct_poverty"], + categorical_variables=["animal"], + n_jobs=1, + ) + assert_almost_equal(area.animal_cat.sum(), 32, decimal=0) + assert_almost_equal(area.animal_dog.sum(), 19, decimal=0) + assert_almost_equal(area.animal_donkey.sum(), 22, decimal=0) + assert_almost_equal(area.animal_wombat.sum(), 23, decimal=0) + assert_almost_equal(area.animal_capybara.sum(), 20, decimal=0) + + def test_area_interpolate_custom_index(): sac1, sac2 = datasets() sac1.index = sac1.index * 2 @@ -128,3 +167,30 @@ def test_area_tables_binning(): assert auto.mean() == pytest.approx(2.7552649e-05) assert (auto[5][0].toarray() > 0).sum() == 7 + + +def test_passed_table(): + sac1, sac2 = datasets() + csr = _area_tables_binning(source_df=sac1, target_df=sac2, spatial_index="auto") + + area = area_interpolate( + source_df=sac1, + target_df=sac2, + extensive_variables=["TOT_POP"], + intensive_variables=["pct_poverty"], + table=csr, + ) + assert_almost_equal(area.TOT_POP.sum(), 1796856, decimal=0) + assert_almost_equal(area.pct_poverty.sum(), 2140, decimal=0) + + dok = csr.todok() + + area = area_interpolate( + source_df=sac1, + target_df=sac2, + extensive_variables=["TOT_POP"], + intensive_variables=["pct_poverty"], + table=dok, + ) + assert_almost_equal(area.TOT_POP.sum(), 1796856, decimal=0) + assert_almost_equal(area.pct_poverty.sum(), 2140, decimal=0) \ No newline at end of file