From 8d09e38627ffef4b9f5b6140d2eb7e0b6a35e845 Mon Sep 17 00:00:00 2001 From: lisazeyen <35347358+lisazeyen@users.noreply.github.com> Date: Mon, 21 Aug 2023 16:16:53 +0200 Subject: [PATCH 1/3] drop duplicates in prepare_hotmaps_database --- scripts/build_industrial_distribution_key.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/scripts/build_industrial_distribution_key.py b/scripts/build_industrial_distribution_key.py index 979a14935..fe7f5d822 100644 --- a/scripts/build_industrial_distribution_key.py +++ b/scripts/build_industrial_distribution_key.py @@ -93,6 +93,20 @@ def prepare_hotmaps_database(regions): gdf.rename(columns={"index_right": "bus"}, inplace=True) gdf["country"] = gdf.bus.str[:2] + # the .sjoin can lead to duplicates if a geom is in two regions + if gdf.index.duplicated().any(): + import pycountry + # get all duplicated entries + duplicated_i = gdf.index[gdf.index.duplicated()] + # convert from raw data country name to iso-2-code + s = df.loc[duplicated_i, "Country"].apply(lambda x: pycountry.countries.lookup(x).alpha_2) + # Get a boolean mask where gdf's country column matches s's values for the same index + mask = gdf['country'] == gdf.index.map(s) + # Filter gdf using the mask + gdf_filtered = gdf[mask] + # concat not duplicated and filtered gdf + gdf = pd.concat([gdf.drop(duplicated_i), gdf_filtered]).sort_index() + return gdf From dadc372ecd629cdeb708727e5abce3b7179eb307 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 21 Aug 2023 14:17:19 +0000 Subject: [PATCH 2/3] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- scripts/build_industrial_distribution_key.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/scripts/build_industrial_distribution_key.py b/scripts/build_industrial_distribution_key.py index fe7f5d822..3c62ca1b3 100644 --- a/scripts/build_industrial_distribution_key.py +++ b/scripts/build_industrial_distribution_key.py @@ -96,12 +96,15 @@ def prepare_hotmaps_database(regions): # the .sjoin can lead to duplicates if a geom is in two regions if gdf.index.duplicated().any(): import pycountry + # get all duplicated entries duplicated_i = gdf.index[gdf.index.duplicated()] # convert from raw data country name to iso-2-code - s = df.loc[duplicated_i, "Country"].apply(lambda x: pycountry.countries.lookup(x).alpha_2) + s = df.loc[duplicated_i, "Country"].apply( + lambda x: pycountry.countries.lookup(x).alpha_2 + ) # Get a boolean mask where gdf's country column matches s's values for the same index - mask = gdf['country'] == gdf.index.map(s) + mask = gdf["country"] == gdf.index.map(s) # Filter gdf using the mask gdf_filtered = gdf[mask] # concat not duplicated and filtered gdf From f4d821ad2ad5709910988d05b2ff1867bc39dee3 Mon Sep 17 00:00:00 2001 From: Fabian Neumann Date: Tue, 22 Aug 2023 14:22:25 +0200 Subject: [PATCH 3/3] use country_converter instead of pycountry --- scripts/build_industrial_distribution_key.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/scripts/build_industrial_distribution_key.py b/scripts/build_industrial_distribution_key.py index 3c62ca1b3..25d0235a0 100644 --- a/scripts/build_industrial_distribution_key.py +++ b/scripts/build_industrial_distribution_key.py @@ -93,22 +93,16 @@ def prepare_hotmaps_database(regions): gdf.rename(columns={"index_right": "bus"}, inplace=True) gdf["country"] = gdf.bus.str[:2] - # the .sjoin can lead to duplicates if a geom is in two regions + # the .sjoin can lead to duplicates if a geom is in two overlapping regions if gdf.index.duplicated().any(): - import pycountry - # get all duplicated entries duplicated_i = gdf.index[gdf.index.duplicated()] # convert from raw data country name to iso-2-code - s = df.loc[duplicated_i, "Country"].apply( - lambda x: pycountry.countries.lookup(x).alpha_2 - ) - # Get a boolean mask where gdf's country column matches s's values for the same index - mask = gdf["country"] == gdf.index.map(s) - # Filter gdf using the mask - gdf_filtered = gdf[mask] + code = cc.convert(gdf.loc[duplicated_i, "Country"], to="iso2") + # screen out malformed country allocation + gdf_filtered = gdf.loc[duplicated_i].query("country == @code") # concat not duplicated and filtered gdf - gdf = pd.concat([gdf.drop(duplicated_i), gdf_filtered]).sort_index() + gdf = pd.concat([gdf.drop(duplicated_i), gdf_filtered]) return gdf