From 1863d0ba201f1339a43d4bde2d43312b5f829e27 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Fri, 20 Dec 2024 00:24:53 +0100 Subject: [PATCH] wip --- .../2024-12-17/efr_malani_jacob.meta.yml | 4 +- .../demography/2024-12-17/efr_malani_jacob.py | 91 ++++++++++++------- 2 files changed, 58 insertions(+), 37 deletions(-) diff --git a/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.meta.yml b/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.meta.yml index 88626f997e6..62525770764 100644 --- a/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.meta.yml +++ b/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.meta.yml @@ -17,7 +17,7 @@ tables: efr_repr: title: Reproductive Effective Fertility rate (scaled by sex ratio), using UN data description_short: |- - The number of daughters that live long enough to reproduce, between ages 15 and 49. This focuses on daughters, not all children because only females reproduce. Because a child need not live until age 49 to reproduce, we approximate efr_r by taking the average of efr over all reproductive ages (15-49). + The number of children who live long enough to reproduce, per woman. This number is dependent on the survival of daughters to childbearing age (between 15 and 49 years old). unit: "children per women" description_processing: |- For a given cohort year, we estimate the cumulative survival probability for a person to reach each age from 0 to 49. For example, the probability of a person born in 2000 reaching age 15, 16, 17, and so on up to 49. @@ -37,7 +37,7 @@ tables: efr_labor: title: Labor Effective Fertility rate, using UN data description_short: |- - The number of children born in a year who will live long enough to earn labor income. This is approximated this by taking the average of Effective Fertility rate (EFR) over all working ages (15-65). + The number of children who live long enough to earn labor income, per woman. 
This number is dependent on the survival of children to working age (between 15 and 65 years old). unit: "children per women" description_processing: |- For a given cohort year, we estimate the cumulative survival probability for a person to reach each age age from 0 to 65. E.g. the probability of a person born in 2000 to reach age 15, 16, 17, ..., 65. diff --git a/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.py b/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.py index 86a2c896c85..c40939754f4 100644 --- a/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.py +++ b/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.py @@ -1,5 +1,6 @@ """Load a meadow dataset and create a garden dataset.""" +from owid.catalog import Origin from owid.catalog import processing as pr from etl.helpers import PathFinder, create_dataset @@ -16,6 +17,14 @@ AGE_REPR_END = 49 AGE_LAB_END = 65 +# Additional origin metadata of the paper +origin = Origin( + producer="Malani and Jacob", + title="A New Measure of Surviving Children that Sheds Light on Long-term Trends in Fertility", + citation_full="Malani, A., & Jacob, A. (2024). A New Measure of Surviving Children that Sheds Light on Long-term Trends in Fertility. 
https://doi.org/10.3386/w33175", + date_published="2024-11-01", # type: ignore +) + def run(dest_dir: str) -> None: # @@ -39,45 +48,16 @@ def run(dest_dir: str) -> None: tb_proj=tb_un_proj, ) - # Filter TFR table - tb_tfr = tb_tfr.loc[ - (tb_tfr["sex"] == "all") & (tb_tfr["age"] == "all") & (tb_tfr["variant"].isin(["estimates", "medium"])), - ["country", "year", "fertility_rate"], - ] - - # Add TFR - tb_un = tb_un.merge(tb_tfr, on=["country", "year"], validate="m:1") - - # Estimate EFR - tb_un["efr"] = tb_un["fertility_rate"] * tb_un["cumulative_survival"] - - # Estimate metrics - ## EFR-labor: Average number of daughters that make it to the reproductive age (15-49) - ## EFR-reproductive: Average number of kids that make it to the labour age (15-65) - ## Cum survival prob, labor: Probability of a girl to survive to the reproductive age (15-49) - ## Cum survival prob, reproductive: Probability of a kid to survive to the labor age (15-65) - tb_un = tb_un.loc[(tb_un["age"] <= AGE_REPR_END) | (tb_un["sex"] == "total")] - tb_un = tb_un.groupby(["country", "year", "sex"], as_index=False)[["efr", "cumulative_survival"]].mean() - - # Pivot - tb_un = tb_un.pivot(index=["country", "year"], columns=["sex"], values=["efr", "cumulative_survival"]).reset_index() - - def rename_col(colname): - mapping = { - "female": "repr", - "total": "labor", - } - - if colname[1] == "": - return colname[0] - else: - return f"{colname[0]}_{mapping.get(colname[1])}" - - tb_un.columns = [rename_col(col) for col in tb_un.columns] + # Add EFR + tb_un = estimate_un_efr(tb_un, tb_tfr) # Format tb_un = tb_un.format(["country", "year"], short_name="un") + # Add extra origin + tb_un.efr_repr.metadata.origins = [origin] + tb_un.efr_repr.metadata.origins + + # Build list of tables tables = [ tb_un, ] @@ -131,3 +111,44 @@ def estimate_un_cum_survival(tb, tb_proj): # tb = tb.drop(columns=["year_born"]) return tb + + +def estimate_un_efr(tb_un, tb_tfr): + # Filter TFR table + tb_tfr = tb_tfr.loc[ + 
(tb_tfr["sex"] == "all") & (tb_tfr["age"] == "all") & (tb_tfr["variant"].isin(["estimates", "medium"])), + ["country", "year", "fertility_rate"], + ] + + # Add TFR + tb_un = tb_un.merge(tb_tfr, on=["country", "year"], validate="m:1") + + # Estimate EFR + tb_un["efr"] = tb_un["fertility_rate"] * tb_un["cumulative_survival"] + + # Estimate metrics + ## EFR-reproductive: Average number of daughters that make it to the reproductive age (15-49) + ## EFR-labor: Average number of kids that make it to the labour age (15-65) + ## Cum survival prob, reproductive: Probability of a girl to survive to the reproductive age (15-49) + ## Cum survival prob, labor: Probability of a kid to survive to the labor age (15-65) + tb_un = tb_un.loc[(tb_un["age"] <= AGE_REPR_END) | (tb_un["sex"] == "total")] + tb_un = tb_un.groupby(["country", "year", "sex"], as_index=False)[["efr", "cumulative_survival"]].mean() + + # Pivot + tb_un = tb_un.pivot(index=["country", "year"], columns=["sex"], values=["efr", "cumulative_survival"]).reset_index() + + # Rename columns + def rename_col(colname): + mapping = { + "female": "repr", + "total": "labor", + } + + if colname[1] == "": + return colname[0] + else: + return f"{colname[0]}_{mapping.get(colname[1])}" + + tb_un.columns = [rename_col(col) for col in tb_un.columns] + + return tb_un