From fae53b314f3dcc67b4821db227b2a9130646b653 Mon Sep 17 00:00:00 2001
From: lucasrodes <lucasrodes@users.noreply.github.com>
Date: Thu, 19 Dec 2024 23:46:30 +0100
Subject: [PATCH] wip

---
 .../efr_malani_jacob.countries.json           |   2 -
 .../efr_malani_jacob.excluded_countries.json  |   2 -
 .../2024-12-17/efr_malani_jacob.meta.yml      | 104 ++++++++------
 .../demography/2024-12-17/efr_malani_jacob.py | 133 ++++++++++++------
 .../demography/2024-12-17/efr_malani_jacob.py |   4 +-
 5 files changed, 155 insertions(+), 90 deletions(-)
 delete mode 100644 etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.countries.json
 delete mode 100644 etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.excluded_countries.json

diff --git a/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.countries.json b/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.countries.json
deleted file mode 100644
index 2c63c085104..00000000000
--- a/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.countries.json
+++ /dev/null
@@ -1,2 +0,0 @@
-{
-}
diff --git a/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.excluded_countries.json b/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.excluded_countries.json
deleted file mode 100644
index 0d4f101c7a3..00000000000
--- a/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.excluded_countries.json
+++ /dev/null
@@ -1,2 +0,0 @@
-[
-]
diff --git a/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.meta.yml b/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.meta.yml
index 9ec97ed28e5..6b2b8dc953d 100644
--- a/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.meta.yml
+++ b/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.meta.yml
@@ -1,58 +1,76 @@
 # NOTE: To learn more about the fields, hover over their names.
 definitions:
   common:
+    description_key: []
     presentation:
+      grapher_config: none
       topic_tags:
         - Fertility Rate
 
-
 # Learn more about the available fields:
 # http://docs.owid.io/projects/etl/architecture/metadata/reference/
 dataset:
   update_period_days: 365
 
-
 tables:
-  efr_malani_jacob:
+  un:
     variables:
-      # testing_variable:
-      #   title: Testing variable title
-      #   unit: arbitrary units
-      #   short_unit: au
-      #   description_short: Short description of testing variable.
-      #   description_processing: Description of processing of testing variable.
-      #   description_key: List of key points about the indicator.
-      #   description_from_producer: Description of testing variable from producer.
-      #   processing_level: minor
-      #   type:
-      #   sort:
-      #   presentation:
-      #     attribution:
-      #     attribution_short:
-      #     faqs:
-      #     grapher_config:
-      #     title_public:
-      #     title_variant:
-      #     topic_tags:
-      #   display:
-      #     name: Testing variable
-      #     numDecimalPlaces: 0
-      #     tolerance: 0
-      #     color:
-      #     conversionFactor: 1
-      #     description:
-      #     entityAnnotationsMap: Test annotation
-      #     includeInTable:
-      #     isProjection: false
-      #     unit: arbitrary units
-      #     shortUnit: au
-      #     tableDisplay:
-      #       hideAbsoluteChange:
-      #       hideRelativeChange:
-      #     yearIsDay: false
-      #     zeroDay:
-      #     roundingMode:
-      #     numSignificantFigures:
-      #
-      {}
+      efr_repr:
+        title: Reproductive Effective Fertility rate (scaled by sex ratio)
+        description_short: |-
+          The number of daughters that live long enough to reproduce, between ages 15 and 49. This focuses on daughters rather than all children, because only females reproduce. Because a child need not live until age 49 to reproduce, we approximate this indicator by taking the average of the Effective Fertility Rate (EFR) over all reproductive ages (15-49).
+        unit: "children per woman"
+        description_processing: |-
+          For a given cohort year, we estimate the cumulative survival probability for a person to reach each age from 0 to 49. For example, the probability of a person born in 2000 reaching age 15, 16, 17, and so on up to 49.
+
+          We then estimate the Effective Fertility Rate (EFR) for each age group by multiplying the Total Fertility Rate (TFR) by the cumulative survival probability. The EFR for a given age gives us an approximation of the average number of children from a woman that will live long enough to reach that age.
+
+          The Reproductive Effective Fertility rate (EFR) is the average of the EFR over all reproductive ages (15-49).
+
+          Note that the Reproductive Effective Fertility rate (EFR) is an approximation of the number of daughters, so it uses the total fertility rate of female children, or equivalently, the TFR weighted by the sex ratio at birth.
+
+          So we have that: EFR_repr = mean(EFR) / (1 + SRB), where EFR = TFR * cumulative survival probability, SRB is the male-to-female sex ratio at birth, and the mean is taken over all reproductive ages (15-49).
+
+          This indicator is scaled by the sex ratio to allow easy comparability with the Total Fertility Rate (TFR) and the Labor Effective Fertility rate (EFR_labor).
+
+          Read more details in the author's paper: https://www.nber.org/papers/w33175
+
+      efr_labor:
+        title: Labor Effective Fertility rate
+        description_short: |-
+          The number of children born in a year who will live long enough to earn labor income. This is approximated by taking the average of the Effective Fertility rate (EFR) over all working ages (15-65).
+        unit: "children per woman"
+        description_processing: |-
+          For a given cohort year, we estimate the cumulative survival probability for a person to reach each age from 0 to 65. For example, the probability of a person born in 2000 reaching age 15, 16, 17, and so on up to 65.
+
+          We then estimate the Effective Fertility Rate (EFR) for each age group by multiplying the Total Fertility Rate (TFR) by the cumulative survival probability. The EFR for a given age gives us an approximation of the average number of children from a woman that will live long enough to reach that age.
+
+          The Labor Effective Fertility rate (EFR) is the average of the EFR over all labor ages (15-65).
+
+          So we have that: EFR_labor = mean(EFR) = TFR * mean(cumulative survival probability), where the mean is taken over all labor ages (15-65).
+
+          Read more details in the author's paper: https://www.nber.org/papers/w33175
+
+      cumulative_survival_repr:
+        title: Cumulative survival probability to reproductive age
+        description_short: |-
+          The probability that a person born in a given year will live long enough to reach reproductive age (15-49).
+        description_processing: |-
+          For a given cohort year, we estimate the cumulative survival probability for a person to reach each age from 0 to 49. For example, the probability of a person born in 2000 reaching age 15, 16, 17, and so on up to 49.
+
+          This is done by multiplying the survival probability at various years, depending on the age of the person. For example, if born in 2000, we use the probability of surviving age 0 from 2000, the probability of surviving age 1 from 2001, etc.
+
+          Read more details in the author's paper: https://www.nber.org/papers/w33175
+        unit: ""
+
+      cumulative_survival_labor:
+        title: Cumulative survival probability to labor age
+        description_short: |-
+          The probability that a person born in a given year will live long enough to reach labor age (15-65).
+        description_processing: |-
+          For a given cohort year, we estimate the cumulative survival probability for a person to reach each age from 0 to 65. For example, the probability of a person born in 2000 reaching age 15, 16, 17, and so on up to 65.
+
+          This is done by multiplying the survival probability at various years, depending on the age of the person. For example, if born in 2000, we use the probability of surviving age 0 from 2000, the probability of surviving age 1 from 2001, etc.
 
+          Read more details in the author's paper: https://www.nber.org/papers/w33175
+        unit: ""
diff --git a/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.py b/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.py
index f0e8cc3eb07..36a48b1e5a4 100644
--- a/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.py
+++ b/etl/steps/data/garden/demography/2024-12-17/efr_malani_jacob.py
@@ -1,6 +1,5 @@
 """Load a meadow dataset and create a garden dataset."""
 
-import pandas as pd
 from owid.catalog import processing as pr
 
 from etl.data_helpers import geo
@@ -9,7 +8,14 @@
 # Get paths and naming conventions for current step.
 paths = PathFinder(__file__)
 
-COLUMNS_UN = ["location", "year", "age", "sex", "probability_of_survival"]
+COLUMNS_UN = ["country", "year", "age", "sex", "probability_of_survival"]
+# Years
+YEAR_UN_START = 1950
+YEAR_UN_END = 2023
+# Ages
+AGE_LAB_START = 15
+AGE_REPR_END = 49
+AGE_LAB_END = 65
 
 
 def run(dest_dir: str) -> None:
@@ -20,64 +26,109 @@ def run(dest_dir: str) -> None:
     ds_un_lt = paths.load_dataset("un_wpp_lt")
     ds_un_wpp = paths.load_dataset("un_wpp")
 
-    ds_hmd = paths.load_dataset("hmd")
-    ds_hfd = paths.load_dataset("hfd")
+    # ds_hmd = paths.load_dataset("hmd")
+    # ds_hfd = paths.load_dataset("hfd")
 
     # Load tables
     tb_un = ds_un_lt.read("un_wpp_lt")
     tb_un_proj = ds_un_lt.read("un_wpp_lt_proj")
+    tb_tfr = ds_un_wpp.read("fertility_rate")
 
-    # Concatenate
-    tb_un = pr.concat([tb_un, tb_un_proj], ignore_index=True)
+    # Estimate cumulative survival in UN LT tables
+    tb_un = estimate_un_cum_survival(
+        tb=tb_un,
+        tb_proj=tb_un_proj,
+    )
 
-    # Filter 'total' and 'female'
-    tb_un = tb_un.loc[tb_un["sex"].isin(["total", "female"]), COLUMNS_UN]
+    # Filter TFR table
+    tb_tfr = tb_tfr.loc[
+        (tb_tfr["sex"] == "all") & (tb_tfr["age"] == "all") & (tb_tfr["variant"].isin(["estimates", "medium"])),
+        ["country", "year", "fertility_rate"],
+    ]
 
-    # Dtypes
-    tb_un["age"] = tb_un["age"].str.replace("100+", "100").astype("UInt16")
+    # Add TFR
+    tb_un = tb_un.merge(tb_tfr, on=["country", "year"], validate="m:1")
 
-    # Scale
-    tb_un["probability_of_survival"] /= 100
+    # Estimate EFR
+    tb_un["efr"] = tb_un["fertility_rate"] * tb_un["cumulative_survival"]
 
-    # Cumulative product
-    # We estimate the cumulative survival probability. This is the probability to survive from birth to a given age.
-    # The source provides the probability to survive from one age to the next (pn = probability to survive age n to n+1).
-    # To estimate this for people born in 1950, we need the data of p0 in 1950, p1 in 1951, etc. That's why we create year_born.
-    # After that, we just do the cumulative product for each year_born.
-    # Note that for the cumulative product to make sense, we need to first sort table by age!
-    # Step 1: Create year_born
-    tb_un["year_born"] = tb_un["year"] - tb_un["age"]
-    # Step 2: We only estimate the cumulative survival probability for people born between 1950 and 2023 (reduction of 50% rows)
-    tb_un = tb_un.loc[(tb_un["year_born"] >= 1950) & (tb_un["year_born"] <= 2023)]
-    # Step 3: Sort by age
-    tb_un = tb_un.sort_values(["location", "year_born", "sex", "age"])
-    # Step 4: Estimate cumulative survival probability
-    tb_un["cumulative_survival"] = tb_un.groupby(["location", "sex", "year_born"])["probability_of_survival"].cumprod()
-    # Step 5: Keep only years of interest (15-65), further reduction of 50% rows
-    tb_un = tb_un.loc[(tb_un["age"] >= 15) & (tb_un["age"] <= 65)]
-    # Step 6: Drop columns
-    tb_un = tb_un.drop(columns=["year_born"])
+    # Estimate metrics
+    ## EFR-reproductive: Average number of daughters that make it to the reproductive age (15-49)
+    ## EFR-labor: Average number of kids that make it to the labor age (15-65)
+    ## Cum survival prob, reproductive: Probability of a girl to survive to the reproductive age (15-49)
+    ## Cum survival prob, labor: Probability of a kid to survive to the labor age (15-65)
+    tb_un = tb_un.loc[(tb_un["age"] <= AGE_REPR_END) | (tb_un["sex"] == "total")]
+    tb_un = tb_un.groupby(["country", "year", "sex"], as_index=False)[["efr", "cumulative_survival"]].mean()
 
-    # Read table from meadow dataset.
-    tb = ds_un.read("efr_malani_jacob")
+    # Pivot
+    tb_un = tb_un.pivot(index=["country", "year"], columns=["sex"], values=["efr", "cumulative_survival"]).reset_index()
 
-    #
-    # Process data.
-    #
-    tb = geo.harmonize_countries(
-        df=tb, countries_file=paths.country_mapping_path, excluded_countries_file=paths.excluded_countries_path
-    )
-    tb = tb.format(["country", "year"])
+    def rename_col(colname):
+        mapping = {
+            "female": "repr",
+            "total": "labor",
+        }
+
+        if colname[1] == "":
+            return colname[0]
+        else:
+            return f"{colname[0]}_{mapping.get(colname[1])}"
+
+    tb_un.columns = [rename_col(col) for col in tb_un.columns]
+
+    # Format
+    tb_un = tb_un.format(["country", "year"], short_name="un")
+
+    tables = [
+        tb_un,
+    ]
 
-    #
     # Save outputs.
     #
     # Create a new garden dataset with the same metadata as the meadow dataset.
     ds_garden = create_dataset(
         dest_dir,
-        tables=[tb],
+        tables=tables,
         check_variables_metadata=True,
     )
 
     # Save changes in the new garden dataset.
     ds_garden.save()
+
+
+def estimate_un_cum_survival(tb, tb_proj):
+    # Concatenate
+    tb = pr.concat([tb, tb_proj], ignore_index=True)
+
+    # Rename columns
+    tb = tb.rename(columns={"location": "country"})
+
+    # Filter 'total' and 'female'
+    tb = tb.loc[tb["sex"].isin(["total", "female"]), COLUMNS_UN]
+
+    # Dtypes
+    tb["age"] = tb["age"].str.replace("100+", "100").astype("UInt16")
+
+    # Scale
+    tb["probability_of_survival"] /= 100
+
+    # Cumulative product
+    # We estimate the cumulative survival probability. This is the probability to survive from birth to a given age.
+    # The source provides the probability to survive from one age to the next (pn = probability to survive age n to n+1).
+    # To estimate this for people born in 1950, we need the data of p0 in 1950, p1 in 1951, etc. That's why we replace year with the cohort (birth) year.
+    # After that, we just do the cumulative product for each cohort year.
+    # Note that for the cumulative product to make sense, we need to first sort table by age!
+    # Step 1: Replace year with "cohort year"
+    tb["year"] = tb["year"] - tb["age"]
+    # Step 2: We only estimate the cumulative survival probability for people born between 1950 and 2023 (reduction of 50% rows)
+    tb = tb.loc[(tb["year"] >= YEAR_UN_START) & (tb["year"] <= YEAR_UN_END)]
+    # Step 3: Sort by age, so we can do the cumulative product later
+    tb = tb.sort_values(["country", "sex", "year", "age"], ignore_index=True)
+    # Step 4: Estimate cumulative survival probability
+    tb["cumulative_survival"] = tb.groupby(["country", "sex", "year"])["probability_of_survival"].cumprod()
+    # Step 5: Keep only ages of interest (15-65), further reduction of 65% rows (aggregate -83%)
+    tb = tb.loc[(tb["age"] >= AGE_LAB_START) & (tb["age"] <= AGE_LAB_END)]
+    # # Step 6: Drop columns
+    # tb = tb.drop(columns=["year_born"])
+
+    return tb
diff --git a/etl/steps/data/grapher/demography/2024-12-17/efr_malani_jacob.py b/etl/steps/data/grapher/demography/2024-12-17/efr_malani_jacob.py
index 8fb82651c03..f805b3f04b9 100644
--- a/etl/steps/data/grapher/demography/2024-12-17/efr_malani_jacob.py
+++ b/etl/steps/data/grapher/demography/2024-12-17/efr_malani_jacob.py
@@ -14,14 +14,14 @@ def run(dest_dir: str) -> None:
     ds_garden = paths.load_dataset("efr_malani_jacob")
 
     # Read table from garden dataset.
-    tb = ds_garden.read("efr_malani_jacob", reset_index=False)
+    tables = list(ds_garden)
 
     #
     # Save outputs.
     #
     # Create a new grapher dataset with the same metadata as the garden dataset.
     ds_grapher = create_dataset(
-        dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata
+        dest_dir, tables=tables, check_variables_metadata=True, default_metadata=ds_garden.metadata
     )
 
     # Save changes in the new grapher dataset.