PR fixes: better naming, better rules, more assert checks

calliope-project · Jul 4, 2024 · f734975 · f734975
1 parent de2ba7f
commit f734975
Show file tree

Hide file tree

Showing 8 changed files with 172 additions and 178 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,7 +4,7 @@
 
 ### Added (models)
 
-* **ADD** industry module and steel industry energy demand processing. NOT CONNECTED TO THE MAIN WORKFLOW. Industry sectors pending: chemical. (Fixes #308, #309, #310, #347, #345 and #346)
+* **ADD** Industry module: iron and steel, "default" combined categories. NOT CONNECTED TO THE MAIN WORKFLOW. (Fixes #308, #309, #310, #347, #345 and #346)
 
 * **ADD** Spatial resolution that aligns with the regions defined by the [e-Highway 2050 project](https://cordis.europa.eu/project/id/308908/reporting) (`ehighways`) (#370).
 

diff --git a/modules/industry/config.yaml b/modules/industry/config.yaml
@@ -1,18 +1,18 @@
 industry:
-    inputs:
-        path-energy-balances: build/data/annual-energy-balances.csv
-        path-cat-names: config/energy-balances/energy-balance-category-names.csv
-        path-carrier-names: config/energy-balances/energy-balance-carrier-names.csv
-        path-jrc-industry-energy: build/data/jrc-idees/industry/processed-energy.nc
-        path-jrc-industry-production: build/data/jrc-idees/industry/processed-production.nc
+    input-paths:
+        energy-balances: build/data/annual-energy-balances.csv
+        cat-names: config/energy-balances/energy-balance-category-names.csv
+        carrier-names: config/energy-balances/energy-balance-carrier-names.csv
+        jrc-industry-energy: build/data/jrc-idees/industry/processed-energy.nc
+        jrc-industry-production: build/data/jrc-idees/industry/processed-production.nc
     outputs:
         placeholder-out1:
         placeholder-out2:
     params:
-        non-generic-categories: ["Iron and steel", "Chemicals Industry"]
-        steel-config:
-            recycled-steel-share: 0.5  # % of recycled scrap steel for H-DRI
-        generic-config:
+        specific-categories: ["Iron and steel", "Chemicals Industry"]
+        config-combined-categories:
             final-energy-method: "by priority"
             final-energy-carriers: ["Electricity", "Natural gas (incl. biogas)", "Diesel oil (incl. biofuels)"]
             useful-demands: ["Low enthalpy heat"]
+        config-iron-and-steel:
+            recycled-steel-share: 0.5  # share (0-1) of recycled scrap steel for H-DRI
diff --git a/modules/industry/industry.smk b/modules/industry/industry.smk
@@ -1,66 +1,68 @@
 from snakemake.utils import validate
 
 # Paths dependent on main Snakefile
-MODULE_PATH = "modules/industry"
+MODULE_PATH = "modules/industry"  # TODO: remove if the module becomes an imported external workflow
 BUILD_PATH = f"{MODULE_PATH}/build"
 DATA_PATH = f"{MODULE_PATH}/raw_data"
 
 # Paths relative to this snakefile (snakemake behaviour is inconsistent)
 SCRIPT_PATH = "scripts"  # scripts are called relative to this file
 CONDA_PATH = "./env_industry.yaml"
 
+configfile: "./config.yaml"
 validate(config, "./schema.yaml")
 
 # Ensure rules are defined in order.
 # Otherwise commands like "rules.rulename.output" won't work!
-if "Iron and steel" in config["params"]["non-generic-categories"]:
-    rule steel_processing:
-        message: "Calculate energy demand for the 'Iron and steel' sector in JRC-IDEES."
-        conda: CONDA_PATH
-        params:
-            steel_config = config["params"]["steel-config"]
-        input:
-            path_energy_balances = config["inputs"]["path-energy-balances"],
-            path_cat_names = config["inputs"]["path-cat-names"],
-            path_carrier_names = config["inputs"]["path-carrier-names"],
-            path_jrc_industry_energy = config["inputs"]["path-jrc-industry-energy"],
-            path_jrc_industry_production = config["inputs"]["path-jrc-industry-production"],
-        output:
-            path_output = f"{BUILD_PATH}/annual_demand_steel.nc"
-        script: f"{SCRIPT_PATH}/steel_processing.py"
+rule iron_and_steel:
+    message: "Calculate energy demand for the 'Iron and steel' sector in JRC-IDEES."
+    conda: CONDA_PATH
+    params:
+        config = config["params"]["config-iron-and-steel"]
+    input:
+        energy_balances = config["input-paths"]["energy-balances"],
+        cat_names = config["input-paths"]["cat-names"],
+        carrier_names = config["input-paths"]["carrier-names"],
+        jrc_industry_energy = config["input-paths"]["jrc-industry-energy"],
+        jrc_industry_production = config["input-paths"]["jrc-industry-production"],
+    output:
+        path_output = f"{BUILD_PATH}/annual_demand_iron_and_steel.nc"
+    script: f"{SCRIPT_PATH}/steel_processing.py"
 
-if "Chemicals Industry" in config["params"]["non-generic-categories"]:
-    rule chemicals_processing:
-        message: "."
-        conda: CONDA_PATH
-        params:
-        input:
-        output:
-        script: f"{SCRIPT_PATH}/chemicals_processing.py"
+rule chemicals_industry:
+    message: "."
+    # conda: CONDA_PATH
+    params:
+    input:
+    output: f"{BUILD_PATH}/annual_demand_chemicals_industry.nc"
+    shell:
+        "touch {output}"
+    # script: f"{SCRIPT_PATH}/annual_demand_chemicals_industry.py"
 
-rule generic_processing:
+rule combined_categories:
     message: "Calculate energy demand for all other industry sectors in JRC-IDEES."
     conda: CONDA_PATH
     params:
-        non_generic_categories = config["params"]["non-generic-categories"],
-        generic_config = config["params"]["generic-config"],
+        specific_categories = config["params"]["specific-categories"],
+        config = config["params"]["config-combined-categories"],
     input:
-        path_energy_balances = config["inputs"]["path-energy-balances"],
-        path_cat_names = config["inputs"]["path-cat-names"],
-        path_carrier_names = config["inputs"]["path-carrier-names"],
-        path_jrc_industry_energy = config["inputs"]["path-jrc-industry-energy"],
-        path_jrc_industry_production = config["inputs"]["path-jrc-industry-production"],
-    output:
-        path_output = f"{BUILD_PATH}/annual_demand_generic.nc"
+        energy_balances = config["input-paths"]["energy-balances"],
+        cat_names = config["input-paths"]["cat-names"],
+        carrier_names = config["input-paths"]["carrier-names"],
+        jrc_industry_energy = config["input-paths"]["jrc-industry-energy"],
+        jrc_industry_production = config["input-paths"]["jrc-industry-production"],
+    output: f"{BUILD_PATH}/annual_demand_combined_categories.nc"
     script: f"{SCRIPT_PATH}/generic_processing.py"
 
-# rule combine_and_scale:
-#     message: "."
-#     conda: CONDA_PATH
-#     params:
-#     input:
-#     output:
-#     script:
+SUFFIXES = [i.lower().replace(" ", "_") for i in config["params"]["specific-categories"]]
+rule combine_and_scale:
+    message: "Identify the category scripts to run based on the configuration."
+    conda: CONDA_PATH
+    input:
+        expand("{path}/annual_demand_{sample}.nc", path=[BUILD_PATH], sample=SUFFIXES),
+        rules.combined_categories.output
+    # output: f"{BUILD_PATH}/annual_demand_aggregated.nc"
+
 
 # rule verify:
 #     message: "."

diff --git a/modules/industry/schema.yaml b/modules/industry/schema.yaml
@@ -2,31 +2,31 @@ $schema: https://json-schema.org/draft/2020-12/schema
 type: object
 additionalProperties: false
 properties:
-    inputs:
+    input-paths:
         type: object
         additionalProperties: false
         description: Inputs are paths of prerequired files.
         properties:
-            path-energy-balances:
+            energy-balances:
                 type: string
                 description: |
                     Annual energy balance file.
                     Columns [cat_code,carrier_code,unit,country,year,value].
-            path-cat-names:
+            cat-names:
                 type: string
                 description: |
                     Category mapping file.
                     Columns [cat_code,top_cat,sub_cat_contribution,sub_cat_1,sub_cat_2,jrc_idees].
-            path-carrier-names:
+            carrier-names:
                 type: string
                 description: |
                     Carrier mapping file.
                     Columns [carrier_code,carrier_name,hh_carrier_name,com_carrier_name,ind_carrier_name,oth_carrier_name].
-            path-jrc-industry-energy:
+            jrc-industry-energy:
                 type: string
                 description: |
                     JRC processed industry energy demand .nc file.
-            path-jrc-industry-production:
+            jrc-industry-production:
                 type: string
                 description: |
                     JRC processed industrial production .nc file.
@@ -38,24 +38,16 @@ properties:
         additionalProperties: false
         description: Parameters allow users to configure module behaviour.
         properties:
-            non-generic-categories:
+            specific-categories:
                 type: array
-                description: "Specifies which JRC industry categories will be processed separately."
+                description: |
+                    Specifies which JRC industry categories will be processed through category-specific rules.
+                    Omitted categories will instead be processed through the "combined" category rule.
                 uniqueItems: true
                 items:
                     type: string
                     enum: ["Iron and steel", "Chemicals Industry"]
-            steel-config:
-                type: object
-                additionalProperties: false
-                description: "Parameters specific to the 'Iron and steel' industry category."
-                properties:
-                    recycled-steel-share:
-                        type: number
-                        description: "Share of recycled metal in the H-DRI steel process."
-                        minimum: 0
-                        maximum: 1
-            generic-config:
+            config-combined-categories:
                 type: object
                 additionalProperties: false
                 description: "Parameters for default/generic category processing."
@@ -82,3 +74,13 @@ properties:
                         uniqueItems: true
                         items:
                             type: string
+            config-iron-and-steel:
+                type: object
+                additionalProperties: false
+                description: "Parameters specific to the 'Iron and steel' industry category."
+                properties:
+                    recycled-steel-share:
+                        type: number
+                        description: "Share of recycled metal in the H-DRI steel process."
+                        minimum: 0
+                        maximum: 1
diff --git a/modules/industry/scripts/generic_processing.py b/modules/industry/scripts/generic_processing.py
@@ -1,88 +1,84 @@
-from typing import Optional
-
 import pandas as pd
 import xarray as xr
 from utils import filling
 from utils import jrc_idees_parser as jrc
 
 
 def get_generic_demand(
-    non_generic_categories: list,
-    generic_config: dict,
-    path_energy_balances: str,
-    path_cat_names: str,
-    path_carrier_names: str,
-    path_jrc_industry_energy: str,
-    path_jrc_industry_production: str,
-    path_output: Optional[str] = None,
-) -> xr.DataArray:
-    """Processing of industry categories not selected for individual processing.
+    specific_categories: list,
+    config: dict,
+    energy_balances: str,
+    cat_names: str,
+    carrier_names: str,
+    jrc_industry_energy: str,
+    jrc_industry_production: str,
+    output_path: str,
+):
+    """Processing of industry categories not selected for specific processing.
 
     Merges all energy demand into a single `generic` category using a configurable data processing pipeline.
 
     Args:
-        non_generic_categories (list): categories with separate processing (will be ignored).
-        generic_config (dict): configuration for generic category processing.
-        path_energy_balances (str): country energy balances (usually from eurostat).
-        path_cat_names (str): eurostat category mapping file.
-        path_carrier_names (str): eurostat carrier name mapping file.
-        path_jrc_industry_energy (str): jrc country-specific industrial energy demand file.
-        path_jrc_industry_production (str): jrc country-specific industrial production file.
-        path_output (str): location of steel demand output file.
+        specific_categories (list): categories with separate processing (will be ignored).
+        config (dict): configuration for generic category processing.
+        energy_balances (str): country energy balances (usually from eurostat).
+        cat_names (str): eurostat category mapping file.
+        carrier_names (str): eurostat carrier name mapping file.
+        jrc_industry_energy (str): jrc country-specific industrial energy demand file.
+        jrc_industry_production (str): jrc country-specific industrial production file.
+        output_path (str): location of the generic demand output file (netCDF).
 
     Returns:
         None: the industrial demand per country is written to `output_path` as a netCDF file.
     """
     # Load data
     energy_balances_df = pd.read_csv(
-        path_energy_balances, index_col=[0, 1, 2, 3, 4]
+        energy_balances, index_col=[0, 1, 2, 3, 4]
     ).squeeze("columns")
-    cat_names_df = pd.read_csv(path_cat_names, header=0, index_col=0)
-    carrier_names_df = pd.read_csv(path_carrier_names, header=0, index_col=0)
-    jrc_energy = xr.open_dataset(path_jrc_industry_energy)
-    jrc_prod = xr.open_dataarray(path_jrc_industry_production)
+    cat_names_df = pd.read_csv(cat_names, header=0, index_col=0)
+    carrier_names_df = pd.read_csv(carrier_names, header=0, index_col=0)
+    jrc_energy = xr.open_dataset(jrc_industry_energy)
+    jrc_prod = xr.open_dataarray(jrc_industry_production)
+    jrc.check_units(jrc_energy, jrc_prod)
 
     # Remove data from all specifically processed industries
-    cat_names_df = cat_names_df[~cat_names_df["jrc_idees"].isin(non_generic_categories)]
-    jrc_energy = jrc_energy.drop_sel(cat_name=non_generic_categories)
-    jrc_prod = jrc_prod.drop_sel(cat_name=non_generic_categories)
+    cat_names_df = cat_names_df[~cat_names_df["jrc_idees"].isin(specific_categories)]
+    jrc_energy = jrc_energy.drop_sel(cat_name=specific_categories)
+    jrc_prod = jrc_prod.drop_sel(cat_name=specific_categories)
 
     # Process data:
-    # Extract useful dem. -> remove useful dem. from rest -> extract final dem.
-    selected_useful = generic_config["useful-demands"]
+    # Extract useful demand -> remove useful demand from rest -> extract final demand
+    selected_useful = config["useful-demands"]
     other_useful_demand = jrc.convert_subsection_demand_to_carrier(
         jrc_energy, selected_useful
     )
 
-    final_method = generic_config["final-energy-method"]
+    final_method = config["final-energy-method"]
     jrc_energy = jrc_energy.drop_sel(subsection=selected_useful)
 
     match final_method:
         case "by priority":
             other_final_demand = transform_final_demand_by_priority(
-                jrc_energy, generic_config["final-energy-carriers"]
+                jrc_energy, config["final-energy-carriers"]
             )
         case "keep everything":
             other_final_demand = jrc_energy["final"].sum(["section", "subsection"])
             other_final_demand = jrc.standardize(other_final_demand, "twh")
         case _:
             raise ValueError(f"Unsupported final energy method: {final_method}.")
-
     # Combine and fill missing countries
     other_demand = xr.concat(
         [other_useful_demand, other_final_demand], dim="carrier_name"
     )
 
+    assert other_demand.sum() < jrc_energy["final"].sum(), "Potential double counting!"
+
     other_demand = filling.fill_missing_countries_years(
         energy_balances_df, cat_names_df, carrier_names_df, other_demand
     )
 
-    other_demand = jrc.standardize(other_demand, "twh")
-
-    if path_output:
-        other_demand.to_netcdf(path_output)
-
-    return other_demand
+    other_demand = jrc.standardize(other_demand, "twh", "demand")
+    other_demand.to_netcdf(output_path)
 
 
 def transform_final_demand_by_priority(
@@ -110,7 +106,7 @@ def transform_final_demand_by_priority(
     carrier_final_dem = {}
 
     for carrier in carrier_priority:
-        dem_replaced = jrc.replace_final_demand_by_carrier(carrier, jrc_energy)
+        dem_replaced = jrc.replace_carrier_final_demand(carrier, jrc_energy)
         dem_replaced = dem_replaced.to_dataframe().dropna()
         for dem_replaced_prev in carrier_final_dem.values():
             dem_replaced = dem_replaced.drop(dem_replaced_prev.index, errors="ignore")
@@ -131,12 +127,12 @@ def transform_final_demand_by_priority(
 
 if __name__ == "__main__":
     get_generic_demand(
-        non_generic_categories=snakemake.params.non_generic_categories,
-        generic_config=snakemake.params.generic_config,
-        path_energy_balances=snakemake.input.path_energy_balances,
-        path_cat_names=snakemake.input.path_cat_names,
-        path_carrier_names=snakemake.input.path_carrier_names,
-        path_jrc_industry_energy=snakemake.input.path_jrc_industry_energy,
-        path_jrc_industry_production=snakemake.input.path_jrc_industry_production,
-        path_output=snakemake.output.path_output,
+        specific_categories=snakemake.params.specific_categories,
+        config=snakemake.params.config,
+        energy_balances=snakemake.input.energy_balances,
+        cat_names=snakemake.input.cat_names,
+        carrier_names=snakemake.input.carrier_names,
+        jrc_industry_energy=snakemake.input.jrc_industry_energy,
+        jrc_industry_production=snakemake.input.jrc_industry_production,
+        output_path=snakemake.output[0],
     )