catalyst-cooperative · aesharpe · Nov 1, 2023 · Oct 12, 2023 · Oct 12, 2023 · Oct 12, 2023
diff --git a/docs/release_notes.rst b/docs/release_notes.rst
@@ -70,7 +70,8 @@ Data Coverage
 ^^^^^^^^^^^^^
 
 * Updated :doc:`data_sources/eia860` to include early release data from 2022.
-* Updated :doc:`data_sources/eia923` to include early release data from 2022.
+* Updated :doc:`data_sources/eia923` to include early release data from 2022 and
+  monthly year-to-date (YTD) data as of April 2023.
 * Updated :doc:`data_sources/epacems` to switch from the old FTP server to the new
   CAMPD API, and to include 2022 data. Due to changes in the ETL, Alaska, Puerto Rico
   and Hawaii are now included in CEMS processing. See issue :issue:`1264` & PRs

diff --git a/src/pudl/extract/excel.py b/src/pudl/extract/excel.py
@@ -4,6 +4,7 @@
 
 import dbfread
 import pandas as pd
+import regex as re
 
 import pudl
 
@@ -195,6 +196,17 @@ def add_data_maturity(self, df: pd.DataFrame, page, **partition) -> pd.DataFrame
             maturity = "provisional"
         elif self._dataset_name == "eia860m":
             maturity = "monthly_update"
+        elif self._dataset_name == "eia923":
+            # this conditional is a hacky way to get around the fact that the formatting
+            # for the file names didn't always look like "EIA923_Schedules_2_3_4_5_M_"
+            # But, because we only want YTD data, it's fine to set a year floor.
+            if partition["year"] > 2022:
+                release_month = re.search(
+                    r"EIA923_Schedules_2_3_4_5_M_(\d{2})",
+                    self.excel_filename(page, **partition),
+                ).group(1)
-            # this conditional is a hacky way to get around the fact that the formatting
-            # for the file names didn't always look like "EIA923_Schedules_2_3_4_5_M_"
-            # But, because we only want YTD data, it's fine to set a year floor.
-            if partition["year"] > 2022:
-                release_month = re.search(
-                    r"EIA923_Schedules_2_3_4_5_M_(\d{2})",
-                    self.excel_filename(page, **partition),
-                ).group(1)
+            file_name = self.excel_filename(page, **partition)
+            ytd_file_name_start = "EIA923_Schedules_2_3_4_5_M_"
+            if ytd_file_name_start in file_name:
+                release_month = re.search(
+                    r"ytd_file_name_start(\d{2})", file_name,
+                ).group(1)
-            # this conditional is a hacky way to get around the fact that the formatting
-            # for the file names didn't always look like "EIA923_Schedules_2_3_4_5_M_"
-            # But, because we only want YTD data, it's fine to set a year floor.
-            if partition["year"] > 2022:
-                release_month = re.search(
-                    r"EIA923_Schedules_2_3_4_5_M_(\d{2})",
-                    self.excel_filename(page, **partition),
-                ).group(1)
+            file_name = self.excel_filename(page, **partition)
+            ytd_file_name_start = "EIA923_Schedules_2_3_4_5_M_"
+            if ytd_file_name_start in file_name:
+                release_month = re.search(
+                    r"ytd_file_name_start(\d{2})", file_name,
+                ).group(1)
+                if release_month != "12":
+                    maturity = "incremental_ytd"
         df = df.assign(data_maturity=maturity)
         self.cols_added.append("data_maturity")
         return df

diff --git a/src/pudl/metadata/sources.py b/src/pudl/metadata/sources.py
@@ -191,7 +191,7 @@
         },
         "field_namespace": "eia",
         "working_partitions": {
-            "years": sorted(set(range(2001, 2023))),
+            "years": sorted(set(range(2001, 2024))),
         },
         "contributors": [
             CONTRIBUTORS["catalyst-cooperative"],

diff --git a/src/pudl/output/eia923.py b/src/pudl/output/eia923.py
@@ -126,6 +126,30 @@ def _fill_fuel_costs_by_state(
     return out_df
 
 
+def drop_ytd_for_annual_tables(df: pd.DataFrame, freq: str) -> pd.DataFrame:
+    """Drop records in annual tables where data_maturity is incremental_ytd.
+
+    This avoids accidental aggregation errors due to sub-annually reported data.
+
+    Args:
+        df: A pd.DataFrame that contains a data_maturity column and for
+            which you want to drop values where data_maturity = incremental_ytd.
+        freq: either MS or AS to indicate the level of aggregation for a specific table.
+
+    Returns:
+        pd.DataFrame: The input pd.DataFrame without any rows where data_maturity =
+            incremental_ytd when freq is AS; for any other freq, the input unchanged.
+    """
+    if freq == "AS":  # "AS" is the pandas year-start (annual) frequency alias
+        logger.info(
+            "Removing rows where data_maturity is incremental_ytd to avoid "
+            "aggregation errors."
+        )
+        return df.loc[df["data_maturity"] != "incremental_ytd"]
+    else:
+        return df
+
+
 #####################################################################################
 # Simple Denormalized Assets
 #####################################################################################
@@ -339,6 +363,7 @@ def generation_agg_eia923(
             denorm_generation_eia923.set_index(
                 pd.DatetimeIndex(denorm_generation_eia923.report_date)
             )
+            .pipe(drop_ytd_for_annual_tables, freq)
             .groupby(
                 by=["plant_id_eia", "generator_id", pd.Grouper(freq=freq)],
                 observed=True,
@@ -367,6 +392,7 @@ def generation_fuel_combined_agg_eia923(
             denorm_generation_fuel_combined_eia923.set_index(
                 pd.DatetimeIndex(denorm_generation_fuel_combined_eia923.report_date)
             )
+            .pipe(drop_ytd_for_annual_tables, freq)
             .groupby(
                 by=[
                     "plant_id_eia",
@@ -439,6 +465,7 @@ def boiler_fuel_agg_eia923(
                 total_ash_content=lambda x: x.fuel_consumed_units * x.ash_content_pct,
             )
             .set_index(pd.DatetimeIndex(denorm_boiler_fuel_eia923.report_date))
+            .pipe(drop_ytd_for_annual_tables, freq)
             .groupby(
                 by=[
                     "plant_id_eia",
@@ -501,6 +528,7 @@ def fuel_receipts_costs_agg_eia923(
                 total_chlorine_content=lambda x: x.chlorine_content_ppm
                 * x.fuel_received_units,
             )
+            .pipe(drop_ytd_for_annual_tables, freq)
             .groupby(
                 by=["plant_id_eia", "fuel_type_code_pudl", pd.Grouper(freq=freq)],
                 observed=True,