Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add data maturity for 923m #2936

Merged
merged 18 commits into from
Nov 1, 2023
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion docs/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,8 @@ Data Coverage
^^^^^^^^^^^^^

* Updated :doc:`data_sources/eia860` to include early release data from 2022.
* Updated :doc:`data_sources/eia923` to include early release data from 2022.
* Updated :doc:`data_sources/eia923` to include early release data from 2022 and
monthly YTD data as of April 2023.
* Updated :doc:`data_sources/epacems` to switch from the old FTP server to the new
CAMPD API, and to include 2022 data. Due to changes in the ETL, Alaska, Puerto Rico
and Hawaii are now included in CEMS processing. See issue :issue:`1264` & PRs
Expand Down
12 changes: 12 additions & 0 deletions src/pudl/extract/excel.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import dbfread
import pandas as pd
import regex as re

import pudl

Expand Down Expand Up @@ -195,6 +196,17 @@ def add_data_maturity(self, df: pd.DataFrame, page, **partition) -> pd.DataFrame
maturity = "provisional"
elif self._dataset_name == "eia860m":
maturity = "monthly_update"
elif self._dataset_name == "eia923":
# this conditional is a hacky way to get around the fact that the formatting
# for the file names didn't always look like "EIA923_Schedules_2_3_4_5_M_"
# But, because we only want YTD data, it's fine to set a year floor.
if partition["year"] > 2022:
release_month = re.search(
r"EIA923_Schedules_2_3_4_5_M_(\d{2})",
self.excel_filename(page, **partition),
).group(1)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the filename for the YTD file always contains EIA923_Schedules_2_3_4_5_M_, you could do something like:

Suggested change
# this conditional is a hacky way to get around the fact that the formatting
# for the file names didn't always look like "EIA923_Schedules_2_3_4_5_M_"
# But, because we only want YTD data, it's fine to set a year floor.
if partition["year"] > 2022:
release_month = re.search(
r"EIA923_Schedules_2_3_4_5_M_(\d{2})",
self.excel_filename(page, **partition),
).group(1)
file_name = self.excel_filename(page, **partition)
ytd_file_name_start = "EIA923_Schedules_2_3_4_5_M_"
if ytd_file_name_start in file_name:
release_month = re.search(
r"ytd_file_name_start(\d{2})", file_name,
).group(1)

I think you could probably even use `if ytd_file_name_start in file_name:` as the elif condition instead of checking for 923 first.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This makes sense, but also it won't fail if there is no capture group found. I feel like it would be good if this failed if, for example, the file naming changed and we needed to come up with a new way to grab it.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, I don't think you can use a variable in re.search if you also have the {2} in brackets because it will think it's also an f-string variable.

I.e., release_month = re.search(r"ytd_file_name_start(\d{2})", file_name).group(1) won't work.

if release_month != "12":
maturity = "incremental_ytd"
df = df.assign(data_maturity=maturity)
self.cols_added.append("data_maturity")
return df
Expand Down
2 changes: 1 addition & 1 deletion src/pudl/metadata/sources.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,7 @@
},
"field_namespace": "eia",
"working_partitions": {
"years": sorted(set(range(2001, 2023))),
"years": sorted(set(range(2001, 2024))),
},
"contributors": [
CONTRIBUTORS["catalyst-cooperative"],
Expand Down
28 changes: 28 additions & 0 deletions src/pudl/output/eia923.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,30 @@ def _fill_fuel_costs_by_state(
return out_df


def drop_ytd_for_annual_tables(df: pd.DataFrame, freq: str) -> pd.DataFrame:
    """Drop records in annual tables where data_maturity is incremental_ytd.

    Year-to-date records only cover part of a year, so letting them into an annual
    aggregation would yield totals that look like full-year values but are not.

    Args:
        df: A pd.DataFrame that contains a ``data_maturity`` column and for which
            you want to drop rows where ``data_maturity`` is ``incremental_ytd``.
        freq: The pandas frequency alias indicating the level of aggregation for a
            specific table. ``"AS"`` and ``"YS"`` (the year-start alias that
            replaces ``"AS"`` in newer pandas releases) are treated as annual; any
            other value, e.g. ``"MS"``, leaves the data untouched.

    Returns:
        For an annual ``freq``, the rows of ``df`` whose ``data_maturity`` is not
        ``incremental_ytd``. For any other ``freq``, ``df`` itself, unmodified.

    Raises:
        KeyError: If ``freq`` is annual and ``df`` has no ``data_maturity`` column.
    """
    # Accept both spellings of the year-start alias so that callers which switch
    # from the deprecated "AS" to "YS" do not silently keep partial-year rows.
    if freq not in ("AS", "YS"):
        return df

    logger.info(
        "Removing rows where data_maturity is incremental_ytd to avoid "
        "aggregation errors."
    )
    return df.loc[df["data_maturity"] != "incremental_ytd"]
aesharpe marked this conversation as resolved.
Show resolved Hide resolved


#####################################################################################
# Simple Denormalized Assets
#####################################################################################
Expand Down Expand Up @@ -339,6 +363,7 @@ def generation_agg_eia923(
denorm_generation_eia923.set_index(
pd.DatetimeIndex(denorm_generation_eia923.report_date)
)
.pipe(drop_ytd_for_annual_tables, freq)
.groupby(
by=["plant_id_eia", "generator_id", pd.Grouper(freq=freq)],
observed=True,
Expand Down Expand Up @@ -367,6 +392,7 @@ def generation_fuel_combined_agg_eia923(
denorm_generation_fuel_combined_eia923.set_index(
pd.DatetimeIndex(denorm_generation_fuel_combined_eia923.report_date)
)
.pipe(drop_ytd_for_annual_tables, freq)
.groupby(
by=[
"plant_id_eia",
Expand Down Expand Up @@ -439,6 +465,7 @@ def boiler_fuel_agg_eia923(
total_ash_content=lambda x: x.fuel_consumed_units * x.ash_content_pct,
)
.set_index(pd.DatetimeIndex(denorm_boiler_fuel_eia923.report_date))
.pipe(drop_ytd_for_annual_tables, freq)
.groupby(
by=[
"plant_id_eia",
Expand Down Expand Up @@ -501,6 +528,7 @@ def fuel_receipts_costs_agg_eia923(
total_chlorine_content=lambda x: x.chlorine_content_ppm
* x.fuel_received_units,
)
.pipe(drop_ytd_for_annual_tables, freq)
.groupby(
by=["plant_id_eia", "fuel_type_code_pudl", pd.Grouper(freq=freq)],
observed=True,
Expand Down
Loading
Loading