Skip to content

Commit

Permalink
Merge pull request #2936 from catalyst-cooperative/add-data-maturity-…
Browse files Browse the repository at this point in the history
…for-923m

Add monthly data and data maturity for 923m
  • Loading branch information
aesharpe authored Nov 1, 2023
2 parents 8c60657 + b376141 commit dbf686f
Show file tree
Hide file tree
Showing 29 changed files with 561 additions and 378 deletions.
3 changes: 2 additions & 1 deletion docs/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,8 @@ Data Coverage
^^^^^^^^^^^^^

* Updated :doc:`data_sources/eia860` to include early release data from 2022.
* Updated :doc:`data_sources/eia923` to include early release data from 2022.
* Updated :doc:`data_sources/eia923` to include early release data from 2022 and
monthly YTD data as of April 2023.
* Updated :doc:`data_sources/epacems` to switch from the old FTP server to the new
CAMPD API, and to include 2022 data. Due to changes in the ETL, Alaska, Puerto Rico
and Hawaii are now included in CEMS processing. See issue :issue:`1264` & PRs
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
"""add data_maturity to eia923m tables
Revision ID: 1ceb9897fd34
Revises: f11241c9292d
Create Date: 2023-10-26 16:30:33.771381
"""
import sqlalchemy as sa
from alembic import op

# revision identifiers, used by Alembic.
revision = '1ceb9897fd34'
down_revision = 'f11241c9292d'
branch_labels = None
depends_on = None


def upgrade() -> None:
    """Add ``data_maturity`` to the EIA-923 related tables.

    For every table listed below, a nullable text column named
    ``data_maturity`` is added together with a foreign key constraint that
    points at ``data_maturities.code``. Tables are processed in the order
    listed, one batch operation per table.
    """
    column_comment = 'Level of maturity of the data record. Some data sources report less-than-final data. PUDL sometimes includes this data, but use at your own risk.'
    # Order matters only insofar as it mirrors the autogenerated migration.
    tables_to_alter = (
        "boiler_fuel_eia923",
        "denorm_boiler_fuel_eia923",
        "denorm_boiler_fuel_monthly_eia923",
        "denorm_boiler_fuel_yearly_eia923",
        "denorm_fuel_receipts_costs_monthly_eia923",
        "denorm_fuel_receipts_costs_yearly_eia923",
        "denorm_generation_fuel_combined_monthly_eia923",
        "denorm_generation_fuel_combined_yearly_eia923",
        "denorm_generation_monthly_eia923",
        "denorm_generation_yearly_eia923",
        "denorm_plants_utilities_eia",
    )

    for table_name in tables_to_alter:
        # Every constraint name follows the same fk_<table>_<column>_<target> shape.
        fk_name = f"fk_{table_name}_data_maturity_data_maturities"
        with op.batch_alter_table(table_name, schema=None) as batch_op:
            # A fresh Column object is built per table; it cannot be shared.
            maturity_column = sa.Column(
                "data_maturity",
                sa.Text(),
                nullable=True,
                comment=column_comment,
            )
            batch_op.add_column(maturity_column)
            batch_op.create_foreign_key(
                batch_op.f(fk_name),
                "data_maturities",
                ["data_maturity"],
                ["code"],
            )


def downgrade() -> None:
    """Remove ``data_maturity`` from the EIA-923 related tables.

    For every table listed below, the foreign key constraint on
    ``data_maturity`` is dropped first and then the column itself is dropped.
    Tables are processed in the order listed, one batch operation per table.
    """
    # Listed in the reverse of the order in which upgrade() alters them.
    tables_to_revert = (
        "denorm_plants_utilities_eia",
        "denorm_generation_yearly_eia923",
        "denorm_generation_monthly_eia923",
        "denorm_generation_fuel_combined_yearly_eia923",
        "denorm_generation_fuel_combined_monthly_eia923",
        "denorm_fuel_receipts_costs_yearly_eia923",
        "denorm_fuel_receipts_costs_monthly_eia923",
        "denorm_boiler_fuel_yearly_eia923",
        "denorm_boiler_fuel_monthly_eia923",
        "denorm_boiler_fuel_eia923",
        "boiler_fuel_eia923",
    )

    for table_name in tables_to_revert:
        # Every constraint name follows the same fk_<table>_<column>_<target> shape.
        fk_name = f"fk_{table_name}_data_maturity_data_maturities"
        with op.batch_alter_table(table_name, schema=None) as batch_op:
            # The constraint has to go before the column it references.
            batch_op.drop_constraint(batch_op.f(fk_name), type_="foreignkey")
            batch_op.drop_column("data_maturity")
3 changes: 2 additions & 1 deletion src/pudl/analysis/allocate_gen_fuel.py
Original file line number Diff line number Diff line change
Expand Up @@ -360,7 +360,8 @@ def allocate_gen_fuel_by_generator_energy_source(
# Add any startup energy source codes to the list of energy source codes
gens_at_freq = adjust_msw_energy_source_codes(gens_at_freq, gf, bf)
gens_at_freq = add_missing_energy_source_codes_to_gens(gens_at_freq, gf, bf)
# do the association!
# do the association! --> this step is where a small number of plants are dropped
# for an unknown reason. Investigate in issue #2978.
gen_assoc = associate_generator_tables(
gens=gens_at_freq, gf=gf, gen=gen, bf=bf, bga=bga
)
Expand Down
4 changes: 4 additions & 0 deletions src/pudl/extract/eia923.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,10 @@ def process_raw(self, df, page, **partition):
if col in df.columns:
df = remove_leading_zeros_from_numeric_strings(df=df, col_name=col)
df = self.add_data_maturity(df, page, **partition)
# Fill in blank reporting_frequency_code for monthly data
df.loc[
df["data_maturity"] == "incremental_ytd", "reporting_frequency_code"
] = "M"
# the 2021 early release data had some ding dang "."'s and nulls in the year column
if "report_year" in df.columns:
mask = (df.report_year == ".") | df.report_year.isnull()
Expand Down
11 changes: 10 additions & 1 deletion src/pudl/extract/excel.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import dbfread
import pandas as pd
import regex as re
from dagster import (
AssetsDefinition,
DynamicOut,
Expand Down Expand Up @@ -200,10 +201,18 @@ def add_data_maturity(self, df: pd.DataFrame, page, **partition) -> pd.DataFrame
``self.cols_added``.
"""
maturity = "final"
if "early_release" in self.excel_filename(page, **partition).lower():
file_name = self.excel_filename(page, **partition)
if "early_release" in file_name.lower():
maturity = "provisional"
elif self._dataset_name == "eia860m":
maturity = "monthly_update"
elif "EIA923_Schedules_2_3_4_5_M_" in file_name:
release_month = re.search(
r"EIA923_Schedules_2_3_4_5_M_(\d{2})",
file_name,
).group(1)
if release_month != "12":
maturity = "incremental_ytd"
df = df.assign(data_maturity=maturity)
self.cols_added.append("data_maturity")
return df
Expand Down
1 change: 1 addition & 0 deletions src/pudl/metadata/resources/eia.py
Original file line number Diff line number Diff line change
Expand Up @@ -788,6 +788,7 @@
"utility_id_eia",
"utility_name_eia",
"utility_id_pudl",
"data_maturity",
],
"primary_key": ["report_date", "plant_id_eia", "utility_id_eia"],
},
Expand Down
18 changes: 10 additions & 8 deletions src/pudl/metadata/resources/eia923.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,8 +122,7 @@
"fuel_mmbtu_per_unit",
"sulfur_content_pct",
"ash_content_pct",
# No data_maturity field
# see: https://github.com/catalyst-cooperative/pudl/issues/1847
"data_maturity",
],
"primary_key": [
"plant_id_eia",
Expand Down Expand Up @@ -158,8 +157,7 @@
"fuel_consumed_mmbtu",
"sulfur_content_pct",
"ash_content_pct",
# No data_maturity field
# see: https://github.com/catalyst-cooperative/pudl/issues/1847
"data_maturity",
],
"primary_key": [
"plant_id_eia",
Expand Down Expand Up @@ -194,8 +192,7 @@
"fuel_consumed_mmbtu",
"sulfur_content_pct",
"ash_content_pct",
# No data_maturity field
# see: https://github.com/catalyst-cooperative/pudl/issues/1847
"data_maturity",
],
"primary_key": [
"plant_id_eia",
Expand Down Expand Up @@ -230,8 +227,7 @@
"fuel_consumed_mmbtu",
"sulfur_content_pct",
"ash_content_pct",
# No data_maturity field
# see: https://github.com/catalyst-cooperative/pudl/issues/1847
"data_maturity",
],
"primary_key": [
"plant_id_eia",
Expand Down Expand Up @@ -382,6 +378,7 @@
"mercury_content_ppm",
"moisture_content_pct",
"chlorine_content_ppm",
"data_maturity",
],
},
"field_namespace": "eia",
Expand Down Expand Up @@ -412,6 +409,7 @@
"mercury_content_ppm",
"moisture_content_pct",
"chlorine_content_ppm",
"data_maturity",
],
},
"field_namespace": "eia",
Expand Down Expand Up @@ -470,6 +468,7 @@
"generator_id",
"unit_id_pudl",
"net_generation_mwh",
"data_maturity",
],
"primary_key": ["plant_id_eia", "generator_id", "report_date"],
},
Expand All @@ -491,6 +490,7 @@
"generator_id",
"unit_id_pudl",
"net_generation_mwh",
"data_maturity",
],
"primary_key": ["plant_id_eia", "generator_id", "report_date"],
},
Expand Down Expand Up @@ -581,6 +581,7 @@
"fuel_consumed_mmbtu",
"fuel_consumed_for_electricity_mmbtu",
"net_generation_mwh",
"data_maturity",
],
"primary_key": [
"plant_id_eia",
Expand Down Expand Up @@ -613,6 +614,7 @@
"fuel_consumed_mmbtu",
"fuel_consumed_for_electricity_mmbtu",
"net_generation_mwh",
"data_maturity",
],
"primary_key": [
"plant_id_eia",
Expand Down
2 changes: 1 addition & 1 deletion src/pudl/metadata/sources.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,7 @@
},
"field_namespace": "eia",
"working_partitions": {
"years": sorted(set(range(2001, 2023))),
"years": sorted(set(range(2001, 2024))),
},
"contributors": [
CONTRIBUTORS["catalyst-cooperative"],
Expand Down
9 changes: 6 additions & 3 deletions src/pudl/output/eia.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ def denorm_generators_eia(

# Bring in some generic plant & utility information:
pu_eia = denorm_plants_utilities_eia.drop(
["plant_name_eia", "utility_id_eia"], axis="columns"
["plant_name_eia", "utility_id_eia", "data_maturity"], axis="columns"
)
out_df = pd.merge(out_df, pu_eia, on=["report_date", "plant_id_eia"], how="left")

Expand Down Expand Up @@ -242,7 +242,9 @@ def denorm_boilers_eia(
# Bring in some generic plant & utility information:
out_df = pd.merge(
out_df,
denorm_plants_utilities_eia.drop(["plant_name_eia"], axis="columns"),
denorm_plants_utilities_eia.drop(
["plant_name_eia", "data_maturity"], axis="columns"
),
on=["report_date", "plant_id_eia"],
how="left",
)
Expand Down Expand Up @@ -324,7 +326,7 @@ def denorm_plants_utilities_eia(
# to avoid duplicate columns on the merge...
out_df = pd.merge(
plants_eia,
denorm_utilities_eia,
denorm_utilities_eia.drop(columns=["data_maturity"]),
how="left",
on=["report_date", "utility_id_eia"],
)
Expand All @@ -339,6 +341,7 @@ def denorm_plants_utilities_eia(
"utility_id_eia",
"utility_name_eia",
"utility_id_pudl",
"data_maturity",
],
].dropna(subset=["report_date", "plant_id_eia", "utility_id_eia"])
return out_df
Expand Down
Loading

0 comments on commit dbf686f

Please sign in to comment.