
Commit bbd82ba
Merge pull request #2915 from catalyst-cooperative/better-calc-checks
Improve calculation error checking
zaneselvans authored Oct 31, 2023
2 parents 7707a34 + ab71e2d commit bbd82ba
Showing 5 changed files with 979 additions and 234 deletions.
24 changes: 13 additions & 11 deletions pyproject.toml
@@ -193,19 +193,21 @@ select = [
"W", # pycodestyle warnings
]
ignore = [
"D401", # Require imperative mood in docstrings.
"D401", # Require imperative mood in docstrings.
"D417",
"E501", # Overlong lines.
"E203", # Space before ':' (black recommends to ignore)
"PD003", # Use of isna rather than isnull
"PD004", # Use of notna rather than notnull
"PD008", # Use of df.at[] rather than df.loc[]
"PD010", # Use of df.stack()
"PD013", # Use of df.unstack()
"PD015", # Use of pd.merge() rather than df.merge()
"PD901", # df as variable name
"E501", # Overlong lines.
"E203", # Space before ':' (black recommends to ignore)
"E226", # Missing whitespace around arithmetic operator
"E266", # Too many leading `#` before block comment
"PD003", # Use of isna rather than isnull
"PD004", # Use of notna rather than notnull
"PD008", # Use of df.at[] rather than df.loc[]
"PD010", # Use of df.stack()
"PD013", # Use of df.unstack()
"PD015", # Use of pd.merge() rather than df.merge()
"PD901", # df as variable name
"RET504", # Ignore unnecessary assignment before return
"S101", # Use of assert
"S101", # Use of assert
]

# Assume Python 3.11
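
The newly ignored E226 and E266 are pycodestyle-style rules; a short, hypothetical snippet (not from the PUDL codebase) showing what each rule flags, which is why they now appear in the ignore list:

## Block comments with more than one leading "#" are flagged as E266.
fuel_cost, heat_rate, fixed_om = 2.5, 10.0, 1.0
total_cost = fuel_cost*heat_rate + fixed_om  # "fuel_cost*heat_rate" omits spaces around "*": E226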
166 changes: 116 additions & 50 deletions src/pudl/output/ferc1.py
@@ -12,42 +12,108 @@
from matplotlib import pyplot as plt
from networkx.drawing.nx_agraph import graphviz_layout
from pandas._libs.missing import NAType as pandas_NAType
from pydantic import BaseModel, confloat, validator
from pydantic import BaseModel, validator

import pudl
from pudl.transform.ferc1 import (
GroupMetricChecks,
GroupMetricTolerances,
MetricTolerances,
)

logger = pudl.logging_helpers.get_logger(__name__)


class CalculationToleranceFerc1(BaseModel):
"""Data quality expectations related to FERC 1 calculations.
We are doing a lot of comparisons between calculated and reported values to identify
reporting errors in the data, errors in FERC's metadata, and bugs in our own code.
This class provides a structure for encoding our expectations about the level of
acceptable (or at least expected) errors, and allows us to pass them around.
In the future we might also want to specify much more granular expectations,
pertaining to individual tables, years, utilities, or facts to ensure that we don't
have low overall error rates, but a problem with the way the data or metadata is
reported in a particular year. We could also define per-filing and per-table error
tolerances to help us identify individual utilities that have e.g. used an outdated
version of Form 1 when filing.
"""

intertable_calculation_errors: confloat(ge=0.0, le=1.0) = 0.05
"""Fraction of interatble calculations that are allowed to not match exactly."""


EXPLOSION_CALCULATION_TOLERANCES: dict[str, CalculationToleranceFerc1] = {
"income_statement_ferc1": CalculationToleranceFerc1(
intertable_calculation_errors=0.20,
EXPLOSION_CALCULATION_TOLERANCES: dict[str, GroupMetricChecks] = {
"income_statement_ferc1": GroupMetricChecks(
groups_to_check=[
"ungrouped",
"report_year",
"xbrl_factoid",
"utility_id_ferc1",
],
group_metric_tolerances=GroupMetricTolerances(
ungrouped=MetricTolerances(
error_frequency=0.02,
relative_error_magnitude=0.04,
null_calculated_value_frequency=1.0,
),
report_year=MetricTolerances(
error_frequency=0.036,
relative_error_magnitude=0.048,
null_calculated_value_frequency=1.0,
),
xbrl_factoid=MetricTolerances(
error_frequency=0.35,
relative_error_magnitude=0.17,
null_calculated_value_frequency=1.0,
),
utility_id_ferc1=MetricTolerances(
error_frequency=0.13,
relative_error_magnitude=0.42,
null_calculated_value_frequency=1.0,
),
),
),
"balance_sheet_assets_ferc1": CalculationToleranceFerc1(
intertable_calculation_errors=0.65,
"balance_sheet_assets_ferc1": GroupMetricChecks(
groups_to_check=[
"ungrouped",
"report_year",
"xbrl_factoid",
"utility_id_ferc1",
],
group_metric_tolerances=GroupMetricTolerances(
ungrouped=MetricTolerances(
error_frequency=0.014,
relative_error_magnitude=0.04,
null_calculated_value_frequency=1.0,
),
report_year=MetricTolerances(
error_frequency=0.12,
relative_error_magnitude=0.04,
null_calculated_value_frequency=1.0,
),
xbrl_factoid=MetricTolerances(
error_frequency=0.37,
relative_error_magnitude=0.22,
null_calculated_value_frequency=1.0,
),
utility_id_ferc1=MetricTolerances(
error_frequency=0.21,
relative_error_magnitude=0.26,
null_calculated_value_frequency=1.0,
),
),
),
"balance_sheet_liabilities_ferc1": CalculationToleranceFerc1(
intertable_calculation_errors=0.07,
"balance_sheet_liabilities_ferc1": GroupMetricChecks(
groups_to_check=[
"ungrouped",
"report_year",
"xbrl_factoid",
"utility_id_ferc1",
],
group_metric_tolerances=GroupMetricTolerances(
ungrouped=MetricTolerances(
error_frequency=0.028,
relative_error_magnitude=0.019,
null_calculated_value_frequency=1.0,
),
report_year=MetricTolerances(
error_frequency=0.028,
relative_error_magnitude=0.04,
null_calculated_value_frequency=1.0,
),
xbrl_factoid=MetricTolerances(
error_frequency=0.028,
relative_error_magnitude=0.019,
null_calculated_value_frequency=1.0,
),
utility_id_ferc1=MetricTolerances(
error_frequency=0.063,
relative_error_magnitude=0.04,
null_calculated_value_frequency=1.0,
),
),
),
}
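
For illustration, a tolerance specification for a hypothetical additional table could be written in the same shape; the table context and numeric values below are invented, while the class and field names are the ones imported above (this sketch assumes any omitted groups keep their default tolerances):

# Hypothetical example only -- not part of this commit.
example_checks = GroupMetricChecks(
    groups_to_check=["ungrouped", "report_year"],
    group_metric_tolerances=GroupMetricTolerances(
        ungrouped=MetricTolerances(
            error_frequency=0.01,  # share of calculations allowed to disagree with reported values
            relative_error_magnitude=0.02,  # allowed size of disagreements relative to reported magnitudes
            null_calculated_value_frequency=1.0,  # share of calculated values allowed to be null
        ),
        report_year=MetricTolerances(
            error_frequency=0.05,
            relative_error_magnitude=0.05,
            null_calculated_value_frequency=1.0,
        ),
    ),
)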

@@ -980,7 +1046,7 @@ def exploded_table_asset_factory(
root_table: str,
table_names_to_explode: list[str],
seed_nodes: list[NodeId],
calculation_tolerance: CalculationToleranceFerc1,
group_metric_checks: GroupMetricChecks,
io_manager_key: str | None = None,
) -> AssetsDefinition:
"""Create an exploded table based on a set of related input tables."""
@@ -1017,7 +1083,7 @@ def exploded_tables_asset(
calculation_components_xbrl_ferc1=calculation_components_xbrl_ferc1,
seed_nodes=seed_nodes,
tags=tags,
calculation_tolerance=calculation_tolerance,
group_metric_checks=group_metric_checks,
).boom(tables_to_explode=tables_to_explode)

return exploded_tables_asset
@@ -1039,7 +1105,7 @@ def create_exploded_table_assets() -> list[AssetsDefinition]:
"electric_operating_expenses_ferc1",
"electric_operating_revenues_ferc1",
],
"calculation_tolerance": EXPLOSION_CALCULATION_TOLERANCES[
"group_metric_checks": EXPLOSION_CALCULATION_TOLERANCES[
"income_statement_ferc1"
],
"seed_nodes": [
@@ -1060,7 +1126,7 @@ def create_exploded_table_assets() -> list[AssetsDefinition]:
"plant_in_service_ferc1",
"electric_plant_depreciation_functional_ferc1",
],
"calculation_tolerance": EXPLOSION_CALCULATION_TOLERANCES[
"group_metric_checks": EXPLOSION_CALCULATION_TOLERANCES[
"balance_sheet_assets_ferc1"
],
"seed_nodes": [
@@ -1079,7 +1145,7 @@ def create_exploded_table_assets() -> list[AssetsDefinition]:
"balance_sheet_liabilities_ferc1",
"retained_earnings_ferc1",
],
"calculation_tolerance": EXPLOSION_CALCULATION_TOLERANCES[
"group_metric_checks": EXPLOSION_CALCULATION_TOLERANCES[
"balance_sheet_liabilities_ferc1"
],
"seed_nodes": [
@@ -1110,7 +1176,7 @@ def __init__(
calculation_components_xbrl_ferc1: pd.DataFrame,
seed_nodes: list[NodeId],
tags: pd.DataFrame = pd.DataFrame(),
calculation_tolerance: CalculationToleranceFerc1 = CalculationToleranceFerc1(),
group_metric_checks: GroupMetricChecks = GroupMetricChecks(),
):
"""Instantiate an Exploder class.
@@ -1124,7 +1190,7 @@ def __init__(
"""
self.table_names: list[str] = table_names
self.root_table: str = root_table
self.calculation_tolerance = calculation_tolerance
self.group_metric_checks = group_metric_checks
self.metadata_xbrl_ferc1 = metadata_xbrl_ferc1
self.calculation_components_xbrl_ferc1 = calculation_components_xbrl_ferc1
self.seed_nodes = seed_nodes
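
With the renamed argument, wiring up the class might look roughly like this (a sketch based on the signature and attribute assignments above; the dataframes, seed nodes, and tags are placeholders that would come from the PUDL pipeline):

# Sketch only: inputs are placeholders, not values taken from this commit.
exploder = Exploder(
    table_names=["income_statement_ferc1", "electric_operating_revenues_ferc1"],
    root_table="income_statement_ferc1",
    metadata_xbrl_ferc1=metadata_xbrl_ferc1,
    calculation_components_xbrl_ferc1=calculation_components_xbrl_ferc1,
    seed_nodes=seed_nodes,
    tags=tags,
    group_metric_checks=EXPLOSION_CALCULATION_TOLERANCES["income_statement_ferc1"],
)
exploded_df = exploder.boom(tables_to_explode=tables_to_explode)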
@@ -1302,7 +1368,7 @@ def calculation_forest(self: Self) -> "XbrlCalculationForestFerc1":
exploded_meta=self.exploded_meta,
seeds=self.seed_nodes,
tags=self.tags,
calculation_tolerance=self.calculation_tolerance,
group_metric_checks=self.group_metric_checks,
)

@cached_property
@@ -1366,8 +1432,6 @@ def boom(self: Self, tables_to_explode: dict[str, pd.DataFrame]) -> pd.DataFrame
Args:
tables_to_explode: dictionary of table name (key) to transfomed table (value).
calculation_tolerance: What proportion (0-1) of calculated values are
allowed to be incorrect without raising an AssertionError.
"""
exploded = (
self.initial_explosion_concatenation(tables_to_explode)
@@ -1454,7 +1518,7 @@ def reconcile_intertable_calculations(
components originate entirely or partially outside of the table. It also
accounts for components that only sum to a factoid within a particular dimension
(e.g., for an electric utility or for plants whose plant_function is
"in_service"). This returns a dataframe with a "calculated_amount" column.
"in_service"). This returns a dataframe with a "calculated_value" column.
Args:
exploded: concatenated tables for table explosion.
@@ -1479,11 +1543,13 @@
value_col=self.value_col,
)
calculated_df = pudl.transform.ferc1.check_calculation_metrics(
calculated_df=calculated_df, group_metric_checks=self.group_metric_checks
)
calculated_df = pudl.transform.ferc1.add_corrections(
calculated_df=calculated_df,
value_col=self.value_col,
calculation_tolerance=self.calculation_tolerance.intertable_calculation_errors,
is_close_tolerance=pudl.transform.ferc1.IsCloseTolerance(),
table_name=self.root_table,
add_corrections=True,
)
logger.info("Checking sub-total calcs.")
subtotal_calcs = pudl.transform.ferc1.calculate_values_from_components(
@@ -1496,10 +1562,7 @@
)
subtotal_calcs = pudl.transform.ferc1.check_calculation_metrics(
calculated_df=subtotal_calcs,
value_col=self.value_col,
calculation_tolerance=self.calculation_tolerance.intertable_calculation_errors,
table_name=self.root_table,
add_corrections=True,
group_metric_checks=self.group_metric_checks,
)
return calculated_df

@@ -1551,7 +1614,7 @@ class XbrlCalculationForestFerc1(BaseModel):
exploded_calcs: pd.DataFrame = pd.DataFrame()
seeds: list[NodeId] = []
tags: pd.DataFrame = pd.DataFrame()
calculation_tolerance: CalculationToleranceFerc1 = CalculationToleranceFerc1()
group_metric_checks: GroupMetricChecks = GroupMetricChecks()

class Config:
"""Allow the class to store a dataframe."""
@@ -1785,8 +1848,8 @@ def check_conflicting_tags(annotated_forest: nx.DiGraph) -> None:
nodes = annotated_forest.nodes
for ancestor in nodes:
for descendant in nx.descendants(annotated_forest, ancestor):
for tag in nodes[ancestor]["tags"]:
if tag in nodes[descendant]["tags"]:
for tag in nodes[ancestor].get("tags", {}):
if tag in nodes[descendant].get("tags", {}):
ancestor_tag_value = nodes[ancestor]["tags"][tag]
descendant_tag_value = nodes[descendant]["tags"][tag]
if ancestor_tag_value != descendant_tag_value:
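
The change from nodes[ancestor]["tags"] to nodes[ancestor].get("tags", {}) makes the conflict check tolerant of forest nodes that have no tags attribute at all. A standalone networkx illustration of the difference (an example for this writeup, not PUDL code):

import networkx as nx

g = nx.DiGraph()
g.add_node("tagged_node", tags={"in_rate_base": "yes"})
g.add_node("untagged_node")  # this node never got a "tags" attribute

print(g.nodes["untagged_node"].get("tags", {}))  # {} -- safe to iterate, the loop body simply never runs
try:
    g.nodes["untagged_node"]["tags"]
except KeyError:
    print("direct indexing raises KeyError on untagged nodes")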
@@ -2069,7 +2132,7 @@ def leafy_meta(self: Self) -> pd.DataFrame:
leaf_tags = {}
ancestors = list(nx.ancestors(self.annotated_forest, leaf)) + [leaf]
for node in ancestors:
leaf_tags |= self.annotated_forest.nodes[node]["tags"]
leaf_tags |= self.annotated_forest.nodes[node].get("tags", {})
all_leaf_weights = {
self._get_path_weight(path, self.annotated_forest)
for path in nx.all_simple_paths(
@@ -2273,5 +2336,8 @@ def nodes_to_df(calc_forest: nx.DiGraph, nodes: list[NodeId]) -> pd.DataFrame:
}
index = pd.DataFrame(node_dict.keys()).astype("string")
data = pd.DataFrame(node_dict.values())
tags = pd.json_normalize(data.tags).astype("string")
try:
tags = pd.json_normalize(data.tags).astype("string")
except AttributeError:
tags = pd.DataFrame()
return pd.concat([index, tags], axis="columns")
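
The new try/except handles forests in which no node carries any tags: the intermediate dataframe then has no tags column, and attribute access on it raises AttributeError. A standalone pandas illustration (an example for this writeup, not PUDL code):

import pandas as pd

with_tags = pd.DataFrame([{"tags": {"in_rate_base": "yes"}}])
without_tags = pd.DataFrame([{"xbrl_factoid": "utility_plant"}])

print(pd.json_normalize(with_tags.tags))  # one column per tag key
try:
    without_tags.tags  # no "tags" column, so attribute access fails
except AttributeError:
    tags = pd.DataFrame()  # fall back to an empty frame, as nodes_to_df now does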
