Skip to content

Commit

Permalink
Only generate alphanumeric entity IDs in test - non-printable characters seem to break groupby. (#2993)
Browse files Browse the repository at this point in the history
  • Loading branch information
jdangerx authored Oct 31, 2023
1 parent bbd82ba commit 47304b3
Showing 1 changed file with 7 additions and 4 deletions.
11 changes: 7 additions & 4 deletions test/unit/io_managers_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -353,7 +353,9 @@ def test_ferc_xbrl_sqlite_io_manager_dedupes(mocker, tmp_path):

example_schema = pandera.DataFrameSchema(
{
"entity_id": pandera.Column(str, nullable=False),
"entity_id": pandera.Column(
str, pandera.Check.str_matches(r"[0-9a-zA-Z]+"), nullable=False
),
"date": pandera.Column("datetime64[ns]", nullable=False),
"utility_type": pandera.Column(
str,
Expand All @@ -368,7 +370,6 @@ def test_ferc_xbrl_sqlite_io_manager_dedupes(mocker, tmp_path):
)


@pytest.mark.xfail
@hypothesis.given(example_schema.strategy(size=3))
def test_filter_for_freshest_data(df):
# XBRL context is the identifying metadata for reported values
Expand All @@ -382,7 +383,7 @@ def test_filter_for_freshest_data(df):

# every post-deduplication row exists in the original rows
assert (deduped.merge(df, how="left", indicator=True)._merge != "left_only").all()
# for every [entity_id, utility_type, date] - th"true"e is only one row
# for every [entity_id, utility_type, date] - there is only one row
assert (~deduped.duplicated(subset=xbrl_context_cols)).all()
# for every *context* in the input there is a corresponding row in the output
original_contexts = df.groupby(xbrl_context_cols, as_index=False).last()
Expand All @@ -393,7 +394,9 @@ def test_filter_for_freshest_data(df):
suffixes=["_in", "_out"],
indicator=True,
).set_index(xbrl_context_cols)
hypothesis.note(f"Found these contexts in input data:\n{original_contexts}")
hypothesis.note(
f"Found these contexts ({xbrl_context_cols}) in input data:\n{original_contexts[xbrl_context_cols]}"
)
hypothesis.note(f"The freshest data:\n{deduped}")
hypothesis.note(f"Paired by context:\n{paired_by_context}")
assert (paired_by_context._merge == "both").all()
Expand Down

0 comments on commit 47304b3

Please sign in to comment.