From 47304b388930b864397b4d67f6fdef4824db0624 Mon Sep 17 00:00:00 2001 From: Dazhong Xia Date: Tue, 31 Oct 2023 11:21:10 -0400 Subject: [PATCH] Only generate alphanumeric entity IDs in test - non-printable characters seem to break groupby. (#2993) --- test/unit/io_managers_test.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/test/unit/io_managers_test.py b/test/unit/io_managers_test.py index 29003400e5..a4ba43efc2 100644 --- a/test/unit/io_managers_test.py +++ b/test/unit/io_managers_test.py @@ -353,7 +353,9 @@ def test_ferc_xbrl_sqlite_io_manager_dedupes(mocker, tmp_path): example_schema = pandera.DataFrameSchema( { - "entity_id": pandera.Column(str, nullable=False), + "entity_id": pandera.Column( + str, pandera.Check.str_matches(r"[0-9a-zA-Z]+"), nullable=False + ), "date": pandera.Column("datetime64[ns]", nullable=False), "utility_type": pandera.Column( str, @@ -368,7 +370,6 @@ def test_ferc_xbrl_sqlite_io_manager_dedupes(mocker, tmp_path): ) -@pytest.mark.xfail @hypothesis.given(example_schema.strategy(size=3)) def test_filter_for_freshest_data(df): # XBRL context is the identifying metadata for reported values @@ -382,7 +383,7 @@ def test_filter_for_freshest_data(df): # every post-deduplication row exists in the original rows assert (deduped.merge(df, how="left", indicator=True)._merge != "left_only").all() - # for every [entity_id, utility_type, date] - th"true"e is only one row + # for every [entity_id, utility_type, date] - there is only one row assert (~deduped.duplicated(subset=xbrl_context_cols)).all() # for every *context* in the input there is a corresponding row in the output original_contexts = df.groupby(xbrl_context_cols, as_index=False).last() @@ -393,7 +394,9 @@ def test_filter_for_freshest_data(df): suffixes=["_in", "_out"], indicator=True, ).set_index(xbrl_context_cols) - hypothesis.note(f"Found these contexts in input data:\n{original_contexts}") + hypothesis.note( + f"Found these contexts ({xbrl_context_cols}) in input data:\n{original_contexts[xbrl_context_cols]}" + ) hypothesis.note(f"The freshest data:\n{deduped}") hypothesis.note(f"Paired by context:\n{paired_by_context}") assert (paired_by_context._merge == "both").all()