From 47304b388930b864397b4d67f6fdef4824db0624 Mon Sep 17 00:00:00 2001
From: Dazhong Xia <dazhong.xia@catalyst.coop>
Date: Tue, 31 Oct 2023 11:21:10 -0400
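Subject: [PATCH] Only generate alphanumeric entity IDs in test - non-printable
 characters seem to break groupby. (#2993)

Restrict the generated entity_id values in example_schema to the pattern
[0-9a-zA-Z]+ and remove the xfail marker from
test_filter_for_freshest_data. Also fix a garbled comment in that test and
include the context column names and values in the hypothesis note about
the input contexts.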

---
 test/unit/io_managers_test.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/test/unit/io_managers_test.py b/test/unit/io_managers_test.py
index 29003400e5..a4ba43efc2 100644
--- a/test/unit/io_managers_test.py
+++ b/test/unit/io_managers_test.py
@@ -353,7 +353,9 @@ def test_ferc_xbrl_sqlite_io_manager_dedupes(mocker, tmp_path):
 
 example_schema = pandera.DataFrameSchema(
     {
-        "entity_id": pandera.Column(str, nullable=False),
+        "entity_id": pandera.Column(
+            str, pandera.Check.str_matches(r"[0-9a-zA-Z]+"), nullable=False
+        ),
         "date": pandera.Column("datetime64[ns]", nullable=False),
         "utility_type": pandera.Column(
             str,
@@ -368,7 +370,6 @@ def test_ferc_xbrl_sqlite_io_manager_dedupes(mocker, tmp_path):
 )
 
 
-@pytest.mark.xfail
 @hypothesis.given(example_schema.strategy(size=3))
 def test_filter_for_freshest_data(df):
     # XBRL context is the identifying metadata for reported values
@@ -382,7 +383,7 @@ def test_filter_for_freshest_data(df):
 
     # every post-deduplication row exists in the original rows
     assert (deduped.merge(df, how="left", indicator=True)._merge != "left_only").all()
-    # for every [entity_id, utility_type, date] - th"true"e is only one row
+    # for every [entity_id, utility_type, date] - there is only one row
     assert (~deduped.duplicated(subset=xbrl_context_cols)).all()
     # for every *context* in the input there is a corresponding row in the output
     original_contexts = df.groupby(xbrl_context_cols, as_index=False).last()
@@ -393,7 +394,9 @@ def test_filter_for_freshest_data(df):
         suffixes=["_in", "_out"],
         indicator=True,
     ).set_index(xbrl_context_cols)
-    hypothesis.note(f"Found these contexts in input data:\n{original_contexts}")
+    hypothesis.note(
+        f"Found these contexts ({xbrl_context_cols}) in input data:\n{original_contexts[xbrl_context_cols]}"
+    )
     hypothesis.note(f"The freshest data:\n{deduped}")
     hypothesis.note(f"Paired by context:\n{paired_by_context}")
     assert (paired_by_context._merge == "both").all()