Fix datetime-related assertions and warnings in pytests (#14673)

This PR fixes all `datetime`-related pytests by updating their assertions to account for bug fixes made in pandas 2.x, and by filtering newly introduced warnings that do not need to propagate to the end user.

On `pandas_2.0_feature_branch`:
```
= 198 failed, 101241 passed, 2091 skipped, 954 xfailed, 312 xpassed in 1098.81s (0:18:18) =
```

This PR:
```
= 161 failed, 101280 passed, 2091 skipped, 952 xfailed, 312 xpassed in 1106.29s (0:18:26) =
```
rapidsai · Dec 27, 2023 · fd1f986 · fd1f986
1 parent e218f5c
commit fd1f986
Show file tree

Hide file tree

Showing 6 changed files with 45 additions and 84 deletions.
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
@@ -4,6 +4,8 @@
 
 import builtins
 import pickle
+import warnings
+
 from collections import abc
 from functools import cached_property
 from itertools import chain
@@ -2596,7 +2598,11 @@ def _construct_array(
         ):
             # We may have date-like strings with timezones
             try:
-                pd_arbitrary = pd.to_datetime(arbitrary)
+                with warnings.catch_warnings():
+                    # Need to ignore userwarnings when
+                    # datetime format cannot be inferred.
+                    warnings.simplefilter("ignore", UserWarning)
+                    pd_arbitrary = pd.to_datetime(arbitrary)
                 if isinstance(pd_arbitrary.dtype, pd.DatetimeTZDtype):
                     raise NotImplementedError(
                         "cuDF does not yet support timezone-aware datetimes"

diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py
@@ -923,10 +923,14 @@ def date_range(
     # FIXME: when `end_estim` is out of bound, but the actual `end` is not,
     # we shouldn't raise but compute the sequence as is. The trailing overflow
     # part should get trimmed at the end.
-    end_estim = (
-        pd.Timestamp(start.value)
-        + periods * offset._maybe_as_fast_pandas_offset()
-    ).to_datetime64()
+    with warnings.catch_warnings():
+        # Need to ignore userwarnings where nonzero nanoseconds
+        # are dropped in conversion during the binops
+        warnings.simplefilter("ignore", UserWarning)
+        end_estim = (
+            pd.Timestamp(start.value)
+            + periods * offset._maybe_as_fast_pandas_offset()
+        ).to_datetime64()
 
     if "months" in offset.kwds or "years" in offset.kwds:
         # If `offset` is non-fixed frequency, resort to libcudf.

diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py
@@ -642,21 +642,10 @@ def test_cudf_to_datetime(data, dayfirst):
     expected = pd.to_datetime(pd_data, dayfirst=dayfirst)
     actual = cudf.to_datetime(gd_data, dayfirst=dayfirst)
 
-    # TODO: Remove typecast to `ns` and following if/else
-    # workaround after following issue is fixed:
-    # https://github.com/pandas-dev/pandas/issues/52449
-
-    if actual is not None and expected is not None:
-        assert_eq(
-            actual.astype(pd_data.dtype)
-            if pd_data is not None
-            and hasattr(pd_data, "dtype")
-            and cudf.api.types.is_datetime_dtype(pd_data.dtype)
-            else actual.astype("datetime64[ns]"),
-            expected,
-        )
+    if isinstance(expected, pd.Series):
+        assert_eq(actual, expected, check_dtype=False)
     else:
-        assert_eq(actual, expected)
+        assert_eq(actual, expected, check_exact=False)
 
 
 @pytest.mark.parametrize(
@@ -748,11 +737,10 @@ def test_to_datetime_units(data, unit):
     expected = pd.to_datetime(pd_data, unit=unit)
     actual = cudf.to_datetime(gd_data, unit=unit)
 
-    # TODO: Remove typecast to `ns` after following
-    # issue is fixed:
-    # https://github.com/pandas-dev/pandas/issues/52449
-
-    assert_eq(actual.astype("datetime64[ns]"), expected)
+    if isinstance(expected, pd.Series):
+        assert_eq(actual, expected, check_dtype=False)
+    else:
+        assert_eq(actual, expected, exact=False, check_exact=False)
 
 
 @pytest.mark.parametrize(
@@ -810,11 +798,11 @@ def test_to_datetime_format(data, format, infer_datetime_format):
         actual = cudf.to_datetime(
             gd_data, format=format, infer_datetime_format=infer_datetime_format
         )
-    # TODO: Remove typecast to `ns` after following
-    # issue is fixed:
-    # https://github.com/pandas-dev/pandas/issues/52449
 
-    assert_eq(actual.astype("datetime64[ns]"), expected)
+    if isinstance(expected, pd.Series):
+        assert_eq(actual, expected, check_dtype=False)
+    else:
+        assert_eq(actual, expected, check_exact=False)
 
 
 def test_to_datetime_data_out_of_range_for_format():
@@ -879,11 +867,8 @@ def test_datetime_scalar_timeunit_cast(timeunit):
 
     gs = Series(testscalar)
     ps = pd.Series(testscalar)
-    # TODO: Remove typecast to `ns` after following
-    # issue is fixed:
-    # https://github.com/pandas-dev/pandas/issues/52449
 
-    assert_eq(ps, gs.astype("datetime64[ns]"))
+    assert_eq(ps, gs, check_dtype=False)
 
     gdf = DataFrame()
     gdf["a"] = np.arange(5)
@@ -894,11 +879,7 @@ def test_datetime_scalar_timeunit_cast(timeunit):
     pdf["b"] = testscalar
 
     assert gdf["b"].dtype == cudf.dtype("datetime64[s]")
-    # TODO: Remove typecast to `ns` after following
-    # issue is fixed:
-    # https://github.com/pandas-dev/pandas/issues/52449
-    gdf["b"] = gdf["b"].astype("datetime64[ns]")
-    assert_eq(pdf, gdf)
+    assert_eq(pdf, gdf, check_dtype=True)
 
 
 @pytest.mark.parametrize(
@@ -1328,14 +1309,13 @@ def test_datetime_infer_format(data, timezone, dtype):
 
         assert_eq(expected, actual)
     else:
-        with pytest.raises(NotImplementedError):
-            assert_exceptions_equal(
-                lfunc=psr.astype,
-                rfunc=sr.astype,
-                lfunc_args_and_kwargs=([], {"dtype": dtype}),
-                rfunc_args_and_kwargs=([], {"dtype": dtype}),
-                check_exception_type=False,
-            )
+        assert_exceptions_equal(
+            lfunc=psr.astype,
+            rfunc=sr.astype,
+            lfunc_args_and_kwargs=([], {"dtype": dtype}),
+            rfunc_args_and_kwargs=([], {"dtype": dtype}),
+            check_exception_type=False,
+        )
 
 
 def test_dateoffset_instance_subclass_check():
@@ -1634,7 +1614,8 @@ def test_date_range_end_freq_periods(request, end, freq, periods):
     request.applymarker(
         pytest.mark.xfail(
             condition=(
-                "nanoseconds" in freq
+                not PANDAS_GE_210
+                and "nanoseconds" in freq
                 and periods != 1
                 and end == "1970-01-01 00:00:00"
             ),
@@ -2268,7 +2249,7 @@ def test_format_timezone_not_implemented(code):
 
 @pytest.mark.parametrize("tz", ["Z", "UTC-3", "+01:00"])
 def test_no_format_timezone_not_implemented(tz):
-    with pytest.raises(NotImplementedError):
+    with pytest.raises((NotImplementedError, ValueError)):
         cudf.to_datetime([f"2020-01-01 00:00:00{tz}"])
 
 

diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py
@@ -787,13 +787,7 @@ def test_join_datetimes_index(dtype):
 
     assert gdf["d"].dtype == cudf.dtype(dtype)
 
-    if PANDAS_GE_200:
-        # TODO: Remove typecast to `ns` after following
-        # issue is fixed:
-        # https://github.com/pandas-dev/pandas/issues/52449
-        gdf = gdf.astype("datetime64[ns]")
-
-    assert_join_results_equal(pdf, gdf, how="inner")
+    assert_join_results_equal(pdf, gdf, how="inner", check_dtype=False)
 
 
 def test_join_with_different_names():

diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py
@@ -21,7 +21,6 @@
     gen_rand_series,
     supported_numpy_dtypes,
 )
-from cudf.core._compat import PANDAS_GE_200
 
 # Removal of these deprecated features is no longer imminent. They will not be
 # removed until a suitable alternative has been implemented. As a result, we
@@ -160,13 +159,7 @@ def test_orc_reader_datetimestamp(datadir, inputfile, use_index):
     pdf = orcfile.read().to_pandas(date_as_object=False)
     gdf = cudf.read_orc(path, use_index=use_index)
 
-    if PANDAS_GE_200:
-        # TODO: Remove typecast to `ns` after following
-        # issue is fixed:
-        # https://github.com/pandas-dev/pandas/issues/52449
-        gdf = gdf.astype("datetime64[ns]")
-
-    assert_eq(pdf, gdf, check_categorical=False)
+    assert_eq(pdf, gdf, check_categorical=False, check_exact=False)
 
 
 def test_orc_reader_strings(datadir):
@@ -1832,13 +1825,7 @@ def test_orc_reader_negative_timestamp(negative_timestamp_df, engine):
     with expect_warning_if(engine == "pyarrow", UserWarning):
         got = cudf.read_orc(buffer, engine=engine)
 
-    if PANDAS_GE_200:
-        # TODO: Remove typecast to `ns` after following
-        # issue is fixed:
-        # https://github.com/pandas-dev/pandas/issues/52449
-        negative_timestamp_df = negative_timestamp_df.astype("datetime64[ns]")
-
-    assert_eq(negative_timestamp_df, got)
+    assert_eq(negative_timestamp_df, got, check_dtype=False)
 
 
 def test_orc_writer_negative_timestamp(negative_timestamp_df):
@@ -1847,14 +1834,10 @@ def test_orc_writer_negative_timestamp(negative_timestamp_df):
     buffer = BytesIO()
     negative_timestamp_df.to_orc(buffer)
 
-    if PANDAS_GE_200:
-        # TODO: Remove typecast to `ns` after following
-        # issue is fixed:
-        # https://github.com/pandas-dev/pandas/issues/52449
-        negative_timestamp_df = negative_timestamp_df.astype("datetime64[ns]")
-
-    assert_eq(negative_timestamp_df, pd.read_orc(buffer))
-    assert_eq(negative_timestamp_df, orc.ORCFile(buffer).read())
+    assert_eq(negative_timestamp_df, pd.read_orc(buffer), check_dtype=False)
+    assert_eq(
+        negative_timestamp_df, orc.ORCFile(buffer).read(), check_dtype=False
+    )
 
 
 def test_orc_reader_apache_negative_timestamp(datadir):

diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
@@ -632,13 +632,6 @@ def test_parquet_reader_microsecond_timestamps(datadir):
     expect = pd.read_parquet(fname)
     got = cudf.read_parquet(fname)
 
-    if PANDAS_GE_200:
-        # TODO: Remove typecast to `ns` after following
-        # issue is fixed:
-        # https://github.com/pandas-dev/pandas/issues/52449
-        assert got["a"].dtype == cudf.dtype("datetime64[us]")
-        got = got.astype("datetime64[ns]")
-
     assert_eq(expect, got)