Fix datetime64 related inconsistencies in pytests (#13175)

This PR fixes `datetime64`-related pytest failures where pandas returns `ns` time resolution in many cases, mostly in the IO APIs.

Fixes 72 pytests:
```
= 484 failed, 88162 passed, 2044 skipped, 932 xfailed, 165 xpassed in 444.53s (0:07:24) =
```
On `pandas_2.0_feature_branch`:
```
= 556 failed, 88090 passed, 2044 skipped, 932 xfailed, 165 xpassed in 456.49s (0:07:36) =
```
rapidsai · Apr 20, 2023 · 901a971 · 901a971
1 parent 69af242
commit 901a971
Show file tree

Hide file tree

Showing 8 changed files with 94 additions and 14 deletions.
diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py
@@ -16,7 +16,7 @@
 
 import cudf
 from cudf import read_csv
-from cudf.core._compat import PANDAS_LT_140
+from cudf.core._compat import PANDAS_LT_140, PANDAS_GE_200
 from cudf.testing._utils import assert_eq, assert_exceptions_equal
 
 
@@ -367,6 +367,11 @@ def test_csv_reader_skiprows_skipfooter(tmpdir, pd_mixed_dataframe):
 
     assert len(out.columns) == len(df_out.columns)
     assert len(out) == len(df_out)
+    if PANDAS_GE_200:
+        # TODO: Remove typecast to `ns` after following
+        # issue is fixed:
+        # https://github.com/pandas-dev/pandas/issues/52449
+        out["2"] = out["2"].astype("datetime64[ns]")
     assert_eq(df_out, out)
 
 

diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py
@@ -2648,7 +2648,13 @@ def test_groupby_freq_week(label, closed):
     got = gdf.groupby(
         cudf.Grouper(key="Publish date", freq="1W", label=label, closed=closed)
     ).mean()
-    assert_eq(expect, got, check_like=True, check_dtype=False)
+    assert_eq(
+        expect,
+        got,
+        check_like=True,
+        check_dtype=False,
+        check_index_type=not PANDAS_GE_200,
+    )
 
 
 @pytest.mark.parametrize("label", [None, "left", "right"])
@@ -2675,7 +2681,13 @@ def test_groupby_freq_day(label, closed):
     got = gdf.groupby(
         cudf.Grouper(key="Publish date", freq="3D", label=label, closed=closed)
     ).mean()
-    assert_eq(expect, got, check_like=True, check_dtype=False)
+    assert_eq(
+        expect,
+        got,
+        check_like=True,
+        check_dtype=False,
+        check_index_type=not PANDAS_GE_200,
+    )
 
 
 @pytest.mark.parametrize("label", [None, "left", "right"])
@@ -2702,7 +2714,13 @@ def test_groupby_freq_min(label, closed):
     got = gdf.groupby(
         cudf.Grouper(key="Publish date", freq="1h", label=label, closed=closed)
     ).mean()
-    assert_eq(expect, got, check_like=True, check_dtype=False)
+    assert_eq(
+        expect,
+        got,
+        check_like=True,
+        check_dtype=False,
+        check_index_type=not PANDAS_GE_200,
+    )
 
 
 @pytest.mark.parametrize("label", [None, "left", "right"])
@@ -2729,7 +2747,13 @@ def test_groupby_freq_s(label, closed):
     got = gdf.groupby(
         cudf.Grouper(key="Publish date", freq="3s", label=label, closed=closed)
     ).mean()
-    assert_eq(expect, got, check_like=True, check_dtype=False)
+    assert_eq(
+        expect,
+        got,
+        check_like=True,
+        check_dtype=False,
+        check_index_type=not PANDAS_GE_200,
+    )
 
 
 @pytest.mark.parametrize(

diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py
@@ -1764,7 +1764,13 @@ def test_index_from_arrow(data):
     arrow_array = pa.Array.from_pandas(pdi)
     expected_index = pd.Index(arrow_array.to_pandas())
     gdi = cudf.Index.from_arrow(arrow_array)
-
+    if PANDAS_GE_200:
+        # Arrow bug:
+        # https://github.com/apache/arrow/issues/33321
+        # arrow cannot convert non-nanosecond
+        # resolution to appropriate type in pandas.
+        # Hence need to type-cast.
+        expected_index = expected_index.astype(gdi.dtype)
     assert_eq(expected_index, gdi)
 
 

diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py
@@ -15,6 +15,7 @@
     assert_exceptions_equal,
     expect_warning_if,
 )
+from cudf.core._compat import PANDAS_GE_200
 
 _JOIN_TYPES = ("left", "inner", "outer", "right", "leftanti", "leftsemi")
 
@@ -785,6 +786,12 @@ def test_join_datetimes_index(dtype):
 
     assert gdf["d"].dtype == cudf.dtype(dtype)
 
+    if PANDAS_GE_200:
+        # TODO: Remove typecast to `ns` after following
+        # issue is fixed:
+        # https://github.com/pandas-dev/pandas/issues/52449
+        gdf = gdf.astype("datetime64[ns]")
+
     assert_join_results_equal(pdf, gdf, how="inner")
 
 

diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2022, NVIDIA CORPORATION.
+# Copyright (c) 2019-2023, NVIDIA CORPORATION.
 
 import datetime
 import decimal
@@ -23,6 +23,7 @@
     gen_rand_series,
     supported_numpy_dtypes,
 )
+from cudf.core._compat import PANDAS_GE_200
 
 # Removal of these deprecated features is no longer imminent. They will not be
 # removed until a suitable alternative has been implemented. As a result, we
@@ -159,6 +160,12 @@ def test_orc_reader_datetimestamp(datadir, inputfile, use_index):
     pdf = orcfile.read().to_pandas(date_as_object=False)
     gdf = cudf.read_orc(path, use_index=use_index)
 
+    if PANDAS_GE_200:
+        # TODO: Remove typecast to `ns` after following
+        # issue is fixed:
+        # https://github.com/pandas-dev/pandas/issues/52449
+        gdf = gdf.astype("datetime64[ns]")
+
     assert_eq(pdf, gdf, check_categorical=False)
 
 
@@ -1847,13 +1854,25 @@ def test_orc_reader_negative_timestamp(negative_timestamp_df, engine):
     with expect_warning_if(engine == "pyarrow", UserWarning):
         got = cudf.read_orc(buffer, engine=engine)
 
+    if PANDAS_GE_200:
+        # TODO: Remove typecast to `ns` after following
+        # issue is fixed:
+        # https://github.com/pandas-dev/pandas/issues/52449
+        negative_timestamp_df = negative_timestamp_df.astype("datetime64[ns]")
+
     assert_eq(negative_timestamp_df, got)
 
 
 def test_orc_writer_negative_timestamp(negative_timestamp_df):
     buffer = BytesIO()
     negative_timestamp_df.to_orc(buffer)
 
+    if PANDAS_GE_200:
+        # TODO: Remove typecast to `ns` after following
+        # issue is fixed:
+        # https://github.com/pandas-dev/pandas/issues/52449
+        negative_timestamp_df = negative_timestamp_df.astype("datetime64[ns]")
+
     assert_eq(negative_timestamp_df, pd.read_orc(buffer))
     assert_eq(negative_timestamp_df, pyarrow.orc.ORCFile(buffer).read())
 

diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
@@ -664,6 +664,13 @@ def test_parquet_reader_microsecond_timestamps(datadir):
     expect = pd.read_parquet(fname)
     got = cudf.read_parquet(fname)
 
+    if PANDAS_GE_200:
+        # TODO: Remove typecast to `ns` after following
+        # issue is fixed:
+        # https://github.com/pandas-dev/pandas/issues/52449
+        assert got["a"].dtype == cudf.dtype("datetime64[us]")
+        got = got.astype("datetime64[ns]")
+
     assert_eq(expect, got)
 
 
@@ -2513,6 +2520,16 @@ def test_parquet_writer_nulls_pandas_read(tmpdir, pdf):
 
     got = pd.read_parquet(fname)
     nullable = num_rows > 0
+    if PANDAS_GE_200:
+        # TODO: Remove typecast to `ns` after following
+        # issue is fixed:
+        # https://github.com/pandas-dev/pandas/issues/52449
+        gdf["col_datetime64[ms]"] = gdf["col_datetime64[ms]"].astype(
+            "datetime64[ns]"
+        )
+        gdf["col_datetime64[us]"] = gdf["col_datetime64[us]"].astype(
+            "datetime64[ns]"
+        )
     assert_eq(gdf.to_pandas(nullable=nullable), got)
 
 

diff --git a/python/cudf/cudf/tests/test_resampling.py b/python/cudf/cudf/tests/test_resampling.py
@@ -1,11 +1,12 @@
-# Copyright (c) 2021-2022, NVIDIA CORPORATION.
+# Copyright (c) 2021-2023, NVIDIA CORPORATION.
 
 import numpy as np
 import pandas as pd
 import pytest
 
 import cudf
 from cudf.testing._utils import assert_eq
+from cudf.core._compat import PANDAS_GE_200
 
 
 def assert_resample_results_equal(lhs, rhs, **kwargs):
@@ -14,6 +15,7 @@ def assert_resample_results_equal(lhs, rhs, **kwargs):
         rhs.sort_index(),
         check_dtype=False,
         check_freq=False,
+        check_index_type=not PANDAS_GE_200,
         **kwargs,
     )
 

diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py
@@ -200,12 +200,12 @@ def test_string_astype(dtype):
         data = ["True", "False", "True", "False", "False"]
     elif dtype.startswith("datetime64"):
         data = [
-            "2019-06-04T00:00:00Z",
-            "2019-06-04T12:12:12Z",
-            "2019-06-03T00:00:00Z",
-            "2019-05-04T00:00:00Z",
-            "2018-06-04T00:00:00Z",
-            "1922-07-21T01:02:03Z",
+            "2019-06-04T00:00:00",
+            "2019-06-04T12:12:12",
+            "2019-06-03T00:00:00",
+            "2019-05-04T00:00:00",
+            "2018-06-04T00:00:00",
+            "1922-07-21T01:02:03",
         ]
     elif dtype == "str" or dtype == "object":
         data = ["ab", "cd", "ef", "gh", "ij"]