Skip to content

Commit

Permalink
Fix datetime64 related inconsistencies in pytests (#13175)
Browse files Browse the repository at this point in the history
This PR fixes `datetime64`-related pytest failures that occur because pandas returns `ns` time resolution in many cases, mostly in the IO APIs.

Fixes 72 pytest failures. With this PR:
```
= 484 failed, 88162 passed, 2044 skipped, 932 xfailed, 165 xpassed in 444.53s (0:07:24) =
```
On `pandas_2.0_feature_branch`:
```
= 556 failed, 88090 passed, 2044 skipped, 932 xfailed, 165 xpassed in 456.49s (0:07:36) =
```
  • Loading branch information
galipremsagar authored Apr 20, 2023
1 parent 69af242 commit 901a971
Show file tree
Hide file tree
Showing 8 changed files with 94 additions and 14 deletions.
7 changes: 6 additions & 1 deletion python/cudf/cudf/tests/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

import cudf
from cudf import read_csv
from cudf.core._compat import PANDAS_LT_140
from cudf.core._compat import PANDAS_LT_140, PANDAS_GE_200
from cudf.testing._utils import assert_eq, assert_exceptions_equal


Expand Down Expand Up @@ -367,6 +367,11 @@ def test_csv_reader_skiprows_skipfooter(tmpdir, pd_mixed_dataframe):

assert len(out.columns) == len(df_out.columns)
assert len(out) == len(df_out)
if PANDAS_GE_200:
# TODO: Remove typecast to `ns` after following
# issue is fixed:
# https://github.com/pandas-dev/pandas/issues/52449
out["2"] = out["2"].astype("datetime64[ns]")
assert_eq(df_out, out)


Expand Down
32 changes: 28 additions & 4 deletions python/cudf/cudf/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2648,7 +2648,13 @@ def test_groupby_freq_week(label, closed):
got = gdf.groupby(
cudf.Grouper(key="Publish date", freq="1W", label=label, closed=closed)
).mean()
assert_eq(expect, got, check_like=True, check_dtype=False)
assert_eq(
expect,
got,
check_like=True,
check_dtype=False,
check_index_type=not PANDAS_GE_200,
)


@pytest.mark.parametrize("label", [None, "left", "right"])
Expand All @@ -2675,7 +2681,13 @@ def test_groupby_freq_day(label, closed):
got = gdf.groupby(
cudf.Grouper(key="Publish date", freq="3D", label=label, closed=closed)
).mean()
assert_eq(expect, got, check_like=True, check_dtype=False)
assert_eq(
expect,
got,
check_like=True,
check_dtype=False,
check_index_type=not PANDAS_GE_200,
)


@pytest.mark.parametrize("label", [None, "left", "right"])
Expand All @@ -2702,7 +2714,13 @@ def test_groupby_freq_min(label, closed):
got = gdf.groupby(
cudf.Grouper(key="Publish date", freq="1h", label=label, closed=closed)
).mean()
assert_eq(expect, got, check_like=True, check_dtype=False)
assert_eq(
expect,
got,
check_like=True,
check_dtype=False,
check_index_type=not PANDAS_GE_200,
)


@pytest.mark.parametrize("label", [None, "left", "right"])
Expand All @@ -2729,7 +2747,13 @@ def test_groupby_freq_s(label, closed):
got = gdf.groupby(
cudf.Grouper(key="Publish date", freq="3s", label=label, closed=closed)
).mean()
assert_eq(expect, got, check_like=True, check_dtype=False)
assert_eq(
expect,
got,
check_like=True,
check_dtype=False,
check_index_type=not PANDAS_GE_200,
)


@pytest.mark.parametrize(
Expand Down
8 changes: 7 additions & 1 deletion python/cudf/cudf/tests/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1764,7 +1764,13 @@ def test_index_from_arrow(data):
arrow_array = pa.Array.from_pandas(pdi)
expected_index = pd.Index(arrow_array.to_pandas())
gdi = cudf.Index.from_arrow(arrow_array)

if PANDAS_GE_200:
# Arrow bug:
# https://github.com/apache/arrow/issues/33321
# arrow cannot convert non-nanosecond
# resolution to appropriate type in pandas.
# Hence need to type-cast.
expected_index = expected_index.astype(gdi.dtype)
assert_eq(expected_index, gdi)


Expand Down
7 changes: 7 additions & 0 deletions python/cudf/cudf/tests/test_joining.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
assert_exceptions_equal,
expect_warning_if,
)
from cudf.core._compat import PANDAS_GE_200

_JOIN_TYPES = ("left", "inner", "outer", "right", "leftanti", "leftsemi")

Expand Down Expand Up @@ -785,6 +786,12 @@ def test_join_datetimes_index(dtype):

assert gdf["d"].dtype == cudf.dtype(dtype)

if PANDAS_GE_200:
# TODO: Remove typecast to `ns` after following
# issue is fixed:
# https://github.com/pandas-dev/pandas/issues/52449
gdf = gdf.astype("datetime64[ns]")

assert_join_results_equal(pdf, gdf, how="inner")


Expand Down
21 changes: 20 additions & 1 deletion python/cudf/cudf/tests/test_orc.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2019-2022, NVIDIA CORPORATION.
# Copyright (c) 2019-2023, NVIDIA CORPORATION.

import datetime
import decimal
Expand All @@ -23,6 +23,7 @@
gen_rand_series,
supported_numpy_dtypes,
)
from cudf.core._compat import PANDAS_GE_200

# Removal of these deprecated features is no longer imminent. They will not be
# removed until a suitable alternative has been implemented. As a result, we
Expand Down Expand Up @@ -159,6 +160,12 @@ def test_orc_reader_datetimestamp(datadir, inputfile, use_index):
pdf = orcfile.read().to_pandas(date_as_object=False)
gdf = cudf.read_orc(path, use_index=use_index)

if PANDAS_GE_200:
# TODO: Remove typecast to `ns` after following
# issue is fixed:
# https://github.com/pandas-dev/pandas/issues/52449
gdf = gdf.astype("datetime64[ns]")

assert_eq(pdf, gdf, check_categorical=False)


Expand Down Expand Up @@ -1847,13 +1854,25 @@ def test_orc_reader_negative_timestamp(negative_timestamp_df, engine):
with expect_warning_if(engine == "pyarrow", UserWarning):
got = cudf.read_orc(buffer, engine=engine)

if PANDAS_GE_200:
# TODO: Remove typecast to `ns` after following
# issue is fixed:
# https://github.com/pandas-dev/pandas/issues/52449
negative_timestamp_df = negative_timestamp_df.astype("datetime64[ns]")

assert_eq(negative_timestamp_df, got)


def test_orc_writer_negative_timestamp(negative_timestamp_df):
buffer = BytesIO()
negative_timestamp_df.to_orc(buffer)

if PANDAS_GE_200:
# TODO: Remove typecast to `ns` after following
# issue is fixed:
# https://github.com/pandas-dev/pandas/issues/52449
negative_timestamp_df = negative_timestamp_df.astype("datetime64[ns]")

assert_eq(negative_timestamp_df, pd.read_orc(buffer))
assert_eq(negative_timestamp_df, pyarrow.orc.ORCFile(buffer).read())

Expand Down
17 changes: 17 additions & 0 deletions python/cudf/cudf/tests/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -664,6 +664,13 @@ def test_parquet_reader_microsecond_timestamps(datadir):
expect = pd.read_parquet(fname)
got = cudf.read_parquet(fname)

if PANDAS_GE_200:
# TODO: Remove typecast to `ns` after following
# issue is fixed:
# https://github.com/pandas-dev/pandas/issues/52449
assert got["a"].dtype == cudf.dtype("datetime64[us]")
got = got.astype("datetime64[ns]")

assert_eq(expect, got)


Expand Down Expand Up @@ -2513,6 +2520,16 @@ def test_parquet_writer_nulls_pandas_read(tmpdir, pdf):

got = pd.read_parquet(fname)
nullable = num_rows > 0
if PANDAS_GE_200:
# TODO: Remove typecast to `ns` after following
# issue is fixed:
# https://github.com/pandas-dev/pandas/issues/52449
gdf["col_datetime64[ms]"] = gdf["col_datetime64[ms]"].astype(
"datetime64[ns]"
)
gdf["col_datetime64[us]"] = gdf["col_datetime64[us]"].astype(
"datetime64[ns]"
)
assert_eq(gdf.to_pandas(nullable=nullable), got)


Expand Down
4 changes: 3 additions & 1 deletion python/cudf/cudf/tests/test_resampling.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
# Copyright (c) 2021-2022, NVIDIA CORPORATION.
# Copyright (c) 2021-2023, NVIDIA CORPORATION.

import numpy as np
import pandas as pd
import pytest

import cudf
from cudf.testing._utils import assert_eq
from cudf.core._compat import PANDAS_GE_200


def assert_resample_results_equal(lhs, rhs, **kwargs):
Expand All @@ -14,6 +15,7 @@ def assert_resample_results_equal(lhs, rhs, **kwargs):
rhs.sort_index(),
check_dtype=False,
check_freq=False,
check_index_type=not PANDAS_GE_200,
**kwargs,
)

Expand Down
12 changes: 6 additions & 6 deletions python/cudf/cudf/tests/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,12 +200,12 @@ def test_string_astype(dtype):
data = ["True", "False", "True", "False", "False"]
elif dtype.startswith("datetime64"):
data = [
"2019-06-04T00:00:00Z",
"2019-06-04T12:12:12Z",
"2019-06-03T00:00:00Z",
"2019-05-04T00:00:00Z",
"2018-06-04T00:00:00Z",
"1922-07-21T01:02:03Z",
"2019-06-04T00:00:00",
"2019-06-04T12:12:12",
"2019-06-03T00:00:00",
"2019-05-04T00:00:00",
"2018-06-04T00:00:00",
"1922-07-21T01:02:03",
]
elif dtype == "str" or dtype == "object":
data = ["ab", "cd", "ef", "gh", "ij"]
Expand Down

0 comments on commit 901a971

Please sign in to comment.