Skip to content

Commit

Permalink
Fix datetime related assertions and warnings in pytests (#14673)
Browse files Browse the repository at this point in the history
This PR fixes all `datetime`-related pytests by updating their assertions to account for bug fixes made in pandas 2.x, and by filtering newly introduced warnings that do not need to be propagated to the end user.

On `pandas_2.0_feature_branch`:
```
= 198 failed, 101241 passed, 2091 skipped, 954 xfailed, 312 xpassed in 1098.81s (0:18:18) =
```

This PR:
```
= 161 failed, 101280 passed, 2091 skipped, 952 xfailed, 312 xpassed in 1106.29s (0:18:26) =
```
  • Loading branch information
galipremsagar authored Dec 27, 2023
1 parent e218f5c commit fd1f986
Show file tree
Hide file tree
Showing 6 changed files with 45 additions and 84 deletions.
8 changes: 7 additions & 1 deletion python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

import builtins
import pickle
import warnings

from collections import abc
from functools import cached_property
from itertools import chain
Expand Down Expand Up @@ -2596,7 +2598,11 @@ def _construct_array(
):
# We may have date-like strings with timezones
try:
pd_arbitrary = pd.to_datetime(arbitrary)
with warnings.catch_warnings():
# Need to ignore userwarnings when
# datetime format cannot be inferred.
warnings.simplefilter("ignore", UserWarning)
pd_arbitrary = pd.to_datetime(arbitrary)
if isinstance(pd_arbitrary.dtype, pd.DatetimeTZDtype):
raise NotImplementedError(
"cuDF does not yet support timezone-aware datetimes"
Expand Down
12 changes: 8 additions & 4 deletions python/cudf/cudf/core/tools/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -923,10 +923,14 @@ def date_range(
# FIXME: when `end_estim` is out of bound, but the actual `end` is not,
# we shouldn't raise but compute the sequence as is. The trailing overflow
# part should get trimmed at the end.
end_estim = (
pd.Timestamp(start.value)
+ periods * offset._maybe_as_fast_pandas_offset()
).to_datetime64()
with warnings.catch_warnings():
# Need to ignore userwarnings where nonzero nanoseconds
# are dropped in conversion during the binops
warnings.simplefilter("ignore", UserWarning)
end_estim = (
pd.Timestamp(start.value)
+ periods * offset._maybe_as_fast_pandas_offset()
).to_datetime64()

if "months" in offset.kwds or "years" in offset.kwds:
# If `offset` is non-fixed frequency, resort to libcudf.
Expand Down
65 changes: 23 additions & 42 deletions python/cudf/cudf/tests/test_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -642,21 +642,10 @@ def test_cudf_to_datetime(data, dayfirst):
expected = pd.to_datetime(pd_data, dayfirst=dayfirst)
actual = cudf.to_datetime(gd_data, dayfirst=dayfirst)

# TODO: Remove typecast to `ns` and following if/else
# workaround after following issue is fixed:
# https://github.com/pandas-dev/pandas/issues/52449

if actual is not None and expected is not None:
assert_eq(
actual.astype(pd_data.dtype)
if pd_data is not None
and hasattr(pd_data, "dtype")
and cudf.api.types.is_datetime_dtype(pd_data.dtype)
else actual.astype("datetime64[ns]"),
expected,
)
if isinstance(expected, pd.Series):
assert_eq(actual, expected, check_dtype=False)
else:
assert_eq(actual, expected)
assert_eq(actual, expected, check_exact=False)


@pytest.mark.parametrize(
Expand Down Expand Up @@ -748,11 +737,10 @@ def test_to_datetime_units(data, unit):
expected = pd.to_datetime(pd_data, unit=unit)
actual = cudf.to_datetime(gd_data, unit=unit)

# TODO: Remove typecast to `ns` after following
# issue is fixed:
# https://github.com/pandas-dev/pandas/issues/52449

assert_eq(actual.astype("datetime64[ns]"), expected)
if isinstance(expected, pd.Series):
assert_eq(actual, expected, check_dtype=False)
else:
assert_eq(actual, expected, exact=False, check_exact=False)


@pytest.mark.parametrize(
Expand Down Expand Up @@ -810,11 +798,11 @@ def test_to_datetime_format(data, format, infer_datetime_format):
actual = cudf.to_datetime(
gd_data, format=format, infer_datetime_format=infer_datetime_format
)
# TODO: Remove typecast to `ns` after following
# issue is fixed:
# https://github.com/pandas-dev/pandas/issues/52449

assert_eq(actual.astype("datetime64[ns]"), expected)
if isinstance(expected, pd.Series):
assert_eq(actual, expected, check_dtype=False)
else:
assert_eq(actual, expected, check_exact=False)


def test_to_datetime_data_out_of_range_for_format():
Expand Down Expand Up @@ -879,11 +867,8 @@ def test_datetime_scalar_timeunit_cast(timeunit):

gs = Series(testscalar)
ps = pd.Series(testscalar)
# TODO: Remove typecast to `ns` after following
# issue is fixed:
# https://github.com/pandas-dev/pandas/issues/52449

assert_eq(ps, gs.astype("datetime64[ns]"))
assert_eq(ps, gs, check_dtype=False)

gdf = DataFrame()
gdf["a"] = np.arange(5)
Expand All @@ -894,11 +879,7 @@ def test_datetime_scalar_timeunit_cast(timeunit):
pdf["b"] = testscalar

assert gdf["b"].dtype == cudf.dtype("datetime64[s]")
# TODO: Remove typecast to `ns` after following
# issue is fixed:
# https://github.com/pandas-dev/pandas/issues/52449
gdf["b"] = gdf["b"].astype("datetime64[ns]")
assert_eq(pdf, gdf)
assert_eq(pdf, gdf, check_dtype=True)


@pytest.mark.parametrize(
Expand Down Expand Up @@ -1328,14 +1309,13 @@ def test_datetime_infer_format(data, timezone, dtype):

assert_eq(expected, actual)
else:
with pytest.raises(NotImplementedError):
assert_exceptions_equal(
lfunc=psr.astype,
rfunc=sr.astype,
lfunc_args_and_kwargs=([], {"dtype": dtype}),
rfunc_args_and_kwargs=([], {"dtype": dtype}),
check_exception_type=False,
)
assert_exceptions_equal(
lfunc=psr.astype,
rfunc=sr.astype,
lfunc_args_and_kwargs=([], {"dtype": dtype}),
rfunc_args_and_kwargs=([], {"dtype": dtype}),
check_exception_type=False,
)


def test_dateoffset_instance_subclass_check():
Expand Down Expand Up @@ -1634,7 +1614,8 @@ def test_date_range_end_freq_periods(request, end, freq, periods):
request.applymarker(
pytest.mark.xfail(
condition=(
"nanoseconds" in freq
not PANDAS_GE_210
and "nanoseconds" in freq
and periods != 1
and end == "1970-01-01 00:00:00"
),
Expand Down Expand Up @@ -2268,7 +2249,7 @@ def test_format_timezone_not_implemented(code):

@pytest.mark.parametrize("tz", ["Z", "UTC-3", "+01:00"])
def test_no_format_timezone_not_implemented(tz):
with pytest.raises(NotImplementedError):
with pytest.raises((NotImplementedError, ValueError)):
cudf.to_datetime([f"2020-01-01 00:00:00{tz}"])


Expand Down
8 changes: 1 addition & 7 deletions python/cudf/cudf/tests/test_joining.py
Original file line number Diff line number Diff line change
Expand Up @@ -787,13 +787,7 @@ def test_join_datetimes_index(dtype):

assert gdf["d"].dtype == cudf.dtype(dtype)

if PANDAS_GE_200:
# TODO: Remove typecast to `ns` after following
# issue is fixed:
# https://github.com/pandas-dev/pandas/issues/52449
gdf = gdf.astype("datetime64[ns]")

assert_join_results_equal(pdf, gdf, how="inner")
assert_join_results_equal(pdf, gdf, how="inner", check_dtype=False)


def test_join_with_different_names():
Expand Down
29 changes: 6 additions & 23 deletions python/cudf/cudf/tests/test_orc.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
gen_rand_series,
supported_numpy_dtypes,
)
from cudf.core._compat import PANDAS_GE_200

# Removal of these deprecated features is no longer imminent. They will not be
# removed until a suitable alternative has been implemented. As a result, we
Expand Down Expand Up @@ -160,13 +159,7 @@ def test_orc_reader_datetimestamp(datadir, inputfile, use_index):
pdf = orcfile.read().to_pandas(date_as_object=False)
gdf = cudf.read_orc(path, use_index=use_index)

if PANDAS_GE_200:
# TODO: Remove typecast to `ns` after following
# issue is fixed:
# https://github.com/pandas-dev/pandas/issues/52449
gdf = gdf.astype("datetime64[ns]")

assert_eq(pdf, gdf, check_categorical=False)
assert_eq(pdf, gdf, check_categorical=False, check_exact=False)


def test_orc_reader_strings(datadir):
Expand Down Expand Up @@ -1832,13 +1825,7 @@ def test_orc_reader_negative_timestamp(negative_timestamp_df, engine):
with expect_warning_if(engine == "pyarrow", UserWarning):
got = cudf.read_orc(buffer, engine=engine)

if PANDAS_GE_200:
# TODO: Remove typecast to `ns` after following
# issue is fixed:
# https://github.com/pandas-dev/pandas/issues/52449
negative_timestamp_df = negative_timestamp_df.astype("datetime64[ns]")

assert_eq(negative_timestamp_df, got)
assert_eq(negative_timestamp_df, got, check_dtype=False)


def test_orc_writer_negative_timestamp(negative_timestamp_df):
Expand All @@ -1847,14 +1834,10 @@ def test_orc_writer_negative_timestamp(negative_timestamp_df):
buffer = BytesIO()
negative_timestamp_df.to_orc(buffer)

if PANDAS_GE_200:
# TODO: Remove typecast to `ns` after following
# issue is fixed:
# https://github.com/pandas-dev/pandas/issues/52449
negative_timestamp_df = negative_timestamp_df.astype("datetime64[ns]")

assert_eq(negative_timestamp_df, pd.read_orc(buffer))
assert_eq(negative_timestamp_df, orc.ORCFile(buffer).read())
assert_eq(negative_timestamp_df, pd.read_orc(buffer), check_dtype=False)
assert_eq(
negative_timestamp_df, orc.ORCFile(buffer).read(), check_dtype=False
)


def test_orc_reader_apache_negative_timestamp(datadir):
Expand Down
7 changes: 0 additions & 7 deletions python/cudf/cudf/tests/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -632,13 +632,6 @@ def test_parquet_reader_microsecond_timestamps(datadir):
expect = pd.read_parquet(fname)
got = cudf.read_parquet(fname)

if PANDAS_GE_200:
# TODO: Remove typecast to `ns` after following
# issue is fixed:
# https://github.com/pandas-dev/pandas/issues/52449
assert got["a"].dtype == cudf.dtype("datetime64[us]")
got = got.astype("datetime64[ns]")

assert_eq(expect, got)


Expand Down

0 comments on commit fd1f986

Please sign in to comment.