Skip to content

Commit

Permalink
fix: to_datetime in Pandas 2 (apache#24952)
Browse files Browse the repository at this point in the history
  • Loading branch information
betodealmeida authored Aug 11, 2023
1 parent ce65a3b commit 41ca4a0
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 5 deletions.
10 changes: 8 additions & 2 deletions superset/utils/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -1834,7 +1834,12 @@ def normalize_dttm_col(
# Column is formatted as a numeric value
unit = _col.timestamp_format.replace("epoch_", "")
df[_col.col_label] = pd.to_datetime(
dttm_series, utc=False, unit=unit, origin="unix", errors="coerce"
dttm_series,
utc=False,
unit=unit,
origin="unix",
errors="raise",
exact=False,
)
else:
# Column has already been formatted as a timestamp.
Expand All @@ -1844,7 +1849,8 @@ def normalize_dttm_col(
df[_col.col_label],
utc=False,
format=_col.timestamp_format,
errors="coerce",
errors="raise",
exact=False,
)
if _col.offset:
df[_col.col_label] += timedelta(hours=_col.offset)
Expand Down
6 changes: 3 additions & 3 deletions tests/integration_tests/utils_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -1114,7 +1114,7 @@ def normalize_col(
df = pd.DataFrame([{"__timestamp": ts.timestamp() * 1000, "a": 1}])
assert normalize_col(df, "epoch_ms", 0, None)[DTTM_ALIAS][0] == ts

# test that out of bounds timestamps are coerced to None instead of
# erroring out
# test that we raise an error when we can't convert
df = pd.DataFrame([{"__timestamp": "1677-09-21 00:00:00", "a": 1}])
assert pd.isnull(normalize_col(df, None, 0, None)[DTTM_ALIAS][0])
with pytest.raises(pd.errors.OutOfBoundsDatetime):
normalize_col(df, None, 0, None)
30 changes: 30 additions & 0 deletions tests/unit_tests/utils/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,14 @@
import os
from typing import Any, Optional

import pandas as pd
import pytest

from superset.utils.core import (
cast_to_boolean,
DateColumn,
is_test,
normalize_dttm_col,
parse_boolean_string,
QueryObjectFilterClause,
remove_extra_adhoc_filters,
Expand Down Expand Up @@ -171,3 +174,30 @@ def test_other_values():
assert cast_to_boolean([]) is False
assert cast_to_boolean({}) is False
assert cast_to_boolean(object()) is False


def test_normalize_dttm_col() -> None:
    """
    Tests for the ``normalize_dttm_col`` function.

    In particular, this covers a regression when Pandas was upgraded from 1.5.3 to
    2.0.3 and the behavior of ``pd.to_datetime`` changed.
    """
    df = pd.DataFrame({"__time": ["2017-07-01T00:00:00.000Z"]})

    # Pin the fixture first so that a failure below can be attributed to
    # ``normalize_dttm_col`` rather than to how the dataframe was built. The
    # header and data rows are padded to the column widths shown by the
    # separator row, which is how ``to_markdown`` renders the table.
    assert (
        df.to_markdown()
        == """
|    | __time                   |
|---:|:-------------------------|
|  0 | 2017-07-01T00:00:00.000Z |
    """.strip()
    )

    # in 1.5.3 this would return a datetime64[ns] dtype, but in 2.0.3 we had to
    # add ``exact=False`` since there is a leftover after parsing the format
    # (the ``%Y-%m-%d`` format only consumes the date part of the value)
    dttm_cols = (DateColumn("__time", "%Y-%m-%d"),)

    # the function modifies the dataframe in place
    normalize_dttm_col(df, dttm_cols)

    # only the date portion matched by the format is expected to remain
    assert df["__time"].astype(str).tolist() == ["2017-07-01"]

0 comments on commit 41ca4a0

Please sign in to comment.