Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(chart-data-api): support numeric temporal columns #13138

Merged
merged 1 commit into from
Feb 16, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 6 additions & 12 deletions superset/common/query_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
import copy
import logging
import math
from datetime import timedelta
from typing import Any, cast, ClassVar, Dict, List, Optional, Union

import numpy as np
Expand Down Expand Up @@ -112,17 +111,12 @@ def get_query_result(self, query_object: QueryObject) -> Dict[str, Any]:
# If the datetime format is unix, the parse will use the corresponding
# parsing logic
if not df.empty:
if DTTM_ALIAS in df.columns:
if timestamp_format in ("epoch_s", "epoch_ms"):
# Column has already been formatted as a timestamp.
df[DTTM_ALIAS] = df[DTTM_ALIAS].apply(pd.Timestamp)
else:
df[DTTM_ALIAS] = pd.to_datetime(
df[DTTM_ALIAS], utc=False, format=timestamp_format
)
if self.datasource.offset:
df[DTTM_ALIAS] += timedelta(hours=self.datasource.offset)
df[DTTM_ALIAS] += query_object.time_shift
df = utils.normalize_dttm_col(
df=df,
timestamp_format=timestamp_format,
offset=self.datasource.offset,
time_shift=query_object.time_shift,
)

if self.enforce_numerical_metrics:
self.df_metrics_to_num(df, query_object)
Expand Down
32 changes: 32 additions & 0 deletions superset/utils/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@
from flask_babel import gettext as __
from flask_babel.speaklater import LazyString
from pandas.api.types import infer_dtype
from pandas.core.dtypes.common import is_numeric_dtype
from sqlalchemy import event, exc, select, Text
from sqlalchemy.dialects.mysql import MEDIUMTEXT
from sqlalchemy.engine import Connection, Engine
Expand Down Expand Up @@ -1579,3 +1580,34 @@ def format_list(items: Sequence[str], sep: str = ", ", quote: str = '"') -> str:
def find_duplicates(items: Iterable[InputType]) -> List[InputType]:
"""Find duplicate items in an iterable."""
return [item for item, count in collections.Counter(items).items() if count > 1]


def normalize_dttm_col(
df: pd.DataFrame,
timestamp_format: Optional[str],
offset: int,
time_shift: Optional[timedelta],
) -> pd.DataFrame:
if DTTM_ALIAS not in df.columns:
return df
df = df.copy()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the copy operation necessary here? This could be very expensive on memory for large result sets.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point @robdiciuccio . I'm a bit allergic to functions that mutate their inputs (even Pandas is moving away from this pattern), but this could in fact have a noticeable performance hit. I'm going to be touching this code in the coming days, so I'll make sure to address this then (either confirm that the copy is shallow enough to not cause a perf hit or remove the copy operation).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, @villebro. DataFrame.copy() is deep by default.

if timestamp_format in ("epoch_s", "epoch_ms"):
dttm_col = df[DTTM_ALIAS]
if is_numeric_dtype(dttm_col):
# Column is formatted as a numeric value
unit = timestamp_format.replace("epoch_", "")
df[DTTM_ALIAS] = pd.to_datetime(
dttm_col, utc=False, unit=unit, origin="unix"
)
else:
# Column has already been formatted as a timestamp.
df[DTTM_ALIAS] = dttm_col.apply(pd.Timestamp)
else:
df[DTTM_ALIAS] = pd.to_datetime(
df[DTTM_ALIAS], utc=False, format=timestamp_format
)
if offset:
df[DTTM_ALIAS] += timedelta(hours=offset)
if time_shift is not None:
df[DTTM_ALIAS] += time_shift
return df
33 changes: 6 additions & 27 deletions superset/viz.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,33 +284,12 @@ def get_df(self, query_obj: Optional[QueryObjectDict] = None) -> pd.DataFrame:
# If the datetime format is unix, the parse will use the corresponding
# parsing logic.
if not df.empty:
if DTTM_ALIAS in df.columns:
if timestamp_format in ("epoch_s", "epoch_ms"):
# Column has already been formatted as a timestamp.
dttm_col = df[DTTM_ALIAS]
one_ts_val = dttm_col[0]

# convert time column to pandas Timestamp, but different
# ways to convert depending on string or int types
try:
int(one_ts_val)
is_integral = True
except (ValueError, TypeError):
is_integral = False
if is_integral:
unit = "s" if timestamp_format == "epoch_s" else "ms"
df[DTTM_ALIAS] = pd.to_datetime(
dttm_col, utc=False, unit=unit, origin="unix"
)
else:
df[DTTM_ALIAS] = dttm_col.apply(pd.Timestamp)
else:
df[DTTM_ALIAS] = pd.to_datetime(
df[DTTM_ALIAS], utc=False, format=timestamp_format
)
if self.datasource.offset:
df[DTTM_ALIAS] += timedelta(hours=self.datasource.offset)
df[DTTM_ALIAS] += self.time_shift
df = utils.normalize_dttm_col(
df=df,
timestamp_format=timestamp_format,
offset=self.datasource.offset,
time_shift=self.time_shift,
)

if self.enforce_numerical_metrics:
self.df_metrics_to_num(df)
Expand Down
29 changes: 29 additions & 0 deletions tests/utils_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
cast_to_num,
convert_legacy_filters_into_adhoc,
create_ssl_cert_file,
DTTM_ALIAS,
format_timedelta,
GenericDataType,
get_form_data_token,
Expand All @@ -59,6 +60,7 @@
merge_extra_filters,
merge_extra_form_data,
merge_request_params,
normalize_dttm_col,
parse_ssl_cert,
parse_js_uri_path_item,
extract_dataframe_dtypes,
Expand Down Expand Up @@ -1131,3 +1133,30 @@ def test_extract_dataframe_dtypes(self):

df = pd.DataFrame(data={col[0]: col[2] for col in cols})
assert extract_dataframe_dtypes(df) == [col[1] for col in cols]

def test_normalize_dttm_col(self):
ts = pd.Timestamp(2021, 2, 15, 19, 0, 0, 0)
df = pd.DataFrame([{"__timestamp": ts, "a": 1}])

# test regular (non-numeric) format
assert normalize_dttm_col(df, None, 0, None)[DTTM_ALIAS][0] == ts
assert normalize_dttm_col(df, "epoch_ms", 0, None)[DTTM_ALIAS][0] == ts
assert normalize_dttm_col(df, "epoch_s", 0, None)[DTTM_ALIAS][0] == ts

# test offset
assert normalize_dttm_col(df, None, 1, None)[DTTM_ALIAS][0] == pd.Timestamp(
2021, 2, 15, 20, 0, 0, 0
)

# test offset and timedelta
assert normalize_dttm_col(df, None, 1, timedelta(minutes=30))[DTTM_ALIAS][
0
] == pd.Timestamp(2021, 2, 15, 20, 30, 0, 0)

# test numeric epoch_s format
df = pd.DataFrame([{"__timestamp": ts.timestamp(), "a": 1}])
assert normalize_dttm_col(df, "epoch_s", 0, None)[DTTM_ALIAS][0] == ts

# test numeric epoch_ms format
df = pd.DataFrame([{"__timestamp": ts.timestamp() * 1000, "a": 1}])
assert normalize_dttm_col(df, "epoch_ms", 0, None)[DTTM_ALIAS][0] == ts