[missing values] Reverting replacing missing values with zeros #4905

Merged 1 commit on Mar 21, 2019
3 changes: 2 additions & 1 deletion superset/assets/backendSync.json
@@ -3725,4 +3725,5 @@
"default": false
}
}
}
}

2 changes: 1 addition & 1 deletion superset/assets/src/explore/controls.jsx
@@ -1350,7 +1350,7 @@ export const controls = {
'mean',
'min',
'max',
'stdev',
'std',
Member Author commented:
Note that stdev is not a valid Pandas aggregation function; the correct name is std.
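For illustration (not part of this diff; the sample data is made up), Pandas resolves string aggregation names to Series methods, so 'std' is accepted while 'stdev' is not:

```python
import pandas as pd

s = pd.Series([1.0, 2.0, 4.0])

# 'std' resolves to Series.std, so it is accepted as a named aggregation.
print(s.agg('std'))

# 'stdev' is not a Series method, so Pandas rejects the name.
try:
    s.agg('stdev')
except Exception as exc:
    print(type(exc).__name__, exc)
```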

'var',
]),
default: 'sum',
23 changes: 0 additions & 23 deletions superset/common/query_context.py
@@ -41,7 +41,6 @@ class QueryContext:
to retrieve the data payload for a given viz.
"""

default_fillna = 0
cache_type = 'df'
enforce_numerical_metrics = True

@@ -103,7 +102,6 @@ def get_query_result(self, query_object):
self.df_metrics_to_num(df, query_object)

df.replace([np.inf, -np.inf], np.nan)
df = self.handle_nulls(df)
return {
'query': result.query,
'status': result.status,
@@ -118,27 +116,6 @@ def df_metrics_to_num(self, df, query_object):
if dtype.type == np.object_ and col in metrics:
df[col] = pd.to_numeric(df[col], errors='coerce')

def handle_nulls(self, df):
fillna = self.get_fillna_for_columns(df.columns)
return df.fillna(fillna)

def get_fillna_for_col(self, col):
"""Returns the value to use as filler for a specific Column.type"""
if col and col.is_string:
return ' NULL'
return self.default_fillna

def get_fillna_for_columns(self, columns=None):
"""Returns a dict or scalar that can be passed to DataFrame.fillna"""
if columns is None:
return self.default_fillna
columns_dict = {col.column_name: col for col in self.datasource.columns}
fillna = {
c: self.get_fillna_for_col(columns_dict.get(c))
for c in columns
}
return fillna

def get_data(self, df):
return df.to_dict(orient='records')

102 changes: 43 additions & 59 deletions superset/viz.py
@@ -75,7 +75,6 @@ class BaseViz(object):
verbose_name = 'Base Viz'
credits = ''
is_timeseries = False
default_fillna = 0
cache_type = 'df'
enforce_numerical_metrics = True

@@ -164,28 +163,6 @@ def run_extra_queries(self):
"""
pass

def handle_nulls(self, df):
fillna = self.get_fillna_for_columns(df.columns)
return df.fillna(fillna)

def get_fillna_for_col(self, col):
"""Returns the value to use as filler for a specific Column.type"""
if col:
if col.is_string:
return ' NULL'
Member commented:
I think @xrmx added some of this logic. Null-labeled series create issues on some visualizations, and I believe the leading space has to do with sorting NULL first. We may want to reuse some of the logic I added recently around filtering, which replaces None with <NULL> and empty strings with <empty string>. That should only be applied to dimensions / strings. Note that the logic here applies only to string columns, so it does not "zerofy" NULL values.

I'm pretty sure that removing this will break many charts.
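For illustration only, a rough sketch of that labeling idea (the helper name, labels, and sample data are assumptions, not code from this PR): missing and empty values are relabeled in string dimension columns only, so metric columns keep their NaNs instead of being zero-filled:

```python
import numpy as np
import pandas as pd

NULL_LABEL = '<NULL>'            # hypothetical label
EMPTY_LABEL = '<empty string>'   # hypothetical label

def label_missing_dimensions(df, string_columns):
    """Relabel missing/empty values in string (dimension) columns only."""
    df = df.copy()
    for col in string_columns:
        df[col] = df[col].fillna(NULL_LABEL).replace('', EMPTY_LABEL)
    return df

df = pd.DataFrame({'name': [None, '', 'girl'], 'sum__num': [10.0, np.nan, 3.0]})
print(label_missing_dimensions(df, string_columns=['name']))
# name becomes ['<NULL>', '<empty string>', 'girl']; sum__num keeps its NaN
```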

Member Author replied:

@mistercrunch the ' NULL' label (with a leading space) is somewhat misleading and is inconsistent with SQL Lab, which reports NULL (missing) values as null. On an unrelated note, SQL Lab doesn't seem to sort NULL values correctly, and it would be good to ensure that NULL values are rendered and sorted consistently throughout the application.

return self.default_fillna

def get_fillna_for_columns(self, columns=None):
"""Returns a dict or scalar that can be passed to DataFrame.fillna"""
if columns is None:
return self.default_fillna
columns_dict = {col.column_name: col for col in self.datasource.columns}
fillna = {
c: self.get_fillna_for_col(columns_dict.get(c))
for c in columns
}
return fillna

def get_samples(self):
query_obj = self.query_obj()
query_obj.update({
@@ -254,8 +231,7 @@ def get_df(self, query_obj=None):
if self.enforce_numerical_metrics:
self.df_metrics_to_num(df)

df.replace([np.inf, -np.inf], np.nan)
df = self.handle_nulls(df)
df.replace([np.inf, -np.inf], np.nan, inplace=True)
return df

def df_metrics_to_num(self, df):
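As an aside on the replace call above (a standalone sketch with made-up data): DataFrame.replace returns a new frame by default, so a bare call has no effect on df; either reassign the result or pass inplace=True as done in this change:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({'metric': [1.0, np.inf]})

df.replace([np.inf, -np.inf], np.nan)                 # returns a new frame; df is unchanged
print(df['metric'].tolist())                          # [1.0, inf]

df.replace([np.inf, -np.inf], np.nan, inplace=True)   # modifies df in place
print(df['metric'].tolist())                          # [1.0, nan]
```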
@@ -653,7 +629,9 @@ def get_data(self, df):
pt = df.pivot_table(
index=DTTM_ALIAS,
columns=columns,
values=values)
values=values,
dropna=False,
)
pt.index = pt.index.map(str)
pt = pt.sort_index()
return dict(
@@ -696,20 +674,28 @@ def get_data(self, df):
self.form_data.get('granularity') == 'all' and
DTTM_ALIAS in df):
del df[DTTM_ALIAS]

aggfunc = self.form_data.get('pandas_aggfunc')

# Ensure that Pandas's sum function mimics that of SQL.
if aggfunc == 'sum':
aggfunc = lambda x: x.sum(min_count=1) # noqa: E731

df = df.pivot_table(
index=self.form_data.get('groupby'),
columns=self.form_data.get('columns'),
values=[utils.get_metric_name(m) for m in self.form_data.get('metrics')],
aggfunc=self.form_data.get('pandas_aggfunc'),
aggfunc=aggfunc,
margins=self.form_data.get('pivot_margins'),
dropna=False,
)
# Display metrics side by side with each column
if self.form_data.get('combine_metric'):
df = df.stack(0).unstack()
return dict(
columns=list(df.columns),
html=df.to_html(
na_rep='',
na_rep='null',
classes=(
'dataframe table table-striped table-bordered '
'table-condensed table-hover').split(' ')),
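As an aside on the aggfunc change above (a standalone sketch with made-up data): Pandas's default sum over an all-NaN group returns 0, whereas SQL's SUM over all-NULL values returns NULL; min_count=1 restores the SQL-like behavior:

```python
import numpy as np
import pandas as pd

all_missing = pd.Series([np.nan, np.nan])

print(all_missing.sum())              # 0.0 (default Pandas behavior)
print(all_missing.sum(min_count=1))   # nan (mimics SQL SUM over all-NULL values)
```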
@@ -877,7 +863,7 @@ def to_series(self, df, classed='', title_suffix=''):
index_value = label_sep.join(index_value)
boxes = defaultdict(dict)
for (label, key), value in row.items():
if key == 'median':
if key == 'nanmedian':
key = 'Q2'
boxes[label][key] = value
for label, box in boxes.items():
@@ -894,28 +880,24 @@ def to_series(self, df, classed='', title_suffix=''):

def get_data(self, df):
form_data = self.form_data
df = df.fillna(0)

# conform to NVD3 names
def Q1(series): # need to be named functions - can't use lambdas
return np.percentile(series, 25)
return np.nanpercentile(series, 25)

def Q3(series):
return np.percentile(series, 75)
return np.nanpercentile(series, 75)

whisker_type = form_data.get('whisker_options')
if whisker_type == 'Tukey':

def whisker_high(series):
upper_outer_lim = Q3(series) + 1.5 * (Q3(series) - Q1(series))
series = series[series <= upper_outer_lim]
return series[np.abs(series - upper_outer_lim).argmin()]
return series[series <= upper_outer_lim].max()

def whisker_low(series):
lower_outer_lim = Q1(series) - 1.5 * (Q3(series) - Q1(series))
# find the closest value above the lower outer limit
series = series[series >= lower_outer_lim]
return series[np.abs(series - lower_outer_lim).argmin()]
return series[series >= lower_outer_lim].min()
Member Author commented:
Per Wikipedia there are several definitions of the whiskers; the common Tukey boxplot uses:

The lowest datum still within 1.5 IQR of the lower quartile, and the highest datum still within 1.5 IQR of the upper quartile.

The simplified min/max form used here yields the same value as the more complex argmin approach without requiring a second lookup, and the result agrees with the Tukey definition above.
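A quick standalone check of that equivalence for the high whisker (sample data is made up):

```python
import numpy as np

series = np.array([1.0, 2.0, 3.0, 4.0, 100.0])  # 100.0 is an outlier

q1, q3 = np.nanpercentile(series, 25), np.nanpercentile(series, 75)
upper_outer_lim = q3 + 1.5 * (q3 - q1)
within = series[series <= upper_outer_lim]

# Old formulation: the value closest to the limit, requiring a second lookup.
old = within[np.abs(within - upper_outer_lim).argmin()]
# New formulation: the highest datum still within 1.5 IQR of the upper quartile.
new = within.max()

assert old == new  # both yield 4.0 here
```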


elif whisker_type == 'Min/max (no outliers)':

@@ -929,10 +911,10 @@ def whisker_low(series):
low, high = whisker_type.replace(' percentiles', '').split('/')

def whisker_high(series):
return np.percentile(series, int(high))
return np.nanpercentile(series, int(high))

def whisker_low(series):
return np.percentile(series, int(low))
return np.nanpercentile(series, int(low))

else:
raise ValueError('Unknown whisker type: {}'.format(whisker_type))
@@ -943,7 +925,7 @@ def outliers(series):
# pandas sometimes doesn't like getting lists back here
return set(above.tolist() + below.tolist())

aggregate = [Q1, np.median, Q3, whisker_high, whisker_low, outliers]
aggregate = [Q1, np.nanmedian, Q3, whisker_high, whisker_low, outliers]
df = df.groupby(form_data.get('groupby')).agg(aggregate)
chart_data = self.to_series(df)
return chart_data
@@ -1034,7 +1016,6 @@ def as_floats(field):
return d

def get_data(self, df):
df = df.fillna(0)
Member commented:
Are we sure that this particular visualization handles nulls nicely?

df['metric'] = df[[utils.get_metric_name(self.metric)]]
values = df['metric'].values
return {
@@ -1152,22 +1133,25 @@ def to_series(self, df, classed='', title_suffix=''):

def process_data(self, df, aggregate=False):
fd = self.form_data
df = df.fillna(0)
if fd.get('granularity') == 'all':
raise Exception(_('Pick a time granularity for your time series'))

if not aggregate:
df = df.pivot_table(
index=DTTM_ALIAS,
columns=fd.get('groupby'),
values=self.metric_labels)
values=self.metric_labels,
dropna=False,
)
else:
df = df.pivot_table(
index=DTTM_ALIAS,
columns=fd.get('groupby'),
values=self.metric_labels,
fill_value=0,
aggfunc=sum)
aggfunc=sum,
dropna=False,
)

fm = fd.get('resample_fillmethod')
if not fm:
Expand All @@ -1176,8 +1160,6 @@ def process_data(self, df, aggregate=False):
rule = fd.get('resample_rule')
if how and rule:
df = df.resample(rule, how=how, fill_method=fm)
if not fm:
df = df.fillna(0)

if self.sort_series:
dfs = df.sum()
@@ -1241,7 +1223,6 @@ def get_data(self, df):
fd = self.form_data
comparison_type = fd.get('comparison_type') or 'values'
df = self.process_data(df)

if comparison_type == 'values':
chart_data = self.to_series(df)
for i, (label, df2) in enumerate(self._extra_chart_data):
@@ -1368,7 +1349,6 @@ def to_series(self, df, classed=''):

def get_data(self, df):
fd = self.form_data
df = df.fillna(0)

if self.form_data.get('granularity') == 'all':
raise Exception(_('Pick a time granularity for your time series'))
@@ -1377,7 +1357,9 @@ def get_data(self, df):
metric_2 = utils.get_metric_name(fd.get('metric_2'))
df = df.pivot_table(
index=DTTM_ALIAS,
values=[metric, metric_2])
values=[metric, metric_2],
dropna=False,
)

chart_data = self.to_series(df)
return chart_data
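A small standalone illustration of why dropna=False is passed to the pivot_table calls in this change (the data is made up): by default pivot_table drops columns whose entries are all NaN, which silently hides series with missing values:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({
    '__timestamp': ['2019-01-01', '2019-01-02'],
    'group': ['a', 'b'],
    'metric': [1.0, np.nan],
})

default_pt = df.pivot_table(index='__timestamp', columns='group', values='metric')
kept_pt = df.pivot_table(index='__timestamp', columns='group', values='metric', dropna=False)

print(default_pt.columns.tolist())  # ['a']: the all-NaN 'b' column is dropped
print(kept_pt.columns.tolist())     # ['a', 'b']: missing values are preserved as NaN
```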
@@ -1425,7 +1407,9 @@ def get_data(self, df):
df = df.pivot_table(
index=DTTM_ALIAS,
columns='series',
values=utils.get_metric_name(fd.get('metric')))
values=utils.get_metric_name(fd.get('metric')),
dropna=False,
)
chart_data = self.to_series(df)
for serie in chart_data:
serie['rank'] = rank_lookup[serie['key']]
@@ -1462,7 +1446,9 @@ def get_data(self, df):
metric = self.metric_labels[0]
df = df.pivot_table(
index=self.groupby,
values=[metric])
values=[metric],
dropna=False,
)
df.sort_values(by=metric, ascending=False, inplace=True)
df = df.reset_index()
df.columns = ['x', 'y']
@@ -1549,9 +1535,10 @@ def get_data(self, df):
pt = df.pivot_table(
index=self.groupby,
columns=columns,
values=metrics)
values=metrics,
dropna=False,
)
if fd.get('contribution'):
pt = pt.fillna(0)
pt = pt.T
pt = (pt / pt.sum()).T
pt = pt.reindex(row.index)
@@ -2117,9 +2104,6 @@ class BaseDeckGLViz(BaseViz):
credits = '<a href="https://uber.github.io/deck.gl/">deck.gl</a>'
spatial_control_keys = []

def handle_nulls(self, df):
return df

def get_metrics(self):
self.metric = self.form_data.get('size')
return [self.metric] if self.metric else []
@@ -2572,11 +2556,11 @@ def get_data(self, df):
fd = self.form_data
groups = fd.get('groupby')
metrics = fd.get('metrics')
df.fillna(0)
df = df.pivot_table(
index=DTTM_ALIAS,
columns=groups,
values=metrics)
values=metrics,
)
cols = []
# Be rid of falsey keys
for col in df.columns:
@@ -2699,7 +2683,7 @@ def levels_for_time(self, groups, df):
for i in range(0, len(groups) + 1):
self.form_data['groupby'] = groups[:i]
df_drop = df.drop(groups[i:], 1)
procs[i] = self.process_data(df_drop, aggregate=True).fillna(0)
procs[i] = self.process_data(df_drop, aggregate=True)
self.form_data['groupby'] = groups
return procs

10 changes: 0 additions & 10 deletions tests/core_tests.py
@@ -622,16 +622,6 @@ def test_slice_query_endpoint(self):
assert 'language' in resp
self.logout()

def test_viz_get_fillna_for_columns(self):
slc = self.get_slice('Girls', db.session)
q = slc.viz.query_obj()
results = slc.viz.datasource.query(q)
fillna_columns = slc.viz.get_fillna_for_columns(results.df.columns)
self.assertDictEqual(
fillna_columns,
{'name': ' NULL', 'sum__num': 0},
)

def test_import_csv(self):
self.login(username='admin')
filename = 'testCSV.csv'
12 changes: 0 additions & 12 deletions tests/viz_tests.py
@@ -84,18 +84,6 @@ def test_process_metrics(self):
self.assertEqual(test_viz.metric_labels, expect_metric_labels)
self.assertEqual(test_viz.all_metrics, expect_metric_labels)

def test_get_fillna_returns_default_on_null_columns(self):
form_data = {
'viz_type': 'table',
'token': '12345',
}
datasource = self.get_datasource_mock()
test_viz = viz.BaseViz(datasource, form_data)
self.assertEqual(
test_viz.default_fillna,
test_viz.get_fillna_for_columns(),
)

def test_get_df_returns_empty_df(self):
form_data = {'dummy': 123}
query_obj = {'granularity': 'day'}