-
Notifications
You must be signed in to change notification settings - Fork 14.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.
Already on GitHub? Sign in to your account
[missing values] Reverting replacing missing values with zeros #4905
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3725,4 +3725,5 @@ | |
"default": false | ||
} | ||
} | ||
} | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1350,7 +1350,7 @@ export const controls = { | |
'mean', | ||
'min', | ||
'max', | ||
'stdev', | ||
'std', | ||
'var', | ||
]), | ||
default: 'sum', | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -75,7 +75,6 @@ class BaseViz(object): | |
verbose_name = 'Base Viz' | ||
credits = '' | ||
is_timeseries = False | ||
default_fillna = 0 | ||
cache_type = 'df' | ||
enforce_numerical_metrics = True | ||
|
||
|
@@ -164,28 +163,6 @@ def run_extra_queries(self): | |
""" | ||
pass | ||
|
||
def handle_nulls(self, df): | ||
fillna = self.get_fillna_for_columns(df.columns) | ||
return df.fillna(fillna) | ||
|
||
def get_fillna_for_col(self, col): | ||
"""Returns the value to use as filler for a specific Column.type""" | ||
if col: | ||
if col.is_string: | ||
return ' NULL' | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think @xrmx added some of this logic. I'm pretty sure that removing this will break many charts. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @mistercrunch the |
||
return self.default_fillna | ||
|
||
def get_fillna_for_columns(self, columns=None): | ||
"""Returns a dict or scalar that can be passed to DataFrame.fillna""" | ||
if columns is None: | ||
return self.default_fillna | ||
columns_dict = {col.column_name: col for col in self.datasource.columns} | ||
fillna = { | ||
c: self.get_fillna_for_col(columns_dict.get(c)) | ||
for c in columns | ||
} | ||
return fillna | ||
|
||
def get_samples(self): | ||
query_obj = self.query_obj() | ||
query_obj.update({ | ||
|
@@ -254,8 +231,7 @@ def get_df(self, query_obj=None): | |
if self.enforce_numerical_metrics: | ||
self.df_metrics_to_num(df) | ||
|
||
df.replace([np.inf, -np.inf], np.nan) | ||
df = self.handle_nulls(df) | ||
df.replace([np.inf, -np.inf], np.nan, inplace=True) | ||
return df | ||
|
||
def df_metrics_to_num(self, df): | ||
|
@@ -653,7 +629,9 @@ def get_data(self, df): | |
pt = df.pivot_table( | ||
index=DTTM_ALIAS, | ||
columns=columns, | ||
values=values) | ||
values=values, | ||
dropna=False, | ||
) | ||
pt.index = pt.index.map(str) | ||
pt = pt.sort_index() | ||
return dict( | ||
|
@@ -696,20 +674,28 @@ def get_data(self, df): | |
self.form_data.get('granularity') == 'all' and | ||
DTTM_ALIAS in df): | ||
del df[DTTM_ALIAS] | ||
|
||
aggfunc = self.form_data.get('pandas_aggfunc') | ||
|
||
# Ensure that Pandas's sum function mimics that of SQL. | ||
if aggfunc == 'sum': | ||
aggfunc = lambda x: x.sum(min_count=1) # noqa: E731 | ||
|
||
df = df.pivot_table( | ||
index=self.form_data.get('groupby'), | ||
columns=self.form_data.get('columns'), | ||
values=[utils.get_metric_name(m) for m in self.form_data.get('metrics')], | ||
aggfunc=self.form_data.get('pandas_aggfunc'), | ||
aggfunc=aggfunc, | ||
margins=self.form_data.get('pivot_margins'), | ||
dropna=False, | ||
) | ||
# Display metrics side by side with each column | ||
if self.form_data.get('combine_metric'): | ||
df = df.stack(0).unstack() | ||
return dict( | ||
columns=list(df.columns), | ||
html=df.to_html( | ||
na_rep='', | ||
na_rep='null', | ||
classes=( | ||
'dataframe table table-striped table-bordered ' | ||
'table-condensed table-hover').split(' ')), | ||
|
@@ -877,7 +863,7 @@ def to_series(self, df, classed='', title_suffix=''): | |
index_value = label_sep.join(index_value) | ||
boxes = defaultdict(dict) | ||
for (label, key), value in row.items(): | ||
if key == 'median': | ||
if key == 'nanmedian': | ||
key = 'Q2' | ||
boxes[label][key] = value | ||
for label, box in boxes.items(): | ||
|
@@ -894,28 +880,24 @@ def to_series(self, df, classed='', title_suffix=''): | |
|
||
def get_data(self, df): | ||
form_data = self.form_data | ||
df = df.fillna(0) | ||
|
||
# conform to NVD3 names | ||
def Q1(series): # need to be named functions - can't use lambdas | ||
return np.percentile(series, 25) | ||
return np.nanpercentile(series, 25) | ||
|
||
def Q3(series): | ||
return np.percentile(series, 75) | ||
return np.nanpercentile(series, 75) | ||
|
||
whisker_type = form_data.get('whisker_options') | ||
if whisker_type == 'Tukey': | ||
|
||
def whisker_high(series): | ||
upper_outer_lim = Q3(series) + 1.5 * (Q3(series) - Q1(series)) | ||
series = series[series <= upper_outer_lim] | ||
return series[np.abs(series - upper_outer_lim).argmin()] | ||
return series[series <= upper_outer_lim].max() | ||
|
||
def whisker_low(series): | ||
lower_outer_lim = Q1(series) - 1.5 * (Q3(series) - Q1(series)) | ||
# find the closest value above the lower outer limit | ||
series = series[series >= lower_outer_lim] | ||
return series[np.abs(series - lower_outer_lim).argmin()] | ||
return series[series >= lower_outer_lim].min() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Per Wikipedia there are several representations of the whiskers, including the common Tukey boxplot,
The definition above is the same as the more complex |
||
|
||
elif whisker_type == 'Min/max (no outliers)': | ||
|
||
|
@@ -929,10 +911,10 @@ def whisker_low(series): | |
low, high = whisker_type.replace(' percentiles', '').split('/') | ||
|
||
def whisker_high(series): | ||
return np.percentile(series, int(high)) | ||
return np.nanpercentile(series, int(high)) | ||
|
||
def whisker_low(series): | ||
return np.percentile(series, int(low)) | ||
return np.nanpercentile(series, int(low)) | ||
|
||
else: | ||
raise ValueError('Unknown whisker type: {}'.format(whisker_type)) | ||
|
@@ -943,7 +925,7 @@ def outliers(series): | |
# pandas sometimes doesn't like getting lists back here | ||
return set(above.tolist() + below.tolist()) | ||
|
||
aggregate = [Q1, np.median, Q3, whisker_high, whisker_low, outliers] | ||
aggregate = [Q1, np.nanmedian, Q3, whisker_high, whisker_low, outliers] | ||
df = df.groupby(form_data.get('groupby')).agg(aggregate) | ||
chart_data = self.to_series(df) | ||
return chart_data | ||
|
@@ -1034,7 +1016,6 @@ def as_floats(field): | |
return d | ||
|
||
def get_data(self, df): | ||
df = df.fillna(0) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Are we sure that this particular visualization handles null nicely? |
||
df['metric'] = df[[utils.get_metric_name(self.metric)]] | ||
values = df['metric'].values | ||
return { | ||
|
@@ -1152,22 +1133,25 @@ def to_series(self, df, classed='', title_suffix=''): | |
|
||
def process_data(self, df, aggregate=False): | ||
fd = self.form_data | ||
df = df.fillna(0) | ||
if fd.get('granularity') == 'all': | ||
raise Exception(_('Pick a time granularity for your time series')) | ||
|
||
if not aggregate: | ||
df = df.pivot_table( | ||
index=DTTM_ALIAS, | ||
columns=fd.get('groupby'), | ||
values=self.metric_labels) | ||
values=self.metric_labels, | ||
dropna=False, | ||
) | ||
else: | ||
df = df.pivot_table( | ||
index=DTTM_ALIAS, | ||
columns=fd.get('groupby'), | ||
values=self.metric_labels, | ||
fill_value=0, | ||
aggfunc=sum) | ||
aggfunc=sum, | ||
dropna=False, | ||
) | ||
|
||
fm = fd.get('resample_fillmethod') | ||
if not fm: | ||
|
@@ -1176,8 +1160,6 @@ def process_data(self, df, aggregate=False): | |
rule = fd.get('resample_rule') | ||
if how and rule: | ||
df = df.resample(rule, how=how, fill_method=fm) | ||
if not fm: | ||
df = df.fillna(0) | ||
|
||
if self.sort_series: | ||
dfs = df.sum() | ||
|
@@ -1241,7 +1223,6 @@ def get_data(self, df): | |
fd = self.form_data | ||
comparison_type = fd.get('comparison_type') or 'values' | ||
df = self.process_data(df) | ||
|
||
if comparison_type == 'values': | ||
chart_data = self.to_series(df) | ||
for i, (label, df2) in enumerate(self._extra_chart_data): | ||
|
@@ -1368,7 +1349,6 @@ def to_series(self, df, classed=''): | |
|
||
def get_data(self, df): | ||
fd = self.form_data | ||
df = df.fillna(0) | ||
|
||
if self.form_data.get('granularity') == 'all': | ||
raise Exception(_('Pick a time granularity for your time series')) | ||
|
@@ -1377,7 +1357,9 @@ def get_data(self, df): | |
metric_2 = utils.get_metric_name(fd.get('metric_2')) | ||
df = df.pivot_table( | ||
index=DTTM_ALIAS, | ||
values=[metric, metric_2]) | ||
values=[metric, metric_2], | ||
dropna=False, | ||
) | ||
|
||
chart_data = self.to_series(df) | ||
return chart_data | ||
|
@@ -1425,7 +1407,9 @@ def get_data(self, df): | |
df = df.pivot_table( | ||
index=DTTM_ALIAS, | ||
columns='series', | ||
values=utils.get_metric_name(fd.get('metric'))) | ||
values=utils.get_metric_name(fd.get('metric')), | ||
dropna=False, | ||
) | ||
chart_data = self.to_series(df) | ||
for serie in chart_data: | ||
serie['rank'] = rank_lookup[serie['key']] | ||
|
@@ -1462,7 +1446,9 @@ def get_data(self, df): | |
metric = self.metric_labels[0] | ||
df = df.pivot_table( | ||
index=self.groupby, | ||
values=[metric]) | ||
values=[metric], | ||
dropna=False, | ||
) | ||
df.sort_values(by=metric, ascending=False, inplace=True) | ||
df = df.reset_index() | ||
df.columns = ['x', 'y'] | ||
|
@@ -1549,9 +1535,10 @@ def get_data(self, df): | |
pt = df.pivot_table( | ||
index=self.groupby, | ||
columns=columns, | ||
values=metrics) | ||
values=metrics, | ||
dropna=False, | ||
) | ||
if fd.get('contribution'): | ||
pt = pt.fillna(0) | ||
pt = pt.T | ||
pt = (pt / pt.sum()).T | ||
pt = pt.reindex(row.index) | ||
|
@@ -2117,9 +2104,6 @@ class BaseDeckGLViz(BaseViz): | |
credits = '<a href="https://uber.github.io/deck.gl/">deck.gl</a>' | ||
spatial_control_keys = [] | ||
|
||
def handle_nulls(self, df): | ||
return df | ||
|
||
def get_metrics(self): | ||
self.metric = self.form_data.get('size') | ||
return [self.metric] if self.metric else [] | ||
|
@@ -2572,11 +2556,11 @@ def get_data(self, df): | |
fd = self.form_data | ||
groups = fd.get('groupby') | ||
metrics = fd.get('metrics') | ||
df.fillna(0) | ||
df = df.pivot_table( | ||
index=DTTM_ALIAS, | ||
columns=groups, | ||
values=metrics) | ||
values=metrics, | ||
) | ||
cols = [] | ||
# Be rid of falsey keys | ||
for col in df.columns: | ||
|
@@ -2699,7 +2683,7 @@ def levels_for_time(self, groups, df): | |
for i in range(0, len(groups) + 1): | ||
self.form_data['groupby'] = groups[:i] | ||
df_drop = df.drop(groups[i:], 1) | ||
procs[i] = self.process_data(df_drop, aggregate=True).fillna(0) | ||
procs[i] = self.process_data(df_drop, aggregate=True) | ||
self.form_data['groupby'] = groups | ||
return procs | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Note: `stdev` is not a valid Pandas aggregation function.