Skip to content

Commit

Permalink
Matching the behaviour to pandas>=1.0.0 at most. (#1299)
Browse files Browse the repository at this point in the history
Follow-up for #1197

Since we're following the latest version of pandas, we should fix several TODOs by matching the behaviour of pandas>=1.0.0 for now.

## For example.

the behaviour of `Expanding.count()` and `ExpandingGroupby.count()` differs depending on which pandas version is installed.

- pandas < 1.0.0
```python
>>> s = pd.Series([2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5])
>>> s.groupby(s).expanding(3).count().sort_index()
2  0     1.0
   1     2.0
3  2     1.0
   3     2.0
   4     3.0
4  5     1.0
   6     2.0
   7     3.0
   8     4.0
5  9     1.0
   10    2.0
dtype: float64
```

- pandas >= 1.0.0
```python
>>> s = pd.Series([2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5])
>>> s.groupby(s).expanding(3).count().sort_index()
2  0     NaN
   1     NaN
3  2     NaN
   3     NaN
   4     3.0
4  5     NaN
   6     NaN
   7     3.0
   8     4.0
5  9     NaN
   10    NaN
dtype: float64
```

Since we're following the latest version of pandas, we need to fix this.
  • Loading branch information
itholic authored Feb 27, 2020
1 parent 3f9a9cd commit 84da886
Show file tree
Hide file tree
Showing 8 changed files with 166 additions and 42 deletions.
4 changes: 2 additions & 2 deletions databricks/koalas/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1096,7 +1096,7 @@ def kurtosis(self, axis=None, numeric_only=True):

kurt = kurtosis

def min(self, axis=None, numeric_only=False):
def min(self, axis=None, numeric_only=None):
"""
Return the minimum of the values.
Expand Down Expand Up @@ -1141,7 +1141,7 @@ def min(self, axis=None, numeric_only=False):
F.min, name="min", numeric_only=numeric_only, axis=axis
)

def max(self, axis=None, numeric_only=False):
def max(self, axis=None, numeric_only=None):
"""
Return the maximum of the values.
Expand Down
6 changes: 1 addition & 5 deletions databricks/koalas/indexes.py
Original file line number Diff line number Diff line change
Expand Up @@ -1722,12 +1722,8 @@ def _is_monotonic(self):
for field in self.spark_type[::-1]:
left = col.getField(field.name)
right = prev.getField(field.name)
if isinstance(field.dataType, StringType):
compare = compare_disallow_null
elif isinstance(field.dataType, BooleanType):
if isinstance(field.dataType, BooleanType):
compare = compare_allow_null
elif isinstance(field.dataType, NumericType):
compare = compare_null_first
else:
compare = compare_null_last
cond = F.when(left.eqNullSafe(right), cond).otherwise(
Expand Down
106 changes: 104 additions & 2 deletions databricks/koalas/tests/test_expanding.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
from distutils.version import LooseVersion

import numpy as np
import pandas as pd

Expand Down Expand Up @@ -58,7 +60,46 @@ def test_expanding_repr(self):
self.assertEqual(repr(ks.range(10).expanding(5)), "Expanding [min_periods=5]")

def test_expanding_count(self):
self._test_expanding_func("count")
# The behaviour of Expanding.count is different between pandas>=1.0.0 and lower versions,
# and we're following the behaviour of the latest version of pandas.
if LooseVersion(pd.__version__) >= LooseVersion("1.0.0"):
self._test_expanding_func("count")
else:
# Series
kser = ks.Series([1, 2, 3], index=np.random.rand(3))
expected_result = ks.Series([None, 2.0, 3.0], index=kser.index.to_pandas())
self.assert_eq(
repr(kser.expanding(2).count().sort_index()), repr(expected_result.sort_index())
)
# MultiIndex
kser = ks.Series(
[1, 2, 3], index=pd.MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "z")])
)
expected_result = ks.Series([None, 2.0, 3.0], index=kser.index.to_pandas())
self.assert_eq(
repr(kser.expanding(2).count().sort_index()), repr(expected_result.sort_index())
)

# DataFrame
kdf = ks.DataFrame({"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0]})
expected_result = ks.DataFrame({"a": [None, 2.0, 3.0, 4.0], "b": [None, 2.0, 3.0, 4.0]})
self.assert_eq(
repr(kdf.expanding(2).count().sort_index()), repr(expected_result.sort_index())
)

# MultiIndex columns
kdf = ks.DataFrame(
{"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0]}, index=np.random.rand(4)
)
kdf.columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")])
expected_result = ks.DataFrame(
{"a": [None, 2.0, 3.0, 4.0], "b": [None, 2.0, 3.0, 4.0]},
index=kdf.index.to_pandas(),
)
expected_result.columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")])
self.assert_eq(
repr(kdf.expanding(2).count().sort_index()), repr(expected_result.sort_index())
)

def test_expanding_min(self):
self._test_expanding_func("min")
Expand Down Expand Up @@ -118,7 +159,68 @@ def _test_groupby_expanding_func(self, f):
)

def test_groupby_expanding_count(self):
self._test_groupby_expanding_func("count")
# The behaviour of ExpandingGroupby.count is different between pandas>=1.0.0 and lower versions,
# and we're following the behaviour of the latest version of pandas.
if LooseVersion(pd.__version__) >= LooseVersion("1.0.0"):
self._test_groupby_expanding_func("count")
else:
# Series
kser = ks.Series([1, 2, 3], index=np.random.rand(3))
midx = pd.MultiIndex.from_tuples(
list(zip(kser.to_pandas().values, kser.index.to_pandas().values))
)
expected_result = ks.Series([np.nan, np.nan, np.nan], index=midx)
self.assert_eq(
kser.groupby(kser).expanding(2).count().sort_index(),
expected_result.sort_index(),
almost=True,
)
# MultiIndex
kser = ks.Series(
[1, 2, 3], index=pd.MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "z")])
)
midx = pd.MultiIndex.from_tuples([(1, "a", "x"), (2, "a", "y"), (3, "b", "z")])
expected_result = ks.Series([np.nan, np.nan, np.nan], index=midx)
self.assert_eq(
kser.groupby(kser).expanding(2).count().sort_index(),
expected_result.sort_index(),
almost=True,
)
# DataFrame
kdf = ks.DataFrame({"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0]})
midx = pd.MultiIndex.from_tuples([(1, 0), (2, 1), (2, 3), (3, 2)])
expected_result = ks.DataFrame(
{"a": [None, None, 2.0, None], "b": [None, None, 2.0, None]}, index=midx
)
self.assert_eq(
kdf.groupby(kdf.a).expanding(2).count().sort_index(),
expected_result.sort_index(),
almost=True,
)
# MultiIndex column
kdf = ks.DataFrame({"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0]})
kdf.columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")])
midx = pd.MultiIndex.from_tuples([(1, 0), (2, 1), (2, 3), (3, 2)])
expected_result = ks.DataFrame(
{"a": [None, None, 2.0, None], "b": [None, None, 2.0, None]}, index=midx
)
expected_result.columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")])
self.assert_eq(
kdf.groupby(("a", "x")).expanding(2).count().sort_index(),
expected_result.sort_index(),
almost=True,
)
midx = pd.MultiIndex.from_tuples([(1, 4.0, 0), (2, 1.0, 3), (2, 2.0, 1), (3, 3.0, 2)])
expected_result = ks.DataFrame(
{"a": [np.nan, np.nan, np.nan, np.nan], "b": [np.nan, np.nan, np.nan, np.nan]},
index=midx,
)
expected_result.columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")])
self.assert_eq(
kdf.groupby([("a", "x"), ("a", "y")]).expanding(2).count().sort_index(),
expected_result.sort_index(),
almost=True,
)

def test_groupby_expanding_min(self):
self._test_groupby_expanding_func("min")
Expand Down
23 changes: 19 additions & 4 deletions databricks/koalas/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,9 +168,23 @@ def test_groupby_multiindex_columns(self):
kdf.groupby(("x", "a"))[[("y", "c")]].sum().sort_index(),
pdf.groupby(("x", "a"))[[("y", "c")]].sum().sort_index(),
)
# TODO: seems like a pandas' bug ?
# self.assert_eq(kdf[('x', 'a')].groupby(kdf[('x', 'b')]).sum().sort_index(),
# pdf[('x', 'a')].groupby(pdf[('x', 'b')]).sum().sort_index())
# TODO: seems like a pandas bug. It works fine in Koalas, as shown below.
# >>> pdf[('x', 'a')].groupby(pdf[('x', 'b')]).sum().sort_index()
# Traceback (most recent call last):
# ...
# ValueError: Can only tuple-index with a MultiIndex
# >>> kdf[('x', 'a')].groupby(kdf[('x', 'b')]).sum().sort_index()
# (x, b)
# 1 13
# 2 9
# 3 8
# 4 1
# 7 6
# Name: (x, a), dtype: int64
expected_result = ks.Series(
[13, 9, 8, 1, 6], name=("x", "a"), index=pd.Index([1, 2, 3, 4, 7], name=("x", "b"))
)
self.assert_eq(kdf[("x", "a")].groupby(kdf[("x", "b")]).sum().sort_index(), expected_result)

def test_split_apply_combine_on_series(self):
pdf = pd.DataFrame(
Expand Down Expand Up @@ -1029,7 +1043,8 @@ def test_shift(self):
pdf.groupby(["a", "b"])["c"].shift().sort_index(),
almost=True,
)
# TODO: seems like a pandas' bug when fill_value is not None when only pandas>=1.0.0
# TODO: known pandas bug when fill_value is not None in pandas>=1.0.0
# https://github.com/pandas-dev/pandas/issues/31971#issue-565171762
if LooseVersion(pd.__version__) < LooseVersion("1.0.0"):
self.assert_eq(
kdf.groupby(["b"])[["a", "c"]].shift(periods=-1, fill_value=0).sort_index(),
Expand Down
35 changes: 24 additions & 11 deletions databricks/koalas/tests/test_indexes.py
Original file line number Diff line number Diff line change
Expand Up @@ -743,7 +743,9 @@ def test_argmax(self):
kidx.argmax()

def test_monotonic(self):
# test monotonic_increasing & monotonic_decreasing for MultiIndex
# Test monotonic_increasing & monotonic_decreasing for MultiIndex.
# Since the behavior for null values was changed in pandas >= 1.0.0,
# several cases are tested differently.
datas = []

# increasing / decreasing ordered each index level with string
Expand Down Expand Up @@ -772,40 +774,51 @@ def test_monotonic(self):
datas.append([(-5, "e"), (-3, "d"), (-2, "c"), (-4, "b"), (-1, "a")])
datas.append([(-5, "e"), (-4, "c"), (-3, "b"), (-2, "d"), (-1, "a")])

# None type tests (None type is treated as the largest value)
# TODO: the commented tests below should be uncommented after fixing for pandas >= 1.0.0
# datas.append([(None, 100), (2, 200), (3, 300), (4, 400), (5, 500)])
# None type tests (None type is treated as the smallest value)
datas.append([(1, 100), (2, 200), (None, 300), (4, 400), (5, 500)])
# datas.append([(1, 100), (2, 200), (3, 300), (4, 400), (None, 500)])
datas.append([(5, None), (4, 200), (3, 300), (2, 400), (1, 500)])
datas.append([(5, 100), (4, 200), (3, None), (2, 400), (1, 500)])
datas.append([(5, 100), (4, 200), (3, 300), (2, 400), (1, None)])
# datas.append([(None, None), (2, 200), (3, 300), (4, 400), (5, 500)])
datas.append([(1, 100), (2, 200), (None, None), (4, 400), (5, 500)])
# datas.append([(1, 100), (2, 200), (3, 300), (4, 400), (None, None)])
datas.append([(-5, None), (-4, None), (-3, None), (-2, None), (-1, None)])
datas.append([(None, "e"), (None, "c"), (None, "b"), (None, "d"), (None, "a")])
datas.append([(None, None), (None, None), (None, None), (None, None), (None, None)])

# duplicated index value tests
# TODO: the commented test below should be uncommented after fixing for pandas >= 1.0.0
datas.append([("x", "d"), ("y", "c"), ("y", "b"), ("z", "a")])
datas.append([("x", "d"), ("y", "b"), ("y", "c"), ("z", "a")])
datas.append([("x", "d"), ("y", "c"), ("y", None), ("z", "a")])
# datas.append([('x', 'd'), ('y', None), ('y', 'c'), ('z', 'a')])
datas.append([("x", "d"), ("y", None), ("y", None), ("z", "a")])
datas.append([("x", "d"), ("y", "c"), ("y", "b"), (None, "a")])
datas.append([("x", "d"), ("y", "b"), ("y", "c"), (None, "a")])

# more depth tests
datas.append([("x", "d", "o"), ("y", "c", "p"), ("y", "c", "q"), ("z", "a", "r")])
datas.append([("x", "d", "o"), ("y", "c", "q"), ("y", "c", "p"), ("z", "a", "r")])
# datas.append([('x', 'd', 'o'), ('y', 'c', None), ('y', 'c', 'q'), ('z', 'a', 'r')])
datas.append([("x", "d", "o"), ("y", "c", "p"), ("y", "c", None), ("z", "a", "r")])
datas.append([("x", "d", "o"), ("y", "c", None), ("y", "c", None), ("z", "a", "r")])

for i, data in enumerate(datas):
for data in datas:
kmidx = ks.MultiIndex.from_tuples(data)
pmidx = kmidx.to_pandas()
self.assert_eq(kmidx.is_monotonic_increasing, pmidx.is_monotonic_increasing)
self.assert_eq(kmidx.is_monotonic_decreasing, pmidx.is_monotonic_decreasing)

# The data below show different results depending on the pandas version,
# because the behavior of handling null values changed in pandas >= 1.0.0.
datas = []
datas.append([(None, 100), (2, 200), (3, 300), (4, 400), (5, 500)])
datas.append([(1, 100), (2, 200), (3, 300), (4, 400), (None, 500)])
datas.append([(None, None), (2, 200), (3, 300), (4, 400), (5, 500)])
datas.append([(1, 100), (2, 200), (3, 300), (4, 400), (None, None)])
datas.append([("x", "d"), ("y", None), ("y", "c"), ("z", "a")])
datas.append([("x", "d", "o"), ("y", "c", None), ("y", "c", "q"), ("z", "a", "r")])

for data in datas:
kmidx = ks.MultiIndex.from_tuples(data)
pmidx = kmidx.to_pandas()
expected_increasing_result = pmidx.is_monotonic_increasing
if LooseVersion(pd.__version__) < LooseVersion("1.0.0"):
expected_increasing_result = not expected_increasing_result
self.assert_eq(kmidx.is_monotonic_increasing, expected_increasing_result)
self.assert_eq(kmidx.is_monotonic_decreasing, pmidx.is_monotonic_decreasing)
8 changes: 6 additions & 2 deletions databricks/koalas/tests/test_series_conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
#


from distutils.version import LooseVersion

import pandas as pd

from databricks import koalas as ks
Expand Down Expand Up @@ -53,6 +55,8 @@ def test_to_latex(self):
self.assert_eq(kser.to_latex(sparsify=False), pser.to_latex(sparsify=False))
self.assert_eq(kser.to_latex(index_names=False), pser.to_latex(index_names=False))
self.assert_eq(kser.to_latex(bold_rows=True), pser.to_latex(bold_rows=True))
# Error in pandas - ValueError: buf is not a file name and encoding is specified.
# self.assert_eq(kser.to_latex(encoding='ascii'), pser.to_latex(encoding='ascii'))
# Can't specify `encoding` without specifying `buf` as a filename in pandas >= 1.0.0
# https://github.com/pandas-dev/pandas/blob/master/pandas/io/formats/format.py#L492-L495
if LooseVersion(pd.__version__) < LooseVersion("1.0.0"):
self.assert_eq(kser.to_latex(encoding="ascii"), pser.to_latex(encoding="ascii"))
self.assert_eq(kser.to_latex(decimal=","), pser.to_latex(decimal=","))
5 changes: 2 additions & 3 deletions databricks/koalas/tests/test_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,9 +88,8 @@ def test_axis_on_dataframe(self):
self.assert_eq(kdf.count(axis=1), pdf.count(axis=1))
self.assert_eq(kdf.var(axis=1), pdf.var(axis=1))
self.assert_eq(kdf.std(axis=1), pdf.std(axis=1))
# TODO: `max` & `min` with `axis=1` now raise error in pandas 1.0.0
# self.assert_eq(kdf.max(axis=1), pdf.max(axis=1))
# self.assert_eq(kdf.min(axis=1), pdf.min(axis=1))
self.assert_eq(kdf.max(axis=1), pdf.max(axis=1))
self.assert_eq(kdf.min(axis=1), pdf.min(axis=1))
self.assert_eq(kdf.sum(axis=1), pdf.sum(axis=1))
self.assert_eq(kdf.kurtosis(axis=1), pdf.kurtosis(axis=1))
self.assert_eq(kdf.skew(axis=1), pdf.skew(axis=1))
Expand Down
21 changes: 8 additions & 13 deletions databricks/koalas/window.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
#
from functools import partial
from typing import Any
from distutils.version import LooseVersion

import pandas as pd

Expand Down Expand Up @@ -59,17 +58,6 @@ def count(self):
def count(scol):
return F.count(scol).over(self._window)

if LooseVersion(pd.__version__) >= LooseVersion("1.0.0"):
if isinstance(self, (Expanding, ExpandingGroupby)):

def count_expanding(scol):
return F.when(
F.row_number().over(self._unbounded_window) >= self._min_periods,
F.count(scol).over(self._window),
).otherwise(F.lit(None))

return self._apply_as_series_or_frame(count_expanding).astype("float64")

return self._apply_as_series_or_frame(count).astype("float64")

def sum(self):
Expand Down Expand Up @@ -1115,7 +1103,14 @@ def count(self):
2 2.0
3 3.0
"""
return super(Expanding, self).count()

def count(scol):
return F.when(
F.row_number().over(self._unbounded_window) >= self._min_periods,
F.count(scol).over(self._window),
).otherwise(F.lit(None))

return self._apply_as_series_or_frame(count).astype("float64")

def sum(self):
"""
Expand Down

0 comments on commit 84da886

Please sign in to comment.