Skip to content

Commit

Permalink
Matching the behaviour to pandas>=1.0.0 at most. (#1299)
Browse files Browse the repository at this point in the history
Follow-up for #1197

Since we're following the latest version of pandas, we should fix several TODOs by matching the behaviour of pandas>=1.0.0 for now.

## For example.

the behaviour of `Expanding.count()` and `ExpandingGroupby.count()` differs depending on which pandas version is installed.

- pandas < 1.0.0
```python
>>> s = pd.Series([2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5])
>>> s.groupby(s).expanding(3).count().sort_index()
2  0     1.0
   1     2.0
3  2     1.0
   3     2.0
   4     3.0
4  5     1.0
   6     2.0
   7     3.0
   8     4.0
5  9     1.0
   10    2.0
dtype: float64
```

- pandas >= 1.0.0
```python
>>> s = pd.Series([2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5])
>>> s.groupby(s).expanding(3).count().sort_index()
2  0     NaN
   1     NaN
3  2     NaN
   3     NaN
   4     3.0
4  5     NaN
   6     NaN
   7     3.0
   8     4.0
5  9     NaN
   10    NaN
dtype: float64
```

Since we're following the latest version of pandas, we need to fix this.
  • Loading branch information
itholic authored Feb 27, 2020
1 parent 3f9a9cd commit 84da886
Show file tree
Hide file tree
Showing 8 changed files with 166 additions and 42 deletions.
4 changes: 2 additions & 2 deletions databricks/koalas/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1096,7 +1096,7 @@ def kurtosis(self, axis=None, numeric_only=True):

kurt = kurtosis

def min(self, axis=None, numeric_only=False):
def min(self, axis=None, numeric_only=None):
"""
Return the minimum of the values.
Expand Down Expand Up @@ -1141,7 +1141,7 @@ def min(self, axis=None, numeric_only=False):
F.min, name="min", numeric_only=numeric_only, axis=axis
)

def max(self, axis=None, numeric_only=False):
def max(self, axis=None, numeric_only=None):
"""
Return the maximum of the values.
Expand Down
6 changes: 1 addition & 5 deletions databricks/koalas/indexes.py
Original file line number Diff line number Diff line change
Expand Up @@ -1722,12 +1722,8 @@ def _is_monotonic(self):
for field in self.spark_type[::-1]:
left = col.getField(field.name)
right = prev.getField(field.name)
if isinstance(field.dataType, StringType):
compare = compare_disallow_null
elif isinstance(field.dataType, BooleanType):
if isinstance(field.dataType, BooleanType):
compare = compare_allow_null
elif isinstance(field.dataType, NumericType):
compare = compare_null_first
else:
compare = compare_null_last
cond = F.when(left.eqNullSafe(right), cond).otherwise(
Expand Down
106 changes: 104 additions & 2 deletions databricks/koalas/tests/test_expanding.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
from distutils.version import LooseVersion

import numpy as np
import pandas as pd

Expand Down Expand Up @@ -58,7 +60,46 @@ def test_expanding_repr(self):
self.assertEqual(repr(ks.range(10).expanding(5)), "Expanding [min_periods=5]")

def test_expanding_count(self):
self._test_expanding_func("count")
# The behaviour of Expanding.count is different between pandas>=1.0.0 and lower versions,
# and we're following the behaviour of the latest version of pandas.
if LooseVersion(pd.__version__) >= LooseVersion("1.0.0"):
self._test_expanding_func("count")
else:
# Series
kser = ks.Series([1, 2, 3], index=np.random.rand(3))
expected_result = ks.Series([None, 2.0, 3.0], index=kser.index.to_pandas())
self.assert_eq(
repr(kser.expanding(2).count().sort_index()), repr(expected_result.sort_index())
)
# MultiIndex
kser = ks.Series(
[1, 2, 3], index=pd.MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "z")])
)
expected_result = ks.Series([None, 2.0, 3.0], index=kser.index.to_pandas())
self.assert_eq(
repr(kser.expanding(2).count().sort_index()), repr(expected_result.sort_index())
)

# DataFrame
kdf = ks.DataFrame({"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0]})
expected_result = ks.DataFrame({"a": [None, 2.0, 3.0, 4.0], "b": [None, 2.0, 3.0, 4.0]})
self.assert_eq(
repr(kdf.expanding(2).count().sort_index()), repr(expected_result.sort_index())
)

# MultiIndex columns
kdf = ks.DataFrame(
{"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0]}, index=np.random.rand(4)
)
kdf.columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")])
expected_result = ks.DataFrame(
{"a": [None, 2.0, 3.0, 4.0], "b": [None, 2.0, 3.0, 4.0]},
index=kdf.index.to_pandas(),
)
expected_result.columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")])
self.assert_eq(
repr(kdf.expanding(2).count().sort_index()), repr(expected_result.sort_index())
)

def test_expanding_min(self):
self._test_expanding_func("min")
Expand Down Expand Up @@ -118,7 +159,68 @@ def _test_groupby_expanding_func(self, f):
)

def test_groupby_expanding_count(self):
self._test_groupby_expanding_func("count")
# The behaviour of ExpandingGroupby.count is different between pandas>=1.0.0 and lower versions,
# and we're following the behaviour of the latest version of pandas.
if LooseVersion(pd.__version__) >= LooseVersion("1.0.0"):
self._test_groupby_expanding_func("count")
else:
# Series
kser = ks.Series([1, 2, 3], index=np.random.rand(3))
midx = pd.MultiIndex.from_tuples(
list(zip(kser.to_pandas().values, kser.index.to_pandas().values))
)
expected_result = ks.Series([np.nan, np.nan, np.nan], index=midx)
self.assert_eq(
kser.groupby(kser).expanding(2).count().sort_index(),
expected_result.sort_index(),
almost=True,
)
# MultiIndex
kser = ks.Series(
[1, 2, 3], index=pd.MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "z")])
)
midx = pd.MultiIndex.from_tuples([(1, "a", "x"), (2, "a", "y"), (3, "b", "z")])
expected_result = ks.Series([np.nan, np.nan, np.nan], index=midx)
self.assert_eq(
kser.groupby(kser).expanding(2).count().sort_index(),
expected_result.sort_index(),
almost=True,
)
# DataFrame
kdf = ks.DataFrame({"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0]})
midx = pd.MultiIndex.from_tuples([(1, 0), (2, 1), (2, 3), (3, 2)])
expected_result = ks.DataFrame(
{"a": [None, None, 2.0, None], "b": [None, None, 2.0, None]}, index=midx
)
self.assert_eq(
kdf.groupby(kdf.a).expanding(2).count().sort_index(),
expected_result.sort_index(),
almost=True,
)
# MultiIndex column
kdf = ks.DataFrame({"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0]})
kdf.columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")])
midx = pd.MultiIndex.from_tuples([(1, 0), (2, 1), (2, 3), (3, 2)])
expected_result = ks.DataFrame(
{"a": [None, None, 2.0, None], "b": [None, None, 2.0, None]}, index=midx
)
expected_result.columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")])
self.assert_eq(
kdf.groupby(("a", "x")).expanding(2).count().sort_index(),
expected_result.sort_index(),
almost=True,
)
midx = pd.MultiIndex.from_tuples([(1, 4.0, 0), (2, 1.0, 3), (2, 2.0, 1), (3, 3.0, 2)])
expected_result = ks.DataFrame(
{"a": [np.nan, np.nan, np.nan, np.nan], "b": [np.nan, np.nan, np.nan, np.nan]},
index=midx,
)
expected_result.columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")])
self.assert_eq(
kdf.groupby([("a", "x"), ("a", "y")]).expanding(2).count().sort_index(),
expected_result.sort_index(),
almost=True,
)

def test_groupby_expanding_min(self):
self._test_groupby_expanding_func("min")
Expand Down
23 changes: 19 additions & 4 deletions databricks/koalas/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,9 +168,23 @@ def test_groupby_multiindex_columns(self):
kdf.groupby(("x", "a"))[[("y", "c")]].sum().sort_index(),
pdf.groupby(("x", "a"))[[("y", "c")]].sum().sort_index(),
)
# TODO: seems like a pandas' bug ?
# self.assert_eq(kdf[('x', 'a')].groupby(kdf[('x', 'b')]).sum().sort_index(),
# pdf[('x', 'a')].groupby(pdf[('x', 'b')]).sum().sort_index())
# TODO: seems like a pandas bug. It works fine in Koalas, as shown below.
# >>> pdf[('x', 'a')].groupby(pdf[('x', 'b')]).sum().sort_index()
# Traceback (most recent call last):
# ...
# ValueError: Can only tuple-index with a MultiIndex
# >>> kdf[('x', 'a')].groupby(kdf[('x', 'b')]).sum().sort_index()
# (x, b)
# 1 13
# 2 9
# 3 8
# 4 1
# 7 6
# Name: (x, a), dtype: int64
expected_result = ks.Series(
[13, 9, 8, 1, 6], name=("x", "a"), index=pd.Index([1, 2, 3, 4, 7], name=("x", "b"))
)
self.assert_eq(kdf[("x", "a")].groupby(kdf[("x", "b")]).sum().sort_index(), expected_result)

def test_split_apply_combine_on_series(self):
pdf = pd.DataFrame(
Expand Down Expand Up @@ -1029,7 +1043,8 @@ def test_shift(self):
pdf.groupby(["a", "b"])["c"].shift().sort_index(),
almost=True,
)
# TODO: seems like a pandas' bug when fill_value is not None when only pandas>=1.0.0
# TODO: known pandas bug when fill_value is not None in pandas>=1.0.0
# https://github.com/pandas-dev/pandas/issues/31971#issue-565171762
if LooseVersion(pd.__version__) < LooseVersion("1.0.0"):
self.assert_eq(
kdf.groupby(["b"])[["a", "c"]].shift(periods=-1, fill_value=0).sort_index(),
Expand Down
35 changes: 24 additions & 11 deletions databricks/koalas/tests/test_indexes.py
Original file line number Diff line number Diff line change
Expand Up @@ -743,7 +743,9 @@ def test_argmax(self):
kidx.argmax()

def test_monotonic(self):
# test monotonic_increasing & monotonic_decreasing for MultiIndex
# Test monotonic_increasing & monotonic_decreasing for MultiIndex.
# Since the behavior for null values was changed in pandas >= 1.0.0,
# several cases are tested differently.
datas = []

# increasing / decreasing ordered each index level with string
Expand Down Expand Up @@ -772,40 +774,51 @@ def test_monotonic(self):
datas.append([(-5, "e"), (-3, "d"), (-2, "c"), (-4, "b"), (-1, "a")])
datas.append([(-5, "e"), (-4, "c"), (-3, "b"), (-2, "d"), (-1, "a")])

# None type tests (None type is treated as the largest value)
# TODO: the commented tests below should be uncommented after fixing for pandas >= 1.0.0
# datas.append([(None, 100), (2, 200), (3, 300), (4, 400), (5, 500)])
# None type tests (None type is treated as the smallest value)
datas.append([(1, 100), (2, 200), (None, 300), (4, 400), (5, 500)])
# datas.append([(1, 100), (2, 200), (3, 300), (4, 400), (None, 500)])
datas.append([(5, None), (4, 200), (3, 300), (2, 400), (1, 500)])
datas.append([(5, 100), (4, 200), (3, None), (2, 400), (1, 500)])
datas.append([(5, 100), (4, 200), (3, 300), (2, 400), (1, None)])
# datas.append([(None, None), (2, 200), (3, 300), (4, 400), (5, 500)])
datas.append([(1, 100), (2, 200), (None, None), (4, 400), (5, 500)])
# datas.append([(1, 100), (2, 200), (3, 300), (4, 400), (None, None)])
datas.append([(-5, None), (-4, None), (-3, None), (-2, None), (-1, None)])
datas.append([(None, "e"), (None, "c"), (None, "b"), (None, "d"), (None, "a")])
datas.append([(None, None), (None, None), (None, None), (None, None), (None, None)])

# duplicated index value tests
# TODO: the commented test below should be uncommented after fixing for pandas >= 1.0.0
datas.append([("x", "d"), ("y", "c"), ("y", "b"), ("z", "a")])
datas.append([("x", "d"), ("y", "b"), ("y", "c"), ("z", "a")])
datas.append([("x", "d"), ("y", "c"), ("y", None), ("z", "a")])
# datas.append([('x', 'd'), ('y', None), ('y', 'c'), ('z', 'a')])
datas.append([("x", "d"), ("y", None), ("y", None), ("z", "a")])
datas.append([("x", "d"), ("y", "c"), ("y", "b"), (None, "a")])
datas.append([("x", "d"), ("y", "b"), ("y", "c"), (None, "a")])

# more depth tests
datas.append([("x", "d", "o"), ("y", "c", "p"), ("y", "c", "q"), ("z", "a", "r")])
datas.append([("x", "d", "o"), ("y", "c", "q"), ("y", "c", "p"), ("z", "a", "r")])
# datas.append([('x', 'd', 'o'), ('y', 'c', None), ('y', 'c', 'q'), ('z', 'a', 'r')])
datas.append([("x", "d", "o"), ("y", "c", "p"), ("y", "c", None), ("z", "a", "r")])
datas.append([("x", "d", "o"), ("y", "c", None), ("y", "c", None), ("z", "a", "r")])

for i, data in enumerate(datas):
for data in datas:
kmidx = ks.MultiIndex.from_tuples(data)
pmidx = kmidx.to_pandas()
self.assert_eq(kmidx.is_monotonic_increasing, pmidx.is_monotonic_increasing)
self.assert_eq(kmidx.is_monotonic_decreasing, pmidx.is_monotonic_decreasing)

# The data below show different results depending on the pandas version,
# because the behavior of handling null values changed in pandas >= 1.0.0.
datas = []
datas.append([(None, 100), (2, 200), (3, 300), (4, 400), (5, 500)])
datas.append([(1, 100), (2, 200), (3, 300), (4, 400), (None, 500)])
datas.append([(None, None), (2, 200), (3, 300), (4, 400), (5, 500)])
datas.append([(1, 100), (2, 200), (3, 300), (4, 400), (None, None)])
datas.append([("x", "d"), ("y", None), ("y", "c"), ("z", "a")])
datas.append([("x", "d", "o"), ("y", "c", None), ("y", "c", "q"), ("z", "a", "r")])

for data in datas:
kmidx = ks.MultiIndex.from_tuples(data)
pmidx = kmidx.to_pandas()
expected_increasing_result = pmidx.is_monotonic_increasing
if LooseVersion(pd.__version__) < LooseVersion("1.0.0"):
expected_increasing_result = not expected_increasing_result
self.assert_eq(kmidx.is_monotonic_increasing, expected_increasing_result)
self.assert_eq(kmidx.is_monotonic_decreasing, pmidx.is_monotonic_decreasing)
8 changes: 6 additions & 2 deletions databricks/koalas/tests/test_series_conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
#


from distutils.version import LooseVersion

import pandas as pd

from databricks import koalas as ks
Expand Down Expand Up @@ -53,6 +55,8 @@ def test_to_latex(self):
self.assert_eq(kser.to_latex(sparsify=False), pser.to_latex(sparsify=False))
self.assert_eq(kser.to_latex(index_names=False), pser.to_latex(index_names=False))
self.assert_eq(kser.to_latex(bold_rows=True), pser.to_latex(bold_rows=True))
# Error in pandas - ValueError: buf is not a file name and encoding is specified.
# self.assert_eq(kser.to_latex(encoding='ascii'), pser.to_latex(encoding='ascii'))
# Can't specify `encoding` without specifying `buf` as a filename in pandas >= 1.0.0
# https://github.com/pandas-dev/pandas/blob/master/pandas/io/formats/format.py#L492-L495
if LooseVersion(pd.__version__) < LooseVersion("1.0.0"):
self.assert_eq(kser.to_latex(encoding="ascii"), pser.to_latex(encoding="ascii"))
self.assert_eq(kser.to_latex(decimal=","), pser.to_latex(decimal=","))
5 changes: 2 additions & 3 deletions databricks/koalas/tests/test_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,9 +88,8 @@ def test_axis_on_dataframe(self):
self.assert_eq(kdf.count(axis=1), pdf.count(axis=1))
self.assert_eq(kdf.var(axis=1), pdf.var(axis=1))
self.assert_eq(kdf.std(axis=1), pdf.std(axis=1))
# TODO: `max` & `min` with `axis=1` now raise error in pandas 1.0.0
# self.assert_eq(kdf.max(axis=1), pdf.max(axis=1))
# self.assert_eq(kdf.min(axis=1), pdf.min(axis=1))
self.assert_eq(kdf.max(axis=1), pdf.max(axis=1))
self.assert_eq(kdf.min(axis=1), pdf.min(axis=1))
self.assert_eq(kdf.sum(axis=1), pdf.sum(axis=1))
self.assert_eq(kdf.kurtosis(axis=1), pdf.kurtosis(axis=1))
self.assert_eq(kdf.skew(axis=1), pdf.skew(axis=1))
Expand Down
21 changes: 8 additions & 13 deletions databricks/koalas/window.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
#
from functools import partial
from typing import Any
from distutils.version import LooseVersion

import pandas as pd

Expand Down Expand Up @@ -59,17 +58,6 @@ def count(self):
def count(scol):
return F.count(scol).over(self._window)

if LooseVersion(pd.__version__) >= LooseVersion("1.0.0"):
if isinstance(self, (Expanding, ExpandingGroupby)):

def count_expanding(scol):
return F.when(
F.row_number().over(self._unbounded_window) >= self._min_periods,
F.count(scol).over(self._window),
).otherwise(F.lit(None))

return self._apply_as_series_or_frame(count_expanding).astype("float64")

return self._apply_as_series_or_frame(count).astype("float64")

def sum(self):
Expand Down Expand Up @@ -1115,7 +1103,14 @@ def count(self):
2 2.0
3 3.0
"""
return super(Expanding, self).count()

def count(scol):
return F.when(
F.row_number().over(self._unbounded_window) >= self._min_periods,
F.count(scol).over(self._window),
).otherwise(F.lit(None))

return self._apply_as_series_or_frame(count).astype("float64")

def sum(self):
"""
Expand Down

0 comments on commit 84da886

Please sign in to comment.