DEPR: Deprecate str.split return_type #9847

pandas-dev · May 9, 2015 · 8b89842 · 8b89842
1 parent 5c595f3
commit 8b89842
Show file tree

Hide file tree

Showing 4 changed files with 125 additions and 40 deletions.
diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt
@@ -221,6 +221,28 @@ enhancements are performed to make string operation easier.
      idx.str.startswith('a')
      s[s.index.str.startswith('a')]
 
+
+- ``split`` now takes an ``expand`` keyword to specify whether to expand dimensionality. ``return_type`` is deprecated. (:issue:`9847`)
+
+  .. ipython:: python
+
+     s = Series(['a,b', 'a,c', 'b,c'])
+
+     # return Series
+     s.str.split(',')
+
+     # return DataFrame
+     s.str.split(',', expand=True)
+
+     idx = Index(['a,b', 'a,c', 'b,c'])
+
+     # return Index
+     idx.str.split(',')
+
+     # return MultiIndex
+     idx.str.split(',', expand=True)
+
+
 - Improved ``extract`` and ``get_dummies`` methods for ``Index.str`` (:issue:`9980`)
 
 .. _whatsnew_0161.api:
@@ -249,6 +271,13 @@ API changes
 
 - By default, ``read_csv`` and ``read_table`` will now try to infer the compression type based on the file extension. Set ``compression=None`` to restore the previous behavior (no decompression). (:issue:`9770`)
 
+.. _whatsnew_0161.deprecations:
+
+Deprecations
+^^^^^^^^^^^^
+
+- ``Series.str.split``'s ``return_type`` keyword was deprecated in favor of ``expand`` (:issue:`9847`)
+
 .. _whatsnew_0161.performance:
 
 Performance Improvements

diff --git a/pandas/core/strings.py b/pandas/core/strings.py
@@ -3,7 +3,7 @@
 from pandas.compat import zip
 from pandas.core.common import isnull, _values_from_object, is_bool_dtype
 import pandas.compat as compat
-from pandas.util.decorators import Appender
+from pandas.util.decorators import Appender, deprecate_kwarg
 import re
 import pandas.lib as lib
 import warnings
@@ -696,7 +696,7 @@ def str_pad(arr, width, side='left', fillchar=' '):
     return _na_map(f, arr)
 
 
-def str_split(arr, pat=None, n=None, return_type='series'):
+def str_split(arr, pat=None, n=None):
     """
     Split each string (a la re.split) in the Series/Index by given
     pattern, propagating NA values. Equivalent to :meth:`str.split`.
@@ -705,29 +705,17 @@ def str_split(arr, pat=None, n=None, return_type='series'):
     ----------
     pat : string, default None
         String or regular expression to split on. If None, splits on whitespace
-    n : int, default None (all)
-    return_type : {'series', 'index', 'frame'}, default 'series'
-        If frame, returns a DataFrame (elements are strings)
-        If series or index, returns the same type as the original object
-        (elements are lists of strings).
-
-    Notes
-    -----
-    Both 0 and -1 will be interpreted as return all splits
+    n : int, default -1 (all)
+        None, 0 and -1 will be interpreted as return all splits
+    expand : bool, default False
+        * If True, return DataFrame/MultiIndex expanding dimensionality.
+        * If False, return Series/Index.
+    return_type : deprecated, use `expand`
 
     Returns
     -------
-    split : Series/Index of objects or DataFrame
+    split : Series/Index or DataFrame/MultiIndex of objects
     """
-    from pandas.core.series import Series
-    from pandas.core.frame import DataFrame
-    from pandas.core.index import Index
-
-    if return_type not in ('series', 'index', 'frame'):
-        raise ValueError("return_type must be {'series', 'index', 'frame'}")
-    if return_type == 'frame' and isinstance(arr, Index):
-        raise ValueError("return_type='frame' is not supported for string "
-                         "methods on Index")
     if pat is None:
         if n is None or n == 0:
             n = -1
@@ -742,10 +730,7 @@ def str_split(arr, pat=None, n=None, return_type='series'):
                 n = 0
             regex = re.compile(pat)
             f = lambda x: regex.split(x, maxsplit=n)
-    if return_type == 'frame':
-        res = DataFrame((Series(x) for x in _na_map(f, arr)), index=arr.index)
-    else:
-        res = _na_map(f, arr)
+    res = _na_map(f, arr)
     return res
 
 
@@ -1083,7 +1068,10 @@ def _wrap_result(self, result, **kwargs):
             return DataFrame(result, index=self.series.index)
 
     def _wrap_result_expand(self, result, expand=False):
-        from pandas.core.index import Index
+        if not isinstance(expand, bool):
+            raise ValueError("expand must be True or False")
+
+        from pandas.core.index import Index, MultiIndex
         if not hasattr(result, 'ndim'):
             return result
 
@@ -1096,7 +1084,9 @@ def _wrap_result_expand(self, result, expand=False):
 
             if expand:
                 result = list(result)
-            return Index(result, name=name)
+                return MultiIndex.from_tuples(result, names=name)
+            else:
+                return Index(result, name=name)
         else:
             index = self.series.index
             if expand:
@@ -1114,10 +1104,12 @@ def cat(self, others=None, sep=None, na_rep=None):
         result = str_cat(self.series, others=others, sep=sep, na_rep=na_rep)
         return self._wrap_result(result)
 
+    @deprecate_kwarg('return_type', 'expand',
+                     mapping={'series': False, 'frame': True})
     @copy(str_split)
-    def split(self, pat=None, n=-1, return_type='series'):
-        result = str_split(self.series, pat, n=n, return_type=return_type)
-        return self._wrap_result(result)
+    def split(self, pat=None, n=-1, expand=False):
+        result = str_split(self.series, pat, n=n)
+        return self._wrap_result_expand(result, expand=expand)
 
     _shared_docs['str_partition'] = ("""
     Split the string at the %(side)s occurrence of `sep`, and return 3 elements
@@ -1131,7 +1123,7 @@ def split(self, pat=None, n=-1, return_type='series'):
         String to split on.
     expand : bool, default True
         * If True, return DataFrame/MultiIndex expanding dimensionality.
-        * If False, return Series/Index
+        * If False, return Series/Index.
 
     Returns
     -------

diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py
@@ -1280,11 +1280,12 @@ def test_str_attribute(self):
         idx = Index(['a b c', 'd e', 'f'])
         expected = Index([['a', 'b', 'c'], ['d', 'e'], ['f']])
         tm.assert_index_equal(idx.str.split(), expected)
-        tm.assert_index_equal(idx.str.split(return_type='series'), expected)
-        # return_type 'index' is an alias for 'series'
-        tm.assert_index_equal(idx.str.split(return_type='index'), expected)
-        with self.assertRaisesRegexp(ValueError, 'not supported'):
-            idx.str.split(return_type='frame')
+        tm.assert_index_equal(idx.str.split(expand=False), expected)
+
+        expected = MultiIndex.from_tuples([('a', 'b', 'c'),
+                                           ('d', 'e', np.nan),
+                                           ('f', np.nan, np.nan)])
+        tm.assert_index_equal(idx.str.split(expand=True), expected)
 
         # test boolean case, should return np.array instead of boolean Index
         idx = Index(['a1', 'a2', 'b1', 'b2'])

diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
@@ -1206,14 +1206,19 @@ def test_split(self):
         result = values.str.split('__')
         tm.assert_series_equal(result, exp)
 
+        result = values.str.split('__', expand=False)
+        tm.assert_series_equal(result, exp)
+
         # mixed
         mixed = Series(['a_b_c', NA, 'd_e_f', True, datetime.today(),
                         None, 1, 2.])
-
-        rs = Series(mixed).str.split('_')
+        rs = mixed.str.split('_')
         xp = Series([['a', 'b', 'c'], NA, ['d', 'e', 'f'], NA, NA,
                      NA, NA, NA])
+        tm.assert_isinstance(rs, Series)
+        tm.assert_almost_equal(rs, xp)
 
+        rs = mixed.str.split('_', expand=False)
         tm.assert_isinstance(rs, Series)
         tm.assert_almost_equal(rs, xp)
 
@@ -1226,6 +1231,9 @@ def test_split(self):
                       [u('f'), u('g'), u('h')]])
         tm.assert_series_equal(result, exp)
 
+        result = values.str.split('_', expand=False)
+        tm.assert_series_equal(result, exp)
+
     def test_split_noargs(self):
         # #1859
         s = Series(['Wes McKinney', 'Travis  Oliphant'])
@@ -1259,7 +1267,10 @@ def test_split_no_pat_with_nonzero_n(self):
 
     def test_split_to_dataframe(self):
         s = Series(['nosplit', 'alsonosplit'])
-        result = s.str.split('_', return_type='frame')
+
+        with tm.assert_produces_warning():
+            result = s.str.split('_', return_type='frame')
+
         exp = DataFrame({0: Series(['nosplit', 'alsonosplit'])})
         tm.assert_frame_equal(result, exp)
 
@@ -1282,9 +1293,61 @@ def test_split_to_dataframe(self):
                         index=['preserve', 'me'])
         tm.assert_frame_equal(result, exp)
 
-        with tm.assertRaisesRegexp(ValueError, "return_type must be"):
+        with tm.assertRaisesRegexp(ValueError, "expand must be"):
+            s.str.split('_', return_type="some_invalid_type")
+
+    def test_split_to_dataframe_expand(self):
+        s = Series(['nosplit', 'alsonosplit'])
+        result = s.str.split('_', expand=True)
+        exp = DataFrame({0: Series(['nosplit', 'alsonosplit'])})
+        tm.assert_frame_equal(result, exp)
+
+        s = Series(['some_equal_splits', 'with_no_nans'])
+        result = s.str.split('_', expand=True)
+        exp = DataFrame({0: ['some', 'with'], 1: ['equal', 'no'],
+                         2: ['splits', 'nans']})
+        tm.assert_frame_equal(result, exp)
+
+        s = Series(['some_unequal_splits', 'one_of_these_things_is_not'])
+        result = s.str.split('_', expand=True)
+        exp = DataFrame({0: ['some', 'one'], 1: ['unequal', 'of'],
+                         2: ['splits', 'these'], 3: [NA, 'things'],
+                         4: [NA, 'is'], 5: [NA, 'not']})
+        tm.assert_frame_equal(result, exp)
+
+        s = Series(['some_splits', 'with_index'], index=['preserve', 'me'])
+        result = s.str.split('_', expand=True)
+        exp = DataFrame({0: ['some', 'with'], 1: ['splits', 'index']},
+                        index=['preserve', 'me'])
+        tm.assert_frame_equal(result, exp)
+
+        with tm.assertRaisesRegexp(ValueError, "expand must be"):
             s.str.split('_', return_type="some_invalid_type")
 
+    def test_split_to_multiindex_expand(self):
+        idx = Index(['nosplit', 'alsonosplit'])
+        result = idx.str.split('_', expand=True)
+        exp = Index([np.array(['nosplit']), np.array(['alsonosplit'])])
+        tm.assert_index_equal(result, exp)
+        self.assertEqual(result.nlevels, 1)
+
+        idx = Index(['some_equal_splits', 'with_no_nans'])
+        result = idx.str.split('_', expand=True)
+        exp = MultiIndex.from_tuples([('some', 'equal', 'splits'),
+                                      ('with', 'no', 'nans')])
+        tm.assert_index_equal(result, exp)
+        self.assertEqual(result.nlevels, 3)
+
+        idx = Index(['some_unequal_splits', 'one_of_these_things_is_not'])
+        result = idx.str.split('_', expand=True)
+        exp = MultiIndex.from_tuples([('some', 'unequal', 'splits', NA, NA, NA),
+                                      ('one', 'of', 'these', 'things', 'is', 'not')])
+        tm.assert_index_equal(result, exp)
+        self.assertEqual(result.nlevels, 6)
+
+        with tm.assertRaisesRegexp(ValueError, "expand must be"):
+            idx.str.split('_', return_type="some_invalid_type")
+
     def test_partition_series(self):
         values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h'])