Skip to content

Commit

Permalink
DEPR: Deprecate str.split return_type #9847
Browse files Browse the repository at this point in the history
  • Loading branch information
sinhrks authored and jreback committed May 9, 2015
1 parent 5c595f3 commit 8b89842
Show file tree
Hide file tree
Showing 4 changed files with 125 additions and 40 deletions.
29 changes: 29 additions & 0 deletions doc/source/whatsnew/v0.16.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,28 @@ enhancements are performed to make string operation easier.
idx.str.startswith('a')
s[s.index.str.startswith('a')]


- ``split`` now takes an ``expand`` keyword to specify whether to expand dimensionality. The ``return_type`` keyword is deprecated in favor of ``expand``. (:issue:`9847`)

.. ipython:: python

s = Series(['a,b', 'a,c', 'b,c'])

# return Series
s.str.split(',')

# return DataFrame
s.str.split(',', expand=True)

idx = Index(['a,b', 'a,c', 'b,c'])

# return Index
idx.str.split(',')

# return MultiIndex
idx.str.split(',', expand=True)


- Improved ``extract`` and ``get_dummies`` methods for ``Index.str`` (:issue:`9980`)

.. _whatsnew_0161.api:
Expand Down Expand Up @@ -249,6 +271,13 @@ API changes

- By default, ``read_csv`` and ``read_table`` will now try to infer the compression type based on the file extension. Set ``compression=None`` to restore the previous behavior (no decompression). (:issue:`9770`)

.. _whatsnew_0161.deprecations:

Deprecations
^^^^^^^^^^^^

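- ``Series.str.split``'s ``return_type`` keyword was deprecated in favor of ``expand``; ``return_type='series'`` and ``return_type='frame'`` are still accepted (mapped to ``expand=False`` and ``expand=True`` respectively) but emit a warning (:issue:`9847`)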

.. _whatsnew_0161.performance:

Performance Improvements
Expand Down
54 changes: 23 additions & 31 deletions pandas/core/strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from pandas.compat import zip
from pandas.core.common import isnull, _values_from_object, is_bool_dtype
import pandas.compat as compat
from pandas.util.decorators import Appender
from pandas.util.decorators import Appender, deprecate_kwarg
import re
import pandas.lib as lib
import warnings
Expand Down Expand Up @@ -696,7 +696,7 @@ def str_pad(arr, width, side='left', fillchar=' '):
return _na_map(f, arr)


def str_split(arr, pat=None, n=None, return_type='series'):
def str_split(arr, pat=None, n=None):
"""
Split each string (a la re.split) in the Series/Index by given
pattern, propagating NA values. Equivalent to :meth:`str.split`.
Expand All @@ -705,29 +705,17 @@ def str_split(arr, pat=None, n=None, return_type='series'):
----------
pat : string, default None
String or regular expression to split on. If None, splits on whitespace
n : int, default None (all)
return_type : {'series', 'index', 'frame'}, default 'series'
If frame, returns a DataFrame (elements are strings)
If series or index, returns the same type as the original object
(elements are lists of strings).
Notes
-----
Both 0 and -1 will be interpreted as return all splits
n : int, default -1 (all)
None, 0 and -1 will all be interpreted as returning all splits
expand : bool, default False
* If True, return DataFrame/MultiIndex expanding dimensionality.
* If False, return Series/Index.
return_type : deprecated, use `expand`
Returns
-------
split : Series/Index of objects or DataFrame
split : Series/Index or DataFrame/MultiIndex of objects
"""
from pandas.core.series import Series
from pandas.core.frame import DataFrame
from pandas.core.index import Index

if return_type not in ('series', 'index', 'frame'):
raise ValueError("return_type must be {'series', 'index', 'frame'}")
if return_type == 'frame' and isinstance(arr, Index):
raise ValueError("return_type='frame' is not supported for string "
"methods on Index")
if pat is None:
if n is None or n == 0:
n = -1
Expand All @@ -742,10 +730,7 @@ def str_split(arr, pat=None, n=None, return_type='series'):
n = 0
regex = re.compile(pat)
f = lambda x: regex.split(x, maxsplit=n)
if return_type == 'frame':
res = DataFrame((Series(x) for x in _na_map(f, arr)), index=arr.index)
else:
res = _na_map(f, arr)
res = _na_map(f, arr)
return res


Expand Down Expand Up @@ -1083,7 +1068,10 @@ def _wrap_result(self, result, **kwargs):
return DataFrame(result, index=self.series.index)

def _wrap_result_expand(self, result, expand=False):
from pandas.core.index import Index
if not isinstance(expand, bool):
raise ValueError("expand must be True or False")

from pandas.core.index import Index, MultiIndex
if not hasattr(result, 'ndim'):
return result

Expand All @@ -1096,7 +1084,9 @@ def _wrap_result_expand(self, result, expand=False):

if expand:
result = list(result)
return Index(result, name=name)
return MultiIndex.from_tuples(result, names=name)
else:
return Index(result, name=name)
else:
index = self.series.index
if expand:
Expand All @@ -1114,10 +1104,12 @@ def cat(self, others=None, sep=None, na_rep=None):
result = str_cat(self.series, others=others, sep=sep, na_rep=na_rep)
return self._wrap_result(result)

@deprecate_kwarg('return_type', 'expand',
mapping={'series': False, 'frame': True})
@copy(str_split)
def split(self, pat=None, n=-1, return_type='series'):
result = str_split(self.series, pat, n=n, return_type=return_type)
return self._wrap_result(result)
def split(self, pat=None, n=-1, expand=False):
result = str_split(self.series, pat, n=n)
return self._wrap_result_expand(result, expand=expand)

_shared_docs['str_partition'] = ("""
Split the string at the %(side)s occurrence of `sep`, and return 3 elements
Expand All @@ -1131,7 +1123,7 @@ def split(self, pat=None, n=-1, return_type='series'):
String to split on.
expand : bool, default True
* If True, return DataFrame/MultiIndex expanding dimensionality.
* If False, return Series/Index
* If False, return Series/Index.
Returns
-------
Expand Down
11 changes: 6 additions & 5 deletions pandas/tests/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1280,11 +1280,12 @@ def test_str_attribute(self):
idx = Index(['a b c', 'd e', 'f'])
expected = Index([['a', 'b', 'c'], ['d', 'e'], ['f']])
tm.assert_index_equal(idx.str.split(), expected)
tm.assert_index_equal(idx.str.split(return_type='series'), expected)
# return_type 'index' is an alias for 'series'
tm.assert_index_equal(idx.str.split(return_type='index'), expected)
with self.assertRaisesRegexp(ValueError, 'not supported'):
idx.str.split(return_type='frame')
tm.assert_index_equal(idx.str.split(expand=False), expected)

expected = MultiIndex.from_tuples([('a', 'b', 'c'),
('d', 'e', np.nan),
('f', np.nan, np.nan)])
tm.assert_index_equal(idx.str.split(expand=True), expected)

# test boolean case, should return np.array instead of boolean Index
idx = Index(['a1', 'a2', 'b1', 'b2'])
Expand Down
71 changes: 67 additions & 4 deletions pandas/tests/test_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -1206,14 +1206,19 @@ def test_split(self):
result = values.str.split('__')
tm.assert_series_equal(result, exp)

result = values.str.split('__', expand=False)
tm.assert_series_equal(result, exp)

# mixed
mixed = Series(['a_b_c', NA, 'd_e_f', True, datetime.today(),
None, 1, 2.])

rs = Series(mixed).str.split('_')
rs = mixed.str.split('_')
xp = Series([['a', 'b', 'c'], NA, ['d', 'e', 'f'], NA, NA,
NA, NA, NA])
tm.assert_isinstance(rs, Series)
tm.assert_almost_equal(rs, xp)

rs = mixed.str.split('_', expand=False)
tm.assert_isinstance(rs, Series)
tm.assert_almost_equal(rs, xp)

Expand All @@ -1226,6 +1231,9 @@ def test_split(self):
[u('f'), u('g'), u('h')]])
tm.assert_series_equal(result, exp)

result = values.str.split('_', expand=False)
tm.assert_series_equal(result, exp)

def test_split_noargs(self):
# #1859
s = Series(['Wes McKinney', 'Travis Oliphant'])
Expand Down Expand Up @@ -1259,7 +1267,10 @@ def test_split_no_pat_with_nonzero_n(self):

def test_split_to_dataframe(self):
s = Series(['nosplit', 'alsonosplit'])
result = s.str.split('_', return_type='frame')

with tm.assert_produces_warning():
result = s.str.split('_', return_type='frame')

exp = DataFrame({0: Series(['nosplit', 'alsonosplit'])})
tm.assert_frame_equal(result, exp)

Expand All @@ -1282,9 +1293,61 @@ def test_split_to_dataframe(self):
index=['preserve', 'me'])
tm.assert_frame_equal(result, exp)

with tm.assertRaisesRegexp(ValueError, "return_type must be"):
with tm.assertRaisesRegexp(ValueError, "expand must be"):
s.str.split('_', return_type="some_invalid_type")

def test_split_to_dataframe_expand(self):
s = Series(['nosplit', 'alsonosplit'])
result = s.str.split('_', expand=True)
exp = DataFrame({0: Series(['nosplit', 'alsonosplit'])})
tm.assert_frame_equal(result, exp)

s = Series(['some_equal_splits', 'with_no_nans'])
result = s.str.split('_', expand=True)
exp = DataFrame({0: ['some', 'with'], 1: ['equal', 'no'],
2: ['splits', 'nans']})
tm.assert_frame_equal(result, exp)

s = Series(['some_unequal_splits', 'one_of_these_things_is_not'])
result = s.str.split('_', expand=True)
exp = DataFrame({0: ['some', 'one'], 1: ['unequal', 'of'],
2: ['splits', 'these'], 3: [NA, 'things'],
4: [NA, 'is'], 5: [NA, 'not']})
tm.assert_frame_equal(result, exp)

s = Series(['some_splits', 'with_index'], index=['preserve', 'me'])
result = s.str.split('_', expand=True)
exp = DataFrame({0: ['some', 'with'], 1: ['splits', 'index']},
index=['preserve', 'me'])
tm.assert_frame_equal(result, exp)

with tm.assertRaisesRegexp(ValueError, "expand must be"):
s.str.split('_', return_type="some_invalid_type")

def test_split_to_multiindex_expand(self):
idx = Index(['nosplit', 'alsonosplit'])
result = idx.str.split('_', expand=True)
exp = Index([np.array(['nosplit']), np.array(['alsonosplit'])])
tm.assert_index_equal(result, exp)
self.assertEqual(result.nlevels, 1)

idx = Index(['some_equal_splits', 'with_no_nans'])
result = idx.str.split('_', expand=True)
exp = MultiIndex.from_tuples([('some', 'equal', 'splits'),
('with', 'no', 'nans')])
tm.assert_index_equal(result, exp)
self.assertEqual(result.nlevels, 3)

idx = Index(['some_unequal_splits', 'one_of_these_things_is_not'])
result = idx.str.split('_', expand=True)
exp = MultiIndex.from_tuples([('some', 'unequal', 'splits', NA, NA, NA),
('one', 'of', 'these', 'things', 'is', 'not')])
tm.assert_index_equal(result, exp)
self.assertEqual(result.nlevels, 6)

with tm.assertRaisesRegexp(ValueError, "expand must be"):
idx.str.split('_', return_type="some_invalid_type")

def test_partition_series(self):
values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h'])

Expand Down

0 comments on commit 8b89842

Please sign in to comment.