From 9b45d74c6e347533701d52dfb8370af85c8c78ed Mon Sep 17 00:00:00 2001 From: Bill Letson Date: Mon, 27 Oct 2014 15:37:06 -0700 Subject: [PATCH] ENH: Series.str.split can return a DataFrame instead of Series of lists --- doc/source/whatsnew/v0.15.1.txt | 1 + pandas/core/strings.py | 18 +++++++++++++----- pandas/tests/test_strings.py | 28 ++++++++++++++++++++++++++++ 3 files changed, 42 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.15.1.txt b/doc/source/whatsnew/v0.15.1.txt index c666a19bcd133..ac788967f1c18 100644 --- a/doc/source/whatsnew/v0.15.1.txt +++ b/doc/source/whatsnew/v0.15.1.txt @@ -109,6 +109,7 @@ Enhancements - Added support for 3-character ISO and non-standard country codes in :func:``io.wb.download()`` (:issue:`8482`) - :ref:`World Bank data requests ` now will warn/raise based on an ``errors`` argument, as well as a list of hard-coded country codes and the World Bank's JSON response. In prior versions, the error messages didn't look at the World Bank's JSON response. Problem-inducing input were simply dropped prior to the request. The issue was that many good countries were cropped in the hard-coded approach. All countries will work now, but some bad countries will raise exceptions because some edge cases break the entire response. (:issue:`8482`) +- Added option to ``Series.str.split()`` to return a ``DataFrame`` rather than a ``Series`` (:issue:`8428`) .. 
_whatsnew_0151.performance: diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 2d8b8f8b2edff..78780bc9618f7 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -621,7 +621,7 @@ def str_center(arr, width): return str_pad(arr, width, side='both') -def str_split(arr, pat=None, n=None): +def str_split(arr, pat=None, n=None, return_type='series'): """ Split each string (a la re.split) in array by given pattern, propagating NA values @@ -631,6 +631,9 @@ def str_split(arr, pat=None, n=None): pat : string, default None String or regular expression to split on. If None, splits on whitespace n : int, default None (all) + return_type : {'series', 'frame'}, default 'series' + If frame, returns a DataFrame (elements are strings) + If series, returns a Series (elements are lists of strings). Notes ----- @@ -640,6 +643,8 @@ def str_split(arr, pat=None, n=None): ------- split : array """ + if return_type not in ('series', 'frame'): + raise ValueError("return_type must be {'series', 'frame'}") if pat is None: if n is None or n == 0: n = -1 @@ -654,8 +659,11 @@ def str_split(arr, pat=None, n=None): n = 0 regex = re.compile(pat) f = lambda x: regex.split(x, maxsplit=n) - - return _na_map(f, arr) + if return_type == 'frame': + res = DataFrame((Series(x) for x in _na_map(f, arr)), index=arr.index) + else: + res = _na_map(f, arr) + return res def str_slice(arr, start=None, stop=None, step=1): @@ -937,8 +945,8 @@ def cat(self, others=None, sep=None, na_rep=None): return self._wrap_result(result) @copy(str_split) - def split(self, pat=None, n=-1): - result = str_split(self.series, pat, n=n) + def split(self, pat=None, n=-1, return_type='series'): + result = str_split(self.series, pat, n=n, return_type=return_type) return self._wrap_result(result) @copy(str_get) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 41594a1655d18..02808ebf0b340 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -873,6 
+873,34 @@ def test_split_no_pat_with_nonzero_n(self): expected = Series({0: ['split', 'once'], 1: ['split', 'once too!']}) tm.assert_series_equal(expected, result) + def test_split_to_dataframe(self): + s = Series(['nosplit', 'alsonosplit']) + result = s.str.split('_', return_type='frame') + exp = DataFrame({0: Series(['nosplit', 'alsonosplit'])}) + tm.assert_frame_equal(result, exp) + + s = Series(['some_equal_splits', 'with_no_nans']) + result = s.str.split('_', return_type='frame') + exp = DataFrame({0: ['some', 'with'], 1: ['equal', 'no'], + 2: ['splits', 'nans']}) + tm.assert_frame_equal(result, exp) + + s = Series(['some_unequal_splits', 'one_of_these_things_is_not']) + result = s.str.split('_', return_type='frame') + exp = DataFrame({0: ['some', 'one'], 1: ['unequal', 'of'], + 2: ['splits', 'these'], 3: [NA, 'things'], + 4: [NA, 'is'], 5: [NA, 'not']}) + tm.assert_frame_equal(result, exp) + + s = Series(['some_splits', 'with_index'], index=['preserve', 'me']) + result = s.str.split('_', return_type='frame') + exp = DataFrame({0: ['some', 'with'], 1: ['splits', 'index']}, + index=['preserve', 'me']) + tm.assert_frame_equal(result, exp) + + with tm.assertRaisesRegexp(ValueError, "return_type must be"): + s.str.split('_', return_type="some_invalid_type") + def test_pipe_failures(self): # #2119 s = Series(['A|B|C'])