Skip to content

Commit

Permalink
ENH: Add StringMethods.partition and rpartition
Browse files Browse the repository at this point in the history
  • Loading branch information
sinhrks committed May 3, 2015
1 parent 8f0f417 commit b6b05a9
Show file tree
Hide file tree
Showing 5 changed files with 216 additions and 0 deletions.
2 changes: 2 additions & 0 deletions doc/source/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -541,10 +541,12 @@ strings and apply several methods to it. These can be accessed like
Series.str.lstrip
Series.str.match
Series.str.pad
Series.str.partition
Series.str.repeat
Series.str.replace
Series.str.rfind
Series.str.rjust
Series.str.rpartition
Series.str.rstrip
Series.str.slice
Series.str.slice_replace
Expand Down
2 changes: 2 additions & 0 deletions doc/source/text.rst
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,8 @@ Method Summary
:meth:`~Series.str.strip`,Equivalent to ``str.strip``
:meth:`~Series.str.rstrip`,Equivalent to ``str.rstrip``
:meth:`~Series.str.lstrip`,Equivalent to ``str.lstrip``
:meth:`~Series.str.partition`,Equivalent to ``str.partition``
:meth:`~Series.str.rpartition`,Equivalent to ``str.rpartition``
:meth:`~Series.str.lower`,Equivalent to ``str.lower``
:meth:`~Series.str.upper`,Equivalent to ``str.upper``
:meth:`~Series.str.find`,Equivalent to ``str.find``
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.16.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ Enhancements
- Added ``StringMethods.capitalize()`` and ``swapcase()`` which behave the same as the standard ``str`` methods (:issue:`9766`)
- ``DataFrame.diff`` now takes an ``axis`` parameter that determines the direction of differencing (:issue:`9727`)
- Added ``StringMethods`` (.str accessor) to ``Index`` (:issue:`9068`)
- Added ``StringMethods.partition()`` and ``rpartition()`` which behave the same as the standard ``str`` methods (:issue:`9773`)
- Allow clip, clip_lower, and clip_upper to accept array-like arguments as thresholds (:issue:`6966`). These methods now have an ``axis`` parameter which determines how the Series or DataFrame will be aligned with the threshold(s).

The ``.str`` accessor is now available for both ``Series`` and ``Index``.
Expand Down
90 changes: 90 additions & 0 deletions pandas/core/strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -962,6 +962,8 @@ def __iter__(self):
g = self.get(i)

def _wrap_result(self, result):
# leave as it is to keep extract and get_dummies results
# can be merged to _wrap_result_expand in v0.17
from pandas.core.series import Series
from pandas.core.frame import DataFrame
from pandas.core.index import Index
Expand All @@ -982,6 +984,34 @@ def _wrap_result(self, result):
assert result.ndim < 3
return DataFrame(result, index=self.series.index)

def _wrap_result_expand(self, result, expand=False):
    """Wrap a raw string-method result in a pandas container.

    Parameters
    ----------
    result : object
        Output of the element-wise string operation. Anything without an
        ``ndim`` attribute is returned untouched.
    expand : bool, default False
        If True, each element of ``result`` is spread over its own
        dimension; otherwise the result keeps the dimensionality of the
        wrapped object.

    Returns
    -------
    The unmodified ``result`` (no ``ndim``, or a boolean result computed
    from an Index), an ``Index`` built from ``result``, or an object built
    with the wrapped series' ``_constructor`` /
    ``_constructor_expanddim``.
    """
    from pandas.core.index import Index

    if not hasattr(result, 'ndim'):
        return result

    if not isinstance(self.series, Index):
        index = self.series.index
        if not expand:
            name = getattr(result, 'name', None)
            same_dim = self.series._constructor
            return same_dim(result, name=name, index=index)

        # Build one row object per element, then stack the rows with the
        # higher-dimensional constructor, keeping the original index.
        make_row = self.series._constructor
        make_expanded = self.series._constructor_expanddim
        rows = [make_row(element) for element in result]
        return make_expanded(rows, index=index)

    name = getattr(result, 'name', None)
    # if result is a boolean np.array, return the np.array
    # instead of wrapping it into a boolean Index (GH 8875)
    if hasattr(result, 'dtype') and is_bool_dtype(result):
        return result

    # With expand=True the elements are handed over as a plain list.
    values = list(result) if expand else result
    return Index(values, name=name)

@copy(str_cat)
def cat(self, others=None, sep=None, na_rep=None):
result = str_cat(self.series, others=others, sep=sep, na_rep=na_rep)
Expand All @@ -992,6 +1022,65 @@ def split(self, pat=None, n=-1, return_type='series'):
result = str_split(self.series, pat, n=n, return_type=return_type)
return self._wrap_result(result)

# Docstring template shared by ``partition`` and ``rpartition``; the
# ``%(side)s``, ``%(return)s`` and ``%(also)s`` placeholders are filled in by
# each method's ``Appender`` decorator.
_shared_docs['str_partition'] = ("""
Split the string at the %(side)s occurrence of the separator `pat`, and
return a 3-tuple containing the part before the separator, the separator
itself, and the part after the separator.
If the separator is not found, return %(return)s.

Parameters
----------
pat : string, default ' ' (a single space)
    String to split on.
expand : bool, default True
    * If True, return DataFrame/MultiIndex expanding dimensionality.
    * If False, return Series/Index.

Returns
-------
split : DataFrame/MultiIndex or Series/Index

See Also
--------
%(also)s

Examples
--------
>>> s = Series(['A_B_C', 'D_E_F', 'X'])
>>> s
0    A_B_C
1    D_E_F
2        X
dtype: object

>>> s.str.partition('_')
   0  1    2
0  A  _  B_C
1  D  _  E_F
2  X

>>> s.str.rpartition('_')
     0  1  2
0  A_B  _  C
1  D_E  _  F
2          X
""")
@Appender(_shared_docs['str_partition'] % {'side': 'first',
          'return': 'a 3-tuple containing the string itself, followed by two empty strings',
          'also': 'rpartition : Split the string at the last occurrence of sep'})
def partition(self, pat=' ', expand=True):
    # The user-facing docstring comes from the Appender decorator above, so
    # no inline docstring is added here (it would be prepended to the
    # shared text).
    #
    # Each element is split with the built-in ``str.partition``; the mapped
    # result is then wrapped according to ``expand``.
    # NOTE: the stray debugging ``print`` that used to sit here wrote to
    # stdout on every call and has been removed.
    f = lambda x: x.partition(pat)
    result = _na_map(f, self.series)
    return self._wrap_result_expand(result, expand=expand)

@Appender(_shared_docs['str_partition'] % {
    'side': 'last',
    'return': 'a 3-tuple containing two empty strings, followed by the string itself',
    'also': 'partition : Split the string at the first occurrence of sep'})
def rpartition(self, pat=' ', expand=True):
    # Docstring is provided by the Appender decorator above.

    def _split_at_last(text):
        # Delegate to the built-in ``str.rpartition`` for a single element.
        return text.rpartition(pat)

    parts = _na_map(_split_at_last, self.series)
    return self._wrap_result_expand(parts, expand=expand)

@copy(str_get)
def get(self, i):
result = str_get(self.series, i)
Expand Down Expand Up @@ -1124,6 +1213,7 @@ def get_dummies(self, sep='|'):
# Pattern-based accessor methods generated from the module-level ``str_*``
# functions via ``_pat_wrapper``. Each name is bound exactly once.
startswith = _pat_wrapper(str_startswith, na=True)
endswith = _pat_wrapper(str_endswith, na=True)
findall = _pat_wrapper(str_findall, flags=True)
extract = _pat_wrapper(str_extract, flags=True)

_shared_docs['find'] = ("""
Expand Down
121 changes: 121 additions & 0 deletions pandas/tests/test_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -664,6 +664,8 @@ def test_empty_str_methods(self):
tm.assert_series_equal(empty_str, empty.str.pad(42))
tm.assert_series_equal(empty_str, empty.str.center(42))
tm.assert_series_equal(empty_list, empty.str.split('a'))
tm.assert_series_equal(empty_list, empty.str.partition('a', expand=False))
tm.assert_series_equal(empty_list, empty.str.rpartition('a', expand=False))
tm.assert_series_equal(empty_str, empty.str.slice(stop=1))
tm.assert_series_equal(empty_str, empty.str.slice(step=1))
tm.assert_series_equal(empty_str, empty.str.strip())
Expand All @@ -686,6 +688,12 @@ def test_empty_str_methods(self):
tm.assert_series_equal(empty_str, empty.str.capitalize())
tm.assert_series_equal(empty_str, empty.str.swapcase())

def test_empty_str_methods_to_frame(self):
    # With the default expand=True, partition/rpartition on an empty
    # Series are expected to produce an empty DataFrame.
    empty = Series(dtype=str)
    empty_df = DataFrame([])
    tm.assert_frame_equal(empty_df, empty.str.partition('a'))
    tm.assert_frame_equal(empty_df, empty.str.rpartition('a'))

def test_ismethods(self):
values = ['A', 'b', 'Xy', '4', '3A', '', 'TT', '55', '-', ' ']
str_s = Series(values)
Expand Down Expand Up @@ -1174,6 +1182,119 @@ def test_split_to_dataframe(self):
with tm.assertRaisesRegexp(ValueError, "return_type must be"):
s.str.split('_', return_type="some_invalid_type")

def test_partition_series(self):
    # partition/rpartition with expand=False keep a Series whose elements
    # hold the three parts; NA entries stay NA.

    def check(data, sep_args, first_split, last_split):
        # sep_args is the positional separator tuple; () exercises the
        # default separator.
        ser = Series(data)

        got = ser.str.partition(*sep_args, expand=False)
        expected = Series(first_split)
        tm.assert_series_equal(got, expected)

        got = ser.str.rpartition(*sep_args, expand=False)
        expected = Series(last_split)
        tm.assert_series_equal(got, expected)

    # single-character separator
    check(['a_b_c', 'c_d_e', NA, 'f_g_h'],
          ('_',),
          [['a', '_', 'b_c'], ['c', '_', 'd_e'], NA, ['f', '_', 'g_h']],
          [['a_b', '_', 'c'], ['c_d', '_', 'e'], NA, ['f_g', '_', 'h']])

    # more than one char
    check(['a__b__c', 'c__d__e', NA, 'f__g__h'],
          ('__',),
          [['a', '__', 'b__c'], ['c', '__', 'd__e'], NA,
           ['f', '__', 'g__h']],
          [['a__b', '__', 'c'], ['c__d', '__', 'e'], NA,
           ['f__g', '__', 'h']])

    # no separator passed: the default (a single space) is used
    check(['a b c', 'c d e', NA, 'f g h'],
          (),
          [['a', ' ', 'b c'], ['c', ' ', 'd e'], NA, ['f', ' ', 'g h']],
          [['a b', ' ', 'c'], ['c d', ' ', 'e'], NA, ['f g', ' ', 'h']])

    # separator not present in the strings
    check(['abc', 'cde', NA, 'fgh'],
          ('_',),
          [['abc', '', ''], ['cde', '', ''], NA, ['fgh', '', '']],
          [['', '', 'abc'], ['', '', 'cde'], NA, ['', '', 'fgh']])

    # unicode
    check([u('a_b_c'), u('c_d_e'), NA, u('f_g_h')],
          ('_',),
          [[u('a'), u('_'), u('b_c')], [u('c'), u('_'), u('d_e')],
           NA, [u('f'), u('_'), u('g_h')]],
          [[u('a_b'), u('_'), u('c')], [u('c_d'), u('_'), u('e')],
           NA, [u('f_g'), u('_'), u('h')]])

    # compare to standard lib
    ser = Series(['A_B_C', 'B_C_D', 'E_F_G', 'EFGHEF'])
    got = ser.str.partition('_', expand=False).tolist()
    self.assertEqual(got, [item.partition('_') for item in ser])
    got = ser.str.rpartition('_', expand=False).tolist()
    self.assertEqual(got, [item.rpartition('_') for item in ser])

def test_partition_index(self):
    # On an Index, expand=False keeps a single-level Index of tuples while
    # the default (expand=True) yields a three-level MultiIndex.
    idx = Index(['a_b_c', 'c_d_e', 'f_g_h'])
    first_split = [('a', '_', 'b_c'), ('c', '_', 'd_e'), ('f', '_', 'g_h')]
    last_split = [('a_b', '_', 'c'), ('c_d', '_', 'e'), ('f_g', '_', 'h')]
    cases = (('partition', first_split), ('rpartition', last_split))

    for method_name, tuples in cases:
        got = getattr(idx.str, method_name)('_', expand=False)
        expected = Index(np.array(tuples))
        tm.assert_index_equal(got, expected)
        self.assertEqual(got.nlevels, 1)

    for method_name, tuples in cases:
        got = getattr(idx.str, method_name)('_')
        expected = Index(tuples)
        tm.assert_index_equal(got, expected)
        self.assertTrue(isinstance(got, MultiIndex))
        self.assertEqual(got.nlevels, 3)

def test_partition_to_dataframe(self):
    # The default call and an explicit expand=True must both give a
    # three-column DataFrame, with NA rows filled with NaN.
    for kwargs in ({}, {'expand': True}):
        ser = Series(['a_b_c', 'c_d_e', NA, 'f_g_h'])

        got = ser.str.partition('_', **kwargs)
        expected = DataFrame({0: ['a', 'c', np.nan, 'f'],
                              1: ['_', '_', np.nan, '_'],
                              2: ['b_c', 'd_e', np.nan, 'g_h']})
        tm.assert_frame_equal(got, expected)

        got = ser.str.rpartition('_', **kwargs)
        expected = DataFrame({0: ['a_b', 'c_d', np.nan, 'f_g'],
                              1: ['_', '_', np.nan, '_'],
                              2: ['c', 'e', np.nan, 'h']})
        tm.assert_frame_equal(got, expected)

def test_pipe_failures(self):
# #2119
s = Series(['A|B|C'])
Expand Down

0 comments on commit b6b05a9

Please sign in to comment.