Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Add StringMethods.partition and rpartition #9773

Merged
merged 1 commit into from
May 7, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -544,10 +544,12 @@ strings and apply several methods to it. These can be accessed like
Series.str.match
Series.str.normalize
Series.str.pad
Series.str.partition
Series.str.repeat
Series.str.replace
Series.str.rfind
Series.str.rjust
Series.str.rpartition
Series.str.rstrip
Series.str.slice
Series.str.slice_replace
Expand Down
2 changes: 2 additions & 0 deletions doc/source/text.rst
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,8 @@ Method Summary
:meth:`~Series.str.strip`,Equivalent to ``str.strip``
:meth:`~Series.str.rstrip`,Equivalent to ``str.rstrip``
:meth:`~Series.str.lstrip`,Equivalent to ``str.lstrip``
:meth:`~Series.str.partition`,Equivalent to ``str.partition``
:meth:`~Series.str.rpartition`,Equivalent to ``str.rpartition``
:meth:`~Series.str.lower`,Equivalent to ``str.lower``
:meth:`~Series.str.upper`,Equivalent to ``str.upper``
:meth:`~Series.str.find`,Equivalent to ``str.find``
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.16.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ Enhancements
- Added ``StringMethods`` (.str accessor) to ``Index`` (:issue:`9068`)
- Added ``StringMethods.normalize()`` which behaves the same as standard :func:`unicodedata.normalize` (:issue:`10031`)

- Added ``StringMethods.partition()`` and ``rpartition()`` which behave the same as the standard ``str`` methods (:issue:`9773`)
- Allow clip, clip_lower, and clip_upper to accept array-like arguments as thresholds (:issue:`6966`). These methods now have an ``axis`` parameter which determines how the Series or DataFrame will be aligned with the threshold(s).

The ``.str`` accessor is now available for both ``Series`` and ``Index``.
Expand Down
88 changes: 88 additions & 0 deletions pandas/core/strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -992,6 +992,8 @@ def __iter__(self):
g = self.get(i)

def _wrap_result(self, result):
# leave as it is to keep extract and get_dummies results
# can be merged to _wrap_result_expand in v0.17
from pandas.core.series import Series
from pandas.core.frame import DataFrame
from pandas.core.index import Index
Expand All @@ -1012,6 +1014,33 @@ def _wrap_result(self, result):
assert result.ndim < 3
return DataFrame(result, index=self.series.index)

def _wrap_result_expand(self, result, expand=False):
    """
    Wrap a raw string-method result in a container matching ``self.series``.

    Parameters
    ----------
    result : object
        Value produced by a string method. Anything without an ``ndim``
        attribute is returned unchanged.
    expand : bool, default False
        If True, each element of ``result`` is spread out: into one row of
        the expanded-dimension container when ``self.series`` is not an
        Index, or into an Index built from the list of elements when it is.

    Returns
    -------
    The wrapped result, or ``result`` itself when it has no ``ndim``
    attribute or is a boolean array computed from an Index.
    """
    from pandas.core.index import Index

    # Anything without an ``ndim`` attribute is handed back untouched.
    if not hasattr(result, 'ndim'):
        return result

    source = self.series
    result_name = getattr(result, 'name', None)

    if not isinstance(source, Index):
        row_index = source.index
        if not expand:
            # Same-dimension wrapping: keep the caller's index and
            # whatever name the raw result carries.
            return source._constructor(result, name=result_name,
                                       index=row_index)

        # One row object per element, then assembled by the
        # expanded-dimension constructor.
        make_row = source._constructor
        make_expanded = source._constructor_expanddim
        rows = [make_row(item) for item in result]
        return make_expanded(rows, index=row_index)

    # if result is a boolean np.array, return the np.array
    # instead of wrapping it into a boolean Index (GH 8875)
    if hasattr(result, 'dtype') and is_bool_dtype(result):
        return result

    # With expand=True the elements are passed as a plain list.
    # NOTE(review): the tests for this method expect a MultiIndex in that
    # case - confirm that is the reason for the list conversion.
    return Index(list(result) if expand else result, name=result_name)

@copy(str_cat)
def cat(self, others=None, sep=None, na_rep=None):
result = str_cat(self.series, others=others, sep=sep, na_rep=na_rep)
Expand All @@ -1022,6 +1051,65 @@ def split(self, pat=None, n=-1, return_type='series'):
result = str_split(self.series, pat, n=n, return_type=return_type)
return self._wrap_result(result)

_shared_docs['str_partition'] = ("""
Split the string at the %(side)s occurrence of `pat`, and return 3 elements
containing the part before the separator, the separator itself,
and the part after the separator.
If the separator is not found, return %(return)s.

Parameters
----------
pat : string, default ' ' (a single space)
String to split on.
expand : bool, default True
* If True, return DataFrame/MultiIndex expanding dimensionality.
* If False, return Series/Index

Returns
-------
split : DataFrame/MultiIndex or Series/Index of objects

See Also
--------
%(also)s

Examples
--------

>>> s = Series(['A_B_C', 'D_E_F', 'X'])
>>> s
0 A_B_C
1 D_E_F
2 X
dtype: object

>>> s.str.partition('_')
0 1 2
0 A _ B_C
1 D _ E_F
2 X
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we want empty strings here, or NaNs? (which is more consistent with split)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

no matter, that is of course the behaviour of the standard library partition

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, it returns empty strings, following the standard library's partition.


>>> s.str.rpartition('_')
0 1 2
0 A_B _ C
1 D_E _ F
2 X
""")
@Appender(_shared_docs['str_partition'] % {'side': 'first',
'return': '3 elements containing the string itself, followed by two empty strings',
'also': 'rpartition : Split the string at the last occurrence of `pat`'})
def partition(self, pat=' ', expand=True):
    # The user-facing documentation comes from the shared 'str_partition'
    # template passed to Appender above, so no docstring is written here.
    # The "See Also" text names the separator `pat` to match this
    # method's actual parameter name.
    #
    # pat    : separator handed to each element's own ``partition`` method.
    # expand : forwarded to ``_wrap_result_expand``, which decides whether
    #          the 3-part results are spread into an expanded container.
    result = _na_map(lambda x: x.partition(pat), self.series)
    return self._wrap_result_expand(result, expand=expand)

@Appender(_shared_docs['str_partition'] % {'side': 'last',
'return': '3 elements containing two empty strings, followed by the string itself',
'also': 'partition : Split the string at the first occurrence of `pat`'})
def rpartition(self, pat=' ', expand=True):
    # The user-facing documentation comes from the shared 'str_partition'
    # template passed to Appender above, so no docstring is written here.
    # The "See Also" text names the separator `pat` to match this
    # method's actual parameter name.
    #
    # pat    : separator handed to each element's own ``rpartition`` method.
    # expand : forwarded to ``_wrap_result_expand``, which decides whether
    #          the 3-part results are spread into an expanded container.
    result = _na_map(lambda x: x.rpartition(pat), self.series)
    return self._wrap_result_expand(result, expand=expand)

@copy(str_get)
def get(self, i):
result = str_get(self.series, i)
Expand Down
121 changes: 121 additions & 0 deletions pandas/tests/test_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -664,6 +664,8 @@ def test_empty_str_methods(self):
tm.assert_series_equal(empty_str, empty.str.pad(42))
tm.assert_series_equal(empty_str, empty.str.center(42))
tm.assert_series_equal(empty_list, empty.str.split('a'))
tm.assert_series_equal(empty_list, empty.str.partition('a', expand=False))
tm.assert_series_equal(empty_list, empty.str.rpartition('a', expand=False))
tm.assert_series_equal(empty_str, empty.str.slice(stop=1))
tm.assert_series_equal(empty_str, empty.str.slice(step=1))
tm.assert_series_equal(empty_str, empty.str.strip())
Expand All @@ -687,6 +689,12 @@ def test_empty_str_methods(self):
tm.assert_series_equal(empty_str, empty.str.swapcase())
tm.assert_series_equal(empty_str, empty.str.normalize('NFC'))

def test_empty_str_methods_to_frame(self):
    # partition/rpartition are called with their default ``expand=True``;
    # on an empty Series the result is expected to equal an empty
    # DataFrame.
    empty = Series(dtype=str)
    empty_df = DataFrame([])
    tm.assert_frame_equal(empty_df, empty.str.partition('a'))
    tm.assert_frame_equal(empty_df, empty.str.rpartition('a'))

def test_ismethods(self):
values = ['A', 'b', 'Xy', '4', '3A', '', 'TT', '55', '-', ' ']
str_s = Series(values)
Expand Down Expand Up @@ -1175,6 +1183,119 @@ def test_split_to_dataframe(self):
with tm.assertRaisesRegexp(ValueError, "return_type must be"):
s.str.split('_', return_type="some_invalid_type")

def test_partition_series(self):
    # Each case is (input data, positional separator args,
    # expected partition rows, expected rpartition rows).
    # All calls use expand=False, so the result is a Series whose
    # elements are the 3-part splits; NA entries are expected to stay NA.
    cases = [
        # single-character separator
        (['a_b_c', 'c_d_e', NA, 'f_g_h'], ('_',),
         [['a', '_', 'b_c'], ['c', '_', 'd_e'], NA, ['f', '_', 'g_h']],
         [['a_b', '_', 'c'], ['c_d', '_', 'e'], NA, ['f_g', '_', 'h']]),
        # more than one char
        (['a__b__c', 'c__d__e', NA, 'f__g__h'], ('__',),
         [['a', '__', 'b__c'], ['c', '__', 'd__e'], NA,
          ['f', '__', 'g__h']],
         [['a__b', '__', 'c'], ['c__d', '__', 'e'], NA,
          ['f__g', '__', 'h']]),
        # separator omitted: the default pat (a single space) is used
        (['a b c', 'c d e', NA, 'f g h'], (),
         [['a', ' ', 'b c'], ['c', ' ', 'd e'], NA, ['f', ' ', 'g h']],
         [['a b', ' ', 'c'], ['c d', ' ', 'e'], NA, ['f g', ' ', 'h']]),
        # separator not present, so nothing is split
        (['abc', 'cde', NA, 'fgh'], ('_',),
         [['abc', '', ''], ['cde', '', ''], NA, ['fgh', '', '']],
         [['', '', 'abc'], ['', '', 'cde'], NA, ['', '', 'fgh']]),
        # unicode
        ([u('a_b_c'), u('c_d_e'), NA, u('f_g_h')], ('_',),
         [[u('a'), u('_'), u('b_c')], [u('c'), u('_'), u('d_e')],
          NA, [u('f'), u('_'), u('g_h')]],
         [[u('a_b'), u('_'), u('c')], [u('c_d'), u('_'), u('e')],
          NA, [u('f_g'), u('_'), u('h')]]),
    ]

    for data, sep_args, exp_part, exp_rpart in cases:
        values = Series(data)

        result = values.str.partition(*sep_args, expand=False)
        tm.assert_series_equal(result, Series(exp_part))

        result = values.str.rpartition(*sep_args, expand=False)
        tm.assert_series_equal(result, Series(exp_rpart))

    # compare to standard lib
    values = Series(['A_B_C', 'B_C_D', 'E_F_G', 'EFGHEF'])
    for method in ('partition', 'rpartition'):
        result = getattr(values.str, method)('_', expand=False).tolist()
        self.assertEqual(result, [getattr(v, method)('_') for v in values])

def test_partition_index(self):
    idx = Index(['a_b_c', 'c_d_e', 'f_g_h'])

    # Expected 3-part tuples per element, keyed by accessor method name.
    expected_rows = (
        ('partition',
         [('a', '_', 'b_c'), ('c', '_', 'd_e'), ('f', '_', 'g_h')]),
        ('rpartition',
         [('a_b', '_', 'c'), ('c_d', '_', 'e'), ('f_g', '_', 'h')]),
    )

    # expand=False: the result must stay a single-level Index.
    for method, rows in expected_rows:
        result = getattr(idx.str, method)('_', expand=False)
        tm.assert_index_equal(result, Index(np.array(rows)))
        self.assertEqual(result.nlevels, 1)

    # Default expand: the result must be a three-level MultiIndex.
    for method, rows in expected_rows:
        result = getattr(idx.str, method)('_')
        tm.assert_index_equal(result, Index(rows))
        self.assertTrue(isinstance(result, MultiIndex))
        self.assertEqual(result.nlevels, 3)

def test_partition_to_dataframe(self):
    # Run the same checks twice: first relying on the default ``expand``,
    # then passing ``expand=True`` explicitly. Both must yield a
    # three-column DataFrame in which the NA input row is all NaN.
    for expand_kwargs in ({}, {'expand': True}):
        values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h'])

        result = values.str.partition('_', **expand_kwargs)
        expected = DataFrame({0: ['a', 'c', np.nan, 'f'],
                              1: ['_', '_', np.nan, '_'],
                              2: ['b_c', 'd_e', np.nan, 'g_h']})
        tm.assert_frame_equal(result, expected)

        result = values.str.rpartition('_', **expand_kwargs)
        expected = DataFrame({0: ['a_b', 'c_d', np.nan, 'f_g'],
                              1: ['_', '_', np.nan, '_'],
                              2: ['c', 'e', np.nan, 'h']})
        tm.assert_frame_equal(result, expected)

def test_pipe_failures(self):
# #2119
s = Series(['A|B|C'])
Expand Down