Skip to content

Commit

Permalink
ENH: improve extract and get_dummies methods for Index.str (fix for #9980)
Browse files Browse the repository at this point in the history

simplify str_extract(), pass name into _wrap_result()
  • Loading branch information
mortada authored and jreback committed May 8, 2015
1 parent be2a9f8 commit e686387
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 29 deletions.
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v0.16.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ Enhancements
Timestamp('2014-08-01 16:30') + BusinessHour()

- ``DataFrame.diff`` now takes an ``axis`` parameter that determines the direction of differencing (:issue:`9727`)

- Allow clip, clip_lower, and clip_upper to accept array-like arguments as thresholds (:issue:`6966`). These methods now have an ``axis`` parameter which determines how the Series or DataFrame will be aligned with the threshold(s).

- ``DataFrame.mask()`` and ``Series.mask()`` now support same keywords as ``where`` (:issue:`8801`)
Expand Down Expand Up @@ -216,6 +217,8 @@ enhancements are performed to make string operation easier.
idx.str.startswith('a')
s[s.index.str.startswith('a')]

- Improved ``extract`` and ``get_dummies`` methods for ``Index.str`` (:issue:`9980`)

.. _whatsnew_0161.api:

API changes
Expand Down
35 changes: 24 additions & 11 deletions pandas/core/strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -466,6 +466,7 @@ def str_extract(arr, pat, flags=0):
"""
from pandas.core.series import Series
from pandas.core.frame import DataFrame
from pandas.core.index import Index

regex = re.compile(pat, flags=flags)
# just to be safe, check this
Expand All @@ -481,11 +482,14 @@ def f(x):
return [np.nan if item is None else item for item in m.groups()]
else:
return empty_row

if regex.groups == 1:
result = Series([f(val)[0] for val in arr],
name=_get_single_group_name(regex),
index=arr.index, dtype=object)
result = np.array([f(val)[0] for val in arr], dtype=object)
name = _get_single_group_name(regex)
else:
if isinstance(arr, Index):
raise ValueError("only one regex group is supported with Index")
name = None
names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))
columns = [names.get(1 + i, i) for i in range(regex.groups)]
if arr.empty:
Expand All @@ -495,7 +499,7 @@ def f(x):
columns=columns,
index=arr.index,
dtype=object)
return result
return result, name


def str_get_dummies(arr, sep='|'):
Expand Down Expand Up @@ -531,6 +535,11 @@ def str_get_dummies(arr, sep='|'):
pandas.get_dummies
"""
from pandas.core.frame import DataFrame
from pandas.core.index import Index

# GH9980, Index.str does not support get_dummies() as it returns a frame
if isinstance(arr, Index):
raise TypeError("get_dummies is not supported for string methods on Index")

# TODO remove this hack?
arr = arr.fillna('')
Expand Down Expand Up @@ -991,7 +1000,7 @@ def __iter__(self):
i += 1
g = self.get(i)

def _wrap_result(self, result):
def _wrap_result(self, result, **kwargs):
# leave as it is to keep extract and get_dummies results
# can be merged to _wrap_result_expand in v0.17
from pandas.core.series import Series
Expand All @@ -1000,16 +1009,16 @@ def _wrap_result(self, result):

if not hasattr(result, 'ndim'):
return result
elif result.ndim == 1:
name = getattr(result, 'name', None)
name = kwargs.get('name') or getattr(result, 'name', None) or self.series.name

if result.ndim == 1:
if isinstance(self.series, Index):
# if result is a boolean np.array, return the np.array
# instead of wrapping it into a boolean Index (GH 8875)
if is_bool_dtype(result):
return result
return Index(result, name=name or self.series.name)
return Series(result, index=self.series.index,
name=name or self.series.name)
return Index(result, name=name)
return Series(result, index=self.series.index, name=name)
else:
assert result.ndim < 3
return DataFrame(result, index=self.series.index)
Expand Down Expand Up @@ -1257,7 +1266,11 @@ def get_dummies(self, sep='|'):
startswith = _pat_wrapper(str_startswith, na=True)
endswith = _pat_wrapper(str_endswith, na=True)
findall = _pat_wrapper(str_findall, flags=True)
extract = _pat_wrapper(str_extract, flags=True)

@copy(str_extract)
def extract(self, pat, flags=0):
    # str_extract hands back the extracted values together with the name to
    # attach to them (the group name for a single group, otherwise None).
    values, group_name = str_extract(self.series, pat, flags=flags)
    # Forward the name explicitly so _wrap_result can use it when it builds
    # the Series/Index around the raw values.
    return self._wrap_result(values, name=group_name)

_shared_docs['find'] = ("""
Return %(side)s indexes in each strings in the Series/Index
Expand Down
54 changes: 36 additions & 18 deletions pandas/tests/test_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -516,7 +516,6 @@ def test_match(self):

def test_extract(self):
# Contains tests like those in test_match and some others.

values = Series(['fooBAD__barBAD', NA, 'foo'])
er = [NA, NA] # empty row

Expand All @@ -540,15 +539,31 @@ def test_extract(self):
exp = DataFrame([[u('BAD__'), u('BAD')], er, er])
tm.assert_frame_equal(result, exp)

# no groups
s = Series(['A1', 'B2', 'C3'])
f = lambda: s.str.extract('[ABC][123]')
self.assertRaises(ValueError, f)

# only non-capturing groups
f = lambda: s.str.extract('(?:[AB]).*')
self.assertRaises(ValueError, f)
# GH9980
# Index only works with one regex group since
# multi-group would expand to a frame
idx = Index(['A1', 'A2', 'A3', 'A4', 'B5'])
with tm.assertRaisesRegexp(ValueError, "supported"):
idx.str.extract('([AB])([123])')

# these should work for both Series and Index
for klass in [Series, Index]:
# no groups
s_or_idx = klass(['A1', 'B2', 'C3'])
f = lambda: s_or_idx.str.extract('[ABC][123]')
self.assertRaises(ValueError, f)

# only non-capturing groups
f = lambda: s_or_idx.str.extract('(?:[AB]).*')
self.assertRaises(ValueError, f)

# single group renames series/index properly
s_or_idx = klass(['A1', 'A2'])
result = s_or_idx.str.extract(r'(?P<uno>A)\d')
tm.assert_equal(result.name, 'uno')
tm.assert_array_equal(result, klass(['A', 'A']))

s = Series(['A1', 'B2', 'C3'])
# one group, no matches
result = s.str.extract('(_)')
exp = Series([NA, NA, NA], dtype=object)
Expand All @@ -569,14 +584,16 @@ def test_extract(self):
exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]])
tm.assert_frame_equal(result, exp)

# named group/groups
result = s.str.extract('(?P<letter>[AB])(?P<number>[123])')
exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]], columns=['letter', 'number'])
tm.assert_frame_equal(result, exp)
# one named group
result = s.str.extract('(?P<letter>[AB])')
exp = Series(['A', 'B', NA], name='letter')
tm.assert_series_equal(result, exp)

# two named groups
result = s.str.extract('(?P<letter>[AB])(?P<number>[123])')
exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]], columns=['letter', 'number'])
tm.assert_frame_equal(result, exp)

# mix named and unnamed groups
result = s.str.extract('([AB])(?P<number>[123])')
exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]], columns=[0, 'number'])
Expand All @@ -602,11 +619,6 @@ def test_extract(self):
exp = DataFrame([['A', '1'], ['B', '2'], ['C', NA]], columns=['letter', 'number'])
tm.assert_frame_equal(result, exp)

# single group renames series properly
s = Series(['A1', 'A2'])
result = s.str.extract(r'(?P<uno>A)\d')
tm.assert_equal(result.name, 'uno')

# GH6348
# not passing index to the extractor
def check_index(index):
Expand Down Expand Up @@ -761,6 +773,12 @@ def test_get_dummies(self):
columns=list('7ab'))
tm.assert_frame_equal(result, expected)

# GH9980
# Index.str does not support get_dummies() as it returns a frame
with tm.assertRaisesRegexp(TypeError, "not supported"):
idx = Index(['a|b', 'a|c', 'b|c'])
idx.str.get_dummies('|')

def test_join(self):
values = Series(['a_b_c', 'c_d_e', np.nan, 'f_g_h'])
result = values.str.split('_').str.join('_')
Expand Down

0 comments on commit e686387

Please sign in to comment.