Skip to content

Commit

Permalink
API: change default behaviour of str.match from deprecated extract to…
Browse files Browse the repository at this point in the history
… match (GH5224)
  • Loading branch information
jorisvandenbossche committed Mar 22, 2017
1 parent fb7af6e commit 0788de2
Show file tree
Hide file tree
Showing 4 changed files with 34 additions and 94 deletions.
12 changes: 0 additions & 12 deletions doc/source/text.rst
Original file line number Diff line number Diff line change
Expand Up @@ -385,18 +385,6 @@ or match a pattern:
The distinction between ``match`` and ``contains`` is strictness: ``match``
relies on strict ``re.match``, while ``contains`` relies on ``re.search``.

.. warning::

In previous versions, ``match`` was for *extracting* groups,
returning a not-so-convenient Series of tuples. The new method ``extract``
(described in the previous section) is now preferred.

This old, deprecated behavior of ``match`` is still the default. As
demonstrated above, use the new behavior by setting ``as_indexer=True``.
In this mode, ``match`` is analogous to ``contains``, returning a boolean
Series. The new behavior will become the default behavior in a future
release.

Methods like ``match``, ``contains``, ``startswith``, and ``endswith`` take
an extra ``na`` argument so missing values can be considered True or False:

Expand Down
5 changes: 5 additions & 0 deletions doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -729,6 +729,11 @@ Other API Changes
- ``Series.sort_values()`` accepts a one element list of bool for consistency with the behavior of ``DataFrame.sort_values()`` (:issue:`15604`)
- ``.merge()`` and ``.join()`` on ``category`` dtype columns will now preserve the category dtype when possible (:issue:`10409`)
- ``SparseDataFrame.default_fill_value`` will be 0, previously was ``nan`` in the return from ``pd.get_dummies(..., sparse=True)`` (:issue:`15594`)
- The default behaviour of ``Series.str.match`` has changed from extracting
groups to matching the pattern. The extracting behaviour has been deprecated
since pandas version 0.13.0; use the ``Series.str.extract`` method to extract
groups instead (:issue:`5224`).


.. _whatsnew_0200.deprecations:

Expand Down
53 changes: 12 additions & 41 deletions pandas/core/strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -464,11 +464,9 @@ def rep(x, r):
return result


def str_match(arr, pat, case=True, flags=0, na=np.nan, as_indexer=False):
def str_match(arr, pat, case=True, flags=0, na=np.nan, as_indexer=None):
"""
Deprecated: Find groups in each string in the Series/Index
using passed regular expression.
If as_indexer=True, determine if each string matches a regular expression.
Determine if each string matches a regular expression.
Parameters
----------
Expand All @@ -479,60 +477,33 @@ def str_match(arr, pat, case=True, flags=0, na=np.nan, as_indexer=False):
flags : int, default 0 (no flags)
re module flags, e.g. re.IGNORECASE
na : default NaN, fill value for missing values.
as_indexer : False, by default, gives deprecated behavior better achieved
using str_extract. True return boolean indexer.
as_indexer : ignored
Returns
-------
Series/array of boolean values
if as_indexer=True
Series/Index of tuples
if as_indexer=False, default but deprecated
See Also
--------
contains : analogous, but less strict, relying on re.search instead of
re.match
extract : now preferred to the deprecated usage of match (as_indexer=False)
extract : extract matched groups
Notes
-----
To extract matched groups, which is the deprecated behavior of match, use
str.extract.
"""

if not case:
flags |= re.IGNORECASE

regex = re.compile(pat, flags=flags)

if (not as_indexer) and regex.groups > 0:
# Do this first, to make sure it happens even if the re.compile
# raises below.
warnings.warn("In future versions of pandas, match will change to"
" always return a bool indexer.", FutureWarning,
stacklevel=3)

if as_indexer and regex.groups > 0:
warnings.warn("This pattern has match groups. To actually get the"
" groups, use str.extract.", UserWarning, stacklevel=3)
if as_indexer is not None:
# Previously, this keyword switched between the deprecated (but default)
# group-extracting behaviour and the boolean indexer. It is no longer needed.
warnings.warn("'as_indexer' keyword was specified but will be ignored;"
" match now returns a boolean indexer by default.",
UserWarning, stacklevel=3)

# If not as_indexer and regex.groups == 0, this returns empty lists
# and is basically useless, so we will not warn.

if (not as_indexer) and regex.groups > 0:
dtype = object

def f(x):
m = regex.match(x)
if m:
return m.groups()
else:
return []
else:
# This is the new behavior of str_match.
dtype = bool
f = lambda x: bool(regex.match(x))
dtype = bool
f = lambda x: bool(regex.match(x))

return _na_map(f, arr, na, dtype=dtype)

Expand Down
58 changes: 17 additions & 41 deletions pandas/tests/test_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -559,64 +559,39 @@ def test_repeat(self):
exp = Series([u('a'), u('bb'), NA, u('cccc'), NA, u('dddddd')])
tm.assert_series_equal(result, exp)

def test_deprecated_match(self):
# Old match behavior, deprecated (but still default) in 0.13
def test_match(self):
# New match behavior introduced in 0.13
values = Series(['fooBAD__barBAD', NA, 'foo'])

with tm.assert_produces_warning():
result = values.str.match('.*(BAD[_]+).*(BAD)')
exp = Series([('BAD__', 'BAD'), NA, []])
tm.assert_series_equal(result, exp)

# mixed
mixed = Series(['aBAD_BAD', NA, 'BAD_b_BAD', True, datetime.today(),
'foo', None, 1, 2.])

with tm.assert_produces_warning():
rs = Series(mixed).str.match('.*(BAD[_]+).*(BAD)')
xp = Series([('BAD_', 'BAD'), NA, ('BAD_', 'BAD'),
NA, NA, [], NA, NA, NA])
tm.assertIsInstance(rs, Series)
tm.assert_series_equal(rs, xp)

# unicode
values = Series([u('fooBAD__barBAD'), NA, u('foo')])

with tm.assert_produces_warning():
result = values.str.match('.*(BAD[_]+).*(BAD)')
exp = Series([(u('BAD__'), u('BAD')), NA, []])
result = values.str.match('.*(BAD[_]+).*(BAD)')
exp = Series([True, NA, False])
tm.assert_series_equal(result, exp)

def test_match(self):
# New match behavior introduced in 0.13
values = Series(['fooBAD__barBAD', NA, 'foo'])
with tm.assert_produces_warning():
result = values.str.match('.*(BAD[_]+).*(BAD)', as_indexer=True)
result = values.str.match('.*BAD[_]+.*BAD')
exp = Series([True, NA, False])
tm.assert_series_equal(result, exp)

# If no groups, use new behavior even when as_indexer is False.
# (Old behavior is pretty much useless in this case.)
# test passing as_indexer still works but is ignored
values = Series(['fooBAD__barBAD', NA, 'foo'])
result = values.str.match('.*BAD[_]+.*BAD', as_indexer=False)
exp = Series([True, NA, False])
with tm.assert_produces_warning(UserWarning):
result = values.str.match('.*BAD[_]+.*BAD', as_indexer=True)
tm.assert_series_equal(result, exp)
with tm.assert_produces_warning(UserWarning):
result = values.str.match('.*BAD[_]+.*BAD', as_indexer=False)
tm.assert_series_equal(result, exp)

# mixed
mixed = Series(['aBAD_BAD', NA, 'BAD_b_BAD', True, datetime.today(),
'foo', None, 1, 2.])

with tm.assert_produces_warning():
rs = Series(mixed).str.match('.*(BAD[_]+).*(BAD)', as_indexer=True)
rs = Series(mixed).str.match('.*(BAD[_]+).*(BAD)')
xp = Series([True, NA, True, NA, NA, False, NA, NA, NA])
tm.assertIsInstance(rs, Series)
tm.assert_series_equal(rs, xp)

# unicode
values = Series([u('fooBAD__barBAD'), NA, u('foo')])

with tm.assert_produces_warning():
result = values.str.match('.*(BAD[_]+).*(BAD)', as_indexer=True)
result = values.str.match('.*(BAD[_]+).*(BAD)')
exp = Series([True, NA, False])
tm.assert_series_equal(result, exp)

Expand Down Expand Up @@ -2610,11 +2585,12 @@ def test_match_findall_flags(self):

pat = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'

with tm.assert_produces_warning(FutureWarning):
result = data.str.match(pat, flags=re.IGNORECASE)

result = data.str.extract(pat, flags=re.IGNORECASE)
self.assertEqual(result[0], ('dave', 'google', 'com'))

result = data.str.match(pat, flags=re.IGNORECASE)
self.assertEqual(result[0], True)

result = data.str.findall(pat, flags=re.IGNORECASE)
self.assertEqual(result[0][0], ('dave', 'google', 'com'))

Expand Down

0 comments on commit 0788de2

Please sign in to comment.