From 94720d951b4e804bab72abc33dffeb2186ecb310 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 22 Mar 2017 14:48:38 -0400 Subject: [PATCH] API: change default behaviour of str.match from deprecated extract to match (GH5224) This PR changes the default behaviour of `str.match` from extracting groups to just a match (True/False). The previous default behaviour was deprecated since 0.13.0 (https://github.com/pandas-dev/pandas/pull/5224) Author: Joris Van den Bossche Closes #15257 from jorisvandenbossche/str-match and squashes the following commits: 0ab36b6 [Joris Van den Bossche] Raise FutureWarning instead of UserWarning for as_indexer a2bae51 [Joris Van den Bossche] raise error in case of regex with groups and as_indexer=False 87446c3 [Joris Van den Bossche] fix test 0788de2 [Joris Van den Bossche] API: change default behaviour of str.match from deprecated extract to match (GH5224) --- doc/source/text.rst | 12 ------- doc/source/whatsnew/v0.20.0.txt | 7 ++++ pandas/core/strings.py | 59 +++++++++--------------------- pandas/tests/test_strings.py | 63 ++++++++++++--------------------- 4 files changed, 46 insertions(+), 95 deletions(-) diff --git a/doc/source/text.rst b/doc/source/text.rst index 2b2520cb6100f..b110ef2167a03 100644 --- a/doc/source/text.rst +++ b/doc/source/text.rst @@ -385,18 +385,6 @@ or match a pattern: The distinction between ``match`` and ``contains`` is strictness: ``match`` relies on strict ``re.match``, while ``contains`` relies on ``re.search``. -.. warning:: - - In previous versions, ``match`` was for *extracting* groups, - returning a not-so-convenient Series of tuples. The new method ``extract`` - (described in the previous section) is now preferred. - - This old, deprecated behavior of ``match`` is still the default. As - demonstrated above, use the new behavior by setting ``as_indexer=True``. - In this mode, ``match`` is analogous to ``contains``, returning a boolean - Series. The new behavior will become the default behavior in a future - release. - Methods like ``match``, ``contains``, ``startswith``, and ``endswith`` take an extra ``na`` argument so missing values can be considered True or False: diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 6d951af139b42..37a70435ed6ff 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -761,6 +761,12 @@ Other API Changes - ``Series.sort_values()`` accepts a one element list of bool for consistency with the behavior of ``DataFrame.sort_values()`` (:issue:`15604`) - ``.merge()`` and ``.join()`` on ``category`` dtype columns will now preserve the category dtype when possible (:issue:`10409`) - ``SparseDataFrame.default_fill_value`` will be 0, previously was ``nan`` in the return from ``pd.get_dummies(..., sparse=True)`` (:issue:`15594`) +- The default behaviour of ``Series.str.match`` has changed from extracting + groups to matching the pattern. The extracting behaviour was deprecated + since pandas version 0.13.0 and can be done with the ``Series.str.extract`` + method (:issue:`5224`). As a consequence, the ``as_indexer`` keyword is + ignored (no longer needed to specify the new behaviour) and is deprecated. + .. _whatsnew_0200.deprecations: @@ -777,6 +783,7 @@ Deprecations - ``Series.sortlevel`` and ``DataFrame.sortlevel`` have been deprecated in favor of ``Series.sort_index`` and ``DataFrame.sort_index`` (:issue:`15099`) - importing ``concat`` from ``pandas.tools.merge`` has been deprecated in favor of imports from the ``pandas`` namespace. This should only affect explict imports (:issue:`15358`) - ``Series/DataFrame/Panel.consolidate()`` been deprecated as a public method. (:issue:`15483`) +- The ``as_indexer`` keyword of ``Series.str.match()`` has been deprecated (ignored keyword) (:issue:`15257`). - The following top-level pandas functions have been deprecated and will be removed in a future version (:issue:`13790`) * ``pd.pnow()``, replaced by ``Period.now()`` * ``pd.Term``, is removed, as it is not applicable to user code. Instead use in-line string expressions in the where clause when searching in HDFStore diff --git a/pandas/core/strings.py b/pandas/core/strings.py index b5b5d58235eaa..504d3dd47cc21 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -464,11 +464,9 @@ def rep(x, r): return result -def str_match(arr, pat, case=True, flags=0, na=np.nan, as_indexer=False): +def str_match(arr, pat, case=True, flags=0, na=np.nan, as_indexer=None): """ - Deprecated: Find groups in each string in the Series/Index - using passed regular expression. - If as_indexer=True, determine if each string matches a regular expression. + Determine if each string matches a regular expression. Parameters ---------- @@ -479,60 +477,37 @@ def str_match(arr, pat, case=True, flags=0, na=np.nan, as_indexer=False): flags : int, default 0 (no flags) re module flags, e.g. re.IGNORECASE na : default NaN, fill value for missing values. - as_indexer : False, by default, gives deprecated behavior better achieved - using str_extract. True return boolean indexer. + as_indexer : DEPRECATED Returns ------- Series/array of boolean values - if as_indexer=True - Series/Index of tuples - if as_indexer=False, default but deprecated See Also -------- contains : analogous, but less strict, relying on re.search instead of re.match - extract : now preferred to the deprecated usage of match (as_indexer=False) + extract : extract matched groups - Notes - ----- - To extract matched groups, which is the deprecated behavior of match, use - str.extract. """ - if not case: flags |= re.IGNORECASE regex = re.compile(pat, flags=flags) - if (not as_indexer) and regex.groups > 0: - # Do this first, to make sure it happens even if the re.compile - # raises below. - warnings.warn("In future versions of pandas, match will change to" - " always return a bool indexer.", FutureWarning, - stacklevel=3) - - if as_indexer and regex.groups > 0: - warnings.warn("This pattern has match groups. To actually get the" - " groups, use str.extract.", UserWarning, stacklevel=3) + if (as_indexer is False) and (regex.groups > 0): + raise ValueError("as_indexer=False with a pattern with groups is no " + "longer supported. Use '.str.extract(pat)' instead") + elif as_indexer is not None: + # Previously, this keyword was used for changing the default but + # deprecated behaviour. This keyword is now no longer needed. + warnings.warn("'as_indexer' keyword was specified but is ignored " + "(match now returns a boolean indexer by default), " + "and will be removed in a future version.", + FutureWarning, stacklevel=3) - # If not as_indexer and regex.groups == 0, this returns empty lists - # and is basically useless, so we will not warn. - - if (not as_indexer) and regex.groups > 0: - dtype = object - - def f(x): - m = regex.match(x) - if m: - return m.groups() - else: - return [] - else: - # This is the new behavior of str_match. - dtype = bool - f = lambda x: bool(regex.match(x)) + dtype = bool + f = lambda x: bool(regex.match(x)) return _na_map(f, arr, na, dtype=dtype) @@ -1587,7 +1562,7 @@ def contains(self, pat, case=True, flags=0, na=np.nan, regex=True): return self._wrap_result(result) @copy(str_match) - def match(self, pat, case=True, flags=0, na=np.nan, as_indexer=False): + def match(self, pat, case=True, flags=0, na=np.nan, as_indexer=None): result = str_match(self._data, pat, case=case, flags=flags, na=na, as_indexer=as_indexer) return self._wrap_result(result) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index f8ce0070b2c78..7a68ec8f368ae 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -559,64 +559,44 @@ def test_repeat(self): exp = Series([u('a'), u('bb'), NA, u('cccc'), NA, u('dddddd')]) tm.assert_series_equal(result, exp) - def test_deprecated_match(self): - # Old match behavior, deprecated (but still default) in 0.13 + def test_match(self): + # New match behavior introduced in 0.13 values = Series(['fooBAD__barBAD', NA, 'foo']) - - with tm.assert_produces_warning(): - result = values.str.match('.*(BAD[_]+).*(BAD)') - exp = Series([('BAD__', 'BAD'), NA, []]) - tm.assert_series_equal(result, exp) - - # mixed - mixed = Series(['aBAD_BAD', NA, 'BAD_b_BAD', True, datetime.today(), - 'foo', None, 1, 2.]) - - with tm.assert_produces_warning(): - rs = Series(mixed).str.match('.*(BAD[_]+).*(BAD)') - xp = Series([('BAD_', 'BAD'), NA, ('BAD_', 'BAD'), - NA, NA, [], NA, NA, NA]) - tm.assertIsInstance(rs, Series) - tm.assert_series_equal(rs, xp) - - # unicode - values = Series([u('fooBAD__barBAD'), NA, u('foo')]) - - with tm.assert_produces_warning(): - result = values.str.match('.*(BAD[_]+).*(BAD)') - exp = Series([(u('BAD__'), u('BAD')), NA, []]) + result = values.str.match('.*(BAD[_]+).*(BAD)') + exp = Series([True, NA, False]) tm.assert_series_equal(result, exp) - def test_match(self): - # New match behavior introduced in 0.13 values = Series(['fooBAD__barBAD', NA, 'foo']) - with tm.assert_produces_warning(): - result = values.str.match('.*(BAD[_]+).*(BAD)', as_indexer=True) + result = values.str.match('.*BAD[_]+.*BAD') exp = Series([True, NA, False]) tm.assert_series_equal(result, exp) - # If no groups, use new behavior even when as_indexer is False. - # (Old behavior is pretty much useless in this case.) + # test passing as_indexer still works but is ignored values = Series(['fooBAD__barBAD', NA, 'foo']) - result = values.str.match('.*BAD[_]+.*BAD', as_indexer=False) exp = Series([True, NA, False]) + with tm.assert_produces_warning(FutureWarning): + result = values.str.match('.*BAD[_]+.*BAD', as_indexer=True) + tm.assert_series_equal(result, exp) + with tm.assert_produces_warning(FutureWarning): + result = values.str.match('.*BAD[_]+.*BAD', as_indexer=False) tm.assert_series_equal(result, exp) + with tm.assert_produces_warning(FutureWarning): + result = values.str.match('.*(BAD[_]+).*(BAD)', as_indexer=True) + tm.assert_series_equal(result, exp) + self.assertRaises(ValueError, values.str.match, '.*(BAD[_]+).*(BAD)', + as_indexer=False) # mixed mixed = Series(['aBAD_BAD', NA, 'BAD_b_BAD', True, datetime.today(), 'foo', None, 1, 2.]) - - with tm.assert_produces_warning(): - rs = Series(mixed).str.match('.*(BAD[_]+).*(BAD)', as_indexer=True) + rs = Series(mixed).str.match('.*(BAD[_]+).*(BAD)') xp = Series([True, NA, True, NA, NA, False, NA, NA, NA]) tm.assertIsInstance(rs, Series) tm.assert_series_equal(rs, xp) # unicode values = Series([u('fooBAD__barBAD'), NA, u('foo')]) - - with tm.assert_produces_warning(): - result = values.str.match('.*(BAD[_]+).*(BAD)', as_indexer=True) + result = values.str.match('.*(BAD[_]+).*(BAD)') exp = Series([True, NA, False]) tm.assert_series_equal(result, exp) @@ -2610,10 +2590,11 @@ def test_match_findall_flags(self): pat = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})' - with tm.assert_produces_warning(FutureWarning): - result = data.str.match(pat, flags=re.IGNORECASE) + result = data.str.extract(pat, flags=re.IGNORECASE, expand=True) + self.assertEqual(result.iloc[0].tolist(), ['dave', 'google', 'com']) - self.assertEqual(result[0], ('dave', 'google', 'com')) + result = data.str.match(pat, flags=re.IGNORECASE) + self.assertEqual(result[0], True) result = data.str.findall(pat, flags=re.IGNORECASE) self.assertEqual(result[0][0], ('dave', 'google', 'com'))