From 9a42cbe85461c28417a5130bc80b035044c5575a Mon Sep 17 00:00:00 2001 From: h-vetinari <33685575+h-vetinari@users.noreply.github.com> Date: Sat, 1 Jun 2019 17:03:06 +0200 Subject: [PATCH] API: Series.str-accessor infers dtype (and Index.str does not raise on all-NA) (#23167) --- doc/source/user_guide/text.rst | 10 ++ doc/source/whatsnew/v0.25.0.rst | 40 +++++- pandas/core/strings.py | 214 +++++++++++++++++++++++++------- pandas/tests/test_strings.py | 48 +++---- 4 files changed, 233 insertions(+), 79 deletions(-) diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index f7fdfcf8bf882..87c75e8bcd91f 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -70,6 +70,16 @@ and replacing any remaining whitespaces with underscores: ``.str`` methods which operate on elements of type ``list`` are not available on such a ``Series``. +.. _text.warn_types: + +.. warning:: + + Before v.0.25.0, the ``.str``-accessor did only the most rudimentary type checks. Starting with + v.0.25.0, the type of the Series is inferred and the allowed types (i.e. strings) are enforced more rigorously. + + Generally speaking, the ``.str`` accessor is intended to work only on strings. With very few + exceptions, other uses are not supported, and may be disabled at a later point. + Splitting and Replacing Strings ------------------------------- diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 3275223b159f8..87a8010998bd0 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -231,6 +231,43 @@ returned if all the columns were dummy encoded, and a :class:`DataFrame` otherwi Providing any ``SparseSeries`` or ``SparseDataFrame`` to :func:`concat` will cause a ``SparseSeries`` or ``SparseDataFrame`` to be returned, as before. 
+The ``.str``-accessor performs stricter type checks +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Due to the lack of more fine-grained dtypes, :attr:`Series.str` so far only checked whether the data was +of ``object`` dtype. :attr:`Series.str` will now infer the dtype of the data *within* the Series; in particular, +``'bytes'``-only data will raise an exception (except for :meth:`Series.str.decode`, :meth:`Series.str.get`, +:meth:`Series.str.len`, :meth:`Series.str.slice`), see :issue:`23163`, :issue:`23011`, :issue:`23551`. + +*Previous Behaviour*: + +.. code-block:: python + + In [1]: s = pd.Series(np.array(['a', 'ba', 'cba'], 'S'), dtype=object) + + In [2]: s + Out[2]: + 0 b'a' + 1 b'ba' + 2 b'cba' + dtype: object + + In [3]: s.str.startswith(b'a') + Out[3]: + 0 True + 1 False + 2 False + dtype: bool + +*New Behaviour*: + +.. ipython:: python + :okexcept: + + s = pd.Series(np.array(['a', 'ba', 'cba'], 'S'), dtype=object) + s + s.str.startswith(b'a') + .. _whatsnew_0250.api_breaking.incompatible_index_unions Incompatible Index Type Unions @@ -331,7 +368,6 @@ This change is backward compatible for direct usage of Pandas, but if you subcla Pandas objects *and* give your subclasses specific ``__str__``/``__repr__`` methods, you may have to adjust your ``__str__``/``__repr__`` methods (:issue:`26495`). - .. 
_whatsnew_0250.api_breaking.deps: Increased minimum versions for dependencies @@ -537,7 +573,7 @@ Conversion Strings ^^^^^^^ -- +- Bug in the ``__name__`` attribute of several methods of :class:`Series.str`, which were set incorrectly (:issue:`23551`) - - diff --git a/pandas/core/strings.py b/pandas/core/strings.py index ee3796241690d..bd756491abd2f 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1,4 +1,5 @@ import codecs +from functools import wraps import re import textwrap from typing import Dict @@ -12,8 +13,8 @@ from pandas.core.dtypes.common import ( ensure_object, is_bool_dtype, is_categorical_dtype, is_integer, - is_list_like, is_object_dtype, is_re, is_scalar, is_string_like) -from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries + is_list_like, is_re, is_scalar, is_string_like) +from pandas.core.dtypes.generic import ABCIndexClass, ABCMultiIndex, ABCSeries from pandas.core.dtypes.missing import isna from pandas.core.algorithms import take_1d @@ -1720,12 +1721,78 @@ def str_encode(arr, encoding, errors="strict"): return _na_map(f, arr) -def _noarg_wrapper(f, docstring=None, **kargs): +def forbid_nonstring_types(forbidden, name=None): + """ + Decorator to forbid specific types for a method of StringMethods. + + For calling `.str.{method}` on a Series or Index, it is necessary to first + initialize the :class:`StringMethods` object, and then call the method. + However, different methods allow different input types, and so this can not + be checked during :meth:`StringMethods.__init__`, but must be done on a + per-method basis. This decorator exists to facilitate this process, and + make it explicit which (inferred) types are disallowed by the method. + + :meth:`StringMethods.__init__` allows the *union* of types its different + methods allow (after skipping NaNs; see :meth:`StringMethods._validate`), + namely: ['string', 'empty', 'bytes', 'mixed', 'mixed-integer']. 
+ + The default string types ['string', 'empty'] are allowed for all methods. + For the additional types ['bytes', 'mixed', 'mixed-integer'], each method + then needs to forbid the types it is not intended for. + + Parameters + ---------- + forbidden : list-of-str or None + List of forbidden non-string types, may be one or more of + `['bytes', 'mixed', 'mixed-integer']`. + name : str, default None + Name of the method to use in the error message. By default, this is + None, in which case the name from the method being wrapped will be + copied. However, for working with further wrappers (like _pat_wrapper + and _noarg_wrapper), it is necessary to specify the name. + + Returns + ------- + func : wrapper + The method to which the decorator is applied, with an added check that + enforces the inferred type to not be in the list of forbidden types. + + Raises + ------ + TypeError + If the inferred type of the underlying data is in `forbidden`. + """ + + # deal with None + forbidden = [] if forbidden is None else forbidden + + allowed_types = {'string', 'empty', 'bytes', + 'mixed', 'mixed-integer'} - set(forbidden) + + def _forbid_nonstring_types(func): + func_name = func.__name__ if name is None else name + + @wraps(func) + def wrapper(self, *args, **kwargs): + if self._inferred_dtype not in allowed_types: + msg = ('Cannot use .str.{name} with values of inferred dtype ' + '{inf_type!r}.'.format(name=func_name, + inf_type=self._inferred_dtype)) + raise TypeError(msg) + return func(self, *args, **kwargs) + wrapper.__name__ = func_name + return wrapper + return _forbid_nonstring_types + + +def _noarg_wrapper(f, name=None, docstring=None, forbidden_types=['bytes'], + **kargs): + @forbid_nonstring_types(forbidden_types, name=name) def wrapper(self): result = _na_map(f, self._parent, **kargs) return self._wrap_result(result) - wrapper.__name__ = f.__name__ + wrapper.__name__ = f.__name__ if name is None else name if docstring is not None: wrapper.__doc__ = docstring else: @@ 
-1734,22 +1801,26 @@ def wrapper(self): return wrapper -def _pat_wrapper(f, flags=False, na=False, **kwargs): +def _pat_wrapper(f, flags=False, na=False, name=None, + forbidden_types=['bytes'], **kwargs): + @forbid_nonstring_types(forbidden_types, name=name) def wrapper1(self, pat): result = f(self._parent, pat) return self._wrap_result(result) + @forbid_nonstring_types(forbidden_types, name=name) def wrapper2(self, pat, flags=0, **kwargs): result = f(self._parent, pat, flags=flags, **kwargs) return self._wrap_result(result) + @forbid_nonstring_types(forbidden_types, name=name) def wrapper3(self, pat, na=np.nan): result = f(self._parent, pat, na=na) return self._wrap_result(result) wrapper = wrapper3 if na else wrapper2 if flags else wrapper1 - wrapper.__name__ = f.__name__ + wrapper.__name__ = f.__name__ if name is None else name if f.__doc__: wrapper.__doc__ = f.__doc__ @@ -1780,7 +1851,7 @@ class StringMethods(NoNewAttributesMixin): """ def __init__(self, data): - self._validate(data) + self._inferred_dtype = self._validate(data) self._is_categorical = is_categorical_dtype(data) # .values.categories works for both Series/Index @@ -1791,38 +1862,44 @@ def __init__(self, data): @staticmethod def _validate(data): - from pandas.core.index import Index - - if (isinstance(data, ABCSeries) and - not ((is_categorical_dtype(data.dtype) and - is_object_dtype(data.values.categories)) or - (is_object_dtype(data.dtype)))): - # it's neither a string series not a categorical series with - # strings inside the categories. - # this really should exclude all series with any non-string values - # (instead of test for object dtype), but that isn't practical for - # performance reasons until we have a str dtype (GH 9343) + """ + Auxiliary function for StringMethods, infers and checks dtype of data. 
+ + This is a "first line of defence" at the creation of the StringMethods- + object (see _make_accessor), and just checks that the dtype is in the + *union* of the allowed types over all string methods below; this + restriction is then refined on a per-method basis using the decorator + @forbid_nonstring_types (more info in the corresponding docstring). + + This really should exclude all series/index with any non-string values, + but that isn't practical for performance reasons until we have a str + dtype (GH 9343 / 13877) + + Parameters + ---------- + data : The content of the Series + + Returns + ------- + dtype : inferred dtype of data + """ + if isinstance(data, ABCMultiIndex): + raise AttributeError('Can only use .str accessor with Index, ' + 'not MultiIndex') + + # see _libs/lib.pyx for list of inferred types + allowed_types = ['string', 'empty', 'bytes', 'mixed', 'mixed-integer'] + + values = getattr(data, 'values', data) # Series / Index + values = getattr(values, 'categories', values) # categorical / normal + + # missing values obfuscate type inference -> skip + inferred_dtype = lib.infer_dtype(values, skipna=True) + + if inferred_dtype not in allowed_types: raise AttributeError("Can only use .str accessor with string " - "values, which use np.object_ dtype in " - "pandas") - elif isinstance(data, Index): - # can't use ABCIndex to exclude non-str - - # see src/inference.pyx which can contain string values - allowed_types = ('string', 'unicode', 'mixed', 'mixed-integer') - if is_categorical_dtype(data.dtype): - inf_type = data.categories.inferred_type - else: - inf_type = data.inferred_type - if inf_type not in allowed_types: - message = ("Can only use .str accessor with string values " - "(i.e. 
inferred_type is 'string', 'unicode' or " - "'mixed')") - raise AttributeError(message) - if data.nlevels > 1: - message = ("Can only use .str accessor with Index, not " - "MultiIndex") - raise AttributeError(message) + "values!") + return inferred_dtype def __getitem__(self, key): if isinstance(key, slice): @@ -2025,12 +2102,13 @@ def _get_series_list(self, others, ignore_index=False): warnings.warn('list-likes other than Series, Index, or ' 'np.ndarray WITHIN another list-like are ' 'deprecated and will be removed in a future ' - 'version.', FutureWarning, stacklevel=3) + 'version.', FutureWarning, stacklevel=4) return (los, join_warn) elif all(not is_list_like(x) for x in others): return ([Series(others, index=idx)], False) raise TypeError(err_msg) + @forbid_nonstring_types(['bytes', 'mixed', 'mixed-integer']) def cat(self, others=None, sep=None, na_rep=None, join=None): """ Concatenate strings in the Series/Index with given separator. @@ -2211,7 +2289,7 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): "Index/DataFrame in `others`. To enable alignment " "and silence this warning, pass `join='left'|" "'outer'|'inner'|'right'`. 
The future default will " - "be `join='left'`.", FutureWarning, stacklevel=2) + "be `join='left'`.", FutureWarning, stacklevel=3) # if join is None, _get_series_list already force-aligned indexes join = 'left' if join is None else join @@ -2384,6 +2462,7 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): @Appender(_shared_docs['str_split'] % { 'side': 'beginning', 'method': 'split'}) + @forbid_nonstring_types(['bytes']) def split(self, pat=None, n=-1, expand=False): result = str_split(self._parent, pat, n=n) return self._wrap_result(result, expand=expand) @@ -2391,6 +2470,7 @@ def split(self, pat=None, n=-1, expand=False): @Appender(_shared_docs['str_split'] % { 'side': 'end', 'method': 'rsplit'}) + @forbid_nonstring_types(['bytes']) def rsplit(self, pat=None, n=-1, expand=False): result = str_rsplit(self._parent, pat, n=n) return self._wrap_result(result, expand=expand) @@ -2485,6 +2565,7 @@ def rsplit(self, pat=None, n=-1, expand=False): '`sep`.' }) @deprecate_kwarg(old_arg_name='pat', new_arg_name='sep') + @forbid_nonstring_types(['bytes']) def partition(self, sep=' ', expand=True): f = lambda x: x.partition(sep) result = _na_map(f, self._parent) @@ -2498,6 +2579,7 @@ def partition(self, sep=' ', expand=True): '`sep`.' 
}) @deprecate_kwarg(old_arg_name='pat', new_arg_name='sep') + @forbid_nonstring_types(['bytes']) def rpartition(self, sep=' ', expand=True): f = lambda x: x.rpartition(sep) result = _na_map(f, self._parent) @@ -2509,33 +2591,39 @@ def get(self, i): return self._wrap_result(result) @copy(str_join) + @forbid_nonstring_types(['bytes']) def join(self, sep): result = str_join(self._parent, sep) return self._wrap_result(result) @copy(str_contains) + @forbid_nonstring_types(['bytes']) def contains(self, pat, case=True, flags=0, na=np.nan, regex=True): result = str_contains(self._parent, pat, case=case, flags=flags, na=na, regex=regex) return self._wrap_result(result, fill_value=na) @copy(str_match) + @forbid_nonstring_types(['bytes']) def match(self, pat, case=True, flags=0, na=np.nan): result = str_match(self._parent, pat, case=case, flags=flags, na=na) return self._wrap_result(result, fill_value=na) @copy(str_replace) + @forbid_nonstring_types(['bytes']) def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): result = str_replace(self._parent, pat, repl, n=n, case=case, flags=flags, regex=regex) return self._wrap_result(result) @copy(str_repeat) + @forbid_nonstring_types(['bytes']) def repeat(self, repeats): result = str_repeat(self._parent, repeats) return self._wrap_result(result) @copy(str_pad) + @forbid_nonstring_types(['bytes']) def pad(self, width, side='left', fillchar=' '): result = str_pad(self._parent, width, side=side, fillchar=fillchar) return self._wrap_result(result) @@ -2559,17 +2647,21 @@ def pad(self, width, side='left', fillchar=' '): @Appender(_shared_docs['str_pad'] % dict(side='left and right', method='center')) + @forbid_nonstring_types(['bytes']) def center(self, width, fillchar=' '): return self.pad(width, side='both', fillchar=fillchar) @Appender(_shared_docs['str_pad'] % dict(side='right', method='ljust')) + @forbid_nonstring_types(['bytes']) def ljust(self, width, fillchar=' '): return self.pad(width, side='right', 
fillchar=fillchar) @Appender(_shared_docs['str_pad'] % dict(side='left', method='rjust')) + @forbid_nonstring_types(['bytes']) def rjust(self, width, fillchar=' '): return self.pad(width, side='left', fillchar=fillchar) + @forbid_nonstring_types(['bytes']) def zfill(self, width): """ Pad strings in the Series/Index by prepending '0' characters. @@ -2639,16 +2731,19 @@ def slice(self, start=None, stop=None, step=None): return self._wrap_result(result) @copy(str_slice_replace) + @forbid_nonstring_types(['bytes']) def slice_replace(self, start=None, stop=None, repl=None): result = str_slice_replace(self._parent, start, stop, repl) return self._wrap_result(result) @copy(str_decode) def decode(self, encoding, errors="strict"): + # need to allow bytes here result = str_decode(self._parent, encoding, errors) return self._wrap_result(result) @copy(str_encode) + @forbid_nonstring_types(['bytes']) def encode(self, encoding, errors="strict"): result = str_encode(self._parent, encoding, errors) return self._wrap_result(result) @@ -2718,28 +2813,33 @@ def encode(self, encoding, errors="strict"): @Appender(_shared_docs['str_strip'] % dict(side='left and right sides', method='strip')) + @forbid_nonstring_types(['bytes']) def strip(self, to_strip=None): result = str_strip(self._parent, to_strip, side='both') return self._wrap_result(result) @Appender(_shared_docs['str_strip'] % dict(side='left side', method='lstrip')) + @forbid_nonstring_types(['bytes']) def lstrip(self, to_strip=None): result = str_strip(self._parent, to_strip, side='left') return self._wrap_result(result) @Appender(_shared_docs['str_strip'] % dict(side='right side', method='rstrip')) + @forbid_nonstring_types(['bytes']) def rstrip(self, to_strip=None): result = str_strip(self._parent, to_strip, side='right') return self._wrap_result(result) @copy(str_wrap) + @forbid_nonstring_types(['bytes']) def wrap(self, width, **kwargs): result = str_wrap(self._parent, width, **kwargs) return self._wrap_result(result) 
@copy(str_get_dummies) + @forbid_nonstring_types(['bytes']) def get_dummies(self, sep='|'): # we need to cast to Series of strings as only that has all # methods available for making the dummies... @@ -2749,20 +2849,23 @@ def get_dummies(self, sep='|'): name=name, expand=True) @copy(str_translate) + @forbid_nonstring_types(['bytes']) def translate(self, table): result = str_translate(self._parent, table) return self._wrap_result(result) - count = _pat_wrapper(str_count, flags=True) - startswith = _pat_wrapper(str_startswith, na=True) - endswith = _pat_wrapper(str_endswith, na=True) - findall = _pat_wrapper(str_findall, flags=True) + count = _pat_wrapper(str_count, flags=True, name='count') + startswith = _pat_wrapper(str_startswith, na=True, name='startswith') + endswith = _pat_wrapper(str_endswith, na=True, name='endswith') + findall = _pat_wrapper(str_findall, flags=True, name='findall') @copy(str_extract) + @forbid_nonstring_types(['bytes']) def extract(self, pat, flags=0, expand=True): return str_extract(self, pat, flags=flags, expand=expand) @copy(str_extractall) + @forbid_nonstring_types(['bytes']) def extractall(self, pat, flags=0): return str_extractall(self._orig, pat, flags=flags) @@ -2792,6 +2895,7 @@ def extractall(self, pat, flags=0): @Appender(_shared_docs['find'] % dict(side='lowest', method='find', also='rfind : Return highest indexes in each strings.')) + @forbid_nonstring_types(['bytes']) def find(self, sub, start=0, end=None): result = str_find(self._parent, sub, start=start, end=end, side='left') return self._wrap_result(result) @@ -2799,11 +2903,13 @@ def find(self, sub, start=0, end=None): @Appender(_shared_docs['find'] % dict(side='highest', method='rfind', also='find : Return lowest indexes in each strings.')) + @forbid_nonstring_types(['bytes']) def rfind(self, sub, start=0, end=None): result = str_find(self._parent, sub, start=start, end=end, side='right') return self._wrap_result(result) + @forbid_nonstring_types(['bytes']) def 
normalize(self, form): """ Return the Unicode normal form for the strings in the Series/Index. @@ -2851,6 +2957,7 @@ def normalize(self, form): @Appender(_shared_docs['index'] % dict(side='lowest', similar='find', method='index', also='rindex : Return highest indexes in each strings.')) + @forbid_nonstring_types(['bytes']) def index(self, sub, start=0, end=None): result = str_index(self._parent, sub, start=start, end=end, side='left') @@ -2859,6 +2966,7 @@ def index(self, sub, start=0, end=None): @Appender(_shared_docs['index'] % dict(side='highest', similar='rfind', method='rindex', also='index : Return lowest indexes in each strings.')) + @forbid_nonstring_types(['bytes']) def rindex(self, sub, start=0, end=None): result = str_index(self._parent, sub, start=start, end=end, side='right') @@ -2908,7 +3016,8 @@ def rindex(self, sub, start=0, end=None): 5 3.0 dtype: float64 """) - len = _noarg_wrapper(len, docstring=_shared_docs['len'], dtype=int) + len = _noarg_wrapper(len, docstring=_shared_docs['len'], + forbidden_types=None, dtype=int) _shared_docs['casemethods'] = (""" Convert strings in the Series/Index to %(type)s. @@ -2989,21 +3098,27 @@ def rindex(self, sub, start=0, end=None): _doc_args['casefold'] = dict(type='be casefolded', method='casefold', version='\n .. 
versionadded:: 0.25.0\n') lower = _noarg_wrapper(lambda x: x.lower(), + name='lower', docstring=_shared_docs['casemethods'] % _doc_args['lower']) upper = _noarg_wrapper(lambda x: x.upper(), + name='upper', docstring=_shared_docs['casemethods'] % _doc_args['upper']) title = _noarg_wrapper(lambda x: x.title(), + name='title', docstring=_shared_docs['casemethods'] % _doc_args['title']) capitalize = _noarg_wrapper(lambda x: x.capitalize(), + name='capitalize', docstring=_shared_docs['casemethods'] % _doc_args['capitalize']) swapcase = _noarg_wrapper(lambda x: x.swapcase(), + name='swapcase', docstring=_shared_docs['casemethods'] % _doc_args['swapcase']) casefold = _noarg_wrapper(lambda x: x.casefold(), + name='casefold', docstring=_shared_docs['casemethods'] % _doc_args['casefold']) @@ -3157,30 +3272,39 @@ def rindex(self, sub, start=0, end=None): _doc_args['isnumeric'] = dict(type='numeric', method='isnumeric') _doc_args['isdecimal'] = dict(type='decimal', method='isdecimal') isalnum = _noarg_wrapper(lambda x: x.isalnum(), + name='isalnum', docstring=_shared_docs['ismethods'] % _doc_args['isalnum']) isalpha = _noarg_wrapper(lambda x: x.isalpha(), + name='isalpha', docstring=_shared_docs['ismethods'] % _doc_args['isalpha']) isdigit = _noarg_wrapper(lambda x: x.isdigit(), + name='isdigit', docstring=_shared_docs['ismethods'] % _doc_args['isdigit']) isspace = _noarg_wrapper(lambda x: x.isspace(), + name='isspace', docstring=_shared_docs['ismethods'] % _doc_args['isspace']) islower = _noarg_wrapper(lambda x: x.islower(), + name='islower', docstring=_shared_docs['ismethods'] % _doc_args['islower']) isupper = _noarg_wrapper(lambda x: x.isupper(), + name='isupper', docstring=_shared_docs['ismethods'] % _doc_args['isupper']) istitle = _noarg_wrapper(lambda x: x.istitle(), + name='istitle', docstring=_shared_docs['ismethods'] % _doc_args['istitle']) isnumeric = _noarg_wrapper(lambda x: x.isnumeric(), + name='isnumeric', docstring=_shared_docs['ismethods'] % 
_doc_args['isnumeric']) isdecimal = _noarg_wrapper(lambda x: x.isdecimal(), + name='isdecimal', docstring=_shared_docs['ismethods'] % _doc_args['isdecimal']) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 2951ca24fa7ff..1ba0ef3918fb7 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -150,6 +150,9 @@ def any_allowed_skipna_inferred_dtype(request): ... inferred_dtype, values = any_allowed_skipna_inferred_dtype ... # will pass ... assert lib.infer_dtype(values, skipna=True) == inferred_dtype + ... + ... # constructor for .str-accessor will also pass + ... pd.Series(values).str """ inferred_dtype, values = request.param values = np.array(values, dtype=object) # object dtype to avoid casting @@ -179,20 +182,6 @@ def test_api_per_dtype(self, box, dtype, any_skipna_inferred_dtype): pytest.xfail(reason='Conversion to numpy array fails because ' 'the ._values-attribute is not a numpy array for ' 'PeriodArray/IntervalArray; see GH 23553') - if box == Index and inferred_dtype in ['empty', 'bytes']: - pytest.xfail(reason='Raising too restrictively; ' - 'solved by GH 23167') - if (box == Index and dtype == object - and inferred_dtype in ['boolean', 'date', 'time']): - pytest.xfail(reason='Inferring incorrectly because of NaNs; ' - 'solved by GH 23167') - if (box == Series - and (dtype == object and inferred_dtype not in [ - 'string', 'unicode', 'empty', - 'bytes', 'mixed', 'mixed-integer']) - or (dtype == 'category' - and inferred_dtype in ['decimal', 'boolean', 'time'])): - pytest.xfail(reason='Not raising correctly; solved by GH 23167') types_passing_constructor = ['string', 'unicode', 'empty', 'bytes', 'mixed', 'mixed-integer'] @@ -220,27 +209,21 @@ def test_api_per_method(self, box, dtype, method_name, args, kwargs = any_string_method # TODO: get rid of these xfails - if (method_name not in ['encode', 'decode', 'len'] - and inferred_dtype == 'bytes'): - pytest.xfail(reason='Not raising for "bytes", see GH 23011;' 
- 'Also: malformed method names, see GH 23551; ' - 'solved by GH 23167') - if (method_name == 'cat' - and inferred_dtype in ['mixed', 'mixed-integer']): - pytest.xfail(reason='Bad error message; should raise better; ' - 'solved by GH 23167') - if box == Index and inferred_dtype in ['empty', 'bytes']: - pytest.xfail(reason='Raising too restrictively; ' - 'solved by GH 23167') - if (box == Index and dtype == object - and inferred_dtype in ['boolean', 'date', 'time']): - pytest.xfail(reason='Inferring incorrectly because of NaNs; ' - 'solved by GH 23167') + if (method_name in ['partition', 'rpartition'] and box == Index + and inferred_dtype == 'empty'): + pytest.xfail(reason='Method cannot deal with empty Index') + if (method_name == 'split' and box == Index and values.size == 0 + and kwargs.get('expand', None) is not None): + pytest.xfail(reason='Split fails on empty Series when expand=True') + if (method_name == 'get_dummies' and box == Index + and inferred_dtype == 'empty' and (dtype == object + or values.size == 0)): + pytest.xfail(reason='Need to fortify get_dummies corner cases') t = box(values, dtype=dtype) # explicit dtype to avoid casting method = getattr(t.str, method_name) - bytes_allowed = method_name in ['encode', 'decode', 'len'] + bytes_allowed = method_name in ['decode', 'get', 'len', 'slice'] # as of v0.23.4, all methods except 'cat' are very lenient with the # allowed data types, just returning NaN for entries that error. # This could be changed with an 'errors'-kwarg to the `str`-accessor, @@ -3167,7 +3150,8 @@ def test_str_accessor_no_new_attributes(self): def test_method_on_bytes(self): lhs = Series(np.array(list('abc'), 'S1').astype(object)) rhs = Series(np.array(list('def'), 'S1').astype(object)) - with pytest.raises(TypeError, match="can't concat str to bytes"): + with pytest.raises(TypeError, + match="Cannot use .str.cat with values of.*"): lhs.str.cat(rhs) def test_casefold(self):