Skip to content

Commit

Permalink
Disable bytes on all methods except encode/decode
Browse files Browse the repository at this point in the history
  • Loading branch information
h-vetinari committed Nov 2, 2018
1 parent e6ec6bc commit 37feec1
Show file tree
Hide file tree
Showing 2 changed files with 78 additions and 7 deletions.
83 changes: 77 additions & 6 deletions pandas/core/strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -1759,14 +1759,23 @@ def wrapper(self):

def _pat_wrapper(f, flags=False, na=False, **kwargs):
def wrapper1(self, pat):
if self._inferred_type in ['bytes']:
raise AttributeError("Cannot use .str.{} with 'bytes' "
"values".format(f.__name__))
result = f(self._parent, pat)
return self._wrap_result(result)

def wrapper2(self, pat, flags=0, **kwargs):
if self._inferred_type in ['bytes']:
raise AttributeError("Cannot use .str.{} with 'bytes' "
"values".format(f.__name__))
result = f(self._parent, pat, flags=flags, **kwargs)
return self._wrap_result(result)

def wrapper3(self, pat, na=np.nan):
if self._inferred_type in ['bytes']:
raise AttributeError("Cannot use .str.{} with 'bytes' "
"values".format(f.__name__))
result = f(self._parent, pat, na=na)
return self._wrap_result(result)

Expand Down Expand Up @@ -1803,7 +1812,7 @@ class StringMethods(NoNewAttributesMixin):
"""

def __init__(self, data):
self._validate(data)
self._inferred_type = self._validate(data)
self._is_categorical = is_categorical_dtype(data)

# .values.categories works for both Series/Index
Expand All @@ -1818,18 +1827,18 @@ def _validate(data):
raise AttributeError('Can only use .str accessor with Index, '
'not MultiIndex')

# see src/inference.pyx which can contain string values
# see _libs/lib.pyx for list of inferred types
allowed_types = ['string', 'unicode', 'empty',
'mixed', 'mixed-integer']
if isinstance(data, ABCSeries):
# needed for str.decode
allowed_types = allowed_types + ['bytes']

data = data.dropna() # missing values mess up type inference
values = getattr(data, 'values', data) # Series / Index
values = getattr(values, 'categories', values) # categorical / normal
inferred_type = lib.infer_dtype(values)
# missing values mess up type inference -> skip
inferred_type = lib.infer_dtype(values, skipna=True)

# same for Series and Index (that is not MultiIndex)
if inferred_type not in allowed_types:
# it's neither a string series/index nor a categorical series/index
# with strings inside the categories.
Expand All @@ -1838,7 +1847,8 @@ def _validate(data):
# have a str dtype (GH 9343 / 13877)
raise AttributeError("Can only use .str accessor with string "
"values (i.e. inferred_type is 'string', "
"'unicode' or 'mixed')")
"'unicode', 'mixed' or 'empty')")
return inferred_type

def __getitem__(self, key):
if isinstance(key, slice):
Expand Down Expand Up @@ -2188,6 +2198,11 @@ def cat(self, others=None, sep=None, na_rep=None, join=None):
"""
from pandas import Index, Series, concat

if self._inferred_type in ['mixed', 'mixed-integer', 'bytes']:
raise AttributeError("Can only use .str.cat with string values "
"(i.e. inferred_type is 'string', 'unicode' "
"'empty')")

if isinstance(others, compat.string_types):
raise ValueError("Did you mean to supply a `sep` keyword?")
if sep is None:
Expand Down Expand Up @@ -2396,13 +2411,17 @@ def cat(self, others=None, sep=None, na_rep=None, join=None):
'side': 'beginning',
'method': 'split'})
def split(self, pat=None, n=-1, expand=False):
    # This method is not offered for bytes data; reject it before
    # doing any work.
    if self._inferred_type == 'bytes':
        msg = "Cannot use .str.split with 'bytes' values"
        raise AttributeError(msg)
    pieces = str_split(self._parent, pat, n=n)
    return self._wrap_result(pieces, expand=expand)

@Appender(_shared_docs['str_split'] % {
'side': 'end',
'method': 'rsplit'})
def rsplit(self, pat=None, n=-1, expand=False):
    # This method is not offered for bytes data; reject it before
    # doing any work.
    if self._inferred_type == 'bytes':
        msg = "Cannot use .str.rsplit with 'bytes' values"
        raise AttributeError(msg)
    pieces = str_rsplit(self._parent, pat, n=n)
    return self._wrap_result(pieces, expand=expand)

Expand Down Expand Up @@ -2493,6 +2512,9 @@ def rsplit(self, pat=None, n=-1, expand=False):
'also': 'rpartition : Split the string at the last occurrence of `sep`'
})
def partition(self, pat=' ', expand=True):
    # Bytes data is rejected up front with an AttributeError.
    if self._inferred_type == 'bytes':
        msg = "Cannot use .str.partition with 'bytes' values"
        raise AttributeError(msg)
    # Apply the element-wise partition around `pat` via _na_map.
    parts = _na_map(lambda x: x.partition(pat), self._parent)
    return self._wrap_result(parts, expand=expand)
Expand All @@ -2504,6 +2526,9 @@ def partition(self, pat=' ', expand=True):
'also': 'partition : Split the string at the first occurrence of `sep`'
})
def rpartition(self, pat=' ', expand=True):
    # Bytes data is rejected up front with an AttributeError.
    if self._inferred_type == 'bytes':
        msg = "Cannot use .str.rpartition with 'bytes' values"
        raise AttributeError(msg)
    # Apply the element-wise rpartition around `pat` via _na_map.
    parts = _na_map(lambda x: x.rpartition(pat), self._parent)
    return self._wrap_result(parts, expand=expand)
Expand All @@ -2515,6 +2540,8 @@ def get(self, i):

@copy(str_join)
def join(self, sep):
    # Bytes data is rejected up front with an AttributeError.
    if self._inferred_type == 'bytes':
        msg = "Cannot use .str.join with 'bytes' values"
        raise AttributeError(msg)
    joined = str_join(self._parent, sep)
    return self._wrap_result(joined)

Expand Down Expand Up @@ -2565,14 +2592,20 @@ def pad(self, width, side='left', fillchar=' '):
@Appender(_shared_docs['str_pad'] % dict(side='left and right',
method='center'))
def center(self, width, fillchar=' '):
    # Bytes data is rejected here, before delegating to pad().
    if self._inferred_type == 'bytes':
        msg = "Cannot use .str.center with 'bytes' values"
        raise AttributeError(msg)
    # Centering is padding on both sides.
    padded = self.pad(width, side='both', fillchar=fillchar)
    return padded

@Appender(_shared_docs['str_pad'] % dict(side='right', method='ljust'))
def ljust(self, width, fillchar=' '):
    # Bytes data is rejected here, before delegating to pad().
    if self._inferred_type == 'bytes':
        msg = "Cannot use .str.ljust with 'bytes' values"
        raise AttributeError(msg)
    # Left-justifying means the fill goes on the right-hand side.
    padded = self.pad(width, side='right', fillchar=fillchar)
    return padded

@Appender(_shared_docs['str_pad'] % dict(side='left', method='rjust'))
def rjust(self, width, fillchar=' '):
    # Bytes data is rejected here, before delegating to pad().
    if self._inferred_type == 'bytes':
        msg = "Cannot use .str.rjust with 'bytes' values"
        raise AttributeError(msg)
    # Right-justifying means the fill goes on the left-hand side.
    padded = self.pad(width, side='left', fillchar=fillchar)
    return padded

def zfill(self, width):
Expand Down Expand Up @@ -2635,21 +2668,29 @@ def zfill(self, width):
4 NaN
dtype: object
"""
if self._inferred_type in ['bytes']:
raise AttributeError("Cannot use .str.zfill with 'bytes' values")
result = str_pad(self._parent, width, side='left', fillchar='0')
return self._wrap_result(result)

@copy(str_slice)
def slice(self, start=None, stop=None, step=None):
    # Bytes data is rejected up front with an AttributeError.
    if self._inferred_type == 'bytes':
        msg = "Cannot use .str.slice with 'bytes' values"
        raise AttributeError(msg)
    sliced = str_slice(self._parent, start, stop, step)
    return self._wrap_result(sliced)

@copy(str_slice_replace)
def slice_replace(self, start=None, stop=None, repl=None):
    # Bytes data is rejected up front with an AttributeError.
    if self._inferred_type == 'bytes':
        msg = "Cannot use .str.slice_replace with 'bytes' values"
        raise AttributeError(msg)
    replaced = str_slice_replace(self._parent, start, stop, repl)
    return self._wrap_result(replaced)

@copy(str_decode)
def decode(self, encoding, errors="strict"):
    # No bytes guard here on purpose: decoding must accept bytes input.
    decoded = str_decode(self._parent, encoding, errors)
    wrapped = self._wrap_result(decoded)
    return wrapped

Expand Down Expand Up @@ -2724,28 +2765,39 @@ def encode(self, encoding, errors="strict"):
@Appender(_shared_docs['str_strip'] % dict(side='left and right sides',
method='strip'))
def strip(self, to_strip=None):
    # Bytes data is rejected up front with an AttributeError.
    if self._inferred_type == 'bytes':
        msg = "Cannot use .str.strip with 'bytes' values"
        raise AttributeError(msg)
    stripped = str_strip(self._parent, to_strip, side='both')
    return self._wrap_result(stripped)

@Appender(_shared_docs['str_strip'] % dict(side='left side',
method='lstrip'))
def lstrip(self, to_strip=None):
    # Bytes data is rejected up front with an AttributeError.
    if self._inferred_type == 'bytes':
        msg = "Cannot use .str.lstrip with 'bytes' values"
        raise AttributeError(msg)
    stripped = str_strip(self._parent, to_strip, side='left')
    return self._wrap_result(stripped)

@Appender(_shared_docs['str_strip'] % dict(side='right side',
method='rstrip'))
def rstrip(self, to_strip=None):
    # Bytes data is rejected up front with an AttributeError.
    if self._inferred_type == 'bytes':
        msg = "Cannot use .str.rstrip with 'bytes' values"
        raise AttributeError(msg)
    stripped = str_strip(self._parent, to_strip, side='right')
    return self._wrap_result(stripped)

@copy(str_wrap)
def wrap(self, width, **kwargs):
    # Bytes data is rejected up front with an AttributeError.
    if self._inferred_type == 'bytes':
        msg = "Cannot use .str.wrap with 'bytes' values"
        raise AttributeError(msg)
    # Extra keyword arguments are forwarded untouched to str_wrap.
    wrapped = str_wrap(self._parent, width, **kwargs)
    return self._wrap_result(wrapped)

@copy(str_get_dummies)
def get_dummies(self, sep='|'):
if self._inferred_type in ['bytes']:
raise AttributeError("Cannot use .str.get_dummies with "
"'bytes' values")
# we need to cast to Series of strings as only that has all
# methods available for making the dummies...
data = self._orig.astype(str) if self._is_categorical else self._parent
Expand All @@ -2755,6 +2807,9 @@ def get_dummies(self, sep='|'):

@copy(str_translate)
def translate(self, table, deletechars=None):
    # Bytes data is rejected up front with an AttributeError.
    if self._inferred_type == 'bytes':
        msg = "Cannot use .str.translate with 'bytes' values"
        raise AttributeError(msg)
    translated = str_translate(self._parent, table, deletechars)
    return self._wrap_result(translated)

Expand All @@ -2765,10 +2820,15 @@ def translate(self, table, deletechars=None):

@copy(str_extract)
def extract(self, pat, flags=0, expand=True):
    # Bytes data is rejected up front with an AttributeError.
    if self._inferred_type == 'bytes':
        msg = "Cannot use .str.extract with 'bytes' values"
        raise AttributeError(msg)
    # str_extract receives the accessor itself, not self._parent, and
    # its return value is handed back without further wrapping.
    extracted = str_extract(self, pat, flags=flags, expand=expand)
    return extracted

@copy(str_extractall)
def extractall(self, pat, flags=0):
    # Bytes data is rejected up front with an AttributeError.
    if self._inferred_type == 'bytes':
        msg = "Cannot use .str.extractall with 'bytes' values"
        raise AttributeError(msg)
    # str_extractall is given self._orig and its return value is
    # handed back without further wrapping.
    matches = str_extractall(self._orig, pat, flags=flags)
    return matches

_shared_docs['find'] = ("""
Expand Down Expand Up @@ -2798,13 +2858,17 @@ def extractall(self, pat, flags=0):
dict(side='lowest', method='find',
also='rfind : Return highest indexes in each strings'))
def find(self, sub, start=0, end=None):
    # Bytes data is rejected up front with an AttributeError.
    if self._inferred_type == 'bytes':
        msg = "Cannot use .str.find with 'bytes' values"
        raise AttributeError(msg)
    positions = str_find(
        self._parent, sub, start=start, end=end, side='left')
    return self._wrap_result(positions)

@Appender(_shared_docs['find'] %
dict(side='highest', method='rfind',
also='find : Return lowest indexes in each strings'))
def rfind(self, sub, start=0, end=None):
    # Bytes data is rejected up front with an AttributeError.
    if self._inferred_type == 'bytes':
        msg = "Cannot use .str.rfind with 'bytes' values"
        raise AttributeError(msg)
    positions = str_find(
        self._parent, sub, start=start, end=end, side='right')
    return self._wrap_result(positions)
Expand All @@ -2824,6 +2888,9 @@ def normalize(self, form):
normalized : Series/Index of objects
"""
import unicodedata
if self._inferred_type in ['bytes']:
raise AttributeError("Cannot use .str.normalize with "
"'bytes' values")
f = lambda x: unicodedata.normalize(form, compat.u_safe(x))
result = _na_map(f, self._parent)
return self._wrap_result(result)
Expand Down Expand Up @@ -2856,6 +2923,8 @@ def normalize(self, form):
dict(side='lowest', similar='find', method='index',
also='rindex : Return highest indexes in each strings'))
def index(self, sub, start=0, end=None):
    # Bytes data is rejected up front with an AttributeError.
    if self._inferred_type == 'bytes':
        msg = "Cannot use .str.index with 'bytes' values"
        raise AttributeError(msg)
    positions = str_index(
        self._parent, sub, start=start, end=end, side='left')
    return self._wrap_result(positions)
Expand All @@ -2864,6 +2933,8 @@ def index(self, sub, start=0, end=None):
dict(side='highest', similar='rfind', method='rindex',
also='index : Return lowest indexes in each strings'))
def rindex(self, sub, start=0, end=None):
    # Bytes data is rejected up front with an AttributeError.
    if self._inferred_type == 'bytes':
        msg = "Cannot use .str.rindex with 'bytes' values"
        raise AttributeError(msg)
    positions = str_index(
        self._parent, sub, start=start, end=end, side='right')
    return self._wrap_result(positions)
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/test_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -3130,7 +3130,7 @@ def test_method_on_bytes(self):
lhs = Series(np.array(list('abc'), 'S1').astype(object))
rhs = Series(np.array(list('def'), 'S1').astype(object))
if compat.PY3:
pytest.raises(TypeError, lhs.str.cat, rhs, sep=',')
pytest.raises(AttributeError, lhs.str.cat, rhs)
else:
result = lhs.str.cat(rhs)
expected = Series(np.array(
Expand Down

0 comments on commit 37feec1

Please sign in to comment.