Allow errors keyword for HDF IO Encoding Err Handling #20873

Merged · 8 commits · May 1, 2018 · showing changes from 6 commits
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.23.0.txt
@@ -450,6 +450,7 @@ Other Enhancements
- Updated :meth:`DataFrame.to_gbq` and :meth:`pandas.read_gbq` signature and documentation to reflect changes from
the Pandas-GBQ library version 0.4.0. Adds intersphinx mapping to Pandas-GBQ
library. (:issue:`20564`)
- :func:`to_hdf` and :func:`read_hdf` now accept an ``errors`` keyword argument to control encoding error handling (:issue:`20835`)

.. _whatsnew_0230.api_breaking:

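A usage sketch of the new keyword, mirroring the test added in this PR (the file path 'store.h5' and key 'data' are illustrative):

    import pandas as pd

    # '\ud800' is a lone surrogate: the default errors='strict' refuses to
    # encode it as UTF-8, while 'surrogatepass' lets it round-trip intact
    ser = pd.Series(['\ud800foo'])
    ser.to_hdf('store.h5', 'data', format='table', errors='surrogatepass')
    result = pd.read_hdf('store.h5', 'data', errors='surrogatepass')
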
93 changes: 57 additions & 36 deletions pandas/io/pytables.py
@@ -705,7 +705,7 @@ def select(self, key, where=None, start=None, stop=None, columns=None,
def func(_start, _stop, _where):
return s.read(start=_start, stop=_stop,
where=_where,
columns=columns, **kwargs)
Member Author: I removed this **kwargs argument because it was getting mangled when read_index_node was called with arbitrary keyword arguments from read_hdf. I think it was a mistake to include it originally. (A minimal sketch of the failure mode follows this hunk.)

columns=columns)

# create the iterator
it = TableIterator(self, s, func, where=where, nrows=s.nrows,
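A minimal sketch of that failure mode — the simplified signatures below are hypothetical stand-ins for the real ones — showing how blindly forwarded **kwargs blow up once they reach a callee that does not accept them:

    def read_index_node(node, start=None, stop=None):
        # accepts no arbitrary keyword arguments
        return node[start:stop]

    def read(start=None, stop=None, where=None, columns=None, **kwargs):
        # keyword arguments forwarded from read_hdf (e.g. errors='strict')
        # used to end up here and get passed along verbatim
        return read_index_node([1, 2, 3], start=start, stop=stop, **kwargs)

    read(errors='strict')
    # TypeError: read_index_node() got an unexpected keyword argument 'errors'
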
@@ -1566,14 +1566,14 @@ def infer(self, handler):
new_self.read_metadata(handler)
return new_self

def convert(self, values, nan_rep, encoding):
def convert(self, values, nan_rep, encoding, errors):
""" set the values from this selection: take = take ownership """

# values is a recarray
if values.dtype.fields is not None:
values = values[self.cname]

values = _maybe_convert(values, self.kind, encoding)
values = _maybe_convert(values, self.kind, encoding, errors)

kwargs = dict()
if self.freq is not None:
@@ -1748,7 +1748,7 @@ class GenericIndexCol(IndexCol):
def is_indexed(self):
return False

def convert(self, values, nan_rep, encoding):
def convert(self, values, nan_rep, encoding, errors):
""" set the values from this selection: take = take ownership """

self.values = Int64Index(np.arange(self.table.nrows))
@@ -1877,7 +1877,7 @@ def set_kind(self):
self.typ = getattr(self.description, self.cname, None)

def set_atom(self, block, block_items, existing_col, min_itemsize,
nan_rep, info, encoding=None, **kwargs):
nan_rep, info, encoding=None, errors='strict'):
""" create and setup my atom from the block b """

self.values = list(block_items)
@@ -1922,7 +1922,8 @@ def set_atom(self, block, block_items, existing_col, min_itemsize,
existing_col,
min_itemsize,
nan_rep,
encoding)
encoding,
errors)

# set as a data block
else:
@@ -1932,7 +1933,7 @@ def get_atom_string(self, block, itemsize):
return _tables().StringCol(itemsize=itemsize, shape=block.shape[0])

def set_atom_string(self, block, block_items, existing_col, min_itemsize,
nan_rep, encoding):
nan_rep, encoding, errors):
# fill nan items with myself, don't disturb the blocks by
# trying to downcast
block = block.fillna(nan_rep, downcast=False)
@@ -1958,7 +1959,7 @@ def set_atom_string(self, block, block_items, existing_col, min_itemsize,
)

# itemsize is the maximum length of a string (along any dimension)
data_converted = _convert_string_array(data, encoding)
data_converted = _convert_string_array(data, encoding, errors)
itemsize = data_converted.itemsize

# specified min_itemsize?
@@ -2089,7 +2090,7 @@ def validate_attr(self, append):
raise ValueError("appended items dtype do not match existing "
"items dtype in table!")

def convert(self, values, nan_rep, encoding):
def convert(self, values, nan_rep, encoding, errors):
"""set the data from this selection (and convert to the correct dtype
if we can)
"""
@@ -2163,7 +2164,7 @@ def convert(self, values, nan_rep, encoding):
# convert nans / decode
if _ensure_decoded(self.kind) == u('string'):
self.data = _unconvert_string_array(
self.data, nan_rep=nan_rep, encoding=encoding)
self.data, nan_rep=nan_rep, encoding=encoding, errors=errors)

return self

@@ -2229,10 +2230,12 @@ class Fixed(StringMixin):
ndim = None
is_table = False

def __init__(self, parent, group, encoding=None, **kwargs):
def __init__(self, parent, group, encoding=None, errors='strict',
**kwargs):
self.parent = parent
self.group = group
self.encoding = _ensure_encoding(encoding)
self.errors = errors
self.set_version()

@property
@@ -2436,10 +2439,12 @@ def is_exists(self):
def set_attrs(self):
""" set our object attributes """
self.attrs.encoding = self.encoding
self.attrs.errors = self.errors

def get_attrs(self):
""" retrieve our attributes """
self.encoding = _ensure_encoding(getattr(self.attrs, 'encoding', None))
self.errors = getattr(self.attrs, 'errors', 'strict')
for n in self.attributes:
setattr(self, n, _ensure_decoded(getattr(self.attrs, n, None)))
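One detail worth noting about get_attrs above: reading the attribute back with a 'strict' fallback means files written before this change, which carry no errors attribute at all, keep the old strict behavior. A tiny sketch of the pattern, where the Attrs class is a hypothetical stand-in for a PyTables attribute set:

    class Attrs:
        # a file written by older pandas: encoding was stored, errors never was
        encoding = 'UTF-8'

    attrs = Attrs()
    print(getattr(attrs, 'errors', 'strict'))  # -> strict
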

@@ -2506,7 +2511,7 @@ def write_index(self, key, index):
self.write_sparse_intindex(key, index)
else:
setattr(self.attrs, '%s_variety' % key, 'regular')
converted = _convert_index(index, self.encoding,
converted = _convert_index(index, self.encoding, self.errors,
self.format_type).set_name('index')

self.write_array(key, converted.values)
@@ -2552,7 +2557,7 @@ def write_multi_index(self, key, index):
index.names)):
# write the level
level_key = '%s_level%d' % (key, i)
conv_level = _convert_index(lev, self.encoding,
conv_level = _convert_index(lev, self.encoding, self.errors,
self.format_type).set_name(level_key)
self.write_array(level_key, conv_level.values)
node = getattr(self.group, level_key)
@@ -2613,11 +2618,13 @@ def read_index_node(self, node, start=None, stop=None):

if kind in (u('date'), u('datetime')):
index = factory(_unconvert_index(data, kind,
encoding=self.encoding),
encoding=self.encoding,
errors=self.errors),
dtype=object, **kwargs)
else:
index = factory(_unconvert_index(data, kind,
encoding=self.encoding), **kwargs)
encoding=self.encoding,
errors=self.errors), **kwargs)

index.name = name

@@ -2730,7 +2737,8 @@ def read_index_legacy(self, key, start=None, stop=None):
node = getattr(self.group, key)
data = node[start:stop]
kind = node._v_attrs.kind
return _unconvert_index_legacy(data, kind, encoding=self.encoding)
return _unconvert_index_legacy(data, kind, encoding=self.encoding,
errors=self.errors)


class LegacySeriesFixed(LegacyFixed):
@@ -3149,7 +3157,8 @@ def write_metadata(self, key, values):
"""
values = Series(values)
self.parent.put(self._get_metadata_path(key), values, format='table',
encoding=self.encoding, nan_rep=self.nan_rep)
encoding=self.encoding, errors=self.errors,
nan_rep=self.nan_rep)

def read_metadata(self, key):
""" return the meta data array for this key """
@@ -3170,6 +3179,7 @@ def set_attrs(self):
self.attrs.data_columns = self.data_columns
self.attrs.nan_rep = self.nan_rep
self.attrs.encoding = self.encoding
self.attrs.errors = self.errors
self.attrs.levels = self.levels
self.attrs.metadata = self.metadata
self.set_info()
@@ -3185,6 +3195,7 @@ def get_attrs(self):
self.nan_rep = getattr(self.attrs, 'nan_rep', None)
self.encoding = _ensure_encoding(
getattr(self.attrs, 'encoding', None))
self.errors = getattr(self.attrs, 'errors', 'strict')
self.levels = getattr(
self.attrs, 'levels', None) or []
self.index_axes = [
@@ -3342,7 +3353,8 @@ def read_axes(self, where, **kwargs):
# convert the data
for a in self.axes:
a.set_info(self.info)
a.convert(values, nan_rep=self.nan_rep, encoding=self.encoding)
a.convert(values, nan_rep=self.nan_rep, encoding=self.encoding,
errors=self.errors)

return True

@@ -3424,6 +3436,7 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None,
data_columns = existing_table.data_columns
nan_rep = existing_table.nan_rep
self.encoding = existing_table.encoding
self.errors = existing_table.errors
self.info = copy.copy(existing_table.info)
else:
existing_table = None
@@ -3450,7 +3463,7 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None,
if i in axes:
name = obj._AXIS_NAMES[i]
index_axes_map[i] = _convert_index(
a, self.encoding, self.format_type
a, self.encoding, self.errors, self.format_type
).set_name(name).set_axis(i)
else:

@@ -3569,8 +3582,8 @@ def get_blk_items(mgr, blocks):
min_itemsize=min_itemsize,
nan_rep=nan_rep,
encoding=self.encoding,
info=self.info,
**kwargs)
errors=self.errors,
info=self.info)
col.set_pos(j)

self.values_axes.append(col)
@@ -3734,7 +3747,8 @@ def read_column(self, column, where=None, start=None, stop=None, **kwargs):
a.set_info(self.info)
return Series(_set_tz(a.convert(c[start:stop],
nan_rep=self.nan_rep,
encoding=self.encoding
encoding=self.encoding,
errors=self.errors
).take_data(),
a.tz, True), name=column)

@@ -4415,7 +4429,7 @@ def _set_tz(values, tz, preserve_UTC=False, coerce=False):
return values


def _convert_index(index, encoding=None, format_type=None):
def _convert_index(index, encoding=None, errors='strict', format_type=None):
index_name = getattr(index, 'name', None)

if isinstance(index, DatetimeIndex):
@@ -4469,7 +4483,7 @@ def _convert_index(index, encoding=None, format_type=None):
# atom = _tables().ObjectAtom()
# return np.asarray(values, dtype='O'), 'object', atom

converted = _convert_string_array(values, encoding)
converted = _convert_string_array(values, encoding, errors)
itemsize = converted.dtype.itemsize
return IndexCol(
converted, 'string', _tables().StringCol(itemsize),
@@ -4500,7 +4514,7 @@ def _convert_index(index, encoding=None, format_type=None):
index_name=index_name)


def _unconvert_index(data, kind, encoding=None):
def _unconvert_index(data, kind, encoding=None, errors='strict'):
kind = _ensure_decoded(kind)
if kind == u('datetime64'):
index = DatetimeIndex(data)
@@ -4519,28 +4533,31 @@ def _unconvert_index(data, kind, encoding=None):
elif kind in (u('integer'), u('float')):
index = np.asarray(data)
elif kind in (u('string')):
index = _unconvert_string_array(data, nan_rep=None, encoding=encoding)
index = _unconvert_string_array(data, nan_rep=None, encoding=encoding,
errors=errors)
elif kind == u('object'):
index = np.asarray(data[0])
else: # pragma: no cover
raise ValueError('unrecognized index type %s' % kind)
return index


def _unconvert_index_legacy(data, kind, legacy=False, encoding=None):
def _unconvert_index_legacy(data, kind, legacy=False, encoding=None,
errors='strict'):
kind = _ensure_decoded(kind)
if kind == u('datetime'):
index = to_datetime(data)
elif kind in (u('integer')):
index = np.asarray(data, dtype=object)
elif kind in (u('string')):
index = _unconvert_string_array(data, nan_rep=None, encoding=encoding)
index = _unconvert_string_array(data, nan_rep=None, encoding=encoding,
errors=errors)
else: # pragma: no cover
raise ValueError('unrecognized index type %s' % kind)
return index


def _convert_string_array(data, encoding, itemsize=None):
def _convert_string_array(data, encoding, errors, itemsize=None):
"""
we take a string-like that is object dtype and coerce to a fixed size
string type
@@ -4549,6 +4566,7 @@ def _convert_string_array(data, encoding, itemsize=None):
----------
data : a numpy array of object dtype
encoding : None or string-encoding
errors : handler for encoding errors
itemsize : integer, optional, defaults to the max length of the strings

Returns
@@ -4559,7 +4577,7 @@ def _convert_string_array(data, encoding, itemsize=None):
# encode if needed
if encoding is not None and len(data):
data = Series(data.ravel()).str.encode(
encoding).values.reshape(data.shape)
encoding, errors).values.reshape(data.shape)

# create the sized dtype
if itemsize is None:
@@ -4570,7 +4588,8 @@ def _convert_string_array(data, encoding, itemsize=None):
return data


def _unconvert_string_array(data, nan_rep=None, encoding=None):
def _unconvert_string_array(data, nan_rep=None, encoding=None,
errors='strict'):
"""
inverse of _convert_string_array

@@ -4579,6 +4598,7 @@ def _unconvert_string_array(data, nan_rep=None, encoding=None):
data : fixed length string dtyped array
nan_rep : the storage repr of NaN, optional
encoding : the encoding of the data, optional
errors : handler for encoding errors, default 'strict'
Contributor: Can you show the options and/or point to the Python reference for these?
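The options are the standard codec error handlers accepted by str.encode and bytes.decode, documented at https://docs.python.org/3/library/codecs.html#error-handlers. A quick demonstration of how a few of them treat a lone surrogate:

    s = '\ud800foo'  # contains a lone surrogate

    s.encode('utf-8', errors='ignore')            # b'foo'
    s.encode('utf-8', errors='replace')           # b'?foo'
    s.encode('utf-8', errors='backslashreplace')  # b'\\ud800foo'
    s.encode('utf-8', errors='surrogatepass')     # b'\xed\xa0\x80foo'
    s.encode('utf-8', errors='strict')            # raises UnicodeEncodeError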


Returns
-------
@@ -4600,7 +4620,7 @@ def _unconvert_string_array(data, nan_rep=None, encoding=None):
dtype = "S{0}".format(itemsize)

if isinstance(data[0], compat.binary_type):
data = Series(data).str.decode(encoding).values
data = Series(data).str.decode(encoding, errors=errors).values
else:
data = data.astype(dtype, copy=False).astype(object, copy=False)

@@ -4611,22 +4631,23 @@ def _unconvert_string_array(data, nan_rep=None, encoding=None):
return data.reshape(shape)


def _maybe_convert(values, val_kind, encoding):
def _maybe_convert(values, val_kind, encoding, errors):
if _need_convert(val_kind):
conv = _get_converter(val_kind, encoding)
conv = _get_converter(val_kind, encoding, errors)
# conv = np.frompyfunc(conv, 1, 1)
values = conv(values)
return values


def _get_converter(kind, encoding):
def _get_converter(kind, encoding, errors):
kind = _ensure_decoded(kind)
if kind == 'datetime64':
return lambda x: np.asarray(x, dtype='M8[ns]')
elif kind == 'datetime':
return lambda x: to_datetime(x, cache=True).to_pydatetime()
elif kind == 'string':
return lambda x: _unconvert_string_array(x, encoding=encoding)
return lambda x: _unconvert_string_array(x, encoding=encoding,
errors=errors)
else: # pragma: no cover
raise ValueError('invalid kind %s' % kind)

12 changes: 12 additions & 0 deletions pandas/tests/io/test_pytables.py
@@ -1462,6 +1462,18 @@ def test_to_hdf_with_min_itemsize(self):
tm.assert_series_equal(pd.read_hdf(path, 'ss4'),
pd.concat([df['B'], df2['B']]))

@pytest.mark.parametrize("format", ['fixed', 'table'])
def test_to_hdf_errors(self, format):

data = ['\ud800foo']
ser = pd.Series(data, index=pd.Index(data))
with ensure_clean_path(self.path) as path:
# GH 20835
ser.to_hdf(path, 'table', format=format, errors='surrogatepass')

result = pd.read_hdf(path, 'table', errors='surrogatepass')
tm.assert_series_equal(result, ser)

def test_append_with_data_columns(self):

with ensure_clean_store(self.path) as store: