-
-
Notifications
You must be signed in to change notification settings - Fork 18.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Allow errors keyword for HDF IO Encoding Err Handling
#20873
Merged
Merged
Changes from 6 commits
Commits
Show all changes
8 commits
Select commit
Hold shift + click to select a range
f75ca6e
Added test case
WillAyd 97f6a54
Round trippable read/write with errors
WillAyd 9ae2ea0
Added index to test case
WillAyd cfe09d1
Mirrored encoding impl
WillAyd 3973ef7
Updated whatsnew
WillAyd 61a0c6b
LINT fixup
WillAyd 0fe838a
Document errors
TomAugspurger 9a13234
Merge remote-tracking branch 'upstream/master' into WillAyd-tbl-arg-pass
TomAugspurger File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -705,7 +705,7 @@ def select(self, key, where=None, start=None, stop=None, columns=None, | |
def func(_start, _stop, _where): | ||
return s.read(start=_start, stop=_stop, | ||
where=_where, | ||
columns=columns, **kwargs) | ||
columns=columns) | ||
|
||
# create the iterator | ||
it = TableIterator(self, s, func, where=where, nrows=s.nrows, | ||
|
@@ -1566,14 +1566,14 @@ def infer(self, handler): | |
new_self.read_metadata(handler) | ||
return new_self | ||
|
||
def convert(self, values, nan_rep, encoding): | ||
def convert(self, values, nan_rep, encoding, errors): | ||
""" set the values from this selection: take = take ownership """ | ||
|
||
# values is a recarray | ||
if values.dtype.fields is not None: | ||
values = values[self.cname] | ||
|
||
values = _maybe_convert(values, self.kind, encoding) | ||
values = _maybe_convert(values, self.kind, encoding, errors) | ||
|
||
kwargs = dict() | ||
if self.freq is not None: | ||
|
@@ -1748,7 +1748,7 @@ class GenericIndexCol(IndexCol): | |
def is_indexed(self): | ||
return False | ||
|
||
def convert(self, values, nan_rep, encoding): | ||
def convert(self, values, nan_rep, encoding, errors): | ||
""" set the values from this selection: take = take ownership """ | ||
|
||
self.values = Int64Index(np.arange(self.table.nrows)) | ||
|
@@ -1877,7 +1877,7 @@ def set_kind(self): | |
self.typ = getattr(self.description, self.cname, None) | ||
|
||
def set_atom(self, block, block_items, existing_col, min_itemsize, | ||
nan_rep, info, encoding=None, **kwargs): | ||
nan_rep, info, encoding=None, errors='strict'): | ||
""" create and setup my atom from the block b """ | ||
|
||
self.values = list(block_items) | ||
|
@@ -1922,7 +1922,8 @@ def set_atom(self, block, block_items, existing_col, min_itemsize, | |
existing_col, | ||
min_itemsize, | ||
nan_rep, | ||
encoding) | ||
encoding, | ||
errors) | ||
|
||
# set as a data block | ||
else: | ||
|
@@ -1932,7 +1933,7 @@ def get_atom_string(self, block, itemsize): | |
return _tables().StringCol(itemsize=itemsize, shape=block.shape[0]) | ||
|
||
def set_atom_string(self, block, block_items, existing_col, min_itemsize, | ||
nan_rep, encoding): | ||
nan_rep, encoding, errors): | ||
# fill nan items with myself, don't disturb the blocks by | ||
# trying to downcast | ||
block = block.fillna(nan_rep, downcast=False) | ||
|
@@ -1958,7 +1959,7 @@ def set_atom_string(self, block, block_items, existing_col, min_itemsize, | |
) | ||
|
||
# itemsize is the maximum length of a string (along any dimension) | ||
data_converted = _convert_string_array(data, encoding) | ||
data_converted = _convert_string_array(data, encoding, errors) | ||
itemsize = data_converted.itemsize | ||
|
||
# specified min_itemsize? | ||
|
@@ -2089,7 +2090,7 @@ def validate_attr(self, append): | |
raise ValueError("appended items dtype do not match existing " | ||
"items dtype in table!") | ||
|
||
def convert(self, values, nan_rep, encoding): | ||
def convert(self, values, nan_rep, encoding, errors): | ||
"""set the data from this selection (and convert to the correct dtype | ||
if we can) | ||
""" | ||
|
@@ -2163,7 +2164,7 @@ def convert(self, values, nan_rep, encoding): | |
# convert nans / decode | ||
if _ensure_decoded(self.kind) == u('string'): | ||
self.data = _unconvert_string_array( | ||
self.data, nan_rep=nan_rep, encoding=encoding) | ||
self.data, nan_rep=nan_rep, encoding=encoding, errors=errors) | ||
|
||
return self | ||
|
||
|
@@ -2229,10 +2230,12 @@ class Fixed(StringMixin): | |
ndim = None | ||
is_table = False | ||
|
||
def __init__(self, parent, group, encoding=None, **kwargs): | ||
def __init__(self, parent, group, encoding=None, errors='strict', | ||
**kwargs): | ||
self.parent = parent | ||
self.group = group | ||
self.encoding = _ensure_encoding(encoding) | ||
self.errors = errors | ||
self.set_version() | ||
|
||
@property | ||
|
@@ -2436,10 +2439,12 @@ def is_exists(self): | |
def set_attrs(self): | ||
""" set our object attributes """ | ||
self.attrs.encoding = self.encoding | ||
self.attrs.errors = self.errors | ||
|
||
def get_attrs(self): | ||
""" retrieve our attributes """ | ||
self.encoding = _ensure_encoding(getattr(self.attrs, 'encoding', None)) | ||
self.errors = getattr(self.attrs, 'errors', 'strict') | ||
for n in self.attributes: | ||
setattr(self, n, _ensure_decoded(getattr(self.attrs, n, None))) | ||
|
||
|
@@ -2506,7 +2511,7 @@ def write_index(self, key, index): | |
self.write_sparse_intindex(key, index) | ||
else: | ||
setattr(self.attrs, '%s_variety' % key, 'regular') | ||
converted = _convert_index(index, self.encoding, | ||
converted = _convert_index(index, self.encoding, self.errors, | ||
self.format_type).set_name('index') | ||
|
||
self.write_array(key, converted.values) | ||
|
@@ -2552,7 +2557,7 @@ def write_multi_index(self, key, index): | |
index.names)): | ||
# write the level | ||
level_key = '%s_level%d' % (key, i) | ||
conv_level = _convert_index(lev, self.encoding, | ||
conv_level = _convert_index(lev, self.encoding, self.errors, | ||
self.format_type).set_name(level_key) | ||
self.write_array(level_key, conv_level.values) | ||
node = getattr(self.group, level_key) | ||
|
@@ -2613,11 +2618,13 @@ def read_index_node(self, node, start=None, stop=None): | |
|
||
if kind in (u('date'), u('datetime')): | ||
index = factory(_unconvert_index(data, kind, | ||
encoding=self.encoding), | ||
encoding=self.encoding, | ||
errors=self.errors), | ||
dtype=object, **kwargs) | ||
else: | ||
index = factory(_unconvert_index(data, kind, | ||
encoding=self.encoding), **kwargs) | ||
encoding=self.encoding, | ||
errors=self.errors), **kwargs) | ||
|
||
index.name = name | ||
|
||
|
@@ -2730,7 +2737,8 @@ def read_index_legacy(self, key, start=None, stop=None): | |
node = getattr(self.group, key) | ||
data = node[start:stop] | ||
kind = node._v_attrs.kind | ||
return _unconvert_index_legacy(data, kind, encoding=self.encoding) | ||
return _unconvert_index_legacy(data, kind, encoding=self.encoding, | ||
errors=self.errors) | ||
|
||
|
||
class LegacySeriesFixed(LegacyFixed): | ||
|
@@ -3149,7 +3157,8 @@ def write_metadata(self, key, values): | |
""" | ||
values = Series(values) | ||
self.parent.put(self._get_metadata_path(key), values, format='table', | ||
encoding=self.encoding, nan_rep=self.nan_rep) | ||
encoding=self.encoding, errors=self.errors, | ||
nan_rep=self.nan_rep) | ||
|
||
def read_metadata(self, key): | ||
""" return the meta data array for this key """ | ||
|
@@ -3170,6 +3179,7 @@ def set_attrs(self): | |
self.attrs.data_columns = self.data_columns | ||
self.attrs.nan_rep = self.nan_rep | ||
self.attrs.encoding = self.encoding | ||
self.attrs.errors = self.errors | ||
self.attrs.levels = self.levels | ||
self.attrs.metadata = self.metadata | ||
self.set_info() | ||
|
@@ -3185,6 +3195,7 @@ def get_attrs(self): | |
self.nan_rep = getattr(self.attrs, 'nan_rep', None) | ||
self.encoding = _ensure_encoding( | ||
getattr(self.attrs, 'encoding', None)) | ||
self.errors = getattr(self.attrs, 'errors', 'strict') | ||
self.levels = getattr( | ||
self.attrs, 'levels', None) or [] | ||
self.index_axes = [ | ||
|
@@ -3342,7 +3353,8 @@ def read_axes(self, where, **kwargs): | |
# convert the data | ||
for a in self.axes: | ||
a.set_info(self.info) | ||
a.convert(values, nan_rep=self.nan_rep, encoding=self.encoding) | ||
a.convert(values, nan_rep=self.nan_rep, encoding=self.encoding, | ||
errors=self.errors) | ||
|
||
return True | ||
|
||
|
@@ -3424,6 +3436,7 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, | |
data_columns = existing_table.data_columns | ||
nan_rep = existing_table.nan_rep | ||
self.encoding = existing_table.encoding | ||
self.errors = existing_table.errors | ||
self.info = copy.copy(existing_table.info) | ||
else: | ||
existing_table = None | ||
|
@@ -3450,7 +3463,7 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, | |
if i in axes: | ||
name = obj._AXIS_NAMES[i] | ||
index_axes_map[i] = _convert_index( | ||
a, self.encoding, self.format_type | ||
a, self.encoding, self.errors, self.format_type | ||
).set_name(name).set_axis(i) | ||
else: | ||
|
||
|
@@ -3569,8 +3582,8 @@ def get_blk_items(mgr, blocks): | |
min_itemsize=min_itemsize, | ||
nan_rep=nan_rep, | ||
encoding=self.encoding, | ||
info=self.info, | ||
**kwargs) | ||
errors=self.errors, | ||
info=self.info) | ||
col.set_pos(j) | ||
|
||
self.values_axes.append(col) | ||
|
@@ -3734,7 +3747,8 @@ def read_column(self, column, where=None, start=None, stop=None, **kwargs): | |
a.set_info(self.info) | ||
return Series(_set_tz(a.convert(c[start:stop], | ||
nan_rep=self.nan_rep, | ||
encoding=self.encoding | ||
encoding=self.encoding, | ||
errors=self.errors | ||
).take_data(), | ||
a.tz, True), name=column) | ||
|
||
|
@@ -4415,7 +4429,7 @@ def _set_tz(values, tz, preserve_UTC=False, coerce=False): | |
return values | ||
|
||
|
||
def _convert_index(index, encoding=None, format_type=None): | ||
def _convert_index(index, encoding=None, errors='strict', format_type=None): | ||
index_name = getattr(index, 'name', None) | ||
|
||
if isinstance(index, DatetimeIndex): | ||
|
@@ -4469,7 +4483,7 @@ def _convert_index(index, encoding=None, format_type=None): | |
# atom = _tables().ObjectAtom() | ||
# return np.asarray(values, dtype='O'), 'object', atom | ||
|
||
converted = _convert_string_array(values, encoding) | ||
converted = _convert_string_array(values, encoding, errors) | ||
itemsize = converted.dtype.itemsize | ||
return IndexCol( | ||
converted, 'string', _tables().StringCol(itemsize), | ||
|
@@ -4500,7 +4514,7 @@ def _convert_index(index, encoding=None, format_type=None): | |
index_name=index_name) | ||
|
||
|
||
def _unconvert_index(data, kind, encoding=None): | ||
def _unconvert_index(data, kind, encoding=None, errors='strict'): | ||
kind = _ensure_decoded(kind) | ||
if kind == u('datetime64'): | ||
index = DatetimeIndex(data) | ||
|
@@ -4519,28 +4533,31 @@ def _unconvert_index(data, kind, encoding=None): | |
elif kind in (u('integer'), u('float')): | ||
index = np.asarray(data) | ||
elif kind in (u('string')): | ||
index = _unconvert_string_array(data, nan_rep=None, encoding=encoding) | ||
index = _unconvert_string_array(data, nan_rep=None, encoding=encoding, | ||
errors=errors) | ||
elif kind == u('object'): | ||
index = np.asarray(data[0]) | ||
else: # pragma: no cover | ||
raise ValueError('unrecognized index type %s' % kind) | ||
return index | ||
|
||
|
||
def _unconvert_index_legacy(data, kind, legacy=False, encoding=None): | ||
def _unconvert_index_legacy(data, kind, legacy=False, encoding=None, | ||
errors='strict'): | ||
kind = _ensure_decoded(kind) | ||
if kind == u('datetime'): | ||
index = to_datetime(data) | ||
elif kind in (u('integer')): | ||
index = np.asarray(data, dtype=object) | ||
elif kind in (u('string')): | ||
index = _unconvert_string_array(data, nan_rep=None, encoding=encoding) | ||
index = _unconvert_string_array(data, nan_rep=None, encoding=encoding, | ||
errors=errors) | ||
else: # pragma: no cover | ||
raise ValueError('unrecognized index type %s' % kind) | ||
return index | ||
|
||
|
||
def _convert_string_array(data, encoding, itemsize=None): | ||
def _convert_string_array(data, encoding, errors, itemsize=None): | ||
""" | ||
we take a string-like that is object dtype and coerce to a fixed size | ||
string type | ||
|
@@ -4549,6 +4566,7 @@ def _convert_string_array(data, encoding, itemsize=None): | |
---------- | ||
data : a numpy array of object dtype | ||
encoding : None or string-encoding | ||
errors : handler for encoding errors | ||
itemsize : integer, optional, defaults to the max length of the strings | ||
|
||
Returns | ||
|
@@ -4559,7 +4577,7 @@ def _convert_string_array(data, encoding, itemsize=None): | |
# encode if needed | ||
if encoding is not None and len(data): | ||
data = Series(data.ravel()).str.encode( | ||
encoding).values.reshape(data.shape) | ||
encoding, errors).values.reshape(data.shape) | ||
|
||
# create the sized dtype | ||
if itemsize is None: | ||
|
@@ -4570,7 +4588,8 @@ def _convert_string_array(data, encoding, itemsize=None): | |
return data | ||
|
||
|
||
def _unconvert_string_array(data, nan_rep=None, encoding=None): | ||
def _unconvert_string_array(data, nan_rep=None, encoding=None, | ||
errors='strict'): | ||
""" | ||
inverse of _convert_string_array | ||
|
||
|
@@ -4579,6 +4598,7 @@ def _unconvert_string_array(data, nan_rep=None, encoding=None): | |
data : fixed length string dtyped array | ||
nan_rep : the storage repr of NaN, optional | ||
encoding : the encoding of the data, optional | ||
errors : handler for encoding errors, default 'strict' | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you show options and/or point to the python ref for these |
||
|
||
Returns | ||
------- | ||
|
@@ -4600,7 +4620,7 @@ def _unconvert_string_array(data, nan_rep=None, encoding=None): | |
dtype = "S{0}".format(itemsize) | ||
|
||
if isinstance(data[0], compat.binary_type): | ||
data = Series(data).str.decode(encoding).values | ||
data = Series(data).str.decode(encoding, errors=errors).values | ||
else: | ||
data = data.astype(dtype, copy=False).astype(object, copy=False) | ||
|
||
|
@@ -4611,22 +4631,23 @@ def _unconvert_string_array(data, nan_rep=None, encoding=None): | |
return data.reshape(shape) | ||
|
||
|
||
def _maybe_convert(values, val_kind, encoding): | ||
def _maybe_convert(values, val_kind, encoding, errors): | ||
if _need_convert(val_kind): | ||
conv = _get_converter(val_kind, encoding) | ||
conv = _get_converter(val_kind, encoding, errors) | ||
# conv = np.frompyfunc(conv, 1, 1) | ||
values = conv(values) | ||
return values | ||
|
||
|
||
def _get_converter(kind, encoding): | ||
def _get_converter(kind, encoding, errors): | ||
kind = _ensure_decoded(kind) | ||
if kind == 'datetime64': | ||
return lambda x: np.asarray(x, dtype='M8[ns]') | ||
elif kind == 'datetime': | ||
return lambda x: to_datetime(x, cache=True).to_pydatetime() | ||
elif kind == 'string': | ||
return lambda x: _unconvert_string_array(x, encoding=encoding) | ||
return lambda x: _unconvert_string_array(x, encoding=encoding, | ||
errors=errors) | ||
else: # pragma: no cover | ||
raise ValueError('invalid kind %s' % kind) | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I removed this
**kwargs
argument because it was getting mangled when calling read_index_node
with arbitrary keyword arguments in read_hdf
. I think it was a mistake to be included originally