"to_hdf()" with "format='table'" ignores encoder "errors" argument. #20835

John-Doe-Smith · 2018-04-26T20:50:59Z

Code Sample, a copy-pastable example if possible

s = '\ud800'
srs = _pd.Series()
srs.loc[ 0 ] = s

srs.to_hdf( 'srs.hdf', key = 'srs', mode = 'w', format = 'fixed' )  # works fine
srs.to_hdf( 'srs.hdf', key = 'srs', mode = 'w', format = 'table' )
        # fails with "UnicodeEncodeError: 'utf-8' codec can't encode character
        # '\ud800' in position 0: surrogates not allowed"

Problem description

With format = 'table', to_hdf() appears to encode strings with errors = 'strict' by default. The problem is that while one can specify the encoding with, e.g., encoding = 'utf-8', there is no way to specify the encoder's errors handler.

Below is an iPython stacktrace:

srs.to_hdf( 'srs.hdf', key = 'srs', mode = 'w', format = 'table' )

UnicodeEncodeError Traceback (most recent call last)
<ipython-input> in <module>()
----> 1 srs.to_hdf( 'srs.hdf', key = 'srs', mode = 'w', format = 'table' )

/usr/lib/python3.6/site-packages/pandas/core/generic.py in to_hdf(self, path_or_buf, key, **kwargs)
1469
1470 from pandas.io import pytables
-> 1471 return pytables.to_hdf(path_or_buf, key, self, **kwargs)
1472
1473 def to_msgpack(self, path_or_buf=None, encoding='utf-8', **kwargs):

/usr/lib/python3.6/site-packages/pandas/io/pytables.py in to_hdf(path_or_buf, key, value, mode, complevel, complib, append, **kwargs)
279 with HDFStore(path_or_buf, mode=mode, complevel=complevel,
280 complib=complib) as store:
--> 281 f(store)
282 else:
283 f(path_or_buf)

/usr/lib/python3.6/site-packages/pandas/io/pytables.py in <lambda>(store)
273 f = lambda store: store.append(key, value, **kwargs)
274 else:
--> 275 f = lambda store: store.put(key, value, **kwargs)
276
277 path_or_buf = _stringify_path(path_or_buf)

/usr/lib/python3.6/site-packages/pandas/io/pytables.py in put(self, key, value, format, append, **kwargs)
864 format = get_option("io.hdf.default_format") or 'fixed'
865 kwargs = self._validate_format(format, kwargs)
--> 866 self._write_to_group(key, value, append=append, **kwargs)
867
868 def remove(self, key, where=None, start=None, stop=None):

/usr/lib/python3.6/site-packages/pandas/io/pytables.py in _write_to_group(self, key, value, format, index, append, complib, encoding, **kwargs)
1339
1340 # write the object
-> 1341 s.write(obj=value, append=append, complib=complib, **kwargs)
1342
1343 if s.is_table and index:

/usr/lib/python3.6/site-packages/pandas/io/pytables.py in write(self, obj, data_columns, **kwargs)
4208 obj.columns = [name]
4209 return super(AppendableSeriesTable, self).write(
-> 4210 obj=obj, data_columns=obj.columns.tolist(), **kwargs)
4211
4212 def read(self, columns=None, **kwargs):

/usr/lib/python3.6/site-packages/pandas/io/pytables.py in write(self, obj, axes, append, complib, complevel, fletcher32, min_itemsize, chunksize, expectedrows, dropna, **kwargs)
3905 self.create_axes(axes=axes, obj=obj, validate=append,
3906 min_itemsize=min_itemsize,
-> 3907 **kwargs)
3908
3909 for a in self.axes:

/usr/lib/python3.6/site-packages/pandas/io/pytables.py in create_axes(self, axes, obj, validate, nan_rep, data_columns, min_itemsize, **kwargs)
3577 self.values_axes.append(col)
3578 except (NotImplementedError, ValueError, TypeError) as e:
-> 3579 raise e
3580 except Exception as detail:
3581 raise Exception(

/usr/lib/python3.6/site-packages/pandas/io/pytables.py in create_axes(self, axes, obj, validate, nan_rep, data_columns, min_itemsize, **kwargs)
3572 encoding=self.encoding,
3573 info=self.info,
-> 3574 **kwargs)
3575 col.set_pos(j)
3576

/usr/lib/python3.6/site-packages/pandas/io/pytables.py in set_atom(self, block, block_items, existing_col, min_itemsize, nan_rep, info, encoding, **kwargs)
1923 min_itemsize,
1924 nan_rep,
-> 1925 encoding)
1926
1927 # set as a data block

/usr/lib/python3.6/site-packages/pandas/io/pytables.py in set_atom_string(self, block, block_items, existing_col, min_itemsize, nan_rep, encoding)
1959
1960 # itemsize is the maximum length of a string (along any dimension)
-> 1961 data_converted = _convert_string_array(data, encoding)
1962 itemsize = data_converted.itemsize
1963

/usr/lib/python3.6/site-packages/pandas/io/pytables.py in _convert_string_array(data, encoding, itemsize)
4569 if encoding is not None and len(data):
4570 data = Series(data.ravel()).str.encode(
-> 4571 encoding).values.reshape(data.shape)
4572
4573 # create the sized dtype

/usr/lib/python3.6/site-packages/pandas/core/strings.py in encode(self, encoding, errors)
1655 @copy(str_encode)
1656 def encode(self, encoding, errors="strict"):
-> 1657 result = str_encode(self._data, encoding, errors)
1658 return self._wrap_result(result)
1659

/usr/lib/python3.6/site-packages/pandas/core/strings.py in str_encode(arr, encoding, errors)
1309 encoder = codecs.getencoder(encoding)
1310 f = lambda x: encoder(x, errors)[0]
-> 1311 return _na_map(f, arr)
1312
1313

/usr/lib/python3.6/site-packages/pandas/core/strings.py in _na_map(f, arr, na_result, dtype)
154 def _na_map(f, arr, na_result=np.nan, dtype=object):
155 # should really check for NA
--> 156 return _map(f, arr, na_mask=True, na_value=na_result, dtype=dtype)
157
158

/usr/lib/python3.6/site-packages/pandas/core/strings.py in _map(f, arr, na_mask, na_value, dtype)
169 try:
170 convert = not all(mask)
--> 171 result = lib.map_infer_mask(arr, f, mask.view(np.uint8), convert)
172 except (TypeError, AttributeError) as e:
173 # Reraise the exception if callable f got wrong number of args.

pandas/_libs/src/inference.pyx in pandas._libs.lib.map_infer_mask()

/usr/lib/python3.6/site-packages/pandas/core/strings.py in <lambda>(x)
1308 else:
1309 encoder = codecs.getencoder(encoding)
-> 1310 f = lambda x: encoder(x, errors)[0]
1311 return _na_map(f, arr)
1312

UnicodeEncodeError: 'utf-8' codec can't encode character '\ud800' in position 0: surrogates not allowed

Expected Output

What I'd like to see is the following working:

srs.to_hdf( 'srs.hdf', key = 'srs', mode = 'w', format = 'table', \
        encoding = 'utf-8', errors = 'surrogatepass' )

That is to say, I'd like to be able to specify the errors argument for the encoder (and, of course, for the decoder as well).

Perhaps better yet would be to also change the default errors (at least in this case) to errors = 'surrogatepass' for both the encoder and decoder, so as to preserve the strings as they are without errors or surprises.

As an example in plain Python:

s = '\ud800'
b = s.encode( 'utf-8', errors = 'surrogatepass' )
s == b.decode( 'utf-8', errors = 'surrogatepass' )  # True, and no errors - it just 'works'

Output of `pd.show_versions()`

INSTALLED VERSIONS

commit: None
python: 3.6.5.final.0
python-bits: 64
OS: Linux
OS-release: 4.16.3-1-ARCH
machine: x86_64
processor:
byteorder: little
LC_ALL: None
LANG: en_US.UTF-8
LOCALE: en_US.UTF-8

pandas: 0.22.0
pytest: 3.5.0
pip: 9.0.1
setuptools: 39.0.1
Cython: None
numpy: 1.14.2
scipy: 1.0.1
pyarrow: None
xarray: None
IPython: 6.3.1
sphinx: None
patsy: 0.5.0
dateutil: 2.7.2
pytz: 2018.4
blosc: 1.5.1
bottleneck: 1.2.1
tables: 3.4.2
numexpr: 2.6.4
feather: None
matplotlib: 2.2.2
openpyxl: None
xlrd: None
xlwt: None
xlsxwriter: None
lxml: None
bs4: 4.6.0
html5lib: 1.0.1
sqlalchemy: 1.2.7
pymysql: None
psycopg2: None
jinja2: None
s3fs: None
fastparquet: None
pandas_gbq: None
pandas_datareader: 0.6.0

The text was updated successfully, but these errors were encountered:

WillAyd · 2018-04-30T06:26:54Z

Thanks for the report. I referenced this issue in a PR that I believe will allow this.

WillAyd mentioned this issue Apr 30, 2018

Allow errors keyword for HDF IO Encoding Err Handling #20873

Merged

4 tasks

jreback added the Bug, Unicode (Unicode strings), IO HDF5 (read_hdf, HDFStore), and Compat (pandas objects compatibility with NumPy or Python functions) labels and removed the Bug label Apr 30, 2018

jreback modified the milestones: Next Major Release, 0.23.0 Apr 30, 2018

TomAugspurger closed this as completed in #20873 May 1, 2018

obilodeau mentioned this issue Sep 5, 2018

to_csv() surrogates not allowed #22610

Closed

kprestel mentioned this issue Oct 28, 2018

CLN:Remove unused **kwargs from user facing methods #23249

Merged

2 tasks

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

"to_hdf()" with "format='table'" ignores encoder "errors" argument. #20835

"to_hdf()" with "format='table'" ignores encoder "errors" argument. #20835

John-Doe-Smith commented Apr 26, 2018

srs.to_hdf( 'srs.hdf', key = 'srs', mode = 'w', format = 'table' )

INSTALLED VERSIONS

WillAyd commented Apr 30, 2018 •

edited

Loading

"to_hdf()" with "format='table'" ignores encoder "errors" argument. #20835

"to_hdf()" with "format='table'" ignores encoder "errors" argument. #20835

Comments

John-Doe-Smith commented Apr 26, 2018

Code Sample, a copy-pastable example if possible

Problem description

srs.to_hdf( 'srs.hdf', key = 'srs', mode = 'w', format = 'table' )

Expected Output

Output of pd.show_versions()

INSTALLED VERSIONS

WillAyd commented Apr 30, 2018 • edited Loading

Output of `pd.show_versions()`

WillAyd commented Apr 30, 2018 •

edited

Loading