You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
s='\ud800'
srs=_pd.Series()
srs.loc[ 0 ] =s
srs.to_hdf( 'srs.hdf', key='srs', mode='w', format='fixed' ) # works fine
srs.to_hdf( 'srs.hdf', key='srs', mode='w', format='table' )
# fails with "UnicodeEncodeError: 'utf-8' codec can't encode character
# '\ud800' in position 0: surrogates not allowed"
Problem description
The default encoder options for to_hdf() with format = 'table' seem to specify errors = 'strict' as a default for the encoding. The problem is that while one can specify the encoding with, e.g., encoding = 'utf-8', the encoder errors cannot be specified.
/usr/lib/python3.6/site-packages/pandas/io/pytables.py in set_atom(self, block, block_items, existing_col, min_itemsize, nan_rep, info, encoding, **kwargs)
1923 min_itemsize,
1924 nan_rep,
-> 1925 encoding)
1926
1927 # set as a data block
/usr/lib/python3.6/site-packages/pandas/io/pytables.py in set_atom_string(self, block, block_items, existing_col, min_itemsize, nan_rep, encoding)
1959
1960 # itemsize is the maximum length of a string (along any dimension)
-> 1961 data_converted = _convert_string_array(data, encoding)
1962 itemsize = data_converted.itemsize
1963
/usr/lib/python3.6/site-packages/pandas/io/pytables.py in _convert_string_array(data, encoding, itemsize)
4569 if encoding is not None and len(data):
4570 data = Series(data.ravel()).str.encode(
-> 4571 encoding).values.reshape(data.shape)
4572
4573 # create the sized dtype
/usr/lib/python3.6/site-packages/pandas/core/strings.py in _na_map(f, arr, na_result, dtype)
154 def _na_map(f, arr, na_result=np.nan, dtype=object):
155 # should really check for NA
--> 156 return _map(f, arr, na_mask=True, na_value=na_result, dtype=dtype)
157
158
/usr/lib/python3.6/site-packages/pandas/core/strings.py in _map(f, arr, na_mask, na_value, dtype)
169 try:
170 convert = not all(mask)
--> 171 result = lib.map_infer_mask(arr, f, mask.view(np.uint8), convert)
172 except (TypeError, AttributeError) as e:
173 # Reraise the exception if callable f got wrong number of args.
pandas/_libs/src/inference.pyx in pandas._libs.lib.map_infer_mask()
That is to say, I'd like to be able to specify the `errors` argument for an encoder (and, of course, for the decoder also).
Perhaps better yet would be to also change the default errors (at least in this case) to errors = 'surrogatepass' for both the encoder and decoder, so as to preserve the strings as they are without errors or surprises.
As an example in plain Python:
s='\ud800'
b=s.encode( 'utf-8', errors='surrogatepass' )
s==b.decode( 'utf-8', errors='surrogatepass' ) # True, and no errors - it just 'works'
Code Sample, a copy-pastable example if possible
Problem description
The default encoder options for to_hdf() with format = 'table' seem to specify errors = 'strict' as a default for the encoding. The problem is that while one can specify the encoding with, e.g., encoding = 'utf-8', the encoder errors cannot be specified.
Below is an iPython stacktrace:
srs.to_hdf( 'srs.hdf', key = 'srs', mode = 'w', format = 'table' )
UnicodeEncodeError Traceback (most recent call last)
in ()
----> 1 srs.to_hdf( 'srs.hdf', key = 'srs', mode = 'w', format = 'table' )
/usr/lib/python3.6/site-packages/pandas/core/generic.py in to_hdf(self, path_or_buf, key, **kwargs)
1469
1470 from pandas.io import pytables
-> 1471 return pytables.to_hdf(path_or_buf, key, self, **kwargs)
1472
1473 def to_msgpack(self, path_or_buf=None, encoding='utf-8', **kwargs):
/usr/lib/python3.6/site-packages/pandas/io/pytables.py in to_hdf(path_or_buf, key, value, mode, complevel, complib, append, **kwargs)
279 with HDFStore(path_or_buf, mode=mode, complevel=complevel,
280 complib=complib) as store:
--> 281 f(store)
282 else:
283 f(path_or_buf)
/usr/lib/python3.6/site-packages/pandas/io/pytables.py in <lambda>(store)
273 f = lambda store: store.append(key, value, **kwargs)
274 else:
--> 275 f = lambda store: store.put(key, value, **kwargs)
276
277 path_or_buf = _stringify_path(path_or_buf)
/usr/lib/python3.6/site-packages/pandas/io/pytables.py in put(self, key, value, format, append, **kwargs)
864 format = get_option("io.hdf.default_format") or 'fixed'
865 kwargs = self._validate_format(format, kwargs)
--> 866 self._write_to_group(key, value, append=append, **kwargs)
867
868 def remove(self, key, where=None, start=None, stop=None):
/usr/lib/python3.6/site-packages/pandas/io/pytables.py in _write_to_group(self, key, value, format, index, append, complib, encoding, **kwargs)
1339
1340 # write the object
-> 1341 s.write(obj=value, append=append, complib=complib, **kwargs)
1342
1343 if s.is_table and index:
/usr/lib/python3.6/site-packages/pandas/io/pytables.py in write(self, obj, data_columns, **kwargs)
4208 obj.columns = [name]
4209 return super(AppendableSeriesTable, self).write(
-> 4210 obj=obj, data_columns=obj.columns.tolist(), **kwargs)
4211
4212 def read(self, columns=None, **kwargs):
/usr/lib/python3.6/site-packages/pandas/io/pytables.py in write(self, obj, axes, append, complib, complevel, fletcher32, min_itemsize, chunksize, expectedrows, dropna, **kwargs)
3905 self.create_axes(axes=axes, obj=obj, validate=append,
3906 min_itemsize=min_itemsize,
-> 3907 **kwargs)
3908
3909 for a in self.axes:
/usr/lib/python3.6/site-packages/pandas/io/pytables.py in create_axes(self, axes, obj, validate, nan_rep, data_columns, min_itemsize, **kwargs)
3577 self.values_axes.append(col)
3578 except (NotImplementedError, ValueError, TypeError) as e:
-> 3579 raise e
3580 except Exception as detail:
3581 raise Exception(
/usr/lib/python3.6/site-packages/pandas/io/pytables.py in create_axes(self, axes, obj, validate, nan_rep, data_columns, min_itemsize, **kwargs)
3572 encoding=self.encoding,
3573 info=self.info,
-> 3574 **kwargs)
3575 col.set_pos(j)
3576
/usr/lib/python3.6/site-packages/pandas/io/pytables.py in set_atom(self, block, block_items, existing_col, min_itemsize, nan_rep, info, encoding, **kwargs)
1923 min_itemsize,
1924 nan_rep,
-> 1925 encoding)
1926
1927 # set as a data block
/usr/lib/python3.6/site-packages/pandas/io/pytables.py in set_atom_string(self, block, block_items, existing_col, min_itemsize, nan_rep, encoding)
1959
1960 # itemsize is the maximum length of a string (along any dimension)
-> 1961 data_converted = _convert_string_array(data, encoding)
1962 itemsize = data_converted.itemsize
1963
/usr/lib/python3.6/site-packages/pandas/io/pytables.py in _convert_string_array(data, encoding, itemsize)
4569 if encoding is not None and len(data):
4570 data = Series(data.ravel()).str.encode(
-> 4571 encoding).values.reshape(data.shape)
4572
4573 # create the sized dtype
/usr/lib/python3.6/site-packages/pandas/core/strings.py in encode(self, encoding, errors)
1655 @copy(str_encode)
1656 def encode(self, encoding, errors="strict"):
-> 1657 result = str_encode(self._data, encoding, errors)
1658 return self._wrap_result(result)
1659
/usr/lib/python3.6/site-packages/pandas/core/strings.py in str_encode(arr, encoding, errors)
1309 encoder = codecs.getencoder(encoding)
1310 f = lambda x: encoder(x, errors)[0]
-> 1311 return _na_map(f, arr)
1312
1313
/usr/lib/python3.6/site-packages/pandas/core/strings.py in _na_map(f, arr, na_result, dtype)
154 def _na_map(f, arr, na_result=np.nan, dtype=object):
155 # should really check for NA
--> 156 return _map(f, arr, na_mask=True, na_value=na_result, dtype=dtype)
157
158
/usr/lib/python3.6/site-packages/pandas/core/strings.py in _map(f, arr, na_mask, na_value, dtype)
169 try:
170 convert = not all(mask)
--> 171 result = lib.map_infer_mask(arr, f, mask.view(np.uint8), convert)
172 except (TypeError, AttributeError) as e:
173 # Reraise the exception if callable f got wrong number of args.
pandas/_libs/src/inference.pyx in pandas._libs.lib.map_infer_mask()
/usr/lib/python3.6/site-packages/pandas/core/strings.py in <lambda>(x)
1308 else:
1309 encoder = codecs.getencoder(encoding)
-> 1310 f = lambda x: encoder(x, errors)[0]
1311 return _na_map(f, arr)
1312
UnicodeEncodeError: 'utf-8' codec can't encode character '\ud800' in position 0: surrogates not allowed
Expected Output
What I'd like to see is the following working:
That is to say, I'd like to be able to specify the errors argument for an encoder (and, of course, for the decoder also).
Perhaps better yet would be to also change the default errors (at least in this case) to errors = 'surrogatepass' for both the encoder and decoder, so as to preserve the strings as they are without errors or surprises.
As an example in plain Python:
Output of pd.show_versions()
INSTALLED VERSIONS
commit: None
python: 3.6.5.final.0
python-bits: 64
OS: Linux
OS-release: 4.16.3-1-ARCH
machine: x86_64
processor:
byteorder: little
LC_ALL: None
LANG: en_US.UTF-8
LOCALE: en_US.UTF-8
pandas: 0.22.0
pytest: 3.5.0
pip: 9.0.1
setuptools: 39.0.1
Cython: None
numpy: 1.14.2
scipy: 1.0.1
pyarrow: None
xarray: None
IPython: 6.3.1
sphinx: None
patsy: 0.5.0
dateutil: 2.7.2
pytz: 2018.4
blosc: 1.5.1
bottleneck: 1.2.1
tables: 3.4.2
numexpr: 2.6.4
feather: None
matplotlib: 2.2.2
openpyxl: None
xlrd: None
xlwt: None
xlsxwriter: None
lxml: None
bs4: 4.6.0
html5lib: 1.0.1
sqlalchemy: 1.2.7
pymysql: None
psycopg2: None
jinja2: None
s3fs: None
fastparquet: None
pandas_gbq: None
pandas_datareader: 0.6.0
The text was updated successfully, but these errors were encountered: