From ac0898f08890dafb9a9e5cbc9515e15112616d53 Mon Sep 17 00:00:00 2001 From: y-p Date: Sun, 11 Nov 2012 18:45:28 +0200 Subject: [PATCH 1/9] TST: series tidy_repr with unicode data values --- pandas/tests/test_series.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index a906489e67b57..db0a37978000d 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -1043,6 +1043,11 @@ def test_repr(self): rep_str = repr(ser) self.assert_("Name: 0" in rep_str) + def test_tidy_repr(self): + a=Series([u"\u05d0"]*1000) + a.name= 'title1' + repr(a) # should not raise exception + def test_repr_bool_fails(self): s = Series([DataFrame(np.random.randn(2,2)) for i in range(5)]) From 007622ddee58087b49480220a4d51d03fa59b3f5 Mon Sep 17 00:00:00 2001 From: y-p Date: Sun, 11 Nov 2012 18:15:24 +0200 Subject: [PATCH 2/9] ENH: Series tidy_repr should use pprint_thing and console_encode #2225 --- pandas/core/series.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 3241044a63c68..62cc9a2f42cb3 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -884,7 +884,11 @@ def _tidy_repr(self, max_vals=20): length=False, name=False) result = head + '\n...\n' + tail - return '%s\n%s' % (result, self._repr_footer()) + result = '%s\n%s' % (result, self._repr_footer()) + + if py3compat.PY3: + return unicode(result) + return com.console_encode(result) def _repr_footer(self): namestr = "Name: %s, " % com.pprint_thing(self.name) if self.name is not None else "" From 2599741cbda7272eb0c9a9d00bbc340cb5c737ea Mon Sep 17 00:00:00 2001 From: y-p Date: Sun, 11 Nov 2012 18:16:17 +0200 Subject: [PATCH 3/9] ENH: to_string() and to_str_columns() should return unicode, deprecate force_unicode #2225 using pprint_thing will try to decode using utf-8 as a fallback, but these functions will now return unicode() rather than str() objects. 
--- pandas/core/format.py | 49 ++++++++++++++++--------------------- pandas/core/frame.py | 15 +++++------- pandas/tests/test_format.py | 3 +-- 3 files changed, 28 insertions(+), 39 deletions(-) diff --git a/pandas/core/format.py b/pandas/core/format.py index 13e504a8e1f88..b28e825fd6f99 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -135,9 +135,7 @@ def to_string(self): if footer: result.append(footer) - if py3compat.PY3: - return unicode(u'\n'.join(result)) - return com.console_encode(u'\n'.join(result)) + return unicode(u'\n'.join(result)) if py3compat.PY3: # pragma: no cover _encode_diff = lambda x: 0 @@ -200,10 +198,15 @@ def __init__(self, frame, buf=None, columns=None, col_space=None, else: self.columns = frame.columns - def _to_str_columns(self, force_unicode=False): + def _to_str_columns(self, force_unicode=None): """ Render a DataFrame to a list of columns (as lists of strings). """ + import warnings + if force_unicode is not None: # pragma: no cover + warnings.warn("force_unicode is deprecated, it will have no effect", + FutureWarning) + # may include levels names also str_index = self._get_formatted_index() str_columns = self._get_formatted_column_labels() @@ -237,32 +240,17 @@ def _to_str_columns(self, force_unicode=False): if self.index: strcols.insert(0, str_index) - if not py3compat.PY3: - if force_unicode: - def make_unicode(x): - if isinstance(x, unicode): - return x - return x.decode('utf-8') - strcols = map(lambda col: map(make_unicode, col), strcols) - else: - # Generally everything is plain strings, which has ascii - # encoding. Problem is when there is a char with value over - # 127. Everything then gets converted to unicode. 
- try: - map(lambda col: map(str, col), strcols) - except UnicodeError: - def make_unicode(x): - if isinstance(x, unicode): - return x - return x.decode('utf-8') - strcols = map(lambda col: map(make_unicode, col), strcols) - return strcols - def to_string(self, force_unicode=False): + def to_string(self, force_unicode=None): """ Render a DataFrame to a console-friendly tabular output. """ + import warnings + if force_unicode is not None: # pragma: no cover + warnings.warn("force_unicode is deprecated, it will have no effect", + FutureWarning) + frame = self.frame if len(frame.columns) == 0 or len(frame.index) == 0: @@ -272,15 +260,20 @@ def to_string(self, force_unicode=False): com.pprint_thing(frame.index))) text = info_line else: - strcols = self._to_str_columns(force_unicode) + strcols = self._to_str_columns() text = adjoin(1, *strcols) self.buf.writelines(text) - def to_latex(self, force_unicode=False, column_format=None): + def to_latex(self, force_unicode=None, column_format=None): """ Render a DataFrame to a LaTeX tabular environment output. 
""" + import warnings + if force_unicode is not None: # pragma: no cover + warnings.warn("force_unicode is deprecated, it will have no effect", + FutureWarning) + frame = self.frame if len(frame.columns) == 0 or len(frame.index) == 0: @@ -289,7 +282,7 @@ def to_latex(self, force_unicode=False, column_format=None): frame.columns, frame.index)) strcols = [[info_line]] else: - strcols = self._to_str_columns(force_unicode) + strcols = self._to_str_columns() if column_format is None: column_format = '|l|%s|' % '|'.join('c' for _ in strcols) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f7f296e822e15..439c59a6a8de2 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1379,19 +1379,21 @@ def to_excel(self, excel_writer, sheet_name='sheet1', na_rep='', def to_string(self, buf=None, columns=None, col_space=None, colSpace=None, header=True, index=True, na_rep='NaN', formatters=None, float_format=None, sparsify=None, nanRep=None, - index_names=True, justify=None, force_unicode=False): + index_names=True, justify=None, force_unicode=None): """ Render a DataFrame to a console-friendly tabular output. 
""" + import warnings + if force_unicode is not None: # pragma: no cover + warnings.warn("force_unicode is deprecated, it will have no effect", + FutureWarning) if nanRep is not None: # pragma: no cover - import warnings warnings.warn("nanRep is deprecated, use na_rep", FutureWarning) na_rep = nanRep if colSpace is not None: # pragma: no cover - import warnings warnings.warn("colSpace is deprecated, use col_space", FutureWarning) col_space = colSpace @@ -1404,15 +1406,10 @@ def to_string(self, buf=None, columns=None, col_space=None, colSpace=None, justify=justify, index_names=index_names, header=header, index=index) - formatter.to_string(force_unicode=force_unicode) + formatter.to_string() if buf is None: result = formatter.buf.getvalue() - if not force_unicode: - try: - result = str(result) - except ValueError: - pass return result @Appender(fmt.docstring_to_string, indents=1) diff --git a/pandas/tests/test_format.py b/pandas/tests/test_format.py index 10bb75bfbb5b6..8e8772dcab01d 100644 --- a/pandas/tests/test_format.py +++ b/pandas/tests/test_format.py @@ -135,7 +135,7 @@ def test_to_string_unicode_columns(self): df.info(buf=buf) buf.getvalue() - result = self.frame.to_string(force_unicode=True) + result = self.frame.to_string() self.assert_(isinstance(result, unicode)) def test_to_string_unicode_two(self): @@ -495,7 +495,6 @@ def test_to_string_int_formatting(self): self.assert_(issubclass(df['x'].dtype.type, np.integer)) output = df.to_string() - self.assert_(isinstance(output, str)) expected = (' x\n' '0 -15\n' '1 20\n' From a34ac81bbe5be142455fe30f5d1541860684cd21 Mon Sep 17 00:00:00 2001 From: y-p Date: Wed, 14 Nov 2012 03:20:23 +0200 Subject: [PATCH 4/9] ENH: convert more internal string processing to unicode, SeriesFormatter, Index.format, etc' --- pandas/core/format.py | 16 ++++------------ pandas/core/index.py | 6 +++--- pandas/tests/test_format.py | 14 +++++++------- 3 files changed, 14 insertions(+), 22 deletions(-) diff --git a/pandas/core/format.py 
b/pandas/core/format.py index b28e825fd6f99..79077088384ae 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -62,7 +62,7 @@ class SeriesFormatter(object): def __init__(self, series, buf=None, header=True, length=True, na_rep='NaN', name=False, float_format=None): self.series = series - self.buf = buf if buf is not None else StringIO() + self.buf = buf if buf is not None else StringIO(u"") self.name = name self.na_rep = na_rep self.length = length @@ -112,7 +112,7 @@ def to_string(self): series = self.series if len(series) == 0: - return '' + return u'' fmt_index, have_header = self._get_formatted_index() fmt_values = self._get_formatted_values() @@ -719,18 +719,10 @@ def __init__(self, values, digits=7, formatter=None, na_rep='NaN', self.justify = justify def get_result(self): - if self._have_unicode(): - fmt_values = self._format_strings(use_unicode=True) - else: - fmt_values = self._format_strings(use_unicode=False) - + fmt_values = self._format_strings() return _make_fixed_width(fmt_values, self.justify) - def _have_unicode(self): - mask = lib.map_infer(self.values, lambda x: isinstance(x, unicode)) - return mask.any() - - def _format_strings(self, use_unicode=False): + def _format_strings(self): if self.float_format is None: float_format = print_config.float_format if float_format is None: diff --git a/pandas/core/index.py b/pandas/core/index.py index b7792309f66ff..500712d42ee80 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -394,8 +394,8 @@ def format(self, name=False): result = [] for dt in self: if dt.time() != zero_time or dt.tzinfo is not None: - return header + ['%s' % x for x in self] - result.append('%d-%.2d-%.2d' % (dt.year, dt.month, dt.day)) + return header + [u'%s' % x for x in self] + result.append(u'%d-%.2d-%.2d' % (dt.year, dt.month, dt.day)) return header + result values = self.values @@ -1496,7 +1496,7 @@ def format(self, space=2, sparsify=None, adjoin=True, names=False, formatted = lev.take(lab).format() else: 
# weird all NA case - formatted = [str(x) for x in com.take_1d(lev.values, lab)] + formatted = [com.pprint_thing(x) for x in com.take_1d(lev.values, lab)] stringified_levels.append(formatted) result_levels = [] diff --git a/pandas/tests/test_format.py b/pandas/tests/test_format.py index 8e8772dcab01d..0b5182acb7f72 100644 --- a/pandas/tests/test_format.py +++ b/pandas/tests/test_format.py @@ -840,16 +840,16 @@ def test_to_string(self): def test_to_string_mixed(self): s = Series(['foo', np.nan, -1.23, 4.56]) result = s.to_string() - expected = ('0 foo\n' - '1 NaN\n' - '2 -1.23\n' - '3 4.56') + expected = (u'0 foo\n' + u'1 NaN\n' + u'2 -1.23\n' + u'3 4.56') self.assertEqual(result, expected) # but don't count NAs as floats s = Series(['foo', np.nan, 'bar', 'baz']) result = s.to_string() - expected = ('0 foo\n' + expected = (u'0 foo\n' '1 NaN\n' '2 bar\n' '3 baz') @@ -857,7 +857,7 @@ def test_to_string_mixed(self): s = Series(['foo', 5, 'bar', 'baz']) result = s.to_string() - expected = ('0 foo\n' + expected = (u'0 foo\n' '1 5\n' '2 bar\n' '3 baz') @@ -868,7 +868,7 @@ def test_to_string_float_na_spacing(self): s[::2] = np.nan result = s.to_string() - expected = ('0 NaN\n' + expected = (u'0 NaN\n' '1 1.5678\n' '2 NaN\n' '3 -3.0000\n' From 695aa061e55d0c4dd8357e958e082d0d6de6e2fd Mon Sep 17 00:00:00 2001 From: y-p Date: Wed, 14 Nov 2012 03:19:56 +0200 Subject: [PATCH 5/9] DOC: add note about formatters needing to return unicode (if returning strings) we need to keep everything unicode at the bottom levels, so that we can combine strings with other unicode strings at the I/O choke-points, otherwise python tries to coerce bytestring into unicode using 'ascii' encoding, and we get UnicodeDecodeError DOC: add note about formatters needing to return unicode )if returning strings) we need to keep everything unicode at the bottom levels, so that we can combine strings with other unicode strings at the I/O choke-points, otherwise python tries to coerce bytestring into unicode 
using 'ascii' encoding, and we get UnicodeDecodeError --- pandas/core/format.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/format.py b/pandas/core/format.py index 79077088384ae..f2999c63db38e 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -36,7 +36,7 @@ string representation of NAN to use, default 'NaN' formatters : list or dict of one-parameter functions, optional formatter functions to apply to columns' elements by position or name, - default None + default None, if the result is a string , it must be a unicode string. float_format : one-parameter function, optional formatter function to apply to columns' elements if they are floats default None From c22da50977ec0b8145b5a0b2fb84dd7c4499f7a2 Mon Sep 17 00:00:00 2001 From: y-p Date: Wed, 14 Nov 2012 03:21:56 +0200 Subject: [PATCH 6/9] TST: str(x)/unicode(x),bytes(x)/str(x) should always work if x is a df/series containing unicode --- pandas/tests/test_frame.py | 16 ++++++++++++++++ pandas/tests/test_series.py | 16 ++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index fea84f5a86e36..4eb1be94e0846 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -27,6 +27,7 @@ from pandas.util.testing import (assert_almost_equal, assert_series_equal, assert_frame_equal) +from pandas.util import py3compat import pandas.util.testing as tm import pandas.lib as lib @@ -2916,6 +2917,21 @@ def test_repr_unicode(self): result = repr(df) self.assertEqual(result.split('\n')[0].rstrip(), ex_top) + def test_unicode_string_with_unicode(self): + df = DataFrame({'A': [u"\u05d0"]}) + + if py3compat.PY3: + str(df) + else: + unicode(df) + + def test_bytestring_with_unicode(self): + df = DataFrame({'A': [u"\u05d0"]}) + if py3compat.PY3: + bytes(df) + else: + str(df) + def test_very_wide_info_repr(self): df = DataFrame(np.random.randn(10, 20), columns=[tm.rands(10) for _ in xrange(20)]) diff --git 
a/pandas/tests/test_series.py b/pandas/tests/test_series.py index db0a37978000d..96de4784fdc99 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -1083,6 +1083,22 @@ def test_repr_should_return_str (self): df=Series(data,index=index1) self.assertTrue(type(df.__repr__() == str)) # both py2 / 3 + + def test_unicode_string_with_unicode(self): + df = Series([u"\u05d0"],name=u"\u05d1") + if py3compat.PY3: + str(df) + else: + unicode(df) + + def test_bytestring_with_unicode(self): + df = Series([u"\u05d0"],name=u"\u05d1") + if py3compat.PY3: + bytes(df) + else: + str(df) + + def test_timeseries_repr_object_dtype(self): index = Index([datetime(2000, 1, 1) + timedelta(i) for i in range(1000)], dtype=object) From b7cc302969c51bde32dec0e9d34bac6b54234436 Mon Sep 17 00:00:00 2001 From: y-p Date: Wed, 14 Nov 2012 03:30:41 +0200 Subject: [PATCH 7/9] ENH: py2/py3 support for str(x)/unicode(x) and bytes(x)/str(x) for series,df,panel - If you put in proper unicode data, you're good. - If you put in utf-8 bytestrings you should still be good (it works if rendering is wrapped by pprint_thing, I may have missed a few spots). - If you put in non utf-8 bytestrings, with the encoding unknown, and expect unicode(x) or str(x) to do the right thing - you're doing it wrong. --- pandas/core/frame.py | 41 +++++++++++++++++++++--- pandas/core/panel.py | 56 +++++++++++++++++++++++++++------ pandas/core/series.py | 72 ++++++++++++++++++++++++++++++++++--------- 3 files changed, 139 insertions(+), 30 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 439c59a6a8de2..a160c994e94a9 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -612,20 +612,51 @@ def _need_info_repr_(self): else: return False - def __repr__(self): + def __str__(self): + """ + Return a string representation for a particular DataFrame + + Invoked by str(df) in both py2/py3. + Yields Bytestring in Py2, Unicode String in py3. 
+ """ + + if py3compat.PY3: + return self.__unicode__() + return self.__bytes__() + + def __bytes__(self): + """ + Return a string representation for a particular DataFrame + + Invoked by bytes(df) in py3 only. + Yields a bytestring in both py2/py3. + """ + return com.console_encode(self.__unicode__()) + + def __unicode__(self): """ Return a string representation for a particular DataFrame + + Invoked by unicode(df) in py2 only. Yields a Unicode String in both py2/py3. """ - buf = StringIO() + buf = StringIO(u"") if self._need_info_repr_(): self.info(buf=buf, verbose=self._verbose_info) else: self.to_string(buf=buf) + value = buf.getvalue() + assert type(value) == unicode - if py3compat.PY3: - return unicode(value) - return com.console_encode(value) + return value + + def __repr__(self): + """ + Return a string representation for a particular DataFrame + + Yields Bytestring in Py2, Unicode String in py3. + """ + return str(self) def _repr_html_(self): """ diff --git a/pandas/core/panel.py b/pandas/core/panel.py index 2dca8a2aef801..ae4a5d868b139 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -386,34 +386,70 @@ def __array_wrap__(self, result): #---------------------------------------------------------------------- # Magic methods - def __repr__(self): + def __str__(self): + """ + Return a string representation for a particular Panel + + Invoked by str(df) in both py2/py3. + Yields Bytestring in Py2, Unicode String in py3. + """ + + if py3compat.PY3: + return self.__unicode__() + return self.__bytes__() + + def __bytes__(self): + """ + Return a string representation for a particular Panel + + Invoked by bytes(df) in py3 only. + Yields a bytestring in both py2/py3. + """ + return com.console_encode(self.__unicode__()) + + def __unicode__(self): + """ + Return a string representation for a particular Panel + + Invoked by unicode(df) in py2 only. Yields a Unicode String in both py2/py3. 
+ """ + class_name = str(self.__class__) I, N, K = len(self.items), len(self.major_axis), len(self.minor_axis) - dims = 'Dimensions: %d (items) x %d (major) x %d (minor)' % (I, N, K) + dims = u'Dimensions: %d (items) x %d (major) x %d (minor)' % (I, N, K) if len(self.major_axis) > 0: - major = 'Major axis: %s to %s' % (self.major_axis[0], + major = u'Major axis: %s to %s' % (self.major_axis[0], self.major_axis[-1]) else: - major = 'Major axis: None' + major = u'Major axis: None' if len(self.minor_axis) > 0: - minor = 'Minor axis: %s to %s' % (self.minor_axis[0], - self.minor_axis[-1]) + minor = u'Minor axis: %s to %s' % (com.pprint_thing(self.minor_axis[0]), + com.pprint_thing(self.minor_axis[-1])) else: - minor = 'Minor axis: None' + minor = u'Minor axis: None' if len(self.items) > 0: - items = 'Items: %s to %s' % (self.items[0], self.items[-1]) + items = u'Items: %s to %s' % (com.pprint_thing(self.items[0]), + com.pprint_thing(self.items[-1])) else: - items = 'Items: None' + items = u'Items: None' - output = '%s\n%s\n%s\n%s\n%s' % (class_name, dims, items, major, minor) + output = u'%s\n%s\n%s\n%s\n%s' % (class_name, dims, items, major, minor) return output + def __repr__(self): + """ + Return a string representation for a particular Panel + + Yields Bytestring in Py2, Unicode String in py3. + """ + return str(self) + def __iter__(self): return iter(self.items) diff --git a/pandas/core/series.py b/pandas/core/series.py index 62cc9a2f42cb3..dc7588847775b 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -858,8 +858,34 @@ def reset_index(self, level=None, drop=False, name=None, inplace=False): return df.reset_index(level=level, drop=drop) - def __repr__(self): - """Clean string representation of a Series""" + + def __str__(self): + """ + Return a string representation for a particular DataFrame + + Invoked by str(df) in both py2/py3. + Yields Bytestring in Py2, Unicode String in py3. 
+ """ + + if py3compat.PY3: + return self.__unicode__() + return self.__bytes__() + + def __bytes__(self): + """ + Return a string representation for a particular DataFrame + + Invoked by bytes(df) in py3 only. + Yields a bytestring in both py2/py3. + """ + return com.console_encode(self.__unicode__()) + + def __unicode__(self): + """ + Return a string representation for a particular DataFrame + + Invoked by unicode(df) in py2 only. Yields a Unicode String in both py2/py3. + """ width, height = get_terminal_size() max_rows = (height if fmt.print_config.max_rows == 0 else fmt.print_config.max_rows) @@ -870,13 +896,24 @@ def __repr__(self): length=len(self) > 50, name=True) else: - result = '%s' % ndarray.__repr__(self) + result = com.pprint_thing(self) - if py3compat.PY3: - return unicode(result) - return com.console_encode(result) + assert type(result) == unicode + return result + + def __repr__(self): + """ + Return a string representation for a particular Series + + Yields Bytestring in Py2, Unicode String in py3. 
+ """ + return str(self) def _tidy_repr(self, max_vals=20): + """ + + Internal function, should always return unicode string + """ num = max_vals // 2 head = self[:num]._get_repr(print_header=True, length=False, name=False) @@ -886,13 +923,11 @@ def _tidy_repr(self, max_vals=20): result = head + '\n...\n' + tail result = '%s\n%s' % (result, self._repr_footer()) - if py3compat.PY3: - return unicode(result) - return com.console_encode(result) + return unicode(result) def _repr_footer(self): - namestr = "Name: %s, " % com.pprint_thing(self.name) if self.name is not None else "" - return '%sLength: %d' % (namestr, len(self)) + namestr = u"Name: %s, " % com.pprint_thing(self.name) if self.name is not None else "" + return u'%sLength: %d' % (namestr, len(self)) def to_string(self, buf=None, na_rep='NaN', float_format=None, nanRep=None, length=False, name=False): @@ -925,6 +960,9 @@ def to_string(self, buf=None, na_rep='NaN', float_format=None, the_repr = self._get_repr(float_format=float_format, na_rep=na_rep, length=length, name=name) + + assert type(the_repr) == unicode + if buf is None: return the_repr else: @@ -932,13 +970,17 @@ def to_string(self, buf=None, na_rep='NaN', float_format=None, def _get_repr(self, name=False, print_header=False, length=True, na_rep='NaN', float_format=None): + """ + + Internal function, should always return unicode string + """ + formatter = fmt.SeriesFormatter(self, name=name, header=print_header, length=length, na_rep=na_rep, float_format=float_format) - return formatter.to_string() - - def __str__(self): - return repr(self) + result = formatter.to_string() + assert type(result) == unicode + return result def __iter__(self): if np.issubdtype(self.dtype, np.datetime64): From f0deaa67699db4fb03cf2164404f946e18aaaf26 Mon Sep 17 00:00:00 2001 From: y-p Date: Thu, 22 Nov 2012 20:40:47 +0200 Subject: [PATCH 8/9] TST: str(x)/unicode(x),bytes(x)/str(x) should always work for Index,MultiIndex --- pandas/tests/test_index.py | 33 
+++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index b94840d0dfd85..4a86db9d67196 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -851,6 +851,21 @@ def test_print_unicode_columns(self): df=pd.DataFrame({u"\u05d0":[1,2,3],"\u05d1":[4,5,6],"c":[7,8,9]}) print(df.columns) # should not raise UnicodeDecodeError + def test_unicode_string_with_unicode(self): + idx = Index(range(1000)) + + if py3compat.PY3: + str(idx) + else: + unicode(idx) + + def test_bytestring_with_unicode(self): + idx = Index(range(1000)) + if py3compat.PY3: + bytes(idx) + else: + str(idx) + class TestMultiIndex(unittest.TestCase): def setUp(self): @@ -1680,6 +1695,24 @@ def test_repr_with_unicode_data(self): index=pd.DataFrame(d).set_index(["a","b"]).index self.assertFalse("\\u" in repr(index)) # we don't want unicode-escaped + def test_unicode_string_with_unicode(self): + d={"a":[u"\u05d0",2,3],"b":[4,5,6],"c":[7,8,9]} + idx=pd.DataFrame(d).set_index(["a","b"]).index + + if py3compat.PY3: + str(idx) + else: + unicode(idx) + + def test_bytestring_with_unicode(self): + d={"a":[u"\u05d0",2,3],"b":[4,5,6],"c":[7,8,9]} + idx=pd.DataFrame(d).set_index(["a","b"]).index + + if py3compat.PY3: + bytes(idx) + else: + str(idx) + def test_get_combined_index(): from pandas.core.index import _get_combined_index result = _get_combined_index([]) From 436bf36857e6a4a9f7879cbd36edab936951e01f Mon Sep 17 00:00:00 2001 From: y-p Date: Thu, 22 Nov 2012 20:37:03 +0200 Subject: [PATCH 9/9] ENH: py2/py3 support for str(x)/unicode(x) and bytes(x)/str(x) for Index,MultiIndex --- pandas/core/index.py | 94 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 76 insertions(+), 18 deletions(-) diff --git a/pandas/core/index.py b/pandas/core/index.py index 500712d42ee80..133449d79d521 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -132,12 +132,48 @@ def __array_finalize__(self, obj): def 
_shallow_copy(self): return self.view() - def __repr__(self): + def __str__(self): + """ + Return a string representation for a particular Index + + Invoked by str(df) in both py2/py3. + Yields Bytestring in Py2, Unicode String in py3. + """ + if py3compat.PY3: - prepr = com.pprint_thing(self) + return self.__unicode__() + return self.__bytes__() + + def __bytes__(self): + """ + Return a string representation for a particular Index + + Invoked by bytes(df) in py3 only. + Yields a bytestring in both py2/py3. + """ + return com.console_encode(self.__unicode__()) + + def __unicode__(self): + """ + Return a string representation for a particular Index + + Invoked by unicode(df) in py2 only. Yields a Unicode String in both py2/py3. + """ + if len(self) > 6 and len(self) > np.get_printoptions()['threshold']: + data = self[:3].tolist() + ["..."] + self[-3:].tolist() else: - prepr = com.pprint_thing_encoded(self) - return 'Index(%s, dtype=%s)' % (prepr, self.dtype) + data = self + + prepr = com.pprint_thing(data) + return '%s(%s, dtype=%s)' % (type(self).__name__, prepr, self.dtype) + + def __repr__(self): + """ + Return a string representation for a particular Index + + Yields Bytestring in Py2, Unicode String in py3. 
+ """ + return str(self) def astype(self, dtype): return Index(self.values.astype(dtype), name=self.name, @@ -207,15 +243,6 @@ def summary(self, name=None): name = type(self).__name__ return '%s: %s entries%s' % (name, len(self), index_summary) - def __str__(self): - try: - return np.array_repr(self.values) - except UnicodeError: - converted = u','.join(com.pprint_thing(x) for x in self.values) - result = u'%s([%s], dtype=''%s'')' % (type(self).__name__, converted, - str(self.values.dtype)) - return com.console_encode(result) - def _mpl_repr(self): # how to represent ourselves to matplotlib return self.values @@ -1319,7 +1346,33 @@ def _array_values(self): def dtype(self): return np.dtype('O') - def __repr__(self): + def __str__(self): + """ + Return a string representation for a particular Index + + Invoked by str(df) in both py2/py3. + Yields Bytestring in Py2, Unicode String in py3. + """ + + if py3compat.PY3: + return self.__unicode__() + return self.__bytes__() + + def __bytes__(self): + """ + Return a string representation for a particular Index + + Invoked by bytes(df) in py3 only. + Yields a bytestring in both py2/py3. + """ + return com.console_encode(self.__unicode__()) + + def __unicode__(self): + """ + Return a string representation for a particular Index + + Invoked by unicode(df) in py2 only. Yields a Unicode String in both py2/py3. + """ output = 'MultiIndex\n%s' options = np.get_printoptions() @@ -1335,10 +1388,15 @@ def __repr__(self): np.set_printoptions(threshold=options['threshold']) - if py3compat.PY3: - return output % summary - else: - return com.console_encode(output % summary) + return output % summary + + def __repr__(self): + """ + Return a string representation for a particular Index + + Yields Bytestring in Py2, Unicode String in py3. + """ + return str(self) def __len__(self): return len(self.labels[0])