Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Unicode : change df.to_string() and friends to always return unicode objects #2224

Merged
merged 9 commits into from Nov 27, 2012
67 changes: 26 additions & 41 deletions pandas/core/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
string representation of NAN to use, default 'NaN'
formatters : list or dict of one-parameter functions, optional
formatter functions to apply to columns' elements by position or name,
default None
default None; if the result is a string, it must be a unicode string.
float_format : one-parameter function, optional
formatter function to apply to columns' elements if they are floats
default None
Expand All @@ -62,7 +62,7 @@ class SeriesFormatter(object):
def __init__(self, series, buf=None, header=True, length=True,
na_rep='NaN', name=False, float_format=None):
self.series = series
self.buf = buf if buf is not None else StringIO()
self.buf = buf if buf is not None else StringIO(u"")
self.name = name
self.na_rep = na_rep
self.length = length
Expand Down Expand Up @@ -112,7 +112,7 @@ def to_string(self):
series = self.series

if len(series) == 0:
return ''
return u''

fmt_index, have_header = self._get_formatted_index()
fmt_values = self._get_formatted_values()
Expand All @@ -135,9 +135,7 @@ def to_string(self):
if footer:
result.append(footer)

if py3compat.PY3:
return unicode(u'\n'.join(result))
return com.console_encode(u'\n'.join(result))
return unicode(u'\n'.join(result))

if py3compat.PY3: # pragma: no cover
_encode_diff = lambda x: 0
Expand Down Expand Up @@ -200,10 +198,15 @@ def __init__(self, frame, buf=None, columns=None, col_space=None,
else:
self.columns = frame.columns

def _to_str_columns(self, force_unicode=False):
def _to_str_columns(self, force_unicode=None):
"""
Render a DataFrame to a list of columns (as lists of strings).
"""
import warnings
if force_unicode is not None: # pragma: no cover
warnings.warn("force_unicode is deprecated, it will have no effect",
FutureWarning)

# may include levels names also
str_index = self._get_formatted_index()
str_columns = self._get_formatted_column_labels()
Expand Down Expand Up @@ -237,32 +240,17 @@ def _to_str_columns(self, force_unicode=False):
if self.index:
strcols.insert(0, str_index)

if not py3compat.PY3:
if force_unicode:
def make_unicode(x):
if isinstance(x, unicode):
return x
return x.decode('utf-8')
strcols = map(lambda col: map(make_unicode, col), strcols)
else:
# Generally everything is plain strings, which has ascii
# encoding. Problem is when there is a char with value over
# 127. Everything then gets converted to unicode.
try:
map(lambda col: map(str, col), strcols)
except UnicodeError:
def make_unicode(x):
if isinstance(x, unicode):
return x
return x.decode('utf-8')
strcols = map(lambda col: map(make_unicode, col), strcols)

return strcols

def to_string(self, force_unicode=False):
def to_string(self, force_unicode=None):
"""
Render a DataFrame to a console-friendly tabular output.
"""
import warnings
if force_unicode is not None: # pragma: no cover
warnings.warn("force_unicode is deprecated, it will have no effect",
FutureWarning)

frame = self.frame

if len(frame.columns) == 0 or len(frame.index) == 0:
Expand All @@ -272,15 +260,20 @@ def to_string(self, force_unicode=False):
com.pprint_thing(frame.index)))
text = info_line
else:
strcols = self._to_str_columns(force_unicode)
strcols = self._to_str_columns()
text = adjoin(1, *strcols)

self.buf.writelines(text)

def to_latex(self, force_unicode=False, column_format=None):
def to_latex(self, force_unicode=None, column_format=None):
"""
Render a DataFrame to a LaTeX tabular environment output.
"""
import warnings
if force_unicode is not None: # pragma: no cover
warnings.warn("force_unicode is deprecated, it will have no effect",
FutureWarning)

frame = self.frame

if len(frame.columns) == 0 or len(frame.index) == 0:
Expand All @@ -289,7 +282,7 @@ def to_latex(self, force_unicode=False, column_format=None):
frame.columns, frame.index))
strcols = [[info_line]]
else:
strcols = self._to_str_columns(force_unicode)
strcols = self._to_str_columns()

if column_format is None:
column_format = '|l|%s|' % '|'.join('c' for _ in strcols)
Expand Down Expand Up @@ -726,18 +719,10 @@ def __init__(self, values, digits=7, formatter=None, na_rep='NaN',
self.justify = justify

def get_result(self):
if self._have_unicode():
fmt_values = self._format_strings(use_unicode=True)
else:
fmt_values = self._format_strings(use_unicode=False)

fmt_values = self._format_strings()
return _make_fixed_width(fmt_values, self.justify)

def _have_unicode(self):
mask = lib.map_infer(self.values, lambda x: isinstance(x, unicode))
return mask.any()

def _format_strings(self, use_unicode=False):
def _format_strings(self):
if self.float_format is None:
float_format = print_config.float_format
if float_format is None:
Expand Down
56 changes: 42 additions & 14 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -612,20 +612,51 @@ def _need_info_repr_(self):
else:
return False

def __repr__(self):
def __str__(self):
"""
Return a string representation for a particular DataFrame

Invoked by str(df) in both py2/py3.
Yields Bytestring in Py2, Unicode String in py3.
"""

if py3compat.PY3:
return self.__unicode__()
return self.__bytes__()

def __bytes__(self):
"""
Return a string representation for a particular DataFrame

Invoked by bytes(df) in py3 only.
Yields a bytestring in both py2/py3.
"""
return com.console_encode(self.__unicode__())

def __unicode__(self):
"""
Return a string representation for a particular DataFrame

Invoked by unicode(df) in py2 only. Yields a Unicode String in both py2/py3.
"""
buf = StringIO()
buf = StringIO(u"")
if self._need_info_repr_():
self.info(buf=buf, verbose=self._verbose_info)
else:
self.to_string(buf=buf)

value = buf.getvalue()
assert type(value) == unicode

if py3compat.PY3:
return unicode(value)
return com.console_encode(value)
return value

def __repr__(self):
"""
Return a string representation for a particular DataFrame

Yields Bytestring in Py2, Unicode String in py3.
"""
return str(self)

def _repr_html_(self):
"""
Expand Down Expand Up @@ -1379,19 +1410,21 @@ def to_excel(self, excel_writer, sheet_name='sheet1', na_rep='',
def to_string(self, buf=None, columns=None, col_space=None, colSpace=None,
header=True, index=True, na_rep='NaN', formatters=None,
float_format=None, sparsify=None, nanRep=None,
index_names=True, justify=None, force_unicode=False):
index_names=True, justify=None, force_unicode=None):
"""
Render a DataFrame to a console-friendly tabular output.
"""
import warnings
if force_unicode is not None: # pragma: no cover
warnings.warn("force_unicode is deprecated, it will have no effect",
FutureWarning)

if nanRep is not None: # pragma: no cover
import warnings
warnings.warn("nanRep is deprecated, use na_rep",
FutureWarning)
na_rep = nanRep

if colSpace is not None: # pragma: no cover
import warnings
warnings.warn("colSpace is deprecated, use col_space",
FutureWarning)
col_space = colSpace
Expand All @@ -1404,15 +1437,10 @@ def to_string(self, buf=None, columns=None, col_space=None, colSpace=None,
justify=justify,
index_names=index_names,
header=header, index=index)
formatter.to_string(force_unicode=force_unicode)
formatter.to_string()

if buf is None:
result = formatter.buf.getvalue()
if not force_unicode:
try:
result = str(result)
except ValueError:
pass
return result

@Appender(fmt.docstring_to_string, indents=1)
Expand Down
100 changes: 79 additions & 21 deletions pandas/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,12 +132,48 @@ def __array_finalize__(self, obj):
def _shallow_copy(self):
return self.view()

def __repr__(self):
def __str__(self):
"""
Return a string representation for a particular Index

Invoked by str(index) in both py2/py3.
Yields Bytestring in Py2, Unicode String in py3.
"""

if py3compat.PY3:
prepr = com.pprint_thing(self)
return self.__unicode__()
return self.__bytes__()

def __bytes__(self):
"""
Return a string representation for a particular Index

Invoked by bytes(index) in py3 only.
Yields a bytestring in both py2/py3.
"""
return com.console_encode(self.__unicode__())

def __unicode__(self):
"""
Return a string representation for a particular Index

Invoked by unicode(index) in py2 only. Yields a Unicode String in both py2/py3.
"""
if len(self) > 6 and len(self) > np.get_printoptions()['threshold']:
data = self[:3].tolist() + ["..."] + self[-3:].tolist()
else:
prepr = com.pprint_thing_encoded(self)
return 'Index(%s, dtype=%s)' % (prepr, self.dtype)
data = self

prepr = com.pprint_thing(data)
return '%s(%s, dtype=%s)' % (type(self).__name__, prepr, self.dtype)

def __repr__(self):
"""
Return a string representation for a particular Index

Yields Bytestring in Py2, Unicode String in py3.
"""
return str(self)

def astype(self, dtype):
return Index(self.values.astype(dtype), name=self.name,
Expand Down Expand Up @@ -207,15 +243,6 @@ def summary(self, name=None):
name = type(self).__name__
return '%s: %s entries%s' % (name, len(self), index_summary)

def __str__(self):
try:
return np.array_repr(self.values)
except UnicodeError:
converted = u','.join(com.pprint_thing(x) for x in self.values)
result = u'%s([%s], dtype=''%s'')' % (type(self).__name__, converted,
str(self.values.dtype))
return com.console_encode(result)

def _mpl_repr(self):
# how to represent ourselves to matplotlib
return self.values
Expand Down Expand Up @@ -394,8 +421,8 @@ def format(self, name=False):
result = []
for dt in self:
if dt.time() != zero_time or dt.tzinfo is not None:
return header + ['%s' % x for x in self]
result.append('%d-%.2d-%.2d' % (dt.year, dt.month, dt.day))
return header + [u'%s' % x for x in self]
result.append(u'%d-%.2d-%.2d' % (dt.year, dt.month, dt.day))
return header + result

values = self.values
Expand Down Expand Up @@ -1319,7 +1346,33 @@ def _array_values(self):
def dtype(self):
return np.dtype('O')

def __repr__(self):
def __str__(self):
"""
Return a string representation for a particular Index

Invoked by str(index) in both py2/py3.
Yields Bytestring in Py2, Unicode String in py3.
"""

if py3compat.PY3:
return self.__unicode__()
return self.__bytes__()

def __bytes__(self):
"""
Return a string representation for a particular Index

Invoked by bytes(index) in py3 only.
Yields a bytestring in both py2/py3.
"""
return com.console_encode(self.__unicode__())

def __unicode__(self):
"""
Return a string representation for a particular Index

Invoked by unicode(index) in py2 only. Yields a Unicode String in both py2/py3.
"""
output = 'MultiIndex\n%s'

options = np.get_printoptions()
Expand All @@ -1335,10 +1388,15 @@ def __repr__(self):

np.set_printoptions(threshold=options['threshold'])

if py3compat.PY3:
return output % summary
else:
return com.console_encode(output % summary)
return output % summary

def __repr__(self):
"""
Return a string representation for a particular Index

Yields Bytestring in Py2, Unicode String in py3.
"""
return str(self)

def __len__(self):
return len(self.labels[0])
Expand Down Expand Up @@ -1496,7 +1554,7 @@ def format(self, space=2, sparsify=None, adjoin=True, names=False,
formatted = lev.take(lab).format()
else:
# weird all NA case
formatted = [str(x) for x in com.take_1d(lev.values, lab)]
formatted = [com.pprint_thing(x) for x in com.take_1d(lev.values, lab)]
stringified_levels.append(formatted)

result_levels = []
Expand Down
Loading