Skip to content

Commit

Permalink
Trac #24222: py3: simplified string conversion utilities
Browse files Browse the repository at this point in the history
A possible alternative to #24186, implementing simple conversion from C
`char` arrays or `bytes` objects to `str` objects, and of `str` objects
to `bytes` objects.  Here "`str`" and "`bytes`" are to be read exactly
for either Python 2 or Python 3, so on Python 2 this means no conversion
is performed since `str is bytes` holds there.

One thing this does not do is implement any kind of conversion from
Python 2 `unicode` objects to `bytes`.  This functionality might be
worth adding, in some form, to `str_to_bytes`.  But this would add a
*new* feature on Python 2, whereas for now I'm only trying to preserve
the existing functionality on Python 2 exactly, while transparently
supporting Python 3 `str`s everywhere that Python 2 `str`s are
supported.

URL: https://trac.sagemath.org/24222
Reported by: embray
Ticket author(s): Erik Bray, Jeroen Demeyer
Reviewer(s): Jeroen Demeyer, Erik Bray
  • Loading branch information
Release Manager authored and vbraun committed Dec 25, 2017
2 parents 6103d53 + dec9f3a commit bbd5bdb
Show file tree
Hide file tree
Showing 3 changed files with 140 additions and 0 deletions.
1 change: 1 addition & 0 deletions src/doc/en/reference/cpython/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ internals.
.. toctree::
:maxdepth: 2

sage/cpython/string
sage/cpython/debug
sage/cpython/getattr
sage/cpython/cython_metaclass
Expand Down
114 changes: 114 additions & 0 deletions src/sage/cpython/string.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
# -*- encoding: utf-8 -*-
#*****************************************************************************
# Copyright (C) 2017 Erik M. Bray <erik.bray@lri.fr>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
# http://www.gnu.org/licenses/
#*****************************************************************************
from __future__ import absolute_import

from libc.string cimport strlen

from cpython.bytes cimport PyBytes_AS_STRING, PyBytes_FromString
from cpython.unicode cimport PyUnicode_Decode, PyUnicode_AsEncodedString

IF PY_MAJOR_VERSION >= 3:
    cdef extern from "Python.h":
        # Missing from cpython.unicode in Cython 0.27.3
        #
        # PyUnicode_AsUTF8 returns NULL with a Python exception set when it
        # fails (for example when given a non-str object).  Declaring it
        # "except NULL" makes Cython raise that exception at the call site
        # instead of letting callers carry on with a NULL pointer.
        char* PyUnicode_AsUTF8(object s) except NULL
        # Locale-based decode/encode; ``errors`` may be NULL.
        str PyUnicode_DecodeLocale(const char* s, const char* errors)
        bytes PyUnicode_EncodeLocale(object s, const char* errors)


cdef inline str char_to_str(const char* c, encoding=None, errors=None):
    r"""
    Convert a NUL-terminated C string to a Python ``str``.

    On Python 2 the C string is turned into a ``str`` as-is and
    ``encoding`` and ``errors`` are ignored.  On Python 3 it is decoded
    using ``encoding``, or using the locale decoder when ``encoding`` is
    ``None``.  ``errors`` names the error handler; ``None`` means
    "strict".
    """
    IF PY_MAJOR_VERSION <= 2:
        return <str>PyBytes_FromString(c)
    ELSE:
        cdef char* c_errors
        c_errors = NULL  # NULL selects the "strict" error handler
        if errors is not None:
            c_errors = PyUnicode_AsUTF8(errors)

        if encoding is not None:
            # The length is taken with strlen(), so decoding stops at the
            # first NUL byte.
            return PyUnicode_Decode(c, strlen(c),
                                    PyUnicode_AsUTF8(encoding), c_errors)

        return PyUnicode_DecodeLocale(c, c_errors)


cpdef inline bytes_to_str(b, encoding=None, errors=None):
    r"""
    Convert ``bytes`` to ``str``.

    On Python 2 this is a no-op since ``bytes is str``.  On Python 3
    this decodes the given ``bytes`` to a Python 3 unicode ``str`` using
    the specified encoding.

    INPUT:

    - ``b`` -- the ``bytes`` object to convert

    - ``encoding`` -- (default: ``None``) name of the encoding, passed
      on to ``char_to_str``

    - ``errors`` -- (default: ``None``) name of the error handler,
      passed on to ``char_to_str``

    A ``TypeError`` is raised if ``b`` is not a ``bytes`` instance.

    EXAMPLES::

        sage: import six
        sage: from sage.cpython.string import bytes_to_str
        sage: s = bytes_to_str(b'\xcf\x80')
        sage: if six.PY2:
        ....:     s == b'\xcf\x80'
        ....: else:
        ....:     s == u'π'
        True
        sage: bytes_to_str([])
        Traceback (most recent call last):
        ...
        TypeError: expected bytes, list found
    """
    # Reject anything that is not bytes before touching its buffer.
    if not isinstance(b, bytes):
        raise TypeError(f"expected bytes, {type(b).__name__} found")

    IF PY_MAJOR_VERSION <= 2:
        return b
    ELSE:
        # NOTE(review): only the raw char pointer is handed on, without a
        # length, so data after an embedded NUL byte is presumably
        # dropped -- confirm whether callers rely on this.
        return char_to_str(PyBytes_AS_STRING(b), encoding, errors)


cpdef inline str_to_bytes(s, encoding=None, errors=None):
    r"""
    Convert ``str`` to ``bytes``.

    On Python 2 this is a no-op since ``str is bytes``.  On Python 3
    this encodes the given ``str`` to a Python 3 ``bytes`` using the
    specified encoding.

    INPUT:

    - ``s`` -- the ``str`` object to convert

    - ``encoding`` -- (default: ``None``) name of the encoding; with
      ``None`` the locale encoder is used on Python 3

    - ``errors`` -- (default: ``None``) name of the error handler;
      ``None`` means "strict"

    A ``TypeError`` is raised if ``s`` is not a ``str`` instance.

    EXAMPLES::

        sage: import six
        sage: from sage.cpython.string import str_to_bytes
        sage: if six.PY2:
        ....:     b = str_to_bytes('\xcf\x80')
        ....: else:
        ....:     b = str_to_bytes(u'π')
        sage: b == b'\xcf\x80'
        True
        sage: str_to_bytes([])
        Traceback (most recent call last):
        ...
        TypeError: expected str, list found
    """
    # Checked up front so the caller gets a clear message rather than an
    # obscure error from the encoding calls further down.
    if not isinstance(s, str):
        raise TypeError(f"expected str, {type(s).__name__} found")

    IF PY_MAJOR_VERSION <= 2:
        return s
    ELSE:
        cdef char* c_errors
        c_errors = NULL  # NULL selects the "strict" error handler
        if errors is not None:
            c_errors = PyUnicode_AsUTF8(errors)

        if encoding is not None:
            return PyUnicode_AsEncodedString(s, PyUnicode_AsUTF8(encoding),
                                             c_errors)

        return PyUnicode_EncodeLocale(s, c_errors)
25 changes: 25 additions & 0 deletions src/sage/cpython/string.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# -*- encoding: utf-8 -*-
"""
String <-> bytes encoding/decoding
"""

#*****************************************************************************
# Copyright (C) 2017 Erik M. Bray <erik.bray@lri.fr>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
# http://www.gnu.org/licenses/
#*****************************************************************************

from __future__ import absolute_import

import sys


# Provide this as a shortcut to calling sys.getfilesystemencoding(), which
# after interpreter initialization is constant.
FS_ENCODING = sys.getfilesystemencoding()

# Functions in this module are implemented in the .pxd file for inlining.

0 comments on commit bbd5bdb

Please sign in to comment.