-
-
Notifications
You must be signed in to change notification settings - Fork 453
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Trac #24222: py3: simplified string conversion utilities
A possible alternative to #24186, implementing simple conversion from C `char` arrays or `bytes` objects to `str` objects, and of `str` objects to `bytes` objects. Here "`str`" and "`bytes`" are to be read exactly for either Python 2 or Python 3, so on Python 2 this means no conversion is performed since `str is bytes == True`. One thing this does not do is implement any kind of conversion from Python 2 `unicode` objects to `bytes`. This functionality might be worth adding, in some form, to `str_to_bytes`. But this would add a ''new'' feature on Python 2, whereas for now I'm only trying to preserve the existing functionality on Python 2 exactly, while transparently supporting Python 3 `str`s everywhere that Python 2 `str`s are supported. URL: https://trac.sagemath.org/24222 Reported by: embray Ticket author(s): Erik Bray, Jeroen Demeyer Reviewer(s): Jeroen Demeyer, Erik Bray
- Loading branch information
Showing
3 changed files
with
140 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,114 @@ | ||
# -*- encoding: utf-8 -*- | ||
#***************************************************************************** | ||
# Copyright (C) 2017 Erik M. Bray <erik.bray@lri.fr> | ||
# | ||
# This program is free software: you can redistribute it and/or modify | ||
# it under the terms of the GNU General Public License as published by | ||
# the Free Software Foundation, either version 2 of the License, or | ||
# (at your option) any later version. | ||
# http://www.gnu.org/licenses/ | ||
#***************************************************************************** | ||
from __future__ import absolute_import | ||
|
||
from libc.string cimport strlen | ||
|
||
from cpython.bytes cimport PyBytes_AS_STRING, PyBytes_FromString | ||
from cpython.unicode cimport PyUnicode_Decode, PyUnicode_AsEncodedString | ||
|
||
IF PY_MAJOR_VERSION >= 3:
    cdef extern from "Python.h":
        # Missing from cpython.unicode in Cython 0.27.3
        #
        # PyUnicode_AsUTF8 returns NULL with a Python exception set when its
        # argument cannot be converted (for example when it is not a str).
        # Declaring ``except NULL`` makes Cython propagate that exception
        # instead of letting callers carry on with a NULL pointer, which they
        # would otherwise mistake for "use the default error handler".
        char* PyUnicode_AsUTF8(object s) except NULL
        # These return a new reference (or NULL on error); because they are
        # declared with Python object return types, Cython already checks
        # the result and raises on failure.
        str PyUnicode_DecodeLocale(const char* s, const char* errors)
        bytes PyUnicode_EncodeLocale(object s, const char* errors)
|
||
|
||
cdef inline str char_to_str(const char* c, encoding=None, errors=None):
    r"""
    Convert a NUL-terminated C string to a ``str``.

    When compiled for Python 2 the bytes of ``c`` are copied unchanged
    into a ``str`` and ``encoding`` / ``errors`` are ignored.

    When compiled for Python 3 the C string is decoded:

    - ``encoding`` -- name of the codec to use; if ``None`` the string is
      decoded with ``PyUnicode_DecodeLocale``.

    - ``errors`` -- name of the error handler; if ``None`` a ``NULL``
      handler is passed down, which implies ``"strict"``.

    The length of the input is determined with ``strlen``, so only the
    bytes before the first NUL are converted.
    """
    IF PY_MAJOR_VERSION >= 3:
        cdef char* error_mode
        error_mode = NULL  # implies "strict"
        if errors is not None:
            error_mode = PyUnicode_AsUTF8(errors)

        if encoding is not None:
            return PyUnicode_Decode(c, strlen(c),
                                    PyUnicode_AsUTF8(encoding), error_mode)

        # No explicit codec requested: decode via the locale
        return PyUnicode_DecodeLocale(c, error_mode)
    ELSE:
        return <str>PyBytes_FromString(c)
|
||
|
||
cpdef inline bytes_to_str(b, encoding=None, errors=None):
    r"""
    Convert ``bytes`` to ``str``.

    When compiled for Python 2 the input is returned unchanged, since
    ``bytes is str`` there.  When compiled for Python 3 the input is
    decoded to a unicode ``str`` by :func:`char_to_str`, using the given
    ``encoding`` and ``errors`` handler.

    .. NOTE::

        On Python 3 the data is handed over as a C string, so decoding
        stops at the first NUL byte contained in ``b``.

    A ``TypeError`` is raised if ``b`` is not an instance of ``bytes``.

    EXAMPLES::

        sage: import six
        sage: from sage.cpython.string import bytes_to_str
        sage: s = bytes_to_str(b'\xcf\x80')
        sage: if six.PY2:
        ....:     s == b'\xcf\x80'
        ....: else:
        ....:     s == u'π'
        True
        sage: bytes_to_str([])
        Traceback (most recent call last):
        ...
        TypeError: expected bytes, list found
    """
    # Reject anything that is not bytes up front with a readable message
    if not isinstance(b, bytes):
        found = type(b).__name__
        raise TypeError(f"expected bytes, {found} found")

    IF PY_MAJOR_VERSION >= 3:
        return char_to_str(PyBytes_AS_STRING(b),
                           encoding=encoding, errors=errors)
    ELSE:
        return b
|
||
|
||
cpdef inline str_to_bytes(s, encoding=None, errors=None):
    r"""
    Convert ``str`` to ``bytes``.

    When compiled for Python 2 the input is returned unchanged, since
    ``str is bytes`` there.  When compiled for Python 3 the input is
    encoded to ``bytes``:

    - ``encoding`` -- name of the codec to use; if ``None`` the string is
      encoded with ``PyUnicode_EncodeLocale``.

    - ``errors`` -- name of the error handler; if ``None`` a ``NULL``
      handler is passed down, which implies ``"strict"``.

    A ``TypeError`` is raised if ``s`` is not an instance of ``str``.

    EXAMPLES::

        sage: import six
        sage: from sage.cpython.string import str_to_bytes
        sage: if six.PY2:
        ....:     b = str_to_bytes('\xcf\x80')
        ....: else:
        ....:     b = str_to_bytes(u'π')
        sage: b == b'\xcf\x80'
        True
        sage: str_to_bytes([])
        Traceback (most recent call last):
        ...
        TypeError: expected str, list found
    """
    # Checked explicitly so that the caller gets a clear message rather
    # than an obscure error from the C-level calls further down
    if not isinstance(s, str):
        found = type(s).__name__
        raise TypeError(f"expected str, {found} found")

    IF PY_MAJOR_VERSION >= 3:
        cdef char* error_mode
        error_mode = NULL  # implies "strict"
        if errors is not None:
            error_mode = PyUnicode_AsUTF8(errors)

        if encoding is not None:
            return PyUnicode_AsEncodedString(s, PyUnicode_AsUTF8(encoding),
                                             error_mode)

        # No explicit codec requested: encode via the locale
        return PyUnicode_EncodeLocale(s, error_mode)
    ELSE:
        return s
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
# -*- encoding: utf-8 -*- | ||
""" | ||
String <-> bytes encoding/decoding | ||
""" | ||
|
||
#***************************************************************************** | ||
# Copyright (C) 2017 Erik M. Bray <erik.bray@lri.fr> | ||
# | ||
# This program is free software: you can redistribute it and/or modify | ||
# it under the terms of the GNU General Public License as published by | ||
# the Free Software Foundation, either version 2 of the License, or | ||
# (at your option) any later version. | ||
# http://www.gnu.org/licenses/ | ||
#***************************************************************************** | ||
|
||
from __future__ import absolute_import | ||
|
||
import sys | ||
|
||
|
||
# Provide this as a shortcut to calling sys.getfilesystemencoding(), which | ||
# after interpreter initialization is constant. | ||
FS_ENCODING = sys.getfilesystemencoding() | ||
|
||
# Functions in this module are implemented in the .pxd file for inlining. |