Encode with ensure_ascii=True and then fix
This turns out to be way quicker. It also allows us to fix #2.
richvdh committed Mar 28, 2018
1 parent 5b224ac commit 5d24666
Showing 3 changed files with 55 additions and 4 deletions.
45 changes: 41 additions & 4 deletions canonicaljson.py
@@ -15,7 +15,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# using simplejson rather than regular json gives approximately a 25%
import re
from six import unichr

# using simplejson rather than regular json gives approximately a 100%
# performance improvement (as measured on python 2.7.12/simplejson 3.13.2)
import simplejson as json

@@ -31,21 +34,55 @@ def _default(obj):
obj.__class__.__name__)


# ideally we'd set ensure_ascii=False, but the ensure_ascii codepath is so
# much quicker (assuming c speedups are enabled) that it's actually faster to
# let it escape everything and then substitute the escapes back afterwards
# (about 3x faster overall).
#
# (in any case, simplejson's ensure_ascii=False doesn't get U+2028 and U+2029
# right, as per https://github.com/simplejson/simplejson/issues/206).
#
_canonical_encoder = json.JSONEncoder(
ensure_ascii=False,
ensure_ascii=True,
separators=(',', ':'),
sort_keys=True,
default=_default,
)


_pretty_encoder = json.JSONEncoder(
ensure_ascii=True,
indent=4,
sort_keys=True,
default=_default,
)

# here's the magic which we'll use to go from the ensure_ascii-encoded
# output, with its `\uNNNN` escapes, to the raw unicode output.
#
# This regexp matches either `\uNNNN` or `\\`. We match '\\' (and leave it
# unchanged) to make sure that the regex doesn't accidentally capture the uNNNN
# in `\\uNNNN`, which is an escaped backslash followed by 'uNNNN'.
_U_ESCAPE = re.compile(r"\\u([0-9a-f]{4})|\\\\")


def _replace(match):
g = match.group(1)
if g is None:
# escaped backslash
return '\\\\'
c = int(g, 16)
return unichr(c)


def _unescape(s):
return _U_ESCAPE.sub(_replace, s)


def _default(obj):
if type(obj) is frozendict:
return dict(obj)
raise TypeError('Object of type %s is not JSON serializable' %
obj.__class__.__name__)


def encode_canonical_json(json_object):
"""Encodes the shortest UTF-8 JSON encoding with dictionary keys
@@ -56,8 +93,8 @@ def encode_canonical_json(json_object):
Returns:
bytes encoding the JSON object"""

s = _canonical_encoder.encode(json_object)
s = _unescape(s)
return s.encode("UTF-8")


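For reference, here is a minimal standalone sketch of the technique the diff above introduces: encode with ensure_ascii=True, then use a regex to turn the \uNNNN escapes back into raw characters. It targets Python 3 (so it uses the built-in chr rather than six's unichr), the encode_canonical and _encoder names are illustrative rather than part of the library, and non-BMP characters (which ensure_ascii emits as surrogate pairs) are not recombined in this simplified version.

import re

import simplejson as json

# match either a \uNNNN escape or an escaped backslash; matching '\\' (and
# leaving it untouched) stops the pattern from eating the 'uNNNN' in '\\uNNNN'
_U_ESCAPE = re.compile(r"\\u([0-9a-f]{4})|\\\\")

_encoder = json.JSONEncoder(
    ensure_ascii=True,
    separators=(',', ':'),
    sort_keys=True,
)


def _replace(match):
    g = match.group(1)
    if g is None:
        # an escaped backslash: leave it as it is
        return '\\\\'
    # a \uNNNN escape: substitute the raw character back in
    return chr(int(g, 16))


def encode_canonical(obj):
    s = _encoder.encode(obj)        # pure-ASCII output with \uNNNN escapes
    s = _U_ESCAPE.sub(_replace, s)  # put the raw unicode back
    return s.encode("utf-8")


print(encode_canonical({u"spaces": u"\u2028 \u2029"}))  # b'{"spaces":"\xe2\x80\xa8 \xe2\x80\xa9"}'
print(encode_canonical(u"\\u1234"))                     # b'"\\\\u1234"'
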
1 change: 1 addition & 0 deletions setup.py
@@ -47,6 +47,7 @@ def exec_file(path_segments, name):
install_requires=[
"simplejson>=3.6.5",
"frozendict>=0.4",
"six",
],
long_description=read_file(("README.rst",)),
keywords="json",
13 changes: 13 additions & 0 deletions test_canonicaljson.py
@@ -35,6 +35,19 @@ def test_encode_canonical(self):
u"la merde amusée": u"💩",
}), b'{"la merde amus\xc3\xa9e":"\xF0\x9F\x92\xA9"}')

# so should U+2028 and U+2029
self.assertEquals(
encode_canonical_json({u"spaces": u"\u2028 \u2029"}),
b'{"spaces":"\xe2\x80\xa8 \xe2\x80\xa9"}',
)

# but we need to watch out for 'u1234' after backslash, which should
# get encoded to an escaped backslash, followed by u1234
self.assertEquals(
encode_canonical_json(u"\\u1234"),
b'"\\\\u1234"',
)

def test_encode_pretty_printed(self):
self.assertEquals(encode_pretty_printed_json({}), b'{}')

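The new '\u1234'-after-backslash test above guards against the pitfall described in the regex comment: without also matching escaped backslashes, the 'u1234' that follows one would itself be treated as a \uNNNN escape. A short illustration of the difference follows; it is not part of the commit, and the naive/fixed names are purely illustrative.

import re

import simplejson as json

# JSON encoding of the python string '\u1234' (a backslash, then 'u1234'):
# the backslash gets escaped, so the JSON text contains '\\u1234'
json_text = json.dumps(u"\\u1234")

# a naive pattern that only looks for \uNNNN escapes mangles it: the 'u1234'
# after the escaped backslash is mistaken for an escape, giving the invalid
# JSON text "\ሴ"
naive = re.compile(r"\\u([0-9a-f]{4})")
print(naive.sub(lambda m: chr(int(m.group(1), 16)), json_text))

# the pattern from this commit also matches (and preserves) escaped
# backslashes, so the text survives intact as "\\u1234"
fixed = re.compile(r"\\u([0-9a-f]{4})|\\\\")
print(fixed.sub(
    lambda m: '\\\\' if m.group(1) is None else chr(int(m.group(1), 16)),
    json_text,
))
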
