From 6eb904b6080efdfb682e8b4ba5ab7de8a900baaa Mon Sep 17 00:00:00 2001 From: Richard van der Hoff Date: Wed, 28 Mar 2018 18:31:56 +0100 Subject: [PATCH] Encode with ensure_ascii=True and then fix This turns out to be way quicker. It also allows us to fix #2. --- canonicaljson.py | 83 ++++++++++++++++++++++++++++++++++++++++--- setup.py | 1 + test_canonicaljson.py | 13 +++++++ 3 files changed, 92 insertions(+), 5 deletions(-) diff --git a/canonicaljson.py b/canonicaljson.py index 89c46a1..5f8fe5a 100644 --- a/canonicaljson.py +++ b/canonicaljson.py @@ -15,7 +15,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -# using simplejson rather than regular json gives approximately a 25% +import re +from six import unichr, PY2 + +# using simplejson rather than regular json gives approximately a 100% # performance improvement (as measured on python 2.7.12/simplejson 3.13.2) import simplejson as json @@ -33,14 +36,20 @@ def _default(obj): obj.__class__.__name__) +# ideally we'd set ensure_ascii=False, but the ensure_ascii codepath is so +# much quicker (assuming c speedups are enabled) that it's actually much +# quicker to let it do that and then substitute back (it's about 2.5x faster). +# +# (in any case, simplejson's ensure_ascii doesn't get U+2028 and U+2029 right, +# as per https://github.com/simplejson/simplejson/issues/206). +# _canonical_encoder = json.JSONEncoder( - ensure_ascii=False, + ensure_ascii=True, separators=(',', ':'), sort_keys=True, default=_default, ) - _pretty_encoder = json.JSONEncoder( ensure_ascii=True, indent=4, @@ -48,6 +57,71 @@ def _default(obj): default=_default, ) +# This regexp matches either `\uNNNN` or `\\`. We match '\\' (and leave it +# unchanged) to make sure that the regex doesn't accidentally capture the uNNNN +# in `\\uNNNN`, which is an escaped backslash followed by 'uNNNN'. +_U_ESCAPE = re.compile(r"\\u([0-9a-f]{4})|\\\\") + + +def _unascii(s): + """Unpack `\\uNNNN` escapes in 's' and encode the result as UTF-8 + + This method takes the output of the JSONEncoder and expands any \\uNNNN + escapes it finds. + + For performance, it assumes that the input is valid JSON, and performs few + sanity checks. + """ + + # make the fast path fast: if there are no matches in the string, the + # whole thing is ascii. On python 2, that means we're done. On python 3, + # we have to turn it into a bytes, which is quickest with encode('utf-8') + m = _U_ESCAPE.search(s) + if not m: + return s if PY2 else s.encode('utf-8') + + # appending to a string (or a bytes) is slooow, so we accumulate sections + # of string result in 'chunks', and join them all together later. + # (It doesn't seem to make much difference whether we accumulate + # utf8-encoded bytes, or strings which we utf-8 encode after rejoining) + # + chunks = [] + + # 'pos' tracks the index in 's' that we have processed into 'chunks' so + # far. + pos = 0 + + while m: + start = m.start() + end = m.end() + + g = m.group(1) + + if g is None: + # escaped backslash: pass it through along with anything before the + # match + chunks.append(s[pos:end]) + else: + # \uNNNN, but we have to watch out for surrogate pairs + c = int(g, 16) + + if c & 0xfc00 == 0xd800 and s[end:end + 2] == '\\u': + esc2 = s[end + 2:end + 6] + c2 = int(esc2, 16) + if c2 & 0xfc00 == 0xdc00: + c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00)) + end += 6 + chunks.append(s[pos:start]) + chunks.append(unichr(c)) + + pos = end + m = _U_ESCAPE.search(s, pos) + + # pass through anything after the last match + chunks.append(s[pos:]) + + return (''.join(chunks)).encode("utf-8") + def encode_canonical_json(json_object): """Encodes the shortest UTF-8 JSON encoding with dictionary keys @@ -58,9 +132,8 @@ def encode_canonical_json(json_object): Returns: bytes encoding the JSON object""" - s = _canonical_encoder.encode(json_object) - return s.encode("UTF-8") + return _unascii(s) def encode_pretty_printed_json(json_object): diff --git a/setup.py b/setup.py index cec8058..34aa1e8 100755 --- a/setup.py +++ b/setup.py @@ -48,6 +48,7 @@ def exec_file(path_segments, name): install_requires=[ "simplejson>=3.6.5", "frozendict>=0.4", + "six", ], zip_safe=True, long_description=read_file(("README.rst",)), diff --git a/test_canonicaljson.py b/test_canonicaljson.py index 2b5891e..a13f0c4 100644 --- a/test_canonicaljson.py +++ b/test_canonicaljson.py @@ -35,6 +35,19 @@ def test_encode_canonical(self): u"la merde amusée": u"💩", }), b'{"la merde amus\xc3\xa9e":"\xF0\x9F\x92\xA9"}') + # so should U+2028 and U+2029 + self.assertEquals( + encode_canonical_json({u"spaces": u"\u2028 \u2029"}), + b'{"spaces":"\xe2\x80\xa8 \xe2\x80\xa9"}', + ) + + # but we need to watch out for 'u1234' after backslash, which should + # get encoded to an escaped backslash, followed by u1234 + self.assertEquals( + encode_canonical_json(u"\\u1234"), + b'"\\\\u1234"', + ) + def test_encode_pretty_printed(self): self.assertEquals(encode_pretty_printed_json({}), b'{}')