diff --git a/canonicaljson.py b/canonicaljson.py index b6b1c41..146c1e7 100644 --- a/canonicaljson.py +++ b/canonicaljson.py @@ -15,7 +15,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -# using simplejson rather than regular json gives approximately a 25% +import re +from six import unichr + +# using simplejson rather than regular json gives approximately a 100% # performance improvement (as measured on python 2.7.12/simplejson 3.13.2) import simplejson as json @@ -31,14 +34,20 @@ def _default(obj): obj.__class__.__name__) +# ideally we'd set ensure_ascii=False, but the ensure_ascii codepath is so +# much quicker (assuming c speedups are enabled) that it's actually much +# quicker to let it do that and then substitute back (it's about 3x faster). +# +# (in any case, simplejson's ensure_ascii doesn't get U+2028 and U+2029 right, +# as per https://github.com/simplejson/simplejson/issues/206). +# _canonical_encoder = json.JSONEncoder( - ensure_ascii=False, + ensure_ascii=True, separators=(',', ':'), sort_keys=True, default=_default, ) - _pretty_encoder = json.JSONEncoder( ensure_ascii=True, indent=4, @@ -46,6 +55,27 @@ def _default(obj): default=_default, ) +# here's the magic which we'll use to go from the ensure_ascii-encoded +# output, with its `\uNNNN` escapes, to the raw unicode output. +# +# This regexp matches either `\uNNNN` or `\\`. We match '\\' (and leave it +# unchanged) to make sure that the regex doesn't accidentally capture the uNNNN +# in `\\uNNNN`, which is an escaped backslash followed by 'uNNNN'. +_U_ESCAPE = re.compile(r"\\u([0-9a-f]{4})|\\\\") + + +def _replace(match): + g = match.group(1) + if g is None: + # escaped backslash + return '\\\\' + c = int(g, 16) + return unichr(c) + + +def _unescape(s): + return _U_ESCAPE.sub(_replace, s) + def encode_canonical_json(json_object): """Encodes the shortest UTF-8 JSON encoding with dictionary keys @@ -56,8 +86,8 @@ def encode_canonical_json(json_object): Returns: bytes encoding the JSON object""" - s = _canonical_encoder.encode(json_object) + s = _unescape(s) return s.encode("UTF-8") diff --git a/setup.py b/setup.py index 3f2c812..22903ed 100755 --- a/setup.py +++ b/setup.py @@ -47,6 +47,7 @@ def exec_file(path_segments, name): install_requires=[ "simplejson>=3.6.5", "frozendict>=0.4", + "six", ], long_description=read_file(("README.rst",)), keywords="json", diff --git a/test_canonicaljson.py b/test_canonicaljson.py index 2b5891e..a13f0c4 100644 --- a/test_canonicaljson.py +++ b/test_canonicaljson.py @@ -35,6 +35,19 @@ def test_encode_canonical(self): u"la merde amusée": u"💩", }), b'{"la merde amus\xc3\xa9e":"\xF0\x9F\x92\xA9"}') + # so should U+2028 and U+2029 + self.assertEquals( + encode_canonical_json({u"spaces": u"\u2028 \u2029"}), + b'{"spaces":"\xe2\x80\xa8 \xe2\x80\xa9"}', + ) + + # but we need to watch out for 'u1234' after backslash, which should + # get encoded to an escaped backslash, followed by u1234 + self.assertEquals( + encode_canonical_json(u"\\u1234"), + b'"\\\\u1234"', + ) + def test_encode_pretty_printed(self): self.assertEquals(encode_pretty_printed_json({}), b'{}')