Encode with ensure_ascii=True and then fix
This turns out to be way quicker. It also allows us to fix #2.
richvdh committed Mar 28, 2018
1 parent 5b224ac commit 5d24666
Showing 3 changed files with 55 additions and 4 deletions.
45 changes: 41 additions & 4 deletions canonicaljson.py
@@ -15,7 +15,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# using simplejson rather than regular json gives approximately a 25%
import re
from six import unichr

# using simplejson rather than regular json gives approximately a 100%
# performance improvement (as measured on python 2.7.12/simplejson 3.13.2)
import simplejson as json

@@ -31,21 +34,55 @@ def _default(obj):
obj.__class__.__name__)


# ideally we'd set ensure_ascii=False, but the ensure_ascii codepath is so
# much quicker (assuming c speedups are enabled) that it's actually faster to
# let it escape everything and then substitute the escapes back afterwards
# (about 3x faster overall).
#
# (in any case, simplejson's ensure_ascii=False doesn't get U+2028 and U+2029
# right, as per https://github.com/simplejson/simplejson/issues/206).
#
_canonical_encoder = json.JSONEncoder(
ensure_ascii=False,
ensure_ascii=True,
separators=(',', ':'),
sort_keys=True,
default=_default,
)


_pretty_encoder = json.JSONEncoder(
ensure_ascii=True,
indent=4,
sort_keys=True,
default=_default,
)

# here's the magic which we'll use to go from the ensure_ascii-encoded
# output, with its `\uNNNN` escapes, to the raw unicode output.
#
# This regexp matches either `\uNNNN` or `\\`. We match '\\' (and leave it
# unchanged) to make sure that the regex doesn't accidentally capture the uNNNN
# in `\\uNNNN`, which is an escaped backslash followed by 'uNNNN'.
_U_ESCAPE = re.compile(r"\\u([0-9a-f]{4})|\\\\")


def _replace(match):
g = match.group(1)
if g is None:
# escaped backslash
return '\\\\'
c = int(g, 16)
return unichr(c)


def _unescape(s):
return _U_ESCAPE.sub(_replace, s)


def _default(obj):
if type(obj) is frozendict:
return dict(obj)
raise TypeError('Object of type %s is not JSON serializable' %
obj.__class__.__name__)


def encode_canonical_json(json_object):
"""Encodes the shortest UTF-8 JSON encoding with dictionary keys
@@ -56,8 +93,8 @@ def encode_canonical_json(json_object):
Returns:
bytes encoding the JSON object"""

s = _canonical_encoder.encode(json_object)
s = _unescape(s)
return s.encode("UTF-8")


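For reference, here is a minimal standalone sketch of the technique the diff above introduces: encode with ensure_ascii=True, then use a regex to turn the \uNNNN escapes back into raw characters. It targets Python 3 (so it uses the built-in chr rather than six's unichr), the encode_canonical and _encoder names are illustrative rather than part of the library, and non-BMP characters (which ensure_ascii emits as surrogate pairs) are not recombined in this simplified version.

import re

import simplejson as json

# match either a \uNNNN escape or an escaped backslash; matching '\\' (and
# leaving it untouched) stops the pattern from eating the 'uNNNN' in '\\uNNNN'
_U_ESCAPE = re.compile(r"\\u([0-9a-f]{4})|\\\\")

_encoder = json.JSONEncoder(
    ensure_ascii=True,
    separators=(',', ':'),
    sort_keys=True,
)


def _replace(match):
    g = match.group(1)
    if g is None:
        # an escaped backslash: leave it as it is
        return '\\\\'
    # a \uNNNN escape: substitute the raw character back in
    return chr(int(g, 16))


def encode_canonical(obj):
    s = _encoder.encode(obj)        # pure-ASCII output with \uNNNN escapes
    s = _U_ESCAPE.sub(_replace, s)  # put the raw unicode back
    return s.encode("utf-8")


print(encode_canonical({u"spaces": u"\u2028 \u2029"}))  # b'{"spaces":"\xe2\x80\xa8 \xe2\x80\xa9"}'
print(encode_canonical(u"\\u1234"))                     # b'"\\\\u1234"'
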
1 change: 1 addition & 0 deletions setup.py
@@ -47,6 +47,7 @@ def exec_file(path_segments, name):
install_requires=[
"simplejson>=3.6.5",
"frozendict>=0.4",
"six",
],
long_description=read_file(("README.rst",)),
keywords="json",
13 changes: 13 additions & 0 deletions test_canonicaljson.py
@@ -35,6 +35,19 @@ def test_encode_canonical(self):
u"la merde amusée": u"💩",
}), b'{"la merde amus\xc3\xa9e":"\xF0\x9F\x92\xA9"}')

# so should U+2028 and U+2029
self.assertEquals(
encode_canonical_json({u"spaces": u"\u2028 \u2029"}),
b'{"spaces":"\xe2\x80\xa8 \xe2\x80\xa9"}',
)

# but we need to watch out for 'u1234' after backslash, which should
# get encoded to an escaped backslash, followed by u1234
self.assertEquals(
encode_canonical_json(u"\\u1234"),
b'"\\\\u1234"',
)

def test_encode_pretty_printed(self):
self.assertEquals(encode_pretty_printed_json({}), b'{}')

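The new '\u1234'-after-backslash test above guards against the pitfall described in the regex comment: without also matching escaped backslashes, the 'u1234' that follows one would itself be treated as a \uNNNN escape. A short illustration of the difference follows; it is not part of the commit, and the naive/fixed names are purely illustrative.

import re

import simplejson as json

# JSON encoding of the python string '\u1234' (a backslash, then 'u1234'):
# the backslash gets escaped, so the JSON text contains '\\u1234'
json_text = json.dumps(u"\\u1234")

# a naive pattern that only looks for \uNNNN escapes mangles it: the 'u1234'
# after the escaped backslash is mistaken for an escape, giving the invalid
# JSON text "\ሴ"
naive = re.compile(r"\\u([0-9a-f]{4})")
print(naive.sub(lambda m: chr(int(m.group(1), 16)), json_text))

# the pattern from this commit also matches (and preserves) escaped
# backslashes, so the text survives intact as "\\u1234"
fixed = re.compile(r"\\u([0-9a-f]{4})|\\\\")
print(fixed.sub(
    lambda m: '\\\\' if m.group(1) is None else chr(int(m.group(1), 16)),
    json_text,
))
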
