Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Encode with ensure_ascii=True and then fix #9

Merged
merged 1 commit into from
Mar 29, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 78 additions & 5 deletions canonicaljson.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# using simplejson rather than regular json gives approximately a 25%
import re
from six import unichr, PY2

# using simplejson rather than regular json gives approximately a 100%
# performance improvement (as measured on python 2.7.12/simplejson 3.13.2)
import simplejson as json

Expand All @@ -33,21 +36,92 @@ def _default(obj):
obj.__class__.__name__)


# ideally we'd set ensure_ascii=False, but the ensure_ascii codepath is so
# much quicker (assuming c speedups are enabled) that it's actually much
# quicker to let it do that and then substitute back (it's about 2.5x faster).
#
# (in any case, simplejson's ensure_ascii doesn't get U+2028 and U+2029 right,
# as per https://github.com/simplejson/simplejson/issues/206).
#
_canonical_encoder = json.JSONEncoder(
    # ensure_ascii=True emits \uNNNN escapes; _unascii expands them back
    # afterwards (see the comment above for why this round-trip is a win).
    ensure_ascii=True,
    # no whitespace after separators: shortest possible encoding
    separators=(',', ':'),
    # lexicographically sorted keys make the output canonical
    sort_keys=True,
    default=_default,
)


# Encoder for the human-readable form: pure-ASCII output, indented for
# readability, with keys sorted so the output is deterministic.
_pretty_encoder = json.JSONEncoder(
    default=_default,
    sort_keys=True,
    indent=4,
    ensure_ascii=True,
)

# This regexp matches either `\uNNNN` or `\\`.  We match (and pass through)
# `\\` so that the uNNNN in `\\uNNNN` -- an escaped backslash followed by a
# literal 'uNNNN' -- is never mistaken for an escape sequence.
_U_ESCAPE = re.compile(r"\\u([0-9a-f]{4})|\\\\")


def _unascii(s):
    """Unpack `\\uNNNN` escapes in 's' and encode the result as UTF-8

    This method takes the output of the JSONEncoder and expands any \\uNNNN
    escapes it finds.

    For performance, it assumes that the input is valid JSON, and performs few
    sanity checks.
    """

    # Fast path: no match means the whole string is ascii.  On python 2 we
    # can return it as-is; on python 3 we need a bytes, and encode('utf-8')
    # is the quickest way to get one.
    match = _U_ESCAPE.search(s)
    if match is None:
        if PY2:
            return s
        return s.encode('utf-8')

    # Repeatedly appending to a string (or bytes) is slow, so we gather the
    # pieces of the result in a list and join them once at the end.
    # (Accumulating utf8-encoded bytes instead of text, and encoding after
    # the join, doesn't measurably change the speed.)
    pieces = []

    # index into 's' of the first character not yet copied into 'pieces'
    copied = 0

    while match is not None:
        start, end = match.span()

        escape = match.group(1)
        if escape is None:
            # an escaped backslash: pass it through unchanged, along with
            # everything between the previous match and this one
            pieces.append(s[copied:end])
        else:
            # \uNNNN -- but beware of surrogate pairs
            codepoint = int(escape, 16)

            # a high surrogate immediately followed by another \u escape
            # may be the first half of a surrogate pair
            if codepoint & 0xfc00 == 0xd800 and s[end:end + 2] == '\\u':
                low = int(s[end + 2:end + 6], 16)
                if low & 0xfc00 == 0xdc00:
                    codepoint = 0x10000 + (
                        ((codepoint - 0xd800) << 10) | (low - 0xdc00)
                    )
                    end += 6

            pieces.append(s[copied:start])
            pieces.append(unichr(codepoint))

        copied = end
        match = _U_ESCAPE.search(s, copied)

    # pass through anything after the last match
    pieces.append(s[copied:])

    return ''.join(pieces).encode("utf-8")


def encode_canonical_json(json_object):
    """Encodes the shortest UTF-8 JSON encoding with dictionary keys
    lexicographically sorted by unicode code point.

    Args:
        json_object (dict): The JSON object to encode.

    Returns:
        bytes encoding the JSON object"""
    s = _canonical_encoder.encode(json_object)
    # the encoder was configured with ensure_ascii=True, so 's' is ascii
    # text full of \uNNNN escapes; expand them and utf-8 encode the result.
    return _unascii(s)


def encode_pretty_printed_json(json_object):
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ def exec_file(path_segments, name):
install_requires=[
"simplejson>=3.6.5",
"frozendict>=0.4",
"six",
],
zip_safe=True,
long_description=read_file(("README.rst",)),
Expand Down
13 changes: 13 additions & 0 deletions test_canonicaljson.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,19 @@ def test_encode_canonical(self):
u"la merde amusée": u"💩",
}), b'{"la merde amus\xc3\xa9e":"\xF0\x9F\x92\xA9"}')

# so should U+2028 and U+2029
self.assertEquals(
encode_canonical_json({u"spaces": u"\u2028 \u2029"}),
b'{"spaces":"\xe2\x80\xa8 \xe2\x80\xa9"}',
)

# but we need to watch out for 'u1234' after backslash, which should
# get encoded to an escaped backslash, followed by u1234
self.assertEquals(
encode_canonical_json(u"\\u1234"),
b'"\\\\u1234"',
)

def test_encode_pretty_printed(self):
self.assertEquals(encode_pretty_printed_json({}), b'{}')

Expand Down