Skip to content

Commit

Permalink
Don't duplicate memory when getting the contents of bytes/str. Fixes m…
Browse files Browse the repository at this point in the history
  • Loading branch information
fabioz committed May 20, 2020
1 parent f752952 commit 105d82a
Show file tree
Hide file tree
Showing 2 changed files with 179 additions and 49 deletions.
67 changes: 61 additions & 6 deletions src/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_safe_repr.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import sys
from _pydevd_bundle.pydevd_constants import IS_PY2
import locale
import json
from _pydev_bundle import pydev_log

# Py3 compat - alias unicode to str, and xrange to range
try:
Expand All @@ -33,13 +33,15 @@ class SafeRepr(object):
maxstring_inner = 30
if sys.version_info >= (3, 0):
string_types = (str, bytes)
bytes = bytes
set_info = (set, '{', '}', False)
frozenset_info = (frozenset, 'frozenset({', '})', False)
int_types = (int,)
long_iter_types = (list, tuple, bytearray, range,
dict, set, frozenset)
else:
string_types = (str, unicode)
bytes = str
set_info = (set, 'set([', '])', False)
frozenset_info = (frozenset, 'frozenset([', '])', False)
int_types = (int, long) # noqa
Expand Down Expand Up @@ -279,8 +281,61 @@ def _repr_dict(self, obj, level, prefix, suffix,
yield suffix

def _repr_str(self, obj, level):
return self._repr_obj(obj, level,
self.maxstring_inner, self.maxstring_outer)
try:
if self.raw_value:
# For raw value retrieval, ignore all limits.
if isinstance(obj, bytes):
yield obj.decode('latin-1')
else:
yield obj
return

limit_inner = self.maxother_inner
limit_outer = self.maxother_outer
limit = limit_inner if level > 0 else limit_outer
if len(obj) <= limit:
# Note that we check the limit before doing the repr (so, the final string
# may actually be considerably bigger on some cases, as besides
# the additional u, b, ' chars, some chars may be escaped in repr, so
# even a single char such as \U0010ffff may end up adding more
# chars than expected).
yield self._convert_to_unicode_or_bytes_repr(repr(obj))
return

# Slightly imprecise calculations - we may end up with a string that is
# up to 6 characters longer than limit. If you need precise formatting,
# you are using the wrong class.
left_count, right_count = max(1, int(2 * limit / 3)), max(1, int(limit / 3)) # noqa

# Important: only do repr after slicing to avoid duplicating a byte array that could be
# huge.

# Note: we don't deal with high surrogates here because we're not dealing with the
# repr() of a random object.
# i.e.: A high surrogate unicode char may be splitted on Py2, but as we do a `repr`
# afterwards, that's ok.

# Also, we just show the unicode/string/bytes repr() directly to make clear what the
# input type was (so, on py2 a unicode would start with u' and on py3 a bytes would
# start with b').

part1 = obj[:left_count]
part1 = repr(part1)
part1 = part1[:part1.rindex("'")] # Remove the last '

part2 = obj[-right_count:]
part2 = repr(part2)
part2 = part2[part2.index("'") + 1:] # Remove the first ' (and possibly u or b).

yield part1
yield '...'
yield part2
except:
# This shouldn't really happen, but let's play it safe.
pydev_log.exception('Error getting string representation to show.')
for part in self._repr_obj(obj, level,
self.maxother_inner, self.maxother_outer):
yield part

def _repr_other(self, obj, level):
return self._repr_obj(obj, level,
Expand Down Expand Up @@ -327,7 +382,7 @@ def _repr_obj(self, obj, level, limit_inner, limit_outer):
# you are using the wrong class.
left_count, right_count = max(1, int(2 * limit / 3)), max(1, int(limit / 3)) # noqa

if IS_PY2 and isinstance(obj_repr, bytes):
if IS_PY2 and isinstance(obj_repr, self.bytes):
# If we can convert to unicode before slicing, that's better (but don't do
# it if it's not possible as we may be dealing with actual binary data).

Expand Down Expand Up @@ -370,9 +425,9 @@ def _repr_obj(self, obj, level, limit_inner, limit_outer):
yield obj_repr[-right_count:]

def _convert_to_unicode_or_bytes_repr(self, obj_repr):
if IS_PY2 and isinstance(obj_repr, bytes):
if IS_PY2 and isinstance(obj_repr, self.bytes):
obj_repr = self._bytes_as_unicode_if_possible(obj_repr)
if isinstance(obj_repr, bytes):
if isinstance(obj_repr, self.bytes):
# If we haven't been able to decode it this means it's some binary data
# we can't make sense of, so, we need its repr() -- otherwise json
# encoding may break later on.
Expand Down
161 changes: 118 additions & 43 deletions src/debugpy/_vendored/pydevd/tests_python/test_safe_repr.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@ class SafeReprTestBase(object):
def assert_saferepr(self, value, expected):
    """Assert that ``self.saferepr(value)`` equals ``expected`` and return it.

    The lengths are compared first so that a size mismatch is reported with
    both strings and both lengths in the error message.

    :raises AssertionError: If the computed representation differs.
    """
    safe = self.saferepr(value)
    found_len = len(safe)
    expected_len = len(expected)

    if found_len != expected_len:
        message = 'Expected:\n%s\nFound:\n%s\n Expected len: %s Found len: %s' % (
            expected, safe, expected_len, found_len)
        raise AssertionError(message)

    assert safe == expected
    return safe

Expand Down Expand Up @@ -108,29 +111,29 @@ def test_str_large(self):
value = 'A' * (SafeRepr.maxstring_outer + 10)

self.assert_shortened(value,
"'" + 'A' * 43689 + "..." + 'A' * 21844 + "'")
self.assert_shortened([value], "['AAAAAAAAAAAAAAAAAAA...AAAAAAAAA']")
"'" + 'A' * 43690 + "..." + 'A' * 21845 + "'")
self.assert_shortened([value], "['AAAAAAAAAAAAAAAAAAAA...AAAAAAAAAA']")

def test_str_largest_unchanged(self):
value = 'A' * (SafeRepr.maxstring_outer - 2)
value = 'A' * (SafeRepr.maxstring_outer)

self.assert_unchanged(value, "'" + 'A' * 65534 + "'")
self.assert_unchanged(value, "'" + 'A' * 65536 + "'")

def test_str_smallest_changed(self):
value = 'A' * (SafeRepr.maxstring_outer - 1)
value = 'A' * (SafeRepr.maxstring_outer + 1)

self.assert_shortened(value,
"'" + 'A' * 43689 + "..." + 'A' * 21844 + "'")
"'" + 'A' * 43690 + "..." + 'A' * 21845 + "'")

def test_str_list_largest_unchanged(self):
value = 'A' * (SafeRepr.maxstring_inner - 2)
value = 'A' * (SafeRepr.maxstring_inner)

self.assert_unchanged([value], "['" + 'A' * 28 + "']")
self.assert_unchanged([value], "['AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA']")

def test_str_list_smallest_changed(self):
value = 'A' * (SafeRepr.maxstring_inner - 1)
value = 'A' * (SafeRepr.maxstring_inner + 1)

self.assert_shortened([value], "['AAAAAAAAAAAAAAAAAAA...AAAAAAAAA']")
self.assert_shortened([value], "['AAAAAAAAAAAAAAAAAAAA...AAAAAAAAAA']")

@pytest.mark.skipif(sys.version_info > (3, 0), reason='Py2 specific test')
def test_unicode_small(self):
Expand All @@ -144,8 +147,8 @@ def test_unicode_large(self):
value = u'A' * (SafeRepr.maxstring_outer + 10)

self.assert_shortened(value,
"u'" + 'A' * 43688 + "..." + 'A' * 21844 + "'")
self.assert_shortened([value], "[u'AAAAAAAAAAAAAAAAAA...AAAAAAAAA']")
"u'" + 'A' * 43690 + "..." + 'A' * 21845 + "'")
self.assert_shortened([value], "[u'AAAAAAAAAAAAAAAAAAAA...AAAAAAAAAA']")

@pytest.mark.skipif(sys.version_info < (3, 0), reason='Py3 specific test')
def test_bytes_small(self):
Expand All @@ -159,8 +162,8 @@ def test_bytes_large(self):
value = b'A' * (SafeRepr.maxstring_outer + 10)

self.assert_shortened(value,
"b'" + 'A' * 43688 + "..." + 'A' * 21844 + "'")
self.assert_shortened([value], "[b'AAAAAAAAAAAAAAAAAA...AAAAAAAAA']")
"b'" + 'A' * 43690 + "..." + 'A' * 21845 + "'")
self.assert_shortened([value], "[b'AAAAAAAAAAAAAAAAAAAA...AAAAAAAAAA']")

# @pytest.mark.skip(reason='not written') # TODO: finish!
# def test_bytearray_small(self):
Expand Down Expand Up @@ -603,23 +606,25 @@ def test_zeros(self):
# ucs-4 build (so, we have to strip the high-surrogate if it's ucs-2 and the number of chars
# will be different).
{'maxother_outer': 20, 'input': u"😄😄😄😄😄😄😄😄😄😄😄😄😄😄😄😄😄😄F😄FF😄F", 'output': (u"😄😄😄😄😄😄...FF😄F", u"😄😄😄😄😄😄😄😄😄😄😄😄😄...F😄FF😄F")},
{'maxother_outer': 20, 'input': u"😄😄😄😄😄😄😄😄😄😄😄😄😄😄😄😄😄😄F😄FF😄F", 'output': (u"😄😄😄😄😄😄...FF😄F", u"😄😄😄😄😄😄😄😄😄😄😄😄😄...F😄FF😄F"), 'output_str': ("u'\\U0001f604\\U0001f604\\U0001f604\\U0001f604\\U0001f604\\U0001f604\\ud83d...\\ude04FF\\U0001f604F'", "u'\\U0001f604\\U0001f604\\U0001f604\\U0001f604\\U0001f604\\U0001f604\\U0001f604\\U0001f604\\U0001f604\\U0001f604\\U0001f604\\U0001f604\\U0001f604...F\\U0001f604FF\\U0001f604F'")},
{'maxother_outer': 20, 'input': u"😄😄😄😄😄😄😄😄😄😄😄😄😄😄😄😄😄😄FFFFFFFF", 'output': (u"😄😄😄😄😄😄...FFFFFF", u"😄😄😄😄😄😄😄😄😄😄😄😄😄...FFFFFF")},
{'maxother_outer': 20, 'input': u"🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐FFFFFFFF", 'output': (u"🌐🌐🌐🌐🌐🌐...FFFFFF", u"🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐...FFFFFF")},
{'maxother_outer': 10, 'input': u"😄😄😄😄😄😄😄😄😄FFFFFFFF", 'output': (u"😄😄😄...FFF", u"😄😄😄😄😄😄...FFF")},
{'maxother_outer': 10, 'input': u"🌐🌐🌐🌐🌐🌐🌐🌐🌐FFFFFFFF", 'output': (u"🌐🌐🌐...FFF", u"🌐🌐🌐🌐🌐🌐...FFF")},
{'maxother_outer': 20, 'input': u"😄😄😄😄😄😄😄😄😄😄😄😄😄😄😄😄😄😄FFFFFFFF", 'output': (u"😄😄😄😄😄😄...FFFFFF", u"😄😄😄😄😄😄😄😄😄😄😄😄😄...FFFFFF"), 'output_str': ("u'\\U0001f604\\U0001f604\\U0001f604\\U0001f604\\U0001f604\\U0001f604\\ud83d...FFFFFF'", "u'\\U0001f604\\U0001f604\\U0001f604\\U0001f604\\U0001f604\\U0001f604\\U0001f604\\U0001f604\\U0001f604\\U0001f604\\U0001f604\\U0001f604\\U0001f604...FFFFFF'")},
{'maxother_outer': 20, 'input': u"🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐FFFFFFFF", 'output': (u"🌐🌐🌐🌐🌐🌐...FFFFFF", u"🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐...FFFFFF"), 'output_str': ("u'\\U0001f310\\U0001f310\\U0001f310\\U0001f310\\U0001f310\\U0001f310\\ud83c...FFFFFF'", "u'\\U0001f310\\U0001f310\\U0001f310\\U0001f310\\U0001f310\\U0001f310\\U0001f310\\U0001f310\\U0001f310\\U0001f310\\U0001f310\\U0001f310\\U0001f310...FFFFFF'")},
{'maxother_outer': 10, 'input': u"😄😄😄😄😄😄😄😄😄FFFFFFFF", 'output': (u"😄😄😄...FFF", u"😄😄😄😄😄😄...FFF"), 'output_str': ("u'\\U0001f604\\U0001f604\\U0001f604...FFF'", "u'\\U0001f604\\U0001f604\\U0001f604\\U0001f604\\U0001f604\\U0001f604...FFF'")},
{'maxother_outer': 10, 'input': u"🌐🌐🌐🌐🌐🌐🌐🌐🌐FFFFFFFF", 'output': (u"🌐🌐🌐...FFF", u"🌐🌐🌐🌐🌐🌐...FFF"), 'output_str': ("u'\\U0001f310\\U0001f310\\U0001f310...FFF'", "u'\\U0001f310\\U0001f310\\U0001f310\\U0001f310\\U0001f310\\U0001f310...FFF'")},
# Regular unicode
{'maxother_outer': 20, 'input': u"ωωωωωωωωωωωωωωωωωωωωωωωFFFFFFFF", 'output': u"ωωωωωωωωωωωωω...FFFFFF"},
{'maxother_outer': 20, 'input': u"������������FFFFFFFF", 'output': u"������������F...FFFFFF"},
{'maxother_outer': 10, 'input': u"������������FFFFFFFF", 'output': u"������...FFF"},
{'maxother_outer': 20, 'input': u"ωωωωωωωωωωωωωωωωωωωωωωωFFFFFFFF", 'output': u"ωωωωωωωωωωωωω...FFFFFF", 'output_str': repr(u"ωωωωωωωωωωωωω...FFFFFF")},
{'maxother_outer': 10, 'input': u"������������FFFFFFFF", 'output': u"������...FFF", 'output_str': repr(u"������...FFF")},
# Note: when the unicode is passed directly, it doesn't exceed the limit and so is not elided.
{'maxother_outer': 20, 'input': u"������������FFFFFFFF", 'output': u"������������F...FFFFFF", 'output_str': repr(u"������������FFFFFFFF")},
# Note that we actually get the repr() in this case as we can't decode it with any of the available encodings.
{'maxother_outer': 10, 'input': b'\xed\xbd\xbf\xff\xfe\xfa\xfd' * 10, 'output': b"'\\xed\\...fd'"},
{'maxother_outer': 20, 'input': b'\xed\xbd\xbf\xff\xfe\xfa\xfd' * 10, 'output': b"'\\xed\\xbd\\xbf...a\\xfd'"},
{'maxother_outer': 10, 'input': b'\xed\xbd\xbf\xff\xfe\xfa\xfd' * 10, 'output': b"'\\xed\\...fd'", 'output_str': "'\\xed\\xbd\\xbf\\xff\\xfe\\xfa...\\xfe\\xfa\\xfd'"},
{'maxother_outer': 20, 'input': b'\xed\xbd\xbf\xff\xfe\xfa\xfd' * 10, 'output': b"'\\xed\\xbd\\xbf...a\\xfd'", 'output_str': "'\\xed\\xbd\\xbf\\xff\\xfe\\xfa\\xfd\\xed\\xbd\\xbf\\xff\\xfe\\xfa...\\xbd\\xbf\\xff\\xfe\\xfa\\xfd'"},
# Check that we use repr() even if it fits the maxother_outer limit.
{'maxother_outer': 100, 'input': b'\xed\xbd\xbf\xff\xfe\xfa\xfd', 'output': "'\\xed\\xbd\\xbf\\xff\\xfe\\xfa\\xfd'"},
{'maxother_outer': 100, 'input': b'\xed\xbd\xbf\xff\xfe\xfa\xfd', 'output': "'\\xed\\xbd\\xbf\\xff\\xfe\\xfa\\xfd'", 'output_str': repr(b'\xed\xbd\xbf\xff\xfe\xfa\xfd')},
# Note that with latin1 encoding we can actually decode the string but when encoding back to utf-8 we have garbage
# (couldn't find a good approach to know what to do here as we've actually been able to decode it as
Expand All @@ -628,11 +633,13 @@ def test_zeros(self):
'maxother_outer': 10,
'sys_stdout_encoding': 'latin1',
'input': b'\xed\xbd\xbf\xff\xfe\xfa\xfd' * 10,
'output': b'\xc3\xad\xc2\xbd\xc2\xbf\xc3\xbf\xc3\xbe\xc3\xba...\xc3\xbe\xc3\xba\xc3\xbd'
'output': b'\xc3\xad\xc2\xbd\xc2\xbf\xc3\xbf\xc3\xbe\xc3\xba...\xc3\xbe\xc3\xba\xc3\xbd',
'output_str': "\'\\xed\\xbd\\xbf\\xff\\xfe\\xfa...\\xfe\\xfa\\xfd\'",
},
])
@pytest.mark.parametrize('use_str', [True, False])
@pytest.mark.skipif(not IS_PY2, reason='Py2 specific test.')
def test_py2_bytes_slicing(params):
def test_py2_bytes_slicing(params, use_str):
safe_repr = SafeRepr()
safe_repr.locale_preferred_encoding = 'ascii'
safe_repr.sys_stdout_encoding = params.get('sys_stdout_encoding', 'ascii')
Expand All @@ -643,16 +650,26 @@ def test_py2_bytes_slicing(params):
# later on, so, the return from SafeRepr must always be utf-8 regardless of the input).
encoding = 'utf-8'

class MyObj(object):
if not use_str:

def __repr__(self):
ret = params['input']
if isinstance(ret, unicode):
ret = ret.encode(encoding)
return ret
class MyObj(object):

expected_output = params['output']
computed = safe_repr(MyObj())
def __repr__(self):
ret = params['input']
if isinstance(ret, unicode):
ret = ret.encode(encoding)
return ret

safe_repr_input = MyObj()
else:
safe_repr_input = params['input']

computed = safe_repr(safe_repr_input)

if use_str:
expected_output = params['output_str']
else:
expected_output = params['output']

expect_unicode = False
if isinstance(expected_output, unicode):
Expand All @@ -667,7 +684,10 @@ def __repr__(self):
else:
assert computed == expected_output
else:
assert repr(computed) == repr(expected_output)
if isinstance(expected_output, tuple):
assert computed in expected_output
else:
assert computed == expected_output

# Check that we can json-encode the return.
assert json.dumps(computed)
Expand All @@ -681,7 +701,8 @@ def __repr__(self):
# Because we can't return bytes, byte-related tests aren't needed (and str works as it should).
])
@pytest.mark.skipif(IS_PY2, reason='Py3 specific test')
def test_py3_str_slicing(params):
@pytest.mark.parametrize('use_str', [True, False])
def test_py3_str_slicing(params, use_str):
# Note: much simpler in Python 3 because __repr__ is required to return str
# (which is actually unicode).
safe_repr = SafeRepr()
Expand All @@ -690,20 +711,28 @@ def test_py3_str_slicing(params):

safe_repr.maxother_outer = params['maxother_outer']

class MyObj(object):
if not use_str:

def __repr__(self):
return params['input']
class MyObj(object):

def __repr__(self):
return params['input']

safe_repr_input = MyObj()
else:
safe_repr_input = params['input']
expected_output = params['output']
computed = safe_repr(MyObj())
assert repr(computed) == repr(expected_output)
computed = safe_repr(safe_repr_input)
expected = repr(expected_output)
if use_str:
expected = repr(expected)
assert repr(computed) == expected

# Check that we can json-encode the return.
assert json.dumps(computed)


def test_raw():
def test_raw_bytes():
safe_repr = SafeRepr()
safe_repr.raw_value = True
obj = b'\xed\xbd\xbf\xff\xfe\xfa\xfd'
Expand All @@ -714,3 +743,49 @@ def test_raw():
else:
assert raw_value_repr == obj.decode('latin1')


def test_raw_unicode():
    """With ``raw_value`` set, a unicode input comes back as a native ``str``."""
    obj = u'\xed\xbd\xbf\xff\xfe\xfa\xfd'

    safe_repr = SafeRepr()
    safe_repr.raw_value = True
    raw_value_repr = safe_repr(obj)

    # `str` is bytes on py2 and unicode text on py3.
    assert isinstance(raw_value_repr, str)

    expected = obj.encode('utf-8') if IS_PY2 else obj
    assert raw_value_repr == expected


def test_no_repr():
    """Check that a very large bytes-like value never has repr()/decode() called on it."""

    class MyBytes(object):
        """Bytes-like wrapper which records any call to ``decode`` or ``__repr__``."""

        def __init__(self, contents):
            self.errored = None
            self.contents = contents

        def __len__(self):
            return len(self.contents)

        def __iter__(self):
            return iter(self.contents)

        def __getitem__(self, *args):
            # Delegates to the wrapped bytes (so, slices are plain bytes).
            return self.contents.__getitem__(*args)

        def decode(self, encoding):
            self.errored = 'decode called'
            raise RuntimeError('Should not be called.')

        def __repr__(self):
            self.errored = '__repr__ called'
            raise RuntimeError('Should not be called.')

    safe_repr = SafeRepr()
    # Make SafeRepr treat MyBytes as its string/bytes type.
    safe_repr.bytes = MyBytes
    safe_repr.string_types = (MyBytes,)

    huge = MyBytes(b'f' * (safe_repr.maxstring_outer * 10))
    safe_repr(huge)
    assert not huge.errored

0 comments on commit 105d82a

Please sign in to comment.