From fad7d8e84d2e7fdc227a1ac2f54c88640411d7c9 Mon Sep 17 00:00:00 2001 From: Fabio Zadrozny Date: Sat, 16 May 2020 09:13:57 -0300 Subject: [PATCH] Don't duplicate memory when getting the contents of bytes/str. Fixes #242 --- .../pydevd/_pydevd_bundle/pydevd_safe_repr.py | 60 ++++++- .../pydevd/tests_python/test_safe_repr.py | 147 +++++++++++++----- 2 files changed, 159 insertions(+), 48 deletions(-) diff --git a/src/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_safe_repr.py b/src/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_safe_repr.py index 512be7f10..9070fcc38 100644 --- a/src/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_safe_repr.py +++ b/src/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_safe_repr.py @@ -6,7 +6,7 @@ import sys from _pydevd_bundle.pydevd_constants import IS_PY2 import locale -import json +from _pydev_bundle import pydev_log # Py3 compat - alias unicode to str, and xrange to range try: @@ -33,6 +33,7 @@ class SafeRepr(object): maxstring_inner = 30 if sys.version_info >= (3, 0): string_types = (str, bytes) + bytes = bytes set_info = (set, '{', '}', False) frozenset_info = (frozenset, 'frozenset({', '})', False) int_types = (int,) @@ -40,6 +41,7 @@ class SafeRepr(object): dict, set, frozenset) else: string_types = (str, unicode) + bytes = str set_info = (set, 'set([', '])', False) frozenset_info = (frozenset, 'frozenset([', '])', False) int_types = (int, long) # noqa @@ -279,8 +281,54 @@ def _repr_dict(self, obj, level, prefix, suffix, yield suffix def _repr_str(self, obj, level): - return self._repr_obj(obj, level, - self.maxstring_inner, self.maxstring_outer) + try: + if self.raw_value: + # For raw value retrieval, ignore all limits. + if isinstance(obj, bytes): + yield obj.decode('latin-1') + else: + yield self._convert_to_unicode_or_bytes_repr(repr(obj)) + return + + limit_inner = self.maxother_inner + limit_outer = self.maxother_outer + limit = limit_inner if level > 0 else limit_outer + if len(obj) <= limit: + # Note that we check the limit before doing the repr, so, this could account + # for up to 3 chars above the limit afterwards. i.e.: u'' or b'' + yield self._convert_to_unicode_or_bytes_repr(repr(obj)) + return + + # Slightly imprecise calculations - we may end up with a string that is + # up to 6 characters longer than limit. If you need precise formatting, + # you are using the wrong class. + left_count, right_count = max(1, int(2 * limit / 3)), max(1, int(limit / 3)) # noqa + + # Important: only do repr after slicing to avoid duplicating a byte array that could be + # huge. + + # Note: we don't deal with high surrogates here because we're not dealing with the + # repr() of a random object (which we have to convert to unicode to show as unicode + # when needed), we just show the unicode/string/bytes repr() directly to make clear + # what the input type was (so, on py2 a unicode would start with u' and on py3 a bytes + # would start with b'). + part1 = obj[:left_count] + part1 = repr(part1) + part1 = part1[:part1.rindex("'")] # Remove the last ' + + part2 = obj[-right_count:] + part2 = repr(part2) + part2 = part2[part2.index("'") + 1:] # Remove the first ' (and possibly u or b). + + yield part1 + yield '...' + yield part2 + except: + # This shouldn't really happen, but let's play it safe. 
+ pydev_log.exception('Error getting string representation to show.') + for part in self._repr_obj(obj, level, + self.maxother_inner, self.maxother_outer): + yield part def _repr_other(self, obj, level): return self._repr_obj(obj, level, @@ -327,7 +375,7 @@ def _repr_obj(self, obj, level, limit_inner, limit_outer): # you are using the wrong class. left_count, right_count = max(1, int(2 * limit / 3)), max(1, int(limit / 3)) # noqa - if IS_PY2 and isinstance(obj_repr, bytes): + if IS_PY2 and isinstance(obj_repr, self.bytes): # If we can convert to unicode before slicing, that's better (but don't do # it if it's not possible as we may be dealing with actual binary data). @@ -370,9 +418,9 @@ def _repr_obj(self, obj, level, limit_inner, limit_outer): yield obj_repr[-right_count:] def _convert_to_unicode_or_bytes_repr(self, obj_repr): - if IS_PY2 and isinstance(obj_repr, bytes): + if IS_PY2 and isinstance(obj_repr, self.bytes): obj_repr = self._bytes_as_unicode_if_possible(obj_repr) - if isinstance(obj_repr, bytes): + if isinstance(obj_repr, self.bytes): # If we haven't been able to decode it this means it's some binary data # we can't make sense of, so, we need its repr() -- otherwise json # encoding may break later on. diff --git a/src/debugpy/_vendored/pydevd/tests_python/test_safe_repr.py b/src/debugpy/_vendored/pydevd/tests_python/test_safe_repr.py index 918ba3306..3f25045a3 100644 --- a/src/debugpy/_vendored/pydevd/tests_python/test_safe_repr.py +++ b/src/debugpy/_vendored/pydevd/tests_python/test_safe_repr.py @@ -28,6 +28,9 @@ class SafeReprTestBase(object): def assert_saferepr(self, value, expected): safe = self.saferepr(value) + if len(safe) != len(expected): + raise AssertionError('Expected:\n%s\nFound:\n%s\n Expected len: %s Found len: %s' % ( + expected, safe, len(expected), len(safe),)) assert safe == expected return safe @@ -108,29 +111,29 @@ def test_str_large(self): value = 'A' * (SafeRepr.maxstring_outer + 10) self.assert_shortened(value, - "'" + 'A' * 43689 + "..." + 'A' * 21844 + "'") - self.assert_shortened([value], "['AAAAAAAAAAAAAAAAAAA...AAAAAAAAA']") + "'" + 'A' * 43690 + "..." + 'A' * 21845 + "'") + self.assert_shortened([value], "['AAAAAAAAAAAAAAAAAAAA...AAAAAAAAAA']") def test_str_largest_unchanged(self): - value = 'A' * (SafeRepr.maxstring_outer - 2) + value = 'A' * (SafeRepr.maxstring_outer) - self.assert_unchanged(value, "'" + 'A' * 65534 + "'") + self.assert_unchanged(value, "'" + 'A' * 65536 + "'") def test_str_smallest_changed(self): - value = 'A' * (SafeRepr.maxstring_outer - 1) + value = 'A' * (SafeRepr.maxstring_outer + 1) self.assert_shortened(value, - "'" + 'A' * 43689 + "..." + 'A' * 21844 + "'") + "'" + 'A' * 43690 + "..." + 'A' * 21845 + "'") def test_str_list_largest_unchanged(self): - value = 'A' * (SafeRepr.maxstring_inner - 2) + value = 'A' * (SafeRepr.maxstring_inner) - self.assert_unchanged([value], "['" + 'A' * 28 + "']") + self.assert_unchanged([value], "['AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA']") def test_str_list_smallest_changed(self): - value = 'A' * (SafeRepr.maxstring_inner - 1) + value = 'A' * (SafeRepr.maxstring_inner + 1) - self.assert_shortened([value], "['AAAAAAAAAAAAAAAAAAA...AAAAAAAAA']") + self.assert_shortened([value], "['AAAAAAAAAAAAAAAAAAAA...AAAAAAAAAA']") @pytest.mark.skipif(sys.version_info > (3, 0), reason='Py2 specific test') def test_unicode_small(self): @@ -144,8 +147,8 @@ def test_unicode_large(self): value = u'A' * (SafeRepr.maxstring_outer + 10) self.assert_shortened(value, - "u'" + 'A' * 43688 + "..." 
+ 'A' * 21844 + "'") - self.assert_shortened([value], "[u'AAAAAAAAAAAAAAAAAA...AAAAAAAAA']") + "u'" + 'A' * 43690 + "..." + 'A' * 21845 + "'") + self.assert_shortened([value], "[u'AAAAAAAAAAAAAAAAAAAA...AAAAAAAAAA']") @pytest.mark.skipif(sys.version_info < (3, 0), reason='Py3 specific test') def test_bytes_small(self): @@ -159,8 +162,8 @@ def test_bytes_large(self): value = b'A' * (SafeRepr.maxstring_outer + 10) self.assert_shortened(value, - "b'" + 'A' * 43688 + "..." + 'A' * 21844 + "'") - self.assert_shortened([value], "[b'AAAAAAAAAAAAAAAAAA...AAAAAAAAA']") + "b'" + 'A' * 43690 + "..." + 'A' * 21845 + "'") + self.assert_shortened([value], "[b'AAAAAAAAAAAAAAAAAAAA...AAAAAAAAAA']") # @pytest.mark.skip(reason='not written') # TODO: finish! # def test_bytearray_small(self): @@ -603,23 +606,25 @@ def test_zeros(self): # ucs-4 build (so, we have to strip the high-surrogate if it's ucs-2 and the number of chars # will be different). - {'maxother_outer': 20, 'input': u"😄😄😄😄😄😄😄😄😄😄😄😄😄😄😄😄😄😄F😄FF😄F", 'output': (u"😄😄😄😄😄😄...FF😄F", u"😄😄😄😄😄😄😄😄😄😄😄😄😄...F😄FF😄F")}, + {'maxother_outer': 20, 'input': u"😄😄😄😄😄😄😄😄😄😄😄😄😄😄😄😄😄😄F😄FF😄F", 'output': (u"😄😄😄😄😄😄...FF😄F", u"😄😄😄😄😄😄😄😄😄😄😄😄😄...F😄FF😄F"), 'output_str': ("u'\\U0001f604\\U0001f604\\U0001f604\\U0001f604\\U0001f604\\U0001f604\\ud83d...\\ude04FF\\U0001f604F'", "u'\\U0001f604\\U0001f604\\U0001f604\\U0001f604\\U0001f604\\U0001f604\\U0001f604\\U0001f604\\U0001f604\\U0001f604\\U0001f604\\U0001f604\\U0001f604...F\\U0001f604FF\\U0001f604F'")}, - {'maxother_outer': 20, 'input': u"😄😄😄😄😄😄😄😄😄😄😄😄😄😄😄😄😄😄FFFFFFFF", 'output': (u"😄😄😄😄😄😄...FFFFFF", u"😄😄😄😄😄😄😄😄😄😄😄😄😄...FFFFFF")}, - {'maxother_outer': 20, 'input': u"🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐FFFFFFFF", 'output': (u"🌐🌐🌐🌐🌐🌐...FFFFFF", u"🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐...FFFFFF")}, - {'maxother_outer': 10, 'input': u"😄😄😄😄😄😄😄😄😄FFFFFFFF", 'output': (u"😄😄😄...FFF", u"😄😄😄😄😄😄...FFF")}, - {'maxother_outer': 10, 'input': u"🌐🌐🌐🌐🌐🌐🌐🌐🌐FFFFFFFF", 'output': (u"🌐🌐🌐...FFF", u"🌐🌐🌐🌐🌐🌐...FFF")}, + {'maxother_outer': 20, 'input': u"😄😄😄😄😄😄😄😄😄😄😄😄😄😄😄😄😄😄FFFFFFFF", 'output': (u"😄😄😄😄😄😄...FFFFFF", u"😄😄😄😄😄😄😄😄😄😄😄😄😄...FFFFFF"), 'output_str': ("u'\\U0001f604\\U0001f604\\U0001f604\\U0001f604\\U0001f604\\U0001f604\\ud83d...FFFFFF'", "u'\\U0001f604\\U0001f604\\U0001f604\\U0001f604\\U0001f604\\U0001f604\\U0001f604\\U0001f604\\U0001f604\\U0001f604\\U0001f604\\U0001f604\\U0001f604...FFFFFF'")}, + {'maxother_outer': 20, 'input': u"🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐FFFFFFFF", 'output': (u"🌐🌐🌐🌐🌐🌐...FFFFFF", u"🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐...FFFFFF"), 'output_str': ("u'\\U0001f310\\U0001f310\\U0001f310\\U0001f310\\U0001f310\\U0001f310\\ud83c...FFFFFF'", "u'\\U0001f310\\U0001f310\\U0001f310\\U0001f310\\U0001f310\\U0001f310\\U0001f310\\U0001f310\\U0001f310\\U0001f310\\U0001f310\\U0001f310\\U0001f310...FFFFFF'")}, + {'maxother_outer': 10, 'input': u"😄😄😄😄😄😄😄😄😄FFFFFFFF", 'output': (u"😄😄😄...FFF", u"😄😄😄😄😄😄...FFF"), 'output_str': ("u'\\U0001f604\\U0001f604\\U0001f604...FFF'", "u'\\U0001f604\\U0001f604\\U0001f604\\U0001f604\\U0001f604\\U0001f604...FFF'")}, + {'maxother_outer': 10, 'input': u"🌐🌐🌐🌐🌐🌐🌐🌐🌐FFFFFFFF", 'output': (u"🌐🌐🌐...FFF", u"🌐🌐🌐🌐🌐🌐...FFF"), 'output_str': ("u'\\U0001f310\\U0001f310\\U0001f310...FFF'", "u'\\U0001f310\\U0001f310\\U0001f310\\U0001f310\\U0001f310\\U0001f310...FFF'")}, # Regular unicode - {'maxother_outer': 20, 'input': u"ωωωωωωωωωωωωωωωωωωωωωωωFFFFFFFF", 'output': u"ωωωωωωωωωωωωω...FFFFFF"}, - {'maxother_outer': 20, 'input': u"������������FFFFFFFF", 'output': u"������������F...FFFFFF"}, - {'maxother_outer': 10, 'input': u"������������FFFFFFFF", 'output': u"������...FFF"}, + {'maxother_outer': 20, 'input': 
u"ωωωωωωωωωωωωωωωωωωωωωωωFFFFFFFF", 'output': u"ωωωωωωωωωωωωω...FFFFFF", 'output_str': repr(u"ωωωωωωωωωωωωω...FFFFFF")}, + {'maxother_outer': 10, 'input': u"������������FFFFFFFF", 'output': u"������...FFF", 'output_str': repr(u"������...FFF")}, + + # Note: as unicode directly doesn't reach the limit and is not elided. + {'maxother_outer': 20, 'input': u"������������FFFFFFFF", 'output': u"������������F...FFFFFF", 'output_str': repr(u"������������FFFFFFFF")}, # Note that we actually get the repr() in this case as we can't decode it with any of the available encodings. - {'maxother_outer': 10, 'input': b'\xed\xbd\xbf\xff\xfe\xfa\xfd' * 10, 'output': b"'\\xed\\...fd'"}, - {'maxother_outer': 20, 'input': b'\xed\xbd\xbf\xff\xfe\xfa\xfd' * 10, 'output': b"'\\xed\\xbd\\xbf...a\\xfd'"}, + {'maxother_outer': 10, 'input': b'\xed\xbd\xbf\xff\xfe\xfa\xfd' * 10, 'output': b"'\\xed\\...fd'", 'output_str': "'\\xed\\xbd\\xbf\\xff\\xfe\\xfa...\\xfe\\xfa\\xfd'"}, + {'maxother_outer': 20, 'input': b'\xed\xbd\xbf\xff\xfe\xfa\xfd' * 10, 'output': b"'\\xed\\xbd\\xbf...a\\xfd'", 'output_str': "'\\xed\\xbd\\xbf\\xff\\xfe\\xfa\\xfd\\xed\\xbd\\xbf\\xff\\xfe\\xfa...\\xbd\\xbf\\xff\\xfe\\xfa\\xfd'"}, # Check that we use repr() even if it fits the maxother_outer limit. - {'maxother_outer': 100, 'input': b'\xed\xbd\xbf\xff\xfe\xfa\xfd', 'output': "'\\xed\\xbd\\xbf\\xff\\xfe\\xfa\\xfd'"}, + {'maxother_outer': 100, 'input': b'\xed\xbd\xbf\xff\xfe\xfa\xfd', 'output': "'\\xed\\xbd\\xbf\\xff\\xfe\\xfa\\xfd'", 'output_str': repr(b'\xed\xbd\xbf\xff\xfe\xfa\xfd')}, # Note that with latin1 encoding we can actually decode the string but when encoding back to utf-8 we have garbage # (couldn't find a good approach to know what to do here as we've actually been able to decode it as @@ -628,11 +633,13 @@ def test_zeros(self): 'maxother_outer': 10, 'sys_stdout_encoding': 'latin1', 'input': b'\xed\xbd\xbf\xff\xfe\xfa\xfd' * 10, - 'output': b'\xc3\xad\xc2\xbd\xc2\xbf\xc3\xbf\xc3\xbe\xc3\xba...\xc3\xbe\xc3\xba\xc3\xbd' + 'output': b'\xc3\xad\xc2\xbd\xc2\xbf\xc3\xbf\xc3\xbe\xc3\xba...\xc3\xbe\xc3\xba\xc3\xbd', + 'output_str': "\'\\xed\\xbd\\xbf\\xff\\xfe\\xfa...\\xfe\\xfa\\xfd\'", }, ]) +@pytest.mark.parametrize('use_str', [True, False]) @pytest.mark.skipif(not IS_PY2, reason='Py2 specific test.') -def test_py2_bytes_slicing(params): +def test_py2_bytes_slicing(params, use_str): safe_repr = SafeRepr() safe_repr.locale_preferred_encoding = 'ascii' safe_repr.sys_stdout_encoding = params.get('sys_stdout_encoding', 'ascii') @@ -643,16 +650,26 @@ def test_py2_bytes_slicing(params): # later on, so, the return from SafeRepr must always be utf-8 regardless of the input). 
encoding = 'utf-8' - class MyObj(object): + if not use_str: - def __repr__(self): - ret = params['input'] - if isinstance(ret, unicode): - ret = ret.encode(encoding) - return ret + class MyObj(object): - expected_output = params['output'] - computed = safe_repr(MyObj()) + def __repr__(self): + ret = params['input'] + if isinstance(ret, unicode): + ret = ret.encode(encoding) + return ret + + safe_repr_input = MyObj() + else: + safe_repr_input = params['input'] + + computed = safe_repr(safe_repr_input) + + if use_str: + expected_output = params['output_str'] + else: + expected_output = params['output'] expect_unicode = False if isinstance(expected_output, unicode): @@ -667,7 +684,10 @@ def __repr__(self): else: assert computed == expected_output else: - assert repr(computed) == repr(expected_output) + if isinstance(expected_output, tuple): + assert computed in expected_output + else: + assert computed == expected_output # Check that we can json-encode the return. assert json.dumps(computed) @@ -681,7 +701,8 @@ def __repr__(self): # Because we can't return bytes, byte-related tests aren't needed (and str works as it should). ]) @pytest.mark.skipif(IS_PY2, reason='Py3 specific test') -def test_py3_str_slicing(params): +@pytest.mark.parametrize('use_str', [True, False]) +def test_py3_str_slicing(params, use_str): # Note: much simpler in python because __repr__ is required to return str # (which is actually unicode). safe_repr = SafeRepr() @@ -690,14 +711,22 @@ def test_py3_str_slicing(params): safe_repr.maxother_outer = params['maxother_outer'] - class MyObj(object): + if not use_str: - def __repr__(self): - return params['input'] + class MyObj(object): + def __repr__(self): + return params['input'] + + safe_repr_input = MyObj() + else: + safe_repr_input = params['input'] expected_output = params['output'] - computed = safe_repr(MyObj()) - assert repr(computed) == repr(expected_output) + computed = safe_repr(safe_repr_input) + expected = repr(expected_output) + if use_str: + expected = repr(expected) + assert repr(computed) == expected # Check that we can json-encode the return. assert json.dumps(computed) @@ -714,3 +743,37 @@ def test_raw(): else: assert raw_value_repr == obj.decode('latin1') + +def test_no_repr(): + + class MyBytes(object): + + def __init__(self, contents): + self.contents = contents + self.errored = None + + def __iter__(self): + return iter(self.contents) + + def decode(self, encoding): + self.errored = 'decode called' + raise RuntimeError('Should not be called.') + + def __repr__(self): + self.errored = '__repr__ called' + raise RuntimeError('Should not be called.') + + def __getitem__(self, *args): + return self.contents.__getitem__(*args) + + def __len__(self): + return len(self.contents) + + safe_repr = SafeRepr() + safe_repr.string_types = (MyBytes,) + safe_repr.bytes = MyBytes + obj = b'f' * (safe_repr.maxstring_outer * 10) + my_bytes = MyBytes(obj) + raw_value_repr = safe_repr(my_bytes) + assert not my_bytes.errored +
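
Below is a minimal, standalone sketch (not part of the patch) of the slice-before-repr idea the diff introduces in SafeRepr._repr_str: repr() is applied only to small slices of the object, so a huge bytes/str is never duplicated in memory. The helper name shortened_repr and the limit values are illustrative only, and, like the patch, the sketch assumes repr() produces single-quoted output.

def shortened_repr(obj, limit=30):
    """Return a repr-like string for a str/bytes `obj`, elided to roughly `limit` chars.

    Sketch only: the real SafeRepr also handles raw-value mode, nesting levels
    and Python 2 encodings, which are omitted here.
    """
    if len(obj) <= limit:
        # Small enough: the plain repr is cheap.
        return repr(obj)

    # Same split as the patch: about 2/3 of the budget on the left, 1/3 on the right.
    left_count = max(1, int(2 * limit / 3))
    right_count = max(1, int(limit / 3))

    # Important: slice first, repr afterwards -- repr() of the full object would
    # allocate a copy at least as large as the object itself.
    part1 = repr(obj[:left_count])
    part1 = part1[:part1.rindex("'")]        # drop the closing quote

    part2 = repr(obj[-right_count:])
    part2 = part2[part2.index("'") + 1:]     # drop the b'/u' prefix and opening quote

    return part1 + '...' + part2


if __name__ == '__main__':
    data = b'A' * (100 * 1024 * 1024)        # ~100 MB of bytes
    # Only two tiny slices are copied, so peak memory stays close to the
    # original allocation instead of doubling.
    print(shortened_repr(data))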