From 5f4357ab13fb04904d8601f828225fba72b52826 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Thu, 10 Aug 2023 20:05:44 +0200 Subject: [PATCH 1/2] BUG: prevent stall when accessing image in corrupted pdf closes #2077 --- pypdf/_page.py | 13 +++++++++++-- tests/test_page.py | 10 ++++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index 01c5b0506..868e9c594 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -477,8 +477,17 @@ def _old_images(self) -> List[File]: # deprecated return images_extracted def _get_ids_image( - self, obj: Optional[DictionaryObject] = None, ancest: Optional[List[str]] = None + self, + obj: Optional[DictionaryObject] = None, + ancest: Optional[List[str]] = None, + call_stack: Optional[List[Any]] = None, ) -> List[Union[str, List[str]]]: + if call_stack is None: + call_stack = [] + if obj in call_stack: + return [] + else: + call_stack.append(obj) if self.inline_images_keys is None: nb_inlines = len( re.findall( @@ -502,7 +511,7 @@ def _get_ids_image( if x_object[o][IA.SUBTYPE] == "/Image": lst.append(o if len(ancest) == 0 else ancest + [o]) else: # is a form with possible images inside - lst.extend(self._get_ids_image(x_object[o], ancest + [o])) + lst.extend(self._get_ids_image(x_object[o], ancest + [o], call_stack)) return lst + self.inline_images_keys def _get_image( diff --git a/tests/test_page.py b/tests/test_page.py index d1f6f3fcb..aaee6278e 100644 --- a/tests/test_page.py +++ b/tests/test_page.py @@ -1220,3 +1220,13 @@ def create_stamp_pdf() -> BytesIO: assert isinstance( writer._objects[contents.indirect_reference.idnum - 1], NullObject ) + + +@pytest.mark.enable_socket() +@pytest.mark.timeout(30) +def test_loop_in_image_keys(): + """Cf #2077""" + url = "https://github.com/py-pdf/pypdf/files/12309492/example_134.pdf" + name = "iss2077.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + reader.pages[0].images.keys() From 7230c02f79481565eb9d083b3bd043401ce47845 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Fri, 11 Aug 2023 11:09:49 +0200 Subject: [PATCH 2/2] fix test and move it --- pypdf/_page.py | 5 +++-- tests/test_images.py | 10 ++++++++++ tests/test_page.py | 10 ---------- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index 868e9c594..00800a9bf 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -484,10 +484,11 @@ def _get_ids_image( ) -> List[Union[str, List[str]]]: if call_stack is None: call_stack = [] - if obj in call_stack: + _i = getattr(obj, "indirect_reference", None) + if _i in call_stack: return [] else: - call_stack.append(obj) + call_stack.append(_i) if self.inline_images_keys is None: nb_inlines = len( re.findall( diff --git a/tests/test_images.py b/tests/test_images.py index b159af0d3..2f14c7b38 100644 --- a/tests/test_images.py +++ b/tests/test_images.py @@ -209,3 +209,13 @@ def test_image_extraction(src, page_index, image_key, expected): with open(f"page-{page_index}-{actual_image.name}", "wb") as fp: fp.write(actual_image.data) assert image_similarity(BytesIO(actual_image.data), expected) >= 0.99 + + +@pytest.mark.enable_socket() +@pytest.mark.timeout(30) +def test_loop_in_image_keys(): + """Cf #2077""" + url = "https://github.com/py-pdf/pypdf/files/12309492/example_134.pdf" + name = "iss2077.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + reader.pages[0].images.keys() diff --git a/tests/test_page.py b/tests/test_page.py index aaee6278e..d1f6f3fcb 100644 --- a/tests/test_page.py +++ b/tests/test_page.py @@ -1220,13 +1220,3 @@ def create_stamp_pdf() -> BytesIO: assert isinstance( writer._objects[contents.indirect_reference.idnum - 1], NullObject ) - - -@pytest.mark.enable_socket() -@pytest.mark.timeout(30) -def test_loop_in_image_keys(): - """Cf #2077""" - url = "https://github.com/py-pdf/pypdf/files/12309492/example_134.pdf" - name = "iss2077.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - reader.pages[0].images.keys()