From 30dacb96048b27b7a5344101e975e3662bed3a8c Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Tue, 25 Jul 2023 20:28:43 +0200 Subject: [PATCH 1/3] ENH : accelerate image list keys generation closes #1987 --- pypdf/_page.py | 36 ++++++++++++++++++++++++++++++++---- pypdf/_utils.py | 1 + tests/test_workflows.py | 3 --- 3 files changed, 33 insertions(+), 7 deletions(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index 081eb8815..07f537285 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -28,6 +28,7 @@ # POSSIBILITY OF SUCH DAMAGE. import math +import re import warnings from decimal import Decimal from typing import ( @@ -55,6 +56,7 @@ mult, ) from ._utils import ( + WHITESPACES_AS_REGEXP, CompressedTransformationMatrix, File, ImageFile, @@ -342,6 +344,7 @@ def __init__( DictionaryObject.__init__(self) self.pdf: Union[None, PdfReaderProtocol, PdfWriterProtocol] = pdf self.inline_images: Optional[Dict[str, ImageFile]] = None + self.inline_images_keys: Optional[List[str]] = None if indirect_ref is not None: # deprecated warnings.warn( ( @@ -475,8 +478,14 @@ def _old_images(self) -> List[File]: # deprecated def _get_ids_image( self, obj: Optional[DictionaryObject] = None, ancest: Optional[List[str]] = None ) -> List[Union[str, List[str]]]: - if self.inline_images is None: - self.inline_images = self._get_inline_images() + if self.inline_images_keys is None: + nb_inlines = len( + re.findall( + WHITESPACES_AS_REGEXP + b"BI" + WHITESPACES_AS_REGEXP, + self._get_contents_as_bytes() or b"", + ) + ) + self.inline_images_keys = [f"~{x}~" for x in range(nb_inlines)] if obj is None: obj = self if ancest is None: @@ -485,7 +494,7 @@ def _get_ids_image( if PG.RESOURCES not in obj or RES.XOBJECT not in cast( DictionaryObject, obj[PG.RESOURCES] ): - return list(self.inline_images.keys()) + return self.inline_images_keys x_object = obj[PG.RESOURCES][RES.XOBJECT].get_object() # type: ignore for o in x_object: @@ -493,7 +502,7 @@ def _get_ids_image( lst.append(o if len(ancest) == 0 else ancest + [o]) else: # is a form with possible images inside lst.extend(self._get_ids_image(x_object[o], ancest + [o])) - return lst + list(self.inline_images.keys()) + return lst + self.inline_images_keys def _get_image( self, @@ -515,6 +524,8 @@ def _get_image( raise if isinstance(id, str): if id[0] == "~" and id[-1] == "~": + if self.inline_images is None: + self.inline_images = self._get_inline_images() if self.inline_images is None: # pragma: no cover raise KeyError("no inline image can be found") return self.inline_images[id] @@ -894,6 +905,23 @@ def _add_transformation_matrix( ) return contents + def _get_contents_as_bytes(self) -> Optional[bytes]: + """ + Return the page contents as bytes . + + Returns: + The ``/Contents`` object as bytes, or ``None`` if it doesn't exist. + + """ + if PG.CONTENTS in self: + obj = self[PG.CONTENTS].get_object() + if isinstance(obj, list): + return b"".join(x.get_object().get_data() for x in obj) + else: + return cast(bytes, cast(EncodedStreamObject, obj).get_data()) + else: + return None + def get_contents(self) -> Optional[ContentStream]: """ Access the page contents. diff --git a/pypdf/_utils.py b/pypdf/_utils.py index da121ac55..06845d6ac 100644 --- a/pypdf/_utils.py +++ b/pypdf/_utils.py @@ -382,6 +382,7 @@ def ord_(b: Union[int, str, bytes]) -> Union[int, bytes]: WHITESPACES = (b" ", b"\n", b"\r", b"\t", b"\x00") +WHITESPACES_AS_REGEXP = b"[ \n\r\t\x00]" def paeth_predictor(left: int, up: int, up_left: int) -> int: diff --git a/tests/test_workflows.py b/tests/test_workflows.py index c24399f83..1c06c02df 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -1016,9 +1016,6 @@ def test_inline_images(): _a[x] = y with pytest.raises(KeyError) as exc: reader.pages[2]._get_image(("test",)) - reader.pages[2].inline_images = None - with pytest.raises(KeyError) as exc: - reader.pages[2]._get_image(("~1~",)) @pytest.mark.enable_socket() From 88c8bb2d9d6c7a4f5014aa2e3e5d28feeea8152d Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Wed, 26 Jul 2023 20:40:12 +0200 Subject: [PATCH 2/3] mypy --- pypdf/_page.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index 07f537285..384726347 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -344,7 +344,8 @@ def __init__( DictionaryObject.__init__(self) self.pdf: Union[None, PdfReaderProtocol, PdfWriterProtocol] = pdf self.inline_images: Optional[Dict[str, ImageFile]] = None - self.inline_images_keys: Optional[List[str]] = None + # below Union for mypy but actually Optional[List[str]] + self.inline_images_keys: Optional[List[Union[str, List[str]]]] = None if indirect_ref is not None: # deprecated warnings.warn( ( From c756267a22c69e750c34ca2b00544ca8a1ff24b5 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Fri, 28 Jul 2023 17:50:03 +0200 Subject: [PATCH 3/3] Update pypdf/_page.py --- pypdf/_page.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index 384726347..2d31afafe 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -908,7 +908,7 @@ def _add_transformation_matrix( def _get_contents_as_bytes(self) -> Optional[bytes]: """ - Return the page contents as bytes . + Return the page contents as bytes. Returns: The ``/Contents`` object as bytes, or ``None`` if it doesn't exist.