py-pdf · MartinThoma · Jul 28, 2023 · Jul 25, 2023 · Jul 26, 2023 · Jul 28, 2023
diff --git a/pypdf/_page.py b/pypdf/_page.py
@@ -28,6 +28,7 @@
 # POSSIBILITY OF SUCH DAMAGE.
 
 import math
+import re
 import warnings
 from decimal import Decimal
 from typing import (
@@ -55,6 +56,7 @@
     mult,
 )
 from ._utils import (
+    WHITESPACES_AS_REGEXP,
     CompressedTransformationMatrix,
     File,
     ImageFile,
@@ -342,6 +344,8 @@ def __init__(
         DictionaryObject.__init__(self)
         self.pdf: Union[None, PdfReaderProtocol, PdfWriterProtocol] = pdf
         self.inline_images: Optional[Dict[str, ImageFile]] = None
+        # Union is only there to satisfy mypy; effectively Optional[List[str]]
+        self.inline_images_keys: Optional[List[Union[str, List[str]]]] = None
         if indirect_ref is not None:  # deprecated
             warnings.warn(
                 (
@@ -475,8 +479,14 @@ def _old_images(self) -> List[File]:  # deprecated
     def _get_ids_image(
         self, obj: Optional[DictionaryObject] = None, ancest: Optional[List[str]] = None
     ) -> List[Union[str, List[str]]]:
-        if self.inline_images is None:
-            self.inline_images = self._get_inline_images()
+        if self.inline_images_keys is None:
+            nb_inlines = len(
+                re.findall(
+                    WHITESPACES_AS_REGEXP + b"BI" + WHITESPACES_AS_REGEXP,
+                    self._get_contents_as_bytes() or b"",
+                )
+            )
+            self.inline_images_keys = [f"~{x}~" for x in range(nb_inlines)]
         if obj is None:
             obj = self
         if ancest is None:
@@ -485,15 +495,15 @@ def _get_ids_image(
         if PG.RESOURCES not in obj or RES.XOBJECT not in cast(
             DictionaryObject, obj[PG.RESOURCES]
         ):
-            return list(self.inline_images.keys())
+            return self.inline_images_keys
 
         x_object = obj[PG.RESOURCES][RES.XOBJECT].get_object()  # type: ignore
         for o in x_object:
             if x_object[o][IA.SUBTYPE] == "/Image":
                 lst.append(o if len(ancest) == 0 else ancest + [o])
             else:  # is a form with possible images inside
                 lst.extend(self._get_ids_image(x_object[o], ancest + [o]))
-        return lst + list(self.inline_images.keys())
+        return lst + self.inline_images_keys
 
     def _get_image(
         self,
@@ -515,6 +525,8 @@ def _get_image(
                 raise
         if isinstance(id, str):
             if id[0] == "~" and id[-1] == "~":
+                if self.inline_images is None:
+                    self.inline_images = self._get_inline_images()
                 if self.inline_images is None:  # pragma: no cover
                     raise KeyError("no inline image can be found")
                 return self.inline_images[id]
@@ -894,6 +906,23 @@ def _add_transformation_matrix(
         )
         return contents
 
+    def _get_contents_as_bytes(self) -> Optional[bytes]:
+        """
+        Return the page contents as bytes.
+
+        Returns:
+            The ``/Contents`` object as bytes, or ``None`` if it doesn't exist.
+
+        """
+        if PG.CONTENTS in self:
+            obj = self[PG.CONTENTS].get_object()
+            if isinstance(obj, list):
+                return b"".join(x.get_object().get_data() for x in obj)
+            else:
+                return cast(bytes, cast(EncodedStreamObject, obj).get_data())
+        else:
+            return None
+
     def get_contents(self) -> Optional[ContentStream]:
         """
         Access the page contents.

diff --git a/pypdf/_utils.py b/pypdf/_utils.py
@@ -382,6 +382,7 @@ def ord_(b: Union[int, str, bytes]) -> Union[int, bytes]:
 
 
 WHITESPACES = (b" ", b"\n", b"\r", b"\t", b"\x00")
+WHITESPACES_AS_REGEXP = b"[ \n\r\t\x00]"
 
 
 def paeth_predictor(left: int, up: int, up_left: int) -> int:

diff --git a/tests/test_workflows.py b/tests/test_workflows.py
@@ -1016,9 +1016,6 @@ def test_inline_images():
         _a[x] = y
     with pytest.raises(KeyError) as exc:
         reader.pages[2]._get_image(("test",))
-    reader.pages[2].inline_images = None
-    with pytest.raises(KeyError) as exc:
-        reader.pages[2]._get_image(("~1~",))
 
 
 @pytest.mark.enable_socket()
Original file line number	Diff line number	Diff line change
Expand Up		@@ -382,6 +382,7 @@ def ord_(b: Union[int, str, bytes]) -> Union[int, bytes]:


		WHITESPACES = (b" ", b"\n", b"\r", b"\t", b"\x00")
		WHITESPACES_AS_REGEXP = b"[ \n\r\t\x00]"


		def paeth_predictor(left: int, up: int, up_left: int) -> int:
Expand Down