Add Inline Image extraction

closes py-pdf#1368
pubpub-zz · May 20, 2023 · 5fd8135 · 5fd8135
1 parent a73e24a
commit 5fd8135
Show file tree

Hide file tree

Showing 2 changed files with 143 additions and 5 deletions.
diff --git a/pypdf/_page.py b/pypdf/_page.py
@@ -68,7 +68,7 @@
 from .constants import ImageAttributes as IA
 from .constants import PageAttributes as PG
 from .constants import Ressources as RES
-from .errors import PageSizeNotDefinedError
+from .errors import PageSizeNotDefinedError, PdfReadError
 from .filters import _xobj_to_image
 from .generic import (
     ArrayObject,
@@ -341,6 +341,7 @@ def __init__(
     ) -> None:
         DictionaryObject.__init__(self)
         self.pdf: Union[None, PdfReaderProtocol, PdfWriterProtocol] = pdf
+        self.inline_images: Optional[Dict[str, FileImage]] = None
         if indirect_ref is not None:  # deprecated
             warnings.warn(
                 (
@@ -470,6 +471,8 @@ def _old_images(self) -> List[File]:  # deprecated
     def _get_ids_image(
         self, obj: Optional[DictionaryObject] = None, ancest: Optional[List[str]] = None
     ) -> List[Union[str, List[str]]]:
+        if self.inline_images is None:
+            self.inline_images = self._get_inline_images()
         if obj is None:
             obj = self
         if ancest is None:
@@ -478,7 +481,7 @@ def _get_ids_image(
         if PG.RESOURCES not in obj or RES.XOBJECT not in cast(
             DictionaryObject, obj[PG.RESOURCES]
         ):
-            return lst
+            return list(self.inline_images.keys())
 
         x_object = obj[PG.RESOURCES][RES.XOBJECT].get_object()  # type: ignore
         for o in x_object:
@@ -504,8 +507,14 @@ def _get_image(
                 DictionaryObject, cast(DictionaryObject, obj[PG.RESOURCES])[RES.XOBJECT]
             )
         except KeyError:
-            raise
+            if not (id[0] == "~" and id[-1] == "~"):
+                raise
         if isinstance(id, str):
+            if id[0] == "~" and id[-1] == "~":
+                if self.inline_images is None:
+                    raise KeyError("no inline image can be found")
+                return self.inline_images[id]
+
             extension, byte_stream, img = _xobj_to_image(
                 cast(DictionaryObject, xobjs[id])
             )
@@ -552,9 +561,109 @@ def images(self) -> List[FileImage]:
                 applying the saving parameters indicated (such as quality)
             e.g. :
             `reader.pages[0].images[0]=replace(Image.open("new_image.jpg", quality = 20)`
+
+        Inline images are also extracted: they are named ~0~, ~1~, ...
+        Note that their indirect_reference is None.
         """
         return _VirtualListImages(self._get_ids_image, self._get_image)  # type: ignore
 
+    def _get_inline_images(self) -> Dict[str, FileImage]:
+        """
+        Extract the inline images found in the content stream of the page.
+
+        Each ``INLINE IMAGE`` operation of the content stream is turned into
+        a stream object (abbreviated keys and values are expanded to their
+        full names) which is then decoded with ``_xobj_to_image``.
+
+        Returns:
+            A dictionary mapping the identifiers ``~0~``, ``~1~``, ... (in
+            order of appearance in the content stream) to the matching
+            FileImage. The dictionary is empty if the page has no content.
+
+        Raises:
+            PdfReadError: if a bare BI/ID/EI operator is met in the parsed
+                operations, or if a custom color space name can not be
+                found in the ``/ColorSpace`` resources of the page.
+        """
+        # Abbreviated names accepted as values in an inline image dictionary.
+        value_abbreviations = {
+            "/G": "/DeviceGray",
+            "/RGB": "/DeviceRGB",
+            "/CMYK": "/DeviceCMYK",
+            "/I": "/Indexed",
+            "/AHx": "/ASCIIHexDecode",
+            "/A85": "/ASCII85Decode",
+            "/LZW": "/LZWDecode",
+            "/Fl": "/FlateDecode",
+            "/RL": "/RunLengthDecode",
+            "/CCF": "/CCITTFaxDecode",
+            "/DCT": "/DCTDecode",
+        }
+        # Full names are valid too and must not be searched in the resources.
+        full_values = frozenset(value_abbreviations.values())
+        # Abbreviated names accepted as keys in an inline image dictionary.
+        key_abbreviations = {
+            "/BPC": "/BitsPerComponent",
+            "/CS": "/ColorSpace",
+            "/D": "/Decode",
+            "/DP": "/DecodeParms",
+            "/F": "/Filter",
+            "/H": "/Height",
+            "/W": "/Width",
+            "/I": "/Interpolate",
+            "/Intent": "/Intent",
+            "/IM": "/ImageMask",
+        }
+        color_space_keys = ("/CS", "/ColorSpace")
+
+        content = self.get_contents()
+        if content is None:
+            return {}
+        imgs_data = []
+        for param, ope in content.operations:
+            if ope == b"INLINE IMAGE":
+                imgs_data.append(
+                    {"settings": param["settings"], "__streamdata__": param["data"]}
+                )
+            elif ope in (b"BI", b"EI", b"ID"):
+                # these operators are expected to have been merged into an
+                # "INLINE IMAGE" operation by the content stream parser
+                raise PdfReadError(
+                    f"{ope!r} operator met whereas not expected, "
+                    "please share usecase with pypdf dev team"
+                )
+        files = {}
+        for num, ii in enumerate(imgs_data):
+            init = {
+                "__streamdata__": ii["__streamdata__"],
+                "/Length": len(ii["__streamdata__"]),
+            }
+            for k, v in ii["settings"].items():
+                try:
+                    v = NameObject(value_abbreviations[v])
+                except (TypeError, KeyError):
+                    # TypeError: unhashable value (array, ...);
+                    # KeyError: not an abbreviation.
+                    # NOTE(review): abbreviations nested in arrays (e.g. a list
+                    # of filters) are left untouched here - confirm that
+                    # _xobj_to_image copes with them.
+                    if (
+                        isinstance(v, NameObject)
+                        and v not in full_values
+                        and k in color_space_keys
+                    ):
+                        #  it is a custom name : we have to look in resources :
+                        # the only applicable case is for ColorSpace
+                        try:
+                            res = cast(DictionaryObject, self["/Resources"])[
+                                "/ColorSpace"
+                            ]
+                            v = cast(DictionaryObject, res)[v]
+                        except KeyError as exc:  # for res and v
+                            raise PdfReadError(
+                                f"Can not find resource entry {v} for {k}"
+                            ) from exc
+                # keys may be abbreviated or already be the full names
+                full_key = key_abbreviations.get(k, k)
+                if full_key not in init:  # never override the computed entries
+                    init[NameObject(full_key)] = v
+            stream_obj = EncodedStreamObject.initialize_from_dictionary(init)
+            extension, byte_stream, img = _xobj_to_image(stream_obj)
+            files[f"~{num}~"] = FileImage(
+                name=f"~{num}~{extension}",
+                data=byte_stream,
+                image=img,
+                indirect_reference=None,
+            )
+        return files
+
     @property
     def rotation(self) -> int:
         """

diff --git a/tests/test_workflows.py b/tests/test_workflows.py
@@ -11,12 +11,12 @@
 from re import findall
 
 import pytest
-from PIL import ImageChops
+from PIL import Image, ImageChops
 
 from pypdf import PdfMerger, PdfReader, PdfWriter
 from pypdf.constants import PageAttributes as PG
 from pypdf.errors import PdfReadError, PdfReadWarning
-from pypdf.generic import ContentStream, read_object
+from pypdf.generic import ContentStream, NameObject, read_object
 
 from . import get_pdf_from_url, normalize_warnings
 
@@ -965,3 +965,32 @@ def test_replace_image(tmp_path):
     with pytest.raises(TypeError) as exc:
         i.replace(reader.pages[0].images[0].image)
     assert exc.value.args[0] == "Can not update an inline image"
+
+
+@pytest.mark.enable_socket()
+def test_inline_images():
+    """
+    Check the extraction of inline images and the related error cases.
+
+    This problem was reported in #424
+    """
+    # PDF under test (downloaded, hence the enable_socket marker)
+    url = "https://arxiv.org/pdf/2201.00151.pdf"
+    name = "2201.00151.pdf"
+    reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
+    # reference picture the extracted image is compared with
+    url2 = "https://github.com/py-pdf/pypdf/assets/4083478/28e8b87c-be2c-40d9-9c86-15c7819021bf"
+    name2 = "inline4.png"
+    img_ref = Image.open(BytesIO(get_pdf_from_url(url2, name=name2)))
+    # the convert("RGB") in the assert below will have to be removed
+    # once other fixes are in
+    assert list(reader.pages[1].images[4].image.convert("RGB").getdata()) == list(
+        img_ref.getdata()
+    )
+    # an unknown inline image identifier must raise KeyError
+    with pytest.raises(KeyError):
+        reader.pages[0].images["~999~"]
+    # remove a color space resource of page index 1: fetching "~1~" is then
+    # expected to fail with PdfReadError
+    del reader.pages[1]["/Resources"]["/ColorSpace"]["/R124"]
+    reader.pages[1].inline_images = None  # to force recalculation
+    with pytest.raises(PdfReadError):
+        reader.pages[1].images["~1~"]
+    # append a bare BI operation to the content of page index 0 and store it
+    # back as /Contents; listing the image keys is not wrapped in
+    # pytest.raises, i.e. no exception is expected from this call
+    co = reader.pages[0].get_contents()
+    co.operations.append(([], b"BI"))
+    reader.pages[0][NameObject("/Contents")] = co
+    reader.pages[0].images.keys()
+
+    # replace() on the first image of page index 0 must be rejected with the
+    # inline image error message
+    with pytest.raises(TypeError) as exc:
+        reader.pages[0].images[0].replace(img_ref)
+    assert exc.value.args[0] == "Can not update an inline image"