diff --git a/pypdf/_page.py b/pypdf/_page.py index ac50355c2..1eb40504f 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -68,7 +68,7 @@ from .constants import ImageAttributes as IA from .constants import PageAttributes as PG from .constants import Ressources as RES -from .errors import PageSizeNotDefinedError +from .errors import PageSizeNotDefinedError, PdfReadError from .filters import _xobj_to_image from .generic import ( ArrayObject, @@ -341,6 +341,7 @@ def __init__( ) -> None: DictionaryObject.__init__(self) self.pdf: Union[None, PdfReaderProtocol, PdfWriterProtocol] = pdf + self.inline_images: Optional[Dict[str, FileImage]] = None if indirect_ref is not None: # deprecated warnings.warn( ( @@ -470,6 +471,8 @@ def _old_images(self) -> List[File]: # deprecated def _get_ids_image( self, obj: Optional[DictionaryObject] = None, ancest: Optional[List[str]] = None ) -> List[Union[str, List[str]]]: + if self.inline_images is None: + self.inline_images = self._get_inline_images() if obj is None: obj = self if ancest is None: @@ -478,7 +481,7 @@ def _get_ids_image( if PG.RESOURCES not in obj or RES.XOBJECT not in cast( DictionaryObject, obj[PG.RESOURCES] ): - return lst + return list(self.inline_images.keys()) x_object = obj[PG.RESOURCES][RES.XOBJECT].get_object() # type: ignore for o in x_object: @@ -504,8 +507,14 @@ def _get_image( DictionaryObject, cast(DictionaryObject, obj[PG.RESOURCES])[RES.XOBJECT] ) except KeyError: - raise + if not (id[0] == "~" and id[-1] == "~"): + raise if isinstance(id, str): + if id[0] == "~" and id[-1] == "~": + if self.inline_images is None: + raise KeyError("no inline image can be found") + return self.inline_images[id] + extension, byte_stream, img = _xobj_to_image( cast(DictionaryObject, xobjs[id]) ) @@ -552,9 +561,109 @@ def images(self) -> List[FileImage]: applying the saving parameters indicated (such as quality) e.g. 
def _get_inline_images(self) -> Dict[str, FileImage]:
    """
    Extract the inline images found in the content stream of the page.

    Every ``b"INLINE IMAGE"`` operation of the content stream is converted
    into an image stream: the abbreviated keys and values of its settings
    are replaced by their full names (full names are accepted as they are),
    then the stream is decoded with ``_xobj_to_image``.

    Returns:
        A dictionary mapping the identifiers ``~0~``, ``~1~``, ... (numbered
        in order of appearance in the content stream) to the matching
        ``FileImage``, whose ``indirect_reference`` is ``None``.
        The dictionary is empty if the page has no content.

    Raises:
        PdfReadError: if a stray ``BI``, ``ID`` or ``EI`` operator is met, or
            if a custom colour space name can not be found in the
            ``/ColorSpace`` resources of the page.
    """
    # Abbreviations allowed for the values of an inline image.
    value_names = {
        "/G": "/DeviceGray",
        "/RGB": "/DeviceRGB",
        "/CMYK": "/DeviceCMYK",
        "/I": "/Indexed",
        "/AHx": "/ASCIIHexDecode",
        "/A85": "/ASCII85Decode",
        "/LZW": "/LZWDecode",
        "/Fl": "/FlateDecode",
        "/RL": "/RunLengthDecode",
        "/CCF": "/CCITTFaxDecode",
        "/DCT": "/DCTDecode",
    }
    # Abbreviations allowed for the keys of an inline image.
    key_names = {
        "/BPC": "/BitsPerComponent",
        "/CS": "/ColorSpace",
        "/D": "/Decode",
        "/DP": "/DecodeParms",
        "/F": "/Filter",
        "/H": "/Height",
        "/W": "/Width",
        "/I": "/Interpolate",
        "/Intent": "/Intent",
        "/IM": "/ImageMask",
    }
    # Full names must never be searched in the resources.
    standard_names = set(value_names.values())

    def expand(value: object) -> object:
        """Return the full name of an abbreviated name, else the value itself."""
        try:
            return NameObject(value_names[cast(str, value)])
        except (TypeError, KeyError):  # unhashable value or not an abbreviation
            return value

    def translate(key: str, value: object) -> object:
        """Translate the value of one setting of an inline image."""
        if isinstance(value, list):
            # e.g. a list of filters, or [/I /RGB 255 <...>] for a colour space:
            # the abbreviations have to be expanded element by element
            return ArrayObject([expand(item) for item in value])
        expanded = expand(value)
        if (
            expanded is value
            and isinstance(value, NameObject)
            and key in ("/CS", "/ColorSpace")
            and value not in standard_names
        ):
            # it is a custom name : we have to look in resources :
            # the only applicable case is for ColorSpace
            try:
                res = cast(DictionaryObject, self["/Resources"])["/ColorSpace"]
                return cast(DictionaryObject, res)[value]
            except KeyError as exc:  # for res and value
                raise PdfReadError(
                    f"Can not find resource entry {value} for {key}"
                ) from exc
        return expanded

    content = self.get_contents()
    if content is None:
        return {}

    # First pass : collect the inline images (and reject unexpected operators)
    # before decoding anything.
    imgs_data = []
    for param, ope in content.operations:
        if ope == b"INLINE IMAGE":
            imgs_data.append((param["settings"], param["data"]))
        elif ope in (b"BI", b"EI", b"ID"):
            raise PdfReadError(
                f"{ope!r} operator met whereas not expected, "
                "please share usecase with pypdf dev team"
            )

    # Second pass : build one stream per inline image and decode it.
    files: Dict[str, FileImage] = {}
    for num, (settings, data) in enumerate(imgs_data):
        init = {"__streamdata__": data, "/Length": len(data)}
        for k, v in settings.items():
            if k in ("/L", "/Length"):
                # the length is computed from the data actually read
                continue
            init[NameObject(key_names.get(k, k))] = translate(k, v)
        stream = EncodedStreamObject.initialize_from_dictionary(init)
        extension, byte_stream, img = _xobj_to_image(stream)
        files[f"~{num}~"] = FileImage(
            name=f"~{num}~{extension}",
            data=byte_stream,
            image=img,
            indirect_reference=None,
        )
    return files
@pytest.mark.enable_socket()
def test_inline_images():
    """Check the extraction of inline images; the problem was reported in #424."""
    reader = PdfReader(
        BytesIO(
            get_pdf_from_url(
                "https://arxiv.org/pdf/2201.00151.pdf", name="2201.00151.pdf"
            )
        )
    )
    expected = Image.open(
        BytesIO(
            get_pdf_from_url(
                "https://github.com/py-pdf/pypdf/assets/4083478/28e8b87c-be2c-40d9-9c86-15c7819021bf",
                name="inline4.png",
            )
        )
    )

    # The conversion to RGB will have to be removed with other fixes.
    extracted = reader.pages[1].images[4].image.convert("RGB")
    assert list(extracted.getdata()) == list(expected.getdata())

    # An identifier matching no inline image is reported with a KeyError.
    with pytest.raises(KeyError):
        reader.pages[0].images["~999~"]

    # Once the colour space resource is removed, recomputing the inline
    # images of the page has to fail.
    del reader.pages[1]["/Resources"]["/ColorSpace"]["/R124"]
    reader.pages[1].inline_images = None  # to force recalculation
    with pytest.raises(PdfReadError):
        reader.pages[1].images["~1~"]

    # Append a lone BI operator to the content of the first page, then list
    # the names of its images (no assertion is made on the result).
    contents = reader.pages[0].get_contents()
    contents.operations.append(([], b"BI"))
    reader.pages[0][NameObject("/Contents")] = contents
    reader.pages[0].images.keys()

    # Replacing an inline image is refused.
    with pytest.raises(TypeError) as exc:
        reader.pages[0].images[0].replace(expected)
    assert exc.value.args[0] == "Can not update an inline image"