Skip to content

Commit

Permalink
Add Inline Image extraction
Browse files Browse the repository at this point in the history
  • Loading branch information
pubpub-zz committed May 20, 2023
1 parent a73e24a commit 5fd8135
Show file tree
Hide file tree
Showing 2 changed files with 143 additions and 5 deletions.
115 changes: 112 additions & 3 deletions pypdf/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@
from .constants import ImageAttributes as IA
from .constants import PageAttributes as PG
from .constants import Ressources as RES
from .errors import PageSizeNotDefinedError
from .errors import PageSizeNotDefinedError, PdfReadError
from .filters import _xobj_to_image
from .generic import (
ArrayObject,
Expand Down Expand Up @@ -341,6 +341,7 @@ def __init__(
) -> None:
DictionaryObject.__init__(self)
self.pdf: Union[None, PdfReaderProtocol, PdfWriterProtocol] = pdf
self.inline_images: Optional[Dict[str, FileImage]] = None
if indirect_ref is not None: # deprecated
warnings.warn(
(
Expand Down Expand Up @@ -470,6 +471,8 @@ def _old_images(self) -> List[File]: # deprecated
def _get_ids_image(
self, obj: Optional[DictionaryObject] = None, ancest: Optional[List[str]] = None
) -> List[Union[str, List[str]]]:
if self.inline_images is None:
self.inline_images = self._get_inline_images()
if obj is None:
obj = self
if ancest is None:
Expand All @@ -478,7 +481,7 @@ def _get_ids_image(
if PG.RESOURCES not in obj or RES.XOBJECT not in cast(
DictionaryObject, obj[PG.RESOURCES]
):
return lst
return list(self.inline_images.keys())

x_object = obj[PG.RESOURCES][RES.XOBJECT].get_object() # type: ignore
for o in x_object:
Expand All @@ -504,8 +507,14 @@ def _get_image(
DictionaryObject, cast(DictionaryObject, obj[PG.RESOURCES])[RES.XOBJECT]
)
except KeyError:
raise
if not (id[0] == "~" and id[-1] == "~"):
raise
if isinstance(id, str):
if id[0] == "~" and id[-1] == "~":
if self.inline_images is None:
raise KeyError("no inline image can be found")
return self.inline_images[id]

extension, byte_stream, img = _xobj_to_image(
cast(DictionaryObject, xobjs[id])
)
Expand Down Expand Up @@ -552,9 +561,109 @@ def images(self) -> List[FileImage]:
applying the saving parameters indicated (such as quality)
e.g. :
`reader.pages[0].images[0].replace(Image.open("new_image.jpg"), quality=20)`
Inline images are now extracted as well: they are named ~0~, ~1~, ...
Note that the indirect_reference is None for inline images.
"""
return _VirtualListImages(self._get_ids_image, self._get_image) # type: ignore

def _get_inline_images(self) -> Dict[str, FileImage]:
    """
    Extract the inline images found in the content stream of the page.

    The images are keyed ``~0~``, ``~1~``, ... following their order of
    appearance in the content stream.

    Returns:
        A dictionary mapping each key to a FileImage whose
        indirect_reference is None; empty if the page has no content or
        no inline image.

    Raises:
        PdfReadError: if a raw BI/ID/EI operator is present in the parsed
            operations, or if a colour space name can not be found in the
            /ColorSpace resources of the page.
    """
    # Abbreviations allowed for the *values* of an inline image dictionary
    # (colour spaces and filters) and their full names.
    value_full_names = {
        "/G": "/DeviceGray",
        "/RGB": "/DeviceRGB",
        "/CMYK": "/DeviceCMYK",
        "/I": "/Indexed",
        "/AHx": "/ASCIIHexDecode",
        "/A85": "/ASCII85Decode",
        "/LZW": "/LZWDecode",
        "/Fl": "/FlateDecode",
        "/RL": "/RunLengthDecode",
        "/CCF": "/CCITTFaxDecode",
        "/DCT": "/DCTDecode",
    }
    # Abbreviations allowed for the *keys* of an inline image dictionary.
    key_full_names = {
        "/BPC": "/BitsPerComponent",
        "/CS": "/ColorSpace",
        "/D": "/Decode",
        "/DP": "/DecodeParms",
        "/F": "/Filter",
        "/H": "/Height",
        "/W": "/Width",
        "/I": "/Interpolate",
        "/Intent": "/Intent",
        "/IM": "/ImageMask",
    }
    # These names always designate the colour space itself, never a resource.
    device_color_spaces = ("/DeviceGray", "/DeviceRGB", "/DeviceCMYK")

    def expand_value(key, full_key, value):  # type: ignore
        """Return value with its abbreviations replaced by full names."""
        if isinstance(value, list):
            # e.g. a chain of filters or an Indexed colour space:
            # only the names are translated, other items are kept as is
            return ArrayObject(
                NameObject(value_full_names.get(item, item))
                if isinstance(item, NameObject)
                else item
                for item in value
            )
        try:
            return NameObject(value_full_names[value])
        except (TypeError, KeyError):
            pass  # not an abbreviation (or not hashable)
        if (
            not isinstance(value, NameObject)
            or full_key != "/ColorSpace"
            or value in device_color_spaces
        ):
            # numbers, booleans, full filter names, rendering intents, ...
            return value
        # It is a custom name: we have to look in resources;
        # the only applicable case is for ColorSpace
        try:
            color_spaces = cast(DictionaryObject, self["/Resources"])[
                "/ColorSpace"
            ]
            return cast(DictionaryObject, color_spaces)[value]
        except KeyError as exc:  # for the resources and for the name
            raise PdfReadError(
                f"Can not find resource entry {value} for {key}"
            ) from exc

    content = self.get_contents()
    if content is None:
        return {}

    imgs_data = []
    for param, ope in content.operations:
        if ope == b"INLINE IMAGE":
            imgs_data.append(
                {"settings": param["settings"], "__streamdata__": param["data"]}
            )
        elif ope in (b"BI", b"EI", b"ID"):
            # the content stream parser is expected to merge BI/ID/EI into
            # a single "INLINE IMAGE" operation
            raise PdfReadError(
                f"{ope!r} operator met whereas not expected, "
                "please share usecase with pypdf dev team"
            )

    files = {}
    for num, ii in enumerate(imgs_data):
        init = {
            "__streamdata__": ii["__streamdata__"],
            "/Length": len(ii["__streamdata__"]),
        }
        for k, v in ii["settings"].items():
            # full key names are legal too: keep them unchanged
            full_key = key_full_names.get(k, k)
            init[NameObject(full_key)] = expand_value(k, full_key, v)
        stream = EncodedStreamObject.initialize_from_dictionary(init)
        extension, byte_stream, img = _xobj_to_image(stream)
        files[f"~{num}~"] = FileImage(
            name=f"~{num}~{extension}",
            data=byte_stream,
            image=img,
            indirect_reference=None,
        )
    return files

@property
def rotation(self) -> int:
"""
Expand Down
33 changes: 31 additions & 2 deletions tests/test_workflows.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,12 @@
from re import findall

import pytest
from PIL import ImageChops
from PIL import Image, ImageChops

from pypdf import PdfMerger, PdfReader, PdfWriter
from pypdf.constants import PageAttributes as PG
from pypdf.errors import PdfReadError, PdfReadWarning
from pypdf.generic import ContentStream, read_object
from pypdf.generic import ContentStream, NameObject, read_object

from . import get_pdf_from_url, normalize_warnings

Expand Down Expand Up @@ -965,3 +965,32 @@ def test_replace_image(tmp_path):
with pytest.raises(TypeError) as exc:
i.replace(reader.pages[0].images[0].image)
assert exc.value.args[0] == "Can not update an inline image"


@pytest.mark.enable_socket()
def test_inline_images():
    """Inline images can be listed and extracted (problem reported in #424)."""
    pdf_data = get_pdf_from_url(
        "https://arxiv.org/pdf/2201.00151.pdf", name="2201.00151.pdf"
    )
    reader = PdfReader(BytesIO(pdf_data))
    png_data = get_pdf_from_url(
        "https://github.com/py-pdf/pypdf/assets/4083478/28e8b87c-be2c-40d9-9c86-15c7819021bf",
        name="inline4.png",
    )
    img_ref = Image.open(BytesIO(png_data))

    # The extracted image has to match the reference picture;
    # the convert() will have to be removed once other fixes are in.
    second_page = reader.pages[1]
    extracted_pixels = list(second_page.images[4].image.convert("RGB").getdata())
    assert extracted_pixels == list(img_ref.getdata())

    # An unknown inline image name is reported as a missing key.
    first_page = reader.pages[0]
    with pytest.raises(KeyError):
        first_page.images["~999~"]

    # Without its colour space resource the inline image can not be built.
    del second_page["/Resources"]["/ColorSpace"]["/R124"]
    second_page.inline_images = None  # to force recalculation
    with pytest.raises(PdfReadError):
        second_page.images["~1~"]

    # Listing the images of a page whose content got an extra BI operator.
    contents = first_page.get_contents()
    contents.operations.append(([], b"BI"))
    first_page[NameObject("/Contents")] = contents
    first_page.images.keys()

    # Inline images are read-only.
    with pytest.raises(TypeError) as exc:
        first_page.images[0].replace(img_ref)
    assert exc.value.args[0] == "Can not update an inline image"

0 comments on commit 5fd8135

Please sign in to comment.