pageobjects: add initial image support model (#151)

pypdfium2-team · Oct 13, 2022 · 7c34f76 · 7c34f76
1 parent a631a1a
commit 7c34f76
Show file tree

Hide file tree

Showing 20 changed files with 358 additions and 80 deletions.
diff --git a/.reuse/dep5 b/.reuse/dep5
@@ -37,6 +37,7 @@ Files: tests/resources/toc_circular.pdf
        tests/resources/toc_viewmodes.pdf
        tests/resources/toc_maxdepth.pdf
        tests/resources/form_listbox.pdf
+       tests/resources/mona_lisa.jpg
 Copyright: 2022 PDFium Developers
 License: BSD-3-Clause OR Apache-2.0
 Comment:
@@ -45,6 +46,7 @@ Comment:
     https://pdfium.googlesource.com/pdfium_tests/+/refs/heads/main/fx/other/8.2_outline.pdf
     https://pdfium.googlesource.com/pdfium_tests/+/refs/heads/main/fx/FRC_8.2.2_part1/FRC_51_8.2.2_T_8.4__Count_edit_count_100.pdf
     https://pdfium.googlesource.com/pdfium/+/refs/heads/main/testing/resources/listbox_form.pdf
+    https://pdfium.googlesource.com/pdfium/+/refs/heads/main/testing/resources/mona_lisa.jpg
 
 Files: tests/resources/box_fallback.in
        tests/resources/box_fallback.pdf

diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md
@@ -7,4 +7,5 @@
 - Disruption: Two components of `PdfDocument` have been removed to clean up the code (without a major release, due to their insignificance):
   - Removal of `update_rendering_input()`. Callers are expected to save and re-open the document on their own if they want changes to take effect with the multi-page renderer.
   - The multipage renderer does not implicitly read byte buffers into memory anymore. Callers are expected to take an explicit decision by providing a different input in the first place.
+- Added a new support model `PdfImageObject` (which inherits from `PdfPageObject`). This can be used to insert a JPEG image into a page, get metadata, etc.
 - Docs: The changelog page now selectively includes an entry for the next release that may be shown on `latest` builds.
diff --git a/docs/devel/tasks.md b/docs/devel/tasks.md
@@ -7,8 +7,12 @@ These are various tasks for the maintainer to keep in mind, in no specific order
 Also see the issues panel and inline `TODO` marks in source code.
 
 ### Main Code
+* Make bitmap converters independent of rendering so as to use them on bitmaps in other places as well.
+* Allow plugging arbitrary static render functions into `render_to()` methods. Then we can add a secondary function for matrix rendering.
+* Rename `insert_text()` to `insert_text_shaped()` (keeping the other as alias) so we can add a simple text insertion function later.
 * Check if we should use `FPDFPage_HasTransparency()` on rendering.
-* Add support models for attachments and image extraction.
+* Add new support models for attachments, document metadata, and image extraction.
+* Add helper methods for page labels and trailer ID.
 * Consolidate and extend helper classes.
 * Ensure we correctly handle PDFium return codes indicating failure.
 * Review on a case-by-case basis where we should raise an error and where pass.

diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -57,6 +57,7 @@ def _have_changelog():
 autodoc_default_options = {
     "members": True,
     "undoc-members": True,
+    "show-inheritance": True,
     "member-order": "bysource",
 }
 intersphinx_mapping = {

diff --git a/docs/source/planned_changes.md b/docs/source/planned_changes.md
@@ -9,5 +9,6 @@ The following API breaking changes are being considered for the next major relea
 - The textpage API will change
   * The `count_chars()` alias will be removed in favour of the `n_chars` attribute.
   * The `get_text()` alias will be removed in favour of `get_text_bounded()`.
+- `PdfPage.insert_text()` will be renamed to `insert_text_shaped()`.
 - The `PdfDocument` context manager API will be removed. It will not be possible to use documents in a `with`-block anymore.
 - `PdfDocument.add_font()` might be changed to take bytes rather than a file path.
diff --git a/docs/source/python_api.rst b/docs/source/python_api.rst
@@ -26,12 +26,10 @@ Version
 Document
 ********
 .. automodule:: pypdfium2._helpers.document
-    :show-inheritance:
 
 Page
 ****
 .. automodule:: pypdfium2._helpers.page
-    :show-inheritance:
 
 Page Object
 ***********
@@ -48,7 +46,6 @@ Matrix
 Converters
 **********
 .. automodule:: pypdfium2._helpers.converters
-    :show-inheritance:
 
 Miscellaneous
 *************

diff --git a/docs/source/shell_api.rst b/docs/source/shell_api.rst
@@ -9,57 +9,46 @@ pypdfium2 can also be used from the command-line.
 
 Version
 *******
-``pypdfium2 --version``
-
-.. program-output:: pypdfium2 --version
+.. command-output:: pypdfium2 --version
 
 
 Main Help
 *********
-``pypdfium2 --help``
-
-.. program-output:: pypdfium2 --help
+.. command-output:: pypdfium2 --help
 
 
 Renderer
 ********
 *Requires* :mod:`PIL`
 
-``pypdfium2 render --help``
-
-.. program-output:: pypdfium2 render --help
+.. command-output:: pypdfium2 render --help
 
 
 Table of Contents Reader
 ************************
-``pypdfium2 toc --help``
-
-.. program-output:: pypdfium2 toc --help
+.. command-output:: pypdfium2 toc --help
 
 
 Merger
 ******
-``pypdfium2 merge --help``
-
-.. program-output:: pypdfium2 merge --help
+.. command-output:: pypdfium2 merge --help
 
 
 Page Tiler
 **********
-``pypdfium2 tile --help``
-
-.. program-output:: pypdfium2 tile --help
+.. command-output:: pypdfium2 tile --help
 
 
 Text Extractor
 **************
-``pypdfium2 extract-text --help``
-
-.. program-output:: pypdfium2 extract-text --help
+.. command-output:: pypdfium2 extract-text --help
 
 
 Page Object Finder
 ******************
-``pypdfium2 find-pageobjects --help``
+.. command-output:: pypdfium2 find-pageobjects --help
 
-.. program-output:: pypdfium2 find-pageobjects --help
+
+JPEG Converter
+**************
+.. command-output:: pypdfium2 jpegtopdf --help
diff --git a/src/pypdfium2/_cli/jpegtopdf.py b/src/pypdfium2/_cli/jpegtopdf.py
@@ -0,0 +1,57 @@
+# SPDX-FileCopyrightText: 2022 geisserml <geisserml@gmail.com>
+# SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
+
+import os.path
+from pypdfium2 import _namespace as pdfium
+
+
+def attach_parser(subparsers):
+    # Register the "jpegtopdf" subcommand and its arguments on the given
+    # argparse subparsers object. Nothing is returned; the new parser is
+    # attached to `subparsers` as a side effect of add_parser().
+    parser = subparsers.add_parser(
+        "jpegtopdf",
+        help = "Convert JPEG images to PDF",
+    )
+    # One or more input paths; argparse passes each value through
+    # os.path.abspath, so main() receives absolute paths.
+    parser.add_argument(
+        "images",
+        nargs = "+",
+        help = "Input JPEG images",
+        type = os.path.abspath,
+    )
+    # Mandatory output path (taken as given, not converted to an absolute path).
+    parser.add_argument(
+        "--output", "-o",
+        required = True,
+        help = "Target path for the new PDF"
+    )
+    # Boolean flag, False unless given; main() forwards it to load_jpeg() as `inline`.
+    parser.add_argument(
+        "--inline",
+        action = "store_true",
+        help = "Whether to use FPDFImageObj_LoadJpegFileInline() rather than FPDFImageObj_LoadJpegFile()."
+    )
+
+
+def main(args):
+    """
+    Convert the JPEG files in ``args.images`` to a new PDF at ``args.output``, creating one page per image.
+
+    Very rudimentary JPEG to PDF conversion, mostly for testing.
+    The implementation could certainly be more sophisticated (e.g. configurable DPI, margins, crop, positioning, ...)
+
+    Parameters:
+        args: Parsed command-line arguments providing ``images``, ``output`` and ``inline``.
+    Raises:
+        FileExistsError: If ``args.output`` already exists.
+        ValueError: If an input path does not end with ``.jpg`` or ``.jpeg`` (case-insensitive).
+    """
+
+    # Fail fast: check the preconditions before any document is created or any input file is opened.
+    if os.path.exists(args.output):
+        raise FileExistsError("Refusing to overwrite '%s'" % args.output)
+
+    # Simple check if the input files are actually JPEGs
+    # A better implementation could use mimetypes or python-magic instead
+    # An explicit exception is raised rather than using `assert`, which is skipped when Python runs with -O.
+    for file in args.images:
+        if not file.lower().endswith((".jpg", ".jpeg")):
+            raise ValueError("Input file does not have a JPEG extension (.jpg, .jpeg): '%s'" % file)
+
+    pdf = pdfium.PdfDocument.new()
+
+    for file in args.images:
+
+        image = pdfium.PdfImageObject.new(pdf)
+
+        # Deliberately not a `with`-block: the buffer is handed over with autoclose=True.
+        # NOTE(review): presumably the library closes it together with the document - confirm against load_jpeg().
+        buffer = open(file, "rb")
+        width, height = image.load_jpeg(buffer, inline=args.inline, autoclose=True)
+
+        # The page takes the dimensions reported by load_jpeg(), so the image fills it.
+        page = pdf.new_page(width, height)
+        page.insert_object(image)
+        page.generate_content()
+
+    # Exclusive creation ("x") raises FileExistsError if the target appeared since the check above,
+    # so an existing file is never overwritten.
+    with open(args.output, "xb") as buffer:
+        pdf.save(buffer)
diff --git a/src/pypdfium2/_cli/main.py b/src/pypdfium2/_cli/main.py
@@ -17,6 +17,7 @@
     tile,
     extract_text,
     find_pageobjects,
+    jpegtopdf,
 )
 
 try:
@@ -32,6 +33,7 @@
     "tile": tile,
     "extract-text": extract_text,
     "find-pageobjects": find_pageobjects,
+    "jpegtopdf": jpegtopdf,
 }
 
 

diff --git a/src/pypdfium2/_helpers/__init__.py b/src/pypdfium2/_helpers/__init__.py
@@ -6,4 +6,5 @@
 from pypdfium2._helpers.converters import *
 from pypdfium2._helpers.document import *
 from pypdfium2._helpers.page import *
+from pypdfium2._helpers.pageobject import *
 from pypdfium2._helpers.textpage import *
diff --git a/src/pypdfium2/_helpers/converters.py b/src/pypdfium2/_helpers/converters.py
@@ -17,7 +17,7 @@
 class BitmapConvBase:
     """
     Parent class for bitmap converters compatible with :meth:`.PdfPage.render_to` / :meth:`.PdfDocument.render_to`.
-    The constructor captures any arguments and adds them to the :meth:`.run` call.
+    The initialiser captures any arguments and adds them to the :meth:`.run` call.
     """
 
     def __init__(self, *args, **kwargs):
@@ -35,9 +35,9 @@ def run(result, renderer_kws, *args, **kwargs):
             renderer_kws (dict):
                 Dictionary of rendering keywords that were passed in by the caller.
             args (tuple):
-                Further positional arguments to the converter, as captured by the constructor.
+                Further positional arguments to the converter, as captured by the initialiser.
             kwargs (dict):
-                Further keyword arguments to the converter, as captured by the constructor.
+                Further keyword arguments to the converter, as captured by the initialiser.
         Returns:
             typing.Any: The converted rendering result (implementation-specific).
         """

diff --git a/src/pypdfium2/_helpers/document.py b/src/pypdfium2/_helpers/document.py
@@ -75,6 +75,7 @@ def __init__(
         self._orig_input = input_data
         self._actual_input = input_data
         self._data_holder = []
+        self._data_closer = []
 
         self._password = password
         self._file_access = file_access
@@ -94,7 +95,7 @@ def __init__(
                 pass
             elif self._file_access is FileAccess.BUFFER:
                 self._actual_input = open(self._orig_input, "rb")
-                self._autoclose = True
+                self._data_closer.append(self._actual_input)
             elif self._file_access is FileAccess.BYTES:
                 buf = open(self._orig_input, "rb")
                 self._actual_input = buf.read()
@@ -108,9 +109,12 @@ def __init__(
             self.raw, ld_data = _open_pdf(self._actual_input, self._password)
             self._data_holder += ld_data
 
+        if self._autoclose and is_input_buffer(self._actual_input):
+            self._data_closer.append(self._actual_input)
+
         self._finalizer = weakref.finalize(
             self, self._static_close,
-            self.raw, self._data_holder, self._autoclose, self._actual_input,
+            self.raw, self._data_holder, self._data_closer,
         )
 
 
@@ -147,15 +151,15 @@ def new(cls):
 
 
     @staticmethod
-    def _static_close(raw, data_holder, autoclose, actual_input):
+    def _static_close(raw, data_holder, data_closer):
 
         # logger.debug("Closing document")
         pdfium.FPDF_CloseDocument(raw)
 
         for data in data_holder:
             id(data)
-        if autoclose and is_input_buffer(actual_input):
-            actual_input.close()
+        for data in data_closer:
+            data.close()
 
 
     @staticmethod
@@ -178,6 +182,8 @@ def close(self):
         self.exit_formenv()
         self._finalizer()
         self.raw = None
+        self._data_holder = []
+        self._data_closer = []
 
 
     def _tree_closed(self):
@@ -279,7 +285,7 @@ def get_page_size(self, index):
         success = pdfium.FPDF_GetPageSizeByIndexF(self.raw, index, size)
         if not success:
             raise PdfiumError("Getting page size by index failed.")
-        return (float(size.width), float(size.height))
+        return (size.width, size.height)
 
 
     def page_as_xobject(self, index, dest_pdf):
@@ -656,6 +662,7 @@ def as_pageobject(self):
         raw_pageobj = pdfium.FPDF_NewFormObjectFromXObject(self.raw)
         return PdfPageObject(
             raw = raw_pageobj,
+            type = pdfium.FPDF_PAGEOBJ_FORM,
             pdf = self.pdf,
         )
 

diff --git a/src/pypdfium2/_helpers/page.py b/src/pypdfium2/_helpers/page.py
@@ -1,5 +1,4 @@
 # SPDX-FileCopyrightText: 2022 geisserml <geisserml@gmail.com>
-# SPDX-FileCopyrightText: 2022 Anurag Bansal <anurag.bansal.585@gmail.com>
 # SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
 
 import math
@@ -263,6 +262,9 @@ def insert_text(
                 PDF font data.
         """
 
+        # User-contributed code
+        # SPDX-FileCopyrightText: 2022 Anurag Bansal <anurag.bansal.585@gmail.com>
+
         hb_buffer = harfbuzz.Buffer()
         hb_buffer.add_str(text)
         hb_buffer.guess_segment_properties()
@@ -316,15 +318,16 @@ def get_objects(self, max_depth=2, form=None, level=0):
             if raw_obj is None:
                 raise PdfiumError("Failed to get page object.")
 
-            helper_obj = PdfPageObject(
+            type = pdfium.FPDFPageObj_GetType(raw_obj)
+            yield PdfPageObject(
                 raw = raw_obj,
+                type = type,
                 page = self,
                 pdf = self.pdf,
                 level = level,
             )
-            yield helper_obj
 
-            if level < max_depth-1 and helper_obj.type == pdfium.FPDF_PAGEOBJ_FORM:
+            if level < max_depth-1 and type == pdfium.FPDF_PAGEOBJ_FORM:
                 yield from self.get_objects(
                     max_depth = max_depth,
                     form = raw_obj,
@@ -338,10 +341,10 @@ def render_to(self, converter, **renderer_kws):
         
         Parameters:
             converter (BitmapConvBase | typing.Callable):
-                A translator to convert the output of :meth:`.render_base`.
-                See :class:`.BitmapConv` for a set of built-in converters.
+                A translator to convert the output of :meth:`.render_base`. See :class:`.BitmapConv` for a set of built-in converters.
             renderer_kws (dict):
                 Keyword arguments to the renderer.
+        
         Returns:
             typing.Any: Converter-specific result.