From fbab9b0f472373ea874d5403f8c388c7300cf25b Mon Sep 17 00:00:00 2001 From: geisserml Date: Sat, 2 Sep 2023 22:53:40 +0200 Subject: [PATCH] [BP] `PdfDocument.render()`: Fix non-API implementation issues --- docs/devel/changelog_staging.md | 2 + src/pypdfium2/_helpers/document.py | 76 +++++++++++++++--------------- 2 files changed, 40 insertions(+), 38 deletions(-) diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md index bccd8be11..718781280 100644 --- a/docs/devel/changelog_staging.md +++ b/docs/devel/changelog_staging.md @@ -8,5 +8,7 @@ This release backports some key fixes/improvements from the development branch: - [V8/XFA] Fixed XFA init. This issue was caused by a typo in a struct field. Thanks to BenoƮt Blanchon. - [V8/XFA] Expose V8/XFA exclusive members in the bindings file by passing ctypesgen the pre-processor defines in question. +- `PdfDocument.render()`: Fixed non-API implementation issues: + * Avoid full state data transfer and object re-initialization for each job. Instead, use a pool initializer and exploit global variables. This also makes bytes input tolerable for parallel rendering. - Fixed sourcebuild with system libraries. - Attempt to fix automatic GH pages rebuild on release. diff --git a/src/pypdfium2/_helpers/document.py b/src/pypdfium2/_helpers/document.py index dce8d9b41..6d4d6b0f7 100644 --- a/src/pypdfium2/_helpers/document.py +++ b/src/pypdfium2/_helpers/document.py @@ -6,7 +6,6 @@ import os import ctypes import logging -import functools from pathlib import Path from collections import namedtuple from concurrent.futures import ProcessPoolExecutor @@ -568,27 +567,6 @@ def get_toc( bookmark = pdfium_c.FPDFBookmark_GetNextSibling(self, bookmark) - @classmethod - def _process_page(cls, index, input_data, password, renderer, converter, pass_info, need_formenv, mk_formconfig, **kwargs): - - pdf = cls(input_data, password=password) - if need_formenv: - pdf.init_forms(config=mk_formconfig()) if mk_formconfig else pdf.init_forms() - - page = pdf[index] - bitmap = renderer(page, **kwargs) - info = bitmap.get_info() - result = converter(bitmap) - - # NOTE We MUST NOT call bitmap.close() before the converted object is serialized to the main process, otherwise we would free the buffer of a foreign bitmap prematurely if the converted object references the buffer rather than owning a copy. Confirmed by POC. - # This is not an issue when freeing the bitmap on garbage collection, provided the converted object keeps the buffer alive. - - for g in (page, pdf): - g.close() - - return (result, info) if pass_info else result - - def render( self, converter, @@ -600,6 +578,12 @@ def render( **kwargs ): """ + .. deprecated:: 4.19 + This method will be removed with the next major release (v5) due to serious conceptual problems. See the upcoming changelog for more info. + + .. versionchanged:: 4.19 + Fixed major non-API implementation issues. + Render multiple pages in parallel, using a process pool executor. Hint: @@ -622,13 +606,13 @@ def render( Keyword arguments to the renderer. Yields: - :data:`typing.Any`: Parameter-dependent result. + :data:`typing.Any`: Result as returned by the given converter. """ # TODO(apibreak) remove mk_formconfig parameter (bloat) - if not isinstance(self._input, (Path, str)): - raise ValueError("Can only render in parallel with file path input.") + if not isinstance(self._input, (Path, str, bytes)): + raise ValueError(f"Cannot render in parallel with input type '{type(self._input).__name__}'.") n_pages = len(self) if not page_indices: @@ -639,20 +623,36 @@ def render( if len(page_indices) != len(set(page_indices)): raise ValueError("Duplicate page indices are prohibited.") - invoke_renderer = functools.partial( - PdfDocument._process_page, - input_data = self._input, - password = self._password, - renderer = renderer, - converter = converter, - pass_info = pass_info, - need_formenv = bool(self.formenv), - mk_formconfig = mk_formconfig, - **kwargs + # TODO consider `mp_context = multiprocessing.get_context("spawn")` + pool_kwargs = dict( + initializer = _parallel_renderer_init, + initargs = (self._input, self._password, bool(self.formenv), mk_formconfig, renderer, converter, pass_info, kwargs), ) - - with ProcessPoolExecutor(n_processes) as pool: - yield from pool.map(invoke_renderer, page_indices) + with ProcessPoolExecutor(n_processes, **pool_kwargs) as pool: + yield from pool.map(_parallel_renderer_job, page_indices) + + +def _parallel_renderer_init(input_data, password, need_formenv, mk_formconfig, renderer, converter, pass_info, kwargs): + + pdf = PdfDocument(input_data, password=password) + if need_formenv: + pdf.init_forms(config=mk_formconfig()) if mk_formconfig else pdf.init_forms() + + global _ParallelRenderObjs + _ParallelRenderObjs = (pdf, renderer, converter, pass_info, kwargs) + + +def _parallel_renderer_job(index): + + global _ParallelRenderObjs + pdf, renderer, converter, pass_info, kwargs = _ParallelRenderObjs + + page = pdf[index] + bitmap = renderer(page, **kwargs) + info = bitmap.get_info() + result = converter(bitmap) + + return (result, info) if pass_info else result class PdfFormEnv (pdfium_i.AutoCloseable):