From 607ace6f94531150d3f58190c2b97bc50faf6f26 Mon Sep 17 00:00:00 2001 From: geisserml Date: Fri, 24 Nov 2023 19:56:48 +0100 Subject: [PATCH] Make document-level renderer harmless --- src/pypdfium2/__main__.py | 16 +- src/pypdfium2/_cli/_parsers.py | 14 ++ src/pypdfium2/_cli/render.py | 302 ++++++++++++++++++++++------- src/pypdfium2/_helpers/bitmap.py | 19 +- src/pypdfium2/_helpers/document.py | 95 ++------- tests_old/test_renderer.py | 7 +- 6 files changed, 270 insertions(+), 183 deletions(-) diff --git a/src/pypdfium2/__main__.py b/src/pypdfium2/__main__.py index 423f3cd11..0f459c0bb 100644 --- a/src/pypdfium2/__main__.py +++ b/src/pypdfium2/__main__.py @@ -1,16 +1,13 @@ # SPDX-FileCopyrightText: 2023 geisserml # SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause -import os import sys -import logging import argparse import importlib -import pypdfium2._helpers as pdfium -import pypdfium2.internal as pdfium_i from pypdfium2.version import PYPDFIUM_INFO, PDFIUM_INFO # the * import in pypdfium2.raw loses underscore-prefixed members, so import from the direct origin from pypdfium2_raw.bindings import _loader_info as loader_info +from pypdfium2._cli._parsers import setup_logging SubCommands = { "arrange": "rearrange/merge documents", @@ -49,17 +46,6 @@ def get_parser(): return main_parser -def setup_logging(): - - pdfium_i.DEBUG_AUTOCLOSE.value = bool(int( os.environ.get("DEBUG_AUTOCLOSE", 0) )) - - lib_logger = logging.getLogger("pypdfium2") - lib_logger.addHandler(logging.StreamHandler()) - lib_logger.setLevel(logging.DEBUG) - - pdfium.PdfUnspHandler().setup() - - def api_main(raw_args=sys.argv[1:]): parser = get_parser() diff --git a/src/pypdfium2/_cli/_parsers.py b/src/pypdfium2/_cli/_parsers.py index f5c4021f3..500b265ca 100644 --- a/src/pypdfium2/_cli/_parsers.py +++ b/src/pypdfium2/_cli/_parsers.py @@ -1,8 +1,22 @@ # SPDX-FileCopyrightText: 2023 geisserml # SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause +import os +import logging from pathlib import Path import pypdfium2._helpers as pdfium +import pypdfium2.internal as pdfium_i + + +def setup_logging(): + + pdfium_i.DEBUG_AUTOCLOSE.value = bool(int( os.environ.get("DEBUG_AUTOCLOSE", 0) )) + + lib_logger = logging.getLogger("pypdfium2") + lib_logger.addHandler(logging.StreamHandler()) + lib_logger.setLevel(logging.DEBUG) + + pdfium.PdfUnspHandler().setup() def parse_numtext(numtext): diff --git a/src/pypdfium2/_cli/render.py b/src/pypdfium2/_cli/render.py index d284a1c85..98d313bcb 100644 --- a/src/pypdfium2/_cli/render.py +++ b/src/pypdfium2/_cli/render.py @@ -2,34 +2,71 @@ # SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause import os +import math import logging +import functools from pathlib import Path +import multiprocessing as mp +import concurrent.futures as ft + +try: + import PIL.Image +except ImportError: + PIL = None + import pypdfium2._helpers as pdfium -# TODO? consider dotted access -from pypdfium2._cli._parsers import add_input, get_input +import pypdfium2.internal as pdfium_i +import pypdfium2.raw as pdfium_r +# CONSIDER dotted access +from pypdfium2._cli._parsers import add_input, get_input, setup_logging logger = logging.getLogger(__name__) -ColorOpts = dict( - metavar = "C", - nargs = 4, - type = int, +def _bitmap_wrapper_foreign_simple(width, height, format, *args, **kwargs): + if format == pdfium_r.FPDFBitmap_BGRx: + use_alpha = False + elif format == pdfium_r.FPDFBitmap_BGRA: + use_alpha = True + else: + raise RuntimeError(f"Cannot create foreign_simple bitmap with bitmap type {pdfium_i.BitmapTypeToStr[format]}.") + return pdfium.PdfBitmap.new_foreign_simple(width, height, use_alpha, *args, **kwargs) + +BitmapMakers = dict( + native = pdfium.PdfBitmap.new_native, + foreign = pdfium.PdfBitmap.new_foreign, + foreign_packed = functools.partial(pdfium.PdfBitmap.new_foreign, force_packed=True), + foreign_simple = _bitmap_wrapper_foreign_simple, +) + +CsFields = ("path_fill", "path_stroke", "text_fill", "text_stroke") +ColorOpts = dict(metavar="C", nargs=4, type=int) +SampleTheme = dict( + # choose some random colors so we can distinguish the different drawings (TODO improve) + path_fill = (170, 100, 0, 255), # dark orange + path_stroke = (0, 150, 255, 255), # sky blue + text_fill = (255, 255, 255, 255), # white + text_stroke = (150, 255, 0, 255), # green ) def attach(parser): + # TODO implement numpy/opencv receiver add_input(parser, pages=True) parser.add_argument( "--output", "-o", type = Path, required = True, - help = "Output directory where to place the serially numbered images", + help = "Output directory where the serially numbered images shall be placed.", + ) + parser.add_argument( + "--prefix", + help = "Custom prefix for the images. Defaults to the input filename's stem.", ) parser.add_argument( "--format", "-f", default = "jpg", - help = "The image format to use", + help = "The image format to use.", ) parser.add_argument( "--scale", @@ -42,73 +79,124 @@ def attach(parser): default = 0, type = int, choices = (0, 90, 180, 270), - help = "Rotate pages by 90, 180 or 270 degrees", + help = "Rotate pages by 90, 180 or 270 degrees.", ) parser.add_argument( "--fill-color", - default = (255, 255, 255, 255), help = "Color the bitmap will be filled with before rendering. It shall be given in RGBA format as a sequence of integers ranging from 0 to 255. Defaults to white.", **ColorOpts, ) - parser.add_argument( - "--force-halftone", - action = "store_true", - help = "Always use halftone for image stretching", - ) - parser.add_argument( - "--no-annotations", - action = "store_true", - help = "Prevent rendering of PDF annotations", - ) - parser.add_argument( - "--no-forms", - action = "store_true", - help = "Prevent rendering of PDF forms", - ) parser.add_argument( "--optimize-mode", choices = ("lcd", "print"), - help = "Select a rendering optimisation mode (lcd, print)", - ) - parser.add_argument( - "--grayscale", - action = "store_true", - help = "Whether to render in grayscale mode (no colors)", + help = "The rendering optimisation mode. None if not given.", ) parser.add_argument( "--crop", nargs = 4, type = float, default = (0, 0, 0, 0), - help = "Amount to crop from (left, bottom, right, top)", + help = "Amount to crop from (left, bottom, right, top).", + ) + parser.add_argument( + "--no-annotations", + action = "store_true", + help = "Prevent rendering of PDF annotations.", + ) + parser.add_argument( + "--no-forms", + action = "store_true", + help = "Prevent rendering of PDF forms.", ) parser.add_argument( "--no-antialias", nargs = "+", default = (), choices = ("text", "image", "path"), - help = "Item types that shall not be smoothed", + type = str.lower, + help = "Item types that shall not be smoothed.", ) parser.add_argument( + "--force-halftone", + action = "store_true", + help = "Always use halftone for image stretching.", + ) + + bitmap = parser.add_argument_group( + title = "Bitmap options", + description = "Bitmap config, including pixel format. Notes: 1) By default, an alpha channel will be used only if --fill-color has transparency. 2) The combination of --rev-byteorder and --prefer-bgrx may be used to achieve a pixel format natively supported by PIL, to avoid data copying.", + ) + bitmap.add_argument( + "--bitmap-maker", + choices = BitmapMakers.keys(), + default = "native", + help = "The bitmap maker to use.", + type = str.lower, + ) + bitmap.add_argument( + "--grayscale", + action = "store_true", + help = "Whether to render in grayscale mode (no colors).", + ) + # TODO consider making --rev-byteorder and --prefer-bgrx default for PIL + bitmap.add_argument( "--rev-byteorder", action = "store_true", - help = "Render with reverse byte order internally, i. e. RGB(A) instead of BGR(A). The result should be completely identical.", + help = "Render with reverse byte order internally, i. e. RGB(A/X) instead of BGR(A/X). The result should be identical.", ) - parser.add_argument( + bitmap.add_argument( "--prefer-bgrx", action = "store_true", - help = "Request the use of a four-channel pixel format for colored output, even if rendering without transparency.", + help = "Use a four-channel pixel format for colored output, even if rendering without transparency.", ) - parser.add_argument( + + parallel = parser.add_argument_group( + title = "Parallelization", + description = "Options for rendering with multiple processes.", + ) + parallel.add_argument( + "--linear", + nargs = "?", + type = int, + default = 4, + const = math.inf, + help = "Render non-parallel if the pdf is shorter than the specified value (defaults to 4). If this flag is given without a value, then render linear regardless of document length.", + ) + parallel.add_argument( "--processes", default = os.cpu_count(), type = int, - help = "The number of processes to use for rendering (defaults to the number of CPU cores)", + help = "The maximum number of parallel rendering processes. Defaults to the number of CPU cores.", + ) + parallel.add_argument( + "--parallel-strategy", + choices = ("spawn", "forkserver", "fork"), + # NOTE forkserver might also be a reasonable default for linux + default = "spawn", + type = str.lower, + help = "The process start method to use. ('fork' is discouraged due to stability issues.)", + ) + parallel.add_argument( + "--parallel-lib", + choices = ("mp", "ft"), + default = "mp", + type = str.lower, + help = "The parallelization module to use (mp = multiprocessing, ft = concurrent.futures).", + ) + parallel.add_argument( + "--parallel-map", + type = str.lower, + help = "The map function to use (backend specific, the default is an iterative map)." ) color_scheme = parser.add_argument_group( - title = "Color scheme", - description = "Options for rendering with custom color scheme", + title = "Forced color scheme", + description = "Options for using pdfium's forced color scheme renderer. Deprecated, considered not useful.", + ) + color_scheme.add_argument( + "--sample-theme", + action = "store_true", + help = "Use a dark background sample theme as base. Explicit color params override selectively." ) color_scheme.add_argument( "--path-fill", @@ -129,63 +217,135 @@ def attach(parser): color_scheme.add_argument( "--fill-to-stroke", action = "store_true", - help = "Whether fill paths need to be stroked.", + help = "Only draw borders around fill areas using the `path_stroke` color, instead of filling with the `path_fill` color.", ) -class PILSaver: +class SavingReceiver: - def __init__(self, fn_args): - self._fn_args = fn_args + def __init__(self, path_parts): + self._path_parts = path_parts + + def get_path(self, i): + output_dir, prefix, n_digits, format = self._path_parts + return output_dir / (prefix + "%0*d.%s" % (n_digits, i+1, format)) + + +class PILReceiver (SavingReceiver): def __call__(self, bitmap, index): - out_dir, out_prefix, out_suffix, n_digits, format = self._fn_args - out_path = out_dir / (out_prefix + out_suffix % (n_digits, index+1, format)) - pil_image = pdfium.PdfBitmap.to_pil(bitmap) - pil_image.save(out_path) - # return out_path + out_path = self.get_path(index) + bitmap.to_pil().save(out_path) + logger.info(f"Wrote page {index+1} as {out_path.name}") + + +def _render_parallel_init(extra_init, input, password, may_init_forms, kwargs, receiver): + + if extra_init: + extra_init() + + logger.info(f"Initializing data for process {os.getpid()}") + + pdf = pdfium.PdfDocument(input, password=password, autoclose=True) + if may_init_forms: + pdf.init_forms() + + global ProcObjs + ProcObjs = (pdf, kwargs, receiver) + + +def _render_parallel_job(i): + # logger.info(f"Started page {i+1} ...") + global ProcObjs + pdf, kwargs, receiver = ProcObjs + page = pdf[i] + bitmap = page.render(**kwargs) + receiver(bitmap, i) + page.close() def main(args): + # TODO turn into a python-usable API yielding output paths as they arrive + pdf = get_input(args) - pdf.init_forms() - - cs_kwargs = dict( - path_fill = args.path_fill, - path_stroke = args.path_stroke, - text_fill = args.text_fill, - text_stroke = args.text_stroke, - ) - cs = None - if all(cs_kwargs.values()): - cs = pdfium.PdfColorScheme( - fill_to_stroke = args.fill_to_stroke, - **cs_kwargs, - ) - elif any(cs_kwargs.values()): - raise ValueError("If rendering with custom color scheme, all parameters need to be set explicitly.") + # TODO move to parsers? + pdf_len = len(pdf) + if not all(0 <= i < pdf_len for i in args.pages): + raise ValueError("Out-of-bounds page indices are prohibited.") + if len(args.pages) != len(set(args.pages)): + raise ValueError("Duplicate page indices are prohibited.") + + if not args.prefix: + args.prefix = f"{args.input.stem}_" + if not args.fill_color: + args.fill_color = (0, 0, 0, 255) if args.sample_theme else (255, 255, 255, 255) + + cs_kwargs = dict() + if args.sample_theme: + cs_kwargs.update(**SampleTheme) + cs_kwargs.update(**{f: getattr(args, f) for f in CsFields if getattr(args, f)}) + cs = pdfium.PdfColorScheme(**cs_kwargs) if len(cs_kwargs) > 0 else None + + may_draw_forms = not args.no_forms kwargs = dict( - page_indices = args.pages, - n_processes = args.processes, scale = args.scale, rotation = args.rotation, crop = args.crop, grayscale = args.grayscale, fill_color = args.fill_color, color_scheme = cs, + fill_to_stroke = args.fill_to_stroke, optimize_mode = args.optimize_mode, draw_annots = not args.no_annotations, - may_draw_forms = not args.no_forms, + may_draw_forms = may_draw_forms, force_halftone = args.force_halftone, rev_byteorder = args.rev_byteorder, prefer_bgrx = args.prefer_bgrx, + bitmap_maker = BitmapMakers[args.bitmap_maker], ) for type in args.no_antialias: kwargs[f"no_smooth{type}"] = True - n_digits = len(str( max(args.pages)+1 )) - converter = PILSaver( (args.output, args.input.stem, "_%0*d.%s", n_digits, args.format) ) + n_digits = len(str(pdf_len)) + path_parts = (args.output, args.prefix, n_digits, args.format) + receiver = PILReceiver(path_parts) - for _ in pdf.render(converter, **kwargs): pass + if len(args.pages) <= args.linear: + + logger.info("Linear rendering ...") + if may_draw_forms: + pdf.init_forms() + + for i in args.pages: + # logger.info(f"Started page {i+1} ...") + page = pdf[i] + bitmap = page.render(**kwargs) + receiver(bitmap, i) + + else: + + logger.info("Parallel rendering ...") + + ctx = mp.get_context(args.parallel_strategy) + # TODO unify using mp.pool.Pool(context=...) ? + pool_backends = dict( + mp = (ctx.Pool, "imap"), + ft = (functools.partial(ft.ProcessPoolExecutor, mp_context=ctx), "map"), + ) + pool_ctor, map_attr = pool_backends[args.parallel_lib] + if args.parallel_map: + map_attr = args.parallel_map + + extra_init = (setup_logging if args.parallel_strategy in ("spawn", "forkserver") else None) + pool_kwargs = dict( + initializer = _render_parallel_init, + initargs = (extra_init, pdf._input, args.password, may_draw_forms, kwargs, receiver), + ) + + n_procs = min(args.processes, len(args.pages)) + with pool_ctor(n_procs, **pool_kwargs) as pool: + map_func = getattr(pool, map_attr) + for _ in map_func(_render_parallel_job, args.pages): + pass diff --git a/src/pypdfium2/_helpers/bitmap.py b/src/pypdfium2/_helpers/bitmap.py index e0222f8cc..b80af1536 100644 --- a/src/pypdfium2/_helpers/bitmap.py +++ b/src/pypdfium2/_helpers/bitmap.py @@ -239,7 +239,6 @@ def to_pil(self): return image - # FIXME might want to rename *recopy* to *mutable* ? @classmethod def from_pil(cls, pil_image, recopy=False): """ @@ -247,18 +246,16 @@ def from_pil(cls, pil_image, recopy=False): Due to the restricted number of color formats and bit depths supported by PDFium's bitmap implementation, this may be a lossy operation. + Bitmaps returned by this function should be treated as immutable (i.e. don't call :meth:`.fill_rect`). + Parameters: pil_image (PIL.Image.Image): The image. - recopy (bool): - If False (the default), reuse the memory segment of an immutable bytes object as buffer to avoid an additional layer of copying. This is recommended if you do not modify the bitmap, though the buffer does not actually enforce immutability. - If True (otherwise), copy memory into a new buffer that is mutable by design. This is recommended if you modify the bitmap, e.g. using :meth:`.fill_rect`. - Note that the resulting bitmap is always independent of the PIL image, regardless of this option. Returns: PdfBitmap: PDFium bitmap (with a copy of the PIL image's data). - .. versionchanged:: 4.15 reference bytes object instead of copying - .. versionadded:: 4.16 opt-in re-copying for mutability within Python API contract + .. deprecated:: 4.25 + The *recopy* parameter has been deprecated. """ if pil_image.mode in pdfium_i.BitmapStrToConst: @@ -270,14 +267,12 @@ def from_pil(cls, pil_image, recopy=False): py_buffer = pil_image.tobytes() if recopy: - c_buffer = (ctypes.c_ubyte * len(py_buffer)).from_buffer_copy(py_buffer) + buffer = (ctypes.c_ubyte * len(py_buffer)).from_buffer_copy(py_buffer) else: - # see docs above and https://stackoverflow.com/a/21490290/15547292 - c_buffer = ctypes.cast(py_buffer, ctypes.POINTER(ctypes.c_ubyte * len(py_buffer))).contents - weakref.finalize(c_buffer, lambda: id(py_buffer)) + buffer = py_buffer w, h = pil_image.size - return cls.new_native(w, h, format, rev_byteorder=False, buffer=c_buffer) + return cls.new_native(w, h, format, rev_byteorder=False, buffer=buffer) # TODO implement from_numpy() diff --git a/src/pypdfium2/_helpers/document.py b/src/pypdfium2/_helpers/document.py index 499477c17..a3a8de6d6 100644 --- a/src/pypdfium2/_helpers/document.py +++ b/src/pypdfium2/_helpers/document.py @@ -7,6 +7,7 @@ import ctypes import logging import inspect +import warnings from pathlib import Path from collections import namedtuple import multiprocessing as mp @@ -569,96 +570,30 @@ def render( converter, renderer = PdfPage.render, page_indices = None, - n_processes = os.cpu_count(), pass_info = False, - mk_formconfig = None, + n_processes = None, # ignored, retained for compat + mk_formconfig = None, # ignored, retained for compat **kwargs ): """ .. deprecated:: 4.19 - This method will be removed with the next major release (v5) due to serious conceptual problems. See the upcoming changelog for more info. + This method will be removed with the next major release due to serious issues rooted in the original API design. Use :meth:`PdfPage.render()` instead. See the upcoming changelog or :issue:`#281` for more info. + *Note that the CLI provides parallel rendering using a proper caller-side process pool with inline saving in rendering jobs.* - .. versionchanged:: 4.19 - Fixed some major non-API implementation issues. - - Render multiple pages in parallel, using a process pool executor. - - Hint: - If your code shall be frozen into an executable, :func:`multiprocessing.freeze_support` - needs to be called at the start of the ``if __name__ == "__main__":`` block if using this method. - - Parameters: - converter (typing.Callable): - A function to convert the rendering output. See :class:`.PdfBitmap` for built-in converters. - page_indices (list[int] | None): - A sequence of zero-based indices of the pages to render. Duplicate page indices are prohibited. - If None, all pages will be included. The order of results is supposed to match the order of given page indices. - n_processes (int): - The number of parallel process to use. - renderer (typing.Callable): - The page rendering function to use. This may be used to plug in custom renderers other than :meth:`.PdfPage.render`. - mk_formconfig (typing.Callable[FPDF_FORMFILLINFO] | None): - Optional callback returning a custom form config to use when initializing a form env in worker jobs. - kwargs (dict): - Keyword arguments to the renderer. - - Yields: - :data:`typing.Any`: Result as returned by the given converter. + .. versionchanged:: 4.25 + Removed the original process pool implementation and turned this into a wrapper for linear rendering, due to the serious conceptual issues and possible memory load escalation, especially with expensive receiving code (e.g. PNG encoding) or long documents. """ - # TODO(apibreak) remove mk_formconfig parameter (bloat) + warnings.warn("The document-level pdf.render() API is deprecated and uncored due to serious issues in the original concept. Use page.render() and a caller-side loop or process pool.", category=DeprecationWarning) - if not isinstance(self._input, (Path, str, bytes)): - raise ValueError(f"Cannot render in parallel with input type '{type(self._input).__name__}'.") - - n_pages = len(self) if not page_indices: - page_indices = [i for i in range(n_pages)] - else: - if not all(0 <= i < n_pages for i in page_indices): - raise ValueError("Out-of-bounds page indices are prohibited.") - if len(page_indices) != len(set(page_indices)): - raise ValueError("Duplicate page indices are prohibited.") - - converter_params = list( inspect.signature(converter).parameters )[1:] - pool_kwargs = dict( - initializer = _parallel_renderer_init, - initargs = (self._input, self._password, bool(self.formenv), mk_formconfig, renderer, converter, converter_params, pass_info, kwargs), - ) - n_processes = min(n_processes, n_pages) - with mp.Pool(n_processes, **pool_kwargs) as pool: - yield from pool.imap(_parallel_renderer_job, page_indices) - - -def _parallel_renderer_init(input_data, password, need_formenv, mk_formconfig, renderer, converter, converter_params, pass_info, kwargs): - - logger.info(f"Initializing PID {os.getpid()}") - - pdf = PdfDocument(input_data, password=password) - if need_formenv: - pdf.init_forms(config=mk_formconfig()) if mk_formconfig else pdf.init_forms() - - global _ParallelRenderObjs - _ParallelRenderObjs = (pdf, renderer, converter, converter_params, pass_info, kwargs) - - -def _parallel_renderer_job(index): - - logger.info(f"Starting page {index}") - - global _ParallelRenderObjs - pdf, renderer, converter, converter_params, pass_info, kwargs = _ParallelRenderObjs - - page = pdf[index] - bitmap = renderer(page, **kwargs) - info = bitmap.get_info() - page.close() - - # in principle, we could expose any local variables here - local_vars = dict(index=index) - result = converter(bitmap, **{p: local_vars[p] for p in converter_params}) - - return (result, info) if pass_info else result + page_indices = [i for i in range(len(self))] + for i in page_indices: + bitmap = renderer(self[i], **kwargs) + if pass_info: + yield (converter(bitmap), bitmap.get_info()) + else: + yield converter(bitmap) class PdfFormEnv (pdfium_i.AutoCloseable): diff --git a/tests_old/test_renderer.py b/tests_old/test_renderer.py index f94902b4d..bd325ffa4 100644 --- a/tests_old/test_renderer.py +++ b/tests_old/test_renderer.py @@ -294,9 +294,7 @@ def test_render_pdf_new(): page_1 = pdf.new_page(50, 100) page_2 = pdf.new_page(50, 100) renderer = pdf.render(pdfium.PdfBitmap.to_pil) - - with pytest.raises(ValueError): - next(renderer) + bitmap_p1 = next(renderer) def test_render_pdfbuffer(): @@ -305,8 +303,7 @@ def test_render_pdfbuffer(): pdf = pdfium.PdfDocument(buffer) renderer = pdf.render(pdfium.PdfBitmap.to_pil) - with pytest.raises(ValueError): - next(renderer) + bitmap_p1 = next(renderer) @pytest.mark.parametrize(