Skip to content

Commit

Permalink
Remove PdfDocument._rendering_input and related features
Browse files Browse the repository at this point in the history
In principle, this is an API-breaking change, but this part was so
insignificant that I have no objection to removing it without a major
release.
  • Loading branch information
mara004 committed Oct 11, 2022
1 parent ba76f9b commit f1f510c
Show file tree
Hide file tree
Showing 6 changed files with 28 additions and 71 deletions.
3 changes: 3 additions & 0 deletions docs/devel/changelog_staging.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,6 @@
<!-- List character: dash (-) -->

# Changelog for next release
- Disruption: Two components of `PdfDocument` have been removed to clean up the code (without a major release, due to their insignificance):
- Removal of `update_rendering_input()`. Callers are expected to save and re-open the document on their own if they want changes to take effect with the multi-page renderer.
- The multipage renderer does not implicitly read byte buffers into memory anymore. Callers are expected to take an explicit decision by providing a different input in the first place.
1 change: 0 additions & 1 deletion docs/devel/tasks.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ Also see the issues panel and inline `TODO` marks in source code.
* Ensure we correctly handle PDFium return codes indicating failure.
* Review on a case-by-case basis where we should raise an error and where pass.
* Investigate if we can implement interruptible rendering.
* When rendering with multiple processes and bytes were provided as input, is the memory duplicated or shared? If it's duplicated, find a way to share it or write a tempfile instead.
* Move init/destroy into a separate file. Provide public init/destroy functions, given that embedders who deal with long-running applications might not want to have PDFium in memory all the time.
* Make the bindings file `_pypdfium.py` public ?

Expand Down
7 changes: 1 addition & 6 deletions docs/source/planned_changes.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,5 @@ The following API breaking changes are being considered for the next major relea
- The textpage API will change
* The `count_chars()` alias will be removed in favour of the `n_chars` attribute.
* The `get_text()` alias will be removed in favour of `get_text_bounded()`.
- The `PdfDocument` class will be cleaned up:
* The context manager API will be removed. It will not be possible to use documents in a `with`-block anymore.
* The `update_rendering_input()` method will be removed.
Callers are expected to save and re-open the document on their own if they want changes to take effect with the multi-page renderer.
* The multipage renderer will not implicitly read byte buffers into memory anymore.
Callers are expected to take an explicit decision by providing a different input in the first place.
- The `PdfDocument` context manager API will be removed. It will not be possible to use documents in a `with`-block anymore.
- `PdfDocument.add_font()` might be changed to take bytes rather than a file path.
41 changes: 10 additions & 31 deletions src/pypdfium2/_helpers/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,11 +76,7 @@ def __init__(

self._orig_input = input_data
self._actual_input = input_data
self._rendering_input = None
self._ld_data = None
self._form_env = None
self._form_config = None
self._form_finalizer = None

self._password = password
self._file_access = file_access
Expand Down Expand Up @@ -109,6 +105,10 @@ def __init__(
else:
self.raw, self._ld_data = open_pdf(self._actual_input, self._password)

self._form_env = None
self._form_config = None
self._form_finalizer = None

self._finalizer = weakref.finalize(
self, self._static_close,
self.raw, self._ld_data, self._autoclose, self._actual_input,
Expand Down Expand Up @@ -494,19 +494,6 @@ def print_toc(toc, n_digits=2):
)


def update_rendering_input(self):
"""
Update the input sources for concurrent rendering to the document's current state
by saving to bytes and setting the result as new input.
If you modified the document, you may want to call this method before :meth:`.render_to`.
"""
buffer = io.BytesIO()
self.save(buffer)
buffer.seek(0)
self._rendering_input = buffer.read()
buffer.close()


@classmethod
def _process_page(cls, index, converter, input_data, password, file_access, **kwargs):
pdf = cls(
Expand All @@ -527,7 +514,7 @@ def render_to(
**kwargs
):
"""
Concurrently render multiple pages, using a process pool executor.
Render multiple pages in parallel, using a process pool executor.
If rendering only a single page, the call is simply forwarded to :meth:`.PdfPage.render_to` as a shortcut.
Expand Down Expand Up @@ -560,23 +547,15 @@ def render_to(
yield result
return

if self._rendering_input is None:
if isinstance(self._orig_input, pdfium.FPDF_DOCUMENT):
logger.warning("Cannot perform concurrent processing without input sources - saving the document implicitly to get picklable data.")
self.update_rendering_input()
elif is_input_buffer(self._orig_input):
logger.warning("Cannot perform concurrent rendering with buffer input - reading the whole buffer into memory implicitly.")
cursor = self._orig_input.tell()
self._orig_input.seek(0)
self._rendering_input = self._orig_input.read()
self._orig_input.seek(cursor)
else:
self._rendering_input = self._orig_input
if isinstance(self._orig_input, pdfium.FPDF_DOCUMENT):
raise ValueError("Cannot render in parallel without input sources.")
elif is_input_buffer(self._orig_input):
raise ValueError("Cannot render in parallel with buffer input.")

invoke_renderer = functools.partial(
PdfDocument._process_page,
converter = converter,
input_data = self._rendering_input,
input_data = self._orig_input,
password = self._password,
file_access = self._file_access,
**kwargs
Expand Down
1 change: 0 additions & 1 deletion tests/helpers/test_opener.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,6 @@ def test_open_new():
assert isinstance(dest_pdf, pdfium.PdfDocument)
assert isinstance(dest_pdf.raw, pdfium.FPDF_DOCUMENT)
assert dest_pdf.raw is dest_pdf._orig_input is dest_pdf._actual_input
assert dest_pdf._rendering_input is None
assert dest_pdf._ld_data is None

assert dest_pdf.get_version() is None
Expand Down
46 changes: 14 additions & 32 deletions tests/helpers/test_renderer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause

import io
import re
import math
import ctypes
import logging
Expand Down Expand Up @@ -377,44 +378,32 @@ def test_render_pdffile(render_pdffile_topil, render_pdffile_tobytes, render_pdf
assert a == b == c


def test_render_pdf_new(caplog):
def test_render_pdf_new():

pdf = pdfium.PdfDocument.new()
# two pages to actually reach the process pool and not just the single-page shortcut
pdf = pdfium.PdfDocument.new()
page_1 = pdf.new_page(50, 100)
page_2 = pdf.new_page(50, 100)
renderer = pdf.render_to(
pdfium.BitmapConv.pil_image,
)

with caplog.at_level(logging.WARNING):
renderer = pdf.render_to(pdfium.BitmapConv.pil_image)
with pytest.raises(ValueError, match="Cannot render in parallel without input sources."):
image = next(renderer)

warning = "Cannot perform concurrent processing without input sources - saving the document implicitly to get picklable data."
assert warning in caplog.text

assert isinstance(image, PIL.Image.Image)
assert image.mode == "RGB"
assert image.size == (50, 100)


def test_render_pdfbuffer(caplog):

def test_render_pdfbuffer():

buffer = open(TestFiles.multipage, "rb")
pdf = pdfium.PdfDocument(buffer)
assert pdf._orig_input is buffer
assert pdf._actual_input is buffer
assert pdf._rendering_input is None

with caplog.at_level(logging.WARNING):
renderer = pdf.render_to(
pdfium.BitmapConv.pil_image,
scale = 0.5,
)
image = next(renderer)
assert isinstance(image, PIL.Image.Image)

assert isinstance(pdf._rendering_input, bytes)
warning = "Cannot perform concurrent rendering with buffer input - reading the whole buffer into memory implicitly."
assert warning in caplog.text
renderer = pdf.render_to(
pdfium.BitmapConv.pil_image,
)
with pytest.raises(ValueError, match=re.escape("Cannot render in parallel with buffer input.")):
next(renderer)


def test_render_pdfbytes():
Expand All @@ -425,14 +414,12 @@ def test_render_pdfbytes():
pdf = pdfium.PdfDocument(data)
assert pdf._orig_input is data
assert pdf._actual_input is data
assert pdf._rendering_input is None
renderer = pdf.render_to(
pdfium.BitmapConv.pil_image,
scale = 0.5,
)
image = next(renderer)
assert isinstance(image, PIL.Image.Image)
assert isinstance(pdf._rendering_input, bytes)


def test_render_pdffile_asbuffer():
Expand All @@ -441,7 +428,6 @@ def test_render_pdffile_asbuffer():

assert pdf._orig_input == TestFiles.multipage
assert isinstance(pdf._actual_input, io.BufferedReader)
assert pdf._rendering_input is None
assert pdf._file_access is pdfium.FileAccess.BUFFER

renderer = pdf.render_to(
Expand All @@ -451,8 +437,6 @@ def test_render_pdffile_asbuffer():
image = next(renderer)
assert isinstance(image, PIL.Image.Image)

assert pdf._rendering_input == TestFiles.multipage

pdf.close()
assert pdf._actual_input.closed is True

Expand All @@ -463,7 +447,6 @@ def test_render_pdffile_asbytes():

assert pdf._orig_input == TestFiles.multipage
assert isinstance(pdf._actual_input, bytes)
assert pdf._rendering_input is None
assert pdf._file_access is pdfium.FileAccess.BYTES

renderer = pdf.render_to(
Expand All @@ -472,7 +455,6 @@ def test_render_pdffile_asbytes():
)
image = next(renderer)
assert isinstance(image, PIL.Image.Image)
assert pdf._rendering_input == TestFiles.multipage


@pytest.mark.parametrize(
Expand Down

0 comments on commit f1f510c

Please sign in to comment.