Remove PdfDocument._rendering_input and related features

In principle, this is an API-breaking change, but this part was so insignificant that I have no objection to removing it without a major release.
pypdfium2-team · Oct 11, 2022 · f1f510c · f1f510c
1 parent ba76f9b
commit f1f510c
Show file tree

Hide file tree

Showing 6 changed files with 28 additions and 71 deletions.
diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md
@@ -4,3 +4,6 @@
 <!-- List character: dash (-) -->
 
 # Changelog for next release
+- Disruption: Two components of `PdfDocument` have been removed to clean up the code (without a major release, due to their insignificance):
+  - Removal of `update_rendering_input()`. Callers are expected to save and re-open the document on their own if they want changes to take effect with the multi-page renderer.
+  - The multipage renderer does not implicitly read byte buffers into memory anymore. Callers are expected to take an explicit decision by providing a different input in the first place.
diff --git a/docs/devel/tasks.md b/docs/devel/tasks.md
@@ -13,7 +13,6 @@ Also see the issues panel and inline `TODO` marks in source code.
 * Ensure we correctly handle PDFium return codes indicating failure.
 * Review on a case-by-case basis where we should raise an error and where pass.
 * Investigate if we can implement interruptible rendering.
-* When rendering with multiple processes and bytes were provided as input, is the memory duplicated or shared? If it's duplicated, find a way to share it or write a tempfile instead.
 * Move init/destroy into a separate file. Provide public init/destroy functions, given that embedders who deal with long-running applications might not want to have PDFium in memory all the time.
 * Make the bindings file `_pypdfium.py` public ?
 

diff --git a/docs/source/planned_changes.md b/docs/source/planned_changes.md
@@ -9,10 +9,5 @@ The following API breaking changes are being considered for the next major relea
 - The textpage API will change
   * The `count_chars()` alias will be removed in favour of the `n_chars` attribute.
   * The `get_text()` alias will be removed in favour of `get_text_bounded()`.
-- The `PdfDocument` class will be cleaned up:
-  * The context manager API will be removed. It will not be possible to use documents in a `with`-block anymore.
-  * The `update_rendering_input()` method will be removed.
-    Callers are expected to save and re-open the document on their if they wish that changes take effect with the multi-page renderer.
-  * The multipage renderer will not implicitly read byte buffers into memory anymore.
-    Callers are expected to take an explicit decision by providing a different input in the first place.
+- The `PdfDocument` context manager API will be removed. It will not be possible to use documents in a `with`-block anymore.
 - `PdfDocument.add_font()` might be changed to take bytes rather than a file path.
diff --git a/src/pypdfium2/_helpers/document.py b/src/pypdfium2/_helpers/document.py
@@ -76,11 +76,7 @@ def __init__(
 
         self._orig_input = input_data
         self._actual_input = input_data
-        self._rendering_input = None
         self._ld_data = None
-        self._form_env = None
-        self._form_config = None
-        self._form_finalizer = None
 
         self._password = password
         self._file_access = file_access
@@ -109,6 +105,10 @@ def __init__(
         else:
             self.raw, self._ld_data = open_pdf(self._actual_input, self._password)
 
+        self._form_env = None
+        self._form_config = None
+        self._form_finalizer = None
+
         self._finalizer = weakref.finalize(
             self, self._static_close,
             self.raw, self._ld_data, self._autoclose, self._actual_input,
@@ -494,19 +494,6 @@ def print_toc(toc, n_digits=2):
             )
 
 
-    def update_rendering_input(self):
-        """
-        Update the input sources for concurrent rendering to the document's current state
-        by saving to bytes and setting the result as new input.
-        If you modified the document, you may want to call this method before :meth:`.render_to`.
-        """
-        buffer = io.BytesIO()
-        self.save(buffer)
-        buffer.seek(0)
-        self._rendering_input = buffer.read()
-        buffer.close()
-
-
     @classmethod
     def _process_page(cls, index, converter, input_data, password, file_access, **kwargs):
         pdf = cls(
@@ -527,7 +514,7 @@ def render_to(
             **kwargs
         ):
         """
-        Concurrently render multiple pages, using a process pool executor.
+        Render multiple pages in parallel, using a process pool executor.
         
         If rendering only a single page, the call is simply forwarded to :meth:`.PdfPage.render_to` as a shortcut.
         
@@ -560,23 +547,15 @@ def render_to(
             yield result
             return
 
-        if self._rendering_input is None:
-            if isinstance(self._orig_input, pdfium.FPDF_DOCUMENT):
-                logger.warning("Cannot perform concurrent processing without input sources - saving the document implicitly to get picklable data.")
-                self.update_rendering_input()
-            elif is_input_buffer(self._orig_input):
-                logger.warning("Cannot perform concurrent rendering with buffer input - reading the whole buffer into memory implicitly.")
-                cursor = self._orig_input.tell()
-                self._orig_input.seek(0)
-                self._rendering_input = self._orig_input.read()
-                self._orig_input.seek(cursor)
-            else:
-                self._rendering_input = self._orig_input
+        if isinstance(self._orig_input, pdfium.FPDF_DOCUMENT):
+            raise ValueError("Cannot render in parallel without input sources.")
+        elif is_input_buffer(self._orig_input):
+            raise ValueError("Cannot render in parallel with buffer input.")
 
         invoke_renderer = functools.partial(
             PdfDocument._process_page,
             converter = converter,
-            input_data = self._rendering_input,
+            input_data = self._orig_input,
             password = self._password,
             file_access = self._file_access,
             **kwargs

diff --git a/tests/helpers/test_opener.py b/tests/helpers/test_opener.py
@@ -166,7 +166,6 @@ def test_open_new():
     assert isinstance(dest_pdf, pdfium.PdfDocument)
     assert isinstance(dest_pdf.raw, pdfium.FPDF_DOCUMENT)
     assert dest_pdf.raw is dest_pdf._orig_input is dest_pdf._actual_input
-    assert dest_pdf._rendering_input is None
     assert dest_pdf._ld_data is None
 
     assert dest_pdf.get_version() is None

diff --git a/tests/helpers/test_renderer.py b/tests/helpers/test_renderer.py
@@ -2,6 +2,7 @@
 # SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
 
 import io
+import re
 import math
 import ctypes
 import logging
@@ -377,44 +378,32 @@ def test_render_pdffile(render_pdffile_topil, render_pdffile_tobytes, render_pdf
         assert a == b == c
 
 
-def test_render_pdf_new(caplog):
+def test_render_pdf_new():
 
-    pdf = pdfium.PdfDocument.new()
     # two pages to actually reach the process pool and not just the single-page shortcut
+    pdf = pdfium.PdfDocument.new()
     page_1 = pdf.new_page(50, 100)
     page_2 = pdf.new_page(50, 100)
+    renderer = pdf.render_to(
+        pdfium.BitmapConv.pil_image,
+    )
 
-    with caplog.at_level(logging.WARNING):
-        renderer = pdf.render_to(pdfium.BitmapConv.pil_image)
+    with pytest.raises(ValueError, match="Cannot render in parallel without input sources."):
         image = next(renderer)
-
-    warning = "Cannot perform concurrent processing without input sources - saving the document implicitly to get picklable data."
-    assert warning in caplog.text
-
-    assert isinstance(image, PIL.Image.Image)
-    assert image.mode == "RGB"
-    assert image.size == (50, 100)
-
 
-def test_render_pdfbuffer(caplog):
+
+def test_render_pdfbuffer():
 
     buffer = open(TestFiles.multipage, "rb")
     pdf = pdfium.PdfDocument(buffer)
     assert pdf._orig_input is buffer
     assert pdf._actual_input is buffer
-    assert pdf._rendering_input is None
 
-    with caplog.at_level(logging.WARNING):
-        renderer = pdf.render_to(
-            pdfium.BitmapConv.pil_image,
-            scale = 0.5,
-        )
-        image = next(renderer)
-        assert isinstance(image, PIL.Image.Image)
-
-    assert isinstance(pdf._rendering_input, bytes)
-    warning = "Cannot perform concurrent rendering with buffer input - reading the whole buffer into memory implicitly."
-    assert warning in caplog.text
+    renderer = pdf.render_to(
+        pdfium.BitmapConv.pil_image,
+    )
+    with pytest.raises(ValueError, match=re.escape("Cannot render in parallel with buffer input.")):
+        next(renderer)
 
 
 def test_render_pdfbytes():
@@ -425,14 +414,12 @@ def test_render_pdfbytes():
     pdf = pdfium.PdfDocument(data)
     assert pdf._orig_input is data
     assert pdf._actual_input is data
-    assert pdf._rendering_input is None
     renderer = pdf.render_to(
         pdfium.BitmapConv.pil_image,
         scale = 0.5,
     )
     image = next(renderer)
     assert isinstance(image, PIL.Image.Image)
-    assert isinstance(pdf._rendering_input, bytes)
 
 
 def test_render_pdffile_asbuffer():
@@ -441,7 +428,6 @@ def test_render_pdffile_asbuffer():
 
     assert pdf._orig_input == TestFiles.multipage
     assert isinstance(pdf._actual_input, io.BufferedReader)
-    assert pdf._rendering_input is None
     assert pdf._file_access is pdfium.FileAccess.BUFFER
 
     renderer = pdf.render_to(
@@ -451,8 +437,6 @@ def test_render_pdffile_asbuffer():
     image = next(renderer)
     assert isinstance(image, PIL.Image.Image)
 
-    assert pdf._rendering_input == TestFiles.multipage
-
     pdf.close()
     assert pdf._actual_input.closed is True
 
@@ -463,7 +447,6 @@ def test_render_pdffile_asbytes():
 
     assert pdf._orig_input == TestFiles.multipage
     assert isinstance(pdf._actual_input, bytes)
-    assert pdf._rendering_input is None
     assert pdf._file_access is pdfium.FileAccess.BYTES
 
     renderer = pdf.render_to(
@@ -472,7 +455,6 @@ def test_render_pdffile_asbytes():
     )
     image = next(renderer)
     assert isinstance(image, PIL.Image.Image)
-    assert pdf._rendering_input == TestFiles.multipage
 
 
 @pytest.mark.parametrize(