From 78baa8f30bf9f2acabd20c9efdc26a4b81042999 Mon Sep 17 00:00:00 2001 From: Keith Owens Date: Sat, 14 Sep 2024 18:56:00 +1000 Subject: [PATCH] BUG: Warn when visitor* arguments are ignored (#2845) visitor* function arguments are silently ignored when extraction_mode="layout". Document this a bit better and add a warning when these arguments are ignored. Closes #2840. --- pypdf/_page.py | 14 ++++++++++++++ tests/test_text_extraction.py | 16 ++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/pypdf/_page.py b/pypdf/_page.py index d4ba13134..471256eec 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -2172,19 +2172,24 @@ def extract_text( default = (0, 90, 180, 270) note: currently only 0 (up),90 (turned left), 180 (upside down), 270 (turned right) + Silently ignored in "layout" mode. space_width: force default space width if not extracted from font (default: 200) + Silently ignored in "layout" mode. visitor_operand_before: function to be called before processing an operation. It has four arguments: operator, operand-arguments, current transformation matrix and text matrix. + Ignored with a warning in "layout" mode. visitor_operand_after: function to be called after processing an operation. It has four arguments: operator, operand-arguments, current transformation matrix and text matrix. + Ignored with a warning in "layout" mode. visitor_text: function to be called when extracting some text at some position. It has five arguments: text, current transformation matrix, text matrix, font-dictionary and font-size. The font-dictionary may be None in case of unknown fonts. If not None it may e.g. contain key "/BaseFont" with value "/Arial,Bold". + Ignored with a warning in "layout" mode. extraction_mode (Literal["plain", "layout"]): "plain" for legacy functionality, "layout" for experimental layout mode functionality. NOTE: orientations, space_width, and visitor_* parameters are NOT respected @@ -2213,6 +2218,15 @@ def extract_text( if extraction_mode not in ["plain", "layout"]: raise ValueError(f"Invalid text extraction mode '{extraction_mode}'") if extraction_mode == "layout": + for visitor in ("visitor_operand_before", + "visitor_operand_after", + "visitor_text", + ): + if locals()[visitor]: + logger_warning( + f"Argument {visitor} is ignored in layout mode", + __name__, + ) return self._layout_mode_text( space_vertically=kwargs.get("layout_mode_space_vertically", True), scale_weight=kwargs.get("layout_mode_scale_weight", 1.25), diff --git a/tests/test_text_extraction.py b/tests/test_text_extraction.py index dcd4e6cae..2f0eaad1d 100644 --- a/tests/test_text_extraction.py +++ b/tests/test_text_extraction.py @@ -5,6 +5,7 @@ """ from io import BytesIO from pathlib import Path +from unittest.mock import patch import pytest @@ -173,3 +174,18 @@ def test_layout_mode_indirect_sequence_font_widths(): with pytest.raises(ParseError) as exc: reader.pages[0].extract_text(extraction_mode="layout") assert str(exc.value).startswith("Invalid font width definition") + +def dummy_visitor_text(text, ctm, tm, fd, fs): + pass + +@patch("pypdf._page.logger_warning") +def test_layout_mode_warnings(mock_logger_warning): + # Check that a warning is issued when an argument is ignored + reader = PdfReader(RESOURCE_ROOT / "hello-world.pdf") + page = reader.pages[0] + page.extract_text(extraction_mode="plain", visitor_text=dummy_visitor_text) + mock_logger_warning.assert_not_called() + page.extract_text(extraction_mode="layout", visitor_text=dummy_visitor_text) + mock_logger_warning.assert_called_with( + "Argument visitor_text is ignored in layout mode", "pypdf._page" + )