Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: Warn when visitor* arguments are ignored #2845

Merged
merged 1 commit into from
Sep 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions pypdf/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -2172,19 +2172,24 @@ def extract_text(
default = (0, 90, 180, 270)
note: currently only 0 (up), 90 (turned left), 180 (upside down),
270 (turned right)
Silently ignored in "layout" mode.
space_width: force default space width
if not extracted from font (default: 200)
Silently ignored in "layout" mode.
visitor_operand_before: function to be called before processing an operation.
It has four arguments: operator, operand-arguments,
current transformation matrix and text matrix.
Ignored with a warning in "layout" mode.
visitor_operand_after: function to be called after processing an operation.
It has four arguments: operator, operand-arguments,
current transformation matrix and text matrix.
Ignored with a warning in "layout" mode.
visitor_text: function to be called when extracting some text at some position.
It has five arguments: text, current transformation matrix,
text matrix, font-dictionary and font-size.
The font-dictionary may be None in case of unknown fonts.
If not None it may e.g. contain key "/BaseFont" with value "/Arial,Bold".
Ignored with a warning in "layout" mode.
extraction_mode (Literal["plain", "layout"]): "plain" for legacy functionality,
"layout" for experimental layout mode functionality.
NOTE: orientations, space_width, and visitor_* parameters are NOT respected
Expand Down Expand Up @@ -2213,6 +2218,15 @@ def extract_text(
if extraction_mode not in ["plain", "layout"]:
raise ValueError(f"Invalid text extraction mode '{extraction_mode}'")
if extraction_mode == "layout":
for visitor in ("visitor_operand_before",
"visitor_operand_after",
"visitor_text",
):
if locals()[visitor]:
logger_warning(
f"Argument {visitor} is ignored in layout mode",
__name__,
)
return self._layout_mode_text(
space_vertically=kwargs.get("layout_mode_space_vertically", True),
scale_weight=kwargs.get("layout_mode_scale_weight", 1.25),
Expand Down
16 changes: 16 additions & 0 deletions tests/test_text_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
"""
from io import BytesIO
from pathlib import Path
from unittest.mock import patch

import pytest

Expand Down Expand Up @@ -173,3 +174,18 @@ def test_layout_mode_indirect_sequence_font_widths():
with pytest.raises(ParseError) as exc:
reader.pages[0].extract_text(extraction_mode="layout")
assert str(exc.value).startswith("Invalid font width definition")

def dummy_visitor_text(text, ctm, tm, fd, fs):
    """No-op stand-in for a ``visitor_text`` callback.

    Accepts the five positional arguments and discards them; it exists
    only so the tests have a callable to pass as ``visitor_text``.
    """
    return None

@patch("pypdf._page.logger_warning")
def test_layout_mode_warnings(mock_logger_warning):
    """Check that layout mode warns about an ignored ``visitor_text``.

    ``pypdf._page.logger_warning`` is replaced by a mock so the test can
    inspect how it was called by each extraction mode.
    """
    first_page = PdfReader(RESOURCE_ROOT / "hello-world.pdf").pages[0]

    # Plain mode: passing a visitor must not trigger any warning.
    first_page.extract_text(
        extraction_mode="plain",
        visitor_text=dummy_visitor_text,
    )
    mock_logger_warning.assert_not_called()

    # Layout mode: the most recent warning must name the ignored argument
    # and carry the module name of pypdf._page.
    expected_call_args = (
        "Argument visitor_text is ignored in layout mode",
        "pypdf._page",
    )
    first_page.extract_text(
        extraction_mode="layout",
        visitor_text=dummy_visitor_text,
    )
    mock_logger_warning.assert_called_with(*expected_call_args)
Loading