Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Add layout_mode_font_height_weight argument to PageObject.extract_text() #2920

Merged
merged 2 commits into from
Oct 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion pypdf/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -2210,6 +2210,7 @@ def _layout_mode_text(
scale_weight: float = 1.25,
strip_rotated: bool = True,
debug_path: Optional[Path] = None,
font_height_weight: float = 1,
) -> str:
"""
Get text preserving fidelity to source PDF text layout.
Expand All @@ -2229,6 +2230,8 @@ def _layout_mode_text(
- bts.json: text render ops left justified and grouped by BT/ET operators
- bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
Defaults to None.
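font_height_weight: multiplier applied to the font height when inferring
blank lines from the vertical distance between text lines. Values below 1
make blank lines more likely; values above 1 make them less likely.
Defaults to 1.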

Returns:
str: multiline string containing page text in a fixed width format that
Expand Down Expand Up @@ -2260,7 +2263,7 @@ def _layout_mode_text(

char_width = _layout_mode.fixed_char_width(bt_groups, scale_weight)

return _layout_mode.fixed_width_page(ty_groups, char_width, space_vertically)
return _layout_mode.fixed_width_page(ty_groups, char_width, space_vertically, font_height_weight)

def extract_text(
self,
Expand Down Expand Up @@ -2335,6 +2338,8 @@ def extract_text(
- tjs.json: individual text render ops with corresponding transform matrices
- bts.json: text render ops left justified and grouped by BT/ET operators
- bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
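layout_mode_font_height_weight (float): multiplier applied to the font height
when inferring blank lines from the vertical distance between text lines.
Values below 1 make blank lines more likely; values above 1 make them
less likely. Defaults to 1.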

Returns:
The extracted text
Expand All @@ -2358,6 +2363,7 @@ def extract_text(
scale_weight=kwargs.get("layout_mode_scale_weight", 1.25),
strip_rotated=kwargs.get("layout_mode_strip_rotated", True),
debug_path=kwargs.get("layout_mode_debug_path"),
font_height_weight=kwargs.get("layout_mode_font_height_weight", 1)
)
if len(args) >= 1:
if isinstance(args[0], str):
Expand Down
5 changes: 3 additions & 2 deletions pypdf/_text_extraction/_layout_mode/_fixed_width_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -343,7 +343,7 @@ def fixed_char_width(bt_groups: List[BTGroup], scale_weight: float = 1.25) -> fl


def fixed_width_page(
ty_groups: Dict[int, List[BTGroup]], char_width: float, space_vertically: bool
ty_groups: Dict[int, List[BTGroup]], char_width: float, space_vertically: bool, font_height_weight: float
) -> str:
"""
Generate page text from text operations grouped by rendered y coordinate.
Expand All @@ -352,6 +352,7 @@ def fixed_width_page(
ty_groups: dict of text show ops as returned by y_coordinate_groups()
char_width: fixed character width
space_vertically: include blank lines inferred from y distance + font height.
font_height_weight: multiplier applied to the font height before dividing the
y distance by it to count blank lines; smaller values yield more blank lines.

Returns:
str: page text in a fixed width format that closely adheres to the rendered
Expand All @@ -363,7 +364,7 @@ def fixed_width_page(
for y_coord, line_data in ty_groups.items():
if space_vertically and lines:
blank_lines = (
int(abs(y_coord - last_y_coord) / line_data[0]["font_height"]) - 1
int(abs(y_coord - last_y_coord) / (line_data[0]["font_height"] * font_height_weight)) - 1
)
lines.extend([""] * blank_lines)
line = ""
Expand Down
19 changes: 19 additions & 0 deletions resources/crazyones_layout_vertical_space.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
The Crazy Ones
October 14, 1998

Heres to the crazy ones. The misfits. The rebels. The troublemakers.
The round pegs in the square holes.
The ones who see things differently. Theyre not fond of rules. And
they have no respect for the status quo. You can quote them,
disagree with them, glorify or vilify them.
About the only thing you cant do is ignore them. Because they change
things. They invent. They imagine. They heal. They explore. They
create. They inspire. They push the human race forward.
Maybe they have to be crazy.
How else can you stare at an empty canvas and see a work of art? Or
sit in silence and hear a song thats never been written? Or gaze at
a red planet and see a laboratory on wheels?
We make tools for these kinds of people.
While some see them as the crazy ones, we see genius. Because the
people who are crazy enough to think they can change the world,
are the ones who do.
25 changes: 25 additions & 0 deletions resources/crazyones_layout_vertical_space_font_height_weight.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
The Crazy Ones
October 14, 1998

Heres to the crazy ones. The misfits. The rebels. The troublemakers.
The round pegs in the square holes.

The ones who see things differently. Theyre not fond of rules. And
they have no respect for the status quo. You can quote them,
disagree with them, glorify or vilify them.

About the only thing you cant do is ignore them. Because they change
things. They invent. They imagine. They heal. They explore. They
create. They inspire. They push the human race forward.

Maybe they have to be crazy.

How else can you stare at an empty canvas and see a work of art? Or
sit in silence and hear a song thats never been written? Or gaze at
a red planet and see a laboratory on wheels?

We make tools for these kinds of people.

While some see them as the crazy ones, we see genius. Because the
people who are crazy enough to think they can change the world,
are the ones who do.
41 changes: 41 additions & 0 deletions tests/test_text_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,3 +219,44 @@ def test_text_leading_height_unit():
page = reader.pages[0]
extracted = page.extract_text()
assert "Something[cited]\n" in extracted


def _assert_layout_text_matches(page, expected_filename, **extract_kwargs):
    """
    Assert that layout-mode extraction of ``page`` equals a reference file.

    Args:
        page: page object whose ``extract_text`` method is called.
        expected_filename: name of the reference text file below RESOURCE_ROOT.
        **extract_kwargs: extra keyword arguments forwarded to
            ``page.extract_text`` in addition to ``extraction_mode="layout"``.

    Raises:
        AssertionError: if any line, or the text as a whole, differs.
    """
    with open(RESOURCE_ROOT / expected_filename, "rb") as expected_file:
        # Normalise Windows line endings once, before any comparison, so the
        # reference compares equal regardless of how it was checked out.
        expected = expected_file.read().replace(b"\r\n", b"\n")

    actual = page.extract_text(extraction_mode="layout", **extract_kwargs).encode("utf-8")

    # Compare line by line first so a failure points at the first differing
    # line instead of dumping the whole page.
    for actual_line, expected_line in zip(actual.splitlines(), expected.splitlines()):
        assert actual_line == expected_line

    # zip() stops at the shorter input, so the full comparison is still
    # required to catch missing or surplus trailing lines.
    assert actual == expected, (
        "PDF extracted text differs from expected value.\n\n"
        "Expected:\n\n%r\n\nExtracted:\n\n%r\n\n" % (expected, actual)
    )


def test_layout_mode_space_vertically_font_height_weight():
    """Tests layout mode with vertical space and font height weight (issue #2915)"""
    with open(RESOURCE_ROOT / "crazyones.pdf", "rb") as inputfile:
        # Load PDF file from file; extraction below happens while it is open.
        reader = PdfReader(inputfile)
        page = reader.pages[0]

        # Normal behaviour
        _assert_layout_text_matches(
            page,
            "crazyones_layout_vertical_space.txt",
            layout_mode_space_vertically=True,
        )

        # Blank lines are added to truly separate paragraphs
        _assert_layout_text_matches(
            page,
            "crazyones_layout_vertical_space_font_height_weight.txt",
            layout_mode_space_vertically=True,
            layout_mode_font_height_weight=0.85,
        )
Loading