diff --git a/pypdf/_page.py b/pypdf/_page.py index 120e15a19..8b6461082 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -1873,6 +1873,7 @@ def _extract_text( visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None, visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None, visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None, + group_TJ: bool = True, ) -> str: """ See extract_text for most arguments. @@ -1957,16 +1958,12 @@ def process_operation(operator: bytes, operands: List[Any]) -> None: if operator == b"BT": tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] output += text - if visitor_text is not None: - visitor_text(text, memo_cm, memo_tm, cmap[3], font_size) text = "" memo_cm = cm_matrix.copy() memo_tm = tm_matrix.copy() return None elif operator == b"ET": output += text - if visitor_text is not None: - visitor_text(text, memo_cm, memo_tm, cmap[3], font_size) text = "" memo_cm = cm_matrix.copy() memo_tm = tm_matrix.copy() @@ -1999,8 +1996,6 @@ def process_operation(operator: bytes, operands: List[Any]) -> None: cm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] elif operator == b"cm": output += text - if visitor_text is not None: - visitor_text(text, memo_cm, memo_tm, cmap[3], font_size) text = "" cm_matrix = mult( [ @@ -2025,8 +2020,6 @@ def process_operation(operator: bytes, operands: List[Any]) -> None: elif operator == b"Tf": if text != "": output += text # .translate(cmap) - if visitor_text is not None: - visitor_text(text, memo_cm, memo_tm, cmap[3], font_size) text = "" memo_cm = cm_matrix.copy() memo_tm = tm_matrix.copy() @@ -2132,6 +2125,34 @@ def process_operation(operator: bytes, operands: List[Any]) -> None: process_operation(b"TL", [-operands[1]]) process_operation(b"Td", operands) elif operator == b"TJ": + if visitor_text is not None and group_TJ: + # To prevent sending letters instead of words we + # override the visitor temporarily. + visitor_text_before = visitor_text + tm_matrix_before = [ + tm_matrix[0], + tm_matrix[1], + tm_matrix[2], + tm_matrix[3], + tm_matrix[4], + tm_matrix[5], + ] + text_TJ: List[str] = [] + + def visitor_text( + text: str, + cm_matrix: Any, + tm_matrix: Any, + font_dict: Any, + font_size: Any, + ) -> None: + # TODO cases where the current inserting order is kept + if rtl_dir: + # right-to-left + text_TJ.insert(0, text) # noqa + else: + text_TJ.append(text) # noqa + for op in operands[0]: if isinstance(op, (str, bytes)): process_operation(b"Tj", [op]) @@ -2141,10 +2162,17 @@ def process_operation(operator: bytes, operands: List[Any]) -> None: and (text[-1] != " ") ): process_operation(b"Tj", [" "]) + if visitor_text is not None and group_TJ: + visitor_text = visitor_text_before + visitor_text( + "".join(text_TJ), + cm_matrix, + tm_matrix_before, + cmap[3], + font_size, + ) elif operator == b"Do": output += text - if visitor_text is not None: - visitor_text(text, memo_cm, memo_tm, cmap[3], font_size) try: if output[-1] != "\n": output += "\n" @@ -2168,16 +2196,9 @@ def process_operation(operator: bytes, operands: List[Any]) -> None: visitor_operand_before, visitor_operand_after, visitor_text, + group_TJ, ) output += text - if visitor_text is not None: - visitor_text( - text, - memo_cm, - memo_tm, - cmap[3], - font_size, - ) except Exception: logger_warning( f" impossible to decode XFormObject {operands[0]}", @@ -2193,8 +2214,6 @@ def process_operation(operator: bytes, operands: List[Any]) -> None: if visitor_operand_after is not None: visitor_operand_after(operator, operands, cm_matrix, tm_matrix) output += text # just in case of - if text != "" and visitor_text is not None: - visitor_text(text, memo_cm, memo_tm, cmap[3], font_size) return output def extract_text( @@ -2207,6 +2226,7 @@ def extract_text( visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None, visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None, visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None, + group_TJ: bool = True, ) -> str: """ Locate all text drawing commands, in the order they are provided in the @@ -2246,6 +2266,8 @@ def extract_text( text matrix, font-dictionary and font-size. The font-dictionary may be None in case of unknown fonts. If not None it may e.g. contain key "/BaseFont" with value "/Arial,Bold". + group_TJ: True for one call of visitor_text at each TJ, + False for calls of visitor_text at each text-fragment of TJ. Returns: The extracted text @@ -2295,6 +2317,7 @@ def extract_text( visitor_operand_before, visitor_operand_after, visitor_text, + group_TJ, ) def extract_xform_text( @@ -2305,6 +2328,7 @@ def extract_xform_text( visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None, visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None, visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None, + group_TJ: bool = True, ) -> str: """ Extract text from an XObject. @@ -2316,6 +2340,8 @@ def extract_xform_text( visitor_operand_before: visitor_operand_after: visitor_text: + group_TJ: True for one call of visitor_text at each TJ, + False for calls of visitor_text at each text-fragment of TJ. Returns: The extracted text diff --git a/pypdf/_text_extraction/__init__.py b/pypdf/_text_extraction/__init__.py index 37af3cd54..9e1b08164 100644 --- a/pypdf/_text_extraction/__init__.py +++ b/pypdf/_text_extraction/__init__.py @@ -123,7 +123,7 @@ def crlf_space_check( output += text + "\n" if visitor_text is not None: visitor_text( - text + "\n", + "\n", memo_cm, memo_tm, cmap[3], @@ -136,13 +136,21 @@ def crlf_space_check( and (output + text)[-1] != " " ): text += " " + if visitor_text is not None: + visitor_text( + " ", + cm_matrix, + tm_matrix, + cmap[3], + font_size, + ) elif orientation == 180: if delta_y > 0.8 * f: if (output + text)[-1] != "\n": output += text + "\n" if visitor_text is not None: visitor_text( - text + "\n", + "\n", memo_cm, memo_tm, cmap[3], @@ -155,13 +163,21 @@ def crlf_space_check( and (output + text)[-1] != " " ): text += " " + if visitor_text is not None: + visitor_text( + " ", + cm_matrix, + tm_matrix, + cmap[3], + font_size, + ) elif orientation == 90: if delta_x > 0.8 * f: if (output + text)[-1] != "\n": output += text + "\n" if visitor_text is not None: visitor_text( - text + "\n", + "\n", memo_cm, memo_tm, cmap[3], @@ -180,7 +196,7 @@ def crlf_space_check( output += text + "\n" if visitor_text is not None: visitor_text( - text + "\n", + "\n", memo_cm, memo_tm, cmap[3], @@ -193,6 +209,14 @@ def crlf_space_check( and (output + text)[-1] != " " ): text += " " + if visitor_text is not None: + visitor_text( + " ", + cm_matrix, + tm_matrix, + cmap[3], + font_size, + ) except Exception: pass tm_prev = tm_matrix.copy() @@ -214,12 +238,13 @@ def handle_tj( rtl_dir: bool, visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]], ) -> Tuple[str, bool]: - m = mult(tm_matrix, cm_matrix) orientation = orient(m) if orientation in orientations and len(operands) > 0: if isinstance(operands[0], str): text += operands[0] + if visitor_text is not None: + visitor_text(operands[0], cm_matrix, tm_matrix, cmap[3], font_size) else: t: str = "" tt: bytes = ( @@ -243,6 +268,7 @@ def handle_tj( [cmap[0][x] if x in cmap[0] else bytes((x,)).decode() for x in tt] ) # "\u0590 - \u08FF \uFB50 - \uFDFF" + tj_text = "" for x in [cmap[1][x] if x in cmap[1] else x for x in t]: # x can be a sequence of bytes ; ex: habibi.pdf if len(x) == 1: @@ -258,7 +284,7 @@ def handle_tj( or 0x20A0 <= xx <= 0x21FF # but (numbers) indices/exponents or xx in CUSTOM_RTL_SPECIAL_CHARS # customized.... ): - text = x + text if rtl_dir else text + x + tj_text = x + tj_text if rtl_dir else tj_text + x elif ( # right-to-left characters set 0x0590 <= xx <= 0x08FF or 0xFB1D <= xx <= 0xFDFF @@ -280,6 +306,9 @@ def handle_tj( if visitor_text is not None: visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) text = "" - text = text + x + tj_text = tj_text + x # fmt: on + text = tj_text + text if rtl_dir else text + tj_text + if visitor_text is not None: + visitor_text(tj_text, cm_matrix, tm_matrix, cmap[3], font_size) return text, rtl_dir diff --git a/tests/test_page.py b/tests/test_page.py index 1c388c426..a8dbf0a79 100644 --- a/tests/test_page.py +++ b/tests/test_page.py @@ -1,6 +1,7 @@ """Test the pypdf._page module.""" import json import math +import re from copy import deepcopy from io import BytesIO from pathlib import Path @@ -545,7 +546,7 @@ def print_op_b(op, args, cm_matrix, tm_matrix) -> None: rectangles.append(r) def print_visi(text, cm_matrix, tm_matrix, font_dict, font_size) -> None: - if text.strip() != "": + if text != "": if logger.isEnabledFor(logging.DEBUG): logger.debug(f"at {cm_matrix}, {tm_matrix}, font size={font_size}") texts.append( @@ -571,7 +572,7 @@ def extract_table( It is expected that each cell is marked by a rectangle-object. It is expected that the page contains one table only. - It is expected that the table contains at least 3 columns and 2 rows. + It is expected that the table contains at least 2 columns and 2 rows. A list of rows is returned. Each row contains a list of cells. @@ -623,8 +624,8 @@ def extract_table( curr_y = None curr_row = None for r in rectangles_filtered: - if col2count[r.x] < 3 or row2count[r.y] < 2: - # We expect at least 3 columns and 2 rows. + if col2count[r.x] < 2 or row2count[r.y] < 2: + # We expect at least 2 columns and 2 rows. continue if curr_y is None or r.y != curr_y: # next row @@ -646,7 +647,8 @@ def extract_table( def extract_cell_text(cell_texts: List[PositionedText]) -> str: """Joins the text-objects of a cell.""" - return ("".join(t.text for t in cell_texts)).strip() + text_raw = "".join(t.text for t in cell_texts) + return re.sub(r" +\n", "\n", text_raw.strip()) # Test 1: We test the analysis of page 7 "2.1 LRS model". reader = PdfReader(RESOURCE_ROOT / "GeoBase_NHNC1_Data_Model_UML_EN.pdf") @@ -667,12 +669,16 @@ def ignore_large_rectangles(r) -> bool: for t in texts: for r in rectangles: if r.contains(t.x, t.y): - texts = rectangle2texts.setdefault(r, []) - texts.append(t.text.strip()) + rtexts = rectangle2texts.setdefault(r, []) + if t.text != "": + rtexts.append(t.text) break # Five boxes and the figure-description below. - assert len(rectangle2texts) == 6 - box_texts = [" ".join(texts) for texts in rectangle2texts.values()] + assert len(rectangle2texts) == 11 + box_texts = [ + re.sub(" *\n", " ", "".join(texts).strip()) + for texts in rectangle2texts.values() + ] assert "Hydro Network" in box_texts assert "Hydro Events" in box_texts assert "Metadata" in box_texts @@ -697,10 +703,10 @@ def filter_first_table(r) -> bool: assert extract_cell_text(rows[0][2]) == "Description" assert extract_cell_text(rows[1][0]) == "September 2002" # The line break between "English review;" - # and "Remove" is not detected. + # and "Remove" is detected. assert ( extract_cell_text(rows[6][2]) - == "English review;Remove the UML model for the Segmented view." + == "English review;\nRemove the UML model for the Segmented view." ) assert extract_cell_text(rows[7][2]) == "Update from the March Workshop comments." @@ -738,6 +744,16 @@ def visitor_td(op, args, cm, tm) -> None: assert list_td[2] == (210.0, 210.0) assert list_td[3] == (410.0, 210.0) + # Test 3b: check extract_visitor in Sample_Td-matrix.pdf + # + (texts, rectangles) = extract_text_and_rectangles(page_td_model) + rows = extract_table(texts, rectangles) + assert len(rows) == 2 + assert extract_cell_text(rows[0][0]) == "Hello PDF!" + assert extract_cell_text(rows[0][1]) == "Hello PDF 200 0 Td!" + assert extract_cell_text(rows[1][0]) == "Hello PDF 2 1!" + assert extract_cell_text(rows[1][1]) == "Hello PDF 10 7!" + @pytest.mark.parametrize( ("pdf_path", "password", "embedded", "unembedded"),