Skip to content

Commit

Permalink
Merge branch 'main' into iss2233
Browse files Browse the repository at this point in the history
  • Loading branch information
MartinThoma authored Oct 8, 2023
2 parents c2842c5 + 5c3550f commit 51933fd
Show file tree
Hide file tree
Showing 11 changed files with 215 additions and 47 deletions.
13 changes: 13 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,18 @@
# CHANGELOG

## Version 3.16.3, 2023-10-08

### Bug Fixes (BUG)
- Invalid cm/tm in visitor functions (#2206)
- Encrypt / decrypt Stream object dictionaries (#2228)
- Support nested color spaces for the /DeviceN color space (#2241)
- Images property fails if NullObject in list (#2215)

### Developer Experience (DEV)
- Unify mypy options and warn redundant workarounds (#2223)

[Full Changelog](https://github.com/py-pdf/pypdf/compare/3.16.2...3.16.3)

## Version 3.16.2, 2023-09-24

### Bug Fixes (BUG)
Expand Down
1 change: 1 addition & 0 deletions CONTRIBUTORS.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ history and [GitHubs 'Contributors' feature](https://github.com/py-pdf/pypdf/gra
* [nalin-udhaar](https://github.com/nalin-udhaar)
* [Paternault, Louis](https://framagit.org/spalax)
* [Perrensen, Olsen](https://github.com/olsonperrensen)
* [pilotandy](https://github.com/pilotandy)
* [Pinheiro, Arthur](https://github.com/xilopaint)
* [Poddar, Arka](https://github.com/postmeback)
* [programmarchy](https://github.com/programmarchy)
Expand Down
25 changes: 19 additions & 6 deletions docs/user/extract-text.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,27 @@ Refer to [extract\_text](../modules/PageObject.html#pypdf._page.PageObject.extra
You can use visitor-functions to control which part of a page you want to process and extract. The visitor-functions you provide will get called for each operator or for each text fragment.

The function provided in argument visitor_text of function extract_text has five arguments:
text, current transformation matrix, text matrix, font-dictionary and font-size.
In most cases the x and y coordinates of the current position
are in index 4 and 5 of the current transformation matrix.
* text: the current text (as long as possible, can be up to a full line)
* user_matrix: current matrix to move from user coordinate space (also known as CTM)
* tm_matrix: current matrix from text coordinate space
* font-dictionary: full font dictionary
* font-size: the size (in text coordinate space)

The matrix stores 6 parameters. The first 4 provide the rotation/scaling matrix and the last two provide the translation (horizontal/vertical).
It is recommended to use the user_matrix as it takes into account all transformations.

Notes :

- as indicated in the PDF 1.7 reference, page 204 the user matrix applies to text space/image space/form space/pattern space.
- if you want to get the full transformation from text to user space, you can use the `mult` function (available in the global import, i.e. `from pypdf import mult`) as follows:
`txt2user = mult(tm, cm)`
The font-size is the raw text size, that is affected by the `user_matrix`


The font-dictionary may be None in case of unknown fonts.
If not None it may e.g. contain key "/BaseFont" with value "/Arial,Bold".

**Caveat**: In complicated documents the calculated positions might be wrong.
**Caveat**: In complicated documents the calculated positions may be difficult to determine (for example, if you move from multiple forms to page user space).

The function provided in argument visitor_operand_before has four arguments:
operator, operand-arguments, current transformation matrix and text matrix.
Expand All @@ -53,7 +66,7 @@ parts = []


def visitor_body(text, cm, tm, font_dict, font_size):
y = tm[5]
y = cm[5]
if y > 50 and y < 720:
parts.append(text)

Expand Down Expand Up @@ -88,7 +101,7 @@ def visitor_svg_rect(op, args, cm, tm):


def visitor_svg_text(text, cm, tm, fontDict, fontSize):
(x, y) = (tm[4], tm[5])
(x, y) = (cm[4], cm[5])
dwg.add(dwg.text(text, insert=(x, y), fill="blue"))


Expand Down
3 changes: 2 additions & 1 deletion pypdf/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from ._crypt_providers import crypt_provider
from ._encryption import PasswordType
from ._merger import PdfFileMerger, PdfMerger
from ._page import PageObject, Transformation
from ._page import PageObject, Transformation, mult
from ._reader import DocumentInformation, PdfFileReader, PdfReader
from ._version import __version__
from ._writer import ObjectDeletionFlag, PdfFileWriter, PdfWriter
Expand All @@ -31,6 +31,7 @@
__all__ = [
"__version__",
"_debug_versions",
"mult",
"PageRange",
"PaperSize",
"DocumentInformation",
Expand Down
70 changes: 47 additions & 23 deletions pypdf/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -1921,18 +1921,17 @@ def _extract_text(
# are strings where the byte->string encoding was unknown, so adding
# them to the text here would be gibberish.

cm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
cm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
cm_stack = []
tm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
tm_prev: List[float] = [
1.0,
0.0,
0.0,
1.0,
0.0,
0.0,
] # will store previous tm_matrix

# cm_prev/tm_prev store the last modified matrices; these can be an intermediate position
cm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
tm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]

# memo_cm/tm will be used to store the position at the beginning of building the text
memo_cm: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
memo_tm: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
char_scale = 1.0
space_scale = 1.0
_space_width: float = 500.0 # will be set correctly at first Tf
Expand All @@ -1943,9 +1942,9 @@ def current_spacewidth() -> float:
return _space_width / 1000.0

def process_operation(operator: bytes, operands: List) -> None:
nonlocal cm_matrix, cm_stack, tm_matrix, cm_prev, tm_prev, output, text
nonlocal cm_matrix, cm_stack, tm_matrix, cm_prev, tm_prev, memo_cm, memo_tm
nonlocal char_scale, space_scale, _space_width, TL, font_size, cmap
nonlocal orientations, rtl_dir, visitor_text
nonlocal orientations, rtl_dir, visitor_text, output, text
global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS

check_crlf_space: bool = False
Expand All @@ -1954,14 +1953,18 @@ def process_operation(operator: bytes, operands: List) -> None:
tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
output += text
if visitor_text is not None:
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
text = ""
memo_cm = cm_matrix.copy()
memo_tm = tm_matrix.copy()
return None
elif operator == b"ET":
output += text
if visitor_text is not None:
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
text = ""
memo_cm = cm_matrix.copy()
memo_tm = tm_matrix.copy()
# table 4.7 "Graphics state operators", page 219
# cm_matrix calculation is reserved for the moment
elif operator == b"q":
Expand Down Expand Up @@ -1992,7 +1995,7 @@ def process_operation(operator: bytes, operands: List) -> None:
elif operator == b"cm":
output += text
if visitor_text is not None:
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
text = ""
cm_matrix = mult(
[
Expand All @@ -2005,6 +2008,8 @@ def process_operation(operator: bytes, operands: List) -> None:
],
cm_matrix,
)
memo_cm = cm_matrix.copy()
memo_tm = tm_matrix.copy()
# Table 5.2 page 398
elif operator == b"Tz":
char_scale = float(operands[0]) / 100.0
Expand All @@ -2016,8 +2021,10 @@ def process_operation(operator: bytes, operands: List) -> None:
if text != "":
output += text # .translate(cmap)
if visitor_text is not None:
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
text = ""
memo_cm = cm_matrix.copy()
memo_tm = tm_matrix.copy()
try:
# charMapTuple: font_type, float(sp_width / 2), encoding,
# map_dict, font-dictionary
Expand Down Expand Up @@ -2088,17 +2095,19 @@ def process_operation(operator: bytes, operands: List) -> None:
try:
text, output, cm_prev, tm_prev = crlf_space_check(
text,
cm_prev,
tm_prev,
cm_matrix,
tm_matrix,
(cm_prev, tm_prev),
(cm_matrix, tm_matrix),
(memo_cm, memo_tm),
cmap,
orientations,
output,
font_size,
visitor_text,
current_spacewidth(),
)
if text == "":
memo_cm = cm_matrix.copy()
memo_tm = tm_matrix.copy()
except OrientationNotFoundError:
return None

Expand Down Expand Up @@ -2130,12 +2139,18 @@ def process_operation(operator: bytes, operands: List) -> None:
elif operator == b"Do":
output += text
if visitor_text is not None:
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
try:
if output[-1] != "\n":
output += "\n"
if visitor_text is not None:
visitor_text("\n", cm_matrix, tm_matrix, cmap[3], font_size)
visitor_text(
"\n",
memo_cm,
memo_tm,
cmap[3],
font_size,
)
except IndexError:
pass
try:
Expand All @@ -2151,21 +2166,30 @@ def process_operation(operator: bytes, operands: List) -> None:
)
output += text
if visitor_text is not None:
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
visitor_text(
text,
memo_cm,
memo_tm,
cmap[3],
font_size,
)
except Exception:
logger_warning(
f" impossible to decode XFormObject {operands[0]}",
__name__,
)
finally:
text = ""
memo_cm = cm_matrix.copy()
memo_tm = tm_matrix.copy()

else:
process_operation(operator, operands)
if visitor_operand_after is not None:
visitor_operand_after(operator, operands, cm_matrix, tm_matrix)
output += text # just in case of
if text != "" and visitor_text is not None:
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
return output

def extract_text(
Expand Down
31 changes: 19 additions & 12 deletions pypdf/_text_extraction/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,10 +87,9 @@ def orient(m: List[float]) -> int:

def crlf_space_check(
text: str,
cm_prev: List[float],
tm_prev: List[float],
cm_matrix: List[float],
tm_matrix: List[float],
cmtm_prev: Tuple[List[float], List[float]],
cmtm_matrix: Tuple[List[float], List[float]],
memo_cmtm: Tuple[List[float], List[float]],
cmap: Tuple[
Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
],
Expand All @@ -100,13 +99,21 @@ def crlf_space_check(
visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
spacewidth: float,
) -> Tuple[str, str, List[float], List[float]]:
cm_prev = cmtm_prev[0]
tm_prev = cmtm_prev[1]
cm_matrix = cmtm_matrix[0]
tm_matrix = cmtm_matrix[1]
memo_cm = memo_cmtm[0]
memo_tm = memo_cmtm[1]

m_prev = mult(tm_prev, cm_prev)
m = mult(tm_matrix, cm_matrix)
orientation = orient(m)
delta_x = m[4] - m_prev[4]
delta_y = m[5] - m_prev[5]
k = math.sqrt(abs(m[0] * m[3]) + abs(m[1] * m[2]))
f = font_size * k
cm_prev = m
if orientation not in orientations:
raise OrientationNotFoundError
try:
Expand All @@ -117,8 +124,8 @@ def crlf_space_check(
if visitor_text is not None:
visitor_text(
text + "\n",
cm_prev,
tm_prev,
memo_cm,
memo_tm,
cmap[3],
font_size,
)
Expand All @@ -136,8 +143,8 @@ def crlf_space_check(
if visitor_text is not None:
visitor_text(
text + "\n",
cm_prev,
tm_prev,
memo_cm,
memo_tm,
cmap[3],
font_size,
)
Expand All @@ -155,8 +162,8 @@ def crlf_space_check(
if visitor_text is not None:
visitor_text(
text + "\n",
cm_prev,
tm_prev,
memo_cm,
memo_tm,
cmap[3],
font_size,
)
Expand All @@ -174,8 +181,8 @@ def crlf_space_check(
if visitor_text is not None:
visitor_text(
text + "\n",
cm_prev,
tm_prev,
memo_cm,
memo_tm,
cmap[3],
font_size,
)
Expand Down
2 changes: 1 addition & 1 deletion pypdf/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "3.16.2"
__version__ = "3.16.3"
Loading

0 comments on commit 51933fd

Please sign in to comment.