Skip to content

Commit

Permalink
ROB: Ignore odd-length strings when processing cmap lines
Browse files: browse the repository at this point in the history.
Closes #2216.
  • Loading branch information
stefan6419846 committed Dec 19, 2024
1 parent 17f6e35 commit 5c2285c
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 1 deletion.
6 changes: 5 additions & 1 deletion pypdf/_cmap.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import binascii
from binascii import unhexlify
from math import ceil
from typing import Any, Dict, List, Tuple, Union, cast
Expand Down Expand Up @@ -304,7 +305,10 @@ def process_cm_line(
elif b"endbfchar" in line:
process_char = False
elif process_rg:
multiline_rg = parse_bfrange(line, map_dict, int_entry, multiline_rg)
try:
multiline_rg = parse_bfrange(line, map_dict, int_entry, multiline_rg)
except binascii.Error as error:
logger_warning(f"Skipping broken line {line!r}: {error}", __name__)
elif process_char:
parse_bfchar(line, map_dict, int_entry)
return process_rg, process_char, multiline_rg
Expand Down
11 changes: 11 additions & 0 deletions tests/test_cmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -281,3 +281,14 @@ def test_iss2966():
reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
assert "Lorem ipsum dolor sit amet" in reader.pages[0].extract_text()


@pytest.mark.enable_socket
def test_binascii_odd_length_string(caplog):
    """
    Regression test for #2216.

    Downloads the sample PDF for the issue, extracts the text of its first
    page, and checks that both the expected text fragment is present and the
    "Skipping broken line" warning for the odd-length hex string was logged.
    """
    pdf_bytes = get_data_from_url(
        "https://github.com/user-attachments/files/18199642/iss2216.pdf",
        name="iss2216.pdf",
    )
    pdf_reader = PdfReader(BytesIO(pdf_bytes))
    first_page = pdf_reader.pages[0]

    # Text extraction must still succeed even though one cmap line is broken.
    extracted_text = first_page.extract_text()
    assert "\n(Many other theorems may\n" in extracted_text

    # The broken line is reported through the logger instead of raising.
    expected_warning = "Skipping broken line b'143f 143f 10300': Odd-length string\n"
    assert expected_warning in caplog.text

0 comments on commit 5c2285c

Please sign in to comment.