Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ROB: Fall back to non-Adobe Ascii85 format for missing end markers #3007

Merged
merged 1 commit into from
Dec 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion pypdf/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -446,7 +446,13 @@ def decode(
if isinstance(data, str):
data = data.encode()
data = data.strip(WHITESPACES_AS_BYTES)
return a85decode(data, adobe=True, ignorechars=WHITESPACES_AS_BYTES)
try:
return a85decode(data, adobe=True, ignorechars=WHITESPACES_AS_BYTES)
except ValueError as error:
if error.args[0] == "Ascii85 encoded byte sequences must end with b'~>'":
logger_warning("Ignoring missing Ascii85 end marker.", __name__)
return a85decode(data, adobe=False, ignorechars=WHITESPACES_AS_BYTES)
raise


class DCTDecode:
Expand Down
26 changes: 26 additions & 0 deletions tests/test_filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -594,3 +594,29 @@ def test_flate_decode_with_image_mode_1__whitespace_at_end_of_lookup():
name = "issue2331.pdf"
reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
reader.pages[0].images[0]


@pytest.mark.enable_socket
def test_ascii85decode__invalid_end__recoverable(caplog):
    """
    Regression test for #2996.

    Extracting the text of the second page has to succeed with an empty
    result, and the warning about the ignored Ascii85 end marker has to
    show up in the captured log output.
    """
    pdf_bytes = get_data_from_url(
        "https://github.com/user-attachments/files/18050808/1af7d56a-5c8c-4914-85b3-b2536a5525cd.pdf",
        name="issue2996.pdf",
    )
    second_page = PdfReader(BytesIO(pdf_bytes)).pages[1]

    extracted = second_page.extract_text()
    assert extracted == ""
    assert "Ignoring missing Ascii85 end marker." in caplog.text


def test_ascii85decode__non_recoverable(caplog):
    """
    Check that undecodable input still raises, with and without `~>`.

    Both variants must fail with the same "non-Ascii85 digit" error; only
    the variant lacking the end marker is expected to log the warning
    about the ignored end marker first.
    """
    payload = "äöüß"
    expected_error = "Non-Ascii85 digit found: Ã"

    # Without our custom handling, this would complain about the final
    # `~>` being missing instead of the invalid digit.
    with pytest.raises(ValueError, match=expected_error):
        ASCII85Decode.decode(payload)
    assert "Ignoring missing Ascii85 end marker." in caplog.text
    caplog.clear()

    # With the end marker present, no warning should have been logged.
    with pytest.raises(ValueError, match=expected_error):
        ASCII85Decode.decode(payload + "~>")
    assert caplog.text == ""
Loading