From 4f2cd3439c6f074c515fe347ef92ba0bb44a9e37 Mon Sep 17 00:00:00 2001
From: Stefan <96178532+stefan6419846@users.noreply.github.com>
Date: Thu, 19 Dec 2024 20:15:27 +0100
Subject: [PATCH] ROB: Fall back to non-Adobe Ascii85 format for missing end markers (#3007)

Closes #2996.
---
 pypdf/filters.py      |  8 +++++++-
 tests/test_filters.py | 26 ++++++++++++++++++++++++++
 2 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/pypdf/filters.py b/pypdf/filters.py
index 517d6aac3..a95b96a54 100644
--- a/pypdf/filters.py
+++ b/pypdf/filters.py
@@ -446,7 +446,13 @@ def decode(
         if isinstance(data, str):
             data = data.encode()
         data = data.strip(WHITESPACES_AS_BYTES)
-        return a85decode(data, adobe=True, ignorechars=WHITESPACES_AS_BYTES)
+        try:
+            return a85decode(data, adobe=True, ignorechars=WHITESPACES_AS_BYTES)
+        except ValueError as error:
+            if error.args[0] == "Ascii85 encoded byte sequences must end with b'~>'":
+                logger_warning("Ignoring missing Ascii85 end marker.", __name__)
+                return a85decode(data, adobe=False, ignorechars=WHITESPACES_AS_BYTES)
+            raise
 
 
 class DCTDecode:
diff --git a/tests/test_filters.py b/tests/test_filters.py
index 23b90cca8..90a119844 100644
--- a/tests/test_filters.py
+++ b/tests/test_filters.py
@@ -594,3 +594,29 @@ def test_flate_decode_with_image_mode_1__whitespace_at_end_of_lookup():
     name = "issue2331.pdf"
     reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
     reader.pages[0].images[0]
+
+
+@pytest.mark.enable_socket
+def test_ascii85decode__invalid_end__recoverable(caplog):
+    """From #2996"""
+    url = "https://github.com/user-attachments/files/18050808/1af7d56a-5c8c-4914-85b3-b2536a5525cd.pdf"
+    name = "issue2996.pdf"
+    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
+
+    page = reader.pages[1]
+    assert page.extract_text() == ""
+    assert "Ignoring missing Ascii85 end marker." in caplog.text
+
+
+def test_ascii85decode__non_recoverable(caplog):
+    # Without our custom handling, this would complain about the final `~>` being missing.
+    data = "äöüß"
+    with pytest.raises(ValueError, match="Non-Ascii85 digit found: Ã"):
+        ASCII85Decode.decode(data)
+    assert "Ignoring missing Ascii85 end marker." in caplog.text
+    caplog.clear()
+
+    data += "~>"
+    with pytest.raises(ValueError, match="Non-Ascii85 digit found: Ã"):
+        ASCII85Decode.decode(data)
+    assert caplog.text == ""