Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ROB: Capture UnicodeDecodeError at PdfReader.pdf_header #1768

Merged
merged 3 commits into from
Apr 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 17 additions & 11 deletions pypdf/_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
import struct
import zlib
from datetime import datetime
from io import BytesIO
from io import BytesIO, UnsupportedOperation
from pathlib import Path
from typing import (
Any,
Expand Down Expand Up @@ -360,7 +360,7 @@ def pdf_header(self) -> str:
# but that needs a deprecation
loc = self.stream.tell()
self.stream.seek(0, 0)
pdf_file_version = self.stream.read(8).decode("utf-8")
pdf_file_version = self.stream.read(8).decode("utf-8", "backslashreplace")
self.stream.seek(loc, 0) # return to where it was
return pdf_file_version

Expand Down Expand Up @@ -1541,19 +1541,22 @@ def read(self, stream: StreamType) -> None:

def _basic_validation(self, stream: StreamType) -> None:
    """
    Ensure the file is not empty and check its header. Read at most 5 bytes.

    The stream is rewound, its first five bytes are compared with
    ``%PDF-``, and the stream is then positioned at its end.

    Args:
        stream: The stream holding the PDF data.

    Raises:
        UnsupportedOperation: If reading the header raises a
            ``UnicodeDecodeError``.
        EmptyFileError: If no bytes can be read from the stream.
        PdfReadError: If ``self.strict`` is set and the stream does not
            start with ``%PDF-``.
    """
    stream.seek(0, os.SEEK_SET)
    try:
        header_byte = stream.read(5)
    except UnicodeDecodeError as exc:
        # Keep the decoding failure attached as the cause.
        raise UnsupportedOperation("cannot read header") from exc
    if header_byte == b"":
        raise EmptyFileError("Cannot read an empty file")
    if header_byte != b"%PDF-":
        if self.strict:
            # A broken header may hold bytes that are not valid UTF-8.
            # Escape them so that the caller gets the PdfReadError and not
            # a UnicodeDecodeError raised while building the message.
            shown = header_byte.decode("utf8", "backslashreplace")
            raise PdfReadError(
                f"PDF starts with '{shown}', but '%PDF-' expected"
            )
        logger_warning(f"invalid pdf header: {header_byte}", __name__)
    stream.seek(0, os.SEEK_END)

def _find_eof_marker(self, stream: StreamType) -> None:
"""
Expand All @@ -1567,7 +1570,10 @@ def _find_eof_marker(self, stream: StreamType) -> None:
line = b""
while line[:5] != b"%%EOF":
if stream.tell() < HEADER_SIZE:
raise PdfReadError("EOF marker not found")
if self.strict:
raise PdfReadError("EOF marker not found")
else:
logger_warning("EOF marker not found", __name__)
line = read_previous_line(stream)

def _find_startxref_pos(self, stream: StreamType) -> int:
Expand Down
42 changes: 41 additions & 1 deletion tests/test_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -460,10 +460,16 @@ def test_read_empty():
assert exc.value.args[0] == "Cannot read an empty file"


def test_read_malformed_header(caplog):
    """A wrong header raises in strict mode and is logged otherwise."""
    not_a_pdf = b"foo"

    # Strict mode: the bad header is reported as a PdfReadError.
    with pytest.raises(PdfReadError) as exc:
        PdfReader(io.BytesIO(not_a_pdf), strict=True)
    (message,) = exc.value.args[:1]
    assert message == "PDF starts with 'foo', but '%PDF-' expected"

    # Non-strict mode: only the first logged message is checked. Whatever
    # the reader raises afterwards for this input is deliberately ignored.
    caplog.clear()
    try:
        PdfReader(io.BytesIO(not_a_pdf), strict=False)
    except Exception:
        pass
    first_message = caplog.messages[0]
    assert first_message.startswith("invalid pdf header")


def test_read_malformed_body():
Expand Down Expand Up @@ -1352,3 +1358,37 @@ def test_iss1710():
name = "irbookonlinereading.pdf"
in_pdf = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
in_pdf.outline


def test_broken_file_header():
    """A header holding a byte that is not valid UTF-8 must not break reading."""
    # After %-formatting the file starts with b"%PDF-\xa0sd"; a lone 0xA0
    # byte cannot be decoded as UTF-8.
    template = (
        b"%%PDF-\xa0sd\n"
        b"1 0 obj << /Count 1 /Kids [4 0 R] /Type /Pages >> endobj\n"
        b"2 0 obj << >> endobj\n"
        b"3 0 obj << >> endobj\n"
        b"4 0 obj << /Contents 3 0 R /CropBox [0.0 0.0 2550.0 3508.0]"
        b" /MediaBox [0.0 0.0 2550.0 3508.0] /Parent 1 0 R"
        b" /Resources << /Font << >> >>"
        b" /Rotate 0 /Type /Page >> endobj\n"
        b"5 0 obj << /Pages 1 0 R /Type /Catalog >> endobj\n"
        b"xref 1 5\n"
        b"%010d 00000 n\n"
        b"%010d 00000 n\n"
        b"%010d 00000 n\n"
        b"%010d 00000 n\n"
        b"%010d 00000 n\n"
        b"trailer << %s/Root 5 0 R /Size 6 >>\n"
        b"startxref %d\n"
        b"%%%%EOF"
    )
    # The offsets are looked up in the unformatted template, one per object.
    object_offsets = tuple(
        template.find(b"%d 0 obj" % number) for number in range(1, 6)
    )
    pdf_data = template % (
        *object_offsets,
        b"/Prev 0 ",
        template.find(b"xref") - 1,
    )

    reader = PdfReader(io.BytesIO(pdf_data))

    # The first 8 bytes are returned with the undecodable byte escaped.
    assert reader.pdf_header == "%PDF-\\xa0sd"