Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ROB: Repair pdf with invalid Root object #2880

Merged
merged 10 commits into from
Sep 28, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions pypdf/_doc_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -1148,8 +1148,9 @@ def _flatten(
# Fix issue 327: set flattened_pages attribute only for
# decrypted file
catalog = self.root_object
pages = catalog["/Pages"].get_object() # type: ignore
assert isinstance(pages, DictionaryObject)
pages = catalog.get("/Pages").get_object() # type: ignore
if not isinstance(pages, DictionaryObject):
raise PdfReadError("Invalid object in /Pages")
self.flattened_pages = []

if PA.TYPE in pages:
Expand Down
39 changes: 33 additions & 6 deletions pypdf/_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,8 @@ def __init__(
# map page indirect_reference number to page number
self._page_id2num: Optional[Dict[Any, Any]] = None

self._validated_root: Optional[DictionaryObject] = None

self._initialize_stream(stream)

self._override_encryption = False
Expand Down Expand Up @@ -197,10 +199,35 @@ def close(self) -> None:
@property
def root_object(self) -> DictionaryObject:
"""Provide access to "/Root". Standardized with PdfWriter."""
root = self.trailer[TK.ROOT]
if root is None:
raise PdfReadError('Cannot find "/Root" key in trailer')
return cast(DictionaryObject, root.get_object())
if self._validated_root:
return self._validated_root
root = self.trailer.get(TK.ROOT)
if is_null_or_none(root):
logger_warning('Cannot find "/Root" key in trailer', __name__)
elif (
cast(DictionaryObject, cast(PdfObject, root).get_object()).get("/Type")
== "/Catalog"
):
self._validated_root = cast(
DictionaryObject, cast(PdfObject, root).get_object()
)
else:
logger_warning("Invalid Root Object in trailer", __name__)
pubpub-zz marked this conversation as resolved.
Show resolved Hide resolved
if self._validated_root is None:
logger_warning("trying to fix", __name__)
pubpub-zz marked this conversation as resolved.
Show resolved Hide resolved
nb = cast(int, self.trailer.get("/Size", 0))
for i in range(nb):
try:
o = self.get_object(i + 1)
except Exception: # to be sure to capture all errors
o = None
if isinstance(o, DictionaryObject) and o.get("/Type") == "/Catalog":
self._validated_root = o
logger_warning(f"root found at {o.indirect_reference!r}", __name__)
pubpub-zz marked this conversation as resolved.
Show resolved Hide resolved
break
if self._validated_root is None:
raise PdfReadError("Cannot find Root object in pdf")
return self._validated_root

@property
def _info(self) -> Optional[DictionaryObject]:
Expand All @@ -215,11 +242,11 @@ def _info(self) -> Optional[DictionaryObject]:
return None
else:
info = info.get_object()
if info == None: # noqa: E711
if not isinstance(info, DictionaryObject):
raise PdfReadError(
"Trailer not found or does not point to document information directory"
)
return cast(DictionaryObject, info)
return info

@property
def _ID(self) -> Optional[ArrayObject]:
Expand Down
3 changes: 2 additions & 1 deletion pypdf/generic/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -879,5 +879,6 @@ def is_null_or_none(x: Any) -> TypeGuard[Union[None, NullObject, IndirectObject]
True if x is None, or if x is a PdfObject that resolves to None or a NullObject.
"""
return x is None or (
isinstance(x, PdfObject) and isinstance(x.get_object(), NullObject)
isinstance(x, PdfObject)
and (x.get_object() is None or isinstance(x.get_object(), NullObject))
)
76 changes: 67 additions & 9 deletions tests/test_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,14 +137,14 @@ def test_iss1943():
def test_broken_meta_data(pdf_path):
with open(pdf_path, "rb") as f:
reader = PdfReader(f)
with pytest.raises(
PdfReadError,
match=(
"Trailer not found or does not point to document "
"information directory"
),
):
reader.metadata
assert reader.metadata is None

with open(RESOURCE_ROOT / "crazyones.pdf", "rb") as f:
b = f.read(-1)
reader = PdfReader(BytesIO(b.replace(b"/Info 2 0 R", b"/Info 2 ")))
with pytest.raises(PdfReadError) as exc:
reader.metadata
assert "does not point to document information directory" in repr(exc)


@pytest.mark.parametrize(
Expand Down Expand Up @@ -621,7 +621,7 @@ def test_read_unknown_zero_pages(caplog):
assert normalize_warnings(caplog.text) == warnings
with pytest.raises(PdfReadError) as exc:
len(reader.pages)
assert exc.value.args[0] == 'Cannot find "/Root" key in trailer'
assert exc.value.args[0] == "Invalid object in /Pages"


def test_read_encrypted_without_decryption():
Expand Down Expand Up @@ -1712,3 +1712,61 @@ def test_unbalanced_brackets_in_dictionary_object(caplog):
name = "iss2877.pdf" # reused
reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
assert len(reader.pages) == 43 # note: /Count = 46 but 3 kids are None


@pytest.mark.enable_socket()
def test_repair_root(caplog):
    """
    Check how the reader copes with a missing or invalid trailer /Root.

    Cf #2877 (the sample file is cached locally as ``iss2875.pdf``).
    """
    url = "https://github.com/user-attachments/files/17162216/crash-6620e8b1abfe3da639b654595da859b87f985748.pdf"
    name = "iss2875.pdf"
    data = get_data_from_url(url, name=name)

    def assert_logged(*fragments):
        # Every expected fragment has to show up in the captured log output.
        for fragment in fragments:
            assert fragment in caplog.text

    # File as downloaded: the page count is still readable and the
    # repair is reported in the log.
    reader = PdfReader(BytesIO(data))
    assert len(reader.pages) == 1
    assert_logged(
        "Invalid Root Object",
        "trying to fix",
        "root found at IndirectObject(2, 0,",
    )

    # no /Root Entry
    reader = PdfReader(BytesIO(data.replace(b"/Root", b"/Roo ")))
    caplog.clear()
    assert len(reader.pages) == 1
    assert_logged(
        'Cannot find "/Root" key in trailer',
        "trying to fix",
        "root found at IndirectObject(2, 0,",
    )

    # Invalid /Root Entry: /Root is redirected and "/Catalog" is replaced
    # everywhere, so reading the pages is expected to raise.
    caplog.clear()
    without_catalog = data.replace(b"/Root 1 0 R", b"/Root 2 0 R").replace(
        b"/Catalog", b"/Catalo "
    )
    reader = PdfReader(BytesIO(without_catalog))
    with pytest.raises(PdfReadError):
        len(reader.pages)
    assert_logged("Invalid Root Object in trailer", "trying to fix")

    # Invalid /Root Entry + error in get_object
    # (same payload as above with the single byte at offset 5124 overwritten)
    caplog.clear()
    corrupted = without_catalog[:5124] + b"A" + without_catalog[5125:]
    reader = PdfReader(BytesIO(corrupted))
    with pytest.raises(PdfReadError):
        len(reader.pages)
    assert_logged("Invalid Root Object in trailer", "trying to fix")
Loading