Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MAINT: Simplify file identifiers generation #2003

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
39 changes: 22 additions & 17 deletions pypdf/_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
import enum
import hashlib
import re
import time
import uuid
import warnings
from io import BytesIO, FileIO, IOBase
Expand Down Expand Up @@ -145,13 +146,6 @@ class ObjectDeletionFlag(enum.IntFlag):
IMAGES = XOBJECT_IMAGES | INLINE_IMAGES | DRAWING_IMAGES


def _rolling_checksum(stream: BytesIO, blocksize: int = 65536) -> str:
    """
    Return the hexadecimal MD5 digest of the remaining content of ``stream``.

    The stream is consumed from its current position in reads of at most
    ``blocksize`` bytes and hashed incrementally. The stream position is
    left at the end of the stream.

    Args:
        stream: Binary stream to read until it is exhausted.
        blocksize: Number of bytes requested per read.

    Returns:
        The MD5 digest as a hexadecimal string.
    """
    # Named ``digest`` rather than ``hash`` so the builtin is not shadowed.
    digest = hashlib.md5()
    # ``iter`` with a sentinel stops as soon as ``read`` returns b"" (EOF).
    for block in iter(lambda: stream.read(blocksize), b""):
        digest.update(block)
    return digest.hexdigest()


class PdfWriter:
"""
Write a PDF file out, given pages produced by another class.
Expand Down Expand Up @@ -1223,24 +1217,34 @@ def cloneDocumentFromReader(
)
self.clone_document_from_reader(reader, after_page_append)

def _compute_document_identifier_from_content(self) -> ByteStringObject:
    """
    Build a file identifier from a checksum of the serialized PDF structure.

    The PDF structure is written to an in-memory buffer, the buffer is
    checksummed with ``_rolling_checksum``, and the resulting hex string is
    returned UTF-8 encoded inside a ``ByteStringObject``.
    """
    buffer = BytesIO()
    self._write_pdf_structure(buffer)
    # Rewind so the checksum covers everything that was just written.
    buffer.seek(0)
    hex_digest = _rolling_checksum(buffer)
    return ByteStringObject(hex_digest.encode("utf8"))
def _compute_document_identifier(self) -> ByteStringObject:
md5 = hashlib.md5()
md5.update(str(time.time()).encode("utf-8"))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This makes document-generation non-deterministic, right?

md5.update(str(self.fileobj).encode("utf-8"))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is self.fileobj equivalent to self._write_pdf_structure(stream)?

md5.update(str(len(self._objects)).encode("utf-8"))
if hasattr(self, "_info"):
for k, v in cast(DictionaryObject, self._info.get_object()).items():
md5.update(f"{k}={v}".encode())
return ByteStringObject(md5.hexdigest().encode("utf-8"))

def generate_file_identifiers(self) -> None:
"""
Generate an identifier for the PDF that will be written.

The only point of this is ensuring uniqueness. Reproducibility is not
required; see 14.4 "File Identifiers".
required.
When a file is first written, both identifiers shall be set to the same value.
If both identifiers match when a file reference is resolved, it is very
likely that the correct and unchanged file has been found. If only the first
identifier matches, a different version of the correct file has been found.
See 14.4 "File Identifiers".
"""
if self._ID:
id1 = self._ID[0]
id2 = self._compute_document_identifier()
else:
id1 = self._compute_document_identifier_from_content()
id2 = self._compute_document_identifier_from_content()
id1 = self._compute_document_identifier()
id2 = ByteStringObject(id1.original_bytes)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

id1 is a ByteStringObject already. So .original_bytes just returns id1. Then wrapping it in ByteStringObject doesn't do anything, right?

self._ID = ArrayObject((id1, id2))

def encrypt(
Expand Down Expand Up @@ -1325,8 +1329,9 @@ def encrypt(
if not use_128bit:
alg = EncryptAlgorithm.RC4_40
self.generate_file_identifiers()
assert self._ID
self._encryption = Encryption.make(alg, permissions_flag, self._ID[0])
self._encryption = Encryption.make(
alg, permissions_flag, cast(ArrayObject, self._ID)[0]
)
MartinThoma marked this conversation as resolved.
Show resolved Hide resolved
# in case call `encrypt` again
entry = self._encryption.write_entry(user_password, owner_password)
if self._encrypt_entry:
Expand Down
Loading