Skip to content

Commit

Permalink
Merge branch 'main' into Merger
Browse files Browse the repository at this point in the history
  • Loading branch information
pubpub-zz authored Sep 13, 2024
2 parents 938fc4a + c4e95bd commit f9d77bb
Show file tree
Hide file tree
Showing 6 changed files with 428 additions and 17 deletions.
38 changes: 36 additions & 2 deletions pypdf/_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@
NullObject,
NumberObject,
PdfObject,
StreamObject,
TextStringObject,
read_object,
)
Expand Down Expand Up @@ -316,8 +317,6 @@ def _get_object_from_stream(
obj_stm: EncodedStreamObject = IndirectObject(stmnum, 0, self).get_object() # type: ignore
# This is an xref to a stream, so its type better be a stream
assert cast(str, obj_stm["/Type"]) == "/ObjStm"
# /N is the number of indirect objects in the stream
assert idx < obj_stm["/N"]
stream_data = BytesIO(obj_stm.get_data())
for i in range(obj_stm["/N"]): # type: ignore
read_non_whitespace(stream_data)
Expand Down Expand Up @@ -999,6 +998,41 @@ def _rebuild_xref_table(self, stream: StreamType) -> None:
if generation not in self.xref:
self.xref[generation] = {}
self.xref[generation][idnum] = m.start(1)

logger_warning("parsing for Object Streams", __name__)
for g in self.xref:
for i in self.xref[g]:
# manual equivalent of get_object
stream.seek(self.xref[g][i], 0)
try:
_ = self.read_object_header(stream)
o = cast(StreamObject, read_object(stream, self))
if o.get("/Type", "") != "/ObjStm":
continue
strm = BytesIO(o.get_data())
cpt = 0
while True:
s = read_until_whitespace(strm)
if not s.isdigit():
break
_i = int(s)
skip_over_whitespace(strm)
strm.seek(-1, 1)
s = read_until_whitespace(strm)
if not s.isdigit(): # pragma: no cover
break # pragma: no cover
_o = int(s)
self.xref_objStm[_i] = (i, _o)
cpt += 1
if cpt != o.get("/N"): # pragma: no cover
logger_warning( # pragma: no cover
f"found {cpt} objects within Object({i},{g})"
f" whereas {o.get('/N')} expected",
__name__,
)
except Exception: # could have many causes
pass

stream.seek(0, 0)
for m in re.finditer(rb"[\r\n \t][ \t]*trailer[\r\n \t]*(<<)", f_):
stream.seek(m.start(1), 0)
Expand Down
2 changes: 1 addition & 1 deletion pypdf/annotations/_non_markup_annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def __init__(
if is_external and is_internal:
raise ValueError(
"Either 'url' or 'target_page_index' have to be provided. "
f"url={url}, target_page_index={target_page_index}"
f"{url=}, {target_page_index=}"
)

border_arr: BorderArrayType
Expand Down
2 changes: 1 addition & 1 deletion pypdf/generic/_rectangle.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def __init__(
ArrayObject.__init__(self, [self._ensure_is_number(x) for x in arr]) # type: ignore

def _ensure_is_number(self, value: Any) -> Union[FloatObject, NumberObject]:
if not isinstance(value, (NumberObject, FloatObject)):
if not isinstance(value, (FloatObject, NumberObject)):
value = FloatObject(value)
return value

Expand Down
8 changes: 2 additions & 6 deletions tests/test_filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from io import BytesIO
from itertools import product as cartesian_product
from pathlib import Path
from unittest.mock import patch

import pytest
from PIL import Image
Expand Down Expand Up @@ -225,14 +224,11 @@ def test_ccitt_fax_decode():


@pytest.mark.enable_socket()
@patch("pypdf._reader.logger_warning")
def test_decompress_zlib_error(mock_logger_warning):
def test_decompress_zlib_error(caplog):
reader = PdfReader(BytesIO(get_data_from_url(name="tika-952445.pdf")))
for page in reader.pages:
page.extract_text()
mock_logger_warning.assert_called_with(
"incorrect startxref pointer(3)", "pypdf._reader"
)
assert "incorrect startxref pointer(3)" in caplog.text


@pytest.mark.enable_socket()
Expand Down
Loading

0 comments on commit f9d77bb

Please sign in to comment.