From fac24b431eab4bfb0c3413b34c3e1cdc59441566 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Fri, 11 Aug 2023 10:00:31 +0200 Subject: [PATCH 1/2] FIX: Cope with xref not followed by separator closes #2082 --- pypdf/_reader.py | 4 ++-- tests/test_reader.py | 8 ++++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/pypdf/_reader.py b/pypdf/_reader.py index 721bcdb7c..ac32f0ffa 100644 --- a/pypdf/_reader.py +++ b/pypdf/_reader.py @@ -1634,8 +1634,8 @@ def _find_startxref_pos(self, stream: StreamType) -> int: def _read_standard_xref_table(self, stream: StreamType) -> None: # standard cross-reference table - ref = stream.read(4) - if ref[:3] != b"ref": + ref = stream.read(3) + if ref != b"ref": raise PdfReadError("xref table read error") read_non_whitespace(stream) stream.seek(-1, 1) diff --git a/tests/test_reader.py b/tests/test_reader.py index 141d59aed..c27fd0636 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -1436,3 +1436,11 @@ def test_iss1825(): reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) page = reader.pages[0] page.extract_text() + + +@pytest.mark.enable_socket() +def test_iss2082(): + url = "https://github.com/py-pdf/pypdf/files/12317939/test.pdf" + name = "iss2082.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + reader.pages[0].extract_text() From b396c216b1b0600a7fd88d3ffdf3844a999a9914 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Fri, 11 Aug 2023 11:28:56 +0200 Subject: [PATCH 2/2] coverage --- tests/test_reader.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/test_reader.py b/tests/test_reader.py index c27fd0636..69ef80e94 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -1442,5 +1442,11 @@ def test_iss1825(): def test_iss2082(): url = "https://github.com/py-pdf/pypdf/files/12317939/test.pdf" name = "iss2082.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + b = get_data_from_url(url, name=name) + reader = PdfReader(BytesIO(b)) reader.pages[0].extract_text() + + bb = bytearray(b) + bb[b.find(b"xref") + 2] = ord(b"E") + with pytest.raises(PdfReadError): + reader = PdfReader(BytesIO(bb))