# Patch summary (free-text preamble placed before the first "diff --git" header).
#
# Files touched:
#   pypdf/_utils.py                      - adds check_if_whitespace_only(value: bytes) -> bool.
#   pypdf/_xobj_image_helpers.py         - uses the new helper in the lookup-length check.
#   tests/test_utils.py                  - parametrized test for the new helper.
#   tests/test_xobject_image_helpers.py  - test_handle_flate__image_mode_1, calling _handle_flate.
#
# check_if_whitespace_only:
#   Walks the value one single-byte slice at a time (value[index:index + 1]) and
#   returns False at the first slice that is not in WHITESPACES; returns True
#   otherwise, which includes the empty value (the test expects b"" -> True).
#
# Why the lookup-length check changes:
#   The removed code ran `lookup = lookup[:expected_count]` BEFORE inspecting
#   `lookup[expected_count:]`. After that truncation the inspected slice is
#   always empty, so the "Too many lookup values" error could never be raised.
#   The added code inspects the surplus bytes first (raising unless they are
#   whitespace only) and truncates afterwards. The "Not enough lookup values"
#   branch is unchanged.
#
# What the new tests pin for mode "1" with a 6-byte expected lookup:
#   exact length -> decoded; trailing b" \x0a" -> decoded;
#   trailing b"\x12" -> "Too many lookup values: Expected 6, got 7.";
#   5 bytes -> "Not enough lookup values: Expected 6, got 5.".
#
# NOTE(review): the hunk header labels the changed region with `def bits2byte(...)`,
#   while the new test reaches it through `_handle_flate` - presumably the label is
#   just the nearest preceding top-level def; confirm against the repository.
# NOTE(review): line breaks and indentation in this copy appear to have been
#   collapsed (the whole patch sits on three physical lines, and whitespace inside
#   byte literals may have been squeezed). The @@ line counts no longer correspond
#   to physical lines, so this copy most likely will not apply as-is - regenerate
#   it from the repository before use.
diff --git a/pypdf/_utils.py b/pypdf/_utils.py index 0e3e7ebab..9788ba418 100644 --- a/pypdf/_utils.py +++ b/pypdf/_utils.py @@ -190,6 +190,23 @@ def skip_over_whitespace(stream: StreamType) -> bool: return cnt > 1 +def check_if_whitespace_only(value: bytes) -> bool: + """ + Check if the given value consists of whitespace characters only. + + Args: + value: The bytes to check. + + Returns: + True if the value only has whitespace characters, otherwise return False. + """ + for index in range(len(value)): + current = value[index:index + 1] + if current not in WHITESPACES: + return False + return True + + def skip_over_comment(stream: StreamType) -> None: tok = stream.read(1) stream.seek(-1, 1) diff --git a/pypdf/_xobj_image_helpers.py b/pypdf/_xobj_image_helpers.py index a390357dd..1c41c453b 100644 --- a/pypdf/_xobj_image_helpers.py +++ b/pypdf/_xobj_image_helpers.py @@ -4,7 +4,7 @@ from io import BytesIO from typing import Any, List, Tuple, Union, cast -from ._utils import WHITESPACES, logger_warning +from ._utils import check_if_whitespace_only, logger_warning from .constants import ColorSpaces from .errors import PdfReadError from .generic import ( @@ -199,9 +199,9 @@ def bits2byte(data: bytes, size: Tuple[int, int], bits: int) -> bytes: if len(lookup) != expected_count: if len(lookup) < expected_count: raise PdfReadError(f"Not enough lookup values: Expected {expected_count}, got {len(lookup)}.") - lookup = lookup[:expected_count] - if not all(_value in WHITESPACES for _value in lookup[expected_count:]): + if not check_if_whitespace_only(lookup[expected_count:]): raise PdfReadError(f"Too many lookup values: Expected {expected_count}, got {len(lookup)}.") + lookup = lookup[:expected_count] colors_arr = [lookup[:nb], lookup[nb:]] arr = b"".join( [ diff --git a/tests/test_utils.py b/tests/test_utils.py index f00be5d62..8803feea8 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -10,6 +10,7 @@ Version, _get_max_pdf_version_header, _human_readable_bytes, + 
check_if_whitespace_only, deprecate_with_replacement, deprecation_bookmark, deprecation_no_replacement, @@ -48,6 +49,23 @@ def test_skip_over_whitespace(stream, expected): assert skip_over_whitespace(stream) == expected +@pytest.mark.parametrize( + ("value", "expected"), + [ + (b"foo", False), + (b" a", False), + (b" a\n b", False), + (b"", True), + (b" ", True), + (b" ", True), + (b" \n", True), + (b" \n", True), + ], +) +def test_check_if_whitespace_only(value, expected): + assert check_if_whitespace_only(value) is expected + + def test_read_until_whitespace(): assert read_until_whitespace(io.BytesIO(b"foo"), maxchars=1) == b"f" diff --git a/tests/test_xobject_image_helpers.py b/tests/test_xobject_image_helpers.py index e6d9e8e0f..0e515cae5 100644 --- a/tests/test_xobject_image_helpers.py +++ b/tests/test_xobject_image_helpers.py @@ -4,7 +4,9 @@ import pytest from pypdf import PdfReader +from pypdf._xobj_image_helpers import _handle_flate from pypdf.errors import PdfReadError +from pypdf.generic import ArrayObject, DecodedStreamObject, NameObject, NumberObject from . import get_data_from_url @@ -25,3 +27,61 @@ def test_get_imagemode_recursion_depth(): match="Color spaces nested too deep. If required, consider increasing MAX_IMAGE_MODE_NESTING_DEPTH.", ): reader.pages[0].images[0] + + +def test_handle_flate__image_mode_1(): + data = b"\x00\xe0\x00" + lookup = DecodedStreamObject() + expected_data = [ + (66, 66, 66), (66, 66, 66), (66, 66, 66), + (0, 19, 55), (0, 19, 55), (0, 19, 55), + (66, 66, 66), (66, 66, 66), (66, 66, 66) + ] + + # No trailing data. + lookup.set_data(b"\x42\x42\x42\x00\x13\x37") + result = _handle_flate( + size=(3, 3), + data=data, + mode="1", + color_space=ArrayObject([NameObject("/Indexed"), NameObject("/DeviceRGB"), NumberObject(1), lookup]), + colors=2, + obj_as_text="dummy" + ) + assert expected_data == list(result[0].getdata()) + + # Trailing whitespace. 
+ lookup.set_data(b"\x42\x42\x42\x00\x13\x37 \x0a") + result = _handle_flate( + size=(3, 3), + data=data, + mode="1", + color_space=ArrayObject([NameObject("/Indexed"), NameObject("/DeviceRGB"), NumberObject(1), lookup]), + colors=2, + obj_as_text="dummy" + ) + assert expected_data == list(result[0].getdata()) + + # Trailing non-whitespace character. + lookup.set_data(b"\x42\x42\x42\x00\x13\x37\x12") + with pytest.raises(PdfReadError, match=r"^Too many lookup values: Expected 6, got 7\.$"): + _handle_flate( + size=(3, 3), + data=data, + mode="1", + color_space=ArrayObject([NameObject("/Indexed"), NameObject("/DeviceRGB"), NumberObject(1), lookup]), + colors=2, + obj_as_text="dummy" + ) + + # Not enough lookup data. + lookup.set_data(b"\x42\x42\x42\x00\x13") + with pytest.raises(PdfReadError, match=r"^Not enough lookup values: Expected 6, got 5\.$"): + _handle_flate( + size=(3, 3), + data=data, + mode="1", + color_space=ArrayObject([NameObject("/Indexed"), NameObject("/DeviceRGB"), NumberObject(1), lookup]), + colors=2, + obj_as_text="dummy" + )