From f973c0c75a871de9974bfa8759c95e8f303a62b0 Mon Sep 17 00:00:00 2001 From: Adam Johnson Date: Wed, 14 Jun 2023 20:19:42 +0100 Subject: [PATCH] BUG: Support UTF-16-LE Strings Fixes #1838. --- pypdf/generic/_utils.py | 2 +- tests/test_generic.py | 16 ++++++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/pypdf/generic/_utils.py b/pypdf/generic/_utils.py index 695736769..3d8746570 100644 --- a/pypdf/generic/_utils.py +++ b/pypdf/generic/_utils.py @@ -144,7 +144,7 @@ def create_string_object( return TextStringObject(string.decode(forced_encoding)) else: try: - if string.startswith(codecs.BOM_UTF16_BE): + if string.startswith((codecs.BOM_UTF16_BE, codecs.BOM_UTF16_LE)): retval = TextStringObject(string.decode("utf-16")) retval.autodetect_utf16 = True return retval diff --git a/tests/test_generic.py b/tests/test_generic.py index 80ee76b66..255fe74bc 100644 --- a/tests/test_generic.py +++ b/tests/test_generic.py @@ -1031,6 +1031,22 @@ def test_indirect_object_invalid_read(): assert exc.value.args[0] == "Error reading indirect object reference at byte 0x5" +def test_create_string_object_utf16be_bom(): + result = create_string_object( + b"\xfe\xff\x00P\x00a\x00p\x00e\x00r\x00P\x00o\x00r\x00t\x00 \x001\x004\x00\x00" + ) + assert result == "PaperPort 14\x00" + assert result.autodetect_utf16 is True + + +def test_create_string_object_utf16le_bom(): + result = create_string_object( + b"\xff\xfeP\x00a\x00p\x00e\x00r\x00P\x00o\x00r\x00t\x00 \x001\x004\x00\x00\x00" + ) + assert result == "PaperPort 14\x00" + assert result.autodetect_utf16 is True + + def test_create_string_object_force(): assert create_string_object(b"Hello World", []) == "Hello World" assert create_string_object(b"Hello World", {72: "A"}) == "Aello World"