Fixing document outline encoding - close #458 (#463)

py-pdf · Jun 30, 2022 · 22491d3 · 22491d3
1 parent be3ad07
commit 22491d3
Show file tree

Hide file tree

Showing 22 changed files with 77 additions and 18 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -18,9 +18,10 @@ This can also be enabled programmatically with `warnings.simplefilter('default',
 
 ## [2.5.6] - not released yet
 ### Changed
-- the [svg.path](https://pypi.org/project/svg.path/) package was added as dependency to better parse SVG files
+- the [svg.path](https://pypi.org/project/svg.path/) package was added as a dependency to better parse SVG images
 ### Fixed
 - properly parsing single-digits arguments in SVG paths - _cf._ [#450](https://github.com/PyFPDF/fpdf2/issues/450)
+- document outline encoding: it was found to be broken when using a Thai font - _cf._ [#458](https://github.com/PyFPDF/fpdf2/issues/458)
 
 ## [2.5.5] - 2022-06-17
 ### Added
@@ -35,7 +36,7 @@ This can also be enabled programmatically with `warnings.simplefilter('default',
 - allowing to change appearance of [highlight annotations](https://pyfpdf.github.io/fpdf2/fpdf/fpdf.html#fpdf.fpdf.FPDF.highlight) by specifying a [`TextMarkupType`](https://pyfpdf.github.io/fpdf2/fpdf/enums.html#fpdf.enums.TextMarkupType)
 - documentation on how to control objects transparency: [link to docs](https://pyfpdf.github.io/fpdf2/Transparency.html)
 - documentation on how to create tables and charts using [pandas](https://pandas.pydata.org/) DataFrames: [link to docs](https://pyfpdf.github.io/fpdf2/Maths.html), thanks to @iwayankurniawan
-- added argument `round_corners` to `FPDF.rect()` that allows to draw rectangles with round corners: [link to docs](https://pyfpdf.github.io/fpdf2/Shapes.html#rectangle)
+- added argument `round_corners` to `FPDF.rect()` that allows drawing rectangles with round corners: [link to docs](https://pyfpdf.github.io/fpdf2/Shapes.html#rectangle) - thanks to @gonzalobarbaran
 ### Fixed
 - support for `"x"` & `"y"` attributes in SVG `<use>` tags - _cf._ [#446](https://github.com/PyFPDF/fpdf2/issues/446)
 - `CreationDate` of PDFs generated, that was broken - _cf._ [#451](https://github.com/PyFPDF/fpdf2/issues/451)

diff --git a/fpdf/fpdf.py b/fpdf/fpdf.py
@@ -4191,6 +4191,7 @@ def _putresourcedict(self):
             self._out(f"/F{idx} {pdf_ref(n)}")
         self._out(">>")
 
+        # TODO: guard the next 3 lines with "if self.images:" (and indent them) in order to save 15 bytes per page without image
         self._out("/XObject <<")
         self._putxobjectdict()
         self._out(">>")

diff --git a/fpdf/syntax.py b/fpdf/syntax.py
@@ -67,6 +67,8 @@
 remains something to be looked into.
 """
 from abc import ABC
+from binascii import hexlify
+from codecs import BOM_UTF16_BE
 import re
 
 from .util import object_id_for_page
@@ -232,7 +234,17 @@ def camel_case(snake_case):
 
 
 class PDFString(str):
+    USE_HEX_ENCODING = True
+    """
+    Setting this to False can reduce the encoded strings size,
+    but then there can be a risk of badly encoding some unicode strings - cf. issue #458
+    """
+
     def serialize(self):
+        if self.USE_HEX_ENCODING:
+            # Using the "Hexadecimal String" format defined in the PDF spec:
+            hex_str = hexlify(BOM_UTF16_BE + self.encode("utf-16-be")).decode("latin-1")
+            return f"<{hex_str}>"
         return f'({self.encode("UTF-16").decode("latin-1")})'
 
 

diff --git a/test/html/html_custom_heading_sizes.pdf b/test/html/html_custom_heading_sizes.pdf
diff --git a/test/html/html_features.pdf b/test/html/html_features.pdf
diff --git a/test/html/html_heading_hebrew.pdf b/test/html/html_heading_hebrew.pdf
diff --git a/test/html/html_headings_line_height.pdf b/test/html/html_headings_line_height.pdf
diff --git a/test/image/alt_text/alt_text_and_title.pdf b/test/image/alt_text/alt_text_and_title.pdf
diff --git a/test/image/alt_text/test_alt_text_on_two_pages.pdf b/test/image/alt_text/test_alt_text_on_two_pages.pdf
diff --git a/test/link_alt_text.pdf b/test/link_alt_text.pdf
diff --git a/test/outline/2_pages_outline.pdf b/test/outline/2_pages_outline.pdf
diff --git a/test/outline/custom_HTML2FPDF.pdf b/test/outline/custom_HTML2FPDF.pdf
diff --git a/test/outline/html_toc.pdf b/test/outline/html_toc.pdf
diff --git a/test/outline/html_toc_2_pages.pdf b/test/outline/html_toc_2_pages.pdf
diff --git a/test/outline/html_toc_with_h1_as_2nd_heading.pdf b/test/outline/html_toc_with_h1_as_2nd_heading.pdf
diff --git a/test/outline/russian_heading.pdf b/test/outline/russian_heading.pdf
diff --git a/test/outline/self_refering_outline.pdf b/test/outline/self_refering_outline.pdf
diff --git a/test/outline/simple_outline.pdf b/test/outline/simple_outline.pdf
diff --git a/test/outline/test_outline.py b/test/outline/test_outline.py
@@ -187,6 +187,19 @@ def test_russian_heading(tmp_path):  # issue-320
     assert_pdf_equal(pdf, HERE / "russian_heading.pdf", tmp_path)
 
 
+def test_thai_headings(tmp_path):  # issue-458
+    pdf = FPDF()
+    for txt in [
+        "ลักษณะเฉพาะของคุณ",
+        "ระดับฮอร์โมนเพศชาย",
+        "ระดับฮอร์โมนเพศหญิง",
+        "hello",
+    ]:
+        pdf.add_page()
+        pdf.start_section(txt)
+    assert_pdf_equal(pdf, HERE / "thai_headings.pdf", tmp_path)
+
+
 def test_self_refering_outline(tmp_path):
     """
     Based on Jens Müller talk at NDSS: Processing Dangerous Paths.

diff --git a/test/outline/test_outline_serializer.py b/test/outline/test_outline_serializer.py
@@ -1,5 +1,5 @@
 from fpdf.outline import OutlineSection, serialize_outline
-from fpdf.syntax import DestinationXYZ
+from fpdf.syntax import DestinationXYZ, PDFString
 
 
 def test_serialize_outline():
@@ -18,7 +18,7 @@ def test_serialize_outline():
     )
     assert (
         serialize_outline(sections, first_object_id=6)
-        == f"""\
+        == """\
 6 0 obj
 <<
 /Count 2
@@ -35,15 +35,15 @@ def test_serialize_outline():
 /Last 8 0 R
 /Next 9 0 R
 /Parent 6 0 R
-/Title ({'Title 1'.encode('UTF-16').decode('latin-1')})
+/Title <feff005400690074006c006500200031>
 >>
 endobj
 8 0 obj
 <<
 /Count 0
 /Dest [5 0 R /XYZ 0 0 null]
 /Parent 7 0 R
-/Title ({'Subtitle 1.1'.encode('UTF-16').decode('latin-1')})
+/Title <feff005300750062007400690074006c006500200031002e0031>
 >>
 endobj
 9 0 obj
@@ -54,7 +54,7 @@ def test_serialize_outline():
 /Last 11 0 R
 /Parent 6 0 R
 /Prev 7 0 R
-/Title ({'Title 2'.encode('UTF-16').decode('latin-1')})
+/Title <feff005400690074006c006500200032>
 >>
 endobj
 10 0 obj
@@ -63,7 +63,7 @@ def test_serialize_outline():
 /Dest [9 0 R /XYZ 0 0 null]
 /Next 11 0 R
 /Parent 9 0 R
-/Title ({'Subtitle 2.1'.encode('UTF-16').decode('latin-1')})
+/Title <feff005300750062007400690074006c006500200032002e0031>
 >>
 endobj
 11 0 obj
@@ -72,7 +72,7 @@ def test_serialize_outline():
 /Dest [11 0 R /XYZ 0 0 null]
 /Parent 9 0 R
 /Prev 10 0 R
-/Title ({'Subtitle 2.2'.encode('UTF-16').decode('latin-1')})
+/Title <feff005300750062007400690074006c006500200032002e0032>
 >>
 endobj"""
     )
@@ -88,7 +88,7 @@ def test_serialize_outline_with_headless_hierarchy():  # issues 239
     )
     assert (
         serialize_outline(sections, first_object_id=6)
-        == f"""\
+        == """\
 6 0 obj
 <<
 /Count 2
@@ -104,15 +104,15 @@ def test_serialize_outline_with_headless_hierarchy():  # issues 239
 /First 8 0 R
 /Last 8 0 R
 /Parent 6 0 R
-/Title ({'?-1'.encode('UTF-16').decode('latin-1')})
+/Title <feff003f002d0031>
 >>
 endobj
 8 0 obj
 <<
 /Count 0
 /Dest [5 0 R /XYZ 0 0 null]
 /Parent 7 0 R
-/Title ({'?-1-1'.encode('UTF-16').decode('latin-1')})
+/Title <feff003f002d0031002d0031>
 >>
 endobj
 9 0 obj
@@ -122,7 +122,7 @@ def test_serialize_outline_with_headless_hierarchy():  # issues 239
 /First 10 0 R
 /Last 10 0 R
 /Parent 6 0 R
-/Title ({'1'.encode('UTF-16').decode('latin-1')})
+/Title <feff0031>
 >>
 endobj
 10 0 obj
@@ -132,15 +132,47 @@ def test_serialize_outline_with_headless_hierarchy():  # issues 239
 /First 11 0 R
 /Last 11 0 R
 /Parent 9 0 R
-/Title ({'1-1'.encode('UTF-16').decode('latin-1')})
+/Title <feff0031002d0031>
 >>
 endobj
 11 0 obj
 <<
 /Count 0
 /Dest [5 0 R /XYZ 0 0 null]
 /Parent 10 0 R
-/Title ({'1-1-1'.encode('UTF-16').decode('latin-1')})
+/Title <feff0031002d0031002d0031>
 >>
 endobj"""
     )
+
+
+def test_serialize_outline_without_hex_encoding():  # issue-458
+    PDFString.USE_HEX_ENCODING = False
+    try:
+        sections = (
+            OutlineSection(
+                "Title", level=0, page_number=1, dest=DestinationXYZ(page=1)
+            ),
+        )
+        assert (
+            serialize_outline(sections, first_object_id=1)
+            == f"""\
+1 0 obj
+<<
+/Count 1
+/First 2 0 R
+/Last 2 0 R
+/Type /Outlines
+>>
+endobj
+2 0 obj
+<<
+/Count 0
+/Dest [3 0 R /XYZ 0 0 null]
+/Parent 1 0 R
+/Title ({'Title'.encode('UTF-16').decode('latin-1')})
+>>
+endobj"""
+        )
+    finally:
+        PDFString.USE_HEX_ENCODING = True  # restore default value
diff --git a/test/outline/thai_headings.pdf b/test/outline/thai_headings.pdf
diff --git a/test/test_structure_tree.py b/test/test_structure_tree.py
@@ -83,7 +83,7 @@ def test_single_image_structure_tree():
     )
     assert (
         struct_builder.serialize(first_object_id=3)
-        == f"""\
+        == """\
 3 0 obj
 <<
 /K [4 0 R]
@@ -106,12 +106,12 @@ def test_single_image_structure_tree():
 endobj
 6 0 obj
 <<
-/Alt ({'Image description'.encode('UTF-16').decode('latin-1')})
+/Alt <feff0049006d0061006700650020006400650073006300720069007000740069006f006e>
 /K [0]
 /P 4 0 R
 /Pg 1 0 R
 /S /Figure
-/T ({'Image title'.encode('UTF-16').decode('latin-1')})
+/T <feff0049006d0061006700650020007400690074006c0065>
 /Type /StructElem
 >>
 endobj"""