Skip to content

Commit

Permalink
Fixing document outline encoding - close #458 (#463)
Browse files Browse the repository at this point in the history
  • Loading branch information
Lucas-C authored Jun 30, 2022
1 parent be3ad07 commit 22491d3
Show file tree
Hide file tree
Showing 22 changed files with 77 additions and 18 deletions.
5 changes: 3 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,10 @@ This can also be enabled programmatically with `warnings.simplefilter('default',

## [2.5.6] - not released yet
### Changed
- the [svg.path](https://pypi.org/project/svg.path/) package was added as dependency to better parse SVG files
- the [svg.path](https://pypi.org/project/svg.path/) package was added as a dependency to better parse SVG images
### Fixed
- properly parsing single-digit arguments in SVG paths - _cf._ [#450](https://github.com/PyFPDF/fpdf2/issues/450)
- document outline encoding: it was broken for some non-latin text, for example when using a Thai font - _cf._ [#458](https://github.com/PyFPDF/fpdf2/issues/458)

## [2.5.5] - 2022-06-17
### Added
Expand All @@ -35,7 +36,7 @@ This can also be enabled programmatically with `warnings.simplefilter('default',
- allowing to change appearance of [highlight annotations](https://pyfpdf.github.io/fpdf2/fpdf/fpdf.html#fpdf.fpdf.FPDF.highlight) by specifying a [`TextMarkupType`](https://pyfpdf.github.io/fpdf2/fpdf/enums.html#fpdf.enums.TextMarkupType)
- documentation on how to control objects transparency: [link to docs](https://pyfpdf.github.io/fpdf2/Transparency.html)
- documentation on how to create tables and charts using [pandas](https://pandas.pydata.org/) DataFrames: [link to docs](https://pyfpdf.github.io/fpdf2/Maths.html), thanks to @iwayankurniawan
- added argument `round_corners` to `FPDF.rect()` that allows to draw rectangles with round corners: [link to docs](https://pyfpdf.github.io/fpdf2/Shapes.html#rectangle)
- added argument `round_corners` to `FPDF.rect()` that allows drawing rectangles with rounded corners: [link to docs](https://pyfpdf.github.io/fpdf2/Shapes.html#rectangle) - thanks to @gonzalobarbaran
### Fixed
- support for `"x"` & `"y"` attributes in SVG `<use>` tags - _cf._ [#446](https://github.com/PyFPDF/fpdf2/issues/446)
- `CreationDate` of PDFs generated, that was broken - _cf._ [#451](https://github.com/PyFPDF/fpdf2/issues/451)
Expand Down
1 change: 1 addition & 0 deletions fpdf/fpdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -4191,6 +4191,7 @@ def _putresourcedict(self):
self._out(f"/F{idx} {pdf_ref(n)}")
self._out(">>")

# if self.images: [TODO] uncomment this & indent the next 3 lines in order to save 15 bytes / page without image
self._out("/XObject <<")
self._putxobjectdict()
self._out(">>")
Expand Down
12 changes: 12 additions & 0 deletions fpdf/syntax.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@
remains something to be looked into.
"""
from abc import ABC
from binascii import hexlify
from codecs import BOM_UTF16_BE
import re

from .util import object_id_for_page
Expand Down Expand Up @@ -232,7 +234,17 @@ def camel_case(snake_case):


class PDFString(str):
    """
    A `str` that can serialize itself as a PDF string object.

    Two output forms are supported, selected by `USE_HEX_ENCODING`:
    a hexadecimal string (`<feff...>`) or a literal string (`(...)`).
    """

    USE_HEX_ENCODING = True
    """
    Setting this to False can reduce the encoded strings size.
    The hexadecimal form only ever emits the characters 0-9 and a-f,
    so it cannot clash with PDF string delimiters - cf. issue #458
    """

    # Characters that have a special meaning inside a PDF literal string and
    # must therefore be backslash-escaped: the escape character itself, both
    # delimiters, and carriage return (an unescaped CR is an end-of-line marker).
    _LITERAL_ESCAPES = str.maketrans(
        {"\\": "\\\\", "(": "\\(", ")": "\\)", "\r": "\\r"}
    )

    def serialize(self):
        """
        Return this string formatted as a PDF string object.

        Returns:
            `<hex digits>` when `USE_HEX_ENCODING` is true: the UTF-16BE bytes
            of the text, prefixed by the big-endian byte order mark.
            Otherwise `(...)`: the bytes produced by the "UTF-16" codec, mapped
            one-to-one to latin-1 characters, with delimiters escaped.
        """
        if self.USE_HEX_ENCODING:
            # Using the "Hexadecimal String" format defined in the PDF spec:
            hex_str = hexlify(BOM_UTF16_BE + self.encode("utf-16-be")).decode("latin-1")
            return f"<{hex_str}>"
        # Using the "Literal String" format: every UTF-16 byte becomes one
        # latin-1 character. Any of those bytes can be 0x28, 0x29, 0x5C or 0x0D
        # (e.g. Thai U+0E28 / U+0E29), so they must be escaped - otherwise the
        # string is unbalanced or altered when read back - cf. issue #458
        # NOTE(review): the "UTF-16" codec writes its BOM in the platform's
        # native byte order; kept as-is because existing tests rely on it.
        literal = self.encode("UTF-16").decode("latin-1")
        return f"({literal.translate(self._LITERAL_ESCAPES)})"


Expand Down
Binary file modified test/html/html_custom_heading_sizes.pdf
Binary file not shown.
Binary file modified test/html/html_features.pdf
Binary file not shown.
Binary file modified test/html/html_heading_hebrew.pdf
Binary file not shown.
Binary file modified test/html/html_headings_line_height.pdf
Binary file not shown.
Binary file modified test/image/alt_text/alt_text_and_title.pdf
Binary file not shown.
Binary file modified test/image/alt_text/test_alt_text_on_two_pages.pdf
Binary file not shown.
Binary file modified test/link_alt_text.pdf
Binary file not shown.
Binary file modified test/outline/2_pages_outline.pdf
Binary file not shown.
Binary file modified test/outline/custom_HTML2FPDF.pdf
Binary file not shown.
Binary file modified test/outline/html_toc.pdf
Binary file not shown.
Binary file modified test/outline/html_toc_2_pages.pdf
Binary file not shown.
Binary file modified test/outline/html_toc_with_h1_as_2nd_heading.pdf
Binary file not shown.
Binary file modified test/outline/russian_heading.pdf
Binary file not shown.
Binary file modified test/outline/self_refering_outline.pdf
Binary file not shown.
Binary file modified test/outline/simple_outline.pdf
Binary file not shown.
13 changes: 13 additions & 0 deletions test/outline/test_outline.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,19 @@ def test_russian_heading(tmp_path): # issue-320
assert_pdf_equal(pdf, HERE / "russian_heading.pdf", tmp_path)


def test_thai_headings(tmp_path):  # issue-458
    """
    Start one section per page, three with Thai titles and one with an ASCII
    title, then compare the resulting document with the reference PDF.
    """
    headings = (
        "ลักษณะเฉพาะของคุณ",
        "ระดับฮอร์โมนเพศชาย",
        "ระดับฮอร์โมนเพศหญิง",
        "hello",
    )
    pdf = FPDF()
    for heading in headings:
        # Each heading gets its own page:
        pdf.add_page()
        pdf.start_section(heading)
    reference_pdf = HERE / "thai_headings.pdf"
    assert_pdf_equal(pdf, reference_pdf, tmp_path)


def test_self_refering_outline(tmp_path):
"""
Based on Jens Müller talk at NDSS: Processing Dangerous Paths.
Expand Down
58 changes: 45 additions & 13 deletions test/outline/test_outline_serializer.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from fpdf.outline import OutlineSection, serialize_outline
from fpdf.syntax import DestinationXYZ
from fpdf.syntax import DestinationXYZ, PDFString


def test_serialize_outline():
Expand All @@ -18,7 +18,7 @@ def test_serialize_outline():
)
assert (
serialize_outline(sections, first_object_id=6)
== f"""\
== """\
6 0 obj
<<
/Count 2
Expand All @@ -35,15 +35,15 @@ def test_serialize_outline():
/Last 8 0 R
/Next 9 0 R
/Parent 6 0 R
/Title ({'Title 1'.encode('UTF-16').decode('latin-1')})
/Title <feff005400690074006c006500200031>
>>
endobj
8 0 obj
<<
/Count 0
/Dest [5 0 R /XYZ 0 0 null]
/Parent 7 0 R
/Title ({'Subtitle 1.1'.encode('UTF-16').decode('latin-1')})
/Title <feff005300750062007400690074006c006500200031002e0031>
>>
endobj
9 0 obj
Expand All @@ -54,7 +54,7 @@ def test_serialize_outline():
/Last 11 0 R
/Parent 6 0 R
/Prev 7 0 R
/Title ({'Title 2'.encode('UTF-16').decode('latin-1')})
/Title <feff005400690074006c006500200032>
>>
endobj
10 0 obj
Expand All @@ -63,7 +63,7 @@ def test_serialize_outline():
/Dest [9 0 R /XYZ 0 0 null]
/Next 11 0 R
/Parent 9 0 R
/Title ({'Subtitle 2.1'.encode('UTF-16').decode('latin-1')})
/Title <feff005300750062007400690074006c006500200032002e0031>
>>
endobj
11 0 obj
Expand All @@ -72,7 +72,7 @@ def test_serialize_outline():
/Dest [11 0 R /XYZ 0 0 null]
/Parent 9 0 R
/Prev 10 0 R
/Title ({'Subtitle 2.2'.encode('UTF-16').decode('latin-1')})
/Title <feff005300750062007400690074006c006500200032002e0032>
>>
endobj"""
)
Expand All @@ -88,7 +88,7 @@ def test_serialize_outline_with_headless_hierarchy(): # issues 239
)
assert (
serialize_outline(sections, first_object_id=6)
== f"""\
== """\
6 0 obj
<<
/Count 2
Expand All @@ -104,15 +104,15 @@ def test_serialize_outline_with_headless_hierarchy(): # issues 239
/First 8 0 R
/Last 8 0 R
/Parent 6 0 R
/Title ({'?-1'.encode('UTF-16').decode('latin-1')})
/Title <feff003f002d0031>
>>
endobj
8 0 obj
<<
/Count 0
/Dest [5 0 R /XYZ 0 0 null]
/Parent 7 0 R
/Title ({'?-1-1'.encode('UTF-16').decode('latin-1')})
/Title <feff003f002d0031002d0031>
>>
endobj
9 0 obj
Expand All @@ -122,7 +122,7 @@ def test_serialize_outline_with_headless_hierarchy(): # issues 239
/First 10 0 R
/Last 10 0 R
/Parent 6 0 R
/Title ({'1'.encode('UTF-16').decode('latin-1')})
/Title <feff0031>
>>
endobj
10 0 obj
Expand All @@ -132,15 +132,47 @@ def test_serialize_outline_with_headless_hierarchy(): # issues 239
/First 11 0 R
/Last 11 0 R
/Parent 9 0 R
/Title ({'1-1'.encode('UTF-16').decode('latin-1')})
/Title <feff0031002d0031>
>>
endobj
11 0 obj
<<
/Count 0
/Dest [5 0 R /XYZ 0 0 null]
/Parent 10 0 R
/Title ({'1-1-1'.encode('UTF-16').decode('latin-1')})
/Title <feff0031002d0031002d0031>
>>
endobj"""
)


def test_serialize_outline_without_hex_encoding():  # issue-458
    """
    With PDFString.USE_HEX_ENCODING disabled, the outline title must be
    serialized between parentheses instead of as a <hex> string.
    """
    PDFString.USE_HEX_ENCODING = False
    try:
        title_section = OutlineSection(
            "Title", level=0, page_number=1, dest=DestinationXYZ(page=1)
        )
        actual = serialize_outline((title_section,), first_object_id=1)
        # The title as the "UTF-16" codec encodes it, one latin-1 char per byte:
        literal_title = "Title".encode("UTF-16").decode("latin-1")
        expected = f"""\
1 0 obj
<<
/Count 1
/First 2 0 R
/Last 2 0 R
/Type /Outlines
>>
endobj
2 0 obj
<<
/Count 0
/Dest [3 0 R /XYZ 0 0 null]
/Parent 1 0 R
/Title ({literal_title})
>>
endobj"""
        assert actual == expected
    finally:
        # The flag is class-wide: put the default back for the other tests
        PDFString.USE_HEX_ENCODING = True
Binary file added test/outline/thai_headings.pdf
Binary file not shown.
6 changes: 3 additions & 3 deletions test/test_structure_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def test_single_image_structure_tree():
)
assert (
struct_builder.serialize(first_object_id=3)
== f"""\
== """\
3 0 obj
<<
/K [4 0 R]
Expand All @@ -106,12 +106,12 @@ def test_single_image_structure_tree():
endobj
6 0 obj
<<
/Alt ({'Image description'.encode('UTF-16').decode('latin-1')})
/Alt <feff0049006d0061006700650020006400650073006300720069007000740069006f006e>
/K [0]
/P 4 0 R
/Pg 1 0 R
/S /Figure
/T ({'Image title'.encode('UTF-16').decode('latin-1')})
/T <feff0049006d0061006700650020007400690074006c0065>
/Type /StructElem
>>
endobj"""
Expand Down

0 comments on commit 22491d3

Please sign in to comment.