From 250362e0152dab0e3d4de496268281768016ece2 Mon Sep 17 00:00:00 2001 From: BobLd <38405645+BobLd@users.noreply.github.com> Date: Sun, 10 Mar 2024 11:59:25 +0000 Subject: [PATCH] Ensure string characters are escaped when encoding is Utf16BE and fix #789 --- .../NonAsciiCharactersBookmarksTests.cs | 112 ++++++++++++++++++ src/UglyToad.PdfPig/Writer/TokenWriter.cs | 66 +++++------ 2 files changed, 145 insertions(+), 33 deletions(-) create mode 100644 src/UglyToad.PdfPig.Tests/Integration/NonAsciiCharactersBookmarksTests.cs diff --git a/src/UglyToad.PdfPig.Tests/Integration/NonAsciiCharactersBookmarksTests.cs b/src/UglyToad.PdfPig.Tests/Integration/NonAsciiCharactersBookmarksTests.cs new file mode 100644 index 000000000..006164157 --- /dev/null +++ b/src/UglyToad.PdfPig.Tests/Integration/NonAsciiCharactersBookmarksTests.cs @@ -0,0 +1,112 @@ +namespace UglyToad.PdfPig.Tests.Integration +{ + using System; + using System.IO; + using System.Linq; + using UglyToad.PdfPig.Content; + using UglyToad.PdfPig.Outline; + using UglyToad.PdfPig.Outline.Destinations; + using UglyToad.PdfPig.Writer; + using Xunit; + + public class NonAsciiCharactersBookmarksTests + { + [Theory] + [MemberData(nameof(TestData.TestData_Pass), MemberType = typeof(TestData))] + [MemberData(nameof(TestData.TestData_Failed), MemberType = typeof(TestData))] + public void CanGetBookmarks(string words) + { + using var builder = new PdfDocumentBuilder(); + builder.AddPage(PageSize.A4); + + // Set bookmark items. + var inputs = words.Split(" ", StringSplitOptions.RemoveEmptyEntries); + builder.Bookmarks = new Bookmarks(inputs.Select(x => new DocumentBookmarkNode(x, + 0, + new ExplicitDestination(1, + ExplicitDestinationType.XyzCoordinates, + ExplicitDestinationCoordinates.Empty), + Array.Empty<BookmarkNode>())).ToArray()); + + // Build PDF data + var bytes = builder.Build(); + + // Read PDF from bytes. And read bookmark data.
+ using var doc = PdfDocument.Open(bytes); + bool isSuccess = doc.TryGetBookmarks(out var bookmarks); + + // Assert + Assert.True(isSuccess); + var results = bookmarks.GetNodes().Select(x => x.Title).ToArray(); + Assert.Equivalent(inputs, results); + } + + private static class TestData + { + public static TheoryData<string> TestData_Failed = new TheoryData<string> + { + "ШЩＨＩ차岸岩還館小少尚", + "A Ш Z", // CYRILLIC CAPITAL LETTER + "AШZ", // CYRILLIC CAPITAL LETTER + "A Щ Z", // CYRILLIC CAPITAL LETTER + "AЩZ", // CYRILLIC CAPITAL LETTER + "Ｈ Ｉ", // FULLWIDTH LATIN CAPITAL LETTER A + "ＨＩ", // FULLWIDTH LATIN CAPITAL LETTER A + "차", // HANGUL + "岸 岩", // KANJI + "岸岩", // KANJI + "還 館", // KANJI + "還館", // KANJI + "小 少 尚", // KANJI + "小少尚", // KANJI + }; + + public static TheoryData<string> TestData_Pass = new TheoryData<string> + { + // FRENCH Alphabet Diacritics and ligatures + "É À È Ù Â Ê Î Ô Û Ë Ï Ü Ç Œ Æ", + "é à è ù â ê î ô û ë ï ü ç œ æ", + + // GREEK Alphabet + "Α Β Γ Δ Ε Ζ Η Θ Ι Κ Λ Μ Ν Ξ Ο Π Ρ Σ Τ Υ Φ Χ Ψ Ω", + "α β γ δ ε ζ η θ ι κ λ μ ν ξ ο π ρ σ τ υ φ χ ψ ω", + + // CYRILLIC CAPITAL LETTER + // "А Б В Г Д Е Ж З И К Л М Н О П Р С Т У Ф Х Ц Ч Ш Щ Ы Э Ю Я", + + // CYRILLIC SMALL LETTER + "а б в г д е ж з и к л м н о п р с т у ф х ц ч ш щ ы э ю я", + + // HANGUL CHOSEONG + "ㄱ ㄴ ㄷ ㄹ ㅁ ㅂ ㅅ ㅇ ㅈ ㅊ ㅋ ㅌ ㅍ ㅎ", + + // HANGUL GANADA + //"가 나 다 라 마 바 사 아 자 차 카 타 파 하", + + // FULLWIDTH LATIN CAPITAL LETTER + // "Ａ Ｂ Ｃ Ｄ Ｅ Ｆ Ｇ Ｈ Ｉ Ｊ Ｋ Ｌ Ｍ Ｎ Ｏ Ｐ Ｑ Ｒ Ｓ Ｔ Ｕ Ｖ Ｗ Ｘ Ｙ Ｚ", + + // FULLWIDTH LATIN SMALL LETTER + "ａ ｂ ｃ ｄ ｅ ｆ ｇ ｈ ｉ ｊ ｋ ｌ ｍ ｎ ｏ ｐ ｑ ｒ ｓ ｔ ｕ ｖ ｗ ｘ ｙ ｚ", + + // Halfwidth Katakana + "ｱ ｲ ｳ ｴ ｵ ｶ ｷ ｸ ｹ ｺ ｻ ｼ ｽ ｾ ｿ ﾀ ﾁ ﾂ ﾃ ﾄ ﾅ ﾆ ﾇ ﾈ ﾉ ﾊ ﾋ ﾌ ﾍ ﾎ ﾏ ﾐ ﾑ ﾒ ﾓ ﾔ ﾕ ﾖ ﾗ ﾘ ﾙ ﾚ ﾛ ﾜ ｦ ﾝ", + + // Fullwidth Katakana + "ア イ ウ エ オ カ キ ク ケ コ サ シ ス セ ソ タ チ ツ テ ト ナ ニ ヌ ネ ノ ハ ヒ フ ヘ ホ マ ミ ム メ モ ヤ ユ ヨ ラ リ ル レ ロ ワ ヲ ン", + + // Fullwidth Hiragana + "あ い う え お か き く け こ さ し す せ そ た ち つ て と な に ぬ ね の は ひ ふ へ ほ ま み む め も や ゆ よ ら り る れ ろ わ を ん", + + // Kanji (Surrogate Pair) + "𩸽 𩹉 𡵅", + + // Emoji + "🏠 🚗 📝", + + // Emoji (with ZWJ
Sequences) + "👨‍💻 👁‍🗨 😶‍🌫️" + }; + } + } +} diff --git a/src/UglyToad.PdfPig/Writer/TokenWriter.cs b/src/UglyToad.PdfPig/Writer/TokenWriter.cs index 4cc7b7d56..8a769ba2e 100644 --- a/src/UglyToad.PdfPig/Writer/TokenWriter.cs +++ b/src/UglyToad.PdfPig/Writer/TokenWriter.cs @@ -529,8 +529,6 @@ protected virtual void WriteStream(StreamToken streamToken, Stream outputStream) /// <summary> /// Write string to the stream, with whitespace at the end /// </summary> - /// <param name="stringToken"></param> - /// <param name="outputStream"></param> protected virtual void WriteString(StringToken stringToken, Stream outputStream) { outputStream.WriteByte(StringStart); @@ -541,41 +539,43 @@ protected virtual void WriteString(StringToken stringToken, Stream outputStream) // have these chars but seems like internally this isn't obeyed (see: // CanCreateDocumentInformationDictionaryWithNonAsciiCharacters test) and it may // happen during parsing as well -> switch to unicode - if (stringToken.Data.Any(x => x > 255)) + + var data = stringToken.Data.ToCharArray(); + if (data.Any(x => x > 255)) { - var data = new StringToken(stringToken.Data, StringToken.Encoding.Utf16BE).GetBytes(); - outputStream.Write(data, 0, data.Length); + data = new StringToken(stringToken.Data, StringToken.Encoding.Utf16BE) + .GetBytes() + .Select(b => (char)b) + .ToArray(); } - else + + int ei; + for (var i = 0; i < data.Length; i++) { - int ei; - for (var i = 0; i < stringToken.Data.Length; i++) + var c = (int)data[i]; + if (c == (int)'(' || c == (int)')') // wastes a little space if escaping not needed but better than forward searching + { + outputStream.WriteByte((byte)'\\'); + outputStream.WriteByte((byte)c); + } + else if ((ei = Array.IndexOf(EscapeNeeded, c)) > -1) + { + outputStream.WriteByte((byte)'\\'); + outputStream.WriteByte((byte)Escaped[ei]); + } + else if (c < 32 || c > 126) // non printable + { + var b3 = c / 64; + var b2 = (c - b3 * 64) / 8; + var b1 = c % 8; + outputStream.WriteByte((byte)'\\'); + outputStream.WriteByte((byte)(b3 + '0')); + outputStream.WriteByte((byte)(b2 + '0'));
+ outputStream.WriteByte((byte)(b1 + '0')); + } + else { - var c = (int)stringToken.Data[i]; - if (c == (int)'(' || c == (int)')') // wastes a little space if escaping not needed but better than forward searching - { - outputStream.WriteByte((byte)'\\'); - outputStream.WriteByte((byte)c); - } - else if ((ei = Array.IndexOf(EscapeNeeded, c)) > -1) - { - outputStream.WriteByte((byte)'\\'); - outputStream.WriteByte((byte)Escaped[ei]); - } - else if (c < 32 || c > 126) // non printable - { - var b3 = c / 64; - var b2 = (c - b3 * 64) / 8; - var b1 = c % 8; - outputStream.WriteByte((byte)'\\'); - outputStream.WriteByte((byte)(b3 + '0')); - outputStream.WriteByte((byte)(b2 + '0')); - outputStream.WriteByte((byte)(b1 + '0')); - } - else - { - outputStream.WriteByte((byte)c); - } + outputStream.WriteByte((byte)c); } } }