Skip to content

Commit

Permalink
Ensure string characters are escaped when encoding is Utf16BE and fix #…
Browse files Browse the repository at this point in the history
  • Loading branch information
BobLd committed Mar 11, 2024
1 parent ac0276f commit 250362e
Show file tree
Hide file tree
Showing 2 changed files with 145 additions and 33 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
namespace UglyToad.PdfPig.Tests.Integration
{
using System;
using System.IO;
using System.Linq;
using UglyToad.PdfPig.Content;
using UglyToad.PdfPig.Outline;
using UglyToad.PdfPig.Outline.Destinations;
using UglyToad.PdfPig.Writer;
using Xunit;

/// <summary>
/// Round-trip tests for bookmark titles containing non-ASCII characters: a document is
/// built with <see cref="PdfDocumentBuilder"/>, re-opened from the produced bytes, and the
/// bookmark titles read back are compared with the titles that were written.
/// </summary>
public class NonAsciiCharactersBookmarksTests
{
    /// <summary>
    /// Writes one bookmark per space-separated word of <paramref name="words"/> and checks
    /// that the same titles are returned when the generated document is read back.
    /// </summary>
    /// <param name="words">Space-separated bookmark titles; empty entries are dropped.</param>
    [Theory]
    [MemberData(nameof(TestData.TestData_Pass), MemberType = typeof(TestData))]
    [MemberData(nameof(TestData.TestData_Failed), MemberType = typeof(TestData))]
    public void CanGetBookmarks(string words)
    {
        using var builder = new PdfDocumentBuilder();
        builder.AddPage(PageSize.A4);

        // Set bookmark items: every word becomes a top-level node without children.
        var inputs = words.Split(" ", StringSplitOptions.RemoveEmptyEntries);
        builder.Bookmarks = new Bookmarks(inputs.Select(x => new DocumentBookmarkNode(x,
            0,
            new ExplicitDestination(1,
                ExplicitDestinationType.XyzCoordinates,
                ExplicitDestinationCoordinates.Empty),
            Array.Empty<BookmarkNode>())).ToArray());

        // Build PDF data
        var bytes = builder.Build();

        // Read PDF from bytes. And read bookmark data.
        using var doc = PdfDocument.Open(bytes);
        bool isSuccess = doc.TryGetBookmarks(out var bookmarks);

        // Assert
        Assert.True(isSuccess);
        var results = bookmarks.GetNodes().Select(x => x.Title).ToArray();
        Assert.Equivalent(inputs, results);
    }

    /// <summary>
    /// Theory data for <see cref="CanGetBookmarks"/>.
    /// </summary>
    private static class TestData
    {
        // NOTE(review): going by the name, these look like inputs that used to fail
        // the round trip - confirm against the linked issue.
        public static readonly TheoryData<string> TestData_Failed = new TheoryData<string>
        {
            "ШЩＨＩ차岸岩還館小少尚",
            "A Ш Z", // CYRILLIC CAPITAL LETTER
            "AШZ", // CYRILLIC CAPITAL LETTER
            "A Щ Z", // CYRILLIC CAPITAL LETTER
            "AЩZ", // CYRILLIC CAPITAL LETTER
            "Ｈ Ｉ", // FULLWIDTH LATIN CAPITAL LETTERS
            "ＨＩ", // FULLWIDTH LATIN CAPITAL LETTERS
            "차", // HANGUL
            "岸 岩", // KANJI
            "岸岩", // KANJI
            "還 館", // KANJI
            "還館", // KANJI
            "小 少 尚", // KANJI
            "小少尚", // KANJI
        };

        public static readonly TheoryData<string> TestData_Pass = new TheoryData<string>
        {
            // FRENCH Alphabet Diacritics and ligatures
            "É À È Ù Â Ê Î Ô Û Ë Ï Ü Ç Œ Æ",
            "é à è ù â ê î ô û ë ï ü ç œ æ",

            // GREEK Alphabet
            "Α Β Γ Δ Ε Ζ Η Θ Ι Κ Λ Μ Ν Ξ Ο Π Ρ Σ Τ Υ Φ Χ Ψ Ω",
            "α β γ δ ε ζ η θ ι κ λ μ ν ξ ο π ρ σ τ υ φ χ ψ ω",

            // CYRILLIC CAPITAL LETTER
            // "А Б В Г Д Е Ж З И К Л М Н О П Р С Т У Ф Х Ц Ч Ш Щ Ы Э Ю Я",

            // CYRILLIC SMALL LETTER
            "а б в г д е ж з и к л м н о п р с т у ф х ц ч ш щ ы э ю я",

            // HANGUL CHOSEONG
            "ㄱ ㄴ ㄷ ㄹ ㅁ ㅂ ㅅ ㅇ ㅈ ㅊ ㅋ ㅌ ㅍ ㅎ",

            // HANGUL GANADA
            //"가 나 다 라 마 바 사 아 자 차 카 타 파 하",

            // FULLWIDTH LATIN CAPITAL LETTER
            // "Ａ Ｂ Ｃ Ｄ Ｅ Ｆ Ｇ Ｈ Ｉ Ｊ Ｋ Ｌ Ｍ Ｎ Ｏ Ｐ Ｑ Ｒ Ｓ Ｔ Ｕ Ｖ Ｗ Ｘ Ｙ Ｚ",

            // FULLWIDTH LATIN SMALL LETTER
            "ａ ｂ ｃ ｄ ｅ ｆ ｇ ｈ ｉ ｊ ｋ ｌ ｍ ｎ ｏ ｐ ｑ ｒ ｓ ｔ ｕ ｖ ｗ ｘ ｙ ｚ",

            // Halfwidth Katakana
            "ｱ ｲ ｳ ｴ ｵ ｶ ｷ ｸ ｹ ｺ ｻ ｼ ｽ ｾ ｿ ﾀ ﾁ ﾂ ﾃ ﾄ ﾅ ﾆ ﾇ ﾈ ﾉ ﾊ ﾋ ﾌ ﾍ ﾎ ﾏ ﾐ ﾑ ﾒ ﾓ ﾔ ﾕ ﾖ ﾗ ﾘ ﾙ ﾚ ﾛ ﾜ ｦ ﾝ",

            // Fullwidth Katakana
            "ア イ ウ エ オ カ キ ク ケ コ サ シ ス セ ソ タ チ ツ テ ト ナ ニ ヌ ネ ノ ハ ヒ フ ヘ ホ マ ミ ム メ モ ヤ ユ ヨ ラ リ ル レ ロ ワ ヲ ン",

            // Fullwidth Hiragana
            "あ い う え お か き く け こ さ し す せ そ た ち つ て と な に ぬ ね の は ひ ふ へ ほ ま み む め も や ゆ よ ら り る れ ろ わ を ん",

            // Kanji (Surrogate Pair)
            "𩸽 𩹉 𡵅",

            // Emoji
            "🏠 🚗 📝",

            // Emoji (with ZWJ Sequences)
            "👨‍💻 👁‍🗨 😶‍🌫️"
        };
    }
}
}
66 changes: 33 additions & 33 deletions src/UglyToad.PdfPig/Writer/TokenWriter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -529,8 +529,6 @@ protected virtual void WriteStream(StreamToken streamToken, Stream outputStream)
/// <summary>
/// Write string to the stream, with whitespace at the end
/// </summary>
/// <param name="stringToken"></param>
/// <param name="outputStream"></param>
protected virtual void WriteString(StringToken stringToken, Stream outputStream)
{
outputStream.WriteByte(StringStart);
Expand All @@ -541,41 +539,43 @@ protected virtual void WriteString(StringToken stringToken, Stream outputStream)
// have these chars but seems like internally this isn't obeyed (see:
// CanCreateDocumentInformationDictionaryWithNonAsciiCharacters test) and it may
// happen during parsing as well -> switch to unicode
if (stringToken.Data.Any(x => x > 255))

var data = stringToken.Data.ToCharArray();
if (data.Any(x => x > 255))
{
var data = new StringToken(stringToken.Data, StringToken.Encoding.Utf16BE).GetBytes();
outputStream.Write(data, 0, data.Length);
data = new StringToken(stringToken.Data, StringToken.Encoding.Utf16BE)
.GetBytes()
.Select(b => (char)b)
.ToArray();
}
else

int ei;
for (var i = 0; i < data.Length; i++)
{
int ei;
for (var i = 0; i < stringToken.Data.Length; i++)
var c = (int)data[i];
if (c == (int)'(' || c == (int)')') // wastes a little space if escaping not needed but better than forward searching
{
outputStream.WriteByte((byte)'\\');
outputStream.WriteByte((byte)c);
}
else if ((ei = Array.IndexOf(EscapeNeeded, c)) > -1)
{
outputStream.WriteByte((byte)'\\');
outputStream.WriteByte((byte)Escaped[ei]);
}
else if (c < 32 || c > 126) // non printable
{
var b3 = c / 64;
var b2 = (c - b3 * 64) / 8;
var b1 = c % 8;
outputStream.WriteByte((byte)'\\');
outputStream.WriteByte((byte)(b3 + '0'));
outputStream.WriteByte((byte)(b2 + '0'));
outputStream.WriteByte((byte)(b1 + '0'));
}
else
{
var c = (int)stringToken.Data[i];
if (c == (int)'(' || c == (int)')') // wastes a little space if escaping not needed but better than forward searching
{
outputStream.WriteByte((byte)'\\');
outputStream.WriteByte((byte)c);
}
else if ((ei = Array.IndexOf(EscapeNeeded, c)) > -1)
{
outputStream.WriteByte((byte)'\\');
outputStream.WriteByte((byte)Escaped[ei]);
}
else if (c < 32 || c > 126) // non printable
{
var b3 = c / 64;
var b2 = (c - b3 * 64) / 8;
var b1 = c % 8;
outputStream.WriteByte((byte)'\\');
outputStream.WriteByte((byte)(b3 + '0'));
outputStream.WriteByte((byte)(b2 + '0'));
outputStream.WriteByte((byte)(b1 + '0'));
}
else
{
outputStream.WriteByte((byte)c);
}
outputStream.WriteByte((byte)c);
}
}
}
Expand Down

0 comments on commit 250362e

Please sign in to comment.