Skip to content

Commit

Permalink
Allow lenient parsing in DictionaryTokenizer and fix #791
Browse files Browse the repository at this point in the history
  • Loading branch information
BobLd committed Mar 11, 2024
1 parent 250362e commit acfe8b5
Show file tree
Hide file tree
Showing 8 changed files with 80 additions and 17 deletions.
49 changes: 49 additions & 0 deletions src/UglyToad.PdfPig.Tests/Integration/DocumentInformationTests.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
namespace UglyToad.PdfPig.Tests.Integration
{
using PdfPig.Core;
using PdfPig.Tokens;
using Xunit;

Expand Down Expand Up @@ -33,5 +34,53 @@ public void CanReadDocumentInformation()
Assert.Equal("Another Property Value", ((StringToken)valueToken2).Data);
}
}

[Fact]
public void CanReadInvalidDocumentInformation()
{
    // The information dictionary of the test document is malformed: free text
    // ("Collaborative Neural Rendering ...") sits where a name key is expected.
    // Raw content of the dictionary, for reference:
    /*
    <<
    /Producer (pdfTeX-1.40.21)
    Collaborative Neural Rendering Using Anime Character Sheets /Author()/Title()/Subject()/Creator(LaTeX with hyperref)/Keywords()
    /CreationDate (D:20230418010134Z)
    /ModDate (D:20230418010134Z)
    /Trapped /False
    /PTEX.Fullbanner (This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2)
    >>
    */
    const string expectedDate = "D:20230418010134Z";
    const string expectedBanner = "This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2";

    var path = IntegrationHelpers.GetSpecificTestDocumentPath("invalid-pdf-structure-pdfminer-entire-doc.pdf");

    // Lenient parsing on: the document opens and every well-formed entry is readable.
    using (var document = PdfDocument.Open(path))
    {
        var info = document.Information;

        Assert.Equal("LaTeX with hyperref", info.Creator);
        Assert.Equal(string.Empty, info.Keywords);
        Assert.Equal("pdfTeX-1.40.21", info.Producer);
        Assert.Equal(string.Empty, info.Subject);
        Assert.Equal(string.Empty, info.Title);
        Assert.Equal(string.Empty, info.Author);
        Assert.Equal(expectedDate, info.CreationDate);
        Assert.Equal(expectedDate, info.ModifiedDate);

        var dictionary = info.DocumentInformationDictionary;

        // Entries that follow the stray text must still be present in the raw dictionary.
        var trappedKey = NameToken.Create("Trapped");
        Assert.True(dictionary.TryGet(trappedKey, out var trappedValue));
        var trapped = Assert.IsType<NameToken>(trappedValue);
        Assert.Equal("False", trapped.Data);

        var bannerKey = NameToken.Create("PTEX.Fullbanner");
        Assert.True(dictionary.TryGet(bannerKey, out var bannerValue));
        var banner = Assert.IsType<StringToken>(bannerValue);
        Assert.Equal(expectedBanner, banner.Data);
    }

    // Lenient parsing off: opening fails on the first token that is not a name key.
    var exception = Assert.Throws<PdfDocumentFormatException>(
        () => PdfDocument.Open(path, ParsingOptions.LenientParsingOff));

    Assert.Equal("Expected name as dictionary key, instead got: Collaborative", exception.Message);
}
}
}
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -499,7 +499,7 @@ private static PdfTokenScanner GetScanner(string s, TestObjectLocationProvider l
var input = StringBytesTestConverter.Convert(s, false);

return new PdfTokenScanner(input.Bytes, locationProvider ?? new TestObjectLocationProvider(),
new TestFilterProvider(), NoOpEncryptionHandler.Instance);
new TestFilterProvider(), NoOpEncryptionHandler.Instance, ParsingOptions.LenientParsingOff);
}

private static IReadOnlyList<ObjectToken> ReadToEnd(PdfTokenScanner scanner)
Expand Down
21 changes: 15 additions & 6 deletions src/UglyToad.PdfPig.Tokenization/DictionaryTokenizer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ internal class DictionaryTokenizer : ITokenizer
{
private readonly bool usePdfDocEncoding;
private readonly IReadOnlyList<NameToken> requiredKeys;
private readonly bool useLenientParsing;

public bool ReadsNextByte { get; } = false;

Expand All @@ -22,10 +23,12 @@ internal class DictionaryTokenizer : ITokenizer
/// Can be provided to recover from errors with missing dictionary end symbols if the
/// set of keys expected in the dictionary are known.
/// </param>
public DictionaryTokenizer(bool usePdfDocEncoding, IReadOnlyList<NameToken> requiredKeys = null)
/// <param name="useLenientParsing">Whether to use lenient parsing.</param>
public DictionaryTokenizer(bool usePdfDocEncoding, IReadOnlyList<NameToken> requiredKeys = null, bool useLenientParsing = false)
{
this.usePdfDocEncoding = usePdfDocEncoding;
this.requiredKeys = requiredKeys;
this.useLenientParsing = useLenientParsing;
}

public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
Expand Down Expand Up @@ -80,7 +83,7 @@ private bool TryTokenizeInternal(byte currentByte, IInputBytes inputBytes, bool
return false;
}

var coreScanner = new CoreTokenScanner(inputBytes, usePdfDocEncoding, ScannerScope.Dictionary);
var coreScanner = new CoreTokenScanner(inputBytes, usePdfDocEncoding, ScannerScope.Dictionary, useLenientParsing: useLenientParsing);

var tokens = new List<IToken>();

Expand All @@ -96,7 +99,7 @@ private bool TryTokenizeInternal(byte currentByte, IInputBytes inputBytes, bool
// Has enough key/values for each required key
if (useRequiredKeys && tokens.Count >= requiredKeys.Count * 2)
{
var proposedDictionary = ConvertToDictionary(tokens);
var proposedDictionary = ConvertToDictionary(tokens, useLenientParsing);

var isAcceptable = true;
foreach (var key in requiredKeys)
Expand All @@ -118,15 +121,14 @@ private bool TryTokenizeInternal(byte currentByte, IInputBytes inputBytes, bool
}
}

var dictionary = ConvertToDictionary(tokens);
var dictionary = ConvertToDictionary(tokens, useLenientParsing);

token = new DictionaryToken(dictionary);

return true;

}

private static Dictionary<NameToken, IToken> ConvertToDictionary(List<IToken> tokens)
private static Dictionary<NameToken, IToken> ConvertToDictionary(List<IToken> tokens, bool useLenientParsing)
{
var result = new Dictionary<NameToken, IToken>();

Expand All @@ -143,6 +145,13 @@ private static Dictionary<NameToken, IToken> ConvertToDictionary(List<IToken> to
continue;
}

if (useLenientParsing)
{
// TODO - Log warning
System.Diagnostics.Debug.WriteLine($"Expected name as dictionary key, instead got: " + token);
continue;
}

throw new PdfDocumentFormatException($"Expected name as dictionary key, instead got: " + token);
}

Expand Down
11 changes: 7 additions & 4 deletions src/UglyToad.PdfPig.Tokenization/Scanner/CoreTokenScanner.cs
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@ public class CoreTokenScanner : ISeekableTokenScanner
private readonly IInputBytes inputBytes;
private readonly bool usePdfDocEncoding;
private readonly List<(byte firstByte, ITokenizer tokenizer)> customTokenizers = new List<(byte, ITokenizer)>();

private readonly bool useLenientParsing;

/// <summary>
/// The offset in the input data at which the <see cref="CurrentToken"/> starts.
/// </summary>
Expand All @@ -52,15 +53,17 @@ public CoreTokenScanner(
IInputBytes inputBytes,
bool usePdfDocEncoding,
ScannerScope scope = ScannerScope.None,
IReadOnlyDictionary<NameToken, IReadOnlyList<NameToken>> namedDictionaryRequiredKeys = null)
IReadOnlyDictionary<NameToken, IReadOnlyList<NameToken>> namedDictionaryRequiredKeys = null,
bool useLenientParsing = false)
{
this.inputBytes = inputBytes ?? throw new ArgumentNullException(nameof(inputBytes));
this.usePdfDocEncoding = usePdfDocEncoding;
this.stringTokenizer = new StringTokenizer(usePdfDocEncoding);
this.arrayTokenizer = new ArrayTokenizer(usePdfDocEncoding);
this.dictionaryTokenizer = new DictionaryTokenizer(usePdfDocEncoding);
this.dictionaryTokenizer = new DictionaryTokenizer(usePdfDocEncoding, useLenientParsing: useLenientParsing);
this.scope = scope;
this.namedDictionaryRequiredKeys = namedDictionaryRequiredKeys;
this.useLenientParsing = useLenientParsing;
}

/// <inheritdoc />
Expand Down Expand Up @@ -140,7 +143,7 @@ public bool MoveNext()
&& CurrentToken is NameToken name
&& namedDictionaryRequiredKeys.TryGetValue(name, out var requiredKeys))
{
tokenizer = new DictionaryTokenizer(usePdfDocEncoding, requiredKeys);
tokenizer = new DictionaryTokenizer(usePdfDocEncoding, requiredKeys, useLenientParsing);
}
}
else
Expand Down
2 changes: 1 addition & 1 deletion src/UglyToad.PdfPig/Parser/PageContentParser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ public IReadOnlyList<IGraphicsStateOperation> Parse(
IInputBytes inputBytes,
ILog log)
{
var scanner = new CoreTokenScanner(inputBytes, false);
var scanner = new CoreTokenScanner(inputBytes, false, useLenientParsing: useLenientParsing);

var precedingTokens = new List<IToken>();
var graphicsStateOperations = new List<IGraphicsStateOperation>();
Expand Down
4 changes: 2 additions & 2 deletions src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ private static PdfDocument Open(IInputBytes inputBytes, ParsingOptions options =
SkipMissingFonts = false
};

var tokenScanner = new CoreTokenScanner(inputBytes, true);
var tokenScanner = new CoreTokenScanner(inputBytes, true, useLenientParsing: options.UseLenientParsing);

var passwords = new List<string>();

Expand Down Expand Up @@ -115,7 +115,7 @@ private static PdfDocument OpenDocument(
// We're ok with this since our intent is to lazily load the cross reference table.
// ReSharper disable once AccessToModifiedClosure
var locationProvider = new ObjectLocationProvider(() => crossReferenceTable, inputBytes);
var pdfScanner = new PdfTokenScanner(inputBytes, locationProvider, filterProvider, NoOpEncryptionHandler.Instance);
var pdfScanner = new PdfTokenScanner(inputBytes, locationProvider, filterProvider, NoOpEncryptionHandler.Instance, parsingOptions);

var crossReferenceStreamParser = new CrossReferenceStreamParser(filterProvider);
var crossReferenceParser = new CrossReferenceParser(parsingOptions.Logger, xrefValidator, crossReferenceStreamParser);
Expand Down
8 changes: 5 additions & 3 deletions src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ internal class PdfTokenScanner : IPdfTokenScanner
private readonly IObjectLocationProvider objectLocationProvider;
private readonly ILookupFilterProvider filterProvider;
private readonly CoreTokenScanner coreTokenScanner;
private readonly ParsingOptions parsingOptions;

private IEncryptionHandler encryptionHandler;
private bool isDisposed;
Expand Down Expand Up @@ -52,13 +53,14 @@ internal class PdfTokenScanner : IPdfTokenScanner
public long Length => coreTokenScanner.Length;

public PdfTokenScanner(IInputBytes inputBytes, IObjectLocationProvider objectLocationProvider, ILookupFilterProvider filterProvider,
IEncryptionHandler encryptionHandler)
IEncryptionHandler encryptionHandler, ParsingOptions parsingOptions)
{
this.inputBytes = inputBytes;
this.objectLocationProvider = objectLocationProvider;
this.filterProvider = filterProvider;
this.encryptionHandler = encryptionHandler;
coreTokenScanner = new CoreTokenScanner(inputBytes, true);
this.parsingOptions = parsingOptions;
coreTokenScanner = new CoreTokenScanner(inputBytes, true, useLenientParsing: parsingOptions.UseLenientParsing);
}

public void UpdateEncryptionHandler(IEncryptionHandler newHandler)
Expand Down Expand Up @@ -815,7 +817,7 @@ private IReadOnlyList<ObjectToken> ParseObjectStream(StreamToken stream, long of
// Read the N integers
var bytes = new ByteArrayInputBytes(stream.Decode(filterProvider, this));

var scanner = new CoreTokenScanner(bytes, true);
var scanner = new CoreTokenScanner(bytes, true, useLenientParsing: parsingOptions.UseLenientParsing);

var objects = new List<Tuple<long, long>>();

Expand Down

0 comments on commit acfe8b5

Please sign in to comment.