diff --git a/src/UglyToad.PdfPig.Tests/Integration/DocumentInformationTests.cs b/src/UglyToad.PdfPig.Tests/Integration/DocumentInformationTests.cs
index a25af2bc1..84f8e7159 100644
--- a/src/UglyToad.PdfPig.Tests/Integration/DocumentInformationTests.cs
+++ b/src/UglyToad.PdfPig.Tests/Integration/DocumentInformationTests.cs
@@ -1,5 +1,6 @@
 namespace UglyToad.PdfPig.Tests.Integration
 {
+    using PdfPig.Core;
     using PdfPig.Tokens;
     using Xunit;
 
@@ -33,5 +34,53 @@ public void CanReadDocumentInformation()
                 Assert.Equal("Another Property Value", ((StringToken)valueToken2).Data);
             }
         }
+
+        [Fact]
+        public void CanReadInvalidDocumentInformation()
+        {
+            var path = IntegrationHelpers.GetSpecificTestDocumentPath("invalid-pdf-structure-pdfminer-entire-doc.pdf");
+
+            /*
+             <<
+             /Producer (pdfTeX-1.40.21)
+             Collaborative Neural Rendering Using Anime Character Sheets /Author()/Title()/Subject()/Creator(LaTeX with hyperref)/Keywords()
+             /CreationDate (D:20230418010134Z)
+             /ModDate (D:20230418010134Z)
+             /Trapped /False
+             /PTEX.Fullbanner (This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2)
+             >>
+             */
+
+            // Lenient Parsing On -> can process
+            using (var document = PdfDocument.Open(path))
+            {
+                var information = document.Information;
+
+                Assert.Equal("LaTeX with hyperref", information.Creator);
+                Assert.Equal("", information.Keywords);
+                Assert.Equal("pdfTeX-1.40.21", information.Producer);
+                Assert.Equal("", information.Subject);
+                Assert.Equal("", information.Title);
+                Assert.Equal("", information.Author);
+                Assert.Equal("D:20230418010134Z", information.CreationDate);
+                Assert.Equal("D:20230418010134Z", information.ModifiedDate);
+
+                var infoDictionary = information.DocumentInformationDictionary;
+
+                var nameToken = NameToken.Create("Trapped");
+                Assert.True(infoDictionary.TryGet(nameToken, out var valueToken));
+                Assert.IsType<NameToken>(valueToken);
+                Assert.Equal("False", ((NameToken)valueToken).Data);
+
+                nameToken = NameToken.Create("PTEX.Fullbanner");
+                Assert.True(infoDictionary.TryGet(nameToken, out var valueToken2));
+                Assert.IsType<StringToken>(valueToken2);
+                Assert.Equal("This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2", ((StringToken)valueToken2).Data);
+            }
+
+            // Lenient Parsing Off -> throws
+            var ex = Assert.Throws<PdfDocumentFormatException>(() => PdfDocument.Open(path, ParsingOptions.LenientParsingOff));
+            Assert.Equal("Expected name as dictionary key, instead got: Collaborative", ex.Message);
+        }
     }
 }
diff --git a/src/UglyToad.PdfPig.Tests/Integration/SpecificTestDocuments/invalid-pdf-structure-pdfminer-entire-doc.pdf b/src/UglyToad.PdfPig.Tests/Integration/SpecificTestDocuments/invalid-pdf-structure-pdfminer-entire-doc.pdf
new file mode 100644
index 000000000..ad9efe7e7
Binary files /dev/null and b/src/UglyToad.PdfPig.Tests/Integration/SpecificTestDocuments/invalid-pdf-structure-pdfminer-entire-doc.pdf differ
diff --git a/src/UglyToad.PdfPig.Tests/Tokenization/Scanner/PdfTokenScannerTests.cs b/src/UglyToad.PdfPig.Tests/Tokenization/Scanner/PdfTokenScannerTests.cs
index 4d8bf7808..06cdcc535 100644
--- a/src/UglyToad.PdfPig.Tests/Tokenization/Scanner/PdfTokenScannerTests.cs
+++ b/src/UglyToad.PdfPig.Tests/Tokenization/Scanner/PdfTokenScannerTests.cs
@@ -499,7 +499,7 @@ private static PdfTokenScanner GetScanner(string s, TestObjectLocationProvider l
             var input = StringBytesTestConverter.Convert(s, false);
 
             return new PdfTokenScanner(input.Bytes, locationProvider ?? new TestObjectLocationProvider(),
-                new TestFilterProvider(), NoOpEncryptionHandler.Instance);
+                new TestFilterProvider(), NoOpEncryptionHandler.Instance, ParsingOptions.LenientParsingOff);
         }
 
         private static IReadOnlyList<IToken> ReadToEnd(PdfTokenScanner scanner)
diff --git a/src/UglyToad.PdfPig.Tokenization/DictionaryTokenizer.cs b/src/UglyToad.PdfPig.Tokenization/DictionaryTokenizer.cs
index e715db989..dd52d8dd8 100644
--- a/src/UglyToad.PdfPig.Tokenization/DictionaryTokenizer.cs
+++ b/src/UglyToad.PdfPig.Tokenization/DictionaryTokenizer.cs
@@ -9,6 +9,7 @@ internal class DictionaryTokenizer : ITokenizer
     {
         private readonly bool usePdfDocEncoding;
         private readonly IReadOnlyList<NameToken> requiredKeys;
+        private readonly bool useLenientParsing;
 
         public bool ReadsNextByte { get; } = false;
 
@@ -22,10 +23,12 @@ internal class DictionaryTokenizer : ITokenizer
         /// Can be provided to recover from errors with missing dictionary end symbols if the
         /// set of keys expected in the dictionary are known.
         /// </param>
-        public DictionaryTokenizer(bool usePdfDocEncoding, IReadOnlyList<NameToken> requiredKeys = null)
+        /// <param name="useLenientParsing">Whether to use lenient parsing.</param>
+        public DictionaryTokenizer(bool usePdfDocEncoding, IReadOnlyList<NameToken> requiredKeys = null, bool useLenientParsing = false)
         {
             this.usePdfDocEncoding = usePdfDocEncoding;
             this.requiredKeys = requiredKeys;
+            this.useLenientParsing = useLenientParsing;
         }
 
         public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
@@ -80,7 +83,7 @@ private bool TryTokenizeInternal(byte currentByte, IInputBytes inputBytes, bool
                 return false;
             }
 
-            var coreScanner = new CoreTokenScanner(inputBytes, usePdfDocEncoding, ScannerScope.Dictionary);
+            var coreScanner = new CoreTokenScanner(inputBytes, usePdfDocEncoding, ScannerScope.Dictionary, useLenientParsing: useLenientParsing);
 
             var tokens = new List<IToken>();
 
@@ -96,7 +99,7 @@ private bool TryTokenizeInternal(byte currentByte, IInputBytes inputBytes, bool
                 // Has enough key/values for each required key
                 if (useRequiredKeys && tokens.Count >= requiredKeys.Count * 2)
                 {
-                    var proposedDictionary = ConvertToDictionary(tokens);
+                    var proposedDictionary = ConvertToDictionary(tokens, useLenientParsing);
 
                     var isAcceptable = true;
                     foreach (var key in requiredKeys)
@@ -118,15 +121,14 @@ private bool TryTokenizeInternal(byte currentByte, IInputBytes inputBytes, bool
                 }
             }
 
-            var dictionary = ConvertToDictionary(tokens);
+            var dictionary = ConvertToDictionary(tokens, useLenientParsing);
 
             token = new DictionaryToken(dictionary);
 
             return true;
-
         }
 
-        private static Dictionary<NameToken, IToken> ConvertToDictionary(List<IToken> tokens)
+        private static Dictionary<NameToken, IToken> ConvertToDictionary(List<IToken> tokens, bool useLenientParsing)
        {
            var result = new Dictionary<NameToken, IToken>();
 
@@ -143,6 +145,13 @@ private static Dictionary<NameToken, IToken> ConvertToDictionary(List<IToken> to
                     continue;
                 }
 
+                if (useLenientParsing)
+                {
+                    // TODO - Log warning
+                    System.Diagnostics.Debug.WriteLine($"Expected name as dictionary key, instead got: " + token);
+                    continue;
+                }
+
                 throw new PdfDocumentFormatException($"Expected name as dictionary key, instead got: " + token);
             }

diff --git a/src/UglyToad.PdfPig.Tokenization/Scanner/CoreTokenScanner.cs b/src/UglyToad.PdfPig.Tokenization/Scanner/CoreTokenScanner.cs
index 5536d319b..05e9ca21e 100644
--- a/src/UglyToad.PdfPig.Tokenization/Scanner/CoreTokenScanner.cs
+++ b/src/UglyToad.PdfPig.Tokenization/Scanner/CoreTokenScanner.cs
@@ -27,7 +27,8 @@ public class CoreTokenScanner : ISeekableTokenScanner
         private readonly IInputBytes inputBytes;
         private readonly bool usePdfDocEncoding;
         private readonly List<(byte firstByte, ITokenizer tokenizer)> customTokenizers = new List<(byte, ITokenizer)>();
-
+        private readonly bool useLenientParsing;
+
         /// <summary>
         /// The offset in the input data at which the <see cref="CurrentToken"/> starts.
         /// </summary>
@@ -52,15 +53,17 @@ public CoreTokenScanner(
             IInputBytes inputBytes,
             bool usePdfDocEncoding,
             ScannerScope scope = ScannerScope.None,
-            IReadOnlyDictionary<NameToken, IReadOnlyList<NameToken>> namedDictionaryRequiredKeys = null)
+            IReadOnlyDictionary<NameToken, IReadOnlyList<NameToken>> namedDictionaryRequiredKeys = null,
+            bool useLenientParsing = false)
         {
             this.inputBytes = inputBytes ?? throw new ArgumentNullException(nameof(inputBytes));
             this.usePdfDocEncoding = usePdfDocEncoding;
             this.stringTokenizer = new StringTokenizer(usePdfDocEncoding);
             this.arrayTokenizer = new ArrayTokenizer(usePdfDocEncoding);
-            this.dictionaryTokenizer = new DictionaryTokenizer(usePdfDocEncoding);
+            this.dictionaryTokenizer = new DictionaryTokenizer(usePdfDocEncoding, useLenientParsing: useLenientParsing);
             this.scope = scope;
             this.namedDictionaryRequiredKeys = namedDictionaryRequiredKeys;
+            this.useLenientParsing = useLenientParsing;
         }
 
         ///
@@ -140,7 +143,7 @@ public bool MoveNext()
                         && CurrentToken is NameToken name
                         && namedDictionaryRequiredKeys.TryGetValue(name, out var requiredKeys))
                     {
-                        tokenizer = new DictionaryTokenizer(usePdfDocEncoding, requiredKeys);
+                        tokenizer = new DictionaryTokenizer(usePdfDocEncoding, requiredKeys, useLenientParsing);
                     }
                 }
                 else
diff --git a/src/UglyToad.PdfPig/Parser/PageContentParser.cs b/src/UglyToad.PdfPig/Parser/PageContentParser.cs
index e8822e4cb..f290e9d76 100644
--- a/src/UglyToad.PdfPig/Parser/PageContentParser.cs
+++ b/src/UglyToad.PdfPig/Parser/PageContentParser.cs
@@ -28,7 +28,7 @@ public IReadOnlyList<IGraphicsStateOperation> Parse(
             IInputBytes inputBytes,
             ILog log)
         {
-            var scanner = new CoreTokenScanner(inputBytes, false);
+            var scanner = new CoreTokenScanner(inputBytes, false, useLenientParsing: useLenientParsing);
 
             var precedingTokens = new List<IToken>();
             var graphicsStateOperations = new List<IGraphicsStateOperation>();
diff --git a/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs b/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs
index 806fd80e6..348bda8ea 100644
--- a/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs
+++ b/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs
@@ -75,7 +75,7 @@ private static PdfDocument Open(IInputBytes inputBytes, ParsingOptions options =
                 SkipMissingFonts = false
             };
 
-            var tokenScanner = new CoreTokenScanner(inputBytes, true);
+            var tokenScanner = new CoreTokenScanner(inputBytes, true, useLenientParsing: options.UseLenientParsing);
 
             var passwords = new List<string>();
 
@@ -115,7 +115,7 @@ private static PdfDocument OpenDocument(
             // We're ok with this since our intent is to lazily load the cross reference table.
             // ReSharper disable once AccessToModifiedClosure
             var locationProvider = new ObjectLocationProvider(() => crossReferenceTable, inputBytes);
-            var pdfScanner = new PdfTokenScanner(inputBytes, locationProvider, filterProvider, NoOpEncryptionHandler.Instance);
+            var pdfScanner = new PdfTokenScanner(inputBytes, locationProvider, filterProvider, NoOpEncryptionHandler.Instance, parsingOptions);
 
             var crossReferenceStreamParser = new CrossReferenceStreamParser(filterProvider);
             var crossReferenceParser = new CrossReferenceParser(parsingOptions.Logger, xrefValidator, crossReferenceStreamParser);
diff --git a/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs b/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs
index 622be80a3..4e25fe4b9 100644
--- a/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs
+++ b/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs
@@ -25,6 +25,7 @@ internal class PdfTokenScanner : IPdfTokenScanner
         private readonly IObjectLocationProvider objectLocationProvider;
         private readonly ILookupFilterProvider filterProvider;
         private readonly CoreTokenScanner coreTokenScanner;
+        private readonly ParsingOptions parsingOptions;
 
         private IEncryptionHandler encryptionHandler;
         private bool isDisposed;
@@ -52,13 +53,14 @@ internal class PdfTokenScanner : IPdfTokenScanner
         public long Length => coreTokenScanner.Length;
 
         public PdfTokenScanner(IInputBytes inputBytes, IObjectLocationProvider objectLocationProvider, ILookupFilterProvider filterProvider,
-            IEncryptionHandler encryptionHandler)
+            IEncryptionHandler encryptionHandler, ParsingOptions parsingOptions)
         {
             this.inputBytes = inputBytes;
             this.objectLocationProvider = objectLocationProvider;
             this.filterProvider = filterProvider;
             this.encryptionHandler = encryptionHandler;
-            coreTokenScanner = new CoreTokenScanner(inputBytes, true);
+            this.parsingOptions = parsingOptions;
+            coreTokenScanner = new CoreTokenScanner(inputBytes, true, useLenientParsing: parsingOptions.UseLenientParsing);
         }
 
         public void UpdateEncryptionHandler(IEncryptionHandler newHandler)
@@ -815,7 +817,7 @@ private IReadOnlyList<ObjectToken> ParseObjectStream(StreamToken stream, long of
             // Read the N integers
             var bytes = new ByteArrayInputBytes(stream.Decode(filterProvider, this));
 
-            var scanner = new CoreTokenScanner(bytes, true);
+            var scanner = new CoreTokenScanner(bytes, true, useLenientParsing: parsingOptions.UseLenientParsing);
 
             var objects = new List<Tuple<long, long>>();
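
Usage sketch (not part of the patch itself): a minimal example of what threading ParsingOptions through the token scanners means for callers, mirroring the CanReadInvalidDocumentInformation test above. The file path, class name and Main method are hypothetical; PdfDocument.Open, ParsingOptions.LenientParsingOff, document.Information and PdfDocumentFormatException are the APIs exercised by the diff.

    using System;
    using UglyToad.PdfPig;
    using UglyToad.PdfPig.Core;

    internal static class LenientParsingExample
    {
        public static void Main()
        {
            // Hypothetical path to a document whose Info dictionary contains a stray
            // non-name token between key/value pairs.
            const string path = "invalid-pdf-structure-pdfminer-entire-doc.pdf";

            // Default options are lenient: the offending dictionary key is skipped
            // and the remaining metadata stays readable.
            using (var document = PdfDocument.Open(path))
            {
                Console.WriteLine(document.Information.Producer);
            }

            // With lenient parsing off the malformed dictionary surfaces as an exception.
            try
            {
                using (PdfDocument.Open(path, ParsingOptions.LenientParsingOff))
                {
                }
            }
            catch (PdfDocumentFormatException ex)
            {
                Console.WriteLine(ex.Message);
            }
        }
    }

With the default (lenient) options DictionaryTokenizer now skips a non-name key and continues; with LenientParsingOff it keeps the previous behaviour of throwing PdfDocumentFormatException.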