Improve TryReadStream with simplification & fix of Stream Invalid Len…

…gth cutting off Streams (#838) * Improve TryReadStream with simplification & fix of Stream Invalid Length cutting off Streams - Fix of Stream invalid Length issue causing stream data to be cut off: fix #809 - Improve Stream Token read performance by: - simplifying TryReadStream(), avoiding use of MemoryStream, with the benefit of the already existing Memory Span of "inputBytes" - removing the unnecessary List<> * Add Stream with Invalid Length unit test * Use of Memory<> instead of direct Span to avoid byte array allocation .ToArray. Suggestion from (https://github.com/UglyToad/PdfPig/pull/838/files/4153e4a1b421aee6158799175ced081c9f533a13#r1619509165)
UglyToad · May 31, 2024 · 65a18b2 · 65a18b2
1 parent d7e434e
commit 65a18b2
Show file tree

Hide file tree

Showing 2 changed files with 123 additions and 105 deletions.
diff --git a/src/UglyToad.PdfPig.Tests/Tokenization/Scanner/PdfTokenScannerTests.cs b/src/UglyToad.PdfPig.Tests/Tokenization/Scanner/PdfTokenScannerTests.cs
@@ -256,6 +256,44 @@ 353 0 obj
             Assert.Equal(2, locationProvider.Offsets[new IndirectReference(352, 0)]);
         }
 
+        [Fact]
+        public void ReadsStreamObjectWithInvalidLength()
+        {
+            string invalidLengthStream = "ABCD" + new string('e', 3996);
+            // The content above is 4000 characters, while object 353 below declares a /Length of only 1479.
+            string s = $@"
+352 0 obj
+<< /S 1273 /Filter /FlateDecode /Length 353 0 R >> 
+stream
+{invalidLengthStream}
+endstream
+endobj
+353 0 obj
+1479
+endobj";
+
+            var locationProvider = new TestObjectLocationProvider();
+            // Mark location of "353 0 obj"
+            locationProvider.Offsets[new IndirectReference(353, 0)] = 1643;
+
+            var scanner = GetScanner(s, locationProvider);
+
+            var tokens = ReadToEnd(scanner);
+
+            Assert.Equal(2, tokens.Count);
+
+            var stream = Assert.IsType<StreamToken>(tokens[0].Data);
+
+            var data = stream.Data.ToArray();
+
+            var str = Encoding.UTF8.GetString(data);
+            // The stream data must have the full content length, not the shorter declared /Length.
+            Assert.Equal(invalidLengthStream.Length, data.Length);
+            Assert.StartsWith("ABCDeeeee", str);
+
+            Assert.Equal(2, locationProvider.Offsets[new IndirectReference(352, 0)]);
+        }
+
         [Fact]
         public void ReadsSimpleStreamObject()
         {

diff --git a/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs b/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs
@@ -7,6 +7,7 @@
     using System.Globalization;
     using System.IO;
     using System.Linq;
+    using System.Text;
     using System.Text.RegularExpressions;
     using Core;
     using Encryption;
@@ -320,7 +321,7 @@ private bool TryReadStream(long startStreamTokenOffset, bool getLength, [NotNull
             int endStreamPosition = 0;
             int commonPartPosition = 0;
 
-            const string commonPart = "end";
+            const string endWordPart = "end";
             const string streamPart = "stream";
             const string objPart = "obj";
 
@@ -330,150 +331,129 @@ private bool TryReadStream(long startStreamTokenOffset, bool getLength, [NotNull
                 return true;
             }
 
-            // Track any 'endobj' or 'endstream' operators we see.
-            var observedEndLocations = new List<PossibleStreamEndLocation>();
+            long streamDataStart = inputBytes.CurrentOffset;
 
-            // Begin reading the stream.
-            using (var memoryStream = new MemoryStream())
-            using (var binaryWrite = new BinaryWriter(memoryStream))
+            PossibleStreamEndLocation? possibleEndLocation = null;
+
+
+            while (inputBytes.MoveNext())
             {
-                while (inputBytes.MoveNext())
+                if (length.HasValue && read == length)
                 {
-                    if (length.HasValue && read == length)
-                    {
-                        // TODO: read ahead and check we're at the end...
-                        // break;
-                    }
+                    // TODO: read ahead and check we're at the end...
+                    // break;
+                }
 
-                    // We are reading 'end' (possibly).
-                    if (commonPartPosition < commonPart.Length && inputBytes.CurrentByte == commonPart[commonPartPosition])
-                    {
-                        commonPartPosition++;
-                    }
-                    else if (commonPartPosition == commonPart.Length)
+                // We are reading 'end' (possibly).
+                if (commonPartPosition < endWordPart.Length && inputBytes.CurrentByte == endWordPart[commonPartPosition])
+                {
+                    commonPartPosition++;
+                }
+                else if (commonPartPosition == endWordPart.Length)
+                {
+                    // We are reading 'stream' after 'end'
+                    if (inputBytes.CurrentByte == streamPart[endStreamPosition])
                     {
-                        // We are reading 'stream' after 'end'
-                        if (inputBytes.CurrentByte == streamPart[endStreamPosition])
-                        {
-                            endObjPosition = 0;
-                            endStreamPosition++;
-
-                            // We've finished reading 'endstream', add it to the end tokens we've seen.
-                            if (endStreamPosition == streamPart.Length && (!inputBytes.MoveNext() || ReadHelper.IsWhitespace(inputBytes.CurrentByte)))
-                            {
-                                var token = new PossibleStreamEndLocation(inputBytes.CurrentOffset - OperatorToken.EndStream.Data.Length, OperatorToken.EndStream);
+                        endObjPosition = 0;
+                        endStreamPosition++;
 
-                                observedEndLocations.Add(token);
+                        // We've finished reading 'endstream', add it to the end tokens we've seen.
+                        if (endStreamPosition == streamPart.Length && (!inputBytes.MoveNext() || ReadHelper.IsWhitespace(inputBytes.CurrentByte)))
+                        {
+                            var token = new PossibleStreamEndLocation(inputBytes.CurrentOffset - OperatorToken.EndStream.Data.Length, OperatorToken.EndStream);
 
-                                if (length.HasValue && read > length)
-                                {
-                                    break;
-                                }
+                            possibleEndLocation = token;
 
-                                endStreamPosition = 0;
+                            if (length.HasValue && read > length)
+                            {
+                                break;
                             }
-                        }
-                        else if (inputBytes.CurrentByte == objPart[endObjPosition])
-                        {
-                            // We are reading 'obj' after 'end'
 
                             endStreamPosition = 0;
-                            endObjPosition++;
+                        }
+                    }
+                    else if (inputBytes.CurrentByte == objPart[endObjPosition])
+                    {
+                        // We are reading 'obj' after 'end'
+
+                        endStreamPosition = 0;
+                        endObjPosition++;
 
-                            // We have finished reading 'endobj'.
-                            if (endObjPosition == objPart.Length)
+                        // We have finished reading 'endobj'.
+                        if (endObjPosition == objPart.Length)
+                        {
+                            // If we saw an 'endstream' or 'endobj' previously we've definitely hit the end now.
+                            if (possibleEndLocation != null)
                             {
-                                // If we saw an 'endstream' or 'endobj' previously we've definitely hit the end now.
-                                if (observedEndLocations.Count > 0)
-                                {
-                                    var lastEndToken = observedEndLocations[observedEndLocations.Count - 1];
+                                var lastEndToken = possibleEndLocation.Value;
 
-                                    inputBytes.Seek(lastEndToken.Offset + lastEndToken.Type.Data.Length + 1);
+                                inputBytes.Seek(lastEndToken.Offset + lastEndToken.Type.Data.Length + 1);
 
-                                    break;
-                                }
+                                break;
+                            }
 
-                                var token = new PossibleStreamEndLocation(inputBytes.CurrentOffset - OperatorToken.EndObject.Data.Length, OperatorToken.EndObject);
-                                observedEndLocations.Add(token);
+                            var token = new PossibleStreamEndLocation(inputBytes.CurrentOffset - OperatorToken.EndObject.Data.Length, OperatorToken.EndObject);
 
-                                if (read > length)
-                                {
-                                    break;
-                                }
-                            }
-                        }
-                        else
-                        {
-                            // We were reading 'end' but then we had a character mismatch.
-                            // Reset all the counters.
+                            possibleEndLocation = token;
 
-                            endStreamPosition = 0;
-                            endObjPosition = 0;
-                            commonPartPosition = 0;
+                            if (read > length)
+                            {
+                                break;
+                            }
                         }
                     }
                     else
                     {
-                        // For safety reset every counter in case we had a partial read.
+                        // We were reading 'end' but then we had a character mismatch.
+                        // Reset all the counters.
 
                         endStreamPosition = 0;
                         endObjPosition = 0;
-                        commonPartPosition = (inputBytes.CurrentByte == commonPart[0]) ? 1 : 0;
+                        commonPartPosition = 0;
                     }
-
-                    binaryWrite.Write(inputBytes.CurrentByte);
-
-                    read++;
                 }
-
-                binaryWrite.Flush();
-
-                if (observedEndLocations.Count == 0)
+                else
                 {
-                    return false;
+                    // For safety reset every counter in case we had a partial read.
+
+                    endStreamPosition = 0;
+                    endObjPosition = 0;
+                    commonPartPosition = (inputBytes.CurrentByte == endWordPart[0]) ? 1 : 0;
                 }
 
-                memoryStream.Seek(0, SeekOrigin.Begin);
-                if (length.HasValue && memoryStream.Length >= length)
-                {
-                    // Use the declared length to copy just the data we want.
-                    byte[] data = new byte[length.Value];
+                read++;
+            }
 
-                    memoryStream.Read(data, 0, (int)length.Value);
+            long streamDataEnd = inputBytes.CurrentOffset + 1;
 
-                    stream = new StreamToken(streamDictionaryToken, data);
-                }
-                else
-                {
-                    // Work out where '\r\nendobj' or '\r\nendstream' occurs and read everything up to that.
-                    var lastEnd = observedEndLocations[observedEndLocations.Count - 1];
+            if (possibleEndLocation == null)
+                return false;
 
-                    var dataLength = lastEnd.Offset - startDataOffset;
+            var lastEnd = possibleEndLocation;
 
-                    var current = inputBytes.CurrentOffset;
+            var dataLength = lastEnd.Value.Offset - startDataOffset;
 
-                    // 3 characters, 'e', '\n' and possibly '\r'
-                    inputBytes.Seek(lastEnd.Offset - 3);
-                    inputBytes.MoveNext();
+            // 3 characters, 'e', '\n' and possibly '\r'
+            inputBytes.Seek(lastEnd.Value.Offset - 3);
+            inputBytes.MoveNext();
 
-                    if (inputBytes.CurrentByte == '\r')
-                    {
-                        dataLength -= 3;
-                    }
-                    else
-                    {
-                        dataLength -= 2;
-                    }
+            if (inputBytes.CurrentByte == '\r')
+            {
+                dataLength -= 3;
+            }
+            else
+            {
+                dataLength -= 2;
+            }
 
-                    inputBytes.Seek(current);
+            Memory<byte> data = new byte[dataLength];
 
-                    byte[] data = new byte[dataLength];
+            inputBytes.Seek(streamDataStart);
+            inputBytes.Read(data.Span);
 
-                    memoryStream.Read(data, 0, (int)dataLength);
+            inputBytes.Seek(streamDataEnd);
 
-                    stream = new StreamToken(streamDictionaryToken, data);
-                }
-            }
+            stream = new StreamToken(streamDictionaryToken, data);
 
             return true;
         }