UglyToad · sbruyere · May 21, 2024 · May 21, 2024 · May 21, 2024 · May 21, 2024
diff --git a/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs b/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs
@@ -7,6 +7,7 @@
     using System.Globalization;
     using System.IO;
     using System.Linq;
+    using System.Text;
     using System.Text.RegularExpressions;
     using Core;
     using Encryption;
@@ -320,7 +321,7 @@ private bool TryReadStream(long startStreamTokenOffset, bool getLength, [NotNull
             int endStreamPosition = 0;
             int commonPartPosition = 0;
 
-            const string commonPart = "end";
+            const string endWordPart = "end";
             const string streamPart = "stream";
             const string objPart = "obj";
 
@@ -330,150 +331,129 @@ private bool TryReadStream(long startStreamTokenOffset, bool getLength, [NotNull
                 return true;
             }
 
-            // Track any 'endobj' or 'endstream' operators we see.
-            var observedEndLocations = new List<PossibleStreamEndLocation>();
+            long streamDataStart = inputBytes.CurrentOffset;
 
-            // Begin reading the stream.
-            using (var memoryStream = new MemoryStream())
-            using (var binaryWrite = new BinaryWriter(memoryStream))
+            PossibleStreamEndLocation? possibleEndLocation = null;
+
+            while (inputBytes.MoveNext())
             {
-                while (inputBytes.MoveNext())
+                if (length.HasValue && read == length)
                 {
-                    if (length.HasValue && read == length)
-                    {
-                        // TODO: read ahead and check we're at the end...
-                        // break;
-                    }
+                    // TODO: read ahead and check we're at the end...
+                    // break;
+                }
 
-                    // We are reading 'end' (possibly).
-                    if (commonPartPosition < commonPart.Length && inputBytes.CurrentByte == commonPart[commonPartPosition])
-                    {
-                        commonPartPosition++;
-                    }
-                    else if (commonPartPosition == commonPart.Length)
+                // We are reading 'end' (possibly).
+                if (commonPartPosition < endWordPart.Length && inputBytes.CurrentByte == endWordPart[commonPartPosition])
+                {
+                    commonPartPosition++;
+                }
+                else if (commonPartPosition == endWordPart.Length)
+                {
+                    // We are reading 'stream' after 'end'
+                    if (inputBytes.CurrentByte == streamPart[endStreamPosition])
                     {
-                        // We are reading 'stream' after 'end'
-                        if (inputBytes.CurrentByte == streamPart[endStreamPosition])
-                        {
-                            endObjPosition = 0;
-                            endStreamPosition++;
-
-                            // We've finished reading 'endstream', add it to the end tokens we've seen.
-                            if (endStreamPosition == streamPart.Length && (!inputBytes.MoveNext() || ReadHelper.IsWhitespace(inputBytes.CurrentByte)))
-                            {
-                                var token = new PossibleStreamEndLocation(inputBytes.CurrentOffset - OperatorToken.EndStream.Data.Length, OperatorToken.EndStream);
+                        endObjPosition = 0;
+                        endStreamPosition++;
 
-                                observedEndLocations.Add(token);
+                        // We've finished reading 'endstream', remember it as the most recent possible end location.
+                        if (endStreamPosition == streamPart.Length && (!inputBytes.MoveNext() || ReadHelper.IsWhitespace(inputBytes.CurrentByte)))
+                        {
+                            var token = new PossibleStreamEndLocation(inputBytes.CurrentOffset - OperatorToken.EndStream.Data.Length, OperatorToken.EndStream);
 
-                                if (length.HasValue && read > length)
-                                {
-                                    break;
-                                }
+                            possibleEndLocation = token;
+                            //observedEndLocations.Add(token);
 
-                                endStreamPosition = 0;
+                            if (length.HasValue && read > length)
+                            {
+                                break;
                             }
-                        }
-                        else if (inputBytes.CurrentByte == objPart[endObjPosition])
-                        {
-                            // We are reading 'obj' after 'end'
 
                             endStreamPosition = 0;
-                            endObjPosition++;
+                        }
+                    }
+                    else if (inputBytes.CurrentByte == objPart[endObjPosition])
+                    {
+                        // We are reading 'obj' after 'end'
+
+                        endStreamPosition = 0;
+                        endObjPosition++;
 
-                            // We have finished reading 'endobj'.
-                            if (endObjPosition == objPart.Length)
+                        // We have finished reading 'endobj'.
+                        if (endObjPosition == objPart.Length)
+                        {
+                            // If we saw an 'endstream' or 'endobj' previously we've definitely hit the end now.
+                            if (possibleEndLocation != null)
                             {
-                                // If we saw an 'endstream' or 'endobj' previously we've definitely hit the end now.
-                                if (observedEndLocations.Count > 0)
-                                {
-                                    var lastEndToken = observedEndLocations[observedEndLocations.Count - 1];
+                                var lastEndToken = possibleEndLocation.Value; //observedEndLocations[observedEndLocations.Count - 1];
 
-                                    inputBytes.Seek(lastEndToken.Offset + lastEndToken.Type.Data.Length + 1);
+                                inputBytes.Seek(lastEndToken.Offset + lastEndToken.Type.Data.Length + 1);
+                                //streamDataEnd = lastEndToken.Offset + lastEndToken.Type.Data.Length + 1;
 
-                                    break;
-                                }
+                                break;
+                            }
 
-                                var token = new PossibleStreamEndLocation(inputBytes.CurrentOffset - OperatorToken.EndObject.Data.Length, OperatorToken.EndObject);
-                                observedEndLocations.Add(token);
+                            var token = new PossibleStreamEndLocation(inputBytes.CurrentOffset - OperatorToken.EndObject.Data.Length, OperatorToken.EndObject);
+                            possibleEndLocation = token;
 
-                                if (read > length)
-                                {
-                                    break;
-                                }
+                            if (read > length)
+                            {
+                                break;
                             }
                         }
-                        else
-                        {
-                            // We were reading 'end' but then we had a character mismatch.
-                            // Reset all the counters.
-
-                            endStreamPosition = 0;
-                            endObjPosition = 0;
-                            commonPartPosition = 0;
-                        }
                     }
                     else
                     {
-                        // For safety reset every counter in case we had a partial read.
+                        // We were reading 'end' but then we had a character mismatch.
+                        // Reset all the counters.
 
                         endStreamPosition = 0;
                         endObjPosition = 0;
-                        commonPartPosition = (inputBytes.CurrentByte == commonPart[0]) ? 1 : 0;
+                        commonPartPosition = 0;
                     }
-
-                    binaryWrite.Write(inputBytes.CurrentByte);
-
-                    read++;
                 }
-
-                binaryWrite.Flush();
-
-                if (observedEndLocations.Count == 0)
+                else
                 {
-                    return false;
+                    // For safety reset every counter in case we had a partial read.
+
+                    endStreamPosition = 0;
+                    endObjPosition = 0;
+                    commonPartPosition = (inputBytes.CurrentByte == endWordPart[0]) ? 1 : 0;
                 }
 
-                memoryStream.Seek(0, SeekOrigin.Begin);
-                if (length.HasValue && memoryStream.Length >= length)
-                {
-                    // Use the declared length to copy just the data we want.
-                    byte[] data = new byte[length.Value];
+                read++;
+            }
 
-                    memoryStream.Read(data, 0, (int)length.Value);
+            long streamDataEnd = inputBytes.CurrentOffset + 1;
 
-                    stream = new StreamToken(streamDictionaryToken, data);
-                }
-                else
-                {
-                    // Work out where '\r\nendobj' or '\r\nendstream' occurs and read everything up to that.
-                    var lastEnd = observedEndLocations[observedEndLocations.Count - 1];
+            if (possibleEndLocation == null)
+                return false;
 
-                    var dataLength = lastEnd.Offset - startDataOffset;
+            var lastEnd = possibleEndLocation;
 
-                    var current = inputBytes.CurrentOffset;
+            var dataLength = lastEnd.Value.Offset - startDataOffset;
 
-                    // 3 characters, 'e', '\n' and possibly '\r'
-                    inputBytes.Seek(lastEnd.Offset - 3);
-                    inputBytes.MoveNext();
+            // 3 characters, 'e', '\n' and possibly '\r'
+            inputBytes.Seek(lastEnd.Value.Offset - 3);
+            inputBytes.MoveNext();
 
-                    if (inputBytes.CurrentByte == '\r')
-                    {
-                        dataLength -= 3;
-                    }
-                    else
-                    {
-                        dataLength -= 2;
-                    }
+            if (inputBytes.CurrentByte == '\r')
+            {
+                dataLength -= 3;
+            }
+            else
+            {
+                dataLength -= 2;
+            }
 
-                    inputBytes.Seek(current);
+            Span<byte> data = new byte[dataLength];
 
-                    byte[] data = new byte[dataLength];
+            inputBytes.Seek(streamDataStart);
+            inputBytes.Read(data);
 
-                    memoryStream.Read(data, 0, (int)dataLength);
+            inputBytes.Seek(streamDataEnd);
 
-                    stream = new StreamToken(streamDictionaryToken, data);
-                }
-            }
+            stream = new StreamToken(streamDictionaryToken, data.ToArray());
 
             return true;
         }