Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Simplify TryReadStream and fix an invalid stream Length cutting off streams #837

Closed
wants to merge 6 commits into from
190 changes: 85 additions & 105 deletions src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
using System.Globalization;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using Core;
using Encryption;
Expand Down Expand Up @@ -320,7 +321,7 @@ private bool TryReadStream(long startStreamTokenOffset, bool getLength, [NotNull
int endStreamPosition = 0;
int commonPartPosition = 0;

const string commonPart = "end";
const string endWordPart = "end";
const string streamPart = "stream";
const string objPart = "obj";

Expand All @@ -330,150 +331,129 @@ private bool TryReadStream(long startStreamTokenOffset, bool getLength, [NotNull
return true;
}

// Track any 'endobj' or 'endstream' operators we see.
var observedEndLocations = new List<PossibleStreamEndLocation>();
long streamDataStart = inputBytes.CurrentOffset;

// Begin reading the stream.
using (var memoryStream = new MemoryStream())
using (var binaryWrite = new BinaryWriter(memoryStream))
PossibleStreamEndLocation? possibleEndLocation = null;

while (inputBytes.MoveNext())
{
while (inputBytes.MoveNext())
if (length.HasValue && read == length)
{
if (length.HasValue && read == length)
{
// TODO: read ahead and check we're at the end...
// break;
}
// TODO: read ahead and check we're at the end...
// break;
}

// We are reading 'end' (possibly).
if (commonPartPosition < commonPart.Length && inputBytes.CurrentByte == commonPart[commonPartPosition])
{
commonPartPosition++;
}
else if (commonPartPosition == commonPart.Length)
// We are reading 'end' (possibly).
if (commonPartPosition < endWordPart.Length && inputBytes.CurrentByte == endWordPart[commonPartPosition])
{
commonPartPosition++;
}
else if (commonPartPosition == endWordPart.Length)
{
// We are reading 'stream' after 'end'
if (inputBytes.CurrentByte == streamPart[endStreamPosition])
{
// We are reading 'stream' after 'end'
if (inputBytes.CurrentByte == streamPart[endStreamPosition])
{
endObjPosition = 0;
endStreamPosition++;

// We've finished reading 'endstream', add it to the end tokens we've seen.
if (endStreamPosition == streamPart.Length && (!inputBytes.MoveNext() || ReadHelper.IsWhitespace(inputBytes.CurrentByte)))
{
var token = new PossibleStreamEndLocation(inputBytes.CurrentOffset - OperatorToken.EndStream.Data.Length, OperatorToken.EndStream);
endObjPosition = 0;
endStreamPosition++;

observedEndLocations.Add(token);
// We've finished reading 'endstream', add it to the end tokens we've seen.
if (endStreamPosition == streamPart.Length && (!inputBytes.MoveNext() || ReadHelper.IsWhitespace(inputBytes.CurrentByte)))
{
var token = new PossibleStreamEndLocation(inputBytes.CurrentOffset - OperatorToken.EndStream.Data.Length, OperatorToken.EndStream);

if (length.HasValue && read > length)
{
break;
}
possibleEndLocation = token;
//observedEndLocations.Add(token);

endStreamPosition = 0;
if (length.HasValue && read > length)
{
break;
}
}
else if (inputBytes.CurrentByte == objPart[endObjPosition])
{
// We are reading 'obj' after 'end'

endStreamPosition = 0;
endObjPosition++;
}
}
else if (inputBytes.CurrentByte == objPart[endObjPosition])
{
// We are reading 'obj' after 'end'

endStreamPosition = 0;
endObjPosition++;

// We have finished reading 'endobj'.
if (endObjPosition == objPart.Length)
// We have finished reading 'endobj'.
if (endObjPosition == objPart.Length)
{
// If we saw an 'endstream' or 'endobj' previously we've definitely hit the end now.
if (possibleEndLocation != null)
{
// If we saw an 'endstream' or 'endobj' previously we've definitely hit the end now.
if (observedEndLocations.Count > 0)
{
var lastEndToken = observedEndLocations[observedEndLocations.Count - 1];
var lastEndToken = possibleEndLocation.Value; //observedEndLocations[observedEndLocations.Count - 1];

inputBytes.Seek(lastEndToken.Offset + lastEndToken.Type.Data.Length + 1);
inputBytes.Seek(lastEndToken.Offset + lastEndToken.Type.Data.Length + 1);
//streamDataEnd = lastEndToken.Offset + lastEndToken.Type.Data.Length + 1;

break;
}
break;
}

var token = new PossibleStreamEndLocation(inputBytes.CurrentOffset - OperatorToken.EndObject.Data.Length, OperatorToken.EndObject);
observedEndLocations.Add(token);
var token = new PossibleStreamEndLocation(inputBytes.CurrentOffset - OperatorToken.EndObject.Data.Length, OperatorToken.EndObject);
possibleEndLocation = token;

if (read > length)
{
break;
}
if (read > length)
{
break;
}
}
else
{
// We were reading 'end' but then we had a character mismatch.
// Reset all the counters.

endStreamPosition = 0;
endObjPosition = 0;
commonPartPosition = 0;
}
}
else
{
// For safety reset every counter in case we had a partial read.
// We were reading 'end' but then we had a character mismatch.
// Reset all the counters.

endStreamPosition = 0;
endObjPosition = 0;
commonPartPosition = (inputBytes.CurrentByte == commonPart[0]) ? 1 : 0;
commonPartPosition = 0;
}

binaryWrite.Write(inputBytes.CurrentByte);

read++;
}

binaryWrite.Flush();

if (observedEndLocations.Count == 0)
else
{
return false;
// For safety reset every counter in case we had a partial read.

endStreamPosition = 0;
endObjPosition = 0;
commonPartPosition = (inputBytes.CurrentByte == endWordPart[0]) ? 1 : 0;
}

memoryStream.Seek(0, SeekOrigin.Begin);
if (length.HasValue && memoryStream.Length >= length)
{
// Use the declared length to copy just the data we want.
byte[] data = new byte[length.Value];
read++;
}

memoryStream.Read(data, 0, (int)length.Value);
long streamDataEnd = inputBytes.CurrentOffset + 1;

stream = new StreamToken(streamDictionaryToken, data);
}
else
{
// Work out where '\r\nendobj' or '\r\nendstream' occurs and read everything up to that.
var lastEnd = observedEndLocations[observedEndLocations.Count - 1];
if (possibleEndLocation == null)
return false;

var dataLength = lastEnd.Offset - startDataOffset;
var lastEnd = possibleEndLocation;

var current = inputBytes.CurrentOffset;
var dataLength = lastEnd.Value.Offset - startDataOffset;

// 3 characters, 'e', '\n' and possibly '\r'
inputBytes.Seek(lastEnd.Offset - 3);
inputBytes.MoveNext();
// 3 characters, 'e', '\n' and possibly '\r'
inputBytes.Seek(lastEnd.Value.Offset - 3);
inputBytes.MoveNext();

if (inputBytes.CurrentByte == '\r')
{
dataLength -= 3;
}
else
{
dataLength -= 2;
}
if (inputBytes.CurrentByte == '\r')
{
dataLength -= 3;
}
else
{
dataLength -= 2;
}

inputBytes.Seek(current);
Span<byte> data = new byte[dataLength];

byte[] data = new byte[dataLength];
inputBytes.Seek(streamDataStart);
inputBytes.Read(data);

memoryStream.Read(data, 0, (int)dataLength);
inputBytes.Seek(streamDataEnd);

stream = new StreamToken(streamDictionaryToken, data);
}
}
stream = new StreamToken(streamDictionaryToken, data.ToArray());

return true;
}
Expand Down