Skip to content

Commit

Permalink
Fixes #478 (/Index problem) (#479)
Browse files Browse the repository at this point in the history
* Add files via upload

Fixing problem of incomplete analysis of the /Index entry.

* Delete RawDataParser.php

Wrong subdirectory.

* Add files via upload

Fix problem of incomplete analysis of the /Index entry.

* Update RawDataParser.php

optical changes

* Update RawDataParser.php

optical changes

* Update RawDataParser.php

optical changes

* Add files via upload

After adding a description to the file, the valid /Index entry now contains two sub-entries (each consisting of 2 values: first object number, number of objects):
/Index[2 1 21 2]

* Update RawDataParserTest.php

Adding test for issue 479

* Update RawDataParserTest.php

Forgot a {

* Update RawDataParser.php

Code style update

* Update RawDataParserTest.php

Added more description and more checks.

* Update PageTest.php

Issue #331 is fixed by issue #479: test updated

* Update RawDataParserTest.php

optical fix

* Update PageTest.php

optical changes

* Update RawDataParser.php

change to remove the native_function_invocation message

* Update tests/Integration/PageTest.php

Co-authored-by: Konrad Abicht <hi@inspirito.de>

* Update RawDataParser.php

Added comments...

* Update RawDataParser.php

Changes for CS fixer

* Update PageTest.php

Comment update

* Update tests/Integration/PageTest.php

Co-authored-by: Konrad Abicht <hi@inspirito.de>

Co-authored-by: Konrad Abicht <hi@inspirito.de>
  • Loading branch information
yasheena and k00ni authored Nov 22, 2021
1 parent a216ccd commit 768d1d6
Show file tree
Hide file tree
Showing 4 changed files with 65 additions and 10 deletions.
Binary file added samples/bugs/Issue479.pdf
Binary file not shown.
27 changes: 23 additions & 4 deletions src/Smalot/PdfParser/RawData/RawDataParser.php
Original file line number Diff line number Diff line change
Expand Up @@ -269,8 +269,11 @@ protected function decodeXrefStream(string $pdfData, int $startxref, array $xref
) {
$valid_crs = true;
} elseif (('/' == $v[0]) && ('Index' == $v[1]) && (isset($sarr[($k + 1)]))) {
// first object number in the subsection
$index_first = (int) ($sarr[($k + 1)][1][0][1]);
// initialize list for: first object number in the subsection / number of objects
$index_blocks = [];
for ($m = 0; $m < \count($sarr[($k + 1)][1]); $m += 2) {
$index_blocks[] = [$sarr[($k + 1)][1][$m][1], $sarr[($k + 1)][1][$m + 1][1]];
}
} elseif (('/' == $v[0]) && ('Prev' == $v[1]) && (isset($sarr[($k + 1)]) && ('numeric' == $sarr[($k + 1)][0]))) {
// get previous xref offset
$prevxref = (int) ($sarr[($k + 1)][1]);
Expand Down Expand Up @@ -432,8 +435,9 @@ protected function decodeXrefStream(string $pdfData, int $startxref, array $xref
}

// fill xref
if (isset($index_first)) {
$obj_num = $index_first;
if (isset($index_blocks)) {
// load the first object number of the first /Index entry
$obj_num = $index_blocks[0][0];
} else {
$obj_num = 0;
}
Expand Down Expand Up @@ -463,6 +467,21 @@ protected function decodeXrefStream(string $pdfData, int $startxref, array $xref
break;
}
++$obj_num;
if (isset($index_blocks)) {
// reduce the number of remaining objects
--$index_blocks[0][1];
if (0 == $index_blocks[0][1]) {
// remove the /Index entry that has just been used up
array_shift($index_blocks);
if (0 < \count($index_blocks)) {
// load the first object number of the following /Index entry
$obj_num = $index_blocks[0][0];
} else {
// if there are no more entries, remove $index_blocks to avoid actions on an empty array
unset($index_blocks);
}
}
}
}
} // end decoding data
if (isset($prevxref)) {
Expand Down
17 changes: 11 additions & 6 deletions tests/Integration/PageTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -486,12 +486,17 @@ public function testGetPages()
$document = $this->getParserInstance()->parseFile($filename);
$pages = $document->getPages();

// This should actually be 3 pages, but as long as the cause for issue #331
// has not been found and the issue is not fixed, we'll settle for 2 here.
// We still test for the count, so in case the bug should be fixed
// unknowingly, we don't forget to resolve the issue as well and make sure
// this assertion is present.
$this->assertCount(2, $pages);
/*
* Issue #331 is fixed by the pull request for issue #479.
* The original Issue331.pdf was modified: in the updated (current) version a
* new xref was added, and the valid /Index now has the following value:
* [1 1 3 1 7 1 175 1 178 1 219 2]
* This means that there are 6 pairs, each containing the values for 'first object id'
* and 'number of objects'. Until now only the first pair was used, so the
* objects of all following pairs got a wrong id.
* With the fix for issue #479 the expected number of pages is now counted.
*/
$this->assertCount(3, $pages);

foreach ($pages as $page) {
$this->assertTrue($page instanceof Page);
Expand Down
31 changes: 31 additions & 0 deletions tests/Integration/RawData/RawDataParserTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -119,4 +119,35 @@ public function testDecodeObjectHeaderIssue405()

$this->assertStringContainsString('Bug fix: PR #405', $pages[0]->getText());
}

/**
 * Regression test for decodeXrefStream and multi-entry /Index arrays.
 *
 * A PDF can carry more than one entry in its /Index area (for example after the
 * document description was changed). Without the fix only the first entry is
 * used, so the 'Info' object is stored under a wrong object id in the xref
 * structure. The id referenced by /Info is then invalid, Document::buildDetails()
 * cannot load the info object, and getDetails() returns only the 'Pages' key
 * while 'Author', 'Creator', 'Title', 'Subject' etc. are missing.
 *
 * @see https://github.com/smalot/pdfparser/pull/479
 */
public function testDecodeXrefStreamIssue479()
{
    $pdfPath = $this->rootDir.'/samples/bugs/Issue479.pdf';

    $details = $this->getParserInstance()
        ->parseFile($pdfPath)
        ->getDetails();

    // Every one of these keys originates from the 'Info' object, so each must be present.
    $infoKeys = [
        'Author',
        'CreationDate',
        'Creator',
        'ModDate',
        'Producer',
        'Subject',
        'Title',
    ];

    foreach ($infoKeys as $infoKey) {
        $this->assertArrayHasKey($infoKey, $details);
    }
}
}

0 comments on commit 768d1d6

Please sign in to comment.