Skip to content

Commit

Permalink
hotfix for #359 and #360: fallback for glyphs not in the postscript l…
Browse files Browse the repository at this point in the history
…ookup table (#362)

* hotfix for #359 and #360: fallback for glyphs not in the PostScript lookup table

* test comment and assertion: actually just one character being decoded incorrectly

* added @todo keyword to test case comment so we can keep track of this

* moved comment before code as requested in the review

* fix code linting
  • Loading branch information
Connum authored Oct 29, 2020
1 parent 722061c commit dc1e422
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 1 deletion.
Binary file added samples/bugs/Issue359.pdf
Binary file not shown.
8 changes: 7 additions & 1 deletion src/Smalot/PdfParser/Encoding/PostScriptGlyphs.php
Original file line number Diff line number Diff line change
Expand Up @@ -1088,6 +1088,12 @@ public static function getGlyphs()

/**
 * Translate a glyph key into its code point using the table from getGlyphs().
 *
 * Table entries are treated as hexadecimal strings and converted with hexdec().
 * A key that is missing from the table is returned unchanged instead of being
 * indexed blindly, which would raise an "undefined index/offset" notice.
 *
 * @param int|string $glyph lookup key into the glyph table (presumably a
 *                          PostScript glyph name - confirm against callers)
 *
 * @return mixed the hexdec() result for a known glyph, otherwise $glyph itself
 */
public static function getCodePoint($glyph)
{
    $glyphsMap = static::getGlyphs();

    // Only convert entries that actually exist in the lookup table.
    if (isset($glyphsMap[$glyph])) {
        return hexdec($glyphsMap[$glyph]);
    }

    // Fallback: hand the unknown glyph back to the caller untouched.
    return $glyph;
}
}
23 changes: 23 additions & 0 deletions tests/Integration/ParserTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,29 @@ public function testIssue334()

$this->assertStringContainsString('This question already has an answer here', $document->getText());
}

/**
 * Regression test for issue 359: the attached PDF must parse without errors.
 * Glyphs missing from the PostScript lookup table used to trigger
 * "Notice: Undefined offset".
 *
 * @see https://github.com/smalot/pdfparser/issues/359
 */
public function testIssue359()
{
    $parsedPdf = $this->fixture->parseFile($this->rootDir.'/samples/bugs/Issue359.pdf');

    $this->assertStringContainsString('dnia 10 maja 2018 roku o ochronie danych osobowych', $parsedPdf->getText());
    $this->assertStringContainsString('sprawie ochrony osób fizycznych w związku', $parsedPdf->getText());

    /*
     * @todo The "ł" in "przepływu" currently comes out as a space character. That was
     *       already happening before the PR that introduced this issue, so it is not
     *       covered here. Once the decoding is fixed, enable the assertion below to
     *       cover it.
     */
    // $this->assertStringContainsString('sprawie swobodnego przepływu takich danych oraz uchylenia dyrektywy', $parsedPdf->getText());
}
}

class ParserSub extends Parser
Expand Down

0 comments on commit dc1e422

Please sign in to comment.