Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add font fallback + Support for font IDs containing hyphens #614

Merged
merged 10 commits into from
Jul 31, 2023
Binary file added samples/FontIDHyphen.pdf
k00ni marked this conversation as resolved.
Show resolved Hide resolved
Binary file not shown.
Binary file added samples/ImproperFontFallback.pdf
GreyWyvern marked this conversation as resolved.
Show resolved Hide resolved
Binary file not shown.
2 changes: 1 addition & 1 deletion src/Smalot/PdfParser/Encoding/PDFDocEncoding.php
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ public static function getCodePage(): array
"\xfc" => "\u{00fc}", // udieresis
"\xfd" => "\u{00fd}", // yacute
"\xfe" => "\u{00fe}", // thorn
"\xff" => "\u{00ff}", // ydieresis
"\xff" => "\u{00ff}", // ydieresis
];
}

Expand Down
48 changes: 42 additions & 6 deletions src/Smalot/PdfParser/PDFObject.php
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,36 @@ private function getDefaultFont(Page $page = null): Font
return new Font($this->document, null, null, $this->config);
}

/**
 * Decodes a Tj/TJ command with the given font and, if the result looks
 * wrong, retries with the other fonts registered on the page.
 *
 * A decode is considered wrong when it contains control characters.
 * Whitespace controls (\x09-\x0d: tab, LF, VT, FF, CR) are NOT treated
 * as a failure, since correctly decoded text may contain them.
 *
 * @param Font      $font    font currently selected for the text stream
 * @param array     $command the command payload handed to Font::decodeText()
 * @param Page|null $page    page whose fonts are used as fallbacks; when
 *                           null no fallback is attempted
 *
 * @return string the first decode without disallowed control characters,
 *                or the decode from $font if no page font yields one
 */
private function getTJUsingFontFallback(Font $font, array $command, Page $page = null): string
{
    // Control characters that indicate a bad decode. \x09-\x0d are
    // deliberately excluded because they are ordinary whitespace.
    $bad_chars = '/[\x00-\x08\x0e-\x1f\x7f]/u';

    $orig_text = $font->decodeText($command);

    // If we make this a Config option, we can add a check if it's
    // enabled here.
    if (null === $page) {
        return $orig_text;
    }

    // preg_match() returns 1 only on a match; 0 (no match) and false
    // (e.g. subject is not valid UTF-8) both leave the text as-is,
    // which is what the original truthiness check did.
    if (1 !== preg_match($bad_chars, $orig_text)) {
        return $orig_text;
    }

    // The selected font is probably the wrong one. Try every font on
    // the page, in order, until one decodes cleanly.
    foreach (array_keys($page->getFonts()) as $font_id) {
        // array_keys() yields integers for numeric-looking IDs; cast so
        // the lookup always receives a string.
        $candidate = $page->getFont((string) $font_id);

        // NOTE(review): Page::getFont() is not visible here. Skip
        // anything that is not a Font instead of calling decodeText()
        // on it - confirm whether it can actually return null.
        if (!$candidate instanceof Font) {
            continue;
        }

        $text = $candidate->decodeText($command);
        if (1 !== preg_match($bad_chars, $text)) {
            return $text;
        }
    }

    // Out of font IDs: give up and use the original string.
    return $orig_text;
}

/**
* @throws \Exception
*/
Expand Down Expand Up @@ -339,8 +369,11 @@ public function getText(Page $page = null): string
$command[self::COMMAND] = [$command];
// no break
case 'TJ':
$sub_text = $current_font->decodeText($command[self::COMMAND]);
$text .= $sub_text;
$text .= $this->getTJUsingFontFallback(
$current_font,
$command[self::COMMAND],
$page
);
break;

// set leading
Expand Down Expand Up @@ -492,8 +525,11 @@ public function getTextArray(Page $page = null): array
$command[self::COMMAND] = [$command];
// no break
case 'TJ':
$sub_text = $current_font->decodeText($command[self::COMMAND]);
$text[] = $sub_text;
$text[] = $this->getTJUsingFontFallback(
$current_font,
$command[self::COMMAND],
$page
);
break;

// set leading
Expand Down Expand Up @@ -592,7 +628,7 @@ public function getCommandsText(string $text_part, int &$offset = 0): array
case '/':
$type = $char;
if (preg_match(
'/\G\/([A-Z0-9\._,\+]+\s+[0-9.\-]+)\s+([A-Z]+)\s*/si',
'/\G\/([A-Z0-9\._,\+-]+\s+[0-9.\-]+)\s+([A-Z]+)\s*/si',
$text_part,
$matches,
0,
Expand All @@ -603,7 +639,7 @@ public function getCommandsText(string $text_part, int &$offset = 0): array
$command = $matches[1];
$offset += \strlen($matches[0]);
} elseif (preg_match(
'/\G\/([A-Z0-9\._,\+]+)\s+([A-Z]+)\s*/si',
'/\G\/([A-Z0-9\._,\+-]+)\s+([A-Z]+)\s*/si',
$text_part,
$matches,
0,
Expand Down
34 changes: 34 additions & 0 deletions tests/PHPUnit/Integration/PDFObjectTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -256,4 +256,38 @@ public function testReversedChars(): void

$this->assertStringContainsString('שלומי טסט', $pages[0]->getText());
}

/**
 * Verifies the font fallback on a sample whose text stream selects an
 * unsuitable font code page: the text extracted from the first page
 * must still contain the expected Cyrillic word.
 *
 * @see https://github.com/smalot/pdfparser/issues/586
 */
public function testImproperFontFallback(): void
{
    $document = $this->getParserInstance()->parseFile(
        $this->rootDir.'/samples/ImproperFontFallback.pdf'
    );
    $firstPageText = $document->getPages()[0]->getText();

    $this->assertStringContainsString('сделал', $firstPageText);
}

/**
 * Verifies that a sample using a font ID with a hyphen / dash in it is
 * parsed: the text extracted from the first page must contain the
 * expected word.
 *
 * @see https://github.com/smalot/pdfparser/issues/145
 */
public function testFontIDWithHyphen(): void
{
    $document = $this->getParserInstance()->parseFile(
        $this->rootDir.'/samples/FontIDHyphen.pdf'
    );
    $firstPageText = $document->getPages()[0]->getText();

    $this->assertStringContainsString('AERODROME', $firstPageText);
}
}