From a2c316887d2edd7a3d197fb591bd14cf77211a1f Mon Sep 17 00:00:00 2001 From: Serge Titov Date: Wed, 8 Mar 2023 10:49:07 +0300 Subject: [PATCH 1/3] reduce excessive memory allocations --- .../PdfParser/RawData/RawDataParser.php | 38 ++++++++----------- 1 file changed, 16 insertions(+), 22 deletions(-) diff --git a/src/Smalot/PdfParser/RawData/RawDataParser.php b/src/Smalot/PdfParser/RawData/RawDataParser.php index ff5a15ad..86037a56 100644 --- a/src/Smalot/PdfParser/RawData/RawDataParser.php +++ b/src/Smalot/PdfParser/RawData/RawDataParser.php @@ -635,14 +635,10 @@ protected function getRawObject(string $pdfData, int $offset = 0): array // name object $objtype = $char; ++$offset; - $pregResult = preg_match( - '/^([^\x00\x09\x0a\x0c\x0d\x20\s\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]+)/', - substr($pdfData, $offset, 256), - $matches - ); - if (1 == $pregResult) { - $objval = $matches[1]; // unescaped value - $offset += \strlen($objval); + $span = strcspn($pdfData, "\x00\x09\x0a\x0c\x0d\x20\n\t\r\v\f\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25", $offset, 256); + if ($span > 0) { + $objval = substr($pdfData, $offset, $span); // unescaped value + $offset += $span; } break; @@ -723,15 +719,12 @@ protected function getRawObject(string $pdfData, int $offset = 0): array // hexadecimal string object $objtype = $char; ++$offset; - $pregResult = preg_match( - '/^([0-9A-Fa-f\x09\x0a\x0c\x0d\x20]+)>/iU', - substr($pdfData, $offset), - $matches - ); - if (('<' == $char) && 1 == $pregResult) { + + $span = strspn($pdfData, "0123456789abcdefABCDEF\x09\x0a\x0c\x0d\x20", $offset); + if (('<' == $char) && $span > 0 && @$pdfData[$offset+$span] == '>') { // remove white space characters - $objval = strtr($matches[1], $this->config->getPdfWhitespaces(), ''); - $offset += \strlen($matches[0]); + $objval = strtr(substr($pdfData, $offset, $span), $this->config->getPdfWhitespaces(), ''); + $offset += $span + 1; } elseif (false !== ($endpos = strpos($pdfData, '>', $offset))) { $offset = $endpos + 1; } @@ -762,17 +755,18 @@ protected function getRawObject(string $pdfData, int $offset = 0): array // start stream object $objtype = 'stream'; $offset += 6; - if (1 == preg_match('/^([\r]?[\n])/isU', substr($pdfData, $offset), $matches)) { + if (1 == preg_match('/^([\r]?[\n])/isU', substr($pdfData, $offset, 4), $matches)) { $offset += \strlen($matches[0]); $pregResult = preg_match( '/(endstream)[\x09\x0a\x0c\x0d\x20]/isU', - substr($pdfData, $offset), + $pdfData, $matches, - \PREG_OFFSET_CAPTURE + \PREG_OFFSET_CAPTURE, + $offset ); if (1 == $pregResult) { - $objval = substr($pdfData, $offset, $matches[0][1]); - $offset += $matches[1][1]; + $objval = substr($pdfData, $offset, $matches[0][1] - $offset); + $offset = $matches[1][1]; } } } elseif ('endstream' == substr($pdfData, $offset, 9)) { @@ -888,7 +882,7 @@ public function parseData(string $data): array } // get PDF content string - $pdfData = substr($data, $trimpos); + $pdfData = $trimpos > 0 ? substr($data, $trimpos) : $data; // get xref and trailer data $xref = $this->getXrefData($pdfData); From 619a9eea86c638f7f699d966432acf0926c49d83 Mon Sep 17 00:00:00 2001 From: Serge Titov Date: Sun, 12 Mar 2023 10:57:00 +0300 Subject: [PATCH 2/3] yoda-style comparison --- src/Smalot/PdfParser/RawData/RawDataParser.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Smalot/PdfParser/RawData/RawDataParser.php b/src/Smalot/PdfParser/RawData/RawDataParser.php index 86037a56..ff2a869b 100644 --- a/src/Smalot/PdfParser/RawData/RawDataParser.php +++ b/src/Smalot/PdfParser/RawData/RawDataParser.php @@ -721,7 +721,7 @@ protected function getRawObject(string $pdfData, int $offset = 0): array ++$offset; $span = strspn($pdfData, "0123456789abcdefABCDEF\x09\x0a\x0c\x0d\x20", $offset); - if (('<' == $char) && $span > 0 && @$pdfData[$offset+$span] == '>') { + if ('<' == $char && $span > 0 && '>' == @$pdfData[$offset + $span]) { // remove white space characters $objval = strtr(substr($pdfData, $offset, $span), $this->config->getPdfWhitespaces(), ''); $offset += $span + 1; From c6729a813ad96d52c62fefa38ac95bfdc1ffd9e6 Mon Sep 17 00:00:00 2001 From: Serge Titov Date: Mon, 13 Mar 2023 13:27:58 +0300 Subject: [PATCH 3/3] minor: test nullness explicitly Co-authored-by: Konrad Abicht --- src/Smalot/PdfParser/RawData/RawDataParser.php | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Smalot/PdfParser/RawData/RawDataParser.php b/src/Smalot/PdfParser/RawData/RawDataParser.php index ff2a869b..71abbd87 100644 --- a/src/Smalot/PdfParser/RawData/RawDataParser.php +++ b/src/Smalot/PdfParser/RawData/RawDataParser.php @@ -721,7 +721,8 @@ protected function getRawObject(string $pdfData, int $offset = 0): array ++$offset; $span = strspn($pdfData, "0123456789abcdefABCDEF\x09\x0a\x0c\x0d\x20", $offset); - if ('<' == $char && $span > 0 && '>' == @$pdfData[$offset + $span]) { + $dataToCheck = $pdfData[$offset + $span] ?? null; + if ('<' == $char && $span > 0 && '>' == $dataToCheck) { // remove white space characters $objval = strtr(substr($pdfData, $offset, $span), $this->config->getPdfWhitespaces(), ''); $offset += $span + 1;