Skip to content

Commit

Permalink
Merge pull request #122 from j0k3r/update-pdf-deps
Browse files Browse the repository at this point in the history
Update pdf deps
  • Loading branch information
j0k3r authored Oct 9, 2017
2 parents 5bbcead + e8c0af7 commit c467f10
Show file tree
Hide file tree
Showing 6 changed files with 45 additions and 45 deletions.
2 changes: 1 addition & 1 deletion composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
"htmlawed/htmlawed": "dev-master",
"symfony/options-resolver": "~2.6|~3.0",
"monolog/monolog": "^1.13.1",
"smalot/pdfparser": "~0.9.24",
"smalot/pdfparser": "~0.11",
"wallabag/tcpdf": "^6.2",
"true/punycode": "~2.0"
},
Expand Down
28 changes: 14 additions & 14 deletions src/Extractor/ContentExtractor.php
Original file line number Diff line number Diff line change
Expand Up @@ -469,7 +469,7 @@ function ($element, $currentEntity) {

// If no year has been found during date_parse, we nuke the whole value
// because the date is invalid
if ($parseDate['year'] === false) {
if (false === $parseDate['year']) {
$this->date = null;
}

Expand All @@ -481,7 +481,7 @@ function ($element, $currentEntity) {
$this->logger->log('debug', 'Detecting body');
$this->body = $this->readability->getContent();

if ($this->body->childNodes->length === 1 && $this->body->firstChild->nodeType === XML_ELEMENT_NODE) {
if (1 === $this->body->childNodes->length && XML_ELEMENT_NODE === $this->body->firstChild->nodeType) {
$this->body = $this->body->firstChild;
}

Expand All @@ -495,14 +495,14 @@ function ($element, $currentEntity) {
if (isset($this->body)) {
// remove any h1-h6 elements that appear as first thing in the body
// and which match our title
if (isset($this->title) && $this->title !== '' && null !== $this->body->firstChild) {
if (isset($this->title) && '' !== $this->title && null !== $this->body->firstChild) {
$firstChild = $this->body->firstChild;

while ($firstChild->nextSibling !== null && $firstChild->nodeType && ($firstChild->nodeType !== XML_ELEMENT_NODE)) {
while (null !== $firstChild->nextSibling && $firstChild->nodeType && (XML_ELEMENT_NODE !== $firstChild->nodeType)) {
$firstChild = $firstChild->nextSibling;
}

if ($firstChild->nodeType === XML_ELEMENT_NODE
if (XML_ELEMENT_NODE === $firstChild->nodeType
&& in_array(strtolower($firstChild->tagName), ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'], true)
&& (strtolower(trim($firstChild->textContent)) === strtolower(trim($this->title)))) {
$this->body->removeChild($firstChild);
Expand Down Expand Up @@ -532,7 +532,7 @@ function ($element, $currentEntity) {
// inside the data-lazy-src attribute. It also places the original image inside a noscript element
// next to the amended one.
// @see https://plugins.trac.wordpress.org/browser/lazy-load/trunk/lazy-load.php
if ($e->nextSibling !== null && $e->nextSibling->nodeName === 'noscript') {
if (null !== $e->nextSibling && 'noscript' === $e->nextSibling->nodeName) {
$newElem = $e->ownerDocument->createDocumentFragment();
$newElem->appendXML($e->nextSibling->innerHTML);
$e->nextSibling->parentNode->replaceChild($newElem, $e->nextSibling);
Expand Down Expand Up @@ -780,13 +780,13 @@ private function extractAuthor($detectAuthor, \DOMNode $node)

if ($fns && $fns->length > 0) {
foreach ($fns as $fn) {
if (trim($fn->textContent) !== '') {
if ('' !== trim($fn->textContent)) {
$this->authors[] = trim($fn->textContent);
$this->logger->log('debug', 'hNews: found author: ' . trim($fn->textContent));
}
}
} else {
if (trim($author->textContent) !== '') {
if ('' !== trim($author->textContent)) {
$this->authors[] = trim($author->textContent);
$this->logger->log('debug', 'hNews: found author: ' . trim($author->textContent));
}
Expand Down Expand Up @@ -822,7 +822,7 @@ private function extractBody($detectBody, $xpathExpression, \DOMNode $node, $typ

$this->logger->log('debug', $type . ': found "' . $elems->length . '" with ' . $xpathExpression);

if ($elems->length === 1) {
if (1 === $elems->length) {
$this->body = $elems->item(0);

// prune (clean up elements that may not be content)
Expand All @@ -846,7 +846,7 @@ private function extractBody($detectBody, $xpathExpression, \DOMNode $node, $typ
$isDescendant = false;
foreach ($this->body->childNodes as $parent) {
$node = $elem->parentNode;
while ($node !== null) {
while (null !== $node) {
if ($node->isSameNode($parent)) {
$isDescendant = true;
break 2;
Expand Down Expand Up @@ -933,7 +933,7 @@ private function extractEntityFromPattern($entity, $pattern, $returnCallback = n
$elems = $this->xpath->evaluate($pattern, $this->readability->dom);
$entityValue = null;

if (is_string($elems) && trim($elems) !== '') {
if (is_string($elems) && '' !== trim($elems)) {
$entityValue = $returnCallback($elems);

$this->logger->log('debug', "{$entity} expression evaluated as string: {{$entity}}", [$entity => $entityValue]);
Expand All @@ -952,7 +952,7 @@ private function extractEntityFromPattern($entity, $pattern, $returnCallback = n
}
}

if ($entityValue !== null) {
if (null !== $entityValue) {
$this->{$entity} = $entityValue;

return true;
Expand Down Expand Up @@ -984,7 +984,7 @@ private function extractMultipleEntityFromPattern($entity, $pattern, $returnCall
$elems = $this->xpath->evaluate($pattern, $this->readability->dom);
$entityValue = null;

if (is_string($elems) && trim($elems) !== '') {
if (is_string($elems) && '' !== trim($elems)) {
$entityValue[] = $returnCallback($elems);

$this->logger->log('debug', "{$entity} expression evaluated as string: {{$entity}}", [$entity => $entityValue]);
Expand All @@ -1005,7 +1005,7 @@ private function extractMultipleEntityFromPattern($entity, $pattern, $returnCall
$this->logger->log('debug', '...XPath match: {pattern}', ['pattern', $pattern]);
}

if ($entityValue !== null) {
if (null !== $entityValue) {
$this->{$entity} = $entityValue;

return true;
Expand Down
8 changes: 4 additions & 4 deletions src/Extractor/HttpClient.php
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ public function fetch($url, $skipTypeVerification = false, $httpHeader = [])
// check for <meta name='fragment' content='!'/>
// for AJAX sites, e.g. Blogger with its dynamic views templates.
// Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification
if (strpos($effectiveUrl, '_escaped_fragment_') === false) {
if (false === strpos($effectiveUrl, '_escaped_fragment_')) {
$redirectURL = $this->getMetaRefreshURL($effectiveUrl, $body) ?: $this->getUglyURL($effectiveUrl, $body);

if (false !== $redirectURL) {
Expand Down Expand Up @@ -225,7 +225,7 @@ private function cleanupUrl($url)
{
// rewrite part of urls to something more readable
foreach ($this->config['rewrite_url'] as $find => $action) {
if (strpos($url, $find) !== false && is_array($action)) {
if (false !== strpos($url, $find) && is_array($action)) {
$url = strtr($url, $action);
}
}
Expand Down Expand Up @@ -333,7 +333,7 @@ private function getUserAgent($url, $httpHeader = [])

$host = parse_url($url, PHP_URL_HOST);

if (strtolower(substr($host, 0, 4)) === 'www.') {
if ('www.' === strtolower(substr($host, 0, 4))) {
$host = substr($host, 4);
}

Expand Down Expand Up @@ -422,7 +422,7 @@ private function headerOnlyType($contentType)
*/
private function getMetaRefreshURL($url, $html)
{
if ($html === '') {
if ('' === $html) {
return false;
}

Expand Down
28 changes: 14 additions & 14 deletions src/Graby.php
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ public function cleanupHtml($contentBlock, $url)
}

// footnotes
if ($this->config['content_links'] === 'footnotes' && strpos($url, 'wikipedia.org') === false) {
if ('footnotes' === $this->config['content_links'] && false === strpos($url, 'wikipedia.org')) {
$this->extractor->readability->addFootnotes($contentBlock);
}

Expand All @@ -190,13 +190,13 @@ public function cleanupHtml($contentBlock, $url)

// remove empty text nodes
foreach ($contentBlock->childNodes as $n) {
if ($n->nodeType === XML_TEXT_NODE && trim($n->textContent) === '') {
if (XML_TEXT_NODE === $n->nodeType && '' === trim($n->textContent)) {
$contentBlock->removeChild($n);
}
}

// remove nesting: <div><div><div><p>test</p></div></div></div> = <p>test</p>
while ($contentBlock->childNodes->length === 1 && $contentBlock->firstChild->nodeType === XML_ELEMENT_NODE) {
while (1 === $contentBlock->childNodes->length && XML_ELEMENT_NODE === $contentBlock->firstChild->nodeType) {
// only follow these tag names
if (!in_array(strtolower($contentBlock->tagName), ['div', 'article', 'section', 'header', 'footer'], true)) {
break;
Expand All @@ -215,7 +215,7 @@ public function cleanupHtml($contentBlock, $url)

// post-processing cleanup
$html = preg_replace('!<p>[\s\h\v]*</p>!u', '', $html);
if ($this->config['content_links'] === 'remove') {
if ('remove' === $this->config['content_links']) {
$html = preg_replace('!</?a[^>]*>!', '', $html);
}

Expand Down Expand Up @@ -408,7 +408,7 @@ private function validateUrl($url)
{
// Check for feed URL
$url = trim($url);
if (strtolower(substr($url, 0, 7)) === 'feed://') {
if ('feed://' === strtolower(substr($url, 0, 7))) {
$url = 'http://' . substr($url, 7);
}

Expand Down Expand Up @@ -458,13 +458,13 @@ private function isUrlAllowed($url)

if (!empty($allowedUrls)) {
foreach ($allowedUrls as $allowurl) {
if (stristr($url, $allowurl) !== false) {
if (false !== stristr($url, $allowurl)) {
return true;
}
}
} else {
foreach ($blockedUrls as $blockurl) {
if (stristr($url, $blockurl) !== false) {
if (false !== stristr($url, $blockurl)) {
return false;
}
}
Expand Down Expand Up @@ -551,11 +551,11 @@ private function handleMimeAction($mimeInfo, $effectiveUrl, $response = [])

$infos['html'] = '<a href="' . $effectiveUrl . '">Download ' . $mimeInfo['name'] . '</a>';

if ($mimeInfo['type'] === 'image') {
if ('image' === $mimeInfo['type']) {
$infos['html'] = '<a href="' . $effectiveUrl . '"><img src="' . $effectiveUrl . '" alt="' . $mimeInfo['name'] . '" /></a>';
}

if ($mimeInfo['mime'] === 'application/pdf') {
if ('application/pdf' === $mimeInfo['mime']) {
$parser = new PdfParser();
$pdf = $parser->parseContent($body);

Expand Down Expand Up @@ -597,7 +597,7 @@ private function handleMimeAction($mimeInfo, $effectiveUrl, $response = [])
}
}

if ($mimeInfo['mime'] === 'text/plain') {
if ('text/plain' === $mimeInfo['mime']) {
$infos['html'] = '<pre>' .
$this->cleanupXss(
$this->convert2Utf8($body, isset($response['all_headers']) ? $response['all_headers'] : [])
Expand Down Expand Up @@ -918,7 +918,7 @@ private function convert2Utf8($html, array $headers = [])
// If it's not, result will be empty string.
// For now we'll check for invalid encoding types returned by some sites, e.g. 'none'
// Problem URL: http://facta.co.jp/blog/archives/20111026001026.html
if (empty($encoding) || $encoding === 'none') {
if (empty($encoding) || 'none' === $encoding) {
// search for encoding in HTML - only look at the first 50000 characters
// Why 50000? See, for example, http://www.lemonde.fr/festival-de-cannes/article/2012/05/23/deux-cretes-en-goguette-sur-la-croisette_1705732_766360.html
// TODO: improve this so it looks at smaller chunks first
Expand All @@ -940,11 +940,11 @@ private function convert2Utf8($html, array $headers = [])
$encoding = strtolower(trim($encoding));

// fix bad encoding values
if ($encoding === 'iso-8850-1') {
if ('iso-8850-1' === $encoding) {
$encoding = 'iso-8859-1';
}

if (empty($encoding) || $encoding === 'iso-8859-1') {
if (empty($encoding) || 'iso-8859-1' === $encoding) {
// replace MS Word smart quotes
$trans = [];
$trans[chr(130)] = '&sbquo;'; // Single Low-9 Quotation Mark
Expand Down Expand Up @@ -974,7 +974,7 @@ private function convert2Utf8($html, array $headers = [])
$html = strtr($html, $trans);
}

if ($encoding !== 'utf-8') {
if ('utf-8' !== $encoding) {
// https://www.w3.org/International/articles/http-charset/index#charset
// HTTP 1.1 says that the default charset is ISO-8859-1
$encoding = $encoding ?: 'iso-8859-1';
Expand Down
20 changes: 10 additions & 10 deletions src/SiteConfig/ConfigBuilder.php
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ public function setLogger(LoggerInterface $logger)
public function addToCache($key, SiteConfig $config)
{
$key = strtolower($key);
if (substr($key, 0, 4) === 'www.') {
if ('www.' === substr($key, 0, 4)) {
$key = substr($key, 4);
}

Expand All @@ -78,7 +78,7 @@ public function addToCache($key, SiteConfig $config)
public function getCachedVersion($key)
{
$key = strtolower($key);
if (substr($key, 0, 4) === 'www.') {
if ('www.' === substr($key, 0, 4)) {
$key = substr($key, 4);
}

Expand Down Expand Up @@ -128,7 +128,7 @@ public function buildFromUrl($url, $addToCache = true)
public function buildForHost($host, $addToCache = true)
{
$host = strtolower($host);
if (substr($host, 0, 4) === 'www.') {
if ('www.' === substr($host, 0, 4)) {
$host = substr($host, 4);
}

Expand Down Expand Up @@ -201,7 +201,7 @@ public function build($host, $exactHostMatch = false)
public function loadSiteConfig($host, $exactHostMatch = false)
{
$host = strtolower($host);
if (substr($host, 0, 4) === 'www.') {
if ('www.' === substr($host, 0, 4)) {
$host = substr($host, 4);
}

Expand Down Expand Up @@ -315,20 +315,20 @@ public function parseLines(array $lines)
$line = trim($line);

// skip comments, empty lines
if ($line === '' || $line[0] === '#') {
if ('' === $line || '#' === $line[0]) {
continue;
}

// get command
$command = explode(':', $line, 2);
// if there's no colon ':', skip this line
if (count($command) !== 2) {
if (2 !== count($command)) {
continue;
}

$val = trim($command[1]);
$command = trim($command[0]);
if ($command === '' || $val === '') {
if ('' === $command || '' === $val) {
continue;
}

Expand All @@ -337,15 +337,15 @@ public function parseLines(array $lines)
array_push($config->$command, $val);
// check for single statement commands that evaluate to true or false
} elseif (in_array($command, ['tidy', 'prune', 'autodetect_on_failure', 'requires_login'], true)) {
$config->$command = ($val === 'yes' || $val === 'true');
$config->$command = ('yes' === $val || 'true' === $val);
// check for single statement commands stored as strings
} elseif (in_array($command, ['parser', 'login_username_field', 'login_password_field', 'not_logged_in_xpath', 'login_uri'], true)) {
$config->$command = $val;
// check for replace_string(find): replace
} elseif ((substr($command, -1) === ')') && preg_match('!^([a-z0-9_]+)\((.*?)\)$!i', $command, $match) && $match[1] === 'replace_string') {
} elseif ((')' === substr($command, -1)) && preg_match('!^([a-z0-9_]+)\((.*?)\)$!i', $command, $match) && 'replace_string' === $match[1]) {
array_push($config->find_string, $match[2]);
array_push($config->replace_string, $val);
} elseif ((substr($command, -1) === ')') && preg_match('!^([a-z0-9_]+)\(([a-z0-9_-]+)\)$!i', $command, $match) && $match[1] === 'http_header' && in_array($match[2], ['user-agent', 'referer'], true)) {
} elseif ((')' === substr($command, -1)) && preg_match('!^([a-z0-9_]+)\(([a-z0-9_-]+)\)$!i', $command, $match) && 'http_header' === $match[1] && in_array($match[2], ['user-agent', 'referer'], true)) {
$config->http_header[$match[2]] = $val;
}
}
Expand Down
4 changes: 2 additions & 2 deletions tests/Extractor/ContentExtractorTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -94,11 +94,11 @@ public function testBuildSiteConfig()

$this->assertInstanceOf('Graby\SiteConfig\SiteConfig', $res);

foreach (['author', 'single_page_link', 'next_page_link', 'find_string', 'replace_string'] as $value) {
foreach (['author', 'single_page_link', 'next_page_link'] as $value) {
$this->assertEmpty($res->$value, 'Check empty value for: ' . $value);
}

foreach (['date', 'strip_image_src', 'http_header'] as $value) {
foreach (['date', 'strip_image_src', 'http_header', 'find_string', 'replace_string'] as $value) {
$this->assertNotEmpty($res->$value, 'Check not empty value for: ' . $value);
}

Expand Down

0 comments on commit c467f10

Please sign in to comment.