diff --git a/composer.json b/composer.json index 93629654..70e637c1 100644 --- a/composer.json +++ b/composer.json @@ -24,7 +24,7 @@ "htmlawed/htmlawed": "dev-master", "symfony/options-resolver": "~2.6|~3.0", "monolog/monolog": "^1.13.1", - "smalot/pdfparser": "~0.9.24", + "smalot/pdfparser": "~0.11", "wallabag/tcpdf": "^6.2", "true/punycode": "~2.0" }, diff --git a/src/Extractor/ContentExtractor.php b/src/Extractor/ContentExtractor.php index 729c395c..e6b8353d 100644 --- a/src/Extractor/ContentExtractor.php +++ b/src/Extractor/ContentExtractor.php @@ -469,7 +469,7 @@ function ($element, $currentEntity) { // If no year has been found during date_parse, we nuke the whole value // because the date is invalid - if ($parseDate['year'] === false) { + if (false === $parseDate['year']) { $this->date = null; } @@ -481,7 +481,7 @@ function ($element, $currentEntity) { $this->logger->log('debug', 'Detecting body'); $this->body = $this->readability->getContent(); - if ($this->body->childNodes->length === 1 && $this->body->firstChild->nodeType === XML_ELEMENT_NODE) { + if (1 === $this->body->childNodes->length && XML_ELEMENT_NODE === $this->body->firstChild->nodeType) { $this->body = $this->body->firstChild; } @@ -495,14 +495,14 @@ function ($element, $currentEntity) { if (isset($this->body)) { // remove any h1-h6 elements that appear as first thing in the body // and which match our title - if (isset($this->title) && $this->title !== '' && null !== $this->body->firstChild) { + if (isset($this->title) && '' !== $this->title && null !== $this->body->firstChild) { $firstChild = $this->body->firstChild; - while ($firstChild->nextSibling !== null && $firstChild->nodeType && ($firstChild->nodeType !== XML_ELEMENT_NODE)) { + while (null !== $firstChild->nextSibling && $firstChild->nodeType && (XML_ELEMENT_NODE !== $firstChild->nodeType)) { $firstChild = $firstChild->nextSibling; } - if ($firstChild->nodeType === XML_ELEMENT_NODE + if (XML_ELEMENT_NODE === $firstChild->nodeType && in_array(strtolower($firstChild->tagName), ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'], true) && (strtolower(trim($firstChild->textContent)) === strtolower(trim($this->title)))) { $this->body->removeChild($firstChild); @@ -532,7 +532,7 @@ function ($element, $currentEntity) { // inside the data-lazy-src attribute. It also places the original image inside a noscript element // next to the amended one. // @see https://plugins.trac.wordpress.org/browser/lazy-load/trunk/lazy-load.php - if ($e->nextSibling !== null && $e->nextSibling->nodeName === 'noscript') { + if (null !== $e->nextSibling && 'noscript' === $e->nextSibling->nodeName) { $newElem = $e->ownerDocument->createDocumentFragment(); $newElem->appendXML($e->nextSibling->innerHTML); $e->nextSibling->parentNode->replaceChild($newElem, $e->nextSibling); @@ -780,13 +780,13 @@ private function extractAuthor($detectAuthor, \DOMNode $node) if ($fns && $fns->length > 0) { foreach ($fns as $fn) { - if (trim($fn->textContent) !== '') { + if ('' !== trim($fn->textContent)) { $this->authors[] = trim($fn->textContent); $this->logger->log('debug', 'hNews: found author: ' . trim($fn->textContent)); } } } else { - if (trim($author->textContent) !== '') { + if ('' !== trim($author->textContent)) { $this->authors[] = trim($author->textContent); $this->logger->log('debug', 'hNews: found author: ' . trim($author->textContent)); } @@ -822,7 +822,7 @@ private function extractBody($detectBody, $xpathExpression, \DOMNode $node, $typ $this->logger->log('debug', $type . ': found "' . $elems->length . '" with ' . $xpathExpression); - if ($elems->length === 1) { + if (1 === $elems->length) { $this->body = $elems->item(0); // prune (clean up elements that may not be content) @@ -846,7 +846,7 @@ private function extractBody($detectBody, $xpathExpression, \DOMNode $node, $typ $isDescendant = false; foreach ($this->body->childNodes as $parent) { $node = $elem->parentNode; - while ($node !== null) { + while (null !== $node) { if ($node->isSameNode($parent)) { $isDescendant = true; break 2; @@ -933,7 +933,7 @@ private function extractEntityFromPattern($entity, $pattern, $returnCallback = n $elems = $this->xpath->evaluate($pattern, $this->readability->dom); $entityValue = null; - if (is_string($elems) && trim($elems) !== '') { + if (is_string($elems) && '' !== trim($elems)) { $entityValue = $returnCallback($elems); $this->logger->log('debug', "{$entity} expression evaluated as string: {{$entity}}", [$entity => $entityValue]); @@ -952,7 +952,7 @@ private function extractEntityFromPattern($entity, $pattern, $returnCallback = n } } - if ($entityValue !== null) { + if (null !== $entityValue) { $this->{$entity} = $entityValue; return true; @@ -984,7 +984,7 @@ private function extractMultipleEntityFromPattern($entity, $pattern, $returnCall $elems = $this->xpath->evaluate($pattern, $this->readability->dom); $entityValue = null; - if (is_string($elems) && trim($elems) !== '') { + if (is_string($elems) && '' !== trim($elems)) { $entityValue[] = $returnCallback($elems); $this->logger->log('debug', "{$entity} expression evaluated as string: {{$entity}}", [$entity => $entityValue]); @@ -1005,7 +1005,7 @@ private function extractMultipleEntityFromPattern($entity, $pattern, $returnCall $this->logger->log('debug', '...XPath match: {pattern}', ['pattern', $pattern]); } - if ($entityValue !== null) { + if (null !== $entityValue) { $this->{$entity} = $entityValue; return true; diff --git a/src/Extractor/HttpClient.php b/src/Extractor/HttpClient.php index 99d6d9bf..d5b21b2c 100644 --- a/src/Extractor/HttpClient.php +++ b/src/Extractor/HttpClient.php @@ -186,7 +186,7 @@ public function fetch($url, $skipTypeVerification = false, $httpHeader = []) // check for // for AJAX sites, e.g. Blogger with its dynamic views templates. // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification - if (strpos($effectiveUrl, '_escaped_fragment_') === false) { + if (false === strpos($effectiveUrl, '_escaped_fragment_')) { $redirectURL = $this->getMetaRefreshURL($effectiveUrl, $body) ?: $this->getUglyURL($effectiveUrl, $body); if (false !== $redirectURL) { @@ -225,7 +225,7 @@ private function cleanupUrl($url) { // rewrite part of urls to something more readable foreach ($this->config['rewrite_url'] as $find => $action) { - if (strpos($url, $find) !== false && is_array($action)) { + if (false !== strpos($url, $find) && is_array($action)) { $url = strtr($url, $action); } } @@ -333,7 +333,7 @@ private function getUserAgent($url, $httpHeader = []) $host = parse_url($url, PHP_URL_HOST); - if (strtolower(substr($host, 0, 4)) === 'www.') { + if ('www.' === strtolower(substr($host, 0, 4))) { $host = substr($host, 4); } @@ -422,7 +422,7 @@ private function headerOnlyType($contentType) */ private function getMetaRefreshURL($url, $html) { - if ($html === '') { + if ('' === $html) { return false; } diff --git a/src/Graby.php b/src/Graby.php index 87c94352..91dadae2 100644 --- a/src/Graby.php +++ b/src/Graby.php @@ -181,7 +181,7 @@ public function cleanupHtml($contentBlock, $url) } // footnotes - if ($this->config['content_links'] === 'footnotes' && strpos($url, 'wikipedia.org') === false) { + if ('footnotes' === $this->config['content_links'] && false === strpos($url, 'wikipedia.org')) { $this->extractor->readability->addFootnotes($contentBlock); } @@ -190,13 +190,13 @@ public function cleanupHtml($contentBlock, $url) // remove empty text nodes foreach ($contentBlock->childNodes as $n) { - if ($n->nodeType === XML_TEXT_NODE && trim($n->textContent) === '') { + if (XML_TEXT_NODE === $n->nodeType && '' === trim($n->textContent)) { $contentBlock->removeChild($n); } } // remove nesting:
test
test
- while ($contentBlock->childNodes->length === 1 && $contentBlock->firstChild->nodeType === XML_ELEMENT_NODE) { + while (1 === $contentBlock->childNodes->length && XML_ELEMENT_NODE === $contentBlock->firstChild->nodeType) { // only follow these tag names if (!in_array(strtolower($contentBlock->tagName), ['div', 'article', 'section', 'header', 'footer'], true)) { break; @@ -215,7 +215,7 @@ public function cleanupHtml($contentBlock, $url) // post-processing cleanup $html = preg_replace('![\s\h\v]*
!u', '', $html); - if ($this->config['content_links'] === 'remove') { + if ('remove' === $this->config['content_links']) { $html = preg_replace('!?a[^>]*>!', '', $html); } @@ -408,7 +408,7 @@ private function validateUrl($url) { // Check for feed URL $url = trim($url); - if (strtolower(substr($url, 0, 7)) === 'feed://') { + if ('feed://' === strtolower(substr($url, 0, 7))) { $url = 'http://' . substr($url, 7); } @@ -458,13 +458,13 @@ private function isUrlAllowed($url) if (!empty($allowedUrls)) { foreach ($allowedUrls as $allowurl) { - if (stristr($url, $allowurl) !== false) { + if (false !== stristr($url, $allowurl)) { return true; } } } else { foreach ($blockedUrls as $blockurl) { - if (stristr($url, $blockurl) !== false) { + if (false !== stristr($url, $blockurl)) { return false; } } @@ -551,11 +551,11 @@ private function handleMimeAction($mimeInfo, $effectiveUrl, $response = []) $infos['html'] = 'Download ' . $mimeInfo['name'] . ''; - if ($mimeInfo['type'] === 'image') { + if ('image' === $mimeInfo['type']) { $infos['html'] = ''; } - if ($mimeInfo['mime'] === 'application/pdf') { + if ('application/pdf' === $mimeInfo['mime']) { $parser = new PdfParser(); $pdf = $parser->parseContent($body); @@ -597,7 +597,7 @@ private function handleMimeAction($mimeInfo, $effectiveUrl, $response = []) } } - if ($mimeInfo['mime'] === 'text/plain') { + if ('text/plain' === $mimeInfo['mime']) { $infos['html'] = '' . $this->cleanupXss( $this->convert2Utf8($body, isset($response['all_headers']) ? $response['all_headers'] : []) @@ -918,7 +918,7 @@ private function convert2Utf8($html, array $headers = []) // If it's not, result will be empty string. // For now we'll check for invalid encoding types returned by some sites, e.g. 'none' // Problem URL: http://facta.co.jp/blog/archives/20111026001026.html - if (empty($encoding) || $encoding === 'none') { + if (empty($encoding) || 'none' === $encoding) { // search for encoding in HTML - only look at the first 50000 characters // Why 50000? See, for example, http://www.lemonde.fr/festival-de-cannes/article/2012/05/23/deux-cretes-en-goguette-sur-la-croisette_1705732_766360.html // TODO: improve this so it looks at smaller chunks first @@ -940,11 +940,11 @@ private function convert2Utf8($html, array $headers = []) $encoding = strtolower(trim($encoding)); // fix bad encoding values - if ($encoding === 'iso-8850-1') { + if ('iso-8850-1' === $encoding) { $encoding = 'iso-8859-1'; } - if (empty($encoding) || $encoding === 'iso-8859-1') { + if (empty($encoding) || 'iso-8859-1' === $encoding) { // replace MS Word smart qutoes $trans = []; $trans[chr(130)] = '‚'; // Single Low-9 Quotation Mark @@ -974,7 +974,7 @@ private function convert2Utf8($html, array $headers = []) $html = strtr($html, $trans); } - if ($encoding !== 'utf-8') { + if ('utf-8' !== $encoding) { // https://www.w3.org/International/articles/http-charset/index#charset // HTTP 1.1 says that the default charset is ISO-8859-1 $encoding = $encoding ?: 'iso-8859-1'; diff --git a/src/SiteConfig/ConfigBuilder.php b/src/SiteConfig/ConfigBuilder.php index 1aeb6d33..667136e7 100644 --- a/src/SiteConfig/ConfigBuilder.php +++ b/src/SiteConfig/ConfigBuilder.php @@ -54,7 +54,7 @@ public function setLogger(LoggerInterface $logger) public function addToCache($key, SiteConfig $config) { $key = strtolower($key); - if (substr($key, 0, 4) === 'www.') { + if ('www.' === substr($key, 0, 4)) { $key = substr($key, 4); } @@ -78,7 +78,7 @@ public function addToCache($key, SiteConfig $config) public function getCachedVersion($key) { $key = strtolower($key); - if (substr($key, 0, 4) === 'www.') { + if ('www.' === substr($key, 0, 4)) { $key = substr($key, 4); } @@ -128,7 +128,7 @@ public function buildFromUrl($url, $addToCache = true) public function buildForHost($host, $addToCache = true) { $host = strtolower($host); - if (substr($host, 0, 4) === 'www.') { + if ('www.' === substr($host, 0, 4)) { $host = substr($host, 4); } @@ -201,7 +201,7 @@ public function build($host, $exactHostMatch = false) public function loadSiteConfig($host, $exactHostMatch = false) { $host = strtolower($host); - if (substr($host, 0, 4) === 'www.') { + if ('www.' === substr($host, 0, 4)) { $host = substr($host, 4); } @@ -315,20 +315,20 @@ public function parseLines(array $lines) $line = trim($line); // skip comments, empty lines - if ($line === '' || $line[0] === '#') { + if ('' === $line || '#' === $line[0]) { continue; } // get command $command = explode(':', $line, 2); // if there's no colon ':', skip this line - if (count($command) !== 2) { + if (2 !== count($command)) { continue; } $val = trim($command[1]); $command = trim($command[0]); - if ($command === '' || $val === '') { + if ('' === $command || '' === $val) { continue; } @@ -337,15 +337,15 @@ public function parseLines(array $lines) array_push($config->$command, $val); // check for single statement commands that evaluate to true or false } elseif (in_array($command, ['tidy', 'prune', 'autodetect_on_failure', 'requires_login'], true)) { - $config->$command = ($val === 'yes' || $val === 'true'); + $config->$command = ('yes' === $val || 'true' === $val); // check for single statement commands stored as strings } elseif (in_array($command, ['parser', 'login_username_field', 'login_password_field', 'not_logged_in_xpath', 'login_uri'], true)) { $config->$command = $val; // check for replace_string(find): replace - } elseif ((substr($command, -1) === ')') && preg_match('!^([a-z0-9_]+)\((.*?)\)$!i', $command, $match) && $match[1] === 'replace_string') { + } elseif ((')' === substr($command, -1)) && preg_match('!^([a-z0-9_]+)\((.*?)\)$!i', $command, $match) && 'replace_string' === $match[1]) { array_push($config->find_string, $match[2]); array_push($config->replace_string, $val); - } elseif ((substr($command, -1) === ')') && preg_match('!^([a-z0-9_]+)\(([a-z0-9_-]+)\)$!i', $command, $match) && $match[1] === 'http_header' && in_array($match[2], ['user-agent', 'referer'], true)) { + } elseif ((')' === substr($command, -1)) && preg_match('!^([a-z0-9_]+)\(([a-z0-9_-]+)\)$!i', $command, $match) && 'http_header' === $match[1] && in_array($match[2], ['user-agent', 'referer'], true)) { $config->http_header[$match[2]] = $val; } } diff --git a/tests/Extractor/ContentExtractorTest.php b/tests/Extractor/ContentExtractorTest.php index 8c139793..9057b8ab 100644 --- a/tests/Extractor/ContentExtractorTest.php +++ b/tests/Extractor/ContentExtractorTest.php @@ -94,11 +94,11 @@ public function testBuildSiteConfig() $this->assertInstanceOf('Graby\SiteConfig\SiteConfig', $res); - foreach (['author', 'single_page_link', 'next_page_link', 'find_string', 'replace_string'] as $value) { + foreach (['author', 'single_page_link', 'next_page_link'] as $value) { $this->assertEmpty($res->$value, 'Check empty value for: ' . $value); } - foreach (['date', 'strip_image_src', 'http_header'] as $value) { + foreach (['date', 'strip_image_src', 'http_header', 'find_string', 'replace_string'] as $value) { $this->assertNotEmpty($res->$value, 'Check not empty value for: ' . $value); }