Skip to content

Commit

Permalink
PHPStan level 3 reached
Browse files Browse the repository at this point in the history
Remove `SimplePie_IRI` and use `GuzzleHttp\Psr7\Uri`
  • Loading branch information
j0k3r committed Feb 6, 2019
1 parent 87c2972 commit 05ce203
Show file tree
Hide file tree
Showing 8 changed files with 51 additions and 118 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ script:
- mkdir -p build/logs
- php vendor/bin/simple-phpunit -v $PHPUNIT_FLAGS
- if [ "$CS_FIXER" = "run" ]; then php vendor/bin/php-cs-fixer fix --verbose --dry-run ; fi;
- if [ "$CS_FIXER" = "run" ]; then php vendor/bin/phpstan analyse src tests --no-progress --level 1 ; fi;
- if [ "$CS_FIXER" = "run" ]; then php vendor/bin/phpstan analyse src tests --no-progress --level 3 ; fi;

after_success:
- |
Expand Down
4 changes: 2 additions & 2 deletions composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -31,14 +31,14 @@
"php-http/discovery": "^1.0",
"php-http/client-common": "^2.0",
"php-http/message": "^1.7",
"j0k3r/httplug-ssrf-plugin": "^2.0"
"j0k3r/httplug-ssrf-plugin": "^2.0",
"guzzlehttp/psr7": "^1.5"
},
"require-dev": {
"friendsofphp/php-cs-fixer": "~2.14",
"symfony/phpunit-bridge": "~2.6|~3.0|~4.0",
"php-http/mock-client": "^1.2",
"php-http/guzzle6-adapter": "^2.0",
"guzzlehttp/psr7": "^1.5",
"php-coveralls/php-coveralls": "^2.0",
"phpstan/phpstan": "^0.11",
"phpstan/phpstan-phpunit": "^0.11"
Expand Down
16 changes: 2 additions & 14 deletions src/Extractor/HttpClient.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

use Graby\HttpClient\Plugin\History;
use Graby\HttpClient\Plugin\ServerSideRequestForgeryProtection\ServerSideRequestForgeryProtectionPlugin;
use GuzzleHttp\Psr7\Uri;
use Http\Client\Common\Exception\LoopException;
use Http\Client\Common\HttpMethodsClient;
use Http\Client\Common\Plugin;
Expand Down Expand Up @@ -518,20 +519,7 @@ private function getMetaRefreshURL($url, $html)
return $redirectUrl;
}

// absolutize redirect URL
$base = new \SimplePie_IRI($url);
// remove '//' in URL path (causes URLs not to resolve properly)
if (isset($base->ipath)) {
$base->ipath = str_replace('//', '/', $base->ipath);
}

if ($absolute = \SimplePie_IRI::absolutize($base, $redirectUrl)) {
$this->logger->info('Meta refresh redirect found (http-equiv="refresh"), new URL: ' . $absolute);

return $absolute->get_iri();
}

return false;
return (string) Uri::resolve(new Uri($url), $redirectUrl);
}

/**
Expand Down
93 changes: 28 additions & 65 deletions src/Graby.php
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
use Graby\Extractor\ContentExtractor;
use Graby\Extractor\HttpClient;
use Graby\SiteConfig\ConfigBuilder;
use GuzzleHttp\Psr7\Uri;
use Http\Client\Common\Plugin\CookiePlugin;
use Http\Client\Common\PluginClient;
use Http\Client\HttpClient as Client;
Expand Down Expand Up @@ -456,36 +457,32 @@ private function validateUrl($url)
$url = 'http://' . $url;
}

// explode url to convert accents
$parsedUrl = parse_url($url);
$uri = new Uri((string) $url);

if (false === $parsedUrl) {
throw new \Exception(sprintf('Url "%s" is not valid.', $url));
if (preg_match('/[\x80-\xff]/', $uri->getHost())) {
$uri = $uri->withHost($this->punycode->encode($uri->getHost()));
}

if (isset($parsedUrl['host']) && preg_match('/[\x80-\xff]/', $parsedUrl['host'])) {
$parsedUrl['host'] = $this->punycode->encode($parsedUrl['host']);
}

if (isset($parsedUrl['path']) && preg_match('/[\x80-\xff]/', $parsedUrl['path'])) {
if (\strlen($uri->getPath()) && preg_match('/[\x80-\xff]/', $uri->getPath())) {
$path = [];
foreach (explode('/', $parsedUrl['path']) as $value) {
foreach (explode('/', $uri->getPath()) as $value) {
$path[] = urlencode($value);
}
$parsedUrl['path'] = implode('/', $path);

$uri = $uri->withPath(implode('/', $path));
}

// everything should be converted, rebuild the final url
$url = $this->unparseUrl($parsedUrl);
$url = (string) $uri;

if (false === filter_var($url, FILTER_VALIDATE_URL)) {
throw new \Exception(sprintf('Url "%s" is not valid.', $url));
throw new \InvalidArgumentException(sprintf('Url "%s" is not valid.', $url));
}

$url = filter_var($url, FILTER_SANITIZE_URL);

if (false === $this->isUrlAllowed($url)) {
throw new \Exception(sprintf('Url "%s" is not allowed to be parsed.', $url));
throw new \InvalidArgumentException(sprintf('Url "%s" is not allowed to be parsed.', $url));
}

return $url;
Expand Down Expand Up @@ -565,7 +562,7 @@ private function getMimeActionInfo(array $headers)
private function handleMimeAction($mimeInfo, $effectiveUrl, $response = [])
{
if (!isset($mimeInfo['action']) || !\in_array($mimeInfo['action'], ['link', 'exclude'], true)) {
return;
return null;
}

$body = isset($response['body']) ? $response['body'] : '';
Expand Down Expand Up @@ -735,17 +732,12 @@ private function getSinglePage($html, $url)
/**
* Make an absolute url from an element.
*
* @param string $base The base url
* @param \DOMNode $elem Element on which we'll retrieve the attribute
* @param string $base The base url
* @param \DOMElement $elem Element on which we'll retrieve the attribute
*/
private function makeAbsolute($base, \DOMNode $elem)
private function makeAbsolute($base, \DOMElement $elem)
{
$base = new \SimplePie_IRI($base);

// remove '//' in URL path (causes URLs not to resolve properly)
if (isset($base->ipath)) {
$base->ipath = str_replace('//', '/', $base->ipath);
}
$base = trim($base, '/');

foreach (['a' => 'href', 'img' => 'src', 'iframe' => 'src'] as $tag => $attr) {
$elems = $elem->getElementsByTagName($tag);
Expand All @@ -764,11 +756,11 @@ private function makeAbsolute($base, \DOMNode $elem)
/**
* Make an attribute absolute (href or src).
*
* @param string $base The base url
* @param \DOMNode $e Element on which we'll retrieve the attribute
* @param string $attr Attribute that contains the url to absolutize
* @param string $base The base url
* @param \DOMElement $e Element on which we'll retrieve the attribute
* @param string $attr Attribute that contains the url to absolutize
*/
private function makeAbsoluteAttr($base, \DOMNode $e, $attr)
private function makeAbsoluteAttr($base, \DOMElement $e, $attr)
{
if (!$e->attributes->getNamedItem($attr)) {
return;
Expand All @@ -779,11 +771,7 @@ private function makeAbsoluteAttr($base, \DOMNode $e, $attr)
$url = trim(str_replace('%20', ' ', $e->getAttribute($attr)));
$url = str_replace(' ', '%20', $url);

if (!preg_match('!^(https?://|#)!i', $url)) {
if ($absolute = \SimplePie_IRI::absolutize($base, $url)) {
$e->setAttribute($attr, $absolute);
}
}
$e->setAttribute($attr, $this->makeAbsoluteStr($base, $url));
}

/**
Expand All @@ -805,18 +793,16 @@ private function makeAbsoluteStr($base, $url)
return $url;
}

$base = new \SimplePie_IRI($base);

// remove '//' in URL path (causes URLs not to resolve properly)
if (isset($base->ipath)) {
$base->ipath = preg_replace('!//+!', '/', $base->ipath);
}
$base = new Uri($base);
// ensure the base has no path at all (to avoid // between host & path)
$base = str_replace($base->getPath(), '', (string) $base);

if ($absolute = \SimplePie_IRI::absolutize($base, $url)) {
return $absolute->get_uri();
// in case the url has no scheme & host
if (0 === \strlen($base)) {
return false;
}

return false;
return (string) Uri::resolve(new Uri($base), $url);
}

/**
Expand Down Expand Up @@ -854,29 +840,6 @@ private function getExcerpt($text, $length = 250, $separator = ' …')
return $text;
}

/**
* Rebuild a URL string from the components array produced by parse_url().
* Useful to rebuild a URL after editing only one component (the host, for example).
*
* Every component missing from $data contributes an empty string. When present,
* the scheme is suffixed with "://", the port and password are prefixed with ":",
* the query with "?" and the fragment with "#".
*
* @param array $data Components keyed as in parse_url(): scheme, host, port, user, pass, path, query, fragment
*
* @return string The reassembled URL
*/
private function unparseUrl($data)
{
$scheme = isset($data['scheme']) ? $data['scheme'] . '://' : '';
$host = isset($data['host']) ? $data['host'] : '';
$port = isset($data['port']) ? ':' . $data['port'] : '';
$user = isset($data['user']) ? $data['user'] : '';
$pass = isset($data['pass']) ? ':' . $data['pass'] : '';
// append the "@" that closes the user-info section, only when a user or a password exists
// NOTE(review): this is a truthiness test, so a user of "0" with no password is treated as absent — confirm whether that matters
$pass = ($user || $pass) ? "$pass@" : '';
$path = isset($data['path']) ? $data['path'] : '';
$query = isset($data['query']) ? '?' . $data['query'] : '';
$fragment = isset($data['fragment']) ? '#' . $data['fragment'] : '';

// order follows the URL layout: scheme, user-info, host, port, path, query, fragment
return "$scheme$user$pass$host$port$path$query$fragment";
}

/**
* Convert string to utf8
* (uses HTTP headers and HTML to find encoding).
Expand Down
10 changes: 5 additions & 5 deletions src/SiteConfig/SiteConfig.php
Original file line number Diff line number Diff line change
Expand Up @@ -106,28 +106,28 @@ class SiteConfig
*
* @var string
*/
public $not_logged_in_xpath = false;
public $not_logged_in_xpath;

/**
* Site's login form URI, if applicable.
*
* @var string
*/
public $login_uri = false;
public $login_uri;

/**
* Name of the site's login form username field. Example: username.
*
* @var string
*/
public $login_username_field = false;
public $login_username_field;

/**
* Name of the site's login form password field. Example: password.
*
* @var string
*/
public $login_password_field = false;
public $login_password_field;

/**
* Extra fields to POST to the site's login form.
Expand All @@ -139,7 +139,7 @@ class SiteConfig
/**
* Explicitly skip getting data from JSON-LD.
*
* @var string
* @var bool
*/
public $skip_json_ld = false;

Expand Down
22 changes: 0 additions & 22 deletions tests/Extractor/HttpClientTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -173,28 +173,6 @@ public function testFetchGetWithMetaRefresh($url, $body, $metaUrl)
$this->assertSame(200, $res['status']);
}

/**
* This will force `SimplePie_IRI::absolutize` to return false because the relative url is wrong.
*
* The mocked response carries a meta refresh whose target ("::/bernama/...") is malformed.
* The assertions check that exactly one GET request was sent (the refresh was not followed)
* and that the result still describes the original url, body, content type and status.
*/
public function testFetchGetWithMetaRefreshBadBase()
{
$url = 'http://wikipedia.org/wiki/Copyright';
$body = '<html><meta HTTP-EQUIV="REFRESH" content="0; url=::/bernama/v6/newsindex.php?id=943513"></html>';

// a single queued response: any second request would have nothing to consume
$httpMockClient = new HttpMockClient();
$httpMockClient->addResponse(new Response(200, ['Content-Type' => 'text/html'], $body));

$http = new HttpClient($httpMockClient);
$res = $http->fetch($url);

// only the initial request must have been made
$this->assertCount(1, $httpMockClient->getRequests());
$this->assertEquals('GET', $httpMockClient->getRequests()[0]->getMethod());
// the returned data is the one of the first (and only) response
$this->assertSame($url, $res['effective_url']);
$this->assertSame($body, $res['body']);
$this->assertSame('text/html', $res['headers']['content-type']);
$this->assertSame(200, $res['status']);
}

public function testWith404ResponseWithResponse()
{
$httpMockClient = new HttpMockClient();
Expand Down
18 changes: 11 additions & 7 deletions tests/GrabyTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -194,8 +194,7 @@ public function dataForNotValid()
/**
* @dataProvider dataForNotValid
*
* @expectedException \Exception
* @expectedExceptionMessage is not valid.
* @expectedException \InvalidArgumentException
*/
public function testNotValidUrls($url)
{
Expand Down Expand Up @@ -624,6 +623,11 @@ public function testMultiplePageBadAbsoluteUrl()
['Content-Type' => 'text/html'],
'<html><h2 class="primary">my title</h2><div class="story">my content</div><ul><li class="next"><a href="/:/">next page</a></li></ul></html>'
));
$httpMockClient->addResponse(new Response(
200,
['Content-Type' => 'text/html'],
'<html><h2 class="primary">my title</h2><div class="story">my content</div><ul><li class="next"><a href="/:/">next page</a></li></ul></html>'
));
$httpMockClient->addResponse(new Response(
200,
['Content-Type' => 'text/html'],
Expand Down Expand Up @@ -750,7 +754,7 @@ public function dataForMakeAbsoluteAttr()
['http://example.org', '<img src=" /path/to/image.jpg" />', 'src', 'src', 'http://example.org/path/to/image.jpg'],
['http://example.org', '<a href="/lol">test</a>', 'src', 'src', ''],
['http://example.org', '<iframe src="/lol" />', 'src', 'src', 'http://example.org/lol'],
['http://example.org', '<a href="#fn-ref-23">1</a>', 'href', 'href', '#fn-ref-23'],
['http://example.org', '<a href="#fn-ref-23">1</a>', 'href', 'href', 'http://example.org#fn-ref-23'],
];
}

Expand All @@ -764,7 +768,7 @@ public function testMakeAbsoluteAttr($base, $string, $attr, $expectedAttr, $expe
$doc = new \DOMDocument();
$doc->loadXML($string);

$e = $doc->firstChild;
$e = $doc->documentElement;

$reflection = new \ReflectionClass(\get_class($graby));
$method = $reflection->getMethod('makeAbsoluteAttr');
Expand Down Expand Up @@ -796,7 +800,7 @@ public function testMakeAbsolute($base, $string, $expectedAttr, $expectedResult)
$doc = new \DOMDocument();
$doc->loadXML($string);

$e = $doc->firstChild;
$e = $doc->documentElement;

$reflection = new \ReflectionClass(\get_class($graby));
$method = $reflection->getMethod('makeAbsolute');
Expand All @@ -817,7 +821,7 @@ public function testMakeAbsoluteMultiple()
$doc = new \DOMDocument();
$doc->loadXML('<a href="/lol"><img src=" /path/to/image.jpg" /></a>');

$e = $doc->firstChild;
$e = $doc->documentElement;

$reflection = new \ReflectionClass(\get_class($graby));
$method = $reflection->getMethod('makeAbsolute');
Expand All @@ -826,7 +830,7 @@ public function testMakeAbsoluteMultiple()
$method->invokeArgs($graby, ['http://example.org', $e]);

$this->assertSame('http://example.org/lol', $e->getAttribute('href'));
$this->assertSame('http://example.org/path/to/image.jpg', $e->firstChild->getAttribute('src'));
$this->assertSame('http://example.org/path/to/image.jpg', $e->firstChild->attributes->getNamedItem('src')->nodeValue);
}

public function testContentLinksRemove()
Expand Down
4 changes: 2 additions & 2 deletions tests/fixtures/sites/blogger.test
Original file line number Diff line number Diff line change
Expand Up @@ -3296,12 +3296,12 @@ Most developers interact with databases these days through their preferred frame
<li>Implementing the <a href="http://docs.mongodb.org/meta-driver/latest/legacy/mongodb-wire-protocol/" target="_blank">MongoDB wire protocol</a></li>
</ol><br/><h4>Again, What's the Point?</h4>
Well, now we can provide you with a stable driver that really shouldn't change all that much, which means less extension upgrades.
<p>Furthermore, it should be faster. The legacy driver, which dates back five years, had unfortunate design quirks that couldn't be fully resolved without a costly rewrite (e.g. the way MongoGridFS invokes MongoCollection methods internally). Creating a brand new, no-frills<sup><a href="#note-1" id="back-note-1">[1]</a></sup>, and simple to use driver gives us a fresh starting point for the next five years.</p>
<p>Furthermore, it should be faster. The legacy driver, which dates back five years, had unfortunate design quirks that couldn't be fully resolved without a costly rewrite (e.g. the way MongoGridFS invokes MongoCollection methods internally). Creating a brand new, no-frills<sup><a href="http://bjori.blogspot.fr?_escaped_fragment_=#note-1" id="back-note-1">[1]</a></sup>, and simple to use driver gives us a fresh starting point for the next five years.</p>
<h4>MongoDB PHP Libraries</h4>
Of course, we aren't planning on leaving our users out to dry. Whether or not your framework of choice offers an amazing MongoDB abstraction layer, we do want to make it easy, simple, and natural for you to develop applications with MongoDB.
<p>To that end, we are writing a <a href="https://github.com/10gen-labs/mongo-php-library-prototype" target="_blank">PHP library</a> on top of this new extension, which will have all of the frills, bells, and whistles you might expect. It implements the "<a href="https://github.com/mongodb/specifications/blob/master/source/crud/crud.rst" target="_blank">Standard MongoDB Driver CRUD API</a>" (among others) and we'll continue roll in new features into this library as needed. And because this library will be implemented in PHP, we expect to iterate on new features much more quickly than we were able to do with the legacy driver.</p>
<p>This won't be the only library we will be writing. There are also plans to develop a library to deal with MongoDB administrative tasks (e.g. creating users, reconfiguring wiredTiger nodes, tailing oplogs) and develop tools to introspect MongoDB clusters.</p>
<h4>What do you think?</h4>
The biggest question at this point is: "what else?" What is missing? What are the other pain points you've experienced as a MongoDB developer, that the driver or a library can help with?<br/>
We would love to <a href="https://jira.mongodb.org/secure/CreateIssue.jspa?pid=12484&amp;issuetype=6">hear your feedback</a>! Check out the projects on GitHub (<a href="https://github.com/10gen-labs/mongo-php-driver-prototype">driver</a>, <a href="https://github.com/10gen-labs/mongo-php-library-prototype">library</a>), and let us know what you think.
<p><a href="#back-note-1" id="note-1"><sup>[1]</sup></a> Well.. Almost. There is one <em>frilled</em> feature I'm excited about -- we'll cover that later.<br/></p>
<p><a href="http://bjori.blogspot.fr?_escaped_fragment_=#back-note-1" id="note-1"><sup>[1]</sup></a> Well.. Almost. There is one <em>frilled</em> feature I'm excited about -- we'll cover that later.<br/></p>

0 comments on commit 05ce203

Please sign in to comment.