Mirror of https://github.com/friendica/friendica (synced 2024-12-23 16:00:16 +00:00)
Move "Network::finalUrl" to "HTTPRequest" class
This commit is contained in: parent 57587efe58, commit 60e18736b0
3 changed files with 123 additions and 124 deletions
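For callers, the move is a one-line swap of the owning class: finalUrl() keeps its name and signature, only its home changes from Friendica\Util\Network to Friendica\Network\HTTPRequest. A minimal before/after sketch of the call-site migration (the $item["plink"] assignment mirrors the Feed hunk further below; the surrounding lines are illustrative only):

// Before this commit: the canonicalizer lived on the Util\Network class.
use Friendica\Util\Network;

$item["plink"] = Network::finalUrl($item["plink"]);

// After this commit: import HTTPRequest and call the same static method there.
use Friendica\Network\HTTPRequest;

$item["plink"] = HTTPRequest::finalUrl($item["plink"]);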
@@ -21,10 +21,13 @@
namespace Friendica\Network;

use DOMDocument;
use DomXPath;
use Friendica\App;
use Friendica\Core\Config\IConfig;
use Friendica\Core\Logger;
use Friendica\Core\System;
use Friendica\DI;
use Friendica\Util\Network;
use Friendica\Util\Profiler;
use Psr\Log\LoggerInterface;
@@ -323,6 +326,124 @@ class HTTPRequest
        return $curlResponse;
    }

    /**
     * Returns the original URL of the provided URL
     *
     * This function strips tracking query params and follows redirections, either
     * through HTTP code or meta refresh tags. Stops after 10 redirections.
     *
     * @todo Remove the $fetchbody parameter that generates an extraneous HEAD request
     *
     * @see ParseUrl::getSiteinfo
     *
     * @param string $url       A user-submitted URL
     * @param int    $depth     The current redirection recursion level (internal)
     * @param bool   $fetchbody Whether to fetch the body or not after the HEAD requests
     * @return string A canonical URL
     * @throws \Friendica\Network\HTTPException\InternalServerErrorException
     */
    public static function finalUrl(string $url, int $depth = 1, bool $fetchbody = false)
    {
        $url = Network::stripTrackingQueryParams($url);

        if ($depth > 10) {
            return $url;
        }

        $url = trim($url, "'");

        $stamp1 = microtime(true);

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_HEADER, 1);
        curl_setopt($ch, CURLOPT_NOBODY, 1);
        curl_setopt($ch, CURLOPT_TIMEOUT, 10);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_USERAGENT, DI::httpRequest()->getUserAgent());

        curl_exec($ch);
        $curl_info = @curl_getinfo($ch);
        $http_code = $curl_info['http_code'];
        curl_close($ch);

        DI::profiler()->saveTimestamp($stamp1, "network", System::callstack());

        if ($http_code == 0) {
            return $url;
        }

        if (in_array($http_code, ['301', '302'])) {
            if (!empty($curl_info['redirect_url'])) {
                return self::finalUrl($curl_info['redirect_url'], ++$depth, $fetchbody);
            } elseif (!empty($curl_info['location'])) {
                return self::finalUrl($curl_info['location'], ++$depth, $fetchbody);
            }
        }

        // Check for redirects in the meta elements of the body if there are no redirects in the header.
        if (!$fetchbody) {
            return self::finalUrl($url, ++$depth, true);
        }

        // if the file is too large then exit
        if ($curl_info["download_content_length"] > 1000000) {
            return $url;
        }

        // if it isn't an HTML file then exit
        if (!empty($curl_info["content_type"]) && !strstr(strtolower($curl_info["content_type"]), "html")) {
            return $url;
        }

        $stamp1 = microtime(true);

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_HEADER, 0);
        curl_setopt($ch, CURLOPT_NOBODY, 0);
        curl_setopt($ch, CURLOPT_TIMEOUT, 10);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_USERAGENT, DI::httpRequest()->getUserAgent());

        $body = curl_exec($ch);
        curl_close($ch);

        DI::profiler()->saveTimestamp($stamp1, "network", System::callstack());

        if (trim($body) == "") {
            return $url;
        }

        // Check for redirect in meta elements
        $doc = new DOMDocument();
        @$doc->loadHTML($body);

        $xpath = new DomXPath($doc);

        $list = $xpath->query("//meta[@content]");
        foreach ($list as $node) {
            $attr = [];
            if ($node->attributes->length) {
                foreach ($node->attributes as $attribute) {
                    $attr[$attribute->name] = $attribute->value;
                }
            }

            if (@$attr["http-equiv"] == 'refresh') {
                $path = $attr["content"];
                $pathinfo = explode(";", $path);
                foreach ($pathinfo as $value) {
                    if (substr(strtolower($value), 0, 4) == "url=") {
                        return self::finalUrl(substr($value, 4), ++$depth);
                    }
                }
            }
        }

        return $url;
    }

    /**
     * Curl wrapper
     *
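The hunk above makes HTTPRequest the new home of the URL canonicalizer described in the docblock: strip tracking query parameters, follow HTTP 301/302 redirects, then fall back to <meta http-equiv="refresh"> tags, stopping after 10 hops. A minimal usage sketch; the input URL and variable names are illustrative and not taken from the codebase:

use Friendica\Network\HTTPRequest;

// Hypothetical user-submitted link carrying tracking parameters.
$submitted = 'https://example.com/article?utm_source=newsletter&utm_medium=email';

// Strips the tracking parameters and resolves redirects (HTTP or meta refresh,
// at most 10 hops) before returning the canonical URL.
$canonical = HTTPRequest::finalUrl($submitted);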
@@ -35,6 +35,7 @@ use Friendica\Model\Contact;
use Friendica\Model\Item;
use Friendica\Model\Tag;
use Friendica\Model\User;
use Friendica\Network\HTTPRequest;
use Friendica\Util\DateTimeFormat;
use Friendica\Util\Network;
use Friendica\Util\ParseUrl;

@@ -350,7 +351,7 @@ class Feed
    $orig_plink = $item["plink"];

-   $item["plink"] = Network::finalUrl($item["plink"]);
+   $item["plink"] = HTTPRequest::finalUrl($item["plink"]);

    $item["parent-uri"] = $item["uri"];
@@ -21,11 +21,8 @@
namespace Friendica\Util;

use DOMDocument;
use DomXPath;
use Friendica\Core\Hook;
use Friendica\Core\Logger;
use Friendica\Core\System;
use Friendica\DI;

class Network
@@ -314,126 +311,6 @@ class Network
        return self::unparseURL($parts);
    }

    /**
     * Returns the original URL of the provided URL
     *
     * This function strips tracking query params and follows redirections, either
     * through HTTP code or meta refresh tags. Stops after 10 redirections.
     *
     * @todo Remove the $fetchbody parameter that generates an extraneous HEAD request
     *
     * @see ParseUrl::getSiteinfo
     *
     * @param string $url       A user-submitted URL
     * @param int    $depth     The current redirection recursion level (internal)
     * @param bool   $fetchbody Wether to fetch the body or not after the HEAD requests
     * @return string A canonical URL
     * @throws \Friendica\Network\HTTPException\InternalServerErrorException
     */
    public static function finalUrl(string $url, int $depth = 1, bool $fetchbody = false)
    {
        $a = DI::app();

        $url = self::stripTrackingQueryParams($url);

        if ($depth > 10) {
            return $url;
        }

        $url = trim($url, "'");

        $stamp1 = microtime(true);

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_HEADER, 1);
        curl_setopt($ch, CURLOPT_NOBODY, 1);
        curl_setopt($ch, CURLOPT_TIMEOUT, 10);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_USERAGENT, DI::httpRequest()->getUserAgent());

        curl_exec($ch);
        $curl_info = @curl_getinfo($ch);
        $http_code = $curl_info['http_code'];
        curl_close($ch);

        DI::profiler()->saveTimestamp($stamp1, "network", System::callstack());

        if ($http_code == 0) {
            return $url;
        }

        if (in_array($http_code, ['301', '302'])) {
            if (!empty($curl_info['redirect_url'])) {
                return self::finalUrl($curl_info['redirect_url'], ++$depth, $fetchbody);
            } elseif (!empty($curl_info['location'])) {
                return self::finalUrl($curl_info['location'], ++$depth, $fetchbody);
            }
        }

        // Check for redirects in the meta elements of the body if there are no redirects in the header.
        if (!$fetchbody) {
            return(self::finalUrl($url, ++$depth, true));
        }

        // if the file is too large then exit
        if ($curl_info["download_content_length"] > 1000000) {
            return $url;
        }

        // if it isn't a HTML file then exit
        if (!empty($curl_info["content_type"]) && !strstr(strtolower($curl_info["content_type"]), "html")) {
            return $url;
        }

        $stamp1 = microtime(true);

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_HEADER, 0);
        curl_setopt($ch, CURLOPT_NOBODY, 0);
        curl_setopt($ch, CURLOPT_TIMEOUT, 10);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_USERAGENT, DI::httpRequest()->getUserAgent());

        $body = curl_exec($ch);
        curl_close($ch);

        DI::profiler()->saveTimestamp($stamp1, "network", System::callstack());

        if (trim($body) == "") {
            return $url;
        }

        // Check for redirect in meta elements
        $doc = new DOMDocument();
        @$doc->loadHTML($body);

        $xpath = new DomXPath($doc);

        $list = $xpath->query("//meta[@content]");
        foreach ($list as $node) {
            $attr = [];
            if ($node->attributes->length) {
                foreach ($node->attributes as $attribute) {
                    $attr[$attribute->name] = $attribute->value;
                }
            }

            if (@$attr["http-equiv"] == 'refresh') {
                $path = $attr["content"];
                $pathinfo = explode(";", $path);
                foreach ($pathinfo as $value) {
                    if (substr(strtolower($value), 0, 4) == "url=") {
                        return self::finalUrl(substr($value, 4), ++$depth);
                    }
                }
            }
        }

        return $url;
    }

    /**
     * Find the matching part between two url
     *