mirror of
https://github.com/friendica/friendica
synced 2025-01-05 20:44:43 +00:00
Add implementation of HTTP Media Type
- Add charset extraction from DOMDocument - TESTS!
This commit is contained in:
parent
f4b5d22396
commit
5e2b655b43
4 changed files with 1671 additions and 0 deletions
|
@ -23,6 +23,7 @@ namespace Friendica\Content\Text;
|
||||||
|
|
||||||
use DOMDocument;
|
use DOMDocument;
|
||||||
use DOMXPath;
|
use DOMXPath;
|
||||||
|
use Friendica\Protocol\HTTP\MediaType;
|
||||||
use Friendica\Content\Widget\ContactBlock;
|
use Friendica\Content\Widget\ContactBlock;
|
||||||
use Friendica\Core\Hook;
|
use Friendica\Core\Hook;
|
||||||
use Friendica\Core\Renderer;
|
use Friendica\Core\Renderer;
|
||||||
|
@ -1055,4 +1056,30 @@ class HTML
|
||||||
|
|
||||||
return $result !== false && $result->length > 0;
|
return $result !== false && $result->length > 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Extracts the character encoding declared in the <meta> tags of a HTML document.
 *
 * Two declarations are looked up, in this order:
 * 1. <meta charset="...">
 * 2. <meta http-equiv="content-type" content="...; charset=..."> (attribute value matched ignoring case)
 *
 * @param DOMDocument $doc
 * @return string|null Lowercase charset, null if no declaration was found or if the content-type
 *                     declaration couldn't be parsed as a media type
 */
public static function extractCharset(DOMDocument $doc): ?string
{
    $xpath = new DOMXPath($doc);

    // Surrounding whitespace in the attribute value isn't part of the encoding label
    $charset = trim((string)$xpath->evaluate("string(//meta[@charset]/@charset)"));
    if ($charset !== '') {
        return strtolower($charset);
    }

    try {
        // This expression looks for a meta tag with the http-equiv attribute set to "content-type" ignoring case
        // whose content attribute contains a "charset" string and returns its value
        $expression = "string(//meta[@http-equiv][translate(@http-equiv, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz') = 'content-type'][contains(translate(@content, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'charset')]/@content)";
        $mediaType = MediaType::fromContentType((string)$xpath->evaluate($expression));

        // The parameters are read through the magic getter into a local array before being probed:
        // calling isset() directly on the magic property would depend on MediaType implementing __isset(),
        // reading through __get() doesn't.
        $parameters = $mediaType->parameters;
        if (isset($parameters['charset'])) {
            return strtolower($parameters['charset']);
        }
    } catch (\InvalidArgumentException $e) {
        // Best effort: a missing or malformed content attribute is treated as "no charset declared"
    }

    return null;
}
|
||||||
}
|
}
|
||||||
|
|
237
src/Protocol/HTTP/MediaType.php
Normal file
237
src/Protocol/HTTP/MediaType.php
Normal file
|
@ -0,0 +1,237 @@
|
||||||
|
<?php
|
||||||
|
/**
|
||||||
|
* @copyright Copyright (C) 2010-2023, the Friendica project
|
||||||
|
*
|
||||||
|
* @license GNU AGPL version 3 or any later version
|
||||||
|
*
|
||||||
|
* This program is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU Affero General Public License as
|
||||||
|
* published by the Free Software Foundation, either version 3 of the
|
||||||
|
* License, or (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU Affero General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Affero General Public License
|
||||||
|
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
namespace Friendica\Protocol\HTTP;
|
||||||
|
|
||||||
|
/**
 * Immutable value object for a HTTP media type (type "/" subtype plus optional parameters).
 *
 * The three fields are private and exposed read-only through __get()/__isset().
 *
 * @see https://httpwg.org/specs/rfc9110.html#media.type
 *
 * @property-read string   $type
 * @property-read string   $subType
 * @property-read string[] $parameters
 */
final class MediaType
{
    const DQUOTE = '"';
    const DIGIT  = '0-9';
    const ALPHA  = 'a-zA-Z';

    // The constants below are regular expression character class fragments (without the enclosing brackets)

    // @see https://www.charset.org/charsets/us-ascii
    const VCHAR = "\\x21-\\x7E";

    const SYMBOL_NO_DELIM = "!#$%&'*+-.^_`|~";

    const OBSTEXT = "\\x80-\\xFF";

    const QDTEXT = "\t \\x21\\x23-\\x5B\\x5D-\\x7E" . self::OBSTEXT;

    /**
     * @var string
     */
    private $type;

    /**
     * @var string
     */
    private $subType;

    /**
     * @var string[] Unquoted parameter values indexed by parameter name
     */
    private $parameters;

    /**
     * @param string   $type       Must be a valid token
     * @param string   $subType    Must be a valid token
     * @param string[] $parameters Keys must be valid tokens, values must be either valid tokens or strings that
     *                             can be represented as a quoted-string (provided unquoted and unescaped)
     * @throws \InvalidArgumentException If any of the provided values is invalid
     */
    public function __construct(string $type, string $subType, array $parameters = [])
    {
        if (!self::isToken($type)) {
            throw new \InvalidArgumentException("Type isn't a valid token: " . $type);
        }

        if (!self::isToken($subType)) {
            throw new \InvalidArgumentException("Subtype isn't a valid token: " . $subType);
        }

        foreach ($parameters as $key => $value) {
            // PHP turns numeric string array keys into integers, hence the cast
            if (!self::isToken((string)$key)) {
                throw new \InvalidArgumentException("Parameter key isn't a valid token: " . $key);
            }

            if (!self::isToken($value) && !self::isQuotableString($value)) {
                throw new \InvalidArgumentException("Parameter value isn't a valid token or a quotable string: " . $value);
            }
        }

        $this->type       = $type;
        $this->subType    = $subType;
        $this->parameters = $parameters;
    }

    /**
     * Read-only access to the private fields.
     *
     * @param string $name
     * @return string|string[]
     * @throws \InvalidArgumentException If the property doesn't exist
     */
    public function __get(string $name)
    {
        if (!$this->__isset($name)) {
            throw new \InvalidArgumentException('Unknown property ' . $name);
        }

        return $this->$name;
    }

    /**
     * Required for isset()/empty() to work from the outside on the fields exposed by __get(), including on
     * nested keys like isset($mediaType->parameters['charset']). Without it, PHP reports them as unset.
     *
     * @param string $name
     * @return bool
     */
    public function __isset(string $name): bool
    {
        return in_array($name, ['type', 'subType', 'parameters'], true);
    }

    /**
     * Parses the value of a Content-Type header (or any string following the same syntax).
     *
     * Parameter names are lowercased, parameter values are kept as is after removing the quoting if any.
     * Empty parameter segments (trailing or repeated ";") are ignored.
     *
     * @param string $contentType
     * @return self
     * @throws \InvalidArgumentException If the provided string can't be parsed as a media type
     */
    public static function fromContentType(string $contentType): self
    {
        if (trim($contentType) === '') {
            throw new \InvalidArgumentException('Provided string is empty');
        }

        // A quoted-string value can legitimately contain ";" so a plain explode() isn't enough
        $parts = self::splitOutsideQuotes($contentType, ';');

        $mimeTypeParts = explode('/', trim(array_shift($parts)));
        if (count($mimeTypeParts) !== 2) {
            throw new \InvalidArgumentException('Provided string doesn\'t look like a MIME type: ' . $contentType);
        }

        list($type, $subType) = $mimeTypeParts;

        $parameters = [];
        foreach ($parts as $parameterString) {
            $parameterString = trim($parameterString);
            if ($parameterString === '') {
                continue;
            }

            // Only the first "=" separates the name from the value, a quoted-string value can contain more of them.
            // An unquoted value containing "=" is rejected by the token check below.
            $parameterParts = explode('=', $parameterString, 2);
            if (count($parameterParts) < 2) {
                throw new \InvalidArgumentException('Parameter lacks a value: ' . $parameterString);
            }

            list($key, $value) = $parameterParts;

            if (self::isQuotedString($value)) {
                $value = self::extractQuotedStringValue($value);
            } elseif (!self::isToken($value)) {
                throw new \InvalidArgumentException("Parameter value isn't a valid token or a quoted string: \"" . $value . '"');
            }

            // Parameter keys are case-insensitive, values are not
            $parameters[strtolower($key)] = $value;
        }

        // Type, subtype and parameter names are validated by the constructor
        return new self($type, $subType, $parameters);
    }

    /**
     * Serializes the media type, quoting (and escaping) the parameter values that aren't valid tokens.
     *
     * @return string
     */
    public function __toString(): string
    {
        $string = $this->type . '/' . $this->subType;

        foreach ($this->parameters as $key => $value) {
            $string .= '; ' . $key . '=' . (self::isToken($value) ? $value : '"' . addcslashes($value, '"\\') . '"');
        }

        return $string;
    }

    /**
     * Splits a string on a single-character delimiter, ignoring the delimiters found inside double quotes.
     *
     * Inside double quotes, a backslash and the byte following it are copied verbatim so that an escaped double
     * quote doesn't close the quoted section. The quotes and backslashes are kept in the returned parts.
     *
     * @param string $string
     * @param string $delimiter Single character
     * @return string[] At least one element, like explode()
     */
    private static function splitOutsideQuotes(string $string, string $delimiter): array
    {
        $parts    = [];
        $current  = '';
        $inQuotes = false;
        $length   = strlen($string);

        for ($i = 0; $i < $length; $i++) {
            $char = $string[$i];

            if ($inQuotes && $char === '\\' && $i + 1 < $length) {
                $current .= $char . $string[++$i];
                continue;
            }

            if ($char === self::DQUOTE) {
                $inQuotes = !$inQuotes;
            } elseif ($char === $delimiter && !$inQuotes) {
                $parts[] = $current;
                $current = '';
                continue;
            }

            $current .= $char;
        }

        $parts[] = $current;

        return $parts;
    }

    /**
     * token = 1*tchar
     * tchar = "!" / "#" / "$" / "%" / "&" / "'" / "*"
     *       / "+" / "-" / "." / "^" / "_" / "`" / "|" / "~"
     *       / DIGIT / ALPHA
     *       ; any VCHAR, except delimiters
     *
     * @see https://httpwg.org/specs/rfc9110.html#tokens
     *
     * @param string $string
     * @return bool
     */
    private static function isToken(string $string): bool
    {
        $symbol = preg_quote(self::SYMBOL_NO_DELIM, '/');

        // The D modifier prevents "$" from matching before a trailing newline
        $pattern = '/^[' . $symbol . self::DIGIT . self::ALPHA . ']+$/D';

        return preg_match($pattern, $string) === 1;
    }

    /**
     * quoted-string = DQUOTE *( qdtext / quoted-pair ) DQUOTE
     * qdtext        = HTAB / SP / %x21 / %x23-5B / %x5D-7E / obs-text
     * quoted-pair   = "\" ( HTAB / SP / VCHAR / obs-text )
     *
     * @see https://httpwg.org/specs/rfc9110.html#quoted.strings
     *
     * @param string $string
     * @return bool
     */
    private static function isQuotedString(string $string): bool
    {
        $dquote     = self::DQUOTE;
        $qdtext     = '[' . self::QDTEXT . ']';
        $quotedPair = '\\\\[' . "\t " . self::VCHAR . self::OBSTEXT . ']';

        // The D modifier prevents "$" from matching before a trailing newline
        $pattern = '/^' . $dquote . '(?:' . $qdtext . '|' . $quotedPair . ')*' . $dquote . '$/D';

        return preg_match($pattern, $string) === 1;
    }

    /**
     * Is the string an extracted quoted string value?
     *
     * Any character allowed in a quoted-string, either as is or as part of a quoted-pair, is accepted.
     * The empty string is accepted as well.
     *
     * @param string $string
     * @return bool
     */
    private static function isQuotableString(string $string): bool
    {
        // The quoted-pair character set is a superset of qdtext so a single character class is enough.
        // The D modifier prevents "$" from matching before a trailing newline.
        $pattern = '/^[' . "\t " . self::VCHAR . self::OBSTEXT . ']*$/D';

        return preg_match($pattern, $string) === 1;
    }

    /**
     * Extracts the value from a quoted-string, removing quoted pairs
     *
     * @param string $value A string for which isQuotedString() is true
     * @return string
     */
    private static function extractQuotedStringValue(string $value): string
    {
        // Strip the enclosing double quotes
        $inner = substr($value, 1, -1);

        // Replace each quoted-pair by the character it escapes
        $unescaped = preg_replace('/\\\\([' . "\t " . self::VCHAR . self::OBSTEXT . '])/', '$1', $inner);

        return $unescaped ?? $inner;
    }
}
|
File diff suppressed because one or more lines are too long
150
tests/src/Protocol/HTTP/MediaTypeTest.php
Normal file
150
tests/src/Protocol/HTTP/MediaTypeTest.php
Normal file
|
@ -0,0 +1,150 @@
|
||||||
|
<?php
|
||||||
|
/**
|
||||||
|
* @copyright Copyright (C) 2010-2023, the Friendica project
|
||||||
|
*
|
||||||
|
* @license GNU AGPL version 3 or any later version
|
||||||
|
*
|
||||||
|
* This program is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU Affero General Public License as
|
||||||
|
* published by the Free Software Foundation, either version 3 of the
|
||||||
|
* License, or (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU Affero General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Affero General Public License
|
||||||
|
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
namespace Friendica\Test\src\Protocol\HTTP;
|
||||||
|
|
||||||
|
use Friendica\Protocol\HTTP\MediaType;
|
||||||
|
|
||||||
|
/**
 * Covers the parsing (MediaType::fromContentType), the validation and the serialization (MediaType::__toString)
 * of HTTP media types.
 *
 * The data set keys mirror the parameter names of the test method they feed.
 */
class MediaTypeTest extends \PHPUnit\Framework\TestCase
{
    /**
     * Content-Type strings that must parse, with the media type they are expected to produce
     *
     * @return array
     */
    public static function dataValid(): array
    {
        return [
            'HTML UTF-8' => [
                'expected'    => new MediaType('text', 'html', ['charset' => 'utf-8']),
                'contentType' => 'text/html; charset=utf-8',
            ],
            'HTML Northern Europe' => [
                'expected'    => new MediaType('text', 'html', ['charset' => 'ISO-8859-4']),
                'contentType' => 'text/html; charset=ISO-8859-4',
            ],
            'multipart/form-data' => [
                'expected'    => new MediaType('multipart', 'form-data', ['boundary' => '---------------------------974767299852498929531610575']),
                'contentType' => 'multipart/form-data; boundary=---------------------------974767299852498929531610575',
            ],
            'Multiple parameters' => [
                'expected'    => new MediaType('application', 'octet-stream', ['charset' => 'ISO-8859-4', 'another' => 'parameter']),
                'contentType' => 'application/octet-stream; charset=ISO-8859-4 ; another=parameter',
            ],
            'No parameters' => [
                'expected'    => new MediaType('application', 'vnd.adobe.air-application-installer-package+zip'),
                'contentType' => 'application/vnd.adobe.air-application-installer-package+zip',
            ],
            'No parameters colon' => [
                'expected'    => new MediaType('application', 'vnd.adobe.air-application-installer-package+zip'),
                'contentType' => 'application/vnd.adobe.air-application-installer-package+zip;',
            ],
            'No parameters space colon' => [
                'expected'    => new MediaType('application', 'vnd.adobe.air-application-installer-package+zip'),
                'contentType' => 'application/vnd.adobe.air-application-installer-package+zip ;',
            ],
            'No parameters space colon space' => [
                'expected'    => new MediaType('application', 'vnd.adobe.air-application-installer-package+zip'),
                'contentType' => 'application/vnd.adobe.air-application-installer-package+zip ; ',
            ],
            'Parameter quoted string' => [
                'expected'    => new MediaType('text', 'html', ['parameter' => 'Quoted string with a space and a "double-quote"']),
                'contentType' => 'text/html; parameter="Quoted string with a space and a \"double-quote\""',
            ],
        ];
    }

    /**
     * @dataProvider dataValid
     *
     * @param MediaType $expected
     * @param string    $contentType
     * @return void
     */
    public function testValid(MediaType $expected, string $contentType): void
    {
        $this->assertEquals($expected, MediaType::fromContentType($contentType));
    }

    /**
     * Content-Type strings that must be rejected
     *
     * @return array
     */
    public static function dataInvalid(): array
    {
        return [
            'no slash'                  => ['application'],
            'two slashes'               => ['application/octet/stream'],
            'parameter no value'        => ['application/octet-stream ; parameter'],
            'parameter too many values' => ['application/octet-stream ; parameter=value1=value2'],
            'type non token'            => ['appli"cation/octet-stream'],
            'subtype non token'         => ['application/octet\-stream'],
            'parameter name non token'  => ['application/octet-stream; para"meter=value'],
            'parameter value invalid'   => ['application/octet-stream; parameter="value"value'],
        ];
    }

    /**
     * @dataProvider dataInvalid
     *
     * @param string $contentType
     * @return void
     */
    public function testInvalid(string $contentType): void
    {
        $this->expectException(\InvalidArgumentException::class);

        MediaType::fromContentType($contentType);
    }

    /**
     * Media types with the string they are expected to serialize to
     *
     * @return array
     */
    public static function dataToString(): array
    {
        return [
            'HTML UTF-8' => [
                'expected'  => 'text/html; charset=utf-8',
                'mediaType' => new MediaType('text', 'html', ['charset' => 'utf-8']),
            ],
            'HTML Northern Europe' => [
                'expected'  => 'text/html; charset=ISO-8859-4',
                'mediaType' => new MediaType('text', 'html', ['charset' => 'ISO-8859-4']),
            ],
            'multipart/form-data' => [
                'expected'  => 'multipart/form-data; boundary=---------------------------974767299852498929531610575',
                'mediaType' => new MediaType('multipart', 'form-data', ['boundary' => '---------------------------974767299852498929531610575']),
            ],
            'Multiple parameters' => [
                'expected'  => 'application/octet-stream; charset=ISO-8859-4; another=parameter',
                'mediaType' => new MediaType('application', 'octet-stream', ['charset' => 'ISO-8859-4', 'another' => 'parameter']),
            ],
            'No parameters' => [
                'expected'  => 'application/vnd.adobe.air-application-installer-package+zip',
                'mediaType' => new MediaType('application', 'vnd.adobe.air-application-installer-package+zip'),
            ],
            'Parameter quoted string' => [
                'expected'  => 'text/html; parameter="Quoted string with a space and a \"double-quote\""',
                'mediaType' => new MediaType('text', 'html', ['parameter' => 'Quoted string with a space and a "double-quote"']),
            ],
        ];
    }

    /**
     * @dataProvider dataToString
     *
     * @param string    $expected
     * @param MediaType $mediaType
     * @return void
     */
    public function testToString(string $expected, MediaType $mediaType): void
    {
        // Both sides are strings, a strict comparison avoids any loose-typing surprise
        $this->assertSame($expected, $mediaType->__toString());
    }
}
|
Loading…
Reference in a new issue