Request JSON for oEmbed requests (and ignore XML-only providers). (#10759)

This adds the format to the request arguments / URL to
ensure that JSON data is returned (which is all that
Synapse supports).

This also adds error checking / filtering when loading the
configuration file, so that XML-only providers are ignored.
This commit is contained in:
Patrick Cloke 2021-09-08 07:17:52 -04:00 committed by GitHub
parent aacdce8fc0
commit 580a15e039
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 98 additions and 8 deletions

View file

@ -0,0 +1 @@
Allow configuration of the oEmbed URLs used for URL previews.

View file

@ -13,7 +13,7 @@
# limitations under the License. # limitations under the License.
import json import json
import re import re
from typing import Any, Dict, Iterable, List, Pattern from typing import Any, Dict, Iterable, List, Optional, Pattern
from urllib import parse as urlparse from urllib import parse as urlparse
import attr import attr
@ -31,6 +31,8 @@ class OEmbedEndpointConfig:
api_endpoint: str api_endpoint: str
# The patterns to match. # The patterns to match.
url_patterns: List[Pattern] url_patterns: List[Pattern]
# The supported formats.
formats: Optional[List[str]]
class OembedConfig(Config): class OembedConfig(Config):
@ -93,11 +95,22 @@ class OembedConfig(Config):
# might have multiple patterns to match. # might have multiple patterns to match.
for endpoint in provider["endpoints"]: for endpoint in provider["endpoints"]:
api_endpoint = endpoint["url"] api_endpoint = endpoint["url"]
# The API endpoint must be an HTTP(S) URL.
results = urlparse.urlparse(api_endpoint)
if results.scheme not in {"http", "https"}:
raise ConfigError(
f"Unsupported oEmbed scheme ({results.scheme}) for endpoint {api_endpoint}",
config_path,
)
patterns = [ patterns = [
self._glob_to_pattern(glob, config_path) self._glob_to_pattern(glob, config_path)
for glob in endpoint["schemes"] for glob in endpoint["schemes"]
] ]
yield OEmbedEndpointConfig(api_endpoint, patterns) yield OEmbedEndpointConfig(
api_endpoint, patterns, endpoint.get("formats")
)
def _glob_to_pattern(self, glob: str, config_path: Iterable[str]) -> Pattern: def _glob_to_pattern(self, glob: str, config_path: Iterable[str]) -> Pattern:
""" """
@ -114,9 +127,12 @@ class OembedConfig(Config):
""" """
results = urlparse.urlparse(glob) results = urlparse.urlparse(glob)
# Ensure the scheme does not have wildcards (and is a sane scheme). # The scheme must be HTTP(S) (and cannot contain wildcards).
if results.scheme not in {"http", "https"}: if results.scheme not in {"http", "https"}:
raise ConfigError(f"Insecure oEmbed scheme: {results.scheme}", config_path) raise ConfigError(
f"Unsupported oEmbed scheme ({results.scheme}) for pattern: {glob}",
config_path,
)
pattern = urlparse.urlunparse( pattern = urlparse.urlunparse(
[ [

View file

@ -49,8 +49,24 @@ class OEmbedProvider:
def __init__(self, hs: "HomeServer", client: SimpleHttpClient): def __init__(self, hs: "HomeServer", client: SimpleHttpClient):
self._oembed_patterns = {} self._oembed_patterns = {}
for oembed_endpoint in hs.config.oembed.oembed_patterns: for oembed_endpoint in hs.config.oembed.oembed_patterns:
api_endpoint = oembed_endpoint.api_endpoint
# Only JSON is supported at the moment. This could be declared in
# the formats field. Otherwise, if the endpoint ends in .xml assume
# it doesn't support JSON.
if (
oembed_endpoint.formats is not None
and "json" not in oembed_endpoint.formats
) or api_endpoint.endswith(".xml"):
logger.info(
"Ignoring oEmbed endpoint due to not supporting JSON: %s",
api_endpoint,
)
continue
# Iterate through each URL pattern and point it to the endpoint.
for pattern in oembed_endpoint.url_patterns: for pattern in oembed_endpoint.url_patterns:
self._oembed_patterns[pattern] = oembed_endpoint.api_endpoint self._oembed_patterns[pattern] = api_endpoint
self._client = client self._client = client
def get_oembed_url(self, url: str) -> Optional[str]: def get_oembed_url(self, url: str) -> Optional[str]:
@ -86,11 +102,15 @@ class OEmbedProvider:
""" """
try: try:
logger.debug("Trying to get oEmbed content for url '%s'", url) logger.debug("Trying to get oEmbed content for url '%s'", url)
# Note that only the JSON format is supported, some endpoints want
# this in the URL, others want it as an argument.
endpoint = endpoint.replace("{format}", "json")
result = await self._client.get_json( result = await self._client.get_json(
endpoint, endpoint,
# TODO Specify max height / width. # TODO Specify max height / width.
# Note that only the JSON format is supported. args={"url": url, "format": "json"},
args={"url": url},
) )
# Ensure there's a version of 1.0. # Ensure there's a version of 1.0.

View file

@ -92,7 +92,15 @@ class URLPreviewTests(unittest.HomeserverTestCase):
url_patterns=[ url_patterns=[
re.compile(r"http://twitter\.com/.+/status/.+"), re.compile(r"http://twitter\.com/.+/status/.+"),
], ],
) formats=None,
),
OEmbedEndpointConfig(
api_endpoint="http://www.hulu.com/api/oembed.{format}",
url_patterns=[
re.compile(r"http://www\.hulu\.com/watch/.+"),
],
formats=["json"],
),
] ]
return hs return hs
@ -656,3 +664,48 @@ class URLPreviewTests(unittest.HomeserverTestCase):
channel.json_body, channel.json_body,
{"og:title": None, "og:description": "Content Preview"}, {"og:title": None, "og:description": "Content Preview"},
) )
def test_oembed_format(self):
    """Check that a ``{format}`` placeholder in an oEmbed endpoint URL becomes ``json``.

    The request sent to the provider must contain both the substituted
    path (``/api/oembed.json``) and a ``format=json`` query argument, and
    the preview must be built from the JSON the provider returns.
    """
    # Register a lookup entry for the provider's hostname.
    self.lookups["www.hulu.com"] = [(IPv4Address, "10.1.2.3")]

    # The oEmbed document the fake provider answers with.
    oembed_body = json.dumps(
        {
            "version": "1.0",
            "type": "rich",
            "html": "<div>Content Preview</div>",
        }
    ).encode("utf-8")

    # Start the preview request without waiting for it to complete, so the
    # outgoing connection can be answered by hand below.
    preview_channel = self.make_request(
        "GET",
        "preview_url?url=http://www.hulu.com/watch/12345",
        shorthand=False,
        await_result=False,
    )
    self.pump()

    # Wire the first recorded outgoing TCP client to an accumulating fake
    # server so the bytes sent to the provider can be inspected.
    outgoing = self.reactor.tcpClients[0][2].buildProtocol(None)
    fake_server = AccumulatingProtocol()
    fake_server.makeConnection(FakeTransport(outgoing, self.reactor))
    outgoing.makeConnection(FakeTransport(fake_server, self.reactor))

    # Feed the HTTP response (headers followed by the JSON body) to the client.
    response_head = (
        b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
        b'Content-Type: application/json; charset="utf8"\r\n\r\n'
    ) % (len(oembed_body),)
    outgoing.dataReceived(response_head + oembed_body)
    self.pump()

    sent_to_provider = fake_server.data

    # The {format} should have been turned into json.
    self.assertIn(b"/api/oembed.json", sent_to_provider)
    # A URL parameter of format=json should be provided.
    self.assertIn(b"format=json", sent_to_provider)

    self.assertEqual(preview_channel.code, 200)
    expected_preview = {"og:title": None, "og:description": "Content Preview"}
    self.assertEqual(preview_channel.json_body, expected_preview)