diff --git a/changelog.d/11936.bugfix b/changelog.d/11936.bugfix new file mode 100644 index 0000000000..bc149f2801 --- /dev/null +++ b/changelog.d/11936.bugfix @@ -0,0 +1 @@ +Implement an allow list of content types for which we will attempt to preview a URL. This prevents Synapse from making useless longer-lived connections to streaming media servers. diff --git a/synapse/http/client.py b/synapse/http/client.py index 743a7ffcb1..d617055617 100644 --- a/synapse/http/client.py +++ b/synapse/http/client.py @@ -20,6 +20,7 @@ from typing import ( TYPE_CHECKING, Any, BinaryIO, + Callable, Dict, Iterable, List, @@ -693,12 +694,18 @@ class SimpleHttpClient: output_stream: BinaryIO, max_size: Optional[int] = None, headers: Optional[RawHeaders] = None, + is_allowed_content_type: Optional[Callable[[str], bool]] = None, ) -> Tuple[int, Dict[bytes, List[bytes]], str, int]: """GETs a file from a given URL Args: url: The URL to GET output_stream: File to write the response body to. headers: A map from header name to a list of values for that header + is_allowed_content_type: A predicate to determine whether the + content type of the file we're downloading is allowed. If set and + it evaluates to False when called with the content type, the + request will be terminated before completing the download by + raising SynapseError. Returns: A tuple of the file length, dict of the response headers, absolute URI of the response and HTTP response code. @@ -726,6 +733,17 @@ class SimpleHttpClient: HTTPStatus.BAD_GATEWAY, "Got error %d" % (response.code,), Codes.UNKNOWN ) + if is_allowed_content_type and b"Content-Type" in resp_headers: + content_type = resp_headers[b"Content-Type"][0].decode("ascii") + if not is_allowed_content_type(content_type): + raise SynapseError( + HTTPStatus.BAD_GATEWAY, + ( + "Requested file's content type not allowed for this operation: %s" + % content_type + ), + ) + # TODO: if our Content-Type is HTML or something, just read the first # N bytes into RAM rather than saving it all to disk only to read it # straight back in again diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index efd84ced8f..8d3d1e54dc 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -403,6 +403,7 @@ class PreviewUrlResource(DirectServeJsonResource): output_stream=output_stream, max_size=self.max_spider_size, headers={"Accept-Language": self.url_preview_accept_language}, + is_allowed_content_type=_is_previewable, ) except SynapseError: # Pass SynapseErrors through directly, so that the servlet @@ -761,3 +762,10 @@ def _is_html(content_type: str) -> bool: def _is_json(content_type: str) -> bool: return content_type.lower().startswith("application/json") + + +def _is_previewable(content_type: str) -> bool: + """Returns True for content types for which we will perform URL preview and False + otherwise.""" + + return _is_html(content_type) or _is_media(content_type) or _is_json(content_type) diff --git a/tests/rest/media/v1/test_url_preview.py b/tests/rest/media/v1/test_url_preview.py index 53f6186213..da2c533260 100644 --- a/tests/rest/media/v1/test_url_preview.py +++ b/tests/rest/media/v1/test_url_preview.py @@ -243,6 +243,78 @@ class URLPreviewTests(unittest.HomeserverTestCase): self.assertEqual(channel.code, 200) self.assertEqual(channel.json_body["og:title"], "\u0434\u043a\u0430") + def test_video_rejected(self): + self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")] + + end_content = b"anything" + + channel = self.make_request( + "GET", + "preview_url?url=http://matrix.org", + shorthand=False, + await_result=False, + ) + self.pump() + + client = self.reactor.tcpClients[0][2].buildProtocol(None) + server = AccumulatingProtocol() + server.makeConnection(FakeTransport(client, self.reactor)) + client.makeConnection(FakeTransport(server, self.reactor)) + client.dataReceived( + ( + b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n" + b"Content-Type: video/mp4\r\n\r\n" + ) + % (len(end_content)) + + end_content + ) + + self.pump() + self.assertEqual(channel.code, 502) + self.assertEqual( + channel.json_body, + { + "errcode": "M_UNKNOWN", + "error": "Requested file's content type not allowed for this operation: video/mp4", + }, + ) + + def test_audio_rejected(self): + self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")] + + end_content = b"anything" + + channel = self.make_request( + "GET", + "preview_url?url=http://matrix.org", + shorthand=False, + await_result=False, + ) + self.pump() + + client = self.reactor.tcpClients[0][2].buildProtocol(None) + server = AccumulatingProtocol() + server.makeConnection(FakeTransport(client, self.reactor)) + client.makeConnection(FakeTransport(server, self.reactor)) + client.dataReceived( + ( + b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n" + b"Content-Type: audio/aac\r\n\r\n" + ) + % (len(end_content)) + + end_content + ) + + self.pump() + self.assertEqual(channel.code, 502) + self.assertEqual( + channel.json_body, + { + "errcode": "M_UNKNOWN", + "error": "Requested file's content type not allowed for this operation: audio/aac", + }, + ) + def test_non_ascii_preview_content_type(self): self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]