diff --git a/docs/url_previews.rst b/docs/url_previews.rst index 1dc6ee0c45..634d9d907f 100644 --- a/docs/url_previews.rst +++ b/docs/url_previews.rst @@ -56,7 +56,7 @@ As a first cut, let's do #2 and have the receiver hit the API to calculate its o API --- -GET /_matrix/media/r0/previewUrl?url=http://wherever.com +GET /_matrix/media/r0/preview_url?url=http://wherever.com 200 OK { "og:type" : "article" diff --git a/synapse/http/client.py b/synapse/http/client.py index a735300db0..cfdea91b57 100644 --- a/synapse/http/client.py +++ b/synapse/http/client.py @@ -26,6 +26,7 @@ from twisted.web.client import ( Agent, readBody, FileBodyProducer, PartialDownloadError, ) from twisted.web.http_headers import Headers +from twisted.web._newclient import ResponseDone from StringIO import StringIO @@ -266,7 +267,7 @@ class SimpleHttpClient(object): headers = dict(response.headers.getAllRawHeaders()) - if headers['Content-Length'] > max_size: + if 'Content-Length' in headers and headers['Content-Length'] > max_size: logger.warn("Requested URL is too large > %r bytes" % (self.max_size,)) # XXX: do we want to explicitly drop the connection here somehow? if so, how? raise # what should we be raising here? diff --git a/synapse/rest/media/v1/base_resource.py b/synapse/rest/media/v1/base_resource.py index 58ef91c0b8..2b1938dc8e 100644 --- a/synapse/rest/media/v1/base_resource.py +++ b/synapse/rest/media/v1/base_resource.py @@ -72,6 +72,7 @@ class BaseMediaResource(Resource): self.store = hs.get_datastore() self.max_upload_size = hs.config.max_upload_size self.max_image_pixels = hs.config.max_image_pixels + self.max_spider_size = hs.config.max_spider_size self.filepaths = filepaths self.version_string = hs.version_string self.downloads = {} diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 5c8e20e23c..408b103367 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -12,26 +12,28 @@ # See the License for the specific language governing permissions and # limitations under the License. +from .base_resource import BaseMediaResource +from synapse.api.errors import Codes from twisted.web.resource import Resource +from twisted.web.server import NOT_DONE_YET from twisted.internet import defer from lxml import html +from synapse.util.stringutils import random_string from synapse.http.client import SimpleHttpClient -from synapse.http.server import request_handler, respond_with_json_bytes +from synapse.http.server import request_handler, respond_with_json, respond_with_json_bytes + +import os import ujson as json import logging logger = logging.getLogger(__name__) -class PreviewUrlResource(Resource): +class PreviewUrlResource(BaseMediaResource): isLeaf = True def __init__(self, hs, filepaths): - Resource.__init__(self) + BaseMediaResource.__init__(self, hs, filepaths) self.client = SimpleHttpClient(hs) - self.filepaths = filepaths - self.max_spider_size = hs.config.max_spider_size - self.server_name = hs.hostname - self.clock = hs.get_clock() def render_GET(self, request): self._async_render_GET(request) @@ -40,57 +42,76 @@ class PreviewUrlResource(Resource): @request_handler @defer.inlineCallbacks def _async_render_GET(self, request): - url = request.args.get("url") try: + # XXX: if get_user_by_req fails, what should we do in an async render? + requester = yield self.auth.get_user_by_req(request) + url = request.args.get("url")[0] + # TODO: keep track of whether there's an ongoing request for this preview # and block and return their details if there is one. - media_info = self._download_url(url) + media_info = yield self._download_url(url, requester.user) + + logger.warn("got media_info of '%s'" % media_info) + + if self._is_media(media_info['media_type']): + dims = yield self._generate_local_thumbnails( + media_info.filesystem_id, media_info + ) + + og = { + "og:description" : media_info.download_name, + "og:image" : "mxc://%s/%s" % (self.server_name, media_info.filesystem_id), + "og:image:type" : media_info['media_type'], + "og:image:width" : dims.width, + "og:image:height" : dims.height, + } + + # define our OG response for this media + elif self._is_html(media_info['media_type']): + tree = html.parse(media_info['filename']) + logger.warn(html.tostring(tree)) + + # suck it up into lxml and define our OG response. + # if we see any URLs in the OG response, then spider them + # (although the client could choose to do this by asking for previews of those URLs to avoid DoSing the server) + + # "og:type" : "article" + # "og:url" : "https://twitter.com/matrixdotorg/status/684074366691356672" + # "og:title" : "Matrix on Twitter" + # "og:image" : "https://pbs.twimg.com/profile_images/500400952029888512/yI0qtFi7_400x400.png" + # "og:description" : "Synapse 0.12 is out! Lots of polishing, performance & bugfixes: /sync API, /r0 prefix, fulltext search, 3PID invites https://t.co/5alhXLLEGP" + # "og:site_name" : "Twitter" + + og = {} + for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"): + og[tag.attrib['property']] = tag.attrib['content'] + + # TODO: store our OG details in a cache (and expire them when stale) + # TODO: delete the content to stop diskfilling, as we only ever cared about its OG + else: + logger.warn("Failed to find any OG data in %s", url) + og = {} + + respond_with_json_bytes(request, 200, json.dumps(og), send_cors=True) except: - os.remove(fname) + # XXX: if we don't explicitly respond here, the request never returns. + # isn't this what server.py's wrapper is meant to be doing for us? + respond_with_json( + request, + 500, + { + "error": "Internal server error", + "errcode": Codes.UNKNOWN, + }, + send_cors=True + ) raise - if self._is_media(media_type): - dims = yield self._generate_local_thumbnails( - media_info.filesystem_id, media_info - ) - - og = { - "og:description" : media_info.download_name, - "og:image" : "mxc://%s/%s" % (self.server_name, media_info.filesystem_id), - "og:image:type" : media_info.media_type, - "og:image:width" : dims.width, - "og:image:height" : dims.height, - } - - # define our OG response for this media - elif self._is_html(media_type): - tree = html.parse(media_info.filename) - - # suck it up into lxml and define our OG response. - # if we see any URLs in the OG response, then spider them - # (although the client could choose to do this by asking for previews of those URLs to avoid DoSing the server) - - # "og:type" : "article" - # "og:url" : "https://twitter.com/matrixdotorg/status/684074366691356672" - # "og:title" : "Matrix on Twitter" - # "og:image" : "https://pbs.twimg.com/profile_images/500400952029888512/yI0qtFi7_400x400.png" - # "og:description" : "Synapse 0.12 is out! Lots of polishing, performance & bugfixes: /sync API, /r0 prefix, fulltext search, 3PID invites https://t.co/5alhXLLEGP" - # "og:site_name" : "Twitter" - - og = {} - for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"): - og[tag.attrib['property']] = tag.attrib['content'] - - # TODO: store our OG details in a cache (and expire them when stale) - # TODO: delete the content to stop diskfilling, as we only ever cared about its OG - - respond_with_json_bytes(request, 200, json.dumps(og), send_cors=True) - - def _download_url(url): - requester = yield self.auth.get_user_by_req(request) + @defer.inlineCallbacks + def _download_url(self, url, user): # XXX: horrible duplication with base_resource's _download_remote_file() file_id = random_string(24) @@ -99,6 +120,7 @@ class PreviewUrlResource(Resource): try: with open(fname, "wb") as f: + logger.warn("Trying to get url '%s'" % url) length, headers = yield self.client.get_file( url, output_stream=f, max_size=self.max_spider_size, ) @@ -137,14 +159,14 @@ class PreviewUrlResource(Resource): time_now_ms=self.clock.time_msec(), upload_name=download_name, media_length=length, - user_id=requester.user, + user_id=user, ) except: os.remove(fname) raise - yield ({ + defer.returnValue({ "media_type": media_type, "media_length": length, "download_name": download_name, @@ -152,14 +174,13 @@ class PreviewUrlResource(Resource): "filesystem_id": file_id, "filename": fname, }) - return - def _is_media(content_type): + def _is_media(self, content_type): if content_type.lower().startswith("image/"): return True - def _is_html(content_type): + def _is_html(self, content_type): content_type = content_type.lower() - if (content_type == "text/html" or + if (content_type.startswith("text/html") or content_type.startswith("application/xhtml")): return True