From 683e564815be5f7852c417cbab06876db6122401 Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Thu, 31 Mar 2016 23:52:58 +0100 Subject: [PATCH] handle spidered relative images correctly --- synapse/http/client.py | 2 +- synapse/rest/media/v1/preview_url_resource.py | 14 ++++++++++++-- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/synapse/http/client.py b/synapse/http/client.py index 219b734268..1b6f7cb795 100644 --- a/synapse/http/client.py +++ b/synapse/http/client.py @@ -282,7 +282,7 @@ class SimpleHttpClient(object): logger.exception("Failed to download body") raise - defer.returnValue((length, headers)) + defer.returnValue((length, headers, response.request.absoluteURI)) # XXX: FIXME: This is horribly copy-pasted from matrixfederationclient. diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 04d02ee427..bae3905a43 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -18,6 +18,7 @@ from twisted.web.resource import Resource from twisted.web.server import NOT_DONE_YET from twisted.internet import defer from lxml import html +from urlparse import urlparse, urlunparse from synapse.util.stringutils import random_string from synapse.http.client import SpiderHttpClient from synapse.http.server import request_handler, respond_with_json, respond_with_json_bytes @@ -125,7 +126,14 @@ class PreviewUrlResource(BaseMediaResource): images = big_images if big_images else images if images: - og['og:image'] = images[0].attrib['src'] + base = list(urlparse(media_info['uri'])) + src = list(urlparse(images[0].attrib['src'])) + if not src[0] and not src[1]: + src[0] = base[0] + src[1] = base[1] + if not src[2].startswith('/'): + src[2] = re.sub(r'/[^/]+$', '/', base[2]) + src[2] + og['og:image'] = urlunparse(src) text_nodes = tree.xpath("//h1/text() | //h2/text() | //h3/text() | //p/text() | //div/text() | //span/text() | //a/text()") # text_nodes = tree.xpath("//h1/text() | //h2/text() | //h3/text() | //p/text() | //div/text()") @@ -140,6 +148,7 @@ class PreviewUrlResource(BaseMediaResource): text = text.strip()[:1024] og['og:description'] = text if text else None + # TODO: extract a favicon? # TODO: turn any OG media URLs into mxc URLs to capture and thumbnail them too # TODO: store our OG details in a cache (and expire them when stale) # TODO: delete the content to stop diskfilling, as we only ever cared about its OG @@ -180,7 +189,7 @@ class PreviewUrlResource(BaseMediaResource): try: with open(fname, "wb") as f: logger.debug("Trying to get url '%s'" % url) - length, headers = yield self.client.get_file( + length, headers, uri = yield self.client.get_file( url, output_stream=f, max_size=self.max_spider_size, ) # FIXME: handle 404s sanely - don't spider an error page @@ -233,6 +242,7 @@ class PreviewUrlResource(BaseMediaResource): "created_ts": time_now_ms, "filesystem_id": file_id, "filename": fname, + "uri": uri, }) def _is_media(self, content_type):