Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Support rendering previews with data: URLs in them #11767

Merged
merged 16 commits into from
Jan 24, 2022
Merged
7 changes: 5 additions & 2 deletions synapse/rest/media/v1/preview_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,10 +323,13 @@ def _iterate_over_text(
def rebase_url(url: str, base: str) -> str:
    """
    Resolves a potentially relative `url` against an absolute `base` URL.

    For example:

        >>> rebase_url("subpage", "https://example.com/foo/")
        'https://example.com/foo/subpage'
        >>> rebase_url("/bar", "https://example.com/foo/")
        'https://example.com/bar'

    Args:
        url: The URL to rebase; it may be missing a scheme and/or hostname.
        base: An absolute URL used to fill in any missing parts of `url`.

    Returns:
        The rebased URL as a string.
    """
    base_parts = list(urlparse.urlparse(base))
    url_parts = list(urlparse.urlparse(url))
    # Add a scheme, if one does not exist.
    if not url_parts[0]:
        url_parts[0] = base_parts[0] or "http"
    # Fix up the hostname, if this is not a data URL.
    if url_parts[0] != "data" and not url_parts[1]:
        url_parts[1] = base_parts[1]
        # If the path does not start with a /, nest it under the base path.
        #
        # Note that this is deliberately only done for non-data URLs: a data
        # URL's "path" is its payload and must never be rewritten. (Previously
        # this ran unconditionally and corrupted data: URLs.)
        if not url_parts[2].startswith("/"):
            url_parts[2] = re.sub(r"/[^/]+$", "/", base_parts[2]) + url_parts[2]
    return urlparse.urlunparse(url_parts)
Expand Down
63 changes: 54 additions & 9 deletions synapse/rest/media/v1/preview_url_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import traceback
from typing import TYPE_CHECKING, BinaryIO, Iterable, Optional, Tuple
from urllib import parse as urlparse
from urllib.request import urlopen

import attr

Expand Down Expand Up @@ -417,6 +418,38 @@ async def _download_url(

return length, uri, code, media_type, download_name, expires, etag

async def _parse_data_url(
    self, url: str, output_stream: BinaryIO
) -> Tuple[int, str, int, str, Optional[str], int, Optional[str]]:
    """
    Parses a data: URL and stores the inlined content.

    Args:
        url: The data: URL to parse.
        output_stream: The stream to write the decoded payload to.

    Returns:
        A tuple of:
            Media length, URL downloaded, the HTTP response code,
            the media type, the downloaded file name, the number of
            milliseconds the result is valid for, the etag header.

    Raises:
        SynapseError: if the URL could not be parsed as a data: URL.
    """
    try:
        logger.debug("Trying to parse data url '%s'", url)
        with urlopen(url) as url_info:
            # TODO Can this be more efficient.
            output_stream.write(url_info.read())
            # urlopen parses the mime-type out of a data: URL and exposes
            # it via a synthetic headers object, so this is the mime-type
            # declared in the URL itself.
            media_type = url_info.headers.get_content_type()
    except Exception as e:
        logger.warning("Error parsing data: URL %s: %r", url, e)

        raise SynapseError(
            500,
            "Failed to parse data URL: %s"
            % (traceback.format_exception_only(sys.exc_info()[0], e),),
            Codes.UNKNOWN,
        )

    # The stream position sits at the end of the payload just written, which
    # is exactly the number of bytes stored. (The previous `seek(1)` always
    # returned 1, reporting a bogus length.)
    length = output_stream.tell()

    # Some features are not supported by data: URLs.
    download_name = None
    expires = ONE_HOUR
    etag = None

    return length, url, 200, media_type, download_name, expires, etag

async def _handle_url(self, url: str, user: UserID) -> MediaInfo:
"""
Fetches remote content and parses the headers to generate a MediaInfo.
Expand All @@ -441,15 +474,27 @@ async def _handle_url(self, url: str, user: UserID) -> MediaInfo:
file_info = FileInfo(server_name=None, file_id=file_id, url_cache=True)

with self.media_storage.store_into_file(file_info) as (f, fname, finish):
(
length,
uri,
code,
media_type,
download_name,
expires,
etag,
) = await self._download_url(url, f)
if url.startswith("data:"):
(
length,
uri,
code,
media_type,
download_name,
expires,
etag,
) = await self._parse_data_url(url, f)
else:
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We might want to assert HTTP(S) schemes here as additional error checking.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah I feel like I'd be happier having a few select allowed protocols rather than singling out data:; not sure it really makes sense to ask us to preview an ftp: URI either for example.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep, note that those will fail right now (we won't try them), but it would be better to be explicit, I think!

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this might actually be tougher to get correct than I initially thought, since we seem to support previewing scheme-less URLs. I'm not sure it makes sense to refactor that as part of this.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, fair enough.

(
length,
uri,
code,
media_type,
download_name,
expires,
etag,
) = await self._download_url(url, f)

await finish()

try:
Expand Down
54 changes: 46 additions & 8 deletions tests/rest/media/v1/test_url_preview.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import base64
import json
import os
import re
Expand All @@ -23,6 +24,7 @@

from synapse.config.oembed import OEmbedEndpointConfig
from synapse.rest.media.v1.preview_url_resource import IMAGE_CACHE_EXPIRY_MS
from synapse.types import JsonDict
from synapse.util.stringutils import parse_and_validate_mxc_uri

from tests import unittest
Expand Down Expand Up @@ -142,6 +144,13 @@ def resolveHostName(
def create_test_resource(self):
return self.hs.get_media_repository_resource()

def _assert_small_png(self, json_body: JsonDict) -> None:
    """Check that the Open Graph data matches the 1x1 SMALL_PNG test image."""
    # The image must have been re-hosted as local media.
    self.assertTrue(json_body["og:image"].startswith("mxc://"))
    # Dimensions and type of the 1x1 PNG fixture.
    for key, expected in (
        ("og:image:height", 1),
        ("og:image:width", 1),
        ("og:image:type", "image/png"),
    ):
        self.assertEqual(json_body[key], expected)

def test_cache_returns_correct_type(self):
self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]

Expand Down Expand Up @@ -569,6 +578,41 @@ def test_accept_language_config_option(self):
server.data,
)

def test_data_url(self):
    """An HTML page with an inline image given as a data: URL should be previewed."""
    self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]

    data = base64.b64encode(SMALL_PNG)

    end_content = (
        b"<html><head>" b'<img src="data:image/png;base64,%s" />' b"</head></html>"
    ) % (data,)

    channel = self.make_request(
        "GET",
        "preview_url?url=http://matrix.org",
        shorthand=False,
        await_result=False,
    )
    self.pump()

    # Simulate the remote server's HTML response.
    client = self.reactor.tcpClients[0][2].buildProtocol(None)
    server = AccumulatingProtocol()
    server.makeConnection(FakeTransport(client, self.reactor))
    client.makeConnection(FakeTransport(server, self.reactor))
    client.dataReceived(
        (
            b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
            b'Content-Type: text/html; charset="utf8"\r\n\r\n'
        )
        % (len(end_content),)
        + end_content
    )

    self.pump()
    # Previously this assertion was duplicated; assert once.
    self.assertEqual(channel.code, 200)
    self._assert_small_png(channel.json_body)

def test_oembed_photo(self):
"""Test an oEmbed endpoint which returns a 'photo' type which redirects the preview to a new URL."""
self.lookups["publish.twitter.com"] = [(IPv4Address, "10.1.2.3")]
Expand Down Expand Up @@ -626,10 +670,7 @@ def test_oembed_photo(self):
self.assertEqual(channel.code, 200)
body = channel.json_body
self.assertEqual(body["og:url"], "http://twitter.com/matrixdotorg/status/12345")
self.assertTrue(body["og:image"].startswith("mxc://"))
self.assertEqual(body["og:image:height"], 1)
self.assertEqual(body["og:image:width"], 1)
self.assertEqual(body["og:image:type"], "image/png")
self._assert_small_png(body)

def test_oembed_rich(self):
"""Test an oEmbed endpoint which returns HTML content via the 'rich' type."""
Expand Down Expand Up @@ -820,10 +861,7 @@ def test_oembed_autodiscovery(self):
self.assertEqual(
body["og:url"], "http://www.twitter.com/matrixdotorg/status/12345"
)
self.assertTrue(body["og:image"].startswith("mxc://"))
self.assertEqual(body["og:image:height"], 1)
self.assertEqual(body["og:image:width"], 1)
self.assertEqual(body["og:image:type"], "image/png")
self._assert_small_png(body)

def _download_image(self):
"""Downloads an image into the URL cache.
Expand Down