Fix preview of imgur and Tenor URLs. (#11669)

By scraping Open Graph information from the HTML even when an autodiscovery endpoint is found. The results are then combined to capture as much information as possible from the page.
matrix-org · Jan 18, 2022 · 15ffc41 · 15ffc41
1 parent 9eab71a
commit 15ffc41
Show file tree

Hide file tree

Showing 4 changed files with 39 additions and 14 deletions.
diff --git a/changelog.d/11669.bugfix b/changelog.d/11669.bugfix
@@ -0,0 +1 @@
+Fix preview of some gif URLs (like tenor.com). Contributed by Philippe Daouadi.
diff --git a/docs/development/url_previews.md b/docs/development/url_previews.md
@@ -35,7 +35,12 @@ When Synapse is asked to preview a URL it does the following:
    5. If the media is HTML:
       1. Decodes the HTML via the stored file.
       2. Generates an Open Graph response from the HTML.
-      3. If an image exists in the Open Graph response:
+      3. If a JSON oEmbed URL was found in the HTML via autodiscovery:
+         1. Downloads the URL and stores it into a file via the media storage provider
+            and saves the local media metadata.
+         2. Convert the oEmbed response to an Open Graph response.
+         3. Override any Open Graph data from the HTML with data from oEmbed.
+      4. If an image exists in the Open Graph response:
          1. Downloads the URL and stores it into a file via the media storage
             provider and saves the local media metadata.
          2. Generates thumbnails.

diff --git a/synapse/rest/media/v1/oembed.py b/synapse/rest/media/v1/oembed.py
@@ -33,6 +33,8 @@
 class OEmbedResult:
     # The Open Graph result (converted from the oEmbed result).
     open_graph_result: JsonDict
+    # The author_name of the oEmbed result
+    author_name: Optional[str]
     # Number of milliseconds to cache the content, according to the oEmbed response.
     #
     # This will be None if no cache-age is provided in the oEmbed response (or
@@ -154,11 +156,12 @@ def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult:
                 "og:url": url,
             }
 
-            # Use either title or author's name as the title.
-            title = oembed.get("title") or oembed.get("author_name")
+            title = oembed.get("title")
             if title:
                 open_graph_response["og:title"] = title
 
+            author_name = oembed.get("author_name")
+
             # Use the provider name and as the site.
             provider_name = oembed.get("provider_name")
             if provider_name:
@@ -193,9 +196,10 @@ def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult:
             # Trap any exception and let the code follow as usual.
             logger.warning("Error parsing oEmbed metadata from %s: %r", url, e)
             open_graph_response = {}
+            author_name = None
             cache_age = None
 
-        return OEmbedResult(open_graph_response, cache_age)
+        return OEmbedResult(open_graph_response, author_name, cache_age)
 
 
 def _fetch_urls(tree: "etree.Element", tag_name: str) -> List[str]:

diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py
@@ -262,6 +262,7 @@ async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes:
 
         # The number of milliseconds that the response should be considered valid.
         expiration_ms = media_info.expires
+        author_name: Optional[str] = None
 
         if _is_media(media_info.media_type):
             file_id = media_info.filesystem_id
@@ -294,25 +295,33 @@ async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes:
                 # Check if this HTML document points to oEmbed information and
                 # defer to that.
                 oembed_url = self._oembed.autodiscover_from_html(tree)
-                og = {}
+                og_from_oembed: JsonDict = {}
                 if oembed_url:
                     oembed_info = await self._download_url(oembed_url, user)
-                    og, expiration_ms = await self._handle_oembed_response(
+                    (
+                        og_from_oembed,
+                        author_name,
+                        expiration_ms,
+                    ) = await self._handle_oembed_response(
                         url, oembed_info, expiration_ms
                     )
 
-                # If there was no oEmbed URL (or oEmbed parsing failed), attempt
-                # to generate the Open Graph information from the HTML.
-                if not oembed_url or not og:
-                    og = parse_html_to_open_graph(tree, media_info.uri)
+                # Parse Open Graph information from the HTML in case the oEmbed
+                # response failed or is incomplete.
+                og_from_html = parse_html_to_open_graph(tree, media_info.uri)
+
+                # Compile the Open Graph response by using the scraped
+                # information from the HTML and overlaying any information
+                # from the oEmbed response.
+                og = {**og_from_html, **og_from_oembed}
 
                 await self._precache_image_url(user, media_info, og)
             else:
                 og = {}
 
         elif oembed_url:
             # Handle the oEmbed information.
-            og, expiration_ms = await self._handle_oembed_response(
+            og, author_name, expiration_ms = await self._handle_oembed_response(
                 url, media_info, expiration_ms
             )
             await self._precache_image_url(user, media_info, og)
@@ -321,6 +330,11 @@ async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes:
             logger.warning("Failed to find any OG data in %s", url)
             og = {}
 
+        # If we don't have a title but we have author_name, copy it as
+        # title
+        if not og.get("og:title") and author_name:
+            og["og:title"] = author_name
+
         # filter out any stupidly long values
         keys_to_remove = []
         for k, v in og.items():
@@ -484,7 +498,7 @@ async def _precache_image_url(
 
     async def _handle_oembed_response(
         self, url: str, media_info: MediaInfo, expiration_ms: int
-    ) -> Tuple[JsonDict, int]:
+    ) -> Tuple[JsonDict, Optional[str], int]:
         """
         Parse the downloaded oEmbed info.
 
@@ -497,11 +511,12 @@ async def _handle_oembed_response(
         Returns:
             A tuple of:
                 The Open Graph dictionary, if the oEmbed info can be parsed.
+                The author name if it could be retrieved from oEmbed.
                 The (possibly updated) length of time, in milliseconds, the media is valid for.
         """
         # If JSON was not returned, there's nothing to do.
         if not _is_json(media_info.media_type):
-            return {}, expiration_ms
+            return {}, None, expiration_ms
 
         with open(media_info.filename, "rb") as file:
             body = file.read()
@@ -513,7 +528,7 @@ async def _handle_oembed_response(
         if open_graph_result and oembed_response.cache_age is not None:
             expiration_ms = oembed_response.cache_age
 
-        return open_graph_result, expiration_ms
+        return open_graph_result, oembed_response.author_name, expiration_ms
 
     def _start_expire_url_cache_data(self) -> Deferred:
         return run_as_background_process(