Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Commit

Permalink
Try to recover from unknown encodings when previewing media. (#9164)
Browse files Browse the repository at this point in the history
Treat unknown encodings (according to lxml) as UTF-8
when generating a preview for HTML documents. This
isn't fully accurate, but will hopefully give a reasonable
title and summary.
  • Loading branch information
clokep authored Jan 26, 2021
1 parent e74bb96 commit 4937fe3
Show file tree
Hide file tree
Showing 3 changed files with 64 additions and 10 deletions.
1 change: 1 addition & 0 deletions changelog.d/9164.bugfix
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fix a long-standing bug where an internal server error was raised when attempting to preview an HTML document in an unknown character encoding.
44 changes: 34 additions & 10 deletions synapse/rest/media/v1/preview_url_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -386,7 +386,7 @@ def _get_oembed_url(self, url: str) -> Optional[str]:
"""
Check whether the URL should be downloaded as oEmbed content instead.
Params:
Args:
url: The URL to check.
Returns:
Expand All @@ -403,7 +403,7 @@ async def _get_oembed_content(self, endpoint: str, url: str) -> OEmbedResult:
"""
Request content from an oEmbed endpoint.
Params:
Args:
endpoint: The oEmbed API endpoint.
url: The URL to pass to the API.
Expand Down Expand Up @@ -692,27 +692,51 @@ async def _expire_url_cache_data(self) -> None:
def decode_and_calc_og(
body: bytes, media_uri: str, request_encoding: Optional[str] = None
) -> Dict[str, Optional[str]]:
"""
Calculate metadata for an HTML document.
This uses lxml to parse the HTML document into the OG response. If errors
occur during processing of the document, an empty response is returned.
Args:
body: The HTML document, as bytes.
media_url: The URI used to download the body.
request_encoding: The character encoding of the body, as a string.
Returns:
The OG response as a dictionary.
"""
# If there's no body, nothing useful is going to be found.
if not body:
return {}

from lxml import etree

# Create an HTML parser. If this fails, log and return no metadata.
try:
parser = etree.HTMLParser(recover=True, encoding=request_encoding)
tree = etree.fromstring(body, parser)
og = _calc_og(tree, media_uri)
except LookupError:
# blindly consider the encoding as utf-8.
parser = etree.HTMLParser(recover=True, encoding="utf-8")
except Exception as e:
logger.warning("Unable to create HTML parser: %s" % (e,))
return {}

def _attempt_calc_og(body_attempt: Union[bytes, str]) -> Dict[str, Optional[str]]:
# Attempt to parse the body. If this fails, log and return no metadata.
tree = etree.fromstring(body_attempt, parser)
return _calc_og(tree, media_uri)

# Attempt to parse the body. If this fails, log and return no metadata.
try:
return _attempt_calc_og(body)
except UnicodeDecodeError:
# blindly try decoding the body as utf-8, which seems to fix
# the charset mismatches on https://google.com
parser = etree.HTMLParser(recover=True, encoding=request_encoding)
tree = etree.fromstring(body.decode("utf-8", "ignore"), parser)
og = _calc_og(tree, media_uri)

return og
return _attempt_calc_og(body.decode("utf-8", "ignore"))


def _calc_og(tree, media_uri: str) -> Dict[str, Optional[str]]:
def _calc_og(tree: "etree.Element", media_uri: str) -> Dict[str, Optional[str]]:
# suck our tree into lxml and define our OG response.

# if we see any image URLs in the OG response, then spider them
Expand Down
29 changes: 29 additions & 0 deletions tests/test_preview.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,3 +261,32 @@ def test_empty(self):
html = ""
og = decode_and_calc_og(html, "http://example.com/test.html")
self.assertEqual(og, {})

def test_invalid_encoding(self):
"""An invalid character encoding should be ignored and treated as UTF-8, if possible."""
html = """
<html>
<head><title>Foo</title></head>
<body>
Some text.
</body>
</html>
"""
og = decode_and_calc_og(
html, "http://example.com/test.html", "invalid-encoding"
)
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})

def test_invalid_encoding2(self):
"""A body which doesn't match the sent character encoding."""
# Note that this contains an invalid UTF-8 sequence in the title.
html = b"""
<html>
<head><title>\xff\xff Foo</title></head>
<body>
Some text.
</body>
</html>
"""
og = decode_and_calc_og(html, "http://example.com/test.html")
self.assertEqual(og, {"og:title": "ÿÿ Foo", "og:description": "Some text."})

0 comments on commit 4937fe3

Please sign in to comment.