From f3c9aeab85a4cfb2cad3b10868ab8d56475d1b98 Mon Sep 17 00:00:00 2001 From: David Teller Date: Thu, 4 Feb 2021 10:58:09 +0100 Subject: [PATCH] Detecting bad files during upload/download, rather than while we're sending messages. This should be much more responsive for users. --- synapse_spamcheck_badlist/bad_list_filter.py | 50 +++++++------------- test/4_test.py | 32 +++---------- test/before_test.sh | 3 ++ test/run_tests.sh | 2 +- 4 files changed, 27 insertions(+), 60 deletions(-) diff --git a/synapse_spamcheck_badlist/bad_list_filter.py b/synapse_spamcheck_badlist/bad_list_filter.py index 21f84c1..2e02357 100644 --- a/synapse_spamcheck_badlist/bad_list_filter.py +++ b/synapse_spamcheck_badlist/bad_list_filter.py @@ -153,43 +153,25 @@ async def check_event_for_spam(self, event) -> bool: logger.info("Rejected bad link") return True - # If it's a file, download content, extract hash. - with self._md5_check_performance.time(): - if content.get("msgtype", "") in ["m.file", "m.image", "m.audio"]: - if not await self.can_we_check_md5(): - return False - - match = self._mxc_re.match(content.get("url", "")) - if match != None: - server_name = match.group('server_name') - media_id = match.group('media_id') - response = None - try: - url = "%s/_matrix/media/r0/download/%s/%s" % ( - self._base_url, - urlquote(server_name), - urlquote(media_id) - ) - response = await self._api.http_client.request("GET", url) - except Exception as e: - # In case of timeout or error, there's nothing we can do. - # Let's not take the risk of blocking valid contents. - logger.warn("Could not download media: '%s', assuming it's not spam." % e) - return False - if response.code == 429: - logger.warn("We were rate-limited, assuming it's not spam.") - return False - - md5 = hashlib.md5() - await response.collect(lambda batch: md5.update(batch)) - is_bad_upload = await self._api.run_db_interaction("Check upload against evil db", _db_is_bad_upload, self._md5_table, md5.hexdigest()) - if is_bad_upload: - logger.info("Rejected bad upload") - return True - # Not spam return False + async def check_media_file_for_spam(self, file_wrapper, file_info): + if await self.can_we_check_md5(): + logger.info("Checking media file") + # Compute MD5 of file. + hasher = hashlib.md5() + await file_wrapper.write_chunks_to(hasher.update) + + hex_digest = hasher.hexdigest() + + # Check if it shows up in the db. + if await self._api.run_db_interaction("Check whether this md5 shows up in the database", _db_is_bad_upload, self._md5_table, hex_digest): + logger.info("Rejected bad media file") + return True + + return False # allow all media + def check_username_for_spam(self, user_profile): return False # allow all usernames diff --git a/test/4_test.py b/test/4_test.py index 2d2e985..1bc54eb 100644 --- a/test/4_test.py +++ b/test/4_test.py @@ -7,7 +7,8 @@ import requests logging.basicConfig(filename = "/data/test.log") -logger = logging.getLogger("Test") +logger = logging.getLogger("synapse_spamcheck_badlist.test") + class Test: def __init__(self): @@ -80,7 +81,7 @@ def _upload_content(self, prefix, content): """ Upload a file. - Argument `prefix` is prepended to the file name, to aid with lookup up + Argument `prefix` is prepended to the file name, to aid with looking up stuff in the Synapse logs. """ response = requests.post('http://localhost:8080/_matrix/media/r0/upload?filename=%s-%s' % (prefix, uuid.uuid1()), @@ -90,7 +91,7 @@ def _upload_content(self, prefix, content): }, data = content ).json() - return response['content_uri'] + return response.get('content_uri', None) def _sync_with_server(self, since): """ @@ -182,9 +183,10 @@ def test(self): good_mxid = self._upload_content('good', good_file_content) logger.info('Good image is %s' % good_mxid) - logger.info('Upload a bad image, for the time being, it should be accepted') + logger.info('Upload a bad image, it should be rejected') evil_mxid = self._upload_content('evil', evil_file_content) - logger.info('Bad image is %s' % evil_mxid) + assert evil_mxid is None + for message_type in ['m.file', 'm.image', 'm.audio']: logger.info('Send good image with good description, it should be accepted') @@ -223,26 +225,6 @@ def test(self): # Message may be redacted later bad_events[event_id] = "Good image with bad description, type %s" % message_type - logger.info('Send bad image with good description, it should be rejected') - event_id = self._send_message_to_room( - 'bad-image-with-good-description', - { - 'body': 'A text without any link', - 'msgtype': message_type, - 'url': evil_mxid, - 'info': { - 'w': 320, - 'h': 200, - 'size': len(evil_file_content), - } - } - ) - if event_id is None: - logger.info('Message was rejected immediately') - else: - # Message may be redacted later - bad_events[event_id] = "Good image with bad description, type %s" % message_type - logger.info('Sending canary event, to ensure that all previous events have been flushed') event_id = self._send_message_to_room( 'canary-event', diff --git a/test/before_test.sh b/test/before_test.sh index 3634e7c..8764532 100755 --- a/test/before_test.sh +++ b/test/before_test.sh @@ -4,4 +4,7 @@ \rm -Rf synapse git clone https://github.com/matrix-org/synapse.git +cd synapse +git checkout erikj/media_spam_checker +cd .. docker build -t matrixdotorg/synapse -f synapse/docker/Dockerfile synapse diff --git a/test/run_tests.sh b/test/run_tests.sh index ee689ea..e999461 100755 --- a/test/run_tests.sh +++ b/test/run_tests.sh @@ -37,7 +37,7 @@ register_new_matrix_user -c /data/homeserver.yaml -u user_2 -p user_2 --no-admin # 4. Running test echo TESTER: Running test -python /data/test/4_test.py +python /data/test/4_test.py &> /data/test.log RESULT=$? # 5. In case of failure, display logs