From 1d3858371e9577faf3382d1feee97154e5085cd4 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Tue, 8 Oct 2019 16:21:17 +0100 Subject: [PATCH 1/5] Disable bytes usage with postgres More often than not passing bytes to `txn.execute` is a bug (where we meant to pass a string) that just happens to work if `BYTEA_OUTPUT` is set to `ESCAPE`. However, this is a bit of a footgun so we want to instead error when this happens, and force using `bytearray` if we actually want to use bytes. --- synapse/storage/engines/postgres.py | 7 +++++++ synapse/storage/filtering.py | 4 ++-- synapse/storage/pusher.py | 2 +- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/synapse/storage/engines/postgres.py b/synapse/storage/engines/postgres.py index 601617b21e76..d670286fa5c1 100644 --- a/synapse/storage/engines/postgres.py +++ b/synapse/storage/engines/postgres.py @@ -22,6 +22,13 @@ class PostgresEngine(object): def __init__(self, database_module, database_config): self.module = database_module self.module.extensions.register_type(self.module.extensions.UNICODE) + + # Disables passing `bytes` to txn.execute, c.f. #6186. If you do + # actually want to use bytes than wrap it in `bytearray`. + def _disable_bytes_adapter(_): + raise Exception("Passing bytes to DB is disabled.") + + self.module.extensions.register_adapter(bytes, _disable_bytes_adapter) self.synchronous_commit = database_config.get("synchronous_commit", True) self._version = None # unknown as yet diff --git a/synapse/storage/filtering.py b/synapse/storage/filtering.py index 23b48f6ceade..7c2a7da83619 100644 --- a/synapse/storage/filtering.py +++ b/synapse/storage/filtering.py @@ -51,7 +51,7 @@ def _do_txn(txn): "SELECT filter_id FROM user_filters " "WHERE user_id = ? AND filter_json = ?" ) - txn.execute(sql, (user_localpart, def_json)) + txn.execute(sql, (user_localpart, bytearray(def_json))) filter_id_response = txn.fetchone() if filter_id_response is not None: return filter_id_response[0] @@ -68,7 +68,7 @@ def _do_txn(txn): "INSERT INTO user_filters (user_id, filter_id, filter_json)" "VALUES(?, ?, ?)" ) - txn.execute(sql, (user_localpart, filter_id, def_json)) + txn.execute(sql, (user_localpart, filter_id, bytearray(def_json))) return filter_id diff --git a/synapse/storage/pusher.py b/synapse/storage/pusher.py index 3e0e834a6237..b12e80440abe 100644 --- a/synapse/storage/pusher.py +++ b/synapse/storage/pusher.py @@ -241,7 +241,7 @@ def add_pusher( "device_display_name": device_display_name, "ts": pushkey_ts, "lang": lang, - "data": encode_canonical_json(data), + "data": bytearray(encode_canonical_json(data)), "last_stream_ordering": last_stream_ordering, "profile_tag": profile_tag, "id": stream_id, From 7f18b3d5262746d4095c747f6b80899445f0aa2d Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Wed, 9 Oct 2019 16:03:24 +0100 Subject: [PATCH 2/5] Do the update as a background index --- synapse/storage/events_bg_updates.py | 43 +++++++++++++++++++ .../redaction_censor3_fix_update.sql.postgres | 17 ++++---- 2 files changed, 51 insertions(+), 9 deletions(-) diff --git a/synapse/storage/events_bg_updates.py b/synapse/storage/events_bg_updates.py index 5717baf48cc6..e77a7e28af0a 100644 --- a/synapse/storage/events_bg_updates.py +++ b/synapse/storage/events_bg_updates.py @@ -71,6 +71,19 @@ def __init__(self, db_conn, hs): "redactions_received_ts", self._redactions_received_ts ) + # This index gets deleted in `event_fix_redactions_bytes` update + self.register_background_index_update( + "event_fix_redactions_bytes_create_index", + index_name="redactions_censored_redacts", + table="redactions", + columns=["redacts"], + where_clause="have_censored", + ) + + self.register_background_update_handler( + "event_fix_redactions_bytes", self._event_fix_redactions_bytes + ) + @defer.inlineCallbacks def _background_reindex_fields_sender(self, progress, batch_size): target_min_stream_id = progress["target_min_stream_id_inclusive"] @@ -458,3 +471,33 @@ def _redactions_received_ts_txn(txn): yield self._end_background_update("redactions_received_ts") return count + + @defer.inlineCallbacks + def _event_fix_redactions_bytes(self, progress, batch_size): + """Undoes hex encoded censored redacted event JSON. + """ + + def _event_fix_redactions_bytes_txn(txn): + # This update is quite fast due to new index. + txn.execute( + """ + UPDATE event_json + SET + json = convert_from(json::bytea, 'utf8') + FROM redactions + WHERE + redactions.have_censored + AND event_json.event_id = redactions.redacts + AND json NOT LIKE '{%'; + """ + ) + + txn.execute("DROP INDEX redactions_censored_redacts") + + yield self.runInteraction( + "_event_fix_redactions_bytes", _event_fix_redactions_bytes_txn + ) + + yield self._end_background_update("event_fix_redactions_bytes") + + return 1 diff --git a/synapse/storage/schema/delta/56/redaction_censor3_fix_update.sql.postgres b/synapse/storage/schema/delta/56/redaction_censor3_fix_update.sql.postgres index f7bcc5e2f29e..67471f3ef556 100644 --- a/synapse/storage/schema/delta/56/redaction_censor3_fix_update.sql.postgres +++ b/synapse/storage/schema/delta/56/redaction_censor3_fix_update.sql.postgres @@ -15,12 +15,11 @@ -- There was a bug where we may have updated censored redactions as bytes, --- which can (somehow) cause json to be inserted hex encoded. This goes and --- undoes any such hex encoded JSON. -UPDATE event_json SET json = convert_from(json::bytea, 'utf8') -WHERE event_id IN ( - SELECT event_json.event_id - FROM event_json - INNER JOIN redactions ON (event_json.event_id = redacts) - WHERE have_censored AND json NOT LIKE '{%' -); +-- which can (somehow) cause json to be inserted hex encoded. These updates go +-- and undoes any such hex encoded JSON. + +INSERT into background_updates (update_name, progress_json) + VALUES ('event_fix_redactions_bytes_create_index', '{}'); + +INSERT into background_updates (update_name, progress_json, depends_on) + VALUES ('event_fix_redactions_bytes', '{}', 'event_fix_redactions_bytes_create_index'); From c3b34dc32f85ed0b526dde1ed3d61316a8f461d8 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Wed, 9 Oct 2019 16:32:04 +0100 Subject: [PATCH 3/5] Newsfile --- changelog.d/6186.bugfix | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog.d/6186.bugfix diff --git a/changelog.d/6186.bugfix b/changelog.d/6186.bugfix new file mode 100644 index 000000000000..199ec690329f --- /dev/null +++ b/changelog.d/6186.bugfix @@ -0,0 +1 @@ +Fix bug where we were updating censored events as bytes rather than text, occaisonally causing invalid JSON being inserted breaking APIs that attempted to fetch such events. From ca3e01e50d2caca6a55b7c7808f0e948b430363d Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Thu, 10 Oct 2019 14:52:29 +0100 Subject: [PATCH 4/5] Fix store_url_cache using bytes --- synapse/rest/media/v1/preview_url_resource.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 7a56cd4b6c57..0c68c3aad52c 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -270,7 +270,7 @@ def _do_preview(self, url, user, ts): logger.debug("Calculated OG for %s as %s" % (url, og)) - jsonog = json.dumps(og).encode("utf8") + jsonog = json.dumps(og) # store OG in history-aware DB cache yield self.store.store_url_cache( @@ -283,7 +283,7 @@ def _do_preview(self, url, user, ts): media_info["created_ts"], ) - return jsonog + return jsonog.encode("utf8") @defer.inlineCallbacks def _download_url(self, url, user): From bc244627ac759bbd4691a2a20ac16383b4c2348c Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Thu, 10 Oct 2019 15:37:53 +0100 Subject: [PATCH 5/5] Fix postgres unit tests --- tests/storage/test_event_federation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/storage/test_event_federation.py b/tests/storage/test_event_federation.py index b58386994e4e..2fe50377f843 100644 --- a/tests/storage/test_event_federation.py +++ b/tests/storage/test_event_federation.py @@ -57,7 +57,7 @@ def insert_event(txn, i): "(event_id, algorithm, hash) " "VALUES (?, 'sha256', ?)" ), - (event_id, b"ffff"), + (event_id, bytearray(b"ffff")), ) for i in range(0, 11):