From 7b9d8066e050400ff0c13e2f553fb428f7539029 Mon Sep 17 00:00:00 2001 From: Sean Quah Date: Fri, 18 Feb 2022 17:06:41 +0000 Subject: [PATCH 01/12] Handle non-strings in the `event_search` table in `synapse_port_db` Signed-off-by: Sean Quah --- changelog.d/12037.bugfix | 1 + scripts/synapse_port_db | 4 +++- synapse/storage/databases/main/events.py | 6 +++--- 3 files changed, 7 insertions(+), 4 deletions(-) create mode 100644 changelog.d/12037.bugfix diff --git a/changelog.d/12037.bugfix b/changelog.d/12037.bugfix new file mode 100644 index 000000000000..fea56565616a --- /dev/null +++ b/changelog.d/12037.bugfix @@ -0,0 +1 @@ +Fix a long-standing bug where integers could get into the `event_search` table when using sqlite and prevent migration to PostgreSQL. diff --git a/scripts/synapse_port_db b/scripts/synapse_port_db index db354b3c8c5c..766728a5c6be 100755 --- a/scripts/synapse_port_db +++ b/scripts/synapse_port_db @@ -473,7 +473,9 @@ class Porter(object): rows_dict = [] for row in rows: d = dict(zip(headers, row)) - if "\0" in d["value"]: + if not isinstance(d["value"], str) or "\0" in d["value"]: + # `value` must be a string and contain no null characters. + # Previous versions of Synapse allowed integers to slip into the column. logger.warning("dropping search row %s", d) else: rows_dict.append(d) diff --git a/synapse/storage/databases/main/events.py b/synapse/storage/databases/main/events.py index a1d7a9b41300..24fbbe9b428f 100644 --- a/synapse/storage/databases/main/events.py +++ b/synapse/storage/databases/main/events.py @@ -1954,19 +1954,19 @@ def _handle_redaction(self, txn, redacted_event_id): ) def _store_room_topic_txn(self, txn, event): - if hasattr(event, "content") and "topic" in event.content: + if hasattr(event, "content") and isinstance(event.content.get("topic"), str): self.store_event_search_txn( txn, event, "content.topic", event.content["topic"] ) def _store_room_name_txn(self, txn, event): - if hasattr(event, "content") and "name" in event.content: + if hasattr(event, "content") and isinstance(event.content.get("name"), str): self.store_event_search_txn( txn, event, "content.name", event.content["name"] ) def _store_room_message_txn(self, txn, event): - if hasattr(event, "content") and "body" in event.content: + if hasattr(event, "content") and isinstance(event.content.get("body"), str): self.store_event_search_txn( txn, event, "content.body", event.content["body"] ) From 5889d87dc3a9dcfd61c7362cc4ef111cf6d2fdb1 Mon Sep 17 00:00:00 2001 From: Sean Quah Date: Tue, 22 Feb 2022 15:45:17 +0000 Subject: [PATCH 02/12] Add test for message received over federation with non-string body --- tests/storage/test_room_search.py | 72 ++++++++++++++++++++++++++++++- 1 file changed, 70 insertions(+), 2 deletions(-) diff --git a/tests/storage/test_room_search.py b/tests/storage/test_room_search.py index 8971ecccbd6d..c077f1f7a8da 100644 --- a/tests/storage/test_room_search.py +++ b/tests/storage/test_room_search.py @@ -13,13 +13,16 @@ # limitations under the License. import synapse.rest.admin +from synapse.api.constants import EventTypes +from synapse.api.errors import StoreError from synapse.rest.client import login, room from synapse.storage.engines import PostgresEngine -from tests.unittest import HomeserverTestCase +from tests.unittest import HomeserverTestCase, skip_unless +from tests.utils import USE_POSTGRES_FOR_TESTS -class NullByteInsertionTest(HomeserverTestCase): +class EventSearchInsertionTest(HomeserverTestCase): servlets = [ synapse.rest.admin.register_servlets_for_client_rest_resource, login.register_servlets, @@ -72,3 +75,68 @@ def test_null_byte(self): ) if isinstance(store.database_engine, PostgresEngine): self.assertIn("alice", result.get("highlights")) + + @skip_unless(not USE_POSTGRES_FOR_TESTS, "requires sqlite") + def test_sqlite_non_string(self): + """Test that non-string `value`s are not inserted into `event_search` on sqlite. + + Regression test for #11918. + """ + store = self.hs.get_datastore() + + # Register a user and create a room + user_id = self.register_user("alice", "password") + access_token = self.login("alice", "password") + room_id = self.helper.create_room_as("alice", tok=access_token) + room_version = self.get_success(store.get_room_version(room_id)) + + # Construct a message with a numeric body to be received over federation + # The message can't be sent using the client API, since Synapse's event + # validation will reject it. + prev_event_ids = self.get_success(store.get_prev_events_for_room(room_id)) + prev_event = self.get_success(store.get_event(prev_event_ids[0])) + prev_state_map = self.get_success( + self.hs.get_storage().state.get_state_ids_for_event(prev_event_ids[0]) + ) + + event_dict = { + "type": EventTypes.Message, + "content": {"msgtype": "m.text", "body": 2}, + "room_id": room_id, + "sender": user_id, + "depth": prev_event.depth + 1, + "prev_events": prev_event_ids, + "origin_server_ts": self.clock.time_msec(), + } + builder = self.hs.get_event_builder_factory().for_room_version( + room_version, event_dict + ) + event = self.get_success( + builder.build( + prev_event_ids=prev_event_ids, + auth_event_ids=self.hs.get_event_auth_handler().compute_auth_events( + builder, + prev_state_map, + for_verification=False, + ), + depth=event_dict["depth"], + ) + ) + + # Receive the event + self.get_success( + self.hs.get_federation_event_handler().on_receive_pdu( + self.hs.hostname, event + ) + ) + + # The event should not have an entry in the `event_search` table + f = self.get_failure( + store.db_pool.simple_select_one_onecol( + "event_search", + {"room_id": room_id, "event_id": event.event_id}, + "value", + ), + StoreError, + ) + self.assertEqual(f.value.code, 404) From c6d0534fa7bc203ac23095762d7781e1c10e78a3 Mon Sep 17 00:00:00 2001 From: Sean Quah Date: Tue, 22 Feb 2022 15:45:36 +0000 Subject: [PATCH 03/12] Reword newsfile --- changelog.d/12037.bugfix | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/changelog.d/12037.bugfix b/changelog.d/12037.bugfix index fea56565616a..7d178caf7677 100644 --- a/changelog.d/12037.bugfix +++ b/changelog.d/12037.bugfix @@ -1 +1 @@ -Fix a long-standing bug where integers could get into the `event_search` table when using sqlite and prevent migration to PostgreSQL. +Clean up after a long-standing bug that was fixed by accident in Synapse 1.44.0 where integers could be inserted into the `event_search` table when using sqlite. From c7b6710f4ec828c2fad97fe191e6381d6d48beaa Mon Sep 17 00:00:00 2001 From: Sean Quah Date: Tue, 22 Feb 2022 15:52:29 +0000 Subject: [PATCH 04/12] Revert fix to `scripts/synapse_port_db` in favour of a background update --- scripts/synapse_port_db | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/scripts/synapse_port_db b/scripts/synapse_port_db index 766728a5c6be..db354b3c8c5c 100755 --- a/scripts/synapse_port_db +++ b/scripts/synapse_port_db @@ -473,9 +473,7 @@ class Porter(object): rows_dict = [] for row in rows: d = dict(zip(headers, row)) - if not isinstance(d["value"], str) or "\0" in d["value"]: - # `value` must be a string and contain no null characters. - # Previous versions of Synapse allowed integers to slip into the column. + if "\0" in d["value"]: logger.warning("dropping search row %s", d) else: rows_dict.append(d) From b6aff6e17e0174bec212860c470c221e2cac9fa6 Mon Sep 17 00:00:00 2001 From: Sean Quah Date: Tue, 22 Feb 2022 16:39:48 +0000 Subject: [PATCH 05/12] Add a background update to clear out bad rows from `event_search` when using sqlite --- synapse/storage/databases/main/search.py | 27 ++++++++++++ ...5_delete_non_strings_from_event_search.sql | 22 ++++++++++ tests/storage/test_room_search.py | 42 +++++++++++++++++++ 3 files changed, 91 insertions(+) create mode 100644 synapse/storage/schema/main/delta/68/05_delete_non_strings_from_event_search.sql diff --git a/synapse/storage/databases/main/search.py b/synapse/storage/databases/main/search.py index acea300ed343..25ce93963bab 100644 --- a/synapse/storage/databases/main/search.py +++ b/synapse/storage/databases/main/search.py @@ -115,6 +115,7 @@ class SearchBackgroundUpdateStore(SearchWorkerStore): EVENT_SEARCH_ORDER_UPDATE_NAME = "event_search_order" EVENT_SEARCH_USE_GIST_POSTGRES_NAME = "event_search_postgres_gist" EVENT_SEARCH_USE_GIN_POSTGRES_NAME = "event_search_postgres_gin" + EVENT_SEARCH_DELETE_NON_STRINGS = "event_search_sqlite_delete_non_strings" def __init__( self, @@ -147,6 +148,10 @@ def __init__( self.EVENT_SEARCH_USE_GIN_POSTGRES_NAME, self._background_reindex_gin_search ) + self.db_pool.updates.register_background_update_handler( + self.EVENT_SEARCH_DELETE_NON_STRINGS, self._background_delete_non_strings + ) + async def _background_reindex_search(self, progress, batch_size): # we work through the events table from highest stream id to lowest target_min_stream_id = progress["target_min_stream_id_inclusive"] @@ -372,6 +377,28 @@ def reindex_search_txn(txn): return num_rows + async def _background_delete_non_strings( + self, progress: JsonDict, batch_size: int + ) -> int: + """Deletes rows with non-string `value`s from `event_search` if using sqlite. + + Prior to Synapse 1.44.0, malformed events received over federation could cause integers + to be inserted into the `event_search` table when using sqlite. + """ + + def delete_non_strings_txn(txn: LoggingTransaction) -> None: + txn.execute("DELETE FROM event_search WHERE typeof(value) != 'text'") + + if isinstance(self.database_engine, Sqlite3Engine): + await self.db_pool.runInteraction( + self.EVENT_SEARCH_DELETE_NON_STRINGS, delete_non_strings_txn + ) + + await self.db_pool.updates._end_background_update( + self.EVENT_SEARCH_DELETE_NON_STRINGS + ) + return 1 + class SearchStore(SearchBackgroundUpdateStore): def __init__( diff --git a/synapse/storage/schema/main/delta/68/05_delete_non_strings_from_event_search.sql b/synapse/storage/schema/main/delta/68/05_delete_non_strings_from_event_search.sql new file mode 100644 index 000000000000..140df652642d --- /dev/null +++ b/synapse/storage/schema/main/delta/68/05_delete_non_strings_from_event_search.sql @@ -0,0 +1,22 @@ +/* Copyright 2022 The Matrix.org Foundation C.I.C + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +-- Delete rows with non-string `value`s from `event_search` if using sqlite. +-- +-- Prior to Synapse 1.44.0, malformed events received over federation could +-- cause integers to be inserted into the `event_search` table. +INSERT INTO background_updates (ordering, update_name, progress_json) VALUES + (6805, 'event_search_sqlite_delete_non_strings', '{}'); diff --git a/tests/storage/test_room_search.py b/tests/storage/test_room_search.py index c077f1f7a8da..8590f6e2a23d 100644 --- a/tests/storage/test_room_search.py +++ b/tests/storage/test_room_search.py @@ -140,3 +140,45 @@ def test_sqlite_non_string(self): StoreError, ) self.assertEqual(f.value.code, 404) + + @skip_unless(not USE_POSTGRES_FOR_TESTS, "requires sqlite") + def test_sqlite_non_string_deletion_background_update(self): + """Test the background update to delete bad rows from `event_search`.""" + store = self.hs.get_datastore() + + # Populate `event_search` with dummy data + self.get_success( + store.db_pool.simple_insert_many( + "event_search", + keys=["event_id", "room_id", "key", "value"], + values=[ + ("event1", "room_id", "content.body", "hi"), + ("event2", "room_id", "content.body", "2"), + ("event3", "room_id", "content.body", 3), + ], + desc="populate_event_search", + ) + ) + + # Run the background update + store.db_pool.updates._all_done = False + self.get_success( + store.db_pool.simple_insert( + "background_updates", + { + "update_name": "event_search_sqlite_delete_non_strings", + "progress_json": "{}", + }, + ) + ) + self.wait_for_background_updates() + + # The non-string `value`s ought to be gone now. + values = self.get_success( + store.db_pool.simple_select_onecol( + "event_search", + {"room_id": "room_id"}, + "value", + ), + ) + self.assertCountEqual(values, ["hi", "2"]) From f04501990e57d5fb14177ba47538b7a66db9a5d3 Mon Sep 17 00:00:00 2001 From: Sean Quah Date: Wed, 23 Feb 2022 11:55:01 +0000 Subject: [PATCH 06/12] Use `.get_datastores().main` instead of `.get_datastore()` in tests --- tests/storage/test_room_search.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/storage/test_room_search.py b/tests/storage/test_room_search.py index 2fb14b8f7147..7aa6a60e34bc 100644 --- a/tests/storage/test_room_search.py +++ b/tests/storage/test_room_search.py @@ -82,7 +82,7 @@ def test_sqlite_non_string(self): Regression test for #11918. """ - store = self.hs.get_datastore() + store = self.hs.get_datastores().main # Register a user and create a room user_id = self.register_user("alice", "password") @@ -144,7 +144,7 @@ def test_sqlite_non_string(self): @skip_unless(not USE_POSTGRES_FOR_TESTS, "requires sqlite") def test_sqlite_non_string_deletion_background_update(self): """Test the background update to delete bad rows from `event_search`.""" - store = self.hs.get_datastore() + store = self.hs.get_datastores().main # Populate `event_search` with dummy data self.get_success( From a895049221ea7f151f233bf86ab715df4f467e3c Mon Sep 17 00:00:00 2001 From: Sean Quah Date: Wed, 23 Feb 2022 12:03:43 +0000 Subject: [PATCH 07/12] Run test on both sqlite and postgres --- tests/storage/test_room_search.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/storage/test_room_search.py b/tests/storage/test_room_search.py index 7aa6a60e34bc..886a74517737 100644 --- a/tests/storage/test_room_search.py +++ b/tests/storage/test_room_search.py @@ -76,9 +76,12 @@ def test_null_byte(self): if isinstance(store.database_engine, PostgresEngine): self.assertIn("alice", result.get("highlights")) - @skip_unless(not USE_POSTGRES_FOR_TESTS, "requires sqlite") - def test_sqlite_non_string(self): - """Test that non-string `value`s are not inserted into `event_search` on sqlite. + def test_non_string(self): + """Test that non-string `value`s are not inserted into `event_search`. + + This is particularly important when using sqlite, since a sqlite column can hold + both strings and integers. When using Postgres, integers are automatically + converted to strings. Regression test for #11918. """ From 37c1798840257ae1b4b55c530a715ea16f22c0d7 Mon Sep 17 00:00:00 2001 From: Sean Quah Date: Wed, 23 Feb 2022 12:06:18 +0000 Subject: [PATCH 08/12] Make schema delta sqlite only --- ...rch.sql => 05_delete_non_strings_from_event_search.sql.sqlite} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename synapse/storage/schema/main/delta/68/{05_delete_non_strings_from_event_search.sql => 05_delete_non_strings_from_event_search.sql.sqlite} (100%) diff --git a/synapse/storage/schema/main/delta/68/05_delete_non_strings_from_event_search.sql b/synapse/storage/schema/main/delta/68/05_delete_non_strings_from_event_search.sql.sqlite similarity index 100% rename from synapse/storage/schema/main/delta/68/05_delete_non_strings_from_event_search.sql rename to synapse/storage/schema/main/delta/68/05_delete_non_strings_from_event_search.sql.sqlite From 02d8fe4f543f0126c4358aee1c720ef384c92d21 Mon Sep 17 00:00:00 2001 From: Sean Quah Date: Wed, 23 Feb 2022 12:17:35 +0000 Subject: [PATCH 09/12] Reword changelog, again --- changelog.d/12037.bugfix | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/changelog.d/12037.bugfix b/changelog.d/12037.bugfix index 7d178caf7677..9295cb4dc0d9 100644 --- a/changelog.d/12037.bugfix +++ b/changelog.d/12037.bugfix @@ -1 +1 @@ -Clean up after a long-standing bug that was fixed by accident in Synapse 1.44.0 where integers could be inserted into the `event_search` table when using sqlite. +Properly fix a long-standing bug where wrong data could be inserted in the `event_search` table when using sqlite. This could block running `synapse_port_db` with an "argument of type 'int' is not iterable" error. This bug was partially fixed by a change in Synapse 1.44.0. From 4b5ec980f64cece011b0801dc14b0d5779e5b9f2 Mon Sep 17 00:00:00 2001 From: Sean Quah Date: Wed, 23 Feb 2022 13:21:20 +0000 Subject: [PATCH 10/12] Sprinkle some type hints around --- synapse/storage/databases/main/events.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/synapse/storage/databases/main/events.py b/synapse/storage/databases/main/events.py index 24fbbe9b428f..e53e84054ad0 100644 --- a/synapse/storage/databases/main/events.py +++ b/synapse/storage/databases/main/events.py @@ -1473,10 +1473,10 @@ def _store_rejected_events_txn(self, txn, events_and_contexts): def _update_metadata_tables_txn( self, - txn, + txn: LoggingTransaction, *, - events_and_contexts, - all_events_and_contexts, + events_and_contexts: List[Tuple[EventBase, EventContext]], + all_events_and_contexts: List[Tuple[EventBase, EventContext]], inhibit_local_membership_updates: bool = False, ): """Update all the miscellaneous tables for new events @@ -1953,20 +1953,20 @@ def _handle_redaction(self, txn, redacted_event_id): txn, table="event_relations", keyvalues={"event_id": redacted_event_id} ) - def _store_room_topic_txn(self, txn, event): - if hasattr(event, "content") and isinstance(event.content.get("topic"), str): + def _store_room_topic_txn(self, txn: LoggingTransaction, event: EventBase): + if isinstance(event.content.get("topic"), str): self.store_event_search_txn( txn, event, "content.topic", event.content["topic"] ) - def _store_room_name_txn(self, txn, event): - if hasattr(event, "content") and isinstance(event.content.get("name"), str): + def _store_room_name_txn(self, txn: LoggingTransaction, event: EventBase): + if isinstance(event.content.get("name"), str): self.store_event_search_txn( txn, event, "content.name", event.content["name"] ) - def _store_room_message_txn(self, txn, event): - if hasattr(event, "content") and isinstance(event.content.get("body"), str): + def _store_room_message_txn(self, txn: LoggingTransaction, event: EventBase): + if isinstance(event.content.get("body"), str): self.store_event_search_txn( txn, event, "content.body", event.content["body"] ) From b7ad0a928aa8d3e6ab8e6f59ba859a6b7bd6c589 Mon Sep 17 00:00:00 2001 From: Sean Quah Date: Wed, 23 Feb 2022 14:06:49 +0000 Subject: [PATCH 11/12] Fix test for postgres schema --- tests/storage/test_room_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/storage/test_room_search.py b/tests/storage/test_room_search.py index 886a74517737..d62e01726cba 100644 --- a/tests/storage/test_room_search.py +++ b/tests/storage/test_room_search.py @@ -138,7 +138,7 @@ def test_non_string(self): store.db_pool.simple_select_one_onecol( "event_search", {"room_id": room_id, "event_id": event.event_id}, - "value", + "event_id", ), StoreError, ) From 75bc1bf65b535b4a096c9ef7574ff44797f09d82 Mon Sep 17 00:00:00 2001 From: Sean Quah Date: Thu, 24 Feb 2022 11:12:43 +0000 Subject: [PATCH 12/12] Remove redundant sqlite guard --- synapse/storage/databases/main/search.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/synapse/storage/databases/main/search.py b/synapse/storage/databases/main/search.py index 25ce93963bab..e23b1190726b 100644 --- a/synapse/storage/databases/main/search.py +++ b/synapse/storage/databases/main/search.py @@ -389,10 +389,9 @@ async def _background_delete_non_strings( def delete_non_strings_txn(txn: LoggingTransaction) -> None: txn.execute("DELETE FROM event_search WHERE typeof(value) != 'text'") - if isinstance(self.database_engine, Sqlite3Engine): - await self.db_pool.runInteraction( - self.EVENT_SEARCH_DELETE_NON_STRINGS, delete_non_strings_txn - ) + await self.db_pool.runInteraction( + self.EVENT_SEARCH_DELETE_NON_STRINGS, delete_non_strings_txn + ) await self.db_pool.updates._end_background_update( self.EVENT_SEARCH_DELETE_NON_STRINGS