Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Handle non-strings in the event_search table in synapse_port_db #12037

Merged
merged 13 commits into from
Feb 24, 2022
1 change: 1 addition & 0 deletions changelog.d/12037.bugfix
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Properly fix a long-standing bug where wrong data could be inserted in the `event_search` table when using sqlite. This could block running `synapse_port_db` with an "argument of type 'int' is not iterable" error. This bug was partially fixed by a change in Synapse 1.44.0.
18 changes: 9 additions & 9 deletions synapse/storage/databases/main/events.py
Original file line number Diff line number Diff line change
Expand Up @@ -1473,10 +1473,10 @@ def _store_rejected_events_txn(self, txn, events_and_contexts):

def _update_metadata_tables_txn(
self,
txn,
txn: LoggingTransaction,
*,
events_and_contexts,
all_events_and_contexts,
events_and_contexts: List[Tuple[EventBase, EventContext]],
all_events_and_contexts: List[Tuple[EventBase, EventContext]],
inhibit_local_membership_updates: bool = False,
):
"""Update all the miscellaneous tables for new events
Expand Down Expand Up @@ -1953,20 +1953,20 @@ def _handle_redaction(self, txn, redacted_event_id):
txn, table="event_relations", keyvalues={"event_id": redacted_event_id}
)

def _store_room_topic_txn(self, txn, event):
if hasattr(event, "content") and "topic" in event.content:
def _store_room_topic_txn(self, txn: LoggingTransaction, event: EventBase):
if isinstance(event.content.get("topic"), str):
self.store_event_search_txn(
txn, event, "content.topic", event.content["topic"]
)

def _store_room_name_txn(self, txn, event):
if hasattr(event, "content") and "name" in event.content:
def _store_room_name_txn(self, txn: LoggingTransaction, event: EventBase):
if isinstance(event.content.get("name"), str):
self.store_event_search_txn(
txn, event, "content.name", event.content["name"]
)

def _store_room_message_txn(self, txn, event):
if hasattr(event, "content") and "body" in event.content:
def _store_room_message_txn(self, txn: LoggingTransaction, event: EventBase):
if isinstance(event.content.get("body"), str):
self.store_event_search_txn(
txn, event, "content.body", event.content["body"]
)
Expand Down
26 changes: 26 additions & 0 deletions synapse/storage/databases/main/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ class SearchBackgroundUpdateStore(SearchWorkerStore):
EVENT_SEARCH_ORDER_UPDATE_NAME = "event_search_order"
EVENT_SEARCH_USE_GIST_POSTGRES_NAME = "event_search_postgres_gist"
EVENT_SEARCH_USE_GIN_POSTGRES_NAME = "event_search_postgres_gin"
EVENT_SEARCH_DELETE_NON_STRINGS = "event_search_sqlite_delete_non_strings"

def __init__(
self,
Expand Down Expand Up @@ -147,6 +148,10 @@ def __init__(
self.EVENT_SEARCH_USE_GIN_POSTGRES_NAME, self._background_reindex_gin_search
)

self.db_pool.updates.register_background_update_handler(
self.EVENT_SEARCH_DELETE_NON_STRINGS, self._background_delete_non_strings
)

async def _background_reindex_search(self, progress, batch_size):
# we work through the events table from highest stream id to lowest
target_min_stream_id = progress["target_min_stream_id_inclusive"]
Expand Down Expand Up @@ -372,6 +377,27 @@ def reindex_search_txn(txn):

return num_rows

async def _background_delete_non_strings(
self, progress: JsonDict, batch_size: int
) -> int:
"""Deletes rows with non-string `value`s from `event_search` if using sqlite.

Prior to Synapse 1.44.0, malformed events received over federation could cause integers
to be inserted into the `event_search` table when using sqlite.
"""

def delete_non_strings_txn(txn: LoggingTransaction) -> None:
txn.execute("DELETE FROM event_search WHERE typeof(value) != 'text'")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need to limit this in any way? (Will this cause any locks on the table?)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good question! It crossed my mind at some point and I forgot to make a note of it on the PR.

event_search is defined as a full text search virtual table with no explicit indexes:
CREATE VIRTUAL TABLE event_search USING fts4 (event_id, room_id, sender, key, value);
so it looked like doing the processing in batches would be slower, since DELETEing by specific event_ids would have to scan the whole table anyway.

I think this will lock the table for the duration of the DELETE. The sqlite docs suggest that locks are done on the file level, so it's actually the entire database that gets locked, which is unfortunate.

We could add an index first, then delete in batches, but if we're going to ask sqlite to scan the whole table to build an index, we might as well just do the delete.


await self.db_pool.runInteraction(
self.EVENT_SEARCH_DELETE_NON_STRINGS, delete_non_strings_txn
)

await self.db_pool.updates._end_background_update(
self.EVENT_SEARCH_DELETE_NON_STRINGS
)
return 1


class SearchStore(SearchBackgroundUpdateStore):
def __init__(
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
/* Copyright 2022 The Matrix.org Foundation C.I.C
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/


-- Delete rows with non-string `value`s from `event_search` if using sqlite.
--
-- Prior to Synapse 1.44.0, malformed events received over federation could
-- cause integers to be inserted into the `event_search` table.
INSERT INTO background_updates (ordering, update_name, progress_json) VALUES
(6805, 'event_search_sqlite_delete_non_strings', '{}');
117 changes: 115 additions & 2 deletions tests/storage/test_room_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,16 @@
# limitations under the License.

import synapse.rest.admin
from synapse.api.constants import EventTypes
from synapse.api.errors import StoreError
from synapse.rest.client import login, room
from synapse.storage.engines import PostgresEngine

from tests.unittest import HomeserverTestCase
from tests.unittest import HomeserverTestCase, skip_unless
from tests.utils import USE_POSTGRES_FOR_TESTS


class NullByteInsertionTest(HomeserverTestCase):
class EventSearchInsertionTest(HomeserverTestCase):
servlets = [
synapse.rest.admin.register_servlets_for_client_rest_resource,
login.register_servlets,
Expand Down Expand Up @@ -72,3 +75,113 @@ def test_null_byte(self):
)
if isinstance(store.database_engine, PostgresEngine):
self.assertIn("alice", result.get("highlights"))

def test_non_string(self):
"""Test that non-string `value`s are not inserted into `event_search`.

This is particularly important when using sqlite, since a sqlite column can hold
both strings and integers. When using Postgres, integers are automatically
converted to strings.

Regression test for #11918.
"""
store = self.hs.get_datastores().main

# Register a user and create a room
user_id = self.register_user("alice", "password")
access_token = self.login("alice", "password")
room_id = self.helper.create_room_as("alice", tok=access_token)
room_version = self.get_success(store.get_room_version(room_id))

# Construct a message with a numeric body to be received over federation
# The message can't be sent using the client API, since Synapse's event
# validation will reject it.
prev_event_ids = self.get_success(store.get_prev_events_for_room(room_id))
prev_event = self.get_success(store.get_event(prev_event_ids[0]))
prev_state_map = self.get_success(
self.hs.get_storage().state.get_state_ids_for_event(prev_event_ids[0])
)

event_dict = {
"type": EventTypes.Message,
"content": {"msgtype": "m.text", "body": 2},
"room_id": room_id,
"sender": user_id,
"depth": prev_event.depth + 1,
"prev_events": prev_event_ids,
"origin_server_ts": self.clock.time_msec(),
}
builder = self.hs.get_event_builder_factory().for_room_version(
room_version, event_dict
)
event = self.get_success(
builder.build(
prev_event_ids=prev_event_ids,
auth_event_ids=self.hs.get_event_auth_handler().compute_auth_events(
builder,
prev_state_map,
for_verification=False,
),
depth=event_dict["depth"],
)
)

# Receive the event
self.get_success(
self.hs.get_federation_event_handler().on_receive_pdu(
self.hs.hostname, event
)
)

# The event should not have an entry in the `event_search` table
f = self.get_failure(
store.db_pool.simple_select_one_onecol(
"event_search",
{"room_id": room_id, "event_id": event.event_id},
"event_id",
),
StoreError,
)
self.assertEqual(f.value.code, 404)

@skip_unless(not USE_POSTGRES_FOR_TESTS, "requires sqlite")
def test_sqlite_non_string_deletion_background_update(self):
"""Test the background update to delete bad rows from `event_search`."""
store = self.hs.get_datastores().main

# Populate `event_search` with dummy data
self.get_success(
store.db_pool.simple_insert_many(
"event_search",
keys=["event_id", "room_id", "key", "value"],
values=[
("event1", "room_id", "content.body", "hi"),
("event2", "room_id", "content.body", "2"),
("event3", "room_id", "content.body", 3),
],
desc="populate_event_search",
)
)

# Run the background update
store.db_pool.updates._all_done = False
self.get_success(
store.db_pool.simple_insert(
"background_updates",
{
"update_name": "event_search_sqlite_delete_non_strings",
"progress_json": "{}",
},
)
)
self.wait_for_background_updates()

# The non-string `value`s ought to be gone now.
values = self.get_success(
store.db_pool.simple_select_onecol(
"event_search",
{"room_id": "room_id"},
"value",
),
)
self.assertCountEqual(values, ["hi", "2"])