Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Draft: Prevent Failed to fetch missing state events... warning being logged every time we tried to fetch state_ids for a missing prev_event (_get_state_ids_after_missing_prev_event(...)) #13249

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/13249.bugfix
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Prevent `Failed to fetch missing state events...` warning being logged every time we tried to fetch `state_ids` for a missing `prev_event`.
46 changes: 37 additions & 9 deletions synapse/handlers/federation_event.py
Original file line number Diff line number Diff line change
Expand Up @@ -1064,20 +1064,26 @@ async def _get_state_ids_after_missing_prev_event(
)

logger.debug(
"state_ids returned %i state events, %i auth events",
"_get_state_ids_after_missing_prev_event(event_id=%s): state_ids returned %i state events, %i auth events",
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just giving all of these a namespaced/scoped context to easily compartmentalize them and follow down the rabbit hole.

event_id,
len(state_event_ids),
len(auth_event_ids),
)

# Start by checking events we already have in the DB
desired_events = set(state_event_ids)
desired_events.add(event_id)
logger.debug("Fetching %i events from cache/store", len(desired_events))
logger.debug(
"_get_state_ids_after_missing_prev_event(event_id=%s): Fetching %i events from cache/store",
event_id,
len(desired_events),
)
have_events = await self._store.have_seen_events(room_id, desired_events)

missing_desired_event_ids = desired_events - have_events
logger.debug(
"We are missing %i events (got %i)",
"_get_state_ids_after_missing_prev_event(event_id=%s): We are missing %i events (got %i)",
event_id,
len(missing_desired_event_ids),
len(have_events),
)
Expand All @@ -1094,7 +1100,11 @@ async def _get_state_ids_after_missing_prev_event(
missing_auth_event_ids.difference_update(
await self._store.have_seen_events(room_id, missing_auth_event_ids)
)
logger.debug("We are also missing %i auth events", len(missing_auth_event_ids))
logger.debug(
"_get_state_ids_after_missing_prev_event(event_id=%s): We are also missing %i auth events",
event_id,
len(missing_auth_event_ids),
)

missing_event_ids = missing_desired_event_ids | missing_auth_event_ids

Expand Down Expand Up @@ -1125,10 +1135,17 @@ async def _get_state_ids_after_missing_prev_event(
# TODO: might it be better to have an API which lets us do an aggregate event
# request
if (len(missing_event_ids) * 10) >= len(auth_event_ids) + len(state_event_ids):
logger.debug("Requesting complete state from remote")
logger.debug(
"_get_state_ids_after_missing_prev_event(event_id=%s): Requesting complete state from remote",
event_id,
)
await self._get_state_and_persist(destination, room_id, event_id)
else:
logger.debug("Fetching %i events from remote", len(missing_event_ids))
logger.debug(
"_get_state_ids_after_missing_prev_event(event_id=%s): Fetching %i events from remote",
event_id,
len(missing_event_ids),
)
await self._get_events_and_persist(
destination=destination, room_id=room_id, event_ids=missing_event_ids
)
Expand All @@ -1147,8 +1164,9 @@ async def _get_state_ids_after_missing_prev_event(
# This can happen if a remote server claims that the state or
# auth_events at an event in room A are actually events in room B
logger.warning(
"Remote server %s claims event %s in room %s is an auth/state "
"_get_state_ids_after_missing_prev_event(event_id=%s): Remote server %s claims event %s in room %s is an auth/state "
"event in room %s",
event_id,
destination,
state_event_id,
metadata.room_id,
Expand All @@ -1158,7 +1176,9 @@ async def _get_state_ids_after_missing_prev_event(

if metadata.state_key is None:
logger.warning(
"Remote server gave us non-state event in state: %s", state_event_id
"_get_state_ids_after_missing_prev_event(event_id=%s): Remote server gave us non-state event in state: %s",
event_id,
state_event_id,
)
continue

Expand All @@ -1174,6 +1194,14 @@ async def _get_state_ids_after_missing_prev_event(
if not remote_event:
raise Exception("Unable to get missing prev_event %s" % (event_id,))

# Figure out which events we failed to fetch
failed_to_fetch = desired_events - event_metadata.keys()
# The event_id is part of the `desired_events` but isn't fetched as part
# of the `event_metadata` so we remove it here separately if we did find it.
have_event_id = await self._store.have_seen_event(room_id, event_id)
if have_event_id:
failed_to_fetch = failed_to_fetch - {event_id}
MadLittleMods marked this conversation as resolved.
Show resolved Hide resolved

# missing state at that event is a warning, not a blocker
# XXX: this doesn't sound right? it means that we'll end up with incomplete
# state.
Expand All @@ -1183,7 +1211,7 @@ async def _get_state_ids_after_missing_prev_event(
failed_to_fetch.discard(event_id)
if failed_to_fetch:
logger.warning(
"Failed to fetch missing state events for %s %s",
"_get_state_ids_after_missing_prev_event(event_id=%s): Failed to fetch missing state events %s",
MadLittleMods marked this conversation as resolved.
Show resolved Hide resolved
event_id,
failed_to_fetch,
)
Expand Down
6 changes: 4 additions & 2 deletions synapse/storage/databases/main/events_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -1482,7 +1482,8 @@ async def have_seen_events(

return results

@cachedList(cached_method_name="have_seen_event", list_name="keys")
# TODO: The cache is giving us stale results for the `failed_to_fetch` stuff.
# @cachedList(cached_method_name="have_seen_event", list_name="keys")
async def _have_seen_events_dict(
self, keys: Collection[Tuple[str, str]]
) -> Dict[Tuple[str, str], bool]:
Expand Down Expand Up @@ -1524,7 +1525,8 @@ def have_seen_events_txn(txn: LoggingTransaction) -> None:
await self.db_pool.runInteraction("have_seen_events", have_seen_events_txn)
return results

@cached(max_entries=100000, tree=True)
# TODO: The cache is giving us stale results for the `failed_to_fetch` stuff.
# @cached(max_entries=100000, tree=True)
async def have_seen_event(self, room_id: str, event_id: str) -> bool:
res = await self._have_seen_events_dict(((room_id, event_id),))
return res[(room_id, event_id)]
Expand Down