Faster room joins: Try other destinations when resyncing the state of a partial-state room (#12812)

Signed-off-by: Sean Quah <seanq@matrix.org>
This commit is contained in:
Sean Quah 2022-05-31 15:50:29 +01:00 committed by GitHub
parent 3594f6c1f3
commit 2fba1076c5
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 94 additions and 9 deletions

1
changelog.d/12812.misc Normal file
View file

@ -0,0 +1 @@
Try other homeservers when re-syncing state for rooms with partial state.

View file

@ -405,6 +405,9 @@ class FederationClient(FederationBase):
Returns: Returns:
a tuple of (state event_ids, auth event_ids) a tuple of (state event_ids, auth event_ids)
Raises:
InvalidResponseError: if fields in the response have the wrong type.
""" """
result = await self.transport_layer.get_room_state_ids( result = await self.transport_layer.get_room_state_ids(
destination, room_id, event_id=event_id destination, room_id, event_id=event_id
@ -416,7 +419,7 @@ class FederationClient(FederationBase):
if not isinstance(state_event_ids, list) or not isinstance( if not isinstance(state_event_ids, list) or not isinstance(
auth_event_ids, list auth_event_ids, list
): ):
raise Exception("invalid response from /state_ids") raise InvalidResponseError("invalid response from /state_ids")
return state_event_ids, auth_event_ids return state_event_ids, auth_event_ids

View file

@ -20,7 +20,16 @@ import itertools
import logging import logging
from enum import Enum from enum import Enum
from http import HTTPStatus from http import HTTPStatus
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Union from typing import (
TYPE_CHECKING,
Collection,
Dict,
Iterable,
List,
Optional,
Tuple,
Union,
)
import attr import attr
from signedjson.key import decode_verify_key_bytes from signedjson.key import decode_verify_key_bytes
@ -34,6 +43,7 @@ from synapse.api.errors import (
CodeMessageException, CodeMessageException,
Codes, Codes,
FederationDeniedError, FederationDeniedError,
FederationError,
HttpResponseException, HttpResponseException,
NotFoundError, NotFoundError,
RequestSendFailed, RequestSendFailed,
@ -545,7 +555,8 @@ class FederationHandler:
run_as_background_process( run_as_background_process(
desc="sync_partial_state_room", desc="sync_partial_state_room",
func=self._sync_partial_state_room, func=self._sync_partial_state_room,
destination=origin, initial_destination=origin,
other_destinations=ret.servers_in_room,
room_id=room_id, room_id=room_id,
) )
@ -1454,13 +1465,16 @@ class FederationHandler:
async def _sync_partial_state_room( async def _sync_partial_state_room(
self, self,
destination: str, initial_destination: Optional[str],
other_destinations: Collection[str],
room_id: str, room_id: str,
) -> None: ) -> None:
"""Background process to resync the state of a partial-state room """Background process to resync the state of a partial-state room
Args: Args:
destination: homeserver to pull the state from initial_destination: the initial homeserver to pull the state from
other_destinations: other homeservers to try to pull the state from, if
`initial_destination` is unavailable
room_id: room to be resynced room_id: room to be resynced
""" """
@ -1472,8 +1486,29 @@ class FederationHandler:
# really leave, that might mean we have difficulty getting the room state over # really leave, that might mean we have difficulty getting the room state over
# federation. # federation.
# #
# TODO(faster_joins): try other destinations if the one we have fails # TODO(faster_joins): we need some way of prioritising which homeservers in
# `other_destinations` to try first, otherwise we'll spend ages trying dead
# homeservers for large rooms.
if initial_destination is None and len(other_destinations) == 0:
raise ValueError(
f"Cannot resync state of {room_id}: no destinations provided"
)
# Make an infinite iterator of destinations to try. Once we find a working
# destination, we'll stick with it until it flakes.
if initial_destination is not None:
# Move `initial_destination` to the front of the list.
destinations = list(other_destinations)
if initial_destination in destinations:
destinations.remove(initial_destination)
destinations = [initial_destination] + destinations
destination_iter = itertools.cycle(destinations)
else:
destination_iter = itertools.cycle(other_destinations)
# `destination` is the current remote homeserver we're pulling from.
destination = next(destination_iter)
logger.info("Syncing state for room %s via %s", room_id, destination) logger.info("Syncing state for room %s via %s", room_id, destination)
# we work through the queue in order of increasing stream ordering. # we work through the queue in order of increasing stream ordering.
@ -1511,6 +1546,41 @@ class FederationHandler:
allow_rejected=True, allow_rejected=True,
) )
for event in events: for event in events:
for attempt in itertools.count():
try:
await self._federation_event_handler.update_state_for_partial_state_event( await self._federation_event_handler.update_state_for_partial_state_event(
destination, event destination, event
) )
break
except FederationError as e:
if attempt == len(destinations) - 1:
# We have tried every remote server for this event. Give up.
# TODO(faster_joins) giving up isn't the right thing to do
# if there's a temporary network outage. retrying
# indefinitely is also not the right thing to do if we can
# reach all homeservers and they all claim they don't have
# the state we want.
logger.error(
"Failed to get state for %s at %s from %s because %s, "
"giving up!",
room_id,
event,
destination,
e,
)
raise
# Try the next remote server.
logger.info(
"Failed to get state for %s at %s from %s because %s",
room_id,
event,
destination,
e,
)
destination = next(destination_iter)
logger.info(
"Syncing state for room %s via %s instead",
room_id,
destination,
)

View file

@ -505,6 +505,9 @@ class FederationEventHandler:
Args: Args:
destination: server to request full state from destination: server to request full state from
event: partial-state event to be de-partial-stated event: partial-state event to be de-partial-stated
Raises:
FederationError if we fail to request state from the remote server.
""" """
logger.info("Updating state for %s", event.event_id) logger.info("Updating state for %s", event.event_id)
with nested_logging_context(suffix=event.event_id): with nested_logging_context(suffix=event.event_id):
@ -815,6 +818,10 @@ class FederationEventHandler:
Returns: Returns:
if we already had all the prev events, `None`. Otherwise, returns if we already had all the prev events, `None`. Otherwise, returns
the event ids of the state at `event`. the event ids of the state at `event`.
Raises:
FederationError if we fail to get the state from the remote server after any
missing `prev_event`s.
""" """
room_id = event.room_id room_id = event.room_id
event_id = event.event_id event_id = event.event_id
@ -901,6 +908,10 @@ class FederationEventHandler:
Returns: Returns:
The event ids of the state *after* the given event. The event ids of the state *after* the given event.
Raises:
InvalidResponseError: if the remote homeserver's response contains fields
of the wrong type.
""" """
( (
state_event_ids, state_event_ids,