diff --git a/changelog.d/16903.bugfix b/changelog.d/16903.bugfix
new file mode 100644
index 0000000000..85a909b681
--- /dev/null
+++ b/changelog.d/16903.bugfix
@@ -0,0 +1 @@
+Fix performance issue when joining very large rooms that can cause the server to lock up. Introduced in v1.100.0.
diff --git a/synapse/handlers/federation_event.py b/synapse/handlers/federation_event.py
index bde45308d4..83f6a25981 100644
--- a/synapse/handlers/federation_event.py
+++ b/synapse/handlers/federation_event.py
@@ -1757,17 +1757,25 @@ class FederationEventHandler:
 
             events_and_contexts_to_persist.append((event, context))
 
-        for event in sorted_auth_events:
+        for i, event in enumerate(sorted_auth_events):
             await prep(event)
 
-        await self.persist_events_and_notify(
-            room_id,
-            events_and_contexts_to_persist,
-            # Mark these events backfilled as they're historic events that will
-            # eventually be backfilled. For example, missing events we fetch
-            # during backfill should be marked as backfilled as well.
-            backfilled=True,
-        )
+            # The above function is typically not async, and so won't yield to
+            # the reactor. For large rooms let's yield to the reactor
+            # occasionally to ensure we don't block other work.
+            if (i + 1) % 1000 == 0:
+                await self._clock.sleep(0)
+
+        # Also persist the new event in batches for similar reasons as above.
+        for batch in batch_iter(events_and_contexts_to_persist, 1000):
+            await self.persist_events_and_notify(
+                room_id,
+                batch,
+                # Mark these events as backfilled as they're historic events that will
+                # eventually be backfilled. For example, missing events we fetch
+                # during backfill should be marked as backfilled as well.
+                backfilled=True,
+            )
 
     @trace
     async def _check_event_auth(