From 4cf9f92f395e8c448b94eccb48fbfe2e7e61d7cd Mon Sep 17 00:00:00 2001 From: Jason Little Date: Wed, 5 Jul 2023 05:44:02 -0500 Subject: [PATCH] Fix could not serialize access due to concurrent `DELETE` from presence_stream (#15826) * Change update_presence to have a isolation level of READ_COMMITTED * changelog --- changelog.d/15826.misc | 1 + synapse/storage/databases/main/presence.py | 7 ++++++- 2 files changed, 7 insertions(+), 1 deletion(-) create mode 100644 changelog.d/15826.misc diff --git a/changelog.d/15826.misc b/changelog.d/15826.misc new file mode 100644 index 0000000000..88903f3f7c --- /dev/null +++ b/changelog.d/15826.misc @@ -0,0 +1 @@ +Use lower isolation level when cleaning old presence stream data to avoid serialization errors. diff --git a/synapse/storage/databases/main/presence.py b/synapse/storage/databases/main/presence.py index beb210f8ee..b51d20ac26 100644 --- a/synapse/storage/databases/main/presence.py +++ b/synapse/storage/databases/main/presence.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, cast from synapse.api.presence import PresenceState, UserPresenceState @@ -24,6 +23,7 @@ from synapse.storage.database import ( ) from synapse.storage.databases.main.cache import CacheInvalidationWorkerStore from synapse.storage.engines import PostgresEngine +from synapse.storage.engines._base import IsolationLevel from synapse.storage.types import Connection from synapse.storage.util.id_generators import ( AbstractStreamIdGenerator, @@ -115,11 +115,16 @@ class PresenceStore(PresenceBackgroundUpdateStore, CacheInvalidationWorkerStore) ) async with stream_ordering_manager as stream_orderings: + # Run the interaction with an isolation level of READ_COMMITTED to avoid + # serialization errors(and rollbacks) in the database. This way it will + # ignore new rows during the DELETE, but will pick them up the next time + # this is run. Currently, that is between 5-60 seconds. await self.db_pool.runInteraction( "update_presence", self._update_presence_txn, stream_orderings, presence_states, + isolation_level=IsolationLevel.READ_COMMITTED, ) return stream_orderings[-1], self._presence_id_gen.get_current_token()