From d84e66144dc12dacf71c987a2ba802dd59c0b68e Mon Sep 17 00:00:00 2001 From: Shay Date: Fri, 9 Jun 2023 00:00:46 -0700 Subject: [PATCH] Allow for the configuration of max request retries and min/max retry delays in the matrix federation client (#12504) Co-authored-by: Mathieu Velten Co-authored-by: Erik Johnston --- changelog.d/12504.misc | 1 + .../configuration/config_documentation.md | 26 +++++++++++++++++++ synapse/config/federation.py | 10 +++++++ synapse/http/matrixfederationclient.py | 21 ++++++++------- tests/http/test_matrixfederationclient.py | 20 +++++++++++++- 5 files changed, 68 insertions(+), 10 deletions(-) create mode 100644 changelog.d/12504.misc diff --git a/changelog.d/12504.misc b/changelog.d/12504.misc new file mode 100644 index 0000000000..0bebaa213d --- /dev/null +++ b/changelog.d/12504.misc @@ -0,0 +1 @@ +Allow for the configuration of max request retries and min/max retry delays in the matrix federation client. diff --git a/docs/usage/configuration/config_documentation.md b/docs/usage/configuration/config_documentation.md index 0cf6e075ff..8426de0417 100644 --- a/docs/usage/configuration/config_documentation.md +++ b/docs/usage/configuration/config_documentation.md @@ -1196,6 +1196,32 @@ Example configuration: allow_device_name_lookup_over_federation: true ``` --- +### `federation` + +The federation section defines some sub-options related to federation. + +The following options are related to configuring timeout and retry logic for one request, +independently of the others. +Short retry algorithm is used when something or someone will wait for the request to have an +answer, while long retry is used for requests that happen in the background, +like sending a federation transaction. + +* `client_timeout`: timeout for the federation requests in seconds. Default to 60s. +* `max_short_retry_delay`: maximum delay to be used for the short retry algo in seconds. Default to 2s. +* `max_long_retry_delay`: maximum delay to be used for the short retry algo in seconds. Default to 60s. +* `max_short_retries`: maximum number of retries for the short retry algo. Default to 3 attempts. +* `max_long_retries`: maximum number of retries for the long retry algo. Default to 10 attempts. + +Example configuration: +```yaml +federation: + client_timeout: 180 + max_short_retry_delay: 7 + max_long_retry_delay: 100 + max_short_retries: 5 + max_long_retries: 20 +``` +--- ## Caching Options related to caching. diff --git a/synapse/config/federation.py b/synapse/config/federation.py index 336fca578a..d21f7fd02a 100644 --- a/synapse/config/federation.py +++ b/synapse/config/federation.py @@ -22,6 +22,8 @@ class FederationConfig(Config): section = "federation" def read_config(self, config: JsonDict, **kwargs: Any) -> None: + federation_config = config.setdefault("federation", {}) + # FIXME: federation_domain_whitelist needs sytests self.federation_domain_whitelist: Optional[dict] = None federation_domain_whitelist = config.get("federation_domain_whitelist", None) @@ -49,5 +51,13 @@ class FederationConfig(Config): "allow_device_name_lookup_over_federation", False ) + # Allow for the configuration of timeout, max request retries + # and min/max retry delays in the matrix federation client. + self.client_timeout = federation_config.get("client_timeout", 60) + self.max_long_retry_delay = federation_config.get("max_long_retry_delay", 60) + self.max_short_retry_delay = federation_config.get("max_short_retry_delay", 2) + self.max_long_retries = federation_config.get("max_long_retries", 10) + self.max_short_retries = federation_config.get("max_short_retries", 3) + _METRICS_FOR_DOMAINS_SCHEMA = {"type": "array", "items": {"type": "string"}} diff --git a/synapse/http/matrixfederationclient.py b/synapse/http/matrixfederationclient.py index abb5ae5815..ed36825b67 100644 --- a/synapse/http/matrixfederationclient.py +++ b/synapse/http/matrixfederationclient.py @@ -95,8 +95,6 @@ incoming_responses_counter = Counter( ) -MAX_LONG_RETRIES = 10 -MAX_SHORT_RETRIES = 3 MAXINT = sys.maxsize @@ -406,7 +404,12 @@ class MatrixFederationHttpClient: self.clock = hs.get_clock() self._store = hs.get_datastores().main self.version_string_bytes = hs.version_string.encode("ascii") - self.default_timeout = 60 + self.default_timeout = hs.config.federation.client_timeout + + self.max_long_retry_delay = hs.config.federation.max_long_retry_delay + self.max_short_retry_delay = hs.config.federation.max_short_retry_delay + self.max_long_retries = hs.config.federation.max_long_retries + self.max_short_retries = hs.config.federation.max_short_retries self._cooperator = Cooperator(scheduler=_make_scheduler(self.reactor)) @@ -583,9 +586,9 @@ class MatrixFederationHttpClient: # XXX: Would be much nicer to retry only at the transaction-layer # (once we have reliable transactions in place) if long_retries: - retries_left = MAX_LONG_RETRIES + retries_left = self.max_long_retries else: - retries_left = MAX_SHORT_RETRIES + retries_left = self.max_short_retries url_bytes = request.uri url_str = url_bytes.decode("ascii") @@ -730,12 +733,12 @@ class MatrixFederationHttpClient: if retries_left and not timeout: if long_retries: - delay = 4 ** (MAX_LONG_RETRIES + 1 - retries_left) - delay = min(delay, 60) + delay = 4 ** (self.max_long_retries + 1 - retries_left) + delay = min(delay, self.max_long_retry_delay) delay *= random.uniform(0.8, 1.4) else: - delay = 0.5 * 2 ** (MAX_SHORT_RETRIES - retries_left) - delay = min(delay, 2) + delay = 0.5 * 2 ** (self.max_short_retries - retries_left) + delay = min(delay, self.max_short_retry_delay) delay *= random.uniform(0.8, 1.4) logger.debug( diff --git a/tests/http/test_matrixfederationclient.py b/tests/http/test_matrixfederationclient.py index 0dfc03ce50..8565f8ac64 100644 --- a/tests/http/test_matrixfederationclient.py +++ b/tests/http/test_matrixfederationclient.py @@ -40,7 +40,7 @@ from synapse.server import HomeServer from synapse.util import Clock from tests.server import FakeTransport -from tests.unittest import HomeserverTestCase +from tests.unittest import HomeserverTestCase, override_config def check_logcontext(context: LoggingContextOrSentinel) -> None: @@ -640,3 +640,21 @@ class FederationClientTests(HomeserverTestCase): self.cl.build_auth_headers( b"", b"GET", b"https://example.com", destination_is=b"" ) + + @override_config( + { + "federation": { + "client_timeout": 180, + "max_long_retry_delay": 100, + "max_short_retry_delay": 7, + "max_long_retries": 20, + "max_short_retries": 5, + } + } + ) + def test_configurable_retry_and_delay_values(self) -> None: + self.assertEqual(self.cl.default_timeout, 180) + self.assertEqual(self.cl.max_long_retry_delay, 100) + self.assertEqual(self.cl.max_short_retry_delay, 7) + self.assertEqual(self.cl.max_long_retries, 20) + self.assertEqual(self.cl.max_short_retries, 5)