From eacb068ac2a6df8494d9f7255c80f4429e779209 Mon Sep 17 00:00:00 2001
From: Erik Johnston
Date: Mon, 2 Nov 2015 16:49:05 +0000
Subject: [PATCH] Retry dead servers a lot less often

Back off harder from destinations that are not responding:

* matrixfederationclient: start with MAX_RETRIES (4, previously a
  hard-coded 5) retries per request and sleep for
  5 ** (MAX_RETRIES + 1 - retries_left), scaled by a random factor in
  [0.8, 1.4], instead of 2 ** (5 - retries_left).
* retryutils: change the RetryDestinationLimiter defaults to
  min_retry_interval=10 * 60 * 1000,
  max_retry_interval=24 * 60 * 60 * 1000 and
  multiplier_retry_interval=5, and apply the same random factor every
  time the interval is grown.

The random factor is applied to the interval first and the product is
then truncated to an integer.  Truncating the factor on its own would
yield only 0 or 1, which either zeroes the interval or leaves it
without any jitter.
---
Review notes (text in this section is ignored by git am):
 - Line breaks, blank context lines and context indentation were
   reconstructed from the hunk headers; verify them against the target
   tree before applying.
 - The blob ids on the "index" lines are those of the original patch and
   do not reflect the corrected jitter line in retryutils.py.

 synapse/http/matrixfederationclient.py | 10 ++++++++--
 synapse/util/retryutils.py             |  7 +++++--
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/synapse/http/matrixfederationclient.py b/synapse/http/matrixfederationclient.py
index b50a0c445c..6e53538a52 100644
--- a/synapse/http/matrixfederationclient.py
+++ b/synapse/http/matrixfederationclient.py
@@ -35,6 +35,7 @@ from signedjson.sign import sign_json
 
 import simplejson as json
 import logging
+import random
 import sys
 import urllib
 import urlparse
@@ -55,6 +56,9 @@ incoming_responses_counter = metrics.register_counter(
 )
 
 
+MAX_RETRIES = 4
+
+
 class MatrixFederationEndpointFactory(object):
     def __init__(self, hs):
         self.tls_server_context_factory = hs.tls_server_context_factory
@@ -119,7 +123,7 @@ class MatrixFederationHttpClient(object):
 
         # XXX: Would be much nicer to retry only at the transaction-layer
         # (once we have reliable transactions in place)
-        retries_left = 5
+        retries_left = MAX_RETRIES
 
         http_url_bytes = urlparse.urlunparse(
             ("", "", path_bytes, param_bytes, query_bytes, "")
@@ -180,7 +184,9 @@ class MatrixFederationHttpClient(object):
                     )
 
                     if retries_left and not timeout:
-                        yield sleep(2 ** (5 - retries_left))
+                        delay = 5 ** (MAX_RETRIES + 1 - retries_left)
+                        delay *= random.uniform(0.8, 1.4)
+                        yield sleep(delay)
                         retries_left -= 1
                     else:
                         raise
diff --git a/synapse/util/retryutils.py b/synapse/util/retryutils.py
index a42138f556..2fe6814807 100644
--- a/synapse/util/retryutils.py
+++ b/synapse/util/retryutils.py
@@ -18,6 +18,7 @@ from twisted.internet import defer
 from synapse.api.errors import CodeMessageException
 
 import logging
+import random
 
 
 logger = logging.getLogger(__name__)
@@ -85,8 +86,9 @@ def get_retry_limiter(destination, clock, store, **kwargs):
 
 
 class RetryDestinationLimiter(object):
     def __init__(self, destination, clock, store, retry_interval,
-                 min_retry_interval=5000, max_retry_interval=60 * 60 * 1000,
-                 multiplier_retry_interval=2,):
+                 min_retry_interval=10 * 60 * 1000,
+                 max_retry_interval=24 * 60 * 60 * 1000,
+                 multiplier_retry_interval=5,):
         """Marks the destination as "down" if an exception is thrown in the
         context, except for CodeMessageException with code < 500.
 
@@ -140,6 +142,7 @@ class RetryDestinationLimiter(object):
             # We couldn't connect.
             if self.retry_interval:
                 self.retry_interval *= self.multiplier_retry_interval
+                self.retry_interval = int(self.retry_interval * random.uniform(0.8, 1.4))
 
                 if self.retry_interval >= self.max_retry_interval:
                     self.retry_interval = self.max_retry_interval