From c1ef579b63a39d8d6fb31ddc1d3cf173eaf7e5b7 Mon Sep 17 00:00:00 2001 From: Richard van der Hoff <1389908+richvdh@users.noreply.github.com> Date: Thu, 1 Oct 2020 11:09:12 +0100 Subject: [PATCH] Add prometheus metrics to track federation delays (#8430) Add a pair of federation metrics to track the delays in sending PDUs to/from particular servers. --- changelog.d/8430.feature | 1 + docs/sample_config.yaml | 12 +++++++++ synapse/config/_util.py | 6 +++-- synapse/config/federation.py | 27 ++++++++++++++++++- synapse/config/homeserver.py | 1 - synapse/config/tls.py | 1 - synapse/federation/federation_server.py | 24 ++++++++++++++++- .../federation/sender/transaction_manager.py | 22 +++++++++++++++ 8 files changed, 88 insertions(+), 6 deletions(-) create mode 100644 changelog.d/8430.feature diff --git a/changelog.d/8430.feature b/changelog.d/8430.feature new file mode 100644 index 0000000000..1f31d42bc1 --- /dev/null +++ b/changelog.d/8430.feature @@ -0,0 +1 @@ +Add prometheus metrics to track federation delays. diff --git a/docs/sample_config.yaml b/docs/sample_config.yaml index 066844b5a9..8a3206e845 100644 --- a/docs/sample_config.yaml +++ b/docs/sample_config.yaml @@ -629,6 +629,7 @@ acme: #tls_fingerprints: [{"sha256": ""}] +## Federation ## # Restrict federation to the following whitelist of domains. # N.B. we recommend also firewalling your federation listener to limit @@ -662,6 +663,17 @@ federation_ip_range_blacklist: - 'fe80::/64' - 'fc00::/7' +# Report prometheus metrics on the age of PDUs being sent to and received from +# the following domains. This can be used to give an idea of "delay" on inbound +# and outbound federation, though be aware that any delay can be due to problems +# at either end or with the intermediate network. +# +# By default, no domains are monitored in this way. +# +#federation_metrics_domains: +# - matrix.org +# - example.com + ## Caching ## diff --git a/synapse/config/_util.py b/synapse/config/_util.py index cd31b1c3c9..c74969a977 100644 --- a/synapse/config/_util.py +++ b/synapse/config/_util.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Any, List +from typing import Any, Iterable import jsonschema @@ -20,7 +20,9 @@ from synapse.config._base import ConfigError from synapse.types import JsonDict -def validate_config(json_schema: JsonDict, config: Any, config_path: List[str]) -> None: +def validate_config( + json_schema: JsonDict, config: Any, config_path: Iterable[str] +) -> None: """Validates a config setting against a JsonSchema definition This can be used to validate a section of the config file against a schema diff --git a/synapse/config/federation.py b/synapse/config/federation.py index 2c77d8f85b..ffd8fca54e 100644 --- a/synapse/config/federation.py +++ b/synapse/config/federation.py @@ -17,7 +17,8 @@ from typing import Optional from netaddr import IPSet -from ._base import Config, ConfigError +from synapse.config._base import Config, ConfigError +from synapse.config._util import validate_config class FederationConfig(Config): @@ -52,8 +53,18 @@ class FederationConfig(Config): "Invalid range(s) provided in federation_ip_range_blacklist: %s" % e ) + federation_metrics_domains = config.get("federation_metrics_domains") or [] + validate_config( + _METRICS_FOR_DOMAINS_SCHEMA, + federation_metrics_domains, + ("federation_metrics_domains",), + ) + self.federation_metrics_domains = set(federation_metrics_domains) + def generate_config_section(self, config_dir_path, server_name, **kwargs): return """\ + ## Federation ## + # Restrict federation to the following whitelist of domains. # N.B. we recommend also firewalling your federation listener to limit # inbound federation traffic as early as possible, rather than relying @@ -85,4 +96,18 @@ class FederationConfig(Config): - '::1/128' - 'fe80::/64' - 'fc00::/7' + + # Report prometheus metrics on the age of PDUs being sent to and received from + # the following domains. This can be used to give an idea of "delay" on inbound + # and outbound federation, though be aware that any delay can be due to problems + # at either end or with the intermediate network. + # + # By default, no domains are monitored in this way. + # + #federation_metrics_domains: + # - matrix.org + # - example.com """ + + +_METRICS_FOR_DOMAINS_SCHEMA = {"type": "array", "items": {"type": "string"}} diff --git a/synapse/config/homeserver.py b/synapse/config/homeserver.py index 556e291495..be65554524 100644 --- a/synapse/config/homeserver.py +++ b/synapse/config/homeserver.py @@ -92,5 +92,4 @@ class HomeServerConfig(RootConfig): TracerConfig, WorkerConfig, RedisConfig, - FederationConfig, ] diff --git a/synapse/config/tls.py b/synapse/config/tls.py index e368ea564d..9ddb8b546b 100644 --- a/synapse/config/tls.py +++ b/synapse/config/tls.py @@ -471,7 +471,6 @@ class TlsConfig(Config): # or by checking matrix.org/federationtester/api/report?server_name=$host # #tls_fingerprints: [{"sha256": ""}] - """ # Lowercase the string representation of boolean values % { diff --git a/synapse/federation/federation_server.py b/synapse/federation/federation_server.py index 2dcd081cbc..24329dd0e3 100644 --- a/synapse/federation/federation_server.py +++ b/synapse/federation/federation_server.py @@ -28,7 +28,7 @@ from typing import ( Union, ) -from prometheus_client import Counter, Histogram +from prometheus_client import Counter, Gauge, Histogram from twisted.internet import defer from twisted.internet.abstract import isIPAddress @@ -88,6 +88,13 @@ pdu_process_time = Histogram( ) +last_pdu_age_metric = Gauge( + "synapse_federation_last_received_pdu_age", + "The age (in seconds) of the last PDU successfully received from the given domain", + labelnames=("server_name",), +) + + class FederationServer(FederationBase): def __init__(self, hs): super().__init__(hs) @@ -118,6 +125,10 @@ class FederationServer(FederationBase): hs, "state_ids_resp", timeout_ms=30000 ) + self._federation_metrics_domains = ( + hs.get_config().federation.federation_metrics_domains + ) + async def on_backfill_request( self, origin: str, room_id: str, versions: List[str], limit: int ) -> Tuple[int, Dict[str, Any]]: @@ -262,7 +273,11 @@ class FederationServer(FederationBase): pdus_by_room = {} # type: Dict[str, List[EventBase]] + newest_pdu_ts = 0 + for p in transaction.pdus: # type: ignore + # FIXME (richardv): I don't think this works: + # https://github.com/matrix-org/synapse/issues/8429 if "unsigned" in p: unsigned = p["unsigned"] if "age" in unsigned: @@ -300,6 +315,9 @@ class FederationServer(FederationBase): event = event_from_pdu_json(p, room_version) pdus_by_room.setdefault(room_id, []).append(event) + if event.origin_server_ts > newest_pdu_ts: + newest_pdu_ts = event.origin_server_ts + pdu_results = {} # we can process different rooms in parallel (which is useful if they @@ -340,6 +358,10 @@ class FederationServer(FederationBase): process_pdus_for_room, pdus_by_room.keys(), TRANSACTION_CONCURRENCY_LIMIT ) + if newest_pdu_ts and origin in self._federation_metrics_domains: + newest_pdu_age = self._clock.time_msec() - newest_pdu_ts + last_pdu_age_metric.labels(server_name=origin).set(newest_pdu_age / 1000) + return pdu_results async def _handle_edus_in_txn(self, origin: str, transaction: Transaction): diff --git a/synapse/federation/sender/transaction_manager.py b/synapse/federation/sender/transaction_manager.py index c84072ab73..3e07f925e0 100644 --- a/synapse/federation/sender/transaction_manager.py +++ b/synapse/federation/sender/transaction_manager.py @@ -15,6 +15,8 @@ import logging from typing import TYPE_CHECKING, List +from prometheus_client import Gauge + from synapse.api.errors import HttpResponseException from synapse.events import EventBase from synapse.federation.persistence import TransactionActions @@ -34,6 +36,12 @@ if TYPE_CHECKING: logger = logging.getLogger(__name__) +last_pdu_age_metric = Gauge( + "synapse_federation_last_sent_pdu_age", + "The age (in seconds) of the last PDU successfully sent to the given domain", + labelnames=("server_name",), +) + class TransactionManager: """Helper class which handles building and sending transactions @@ -48,6 +56,10 @@ class TransactionManager: self._transaction_actions = TransactionActions(self._store) self._transport_layer = hs.get_federation_transport_client() + self._federation_metrics_domains = ( + hs.get_config().federation.federation_metrics_domains + ) + # HACK to get unique tx id self._next_txn_id = int(self.clock.time_msec()) @@ -119,6 +131,9 @@ class TransactionManager: # FIXME (erikj): This is a bit of a hack to make the Pdu age # keys work + # FIXME (richardv): I also believe it no longer works. We (now?) store + # "age_ts" in "unsigned" rather than at the top level. See + # https://github.com/matrix-org/synapse/issues/8429. def json_data_cb(): data = transaction.get_dict() now = int(self.clock.time_msec()) @@ -167,5 +182,12 @@ class TransactionManager: ) success = False + if success and pdus and destination in self._federation_metrics_domains: + last_pdu = pdus[-1] + last_pdu_age = self.clock.time_msec() - last_pdu.origin_server_ts + last_pdu_age_metric.labels(server_name=destination).set( + last_pdu_age / 1000 + ) + set_tag(tags.ERROR, not success) return success