From be4250c7a888e314e361df42042bfa344ab65d55 Mon Sep 17 00:00:00 2001 From: reivilibre Date: Wed, 24 Aug 2022 11:35:54 +0000 Subject: [PATCH] Add experimental configuration option to allow disabling legacy Prometheus metric names. (#13540) Co-authored-by: David Robertson --- changelog.d/13540.misc | 1 + synapse/app/_base.py | 39 +++++++++++++++++-- synapse/app/generic_worker.py | 6 ++- synapse/app/homeserver.py | 6 ++- synapse/config/metrics.py | 29 ++++++++++++++ synapse/metrics/__init__.py | 4 +- .../{_exposition.py => _legacy_exposition.py} | 34 +++++++++++++--- synapse/util/caches/__init__.py | 16 ++++---- tests/test_metrics.py | 36 +++++++++++++++++ 9 files changed, 150 insertions(+), 21 deletions(-) create mode 100644 changelog.d/13540.misc rename synapse/metrics/{_exposition.py => _legacy_exposition.py} (85%) diff --git a/changelog.d/13540.misc b/changelog.d/13540.misc new file mode 100644 index 0000000000..07ace50b12 --- /dev/null +++ b/changelog.d/13540.misc @@ -0,0 +1 @@ +Add experimental configuration option to allow disabling legacy Prometheus metric names. \ No newline at end of file diff --git a/synapse/app/_base.py b/synapse/app/_base.py index 923891ae0d..4742435d3b 100644 --- a/synapse/app/_base.py +++ b/synapse/app/_base.py @@ -266,15 +266,48 @@ def register_start( reactor.callWhenRunning(lambda: defer.ensureDeferred(wrapper())) -def listen_metrics(bind_addresses: Iterable[str], port: int) -> None: +def listen_metrics( + bind_addresses: Iterable[str], port: int, enable_legacy_metric_names: bool +) -> None: """ Start Prometheus metrics server. """ - from synapse.metrics import RegistryProxy, start_http_server + from prometheus_client import start_http_server as start_http_server_prometheus + + from synapse.metrics import ( + RegistryProxy, + start_http_server as start_http_server_legacy, + ) for host in bind_addresses: logger.info("Starting metrics listener on %s:%d", host, port) - start_http_server(port, addr=host, registry=RegistryProxy) + if enable_legacy_metric_names: + start_http_server_legacy(port, addr=host, registry=RegistryProxy) + else: + _set_prometheus_client_use_created_metrics(False) + start_http_server_prometheus(port, addr=host, registry=RegistryProxy) + + +def _set_prometheus_client_use_created_metrics(new_value: bool) -> None: + """ + Sets whether prometheus_client should expose `_created`-suffixed metrics for + all gauges, histograms and summaries. + There is no programmatic way to disable this without poking at internals; + the proper way is to use an environment variable which prometheus_client + loads at import time. + + The motivation for disabling these `_created` metrics is that they're + a waste of space as they're not useful but they take up space in Prometheus. + """ + + import prometheus_client.metrics + + if hasattr(prometheus_client.metrics, "_use_created"): + prometheus_client.metrics._use_created = new_value + else: + logger.error( + "Can't disable `_created` metrics in prometheus_client (brittle hack broken?)" + ) def listen_manhole( diff --git a/synapse/app/generic_worker.py b/synapse/app/generic_worker.py index 30e21d9707..5e3825fca6 100644 --- a/synapse/app/generic_worker.py +++ b/synapse/app/generic_worker.py @@ -412,7 +412,11 @@ class GenericWorkerServer(HomeServer): "enable_metrics is not True!" ) else: - _base.listen_metrics(listener.bind_addresses, listener.port) + _base.listen_metrics( + listener.bind_addresses, + listener.port, + enable_legacy_metric_names=self.config.metrics.enable_legacy_metrics, + ) else: logger.warning("Unsupported listener type: %s", listener.type) diff --git a/synapse/app/homeserver.py b/synapse/app/homeserver.py index 68993d91a9..e57a926032 100644 --- a/synapse/app/homeserver.py +++ b/synapse/app/homeserver.py @@ -307,7 +307,11 @@ class SynapseHomeServer(HomeServer): "enable_metrics is not True!" ) else: - _base.listen_metrics(listener.bind_addresses, listener.port) + _base.listen_metrics( + listener.bind_addresses, + listener.port, + enable_legacy_metric_names=self.config.metrics.enable_legacy_metrics, + ) else: # this shouldn't happen, as the listener type should have been checked # during parsing diff --git a/synapse/config/metrics.py b/synapse/config/metrics.py index 3b42be5b5b..f3134834e5 100644 --- a/synapse/config/metrics.py +++ b/synapse/config/metrics.py @@ -42,6 +42,35 @@ class MetricsConfig(Config): def read_config(self, config: JsonDict, **kwargs: Any) -> None: self.enable_metrics = config.get("enable_metrics", False) + + """ + ### `enable_legacy_metrics` (experimental) + + **Experimental: this option may be removed or have its behaviour + changed at any time, with no notice.** + + Set to `true` to publish both legacy and non-legacy Prometheus metric names, + or to `false` to only publish non-legacy Prometheus metric names. + Defaults to `true`. Has no effect if `enable_metrics` is `false`. + + Legacy metric names include: + - metrics containing colons in the name, such as `synapse_util_caches_response_cache:hits`, because colons are supposed to be reserved for user-defined recording rules; + - counters that don't end with the `_total` suffix, such as `synapse_federation_client_sent_edus`, therefore not adhering to the OpenMetrics standard. + + These legacy metric names are unconventional and not compliant with OpenMetrics standards. + They are included for backwards compatibility. + + Example configuration: + ```yaml + enable_legacy_metrics: false + ``` + + See https://github.com/matrix-org/synapse/issues/11106 for context. + + *Since v1.67.0.* + """ + self.enable_legacy_metrics = config.get("enable_legacy_metrics", True) + self.report_stats = config.get("report_stats", None) self.report_stats_endpoint = config.get( "report_stats_endpoint", "https://matrix.org/report-usage-stats/push" diff --git a/synapse/metrics/__init__.py b/synapse/metrics/__init__.py index 496fce2ecc..c3d3daf877 100644 --- a/synapse/metrics/__init__.py +++ b/synapse/metrics/__init__.py @@ -46,12 +46,12 @@ from twisted.python.threadpool import ThreadPool # This module is imported for its side effects; flake8 needn't warn that it's unused. import synapse.metrics._reactor_metrics # noqa: F401 -from synapse.metrics._exposition import ( +from synapse.metrics._gc import MIN_TIME_BETWEEN_GCS, install_gc_manager +from synapse.metrics._legacy_exposition import ( MetricsResource, generate_latest, start_http_server, ) -from synapse.metrics._gc import MIN_TIME_BETWEEN_GCS, install_gc_manager from synapse.metrics._types import Collector from synapse.util import SYNAPSE_VERSION diff --git a/synapse/metrics/_exposition.py b/synapse/metrics/_legacy_exposition.py similarity index 85% rename from synapse/metrics/_exposition.py rename to synapse/metrics/_legacy_exposition.py index 353d0a63b6..ff640a49af 100644 --- a/synapse/metrics/_exposition.py +++ b/synapse/metrics/_legacy_exposition.py @@ -80,7 +80,27 @@ def sample_line(line: Sample, name: str) -> str: return "{}{} {}{}\n".format(name, labelstr, floatToGoString(line.value), timestamp) +# Mapping from new metric names to legacy metric names. +# We translate these back to their old names when exposing them through our +# legacy vendored exporter. +# Only this legacy exposition module applies these name changes. +LEGACY_METRIC_NAMES = { + "synapse_util_caches_cache_hits": "synapse_util_caches_cache:hits", + "synapse_util_caches_cache_size": "synapse_util_caches_cache:size", + "synapse_util_caches_cache_evicted_size": "synapse_util_caches_cache:evicted_size", + "synapse_util_caches_cache_total": "synapse_util_caches_cache:total", + "synapse_util_caches_response_cache_size": "synapse_util_caches_response_cache:size", + "synapse_util_caches_response_cache_hits": "synapse_util_caches_response_cache:hits", + "synapse_util_caches_response_cache_evicted_size": "synapse_util_caches_response_cache:evicted_size", + "synapse_util_caches_response_cache_total": "synapse_util_caches_response_cache:total", +} + + def generate_latest(registry: CollectorRegistry, emit_help: bool = False) -> bytes: + """ + Generate metrics in legacy format. Modern metrics are generated directly + by prometheus-client. + """ # Trigger the cache metrics to be rescraped, which updates the common # metrics but do not produce metrics themselves @@ -94,7 +114,8 @@ def generate_latest(registry: CollectorRegistry, emit_help: bool = False) -> byt # No samples, don't bother. continue - mname = metric.name + # Translate to legacy metric name if it has one. + mname = LEGACY_METRIC_NAMES.get(metric.name, metric.name) mnewname = metric.name mtype = metric.type @@ -124,7 +145,7 @@ def generate_latest(registry: CollectorRegistry, emit_help: bool = False) -> byt om_samples: Dict[str, List[str]] = {} for s in metric.samples: for suffix in ["_created", "_gsum", "_gcount"]: - if s.name == metric.name + suffix: + if s.name == mname + suffix: # OpenMetrics specific sample, put in a gauge at the end. # (these come from gaugehistograms which don't get renamed, # so no need to faff with mnewname) @@ -140,12 +161,12 @@ def generate_latest(registry: CollectorRegistry, emit_help: bool = False) -> byt if emit_help: output.append( "# HELP {}{} {}\n".format( - metric.name, + mname, suffix, metric.documentation.replace("\\", r"\\").replace("\n", r"\n"), ) ) - output.append(f"# TYPE {metric.name}{suffix} gauge\n") + output.append(f"# TYPE {mname}{suffix} gauge\n") output.extend(lines) # Get rid of the weird colon things while we're at it @@ -170,11 +191,12 @@ def generate_latest(registry: CollectorRegistry, emit_help: bool = False) -> byt # Get rid of the OpenMetrics specific samples (we should already have # dealt with them above anyway.) for suffix in ["_created", "_gsum", "_gcount"]: - if s.name == metric.name + suffix: + if s.name == mname + suffix: break else: + sample_name = LEGACY_METRIC_NAMES.get(s.name, s.name) output.append( - sample_line(s, s.name.replace(":total", "").replace(":", "_")) + sample_line(s, sample_name.replace(":total", "").replace(":", "_")) ) return "".join(output).encode("utf-8") diff --git a/synapse/util/caches/__init__.py b/synapse/util/caches/__init__.py index 42f6abb5e1..bdf9b0dc8c 100644 --- a/synapse/util/caches/__init__.py +++ b/synapse/util/caches/__init__.py @@ -34,10 +34,10 @@ TRACK_MEMORY_USAGE = False caches_by_name: Dict[str, Sized] = {} collectors_by_name: Dict[str, "CacheMetric"] = {} -cache_size = Gauge("synapse_util_caches_cache:size", "", ["name"]) -cache_hits = Gauge("synapse_util_caches_cache:hits", "", ["name"]) -cache_evicted = Gauge("synapse_util_caches_cache:evicted_size", "", ["name", "reason"]) -cache_total = Gauge("synapse_util_caches_cache:total", "", ["name"]) +cache_size = Gauge("synapse_util_caches_cache_size", "", ["name"]) +cache_hits = Gauge("synapse_util_caches_cache_hits", "", ["name"]) +cache_evicted = Gauge("synapse_util_caches_cache_evicted_size", "", ["name", "reason"]) +cache_total = Gauge("synapse_util_caches_cache_total", "", ["name"]) cache_max_size = Gauge("synapse_util_caches_cache_max_size", "", ["name"]) cache_memory_usage = Gauge( "synapse_util_caches_cache_size_bytes", @@ -45,12 +45,12 @@ cache_memory_usage = Gauge( ["name"], ) -response_cache_size = Gauge("synapse_util_caches_response_cache:size", "", ["name"]) -response_cache_hits = Gauge("synapse_util_caches_response_cache:hits", "", ["name"]) +response_cache_size = Gauge("synapse_util_caches_response_cache_size", "", ["name"]) +response_cache_hits = Gauge("synapse_util_caches_response_cache_hits", "", ["name"]) response_cache_evicted = Gauge( - "synapse_util_caches_response_cache:evicted_size", "", ["name", "reason"] + "synapse_util_caches_response_cache_evicted_size", "", ["name", "reason"] ) -response_cache_total = Gauge("synapse_util_caches_response_cache:total", "", ["name"]) +response_cache_total = Gauge("synapse_util_caches_response_cache_total", "", ["name"]) class EvictionReason(Enum): diff --git a/tests/test_metrics.py b/tests/test_metrics.py index b4574b2ffe..1a70eddc9b 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -12,7 +12,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +try: + from importlib import metadata +except ImportError: + import importlib_metadata as metadata # type: ignore[no-redef] +from unittest.mock import patch + +from pkg_resources import parse_version + +from synapse.app._base import _set_prometheus_client_use_created_metrics from synapse.metrics import REGISTRY, InFlightGauge, generate_latest from synapse.util.caches.deferred_cache import DeferredCache @@ -162,3 +171,30 @@ class CacheMetricsTests(unittest.HomeserverTestCase): self.assertEqual(items["synapse_util_caches_cache_size"], "1.0") self.assertEqual(items["synapse_util_caches_cache_max_size"], "777.0") + + +class PrometheusMetricsHackTestCase(unittest.HomeserverTestCase): + if parse_version(metadata.version("prometheus_client")) < parse_version("0.14.0"): + skip = "prometheus-client too old" + + def test_created_metrics_disabled(self) -> None: + """ + Tests that a brittle hack, to disable `_created` metrics, works. + This involves poking at the internals of prometheus-client. + It's not the end of the world if this doesn't work. + + This test gives us a way to notice if prometheus-client changes + their internals. + """ + import prometheus_client.metrics + + PRIVATE_FLAG_NAME = "_use_created" + + # By default, the pesky `_created` metrics are enabled. + # Check this assumption is still valid. + self.assertTrue(getattr(prometheus_client.metrics, PRIVATE_FLAG_NAME)) + + with patch("prometheus_client.metrics") as mock: + setattr(mock, PRIVATE_FLAG_NAME, True) + _set_prometheus_client_use_created_metrics(False) + self.assertFalse(getattr(mock, PRIVATE_FLAG_NAME, False))