From 5d281c10dd3d4d1f96635e92d803a74e3880d6b7 Mon Sep 17 00:00:00 2001 From: Richard van der Hoff <1389908+richvdh@users.noreply.github.com> Date: Wed, 21 Apr 2021 10:03:31 +0100 Subject: [PATCH 1/7] Stop BackgroundProcessLoggingContext making new prometheus timeseries (#9854) This undoes part of b076bc276e881b262048307b6a226061d96c4a8d. --- changelog.d/9854.bugfix | 1 + synapse/metrics/background_process_metrics.py | 20 +++++++++++++++---- synapse/replication/tcp/protocol.py | 2 +- 3 files changed, 18 insertions(+), 5 deletions(-) create mode 100644 changelog.d/9854.bugfix diff --git a/changelog.d/9854.bugfix b/changelog.d/9854.bugfix new file mode 100644 index 0000000000..e39a3f9915 --- /dev/null +++ b/changelog.d/9854.bugfix @@ -0,0 +1 @@ +Fix a regression in Synapse 1.32.0 which caused Synapse to report large numbers of Prometheus time series, potentially overwhelming Prometheus instances. diff --git a/synapse/metrics/background_process_metrics.py b/synapse/metrics/background_process_metrics.py index 78e9cfbc26..3f621539f3 100644 --- a/synapse/metrics/background_process_metrics.py +++ b/synapse/metrics/background_process_metrics.py @@ -16,7 +16,7 @@ import logging import threading from functools import wraps -from typing import TYPE_CHECKING, Dict, Optional, Set +from typing import TYPE_CHECKING, Dict, Optional, Set, Union from prometheus_client.core import REGISTRY, Counter, Gauge @@ -199,7 +199,7 @@ def run_as_background_process(desc: str, func, *args, bg_start_span=True, **kwar _background_process_start_count.labels(desc).inc() _background_process_in_flight_count.labels(desc).inc() - with BackgroundProcessLoggingContext("%s-%s" % (desc, count)) as context: + with BackgroundProcessLoggingContext(desc, count) as context: try: ctx = noop_context_manager() if bg_start_span: @@ -244,8 +244,20 @@ class BackgroundProcessLoggingContext(LoggingContext): __slots__ = ["_proc"] - def __init__(self, name: str): - super().__init__(name) + def __init__(self, name: str, instance_id: Optional[Union[int, str]] = None): + """ + + Args: + name: The name of the background process. Each distinct `name` gets a + separate prometheus time series. + + instance_id: an identifer to add to `name` to distinguish this instance of + the named background process in the logs. If this is `None`, one is + made up based on id(self). + """ + if instance_id is None: + instance_id = id(self) + super().__init__("%s-%s" % (name, instance_id)) self._proc = _BackgroundProcess(name, self) def start(self, rusage: "Optional[resource._RUsage]"): diff --git a/synapse/replication/tcp/protocol.py b/synapse/replication/tcp/protocol.py index ba753318bd..d10d574246 100644 --- a/synapse/replication/tcp/protocol.py +++ b/synapse/replication/tcp/protocol.py @@ -185,7 +185,7 @@ class BaseReplicationStreamProtocol(LineOnlyReceiver): # a logcontext which we use for processing incoming commands. We declare it as a # background process so that the CPU stats get reported to prometheus. self._logging_context = BackgroundProcessLoggingContext( - "replication-conn-%s" % (self.conn_id,) + "replication-conn", self.conn_id ) def connectionMade(self): From 30c94862b4fbaa782e47eac37a673a53feae2bb1 Mon Sep 17 00:00:00 2001 From: Andrew Morgan Date: Tue, 20 Apr 2021 17:11:36 +0100 Subject: [PATCH 2/7] Mention Prometheus metrics regression in v1.32.0 --- CHANGES.md | 6 ++++++ UPGRADE.rst | 9 +++++++++ 2 files changed, 15 insertions(+) diff --git a/CHANGES.md b/CHANGES.md index 170d1e447d..7713328f12 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,6 +1,12 @@ Synapse 1.32.0 (2021-04-20) =========================== +**Note:** This release introduces [a regression](https://githubcom/matrix-org/synapse/issues/9853) +that can overwhelm connected Prometheus instances. This issue was not present in +Synapse v1.32.0rc1. It is recommended not to update to this release. If you have +upgraded to v1.32.0 already, please downgrade to v1.31.0. This issue will be +resolved in a subsequent release version shortly. + **Note:** This release requires Python 3.6+ and Postgres 9.6+ or SQLite 3.22+. This release removes the deprecated `GET /_synapse/admin/v1/users/` admin API. Please use the [v2 API](https://github.com/matrix-org/synapse/blob/develop/docs/admin_api/user_admin_api.rst#query-user-account) instead, which has improved capabilities. diff --git a/UPGRADE.rst b/UPGRADE.rst index 7a9b869055..c8dce62227 100644 --- a/UPGRADE.rst +++ b/UPGRADE.rst @@ -88,6 +88,15 @@ for example: Upgrading to v1.32.0 ==================== +Regression causing connected Prometheus instances to become overwhelmed +----------------------------------------------------------------------- + +This release introduces `a regression `_ +that can overwhelm connected Prometheus instances. This issue was not present in +Synapse v1.32.0rc1. It is recommended not to update to this release. If you have +upgraded to v1.32.0 already, please downgrade to v1.31.0. This issue will be +resolved in a subsequent release version shortly. + Dropping support for old Python, Postgres and SQLite versions ------------------------------------------------------------- From a745531c10da75079fcac152935bc2ff505eec14 Mon Sep 17 00:00:00 2001 From: Andrew Morgan Date: Wed, 21 Apr 2021 14:01:12 +0100 Subject: [PATCH 3/7] 1.32.1 --- CHANGES.md | 9 +++++++++ changelog.d/9854.bugfix | 1 - debian/changelog | 6 ++++++ synapse/__init__.py | 2 +- 4 files changed, 16 insertions(+), 2 deletions(-) delete mode 100644 changelog.d/9854.bugfix diff --git a/CHANGES.md b/CHANGES.md index 7713328f12..65819ee1e1 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,3 +1,12 @@ +Synapse 1.32.1 (2021-04-21) +=========================== + +Bugfixes +-------- + +- Fix a regression in Synapse 1.32.0 which caused Synapse to report large numbers of Prometheus time series, potentially overwhelming Prometheus instances. ([\#9854](https://github.com/matrix-org/synapse/issues/9854)) + + Synapse 1.32.0 (2021-04-20) =========================== diff --git a/changelog.d/9854.bugfix b/changelog.d/9854.bugfix deleted file mode 100644 index e39a3f9915..0000000000 --- a/changelog.d/9854.bugfix +++ /dev/null @@ -1 +0,0 @@ -Fix a regression in Synapse 1.32.0 which caused Synapse to report large numbers of Prometheus time series, potentially overwhelming Prometheus instances. diff --git a/debian/changelog b/debian/changelog index 83be4497ec..b8cf2cac58 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,9 @@ +matrix-synapse-py3 (1.32.1) stable; urgency=medium + + * New synapse release 1.32.1. + + -- Synapse Packaging team Wed, 21 Apr 2021 14:00:55 +0100 + matrix-synapse-py3 (1.32.0) stable; urgency=medium [ Dan Callahan ] diff --git a/synapse/__init__.py b/synapse/__init__.py index 79232c4de1..a0332d602d 100644 --- a/synapse/__init__.py +++ b/synapse/__init__.py @@ -48,7 +48,7 @@ try: except ImportError: pass -__version__ = "1.32.0" +__version__ = "1.32.1" if bool(os.environ.get("SYNAPSE_TEST_PATCH_LOG_CONTEXTS", False)): # We import here so that we don't have to install a bunch of deps when From 026a66f2b37998a6e62d07267e6a114f87dc9b84 Mon Sep 17 00:00:00 2001 From: Andrew Morgan Date: Wed, 21 Apr 2021 14:04:44 +0100 Subject: [PATCH 4/7] Fix typo in link to regression in 1.32.0 upgrade notes --- UPGRADE.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/UPGRADE.rst b/UPGRADE.rst index c8dce62227..0be6e27cc0 100644 --- a/UPGRADE.rst +++ b/UPGRADE.rst @@ -91,7 +91,7 @@ Upgrading to v1.32.0 Regression causing connected Prometheus instances to become overwhelmed ----------------------------------------------------------------------- -This release introduces `a regression `_ +This release introduces `a regression `_ that can overwhelm connected Prometheus instances. This issue was not present in Synapse v1.32.0rc1. It is recommended not to update to this release. If you have upgraded to v1.32.0 already, please downgrade to v1.31.0. This issue will be From 98a1b84631e5fbc227f211821d9bce318dc921d8 Mon Sep 17 00:00:00 2001 From: Andrew Morgan Date: Wed, 21 Apr 2021 14:07:51 +0100 Subject: [PATCH 5/7] Add link to fixing prometheus to 1.32.0 upgrade notes; 1.32.1 has a fix --- CHANGES.md | 2 ++ UPGRADE.rst | 6 ++++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 65819ee1e1..d06bcd9ae8 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,6 +1,8 @@ Synapse 1.32.1 (2021-04-21) =========================== +This release fixes the regression introduced in Synapse + Bugfixes -------- diff --git a/UPGRADE.rst b/UPGRADE.rst index 0be6e27cc0..d4651ec2d3 100644 --- a/UPGRADE.rst +++ b/UPGRADE.rst @@ -94,8 +94,10 @@ Regression causing connected Prometheus instances to become overwhelmed This release introduces `a regression `_ that can overwhelm connected Prometheus instances. This issue was not present in Synapse v1.32.0rc1. It is recommended not to update to this release. If you have -upgraded to v1.32.0 already, please downgrade to v1.31.0. This issue will be -resolved in a subsequent release version shortly. +upgraded to v1.32.0 already, please upgrade to v1.31.1 which contains a fix. +If you started Synapse on v1.32.0, you may need to remove excess writeahead logs +in order for Prometheus to recover; instructions for doing so are +`here `_. Dropping support for old Python, Postgres and SQLite versions ------------------------------------------------------------- From acb8c81041aa66e03b22150e457cfd2c7a44d436 Mon Sep 17 00:00:00 2001 From: Andrew Morgan Date: Wed, 21 Apr 2021 14:24:16 +0100 Subject: [PATCH 6/7] Add regression notes to CHANGES.md; fix link in 1.32.0 changelog --- CHANGES.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index d06bcd9ae8..cc66f2f01d 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,7 +1,11 @@ Synapse 1.32.1 (2021-04-21) =========================== -This release fixes the regression introduced in Synapse +This release fixes [a regression](https://github.com/matrix-org/synapse/issues/9853) +in Synapse 1.32.0 that caused connected Prometheus instances to become unstable. If you +ran Synapse 1.32.0 with Prometheus metrics, first upgrade to Synapse 1.32.1 and follow +[these instructions](https://github.com/matrix-org/synapse/pull/9854#issuecomment-823472183) +to clean up any excess writeahead logs. Bugfixes -------- @@ -12,7 +16,7 @@ Bugfixes Synapse 1.32.0 (2021-04-20) =========================== -**Note:** This release introduces [a regression](https://githubcom/matrix-org/synapse/issues/9853) +**Note:** This release introduces [a regression](https://github.com/matrix-org/synapse/issues/9853) that can overwhelm connected Prometheus instances. This issue was not present in Synapse v1.32.0rc1. It is recommended not to update to this release. If you have upgraded to v1.32.0 already, please downgrade to v1.31.0. This issue will be From bdb4c20dc1ed4b5c88ac2889d1a02a32a9dd8a1f Mon Sep 17 00:00:00 2001 From: Andrew Morgan Date: Wed, 21 Apr 2021 14:44:04 +0100 Subject: [PATCH 7/7] Clarify 1.32.0/1 changelog and upgrade notes --- CHANGES.md | 4 +--- UPGRADE.rst | 11 ++++++----- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index cc66f2f01d..7188f94445 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -18,9 +18,7 @@ Synapse 1.32.0 (2021-04-20) **Note:** This release introduces [a regression](https://github.com/matrix-org/synapse/issues/9853) that can overwhelm connected Prometheus instances. This issue was not present in -Synapse v1.32.0rc1. It is recommended not to update to this release. If you have -upgraded to v1.32.0 already, please downgrade to v1.31.0. This issue will be -resolved in a subsequent release version shortly. +1.32.0rc1, and is fixed in 1.32.1. See the changelog for 1.32.1 above for more information. **Note:** This release requires Python 3.6+ and Postgres 9.6+ or SQLite 3.22+. diff --git a/UPGRADE.rst b/UPGRADE.rst index d4651ec2d3..76d2ee394f 100644 --- a/UPGRADE.rst +++ b/UPGRADE.rst @@ -92,11 +92,12 @@ Regression causing connected Prometheus instances to become overwhelmed ----------------------------------------------------------------------- This release introduces `a regression `_ -that can overwhelm connected Prometheus instances. This issue was not present in -Synapse v1.32.0rc1. It is recommended not to update to this release. If you have -upgraded to v1.32.0 already, please upgrade to v1.31.1 which contains a fix. -If you started Synapse on v1.32.0, you may need to remove excess writeahead logs -in order for Prometheus to recover; instructions for doing so are +that can overwhelm connected Prometheus instances. This issue is not present in +Synapse v1.32.0rc1, and is fixed in Synapse v1.32.1. + +If you have been affected, please first upgrade to a more recent Synapse version. +You then may need to remove excess writeahead logs in order for Prometheus to recover. +Instructions for doing so are provided `here `_. Dropping support for old Python, Postgres and SQLite versions