Make the metrics less racy (#4061)

This commit is contained in:
Amber Brown 2018-10-19 21:45:45 +11:00 committed by GitHub
parent 6a4d01ee94
commit b69216f768
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 22 additions and 16 deletions

1
changelog.d/4061.bugfix Normal file
View file

@ -0,0 +1 @@
Fix some metrics being racy and causing exceptions when polled by Prometheus.

View file

@ -39,7 +39,8 @@ outgoing_responses_counter = Counter(
) )
response_timer = Histogram( response_timer = Histogram(
"synapse_http_server_response_time_seconds", "sec", "synapse_http_server_response_time_seconds",
"sec",
["method", "servlet", "tag", "code"], ["method", "servlet", "tag", "code"],
) )
@ -79,15 +80,11 @@ response_size = Counter(
# than when the response was written. # than when the response was written.
in_flight_requests_ru_utime = Counter( in_flight_requests_ru_utime = Counter(
"synapse_http_server_in_flight_requests_ru_utime_seconds", "synapse_http_server_in_flight_requests_ru_utime_seconds", "", ["method", "servlet"]
"",
["method", "servlet"],
) )
in_flight_requests_ru_stime = Counter( in_flight_requests_ru_stime = Counter(
"synapse_http_server_in_flight_requests_ru_stime_seconds", "synapse_http_server_in_flight_requests_ru_stime_seconds", "", ["method", "servlet"]
"",
["method", "servlet"],
) )
in_flight_requests_db_txn_count = Counter( in_flight_requests_db_txn_count = Counter(
@ -134,7 +131,7 @@ def _get_in_flight_counts():
# type # type
counts = {} counts = {}
for rm in reqs: for rm in reqs:
key = (rm.method, rm.name,) key = (rm.method, rm.name)
counts[key] = counts.get(key, 0) + 1 counts[key] = counts.get(key, 0) + 1
return counts return counts
@ -175,7 +172,8 @@ class RequestMetrics(object):
if context != self.start_context: if context != self.start_context:
logger.warn( logger.warn(
"Context have unexpectedly changed %r, %r", "Context have unexpectedly changed %r, %r",
context, self.start_context context,
self.start_context,
) )
return return
@ -192,10 +190,10 @@ class RequestMetrics(object):
resource_usage = context.get_resource_usage() resource_usage = context.get_resource_usage()
response_ru_utime.labels(self.method, self.name, tag).inc( response_ru_utime.labels(self.method, self.name, tag).inc(
resource_usage.ru_utime, resource_usage.ru_utime
) )
response_ru_stime.labels(self.method, self.name, tag).inc( response_ru_stime.labels(self.method, self.name, tag).inc(
resource_usage.ru_stime, resource_usage.ru_stime
) )
response_db_txn_count.labels(self.method, self.name, tag).inc( response_db_txn_count.labels(self.method, self.name, tag).inc(
resource_usage.db_txn_count resource_usage.db_txn_count
@ -222,8 +220,15 @@ class RequestMetrics(object):
diff = new_stats - self._request_stats diff = new_stats - self._request_stats
self._request_stats = new_stats self._request_stats = new_stats
in_flight_requests_ru_utime.labels(self.method, self.name).inc(diff.ru_utime) # max() is used since rapid use of ru_stime/ru_utime can end up with the
in_flight_requests_ru_stime.labels(self.method, self.name).inc(diff.ru_stime) # count going backwards due to NTP, time smearing, fine-grained
# correction, or floating points. Who knows, really?
in_flight_requests_ru_utime.labels(self.method, self.name).inc(
max(diff.ru_utime, 0)
)
in_flight_requests_ru_stime.labels(self.method, self.name).inc(
max(diff.ru_stime, 0)
)
in_flight_requests_db_txn_count.labels(self.method, self.name).inc( in_flight_requests_db_txn_count.labels(self.method, self.name).inc(
diff.db_txn_count diff.db_txn_count

View file

@ -186,9 +186,9 @@ class Notifier(object):
def count_listeners(): def count_listeners():
all_user_streams = set() all_user_streams = set()
for x in self.room_to_user_streams.values(): for x in list(self.room_to_user_streams.values()):
all_user_streams |= x all_user_streams |= x
for x in self.user_to_user_stream.values(): for x in list(self.user_to_user_stream.values()):
all_user_streams.add(x) all_user_streams.add(x)
return sum(stream.count_listeners() for stream in all_user_streams) return sum(stream.count_listeners() for stream in all_user_streams)
@ -196,7 +196,7 @@ class Notifier(object):
LaterGauge( LaterGauge(
"synapse_notifier_rooms", "", [], "synapse_notifier_rooms", "", [],
lambda: count(bool, self.room_to_user_streams.values()), lambda: count(bool, list(self.room_to_user_streams.values())),
) )
LaterGauge( LaterGauge(
"synapse_notifier_users", "", [], "synapse_notifier_users", "", [],