Add admin endpoint to query room sizes (#15482)

This commit is contained in:
Erik Johnston 2023-04-26 17:00:11 +01:00 committed by GitHub
parent 710502c6d8
commit 9900f7c231
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 195 additions and 1 deletions

View file

@ -0,0 +1 @@
Add admin endpoint to query the largest rooms by disk space used in the database.

View file

@ -81,3 +81,52 @@ The following fields are returned in the JSON response body:
- `user_id` - string - Fully-qualified user ID (ex. `@user:server.com`). - `user_id` - string - Fully-qualified user ID (ex. `@user:server.com`).
* `next_token` - integer - Opaque value used for pagination. See above. * `next_token` - integer - Opaque value used for pagination. See above.
* `total` - integer - Total number of users after filtering. * `total` - integer - Total number of users after filtering.
# Get largest rooms by size in database
Returns the 10 largest rooms and an estimate of how much space in the database
they are taking.
This does not include the size of any associated media associated with the room.
Returns an error on SQLite.
*Note:* This uses the planner statistics from PostgreSQL to do the estimates,
which means that the returned information can vary widely from reality. However,
it should be enough to get a rough idea of where database disk space is going.
The API is:
```
GET /_synapse/admin/v1/statistics/statistics/database/rooms
```
A response body like the following is returned:
```json
{
"rooms": [
{
"room_id": "!OGEhHVWSdvArJzumhm:matrix.org",
"estimated_size": 47325417353
}
],
}
```
**Response**
The following fields are returned in the JSON response body:
* `rooms` - An array of objects, sorted by largest room first. Objects contain
the following fields:
- `room_id` - string - The room ID.
- `estimated_size` - integer - Estimated disk space used in bytes by the room
in the database.
*Added in Synapse 1.83.0*

View file

@ -68,7 +68,10 @@ from synapse.rest.admin.rooms import (
RoomTimestampToEventRestServlet, RoomTimestampToEventRestServlet,
) )
from synapse.rest.admin.server_notice_servlet import SendServerNoticeServlet from synapse.rest.admin.server_notice_servlet import SendServerNoticeServlet
from synapse.rest.admin.statistics import UserMediaStatisticsRestServlet from synapse.rest.admin.statistics import (
LargestRoomsStatistics,
UserMediaStatisticsRestServlet,
)
from synapse.rest.admin.username_available import UsernameAvailableRestServlet from synapse.rest.admin.username_available import UsernameAvailableRestServlet
from synapse.rest.admin.users import ( from synapse.rest.admin.users import (
AccountDataRestServlet, AccountDataRestServlet,
@ -259,6 +262,7 @@ def register_servlets(hs: "HomeServer", http_server: HttpServer) -> None:
UserRestServletV2(hs).register(http_server) UserRestServletV2(hs).register(http_server)
UsersRestServletV2(hs).register(http_server) UsersRestServletV2(hs).register(http_server)
UserMediaStatisticsRestServlet(hs).register(http_server) UserMediaStatisticsRestServlet(hs).register(http_server)
LargestRoomsStatistics(hs).register(http_server)
EventReportDetailRestServlet(hs).register(http_server) EventReportDetailRestServlet(hs).register(http_server)
EventReportsRestServlet(hs).register(http_server) EventReportsRestServlet(hs).register(http_server)
AccountDataRestServlet(hs).register(http_server) AccountDataRestServlet(hs).register(http_server)

View file

@ -113,3 +113,28 @@ class UserMediaStatisticsRestServlet(RestServlet):
ret["next_token"] = start + len(users_media) ret["next_token"] = start + len(users_media)
return HTTPStatus.OK, ret return HTTPStatus.OK, ret
class LargestRoomsStatistics(RestServlet):
"""Get the largest rooms by database size.
Only works when using PostgreSQL.
"""
PATTERNS = admin_patterns("/statistics/database/rooms$")
def __init__(self, hs: "HomeServer"):
self.auth = hs.get_auth()
self.stats_controller = hs.get_storage_controllers().stats
async def on_GET(self, request: SynapseRequest) -> Tuple[int, JsonDict]:
await assert_requester_is_admin(self.auth, request)
room_sizes = await self.stats_controller.get_room_db_size_estimate()
return HTTPStatus.OK, {
"rooms": [
{"room_id": room_id, "estimated_size": size}
for room_id, size in room_sizes
]
}

View file

@ -19,6 +19,7 @@ from synapse.storage.controllers.persist_events import (
) )
from synapse.storage.controllers.purge_events import PurgeEventsStorageController from synapse.storage.controllers.purge_events import PurgeEventsStorageController
from synapse.storage.controllers.state import StateStorageController from synapse.storage.controllers.state import StateStorageController
from synapse.storage.controllers.stats import StatsController
from synapse.storage.databases import Databases from synapse.storage.databases import Databases
from synapse.storage.databases.main import DataStore from synapse.storage.databases.main import DataStore
@ -40,6 +41,7 @@ class StorageControllers:
self.purge_events = PurgeEventsStorageController(hs, stores) self.purge_events = PurgeEventsStorageController(hs, stores)
self.state = StateStorageController(hs, stores) self.state = StateStorageController(hs, stores)
self.stats = StatsController(hs, stores)
self.persistence = None self.persistence = None
if stores.persist_events: if stores.persist_events:

View file

@ -0,0 +1,113 @@
# Copyright 2023 The Matrix.org Foundation C.I.C.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from collections import Counter
from typing import TYPE_CHECKING, Collection, List, Tuple
from synapse.api.errors import SynapseError
from synapse.storage.database import LoggingTransaction
from synapse.storage.databases import Databases
from synapse.storage.engines import PostgresEngine
if TYPE_CHECKING:
from synapse.server import HomeServer
logger = logging.getLogger(__name__)
class StatsController:
"""High level interface for getting statistics."""
def __init__(self, hs: "HomeServer", stores: Databases):
self.stores = stores
async def get_room_db_size_estimate(self) -> List[Tuple[str, int]]:
"""Get an estimate of the largest rooms and how much database space they
use, in bytes.
Only works against PostgreSQL.
Note: this uses the postgres statistics so is a very rough estimate.
"""
# Note: We look at both tables on the main and state databases.
if not isinstance(self.stores.main.database_engine, PostgresEngine):
raise SynapseError(400, "Endpoint requires using PostgreSQL")
if not isinstance(self.stores.state.database_engine, PostgresEngine):
raise SynapseError(400, "Endpoint requires using PostgreSQL")
# For each "large" table, we go through and get the largest rooms
# and an estimate of how much space they take. We can then sum the
# results and return the top 10.
#
# This isn't the most accurate, but given all of these are estimates
# anyway its good enough.
room_estimates: Counter[str] = Counter()
# Return size of the table on disk, including indexes and TOAST.
table_sql = """
SELECT pg_total_relation_size(?)
"""
# Get an estimate for the largest rooms and their frequency.
#
# Note: the cast here is a hack to cast from `anyarray` to an actual
# type. This ensures that psycopg2 passes us a back a a Python list.
column_sql = """
SELECT
most_common_vals::TEXT::TEXT[], most_common_freqs::TEXT::NUMERIC[]
FROM pg_stats
WHERE tablename = ? and attname = 'room_id'
"""
def get_room_db_size_estimate_txn(
txn: LoggingTransaction,
tables: Collection[str],
) -> None:
for table in tables:
txn.execute(table_sql, (table,))
row = txn.fetchone()
assert row is not None
(table_size,) = row
txn.execute(column_sql, (table,))
row = txn.fetchone()
assert row is not None
vals, freqs = row
for room_id, freq in zip(vals, freqs):
room_estimates[room_id] += int(freq * table_size)
await self.stores.main.db_pool.runInteraction(
"get_room_db_size_estimate_main",
get_room_db_size_estimate_txn,
(
"event_json",
"events",
"event_search",
"event_edges",
"event_push_actions",
"stream_ordering_to_exterm",
),
)
await self.stores.state.db_pool.runInteraction(
"get_room_db_size_estimate_state",
get_room_db_size_estimate_txn,
("state_groups_state",),
)
return room_estimates.most_common(10)