From 8be5d7c13e4d15ffe39bbcd5e01a2bd3455d51fc Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 26 Aug 2022 12:40:07 -0500 Subject: [PATCH 1/4] Fix rate limit metrics registering twice and misreporting Fix https://github.com/matrix-org/synapse/issues/13641 --- synapse/server.py | 4 +- synapse/util/ratelimitutils.py | 138 +++++++++++++++++++++++++-------- 2 files changed, 110 insertions(+), 32 deletions(-) diff --git a/synapse/server.py b/synapse/server.py index 181984a1a491..c2e55bf0b125 100644 --- a/synapse/server.py +++ b/synapse/server.py @@ -756,7 +756,9 @@ def get_replication_streams(self) -> Dict[str, Stream]: @cache_in_self def get_federation_ratelimiter(self) -> FederationRateLimiter: return FederationRateLimiter( - self.get_clock(), config=self.config.ratelimiting.rc_federation + self.get_clock(), + config=self.config.ratelimiting.rc_federation, + metrics_name="federation_servlets", ) @cache_in_self diff --git a/synapse/util/ratelimitutils.py b/synapse/util/ratelimitutils.py index f678b52cb4f5..eda7f23deb26 100644 --- a/synapse/util/ratelimitutils.py +++ b/synapse/util/ratelimitutils.py @@ -15,8 +15,9 @@ import collections import contextlib import logging +import threading import typing -from typing import Any, DefaultDict, Iterator, List, Set +from typing import Any, DefaultDict, Dict, Iterator, List, Mapping, Optional, Set, Tuple from prometheus_client.core import Counter @@ -40,12 +41,20 @@ # Track how much the ratelimiter is affecting requests -rate_limit_sleep_counter = Counter("synapse_rate_limit_sleep", "") -rate_limit_reject_counter = Counter("synapse_rate_limit_reject", "") +rate_limit_sleep_counter = Counter( + "synapse_rate_limit_sleep", + "Number of requests slept by the rate limiter", + ["rate_limiter_name"], +) +rate_limit_reject_counter = Counter( + "synapse_rate_limit_reject", + "Number of requests rejected by the rate limiter", + ["rate_limiter_name"], +) queue_wait_timer = Histogram( "synapse_rate_limit_queue_wait_time_seconds", - "sec", - [], + "Amount of time spent waiting for the rate limiter to let our request through.", + ["rate_limiter_name"], buckets=( 0.005, 0.01, @@ -65,35 +74,89 @@ ) +_rate_limiter_instances: Set["FederationRateLimiter"] = set() +# Protects the _rate_limiter_instances set from concurrent access +_rate_limiter_instances_lock = threading.Lock() + + +def _get_counts_from_rate_limiter_instance(count_func) -> Mapping[Tuple[str, ...], int]: + """Returns a count of something (slept/rejected hosts) by (metrics_name)""" + # Cast to a list to prevent it changing while the Prometheus + # thread is collecting metrics + with _rate_limiter_instances_lock: + rate_limiter_instances = list(_rate_limiter_instances) + + # Map from (metrics_name,) -> int, the number of something like slept hosts + # or rejected hosts. The key type is Tuple[str], but we leave the length + # unspecified for compatability with LaterGauge's annotations. + counts: Dict[Tuple[str, ...], int] = {} + for rate_limiter_instance in rate_limiter_instances: + # Only track metrics if they provided a `metrics_name` to + # differentiate this instance of the rate limiter. + if rate_limiter_instance.metrics_name: + key = (rate_limiter_instance.metrics_name,) + counts[key] = count_func(rate_limiter_instance) + + return counts + + +# We track the number of affected hosts per time-period so we can +# differentiate one really noisy homeserver from a general +# ratelimit tuning problem across the federation. +LaterGauge( + "synapse_rate_limit_sleep_affected_hosts", + "Number of hosts that had requests put to sleep", + ["rate_limiter_name"], + lambda: _get_counts_from_rate_limiter_instance( + lambda rate_limiter_instance: sum( + ratelimiter.should_sleep() + for ratelimiter in rate_limiter_instance.ratelimiters.values() + ) + ), +) +LaterGauge( + "synapse_rate_limit_reject_affected_hosts", + "Number of hosts that had requests rejected", + ["rate_limiter_name"], + lambda: _get_counts_from_rate_limiter_instance( + lambda rate_limiter_instance: sum( + ratelimiter.should_reject() + for ratelimiter in rate_limiter_instance.ratelimiters.values() + ) + ), +) + + class FederationRateLimiter: - def __init__(self, clock: Clock, config: FederationRatelimitSettings): + """Used to rate limit request per-host.""" + + def __init__( + self, + clock: Clock, + config: FederationRatelimitSettings, + metrics_name: Optional[str], + ): + """ + Args: + clock + config + metrics_name: The name of the rate limiter so we can differentiate it + from the rest in the metrics + + """ + self.metrics_name = metrics_name + def new_limiter() -> "_PerHostRatelimiter": - return _PerHostRatelimiter(clock=clock, config=config) + return _PerHostRatelimiter( + clock=clock, config=config, metrics_name=metrics_name + ) self.ratelimiters: DefaultDict[ str, "_PerHostRatelimiter" ] = collections.defaultdict(new_limiter) - # We track the number of affected hosts per time-period so we can - # differentiate one really noisy homeserver from a general - # ratelimit tuning problem across the federation. - LaterGauge( - "synapse_rate_limit_sleep_affected_hosts", - "Number of hosts that had requests put to sleep", - [], - lambda: sum( - ratelimiter.should_sleep() for ratelimiter in self.ratelimiters.values() - ), - ) - LaterGauge( - "synapse_rate_limit_reject_affected_hosts", - "Number of hosts that had requests rejected", - [], - lambda: sum( - ratelimiter.should_reject() - for ratelimiter in self.ratelimiters.values() - ), - ) + with _rate_limiter_instances_lock: + _rate_limiter_instances.add(self) def ratelimit(self, host: str) -> "_GeneratorContextManager[defer.Deferred[None]]": """Used to ratelimit an incoming request from a given host @@ -114,13 +177,21 @@ def ratelimit(self, host: str) -> "_GeneratorContextManager[defer.Deferred[None] class _PerHostRatelimiter: - def __init__(self, clock: Clock, config: FederationRatelimitSettings): + def __init__( + self, + clock: Clock, + config: FederationRatelimitSettings, + metrics_name: Optional[str], + ): """ Args: clock config + metrics_name: The name of the rate limiter so we can differentiate it + from the rest in the metrics """ self.clock = clock + self.metrics_name = metrics_name self.window_size = config.window_size self.sleep_limit = config.sleep_limit @@ -178,7 +249,10 @@ def should_sleep(self) -> bool: return len(self.request_times) > self.sleep_limit async def _on_enter_with_tracing(self, request_id: object) -> None: - with start_active_span("ratelimit wait"), queue_wait_timer.time(): + maybe_metrics_cm = contextlib.nullcontext() + if self.metrics_name: + maybe_metrics_cm = queue_wait_timer.labels(self.metrics_name).time() + with start_active_span("ratelimit wait"), maybe_metrics_cm: await self._on_enter(request_id) def _on_enter(self, request_id: object) -> "defer.Deferred[None]": @@ -193,7 +267,8 @@ def _on_enter(self, request_id: object) -> "defer.Deferred[None]": # sleeping or in the ready queue). if self.should_reject(): logger.debug("Ratelimiter(%s): rejecting request", self.host) - rate_limit_reject_counter.inc() + if self.metrics_name: + rate_limit_reject_counter.labels(self.metrics_name).inc() raise LimitExceededError( retry_after_ms=int(self.window_size / self.sleep_limit) ) @@ -228,7 +303,8 @@ def queue_request() -> "defer.Deferred[None]": id(request_id), self.sleep_sec, ) - rate_limit_sleep_counter.inc() + if self.metrics_name: + rate_limit_sleep_counter.labels(self.metrics_name).inc() ret_defer = run_in_background(self.clock.sleep, self.sleep_sec) self.sleeping_requests.add(request_id) From d8ec11aacab96df28a5ae128ca07ee6493457ea8 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 26 Aug 2022 13:07:25 -0500 Subject: [PATCH 2/4] Fix lints --- synapse/util/ratelimitutils.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/synapse/util/ratelimitutils.py b/synapse/util/ratelimitutils.py index eda7f23deb26..8fd260aa9a9f 100644 --- a/synapse/util/ratelimitutils.py +++ b/synapse/util/ratelimitutils.py @@ -17,9 +17,21 @@ import logging import threading import typing -from typing import Any, DefaultDict, Dict, Iterator, List, Mapping, Optional, Set, Tuple +from typing import ( + Any, + Callable, + DefaultDict, + Dict, + Iterator, + List, + Mapping, + Optional, + Set, + Tuple, +) from prometheus_client.core import Counter +from typing_extensions import ContextManager from twisted.internet import defer @@ -79,7 +91,9 @@ _rate_limiter_instances_lock = threading.Lock() -def _get_counts_from_rate_limiter_instance(count_func) -> Mapping[Tuple[str, ...], int]: +def _get_counts_from_rate_limiter_instance( + count_func: Callable[["FederationRateLimiter"], int] +) -> Mapping[Tuple[str, ...], int]: """Returns a count of something (slept/rejected hosts) by (metrics_name)""" # Cast to a list to prevent it changing while the Prometheus # thread is collecting metrics @@ -134,7 +148,7 @@ def __init__( self, clock: Clock, config: FederationRatelimitSettings, - metrics_name: Optional[str], + metrics_name: Optional[str] = None, ): """ Args: @@ -181,7 +195,7 @@ def __init__( self, clock: Clock, config: FederationRatelimitSettings, - metrics_name: Optional[str], + metrics_name: Optional[str] = None, ): """ Args: @@ -249,7 +263,7 @@ def should_sleep(self) -> bool: return len(self.request_times) > self.sleep_limit async def _on_enter_with_tracing(self, request_id: object) -> None: - maybe_metrics_cm = contextlib.nullcontext() + maybe_metrics_cm: ContextManager = contextlib.nullcontext() if self.metrics_name: maybe_metrics_cm = queue_wait_timer.labels(self.metrics_name).time() with start_active_span("ratelimit wait"), maybe_metrics_cm: From 7921b1c799c1f516bd4105b22f509b9bc6b0c7c5 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 26 Aug 2022 13:59:18 -0500 Subject: [PATCH 3/4] Add changelog --- changelog.d/13649.bugfix | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog.d/13649.bugfix diff --git a/changelog.d/13649.bugfix b/changelog.d/13649.bugfix new file mode 100644 index 000000000000..e6513a585aa5 --- /dev/null +++ b/changelog.d/13649.bugfix @@ -0,0 +1 @@ +Fix rate limit gauge metrics registering twice and misreporting (`synapse_rate_limit_sleep_affected_hosts`, `synapse_rate_limit_reject_affected_hosts`). From b077d815e95da208ed981338793ded9d21c2e644 Mon Sep 17 00:00:00 2001 From: David Robertson Date: Tue, 30 Aug 2022 12:07:23 +0100 Subject: [PATCH 4/4] Document `metrics_name=None`. --- synapse/util/ratelimitutils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/synapse/util/ratelimitutils.py b/synapse/util/ratelimitutils.py index 8fd260aa9a9f..9f64fed0d764 100644 --- a/synapse/util/ratelimitutils.py +++ b/synapse/util/ratelimitutils.py @@ -155,7 +155,8 @@ def __init__( clock config metrics_name: The name of the rate limiter so we can differentiate it - from the rest in the metrics + from the rest in the metrics. If `None`, we don't track metrics + for this rate limiter. """ self.metrics_name = metrics_name @@ -202,6 +203,8 @@ def __init__( clock config metrics_name: The name of the rate limiter so we can differentiate it + from the rest in the metrics. If `None`, we don't track metrics + for this rate limiter. from the rest in the metrics """ self.clock = clock