From 6c821fa249963677f14f82213776f2ebbf893d81 Mon Sep 17 00:00:00 2001 From: Richard van der Hoff Date: Mon, 10 Jan 2022 18:25:32 +0000 Subject: [PATCH 1/8] Move the GC metrics out to their own file Move the manual GC code into its own function, and then move that, and all the other GC stuff, out to a separate file. --- synapse/metrics/__init__.py | 155 +--------------------------- synapse/metrics/_gc.py | 195 ++++++++++++++++++++++++++++++++++++ 2 files changed, 198 insertions(+), 152 deletions(-) create mode 100644 synapse/metrics/_gc.py diff --git a/synapse/metrics/__init__.py b/synapse/metrics/__init__.py index ceef57ad883f..ba6dba66d1fe 100644 --- a/synapse/metrics/__init__.py +++ b/synapse/metrics/__init__.py @@ -41,7 +41,6 @@ from prometheus_client import CollectorRegistry, Counter, Gauge, Histogram, Metric from prometheus_client.core import ( REGISTRY, - CounterMetricFamily, GaugeHistogramMetricFamily, GaugeMetricFamily, ) @@ -56,6 +55,7 @@ generate_latest, start_http_server, ) +from synapse.metrics._gc import MIN_TIME_BETWEEN_GCS, maybe_gc from synapse.util.versionstring import get_version_string logger = logging.getLogger(__name__) @@ -369,121 +369,6 @@ def collect(self) -> Iterable[Metric]: REGISTRY.register(CPUMetrics()) -# -# Python GC metrics -# - -gc_unreachable = Gauge("python_gc_unreachable_total", "Unreachable GC objects", ["gen"]) -gc_time = Histogram( - "python_gc_time", - "Time taken to GC (sec)", - ["gen"], - buckets=[ - 0.0025, - 0.005, - 0.01, - 0.025, - 0.05, - 0.10, - 0.25, - 0.50, - 1.00, - 2.50, - 5.00, - 7.50, - 15.00, - 30.00, - 45.00, - 60.00, - ], -) - - -class GCCounts: - def collect(self) -> Iterable[Metric]: - cm = GaugeMetricFamily("python_gc_counts", "GC object counts", labels=["gen"]) - for n, m in enumerate(gc.get_count()): - cm.add_metric([str(n)], m) - - yield cm - - -if not running_on_pypy: - REGISTRY.register(GCCounts()) - - -# -# PyPy GC / memory metrics -# - - -class PyPyGCStats: - def collect(self) -> Iterable[Metric]: - - # @stats is a pretty-printer object with __str__() returning a nice table, - # plus some fields that contain data from that table. - # unfortunately, fields are pretty-printed themselves (i. e. '4.5MB'). - stats = gc.get_stats(memory_pressure=False) # type: ignore - # @s contains same fields as @stats, but as actual integers. - s = stats._s # type: ignore - - # also note that field naming is completely braindead - # and only vaguely correlates with the pretty-printed table. 
- # >>>> gc.get_stats(False) - # Total memory consumed: - # GC used: 8.7MB (peak: 39.0MB) # s.total_gc_memory, s.peak_memory - # in arenas: 3.0MB # s.total_arena_memory - # rawmalloced: 1.7MB # s.total_rawmalloced_memory - # nursery: 4.0MB # s.nursery_size - # raw assembler used: 31.0kB # s.jit_backend_used - # ----------------------------- - # Total: 8.8MB # stats.memory_used_sum - # - # Total memory allocated: - # GC allocated: 38.7MB (peak: 41.1MB) # s.total_allocated_memory, s.peak_allocated_memory - # in arenas: 30.9MB # s.peak_arena_memory - # rawmalloced: 4.1MB # s.peak_rawmalloced_memory - # nursery: 4.0MB # s.nursery_size - # raw assembler allocated: 1.0MB # s.jit_backend_allocated - # ----------------------------- - # Total: 39.7MB # stats.memory_allocated_sum - # - # Total time spent in GC: 0.073 # s.total_gc_time - - pypy_gc_time = CounterMetricFamily( - "pypy_gc_time_seconds_total", - "Total time spent in PyPy GC", - labels=[], - ) - pypy_gc_time.add_metric([], s.total_gc_time / 1000) - yield pypy_gc_time - - pypy_mem = GaugeMetricFamily( - "pypy_memory_bytes", - "Memory tracked by PyPy allocator", - labels=["state", "class", "kind"], - ) - # memory used by JIT assembler - pypy_mem.add_metric(["used", "", "jit"], s.jit_backend_used) - pypy_mem.add_metric(["allocated", "", "jit"], s.jit_backend_allocated) - # memory used by GCed objects - pypy_mem.add_metric(["used", "", "arenas"], s.total_arena_memory) - pypy_mem.add_metric(["allocated", "", "arenas"], s.peak_arena_memory) - pypy_mem.add_metric(["used", "", "rawmalloced"], s.total_rawmalloced_memory) - pypy_mem.add_metric(["allocated", "", "rawmalloced"], s.peak_rawmalloced_memory) - pypy_mem.add_metric(["used", "", "nursery"], s.nursery_size) - pypy_mem.add_metric(["allocated", "", "nursery"], s.nursery_size) - # totals - pypy_mem.add_metric(["used", "totals", "gc"], s.total_gc_memory) - pypy_mem.add_metric(["allocated", "totals", "gc"], s.total_allocated_memory) - pypy_mem.add_metric(["used", "totals", "gc_peak"], s.peak_memory) - pypy_mem.add_metric(["allocated", "totals", "gc_peak"], s.peak_allocated_memory) - yield pypy_mem - - -if running_on_pypy: - REGISTRY.register(PyPyGCStats()) - # # Twisted reactor metrics @@ -612,14 +497,6 @@ def collect(self) -> Iterable[Metric]: REGISTRY.register(ReactorLastSeenMetric()) -# The minimum time in seconds between GCs for each generation, regardless of the current GC -# thresholds and counts. -MIN_TIME_BETWEEN_GCS = (1.0, 10.0, 30.0) - -# The time (in seconds since the epoch) of the last time we did a GC for each generation. -_last_gc = [0.0, 0.0, 0.0] - - F = TypeVar("F", bound=Callable[..., Any]) @@ -658,34 +535,7 @@ def f(*args: Any, **kwargs: Any) -> Any: global last_ticked last_ticked = end - if running_on_pypy: - return ret - - # Check if we need to do a manual GC (since its been disabled), and do - # one if necessary. Note we go in reverse order as e.g. a gen 1 GC may - # promote an object into gen 2, and we don't want to handle the same - # object multiple times. - threshold = gc.get_threshold() - counts = gc.get_count() - for i in (2, 1, 0): - # We check if we need to do one based on a straightforward - # comparison between the threshold and count. We also do an extra - # check to make sure that we don't a GC too often. 
- if threshold[i] < counts[i] and MIN_TIME_BETWEEN_GCS[i] < end - _last_gc[i]: - if i == 0: - logger.debug("Collecting gc %d", i) - else: - logger.info("Collecting gc %d", i) - - start = time.time() - unreachable = gc.collect(i) - end = time.time() - - _last_gc[i] = end - - gc_time.labels(i).observe(end - start) - gc_unreachable.labels(i).set(unreachable) - + maybe_gc() return ret return cast(F, f) @@ -717,4 +567,5 @@ def f(*args: Any, **kwargs: Any) -> Any: "LaterGauge", "InFlightGauge", "GaugeBucketCollector", + "MIN_TIME_BETWEEN_GCS", ] diff --git a/synapse/metrics/_gc.py b/synapse/metrics/_gc.py new file mode 100644 index 000000000000..208e65769987 --- /dev/null +++ b/synapse/metrics/_gc.py @@ -0,0 +1,195 @@ +# Copyright 2015-2022 The Matrix.org Foundation C.I.C. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import gc +import logging +import platform +import time +from typing import Iterable + +from prometheus_client.core import ( + REGISTRY, + CounterMetricFamily, + Gauge, + GaugeMetricFamily, + Histogram, + Metric, +) + +"""Prometheus metrics for garbage collection""" + + +logger = logging.getLogger(__name__) + +# The minimum time in seconds between GCs for each generation, regardless of the current GC +# thresholds and counts. +MIN_TIME_BETWEEN_GCS = (1.0, 10.0, 30.0) + +running_on_pypy = platform.python_implementation() == "PyPy" + +# +# Python GC metrics +# + +gc_unreachable = Gauge("python_gc_unreachable_total", "Unreachable GC objects", ["gen"]) +gc_time = Histogram( + "python_gc_time", + "Time taken to GC (sec)", + ["gen"], + buckets=[ + 0.0025, + 0.005, + 0.01, + 0.025, + 0.05, + 0.10, + 0.25, + 0.50, + 1.00, + 2.50, + 5.00, + 7.50, + 15.00, + 30.00, + 45.00, + 60.00, + ], +) + + +class GCCounts: + def collect(self) -> Iterable[Metric]: + cm = GaugeMetricFamily("python_gc_counts", "GC object counts", labels=["gen"]) + for n, m in enumerate(gc.get_count()): + cm.add_metric([str(n)], m) + + yield cm + + +if not running_on_pypy: + REGISTRY.register(GCCounts()) + + +# +# PyPy GC / memory metrics +# + + +class PyPyGCStats: + def collect(self) -> Iterable[Metric]: + + # @stats is a pretty-printer object with __str__() returning a nice table, + # plus some fields that contain data from that table. + # unfortunately, fields are pretty-printed themselves (i. e. '4.5MB'). + stats = gc.get_stats(memory_pressure=False) # type: ignore + # @s contains same fields as @stats, but as actual integers. + s = stats._s # type: ignore + + # also note that field naming is completely braindead + # and only vaguely correlates with the pretty-printed table. 
+ # >>>> gc.get_stats(False) + # Total memory consumed: + # GC used: 8.7MB (peak: 39.0MB) # s.total_gc_memory, s.peak_memory + # in arenas: 3.0MB # s.total_arena_memory + # rawmalloced: 1.7MB # s.total_rawmalloced_memory + # nursery: 4.0MB # s.nursery_size + # raw assembler used: 31.0kB # s.jit_backend_used + # ----------------------------- + # Total: 8.8MB # stats.memory_used_sum + # + # Total memory allocated: + # GC allocated: 38.7MB (peak: 41.1MB) # s.total_allocated_memory, s.peak_allocated_memory + # in arenas: 30.9MB # s.peak_arena_memory + # rawmalloced: 4.1MB # s.peak_rawmalloced_memory + # nursery: 4.0MB # s.nursery_size + # raw assembler allocated: 1.0MB # s.jit_backend_allocated + # ----------------------------- + # Total: 39.7MB # stats.memory_allocated_sum + # + # Total time spent in GC: 0.073 # s.total_gc_time + + pypy_gc_time = CounterMetricFamily( + "pypy_gc_time_seconds_total", + "Total time spent in PyPy GC", + labels=[], + ) + pypy_gc_time.add_metric([], s.total_gc_time / 1000) + yield pypy_gc_time + + pypy_mem = GaugeMetricFamily( + "pypy_memory_bytes", + "Memory tracked by PyPy allocator", + labels=["state", "class", "kind"], + ) + # memory used by JIT assembler + pypy_mem.add_metric(["used", "", "jit"], s.jit_backend_used) + pypy_mem.add_metric(["allocated", "", "jit"], s.jit_backend_allocated) + # memory used by GCed objects + pypy_mem.add_metric(["used", "", "arenas"], s.total_arena_memory) + pypy_mem.add_metric(["allocated", "", "arenas"], s.peak_arena_memory) + pypy_mem.add_metric(["used", "", "rawmalloced"], s.total_rawmalloced_memory) + pypy_mem.add_metric(["allocated", "", "rawmalloced"], s.peak_rawmalloced_memory) + pypy_mem.add_metric(["used", "", "nursery"], s.nursery_size) + pypy_mem.add_metric(["allocated", "", "nursery"], s.nursery_size) + # totals + pypy_mem.add_metric(["used", "totals", "gc"], s.total_gc_memory) + pypy_mem.add_metric(["allocated", "totals", "gc"], s.total_allocated_memory) + pypy_mem.add_metric(["used", "totals", "gc_peak"], s.peak_memory) + pypy_mem.add_metric(["allocated", "totals", "gc_peak"], s.peak_allocated_memory) + yield pypy_mem + + +if running_on_pypy: + REGISTRY.register(PyPyGCStats()) + + +# +# manually run a GC, if it's due, and we're not on pypy. +# + +# The time (in seconds since the epoch) of the last time we did a GC for each generation. +_last_gc = [0.0, 0.0, 0.0] + + +def maybe_gc() -> None: + if running_on_pypy: + # nothing to do on pypy, since we let automated GC happen as normal + return + + # Check if we need to do a manual GC (since its been disabled), and do + # one if necessary. Note we go in reverse order as e.g. a gen 1 GC may + # promote an object into gen 2, and we don't want to handle the same + # object multiple times. + threshold = gc.get_threshold() + counts = gc.get_count() + end = time.time() + for i in (2, 1, 0): + # We check if we need to do one based on a straightforward + # comparison between the threshold and count. We also do an extra + # check to make sure that we don't a GC too often. 
+ if threshold[i] < counts[i] and MIN_TIME_BETWEEN_GCS[i] < end - _last_gc[i]: + if i == 0: + logger.debug("Collecting gc %d", i) + else: + logger.info("Collecting gc %d", i) + + start = time.time() + unreachable = gc.collect(i) + end = time.time() + + _last_gc[i] = end + + gc_time.labels(i).observe(end - start) + gc_unreachable.labels(i).set(unreachable) From 05bdebd2a8558a62fe4f9c8aa61b56d1f62517d3 Mon Sep 17 00:00:00 2001 From: Richard van der Hoff Date: Mon, 10 Jan 2022 20:27:24 +0000 Subject: [PATCH 2/8] Replace GC hooks in reactor with a timed call --- synapse/metrics/__init__.py | 10 +---- synapse/metrics/_gc.py | 78 ++++++++++++++++++------------------- 2 files changed, 39 insertions(+), 49 deletions(-) diff --git a/synapse/metrics/__init__.py b/synapse/metrics/__init__.py index ba6dba66d1fe..c95abd4394eb 100644 --- a/synapse/metrics/__init__.py +++ b/synapse/metrics/__init__.py @@ -13,7 +13,6 @@ # limitations under the License. import functools -import gc import itertools import logging import os @@ -55,14 +54,13 @@ generate_latest, start_http_server, ) -from synapse.metrics._gc import MIN_TIME_BETWEEN_GCS, maybe_gc +from synapse.metrics._gc import MIN_TIME_BETWEEN_GCS from synapse.util.versionstring import get_version_string logger = logging.getLogger(__name__) METRICS_PREFIX = "/_synapse/metrics" -running_on_pypy = platform.python_implementation() == "PyPy" all_gauges: "Dict[str, Union[LaterGauge, InFlightGauge]]" = {} HAVE_PROC_SELF_STAT = os.path.exists("/proc/self/stat") @@ -535,7 +533,6 @@ def f(*args: Any, **kwargs: Any) -> Any: global last_ticked last_ticked = end - maybe_gc() return ret return cast(F, f) @@ -551,11 +548,6 @@ def f(*args: Any, **kwargs: Any) -> Any: # runUntilCurrent is called when we have pending calls. It is called once # per iteratation after fd polling. reactor.runUntilCurrent = runUntilCurrentTimer(reactor, reactor.runUntilCurrent) # type: ignore - - # We manually run the GC each reactor tick so that we can get some metrics - # about time spent doing GC, - if not running_on_pypy: - gc.disable() except AttributeError: pass diff --git a/synapse/metrics/_gc.py b/synapse/metrics/_gc.py index 208e65769987..4de6244160b7 100644 --- a/synapse/metrics/_gc.py +++ b/synapse/metrics/_gc.py @@ -28,6 +28,8 @@ Metric, ) +from twisted.internet import task + """Prometheus metrics for garbage collection""" @@ -81,6 +83,42 @@ def collect(self) -> Iterable[Metric]: if not running_on_pypy: REGISTRY.register(GCCounts()) + # disable automatic GC, and replace it with a task that runs every 100ms, so that + # we can get some metrics about GC activity. + gc.disable() + + # The time (in seconds since the epoch) of the last time we did a GC for each generation. + _last_gc = [0.0, 0.0, 0.0] + + def _maybe_gc() -> None: + # Check if we need to do a manual GC (since its been disabled), and do + # one if necessary. Note we go in reverse order as e.g. a gen 1 GC may + # promote an object into gen 2, and we don't want to handle the same + # object multiple times. + threshold = gc.get_threshold() + counts = gc.get_count() + end = time.time() + for i in (2, 1, 0): + # We check if we need to do one based on a straightforward + # comparison between the threshold and count. We also do an extra + # check to make sure that we don't a GC too often. 
+ if threshold[i] < counts[i] and MIN_TIME_BETWEEN_GCS[i] < end - _last_gc[i]: + if i == 0: + logger.debug("Collecting gc %d", i) + else: + logger.info("Collecting gc %d", i) + + start = time.time() + unreachable = gc.collect(i) + end = time.time() + + _last_gc[i] = end + + gc_time.labels(i).observe(end - start) + gc_unreachable.labels(i).set(unreachable) + + gc_task = task.LoopingCall(_maybe_gc) + gc_task.start(0.1) # # PyPy GC / memory metrics @@ -153,43 +191,3 @@ def collect(self) -> Iterable[Metric]: if running_on_pypy: REGISTRY.register(PyPyGCStats()) - - -# -# manually run a GC, if it's due, and we're not on pypy. -# - -# The time (in seconds since the epoch) of the last time we did a GC for each generation. -_last_gc = [0.0, 0.0, 0.0] - - -def maybe_gc() -> None: - if running_on_pypy: - # nothing to do on pypy, since we let automated GC happen as normal - return - - # Check if we need to do a manual GC (since its been disabled), and do - # one if necessary. Note we go in reverse order as e.g. a gen 1 GC may - # promote an object into gen 2, and we don't want to handle the same - # object multiple times. - threshold = gc.get_threshold() - counts = gc.get_count() - end = time.time() - for i in (2, 1, 0): - # We check if we need to do one based on a straightforward - # comparison between the threshold and count. We also do an extra - # check to make sure that we don't a GC too often. - if threshold[i] < counts[i] and MIN_TIME_BETWEEN_GCS[i] < end - _last_gc[i]: - if i == 0: - logger.debug("Collecting gc %d", i) - else: - logger.info("Collecting gc %d", i) - - start = time.time() - unreachable = gc.collect(i) - end = time.time() - - _last_gc[i] = end - - gc_time.labels(i).observe(end - start) - gc_unreachable.labels(i).set(unreachable) From fb74ba73be06362e0e54ab6aaa054040a1f330d2 Mon Sep 17 00:00:00 2001 From: Richard van der Hoff Date: Tue, 11 Jan 2022 11:45:20 +0000 Subject: [PATCH 3/8] changelog --- changelog.d/11723.misc | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog.d/11723.misc diff --git a/changelog.d/11723.misc b/changelog.d/11723.misc new file mode 100644 index 000000000000..f99e02070a93 --- /dev/null +++ b/changelog.d/11723.misc @@ -0,0 +1 @@ +Simplify calculation of prometheus metrics for garbage collection. From 45051e862e8374b55f99cc379f1c4e9ccb8dfe1e Mon Sep 17 00:00:00 2001 From: Richard van der Hoff Date: Tue, 11 Jan 2022 12:37:43 +0000 Subject: [PATCH 4/8] Avoid GC management in unit tests Register the GC management task explicitly, so that it doesn't happen during unit tests. 
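
A rough sketch of the intended call pattern (illustrative only: the
`install_gc_manager` function is the real API added below, but the
surrounding code here is an assumption, not part of the patch):

    from synapse.metrics import install_gc_manager

    # Called once from the production entrypoint, after any configured
    # gc thresholds have been applied. Unit tests never call this, so
    # automatic GC stays enabled under test.
    install_gc_manager()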
--- synapse/app/_base.py | 3 ++- synapse/metrics/__init__.py | 3 ++- synapse/metrics/_gc.py | 16 +++++++++++++--- 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/synapse/app/_base.py b/synapse/app/_base.py index 5fc59c1be11d..579adbbca02d 100644 --- a/synapse/app/_base.py +++ b/synapse/app/_base.py @@ -60,7 +60,7 @@ from synapse.events.third_party_rules import load_legacy_third_party_event_rules from synapse.handlers.auth import load_legacy_password_auth_providers from synapse.logging.context import PreserveLoggingContext -from synapse.metrics import register_threadpool +from synapse.metrics import install_gc_manager, register_threadpool from synapse.metrics.background_process_metrics import wrap_as_background_process from synapse.metrics.jemalloc import setup_jemalloc_stats from synapse.types import ISynapseReactor @@ -159,6 +159,7 @@ def run() -> None: change_resource_limit(soft_file_limit) if gc_thresholds: gc.set_threshold(*gc_thresholds) + install_gc_manager() run_command() # make sure that we run the reactor with the sentinel log context, diff --git a/synapse/metrics/__init__.py b/synapse/metrics/__init__.py index c95abd4394eb..c7f57227071c 100644 --- a/synapse/metrics/__init__.py +++ b/synapse/metrics/__init__.py @@ -54,7 +54,7 @@ generate_latest, start_http_server, ) -from synapse.metrics._gc import MIN_TIME_BETWEEN_GCS +from synapse.metrics._gc import MIN_TIME_BETWEEN_GCS, install_gc_manager from synapse.util.versionstring import get_version_string logger = logging.getLogger(__name__) @@ -560,4 +560,5 @@ def f(*args: Any, **kwargs: Any) -> Any: "InFlightGauge", "GaugeBucketCollector", "MIN_TIME_BETWEEN_GCS", + "install_gc_manager", ] diff --git a/synapse/metrics/_gc.py b/synapse/metrics/_gc.py index 4de6244160b7..2bc909efa0d3 100644 --- a/synapse/metrics/_gc.py +++ b/synapse/metrics/_gc.py @@ -80,11 +80,20 @@ def collect(self) -> Iterable[Metric]: yield cm -if not running_on_pypy: +def install_gc_manager() -> None: + """Disable automatic GC, and replace it with a task that runs every 100ms + + This means that (a) we can limit how often GC runs; (b) we can get some metrics + about GC activity. + + It does nothing on PyPy. + """ + + if running_on_pypy: + return + REGISTRY.register(GCCounts()) - # disable automatic GC, and replace it with a task that runs every 100ms, so that - # we can get some metrics about GC activity. gc.disable() # The time (in seconds since the epoch) of the last time we did a GC for each generation. @@ -120,6 +129,7 @@ def _maybe_gc() -> None: gc_task = task.LoopingCall(_maybe_gc) gc_task.start(0.1) + # # PyPy GC / memory metrics # From 3dbc342c70039ad3f084fc11afe810fcc29528f8 Mon Sep 17 00:00:00 2001 From: Richard van der Hoff Date: Mon, 10 Jan 2022 20:37:10 +0000 Subject: [PATCH 5/8] Remove `python_twisted_reactor_pending_calls` I don't think it's terribly useful, and it is a bit of a pain to measure. 
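
(As the removed code below shows, the count was derived by walking the
reactor's private `_newTimedCalls` list and `threadCallQueue` on every
tick, which is what made it a pain to measure.)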
--- synapse/metrics/__init__.py | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/synapse/metrics/__init__.py b/synapse/metrics/__init__.py index c7f57227071c..335fff87803f 100644 --- a/synapse/metrics/__init__.py +++ b/synapse/metrics/__init__.py @@ -377,11 +377,6 @@ def collect(self) -> Iterable[Metric]: "Tick time of the Twisted reactor (sec)", buckets=[0.001, 0.002, 0.005, 0.01, 0.025, 0.05, 0.1, 0.2, 0.5, 1, 2, 5], ) -pending_calls_metric = Histogram( - "python_twisted_reactor_pending_calls", - "Pending calls", - buckets=[1, 2, 5, 10, 25, 50, 100, 250, 500, 1000], -) # # Federation Metrics @@ -501,21 +496,6 @@ def collect(self) -> Iterable[Metric]: def runUntilCurrentTimer(reactor: ReactorBase, func: F) -> F: @functools.wraps(func) def f(*args: Any, **kwargs: Any) -> Any: - now = reactor.seconds() - num_pending = 0 - - # _newTimedCalls is one long list of *all* pending calls. Below loop - # is based off of impl of reactor.runUntilCurrent - for delayed_call in reactor._newTimedCalls: - if delayed_call.time > now: - break - - if delayed_call.delayed_time > 0: - continue - - num_pending += 1 - - num_pending += len(reactor.threadCallQueue) start = time.time() ret = func(*args, **kwargs) end = time.time() @@ -526,7 +506,6 @@ def f(*args: Any, **kwargs: Any) -> Any: # I/O events, but that is harder to capture without rewriting half the # reactor. tick_time.observe(end - start) - pending_calls_metric.observe(num_pending) # Update the time we last ticked, for the metric to test whether # Synapse's reactor has frozen @@ -540,10 +519,7 @@ def f(*args: Any, **kwargs: Any) -> Any: try: # Ensure the reactor has all the attributes we expect - reactor.seconds # type: ignore reactor.runUntilCurrent # type: ignore - reactor._newTimedCalls # type: ignore - reactor.threadCallQueue # type: ignore # runUntilCurrent is called when we have pending calls. It is called once # per iteratation after fd polling. From 3f55267e3f0742b04f7ea281b26765e5010307a3 Mon Sep 17 00:00:00 2001 From: Richard van der Hoff Date: Mon, 10 Jan 2022 20:53:20 +0000 Subject: [PATCH 6/8] Wrap epoll.poll instead of runUntilCurrent --- synapse/metrics/__init__.py | 70 +----------------------- synapse/metrics/_reactor_metrics.py | 83 +++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+), 69 deletions(-) create mode 100644 synapse/metrics/_reactor_metrics.py diff --git a/synapse/metrics/__init__.py b/synapse/metrics/__init__.py index 335fff87803f..122838d6aa99 100644 --- a/synapse/metrics/__init__.py +++ b/synapse/metrics/__init__.py @@ -12,15 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import functools import itertools import logging import os import platform import threading -import time from typing import ( - Any, Callable, Dict, Generic, @@ -33,7 +30,6 @@ Type, TypeVar, Union, - cast, ) import attr @@ -44,11 +40,9 @@ GaugeMetricFamily, ) -from twisted.internet import reactor -from twisted.internet.base import ReactorBase from twisted.python.threadpool import ThreadPool -import synapse +import synapse.metrics._reactor_metrics from synapse.metrics._exposition import ( MetricsResource, generate_latest, @@ -368,16 +362,6 @@ def collect(self) -> Iterable[Metric]: REGISTRY.register(CPUMetrics()) -# -# Twisted reactor metrics -# - -tick_time = Histogram( - "python_twisted_reactor_tick_time", - "Tick time of the Twisted reactor (sec)", - buckets=[0.001, 0.002, 0.005, 0.01, 0.025, 0.05, 0.1, 0.2, 0.5, 1, 2, 5], -) - # # Federation Metrics # @@ -429,8 +413,6 @@ def collect(self) -> Iterable[Metric]: " ".join([platform.system(), platform.release()]), ).set(1) -last_ticked = time.time() - # 3PID send info threepid_send_requests = Histogram( "synapse_threepid_send_requests_with_tries", @@ -478,56 +460,6 @@ def register_threadpool(name: str, threadpool: ThreadPool) -> None: ) -class ReactorLastSeenMetric: - def collect(self) -> Iterable[Metric]: - cm = GaugeMetricFamily( - "python_twisted_reactor_last_seen", - "Seconds since the Twisted reactor was last seen", - ) - cm.add_metric([], time.time() - last_ticked) - yield cm - - -REGISTRY.register(ReactorLastSeenMetric()) - -F = TypeVar("F", bound=Callable[..., Any]) - - -def runUntilCurrentTimer(reactor: ReactorBase, func: F) -> F: - @functools.wraps(func) - def f(*args: Any, **kwargs: Any) -> Any: - start = time.time() - ret = func(*args, **kwargs) - end = time.time() - - # record the amount of wallclock time spent running pending calls. - # This is a proxy for the actual amount of time between reactor polls, - # since about 25% of time is actually spent running things triggered by - # I/O events, but that is harder to capture without rewriting half the - # reactor. - tick_time.observe(end - start) - - # Update the time we last ticked, for the metric to test whether - # Synapse's reactor has frozen - global last_ticked - last_ticked = end - - return ret - - return cast(F, f) - - -try: - # Ensure the reactor has all the attributes we expect - reactor.runUntilCurrent # type: ignore - - # runUntilCurrent is called when we have pending calls. It is called once - # per iteratation after fd polling. - reactor.runUntilCurrent = runUntilCurrentTimer(reactor, reactor.runUntilCurrent) # type: ignore -except AttributeError: - pass - - __all__ = [ "MetricsResource", "generate_latest", diff --git a/synapse/metrics/_reactor_metrics.py b/synapse/metrics/_reactor_metrics.py new file mode 100644 index 000000000000..ce0688621c37 --- /dev/null +++ b/synapse/metrics/_reactor_metrics.py @@ -0,0 +1,83 @@ +# Copyright 2022 The Matrix.org Foundation C.I.C. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import select +import time +from typing import Any, Iterable, List, Tuple + +from prometheus_client import Histogram, Metric +from prometheus_client.core import REGISTRY, GaugeMetricFamily + +from twisted.internet import reactor + +# +# Twisted reactor metrics +# + +tick_time = Histogram( + "python_twisted_reactor_tick_time", + "Tick time of the Twisted reactor (sec)", + buckets=[0.001, 0.002, 0.005, 0.01, 0.025, 0.05, 0.1, 0.2, 0.5, 1, 2, 5], +) + + +class EpollWrapper: + """a wrapper for an epoll object which records the time between polls""" + + def __init__(self, poller: "select.epoll"): + self.last_polled = time.time() + self._poller = poller + + def poll(self, *args, **kwargs) -> List[Tuple[int, int]]: # type: ignore[no-untyped-def] + # record the time since poll() was last called. This gives a good proxy for + # how long it takes to run everything in the reactor - ie, how long anything + # waiting for the next tick will have to wait. + tick_time.observe(time.time() - self.last_polled) + + ret = self._poller.poll(*args, **kwargs) + + self.last_polled = time.time() + return ret + + def __getattr__(self, item: str) -> Any: + return getattr(self._poller, item) + + +class ReactorLastSeenMetric: + def __init__(self, epoll_wrapper: EpollWrapper): + self._epoll_wrapper = epoll_wrapper + + def collect(self) -> Iterable[Metric]: + cm = GaugeMetricFamily( + "python_twisted_reactor_last_seen", + "Seconds since the Twisted reactor was last seen", + ) + cm.add_metric([], time.time() - self._epoll_wrapper.last_polled) + yield cm + + +try: + # if the reactor has a `_poller` attribute, which is an `epoll` object + # (ie, it's an EPollReactor), we wrap the `epoll` with a thing that will + # measure the time between ticks + from select import epoll + + poller = reactor._poller # type: ignore[attr-defined] +except (AttributeError, ImportError): + pass +else: + if isinstance(poller, epoll): + poller = EpollWrapper(poller) + reactor._poller = poller # type: ignore[attr-defined] + REGISTRY.register(ReactorLastSeenMetric(poller)) From 584e2e9116edda40f207bd4c577698b81972f7cc Mon Sep 17 00:00:00 2001 From: Richard van der Hoff Date: Tue, 11 Jan 2022 11:52:49 +0000 Subject: [PATCH 7/8] changelog --- changelog.d/11724.misc | 1 + changelog.d/11724.removal | 1 + 2 files changed, 2 insertions(+) create mode 100644 changelog.d/11724.misc create mode 100644 changelog.d/11724.removal diff --git a/changelog.d/11724.misc b/changelog.d/11724.misc new file mode 100644 index 000000000000..e9d5dae857b2 --- /dev/null +++ b/changelog.d/11724.misc @@ -0,0 +1 @@ +Improve accuracy of `python_twisted_reactor_tick_time` prometheus metric. diff --git a/changelog.d/11724.removal b/changelog.d/11724.removal new file mode 100644 index 000000000000..088c3ff31ff8 --- /dev/null +++ b/changelog.d/11724.removal @@ -0,0 +1 @@ +Remove `python_twisted_reactor_pending_calls` prometheus metric. From c89abc21c3a437abf894aa67ffe655ed38c5ccff Mon Sep 17 00:00:00 2001 From: Richard van der Hoff Date: Fri, 14 Jan 2022 15:34:19 +0000 Subject: [PATCH 8/8] remove python_twisted_reactor_pending_calls from contributed prometheus console --- contrib/prometheus/consoles/synapse.html | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/contrib/prometheus/consoles/synapse.html b/contrib/prometheus/consoles/synapse.html index cd9ad15231fc..d17c8a08d9e3 100644 --- a/contrib/prometheus/consoles/synapse.html +++ b/contrib/prometheus/consoles/synapse.html @@ -92,22 +92,6 @@
[hunk body garbled in extraction; recoverable content: the "Average
reactor tick time" graph section remains as context, the 16 removed
lines are the adjacent "Pending calls per tick" graph, and the
"Storage" / "Queries" headings that follow are unchanged]
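
A note on PATCH 6/8: the poll-wrapping technique reduces to a small
self-contained sketch (standard library only; Linux, since it uses
epoll; all names below are illustrative rather than Synapse code). The
time between one `poll()` returning and the next `poll()` starting is
exactly the time the event loop spent running callbacks, which is what
`python_twisted_reactor_tick_time` now measures:

    import select
    import time


    class TimedPoller:
        """Wraps an epoll object and reports the busy time between polls."""

        def __init__(self, poller: select.epoll) -> None:
            self._poller = poller
            self.last_polled = time.time()

        def poll(self, *args, **kwargs):
            # Time since the previous poll() returned was spent running
            # callbacks - the "tick" being measured.
            busy = time.time() - self.last_polled
            print(f"loop busy for {busy:.6f}s")
            ret = self._poller.poll(*args, **kwargs)
            self.last_polled = time.time()
            return ret

        def __getattr__(self, item):
            # delegate register/unregister/close etc. to the real poller
            return getattr(self._poller, item)


    p = TimedPoller(select.epoll())
    time.sleep(0.02)  # pretend the loop did 20ms of work
    p.poll(0.01)      # prints a busy time of roughly 0.02 seconds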