From 6c821fa249963677f14f82213776f2ebbf893d81 Mon Sep 17 00:00:00 2001 From: Richard van der Hoff Date: Mon, 10 Jan 2022 18:25:32 +0000 Subject: [PATCH 1/8] Move the GC metrics out to their own file Move the manual GC code into its own function, and then move that, and all the other GC stuff, out to a separate file. --- synapse/metrics/__init__.py | 155 +--------------------------- synapse/metrics/_gc.py | 195 ++++++++++++++++++++++++++++++++++++ 2 files changed, 198 insertions(+), 152 deletions(-) create mode 100644 synapse/metrics/_gc.py diff --git a/synapse/metrics/__init__.py b/synapse/metrics/__init__.py index ceef57ad883f..ba6dba66d1fe 100644 --- a/synapse/metrics/__init__.py +++ b/synapse/metrics/__init__.py @@ -41,7 +41,6 @@ from prometheus_client import CollectorRegistry, Counter, Gauge, Histogram, Metric from prometheus_client.core import ( REGISTRY, - CounterMetricFamily, GaugeHistogramMetricFamily, GaugeMetricFamily, ) @@ -56,6 +55,7 @@ generate_latest, start_http_server, ) +from synapse.metrics._gc import MIN_TIME_BETWEEN_GCS, maybe_gc from synapse.util.versionstring import get_version_string logger = logging.getLogger(__name__) @@ -369,121 +369,6 @@ def collect(self) -> Iterable[Metric]: REGISTRY.register(CPUMetrics()) -# -# Python GC metrics -# - -gc_unreachable = Gauge("python_gc_unreachable_total", "Unreachable GC objects", ["gen"]) -gc_time = Histogram( - "python_gc_time", - "Time taken to GC (sec)", - ["gen"], - buckets=[ - 0.0025, - 0.005, - 0.01, - 0.025, - 0.05, - 0.10, - 0.25, - 0.50, - 1.00, - 2.50, - 5.00, - 7.50, - 15.00, - 30.00, - 45.00, - 60.00, - ], -) - - -class GCCounts: - def collect(self) -> Iterable[Metric]: - cm = GaugeMetricFamily("python_gc_counts", "GC object counts", labels=["gen"]) - for n, m in enumerate(gc.get_count()): - cm.add_metric([str(n)], m) - - yield cm - - -if not running_on_pypy: - REGISTRY.register(GCCounts()) - - -# -# PyPy GC / memory metrics -# - - -class PyPyGCStats: - def collect(self) -> Iterable[Metric]: - - # @stats is a pretty-printer object with __str__() returning a nice table, - # plus some fields that contain data from that table. - # unfortunately, fields are pretty-printed themselves (i. e. '4.5MB'). - stats = gc.get_stats(memory_pressure=False) # type: ignore - # @s contains same fields as @stats, but as actual integers. - s = stats._s # type: ignore - - # also note that field naming is completely braindead - # and only vaguely correlates with the pretty-printed table. 
- # >>>> gc.get_stats(False) - # Total memory consumed: - # GC used: 8.7MB (peak: 39.0MB) # s.total_gc_memory, s.peak_memory - # in arenas: 3.0MB # s.total_arena_memory - # rawmalloced: 1.7MB # s.total_rawmalloced_memory - # nursery: 4.0MB # s.nursery_size - # raw assembler used: 31.0kB # s.jit_backend_used - # ----------------------------- - # Total: 8.8MB # stats.memory_used_sum - # - # Total memory allocated: - # GC allocated: 38.7MB (peak: 41.1MB) # s.total_allocated_memory, s.peak_allocated_memory - # in arenas: 30.9MB # s.peak_arena_memory - # rawmalloced: 4.1MB # s.peak_rawmalloced_memory - # nursery: 4.0MB # s.nursery_size - # raw assembler allocated: 1.0MB # s.jit_backend_allocated - # ----------------------------- - # Total: 39.7MB # stats.memory_allocated_sum - # - # Total time spent in GC: 0.073 # s.total_gc_time - - pypy_gc_time = CounterMetricFamily( - "pypy_gc_time_seconds_total", - "Total time spent in PyPy GC", - labels=[], - ) - pypy_gc_time.add_metric([], s.total_gc_time / 1000) - yield pypy_gc_time - - pypy_mem = GaugeMetricFamily( - "pypy_memory_bytes", - "Memory tracked by PyPy allocator", - labels=["state", "class", "kind"], - ) - # memory used by JIT assembler - pypy_mem.add_metric(["used", "", "jit"], s.jit_backend_used) - pypy_mem.add_metric(["allocated", "", "jit"], s.jit_backend_allocated) - # memory used by GCed objects - pypy_mem.add_metric(["used", "", "arenas"], s.total_arena_memory) - pypy_mem.add_metric(["allocated", "", "arenas"], s.peak_arena_memory) - pypy_mem.add_metric(["used", "", "rawmalloced"], s.total_rawmalloced_memory) - pypy_mem.add_metric(["allocated", "", "rawmalloced"], s.peak_rawmalloced_memory) - pypy_mem.add_metric(["used", "", "nursery"], s.nursery_size) - pypy_mem.add_metric(["allocated", "", "nursery"], s.nursery_size) - # totals - pypy_mem.add_metric(["used", "totals", "gc"], s.total_gc_memory) - pypy_mem.add_metric(["allocated", "totals", "gc"], s.total_allocated_memory) - pypy_mem.add_metric(["used", "totals", "gc_peak"], s.peak_memory) - pypy_mem.add_metric(["allocated", "totals", "gc_peak"], s.peak_allocated_memory) - yield pypy_mem - - -if running_on_pypy: - REGISTRY.register(PyPyGCStats()) - # # Twisted reactor metrics @@ -612,14 +497,6 @@ def collect(self) -> Iterable[Metric]: REGISTRY.register(ReactorLastSeenMetric()) -# The minimum time in seconds between GCs for each generation, regardless of the current GC -# thresholds and counts. -MIN_TIME_BETWEEN_GCS = (1.0, 10.0, 30.0) - -# The time (in seconds since the epoch) of the last time we did a GC for each generation. -_last_gc = [0.0, 0.0, 0.0] - - F = TypeVar("F", bound=Callable[..., Any]) @@ -658,34 +535,7 @@ def f(*args: Any, **kwargs: Any) -> Any: global last_ticked last_ticked = end - if running_on_pypy: - return ret - - # Check if we need to do a manual GC (since its been disabled), and do - # one if necessary. Note we go in reverse order as e.g. a gen 1 GC may - # promote an object into gen 2, and we don't want to handle the same - # object multiple times. - threshold = gc.get_threshold() - counts = gc.get_count() - for i in (2, 1, 0): - # We check if we need to do one based on a straightforward - # comparison between the threshold and count. We also do an extra - # check to make sure that we don't a GC too often. 
- if threshold[i] < counts[i] and MIN_TIME_BETWEEN_GCS[i] < end - _last_gc[i]: - if i == 0: - logger.debug("Collecting gc %d", i) - else: - logger.info("Collecting gc %d", i) - - start = time.time() - unreachable = gc.collect(i) - end = time.time() - - _last_gc[i] = end - - gc_time.labels(i).observe(end - start) - gc_unreachable.labels(i).set(unreachable) - + maybe_gc() return ret return cast(F, f) @@ -717,4 +567,5 @@ def f(*args: Any, **kwargs: Any) -> Any: "LaterGauge", "InFlightGauge", "GaugeBucketCollector", + "MIN_TIME_BETWEEN_GCS", ] diff --git a/synapse/metrics/_gc.py b/synapse/metrics/_gc.py new file mode 100644 index 000000000000..208e65769987 --- /dev/null +++ b/synapse/metrics/_gc.py @@ -0,0 +1,195 @@ +# Copyright 2015-2022 The Matrix.org Foundation C.I.C. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import gc +import logging +import platform +import time +from typing import Iterable + +from prometheus_client.core import ( + REGISTRY, + CounterMetricFamily, + Gauge, + GaugeMetricFamily, + Histogram, + Metric, +) + +"""Prometheus metrics for garbage collection""" + + +logger = logging.getLogger(__name__) + +# The minimum time in seconds between GCs for each generation, regardless of the current GC +# thresholds and counts. +MIN_TIME_BETWEEN_GCS = (1.0, 10.0, 30.0) + +running_on_pypy = platform.python_implementation() == "PyPy" + +# +# Python GC metrics +# + +gc_unreachable = Gauge("python_gc_unreachable_total", "Unreachable GC objects", ["gen"]) +gc_time = Histogram( + "python_gc_time", + "Time taken to GC (sec)", + ["gen"], + buckets=[ + 0.0025, + 0.005, + 0.01, + 0.025, + 0.05, + 0.10, + 0.25, + 0.50, + 1.00, + 2.50, + 5.00, + 7.50, + 15.00, + 30.00, + 45.00, + 60.00, + ], +) + + +class GCCounts: + def collect(self) -> Iterable[Metric]: + cm = GaugeMetricFamily("python_gc_counts", "GC object counts", labels=["gen"]) + for n, m in enumerate(gc.get_count()): + cm.add_metric([str(n)], m) + + yield cm + + +if not running_on_pypy: + REGISTRY.register(GCCounts()) + + +# +# PyPy GC / memory metrics +# + + +class PyPyGCStats: + def collect(self) -> Iterable[Metric]: + + # @stats is a pretty-printer object with __str__() returning a nice table, + # plus some fields that contain data from that table. + # unfortunately, fields are pretty-printed themselves (i. e. '4.5MB'). + stats = gc.get_stats(memory_pressure=False) # type: ignore + # @s contains same fields as @stats, but as actual integers. + s = stats._s # type: ignore + + # also note that field naming is completely braindead + # and only vaguely correlates with the pretty-printed table. 
+ # >>>> gc.get_stats(False) + # Total memory consumed: + # GC used: 8.7MB (peak: 39.0MB) # s.total_gc_memory, s.peak_memory + # in arenas: 3.0MB # s.total_arena_memory + # rawmalloced: 1.7MB # s.total_rawmalloced_memory + # nursery: 4.0MB # s.nursery_size + # raw assembler used: 31.0kB # s.jit_backend_used + # ----------------------------- + # Total: 8.8MB # stats.memory_used_sum + # + # Total memory allocated: + # GC allocated: 38.7MB (peak: 41.1MB) # s.total_allocated_memory, s.peak_allocated_memory + # in arenas: 30.9MB # s.peak_arena_memory + # rawmalloced: 4.1MB # s.peak_rawmalloced_memory + # nursery: 4.0MB # s.nursery_size + # raw assembler allocated: 1.0MB # s.jit_backend_allocated + # ----------------------------- + # Total: 39.7MB # stats.memory_allocated_sum + # + # Total time spent in GC: 0.073 # s.total_gc_time + + pypy_gc_time = CounterMetricFamily( + "pypy_gc_time_seconds_total", + "Total time spent in PyPy GC", + labels=[], + ) + pypy_gc_time.add_metric([], s.total_gc_time / 1000) + yield pypy_gc_time + + pypy_mem = GaugeMetricFamily( + "pypy_memory_bytes", + "Memory tracked by PyPy allocator", + labels=["state", "class", "kind"], + ) + # memory used by JIT assembler + pypy_mem.add_metric(["used", "", "jit"], s.jit_backend_used) + pypy_mem.add_metric(["allocated", "", "jit"], s.jit_backend_allocated) + # memory used by GCed objects + pypy_mem.add_metric(["used", "", "arenas"], s.total_arena_memory) + pypy_mem.add_metric(["allocated", "", "arenas"], s.peak_arena_memory) + pypy_mem.add_metric(["used", "", "rawmalloced"], s.total_rawmalloced_memory) + pypy_mem.add_metric(["allocated", "", "rawmalloced"], s.peak_rawmalloced_memory) + pypy_mem.add_metric(["used", "", "nursery"], s.nursery_size) + pypy_mem.add_metric(["allocated", "", "nursery"], s.nursery_size) + # totals + pypy_mem.add_metric(["used", "totals", "gc"], s.total_gc_memory) + pypy_mem.add_metric(["allocated", "totals", "gc"], s.total_allocated_memory) + pypy_mem.add_metric(["used", "totals", "gc_peak"], s.peak_memory) + pypy_mem.add_metric(["allocated", "totals", "gc_peak"], s.peak_allocated_memory) + yield pypy_mem + + +if running_on_pypy: + REGISTRY.register(PyPyGCStats()) + + +# +# manually run a GC, if it's due, and we're not on pypy. +# + +# The time (in seconds since the epoch) of the last time we did a GC for each generation. +_last_gc = [0.0, 0.0, 0.0] + + +def maybe_gc() -> None: + if running_on_pypy: + # nothing to do on pypy, since we let automated GC happen as normal + return + + # Check if we need to do a manual GC (since its been disabled), and do + # one if necessary. Note we go in reverse order as e.g. a gen 1 GC may + # promote an object into gen 2, and we don't want to handle the same + # object multiple times. + threshold = gc.get_threshold() + counts = gc.get_count() + end = time.time() + for i in (2, 1, 0): + # We check if we need to do one based on a straightforward + # comparison between the threshold and count. We also do an extra + # check to make sure that we don't a GC too often. 
+ if threshold[i] < counts[i] and MIN_TIME_BETWEEN_GCS[i] < end - _last_gc[i]: + if i == 0: + logger.debug("Collecting gc %d", i) + else: + logger.info("Collecting gc %d", i) + + start = time.time() + unreachable = gc.collect(i) + end = time.time() + + _last_gc[i] = end + + gc_time.labels(i).observe(end - start) + gc_unreachable.labels(i).set(unreachable) From 05bdebd2a8558a62fe4f9c8aa61b56d1f62517d3 Mon Sep 17 00:00:00 2001 From: Richard van der Hoff Date: Mon, 10 Jan 2022 20:27:24 +0000 Subject: [PATCH 2/8] Replace GC hooks in reactor with a timed call --- synapse/metrics/__init__.py | 10 +---- synapse/metrics/_gc.py | 78 ++++++++++++++++++------------------- 2 files changed, 39 insertions(+), 49 deletions(-) diff --git a/synapse/metrics/__init__.py b/synapse/metrics/__init__.py index ba6dba66d1fe..c95abd4394eb 100644 --- a/synapse/metrics/__init__.py +++ b/synapse/metrics/__init__.py @@ -13,7 +13,6 @@ # limitations under the License. import functools -import gc import itertools import logging import os @@ -55,14 +54,13 @@ generate_latest, start_http_server, ) -from synapse.metrics._gc import MIN_TIME_BETWEEN_GCS, maybe_gc +from synapse.metrics._gc import MIN_TIME_BETWEEN_GCS from synapse.util.versionstring import get_version_string logger = logging.getLogger(__name__) METRICS_PREFIX = "/_synapse/metrics" -running_on_pypy = platform.python_implementation() == "PyPy" all_gauges: "Dict[str, Union[LaterGauge, InFlightGauge]]" = {} HAVE_PROC_SELF_STAT = os.path.exists("/proc/self/stat") @@ -535,7 +533,6 @@ def f(*args: Any, **kwargs: Any) -> Any: global last_ticked last_ticked = end - maybe_gc() return ret return cast(F, f) @@ -551,11 +548,6 @@ def f(*args: Any, **kwargs: Any) -> Any: # runUntilCurrent is called when we have pending calls. It is called once # per iteratation after fd polling. reactor.runUntilCurrent = runUntilCurrentTimer(reactor, reactor.runUntilCurrent) # type: ignore - - # We manually run the GC each reactor tick so that we can get some metrics - # about time spent doing GC, - if not running_on_pypy: - gc.disable() except AttributeError: pass diff --git a/synapse/metrics/_gc.py b/synapse/metrics/_gc.py index 208e65769987..4de6244160b7 100644 --- a/synapse/metrics/_gc.py +++ b/synapse/metrics/_gc.py @@ -28,6 +28,8 @@ Metric, ) +from twisted.internet import task + """Prometheus metrics for garbage collection""" @@ -81,6 +83,42 @@ def collect(self) -> Iterable[Metric]: if not running_on_pypy: REGISTRY.register(GCCounts()) + # disable automatic GC, and replace it with a task that runs every 100ms, so that + # we can get some metrics about GC activity. + gc.disable() + + # The time (in seconds since the epoch) of the last time we did a GC for each generation. + _last_gc = [0.0, 0.0, 0.0] + + def _maybe_gc() -> None: + # Check if we need to do a manual GC (since its been disabled), and do + # one if necessary. Note we go in reverse order as e.g. a gen 1 GC may + # promote an object into gen 2, and we don't want to handle the same + # object multiple times. + threshold = gc.get_threshold() + counts = gc.get_count() + end = time.time() + for i in (2, 1, 0): + # We check if we need to do one based on a straightforward + # comparison between the threshold and count. We also do an extra + # check to make sure that we don't a GC too often. 
+ if threshold[i] < counts[i] and MIN_TIME_BETWEEN_GCS[i] < end - _last_gc[i]: + if i == 0: + logger.debug("Collecting gc %d", i) + else: + logger.info("Collecting gc %d", i) + + start = time.time() + unreachable = gc.collect(i) + end = time.time() + + _last_gc[i] = end + + gc_time.labels(i).observe(end - start) + gc_unreachable.labels(i).set(unreachable) + + gc_task = task.LoopingCall(_maybe_gc) + gc_task.start(0.1) # # PyPy GC / memory metrics @@ -153,43 +191,3 @@ def collect(self) -> Iterable[Metric]: if running_on_pypy: REGISTRY.register(PyPyGCStats()) - - -# -# manually run a GC, if it's due, and we're not on pypy. -# - -# The time (in seconds since the epoch) of the last time we did a GC for each generation. -_last_gc = [0.0, 0.0, 0.0] - - -def maybe_gc() -> None: - if running_on_pypy: - # nothing to do on pypy, since we let automated GC happen as normal - return - - # Check if we need to do a manual GC (since its been disabled), and do - # one if necessary. Note we go in reverse order as e.g. a gen 1 GC may - # promote an object into gen 2, and we don't want to handle the same - # object multiple times. - threshold = gc.get_threshold() - counts = gc.get_count() - end = time.time() - for i in (2, 1, 0): - # We check if we need to do one based on a straightforward - # comparison between the threshold and count. We also do an extra - # check to make sure that we don't a GC too often. - if threshold[i] < counts[i] and MIN_TIME_BETWEEN_GCS[i] < end - _last_gc[i]: - if i == 0: - logger.debug("Collecting gc %d", i) - else: - logger.info("Collecting gc %d", i) - - start = time.time() - unreachable = gc.collect(i) - end = time.time() - - _last_gc[i] = end - - gc_time.labels(i).observe(end - start) - gc_unreachable.labels(i).set(unreachable) From fb74ba73be06362e0e54ab6aaa054040a1f330d2 Mon Sep 17 00:00:00 2001 From: Richard van der Hoff Date: Tue, 11 Jan 2022 11:45:20 +0000 Subject: [PATCH 3/8] changelog --- changelog.d/11723.misc | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog.d/11723.misc diff --git a/changelog.d/11723.misc b/changelog.d/11723.misc new file mode 100644 index 000000000000..f99e02070a93 --- /dev/null +++ b/changelog.d/11723.misc @@ -0,0 +1 @@ +Simplify calculation of prometheus metrics for garbage collection. From 45051e862e8374b55f99cc379f1c4e9ccb8dfe1e Mon Sep 17 00:00:00 2001 From: Richard van der Hoff Date: Tue, 11 Jan 2022 12:37:43 +0000 Subject: [PATCH 4/8] Avoid GC management in unit tests Register the GC management task explicitly, so that it doesn't happen during unit tests. 
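
A rough sketch of the intended call pattern (illustrative only: the
`install_gc_manager` function is the real API added below, but the
surrounding code here is an assumption, not part of the patch):

    from synapse.metrics import install_gc_manager

    # Called once from the production entrypoint, after any configured
    # gc thresholds have been applied. Unit tests never call this, so
    # automatic GC stays enabled under test.
    install_gc_manager()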
--- synapse/app/_base.py | 3 ++- synapse/metrics/__init__.py | 3 ++- synapse/metrics/_gc.py | 16 +++++++++++++--- 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/synapse/app/_base.py b/synapse/app/_base.py index 5fc59c1be11d..579adbbca02d 100644 --- a/synapse/app/_base.py +++ b/synapse/app/_base.py @@ -60,7 +60,7 @@ from synapse.events.third_party_rules import load_legacy_third_party_event_rules from synapse.handlers.auth import load_legacy_password_auth_providers from synapse.logging.context import PreserveLoggingContext -from synapse.metrics import register_threadpool +from synapse.metrics import install_gc_manager, register_threadpool from synapse.metrics.background_process_metrics import wrap_as_background_process from synapse.metrics.jemalloc import setup_jemalloc_stats from synapse.types import ISynapseReactor @@ -159,6 +159,7 @@ def run() -> None: change_resource_limit(soft_file_limit) if gc_thresholds: gc.set_threshold(*gc_thresholds) + install_gc_manager() run_command() # make sure that we run the reactor with the sentinel log context, diff --git a/synapse/metrics/__init__.py b/synapse/metrics/__init__.py index c95abd4394eb..c7f57227071c 100644 --- a/synapse/metrics/__init__.py +++ b/synapse/metrics/__init__.py @@ -54,7 +54,7 @@ generate_latest, start_http_server, ) -from synapse.metrics._gc import MIN_TIME_BETWEEN_GCS +from synapse.metrics._gc import MIN_TIME_BETWEEN_GCS, install_gc_manager from synapse.util.versionstring import get_version_string logger = logging.getLogger(__name__) @@ -560,4 +560,5 @@ def f(*args: Any, **kwargs: Any) -> Any: "InFlightGauge", "GaugeBucketCollector", "MIN_TIME_BETWEEN_GCS", + "install_gc_manager", ] diff --git a/synapse/metrics/_gc.py b/synapse/metrics/_gc.py index 4de6244160b7..2bc909efa0d3 100644 --- a/synapse/metrics/_gc.py +++ b/synapse/metrics/_gc.py @@ -80,11 +80,20 @@ def collect(self) -> Iterable[Metric]: yield cm -if not running_on_pypy: +def install_gc_manager() -> None: + """Disable automatic GC, and replace it with a task that runs every 100ms + + This means that (a) we can limit how often GC runs; (b) we can get some metrics + about GC activity. + + It does nothing on PyPy. + """ + + if running_on_pypy: + return + REGISTRY.register(GCCounts()) - # disable automatic GC, and replace it with a task that runs every 100ms, so that - # we can get some metrics about GC activity. gc.disable() # The time (in seconds since the epoch) of the last time we did a GC for each generation. @@ -120,6 +129,7 @@ def _maybe_gc() -> None: gc_task = task.LoopingCall(_maybe_gc) gc_task.start(0.1) + # # PyPy GC / memory metrics # From 3dbc342c70039ad3f084fc11afe810fcc29528f8 Mon Sep 17 00:00:00 2001 From: Richard van der Hoff Date: Mon, 10 Jan 2022 20:37:10 +0000 Subject: [PATCH 5/8] Remove `python_twisted_reactor_pending_calls` I don't think it's terribly useful, and it is a bit of a pain to measure. 
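
(As the removed code below shows, the count was derived by walking the
reactor's private `_newTimedCalls` list and `threadCallQueue` on every
tick, which is what made it a pain to measure.)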
--- synapse/metrics/__init__.py | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/synapse/metrics/__init__.py b/synapse/metrics/__init__.py index c7f57227071c..335fff87803f 100644 --- a/synapse/metrics/__init__.py +++ b/synapse/metrics/__init__.py @@ -377,11 +377,6 @@ def collect(self) -> Iterable[Metric]: "Tick time of the Twisted reactor (sec)", buckets=[0.001, 0.002, 0.005, 0.01, 0.025, 0.05, 0.1, 0.2, 0.5, 1, 2, 5], ) -pending_calls_metric = Histogram( - "python_twisted_reactor_pending_calls", - "Pending calls", - buckets=[1, 2, 5, 10, 25, 50, 100, 250, 500, 1000], -) # # Federation Metrics @@ -501,21 +496,6 @@ def collect(self) -> Iterable[Metric]: def runUntilCurrentTimer(reactor: ReactorBase, func: F) -> F: @functools.wraps(func) def f(*args: Any, **kwargs: Any) -> Any: - now = reactor.seconds() - num_pending = 0 - - # _newTimedCalls is one long list of *all* pending calls. Below loop - # is based off of impl of reactor.runUntilCurrent - for delayed_call in reactor._newTimedCalls: - if delayed_call.time > now: - break - - if delayed_call.delayed_time > 0: - continue - - num_pending += 1 - - num_pending += len(reactor.threadCallQueue) start = time.time() ret = func(*args, **kwargs) end = time.time() @@ -526,7 +506,6 @@ def f(*args: Any, **kwargs: Any) -> Any: # I/O events, but that is harder to capture without rewriting half the # reactor. tick_time.observe(end - start) - pending_calls_metric.observe(num_pending) # Update the time we last ticked, for the metric to test whether # Synapse's reactor has frozen @@ -540,10 +519,7 @@ def f(*args: Any, **kwargs: Any) -> Any: try: # Ensure the reactor has all the attributes we expect - reactor.seconds # type: ignore reactor.runUntilCurrent # type: ignore - reactor._newTimedCalls # type: ignore - reactor.threadCallQueue # type: ignore # runUntilCurrent is called when we have pending calls. It is called once # per iteratation after fd polling. From 3f55267e3f0742b04f7ea281b26765e5010307a3 Mon Sep 17 00:00:00 2001 From: Richard van der Hoff Date: Mon, 10 Jan 2022 20:53:20 +0000 Subject: [PATCH 6/8] Wrap epoll.poll instead of runUntilCurrent --- synapse/metrics/__init__.py | 70 +----------------------- synapse/metrics/_reactor_metrics.py | 83 +++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+), 69 deletions(-) create mode 100644 synapse/metrics/_reactor_metrics.py diff --git a/synapse/metrics/__init__.py b/synapse/metrics/__init__.py index 335fff87803f..122838d6aa99 100644 --- a/synapse/metrics/__init__.py +++ b/synapse/metrics/__init__.py @@ -12,15 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import functools import itertools import logging import os import platform import threading -import time from typing import ( - Any, Callable, Dict, Generic, @@ -33,7 +30,6 @@ Type, TypeVar, Union, - cast, ) import attr @@ -44,11 +40,9 @@ GaugeMetricFamily, ) -from twisted.internet import reactor -from twisted.internet.base import ReactorBase from twisted.python.threadpool import ThreadPool -import synapse +import synapse.metrics._reactor_metrics from synapse.metrics._exposition import ( MetricsResource, generate_latest, @@ -368,16 +362,6 @@ def collect(self) -> Iterable[Metric]: REGISTRY.register(CPUMetrics()) -# -# Twisted reactor metrics -# - -tick_time = Histogram( - "python_twisted_reactor_tick_time", - "Tick time of the Twisted reactor (sec)", - buckets=[0.001, 0.002, 0.005, 0.01, 0.025, 0.05, 0.1, 0.2, 0.5, 1, 2, 5], -) - # # Federation Metrics # @@ -429,8 +413,6 @@ def collect(self) -> Iterable[Metric]: " ".join([platform.system(), platform.release()]), ).set(1) -last_ticked = time.time() - # 3PID send info threepid_send_requests = Histogram( "synapse_threepid_send_requests_with_tries", @@ -478,56 +460,6 @@ def register_threadpool(name: str, threadpool: ThreadPool) -> None: ) -class ReactorLastSeenMetric: - def collect(self) -> Iterable[Metric]: - cm = GaugeMetricFamily( - "python_twisted_reactor_last_seen", - "Seconds since the Twisted reactor was last seen", - ) - cm.add_metric([], time.time() - last_ticked) - yield cm - - -REGISTRY.register(ReactorLastSeenMetric()) - -F = TypeVar("F", bound=Callable[..., Any]) - - -def runUntilCurrentTimer(reactor: ReactorBase, func: F) -> F: - @functools.wraps(func) - def f(*args: Any, **kwargs: Any) -> Any: - start = time.time() - ret = func(*args, **kwargs) - end = time.time() - - # record the amount of wallclock time spent running pending calls. - # This is a proxy for the actual amount of time between reactor polls, - # since about 25% of time is actually spent running things triggered by - # I/O events, but that is harder to capture without rewriting half the - # reactor. - tick_time.observe(end - start) - - # Update the time we last ticked, for the metric to test whether - # Synapse's reactor has frozen - global last_ticked - last_ticked = end - - return ret - - return cast(F, f) - - -try: - # Ensure the reactor has all the attributes we expect - reactor.runUntilCurrent # type: ignore - - # runUntilCurrent is called when we have pending calls. It is called once - # per iteratation after fd polling. - reactor.runUntilCurrent = runUntilCurrentTimer(reactor, reactor.runUntilCurrent) # type: ignore -except AttributeError: - pass - - __all__ = [ "MetricsResource", "generate_latest", diff --git a/synapse/metrics/_reactor_metrics.py b/synapse/metrics/_reactor_metrics.py new file mode 100644 index 000000000000..ce0688621c37 --- /dev/null +++ b/synapse/metrics/_reactor_metrics.py @@ -0,0 +1,83 @@ +# Copyright 2022 The Matrix.org Foundation C.I.C. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import select +import time +from typing import Any, Iterable, List, Tuple + +from prometheus_client import Histogram, Metric +from prometheus_client.core import REGISTRY, GaugeMetricFamily + +from twisted.internet import reactor + +# +# Twisted reactor metrics +# + +tick_time = Histogram( + "python_twisted_reactor_tick_time", + "Tick time of the Twisted reactor (sec)", + buckets=[0.001, 0.002, 0.005, 0.01, 0.025, 0.05, 0.1, 0.2, 0.5, 1, 2, 5], +) + + +class EpollWrapper: + """a wrapper for an epoll object which records the time between polls""" + + def __init__(self, poller: "select.epoll"): + self.last_polled = time.time() + self._poller = poller + + def poll(self, *args, **kwargs) -> List[Tuple[int, int]]: # type: ignore[no-untyped-def] + # record the time since poll() was last called. This gives a good proxy for + # how long it takes to run everything in the reactor - ie, how long anything + # waiting for the next tick will have to wait. + tick_time.observe(time.time() - self.last_polled) + + ret = self._poller.poll(*args, **kwargs) + + self.last_polled = time.time() + return ret + + def __getattr__(self, item: str) -> Any: + return getattr(self._poller, item) + + +class ReactorLastSeenMetric: + def __init__(self, epoll_wrapper: EpollWrapper): + self._epoll_wrapper = epoll_wrapper + + def collect(self) -> Iterable[Metric]: + cm = GaugeMetricFamily( + "python_twisted_reactor_last_seen", + "Seconds since the Twisted reactor was last seen", + ) + cm.add_metric([], time.time() - self._epoll_wrapper.last_polled) + yield cm + + +try: + # if the reactor has a `_poller` attribute, which is an `epoll` object + # (ie, it's an EPollReactor), we wrap the `epoll` with a thing that will + # measure the time between ticks + from select import epoll + + poller = reactor._poller # type: ignore[attr-defined] +except (AttributeError, ImportError): + pass +else: + if isinstance(poller, epoll): + poller = EpollWrapper(poller) + reactor._poller = poller # type: ignore[attr-defined] + REGISTRY.register(ReactorLastSeenMetric(poller)) From 584e2e9116edda40f207bd4c577698b81972f7cc Mon Sep 17 00:00:00 2001 From: Richard van der Hoff Date: Tue, 11 Jan 2022 11:52:49 +0000 Subject: [PATCH 7/8] changelog --- changelog.d/11724.misc | 1 + changelog.d/11724.removal | 1 + 2 files changed, 2 insertions(+) create mode 100644 changelog.d/11724.misc create mode 100644 changelog.d/11724.removal diff --git a/changelog.d/11724.misc b/changelog.d/11724.misc new file mode 100644 index 000000000000..e9d5dae857b2 --- /dev/null +++ b/changelog.d/11724.misc @@ -0,0 +1 @@ +Improve accuracy of `python_twisted_reactor_tick_time` prometheus metric. diff --git a/changelog.d/11724.removal b/changelog.d/11724.removal new file mode 100644 index 000000000000..088c3ff31ff8 --- /dev/null +++ b/changelog.d/11724.removal @@ -0,0 +1 @@ +Remove `python_twisted_reactor_pending_calls` prometheus metric. From c89abc21c3a437abf894aa67ffe655ed38c5ccff Mon Sep 17 00:00:00 2001 From: Richard van der Hoff Date: Fri, 14 Jan 2022 15:34:19 +0000 Subject: [PATCH 8/8] remove python_twisted_reactor_pending_calls from contributed prometheus console --- contrib/prometheus/consoles/synapse.html | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/contrib/prometheus/consoles/synapse.html b/contrib/prometheus/consoles/synapse.html index cd9ad15231fc..d17c8a08d9e3 100644 --- a/contrib/prometheus/consoles/synapse.html +++ b/contrib/prometheus/consoles/synapse.html @@ -92,22 +92,6 @@
[hunk body garbled in extraction; recoverable content: the "Average
reactor tick time" graph section remains as context, the 16 removed
lines are the adjacent "Pending calls per tick" graph, and the
"Storage" / "Queries" headings that follow are unchanged]
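
A note on PATCH 6/8: the poll-wrapping technique reduces to a small
self-contained sketch (standard library only; Linux, since it uses
epoll; all names below are illustrative rather than Synapse code). The
time between one `poll()` returning and the next `poll()` starting is
exactly the time the event loop spent running callbacks, which is what
`python_twisted_reactor_tick_time` now measures:

    import select
    import time


    class TimedPoller:
        """Wraps an epoll object and reports the busy time between polls."""

        def __init__(self, poller: select.epoll) -> None:
            self._poller = poller
            self.last_polled = time.time()

        def poll(self, *args, **kwargs):
            # Time since the previous poll() returned was spent running
            # callbacks - the "tick" being measured.
            busy = time.time() - self.last_polled
            print(f"loop busy for {busy:.6f}s")
            ret = self._poller.poll(*args, **kwargs)
            self.last_polled = time.time()
            return ret

        def __getattr__(self, item):
            # delegate register/unregister/close etc. to the real poller
            return getattr(self._poller, item)


    p = TimedPoller(select.epoll())
    time.sleep(0.02)  # pretend the loop did 20ms of work
    p.poll(0.01)      # prints a busy time of roughly 0.02 seconds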