Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Commit

Permalink
Merge pull request #5499 from matrix-org/rav/cleanup_metrics
Browse files Browse the repository at this point in the history
Cleanups and sanity-checking in cpu and db metrics
  • Loading branch information
richvdh authored Jun 24, 2019
2 parents deb4fe6 + 1793de6 commit e59a8cd
Show file tree
Hide file tree
Showing 3 changed files with 53 additions and 29 deletions.
1 change: 1 addition & 0 deletions changelog.d/5499.misc
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Some cleanups and sanity-checking in the CPU and database metrics.
35 changes: 13 additions & 22 deletions synapse/storage/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,22 +175,22 @@ def __init__(self):
self.current_counters = {}
self.previous_counters = {}

def update(self, key, start_time, end_time=None):
if end_time is None:
end_time = time.time()
duration = end_time - start_time
def update(self, key, duration_secs):
count, cum_time = self.current_counters.get(key, (0, 0))
count += 1
cum_time += duration
cum_time += duration_secs
self.current_counters[key] = (count, cum_time)
return end_time

def interval(self, interval_duration, limit=3):
def interval(self, interval_duration_secs, limit=3):
counters = []
for name, (count, cum_time) in iteritems(self.current_counters):
prev_count, prev_time = self.previous_counters.get(name, (0, 0))
counters.append(
((cum_time - prev_time) / interval_duration, count - prev_count, name)
(
(cum_time - prev_time) / interval_duration_secs,
count - prev_count,
name,
)
)

self.previous_counters = dict(self.current_counters)
Expand Down Expand Up @@ -221,7 +221,6 @@ def __init__(self, db_conn, hs):
# is running in mainline, and we have some nice monitoring frontends
# to watch it
self._txn_perf_counters = PerformanceCounters()
self._get_event_counters = PerformanceCounters()

self._get_event_cache = Cache(
"*getEvent*", keylen=3, max_entries=hs.config.event_cache_size
Expand Down Expand Up @@ -369,21 +368,13 @@ def loop():
time_then = self._previous_loop_ts
self._previous_loop_ts = time_now

ratio = (curr - prev) / (time_now - time_then)

top_three_counters = self._txn_perf_counters.interval(
time_now - time_then, limit=3
)
duration = time_now - time_then
ratio = (curr - prev) / duration

top_3_event_counters = self._get_event_counters.interval(
time_now - time_then, limit=3
)
top_three_counters = self._txn_perf_counters.interval(duration, limit=3)

perf_logger.info(
"Total database time: %.3f%% {%s} {%s}",
ratio * 100,
top_three_counters,
top_3_event_counters,
"Total database time: %.3f%% {%s}", ratio * 100, top_three_counters
)

self._clock.looping_call(loop, 10000)
Expand Down Expand Up @@ -465,7 +456,7 @@ def _new_transaction(
transaction_logger.debug("[TXN END] {%s} %f sec", name, duration)

self._current_txn_total_time += duration
self._txn_perf_counters.update(desc, start, end)
self._txn_perf_counters.update(desc, duration)
sql_txn_timer.labels(desc).observe(duration)

@defer.inlineCallbacks
Expand Down
46 changes: 39 additions & 7 deletions synapse/util/logcontext.py
Original file line number Diff line number Diff line change
Expand Up @@ -336,10 +336,9 @@ def stop(self):
logger.warning("Called stop on logcontext %s without calling start", self)
return

usage_end = get_thread_resource_usage()

self._resource_usage.ru_utime += usage_end.ru_utime - self.usage_start.ru_utime
self._resource_usage.ru_stime += usage_end.ru_stime - self.usage_start.ru_stime
utime_delta, stime_delta = self._get_cputime()
self._resource_usage.ru_utime += utime_delta
self._resource_usage.ru_stime += stime_delta

self.usage_start = None

Expand All @@ -357,13 +356,44 @@ def get_resource_usage(self):
# can include resource usage so far.
is_main_thread = threading.current_thread() is self.main_thread
if self.alive and self.usage_start and is_main_thread:
current = get_thread_resource_usage()
res.ru_utime += current.ru_utime - self.usage_start.ru_utime
res.ru_stime += current.ru_stime - self.usage_start.ru_stime
utime_delta, stime_delta = self._get_cputime()
res.ru_utime += utime_delta
res.ru_stime += stime_delta

return res

def _get_cputime(self):
"""Get the cpu usage time so far
Returns: Tuple[float, float]: seconds in user mode, seconds in system mode
"""
current = get_thread_resource_usage()

utime_delta = current.ru_utime - self.usage_start.ru_utime
stime_delta = current.ru_stime - self.usage_start.ru_stime

# sanity check
if utime_delta < 0:
logger.error(
"utime went backwards! %f < %f",
current.ru_utime,
self.usage_start.ru_utime,
)
utime_delta = 0

if stime_delta < 0:
logger.error(
"stime went backwards! %f < %f",
current.ru_stime,
self.usage_start.ru_stime,
)
stime_delta = 0

return utime_delta, stime_delta

def add_database_transaction(self, duration_sec):
if duration_sec < 0:
raise ValueError("DB txn time can only be non-negative")
self._resource_usage.db_txn_count += 1
self._resource_usage.db_txn_duration_sec += duration_sec

Expand All @@ -374,6 +404,8 @@ def add_database_scheduled(self, sched_sec):
sched_sec (float): number of seconds it took us to get a
connection
"""
if sched_sec < 0:
raise ValueError("DB scheduling time can only be non-negative")
self._resource_usage.db_sched_duration_sec += sched_sec

def record_event_fetch(self, event_count):
Expand Down

0 comments on commit e59a8cd

Please sign in to comment.