From 2f771451dda60f59a430d1cf20f145b604a6cbf3 Mon Sep 17 00:00:00 2001 From: Conrad Hoffmann Date: Wed, 6 Nov 2024 12:33:07 +0100 Subject: [PATCH] Export last replay age in replication collector The exported replication lag does not handle all failure modes, and can report 0 for replicas that are out of sync and incapable of recovery. A proper replacement for that metric would require a different approach (see e.g. #1007), but for a lot of folks, simply exporting the age of the last replay can provide a pretty strong signal for something being amiss. I think this solution might be preferable to #977, though the lag metric needs to be fixed or abandoned eventually. Signed-off-by: Conrad Hoffmann --- collector/pg_replication.go | 19 +++++++++++++++++-- collector/pg_replication_test.go | 5 +++-- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/collector/pg_replication.go b/collector/pg_replication.go index 6067cc9b1..7f8b2fbd7 100644 --- a/collector/pg_replication.go +++ b/collector/pg_replication.go @@ -51,6 +51,15 @@ var ( "Indicates if the server is a replica", []string{}, nil, ) + pgReplicationLastReplay = prometheus.NewDesc( + prometheus.BuildFQName( + namespace, + replicationSubsystem, + "last_replay_seconds", + ), + "Age of last replay in seconds", + []string{}, nil, + ) pgReplicationQuery = `SELECT CASE @@ -61,7 +70,8 @@ var ( CASE WHEN pg_is_in_recovery() THEN 1 ELSE 0 - END as is_replica` + END as is_replica, + GREATEST (0, EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp()))) as last_replay` ) func (c *PGReplicationCollector) Update(ctx context.Context, instance *instance, ch chan<- prometheus.Metric) error { @@ -72,7 +82,8 @@ func (c *PGReplicationCollector) Update(ctx context.Context, instance *instance, var lag float64 var isReplica int64 - err := row.Scan(&lag, &isReplica) + var replayAge float64 + err := row.Scan(&lag, &isReplica, &replayAge) if err != nil { return err } @@ -84,5 +95,9 @@ func (c *PGReplicationCollector) Update(ctx context.Context, instance *instance, pgReplicationIsReplica, prometheus.GaugeValue, float64(isReplica), ) + ch <- prometheus.MustNewConstMetric( + pgReplicationLastReplay, + prometheus.GaugeValue, replayAge, + ) return nil } diff --git a/collector/pg_replication_test.go b/collector/pg_replication_test.go index b6df698e3..a48e9fd69 100644 --- a/collector/pg_replication_test.go +++ b/collector/pg_replication_test.go @@ -31,9 +31,9 @@ func TestPgReplicationCollector(t *testing.T) { inst := &instance{db: db} - columns := []string{"lag", "is_replica"} + columns := []string{"lag", "is_replica", "last_replay"} rows := sqlmock.NewRows(columns). - AddRow(1000, 1) + AddRow(1000, 1, 3) mock.ExpectQuery(sanitizeQuery(pgReplicationQuery)).WillReturnRows(rows) ch := make(chan prometheus.Metric) @@ -49,6 +49,7 @@ func TestPgReplicationCollector(t *testing.T) { expected := []MetricResult{ {labels: labelMap{}, value: 1000, metricType: dto.MetricType_GAUGE}, {labels: labelMap{}, value: 1, metricType: dto.MetricType_GAUGE}, + {labels: labelMap{}, value: 3, metricType: dto.MetricType_GAUGE}, } convey.Convey("Metrics comparison", t, func() {