fix: use the proper views for pgmonitor-extension queries. doc update…

…. new table stat metric
CrunchyData · Jun 6, 2024 · dabea81 · dabea81
1 parent a7ca2dc
commit dabea81
Show file tree

Hide file tree

Showing 3 changed files with 71 additions and 86 deletions.
diff --git a/hugo/content/prometheus/_index.md b/hugo/content/prometheus/_index.md
@@ -39,7 +39,10 @@ Or you can also download [Prometheus](https://prometheus.io/) and [Alertmanager]
 
 ##### Minimum Versions
 
-pgMonitor assumes to be using at least Prometheus 2.9.x. We recommend to always use the latest minor version of Prometheus.
+pgMonitor has been tested with the following versions at a minimum. Later versions should generally work. If they do not, please open an issue on our GitHub.
+
+ * Prometheus 2.49.1
+ * Alertmanager 0.26.0
 
 ##### User and Configuration Directory Installation
 
@@ -118,10 +121,10 @@ The below files dictate how Prometheus and Alertmanager will behave at runtime f
 
 | File                                     | Instructions |
 |------------------------------------------|--------------|
-| /etc/prometheus/crunchy-prometheus.yml | Modify to set scrape interval if different from the default of 30s. Activate alert rules and Alertmanager by uncommenting lines when set as needed. Activate blackbox_exporter monitoring if desired. Service file provided by pgMonitor expects config file to be named "crunchy-prometheus.yml" |
-| /etc/prometheus/crunchy-alertmanager.yml | Setup alert target (e.g., SMTP, SMS, etc.), receiver and route information. Service file provided by pgMonitor expects config file to be named "crunchy-alertmanager.yml" |
-| /etc/prometheus/alert-ruled.d/crunchy-alert-rules-\*.yml.example | Update rules as needed and remove ".example" suffix. Prometheus config provided by pgmonitor expects ".yml" files to be located in "/etc/prometheus/alert-rules.d/" |
-| /etc/prometheus/auto.d/*.yml | You will need at least one file with a final ".yml" extension. Copy the example files to create as many additional targets as needed.  Ensure the configuration files you want to use do not end in ".yml.example" but only with ".yml". Note that in order to use the provided Grafana dashboards, the extra "exp_type" label must be applied to all targets and be set appropriately (pg or node). Also, PostgreSQL targets make use of the "cluster_name" variable and should be given a relevant value so all systems (primary & replicas) can be related to each other when needed (Grafana dashboards, etc). See the example target files provided for how to set the labels for postgres or node exporter targets. |
+| /etc/prometheus/crunchy-prometheus.yml | Main configuration file for Prometheus to set things like scrape intervals and alerting. blackbox_exporter monitoring can also be enabled if desired. Service file provided by pgMonitor expects config file to be named "crunchy-prometheus.yml". For full configuration options, please see the [Prometheus upstream documentation](https://prometheus.io/docs/prometheus/latest/configuration/configuration/) |
+| /etc/prometheus/crunchy-alertmanager.yml | Setup alert target (e.g., SMTP, SMS, etc.), receiver and route information. Service file provided by pgMonitor expects config file to be named "crunchy-alertmanager.yml". For full configuration options please see the [Alertmanager upstream documentation](https://prometheus.io/docs/alerting/latest/configuration/) |
+| /etc/prometheus/alert-rules.d/crunchy-alert-rules-\*.yml.example | Update rules as needed and remove ".example" suffix. Prometheus config provided by pgMonitor expects ".yml" files to be located in "/etc/prometheus/alert-rules.d/". Additional information on configuring alert rules can be found in the [alert rules upstream documentation](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/). |
+| /etc/prometheus/auto.d/*.yml | You will need at least one file with a final ".yml" extension. Copy the example files to create as many additional targets as needed.  Ensure the configuration files you want to use do not end in ".yml.example" but only with ".yml". Note that in order to use the provided Grafana dashboards, the extra "exp_type" label must be applied to all targets and be set appropriately (pg, node, etcd, pgbouncer, etc). Also, PostgreSQL targets make use of the "cluster_name" variable and should be given a relevant value so all systems (primary & replicas) can be related to each other when needed (Grafana dashboards, etc). See the example target files provided for how to set the labels for postgres or node exporter targets. |
 
 #### Blackbox Exporter
 

diff --git a/sql_exporter/common/crunchy_global_collector.yml b/sql_exporter/common/crunchy_global_collector.yml
@@ -362,32 +362,24 @@ queries:
 
   - query_name: ccp_archive_command_status
     query: |
-      SELECT CASE 
-          WHEN EXTRACT(epoch from (last_failed_time - last_archived_time)) IS NULL THEN 0
-          WHEN EXTRACT(epoch from (last_failed_time - last_archived_time)) < 0 THEN 0
-          ELSE EXTRACT(epoch from (last_failed_time - last_archived_time)) 
-          END AS seconds_since_last_fail
-      , EXTRACT(epoch from (CURRENT_TIMESTAMP - last_archived_time)) AS seconds_since_last_archive
+      SELECT  seconds_since_last_fail
+      , seconds_since_last_archive
       , archived_count
       , failed_count
-      FROM pg_catalog.pg_stat_archiver
+      FROM pgmonitor_ext.ccp_archive_command_status
 
 
   - query_name: ccp_connection_stats
     query: |
-      SELECT ((total - idle) - idle_in_txn) AS active
+      SELECT active
         , total
         , idle
         , idle_in_txn
-        , (select coalesce(extract(epoch from (max(clock_timestamp() - state_change))),0) from pg_catalog.pg_stat_activity where state = 'idle in transaction') AS max_idle_in_txn_time
-        , (select coalesce(extract(epoch from (max(clock_timestamp() - query_start))),0) from pg_catalog.pg_stat_activity where backend_type = 'client backend' AND state NOT LIKE 'idle%' ) AS max_query_time
-        , (select coalesce(extract(epoch from (max(clock_timestamp() - query_start))),0) from pg_catalog.pg_stat_activity where backend_type = 'client backend' and wait_event_type = 'Lock' ) AS max_blocked_query_time
+        , max_idle_in_txn_time
+        , max_query_time
+        , max_blocked_query_time
         , max_connections
-        FROM (
-                SELECT count(*) AS total
-                        , COALESCE(SUM(CASE WHEN state = 'idle' THEN 1 ELSE 0 END),0) AS idle
-                        , COALESCE(SUM(CASE WHEN state = 'idle in transaction' THEN 1 ELSE 0 END),0) AS idle_in_txn FROM pg_catalog.pg_stat_activity) x
-        JOIN (SELECT setting::float AS max_connections FROM pg_settings WHERE name = 'max_connections') xx ON (true)
+        FROM pgmonitor_ext.ccp_connection_stats
 
 
   - query_name: ccp_database_size
@@ -399,8 +391,8 @@ queries:
 
   - query_name: ccp_is_in_recovery
     query: |
-      SELECT CASE WHEN pg_is_in_recovery = true THEN 1 ELSE 2 END AS status
-      FROM pg_is_in_recovery()
+      SELECT status
+      FROM pgmonitor_ext.ccp_pg_is_in_recovery
 
 
   - query_name: ccp_locks
@@ -419,53 +411,48 @@ queries:
   - query_name: ccp_pg_settings_checksum
     query: |
       SELECT pgmonitor_ext.pg_settings_checksum() AS status
-   
+
 
   - query_name: ccp_postgresql_version
     query: |
-      SELECT current_setting('server_version_num')::int AS current
+      SELECT current
+      FROM pgmonitor_ext.ccp_postgresql_version
 
 
   - query_name: ccp_postmaster_runtime
     query: |
-      SELECT extract('epoch' from pg_postmaster_start_time) as start_time_seconds from pg_catalog.pg_postmaster_start_time()
+      SELECT start_time_seconds
+      FROM pgmonitor_ext.ccp_postmaster_runtime
 
 
   - query_name: ccp_postmaster_uptime
     query: |
-      SELECT extract(epoch from (clock_timestamp() - pg_postmaster_start_time() )) AS seconds
+      SELECT seconds
+      FROM pgmonitor_ext.ccp_postmaster_uptime
 
 
   - query_name: ccp_replication_lag
     query: |
-      SELECT
-           CASE
-           WHEN (pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn()) OR (pg_is_in_recovery() = false) THEN 0
-           ELSE EXTRACT (EPOCH FROM clock_timestamp() - pg_last_xact_replay_timestamp())::INTEGER
-           END
-        AS replay_time
-      ,  CASE
-           WHEN pg_is_in_recovery() = false THEN 0
-           ELSE EXTRACT (EPOCH FROM clock_timestamp() - pg_last_xact_replay_timestamp())::INTEGER
-           END
-        AS received_time
+      SELECT replay_time
+      , received_time
+      FROM pgmonitor_ext.ccp_replication_lag
 
 
   - query_name: ccp_replication_lag_size
     query: |
-      SELECT client_addr AS replica
-        , client_hostname AS replica_hostname
-        , client_port AS replica_port
-        , pg_wal_lsn_diff(sent_lsn, replay_lsn) AS bytes 
-      FROM pg_catalog.pg_stat_replication
+      SELECT replica
+        , replica_hostname
+        , replica_port
+        , bytes 
+      FROM pgmonitor_ext.ccp_replication_lag_size
 
 
   - query_name: ccp_replication_slots
     query: |
       SELECT slot_name
-        , active::int
-        , pg_wal_lsn_diff(CASE WHEN pg_is_in_recovery() THEN pg_last_wal_replay_lsn() ELSE pg_current_wal_insert_lsn() END, restart_lsn) AS retained_bytes
-      FROM pg_catalog.pg_replication_slots
+        , active
+        , retained_bytes
+      FROM pgmonitor_ext.ccp_replication_slots
 
 
   - query_name: ccp_sequence_exhaustion
@@ -475,7 +462,8 @@ queries:
 
   - query_name: ccp_settings_pending_restart
     query: |
-      SELECT count(*) AS count FROM pg_catalog.pg_settings WHERE pending_restart = true
+      SELECT count
+      FROM pgmonitor_ext.ccp_settings_pending_restart
 
 
   - query_name: ccp_stat_bgwriter
@@ -495,50 +483,33 @@ queries:
 
   - query_name: ccp_stat_database
     query: |
-      SELECT d.datname AS dbname
-        , s.xact_commit
-        , s.xact_rollback
-        , s.blks_read
-        , s.blks_hit
-        , s.tup_returned
-        , s.tup_fetched
-        , s.tup_inserted
-        , s.tup_updated
-        , s.tup_deleted
-        , s.conflicts
-        , s.temp_files
-        , s.temp_bytes
-        , s.deadlocks
-      FROM pg_catalog.pg_stat_database s
-      JOIN pg_catalog.pg_database d ON d.datname = s.datname
-      WHERE d.datistemplate = false
+      SELECT dbname
+        , xact_commit
+        , xact_rollback
+        , blks_read
+        , blks_hit
+        , tup_returned
+        , tup_fetched
+        , tup_inserted
+        , tup_updated
+        , tup_deleted
+        , conflicts
+        , temp_files
+        , temp_bytes
+        , deadlocks
+      FROM pgmonitor_ext.ccp_stat_database
 
 
   - query_name: ccp_transaction_wraparound
     query: |
-      WITH max_age AS (
-        SELECT 2000000000 as max_old_xid, setting AS autovacuum_freeze_max_age FROM pg_catalog.pg_settings WHERE name = 'autovacuum_freeze_max_age'
-      )
-        , per_database_stats AS (
-            SELECT datname
-              , m.max_old_xid::int
-              , m.autovacuum_freeze_max_age::int
-              , age(d.datfrozenxid) AS oldest_current_xid
-            FROM pg_catalog.pg_database d
-            JOIN max_age m ON (true) WHERE d.datallowconn
-          )
-      SELECT max(oldest_current_xid) AS oldest_current_xid
-        , max(ROUND(100*(oldest_current_xid/max_old_xid::float))) AS percent_towards_wraparound
-        , max(ROUND(100*(oldest_current_xid/autovacuum_freeze_max_age::float))) AS percent_towards_emergency_autovac
-      FROM per_database_stats
-  
+      SELECT oldest_current_xid
+        , percent_towards_wraparound
+        , percent_towards_emergency_autovac
+      FROM pgmonitor_ext.ccp_transaction_wraparound
+
 
   - query_name: ccp_wal_activity
     query: |
       SELECT last_5_min_size_bytes
-        , (SELECT COALESCE(sum(size),0) FROM pg_catalog.pg_ls_waldir()) AS total_size_bytes
-      FROM (SELECT COALESCE(sum(size),0) AS last_5_min_size_bytes
-              FROM pg_catalog.pg_ls_waldir()
-              WHERE modification > CURRENT_TIMESTAMP - '5 minutes'::interval) x
-
-
+        , total_size_bytes
+      FROM pgmonitor_ext.ccp_wal_activity
diff --git a/sql_exporter/common/crunchy_per_db_collector.yml b/sql_exporter/common/crunchy_per_db_collector.yml
@@ -92,6 +92,16 @@ metrics:
      - relname
    query_ref: ccp_stat_user_tables
 
+ - metric_name: ccp_stat_user_tables_n_tup_newpage_upd
+   type: gauge
+   help: "Number of rows updated where the successor version goes onto a new heap page, leaving behind an original version with a t_ctid field that points to a different heap page. These are always non-HOT updates."
+   values: [n_tup_newpage_upd]
+   key_labels:
+     - dbname
+     - schemaname
+     - relname
+   query_ref: ccp_stat_user_tables
+
  - metric_name: ccp_stat_user_tables_n_live_tup
    type: gauge
    help: "Estimated number of live rows"
@@ -206,6 +216,7 @@ queries:
         , n_tup_upd
         , n_tup_del
         , n_tup_hot_upd
+        , n_tup_newpage_upd
         , n_live_tup
         , n_dead_tup
         , vacuum_count