From 38c54276cb803a5603451d233292687d38dff362 Mon Sep 17 00:00:00 2001 From: Joaquin Casares Date: Thu, 7 Jul 2022 17:54:44 +0000 Subject: [PATCH 01/23] lint --- .../src/utils/prometheus_metric.py | 90 ++++++++++--------- discovery-provider/src/utils/redis_metrics.py | 13 ++- 2 files changed, 56 insertions(+), 47 deletions(-) diff --git a/discovery-provider/src/utils/prometheus_metric.py b/discovery-provider/src/utils/prometheus_metric.py index 3586496188b..c5179930d69 100644 --- a/discovery-provider/src/utils/prometheus_metric.py +++ b/discovery-provider/src/utils/prometheus_metric.py @@ -17,15 +17,15 @@ def decorator(func): @wraps(func) def wrapper(*args, **kwargs): histogram_metric = PrometheusMetric( - f"{metric_group}_completed_duration_seconds", - f"How long a {metric_group} took to complete", - ("func_name", "success"), + PrometheusRegistry[ + PrometheusMetricNames.CELERY_TASK_COMPLETED_DURATION_SECONDS + ] ) + gauge_metric = PrometheusMetric( - f"{metric_group}_last_duration_seconds", - f"How long the last {metric_group} ran", - ("func_name", "success"), - metric_type=PrometheusType.GAUGE, + PrometheusRegistry[ + PrometheusMetricNames.CELERY_TASK_LAST_DURATION_SECONDS + ] ) try: # safely return this result under all circumstances @@ -70,42 +70,52 @@ class PrometheusType: GAUGE = "gauge" +METRIC_PREFIX = "audius_dn" + + +class PrometheusMetricNames: + FLASK_ROUTE_LATENCY_SECONDS = "flask_route_latency_seconds" + CELERY_TASK_COMPLETED_DURATION_SECONDS = "celery_task_completed_duration_seconds" + CELERY_TASK_LAST_DURATION_SECONDS = "celery_task_last_duration_seconds" + + +PrometheusRegistry = { + PrometheusMetricNames.FLASK_ROUTE_LATENCY_SECONDS: Histogram( + f"{METRIC_PREFIX}_{PrometheusMetricNames.FLASK_ROUTE_LATENCY_SECONDS}", + "Runtimes for flask routes", + ( + "route", + "code", + ), + ), + PrometheusMetricNames.CELERY_TASK_COMPLETED_DURATION_SECONDS: Histogram( + f"{METRIC_PREFIX}_{PrometheusMetricNames.CELERY_TASK_COMPLETED_DURATION_SECONDS}", + "How 
long a celery_task took to complete", + ( + "func_name", + "success", + ), + ), + PrometheusMetricNames.CELERY_TASK_LAST_DURATION_SECONDS: Gauge( + f"{METRIC_PREFIX}_{PrometheusMetricNames.CELERY_TASK_LAST_DURATION_SECONDS}", + "How long the last celery_task ran", + ( + "func_name", + "success", + ), + ), +} + + class PrometheusMetric: - histograms: Dict[str, Histogram] = {} - gauges: Dict[str, Gauge] = {} registered_collectors: Dict[str, Callable] = {} - def __init_metric( - self, name, description, labelnames, collection, prometheus_metric_cls - ): - if name not in collection: - collection[name] = prometheus_metric_cls( - name, description, labelnames=labelnames - ) - self.metric = collection[name] - - def __init__( - self, name, description, labelnames=(), metric_type=PrometheusType.HISTOGRAM - ): + def __init__(self, name): self.reset_timer() - # set metric prefix of audius_project_ - name = f"audius_dn_{name}" - - # CollectorRegistries must be uniquely named - # NOTE: we only set labelnames once. - # unsure if overloading is supported. 
- self.metric_type = metric_type - if self.metric_type == PrometheusType.HISTOGRAM: - self.__init_metric( - name, description, labelnames, PrometheusMetric.histograms, Histogram - ) - elif self.metric_type == PrometheusType.GAUGE: - self.__init_metric( - name, description, labelnames, PrometheusMetric.gauges, Gauge - ) - else: - raise TypeError(f"metric_type '{self.metric_type}' not found") + if name not in PrometheusRegistry: + raise TypeError(f"Metric name '{name}' not found") + self.metric = PrometheusRegistry[name] def reset_timer(self): self.start_time = time() @@ -123,9 +133,9 @@ def save(self, value, labels=None): if labels: this_metric = this_metric.labels(**labels) - if self.metric_type == PrometheusType.HISTOGRAM: + if isinstance(self.metric, Histogram): this_metric.observe(value) - elif self.metric_type == PrometheusType.GAUGE: + elif isinstance(self.metric, Gauge): this_metric.set(value) @classmethod diff --git a/discovery-provider/src/utils/redis_metrics.py b/discovery-provider/src/utils/redis_metrics.py index a07c3510bb9..227c43c7661 100644 --- a/discovery-provider/src/utils/redis_metrics.py +++ b/discovery-provider/src/utils/redis_metrics.py @@ -26,7 +26,11 @@ ) from src.utils.config import shared_config from src.utils.helpers import get_ip, redis_get_or_restore, redis_set_and_dump -from src.utils.prometheus_metric import PrometheusMetric +from src.utils.prometheus_metric import ( + PrometheusMetric, + PrometheusMetricNames, + PrometheusRegistry, +) from src.utils.query_params import app_name_param, stringify_query_params from werkzeug.wrappers.response import Response as wResponse @@ -653,12 +657,7 @@ def wrap(*args, **kwargs): logger.error("Error while recording metrics: %s", e.message) metric = PrometheusMetric( - "flask_route_latency_seconds", - "Runtimes for flask routes", - ( - "route", - "code", - ), + PrometheusRegistry[PrometheusMetricNames.FLASK_ROUTE_LATENCY_SECONDS] ) result = func(*args, **kwargs) From 
726a61fe851ba86cf06e437926e47ee448cd795d Mon Sep 17 00:00:00 2001 From: Joaquin Casares Date: Thu, 7 Jul 2022 23:29:31 +0000 Subject: [PATCH 02/23] various changes --- .../src/queries/get_celery_tasks.py | 11 ++--- discovery-provider/src/queries/get_health.py | 14 +++--- discovery-provider/src/tasks/index_metrics.py | 18 ++++---- .../src/tasks/index_trending.py | 13 +++--- discovery-provider/src/tasks/users.py | 14 +++--- .../src/utils/prometheus_metric.py | 44 ++++++++++++++++--- 6 files changed, 74 insertions(+), 40 deletions(-) diff --git a/discovery-provider/src/queries/get_celery_tasks.py b/discovery-provider/src/queries/get_celery_tasks.py index dcf19c71fd1..6c5ae4f2dca 100644 --- a/discovery-provider/src/queries/get_celery_tasks.py +++ b/discovery-provider/src/queries/get_celery_tasks.py @@ -3,7 +3,11 @@ import pytz from src.monitors import monitor_names, monitors -from src.utils.prometheus_metric import PrometheusMetric, PrometheusType +from src.utils.prometheus_metric import ( + PrometheusMetric, + PrometheusMetricNames, + PrometheusRegistry, +) logger = logging.getLogger(__name__) MONITORS = monitors.MONITORS @@ -33,10 +37,7 @@ def celery_tasks_prometheus_exporter(): registered_tasks = all_tasks["registered_celery_tasks"] metric = PrometheusMetric( - "celery_task_active_duration_seconds", - "How long the currently running celery task has been running", - labelnames=["task_name"], - metric_type=PrometheusType.GAUGE, + PrometheusRegistry[PrometheusMetricNames.CELERY_TASK_ACTIVE_DURATION_SECONDS] ) active_task_names = [] diff --git a/discovery-provider/src/queries/get_health.py b/discovery-provider/src/queries/get_health.py index 8048fc5d0d8..042892f6efc 100644 --- a/discovery-provider/src/queries/get_health.py +++ b/discovery-provider/src/queries/get_health.py @@ -24,7 +24,11 @@ from src.utils.config import shared_config from src.utils.elasticdsl import ES_INDEXES, esclient from src.utils.helpers import redis_get_or_restore, redis_set_and_dump -from 
src.utils.prometheus_metric import PrometheusMetric, PrometheusType +from src.utils.prometheus_metric import ( + PrometheusMetric, + PrometheusMetricNames, + PrometheusRegistry, +) from src.utils.redis_constants import ( LAST_REACTIONS_INDEX_TIME_KEY, LAST_SEEN_NEW_REACTION_TIME_KEY, @@ -463,15 +467,11 @@ def health_check_prometheus_exporter(): health_results, is_unhealthy = get_health({}) PrometheusMetric( - "health_check_block_difference_current", - "Difference between the latest block and the latest indexed block", - metric_type=PrometheusType.GAUGE, + PrometheusRegistry[PrometheusMetricNames.HEALTH_CHECK_BLOCK_DIFFERENCE_CURRENT] ).save(health_results["block_difference"]) PrometheusMetric( - "health_check_latest_indexed_block_num_current", - "Latest indexed block number", - metric_type=PrometheusType.GAUGE, + PrometheusRegistry[PrometheusMetricNames.HEALTH_CHECK_LATEST_INDEXED_BLOCK_NUM_CURRENT] ).save(health_results["web"]["blocknumber"]) diff --git a/discovery-provider/src/tasks/index_metrics.py b/discovery-provider/src/tasks/index_metrics.py index 241f550d6e5..8d0a1befb7c 100644 --- a/discovery-provider/src/tasks/index_metrics.py +++ b/discovery-provider/src/tasks/index_metrics.py @@ -15,7 +15,11 @@ from src.tasks.celery_app import celery from src.utils.get_all_other_nodes import get_all_other_nodes from src.utils.helpers import redis_get_or_restore, redis_set_and_dump -from src.utils.prometheus_metric import PrometheusMetric, save_duration_metric +from src.utils.prometheus_metric import ( + PrometheusMetric, + PrometheusMetricNames, + PrometheusRegistry, +) from src.utils.redis_metrics import ( METRICS_INTERVAL, datetime_format_secondary, @@ -414,9 +418,7 @@ def update_metrics(self): f"index_metrics.py | update_metrics | {self.request.id} | Acquired update_metrics_lock" ) metric = PrometheusMetric( - "index_metrics_duration_seconds", - "Runtimes for src.task.index_metrics:celery.task()", - ("task_name",), + 
PrometheusRegistry[PrometheusMetricNames.INDEX_METRICS_DURATION_SECONDS] ) sweep_metrics(db, redis) refresh_metrics_matviews(db) @@ -458,9 +460,7 @@ def aggregate_metrics(self): f"index_metrics.py | aggregate_metrics | {self.request.id} | Acquired aggregate_metrics_lock" ) metric = PrometheusMetric( - "index_metrics_duration_seconds", - "Runtimes for src.task.index_metrics:celery.task()", - ("task_name",), + PrometheusRegistry[PrometheusMetricNames.INDEX_METRICS_DURATION_SECONDS] ) consolidate_metrics_from_other_nodes(self, db, redis) metric.save_time({"task_name": "aggregate_metrics"}) @@ -503,9 +503,7 @@ def synchronize_metrics(self): f"index_metrics.py | synchronize_metrics | {self.request.id} | Acquired synchronize_metrics_lock" ) metric = PrometheusMetric( - "index_metrics_duration_seconds", - "Runtimes for src.task.index_metrics:celery.task()", - ("task_name",), + PrometheusRegistry[PrometheusMetricNames.INDEX_METRICS_DURATION_SECONDS] ) synchronize_all_node_metrics(self, db) metric.save_time({"task_name": "synchronize_metrics"}) diff --git a/discovery-provider/src/tasks/index_trending.py b/discovery-provider/src/tasks/index_trending.py index 4ac5af8bdde..5ac31fafbe2 100644 --- a/discovery-provider/src/tasks/index_trending.py +++ b/discovery-provider/src/tasks/index_trending.py @@ -20,7 +20,11 @@ from src.trending_strategies.trending_strategy_factory import TrendingStrategyFactory from src.trending_strategies.trending_type_and_version import TrendingType from src.utils.config import shared_config -from src.utils.prometheus_metric import PrometheusMetric, save_duration_metric +from src.utils.prometheus_metric import ( + PrometheusMetric, + PrometheusMetricNames, + PrometheusRegistry, +) from src.utils.redis_cache import set_json_cached_key from src.utils.redis_constants import trending_tracks_last_completion_redis_key from src.utils.session_manager import SessionManager @@ -105,9 +109,7 @@ def get_genres(session: Session) -> List[str]: def update_view(session: 
Session, mat_view_name: str): start_time = time.time() metric = PrometheusMetric( - "update_trending_view_duration_seconds", - "Runtimes for src.task.index_trending:update_view()", - ("mat_view_name",), + PrometheusRegistry[PrometheusMetricNames.UPDATE_TRENDING_VIEW_DURATION_SECONDS] ) session.execute(f"REFRESH MATERIALIZED VIEW {mat_view_name}") update_time = time.time() - start_time @@ -126,8 +128,7 @@ def index_trending(self, db: SessionManager, redis: Redis, timestamp): logger.info("index_trending.py | starting indexing") update_start = time.time() metric = PrometheusMetric( - "index_trending_duration_seconds", - "Runtimes for src.task.index_trending:index_trending()", + PrometheusRegistry[PrometheusMetricNames.INDEX_TRENDING_DURATION_SECONDS] ) with db.scoped_session() as session: genres = get_genres(session) diff --git a/discovery-provider/src/tasks/users.py b/discovery-provider/src/tasks/users.py index 087bc2363f3..853b2ace371 100644 --- a/discovery-provider/src/tasks/users.py +++ b/discovery-provider/src/tasks/users.py @@ -21,7 +21,11 @@ from src.utils import helpers from src.utils.indexing_errors import EntityMissingRequiredFieldError, IndexingError from src.utils.model_nullable_validator import all_required_fields_present -from src.utils.prometheus_metric import PrometheusMetric +from src.utils.prometheus_metric import ( + PrometheusMetric, + PrometheusMetricNames, + PrometheusRegistry, +) from src.utils.user_event_constants import user_event_types_arr, user_event_types_lookup logger = logging.getLogger(__name__) @@ -41,9 +45,7 @@ def user_state_update( """Return tuple containing int representing number of User model state changes found in transaction and set of processed user IDs.""" begin_user_state_update = datetime.now() metric = PrometheusMetric( - "user_state_update_duration_seconds", - "Runtimes for src.task.users:user_state_update()", - ("scope",), + PrometheusRegistry[PrometheusMetricNames.USER_STATE_UPDATE_DURATION_SECONDS] ) blockhash = 
update_task.web3.toHex(block_hash) @@ -150,9 +152,7 @@ def process_user_txs_serial( skipped_tx_count, ): metric = PrometheusMetric( - "user_state_update_duration_seconds", - "Runtimes for src.task.users:user_state_update()", - ("scope",), + PrometheusRegistry[PrometheusMetricNames.USER_STATE_UPDATE_DURATION_SECONDS] ) processed_entries = 0 for user_tx in user_txs: diff --git a/discovery-provider/src/utils/prometheus_metric.py b/discovery-provider/src/utils/prometheus_metric.py index c5179930d69..5bc9d3f21d6 100644 --- a/discovery-provider/src/utils/prometheus_metric.py +++ b/discovery-provider/src/utils/prometheus_metric.py @@ -65,11 +65,6 @@ def wrapper(*args, **kwargs): return decorator -class PrometheusType: - HISTOGRAM = "histogram" - GAUGE = "gauge" - - METRIC_PREFIX = "audius_dn" @@ -77,6 +72,13 @@ class PrometheusMetricNames: FLASK_ROUTE_LATENCY_SECONDS = "flask_route_latency_seconds" CELERY_TASK_COMPLETED_DURATION_SECONDS = "celery_task_completed_duration_seconds" CELERY_TASK_LAST_DURATION_SECONDS = "celery_task_last_duration_seconds" + CELERY_TASK_ACTIVE_DURATION_SECONDS = "celery_task_active_duration_seconds" + HEALTH_CHECK_BLOCK_DIFFERENCE_CURRENT = "health_check_block_difference_current" + HEALTH_CHECK_LATEST_INDEXED_BLOCK_NUM_CURRENT = "health_check_latest_indexed_block_num_current" + UPDATE_TRENDING_VIEW_DURATION_SECONDS = "update_trending_view_duration_seconds" + INDEX_TRENDING_DURATION_SECONDS = "index_trending_duration_seconds" + INDEX_METRICS_DURATION_SECONDS = "index_metrics_duration_seconds" + USER_STATE_UPDATE_DURATION_SECONDS = "user_state_update_duration_seconds" PrometheusRegistry = { @@ -104,6 +106,38 @@ class PrometheusMetricNames: "success", ), ), + PrometheusMetricNames.CELERY_TASK_ACTIVE_DURATION_SECONDS: Gauge( + f"{METRIC_PREFIX}_{PrometheusMetricNames.CELERY_TASK_ACTIVE_DURATION_SECONDS}", + "How long the currently running celery task has been running", + ("task_name",), + ), + 
PrometheusMetricNames.HEALTH_CHECK_BLOCK_DIFFERENCE_CURRENT: Gauge( + f"{METRIC_PREFIX}_{PrometheusMetricNames.HEALTH_CHECK_BLOCK_DIFFERENCE_CURRENT}", + "Difference between the latest block and the latest indexed block", + ), + PrometheusMetricNames.HEALTH_CHECK_LATEST_INDEXED_BLOCK_NUM_CURRENT: Gauge( + f"{METRIC_PREFIX}_{PrometheusMetricNames.HEALTH_CHECK_LATEST_INDEXED_BLOCK_NUM_CURRENT}", + "Latest indexed block number", + ), + PrometheusMetricNames.UPDATE_TRENDING_VIEW_DURATION_SECONDS: Histogram( + f"{METRIC_PREFIX}_{PrometheusMetricNames.UPDATE_TRENDING_VIEW_DURATION_SECONDS}", + "Runtimes for src.task.index_trending:update_view()", + ("mat_view_name",), + ), + PrometheusMetricNames.INDEX_TRENDING_DURATION_SECONDS: Histogram( + f"{METRIC_PREFIX}_{PrometheusMetricNames.INDEX_TRENDING_DURATION_SECONDS}", + "Runtimes for src.task.index_trending:index_trending()", + ), + PrometheusMetricNames.INDEX_METRICS_DURATION_SECONDS: Histogram( + f"{METRIC_PREFIX}_{PrometheusMetricNames.INDEX_METRICS_DURATION_SECONDS}", + "Runtimes for src.task.index_metrics:celery.task()", + ("task_name",), + ), + PrometheusMetricNames.USER_STATE_UPDATE_DURATION_SECONDS: Histogram( + f"{METRIC_PREFIX}_{PrometheusMetricNames.USER_STATE_UPDATE_DURATION_SECONDS}", + "Runtimes for src.task.users:user_state_update()", + ("scope",), + ), } From 328e8c3ca5c3cb35741a77f6fbff535d19518b08 Mon Sep 17 00:00:00 2001 From: Joaquin Casares Date: Thu, 7 Jul 2022 23:29:53 +0000 Subject: [PATCH 03/23] perfect example of previous changes --- discovery-provider/src/tasks/tracks.py | 10 ++++++---- discovery-provider/src/utils/prometheus_metric.py | 6 ++++++ 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/discovery-provider/src/tasks/tracks.py b/discovery-provider/src/tasks/tracks.py index 0304619f04c..cc48ebffd1e 100644 --- a/discovery-provider/src/tasks/tracks.py +++ b/discovery-provider/src/tasks/tracks.py @@ -18,7 +18,11 @@ from src.utils import helpers, multihash from 
src.utils.indexing_errors import EntityMissingRequiredFieldError, IndexingError from src.utils.model_nullable_validator import all_required_fields_present -from src.utils.prometheus_metric import PrometheusMetric +from src.utils.prometheus_metric import ( + PrometheusMetric, + PrometheusMetricNames, + PrometheusRegistry, +) from src.utils.track_event_constants import ( track_event_types_arr, track_event_types_lookup, @@ -41,9 +45,7 @@ def track_state_update( """Return tuple containing int representing number of Track model state changes found in transaction and set of processed track IDs.""" begin_track_state_update = datetime.now() metric = PrometheusMetric( - "track_state_update_duration_seconds", - "Runtimes for src.task.tracks:track_state_update()", - ("scope",), + PrometheusRegistry[PrometheusMetricNames.TRACK_STATE_UPDATE_DURATION_SECONDS] ) blockhash = update_task.web3.toHex(block_hash) diff --git a/discovery-provider/src/utils/prometheus_metric.py b/discovery-provider/src/utils/prometheus_metric.py index 5bc9d3f21d6..7bd707243e9 100644 --- a/discovery-provider/src/utils/prometheus_metric.py +++ b/discovery-provider/src/utils/prometheus_metric.py @@ -79,6 +79,7 @@ class PrometheusMetricNames: INDEX_TRENDING_DURATION_SECONDS = "index_trending_duration_seconds" INDEX_METRICS_DURATION_SECONDS = "index_metrics_duration_seconds" USER_STATE_UPDATE_DURATION_SECONDS = "user_state_update_duration_seconds" + TRACK_STATE_UPDATE_DURATION_SECONDS = "track_state_update_duration_seconds" PrometheusRegistry = { @@ -138,6 +139,11 @@ class PrometheusMetricNames: "Runtimes for src.task.users:user_state_update()", ("scope",), ), + PrometheusMetricNames.TRACK_STATE_UPDATE_DURATION_SECONDS: Histogram( + f"{METRIC_PREFIX}_{PrometheusMetricNames.TRACK_STATE_UPDATE_DURATION_SECONDS}", + "Runtimes for src.task.tracks:track_state_update()", + ("scope",), + ), } From 0897b0761bdaff924c48fb99dbc4ec2e86cdef67 Mon Sep 17 00:00:00 2001 From: Joaquin Casares Date: Thu, 7 Jul 2022 23:32:42 
+0000 Subject: [PATCH 04/23] lint --- discovery-provider/src/queries/get_health.py | 4 +++- discovery-provider/src/tasks/index_metrics.py | 1 + discovery-provider/src/tasks/index_trending.py | 1 + discovery-provider/src/utils/prometheus_metric.py | 4 +++- 4 files changed, 8 insertions(+), 2 deletions(-) diff --git a/discovery-provider/src/queries/get_health.py b/discovery-provider/src/queries/get_health.py index 042892f6efc..2ffe3ced69e 100644 --- a/discovery-provider/src/queries/get_health.py +++ b/discovery-provider/src/queries/get_health.py @@ -471,7 +471,9 @@ def health_check_prometheus_exporter(): ).save(health_results["block_difference"]) PrometheusMetric( - PrometheusRegistry[PrometheusMetricNames.HEALTH_CHECK_LATEST_INDEXED_BLOCK_NUM_CURRENT] + PrometheusRegistry[ + PrometheusMetricNames.HEALTH_CHECK_LATEST_INDEXED_BLOCK_NUM_CURRENT + ] ).save(health_results["web"]["blocknumber"]) diff --git a/discovery-provider/src/tasks/index_metrics.py b/discovery-provider/src/tasks/index_metrics.py index 8d0a1befb7c..74848f8f0d9 100644 --- a/discovery-provider/src/tasks/index_metrics.py +++ b/discovery-provider/src/tasks/index_metrics.py @@ -19,6 +19,7 @@ PrometheusMetric, PrometheusMetricNames, PrometheusRegistry, + save_duration_metric, ) from src.utils.redis_metrics import ( METRICS_INTERVAL, diff --git a/discovery-provider/src/tasks/index_trending.py b/discovery-provider/src/tasks/index_trending.py index 5ac31fafbe2..0c947c3e305 100644 --- a/discovery-provider/src/tasks/index_trending.py +++ b/discovery-provider/src/tasks/index_trending.py @@ -24,6 +24,7 @@ PrometheusMetric, PrometheusMetricNames, PrometheusRegistry, + save_duration_metric, ) from src.utils.redis_cache import set_json_cached_key from src.utils.redis_constants import trending_tracks_last_completion_redis_key diff --git a/discovery-provider/src/utils/prometheus_metric.py b/discovery-provider/src/utils/prometheus_metric.py index 7bd707243e9..817902e0eb9 100644 --- 
a/discovery-provider/src/utils/prometheus_metric.py +++ b/discovery-provider/src/utils/prometheus_metric.py @@ -74,7 +74,9 @@ class PrometheusMetricNames: CELERY_TASK_LAST_DURATION_SECONDS = "celery_task_last_duration_seconds" CELERY_TASK_ACTIVE_DURATION_SECONDS = "celery_task_active_duration_seconds" HEALTH_CHECK_BLOCK_DIFFERENCE_CURRENT = "health_check_block_difference_current" - HEALTH_CHECK_LATEST_INDEXED_BLOCK_NUM_CURRENT = "health_check_latest_indexed_block_num_current" + HEALTH_CHECK_LATEST_INDEXED_BLOCK_NUM_CURRENT = ( + "health_check_latest_indexed_block_num_current" + ) UPDATE_TRENDING_VIEW_DURATION_SECONDS = "update_trending_view_duration_seconds" INDEX_TRENDING_DURATION_SECONDS = "index_trending_duration_seconds" INDEX_METRICS_DURATION_SECONDS = "index_metrics_duration_seconds" From c62323d9864a133092bfc7d976275cc9815252b4 Mon Sep 17 00:00:00 2001 From: Joaquin Casares Date: Thu, 7 Jul 2022 23:42:30 +0000 Subject: [PATCH 05/23] Lint --- .../src/tasks/aggregates/__init__.py | 10 ++++++---- discovery-provider/src/tasks/index.py | 11 ++++++---- .../src/tasks/update_track_is_available.py | 13 ++++++++---- .../src/utils/prometheus_metric.py | 20 +++++++++++++++++++ 4 files changed, 42 insertions(+), 12 deletions(-) diff --git a/discovery-provider/src/tasks/aggregates/__init__.py b/discovery-provider/src/tasks/aggregates/__init__.py index 1d8625eb7a3..9fc52d6edea 100644 --- a/discovery-provider/src/tasks/aggregates/__init__.py +++ b/discovery-provider/src/tasks/aggregates/__init__.py @@ -7,7 +7,11 @@ from sqlalchemy import text from sqlalchemy.orm.session import Session from src.models.indexing.block import Block -from src.utils.prometheus_metric import PrometheusMetric +from src.utils.prometheus_metric import ( + PrometheusMetric, + PrometheusMetricNames, + PrometheusRegistry, +) from src.utils.update_indexing_checkpoints import ( get_last_indexed_checkpoint, save_indexed_checkpoint, @@ -75,9 +79,7 @@ def update_aggregate_table( current_checkpoint, ): metric 
= PrometheusMetric( - "update_aggregate_table_latency_seconds", - "Runtimes for src.task.aggregates:update_aggregate_table()", - ("table_name", "task_name"), + PrometheusRegistry[PrometheusMetricNames.UPDATE_AGGREGATE_TABLE_LATENCY_SECONDS] ) # get name of the caller function diff --git a/discovery-provider/src/tasks/index.py b/discovery-provider/src/tasks/index.py index bcab8b1271c..421ccc57849 100644 --- a/discovery-provider/src/tasks/index.py +++ b/discovery-provider/src/tasks/index.py @@ -50,7 +50,12 @@ sweep_old_index_blocks_ms, ) from src.utils.indexing_errors import IndexingError -from src.utils.prometheus_metric import PrometheusMetric, save_duration_metric +from src.utils.prometheus_metric import ( + PrometheusMetric, + PrometheusMetricNames, + PrometheusRegistry, + save_duration_metric, +) from src.utils.redis_cache import ( remove_cached_playlist_ids, remove_cached_track_ids, @@ -562,9 +567,7 @@ def index_blocks(self, db, blocks_list): latest_block_timestamp = None changed_entity_ids_map = {} metric = PrometheusMetric( - "index_blocks_duration_seconds", - "Runtimes for src.task.index:index_blocks()", - ("scope",), + PrometheusRegistry[PrometheusMetricNames.INDEX_BLOCKS_DURATION_SECONDS] ) for i in block_order_range: start_time = time.time() diff --git a/discovery-provider/src/tasks/update_track_is_available.py b/discovery-provider/src/tasks/update_track_is_available.py index 58adf8c7067..c1aaa2f3cc1 100644 --- a/discovery-provider/src/tasks/update_track_is_available.py +++ b/discovery-provider/src/tasks/update_track_is_available.py @@ -7,7 +7,12 @@ from src.models.tracks.track import Track from src.models.users.user import User from src.tasks.celery_app import celery -from src.utils.prometheus_metric import PrometheusMetric, save_duration_metric +from src.utils.prometheus_metric import ( + PrometheusMetric, + PrometheusMetricNames, + PrometheusRegistry, + save_duration_metric, +) from src.utils.redis_constants import ( ALL_UNAVAILABLE_TRACKS_REDIS_KEY, 
UPDATE_TRACK_IS_AVAILABLE_FINISH_REDIS_KEY, @@ -231,9 +236,9 @@ def update_track_is_available(self) -> None: have_lock = update_lock.acquire(blocking=False) if have_lock: metric = PrometheusMetric( - "update_track_is_available_duration_seconds", - "Runtimes for src.task.update_track_is_available:celery.task()", - ("task_name", "success"), + PrometheusRegistry[ + PrometheusMetricNames.UPDATE_TRACK_IS_AVAILABLE_DURATION_SECONDS + ] ) try: # TODO: we can deprecate this manual redis timestamp tracker once we confirm diff --git a/discovery-provider/src/utils/prometheus_metric.py b/discovery-provider/src/utils/prometheus_metric.py index 817902e0eb9..be79a0d4b86 100644 --- a/discovery-provider/src/utils/prometheus_metric.py +++ b/discovery-provider/src/utils/prometheus_metric.py @@ -82,6 +82,11 @@ class PrometheusMetricNames: INDEX_METRICS_DURATION_SECONDS = "index_metrics_duration_seconds" USER_STATE_UPDATE_DURATION_SECONDS = "user_state_update_duration_seconds" TRACK_STATE_UPDATE_DURATION_SECONDS = "track_state_update_duration_seconds" + INDEX_BLOCKS_DURATION_SECONDS = "index_blocks_duration_seconds" + UPDATE_TRACK_IS_AVAILABLE_DURATION_SECONDS = ( + "update_track_is_available_duration_seconds" + ) + UPDATE_AGGREGATE_TABLE_LATENCY_SECONDS = "update_aggregate_table_latency_seconds" PrometheusRegistry = { @@ -146,6 +151,21 @@ class PrometheusMetricNames: "Runtimes for src.task.tracks:track_state_update()", ("scope",), ), + PrometheusMetricNames.INDEX_BLOCKS_DURATION_SECONDS: Histogram( + f"{METRIC_PREFIX}_{PrometheusMetricNames.INDEX_BLOCKS_DURATION_SECONDS}", + "Runtimes for src.task.index:index_blocks()", + ("scope",), + ), + PrometheusMetricNames.UPDATE_TRACK_IS_AVAILABLE_DURATION_SECONDS: Histogram( + f"{METRIC_PREFIX}_{PrometheusMetricNames.UPDATE_TRACK_IS_AVAILABLE_DURATION_SECONDS}", + "Runtimes for src.task.update_track_is_available:celery.task()", + ("task_name", "success"), + ), + PrometheusMetricNames.UPDATE_AGGREGATE_TABLE_LATENCY_SECONDS: Histogram( + 
f"{METRIC_PREFIX}_{PrometheusMetricNames.UPDATE_AGGREGATE_TABLE_LATENCY_SECONDS}", + "Runtimes for src.task.aggregates:update_aggregate_table()", + ("table_name", "task_name"), + ), } From cdb4f6337b864055d2877dff1fe6bb22989f6414 Mon Sep 17 00:00:00 2001 From: Joaquin Casares Date: Thu, 7 Jul 2022 23:50:45 +0000 Subject: [PATCH 06/23] simplify usage --- .../src/queries/get_celery_tasks.py | 10 ++-------- discovery-provider/src/queries/get_health.py | 16 +++++----------- .../src/tasks/aggregates/__init__.py | 8 ++------ discovery-provider/src/tasks/index.py | 5 +---- discovery-provider/src/tasks/index_metrics.py | 7 +++---- discovery-provider/src/tasks/index_trending.py | 7 ++----- discovery-provider/src/tasks/tracks.py | 10 ++-------- .../src/tasks/update_track_is_available.py | 5 +---- discovery-provider/src/tasks/users.py | 14 +++----------- discovery-provider/src/utils/redis_metrics.py | 10 ++-------- 10 files changed, 23 insertions(+), 69 deletions(-) diff --git a/discovery-provider/src/queries/get_celery_tasks.py b/discovery-provider/src/queries/get_celery_tasks.py index 6c5ae4f2dca..d97298003b5 100644 --- a/discovery-provider/src/queries/get_celery_tasks.py +++ b/discovery-provider/src/queries/get_celery_tasks.py @@ -3,11 +3,7 @@ import pytz from src.monitors import monitor_names, monitors -from src.utils.prometheus_metric import ( - PrometheusMetric, - PrometheusMetricNames, - PrometheusRegistry, -) +from src.utils.prometheus_metric import PrometheusMetric, PrometheusMetricNames logger = logging.getLogger(__name__) MONITORS = monitors.MONITORS @@ -36,9 +32,7 @@ def celery_tasks_prometheus_exporter(): active_tasks = all_tasks["active_tasks"] registered_tasks = all_tasks["registered_celery_tasks"] - metric = PrometheusMetric( - PrometheusRegistry[PrometheusMetricNames.CELERY_TASK_ACTIVE_DURATION_SECONDS] - ) + metric = PrometheusMetric(PrometheusMetricNames.CELERY_TASK_ACTIVE_DURATION_SECONDS) active_task_names = [] for task in active_tasks: diff --git 
a/discovery-provider/src/queries/get_health.py b/discovery-provider/src/queries/get_health.py index 2ffe3ced69e..fbdafc444f9 100644 --- a/discovery-provider/src/queries/get_health.py +++ b/discovery-provider/src/queries/get_health.py @@ -24,11 +24,7 @@ from src.utils.config import shared_config from src.utils.elasticdsl import ES_INDEXES, esclient from src.utils.helpers import redis_get_or_restore, redis_set_and_dump -from src.utils.prometheus_metric import ( - PrometheusMetric, - PrometheusMetricNames, - PrometheusRegistry, -) +from src.utils.prometheus_metric import PrometheusMetric, PrometheusMetricNames from src.utils.redis_constants import ( LAST_REACTIONS_INDEX_TIME_KEY, LAST_SEEN_NEW_REACTION_TIME_KEY, @@ -466,14 +462,12 @@ def get_elasticsearch_health_info( def health_check_prometheus_exporter(): health_results, is_unhealthy = get_health({}) - PrometheusMetric( - PrometheusRegistry[PrometheusMetricNames.HEALTH_CHECK_BLOCK_DIFFERENCE_CURRENT] - ).save(health_results["block_difference"]) + PrometheusMetric(PrometheusMetricNames.HEALTH_CHECK_BLOCK_DIFFERENCE_CURRENT).save( + health_results["block_difference"] + ) PrometheusMetric( - PrometheusRegistry[ - PrometheusMetricNames.HEALTH_CHECK_LATEST_INDEXED_BLOCK_NUM_CURRENT - ] + PrometheusMetricNames.HEALTH_CHECK_LATEST_INDEXED_BLOCK_NUM_CURRENT ).save(health_results["web"]["blocknumber"]) diff --git a/discovery-provider/src/tasks/aggregates/__init__.py b/discovery-provider/src/tasks/aggregates/__init__.py index 9fc52d6edea..dacde9e64fa 100644 --- a/discovery-provider/src/tasks/aggregates/__init__.py +++ b/discovery-provider/src/tasks/aggregates/__init__.py @@ -7,11 +7,7 @@ from sqlalchemy import text from sqlalchemy.orm.session import Session from src.models.indexing.block import Block -from src.utils.prometheus_metric import ( - PrometheusMetric, - PrometheusMetricNames, - PrometheusRegistry, -) +from src.utils.prometheus_metric import PrometheusMetric, PrometheusMetricNames from 
src.utils.update_indexing_checkpoints import ( get_last_indexed_checkpoint, save_indexed_checkpoint, @@ -79,7 +75,7 @@ def update_aggregate_table( current_checkpoint, ): metric = PrometheusMetric( - PrometheusRegistry[PrometheusMetricNames.UPDATE_AGGREGATE_TABLE_LATENCY_SECONDS] + PrometheusMetricNames.UPDATE_AGGREGATE_TABLE_LATENCY_SECONDS ) # get name of the caller function diff --git a/discovery-provider/src/tasks/index.py b/discovery-provider/src/tasks/index.py index 421ccc57849..87234e1cc84 100644 --- a/discovery-provider/src/tasks/index.py +++ b/discovery-provider/src/tasks/index.py @@ -53,7 +53,6 @@ from src.utils.prometheus_metric import ( PrometheusMetric, PrometheusMetricNames, - PrometheusRegistry, save_duration_metric, ) from src.utils.redis_cache import ( @@ -566,9 +565,7 @@ def index_blocks(self, db, blocks_list): block_order_range = range(len(blocks_list) - 1, -1, -1) latest_block_timestamp = None changed_entity_ids_map = {} - metric = PrometheusMetric( - PrometheusRegistry[PrometheusMetricNames.INDEX_BLOCKS_DURATION_SECONDS] - ) + metric = PrometheusMetric(PrometheusMetricNames.INDEX_BLOCKS_DURATION_SECONDS) for i in block_order_range: start_time = time.time() metric.reset_timer() diff --git a/discovery-provider/src/tasks/index_metrics.py b/discovery-provider/src/tasks/index_metrics.py index 74848f8f0d9..3ea28277146 100644 --- a/discovery-provider/src/tasks/index_metrics.py +++ b/discovery-provider/src/tasks/index_metrics.py @@ -18,7 +18,6 @@ from src.utils.prometheus_metric import ( PrometheusMetric, PrometheusMetricNames, - PrometheusRegistry, save_duration_metric, ) from src.utils.redis_metrics import ( @@ -419,7 +418,7 @@ def update_metrics(self): f"index_metrics.py | update_metrics | {self.request.id} | Acquired update_metrics_lock" ) metric = PrometheusMetric( - PrometheusRegistry[PrometheusMetricNames.INDEX_METRICS_DURATION_SECONDS] + PrometheusMetricNames.INDEX_METRICS_DURATION_SECONDS ) sweep_metrics(db, redis) refresh_metrics_matviews(db) 
@@ -461,7 +460,7 @@ def aggregate_metrics(self): f"index_metrics.py | aggregate_metrics | {self.request.id} | Acquired aggregate_metrics_lock" ) metric = PrometheusMetric( - PrometheusRegistry[PrometheusMetricNames.INDEX_METRICS_DURATION_SECONDS] + PrometheusMetricNames.INDEX_METRICS_DURATION_SECONDS ) consolidate_metrics_from_other_nodes(self, db, redis) metric.save_time({"task_name": "aggregate_metrics"}) @@ -504,7 +503,7 @@ def synchronize_metrics(self): f"index_metrics.py | synchronize_metrics | {self.request.id} | Acquired synchronize_metrics_lock" ) metric = PrometheusMetric( - PrometheusRegistry[PrometheusMetricNames.INDEX_METRICS_DURATION_SECONDS] + PrometheusMetricNames.INDEX_METRICS_DURATION_SECONDS ) synchronize_all_node_metrics(self, db) metric.save_time({"task_name": "synchronize_metrics"}) diff --git a/discovery-provider/src/tasks/index_trending.py b/discovery-provider/src/tasks/index_trending.py index 0c947c3e305..fd55228a6d3 100644 --- a/discovery-provider/src/tasks/index_trending.py +++ b/discovery-provider/src/tasks/index_trending.py @@ -23,7 +23,6 @@ from src.utils.prometheus_metric import ( PrometheusMetric, PrometheusMetricNames, - PrometheusRegistry, save_duration_metric, ) from src.utils.redis_cache import set_json_cached_key @@ -110,7 +109,7 @@ def get_genres(session: Session) -> List[str]: def update_view(session: Session, mat_view_name: str): start_time = time.time() metric = PrometheusMetric( - PrometheusRegistry[PrometheusMetricNames.UPDATE_TRENDING_VIEW_DURATION_SECONDS] + PrometheusMetricNames.UPDATE_TRENDING_VIEW_DURATION_SECONDS ) session.execute(f"REFRESH MATERIALIZED VIEW {mat_view_name}") update_time = time.time() - start_time @@ -128,9 +127,7 @@ def update_view(session: Session, mat_view_name: str): def index_trending(self, db: SessionManager, redis: Redis, timestamp): logger.info("index_trending.py | starting indexing") update_start = time.time() - metric = PrometheusMetric( - 
PrometheusRegistry[PrometheusMetricNames.INDEX_TRENDING_DURATION_SECONDS] - ) + metric = PrometheusMetric(PrometheusMetricNames.INDEX_TRENDING_DURATION_SECONDS) with db.scoped_session() as session: genres = get_genres(session) diff --git a/discovery-provider/src/tasks/tracks.py b/discovery-provider/src/tasks/tracks.py index cc48ebffd1e..25b91c24036 100644 --- a/discovery-provider/src/tasks/tracks.py +++ b/discovery-provider/src/tasks/tracks.py @@ -18,11 +18,7 @@ from src.utils import helpers, multihash from src.utils.indexing_errors import EntityMissingRequiredFieldError, IndexingError from src.utils.model_nullable_validator import all_required_fields_present -from src.utils.prometheus_metric import ( - PrometheusMetric, - PrometheusMetricNames, - PrometheusRegistry, -) +from src.utils.prometheus_metric import PrometheusMetric, PrometheusMetricNames from src.utils.track_event_constants import ( track_event_types_arr, track_event_types_lookup, @@ -44,9 +40,7 @@ def track_state_update( ) -> Tuple[int, Set]: """Return tuple containing int representing number of Track model state changes found in transaction and set of processed track IDs.""" begin_track_state_update = datetime.now() - metric = PrometheusMetric( - PrometheusRegistry[PrometheusMetricNames.TRACK_STATE_UPDATE_DURATION_SECONDS] - ) + metric = PrometheusMetric(PrometheusMetricNames.TRACK_STATE_UPDATE_DURATION_SECONDS) blockhash = update_task.web3.toHex(block_hash) num_total_changes = 0 diff --git a/discovery-provider/src/tasks/update_track_is_available.py b/discovery-provider/src/tasks/update_track_is_available.py index c1aaa2f3cc1..71bdb9b4ad9 100644 --- a/discovery-provider/src/tasks/update_track_is_available.py +++ b/discovery-provider/src/tasks/update_track_is_available.py @@ -10,7 +10,6 @@ from src.utils.prometheus_metric import ( PrometheusMetric, PrometheusMetricNames, - PrometheusRegistry, save_duration_metric, ) from src.utils.redis_constants import ( @@ -236,9 +235,7 @@ def 
update_track_is_available(self) -> None: have_lock = update_lock.acquire(blocking=False) if have_lock: metric = PrometheusMetric( - PrometheusRegistry[ - PrometheusMetricNames.UPDATE_TRACK_IS_AVAILABLE_DURATION_SECONDS - ] + PrometheusMetricNames.UPDATE_TRACK_IS_AVAILABLE_DURATION_SECONDS ) try: # TODO: we can deprecate this manual redis timestamp tracker once we confirm diff --git a/discovery-provider/src/tasks/users.py b/discovery-provider/src/tasks/users.py index 853b2ace371..98e69e43d34 100644 --- a/discovery-provider/src/tasks/users.py +++ b/discovery-provider/src/tasks/users.py @@ -21,11 +21,7 @@ from src.utils import helpers from src.utils.indexing_errors import EntityMissingRequiredFieldError, IndexingError from src.utils.model_nullable_validator import all_required_fields_present -from src.utils.prometheus_metric import ( - PrometheusMetric, - PrometheusMetricNames, - PrometheusRegistry, -) +from src.utils.prometheus_metric import PrometheusMetric, PrometheusMetricNames from src.utils.user_event_constants import user_event_types_arr, user_event_types_lookup logger = logging.getLogger(__name__) @@ -44,9 +40,7 @@ def user_state_update( ) -> Tuple[int, Set]: """Return tuple containing int representing number of User model state changes found in transaction and set of processed user IDs.""" begin_user_state_update = datetime.now() - metric = PrometheusMetric( - PrometheusRegistry[PrometheusMetricNames.USER_STATE_UPDATE_DURATION_SECONDS] - ) + metric = PrometheusMetric(PrometheusMetricNames.USER_STATE_UPDATE_DURATION_SECONDS) blockhash = update_task.web3.toHex(block_hash) num_total_changes = 0 @@ -151,9 +145,7 @@ def process_user_txs_serial( user_ids, skipped_tx_count, ): - metric = PrometheusMetric( - PrometheusRegistry[PrometheusMetricNames.USER_STATE_UPDATE_DURATION_SECONDS] - ) + metric = PrometheusMetric(PrometheusMetricNames.USER_STATE_UPDATE_DURATION_SECONDS) processed_entries = 0 for user_tx in user_txs: try: diff --git 
a/discovery-provider/src/utils/redis_metrics.py b/discovery-provider/src/utils/redis_metrics.py index 227c43c7661..03194cc3c50 100644 --- a/discovery-provider/src/utils/redis_metrics.py +++ b/discovery-provider/src/utils/redis_metrics.py @@ -26,11 +26,7 @@ ) from src.utils.config import shared_config from src.utils.helpers import get_ip, redis_get_or_restore, redis_set_and_dump -from src.utils.prometheus_metric import ( - PrometheusMetric, - PrometheusMetricNames, - PrometheusRegistry, -) +from src.utils.prometheus_metric import PrometheusMetric, PrometheusMetricNames from src.utils.query_params import app_name_param, stringify_query_params from werkzeug.wrappers.response import Response as wResponse @@ -656,9 +652,7 @@ def wrap(*args, **kwargs): except Exception as e: logger.error("Error while recording metrics: %s", e.message) - metric = PrometheusMetric( - PrometheusRegistry[PrometheusMetricNames.FLASK_ROUTE_LATENCY_SECONDS] - ) + metric = PrometheusMetric(PrometheusMetricNames.FLASK_ROUTE_LATENCY_SECONDS) result = func(*args, **kwargs) From 977a21a0935cbfb91186e1a6880241c5596ea173 Mon Sep 17 00:00:00 2001 From: Joaquin Casares Date: Thu, 7 Jul 2022 23:56:03 +0000 Subject: [PATCH 07/23] one more --- discovery-provider/src/utils/prometheus_metric.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/discovery-provider/src/utils/prometheus_metric.py b/discovery-provider/src/utils/prometheus_metric.py index be79a0d4b86..98ea04f5846 100644 --- a/discovery-provider/src/utils/prometheus_metric.py +++ b/discovery-provider/src/utils/prometheus_metric.py @@ -17,15 +17,11 @@ def decorator(func): @wraps(func) def wrapper(*args, **kwargs): histogram_metric = PrometheusMetric( - PrometheusRegistry[ - PrometheusMetricNames.CELERY_TASK_COMPLETED_DURATION_SECONDS - ] + PrometheusMetricNames.CELERY_TASK_COMPLETED_DURATION_SECONDS ) gauge_metric = PrometheusMetric( - PrometheusRegistry[ - PrometheusMetricNames.CELERY_TASK_LAST_DURATION_SECONDS - ] + 
PrometheusMetricNames.CELERY_TASK_LAST_DURATION_SECONDS ) try: # safely return this result under all circumstances From 1bb52e4bc00be727113713cf68a46f0577bc031d Mon Sep 17 00:00:00 2001 From: Joaquin Casares Date: Fri, 8 Jul 2022 00:02:40 +0000 Subject: [PATCH 08/23] sorted --- .../src/utils/prometheus_metric.py | 76 +++++++++---------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/discovery-provider/src/utils/prometheus_metric.py b/discovery-provider/src/utils/prometheus_metric.py index 98ea04f5846..be2d3b354f7 100644 --- a/discovery-provider/src/utils/prometheus_metric.py +++ b/discovery-provider/src/utils/prometheus_metric.py @@ -65,34 +65,31 @@ def wrapper(*args, **kwargs): class PrometheusMetricNames: - FLASK_ROUTE_LATENCY_SECONDS = "flask_route_latency_seconds" + CELERY_TASK_ACTIVE_DURATION_SECONDS = "celery_task_active_duration_seconds" CELERY_TASK_COMPLETED_DURATION_SECONDS = "celery_task_completed_duration_seconds" CELERY_TASK_LAST_DURATION_SECONDS = "celery_task_last_duration_seconds" - CELERY_TASK_ACTIVE_DURATION_SECONDS = "celery_task_active_duration_seconds" + FLASK_ROUTE_LATENCY_SECONDS = "flask_route_latency_seconds" HEALTH_CHECK_BLOCK_DIFFERENCE_CURRENT = "health_check_block_difference_current" HEALTH_CHECK_LATEST_INDEXED_BLOCK_NUM_CURRENT = ( "health_check_latest_indexed_block_num_current" ) - UPDATE_TRENDING_VIEW_DURATION_SECONDS = "update_trending_view_duration_seconds" - INDEX_TRENDING_DURATION_SECONDS = "index_trending_duration_seconds" + INDEX_BLOCKS_DURATION_SECONDS = "index_blocks_duration_seconds" INDEX_METRICS_DURATION_SECONDS = "index_metrics_duration_seconds" - USER_STATE_UPDATE_DURATION_SECONDS = "user_state_update_duration_seconds" + INDEX_TRENDING_DURATION_SECONDS = "index_trending_duration_seconds" TRACK_STATE_UPDATE_DURATION_SECONDS = "track_state_update_duration_seconds" - INDEX_BLOCKS_DURATION_SECONDS = "index_blocks_duration_seconds" + UPDATE_AGGREGATE_TABLE_LATENCY_SECONDS = 
"update_aggregate_table_latency_seconds" UPDATE_TRACK_IS_AVAILABLE_DURATION_SECONDS = ( "update_track_is_available_duration_seconds" ) - UPDATE_AGGREGATE_TABLE_LATENCY_SECONDS = "update_aggregate_table_latency_seconds" + UPDATE_TRENDING_VIEW_DURATION_SECONDS = "update_trending_view_duration_seconds" + USER_STATE_UPDATE_DURATION_SECONDS = "user_state_update_duration_seconds" PrometheusRegistry = { - PrometheusMetricNames.FLASK_ROUTE_LATENCY_SECONDS: Histogram( - f"{METRIC_PREFIX}_{PrometheusMetricNames.FLASK_ROUTE_LATENCY_SECONDS}", - "Runtimes for flask routes", - ( - "route", - "code", - ), + PrometheusMetricNames.CELERY_TASK_ACTIVE_DURATION_SECONDS: Gauge( + f"{METRIC_PREFIX}_{PrometheusMetricNames.CELERY_TASK_ACTIVE_DURATION_SECONDS}", + "How long the currently running celery task has been running", + ("task_name",), ), PrometheusMetricNames.CELERY_TASK_COMPLETED_DURATION_SECONDS: Histogram( f"{METRIC_PREFIX}_{PrometheusMetricNames.CELERY_TASK_COMPLETED_DURATION_SECONDS}", @@ -110,10 +107,13 @@ class PrometheusMetricNames: "success", ), ), - PrometheusMetricNames.CELERY_TASK_ACTIVE_DURATION_SECONDS: Gauge( - f"{METRIC_PREFIX}_{PrometheusMetricNames.CELERY_TASK_ACTIVE_DURATION_SECONDS}", - "How long the currently running celery task has been running", - ("task_name",), + PrometheusMetricNames.FLASK_ROUTE_LATENCY_SECONDS: Histogram( + f"{METRIC_PREFIX}_{PrometheusMetricNames.FLASK_ROUTE_LATENCY_SECONDS}", + "Runtimes for flask routes", + ( + "route", + "code", + ), ), PrometheusMetricNames.HEALTH_CHECK_BLOCK_DIFFERENCE_CURRENT: Gauge( f"{METRIC_PREFIX}_{PrometheusMetricNames.HEALTH_CHECK_BLOCK_DIFFERENCE_CURRENT}", @@ -123,44 +123,44 @@ class PrometheusMetricNames: f"{METRIC_PREFIX}_{PrometheusMetricNames.HEALTH_CHECK_LATEST_INDEXED_BLOCK_NUM_CURRENT}", "Latest indexed block number", ), - PrometheusMetricNames.UPDATE_TRENDING_VIEW_DURATION_SECONDS: Histogram( - f"{METRIC_PREFIX}_{PrometheusMetricNames.UPDATE_TRENDING_VIEW_DURATION_SECONDS}", - "Runtimes for 
src.task.index_trending:update_view()", - ("mat_view_name",), - ), - PrometheusMetricNames.INDEX_TRENDING_DURATION_SECONDS: Histogram( - f"{METRIC_PREFIX}_{PrometheusMetricNames.INDEX_TRENDING_DURATION_SECONDS}", - "Runtimes for src.task.index_trending:index_trending()", + PrometheusMetricNames.INDEX_BLOCKS_DURATION_SECONDS: Histogram( + f"{METRIC_PREFIX}_{PrometheusMetricNames.INDEX_BLOCKS_DURATION_SECONDS}", + "Runtimes for src.task.index:index_blocks()", + ("scope",), ), PrometheusMetricNames.INDEX_METRICS_DURATION_SECONDS: Histogram( f"{METRIC_PREFIX}_{PrometheusMetricNames.INDEX_METRICS_DURATION_SECONDS}", "Runtimes for src.task.index_metrics:celery.task()", ("task_name",), ), - PrometheusMetricNames.USER_STATE_UPDATE_DURATION_SECONDS: Histogram( - f"{METRIC_PREFIX}_{PrometheusMetricNames.USER_STATE_UPDATE_DURATION_SECONDS}", - "Runtimes for src.task.users:user_state_update()", - ("scope",), + PrometheusMetricNames.INDEX_TRENDING_DURATION_SECONDS: Histogram( + f"{METRIC_PREFIX}_{PrometheusMetricNames.INDEX_TRENDING_DURATION_SECONDS}", + "Runtimes for src.task.index_trending:index_trending()", ), PrometheusMetricNames.TRACK_STATE_UPDATE_DURATION_SECONDS: Histogram( f"{METRIC_PREFIX}_{PrometheusMetricNames.TRACK_STATE_UPDATE_DURATION_SECONDS}", "Runtimes for src.task.tracks:track_state_update()", ("scope",), ), - PrometheusMetricNames.INDEX_BLOCKS_DURATION_SECONDS: Histogram( - f"{METRIC_PREFIX}_{PrometheusMetricNames.INDEX_BLOCKS_DURATION_SECONDS}", - "Runtimes for src.task.index:index_blocks()", - ("scope",), + PrometheusMetricNames.UPDATE_AGGREGATE_TABLE_LATENCY_SECONDS: Histogram( + f"{METRIC_PREFIX}_{PrometheusMetricNames.UPDATE_AGGREGATE_TABLE_LATENCY_SECONDS}", + "Runtimes for src.task.aggregates:update_aggregate_table()", + ("table_name", "task_name"), ), PrometheusMetricNames.UPDATE_TRACK_IS_AVAILABLE_DURATION_SECONDS: Histogram( f"{METRIC_PREFIX}_{PrometheusMetricNames.UPDATE_TRACK_IS_AVAILABLE_DURATION_SECONDS}", "Runtimes for 
src.task.update_track_is_available:celery.task()", ("task_name", "success"), ), - PrometheusMetricNames.UPDATE_AGGREGATE_TABLE_LATENCY_SECONDS: Histogram( - f"{METRIC_PREFIX}_{PrometheusMetricNames.UPDATE_AGGREGATE_TABLE_LATENCY_SECONDS}", - "Runtimes for src.task.aggregates:update_aggregate_table()", - ("table_name", "task_name"), + PrometheusMetricNames.UPDATE_TRENDING_VIEW_DURATION_SECONDS: Histogram( + f"{METRIC_PREFIX}_{PrometheusMetricNames.UPDATE_TRENDING_VIEW_DURATION_SECONDS}", + "Runtimes for src.task.index_trending:update_view()", + ("mat_view_name",), + ), + PrometheusMetricNames.USER_STATE_UPDATE_DURATION_SECONDS: Histogram( + f"{METRIC_PREFIX}_{PrometheusMetricNames.USER_STATE_UPDATE_DURATION_SECONDS}", + "Runtimes for src.task.users:user_state_update()", + ("scope",), ), } From 38e95f9757b403aa474fed78b481166de3d32bc4 Mon Sep 17 00:00:00 2001 From: Joaquin Casares Date: Fri, 8 Jul 2022 00:08:18 +0000 Subject: [PATCH 09/23] fixes --- discovery-provider/src/utils/prometheus_metric.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/discovery-provider/src/utils/prometheus_metric.py b/discovery-provider/src/utils/prometheus_metric.py index be2d3b354f7..8f28a051a06 100644 --- a/discovery-provider/src/utils/prometheus_metric.py +++ b/discovery-provider/src/utils/prometheus_metric.py @@ -172,7 +172,7 @@ def __init__(self, name): self.reset_timer() if name not in PrometheusRegistry: - raise TypeError(f"Metric name '{name}' not found") + raise NameError(f"Metric name '{name}' not found") self.metric = PrometheusRegistry[name] def reset_timer(self): @@ -191,9 +191,9 @@ def save(self, value, labels=None): if labels: this_metric = this_metric.labels(**labels) - if isinstance(self.metric, Histogram): + if isinstance(this_metric, Histogram): this_metric.observe(value) - elif isinstance(self.metric, Gauge): + elif isinstance(this_metric, Gauge): this_metric.set(value) @classmethod From d7f0ff56b45a3331bae4b913dabc18c10b7c6fae Mon Sep 17 
00:00:00 2001 From: Joaquin Casares Date: Fri, 8 Jul 2022 00:11:50 +0000 Subject: [PATCH 10/23] metric_groups --- .../src/utils/prometheus_metric.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/discovery-provider/src/utils/prometheus_metric.py b/discovery-provider/src/utils/prometheus_metric.py index 8f28a051a06..2b9651f2b7f 100644 --- a/discovery-provider/src/utils/prometheus_metric.py +++ b/discovery-provider/src/utils/prometheus_metric.py @@ -16,13 +16,15 @@ def save_duration_metric(metric_group): def decorator(func): @wraps(func) def wrapper(*args, **kwargs): - histogram_metric = PrometheusMetric( - PrometheusMetricNames.CELERY_TASK_COMPLETED_DURATION_SECONDS - ) - - gauge_metric = PrometheusMetric( - PrometheusMetricNames.CELERY_TASK_LAST_DURATION_SECONDS - ) + if metric_group == "celery_task": + histogram_metric = PrometheusMetric( + PrometheusMetricNames.CELERY_TASK_COMPLETED_DURATION_SECONDS + ) + gauge_metric = PrometheusMetric( + PrometheusMetricNames.CELERY_TASK_LAST_DURATION_SECONDS + ) + else: + raise NameError(f"Metric Group '{metric_group}' not found.") try: # safely return this result under all circumstances result = func(*args, **kwargs) From 7b2ca4141a96848156adb4024ab85f65ab1cc08e Mon Sep 17 00:00:00 2001 From: Joaquin Casares Date: Fri, 8 Jul 2022 00:13:54 +0000 Subject: [PATCH 11/23] always ensure commas --- discovery-provider/src/utils/prometheus_metric.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/discovery-provider/src/utils/prometheus_metric.py b/discovery-provider/src/utils/prometheus_metric.py index 2b9651f2b7f..66d68eb5a70 100644 --- a/discovery-provider/src/utils/prometheus_metric.py +++ b/discovery-provider/src/utils/prometheus_metric.py @@ -113,8 +113,8 @@ class PrometheusMetricNames: f"{METRIC_PREFIX}_{PrometheusMetricNames.FLASK_ROUTE_LATENCY_SECONDS}", "Runtimes for flask routes", ( - "route", "code", + "route", ), ), 
PrometheusMetricNames.HEALTH_CHECK_BLOCK_DIFFERENCE_CURRENT: Gauge( @@ -147,12 +147,18 @@ class PrometheusMetricNames: PrometheusMetricNames.UPDATE_AGGREGATE_TABLE_LATENCY_SECONDS: Histogram( f"{METRIC_PREFIX}_{PrometheusMetricNames.UPDATE_AGGREGATE_TABLE_LATENCY_SECONDS}", "Runtimes for src.task.aggregates:update_aggregate_table()", - ("table_name", "task_name"), + ( + "table_name", + "task_name", + ), ), PrometheusMetricNames.UPDATE_TRACK_IS_AVAILABLE_DURATION_SECONDS: Histogram( f"{METRIC_PREFIX}_{PrometheusMetricNames.UPDATE_TRACK_IS_AVAILABLE_DURATION_SECONDS}", "Runtimes for src.task.update_track_is_available:celery.task()", - ("task_name", "success"), + ( + "success", + "task_name", + ), ), PrometheusMetricNames.UPDATE_TRENDING_VIEW_DURATION_SECONDS: Histogram( f"{METRIC_PREFIX}_{PrometheusMetricNames.UPDATE_TRENDING_VIEW_DURATION_SECONDS}", From f1bd5d6193175a7d37af52aaaa77b8473a6a757f Mon Sep 17 00:00:00 2001 From: Joaquin Casares Date: Fri, 8 Jul 2022 00:30:53 +0000 Subject: [PATCH 12/23] goodbye _current, welcome _latest --- discovery-provider/src/queries/get_health.py | 2 +- .../src/utils/prometheus_metric.py | 20 +++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/discovery-provider/src/queries/get_health.py b/discovery-provider/src/queries/get_health.py index fbdafc444f9..ad3a88d64fb 100644 --- a/discovery-provider/src/queries/get_health.py +++ b/discovery-provider/src/queries/get_health.py @@ -467,7 +467,7 @@ def health_check_prometheus_exporter(): ) PrometheusMetric( - PrometheusMetricNames.HEALTH_CHECK_LATEST_INDEXED_BLOCK_NUM_CURRENT + PrometheusMetricNames.HEALTH_CHECK_LATEST_INDEXED_BLOCK_NUM ).save(health_results["web"]["blocknumber"]) diff --git a/discovery-provider/src/utils/prometheus_metric.py b/discovery-provider/src/utils/prometheus_metric.py index 66d68eb5a70..cdd2089c0e4 100644 --- a/discovery-provider/src/utils/prometheus_metric.py +++ b/discovery-provider/src/utils/prometheus_metric.py @@ -18,7 +18,7 @@ 
def decorator(func): def wrapper(*args, **kwargs): if metric_group == "celery_task": histogram_metric = PrometheusMetric( - PrometheusMetricNames.CELERY_TASK_COMPLETED_DURATION_SECONDS + PrometheusMetricNames.CELERY_TASK_DURATION_SECONDS ) gauge_metric = PrometheusMetric( PrometheusMetricNames.CELERY_TASK_LAST_DURATION_SECONDS @@ -68,12 +68,12 @@ def wrapper(*args, **kwargs): class PrometheusMetricNames: CELERY_TASK_ACTIVE_DURATION_SECONDS = "celery_task_active_duration_seconds" - CELERY_TASK_COMPLETED_DURATION_SECONDS = "celery_task_completed_duration_seconds" + CELERY_TASK_DURATION_SECONDS = "celery_task_duration_seconds" CELERY_TASK_LAST_DURATION_SECONDS = "celery_task_last_duration_seconds" FLASK_ROUTE_LATENCY_SECONDS = "flask_route_latency_seconds" - HEALTH_CHECK_BLOCK_DIFFERENCE_CURRENT = "health_check_block_difference_current" - HEALTH_CHECK_LATEST_INDEXED_BLOCK_NUM_CURRENT = ( - "health_check_latest_indexed_block_num_current" + HEALTH_CHECK_LATEST_BLOCK_DIFFERENCE = "health_check_latest_block_difference" + HEALTH_CHECK_LATEST_INDEXED_BLOCK_NUM = ( + "health_check_latest_indexed_block_num" ) INDEX_BLOCKS_DURATION_SECONDS = "index_blocks_duration_seconds" INDEX_METRICS_DURATION_SECONDS = "index_metrics_duration_seconds" @@ -93,8 +93,8 @@ class PrometheusMetricNames: "How long the currently running celery task has been running", ("task_name",), ), - PrometheusMetricNames.CELERY_TASK_COMPLETED_DURATION_SECONDS: Histogram( - f"{METRIC_PREFIX}_{PrometheusMetricNames.CELERY_TASK_COMPLETED_DURATION_SECONDS}", + PrometheusMetricNames.CELERY_TASK_DURATION_SECONDS: Histogram( + f"{METRIC_PREFIX}_{PrometheusMetricNames.CELERY_TASK_DURATION_SECONDS}", "How long a celery_task took to complete", ( "func_name", @@ -118,11 +118,11 @@ class PrometheusMetricNames: ), ), PrometheusMetricNames.HEALTH_CHECK_BLOCK_DIFFERENCE_CURRENT: Gauge( - f"{METRIC_PREFIX}_{PrometheusMetricNames.HEALTH_CHECK_BLOCK_DIFFERENCE_CURRENT}", + 
f"{METRIC_PREFIX}_{PrometheusMetricNames.HEALTH_CHECK_LATEST_BLOCK_DIFFERENCE}", "Difference between the latest block and the latest indexed block", ), - PrometheusMetricNames.HEALTH_CHECK_LATEST_INDEXED_BLOCK_NUM_CURRENT: Gauge( - f"{METRIC_PREFIX}_{PrometheusMetricNames.HEALTH_CHECK_LATEST_INDEXED_BLOCK_NUM_CURRENT}", + PrometheusMetricNames.HEALTH_CHECK_LATEST_INDEXED_BLOCK_NUM: Gauge( + f"{METRIC_PREFIX}_{PrometheusMetricNames.HEALTH_CHECK_LATEST_INDEXED_BLOCK_NUM}", "Latest indexed block number", ), PrometheusMetricNames.INDEX_BLOCKS_DURATION_SECONDS: Histogram( From a226b4d793fcb8cb33cafafb316c031707a12041 Mon Sep 17 00:00:00 2001 From: Joaquin Casares Date: Fri, 8 Jul 2022 00:33:02 +0000 Subject: [PATCH 13/23] missed change --- discovery-provider/src/queries/get_health.py | 2 +- discovery-provider/src/utils/prometheus_metric.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/discovery-provider/src/queries/get_health.py b/discovery-provider/src/queries/get_health.py index ad3a88d64fb..0f6a295692b 100644 --- a/discovery-provider/src/queries/get_health.py +++ b/discovery-provider/src/queries/get_health.py @@ -462,7 +462,7 @@ def get_elasticsearch_health_info( def health_check_prometheus_exporter(): health_results, is_unhealthy = get_health({}) - PrometheusMetric(PrometheusMetricNames.HEALTH_CHECK_BLOCK_DIFFERENCE_CURRENT).save( + PrometheusMetric(PrometheusMetricNames.HEALTH_CHECK_LATEST_BLOCK_DIFFERENCE).save( health_results["block_difference"] ) diff --git a/discovery-provider/src/utils/prometheus_metric.py b/discovery-provider/src/utils/prometheus_metric.py index cdd2089c0e4..8cc02f3cf3a 100644 --- a/discovery-provider/src/utils/prometheus_metric.py +++ b/discovery-provider/src/utils/prometheus_metric.py @@ -117,7 +117,7 @@ class PrometheusMetricNames: "route", ), ), - PrometheusMetricNames.HEALTH_CHECK_BLOCK_DIFFERENCE_CURRENT: Gauge( + PrometheusMetricNames.HEALTH_CHECK_LATEST_BLOCK_DIFFERENCE: Gauge( 
f"{METRIC_PREFIX}_{PrometheusMetricNames.HEALTH_CHECK_LATEST_BLOCK_DIFFERENCE}", "Difference between the latest block and the latest indexed block", ), From 89b08eb7fe3ee6969ea4a82ed56966346e0850cc Mon Sep 17 00:00:00 2001 From: Joaquin Casares Date: Fri, 8 Jul 2022 00:33:40 +0000 Subject: [PATCH 14/23] lint --- discovery-provider/src/queries/get_health.py | 6 +++--- discovery-provider/src/utils/prometheus_metric.py | 4 +--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/discovery-provider/src/queries/get_health.py b/discovery-provider/src/queries/get_health.py index 0f6a295692b..334c1c3557c 100644 --- a/discovery-provider/src/queries/get_health.py +++ b/discovery-provider/src/queries/get_health.py @@ -466,9 +466,9 @@ def health_check_prometheus_exporter(): health_results["block_difference"] ) - PrometheusMetric( - PrometheusMetricNames.HEALTH_CHECK_LATEST_INDEXED_BLOCK_NUM - ).save(health_results["web"]["blocknumber"]) + PrometheusMetric(PrometheusMetricNames.HEALTH_CHECK_LATEST_INDEXED_BLOCK_NUM).save( + health_results["web"]["blocknumber"] + ) PrometheusMetric.register_collector( diff --git a/discovery-provider/src/utils/prometheus_metric.py b/discovery-provider/src/utils/prometheus_metric.py index 8cc02f3cf3a..e2f653c07c4 100644 --- a/discovery-provider/src/utils/prometheus_metric.py +++ b/discovery-provider/src/utils/prometheus_metric.py @@ -72,9 +72,7 @@ class PrometheusMetricNames: CELERY_TASK_LAST_DURATION_SECONDS = "celery_task_last_duration_seconds" FLASK_ROUTE_LATENCY_SECONDS = "flask_route_latency_seconds" HEALTH_CHECK_LATEST_BLOCK_DIFFERENCE = "health_check_latest_block_difference" - HEALTH_CHECK_LATEST_INDEXED_BLOCK_NUM = ( - "health_check_latest_indexed_block_num" - ) + HEALTH_CHECK_LATEST_INDEXED_BLOCK_NUM = "health_check_latest_indexed_block_num" INDEX_BLOCKS_DURATION_SECONDS = "index_blocks_duration_seconds" INDEX_METRICS_DURATION_SECONDS = "index_metrics_duration_seconds" INDEX_TRENDING_DURATION_SECONDS = 
"index_trending_duration_seconds" From cf31695a11fd12a2ba9616ed6f28e19b8be50c77 Mon Sep 17 00:00:00 2001 From: Joaquin Casares Date: Fri, 8 Jul 2022 00:40:26 +0000 Subject: [PATCH 15/23] audius_dn_celery_task_duration_seconds_bucket --- ...iscovery-provider-blockchain-indexing.json | 30 +++++++++---------- ...iscovery-provider-celery-task-details.json | 14 ++++----- .../audius-discovery-provider-overview.json | 20 ++++++------- 3 files changed, 32 insertions(+), 32 deletions(-) diff --git a/monitoring/grafana/dashboards/audius-discovery-provider-blockchain-indexing.json b/monitoring/grafana/dashboards/audius-discovery-provider-blockchain-indexing.json index b8fc7ab823f..de6548993fc 100644 --- a/monitoring/grafana/dashboards/audius-discovery-provider-blockchain-indexing.json +++ b/monitoring/grafana/dashboards/audius-discovery-provider-blockchain-indexing.json @@ -408,7 +408,7 @@ "type": "prometheus" }, "editorMode": "code", - "expr": "histogram_quantile(0.5, sum(rate(audius_dn_celery_task_completed_duration_seconds_bucket{environment=~\"$env\", host=~\"$host\",func_name=\"update_task\"}[5m])) by (le))", + "expr": "histogram_quantile(0.5, sum(rate(audius_dn_celery_task_duration_seconds_bucket{environment=~\"$env\", host=~\"$host\",func_name=\"update_task\"}[5m])) by (le))", "hide": false, "legendFormat": "p50", "range": true, @@ -419,7 +419,7 @@ "type": "prometheus" }, "editorMode": "code", - "expr": "histogram_quantile(0.90, sum(rate(audius_dn_celery_task_completed_duration_seconds_bucket{environment=~\"$env\", host=~\"$host\",func_name=\"update_task\"}[5m])) by (le))", + "expr": "histogram_quantile(0.90, sum(rate(audius_dn_celery_task_duration_seconds_bucket{environment=~\"$env\", host=~\"$host\",func_name=\"update_task\"}[5m])) by (le))", "hide": false, "legendFormat": "p90", "range": true, @@ -430,7 +430,7 @@ "type": "prometheus" }, "editorMode": "code", - "expr": "histogram_quantile(0.95, 
sum(rate(audius_dn_celery_task_completed_duration_seconds_bucket{environment=~\"$env\", host=~\"$host\",func_name=\"update_task\"}[5m])) by (le))", + "expr": "histogram_quantile(0.95, sum(rate(audius_dn_celery_task_duration_seconds_bucket{environment=~\"$env\", host=~\"$host\",func_name=\"update_task\"}[5m])) by (le))", "hide": false, "legendFormat": "p95", "range": true, @@ -441,7 +441,7 @@ "type": "prometheus" }, "editorMode": "code", - "expr": "histogram_quantile(0.99, sum(rate(audius_dn_celery_task_completed_duration_seconds_bucket{environment=~\"$env\", host=~\"$host\",func_name=\"update_task\"}[5m])) by (le))", + "expr": "histogram_quantile(0.99, sum(rate(audius_dn_celery_task_duration_seconds_bucket{environment=~\"$env\", host=~\"$host\",func_name=\"update_task\"}[5m])) by (le))", "hide": false, "legendFormat": "p99", "range": true, @@ -452,7 +452,7 @@ "type": "prometheus" }, "editorMode": "code", - "expr": "histogram_quantile(0.999, sum(rate(audius_dn_celery_task_completed_duration_seconds_bucket{environment=~\"$env\", host=~\"$host\",func_name=\"update_task\"}[5m])) by (le))", + "expr": "histogram_quantile(0.999, sum(rate(audius_dn_celery_task_duration_seconds_bucket{environment=~\"$env\", host=~\"$host\",func_name=\"update_task\"}[5m])) by (le))", "hide": false, "legendFormat": "p999", "range": true, @@ -563,7 +563,7 @@ "type": "prometheus" }, "editorMode": "code", - "expr": "histogram_quantile(0.5, sum(rate(audius_dn_celery_task_completed_duration_seconds_bucket{environment=~\"$env\", host=~\"$host\",func_name=\"index_solana_plays\"}[5m])) by (le))", + "expr": "histogram_quantile(0.5, sum(rate(audius_dn_celery_task_duration_seconds_bucket{environment=~\"$env\", host=~\"$host\",func_name=\"index_solana_plays\"}[5m])) by (le))", "hide": false, "legendFormat": "p50", "range": true, @@ -574,7 +574,7 @@ "type": "prometheus" }, "editorMode": "code", - "expr": "histogram_quantile(0.90, 
sum(rate(audius_dn_celery_task_completed_duration_seconds_bucket{environment=~\"$env\", host=~\"$host\",func_name=\"index_solana_plays\"}[5m])) by (le))", + "expr": "histogram_quantile(0.90, sum(rate(audius_dn_celery_task_duration_seconds_bucket{environment=~\"$env\", host=~\"$host\",func_name=\"index_solana_plays\"}[5m])) by (le))", "hide": false, "legendFormat": "p90", "range": true, @@ -585,7 +585,7 @@ "type": "prometheus" }, "editorMode": "code", - "expr": "histogram_quantile(0.95, sum(rate(audius_dn_celery_task_completed_duration_seconds_bucket{environment=~\"$env\", host=~\"$host\",func_name=\"index_solana_plays\"}[5m])) by (le))", + "expr": "histogram_quantile(0.95, sum(rate(audius_dn_celery_task_duration_seconds_bucket{environment=~\"$env\", host=~\"$host\",func_name=\"index_solana_plays\"}[5m])) by (le))", "hide": false, "legendFormat": "p95", "range": true, @@ -596,7 +596,7 @@ "type": "prometheus" }, "editorMode": "code", - "expr": "histogram_quantile(0.99, sum(rate(audius_dn_celery_task_completed_duration_seconds_bucket{environment=~\"$env\", host=~\"$host\",func_name=\"index_solana_plays\"}[5m])) by (le))", + "expr": "histogram_quantile(0.99, sum(rate(audius_dn_celery_task_duration_seconds_bucket{environment=~\"$env\", host=~\"$host\",func_name=\"index_solana_plays\"}[5m])) by (le))", "hide": false, "legendFormat": "p99", "range": true, @@ -607,7 +607,7 @@ "type": "prometheus" }, "editorMode": "code", - "expr": "histogram_quantile(0.999, sum(rate(audius_dn_celery_task_completed_duration_seconds_bucket{environment=~\"$env\", host=~\"$host\",func_name=\"index_solana_plays\"}[5m])) by (le))", + "expr": "histogram_quantile(0.999, sum(rate(audius_dn_celery_task_duration_seconds_bucket{environment=~\"$env\", host=~\"$host\",func_name=\"index_solana_plays\"}[5m])) by (le))", "hide": false, "legendFormat": "p999", "range": true, @@ -693,7 +693,7 @@ "type": "prometheus" }, "editorMode": "code", - "expr": "histogram_quantile(0.5, 
sum(rate(audius_dn_celery_task_completed_duration_seconds_bucket{environment=~\"$env\", host=~\"$host\",func_name=\"index_eth\"}[5m])) by (le))", + "expr": "histogram_quantile(0.5, sum(rate(audius_dn_celery_task_duration_seconds_bucket{environment=~\"$env\", host=~\"$host\",func_name=\"index_eth\"}[5m])) by (le))", "hide": false, "legendFormat": "p50", "range": true, @@ -704,7 +704,7 @@ "type": "prometheus" }, "editorMode": "code", - "expr": "histogram_quantile(0.90, sum(rate(audius_dn_celery_task_completed_duration_seconds_bucket{environment=~\"$env\", host=~\"$host\",func_name=\"index_eth\"}[5m])) by (le))", + "expr": "histogram_quantile(0.90, sum(rate(audius_dn_celery_task_duration_seconds_bucket{environment=~\"$env\", host=~\"$host\",func_name=\"index_eth\"}[5m])) by (le))", "hide": false, "legendFormat": "p90", "range": true, @@ -715,7 +715,7 @@ "type": "prometheus" }, "editorMode": "code", - "expr": "histogram_quantile(0.95, sum(rate(audius_dn_celery_task_completed_duration_seconds_bucket{environment=~\"$env\", host=~\"$host\",func_name=\"index_eth\"}[5m])) by (le))", + "expr": "histogram_quantile(0.95, sum(rate(audius_dn_celery_task_duration_seconds_bucket{environment=~\"$env\", host=~\"$host\",func_name=\"index_eth\"}[5m])) by (le))", "hide": false, "legendFormat": "p95", "range": true, @@ -726,7 +726,7 @@ "type": "prometheus" }, "editorMode": "code", - "expr": "histogram_quantile(0.99, sum(rate(audius_dn_celery_task_completed_duration_seconds_bucket{environment=~\"$env\", host=~\"$host\",func_name=\"index_eth\"}[5m])) by (le))", + "expr": "histogram_quantile(0.99, sum(rate(audius_dn_celery_task_duration_seconds_bucket{environment=~\"$env\", host=~\"$host\",func_name=\"index_eth\"}[5m])) by (le))", "hide": false, "legendFormat": "p99", "range": true, @@ -737,7 +737,7 @@ "type": "prometheus" }, "editorMode": "code", - "expr": "histogram_quantile(0.999, sum(rate(audius_dn_celery_task_completed_duration_seconds_bucket{environment=~\"$env\", 
host=~\"$host\",func_name=\"index_eth\"}[5m])) by (le))", + "expr": "histogram_quantile(0.999, sum(rate(audius_dn_celery_task_duration_seconds_bucket{environment=~\"$env\", host=~\"$host\",func_name=\"index_eth\"}[5m])) by (le))", "hide": false, "legendFormat": "p999", "range": true, diff --git a/monitoring/grafana/dashboards/audius-discovery-provider-celery-task-details.json b/monitoring/grafana/dashboards/audius-discovery-provider-celery-task-details.json index b01dca77d38..0d94929a8d2 100644 --- a/monitoring/grafana/dashboards/audius-discovery-provider-celery-task-details.json +++ b/monitoring/grafana/dashboards/audius-discovery-provider-celery-task-details.json @@ -122,7 +122,7 @@ "type": "prometheus" }, "editorMode": "code", - "expr": "histogram_quantile(0.5, sum(rate(audius_dn_celery_task_completed_duration_seconds_bucket{environment=~\"$env\", host=~\"$host\", func_name=\"$task_name\"}[5m])) by (le))", + "expr": "histogram_quantile(0.5, sum(rate(audius_dn_celery_task_duration_seconds_bucket{environment=~\"$env\", host=~\"$host\", func_name=\"$task_name\"}[5m])) by (le))", "hide": false, "legendFormat": "p50", "range": true, @@ -133,7 +133,7 @@ "type": "prometheus" }, "editorMode": "code", - "expr": "histogram_quantile(0.90, sum(rate(audius_dn_celery_task_completed_duration_seconds_bucket{environment=~\"$env\", host=~\"$host\", func_name=\"$task_name\"}[5m])) by (le))", + "expr": "histogram_quantile(0.90, sum(rate(audius_dn_celery_task_duration_seconds_bucket{environment=~\"$env\", host=~\"$host\", func_name=\"$task_name\"}[5m])) by (le))", "hide": false, "legendFormat": "p90", "range": true, @@ -144,7 +144,7 @@ "type": "prometheus" }, "editorMode": "code", - "expr": "histogram_quantile(0.95, sum(rate(audius_dn_celery_task_completed_duration_seconds_bucket{environment=~\"$env\", host=~\"$host\", func_name=\"$task_name\"}[5m])) by (le))", + "expr": "histogram_quantile(0.95, sum(rate(audius_dn_celery_task_duration_seconds_bucket{environment=~\"$env\", 
host=~\"$host\", func_name=\"$task_name\"}[5m])) by (le))", "hide": false, "legendFormat": "p95", "range": true, @@ -155,7 +155,7 @@ "type": "prometheus" }, "editorMode": "code", - "expr": "histogram_quantile(0.99, sum(rate(audius_dn_celery_task_completed_duration_seconds_bucket{environment=~\"$env\", host=~\"$host\", func_name=\"$task_name\"}[5m])) by (le))", + "expr": "histogram_quantile(0.99, sum(rate(audius_dn_celery_task_duration_seconds_bucket{environment=~\"$env\", host=~\"$host\", func_name=\"$task_name\"}[5m])) by (le))", "hide": false, "legendFormat": "p99", "range": true, @@ -166,7 +166,7 @@ "type": "prometheus" }, "editorMode": "code", - "expr": "histogram_quantile(0.999, sum(rate(audius_dn_celery_task_completed_duration_seconds_bucket{environment=~\"$env\", host=~\"$host\", func_name=\"$task_name\"}[5m])) by (le))", + "expr": "histogram_quantile(0.999, sum(rate(audius_dn_celery_task_duration_seconds_bucket{environment=~\"$env\", host=~\"$host\", func_name=\"$task_name\"}[5m])) by (le))", "hide": false, "legendFormat": "p999", "range": true, @@ -237,7 +237,7 @@ "type": "prometheus", "uid": "r2_nnDL7z" }, - "definition": "label_values(audius_dn_celery_task_completed_duration_seconds_bucket, func_name)", + "definition": "label_values(audius_dn_celery_task_duration_seconds_bucket, func_name)", "description": "", "hide": 0, "includeAll": true, @@ -246,7 +246,7 @@ "name": "task_name", "options": [], "query": { - "query": "label_values(audius_dn_celery_task_completed_duration_seconds_bucket, func_name)", + "query": "label_values(audius_dn_celery_task_duration_seconds_bucket, func_name)", "refId": "StandardVariableQuery" }, "refresh": 1, diff --git a/monitoring/grafana/dashboards/audius-discovery-provider-overview.json b/monitoring/grafana/dashboards/audius-discovery-provider-overview.json index d64b895a792..7c8a9b7771a 100644 --- a/monitoring/grafana/dashboards/audius-discovery-provider-overview.json +++ 
b/monitoring/grafana/dashboards/audius-discovery-provider-overview.json @@ -839,7 +839,7 @@ "type": "prometheus" }, "editorMode": "code", - "expr": "histogram_quantile(0.5, sum(rate(audius_dn_celery_task_completed_duration_seconds_bucket{environment=~\"$env\", host=~\"$host\",func_name=\"update_task\"}[5m])) by (le))", + "expr": "histogram_quantile(0.5, sum(rate(audius_dn_celery_task_duration_seconds_bucket{environment=~\"$env\", host=~\"$host\",func_name=\"update_task\"}[5m])) by (le))", "hide": false, "legendFormat": "p50", "range": true, @@ -850,7 +850,7 @@ "type": "prometheus" }, "editorMode": "code", - "expr": "histogram_quantile(0.90, sum(rate(audius_dn_celery_task_completed_duration_seconds_bucket{environment=~\"$env\", host=~\"$host\",func_name=\"update_task\"}[5m])) by (le))", + "expr": "histogram_quantile(0.90, sum(rate(audius_dn_celery_task_duration_seconds_bucket{environment=~\"$env\", host=~\"$host\",func_name=\"update_task\"}[5m])) by (le))", "hide": false, "legendFormat": "p90", "range": true, @@ -861,7 +861,7 @@ "type": "prometheus" }, "editorMode": "code", - "expr": "histogram_quantile(0.95, sum(rate(audius_dn_celery_task_completed_duration_seconds_bucket{environment=~\"$env\", host=~\"$host\",func_name=\"update_task\"}[5m])) by (le))", + "expr": "histogram_quantile(0.95, sum(rate(audius_dn_celery_task_duration_seconds_bucket{environment=~\"$env\", host=~\"$host\",func_name=\"update_task\"}[5m])) by (le))", "hide": false, "legendFormat": "p95", "range": true, @@ -872,7 +872,7 @@ "type": "prometheus" }, "editorMode": "code", - "expr": "histogram_quantile(0.99, sum(rate(audius_dn_celery_task_completed_duration_seconds_bucket{environment=~\"$env\", host=~\"$host\",func_name=\"update_task\"}[5m])) by (le))", + "expr": "histogram_quantile(0.99, sum(rate(audius_dn_celery_task_duration_seconds_bucket{environment=~\"$env\", host=~\"$host\",func_name=\"update_task\"}[5m])) by (le))", "hide": false, "legendFormat": "p99", "range": true, @@ -883,7 +883,7 @@ 
"type": "prometheus" }, "editorMode": "code", - "expr": "histogram_quantile(0.999, sum(rate(audius_dn_celery_task_completed_duration_seconds_bucket{environment=~\"$env\", host=~\"$host\",func_name=\"update_task\"}[5m])) by (le))", + "expr": "histogram_quantile(0.999, sum(rate(audius_dn_celery_task_duration_seconds_bucket{environment=~\"$env\", host=~\"$host\",func_name=\"update_task\"}[5m])) by (le))", "hide": false, "legendFormat": "p999", "range": true, @@ -994,7 +994,7 @@ "type": "prometheus" }, "editorMode": "code", - "expr": "histogram_quantile(0.5, sum(rate(audius_dn_celery_task_completed_duration_seconds_bucket{environment=~\"$env\", host=~\"$host\",func_name=\"index_solana_plays\"}[5m])) by (le))", + "expr": "histogram_quantile(0.5, sum(rate(audius_dn_celery_task_duration_seconds_bucket{environment=~\"$env\", host=~\"$host\",func_name=\"index_solana_plays\"}[5m])) by (le))", "hide": false, "legendFormat": "p50", "range": true, @@ -1005,7 +1005,7 @@ "type": "prometheus" }, "editorMode": "code", - "expr": "histogram_quantile(0.90, sum(rate(audius_dn_celery_task_completed_duration_seconds_bucket{environment=~\"$env\", host=~\"$host\",func_name=\"index_solana_plays\"}[5m])) by (le))", + "expr": "histogram_quantile(0.90, sum(rate(audius_dn_celery_task_duration_seconds_bucket{environment=~\"$env\", host=~\"$host\",func_name=\"index_solana_plays\"}[5m])) by (le))", "hide": false, "legendFormat": "p90", "range": true, @@ -1016,7 +1016,7 @@ "type": "prometheus" }, "editorMode": "code", - "expr": "histogram_quantile(0.95, sum(rate(audius_dn_celery_task_completed_duration_seconds_bucket{environment=~\"$env\", host=~\"$host\",func_name=\"index_solana_plays\"}[5m])) by (le))", + "expr": "histogram_quantile(0.95, sum(rate(audius_dn_celery_task_duration_seconds_bucket{environment=~\"$env\", host=~\"$host\",func_name=\"index_solana_plays\"}[5m])) by (le))", "hide": false, "legendFormat": "p95", "range": true, @@ -1027,7 +1027,7 @@ "type": "prometheus" }, "editorMode": 
"code", - "expr": "histogram_quantile(0.99, sum(rate(audius_dn_celery_task_completed_duration_seconds_bucket{environment=~\"$env\", host=~\"$host\",func_name=\"index_solana_plays\"}[5m])) by (le))", + "expr": "histogram_quantile(0.99, sum(rate(audius_dn_celery_task_duration_seconds_bucket{environment=~\"$env\", host=~\"$host\",func_name=\"index_solana_plays\"}[5m])) by (le))", "hide": false, "legendFormat": "p99", "range": true, @@ -1038,7 +1038,7 @@ "type": "prometheus" }, "editorMode": "code", - "expr": "histogram_quantile(0.999, sum(rate(audius_dn_celery_task_completed_duration_seconds_bucket{environment=~\"$env\", host=~\"$host\",func_name=\"index_solana_plays\"}[5m])) by (le))", + "expr": "histogram_quantile(0.999, sum(rate(audius_dn_celery_task_duration_seconds_bucket{environment=~\"$env\", host=~\"$host\",func_name=\"index_solana_plays\"}[5m])) by (le))", "hide": false, "legendFormat": "p999", "range": true, From 6e782f0bd7d1089146ff01350efcfc180df5212c Mon Sep 17 00:00:00 2001 From: Joaquin Casares Date: Fri, 8 Jul 2022 00:43:45 +0000 Subject: [PATCH 16/23] audius_dn_health_check_latest_block_difference --- monitoring/grafana/dashboards/joaquin-playground.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/monitoring/grafana/dashboards/joaquin-playground.json b/monitoring/grafana/dashboards/joaquin-playground.json index ccbb3077b8b..586a234583a 100644 --- a/monitoring/grafana/dashboards/joaquin-playground.json +++ b/monitoring/grafana/dashboards/joaquin-playground.json @@ -409,7 +409,7 @@ "type": "prometheus" }, "exemplar": true, - "expr": "audius_dn_health_check_latest_indexed_block_num_current{environment=~\"$env\", host=~\"$host\"}", + "expr": "audius_dn_health_check_latest_block_difference{environment=~\"$env\", host=~\"$host\"}", "interval": "", "legendFormat": "{{host}}", "refId": "A" From 5cc97490fde0e5e1ef1978fb121881bcfedd6662 Mon Sep 17 00:00:00 2001 From: Joaquin Casares Date: Fri, 8 Jul 2022 00:47:15 +0000 Subject: [PATCH 
17/23] audius_dn_health_check_latest_block_difference --- monitoring/README.md | 2 +- .../audius-discovery-provider-blockchain-indexing.json | 8 ++++---- .../dashboards/audius-discovery-provider-overview.json | 2 +- monitoring/grafana/dashboards/joaquin-playground.json | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/monitoring/README.md b/monitoring/README.md index 7af3c0794ff..629af29ac22 100644 --- a/monitoring/README.md +++ b/monitoring/README.md @@ -159,7 +159,7 @@ When additional complexity is required, visit the [official Prometheus documenta Gauges are the easiest pattern since they simply display the value of a metric that was displayed at scrape time: -> `audius_dn_health_check_block_difference_current{environment=~"$env", host=~"$host"}` +> `audius_dn_health_check_latest_block_difference{environment=~"$env", host=~"$host"}` Notice how we restrict the `environment` and `host` labels associated with the metric to match the Dashboard Variables discussed in the previous section. 
diff --git a/monitoring/grafana/dashboards/audius-discovery-provider-blockchain-indexing.json b/monitoring/grafana/dashboards/audius-discovery-provider-blockchain-indexing.json index de6548993fc..1e7c6b81781 100644 --- a/monitoring/grafana/dashboards/audius-discovery-provider-blockchain-indexing.json +++ b/monitoring/grafana/dashboards/audius-discovery-provider-blockchain-indexing.json @@ -202,7 +202,7 @@ "type": "prometheus" }, "editorMode": "code", - "expr": "audius_dn_health_check_block_difference_current{environment=~\"$env\", host=~\"$host\"}", + "expr": "audius_dn_health_check_latest_block_difference{environment=~\"$env\", host=~\"$host\"}", "legendFormat": "{{host}}", "range": true, "refId": "A" @@ -263,7 +263,7 @@ "type": "prometheus" }, "editorMode": "code", - "expr": "max(audius_dn_health_check_block_difference_current{environment=~\"$env\", host=~\"$host\"})", + "expr": "max(audius_dn_health_check_latest_block_difference{environment=~\"$env\", host=~\"$host\"})", "legendFormat": "Max", "range": true, "refId": "A" @@ -273,7 +273,7 @@ "type": "prometheus" }, "editorMode": "code", - "expr": "avg(audius_dn_health_check_block_difference_current{environment=~\"$env\", host=~\"$host\"})", + "expr": "avg(audius_dn_health_check_latest_block_difference{environment=~\"$env\", host=~\"$host\"})", "hide": false, "legendFormat": "Average", "range": true, @@ -284,7 +284,7 @@ "type": "prometheus" }, "editorMode": "code", - "expr": "min(audius_dn_health_check_block_difference_current{environment=~\"$env\", host=~\"$host\"})", + "expr": "min(audius_dn_health_check_latest_block_difference{environment=~\"$env\", host=~\"$host\"})", "hide": false, "legendFormat": "Min", "range": true, diff --git a/monitoring/grafana/dashboards/audius-discovery-provider-overview.json b/monitoring/grafana/dashboards/audius-discovery-provider-overview.json index 7c8a9b7771a..77d40f6d100 100644 --- a/monitoring/grafana/dashboards/audius-discovery-provider-overview.json +++ 
b/monitoring/grafana/dashboards/audius-discovery-provider-overview.json @@ -729,7 +729,7 @@ "type": "prometheus" }, "editorMode": "code", - "expr": "audius_dn_health_check_block_difference_current{environment=~\"$env\", host=~\"$host\"}", + "expr": "audius_dn_health_check_latest_block_difference{environment=~\"$env\", host=~\"$host\"}", "legendFormat": "{{host}}", "range": true, "refId": "A" diff --git a/monitoring/grafana/dashboards/joaquin-playground.json b/monitoring/grafana/dashboards/joaquin-playground.json index 586a234583a..4d86c798d9a 100644 --- a/monitoring/grafana/dashboards/joaquin-playground.json +++ b/monitoring/grafana/dashboards/joaquin-playground.json @@ -318,7 +318,7 @@ "type": "prometheus" }, "exemplar": true, - "expr": "audius_dn_health_check_block_difference_current{environment=~\"$env\", host=~\"$host\"}", + "expr": "audius_dn_health_check_latest_block_difference{environment=~\"$env\", host=~\"$host\"}", "interval": "", "legendFormat": "{{host}}", "refId": "A" From 58e53897c24ce152335f7d7763633cd203bb2610 Mon Sep 17 00:00:00 2001 From: Joaquin Casares Date: Fri, 8 Jul 2022 01:00:37 +0000 Subject: [PATCH 18/23] latency -> duration --- discovery-provider/src/tasks/aggregates/__init__.py | 2 +- discovery-provider/src/utils/prometheus_metric.py | 6 +++--- monitoring/grafana/dashboards/joaquin-playground.json | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/discovery-provider/src/tasks/aggregates/__init__.py b/discovery-provider/src/tasks/aggregates/__init__.py index dacde9e64fa..886d500a4eb 100644 --- a/discovery-provider/src/tasks/aggregates/__init__.py +++ b/discovery-provider/src/tasks/aggregates/__init__.py @@ -75,7 +75,7 @@ def update_aggregate_table( current_checkpoint, ): metric = PrometheusMetric( - PrometheusMetricNames.UPDATE_AGGREGATE_TABLE_LATENCY_SECONDS + PrometheusMetricNames.UPDATE_AGGREGATE_TABLE_DURATION_SECONDS ) # get name of the caller function diff --git a/discovery-provider/src/utils/prometheus_metric.py 
b/discovery-provider/src/utils/prometheus_metric.py index e2f653c07c4..8a4a29ec88f 100644 --- a/discovery-provider/src/utils/prometheus_metric.py +++ b/discovery-provider/src/utils/prometheus_metric.py @@ -77,7 +77,7 @@ class PrometheusMetricNames: INDEX_METRICS_DURATION_SECONDS = "index_metrics_duration_seconds" INDEX_TRENDING_DURATION_SECONDS = "index_trending_duration_seconds" TRACK_STATE_UPDATE_DURATION_SECONDS = "track_state_update_duration_seconds" - UPDATE_AGGREGATE_TABLE_LATENCY_SECONDS = "update_aggregate_table_latency_seconds" + UPDATE_AGGREGATE_TABLE_DURATION_SECONDS = "update_aggregate_table_duration_seconds" UPDATE_TRACK_IS_AVAILABLE_DURATION_SECONDS = ( "update_track_is_available_duration_seconds" ) @@ -142,8 +142,8 @@ class PrometheusMetricNames: "Runtimes for src.task.tracks:track_state_update()", ("scope",), ), - PrometheusMetricNames.UPDATE_AGGREGATE_TABLE_LATENCY_SECONDS: Histogram( - f"{METRIC_PREFIX}_{PrometheusMetricNames.UPDATE_AGGREGATE_TABLE_LATENCY_SECONDS}", + PrometheusMetricNames.UPDATE_AGGREGATE_TABLE_DURATION_SECONDS: Histogram( + f"{METRIC_PREFIX}_{PrometheusMetricNames.UPDATE_AGGREGATE_TABLE_DURATION_SECONDS}", "Runtimes for src.task.aggregates:update_aggregate_table()", ( "table_name", diff --git a/monitoring/grafana/dashboards/joaquin-playground.json b/monitoring/grafana/dashboards/joaquin-playground.json index 4d86c798d9a..b76f070767d 100644 --- a/monitoring/grafana/dashboards/joaquin-playground.json +++ b/monitoring/grafana/dashboards/joaquin-playground.json @@ -1002,7 +1002,7 @@ "type": "prometheus" }, "exemplar": true, - "expr": "audius_dn_update_aggregate_table_latency_seconds_count{environment=~\"$env\", host=~\"$host\"}", + "expr": "audius_dn_update_aggregate_table_duration_seconds_count{environment=~\"$env\", host=~\"$host\"}", "format": "time_series", "interval": "", "intervalFactor": 2, @@ -1094,7 +1094,7 @@ "pluginVersion": "8.4.1", "targets": [ { - "expr": 
"rate(audius_dn_update_aggregate_table_latency_seconds_sum{environment=~\"$env\", host=~\"$host\"}[5m]) / rate(audius_dn_update_aggregate_table_latency_seconds_count{environment=~\"$env\", host=~\"$host\"}[5m])", + "expr": "rate(audius_dn_update_aggregate_table_duration_seconds_sum{environment=~\"$env\", host=~\"$host\"}[5m]) / rate(audius_dn_update_aggregate_table_duration_seconds_count{environment=~\"$env\", host=~\"$host\"}[5m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{table_name}}", From 29f5d65eee914e8c5776a7e93e319bf94198e047 Mon Sep 17 00:00:00 2001 From: Joaquin Casares Date: Fri, 8 Jul 2022 01:08:22 +0000 Subject: [PATCH 19/23] flask_route_duration_seconds --- .../src/utils/prometheus_metric.py | 6 +++--- discovery-provider/src/utils/redis_metrics.py | 2 +- monitoring/README.md | 6 +++--- .../grafana/dashboards/audius-boilerplate.json | 8 ++++---- ...s-discovery-provider-blockchain-indexing.json | 8 ++++---- ...s-discovery-provider-celery-task-details.json | 8 ++++---- .../audius-discovery-provider-celery-tasks.json | 8 ++++---- .../audius-discovery-provider-overview.json | 16 ++++++++-------- .../dashboards/audius-network-monitoring.json | 4 ++-- .../dheeraj-content-node-dashboard-test.json | 4 ++-- .../joaquin-network-monitoring-v2.json | 8 ++++---- .../grafana/dashboards/joaquin-playground.json | 14 +++++++------- 12 files changed, 46 insertions(+), 46 deletions(-) diff --git a/discovery-provider/src/utils/prometheus_metric.py b/discovery-provider/src/utils/prometheus_metric.py index 8a4a29ec88f..6d2c861c485 100644 --- a/discovery-provider/src/utils/prometheus_metric.py +++ b/discovery-provider/src/utils/prometheus_metric.py @@ -70,7 +70,7 @@ class PrometheusMetricNames: CELERY_TASK_ACTIVE_DURATION_SECONDS = "celery_task_active_duration_seconds" CELERY_TASK_DURATION_SECONDS = "celery_task_duration_seconds" CELERY_TASK_LAST_DURATION_SECONDS = "celery_task_last_duration_seconds" - FLASK_ROUTE_LATENCY_SECONDS = 
"flask_route_latency_seconds" + FLASK_ROUTE_DURATION_SECONDS = "flask_route_duration_seconds" HEALTH_CHECK_LATEST_BLOCK_DIFFERENCE = "health_check_latest_block_difference" HEALTH_CHECK_LATEST_INDEXED_BLOCK_NUM = "health_check_latest_indexed_block_num" INDEX_BLOCKS_DURATION_SECONDS = "index_blocks_duration_seconds" @@ -107,8 +107,8 @@ class PrometheusMetricNames: "success", ), ), - PrometheusMetricNames.FLASK_ROUTE_LATENCY_SECONDS: Histogram( - f"{METRIC_PREFIX}_{PrometheusMetricNames.FLASK_ROUTE_LATENCY_SECONDS}", + PrometheusMetricNames.FLASK_ROUTE_DURATION_SECONDS: Histogram( + f"{METRIC_PREFIX}_{PrometheusMetricNames.FLASK_ROUTE_DURATION_SECONDS}", "Runtimes for flask routes", ( "code", diff --git a/discovery-provider/src/utils/redis_metrics.py b/discovery-provider/src/utils/redis_metrics.py index 03194cc3c50..48433b2b79a 100644 --- a/discovery-provider/src/utils/redis_metrics.py +++ b/discovery-provider/src/utils/redis_metrics.py @@ -652,7 +652,7 @@ def wrap(*args, **kwargs): except Exception as e: logger.error("Error while recording metrics: %s", e.message) - metric = PrometheusMetric(PrometheusMetricNames.FLASK_ROUTE_LATENCY_SECONDS) + metric = PrometheusMetric(PrometheusMetricNames.FLASK_ROUTE_DURATION_SECONDS) result = func(*args, **kwargs) diff --git a/monitoring/README.md b/monitoring/README.md index 629af29ac22..1f229251a4b 100644 --- a/monitoring/README.md +++ b/monitoring/README.md @@ -127,8 +127,8 @@ Try to keep the number of personal dashboards low to maintain navigability. 
Our dashboards use common set of Variables (Dashboard `Settings` -> `Variables`): -* `env`: `label_values(audius_dn_flask_route_latency_seconds_count, environment)` -* `host`: `label_values(audius_dn_flask_route_latency_seconds_count{environment=~"$env"}, host)` +* `env`: `label_values(audius_dn_flask_route_duration_seconds_count, environment)` +* `host`: `label_values(audius_dn_flask_route_duration_seconds_count{environment=~"$env"}, host)` To simplify the process of setting up dashboards each time, we can navigate to the `Audius - Boilerplate` dashboard's `Settings` -> `Save As...` dialog to copy the boilerplate. @@ -176,7 +176,7 @@ Notice how we restrict the `environment` and `host` labels associated with the m A common pattern for histograms is to display the average latency of a recorded metric like the example below: -> `max by (route) (rate(audius_dn_flask_route_latency_seconds_sum{environment=~"$env", host=~"$host"}[5m]) / rate(audius_dn_flask_route_latency_seconds_count{environment=~"$env", host=~"$host"}[5m]))` +> `max by (route) (rate(audius_dn_flask_route_duration_seconds_sum{environment=~"$env", host=~"$host"}[5m]) / rate(audius_dn_flask_route_duration_seconds_count{environment=~"$env", host=~"$host"}[5m]))` The bulk of the query comes from official docs on [calculating averages from histograms](https://prometheus.io/docs/practices/histograms/#count-and-sum-of-observations) while including PromQL filters for `environment` and `host`. 
diff --git a/monitoring/grafana/dashboards/audius-boilerplate.json b/monitoring/grafana/dashboards/audius-boilerplate.json index a5813c2f2fe..c1c42bbed80 100644 --- a/monitoring/grafana/dashboards/audius-boilerplate.json +++ b/monitoring/grafana/dashboards/audius-boilerplate.json @@ -40,7 +40,7 @@ "datasource": { "type": "prometheus" }, - "definition": "label_values(audius_dn_flask_route_latency_seconds_count, environment)", + "definition": "label_values(audius_dn_flask_route_duration_seconds_count, environment)", "description": "", "hide": 0, "includeAll": true, @@ -49,7 +49,7 @@ "name": "env", "options": [], "query": { - "query": "label_values(audius_dn_flask_route_latency_seconds_count, environment)", + "query": "label_values(audius_dn_flask_route_duration_seconds_count, environment)", "refId": "StandardVariableQuery" }, "refresh": 1, @@ -63,7 +63,7 @@ "datasource": { "type": "prometheus" }, - "definition": "label_values(audius_dn_flask_route_latency_seconds_count{environment=~\"$env\"}, host)", + "definition": "label_values(audius_dn_flask_route_duration_seconds_count{environment=~\"$env\"}, host)", "description": "", "hide": 0, "includeAll": true, @@ -72,7 +72,7 @@ "name": "host", "options": [], "query": { - "query": "label_values(audius_dn_flask_route_latency_seconds_count{environment=~\"$env\"}, host)", + "query": "label_values(audius_dn_flask_route_duration_seconds_count{environment=~\"$env\"}, host)", "refId": "StandardVariableQuery" }, "refresh": 1, diff --git a/monitoring/grafana/dashboards/audius-discovery-provider-blockchain-indexing.json b/monitoring/grafana/dashboards/audius-discovery-provider-blockchain-indexing.json index 1e7c6b81781..08d08248820 100644 --- a/monitoring/grafana/dashboards/audius-discovery-provider-blockchain-indexing.json +++ b/monitoring/grafana/dashboards/audius-discovery-provider-blockchain-indexing.json @@ -760,7 +760,7 @@ "type": "prometheus", "uid": "r2_nnDL7z" }, - "definition": 
"label_values(audius_dn_flask_route_latency_seconds_count, environment)", + "definition": "label_values(audius_dn_flask_route_duration_seconds_count, environment)", "description": "", "hide": 0, "includeAll": true, @@ -769,7 +769,7 @@ "name": "env", "options": [], "query": { - "query": "label_values(audius_dn_flask_route_latency_seconds_count, environment)", + "query": "label_values(audius_dn_flask_route_duration_seconds_count, environment)", "refId": "StandardVariableQuery" }, "refresh": 1, @@ -784,7 +784,7 @@ "type": "prometheus", "uid": "r2_nnDL7z" }, - "definition": "label_values(audius_dn_flask_route_latency_seconds_count{environment=~\"$env\"}, host)", + "definition": "label_values(audius_dn_flask_route_duration_seconds_count{environment=~\"$env\"}, host)", "description": "", "hide": 0, "includeAll": true, @@ -793,7 +793,7 @@ "name": "host", "options": [], "query": { - "query": "label_values(audius_dn_flask_route_latency_seconds_count{environment=~\"$env\"}, host)", + "query": "label_values(audius_dn_flask_route_duration_seconds_count{environment=~\"$env\"}, host)", "refId": "StandardVariableQuery" }, "refresh": 1, diff --git a/monitoring/grafana/dashboards/audius-discovery-provider-celery-task-details.json b/monitoring/grafana/dashboards/audius-discovery-provider-celery-task-details.json index 0d94929a8d2..eb5cd4658d4 100644 --- a/monitoring/grafana/dashboards/audius-discovery-provider-celery-task-details.json +++ b/monitoring/grafana/dashboards/audius-discovery-provider-celery-task-details.json @@ -189,7 +189,7 @@ "type": "prometheus", "uid": "r2_nnDL7z" }, - "definition": "label_values(audius_dn_flask_route_latency_seconds_count, environment)", + "definition": "label_values(audius_dn_flask_route_duration_seconds_count, environment)", "description": "", "hide": 0, "includeAll": true, @@ -198,7 +198,7 @@ "name": "env", "options": [], "query": { - "query": "label_values(audius_dn_flask_route_latency_seconds_count, environment)", + "query": 
"label_values(audius_dn_flask_route_duration_seconds_count, environment)", "refId": "StandardVariableQuery" }, "refresh": 1, @@ -213,7 +213,7 @@ "type": "prometheus", "uid": "r2_nnDL7z" }, - "definition": "label_values(audius_dn_flask_route_latency_seconds_count{environment=~\"$env\"}, host)", + "definition": "label_values(audius_dn_flask_route_duration_seconds_count{environment=~\"$env\"}, host)", "description": "", "hide": 0, "includeAll": true, @@ -222,7 +222,7 @@ "name": "host", "options": [], "query": { - "query": "label_values(audius_dn_flask_route_latency_seconds_count{environment=~\"$env\"}, host)", + "query": "label_values(audius_dn_flask_route_duration_seconds_count{environment=~\"$env\"}, host)", "refId": "StandardVariableQuery" }, "refresh": 1, diff --git a/monitoring/grafana/dashboards/audius-discovery-provider-celery-tasks.json b/monitoring/grafana/dashboards/audius-discovery-provider-celery-tasks.json index f40c61e40a9..38a39191ca8 100644 --- a/monitoring/grafana/dashboards/audius-discovery-provider-celery-tasks.json +++ b/monitoring/grafana/dashboards/audius-discovery-provider-celery-tasks.json @@ -906,7 +906,7 @@ "type": "prometheus", "uid": "r2_nnDL7z" }, - "definition": "label_values(audius_dn_flask_route_latency_seconds_count, environment)", + "definition": "label_values(audius_dn_flask_route_duration_seconds_count, environment)", "description": "", "hide": 0, "includeAll": true, @@ -915,7 +915,7 @@ "name": "env", "options": [], "query": { - "query": "label_values(audius_dn_flask_route_latency_seconds_count, environment)", + "query": "label_values(audius_dn_flask_route_duration_seconds_count, environment)", "refId": "StandardVariableQuery" }, "refresh": 1, @@ -930,7 +930,7 @@ "type": "prometheus", "uid": "r2_nnDL7z" }, - "definition": "label_values(audius_dn_flask_route_latency_seconds_count{environment=~\"$env\"}, host)", + "definition": "label_values(audius_dn_flask_route_duration_seconds_count{environment=~\"$env\"}, host)", "description": 
"", "hide": 0, "includeAll": true, @@ -939,7 +939,7 @@ "name": "host", "options": [], "query": { - "query": "label_values(audius_dn_flask_route_latency_seconds_count{environment=~\"$env\"}, host)", + "query": "label_values(audius_dn_flask_route_duration_seconds_count{environment=~\"$env\"}, host)", "refId": "StandardVariableQuery" }, "refresh": 1, diff --git a/monitoring/grafana/dashboards/audius-discovery-provider-overview.json b/monitoring/grafana/dashboards/audius-discovery-provider-overview.json index 77d40f6d100..d1e309c495b 100644 --- a/monitoring/grafana/dashboards/audius-discovery-provider-overview.json +++ b/monitoring/grafana/dashboards/audius-discovery-provider-overview.json @@ -344,7 +344,7 @@ "type": "prometheus" }, "editorMode": "code", - "expr": "sum(rate(audius_dn_flask_route_latency_seconds_sum{environment=~\"$env\", host=~\"$host\"}[5m])) / sum(rate(audius_dn_flask_route_latency_seconds_count{environment=~\"$env\", host=~\"$host\"}[5m]))", + "expr": "sum(rate(audius_dn_flask_route_duration_seconds_sum{environment=~\"$env\", host=~\"$host\"}[5m])) / sum(rate(audius_dn_flask_route_duration_seconds_count{environment=~\"$env\", host=~\"$host\"}[5m]))", "legendFormat": "average", "range": true, "refId": "A" @@ -354,7 +354,7 @@ "type": "prometheus" }, "editorMode": "code", - "expr": "max by (route) (rate(audius_dn_flask_route_latency_seconds_sum{environment=~\"$env\", host=~\"$host\"}[5m]) / rate(audius_dn_flask_route_latency_seconds_count{environment=~\"$env\", host=~\"$host\"}[5m]))", + "expr": "max by (route) (rate(audius_dn_flask_route_duration_seconds_sum{environment=~\"$env\", host=~\"$host\"}[5m]) / rate(audius_dn_flask_route_duration_seconds_count{environment=~\"$env\", host=~\"$host\"}[5m]))", "hide": false, "legendFormat": "{{route}}", "range": true, @@ -416,7 +416,7 @@ "type": "prometheus" }, "editorMode": "code", - "expr": "sum(rate(audius_dn_flask_route_latency_seconds_sum{environment=~\"$env\", host=~\"$host\"}[5m])) / 
sum(rate(audius_dn_flask_route_latency_seconds_count{environment=~\"$env\", host=~\"$host\"}[5m]))", + "expr": "sum(rate(audius_dn_flask_route_duration_seconds_sum{environment=~\"$env\", host=~\"$host\"}[5m])) / sum(rate(audius_dn_flask_route_duration_seconds_count{environment=~\"$env\", host=~\"$host\"}[5m]))", "legendFormat": "average", "range": true, "refId": "A" @@ -521,7 +521,7 @@ "type": "prometheus" }, "editorMode": "code", - "expr": "(sum by (code) (rate(audius_dn_flask_route_latency_seconds_sum{environment=~\"$env\", host=~\"$host\"}[5m])))", + "expr": "(sum by (code) (rate(audius_dn_flask_route_duration_seconds_sum{environment=~\"$env\", host=~\"$host\"}[5m])))", "hide": false, "legendFormat": "__auto", "range": true, @@ -1061,7 +1061,7 @@ "type": "prometheus", "uid": "r2_nnDL7z" }, - "definition": "label_values(audius_dn_flask_route_latency_seconds_count, environment)", + "definition": "label_values(audius_dn_flask_route_duration_seconds_count, environment)", "description": "", "hide": 0, "includeAll": true, @@ -1070,7 +1070,7 @@ "name": "env", "options": [], "query": { - "query": "label_values(audius_dn_flask_route_latency_seconds_count, environment)", + "query": "label_values(audius_dn_flask_route_duration_seconds_count, environment)", "refId": "StandardVariableQuery" }, "refresh": 1, @@ -1085,7 +1085,7 @@ "type": "prometheus", "uid": "r2_nnDL7z" }, - "definition": "label_values(audius_dn_flask_route_latency_seconds_count{environment=~\"$env\"}, host)", + "definition": "label_values(audius_dn_flask_route_duration_seconds_count{environment=~\"$env\"}, host)", "description": "", "hide": 0, "includeAll": true, @@ -1094,7 +1094,7 @@ "name": "host", "options": [], "query": { - "query": "label_values(audius_dn_flask_route_latency_seconds_count{environment=~\"$env\"}, host)", + "query": "label_values(audius_dn_flask_route_duration_seconds_count{environment=~\"$env\"}, host)", "refId": "StandardVariableQuery" }, "refresh": 1, diff --git 
a/monitoring/grafana/dashboards/audius-network-monitoring.json b/monitoring/grafana/dashboards/audius-network-monitoring.json index e580c06392f..edf68497428 100644 --- a/monitoring/grafana/dashboards/audius-network-monitoring.json +++ b/monitoring/grafana/dashboards/audius-network-monitoring.json @@ -918,7 +918,7 @@ "list": [ { "allValue": ".*", - "definition": "label_values(audius_dn_flask_route_latency_seconds_count, environment)", + "definition": "label_values(audius_dn_flask_route_duration_seconds_count, environment)", "hide": 0, "includeAll": true, "label": "Environment", @@ -926,7 +926,7 @@ "name": "env", "options": [], "query": { - "query": "label_values(audius_dn_flask_route_latency_seconds_count, environment)", + "query": "label_values(audius_dn_flask_route_duration_seconds_count, environment)", "refId": "StandardVariableQuery" }, "refresh": 1, diff --git a/monitoring/grafana/dashboards/dheeraj-content-node-dashboard-test.json b/monitoring/grafana/dashboards/dheeraj-content-node-dashboard-test.json index 1ea46f15c9e..175b6532bbf 100644 --- a/monitoring/grafana/dashboards/dheeraj-content-node-dashboard-test.json +++ b/monitoring/grafana/dashboards/dheeraj-content-node-dashboard-test.json @@ -344,7 +344,7 @@ "type": "prometheus", "uid": "r2_nnDL7z" }, - "definition": "label_values(audius_dn_flask_route_latency_seconds_count, environment)", + "definition": "label_values(audius_dn_flask_route_duration_seconds_count, environment)", "description": "", "hide": 0, "includeAll": true, @@ -353,7 +353,7 @@ "name": "env", "options": [], "query": { - "query": "label_values(audius_dn_flask_route_latency_seconds_count, environment)", + "query": "label_values(audius_dn_flask_route_duration_seconds_count, environment)", "refId": "StandardVariableQuery" }, "refresh": 1, diff --git a/monitoring/grafana/dashboards/joaquin-network-monitoring-v2.json b/monitoring/grafana/dashboards/joaquin-network-monitoring-v2.json index 5288c46aa1d..c89cf4c8d21 100644 --- 
a/monitoring/grafana/dashboards/joaquin-network-monitoring-v2.json +++ b/monitoring/grafana/dashboards/joaquin-network-monitoring-v2.json @@ -401,7 +401,7 @@ "type": "prometheus", "uid": "r2_nnDL7z" }, - "definition": "label_values(audius_dn_flask_route_latency_seconds_count, environment)", + "definition": "label_values(audius_dn_flask_route_duration_seconds_count, environment)", "description": "", "hide": 0, "includeAll": true, @@ -410,7 +410,7 @@ "name": "env", "options": [], "query": { - "query": "label_values(audius_dn_flask_route_latency_seconds_count, environment)", + "query": "label_values(audius_dn_flask_route_duration_seconds_count, environment)", "refId": "StandardVariableQuery" }, "refresh": 1, @@ -425,7 +425,7 @@ "type": "prometheus", "uid": "r2_nnDL7z" }, - "definition": "label_values(audius_dn_flask_route_latency_seconds_count{environment=~\"$env\"}, host)", + "definition": "label_values(audius_dn_flask_route_duration_seconds_count{environment=~\"$env\"}, host)", "description": "", "hide": 0, "includeAll": true, @@ -434,7 +434,7 @@ "name": "host", "options": [], "query": { - "query": "label_values(audius_dn_flask_route_latency_seconds_count{environment=~\"$env\"}, host)", + "query": "label_values(audius_dn_flask_route_duration_seconds_count{environment=~\"$env\"}, host)", "refId": "StandardVariableQuery" }, "refresh": 1, diff --git a/monitoring/grafana/dashboards/joaquin-playground.json b/monitoring/grafana/dashboards/joaquin-playground.json index b76f070767d..886629ee003 100644 --- a/monitoring/grafana/dashboards/joaquin-playground.json +++ b/monitoring/grafana/dashboards/joaquin-playground.json @@ -708,7 +708,7 @@ "type": "prometheus" }, "exemplar": true, - "expr": "max by (route) (rate(audius_dn_flask_route_latency_seconds_count{environment=~\"$env\", host=~\"$host\"}[5m]))", + "expr": "max by (route) (rate(audius_dn_flask_route_duration_seconds_count{environment=~\"$env\", host=~\"$host\"}[5m]))", "interval": "", "legendFormat": 
"{{host}}{{route}}", "refId": "A" @@ -801,7 +801,7 @@ "type": "prometheus" }, "exemplar": true, - "expr": "max by (route) (rate(audius_dn_flask_route_latency_seconds_sum{environment=~\"$env\", host=~\"$host\"}[5m]) / rate(audius_dn_flask_route_latency_seconds_count{environment=~\"$env\", host=~\"$host\"}[5m]))", + "expr": "max by (route) (rate(audius_dn_flask_route_duration_seconds_sum{environment=~\"$env\", host=~\"$host\"}[5m]) / rate(audius_dn_flask_route_duration_seconds_count{environment=~\"$env\", host=~\"$host\"}[5m]))", "format": "time_series", "interval": "", "intervalFactor": 2, @@ -895,7 +895,7 @@ "type": "prometheus" }, "exemplar": true, - "expr": "max by (host, route, code) (rate(audius_dn_flask_route_latency_seconds_count{code!=\"200\", code!=\"\", environment=~\"$env\", host=~\"$host\"}[5m]))", + "expr": "max by (host, route, code) (rate(audius_dn_flask_route_duration_seconds_count{code!=\"200\", code!=\"\", environment=~\"$env\", host=~\"$host\"}[5m]))", "interval": "", "legendFormat": "{{host}}{{route}} ({{code}})", "refId": "A" @@ -2150,7 +2150,7 @@ "datasource": { "type": "prometheus" }, - "definition": "label_values(audius_dn_flask_route_latency_seconds_count, environment)", + "definition": "label_values(audius_dn_flask_route_duration_seconds_count, environment)", "description": "", "hide": 0, "includeAll": true, @@ -2159,7 +2159,7 @@ "name": "env", "options": [], "query": { - "query": "label_values(audius_dn_flask_route_latency_seconds_count, environment)", + "query": "label_values(audius_dn_flask_route_duration_seconds_count, environment)", "refId": "StandardVariableQuery" }, "refresh": 1, @@ -2173,7 +2173,7 @@ "datasource": { "type": "prometheus" }, - "definition": "label_values(audius_dn_flask_route_latency_seconds_count{environment=~\"$env\"}, host)", + "definition": "label_values(audius_dn_flask_route_duration_seconds_count{environment=~\"$env\"}, host)", "description": "", "hide": 0, "includeAll": true, @@ -2182,7 +2182,7 @@ "name": 
"host", "options": [], "query": { - "query": "label_values(audius_dn_flask_route_latency_seconds_count{environment=~\"$env\"}, host)", + "query": "label_values(audius_dn_flask_route_duration_seconds_count{environment=~\"$env\"}, host)", "refId": "StandardVariableQuery" }, "refresh": 1, From fe20015ccd956e5a758d2a9b94f5edfde1523612 Mon Sep 17 00:00:00 2001 From: Joaquin Casares Date: Fri, 8 Jul 2022 01:37:47 +0000 Subject: [PATCH 20/23] latest to suffix --- discovery-provider/src/queries/get_health.py | 4 ++-- discovery-provider/src/utils/prometheus_metric.py | 12 ++++++------ monitoring/README.md | 2 +- ...udius-discovery-provider-blockchain-indexing.json | 8 ++++---- .../audius-discovery-provider-overview.json | 2 +- .../grafana/dashboards/joaquin-playground.json | 4 ++-- 6 files changed, 16 insertions(+), 16 deletions(-) diff --git a/discovery-provider/src/queries/get_health.py b/discovery-provider/src/queries/get_health.py index 334c1c3557c..053e5122911 100644 --- a/discovery-provider/src/queries/get_health.py +++ b/discovery-provider/src/queries/get_health.py @@ -462,11 +462,11 @@ def get_elasticsearch_health_info( def health_check_prometheus_exporter(): health_results, is_unhealthy = get_health({}) - PrometheusMetric(PrometheusMetricNames.HEALTH_CHECK_LATEST_BLOCK_DIFFERENCE).save( + PrometheusMetric(PrometheusMetricNames.HEALTH_CHECK_BLOCK_DIFFERENCE_LATEST).save( health_results["block_difference"] ) - PrometheusMetric(PrometheusMetricNames.HEALTH_CHECK_LATEST_INDEXED_BLOCK_NUM).save( + PrometheusMetric(PrometheusMetricNames.HEALTH_CHECK_INDEXED_BLOCK_NUM_LATEST).save( health_results["web"]["blocknumber"] ) diff --git a/discovery-provider/src/utils/prometheus_metric.py b/discovery-provider/src/utils/prometheus_metric.py index 6d2c861c485..30dcd38b23a 100644 --- a/discovery-provider/src/utils/prometheus_metric.py +++ b/discovery-provider/src/utils/prometheus_metric.py @@ -71,8 +71,8 @@ class PrometheusMetricNames: CELERY_TASK_DURATION_SECONDS = 
"celery_task_duration_seconds" CELERY_TASK_LAST_DURATION_SECONDS = "celery_task_last_duration_seconds" FLASK_ROUTE_DURATION_SECONDS = "flask_route_duration_seconds" - HEALTH_CHECK_LATEST_BLOCK_DIFFERENCE = "health_check_latest_block_difference" - HEALTH_CHECK_LATEST_INDEXED_BLOCK_NUM = "health_check_latest_indexed_block_num" + HEALTH_CHECK_BLOCK_DIFFERENCE_LATEST = "health_check_block_difference_latest" + HEALTH_CHECK_INDEXED_BLOCK_NUM_LATEST = "health_check_indexed_block_num_latest" INDEX_BLOCKS_DURATION_SECONDS = "index_blocks_duration_seconds" INDEX_METRICS_DURATION_SECONDS = "index_metrics_duration_seconds" INDEX_TRENDING_DURATION_SECONDS = "index_trending_duration_seconds" @@ -115,12 +115,12 @@ class PrometheusMetricNames: "route", ), ), - PrometheusMetricNames.HEALTH_CHECK_LATEST_BLOCK_DIFFERENCE: Gauge( - f"{METRIC_PREFIX}_{PrometheusMetricNames.HEALTH_CHECK_LATEST_BLOCK_DIFFERENCE}", + PrometheusMetricNames.HEALTH_CHECK_BLOCK_DIFFERENCE_LATEST: Gauge( + f"{METRIC_PREFIX}_{PrometheusMetricNames.HEALTH_CHECK_BLOCK_DIFFERENCE_LATEST}", "Difference between the latest block and the latest indexed block", ), - PrometheusMetricNames.HEALTH_CHECK_LATEST_INDEXED_BLOCK_NUM: Gauge( - f"{METRIC_PREFIX}_{PrometheusMetricNames.HEALTH_CHECK_LATEST_INDEXED_BLOCK_NUM}", + PrometheusMetricNames.HEALTH_CHECK_INDEXED_BLOCK_NUM_LATEST: Gauge( + f"{METRIC_PREFIX}_{PrometheusMetricNames.HEALTH_CHECK_INDEXED_BLOCK_NUM_LATEST}", "Latest indexed block number", ), PrometheusMetricNames.INDEX_BLOCKS_DURATION_SECONDS: Histogram( diff --git a/monitoring/README.md b/monitoring/README.md index 1f229251a4b..a0ffbdd68f1 100644 --- a/monitoring/README.md +++ b/monitoring/README.md @@ -159,7 +159,7 @@ When additional complexity is required, visit the [official Prometheus documenta Gauges are the easiest pattern since they simply display the value of a metric that was displayed at scrape time: -> `audius_dn_health_check_latest_block_difference{environment=~"$env", host=~"$host"}` +> 
`audius_dn_health_check_block_difference_latest{environment=~"$env", host=~"$host"}` Notice how we restrict the `environment` and `host` labels associated with the metric to match the Dashboard Variables discussed in the previous section. diff --git a/monitoring/grafana/dashboards/audius-discovery-provider-blockchain-indexing.json b/monitoring/grafana/dashboards/audius-discovery-provider-blockchain-indexing.json index 08d08248820..45f728eb4e9 100644 --- a/monitoring/grafana/dashboards/audius-discovery-provider-blockchain-indexing.json +++ b/monitoring/grafana/dashboards/audius-discovery-provider-blockchain-indexing.json @@ -202,7 +202,7 @@ "type": "prometheus" }, "editorMode": "code", - "expr": "audius_dn_health_check_latest_block_difference{environment=~\"$env\", host=~\"$host\"}", + "expr": "audius_dn_health_check_block_difference_latest{environment=~\"$env\", host=~\"$host\"}", "legendFormat": "{{host}}", "range": true, "refId": "A" @@ -263,7 +263,7 @@ "type": "prometheus" }, "editorMode": "code", - "expr": "max(audius_dn_health_check_latest_block_difference{environment=~\"$env\", host=~\"$host\"})", + "expr": "max(audius_dn_health_check_block_difference_latest{environment=~\"$env\", host=~\"$host\"})", "legendFormat": "Max", "range": true, "refId": "A" @@ -273,7 +273,7 @@ "type": "prometheus" }, "editorMode": "code", - "expr": "avg(audius_dn_health_check_latest_block_difference{environment=~\"$env\", host=~\"$host\"})", + "expr": "avg(audius_dn_health_check_block_difference_latest{environment=~\"$env\", host=~\"$host\"})", "hide": false, "legendFormat": "Average", "range": true, @@ -284,7 +284,7 @@ "type": "prometheus" }, "editorMode": "code", - "expr": "min(audius_dn_health_check_latest_block_difference{environment=~\"$env\", host=~\"$host\"})", + "expr": "min(audius_dn_health_check_block_difference_latest{environment=~\"$env\", host=~\"$host\"})", "hide": false, "legendFormat": "Min", "range": true, diff --git 
a/monitoring/grafana/dashboards/audius-discovery-provider-overview.json b/monitoring/grafana/dashboards/audius-discovery-provider-overview.json index d1e309c495b..9b3a2c8b3c3 100644 --- a/monitoring/grafana/dashboards/audius-discovery-provider-overview.json +++ b/monitoring/grafana/dashboards/audius-discovery-provider-overview.json @@ -729,7 +729,7 @@ "type": "prometheus" }, "editorMode": "code", - "expr": "audius_dn_health_check_latest_block_difference{environment=~\"$env\", host=~\"$host\"}", + "expr": "audius_dn_health_check_block_difference_latest{environment=~\"$env\", host=~\"$host\"}", "legendFormat": "{{host}}", "range": true, "refId": "A" diff --git a/monitoring/grafana/dashboards/joaquin-playground.json b/monitoring/grafana/dashboards/joaquin-playground.json index 886629ee003..7329b735285 100644 --- a/monitoring/grafana/dashboards/joaquin-playground.json +++ b/monitoring/grafana/dashboards/joaquin-playground.json @@ -318,7 +318,7 @@ "type": "prometheus" }, "exemplar": true, - "expr": "audius_dn_health_check_latest_block_difference{environment=~\"$env\", host=~\"$host\"}", + "expr": "audius_dn_health_check_block_difference_latest{environment=~\"$env\", host=~\"$host\"}", "interval": "", "legendFormat": "{{host}}", "refId": "A" @@ -409,7 +409,7 @@ "type": "prometheus" }, "exemplar": true, - "expr": "audius_dn_health_check_latest_block_difference{environment=~\"$env\", host=~\"$host\"}", + "expr": "audius_dn_health_check_block_difference_latest{environment=~\"$env\", host=~\"$host\"}", "interval": "", "legendFormat": "{{host}}", "refId": "A" From 2156670fb2aa1903bafa567c0341dcb22828ea60 Mon Sep 17 00:00:00 2001 From: Joaquin Casares Date: Fri, 8 Jul 2022 01:46:14 +0000 Subject: [PATCH 21/23] add docs --- .../src/utils/prometheus_metric.py | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/discovery-provider/src/utils/prometheus_metric.py b/discovery-provider/src/utils/prometheus_metric.py index 30dcd38b23a..5b63c6f5c87 100644 --- 
a/discovery-provider/src/utils/prometheus_metric.py +++ b/discovery-provider/src/utils/prometheus_metric.py @@ -67,6 +67,32 @@ def wrapper(*args, **kwargs): class PrometheusMetricNames: + """ + Attempt to group metrics with high_level prefixes like: + * `flask_` + * `celery_task_` + + Antepenultimate Suffixes: + * `_active` when measuring runtimes of actively running processes that are yet to be + completed + * `_last` when the last completed runtime is needed (duration in seconds) + * (default: do not use) `_completed` is always implied if the other two are missing. + Used when measuring runtimes of completed processes. + + Penultimate Suffixes: + * `_duration` when measuring task duration or runtimes + + Ultimate Suffixes: + * `_seconds` always the base unit (never microseconds, milliseconds, etc) + * `_latest` when looking at a snapshot of unit-less data + * `_total`, when accumulating a count, in addition to the unit if applicable + * `_info` for a pseudo-metric that provides metadata about the running binary + + See the following resources for related information: + * [Creator Node's docs](https://github.com/AudiusProject/audius-protocol/blob/master/creator-node/src/services/prometheusMonitoring/README.md) + * [Official docs](https://prometheus.io/docs/practices/naming) + """ + CELERY_TASK_ACTIVE_DURATION_SECONDS = "celery_task_active_duration_seconds" CELERY_TASK_DURATION_SECONDS = "celery_task_duration_seconds" CELERY_TASK_LAST_DURATION_SECONDS = "celery_task_last_duration_seconds" From 1b3681928d0ea8a00fab122dfd9d75545afa33de Mon Sep 17 00:00:00 2001 From: Joaquin Casares Date: Fri, 8 Jul 2022 01:52:34 +0000 Subject: [PATCH 22/23] ultimate suffix section --- discovery-provider/src/utils/prometheus_metric.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/discovery-provider/src/utils/prometheus_metric.py b/discovery-provider/src/utils/prometheus_metric.py index 5b63c6f5c87..5a11d48faa0 100644 --- 
a/discovery-provider/src/utils/prometheus_metric.py +++ b/discovery-provider/src/utils/prometheus_metric.py @@ -82,12 +82,14 @@ class PrometheusMetricNames: Penultimate Suffixes: * `_duration` when measuring task duration or runtimes - Ultimate Suffixes: + Suffixes: * `_seconds` always the base unit (never microseconds, milliseconds, etc) * `_latest` when looking at a snapshot of unit-less data - * `_total`, when accumulating a count, in addition to the unit if applicable * `_info` for a pseudo-metric that provides metadata about the running binary + Ultimate Suffixes: + * `_total`, when accumulating a count, in addition to above suffixes if applicable + See the following resources for related information: * [Creator Node's docs](https://github.com/AudiusProject/audius-protocol/blob/master/creator-node/src/services/prometheusMonitoring/README.md) * [Official docs](https://prometheus.io/docs/practices/naming) From 2b88bbc2016e888794fdb85a56af34e5c4649237 Mon Sep 17 00:00:00 2001 From: Joaquin Casares Date: Fri, 8 Jul 2022 02:03:47 +0000 Subject: [PATCH 23/23] docs for metric types and labels --- .../src/utils/prometheus_metric.py | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/discovery-provider/src/utils/prometheus_metric.py b/discovery-provider/src/utils/prometheus_metric.py index 5a11d48faa0..1c1bfd03cf8 100644 --- a/discovery-provider/src/utils/prometheus_metric.py +++ b/discovery-provider/src/utils/prometheus_metric.py @@ -113,6 +113,34 @@ class PrometheusMetricNames: USER_STATE_UPDATE_DURATION_SECONDS = "user_state_update_duration_seconds" +""" +Metric Types: + +* Prometheus Gauges: Prometheus Gauges (not to be confused with the Grafana Panel Type + which is a UI element which looks like a speedometer) will export a single metric + which is useful for point-in-time collection. 
+* Prometheus Histograms: Histograms are far more common, especially when timing how long + code runs, since a single metric endpoint will be exploded to create 11 additional + metrics (sum, count, and 9 statistical buckets). + * When looking at the raw /prometheus_metrics endpoint for + `audius_dn_update_aggregate_table_latency_seconds_bucket`, you can see how a + single metric explodes into multiple statistical helpers. + +Labels: + +Only use labels whose set of possible values has **low cardinality**. + +As a general guideline, try to keep the cardinality of your metrics below 10, and for +metrics that exceed that, aim to limit them to a handful across your whole system. +The vast majority of your metrics should have no labels. + +A few example labels: + +* `scope` is used when measuring a larger unit of work that may have subtasks we want to + measure runtimes for. + * `{scope="full"}` is reserved for the larger base unit of work +* `task_name` when similar CeleryTasks use the same helper code from different callers +""" PrometheusRegistry = { PrometheusMetricNames.CELERY_TASK_ACTIVE_DURATION_SECONDS: Gauge( f"{METRIC_PREFIX}_{PrometheusMetricNames.CELERY_TASK_ACTIVE_DURATION_SECONDS}",