diff --git a/clipper_admin/clipper_admin/docker/docker_metric_utils.py b/clipper_admin/clipper_admin/docker/docker_metric_utils.py index 08f487f77..c756db123 100644 --- a/clipper_admin/clipper_admin/docker/docker_metric_utils.py +++ b/clipper_admin/clipper_admin/docker/docker_metric_utils.py @@ -4,6 +4,8 @@ import os from ..version import __version__ +PROM_VERSION = "v2.1.0" + def ensure_clipper_tmp(): """ @@ -98,7 +100,7 @@ def run_metric_image(docker_client, common_labels, prometheus_port, ] metric_labels = common_labels.copy() docker_client.containers.run( - "prom/prometheus", + "prom/prometheus:{}".format(PROM_VERSION), metric_cmd, name="metric_frontend-{}".format(random.randint(0, 100000)), ports={'9090/tcp': prometheus_port}, diff --git a/examples/monitoring/Clipper_Dashboard.json b/examples/monitoring/Clipper_Dashboard.json index 16ae92538..1f508116a 100644 --- a/examples/monitoring/Clipper_Dashboard.json +++ b/examples/monitoring/Clipper_Dashboard.json @@ -18,11 +18,97 @@ "hideControls": false, "id": 1, "links": [], + "refresh": false, "rows": [ { "collapse": false, - "height": 247, + "height": 234, "panels": [ + { + "content": "# Fizz-Buzz Feature Sum Model\n\nThis dashboard demonstrates Clipper serving two \"Fizz-Buzz\" feature sum models. \n\nFor each prediction #i: the time to make the prediction (client side end-to-end) is\n- Long ~600ms: i divisible by 15. 
\n- Medium ~300ms: i divisible by 5.\n- Short ~150ms: i divisible by 3.\n- Very Short ~50ms: Other", + "id": 6, + "links": [], + "mode": "markdown", + "span": 6, + "title": "About", + "type": "text" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "Clipper", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 2, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 2, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "app_simple_example_num_predictions_count", + "format": "time_series", + "intervalFactor": 2, + "refId": "A" + } + ], + "thresholds": "", + "timeFrom": null, + "title": "Total Prediction Made", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, { "aliasColors": {}, "bars": true, @@ -30,14 +116,12 @@ "dashes": false, "datasource": "Clipper", "fill": 1, - "id": 1, + "id": 5, "legend": { - "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, - "rightSide": false, "show": false, "total": false, "values": false @@ -52,21 +136,22 @@ "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, - "span": 10, + "span": 4, "stack": false, "steppedLine": false, "targets": [ { - "expr": 
"avg(clipper_mc_end_to_end_latency_ms_bucket) by (le)", + "expr": "clipper_mc_pred_total", "format": "time_series", "intervalFactor": 2, + "legendFormat": "{{job}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Average End-to-End Latency Histogram (ms)", + "title": "Prediction Count by Model Container", "tooltip": { "shared": false, "sort": 0, @@ -84,11 +169,12 @@ }, "yaxes": [ { + "decimals": null, "format": "short", "label": null, "logBase": 1, "max": null, - "min": null, + "min": "0", "show": true }, { @@ -100,81 +186,179 @@ "show": false } ] - }, + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 198, + "panels": [ { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": true, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], + "aliasColors": {}, + "bars": true, + "dashLength": 10, + "dashes": false, "datasource": "Clipper", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, + "fill": 1, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, "show": false, - "thresholdLabels": false, - "thresholdMarkers": true + "total": false, + "values": false }, - "id": 2, - "interval": null, + "lines": false, + "linewidth": 1, "links": [], - "mappingType": 1, - "mappingTypes": [ + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ { - "name": "value to text", - "value": 1 + "expr": "avg(clipper_mc_handle_time_ms_bucket{le=\"10.0\"}) - avg(clipper_mc_handle_time_ms_bucket{le=\"5.0\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "5 - 10", + "refId": "A" }, { - "name": "range to text", - "value": 2 - } - ], - 
"maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ + "expr": "avg(clipper_mc_handle_time_ms_bucket{le=\"20.0\"}) - avg(clipper_mc_handle_time_ms_bucket{le=\"10.0\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "10 - 20", + "refId": "B" + }, { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 2, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ + "expr": "avg(clipper_mc_handle_time_ms_bucket{le=\"35.0\"}) - avg(clipper_mc_handle_time_ms_bucket{le=\"20.0\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "20 - 35", + "refId": "C" + }, { - "expr": "sum(clipper_mc_pred_total)", + "expr": "avg(clipper_mc_handle_time_ms_bucket{le=\"50.0\"}) - avg(clipper_mc_handle_time_ms_bucket{le=\"35.0\"})", "format": "time_series", "intervalFactor": 2, - "refId": "A" + "legendFormat": "35 - 50", + "refId": "D" + }, + { + "expr": "avg(clipper_mc_handle_time_ms_bucket{le=\"75.0\"}) - avg(clipper_mc_handle_time_ms_bucket{le=\"50.0\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "50 - 75", + "refId": "E" + }, + { + "expr": "avg(clipper_mc_handle_time_ms_bucket{le=\"100.0\"}) - avg(clipper_mc_handle_time_ms_bucket{le=\"75.0\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "75 - 100", + "refId": "F" + }, + { + "expr": "avg(clipper_mc_handle_time_ms_bucket{le=\"150.0\"}) - avg(clipper_mc_handle_time_ms_bucket{le=\"100.0\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "100 - 150", + "refId": "G" + }, + { + "expr": "avg(clipper_mc_handle_time_ms_bucket{le=\"200.0\"}) - avg(clipper_mc_handle_time_ms_bucket{le=\"150.0\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "150 - 200", + 
"refId": "H" + }, + { + "expr": "avg(clipper_mc_handle_time_ms_bucket{le=\"250.0\"}) - avg(clipper_mc_handle_time_ms_bucket{le=\"200.0\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "200 - 250", + "refId": "I" + }, + { + "expr": "avg(clipper_mc_handle_time_ms_bucket{le=\"300.0\"}) - avg(clipper_mc_handle_time_ms_bucket{le=\"250.0\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "250 - 300", + "refId": "J" + }, + { + "expr": "avg(clipper_mc_handle_time_ms_bucket{le=\"400.0\"}) - avg(clipper_mc_handle_time_ms_bucket{le=\"300.0\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "300 - 400", + "refId": "K" + }, + { + "expr": "avg(clipper_mc_handle_time_ms_bucket{le=\"500.0\"}) - avg(clipper_mc_handle_time_ms_bucket{le=\"400.0\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "400 - 500", + "refId": "L" + }, + { + "expr": "avg(clipper_mc_handle_time_ms_bucket{le=\"+Inf\"}) - avg(clipper_mc_handle_time_ms_bucket{le=\"500.0\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "500 - ", + "refId": "M" } ], - "thresholds": "", - "title": "Total Prediction Made", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Average Handle Time Histogram (ms)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "series", + "name": null, + "show": true, + "values": [ + "current" + ] + }, + "yaxes": [ { - "op": "=", - "text": "N/A", - "value": "null" + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true } - ], - "valueName": "avg" + ] } ], "repeat": null, @@ -186,7 +370,7 @@ }, { "collapse": false, - "height": 250, + "height": 263, "panels": [ 
{ "aliasColors": {}, @@ -194,7 +378,7 @@ "dashLength": 10, "dashes": false, "datasource": "Clipper", - "description": "For the past 10 minutes", + "description": "Sample over 30 sec range", "fill": 1, "id": 3, "legend": { @@ -216,33 +400,33 @@ "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, - "span": 12, + "span": 6, "stack": false, "steppedLine": false, "targets": [ { - "expr": "avg(rate(clipper_mc_end_to_end_latency_ms_sum[10m])/rate(clipper_mc_end_to_end_latency_ms_count[10m]))", + "expr": "avg(rate(clipper_mc_end_to_end_latency_ms_sum[30s])/rate(clipper_mc_end_to_end_latency_ms_count[30s]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "End to End", "refId": "A" }, { - "expr": "avg(rate(clipper_mc_recv_time_ms_sum[10m])/rate(clipper_mc_recv_time_ms_count[10m]))", + "expr": "avg(rate(clipper_mc_recv_time_ms_sum[30s])/rate(clipper_mc_recv_time_ms_count[30s]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "Receive Time", "refId": "B" }, { - "expr": "avg(rate(clipper_mc_parse_time_ms_sum[10m])/rate(clipper_mc_parse_time_ms_count[10m]))", + "expr": "avg(rate(clipper_mc_parse_time_ms_sum[30s])/rate(clipper_mc_parse_time_ms_count[30s]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "Parse Time", "refId": "C" }, { - "expr": "avg(rate(clipper_mc_handle_time_ms_sum[10m])/rate(clipper_mc_handle_time_ms_count[10m]))", + "expr": "avg(rate(clipper_mc_handle_time_ms_sum[30s])/rate(clipper_mc_handle_time_ms_count[30s]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "Handle Time", @@ -250,7 +434,7 @@ } ], "thresholds": [], - "timeFrom": "10m", + "timeFrom": null, "timeShift": null, "title": "Average Latency (ms)", "tooltip": { @@ -284,6 +468,96 @@ "show": true } ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Clipper", + "fill": 1, + "id": 7, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": 
true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "avg(histogram_quantile(0.99, rate(clipper_mc_end_to_end_latency_ms_bucket[10m])))", + "format": "time_series", + "instant": false, + "intervalFactor": 2, + "legendFormat": "99th Percentile", + "refId": "A" + }, + { + "expr": "avg(histogram_quantile(0.95, rate(clipper_mc_end_to_end_latency_ms_bucket[10m])))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "95th Percentile", + "refId": "B" + }, + { + "expr": "avg(histogram_quantile(0.5, rate(clipper_mc_end_to_end_latency_ms_bucket[10m])))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "50th Percentile", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Model Containers: Average End-to-end Latency", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "ms", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] } ], "repeat": null, @@ -301,7 +575,7 @@ "list": [] }, "time": { - "from": "now-6h", + "from": "now-5m", "to": "now" }, "timepicker": { diff --git a/examples/monitoring/query.py b/examples/monitoring/query.py index 4b83b1fe6..752d8b4d7 100644 --- a/examples/monitoring/query.py +++ b/examples/monitoring/query.py @@ -40,6 +40,22 @@ def signal_handler(signal, frame): sys.exit(0) +def produce_query_arr_for_ms(ms): + size = int(ms * 8000) ## Use python sum, scale linearly. 
+ return np.random.random(size) + + +def fizz_buzz(i): + if i % 15 == 0: + return produce_query_arr_for_ms(200) + elif i % 5 == 0: + return produce_query_arr_for_ms(100) + elif i % 3 == 0: + return produce_query_arr_for_ms(50) + else: + return produce_query_arr_for_ms(10) + + if __name__ == '__main__': signal.signal(signal.SIGINT, signal_handler) clipper_conn = ClipperConnection(DockerContainerManager()) @@ -50,18 +66,12 @@ def signal_handler(signal, frame): time.sleep(2) print("Starting Prediction") - # For batch inputs set this number > 1 - batch_size = 1 - try: + counter = 0 while True: - if batch_size > 1: - predict( - clipper_conn.get_query_addr(), - [list(np.random.random(200)) for i in range(batch_size)], - batch=True) - else: - predict(clipper_conn.get_query_addr(), np.random.random(200)) + print(counter) + predict(clipper_conn.get_query_addr(), fizz_buzz(counter)) + counter += 1 time.sleep(0.2) except Exception as e: clipper_conn.stop_all()