From e0a1d513d4913733c9040e7457501e3ab61714ce Mon Sep 17 00:00:00 2001 From: Grace Do Date: Tue, 2 Oct 2018 13:01:16 -0700 Subject: [PATCH 1/4] Collect framework_offers, allocator metrics with mesos input plugin --- plugins/inputs/mesos/README.md | 52 ++++++ plugins/inputs/mesos/mesos.go | 213 ++++++++++++++++++++++- plugins/inputs/mesos/mesos_test.go | 271 ++++++++++++++++++++++++++++- 3 files changed, 534 insertions(+), 2 deletions(-) diff --git a/plugins/inputs/mesos/README.md b/plugins/inputs/mesos/README.md index b18908b8a3b9a..6964f0a6d40e9 100644 --- a/plugins/inputs/mesos/README.md +++ b/plugins/inputs/mesos/README.md @@ -19,10 +19,12 @@ For more information, please check the [Mesos Observability Metrics](http://meso "system", "agents", "frameworks", + "framework_offers", "tasks", "messages", "evqueue", "registrar", + "allocator", ] ## A list of Mesos slaves, default is [] # slaves = [] @@ -108,6 +110,22 @@ Mesos master metric groups - master/frameworks_inactive - master/outstanding_offers +- framework offers + - master/frameworks/subscribed + - master/frameworks/calls_total + - master/frameworks/calls + - master/frameworks/events_total + - master/frameworks/events + - master/frameworks/operations_total + - master/frameworks/operations + - master/frameworks/tasks/active + - master/frameworks/tasks/terminal + - master/frameworks/offers/sent + - master/frameworks/offers/accepted + - master/frameworks/offers/declined + - master/frameworks/offers/rescinded + - master/frameworks/roles/suppressed + - tasks - master/tasks_error - master/tasks_failed @@ -173,6 +191,40 @@ Mesos master metric groups - registrar/state_store_ms/p999 - registrar/state_store_ms/p9999 +- allocator + - allocator/allocation_run_ms + - allocator/allocation_run_ms/count + - allocator/allocation_run_ms/max + - allocator/allocation_run_ms/min + - allocator/allocation_run_ms/p50 + - allocator/allocation_run_ms/p90 + - allocator/allocation_run_ms/p95 + - allocator/allocation_run_ms/p99 + - allocator/allocation_run_ms/p999 + - allocator/allocation_run_ms/p9999 + - allocator/allocation_runs + - allocator/allocation_run_latency_ms + - allocator/allocation_run_latency_ms/count + - allocator/allocation_run_latency_ms/max + - allocator/allocation_run_latency_ms/min + - allocator/allocation_run_latency_ms/p50 + - allocator/allocation_run_latency_ms/p90 + - allocator/allocation_run_latency_ms/p95 + - allocator/allocation_run_latency_ms/p99 + - allocator/allocation_run_latency_ms/p999 + - allocator/allocation_run_latency_ms/p9999 + - allocator/roles/shares/dominant + - allocator/event_queue_dispatches + - allocator/offer_filters/roles/active + - allocator/quota/roles/resources/offered_or_allocated + - allocator/quota/roles/resources/guarantee + - allocator/resources/cpus/offered_or_allocated + - allocator/resources/cpus/total + - allocator/resources/disk/offered_or_allocated + - allocator/resources/disk/total + - allocator/resources/mem/offered_or_allocated + - allocator/resources/mem/total + Mesos slave metric groups - resources - slave/cpus_percent diff --git a/plugins/inputs/mesos/mesos.go b/plugins/inputs/mesos/mesos.go index 9190ceae8784b..8b322b84ddf2d 100644 --- a/plugins/inputs/mesos/mesos.go +++ b/plugins/inputs/mesos/mesos.go @@ -3,6 +3,7 @@ package mesos import ( "encoding/json" "errors" + "fmt" "io/ioutil" "log" "net" @@ -41,8 +42,80 @@ type Mesos struct { slaveURLs []*url.URL } +// TaggedField allows us to create a predictable hash from each unique +// combination of tags +type TaggedField struct { + FrameworkName string + CallType string + EventType string + OperationType string + TaskState string + RoleName string + FieldName string + Resource string + Value interface{} +} + +func (tf TaggedField) hash() string { + buffer := "tf" + + if tf.FrameworkName != "" { + buffer += "_fn:" + tf.FrameworkName + } + if tf.CallType != "" { + buffer += "_ct:" + tf.CallType + } + if tf.EventType != "" { + buffer += "_et:" + tf.EventType + } + if tf.OperationType != "" { + buffer += "_ot:" + tf.OperationType + } + if tf.TaskState != "" { + buffer += "_ts:" + tf.TaskState + } + if tf.RoleName != "" { + buffer += "_rn:" + tf.RoleName + } + if tf.Resource != "" { + buffer += "_r:" + tf.Resource + } + + return buffer +} + +type fieldTags map[string]string + +func (tf TaggedField) tags() fieldTags { + tags := fieldTags{} + + if tf.FrameworkName != "" { + tags["framework_name"] = tf.FrameworkName + } + if tf.CallType != "" { + tags["call_type"] = tf.CallType + } + if tf.EventType != "" { + tags["event_type"] = tf.EventType + } + if tf.OperationType != "" { + tags["operation_type"] = tf.OperationType + } + if tf.TaskState != "" { + tags["task_state"] = tf.TaskState + } + if tf.RoleName != "" { + tags["role_name"] = tf.RoleName + } + if tf.Resource != "" { + tags["resource"] = tf.Resource + } + + return tags +} + var allMetrics = map[Role][]string{ - MASTER: {"resources", "master", "system", "agents", "frameworks", "tasks", "messages", "evqueue", "registrar"}, + MASTER: {"resources", "master", "system", "agents", "frameworks", "framework_offers", "tasks", "messages", "evqueue", "registrar", "allocator"}, SLAVE: {"resources", "agent", "system", "executors", "tasks", "messages"}, } @@ -58,10 +131,12 @@ var sampleConfig = ` "system", "agents", "frameworks", + "framework_offers", "tasks", "messages", "evqueue", "registrar", + "allocator", ] ## A list of Mesos slaves, default is [] # slaves = [] @@ -315,6 +390,12 @@ func getMetrics(role Role, group string) []string { "master/outstanding_offers", } + // These groups are empty because filtering is done in gatherMainMetrics + // based on presence of "framework_offers"/"allocator" in MasterCols. + // These lines are included to prevent the "unknown" info log below. + m["framework_offers"] = []string{} + m["allocator"] = []string{} + m["tasks"] = []string{ "master/tasks_error", "master/tasks_failed", @@ -593,11 +674,141 @@ func (m *Mesos) gatherMainMetrics(u *url.URL, role Role, acc telegraf.Accumulato } } + var includeFrameworkOffers, includeAllocator bool + for _, col := range m.MasterCols { + if col == "framework_offers" { + includeFrameworkOffers = true + } else if col == "allocator" { + includeAllocator = true + } + } + + taggedFields := map[string][]TaggedField{} + extraTags := map[string]fieldTags{} + + for metricName, val := range jf.Fields { + if !strings.HasPrefix(metricName, "master/frameworks/") && !strings.HasPrefix(metricName, "allocator/") { + continue + } + + // filter out framework offers/allocator metrics if necessary + if (!includeFrameworkOffers && strings.HasPrefix(metricName, "master/frameworks/")) || + (!includeAllocator && strings.HasPrefix(metricName, "allocator/")) { + delete(jf.Fields, metricName) + continue + } + + parts := strings.Split(metricName, "/") + if (parts[0] == "master" && len(parts) < 5) || (parts[0] == "allocator" && len(parts) <= 5) { + // All framework offers metrics have at least 5 parts. + // All allocator metrics with <= 5 parts can be sent as is and does not pull + // any params out into tags. + // (e.g. allocator/mesos/allocation_run_ms/count vs allocator/mesos/roles//shares/dominant) + continue + } + + tf := generateTaggedField(parts) + tf.Value = val + + if len(tf.tags()) == 0 { + // indicates no extra tags were added + continue + } + + tfh := tf.hash() + if _, ok := taggedFields[tfh]; !ok { + taggedFields[tfh] = []TaggedField{} + } + taggedFields[tfh] = append(taggedFields[tfh], tf) + + if _, ok := extraTags[tfh]; !ok { + extraTags[tfh] = tf.tags() + } + + delete(jf.Fields, metricName) + } + acc.AddFields("mesos", jf.Fields, tags) + for tfh, tfs := range taggedFields { + fields := map[string]interface{}{} + for _, tf := range tfs { + fields[tf.FieldName] = tf.Value + } + for k, v := range tags { + extraTags[tfh][k] = v + } + + acc.AddFields("mesos", fields, extraTags[tfh]) + } + return nil } +func generateTaggedField(parts []string) TaggedField { + tf := TaggedField{} + + if parts[0] == "master" { + tf.FrameworkName = parts[2] + if len(parts) == 5 { + // e.g. /master/frameworks/calls_total + tf.FieldName = fmt.Sprintf("%s/%s/%s_total", parts[0], parts[1], parts[4]) + } else { + switch parts[4] { + case "offers": + // e.g. /master/frameworks/offers/sent + tf.FieldName = fmt.Sprintf("%s/%s/%s/%s", parts[0], parts[1], parts[4], parts[5]) + case "calls": + // e.g. /master/frameworks/calls/decline + tf.FieldName = fmt.Sprintf("%s/%s/%s", parts[0], parts[1], parts[4]) + tf.CallType = parts[5] + case "events": + // e.g. /master/frameworks/events/heartbeat + tf.FieldName = fmt.Sprintf("%s/%s/%s", parts[0], parts[1], parts[4]) + tf.EventType = parts[5] + case "operations": + // e.g. /master/frameworks/operations/create + tf.FieldName = fmt.Sprintf("%s/%s/%s", parts[0], parts[1], parts[4]) + tf.OperationType = parts[5] + case "tasks": + // e.g. /master/frameworks/tasks/active/running + tf.FieldName = fmt.Sprintf("%s/%s/%s/%s", parts[0], parts[1], parts[4], parts[5]) + tf.TaskState = parts[6] + case "roles": + // e.g. /master/frameworks/roles/public + tf.FieldName = fmt.Sprintf("%s/%s/%s/%s", parts[0], parts[1], parts[4], parts[6]) + tf.RoleName = parts[5] + default: + // default to excluding framework name and id, but otherwise leaving path as is + log.Printf("I! Unexpected metric name %s", parts[4]) + tf.FieldName = fmt.Sprintf("%s/%s/%s", parts[0], parts[1], strings.Join(parts[4:], "/")) + } + } + } else if parts[0] == "allocator" { + switch parts[2] { + case "roles": + // e.g. /allocator/roles/shares/dominant + tf.FieldName = fmt.Sprintf("%s/%s/%s/%s", parts[0], parts[2], parts[4], parts[5]) + tf.RoleName = parts[3] + case "offer_filters": + // e.g. /allocator/offer_filters/roles/active + tf.FieldName = fmt.Sprintf("%s/%s/%s/%s", parts[0], parts[2], parts[3], parts[5]) + tf.RoleName = parts[4] + case "quota": + // e.g. /allocator/quota/roles/resources/offered_or_allocated + tf.FieldName = fmt.Sprintf("%s/%s/%s/%s/%s", parts[0], parts[2], parts[3], parts[5], parts[7]) + tf.RoleName = parts[4] + tf.Resource = parts[6] + default: + // default to leaving path as is + log.Printf("I! Unexpected metric name %s", parts[2]) + tf.FieldName = strings.Join(parts, "/") + } + } + + return tf +} + func init() { inputs.Add("mesos", func() telegraf.Input { return &Mesos{} diff --git a/plugins/inputs/mesos/mesos_test.go b/plugins/inputs/mesos/mesos_test.go index 905adb6e3d44d..322335de60b12 100644 --- a/plugins/inputs/mesos/mesos_test.go +++ b/plugins/inputs/mesos/mesos_test.go @@ -8,9 +8,11 @@ import ( "net/http/httptest" "net/url" "os" + "strings" "testing" "github.com/influxdata/telegraf/testutil" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) @@ -83,6 +85,20 @@ func generateMetrics() { "master/frameworks_disconnected", "master/frameworks_inactive", "master/outstanding_offers", + // framework offers + "master/frameworks/marathon/abc-123/calls", + "master/frameworks/marathon/abc-123/calls/accept", + "master/frameworks/marathon/abc-123/events", + "master/frameworks/marathon/abc-123/events/error", + "master/frameworks/marathon/abc-123/offers/sent", + "master/frameworks/marathon/abc-123/operations", + "master/frameworks/marathon/abc-123/operations/create", + "master/frameworks/marathon/abc-123/roles/*/suppressed", + "master/frameworks/marathon/abc-123/subscribed", + "master/frameworks/marathon/abc-123/tasks/active/task_killing", + "master/frameworks/marathon/abc-123/tasks/active/task_dropped", + "master/frameworks/marathon/abc-123/tasks/terminal/task_dropped", + "master/frameworks/marathon/abc-123/unknown/unknown", // test case for unknown metric type // tasks "master/tasks_error", "master/tasks_failed", @@ -144,6 +160,44 @@ func generateMetrics() { "registrar/state_store_ms/p99", "registrar/state_store_ms/p999", "registrar/state_store_ms/p9999", + // allocator + "allocator/mesos/allocation_run_ms", + "allocator/mesos/allocation_run_ms/count", + "allocator/mesos/allocation_run_ms/max", + "allocator/mesos/allocation_run_ms/min", + "allocator/mesos/allocation_run_ms/p50", + "allocator/mesos/allocation_run_ms/p90", + "allocator/mesos/allocation_run_ms/p95", + "allocator/mesos/allocation_run_ms/p99", + "allocator/mesos/allocation_run_ms/p999", + "allocator/mesos/allocation_run_ms/p9999", + "allocator/mesos/allocation_runs", + "allocator/mesos/allocation_run_latency_ms", + "allocator/mesos/allocation_run_latency_ms/count", + "allocator/mesos/allocation_run_latency_ms/max", + "allocator/mesos/allocation_run_latency_ms/min", + "allocator/mesos/allocation_run_latency_ms/p50", + "allocator/mesos/allocation_run_latency_ms/p90", + "allocator/mesos/allocation_run_latency_ms/p95", + "allocator/mesos/allocation_run_latency_ms/p99", + "allocator/mesos/allocation_run_latency_ms/p999", + "allocator/mesos/allocation_run_latency_ms/p9999", + "allocator/mesos/roles/*/shares/dominant", + // test case against hash collisions in TaggedFields + // e.g. framework_name=marathon and role_name=marathon + "allocator/mesos/roles/marathon/shares/dominant", + "allocator/mesos/event_queue_dispatches", + "allocator/mesos/offer_filters/roles/*/active", + "allocator/mesos/quota/roles/*/resources/disk/offered_or_allocated", + "allocator/mesos/quota/roles/*/resources/mem/guarantee", + "allocator/mesos/quota/roles/*/resources/disk/guarantee", + "allocator/mesos/resources/cpus/offered_or_allocated", + "allocator/mesos/resources/cpus/total", + "allocator/mesos/resources/disk/offered_or_allocated", + "allocator/mesos/resources/disk/total", + "allocator/mesos/resources/mem/offered_or_allocated", + "allocator/mesos/resources/mem/total", + "allocator/mesos/unknown/unknown/unknown/unknown", // test case for unknown metric type } for _, k := range metricNames { @@ -290,7 +344,166 @@ func TestMesosMaster(t *testing.T) { t.Errorf(err.Error()) } - acc.AssertContainsFields(t, "mesos", masterMetrics) + expectedUntaggedMetrics := map[string]interface{}{} + for k, v := range masterMetrics { + parts := strings.Split(k, "/") + if !strings.HasPrefix(k, "master/frameworks/") && (!strings.HasPrefix(k, "allocator/") || len(parts) <= 5) { + expectedUntaggedMetrics[k] = v + } + } + // for unknown allocator metric type test case, expect no additional tags + expectedUntaggedMetrics["allocator/mesos/unknown/unknown/unknown/unknown"] = masterMetrics["allocator/mesos/unknown/unknown/unknown/unknown"] + + acc.AssertContainsFields(t, "mesos", expectedUntaggedMetrics) + + frameworkFields := []map[string]interface{}{ + // framework offers + { + // "unknown" metric type should still contain framework_name tag + "master/frameworks/unknown/unknown": masterMetrics["master/frameworks/marathon/abc-123/unknown/unknown"], + "master/frameworks/calls_total": masterMetrics["master/frameworks/marathon/abc-123/calls"], + "master/frameworks/events_total": masterMetrics["master/frameworks/marathon/abc-123/events"], + "master/frameworks/operations_total": masterMetrics["master/frameworks/marathon/abc-123/operations"], + "master/frameworks/subscribed_total": masterMetrics["master/frameworks/marathon/abc-123/subscribed"], + "master/frameworks/offers/sent": masterMetrics["master/frameworks/marathon/abc-123/offers/sent"], + }, + { + "master/frameworks/tasks/active": masterMetrics["master/frameworks/marathon/abc-123/tasks/active/task_killing"], + }, + { + "master/frameworks/tasks/active": masterMetrics["master/frameworks/marathon/abc-123/tasks/active/task_dropped"], + "master/frameworks/tasks/terminal": masterMetrics["master/frameworks/marathon/abc-123/tasks/terminal/task_dropped"], + }, + { + "master/frameworks/roles/suppressed": masterMetrics["master/frameworks/marathon/abc-123/roles/*/suppressed"], + }, + { + "master/frameworks/calls": masterMetrics["master/frameworks/marathon/abc-123/calls/accept"], + }, + { + "master/frameworks/events": masterMetrics["master/frameworks/marathon/abc-123/events/error"], + }, + { + "master/frameworks/operations": masterMetrics["master/frameworks/marathon/abc-123/operations/create"], + }, + // allocator + { + "allocator/roles/shares/dominant": masterMetrics["allocator/mesos/roles/*/shares/dominant"], + "allocator/offer_filters/roles/active": masterMetrics["allocator/mesos/offer_filters/roles/*/active"], + }, + { + "allocator/roles/shares/dominant": masterMetrics["allocator/mesos/roles/marathon/shares/dominant"], + }, + { + "allocator/quota/roles/resources/offered_or_allocated": masterMetrics["allocator/mesos/quota/roles/*/resources/disk/offered_or_allocated"], + "allocator/quota/roles/resources/guarantee": masterMetrics["allocator/mesos/quota/roles/*/resources/disk/guarantee"], + }, + { + "allocator/quota/roles/resources/guarantee": masterMetrics["allocator/mesos/quota/roles/*/resources/mem/guarantee"], + }, + } + + frameworkTags := []map[string]string{ + // framework offers + { + "server": m.masterURLs[0].Hostname(), + "url": masterTestServer.URL, + "role": "master", + "state": "leader", + "framework_name": "marathon", + }, + { + "server": m.masterURLs[0].Hostname(), + "url": masterTestServer.URL, + "role": "master", + "state": "leader", + "framework_name": "marathon", + "task_state": "task_killing", + }, + { + "server": m.masterURLs[0].Hostname(), + "url": masterTestServer.URL, + "role": "master", + "state": "leader", + "framework_name": "marathon", + "task_state": "task_dropped", + }, + { + "server": m.masterURLs[0].Hostname(), + "url": masterTestServer.URL, + "role": "master", + "state": "leader", + "framework_name": "marathon", + "role_name": "*", + }, + { + "server": m.masterURLs[0].Hostname(), + "url": masterTestServer.URL, + "role": "master", + "state": "leader", + "framework_name": "marathon", + "call_type": "accept", + }, + { + "server": m.masterURLs[0].Hostname(), + "url": masterTestServer.URL, + "role": "master", + "state": "leader", + "framework_name": "marathon", + "event_type": "error", + }, + { + "server": m.masterURLs[0].Hostname(), + "url": masterTestServer.URL, + "role": "master", + "state": "leader", + "framework_name": "marathon", + "operation_type": "create", + }, + // allocator + { + "server": m.masterURLs[0].Hostname(), + "url": masterTestServer.URL, + "role": "master", + "state": "leader", + "role_name": "*", + }, + { + "server": m.masterURLs[0].Hostname(), + "url": masterTestServer.URL, + "role": "master", + "state": "leader", + "role_name": "marathon", + }, + { + "server": m.masterURLs[0].Hostname(), + "url": masterTestServer.URL, + "role": "master", + "state": "leader", + "role_name": "*", + "resource": "disk", + }, + { + "server": m.masterURLs[0].Hostname(), + "url": masterTestServer.URL, + "role": "master", + "state": "leader", + "role_name": "*", + "resource": "mem", + }, + } + + for i := 0; i < len(frameworkFields); i++ { + acc.AssertContainsTaggedFields(t, "mesos", frameworkFields[i], frameworkTags[i]) + // Test that none of the other metrics share the same tags, which + // tests against potential hash collisions in TaggedFields. + for j := 0; j < len(frameworkFields); j++ { + if j == i { + continue + } + acc.AssertDoesNotContainsTaggedFields(t, "mesos", frameworkFields[j], frameworkTags[i]) + } + } } func TestMasterFilter(t *testing.T) { @@ -396,3 +609,59 @@ func TestURLTagDoesNotModify(t *testing.T) { require.Equal(t, u.String(), "http://a:b@localhost:5051?timeout=1ms") require.Equal(t, v, "http://localhost:5051") } + +func TestTaggedFieldHash(t *testing.T) { + assert := assert.New(t) + tf := TaggedField{ + FrameworkName: "marathon", + CallType: "accept", + EventType: "error", + OperationType: "create", + TaskState: "active", + RoleName: "marathon", + FieldName: "field/name", + Resource: "mem", + Value: 1.0, + } + assert.Equal("tf_fn:marathon_ct:accept_et:error_ot:create_ts:active_rn:marathon_r:mem", tf.hash()) + + // Test against hash collisions + tf1 := TaggedField{ + FrameworkName: "marathon", + FieldName: "field/name/1", + Value: 1.0, + } + tf2 := TaggedField{ + RoleName: "marathon", + FieldName: "field/name/2", + Value: 1.0, + } + assert.NotEqual(tf1.hash(), tf2.hash()) +} + +func TestTaggedFieldTags(t *testing.T) { + assert := assert.New(t) + tf := TaggedField{ + FrameworkName: "marathon", + CallType: "accept", + EventType: "error", + OperationType: "create", + TaskState: "active", + RoleName: "marathon", + FieldName: "field/name", + Resource: "mem", + Value: 1.0, + } + + expectedTags := fieldTags{ + "framework_name": "marathon", + "call_type": "accept", + "event_type": "error", + "operation_type": "create", + "task_state": "active", + "role_name": "marathon", + "resource": "mem", + } + + assert.Equal(expectedTags, tf.tags()) +} From 697073ec83cca6f9996ca506c393ea36cd6e3cc9 Mon Sep 17 00:00:00 2001 From: Branden Rolston Date: Thu, 1 Aug 2019 21:44:42 -0700 Subject: [PATCH 2/4] Collect additional Mesos metrics --- plugins/inputs/mesos/README.md | 20 ++++++++++++++++++++ plugins/inputs/mesos/mesos.go | 20 ++++++++++++++++++++ plugins/inputs/mesos/mesos_test.go | 20 ++++++++++++++++++++ 3 files changed, 60 insertions(+) diff --git a/plugins/inputs/mesos/README.md b/plugins/inputs/mesos/README.md index 6964f0a6d40e9..b9a46eaa9a632 100644 --- a/plugins/inputs/mesos/README.md +++ b/plugins/inputs/mesos/README.md @@ -102,6 +102,10 @@ Mesos master metric groups - master/slaves_connected - master/slaves_disconnected - master/slaves_inactive + - master/slave_unreachable_canceled + - master/slave_unreachable_completed + - master/slave_unreachable_scheduled + - master/slaves_unreachable - frameworks - master/frameworks_active @@ -135,6 +139,11 @@ Mesos master metric groups - master/tasks_running - master/tasks_staging - master/tasks_starting + - master/tasks_dropped + - master/tasks_gone + - master/tasks_gone_by_operator + - master/tasks_killing + - master/tasks_unreachable - messages - master/invalid_executor_to_framework_messages @@ -173,11 +182,17 @@ Mesos master metric groups - master/task_lost/source_master/reason_slave_removed - master/task_lost/source_slave/reason_executor_terminated - master/valid_executor_to_framework_messages + - master/invalid_operation_status_update_acknowledgements + - master/messages_operation_status_update_acknowledgement + - master/messages_reconcile_operations + - master/messages_suppress_offers + - master/valid_operation_status_update_acknowledgements - evqueue - master/event_queue_dispatches - master/event_queue_http_requests - master/event_queue_messages + - master/operator_event_stream_subscribers - registrar - registrar/state_fetch_ms @@ -190,6 +205,11 @@ Mesos master metric groups - registrar/state_store_ms/p99 - registrar/state_store_ms/p999 - registrar/state_store_ms/p9999 + - registrar/state_store_ms/count + - registrar/log/ensemble_size + - registrar/log/recovered + - registrar/queued_operations + - registrar/registry_size_bytes - allocator - allocator/allocation_run_ms diff --git a/plugins/inputs/mesos/mesos.go b/plugins/inputs/mesos/mesos.go index 8b322b84ddf2d..2c4cb4028077a 100644 --- a/plugins/inputs/mesos/mesos.go +++ b/plugins/inputs/mesos/mesos.go @@ -380,6 +380,10 @@ func getMetrics(role Role, group string) []string { "master/slaves_connected", "master/slaves_disconnected", "master/slaves_inactive", + "master/slave_unreachable_canceled", + "master/slave_unreachable_completed", + "master/slave_unreachable_scheduled", + "master/slaves_unreachable", } m["frameworks"] = []string{ @@ -405,6 +409,11 @@ func getMetrics(role Role, group string) []string { "master/tasks_running", "master/tasks_staging", "master/tasks_starting", + "master/tasks_dropped", + "master/tasks_gone", + "master/tasks_gone_by_operator", + "master/tasks_killing", + "master/tasks_unreachable", } m["messages"] = []string{ @@ -444,12 +453,18 @@ func getMetrics(role Role, group string) []string { "master/task_lost/source_master/reason_slave_removed", "master/task_lost/source_slave/reason_executor_terminated", "master/valid_executor_to_framework_messages", + "master/invalid_operation_status_update_acknowledgements", + "master/messages_operation_status_update_acknowledgement", + "master/messages_reconcile_operations", + "master/messages_suppress_offers", + "master/valid_operation_status_update_acknowledgements", } m["evqueue"] = []string{ "master/event_queue_dispatches", "master/event_queue_http_requests", "master/event_queue_messages", + "master/operator_event_stream_subscribers", } m["registrar"] = []string{ @@ -463,6 +478,11 @@ func getMetrics(role Role, group string) []string { "registrar/state_store_ms/p99", "registrar/state_store_ms/p999", "registrar/state_store_ms/p9999", + "registrar/log/ensemble_size", + "registrar/log/recovered", + "registrar/queued_operations", + "registrar/registry_size_bytes", + "registrar/state_store_ms/count", } } else if role == SLAVE { m["resources"] = []string{ diff --git a/plugins/inputs/mesos/mesos_test.go b/plugins/inputs/mesos/mesos_test.go index 322335de60b12..c33aa48dfa2c9 100644 --- a/plugins/inputs/mesos/mesos_test.go +++ b/plugins/inputs/mesos/mesos_test.go @@ -79,6 +79,10 @@ func generateMetrics() { "master/slaves_connected", "master/slaves_disconnected", "master/slaves_inactive", + "master/slave_unreachable_canceled", + "master/slave_unreachable_completed", + "master/slave_unreachable_scheduled", + "master/slaves_unreachable", // frameworks "master/frameworks_active", "master/frameworks_connected", @@ -108,6 +112,11 @@ func generateMetrics() { "master/tasks_running", "master/tasks_staging", "master/tasks_starting", + "master/tasks_dropped", + "master/tasks_gone", + "master/tasks_gone_by_operator", + "master/tasks_killing", + "master/tasks_unreachable", // messages "master/invalid_executor_to_framework_messages", "master/invalid_framework_to_executor_messages", @@ -145,11 +154,21 @@ func generateMetrics() { "master/task_lost/source_master/reason_slave_removed", "master/task_lost/source_slave/reason_executor_terminated", "master/valid_executor_to_framework_messages", + "master/invalid_operation_status_update_acknowledgements", + "master/messages_operation_status_update_acknowledgement", + "master/messages_reconcile_operations", + "master/messages_suppress_offers", + "master/valid_operation_status_update_acknowledgements", // evgqueue "master/event_queue_dispatches", "master/event_queue_http_requests", "master/event_queue_messages", + "master/operator_event_stream_subscribers", // registrar + "registrar/log/ensemble_size", + "registrar/log/recovered", + "registrar/queued_operations", + "registrar/registry_size_bytes", "registrar/state_fetch_ms", "registrar/state_store_ms", "registrar/state_store_ms/max", @@ -160,6 +179,7 @@ func generateMetrics() { "registrar/state_store_ms/p99", "registrar/state_store_ms/p999", "registrar/state_store_ms/p9999", + "registrar/state_store_ms/count", // allocator "allocator/mesos/allocation_run_ms", "allocator/mesos/allocation_run_ms/count", From effb993e04d36aecdeaafbbc217134de11d4738c Mon Sep 17 00:00:00 2001 From: Branden Rolston Date: Thu, 1 Aug 2019 22:22:49 -0700 Subject: [PATCH 3/4] Don't parse framework and allocator metric names This restores framework and allocator metrics to their original names, without extracting parts into tags. --- plugins/inputs/mesos/mesos.go | 188 +---------------------- plugins/inputs/mesos/mesos_test.go | 235 +---------------------------- 2 files changed, 5 insertions(+), 418 deletions(-) diff --git a/plugins/inputs/mesos/mesos.go b/plugins/inputs/mesos/mesos.go index 2c4cb4028077a..44d27dc4921d1 100644 --- a/plugins/inputs/mesos/mesos.go +++ b/plugins/inputs/mesos/mesos.go @@ -3,7 +3,6 @@ package mesos import ( "encoding/json" "errors" - "fmt" "io/ioutil" "log" "net" @@ -42,78 +41,6 @@ type Mesos struct { slaveURLs []*url.URL } -// TaggedField allows us to create a predictable hash from each unique -// combination of tags -type TaggedField struct { - FrameworkName string - CallType string - EventType string - OperationType string - TaskState string - RoleName string - FieldName string - Resource string - Value interface{} -} - -func (tf TaggedField) hash() string { - buffer := "tf" - - if tf.FrameworkName != "" { - buffer += "_fn:" + tf.FrameworkName - } - if tf.CallType != "" { - buffer += "_ct:" + tf.CallType - } - if tf.EventType != "" { - buffer += "_et:" + tf.EventType - } - if tf.OperationType != "" { - buffer += "_ot:" + tf.OperationType - } - if tf.TaskState != "" { - buffer += "_ts:" + tf.TaskState - } - if tf.RoleName != "" { - buffer += "_rn:" + tf.RoleName - } - if tf.Resource != "" { - buffer += "_r:" + tf.Resource - } - - return buffer -} - -type fieldTags map[string]string - -func (tf TaggedField) tags() fieldTags { - tags := fieldTags{} - - if tf.FrameworkName != "" { - tags["framework_name"] = tf.FrameworkName - } - if tf.CallType != "" { - tags["call_type"] = tf.CallType - } - if tf.EventType != "" { - tags["event_type"] = tf.EventType - } - if tf.OperationType != "" { - tags["operation_type"] = tf.OperationType - } - if tf.TaskState != "" { - tags["task_state"] = tf.TaskState - } - if tf.RoleName != "" { - tags["role_name"] = tf.RoleName - } - if tf.Resource != "" { - tags["resource"] = tf.Resource - } - - return tags -} - var allMetrics = map[Role][]string{ MASTER: {"resources", "master", "system", "agents", "frameworks", "framework_offers", "tasks", "messages", "evqueue", "registrar", "allocator"}, SLAVE: {"resources", "agent", "system", "executors", "tasks", "messages"}, @@ -703,132 +630,25 @@ func (m *Mesos) gatherMainMetrics(u *url.URL, role Role, acc telegraf.Accumulato } } - taggedFields := map[string][]TaggedField{} - extraTags := map[string]fieldTags{} - - for metricName, val := range jf.Fields { - if !strings.HasPrefix(metricName, "master/frameworks/") && !strings.HasPrefix(metricName, "allocator/") { + for metricName := range jf.Fields { + if !strings.HasPrefix(metricName, "master/frameworks/") && !strings.HasPrefix(metricName, "frameworks/") && !strings.HasPrefix(metricName, "allocator/") { continue } // filter out framework offers/allocator metrics if necessary - if (!includeFrameworkOffers && strings.HasPrefix(metricName, "master/frameworks/")) || + if !includeFrameworkOffers && + (strings.HasPrefix(metricName, "master/frameworks/") || strings.HasPrefix(metricName, "frameworks/")) || (!includeAllocator && strings.HasPrefix(metricName, "allocator/")) { delete(jf.Fields, metricName) continue } - - parts := strings.Split(metricName, "/") - if (parts[0] == "master" && len(parts) < 5) || (parts[0] == "allocator" && len(parts) <= 5) { - // All framework offers metrics have at least 5 parts. - // All allocator metrics with <= 5 parts can be sent as is and does not pull - // any params out into tags. - // (e.g. allocator/mesos/allocation_run_ms/count vs allocator/mesos/roles//shares/dominant) - continue - } - - tf := generateTaggedField(parts) - tf.Value = val - - if len(tf.tags()) == 0 { - // indicates no extra tags were added - continue - } - - tfh := tf.hash() - if _, ok := taggedFields[tfh]; !ok { - taggedFields[tfh] = []TaggedField{} - } - taggedFields[tfh] = append(taggedFields[tfh], tf) - - if _, ok := extraTags[tfh]; !ok { - extraTags[tfh] = tf.tags() - } - - delete(jf.Fields, metricName) } acc.AddFields("mesos", jf.Fields, tags) - for tfh, tfs := range taggedFields { - fields := map[string]interface{}{} - for _, tf := range tfs { - fields[tf.FieldName] = tf.Value - } - for k, v := range tags { - extraTags[tfh][k] = v - } - - acc.AddFields("mesos", fields, extraTags[tfh]) - } - return nil } -func generateTaggedField(parts []string) TaggedField { - tf := TaggedField{} - - if parts[0] == "master" { - tf.FrameworkName = parts[2] - if len(parts) == 5 { - // e.g. /master/frameworks/calls_total - tf.FieldName = fmt.Sprintf("%s/%s/%s_total", parts[0], parts[1], parts[4]) - } else { - switch parts[4] { - case "offers": - // e.g. /master/frameworks/offers/sent - tf.FieldName = fmt.Sprintf("%s/%s/%s/%s", parts[0], parts[1], parts[4], parts[5]) - case "calls": - // e.g. /master/frameworks/calls/decline - tf.FieldName = fmt.Sprintf("%s/%s/%s", parts[0], parts[1], parts[4]) - tf.CallType = parts[5] - case "events": - // e.g. /master/frameworks/events/heartbeat - tf.FieldName = fmt.Sprintf("%s/%s/%s", parts[0], parts[1], parts[4]) - tf.EventType = parts[5] - case "operations": - // e.g. /master/frameworks/operations/create - tf.FieldName = fmt.Sprintf("%s/%s/%s", parts[0], parts[1], parts[4]) - tf.OperationType = parts[5] - case "tasks": - // e.g. /master/frameworks/tasks/active/running - tf.FieldName = fmt.Sprintf("%s/%s/%s/%s", parts[0], parts[1], parts[4], parts[5]) - tf.TaskState = parts[6] - case "roles": - // e.g. /master/frameworks/roles/public - tf.FieldName = fmt.Sprintf("%s/%s/%s/%s", parts[0], parts[1], parts[4], parts[6]) - tf.RoleName = parts[5] - default: - // default to excluding framework name and id, but otherwise leaving path as is - log.Printf("I! Unexpected metric name %s", parts[4]) - tf.FieldName = fmt.Sprintf("%s/%s/%s", parts[0], parts[1], strings.Join(parts[4:], "/")) - } - } - } else if parts[0] == "allocator" { - switch parts[2] { - case "roles": - // e.g. /allocator/roles/shares/dominant - tf.FieldName = fmt.Sprintf("%s/%s/%s/%s", parts[0], parts[2], parts[4], parts[5]) - tf.RoleName = parts[3] - case "offer_filters": - // e.g. /allocator/offer_filters/roles/active - tf.FieldName = fmt.Sprintf("%s/%s/%s/%s", parts[0], parts[2], parts[3], parts[5]) - tf.RoleName = parts[4] - case "quota": - // e.g. /allocator/quota/roles/resources/offered_or_allocated - tf.FieldName = fmt.Sprintf("%s/%s/%s/%s/%s", parts[0], parts[2], parts[3], parts[5], parts[7]) - tf.RoleName = parts[4] - tf.Resource = parts[6] - default: - // default to leaving path as is - log.Printf("I! Unexpected metric name %s", parts[2]) - tf.FieldName = strings.Join(parts, "/") - } - } - - return tf -} - func init() { inputs.Add("mesos", func() telegraf.Input { return &Mesos{} diff --git a/plugins/inputs/mesos/mesos_test.go b/plugins/inputs/mesos/mesos_test.go index c33aa48dfa2c9..1e00dd307856d 100644 --- a/plugins/inputs/mesos/mesos_test.go +++ b/plugins/inputs/mesos/mesos_test.go @@ -8,11 +8,9 @@ import ( "net/http/httptest" "net/url" "os" - "strings" "testing" "github.com/influxdata/telegraf/testutil" - "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) @@ -203,9 +201,6 @@ func generateMetrics() { "allocator/mesos/allocation_run_latency_ms/p999", "allocator/mesos/allocation_run_latency_ms/p9999", "allocator/mesos/roles/*/shares/dominant", - // test case against hash collisions in TaggedFields - // e.g. framework_name=marathon and role_name=marathon - "allocator/mesos/roles/marathon/shares/dominant", "allocator/mesos/event_queue_dispatches", "allocator/mesos/offer_filters/roles/*/active", "allocator/mesos/quota/roles/*/resources/disk/offered_or_allocated", @@ -217,7 +212,6 @@ func generateMetrics() { "allocator/mesos/resources/disk/total", "allocator/mesos/resources/mem/offered_or_allocated", "allocator/mesos/resources/mem/total", - "allocator/mesos/unknown/unknown/unknown/unknown", // test case for unknown metric type } for _, k := range metricNames { @@ -364,166 +358,7 @@ func TestMesosMaster(t *testing.T) { t.Errorf(err.Error()) } - expectedUntaggedMetrics := map[string]interface{}{} - for k, v := range masterMetrics { - parts := strings.Split(k, "/") - if !strings.HasPrefix(k, "master/frameworks/") && (!strings.HasPrefix(k, "allocator/") || len(parts) <= 5) { - expectedUntaggedMetrics[k] = v - } - } - // for unknown allocator metric type test case, expect no additional tags - expectedUntaggedMetrics["allocator/mesos/unknown/unknown/unknown/unknown"] = masterMetrics["allocator/mesos/unknown/unknown/unknown/unknown"] - - acc.AssertContainsFields(t, "mesos", expectedUntaggedMetrics) - - frameworkFields := []map[string]interface{}{ - // framework offers - { - // "unknown" metric type should still contain framework_name tag - "master/frameworks/unknown/unknown": masterMetrics["master/frameworks/marathon/abc-123/unknown/unknown"], - "master/frameworks/calls_total": masterMetrics["master/frameworks/marathon/abc-123/calls"], - "master/frameworks/events_total": masterMetrics["master/frameworks/marathon/abc-123/events"], - "master/frameworks/operations_total": masterMetrics["master/frameworks/marathon/abc-123/operations"], - "master/frameworks/subscribed_total": masterMetrics["master/frameworks/marathon/abc-123/subscribed"], - "master/frameworks/offers/sent": masterMetrics["master/frameworks/marathon/abc-123/offers/sent"], - }, - { - "master/frameworks/tasks/active": masterMetrics["master/frameworks/marathon/abc-123/tasks/active/task_killing"], - }, - { - "master/frameworks/tasks/active": masterMetrics["master/frameworks/marathon/abc-123/tasks/active/task_dropped"], - "master/frameworks/tasks/terminal": masterMetrics["master/frameworks/marathon/abc-123/tasks/terminal/task_dropped"], - }, - { - "master/frameworks/roles/suppressed": masterMetrics["master/frameworks/marathon/abc-123/roles/*/suppressed"], - }, - { - "master/frameworks/calls": masterMetrics["master/frameworks/marathon/abc-123/calls/accept"], - }, - { - "master/frameworks/events": masterMetrics["master/frameworks/marathon/abc-123/events/error"], - }, - { - "master/frameworks/operations": masterMetrics["master/frameworks/marathon/abc-123/operations/create"], - }, - // allocator - { - "allocator/roles/shares/dominant": masterMetrics["allocator/mesos/roles/*/shares/dominant"], - "allocator/offer_filters/roles/active": masterMetrics["allocator/mesos/offer_filters/roles/*/active"], - }, - { - "allocator/roles/shares/dominant": masterMetrics["allocator/mesos/roles/marathon/shares/dominant"], - }, - { - "allocator/quota/roles/resources/offered_or_allocated": masterMetrics["allocator/mesos/quota/roles/*/resources/disk/offered_or_allocated"], - "allocator/quota/roles/resources/guarantee": masterMetrics["allocator/mesos/quota/roles/*/resources/disk/guarantee"], - }, - { - "allocator/quota/roles/resources/guarantee": masterMetrics["allocator/mesos/quota/roles/*/resources/mem/guarantee"], - }, - } - - frameworkTags := []map[string]string{ - // framework offers - { - "server": m.masterURLs[0].Hostname(), - "url": masterTestServer.URL, - "role": "master", - "state": "leader", - "framework_name": "marathon", - }, - { - "server": m.masterURLs[0].Hostname(), - "url": masterTestServer.URL, - "role": "master", - "state": "leader", - "framework_name": "marathon", - "task_state": "task_killing", - }, - { - "server": m.masterURLs[0].Hostname(), - "url": masterTestServer.URL, - "role": "master", - "state": "leader", - "framework_name": "marathon", - "task_state": "task_dropped", - }, - { - "server": m.masterURLs[0].Hostname(), - "url": masterTestServer.URL, - "role": "master", - "state": "leader", - "framework_name": "marathon", - "role_name": "*", - }, - { - "server": m.masterURLs[0].Hostname(), - "url": masterTestServer.URL, - "role": "master", - "state": "leader", - "framework_name": "marathon", - "call_type": "accept", - }, - { - "server": m.masterURLs[0].Hostname(), - "url": masterTestServer.URL, - "role": "master", - "state": "leader", - "framework_name": "marathon", - "event_type": "error", - }, - { - "server": m.masterURLs[0].Hostname(), - "url": masterTestServer.URL, - "role": "master", - "state": "leader", - "framework_name": "marathon", - "operation_type": "create", - }, - // allocator - { - "server": m.masterURLs[0].Hostname(), - "url": masterTestServer.URL, - "role": "master", - "state": "leader", - "role_name": "*", - }, - { - "server": m.masterURLs[0].Hostname(), - "url": masterTestServer.URL, - "role": "master", - "state": "leader", - "role_name": "marathon", - }, - { - "server": m.masterURLs[0].Hostname(), - "url": masterTestServer.URL, - "role": "master", - "state": "leader", - "role_name": "*", - "resource": "disk", - }, - { - "server": m.masterURLs[0].Hostname(), - "url": masterTestServer.URL, - "role": "master", - "state": "leader", - "role_name": "*", - "resource": "mem", - }, - } - - for i := 0; i < len(frameworkFields); i++ { - acc.AssertContainsTaggedFields(t, "mesos", frameworkFields[i], frameworkTags[i]) - // Test that none of the other metrics share the same tags, which - // tests against potential hash collisions in TaggedFields. - for j := 0; j < len(frameworkFields); j++ { - if j == i { - continue - } - acc.AssertDoesNotContainsTaggedFields(t, "mesos", frameworkFields[j], frameworkTags[i]) - } - } + acc.AssertContainsFields(t, "mesos", masterMetrics) } func TestMasterFilter(t *testing.T) { @@ -572,18 +407,6 @@ func TestMesosSlave(t *testing.T) { } acc.AssertContainsFields(t, "mesos", slaveMetrics) - - // expectedFields := make(map[string]interface{}, len(slaveTaskMetrics["statistics"].(map[string]interface{}))+1) - // for k, v := range slaveTaskMetrics["statistics"].(map[string]interface{}) { - // expectedFields[k] = v - // } - // expectedFields["executor_id"] = slaveTaskMetrics["executor_id"] - - // acc.AssertContainsTaggedFields( - // t, - // "mesos_tasks", - // expectedFields, - // map[string]string{"server": "127.0.0.1", "framework_id": slaveTaskMetrics["framework_id"].(string)}) } func TestSlaveFilter(t *testing.T) { @@ -629,59 +452,3 @@ func TestURLTagDoesNotModify(t *testing.T) { require.Equal(t, u.String(), "http://a:b@localhost:5051?timeout=1ms") require.Equal(t, v, "http://localhost:5051") } - -func TestTaggedFieldHash(t *testing.T) { - assert := assert.New(t) - tf := TaggedField{ - FrameworkName: "marathon", - CallType: "accept", - EventType: "error", - OperationType: "create", - TaskState: "active", - RoleName: "marathon", - FieldName: "field/name", - Resource: "mem", - Value: 1.0, - } - assert.Equal("tf_fn:marathon_ct:accept_et:error_ot:create_ts:active_rn:marathon_r:mem", tf.hash()) - - // Test against hash collisions - tf1 := TaggedField{ - FrameworkName: "marathon", - FieldName: "field/name/1", - Value: 1.0, - } - tf2 := TaggedField{ - RoleName: "marathon", - FieldName: "field/name/2", - Value: 1.0, - } - assert.NotEqual(tf1.hash(), tf2.hash()) -} - -func TestTaggedFieldTags(t *testing.T) { - assert := assert.New(t) - tf := TaggedField{ - FrameworkName: "marathon", - CallType: "accept", - EventType: "error", - OperationType: "create", - TaskState: "active", - RoleName: "marathon", - FieldName: "field/name", - Resource: "mem", - Value: 1.0, - } - - expectedTags := fieldTags{ - "framework_name": "marathon", - "call_type": "accept", - "event_type": "error", - "operation_type": "create", - "task_state": "active", - "role_name": "marathon", - "resource": "mem", - } - - assert.Equal(expectedTags, tf.tags()) -} From 1ee17893e828277f761eef771d0055bf59d8b833 Mon Sep 17 00:00:00 2001 From: Branden Rolston Date: Wed, 7 Aug 2019 19:26:11 -0700 Subject: [PATCH 4/4] Filter allocator and framework_offers in filterMetrics() --- plugins/inputs/mesos/mesos.go | 53 ++- plugins/inputs/mesos/mesos_test.go | 534 +++++++++++++++-------------- 2 files changed, 302 insertions(+), 285 deletions(-) diff --git a/plugins/inputs/mesos/mesos.go b/plugins/inputs/mesos/mesos.go index 44d27dc4921d1..3e0e256915646 100644 --- a/plugins/inputs/mesos/mesos.go +++ b/plugins/inputs/mesos/mesos.go @@ -321,9 +321,9 @@ func getMetrics(role Role, group string) []string { "master/outstanding_offers", } - // These groups are empty because filtering is done in gatherMainMetrics - // based on presence of "framework_offers"/"allocator" in MasterCols. - // These lines are included to prevent the "unknown" info log below. + // framework_offers and allocator metrics have unpredictable names, so they can't be listed here. + // These empty groups are included to prevent the "unknown metrics group" info log below. + // filterMetrics() filters these metrics by looking for names with the corresponding prefix. m["framework_offers"] = []string{} m["allocator"] = []string{} @@ -505,9 +505,27 @@ func (m *Mesos) filterMetrics(role Role, metrics *map[string]interface{}) { } for _, k := range metricsDiff(role, selectedMetrics) { - for _, v := range getMetrics(role, k) { - if _, ok = (*metrics)[v]; ok { - delete((*metrics), v) + switch k { + // allocator and framework_offers metrics have unpredictable names, so we have to identify them by name prefix. + case "allocator": + for m := range *metrics { + if strings.HasPrefix(m, "allocator/") { + delete((*metrics), m) + } + } + case "framework_offers": + for m := range *metrics { + if strings.HasPrefix(m, "master/frameworks/") || strings.HasPrefix(m, "frameworks/") { + delete((*metrics), m) + } + } + + // All other metrics have predictable names. We can use getMetrics() to retrieve them. + default: + for _, v := range getMetrics(role, k) { + if _, ok = (*metrics)[v]; ok { + delete((*metrics), v) + } } } } @@ -621,29 +639,6 @@ func (m *Mesos) gatherMainMetrics(u *url.URL, role Role, acc telegraf.Accumulato } } - var includeFrameworkOffers, includeAllocator bool - for _, col := range m.MasterCols { - if col == "framework_offers" { - includeFrameworkOffers = true - } else if col == "allocator" { - includeAllocator = true - } - } - - for metricName := range jf.Fields { - if !strings.HasPrefix(metricName, "master/frameworks/") && !strings.HasPrefix(metricName, "frameworks/") && !strings.HasPrefix(metricName, "allocator/") { - continue - } - - // filter out framework offers/allocator metrics if necessary - if !includeFrameworkOffers && - (strings.HasPrefix(metricName, "master/frameworks/") || strings.HasPrefix(metricName, "frameworks/")) || - (!includeAllocator && strings.HasPrefix(metricName, "allocator/")) { - delete(jf.Fields, metricName) - continue - } - } - acc.AddFields("mesos", jf.Fields, tags) return nil diff --git a/plugins/inputs/mesos/mesos_test.go b/plugins/inputs/mesos/mesos_test.go index 1e00dd307856d..066d5b971f6a4 100644 --- a/plugins/inputs/mesos/mesos_test.go +++ b/plugins/inputs/mesos/mesos_test.go @@ -8,6 +8,7 @@ import ( "net/http/httptest" "net/url" "os" + "strings" "testing" "github.com/influxdata/telegraf/testutil" @@ -27,262 +28,262 @@ func randUUID() string { return fmt.Sprintf("%x-%x-%x-%x-%x", b[0:4], b[4:6], b[6:8], b[8:10], b[10:]) } -func generateMetrics() { - masterMetrics = make(map[string]interface{}) +// master metrics that will be returned by generateMetrics() +var masterMetricNames []string = []string{ + // resources + "master/cpus_percent", + "master/cpus_used", + "master/cpus_total", + "master/cpus_revocable_percent", + "master/cpus_revocable_total", + "master/cpus_revocable_used", + "master/disk_percent", + "master/disk_used", + "master/disk_total", + "master/disk_revocable_percent", + "master/disk_revocable_total", + "master/disk_revocable_used", + "master/gpus_percent", + "master/gpus_used", + "master/gpus_total", + "master/gpus_revocable_percent", + "master/gpus_revocable_total", + "master/gpus_revocable_used", + "master/mem_percent", + "master/mem_used", + "master/mem_total", + "master/mem_revocable_percent", + "master/mem_revocable_total", + "master/mem_revocable_used", + // master + "master/elected", + "master/uptime_secs", + // system + "system/cpus_total", + "system/load_15min", + "system/load_5min", + "system/load_1min", + "system/mem_free_bytes", + "system/mem_total_bytes", + // agents + "master/slave_registrations", + "master/slave_removals", + "master/slave_reregistrations", + "master/slave_shutdowns_scheduled", + "master/slave_shutdowns_canceled", + "master/slave_shutdowns_completed", + "master/slaves_active", + "master/slaves_connected", + "master/slaves_disconnected", + "master/slaves_inactive", + "master/slave_unreachable_canceled", + "master/slave_unreachable_completed", + "master/slave_unreachable_scheduled", + "master/slaves_unreachable", + // frameworks + "master/frameworks_active", + "master/frameworks_connected", + "master/frameworks_disconnected", + "master/frameworks_inactive", + "master/outstanding_offers", + // framework offers + "master/frameworks/marathon/abc-123/calls", + "master/frameworks/marathon/abc-123/calls/accept", + "master/frameworks/marathon/abc-123/events", + "master/frameworks/marathon/abc-123/events/error", + "master/frameworks/marathon/abc-123/offers/sent", + "master/frameworks/marathon/abc-123/operations", + "master/frameworks/marathon/abc-123/operations/create", + "master/frameworks/marathon/abc-123/roles/*/suppressed", + "master/frameworks/marathon/abc-123/subscribed", + "master/frameworks/marathon/abc-123/tasks/active/task_killing", + "master/frameworks/marathon/abc-123/tasks/active/task_dropped", + "master/frameworks/marathon/abc-123/tasks/terminal/task_dropped", + "master/frameworks/marathon/abc-123/unknown/unknown", // test case for unknown metric type + // tasks + "master/tasks_error", + "master/tasks_failed", + "master/tasks_finished", + "master/tasks_killed", + "master/tasks_lost", + "master/tasks_running", + "master/tasks_staging", + "master/tasks_starting", + "master/tasks_dropped", + "master/tasks_gone", + "master/tasks_gone_by_operator", + "master/tasks_killing", + "master/tasks_unreachable", + // messages + "master/invalid_executor_to_framework_messages", + "master/invalid_framework_to_executor_messages", + "master/invalid_status_update_acknowledgements", + "master/invalid_status_updates", + "master/dropped_messages", + "master/messages_authenticate", + "master/messages_deactivate_framework", + "master/messages_decline_offers", + "master/messages_executor_to_framework", + "master/messages_exited_executor", + "master/messages_framework_to_executor", + "master/messages_kill_task", + "master/messages_launch_tasks", + "master/messages_reconcile_tasks", + "master/messages_register_framework", + "master/messages_register_slave", + "master/messages_reregister_framework", + "master/messages_reregister_slave", + "master/messages_resource_request", + "master/messages_revive_offers", + "master/messages_status_update", + "master/messages_status_update_acknowledgement", + "master/messages_unregister_framework", + "master/messages_unregister_slave", + "master/messages_update_slave", + "master/recovery_slave_removals", + "master/slave_removals/reason_registered", + "master/slave_removals/reason_unhealthy", + "master/slave_removals/reason_unregistered", + "master/valid_framework_to_executor_messages", + "master/valid_status_update_acknowledgements", + "master/valid_status_updates", + "master/task_lost/source_master/reason_invalid_offers", + "master/task_lost/source_master/reason_slave_removed", + "master/task_lost/source_slave/reason_executor_terminated", + "master/valid_executor_to_framework_messages", + "master/invalid_operation_status_update_acknowledgements", + "master/messages_operation_status_update_acknowledgement", + "master/messages_reconcile_operations", + "master/messages_suppress_offers", + "master/valid_operation_status_update_acknowledgements", + // evgqueue + "master/event_queue_dispatches", + "master/event_queue_http_requests", + "master/event_queue_messages", + "master/operator_event_stream_subscribers", + // registrar + "registrar/log/ensemble_size", + "registrar/log/recovered", + "registrar/queued_operations", + "registrar/registry_size_bytes", + "registrar/state_fetch_ms", + "registrar/state_store_ms", + "registrar/state_store_ms/max", + "registrar/state_store_ms/min", + "registrar/state_store_ms/p50", + "registrar/state_store_ms/p90", + "registrar/state_store_ms/p95", + "registrar/state_store_ms/p99", + "registrar/state_store_ms/p999", + "registrar/state_store_ms/p9999", + "registrar/state_store_ms/count", + // allocator + "allocator/mesos/allocation_run_ms", + "allocator/mesos/allocation_run_ms/count", + "allocator/mesos/allocation_run_ms/max", + "allocator/mesos/allocation_run_ms/min", + "allocator/mesos/allocation_run_ms/p50", + "allocator/mesos/allocation_run_ms/p90", + "allocator/mesos/allocation_run_ms/p95", + "allocator/mesos/allocation_run_ms/p99", + "allocator/mesos/allocation_run_ms/p999", + "allocator/mesos/allocation_run_ms/p9999", + "allocator/mesos/allocation_runs", + "allocator/mesos/allocation_run_latency_ms", + "allocator/mesos/allocation_run_latency_ms/count", + "allocator/mesos/allocation_run_latency_ms/max", + "allocator/mesos/allocation_run_latency_ms/min", + "allocator/mesos/allocation_run_latency_ms/p50", + "allocator/mesos/allocation_run_latency_ms/p90", + "allocator/mesos/allocation_run_latency_ms/p95", + "allocator/mesos/allocation_run_latency_ms/p99", + "allocator/mesos/allocation_run_latency_ms/p999", + "allocator/mesos/allocation_run_latency_ms/p9999", + "allocator/mesos/roles/*/shares/dominant", + "allocator/mesos/event_queue_dispatches", + "allocator/mesos/offer_filters/roles/*/active", + "allocator/mesos/quota/roles/*/resources/disk/offered_or_allocated", + "allocator/mesos/quota/roles/*/resources/mem/guarantee", + "allocator/mesos/quota/roles/*/resources/disk/guarantee", + "allocator/mesos/resources/cpus/offered_or_allocated", + "allocator/mesos/resources/cpus/total", + "allocator/mesos/resources/disk/offered_or_allocated", + "allocator/mesos/resources/disk/total", + "allocator/mesos/resources/mem/offered_or_allocated", + "allocator/mesos/resources/mem/total", +} - metricNames := []string{ - // resources - "master/cpus_percent", - "master/cpus_used", - "master/cpus_total", - "master/cpus_revocable_percent", - "master/cpus_revocable_total", - "master/cpus_revocable_used", - "master/disk_percent", - "master/disk_used", - "master/disk_total", - "master/disk_revocable_percent", - "master/disk_revocable_total", - "master/disk_revocable_used", - "master/gpus_percent", - "master/gpus_used", - "master/gpus_total", - "master/gpus_revocable_percent", - "master/gpus_revocable_total", - "master/gpus_revocable_used", - "master/mem_percent", - "master/mem_used", - "master/mem_total", - "master/mem_revocable_percent", - "master/mem_revocable_total", - "master/mem_revocable_used", - // master - "master/elected", - "master/uptime_secs", - // system - "system/cpus_total", - "system/load_15min", - "system/load_5min", - "system/load_1min", - "system/mem_free_bytes", - "system/mem_total_bytes", - // agents - "master/slave_registrations", - "master/slave_removals", - "master/slave_reregistrations", - "master/slave_shutdowns_scheduled", - "master/slave_shutdowns_canceled", - "master/slave_shutdowns_completed", - "master/slaves_active", - "master/slaves_connected", - "master/slaves_disconnected", - "master/slaves_inactive", - "master/slave_unreachable_canceled", - "master/slave_unreachable_completed", - "master/slave_unreachable_scheduled", - "master/slaves_unreachable", - // frameworks - "master/frameworks_active", - "master/frameworks_connected", - "master/frameworks_disconnected", - "master/frameworks_inactive", - "master/outstanding_offers", - // framework offers - "master/frameworks/marathon/abc-123/calls", - "master/frameworks/marathon/abc-123/calls/accept", - "master/frameworks/marathon/abc-123/events", - "master/frameworks/marathon/abc-123/events/error", - "master/frameworks/marathon/abc-123/offers/sent", - "master/frameworks/marathon/abc-123/operations", - "master/frameworks/marathon/abc-123/operations/create", - "master/frameworks/marathon/abc-123/roles/*/suppressed", - "master/frameworks/marathon/abc-123/subscribed", - "master/frameworks/marathon/abc-123/tasks/active/task_killing", - "master/frameworks/marathon/abc-123/tasks/active/task_dropped", - "master/frameworks/marathon/abc-123/tasks/terminal/task_dropped", - "master/frameworks/marathon/abc-123/unknown/unknown", // test case for unknown metric type - // tasks - "master/tasks_error", - "master/tasks_failed", - "master/tasks_finished", - "master/tasks_killed", - "master/tasks_lost", - "master/tasks_running", - "master/tasks_staging", - "master/tasks_starting", - "master/tasks_dropped", - "master/tasks_gone", - "master/tasks_gone_by_operator", - "master/tasks_killing", - "master/tasks_unreachable", - // messages - "master/invalid_executor_to_framework_messages", - "master/invalid_framework_to_executor_messages", - "master/invalid_status_update_acknowledgements", - "master/invalid_status_updates", - "master/dropped_messages", - "master/messages_authenticate", - "master/messages_deactivate_framework", - "master/messages_decline_offers", - "master/messages_executor_to_framework", - "master/messages_exited_executor", - "master/messages_framework_to_executor", - "master/messages_kill_task", - "master/messages_launch_tasks", - "master/messages_reconcile_tasks", - "master/messages_register_framework", - "master/messages_register_slave", - "master/messages_reregister_framework", - "master/messages_reregister_slave", - "master/messages_resource_request", - "master/messages_revive_offers", - "master/messages_status_update", - "master/messages_status_update_acknowledgement", - "master/messages_unregister_framework", - "master/messages_unregister_slave", - "master/messages_update_slave", - "master/recovery_slave_removals", - "master/slave_removals/reason_registered", - "master/slave_removals/reason_unhealthy", - "master/slave_removals/reason_unregistered", - "master/valid_framework_to_executor_messages", - "master/valid_status_update_acknowledgements", - "master/valid_status_updates", - "master/task_lost/source_master/reason_invalid_offers", - "master/task_lost/source_master/reason_slave_removed", - "master/task_lost/source_slave/reason_executor_terminated", - "master/valid_executor_to_framework_messages", - "master/invalid_operation_status_update_acknowledgements", - "master/messages_operation_status_update_acknowledgement", - "master/messages_reconcile_operations", - "master/messages_suppress_offers", - "master/valid_operation_status_update_acknowledgements", - // evgqueue - "master/event_queue_dispatches", - "master/event_queue_http_requests", - "master/event_queue_messages", - "master/operator_event_stream_subscribers", - // registrar - "registrar/log/ensemble_size", - "registrar/log/recovered", - "registrar/queued_operations", - "registrar/registry_size_bytes", - "registrar/state_fetch_ms", - "registrar/state_store_ms", - "registrar/state_store_ms/max", - "registrar/state_store_ms/min", - "registrar/state_store_ms/p50", - "registrar/state_store_ms/p90", - "registrar/state_store_ms/p95", - "registrar/state_store_ms/p99", - "registrar/state_store_ms/p999", - "registrar/state_store_ms/p9999", - "registrar/state_store_ms/count", - // allocator - "allocator/mesos/allocation_run_ms", - "allocator/mesos/allocation_run_ms/count", - "allocator/mesos/allocation_run_ms/max", - "allocator/mesos/allocation_run_ms/min", - "allocator/mesos/allocation_run_ms/p50", - "allocator/mesos/allocation_run_ms/p90", - "allocator/mesos/allocation_run_ms/p95", - "allocator/mesos/allocation_run_ms/p99", - "allocator/mesos/allocation_run_ms/p999", - "allocator/mesos/allocation_run_ms/p9999", - "allocator/mesos/allocation_runs", - "allocator/mesos/allocation_run_latency_ms", - "allocator/mesos/allocation_run_latency_ms/count", - "allocator/mesos/allocation_run_latency_ms/max", - "allocator/mesos/allocation_run_latency_ms/min", - "allocator/mesos/allocation_run_latency_ms/p50", - "allocator/mesos/allocation_run_latency_ms/p90", - "allocator/mesos/allocation_run_latency_ms/p95", - "allocator/mesos/allocation_run_latency_ms/p99", - "allocator/mesos/allocation_run_latency_ms/p999", - "allocator/mesos/allocation_run_latency_ms/p9999", - "allocator/mesos/roles/*/shares/dominant", - "allocator/mesos/event_queue_dispatches", - "allocator/mesos/offer_filters/roles/*/active", - "allocator/mesos/quota/roles/*/resources/disk/offered_or_allocated", - "allocator/mesos/quota/roles/*/resources/mem/guarantee", - "allocator/mesos/quota/roles/*/resources/disk/guarantee", - "allocator/mesos/resources/cpus/offered_or_allocated", - "allocator/mesos/resources/cpus/total", - "allocator/mesos/resources/disk/offered_or_allocated", - "allocator/mesos/resources/disk/total", - "allocator/mesos/resources/mem/offered_or_allocated", - "allocator/mesos/resources/mem/total", - } +// slave metrics that will be returned by generateMetrics() +var slaveMetricNames []string = []string{ + // resources + "slave/cpus_percent", + "slave/cpus_used", + "slave/cpus_total", + "slave/cpus_revocable_percent", + "slave/cpus_revocable_total", + "slave/cpus_revocable_used", + "slave/disk_percent", + "slave/disk_used", + "slave/disk_total", + "slave/disk_revocable_percent", + "slave/disk_revocable_total", + "slave/disk_revocable_used", + "slave/gpus_percent", + "slave/gpus_used", + "slave/gpus_total", + "slave/gpus_revocable_percent", + "slave/gpus_revocable_total", + "slave/gpus_revocable_used", + "slave/mem_percent", + "slave/mem_used", + "slave/mem_total", + "slave/mem_revocable_percent", + "slave/mem_revocable_total", + "slave/mem_revocable_used", + // agent + "slave/registered", + "slave/uptime_secs", + // system + "system/cpus_total", + "system/load_15min", + "system/load_5min", + "system/load_1min", + "system/mem_free_bytes", + "system/mem_total_bytes", + // executors + "containerizer/mesos/container_destroy_errors", + "slave/container_launch_errors", + "slave/executors_preempted", + "slave/frameworks_active", + "slave/executor_directory_max_allowed_age_secs", + "slave/executors_registering", + "slave/executors_running", + "slave/executors_terminated", + "slave/executors_terminating", + "slave/recovery_errors", + // tasks + "slave/tasks_failed", + "slave/tasks_finished", + "slave/tasks_killed", + "slave/tasks_lost", + "slave/tasks_running", + "slave/tasks_staging", + "slave/tasks_starting", + // messages + "slave/invalid_framework_messages", + "slave/invalid_status_updates", + "slave/valid_framework_messages", + "slave/valid_status_updates", +} - for _, k := range metricNames { +func generateMetrics() { + masterMetrics = make(map[string]interface{}) + for _, k := range masterMetricNames { masterMetrics[k] = rand.Float64() } slaveMetrics = make(map[string]interface{}) - - metricNames = []string{ - // resources - "slave/cpus_percent", - "slave/cpus_used", - "slave/cpus_total", - "slave/cpus_revocable_percent", - "slave/cpus_revocable_total", - "slave/cpus_revocable_used", - "slave/disk_percent", - "slave/disk_used", - "slave/disk_total", - "slave/disk_revocable_percent", - "slave/disk_revocable_total", - "slave/disk_revocable_used", - "slave/gpus_percent", - "slave/gpus_used", - "slave/gpus_total", - "slave/gpus_revocable_percent", - "slave/gpus_revocable_total", - "slave/gpus_revocable_used", - "slave/mem_percent", - "slave/mem_used", - "slave/mem_total", - "slave/mem_revocable_percent", - "slave/mem_revocable_total", - "slave/mem_revocable_used", - // agent - "slave/registered", - "slave/uptime_secs", - // system - "system/cpus_total", - "system/load_15min", - "system/load_5min", - "system/load_1min", - "system/mem_free_bytes", - "system/mem_total_bytes", - // executors - "containerizer/mesos/container_destroy_errors", - "slave/container_launch_errors", - "slave/executors_preempted", - "slave/frameworks_active", - "slave/executor_directory_max_allowed_age_secs", - "slave/executors_registering", - "slave/executors_running", - "slave/executors_terminated", - "slave/executors_terminating", - "slave/recovery_errors", - // tasks - "slave/tasks_failed", - "slave/tasks_finished", - "slave/tasks_killed", - "slave/tasks_lost", - "slave/tasks_running", - "slave/tasks_staging", - "slave/tasks_starting", - // messages - "slave/invalid_framework_messages", - "slave/invalid_status_updates", - "slave/valid_framework_messages", - "slave/valid_status_updates", - } - - for _, k := range metricNames { + for _, k := range slaveMetricNames { slaveMetrics[k] = rand.Float64() } @@ -364,7 +365,7 @@ func TestMesosMaster(t *testing.T) { func TestMasterFilter(t *testing.T) { m := Mesos{ MasterCols: []string{ - "resources", "master", "registrar", + "resources", "master", "registrar", "allocator", }, } b := []string{ @@ -374,6 +375,26 @@ func TestMasterFilter(t *testing.T) { m.filterMetrics(MASTER, &masterMetrics) + // Assert expected metrics are present. + for _, v := range m.MasterCols { + for _, x := range getMetrics(MASTER, v) { + if _, ok := masterMetrics[x]; !ok { + t.Errorf("Didn't find key %s, it should present.", x) + } + } + } + // m.MasterCols includes "allocator", so allocator metrics should be present. + // allocator metrics have unpredictable names, so we can't rely on the list of metrics returned from + // getMetrics(). We have to find them by checking name prefixes. + for _, x := range masterMetricNames { + if strings.HasPrefix(x, "allocator/") { + if _, ok := masterMetrics[x]; !ok { + t.Errorf("Didn't find key %s, it should be present.", x) + } + } + } + + // Assert unexpected metrics are not present. for _, v := range b { for _, x := range getMetrics(MASTER, v) { if _, ok := masterMetrics[x]; ok { @@ -381,11 +402,12 @@ func TestMasterFilter(t *testing.T) { } } } - for _, v := range m.MasterCols { - for _, x := range getMetrics(MASTER, v) { - if _, ok := masterMetrics[x]; !ok { - t.Errorf("Didn't find key %s, it should present.", x) - } + // m.MasterCols does not include "framework_offers", so framework_offers metrics should not be present. + // framework_offers metrics have unpredictable names, so we can't rely on the list of metrics returned from + // getMetrics(). We have to find them by checking name prefixes. + for k := range masterMetrics { + if strings.HasPrefix(k, "master/frameworks/") || strings.HasPrefix(k, "frameworks/") { + t.Errorf("Found key %s, it should be gone.", k) } } }