From 3408955c564dbed42b5bc511e57878748df7118d Mon Sep 17 00:00:00 2001 From: prydin Date: Fri, 9 Nov 2018 19:44:51 -0500 Subject: [PATCH 01/34] Initial implementation of Finder --- plugins/inputs/vsphere/finder.go | 126 +++++++++++++++++++++++++ plugins/inputs/vsphere/vsphere_test.go | 53 +++++++++++ 2 files changed, 179 insertions(+) create mode 100644 plugins/inputs/vsphere/finder.go diff --git a/plugins/inputs/vsphere/finder.go b/plugins/inputs/vsphere/finder.go new file mode 100644 index 0000000000000..e639926e2bc4a --- /dev/null +++ b/plugins/inputs/vsphere/finder.go @@ -0,0 +1,126 @@ +package vsphere + +import ( + "context" + "strings" + + "github.com/vmware/govmomi/property" + "github.com/vmware/govmomi/view" + "github.com/vmware/govmomi/vim25/types" +) + +var childTypes map[string][]string + +type Finder struct { + client *Client +} + +type nameAndRef struct { + name string + ref types.ManagedObjectReference +} + +func (f *Finder) Find(ctx context.Context, resType, path string) ([]types.ManagedObjectReference, error) { + p := strings.Split(path, "/") + flt := make([]property.Filter, len(p)-1) + for i := 1; i < len(p); i++ { + flt[i-1] = property.Filter{"name": p[i]} + } + moids, err := f.descend(ctx, f.client.Client.ServiceContent.RootFolder, resType, flt, 0) + if err != nil { + return nil, err + } + return moids, nil +} + +func (f *Finder) descend(ctx context.Context, root types.ManagedObjectReference, resType string, + parts []property.Filter, pos int) ([]types.ManagedObjectReference, error) { + + // We found what we're looking for. Consider it a leaf and stop descending + if root.Reference().Type == resType { + return []types.ManagedObjectReference{root}, nil + } + + // No more tokens to match? + if pos >= len(parts) { + return []types.ManagedObjectReference{}, nil + } + + // Get children + ct, ok := childTypes[root.Reference().Type] + if !ok { + // We don't know how to handle children of this type. Stop descending. + return []types.ManagedObjectReference{}, nil + } + m := view.NewManager(f.client.Client.Client) + defer m.Destroy(ctx) + v, err := m.CreateContainerView(ctx, root, ct, false) + if err != nil { + return nil, err + } + defer v.Destroy(ctx) + var content []types.ObjectContent + + err = v.Retrieve(ctx, ct, []string{"name"}, &content) + if err != nil { + return nil, err + } + moids := make([]types.ManagedObjectReference, 0, 100) + for _, c := range content { + if !parts[pos].MatchPropertyList(c.PropSet) { + continue + } + + // Deal with recursive wildcards (**) + inc := 1 // Normally we advance one token. + if parts[pos]["name"] == "**" { + if pos >= len(parts) { + inc = 0 // Can't advance past last token, so keep descending the tree + } else { + // Lookahead to next token. If it matches this child, we are out of + // the recursive wildcard handling and we can advance TWO tokens ahead, since + // the token that ended the recursive wildcard mode is now consumed. + if parts[pos+1].MatchPropertyList(c.PropSet) { + if pos < len(parts)-3 { + inc = 2 + } else { + // We didn't break out of recursicve wildcard mode yet, so stay on this token. + inc = 0 + } + } + } + } + r, err := f.descend(ctx, c.Obj, resType, parts, pos+inc) + if err != nil { + return nil, err + } + moids = append(moids, r...) 
+ } + return moids, nil +} + +func nameFromObjectContent(o types.ObjectContent) string { + for _, p := range o.PropSet { + if p.Name == "name" { + return p.Val.(string) + } + } + return "" +} + +func init() { + childTypes = map[string][]string{ + "HostSystem": []string{"VirtualMachine"}, + "ComputeResource": []string{"HostSystem", "ResourcePool"}, + "ClusterComputeResource": []string{"HostSystem", "ResourcePool"}, + "Datacenter": []string{"Folder"}, + "Folder": []string{ + "Folder", + "Datacenter", + "VirtualMachine", + "ComputeResource", + "ClusterComputeResource", + "Datastore", + }, + } +} diff --git a/plugins/inputs/vsphere/vsphere_test.go b/plugins/inputs/vsphere/vsphere_test.go index 4eb3d28f810e6..339265179d222 100644 --- a/plugins/inputs/vsphere/vsphere_test.go +++ b/plugins/inputs/vsphere/vsphere_test.go @@ -289,6 +289,59 @@ func TestMaxQuery(t *testing.T) { c2.close() } +func TestFinder(t *testing.T) { + m, s, err := createSim() + if err != nil { + t.Fatal(err) + } + defer m.Remove() + defer s.Close() + + v := defaultVSphere() + ctx := context.Background() + + c, err := NewClient(ctx, s.URL, v) + + f := Finder{c} + + objs, err := f.Find(ctx, "Datacenter", "/DC0") + require.NoError(t, err) + require.Equal(t, 1, len(objs)) + require.Equal(t, "Datacenter:datacenter-2", objs[0].Reference().String()) + + objs, err = f.Find(ctx, "HostSystem", "/DC0/host/DC0_H0/DC0_H0") + require.NoError(t, err) + require.Equal(t, 1, len(objs)) + require.Equal(t, "HostSystem:host-19", objs[0].Reference().String()) + + objs, err = f.Find(ctx, "HostSystem", "/DC0/host/DC0_C0/DC0_C0_H0") + require.NoError(t, err) + require.Equal(t, 1, len(objs)) + require.Equal(t, "HostSystem:host-30", objs[0].Reference().String()) + + objs, err = f.Find(ctx, "HostSystem", "/DC0/host/DC0_C0/*") + require.NoError(t, err) + require.Equal(t, 3, len(objs)) + + objs, err = f.Find(ctx, "VirtualMachine", "/DC0/vm/DC0_H0_VM0") + require.NoError(t, err) + require.Equal(t, 1, len(objs)) + require.Equal(t, "VirtualMachine:vm-51", objs[0].Reference().String()) + + objs, err = f.Find(ctx, "VirtualMachine", "/DC0/*/DC0_H0_VM0") + require.NoError(t, err) + require.Equal(t, 1, len(objs)) + require.Equal(t, "VirtualMachine:vm-51", objs[0].Reference().String()) + + objs, err = f.Find(ctx, "VirtualMachine", "/DC0/*/DC0_H0_*") + require.NoError(t, err) + require.Equal(t, 2, len(objs)) + + objs, err = f.Find(ctx, "VirtualMachine", "/DC0/**/DC0_H0_VM") + require.NoError(t, err) + require.Equal(t, 2, len(objs)) +} + func TestAll(t *testing.T) { m, s, err := createSim() if err != nil { From 3468e5e772475feaa16af6b842b899dda286acad Mon Sep 17 00:00:00 2001 From: prydin Date: Sat, 10 Nov 2018 08:52:14 -0500 Subject: [PATCH 02/34] Refactored to load all properties in one pass --- plugins/inputs/vsphere/finder.go | 85 +++++++++++++++++++------- plugins/inputs/vsphere/vsphere_test.go | 56 ++++++++++------- 2 files changed, 98 insertions(+), 43 deletions(-) diff --git a/plugins/inputs/vsphere/finder.go b/plugins/inputs/vsphere/finder.go index e639926e2bc4a..dda7b8ff5015a 100644 --- a/plugins/inputs/vsphere/finder.go +++ b/plugins/inputs/vsphere/finder.go @@ -2,10 +2,12 @@ package vsphere import ( "context" + "reflect" "strings" "github.com/vmware/govmomi/property" "github.com/vmware/govmomi/view" + "github.com/vmware/govmomi/vim25/mo" "github.com/vmware/govmomi/vim25/types" ) @@ -20,83 +22,91 @@ type nameAndRef struct { ref types.ManagedObjectReference } -func (f *Finder) Find(ctx context.Context, resType, path string) ([]types.ManagedObjectReference, 
error) { +func (f *Finder) Find(ctx context.Context, resType, path string, dst interface{}) error { p := strings.Split(path, "/") flt := make([]property.Filter, len(p)-1) for i := 1; i < len(p); i++ { flt[i-1] = property.Filter{"name": p[i]} } - moids, err := f.descend(ctx, f.client.Client.ServiceContent.RootFolder, resType, flt, 0) + objs := make(map[string]types.ObjectContent) + err := f.descend(ctx, f.client.Client.ServiceContent.RootFolder, resType, flt, 0, objs) if err != nil { - return nil, err + return err } - return moids, nil + objectContentToTypedArray(objs, dst) + return nil } func (f *Finder) descend(ctx context.Context, root types.ManagedObjectReference, resType string, - parts []property.Filter, pos int) ([]types.ManagedObjectReference, error) { - - // We found what we're looking for. Consider it a leaf and stop descending - if root.Reference().Type == resType { - return []types.ManagedObjectReference{root}, nil - } + parts []property.Filter, pos int, objs map[string]types.ObjectContent) error { // No more tokens to match? if pos >= len(parts) { - return []types.ManagedObjectReference{}, nil + return nil } // Get children ct, ok := childTypes[root.Reference().Type] if !ok { // We don't know how to handle children of this type. Stop descending. - return []types.ManagedObjectReference{}, nil + return nil } m := view.NewManager(f.client.Client.Client) defer m.Destroy(ctx) v, err := m.CreateContainerView(ctx, root, ct, false) if err != nil { - return nil, err + return err } defer v.Destroy(ctx) var content []types.ObjectContent err = v.Retrieve(ctx, ct, []string{"name"}, &content) if err != nil { - return nil, err + return err } - moids := make([]types.ManagedObjectReference, 0, 100) for _, c := range content { if !parts[pos].MatchPropertyList(c.PropSet) { continue } + if _, ok := objs[root.Reference().String()]; ok { + continue + } + + if c.Obj.Type == resType { + // We found what we're looking for. Consider it a leaf and stop descending + objs[c.Obj.String()] = c + continue + } + // Deal with recursive wildcards (**) inc := 1 // Normally we advance one token. if parts[pos]["name"] == "**" { - if pos >= len(parts) { + if pos >= len(parts)-1 { inc = 0 // Can't advance past last token, so keep descending the tree } else { // Lookahead to next token. If it matches this child, we are out of // the recursive wildcard handling and we can advance TWO tokens ahead, since // the token that ended the recursive wildcard mode is now consumed. if parts[pos+1].MatchPropertyList(c.PropSet) { - if pos < len(parts)-3 { + if pos < len(parts)-2 { inc = 2 } else { - // We didn't break out of recursicve wildcard mode yet, so stay on this token. inc = 0 } + } else { + // We didn't break out of recursicve wildcard mode yet, so stay on this token. + inc = 0 + } } } - r, err := f.descend(ctx, c.Obj, resType, parts, pos+inc) + err := f.descend(ctx, c.Obj, resType, parts, pos+inc, objs) if err != nil { - return nil, err + return err } - moids = append(moids, r...) 
} - return moids, nil + return nil } func nameFromObjectContent(o types.ObjectContent) string { @@ -108,6 +118,37 @@ func nameFromObjectContent(o types.ObjectContent) string { return "" } +func objectContentToTypedArray(objs map[string]types.ObjectContent, dst interface{}) error { + rt := reflect.TypeOf(dst) + if rt == nil || rt.Kind() != reflect.Ptr { + panic("need pointer") + } + + rv := reflect.ValueOf(dst).Elem() + if !rv.CanSet() { + panic("cannot set dst") + } + for _, p := range objs { + v, err := mo.ObjectContentToType(p) + if err != nil { + return err + } + + vt := reflect.TypeOf(v) + + if !rv.Type().AssignableTo(vt) { + // For example: dst is []ManagedEntity, res is []HostSystem + if field, ok := vt.FieldByName(rt.Elem().Elem().Name()); ok && field.Anonymous { + rv.Set(reflect.Append(rv, reflect.ValueOf(v).FieldByIndex(field.Index))) + continue + } + } + + rv.Set(reflect.Append(rv, reflect.ValueOf(v))) + } + return nil +} + func init() { childTypes = map[string][]string{ "HostSystem": []string{"VirtualMachine"}, diff --git a/plugins/inputs/vsphere/vsphere_test.go b/plugins/inputs/vsphere/vsphere_test.go index 339265179d222..94a620b32f8cf 100644 --- a/plugins/inputs/vsphere/vsphere_test.go +++ b/plugins/inputs/vsphere/vsphere_test.go @@ -17,6 +17,7 @@ import ( "github.com/stretchr/testify/require" "github.com/vmware/govmomi/object" "github.com/vmware/govmomi/simulator" + "github.com/vmware/govmomi/vim25/mo" "github.com/vmware/govmomi/vim25/types" ) @@ -304,42 +305,55 @@ func TestFinder(t *testing.T) { f := Finder{c} - objs, err := f.Find(ctx, "Datacenter", "/DC0") + dc := []mo.Datacenter{} + err = f.Find(ctx, "Datacenter", "/DC0", &dc) require.NoError(t, err) - require.Equal(t, 1, len(objs)) - require.Equal(t, "Datacenter:datacenter-2", objs[0].Reference().String()) + require.Equal(t, 1, len(dc)) + require.Equal(t, "DC0", dc[0].Name) - objs, err = f.Find(ctx, "HostSystem", "/DC0/host/DC0_H0/DC0_H0") + host := []mo.HostSystem{} + err = f.Find(ctx, "HostSystem", "/DC0/host/DC0_H0/DC0_H0", &host) require.NoError(t, err) - require.Equal(t, 1, len(objs)) - require.Equal(t, "HostSystem:host-19", objs[0].Reference().String()) + require.Equal(t, 1, len(host)) + require.Equal(t, "DC0_H0", host[0].Name) - objs, err = f.Find(ctx, "HostSystem", "/DC0/host/DC0_C0/DC0_C0_H0") + host = []mo.HostSystem{} + err = f.Find(ctx, "HostSystem", "/DC0/host/DC0_C0/DC0_C0_H0", &host) require.NoError(t, err) - require.Equal(t, 1, len(objs)) - require.Equal(t, "HostSystem:host-30", objs[0].Reference().String()) + require.Equal(t, 1, len(host)) + require.Equal(t, "DC0_C0_H0", host[0].Name) - objs, err = f.Find(ctx, "HostSystem", "/DC0/host/DC0_C0/*") + host = []mo.HostSystem{} + err = f.Find(ctx, "HostSystem", "/DC0/host/DC0_C0/*", &host) require.NoError(t, err) - require.Equal(t, 3, len(objs)) + require.Equal(t, 3, len(host)) - objs, err = f.Find(ctx, "VirtualMachine", "/DC0/vm/DC0_H0_VM0") + vm := []mo.VirtualMachine{} + err = f.Find(ctx, "VirtualMachine", "/DC0/vm/DC0_H0_VM0", &vm) require.NoError(t, err) - require.Equal(t, 1, len(objs)) - require.Equal(t, "VirtualMachine:vm-51", objs[0].Reference().String()) + require.Equal(t, 1, len(dc)) + require.Equal(t, "DC0_H0_VM0", vm[0].Name) - objs, err = f.Find(ctx, "VirtualMachine", "/DC0/*/DC0_H0_VM0") + vm = []mo.VirtualMachine{} + err = f.Find(ctx, "VirtualMachine", "/DC0/*/DC0_H0_VM0", &vm) require.NoError(t, err) - require.Equal(t, 1, len(objs)) - require.Equal(t, "VirtualMachine:vm-51", objs[0].Reference().String()) + require.Equal(t, 1, len(dc)) + 
require.Equal(t, "DC0_H0_VM0", vm[0].Name) - objs, err = f.Find(ctx, "VirtualMachine", "/DC0/*/DC0_H0_*") + vm = []mo.VirtualMachine{} + err = f.Find(ctx, "VirtualMachine", "/DC0/*/DC0_H0_*", &vm) require.NoError(t, err) - require.Equal(t, 2, len(objs)) + require.Equal(t, 2, len(vm)) - objs, err = f.Find(ctx, "VirtualMachine", "/DC0/**/DC0_H0_VM") + vm = []mo.VirtualMachine{} + err = f.Find(ctx, "VirtualMachine", "/DC0/**/DC0_H0_VM*", &vm) require.NoError(t, err) - require.Equal(t, 2, len(objs)) + require.Equal(t, 2, len(vm)) + + vm = []mo.VirtualMachine{} + err = f.Find(ctx, "VirtualMachine", "/DC0/**", &vm) + require.NoError(t, err) + require.Equal(t, 4, len(vm)) } func TestAll(t *testing.T) { From 2b343f27ce8ef709c2d8a86b6de82b2faefa6bfd Mon Sep 17 00:00:00 2001 From: prydin Date: Mon, 12 Nov 2018 09:50:30 -0500 Subject: [PATCH 03/34] Fully implemented but not completely tested --- Gopkg.toml | 2 +- plugins/inputs/vsphere/endpoint.go | 54 ++++++++---- plugins/inputs/vsphere/finder.go | 112 +++++++++++++++++++++---- plugins/inputs/vsphere/vsphere.go | 39 ++++++--- plugins/inputs/vsphere/vsphere_test.go | 41 ++++++++- 5 files changed, 202 insertions(+), 46 deletions(-) diff --git a/Gopkg.toml b/Gopkg.toml index 23b8444fe6523..8572918b2ca8f 100644 --- a/Gopkg.toml +++ b/Gopkg.toml @@ -228,7 +228,7 @@ [[constraint]] name = "github.com/vmware/govmomi" - version = "0.18.0" + version = "0.19.0" [[constraint]] name = "github.com/Azure/go-autorest" diff --git a/plugins/inputs/vsphere/endpoint.go b/plugins/inputs/vsphere/endpoint.go index dbc67dd959366..2028ef461b79b 100644 --- a/plugins/inputs/vsphere/endpoint.go +++ b/plugins/inputs/vsphere/endpoint.go @@ -17,7 +17,6 @@ import ( "github.com/influxdata/telegraf" "github.com/vmware/govmomi/object" "github.com/vmware/govmomi/performance" - "github.com/vmware/govmomi/view" "github.com/vmware/govmomi/vim25/mo" "github.com/vmware/govmomi/vim25/types" ) @@ -45,6 +44,7 @@ type Endpoint struct { type resourceKind struct { name string + vcName string pKey string parentTag string enabled bool @@ -52,8 +52,9 @@ type resourceKind struct { sampling int32 objects objectMap filters filter.Filter + paths []string collectInstances bool - getObjects func(context.Context, *Endpoint, *view.ContainerView) (objectMap, error) + getObjects func(context.Context, *Endpoint, *ResourceFilter) (objectMap, error) } type metricEntry struct { @@ -109,6 +110,7 @@ func NewEndpoint(ctx context.Context, parent *VSphere, url *url.URL) (*Endpoint, e.resourceKinds = map[string]resourceKind{ "datacenter": { name: "datacenter", + vcName: "Datacenter", pKey: "dcname", parentTag: "", enabled: anythingEnabled(parent.DatacenterMetricExclude), @@ -116,11 +118,13 @@ func NewEndpoint(ctx context.Context, parent *VSphere, url *url.URL) (*Endpoint, sampling: 300, objects: make(objectMap), filters: newFilterOrPanic(parent.DatacenterMetricInclude, parent.DatacenterMetricExclude), + paths: parent.DatacenterInclude, collectInstances: parent.DatacenterInstances, getObjects: getDatacenters, }, "cluster": { name: "cluster", + vcName: "ClusterComputeResource", pKey: "clustername", parentTag: "dcname", enabled: anythingEnabled(parent.ClusterMetricExclude), @@ -128,11 +132,13 @@ func NewEndpoint(ctx context.Context, parent *VSphere, url *url.URL) (*Endpoint, sampling: 300, objects: make(objectMap), filters: newFilterOrPanic(parent.ClusterMetricInclude, parent.ClusterMetricExclude), + paths: parent.ClusterInclude, collectInstances: parent.ClusterInstances, getObjects: getClusters, }, "host": { name: 
"host", + vcName: "HostSystem", pKey: "esxhostname", parentTag: "clustername", enabled: anythingEnabled(parent.HostMetricExclude), @@ -140,11 +146,13 @@ func NewEndpoint(ctx context.Context, parent *VSphere, url *url.URL) (*Endpoint, sampling: 20, objects: make(objectMap), filters: newFilterOrPanic(parent.HostMetricInclude, parent.HostMetricExclude), + paths: parent.HostInclude, collectInstances: parent.HostInstances, getObjects: getHosts, }, "vm": { name: "vm", + vcName: "VirtualMachine", pKey: "vmname", parentTag: "esxhostname", enabled: anythingEnabled(parent.VMMetricExclude), @@ -152,17 +160,20 @@ func NewEndpoint(ctx context.Context, parent *VSphere, url *url.URL) (*Endpoint, sampling: 20, objects: make(objectMap), filters: newFilterOrPanic(parent.VMMetricInclude, parent.VMMetricExclude), + paths: parent.VMInclude, collectInstances: parent.VMInstances, getObjects: getVMs, }, "datastore": { name: "datastore", + vcName: "Datastore", pKey: "dsname", enabled: anythingEnabled(parent.DatastoreMetricExclude), realTime: false, sampling: 300, objects: make(objectMap), filters: newFilterOrPanic(parent.DatastoreMetricInclude, parent.DatastoreMetricExclude), + paths: parent.DatastoreInclude, collectInstances: parent.DatastoreInstances, getObjects: getDatastores, }, @@ -359,7 +370,12 @@ func (e *Endpoint) discover(ctx context.Context) error { log.Printf("D! [input.vsphere] Discovering resources for %s", res.name) // Need to do this for all resource types even if they are not enabled if res.enabled || k != "vm" { - objects, err := res.getObjects(ctx, e, client.Root) + rf := ResourceFilter{ + finder: &Finder{client}, + resType: res.vcName, + paths: res.paths} + + objects, err := res.getObjects(ctx, e, &rf) if err != nil { return err } @@ -437,11 +453,11 @@ func (e *Endpoint) discover(ctx context.Context) error { return nil } -func getDatacenters(ctx context.Context, e *Endpoint, root *view.ContainerView) (objectMap, error) { +func getDatacenters(ctx context.Context, e *Endpoint, filter *ResourceFilter) (objectMap, error) { var resources []mo.Datacenter ctx1, cancel1 := context.WithTimeout(ctx, e.Parent.Timeout.Duration) defer cancel1() - err := root.Retrieve(ctx1, []string{"Datacenter"}, []string{"name", "parent"}, &resources) + err := filter.FindAll(ctx1, &resources) if err != nil { return nil, err } @@ -453,11 +469,11 @@ func getDatacenters(ctx context.Context, e *Endpoint, root *view.ContainerView) return m, nil } -func getClusters(ctx context.Context, e *Endpoint, root *view.ContainerView) (objectMap, error) { +func getClusters(ctx context.Context, e *Endpoint, filter *ResourceFilter) (objectMap, error) { var resources []mo.ClusterComputeResource ctx1, cancel1 := context.WithTimeout(ctx, e.Parent.Timeout.Duration) defer cancel1() - err := root.Retrieve(ctx1, []string{"ClusterComputeResource"}, []string{"name", "parent"}, &resources) + err := filter.FindAll(ctx1, &resources) if err != nil { return nil, err } @@ -467,11 +483,17 @@ func getClusters(ctx context.Context, e *Endpoint, root *view.ContainerView) (ob // We're not interested in the immediate parent (a folder), but the data center. 
p, ok := cache[r.Parent.Value] if !ok { - o := object.NewFolder(root.Client(), *r.Parent) - var folder mo.Folder ctx2, cancel2 := context.WithTimeout(ctx, e.Parent.Timeout.Duration) defer cancel2() - err := o.Properties(ctx2, *r.Parent, []string{"parent"}, &folder) + client, err := e.clientFactory.GetClient(ctx2) + if err != nil { + return nil, err + } + o := object.NewFolder(client.Client.Client, *r.Parent) + var folder mo.Folder + ctx3, cancel3 := context.WithTimeout(ctx, e.Parent.Timeout.Duration) + defer cancel3() + err = o.Properties(ctx3, *r.Parent, []string{"parent"}, &folder) if err != nil { log.Printf("W! [input.vsphere] Error while getting folder parent: %e", err) p = nil @@ -487,9 +509,9 @@ func getClusters(ctx context.Context, e *Endpoint, root *view.ContainerView) (ob return m, nil } -func getHosts(ctx context.Context, e *Endpoint, root *view.ContainerView) (objectMap, error) { +func getHosts(ctx context.Context, e *Endpoint, filter *ResourceFilter) (objectMap, error) { var resources []mo.HostSystem - err := root.Retrieve(ctx, []string{"HostSystem"}, []string{"name", "parent"}, &resources) + err := filter.FindAll(ctx, &resources) if err != nil { return nil, err } @@ -501,11 +523,11 @@ func getHosts(ctx context.Context, e *Endpoint, root *view.ContainerView) (objec return m, nil } -func getVMs(ctx context.Context, e *Endpoint, root *view.ContainerView) (objectMap, error) { +func getVMs(ctx context.Context, e *Endpoint, filter *ResourceFilter) (objectMap, error) { var resources []mo.VirtualMachine ctx1, cancel1 := context.WithTimeout(ctx, e.Parent.Timeout.Duration) defer cancel1() - err := root.Retrieve(ctx1, []string{"VirtualMachine"}, []string{"name", "runtime.host", "config.guestId", "config.uuid"}, &resources) + err := filter.FindAll(ctx1, &resources) if err != nil { return nil, err } @@ -525,11 +547,11 @@ func getVMs(ctx context.Context, e *Endpoint, root *view.ContainerView) (objectM return m, nil } -func getDatastores(ctx context.Context, e *Endpoint, root *view.ContainerView) (objectMap, error) { +func getDatastores(ctx context.Context, e *Endpoint, filter *ResourceFilter) (objectMap, error) { var resources []mo.Datastore ctx1, cancel1 := context.WithTimeout(ctx, e.Parent.Timeout.Duration) defer cancel1() - err := root.Retrieve(ctx1, []string{"Datastore"}, []string{"name", "parent", "info"}, &resources) + err := filter.FindAll(ctx1, &resources) if err != nil { return nil, err } diff --git a/plugins/inputs/vsphere/finder.go b/plugins/inputs/vsphere/finder.go index dda7b8ff5015a..1c6060d0489e5 100644 --- a/plugins/inputs/vsphere/finder.go +++ b/plugins/inputs/vsphere/finder.go @@ -2,6 +2,7 @@ package vsphere import ( "context" + "log" "reflect" "strings" @@ -13,15 +14,32 @@ import ( var childTypes map[string][]string +var addFields map[string][]string + type Finder struct { client *Client } +type ResourceFilter struct { + finder *Finder + resType string + paths []string +} + type nameAndRef struct { name string ref types.ManagedObjectReference } +func (f *Finder) FindAll(ctx context.Context, resType string, paths []string, dst interface{}) error { + for _, p := range paths { + if err := f.Find(ctx, resType, p, dst); err != nil { + return err + } + } + return nil +} + func (f *Finder) Find(ctx context.Context, resType, path string, dst interface{}) error { p := strings.Split(path, "/") flt := make([]property.Filter, len(p)-1) @@ -34,23 +52,27 @@ func (f *Finder) Find(ctx context.Context, resType, path string, dst interface{} return err } objectContentToTypedArray(objs, dst) 
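For context, this is roughly how a caller could use the Finder as it stands at this point in the series. It is a sketch only: the connected *Client named c, the variable names, and the chosen path are illustrative, mirroring the cases exercised in TestFinder ("*" matches a single inventory level, "**" matches any number of levels):

    f := Finder{client: c}
    vms := []mo.VirtualMachine{}
    // Collect every VM under any datacenter's vm folder.
    if err := f.FindAll(ctx, "VirtualMachine", []string{"/*/vm/**"}, &vms); err != nil {
        return err
    }
    for _, vm := range vms {
        log.Printf("D! [input.vsphere] found VM %s", vm.Name)
    }
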
+ log.Printf("D! [input.vsphere] Find(%s, %s) returned %d objects", resType, path, len(objs)) return nil } func (f *Finder) descend(ctx context.Context, root types.ManagedObjectReference, resType string, - parts []property.Filter, pos int, objs map[string]types.ObjectContent) error { + tokens []property.Filter, pos int, objs map[string]types.ObjectContent) error { + isLeaf := pos == len(tokens)-1 // No more tokens to match? - if pos >= len(parts) { + if pos >= len(tokens) { return nil } - // Get children + // Determine child types + ct, ok := childTypes[root.Reference().Type] if !ok { // We don't know how to handle children of this type. Stop descending. return nil } + m := view.NewManager(f.client.Client.Client) defer m.Destroy(ctx) v, err := m.CreateContainerView(ctx, root, ct, false) @@ -60,20 +82,68 @@ func (f *Finder) descend(ctx context.Context, root types.ManagedObjectReference, defer v.Destroy(ctx) var content []types.ObjectContent - err = v.Retrieve(ctx, ct, []string{"name"}, &content) - if err != nil { - return err + // If we're at a potential leaf, we need to collect all properties specified for a target type. However, + // if we're reached a node that may have multiple types of children, we have to do it in two + // passes, since asking for fields that don't exist in all types will throw an error. + // This is needed because of recursive wildcards. Even if we're at the last token, we can't determine + // whether we've actually reached a leaf. This would happen for e.g. "/DC0/vm/**". + fields := []string{"name"} + if isLeaf { + // Filter out the requested type from potential fields. + fct := make([]string, 0, len(ct)) + for _, t := range ct { + if t != resType { + fct = append(fct, t) + } + } + // Was the type present? (I.e. did we remove anything) + if len(ct) != len(fct) { + // Make a pass without the requested type with just the standard fields + if len(fct) > 0 { + err = v.Retrieve(ctx, fct, fields, &content) + if err != nil { + return err + } + } + + // Now make a pass with a full set of fields, but only for the requested type + if af, ok := addFields[resType]; ok { + fields = append(fields, af...) + } + var content1 []types.ObjectContent + err = v.Retrieve(ctx, []string{resType}, fields, &content1) + if err != nil { + return err + } + content = append(content, content1...) + } else { + // The requested type wasn't part of potential children, so just collect the basics + err = v.Retrieve(ctx, ct, fields, &content) + + if err != nil { + return err + } + } + } else { + // Not at a leaf, so we can keep things simple + err = v.Retrieve(ctx, ct, fields, &content) + + if err != nil { + return err + } } + for _, c := range content { - if !parts[pos].MatchPropertyList(c.PropSet) { + if !tokens[pos].MatchPropertyList(c.PropSet[:1]) { continue } + // Already been here through another path? Skip! if _, ok := objs[root.Reference().String()]; ok { continue } - if c.Obj.Type == resType { + if c.Obj.Type == resType && isLeaf { // We found what we're looking for. Consider it a leaf and stop descending objs[c.Obj.String()] = c continue @@ -81,18 +151,20 @@ func (f *Finder) descend(ctx context.Context, root types.ManagedObjectReference, // Deal with recursive wildcards (**) inc := 1 // Normally we advance one token. - if parts[pos]["name"] == "**" { - if pos >= len(parts)-1 { + if tokens[pos]["name"] == "**" { + if isLeaf { inc = 0 // Can't advance past last token, so keep descending the tree } else { // Lookahead to next token. 
If it matches this child, we are out of // the recursive wildcard handling and we can advance TWO tokens ahead, since // the token that ended the recursive wildcard mode is now consumed. - if parts[pos+1].MatchPropertyList(c.PropSet) { - if pos < len(parts)-2 { + if tokens[pos+1].MatchPropertyList(c.PropSet) { + if pos < len(tokens)-2 { inc = 2 } else { - inc = 0 + // We found match and it's at a leaf! Grab it! + objs[c.Obj.String()] = c + continue } } else { // We didn't break out of recursicve wildcard mode yet, so stay on this token. @@ -101,7 +173,7 @@ func (f *Finder) descend(ctx context.Context, root types.ManagedObjectReference, } } } - err := f.descend(ctx, c.Obj, resType, parts, pos+inc, objs) + err := f.descend(ctx, c.Obj, resType, tokens, pos+inc, objs) if err != nil { return err } @@ -149,6 +221,10 @@ func objectContentToTypedArray(objs map[string]types.ObjectContent, dst interfac return nil } +func (r *ResourceFilter) FindAll(ctx context.Context, dst interface{}) error { + return r.finder.FindAll(ctx, r.resType, r.paths, dst) +} + func init() { childTypes = map[string][]string{ "HostSystem": []string{"VirtualMachine"}, @@ -164,4 +240,12 @@ func init() { "Datastore", }, } + + addFields = map[string][]string{ + "HostSystem": []string{"parent"}, + "VirtualMachine": []string{"runtime.host", "config.guestId", "config.uuid"}, + "Datastore": []string{"parent", "info"}, + "ClusterComputeResource": []string{"parent"}, + "Datacenter": []string{"parent"}, + } } diff --git a/plugins/inputs/vsphere/vsphere.go b/plugins/inputs/vsphere/vsphere.go index f0bb5dca99c38..4af4f0d301016 100644 --- a/plugins/inputs/vsphere/vsphere.go +++ b/plugins/inputs/vsphere/vsphere.go @@ -22,18 +22,23 @@ type VSphere struct { DatacenterInstances bool DatacenterMetricInclude []string DatacenterMetricExclude []string + DatacenterInclude []string ClusterInstances bool ClusterMetricInclude []string ClusterMetricExclude []string + ClusterInclude []string HostInstances bool HostMetricInclude []string HostMetricExclude []string + HostInclude []string VMInstances bool `toml:"vm_instances"` VMMetricInclude []string `toml:"vm_metric_include"` VMMetricExclude []string `toml:"vm_metric_exclude"` + VMInclude []string `toml:"vm_include"` DatastoreInstances bool DatastoreMetricInclude []string DatastoreMetricExclude []string + DatastoreInclude []string Separator string MaxQueryObjects int @@ -291,19 +296,27 @@ func init() { return &VSphere{ Vcenters: []string{}, - ClusterInstances: true, - ClusterMetricInclude: nil, - ClusterMetricExclude: nil, - HostInstances: true, - HostMetricInclude: nil, - HostMetricExclude: nil, - VMInstances: true, - VMMetricInclude: nil, - VMMetricExclude: nil, - DatastoreInstances: false, - DatastoreMetricInclude: nil, - DatastoreMetricExclude: nil, - Separator: "_", + DatacenterInstances: true, + DatacenterMetricInclude: nil, + DatacenterMetricExclude: nil, + DatacenterInclude: []string{"/*"}, + ClusterInstances: true, + ClusterMetricInclude: nil, + ClusterMetricExclude: nil, + ClusterInclude: []string{"/*/host/**"}, + HostInstances: true, + HostMetricInclude: nil, + HostMetricExclude: nil, + HostInclude: []string{"/*/host/**"}, + VMInstances: true, + VMMetricInclude: nil, + VMMetricExclude: nil, + VMInclude: []string{"/*/vm/**"}, + DatastoreInstances: false, + DatastoreMetricInclude: nil, + DatastoreMetricExclude: nil, + DatastoreInclude: []string{"/*/datastore/**"}, + Separator: "_", MaxQueryObjects: 256, MaxQueryMetrics: 256, diff --git a/plugins/inputs/vsphere/vsphere_test.go 
b/plugins/inputs/vsphere/vsphere_test.go index 94a620b32f8cf..870273a4dda1a 100644 --- a/plugins/inputs/vsphere/vsphere_test.go +++ b/plugins/inputs/vsphere/vsphere_test.go @@ -110,6 +110,7 @@ func defaultVSphere() *VSphere { "mem.usage.*", "mem.active.*"}, ClusterMetricExclude: nil, + ClusterInclude: []string{"/**"}, HostMetricInclude: []string{ "cpu.ready.summation.delta.millisecond", "cpu.latency.average.rate.percent", @@ -147,6 +148,7 @@ func defaultVSphere() *VSphere { "disk.kernelReadLatency.average.absolute.millisecond", "disk.kernelWriteLatency.average.absolute.millisecond"}, HostMetricExclude: nil, + HostInclude: []string{"/**"}, VMMetricInclude: []string{ "cpu.ready.summation.delta.millisecond", "mem.swapinRate.average.rate.kiloBytesPerSecond", @@ -166,11 +168,16 @@ func defaultVSphere() *VSphere { "virtualDisk.read.average.rate.kiloBytesPerSecond", "virtualDisk.write.average.rate.kiloBytesPerSecond"}, VMMetricExclude: nil, + VMInclude: []string{"/**"}, DatastoreMetricInclude: []string{ "disk.used.*", "disk.provsioned.*"}, - DatastoreMetricExclude: nil, - ClientConfig: itls.ClientConfig{InsecureSkipVerify: true}, + DatastoreMetricExclude: nil, + DatastoreInclude: []string{"/**"}, + DatacenterMetricInclude: nil, + DatacenterMetricExclude: nil, + DatacenterInclude: []string{"/**"}, + ClientConfig: itls.ClientConfig{InsecureSkipVerify: true}, MaxQueryObjects: 256, ObjectDiscoveryInterval: internal.Duration{Duration: time.Second * 300}, @@ -334,6 +341,11 @@ func TestFinder(t *testing.T) { require.Equal(t, 1, len(dc)) require.Equal(t, "DC0_H0_VM0", vm[0].Name) + vm = []mo.VirtualMachine{} + err = f.Find(ctx, "VirtualMachine", "/DC0/vm/DC0_C0*", &vm) + require.NoError(t, err) + require.Equal(t, 1, len(dc)) + vm = []mo.VirtualMachine{} err = f.Find(ctx, "VirtualMachine", "/DC0/*/DC0_H0_VM0", &vm) require.NoError(t, err) @@ -354,6 +366,31 @@ func TestFinder(t *testing.T) { err = f.Find(ctx, "VirtualMachine", "/DC0/**", &vm) require.NoError(t, err) require.Equal(t, 4, len(vm)) + + vm = []mo.VirtualMachine{} + err = f.Find(ctx, "VirtualMachine", "/**", &vm) + require.NoError(t, err) + require.Equal(t, 4, len(vm)) + + vm = []mo.VirtualMachine{} + err = f.Find(ctx, "VirtualMachine", "/**/DC0_H0_VM*", &vm) + require.NoError(t, err) + require.Equal(t, 2, len(vm)) + + vm = []mo.VirtualMachine{} + err = f.Find(ctx, "VirtualMachine", "/**/vm/**", &vm) + require.NoError(t, err) + require.Equal(t, 4, len(vm)) + + vm = []mo.VirtualMachine{} + err = f.FindAll(ctx, "VirtualMachine", []string{"/DC0/vm/DC0_H0*", "/DC0/vm/DC0_C0*"}, &vm) + require.NoError(t, err) + require.Equal(t, 4, len(vm)) + + vm = []mo.VirtualMachine{} + err = f.FindAll(ctx, "VirtualMachine", []string{"/**"}, &vm) + require.NoError(t, err) + require.Equal(t, 4, len(vm)) } func TestAll(t *testing.T) { From 47cca7c2763f481449623885d02623e8205d6a8c Mon Sep 17 00:00:00 2001 From: prydin Date: Tue, 13 Nov 2018 14:28:12 -0500 Subject: [PATCH 04/34] PR candidate 1 --- plugins/inputs/vsphere/finder.go | 53 +++++++++-------------- plugins/inputs/vsphere/vsphere_test.go | 58 ++++++++++++++++++++++---- 2 files changed, 69 insertions(+), 42 deletions(-) diff --git a/plugins/inputs/vsphere/finder.go b/plugins/inputs/vsphere/finder.go index 1c6060d0489e5..95dfa5a0f79cb 100644 --- a/plugins/inputs/vsphere/finder.go +++ b/plugins/inputs/vsphere/finder.go @@ -82,52 +82,37 @@ func (f *Finder) descend(ctx context.Context, root types.ManagedObjectReference, defer v.Destroy(ctx) var content []types.ObjectContent - // If we're at a potential leaf, we 
need to collect all properties specified for a target type. However, - // if we're reached a node that may have multiple types of children, we have to do it in two - // passes, since asking for fields that don't exist in all types will throw an error. - // This is needed because of recursive wildcards. Even if we're at the last token, we can't determine - // whether we've actually reached a leaf. This would happen for e.g. "/DC0/vm/**". fields := []string{"name"} if isLeaf { - // Filter out the requested type from potential fields. - fct := make([]string, 0, len(ct)) - for _, t := range ct { - if t != resType { - fct = append(fct, t) - } - } - // Was the type present? (I.e. did we remove anything) - if len(ct) != len(fct) { - // Make a pass without the requested type with just the standard fields - if len(fct) > 0 { - err = v.Retrieve(ctx, fct, fields, &content) - if err != nil { - return err - } - } - - // Now make a pass with a full set of fields, but only for the requested type + // Special case: The last token is a recursive wildcard, so we can grab everything + // recursively in a single call. + if tokens[pos]["name"] == "**" { + v2, err := m.CreateContainerView(ctx, root, []string{resType}, true) + defer v2.Destroy(ctx) if af, ok := addFields[resType]; ok { fields = append(fields, af...) } - var content1 []types.ObjectContent - err = v.Retrieve(ctx, []string{resType}, fields, &content1) + err = v2.Retrieve(ctx, []string{resType}, fields, &content) if err != nil { return err } - content = append(content, content1...) - } else { - // The requested type wasn't part of potential children, so just collect the basics - err = v.Retrieve(ctx, ct, fields, &content) - - if err != nil { - return err + log.Printf("D! [input.vsphere] Recursive query returned %d objects", len(content)) + for _, c := range content { + objs[c.Obj.String()] = c } + return nil + } + + if af, ok := addFields[resType]; ok { + fields = append(fields, af...) } + err = v.Retrieve(ctx, []string{resType}, fields, &content) + if err != nil { + return err + } + log.Printf("D! [input.vsphere] Target type query returned %d objects", len(content)) } else { - // Not at a leaf, so we can keep things simple err = v.Retrieve(ctx, ct, fields, &content) - if err != nil { return err } diff --git a/plugins/inputs/vsphere/vsphere_test.go b/plugins/inputs/vsphere/vsphere_test.go index 870273a4dda1a..0648116f105fd 100644 --- a/plugins/inputs/vsphere/vsphere_test.go +++ b/plugins/inputs/vsphere/vsphere_test.go @@ -4,6 +4,8 @@ import ( "context" "crypto/tls" "fmt" + "net/url" + "os" "regexp" "sort" "strings" @@ -239,20 +241,27 @@ func TestWorkerPool(t *testing.T) { } func TestTimeout(t *testing.T) { - m, s, err := createSim() - if err != nil { - t.Fatal(err) + v := defaultVSphere() + url := os.Getenv("TGF_TEST_VSPHERE_URL") + if url != "" { + m, s, err := createSim() + if err != nil { + t.Fatal(err) + } + defer m.Remove() + defer s.Close() + url = s.URL.String() + } else { + v.Username = os.Getenv("TGF_TEST_VSPHERE_USER") + v.Password = os.Getenv("TGF_TEST_VSPHERE_PASSWORD") } - defer m.Remove() - defer s.Close() var acc testutil.Accumulator - v := defaultVSphere() - v.Vcenters = []string{s.URL.String()} + v.Vcenters = []string{url} v.Timeout = internal.Duration{Duration: 1 * time.Nanosecond} require.NoError(t, v.Start(nil)) // We're not using the Accumulator, so it can be nil. 
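As an aside, createSim is defined outside this excerpt; a rough equivalent built on govmomi's simulator package (a sketch of the idea, not the actual helper used by these tests) would be:

    func createSimSketch() (*simulator.Model, *simulator.Server, error) {
        model := simulator.VPX() // canned vCenter-style inventory: DC0, DC0_C0, DC0_H0, ...
        if err := model.Create(); err != nil {
            return nil, nil, err
        }
        model.Service.TLS = new(tls.Config)
        return model, model.Service.NewServer(), nil
    }

The caller is expected to defer model.Remove() and server.Close(), as the tests above do.
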
defer v.Stop() - err = v.Gather(&acc) + err := v.Gather(&acc) require.NotNil(t, err, "Error should not be nil here") // The accumulator must contain exactly one error and it must be a deadline exceeded. @@ -393,6 +402,39 @@ func TestFinder(t *testing.T) { require.Equal(t, 4, len(vm)) } +func TestExternalFinder(t *testing.T) { + os.Setenv("TGF_TEST_VSPHERE_URL", "https://10.198.15.245/sdk") + os.Setenv("TGF_TEST_VSPHERE_USER", "administrator@vsphere.local") + os.Setenv("TGF_TEST_VSPHERE_PASSWORD", "Admin!23") + + v := defaultVSphere() + vu := os.Getenv("TGF_TEST_VSPHERE_URL") + if vu == "" { + t.Skip("No external vCenter specified. Skipping") + } else { + v.Username = os.Getenv("TGF_TEST_VSPHERE_USER") + v.Password = os.Getenv("TGF_TEST_VSPHERE_PASSWORD") + } + + ctx := context.Background() + u, err := url.Parse(vu) + require.NoError(t, err, "Error parsing URL") + c, err := NewClient(ctx, u, v) + require.NoError(t, err, "Error connecting to vCenter") + + f := Finder{c} + + vm := []mo.VirtualMachine{} + err = f.Find(ctx, "VirtualMachine", "/**", &vm) + require.NoError(t, err) + require.True(t, len(vm) > 0) + + dc := []mo.Datacenter{} + err = f.Find(ctx, "Datacenter", "/*", &dc) + require.NoError(t, err) + require.Equal(t, 1, len(dc)) +} + func TestAll(t *testing.T) { m, s, err := createSim() if err != nil { From 01e4ac96e271b1691844dcbd845db03dc7e23e2d Mon Sep 17 00:00:00 2001 From: prydin Date: Tue, 13 Nov 2018 16:51:56 -0500 Subject: [PATCH 05/34] Removed excessive logging --- plugins/inputs/vsphere/finder.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/plugins/inputs/vsphere/finder.go b/plugins/inputs/vsphere/finder.go index 95dfa5a0f79cb..fc5396aaf94b5 100644 --- a/plugins/inputs/vsphere/finder.go +++ b/plugins/inputs/vsphere/finder.go @@ -96,7 +96,6 @@ func (f *Finder) descend(ctx context.Context, root types.ManagedObjectReference, if err != nil { return err } - log.Printf("D! [input.vsphere] Recursive query returned %d objects", len(content)) for _, c := range content { objs[c.Obj.String()] = c } @@ -110,7 +109,6 @@ func (f *Finder) descend(ctx context.Context, root types.ManagedObjectReference, if err != nil { return err } - log.Printf("D! [input.vsphere] Target type query returned %d objects", len(content)) } else { err = v.Retrieve(ctx, ct, fields, &content) if err != nil { From 3124ad60453166c83ee686905bdafd4b1d0dae77 Mon Sep 17 00:00:00 2001 From: prydin Date: Wed, 14 Nov 2018 11:35:41 -0500 Subject: [PATCH 06/34] Added comments --- plugins/inputs/vsphere/finder.go | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/plugins/inputs/vsphere/finder.go b/plugins/inputs/vsphere/finder.go index fc5396aaf94b5..732c020046814 100644 --- a/plugins/inputs/vsphere/finder.go +++ b/plugins/inputs/vsphere/finder.go @@ -16,10 +16,13 @@ var childTypes map[string][]string var addFields map[string][]string +// Finder allows callers to find resources in vCenter given a query string. type Finder struct { client *Client } +// ResourceFilter is a convenience class holding a finder and a set of paths. It is useful when you need a +// self contained object capable of returning a certain set of resources. type ResourceFilter struct { finder *Finder resType string @@ -31,6 +34,7 @@ type nameAndRef struct { ref types.ManagedObjectReference } +// FindAll returns the union of resources found given the supplied resource type and paths. 
func (f *Finder) FindAll(ctx context.Context, resType string, paths []string, dst interface{}) error { for _, p := range paths { if err := f.Find(ctx, resType, p, dst); err != nil { @@ -40,6 +44,7 @@ func (f *Finder) FindAll(ctx context.Context, resType string, paths []string, ds return nil } +// Find returns the resources matching the specified path. func (f *Finder) Find(ctx context.Context, resType, path string, dst interface{}) error { p := strings.Split(path, "/") flt := make([]property.Filter, len(p)-1) @@ -204,17 +209,19 @@ func objectContentToTypedArray(objs map[string]types.ObjectContent, dst interfac return nil } +// FindAll finds all resources matching the paths that were specified upon creation of +// the ResourceFilter. func (r *ResourceFilter) FindAll(ctx context.Context, dst interface{}) error { return r.finder.FindAll(ctx, r.resType, r.paths, dst) } func init() { childTypes = map[string][]string{ - "HostSystem": []string{"VirtualMachine"}, - "ComputeResource": []string{"HostSystem", "ResourcePool"}, - "ClusterComputeResource": []string{"HostSystem", "ResourcePool"}, - "Datacenter": []string{"Folder"}, - "Folder": []string{ + "HostSystem": {"VirtualMachine"}, + "ComputeResource": {"HostSystem", "ResourcePool"}, + "ClusterComputeResource": {"HostSystem", "ResourcePool"}, + "Datacenter": {"Folder"}, + "Folder": { "Folder", "Datacenter", "VirtualMachine", @@ -225,10 +232,10 @@ func init() { } addFields = map[string][]string{ - "HostSystem": []string{"parent"}, - "VirtualMachine": []string{"runtime.host", "config.guestId", "config.uuid"}, - "Datastore": []string{"parent", "info"}, - "ClusterComputeResource": []string{"parent"}, - "Datacenter": []string{"parent"}, + "HostSystem": {"parent"}, + "VirtualMachine": {"runtime.host", "config.guestId", "config.uuid"}, + "Datastore": {"parent", "info"}, + "ClusterComputeResource": {"parent"}, + "Datacenter": {"parent"}, } } From db694e6105fc76ae4b0d1a6719487d9353b17e98 Mon Sep 17 00:00:00 2001 From: prydin Date: Fri, 16 Nov 2018 17:09:58 -0500 Subject: [PATCH 07/34] Scale and performance improvements --- plugins/inputs/vsphere/endpoint.go | 158 ++++++++++++++----------- plugins/outputs/wavefront/wavefront.go | 2 +- 2 files changed, 90 insertions(+), 70 deletions(-) diff --git a/plugins/inputs/vsphere/endpoint.go b/plugins/inputs/vsphere/endpoint.go index dbc67dd959366..27f6653398437 100644 --- a/plugins/inputs/vsphere/endpoint.go +++ b/plugins/inputs/vsphere/endpoint.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "log" + "math/rand" "net/url" "regexp" "strconv" @@ -26,13 +27,15 @@ var isolateLUN = regexp.MustCompile(".*/([^/]+)/?$") const metricLookback = 3 +const rtMetricLookback = 3 + // Endpoint is a high-level representation of a connected vCenter endpoint. It is backed by the lower // level Client type. 
type Endpoint struct { - Parent *VSphere - URL *url.URL - lastColls map[string]time.Time - instanceInfo map[string]resourceInfo + Parent *VSphere + URL *url.URL + lastColls map[string]time.Time + //instanceInfo map[string]resourceInfo resourceKinds map[string]resourceKind hwMarks *TSCache lun2ds map[string]string @@ -52,6 +55,7 @@ type resourceKind struct { sampling int32 objects objectMap filters filter.Filter + metrics performance.MetricList collectInstances bool getObjects func(context.Context, *Endpoint, *view.ContainerView) (objectMap, error) } @@ -74,12 +78,6 @@ type objectRef struct { dcname string } -type resourceInfo struct { - name string - metrics performance.MetricList - parentRef *types.ManagedObjectReference -} - type metricQRequest struct { res *resourceKind obj objectRef @@ -100,7 +98,6 @@ func NewEndpoint(ctx context.Context, parent *VSphere, url *url.URL) (*Endpoint, Parent: parent, lastColls: make(map[string]time.Time), hwMarks: NewTSCache(1 * time.Hour), - instanceInfo: make(map[string]resourceInfo), lun2ds: make(map[string]string), initialized: false, clientFactory: NewClientFactory(ctx, url, parent), @@ -276,6 +273,21 @@ func (e *Endpoint) getMetricNameMap(ctx context.Context) (map[int32]string, erro return names, nil } +func (e *Endpoint) getMetadata2(ctx context.Context, obj objectRef, sampling int32) (performance.MetricList, error) { + client, err := e.clientFactory.GetClient(ctx) + if err != nil { + return nil, err + } + + ctx1, cancel1 := context.WithTimeout(ctx, e.Parent.Timeout.Duration) + defer cancel1() + metrics, err := client.Perf.AvailableMetric(ctx1, obj.ref.Reference(), sampling) + if err != nil && err != context.Canceled { + return nil, err + } + return metrics, nil +} + func (e *Endpoint) getMetadata(ctx context.Context, in interface{}) interface{} { client, err := e.clientFactory.GetClient(ctx) if err != nil { @@ -350,7 +362,7 @@ func (e *Endpoint) discover(ctx context.Context) error { log.Printf("D! [input.vsphere]: Discover new objects for %s", e.URL.Host) - instInfo := make(map[string]resourceInfo) + instInfoMux := sync.Mutex{} resourceKinds := make(map[string]resourceKind) dcNameCache := make(map[string]string) @@ -374,40 +386,51 @@ func (e *Endpoint) discover(ctx context.Context) error { } } - // Set up a worker pool for processing metadata queries concurrently - wp := NewWorkerPool(10) - wp.Run(ctx, e.getMetadata, e.Parent.DiscoverConcurrency) - - // Fill the input channels with resources that need to be queried - // for metadata. - wp.Fill(ctx, func(ctx context.Context, f PushFunc) { - for _, obj := range objects { - f(ctx, &metricQRequest{obj: obj, res: &res}) + // Get metric metadata and filter metrics + prob := 100.0 / float64(len(objects)) + log.Printf("D! [input.vsphere] Probability of sampling a resource: %f", prob) + wg := sync.WaitGroup{} + limiter := make(chan struct{}, e.Parent.DiscoverConcurrency) + for _, obj := range objects { + if rand.Float64() > prob { + continue } - }) - - // Drain the resulting metadata and build instance infos. 
- wp.Drain(ctx, func(ctx context.Context, in interface{}) bool { - switch resp := in.(type) { - case *metricQResponse: - mList := make(performance.MetricList, 0) - if res.enabled { - for _, m := range *resp.metrics { - if m.Instance != "" && !res.collectInstances { - continue - } - if res.filters.Match(metricNames[m.CounterId]) { - mList = append(mList, m) - } + wg.Add(1) + go func(obj objectRef) { + defer wg.Done() + limiter <- struct{}{} + defer func() { + <-limiter + }() + metrics, err := e.getMetadata2(ctx, obj, res.sampling) + if err != nil { + log.Printf("E! [input.vsphere]: Error while getting metric metadata. Discovery will be incomplete. Error: %s", err) + } + mMap := make(map[string]types.PerfMetricId) + for _, m := range metrics { + if m.Instance != "" && res.collectInstances { + m.Instance = "*" + } else { + m.Instance = "" + } + if res.filters.Match(metricNames[m.CounterId]) { + mMap[strconv.Itoa(int(m.CounterId))+"|"+m.Instance] = m } } - instInfo[resp.obj.ref.Value] = resourceInfo{name: resp.obj.name, metrics: mList, parentRef: resp.obj.parentRef} - case error: - log.Printf("W! [input.vsphere]: Error while discovering resources: %s", resp) - return false - } - return true - }) + log.Printf("D! [input.vsphere] Found %d metrics for %s", len(mMap), obj.name) + instInfoMux.Lock() + defer instInfoMux.Unlock() + if len(mMap) > len(res.metrics) { + res.metrics = make(performance.MetricList, len(mMap)) + i := 0 + for _, m := range mMap { + res.metrics[i] = m + i++ + } + } + }(obj) + } + wg.Wait() res.objects = objects resourceKinds[k] = res } @@ -428,12 +451,11 @@ func (e *Endpoint) discover(ctx context.Context) error { e.collectMux.Lock() defer e.collectMux.Unlock() - e.instanceInfo = instInfo e.resourceKinds = resourceKinds e.lun2ds = l2d sw.Stop() - SendInternalCounter("discovered_objects", e.URL.Host, int64(len(instInfo))) + // SendInternalCounter("discovered_objects", e.URL.Host, int64(len(instInfo))) TODO: Count the correct way return nil } @@ -505,12 +527,16 @@ func getVMs(ctx context.Context, e *Endpoint, root *view.ContainerView) (objectM var resources []mo.VirtualMachine ctx1, cancel1 := context.WithTimeout(ctx, e.Parent.Timeout.Duration) defer cancel1() - err := root.Retrieve(ctx1, []string{"VirtualMachine"}, []string{"name", "runtime.host", "config.guestId", "config.uuid"}, &resources) + err := root.Retrieve(ctx1, []string{"VirtualMachine"}, []string{"name", "runtime.host", "runtime.powerState", "config.guestId", "config.uuid"}, &resources) if err != nil { return nil, err } m := make(objectMap) for _, r := range resources { + if r.Runtime.PowerState != "poweredOn" { + log.Printf("D! [input.vsphere] Skipped powered off VM: %s", r.Name) + continue + } guest := "unknown" uuid := "" // Sometimes Config is unknown and returns a nil pointer @@ -609,22 +635,18 @@ func (e *Endpoint) chunker(ctx context.Context, f PushFunc, res *resourceKind, n total := 0 nRes := 0 for _, object := range res.objects { - info, found := e.instanceInfo[object.ref.Value] - if !found { - log.Printf("E! 
[input.vsphere]: Internal error: Instance info not found for MOID %s", object.ref) - } - mr := len(info.metrics) + mr := len(res.metrics) for mr > 0 { mc := mr headroom := maxMetrics - metrics if !res.realTime && mc > headroom { // Metric query limit only applies to non-realtime metrics mc = headroom } - fm := len(info.metrics) - mr + fm := len(res.metrics) - mr pq := types.PerfQuerySpec{ Entity: object.ref, - MaxSample: 1, - MetricId: info.metrics[fm : fm+mc], + MaxSample: rtMetricLookback, + MetricId: res.metrics[fm : fm+mc], IntervalId: res.sampling, Format: "normal", } @@ -694,7 +716,7 @@ func (e *Endpoint) collectResource(ctx context.Context, resourceType string, acc elapsed := now.Sub(latest).Seconds() + 5.0 // Allow 5 second jitter. log.Printf("D! [input.vsphere]: Latest: %s, elapsed: %f, resource: %s", latest, elapsed, resourceType) if !res.realTime && elapsed < float64(res.sampling) { - // No new data would be available. We're outta herE! [input.vsphere]: + // No new data would be available. We're outta here! log.Printf("D! [input.vsphere]: Sampling period for %s of %d has not elapsed on %s", resourceType, res.sampling, e.URL.Host) return nil @@ -715,7 +737,7 @@ func (e *Endpoint) collectResource(ctx context.Context, resourceType string, acc wp := NewWorkerPool(10) wp.Run(ctx, func(ctx context.Context, in interface{}) interface{} { chunk := in.([]types.PerfQuerySpec) - n, err := e.collectChunk(ctx, chunk, resourceType, res, acc) + n, err := e.collectChunk(ctx, chunk, resourceType, &res, acc) log.Printf("D! [input.vsphere] CollectChunk for %s returned %d metrics", resourceType, n) if err != nil { return err @@ -754,7 +776,7 @@ func (e *Endpoint) collectResource(ctx context.Context, resourceType string, acc } func (e *Endpoint) collectChunk(ctx context.Context, pqs []types.PerfQuerySpec, resourceType string, - res resourceKind, acc telegraf.Accumulator) (int, error) { + res *resourceKind, acc telegraf.Accumulator) (int, error) { count := 0 prefix := "vsphere" + e.Parent.Separator + resourceType @@ -788,7 +810,7 @@ func (e *Endpoint) collectChunk(ctx context.Context, pqs []types.PerfQuerySpec, // Iterate through results for _, em := range ems { moid := em.Entity.Reference().Value - instInfo, found := e.instanceInfo[moid] + instInfo, found := res.objects[moid] if !found { log.Printf("E! [input.vsphere]: MOID %s not found in cache. Skipping! (This should not happen!)", moid) continue @@ -808,16 +830,16 @@ func (e *Endpoint) collectChunk(ctx context.Context, pqs []types.PerfQuerySpec, log.Printf("E! [input.vsphere]: MOID %s not found in cache. Skipping", moid) continue } - e.populateTags(&objectRef, resourceType, &res, t, &v) + e.populateTags(&objectRef, resourceType, res, t, &v) // Now deal with the values. Iterate backwards so we start with the latest value tsKey := moid + "|" + name + "|" + v.Instance for idx := len(v.Value) - 1; idx >= 0; idx-- { ts := em.SampleInfo[idx].Timestamp - // Since non-realtime metrics are queries with a lookback, we need to check the high-water mark + // For queries with a lookback, we need to check the high-water mark // to determine if this should be included. Only samples not seen before should be included. - if !(res.realTime || e.hwMarks.IsNew(tsKey, ts)) { + if !e.hwMarks.IsNew(tsKey, ts) { continue } value := v.Value[idx] @@ -850,9 +872,7 @@ func (e *Endpoint) collectChunk(ctx context.Context, pqs []types.PerfQuerySpec, count++ // Update highwater marks for non-realtime metrics. 
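The tscache.go hunk is not included in this excerpt; judging only from how hwMarks is used above (IsNew to skip samples already reported, Put to advance the mark), a minimal high-water-mark cache could look like the sketch below. The struct and field names are assumptions; only the IsNew/Put semantics come from this patch.

    type tsCacheSketch struct {
        mu    sync.Mutex
        table map[string]time.Time
    }

    // IsNew reports whether ts is newer than the latest timestamp recorded for key.
    func (t *tsCacheSketch) IsNew(key string, ts time.Time) bool {
        t.mu.Lock()
        defer t.mu.Unlock()
        last, ok := t.table[key]
        return !ok || ts.After(last)
    }

    // Put advances the high-water mark for key, never moving it backwards.
    func (t *tsCacheSketch) Put(key string, ts time.Time) {
        t.mu.Lock()
        defer t.mu.Unlock()
        if last, ok := t.table[key]; !ok || ts.After(last) {
            t.table[key] = ts
        }
    }
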
- if !res.realTime { - e.hwMarks.Put(tsKey, ts) - } + e.hwMarks.Put(tsKey, ts) } } // We've iterated through all the metrics and collected buckets for each @@ -864,13 +884,13 @@ func (e *Endpoint) collectChunk(ctx context.Context, pqs []types.PerfQuerySpec, return count, nil } -func (e *Endpoint) getParent(obj resourceInfo) (resourceInfo, bool) { +func (e *Endpoint) getParent(obj objectRef, res *resourceKind) (objectRef, bool) { p := obj.parentRef if p == nil { log.Printf("D! [input.vsphere] No parent found for %s", obj.name) - return resourceInfo{}, false + return objectRef{}, false } - r, ok := e.instanceInfo[p.Value] + r, ok := res.objects[p.Value] return r, ok } @@ -885,14 +905,14 @@ func (e *Endpoint) populateTags(objectRef *objectRef, resourceType string, resou } // Map parent reference - parent, found := e.instanceInfo[objectRef.parentRef.Value] + parent, found := resource.objects[objectRef.parentRef.Value] if found { t[resource.parentTag] = parent.name if resourceType == "vm" { if objectRef.guest != "" { t["guest"] = objectRef.guest } - if c, ok := e.getParent(parent); ok { + if c, ok := e.getParent(parent, resource); ok { t["clustername"] = c.name } } diff --git a/plugins/outputs/wavefront/wavefront.go b/plugins/outputs/wavefront/wavefront.go index ef36d1804045f..df1d42158dc07 100644 --- a/plugins/outputs/wavefront/wavefront.go +++ b/plugins/outputs/wavefront/wavefront.go @@ -122,11 +122,11 @@ func (w *Wavefront) Write(metrics []telegraf.Metric) error { return fmt.Errorf("Wavefront: TCP connect fail %s", err.Error()) } defer connection.Close() - connection.SetWriteDeadline(time.Now().Add(5 * time.Second)) for _, m := range metrics { for _, metricPoint := range buildMetrics(m, w) { metricLine := formatMetricPoint(metricPoint, w) + connection.SetWriteDeadline(time.Now().Add(30 * time.Second)) _, err := connection.Write([]byte(metricLine)) if err != nil { return fmt.Errorf("Wavefront: TCP writing error %s", err.Error()) From 5fdabdbd18e63bb72f9493128e2999a0fb3f05f9 Mon Sep 17 00:00:00 2001 From: prydin Date: Sun, 18 Nov 2018 09:24:58 -0500 Subject: [PATCH 08/34] Use timestamp of latest sample as start point for next round --- plugins/inputs/vsphere/endpoint.go | 77 +++++++++++++++++------------- 1 file changed, 44 insertions(+), 33 deletions(-) diff --git a/plugins/inputs/vsphere/endpoint.go b/plugins/inputs/vsphere/endpoint.go index 27f6653398437..44e6b408e5de2 100644 --- a/plugins/inputs/vsphere/endpoint.go +++ b/plugins/inputs/vsphere/endpoint.go @@ -29,6 +29,8 @@ const metricLookback = 3 const rtMetricLookback = 3 +const maxSampleConst = 10 + // Endpoint is a high-level representation of a connected vCenter endpoint. It is backed by the lower // level Client type. type Endpoint struct { @@ -57,6 +59,7 @@ type resourceKind struct { filters filter.Filter metrics performance.MetricList collectInstances bool + parent string getObjects func(context.Context, *Endpoint, *view.ContainerView) (objectMap, error) } @@ -90,6 +93,15 @@ type metricQResponse struct { type multiError []error +func (e *Endpoint) getParent(obj *objectRef, res *resourceKind) (*objectRef, bool) { + if pKind, ok := e.resourceKinds[res.parent]; ok { + if p, ok := pKind.objects[obj.parentRef.Value]; ok { + return &p, true + } + } + return nil, false +} + // NewEndpoint returns a new connection to a vCenter based on the URL and configuration passed // as parameters. 
func NewEndpoint(ctx context.Context, parent *VSphere, url *url.URL) (*Endpoint, error) { @@ -115,6 +127,7 @@ func NewEndpoint(ctx context.Context, parent *VSphere, url *url.URL) (*Endpoint, filters: newFilterOrPanic(parent.DatacenterMetricInclude, parent.DatacenterMetricExclude), collectInstances: parent.DatacenterInstances, getObjects: getDatacenters, + parent: "", }, "cluster": { name: "cluster", @@ -127,6 +140,7 @@ func NewEndpoint(ctx context.Context, parent *VSphere, url *url.URL) (*Endpoint, filters: newFilterOrPanic(parent.ClusterMetricInclude, parent.ClusterMetricExclude), collectInstances: parent.ClusterInstances, getObjects: getClusters, + parent: "datacenter", }, "host": { name: "host", @@ -139,6 +153,7 @@ func NewEndpoint(ctx context.Context, parent *VSphere, url *url.URL) (*Endpoint, filters: newFilterOrPanic(parent.HostMetricInclude, parent.HostMetricExclude), collectInstances: parent.HostInstances, getObjects: getHosts, + parent: "cluster", }, "vm": { name: "vm", @@ -151,6 +166,7 @@ func NewEndpoint(ctx context.Context, parent *VSphere, url *url.URL) (*Endpoint, filters: newFilterOrPanic(parent.VMMetricInclude, parent.VMMetricExclude), collectInstances: parent.VMInstances, getObjects: getVMs, + parent: "host", }, "datastore": { name: "datastore", @@ -162,6 +178,7 @@ func NewEndpoint(ctx context.Context, parent *VSphere, url *url.URL) (*Endpoint, filters: newFilterOrPanic(parent.DatastoreMetricInclude, parent.DatastoreMetricExclude), collectInstances: parent.DatastoreInstances, getObjects: getDatastores, + parent: "", }, } @@ -645,24 +662,17 @@ func (e *Endpoint) chunker(ctx context.Context, f PushFunc, res *resourceKind, n fm := len(res.metrics) - mr pq := types.PerfQuerySpec{ Entity: object.ref, - MaxSample: rtMetricLookback, + MaxSample: maxSampleConst, MetricId: res.metrics[fm : fm+mc], IntervalId: res.sampling, Format: "normal", } - // For non-realtime metrics, we need to look back a few samples in case - // the vCenter is late reporting metrics. - if !res.realTime { - pq.MaxSample = metricLookback - } - // Look back 3 sampling periods - start := latest.Add(time.Duration(-res.sampling) * time.Second * (metricLookback - 1)) - if !res.realTime { - pq.StartTime = &start - pq.EndTime = &now - } + //start := latest.Add(time.Duration(-res.sampling) * time.Second * (metricLookback - 1)) + pq.StartTime = &latest + pq.EndTime = &now + pqs = append(pqs, pq) mr -= mc metrics += mc @@ -733,16 +743,23 @@ func (e *Endpoint) collectResource(ctx context.Context, resourceType string, acc count := int64(0) + var tsMux sync.Mutex + latestSample := time.Time{} // Set up a worker pool for collecting chunk metrics wp := NewWorkerPool(10) wp.Run(ctx, func(ctx context.Context, in interface{}) interface{} { chunk := in.([]types.PerfQuerySpec) - n, err := e.collectChunk(ctx, chunk, resourceType, &res, acc) + n, localLatest, err := e.collectChunk(ctx, chunk, resourceType, &res, acc) log.Printf("D! [input.vsphere] CollectChunk for %s returned %d metrics", resourceType, n) if err != nil { return err } atomic.AddInt64(&count, int64(n)) + tsMux.Lock() + defer tsMux.Unlock() + if localLatest.After(latestSample) { + latestSample = localLatest + } return nil }, e.Parent.CollectConcurrency) @@ -765,8 +782,8 @@ func (e *Endpoint) collectResource(ctx context.Context, resourceType string, acc } return true }) - e.lastColls[resourceType] = now // Use value captured at the beginning to avoid blind spots. - + log.Printf("D! 
[input.vsphere] Latest sample for %s set to %s", resourceType, latestSample) + e.lastColls[resourceType] = latestSample sw.Stop() SendInternalCounterWithTags("gather_count", e.URL.Host, internalTags, count) if len(merr) > 0 { @@ -776,34 +793,35 @@ func (e *Endpoint) collectResource(ctx context.Context, resourceType string, acc } func (e *Endpoint) collectChunk(ctx context.Context, pqs []types.PerfQuerySpec, resourceType string, - res *resourceKind, acc telegraf.Accumulator) (int, error) { + res *resourceKind, acc telegraf.Accumulator) (int, time.Time, error) { + latestSample := time.Time{} count := 0 prefix := "vsphere" + e.Parent.Separator + resourceType client, err := e.clientFactory.GetClient(ctx) if err != nil { - return 0, err + return count, latestSample, err } ctx1, cancel1 := context.WithTimeout(ctx, e.Parent.Timeout.Duration) defer cancel1() metricInfo, err := client.Perf.CounterInfoByName(ctx1) if err != nil { - return count, err + return count, latestSample, err } ctx2, cancel2 := context.WithTimeout(ctx, e.Parent.Timeout.Duration) defer cancel2() metrics, err := client.Perf.Query(ctx2, pqs) if err != nil { - return count, err + return count, latestSample, err } ctx3, cancel3 := context.WithTimeout(ctx, e.Parent.Timeout.Duration) defer cancel3() ems, err := client.Perf.ToMetricSeries(ctx3, metrics) if err != nil { - return count, err + return count, latestSample, err } log.Printf("D! [input.vsphere] Query for %s returned metrics for %d objects", resourceType, len(ems)) @@ -836,6 +854,9 @@ func (e *Endpoint) collectChunk(ctx context.Context, pqs []types.PerfQuerySpec, tsKey := moid + "|" + name + "|" + v.Instance for idx := len(v.Value) - 1; idx >= 0; idx-- { ts := em.SampleInfo[idx].Timestamp + if ts.After(latestSample) { + latestSample = ts + } // For queries with a lookback, we need to check the high-water mark // to determine if this should be included. Only samples not seen before should be included. @@ -881,17 +902,7 @@ func (e *Endpoint) collectChunk(ctx context.Context, pqs []types.PerfQuerySpec, acc.AddFields(bucket.name, bucket.fields, bucket.tags, bucket.ts) } } - return count, nil -} - -func (e *Endpoint) getParent(obj objectRef, res *resourceKind) (objectRef, bool) { - p := obj.parentRef - if p == nil { - log.Printf("D! 
[input.vsphere] No parent found for %s", obj.name) - return objectRef{}, false - } - r, ok := res.objects[p.Value] - return r, ok + return count, latestSample, nil } func (e *Endpoint) populateTags(objectRef *objectRef, resourceType string, resource *resourceKind, t map[string]string, v *performance.MetricSeries) { @@ -905,14 +916,14 @@ func (e *Endpoint) populateTags(objectRef *objectRef, resourceType string, resou } // Map parent reference - parent, found := resource.objects[objectRef.parentRef.Value] + parent, found := e.getParent(objectRef, resource) if found { t[resource.parentTag] = parent.name if resourceType == "vm" { if objectRef.guest != "" { t["guest"] = objectRef.guest } - if c, ok := e.getParent(parent, resource); ok { + if c, ok := e.resourceKinds["cluster"].objects[parent.parentRef.Value]; ok { t["clustername"] = c.name } } From 6c4cba00a57e05af1af178db4bf78c931fe868ff Mon Sep 17 00:00:00 2001 From: prydin Date: Tue, 27 Nov 2018 16:42:12 -0500 Subject: [PATCH 09/34] * Improved collection concurrency (one goroutine per object type) * Don't stop collection on partial error * Compute average if query returned multiple samples for a metric --- plugins/inputs/vsphere/endpoint.go | 208 ++++++++++++++++------------- plugins/inputs/vsphere/tscache.go | 8 ++ 2 files changed, 121 insertions(+), 95 deletions(-) diff --git a/plugins/inputs/vsphere/endpoint.go b/plugins/inputs/vsphere/endpoint.go index 44e6b408e5de2..864807b579a71 100644 --- a/plugins/inputs/vsphere/endpoint.go +++ b/plugins/inputs/vsphere/endpoint.go @@ -34,10 +34,10 @@ const maxSampleConst = 10 // Endpoint is a high-level representation of a connected vCenter endpoint. It is backed by the lower // level Client type. type Endpoint struct { - Parent *VSphere - URL *url.URL - lastColls map[string]time.Time - //instanceInfo map[string]resourceInfo + Parent *VSphere + URL *url.URL + lastColls map[string]time.Time + lastColl time.Time resourceKinds map[string]resourceKind hwMarks *TSCache lun2ds map[string]string @@ -403,51 +403,54 @@ func (e *Endpoint) discover(ctx context.Context) error { } } - // Get metric metadata and filter metrics - prob := 100.0 / float64(len(objects)) - log.Printf("D! [input.vsphere] Probability of sampling a resource: %f", prob) - wg := sync.WaitGroup{} - limiter := make(chan struct{}, e.Parent.DiscoverConcurrency) - for _, obj := range objects { - if rand.Float64() > prob { - continue - } - wg.Add(1) - go func(obj objectRef) { - defer wg.Done() - limiter <- struct{}{} - defer func() { - <-limiter - }() - metrics, err := e.getMetadata2(ctx, obj, res.sampling) - if err != nil { - log.Printf("E! [input.vsphere]: Error while getting metric metadata. Discovery will be incomplete. Error: %s", err) + // No need to collect metric metadata if resource type is not enabled + if res.enabled { + // Get metric metadata and filter metrics + prob := 100.0 / float64(len(objects)) + log.Printf("D! [input.vsphere] Probability of sampling a resource: %f", prob) + wg := sync.WaitGroup{} + limiter := make(chan struct{}, e.Parent.DiscoverConcurrency) + for _, obj := range objects { + if rand.Float64() > prob { + continue } - mMap := make(map[string]types.PerfMetricId) - for _, m := range metrics { - if m.Instance != "" && res.collectInstances { - m.Instance = "*" - } else { - m.Instance = "" + wg.Add(1) + go func(obj objectRef) { + defer wg.Done() + limiter <- struct{}{} + defer func() { + <-limiter + }() + metrics, err := e.getMetadata2(ctx, obj, res.sampling) + if err != nil { + log.Printf("E! 
[input.vsphere]: Error while getting metric metadata. Discovery will be incomplete. Error: %s", err) } - if res.filters.Match(metricNames[m.CounterId]) { - mMap[strconv.Itoa(int(m.CounterId))+"|"+m.Instance] = m + mMap := make(map[string]types.PerfMetricId) + for _, m := range metrics { + if m.Instance != "" && res.collectInstances { + m.Instance = "*" + } else { + m.Instance = "" + } + if res.filters.Match(metricNames[m.CounterId]) { + mMap[strconv.Itoa(int(m.CounterId))+"|"+m.Instance] = m + } } - } - log.Printf("D! [input.vsphere] Found %d metrics for %s", len(mMap), obj.name) - instInfoMux.Lock() - defer instInfoMux.Unlock() - if len(mMap) > len(res.metrics) { - res.metrics = make(performance.MetricList, len(mMap)) - i := 0 - for _, m := range mMap { - res.metrics[i] = m - i++ + log.Printf("D! [input.vsphere] Found %d metrics for %s", len(mMap), obj.name) + instInfoMux.Lock() + defer instInfoMux.Unlock() + if len(mMap) > len(res.metrics) { + res.metrics = make(performance.MetricList, len(mMap)) + i := 0 + for _, m := range mMap { + res.metrics[i] = m + i++ + } } - } - }(obj) + }(obj) + } + wg.Wait() } - wg.Wait() res.objects = objects resourceKinds[k] = res } @@ -601,7 +604,6 @@ func (e *Endpoint) Collect(ctx context.Context, acc telegraf.Accumulator) error // If we never managed to do a discovery, collection will be a no-op. Therefore, // we need to check that a connection is available, or the collection will // silently fail. - // if _, err := e.clientFactory.GetClient(ctx); err != nil { return err } @@ -614,21 +616,26 @@ func (e *Endpoint) Collect(ctx context.Context, acc telegraf.Accumulator) error } // If discovery interval is disabled (0), discover on each collection cycle - // if e.Parent.ObjectDiscoveryInterval.Duration == 0 { err := e.discover(ctx) if err != nil { return err } } + var wg sync.WaitGroup for k, res := range e.resourceKinds { if res.enabled { - err := e.collectResource(ctx, k, acc) - if err != nil { - return err - } + wg.Add(1) + go func(k string) { + defer wg.Done() + err := e.collectResource(ctx, k, acc) + if err != nil { + acc.AddError(err) + } + }(k) } } + wg.Wait() // Purge old timestamps from the cache e.hwMarks.Purge() @@ -668,9 +675,12 @@ func (e *Endpoint) chunker(ctx context.Context, f PushFunc, res *resourceKind, n Format: "normal", } - // Look back 3 sampling periods - //start := latest.Add(time.Duration(-res.sampling) * time.Second * (metricLookback - 1)) - pq.StartTime = &latest + start, ok := e.hwMarks.Get(object.ref.Value) + if !ok { + // Look back 3 sampling periods by default + start = latest.Add(time.Duration(-res.sampling) * time.Second * (metricLookback - 1)) + } + pq.StartTime = &start pq.EndTime = &now pqs = append(pqs, pq) @@ -711,7 +721,6 @@ func (e *Endpoint) chunker(ctx context.Context, f PushFunc, res *resourceKind, n func (e *Endpoint) collectResource(ctx context.Context, resourceType string, acc telegraf.Accumulator) error { - // Do we have new data yet? 
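// --- Illustrative sketch (not part of this patch) ---------------------------
// The Collect changes above fan out one goroutine per enabled resource kind
// and report per-kind failures instead of aborting the whole cycle. The same
// pattern in a self-contained form (collect and the kind names are invented
// stand-ins for Endpoint.collectResource and the real resource kinds):

package main

import (
	"fmt"
	"sync"
)

// collect stands in for Endpoint.collectResource.
func collect(kind string) error {
	if kind == "datastore" {
		return fmt.Errorf("%s: simulated failure", kind)
	}
	return nil
}

func main() {
	kinds := []string{"datacenter", "cluster", "host", "vm", "datastore"}
	errs := make(chan error, len(kinds)) // buffered so workers never block on errors
	var wg sync.WaitGroup
	for _, k := range kinds {
		wg.Add(1)
		go func(kind string) {
			defer wg.Done()
			if err := collect(kind); err != nil {
				errs <- err // mirror acc.AddError: record the error and keep going
			}
		}(k)
	}
	wg.Wait()
	close(errs)
	for err := range errs {
		fmt.Println("partial failure:", err)
	}
}
// ----------------------------------------------------------------------------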
res := e.resourceKinds[resourceType] client, err := e.clientFactory.GetClient(ctx) if err != nil { @@ -734,6 +743,7 @@ func (e *Endpoint) collectResource(ctx context.Context, resourceType string, acc } else { latest = now.Add(time.Duration(-res.sampling) * time.Second) } + e.lastColl = now internalTags := map[string]string{"resourcetype": resourceType} sw := NewStopwatchWithTags("gather_duration", e.URL.Host, internalTags) @@ -757,7 +767,7 @@ func (e *Endpoint) collectResource(ctx context.Context, resourceType string, acc atomic.AddInt64(&count, int64(n)) tsMux.Lock() defer tsMux.Unlock() - if localLatest.After(latestSample) { + if localLatest.After(latestSample) && !localLatest.IsZero() { latestSample = localLatest } return nil @@ -783,7 +793,9 @@ func (e *Endpoint) collectResource(ctx context.Context, resourceType string, acc return true }) log.Printf("D! [input.vsphere] Latest sample for %s set to %s", resourceType, latestSample) - e.lastColls[resourceType] = latestSample + if !latestSample.IsZero() { + e.lastColls[resourceType] = latestSample + } sw.Stop() SendInternalCounterWithTags("gather_count", e.URL.Host, internalTags, count) if len(merr) > 0 { @@ -850,51 +862,57 @@ func (e *Endpoint) collectChunk(ctx context.Context, pqs []types.PerfQuerySpec, } e.populateTags(&objectRef, resourceType, res, t, &v) - // Now deal with the values. Iterate backwards so we start with the latest value - tsKey := moid + "|" + name + "|" + v.Instance - for idx := len(v.Value) - 1; idx >= 0; idx-- { - ts := em.SampleInfo[idx].Timestamp - if ts.After(latestSample) { - latestSample = ts - } - - // For queries with a lookback, we need to check the high-water mark - // to determine if this should be included. Only samples not seen before should be included. - if !e.hwMarks.IsNew(tsKey, ts) { - continue - } - value := v.Value[idx] - - // Organize the metrics into a bucket per measurement. - // Data SHOULD be presented to us with the same timestamp for all samples, but in case - // they don't we use the measurement name + timestamp as the key for the bucket. - mn, fn := e.makeMetricIdentifier(prefix, name) - bKey := mn + " " + v.Instance + " " + strconv.FormatInt(ts.UnixNano(), 10) - bucket, found := buckets[bKey] - if !found { - bucket = metricEntry{name: mn, ts: ts, fields: make(map[string]interface{}), tags: t} - buckets[bKey] = bucket - } + avg := float64(0) + nValues := 0 + //log.Printf("D! [input.vsphere] %s %d samples", name, len(v.Value)) + for idx, sample := range em.SampleInfo { + value := float64(v.Value[idx]) if value < 0 { - log.Printf("D! [input.vsphere]: Negative value for %s on %s. Indicates missing samples", name, objectRef.name) continue } - - // Percentage values must be scaled down by 100. - info, ok := metricInfo[name] - if !ok { - log.Printf("E! [input.vsphere]: Could not determine unit for %s. Skipping", name) - } - if info.UnitInfo.GetElementDescription().Key == "percent" { - bucket.fields[fn] = float64(value) / 100.0 - } else { - bucket.fields[fn] = value + ts := sample.Timestamp + if ts.After(latestSample) { + latestSample = ts } - count++ + avg += float64(value) + nValues++ + } + if nValues == 0 { + log.Printf("D! [input.vsphere]: Missing value for: %s, %s", name, objectRef.name) + continue + } + + // If we're catching up with metrics arriving late, calculate the average + // of them and pick the midpoint timestamp. This is a reasonable way of + // filling in missed collections that doesn't cause us to deliver metrics + // faster than the interval. 
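// --- Illustrative sketch (not part of this patch) ---------------------------
// The code just below collapses several late samples into a single value: a
// plain average, reported at the midpoint timestamp so catch-up never delivers
// points faster than the sampling interval. A hypothetical stand-alone helper
// expressing the same idea:

package main

import (
	"fmt"
	"time"
)

// averageAtMidpoint assumes values and stamps are non-empty and equally long,
// mirroring the nValues > 0 guard in the patch.
func averageAtMidpoint(values []float64, stamps []time.Time) (float64, time.Time) {
	sum := 0.0
	for _, v := range values {
		sum += v
	}
	return sum / float64(len(values)), stamps[len(stamps)/2]
}

func main() {
	now := time.Now()
	vals := []float64{10, 20, 30}
	ts := []time.Time{now.Add(-40 * time.Second), now.Add(-20 * time.Second), now}
	avg, mid := averageAtMidpoint(vals, ts)
	fmt.Println(avg, mid) // average 20, attributed to the -20s midpoint sample
}
// ----------------------------------------------------------------------------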
+ avg /= float64(nValues) + midTs := em.SampleInfo[len(em.SampleInfo)/2].Timestamp + + // Organize the metrics into a bucket per measurement. + mn, fn := e.makeMetricIdentifier(prefix, name) + bKey := mn + " " + v.Instance + " " + strconv.FormatInt(midTs.UnixNano(), 10) + bucket, found := buckets[bKey] + if !found { + bucket = metricEntry{name: mn, ts: midTs, fields: make(map[string]interface{}), tags: t} + buckets[bKey] = bucket + } - // Update highwater marks for non-realtime metrics. - e.hwMarks.Put(tsKey, ts) + // Percentage values must be scaled down by 100. + info, ok := metricInfo[name] + if !ok { + log.Printf("E! [input.vsphere]: Could not determine unit for %s. Skipping", name) } + if info.UnitInfo.GetElementDescription().Key == "percent" { + bucket.fields[fn] = float64(avg) / 100.0 + } else { + bucket.fields[fn] = avg + } + count++ + + // Update highwater marks + e.hwMarks.Put(moid, latestSample) + } // We've iterated through all the metrics and collected buckets for each // measurement name. Now emit them! diff --git a/plugins/inputs/vsphere/tscache.go b/plugins/inputs/vsphere/tscache.go index 9abe24ea725c5..1d1f00ebea3cc 100644 --- a/plugins/inputs/vsphere/tscache.go +++ b/plugins/inputs/vsphere/tscache.go @@ -49,6 +49,14 @@ func (t *TSCache) IsNew(key string, tm time.Time) bool { return !tm.Before(v) } +// Get returns a timestamp (if present) +func (t *TSCache) Get(key string) (time.Time, bool) { + t.mux.RLock() + defer t.mux.RUnlock() + ts, ok := t.table[key] + return ts, ok +} + // Put updates the latest timestamp for the supplied key. func (t *TSCache) Put(key string, time time.Time) { t.mux.Lock() From 26e153627a2cd0fce67a74a070a5b6de36dbbee4 Mon Sep 17 00:00:00 2001 From: prydin Date: Wed, 28 Nov 2018 11:18:51 -0500 Subject: [PATCH 10/34] Added hard 100000 metric query limit --- plugins/inputs/vsphere/endpoint.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/plugins/inputs/vsphere/endpoint.go b/plugins/inputs/vsphere/endpoint.go index 864807b579a71..de6a68311ca28 100644 --- a/plugins/inputs/vsphere/endpoint.go +++ b/plugins/inputs/vsphere/endpoint.go @@ -690,7 +690,8 @@ func (e *Endpoint) chunker(ctx context.Context, f PushFunc, res *resourceKind, n // We need to dump the current chunk of metrics for one of two reasons: // 1) We filled up the metric quota while processing the current resource // 2) We are at the last resource and have no more data to process. - if mr > 0 || (!res.realTime && metrics >= maxMetrics) || nRes >= e.Parent.MaxQueryObjects { + // 3) The query contains more than 100,000 individual metrics + if mr > 0 || (!res.realTime && metrics >= maxMetrics) || nRes >= e.Parent.MaxQueryObjects || len(pqs) > 100000 { log.Printf("D! [input.vsphere]: Queueing query: %d objects, %d metrics (%d remaining) of type %s for %s. Processed objects: %d. Total objects %d", len(pqs), metrics, mr, res.name, e.URL.Host, total+1, len(res.objects)) @@ -710,7 +711,6 @@ func (e *Endpoint) chunker(ctx context.Context, f PushFunc, res *resourceKind, n nRes++ } // There may be dangling stuff in the queue. Handle them - // if len(pqs) > 0 { // Call push function log.Printf("D! [input.vsphere]: Queuing query: %d objects, %d metrics (0 remaining) of type %s for %s. 
Total objects %d (final chunk)", From aaa675476fdf71de872d931214e80b4797d5741a Mon Sep 17 00:00:00 2001 From: prydin Date: Fri, 30 Nov 2018 09:15:48 -0500 Subject: [PATCH 11/34] Moved timeout logic to client.go --- plugins/inputs/vsphere/client.go | 32 ++++++++++++- plugins/inputs/vsphere/endpoint.go | 74 ++++++++---------------------- 2 files changed, 49 insertions(+), 57 deletions(-) diff --git a/plugins/inputs/vsphere/client.go b/plugins/inputs/vsphere/client.go index ebad2bea79d30..176bfb425f4c3 100644 --- a/plugins/inputs/vsphere/client.go +++ b/plugins/inputs/vsphere/client.go @@ -10,6 +10,8 @@ import ( "sync" "time" + "github.com/vmware/govmomi/vim25/types" + "github.com/vmware/govmomi" "github.com/vmware/govmomi/object" "github.com/vmware/govmomi/performance" @@ -76,6 +78,7 @@ func (cf *ClientFactory) GetClient(ctx context.Context) (*Client, error) { ctx2, cancel2 := context.WithTimeout(ctx, cf.parent.Timeout.Duration) defer cancel2() if cf.client.Client.SessionManager.Login(ctx2, url.UserPassword(cf.parent.Username, cf.parent.Password)) != nil { + log.Printf("W! [input.vsphere]: Client reauthentication failed.") return nil, err } } @@ -205,6 +208,8 @@ func (c *Client) close() { // GetServerTime returns the time at the vCenter server func (c *Client) GetServerTime(ctx context.Context) (time.Time, error) { + ctx, cancel := context.WithTimeout(ctx, c.Timeout) + defer cancel() t, err := methods.GetCurrentTime(ctx, c.Client) if err != nil { return time.Time{}, err @@ -235,7 +240,7 @@ func (c *Client) GetMaxQueryMetrics(ctx context.Context) (int, error) { // Fall through version-based inference if value isn't usable } } else { - log.Println("I! [input.vsphere] Option query for maxQueryMetrics failed. Using default") + log.Println("D! [input.vsphere] Option query for maxQueryMetrics failed. Using default") } // No usable maxQueryMetrics setting. 
Infer based on version @@ -255,3 +260,28 @@ func (c *Client) GetMaxQueryMetrics(ctx context.Context) (int, error) { } return 256, nil } + +func (c *Client) QueryMetrics(ctx context.Context, pqs []types.PerfQuerySpec) ([]performance.EntityMetric, error) { + ctx1, cancel1 := context.WithTimeout(ctx, c.Timeout) + defer cancel1() + metrics, err := c.Perf.Query(ctx1, pqs) + if err != nil { + return nil, err + } + + ctx2, cancel2 := context.WithTimeout(ctx, c.Timeout) + defer cancel2() + return c.Perf.ToMetricSeries(ctx2, metrics) +} + +func (c *Client) CounterInfoByName(ctx context.Context) (map[string]*types.PerfCounterInfo, error) { + ctx1, cancel1 := context.WithTimeout(ctx, c.Timeout) + defer cancel1() + return c.Perf.CounterInfoByName(ctx1) +} + +func (c *Client) ListResources(ctx context.Context, root *view.ContainerView, kind []string, ps []string, dst interface{}) error { + ctx1, cancel1 := context.WithTimeout(ctx, c.Timeout) + defer cancel1() + return root.Retrieve(ctx1, kind, ps, dst) +} diff --git a/plugins/inputs/vsphere/endpoint.go b/plugins/inputs/vsphere/endpoint.go index de6a68311ca28..05492a97841c6 100644 --- a/plugins/inputs/vsphere/endpoint.go +++ b/plugins/inputs/vsphere/endpoint.go @@ -60,7 +60,7 @@ type resourceKind struct { metrics performance.MetricList collectInstances bool parent string - getObjects func(context.Context, *Endpoint, *view.ContainerView) (objectMap, error) + getObjects func(context.Context, *Client, *Endpoint, *view.ContainerView) (objectMap, error) } type metricEntry struct { @@ -276,10 +276,7 @@ func (e *Endpoint) getMetricNameMap(ctx context.Context) (map[int32]string, erro return nil, err } - ctx1, cancel1 := context.WithTimeout(ctx, e.Parent.Timeout.Duration) - defer cancel1() - mn, err := client.Perf.CounterInfoByName(ctx1) - + mn, err := client.CounterInfoByName(ctx) if err != nil { return nil, err } @@ -290,7 +287,7 @@ func (e *Endpoint) getMetricNameMap(ctx context.Context) (map[int32]string, erro return names, nil } -func (e *Endpoint) getMetadata2(ctx context.Context, obj objectRef, sampling int32) (performance.MetricList, error) { +func (e *Endpoint) getMetadata(ctx context.Context, obj objectRef, sampling int32) (performance.MetricList, error) { client, err := e.clientFactory.GetClient(ctx) if err != nil { return nil, err @@ -299,28 +296,12 @@ func (e *Endpoint) getMetadata2(ctx context.Context, obj objectRef, sampling int ctx1, cancel1 := context.WithTimeout(ctx, e.Parent.Timeout.Duration) defer cancel1() metrics, err := client.Perf.AvailableMetric(ctx1, obj.ref.Reference(), sampling) - if err != nil && err != context.Canceled { + if err != nil { return nil, err } return metrics, nil } -func (e *Endpoint) getMetadata(ctx context.Context, in interface{}) interface{} { - client, err := e.clientFactory.GetClient(ctx) - if err != nil { - return err - } - - rq := in.(*metricQRequest) - ctx1, cancel1 := context.WithTimeout(ctx, e.Parent.Timeout.Duration) - defer cancel1() - metrics, err := client.Perf.AvailableMetric(ctx1, rq.obj.ref.Reference(), rq.res.sampling) - if err != nil && err != context.Canceled { - log.Printf("E! [input.vsphere]: Error while getting metric metadata. Discovery will be incomplete. 
Error: %s", err) - } - return &metricQResponse{metrics: &metrics, obj: rq.obj} -} - func (e *Endpoint) getDatacenterName(ctx context.Context, client *Client, cache map[string]string, r types.ManagedObjectReference) string { path := make([]string, 0) returnVal := "" @@ -388,7 +369,7 @@ func (e *Endpoint) discover(ctx context.Context) error { log.Printf("D! [input.vsphere] Discovering resources for %s", res.name) // Need to do this for all resource types even if they are not enabled if res.enabled || k != "vm" { - objects, err := res.getObjects(ctx, e, client.Root) + objects, err := res.getObjects(ctx, client, e, client.Root) if err != nil { return err } @@ -421,7 +402,7 @@ func (e *Endpoint) discover(ctx context.Context) error { defer func() { <-limiter }() - metrics, err := e.getMetadata2(ctx, obj, res.sampling) + metrics, err := e.getMetadata(ctx, obj, res.sampling) if err != nil { log.Printf("E! [input.vsphere]: Error while getting metric metadata. Discovery will be incomplete. Error: %s", err) } @@ -479,11 +460,9 @@ func (e *Endpoint) discover(ctx context.Context) error { return nil } -func getDatacenters(ctx context.Context, e *Endpoint, root *view.ContainerView) (objectMap, error) { +func getDatacenters(ctx context.Context, client *Client, e *Endpoint, root *view.ContainerView) (objectMap, error) { var resources []mo.Datacenter - ctx1, cancel1 := context.WithTimeout(ctx, e.Parent.Timeout.Duration) - defer cancel1() - err := root.Retrieve(ctx1, []string{"Datacenter"}, []string{"name", "parent"}, &resources) + err := client.ListResources(ctx, root, []string{"Datacenter"}, []string{"name", "parent"}, &resources) if err != nil { return nil, err } @@ -495,11 +474,9 @@ func getDatacenters(ctx context.Context, e *Endpoint, root *view.ContainerView) return m, nil } -func getClusters(ctx context.Context, e *Endpoint, root *view.ContainerView) (objectMap, error) { +func getClusters(ctx context.Context, client *Client, e *Endpoint, root *view.ContainerView) (objectMap, error) { var resources []mo.ClusterComputeResource - ctx1, cancel1 := context.WithTimeout(ctx, e.Parent.Timeout.Duration) - defer cancel1() - err := root.Retrieve(ctx1, []string{"ClusterComputeResource"}, []string{"name", "parent"}, &resources) + err := client.ListResources(ctx, root, []string{"ClusterComputeResource"}, []string{"name", "parent"}, &resources) if err != nil { return nil, err } @@ -529,9 +506,9 @@ func getClusters(ctx context.Context, e *Endpoint, root *view.ContainerView) (ob return m, nil } -func getHosts(ctx context.Context, e *Endpoint, root *view.ContainerView) (objectMap, error) { +func getHosts(ctx context.Context, client *Client, e *Endpoint, root *view.ContainerView) (objectMap, error) { var resources []mo.HostSystem - err := root.Retrieve(ctx, []string{"HostSystem"}, []string{"name", "parent"}, &resources) + err := client.ListResources(ctx, root, []string{"HostSystem"}, []string{"name", "parent"}, &resources) if err != nil { return nil, err } @@ -543,11 +520,9 @@ func getHosts(ctx context.Context, e *Endpoint, root *view.ContainerView) (objec return m, nil } -func getVMs(ctx context.Context, e *Endpoint, root *view.ContainerView) (objectMap, error) { +func getVMs(ctx context.Context, client *Client, e *Endpoint, root *view.ContainerView) (objectMap, error) { var resources []mo.VirtualMachine - ctx1, cancel1 := context.WithTimeout(ctx, e.Parent.Timeout.Duration) - defer cancel1() - err := root.Retrieve(ctx1, []string{"VirtualMachine"}, []string{"name", "runtime.host", "runtime.powerState", "config.guestId", 
"config.uuid"}, &resources) + err := client.ListResources(ctx, root, []string{"VirtualMachine"}, []string{"name", "runtime.host", "runtime.powerState", "config.guestId", "config.uuid"}, &resources) if err != nil { return nil, err } @@ -571,11 +546,9 @@ func getVMs(ctx context.Context, e *Endpoint, root *view.ContainerView) (objectM return m, nil } -func getDatastores(ctx context.Context, e *Endpoint, root *view.ContainerView) (objectMap, error) { +func getDatastores(ctx context.Context, client *Client, e *Endpoint, root *view.ContainerView) (objectMap, error) { var resources []mo.Datastore - ctx1, cancel1 := context.WithTimeout(ctx, e.Parent.Timeout.Duration) - defer cancel1() - err := root.Retrieve(ctx1, []string{"Datastore"}, []string{"name", "parent", "info"}, &resources) + err := client.ListResources(ctx, root, []string{"Datastore"}, []string{"name", "parent", "info"}, &resources) if err != nil { return nil, err } @@ -720,7 +693,6 @@ func (e *Endpoint) chunker(ctx context.Context, f PushFunc, res *resourceKind, n } func (e *Endpoint) collectResource(ctx context.Context, resourceType string, acc telegraf.Accumulator) error { - res := e.resourceKinds[resourceType] client, err := e.clientFactory.GetClient(ctx) if err != nil { @@ -815,26 +787,16 @@ func (e *Endpoint) collectChunk(ctx context.Context, pqs []types.PerfQuerySpec, return count, latestSample, err } - ctx1, cancel1 := context.WithTimeout(ctx, e.Parent.Timeout.Duration) - defer cancel1() - metricInfo, err := client.Perf.CounterInfoByName(ctx1) + metricInfo, err := client.CounterInfoByName(ctx) if err != nil { return count, latestSample, err } - ctx2, cancel2 := context.WithTimeout(ctx, e.Parent.Timeout.Duration) - defer cancel2() - metrics, err := client.Perf.Query(ctx2, pqs) + ems, err := client.QueryMetrics(ctx, pqs) if err != nil { return count, latestSample, err } - ctx3, cancel3 := context.WithTimeout(ctx, e.Parent.Timeout.Duration) - defer cancel3() - ems, err := client.Perf.ToMetricSeries(ctx3, metrics) - if err != nil { - return count, latestSample, err - } log.Printf("D! 
[input.vsphere] Query for %s returned metrics for %d objects", resourceType, len(ems)) // Iterate through results From e9956cac9178336cd12b9bffea08d1bbdbd77649 Mon Sep 17 00:00:00 2001 From: prydin Date: Tue, 4 Dec 2018 14:45:26 -0500 Subject: [PATCH 12/34] Removed WorkerPool and added ThrottledExecutor instead --- plugins/inputs/vsphere/client.go | 10 + plugins/inputs/vsphere/endpoint.go | 247 ++++++++++++----------- plugins/inputs/vsphere/throttled_exec.go | 35 ++++ plugins/inputs/vsphere/vsphere.go | 5 - plugins/inputs/vsphere/vsphere_test.go | 46 +++-- plugins/inputs/vsphere/workerpool.go | 119 ----------- 6 files changed, 204 insertions(+), 258 deletions(-) create mode 100644 plugins/inputs/vsphere/throttled_exec.go delete mode 100644 plugins/inputs/vsphere/workerpool.go diff --git a/plugins/inputs/vsphere/client.go b/plugins/inputs/vsphere/client.go index 176bfb425f4c3..37f4b2c31776e 100644 --- a/plugins/inputs/vsphere/client.go +++ b/plugins/inputs/vsphere/client.go @@ -261,6 +261,7 @@ func (c *Client) GetMaxQueryMetrics(ctx context.Context) (int, error) { return 256, nil } +// QueryMetrics wraps performance.Query to give it proper timeouts func (c *Client) QueryMetrics(ctx context.Context, pqs []types.PerfQuerySpec) ([]performance.EntityMetric, error) { ctx1, cancel1 := context.WithTimeout(ctx, c.Timeout) defer cancel1() @@ -274,12 +275,21 @@ func (c *Client) QueryMetrics(ctx context.Context, pqs []types.PerfQuerySpec) ([ return c.Perf.ToMetricSeries(ctx2, metrics) } +// CounterInfoByName wraps performance.CounterInfoByName to give it proper timeouts func (c *Client) CounterInfoByName(ctx context.Context) (map[string]*types.PerfCounterInfo, error) { ctx1, cancel1 := context.WithTimeout(ctx, c.Timeout) defer cancel1() return c.Perf.CounterInfoByName(ctx1) } +// CounterInfoByKey wraps performance.CounterInfoByKey to give it proper timeouts +func (c *Client) CounterInfoByKey(ctx context.Context) (map[int32]*types.PerfCounterInfo, error) { + ctx1, cancel1 := context.WithTimeout(ctx, c.Timeout) + defer cancel1() + return c.Perf.CounterInfoByKey(ctx1) +} + +// ListResources wraps property.Collector.Retrieve to give it proper timeouts func (c *Client) ListResources(ctx context.Context, root *view.ContainerView, kind []string, ps []string, dst interface{}) error { ctx1, cancel1 := context.WithTimeout(ctx, c.Timeout) defer cancel1() diff --git a/plugins/inputs/vsphere/endpoint.go b/plugins/inputs/vsphere/endpoint.go index 05492a97841c6..aba2a6ea08329 100644 --- a/plugins/inputs/vsphere/endpoint.go +++ b/plugins/inputs/vsphere/endpoint.go @@ -2,6 +2,7 @@ package vsphere import ( "context" + "errors" "fmt" "log" "math/rand" @@ -57,6 +58,8 @@ type resourceKind struct { sampling int32 objects objectMap filters filter.Filter + include []string + simple bool metrics performance.MetricList collectInstances bool parent string @@ -91,8 +94,6 @@ type metricQResponse struct { metrics *performance.MetricList } -type multiError []error - func (e *Endpoint) getParent(obj *objectRef, res *resourceKind) (*objectRef, bool) { if pKind, ok := e.resourceKinds[res.parent]; ok { if p, ok := pKind.objects[obj.parentRef.Value]; ok { @@ -125,6 +126,8 @@ func NewEndpoint(ctx context.Context, parent *VSphere, url *url.URL) (*Endpoint, sampling: 300, objects: make(objectMap), filters: newFilterOrPanic(parent.DatacenterMetricInclude, parent.DatacenterMetricExclude), + simple: isSimple(parent.DatacenterMetricInclude, parent.DatacenterMetricExclude), + include: parent.DatacenterMetricInclude, collectInstances: 
parent.DatacenterInstances, getObjects: getDatacenters, parent: "", @@ -138,6 +141,8 @@ func NewEndpoint(ctx context.Context, parent *VSphere, url *url.URL) (*Endpoint, sampling: 300, objects: make(objectMap), filters: newFilterOrPanic(parent.ClusterMetricInclude, parent.ClusterMetricExclude), + simple: isSimple(parent.ClusterMetricInclude, parent.ClusterMetricExclude), + include: parent.ClusterMetricInclude, collectInstances: parent.ClusterInstances, getObjects: getClusters, parent: "datacenter", @@ -151,6 +156,8 @@ func NewEndpoint(ctx context.Context, parent *VSphere, url *url.URL) (*Endpoint, sampling: 20, objects: make(objectMap), filters: newFilterOrPanic(parent.HostMetricInclude, parent.HostMetricExclude), + simple: isSimple(parent.HostMetricInclude, parent.HostMetricExclude), + include: parent.HostMetricInclude, collectInstances: parent.HostInstances, getObjects: getHosts, parent: "cluster", @@ -164,6 +171,8 @@ func NewEndpoint(ctx context.Context, parent *VSphere, url *url.URL) (*Endpoint, sampling: 20, objects: make(objectMap), filters: newFilterOrPanic(parent.VMMetricInclude, parent.VMMetricExclude), + simple: isSimple(parent.VMMetricInclude, parent.VMMetricExclude), + include: parent.VMMetricInclude, collectInstances: parent.VMInstances, getObjects: getVMs, parent: "host", @@ -176,6 +185,8 @@ func NewEndpoint(ctx context.Context, parent *VSphere, url *url.URL) (*Endpoint, sampling: 300, objects: make(objectMap), filters: newFilterOrPanic(parent.DatastoreMetricInclude, parent.DatastoreMetricExclude), + simple: isSimple(parent.DatastoreMetricInclude, parent.DatastoreMetricExclude), + include: parent.DatastoreMetricInclude, collectInstances: parent.DatastoreInstances, getObjects: getDatastores, parent: "", @@ -188,24 +199,6 @@ func NewEndpoint(ctx context.Context, parent *VSphere, url *url.URL) (*Endpoint, return &e, err } -func (m multiError) Error() string { - switch len(m) { - case 0: - return "No error recorded. Something is wrong!" - case 1: - return m[0].Error() - default: - s := "Multiple errors detected concurrently: " - for i, e := range m { - if i != 0 { - s += ", " - } - s += e.Error() - } - return s - } -} - func anythingEnabled(ex []string) bool { for _, s := range ex { if s == "*" { @@ -223,6 +216,18 @@ func newFilterOrPanic(include []string, exclude []string) filter.Filter { return f } +func isSimple(include []string, exclude []string) bool { + if len(exclude) > 0 || len(include) == 0 { + return false + } + for _, s := range include { + if strings.Contains(s, "*") { + return false + } + } + return true +} + func (e *Endpoint) startDiscovery(ctx context.Context) { e.discoveryTicker = time.NewTicker(e.Parent.ObjectDiscoveryInterval.Duration) go func() { @@ -359,8 +364,6 @@ func (e *Endpoint) discover(ctx context.Context) error { } log.Printf("D! [input.vsphere]: Discover new objects for %s", e.URL.Host) - - instInfoMux := sync.Mutex{} resourceKinds := make(map[string]resourceKind) dcNameCache := make(map[string]string) @@ -386,51 +389,11 @@ func (e *Endpoint) discover(ctx context.Context) error { // No need to collect metric metadata if resource type is not enabled if res.enabled { - // Get metric metadata and filter metrics - prob := 100.0 / float64(len(objects)) - log.Printf("D! 
[input.vsphere] Probability of sampling a resource: %f", prob) - wg := sync.WaitGroup{} - limiter := make(chan struct{}, e.Parent.DiscoverConcurrency) - for _, obj := range objects { - if rand.Float64() > prob { - continue - } - wg.Add(1) - go func(obj objectRef) { - defer wg.Done() - limiter <- struct{}{} - defer func() { - <-limiter - }() - metrics, err := e.getMetadata(ctx, obj, res.sampling) - if err != nil { - log.Printf("E! [input.vsphere]: Error while getting metric metadata. Discovery will be incomplete. Error: %s", err) - } - mMap := make(map[string]types.PerfMetricId) - for _, m := range metrics { - if m.Instance != "" && res.collectInstances { - m.Instance = "*" - } else { - m.Instance = "" - } - if res.filters.Match(metricNames[m.CounterId]) { - mMap[strconv.Itoa(int(m.CounterId))+"|"+m.Instance] = m - } - } - log.Printf("D! [input.vsphere] Found %d metrics for %s", len(mMap), obj.name) - instInfoMux.Lock() - defer instInfoMux.Unlock() - if len(mMap) > len(res.metrics) { - res.metrics = make(performance.MetricList, len(mMap)) - i := 0 - for _, m := range mMap { - res.metrics[i] = m - i++ - } - } - }(obj) + if res.simple { + e.simpleMetadataSelect(ctx, client, &res) + } else { + e.complexMetadataSelect(ctx, &res, objects, metricNames) } - wg.Wait() } res.objects = objects resourceKinds[k] = res @@ -460,6 +423,74 @@ func (e *Endpoint) discover(ctx context.Context) error { return nil } +func (e *Endpoint) simpleMetadataSelect(ctx context.Context, client *Client, res *resourceKind) { + log.Printf("D! [input.vsphere] Using fast metric metadata selection for %s", res.name) + m, err := client.CounterInfoByName(ctx) + if err != nil { + log.Printf("E! [input.vsphere]: Error while getting metric metadata. Discovery will be incomplete. Error: %s", err) + return + } + res.metrics = make(performance.MetricList, 0, len(res.include)) + for _, s := range res.include { + if pci, ok := m[s]; ok { + cnt := types.PerfMetricId{ + CounterId: pci.Key, + } + if res.collectInstances { + cnt.Instance = "*" + } else { + cnt.Instance = "" + } + res.metrics = append(res.metrics, cnt) + } else { + log.Printf("W! [input.vsphere] Metric name %s is unknown. Will not be collected", s) + } + } +} + +func (e *Endpoint) complexMetadataSelect(ctx context.Context, res *resourceKind, objects objectMap, metricNames map[int32]string) { + prob := 100.0 / float64(len(objects)) + log.Printf("D! [input.vsphere] Probability of sampling a resource: %f", prob) + instInfoMux := sync.Mutex{} + te := NewThrottledExecutor(e.Parent.DiscoverConcurrency) + for _, obj := range objects { + if rand.Float64() > prob { + continue + } + func(obj objectRef) { + te.Run(func() { + metrics, err := e.getMetadata(ctx, obj, res.sampling) + if err != nil { + log.Printf("E! [input.vsphere]: Error while getting metric metadata. Discovery will be incomplete. Error: %s", err) + } + mMap := make(map[string]types.PerfMetricId) + for _, m := range metrics { + if m.Instance != "" && res.collectInstances { + m.Instance = "*" + } else { + m.Instance = "" + } + if res.filters.Match(metricNames[m.CounterId]) { + mMap[strconv.Itoa(int(m.CounterId))+"|"+m.Instance] = m + } + } + log.Printf("D! 
[input.vsphere] Found %d metrics for %s", len(mMap), obj.name) + instInfoMux.Lock() + defer instInfoMux.Unlock() + if len(mMap) > len(res.metrics) { + res.metrics = make(performance.MetricList, len(mMap)) + i := 0 + for _, m := range mMap { + res.metrics[i] = m + i++ + } + } + }) + }(obj) + } + te.Wait() +} + func getDatacenters(ctx context.Context, client *Client, e *Endpoint, root *view.ContainerView) (objectMap, error) { var resources []mo.Datacenter err := client.ListResources(ctx, root, []string{"Datacenter"}, []string{"name", "parent"}, &resources) @@ -615,7 +646,15 @@ func (e *Endpoint) Collect(ctx context.Context, acc telegraf.Accumulator) error return nil } -func (e *Endpoint) chunker(ctx context.Context, f PushFunc, res *resourceKind, now time.Time, latest time.Time) { +// Workaround to make sure pqs is a copy of the loop variable and won't change. +func submitChunkJob(te *ThrottledExecutor, job func([]types.PerfQuerySpec), pqs []types.PerfQuerySpec) { + te.Run(func() { + job(pqs) + }) +} + +func (e *Endpoint) chunkify(ctx context.Context, res *resourceKind, now time.Time, latest time.Time, acc telegraf.Accumulator, job func([]types.PerfQuerySpec)) { + te := NewThrottledExecutor(e.Parent.CollectConcurrency) maxMetrics := e.Parent.MaxQueryMetrics if maxMetrics < 1 { maxMetrics = 1 @@ -664,17 +703,17 @@ func (e *Endpoint) chunker(ctx context.Context, f PushFunc, res *resourceKind, n // 1) We filled up the metric quota while processing the current resource // 2) We are at the last resource and have no more data to process. // 3) The query contains more than 100,000 individual metrics - if mr > 0 || (!res.realTime && metrics >= maxMetrics) || nRes >= e.Parent.MaxQueryObjects || len(pqs) > 100000 { + if mr > 0 || nRes >= e.Parent.MaxQueryObjects || len(pqs) > 100000 { log.Printf("D! [input.vsphere]: Queueing query: %d objects, %d metrics (%d remaining) of type %s for %s. Processed objects: %d. Total objects %d", len(pqs), metrics, mr, res.name, e.URL.Host, total+1, len(res.objects)) - // To prevent deadlocks, don't send work items if the context has been cancelled. + // Don't send work items if the context has been cancelled. if ctx.Err() == context.Canceled { return } - // Call push function - f(ctx, pqs) + // Run collection job + submitChunkJob(te, job, pqs) pqs = make([]types.PerfQuerySpec, 0, e.Parent.MaxQueryObjects) metrics = 0 nRes = 0 @@ -683,13 +722,16 @@ func (e *Endpoint) chunker(ctx context.Context, f PushFunc, res *resourceKind, n total++ nRes++ } - // There may be dangling stuff in the queue. Handle them + // Handle final partially filled chunk if len(pqs) > 0 { - // Call push function + // Run collection job log.Printf("D! [input.vsphere]: Queuing query: %d objects, %d metrics (0 remaining) of type %s for %s. Total objects %d (final chunk)", len(pqs), metrics, res.name, e.URL.Host, len(res.objects)) - f(ctx, pqs) + submitChunkJob(te, job, pqs) } + + // Wait for background collection to finish + te.Wait() } func (e *Endpoint) collectResource(ctx context.Context, resourceType string, acc telegraf.Accumulator) error { @@ -728,58 +770,35 @@ func (e *Endpoint) collectResource(ctx context.Context, resourceType string, acc var tsMux sync.Mutex latestSample := time.Time{} // Set up a worker pool for collecting chunk metrics - wp := NewWorkerPool(10) - wp.Run(ctx, func(ctx context.Context, in interface{}) interface{} { - chunk := in.([]types.PerfQuerySpec) - n, localLatest, err := e.collectChunk(ctx, chunk, resourceType, &res, acc) - log.Printf("D! 
[input.vsphere] CollectChunk for %s returned %d metrics", resourceType, n) - if err != nil { - return err - } - atomic.AddInt64(&count, int64(n)) - tsMux.Lock() - defer tsMux.Unlock() - if localLatest.After(latestSample) && !localLatest.IsZero() { - latestSample = localLatest - } - return nil - - }, e.Parent.CollectConcurrency) - - // Fill the input channel of the worker queue by running the chunking - // logic implemented in chunker() - wp.Fill(ctx, func(ctx context.Context, f PushFunc) { - e.chunker(ctx, f, &res, now, latest) - }) + e.chunkify(ctx, &res, now, latest, acc, + func(chunk []types.PerfQuerySpec) { + n, localLatest, err := e.collectChunk(ctx, chunk, &res, acc) + log.Printf("D! [input.vsphere] CollectChunk for %s returned %d metrics", resourceType, n) + if err != nil { + acc.AddError(errors.New("While collecting " + res.name + ": " + err.Error())) + } + atomic.AddInt64(&count, int64(n)) + tsMux.Lock() + defer tsMux.Unlock() + if localLatest.After(latestSample) && !localLatest.IsZero() { + latestSample = localLatest + } + }) - // Drain the pool. We're getting errors back. They should all be nil - var mux sync.Mutex - merr := make(multiError, 0) - wp.Drain(ctx, func(ctx context.Context, in interface{}) bool { - if in != nil { - mux.Lock() - defer mux.Unlock() - merr = append(merr, in.(error)) - return false - } - return true - }) log.Printf("D! [input.vsphere] Latest sample for %s set to %s", resourceType, latestSample) if !latestSample.IsZero() { e.lastColls[resourceType] = latestSample } sw.Stop() SendInternalCounterWithTags("gather_count", e.URL.Host, internalTags, count) - if len(merr) > 0 { - return merr - } return nil } -func (e *Endpoint) collectChunk(ctx context.Context, pqs []types.PerfQuerySpec, resourceType string, - res *resourceKind, acc telegraf.Accumulator) (int, time.Time, error) { +func (e *Endpoint) collectChunk(ctx context.Context, pqs []types.PerfQuerySpec, res *resourceKind, acc telegraf.Accumulator) (int, time.Time, error) { + log.Printf("D! [input.vsphere] Query for %s has %d QuerySpecs", res.name, len(pqs)) latestSample := time.Time{} count := 0 + resourceType := res.name prefix := "vsphere" + e.Parent.Separator + resourceType client, err := e.clientFactory.GetClient(ctx) diff --git a/plugins/inputs/vsphere/throttled_exec.go b/plugins/inputs/vsphere/throttled_exec.go new file mode 100644 index 0000000000000..83586dd4eeec0 --- /dev/null +++ b/plugins/inputs/vsphere/throttled_exec.go @@ -0,0 +1,35 @@ +package vsphere + +import "sync" + +// ThrottledExecutor provides a simple mechanism for running jobs in separate +// goroutines while limit the number of concurrent jobs running at any given time. +type ThrottledExecutor struct { + limiter chan struct{} + wg sync.WaitGroup +} + +// NewThrottledExecutor creates a new ThrottlesExecutor with a specified maximum +// number of concurrent jobs +func NewThrottledExecutor(limit int) *ThrottledExecutor { + return &ThrottledExecutor{limiter: make(chan struct{}, limit)} +} + +// Run schedules a job for execution as soon as possible while respecting the +// maximum concurrency limit. 
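// --- Illustrative sketch (not part of this patch) ---------------------------
// A hypothetical caller showing how the executor defined in this file is meant
// to be used: schedule many jobs, run at most four of them concurrently, then
// wait for all of them to finish. Only NewThrottledExecutor, Run and Wait come
// from the patch; everything else here is invented for illustration.

package vsphere

import "sync/atomic"

func exampleThrottledUse() int64 {
	var done int64
	te := NewThrottledExecutor(4)
	for i := 0; i < 20; i++ {
		te.Run(func() {
			atomic.AddInt64(&done, 1) // stand-in for collecting one metric chunk
		})
	}
	te.Wait() // blocks until all 20 jobs have completed
	return done
}
// ----------------------------------------------------------------------------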
+func (t *ThrottledExecutor) Run(job func()) { + t.wg.Add(1) + t.limiter <- struct{}{} + go func() { + defer func() { + <-t.limiter + }() + defer t.wg.Done() + job() + }() +} + +// Wait blocks until all scheduled jobs have finished +func (t *ThrottledExecutor) Wait() { + t.wg.Wait() +} diff --git a/plugins/inputs/vsphere/vsphere.go b/plugins/inputs/vsphere/vsphere.go index f0bb5dca99c38..23ce52ed56f7c 100644 --- a/plugins/inputs/vsphere/vsphere.go +++ b/plugins/inputs/vsphere/vsphere.go @@ -260,7 +260,6 @@ func (v *VSphere) Stop() { // Gather is the main data collection function called by the Telegraf core. It performs all // the data collection and writes all metrics into the Accumulator passed as an argument. func (v *VSphere) Gather(acc telegraf.Accumulator) error { - merr := make(multiError, 0) var wg sync.WaitGroup for _, ep := range v.endpoints { wg.Add(1) @@ -274,15 +273,11 @@ func (v *VSphere) Gather(acc telegraf.Accumulator) error { } if err != nil { acc.AddError(err) - merr = append(merr, err) } }(ep) } wg.Wait() - if len(merr) > 0 { - return merr - } return nil } diff --git a/plugins/inputs/vsphere/vsphere_test.go b/plugins/inputs/vsphere/vsphere_test.go index 4eb3d28f810e6..f87d93d0f4670 100644 --- a/plugins/inputs/vsphere/vsphere_test.go +++ b/plugins/inputs/vsphere/vsphere_test.go @@ -7,6 +7,8 @@ import ( "regexp" "sort" "strings" + "sync" + "sync/atomic" "testing" "time" @@ -205,29 +207,33 @@ func TestParseConfig(t *testing.T) { } func TestWorkerPool(t *testing.T) { - wp := NewWorkerPool(100) - ctx := context.Background() - wp.Run(ctx, func(ctx context.Context, p interface{}) interface{} { - return p.(int) * 2 - }, 10) - - n := 100000 - wp.Fill(ctx, func(ctx context.Context, f PushFunc) { - for i := 0; i < n; i++ { - f(ctx, i) - } - }) - results := make([]int, n) - i := 0 - wp.Drain(ctx, func(ctx context.Context, p interface{}) bool { - results[i] = p.(int) - i++ - return true - }) + max := int64(0) + ngr := int64(0) + n := 10000 + var mux sync.Mutex + results := make([]int, 0, n) + te := NewThrottledExecutor(5) + for i := 0; i < n; i++ { + func(i int) { + te.Run(func() { + atomic.AddInt64(&ngr, 1) + mux.Lock() + defer mux.Unlock() + results = append(results, i*2) + if ngr > max { + max = ngr + } + time.Sleep(100 * time.Microsecond) + atomic.AddInt64(&ngr, -1) + }) + }(i) + } + te.Wait() sort.Ints(results) for i := 0; i < n; i++ { - require.Equal(t, results[i], i*2) + require.Equal(t, results[i], i*2, "Some jobs didn't run") } + require.Equal(t, int64(5), max, "Wrong number of goroutines spawned") } func TestTimeout(t *testing.T) { diff --git a/plugins/inputs/vsphere/workerpool.go b/plugins/inputs/vsphere/workerpool.go deleted file mode 100644 index 6695735ce3a22..0000000000000 --- a/plugins/inputs/vsphere/workerpool.go +++ /dev/null @@ -1,119 +0,0 @@ -package vsphere - -import ( - "context" - "log" - "sync" -) - -// WorkerFunc is a function that is supposed to do the actual work -// of the WorkerPool. It is similar to the "map" portion of the -// map/reduce semantics, in that it takes a single value as an input, -// does some processing and returns a single result. -type WorkerFunc func(context.Context, interface{}) interface{} - -// PushFunc is called from a FillerFunc to push a workitem onto -// the input channel. Wraps some logic for gracefulk shutdowns. -type PushFunc func(context.Context, interface{}) bool - -// DrainerFunc represents a function used to "drain" the WorkerPool, -// i.e. pull out all the results generated by the workers and processing -// them. 
The DrainerFunc is called once per result produced. -// If the function returns false, the draining of the pool is aborted. -type DrainerFunc func(context.Context, interface{}) bool - -// FillerFunc represents a function for filling the WorkerPool with jobs. -// It is called once and is responsible for pushing jobs onto the supplied channel. -type FillerFunc func(context.Context, PushFunc) - -// WorkerPool implements a simple work pooling mechanism. It runs a predefined -// number of goroutines to process jobs. Jobs are inserted using the Fill call -// and results are retrieved through the Drain function. -type WorkerPool struct { - wg sync.WaitGroup - In chan interface{} - Out chan interface{} -} - -// NewWorkerPool creates a worker pool -func NewWorkerPool(bufsize int) *WorkerPool { - return &WorkerPool{ - In: make(chan interface{}, bufsize), - Out: make(chan interface{}, bufsize), - } -} - -func (w *WorkerPool) push(ctx context.Context, job interface{}) bool { - select { - case w.In <- job: - return true - case <-ctx.Done(): - return false - } -} - -func (w *WorkerPool) pushOut(ctx context.Context, result interface{}) bool { - select { - case w.Out <- result: - return true - case <-ctx.Done(): - return false - } -} - -// Run takes a WorkerFunc and runs it in 'n' goroutines. -func (w *WorkerPool) Run(ctx context.Context, f WorkerFunc, n int) bool { - w.wg.Add(1) - go func() { - defer w.wg.Done() - var localWg sync.WaitGroup - localWg.Add(n) - for i := 0; i < n; i++ { - go func() { - defer localWg.Done() - for { - select { - case job, ok := <-w.In: - if !ok { - return - } - w.pushOut(ctx, f(ctx, job)) - case <-ctx.Done(): - log.Printf("D! [input.vsphere]: Stop requested for worker pool. Exiting.") - return - } - } - }() - } - localWg.Wait() - close(w.Out) - }() - return ctx.Err() == nil -} - -// Fill runs a FillerFunc responsible for supplying work to the pool. You may only -// call Fill once. Calling it twice will panic. -func (w *WorkerPool) Fill(ctx context.Context, f FillerFunc) bool { - w.wg.Add(1) - go func() { - defer w.wg.Done() - f(ctx, w.push) - close(w.In) - }() - return true -} - -// Drain runs a DrainerFunc for each result generated by the workers. 
-func (w *WorkerPool) Drain(ctx context.Context, f DrainerFunc) bool { - w.wg.Add(1) - go func() { - defer w.wg.Done() - for result := range w.Out { - if !f(ctx, result) { - break - } - } - }() - w.wg.Wait() - return ctx.Err() != nil -} From 94c6fb6d838bec0433b86056193b8ec8f95c60d3 Mon Sep 17 00:00:00 2001 From: prydin Date: Wed, 5 Dec 2018 12:40:10 -0500 Subject: [PATCH 13/34] Changed cluster_instances default value to false, since true causes problems with some versions of vCenter --- plugins/inputs/vsphere/README.md | 6 ++--- plugins/inputs/vsphere/endpoint.go | 36 +++++++++++++++++++++--------- plugins/inputs/vsphere/vsphere.go | 4 ++-- 3 files changed, 31 insertions(+), 15 deletions(-) diff --git a/plugins/inputs/vsphere/README.md b/plugins/inputs/vsphere/README.md index 7ba323bc73e9e..4bccbb2c880e8 100644 --- a/plugins/inputs/vsphere/README.md +++ b/plugins/inputs/vsphere/README.md @@ -122,17 +122,17 @@ vm_metric_exclude = [ "*" ] ## Clusters # cluster_metric_include = [] ## if omitted or empty, all metrics are collected # cluster_metric_exclude = [] ## Nothing excluded by default - # cluster_instances = true ## true by default + # cluster_instances = false ## false by default ## Datastores # datastore_metric_include = [] ## if omitted or empty, all metrics are collected # datastore_metric_exclude = [] ## Nothing excluded by default - # datastore_instances = false ## false by default for Datastores only + # datastore_instances = false ## false by default ## Datacenters datacenter_metric_include = [] ## if omitted or empty, all metrics are collected datacenter_metric_exclude = [ "*" ] ## Datacenters are not collected by default. - # datacenter_instances = false ## false by default for Datastores only + # datacenter_instances = false ## false by default ## Plugin Settings ## separator character to use for measurement and field names (default: "_") diff --git a/plugins/inputs/vsphere/endpoint.go b/plugins/inputs/vsphere/endpoint.go index aba2a6ea08329..47d64156d4a35 100644 --- a/plugins/inputs/vsphere/endpoint.go +++ b/plugins/inputs/vsphere/endpoint.go @@ -26,11 +26,13 @@ import ( var isolateLUN = regexp.MustCompile(".*/([^/]+)/?$") -const metricLookback = 3 +const metricLookback = 3 // Number of time periods to look back at for non-realtime metrics -const rtMetricLookback = 3 +const rtMetricLookback = 3 // Number of time periods to look back at for realtime metrics -const maxSampleConst = 10 +const maxSampleConst = 10 // Absolute maximim number of samples regardless of period + +const maxMetadataSamples = 100 // Number of resources to sample for metric metadata // Endpoint is a high-level representation of a connected vCenter endpoint. It is backed by the lower // level Client type. @@ -449,14 +451,29 @@ func (e *Endpoint) simpleMetadataSelect(ctx context.Context, client *Client, res } func (e *Endpoint) complexMetadataSelect(ctx context.Context, res *resourceKind, objects objectMap, metricNames map[int32]string) { - prob := 100.0 / float64(len(objects)) - log.Printf("D! [input.vsphere] Probability of sampling a resource: %f", prob) - instInfoMux := sync.Mutex{} - te := NewThrottledExecutor(e.Parent.DiscoverConcurrency) + // We're only going to get metadata from maxMetadataSamples resources. If we have + // more resources than that, we pick maxMetadataSamples samples at random. 
+ sampledObjects := make([]objectRef, len(objects)) + i := 0 for _, obj := range objects { - if rand.Float64() > prob { - continue + sampledObjects[i] = obj + i++ + } + n := len(sampledObjects) + if n > maxMetadataSamples { + // Shuffle samples into the maxMetadatSamples positions + for i := 0; i < maxMetadataSamples; i++ { + j := int(rand.Int31n(int32(i + 1))) + t := sampledObjects[i] + sampledObjects[i] = sampledObjects[j] + sampledObjects[j] = t } + sampledObjects = sampledObjects[0:maxMetadataSamples] + } + + instInfoMux := sync.Mutex{} + te := NewThrottledExecutor(e.Parent.DiscoverConcurrency) + for _, obj := range sampledObjects { func(obj objectRef) { te.Run(func() { metrics, err := e.getMetadata(ctx, obj, res.sampling) @@ -560,7 +577,6 @@ func getVMs(ctx context.Context, client *Client, e *Endpoint, root *view.Contain m := make(objectMap) for _, r := range resources { if r.Runtime.PowerState != "poweredOn" { - log.Printf("D! [input.vsphere] Skipped powered off VM: %s", r.Name) continue } guest := "unknown" diff --git a/plugins/inputs/vsphere/vsphere.go b/plugins/inputs/vsphere/vsphere.go index 23ce52ed56f7c..13186634fb51d 100644 --- a/plugins/inputs/vsphere/vsphere.go +++ b/plugins/inputs/vsphere/vsphere.go @@ -155,7 +155,7 @@ var sampleConfig = ` ## Clusters # cluster_metric_include = [] ## if omitted or empty, all metrics are collected # cluster_metric_exclude = [] ## Nothing excluded by default - # cluster_instances = true ## true by default + # cluster_instances = false ## false by default ## Datastores # datastore_metric_include = [] ## if omitted or empty, all metrics are collected @@ -286,7 +286,7 @@ func init() { return &VSphere{ Vcenters: []string{}, - ClusterInstances: true, + ClusterInstances: false, ClusterMetricInclude: nil, ClusterMetricExclude: nil, HostInstances: true, From 646c59609327c5ceea31575cfa7fff6b1367fc39 Mon Sep 17 00:00:00 2001 From: prydin Date: Wed, 5 Dec 2018 16:28:38 -0500 Subject: [PATCH 14/34] Fixed broken test cases --- plugins/inputs/vsphere/throttled_exec.go | 3 +++ plugins/inputs/vsphere/vsphere_test.go | 4 +++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/plugins/inputs/vsphere/throttled_exec.go b/plugins/inputs/vsphere/throttled_exec.go index 83586dd4eeec0..a7e07bedb7c67 100644 --- a/plugins/inputs/vsphere/throttled_exec.go +++ b/plugins/inputs/vsphere/throttled_exec.go @@ -12,6 +12,9 @@ type ThrottledExecutor struct { // NewThrottledExecutor creates a new ThrottlesExecutor with a specified maximum // number of concurrent jobs func NewThrottledExecutor(limit int) *ThrottledExecutor { + if limit == 0 { + panic("Limit must be > 0") + } return &ThrottledExecutor{limiter: make(chan struct{}, limit)} } diff --git a/plugins/inputs/vsphere/vsphere_test.go b/plugins/inputs/vsphere/vsphere_test.go index f87d93d0f4670..607493f5c0a5d 100644 --- a/plugins/inputs/vsphere/vsphere_test.go +++ b/plugins/inputs/vsphere/vsphere_test.go @@ -177,6 +177,8 @@ func defaultVSphere() *VSphere { ObjectDiscoveryInterval: internal.Duration{Duration: time.Second * 300}, Timeout: internal.Duration{Duration: time.Second * 20}, ForceDiscoverOnInit: true, + DiscoverConcurrency: 1, + CollectConcurrency: 1, } } @@ -251,7 +253,7 @@ func TestTimeout(t *testing.T) { require.NoError(t, v.Start(nil)) // We're not using the Accumulator, so it can be nil. 
defer v.Stop() err = v.Gather(&acc) - require.NotNil(t, err, "Error should not be nil here") + require.True(t, len(acc.Errors) > 0, "Errors should not be empty here") // The accumulator must contain exactly one error and it must be a deadline exceeded. require.Equal(t, 1, len(acc.Errors)) From 9ab5b945f5cdd5023f9da56956fb0fb71fdb5077 Mon Sep 17 00:00:00 2001 From: prydin Date: Thu, 6 Dec 2018 11:02:12 -0500 Subject: [PATCH 15/34] Reverted accidental change to wavefront.go --- plugins/outputs/wavefront/wavefront.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/outputs/wavefront/wavefront.go b/plugins/outputs/wavefront/wavefront.go index df1d42158dc07..ef36d1804045f 100644 --- a/plugins/outputs/wavefront/wavefront.go +++ b/plugins/outputs/wavefront/wavefront.go @@ -122,11 +122,11 @@ func (w *Wavefront) Write(metrics []telegraf.Metric) error { return fmt.Errorf("Wavefront: TCP connect fail %s", err.Error()) } defer connection.Close() + connection.SetWriteDeadline(time.Now().Add(5 * time.Second)) for _, m := range metrics { for _, metricPoint := range buildMetrics(m, w) { metricLine := formatMetricPoint(metricPoint, w) - connection.SetWriteDeadline(time.Now().Add(30 * time.Second)) _, err := connection.Write([]byte(metricLine)) if err != nil { return fmt.Errorf("Wavefront: TCP writing error %s", err.Error()) From 466b1399869b4b6c2d851f36a19d7fb7e1f47b50 Mon Sep 17 00:00:00 2001 From: prydin Date: Fri, 7 Dec 2018 18:30:53 -0500 Subject: [PATCH 16/34] Added check for value indices --- plugins/inputs/vsphere/endpoint.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/plugins/inputs/vsphere/endpoint.go b/plugins/inputs/vsphere/endpoint.go index 47d64156d4a35..c68f18bd47d69 100644 --- a/plugins/inputs/vsphere/endpoint.go +++ b/plugins/inputs/vsphere/endpoint.go @@ -861,8 +861,13 @@ func (e *Endpoint) collectChunk(ctx context.Context, pqs []types.PerfQuerySpec, avg := float64(0) nValues := 0 - //log.Printf("D! [input.vsphere] %s %d samples", name, len(v.Value)) for idx, sample := range em.SampleInfo { + // According to the docs, SampleInfo and Value should have the same length, but we've seen corrupted + // data coming back with missing values. Take care of that gracefully! + if idx >= len(v.Value) { + log.Printf("D! 
[input.vsphere] len(SampleInfo)>len(Value) %d > %d", len(em.SampleInfo), len(v.Value)) + break + } value := float64(v.Value[idx]) if value < 0 { continue From bd3fe0d0773fa9824dc7f4054e8738c92e67dc9d Mon Sep 17 00:00:00 2001 From: prydin Date: Mon, 10 Dec 2018 16:57:23 -0500 Subject: [PATCH 17/34] More robust panic handling --- Gopkg.lock | 6 +++--- Gopkg.toml | 2 +- plugins/inputs/vsphere/endpoint.go | 15 +++++++++++++-- plugins/inputs/vsphere/throttled_exec.go | 8 ++++++-- plugins/inputs/vsphere/vsphere.go | 1 + plugins/inputs/vsphere/vsphere_test.go | 3 ++- 6 files changed, 26 insertions(+), 9 deletions(-) diff --git a/Gopkg.lock b/Gopkg.lock index a2df3c81dfc55..5cfc176af9258 100644 --- a/Gopkg.lock +++ b/Gopkg.lock @@ -1043,7 +1043,7 @@ version = "v1.0.0" [[projects]] - digest = "1:f9fe29bf856d49f9a51d6001588cb5ee5d65c8a7ff5e8b0dd5423c3a510f0833" + digest = "1:6af52ce6dae9a912aa3113f247a63cd82599760ddc328a6721c3ef0426d31ca2" name = "github.com/vmware/govmomi" packages = [ ".", @@ -1069,8 +1069,8 @@ "vim25/xml", ] pruneopts = "" - revision = "e3a01f9611c32b2362366434bcd671516e78955d" - version = "v0.18.0" + revision = "3617f28d167d448f93f282a867870f109516d2a5" + version = "v0.19.0" [[projects]] branch = "master" diff --git a/Gopkg.toml b/Gopkg.toml index 791e265e82da4..835bae18aa233 100644 --- a/Gopkg.toml +++ b/Gopkg.toml @@ -228,7 +228,7 @@ [[constraint]] name = "github.com/vmware/govmomi" - version = "0.18.0" + version = "0.19.0" [[constraint]] name = "github.com/Azure/go-autorest" diff --git a/plugins/inputs/vsphere/endpoint.go b/plugins/inputs/vsphere/endpoint.go index c68f18bd47d69..8efb9c6397345 100644 --- a/plugins/inputs/vsphere/endpoint.go +++ b/plugins/inputs/vsphere/endpoint.go @@ -233,6 +233,7 @@ func isSimple(include []string, exclude []string) bool { func (e *Endpoint) startDiscovery(ctx context.Context) { e.discoveryTicker = time.NewTicker(e.Parent.ObjectDiscoveryInterval.Duration) go func() { + defer HandlePanic() for { select { case <-e.discoveryTicker.C: @@ -270,7 +271,10 @@ func (e *Endpoint) init(ctx context.Context) error { } else { // Otherwise, just run it in the background. We'll probably have an incomplete first metric // collection this way. - go e.initalDiscovery(ctx) + go func() { + defer HandlePanic() + e.initalDiscovery(ctx) + }() } } e.initialized = true @@ -621,6 +625,7 @@ func (e *Endpoint) Close() { // Collect runs a round of data collections as specified in the configuration. func (e *Endpoint) Collect(ctx context.Context, acc telegraf.Accumulator) error { + // If we never managed to do a discovery, collection will be a no-op. Therefore, // we need to check that a connection is available, or the collection will // silently fail. @@ -647,6 +652,7 @@ func (e *Endpoint) Collect(ctx context.Context, acc telegraf.Accumulator) error if res.enabled { wg.Add(1) go func(k string) { + defer HandlePanic() defer wg.Done() err := e.collectResource(ctx, k, acc) if err != nil { @@ -785,9 +791,14 @@ func (e *Endpoint) collectResource(ctx context.Context, resourceType string, acc var tsMux sync.Mutex latestSample := time.Time{} - // Set up a worker pool for collecting chunk metrics + + // Divide workload into chunks and process them concurrently e.chunkify(ctx, &res, now, latest, acc, func(chunk []types.PerfQuerySpec) { + + // Handle panics gracefully + defer HandlePanicWithAcc(acc) + n, localLatest, err := e.collectChunk(ctx, chunk, &res, acc) log.Printf("D! 
[input.vsphere] CollectChunk for %s returned %d metrics", resourceType, n) if err != nil { diff --git a/plugins/inputs/vsphere/throttled_exec.go b/plugins/inputs/vsphere/throttled_exec.go index a7e07bedb7c67..15f66c5ab5d7e 100644 --- a/plugins/inputs/vsphere/throttled_exec.go +++ b/plugins/inputs/vsphere/throttled_exec.go @@ -1,6 +1,8 @@ package vsphere -import "sync" +import ( + "sync" +) // ThrottledExecutor provides a simple mechanism for running jobs in separate // goroutines while limit the number of concurrent jobs running at any given time. @@ -24,10 +26,12 @@ func (t *ThrottledExecutor) Run(job func()) { t.wg.Add(1) t.limiter <- struct{}{} go func() { + // Last resort panic handler. + defer HandlePanic() + defer t.wg.Done() defer func() { <-t.limiter }() - defer t.wg.Done() job() }() } diff --git a/plugins/inputs/vsphere/vsphere.go b/plugins/inputs/vsphere/vsphere.go index 13186634fb51d..5ceb14a7fd238 100644 --- a/plugins/inputs/vsphere/vsphere.go +++ b/plugins/inputs/vsphere/vsphere.go @@ -264,6 +264,7 @@ func (v *VSphere) Gather(acc telegraf.Accumulator) error { for _, ep := range v.endpoints { wg.Add(1) go func(endpoint *Endpoint) { + defer HandlePanicWithAcc(acc) defer wg.Done() err := endpoint.Collect(context.Background(), acc) if err == context.Canceled { diff --git a/plugins/inputs/vsphere/vsphere_test.go b/plugins/inputs/vsphere/vsphere_test.go index 607493f5c0a5d..1f05f36bf9b41 100644 --- a/plugins/inputs/vsphere/vsphere_test.go +++ b/plugins/inputs/vsphere/vsphere_test.go @@ -308,7 +308,8 @@ func TestAll(t *testing.T) { var acc testutil.Accumulator v := defaultVSphere() v.Vcenters = []string{s.URL.String()} - v.Start(nil) // We're not using the Accumulator, so it can be nil. + v.Start(&acc) defer v.Stop() require.NoError(t, v.Gather(&acc)) + require.Equal(t, 0, len(acc.Errors), fmt.Sprintf("Errors found: %s", acc.Errors)) } From 3ede8cc6ae8e31f7f39814bd8ec2ae3f4a27cf28 Mon Sep 17 00:00:00 2001 From: prydin Date: Mon, 10 Dec 2018 16:58:20 -0500 Subject: [PATCH 18/34] Added panic_handler.go --- plugins/inputs/vsphere/panic_handler.go | 28 +++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 plugins/inputs/vsphere/panic_handler.go diff --git a/plugins/inputs/vsphere/panic_handler.go b/plugins/inputs/vsphere/panic_handler.go new file mode 100644 index 0000000000000..1d6242510a972 --- /dev/null +++ b/plugins/inputs/vsphere/panic_handler.go @@ -0,0 +1,28 @@ +package vsphere + +import ( + "errors" + "fmt" + "log" + + "github.com/influxdata/telegraf" +) + +func HandlePanicWithAcc(acc telegraf.Accumulator) { + if p := recover(); p != nil { + switch p.(type) { + case string: + acc.AddError(errors.New(p.(string))) + case error: + acc.AddError(p.(error)) + default: + acc.AddError(fmt.Errorf("Unknown panic: %s", p)) + } + } +} + +func HandlePanic() { + if p := recover(); p != nil { + log.Printf("E! 
[input.vsphere] PANIC (recovered): %s", p) + } +} From 957762d31ee4e5e6250e8c7b353285c27a3b970e Mon Sep 17 00:00:00 2001 From: prydin Date: Mon, 10 Dec 2018 17:27:42 -0500 Subject: [PATCH 19/34] Reverted to govmomi 0.18.0 --- Gopkg.toml | 2 +- plugins/inputs/vsphere/endpoint.go | 10 ---------- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/Gopkg.toml b/Gopkg.toml index 835bae18aa233..791e265e82da4 100644 --- a/Gopkg.toml +++ b/Gopkg.toml @@ -228,7 +228,7 @@ [[constraint]] name = "github.com/vmware/govmomi" - version = "0.19.0" + version = "0.18.0" [[constraint]] name = "github.com/Azure/go-autorest" diff --git a/plugins/inputs/vsphere/endpoint.go b/plugins/inputs/vsphere/endpoint.go index 8efb9c6397345..6caebb4f77412 100644 --- a/plugins/inputs/vsphere/endpoint.go +++ b/plugins/inputs/vsphere/endpoint.go @@ -86,16 +86,6 @@ type objectRef struct { dcname string } -type metricQRequest struct { - res *resourceKind - obj objectRef -} - -type metricQResponse struct { - obj objectRef - metrics *performance.MetricList -} - func (e *Endpoint) getParent(obj *objectRef, res *resourceKind) (*objectRef, bool) { if pKind, ok := e.resourceKinds[res.parent]; ok { if p, ok := pKind.objects[obj.parentRef.Value]; ok { From f563cd8138a10036ea4441fc290835e8c67d109a Mon Sep 17 00:00:00 2001 From: prydin Date: Tue, 11 Dec 2018 10:05:38 -0500 Subject: [PATCH 20/34] Exclude tests requiring VPX simulator on 32-bit arch --- Gopkg.toml | 2 +- plugins/inputs/vsphere/vsphere_test.go | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/Gopkg.toml b/Gopkg.toml index 791e265e82da4..835bae18aa233 100644 --- a/Gopkg.toml +++ b/Gopkg.toml @@ -228,7 +228,7 @@ [[constraint]] name = "github.com/vmware/govmomi" - version = "0.18.0" + version = "0.19.0" [[constraint]] name = "github.com/Azure/go-autorest" diff --git a/plugins/inputs/vsphere/vsphere_test.go b/plugins/inputs/vsphere/vsphere_test.go index 1f05f36bf9b41..0da0681cc0a93 100644 --- a/plugins/inputs/vsphere/vsphere_test.go +++ b/plugins/inputs/vsphere/vsphere_test.go @@ -11,6 +11,7 @@ import ( "sync/atomic" "testing" "time" + "unsafe" "github.com/influxdata/telegraf/internal" itls "github.com/influxdata/telegraf/internal/tls" @@ -239,6 +240,13 @@ func TestWorkerPool(t *testing.T) { } func TestTimeout(t *testing.T) { + // Don't run test on 32-bit machines due to bug in simulator. + // https://github.com/vmware/govmomi/issues/1330 + var i int + if unsafe.Sizeof(i) < 8 { + return + } + m, s, err := createSim() if err != nil { t.Fatal(err) @@ -261,6 +269,12 @@ func TestTimeout(t *testing.T) { } func TestMaxQuery(t *testing.T) { + // Don't run test on 32-bit machines due to bug in simulator. + // https://github.com/vmware/govmomi/issues/1330 + var i int + if unsafe.Sizeof(i) < 8 { + return + } m, s, err := createSim() if err != nil { t.Fatal(err) @@ -298,6 +312,13 @@ func TestMaxQuery(t *testing.T) { } func TestAll(t *testing.T) { + // Don't run test on 32-bit machines due to bug in simulator. 
+ // https://github.com/vmware/govmomi/issues/1330 + var i int + if unsafe.Sizeof(i) < 8 { + return + } + m, s, err := createSim() if err != nil { t.Fatal(err) From f71b46639011337ce66ede7c527164565668752e Mon Sep 17 00:00:00 2001 From: prydin Date: Thu, 13 Dec 2018 13:18:16 -0500 Subject: [PATCH 21/34] Finalized merge from prydin-scalability --- plugins/inputs/vsphere/endpoint.go | 15 +- plugins/inputs/vsphere/finder.go | 2 +- plugins/inputs/vsphere/vsphere_test.go | 199 +++++++++++-------------- 3 files changed, 97 insertions(+), 119 deletions(-) diff --git a/plugins/inputs/vsphere/endpoint.go b/plugins/inputs/vsphere/endpoint.go index 8eb410075da38..963502bc22612 100644 --- a/plugins/inputs/vsphere/endpoint.go +++ b/plugins/inputs/vsphere/endpoint.go @@ -38,8 +38,6 @@ const maxMetadataSamples = 100 // Number of resources to sample for metric metad type Endpoint struct { Parent *VSphere URL *url.URL - lastColls map[string]time.Time - lastColl time.Time resourceKinds map[string]resourceKind hwMarks *TSCache lun2ds map[string]string @@ -67,6 +65,7 @@ type resourceKind struct { simple bool metrics performance.MetricList parent string + lastColl time.Time } type metricEntry struct { @@ -102,7 +101,6 @@ func NewEndpoint(ctx context.Context, parent *VSphere, url *url.URL) (*Endpoint, e := Endpoint{ URL: url, Parent: parent, - lastColls: make(map[string]time.Time), hwMarks: NewTSCache(1 * time.Hour), lun2ds: make(map[string]string), initialized: false, @@ -786,8 +784,8 @@ func (e *Endpoint) collectResource(ctx context.Context, resourceType string, acc if err != nil { return err } - latest, hasLatest := e.lastColls[resourceType] - if hasLatest { + latest := res.lastColl + if !latest.IsZero() { elapsed := now.Sub(latest).Seconds() + 5.0 // Allow 5 second jitter. log.Printf("D! [input.vsphere]: Latest: %s, elapsed: %f, resource: %s", latest, elapsed, resourceType) if !res.realTime && elapsed < float64(res.sampling) { @@ -799,7 +797,6 @@ func (e *Endpoint) collectResource(ctx context.Context, resourceType string, acc } else { latest = now.Add(time.Duration(-res.sampling) * time.Second) } - e.lastColl = now internalTags := map[string]string{"resourcetype": resourceType} sw := NewStopwatchWithTags("gather_duration", e.URL.Host, internalTags) @@ -834,7 +831,7 @@ func (e *Endpoint) collectResource(ctx context.Context, resourceType string, acc log.Printf("D! [input.vsphere] Latest sample for %s set to %s", resourceType, latestSample) if !latestSample.IsZero() { - e.lastColls[resourceType] = latestSample + res.lastColl = latestSample } sw.Stop() SendInternalCounterWithTags("gather_count", e.URL.Host, internalTags, count) @@ -858,6 +855,10 @@ func (e *Endpoint) collectChunk(ctx context.Context, pqs []types.PerfQuerySpec, return count, latestSample, err } + for _, pq := range pqs { + log.Printf("D! 
[input.vsphere] StartTime: %s, EndTime: %s", *pq.StartTime, *pq.EndTime) + } + ems, err := client.QueryMetrics(ctx, pqs) if err != nil { return count, latestSample, err diff --git a/plugins/inputs/vsphere/finder.go b/plugins/inputs/vsphere/finder.go index 732c020046814..7e7f74a1d0665 100644 --- a/plugins/inputs/vsphere/finder.go +++ b/plugins/inputs/vsphere/finder.go @@ -233,7 +233,7 @@ func init() { addFields = map[string][]string{ "HostSystem": {"parent"}, - "VirtualMachine": {"runtime.host", "config.guestId", "config.uuid"}, + "VirtualMachine": {"runtime.host", "config.guestId", "config.uuid", "runtime.powerState"}, "Datastore": {"parent", "info"}, "ClusterComputeResource": {"parent"}, "Datacenter": {"parent"}, diff --git a/plugins/inputs/vsphere/vsphere_test.go b/plugins/inputs/vsphere/vsphere_test.go index 8681b36edfaad..6121b89ba08dc 100644 --- a/plugins/inputs/vsphere/vsphere_test.go +++ b/plugins/inputs/vsphere/vsphere_test.go @@ -4,8 +4,6 @@ import ( "context" "crypto/tls" "fmt" - "net/url" - "os" "regexp" "sort" "strings" @@ -117,61 +115,89 @@ func defaultVSphere() *VSphere { ClusterMetricExclude: nil, ClusterInclude: []string{"/**"}, HostMetricInclude: []string{ - "cpu.ready.summation.delta.millisecond", - "cpu.latency.average.rate.percent", - "cpu.coreUtilization.average.rate.percent", - "mem.usage.average.absolute.percent", - "mem.swapinRate.average.rate.kiloBytesPerSecond", - "mem.state.latest.absolute.number", - "mem.latency.average.absolute.percent", - "mem.vmmemctl.average.absolute.kiloBytes", - "disk.read.average.rate.kiloBytesPerSecond", - "disk.write.average.rate.kiloBytesPerSecond", - "disk.numberReadAveraged.average.rate.number", - "disk.numberWriteAveraged.average.rate.number", - "disk.deviceReadLatency.average.absolute.millisecond", - "disk.deviceWriteLatency.average.absolute.millisecond", - "disk.totalReadLatency.average.absolute.millisecond", - "disk.totalWriteLatency.average.absolute.millisecond", - "storageAdapter.read.average.rate.kiloBytesPerSecond", - "storageAdapter.write.average.rate.kiloBytesPerSecond", - "storageAdapter.numberReadAveraged.average.rate.number", - "storageAdapter.numberWriteAveraged.average.rate.number", - "net.errorsRx.summation.delta.number", - "net.errorsTx.summation.delta.number", - "net.bytesRx.average.rate.kiloBytesPerSecond", - "net.bytesTx.average.rate.kiloBytesPerSecond", - "cpu.used.summation.delta.millisecond", - "cpu.usage.average.rate.percent", - "cpu.utilization.average.rate.percent", - "cpu.wait.summation.delta.millisecond", - "cpu.idle.summation.delta.millisecond", - "cpu.readiness.average.rate.percent", - "cpu.costop.summation.delta.millisecond", - "cpu.swapwait.summation.delta.millisecond", - "mem.swapoutRate.average.rate.kiloBytesPerSecond", - "disk.kernelReadLatency.average.absolute.millisecond", - "disk.kernelWriteLatency.average.absolute.millisecond"}, + "cpu.coreUtilization.average", + "cpu.costop.summation", + "cpu.demand.average", + "cpu.idle.summation", + "cpu.latency.average", + "cpu.readiness.average", + "cpu.ready.summation", + "cpu.swapwait.summation", + "cpu.usage.average", + "cpu.usagemhz.average", + "cpu.used.summation", + "cpu.utilization.average", + "cpu.wait.summation", + "disk.deviceReadLatency.average", + "disk.deviceWriteLatency.average", + "disk.kernelReadLatency.average", + "disk.kernelWriteLatency.average", + "disk.numberReadAveraged.average", + "disk.numberWriteAveraged.average", + "disk.read.average", + "disk.totalReadLatency.average", + "disk.totalWriteLatency.average", + "disk.write.average", + 
"mem.active.average", + "mem.latency.average", + "mem.state.latest", + "mem.swapin.average", + "mem.swapinRate.average", + "mem.swapout.average", + "mem.swapoutRate.average", + "mem.totalCapacity.average", + "mem.usage.average", + "mem.vmmemctl.average", + "net.bytesRx.average", + "net.bytesTx.average", + "net.droppedRx.summation", + "net.droppedTx.summation", + "net.errorsRx.summation", + "net.errorsTx.summation", + "net.usage.average", + "power.power.average", + "storageAdapter.numberReadAveraged.average", + "storageAdapter.numberWriteAveraged.average", + "storageAdapter.read.average", + "storageAdapter.write.average", + "sys.uptime.latest"}, HostMetricExclude: nil, HostInclude: []string{"/**"}, VMMetricInclude: []string{ - "cpu.ready.summation.delta.millisecond", - "mem.swapinRate.average.rate.kiloBytesPerSecond", - "virtualDisk.numberReadAveraged.average.rate.number", - "virtualDisk.numberWriteAveraged.average.rate.number", - "virtualDisk.totalReadLatency.average.absolute.millisecond", - "virtualDisk.totalWriteLatency.average.absolute.millisecond", - "virtualDisk.readOIO.latest.absolute.number", - "virtualDisk.writeOIO.latest.absolute.number", - "net.bytesRx.average.rate.kiloBytesPerSecond", - "net.bytesTx.average.rate.kiloBytesPerSecond", - "net.droppedRx.summation.delta.number", - "net.droppedTx.summation.delta.number", - "cpu.run.summation.delta.millisecond", - "cpu.used.summation.delta.millisecond", - "mem.swapoutRate.average.rate.kiloBytesPerSecond", - "virtualDisk.read.average.rate.kiloBytesPerSecond", - "virtualDisk.write.average.rate.kiloBytesPerSecond"}, + "cpu.demand.average", + "cpu.idle.summation", + "cpu.latency.average", + "cpu.readiness.average", + "cpu.ready.summation", + "cpu.run.summation", + "cpu.usagemhz.average", + "cpu.used.summation", + "cpu.wait.summation", + "mem.active.average", + "mem.granted.average", + "mem.latency.average", + "mem.swapin.average", + "mem.swapinRate.average", + "mem.swapout.average", + "mem.swapoutRate.average", + "mem.usage.average", + "mem.vmmemctl.average", + "net.bytesRx.average", + "net.bytesTx.average", + "net.droppedRx.summation", + "net.droppedTx.summation", + "net.usage.average", + "power.power.average", + "virtualDisk.numberReadAveraged.average", + "virtualDisk.numberWriteAveraged.average", + "virtualDisk.read.average", + "virtualDisk.readOIO.latest", + "virtualDisk.throughput.usage.average", + "virtualDisk.totalReadLatency.average", + "virtualDisk.totalWriteLatency.average", + "virtualDisk.write.average", + "virtualDisk.writeOIO.latest", + "sys.uptime.latest"}, VMMetricExclude: nil, VMInclude: []string{"/**"}, DatastoreMetricInclude: []string{ @@ -185,6 +211,7 @@ func defaultVSphere() *VSphere { ClientConfig: itls.ClientConfig{InsecureSkipVerify: true}, MaxQueryObjects: 256, + MaxQueryMetrics: 256, ObjectDiscoveryInterval: internal.Duration{Duration: time.Second * 300}, Timeout: internal.Duration{Duration: time.Second * 20}, ForceDiscoverOnInit: true, @@ -219,7 +246,7 @@ func TestParseConfig(t *testing.T) { require.NotNil(t, tab) } -func TestWorkerPool(t *testing.T) { +func TestThrottledExecutor(t *testing.T) { max := int64(0) ngr := int64(0) n := 10000 @@ -250,21 +277,6 @@ func TestWorkerPool(t *testing.T) { } func TestTimeout(t *testing.T) { -<<<<<<< HEAD - v := defaultVSphere() - url := os.Getenv("TGF_TEST_VSPHERE_URL") - if url != "" { - m, s, err := createSim() - if err != nil { - t.Fatal(err) - } - defer m.Remove() - defer s.Close() - url = s.URL.String() - } else { - v.Username = os.Getenv("TGF_TEST_VSPHERE_USER") - 
v.Password = os.Getenv("TGF_TEST_VSPHERE_PASSWORD") -======= // Don't run test on 32-bit machines due to bug in simulator. // https://github.com/vmware/govmomi/issues/1330 var i int @@ -275,21 +287,18 @@ func TestTimeout(t *testing.T) { m, s, err := createSim() if err != nil { t.Fatal(err) ->>>>>>> origin/prydin-scale-improvement } + defer m.Remove() + defer s.Close() + v := defaultVSphere() var acc testutil.Accumulator - v.Vcenters = []string{url} + v.Vcenters = []string{s.URL.String()} v.Timeout = internal.Duration{Duration: 1 * time.Nanosecond} require.NoError(t, v.Start(nil)) // We're not using the Accumulator, so it can be nil. defer v.Stop() -<<<<<<< HEAD - err := v.Gather(&acc) - require.NotNil(t, err, "Error should not be nil here") -======= - err = v.Gather(&acc) + require.NoError(t, v.Gather(&acc)) require.True(t, len(acc.Errors) > 0, "Errors should not be empty here") ->>>>>>> origin/prydin-scale-improvement // The accumulator must contain exactly one error and it must be a deadline exceeded. require.Equal(t, 1, len(acc.Errors)) @@ -435,39 +444,6 @@ func TestFinder(t *testing.T) { require.Equal(t, 4, len(vm)) } -func TestExternalFinder(t *testing.T) { - os.Setenv("TGF_TEST_VSPHERE_URL", "https://10.198.15.245/sdk") - os.Setenv("TGF_TEST_VSPHERE_USER", "administrator@vsphere.local") - os.Setenv("TGF_TEST_VSPHERE_PASSWORD", "Admin!23") - - v := defaultVSphere() - vu := os.Getenv("TGF_TEST_VSPHERE_URL") - if vu == "" { - t.Skip("No external vCenter specified. Skipping") - } else { - v.Username = os.Getenv("TGF_TEST_VSPHERE_USER") - v.Password = os.Getenv("TGF_TEST_VSPHERE_PASSWORD") - } - - ctx := context.Background() - u, err := url.Parse(vu) - require.NoError(t, err, "Error parsing URL") - c, err := NewClient(ctx, u, v) - require.NoError(t, err, "Error connecting to vCenter") - - f := Finder{c} - - vm := []mo.VirtualMachine{} - err = f.Find(ctx, "VirtualMachine", "/**", &vm) - require.NoError(t, err) - require.True(t, len(vm) > 0) - - dc := []mo.Datacenter{} - err = f.Find(ctx, "Datacenter", "/*", &dc) - require.NoError(t, err) - require.Equal(t, 1, len(dc)) -} - func TestAll(t *testing.T) { // Don't run test on 32-bit machines due to bug in simulator. // https://github.com/vmware/govmomi/issues/1330 @@ -490,4 +466,5 @@ func TestAll(t *testing.T) { defer v.Stop() require.NoError(t, v.Gather(&acc)) require.Equal(t, 0, len(acc.Errors), fmt.Sprintf("Errors found: %s", acc.Errors)) + require.True(t, len(acc.Metrics) > 0, "No metrics were collected") } From 60b4f1741fd64abf35af18a082c7e4ec3ff29bc1 Mon Sep 17 00:00:00 2001 From: prydin Date: Tue, 18 Dec 2018 19:31:09 -0500 Subject: [PATCH 22/34] Changed handling of late samples --- plugins/inputs/vsphere/endpoint.go | 137 ++++++++++++++++--------- plugins/inputs/vsphere/vsphere_test.go | 39 +++++++ 2 files changed, 127 insertions(+), 49 deletions(-) diff --git a/plugins/inputs/vsphere/endpoint.go b/plugins/inputs/vsphere/endpoint.go index 963502bc22612..e8c99b4ffee61 100644 --- a/plugins/inputs/vsphere/endpoint.go +++ b/plugins/inputs/vsphere/endpoint.go @@ -65,6 +65,7 @@ type resourceKind struct { simple bool metrics performance.MetricList parent string + latestSample time.Time lastColl time.Time } @@ -735,6 +736,14 @@ func (e *Endpoint) chunkify(ctx context.Context, res *resourceKind, now time.Tim pq.StartTime = &start pq.EndTime = &now + // Make sure endtime is always after start time. We may occasionally see samples from the future + // returned from vCenter. This is presumably due to time drift between vCenter and EXSi nodes. 
+ if pq.StartTime.After(*pq.EndTime) { + log.Printf("D! [input.vsphere] Future sample. Res: %s, StartTime: %s, EndTime: %s, Now: %s", pq.Entity, *pq.StartTime, *pq.EndTime, now) + end := start.Add(time.Second) + pq.EndTime = &end + } + pqs = append(pqs, pq) mr -= mc metrics += mc @@ -784,7 +793,17 @@ func (e *Endpoint) collectResource(ctx context.Context, resourceType string, acc if err != nil { return err } - latest := res.lastColl + + // Estimate the interval at which we're invoked. Use local time (not server time) + // since this is about how we got invoked locally. + localNow := time.Now() + estInterval := time.Duration(time.Minute) + if !res.lastColl.IsZero() { + estInterval = localNow.Sub(res.lastColl).Truncate(time.Duration(res.sampling) * time.Second) + } + log.Printf("D! [input.vsphere] Interval estimated to %s", estInterval) + + latest := res.latestSample if !latest.IsZero() { elapsed := now.Sub(latest).Seconds() + 5.0 // Allow 5 second jitter. log.Printf("D! [input.vsphere]: Latest: %s, elapsed: %f, resource: %s", latest, elapsed, resourceType) @@ -816,7 +835,7 @@ func (e *Endpoint) collectResource(ctx context.Context, resourceType string, acc // Handle panics gracefully defer HandlePanicWithAcc(acc) - n, localLatest, err := e.collectChunk(ctx, chunk, &res, acc) + n, localLatest, err := e.collectChunk(ctx, chunk, &res, acc, now, estInterval) log.Printf("D! [input.vsphere] CollectChunk for %s returned %d metrics", resourceType, n) if err != nil { acc.AddError(errors.New("While collecting " + res.name + ": " + err.Error())) @@ -831,14 +850,49 @@ func (e *Endpoint) collectResource(ctx context.Context, resourceType string, acc log.Printf("D! [input.vsphere] Latest sample for %s set to %s", resourceType, latestSample) if !latestSample.IsZero() { - res.lastColl = latestSample + res.latestSample = latestSample } sw.Stop() SendInternalCounterWithTags("gather_count", e.URL.Host, internalTags, count) return nil } -func (e *Endpoint) collectChunk(ctx context.Context, pqs []types.PerfQuerySpec, res *resourceKind, acc telegraf.Accumulator) (int, time.Time, error) { +func alignSamples(info []types.PerfSampleInfo, values []int64, interval time.Duration) ([]types.PerfSampleInfo, []float64) { + rInfo := make([]types.PerfSampleInfo, 0, len(info)) + rValues := make([]float64, 0, len(values)) + bi := 1.0 + var lastBucket time.Time + for idx := range info { + // According to the docs, SampleInfo and Value should have the same length, but we've seen corrupted + // data coming back with missing values. Take care of that gracefully! + if idx >= len(values) { + log.Printf("D! [input.vsphere] len(SampleInfo)>len(Value) %d > %d", len(info), len(values)) + break + } + v := float64(values[idx]) + if v < 0 { + continue + } + ts := info[idx].Timestamp + roundedTs := ts.Truncate(interval) + + // Are we still working on the same bucket? + if roundedTs == lastBucket { + bi++ + p := len(rValues) - 1 + rValues[p] = ((bi-1)/bi)*float64(rValues[p]) + v/bi + } else { + rValues = append(rValues, v) + rInfo = append(rInfo, info[idx]) + bi = 1.0 + lastBucket = roundedTs + } + } + //log.Printf("D! [input.vsphere] Aligned samples: %d collapsed into %d", len(info), len(rInfo)) + return rInfo, rValues +} + +func (e *Endpoint) collectChunk(ctx context.Context, pqs []types.PerfQuerySpec, res *resourceKind, acc telegraf.Accumulator, now time.Time, interval time.Duration) (int, time.Time, error) { log.Printf("D! 
[input.vsphere] Query for %s has %d QuerySpecs", res.name, len(pqs)) latestSample := time.Time{} count := 0 @@ -855,10 +909,6 @@ func (e *Endpoint) collectChunk(ctx context.Context, pqs []types.PerfQuerySpec, return count, latestSample, err } - for _, pq := range pqs { - log.Printf("D! [input.vsphere] StartTime: %s, EndTime: %s", *pq.StartTime, *pq.EndTime) - } - ems, err := client.QueryMetrics(ctx, pqs) if err != nil { return count, latestSample, err @@ -891,62 +941,51 @@ func (e *Endpoint) collectChunk(ctx context.Context, pqs []types.PerfQuerySpec, } e.populateTags(&objectRef, resourceType, res, t, &v) - avg := float64(0) nValues := 0 - for idx, sample := range em.SampleInfo { + alignedInfo, alignedValues := alignSamples(em.SampleInfo, v.Value, interval) // TODO: Estimate interval + + for idx, sample := range alignedInfo { // According to the docs, SampleInfo and Value should have the same length, but we've seen corrupted // data coming back with missing values. Take care of that gracefully! - if idx >= len(v.Value) { - log.Printf("D! [input.vsphere] len(SampleInfo)>len(Value) %d > %d", len(em.SampleInfo), len(v.Value)) + if idx >= len(alignedValues) { + log.Printf("D! [input.vsphere] len(SampleInfo)>len(Value) %d > %d", len(alignedInfo), len(alignedValues)) break } - value := float64(v.Value[idx]) - if value < 0 { - continue - } ts := sample.Timestamp if ts.After(latestSample) { latestSample = ts } - avg += float64(value) nValues++ + + // Organize the metrics into a bucket per measurement. + mn, fn := e.makeMetricIdentifier(prefix, name) + bKey := mn + " " + v.Instance + " " + strconv.FormatInt(ts.UnixNano(), 10) + bucket, found := buckets[bKey] + if !found { + bucket = metricEntry{name: mn, ts: ts, fields: make(map[string]interface{}), tags: t} + buckets[bKey] = bucket + } + + // Percentage values must be scaled down by 100. + info, ok := metricInfo[name] + if !ok { + log.Printf("E! [input.vsphere]: Could not determine unit for %s. Skipping", name) + } + v := alignedValues[idx] + if info.UnitInfo.GetElementDescription().Key == "percent" { + bucket.fields[fn] = float64(v) / 100.0 + } else { + bucket.fields[fn] = v + } + count++ + + // Update highwater marks + e.hwMarks.Put(moid, ts) } if nValues == 0 { log.Printf("D! [input.vsphere]: Missing value for: %s, %s", name, objectRef.name) continue } - - // If we're catching up with metrics arriving late, calculate the average - // of them and pick the midpoint timestamp. This is a reasonable way of - // filling in missed collections that doesn't cause us to deliver metrics - // faster than the interval. - avg /= float64(nValues) - midTs := em.SampleInfo[len(em.SampleInfo)/2].Timestamp - - // Organize the metrics into a bucket per measurement. - mn, fn := e.makeMetricIdentifier(prefix, name) - bKey := mn + " " + v.Instance + " " + strconv.FormatInt(midTs.UnixNano(), 10) - bucket, found := buckets[bKey] - if !found { - bucket = metricEntry{name: mn, ts: midTs, fields: make(map[string]interface{}), tags: t} - buckets[bKey] = bucket - } - - // Percentage values must be scaled down by 100. - info, ok := metricInfo[name] - if !ok { - log.Printf("E! [input.vsphere]: Could not determine unit for %s. Skipping", name) - } - if info.UnitInfo.GetElementDescription().Key == "percent" { - bucket.fields[fn] = float64(avg) / 100.0 - } else { - bucket.fields[fn] = avg - } - count++ - - // Update highwater marks - e.hwMarks.Put(moid, latestSample) - } // We've iterated through all the metrics and collected buckets for each // measurement name. Now emit them! 
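The bucket average above is maintained incrementally: the k-th value v that lands in a bucket updates the stored value as mean_k = ((k-1)/k)*mean_{k-1} + v/k, which equals the plain arithmetic mean of the bucket without retaining every raw sample. A minimal stand-alone sketch of that update follows; the runningMean type is illustrative only and not part of this patch:

```go
package main

import "fmt"

// runningMean mirrors the ((bi-1)/bi)*prev + v/bi update applied per bucket in
// alignSamples: it tracks the arithmetic mean without storing every sample.
type runningMean struct {
	n    float64
	mean float64
}

func (r *runningMean) add(v float64) {
	r.n++
	r.mean = ((r.n-1)/r.n)*r.mean + v/r.n
}

func main() {
	// Three 20-second samples of 1, 2 and 3 collapsing into one 60-second bucket.
	rm := runningMean{}
	for _, v := range []float64{1, 2, 3} {
		rm.add(v)
	}
	fmt.Println(rm.mean) // prints 2
}
```

With 20-second samples and a 60-second estimated interval, raw values of 1, 2 and 3 therefore collapse into a single bucket of 2, which is exactly what the test added below verifies.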
diff --git a/plugins/inputs/vsphere/vsphere_test.go b/plugins/inputs/vsphere/vsphere_test.go index 6121b89ba08dc..cf3bee1cdb4e5 100644 --- a/plugins/inputs/vsphere/vsphere_test.go +++ b/plugins/inputs/vsphere/vsphere_test.go @@ -234,6 +234,45 @@ func createSim() (*simulator.Model, *simulator.Server, error) { return model, s, nil } +func TestAlignMetrics(t *testing.T) { + // 20s to 60s aligmentn of all 1.0 + now := time.Now().Truncate(60 * time.Second) + n := 30 + info := make([]types.PerfSampleInfo, n) + values := make([]int64, n) + for i := 0; i < n; i++ { + info[i] = types.PerfSampleInfo{ + Timestamp: now.Add(time.Duration(20*i) * time.Second), + Interval: 20, + } + values[i] = 1 + } + newInfo, newValues := alignSamples(info, values, 60) + require.Equal(t, n/3, len(newInfo), "Aligned infos have wrong size") + require.Equal(t, n/3, len(newValues), "Aligned values have wrong size") + for _, v := range newValues { + require.Equal(t, 1.0, v, "Aligned value should be 1") + } + + // 20s to 60s of 1,2,3,1,2,3... (should average to 2) + n = 30 + info = make([]types.PerfSampleInfo, n) + values = make([]int64, n) + for i := 0; i < n; i++ { + info[i] = types.PerfSampleInfo{ + Timestamp: now.Add(time.Duration(20*i) * time.Second), + Interval: 20, + } + values[i] = int64(i%3 + 1) + } + newInfo, newValues = alignSamples(info, values, 60) + require.Equal(t, n/3, len(newInfo), "Aligned infos have wrong size") + require.Equal(t, n/3, len(newValues), "Aligned values have wrong size") + for _, v := range newValues { + require.Equal(t, 2.0, v, "Aligned value should be 2") + } +} + func TestParseConfig(t *testing.T) { v := VSphere{} c := v.SampleConfig() From 3e8c058d2b507bf4a299c1196e68acb5e4554311 Mon Sep 17 00:00:00 2001 From: prydin Date: Wed, 19 Dec 2018 12:04:11 -0500 Subject: [PATCH 23/34] Align all timestamps to interval boundary --- plugins/inputs/vsphere/endpoint.go | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/plugins/inputs/vsphere/endpoint.go b/plugins/inputs/vsphere/endpoint.go index e8c99b4ffee61..5a4aeacca8ba2 100644 --- a/plugins/inputs/vsphere/endpoint.go +++ b/plugins/inputs/vsphere/endpoint.go @@ -383,7 +383,9 @@ func (e *Endpoint) discover(ctx context.Context) error { resType: res.vcName, paths: res.paths} - objects, err := res.getObjects(ctx, e, &rf) + ctx1, cancel1 := context.WithTimeout(ctx, e.Parent.Timeout.Duration) + defer cancel1() + objects, err := res.getObjects(ctx1, e, &rf) if err != nil { return err } @@ -883,7 +885,11 @@ func alignSamples(info []types.PerfSampleInfo, values []int64, interval time.Dur rValues[p] = ((bi-1)/bi)*float64(rValues[p]) + v/bi } else { rValues = append(rValues, v) - rInfo = append(rInfo, info[idx]) + roundedInfo := types.PerfSampleInfo{ + Timestamp: roundedTs, + Interval: info[idx].Interval, + } + rInfo = append(rInfo, roundedInfo) bi = 1.0 lastBucket = roundedTs } From 6547068c2e06077b8ae254218e2ad7569b773967 Mon Sep 17 00:00:00 2001 From: prydin Date: Wed, 19 Dec 2018 15:56:48 -0500 Subject: [PATCH 24/34] Added documentation for inventory paths --- plugins/inputs/vsphere/README.md | 48 ++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/plugins/inputs/vsphere/README.md b/plugins/inputs/vsphere/README.md index 4bccbb2c880e8..42b9dc5330341 100644 --- a/plugins/inputs/vsphere/README.md +++ b/plugins/inputs/vsphere/README.md @@ -27,6 +27,7 @@ vm_metric_exclude = [ "*" ] ## VMs ## Typical VM metrics (if omitted or empty, all metrics are collected) + # vm_include = [ "/*/vm/**"] # Inventory path 
to VMs to collect (by default all are collected) vm_metric_include = [ "cpu.demand.average", "cpu.idle.summation", @@ -68,6 +69,7 @@ vm_metric_exclude = [ "*" ] ## Hosts ## Typical host metrics (if omitted or empty, all metrics are collected) + # host_include = [ "/*/host/**"] # Inventory path to hosts to collect (by default all are collected) host_metric_include = [ "cpu.coreUtilization.average", "cpu.costop.summation", @@ -120,16 +122,19 @@ vm_metric_exclude = [ "*" ] # host_instances = true ## true by default ## Clusters + # cluster_include = [ "/*/host/**"] # Inventory path to clusters to collect (by default all are collected) # cluster_metric_include = [] ## if omitted or empty, all metrics are collected # cluster_metric_exclude = [] ## Nothing excluded by default # cluster_instances = false ## false by default ## Datastores + # cluster_include = [ "/*/datastore/**"] # Inventory path to datastores to collect (by default all are collected) # datastore_metric_include = [] ## if omitted or empty, all metrics are collected # datastore_metric_exclude = [] ## Nothing excluded by default # datastore_instances = false ## false by default ## Datacenters + # datacenter_include = [ "/*/host/**"] # Inventory path to clusters to collect (by default all are collected) datacenter_metric_include = [] ## if omitted or empty, all metrics are collected datacenter_metric_exclude = [ "*" ] ## Datacenters are not collected by default. # datacenter_instances = false ## false by default @@ -196,6 +201,49 @@ For setting up concurrency, modify `collect_concurrency` and `discover_concurren # discover_concurrency = 1 ``` +### Inventory Paths +Resources to be monitored can be selected using Inventory Paths. This treats the vSphere inventory as a tree structure similar +to a file system. A vSphere inventory has a structure similar to this: + +``` + ++-DC0 # Virtual datacenter + +-datastore # Datastore folder (created by system) + | +-Datastore1 + +-host # Host folder (created by system) + | +-Cluster1 + | | +-Host1 + | | | +-VM1 + | | | +-VM2 + | | | +-hadoop1 + | +-Host2 # Dummy cluster created for non-clustered host + | | +-Host2 + | | | +-VM3 + | | | +-VM4 + +-vm # VM folder (created by system) + | +-VM1 + | +-VM2 + | +-Folder1 + | | +-hadoop1 + | | +-NestedFolder1 + | | | +-VM3 + | | | +-VM4 +``` + +#### Using Inventory Paths +Using familiar UNIX-style paths, one could select e.g. VM2 with the path ```/DC0/vm/VM2```. + +Often, we want to select a group of resource, such as all the VMs in a folder. We could use the path ```/DC0/vm/Folder1/*``` for that. + +Another possibility is to select objects using a partial name, such as ```/DC0/vm/Folder1/hadoop*``` yielding all vms in Folder1 with a name starting with "hadoop". + +Finally, due to the arbitrary nesting of the folder structure, we need a "recursive wildcard" for traversing multiple folders. We use the "**" symbol for that. If we want to look for a VM with a name starting with "hadoop" in any folder, we could use the following path: ```/DC0/vm/**/hadoop*``` + +#### Multiple paths to VMs +As we can see from the example tree above, VMs appear both in its on folder under the datacenter, as well as under the hosts. This is useful when you like to select VMs on a specific host. For example, ```/DC0/host/Cluster1/Host1/hadoop*``` selects all VMs with a name starting with "hadoop" that are running on Host1. + +We can extend this to looking at a cluster level: ```/DC0/host/Cluster1/*/hadoop*```. This selects any VM matching "hadoop*" on any host in Cluster1. 
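These inventory paths are resolved by the plugin's Finder. As a rough sketch of how a recursive wildcard expands against a live connection, the helper below is illustrative only and assumes an already connected *Client named c, constructed the same way the plugin's tests do:

```go
package vsphere

import (
	"context"
	"fmt"

	"github.com/vmware/govmomi/vim25/mo"
)

// listHadoopVMs is an illustrative helper (not part of the plugin): it resolves
// the README's example path and prints the names of the matching VMs.
func listHadoopVMs(ctx context.Context, c *Client) error {
	f := Finder{c}
	vms := []mo.VirtualMachine{}
	// Any VM whose name starts with "hadoop", in any folder under DC0's vm folder.
	if err := f.Find(ctx, "VirtualMachine", "/DC0/vm/**/hadoop*", &vms); err != nil {
		return err
	}
	for _, vm := range vms {
		fmt.Println(vm.Name)
	}
	return nil
}
```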
+ ## Measurements & Fields - Cluster Stats From 444292020fb3e9357b1fe9a83a3d054f4c9ea64d Mon Sep 17 00:00:00 2001 From: prydin Date: Wed, 19 Dec 2018 17:24:07 -0500 Subject: [PATCH 25/34] Changed logtags from [input.vsphere] to [inputs.vsphere] --- plugins/inputs/vsphere/client.go | 20 ++++---- plugins/inputs/vsphere/endpoint.go | 64 ++++++++++++------------- plugins/inputs/vsphere/finder.go | 2 +- plugins/inputs/vsphere/panic_handler.go | 2 +- plugins/inputs/vsphere/tscache.go | 2 +- plugins/inputs/vsphere/vsphere.go | 6 +-- 6 files changed, 48 insertions(+), 48 deletions(-) diff --git a/plugins/inputs/vsphere/client.go b/plugins/inputs/vsphere/client.go index 37f4b2c31776e..f2f21b5ffe00c 100644 --- a/plugins/inputs/vsphere/client.go +++ b/plugins/inputs/vsphere/client.go @@ -74,11 +74,11 @@ func (cf *ClientFactory) GetClient(ctx context.Context) (*Client, error) { ctx1, cancel1 := context.WithTimeout(ctx, cf.parent.Timeout.Duration) defer cancel1() if _, err := methods.GetCurrentTime(ctx1, cf.client.Client); err != nil { - log.Printf("I! [input.vsphere]: Client session seems to have time out. Reauthenticating!") + log.Printf("I! [inputs.vsphere]: Client session seems to have time out. Reauthenticating!") ctx2, cancel2 := context.WithTimeout(ctx, cf.parent.Timeout.Duration) defer cancel2() if cf.client.Client.SessionManager.Login(ctx2, url.UserPassword(cf.parent.Username, cf.parent.Password)) != nil { - log.Printf("W! [input.vsphere]: Client reauthentication failed.") + log.Printf("W! [inputs.vsphere]: Client reauthentication failed.") return nil, err } } @@ -103,7 +103,7 @@ func NewClient(ctx context.Context, u *url.URL, vs *VSphere) (*Client, error) { u.User = url.UserPassword(vs.Username, vs.Password) } - log.Printf("D! [input.vsphere]: Creating client: %s", u.Host) + log.Printf("D! [inputs.vsphere]: Creating client: %s", u.Host) soapClient := soap.NewClient(u, tlsCfg.InsecureSkipVerify) // Add certificate if we have it. Use it to log us in. @@ -174,9 +174,9 @@ func NewClient(ctx context.Context, u *url.URL, vs *VSphere) (*Client, error) { if err != nil { return nil, err } - log.Printf("D! [input.vsphere] vCenter says max_query_metrics should be %d", n) + log.Printf("D! [inputs.vsphere] vCenter says max_query_metrics should be %d", n) if n < vs.MaxQueryMetrics { - log.Printf("W! [input.vsphere] Configured max_query_metrics is %d, but server limits it to %d. Reducing.", vs.MaxQueryMetrics, n) + log.Printf("W! [inputs.vsphere] Configured max_query_metrics is %d, but server limits it to %d. Reducing.", vs.MaxQueryMetrics, n) vs.MaxQueryMetrics = n } return client, nil @@ -200,7 +200,7 @@ func (c *Client) close() { defer cancel() if c.Client != nil { if err := c.Client.Logout(ctx); err != nil { - log.Printf("E! [input.vsphere]: Error during logout: %s", err) + log.Printf("E! [inputs.vsphere]: Error during logout: %s", err) } } }) @@ -229,7 +229,7 @@ func (c *Client) GetMaxQueryMetrics(ctx context.Context) (int, error) { if s, ok := res[0].GetOptionValue().Value.(string); ok { v, err := strconv.Atoi(s) if err == nil { - log.Printf("D! [input.vsphere] vCenter maxQueryMetrics is defined: %d", v) + log.Printf("D! [inputs.vsphere] vCenter maxQueryMetrics is defined: %d", v) if v == -1 { // Whatever the server says, we never ask for more metrics than this. return absoluteMaxMetrics, nil @@ -240,17 +240,17 @@ func (c *Client) GetMaxQueryMetrics(ctx context.Context) (int, error) { // Fall through version-based inference if value isn't usable } } else { - log.Println("D! 
[input.vsphere] Option query for maxQueryMetrics failed. Using default") + log.Println("D! [inputs.vsphere] Option query for maxQueryMetrics failed. Using default") } // No usable maxQueryMetrics setting. Infer based on version ver := c.Client.Client.ServiceContent.About.Version parts := strings.Split(ver, ".") if len(parts) < 2 { - log.Printf("W! [input.vsphere] vCenter returned an invalid version string: %s. Using default query size=64", ver) + log.Printf("W! [inputs.vsphere] vCenter returned an invalid version string: %s. Using default query size=64", ver) return 64, nil } - log.Printf("D! [input.vsphere] vCenter version is: %s", ver) + log.Printf("D! [inputs.vsphere] vCenter version is: %s", ver) major, err := strconv.Atoi(parts[0]) if err != nil { return 0, err diff --git a/plugins/inputs/vsphere/endpoint.go b/plugins/inputs/vsphere/endpoint.go index 5a4aeacca8ba2..b51fecb33cf55 100644 --- a/plugins/inputs/vsphere/endpoint.go +++ b/plugins/inputs/vsphere/endpoint.go @@ -239,10 +239,10 @@ func (e *Endpoint) startDiscovery(ctx context.Context) { case <-e.discoveryTicker.C: err := e.discover(ctx) if err != nil && err != context.Canceled { - log.Printf("E! [input.vsphere]: Error in discovery for %s: %v", e.URL.Host, err) + log.Printf("E! [inputs.vsphere]: Error in discovery for %s: %v", e.URL.Host, err) } case <-ctx.Done(): - log.Printf("D! [input.vsphere]: Exiting discovery goroutine for %s", e.URL.Host) + log.Printf("D! [inputs.vsphere]: Exiting discovery goroutine for %s", e.URL.Host) e.discoveryTicker.Stop() return } @@ -253,7 +253,7 @@ func (e *Endpoint) startDiscovery(ctx context.Context) { func (e *Endpoint) initalDiscovery(ctx context.Context) { err := e.discover(ctx) if err != nil && err != context.Canceled { - log.Printf("E! [input.vsphere]: Error in discovery for %s: %v", e.URL.Host, err) + log.Printf("E! [inputs.vsphere]: Error in discovery for %s: %v", e.URL.Host, err) } e.startDiscovery(ctx) } @@ -266,7 +266,7 @@ func (e *Endpoint) init(ctx context.Context) error { // goroutine without waiting for it. This will probably cause us to report an empty // dataset on the first collection, but it solves the issue of the first collection timing out. if e.Parent.ForceDiscoverOnInit { - log.Printf("D! [input.vsphere]: Running initial discovery and waiting for it to finish") + log.Printf("D! [inputs.vsphere]: Running initial discovery and waiting for it to finish") e.initalDiscovery(ctx) } else { // Otherwise, just run it in the background. We'll probably have an incomplete first metric @@ -330,7 +330,7 @@ func (e *Endpoint) getDatacenterName(ctx context.Context, client *Client, cache defer cancel1() err := o.Properties(ctx1, here, []string{"parent", "name"}, &result) if err != nil { - log.Printf("W! [input.vsphere]: Error while resolving parent. Assuming no parent exists. Error: %s", err) + log.Printf("W! [inputs.vsphere]: Error while resolving parent. Assuming no parent exists. Error: %s", err) break } if result.Reference().Type == "Datacenter" { @@ -339,7 +339,7 @@ func (e *Endpoint) getDatacenterName(ctx context.Context, client *Client, cache break } if result.Parent == nil { - log.Printf("D! [input.vsphere]: No parent found for %s (ascending from %s)", here.Reference(), r.Reference()) + log.Printf("D! [inputs.vsphere]: No parent found for %s (ascending from %s)", here.Reference(), r.Reference()) break } here = result.Parent.Reference() @@ -369,13 +369,13 @@ func (e *Endpoint) discover(ctx context.Context) error { return err } - log.Printf("D! 
[input.vsphere]: Discover new objects for %s", e.URL.Host) + log.Printf("D! [inputs.vsphere]: Discover new objects for %s", e.URL.Host) resourceKinds := make(map[string]resourceKind) dcNameCache := make(map[string]string) // Populate resource objects, and endpoint instance info. for k, res := range e.resourceKinds { - log.Printf("D! [input.vsphere] Discovering resources for %s", res.name) + log.Printf("D! [inputs.vsphere] Discovering resources for %s", res.name) // Need to do this for all resource types even if they are not enabled if res.enabled || k != "vm" { rf := ResourceFilter{ @@ -437,10 +437,10 @@ func (e *Endpoint) discover(ctx context.Context) error { } func (e *Endpoint) simpleMetadataSelect(ctx context.Context, client *Client, res *resourceKind) { - log.Printf("D! [input.vsphere] Using fast metric metadata selection for %s", res.name) + log.Printf("D! [inputs.vsphere] Using fast metric metadata selection for %s", res.name) m, err := client.CounterInfoByName(ctx) if err != nil { - log.Printf("E! [input.vsphere]: Error while getting metric metadata. Discovery will be incomplete. Error: %s", err) + log.Printf("E! [inputs.vsphere]: Error while getting metric metadata. Discovery will be incomplete. Error: %s", err) return } res.metrics = make(performance.MetricList, 0, len(res.include)) @@ -456,7 +456,7 @@ func (e *Endpoint) simpleMetadataSelect(ctx context.Context, client *Client, res } res.metrics = append(res.metrics, cnt) } else { - log.Printf("W! [input.vsphere] Metric name %s is unknown. Will not be collected", s) + log.Printf("W! [inputs.vsphere] Metric name %s is unknown. Will not be collected", s) } } } @@ -489,7 +489,7 @@ func (e *Endpoint) complexMetadataSelect(ctx context.Context, res *resourceKind, te.Run(func() { metrics, err := e.getMetadata(ctx, obj, res.sampling) if err != nil { - log.Printf("E! [input.vsphere]: Error while getting metric metadata. Discovery will be incomplete. Error: %s", err) + log.Printf("E! [inputs.vsphere]: Error while getting metric metadata. Discovery will be incomplete. Error: %s", err) } mMap := make(map[string]types.PerfMetricId) for _, m := range metrics { @@ -502,7 +502,7 @@ func (e *Endpoint) complexMetadataSelect(ctx context.Context, res *resourceKind, mMap[strconv.Itoa(int(m.CounterId))+"|"+m.Instance] = m } } - log.Printf("D! [input.vsphere] Found %d metrics for %s", len(mMap), obj.name) + log.Printf("D! [inputs.vsphere] Found %d metrics for %s", len(mMap), obj.name) instInfoMux.Lock() defer instInfoMux.Unlock() if len(mMap) > len(res.metrics) { @@ -561,7 +561,7 @@ func getClusters(ctx context.Context, e *Endpoint, filter *ResourceFilter) (obje defer cancel3() err = o.Properties(ctx3, *r.Parent, []string{"parent"}, &folder) if err != nil { - log.Printf("W! [input.vsphere] Error while getting folder parent: %e", err) + log.Printf("W! [inputs.vsphere] Error while getting folder parent: %e", err) p = nil } else { pp := folder.Parent.Reference() @@ -741,7 +741,7 @@ func (e *Endpoint) chunkify(ctx context.Context, res *resourceKind, now time.Tim // Make sure endtime is always after start time. We may occasionally see samples from the future // returned from vCenter. This is presumably due to time drift between vCenter and EXSi nodes. if pq.StartTime.After(*pq.EndTime) { - log.Printf("D! [input.vsphere] Future sample. Res: %s, StartTime: %s, EndTime: %s, Now: %s", pq.Entity, *pq.StartTime, *pq.EndTime, now) + log.Printf("D! [inputs.vsphere] Future sample. 
Res: %s, StartTime: %s, EndTime: %s, Now: %s", pq.Entity, *pq.StartTime, *pq.EndTime, now) end := start.Add(time.Second) pq.EndTime = &end } @@ -755,7 +755,7 @@ func (e *Endpoint) chunkify(ctx context.Context, res *resourceKind, now time.Tim // 2) We are at the last resource and have no more data to process. // 3) The query contains more than 100,000 individual metrics if mr > 0 || nRes >= e.Parent.MaxQueryObjects || len(pqs) > 100000 { - log.Printf("D! [input.vsphere]: Queueing query: %d objects, %d metrics (%d remaining) of type %s for %s. Processed objects: %d. Total objects %d", + log.Printf("D! [inputs.vsphere]: Queueing query: %d objects, %d metrics (%d remaining) of type %s for %s. Processed objects: %d. Total objects %d", len(pqs), metrics, mr, res.name, e.URL.Host, total+1, len(res.objects)) // Don't send work items if the context has been cancelled. @@ -776,7 +776,7 @@ func (e *Endpoint) chunkify(ctx context.Context, res *resourceKind, now time.Tim // Handle final partially filled chunk if len(pqs) > 0 { // Run collection job - log.Printf("D! [input.vsphere]: Queuing query: %d objects, %d metrics (0 remaining) of type %s for %s. Total objects %d (final chunk)", + log.Printf("D! [inputs.vsphere]: Queuing query: %d objects, %d metrics (0 remaining) of type %s for %s. Total objects %d (final chunk)", len(pqs), metrics, res.name, e.URL.Host, len(res.objects)) submitChunkJob(te, job, pqs) } @@ -803,15 +803,15 @@ func (e *Endpoint) collectResource(ctx context.Context, resourceType string, acc if !res.lastColl.IsZero() { estInterval = localNow.Sub(res.lastColl).Truncate(time.Duration(res.sampling) * time.Second) } - log.Printf("D! [input.vsphere] Interval estimated to %s", estInterval) + log.Printf("D! [inputs.vsphere] Interval estimated to %s", estInterval) latest := res.latestSample if !latest.IsZero() { elapsed := now.Sub(latest).Seconds() + 5.0 // Allow 5 second jitter. - log.Printf("D! [input.vsphere]: Latest: %s, elapsed: %f, resource: %s", latest, elapsed, resourceType) + log.Printf("D! [inputs.vsphere]: Latest: %s, elapsed: %f, resource: %s", latest, elapsed, resourceType) if !res.realTime && elapsed < float64(res.sampling) { // No new data would be available. We're outta here! - log.Printf("D! [input.vsphere]: Sampling period for %s of %d has not elapsed on %s", + log.Printf("D! [inputs.vsphere]: Sampling period for %s of %d has not elapsed on %s", resourceType, res.sampling, e.URL.Host) return nil } @@ -822,7 +822,7 @@ func (e *Endpoint) collectResource(ctx context.Context, resourceType string, acc internalTags := map[string]string{"resourcetype": resourceType} sw := NewStopwatchWithTags("gather_duration", e.URL.Host, internalTags) - log.Printf("D! [input.vsphere]: Collecting metrics for %d objects of type %s for %s", + log.Printf("D! [inputs.vsphere]: Collecting metrics for %d objects of type %s for %s", len(res.objects), resourceType, e.URL.Host) count := int64(0) @@ -838,7 +838,7 @@ func (e *Endpoint) collectResource(ctx context.Context, resourceType string, acc defer HandlePanicWithAcc(acc) n, localLatest, err := e.collectChunk(ctx, chunk, &res, acc, now, estInterval) - log.Printf("D! [input.vsphere] CollectChunk for %s returned %d metrics", resourceType, n) + log.Printf("D! [inputs.vsphere] CollectChunk for %s returned %d metrics", resourceType, n) if err != nil { acc.AddError(errors.New("While collecting " + res.name + ": " + err.Error())) } @@ -850,7 +850,7 @@ func (e *Endpoint) collectResource(ctx context.Context, resourceType string, acc } }) - log.Printf("D! 
[input.vsphere] Latest sample for %s set to %s", resourceType, latestSample) + log.Printf("D! [inputs.vsphere] Latest sample for %s set to %s", resourceType, latestSample) if !latestSample.IsZero() { res.latestSample = latestSample } @@ -868,7 +868,7 @@ func alignSamples(info []types.PerfSampleInfo, values []int64, interval time.Dur // According to the docs, SampleInfo and Value should have the same length, but we've seen corrupted // data coming back with missing values. Take care of that gracefully! if idx >= len(values) { - log.Printf("D! [input.vsphere] len(SampleInfo)>len(Value) %d > %d", len(info), len(values)) + log.Printf("D! [inputs.vsphere] len(SampleInfo)>len(Value) %d > %d", len(info), len(values)) break } v := float64(values[idx]) @@ -894,12 +894,12 @@ func alignSamples(info []types.PerfSampleInfo, values []int64, interval time.Dur lastBucket = roundedTs } } - //log.Printf("D! [input.vsphere] Aligned samples: %d collapsed into %d", len(info), len(rInfo)) + //log.Printf("D! [inputs.vsphere] Aligned samples: %d collapsed into %d", len(info), len(rInfo)) return rInfo, rValues } func (e *Endpoint) collectChunk(ctx context.Context, pqs []types.PerfQuerySpec, res *resourceKind, acc telegraf.Accumulator, now time.Time, interval time.Duration) (int, time.Time, error) { - log.Printf("D! [input.vsphere] Query for %s has %d QuerySpecs", res.name, len(pqs)) + log.Printf("D! [inputs.vsphere] Query for %s has %d QuerySpecs", res.name, len(pqs)) latestSample := time.Time{} count := 0 resourceType := res.name @@ -920,14 +920,14 @@ func (e *Endpoint) collectChunk(ctx context.Context, pqs []types.PerfQuerySpec, return count, latestSample, err } - log.Printf("D! [input.vsphere] Query for %s returned metrics for %d objects", resourceType, len(ems)) + log.Printf("D! [inputs.vsphere] Query for %s returned metrics for %d objects", resourceType, len(ems)) // Iterate through results for _, em := range ems { moid := em.Entity.Reference().Value instInfo, found := res.objects[moid] if !found { - log.Printf("E! [input.vsphere]: MOID %s not found in cache. Skipping! (This should not happen!)", moid) + log.Printf("E! [inputs.vsphere]: MOID %s not found in cache. Skipping! (This should not happen!)", moid) continue } buckets := make(map[string]metricEntry) @@ -942,7 +942,7 @@ func (e *Endpoint) collectChunk(ctx context.Context, pqs []types.PerfQuerySpec, // Populate tags objectRef, ok := res.objects[moid] if !ok { - log.Printf("E! [input.vsphere]: MOID %s not found in cache. Skipping", moid) + log.Printf("E! [inputs.vsphere]: MOID %s not found in cache. Skipping", moid) continue } e.populateTags(&objectRef, resourceType, res, t, &v) @@ -954,7 +954,7 @@ func (e *Endpoint) collectChunk(ctx context.Context, pqs []types.PerfQuerySpec, // According to the docs, SampleInfo and Value should have the same length, but we've seen corrupted // data coming back with missing values. Take care of that gracefully! if idx >= len(alignedValues) { - log.Printf("D! [input.vsphere] len(SampleInfo)>len(Value) %d > %d", len(alignedInfo), len(alignedValues)) + log.Printf("D! [inputs.vsphere] len(SampleInfo)>len(Value) %d > %d", len(alignedInfo), len(alignedValues)) break } ts := sample.Timestamp @@ -975,7 +975,7 @@ func (e *Endpoint) collectChunk(ctx context.Context, pqs []types.PerfQuerySpec, // Percentage values must be scaled down by 100. info, ok := metricInfo[name] if !ok { - log.Printf("E! [input.vsphere]: Could not determine unit for %s. Skipping", name) + log.Printf("E! [inputs.vsphere]: Could not determine unit for %s. 
Skipping", name) } v := alignedValues[idx] if info.UnitInfo.GetElementDescription().Key == "percent" { @@ -989,7 +989,7 @@ func (e *Endpoint) collectChunk(ctx context.Context, pqs []types.PerfQuerySpec, e.hwMarks.Put(moid, ts) } if nValues == 0 { - log.Printf("D! [input.vsphere]: Missing value for: %s, %s", name, objectRef.name) + log.Printf("D! [inputs.vsphere]: Missing value for: %s, %s", name, objectRef.name) continue } } diff --git a/plugins/inputs/vsphere/finder.go b/plugins/inputs/vsphere/finder.go index 7e7f74a1d0665..372aa5e3ba0ee 100644 --- a/plugins/inputs/vsphere/finder.go +++ b/plugins/inputs/vsphere/finder.go @@ -57,7 +57,7 @@ func (f *Finder) Find(ctx context.Context, resType, path string, dst interface{} return err } objectContentToTypedArray(objs, dst) - log.Printf("D! [input.vsphere] Find(%s, %s) returned %d objects", resType, path, len(objs)) + log.Printf("D! [inputs.vsphere] Find(%s, %s) returned %d objects", resType, path, len(objs)) return nil } diff --git a/plugins/inputs/vsphere/panic_handler.go b/plugins/inputs/vsphere/panic_handler.go index 1d6242510a972..b971135503302 100644 --- a/plugins/inputs/vsphere/panic_handler.go +++ b/plugins/inputs/vsphere/panic_handler.go @@ -23,6 +23,6 @@ func HandlePanicWithAcc(acc telegraf.Accumulator) { func HandlePanic() { if p := recover(); p != nil { - log.Printf("E! [input.vsphere] PANIC (recovered): %s", p) + log.Printf("E! [inputs.vsphere] PANIC (recovered): %s", p) } } diff --git a/plugins/inputs/vsphere/tscache.go b/plugins/inputs/vsphere/tscache.go index 1d1f00ebea3cc..4f73c4fe89155 100644 --- a/plugins/inputs/vsphere/tscache.go +++ b/plugins/inputs/vsphere/tscache.go @@ -34,7 +34,7 @@ func (t *TSCache) Purge() { n++ } } - log.Printf("D! [input.vsphere] Purged timestamp cache. %d deleted with %d remaining", n, len(t.table)) + log.Printf("D! [inputs.vsphere] Purged timestamp cache. %d deleted with %d remaining", n, len(t.table)) } // IsNew returns true if the supplied timestamp for the supplied key is more recent than the diff --git a/plugins/inputs/vsphere/vsphere.go b/plugins/inputs/vsphere/vsphere.go index b3ac90596d5d6..104f7b5117e95 100644 --- a/plugins/inputs/vsphere/vsphere.go +++ b/plugins/inputs/vsphere/vsphere.go @@ -221,7 +221,7 @@ func (v *VSphere) Description() string { // Start is called from telegraf core when a plugin is started and allows it to // perform initialization tasks. func (v *VSphere) Start(acc telegraf.Accumulator) error { - log.Println("D! [input.vsphere]: Starting plugin") + log.Println("D! [inputs.vsphere]: Starting plugin") ctx, cancel := context.WithCancel(context.Background()) v.cancel = cancel @@ -244,7 +244,7 @@ func (v *VSphere) Start(acc telegraf.Accumulator) error { // Stop is called from telegraf core when a plugin is stopped and allows it to // perform shutdown tasks. func (v *VSphere) Stop() { - log.Println("D! [input.vsphere]: Stopping plugin") + log.Println("D! [inputs.vsphere]: Stopping plugin") v.cancel() // Wait for all endpoints to finish. No need to wait for @@ -253,7 +253,7 @@ func (v *VSphere) Stop() { // wait for any discovery to complete by trying to grab the // "busy" mutex. for _, ep := range v.endpoints { - log.Printf("D! [input.vsphere]: Waiting for endpoint %s to finish", ep.URL.Host) + log.Printf("D! 
[inputs.vsphere]: Waiting for endpoint %s to finish", ep.URL.Host) func() { ep.busy.Lock() // Wait until discovery is finished defer ep.busy.Unlock() From 1bb9eae44acc2a5e8accdf37d8fb2e1f2ba45456 Mon Sep 17 00:00:00 2001 From: prydin Date: Wed, 19 Dec 2018 17:38:51 -0500 Subject: [PATCH 26/34] Fixed broken test case --- plugins/inputs/vsphere/vsphere_test.go | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/plugins/inputs/vsphere/vsphere_test.go b/plugins/inputs/vsphere/vsphere_test.go index cf3bee1cdb4e5..e675b96f2bdc7 100644 --- a/plugins/inputs/vsphere/vsphere_test.go +++ b/plugins/inputs/vsphere/vsphere_test.go @@ -234,10 +234,8 @@ func createSim() (*simulator.Model, *simulator.Server, error) { return model, s, nil } -func TestAlignMetrics(t *testing.T) { - // 20s to 60s aligmentn of all 1.0 +func testAlignUniform(t *testing.T, n int) { now := time.Now().Truncate(60 * time.Second) - n := 30 info := make([]types.PerfSampleInfo, n) values := make([]int64, n) for i := 0; i < n; i++ { @@ -247,17 +245,24 @@ func TestAlignMetrics(t *testing.T) { } values[i] = 1 } - newInfo, newValues := alignSamples(info, values, 60) + newInfo, newValues := alignSamples(info, values, 60*time.Second) require.Equal(t, n/3, len(newInfo), "Aligned infos have wrong size") require.Equal(t, n/3, len(newValues), "Aligned values have wrong size") for _, v := range newValues { require.Equal(t, 1.0, v, "Aligned value should be 1") } +} + +func TestAlignMetrics(t *testing.T) { + testAlignUniform(t, 3) + testAlignUniform(t, 30) + testAlignUniform(t, 333) // 20s to 60s of 1,2,3,1,2,3... (should average to 2) - n = 30 - info = make([]types.PerfSampleInfo, n) - values = make([]int64, n) + n := 30 + now := time.Now().Truncate(60 * time.Second) + info := make([]types.PerfSampleInfo, n) + values := make([]int64, n) for i := 0; i < n; i++ { info[i] = types.PerfSampleInfo{ Timestamp: now.Add(time.Duration(20*i) * time.Second), @@ -265,7 +270,7 @@ func TestAlignMetrics(t *testing.T) { } values[i] = int64(i%3 + 1) } - newInfo, newValues = alignSamples(info, values, 60) + newInfo, newValues := alignSamples(info, values, 60*time.Second) require.Equal(t, n/3, len(newInfo), "Aligned infos have wrong size") require.Equal(t, n/3, len(newValues), "Aligned values have wrong size") for _, v := range newValues { From dfdd0ee6c73943d0138b83edff0df53da1321da8 Mon Sep 17 00:00:00 2001 From: prydin Date: Wed, 19 Dec 2018 17:55:58 -0500 Subject: [PATCH 27/34] Fixed 32-bit test issue (bug in vSphere simulator) --- plugins/inputs/vsphere/vsphere_test.go | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/plugins/inputs/vsphere/vsphere_test.go b/plugins/inputs/vsphere/vsphere_test.go index e675b96f2bdc7..ad1b175e1d22e 100644 --- a/plugins/inputs/vsphere/vsphere_test.go +++ b/plugins/inputs/vsphere/vsphere_test.go @@ -393,6 +393,13 @@ func TestMaxQuery(t *testing.T) { } func TestFinder(t *testing.T) { + // Don't run test on 32-bit machines due to bug in simulator. 
+ // https://github.com/vmware/govmomi/issues/1330 + var i int + if unsafe.Sizeof(i) < 8 { + return + } + m, s, err := createSim() if err != nil { t.Fatal(err) From 180a7bfc5ec6e6e79ddea747be40928321e1d02b Mon Sep 17 00:00:00 2001 From: prydin Date: Fri, 21 Dec 2018 11:25:38 -0500 Subject: [PATCH 28/34] Added cancel-handler to ThrottledExecutor, removed unnecessary warning from GetClient(), fixed discovered object counting --- plugins/inputs/vsphere/client.go | 6 ++---- plugins/inputs/vsphere/endpoint.go | 17 +++++++++++------ plugins/inputs/vsphere/throttled_exec.go | 17 +++++++++++------ 3 files changed, 24 insertions(+), 16 deletions(-) diff --git a/plugins/inputs/vsphere/client.go b/plugins/inputs/vsphere/client.go index 37f4b2c31776e..485361c34d2f8 100644 --- a/plugins/inputs/vsphere/client.go +++ b/plugins/inputs/vsphere/client.go @@ -10,8 +10,6 @@ import ( "sync" "time" - "github.com/vmware/govmomi/vim25/types" - "github.com/vmware/govmomi" "github.com/vmware/govmomi/object" "github.com/vmware/govmomi/performance" @@ -20,6 +18,7 @@ import ( "github.com/vmware/govmomi/vim25" "github.com/vmware/govmomi/vim25/methods" "github.com/vmware/govmomi/vim25/soap" + "github.com/vmware/govmomi/vim25/types" ) // The highest number of metrics we can query for, no matter what settings @@ -78,8 +77,7 @@ func (cf *ClientFactory) GetClient(ctx context.Context) (*Client, error) { ctx2, cancel2 := context.WithTimeout(ctx, cf.parent.Timeout.Duration) defer cancel2() if cf.client.Client.SessionManager.Login(ctx2, url.UserPassword(cf.parent.Username, cf.parent.Password)) != nil { - log.Printf("W! [input.vsphere]: Client reauthentication failed.") - return nil, err + return nil.fmt.Errorf("Renewing authentication failed: %v", err) } } diff --git a/plugins/inputs/vsphere/endpoint.go b/plugins/inputs/vsphere/endpoint.go index 6caebb4f77412..9df23264348ac 100644 --- a/plugins/inputs/vsphere/endpoint.go +++ b/plugins/inputs/vsphere/endpoint.go @@ -363,6 +363,8 @@ func (e *Endpoint) discover(ctx context.Context) error { resourceKinds := make(map[string]resourceKind) dcNameCache := make(map[string]string) + numRes := int64(0) + // Populate resource objects, and endpoint instance info. for k, res := range e.resourceKinds { log.Printf("D! [input.vsphere] Discovering resources for %s", res.name) @@ -393,6 +395,9 @@ func (e *Endpoint) discover(ctx context.Context) error { } res.objects = objects resourceKinds[k] = res + + SendInternalCounterWithTags("discovered_objects", e.URL.Host, map[string]string{"type": res.name}, int64(len(objects))) + numRes += int64(len(objects)) } } @@ -415,7 +420,7 @@ func (e *Endpoint) discover(ctx context.Context) error { e.lun2ds = l2d sw.Stop() - // SendInternalCounter("discovered_objects", e.URL.Host, int64(len(instInfo))) TODO: Count the correct way + SendInternalCounterWithTags("discovered_objects", e.URL.Host, map[string]string{"type": "instance-total"}, numRes) return nil } @@ -469,7 +474,7 @@ func (e *Endpoint) complexMetadataSelect(ctx context.Context, res *resourceKind, te := NewThrottledExecutor(e.Parent.DiscoverConcurrency) for _, obj := range sampledObjects { func(obj objectRef) { - te.Run(func() { + te.Run(ctx, func() { metrics, err := e.getMetadata(ctx, obj, res.sampling) if err != nil { log.Printf("E! [input.vsphere]: Error while getting metric metadata. Discovery will be incomplete. 
Error: %s", err) @@ -659,8 +664,8 @@ func (e *Endpoint) Collect(ctx context.Context, acc telegraf.Accumulator) error } // Workaround to make sure pqs is a copy of the loop variable and won't change. -func submitChunkJob(te *ThrottledExecutor, job func([]types.PerfQuerySpec), pqs []types.PerfQuerySpec) { - te.Run(func() { +func submitChunkJob(ctx context.Context, te *ThrottledExecutor, job func([]types.PerfQuerySpec), pqs []types.PerfQuerySpec) { + te.Run(ctx, func() { job(pqs) }) } @@ -725,7 +730,7 @@ func (e *Endpoint) chunkify(ctx context.Context, res *resourceKind, now time.Tim } // Run collection job - submitChunkJob(te, job, pqs) + submitChunkJob(ctx, te, job, pqs) pqs = make([]types.PerfQuerySpec, 0, e.Parent.MaxQueryObjects) metrics = 0 nRes = 0 @@ -739,7 +744,7 @@ func (e *Endpoint) chunkify(ctx context.Context, res *resourceKind, now time.Tim // Run collection job log.Printf("D! [input.vsphere]: Queuing query: %d objects, %d metrics (0 remaining) of type %s for %s. Total objects %d (final chunk)", len(pqs), metrics, res.name, e.URL.Host, len(res.objects)) - submitChunkJob(te, job, pqs) + submitChunkJob(ctx, te, job, pqs) } // Wait for background collection to finish diff --git a/plugins/inputs/vsphere/throttled_exec.go b/plugins/inputs/vsphere/throttled_exec.go index 15f66c5ab5d7e..0259a6de42ea8 100644 --- a/plugins/inputs/vsphere/throttled_exec.go +++ b/plugins/inputs/vsphere/throttled_exec.go @@ -1,6 +1,7 @@ package vsphere import ( + "context" "sync" ) @@ -22,17 +23,21 @@ func NewThrottledExecutor(limit int) *ThrottledExecutor { // Run schedules a job for execution as soon as possible while respecting the // maximum concurrency limit. -func (t *ThrottledExecutor) Run(job func()) { +func (t *ThrottledExecutor) Run(ctx context.Context, job func()) { t.wg.Add(1) - t.limiter <- struct{}{} go func() { // Last resort panic handler. 
defer HandlePanic() defer t.wg.Done() - defer func() { - <-t.limiter - }() - job() + select { + case t.limiter <- struct{}{}: + defer func() { + <-t.limiter + }() + job() + case <-ctx.Done(): + return + } }() } From a0ca6479c1806df498a85cd2bffc5672da48f110 Mon Sep 17 00:00:00 2001 From: prydin Date: Fri, 21 Dec 2018 11:33:31 -0500 Subject: [PATCH 29/34] Fixed test case issues --- plugins/inputs/vsphere/client.go | 3 ++- plugins/inputs/vsphere/vsphere_test.go | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/plugins/inputs/vsphere/client.go b/plugins/inputs/vsphere/client.go index 485361c34d2f8..8b1c4866ac9df 100644 --- a/plugins/inputs/vsphere/client.go +++ b/plugins/inputs/vsphere/client.go @@ -3,6 +3,7 @@ package vsphere import ( "context" "crypto/tls" + "fmt" "log" "net/url" "strconv" @@ -77,7 +78,7 @@ func (cf *ClientFactory) GetClient(ctx context.Context) (*Client, error) { ctx2, cancel2 := context.WithTimeout(ctx, cf.parent.Timeout.Duration) defer cancel2() if cf.client.Client.SessionManager.Login(ctx2, url.UserPassword(cf.parent.Username, cf.parent.Password)) != nil { - return nil.fmt.Errorf("Renewing authentication failed: %v", err) + return nil, fmt.Errorf("Renewing authentication failed: %v", err) } } diff --git a/plugins/inputs/vsphere/vsphere_test.go b/plugins/inputs/vsphere/vsphere_test.go index 0da0681cc0a93..a4b931bd96568 100644 --- a/plugins/inputs/vsphere/vsphere_test.go +++ b/plugins/inputs/vsphere/vsphere_test.go @@ -218,7 +218,7 @@ func TestWorkerPool(t *testing.T) { te := NewThrottledExecutor(5) for i := 0; i < n; i++ { func(i int) { - te.Run(func() { + te.Run(context.Background(), func() { atomic.AddInt64(&ngr, 1) mux.Lock() defer mux.Unlock() From 1eb24a3bb02ad2ec7aeec09a3c32270fa85162b6 Mon Sep 17 00:00:00 2001 From: prydin Date: Sat, 22 Dec 2018 09:03:56 -0500 Subject: [PATCH 30/34] Back-ported timestamping fixes from pontus-issue-4790 --- plugins/inputs/vsphere/endpoint.go | 156 ++++++++++++++++++----------- 1 file changed, 96 insertions(+), 60 deletions(-) diff --git a/plugins/inputs/vsphere/endpoint.go b/plugins/inputs/vsphere/endpoint.go index 9df23264348ac..aacf5bb2d20a4 100644 --- a/plugins/inputs/vsphere/endpoint.go +++ b/plugins/inputs/vsphere/endpoint.go @@ -39,8 +39,6 @@ const maxMetadataSamples = 100 // Number of resources to sample for metric metad type Endpoint struct { Parent *VSphere URL *url.URL - lastColls map[string]time.Time - lastColl time.Time resourceKinds map[string]resourceKind hwMarks *TSCache lun2ds map[string]string @@ -66,6 +64,8 @@ type resourceKind struct { collectInstances bool parent string getObjects func(context.Context, *Client, *Endpoint, *view.ContainerView) (objectMap, error) + latestSample time.Time + lastColl time.Time } type metricEntry struct { @@ -101,7 +101,6 @@ func NewEndpoint(ctx context.Context, parent *VSphere, url *url.URL) (*Endpoint, e := Endpoint{ URL: url, Parent: parent, - lastColls: make(map[string]time.Time), hwMarks: NewTSCache(1 * time.Hour), lun2ds: make(map[string]string), initialized: false, @@ -761,25 +760,34 @@ func (e *Endpoint) collectResource(ctx context.Context, resourceType string, acc if err != nil { return err } - latest, hasLatest := e.lastColls[resourceType] - if hasLatest { + + // Estimate the interval at which we're invoked. Use local time (not server time) + // since this is about how we got invoked locally. 
+ localNow := time.Now() + estInterval := time.Duration(time.Minute) + if !res.lastColl.IsZero() { + estInterval = localNow.Sub(res.lastColl).Truncate(time.Duration(res.sampling) * time.Second) + } + log.Printf("D! [inputs.vsphere] Interval estimated to %s", estInterval) + + latest := res.latestSample + if !latest.IsZero() { elapsed := now.Sub(latest).Seconds() + 5.0 // Allow 5 second jitter. - log.Printf("D! [input.vsphere]: Latest: %s, elapsed: %f, resource: %s", latest, elapsed, resourceType) + log.Printf("D! [inputs.vsphere]: Latest: %s, elapsed: %f, resource: %s", latest, elapsed, resourceType) if !res.realTime && elapsed < float64(res.sampling) { // No new data would be available. We're outta here! - log.Printf("D! [input.vsphere]: Sampling period for %s of %d has not elapsed on %s", + log.Printf("D! [inputs.vsphere]: Sampling period for %s of %d has not elapsed on %s", resourceType, res.sampling, e.URL.Host) return nil } } else { latest = now.Add(time.Duration(-res.sampling) * time.Second) } - e.lastColl = now internalTags := map[string]string{"resourcetype": resourceType} sw := NewStopwatchWithTags("gather_duration", e.URL.Host, internalTags) - log.Printf("D! [input.vsphere]: Collecting metrics for %d objects of type %s for %s", + log.Printf("D! [inputs.vsphere]: Collecting metrics for %d objects of type %s for %s", len(res.objects), resourceType, e.URL.Host) count := int64(0) @@ -794,8 +802,8 @@ func (e *Endpoint) collectResource(ctx context.Context, resourceType string, acc // Handle panics gracefully defer HandlePanicWithAcc(acc) - n, localLatest, err := e.collectChunk(ctx, chunk, &res, acc) - log.Printf("D! [input.vsphere] CollectChunk for %s returned %d metrics", resourceType, n) + n, localLatest, err := e.collectChunk(ctx, chunk, &res, acc, now, estInterval) + log.Printf("D! [inputs.vsphere] CollectChunk for %s returned %d metrics", resourceType, n) if err != nil { acc.AddError(errors.New("While collecting " + res.name + ": " + err.Error())) } @@ -807,17 +815,56 @@ func (e *Endpoint) collectResource(ctx context.Context, resourceType string, acc } }) - log.Printf("D! [input.vsphere] Latest sample for %s set to %s", resourceType, latestSample) + log.Printf("D! [inputs.vsphere] Latest sample for %s set to %s", resourceType, latestSample) if !latestSample.IsZero() { - e.lastColls[resourceType] = latestSample + res.latestSample = latestSample } sw.Stop() SendInternalCounterWithTags("gather_count", e.URL.Host, internalTags, count) return nil } -func (e *Endpoint) collectChunk(ctx context.Context, pqs []types.PerfQuerySpec, res *resourceKind, acc telegraf.Accumulator) (int, time.Time, error) { - log.Printf("D! [input.vsphere] Query for %s has %d QuerySpecs", res.name, len(pqs)) +func alignSamples(info []types.PerfSampleInfo, values []int64, interval time.Duration) ([]types.PerfSampleInfo, []float64) { + rInfo := make([]types.PerfSampleInfo, 0, len(info)) + rValues := make([]float64, 0, len(values)) + bi := 1.0 + var lastBucket time.Time + for idx := range info { + // According to the docs, SampleInfo and Value should have the same length, but we've seen corrupted + // data coming back with missing values. Take care of that gracefully! + if idx >= len(values) { + log.Printf("D! [inputs.vsphere] len(SampleInfo)>len(Value) %d > %d", len(info), len(values)) + break + } + v := float64(values[idx]) + if v < 0 { + continue + } + ts := info[idx].Timestamp + roundedTs := ts.Truncate(interval) + + // Are we still working on the same bucket? 
+ if roundedTs == lastBucket { + bi++ + p := len(rValues) - 1 + rValues[p] = ((bi-1)/bi)*float64(rValues[p]) + v/bi + } else { + rValues = append(rValues, v) + roundedInfo := types.PerfSampleInfo{ + Timestamp: roundedTs, + Interval: info[idx].Interval, + } + rInfo = append(rInfo, roundedInfo) + bi = 1.0 + lastBucket = roundedTs + } + } + //log.Printf("D! [inputs.vsphere] Aligned samples: %d collapsed into %d", len(info), len(rInfo)) + return rInfo, rValues +} + +func (e *Endpoint) collectChunk(ctx context.Context, pqs []types.PerfQuerySpec, res *resourceKind, acc telegraf.Accumulator, now time.Time, interval time.Duration) (int, time.Time, error) { + log.Printf("D! [inputs.vsphere] Query for %s has %d QuerySpecs", res.name, len(pqs)) latestSample := time.Time{} count := 0 resourceType := res.name @@ -838,14 +885,14 @@ func (e *Endpoint) collectChunk(ctx context.Context, pqs []types.PerfQuerySpec, return count, latestSample, err } - log.Printf("D! [input.vsphere] Query for %s returned metrics for %d objects", resourceType, len(ems)) + log.Printf("D! [inputs.vsphere] Query for %s returned metrics for %d objects", resourceType, len(ems)) // Iterate through results for _, em := range ems { moid := em.Entity.Reference().Value instInfo, found := res.objects[moid] if !found { - log.Printf("E! [input.vsphere]: MOID %s not found in cache. Skipping! (This should not happen!)", moid) + log.Printf("E! [inputs.vsphere]: MOID %s not found in cache. Skipping! (This should not happen!)", moid) continue } buckets := make(map[string]metricEntry) @@ -860,67 +907,56 @@ func (e *Endpoint) collectChunk(ctx context.Context, pqs []types.PerfQuerySpec, // Populate tags objectRef, ok := res.objects[moid] if !ok { - log.Printf("E! [input.vsphere]: MOID %s not found in cache. Skipping", moid) + log.Printf("E! [inputs.vsphere]: MOID %s not found in cache. Skipping", moid) continue } e.populateTags(&objectRef, resourceType, res, t, &v) - avg := float64(0) nValues := 0 - for idx, sample := range em.SampleInfo { + alignedInfo, alignedValues := alignSamples(em.SampleInfo, v.Value, interval) // TODO: Estimate interval + + for idx, sample := range alignedInfo { // According to the docs, SampleInfo and Value should have the same length, but we've seen corrupted // data coming back with missing values. Take care of that gracefully! - if idx >= len(v.Value) { - log.Printf("D! [input.vsphere] len(SampleInfo)>len(Value) %d > %d", len(em.SampleInfo), len(v.Value)) + if idx >= len(alignedValues) { + log.Printf("D! [inputs.vsphere] len(SampleInfo)>len(Value) %d > %d", len(alignedInfo), len(alignedValues)) break } - value := float64(v.Value[idx]) - if value < 0 { - continue - } ts := sample.Timestamp if ts.After(latestSample) { latestSample = ts } - avg += float64(value) nValues++ - } - if nValues == 0 { - log.Printf("D! [input.vsphere]: Missing value for: %s, %s", name, objectRef.name) - continue - } - // If we're catching up with metrics arriving late, calculate the average - // of them and pick the midpoint timestamp. This is a reasonable way of - // filling in missed collections that doesn't cause us to deliver metrics - // faster than the interval. - avg /= float64(nValues) - midTs := em.SampleInfo[len(em.SampleInfo)/2].Timestamp - - // Organize the metrics into a bucket per measurement. 
- mn, fn := e.makeMetricIdentifier(prefix, name) - bKey := mn + " " + v.Instance + " " + strconv.FormatInt(midTs.UnixNano(), 10) - bucket, found := buckets[bKey] - if !found { - bucket = metricEntry{name: mn, ts: midTs, fields: make(map[string]interface{}), tags: t} - buckets[bKey] = bucket - } + // Organize the metrics into a bucket per measurement. + mn, fn := e.makeMetricIdentifier(prefix, name) + bKey := mn + " " + v.Instance + " " + strconv.FormatInt(ts.UnixNano(), 10) + bucket, found := buckets[bKey] + if !found { + bucket = metricEntry{name: mn, ts: ts, fields: make(map[string]interface{}), tags: t} + buckets[bKey] = bucket + } - // Percentage values must be scaled down by 100. - info, ok := metricInfo[name] - if !ok { - log.Printf("E! [input.vsphere]: Could not determine unit for %s. Skipping", name) + // Percentage values must be scaled down by 100. + info, ok := metricInfo[name] + if !ok { + log.Printf("E! [inputs.vsphere]: Could not determine unit for %s. Skipping", name) + } + v := alignedValues[idx] + if info.UnitInfo.GetElementDescription().Key == "percent" { + bucket.fields[fn] = float64(v) / 100.0 + } else { + bucket.fields[fn] = v + } + count++ + + // Update highwater marks + e.hwMarks.Put(moid, ts) } - if info.UnitInfo.GetElementDescription().Key == "percent" { - bucket.fields[fn] = float64(avg) / 100.0 - } else { - bucket.fields[fn] = avg + if nValues == 0 { + log.Printf("D! [inputs.vsphere]: Missing value for: %s, %s", name, objectRef.name) + continue } - count++ - - // Update highwater marks - e.hwMarks.Put(moid, latestSample) - } // We've iterated through all the metrics and collected buckets for each // measurement name. Now emit them! From cbf93d2c0ea09ad2b98281aa8b7b11b818b3959c Mon Sep 17 00:00:00 2001 From: prydin Date: Tue, 12 Feb 2019 11:13:16 -0500 Subject: [PATCH 31/34] Fixed some post-merge issues --- plugins/inputs/vsphere/endpoint.go | 1 - plugins/inputs/vsphere/panic_handler.go | 28 ------------------------- plugins/inputs/vsphere/vsphere.go | 1 - plugins/inputs/vsphere/vsphere_test.go | 12 ----------- 4 files changed, 42 deletions(-) delete mode 100644 plugins/inputs/vsphere/panic_handler.go diff --git a/plugins/inputs/vsphere/endpoint.go b/plugins/inputs/vsphere/endpoint.go index 59622785c833f..192a4a4875974 100644 --- a/plugins/inputs/vsphere/endpoint.go +++ b/plugins/inputs/vsphere/endpoint.go @@ -233,7 +233,6 @@ func isSimple(include []string, exclude []string) bool { func (e *Endpoint) startDiscovery(ctx context.Context) { e.discoveryTicker = time.NewTicker(e.Parent.ObjectDiscoveryInterval.Duration) go func() { - defer HandlePanic() for { select { case <-e.discoveryTicker.C: diff --git a/plugins/inputs/vsphere/panic_handler.go b/plugins/inputs/vsphere/panic_handler.go deleted file mode 100644 index b971135503302..0000000000000 --- a/plugins/inputs/vsphere/panic_handler.go +++ /dev/null @@ -1,28 +0,0 @@ -package vsphere - -import ( - "errors" - "fmt" - "log" - - "github.com/influxdata/telegraf" -) - -func HandlePanicWithAcc(acc telegraf.Accumulator) { - if p := recover(); p != nil { - switch p.(type) { - case string: - acc.AddError(errors.New(p.(string))) - case error: - acc.AddError(p.(error)) - default: - acc.AddError(fmt.Errorf("Unknown panic: %s", p)) - } - } -} - -func HandlePanic() { - if p := recover(); p != nil { - log.Printf("E! 
[inputs.vsphere] PANIC (recovered): %s", p) - } -} diff --git a/plugins/inputs/vsphere/vsphere.go b/plugins/inputs/vsphere/vsphere.go index 104f7b5117e95..809026e3e78a7 100644 --- a/plugins/inputs/vsphere/vsphere.go +++ b/plugins/inputs/vsphere/vsphere.go @@ -269,7 +269,6 @@ func (v *VSphere) Gather(acc telegraf.Accumulator) error { for _, ep := range v.endpoints { wg.Add(1) go func(endpoint *Endpoint) { - defer HandlePanicWithAcc(acc) defer wg.Done() err := endpoint.Collect(context.Background(), acc) if err == context.Canceled { diff --git a/plugins/inputs/vsphere/vsphere_test.go b/plugins/inputs/vsphere/vsphere_test.go index 95f3cbdb8c0f3..eff56a89d2bc1 100644 --- a/plugins/inputs/vsphere/vsphere_test.go +++ b/plugins/inputs/vsphere/vsphere_test.go @@ -290,11 +290,7 @@ func TestParseConfig(t *testing.T) { require.NotNil(t, tab) } -<<<<<<< HEAD func TestThrottledExecutor(t *testing.T) { -======= -func TestWorkerPool(t *testing.T) { ->>>>>>> upstream/master max := int64(0) ngr := int64(0) n := 10000 @@ -345,12 +341,7 @@ func TestTimeout(t *testing.T) { v.Timeout = internal.Duration{Duration: 1 * time.Nanosecond} require.NoError(t, v.Start(nil)) // We're not using the Accumulator, so it can be nil. defer v.Stop() -<<<<<<< HEAD - require.NoError(t, v.Gather(&acc)) -======= err = v.Gather(&acc) ->>>>>>> upstream/master - require.True(t, len(acc.Errors) > 0, "Errors should not be empty here") // The accumulator must contain exactly one error and it must be a deadline exceeded. require.Equal(t, 1, len(acc.Errors)) @@ -525,8 +516,5 @@ func TestAll(t *testing.T) { defer v.Stop() require.NoError(t, v.Gather(&acc)) require.Equal(t, 0, len(acc.Errors), fmt.Sprintf("Errors found: %s", acc.Errors)) -<<<<<<< HEAD require.True(t, len(acc.Metrics) > 0, "No metrics were collected") -======= ->>>>>>> upstream/master } From 72c3d89cdc9c868c1f80930d2e15edcfbd447942 Mon Sep 17 00:00:00 2001 From: prydin Date: Tue, 12 Feb 2019 11:20:16 -0500 Subject: [PATCH 32/34] Added Wavefront SDK back to Gopkg.lock --- Gopkg.lock | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/Gopkg.lock b/Gopkg.lock index cf2fa56cb794d..98a185f51b54e 100644 --- a/Gopkg.lock +++ b/Gopkg.lock @@ -1111,6 +1111,17 @@ version = "v0.19.0" [[projects]] + digest = "1:c3bdfb7e9b2a66bafbd47517a1a4e489706f75af37ad5bfb57621bf41c16b556" + name = "github.com/wavefronthq/wavefront-sdk-go" + packages = [ + "internal", + "senders", + ] + pruneopts = "" + revision = "7821ac6d8ae05fe70c6d090ebda380c64f1416e4" + version = "v0.9.1" + +[[projects]] branch = "master" digest = "1:98ed05e9796df287b90c1d96854e3913c8e349dbc546412d3cabb472ecf4b417" name = "github.com/wvanbergen/kafka" From ef83845c22c548d3da9818a260003b39992311d7 Mon Sep 17 00:00:00 2001 From: prydin Date: Tue, 12 Feb 2019 16:31:06 -0500 Subject: [PATCH 33/34] Removed trailing spaces --- Gopkg.lock | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/Gopkg.lock b/Gopkg.lock index 98a185f51b54e..34a2dbb3ca25b 100644 --- a/Gopkg.lock +++ b/Gopkg.lock @@ -1111,16 +1111,16 @@ version = "v0.19.0" [[projects]] - digest = "1:c3bdfb7e9b2a66bafbd47517a1a4e489706f75af37ad5bfb57621bf41c16b556" - name = "github.com/wavefronthq/wavefront-sdk-go" - packages = [ - "internal", - "senders", - ] - pruneopts = "" - revision = "7821ac6d8ae05fe70c6d090ebda380c64f1416e4" + digest = "1:c3bdfb7e9b2a66bafbd47517a1a4e489706f75af37ad5bfb57621bf41c16b556" + name = "github.com/wavefronthq/wavefront-sdk-go" + packages = [ + "internal", + "senders", + ] + pruneopts = "" + revision = 
"7821ac6d8ae05fe70c6d090ebda380c64f1416e4" version = "v0.9.1" - + [[projects]] branch = "master" digest = "1:98ed05e9796df287b90c1d96854e3913c8e349dbc546412d3cabb472ecf4b417" From dc7054f9da536f7600fef424e92b3806e1571fc2 Mon Sep 17 00:00:00 2001 From: prydin Date: Tue, 12 Feb 2019 16:32:18 -0500 Subject: [PATCH 34/34] Removed trailing spaces --- Gopkg.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Gopkg.lock b/Gopkg.lock index 34a2dbb3ca25b..93834b78a2ae8 100644 --- a/Gopkg.lock +++ b/Gopkg.lock @@ -1121,7 +1121,7 @@ revision = "7821ac6d8ae05fe70c6d090ebda380c64f1416e4" version = "v0.9.1" -[[projects]] +[[projects]] branch = "master" digest = "1:98ed05e9796df287b90c1d96854e3913c8e349dbc546412d3cabb472ecf4b417" name = "github.com/wvanbergen/kafka"