From c4269ec5fb7ea216a74009139b2b69d9ba264f75 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wojciech=20Kaczy=C5=84ski?=
 <97508788+wkaczynski@users.noreply.github.com>
Date: Wed, 22 Feb 2023 19:12:59 +0100
Subject: [PATCH] feat: provisioner label for metrics for created and
 terminated nodes (#210)

* per-provisioner variants of metrics for created and terminated nodes

* adding "provisioner" label to the existing metrics for created and terminated nodes

* removing redundant parts of metric descriptions

Co-authored-by: Jonathan Innis

* bringing back samber/lo

* addressing review comments

* addressing review comments

* addressing review comments

* addressing review comments

* addressing review comments

---------

Co-authored-by: Jonathan Innis
---
 pkg/controllers/deprovisioning/controller.go  | 14 ++++++++-----
 pkg/controllers/metrics/state/scraper/node.go |  6 +-----
 pkg/controllers/provisioning/provisioner.go   | 21 +++++++++++++------
 pkg/metrics/constants.go                      |  1 +
 pkg/metrics/metrics.go                        | 10 +++++----
 pkg/test/expectations/expectations.go         |  7 ++++---
 6 files changed, 36 insertions(+), 23 deletions(-)

diff --git a/pkg/controllers/deprovisioning/controller.go b/pkg/controllers/deprovisioning/controller.go
index c4b987af01..1a0c44ce3e 100644
--- a/pkg/controllers/deprovisioning/controller.go
+++ b/pkg/controllers/deprovisioning/controller.go
@@ -144,8 +144,9 @@ func (c *Controller) executeCommand(ctx context.Context, d Deprovisioner, comman
 	deprovisioningActionsPerformedCounter.With(prometheus.Labels{"action": fmt.Sprintf("%s/%s", d, command.action)}).Add(1)
 	logging.FromContext(ctx).Infof("deprovisioning via %s %s", d, command)
 
+	reason := fmt.Sprintf("%s/%s", d, command.action)
 	if command.action == actionReplace {
-		if err := c.launchReplacementNodes(ctx, command); err != nil {
+		if err := c.launchReplacementNodes(ctx, command, reason); err != nil {
 			// If we failed to launch the replacement, don't deprovision. If this is some permanent failure,
 			// we don't want to disrupt workloads with no way to provision new nodes for them.
 			return fmt.Errorf("launching replacement node, %w", err)
@@ -157,7 +158,10 @@ func (c *Controller) executeCommand(ctx context.Context, d Deprovisioner, comman
 		if err := c.kubeClient.Delete(ctx, oldNode); client.IgnoreNotFound(err) != nil {
 			logging.FromContext(ctx).Errorf("Deleting node, %s", err)
 		} else {
-			metrics.NodesTerminatedCounter.WithLabelValues(fmt.Sprintf("%s/%s", d, command.action)).Inc()
+			metrics.NodesTerminatedCounter.With(prometheus.Labels{
+				metrics.ReasonLabel:      reason,
+				metrics.ProvisionerLabel: oldNode.Labels[v1alpha5.ProvisionerNameLabelKey],
+			}).Inc()
 		}
 	}
 
@@ -195,7 +199,7 @@ func (c *Controller) waitForDeletion(ctx context.Context, node *v1.Node) {
 
 // launchReplacementNodes launches replacement nodes and blocks until it is ready
 // nolint:gocyclo
-func (c *Controller) launchReplacementNodes(ctx context.Context, action Command) error {
+func (c *Controller) launchReplacementNodes(ctx context.Context, action Command, reason string) error {
 	defer metrics.Measure(deprovisioningReplacementNodeInitializedHistogram)()
 	nodeNamesToRemove := lo.Map(action.nodesToRemove, func(n *v1.Node, _ int) string { return n.Name })
 	// cordon the old nodes before we launch the replacements to prevent new pods from scheduling to the old nodes
@@ -203,17 +207,17 @@ func (c *Controller) launchReplacementNodes(ctx context.Context, action Command)
 		return fmt.Errorf("cordoning nodes, %w", err)
 	}
 
-	nodeNames, err := c.provisioner.LaunchMachines(ctx, action.replacementNodes)
+	nodeNames, err := c.provisioner.LaunchMachines(ctx, action.replacementNodes, provisioning.WithReason(reason))
 	if err != nil {
 		// uncordon the nodes as the launch may fail (e.g. ICE or incompatible AMI)
 		err = multierr.Append(err, c.setNodesUnschedulable(ctx, false, nodeNamesToRemove...))
 		return err
 	}
+
 	if len(nodeNames) != len(action.replacementNodes) {
 		// shouldn't ever occur since a partially failed LaunchMachines should return an error
 		return fmt.Errorf("expected %d node names, got %d", len(action.replacementNodes), len(nodeNames))
 	}
-	metrics.NodesCreatedCounter.WithLabelValues(metrics.DeprovisioningReason).Add(float64(len(nodeNames)))
 
 	// We have the new nodes created at the API server so mark the old nodes for deletion
 	c.cluster.MarkForDeletion(nodeNamesToRemove...)
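
The controller above now builds a reason string ("<deprovisioner>/<action>") and threads it into LaunchMachines through the functional-options pattern exposed by provisioner.go (next file). A minimal, self-contained Go sketch of how such options resolve: LaunchOptions and WithReason mirror this patch, while resolveOptions is an assumed stand-in for the repo's functional.ResolveOptions helper, not its actual implementation.

package main

import "fmt"

type LaunchOptions struct {
	RecordPodNomination bool
	Reason              string
}

// WithReason returns an option that stamps a launch with a reason label value,
// matching the constructor added in provisioner.go.
func WithReason(reason string) func(LaunchOptions) LaunchOptions {
	return func(o LaunchOptions) LaunchOptions {
		o.Reason = reason
		return o
	}
}

// resolveOptions folds each option over a zero-valued LaunchOptions, which is
// the behavior Launch relies on when it reads launchOpts.Reason.
func resolveOptions(opts ...func(LaunchOptions) LaunchOptions) LaunchOptions {
	resolved := LaunchOptions{}
	for _, opt := range opts {
		resolved = opt(resolved)
	}
	return resolved
}

func main() {
	// The deprovisioning controller formats the reason as "<deprovisioner>/<action>",
	// e.g. "consolidation/replace" (illustrative value).
	resolved := resolveOptions(WithReason("consolidation/replace"))
	fmt.Println(resolved.Reason, resolved.RecordPodNomination) // consolidation/replace false
}
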
diff --git a/pkg/controllers/metrics/state/scraper/node.go b/pkg/controllers/metrics/state/scraper/node.go
index 17a5581256..5f79a4bef7 100644
--- a/pkg/controllers/metrics/state/scraper/node.go
+++ b/pkg/controllers/metrics/state/scraper/node.go
@@ -219,11 +219,7 @@ func (ns *NodeScraper) getNodeLabels(node *v1.Node, resourceTypeName string) pro
 	metricLabels := prometheus.Labels{}
 	metricLabels[resourceType] = resourceTypeName
 	metricLabels[nodeName] = node.GetName()
-	if provisionerName, ok := node.Labels[v1alpha5.ProvisionerNameLabelKey]; !ok {
-		metricLabels[nodeProvisioner] = "N/A"
-	} else {
-		metricLabels[nodeProvisioner] = provisionerName
-	}
+	metricLabels[nodeProvisioner] = node.Labels[v1alpha5.ProvisionerNameLabelKey]
 	metricLabels[nodePhase] = string(node.Status.Phase)
 
 	// Populate well known labels
diff --git a/pkg/controllers/provisioning/provisioner.go b/pkg/controllers/provisioning/provisioner.go
index 41cf904ed2..f0ef1b3b73 100644
--- a/pkg/controllers/provisioning/provisioner.go
+++ b/pkg/controllers/provisioning/provisioner.go
@@ -55,6 +55,7 @@ import (
 // actions and configuration during scheduling
 type LaunchOptions struct {
 	RecordPodNomination bool
+	Reason              string
 }
 
 // RecordPodNomination causes nominate pod events to be recorded against the node.
@@ -63,6 +64,13 @@ func RecordPodNomination(o LaunchOptions) LaunchOptions {
 	return o
 }
 
+func WithReason(reason string) func(LaunchOptions) LaunchOptions {
+	return func(o LaunchOptions) LaunchOptions {
+		o.Reason = reason
+		return o
+	}
+}
+
 // Provisioner waits for enqueued pods, batches them, creates capacity and binds the pods to the capacity.
 type Provisioner struct {
 	cloudProvider cloudprovider.CloudProvider
@@ -116,11 +124,7 @@ func (p *Provisioner) Reconcile(ctx context.Context, _ reconcile.Request) (resul
 	if len(machines) == 0 {
 		return reconcile.Result{}, nil
 	}
-	machineNames, err := p.LaunchMachines(ctx, machines, RecordPodNomination)
-
-	// Any successfully created node is going to have the nodeName value filled in the slice
-	successfullyCreatedNodeCount := lo.CountBy(machineNames, func(name string) bool { return name != "" })
-	metrics.NodesCreatedCounter.WithLabelValues(metrics.ProvisioningReason).Add(float64(successfullyCreatedNodeCount))
+	_, err = p.LaunchMachines(ctx, machines, WithReason(metrics.ProvisioningReason), RecordPodNomination)
 	return reconcile.Result{}, err
 }
 
@@ -351,8 +355,13 @@ func (p *Provisioner) Launch(ctx context.Context, machine *scheduler.Machine, op
 	if err := p.cluster.UpdateNode(ctx, k8sNode); err != nil {
 		return "", fmt.Errorf("updating cluster state, %w", err)
 	}
+	launchOpts := functional.ResolveOptions(opts...)
+	metrics.NodesCreatedCounter.With(prometheus.Labels{
+		metrics.ReasonLabel:      launchOpts.Reason,
+		metrics.ProvisionerLabel: k8sNode.Labels[v1alpha5.ProvisionerNameLabelKey],
+	}).Inc()
 	p.cluster.NominateNodeForPod(ctx, k8sNode.Name)
-	if functional.ResolveOptions(opts...).RecordPodNomination {
+	if launchOpts.RecordPodNomination {
 		for _, pod := range machine.Pods {
 			p.recorder.Publish(events.NominatePod(pod, k8sNode))
 		}
diff --git a/pkg/metrics/constants.go b/pkg/metrics/constants.go
index 3282e9d639..90f5c1d87c 100644
--- a/pkg/metrics/constants.go
+++ b/pkg/metrics/constants.go
@@ -26,6 +26,7 @@ const (
 
 	ErrorLabel       = "error"
 	ProvisionerLabel = "provisioner"
+	ReasonLabel      = "reason"
 
 	// Reasons for CREATE/DELETE shared metrics
 	DeprovisioningReason = "deprovisioning"
diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go
index 963fcbe6bd..214c1dd662 100644
--- a/pkg/metrics/metrics.go
+++ b/pkg/metrics/metrics.go
@@ -30,10 +30,11 @@ var (
 			Namespace: Namespace,
 			Subsystem: nodeSubsystem,
 			Name:      "created",
-			Help:      "Number of nodes created in total by Karpenter. Labeled by reason the node was created.",
+			Help:      "Number of nodes created in total by Karpenter. Labeled by reason the node was created and the owning provisioner.",
 		},
 		[]string{
-			"reason",
+			ReasonLabel,
+			ProvisionerLabel,
 		},
 	)
 	NodesTerminatedCounter = prometheus.NewCounterVec(
@@ -41,10 +42,11 @@ var (
 			Namespace: Namespace,
 			Subsystem: nodeSubsystem,
 			Name:      "terminated",
-			Help:      "Number of nodes terminated in total by Karpenter. Labeled by reason the node was terminated.",
+			Help:      "Number of nodes terminated in total by Karpenter. Labeled by reason the node was terminated and the owning provisioner.",
 		},
 		[]string{
-			"reason",
+			ReasonLabel,
+			ProvisionerLabel,
 		},
 	)
 	MachinesCreatedCounter = prometheus.NewCounterVec(
diff --git a/pkg/test/expectations/expectations.go b/pkg/test/expectations/expectations.go
index a2a8ed713d..08cdf076f3 100644
--- a/pkg/test/expectations/expectations.go
+++ b/pkg/test/expectations/expectations.go
@@ -40,13 +40,14 @@ import (
 	"knative.dev/pkg/ptr"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 	"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
-	"sigs.k8s.io/controller-runtime/pkg/metrics"
+	crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics"
 	"sigs.k8s.io/controller-runtime/pkg/reconcile"
 
 	"github.com/aws/karpenter-core/pkg/apis/v1alpha5"
 	"github.com/aws/karpenter-core/pkg/controllers/provisioning"
 	"github.com/aws/karpenter-core/pkg/controllers/provisioning/scheduling"
 	"github.com/aws/karpenter-core/pkg/controllers/state"
+	"github.com/aws/karpenter-core/pkg/metrics"
 	"github.com/aws/karpenter-core/pkg/operator/injection"
 )
 
@@ -250,7 +251,7 @@ func ExpectProvisionedNoBindingWithOffset(offset int, ctx context.Context, c cli
 	for _, m := range machines {
 		ctx = injection.WithNamespacedName(ctx, types.NamespacedName{Name: m.Labels[v1alpha5.ProvisionerNameLabelKey]})
 		// TODO: Check the error on the provisioner launch
-		name, err := provisioner.Launch(ctx, m)
+		name, err := provisioner.Launch(ctx, m, provisioning.WithReason(metrics.ProvisioningReason))
 		if err != nil {
 			return bindings
 		}
@@ -297,7 +298,7 @@ func ExpectOwnerReferenceExists(obj, owner client.Object) metav1.OwnerReference
 
 // FindMetricWithLabelValues attempts to find a metric with a name with a set of label values
 // If no metric is found, the *prometheus.Metric will be nil
 func FindMetricWithLabelValues(name string, labelValues map[string]string) (*prometheus.Metric, bool) {
-	metrics, err := metrics.Registry.Gather()
+	metrics, err := crmetrics.Registry.Gather()
 	ExpectWithOffset(1, err).To(BeNil())
 	mf, found := lo.Find(metrics, func(mf *prometheus.MetricFamily) bool {
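
Since both counters now carry the ReasonLabel/ProvisionerLabel pair, a test can assert on a single series through the FindMetricWithLabelValues helper above. A sketch for use inside one of the repo's existing Ginkgo specs, after provisioning has run: the fully-qualified metric name is assumed to resolve to "karpenter_nodes_created", and the provisioner name "default" is illustrative.

// Assumes a node was provisioned by a provisioner named "default" earlier in
// the spec, and that the counter is registered as "karpenter_nodes_created".
m, found := FindMetricWithLabelValues("karpenter_nodes_created", map[string]string{
	metrics.ReasonLabel:      metrics.ProvisioningReason, // "provisioning"
	metrics.ProvisionerLabel: "default",                  // illustrative provisioner name
})
Expect(found).To(BeTrue())
Expect(m.GetCounter().GetValue()).To(BeNumerically(">=", 1))
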