feat: provisioner label for metrics for created and terminated nodes (#210)

* per-provisioner variants of metrics for created and terminated nodes

* adding "provisioner" label to the existing metrics for created and terminated nodes

* removing redundant parts of metric descriptions

Co-authored-by: Jonathan Innis <jonathan.innis.ji@gmail.com>

* bringing back samber/lo

* addressing review comments

* addressing review comments

* addressing review comments

* addressing review comments

* addressing review comments

---------

Co-authored-by: Jonathan Innis <jonathan.innis.ji@gmail.com>
wkaczynski and jonathan-innis committed Feb 22, 2023
1 parent 0613466 commit c4269ec
Showing 6 changed files with 36 additions and 23 deletions.
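In effect, the two shared node counters gain a provisioner dimension alongside the existing reason. A rough sketch of the exposed series after this change (assuming the published karpenter_nodes_* metric names; the provisioner name, reason values, and counts below are hypothetical):

karpenter_nodes_created{provisioner="default",reason="provisioning"} 12
karpenter_nodes_created{provisioner="default",reason="consolidation/replace"} 3
karpenter_nodes_terminated{provisioner="default",reason="consolidation/delete"} 5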
14 changes: 9 additions & 5 deletions pkg/controllers/deprovisioning/controller.go
@@ -144,8 +144,9 @@ func (c *Controller) executeCommand(ctx context.Context, d Deprovisioner, comman
deprovisioningActionsPerformedCounter.With(prometheus.Labels{"action": fmt.Sprintf("%s/%s", d, command.action)}).Add(1)
logging.FromContext(ctx).Infof("deprovisioning via %s %s", d, command)

reason := fmt.Sprintf("%s/%s", d, command.action)
if command.action == actionReplace {
if err := c.launchReplacementNodes(ctx, command); err != nil {
if err := c.launchReplacementNodes(ctx, command, reason); err != nil {
// If we failed to launch the replacement, don't deprovision. If this is some permanent failure,
// we don't want to disrupt workloads with no way to provision new nodes for them.
return fmt.Errorf("launching replacement node, %w", err)
@@ -157,7 +158,10 @@ func (c *Controller) executeCommand(ctx context.Context, d Deprovisioner, comman
if err := c.kubeClient.Delete(ctx, oldNode); client.IgnoreNotFound(err) != nil {
logging.FromContext(ctx).Errorf("Deleting node, %s", err)
} else {
metrics.NodesTerminatedCounter.WithLabelValues(fmt.Sprintf("%s/%s", d, command.action)).Inc()
metrics.NodesTerminatedCounter.With(prometheus.Labels{
metrics.ReasonLabel: reason,
metrics.ProvisionerLabel: oldNode.Labels[v1alpha5.ProvisionerNameLabelKey],
}).Inc()
}
}

@@ -195,25 +199,25 @@ func (c *Controller) waitForDeletion(ctx context.Context, node *v1.Node) {

// launchReplacementNodes launches replacement nodes and blocks until it is ready
// nolint:gocyclo
func (c *Controller) launchReplacementNodes(ctx context.Context, action Command) error {
func (c *Controller) launchReplacementNodes(ctx context.Context, action Command, reason string) error {
defer metrics.Measure(deprovisioningReplacementNodeInitializedHistogram)()
nodeNamesToRemove := lo.Map(action.nodesToRemove, func(n *v1.Node, _ int) string { return n.Name })
// cordon the old nodes before we launch the replacements to prevent new pods from scheduling to the old nodes
if err := c.setNodesUnschedulable(ctx, true, nodeNamesToRemove...); err != nil {
return fmt.Errorf("cordoning nodes, %w", err)
}

nodeNames, err := c.provisioner.LaunchMachines(ctx, action.replacementNodes)
nodeNames, err := c.provisioner.LaunchMachines(ctx, action.replacementNodes, provisioning.WithReason(reason))
if err != nil {
// uncordon the nodes as the launch may fail (e.g. ICE or incompatible AMI)
err = multierr.Append(err, c.setNodesUnschedulable(ctx, false, nodeNamesToRemove...))
return err
}

if len(nodeNames) != len(action.replacementNodes) {
// shouldn't ever occur since a partially failed LaunchMachines should return an error
return fmt.Errorf("expected %d node names, got %d", len(action.replacementNodes), len(nodeNames))
}
metrics.NodesCreatedCounter.WithLabelValues(metrics.DeprovisioningReason).Add(float64(len(nodeNames)))

// We have the new nodes created at the API server so mark the old nodes for deletion
c.cluster.MarkForDeletion(nodeNamesToRemove...)
6 changes: 1 addition & 5 deletions pkg/controllers/metrics/state/scraper/node.go
@@ -219,11 +219,7 @@ func (ns *NodeScraper) getNodeLabels(node *v1.Node, resourceTypeName string) pro
metricLabels := prometheus.Labels{}
metricLabels[resourceType] = resourceTypeName
metricLabels[nodeName] = node.GetName()
if provisionerName, ok := node.Labels[v1alpha5.ProvisionerNameLabelKey]; !ok {
metricLabels[nodeProvisioner] = "N/A"
} else {
metricLabels[nodeProvisioner] = provisionerName
}
metricLabels[nodeProvisioner] = node.Labels[v1alpha5.ProvisionerNameLabelKey]
metricLabels[nodePhase] = string(node.Status.Phase)

// Populate well known labels
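The dropped ok branch in getNodeLabels relies on Go's zero-value map lookup: indexing a map with a missing key returns the value type's zero value, so a node without the well-known karpenter.sh/provisioner-name label now reports an empty provisioner value instead of the previous "N/A" sentinel. A minimal standalone illustration of that behavior:

package main

import "fmt"

func main() {
	labels := map[string]string{"kubernetes.io/os": "linux"}
	// Indexing a map with a missing key yields the zero value ("" for string),
	// so the scraper can assign the lookup result directly without the ok-check.
	fmt.Printf("%q\n", labels["karpenter.sh/provisioner-name"]) // prints ""
}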
21 changes: 15 additions & 6 deletions pkg/controllers/provisioning/provisioner.go
@@ -55,6 +55,7 @@ import (
// actions and configuration during scheduling
type LaunchOptions struct {
RecordPodNomination bool
Reason string
}

// RecordPodNomination causes nominate pod events to be recorded against the node.
@@ -63,6 +64,13 @@ func RecordPodNomination(o LaunchOptions) LaunchOptions {
return o
}

func WithReason(reason string) func(LaunchOptions) LaunchOptions {
return func(o LaunchOptions) LaunchOptions {
o.Reason = reason
return o
}
}

// Provisioner waits for enqueued pods, batches them, creates capacity and binds the pods to the capacity.
type Provisioner struct {
cloudProvider cloudprovider.CloudProvider
@@ -116,11 +124,7 @@ func (p *Provisioner) Reconcile(ctx context.Context, _ reconcile.Request) (resul
if len(machines) == 0 {
return reconcile.Result{}, nil
}
machineNames, err := p.LaunchMachines(ctx, machines, RecordPodNomination)

// Any successfully created node is going to have the nodeName value filled in the slice
successfullyCreatedNodeCount := lo.CountBy(machineNames, func(name string) bool { return name != "" })
metrics.NodesCreatedCounter.WithLabelValues(metrics.ProvisioningReason).Add(float64(successfullyCreatedNodeCount))
_, err = p.LaunchMachines(ctx, machines, WithReason(metrics.ProvisioningReason), RecordPodNomination)

return reconcile.Result{}, err
}
@@ -351,8 +355,13 @@ func (p *Provisioner) Launch(ctx context.Context, machine *scheduler.Machine, op
if err := p.cluster.UpdateNode(ctx, k8sNode); err != nil {
return "", fmt.Errorf("updating cluster state, %w", err)
}
launchOpts := functional.ResolveOptions(opts...)
metrics.NodesCreatedCounter.With(prometheus.Labels{
metrics.ReasonLabel: launchOpts.Reason,
metrics.ProvisionerLabel: k8sNode.Labels[v1alpha5.ProvisionerNameLabelKey],
}).Inc()
p.cluster.NominateNodeForPod(ctx, k8sNode.Name)
if functional.ResolveOptions(opts...).RecordPodNomination {
if launchOpts.RecordPodNomination {
for _, pod := range machine.Pods {
p.recorder.Publish(events.NominatePod(pod, k8sNode))
}
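WithReason follows the same functional-options pattern as RecordPodNomination: every option is a func(LaunchOptions) LaunchOptions, and the launch path folds the options over a zero-valued struct before reading them. A self-contained sketch of that pattern (the resolve helper below is an illustrative stand-in for functional.ResolveOptions, which may differ in detail):

package main

import "fmt"

// LaunchOptions mirrors the struct in the diff above.
type LaunchOptions struct {
	RecordPodNomination bool
	Reason              string
}

// WithReason returns an option that stamps the launch with a reason value,
// later used as the "reason" label on the created-nodes counter.
func WithReason(reason string) func(LaunchOptions) LaunchOptions {
	return func(o LaunchOptions) LaunchOptions {
		o.Reason = reason
		return o
	}
}

// RecordPodNomination causes nominate-pod events to be recorded against the node.
func RecordPodNomination(o LaunchOptions) LaunchOptions {
	o.RecordPodNomination = true
	return o
}

// resolve folds the options over a zero value, so later options win on
// conflicting fields; an illustrative stand-in for functional.ResolveOptions.
func resolve(opts ...func(LaunchOptions) LaunchOptions) LaunchOptions {
	o := LaunchOptions{}
	for _, opt := range opts {
		o = opt(o)
	}
	return o
}

func main() {
	o := resolve(WithReason("provisioning"), RecordPodNomination)
	fmt.Printf("%+v\n", o) // {RecordPodNomination:true Reason:provisioning}
}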
1 change: 1 addition & 0 deletions pkg/metrics/constants.go
@@ -26,6 +26,7 @@ const (

ErrorLabel = "error"
ProvisionerLabel = "provisioner"
ReasonLabel = "reason"

// Reasons for CREATE/DELETE shared metrics
DeprovisioningReason = "deprovisioning"
10 changes: 6 additions & 4 deletions pkg/metrics/metrics.go
@@ -30,21 +30,23 @@ var (
Namespace: Namespace,
Subsystem: nodeSubsystem,
Name: "created",
Help: "Number of nodes created in total by Karpenter. Labeled by reason the node was created.",
Help: "Number of nodes created in total by Karpenter. Labeled by reason the node was created and the owning provisioner.",
},
[]string{
"reason",
ReasonLabel,
ProvisionerLabel,
},
)
NodesTerminatedCounter = prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: Namespace,
Subsystem: nodeSubsystem,
Name: "terminated",
Help: "Number of nodes terminated in total by Karpenter. Labeled by reason the node was terminated.",
Help: "Number of nodes terminated in total by Karpenter. Labeled by reason the node was terminated and the owning provisioner.",
},
[]string{
"reason",
ReasonLabel,
ProvisionerLabel,
},
)
MachinesCreatedCounter = prometheus.NewCounterVec(
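One consequence of adding a label to a CounterVec is that every observation must now supply both label values, since With panics on a missing label; that is why the launch and deprovisioning paths always pass a reason and read the provisioner name off the node. A minimal client_golang sketch of the same shape (the "karpenter" namespace value and registration against the controller-runtime registry are assumptions for this sketch):

package main

import (
	"github.com/prometheus/client_golang/prometheus"
	crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics"
)

// nodesCreated mirrors the shape of NodesCreatedCounter above; the metric name
// resolves to karpenter_nodes_created once namespace and subsystem are joined.
var nodesCreated = prometheus.NewCounterVec(
	prometheus.CounterOpts{
		Namespace: "karpenter",
		Subsystem: "nodes",
		Name:      "created",
		Help:      "Number of nodes created in total by Karpenter. Labeled by reason the node was created and the owning provisioner.",
	},
	[]string{"reason", "provisioner"},
)

func init() {
	// Register with the controller-runtime registry so the series are exposed
	// alongside the other controller metrics.
	crmetrics.Registry.MustRegister(nodesCreated)
}

func recordCreated(reason, provisioner string) {
	// With requires a value for every declared label; omitting one panics,
	// so callers always provide both a reason and a provisioner.
	nodesCreated.With(prometheus.Labels{
		"reason":      reason,
		"provisioner": provisioner,
	}).Inc()
}

func main() {
	recordCreated("provisioning", "default") // values illustrative
}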
7 changes: 4 additions & 3 deletions pkg/test/expectations/expectations.go
@@ -40,13 +40,14 @@ import (
"knative.dev/pkg/ptr"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
"sigs.k8s.io/controller-runtime/pkg/metrics"
crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics"
"sigs.k8s.io/controller-runtime/pkg/reconcile"

"github.com/aws/karpenter-core/pkg/apis/v1alpha5"
"github.com/aws/karpenter-core/pkg/controllers/provisioning"
"github.com/aws/karpenter-core/pkg/controllers/provisioning/scheduling"
"github.com/aws/karpenter-core/pkg/controllers/state"
"github.com/aws/karpenter-core/pkg/metrics"
"github.com/aws/karpenter-core/pkg/operator/injection"
)

@@ -250,7 +251,7 @@ func ExpectProvisionedNoBindingWithOffset(offset int, ctx context.Context, c cli
for _, m := range machines {
ctx = injection.WithNamespacedName(ctx, types.NamespacedName{Name: m.Labels[v1alpha5.ProvisionerNameLabelKey]})
// TODO: Check the error on the provisioner launch
name, err := provisioner.Launch(ctx, m)
name, err := provisioner.Launch(ctx, m, provisioning.WithReason(metrics.ProvisioningReason))
if err != nil {
return bindings
}
Expand Down Expand Up @@ -297,7 +298,7 @@ func ExpectOwnerReferenceExists(obj, owner client.Object) metav1.OwnerReference
// FindMetricWithLabelValues attempts to find a metric with a name with a set of label values
// If no metric is found, the *prometheus.Metric will be nil
func FindMetricWithLabelValues(name string, labelValues map[string]string) (*prometheus.Metric, bool) {
metrics, err := metrics.Registry.Gather()
metrics, err := crmetrics.Registry.Gather()
ExpectWithOffset(1, err).To(BeNil())

mf, found := lo.Find(metrics, func(mf *prometheus.MetricFamily) bool {
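In tests, the extra label surfaces through assertions built on FindMetricWithLabelValues, which matches a gathered metric only if every supplied label pair is present. A usage sketch (a fragment meant for a Ginkgo/Gomega test body; the metric name and label values are assumptions based on the definitions above):

// Inside a test body, after a provisioning reconcile has run:
m, ok := FindMetricWithLabelValues("karpenter_nodes_created", map[string]string{
	"reason":      "provisioning",
	"provisioner": "default", // name of the test Provisioner, illustrative
})
Expect(ok).To(BeTrue())
Expect(m.GetCounter().GetValue()).To(BeNumerically(">=", 1))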
