Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

podgc metrics should count all pod deletion behaviors #118480

Merged
merged 1 commit into from
Jul 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 11 additions & 7 deletions pkg/controller/podgc/gc_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ import (
"k8s.io/client-go/tools/cache"
"k8s.io/client-go/util/workqueue"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/controller/podgc/metrics"
"k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/kubelet/eviction"
nodeutil "k8s.io/kubernetes/pkg/util/node"
Expand Down Expand Up @@ -69,11 +70,6 @@ type PodGCController struct {
quarantineTime time.Duration
}

func init() {
// Register prometheus metrics
RegisterMetrics()
}

func NewPodGC(ctx context.Context, kubeClient clientset.Interface, podInformer coreinformers.PodInformer,
nodeInformer coreinformers.NodeInformer, terminatedPodThreshold int) *PodGCController {
return NewPodGCInternal(ctx, kubeClient, podInformer, nodeInformer, terminatedPodThreshold, gcCheckPeriod, quarantineTime)
Expand All @@ -94,6 +90,8 @@ func NewPodGCInternal(ctx context.Context, kubeClient clientset.Interface, podIn
quarantineTime: quarantineTime,
}

// Register prometheus metrics
metrics.RegisterMetrics()
return gcc
}

Expand Down Expand Up @@ -179,11 +177,11 @@ func (gcc *PodGCController) gcTerminating(ctx context.Context, pods []*v1.Pod) {
wait.Add(1)
go func(pod *v1.Pod) {
defer wait.Done()
deletingPodsTotal.WithLabelValues().Inc()
metrics.DeletingPodsTotal.WithLabelValues(pod.Namespace, metrics.PodGCReasonTerminatingOutOfService).Inc()
if err := gcc.markFailedAndDeletePod(ctx, pod); err != nil {
// ignore not founds
utilruntime.HandleError(err)
deletingPodsErrorTotal.WithLabelValues().Inc()
metrics.DeletingPodsErrorTotal.WithLabelValues(pod.Namespace, metrics.PodGCReasonTerminatingOutOfService).Inc()
}
}(terminatingPods[i])
}
Expand Down Expand Up @@ -216,7 +214,9 @@ func (gcc *PodGCController) gcTerminated(ctx context.Context, pods []*v1.Pod) {
if err := gcc.markFailedAndDeletePod(ctx, pod); err != nil {
// ignore not founds
defer utilruntime.HandleError(err)
metrics.DeletingPodsErrorTotal.WithLabelValues(pod.Namespace, metrics.PodGCReasonTerminated).Inc()
}
metrics.DeletingPodsTotal.WithLabelValues(pod.Namespace, metrics.PodGCReasonTerminated).Inc()
}(terminatedPods[i])
}
wait.Wait()
Expand Down Expand Up @@ -254,9 +254,11 @@ func (gcc *PodGCController) gcOrphaned(ctx context.Context, pods []*v1.Pod, node
WithLastTransitionTime(metav1.Now())
if err := gcc.markFailedAndDeletePodWithCondition(ctx, pod, condition); err != nil {
utilruntime.HandleError(err)
metrics.DeletingPodsErrorTotal.WithLabelValues(pod.Namespace, metrics.PodGCReasonOrphaned).Inc()
} else {
klog.InfoS("Forced deletion of orphaned Pod succeeded", "pod", klog.KObj(pod))
}
metrics.DeletingPodsTotal.WithLabelValues(pod.Namespace, metrics.PodGCReasonOrphaned).Inc()
}
}

Expand Down Expand Up @@ -303,9 +305,11 @@ func (gcc *PodGCController) gcUnscheduledTerminating(ctx context.Context, pods [
klog.V(2).InfoS("Found unscheduled terminating Pod not assigned to any Node, deleting.", "pod", klog.KObj(pod))
if err := gcc.markFailedAndDeletePod(ctx, pod); err != nil {
utilruntime.HandleError(err)
metrics.DeletingPodsErrorTotal.WithLabelValues(pod.Namespace, metrics.PodGCReasonTerminatingUnscheduled).Inc()
} else {
klog.InfoS("Forced deletion of unscheduled terminating Pod succeeded", "pod", klog.KObj(pod))
}
metrics.DeletingPodsTotal.WithLabelValues(pod.Namespace, metrics.PodGCReasonTerminatingUnscheduled).Inc()
}
}

Expand Down
33 changes: 22 additions & 11 deletions pkg/controller/podgc/gc_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ import (
featuregatetesting "k8s.io/component-base/featuregate/testing"
metricstestutil "k8s.io/component-base/metrics/testutil"
"k8s.io/kubernetes/pkg/controller"
"k8s.io/kubernetes/pkg/controller/podgc/metrics"
"k8s.io/kubernetes/pkg/controller/testutil"
"k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/kubelet/eviction"
Expand Down Expand Up @@ -159,7 +160,7 @@ func TestGCTerminated(t *testing.T) {
for _, pod := range test.pods {
creationTime = creationTime.Add(1 * time.Hour)
pods = append(pods, &v1.Pod{
ObjectMeta: metav1.ObjectMeta{Name: pod.name, CreationTimestamp: metav1.Time{Time: creationTime}},
ObjectMeta: metav1.ObjectMeta{Name: pod.name, Namespace: metav1.NamespaceDefault, CreationTimestamp: metav1.Time{Time: creationTime}},
Status: v1.PodStatus{Phase: pod.phase, Reason: pod.reason},
Spec: v1.PodSpec{NodeName: "node"},
})
Expand All @@ -175,12 +176,16 @@ func TestGCTerminated(t *testing.T) {
verifyDeletedAndPatchedPods(t, client, test.deletedPodNames, test.patchedPodNames)
})
}

// testDeletingPodsMetrics is 9 in this test
testDeletingPodsMetrics(t, 9, metrics.PodGCReasonTerminated)
}

func makePod(name string, nodeName string, phase v1.PodPhase) *v1.Pod {
return &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: name,
Name: name,
Namespace: metav1.NamespaceDefault,
},
Spec: v1.PodSpec{NodeName: nodeName},
Status: v1.PodStatus{Phase: phase},
Expand Down Expand Up @@ -406,6 +411,9 @@ func TestGCOrphaned(t *testing.T) {
verifyDeletedAndPatchedPods(t, client, test.deletedPodNames, test.patchedPodNames)
})
}

// testDeletingPodsMetrics is 10 in this test
testDeletingPodsMetrics(t, 10, metrics.PodGCReasonOrphaned)
}

func TestGCUnscheduledTerminating(t *testing.T) {
Expand Down Expand Up @@ -463,7 +471,7 @@ func TestGCUnscheduledTerminating(t *testing.T) {
for _, pod := range test.pods {
creationTime = creationTime.Add(1 * time.Hour)
pods = append(pods, &v1.Pod{
ObjectMeta: metav1.ObjectMeta{Name: pod.name, CreationTimestamp: metav1.Time{Time: creationTime},
ObjectMeta: metav1.ObjectMeta{Name: pod.name, Namespace: metav1.NamespaceDefault, CreationTimestamp: metav1.Time{Time: creationTime},
DeletionTimestamp: pod.deletionTimeStamp},
Status: v1.PodStatus{Phase: pod.phase},
Spec: v1.PodSpec{NodeName: pod.nodeName},
Expand All @@ -486,6 +494,9 @@ func TestGCUnscheduledTerminating(t *testing.T) {
verifyDeletedAndPatchedPods(t, client, test.deletedPodNames, test.patchedPodNames)
})
}

// testDeletingPodsMetrics is 6 in this test
testDeletingPodsMetrics(t, 6, metrics.PodGCReasonTerminatingUnscheduled)
}

func TestGCTerminating(t *testing.T) {
Expand Down Expand Up @@ -633,7 +644,7 @@ func TestGCTerminating(t *testing.T) {
for _, pod := range test.pods {
creationTime = creationTime.Add(1 * time.Hour)
pods = append(pods, &v1.Pod{
ObjectMeta: metav1.ObjectMeta{Name: pod.name, CreationTimestamp: metav1.Time{Time: creationTime},
ObjectMeta: metav1.ObjectMeta{Name: pod.name, Namespace: metav1.NamespaceDefault, CreationTimestamp: metav1.Time{Time: creationTime},
DeletionTimestamp: pod.deletionTimeStamp},
Status: v1.PodStatus{Phase: pod.phase},
Spec: v1.PodSpec{NodeName: pod.nodeName},
Expand All @@ -653,8 +664,8 @@ func TestGCTerminating(t *testing.T) {
verifyDeletedAndPatchedPods(t, client, test.deletedPodNames, test.patchedPodNames)
})
}
// deletingPodsTotal is 7 in this test
testDeletingPodsMetrics(t, 7)
// testDeletingPodsMetrics is 7 in this test
testDeletingPodsMetrics(t, 7, metrics.PodGCReasonTerminatingOutOfService)
}

func verifyDeletedAndPatchedPods(t *testing.T, client *fake.Clientset, wantDeletedPodNames, wantPatchedPodNames sets.String) {
Expand All @@ -669,18 +680,18 @@ func verifyDeletedAndPatchedPods(t *testing.T, client *fake.Clientset, wantDelet
}
}

func testDeletingPodsMetrics(t *testing.T, inputDeletingPodsTotal int) {
func testDeletingPodsMetrics(t *testing.T, total int, reason string) {
t.Helper()

actualDeletingPodsTotal, err := metricstestutil.GetCounterMetricValue(deletingPodsTotal.WithLabelValues())
actualDeletingPodsTotal, err := metricstestutil.GetCounterMetricValue(metrics.DeletingPodsTotal.WithLabelValues(metav1.NamespaceDefault, reason))
if err != nil {
t.Errorf("Error getting actualDeletingPodsTotal")
}
if actualDeletingPodsTotal != float64(inputDeletingPodsTotal) {
t.Errorf("Expected desiredDeletingPodsTotal to be %d, got %v", inputDeletingPodsTotal, actualDeletingPodsTotal)
if actualDeletingPodsTotal != float64(total) {
t.Errorf("Expected desiredDeletingPodsTotal to be %d, got %v", total, actualDeletingPodsTotal)
}

actualDeletingPodsErrorTotal, err := metricstestutil.GetCounterMetricValue(deletingPodsErrorTotal.WithLabelValues())
actualDeletingPodsErrorTotal, err := metricstestutil.GetCounterMetricValue(metrics.DeletingPodsErrorTotal.WithLabelValues("", reason))
if err != nil {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add unit tests for all the other labels?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added.

t.Errorf("Error getting actualDeletingPodsErrorTotal")
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
limitations under the License.
*/

package podgc
package metrics

import (
"sync"
Expand All @@ -28,32 +28,47 @@ const (
)

var (
deletingPodsTotal = metrics.NewCounterVec(
DeletingPodsTotal = metrics.NewCounterVec(
&metrics.CounterOpts{
Subsystem: podGCController,
Name: "force_delete_pods_total",
Help: "Number of pods that are being forcefully deleted since the Pod GC Controller started.",
StabilityLevel: metrics.ALPHA,
},
[]string{},
[]string{"namespace", "reason"},
)
deletingPodsErrorTotal = metrics.NewCounterVec(
DeletingPodsErrorTotal = metrics.NewCounterVec(
&metrics.CounterOpts{
Subsystem: podGCController,
Name: "force_delete_pod_errors_total",
Help: "Number of errors encountered when forcefully deleting the pods since the Pod GC Controller started.",
StabilityLevel: metrics.ALPHA,
},
[]string{},
[]string{"namespace", "reason"},
)
)

const (
	// Possible values for the "reason" label in the above metrics.

	// PodGCReasonTerminated is used when the pod is terminated.
	PodGCReasonTerminated = "terminated"
	// PodGCReasonTerminatingOutOfService is used when the pod is terminating
	// and the corresponding node is not ready and has the
	// `node.kubernetes.io/out-of-service` taint.
	PodGCReasonTerminatingOutOfService = "out-of-service"
	// PodGCReasonOrphaned is used when the pod is orphaned, which means the
	// corresponding node has been deleted.
	PodGCReasonOrphaned = "orphaned"
	// PodGCReasonTerminatingUnscheduled is used when the pod is terminating
	// and was never scheduled to a node.
	PodGCReasonTerminatingUnscheduled = "unscheduled"
)

// registerMetrics guards RegisterMetrics so registration happens at most once
// even when multiple PodGC controllers are constructed.
var registerMetrics sync.Once

// RegisterMetrics registers the Pod GC Controller metrics with the legacy
// registry. It is safe to call from multiple goroutines and repeatedly; the
// sync.Once ensures the counters are registered exactly once.
func RegisterMetrics() {
	registerMetrics.Do(func() {
		legacyregistry.MustRegister(DeletingPodsTotal)
		legacyregistry.MustRegister(DeletingPodsErrorTotal)
	})
}