diff --git a/doc/metrics.md b/doc/metrics.md index a7f55689dd..16f4f42650 100644 --- a/doc/metrics.md +++ b/doc/metrics.md @@ -31,7 +31,7 @@ CDI operator status. Type: Gauge. Progress of volume population. Type: Counter. ### kubevirt_cdi_storageprofile_info -`StorageProfiles` info labels: `storageclass`, `provisioner`, `complete` indicates if all storage profiles recommended PVC settings are complete, `default` indicates if it's the Kubernetes default storage class, `virtdefault` indicates if it's the default virtualization storage class, `rwx` indicates if the storage class supports `ReadWriteMany`, `smartclone` indicates if it supports snapshot or CSI based clone. Type: Gauge. +`StorageProfiles` info labels: `storageclass`, `provisioner`, `complete` indicates if all storage profiles recommended PVC settings are complete, `default` indicates if it's the Kubernetes default storage class, `virtdefault` indicates if it's the default virtualization storage class, `rwx` indicates if the storage class supports `ReadWriteMany`, `smartclone` indicates if it supports snapshot or CSI based clone, `degraded` indicates it is not optimal for virtualization. Type: Gauge. ### kubevirt_cdi_upload_pods_high_restart The number of CDI upload server pods with high restart count. Type: Gauge. diff --git a/pkg/controller/storageprofile-controller.go b/pkg/controller/storageprofile-controller.go index ab0165b9bb..8f318c7179 100644 --- a/pkg/controller/storageprofile-controller.go +++ b/pkg/controller/storageprofile-controller.go @@ -9,6 +9,7 @@ import ( "github.com/go-logr/logr" snapshotv1 "github.com/kubernetes-csi/external-snapshotter/client/v6/apis/volumesnapshot/v1" + ocpconfigv1 "github.com/openshift/api/config/v1" "github.com/prometheus/client_golang/prometheus" v1 "k8s.io/api/core/v1" @@ -48,6 +49,7 @@ const ( counterLabelVirtDefault = "virtdefault" counterLabelRWX = "rwx" counterLabelSmartClone = "smartclone" + counterLabelDegraded = "degraded" ) // StorageProfileReconciler members @@ -296,13 +298,26 @@ func (r *StorageProfileReconciler) computeMetrics(profile *cdiv1.StorageProfile, return err } + isSNO := false + clusterInfra := &ocpconfigv1.Infrastructure{} + if err := r.client.Get(context.TODO(), types.NamespacedName{Name: "cluster"}, clusterInfra); err != nil { + if !meta.IsNoMatchError(err) && !k8serrors.IsNotFound(err) { + return err + } + } else { + isSNO = clusterInfra.Status.ControlPlaneTopology == ocpconfigv1.SingleReplicaTopologyMode && + clusterInfra.Status.InfrastructureTopology == ocpconfigv1.SingleReplicaTopologyMode + } + + isDegraded := (!isSNO && !isRWX) || !isSmartClone + // Setting the labeled Gauge to 1 will not delete older metric, so we need to explicitly delete them scLabels := prometheus.Labels{counterLabelStorageClass: storageClass, counterLabelProvisioner: provisioner} metricsDeleted := metrics.DeleteStorageProfileStatus(scLabels) - scLabels = createLabels(storageClass, provisioner, isComplete, isDefault, isVirtDefault, isRWX, isSmartClone) + scLabels = createLabels(storageClass, provisioner, isComplete, isDefault, isVirtDefault, isRWX, isSmartClone, isDegraded) metrics.SetStorageProfileStatus(scLabels, 1) - r.log.Info(fmt.Sprintf("Set metric:%s complete:%t default:%t vdefault:%t rwx:%t smartclone:%t (deleted %d)", - storageClass, isComplete, isDefault, isVirtDefault, isRWX, isSmartClone, metricsDeleted)) + r.log.Info(fmt.Sprintf("Set metric:%s complete:%t default:%t vdefault:%t rwx:%t smartclone:%t degraded:%t (deleted %d)", + storageClass, isComplete, isDefault, isVirtDefault, isRWX, isSmartClone, isDegraded, metricsDeleted)) return nil } @@ -335,7 +350,7 @@ func (r *StorageProfileReconciler) hasSmartClone(sp *cdiv1.StorageProfile) (bool return false, nil } -func createLabels(storageClass, provisioner string, isComplete, isDefault, isVirtDefault, isRWX, isSmartClone bool) prometheus.Labels { +func createLabels(storageClass, provisioner string, isComplete, isDefault, isVirtDefault, isRWX, isSmartClone, isDegraded bool) prometheus.Labels { return prometheus.Labels{ counterLabelStorageClass: storageClass, counterLabelProvisioner: provisioner, @@ -344,6 +359,7 @@ func createLabels(storageClass, provisioner string, isComplete, isDefault, isVir counterLabelVirtDefault: strconv.FormatBool(isVirtDefault), counterLabelRWX: strconv.FormatBool(isRWX), counterLabelSmartClone: strconv.FormatBool(isSmartClone), + counterLabelDegraded: strconv.FormatBool(isDegraded), } } diff --git a/pkg/controller/storageprofile-controller_test.go b/pkg/controller/storageprofile-controller_test.go index ffa8a88add..dd13b3bf6b 100644 --- a/pkg/controller/storageprofile-controller_test.go +++ b/pkg/controller/storageprofile-controller_test.go @@ -25,8 +25,10 @@ import ( . "github.com/onsi/gomega" snapshotv1 "github.com/kubernetes-csi/external-snapshotter/client/v6/apis/volumesnapshot/v1" + ocpconfigv1 "github.com/openshift/api/config/v1" v1 "k8s.io/api/core/v1" + storagev1 "k8s.io/api/storage/v1" extv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -52,6 +54,7 @@ import ( const ( storageClassName = "testSC" snapshotClassName = "testSnapClass" + provisionerName = "testProvisioner" cephProvisioner = "rook-ceph.rbd.csi.ceph.com" ) @@ -471,7 +474,7 @@ var _ = Describe("Storage profile controller reconcile loop", func() { Entry("provisioner that is known to prefer csi clone", "csi-powermax.dellemc.com", cdiv1.CloneStrategyCsiClone, false), ) - DescribeTable("Should set the StorageProfileStatus metric correctly", func(provisioner string, count int) { + DescribeTable("Should set the StorageProfileStatus metric correctly", func(provisioner string, isComplete bool, count int) { storageClass := CreateStorageClassWithProvisioner(storageClassName, map[string]string{AnnDefaultStorageClass: "true"}, map[string]string{}, provisioner) reconciler = createStorageProfileReconciler(storageClass) _, err := reconciler.Reconcile(context.TODO(), reconcile.Request{NamespacedName: types.NamespacedName{Name: storageClassName}}) @@ -484,12 +487,60 @@ var _ = Describe("Storage profile controller reconcile loop", func() { Expect(*sp.Status.StorageClass).To(Equal(storageClassName)) Expect(sp.Status.ClaimPropertySets).To(BeEmpty()) - labels := createLabels(storageClassName, provisioner, false, true, false, false, false) + labels := createLabels(storageClassName, provisioner, isComplete, true, false, false, false, true) Expect(int(metrics.GetStorageProfileStatus(labels))).To(Equal(count)) }, - Entry("Noobaa (not supported)", storagecapabilities.ProvisionerNoobaa, 0), - Entry("Unknown provisioner", "unknown-provisioner", 1), + Entry("Noobaa (not supported)", storagecapabilities.ProvisionerNoobaa, false, 0), + Entry("Noobaa (not supported)", storagecapabilities.ProvisionerNoobaa, true, 1), + Entry("Unknown provisioner", "unknown-provisioner", false, 1), + Entry("Unknown provisioner", "unknown-provisioner", true, 0), ) + + DescribeTable("Should set the StorageProfileStatus degraded state correctly", func(accessMode v1.PersistentVolumeAccessMode, isSNO, isDegraded bool) { + storageClass := CreateStorageClassWithProvisioner(storageClassName, nil, nil, provisionerName) + driver := &storagev1.CSIDriver{ObjectMeta: metav1.ObjectMeta{Name: provisionerName}} + clusterInfra := &ocpconfigv1.Infrastructure{} + if isSNO { + clusterInfra.Name = "cluster" + clusterInfra.Status.ControlPlaneTopology = ocpconfigv1.SingleReplicaTopologyMode + clusterInfra.Status.InfrastructureTopology = ocpconfigv1.SingleReplicaTopologyMode + } + reconciler = createStorageProfileReconciler(storageClass, driver, clusterInfra) + _, err := reconciler.Reconcile(context.TODO(), reconcile.Request{NamespacedName: types.NamespacedName{Name: storageClassName}}) + Expect(err).ToNot(HaveOccurred()) + + sp := &cdiv1.StorageProfile{} + err = reconciler.client.Get(context.TODO(), types.NamespacedName{Name: storageClassName}, sp, &client.GetOptions{}) + Expect(err).ToNot(HaveOccurred()) + Expect(*sp.Status.StorageClass).To(Equal(storageClassName)) + Expect(sp.Status.ClaimPropertySets).To(BeEmpty()) + + sp.Spec.ClaimPropertySets = []cdiv1.ClaimPropertySet{ + { + AccessModes: []v1.PersistentVolumeAccessMode{accessMode}, + VolumeMode: &BlockMode, + }, + } + + cloneStrategy := cdiv1.CloneStrategyCsiClone + sp.Spec.CloneStrategy = &cloneStrategy + err = reconciler.client.Update(context.TODO(), sp, &client.UpdateOptions{}) + Expect(err).ToNot(HaveOccurred()) + + _, err = reconciler.Reconcile(context.TODO(), reconcile.Request{NamespacedName: types.NamespacedName{Name: storageClassName}}) + Expect(err).ToNot(HaveOccurred()) + err = reconciler.client.Get(context.TODO(), types.NamespacedName{Name: storageClassName}, sp, &client.GetOptions{}) + Expect(err).ToNot(HaveOccurred()) + Expect(sp.Status.ClaimPropertySets).ToNot(BeEmpty()) + + labels := createLabels(storageClassName, provisionerName, true, false, false, accessMode == v1.ReadWriteMany, true, isDegraded) + Expect(int(metrics.GetStorageProfileStatus(labels))).To(Equal(1)) + }, + Entry("With RWX, not degraded", v1.ReadWriteMany, false, false), + Entry("Without RWX, degraded", v1.ReadWriteOnce, false, true), + Entry("Without RWX, on SNO, not degraded", v1.ReadWriteOnce, true, false), + ) + }) func createStorageProfileReconciler(objects ...runtime.Object) *StorageProfileReconciler { @@ -508,6 +559,7 @@ func createStorageProfileReconciler(objects ...runtime.Object) *StorageProfileRe _ = cdiv1.AddToScheme(s) _ = snapshotv1.AddToScheme(s) _ = extv1.AddToScheme(s) + _ = ocpconfigv1.Install(s) // Create a fake client to mock API calls. cl := fake.NewClientBuilder().WithScheme(s).WithRuntimeObjects(objs...).Build() diff --git a/pkg/monitoring/metrics/cdi-controller/storageprofile.go b/pkg/monitoring/metrics/cdi-controller/storageprofile.go index 235f2b2e69..74a3adb28a 100644 --- a/pkg/monitoring/metrics/cdi-controller/storageprofile.go +++ b/pkg/monitoring/metrics/cdi-controller/storageprofile.go @@ -14,6 +14,7 @@ const ( counterLabelVirtDefault = "virtdefault" counterLabelRWX = "rwx" counterLabelSmartClone = "smartclone" + counterLabelDegraded = "degraded" ) var ( @@ -30,7 +31,8 @@ var ( "`default` indicates if it's the Kubernetes default storage class, " + "`virtdefault` indicates if it's the default virtualization storage class, " + "`rwx` indicates if the storage class supports `ReadWriteMany`, " + - "`smartclone` indicates if it supports snapshot or CSI based clone", + "`smartclone` indicates if it supports snapshot or CSI based clone, " + + "`degraded` indicates it is not optimal for virtualization", }, []string{ counterLabelStorageClass, @@ -40,6 +42,7 @@ var ( counterLabelVirtDefault, counterLabelRWX, counterLabelSmartClone, + counterLabelDegraded, }, ) ) diff --git a/pkg/monitoring/rules/alerts/operator.go b/pkg/monitoring/rules/alerts/operator.go index b2f7c6e50f..4d1338220a 100644 --- a/pkg/monitoring/rules/alerts/operator.go +++ b/pkg/monitoring/rules/alerts/operator.go @@ -96,8 +96,8 @@ var operatorAlerts = []promv1.Rule{ }, { Alert: "CDIDefaultStorageClassDegraded", - Expr: intstr.FromString(`sum(kubevirt_cdi_storageprofile_info{default="true",rwx="true",smartclone="true"} or on() vector(0)) + - sum(kubevirt_cdi_storageprofile_info{virtdefault="true",rwx="true",smartclone="true"} or on() vector(0)) + + Expr: intstr.FromString(`sum(kubevirt_cdi_storageprofile_info{default="true",degraded="false"} or on() vector(0)) + + sum(kubevirt_cdi_storageprofile_info{virtdefault="true",degraded="false"} or on() vector(0)) + on () (0*(sum(kubevirt_cdi_storageprofile_info{default="true"}) or sum(kubevirt_cdi_storageprofile_info{virtdefault="true"}))) == 0`), For: (*promv1.Duration)(ptr.To("5m")), Annotations: map[string]string{ diff --git a/pkg/operator/resources/cluster/controller.go b/pkg/operator/resources/cluster/controller.go index 277350de96..2760dd2015 100644 --- a/pkg/operator/resources/cluster/controller.go +++ b/pkg/operator/resources/cluster/controller.go @@ -146,6 +146,7 @@ func getControllerClusterPolicyRules() []rbacv1.PolicyRule { }, Resources: []string{ "proxies", + "infrastructures", }, Verbs: []string{ "get",