Skip to content

Commit

Permalink
Suppress CDIDefaultStorageClassDegraded on SNO
Browse files Browse the repository at this point in the history
On single-node OpenShift, even if none of the default/virt default
storage classes supports `ReadWriteMany` (but supports smart clone),
we will not fire the `CDIDefaultStorageClassDegraded` alert.
We added `degraded` label to `kubevirt_cdi_storageprofile_info` to
simplify the alert expression.

Signed-off-by: Arnon Gilboa <agilboa@redhat.com>
  • Loading branch information
arnongilboa committed Jun 9, 2024
1 parent f3d0060 commit 3d99cfb
Show file tree
Hide file tree
Showing 6 changed files with 84 additions and 12 deletions.
2 changes: 1 addition & 1 deletion doc/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ CDI operator status. Type: Gauge.
Progress of volume population. Type: Counter.

### kubevirt_cdi_storageprofile_info
`StorageProfiles` info labels: `storageclass`, `provisioner`, `complete` indicates if all storage profiles recommended PVC settings are complete, `default` indicates if it's the Kubernetes default storage class, `virtdefault` indicates if it's the default virtualization storage class, `rwx` indicates if the storage class supports `ReadWriteMany`, `smartclone` indicates if it supports snapshot or CSI based clone. Type: Gauge.
`StorageProfiles` info labels: `storageclass`, `provisioner`, `complete` indicates if all storage profiles recommended PVC settings are complete, `default` indicates if it's the Kubernetes default storage class, `virtdefault` indicates if it's the default virtualization storage class, `rwx` indicates if the storage class supports `ReadWriteMany`, `smartclone` indicates if it supports snapshot or CSI based clone, `degraded` indicates it is not optimal for virtualization. Type: Gauge.

### kubevirt_cdi_upload_pods_high_restart
The number of CDI upload server pods with high restart count. Type: Gauge.
Expand Down
24 changes: 20 additions & 4 deletions pkg/controller/storageprofile-controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (

"github.com/go-logr/logr"
snapshotv1 "github.com/kubernetes-csi/external-snapshotter/client/v6/apis/volumesnapshot/v1"
ocpconfigv1 "github.com/openshift/api/config/v1"
"github.com/prometheus/client_golang/prometheus"

v1 "k8s.io/api/core/v1"
Expand Down Expand Up @@ -48,6 +49,7 @@ const (
counterLabelVirtDefault = "virtdefault"
counterLabelRWX = "rwx"
counterLabelSmartClone = "smartclone"
counterLabelDegraded = "degraded"
)

// StorageProfileReconciler members
Expand Down Expand Up @@ -296,13 +298,26 @@ func (r *StorageProfileReconciler) computeMetrics(profile *cdiv1.StorageProfile,
return err
}

isSNO := false
clusterInfra := &ocpconfigv1.Infrastructure{}
if err := r.client.Get(context.TODO(), types.NamespacedName{Name: "cluster"}, clusterInfra); err != nil {
if !meta.IsNoMatchError(err) && !k8serrors.IsNotFound(err) {
return err
}
} else {
isSNO = clusterInfra.Status.ControlPlaneTopology == ocpconfigv1.SingleReplicaTopologyMode &&
clusterInfra.Status.InfrastructureTopology == ocpconfigv1.SingleReplicaTopologyMode
}

isDegraded := (!isSNO && !isRWX) || !isSmartClone

// Setting the labeled Gauge to 1 will not delete older metric, so we need to explicitly delete them
scLabels := prometheus.Labels{counterLabelStorageClass: storageClass, counterLabelProvisioner: provisioner}
metricsDeleted := metrics.DeleteStorageProfileStatus(scLabels)
scLabels = createLabels(storageClass, provisioner, isComplete, isDefault, isVirtDefault, isRWX, isSmartClone)
scLabels = createLabels(storageClass, provisioner, isComplete, isDefault, isVirtDefault, isRWX, isSmartClone, isDegraded)
metrics.SetStorageProfileStatus(scLabels, 1)
r.log.Info(fmt.Sprintf("Set metric:%s complete:%t default:%t vdefault:%t rwx:%t smartclone:%t (deleted %d)",
storageClass, isComplete, isDefault, isVirtDefault, isRWX, isSmartClone, metricsDeleted))
r.log.Info(fmt.Sprintf("Set metric:%s complete:%t default:%t vdefault:%t rwx:%t smartclone:%t degraded:%t (deleted %d)",
storageClass, isComplete, isDefault, isVirtDefault, isRWX, isSmartClone, isDegraded, metricsDeleted))

return nil
}
Expand Down Expand Up @@ -335,7 +350,7 @@ func (r *StorageProfileReconciler) hasSmartClone(sp *cdiv1.StorageProfile) (bool
return false, nil
}

func createLabels(storageClass, provisioner string, isComplete, isDefault, isVirtDefault, isRWX, isSmartClone bool) prometheus.Labels {
func createLabels(storageClass, provisioner string, isComplete, isDefault, isVirtDefault, isRWX, isSmartClone, isDegraded bool) prometheus.Labels {
return prometheus.Labels{
counterLabelStorageClass: storageClass,
counterLabelProvisioner: provisioner,
Expand All @@ -344,6 +359,7 @@ func createLabels(storageClass, provisioner string, isComplete, isDefault, isVir
counterLabelVirtDefault: strconv.FormatBool(isVirtDefault),
counterLabelRWX: strconv.FormatBool(isRWX),
counterLabelSmartClone: strconv.FormatBool(isSmartClone),
counterLabelDegraded: strconv.FormatBool(isDegraded),
}
}

Expand Down
60 changes: 56 additions & 4 deletions pkg/controller/storageprofile-controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,10 @@ import (
. "github.com/onsi/gomega"

snapshotv1 "github.com/kubernetes-csi/external-snapshotter/client/v6/apis/volumesnapshot/v1"
ocpconfigv1 "github.com/openshift/api/config/v1"

v1 "k8s.io/api/core/v1"
storagev1 "k8s.io/api/storage/v1"
extv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand All @@ -52,6 +54,7 @@ import (
const (
storageClassName = "testSC"
snapshotClassName = "testSnapClass"
provisionerName = "testProvisioner"
cephProvisioner = "rook-ceph.rbd.csi.ceph.com"
)

Expand Down Expand Up @@ -471,7 +474,7 @@ var _ = Describe("Storage profile controller reconcile loop", func() {
Entry("provisioner that is known to prefer csi clone", "csi-powermax.dellemc.com", cdiv1.CloneStrategyCsiClone, false),
)

DescribeTable("Should set the StorageProfileStatus metric correctly", func(provisioner string, count int) {
DescribeTable("Should set the StorageProfileStatus metric correctly", func(provisioner string, isComplete bool, count int) {
storageClass := CreateStorageClassWithProvisioner(storageClassName, map[string]string{AnnDefaultStorageClass: "true"}, map[string]string{}, provisioner)
reconciler = createStorageProfileReconciler(storageClass)
_, err := reconciler.Reconcile(context.TODO(), reconcile.Request{NamespacedName: types.NamespacedName{Name: storageClassName}})
Expand All @@ -484,12 +487,60 @@ var _ = Describe("Storage profile controller reconcile loop", func() {
Expect(*sp.Status.StorageClass).To(Equal(storageClassName))
Expect(sp.Status.ClaimPropertySets).To(BeEmpty())

labels := createLabels(storageClassName, provisioner, false, true, false, false, false)
labels := createLabels(storageClassName, provisioner, isComplete, true, false, false, false, true)
Expect(int(metrics.GetStorageProfileStatus(labels))).To(Equal(count))
},
Entry("Noobaa (not supported)", storagecapabilities.ProvisionerNoobaa, 0),
Entry("Unknown provisioner", "unknown-provisioner", 1),
Entry("Noobaa (not supported)", storagecapabilities.ProvisionerNoobaa, false, 0),
Entry("Noobaa (not supported)", storagecapabilities.ProvisionerNoobaa, true, 1),
Entry("Unknown provisioner", "unknown-provisioner", false, 1),
Entry("Unknown provisioner", "unknown-provisioner", true, 0),
)

DescribeTable("Should set the StorageProfileStatus degraded state correctly", func(accessMode v1.PersistentVolumeAccessMode, isSNO, isDegraded bool) {
storageClass := CreateStorageClassWithProvisioner(storageClassName, nil, nil, provisionerName)
driver := &storagev1.CSIDriver{ObjectMeta: metav1.ObjectMeta{Name: provisionerName}}
clusterInfra := &ocpconfigv1.Infrastructure{}
if isSNO {
clusterInfra.Name = "cluster"
clusterInfra.Status.ControlPlaneTopology = ocpconfigv1.SingleReplicaTopologyMode
clusterInfra.Status.InfrastructureTopology = ocpconfigv1.SingleReplicaTopologyMode
}
reconciler = createStorageProfileReconciler(storageClass, driver, clusterInfra)
_, err := reconciler.Reconcile(context.TODO(), reconcile.Request{NamespacedName: types.NamespacedName{Name: storageClassName}})
Expect(err).ToNot(HaveOccurred())

sp := &cdiv1.StorageProfile{}
err = reconciler.client.Get(context.TODO(), types.NamespacedName{Name: storageClassName}, sp, &client.GetOptions{})
Expect(err).ToNot(HaveOccurred())
Expect(*sp.Status.StorageClass).To(Equal(storageClassName))
Expect(sp.Status.ClaimPropertySets).To(BeEmpty())

sp.Spec.ClaimPropertySets = []cdiv1.ClaimPropertySet{
{
AccessModes: []v1.PersistentVolumeAccessMode{accessMode},
VolumeMode: &BlockMode,
},
}

cloneStrategy := cdiv1.CloneStrategyCsiClone
sp.Spec.CloneStrategy = &cloneStrategy
err = reconciler.client.Update(context.TODO(), sp, &client.UpdateOptions{})
Expect(err).ToNot(HaveOccurred())

_, err = reconciler.Reconcile(context.TODO(), reconcile.Request{NamespacedName: types.NamespacedName{Name: storageClassName}})
Expect(err).ToNot(HaveOccurred())
err = reconciler.client.Get(context.TODO(), types.NamespacedName{Name: storageClassName}, sp, &client.GetOptions{})
Expect(err).ToNot(HaveOccurred())
Expect(sp.Status.ClaimPropertySets).ToNot(BeEmpty())

labels := createLabels(storageClassName, provisionerName, true, false, false, accessMode == v1.ReadWriteMany, true, isDegraded)
Expect(int(metrics.GetStorageProfileStatus(labels))).To(Equal(1))
},
Entry("With RWX, not degraded", v1.ReadWriteMany, false, false),
Entry("Without RWX, degraded", v1.ReadWriteOnce, false, true),
Entry("Without RWX, on SNO, not degraded", v1.ReadWriteOnce, true, false),
)

})

func createStorageProfileReconciler(objects ...runtime.Object) *StorageProfileReconciler {
Expand All @@ -508,6 +559,7 @@ func createStorageProfileReconciler(objects ...runtime.Object) *StorageProfileRe
_ = cdiv1.AddToScheme(s)
_ = snapshotv1.AddToScheme(s)
_ = extv1.AddToScheme(s)
_ = ocpconfigv1.Install(s)

// Create a fake client to mock API calls.
cl := fake.NewClientBuilder().WithScheme(s).WithRuntimeObjects(objs...).Build()
Expand Down
5 changes: 4 additions & 1 deletion pkg/monitoring/metrics/cdi-controller/storageprofile.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ const (
counterLabelVirtDefault = "virtdefault"
counterLabelRWX = "rwx"
counterLabelSmartClone = "smartclone"
counterLabelDegraded = "degraded"
)

var (
Expand All @@ -30,7 +31,8 @@ var (
"`default` indicates if it's the Kubernetes default storage class, " +
"`virtdefault` indicates if it's the default virtualization storage class, " +
"`rwx` indicates if the storage class supports `ReadWriteMany`, " +
"`smartclone` indicates if it supports snapshot or CSI based clone",
"`smartclone` indicates if it supports snapshot or CSI based clone, " +
"`degraded` indicates it is not optimal for virtualization",
},
[]string{
counterLabelStorageClass,
Expand All @@ -40,6 +42,7 @@ var (
counterLabelVirtDefault,
counterLabelRWX,
counterLabelSmartClone,
counterLabelDegraded,
},
)
)
Expand Down
4 changes: 2 additions & 2 deletions pkg/monitoring/rules/alerts/operator.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,8 +96,8 @@ var operatorAlerts = []promv1.Rule{
},
{
Alert: "CDIDefaultStorageClassDegraded",
Expr: intstr.FromString(`sum(kubevirt_cdi_storageprofile_info{default="true",rwx="true",smartclone="true"} or on() vector(0)) +
sum(kubevirt_cdi_storageprofile_info{virtdefault="true",rwx="true",smartclone="true"} or on() vector(0)) +
Expr: intstr.FromString(`sum(kubevirt_cdi_storageprofile_info{default="true",degraded="false"} or on() vector(0)) +
sum(kubevirt_cdi_storageprofile_info{virtdefault="true",degraded="false"} or on() vector(0)) +
on () (0*(sum(kubevirt_cdi_storageprofile_info{default="true"}) or sum(kubevirt_cdi_storageprofile_info{virtdefault="true"}))) == 0`),
For: (*promv1.Duration)(ptr.To("5m")),
Annotations: map[string]string{
Expand Down
1 change: 1 addition & 0 deletions pkg/operator/resources/cluster/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,7 @@ func getControllerClusterPolicyRules() []rbacv1.PolicyRule {
},
Resources: []string{
"proxies",
"infrastructures",
},
Verbs: []string{
"get",
Expand Down

0 comments on commit 3d99cfb

Please sign in to comment.