Skip to content

Commit

Permalink
[DCA] Introduce use_component_status for kube_apiserver check (#8577)
Browse files Browse the repository at this point in the history
Previously, the kube_apiserver_controlplane used ComponentStatus to
report control plane components' liveness. This has been deprecated in
[Kubernetes 1.19](kubernetes/kubernetes#93570)
and will be removed at some point in the future.

To remediate that, we're following the recommendation in the deprecation
notice to use the API Server's health endpoint instead. This change also
removes the `component` tag from this service check, as it no longer
reports separate components, only the API server itself.
Per-component service checks will eventually be available through the
kube_controller_manager and kube_scheduler checks themselves.
  • Loading branch information
juliogreff authored Jul 8, 2021
1 parent 305341b commit 94d950e
Show file tree
Hide file tree
Showing 3 changed files with 71 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ type KubeASConfig struct {
MaxEventCollection int `yaml:"max_events_per_run"`
LeaderSkip bool `yaml:"skip_leader_election"`
ResyncPeriodEvents int `yaml:"kubernetes_event_resync_period_s"`
UseComponentStatus bool `yaml:"use_component_status"`
}

// EventC holds the information pertaining to which event we collected last and when we last re-synced.
Expand All @@ -77,6 +78,7 @@ func (c *KubeASConfig) parse(data []byte) error {
c.CollectEvent = config.Datadog.GetBool("collect_kubernetes_events")
c.CollectOShiftQuotas = true
c.ResyncPeriodEvents = defaultResyncPeriodInSecond
c.UseComponentStatus = true

return yaml.Unmarshal(data, c)
}
Expand Down Expand Up @@ -181,13 +183,15 @@ func (k *KubeASCheck) Run() error {
}

// Running the Control Plane status check.
componentsStatus, err := k.ac.ComponentStatuses()
if err != nil {
k.Warnf("Could not retrieve the status from the control plane's components %s", err.Error()) //nolint:errcheck
if k.instance.UseComponentStatus {
err = k.componentStatusCheck(sender)
if err != nil {
k.Warnf("Could not collect control plane status from ComponentStatus: %s", err.Error()) //nolint:errcheck
}
} else {
err = k.parseComponentStatus(sender, componentsStatus)
err = k.controlPlaneHealthCheck(context.TODO(), sender)
if err != nil {
k.Warnf("Could not collect API Server component status: %s", err.Error()) //nolint:errcheck
k.Warnf("Could not collect control plane status from health checks: %s", err.Error()) //nolint:errcheck
}
}

Expand Down Expand Up @@ -251,15 +255,14 @@ func (k *KubeASCheck) eventCollectionCheck() (newEvents []*v1.Event, err error)

func (k *KubeASCheck) parseComponentStatus(sender aggregator.Sender, componentsStatus *v1.ComponentStatusList) error {
for _, component := range componentsStatus.Items {

if component.ObjectMeta.Name == "" {
return errors.New("metadata structure has changed. Not collecting API Server's Components status")
}
if component.Conditions == nil || component.Name == "" {
log.Debug("API Server component's structure is not expected")
continue
}
tagComp := []string{fmt.Sprintf("component:%s", component.Name)}

for _, condition := range component.Conditions {
statusCheck := metrics.ServiceCheckUnknown
message := ""
Expand All @@ -269,6 +272,7 @@ func (k *KubeASCheck) parseComponentStatus(sender aggregator.Sender, componentsS
log.Debugf("Condition %q not supported", condition.Type)
continue
}

// We only expect True, False and Unknown (default).
switch condition.Status {
case "True":
Expand All @@ -277,8 +281,13 @@ func (k *KubeASCheck) parseComponentStatus(sender aggregator.Sender, componentsS
case "False":
statusCheck = metrics.ServiceCheckCritical
message = condition.Error
if message == "" {
message = condition.Message
}
}
sender.ServiceCheck(KubeControlPaneCheck, statusCheck, "", tagComp, message)

tags := []string{fmt.Sprintf("component:%s", component.Name)}
sender.ServiceCheck(KubeControlPaneCheck, statusCheck, "", tags, message)
}
}
return nil
Expand Down Expand Up @@ -316,6 +325,40 @@ func (k *KubeASCheck) processEvents(sender aggregator.Sender, events []*v1.Event
return nil
}

// componentStatusCheck fetches the ComponentStatus list through the API
// client and passes it to parseComponentStatus, which submits the service
// checks on sender. Any error from the fetch or from the parsing step is
// returned unchanged to the caller.
func (k *KubeASCheck) componentStatusCheck(sender aggregator.Sender) error {
	statuses, err := k.ac.ComponentStatuses()
	if err == nil {
		return k.parseComponentStatus(sender, statuses)
	}

	return err
}

// controlPlaneHealthCheck asks the API client whether the API Server is ready
// and submits a single, untagged KubeControlPaneCheck service check on sender:
//   - OK with message "ok" when the server reports ready;
//   - Critical with the error text when the readiness call returned an error;
//   - Critical with "unknown error" when it is not ready and no error was given.
//
// The readiness error is carried only in the service check message; this
// method itself always returns nil.
func (k *KubeASCheck) controlPlaneHealthCheck(ctx context.Context, sender aggregator.Sender) error {
	ready, err := k.ac.IsAPIServerReady(ctx)

	var (
		status metrics.ServiceCheckStatus
		msg    string
	)

	switch {
	case ready:
		status, msg = metrics.ServiceCheckOK, "ok"
	case err != nil:
		status, msg = metrics.ServiceCheckCritical, err.Error()
	default:
		status, msg = metrics.ServiceCheckCritical, "unknown error"
	}

	sender.ServiceCheck(KubeControlPaneCheck, status, "", nil, msg)

	return nil
}

// bundleID generates a unique ID to separate k8s events
// based on their InvolvedObject UIDs and event Types
func bundleID(e *v1.Event) string {
Expand Down
8 changes: 8 additions & 0 deletions pkg/util/kubernetes/apiserver/apiserver.go
Original file line number Diff line number Diff line change
Expand Up @@ -608,6 +608,14 @@ func (c *APIClient) GetRESTObject(path string, output runtime.Object) error {
return result.Into(output)
}

// IsAPIServerReady reports whether the API Server is ready by issuing a GET
// request on its "/readyz" path through the discovery REST client.
// It returns (true, nil) when the request completes without error, and
// (false, err) otherwise; the response body is discarded.
func (c *APIClient) IsAPIServerReady(ctx context.Context) (bool, error) {
	const readyzPath = "/readyz"

	if _, err := c.Cl.Discovery().RESTClient().Get().AbsPath(readyzPath).DoRaw(ctx); err != nil {
		return false, err
	}

	return true, nil
}

func convertmetadataMapperBundleToAPI(input *metadataMapperBundle) *apiv1.MetadataResponseBundle {
output := apiv1.NewMetadataResponseBundle()
if input == nil {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
---
features:
- |
Introduce a `use_component_status` config option to the
kubernetes_apiserver check. When set to false, it no longer uses the
`ComponentStatus` object (deprecated since Kubernetes 1.19) for the
Kubernetes API Server Control Plane health checks, and instead performs
a single health check directly against the API Server.
enhancements:
- |
Report an error message if the kube_apiserver_controlplane.up service
check is critical.

0 comments on commit 94d950e

Please sign in to comment.