Move deployment status-checking logic to library package
Signed-off-by: Andrew Obuchowicz <aobuchow@redhat.com>
AObuchow committed Jun 21, 2022
1 parent 9e1f086 commit 540f408
Showing 2 changed files with 174 additions and 141 deletions.
168 changes: 168 additions & 0 deletions pkg/library/status/check.go
@@ -0,0 +1,168 @@
//
// Copyright (c) 2019-2022 Red Hat, Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

package check

import (
"context"
"fmt"
"strings"

dw "github.com/devfile/api/v2/pkg/apis/workspaces/v1alpha2"
"github.com/devfile/devworkspace-operator/pkg/common"
"github.com/devfile/devworkspace-operator/pkg/config"
"github.com/devfile/devworkspace-operator/pkg/infrastructure"
"github.com/devfile/devworkspace-operator/pkg/provision/sync"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/fields"
k8sclient "sigs.k8s.io/controller-runtime/pkg/client"
runtimeClient "sigs.k8s.io/controller-runtime/pkg/client"
)

var containerFailureStateReasons = []string{
"CrashLoopBackOff",
"ImagePullBackOff",
"CreateContainerError",
"RunContainerError",
}

// unrecoverablePodEventReasons contains Kubernetes events that should fail workspace startup
// if they occur related to a workspace pod. Events are stored as a map with event reasons as keys
// and values representing the number of times an event must be seen before it is considered
// unrecoverable.
var unrecoverablePodEventReasons = map[string]int32{
"FailedPostStartHook": 1,
"FailedMount": 3,
"FailedScheduling": 1,
"FailedCreate": 1,
"ReplicaSetCreateError": 1,
}

var unrecoverableDeploymentConditionReasons = []string{
"FailedCreate",
}

func CheckDeploymentStatus(deployment *appsv1.Deployment) (ready bool) {
return deployment.Status.ReadyReplicas > 0
}

func CheckDeploymentConditions(deployment *appsv1.Deployment) (healthy bool, errorMsg string) {
conditions := deployment.Status.Conditions
for _, condition := range conditions {
for _, unrecoverableReason := range unrecoverableDeploymentConditionReasons {
if condition.Reason == unrecoverableReason {
return false, fmt.Sprintf("Detected unrecoverable deployment condition: %s %s", condition.Reason, condition.Message)
}
}
}
return true, ""
}

// CheckPodsState checks if workspace-related pods are in an unrecoverable state. A pod is considered to be unrecoverable
// if it has a container with one of the containerFailureStateReasons states, or if an unrecoverable event (with reason
// matching unrecoverablePodEventReasons) has the pod as the involved object.
// Returns an optional message with details of any detected unrecoverable state, and an error if one occurs during the check.
func CheckPodsState(workspace *dw.DevWorkspace, namespace string, labelSelector k8sclient.MatchingLabels,
clusterAPI sync.ClusterAPI) (stateMsg string, checkFailure error) {
podList, err := GetPods(namespace, labelSelector, clusterAPI.Client)
if err != nil {
return "", err
}

for _, pod := range podList.Items {
for _, containerStatus := range pod.Status.ContainerStatuses {
if !CheckContainerStatusForFailure(&containerStatus) {
return fmt.Sprintf("Container %s has state %s", containerStatus.Name, containerStatus.State.Waiting.Reason), nil
}
}
for _, initContainerStatus := range pod.Status.InitContainerStatuses {
if !CheckContainerStatusForFailure(&initContainerStatus) {
return fmt.Sprintf("Init Container %s has state %s", initContainerStatus.Name, initContainerStatus.State.Waiting.Reason), nil
}
}
if msg, err := CheckPodEvents(&pod, workspace.Status.DevWorkspaceId, clusterAPI); err != nil || msg != "" {
return msg, err
}
}
return "", nil
}

func CheckPodEvents(pod *corev1.Pod, workspaceID string, clusterAPI sync.ClusterAPI) (msg string, err error) {
evs := &corev1.EventList{}
selector, err := fields.ParseSelector(fmt.Sprintf("involvedObject.name=%s", pod.Name))
if err != nil {
return "", fmt.Errorf("failed to parse field selector: %s", err)
}
if err := clusterAPI.Client.List(clusterAPI.Ctx, evs, k8sclient.InNamespace(pod.Namespace), k8sclient.MatchingFieldsSelector{Selector: selector}); err != nil {
return "", fmt.Errorf("failed to list events in namespace %s: %w", pod.Namespace, err)
}
for _, ev := range evs.Items {
if ev.InvolvedObject.Kind != "Pod" {
continue
}

// On OpenShift, it's possible to see "FailedMount" events when using a routingClass that depends on the service-ca
// operator. To avoid this, we always ignore FailedMount events if the message refers to the DWO-provisioned volume.
if infrastructure.IsOpenShift() &&
ev.Reason == "FailedMount" &&
strings.Contains(ev.Message, common.ServingCertVolumeName(common.ServiceName(workspaceID))) {
continue
}

if maxCount, isUnrecoverableEvent := unrecoverablePodEventReasons[ev.Reason]; isUnrecoverableEvent {
if !checkIfUnrecoverableEventIgnored(ev.Reason) && ev.Count >= maxCount {
var msg string
if ev.Count > 1 {
msg = fmt.Sprintf("Detected unrecoverable event %s %d times: %s.", ev.Reason, ev.Count, ev.Message)
} else {
msg = fmt.Sprintf("Detected unrecoverable event %s: %s.", ev.Reason, ev.Message)
}
return msg, nil
}
}
}
return "", nil
}

func CheckContainerStatusForFailure(containerStatus *corev1.ContainerStatus) (ok bool) {
if containerStatus.State.Waiting != nil {
for _, failureReason := range containerFailureStateReasons {
if containerStatus.State.Waiting.Reason == failureReason {
return checkIfUnrecoverableEventIgnored(containerStatus.State.Waiting.Reason)
}
}
}
return true
}

// TODO: Remove this function?
func GetPods(namespace string, labelSelector k8sclient.MatchingLabels, client runtimeClient.Client) (*corev1.PodList, error) {
pods := &corev1.PodList{}
if err := client.List(context.TODO(), pods, k8sclient.InNamespace(namespace), labelSelector); err != nil {
return nil, err
}
return pods, nil
}

func checkIfUnrecoverableEventIgnored(reason string) (ignored bool) {
for _, ignoredReason := range config.Workspace.IgnoredUnrecoverableEvents {
if ignoredReason == reason {
return true
}
}
return false
}
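
Editor's note: the exported deployment checks above are pure functions of the Deployment status, so a consumer of the new library package can exercise them without a cluster. A minimal sketch, assuming the `check` package name and import path from the file header above; the example Deployment and its quota message are invented for illustration:

package main

import (
	"fmt"

	check "github.com/devfile/devworkspace-operator/pkg/library/status"
	appsv1 "k8s.io/api/apps/v1"
)

func main() {
	// A deployment with no ready replicas and a ReplicaSet-creation failure,
	// e.g. because the namespace quota was exceeded (hypothetical data).
	deployment := &appsv1.Deployment{
		Status: appsv1.DeploymentStatus{
			ReadyReplicas: 0,
			Conditions: []appsv1.DeploymentCondition{
				{
					Type:    appsv1.DeploymentReplicaFailure,
					Reason:  "FailedCreate",
					Message: "pods \"workspace-sample\" is forbidden: exceeded quota",
				},
			},
		},
	}

	// CheckDeploymentStatus only looks at ReadyReplicas.
	fmt.Println("ready:", check.CheckDeploymentStatus(deployment)) // ready: false

	// CheckDeploymentConditions scans the conditions for unrecoverable reasons
	// (currently just "FailedCreate") and returns a human-readable message.
	healthy, msg := check.CheckDeploymentConditions(deployment)
	fmt.Println("healthy:", healthy)
	fmt.Println("msg:", msg)
}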
147 changes: 6 additions & 141 deletions pkg/provision/workspace/deployment.go
@@ -19,11 +19,10 @@ import (
"context"
"errors"
"fmt"
"strings"

check "github.com/devfile/devworkspace-operator/pkg/library/status"
nsconfig "github.com/devfile/devworkspace-operator/pkg/provision/config"
"github.com/devfile/devworkspace-operator/pkg/provision/sync"
"k8s.io/apimachinery/pkg/fields"

dw "github.com/devfile/api/v2/pkg/apis/workspaces/v1alpha2"
"github.com/devfile/devworkspace-operator/apis/controller/v1alpha1"
@@ -44,29 +43,6 @@ import (
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
)

var containerFailureStateReasons = []string{
"CrashLoopBackOff",
"ImagePullBackOff",
"CreateContainerError",
"RunContainerError",
}

// unrecoverablePodEventReasons contains Kubernetes events that should fail workspace startup
// if they occur related to a workspace pod. Events are stored as a map with event names as keys
// and values representing the threshold of how many times we can see an event before it is considered
// unrecoverable.
var unrecoverablePodEventReasons = map[string]int32{
"FailedPostStartHook": 1,
"FailedMount": 3,
"FailedScheduling": 1,
"FailedCreate": 1,
"ReplicaSetCreateError": 1,
}

var unrecoverableDeploymentConditionReasons = []string{
"FailedCreate",
}

type DeploymentProvisioningStatus struct {
ProvisioningStatus
}
@@ -121,7 +97,7 @@ func SyncDeploymentToCluster(
}
clusterDeployment := clusterObj.(*appsv1.Deployment)

deploymentReady := checkDeploymentStatus(clusterDeployment)
deploymentReady := check.CheckDeploymentStatus(clusterDeployment)
if deploymentReady {
return DeploymentProvisioningStatus{
ProvisioningStatus: ProvisioningStatus{
@@ -130,7 +106,7 @@
}
}

deploymentHealthy, deploymentErrMsg := checkDeploymentConditions(clusterDeployment)
deploymentHealthy, deploymentErrMsg := check.CheckDeploymentConditions(clusterDeployment)
if !deploymentHealthy {
return DeploymentProvisioningStatus{
ProvisioningStatus: ProvisioningStatus{
@@ -140,7 +116,9 @@
}
}

failureMsg, checkErr := checkPodsState(workspace, clusterAPI)
failureMsg, checkErr := check.CheckPodsState(workspace, workspace.Namespace, k8sclient.MatchingLabels{
constants.DevWorkspaceIDLabel: workspace.Status.DevWorkspaceId,
}, clusterAPI)
if checkErr != nil {
return DeploymentProvisioningStatus{
ProvisioningStatus: ProvisioningStatus{
@@ -201,22 +179,6 @@ func GetDevWorkspaceSecurityContext() *corev1.PodSecurityContext {
return config.Workspace.PodSecurityContext
}

func checkDeploymentStatus(deployment *appsv1.Deployment) (ready bool) {
return deployment.Status.ReadyReplicas > 0
}

func checkDeploymentConditions(deployment *appsv1.Deployment) (healthy bool, errorMsg string) {
conditions := deployment.Status.Conditions
for _, condition := range conditions {
for _, unrecoverableReason := range unrecoverableDeploymentConditionReasons {
if condition.Reason == unrecoverableReason {
return false, fmt.Sprintf("Detected unrecoverable deployment condition: %s %s", condition.Reason, condition.Message)
}
}
}
return true, ""
}

func getSpecDeployment(
workspace *dw.DevWorkspace,
podAdditionsList []v1alpha1.PodAdditions,
@@ -337,46 +299,6 @@ func getSpecDeployment(
return deployment, nil
}

func getPods(workspace *dw.DevWorkspace, client runtimeClient.Client) (*corev1.PodList, error) {
pods := &corev1.PodList{}
if err := client.List(context.TODO(), pods, k8sclient.InNamespace(workspace.Namespace), k8sclient.MatchingLabels{
constants.DevWorkspaceIDLabel: workspace.Status.DevWorkspaceId,
}); err != nil {
return nil, err
}
return pods, nil
}

// checkPodsState checks if workspace-related pods are in an unrecoverable state. A pod is considered to be unrecoverable
// if it has a container with one of the containerStateFailureReasons states, or if an unrecoverable event (with reason
// matching unrecoverablePodEventReasons) has the pod as the involved object.
// Returns optional message with detected unrecoverable state details
// error if any happens during check
func checkPodsState(workspace *dw.DevWorkspace,
clusterAPI sync.ClusterAPI) (stateMsg string, checkFailure error) {
podList, err := getPods(workspace, clusterAPI.Client)
if err != nil {
return "", err
}

for _, pod := range podList.Items {
for _, containerStatus := range pod.Status.ContainerStatuses {
if !checkContainerStatusForFailure(&containerStatus) {
return fmt.Sprintf("Container %s has state %s", containerStatus.Name, containerStatus.State.Waiting.Reason), nil
}
}
for _, initContainerStatus := range pod.Status.InitContainerStatuses {
if !checkContainerStatusForFailure(&initContainerStatus) {
return fmt.Sprintf("Init Container %s has state %s", initContainerStatus.Name, initContainerStatus.State.Waiting.Reason), nil
}
}
if msg, err := checkPodEvents(&pod, workspace.Status.DevWorkspaceId, clusterAPI); err != nil || msg != "" {
return msg, err
}
}
return "", nil
}

func mergePodAdditions(toMerge []v1alpha1.PodAdditions) (*v1alpha1.PodAdditions, error) {
podAdditions := &v1alpha1.PodAdditions{}

@@ -476,60 +398,3 @@ func getAdditionalAnnotations(workspace *dw.DevWorkspace) (map[string]string, error) {

return annotations, nil
}

func checkPodEvents(pod *corev1.Pod, workspaceID string, clusterAPI sync.ClusterAPI) (msg string, err error) {
evs := &corev1.EventList{}
selector, err := fields.ParseSelector(fmt.Sprintf("involvedObject.name=%s", pod.Name))
if err != nil {
return "", fmt.Errorf("failed to parse field selector: %s", err)
}
if err := clusterAPI.Client.List(clusterAPI.Ctx, evs, k8sclient.InNamespace(pod.Namespace), k8sclient.MatchingFieldsSelector{Selector: selector}); err != nil {
return "", fmt.Errorf("failed to list events in namespace %s: %w", pod.Namespace, err)
}
for _, ev := range evs.Items {
if ev.InvolvedObject.Kind != "Pod" {
continue
}

// On OpenShift, it's possible see "FailedMount" events when using a routingClass that depends on the service-ca
// operator. To avoid this, we always ignore FailedMount events if the message refers to the DWO-provisioned volume
if infrastructure.IsOpenShift() &&
ev.Reason == "FailedMount" &&
strings.Contains(ev.Message, common.ServingCertVolumeName(common.ServiceName(workspaceID))) {
continue
}

if maxCount, isUnrecoverableEvent := unrecoverablePodEventReasons[ev.Reason]; isUnrecoverableEvent {
if !checkIfUnrecoverableEventIgnored(ev.Reason) && ev.Count >= maxCount {
var msg string
if ev.Count > 1 {
msg = fmt.Sprintf("Detected unrecoverable event %s %d times: %s", ev.Reason, ev.Count, ev.Message)
} else {
msg = fmt.Sprintf("Detected unrecoverable event %s: %s", ev.Reason, ev.Message)
}
return msg, nil
}
}
}
return "", nil
}

func checkContainerStatusForFailure(containerStatus *corev1.ContainerStatus) (ok bool) {
if containerStatus.State.Waiting != nil {
for _, failureReason := range containerFailureStateReasons {
if containerStatus.State.Waiting.Reason == failureReason {
return checkIfUnrecoverableEventIgnored(containerStatus.State.Waiting.Reason)
}
}
}
return true
}

func checkIfUnrecoverableEventIgnored(reason string) (ignored bool) {
for _, ignoredReason := range config.Workspace.IgnoredUnrecoverableEvents {
if ignoredReason == reason {
return true
}
}
return false
}
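
Editor's note: for callers outside pkg/provision/workspace, the relocated CheckPodsState now takes the namespace and label selector explicitly instead of deriving them from the workspace, as the updated call in SyncDeploymentToCluster above shows. A minimal sketch of a hypothetical consumer, assuming only the Client and Ctx fields of sync.ClusterAPI (the two used by check.go) and the constants.DevWorkspaceIDLabel key shown in the deployment.go hunk:

package example

import (
	"fmt"

	dw "github.com/devfile/api/v2/pkg/apis/workspaces/v1alpha2"
	"github.com/devfile/devworkspace-operator/pkg/constants"
	check "github.com/devfile/devworkspace-operator/pkg/library/status"
	"github.com/devfile/devworkspace-operator/pkg/provision/sync"
	k8sclient "sigs.k8s.io/controller-runtime/pkg/client"
)

// failIfPodsUnrecoverable is a hypothetical helper: it runs the relocated pod
// check against the workspace's namespace, selecting pods by DevWorkspace ID,
// and turns a non-empty state message into an error for the caller to handle.
func failIfPodsUnrecoverable(workspace *dw.DevWorkspace, clusterAPI sync.ClusterAPI) error {
	stateMsg, err := check.CheckPodsState(workspace, workspace.Namespace, k8sclient.MatchingLabels{
		constants.DevWorkspaceIDLabel: workspace.Status.DevWorkspaceId,
	}, clusterAPI)
	if err != nil {
		return err
	}
	if stateMsg != "" {
		return fmt.Errorf("workspace pods are in an unrecoverable state: %s", stateMsg)
	}
	return nil
}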
