Skip to content

Commit

Permalink
feat: report error when common PVC cleanup job hangs
Browse files Browse the repository at this point in the history
Fix devfile#551

Signed-off-by: Andrew Obuchowicz <aobuchow@redhat.com>
  • Loading branch information
AObuchow committed May 18, 2022
1 parent d7b3273 commit ecf9929
Showing 1 changed file with 38 additions and 0 deletions.
38 changes: 38 additions & 0 deletions pkg/provision/storage/cleanup.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,9 @@ import (
k8sErrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/fields"
"k8s.io/apimachinery/pkg/types"
k8sclient "sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"

"github.com/devfile/devworkspace-operator/internal/images"
Expand All @@ -52,6 +54,10 @@ var (
pvcCleanupPodCPURequest = resource.MustParse(constants.PVCCleanupPodCPURequest)
)

// unrecoverablePodEventReasons is the set of Kubernetes event reasons that are
// reported as unrecoverable when they show up for the common PVC cleanup job.
var unrecoverablePodEventReasons = map[string]bool{
	"FailedScheduling": true,
}

func runCommonPVCCleanupJob(workspace *dw.DevWorkspace, clusterAPI sync.ClusterAPI) error {
PVCexists, err := commonPVCExists(workspace, clusterAPI)
if err != nil {
Expand All @@ -78,10 +84,12 @@ func runCommonPVCCleanupJob(workspace *dw.DevWorkspace, clusterAPI sync.ClusterA
}

clusterJob := clusterObj.(*batchv1.Job)

for _, condition := range clusterJob.Status.Conditions {
if condition.Status != corev1.ConditionTrue {
continue
}

switch condition.Type {
case batchv1.JobComplete:
return nil
Expand All @@ -91,6 +99,14 @@ func runCommonPVCCleanupJob(workspace *dw.DevWorkspace, clusterAPI sync.ClusterA
}
}
}

if msg, err := checkCommonPVCCleanupJobEvents(clusterJob, workspace.Status.DevWorkspaceId, clusterAPI); err != nil || msg != "" {
return &ProvisioningError{
Err: err,
Message: fmt.Sprintf("DevWorkspace PVC cleanup job failed: see logs for job %q for details. Additional information: %s", clusterJob.Name, msg),
}
}

// Requeue at least every 10 seconds to check whether the PVC has been removed by someone else
return &NotReadyError{
Message: "Cleanup job is not in completed state",
Expand Down Expand Up @@ -203,3 +219,25 @@ func commonPVCExists(workspace *dw.DevWorkspace, clusterAPI sync.ClusterAPI) (bo
}
return true, nil
}

// checkCommonPVCCleanupJobEvents lists the events in the job's namespace whose
// involved object carries the cleanup job's name and reports the first one
// whose reason is marked in unrecoverablePodEventReasons.
//
// It returns a non-empty msg describing the offending event when one is found,
// and an empty msg with a nil error when none is. A non-nil error means the
// field selector could not be parsed or the events could not be listed; msg is
// empty in that case.
//
// workspaceID is currently not used by this function; it is kept so that
// existing callers keep compiling.
//
// NOTE(review): only events whose involved object kind is "Job" are inspected,
// while the reason set is named after pod events — confirm that these reasons
// are actually reported against the Job rather than against its pods.
func checkCommonPVCCleanupJobEvents(job *batchv1.Job, workspaceID string, clusterAPI sync.ClusterAPI) (msg string, err error) {
	selector, err := fields.ParseSelector(fmt.Sprintf("involvedObject.name=%s", job.Name))
	if err != nil {
		// Wrap with %w (not %s) so the underlying error stays inspectable via errors.Is/As.
		return "", fmt.Errorf("failed to parse field selector: %w", err)
	}

	evs := &corev1.EventList{}
	// Assign to the named result instead of shadowing it with a new err.
	err = clusterAPI.Client.List(clusterAPI.Ctx, evs, k8sclient.InNamespace(job.Namespace), k8sclient.MatchingFieldsSelector{Selector: selector})
	if err != nil {
		return "", fmt.Errorf("failed to list events in namespace %s: %w", job.Namespace, err)
	}

	// Index into the slice to avoid copying every Event while scanning.
	for i := range evs.Items {
		ev := &evs.Items[i]
		// The selector only matches on name, so objects of other kinds that
		// share the job's name have to be filtered out here.
		if ev.InvolvedObject.Kind != "Job" {
			continue
		}

		// Use the stored value rather than mere key presence, so a reason
		// explicitly mapped to false is not treated as unrecoverable.
		if unrecoverablePodEventReasons[ev.Reason] {
			return fmt.Sprintf("Detected unrecoverable event %s: %s", ev.Reason, ev.Message), nil
		}
	}
	return "", nil
}

0 comments on commit ecf9929

Please sign in to comment.