From 94d9c56c365740ffc245216d67f3f97ebf67a1ab Mon Sep 17 00:00:00 2001 From: xuzhonghu Date: Mon, 12 Aug 2019 15:08:58 +0800 Subject: [PATCH 1/2] Add maxRetry in job controller to prevent endless loop --- pkg/controllers/job/job_controller.go | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/pkg/controllers/job/job_controller.go b/pkg/controllers/job/job_controller.go index af8358f2a5..30a76b76e4 100644 --- a/pkg/controllers/job/job_controller.go +++ b/pkg/controllers/job/job_controller.go @@ -56,6 +56,15 @@ import ( "volcano.sh/volcano/pkg/controllers/job/state" ) +const ( + // maxRetries is the number of times a volcano job will be retried before it is dropped out of the queue. + // With the current rate-limiter in use (5ms*2^(maxRetries-1)) the following numbers are the successive + // delays before a volcano job is requeued: + // + // 5ms, 10ms, 20ms, 40ms, 80ms, 160ms, 320ms, 640ms, 1.3s, 2.6s, 5.1s, 10.2s, 20.4s, 41s, 82s + maxRetries = 15 +) + // Controller the Job Controller type type Controller struct { kubeClients kubernetes.Interface @@ -312,11 +321,15 @@ func (cc *Controller) processNextReq(count uint32) bool { } if err := st.Execute(action); err != nil { - glog.Errorf("Failed to handle Job <%s/%s>: %v", - jobInfo.Job.Namespace, jobInfo.Job.Name, err) - // If any error, requeue it. - queue.AddRateLimited(req) - return true + if queue.NumRequeues(req) < maxRetries { + glog.V(2).Infof("Failed to handle Job <%s/%s>: %v", + jobInfo.Job.Namespace, jobInfo.Job.Name, err) + // If any error, requeue it. + queue.AddRateLimited(req) + return true + } + + glog.V(2).Infof("Dropping job<%s/%s> out of the queue: %v", jobInfo.Job.Namespace, jobInfo.Job.Name, err) } // If no error, forget it. 
From 490ba7bded6f612daf6b79ea4062f2578e3ae7e2 Mon Sep 17 00:00:00 2001 From: xuzhonghu Date: Mon, 19 Aug 2019 17:37:21 +0800 Subject: [PATCH 2/2] Add event when job retry limit is reached --- pkg/controllers/job/job_controller.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pkg/controllers/job/job_controller.go b/pkg/controllers/job/job_controller.go index 30a76b76e4..d49ad95b79 100644 --- a/pkg/controllers/job/job_controller.go +++ b/pkg/controllers/job/job_controller.go @@ -328,8 +328,9 @@ func (cc *Controller) processNextReq(count uint32) bool { queue.AddRateLimited(req) return true } - - glog.V(2).Infof("Dropping job<%s/%s> out of the queue: %v", jobInfo.Job.Namespace, jobInfo.Job.Name, err) + cc.recordJobEvent(jobInfo.Job.Namespace, jobInfo.Job.Name, vkbatchv1.ExecuteAction, fmt.Sprintf( + "Job failed on action %s for retry limit reached", action)) + glog.Warningf("Dropping job<%s/%s> out of the queue: %v because max retries has reached", jobInfo.Job.Namespace, jobInfo.Job.Name, err) } // If no error, forget it.