Merge pull request volcano-sh#35 from lminzhw/performance_improvement
Performance improvement
volcano-sh-bot authored Jun 28, 2019
2 parents c93ffa9 + 9697ace commit e09c367
Showing 15 changed files with 537 additions and 121 deletions.
6 changes: 5 additions & 1 deletion pkg/scheduler/actions/allocate/allocate.go
@@ -52,6 +52,10 @@ func (alloc *allocateAction) Execute(ssn *framework.Session) {
if job.PodGroup.Status.Phase == v1alpha1.PodGroupPending {
continue
}
if vr := ssn.JobValid(job); vr != nil && !vr.Pass {
glog.V(4).Infof("Job <%s/%s> Queue <%s> skip allocate, reason: %v, message %v", job.Namespace, job.Name, job.Queue, vr.Reason, vr.Message)
continue
}

if queue, found := ssn.Queues[job.Queue]; found {
queues.Push(queue)
@@ -150,7 +154,7 @@ func (alloc *allocateAction) Execute(ssn *framework.Session) {
break
}

nodeScores := util.PrioritizeNodes(task, predicateNodes, ssn.NodeOrderMapFn, ssn.NodeOrderReduceFn)
nodeScores := util.PrioritizeNodes(task, predicateNodes, ssn.BatchNodeOrderFn, ssn.NodeOrderMapFn, ssn.NodeOrderReduceFn)

node := util.SelectBestNode(nodeScores)
// Allocate idle resource to the task.
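The new ssn.JobValid gate above short-circuits per-job work before any node predicates or scoring run; the same check is added to the backfill, preempt, and reclaim actions below. The following is a minimal, self-contained sketch of the aggregation this relies on, using hypothetical simplified types rather than the scheduler's real validation result and job types (only the Pass/Reason/Message fields used above are assumed):

package main

import "fmt"

// Hypothetical, simplified stand-ins; the real result type carries the
// Pass, Reason and Message fields referenced by the actions above.
type validateResult struct {
	Pass            bool
	Reason, Message string
}

type job struct {
	Name           string
	Ready, Minimum int32
}

// jobValid sketches what ssn.JobValid is assumed to do: run every registered
// validator and return the first rejection, or nil when all of them pass.
func jobValid(j *job, validators ...func(*job) *validateResult) *validateResult {
	for _, fn := range validators {
		if vr := fn(j); vr != nil && !vr.Pass {
			return vr
		}
	}
	return nil
}

func main() {
	// A gang-style check: reject jobs that do not yet have enough ready tasks.
	gangCheck := func(j *job) *validateResult {
		if j.Ready < j.Minimum {
			return &validateResult{
				Pass:    false,
				Reason:  "NotEnoughTasks",
				Message: fmt.Sprintf("only %d of %d tasks are schedulable", j.Ready, j.Minimum),
			}
		}
		return nil
	}

	if vr := jobValid(&job{Name: "demo", Ready: 1, Minimum: 3}, gangCheck); vr != nil && !vr.Pass {
		fmt.Printf("skip job %s, reason: %s, message: %s\n", "demo", vr.Reason, vr.Message)
	}
}

In the actual framework the validators are registered by plugins on the Session; the sketch only illustrates why a failing validation lets the action skip a job without touching the node list.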
4 changes: 4 additions & 0 deletions pkg/scheduler/actions/backfill/backfill.go
@@ -47,6 +47,10 @@ func (alloc *backfillAction) Execute(ssn *framework.Session) {
if job.PodGroup.Status.Phase == v1alpha1.PodGroupPending {
continue
}
if vr := ssn.JobValid(job); vr != nil && !vr.Pass {
glog.V(4).Infof("Job <%s/%s> Queue <%s> skip backfill, reason: %v, message %v", job.Namespace, job.Name, job.Queue, vr.Reason, vr.Message)
continue
}

for _, task := range job.TaskStatusIndex[api.Pending] {
if task.InitResreq.IsEmpty() {
6 changes: 5 additions & 1 deletion pkg/scheduler/actions/preempt/preempt.go
@@ -56,6 +56,10 @@ func (alloc *preemptAction) Execute(ssn *framework.Session) {
if job.PodGroup.Status.Phase == v1alpha1.PodGroupPending {
continue
}
if vr := ssn.JobValid(job); vr != nil && !vr.Pass {
glog.V(4).Infof("Job <%s/%s> Queue <%s> skip preemption, reason: %v, message %v", job.Namespace, job.Name, job.Queue, vr.Reason, vr.Message)
continue
}

if queue, found := ssn.Queues[job.Queue]; !found {
continue
@@ -186,7 +190,7 @@ func preempt(

predicateNodes := util.PredicateNodes(preemptor, allNodes, ssn.PredicateFn)

nodeScores := util.PrioritizeNodes(preemptor, predicateNodes, ssn.NodeOrderMapFn, ssn.NodeOrderReduceFn)
nodeScores := util.PrioritizeNodes(preemptor, predicateNodes, ssn.BatchNodeOrderFn, ssn.NodeOrderMapFn, ssn.NodeOrderReduceFn)

selectedNodes := util.SortNodes(nodeScores)
for _, node := range selectedNodes {
4 changes: 4 additions & 0 deletions pkg/scheduler/actions/reclaim/reclaim.go
@@ -57,6 +57,10 @@ func (alloc *reclaimAction) Execute(ssn *framework.Session) {
if job.PodGroup.Status.Phase == v1alpha1.PodGroupPending {
continue
}
if vr := ssn.JobValid(job); vr != nil && !vr.Pass {
glog.V(4).Infof("Job <%s/%s> Queue <%s> skip reclaim, reason: %v, message %v", job.Namespace, job.Name, job.Queue, vr.Reason, vr.Message)
continue
}

if queue, found := ssn.Queues[job.Queue]; !found {
glog.Errorf("Failed to find Queue <%s> for Job <%s/%s>",
2 changes: 1 addition & 1 deletion pkg/scheduler/api/job_info.go
@@ -302,7 +302,7 @@ func (ji *JobInfo) Clone() *JobInfo {
NodesFitDelta: make(NodeResourceMap),

PDB: ji.PDB,
PodGroup: ji.PodGroup,
PodGroup: ji.PodGroup.DeepCopy(),

TaskStatusIndex: map[TaskStatus]tasksMap{},
Tasks: tasksMap{},
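Clone previously copied the PodGroup pointer, so the cloned job in the snapshot and the cached job shared one PodGroup; since the new jobUpdater (added later in this commit) writes job.PodGroup.Status on the snapshot copy, the deep copy avoids mutating the cache behind the lock. A self-contained illustration of the difference, using hypothetical stand-in types rather than the real JobInfo and PodGroup:

package main

import "fmt"

// Hypothetical stand-ins for PodGroup and JobInfo, just to show the aliasing
// that JobInfo.Clone now avoids by deep-copying the PodGroup.
type podGroup struct{ Phase string }
type jobInfo struct{ PodGroup *podGroup }

func main() {
	cached := &jobInfo{PodGroup: &podGroup{Phase: "Pending"}}

	// Old behaviour: the clone shares the PodGroup pointer with the cache.
	shallow := &jobInfo{PodGroup: cached.PodGroup}
	shallow.PodGroup.Phase = "Running"
	fmt.Println(cached.PodGroup.Phase) // "Running" - the cached copy changed too

	// New behaviour: the clone owns an independent copy of the PodGroup.
	cached.PodGroup.Phase = "Pending"
	pg := *cached.PodGroup
	deep := &jobInfo{PodGroup: &pg}
	deep.PodGroup.Phase = "Running"
	fmt.Println(cached.PodGroup.Phase) // still "Pending"
}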
3 changes: 3 additions & 0 deletions pkg/scheduler/api/types.go
@@ -132,6 +132,9 @@ type EvictableFn func(*TaskInfo, []*TaskInfo) []*TaskInfo
// NodeOrderFn is the func declaration used to get priority score for a node for a particular task.
type NodeOrderFn func(*TaskInfo, *NodeInfo) (float64, error)

// BatchNodeOrderFn is the func declaration used to get priority score for ALL nodes for a particular task.
type BatchNodeOrderFn func(*TaskInfo, []*NodeInfo) (map[string]float64, error)

// NodeMapFn is the func declaration used to get priority score for a node for a particular task.
type NodeMapFn func(*TaskInfo, *NodeInfo) (float64, error)

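BatchNodeOrderFn lets a plugin score every candidate node in a single call and return a map keyed by node name, instead of being invoked once per node; util.PrioritizeNodes now accepts it alongside the map/reduce functions (see the allocate and preempt changes above). Below is a minimal sketch, with simplified stand-in types, of how such a batch score map can be merged with per-node scores; the real util.PrioritizeNodes also handles multiple plugins and the reduce step, so treat this as illustrative only:

package main

import "fmt"

// Hypothetical, simplified stand-ins for TaskInfo and NodeInfo.
type task struct{ Name string }
type node struct{ Name string }

// Shapes mirroring NodeOrderFn and BatchNodeOrderFn above.
type nodeOrderFn func(*task, *node) (float64, error)
type batchNodeOrderFn func(*task, []*node) (map[string]float64, error)

// prioritize merges per-node and batch scores into one score per node.
func prioritize(t *task, nodes []*node, perNode nodeOrderFn, batch batchNodeOrderFn) map[string]float64 {
	scores := make(map[string]float64, len(nodes))
	for _, n := range nodes {
		if s, err := perNode(t, n); err == nil {
			scores[n.Name] += s
		}
	}
	// One call scores every node; the result is keyed by node name.
	if batchScores, err := batch(t, nodes); err == nil {
		for name, s := range batchScores {
			scores[name] += s
		}
	}
	return scores
}

func main() {
	nodes := []*node{{Name: "n1"}, {Name: "n2"}}
	perNode := func(t *task, n *node) (float64, error) { return 1, nil }
	batch := func(t *task, ns []*node) (map[string]float64, error) {
		return map[string]float64{"n1": 5}, nil
	}
	fmt.Println(prioritize(&task{Name: "t1"}, nodes, perNode, batch)) // map[n1:6 n2:1]
}

The batch form is useful when a plugin can amortize work across all nodes at once rather than recomputing shared state for every node.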
115 changes: 86 additions & 29 deletions pkg/scheduler/cache/cache.go
@@ -147,6 +147,31 @@ type defaultStatusUpdater struct {
kbclient *kbver.Clientset
}

// podConditionHaveUpdate reports whether applying condition would actually change
// the pod status, following the same logic as podutil.UpdatePodCondition.
func podConditionHaveUpdate(status *v1.PodStatus, condition *v1.PodCondition) bool {
lastTransitionTime := metav1.Now()
// Try to find this pod condition.
_, oldCondition := podutil.GetPodCondition(status, condition.Type)

if oldCondition == nil {
// We are adding new pod condition.
return true
}
// We are updating an existing condition, so we need to check if it has changed.
if condition.Status == oldCondition.Status {
lastTransitionTime = oldCondition.LastTransitionTime
}

isEqual := condition.Status == oldCondition.Status &&
condition.Reason == oldCondition.Reason &&
condition.Message == oldCondition.Message &&
condition.LastProbeTime.Equal(&oldCondition.LastProbeTime) &&
lastTransitionTime.Equal(&oldCondition.LastTransitionTime)

// Return true if any of the fields has changed.
return !isEqual
}

// UpdatePodCondition updates the pod with the given podCondition
func (su *defaultStatusUpdater) UpdatePodCondition(pod *v1.Pod, condition *v1.PodCondition) (*v1.Pod, error) {
glog.V(3).Infof("Updating pod condition for %s/%s to (%s==%s)", pod.Namespace, pod.Name, condition.Type, condition.Status)
@@ -184,22 +209,35 @@ func (dvb *defaultVolumeBinder) BindVolumes(task *api.TaskInfo) error {
}

func newSchedulerCache(config *rest.Config, schedulerName string, defaultQueue string) *SchedulerCache {
kubeClient, err := kubernetes.NewForConfig(config)
if err != nil {
panic(fmt.Sprintf("failed init kubeClient, with err: %v", err))
}
kbClient, err := kbver.NewForConfig(config)
if err != nil {
panic(fmt.Sprintf("failed init kbClient, with err: %v", err))
}
eventClient, err := kubernetes.NewForConfig(config)
if err != nil {
panic(fmt.Sprintf("failed init eventClient, with err: %v", err))
}

sc := &SchedulerCache{
Jobs: make(map[kbapi.JobID]*kbapi.JobInfo),
Nodes: make(map[string]*kbapi.NodeInfo),
Queues: make(map[kbapi.QueueID]*kbapi.QueueInfo),
PriorityClasses: make(map[string]*v1beta1.PriorityClass),
errTasks: workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter()),
deletedJobs: workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter()),
kubeclient: kubernetes.NewForConfigOrDie(config),
kbclient: kbver.NewForConfigOrDie(config),
kubeclient: kubeClient,
kbclient: kbClient,
defaultQueue: defaultQueue,
schedulerName: schedulerName,
}

// Prepare event clients.
broadcaster := record.NewBroadcaster()
broadcaster.StartRecordingToSink(&corev1.EventSinkImpl{Interface: sc.kubeclient.CoreV1().Events("")})
broadcaster.StartRecordingToSink(&corev1.EventSinkImpl{Interface: eventClient.CoreV1().Events("")})
sc.Recorder = broadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: schedulerName})

sc.Binder = &defaultBinder{
@@ -460,22 +498,27 @@ func (sc *SchedulerCache) BindVolumes(task *api.TaskInfo) error {

// taskUnschedulable updates pod status of pending task
func (sc *SchedulerCache) taskUnschedulable(task *api.TaskInfo, message string) error {
sc.Mutex.Lock()
defer sc.Mutex.Unlock()
pod := task.Pod

pod := task.Pod.DeepCopy()

// The reason field in 'Events' should be "FailedScheduling"; there is no constant
// defined for this in k8s core, so the same string is used here.
// The reason field in PodCondition should be "Unschedulable"
sc.Recorder.Eventf(pod, v1.EventTypeWarning, "FailedScheduling", message)
if _, err := sc.StatusUpdater.UpdatePodCondition(pod, &v1.PodCondition{
condition := &v1.PodCondition{
Type: v1.PodScheduled,
Status: v1.ConditionFalse,
Reason: v1.PodReasonUnschedulable,
Message: message,
}); err != nil {
return err
}

if podConditionHaveUpdate(&pod.Status, condition) {
pod = pod.DeepCopy()

// The reason field in 'Events' should be "FailedScheduling"; there is no constant
// defined for this in k8s core, so the same string is used here.
// The reason field in PodCondition should be "Unschedulable"
sc.Recorder.Eventf(pod, v1.EventTypeWarning, "FailedScheduling", message)
if _, err := sc.StatusUpdater.UpdatePodCondition(pod, condition); err != nil {
return err
}
} else {
glog.V(4).Infof("task unscheduleable %s/%s, message: %s, skip by no condition update", pod.Namespace, pod.Name, message)
}

return nil
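podConditionHaveUpdate (added near the top of this file) lets taskUnschedulable skip both the DeepCopy and the status update when the pod already carries an identical Unschedulable condition, removing one API write per pending pod per scheduling cycle. A self-contained sketch of that compare-before-update idea, using a simplified local type rather than v1.PodCondition (LastProbeTime is omitted for brevity):

package main

import (
	"fmt"
	"time"
)

// Simplified stand-in for v1.PodCondition.
type condition struct {
	Type, Status, Reason, Message string
	LastTransitionTime            time.Time
}

// haveUpdate mirrors podConditionHaveUpdate: report true only when applying the
// candidate condition would actually change what is stored on the pod.
func haveUpdate(existing []condition, c condition) bool {
	for _, old := range existing {
		if old.Type != c.Type {
			continue
		}
		// Same status keeps the old transition time, as podutil.UpdatePodCondition does.
		transition := time.Now()
		if c.Status == old.Status {
			transition = old.LastTransitionTime
		}
		unchanged := c.Status == old.Status &&
			c.Reason == old.Reason &&
			c.Message == old.Message &&
			transition.Equal(old.LastTransitionTime)
		return !unchanged
	}
	// No condition of this type yet, so adding it is an update.
	return true
}

func main() {
	now := time.Now()
	existing := []condition{{
		Type: "PodScheduled", Status: "False",
		Reason: "Unschedulable", Message: "0/3 nodes are available",
		LastTransitionTime: now,
	}}

	same := condition{Type: "PodScheduled", Status: "False", Reason: "Unschedulable", Message: "0/3 nodes are available"}
	changed := condition{Type: "PodScheduled", Status: "False", Reason: "Unschedulable", Message: "insufficient cpu"}

	fmt.Println(haveUpdate(existing, same))    // false - no API update needed
	fmt.Println(haveUpdate(existing, changed)) // true  - condition message changed
}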
@@ -560,6 +603,30 @@ func (sc *SchedulerCache) Snapshot() *kbapi.ClusterInfo {
snapshot.Queues[value.UID] = value.Clone()
}

var cloneJobLock sync.Mutex
var wg sync.WaitGroup

cloneJob := func(value *api.JobInfo) {
if value.PodGroup != nil {
value.Priority = sc.defaultPriority

priName := value.PodGroup.Spec.PriorityClassName
if priorityClass, found := sc.PriorityClasses[priName]; found {
value.Priority = priorityClass.Value
}

glog.V(4).Infof("The priority of job <%s/%s> is <%s/%d>",
value.Namespace, value.Name, priName, value.Priority)
}

clonedJob := value.Clone()

cloneJobLock.Lock()
snapshot.Jobs[value.UID] = clonedJob
cloneJobLock.Unlock()
wg.Done()
}

for _, value := range sc.Jobs {
// If no scheduling spec, does not handle it.
if value.PodGroup == nil && value.PDB == nil {
@@ -575,20 +642,10 @@ func (sc *SchedulerCache) Snapshot() *kbapi.ClusterInfo {
continue
}

if value.PodGroup != nil {
value.Priority = sc.defaultPriority

priName := value.PodGroup.Spec.PriorityClassName
if priorityClass, found := sc.PriorityClasses[priName]; found {
value.Priority = priorityClass.Value
}

glog.V(4).Infof("The priority of job <%s/%s> is <%s/%d>",
value.Namespace, value.Name, priName, value.Priority)
}

snapshot.Jobs[value.UID] = value.Clone()
wg.Add(1)
go cloneJob(value)
}
wg.Wait()

glog.V(3).Infof("There are <%d> Jobs, <%d> Queues and <%d> Nodes in total for scheduling.",
len(snapshot.Jobs), len(snapshot.Queues), len(snapshot.Nodes))
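Snapshot now clones jobs concurrently: deep-copying a JobInfo (which after the job_info.go change also deep-copies its PodGroup) is comparatively expensive, so each clone runs in its own goroutine while a mutex serializes only the insert into the snapshot map and a WaitGroup fences the fan-out. A minimal, self-contained sketch of that pattern with placeholder data instead of JobInfo:

package main

import (
	"fmt"
	"sync"
)

// cloneAll runs each (comparatively expensive) clone in its own goroutine and
// serializes only the write into the shared result map.
func cloneAll(items map[string]int) map[string]int {
	var mu sync.Mutex
	var wg sync.WaitGroup
	out := make(map[string]int, len(items))

	for key, value := range items {
		wg.Add(1)
		go func(key string, value int) {
			defer wg.Done()
			cloned := value // stands in for value.Clone() on a JobInfo
			mu.Lock()
			out[key] = cloned
			mu.Unlock()
		}(key, value)
	}
	wg.Wait()
	return out
}

func main() {
	fmt.Println(cloneAll(map[string]int{"job-a": 1, "job-b": 2}))
}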
@@ -658,8 +715,8 @@ func (sc *SchedulerCache) RecordJobStatusEvent(job *kbapi.JobInfo) {
}

// UpdateJobStatus updates the status of the job and its tasks.
func (sc *SchedulerCache) UpdateJobStatus(job *kbapi.JobInfo) (*kbapi.JobInfo, error) {
if !shadowPodGroup(job.PodGroup) {
func (sc *SchedulerCache) UpdateJobStatus(job *kbapi.JobInfo, updatePG bool) (*kbapi.JobInfo, error) {
if updatePG && !shadowPodGroup(job.PodGroup) {
pg, err := sc.StatusUpdater.UpdatePodGroup(job.PodGroup)
if err != nil {
return nil, err
2 changes: 1 addition & 1 deletion pkg/scheduler/cache/interface.go
@@ -46,7 +46,7 @@ type Cache interface {
RecordJobStatusEvent(job *api.JobInfo)

// UpdateJobStatus updates the status of the job and its tasks.
UpdateJobStatus(job *api.JobInfo) (*api.JobInfo, error)
UpdateJobStatus(job *api.JobInfo, updatePG bool) (*api.JobInfo, error)

// AllocateVolumes allocates volume on the host to the task
AllocateVolumes(task *api.TaskInfo, hostname string) error
122 changes: 122 additions & 0 deletions pkg/scheduler/framework/job_updater.go
@@ -0,0 +1,122 @@
package framework

import (
"context"
"math/rand"
"reflect"
"time"

"github.com/golang/glog"

"k8s.io/client-go/util/workqueue"

"github.com/kubernetes-sigs/kube-batch/pkg/apis/scheduling/v1alpha1"
"github.com/kubernetes-sigs/kube-batch/pkg/scheduler/api"
)

const (
jobUpdaterWorker = 16

jobConditionUpdateTime = time.Minute
jobConditionUpdateTimeJitter = 30 * time.Second
)

// TimeJitterAfter reports whether new is after old + duration + jitter, where
// jitter is a random duration of at most maxJitter.
func TimeJitterAfter(new, old time.Time, duration, maxJitter time.Duration) bool {
var jitter int64
if maxJitter > 0 {
jitter = rand.Int63n(int64(maxJitter))
}
return new.After(old.Add(duration + time.Duration(jitter)))
}

type jobUpdater struct {
ssn *Session
jobQueue []*api.JobInfo
}

func newJobUpdater(ssn *Session) *jobUpdater {
queue := make([]*api.JobInfo, 0, len(ssn.Jobs))
for _, job := range ssn.Jobs {
queue = append(queue, job)
}

ju := &jobUpdater{
ssn: ssn,
jobQueue: queue,
}
return ju
}

func (ju *jobUpdater) UpdateAll() {
workqueue.ParallelizeUntil(context.TODO(), jobUpdaterWorker, len(ju.jobQueue), ju.updateJob)
}

func isPodGroupConditionsUpdated(newCondition, oldCondition []v1alpha1.PodGroupCondition) bool {
if len(newCondition) != len(oldCondition) {
return true
}

for index, newCond := range newCondition {
oldCond := oldCondition[index]

newTime := newCond.LastTransitionTime
oldTime := oldCond.LastTransitionTime
if TimeJitterAfter(newTime.Time, oldTime.Time, jobConditionUpdateTime, jobConditionUpdateTimeJitter) {
return true
}

// if newCond is not new enough, we treat it the same as the old one
newCond.LastTransitionTime = oldTime

// comparing should ignore the TransitionID
newTransitionID := newCond.TransitionID
newCond.TransitionID = oldCond.TransitionID

shouldUpdate := !reflect.DeepEqual(&newCond, &oldCond)

newCond.LastTransitionTime = newTime
newCond.TransitionID = newTransitionID
if shouldUpdate {
return true
}
}

return false
}

func isPodGroupStatusUpdated(newStatus, oldStatus *v1alpha1.PodGroupStatus) bool {
newCondition := newStatus.Conditions
newStatus.Conditions = nil
oldCondition := oldStatus.Conditions
oldStatus.Conditions = nil

shouldUpdate := !reflect.DeepEqual(newStatus, oldStatus) || isPodGroupConditionsUpdated(newCondition, oldCondition)

newStatus.Conditions = newCondition
oldStatus.Conditions = oldCondition

return shouldUpdate
}

// updateJob updates the status of the specified job
func (ju *jobUpdater) updateJob(index int) {
job := ju.jobQueue[index]
ssn := ju.ssn

// If job is using PDB, ignore it.
// TODO(k82cn): remove it when removing PDB support
if job.PodGroup == nil {
ssn.cache.RecordJobStatusEvent(job)
return
}

job.PodGroup.Status = jobStatus(ssn, job)
oldStatus, found := ssn.podGroupStatus[job.UID]
updatePG := !found || isPodGroupStatusUpdated(&job.PodGroup.Status, oldStatus)

if _, err := ssn.cache.UpdateJobStatus(job, updatePG); err != nil {
glog.Errorf("Failed to update job <%s/%s>: %v",
job.Namespace, job.Name, err)
}
}
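The jobUpdater (presumably driven when the session is closed) recomputes each job's PodGroup status, compares it with the status recorded when the session opened (ssn.podGroupStatus), and passes updatePG=false to UpdateJobStatus when nothing material changed; conditions that differ only in transition time or TransitionID are not written back until the new transition time is more than a jittered minute past the old one, so many PodGroups do not all become due in the same cycle. The write-back is fanned out over 16 workers with workqueue.ParallelizeUntil. A small sketch of the jitter check and the fan-out, with placeholder work items standing in for the session's jobs:

package main

import (
	"context"
	"fmt"
	"math/rand"
	"sync/atomic"
	"time"

	"k8s.io/client-go/util/workqueue"
)

// jitterAfter mirrors TimeJitterAfter: newT only counts as new enough once it
// is more than duration plus a random jitter (up to maxJitter) past oldT.
func jitterAfter(newT, oldT time.Time, duration, maxJitter time.Duration) bool {
	var jitter time.Duration
	if maxJitter > 0 {
		jitter = time.Duration(rand.Int63n(int64(maxJitter)))
	}
	return newT.After(oldT.Add(duration + jitter))
}

func main() {
	// Hypothetical work items standing in for the session's jobs.
	jobs := []string{"job-a", "job-b", "job-c", "job-d"}
	var updated int64

	// Same fan-out primitive the jobUpdater uses (16 workers in the real code);
	// workers index into a pre-built slice, so no extra locking is needed.
	workqueue.ParallelizeUntil(context.TODO(), 2, len(jobs), func(i int) {
		// Pretend this job's condition last transitioned 90s ago; with a 60s
		// threshold and up to 30s of jitter the write-back may or may not be due.
		last := time.Now().Add(-90 * time.Second)
		if jitterAfter(time.Now(), last, time.Minute, 30*time.Second) {
			// Here the real code would call the cache's UpdateJobStatus with updatePG=true.
			atomic.AddInt64(&updated, 1)
		}
	})

	fmt.Printf("updated %d of %d jobs\n", updated, len(jobs))
}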