Skip to content

Commit

Permalink
Merge pull request #289 from lminzhw/add_unknown_pod
Browse files Browse the repository at this point in the history
count unknown pod as [Unknown] in job status
  • Loading branch information
volcano-sh-bot authored Jul 5, 2019
2 parents e7c60c4 + 84c2340 commit 22a297e
Show file tree
Hide file tree
Showing 5 changed files with 41 additions and 32 deletions.
10 changes: 7 additions & 3 deletions pkg/apis/batch/v1alpha1/job.go
Original file line number Diff line number Diff line change
Expand Up @@ -285,15 +285,19 @@ type JobStatus struct {
// +optional
Terminating int32 `json:"terminating,omitempty" protobuf:"bytes,7,opt,name=terminating"`

// The number of pods which reached phase Unknown.
// +optional
Unknown int32 `json:"unknown,omitempty" protobuf:"bytes,8,opt,name=unknown"`

//Current version of job
Version int32 `json:"version,omitempty" protobuf:"bytes,8,opt,name=version"`
Version int32 `json:"version,omitempty" protobuf:"bytes,9,opt,name=version"`

// The number of Job retries.
// +optional
RetryCount int32 `json:"retryCount,omitempty" protobuf:"bytes,9,opt,name=retryCount"`
RetryCount int32 `json:"retryCount,omitempty" protobuf:"bytes,10,opt,name=retryCount"`

// The resources that controlled by this job, e.g. Service, ConfigMap
ControlledResources map[string]string `json:"controlledResources,omitempty" protobuf:"bytes,8,opt,name=controlledResources"`
ControlledResources map[string]string `json:"controlledResources,omitempty" protobuf:"bytes,11,opt,name=controlledResources"`
}

// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
Expand Down
10 changes: 6 additions & 4 deletions pkg/cli/job/list.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ const (
Version string = "Version"
// Failed failed
Failed string = "Failed"
// Unknown pod
Unknown string = "Unknown"
// RetryCount retry count
RetryCount string = "RetryCount"
// JobType job type
Expand Down Expand Up @@ -110,8 +112,8 @@ func ListJobs() error {
// PrintJobs prints all jobs details
func PrintJobs(jobs *v1alpha1.JobList, writer io.Writer) {
maxNameLen := getMaxNameLen(jobs)
_, err := fmt.Fprintf(writer, fmt.Sprintf("%%-%ds%%-25s%%-12s%%-12s%%-12s%%-6s%%-10s%%-10s%%-12s%%-10s%%-12s\n", maxNameLen),
Name, Creation, Phase, JobType, Replicas, Min, Pending, Running, Succeeded, Failed, RetryCount)
_, err := fmt.Fprintf(writer, fmt.Sprintf("%%-%ds%%-25s%%-12s%%-12s%%-12s%%-6s%%-10s%%-10s%%-12s%%-10s%%-12s%%-10s\n", maxNameLen),
Name, Creation, Phase, JobType, Replicas, Min, Pending, Running, Succeeded, Failed, Unknown, RetryCount)
if err != nil {
fmt.Printf("Failed to print list command result: %s.\n", err)
}
Expand All @@ -131,9 +133,9 @@ func PrintJobs(jobs *v1alpha1.JobList, writer io.Writer) {
if jobType == "" {
jobType = "Batch"
}
_, err = fmt.Fprintf(writer, fmt.Sprintf("%%-%ds%%-25s%%-12s%%-12s%%-12d%%-6d%%-10d%%-10d%%-12d%%-10d%%-12d\n", maxNameLen),
_, err = fmt.Fprintf(writer, fmt.Sprintf("%%-%ds%%-25s%%-12s%%-12s%%-12d%%-6d%%-10d%%-10d%%-12d%%-10d%%-12d%%-10d\n", maxNameLen),
job.Name, job.CreationTimestamp.Format("2006-01-02 15:04:05"), job.Status.State.Phase, jobType, replicas,
job.Status.MinAvailable, job.Status.Pending, job.Status.Running, job.Status.Succeeded, job.Status.Failed, job.Status.RetryCount)
job.Status.MinAvailable, job.Status.Pending, job.Status.Running, job.Status.Succeeded, job.Status.Failed, job.Status.Unknown, job.Status.RetryCount)
if err != nil {
fmt.Printf("Failed to print list command result: %s.\n", err)
}
Expand Down
3 changes: 3 additions & 0 deletions pkg/cli/job/view.go
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,9 @@ func PrintJobInfo(job *v1alpha1.Job, writer io.Writer) {
if job.Status.Terminating > 0 {
WriteLine(writer, Level1, "Terminating: \t%d\n", job.Status.Terminating)
}
if job.Status.Unknown > 0 {
WriteLine(writer, Level1, "Unknown: \t%d\n", job.Status.Unknown)
}
if job.Status.RetryCount > 0 {
WriteLine(writer, Level1, "RetryCount: \t%d\n", job.Status.RetryCount)
}
Expand Down
1 change: 1 addition & 0 deletions pkg/cli/job/view_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ func TestViewJob(t *testing.T) {
Pending: 3,
Running: 1,
Failed: 2,
Unknown: 10,
Terminating: 4,
RetryCount: 5,
MinAvailable: 6,
Expand Down
49 changes: 24 additions & 25 deletions pkg/controllers/job/job_controller_actions.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ func (cc *Controller) killJob(jobInfo *apis.JobInfo, podRetainPhase state.PhaseM
return nil
}

var pending, running, terminating, succeeded, failed int32
var pending, running, terminating, succeeded, failed, unknown int32

var errs []error
var total int
Expand Down Expand Up @@ -76,16 +76,7 @@ func (cc *Controller) killJob(jobInfo *apis.JobInfo, podRetainPhase state.PhaseM
cc.resyncTask(pod)
}

switch pod.Status.Phase {
case v1.PodRunning:
running++
case v1.PodPending:
pending++
case v1.PodSucceeded:
succeeded++
case v1.PodFailed:
failed++
}
classifyAndAddUpPodBaseOnPhase(pod, &pending, &running, &succeeded, &failed, &unknown)
}
}

Expand All @@ -108,6 +99,7 @@ func (cc *Controller) killJob(jobInfo *apis.JobInfo, podRetainPhase state.PhaseM
Succeeded: succeeded,
Failed: failed,
Terminating: terminating,
Unknown: unknown,
Version: job.Status.Version,
MinAvailable: int32(job.Spec.MinAvailable),
RetryCount: job.Status.RetryCount,
Expand Down Expand Up @@ -217,7 +209,7 @@ func (cc *Controller) syncJob(jobInfo *apis.JobInfo, updateStatus state.UpdateSt
return nil
}

var running, pending, terminating, succeeded, failed int32
var running, pending, terminating, succeeded, failed, unknown int32

var podToCreate []*v1.Pod
var podToDelete []*v1.Pod
Expand Down Expand Up @@ -250,16 +242,7 @@ func (cc *Controller) syncJob(jobInfo *apis.JobInfo, updateStatus state.UpdateSt
continue
}

switch pod.Status.Phase {
case v1.PodPending:
pending++
case v1.PodRunning:
running++
case v1.PodSucceeded:
succeeded++
case v1.PodFailed:
failed++
}
classifyAndAddUpPodBaseOnPhase(pod, &pending, &running, &succeeded, &failed, &unknown)
}
}

Expand All @@ -273,7 +256,7 @@ func (cc *Controller) syncJob(jobInfo *apis.JobInfo, updateStatus state.UpdateSt
for _, pod := range podToCreate {
go func(pod *v1.Pod) {
defer waitCreationGroup.Done()
_, err := cc.kubeClients.CoreV1().Pods(pod.Namespace).Create(pod)
newPod, err := cc.kubeClients.CoreV1().Pods(pod.Namespace).Create(pod)
if err != nil && !apierrors.IsAlreadyExists(err) {
// Failed to create Pod, waitCreationGroup a moment and then create it again
// This is to ensure all podsMap under the same Job created
Expand All @@ -286,8 +269,7 @@ func (cc *Controller) syncJob(jobInfo *apis.JobInfo, updateStatus state.UpdateSt
cc.resyncTask(pod)
}

// TODO: maybe not pending status, maybe unknown.
pending++
classifyAndAddUpPodBaseOnPhase(newPod, &pending, &running, &succeeded, &failed, &unknown)
glog.V(3).Infof("Created Task <%s> of Job <%s/%s>",
pod.Name, job.Namespace, job.Name)
}
Expand Down Expand Up @@ -340,6 +322,7 @@ func (cc *Controller) syncJob(jobInfo *apis.JobInfo, updateStatus state.UpdateSt
Succeeded: succeeded,
Failed: failed,
Terminating: terminating,
Unknown: unknown,
Version: job.Status.Version,
MinAvailable: int32(job.Spec.MinAvailable),
ControlledResources: job.Status.ControlledResources,
Expand Down Expand Up @@ -559,3 +542,19 @@ func (cc *Controller) initJobStatus(job *vkv1.Job) (*vkv1.Job, error) {

return newJob, nil
}

func classifyAndAddUpPodBaseOnPhase(pod *v1.Pod, pending, running, succeeded, failed, unknown *int32) {
switch pod.Status.Phase {
case v1.PodPending:
*pending++
case v1.PodRunning:
*running++
case v1.PodSucceeded:
*succeeded++
case v1.PodFailed:
*failed++
default:
*unknown++
}
return
}

0 comments on commit 22a297e

Please sign in to comment.