/*
Copyright 2019 The Volcano Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package job

import (
	"fmt"
	"sort"
	"sync"
	"sync/atomic"

	v1 "k8s.io/api/core/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/klog"

	batch "volcano.sh/volcano/pkg/apis/batch/v1alpha1"
	"volcano.sh/volcano/pkg/apis/helpers"
	scheduling "volcano.sh/volcano/pkg/apis/scheduling/v1beta1"
	"volcano.sh/volcano/pkg/controllers/apis"
	jobhelpers "volcano.sh/volcano/pkg/controllers/job/helpers"
	"volcano.sh/volcano/pkg/controllers/job/state"
)

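// killJob tears down the pods of a job: every pod whose phase is not in
// podRetainPhase is deleted, the job status counters are recomputed, the
// status version is bumped, and the job's PodGroup is removed. The optional
// updateStatus callback may mutate the status before it is persisted.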
func (cc *Controller) killJob(jobInfo *apis.JobInfo, podRetainPhase state.PhaseMap, updateStatus state.UpdateStatusFn) error {
	klog.V(3).Infof("Killing Job <%s/%s>", jobInfo.Job.Namespace, jobInfo.Job.Name)
	defer klog.V(3).Infof("Finished Job <%s/%s> killing", jobInfo.Job.Namespace, jobInfo.Job.Name)

	job := jobInfo.Job
	klog.Infof("Current Version is: %d of job: %s/%s", job.Status.Version, job.Namespace, job.Name)
	if job.DeletionTimestamp != nil {
		klog.Infof("Job <%s/%s> is terminating, skip management process.",
			job.Namespace, job.Name)
		return nil
	}

	var pending, running, terminating, succeeded, failed, unknown int32

	var errs []error
	var total int

	for _, pods := range jobInfo.Pods {
		for _, pod := range pods {
			total++

			if pod.DeletionTimestamp != nil {
				klog.Infof("Pod <%s/%s> is terminating", pod.Namespace, pod.Name)
				terminating++
				continue
			}

			_, retain := podRetainPhase[pod.Status.Phase]

			if !retain {
				err := cc.deleteJobPod(job.Name, pod)
				if err == nil {
					terminating++
					continue
				}
				// Record the error, then fall through so the pod is still
				// counted by phase, the same as a retained pod.
				errs = append(errs, err)
				cc.resyncTask(pod)
			}

			classifyAndAddUpPodBaseOnPhase(pod, &pending, &running, &succeeded, &failed, &unknown)
		}
	}

	if len(errs) != 0 {
		klog.Errorf("failed to kill pods for job %s/%s, with err %+v", job.Namespace, job.Name, errs)
		cc.recorder.Event(job, v1.EventTypeWarning, FailedDeletePodReason,
			fmt.Sprintf("Error deleting pods: %+v", errs))
		return fmt.Errorf("failed to kill %d pods of %d", len(errs), total)
	}

	job = job.DeepCopy()
	// The job version is bumped only when the job is killed.
	job.Status.Version = job.Status.Version + 1

	job.Status = batch.JobStatus{
		State: job.Status.State,

		Pending:      pending,
		Running:      running,
		Succeeded:    succeeded,
		Failed:       failed,
		Terminating:  terminating,
		Unknown:      unknown,
		Version:      job.Status.Version,
		MinAvailable: job.Spec.MinAvailable,
		RetryCount:   job.Status.RetryCount,
	}

	if updateStatus != nil {
		if updateStatus(&job.Status) {
			job.Status.State.LastTransitionTime = metav1.Now()
		}
	}

	// Update Job status.
	newJob, err := cc.vcClient.BatchV1alpha1().Jobs(job.Namespace).UpdateStatus(job)
	if err != nil {
		klog.Errorf("Failed to update status of Job %v/%v: %v",
			job.Namespace, job.Name, err)
		return err
	}
	if e := cc.cache.Update(newJob); e != nil {
		klog.Errorf("KillJob - Failed to update Job %v/%v in cache: %v",
			newJob.Namespace, newJob.Name, e)
		return e
	}

	// Delete the PodGroup.
	if err := cc.vcClient.SchedulingV1beta1().PodGroups(job.Namespace).Delete(job.Name, nil); err != nil {
		if !apierrors.IsNotFound(err) {
			klog.Errorf("Failed to delete PodGroup of Job %v/%v: %v",
				job.Namespace, job.Name, err)
			return err
		}
	}

	if err := cc.pluginOnJobDelete(job); err != nil {
		return err
	}

	// NOTE(k82cn): DO NOT delete input/output until the job is deleted.

	return nil
}

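// initiateJob performs first-time setup for a job: it initializes the job
// status, runs the job-add plugins, creates input/output PVCs if needed, and
// creates the PodGroup that the scheduler uses for gang scheduling.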
func (cc *Controller) initiateJob(job *batch.Job) (*batch.Job, error) {
	klog.V(3).Infof("Starting to initiate Job <%s/%s>", job.Namespace, job.Name)
	defer klog.V(3).Infof("Finished Job <%s/%s> initiate", job.Namespace, job.Name)

	klog.Infof("Current Version is: %d of job: %s/%s", job.Status.Version, job.Namespace, job.Name)
	job, err := cc.initJobStatus(job)
	if err != nil {
		cc.recorder.Event(job, v1.EventTypeWarning, string(batch.JobStatusError),
			fmt.Sprintf("Failed to initialize job status, err: %v", err))
		return nil, err
	}

	if err := cc.pluginOnJobAdd(job); err != nil {
		cc.recorder.Event(job, v1.EventTypeWarning, string(batch.PluginError),
			fmt.Sprintf("Failed to execute plugin on job add, err: %v", err))
		return nil, err
	}

	newJob, err := cc.createJobIOIfNotExist(job)
	if err != nil {
		cc.recorder.Event(job, v1.EventTypeWarning, string(batch.PVCError),
			fmt.Sprintf("Failed to create PVC, err: %v", err))
		return nil, err
	}

	if err := cc.createOrUpdatePodGroup(newJob); err != nil {
		cc.recorder.Event(job, v1.EventTypeWarning, string(batch.PodGroupError),
			fmt.Sprintf("Failed to create PodGroup, err: %v", err))
		return nil, err
	}

	return newJob, nil
}

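// initOnJobUpdate re-runs the idempotent parts of job setup on every update:
// the job-update plugins and the create-or-update of the PodGroup.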
func (cc *Controller) initOnJobUpdate(job *batch.Job) error {
	klog.V(3).Infof("Starting to initiate Job <%s/%s> on update", job.Namespace, job.Name)
	defer klog.V(3).Infof("Finished Job <%s/%s> initiate on update", job.Namespace, job.Name)

	klog.Infof("Current Version is: %d of job: %s/%s", job.Status.Version, job.Namespace, job.Name)

	if err := cc.pluginOnJobUpdate(job); err != nil {
		cc.recorder.Event(job, v1.EventTypeWarning, string(batch.PluginError),
			fmt.Sprintf("Failed to execute plugin on job update, err: %v", err))
		return err
	}

	if err := cc.createOrUpdatePodGroup(job); err != nil {
		cc.recorder.Event(job, v1.EventTypeWarning, string(batch.PodGroupError),
			fmt.Sprintf("Failed to create PodGroup, err: %v", err))
		return err
	}

	return nil
}

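// syncJob reconciles the pods of a job with its task specs: missing pods are
// created, surplus pods are deleted, and the job status counters are
// recomputed and persisted. Pods are only synced once the PodGroup has left
// the Pending phase.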
func (cc *Controller) syncJob(jobInfo *apis.JobInfo, updateStatus state.UpdateStatusFn) error {
	klog.V(3).Infof("Starting to sync up Job <%s/%s>", jobInfo.Job.Namespace, jobInfo.Job.Name)
	defer klog.V(3).Infof("Finished Job <%s/%s> sync up", jobInfo.Job.Namespace, jobInfo.Job.Name)

	job := jobInfo.Job.DeepCopy()
	klog.Infof("Current Version is: %d of job: %s/%s", job.Status.Version, job.Namespace, job.Name)

	if job.DeletionTimestamp != nil {
		klog.Infof("Job <%s/%s> is terminating, skip management process.",
			job.Namespace, job.Name)
		return nil
	}

	// Skip job initiation if the job is already initiated.
	if !isInitiated(job) {
		var err error
		if job, err = cc.initiateJob(job); err != nil {
			return err
		}
	} else {
		var err error
		// TODO: optimize this; call it only on scale up/down.
		if err = cc.initOnJobUpdate(job); err != nil {
			return err
		}
	}

	// Only sync tasks once the PodGroup has left the Pending phase, i.e. the
	// scheduler has started to admit the job.
	var syncTask bool
	if pg, _ := cc.pgLister.PodGroups(job.Namespace).Get(job.Name); pg != nil {
		if pg.Status.Phase != "" && pg.Status.Phase != scheduling.PodGroupPending {
			syncTask = true
		}
	}

	if !syncTask {
		if updateStatus != nil {
			if updateStatus(&job.Status) {
				job.Status.State.LastTransitionTime = metav1.Now()
			}
		}
		newJob, err := cc.vcClient.BatchV1alpha1().Jobs(job.Namespace).UpdateStatus(job)
		if err != nil {
			klog.Errorf("Failed to update status of Job %v/%v: %v",
				job.Namespace, job.Name, err)
			return err
		}
		if e := cc.cache.Update(newJob); e != nil {
			klog.Errorf("SyncJob - Failed to update Job %v/%v in cache: %v",
				newJob.Namespace, newJob.Name, e)
			return e
		}
		return nil
	}

	var running, pending, terminating, succeeded, failed, unknown int32

	var podToCreate []*v1.Pod
	var podToDelete []*v1.Pod
	var creationErrs []error
	var deletionErrs []error
	appendMutex := sync.Mutex{}

	appendError := func(container *[]error, err error) {
		appendMutex.Lock()
		defer appendMutex.Unlock()
		*container = append(*container, err)
	}

	for _, ts := range job.Spec.Tasks {
		ts.Template.Name = ts.Name
		tc := ts.Template.DeepCopy()
		name := ts.Template.Name

		pods, found := jobInfo.Pods[name]
		if !found {
			pods = map[string]*v1.Pod{}
		}

		for i := 0; i < int(ts.Replicas); i++ {
			podName := fmt.Sprintf(jobhelpers.PodNameFmt, job.Name, name, i)
			if pod, found := pods[podName]; !found {
				newPod := createJobPod(job, tc, i)
				if err := cc.pluginOnPodCreate(job, newPod); err != nil {
					return err
				}
				podToCreate = append(podToCreate, newPod)
			} else {
				delete(pods, podName)
				if pod.DeletionTimestamp != nil {
					klog.Infof("Pod <%s/%s> is terminating", pod.Namespace, pod.Name)
					atomic.AddInt32(&terminating, 1)
					continue
				}

				classifyAndAddUpPodBaseOnPhase(pod, &pending, &running, &succeeded, &failed, &unknown)
			}
		}

		// Any pods left over are no longer wanted by the task spec and
		// should be deleted, e.g. after a scale down.
		for _, pod := range pods {
			podToDelete = append(podToDelete, pod)
		}
	}

	waitCreationGroup := sync.WaitGroup{}
	waitCreationGroup.Add(len(podToCreate))
	for _, pod := range podToCreate {
		go func(pod *v1.Pod) {
			defer waitCreationGroup.Done()
			newPod, err := cc.kubeClient.CoreV1().Pods(pod.Namespace).Create(pod)
			if err != nil && !apierrors.IsAlreadyExists(err) {
				// Failed to create the Pod: record the error so this sync
				// fails and is retried. This ensures all pods under the same
				// Job get created, so gang scheduling can schedule the Job
				// successfully.
				klog.Errorf("Failed to create pod %s for Job %s, err %#v",
					pod.Name, job.Name, err)
				appendError(&creationErrs, fmt.Errorf("failed to create pod %s, err: %#v", pod.Name, err))
			} else if err == nil {
				// On an AlreadyExists error newPod is nil, so only classify
				// pods that were actually created here; pre-existing pods are
				// counted from the cache on the next sync.
				classifyAndAddUpPodBaseOnPhase(newPod, &pending, &running, &succeeded, &failed, &unknown)
				klog.V(3).Infof("Created Task <%s> of Job <%s/%s>",
					pod.Name, job.Namespace, job.Name)
			}
		}(pod)
	}
	waitCreationGroup.Wait()

	if len(creationErrs) != 0 {
		cc.recorder.Event(job, v1.EventTypeWarning, FailedCreatePodReason,
			fmt.Sprintf("Error creating pods: %+v", creationErrs))
		return fmt.Errorf("failed to create %d pods of %d", len(creationErrs), len(podToCreate))
	}

	// Delete pods when scaling down.
	waitDeletionGroup := sync.WaitGroup{}
	waitDeletionGroup.Add(len(podToDelete))
	for _, pod := range podToDelete {
		go func(pod *v1.Pod) {
			defer waitDeletionGroup.Done()
			err := cc.deleteJobPod(job.Name, pod)
			if err != nil {
				// Failed to delete the Pod: record the error and resync the
				// task so the deletion is retried on the next round.
				klog.Errorf("Failed to delete pod %s for Job %s, err %#v",
					pod.Name, job.Name, err)
				appendError(&deletionErrs, err)
				cc.resyncTask(pod)
			} else {
				klog.V(3).Infof("Deleted Task <%s> of Job <%s/%s>",
					pod.Name, job.Namespace, job.Name)
				atomic.AddInt32(&terminating, 1)
			}
		}(pod)
	}
	waitDeletionGroup.Wait()

	if len(deletionErrs) != 0 {
		cc.recorder.Event(job, v1.EventTypeWarning, FailedDeletePodReason,
			fmt.Sprintf("Error deleting pods: %+v", deletionErrs))
		return fmt.Errorf("failed to delete %d pods of %d", len(deletionErrs), len(podToDelete))
	}

	job.Status = batch.JobStatus{
		State: job.Status.State,

		Pending:             pending,
		Running:             running,
		Succeeded:           succeeded,
		Failed:              failed,
		Terminating:         terminating,
		Unknown:             unknown,
		Version:             job.Status.Version,
		MinAvailable:        job.Spec.MinAvailable,
		ControlledResources: job.Status.ControlledResources,
		RetryCount:          job.Status.RetryCount,
	}

	if updateStatus != nil {
		if updateStatus(&job.Status) {
			job.Status.State.LastTransitionTime = metav1.Now()
		}
	}
	newJob, err := cc.vcClient.BatchV1alpha1().Jobs(job.Namespace).UpdateStatus(job)
	if err != nil {
		klog.Errorf("Failed to update status of Job %v/%v: %v",
			job.Namespace, job.Name, err)
		return err
	}
	if e := cc.cache.Update(newJob); e != nil {
		klog.Errorf("SyncJob - Failed to update Job %v/%v in cache: %v",
			newJob.Namespace, newJob.Name, e)
		return e
	}

	return nil
}

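// createJobIOIfNotExist ensures every volume of the job has a backing PVC:
// volumes without a VolumeClaimName get a generated name (and a PVC created
// from their VolumeClaim spec), while named volumes must reference an
// existing PVC. The claim names are recorded in Status.ControlledResources.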
func (cc *Controller) createJobIOIfNotExist(job *batch.Job) (*batch.Job, error) {
	// If the PVCs do not exist, create them for the Job.
	var needUpdate bool
	if job.Status.ControlledResources == nil {
		job.Status.ControlledResources = make(map[string]string)
	}
	for index, volume := range job.Spec.Volumes {
		vcName := volume.VolumeClaimName
		if len(vcName) == 0 {
			// NOTE(k82cn): Ensure generated names are never duplicated.
			for {
				vcName = jobhelpers.GenPVCName(job.Name)
				exist, err := cc.checkPVCExist(job, vcName)
				if err != nil {
					return job, err
				}
				if exist {
					continue
				}
				job.Spec.Volumes[index].VolumeClaimName = vcName
				needUpdate = true
				break
			}
			// TODO: check that VolumeClaim is set when VolumeClaimName is empty.
			if volume.VolumeClaim != nil {
				if err := cc.createPVC(job, vcName, volume.VolumeClaim); err != nil {
					return job, err
				}
			}
		} else {
			exist, err := cc.checkPVCExist(job, vcName)
			if err != nil {
				return job, err
			}
			if !exist {
				return job, fmt.Errorf("pvc %s is not found, the job will be in the Pending state until the PVC is created", vcName)
			}
		}
		job.Status.ControlledResources["volume-pvc-"+vcName] = vcName
	}
	if needUpdate {
		newJob, err := cc.vcClient.BatchV1alpha1().Jobs(job.Namespace).Update(job)
		if err != nil {
			klog.Errorf("Failed to update Job %v/%v for volume claim name: %v",
				job.Namespace, job.Name, err)
			return job, err
		}

		newJob.Status = job.Status
		return newJob, err
	}
	return job, nil
}

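// checkPVCExist reports whether the named PVC exists in the job's namespace,
// treating a NotFound error as "does not exist" rather than a failure.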
func (cc *Controller) checkPVCExist(job *batch.Job, pvc string) (bool, error) {
	if _, err := cc.pvcLister.PersistentVolumeClaims(job.Namespace).Get(pvc); err != nil {
		if apierrors.IsNotFound(err) {
			return false, nil
		}
		klog.V(3).Infof("Failed to get PVC %s for job <%s/%s>: %v",
			pvc, job.Namespace, job.Name, err)
		return false, err
	}
	return true, nil
}

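// createPVC creates a PersistentVolumeClaim for the job from the given claim
// spec, owned by the job so that it is garbage collected along with it.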
func (cc *Controller) createPVC(job *batch.Job, vcName string, volumeClaim *v1.PersistentVolumeClaimSpec) error {
	pvc := &v1.PersistentVolumeClaim{
		ObjectMeta: metav1.ObjectMeta{
			Namespace: job.Namespace,
			Name:      vcName,
			OwnerReferences: []metav1.OwnerReference{
				*metav1.NewControllerRef(job, helpers.JobKind),
			},
		},
		Spec: *volumeClaim,
	}

	klog.V(3).Infof("Try to create PVC: %v", pvc)

	if _, e := cc.kubeClient.CoreV1().PersistentVolumeClaims(job.Namespace).Create(pvc); e != nil {
		klog.V(3).Infof("Failed to create PVC for Job <%s/%s>: %v",
			job.Namespace, job.Name, e)
		return e
	}
	return nil
}

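// createOrUpdatePodGroup makes sure the job has a PodGroup whose MinMember
// and MinResources match the job's current minAvailable, creating the group
// if it does not exist yet.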
func (cc *Controller) createOrUpdatePodGroup(job *batch.Job) error {
	// If the PodGroup does not exist, create one for the Job.
	pg, err := cc.pgLister.PodGroups(job.Namespace).Get(job.Name)
	if err != nil {
		if !apierrors.IsNotFound(err) {
			klog.V(3).Infof("Failed to get PodGroup for Job <%s/%s>: %v",
				job.Namespace, job.Name, err)
			return err
		}
		pg := &scheduling.PodGroup{
			ObjectMeta: metav1.ObjectMeta{
				Namespace:   job.Namespace,
				Name:        job.Name,
				Annotations: job.Annotations,
				OwnerReferences: []metav1.OwnerReference{
					*metav1.NewControllerRef(job, helpers.JobKind),
				},
			},
			Spec: scheduling.PodGroupSpec{
				MinMember:         job.Spec.MinAvailable,
				Queue:             job.Spec.Queue,
				MinResources:      cc.calcPGMinResources(job),
				PriorityClassName: job.Spec.PriorityClassName,
			},
		}

		if _, err = cc.vcClient.SchedulingV1beta1().PodGroups(job.Namespace).Create(pg); err != nil {
			if !apierrors.IsAlreadyExists(err) {
				klog.V(3).Infof("Failed to create PodGroup for Job <%s/%s>: %v",
					job.Namespace, job.Name, err)
				return err
			}
		}
		return nil
	}

	// Keep the PodGroup in sync with the Job's minAvailable.
	if pg.Spec.MinMember != job.Spec.MinAvailable {
		// Work on a copy; objects returned by the lister are shared cache state.
		pg = pg.DeepCopy()
		pg.Spec.MinMember = job.Spec.MinAvailable
		pg.Spec.MinResources = cc.calcPGMinResources(job)
		if _, err = cc.vcClient.SchedulingV1beta1().PodGroups(job.Namespace).Update(pg); err != nil {
			klog.V(3).Infof("Failed to update PodGroup for Job <%s/%s>: %v",
				job.Namespace, job.Name, err)
			return err
		}
	}

	return nil
}

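// deleteJobPod deletes a single pod of the job, ignoring NotFound errors so
// that deletion is idempotent.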
func (cc *Controller) deleteJobPod(jobName string, pod *v1.Pod) error {
	err := cc.kubeClient.CoreV1().Pods(pod.Namespace).Delete(pod.Name, nil)
	if err != nil && !apierrors.IsNotFound(err) {
		klog.Errorf("Failed to delete pod %s/%s for Job %s, err %#v",
			pod.Namespace, pod.Name, jobName, err)

		return fmt.Errorf("failed to delete pod %s, err %#v", pod.Name, err)
	}

	return nil
}

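// calcPGMinResources estimates the minimal resources the PodGroup needs by
// summing the container requests/limits of the first minAvailable pods,
// taking tasks in priority-class order.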
func (cc *Controller) calcPGMinResources(job *batch.Job) *v1.ResourceList {
	cc.Mutex.Lock()
	defer cc.Mutex.Unlock()

	// Sort tasks by the value of their priority class.
	var tasksPriority TasksPriority
	for index := range job.Spec.Tasks {
		tp := TaskPriority{0, job.Spec.Tasks[index]}
		pc := job.Spec.Tasks[index].Template.Spec.PriorityClassName
		if len(cc.priorityClasses) != 0 && cc.priorityClasses[pc] != nil {
			tp.priority = cc.priorityClasses[pc].Value
		}
		tasksPriority = append(tasksPriority, tp)
	}

	sort.Sort(tasksPriority)

	// Sum the resources of the first minAvailable pods.
	minAvailableTasksRes := v1.ResourceList{}
	podCnt := int32(0)
	for _, task := range tasksPriority {
		for i := int32(0); i < task.Replicas; i++ {
			if podCnt >= job.Spec.MinAvailable {
				break
			}
			podCnt++
			for _, c := range task.Template.Spec.Containers {
				addResourceList(minAvailableTasksRes, c.Resources.Requests, c.Resources.Limits)
			}
		}
	}

	return &minAvailableTasksRes
}

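// initJobStatus moves a brand-new job into the Pending phase and persists
// that initial status; jobs that already have a phase are returned as-is.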
func (cc *Controller) initJobStatus(job *batch.Job) (*batch.Job, error) {
	if job.Status.State.Phase != "" {
		return job, nil
	}

	job.Status.State.Phase = batch.Pending
	job.Status.State.LastTransitionTime = metav1.Now()
	job.Status.MinAvailable = job.Spec.MinAvailable
	newJob, err := cc.vcClient.BatchV1alpha1().Jobs(job.Namespace).UpdateStatus(job)
	if err != nil {
		klog.Errorf("Failed to update status of Job %v/%v: %v",
			job.Namespace, job.Name, err)
		return nil, err
	}
	if err := cc.cache.Update(newJob); err != nil {
		klog.Errorf("CreateJob - Failed to update Job %v/%v in cache: %v",
			newJob.Namespace, newJob.Name, err)
		return nil, err
	}

	return newJob, nil
}

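// updateJobStatus persists the job's status subresource and refreshes the
// controller cache with the returned object.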
func (cc *Controller) updateJobStatus(job *batch.Job) (*batch.Job, error) {
	newJob, err := cc.vcClient.BatchV1alpha1().Jobs(job.Namespace).UpdateStatus(job)
	if err != nil {
		klog.Errorf("Failed to update status of Job %v/%v: %v",
			job.Namespace, job.Name, err)
		return nil, err
	}
	if err := cc.cache.Update(newJob); err != nil {
		klog.Errorf("UpdateJob - Failed to update Job %v/%v in cache: %v",
			newJob.Namespace, newJob.Name, err)
		return nil, err
	}

	return newJob, nil
}

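// classifyAndAddUpPodBaseOnPhase bumps the counter matching the pod's phase;
// the counters are updated atomically so callers may invoke it concurrently.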
func classifyAndAddUpPodBaseOnPhase(pod *v1.Pod, pending, running, succeeded, failed, unknown *int32) {
	switch pod.Status.Phase {
	case v1.PodPending:
		atomic.AddInt32(pending, 1)
	case v1.PodRunning:
		atomic.AddInt32(running, 1)
	case v1.PodSucceeded:
		atomic.AddInt32(succeeded, 1)
	case v1.PodFailed:
		atomic.AddInt32(failed, 1)
	default:
		atomic.AddInt32(unknown, 1)
	}
}

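// isInitiated reports whether the job has progressed past initial setup,
// i.e. its phase is set and is neither Pending nor Restarting.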
func isInitiated(job *batch.Job) bool {
	if job.Status.State.Phase == "" || job.Status.State.Phase == batch.Pending || job.Status.State.Phase == batch.Restarting {
		return false
	}

	return true
}