// volcano/pkg/controllers/job/job_controller_actions.go
/*
Copyright 2019 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package job
import (
"fmt"
"sort"
"sync"
"sync/atomic"
v1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/klog"
batch "volcano.sh/volcano/pkg/apis/batch/v1alpha1"
"volcano.sh/volcano/pkg/apis/helpers"
scheduling "volcano.sh/volcano/pkg/apis/scheduling/v1beta1"
"volcano.sh/volcano/pkg/controllers/apis"
jobhelpers "volcano.sh/volcano/pkg/controllers/job/helpers"
"volcano.sh/volcano/pkg/controllers/job/state"
)
func (cc *Controller) killJob(jobInfo *apis.JobInfo, podRetainPhase state.PhaseMap, updateStatus state.UpdateStatusFn) error {
klog.V(3).Infof("Killing Job <%s/%s>", jobInfo.Job.Namespace, jobInfo.Job.Name)
defer klog.V(3).Infof("Finished Job <%s/%s> killing", jobInfo.Job.Namespace, jobInfo.Job.Name)
job := jobInfo.Job
klog.Infof("Current Version is: %d of job: %s/%s", job.Status.Version, job.Namespace, job.Name)
if job.DeletionTimestamp != nil {
klog.Infof("Job <%s/%s> is terminating, skip management process.",
job.Namespace, job.Name)
return nil
}
var pending, running, terminating, succeeded, failed, unknown int32
var errs []error
var total int
for _, pods := range jobInfo.Pods {
for _, pod := range pods {
total++
if pod.DeletionTimestamp != nil {
klog.Infof("Pod <%s/%s> is terminating", pod.Namespace, pod.Name)
terminating++
continue
}
_, retain := podRetainPhase[pod.Status.Phase]
if !retain {
err := cc.deleteJobPod(job.Name, pod)
if err == nil {
terminating++
continue
}
// Record the error, then fall through so this pod is counted like a retained pod.
errs = append(errs, err)
cc.resyncTask(pod)
}
classifyAndAddUpPodBaseOnPhase(pod, &pending, &running, &succeeded, &failed, &unknown)
}
}
if len(errs) != 0 {
klog.Errorf("failed to kill pods for job %s/%s, with err %+v", job.Namespace, job.Name, errs)
cc.recorder.Event(job, v1.EventTypeWarning, FailedDeletePodReason,
fmt.Sprintf("Error deleting pods: %+v", errs))
return fmt.Errorf("failed to kill %d pods of %d", len(errs), total)
}
job = job.DeepCopy()
// Job version is bumped only when job is killed
job.Status.Version = job.Status.Version + 1
job.Status = batch.JobStatus{
State: job.Status.State,
Pending: pending,
Running: running,
Succeeded: succeeded,
Failed: failed,
Terminating: terminating,
Unknown: unknown,
Version: job.Status.Version,
MinAvailable: job.Spec.MinAvailable,
ControlledResources: job.Status.ControlledResources,
RetryCount: job.Status.RetryCount,
}
if updateStatus != nil {
if updateStatus(&job.Status) {
job.Status.State.LastTransitionTime = metav1.Now()
}
}
// Update Job status
newJob, err := cc.vcClient.BatchV1alpha1().Jobs(job.Namespace).UpdateStatus(job)
if err != nil {
klog.Errorf("Failed to update status of Job %v/%v: %v",
job.Namespace, job.Name, err)
return err
}
if e := cc.cache.Update(newJob); e != nil {
klog.Errorf("KillJob - Failed to update Job %v/%v in cache: %v",
newJob.Namespace, newJob.Name, e)
return e
}
// Delete PodGroup
if err := cc.vcClient.SchedulingV1beta1().PodGroups(job.Namespace).Delete(job.Name, nil); err != nil {
if !apierrors.IsNotFound(err) {
klog.Errorf("Failed to delete PodGroup of Job %v/%v: %v",
job.Namespace, job.Name, err)
return err
}
}
if err := cc.pluginOnJobDelete(job); err != nil {
return err
}
// NOTE(k82cn): DO NOT delete input/output until job is deleted.
return nil
}
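// exampleRetainPhase is an illustrative sketch, not part of the original
// controller: killJob keeps a pod exactly when its phase appears as a key in
// the supplied state.PhaseMap, so an empty map deletes every pod, while a map
// containing Succeeded and Failed preserves completed pods.
func exampleRetainPhase(retain state.PhaseMap, pod *v1.Pod) bool {
	_, keep := retain[pod.Status.Phase]
	return keep
}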
func (cc *Controller) initiateJob(job *batch.Job) (*batch.Job, error) {
klog.V(3).Infof("Starting to initiate Job <%s/%s>", job.Namespace, job.Name)
defer klog.V(3).Infof("Finished Job <%s/%s> initiate", job.Namespace, job.Name)
klog.Infof("Current Version is: %d of job: %s/%s", job.Status.Version, job.Namespace, job.Name)
job, err := cc.initJobStatus(job)
if err != nil {
cc.recorder.Event(job, v1.EventTypeWarning, string(batch.JobStatusError),
fmt.Sprintf("Failed to initialize job status, err: %v", err))
return nil, err
}
if err := cc.pluginOnJobAdd(job); err != nil {
cc.recorder.Event(job, v1.EventTypeWarning, string(batch.PluginError),
fmt.Sprintf("Execute plugin when job add failed, err: %v", err))
return nil, err
}
newJob, err := cc.createJobIOIfNotExist(job)
if err != nil {
cc.recorder.Event(job, v1.EventTypeWarning, string(batch.PVCError),
fmt.Sprintf("Failed to create PVC, err: %v", err))
return nil, err
}
if err := cc.createOrUpdatePodGroup(newJob); err != nil {
cc.recorder.Event(job, v1.EventTypeWarning, string(batch.PodGroupError),
fmt.Sprintf("Failed to create PodGroup, err: %v", err))
return nil, err
}
return newJob, nil
}
func (cc *Controller) initOnJobUpdate(job *batch.Job) error {
klog.V(3).Infof("Starting to initiate Job <%s/%s> on update", job.Namespace, job.Name)
defer klog.V(3).Infof("Finished Job <%s/%s> initiate on update", job.Namespace, job.Name)
klog.Infof("Current Version is: %d of job: %s/%s", job.Status.Version, job.Namespace, job.Name)
if err := cc.pluginOnJobUpdate(job); err != nil {
cc.recorder.Event(job, v1.EventTypeWarning, string(batch.PluginError),
fmt.Sprintf("Execute plugin when job add failed, err: %v", err))
return err
}
if err := cc.createOrUpdatePodGroup(job); err != nil {
cc.recorder.Event(job, v1.EventTypeWarning, string(batch.PodGroupError),
fmt.Sprintf("Failed to create PodGroup, err: %v", err))
return err
}
return nil
}
func (cc *Controller) syncJob(jobInfo *apis.JobInfo, updateStatus state.UpdateStatusFn) error {
klog.V(3).Infof("Starting to sync up Job <%s/%s>", jobInfo.Job.Namespace, jobInfo.Job.Name)
defer klog.V(3).Infof("Finished Job <%s/%s> sync up", jobInfo.Job.Namespace, jobInfo.Job.Name)
job := jobInfo.Job.DeepCopy()
klog.Infof("Current Version is: %d of job: %s/%s", job.Status.Version, job.Namespace, job.Name)
if job.DeletionTimestamp != nil {
klog.Infof("Job <%s/%s> is terminating, skip management process.",
job.Namespace, job.Name)
return nil
}
// Initiate the job if it has not been initiated yet; otherwise run the lighter on-update path.
if !isInitiated(job) {
var err error
if job, err = cc.initiateJob(job); err != nil {
return err
}
} else {
var err error
// TODO: optimize this; call it only on scale up/down.
if err = cc.initOnJobUpdate(job); err != nil {
return err
}
}
var syncTask bool
if pg, _ := cc.pgLister.PodGroups(job.Namespace).Get(job.Name); pg != nil {
if pg.Status.Phase != "" && pg.Status.Phase != scheduling.PodGroupPending {
syncTask = true
}
}
if !syncTask {
if updateStatus != nil {
if updateStatus(&job.Status) {
job.Status.State.LastTransitionTime = metav1.Now()
}
}
newJob, err := cc.vcClient.BatchV1alpha1().Jobs(job.Namespace).UpdateStatus(job)
if err != nil {
klog.Errorf("Failed to update status of Job %v/%v: %v",
job.Namespace, job.Name, err)
return err
}
if e := cc.cache.Update(newJob); e != nil {
klog.Errorf("SyncJob - Failed to update Job %v/%v in cache: %v",
newJob.Namespace, newJob.Name, e)
return e
}
return nil
}
var running, pending, terminating, succeeded, failed, unknown int32
var podToCreate []*v1.Pod
var podToDelete []*v1.Pod
var creationErrs []error
var deletionErrs []error
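// appendError collects errors from the concurrent create/delete goroutines
// below; the mutex guards the shared error slices.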
appendMutex := sync.Mutex{}
appendError := func(container *[]error, err error) {
appendMutex.Lock()
defer appendMutex.Unlock()
*container = append(*container, err)
}
for _, ts := range job.Spec.Tasks {
ts.Template.Name = ts.Name
tc := ts.Template.DeepCopy()
name := ts.Template.Name
pods, found := jobInfo.Pods[name]
if !found {
pods = map[string]*v1.Pod{}
}
for i := 0; i < int(ts.Replicas); i++ {
podName := fmt.Sprintf(jobhelpers.PodNameFmt, job.Name, name, i)
if pod, found := pods[podName]; !found {
newPod := createJobPod(job, tc, i)
if err := cc.pluginOnPodCreate(job, newPod); err != nil {
return err
}
podToCreate = append(podToCreate, newPod)
} else {
delete(pods, podName)
if pod.DeletionTimestamp != nil {
klog.Infof("Pod <%s/%s> is terminating", pod.Namespace, pod.Name)
atomic.AddInt32(&terminating, 1)
continue
}
classifyAndAddUpPodBaseOnPhase(pod, &pending, &running, &succeeded, &failed, &unknown)
}
}
for _, pod := range pods {
podToDelete = append(podToDelete, pod)
}
}
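// Create all missing pods concurrently and wait for every goroutine, so any
// partial failure is detected before the job status is recomputed.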
waitCreationGroup := sync.WaitGroup{}
waitCreationGroup.Add(len(podToCreate))
for _, pod := range podToCreate {
go func(pod *v1.Pod) {
defer waitCreationGroup.Done()
newPod, err := cc.kubeClient.CoreV1().Pods(pod.Namespace).Create(pod)
if err != nil && !apierrors.IsAlreadyExists(err) {
// Failed to create the Pod; record the error so this sync fails and is retried.
// All pods of a Job must be created before gang-scheduling can place the Job.
klog.Errorf("Failed to create pod %s for Job %s, err %#v",
pod.Name, job.Name, err)
appendError(&creationErrs, fmt.Errorf("failed to create pod %s, err: %#v", pod.Name, err))
} else {
classifyAndAddUpPodBaseOnPhase(newPod, &pending, &running, &succeeded, &failed, &unknown)
klog.V(3).Infof("Created Task <%s> of Job <%s/%s>",
pod.Name, job.Namespace, job.Name)
}
}(pod)
}
waitCreationGroup.Wait()
if len(creationErrs) != 0 {
cc.recorder.Event(job, v1.EventTypeWarning, FailedCreatePodReason,
fmt.Sprintf("Error creating pods: %+v", creationErrs))
return fmt.Errorf("failed to create %d pods of %d", len(creationErrs), len(podToCreate))
}
// Delete surplus pods when scaling down.
waitDeletionGroup := sync.WaitGroup{}
waitDeletionGroup.Add(len(podToDelete))
for _, pod := range podToDelete {
go func(pod *v1.Pod) {
defer waitDeletionGroup.Done()
err := cc.deleteJobPod(job.Name, pod)
if err != nil {
// Failed to delete the Pod; record the error and resync the task so the
// deletion is retried.
klog.Errorf("Failed to delete pod %s for Job %s, err %#v",
pod.Name, job.Name, err)
appendError(&deletionErrs, err)
cc.resyncTask(pod)
} else {
klog.V(3).Infof("Deleted Task <%s> of Job <%s/%s>",
pod.Name, job.Namespace, job.Name)
atomic.AddInt32(&terminating, 1)
}
}(pod)
}
waitDeletionGroup.Wait()
if len(deletionErrs) != 0 {
cc.recorder.Event(job, v1.EventTypeWarning, FailedDeletePodReason,
fmt.Sprintf("Error deleting pods: %+v", deletionErrs))
return fmt.Errorf("failed to delete %d pods of %d", len(deletionErrs), len(podToDelete))
}
job.Status = batch.JobStatus{
State: job.Status.State,
Pending: pending,
Running: running,
Succeeded: succeeded,
Failed: failed,
Terminating: terminating,
Unknown: unknown,
Version: job.Status.Version,
MinAvailable: job.Spec.MinAvailable,
ControlledResources: job.Status.ControlledResources,
RetryCount: job.Status.RetryCount,
}
if updateStatus != nil {
if updateStatus(&job.Status) {
job.Status.State.LastTransitionTime = metav1.Now()
}
}
newJob, err := cc.vcClient.BatchV1alpha1().Jobs(job.Namespace).UpdateStatus(job)
if err != nil {
klog.Errorf("Failed to update status of Job %v/%v: %v",
job.Namespace, job.Name, err)
return err
}
if e := cc.cache.Update(newJob); e != nil {
klog.Errorf("SyncJob - Failed to update Job %v/%v in cache: %v",
newJob.Namespace, newJob.Name, e)
return e
}
return nil
}
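// exampleExpectedPodNames is an illustrative sketch, not part of the original
// controller, of the naming scheme syncJob relies on: a pod name is derived
// deterministically from the job name, task name, and replica index via
// jobhelpers.PodNameFmt, which lets syncJob match desired replicas against
// existing pods purely by name.
func exampleExpectedPodNames(job *batch.Job) []string {
	var names []string
	for _, ts := range job.Spec.Tasks {
		for i := 0; i < int(ts.Replicas); i++ {
			names = append(names, fmt.Sprintf(jobhelpers.PodNameFmt, job.Name, ts.Name, i))
		}
	}
	return names
}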
func (cc *Controller) createJobIOIfNotExist(job *batch.Job) (*batch.Job, error) {
// If PVC does not exist, create them for Job.
var needUpdate bool
if job.Status.ControlledResources == nil {
job.Status.ControlledResources = make(map[string]string)
}
for index, volume := range job.Spec.Volumes {
vcName := volume.VolumeClaimName
if len(vcName) == 0 {
// NOTE(k82cn): Ensure generated names are never duplicated.
for {
vcName = jobhelpers.GenPVCName(job.Name)
exist, err := cc.checkPVCExist(job, vcName)
if err != nil {
return job, err
}
if exist {
continue
}
job.Spec.Volumes[index].VolumeClaimName = vcName
needUpdate = true
break
}
// TODO: validate that VolumeClaim is set when VolumeClaimName is empty.
if volume.VolumeClaim != nil {
if err := cc.createPVC(job, vcName, volume.VolumeClaim); err != nil {
return job, err
}
}
} else {
exist, err := cc.checkPVCExist(job, vcName)
if err != nil {
return job, err
}
if !exist {
return job, fmt.Errorf("pvc %s is not found, the job will be in the Pending state until the PVC is created", vcName)
}
}
job.Status.ControlledResources["volume-pvc-"+vcName] = vcName
}
if needUpdate {
newJob, err := cc.vcClient.BatchV1alpha1().Jobs(job.Namespace).Update(job)
if err != nil {
klog.Errorf("Failed to update Job %v/%v for volume claim name: %v ",
job.Namespace, job.Name, err)
return job, err
}
newJob.Status = job.Status
return newJob, nil
}
return job, nil
}
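// exampleJobVolume is an illustrative sketch, not part of the original
// controller, of a volume spec that exercises createJobIOIfNotExist: with
// VolumeClaimName left empty and VolumeClaim set, a uniquely named PVC is
// generated and created on behalf of the job. The MountPath value and field
// layout follow the batch/v1alpha1 VolumeSpec type assumed from the imports.
func exampleJobVolume() batch.VolumeSpec {
	return batch.VolumeSpec{
		MountPath: "/data", // hypothetical mount path
		// VolumeClaimName is intentionally empty, so a unique name is generated.
		VolumeClaim: &v1.PersistentVolumeClaimSpec{
			AccessModes: []v1.PersistentVolumeAccessMode{v1.ReadWriteOnce},
		},
	}
}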
func (cc *Controller) checkPVCExist(job *batch.Job, pvc string) (bool, error) {
if _, err := cc.pvcLister.PersistentVolumeClaims(job.Namespace).Get(pvc); err != nil {
if apierrors.IsNotFound(err) {
return false, nil
}
klog.V(3).Infof("Failed to get PVC %s for job <%s/%s>: %v",
pvc, job.Namespace, job.Name, err)
return false, err
}
return true, nil
}
func (cc *Controller) createPVC(job *batch.Job, vcName string, volumeClaim *v1.PersistentVolumeClaimSpec) error {
pvc := &v1.PersistentVolumeClaim{
ObjectMeta: metav1.ObjectMeta{
Namespace: job.Namespace,
Name: vcName,
OwnerReferences: []metav1.OwnerReference{
*metav1.NewControllerRef(job, helpers.JobKind),
},
},
Spec: *volumeClaim,
}
klog.V(3).Infof("Try to create PVC: %v", pvc)
if _, e := cc.kubeClient.CoreV1().PersistentVolumeClaims(job.Namespace).Create(pvc); e != nil {
klog.V(3).Infof("Failed to create PVC for Job <%s/%s>: %v",
job.Namespace, job.Name, e)
return e
}
return nil
}
func (cc *Controller) createOrUpdatePodGroup(job *batch.Job) error {
// If PodGroup does not exist, create one for Job.
pg, err := cc.pgLister.PodGroups(job.Namespace).Get(job.Name)
if err != nil {
if !apierrors.IsNotFound(err) {
klog.V(3).Infof("Failed to get PodGroup for Job <%s/%s>: %v",
job.Namespace, job.Name, err)
return err
}
pg := &scheduling.PodGroup{
ObjectMeta: metav1.ObjectMeta{
Namespace: job.Namespace,
Name: job.Name,
Annotations: job.Annotations,
OwnerReferences: []metav1.OwnerReference{
*metav1.NewControllerRef(job, helpers.JobKind),
},
},
Spec: scheduling.PodGroupSpec{
MinMember: job.Spec.MinAvailable,
Queue: job.Spec.Queue,
MinResources: cc.calcPGMinResources(job),
PriorityClassName: job.Spec.PriorityClassName,
},
}
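// Create may race with a concurrent sync of the same job, so an
// IsAlreadyExists result is tolerated below to keep PodGroup creation idempotent.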
if _, err = cc.vcClient.SchedulingV1beta1().PodGroups(job.Namespace).Create(pg); err != nil {
if !apierrors.IsAlreadyExists(err) {
klog.V(3).Infof("Failed to create PodGroup for Job <%s/%s>: %v",
job.Namespace, job.Name, err)
return err
}
}
return nil
}
if pg.Spec.MinMember != job.Spec.MinAvailable {
pg.Spec.MinMember = job.Spec.MinAvailable
pg.Spec.MinResources = cc.calcPGMinResources(job)
if _, err = cc.vcClient.SchedulingV1beta1().PodGroups(job.Namespace).Update(pg); err != nil {
klog.V(3).Infof("Failed to update PodGroup for Job <%s/%s>: %v",
job.Namespace, job.Name, err)
return err
}
}
return nil
}
func (cc *Controller) deleteJobPod(jobName string, pod *v1.Pod) error {
err := cc.kubeClient.CoreV1().Pods(pod.Namespace).Delete(pod.Name, nil)
if err != nil && !apierrors.IsNotFound(err) {
klog.Errorf("Failed to delete pod %s/%s for Job %s, err %#v",
pod.Namespace, pod.Name, jobName, err)
return fmt.Errorf("failed to delete pod %s, err %#v", pod.Name, err)
}
return nil
}
func (cc *Controller) calcPGMinResources(job *batch.Job) *v1.ResourceList {
cc.Mutex.Lock()
defer cc.Mutex.Unlock()
// Sort tasks by the value of their priority classes.
var tasksPriority TasksPriority
for index := range job.Spec.Tasks {
tp := TaskPriority{0, job.Spec.Tasks[index]}
pc := job.Spec.Tasks[index].Template.Spec.PriorityClassName
if len(cc.priorityClasses) != 0 && cc.priorityClasses[pc] != nil {
tp.priority = cc.priorityClasses[pc].Value
}
tasksPriority = append(tasksPriority, tp)
}
sort.Sort(tasksPriority)
minAvailableTasksRes := v1.ResourceList{}
podCnt := int32(0)
for _, task := range tasksPriority {
for i := int32(0); i < task.Replicas; i++ {
if podCnt >= job.Spec.MinAvailable {
break
}
podCnt++
for _, c := range task.Template.Spec.Containers {
addResourceList(minAvailableTasksRes, c.Resources.Requests, c.Resources.Limits)
}
}
}
return &minAvailableTasksRes
}
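// Worked example (illustrative, assuming TasksPriority sorts higher-priority
// tasks first): with MinAvailable == 2 and two tasks, "master" (priority 100,
// 1 replica, cpu request 2) and "worker" (priority 10, 3 replicas, cpu
// request 1 each), master is visited first, so the PodGroup minimum becomes
// cpu = 2 + 1 = 3: the requests of the two highest-priority pods rather than
// of all four replicas.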
func (cc *Controller) initJobStatus(job *batch.Job) (*batch.Job, error) {
if job.Status.State.Phase != "" {
return job, nil
}
job.Status.State.Phase = batch.Pending
job.Status.State.LastTransitionTime = metav1.Now()
job.Status.MinAvailable = job.Spec.MinAvailable
newJob, err := cc.vcClient.BatchV1alpha1().Jobs(job.Namespace).UpdateStatus(job)
if err != nil {
klog.Errorf("Failed to update status of Job %v/%v: %v",
job.Namespace, job.Name, err)
return nil, err
}
if err := cc.cache.Update(newJob); err != nil {
klog.Errorf("CreateJob - Failed to update Job %v/%v in cache: %v",
newJob.Namespace, newJob.Name, err)
return nil, err
}
return newJob, nil
}
func (cc *Controller) updateJobStatus(job *batch.Job) (*batch.Job, error) {
newJob, err := cc.vcClient.BatchV1alpha1().Jobs(job.Namespace).UpdateStatus(job)
if err != nil {
klog.Errorf("Failed to update status of Job %v/%v: %v",
job.Namespace, job.Name, err)
return nil, err
}
if err := cc.cache.Update(newJob); err != nil {
klog.Errorf("CreateJob - Failed to update Job %v/%v in cache: %v",
newJob.Namespace, newJob.Name, err)
return nil, err
}
return newJob, nil
}
func classifyAndAddUpPodBaseOnPhase(pod *v1.Pod, pending, running, succeeded, failed, unknown *int32) {
switch pod.Status.Phase {
case v1.PodPending:
atomic.AddInt32(pending, 1)
case v1.PodRunning:
atomic.AddInt32(running, 1)
case v1.PodSucceeded:
atomic.AddInt32(succeeded, 1)
case v1.PodFailed:
atomic.AddInt32(failed, 1)
default:
atomic.AddInt32(unknown, 1)
}
}
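// exampleClassifyPods is an illustrative sketch, not part of the original
// controller, showing how classifyAndAddUpPodBaseOnPhase aggregates pod
// counts; the atomic adds make it safe to call from concurrent goroutines, as
// syncJob does during pod creation.
func exampleClassifyPods() {
	var pending, running, succeeded, failed, unknown int32
	pods := []*v1.Pod{
		{Status: v1.PodStatus{Phase: v1.PodRunning}},
		{Status: v1.PodStatus{Phase: v1.PodPending}},
		{Status: v1.PodStatus{Phase: v1.PodRunning}},
	}
	for _, pod := range pods {
		classifyAndAddUpPodBaseOnPhase(pod, &pending, &running, &succeeded, &failed, &unknown)
	}
	// Yields pending == 1, running == 2; the other counters stay 0.
	fmt.Printf("pending=%d running=%d\n", pending, running)
}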
func isInitiated(job *batch.Job) bool {
if job.Status.State.Phase == "" || job.Status.State.Phase == batch.Pending || job.Status.State.Phase == batch.Restarting {
return false
}
return true
}
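// Note: isInitiated treats the empty phase, Pending, and Restarting as not
// yet initiated, so syncJob reruns the full initiation path (plugins, PVCs,
// PodGroup) for those phases and only the lighter on-update path otherwise.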