Added suspend/resume.

Signed-off-by: Klaus Ma <mada3@huawei.com>
This commit is contained in:
Klaus Ma 2019-01-21 15:05:06 +08:00 committed by Da K. Ma
parent 2eb6c11fbe
commit 6fcbd5216c
10 changed files with 285 additions and 59 deletions

51
cmd/cli/job.go Normal file
View File

@ -0,0 +1,51 @@
package main
import (
"github.com/spf13/cobra"
"hpw.cloud/volcano/pkg/cli/job"
)
// buildJobCmd assembles the "job" command together with its "run", "list",
// "suspend" and "resume" sub-commands.
//
// Every sub-command has its flags registered by the matching job.Init*Flags
// helper and, when executed, hands the result of the corresponding job.*
// call to checkError.
func buildJobCmd() *cobra.Command {
	runCmd := &cobra.Command{
		Use: "run",
		Run: func(c *cobra.Command, _ []string) {
			checkError(c, job.RunJob())
		},
	}
	job.InitRunFlags(runCmd)

	listCmd := &cobra.Command{
		Use: "list",
		Run: func(c *cobra.Command, _ []string) {
			checkError(c, job.ListJobs())
		},
	}
	job.InitListFlags(listCmd)

	suspendCmd := &cobra.Command{
		Use: "suspend",
		Run: func(c *cobra.Command, _ []string) {
			checkError(c, job.SuspendJob())
		},
	}
	job.InitSuspendFlags(suspendCmd)

	resumeCmd := &cobra.Command{
		Use: "resume",
		Run: func(c *cobra.Command, _ []string) {
			checkError(c, job.ResumeJob())
		},
	}
	job.InitResumeFlags(resumeCmd)

	// Attach the sub-commands in the same order they were declared.
	root := &cobra.Command{Use: "job"}
	root.AddCommand(runCmd, listCmd, suspendCmd, resumeCmd)

	return root
}

View File

@ -24,8 +24,6 @@ import (
"github.com/spf13/pflag"
"k8s.io/apimachinery/pkg/util/wait"
"hpw.cloud/volcano/pkg/cli/job"
)
var logFlushFreq = pflag.Duration("log-flush-frequency", 5*time.Second, "Maximum number of seconds between log flushes")
@ -38,32 +36,10 @@ func main() {
defer glog.Flush()
rootCmd := cobra.Command{
Use: "vncli",
Use: "vkctl",
}
jobCmd := &cobra.Command{
Use: "job",
}
jobRunCmd := &cobra.Command{
Use: "run",
Run: func(cmd *cobra.Command, args []string) {
checkError(cmd, job.RunJob())
},
}
job.InitRunFlags(jobRunCmd)
jobCmd.AddCommand(jobRunCmd)
jobListCmd := &cobra.Command{
Use: "list",
Run: func(cmd *cobra.Command, args []string) {
checkError(cmd, job.ListJobs())
},
}
job.InitListFlags(jobListCmd)
jobCmd.AddCommand(jobListCmd)
rootCmd.AddCommand(jobCmd)
rootCmd.AddCommand(buildJobCmd())
if err := rootCmd.Execute(); err != nil {
fmt.Printf("Failed to execute command: %v", err)

View File

@ -56,16 +56,16 @@ func ListJobs() error {
return nil
}
fmt.Printf("%-30s%-25s%-12s%-8s%-12s%-12s%-12s%-12s\n",
"Name", "Creation", "Replicas", "Min", "Pending", "Running", "Succeeded", "Failed")
fmt.Printf("%-25s%-25s%-12s%-12s%-6s%-10s%-10s%-12s%-10s\n",
"Name", "Creation", "Phase", "Replicas", "Min", "Pending", "Running", "Succeeded", "Failed")
for _, job := range jobs.Items {
replicas := int32(0)
for _, ts := range job.Spec.Tasks {
replicas += ts.Replicas
}
fmt.Printf("%-30s%-25s%-12d%-8d%-12d%-12d%-12d%-12d\n",
job.Name, job.CreationTimestamp.Format("2006-01-02 15:04:05"), replicas,
fmt.Printf("%-25s%-25s%-12s%-12d%-6d%-10d%-10d%-12d%-10d\n",
job.Name, job.CreationTimestamp.Format("2006-01-02 15:04:05"), job.Status.State.Phase, replicas,
job.Status.MinAvailable, job.Status.Pending, job.Status.Running, job.Status.Succeeded, job.Status.Failed)
}

49
pkg/cli/job/resume.go Normal file
View File

@ -0,0 +1,49 @@
/*
Copyright 2018 The Vulcan Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package job
import (
"github.com/spf13/cobra"
"hpw.cloud/volcano/pkg/apis/batch/v1alpha1"
)
// resumeFlags holds the command-line options of the "resume" sub-command.
type resumeFlags struct {
	// commonFlags is embedded; ResumeJob reads Master and Kubeconfig
	// through it.
	commonFlags

	// Namespace is bound to the --namespace flag (default "default").
	Namespace string
	// JobName is bound to the --name / -n flag (default "").
	JobName string
}

// resumeJobFlags is the package-level instance the resume flags are bound to.
var resumeJobFlags = new(resumeFlags)
// InitResumeFlags registers the flags of the "resume" sub-command on cmd:
// the common flags (via initFlags) plus --namespace and --name/-n, all bound
// to resumeJobFlags.
func InitResumeFlags(cmd *cobra.Command) {
	initFlags(cmd, &resumeJobFlags.commonFlags)

	fs := cmd.Flags()
	fs.StringVarP(&resumeJobFlags.Namespace, "namespace", "", "default", "the namespace of job")
	fs.StringVarP(&resumeJobFlags.JobName, "name", "n", "", "the name of job")
}
func ResumeJob() error {
config, err := buildConfig(resumeJobFlags.Master, resumeJobFlags.Kubeconfig)
if err != nil {
return err
}
return createJobCommand(config,
resumeJobFlags.Namespace, resumeJobFlags.JobName,
v1alpha1.ResumeJobAction)
}

49
pkg/cli/job/suspend.go Normal file
View File

@ -0,0 +1,49 @@
/*
Copyright 2018 The Vulcan Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package job
import (
"github.com/spf13/cobra"
"hpw.cloud/volcano/pkg/apis/batch/v1alpha1"
)
// suspendFlags holds the command-line options of the "suspend" sub-command.
type suspendFlags struct {
	// commonFlags is embedded; SuspendJob reads Master and Kubeconfig
	// through it.
	commonFlags

	// Namespace is bound to the --namespace flag (default "default").
	Namespace string
	// JobName is bound to the --name / -n flag (default "").
	JobName string
}

// suspendJobFlags is the package-level instance the suspend flags are bound to.
var suspendJobFlags = new(suspendFlags)
// InitSuspendFlags registers the flags of the "suspend" sub-command on cmd:
// the common flags (via initFlags) plus --namespace and --name/-n, all bound
// to suspendJobFlags.
func InitSuspendFlags(cmd *cobra.Command) {
	initFlags(cmd, &suspendJobFlags.commonFlags)

	fs := cmd.Flags()
	fs.StringVarP(&suspendJobFlags.Namespace, "namespace", "", "default", "the namespace of job")
	fs.StringVarP(&suspendJobFlags.JobName, "name", "n", "", "the name of job")
}
func SuspendJob() error {
config, err := buildConfig(suspendJobFlags.Master, suspendJobFlags.Kubeconfig)
if err != nil {
return err
}
return createJobCommand(config,
suspendJobFlags.Namespace, suspendJobFlags.JobName,
v1alpha1.AbortJobAction)
}

View File

@ -20,10 +20,17 @@ import (
"os"
"strings"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/client-go/rest"
"k8s.io/client-go/tools/clientcmd"
vkbatchv1 "hpw.cloud/volcano/pkg/apis/batch/v1alpha1"
vkbusv1 "hpw.cloud/volcano/pkg/apis/bus/v1alpha1"
"hpw.cloud/volcano/pkg/apis/helpers"
"hpw.cloud/volcano/pkg/client/clientset/versioned"
)
func homeDir() string {
@ -61,3 +68,31 @@ func populateResourceListV1(spec string) (v1.ResourceList, error) {
}
return result, nil
}
// createJobCommand requests that action be performed on the Job ns/name by
// creating a bus Command object that targets it.
//
// The Job is fetched first so that the Command can carry a controller
// reference to it, both as its owner reference and as its TargetObject. The
// Command is created in ns with a generated name prefixed by
// "<job name>-<lower-cased action>-".
//
// An error is returned when name is empty, when a client cannot be built
// from config, when the Job cannot be fetched, or when the Command cannot be
// created.
func createJobCommand(config *rest.Config, ns, name string, action vkbatchv1.Action) error {
	// The --name flag defaults to "", so reject it up front with a clear
	// message instead of sending a request that cannot succeed.
	if name == "" {
		return fmt.Errorf("job name must be specified")
	}

	// Use NewForConfig rather than NewForConfigOrDie: this function already
	// reports failures through its error result, so an unusable config
	// should be returned to the caller instead of panicking.
	jobClient, err := versioned.NewForConfig(config)
	if err != nil {
		return err
	}

	job, err := jobClient.BatchV1alpha1().Jobs(ns).Get(name, metav1.GetOptions{})
	if err != nil {
		return err
	}

	ctrlRef := metav1.NewControllerRef(job, helpers.JobKind)
	cmd := &vkbusv1.Command{
		ObjectMeta: metav1.ObjectMeta{
			GenerateName: fmt.Sprintf("%s-%s-",
				job.Name, strings.ToLower(string(action))),
			Namespace: job.Namespace,
			OwnerReferences: []metav1.OwnerReference{
				*ctrlRef,
			},
		},
		TargetObject: ctrlRef,
		Action:       string(action),
	}

	_, err = jobClient.BusV1alpha1().Commands(ns).Create(cmd)
	return err
}

View File

@ -18,7 +18,6 @@ package job
import (
"fmt"
"github.com/golang/glog"
"k8s.io/api/core/v1"
@ -114,8 +113,8 @@ func NewJobController(config *rest.Config) *Controller {
case *v1corev1.Command:
return helpers.ControlledBy(t, helpers.JobKind)
case cache.DeletedFinalStateUnknown:
if pod, ok := t.Obj.(*v1corev1.Command); ok {
return helpers.ControlledBy(pod, helpers.JobKind)
if cmd, ok := t.Obj.(*v1corev1.Command); ok {
return helpers.ControlledBy(cmd, helpers.JobKind)
}
runtime.HandleError(fmt.Errorf("unable to convert object %T to *v1.Pod", obj))
return false
@ -237,6 +236,9 @@ func (cc *Controller) worker() {
action = applyPolicies(req.Event, job, pod)
}
glog.V(3).Infof("Execute <%v> on Job <%s/%s> in <%s> by <%T>.",
action, req.Namespace, req.JobName, job.Status.State.Phase, st)
if err := st.Execute(action, req.Reason, req.Message); err != nil {
glog.Errorf("Failed to handle Job <%s/%s>: %v",
job.Namespace, job.Name, err)

View File

@ -19,12 +19,14 @@ package job
import (
"fmt"
"sync"
"time"
"github.com/golang/glog"
"k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/wait"
kbv1 "github.com/kubernetes-sigs/kube-batch/pkg/apis/scheduling/v1alpha1"
@ -34,6 +36,9 @@ import (
)
func (cc *Controller) killJob(job *vkv1.Job, nextState state.NextStateFn) error {
glog.V(3).Infof("Killing Job <%s/%s>", job.Namespace, job.Name)
defer glog.V(3).Infof("Finished Job <%s/%s> killing", job.Namespace, job.Name)
job, err := cc.jobLister.Jobs(job.Namespace).Get(job.Name)
if err != nil {
if apierrors.IsNotFound(err) {
@ -93,6 +98,8 @@ func (cc *Controller) killJob(job *vkv1.Job, nextState state.NextStateFn) error
}
job.Status = vkv1.JobStatus{
State: job.Status.State,
Pending: pending,
Running: running,
Succeeded: succeeded,
@ -100,9 +107,11 @@ func (cc *Controller) killJob(job *vkv1.Job, nextState state.NextStateFn) error
Terminating: terminating,
MinAvailable: int32(job.Spec.MinAvailable),
}
if nextState != nil {
job.Status.State = nextState(job.Status)
}
newState := job.Status.State
// Update Job status
if _, err := cc.vkClients.BatchV1alpha1().Jobs(job.Namespace).Update(job); err != nil {
@ -111,18 +120,27 @@ func (cc *Controller) killJob(job *vkv1.Job, nextState state.NextStateFn) error
return err
}
if err := cc.waitForJobState(job, newState); err != nil {
glog.Errorf("Failed to sync Job's status.")
return err
}
// Delete PodGroup
if err := cc.kbClients.SchedulingV1alpha1().PodGroups(job.Namespace).Delete(job.Name, nil); err != nil {
glog.Errorf("Failed to delete PodGroup of Job %v/%v: %v",
job.Namespace, job.Name, err)
return err
if !apierrors.IsNotFound(err) {
glog.Errorf("Failed to delete PodGroup of Job %v/%v: %v",
job.Namespace, job.Name, err)
return err
}
}
// Delete Service
if err := cc.kubeClients.CoreV1().Services(job.Namespace).Delete(job.Name, nil); err != nil {
glog.Errorf("Failed to delete Service of Job %v/%v: %v",
job.Namespace, job.Name, err)
return err
if !apierrors.IsNotFound(err) {
glog.Errorf("Failed to delete Service of Job %v/%v: %v",
job.Namespace, job.Name, err)
return err
}
}
// NOTE(k82cn): DO NOT delete input/output until job is deleted.
@ -131,6 +149,9 @@ func (cc *Controller) killJob(job *vkv1.Job, nextState state.NextStateFn) error
}
func (cc *Controller) syncJob(job *vkv1.Job, nextState state.NextStateFn) error {
glog.V(3).Infof("Starting to sync up Job <%s/%s>", job.Namespace, job.Name)
defer glog.V(3).Infof("Finished Job <%s/%s> sync up", job.Namespace, job.Name)
job, err := cc.jobLister.Jobs(job.Namespace).Get(job.Name)
if err != nil {
if apierrors.IsNotFound(err) {
@ -275,6 +296,8 @@ func (cc *Controller) syncJob(job *vkv1.Job, nextState state.NextStateFn) error
}
job.Status = vkv1.JobStatus{
State: job.Status.State,
Pending: pending,
Running: running,
Succeeded: succeeded,
@ -286,6 +309,7 @@ func (cc *Controller) syncJob(job *vkv1.Job, nextState state.NextStateFn) error
if nextState != nil {
job.Status.State = nextState(job.Status)
}
newState := job.Status.State
if _, err := cc.vkClients.BatchV1alpha1().Jobs(job.Namespace).Update(job); err != nil {
glog.Errorf("Failed to update status of Job %v/%v: %v",
@ -293,6 +317,10 @@ func (cc *Controller) syncJob(job *vkv1.Job, nextState state.NextStateFn) error
return err
}
if err := cc.waitForJobState(job, newState); err != nil {
return err
}
return err
}
@ -431,3 +459,24 @@ func (cc *Controller) createPodGroupIfNotExist(job *vkv1.Job) error {
return nil
}
// waitForJobState blocks until the Job returned by cc.jobLister reports
// newState as its Status.State, polling every 100 microseconds for at most
// 10 seconds. An error from the lister stops the wait and is returned;
// otherwise the result of wait.Poll is returned as is.
//
// NOTE(review): a 100µs poll interval is unusually tight — confirm it is
// intentional and not meant to be milliseconds.
// TODO(k82cn): enhance by cache/assume
func (cc *Controller) waitForJobState(job *vkv1.Job, newState vkv1.JobState) error {
	// stateReached reports whether the lister's copy of the Job has
	// caught up with the expected state.
	stateReached := func() (bool, error) {
		cached, err := cc.jobLister.Jobs(job.Namespace).Get(job.Name)
		if err != nil {
			return false, err
		}
		return cached.Status.State == newState, nil
	}

	return wait.Poll(100*time.Microsecond, 10*time.Second, stateReached)
}

View File

@ -17,8 +17,6 @@ limitations under the License.
package job
import (
"reflect"
"github.com/golang/glog"
"k8s.io/api/core/v1"
@ -43,10 +41,21 @@ func (cc *Controller) addCommand(obj interface{}) {
Action: vkbatchv1.Action(cmd.Action),
}
glog.V(3).Infof("Try to execute command <%v> on Job <%s/%s>",
cmd.Action, req.Namespace, req.JobName)
if err := cc.eventQueue.Add(req); err != nil {
glog.Errorf("Failed to add request <%v> into queue: %v",
req, err)
}
go func() {
if err := cc.vkClients.BusV1alpha1().Commands(cmd.Namespace).Delete(cmd.Name, nil); err != nil {
glog.Errorf("Failed to delete Command <%s/%s> which maybe executed again.",
cmd.Namespace, cmd.Name)
}
}()
}
func (cc *Controller) addJob(obj interface{}) {
@ -76,25 +85,18 @@ func (cc *Controller) updateJob(oldObj, newObj interface{}) {
return
}
oldJob, ok := oldObj.(*vkbatchv1.Job)
if !ok {
glog.Errorf("oldObj is not Job")
return
req := &Request{
Namespace: newJob.Namespace,
JobName: newJob.Name,
Event: vkbatchv1.OutOfSyncEvent,
}
if !reflect.DeepEqual(oldJob.Spec, newJob.Spec) {
req := &Request{
Namespace: newJob.Namespace,
JobName: newJob.Name,
Event: vkbatchv1.OutOfSyncEvent,
}
if err := cc.eventQueue.Add(req); err != nil {
glog.Errorf("Failed to add request <%v> into queue: %v",
req, err)
}
if err := cc.eventQueue.Add(req); err != nil {
glog.Errorf("Failed to add request <%v> into queue: %v",
req, err)
}
}
func (cc *Controller) deleteJob(obj interface{}) {
@ -216,5 +218,4 @@ func (cc *Controller) deletePod(obj interface{}) {
}
}
// TODO(k82cn): add handler for PodGroup unschedulable event.
// TODO(k82cn): add handler for PodGroup unschedulable event.

View File

@ -33,6 +33,20 @@ func (ps *pendingState) Execute(action vkv1.Action, reason string, msg string) e
phase = vkv1.Restarting
}
return vkv1.JobState{
Phase: phase,
Reason: reason,
Message: msg,
}
})
case vkv1.AbortJobAction:
return KillJob(ps.job, func(status vkv1.JobStatus) vkv1.JobState {
phase := vkv1.Pending
if status.Terminating != 0 {
phase = vkv1.Aborting
}
return vkv1.JobState{
Phase: phase,
Reason: reason,