diff --git a/cmd/cli/job.go b/cmd/cli/job.go
index 13214d545..c577a3d7b 100644
--- a/cmd/cli/job.go
+++ b/cmd/cli/job.go
@@ -64,7 +64,7 @@ func buildJobCmd() *cobra.Command {
 	jobDelCmd := &cobra.Command{
 		Use: "delete",
-		Short: "delete a job ",
+		Short: "delete a job",
 		Run: func(cmd *cobra.Command, args []string) {
 			checkError(cmd, job.DeleteJob())
 		},
diff --git a/community-membership.md b/community-membership.md
index 11558f774..9b3b46050 100644
--- a/community-membership.md
+++ b/community-membership.md
@@ -18,7 +18,7 @@ This document gives a brief overview of the Volcano community roles with the req
 ## Member
 Members are active participants in the community who contribute by authoring PRs,
-reviewing issues/PRs or participate in community discussions on slack/mailing list.
+reviewing issues/PRs, or participating in community discussions on Slack/the mailing list.
 ### Requirements
@@ -45,7 +45,7 @@ reviewing issues/PRs or participate in community discussions on slack/mailing li
 ## Approver
 Approvers are active members who have good experience and knowledge of the domain.
-They have actively participated in the issue/PR reviews and have identified relevant issues during review.
+They have actively participated in issue/PR reviews and have identified relevant issues during review.
 ### Requirements
diff --git a/contribute.md b/contribute.md
index adffc5b01..e3e59ee18 100644
--- a/contribute.md
+++ b/contribute.md
@@ -136,7 +136,7 @@ There are multiple types of tests. The location of the test code varies with
 type, as do the specifics of the environment needed to successfully run the test:
 * Unit: These confirm that a particular function behaves as intended. Unit test source code can be found adjacent to the corresponding source code within a given package. These are easily run locally by any developer.
-* Integration: These tests cover interactions of package components or interactions between Volcano components and Kubernetes control plane components like API server.
+* Integration: These tests cover interactions of package components or interactions between Volcano components and Kubernetes control plane components like the API server.
 * End-to-end ("e2e"): These are broad tests of overall system behavior and coherence. The e2e tests are in [Volcano e2e](https://github.com/volcano-sh/volcano/tree/master/test/e2e).
 Continuous integration will run these tests on PRs.
diff --git a/docs/design/command-line-enhancement.md b/docs/design/command-line-enhancement.md
index 63bec29ee..d5cc8c8d3 100644
--- a/docs/design/command-line-enhancement.md
+++ b/docs/design/command-line-enhancement.md
@@ -70,12 +70,12 @@ The similar Slurm command lines are listed below:
 | `vcctl queue list ` | `vqueues` |
 #### `vsub` submit via file
-Command `vsub` can also submit a batch job via `.sh` file, like:
+Command `vsub` can also submit a batch job via a `.sh` file, like:
 ```shell
 [user@host]$ vsub test.sh
 Submitted batch job test
 ```
-The job file owns a format like:
+The job file has a format like:
 ```shell
 #!/bin/bash`
diff --git a/docs/design/dedicated-volume.md b/docs/design/dedicated-volume.md
index a51e3732c..789bb9ca7 100644
--- a/docs/design/dedicated-volume.md
+++ b/docs/design/dedicated-volume.md
@@ -26,8 +26,8 @@ Volume mount was supported from begin. But there are few limitations:
 2. We specify volumes by setting `TaskSpec.PodTemplateSpec.Volumes`, but similarly they are shared by pods within a task.
-But in real world, scenarios like DL/BigData, etc requires high performance. Shared storage has some performance issue,
-like io limit, read/write conflicts.
+But in the real world, scenarios like DL/BigData, etc. require high performance. Shared storage has some performance issues,
+like IO limits and read/write conflicts.
 Also some cloud vendors do not support mounting volumes to multiple nodes; this is to prevent data inconsistency.
@@ -68,8 +68,8 @@ type VolumeSpec struct {
 	// defined the PVC name
 	VolumeClaimName string `json:"volumeClaimName,omitempty" protobuf:"bytes,2,opt,name=volumeClaimName"`
-	// If `VolumeClaimName` is empty, then the job controller will generate a name with `{task_index}` suffixed for each task instance.
-	// Note: it can be set for task scoped only.
+	// If `VolumeClaimName` is empty, then the job controller will generate a name with `{task_index}` suffixed for each task instance.
+	// Note: it can be set at task scope only.
 	GenerateName string `json:"generateName,omitempty" protobuf:"bytes,4,opt,name=generateName"`
 	// VolumeClaim defines the PVC used by the VolumeMount.
@@ -79,9 +79,9 @@ type VolumeSpec struct {
 - By default, this is empty. The task instance will use volumes defined in `JobSpec.Volumes` and `TaskSpec.Template`.
-- If `Volumes` are specified, these pvcs are referenced by all the pods of the task.
- If the the VolumeSpec specifies the `GenerateName` while the `VolumeClaimName` left empty, the pvc name is generated with task index suffixed by job controller.
- Otherwise, the explicitly declared pvc will be shared by all pods of a task.
+- If `Volumes` are specified, these pvcs are referenced by all the pods of the task.
+ If the VolumeSpec specifies the `GenerateName` while the `VolumeClaimName` is left empty, the pvc name is generated with the task index suffixed by the job controller.
+ Otherwise, the explicitly declared pvc will be shared by all pods of a task.
 - If the pvcs do not exist, the job controller will create them.
diff --git a/docs/design/delay-pod-creation.md b/docs/design/delay-pod-creation.md
index 89e6470be..60d0357f9 100644
--- a/docs/design/delay-pod-creation.md
+++ b/docs/design/delay-pod-creation.md
@@ -44,7 +44,7 @@ After `InQueue`, the state transform map is updated as follow.
 The `InQueue` is a new state between `Pending` and `Running`; and it'll let operators/controllers
 start to create pods. If it meets errors, e.g. unschedulable, it rolls back to `Pending` instead of `InQueue` to
-avoid retry-loop.
+avoid a retry-loop.
 ### Action
diff --git a/docs/design/drf.md b/docs/design/drf.md
index 010d0a61c..f63aae0f4 100644
--- a/docs/design/drf.md
+++ b/docs/design/drf.md
@@ -15,7 +15,7 @@ This share value is used for job ordering and task premption.
 #### 1. Job Ordering:
 The job having the lowest share will have higher priority.
- In the example below all the tasks task1, task2 of job1 and task3 and task4 of job2 is already allocated to the cluster.
+ In the example below all the tasks task1, task2 of job1 and task3 and task4 of job2 are already allocated to the cluster.
 ![drfjobordering](./images/drfjobordering.png)
@@ -23,9 +23,9 @@ This share value is used for job ordering and task premption.
 Gang scheduling sorts the job based on whether the job has atleast **minAvailable** task already (allocated + successfully completed + pipelined) or not. Jobs which has not met the minAvailable criteria has higher priority than jobs which has met the minAvailable criteria.
-
+
 Jobs which have met the minAvailable criteria will be sorted according to DRF.
-
+
 ![gangwithdrf](./images/gangwithdrf.png)
 #### 2. Task Preemption:
diff --git a/docs/design/execution-flow.md b/docs/design/execution-flow.md
index 09ed1b030..7cee6f140 100644
--- a/docs/design/execution-flow.md
+++ b/docs/design/execution-flow.md
@@ -19,7 +19,7 @@ The Allocation of the workloads to the node in scheduler happens in each session
 1. If List is empty then continue to Step 4
 2. If Yes then Pop a Job from the JobsList
 1. If Job exists in the Local PendingTasks
- 1. If Not then :
+ 1. If not, then:
 1. Create a Local Task List
 1. Get the List of Each Tasks in the pending state for that job
 1. If the required resource for the job is Empty then go back to previous step
@@ -38,10 +38,10 @@ The Allocation of the workloads to the node in scheduler happens in each session
 1. If yes then push the Job
 2. If No then add the Queue back to the list.
 3. Continue till all the Job is ready
- 2. Continue till each Queue is processed.
-
-
-
+ 2. Continue till each Queue is processed.
+
+
+
diff --git a/docs/design/fairshare.md b/docs/design/fairshare.md
index 038b60b5b..8c0c0f2e7 100644
--- a/docs/design/fairshare.md
+++ b/docs/design/fairshare.md
@@ -78,7 +78,7 @@ type ClusterInfo struct {
 The behavior of `allocate` action is scheduling job in `Queue` one by one.
-At the beginning of scheduling loop, it will take a job with highest priority from `Queue`. And try to schedule tasks that belong to it until job is ready (matches the minMember) then go to next round.
+At the beginning of the scheduling loop, it will take the job with the highest priority from `Queue` and try to schedule tasks that belong to it until the job is ready (matches the minMember), then go to the next round.
 The priority of job mentioned above is defined by `JobOrder` functions registered by plugins. Such as job ready order of Gang plugin, priority order of Priority plugin, and also the share order of DRF plugin.
@@ -89,7 +89,7 @@ Namespace weight `should not` implement with JobOrder func. Because the scheduli
 > e.g.
 >
 > ns1 has job1, job2, ns2 has job3, job4. The original order is job1-job2-job3-job4.
->
+>
 > After the scheduling of job1, right order should be job3-job4-job2. But in priority queue, we have no chance to fix the priority for job2
 #### Namespace Order
diff --git a/docs/design/images/Job-scale-up-down.PNG b/docs/design/images/Job-scale-up-down.PNG
index caeb7b30d..d1c63fc77 100644
Binary files a/docs/design/images/Job-scale-up-down.PNG and b/docs/design/images/Job-scale-up-down.PNG differ
diff --git a/docs/design/job-api.md b/docs/design/job-api.md
index dabc045ad..7f3d2372a 100644
--- a/docs/design/job-api.md
+++ b/docs/design/job-api.md
@@ -12,7 +12,7 @@
 * Define the API of Job
 * Define the behaviour of Job
-* Clarify the interaction with other features
+* Clarify the interaction with other features
 ### Out of Scope
@@ -22,14 +22,14 @@
 ## Function Detail
 The definition of `Job` follows Kubernetes's style, e.g. Status, Spec; the following sections will only describe
-the major functions of `Job`, refer to [Appendix](#appendix) section for the whole definition of `Job`.
+the major functions of `Job`; refer to the [Appendix](#appendix) section for the whole definition of `Job`.
 ### Multiple Pod Template
 As most jobs of high performance workload include different types of tasks, e.g. TensorFlow (ps/worker), Spark (driver/executor); `Job` introduces `taskSpecs` to support multiple pod templates, defined as follows. The `Policies` will be described in the [Error Handling](#error-handling) section.
- + ```go // JobSpec describes how the job execution will look like and when it will actually run type JobSpec struct { @@ -59,8 +59,8 @@ type TaskSpec struct { ``` `JobController` will create Pods based on the templates and replicas in `spec.tasks`; -the controlled `OwnerReference` of Pod will be set to the `Job`. The following is -an example YAML with multiple pod template. +the controlled `OwnerReference` of Pod will be set to the `Job`. The following is +an example YAML with multiple pod template. ```yaml apiVersion: batch.volcano.sh/v1alpha1 @@ -79,7 +79,7 @@ spec: - name: "worker" replicas: 5 template: - spec: + spec: containers: - name: worker image: worker-img @@ -117,7 +117,7 @@ The `Volumes` of Job can be `nil` which means user will manage data themselves. ### Conditions and Phases The following phases are introduced to give a simple, high-level summary of where the Job is in its lifecycle; and the conditions array, -the reason and message field contain more detail about the job's status. +the reason and message field contain more detail about the job's status. ```go type JobPhase string @@ -166,7 +166,7 @@ type JobStatus struct { ``` The following table shows available transactions between different phases. The phase can not transfer to the target -phase if the cell is empty. +phase if the cell is empty. | From \ To | Pending | Aborted | Running | Completed | Terminated | | ------------- | ------- | ------- | ------- | --------- | ---------- | @@ -174,9 +174,9 @@ phase if the cell is empty. | Aborted | * | * | | | | | Running | | * | * | * | * | | Completed | | | | * | | -| Terminated | | | | | * | +| Terminated | | | | | * | -`Restarting`, `Aborting` and `Terminating` are temporary states to avoid race condition, e.g. there'll be several +`Restarting`, `Aborting` and `Terminating` are temporary states to avoid race condition, e.g. there'll be several `PodeEvictedEvent`s because of `TerminateJobAction` which should not be handled again. ### Error Handling @@ -209,7 +209,7 @@ const ( TaskCompletedEvent Event = "TaskCompleted" ) -// Action is the type of event handling +// Action is the type of event handling type Action string const ( @@ -238,13 +238,13 @@ type LifecyclePolicy struct { } ``` -Both `JobSpec` and `TaskSpec` include lifecycle policy: the policies in `JobSpec` are the default policy if no policies -in `TaskSpec`; the policies in `TaskSpec` will overwrite defaults. +Both `JobSpec` and `TaskSpec` include lifecycle policy: the policies in `JobSpec` are the default policy if no policies +in `TaskSpec`; the policies in `TaskSpec` will overwrite defaults. ```go // JobSpec describes how the job execution will look like and when it will actually run type JobSpec struct { - ... + ... // Specifies the default lifecycle of tasks // +optional @@ -268,7 +268,7 @@ type TaskSpec struct { The following examples demonstrate the usage of `LifecyclePolicy` for job and task. For the training job of machine learning framework, the whole job should be restarted if any task was failed or evicted. -To simplify the configuration, a job level `LifecyclePolicy` is set as follows. As no `LifecyclePolicy` is set for any +To simplify the configuration, a job level `LifecyclePolicy` is set as follows. As no `LifecyclePolicy` is set for any task, all tasks will use the policies in `spec.policies`. 
```yaml @@ -291,8 +291,8 @@ spec: image: ps-img - name: "worker" replicas: 5 - template: - spec: + template: + spec: containers: - name: worker image: worker-img @@ -300,8 +300,8 @@ spec: ``` Some BigData framework (e.g. Spark) may have different requirements. Take Spark as example, the whole job will be restarted -if 'driver' tasks failed and only restart the task if 'executor' tasks failed. `OnFailure` restartPolicy is set for executor -and `RestartJob` is set for driver `spec.tasks.policies` as follow. +if 'driver' tasks failed and only restart the task if 'executor' tasks failed. `OnFailure` restartPolicy is set for executor +and `RestartJob` is set for driver `spec.tasks.policies` as follow. ```yaml apiVersion: batch.volcano.sh/v1alpha1 @@ -322,8 +322,8 @@ spec: image: driver-img - name: "executor" replicas: 5 - template: - spec: + template: + spec: containers: - name: executor image: executor-img @@ -334,19 +334,19 @@ spec: ### Admission Controller -The following validations must be included to make sure expected behaviours: +The following validations must be included to make sure expected behaviours: * `spec.minAvailable` <= sum(`spec.taskSpecs.replicas`) * no duplicated name in `spec.taskSpecs` array * no duplicated event handler in `LifecyclePolicy` array, both job policies and task policies - + ### CoScheduling CoScheduling (or Gang-scheduling) is required by most of high performance workload, e.g. TF training job, MPI job. The `spec.minAvailable` is used to identify how many pods will be scheduled together. The default value of `spec.minAvailable` is summary of `spec.tasks.replicas`. The admission controller web hook will check `spec.minAvailable` against the summary of `spec.tasks.replicas`; the job creation will be rejected if `spec.minAvailable` > sum(`spec.tasks.replicas`). -If `spec.minAvailable` < sum(`spec.tasks.replicas`), the pod of `spec.tasks` will be created randomly; +If `spec.minAvailable` < sum(`spec.tasks.replicas`), the pod of `spec.tasks` will be created randomly; refer to [Task Priority with Job](#task-priority-within-job) section on how to create tasks in order. ```yaml @@ -368,7 +368,7 @@ spec: - name: "worker" replicas: 5 template: - spec: + spec: containers: - name: "worker" image: "worker-img" @@ -379,7 +379,7 @@ spec: In addition to multiple pod template, the priority of each task maybe different. `PriorityClass` of `PodTemplate` is reused to define the priority of task within a job. This's an example to run spark job: 1 driver with 5 executors, the driver's priority is `master-pri` which is higher than normal pods; as `spec.minAvailable` is 3, the scheduler will make sure one driver -with 2 executors will be scheduled if not enough resources. +with 2 executors will be scheduled if not enough resources. ```yaml apiVersion: batch.volcano.sh/v1alpha1 @@ -400,7 +400,7 @@ spec: - name: "executor" replicas: 5 template: - spec: + spec: containers: - name: executor image: executor-img @@ -434,7 +434,7 @@ spec: - name: "executor" replicas: 5 template: - spec: + spec: containers: - name: executor image: executor-img @@ -442,7 +442,7 @@ spec: ### Plugins for Job -As many jobs of AI frame, e.g. TensorFlow, MPI, Mxnet, need set env, pods communicate, ssh sign in without password. +As many jobs of AI frame, e.g. TensorFlow, MPI, Mxnet, need set env, pods communicate, ssh sign in without password. We provide Job api plugins to give users a better focus on core business. Now we have three plugins, every plugin has parameters, if not provided, we use default. 
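As a sketch of the shape this configuration takes in code, assuming the three plugins are the `ssh`, `svc` and `env` plugins (the `svc` plugin is referenced again in the scale up/down design below), the arguments map mirrors the `Plugins map[string][]string` field of `JobSpec` shown in the appendix; an empty argument list stands for the defaults mentioned above:

```go
package main

import "fmt"

func main() {
	// Keyed by plugin name, valued by the plugin's arguments,
	// mirroring `Plugins map[string][]string` in JobSpec.
	// The three plugin names here are assumptions for illustration.
	plugins := map[string][]string{
		"ssh": {}, // passwordless ssh sign-in between the job's pods
		"svc": {}, // hosts/service information so pods can communicate
		"env": {}, // per-task environment variables
	}
	for name, args := range plugins {
		fmt.Printf("plugin %q enabled, args=%v (empty means defaults)\n", name, args)
	}
}
```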
@@ -475,7 +475,7 @@ spec: name: mpimaster - replicas: 2 name: mpiworker - template: + template: spec: containers: image: mpi-image @@ -524,10 +524,10 @@ type JobSpec struct { // Key is plugin name, value is the arguments of the plugin // +optional Plugins map[string][]string `json:"plugins,omitempty" protobuf:"bytes,6,opt,name=plugins"` - + //Specifies the queue that will be used in the scheduler, "default" queue is used this leaves empty. Queue string `json:"queue,omitempty" protobuf:"bytes,7,opt,name=queue"` - + // Specifies the maximum number of retries before marking this Job failed. // Defaults to 3. // +optional @@ -540,7 +540,7 @@ type VolumeSpec struct { // defined the PVC name VolumeClaimName string `json:"volumeClaimName,omitempty" protobuf:"bytes,2,opt,name=volumeClaimName"` - + // VolumeClaim defines the PVC used by the VolumeMount. VolumeClaim *v1.PersistentVolumeClaimSpec `json:"volumeClaim,omitempty" protobuf:"bytes,3,opt,name=volumeClaim"` } @@ -565,7 +565,7 @@ const ( // CommandIssuedEvent is triggered if a command is raised by user CommandIssuedEvent Event = "CommandIssued" // TaskCompletedEvent is triggered if the 'Replicas' amount of pods in one task are succeed - TaskCompletedEvent Event = "TaskCompleted" + TaskCompletedEvent Event = "TaskCompleted" ) // Action is the action that Job controller will take according to the event. @@ -581,7 +581,7 @@ const ( // and can not be resumed: all Pod of Job will be evicted, and no Pod will be recreated. TerminateJobAction Action = "TerminateJob" // CompleteJobAction if this action is set, the unfinished pods will be killed, job completed. - CompleteJobAction Action = "CompleteJob" + CompleteJobAction Action = "CompleteJob" // ResumeJobAction is the action to resume an aborted job. ResumeJobAction Action = "ResumeJob" @@ -689,7 +689,7 @@ type JobStatus struct { // The minimal available pods to run for this Job // +optional MinAvailable int32 `json:"minAvailable,omitempty" protobuf:"bytes,6,opt,name=minAvailable"` - + // The number of pods which reached phase Terminating. // +optional Terminating int32 `json:"terminating,omitempty" protobuf:"bytes,7,opt,name=terminating"` diff --git a/docs/design/job-scale-up-down.md b/docs/design/job-scale-up-down.md index bb403022c..b04b6292c 100644 --- a/docs/design/job-scale-up-down.md +++ b/docs/design/job-scale-up-down.md @@ -5,7 +5,7 @@ ## Motivation Currently, Volcano does not support Job update. It is not allowed to update the `Job.Spec` on the fly. -However, many users show appeal to run ML training jobs in a elastic manner. For example ModelArts want to dynamically adjust Job's replicas according to the cluster idle capacity +However, many users show appeal to run ML training jobs in a elastic manner. For example ModelArts want to dynamically adjust Job's replicas according to the cluster idle capacity in order to achieve most high efficiency on GPU card. I propose to support volcano job dynamical scale up/down before more intelligent elasticity in the first step. @@ -43,7 +43,7 @@ The differences are: 3. delete pods when scale down -However, only when the job is not started, the initialization is run. +However, only when the job is not started, the initialization is run. So we need a way to know whether it is a scale up/down event that triggered this round of sync. The way I propose is to add a new event `JobUpdatedEvent` to indicate that the job is updated(here only cares about the scale up/down). 
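A minimal sketch of the proposed event-to-action wiring, assuming illustrative `Event`/`Action` string types rather than the exact controller types: the new `JobUpdatedEvent` maps to the new `UpdateJobAction`, which is what lets a scale up/down round of sync be told apart from first-time initialization.

```go
package main

import "fmt"

type Event string
type Action string

const (
	// Proposed in this design: raised when Job.Spec replicas change.
	JobUpdatedEvent Event = "JobUpdated"
	// Proposed in this design: runs the UpdateJob function.
	UpdateJobAction Action = "UpdateJob"
)

// handlers maps the update event to its dedicated action, so the
// controller can skip the initialization path on scale up/down.
var handlers = map[Event]Action{
	JobUpdatedEvent: UpdateJobAction,
}

func main() {
	if act, ok := handlers[JobUpdatedEvent]; ok {
		fmt.Printf("event %q triggers action %q\n", string(JobUpdatedEvent), string(act))
	}
}
```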
@@ -51,14 +51,14 @@ And accordingly add a new action `UpdateJobAction` to run `UpdateJob` function.
 ![workflow](images/Job-scale-up-down.PNG)
 To scale up/down on the fly, Volcano should be responsible for notifying the original pods of the current status, including the hosts of all the pods.
-This is done by plugins, so to distinguish from the initialization phase, a new `OnJobUpdate` is introduced.
+This is done by plugins, so to distinguish it from the initialization phase, a new `OnJobUpdate` is introduced.
 It is to reconcile all the associated configs of the job. Currently, the `svc` plugin should update the configmap of all the hosts.
 **NOTE**:
 1. Users should watch the `/etc/volcano` to get the up-to-date hosts files if they want to be aware of the training workers.
-2. The env `VC_{task name}_HOSTS` `VC_{task name}_NUM` of the existing pods can not be mutated on the fly, so be careful not to use it.
+2. The envs `VC_{task name}_HOSTS` and `VC_{task name}_NUM` of the existing pods cannot be mutated on the fly, so be careful not to rely on them.
 ```
 type PluginInterface interface {
diff --git a/docs/design/metrics.md b/docs/design/metrics.md
index ba2e44af2..84b1977ec 100644
--- a/docs/design/metrics.md
+++ b/docs/design/metrics.md
@@ -6,9 +6,9 @@ Currently users can leverage controller logs and job events to monitor scheduler
 This document describes metrics we want to add into kube-batch to better monitor performance.
 ## Metrics
-In order to support metrics, kube-batch needs to expose a metrics endpoint which can provide golang process metrics like number of goroutines, gc duration, cpu and memory usage, etc as well as kube-batch custom metrics related to time taken by plugins or actions.
+In order to support metrics, kube-batch needs to expose a metrics endpoint which can provide golang process metrics like the number of goroutines, gc duration, cpu and memory usage, etc., as well as kube-batch custom metrics related to the time taken by plugins or actions.
-All the metrics are prefixed with `kube_batch_`.
+All the metrics are prefixed with `kube_batch_`.
 ### kube-batch execution
 These metrics track the execution of plugins and actions in the kube-batch loop.
diff --git a/docs/design/node-priority.md b/docs/design/node-priority.md
index 3e16ffa55..033c0ac43 100644
--- a/docs/design/node-priority.md
+++ b/docs/design/node-priority.md
@@ -1,9 +1,9 @@
 ## Node Priority in Kube-Batch
-This feature allows `kube-batch` to schedule workloads based on the priority of the Nodes, Workloads will be scheduled on Nodes with higher priority and these priorities will be calculated based on different parameters like `ImageLocality`, `Most/Least Requested Nodes`...etc.
+This feature allows `kube-batch` to schedule workloads based on the priority of the Nodes. Workloads will be scheduled on Nodes with higher priority, and these priorities will be calculated based on different parameters like `ImageLocality`, `Most/Least Requested Nodes`, etc.
 A basic flow for the Node priority functions is depicted below.
-![Node Priority Flow](./images/Node-Priority.png)
+![Node Priority Flow](./images/Node-Priority.png)
 Currently in kube-batch a `Session` is opened every 1 sec, and the workloads in the Queue go through `Predicate` to find a suitable set of Nodes where workloads can be scheduled; after that they go through the `Allocate` function to assign the Pods to the Nodes and then go to `Preempt` if applicable.
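A simplified sketch of that scoring flow, with an assumed `PriorityFn` signature rather than kube-batch's real plugin API: every priority function scores each candidate node, the scores are aggregated, and the highest-scoring node is the one handed on to allocation.

```go
package main

import "fmt"

// PriorityFn is an illustrative stand-in for a plugin's node-scoring function.
type PriorityFn func(node string) float64

// bestNode aggregates the scores returned by every PriorityFn and
// returns the node with the highest total, mirroring the flow above.
func bestNode(nodes []string, fns []PriorityFn) string {
	best, bestScore := "", -1.0
	for _, n := range nodes {
		total := 0.0
		for _, fn := range fns {
			total += fn(n) // aggregate the scoring across all PriorityFn
		}
		if total > bestScore {
			best, bestScore = n, total
		}
	}
	return best
}

func main() {
	// A toy rule standing in for ImageLocality / Most-Least Requested, etc.
	fns := []PriorityFn{
		func(n string) float64 { return float64(len(n)) },
	}
	fmt.Println(bestNode([]string{"node-a", "node-b-1"}, fns)) // node-b-1
}
```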
@@ -13,6 +13,6 @@ Node Priority can be introduced in the current flow for `Allocate` and `Preempt`
 - Score the Node based on whether the `Priority Rule` satisfies the Workload scheduling criteria.
 - Once the scores are returned from all the `PriorityFn` then aggregate the scoring and identify the Node with highest scoring.
 - Delegate this selected Node in last step to `AllocateFn` to Bind the workload to the Node.
-
+
 Currently there are multiple `PriorityFn` available with default Scheduler of Kubernetes. Going forward with each release we will implement all the priority functions in kube-batch based on their importance to batch scheduling.
-
+
diff --git a/docs/design/podgroup-status.md b/docs/design/podgroup-status.md
index 595c603d6..2921f5712 100644
--- a/docs/design/podgroup-status.md
+++ b/docs/design/podgroup-status.md
@@ -109,7 +109,7 @@ type PodGroupStatus struct {
 ```
 According to the PodGroup's lifecycle, the following phase/state transactions are reasonable. And related
-reasons will be appended to `Reason` field.
+reasons will be appended to the `Reason` field.
 | From | To | Reason |
 |---------|---------------|---------|
diff --git a/docs/design/preempt-action.md b/docs/design/preempt-action.md
index 28b835813..90a8bbb55 100644
--- a/docs/design/preempt-action.md
+++ b/docs/design/preempt-action.md
@@ -2,7 +2,7 @@
 ## Introduction
-In scheduler there are 4 actions such as `allocate`, `preempt`, `reclaim`, `backfill` and with the help of
+In the scheduler there are 4 actions, `allocate`, `preempt`, `reclaim` and `backfill`, working with the help of
 plugins like `conformance`, `drf`, `gang`, `nodeorder` and more plugins. All these plugins provide behavioural characteristics that shape how the scheduler makes scheduling decisions.
@@ -12,16 +12,16 @@ As discussed in Introduction, preempt is one of the actions in kube-batch schedu
 when a high priority task comes and the resource requested by that task is not available in the cluster, then a few of the tasks should be evicted so that the new task will get resources to run.
-In preempt action, multiple plugin function are getting used like
+In the preempt action, multiple plugin functions are used, like
-1. TaskOrderFn(Plugin: Priority),
-2. JobOrderFn(Plugin: Priority, DRF, Gang),
-3. NodeOrderFn(Plugin: NodeOrder),
-4. PredicateFn(Plugin: Predicates),
+1. TaskOrderFn(Plugin: Priority),
+2. JobOrderFn(Plugin: Priority, DRF, Gang),
+3. NodeOrderFn(Plugin: NodeOrder),
+4. PredicateFn(Plugin: Predicates),
 5. PreemptableFn(Plugin: Conformance, Gang, DRF).
 ### 1. TaskOrderFn:
-#### Priority:
+#### Priority:
 Compares the taskPriority set in PodSpec and returns the decision of comparison between two priorities.
 ### 2. JobOrderFn:
diff --git a/docs/design/queue/queue-state-management.md b/docs/design/queue/queue-state-management.md
index 8a708427e..d616ebaa6 100644
--- a/docs/design/queue/queue-state-management.md
+++ b/docs/design/queue/queue-state-management.md
@@ -18,38 +18,38 @@
 ## Motivation
-The queue is an object of resource management in the cluster and the cornerstone of resource scheduling, which is
-closely related to the allocation of resources and the scheduling of tasks. The resources under the cluster are
-allocated according to the `weight` ratio of the queue. The configuration of queue guarantees the number of cluster
-resources that tasks can use under the queue and limits the maximum resources that can be used. A single user or
-user group is correspond to one or more queues, which is assigned and determined by the administrator. When queues
-splitting cluster resources, single queue obtains the resource guarantees and quotas for using resources, so that uses
-or user groups under the queue have opportunity to use cluster resources, Simultaneously due to the resource limitation
-of queue, the ability of users or user groups to user cluster resources is limited to prevent cluster from being
-overwhelmed by a single user to deliver a large number or tasks, thereby ensuring the `multi-tenancy` feature of
-scheduling. When task is delivered, it will be placed to a specific queue and pod scheduling will by affected by queue
-priority and queue resource status. It is worth mentioning that the resource allocation of queue and limitation of
-queue resource can be dynamically adjusted. The queue can flexibly acquire remaining resources under cluster if there
-are idle resources, when a queue is busy, and there are idle resources under the cluster, the queue may break the
+The queue is an object of resource management in the cluster and the cornerstone of resource scheduling, which is
+closely related to the allocation of resources and the scheduling of tasks. The resources under the cluster are
+allocated according to the `weight` ratio of the queue. The configuration of a queue guarantees the number of cluster
+resources that tasks can use under the queue and limits the maximum resources that can be used. A single user or
+user group corresponds to one or more queues, which are assigned and determined by the administrator. When queues
+split cluster resources, a single queue obtains resource guarantees and quotas for using resources, so that users
+or user groups under the queue have the opportunity to use cluster resources. Simultaneously, due to the resource limitation
+of the queue, the ability of users or user groups to use cluster resources is limited, preventing the cluster from being
+overwhelmed by a single user delivering a large number of tasks, thereby ensuring the `multi-tenancy` feature of
+scheduling. When a task is delivered, it will be placed into a specific queue, and pod scheduling will be affected by queue
+priority and queue resource status. It is worth mentioning that the resource allocation of a queue and the limitation of
+queue resources can be dynamically adjusted. When a queue is busy and there
+are idle resources under the cluster, the queue may break the
 original resource limit and try to occupy the remaining cluster resources.
-Based on the above description, it can be found that queue is a crucial object in the process of resource scheduling.
-There should have a complete guarantee mechanism to ensure the stability of queue without losing the flexibility of
-queue. Firstly, the queue should not be deleted arbitrarily, since if the queue is deleted, the unscheduled tasks in
-the queue will not be scheduled normally and the resources occupied by running tasks in the queue will not be normally
-counted. However, considering the flexibility of resource control, queue should not be forbidden to delete. In addition,
-considering the decisive role of queue in resource management, the administrator will control which user or user group
-can use cluster resources by controlling queue which also requires queue to provide corresponding capabilities.
+Based on the above description, it can be found that the queue is a crucial object in the process of resource scheduling.
+There should be a complete guarantee mechanism to ensure the stability of the queue without losing the flexibility of
+the queue. Firstly, the queue should not be deleted arbitrarily, since if the queue is deleted, the unscheduled tasks in
+the queue will not be scheduled normally and the resources occupied by running tasks in the queue will not be normally
+counted. However, considering the flexibility of resource control, the queue should not be forbidden from deletion. In addition,
+considering the decisive role of the queue in resource management, the administrator will control which user or user group
+can use cluster resources by controlling the queue, which also requires the queue to provide corresponding capabilities.
-Therefore, we need to provide `State Management` capabilities for queue. Add the state configuration for queue and
-adjust capabilities of queue by judging the state of queue, thereby achieving the management of queue lifecycle and
+Therefore, we need to provide `State Management` capabilities for the queue. Add the state configuration for the queue and
+adjust the capabilities of the queue by judging its state, thereby achieving the management of the queue lifecycle and
 scheduling of tasks under the queue.
 ## Function Detail
 ### Data Structure
-Add `state` to `properties` in `spec` of CRD `queues.scheduling.sigs.dev`. The `state` of queue controller the status
+Add `state` to `properties` in `spec` of CRD `queues.scheduling.sigs.dev`. The `state` of queue controls the status
 of queue.
 ```go
 ...
 spec:
 ...
 ```
-Add `state` to `properties` in `status` of CRD `queues.scheduling.sigs.dev`. The `state` of queue display the status of
+Add `state` to `properties` in `status` of CRD `queues.scheduling.sigs.dev`. The `state` of queue displays the status of
 current queue.
 ```go
 ...
 status:
 ...
 ```
 ### Queue State
 Valid queue state includes:
-
+
 * `Open`, indicates that the queue is available, the queue receives new task delivery
 * `Closed`, indicates that the queue is unavailable, the queue will wait for the subordinate tasks to gracefully exit,
-which does not mean that the system will actively delete tasks under the queue. However, the queue does not receive new
+which does not mean that the system will actively delete tasks under the queue. However, the queue does not receive new
 task delivery
 * `Closing` is an intermediate state between `Open` and `Closed`. When the state of the queue is `Open` and there
-are tasks running or waiting to be scheduled under the queue. At this time, we try to change the state of queue to
-`Closed`. The state of queue will changes to `Closing` firstly and then changes to `Closed` when all the tasks under
+are tasks running or waiting to be scheduled under the queue, and we try to change the state of the queue to
+`Closed`, the state of the queue will change to `Closing` first and then change to `Closed` when all the tasks under
 the queue exit.
-
+
 The ability of queue corresponding to queue state is shown in the following table:
 | state | default | can be set | receive delivery | can be deleted | can be scheduled | deserved resources |
-| :-------: | :-----: | :--------: | :--------------: | :------------: |:---------------: | :----------------: |
 | `Open` | Y | Y | Y | N | Y | Normal |
 | `Closed` | N | Y | N | Y | Y | Normal |
 | `Closing` | N | N | N | N | Y | Normal |
@@ -107,8 +107,8 @@ with `Closed` or `Closing` state
 In the lifecycle management of queue, we need to guarantee the following three points:
-* When creating a new queue, if the user does not specify a state for queue, we need to specify default `Open` state
-for it, If the user specifies a state for queue, the specified state must be a valid value, valid values are `Open`
+* When creating a new queue, if the user does not specify a state for the queue, we need to specify the default `Open` state
+for it. If the user specifies a state for the queue, the specified state must be a valid value; valid values are `Open`
 and `Closed`.
 * When upgrading the queue, if the state of the queue changed, the specified state value must be valid.
 * When deleting the queue, only a queue with `Closed` status can be deleted successfully. The `status` here is the `state`
@@ -144,7 +144,7 @@ webhooks:
 - queues
 ```
-Add implementation function `AdmitQueues`
+Add the implementation function `AdmitQueues`
 ```go
 func AdmitQueues(ar v1beta1.AdmissionReview) *v1beta1.AdmissionResponse {
@@ -161,7 +161,7 @@ The above function will complete the following verification:
 * During creating or upgrading queue, verify the validity of the queue state
 * During deleting queue, check if queue can be deleted
-We need another `webhook` to set default state value for queue during queue creating, add `mutatingwebhookconfiguration`
+We need another `webhook` to set the default state value for the queue during queue creation; add a `mutatingwebhookconfiguration`
 and `MutateQueues` function
 ```yaml
@@ -207,8 +207,8 @@ considered:
 * If the `state` value is empty, the status of queue will be set as `Open`
 * If the `state` value is `Open`, then the status of queue will also be `Open`
-* If the `state` value is `Closed`, then we need to further consider whether there is a podgroup under the queue. if
-there is a podgroup under the queue, the status of the queue will be set as `Closing`, while if there is no podgroup
+* If the `state` value is `Closed`, then we need to further consider whether there is a podgroup under the queue. If
+there is a podgroup under the queue, the status of the queue will be set as `Closing`, while if there is no podgroup
 under the queue, the status of queue will be set as `Closed`.
 ### Queue Placement Restriction
@@ -221,7 +221,7 @@ When creating job, we need to verify the status of queue specified by the job:
 ### Queue State on The Scheduling Process
-The above three states of queue have no effect on the existing scheduling process, for there is no pod under queue with
+The above three states of queue have no effect on the existing scheduling process, for there is no pod under a queue with
 `Closed` state, while pods under queues with `Open` or `Closing` state should be scheduled normally.
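A small sketch of the three states and the scheduling column of the table above; the `QueueState` type and constant names here are illustrative assumptions, not the exact CRD types.

```go
package main

import "fmt"

type QueueState string

const (
	QueueStateOpen    QueueState = "Open"    // receives new tasks, cannot be deleted
	QueueStateClosed  QueueState = "Closed"  // no new tasks, can be deleted
	QueueStateClosing QueueState = "Closing" // draining: waiting for tasks to exit
)

// schedulable mirrors the "can be scheduled" column of the table above:
// pods under queues in all three states are still scheduled normally.
func schedulable(s QueueState) bool {
	switch s {
	case QueueStateOpen, QueueStateClosed, QueueStateClosing:
		return true
	}
	return false
}

func main() {
	fmt.Println(schedulable(QueueStateClosing)) // true
}
```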
### Queue State on `vcctl`
diff --git a/docs/design/queue/queue.md b/docs/design/queue/queue.md
index e901c47f6..09b4154bc 100644
--- a/docs/design/queue/queue.md
+++ b/docs/design/queue/queue.md
@@ -4,7 +4,7 @@
 ## Motivation
-`Queue` was introduced in [kube-batch](http://github.com/kubernetes-sigs/kube-batch) long time ago as an internal feature, which makes all jobs are submitted to the same queue, named `default`. As more and more users would like to share resources with each other by queue, this proposal is going to cover primary features of queue achieve that.
+`Queue` was introduced in [kube-batch](http://github.com/kubernetes-sigs/kube-batch) a long time ago as an internal feature, which made all jobs be submitted to the same queue, named `default`. As more and more users would like to share resources with each other by queue, this proposal is going to cover the primary features of queue to achieve that.
 ## Function Specification
@@ -50,7 +50,7 @@ type QueueStatus struct {
 ### QueueController
-The `QueueController` will manage the lifecycle of queue:
+The `QueueController` will manage the lifecycle of the queue:
 1. Watching `PodGroup`/`Job` for status
 2. If `Queue` was deleted, also delete all related `PodGroup`/`Job` in the queue
@@ -66,7 +66,7 @@ The admission controller will check `PodGroup`/`Job` 's queue when creation:
 #### Customized Job/PodGroup
-If the `PodGroup` is created by customized controller, the `QueueController` will count those `PodGroup` into `Unknown` status; because `PodGroup` focus on scheduling specification which did not include customized job's status.
+If the `PodGroup` is created by a customized controller, the `QueueController` will count those `PodGroup`s into `Unknown` status, because `PodGroup` focuses on the scheduling specification, which does not include the customized job's status.
 #### cli
@@ -100,11 +100,11 @@ myqueue 10 10 5 5
 #### Scheduler
-* Proportion plugin:
+* Proportion plugin:
- Proportion plugin is used to share resource between `Queue`s by weight. The deserved resource of a queue is `(weight/total-weight) * total-resource`. When allocating resources, it will not allocate resource more than its deserved resources.
+ The Proportion plugin is used to share resources between `Queue`s by weight. The deserved resource of a queue is `(weight/total-weight) * total-resource`. When allocating resources, it will not allocate more than the queue's deserved resources.
-* Reclaim action:
+* Reclaim action:
 `reclaim` action will go through all queues to reclaim others by `ReclaimableFn`'s return value; the time complexity is `O(n^2)`. In `ReclaimableFn`, both `proportion` and `gang` will take effect: 1. `proportion` makes sure the queue will not be under-used after reclaim, 2. `gang` makes sure the job will not be reclaimed if its `minAvailable` > 1.
diff --git a/docs/design/reclaim-action.md b/docs/design/reclaim-action.md
index 7fb3a669a..35b116e57 100644
--- a/docs/design/reclaim-action.md
+++ b/docs/design/reclaim-action.md
@@ -14,22 +14,22 @@ When a new queue is created, resource is divided among queues depending on its r
 Consider two queues are already present and the entire cluster resource is used by both queues. When a third queue is created, the deserved share of the previous two queues is reduced, since resource should be given to the third queue as well. So jobs/tasks which are under the old queues will not be evicted until new jobs/tasks come to the new queue (Third Queue). At that point of time,
-resource for third queue(i.e. New Queue) should be reclaimed(i.e. few tasks/jobs should be evicted) from previous two queues, so that new job in third queue can
+resource for the third queue (i.e. New Queue) should be reclaimed (i.e. a few tasks/jobs should be evicted) from the previous two queues, so that the new job in the third queue can
 be created.
-Reclaim is basically evicting tasks from other queues so that present queue can make use of it's entire deserved share for
+Reclaim is basically evicting tasks from other queues so that the present queue can make use of its entire deserved share for
 creating tasks.
 In Reclaim Action, there are multiple plugin functions that are used, like
-1. TaskOrderFn(Plugin: Priority),
+1. TaskOrderFn(Plugin: Priority),
 2. JobOrderFn(Plugin: Priority, DRF, Gang),
-3. NodeOrderFn(Plugin: NodeOrder),
-4. PredicateFn(Plugin: Predicates),
+3. NodeOrderFn(Plugin: NodeOrder),
+4. PredicateFn(Plugin: Predicates),
 5. ReclaimableFn(Plugin: Conformance, Gang, Proportion).
 ### 1. TaskOrderFn:
-#### Priority:
+#### Priority:
 Compares the taskPriority set in PodSpec and returns the decision of comparison between two priorities.
 ### 2. JobOrderFn:
diff --git a/docs/design/reclaim-design.md b/docs/design/reclaim-design.md
index bb1d4f872..1b3d766c3 100644
--- a/docs/design/reclaim-design.md
+++ b/docs/design/reclaim-design.md
@@ -15,7 +15,7 @@ Reclaim runs in each session and the workflow of the session is explained below
 1. If queue is overused, move on to next queue from queues object
 2. If queue is not overused, check for jobs which have pending tasks within that queue and select the preemptor task
 4. Range over all nodes and run predicateFn for the preemptor task
- 1. If predicates are not satisfied, move on to next node
+ 1. If predicates are not satisfied, move on to the next node
 2. If all the predicates are satisfied
 1. Range over all tasks running on that node but from a different queue than the preemptor task's queue and find all **reclaimees** tasks
 2. Send the preemptor task and the set of **reclaimees** tasks to ReclaimableFn, which has been loaded by plugins such as conformance, gang and proportion
 8. Run this until the **queues** object is empty
 ![Execution flow graph for Reclaim](./images/ReclaimDesign.png)
-
-
-
- \ No newline at end of file
+
+
+
diff --git a/docs/development/prepare-for-development.md b/docs/development/prepare-for-development.md
index aa98cba6d..61273d9d5 100644
--- a/docs/development/prepare-for-development.md
+++ b/docs/development/prepare-for-development.md
@@ -57,4 +57,4 @@ Alternatively you can [add your SSH keys](https://help.github.com/articles/addin
 ### What's next?
 Once you've set up the prerequisites, continue with [Using the Code Base](./development.md)
-for more details about how to build & test Volcano. \ No newline at end of file
+for more details about how to build & test Volcano. \ No newline at end of file
diff --git a/docs/getting-started/getting-started.md b/docs/getting-started/getting-started.md
index e8123b757..410ba203b 100644
--- a/docs/getting-started/getting-started.md
+++ b/docs/getting-started/getting-started.md
@@ -12,11 +12,11 @@ frameworks like TensorFlow, Spark, PyTorch, MPI, etc, which Volcano integrates w
 ### Why Volcano?
- // TODO better to add separate md file & Link
-- Learn about Volcano [here](https://github.com/volcano-sh/volcano/blob/master/README.md)
+- Learn about Volcano [here](https://github.com/volcano-sh/volcano/blob/master/README.md)
-### First Steps
+### First Steps
 To get the most out of Volcano, start by reviewing a few introductory topics:
 - [prepare-for-development](../development/prepare-for-development.md) - Preparation for development
-- [Setup](../development/development.md) - Install Volcano
-- [Contributing](https://github.com/volcano-sh/volcano/blob/master/contribute.md) - Contribute to Volcano
-- [Troubleshooting](../troubleshooting/troubleshooting.md) - Troubleshoot commonly occurring issues. GitHub issues are [here](https://github.com/volcano-sh/volcano/issues)
+- [Setup](../development/development.md) - Install Volcano
+- [Contributing](https://github.com/volcano-sh/volcano/blob/master/contribute.md) - Contribute to Volcano
+- [Troubleshooting](../troubleshooting/troubleshooting.md) - Troubleshoot commonly occurring issues. GitHub issues are [here](https://github.com/volcano-sh/volcano/issues)
diff --git a/docs/getting-started/support.md b/docs/getting-started/support.md
index 74bf4db36..94fed921b 100644
--- a/docs/getting-started/support.md
+++ b/docs/getting-started/support.md
@@ -5,10 +5,10 @@ If you need support, start with the [troubleshooting guide](../troubleshooting/t
 ## Community
-**Slack channel:**
+**Slack channel:**
 We use Slack for public discussions. To chat with us or the rest of the community, join us in the [Volcano Slack](https://volcano-sh.slack.com) team channel #general. To sign up, use our Slack inviter link [here](https://join.slack.com/t/volcano-sh/shared_invite/enQtNTU5NTU3NDU0MTc4LTgzZTQ2MzViNTFmNDg1ZGUyMzcwNjgxZGQ1ZDdhOGE3Mzg1Y2NkZjk1MDJlZTZhZWU5MDg2MWJhMzI3Mjg3ZTk).
-**Mailing List**
+**Mailing List**
 Please sign up on our [mailing list](https://groups.google.com/forum/#!forum/volcano-sh)
diff --git a/docs/volcano-introduction.pptx b/docs/volcano-introduction.pptx
index 8ea76ea40..ca4d3dafa 100644
Binary files a/docs/volcano-introduction.pptx and b/docs/volcano-introduction.pptx differ
diff --git a/example/huawei-connection/Volcano-intro.pptx b/example/huawei-connection/Volcano-intro.pptx
index dd20c4ab6..167632d54 100644
Binary files a/example/huawei-connection/Volcano-intro.pptx and b/example/huawei-connection/Volcano-intro.pptx differ
diff --git a/example/integrations/paddlepaddle/README.md b/example/integrations/paddlepaddle/README.md
index df28983bd..fccfca5fd 100644
--- a/example/integrations/paddlepaddle/README.md
+++ b/example/integrations/paddlepaddle/README.md
@@ -1,6 +1,6 @@
 # Click-Through-Rate Distributed Training with PaddlePaddle on Volcano
-This is an example of running Click-Through-Rate(ctr) distributed training with PaddlePaddle on Volcano. The source code
+This is an example of running Click-Through-Rate (ctr) distributed training with PaddlePaddle on Volcano. The source code
 is taken from PaddlePaddle EDL team's example [here](https://github.com/PaddlePaddle/edl/tree/develop/example/ctr).
The directory contains the following files: diff --git a/example/kubecon-2019-china/tf-sample/tf-example.yaml b/example/kubecon-2019-china/tf-sample/tf-example.yaml index d73a8350f..8861a13c7 100644 --- a/example/kubecon-2019-china/tf-sample/tf-example.yaml +++ b/example/kubecon-2019-china/tf-sample/tf-example.yaml @@ -65,7 +65,7 @@ spec: ports: - containerPort: 2222 name: tfjob-port - resources: + resources: requests: cpu: "1000m" memory: "2048Mi" diff --git a/hack/check-generated-yaml.sh b/hack/check-generated-yaml.sh index 1426edf1c..ce4634594 100755 --- a/hack/check-generated-yaml.sh +++ b/hack/check-generated-yaml.sh @@ -21,6 +21,7 @@ set -o pipefail VK_ROOT=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/.. export RELEASE_FOLDER=${VK_ROOT}/${RELEASE_DIR} +sed -i 's/[ \t]*$//' ${RELEASE_FOLDER}/volcano-latest.yaml if ! diff ${VK_ROOT}/installer/volcano-development.yaml ${RELEASE_FOLDER}/volcano-latest.yaml ; then { diff --git a/hack/local-up-cluster.sh b/hack/local-up-cluster.sh index 27f35809a..f465a6f43 100755 --- a/hack/local-up-cluster.sh +++ b/hack/local-up-cluster.sh @@ -1,6 +1,6 @@ #!/bin/bash -if [ -z $GOPATH ]; then +if [ -z $GOPATH ]; then echo "Please set GOPATH to start the cluster :)" exit 1 fi @@ -22,7 +22,7 @@ function install_tools { for d in work logs certs config static-pods do mkdir -p ${VC_HOME}/volcano/$d - done + done go get -u github.com/cloudflare/cfssl/cmd/... } @@ -30,7 +30,7 @@ function install_tools { function build_binaries { echo "Building Kubernetes ...... " echo "$( - cd $K8S_HOME + cd $K8S_HOME make kubectl kube-controller-manager kube-apiserver kubelet kube-proxy )" @@ -58,7 +58,7 @@ function create_certkey { echo '{"CN":"'${cn}'","hosts":['${hosts}'],"key":{"algo":"rsa","size":2048},"names":[{"O":"'${org}'"}]}' \ | cfssl gencert -ca=${CERT_DIR}/root.pem -ca-key=${CERT_DIR}/root-key.pem -config=${CERT_DIR}/root-ca-config.json - \ - | cfssljson -bare ${CERT_DIR}/$name + | cfssljson -bare ${CERT_DIR}/$name } function generate_cert_files { @@ -69,7 +69,7 @@ function generate_cert_files { echo '{"CN":"volcano","key":{"algo":"rsa","size":2048},"names":[{"O":"volcano"}]}' | cfssl gencert -initca - \ | cfssljson -bare ${CERT_DIR}/root - + create_certkey "kube-apiserver" "kubernetes.default" "volcano" "kubernetes.default.svc" "localhost" "127.0.0.1" "10.0.0.1" create_certkey "admin" "system:admin" "system:masters" create_certkey "kube-proxy" "system:kube-proxy" "volcano" diff --git a/installer/README.md b/installer/README.md index 35c065ff6..66309b8f8 100644 --- a/installer/README.md +++ b/installer/README.md @@ -4,7 +4,7 @@ Volcano is a batch system built on Kubernetes. It provides a suite of mechanisms Kubernetes that are commonly required by many classes of batch & elastic workload including: 1. machine learning/deep learning, -2. bioinformatics/genomics, and +2. bioinformatics/genomics, and 3. other "big data" applications. 
## Prerequisites diff --git a/installer/volcano-development.yaml b/installer/volcano-development.yaml index 6cda502bc..4f1d43337 100644 --- a/installer/volcano-development.yaml +++ b/installer/volcano-development.yaml @@ -23,7 +23,7 @@ data: - name: proportion - name: nodeorder - name: binpack - + --- apiVersion: v1 kind: ServiceAccount @@ -118,7 +118,7 @@ spec: app: volcano-scheduler spec: serviceAccount: volcano-scheduler - + containers: - name: volcano-scheduler image: volcanosh/vc-scheduler:latest @@ -208,7 +208,7 @@ spec: app: volcano-admission spec: serviceAccount: volcano-admission - + containers: - args: - --tls-cert-file=/admission.local.config/certificates/tls.crt @@ -358,7 +358,7 @@ spec: app: volcano-controller spec: serviceAccount: volcano-controllers - + containers: - name: volcano-controllers image: volcanosh/vc-controller-manager:latest diff --git a/test/e2e/vcctl.go b/test/e2e/vcctl.go index fdfa5a1a6..b680225de 100644 --- a/test/e2e/vcctl.go +++ b/test/e2e/vcctl.go @@ -57,7 +57,7 @@ Usage: vcctl job [command] Available Commands: - delete delete a job + delete delete a job list list job information resume resume a job run run job by parameters from the command line