allow specifying gpus explicitly (#16)

* allow specifying gpu resources explicitly; also no longer allocate gpus to the launcher

* small fixes

* address review comments
Rong Ou 2018-06-14 09:49:28 -07:00 committed by k8s-ci-robot
parent 0e3119bae6
commit a3487b2208
6 changed files with 176 additions and 69 deletions


@@ -0,0 +1,16 @@
# This file shows how to run multi-node training benchmarks using an MPIJob,
# specifying GPUs explicitly per replica.
apiVersion: kubeflow.org/v1alpha1
kind: MPIJob
metadata:
  name: tensorflow-benchmarks-16-custom
spec:
  replicas: 4
  template:
    spec:
      containers:
      - image: mpioperator/tensorflow-benchmarks:latest
        name: tensorflow-benchmarks
        resources:
          limits:
            nvidia.com/gpu: 4


@@ -1,4 +1,15 @@
# This file shows how to run multi-node training benchmarks using an MPIJob.
# This file shows how to run multi-node training benchmarks using an MPIJob,
# letting the operator decide on how best to allocate GPUs.
#
# In this mode, the operator assumes all nodes have the same number of GPUs.
# If `gpus` is bigger than the number of GPUs per node, then only whole nodes
# can be allocated.
#
# For example, if each node has 8 GPUs, the valid `gpus` values are:
# 1, 2, 4, 8, 16, 24, 32, ...or any multiple of 8.
#
# If you need more flexibility in allocating GPUs, you can use the alternative
# mode to specify `replicas` and GPU resource limit explicitly.
apiVersion: kubeflow.org/v1alpha1
kind: MPIJob
metadata:

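The whole-node rule described in the comment above comes down to a small piece of arithmetic. The sketch below is an illustration only, not operator code (the operator's actual implementation, allocateGPUs, appears in the controller changes further down in this commit); wholeNodeAllocation is a hypothetical helper and 8 GPUs per node is an assumption.

package main

import "fmt"

// wholeNodeAllocation applies the rule from the comment above: a request smaller
// than one node fits on a single partial-node worker; anything larger must be an
// exact multiple of the node size and is split into whole-node workers.
func wholeNodeAllocation(gpus, gpusPerNode int) (workers, gpusPerWorker int, err error) {
	if gpus < gpusPerNode {
		return 1, gpus, nil
	}
	if gpus%gpusPerNode == 0 {
		return gpus / gpusPerNode, gpusPerNode, nil
	}
	return 0, 0, fmt.Errorf("%d GPUs is not a multiple of %d GPUs per node", gpus, gpusPerNode)
}

func main() {
	for _, g := range []int{1, 4, 8, 16, 24, 12} {
		workers, perWorker, err := wholeNodeAllocation(g, 8)
		fmt.Println(g, "->", workers, "worker(s) x", perWorker, "GPU(s)", err)
	}
}

Running it shows one partial-node worker for requests below 8, whole-node workers for exact multiples of 8, and an error for 12.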

@@ -39,9 +39,16 @@ type MPIJobList struct {
type MPIJobSpec struct {
// Specifies the desired number of GPUs the MPIJob should run on.
// Mutually exclusive with the `Replicas` field.
// +optional
GPUs *int32 `json:"gpus,omitempty"`
// Specifies the desired number of replicas the MPIJob should run on.
// The `PodSpec` should specify the number of GPUs.
// Mutually exclusive with the `GPUs` field.
// +optional
Replicas *int32 `json:"replicas,omitempty"`
// Describes the pod that will be created when executing an MPIJob.
Template corev1.PodTemplateSpec `json:"template,omitempty"`
}

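To make the mutual exclusivity documented above concrete, here is a minimal Go sketch (not part of this commit) that fills in MPIJobSpec both ways; the container name and image are borrowed from the example manifests, and the pod template is left out of the first form purely for brevity.

package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"

	kubeflow "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v1alpha1"
)

func int32Ptr(i int32) *int32 { return &i }

func main() {
	// Form 1: set GPUs and let the operator choose replicas and GPUs per worker.
	// (A real spec would still carry a Template describing the training container.)
	byGPUs := kubeflow.MPIJobSpec{
		GPUs: int32Ptr(16),
	}

	// Form 2: set Replicas and put an explicit GPU limit on the container.
	byReplicas := kubeflow.MPIJobSpec{
		Replicas: int32Ptr(4),
		Template: corev1.PodTemplateSpec{
			Spec: corev1.PodSpec{
				Containers: []corev1.Container{{
					Name:  "tensorflow-benchmarks",
					Image: "mpioperator/tensorflow-benchmarks:latest",
					Resources: corev1.ResourceRequirements{
						Limits: corev1.ResourceList{
							"nvidia.com/gpu": *resource.NewQuantity(4, resource.DecimalExponent),
						},
					},
				}},
			},
		},
	}

	fmt.Println(*byGPUs.GPUs, *byReplicas.Replicas)
}

Only one of the two fields should be set on any given MPIJob, since the controller treats them as mutually exclusive.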

@@ -95,6 +95,15 @@ func (in *MPIJobSpec) DeepCopyInto(out *MPIJobSpec) {
**out = **in
}
}
if in.Replicas != nil {
in, out := &in.Replicas, &out.Replicas
if *in == nil {
*out = nil
} else {
*out = new(int32)
**out = **in
}
}
in.Template.DeepCopyInto(&out.Template)
return
}


@@ -398,11 +398,10 @@ func (c *MPIJobController) syncHandler(key string) error {
// We're done if the launcher either succeeded or failed.
done := launcher != nil && (launcher.Status.Succeeded == 1 || launcher.Status.Failed == 1)
totalGPUs := getTotalGPUs(mpiJob)
workerReplicas := c.getWorkerReplicas(totalGPUs, done)
gpusPerWorker := totalGPUs
if totalGPUs > c.gpusPerNode {
gpusPerWorker = c.gpusPerNode
workerReplicas, gpusPerWorker, err := allocateGPUs(mpiJob, c.gpusPerNode, done)
if err != nil {
runtime.HandleError(err)
return nil
}
if !done {
@@ -427,7 +426,7 @@ func (c *MPIJobController) syncHandler(key string) error {
}
}
worker, err := c.getOrCreateWorkerStatefulSet(mpiJob, workerReplicas)
worker, err := c.getOrCreateWorkerStatefulSet(mpiJob, workerReplicas, gpusPerWorker)
if err != nil {
return err
}
@@ -435,11 +434,7 @@ func (c *MPIJobController) syncHandler(key string) error {
// If the worker is ready, start the launcher.
workerReady := workerReplicas == 0 || int(worker.Status.ReadyReplicas) == workerReplicas
if workerReady && launcher == nil {
launcherGPUs := totalGPUs
if launcherGPUs > c.gpusPerNode {
launcherGPUs = c.gpusPerNode
}
launcher, err = c.kubeClient.BatchV1().Jobs(namespace).Create(newLauncher(mpiJob, launcherGPUs, c.kubectlDeliveryImage))
launcher, err = c.kubeClient.BatchV1().Jobs(namespace).Create(newLauncher(mpiJob, c.kubectlDeliveryImage))
if err != nil {
return err
}
@@ -480,13 +475,36 @@ func (c *MPIJobController) getLauncherJob(mpiJob *kubeflow.MPIJob) (*batchv1.Job
return launcher, nil
}
// getTotalGPUs gets the total number of desired GPUs. Defaults to 1 if not specified.
func getTotalGPUs(mpiJob *kubeflow.MPIJob) int {
totalGPUs := 1
// allocateGPUs allocates the worker replicas and GPUs per worker.
func allocateGPUs(mpiJob *kubeflow.MPIJob, gpusPerNode int, done bool) (workerReplicas int, gpusPerWorker int, err error) {
workerReplicas = 0
gpusPerWorker = 0
err = nil
if mpiJob.Spec.GPUs != nil {
totalGPUs = int(*mpiJob.Spec.GPUs)
totalGPUs := int(*mpiJob.Spec.GPUs)
if totalGPUs < gpusPerNode {
workerReplicas = 1
gpusPerWorker = totalGPUs
} else if totalGPUs % gpusPerNode == 0 {
workerReplicas = totalGPUs / gpusPerNode
gpusPerWorker = gpusPerNode
} else {
err = fmt.Errorf("specified #GPUs is not a multiple of GPUs per node (%d)", gpusPerNode)
}
} else if mpiJob.Spec.Replicas != nil {
workerReplicas = int(*mpiJob.Spec.Replicas)
container := mpiJob.Spec.Template.Spec.Containers[0]
if container.Resources.Limits != nil {
if val, ok := container.Resources.Limits[gpuResourceName]; ok {
gpus, _ := val.AsInt64()
gpusPerWorker = int(gpus)
}
}
}
return totalGPUs
if done {
workerReplicas = 0
}
return workerReplicas, gpusPerWorker, err
}
// getWorkerReplicas gets the desired number of worker replicas.
@@ -603,11 +621,11 @@ func (c *MPIJobController) getLauncherRoleBinding(mpiJob *kubeflow.MPIJob) (*rba
// getOrCreateWorkerStatefulSet gets the worker StatefulSet controlled by this
// MPIJob, or creates one if it doesn't exist.
func (c *MPIJobController) getOrCreateWorkerStatefulSet(mpiJob *kubeflow.MPIJob, workerReplicas int) (*appsv1.StatefulSet, error) {
func (c *MPIJobController) getOrCreateWorkerStatefulSet(mpiJob *kubeflow.MPIJob, workerReplicas int, gpusPerWorker int) (*appsv1.StatefulSet, error) {
worker, err := c.statefulSetLister.StatefulSets(mpiJob.Namespace).Get(mpiJob.Name + workerSuffix)
// If the StatefulSet doesn't exist, we'll create it.
if errors.IsNotFound(err) && workerReplicas > 0 {
worker, err = c.kubeClient.AppsV1().StatefulSets(mpiJob.Namespace).Create(newWorker(mpiJob, int32(workerReplicas), c.gpusPerNode))
worker, err = c.kubeClient.AppsV1().StatefulSets(mpiJob.Namespace).Create(newWorker(mpiJob, int32(workerReplicas), gpusPerWorker))
}
// If an error occurs during Get/Create, we'll requeue the item so we
// can attempt processing again later. This could have been caused by a
@@ -626,7 +644,7 @@ func (c *MPIJobController) getOrCreateWorkerStatefulSet(mpiJob *kubeflow.MPIJob,
// If the worker is out of date, update the worker.
if worker != nil && int(*worker.Spec.Replicas) != workerReplicas {
worker, err = c.kubeClient.AppsV1().StatefulSets(mpiJob.Namespace).Update(newWorker(mpiJob, int32(workerReplicas), c.gpusPerNode))
worker, err = c.kubeClient.AppsV1().StatefulSets(mpiJob.Namespace).Update(newWorker(mpiJob, int32(workerReplicas), gpusPerWorker))
// If an error occurs during Update, we'll requeue the item so we can
// attempt processing again later. This could have been caused by a
// temporary network failure, or any other transient reason.
@@ -727,10 +745,14 @@ shift
%s/kubectl exec ${POD_NAME} -- /bin/sh -c "$*"
`, kubectlMountPath)
// If no GPU is specified, default to 1 slot.
slots := 1
if gpusPerWorker > 0 {
slots = gpusPerWorker
}
var buffer bytes.Buffer
buffer.WriteString(fmt.Sprintf("localhost slots=%d max_slots=%d\n", gpusPerWorker, gpusPerWorker))
for i := 0; i < workerReplicas; i++ {
buffer.WriteString(fmt.Sprintf("%s%s-%d slots=%d max_slots=%d\n", mpiJob.Name, workerSuffix, i, gpusPerWorker, gpusPerWorker))
buffer.WriteString(fmt.Sprintf("%s%s-%d slots=%d max_slots=%d\n", mpiJob.Name, workerSuffix, i, slots, slots))
}
return &corev1.ConfigMap{
@@ -903,7 +925,7 @@ func newWorker(mpiJob *kubeflow.MPIJob, desiredReplicas int32, gpus int) *appsv1
// newLauncher creates a new launcher Job for an MPIJob resource. It also sets
// the appropriate OwnerReferences on the resource so handleObject can discover
// the MPIJob resource that 'owns' it.
func newLauncher(mpiJob *kubeflow.MPIJob, gpus int, kubectlDeliveryImage string) *batchv1.Job {
func newLauncher(mpiJob *kubeflow.MPIJob, kubectlDeliveryImage string) *batchv1.Job {
launcherName := mpiJob.Name + launcherSuffix
labels := map[string]string{
"app": launcherName,
@@ -938,10 +960,9 @@ func newLauncher(mpiJob *kubeflow.MPIJob, gpus int, kubectlDeliveryImage string)
Name: "OMPI_MCA_orte_default_hostfile",
Value: fmt.Sprintf("%s/%s", configMountPath, hostfileName),
})
if container.Resources.Limits == nil {
container.Resources.Limits = make(corev1.ResourceList)
if container.Resources.Limits != nil {
delete(container.Resources.Limits, gpuResourceName)
}
container.Resources.Limits[gpuResourceName] = *resource.NewQuantity(int64(gpus), resource.DecimalExponent)
container.VolumeMounts = append(container.VolumeMounts,
corev1.VolumeMount{
Name: kubectlVolumeName,

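The other half of the commit message, no longer allocating GPUs to the launcher, shows up in newLauncher above: instead of setting a GPU limit, the launcher container now drops any GPU limit it inherited from the template, since it only drives the workers through mpirun and kubectl exec. Below is a minimal standalone sketch of that pattern; stripGPULimits and the typed gpuResourceName constant are illustrative stand-ins, not code from this commit.

package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

// gpuResourceName mirrors the controller's constant for "nvidia.com/gpu".
const gpuResourceName = corev1.ResourceName("nvidia.com/gpu")

// stripGPULimits mimics what newLauncher now does for the launcher container:
// keep every other resource limit, but never request GPUs.
func stripGPULimits(c *corev1.Container) {
	if c.Resources.Limits != nil {
		delete(c.Resources.Limits, gpuResourceName)
	}
}

func main() {
	c := corev1.Container{
		Resources: corev1.ResourceRequirements{
			Limits: corev1.ResourceList{
				corev1.ResourceCPU: *resource.NewQuantity(2, resource.DecimalSI),
				gpuResourceName:    *resource.NewQuantity(4, resource.DecimalExponent),
			},
		},
	}
	stripGPULimits(&c)
	_, hasGPU := c.Resources.Limits[gpuResourceName]
	fmt.Println("GPU limit present after strip:", hasGPU) // false; the CPU limit is untouched
}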

@@ -36,6 +36,7 @@ import (
kubeflow "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v1alpha1"
"github.com/kubeflow/mpi-operator/pkg/client/clientset/versioned/fake"
informers "github.com/kubeflow/mpi-operator/pkg/client/informers/externalversions"
"k8s.io/apimachinery/pkg/api/resource"
)
var (
@@ -98,6 +99,34 @@ func newMPIJob(name string, gpus *int32) *kubeflow.MPIJob {
}
}
func newMPIJobWithCustomResources(name string, replicas *int32, gpusPerReplica int64) *kubeflow.MPIJob {
return &kubeflow.MPIJob{
TypeMeta: metav1.TypeMeta{APIVersion: kubeflow.SchemeGroupVersion.String()},
ObjectMeta: metav1.ObjectMeta{
Name: name,
Namespace: metav1.NamespaceDefault,
},
Spec: kubeflow.MPIJobSpec{
Replicas: replicas,
Template: corev1.PodTemplateSpec{
Spec: corev1.PodSpec{
Containers: []corev1.Container{
{
Name: "foo",
Image: "bar",
Resources: corev1.ResourceRequirements{
Limits: corev1.ResourceList{
"nvidia.com/gpu": *resource.NewQuantity(gpusPerReplica, resource.DecimalExponent),
},
},
},
},
},
},
},
}
}
func (f *fixture) newController() (*MPIJobController, informers.SharedInformerFactory, kubeinformers.SharedInformerFactory) {
f.client = fake.NewSimpleClientset(f.objects...)
f.kubeClient = k8sfake.NewSimpleClientset(f.kubeObjects...)
@@ -415,7 +444,7 @@ func TestLauncherNotControlledByUs(t *testing.T) {
mpiJob := newMPIJob("test", int32Ptr(64))
f.setUpMPIJob(mpiJob)
launcher := newLauncher(mpiJob, 64, "kubectl-delivery")
launcher := newLauncher(mpiJob, "kubectl-delivery")
launcher.OwnerReferences = nil
f.setUpLauncher(launcher)
@@ -428,7 +457,7 @@ func TestLauncherSucceeded(t *testing.T) {
mpiJob := newMPIJob("test", int32Ptr(64))
f.setUpMPIJob(mpiJob)
launcher := newLauncher(mpiJob, 64, "kubectl-delivery")
launcher := newLauncher(mpiJob, "kubectl-delivery")
launcher.Status.Succeeded = 1
f.setUpLauncher(launcher)
@@ -445,7 +474,7 @@ func TestLauncherFailed(t *testing.T) {
mpiJob := newMPIJob("test", int32Ptr(64))
f.setUpMPIJob(mpiJob)
launcher := newLauncher(mpiJob, 64, "kubectl-delivery")
launcher := newLauncher(mpiJob, "kubectl-delivery")
launcher.Status.Failed = 1
f.setUpLauncher(launcher)
@@ -462,19 +491,47 @@ func TestLauncherDoesNotExist(t *testing.T) {
mpiJob := newMPIJob("test", int32Ptr(64))
f.setUpMPIJob(mpiJob)
expConfigMap := newConfigMap(mpiJob, 7, 8)
expConfigMap := newConfigMap(mpiJob, 8, 8)
f.expectCreateConfigMapAction(expConfigMap)
expServiceAccount := newLauncherServiceAccount(mpiJob)
f.expectCreateServiceAccountAction(expServiceAccount)
expRole := newLauncherRole(mpiJob, 7)
expRole := newLauncherRole(mpiJob, 8)
f.expectCreateRoleAction(expRole)
expRoleBinding := newLauncherRoleBinding(mpiJob)
f.expectCreateRoleBindingAction(expRoleBinding)
expWorker := newWorker(mpiJob, 7, 8)
expWorker := newWorker(mpiJob, 8, 8)
f.expectCreateStatefulSetAction(expWorker)
mpiJobCopy := mpiJob.DeepCopy()
mpiJobCopy.Status.WorkerReplicas = 0
f.expectUpdateMPIJobStatusAction(mpiJobCopy)
f.run(getKey(mpiJob, t))
}
func TestLauncherDoesNotExistWithCustomResources(t *testing.T) {
f := newFixture(t)
mpiJob := newMPIJobWithCustomResources("test", int32Ptr(4), 4)
f.setUpMPIJob(mpiJob)
expConfigMap := newConfigMap(mpiJob, 4, 4)
f.expectCreateConfigMapAction(expConfigMap)
expServiceAccount := newLauncherServiceAccount(mpiJob)
f.expectCreateServiceAccountAction(expServiceAccount)
expRole := newLauncherRole(mpiJob, 4)
f.expectCreateRoleAction(expRole)
expRoleBinding := newLauncherRoleBinding(mpiJob)
f.expectCreateRoleBindingAction(expRoleBinding)
expWorker := newWorker(mpiJob, 4, 4)
f.expectCreateStatefulSetAction(expWorker)
mpiJobCopy := mpiJob.DeepCopy()
@@ -490,7 +547,7 @@ func TestConfigMapNotControlledByUs(t *testing.T) {
mpiJob := newMPIJob("test", int32Ptr(64))
f.setUpMPIJob(mpiJob)
configMap := newConfigMap(mpiJob, 7, 8)
configMap := newConfigMap(mpiJob, 8, 8)
configMap.OwnerReferences = nil
f.setUpConfigMap(configMap)
@@ -503,7 +560,7 @@ func TestServiceAccountNotControlledByUs(t *testing.T) {
mpiJob := newMPIJob("test", int32Ptr(64))
f.setUpMPIJob(mpiJob)
f.setUpConfigMap(newConfigMap(mpiJob, 7, 8))
f.setUpConfigMap(newConfigMap(mpiJob, 8, 8))
serviceAccount := newLauncherServiceAccount(mpiJob)
serviceAccount.OwnerReferences = nil
@@ -518,10 +575,10 @@ func TestRoleNotControlledByUs(t *testing.T) {
mpiJob := newMPIJob("test", int32Ptr(64))
f.setUpMPIJob(mpiJob)
f.setUpConfigMap(newConfigMap(mpiJob, 7, 8))
f.setUpConfigMap(newConfigMap(mpiJob, 8, 8))
f.setUpServiceAccount(newLauncherServiceAccount(mpiJob))
role := newLauncherRole(mpiJob, 7)
role := newLauncherRole(mpiJob, 8)
role.OwnerReferences = nil
f.setUpRole(role)
@@ -534,9 +591,9 @@ func TestRoleBindingNotControlledByUs(t *testing.T) {
mpiJob := newMPIJob("test", int32Ptr(64))
f.setUpMPIJob(mpiJob)
f.setUpConfigMap(newConfigMap(mpiJob, 7, 8))
f.setUpConfigMap(newConfigMap(mpiJob, 8, 8))
f.setUpServiceAccount(newLauncherServiceAccount(mpiJob))
f.setUpRole(newLauncherRole(mpiJob, 7))
f.setUpRole(newLauncherRole(mpiJob, 8))
roleBinding := newLauncherRoleBinding(mpiJob)
roleBinding.OwnerReferences = nil
@@ -551,11 +608,11 @@ func TestShutdownWorker(t *testing.T) {
mpiJob := newMPIJob("test", int32Ptr(64))
f.setUpMPIJob(mpiJob)
launcher := newLauncher(mpiJob, 64, "kubectl-delivery")
launcher := newLauncher(mpiJob, "kubectl-delivery")
launcher.Status.Succeeded = 1
f.setUpLauncher(launcher)
worker := newWorker(mpiJob, 7, 8)
worker := newWorker(mpiJob, 8, 8)
f.setUpWorker(worker)
expWorker := newWorker(mpiJob, 0, 8)
@@ -575,46 +632,32 @@ func TestWorkerNotControlledByUs(t *testing.T) {
mpiJob := newMPIJob("test", int32Ptr(64))
f.setUpMPIJob(mpiJob)
f.setUpConfigMap(newConfigMap(mpiJob, 7, 8))
f.setUpRbac(mpiJob, 7)
f.setUpConfigMap(newConfigMap(mpiJob, 8, 8))
f.setUpRbac(mpiJob, 8)
worker := newWorker(mpiJob, 7, 8)
worker := newWorker(mpiJob, 8, 8)
worker.OwnerReferences = nil
f.setUpWorker(worker)
f.runExpectError(getKey(mpiJob, t))
}
func TestWorkerNotNeeded(t *testing.T) {
f := newFixture(t)
mpiJob := newMPIJob("test", int32Ptr(8))
f.setUpMPIJob(mpiJob)
f.setUpConfigMap(newConfigMap(mpiJob, 0, 8))
f.setUpRbac(mpiJob, 0)
expLauncher := newLauncher(mpiJob, 8, "kubectl-delivery")
f.expectCreateJobAction(expLauncher)
f.expectUpdateMPIJobStatusAction(mpiJob)
f.run(getKey(mpiJob, t))
}
func TestLauncherActive(t *testing.T) {
f := newFixture(t)
mpiJob := newMPIJob("test", int32Ptr(8))
f.setUpMPIJob(mpiJob)
f.setUpConfigMap(newConfigMap(mpiJob, 0, 8))
f.setUpRbac(mpiJob, 0)
f.setUpConfigMap(newConfigMap(mpiJob, 1, 8))
f.setUpRbac(mpiJob, 1)
launcher := newLauncher(mpiJob, 64, "kubectl-delivery")
launcher := newLauncher(mpiJob, "kubectl-delivery")
launcher.Status.Active = 1
f.setUpLauncher(launcher)
worker := newWorker(mpiJob, 1, 8)
f.setUpWorker(worker)
mpiJobCopy := mpiJob.DeepCopy()
mpiJobCopy.Status.LauncherStatus = kubeflow.LauncherActive
f.expectUpdateMPIJobStatusAction(mpiJobCopy)
@@ -628,18 +671,18 @@ func TestWorkerReady(t *testing.T) {
mpiJob := newMPIJob("test", int32Ptr(16))
f.setUpMPIJob(mpiJob)
f.setUpConfigMap(newConfigMap(mpiJob, 1, 8))
f.setUpRbac(mpiJob, 1)
f.setUpConfigMap(newConfigMap(mpiJob, 2, 8))
f.setUpRbac(mpiJob, 2)
worker := newWorker(mpiJob, 1, 8)
worker.Status.ReadyReplicas = 1
worker := newWorker(mpiJob, 2, 8)
worker.Status.ReadyReplicas = 2
f.setUpWorker(worker)
expLauncher := newLauncher(mpiJob, 8, "kubectl-delivery")
expLauncher := newLauncher(mpiJob, "kubectl-delivery")
f.expectCreateJobAction(expLauncher)
mpiJobCopy := mpiJob.DeepCopy()
mpiJobCopy.Status.WorkerReplicas = 1
mpiJobCopy.Status.WorkerReplicas = 2
f.expectUpdateMPIJobStatusAction(mpiJobCopy)
f.run(getKey(mpiJob, t))
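The expectation changes in these tests (for example, 64 requested GPUs on 8-GPU nodes now produce 8 workers instead of 7, because the launcher no longer consumes GPUs) could also be pinned down directly with a table-driven test along the following lines. TestAllocateGPUs is a hypothetical sketch, not part of this commit; allocateGPUs, newMPIJob, newMPIJobWithCustomResources, and int32Ptr are the identifiers defined in the files above.

func TestAllocateGPUs(t *testing.T) {
	cases := []struct {
		job         *kubeflow.MPIJob
		wantWorkers int
		wantGPUs    int
		wantErr     bool
	}{
		// 64 GPUs on 8-GPU nodes: 8 whole-node workers, none reserved for the launcher.
		{newMPIJob("whole-nodes", int32Ptr(64)), 8, 8, false},
		// Fewer GPUs than one node has: a single partial-node worker.
		{newMPIJob("partial-node", int32Ptr(4)), 1, 4, false},
		// Not a multiple of the node size: rejected.
		{newMPIJob("invalid", int32Ptr(12)), 0, 0, true},
		// Explicit replicas with a per-replica GPU limit are passed through as-is.
		{newMPIJobWithCustomResources("custom", int32Ptr(4), 4), 4, 4, false},
	}
	for _, c := range cases {
		workers, gpus, err := allocateGPUs(c.job, 8, false)
		if (err != nil) != c.wantErr || workers != c.wantWorkers || gpus != c.wantGPUs {
			t.Errorf("allocateGPUs(%s): got (%d, %d, %v), want (%d, %d, err=%v)",
				c.job.Name, workers, gpus, err, c.wantWorkers, c.wantGPUs, c.wantErr)
		}
	}
}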