#!/usr/bin/env python
# coding: utf-8

# In[1]:
|
|
|
|
|
|
from kubernetes.client import V1PodTemplateSpec
|
|
from kubernetes.client import V1ObjectMeta
|
|
from kubernetes.client import V1PodSpec
|
|
from kubernetes.client import V1Container
|
|
from kubernetes.client import V1ResourceRequirements
|
|
|
|
|
|
# In[2]:
|
|
|
|
|
|
from mpijob import V1ReplicaSpec
|
|
from mpijob import V2beta1MPIJob
|
|
from mpijob import V2beta1MPIJobSpec
|
|
|
|
|
|
# In[3]:
|
|
|
|
|
|
from kubernetes import client, config
|
|
|
|
|
|
# In[4]:
|
|
|
|
|
|
# Container image used by the launcher and worker containers.
default_image = "docker.io/kubeflow/mpi-horovod-mnist"

# A container's ``command`` is an entrypoint *array* in the Kubernetes API
# (``list[str]`` in the Python client), so the executable is wrapped in a
# list instead of being passed as a bare string.
launcher_command = ["mpirun"]

# Arguments appended to ``launcher_command``; kept one option per line.
launcher_args = [
    "-np", "2",  # the job in this file declares 2 workers with 1 slot each
    "--allow-run-as-root",
    "-bind-to", "none",
    "-map-by", "slot",
    "-x", "LD_LIBRARY_PATH",
    "-x", "PATH",
    "-mca", "pml", "ob1",
    "-mca", "btl", "^openib",
    "python", "/examples/tensorflow_mnist.py",
]
|
|
|
|
# Container for the launcher pod: runs ``launcher_command`` with
# ``launcher_args`` on the shared image, limited to 1 CPU and 2Gi of memory.
launcher_container = V1Container(
    name="mpi-launcher",
    image=default_image,
    resources=V1ResourceRequirements(limits={"cpu": "1", "memory": "2Gi"}),
    command=launcher_command,
    args=launcher_args,
)
|
|
|
|
# Container for each worker pod: shared image, limited to 2 CPUs and 4Gi of
# memory.
#
# ``command``/``args`` are intentionally left unset. Only the launcher should
# invoke ``mpirun``; giving the workers the same command would make every
# worker start its own ``mpirun`` instead of the default entrypoint.
# NOTE(review): the upstream tensorflow-mnist MPIJob manifest likewise sets no
# command on the worker container - confirm this holds for the image in use.
worker_container = V1Container(
    name="mpi-worker",
    image=default_image,
    resources=V1ResourceRequirements(limits={"cpu": "2", "memory": "4Gi"}),
)
|
|
|
|
|
|
# In[5]:
|
|
|
|
|
|
# Replica spec for the launcher role: one replica whose pod template holds
# only the launcher container.
launcher_spec = V1ReplicaSpec(
    template=V1PodTemplateSpec(
        spec=V1PodSpec(containers=[launcher_container]),
    ),
    replicas=1,
)
|
|
|
|
# Replica spec for the worker role: two replicas whose pod template holds
# only the worker container.
worker_spec = V1ReplicaSpec(
    template=V1PodTemplateSpec(
        spec=V1PodSpec(containers=[worker_container]),
    ),
    replicas=2,
)
|
|
|
|
|
|
# In[6]:
|
|
|
|
|
|
# The MPIJob object to submit: named "tensorflow-mnist", one slot per worker,
# with the launcher and worker replica specs keyed by role name.
job = V2beta1MPIJob(
    api_version="kubeflow.org/v2beta1",
    kind="MPIJob",
    metadata=V1ObjectMeta(name="tensorflow-mnist"),
    spec=V2beta1MPIJobSpec(
        slots_per_worker=1,
        mpi_replica_specs={
            "Launcher": launcher_spec,
            "Worker": worker_spec,
        },
    ),
)
|
|
|
|
|
|
# In[7]:
|
|
|
|
|
|
# Configure the Kubernetes client from the local kubeconfig file.
config.load_kube_config()


# In[8]:


# Client for the generic custom-objects API, used below to create the MPIJob.
crd_api = client.CustomObjectsApi()


# In[9]:


# Create the job as a "mpijobs" object of kubeflow.org/v2beta1 in the
# "default" namespace.
crd_api.create_namespaced_custom_object(
    body=job,
    group="kubeflow.org",
    version="v2beta1",
    plural="mpijobs",
    namespace="default",
)
|