Merge b8c6b9abfe into b1c5c9d060
This commit is contained in:
commit
0e6e609ea8
|
|
@ -0,0 +1,8 @@
|
|||
# DeepSpeed Example
|
||||
|
||||
This demo introduces the basic usage of DeepSpeed with the MPI Operator.
|
||||
|
||||
## References
|
||||
|
||||
* https://github.com/microsoft/DeepSpeedExamples/blob/master/training/HelloDeepSpeed/README.md
|
||||
* https://www.alibabacloud.com/help/en/ack/cloud-native-ai-suite/user-guide/deepspeed-distributed-training
|
||||
|
|
@ -0,0 +1,32 @@
|
|||
# MPIJob that starts the HelloDeepSpeed BERT training script through the
# `deepspeed` command: one Launcher pod plus two Worker pods with GPUs.
apiVersion: kubeflow.org/v2beta1
kind: MPIJob
metadata:
  name: deepspeed-helloworld
spec:
  # NOTE(review): each Worker below is limited to 8 GPUs while only one slot
  # per worker is declared here; confirm this matches what the launcher in
  # the image expects.
  slotsPerWorker: 1
  runPolicy:
    # Pod clean-up policy applied by the operator once the job finishes.
    cleanPodPolicy: Running
  mpiReplicaSpecs:
    # The Launcher runs `deepspeed` with the training script and its
    # arguments; it requests no GPU resources of its own.
    Launcher:
      replicas: 1
      template:
        spec:
          containers:
            - image: registry.cn-beijing.aliyuncs.com/acs/deepspeed:hello-deepspeed
              name: deepspeed-helloworld
              command:
                - deepspeed
              args:
                - /workspace/DeepSpeedExamples/HelloDeepSpeed/train_bert_ds.py
                - --checkpoint_dir
                - /workspace
    # Workers use the same image as the Launcher and define no command of
    # their own; each one is limited to 8 NVIDIA GPUs.
    Worker:
      replicas: 2
      template:
        spec:
          containers:
            - image: registry.cn-beijing.aliyuncs.com/acs/deepspeed:hello-deepspeed
              name: deepspeed-helloworld
              resources:
                limits:
                  nvidia.com/gpu: 8
|
||||
Loading…
Reference in New Issue