Merge b8c6b9abfe into b1c5c9d060
This commit is contained in:
commit
0e6e609ea8
|
|
@ -0,0 +1,8 @@
|
||||||
|
# DeepSpeed Example
|
||||||
|
|
||||||
|
This demo introduces the basic usage of DeepSpeed with the MPI Operator.
|
||||||
|
|
||||||
|
## References
|
||||||
|
|
||||||
|
* https://github.com/microsoft/DeepSpeedExamples/blob/master/training/HelloDeepSpeed/README.md
|
||||||
|
* https://www.alibabacloud.com/help/en/ack/cloud-native-ai-suite/user-guide/deepspeed-distributed-training
|
||||||
|
|
@ -0,0 +1,32 @@
|
||||||
|
# MPIJob manifest (API group kubeflow.org, version v2beta1) for the job
# named "deepspeed-helloworld".
# NOTE(review): indentation was lost in this rendering; the keys below are
# presumed to nest as in a standard MPIJob manifest -- confirm against the
# original file.
apiVersion: kubeflow.org/v2beta1
|
||||||
|
kind: MPIJob
|
||||||
|
metadata:
|
||||||
|
name: deepspeed-helloworld
|
||||||
|
spec:
|
||||||
|
# NOTE(review): presumably the number of MPI slots advertised per worker;
# the Worker container limit in this manifest is nvidia.com/gpu: 8, so
# confirm that 1 is intended.
slotsPerWorker: 1
|
||||||
|
runPolicy:
|
||||||
|
# NOTE(review): presumably controls which pods are removed once the job
# finishes -- confirm the exact semantics of "Running" in the MPI Operator
# API reference.
cleanPodPolicy: Running
|
||||||
|
# Replica specs follow: one entry for the Launcher and one for the Worker.
mpiReplicaSpecs:
|
||||||
|
# Launcher replica spec: a single replica whose container runs the
# `deepspeed` command with the HelloDeepSpeed BERT training script and
# `--checkpoint_dir /workspace` as arguments.
Launcher:
|
||||||
|
replicas: 1
|
||||||
|
template:
|
||||||
|
spec:
|
||||||
|
containers:
|
||||||
|
- image: registry.cn-beijing.aliyuncs.com/acs/deepspeed:hello-deepspeed
|
||||||
|
name: deepspeed-helloworld
|
||||||
|
# Container entry point; the items under `args` are passed to it.
command:
|
||||||
|
- deepspeed
|
||||||
|
args:
|
||||||
|
# Training script path inside the image.
- /workspace/DeepSpeedExamples/HelloDeepSpeed/train_bert_ds.py
|
||||||
|
# Flag and its value: the checkpoint directory is /workspace.
- --checkpoint_dir
|
||||||
|
- /workspace
|
||||||
|
# Worker replica spec: two replicas using the same image as the Launcher.
# No command or args are listed for the worker container here.
Worker:
|
||||||
|
replicas: 2
|
||||||
|
template:
|
||||||
|
spec:
|
||||||
|
containers:
|
||||||
|
- image: registry.cn-beijing.aliyuncs.com/acs/deepspeed:hello-deepspeed
|
||||||
|
name: deepspeed-helloworld
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
# Each worker container is limited to 8 `nvidia.com/gpu` resources.
# NOTE(review): `slotsPerWorker` in this manifest is 1 -- confirm the two
# values are meant to differ.
nvidia.com/gpu: 8
|
||||||
Loading…
Reference in New Issue