This commit is contained in:
kuizhiqing 2025-09-24 21:03:30 +08:00 committed by GitHub
commit 0e6e609ea8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 40 additions and 0 deletions

View File

@ -0,0 +1,8 @@
# DeepSpeed Example
This demo shows the basic usage of DeepSpeed with the MPI Operator (mpi-operator).
## References
* https://github.com/microsoft/DeepSpeedExamples/blob/master/training/HelloDeepSpeed/README.md
* https://www.alibabacloud.com/help/en/ack/cloud-native-ai-suite/user-guide/deepspeed-distributed-training

View File

@ -0,0 +1,32 @@
---
# MPIJob that runs the HelloDeepSpeed `train_bert_ds.py` script.
# A single launcher pod invokes the `deepspeed` command; two worker pods
# request the GPUs.
apiVersion: kubeflow.org/v2beta1
kind: MPIJob
metadata:
  name: deepspeed-helloworld
spec:
  # NOTE(review): each worker below requests 8 GPUs while only 1 slot per
  # worker is declared here. Presumably the slot count controls how many
  # processes are started per worker -- confirm whether this should be 8.
  slotsPerWorker: 1
  runPolicy:
    cleanPodPolicy: Running
  mpiReplicaSpecs:
    Launcher:
      replicas: 1
      template:
        spec:
          containers:
            - image: registry.cn-beijing.aliyuncs.com/acs/deepspeed:hello-deepspeed
              name: deepspeed-helloworld
              # Entry point is the `deepspeed` command; the training script
              # and its checkpoint directory are passed as arguments.
              command:
                - deepspeed
              args:
                - /workspace/DeepSpeedExamples/HelloDeepSpeed/train_bert_ds.py
                - --checkpoint_dir
                - /workspace
    Worker:
      replicas: 2
      template:
        spec:
          containers:
            # Same image as the launcher; no command is set for workers here.
            - image: registry.cn-beijing.aliyuncs.com/acs/deepspeed:hello-deepspeed
              name: deepspeed-helloworld
              resources:
                limits:
                  nvidia.com/gpu: 8