chore(test): Add E2E tests for Kubeflow Trainer (#2470)

* Add e2e tests for Kubeflow Trainer

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>

* Add timeout for papermill

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>

* Add output as part of make command

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>

* Add k8s version to setup cluster

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>

* Fix Kind k8s version

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>

* Fix 1.29 version

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>

* Create script to run Notebook

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>

* Download dataset when local_rank=0

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>

* Update test/e2e/e2e_test.go

Co-authored-by: Yuki Iwai <yuki.iwai.tz@gmail.com>
Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>

* Refactor Go e2e tests

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>

* Bump k8s to 1.29.14

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>

* Install Kind from go mod

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>

* Fix path for Kind package

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>

* Fix Go e2e

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>

* Reduce number of CPUs
Export Notebook as artifact

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>

* Print logs due to flaky test

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>

* Fix artifact path

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>

* docker pull image

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>

* Fix path

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>

* Add k8s version to output name

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>

* Remove install Kind cmd

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>

---------

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>
Co-authored-by: Yuki Iwai <yuki.iwai.tz@gmail.com>
This commit is contained in:
Andrey Velichkevich 2025-03-05 04:04:07 +00:00 committed by GitHub
parent 3ec8f0705f
commit 9e785750d0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
18 changed files with 645 additions and 399 deletions

View File

@ -1,99 +0,0 @@
# TODO (andreyvelich): Refactor this once we have e2e test for Kubeflow Trainer.
name: Setup E2E test template
description: A composite action to set up e2e tests
inputs:
kubernetes-version:
required: true
description: Kubernetes version
python-version:
required: true
description: Python version
gang-scheduler-name:
required: false
default: "none"
description: Gang scheduler name
runs:
using: composite
steps:
# This step is a Workaround to avoid the "No space left on device" error.
# ref: https://github.com/actions/runner-images/issues/2840
- name: Remove unnecessary files
shell: bash
run: |
echo "Disk usage before cleanup:"
df -hT
sudo rm -rf /usr/share/dotnet
sudo rm -rf /opt/ghc
sudo rm -rf /usr/local/share/boost
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
sudo rm -rf /usr/local/lib/android
sudo rm -rf /usr/local/share/powershell
sudo rm -rf /usr/share/swift
echo "Disk usage after cleanup:"
df -hT
- name: Prune docker images
shell: bash
run: |
docker image prune -a -f
docker system df
df -hT
- name: Move docker data directory
shell: bash
run: |
echo "Stopping docker service ..."
sudo systemctl stop docker
DOCKER_DEFAULT_ROOT_DIR=/var/lib/docker
DOCKER_ROOT_DIR=/mnt/docker
echo "Moving ${DOCKER_DEFAULT_ROOT_DIR} -> ${DOCKER_ROOT_DIR}"
sudo mv ${DOCKER_DEFAULT_ROOT_DIR} ${DOCKER_ROOT_DIR}
echo "Creating symlink ${DOCKER_DEFAULT_ROOT_DIR} -> ${DOCKER_ROOT_DIR}"
sudo ln -s ${DOCKER_ROOT_DIR} ${DOCKER_DEFAULT_ROOT_DIR}
echo "$(sudo ls -l ${DOCKER_DEFAULT_ROOT_DIR})"
echo "Starting docker service ..."
sudo systemctl daemon-reload
sudo systemctl start docker
echo "Docker service status:"
sudo systemctl --no-pager -l -o short status docker
- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: ${{ inputs.python-version }}
- name: Setup Go
uses: actions/setup-go@v5
with:
go-version-file: go.mod
- name: Create k8s Kind Cluster
uses: helm/kind-action@v1.11.0
with:
node_image: kindest/node:${{ inputs.kubernetes-version }}
cluster_name: training-operator-cluster
kubectl_version: ${{ inputs.kubernetes-version }}
- name: Build training-operator
shell: bash
run: |
./scripts/gha/build-image.sh
env:
TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test
- name: Deploy training operator
shell: bash
run: |
./scripts/gha/setup-training-operator.sh
docker system prune -a -f
docker system df
df -h
env:
KIND_CLUSTER: training-operator-cluster
TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test
GANG_SCHEDULER_NAME: ${{ inputs.gang-scheduler-name }}
KUBERNETES_VERSION: ${{ inputs.kubernetes-version }}

View File

@ -7,23 +7,61 @@ on:
jobs:
e2e-test:
name: E2E Test
runs-on: ubuntu-latest
runs-on:
labels: ubuntu-latest-16-cores
env:
GOPATH: ${{ github.workspace }}/go
defaults:
run:
working-directory: ${{ env.GOPATH }}/src/github.com/kubeflow/trainer
strategy:
fail-fast: false
matrix:
kubernetes-version: ["1.29.3", "1.30.0", "1.31.0"]
# Kubernetes versions for e2e tests on Kind cluster.
kubernetes-version: ["1.29.14", "1.30.0", "1.31.0"]
steps:
- name: Check out code
uses: actions/checkout@v4
with:
path: ${{ env.GOPATH }}/src/github.com/kubeflow/trainer
- name: Setup Go
uses: actions/setup-go@v5
with:
go-version-file: ${{ env.GOPATH }}/src/github.com/kubeflow/trainer/go.mod
- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
python-version: 3.11
- name: Install dependencies
run: |
echo "TODO (andreyvelich): Implement E2E Tests"
# pip install -U './sdk'
echo "Install Papermill"
pip install papermill==2.6.0 jupyter==1.1.1 ipykernel==6.29.5
echo "Install Kubeflow SDK"
pip install ./sdk
- name: Setup cluster
run: |
make test-e2e-setup-cluster K8S_VERSION=${{ matrix.kubernetes-version }}
- name: Run e2e with Go
run: |
make test-e2e
- name: Run e2e test for example Notebooks.
run: |
make test-e2e-notebook NOTEBOOK_INPUT=./examples/pytorch/image-classification/mnist.ipynb NOTEBOOK_OUTPUT=./mnist_output_${{ matrix.kubernetes-version }}.ipynb TIMEOUT=900
# TODO (andreyvelich): Discuss how we can upload artifacts for multiple Notebooks.
- name: Upload notebook
uses: actions/upload-artifact@v4
if: always()
with:
name: mnist_output_${{ matrix.kubernetes-version }}.ipynb
path: ${{ env.GOPATH }}/src/github.com/kubeflow/trainer/mnist_output_${{ matrix.kubernetes-version }}.ipynb
retention-days: 1

View File

@ -76,7 +76,7 @@ jobs:
- name: Run Go integration tests
run: |
make test-integration ENVTEST_K8S_VERSION=${{ matrix.kubernetes-version }}
make test-integration K8S_VERSION=${{ matrix.kubernetes-version }}
- name: Coveralls report
uses: shogo82148/actions-goveralls@v1

2
.gitignore vendored
View File

@ -13,6 +13,8 @@ __debug_bin
# Jupyter Notebooks.
**/.ipynb_checkpoints
# The default output for Notebook after Papermill execution.
trainer_output.ipynb
# Python cache files
__pycache__/

View File

@ -29,14 +29,16 @@ help: ## Display this help.
##@ Development
K8S_VERSION ?= 1.32.0
PROJECT_DIR := $(shell dirname $(abspath $(lastword $(MAKEFILE_LIST))))
# Tool Binaries
LOCALBIN ?= $(PROJECT_DIR)/bin
CONTROLLER_GEN ?= $(LOCALBIN)/controller-gen
ENVTEST ?= $(LOCALBIN)/setup-envtest
ENVTEST_K8S_VERSION ?= 1.32
KIND ?= $(LOCALBIN)/kind
# Instructions to download tools for development.
.PHONY: envtest
@ -47,6 +49,10 @@ envtest: ## Download the setup-envtest binary if required.
controller-gen: ## Download the controller-gen binary if required.
GOBIN=$(LOCALBIN) go install sigs.k8s.io/controller-tools/cmd/controller-gen@v0.17.2
.PHONY: kind
kind: ## Download Kind binary if required.
GOBIN=$(LOCALBIN) go install sigs.k8s.io/kind@$(shell go list -m -f '{{.Version}}' sigs.k8s.io/kind)
# Download external CRDs for Go integration testings.
EXTERNAL_CRDS_DIR ?= $(PROJECT_DIR)/manifests/external-crds
@ -106,8 +112,9 @@ test: ## Run Go unit test.
.PHONY: test-integration
test-integration: envtest jobset-operator-crd scheduler-plugins-crd ## Run Go integration test.
KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) -p path)" go test ./test/... -coverprofile cover.out
KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(K8S_VERSION) -p path)" go test ./test/integration/... -coverprofile cover.out
.PHONY: test-python
test-python: ## Run Python unit test.
export PYTHONPATH=$(PROJECT_DIR)
pip install pytest
@ -118,9 +125,26 @@ test-python: ## Run Python unit test.
pytest ./pkg/initializer/model
pytest ./pkg/initializer/utils
.PHONY: test-python-integration
test-python-integration: ## Run Python integration test.
export PYTHONPATH=$(PROJECT_DIR)
pip install pytest
pip install -r ./cmd/initializer/dataset/requirements.txt
pytest ./test/integration/initializer
.PHONY: test-e2e-setup-cluster
test-e2e-setup-cluster: kind ## Setup Kind cluster for e2e test.
KIND=$(KIND) K8S_VERSION=$(K8S_VERSION) ./hack/e2e-setup-cluster.sh
.PHONY: test-e2e
test-e2e: ## Run Go e2e test.
go test ./test/e2e/...
# Input and output location for Notebooks executed with Papermill.
NOTEBOOK_INPUT=$(PROJECT_DIR)/examples/pytorch/image-classification/mnist.ipynb
NOTEBOOK_OUTPUT=$(PROJECT_DIR)/trainer_output.ipynb
PAPERMILL_TIMEOUT=900
.PHONY: test-e2e-notebook
test-e2e-notebook: ## Run Jupyter Notebook with Papermill.
NOTEBOOK_INPUT=$(NOTEBOOK_INPUT) NOTEBOOK_OUTPUT=$(NOTEBOOK_OUTPUT) PAPERMILL_TIMEOUT=$(PAPERMILL_TIMEOUT) ./hack/e2e-run-notebook.sh

View File

@ -33,7 +33,7 @@
"outputs": [],
"source": [
"# TODO (astefanutti): Change to the Kubeflow SDK when it's available.\n",
"!pip install git+https://github.com/kubeflow/trainer.git@master#subdirectory=sdk"
"# !pip install git+https://github.com/kubeflow/trainer.git@master#subdirectory=sdk"
]
},
{
@ -66,7 +66,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
@ -119,14 +119,24 @@
" model = nn.parallel.DistributedDataParallel(Net().to(device))\n",
" optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)\n",
"\n",
" # Retrieve the Fashion-MNIST dataset\n",
" \n",
" # Download FashionMNIST dataset only on local_rank=0 process.\n",
" if local_rank == 0:\n",
" dataset = datasets.FashionMNIST(\n",
" \"./data\",\n",
" train=True,\n",
" download=True,\n",
" transform=transforms.Compose([transforms.ToTensor()]),\n",
" )\n",
" dist.barrier()\n",
" dataset = datasets.FashionMNIST(\n",
" \"./data\",\n",
" train=True,\n",
" download=True,\n",
" download=False,\n",
" transform=transforms.Compose([transforms.ToTensor()]),\n",
" )\n",
"\n",
"\n",
    "    # Shard the dataset across workers.\n",
" train_loader = DataLoader(\n",
" dataset,\n",
@ -135,6 +145,7 @@
" )\n",
"\n",
" # TODO(astefanutti): add parameters to the training function\n",
" dist.barrier()\n",
" for epoch in range(1, 3):\n",
" model.train()\n",
"\n",
@ -181,7 +192,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 4,
"metadata": {},
"outputs": [
{
@ -190,126 +201,195 @@
"text": [
"Using Device: cpu, Backend: gloo\n",
"Distributed Training for WORLD_SIZE: 1, RANK: 0, LOCAL_RANK: 0\n",
"Train Epoch: 1 [0/60000 (0%)]\tLoss: 2.309967\n",
"Train Epoch: 1 [1000/60000 (2%)]\tLoss: 2.045446\n",
"Train Epoch: 1 [2000/60000 (3%)]\tLoss: 1.398883\n",
"Train Epoch: 1 [3000/60000 (5%)]\tLoss: 0.992089\n",
"Train Epoch: 1 [4000/60000 (7%)]\tLoss: 1.122684\n",
"Train Epoch: 1 [5000/60000 (8%)]\tLoss: 1.031676\n",
"Train Epoch: 1 [6000/60000 (10%)]\tLoss: 0.649529\n",
"Train Epoch: 1 [7000/60000 (12%)]\tLoss: 0.804960\n",
"Train Epoch: 1 [8000/60000 (13%)]\tLoss: 0.709698\n",
"Train Epoch: 1 [9000/60000 (15%)]\tLoss: 0.632330\n",
"Train Epoch: 1 [10000/60000 (17%)]\tLoss: 0.695469\n",
"Train Epoch: 1 [11000/60000 (18%)]\tLoss: 0.646323\n",
"Train Epoch: 1 [12000/60000 (20%)]\tLoss: 0.521877\n",
"Train Epoch: 1 [13000/60000 (22%)]\tLoss: 0.592377\n",
"Train Epoch: 1 [14000/60000 (23%)]\tLoss: 0.686853\n",
"Train Epoch: 1 [15000/60000 (25%)]\tLoss: 0.678805\n",
"Train Epoch: 1 [16000/60000 (27%)]\tLoss: 0.658783\n",
"Train Epoch: 1 [17000/60000 (28%)]\tLoss: 0.540468\n",
"Train Epoch: 1 [18000/60000 (30%)]\tLoss: 0.456685\n",
"Train Epoch: 1 [19000/60000 (32%)]\tLoss: 0.561984\n",
"Train Epoch: 1 [20000/60000 (33%)]\tLoss: 0.453478\n",
"Train Epoch: 1 [21000/60000 (35%)]\tLoss: 0.399682\n",
"Train Epoch: 1 [22000/60000 (37%)]\tLoss: 0.432961\n",
"Train Epoch: 1 [23000/60000 (38%)]\tLoss: 0.611499\n",
"Train Epoch: 1 [24000/60000 (40%)]\tLoss: 0.552892\n",
"Train Epoch: 1 [25000/60000 (42%)]\tLoss: 0.409226\n",
"Train Epoch: 1 [26000/60000 (43%)]\tLoss: 0.569662\n",
"Train Epoch: 1 [27000/60000 (45%)]\tLoss: 0.379728\n",
"Train Epoch: 1 [28000/60000 (47%)]\tLoss: 0.420447\n",
"Train Epoch: 1 [29000/60000 (48%)]\tLoss: 0.410670\n",
"Train Epoch: 1 [30000/60000 (50%)]\tLoss: 0.480141\n",
"Train Epoch: 1 [31000/60000 (52%)]\tLoss: 0.425981\n",
"Train Epoch: 1 [32000/60000 (53%)]\tLoss: 0.345157\n",
"Train Epoch: 1 [33000/60000 (55%)]\tLoss: 0.323578\n",
"Train Epoch: 1 [34000/60000 (57%)]\tLoss: 0.537613\n",
"Train Epoch: 1 [35000/60000 (58%)]\tLoss: 0.523302\n",
"Train Epoch: 1 [36000/60000 (60%)]\tLoss: 0.426407\n",
"Train Epoch: 1 [37000/60000 (62%)]\tLoss: 0.356403\n",
"Train Epoch: 1 [38000/60000 (63%)]\tLoss: 0.516297\n",
"Train Epoch: 1 [39000/60000 (65%)]\tLoss: 0.406655\n",
"Train Epoch: 1 [40000/60000 (67%)]\tLoss: 0.314193\n",
"Train Epoch: 1 [41000/60000 (68%)]\tLoss: 0.467424\n",
"Train Epoch: 1 [42000/60000 (70%)]\tLoss: 0.457645\n",
"Train Epoch: 1 [43000/60000 (72%)]\tLoss: 0.388591\n",
"Train Epoch: 1 [44000/60000 (73%)]\tLoss: 0.386649\n",
"Train Epoch: 1 [45000/60000 (75%)]\tLoss: 0.282575\n",
"Train Epoch: 1 [46000/60000 (77%)]\tLoss: 0.446804\n",
"Train Epoch: 1 [47000/60000 (78%)]\tLoss: 0.418433\n",
"Train Epoch: 1 [48000/60000 (80%)]\tLoss: 0.575584\n",
"Train Epoch: 1 [49000/60000 (82%)]\tLoss: 0.382036\n",
"Train Epoch: 1 [50000/60000 (83%)]\tLoss: 0.299168\n",
"Train Epoch: 1 [51000/60000 (85%)]\tLoss: 0.423421\n",
"Train Epoch: 1 [52000/60000 (87%)]\tLoss: 0.425236\n",
"Train Epoch: 1 [53000/60000 (88%)]\tLoss: 0.403723\n",
"Train Epoch: 1 [54000/60000 (90%)]\tLoss: 0.303039\n",
"Train Epoch: 1 [55000/60000 (92%)]\tLoss: 0.375983\n",
"Train Epoch: 1 [56000/60000 (93%)]\tLoss: 0.434169\n",
"Train Epoch: 1 [57000/60000 (95%)]\tLoss: 0.429213\n",
"Train Epoch: 1 [58000/60000 (97%)]\tLoss: 0.354376\n",
"Train Epoch: 1 [59000/60000 (98%)]\tLoss: 0.305779\n",
"Train Epoch: 2 [0/60000 (0%)]\tLoss: 0.437120\n",
"Train Epoch: 2 [1000/60000 (2%)]\tLoss: 0.464603\n",
"Train Epoch: 2 [2000/60000 (3%)]\tLoss: 0.284665\n",
"Train Epoch: 2 [3000/60000 (5%)]\tLoss: 0.369253\n",
"Train Epoch: 2 [4000/60000 (7%)]\tLoss: 0.468896\n",
"Train Epoch: 2 [5000/60000 (8%)]\tLoss: 0.388527\n",
"Train Epoch: 2 [6000/60000 (10%)]\tLoss: 0.474483\n",
"Train Epoch: 2 [7000/60000 (12%)]\tLoss: 0.373588\n",
"Train Epoch: 2 [8000/60000 (13%)]\tLoss: 0.443588\n",
"Train Epoch: 2 [9000/60000 (15%)]\tLoss: 0.449592\n",
"Train Epoch: 2 [10000/60000 (17%)]\tLoss: 0.363776\n",
"Train Epoch: 2 [11000/60000 (18%)]\tLoss: 0.400426\n",
"Train Epoch: 2 [12000/60000 (20%)]\tLoss: 0.282801\n",
"Train Epoch: 2 [13000/60000 (22%)]\tLoss: 0.288877\n",
"Train Epoch: 2 [14000/60000 (23%)]\tLoss: 0.549093\n",
"Train Epoch: 2 [15000/60000 (25%)]\tLoss: 0.359002\n",
"Train Epoch: 2 [16000/60000 (27%)]\tLoss: 0.322263\n",
"Train Epoch: 2 [17000/60000 (28%)]\tLoss: 0.289489\n",
"Train Epoch: 2 [18000/60000 (30%)]\tLoss: 0.279724\n",
"Train Epoch: 2 [19000/60000 (32%)]\tLoss: 0.452595\n",
"Train Epoch: 2 [20000/60000 (33%)]\tLoss: 0.334388\n",
"Train Epoch: 2 [21000/60000 (35%)]\tLoss: 0.340985\n",
"Train Epoch: 2 [22000/60000 (37%)]\tLoss: 0.247467\n",
"Train Epoch: 2 [23000/60000 (38%)]\tLoss: 0.439283\n",
"Train Epoch: 2 [24000/60000 (40%)]\tLoss: 0.270795\n",
"Train Epoch: 2 [25000/60000 (42%)]\tLoss: 0.283242\n",
"Train Epoch: 2 [26000/60000 (43%)]\tLoss: 0.377896\n",
"Train Epoch: 2 [27000/60000 (45%)]\tLoss: 0.264453\n",
"Train Epoch: 2 [28000/60000 (47%)]\tLoss: 0.328696\n",
"Train Epoch: 2 [29000/60000 (48%)]\tLoss: 0.294168\n",
"Train Epoch: 2 [30000/60000 (50%)]\tLoss: 0.421162\n",
"Train Epoch: 2 [31000/60000 (52%)]\tLoss: 0.306932\n",
"Train Epoch: 2 [32000/60000 (53%)]\tLoss: 0.297351\n",
"Train Epoch: 2 [33000/60000 (55%)]\tLoss: 0.261608\n",
"Train Epoch: 2 [34000/60000 (57%)]\tLoss: 0.413534\n",
"Train Epoch: 2 [35000/60000 (58%)]\tLoss: 0.433157\n",
"Train Epoch: 2 [36000/60000 (60%)]\tLoss: 0.390571\n",
"Train Epoch: 2 [37000/60000 (62%)]\tLoss: 0.242159\n",
"Train Epoch: 2 [38000/60000 (63%)]\tLoss: 0.347628\n",
"Train Epoch: 2 [39000/60000 (65%)]\tLoss: 0.321216\n",
"Train Epoch: 2 [40000/60000 (67%)]\tLoss: 0.285891\n",
"Train Epoch: 2 [41000/60000 (68%)]\tLoss: 0.401335\n",
"Train Epoch: 2 [42000/60000 (70%)]\tLoss: 0.357113\n",
"Train Epoch: 2 [43000/60000 (72%)]\tLoss: 0.321728\n",
"Train Epoch: 2 [44000/60000 (73%)]\tLoss: 0.266073\n",
"Train Epoch: 2 [45000/60000 (75%)]\tLoss: 0.235082\n",
"Train Epoch: 2 [46000/60000 (77%)]\tLoss: 0.329955\n",
"Train Epoch: 2 [47000/60000 (78%)]\tLoss: 0.351680\n",
"Train Epoch: 2 [48000/60000 (80%)]\tLoss: 0.509699\n",
"Train Epoch: 2 [49000/60000 (82%)]\tLoss: 0.281432\n",
"Train Epoch: 2 [50000/60000 (83%)]\tLoss: 0.262006\n",
"Train Epoch: 2 [51000/60000 (85%)]\tLoss: 0.432544\n",
"Train Epoch: 2 [52000/60000 (87%)]\tLoss: 0.332725\n",
"Train Epoch: 2 [53000/60000 (88%)]\tLoss: 0.313516\n",
"Train Epoch: 2 [54000/60000 (90%)]\tLoss: 0.266921\n",
"Train Epoch: 2 [55000/60000 (92%)]\tLoss: 0.279880\n",
"Train Epoch: 2 [56000/60000 (93%)]\tLoss: 0.329515\n",
"Train Epoch: 2 [57000/60000 (95%)]\tLoss: 0.379902\n",
"Train Epoch: 2 [58000/60000 (97%)]\tLoss: 0.252111\n",
"Train Epoch: 2 [59000/60000 (98%)]\tLoss: 0.267555\n",
"Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz\n",
"Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to ./data/FashionMNIST/raw/train-images-idx3-ubyte.gz\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26.4M/26.4M [00:02<00:00, 10.9MB/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Extracting ./data/FashionMNIST/raw/train-images-idx3-ubyte.gz to ./data/FashionMNIST/raw\n",
"\n",
"Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz\n",
"Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw/train-labels-idx1-ubyte.gz\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29.5k/29.5k [00:00<00:00, 1.50MB/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Extracting ./data/FashionMNIST/raw/train-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw\n",
"\n",
"Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz\n",
"Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to ./data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4.42M/4.42M [00:00<00:00, 8.57MB/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Extracting ./data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz to ./data/FashionMNIST/raw\n",
"\n",
"Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz\n",
"Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5.15k/5.15k [00:00<00:00, 4.41MB/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Extracting ./data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw\n",
"\n",
"Train Epoch: 1 [0/60000 (0%)]\tLoss: 2.312359\n",
"Train Epoch: 1 [1000/60000 (2%)]\tLoss: 2.058874\n",
"Train Epoch: 1 [2000/60000 (3%)]\tLoss: 1.059449\n",
"Train Epoch: 1 [3000/60000 (5%)]\tLoss: 1.130449\n",
"Train Epoch: 1 [4000/60000 (7%)]\tLoss: 1.027112\n",
"Train Epoch: 1 [5000/60000 (8%)]\tLoss: 0.845158\n",
"Train Epoch: 1 [6000/60000 (10%)]\tLoss: 0.747619\n",
"Train Epoch: 1 [7000/60000 (12%)]\tLoss: 0.731784\n",
"Train Epoch: 1 [8000/60000 (13%)]\tLoss: 0.623452\n",
"Train Epoch: 1 [9000/60000 (15%)]\tLoss: 0.666745\n",
"Train Epoch: 1 [10000/60000 (17%)]\tLoss: 0.597833\n",
"Train Epoch: 1 [11000/60000 (18%)]\tLoss: 0.634247\n",
"Train Epoch: 1 [12000/60000 (20%)]\tLoss: 0.533735\n",
"Train Epoch: 1 [13000/60000 (22%)]\tLoss: 0.545662\n",
"Train Epoch: 1 [14000/60000 (23%)]\tLoss: 0.637130\n",
"Train Epoch: 1 [15000/60000 (25%)]\tLoss: 0.596679\n",
"Train Epoch: 1 [16000/60000 (27%)]\tLoss: 0.505523\n",
"Train Epoch: 1 [17000/60000 (28%)]\tLoss: 0.484452\n",
"Train Epoch: 1 [18000/60000 (30%)]\tLoss: 0.494081\n",
"Train Epoch: 1 [19000/60000 (32%)]\tLoss: 0.601508\n",
"Train Epoch: 1 [20000/60000 (33%)]\tLoss: 0.485598\n",
"Train Epoch: 1 [21000/60000 (35%)]\tLoss: 0.402011\n",
"Train Epoch: 1 [22000/60000 (37%)]\tLoss: 0.350119\n",
"Train Epoch: 1 [23000/60000 (38%)]\tLoss: 0.575456\n",
"Train Epoch: 1 [24000/60000 (40%)]\tLoss: 0.402916\n",
"Train Epoch: 1 [25000/60000 (42%)]\tLoss: 0.335860\n",
"Train Epoch: 1 [26000/60000 (43%)]\tLoss: 0.498826\n",
"Train Epoch: 1 [27000/60000 (45%)]\tLoss: 0.331436\n",
"Train Epoch: 1 [28000/60000 (47%)]\tLoss: 0.466167\n",
"Train Epoch: 1 [29000/60000 (48%)]\tLoss: 0.514644\n",
"Train Epoch: 1 [30000/60000 (50%)]\tLoss: 0.423657\n",
"Train Epoch: 1 [31000/60000 (52%)]\tLoss: 0.424075\n",
"Train Epoch: 1 [32000/60000 (53%)]\tLoss: 0.365285\n",
"Train Epoch: 1 [33000/60000 (55%)]\tLoss: 0.289063\n",
"Train Epoch: 1 [34000/60000 (57%)]\tLoss: 0.538227\n",
"Train Epoch: 1 [35000/60000 (58%)]\tLoss: 0.546361\n",
"Train Epoch: 1 [36000/60000 (60%)]\tLoss: 0.383847\n",
"Train Epoch: 1 [37000/60000 (62%)]\tLoss: 0.311942\n",
"Train Epoch: 1 [38000/60000 (63%)]\tLoss: 0.458801\n",
"Train Epoch: 1 [39000/60000 (65%)]\tLoss: 0.481895\n",
"Train Epoch: 1 [40000/60000 (67%)]\tLoss: 0.308024\n",
"Train Epoch: 1 [41000/60000 (68%)]\tLoss: 0.435803\n",
"Train Epoch: 1 [42000/60000 (70%)]\tLoss: 0.457417\n",
"Train Epoch: 1 [43000/60000 (72%)]\tLoss: 0.310509\n",
"Train Epoch: 1 [44000/60000 (73%)]\tLoss: 0.347369\n",
"Train Epoch: 1 [45000/60000 (75%)]\tLoss: 0.341391\n",
"Train Epoch: 1 [46000/60000 (77%)]\tLoss: 0.464614\n",
"Train Epoch: 1 [47000/60000 (78%)]\tLoss: 0.432629\n",
"Train Epoch: 1 [48000/60000 (80%)]\tLoss: 0.519174\n",
"Train Epoch: 1 [49000/60000 (82%)]\tLoss: 0.368225\n",
"Train Epoch: 1 [50000/60000 (83%)]\tLoss: 0.325527\n",
"Train Epoch: 1 [51000/60000 (85%)]\tLoss: 0.455275\n",
"Train Epoch: 1 [52000/60000 (87%)]\tLoss: 0.423473\n",
"Train Epoch: 1 [53000/60000 (88%)]\tLoss: 0.354114\n",
"Train Epoch: 1 [54000/60000 (90%)]\tLoss: 0.328097\n",
"Train Epoch: 1 [55000/60000 (92%)]\tLoss: 0.353430\n",
"Train Epoch: 1 [56000/60000 (93%)]\tLoss: 0.419119\n",
"Train Epoch: 1 [57000/60000 (95%)]\tLoss: 0.383263\n",
"Train Epoch: 1 [58000/60000 (97%)]\tLoss: 0.292101\n",
"Train Epoch: 1 [59000/60000 (98%)]\tLoss: 0.251011\n",
"Train Epoch: 2 [0/60000 (0%)]\tLoss: 0.412635\n",
"Train Epoch: 2 [1000/60000 (2%)]\tLoss: 0.467144\n",
"Train Epoch: 2 [2000/60000 (3%)]\tLoss: 0.246448\n",
"Train Epoch: 2 [3000/60000 (5%)]\tLoss: 0.341133\n",
"Train Epoch: 2 [4000/60000 (7%)]\tLoss: 0.412845\n",
"Train Epoch: 2 [5000/60000 (8%)]\tLoss: 0.338513\n",
"Train Epoch: 2 [6000/60000 (10%)]\tLoss: 0.426739\n",
"Train Epoch: 2 [7000/60000 (12%)]\tLoss: 0.387864\n",
"Train Epoch: 2 [8000/60000 (13%)]\tLoss: 0.441497\n",
"Train Epoch: 2 [9000/60000 (15%)]\tLoss: 0.475687\n",
"Train Epoch: 2 [10000/60000 (17%)]\tLoss: 0.390743\n",
"Train Epoch: 2 [11000/60000 (18%)]\tLoss: 0.432761\n",
"Train Epoch: 2 [12000/60000 (20%)]\tLoss: 0.306278\n",
"Train Epoch: 2 [13000/60000 (22%)]\tLoss: 0.322986\n",
"Train Epoch: 2 [14000/60000 (23%)]\tLoss: 0.460900\n",
"Train Epoch: 2 [15000/60000 (25%)]\tLoss: 0.350266\n",
"Train Epoch: 2 [16000/60000 (27%)]\tLoss: 0.274468\n",
"Train Epoch: 2 [17000/60000 (28%)]\tLoss: 0.341584\n",
"Train Epoch: 2 [18000/60000 (30%)]\tLoss: 0.237921\n",
"Train Epoch: 2 [19000/60000 (32%)]\tLoss: 0.387657\n",
"Train Epoch: 2 [20000/60000 (33%)]\tLoss: 0.379017\n",
"Train Epoch: 2 [21000/60000 (35%)]\tLoss: 0.267510\n",
"Train Epoch: 2 [22000/60000 (37%)]\tLoss: 0.270873\n",
"Train Epoch: 2 [23000/60000 (38%)]\tLoss: 0.437596\n",
"Train Epoch: 2 [24000/60000 (40%)]\tLoss: 0.265560\n",
"Train Epoch: 2 [25000/60000 (42%)]\tLoss: 0.279858\n",
"Train Epoch: 2 [26000/60000 (43%)]\tLoss: 0.332311\n",
"Train Epoch: 2 [27000/60000 (45%)]\tLoss: 0.225879\n",
"Train Epoch: 2 [28000/60000 (47%)]\tLoss: 0.278610\n",
"Train Epoch: 2 [29000/60000 (48%)]\tLoss: 0.286307\n",
"Train Epoch: 2 [30000/60000 (50%)]\tLoss: 0.413894\n",
"Train Epoch: 2 [31000/60000 (52%)]\tLoss: 0.308004\n",
"Train Epoch: 2 [32000/60000 (53%)]\tLoss: 0.280563\n",
"Train Epoch: 2 [33000/60000 (55%)]\tLoss: 0.264711\n",
"Train Epoch: 2 [34000/60000 (57%)]\tLoss: 0.417848\n",
"Train Epoch: 2 [35000/60000 (58%)]\tLoss: 0.512428\n",
"Train Epoch: 2 [36000/60000 (60%)]\tLoss: 0.319987\n",
"Train Epoch: 2 [37000/60000 (62%)]\tLoss: 0.251197\n",
"Train Epoch: 2 [38000/60000 (63%)]\tLoss: 0.325069\n",
"Train Epoch: 2 [39000/60000 (65%)]\tLoss: 0.303394\n",
"Train Epoch: 2 [40000/60000 (67%)]\tLoss: 0.280159\n",
"Train Epoch: 2 [41000/60000 (68%)]\tLoss: 0.426005\n",
"Train Epoch: 2 [42000/60000 (70%)]\tLoss: 0.363313\n",
"Train Epoch: 2 [43000/60000 (72%)]\tLoss: 0.271922\n",
"Train Epoch: 2 [44000/60000 (73%)]\tLoss: 0.266253\n",
"Train Epoch: 2 [45000/60000 (75%)]\tLoss: 0.238750\n",
"Train Epoch: 2 [46000/60000 (77%)]\tLoss: 0.376987\n",
"Train Epoch: 2 [47000/60000 (78%)]\tLoss: 0.282419\n",
"Train Epoch: 2 [48000/60000 (80%)]\tLoss: 0.462140\n",
"Train Epoch: 2 [49000/60000 (82%)]\tLoss: 0.255249\n",
"Train Epoch: 2 [50000/60000 (83%)]\tLoss: 0.241605\n",
"Train Epoch: 2 [51000/60000 (85%)]\tLoss: 0.440393\n",
"Train Epoch: 2 [52000/60000 (87%)]\tLoss: 0.313707\n",
"Train Epoch: 2 [53000/60000 (88%)]\tLoss: 0.338831\n",
"Train Epoch: 2 [54000/60000 (90%)]\tLoss: 0.236808\n",
"Train Epoch: 2 [55000/60000 (92%)]\tLoss: 0.262569\n",
"Train Epoch: 2 [56000/60000 (93%)]\tLoss: 0.415122\n",
"Train Epoch: 2 [57000/60000 (95%)]\tLoss: 0.362813\n",
"Train Epoch: 2 [58000/60000 (97%)]\tLoss: 0.321227\n",
"Train Epoch: 2 [59000/60000 (98%)]\tLoss: 0.239889\n",
"Training is finished\n"
]
}
@ -345,7 +425,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 5,
"metadata": {
"pycharm": {
"name": "#%%\n"
@ -371,13 +451,14 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Runtime(name='mpi-distributed', phase='Unknown', accelerator='Unknown', accelerator_count='Unknown')\n",
"Runtime(name='torch-distributed', phase='pre-training', accelerator='gpu-tesla-v100-16gb', accelerator_count='4')\n"
]
}
@ -393,12 +474,12 @@
"source": [
"## Run the Distributed TrainJob\n",
"\n",
"Kubeflow TrainJob will train the above model on 4 PyTorch nodes."
"Kubeflow TrainJob will train the above model on 3 PyTorch nodes."
]
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
@ -408,13 +489,13 @@
" trainer=Trainer(\n",
" func=train_fashion_mnist,\n",
" # Set how many PyTorch nodes you want to use for distributed training.\n",
" num_nodes=4,\n",
" num_nodes=3,\n",
" # Set the resources for each PyTorch node.\n",
" resources_per_node={\n",
" \"cpu\": 5,\n",
" \"cpu\": 3,\n",
" \"memory\": \"16Gi\",\n",
" # Comment this to distribute the TrainJob using CPU nodes.\n",
" \"nvidia.com/gpu\": 1,\n",
" # Uncomment this to distribute the TrainJob using GPU nodes.\n",
" # \"nvidia.com/gpu\": 1,\n",
" },\n",
" ),\n",
")"
@ -428,14 +509,42 @@
"\n",
"You can check the components of TrainJob that's created.\n",
"\n",
"Since the TrainJob performs distributed training across 4 nodes, it generates 4 components: `trainer-node-0` .. `trainer-node-3`.\n",
"Since the TrainJob performs distributed training across 3 nodes, it generates 3 components: `trainer-node-0` .. `trainer-node-2`.\n",
"\n",
"You can get the individual status for each of these components."
]
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"import time\n",
"\n",
"def wait_for_job_running():\n",
" for _ in range(100):\n",
" trainjob = client.get_job(name=job_name)\n",
" for c in trainjob.components:\n",
" if c.name == \"trainer-node-0\" and c.status == \"Running\":\n",
" return\n",
" print(\"Wait for TrainJob running status. Sleep for 5 seconds\")\n",
" time.sleep(5)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"# TODO (andreyvelich): Use wait_for_job_status API from TrainerClient() when it is implemented.\n",
"wait_for_job_running()"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
@ -447,8 +556,6 @@
"Component: trainer-node-1, Status: Running, Devices: gpu x 1\n",
"\n",
"Component: trainer-node-2, Status: Running, Devices: gpu x 1\n",
"\n",
"Component: trainer-node-3, Status: Running, Devices: gpu x 1\n",
"\n"
]
}
@ -466,12 +573,12 @@
"\n",
"We can use the `get_job_logs()` API to get the TrainJob logs.\n",
"\n",
"Since we run training on 4 GPUs, every PyTorch node uses 60,000/4 = 15,000 images from the dataset."
"Since we run training on 3 GPUs, every PyTorch node uses 60,000/3 = 20,000 images from the dataset."
]
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 23,
"metadata": {},
"outputs": [
{
@ -479,53 +586,63 @@
"output_type": "stream",
"text": [
"[trainer-node]: Using Device: cuda, Backend: nccl\n",
"[trainer-node]: Distributed Training for WORLD_SIZE: 4, RANK: 0, LOCAL_RANK: 0\n",
"[trainer-node]: Distributed Training for WORLD_SIZE: 3, RANK: 0, LOCAL_RANK: 0\n",
"[trainer-node]: Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz\n",
"[trainer-node]: Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to ./data/FashionMNIST/raw/train-images-idx3-ubyte.gz\n",
"100%|██████████| 26.4M/26.4M [00:02<00:00, 12.5MB/s]\n",
"100%|██████████| 26.4M/26.4M [00:02<00:00, 11.1MB/s]\n",
"[trainer-node]: Extracting ./data/FashionMNIST/raw/train-images-idx3-ubyte.gz to ./data/FashionMNIST/raw\n",
"[trainer-node]: Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz\n",
"[trainer-node]: Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw/train-labels-idx1-ubyte.gz\n",
"100%|██████████| 29.5k/29.5k [00:00<00:00, 214kB/s]\n",
"100%|██████████| 29.5k/29.5k [00:00<00:00, 74.8MB/s]\n",
"[trainer-node]: Extracting ./data/FashionMNIST/raw/train-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw\n",
"[trainer-node]: Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz\n",
"[trainer-node]: Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to ./data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz\n",
"100%|██████████| 4.42M/4.42M [00:01<00:00, 3.50MB/s]\n",
"100%|██████████| 4.42M/4.42M [00:01<00:00, 3.90MB/s]\n",
"[trainer-node]: Extracting ./data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz to ./data/FashionMNIST/raw\n",
"[trainer-node]: Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz\n",
"[trainer-node]: Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz\n",
"100%|██████████| 5.15k/5.15k [00:00<00:00, 37.8MB/s]\n",
"100%|██████████| 5.15k/5.15k [00:00<00:00, 38.1MB/s]\n",
"[trainer-node]: Extracting ./data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw\n",
"[trainer-node]: Train Epoch: 1 [0/60000 (0%)]\tLoss: 2.305451\n",
"[trainer-node]: Train Epoch: 1 [1000/60000 (7%)]\tLoss: 2.056247\n",
"[trainer-node]: Train Epoch: 1 [2000/60000 (13%)]\tLoss: 2.166955\n",
"[trainer-node]: Train Epoch: 1 [3000/60000 (20%)]\tLoss: 1.045183\n",
"[trainer-node]: Train Epoch: 1 [4000/60000 (27%)]\tLoss: 0.767518\n",
"[trainer-node]: Train Epoch: 1 [5000/60000 (33%)]\tLoss: 0.697382\n",
"[trainer-node]: Train Epoch: 1 [6000/60000 (40%)]\tLoss: 0.638373\n",
"[trainer-node]: Train Epoch: 1 [7000/60000 (47%)]\tLoss: 0.667810\n",
"[trainer-node]: Train Epoch: 1 [8000/60000 (53%)]\tLoss: 0.541413\n",
"[trainer-node]: Train Epoch: 1 [9000/60000 (60%)]\tLoss: 0.564223\n",
"[trainer-node]: Train Epoch: 1 [10000/60000 (67%)]\tLoss: 0.425999\n",
"[trainer-node]: Train Epoch: 1 [11000/60000 (73%)]\tLoss: 0.564535\n",
"[trainer-node]: Train Epoch: 1 [12000/60000 (80%)]\tLoss: 0.459158\n",
"[trainer-node]: Train Epoch: 1 [13000/60000 (87%)]\tLoss: 0.545110\n",
"[trainer-node]: Train Epoch: 1 [14000/60000 (93%)]\tLoss: 0.471710\n",
"[trainer-node]: Train Epoch: 2 [0/60000 (0%)]\tLoss: 0.520992\n",
"[trainer-node]: Train Epoch: 2 [1000/60000 (7%)]\tLoss: 0.440295\n",
"[trainer-node]: Train Epoch: 2 [2000/60000 (13%)]\tLoss: 0.436745\n",
"[trainer-node]: Train Epoch: 2 [3000/60000 (20%)]\tLoss: 0.359110\n",
"[trainer-node]: Train Epoch: 2 [4000/60000 (27%)]\tLoss: 0.493791\n",
"[trainer-node]: Train Epoch: 2 [5000/60000 (33%)]\tLoss: 0.384616\n",
"[trainer-node]: Train Epoch: 2 [6000/60000 (40%)]\tLoss: 0.529568\n",
"[trainer-node]: Train Epoch: 2 [7000/60000 (47%)]\tLoss: 0.443400\n",
"[trainer-node]: Train Epoch: 2 [8000/60000 (53%)]\tLoss: 0.352168\n",
"[trainer-node]: Train Epoch: 2 [9000/60000 (60%)]\tLoss: 0.431930\n",
"[trainer-node]: Train Epoch: 2 [10000/60000 (67%)]\tLoss: 0.282820\n",
"[trainer-node]: Train Epoch: 2 [11000/60000 (73%)]\tLoss: 0.412141\n",
"[trainer-node]: Train Epoch: 2 [12000/60000 (80%)]\tLoss: 0.367190\n",
"[trainer-node]: Train Epoch: 2 [13000/60000 (87%)]\tLoss: 0.355502\n",
"[trainer-node]: Train Epoch: 2 [14000/60000 (93%)]\tLoss: 0.326105\n",
"[trainer-node]: Train Epoch: 1 [0/60000 (0%)]\tLoss: 2.298486\n",
"[trainer-node]: Train Epoch: 1 [1000/60000 (5%)]\tLoss: 2.023109\n",
"[trainer-node]: Train Epoch: 1 [2000/60000 (10%)]\tLoss: 2.210286\n",
"[trainer-node]: Train Epoch: 1 [3000/60000 (15%)]\tLoss: 1.308523\n",
"[trainer-node]: Train Epoch: 1 [4000/60000 (20%)]\tLoss: 0.896595\n",
"[trainer-node]: Train Epoch: 1 [5000/60000 (25%)]\tLoss: 0.790926\n",
"[trainer-node]: Train Epoch: 1 [6000/60000 (30%)]\tLoss: 0.694305\n",
"[trainer-node]: Train Epoch: 1 [7000/60000 (35%)]\tLoss: 0.532733\n",
"[trainer-node]: Train Epoch: 1 [8000/60000 (40%)]\tLoss: 0.571943\n",
"[trainer-node]: Train Epoch: 1 [9000/60000 (45%)]\tLoss: 0.593324\n",
"[trainer-node]: Train Epoch: 1 [10000/60000 (50%)]\tLoss: 0.570712\n",
"[trainer-node]: Train Epoch: 1 [11000/60000 (55%)]\tLoss: 0.416316\n",
"[trainer-node]: Train Epoch: 1 [12000/60000 (60%)]\tLoss: 0.438910\n",
"[trainer-node]: Train Epoch: 1 [13000/60000 (65%)]\tLoss: 0.486123\n",
"[trainer-node]: Train Epoch: 1 [14000/60000 (70%)]\tLoss: 0.432043\n",
"[trainer-node]: Train Epoch: 1 [15000/60000 (75%)]\tLoss: 0.374424\n",
"[trainer-node]: Train Epoch: 1 [16000/60000 (80%)]\tLoss: 0.366622\n",
"[trainer-node]: Train Epoch: 1 [17000/60000 (85%)]\tLoss: 0.495783\n",
"[trainer-node]: Train Epoch: 1 [18000/60000 (90%)]\tLoss: 0.381096\n",
"[trainer-node]: Train Epoch: 1 [19000/60000 (95%)]\tLoss: 0.385782\n",
"[trainer-node]: Train Epoch: 2 [0/60000 (0%)]\tLoss: 0.380943\n",
"[trainer-node]: Train Epoch: 2 [1000/60000 (5%)]\tLoss: 0.466423\n",
"[trainer-node]: Train Epoch: 2 [2000/60000 (10%)]\tLoss: 0.452478\n",
"[trainer-node]: Train Epoch: 2 [3000/60000 (15%)]\tLoss: 0.409038\n",
"[trainer-node]: Train Epoch: 2 [4000/60000 (20%)]\tLoss: 0.370588\n",
"[trainer-node]: Train Epoch: 2 [5000/60000 (25%)]\tLoss: 0.419151\n",
"[trainer-node]: Train Epoch: 2 [6000/60000 (30%)]\tLoss: 0.378228\n",
"[trainer-node]: Train Epoch: 2 [7000/60000 (35%)]\tLoss: 0.328720\n",
"[trainer-node]: Train Epoch: 2 [8000/60000 (40%)]\tLoss: 0.557514\n",
"[trainer-node]: Train Epoch: 2 [9000/60000 (45%)]\tLoss: 0.332585\n",
"[trainer-node]: Train Epoch: 2 [10000/60000 (50%)]\tLoss: 0.374972\n",
"[trainer-node]: Train Epoch: 2 [11000/60000 (55%)]\tLoss: 0.344400\n",
"[trainer-node]: Train Epoch: 2 [12000/60000 (60%)]\tLoss: 0.359475\n",
"[trainer-node]: Train Epoch: 2 [13000/60000 (65%)]\tLoss: 0.335085\n",
"[trainer-node]: Train Epoch: 2 [14000/60000 (70%)]\tLoss: 0.352953\n",
"[trainer-node]: Train Epoch: 2 [15000/60000 (75%)]\tLoss: 0.367524\n",
"[trainer-node]: Train Epoch: 2 [16000/60000 (80%)]\tLoss: 0.313468\n",
"[trainer-node]: Train Epoch: 2 [17000/60000 (85%)]\tLoss: 0.385835\n",
"[trainer-node]: Train Epoch: 2 [18000/60000 (90%)]\tLoss: 0.324088\n",
"[trainer-node]: Train Epoch: 2 [19000/60000 (95%)]\tLoss: 0.336721\n",
"[trainer-node]: Training is finished\n"
]
}
@ -545,11 +662,11 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"client.delete_job(job_name)"
"# client.delete_job(job_name)"
]
},
{

8
go.mod
View File

@ -20,11 +20,14 @@ require (
k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738
sigs.k8s.io/controller-runtime v0.20.2
sigs.k8s.io/jobset v0.8.0
sigs.k8s.io/kind v0.27.0
sigs.k8s.io/scheduler-plugins v0.30.6
sigs.k8s.io/structured-merge-diff/v4 v4.5.0
)
require (
al.essio.dev/pkg/shellescape v1.5.1 // indirect
github.com/BurntSushi/toml v1.4.0 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
@ -43,19 +46,24 @@ require (
github.com/google/gnostic-models v0.6.8 // indirect
github.com/google/gofuzz v1.2.0 // indirect
github.com/google/pprof v0.0.0-20241210010833-40e02aabc2ad // indirect
github.com/google/safetext v0.0.0-20220905092116-b49f7bc46da2 // indirect
github.com/google/uuid v1.6.0 // indirect
github.com/inconshreveable/mousetrap v1.1.0 // indirect
github.com/josharian/intern v1.0.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/klauspost/compress v1.17.11 // indirect
github.com/mailru/easyjson v0.7.7 // indirect
github.com/mattn/go-isatty v0.0.20 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/pelletier/go-toml v1.9.5 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/prometheus/client_golang v1.21.0 // indirect
github.com/prometheus/client_model v0.6.1 // indirect
github.com/prometheus/common v0.62.0 // indirect
github.com/prometheus/procfs v0.15.1 // indirect
github.com/spf13/cobra v1.8.1 // indirect
github.com/spf13/pflag v1.0.5 // indirect
github.com/x448/float16 v0.8.4 // indirect
go.uber.org/atomic v1.11.0 // indirect

21
go.sum
View File

@ -1,7 +1,12 @@
al.essio.dev/pkg/shellescape v1.5.1 h1:86HrALUujYS/h+GtqoB26SBEdkWfmMI6FubjXlsXyho=
al.essio.dev/pkg/shellescape v1.5.1/go.mod h1:6sIqp7X2P6mThCQ7twERpZTuigpr6KbZWtls1U8I890=
github.com/BurntSushi/toml v1.4.0 h1:kuoIxZQy2WRRk1pttg9asf+WVv6tWQuBNVmK8+nqPr0=
github.com/BurntSushi/toml v1.4.0/go.mod h1:ukJfTF/6rtPPRCnwkur4qwRxa8vTRFBF0uk2lLoLwho=
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM=
@ -44,8 +49,14 @@ github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0=
github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
github.com/google/pprof v0.0.0-20241210010833-40e02aabc2ad h1:a6HEuzUHeKH6hwfN/ZoQgRgVIWFJljSWa/zetS2WTvg=
github.com/google/pprof v0.0.0-20241210010833-40e02aabc2ad/go.mod h1:vavhavw2zAxS5dIdcRluK6cSGGPlZynqzFM8NdvU144=
github.com/google/safetext v0.0.0-20220905092116-b49f7bc46da2 h1:SJ+NtwL6QaZ21U+IrK7d0gGgpjGGvd2kz+FzTHVzdqI=
github.com/google/safetext v0.0.0-20220905092116-b49f7bc46da2/go.mod h1:Tv1PlzqC9t8wNnpPdctvtSUOPUUg4SHeE6vR1Ir2hmg=
github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510 h1:El6M4kTTCOh6aBiKaUGG7oYTSPP8MxqL4YI3kZKwcP4=
github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510/go.mod h1:pupxD2MaaD3pAXIBCelhxNneeOaAeabZDe5s4K6zSpQ=
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8=
github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY=
github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y=
github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
@ -62,6 +73,8 @@ github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0
github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw=
github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0=
github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc=
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
@ -77,6 +90,8 @@ github.com/open-policy-agent/cert-controller v0.12.0 h1:RKXlBafMcCh+++I1geJetXo7
github.com/open-policy-agent/cert-controller v0.12.0/go.mod h1:N5bCFXdAXMYx0PdS6ZQ9lrDQQMz+F6deoChym6VleXw=
github.com/open-policy-agent/frameworks/constraint v0.0.0-20241101234656-e78c8abd754a h1:gQtOJ50XFyL2Xh3lDD9zP4KQ2PY4mZKQ9hDcWc81Sp8=
github.com/open-policy-agent/frameworks/constraint v0.0.0-20241101234656-e78c8abd754a/go.mod h1:tI7nc6H6os2UYZRvSm9Y7bq4oMoXqhwA0WfnqKpoAgc=
github.com/pelletier/go-toml v1.9.5 h1:4yBQzkHv+7BHq2PQUZF3Mx0IYxG7LsP222s7Agd3ve8=
github.com/pelletier/go-toml v1.9.5/go.mod h1:u1nR/EPcESfeI/szUZKdtJ0xRNbUoANCkoOuaOx1Y+c=
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
@ -92,6 +107,9 @@ github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0leargg
github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk=
github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8=
github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4=
github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
github.com/spf13/cobra v1.8.1 h1:e5/vxKd/rZsfSJMUX1agtjeTDf+qv1/JdBF8gg5k9ZM=
github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y=
github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA=
github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
@ -135,6 +153,7 @@ golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.28.0 h1:Fksou7UEQUWlKvIdsqzJmUmCX3cZuD2+P3XyyzwMhlA=
golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/term v0.27.0 h1:WP60Sv1nlK1T6SupCHbXzSaN0b9wUmsPoRS9b61A23Q=
@ -196,6 +215,8 @@ sigs.k8s.io/jobset v0.8.0 h1:80cJcPld+IMdKFOqzEW4et3Y6lGAPcP8YmBZ+aiKGYA=
sigs.k8s.io/jobset v0.8.0/go.mod h1:yitjuGOExl2p964nhyevQGIkfiPSRHcdC3zNBneKCT8=
sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 h1:/Rv+M11QRah1itp8VhT6HoVx1Ray9eB4DBr+K+/sCJ8=
sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3/go.mod h1:18nIHnGi6636UCz6m8i4DhaJ65T6EruyzmoQqI2BVDo=
sigs.k8s.io/kind v0.27.0 h1:PQ3f0iAWNIj66LYkZ1ivhEg/+Zb6UPMbO+qVei/INZA=
sigs.k8s.io/kind v0.27.0/go.mod h1:RZVFmy6qcwlSWwp6xeIUv7kXCPF3i8MXsEXxW/J+gJY=
sigs.k8s.io/scheduler-plugins v0.30.6 h1:P4pViMVoyVNHWmkG96UtJ4LvxkUIeenIUKLZd09vDyw=
sigs.k8s.io/scheduler-plugins v0.30.6/go.mod h1:EDYYqHmpHR//VYKAeud1TTQbTFSvpdGFeyEg9ejOmnI=
sigs.k8s.io/structured-merge-diff/v4 v4.5.0 h1:nbCitCK2hfnhyiKo6uf2HxUPTCodY6Qaf85SbDIaMBk=

View File

@ -14,63 +14,36 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# TODO (andreyvelich): Refactor this script for Kubeflow Trainer V2
# This bash script is used to run the example notebooks
# This shell script runs a Jupyter Notebook with Papermill.
set -o errexit
set -o nounset
set -o pipefail
set -x
NOTEBOOK_INPUT=""
NOTEBOOK_OUTPUT="-" # outputs to console
NAMESPACE="default"
TRAINING_PYTHON_SDK="./sdk/python"
if [ -z "${NOTEBOOK_INPUT}" ]; then
echo "NOTEBOOK_INPUT env variable must be set to run this script."
exit 1
fi
usage() {
echo "Usage: $0 -i <input_notebook> -o <output_notebook> [-p \"<param> <value>\"...] [-y <params.yaml>]"
echo "Options:"
echo " -i Input notebook (required)"
echo " -o Output notebook (required)"
echo " -k Kubeflow Training Operator Python SDK (optional)"
echo " -n Kubernetes namespace used by tests (optional)"
echo " -h Show this help message"
echo "NOTE: papermill, jupyter and ipykernel are required Python dependencies to run Notebooks"
exit 1
if [ -z "${NOTEBOOK_OUTPUT}" ]; then
echo "NOTEBOOK_OUTPUT env variable must be set to run this script."
exit 1
fi
if [ -z "${PAPERMILL_TIMEOUT}" ]; then
echo "PAPERMILL_TIMEOUT env variable must be set to run this script."
exit 1
fi
print_results() {
kubectl get pods
kubectl describe pod
kubectl describe trainjob
kubectl logs -n kubeflow-system -l app.kubernetes.io/name=trainer
kubectl logs -l jobset.sigs.k8s.io/replicatedjob-name=trainer-node,batch.kubernetes.io/job-completion-index=0 --tail -1
kubectl wait trainjob --for=condition=Complete --all --timeout 3s
}
while getopts "i:o:p:k:n:r:d:h:" opt; do
case "$opt" in
i) NOTEBOOK_INPUT="$OPTARG" ;; # -i for notebook input path
o) NOTEBOOK_OUTPUT="$OPTARG" ;; # -o for notebook output path
k) TRAINING_PYTHON_SDK="$OPTARG" ;; # -k for training operator python sdk
n) NAMESPACE="$OPTARG" ;; # -n for kubernetes namespace used by tests
h) usage ;; # -h for help (usage)
*)
usage
exit 1
;;
esac
done
if [ -z "$NOTEBOOK_INPUT" ]; then
echo "Error: -i notebook input path is required."
exit 1
fi
papermill_cmd="papermill $NOTEBOOK_INPUT $NOTEBOOK_OUTPUT -p training_python_sdk $TRAINING_PYTHON_SDK -p namespace $NAMESPACE"
if ! command -v papermill &>/dev/null; then
echo "Error: papermill is not installed. Please install papermill to proceed."
exit 1
fi
echo "Running command: $papermill_cmd"
$papermill_cmd
if [ $? -ne 0 ]; then
echo "Error: papermill execution failed." >&2
exit 1
fi
echo "Notebook execution completed successfully"
(papermill "${NOTEBOOK_INPUT}" "${NOTEBOOK_OUTPUT}" --execution-timeout "${PAPERMILL_TIMEOUT}" && print_results) ||
(print_results && exit 1)

View File

@ -14,74 +14,67 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# TODO (andreyvelich): Refactor this script for Kubeflow Trainer V2
# This shell script sets up a Kind cluster for Kubeflow Trainer e2e tests.
set -o errexit
set -o nounset
set -o pipefail
set -x
echo "Kind load newly locally built image"
# use cluster name which is used in github actions kind create
kind load docker-image ${TRAINING_CI_IMAGE} --name ${KIND_CLUSTER}
# Configure variables.
KIND=${KIND:-./bin/kind}
K8S_VERSION=${K8S_VERSION:-1.32.0}
KIND_NODE_VERSION=kindest/node:v${K8S_VERSION}
NAMESPACE="kubeflow-system"
TIMEOUT="5m"
echo "Update training operator manifest with newly built image"
cd manifests/overlays/standalone
kustomize edit set image kubeflow/training-operator=${TRAINING_CI_IMAGE}
# Kubeflow Trainer images.
# TODO (andreyvelich): Support initializers images.
CONTROLLER_MANAGER_CI_IMAGE=trainer-controller-manager:test
echo "Build Kubeflow Trainer images"
docker build . -f cmd/trainer-controller-manager/Dockerfile -t ${CONTROLLER_MANAGER_CI_IMAGE}
echo "Installing training operator manifests"
kustomize build . | kubectl apply --server-side -f -
echo "Set the image in Kustomize overlay"
cd manifests/overlays/manager
kustomize edit set image kubeflow/trainer-controller-manager=${CONTROLLER_MANAGER_CI_IMAGE}
if [ "${GANG_SCHEDULER_NAME}" = "scheduler-plugins" ]; then
SCHEDULER_PLUGINS_VERSION=$(go list -m -f "{{.Version}}" sigs.k8s.io/scheduler-plugins)
git clone https://github.com/kubernetes-sigs/scheduler-plugins.git -b "${SCHEDULER_PLUGINS_VERSION}"
echo "Create Kind cluster and load Kubeflow Trainer images"
${KIND} create cluster --image "${KIND_NODE_VERSION}"
${KIND} load docker-image ${CONTROLLER_MANAGER_CI_IMAGE}
echo "Installing Scheduler Plugins ${SCHEDULER_PLUGINS_VERSION}..."
helm install scheduler-plugins scheduler-plugins/manifests/install/charts/as-a-second-scheduler/ --create-namespace \
--namespace scheduler-plugins \
--set controller.image="registry.k8s.io/scheduler-plugins/controller:${SCHEDULER_PLUGINS_VERSION}" \
--set scheduler.image="registry.k8s.io/scheduler-plugins/kube-scheduler:${SCHEDULER_PLUGINS_VERSION}"
echo "Deploy Kubeflow Trainer control plane"
kubectl apply --server-side -k .
echo "Configure gang-scheduling using scheduler-plugins to training-operator"
kubectl patch -n kubeflow deployments training-operator --type='json' \
-p='[{"op": "add", "path": "/spec/template/spec/containers/0/command/1", "value": "--gang-scheduler-name=scheduler-plugins"}]'
elif [ "${GANG_SCHEDULER_NAME}" = "volcano" ]; then
VOLCANO_SCHEDULER_VERSION=$(go list -m -f "{{.Version}}" volcano.sh/apis)
# patch scheduler first so that it is ready when scheduler-deployment installing finished
echo "Configure gang-scheduling using volcano to training-operator"
kubectl patch -n kubeflow deployments training-operator --type='json' \
-p='[{"op": "add", "path": "/spec/template/spec/containers/0/command/1", "value": "--gang-scheduler-name=volcano"}]'
echo "Installing volcano scheduler ${VOLCANO_SCHEDULER_VERSION}..."
kubectl apply -f https://raw.githubusercontent.com/volcano-sh/volcano/${VOLCANO_SCHEDULER_VERSION}/installer/volcano-development.yaml
fi
TIMEOUT=30
until kubectl get pods -n kubeflow | grep training-operator | grep 1/1 || [[ $TIMEOUT -eq 1 ]]; do
sleep 10
TIMEOUT=$((TIMEOUT - 1))
done
if [ "${GANG_SCHEDULER_NAME}" = "scheduler-plugins" ]; then
kubectl wait pods --for=condition=ready -n scheduler-plugins --timeout "${TIMEOUT}s" --all ||
(
kubectl get pods -n scheduler-plugins && kubectl describe pods -n scheduler-plugins
# We should wait until Deployment is in Ready status.
echo "Wait for Kubeflow Trainer to be ready"
(kubectl wait deploy/kubeflow-trainer-controller-manager --for=condition=available -n ${NAMESPACE} --timeout ${TIMEOUT} &&
kubectl wait pods --for=condition=ready -n ${NAMESPACE} --timeout ${TIMEOUT} --all) ||
(
echo "Failed to wait until Kubeflow Trainer is ready" &&
kubectl get pods -n ${NAMESPACE} &&
kubectl describe pods -n ${NAMESPACE} &&
exit 1
)
fi
)
# wait for volcano up
if [ "${GANG_SCHEDULER_NAME}" = "volcano" ]; then
kubectl rollout status deployment -n volcano-system volcano-admission --timeout "${TIMEOUT}s" &&
kubectl rollout status deployment -n volcano-system volcano-scheduler --timeout "${TIMEOUT}s" &&
kubectl rollout status deployment -n volcano-system volcano-controllers --timeout "${TIMEOUT}s" ||
(
kubectl get pods -n volcano-system && kubectl describe pods -n volcano-system
exit 1
)
fi
print_cluster_info() {
kubectl version
kubectl cluster-info
kubectl get nodes
kubectl get pods -n ${NAMESPACE}
kubectl describe pod -n ${NAMESPACE}
}
kubectl version
kubectl cluster-info
kubectl get nodes
kubectl get pods -n kubeflow
kubectl describe pods -n kubeflow
# TODO (andreyvelich): Currently, we print manager logs due to flaky test.
echo "Deploy Kubeflow Trainer runtimes"
(cd ../runtimes && kubectl apply --server-side -k .) || (
kubectl logs -n ${NAMESPACE} -l app.kubernetes.io/name=trainer &&
print_cluster_info &&
exit 1
)
# TODO (andreyvelich): Discuss how we want to pre-load runtime images to the Kind cluster.
TORCH_RUNTIME_IMAGE=pytorch/pytorch:2.5.0-cuda12.4-cudnn9-runtime
docker pull ${TORCH_RUNTIME_IMAGE}
${KIND} load docker-image ${TORCH_RUNTIME_IMAGE}
print_cluster_info

View File

@ -18,4 +18,5 @@ package tools
import (
_ "k8s.io/code-generator"
_ "sigs.k8s.io/kind/cmd/kind/app"
)

View File

@ -14,8 +14,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# This shell is used to auto generate some useful tools for k8s, such as clientset, lister, informer and so on.
# We don't use this tool to generate deepcopy because kubebuilder (controller-tools) has covered that part.
# This shell script auto-generates useful k8s tooling code, such as the clientset, listers, and informers.
set -o errexit
set -o nounset

View File

@ -202,6 +202,12 @@ class TrainerClient:
trainer.resources_per_node
)
# Set numProcPerNode to the Trainer.
if trainer and trainer.resources_per_node:
trainer_crd.num_proc_per_node = utils.get_num_proc_per_node(
trainer.resources_per_node
)
# Add command and args to the Trainer if training function is set.
if trainer and trainer.func:
trainer_crd.command = constants.DEFAULT_COMMAND

View File

@ -14,6 +14,7 @@
import inspect
import json
import math
import os
import queue
import textwrap
@ -122,6 +123,31 @@ def get_resources_per_node(resources_per_node: dict) -> client.V1ResourceRequire
return resources
# TODO (andreyvelich): Move this part to the Kubeflow Trainer torch plugins.
# Ref issue: https://github.com/kubeflow/trainer/issues/2407
def get_num_proc_per_node(resources_per_node: dict) -> object:
"""
Get the Trainer numProcPerNode from the given resources.
"""
resources = {k.lower(): v for k, v in resources_per_node.items()}
# NumProcPerNode is equal to number of GPUs or CPUs, otherwise set it to `auto`
for key, value in resources.items():
if "gpu" in key:
return value
for key, value in resources.items():
if "cpu" in key:
# For now, we can't convert milliCPUs to the numProcPerNode.
try:
value = math.ceil(int(value))
return value
except Exception:
pass
return "auto"
def get_args_using_train_func(
train_func: Callable,
train_func_parameters: Optional[Dict[str, Any]] = None,

View File

@ -31,8 +31,9 @@ classifiers = [
]
dependencies = [
"kubernetes>=27.2.0",
"pydantic>=2.10.0",
"jobset @ git+https://github.com/kubernetes-sigs/jobset.git@v0.8.0#subdirectory=sdk/python",
# TODO (andreyvelich): Update JobSet to v0.8.0 once this PR is merged: https://github.com/kubeflow/trainer/pull/2466
# "pydantic>=2.10.0",
"jobset @ git+https://github.com/kubernetes-sigs/jobset.git@v0.7.2#subdirectory=sdk/python",
]
[project.urls]

78
test/e2e/e2e_test.go Normal file
View File

@ -0,0 +1,78 @@
package e2e
import (
"github.com/onsi/ginkgo/v2"
"github.com/onsi/gomega"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"sigs.k8s.io/controller-runtime/pkg/client"
jobsetconsts "sigs.k8s.io/jobset/pkg/constants"
trainer "github.com/kubeflow/trainer/pkg/apis/trainer/v1alpha1"
"github.com/kubeflow/trainer/pkg/constants"
testingutil "github.com/kubeflow/trainer/pkg/util/testing"
"github.com/kubeflow/trainer/test/util"
)
var _ = ginkgo.Describe("TrainJob e2e", func() {
// Each test runs in a separate namespace.
var ns *corev1.Namespace
// Create test namespace before each test.
ginkgo.BeforeEach(func() {
ns = &corev1.Namespace{
ObjectMeta: metav1.ObjectMeta{
GenerateName: "e2e-",
},
}
gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed())
// Wait for namespace to exist before proceeding with test.
gomega.Eventually(func(g gomega.Gomega) {
g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(ns), ns)).Should(gomega.Succeed())
}, util.TimeoutE2E, util.Interval).Should(gomega.Succeed())
})
// Delete test namespace after each test.
ginkgo.AfterEach(func() {
// Delete test namespace after each test.
gomega.Expect(k8sClient.Delete(ctx, ns)).To(gomega.Succeed())
})
// These tests create TrainJob that reference supported runtime without any additional changes.
ginkgo.When("creating TrainJob", func() {
// Verify `torch-distributed` ClusterTrainingRuntime.
ginkgo.It("should create TrainJob with PyTorch runtime reference", func() {
// Create a TrainJob.
trainJob := testingutil.MakeTrainJobWrapper(ns.Name, "e2e-test").
RuntimeRef(trainer.SchemeGroupVersion.WithKind(trainer.ClusterTrainingRuntimeKind), "torch-distributed").
Obj()
ginkgo.By("Create a TrainJob with torch-distributed runtime reference", func() {
gomega.Expect(k8sClient.Create(ctx, trainJob)).Should(gomega.Succeed())
})
// Wait for TrainJob to be in Succeeded status.
ginkgo.By("Wait for TrainJob to be in Succeeded status", func() {
gomega.Eventually(func(g gomega.Gomega) {
gotTrainJob := &trainer.TrainJob{}
g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(trainJob), gotTrainJob)).Should(gomega.Succeed())
g.Expect(gotTrainJob.Status.Conditions).Should(gomega.BeComparableTo([]metav1.Condition{
{
Type: trainer.TrainJobCreated,
Status: metav1.ConditionTrue,
Reason: trainer.TrainJobJobsCreationSucceededReason,
Message: constants.TrainJobJobsCreationSucceededMessage,
},
{
Type: trainer.TrainJobComplete,
Status: metav1.ConditionTrue,
Reason: jobsetconsts.AllJobsCompletedReason,
Message: jobsetconsts.AllJobsCompletedMessage,
},
}, util.IgnoreConditions))
}, util.TimeoutE2E, util.Interval).Should(gomega.Succeed())
})
})
})
})

56
test/e2e/suite_test.go Normal file
View File

@ -0,0 +1,56 @@
/*
Copyright 2024 The Kubeflow Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package e2e
import (
"context"
"testing"
"github.com/onsi/ginkgo/v2"
"github.com/onsi/gomega"
"k8s.io/client-go/kubernetes/scheme"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/client/config"
trainer "github.com/kubeflow/trainer/pkg/apis/trainer/v1alpha1"
)
var (
k8sClient client.Client
ctx context.Context
)
func TestAPIs(t *testing.T) {
gomega.RegisterFailHandler(ginkgo.Fail)
ginkgo.RunSpecs(t, "Kubeflow Trainer E2E Suite")
}
var _ = ginkgo.BeforeSuite(func() {
ctx = context.Background()
// Get Kubernetes config.
cfg := config.GetConfigOrDie()
gomega.ExpectWithOffset(1, cfg).NotTo(gomega.BeNil())
// Add Trainer APIs.
err := trainer.AddToScheme(scheme.Scheme)
gomega.ExpectWithOffset(1, err).NotTo(gomega.HaveOccurred())
// Configure k8s client.
k8sClient, err = client.New(cfg, client.Options{Scheme: scheme.Scheme})
gomega.Expect(err).NotTo(gomega.HaveOccurred())
gomega.Expect(k8sClient).NotTo(gomega.BeNil())
})

View File

@ -17,14 +17,16 @@ limitations under the License.
package util
import (
"time"
"github.com/google/go-cmp/cmp"
"github.com/google/go-cmp/cmp/cmpopts"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"time"
)
const (
Timeout = 5 * time.Second
TimeoutE2E = 10 * time.Minute // E2Es require a longer timeout due to large image pull
ConsistentDuration = time.Second
Interval = time.Millisecond * 250
)