chore(test): Add E2E tests for Kubeflow Trainer (#2470)

* Add e2e tests for Kubeflow Trainer

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>

* Add timeout for papermill

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>

* Add output as part of make command

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>

* Add k8s version to setup cluster

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>

* Fix Kind k8s version

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>

* Fix 1.29 version

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>

* Create script to run Notebook

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>

* Download dataset when local_rank=0

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>

* Update test/e2e/e2e_test.go

Co-authored-by: Yuki Iwai <yuki.iwai.tz@gmail.com>
Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>

* Refactor Go e2e tests

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>

* Bump k8s to 1.29.14

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>

* Install Kind from go mod

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>

* Fix path for Kind package

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>

* Fix Go e2e

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>

* Reduce number of CPUs
Export Notebook as artifact

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>

* Print logs due to flaky test

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>

* Fix artifact path

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>

* docker pull image

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>

* Fix path

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>

* Add k8s version to output name

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>

* Remove install Kind cmd

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>

---------

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>
Co-authored-by: Yuki Iwai <yuki.iwai.tz@gmail.com>
This commit is contained in:
Andrey Velichkevich 2025-03-05 04:04:07 +00:00 committed by GitHub
parent 3ec8f0705f
commit 9e785750d0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
18 changed files with 645 additions and 399 deletions

View File

@ -1,99 +0,0 @@
# TODO (andreyvelich): Refactor this once we have e2e test for Kubeflow Trainer.
name: Setup E2E test template
description: A composite action to set up e2e tests
inputs:
kubernetes-version:
required: true
description: Kubernetes version
python-version:
required: true
description: Python version
gang-scheduler-name:
required: false
default: "none"
description: Gang scheduler name
runs:
using: composite
steps:
# This step is a Workaround to avoid the "No space left on device" error.
# ref: https://github.com/actions/runner-images/issues/2840
- name: Remove unnecessary files
shell: bash
run: |
echo "Disk usage before cleanup:"
df -hT
sudo rm -rf /usr/share/dotnet
sudo rm -rf /opt/ghc
sudo rm -rf /usr/local/share/boost
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
sudo rm -rf /usr/local/lib/android
sudo rm -rf /usr/local/share/powershell
sudo rm -rf /usr/share/swift
echo "Disk usage after cleanup:"
df -hT
- name: Prune docker images
shell: bash
run: |
docker image prune -a -f
docker system df
df -hT
- name: Move docker data directory
shell: bash
run: |
echo "Stopping docker service ..."
sudo systemctl stop docker
DOCKER_DEFAULT_ROOT_DIR=/var/lib/docker
DOCKER_ROOT_DIR=/mnt/docker
echo "Moving ${DOCKER_DEFAULT_ROOT_DIR} -> ${DOCKER_ROOT_DIR}"
sudo mv ${DOCKER_DEFAULT_ROOT_DIR} ${DOCKER_ROOT_DIR}
echo "Creating symlink ${DOCKER_DEFAULT_ROOT_DIR} -> ${DOCKER_ROOT_DIR}"
sudo ln -s ${DOCKER_ROOT_DIR} ${DOCKER_DEFAULT_ROOT_DIR}
echo "$(sudo ls -l ${DOCKER_DEFAULT_ROOT_DIR})"
echo "Starting docker service ..."
sudo systemctl daemon-reload
sudo systemctl start docker
echo "Docker service status:"
sudo systemctl --no-pager -l -o short status docker
- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: ${{ inputs.python-version }}
- name: Setup Go
uses: actions/setup-go@v5
with:
go-version-file: go.mod
- name: Create k8s Kind Cluster
uses: helm/kind-action@v1.11.0
with:
node_image: kindest/node:${{ inputs.kubernetes-version }}
cluster_name: training-operator-cluster
kubectl_version: ${{ inputs.kubernetes-version }}
- name: Build training-operator
shell: bash
run: |
./scripts/gha/build-image.sh
env:
TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test
- name: Deploy training operator
shell: bash
run: |
./scripts/gha/setup-training-operator.sh
docker system prune -a -f
docker system df
df -h
env:
KIND_CLUSTER: training-operator-cluster
TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test
GANG_SCHEDULER_NAME: ${{ inputs.gang-scheduler-name }}
KUBERNETES_VERSION: ${{ inputs.kubernetes-version }}

View File

@ -7,23 +7,61 @@ on:
jobs:
e2e-test:
name: E2E Test
runs-on: ubuntu-latest
runs-on:
labels: ubuntu-latest-16-cores
env:
GOPATH: ${{ github.workspace }}/go
defaults:
run:
working-directory: ${{ env.GOPATH }}/src/github.com/kubeflow/trainer
strategy:
fail-fast: false
matrix:
kubernetes-version: ["1.29.3", "1.30.0", "1.31.0"]
# Kubernetes versions for e2e tests on Kind cluster.
kubernetes-version: ["1.29.14", "1.30.0", "1.31.0"]
steps:
- name: Check out code
uses: actions/checkout@v4
with:
path: ${{ env.GOPATH }}/src/github.com/kubeflow/trainer
- name: Setup Go
uses: actions/setup-go@v5
with:
go-version-file: ${{ env.GOPATH }}/src/github.com/kubeflow/trainer/go.mod
- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
python-version: 3.11
- name: Install dependencies
run: |
echo "TODO (andreyvelich): Implement E2E Tests"
# pip install -U './sdk'
echo "Install Papermill"
pip install papermill==2.6.0 jupyter==1.1.1 ipykernel==6.29.5
echo "Install Kubeflow SDK"
pip install ./sdk
- name: Setup cluster
run: |
make test-e2e-setup-cluster K8S_VERSION=${{ matrix.kubernetes-version }}
- name: Run e2e with Go
run: |
make test-e2e
- name: Run e2e test for example Notebooks.
run: |
make test-e2e-notebook NOTEBOOK_INPUT=./examples/pytorch/image-classification/mnist.ipynb NOTEBOOK_OUTPUT=./mnist_output_${{ matrix.kubernetes-version }}.ipynb TIMEOUT=900
# TODO (andreyvelich): Discuss how we can upload artifacts for multiple Notebooks.
- name: Upload notebook
uses: actions/upload-artifact@v4
if: always()
with:
name: mnist_output_${{ matrix.kubernetes-version }}.ipynb
path: ${{ env.GOPATH }}/src/github.com/kubeflow/trainer/mnist_output_${{ matrix.kubernetes-version }}.ipynb
retention-days: 1

View File

@ -76,7 +76,7 @@ jobs:
- name: Run Go integration tests
run: |
make test-integration ENVTEST_K8S_VERSION=${{ matrix.kubernetes-version }}
make test-integration K8S_VERSION=${{ matrix.kubernetes-version }}
- name: Coveralls report
uses: shogo82148/actions-goveralls@v1

2
.gitignore vendored
View File

@ -13,6 +13,8 @@ __debug_bin
# Jupyter Notebooks.
**/.ipynb_checkpoints
# The default output for Notebook after Papermill execution.
trainer_output.ipynb
# Python cache files
__pycache__/

View File

@ -29,14 +29,16 @@ help: ## Display this help.
##@ Development
K8S_VERSION ?= 1.32.0
PROJECT_DIR := $(shell dirname $(abspath $(lastword $(MAKEFILE_LIST))))
# Tool Binaries
LOCALBIN ?= $(PROJECT_DIR)/bin
CONTROLLER_GEN ?= $(LOCALBIN)/controller-gen
ENVTEST ?= $(LOCALBIN)/setup-envtest
ENVTEST_K8S_VERSION ?= 1.32
KIND ?= $(LOCALBIN)/kind
# Instructions to download tools for development.
.PHONY: envtest
@ -47,6 +49,10 @@ envtest: ## Download the setup-envtest binary if required.
controller-gen: ## Download the controller-gen binary if required.
GOBIN=$(LOCALBIN) go install sigs.k8s.io/controller-tools/cmd/controller-gen@v0.17.2
.PHONY: kind
kind: ## Download Kind binary if required.
GOBIN=$(LOCALBIN) go install sigs.k8s.io/kind@$(shell go list -m -f '{{.Version}}' sigs.k8s.io/kind)
# Download external CRDs for Go integration testings.
EXTERNAL_CRDS_DIR ?= $(PROJECT_DIR)/manifests/external-crds
@ -106,8 +112,9 @@ test: ## Run Go unit test.
.PHONY: test-integration
test-integration: envtest jobset-operator-crd scheduler-plugins-crd ## Run Go integration test.
KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) -p path)" go test ./test/... -coverprofile cover.out
KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(K8S_VERSION) -p path)" go test ./test/integration/... -coverprofile cover.out
.PHONY: test-python
test-python: ## Run Python unit test.
export PYTHONPATH=$(PROJECT_DIR)
pip install pytest
@ -118,9 +125,26 @@ test-python: ## Run Python unit test.
pytest ./pkg/initializer/model
pytest ./pkg/initializer/utils
.PHONY: test-python-integration
test-python-integration: ## Run Python integration test.
export PYTHONPATH=$(PROJECT_DIR)
pip install pytest
pip install -r ./cmd/initializer/dataset/requirements.txt
pytest ./test/integration/initializer
.PHONY: test-e2e-setup-cluster
test-e2e-setup-cluster: kind ## Setup Kind cluster for e2e test.
KIND=$(KIND) K8S_VERSION=$(K8S_VERSION) ./hack/e2e-setup-cluster.sh
.PHONY: test-e2e
test-e2e: ## Run Go e2e test.
go test ./test/e2e/...
# Input and output location for Notebooks executed with Papermill.
NOTEBOOK_INPUT=$(PROJECT_DIR)/examples/pytorch/image-classification/mnist.ipynb
NOTEBOOK_OUTPUT=$(PROJECT_DIR)/trainer_output.ipynb
PAPERMILL_TIMEOUT=900
.PHONY: test-e2e-notebook
test-e2e-notebook: ## Run Jupyter Notebook with Papermill.
NOTEBOOK_INPUT=$(NOTEBOOK_INPUT) NOTEBOOK_OUTPUT=$(NOTEBOOK_OUTPUT) PAPERMILL_TIMEOUT=$(PAPERMILL_TIMEOUT) ./hack/e2e-run-notebook.sh

View File

@ -33,7 +33,7 @@
"outputs": [],
"source": [
"# TODO (astefanutti): Change to the Kubeflow SDK when it's available.\n",
"!pip install git+https://github.com/kubeflow/trainer.git@master#subdirectory=sdk"
"# !pip install git+https://github.com/kubeflow/trainer.git@master#subdirectory=sdk"
]
},
{
@ -66,7 +66,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
@ -119,14 +119,24 @@
" model = nn.parallel.DistributedDataParallel(Net().to(device))\n",
" optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)\n",
"\n",
" # Retrieve the Fashion-MNIST dataset\n",
" \n",
" # Download FashionMNIST dataset only on local_rank=0 process.\n",
" if local_rank == 0:\n",
" dataset = datasets.FashionMNIST(\n",
" \"./data\",\n",
" train=True,\n",
" download=True,\n",
" transform=transforms.Compose([transforms.ToTensor()]),\n",
" )\n",
" dist.barrier()\n",
" dataset = datasets.FashionMNIST(\n",
" \"./data\",\n",
" train=True,\n",
" download=True,\n",
" download=False,\n",
" transform=transforms.Compose([transforms.ToTensor()]),\n",
" )\n",
"\n",
"\n",
    "    # Shard the dataset across workers.\n",
" train_loader = DataLoader(\n",
" dataset,\n",
@ -135,6 +145,7 @@
" )\n",
"\n",
" # TODO(astefanutti): add parameters to the training function\n",
" dist.barrier()\n",
" for epoch in range(1, 3):\n",
" model.train()\n",
"\n",
@ -181,7 +192,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 4,
"metadata": {},
"outputs": [
{
@ -190,126 +201,195 @@
"text": [
"Using Device: cpu, Backend: gloo\n",
"Distributed Training for WORLD_SIZE: 1, RANK: 0, LOCAL_RANK: 0\n",
"Train Epoch: 1 [0/60000 (0%)]\tLoss: 2.309967\n",
"Train Epoch: 1 [1000/60000 (2%)]\tLoss: 2.045446\n",
"Train Epoch: 1 [2000/60000 (3%)]\tLoss: 1.398883\n",
"Train Epoch: 1 [3000/60000 (5%)]\tLoss: 0.992089\n",
"Train Epoch: 1 [4000/60000 (7%)]\tLoss: 1.122684\n",
"Train Epoch: 1 [5000/60000 (8%)]\tLoss: 1.031676\n",
"Train Epoch: 1 [6000/60000 (10%)]\tLoss: 0.649529\n",
"Train Epoch: 1 [7000/60000 (12%)]\tLoss: 0.804960\n",
"Train Epoch: 1 [8000/60000 (13%)]\tLoss: 0.709698\n",
"Train Epoch: 1 [9000/60000 (15%)]\tLoss: 0.632330\n",
"Train Epoch: 1 [10000/60000 (17%)]\tLoss: 0.695469\n",
"Train Epoch: 1 [11000/60000 (18%)]\tLoss: 0.646323\n",
"Train Epoch: 1 [12000/60000 (20%)]\tLoss: 0.521877\n",
"Train Epoch: 1 [13000/60000 (22%)]\tLoss: 0.592377\n",
"Train Epoch: 1 [14000/60000 (23%)]\tLoss: 0.686853\n",
"Train Epoch: 1 [15000/60000 (25%)]\tLoss: 0.678805\n",
"Train Epoch: 1 [16000/60000 (27%)]\tLoss: 0.658783\n",
"Train Epoch: 1 [17000/60000 (28%)]\tLoss: 0.540468\n",
"Train Epoch: 1 [18000/60000 (30%)]\tLoss: 0.456685\n",
"Train Epoch: 1 [19000/60000 (32%)]\tLoss: 0.561984\n",
"Train Epoch: 1 [20000/60000 (33%)]\tLoss: 0.453478\n",
"Train Epoch: 1 [21000/60000 (35%)]\tLoss: 0.399682\n",
"Train Epoch: 1 [22000/60000 (37%)]\tLoss: 0.432961\n",
"Train Epoch: 1 [23000/60000 (38%)]\tLoss: 0.611499\n",
"Train Epoch: 1 [24000/60000 (40%)]\tLoss: 0.552892\n",
"Train Epoch: 1 [25000/60000 (42%)]\tLoss: 0.409226\n",
"Train Epoch: 1 [26000/60000 (43%)]\tLoss: 0.569662\n",
"Train Epoch: 1 [27000/60000 (45%)]\tLoss: 0.379728\n",
"Train Epoch: 1 [28000/60000 (47%)]\tLoss: 0.420447\n",
"Train Epoch: 1 [29000/60000 (48%)]\tLoss: 0.410670\n",
"Train Epoch: 1 [30000/60000 (50%)]\tLoss: 0.480141\n",
"Train Epoch: 1 [31000/60000 (52%)]\tLoss: 0.425981\n",
"Train Epoch: 1 [32000/60000 (53%)]\tLoss: 0.345157\n",
"Train Epoch: 1 [33000/60000 (55%)]\tLoss: 0.323578\n",
"Train Epoch: 1 [34000/60000 (57%)]\tLoss: 0.537613\n",
"Train Epoch: 1 [35000/60000 (58%)]\tLoss: 0.523302\n",
"Train Epoch: 1 [36000/60000 (60%)]\tLoss: 0.426407\n",
"Train Epoch: 1 [37000/60000 (62%)]\tLoss: 0.356403\n",
"Train Epoch: 1 [38000/60000 (63%)]\tLoss: 0.516297\n",
"Train Epoch: 1 [39000/60000 (65%)]\tLoss: 0.406655\n",
"Train Epoch: 1 [40000/60000 (67%)]\tLoss: 0.314193\n",
"Train Epoch: 1 [41000/60000 (68%)]\tLoss: 0.467424\n",
"Train Epoch: 1 [42000/60000 (70%)]\tLoss: 0.457645\n",
"Train Epoch: 1 [43000/60000 (72%)]\tLoss: 0.388591\n",
"Train Epoch: 1 [44000/60000 (73%)]\tLoss: 0.386649\n",
"Train Epoch: 1 [45000/60000 (75%)]\tLoss: 0.282575\n",
"Train Epoch: 1 [46000/60000 (77%)]\tLoss: 0.446804\n",
"Train Epoch: 1 [47000/60000 (78%)]\tLoss: 0.418433\n",
"Train Epoch: 1 [48000/60000 (80%)]\tLoss: 0.575584\n",
"Train Epoch: 1 [49000/60000 (82%)]\tLoss: 0.382036\n",
"Train Epoch: 1 [50000/60000 (83%)]\tLoss: 0.299168\n",
"Train Epoch: 1 [51000/60000 (85%)]\tLoss: 0.423421\n",
"Train Epoch: 1 [52000/60000 (87%)]\tLoss: 0.425236\n",
"Train Epoch: 1 [53000/60000 (88%)]\tLoss: 0.403723\n",
"Train Epoch: 1 [54000/60000 (90%)]\tLoss: 0.303039\n",
"Train Epoch: 1 [55000/60000 (92%)]\tLoss: 0.375983\n",
"Train Epoch: 1 [56000/60000 (93%)]\tLoss: 0.434169\n",
"Train Epoch: 1 [57000/60000 (95%)]\tLoss: 0.429213\n",
"Train Epoch: 1 [58000/60000 (97%)]\tLoss: 0.354376\n",
"Train Epoch: 1 [59000/60000 (98%)]\tLoss: 0.305779\n",
"Train Epoch: 2 [0/60000 (0%)]\tLoss: 0.437120\n",
"Train Epoch: 2 [1000/60000 (2%)]\tLoss: 0.464603\n",
"Train Epoch: 2 [2000/60000 (3%)]\tLoss: 0.284665\n",
"Train Epoch: 2 [3000/60000 (5%)]\tLoss: 0.369253\n",
"Train Epoch: 2 [4000/60000 (7%)]\tLoss: 0.468896\n",
"Train Epoch: 2 [5000/60000 (8%)]\tLoss: 0.388527\n",
"Train Epoch: 2 [6000/60000 (10%)]\tLoss: 0.474483\n",
"Train Epoch: 2 [7000/60000 (12%)]\tLoss: 0.373588\n",
"Train Epoch: 2 [8000/60000 (13%)]\tLoss: 0.443588\n",
"Train Epoch: 2 [9000/60000 (15%)]\tLoss: 0.449592\n",
"Train Epoch: 2 [10000/60000 (17%)]\tLoss: 0.363776\n",
"Train Epoch: 2 [11000/60000 (18%)]\tLoss: 0.400426\n",
"Train Epoch: 2 [12000/60000 (20%)]\tLoss: 0.282801\n",
"Train Epoch: 2 [13000/60000 (22%)]\tLoss: 0.288877\n",
"Train Epoch: 2 [14000/60000 (23%)]\tLoss: 0.549093\n",
"Train Epoch: 2 [15000/60000 (25%)]\tLoss: 0.359002\n",
"Train Epoch: 2 [16000/60000 (27%)]\tLoss: 0.322263\n",
"Train Epoch: 2 [17000/60000 (28%)]\tLoss: 0.289489\n",
"Train Epoch: 2 [18000/60000 (30%)]\tLoss: 0.279724\n",
"Train Epoch: 2 [19000/60000 (32%)]\tLoss: 0.452595\n",
"Train Epoch: 2 [20000/60000 (33%)]\tLoss: 0.334388\n",
"Train Epoch: 2 [21000/60000 (35%)]\tLoss: 0.340985\n",
"Train Epoch: 2 [22000/60000 (37%)]\tLoss: 0.247467\n",
"Train Epoch: 2 [23000/60000 (38%)]\tLoss: 0.439283\n",
"Train Epoch: 2 [24000/60000 (40%)]\tLoss: 0.270795\n",
"Train Epoch: 2 [25000/60000 (42%)]\tLoss: 0.283242\n",
"Train Epoch: 2 [26000/60000 (43%)]\tLoss: 0.377896\n",
"Train Epoch: 2 [27000/60000 (45%)]\tLoss: 0.264453\n",
"Train Epoch: 2 [28000/60000 (47%)]\tLoss: 0.328696\n",
"Train Epoch: 2 [29000/60000 (48%)]\tLoss: 0.294168\n",
"Train Epoch: 2 [30000/60000 (50%)]\tLoss: 0.421162\n",
"Train Epoch: 2 [31000/60000 (52%)]\tLoss: 0.306932\n",
"Train Epoch: 2 [32000/60000 (53%)]\tLoss: 0.297351\n",
"Train Epoch: 2 [33000/60000 (55%)]\tLoss: 0.261608\n",
"Train Epoch: 2 [34000/60000 (57%)]\tLoss: 0.413534\n",
"Train Epoch: 2 [35000/60000 (58%)]\tLoss: 0.433157\n",
"Train Epoch: 2 [36000/60000 (60%)]\tLoss: 0.390571\n",
"Train Epoch: 2 [37000/60000 (62%)]\tLoss: 0.242159\n",
"Train Epoch: 2 [38000/60000 (63%)]\tLoss: 0.347628\n",
"Train Epoch: 2 [39000/60000 (65%)]\tLoss: 0.321216\n",
"Train Epoch: 2 [40000/60000 (67%)]\tLoss: 0.285891\n",
"Train Epoch: 2 [41000/60000 (68%)]\tLoss: 0.401335\n",
"Train Epoch: 2 [42000/60000 (70%)]\tLoss: 0.357113\n",
"Train Epoch: 2 [43000/60000 (72%)]\tLoss: 0.321728\n",
"Train Epoch: 2 [44000/60000 (73%)]\tLoss: 0.266073\n",
"Train Epoch: 2 [45000/60000 (75%)]\tLoss: 0.235082\n",
"Train Epoch: 2 [46000/60000 (77%)]\tLoss: 0.329955\n",
"Train Epoch: 2 [47000/60000 (78%)]\tLoss: 0.351680\n",
"Train Epoch: 2 [48000/60000 (80%)]\tLoss: 0.509699\n",
"Train Epoch: 2 [49000/60000 (82%)]\tLoss: 0.281432\n",
"Train Epoch: 2 [50000/60000 (83%)]\tLoss: 0.262006\n",
"Train Epoch: 2 [51000/60000 (85%)]\tLoss: 0.432544\n",
"Train Epoch: 2 [52000/60000 (87%)]\tLoss: 0.332725\n",
"Train Epoch: 2 [53000/60000 (88%)]\tLoss: 0.313516\n",
"Train Epoch: 2 [54000/60000 (90%)]\tLoss: 0.266921\n",
"Train Epoch: 2 [55000/60000 (92%)]\tLoss: 0.279880\n",
"Train Epoch: 2 [56000/60000 (93%)]\tLoss: 0.329515\n",
"Train Epoch: 2 [57000/60000 (95%)]\tLoss: 0.379902\n",
"Train Epoch: 2 [58000/60000 (97%)]\tLoss: 0.252111\n",
"Train Epoch: 2 [59000/60000 (98%)]\tLoss: 0.267555\n",
"Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz\n",
"Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to ./data/FashionMNIST/raw/train-images-idx3-ubyte.gz\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26.4M/26.4M [00:02<00:00, 10.9MB/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Extracting ./data/FashionMNIST/raw/train-images-idx3-ubyte.gz to ./data/FashionMNIST/raw\n",
"\n",
"Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz\n",
"Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw/train-labels-idx1-ubyte.gz\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29.5k/29.5k [00:00<00:00, 1.50MB/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Extracting ./data/FashionMNIST/raw/train-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw\n",
"\n",
"Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz\n",
"Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to ./data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4.42M/4.42M [00:00<00:00, 8.57MB/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Extracting ./data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz to ./data/FashionMNIST/raw\n",
"\n",
"Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz\n",
"Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5.15k/5.15k [00:00<00:00, 4.41MB/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Extracting ./data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw\n",
"\n",
"Train Epoch: 1 [0/60000 (0%)]\tLoss: 2.312359\n",
"Train Epoch: 1 [1000/60000 (2%)]\tLoss: 2.058874\n",
"Train Epoch: 1 [2000/60000 (3%)]\tLoss: 1.059449\n",
"Train Epoch: 1 [3000/60000 (5%)]\tLoss: 1.130449\n",
"Train Epoch: 1 [4000/60000 (7%)]\tLoss: 1.027112\n",
"Train Epoch: 1 [5000/60000 (8%)]\tLoss: 0.845158\n",
"Train Epoch: 1 [6000/60000 (10%)]\tLoss: 0.747619\n",
"Train Epoch: 1 [7000/60000 (12%)]\tLoss: 0.731784\n",
"Train Epoch: 1 [8000/60000 (13%)]\tLoss: 0.623452\n",
"Train Epoch: 1 [9000/60000 (15%)]\tLoss: 0.666745\n",
"Train Epoch: 1 [10000/60000 (17%)]\tLoss: 0.597833\n",
"Train Epoch: 1 [11000/60000 (18%)]\tLoss: 0.634247\n",
"Train Epoch: 1 [12000/60000 (20%)]\tLoss: 0.533735\n",
"Train Epoch: 1 [13000/60000 (22%)]\tLoss: 0.545662\n",
"Train Epoch: 1 [14000/60000 (23%)]\tLoss: 0.637130\n",
"Train Epoch: 1 [15000/60000 (25%)]\tLoss: 0.596679\n",
"Train Epoch: 1 [16000/60000 (27%)]\tLoss: 0.505523\n",
"Train Epoch: 1 [17000/60000 (28%)]\tLoss: 0.484452\n",
"Train Epoch: 1 [18000/60000 (30%)]\tLoss: 0.494081\n",
"Train Epoch: 1 [19000/60000 (32%)]\tLoss: 0.601508\n",
"Train Epoch: 1 [20000/60000 (33%)]\tLoss: 0.485598\n",
"Train Epoch: 1 [21000/60000 (35%)]\tLoss: 0.402011\n",
"Train Epoch: 1 [22000/60000 (37%)]\tLoss: 0.350119\n",
"Train Epoch: 1 [23000/60000 (38%)]\tLoss: 0.575456\n",
"Train Epoch: 1 [24000/60000 (40%)]\tLoss: 0.402916\n",
"Train Epoch: 1 [25000/60000 (42%)]\tLoss: 0.335860\n",
"Train Epoch: 1 [26000/60000 (43%)]\tLoss: 0.498826\n",
"Train Epoch: 1 [27000/60000 (45%)]\tLoss: 0.331436\n",
"Train Epoch: 1 [28000/60000 (47%)]\tLoss: 0.466167\n",
"Train Epoch: 1 [29000/60000 (48%)]\tLoss: 0.514644\n",
"Train Epoch: 1 [30000/60000 (50%)]\tLoss: 0.423657\n",
"Train Epoch: 1 [31000/60000 (52%)]\tLoss: 0.424075\n",
"Train Epoch: 1 [32000/60000 (53%)]\tLoss: 0.365285\n",
"Train Epoch: 1 [33000/60000 (55%)]\tLoss: 0.289063\n",
"Train Epoch: 1 [34000/60000 (57%)]\tLoss: 0.538227\n",
"Train Epoch: 1 [35000/60000 (58%)]\tLoss: 0.546361\n",
"Train Epoch: 1 [36000/60000 (60%)]\tLoss: 0.383847\n",
"Train Epoch: 1 [37000/60000 (62%)]\tLoss: 0.311942\n",
"Train Epoch: 1 [38000/60000 (63%)]\tLoss: 0.458801\n",
"Train Epoch: 1 [39000/60000 (65%)]\tLoss: 0.481895\n",
"Train Epoch: 1 [40000/60000 (67%)]\tLoss: 0.308024\n",
"Train Epoch: 1 [41000/60000 (68%)]\tLoss: 0.435803\n",
"Train Epoch: 1 [42000/60000 (70%)]\tLoss: 0.457417\n",
"Train Epoch: 1 [43000/60000 (72%)]\tLoss: 0.310509\n",
"Train Epoch: 1 [44000/60000 (73%)]\tLoss: 0.347369\n",
"Train Epoch: 1 [45000/60000 (75%)]\tLoss: 0.341391\n",
"Train Epoch: 1 [46000/60000 (77%)]\tLoss: 0.464614\n",
"Train Epoch: 1 [47000/60000 (78%)]\tLoss: 0.432629\n",
"Train Epoch: 1 [48000/60000 (80%)]\tLoss: 0.519174\n",
"Train Epoch: 1 [49000/60000 (82%)]\tLoss: 0.368225\n",
"Train Epoch: 1 [50000/60000 (83%)]\tLoss: 0.325527\n",
"Train Epoch: 1 [51000/60000 (85%)]\tLoss: 0.455275\n",
"Train Epoch: 1 [52000/60000 (87%)]\tLoss: 0.423473\n",
"Train Epoch: 1 [53000/60000 (88%)]\tLoss: 0.354114\n",
"Train Epoch: 1 [54000/60000 (90%)]\tLoss: 0.328097\n",
"Train Epoch: 1 [55000/60000 (92%)]\tLoss: 0.353430\n",
"Train Epoch: 1 [56000/60000 (93%)]\tLoss: 0.419119\n",
"Train Epoch: 1 [57000/60000 (95%)]\tLoss: 0.383263\n",
"Train Epoch: 1 [58000/60000 (97%)]\tLoss: 0.292101\n",
"Train Epoch: 1 [59000/60000 (98%)]\tLoss: 0.251011\n",
"Train Epoch: 2 [0/60000 (0%)]\tLoss: 0.412635\n",
"Train Epoch: 2 [1000/60000 (2%)]\tLoss: 0.467144\n",
"Train Epoch: 2 [2000/60000 (3%)]\tLoss: 0.246448\n",
"Train Epoch: 2 [3000/60000 (5%)]\tLoss: 0.341133\n",
"Train Epoch: 2 [4000/60000 (7%)]\tLoss: 0.412845\n",
"Train Epoch: 2 [5000/60000 (8%)]\tLoss: 0.338513\n",
"Train Epoch: 2 [6000/60000 (10%)]\tLoss: 0.426739\n",
"Train Epoch: 2 [7000/60000 (12%)]\tLoss: 0.387864\n",
"Train Epoch: 2 [8000/60000 (13%)]\tLoss: 0.441497\n",
"Train Epoch: 2 [9000/60000 (15%)]\tLoss: 0.475687\n",
"Train Epoch: 2 [10000/60000 (17%)]\tLoss: 0.390743\n",
"Train Epoch: 2 [11000/60000 (18%)]\tLoss: 0.432761\n",
"Train Epoch: 2 [12000/60000 (20%)]\tLoss: 0.306278\n",
"Train Epoch: 2 [13000/60000 (22%)]\tLoss: 0.322986\n",
"Train Epoch: 2 [14000/60000 (23%)]\tLoss: 0.460900\n",
"Train Epoch: 2 [15000/60000 (25%)]\tLoss: 0.350266\n",
"Train Epoch: 2 [16000/60000 (27%)]\tLoss: 0.274468\n",
"Train Epoch: 2 [17000/60000 (28%)]\tLoss: 0.341584\n",
"Train Epoch: 2 [18000/60000 (30%)]\tLoss: 0.237921\n",
"Train Epoch: 2 [19000/60000 (32%)]\tLoss: 0.387657\n",
"Train Epoch: 2 [20000/60000 (33%)]\tLoss: 0.379017\n",
"Train Epoch: 2 [21000/60000 (35%)]\tLoss: 0.267510\n",
"Train Epoch: 2 [22000/60000 (37%)]\tLoss: 0.270873\n",
"Train Epoch: 2 [23000/60000 (38%)]\tLoss: 0.437596\n",
"Train Epoch: 2 [24000/60000 (40%)]\tLoss: 0.265560\n",
"Train Epoch: 2 [25000/60000 (42%)]\tLoss: 0.279858\n",
"Train Epoch: 2 [26000/60000 (43%)]\tLoss: 0.332311\n",
"Train Epoch: 2 [27000/60000 (45%)]\tLoss: 0.225879\n",
"Train Epoch: 2 [28000/60000 (47%)]\tLoss: 0.278610\n",
"Train Epoch: 2 [29000/60000 (48%)]\tLoss: 0.286307\n",
"Train Epoch: 2 [30000/60000 (50%)]\tLoss: 0.413894\n",
"Train Epoch: 2 [31000/60000 (52%)]\tLoss: 0.308004\n",
"Train Epoch: 2 [32000/60000 (53%)]\tLoss: 0.280563\n",
"Train Epoch: 2 [33000/60000 (55%)]\tLoss: 0.264711\n",
"Train Epoch: 2 [34000/60000 (57%)]\tLoss: 0.417848\n",
"Train Epoch: 2 [35000/60000 (58%)]\tLoss: 0.512428\n",
"Train Epoch: 2 [36000/60000 (60%)]\tLoss: 0.319987\n",
"Train Epoch: 2 [37000/60000 (62%)]\tLoss: 0.251197\n",
"Train Epoch: 2 [38000/60000 (63%)]\tLoss: 0.325069\n",
"Train Epoch: 2 [39000/60000 (65%)]\tLoss: 0.303394\n",
"Train Epoch: 2 [40000/60000 (67%)]\tLoss: 0.280159\n",
"Train Epoch: 2 [41000/60000 (68%)]\tLoss: 0.426005\n",
"Train Epoch: 2 [42000/60000 (70%)]\tLoss: 0.363313\n",
"Train Epoch: 2 [43000/60000 (72%)]\tLoss: 0.271922\n",
"Train Epoch: 2 [44000/60000 (73%)]\tLoss: 0.266253\n",
"Train Epoch: 2 [45000/60000 (75%)]\tLoss: 0.238750\n",
"Train Epoch: 2 [46000/60000 (77%)]\tLoss: 0.376987\n",
"Train Epoch: 2 [47000/60000 (78%)]\tLoss: 0.282419\n",
"Train Epoch: 2 [48000/60000 (80%)]\tLoss: 0.462140\n",
"Train Epoch: 2 [49000/60000 (82%)]\tLoss: 0.255249\n",
"Train Epoch: 2 [50000/60000 (83%)]\tLoss: 0.241605\n",
"Train Epoch: 2 [51000/60000 (85%)]\tLoss: 0.440393\n",
"Train Epoch: 2 [52000/60000 (87%)]\tLoss: 0.313707\n",
"Train Epoch: 2 [53000/60000 (88%)]\tLoss: 0.338831\n",
"Train Epoch: 2 [54000/60000 (90%)]\tLoss: 0.236808\n",
"Train Epoch: 2 [55000/60000 (92%)]\tLoss: 0.262569\n",
"Train Epoch: 2 [56000/60000 (93%)]\tLoss: 0.415122\n",
"Train Epoch: 2 [57000/60000 (95%)]\tLoss: 0.362813\n",
"Train Epoch: 2 [58000/60000 (97%)]\tLoss: 0.321227\n",
"Train Epoch: 2 [59000/60000 (98%)]\tLoss: 0.239889\n",
"Training is finished\n"
]
}
@ -345,7 +425,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 5,
"metadata": {
"pycharm": {
"name": "#%%\n"
@ -371,13 +451,14 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Runtime(name='mpi-distributed', phase='Unknown', accelerator='Unknown', accelerator_count='Unknown')\n",
"Runtime(name='torch-distributed', phase='pre-training', accelerator='gpu-tesla-v100-16gb', accelerator_count='4')\n"
]
}
@ -393,12 +474,12 @@
"source": [
"## Run the Distributed TrainJob\n",
"\n",
"Kubeflow TrainJob will train the above model on 4 PyTorch nodes."
"Kubeflow TrainJob will train the above model on 3 PyTorch nodes."
]
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
@ -408,13 +489,13 @@
" trainer=Trainer(\n",
" func=train_fashion_mnist,\n",
" # Set how many PyTorch nodes you want to use for distributed training.\n",
" num_nodes=4,\n",
" num_nodes=3,\n",
" # Set the resources for each PyTorch node.\n",
" resources_per_node={\n",
" \"cpu\": 5,\n",
" \"cpu\": 3,\n",
" \"memory\": \"16Gi\",\n",
" # Comment this to distribute the TrainJob using CPU nodes.\n",
" \"nvidia.com/gpu\": 1,\n",
" # Uncomment this to distribute the TrainJob using GPU nodes.\n",
" # \"nvidia.com/gpu\": 1,\n",
" },\n",
" ),\n",
")"
@ -428,14 +509,42 @@
"\n",
"You can check the components of TrainJob that's created.\n",
"\n",
"Since the TrainJob performs distributed training across 4 nodes, it generates 4 components: `trainer-node-0` .. `trainer-node-3`.\n",
"Since the TrainJob performs distributed training across 3 nodes, it generates 3 components: `trainer-node-0` .. `trainer-node-2`.\n",
"\n",
"You can get the individual status for each of these components."
]
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"import time\n",
"\n",
"def wait_for_job_running():\n",
" for _ in range(100):\n",
" trainjob = client.get_job(name=job_name)\n",
" for c in trainjob.components:\n",
" if c.name == \"trainer-node-0\" and c.status == \"Running\":\n",
" return\n",
" print(\"Wait for TrainJob running status. Sleep for 5 seconds\")\n",
" time.sleep(5)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"# TODO (andreyvelich): Use wait_for_job_status API from TrainerClient() when it is implemented.\n",
"wait_for_job_running()"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
@ -447,8 +556,6 @@
"Component: trainer-node-1, Status: Running, Devices: gpu x 1\n",
"\n",
"Component: trainer-node-2, Status: Running, Devices: gpu x 1\n",
"\n",
"Component: trainer-node-3, Status: Running, Devices: gpu x 1\n",
"\n"
]
}
@ -466,12 +573,12 @@
"\n",
"We can use the `get_job_logs()` API to get the TrainJob logs.\n",
"\n",
"Since we run training on 4 GPUs, every PyTorch node uses 60,000/4 = 15,000 images from the dataset."
"Since we run training on 3 GPUs, every PyTorch node uses 60,000/3 = 20,000 images from the dataset."
]
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 23,
"metadata": {},
"outputs": [
{
@ -479,53 +586,63 @@
"output_type": "stream",
"text": [
"[trainer-node]: Using Device: cuda, Backend: nccl\n",
"[trainer-node]: Distributed Training for WORLD_SIZE: 4, RANK: 0, LOCAL_RANK: 0\n",
"[trainer-node]: Distributed Training for WORLD_SIZE: 3, RANK: 0, LOCAL_RANK: 0\n",
"[trainer-node]: Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz\n",
"[trainer-node]: Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to ./data/FashionMNIST/raw/train-images-idx3-ubyte.gz\n",
"100%|██████████| 26.4M/26.4M [00:02<00:00, 12.5MB/s]\n",
"100%|██████████| 26.4M/26.4M [00:02<00:00, 11.1MB/s]\n",
"[trainer-node]: Extracting ./data/FashionMNIST/raw/train-images-idx3-ubyte.gz to ./data/FashionMNIST/raw\n",
"[trainer-node]: Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz\n",
"[trainer-node]: Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw/train-labels-idx1-ubyte.gz\n",
"100%|██████████| 29.5k/29.5k [00:00<00:00, 214kB/s]\n",
"100%|██████████| 29.5k/29.5k [00:00<00:00, 74.8MB/s]\n",
"[trainer-node]: Extracting ./data/FashionMNIST/raw/train-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw\n",
"[trainer-node]: Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz\n",
"[trainer-node]: Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to ./data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz\n",
"100%|██████████| 4.42M/4.42M [00:01<00:00, 3.50MB/s]\n",
"100%|██████████| 4.42M/4.42M [00:01<00:00, 3.90MB/s]\n",
"[trainer-node]: Extracting ./data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz to ./data/FashionMNIST/raw\n",
"[trainer-node]: Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz\n",
"[trainer-node]: Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz\n",
"100%|██████████| 5.15k/5.15k [00:00<00:00, 37.8MB/s]\n",
"100%|██████████| 5.15k/5.15k [00:00<00:00, 38.1MB/s]\n",
"[trainer-node]: Extracting ./data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw\n",
"[trainer-node]: Train Epoch: 1 [0/60000 (0%)]\tLoss: 2.305451\n",
"[trainer-node]: Train Epoch: 1 [1000/60000 (7%)]\tLoss: 2.056247\n",
"[trainer-node]: Train Epoch: 1 [2000/60000 (13%)]\tLoss: 2.166955\n",
"[trainer-node]: Train Epoch: 1 [3000/60000 (20%)]\tLoss: 1.045183\n",
"[trainer-node]: Train Epoch: 1 [4000/60000 (27%)]\tLoss: 0.767518\n",
"[trainer-node]: Train Epoch: 1 [5000/60000 (33%)]\tLoss: 0.697382\n",
"[trainer-node]: Train Epoch: 1 [6000/60000 (40%)]\tLoss: 0.638373\n",
"[trainer-node]: Train Epoch: 1 [7000/60000 (47%)]\tLoss: 0.667810\n",
"[trainer-node]: Train Epoch: 1 [8000/60000 (53%)]\tLoss: 0.541413\n",
"[trainer-node]: Train Epoch: 1 [9000/60000 (60%)]\tLoss: 0.564223\n",
"[trainer-node]: Train Epoch: 1 [10000/60000 (67%)]\tLoss: 0.425999\n",
"[trainer-node]: Train Epoch: 1 [11000/60000 (73%)]\tLoss: 0.564535\n",
"[trainer-node]: Train Epoch: 1 [12000/60000 (80%)]\tLoss: 0.459158\n",
"[trainer-node]: Train Epoch: 1 [13000/60000 (87%)]\tLoss: 0.545110\n",
"[trainer-node]: Train Epoch: 1 [14000/60000 (93%)]\tLoss: 0.471710\n",
"[trainer-node]: Train Epoch: 2 [0/60000 (0%)]\tLoss: 0.520992\n",
"[trainer-node]: Train Epoch: 2 [1000/60000 (7%)]\tLoss: 0.440295\n",
"[trainer-node]: Train Epoch: 2 [2000/60000 (13%)]\tLoss: 0.436745\n",
"[trainer-node]: Train Epoch: 2 [3000/60000 (20%)]\tLoss: 0.359110\n",
"[trainer-node]: Train Epoch: 2 [4000/60000 (27%)]\tLoss: 0.493791\n",
"[trainer-node]: Train Epoch: 2 [5000/60000 (33%)]\tLoss: 0.384616\n",
"[trainer-node]: Train Epoch: 2 [6000/60000 (40%)]\tLoss: 0.529568\n",
"[trainer-node]: Train Epoch: 2 [7000/60000 (47%)]\tLoss: 0.443400\n",
"[trainer-node]: Train Epoch: 2 [8000/60000 (53%)]\tLoss: 0.352168\n",
"[trainer-node]: Train Epoch: 2 [9000/60000 (60%)]\tLoss: 0.431930\n",
"[trainer-node]: Train Epoch: 2 [10000/60000 (67%)]\tLoss: 0.282820\n",
"[trainer-node]: Train Epoch: 2 [11000/60000 (73%)]\tLoss: 0.412141\n",
"[trainer-node]: Train Epoch: 2 [12000/60000 (80%)]\tLoss: 0.367190\n",
"[trainer-node]: Train Epoch: 2 [13000/60000 (87%)]\tLoss: 0.355502\n",
"[trainer-node]: Train Epoch: 2 [14000/60000 (93%)]\tLoss: 0.326105\n",
"[trainer-node]: Train Epoch: 1 [0/60000 (0%)]\tLoss: 2.298486\n",
"[trainer-node]: Train Epoch: 1 [1000/60000 (5%)]\tLoss: 2.023109\n",
"[trainer-node]: Train Epoch: 1 [2000/60000 (10%)]\tLoss: 2.210286\n",
"[trainer-node]: Train Epoch: 1 [3000/60000 (15%)]\tLoss: 1.308523\n",
"[trainer-node]: Train Epoch: 1 [4000/60000 (20%)]\tLoss: 0.896595\n",
"[trainer-node]: Train Epoch: 1 [5000/60000 (25%)]\tLoss: 0.790926\n",
"[trainer-node]: Train Epoch: 1 [6000/60000 (30%)]\tLoss: 0.694305\n",
"[trainer-node]: Train Epoch: 1 [7000/60000 (35%)]\tLoss: 0.532733\n",
"[trainer-node]: Train Epoch: 1 [8000/60000 (40%)]\tLoss: 0.571943\n",
"[trainer-node]: Train Epoch: 1 [9000/60000 (45%)]\tLoss: 0.593324\n",
"[trainer-node]: Train Epoch: 1 [10000/60000 (50%)]\tLoss: 0.570712\n",
"[trainer-node]: Train Epoch: 1 [11000/60000 (55%)]\tLoss: 0.416316\n",
"[trainer-node]: Train Epoch: 1 [12000/60000 (60%)]\tLoss: 0.438910\n",
"[trainer-node]: Train Epoch: 1 [13000/60000 (65%)]\tLoss: 0.486123\n",
"[trainer-node]: Train Epoch: 1 [14000/60000 (70%)]\tLoss: 0.432043\n",
"[trainer-node]: Train Epoch: 1 [15000/60000 (75%)]\tLoss: 0.374424\n",
"[trainer-node]: Train Epoch: 1 [16000/60000 (80%)]\tLoss: 0.366622\n",
"[trainer-node]: Train Epoch: 1 [17000/60000 (85%)]\tLoss: 0.495783\n",
"[trainer-node]: Train Epoch: 1 [18000/60000 (90%)]\tLoss: 0.381096\n",
"[trainer-node]: Train Epoch: 1 [19000/60000 (95%)]\tLoss: 0.385782\n",
"[trainer-node]: Train Epoch: 2 [0/60000 (0%)]\tLoss: 0.380943\n",
"[trainer-node]: Train Epoch: 2 [1000/60000 (5%)]\tLoss: 0.466423\n",
"[trainer-node]: Train Epoch: 2 [2000/60000 (10%)]\tLoss: 0.452478\n",
"[trainer-node]: Train Epoch: 2 [3000/60000 (15%)]\tLoss: 0.409038\n",
"[trainer-node]: Train Epoch: 2 [4000/60000 (20%)]\tLoss: 0.370588\n",
"[trainer-node]: Train Epoch: 2 [5000/60000 (25%)]\tLoss: 0.419151\n",
"[trainer-node]: Train Epoch: 2 [6000/60000 (30%)]\tLoss: 0.378228\n",
"[trainer-node]: Train Epoch: 2 [7000/60000 (35%)]\tLoss: 0.328720\n",
"[trainer-node]: Train Epoch: 2 [8000/60000 (40%)]\tLoss: 0.557514\n",
"[trainer-node]: Train Epoch: 2 [9000/60000 (45%)]\tLoss: 0.332585\n",
"[trainer-node]: Train Epoch: 2 [10000/60000 (50%)]\tLoss: 0.374972\n",
"[trainer-node]: Train Epoch: 2 [11000/60000 (55%)]\tLoss: 0.344400\n",
"[trainer-node]: Train Epoch: 2 [12000/60000 (60%)]\tLoss: 0.359475\n",
"[trainer-node]: Train Epoch: 2 [13000/60000 (65%)]\tLoss: 0.335085\n",
"[trainer-node]: Train Epoch: 2 [14000/60000 (70%)]\tLoss: 0.352953\n",
"[trainer-node]: Train Epoch: 2 [15000/60000 (75%)]\tLoss: 0.367524\n",
"[trainer-node]: Train Epoch: 2 [16000/60000 (80%)]\tLoss: 0.313468\n",
"[trainer-node]: Train Epoch: 2 [17000/60000 (85%)]\tLoss: 0.385835\n",
"[trainer-node]: Train Epoch: 2 [18000/60000 (90%)]\tLoss: 0.324088\n",
"[trainer-node]: Train Epoch: 2 [19000/60000 (95%)]\tLoss: 0.336721\n",
"[trainer-node]: Training is finished\n"
]
}
@ -545,11 +662,11 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"client.delete_job(job_name)"
"# client.delete_job(job_name)"
]
},
{

8
go.mod
View File

@ -20,11 +20,14 @@ require (
k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738
sigs.k8s.io/controller-runtime v0.20.2
sigs.k8s.io/jobset v0.8.0
sigs.k8s.io/kind v0.27.0
sigs.k8s.io/scheduler-plugins v0.30.6
sigs.k8s.io/structured-merge-diff/v4 v4.5.0
)
require (
al.essio.dev/pkg/shellescape v1.5.1 // indirect
github.com/BurntSushi/toml v1.4.0 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
@ -43,19 +46,24 @@ require (
github.com/google/gnostic-models v0.6.8 // indirect
github.com/google/gofuzz v1.2.0 // indirect
github.com/google/pprof v0.0.0-20241210010833-40e02aabc2ad // indirect
github.com/google/safetext v0.0.0-20220905092116-b49f7bc46da2 // indirect
github.com/google/uuid v1.6.0 // indirect
github.com/inconshreveable/mousetrap v1.1.0 // indirect
github.com/josharian/intern v1.0.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/klauspost/compress v1.17.11 // indirect
github.com/mailru/easyjson v0.7.7 // indirect
github.com/mattn/go-isatty v0.0.20 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/pelletier/go-toml v1.9.5 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/prometheus/client_golang v1.21.0 // indirect
github.com/prometheus/client_model v0.6.1 // indirect
github.com/prometheus/common v0.62.0 // indirect
github.com/prometheus/procfs v0.15.1 // indirect
github.com/spf13/cobra v1.8.1 // indirect
github.com/spf13/pflag v1.0.5 // indirect
github.com/x448/float16 v0.8.4 // indirect
go.uber.org/atomic v1.11.0 // indirect

21
go.sum
View File

@ -1,7 +1,12 @@
al.essio.dev/pkg/shellescape v1.5.1 h1:86HrALUujYS/h+GtqoB26SBEdkWfmMI6FubjXlsXyho=
al.essio.dev/pkg/shellescape v1.5.1/go.mod h1:6sIqp7X2P6mThCQ7twERpZTuigpr6KbZWtls1U8I890=
github.com/BurntSushi/toml v1.4.0 h1:kuoIxZQy2WRRk1pttg9asf+WVv6tWQuBNVmK8+nqPr0=
github.com/BurntSushi/toml v1.4.0/go.mod h1:ukJfTF/6rtPPRCnwkur4qwRxa8vTRFBF0uk2lLoLwho=
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM=
@ -44,8 +49,14 @@ github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0=
github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
github.com/google/pprof v0.0.0-20241210010833-40e02aabc2ad h1:a6HEuzUHeKH6hwfN/ZoQgRgVIWFJljSWa/zetS2WTvg=
github.com/google/pprof v0.0.0-20241210010833-40e02aabc2ad/go.mod h1:vavhavw2zAxS5dIdcRluK6cSGGPlZynqzFM8NdvU144=
github.com/google/safetext v0.0.0-20220905092116-b49f7bc46da2 h1:SJ+NtwL6QaZ21U+IrK7d0gGgpjGGvd2kz+FzTHVzdqI=
github.com/google/safetext v0.0.0-20220905092116-b49f7bc46da2/go.mod h1:Tv1PlzqC9t8wNnpPdctvtSUOPUUg4SHeE6vR1Ir2hmg=
github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510 h1:El6M4kTTCOh6aBiKaUGG7oYTSPP8MxqL4YI3kZKwcP4=
github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510/go.mod h1:pupxD2MaaD3pAXIBCelhxNneeOaAeabZDe5s4K6zSpQ=
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8=
github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY=
github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y=
github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
@ -62,6 +73,8 @@ github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0
github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw=
github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0=
github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc=
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
@ -77,6 +90,8 @@ github.com/open-policy-agent/cert-controller v0.12.0 h1:RKXlBafMcCh+++I1geJetXo7
github.com/open-policy-agent/cert-controller v0.12.0/go.mod h1:N5bCFXdAXMYx0PdS6ZQ9lrDQQMz+F6deoChym6VleXw=
github.com/open-policy-agent/frameworks/constraint v0.0.0-20241101234656-e78c8abd754a h1:gQtOJ50XFyL2Xh3lDD9zP4KQ2PY4mZKQ9hDcWc81Sp8=
github.com/open-policy-agent/frameworks/constraint v0.0.0-20241101234656-e78c8abd754a/go.mod h1:tI7nc6H6os2UYZRvSm9Y7bq4oMoXqhwA0WfnqKpoAgc=
github.com/pelletier/go-toml v1.9.5 h1:4yBQzkHv+7BHq2PQUZF3Mx0IYxG7LsP222s7Agd3ve8=
github.com/pelletier/go-toml v1.9.5/go.mod h1:u1nR/EPcESfeI/szUZKdtJ0xRNbUoANCkoOuaOx1Y+c=
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
@ -92,6 +107,9 @@ github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0leargg
github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk=
github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8=
github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4=
github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
github.com/spf13/cobra v1.8.1 h1:e5/vxKd/rZsfSJMUX1agtjeTDf+qv1/JdBF8gg5k9ZM=
github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y=
github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA=
github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
@ -135,6 +153,7 @@ golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.28.0 h1:Fksou7UEQUWlKvIdsqzJmUmCX3cZuD2+P3XyyzwMhlA=
golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/term v0.27.0 h1:WP60Sv1nlK1T6SupCHbXzSaN0b9wUmsPoRS9b61A23Q=
@ -196,6 +215,8 @@ sigs.k8s.io/jobset v0.8.0 h1:80cJcPld+IMdKFOqzEW4et3Y6lGAPcP8YmBZ+aiKGYA=
sigs.k8s.io/jobset v0.8.0/go.mod h1:yitjuGOExl2p964nhyevQGIkfiPSRHcdC3zNBneKCT8=
sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 h1:/Rv+M11QRah1itp8VhT6HoVx1Ray9eB4DBr+K+/sCJ8=
sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3/go.mod h1:18nIHnGi6636UCz6m8i4DhaJ65T6EruyzmoQqI2BVDo=
sigs.k8s.io/kind v0.27.0 h1:PQ3f0iAWNIj66LYkZ1ivhEg/+Zb6UPMbO+qVei/INZA=
sigs.k8s.io/kind v0.27.0/go.mod h1:RZVFmy6qcwlSWwp6xeIUv7kXCPF3i8MXsEXxW/J+gJY=
sigs.k8s.io/scheduler-plugins v0.30.6 h1:P4pViMVoyVNHWmkG96UtJ4LvxkUIeenIUKLZd09vDyw=
sigs.k8s.io/scheduler-plugins v0.30.6/go.mod h1:EDYYqHmpHR//VYKAeud1TTQbTFSvpdGFeyEg9ejOmnI=
sigs.k8s.io/structured-merge-diff/v4 v4.5.0 h1:nbCitCK2hfnhyiKo6uf2HxUPTCodY6Qaf85SbDIaMBk=

View File

@ -14,63 +14,36 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# TODO (andreyvelich): Refactor this script for Kubeflow Trainer V2
# This bash script is used to run the example notebooks
# This shell script runs a Jupyter Notebook with Papermill.
set -o errexit
set -o nounset
set -o pipefail
set -x
NOTEBOOK_INPUT=""
NOTEBOOK_OUTPUT="-" # outputs to console
NAMESPACE="default"
TRAINING_PYTHON_SDK="./sdk/python"
if [ -z "${NOTEBOOK_INPUT}" ]; then
echo "NOTEBOOK_INPUT env variable must be set to run this script."
exit 1
fi
usage() {
echo "Usage: $0 -i <input_notebook> -o <output_notebook> [-p \"<param> <value>\"...] [-y <params.yaml>]"
echo "Options:"
echo " -i Input notebook (required)"
echo " -o Output notebook (required)"
echo " -k Kubeflow Training Operator Python SDK (optional)"
echo " -n Kubernetes namespace used by tests (optional)"
echo " -h Show this help message"
echo "NOTE: papermill, jupyter and ipykernel are required Python dependencies to run Notebooks"
exit 1
if [ -z "${NOTEBOOK_OUTPUT}" ]; then
echo "NOTEBOOK_OUTPUT env variable must be set to run this script."
exit 1
fi
if [ -z "${PAPERMILL_TIMEOUT}" ]; then
echo "PAPERMILL_TIMEOUT env variable must be set to run this script."
exit 1
fi
print_results() {
kubectl get pods
kubectl describe pod
kubectl describe trainjob
kubectl logs -n kubeflow-system -l app.kubernetes.io/name=trainer
kubectl logs -l jobset.sigs.k8s.io/replicatedjob-name=trainer-node,batch.kubernetes.io/job-completion-index=0 --tail -1
kubectl wait trainjob --for=condition=Complete --all --timeout 3s
}
while getopts "i:o:p:k:n:r:d:h:" opt; do
case "$opt" in
i) NOTEBOOK_INPUT="$OPTARG" ;; # -i for notebook input path
o) NOTEBOOK_OUTPUT="$OPTARG" ;; # -o for notebook output path
k) TRAINING_PYTHON_SDK="$OPTARG" ;; # -k for training operator python sdk
n) NAMESPACE="$OPTARG" ;; # -n for kubernetes namespace used by tests
h) usage ;; # -h for help (usage)
*)
usage
exit 1
;;
esac
done
if [ -z "$NOTEBOOK_INPUT" ]; then
echo "Error: -i notebook input path is required."
exit 1
fi
papermill_cmd="papermill $NOTEBOOK_INPUT $NOTEBOOK_OUTPUT -p training_python_sdk $TRAINING_PYTHON_SDK -p namespace $NAMESPACE"
if ! command -v papermill &>/dev/null; then
echo "Error: papermill is not installed. Please install papermill to proceed."
exit 1
fi
echo "Running command: $papermill_cmd"
$papermill_cmd
if [ $? -ne 0 ]; then
echo "Error: papermill execution failed." >&2
exit 1
fi
echo "Notebook execution completed successfully"
(papermill "${NOTEBOOK_INPUT}" "${NOTEBOOK_OUTPUT}" --execution-timeout "${PAPERMILL_TIMEOUT}" && print_results) ||
(print_results && exit 1)

View File

@ -14,74 +14,67 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# TODO (andreyvelich): Refactor this script for Kubeflow Trainer V2
# This shell script sets up a Kind cluster for Kubeflow Trainer e2e tests.
set -o errexit
set -o nounset
set -o pipefail
set -x
echo "Kind load newly locally built image"
# use cluster name which is used in github actions kind create
kind load docker-image ${TRAINING_CI_IMAGE} --name ${KIND_CLUSTER}
# Configure variables.
KIND=${KIND:-./bin/kind}
K8S_VERSION=${K8S_VERSION:-1.32.0}
KIND_NODE_VERSION=kindest/node:v${K8S_VERSION}
NAMESPACE="kubeflow-system"
TIMEOUT="5m"
echo "Update training operator manifest with newly built image"
cd manifests/overlays/standalone
kustomize edit set image kubeflow/training-operator=${TRAINING_CI_IMAGE}
# Kubeflow Trainer images.
# TODO (andreyvelich): Support initializers images.
CONTROLLER_MANAGER_CI_IMAGE=trainer-controller-manager:test
echo "Build Kubeflow Trainer images"
docker build . -f cmd/trainer-controller-manager/Dockerfile -t ${CONTROLLER_MANAGER_CI_IMAGE}
echo "Installing training operator manifests"
kustomize build . | kubectl apply --server-side -f -
echo "Set the image in Kustomize overlay"
cd manifests/overlays/manager
kustomize edit set image kubeflow/trainer-controller-manager=${CONTROLLER_MANAGER_CI_IMAGE}
if [ "${GANG_SCHEDULER_NAME}" = "scheduler-plugins" ]; then
SCHEDULER_PLUGINS_VERSION=$(go list -m -f "{{.Version}}" sigs.k8s.io/scheduler-plugins)
git clone https://github.com/kubernetes-sigs/scheduler-plugins.git -b "${SCHEDULER_PLUGINS_VERSION}"
echo "Create Kind cluster and load Kubeflow Trainer images"
${KIND} create cluster --image "${KIND_NODE_VERSION}"
${KIND} load docker-image ${CONTROLLER_MANAGER_CI_IMAGE}
echo "Installing Scheduler Plugins ${SCHEDULER_PLUGINS_VERSION}..."
helm install scheduler-plugins scheduler-plugins/manifests/install/charts/as-a-second-scheduler/ --create-namespace \
--namespace scheduler-plugins \
--set controller.image="registry.k8s.io/scheduler-plugins/controller:${SCHEDULER_PLUGINS_VERSION}" \
--set scheduler.image="registry.k8s.io/scheduler-plugins/kube-scheduler:${SCHEDULER_PLUGINS_VERSION}"
echo "Deploy Kubeflow Trainer control plane"
kubectl apply --server-side -k .
echo "Configure gang-scheduling using scheduler-plugins to training-operator"
kubectl patch -n kubeflow deployments training-operator --type='json' \
-p='[{"op": "add", "path": "/spec/template/spec/containers/0/command/1", "value": "--gang-scheduler-name=scheduler-plugins"}]'
elif [ "${GANG_SCHEDULER_NAME}" = "volcano" ]; then
VOLCANO_SCHEDULER_VERSION=$(go list -m -f "{{.Version}}" volcano.sh/apis)
# patch scheduler first so that it is ready when scheduler-deployment installing finished
echo "Configure gang-scheduling using volcano to training-operator"
kubectl patch -n kubeflow deployments training-operator --type='json' \
-p='[{"op": "add", "path": "/spec/template/spec/containers/0/command/1", "value": "--gang-scheduler-name=volcano"}]'
echo "Installing volcano scheduler ${VOLCANO_SCHEDULER_VERSION}..."
kubectl apply -f https://raw.githubusercontent.com/volcano-sh/volcano/${VOLCANO_SCHEDULER_VERSION}/installer/volcano-development.yaml
fi
TIMEOUT=30
until kubectl get pods -n kubeflow | grep training-operator | grep 1/1 || [[ $TIMEOUT -eq 1 ]]; do
sleep 10
TIMEOUT=$((TIMEOUT - 1))
done
if [ "${GANG_SCHEDULER_NAME}" = "scheduler-plugins" ]; then
kubectl wait pods --for=condition=ready -n scheduler-plugins --timeout "${TIMEOUT}s" --all ||
(
kubectl get pods -n scheduler-plugins && kubectl describe pods -n scheduler-plugins
# We should wait until Deployment is in Ready status.
echo "Wait for Kubeflow Trainer to be ready"
(kubectl wait deploy/kubeflow-trainer-controller-manager --for=condition=available -n ${NAMESPACE} --timeout ${TIMEOUT} &&
kubectl wait pods --for=condition=ready -n ${NAMESPACE} --timeout ${TIMEOUT} --all) ||
(
echo "Failed to wait until Kubeflow Trainer is ready" &&
kubectl get pods -n ${NAMESPACE} &&
kubectl describe pods -n ${NAMESPACE} &&
exit 1
)
fi
)
# wait for volcano up
if [ "${GANG_SCHEDULER_NAME}" = "volcano" ]; then
kubectl rollout status deployment -n volcano-system volcano-admission --timeout "${TIMEOUT}s" &&
kubectl rollout status deployment -n volcano-system volcano-scheduler --timeout "${TIMEOUT}s" &&
kubectl rollout status deployment -n volcano-system volcano-controllers --timeout "${TIMEOUT}s" ||
(
kubectl get pods -n volcano-system && kubectl describe pods -n volcano-system
exit 1
)
fi
print_cluster_info() {
kubectl version
kubectl cluster-info
kubectl get nodes
kubectl get pods -n ${NAMESPACE}
kubectl describe pod -n ${NAMESPACE}
}
kubectl version
kubectl cluster-info
kubectl get nodes
kubectl get pods -n kubeflow
kubectl describe pods -n kubeflow
# TODO (andreyvelich): Currently, we print manager logs due to flaky test.
echo "Deploy Kubeflow Trainer runtimes"
(cd ../runtimes && kubectl apply --server-side -k .) || (
kubectl logs -n ${NAMESPACE} -l app.kubernetes.io/name=trainer &&
print_cluster_info &&
exit 1
)
# TODO (andreyvelich): Discuss how we want to pre-load runtime images to the Kind cluster.
TORCH_RUNTIME_IMAGE=pytorch/pytorch:2.5.0-cuda12.4-cudnn9-runtime
docker pull ${TORCH_RUNTIME_IMAGE}
${KIND} load docker-image ${TORCH_RUNTIME_IMAGE}
print_cluster_info

View File

@ -18,4 +18,5 @@ package tools
import (
_ "k8s.io/code-generator"
_ "sigs.k8s.io/kind/cmd/kind/app"
)

View File

@ -14,8 +14,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# This shell is used to auto generate some useful tools for k8s, such as clientset, lister, informer and so on.
# We don't use this tool to generate deepcopy because kubebuilder (controller-tools) has covered that part.
# This shell script auto-generates useful k8s tooling code, such as the clientset, listers, and informers.
set -o errexit
set -o nounset

View File

@ -202,6 +202,12 @@ class TrainerClient:
trainer.resources_per_node
)
# Set numProcPerNode to the Trainer.
if trainer and trainer.resources_per_node:
trainer_crd.num_proc_per_node = utils.get_num_proc_per_node(
trainer.resources_per_node
)
# Add command and args to the Trainer if training function is set.
if trainer and trainer.func:
trainer_crd.command = constants.DEFAULT_COMMAND

View File

@ -14,6 +14,7 @@
import inspect
import json
import math
import os
import queue
import textwrap
@ -122,6 +123,31 @@ def get_resources_per_node(resources_per_node: dict) -> client.V1ResourceRequire
return resources
# TODO (andreyvelich): Move this part to the Kubeflow Trainer torch plugins.
# Ref issue: https://github.com/kubeflow/trainer/issues/2407
def get_num_proc_per_node(resources_per_node: dict) -> object:
"""
Get the Trainer numProcPerNode from the given resources.
"""
resources = {k.lower(): v for k, v in resources_per_node.items()}
# NumProcPerNode is equal to number of GPUs or CPUs, otherwise set it to `auto`
for key, value in resources.items():
if "gpu" in key:
return value
for key, value in resources.items():
if "cpu" in key:
# For now, we can't convert milliCPUs to the numProcPerNode.
try:
value = math.ceil(int(value))
return value
except Exception:
pass
return "auto"
def get_args_using_train_func(
train_func: Callable,
train_func_parameters: Optional[Dict[str, Any]] = None,

View File

@ -31,8 +31,9 @@ classifiers = [
]
dependencies = [
"kubernetes>=27.2.0",
"pydantic>=2.10.0",
"jobset @ git+https://github.com/kubernetes-sigs/jobset.git@v0.8.0#subdirectory=sdk/python",
# TODO (andreyvelich): Update JobSet to v0.8.0 once this PR is merged: https://github.com/kubeflow/trainer/pull/2466
# "pydantic>=2.10.0",
"jobset @ git+https://github.com/kubernetes-sigs/jobset.git@v0.7.2#subdirectory=sdk/python",
]
[project.urls]

78
test/e2e/e2e_test.go Normal file
View File

@ -0,0 +1,78 @@
package e2e
import (
"github.com/onsi/ginkgo/v2"
"github.com/onsi/gomega"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"sigs.k8s.io/controller-runtime/pkg/client"
jobsetconsts "sigs.k8s.io/jobset/pkg/constants"
trainer "github.com/kubeflow/trainer/pkg/apis/trainer/v1alpha1"
"github.com/kubeflow/trainer/pkg/constants"
testingutil "github.com/kubeflow/trainer/pkg/util/testing"
"github.com/kubeflow/trainer/test/util"
)
var _ = ginkgo.Describe("TrainJob e2e", func() {
// Each test runs in a separate namespace.
var ns *corev1.Namespace
// Create test namespace before each test.
ginkgo.BeforeEach(func() {
ns = &corev1.Namespace{
ObjectMeta: metav1.ObjectMeta{
GenerateName: "e2e-",
},
}
gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed())
// Wait for namespace to exist before proceeding with test.
gomega.Eventually(func(g gomega.Gomega) {
g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(ns), ns)).Should(gomega.Succeed())
}, util.TimeoutE2E, util.Interval).Should(gomega.Succeed())
})
// Delete test namespace after each test.
ginkgo.AfterEach(func() {
// Delete test namespace after each test.
gomega.Expect(k8sClient.Delete(ctx, ns)).To(gomega.Succeed())
})
// These tests create TrainJob that reference supported runtime without any additional changes.
ginkgo.When("creating TrainJob", func() {
// Verify `torch-distributed` ClusterTrainingRuntime.
ginkgo.It("should create TrainJob with PyTorch runtime reference", func() {
// Create a TrainJob.
trainJob := testingutil.MakeTrainJobWrapper(ns.Name, "e2e-test").
RuntimeRef(trainer.SchemeGroupVersion.WithKind(trainer.ClusterTrainingRuntimeKind), "torch-distributed").
Obj()
ginkgo.By("Create a TrainJob with torch-distributed runtime reference", func() {
gomega.Expect(k8sClient.Create(ctx, trainJob)).Should(gomega.Succeed())
})
// Wait for TrainJob to be in Succeeded status.
ginkgo.By("Wait for TrainJob to be in Succeeded status", func() {
gomega.Eventually(func(g gomega.Gomega) {
gotTrainJob := &trainer.TrainJob{}
g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(trainJob), gotTrainJob)).Should(gomega.Succeed())
g.Expect(gotTrainJob.Status.Conditions).Should(gomega.BeComparableTo([]metav1.Condition{
{
Type: trainer.TrainJobCreated,
Status: metav1.ConditionTrue,
Reason: trainer.TrainJobJobsCreationSucceededReason,
Message: constants.TrainJobJobsCreationSucceededMessage,
},
{
Type: trainer.TrainJobComplete,
Status: metav1.ConditionTrue,
Reason: jobsetconsts.AllJobsCompletedReason,
Message: jobsetconsts.AllJobsCompletedMessage,
},
}, util.IgnoreConditions))
}, util.TimeoutE2E, util.Interval).Should(gomega.Succeed())
})
})
})
})

56
test/e2e/suite_test.go Normal file
View File

@ -0,0 +1,56 @@
/*
Copyright 2024 The Kubeflow Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package e2e
import (
"context"
"testing"
"github.com/onsi/ginkgo/v2"
"github.com/onsi/gomega"
"k8s.io/client-go/kubernetes/scheme"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/client/config"
trainer "github.com/kubeflow/trainer/pkg/apis/trainer/v1alpha1"
)
var (
k8sClient client.Client
ctx context.Context
)
func TestAPIs(t *testing.T) {
gomega.RegisterFailHandler(ginkgo.Fail)
ginkgo.RunSpecs(t, "Kubeflow Trainer E2E Suite")
}
var _ = ginkgo.BeforeSuite(func() {
ctx = context.Background()
// Get Kubernetes config.
cfg := config.GetConfigOrDie()
gomega.ExpectWithOffset(1, cfg).NotTo(gomega.BeNil())
// Add Trainer APIs.
err := trainer.AddToScheme(scheme.Scheme)
gomega.ExpectWithOffset(1, err).NotTo(gomega.HaveOccurred())
// Configure k8s client.
k8sClient, err = client.New(cfg, client.Options{Scheme: scheme.Scheme})
gomega.Expect(err).NotTo(gomega.HaveOccurred())
gomega.Expect(k8sClient).NotTo(gomega.BeNil())
})

View File

@ -17,14 +17,16 @@ limitations under the License.
package util
import (
"time"
"github.com/google/go-cmp/cmp"
"github.com/google/go-cmp/cmp/cmpopts"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"time"
)
const (
Timeout = 5 * time.Second
TimeoutE2E = 10 * time.Minute // E2Es require a longer timeout due to large image pull
ConsistentDuration = time.Second
Interval = time.Millisecond * 250
)