mirror of https://github.com/kubeflow/trainer.git
chore(test): Add E2E tests for Kubeflow Trainer (#2470)
* Add e2e tests for Kubeflow Trainer Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com> * Add timeout for papermill Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com> * Add output as part of make command Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com> * Add k8s version to setup cluster Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com> * Fix Kind k8s version Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com> * Fix 1.29 version Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com> * Create script to run Notebook Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com> * Download dataset when local_rank=0 Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com> * Update test/e2e/e2e_test.go Co-authored-by: Yuki Iwai <yuki.iwai.tz@gmail.com> Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com> * Refactor Go e2e tests Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com> * Bump k8s to 1.29.14 Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com> * Install Kind from go mod Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com> * Fix path for Kind package Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com> * Fix Go e2e Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com> * Reduce number of CPUs Export Notebook as artifact Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com> * Print logs due to flaky test Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com> * Fix artifact path Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com> * docker pull image Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com> * Fix path Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com> * Add k8s version to output name Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com> * Remove install Kind cmd Signed-off-by: Andrey 
Velichkevich <andrey.velichkevich@gmail.com> --------- Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com> Co-authored-by: Yuki Iwai <yuki.iwai.tz@gmail.com>
This commit is contained in:
parent
3ec8f0705f
commit
9e785750d0
|
|
@ -1,99 +0,0 @@
|
|||
# TODO (andreyvelich): Refactor this once we have e2e test for Kubeflow Trainer.
|
||||
name: Setup E2E test template
|
||||
description: A composite action to setup e2e tests
|
||||
|
||||
inputs:
|
||||
kubernetes-version:
|
||||
required: true
|
||||
description: Kubernetes version
|
||||
python-version:
|
||||
required: true
|
||||
description: Python version
|
||||
gang-scheduler-name:
|
||||
required: false
|
||||
default: "none"
|
||||
description: Gang scheduler name
|
||||
|
||||
runs:
|
||||
using: composite
|
||||
steps:
|
||||
# This step is a Workaround to avoid the "No space left on device" error.
|
||||
# ref: https://github.com/actions/runner-images/issues/2840
|
||||
- name: Remove unnecessary files
|
||||
shell: bash
|
||||
run: |
|
||||
echo "Disk usage before cleanup:"
|
||||
df -hT
|
||||
|
||||
sudo rm -rf /usr/share/dotnet
|
||||
sudo rm -rf /opt/ghc
|
||||
sudo rm -rf /usr/local/share/boost
|
||||
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
|
||||
sudo rm -rf /usr/local/lib/android
|
||||
sudo rm -rf /usr/local/share/powershell
|
||||
sudo rm -rf /usr/share/swift
|
||||
|
||||
echo "Disk usage after cleanup:"
|
||||
df -hT
|
||||
|
||||
- name: Prune docker images
|
||||
shell: bash
|
||||
run: |
|
||||
docker image prune -a -f
|
||||
docker system df
|
||||
df -hT
|
||||
|
||||
- name: Move docker data directory
|
||||
shell: bash
|
||||
run: |
|
||||
echo "Stopping docker service ..."
|
||||
sudo systemctl stop docker
|
||||
DOCKER_DEFAULT_ROOT_DIR=/var/lib/docker
|
||||
DOCKER_ROOT_DIR=/mnt/docker
|
||||
echo "Moving ${DOCKER_DEFAULT_ROOT_DIR} -> ${DOCKER_ROOT_DIR}"
|
||||
sudo mv ${DOCKER_DEFAULT_ROOT_DIR} ${DOCKER_ROOT_DIR}
|
||||
echo "Creating symlink ${DOCKER_DEFAULT_ROOT_DIR} -> ${DOCKER_ROOT_DIR}"
|
||||
sudo ln -s ${DOCKER_ROOT_DIR} ${DOCKER_DEFAULT_ROOT_DIR}
|
||||
echo "$(sudo ls -l ${DOCKER_DEFAULT_ROOT_DIR})"
|
||||
echo "Starting docker service ..."
|
||||
sudo systemctl daemon-reload
|
||||
sudo systemctl start docker
|
||||
echo "Docker service status:"
|
||||
sudo systemctl --no-pager -l -o short status docker
|
||||
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: ${{ inputs.python-version }}
|
||||
|
||||
- name: Setup Go
|
||||
uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version-file: go.mod
|
||||
|
||||
- name: Create k8s Kind Cluster
|
||||
uses: helm/kind-action@v1.11.0
|
||||
with:
|
||||
node_image: kindest/node:${{ inputs.kubernetes-version }}
|
||||
cluster_name: training-operator-cluster
|
||||
kubectl_version: ${{ inputs.kubernetes-version }}
|
||||
|
||||
- name: Build training-operator
|
||||
shell: bash
|
||||
run: |
|
||||
./scripts/gha/build-image.sh
|
||||
env:
|
||||
TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test
|
||||
|
||||
- name: Deploy training operator
|
||||
shell: bash
|
||||
run: |
|
||||
./scripts/gha/setup-training-operator.sh
|
||||
docker system prune -a -f
|
||||
docker system df
|
||||
df -h
|
||||
env:
|
||||
KIND_CLUSTER: training-operator-cluster
|
||||
TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test
|
||||
GANG_SCHEDULER_NAME: ${{ inputs.gang-scheduler-name }}
|
||||
KUBERNETES_VERSION: ${{ inputs.kubernetes-version }}
|
||||
|
|
@ -7,23 +7,61 @@ on:
|
|||
jobs:
|
||||
e2e-test:
|
||||
name: E2E Test
|
||||
runs-on: ubuntu-latest
|
||||
runs-on:
|
||||
labels: ubuntu-latest-16-cores
|
||||
env:
|
||||
GOPATH: ${{ github.workspace }}/go
|
||||
defaults:
|
||||
run:
|
||||
working-directory: ${{ env.GOPATH }}/src/github.com/kubeflow/trainer
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
kubernetes-version: ["1.29.3", "1.30.0", "1.31.0"]
|
||||
# Kubernetes versions for e2e tests on Kind cluster.
|
||||
kubernetes-version: ["1.29.14", "1.30.0", "1.31.0"]
|
||||
|
||||
steps:
|
||||
- name: Check out code
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
path: ${{ env.GOPATH }}/src/github.com/kubeflow/trainer
|
||||
|
||||
- name: Setup Go
|
||||
uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version-file: ${{ env.GOPATH }}/src/github.com/kubeflow/trainer/go.mod
|
||||
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
python-version: 3.11
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
echo "TODO (andreyvelich): Implement E2E Tests"
|
||||
# pip install -U './sdk'
|
||||
echo "Install Papermill"
|
||||
pip install papermill==2.6.0 jupyter==1.1.1 ipykernel==6.29.5
|
||||
|
||||
echo "Install Kubeflow SDK"
|
||||
pip install ./sdk
|
||||
|
||||
- name: Setup cluster
|
||||
run: |
|
||||
make test-e2e-setup-cluster K8S_VERSION=${{ matrix.kubernetes-version }}
|
||||
|
||||
- name: Run e2e with Go
|
||||
run: |
|
||||
make test-e2e
|
||||
|
||||
- name: Run e2e test for example Notebooks.
|
||||
run: |
|
||||
make test-e2e-notebook NOTEBOOK_INPUT=./examples/pytorch/image-classification/mnist.ipynb NOTEBOOK_OUTPUT=./mnist_output_${{ matrix.kubernetes-version }}.ipynb TIMEOUT=900
|
||||
|
||||
# TODO (andreyvelich): Discuss how we can upload artifacts for multiple Notebooks.
|
||||
- name: Upload notebook
|
||||
uses: actions/upload-artifact@v4
|
||||
if: always()
|
||||
with:
|
||||
name: mnist_output_${{ matrix.kubernetes-version }}.ipynb
|
||||
path: ${{ env.GOPATH }}/src/github.com/kubeflow/trainer/mnist_output_${{ matrix.kubernetes-version }}.ipynb
|
||||
retention-days: 1
|
||||
|
|
|
|||
|
|
@ -76,7 +76,7 @@ jobs:
|
|||
|
||||
- name: Run Go integration tests
|
||||
run: |
|
||||
make test-integration ENVTEST_K8S_VERSION=${{ matrix.kubernetes-version }}
|
||||
make test-integration K8S_VERSION=${{ matrix.kubernetes-version }}
|
||||
|
||||
- name: Coveralls report
|
||||
uses: shogo82148/actions-goveralls@v1
|
||||
|
|
|
|||
|
|
@ -13,6 +13,8 @@ __debug_bin
|
|||
|
||||
# Jupyter Notebooks.
|
||||
**/.ipynb_checkpoints
|
||||
# The default output for Notebook after Papermill execution.
|
||||
trainer_output.ipynb
|
||||
|
||||
# Python cache files
|
||||
__pycache__/
|
||||
|
|
|
|||
30
Makefile
30
Makefile
|
|
@ -29,14 +29,16 @@ help: ## Display this help.
|
|||
|
||||
##@ Development
|
||||
|
||||
K8S_VERSION ?= 1.32.0
|
||||
|
||||
PROJECT_DIR := $(shell dirname $(abspath $(lastword $(MAKEFILE_LIST))))
|
||||
|
||||
# Tool Binaries
|
||||
LOCALBIN ?= $(PROJECT_DIR)/bin
|
||||
|
||||
CONTROLLER_GEN ?= $(LOCALBIN)/controller-gen
|
||||
ENVTEST ?= $(LOCALBIN)/setup-envtest
|
||||
|
||||
ENVTEST_K8S_VERSION ?= 1.32
|
||||
KIND ?= $(LOCALBIN)/kind
|
||||
|
||||
# Instructions to download tools for development.
|
||||
.PHONY: envtest
|
||||
|
|
@ -47,6 +49,10 @@ envtest: ## Download the setup-envtest binary if required.
|
|||
controller-gen: ## Download the controller-gen binary if required.
|
||||
GOBIN=$(LOCALBIN) go install sigs.k8s.io/controller-tools/cmd/controller-gen@v0.17.2
|
||||
|
||||
.PHONY: kind
|
||||
kind: ## Download Kind binary if required.
|
||||
GOBIN=$(LOCALBIN) go install sigs.k8s.io/kind@$(shell go list -m -f '{{.Version}}' sigs.k8s.io/kind)
|
||||
|
||||
# Download external CRDs for Go integration testings.
|
||||
EXTERNAL_CRDS_DIR ?= $(PROJECT_DIR)/manifests/external-crds
|
||||
|
||||
|
|
@ -106,8 +112,9 @@ test: ## Run Go unit test.
|
|||
|
||||
.PHONY: test-integration
|
||||
test-integration: envtest jobset-operator-crd scheduler-plugins-crd ## Run Go integration test.
|
||||
KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) -p path)" go test ./test/... -coverprofile cover.out
|
||||
KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(K8S_VERSION) -p path)" go test ./test/integration/... -coverprofile cover.out
|
||||
|
||||
.PHONY: test-python
|
||||
test-python: ## Run Python unit test.
|
||||
export PYTHONPATH=$(PROJECT_DIR)
|
||||
pip install pytest
|
||||
|
|
@ -118,9 +125,26 @@ test-python: ## Run Python unit test.
|
|||
pytest ./pkg/initializer/model
|
||||
pytest ./pkg/initializer/utils
|
||||
|
||||
.PHONY: test-python-integration
|
||||
test-python-integration: ## Run Python integration test.
|
||||
export PYTHONPATH=$(PROJECT_DIR)
|
||||
pip install pytest
|
||||
pip install -r ./cmd/initializer/dataset/requirements.txt
|
||||
|
||||
pytest ./test/integration/initializer
|
||||
|
||||
.PHONY: test-e2e-setup-cluster
|
||||
test-e2e-setup-cluster: kind ## Setup Kind cluster for e2e test.
|
||||
KIND=$(KIND) K8S_VERSION=$(K8S_VERSION) ./hack/e2e-setup-cluster.sh
|
||||
|
||||
.PHONY: test-e2e
|
||||
test-e2e: ## Run Go e2e test.
|
||||
go test ./test/e2e/...
|
||||
|
||||
# Input and output location for Notebooks executed with Papermill.
|
||||
NOTEBOOK_INPUT=$(PROJECT_DIR)/examples/pytorch/image-classification/mnist.ipynb
|
||||
NOTEBOOK_OUTPUT=$(PROJECT_DIR)/trainer_output.ipynb
|
||||
PAPERMILL_TIMEOUT=900
|
||||
.PHONY: test-e2e-notebook
|
||||
test-e2e-notebook: ## Run Jupyter Notebook with Papermill.
|
||||
NOTEBOOK_INPUT=$(NOTEBOOK_INPUT) NOTEBOOK_OUTPUT=$(NOTEBOOK_OUTPUT) PAPERMILL_TIMEOUT=$(PAPERMILL_TIMEOUT) ./hack/e2e-run-notebook.sh
|
||||
|
|
|
|||
|
|
@ -33,7 +33,7 @@
|
|||
"outputs": [],
|
||||
"source": [
|
||||
"# TODO (astefanutti): Change to the Kubeflow SDK when it's available.\n",
|
||||
"!pip install git+https://github.com/kubeflow/trainer.git@master#subdirectory=sdk"
|
||||
"# !pip install git+https://github.com/kubeflow/trainer.git@master#subdirectory=sdk"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
@ -66,7 +66,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": 17,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
|
@ -119,14 +119,24 @@
|
|||
" model = nn.parallel.DistributedDataParallel(Net().to(device))\n",
|
||||
" optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)\n",
|
||||
"\n",
|
||||
" # Retrieve the Fashion-MNIST dataset\n",
|
||||
" \n",
|
||||
" # Download FashionMNIST dataset only on local_rank=0 process.\n",
|
||||
" if local_rank == 0:\n",
|
||||
" dataset = datasets.FashionMNIST(\n",
|
||||
" \"./data\",\n",
|
||||
" train=True,\n",
|
||||
" download=True,\n",
|
||||
" transform=transforms.Compose([transforms.ToTensor()]),\n",
|
||||
" )\n",
|
||||
" dist.barrier()\n",
|
||||
" dataset = datasets.FashionMNIST(\n",
|
||||
" \"./data\",\n",
|
||||
" train=True,\n",
|
||||
" download=True,\n",
|
||||
" download=False,\n",
|
||||
" transform=transforms.Compose([transforms.ToTensor()]),\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" # Shard the dataset accross workers.\n",
|
||||
" train_loader = DataLoader(\n",
|
||||
" dataset,\n",
|
||||
|
|
@ -135,6 +145,7 @@
|
|||
" )\n",
|
||||
"\n",
|
||||
" # TODO(astefanutti): add parameters to the training function\n",
|
||||
" dist.barrier()\n",
|
||||
" for epoch in range(1, 3):\n",
|
||||
" model.train()\n",
|
||||
"\n",
|
||||
|
|
@ -181,7 +192,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
|
@ -190,126 +201,195 @@
|
|||
"text": [
|
||||
"Using Device: cpu, Backend: gloo\n",
|
||||
"Distributed Training for WORLD_SIZE: 1, RANK: 0, LOCAL_RANK: 0\n",
|
||||
"Train Epoch: 1 [0/60000 (0%)]\tLoss: 2.309967\n",
|
||||
"Train Epoch: 1 [1000/60000 (2%)]\tLoss: 2.045446\n",
|
||||
"Train Epoch: 1 [2000/60000 (3%)]\tLoss: 1.398883\n",
|
||||
"Train Epoch: 1 [3000/60000 (5%)]\tLoss: 0.992089\n",
|
||||
"Train Epoch: 1 [4000/60000 (7%)]\tLoss: 1.122684\n",
|
||||
"Train Epoch: 1 [5000/60000 (8%)]\tLoss: 1.031676\n",
|
||||
"Train Epoch: 1 [6000/60000 (10%)]\tLoss: 0.649529\n",
|
||||
"Train Epoch: 1 [7000/60000 (12%)]\tLoss: 0.804960\n",
|
||||
"Train Epoch: 1 [8000/60000 (13%)]\tLoss: 0.709698\n",
|
||||
"Train Epoch: 1 [9000/60000 (15%)]\tLoss: 0.632330\n",
|
||||
"Train Epoch: 1 [10000/60000 (17%)]\tLoss: 0.695469\n",
|
||||
"Train Epoch: 1 [11000/60000 (18%)]\tLoss: 0.646323\n",
|
||||
"Train Epoch: 1 [12000/60000 (20%)]\tLoss: 0.521877\n",
|
||||
"Train Epoch: 1 [13000/60000 (22%)]\tLoss: 0.592377\n",
|
||||
"Train Epoch: 1 [14000/60000 (23%)]\tLoss: 0.686853\n",
|
||||
"Train Epoch: 1 [15000/60000 (25%)]\tLoss: 0.678805\n",
|
||||
"Train Epoch: 1 [16000/60000 (27%)]\tLoss: 0.658783\n",
|
||||
"Train Epoch: 1 [17000/60000 (28%)]\tLoss: 0.540468\n",
|
||||
"Train Epoch: 1 [18000/60000 (30%)]\tLoss: 0.456685\n",
|
||||
"Train Epoch: 1 [19000/60000 (32%)]\tLoss: 0.561984\n",
|
||||
"Train Epoch: 1 [20000/60000 (33%)]\tLoss: 0.453478\n",
|
||||
"Train Epoch: 1 [21000/60000 (35%)]\tLoss: 0.399682\n",
|
||||
"Train Epoch: 1 [22000/60000 (37%)]\tLoss: 0.432961\n",
|
||||
"Train Epoch: 1 [23000/60000 (38%)]\tLoss: 0.611499\n",
|
||||
"Train Epoch: 1 [24000/60000 (40%)]\tLoss: 0.552892\n",
|
||||
"Train Epoch: 1 [25000/60000 (42%)]\tLoss: 0.409226\n",
|
||||
"Train Epoch: 1 [26000/60000 (43%)]\tLoss: 0.569662\n",
|
||||
"Train Epoch: 1 [27000/60000 (45%)]\tLoss: 0.379728\n",
|
||||
"Train Epoch: 1 [28000/60000 (47%)]\tLoss: 0.420447\n",
|
||||
"Train Epoch: 1 [29000/60000 (48%)]\tLoss: 0.410670\n",
|
||||
"Train Epoch: 1 [30000/60000 (50%)]\tLoss: 0.480141\n",
|
||||
"Train Epoch: 1 [31000/60000 (52%)]\tLoss: 0.425981\n",
|
||||
"Train Epoch: 1 [32000/60000 (53%)]\tLoss: 0.345157\n",
|
||||
"Train Epoch: 1 [33000/60000 (55%)]\tLoss: 0.323578\n",
|
||||
"Train Epoch: 1 [34000/60000 (57%)]\tLoss: 0.537613\n",
|
||||
"Train Epoch: 1 [35000/60000 (58%)]\tLoss: 0.523302\n",
|
||||
"Train Epoch: 1 [36000/60000 (60%)]\tLoss: 0.426407\n",
|
||||
"Train Epoch: 1 [37000/60000 (62%)]\tLoss: 0.356403\n",
|
||||
"Train Epoch: 1 [38000/60000 (63%)]\tLoss: 0.516297\n",
|
||||
"Train Epoch: 1 [39000/60000 (65%)]\tLoss: 0.406655\n",
|
||||
"Train Epoch: 1 [40000/60000 (67%)]\tLoss: 0.314193\n",
|
||||
"Train Epoch: 1 [41000/60000 (68%)]\tLoss: 0.467424\n",
|
||||
"Train Epoch: 1 [42000/60000 (70%)]\tLoss: 0.457645\n",
|
||||
"Train Epoch: 1 [43000/60000 (72%)]\tLoss: 0.388591\n",
|
||||
"Train Epoch: 1 [44000/60000 (73%)]\tLoss: 0.386649\n",
|
||||
"Train Epoch: 1 [45000/60000 (75%)]\tLoss: 0.282575\n",
|
||||
"Train Epoch: 1 [46000/60000 (77%)]\tLoss: 0.446804\n",
|
||||
"Train Epoch: 1 [47000/60000 (78%)]\tLoss: 0.418433\n",
|
||||
"Train Epoch: 1 [48000/60000 (80%)]\tLoss: 0.575584\n",
|
||||
"Train Epoch: 1 [49000/60000 (82%)]\tLoss: 0.382036\n",
|
||||
"Train Epoch: 1 [50000/60000 (83%)]\tLoss: 0.299168\n",
|
||||
"Train Epoch: 1 [51000/60000 (85%)]\tLoss: 0.423421\n",
|
||||
"Train Epoch: 1 [52000/60000 (87%)]\tLoss: 0.425236\n",
|
||||
"Train Epoch: 1 [53000/60000 (88%)]\tLoss: 0.403723\n",
|
||||
"Train Epoch: 1 [54000/60000 (90%)]\tLoss: 0.303039\n",
|
||||
"Train Epoch: 1 [55000/60000 (92%)]\tLoss: 0.375983\n",
|
||||
"Train Epoch: 1 [56000/60000 (93%)]\tLoss: 0.434169\n",
|
||||
"Train Epoch: 1 [57000/60000 (95%)]\tLoss: 0.429213\n",
|
||||
"Train Epoch: 1 [58000/60000 (97%)]\tLoss: 0.354376\n",
|
||||
"Train Epoch: 1 [59000/60000 (98%)]\tLoss: 0.305779\n",
|
||||
"Train Epoch: 2 [0/60000 (0%)]\tLoss: 0.437120\n",
|
||||
"Train Epoch: 2 [1000/60000 (2%)]\tLoss: 0.464603\n",
|
||||
"Train Epoch: 2 [2000/60000 (3%)]\tLoss: 0.284665\n",
|
||||
"Train Epoch: 2 [3000/60000 (5%)]\tLoss: 0.369253\n",
|
||||
"Train Epoch: 2 [4000/60000 (7%)]\tLoss: 0.468896\n",
|
||||
"Train Epoch: 2 [5000/60000 (8%)]\tLoss: 0.388527\n",
|
||||
"Train Epoch: 2 [6000/60000 (10%)]\tLoss: 0.474483\n",
|
||||
"Train Epoch: 2 [7000/60000 (12%)]\tLoss: 0.373588\n",
|
||||
"Train Epoch: 2 [8000/60000 (13%)]\tLoss: 0.443588\n",
|
||||
"Train Epoch: 2 [9000/60000 (15%)]\tLoss: 0.449592\n",
|
||||
"Train Epoch: 2 [10000/60000 (17%)]\tLoss: 0.363776\n",
|
||||
"Train Epoch: 2 [11000/60000 (18%)]\tLoss: 0.400426\n",
|
||||
"Train Epoch: 2 [12000/60000 (20%)]\tLoss: 0.282801\n",
|
||||
"Train Epoch: 2 [13000/60000 (22%)]\tLoss: 0.288877\n",
|
||||
"Train Epoch: 2 [14000/60000 (23%)]\tLoss: 0.549093\n",
|
||||
"Train Epoch: 2 [15000/60000 (25%)]\tLoss: 0.359002\n",
|
||||
"Train Epoch: 2 [16000/60000 (27%)]\tLoss: 0.322263\n",
|
||||
"Train Epoch: 2 [17000/60000 (28%)]\tLoss: 0.289489\n",
|
||||
"Train Epoch: 2 [18000/60000 (30%)]\tLoss: 0.279724\n",
|
||||
"Train Epoch: 2 [19000/60000 (32%)]\tLoss: 0.452595\n",
|
||||
"Train Epoch: 2 [20000/60000 (33%)]\tLoss: 0.334388\n",
|
||||
"Train Epoch: 2 [21000/60000 (35%)]\tLoss: 0.340985\n",
|
||||
"Train Epoch: 2 [22000/60000 (37%)]\tLoss: 0.247467\n",
|
||||
"Train Epoch: 2 [23000/60000 (38%)]\tLoss: 0.439283\n",
|
||||
"Train Epoch: 2 [24000/60000 (40%)]\tLoss: 0.270795\n",
|
||||
"Train Epoch: 2 [25000/60000 (42%)]\tLoss: 0.283242\n",
|
||||
"Train Epoch: 2 [26000/60000 (43%)]\tLoss: 0.377896\n",
|
||||
"Train Epoch: 2 [27000/60000 (45%)]\tLoss: 0.264453\n",
|
||||
"Train Epoch: 2 [28000/60000 (47%)]\tLoss: 0.328696\n",
|
||||
"Train Epoch: 2 [29000/60000 (48%)]\tLoss: 0.294168\n",
|
||||
"Train Epoch: 2 [30000/60000 (50%)]\tLoss: 0.421162\n",
|
||||
"Train Epoch: 2 [31000/60000 (52%)]\tLoss: 0.306932\n",
|
||||
"Train Epoch: 2 [32000/60000 (53%)]\tLoss: 0.297351\n",
|
||||
"Train Epoch: 2 [33000/60000 (55%)]\tLoss: 0.261608\n",
|
||||
"Train Epoch: 2 [34000/60000 (57%)]\tLoss: 0.413534\n",
|
||||
"Train Epoch: 2 [35000/60000 (58%)]\tLoss: 0.433157\n",
|
||||
"Train Epoch: 2 [36000/60000 (60%)]\tLoss: 0.390571\n",
|
||||
"Train Epoch: 2 [37000/60000 (62%)]\tLoss: 0.242159\n",
|
||||
"Train Epoch: 2 [38000/60000 (63%)]\tLoss: 0.347628\n",
|
||||
"Train Epoch: 2 [39000/60000 (65%)]\tLoss: 0.321216\n",
|
||||
"Train Epoch: 2 [40000/60000 (67%)]\tLoss: 0.285891\n",
|
||||
"Train Epoch: 2 [41000/60000 (68%)]\tLoss: 0.401335\n",
|
||||
"Train Epoch: 2 [42000/60000 (70%)]\tLoss: 0.357113\n",
|
||||
"Train Epoch: 2 [43000/60000 (72%)]\tLoss: 0.321728\n",
|
||||
"Train Epoch: 2 [44000/60000 (73%)]\tLoss: 0.266073\n",
|
||||
"Train Epoch: 2 [45000/60000 (75%)]\tLoss: 0.235082\n",
|
||||
"Train Epoch: 2 [46000/60000 (77%)]\tLoss: 0.329955\n",
|
||||
"Train Epoch: 2 [47000/60000 (78%)]\tLoss: 0.351680\n",
|
||||
"Train Epoch: 2 [48000/60000 (80%)]\tLoss: 0.509699\n",
|
||||
"Train Epoch: 2 [49000/60000 (82%)]\tLoss: 0.281432\n",
|
||||
"Train Epoch: 2 [50000/60000 (83%)]\tLoss: 0.262006\n",
|
||||
"Train Epoch: 2 [51000/60000 (85%)]\tLoss: 0.432544\n",
|
||||
"Train Epoch: 2 [52000/60000 (87%)]\tLoss: 0.332725\n",
|
||||
"Train Epoch: 2 [53000/60000 (88%)]\tLoss: 0.313516\n",
|
||||
"Train Epoch: 2 [54000/60000 (90%)]\tLoss: 0.266921\n",
|
||||
"Train Epoch: 2 [55000/60000 (92%)]\tLoss: 0.279880\n",
|
||||
"Train Epoch: 2 [56000/60000 (93%)]\tLoss: 0.329515\n",
|
||||
"Train Epoch: 2 [57000/60000 (95%)]\tLoss: 0.379902\n",
|
||||
"Train Epoch: 2 [58000/60000 (97%)]\tLoss: 0.252111\n",
|
||||
"Train Epoch: 2 [59000/60000 (98%)]\tLoss: 0.267555\n",
|
||||
"Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz\n",
|
||||
"Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to ./data/FashionMNIST/raw/train-images-idx3-ubyte.gz\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26.4M/26.4M [00:02<00:00, 10.9MB/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Extracting ./data/FashionMNIST/raw/train-images-idx3-ubyte.gz to ./data/FashionMNIST/raw\n",
|
||||
"\n",
|
||||
"Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz\n",
|
||||
"Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw/train-labels-idx1-ubyte.gz\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29.5k/29.5k [00:00<00:00, 1.50MB/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Extracting ./data/FashionMNIST/raw/train-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw\n",
|
||||
"\n",
|
||||
"Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz\n",
|
||||
"Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to ./data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4.42M/4.42M [00:00<00:00, 8.57MB/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Extracting ./data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz to ./data/FashionMNIST/raw\n",
|
||||
"\n",
|
||||
"Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz\n",
|
||||
"Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5.15k/5.15k [00:00<00:00, 4.41MB/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Extracting ./data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw\n",
|
||||
"\n",
|
||||
"Train Epoch: 1 [0/60000 (0%)]\tLoss: 2.312359\n",
|
||||
"Train Epoch: 1 [1000/60000 (2%)]\tLoss: 2.058874\n",
|
||||
"Train Epoch: 1 [2000/60000 (3%)]\tLoss: 1.059449\n",
|
||||
"Train Epoch: 1 [3000/60000 (5%)]\tLoss: 1.130449\n",
|
||||
"Train Epoch: 1 [4000/60000 (7%)]\tLoss: 1.027112\n",
|
||||
"Train Epoch: 1 [5000/60000 (8%)]\tLoss: 0.845158\n",
|
||||
"Train Epoch: 1 [6000/60000 (10%)]\tLoss: 0.747619\n",
|
||||
"Train Epoch: 1 [7000/60000 (12%)]\tLoss: 0.731784\n",
|
||||
"Train Epoch: 1 [8000/60000 (13%)]\tLoss: 0.623452\n",
|
||||
"Train Epoch: 1 [9000/60000 (15%)]\tLoss: 0.666745\n",
|
||||
"Train Epoch: 1 [10000/60000 (17%)]\tLoss: 0.597833\n",
|
||||
"Train Epoch: 1 [11000/60000 (18%)]\tLoss: 0.634247\n",
|
||||
"Train Epoch: 1 [12000/60000 (20%)]\tLoss: 0.533735\n",
|
||||
"Train Epoch: 1 [13000/60000 (22%)]\tLoss: 0.545662\n",
|
||||
"Train Epoch: 1 [14000/60000 (23%)]\tLoss: 0.637130\n",
|
||||
"Train Epoch: 1 [15000/60000 (25%)]\tLoss: 0.596679\n",
|
||||
"Train Epoch: 1 [16000/60000 (27%)]\tLoss: 0.505523\n",
|
||||
"Train Epoch: 1 [17000/60000 (28%)]\tLoss: 0.484452\n",
|
||||
"Train Epoch: 1 [18000/60000 (30%)]\tLoss: 0.494081\n",
|
||||
"Train Epoch: 1 [19000/60000 (32%)]\tLoss: 0.601508\n",
|
||||
"Train Epoch: 1 [20000/60000 (33%)]\tLoss: 0.485598\n",
|
||||
"Train Epoch: 1 [21000/60000 (35%)]\tLoss: 0.402011\n",
|
||||
"Train Epoch: 1 [22000/60000 (37%)]\tLoss: 0.350119\n",
|
||||
"Train Epoch: 1 [23000/60000 (38%)]\tLoss: 0.575456\n",
|
||||
"Train Epoch: 1 [24000/60000 (40%)]\tLoss: 0.402916\n",
|
||||
"Train Epoch: 1 [25000/60000 (42%)]\tLoss: 0.335860\n",
|
||||
"Train Epoch: 1 [26000/60000 (43%)]\tLoss: 0.498826\n",
|
||||
"Train Epoch: 1 [27000/60000 (45%)]\tLoss: 0.331436\n",
|
||||
"Train Epoch: 1 [28000/60000 (47%)]\tLoss: 0.466167\n",
|
||||
"Train Epoch: 1 [29000/60000 (48%)]\tLoss: 0.514644\n",
|
||||
"Train Epoch: 1 [30000/60000 (50%)]\tLoss: 0.423657\n",
|
||||
"Train Epoch: 1 [31000/60000 (52%)]\tLoss: 0.424075\n",
|
||||
"Train Epoch: 1 [32000/60000 (53%)]\tLoss: 0.365285\n",
|
||||
"Train Epoch: 1 [33000/60000 (55%)]\tLoss: 0.289063\n",
|
||||
"Train Epoch: 1 [34000/60000 (57%)]\tLoss: 0.538227\n",
|
||||
"Train Epoch: 1 [35000/60000 (58%)]\tLoss: 0.546361\n",
|
||||
"Train Epoch: 1 [36000/60000 (60%)]\tLoss: 0.383847\n",
|
||||
"Train Epoch: 1 [37000/60000 (62%)]\tLoss: 0.311942\n",
|
||||
"Train Epoch: 1 [38000/60000 (63%)]\tLoss: 0.458801\n",
|
||||
"Train Epoch: 1 [39000/60000 (65%)]\tLoss: 0.481895\n",
|
||||
"Train Epoch: 1 [40000/60000 (67%)]\tLoss: 0.308024\n",
|
||||
"Train Epoch: 1 [41000/60000 (68%)]\tLoss: 0.435803\n",
|
||||
"Train Epoch: 1 [42000/60000 (70%)]\tLoss: 0.457417\n",
|
||||
"Train Epoch: 1 [43000/60000 (72%)]\tLoss: 0.310509\n",
|
||||
"Train Epoch: 1 [44000/60000 (73%)]\tLoss: 0.347369\n",
|
||||
"Train Epoch: 1 [45000/60000 (75%)]\tLoss: 0.341391\n",
|
||||
"Train Epoch: 1 [46000/60000 (77%)]\tLoss: 0.464614\n",
|
||||
"Train Epoch: 1 [47000/60000 (78%)]\tLoss: 0.432629\n",
|
||||
"Train Epoch: 1 [48000/60000 (80%)]\tLoss: 0.519174\n",
|
||||
"Train Epoch: 1 [49000/60000 (82%)]\tLoss: 0.368225\n",
|
||||
"Train Epoch: 1 [50000/60000 (83%)]\tLoss: 0.325527\n",
|
||||
"Train Epoch: 1 [51000/60000 (85%)]\tLoss: 0.455275\n",
|
||||
"Train Epoch: 1 [52000/60000 (87%)]\tLoss: 0.423473\n",
|
||||
"Train Epoch: 1 [53000/60000 (88%)]\tLoss: 0.354114\n",
|
||||
"Train Epoch: 1 [54000/60000 (90%)]\tLoss: 0.328097\n",
|
||||
"Train Epoch: 1 [55000/60000 (92%)]\tLoss: 0.353430\n",
|
||||
"Train Epoch: 1 [56000/60000 (93%)]\tLoss: 0.419119\n",
|
||||
"Train Epoch: 1 [57000/60000 (95%)]\tLoss: 0.383263\n",
|
||||
"Train Epoch: 1 [58000/60000 (97%)]\tLoss: 0.292101\n",
|
||||
"Train Epoch: 1 [59000/60000 (98%)]\tLoss: 0.251011\n",
|
||||
"Train Epoch: 2 [0/60000 (0%)]\tLoss: 0.412635\n",
|
||||
"Train Epoch: 2 [1000/60000 (2%)]\tLoss: 0.467144\n",
|
||||
"Train Epoch: 2 [2000/60000 (3%)]\tLoss: 0.246448\n",
|
||||
"Train Epoch: 2 [3000/60000 (5%)]\tLoss: 0.341133\n",
|
||||
"Train Epoch: 2 [4000/60000 (7%)]\tLoss: 0.412845\n",
|
||||
"Train Epoch: 2 [5000/60000 (8%)]\tLoss: 0.338513\n",
|
||||
"Train Epoch: 2 [6000/60000 (10%)]\tLoss: 0.426739\n",
|
||||
"Train Epoch: 2 [7000/60000 (12%)]\tLoss: 0.387864\n",
|
||||
"Train Epoch: 2 [8000/60000 (13%)]\tLoss: 0.441497\n",
|
||||
"Train Epoch: 2 [9000/60000 (15%)]\tLoss: 0.475687\n",
|
||||
"Train Epoch: 2 [10000/60000 (17%)]\tLoss: 0.390743\n",
|
||||
"Train Epoch: 2 [11000/60000 (18%)]\tLoss: 0.432761\n",
|
||||
"Train Epoch: 2 [12000/60000 (20%)]\tLoss: 0.306278\n",
|
||||
"Train Epoch: 2 [13000/60000 (22%)]\tLoss: 0.322986\n",
|
||||
"Train Epoch: 2 [14000/60000 (23%)]\tLoss: 0.460900\n",
|
||||
"Train Epoch: 2 [15000/60000 (25%)]\tLoss: 0.350266\n",
|
||||
"Train Epoch: 2 [16000/60000 (27%)]\tLoss: 0.274468\n",
|
||||
"Train Epoch: 2 [17000/60000 (28%)]\tLoss: 0.341584\n",
|
||||
"Train Epoch: 2 [18000/60000 (30%)]\tLoss: 0.237921\n",
|
||||
"Train Epoch: 2 [19000/60000 (32%)]\tLoss: 0.387657\n",
|
||||
"Train Epoch: 2 [20000/60000 (33%)]\tLoss: 0.379017\n",
|
||||
"Train Epoch: 2 [21000/60000 (35%)]\tLoss: 0.267510\n",
|
||||
"Train Epoch: 2 [22000/60000 (37%)]\tLoss: 0.270873\n",
|
||||
"Train Epoch: 2 [23000/60000 (38%)]\tLoss: 0.437596\n",
|
||||
"Train Epoch: 2 [24000/60000 (40%)]\tLoss: 0.265560\n",
|
||||
"Train Epoch: 2 [25000/60000 (42%)]\tLoss: 0.279858\n",
|
||||
"Train Epoch: 2 [26000/60000 (43%)]\tLoss: 0.332311\n",
|
||||
"Train Epoch: 2 [27000/60000 (45%)]\tLoss: 0.225879\n",
|
||||
"Train Epoch: 2 [28000/60000 (47%)]\tLoss: 0.278610\n",
|
||||
"Train Epoch: 2 [29000/60000 (48%)]\tLoss: 0.286307\n",
|
||||
"Train Epoch: 2 [30000/60000 (50%)]\tLoss: 0.413894\n",
|
||||
"Train Epoch: 2 [31000/60000 (52%)]\tLoss: 0.308004\n",
|
||||
"Train Epoch: 2 [32000/60000 (53%)]\tLoss: 0.280563\n",
|
||||
"Train Epoch: 2 [33000/60000 (55%)]\tLoss: 0.264711\n",
|
||||
"Train Epoch: 2 [34000/60000 (57%)]\tLoss: 0.417848\n",
|
||||
"Train Epoch: 2 [35000/60000 (58%)]\tLoss: 0.512428\n",
|
||||
"Train Epoch: 2 [36000/60000 (60%)]\tLoss: 0.319987\n",
|
||||
"Train Epoch: 2 [37000/60000 (62%)]\tLoss: 0.251197\n",
|
||||
"Train Epoch: 2 [38000/60000 (63%)]\tLoss: 0.325069\n",
|
||||
"Train Epoch: 2 [39000/60000 (65%)]\tLoss: 0.303394\n",
|
||||
"Train Epoch: 2 [40000/60000 (67%)]\tLoss: 0.280159\n",
|
||||
"Train Epoch: 2 [41000/60000 (68%)]\tLoss: 0.426005\n",
|
||||
"Train Epoch: 2 [42000/60000 (70%)]\tLoss: 0.363313\n",
|
||||
"Train Epoch: 2 [43000/60000 (72%)]\tLoss: 0.271922\n",
|
||||
"Train Epoch: 2 [44000/60000 (73%)]\tLoss: 0.266253\n",
|
||||
"Train Epoch: 2 [45000/60000 (75%)]\tLoss: 0.238750\n",
|
||||
"Train Epoch: 2 [46000/60000 (77%)]\tLoss: 0.376987\n",
|
||||
"Train Epoch: 2 [47000/60000 (78%)]\tLoss: 0.282419\n",
|
||||
"Train Epoch: 2 [48000/60000 (80%)]\tLoss: 0.462140\n",
|
||||
"Train Epoch: 2 [49000/60000 (82%)]\tLoss: 0.255249\n",
|
||||
"Train Epoch: 2 [50000/60000 (83%)]\tLoss: 0.241605\n",
|
||||
"Train Epoch: 2 [51000/60000 (85%)]\tLoss: 0.440393\n",
|
||||
"Train Epoch: 2 [52000/60000 (87%)]\tLoss: 0.313707\n",
|
||||
"Train Epoch: 2 [53000/60000 (88%)]\tLoss: 0.338831\n",
|
||||
"Train Epoch: 2 [54000/60000 (90%)]\tLoss: 0.236808\n",
|
||||
"Train Epoch: 2 [55000/60000 (92%)]\tLoss: 0.262569\n",
|
||||
"Train Epoch: 2 [56000/60000 (93%)]\tLoss: 0.415122\n",
|
||||
"Train Epoch: 2 [57000/60000 (95%)]\tLoss: 0.362813\n",
|
||||
"Train Epoch: 2 [58000/60000 (97%)]\tLoss: 0.321227\n",
|
||||
"Train Epoch: 2 [59000/60000 (98%)]\tLoss: 0.239889\n",
|
||||
"Training is finished\n"
|
||||
]
|
||||
}
|
||||
|
|
@ -345,7 +425,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": 5,
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
|
|
@ -371,13 +451,14 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Runtime(name='mpi-distributed', phase='Unknown', accelerator='Unknown', accelerator_count='Unknown')\n",
|
||||
"Runtime(name='torch-distributed', phase='pre-training', accelerator='gpu-tesla-v100-16gb', accelerator_count='4')\n"
|
||||
]
|
||||
}
|
||||
|
|
@ -393,12 +474,12 @@
|
|||
"source": [
|
||||
"## Run the Distributed TrainJob\n",
|
||||
"\n",
|
||||
"Kubeflow TrainJob will train the above model on 4 PyTorch nodes."
|
||||
"Kubeflow TrainJob will train the above model on 3 PyTorch nodes."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"execution_count": 19,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
|
@ -408,13 +489,13 @@
|
|||
" trainer=Trainer(\n",
|
||||
" func=train_fashion_mnist,\n",
|
||||
" # Set how many PyTorch nodes you want to use for distributed training.\n",
|
||||
" num_nodes=4,\n",
|
||||
" num_nodes=3,\n",
|
||||
" # Set the resources for each PyTorch node.\n",
|
||||
" resources_per_node={\n",
|
||||
" \"cpu\": 5,\n",
|
||||
" \"cpu\": 3,\n",
|
||||
" \"memory\": \"16Gi\",\n",
|
||||
" # Comment this to distribute the TrainJob using CPU nodes.\n",
|
||||
" \"nvidia.com/gpu\": 1,\n",
|
||||
" # Uncomment this to distribute the TrainJob using GPU nodes.\n",
|
||||
" # \"nvidia.com/gpu\": 1,\n",
|
||||
" },\n",
|
||||
" ),\n",
|
||||
")"
|
||||
|
|
@ -428,14 +509,42 @@
|
|||
"\n",
|
||||
"You can check the components of TrainJob that's created.\n",
|
||||
"\n",
|
||||
"Since the TrainJob performs distributed training across 4 nodes, it generates 4 components: `trainer-node-0` .. `trainer-node-3`.\n",
|
||||
"Since the TrainJob performs distributed training across 3 nodes, it generates 3 components: `trainer-node-0` .. `trainer-node-2`.\n",
|
||||
"\n",
|
||||
"You can get the individual status for each of these components."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"execution_count": 20,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import time\n",
|
||||
"\n",
|
||||
"def wait_for_job_running():\n",
|
||||
" for _ in range(100):\n",
|
||||
" trainjob = client.get_job(name=job_name)\n",
|
||||
" for c in trainjob.components:\n",
|
||||
" if c.name == \"trainer-node-0\" and c.status == \"Running\":\n",
|
||||
" return\n",
|
||||
" print(\"Wait for TrainJob running status. Sleep for 5 seconds\")\n",
|
||||
" time.sleep(5)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 21,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# TODO (andreyvelich): Use wait_for_job_status API from TrainerClient() when it is implemented.\n",
|
||||
"wait_for_job_running()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 22,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
|
@ -447,8 +556,6 @@
|
|||
"Component: trainer-node-1, Status: Running, Devices: gpu x 1\n",
|
||||
"\n",
|
||||
"Component: trainer-node-2, Status: Running, Devices: gpu x 1\n",
|
||||
"\n",
|
||||
"Component: trainer-node-3, Status: Running, Devices: gpu x 1\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
|
|
@ -466,12 +573,12 @@
|
|||
"\n",
|
||||
"We can use the `get_job_logs()` API to get the TrainJob logs.\n",
|
||||
"\n",
|
||||
"Since we run training on 4 GPUs, every PyTorch node uses 60,000/4 = 15,000 images from the dataset."
|
||||
"Since we run training on 3 GPUs, every PyTorch node uses 60,000/3 = 20,000 images from the dataset."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"execution_count": 23,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
|
@ -479,53 +586,63 @@
|
|||
"output_type": "stream",
|
||||
"text": [
|
||||
"[trainer-node]: Using Device: cuda, Backend: nccl\n",
|
||||
"[trainer-node]: Distributed Training for WORLD_SIZE: 4, RANK: 0, LOCAL_RANK: 0\n",
|
||||
"[trainer-node]: Distributed Training for WORLD_SIZE: 3, RANK: 0, LOCAL_RANK: 0\n",
|
||||
"[trainer-node]: Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz\n",
|
||||
"[trainer-node]: Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to ./data/FashionMNIST/raw/train-images-idx3-ubyte.gz\n",
|
||||
"100%|██████████| 26.4M/26.4M [00:02<00:00, 12.5MB/s]\n",
|
||||
"100%|██████████| 26.4M/26.4M [00:02<00:00, 11.1MB/s]\n",
|
||||
"[trainer-node]: Extracting ./data/FashionMNIST/raw/train-images-idx3-ubyte.gz to ./data/FashionMNIST/raw\n",
|
||||
"[trainer-node]: Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz\n",
|
||||
"[trainer-node]: Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw/train-labels-idx1-ubyte.gz\n",
|
||||
"100%|██████████| 29.5k/29.5k [00:00<00:00, 214kB/s]\n",
|
||||
"100%|██████████| 29.5k/29.5k [00:00<00:00, 74.8MB/s]\n",
|
||||
"[trainer-node]: Extracting ./data/FashionMNIST/raw/train-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw\n",
|
||||
"[trainer-node]: Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz\n",
|
||||
"[trainer-node]: Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to ./data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz\n",
|
||||
"100%|██████████| 4.42M/4.42M [00:01<00:00, 3.50MB/s]\n",
|
||||
"100%|██████████| 4.42M/4.42M [00:01<00:00, 3.90MB/s]\n",
|
||||
"[trainer-node]: Extracting ./data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz to ./data/FashionMNIST/raw\n",
|
||||
"[trainer-node]: Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz\n",
|
||||
"[trainer-node]: Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz\n",
|
||||
"100%|██████████| 5.15k/5.15k [00:00<00:00, 37.8MB/s]\n",
|
||||
"100%|██████████| 5.15k/5.15k [00:00<00:00, 38.1MB/s]\n",
|
||||
"[trainer-node]: Extracting ./data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw\n",
|
||||
"[trainer-node]: Train Epoch: 1 [0/60000 (0%)]\tLoss: 2.305451\n",
|
||||
"[trainer-node]: Train Epoch: 1 [1000/60000 (7%)]\tLoss: 2.056247\n",
|
||||
"[trainer-node]: Train Epoch: 1 [2000/60000 (13%)]\tLoss: 2.166955\n",
|
||||
"[trainer-node]: Train Epoch: 1 [3000/60000 (20%)]\tLoss: 1.045183\n",
|
||||
"[trainer-node]: Train Epoch: 1 [4000/60000 (27%)]\tLoss: 0.767518\n",
|
||||
"[trainer-node]: Train Epoch: 1 [5000/60000 (33%)]\tLoss: 0.697382\n",
|
||||
"[trainer-node]: Train Epoch: 1 [6000/60000 (40%)]\tLoss: 0.638373\n",
|
||||
"[trainer-node]: Train Epoch: 1 [7000/60000 (47%)]\tLoss: 0.667810\n",
|
||||
"[trainer-node]: Train Epoch: 1 [8000/60000 (53%)]\tLoss: 0.541413\n",
|
||||
"[trainer-node]: Train Epoch: 1 [9000/60000 (60%)]\tLoss: 0.564223\n",
|
||||
"[trainer-node]: Train Epoch: 1 [10000/60000 (67%)]\tLoss: 0.425999\n",
|
||||
"[trainer-node]: Train Epoch: 1 [11000/60000 (73%)]\tLoss: 0.564535\n",
|
||||
"[trainer-node]: Train Epoch: 1 [12000/60000 (80%)]\tLoss: 0.459158\n",
|
||||
"[trainer-node]: Train Epoch: 1 [13000/60000 (87%)]\tLoss: 0.545110\n",
|
||||
"[trainer-node]: Train Epoch: 1 [14000/60000 (93%)]\tLoss: 0.471710\n",
|
||||
"[trainer-node]: Train Epoch: 2 [0/60000 (0%)]\tLoss: 0.520992\n",
|
||||
"[trainer-node]: Train Epoch: 2 [1000/60000 (7%)]\tLoss: 0.440295\n",
|
||||
"[trainer-node]: Train Epoch: 2 [2000/60000 (13%)]\tLoss: 0.436745\n",
|
||||
"[trainer-node]: Train Epoch: 2 [3000/60000 (20%)]\tLoss: 0.359110\n",
|
||||
"[trainer-node]: Train Epoch: 2 [4000/60000 (27%)]\tLoss: 0.493791\n",
|
||||
"[trainer-node]: Train Epoch: 2 [5000/60000 (33%)]\tLoss: 0.384616\n",
|
||||
"[trainer-node]: Train Epoch: 2 [6000/60000 (40%)]\tLoss: 0.529568\n",
|
||||
"[trainer-node]: Train Epoch: 2 [7000/60000 (47%)]\tLoss: 0.443400\n",
|
||||
"[trainer-node]: Train Epoch: 2 [8000/60000 (53%)]\tLoss: 0.352168\n",
|
||||
"[trainer-node]: Train Epoch: 2 [9000/60000 (60%)]\tLoss: 0.431930\n",
|
||||
"[trainer-node]: Train Epoch: 2 [10000/60000 (67%)]\tLoss: 0.282820\n",
|
||||
"[trainer-node]: Train Epoch: 2 [11000/60000 (73%)]\tLoss: 0.412141\n",
|
||||
"[trainer-node]: Train Epoch: 2 [12000/60000 (80%)]\tLoss: 0.367190\n",
|
||||
"[trainer-node]: Train Epoch: 2 [13000/60000 (87%)]\tLoss: 0.355502\n",
|
||||
"[trainer-node]: Train Epoch: 2 [14000/60000 (93%)]\tLoss: 0.326105\n",
|
||||
"[trainer-node]: Train Epoch: 1 [0/60000 (0%)]\tLoss: 2.298486\n",
|
||||
"[trainer-node]: Train Epoch: 1 [1000/60000 (5%)]\tLoss: 2.023109\n",
|
||||
"[trainer-node]: Train Epoch: 1 [2000/60000 (10%)]\tLoss: 2.210286\n",
|
||||
"[trainer-node]: Train Epoch: 1 [3000/60000 (15%)]\tLoss: 1.308523\n",
|
||||
"[trainer-node]: Train Epoch: 1 [4000/60000 (20%)]\tLoss: 0.896595\n",
|
||||
"[trainer-node]: Train Epoch: 1 [5000/60000 (25%)]\tLoss: 0.790926\n",
|
||||
"[trainer-node]: Train Epoch: 1 [6000/60000 (30%)]\tLoss: 0.694305\n",
|
||||
"[trainer-node]: Train Epoch: 1 [7000/60000 (35%)]\tLoss: 0.532733\n",
|
||||
"[trainer-node]: Train Epoch: 1 [8000/60000 (40%)]\tLoss: 0.571943\n",
|
||||
"[trainer-node]: Train Epoch: 1 [9000/60000 (45%)]\tLoss: 0.593324\n",
|
||||
"[trainer-node]: Train Epoch: 1 [10000/60000 (50%)]\tLoss: 0.570712\n",
|
||||
"[trainer-node]: Train Epoch: 1 [11000/60000 (55%)]\tLoss: 0.416316\n",
|
||||
"[trainer-node]: Train Epoch: 1 [12000/60000 (60%)]\tLoss: 0.438910\n",
|
||||
"[trainer-node]: Train Epoch: 1 [13000/60000 (65%)]\tLoss: 0.486123\n",
|
||||
"[trainer-node]: Train Epoch: 1 [14000/60000 (70%)]\tLoss: 0.432043\n",
|
||||
"[trainer-node]: Train Epoch: 1 [15000/60000 (75%)]\tLoss: 0.374424\n",
|
||||
"[trainer-node]: Train Epoch: 1 [16000/60000 (80%)]\tLoss: 0.366622\n",
|
||||
"[trainer-node]: Train Epoch: 1 [17000/60000 (85%)]\tLoss: 0.495783\n",
|
||||
"[trainer-node]: Train Epoch: 1 [18000/60000 (90%)]\tLoss: 0.381096\n",
|
||||
"[trainer-node]: Train Epoch: 1 [19000/60000 (95%)]\tLoss: 0.385782\n",
|
||||
"[trainer-node]: Train Epoch: 2 [0/60000 (0%)]\tLoss: 0.380943\n",
|
||||
"[trainer-node]: Train Epoch: 2 [1000/60000 (5%)]\tLoss: 0.466423\n",
|
||||
"[trainer-node]: Train Epoch: 2 [2000/60000 (10%)]\tLoss: 0.452478\n",
|
||||
"[trainer-node]: Train Epoch: 2 [3000/60000 (15%)]\tLoss: 0.409038\n",
|
||||
"[trainer-node]: Train Epoch: 2 [4000/60000 (20%)]\tLoss: 0.370588\n",
|
||||
"[trainer-node]: Train Epoch: 2 [5000/60000 (25%)]\tLoss: 0.419151\n",
|
||||
"[trainer-node]: Train Epoch: 2 [6000/60000 (30%)]\tLoss: 0.378228\n",
|
||||
"[trainer-node]: Train Epoch: 2 [7000/60000 (35%)]\tLoss: 0.328720\n",
|
||||
"[trainer-node]: Train Epoch: 2 [8000/60000 (40%)]\tLoss: 0.557514\n",
|
||||
"[trainer-node]: Train Epoch: 2 [9000/60000 (45%)]\tLoss: 0.332585\n",
|
||||
"[trainer-node]: Train Epoch: 2 [10000/60000 (50%)]\tLoss: 0.374972\n",
|
||||
"[trainer-node]: Train Epoch: 2 [11000/60000 (55%)]\tLoss: 0.344400\n",
|
||||
"[trainer-node]: Train Epoch: 2 [12000/60000 (60%)]\tLoss: 0.359475\n",
|
||||
"[trainer-node]: Train Epoch: 2 [13000/60000 (65%)]\tLoss: 0.335085\n",
|
||||
"[trainer-node]: Train Epoch: 2 [14000/60000 (70%)]\tLoss: 0.352953\n",
|
||||
"[trainer-node]: Train Epoch: 2 [15000/60000 (75%)]\tLoss: 0.367524\n",
|
||||
"[trainer-node]: Train Epoch: 2 [16000/60000 (80%)]\tLoss: 0.313468\n",
|
||||
"[trainer-node]: Train Epoch: 2 [17000/60000 (85%)]\tLoss: 0.385835\n",
|
||||
"[trainer-node]: Train Epoch: 2 [18000/60000 (90%)]\tLoss: 0.324088\n",
|
||||
"[trainer-node]: Train Epoch: 2 [19000/60000 (95%)]\tLoss: 0.336721\n",
|
||||
"[trainer-node]: Training is finished\n"
|
||||
]
|
||||
}
|
||||
|
|
@ -545,11 +662,11 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"client.delete_job(job_name)"
|
||||
"# client.delete_job(job_name)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
|
|||
8
go.mod
8
go.mod
|
|
@ -20,11 +20,14 @@ require (
|
|||
k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738
|
||||
sigs.k8s.io/controller-runtime v0.20.2
|
||||
sigs.k8s.io/jobset v0.8.0
|
||||
sigs.k8s.io/kind v0.27.0
|
||||
sigs.k8s.io/scheduler-plugins v0.30.6
|
||||
sigs.k8s.io/structured-merge-diff/v4 v4.5.0
|
||||
)
|
||||
|
||||
require (
|
||||
al.essio.dev/pkg/shellescape v1.5.1 // indirect
|
||||
github.com/BurntSushi/toml v1.4.0 // indirect
|
||||
github.com/beorn7/perks v1.0.1 // indirect
|
||||
github.com/cespare/xxhash/v2 v2.3.0 // indirect
|
||||
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
|
||||
|
|
@ -43,19 +46,24 @@ require (
|
|||
github.com/google/gnostic-models v0.6.8 // indirect
|
||||
github.com/google/gofuzz v1.2.0 // indirect
|
||||
github.com/google/pprof v0.0.0-20241210010833-40e02aabc2ad // indirect
|
||||
github.com/google/safetext v0.0.0-20220905092116-b49f7bc46da2 // indirect
|
||||
github.com/google/uuid v1.6.0 // indirect
|
||||
github.com/inconshreveable/mousetrap v1.1.0 // indirect
|
||||
github.com/josharian/intern v1.0.0 // indirect
|
||||
github.com/json-iterator/go v1.1.12 // indirect
|
||||
github.com/klauspost/compress v1.17.11 // indirect
|
||||
github.com/mailru/easyjson v0.7.7 // indirect
|
||||
github.com/mattn/go-isatty v0.0.20 // indirect
|
||||
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
|
||||
github.com/modern-go/reflect2 v1.0.2 // indirect
|
||||
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
|
||||
github.com/pelletier/go-toml v1.9.5 // indirect
|
||||
github.com/pkg/errors v0.9.1 // indirect
|
||||
github.com/prometheus/client_golang v1.21.0 // indirect
|
||||
github.com/prometheus/client_model v0.6.1 // indirect
|
||||
github.com/prometheus/common v0.62.0 // indirect
|
||||
github.com/prometheus/procfs v0.15.1 // indirect
|
||||
github.com/spf13/cobra v1.8.1 // indirect
|
||||
github.com/spf13/pflag v1.0.5 // indirect
|
||||
github.com/x448/float16 v0.8.4 // indirect
|
||||
go.uber.org/atomic v1.11.0 // indirect
|
||||
|
|
|
|||
21
go.sum
21
go.sum
|
|
@ -1,7 +1,12 @@
|
|||
al.essio.dev/pkg/shellescape v1.5.1 h1:86HrALUujYS/h+GtqoB26SBEdkWfmMI6FubjXlsXyho=
|
||||
al.essio.dev/pkg/shellescape v1.5.1/go.mod h1:6sIqp7X2P6mThCQ7twERpZTuigpr6KbZWtls1U8I890=
|
||||
github.com/BurntSushi/toml v1.4.0 h1:kuoIxZQy2WRRk1pttg9asf+WVv6tWQuBNVmK8+nqPr0=
|
||||
github.com/BurntSushi/toml v1.4.0/go.mod h1:ukJfTF/6rtPPRCnwkur4qwRxa8vTRFBF0uk2lLoLwho=
|
||||
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
|
||||
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
|
||||
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
|
||||
github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
|
||||
github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
|
||||
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM=
|
||||
|
|
@ -44,8 +49,14 @@ github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0=
|
|||
github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
|
||||
github.com/google/pprof v0.0.0-20241210010833-40e02aabc2ad h1:a6HEuzUHeKH6hwfN/ZoQgRgVIWFJljSWa/zetS2WTvg=
|
||||
github.com/google/pprof v0.0.0-20241210010833-40e02aabc2ad/go.mod h1:vavhavw2zAxS5dIdcRluK6cSGGPlZynqzFM8NdvU144=
|
||||
github.com/google/safetext v0.0.0-20220905092116-b49f7bc46da2 h1:SJ+NtwL6QaZ21U+IrK7d0gGgpjGGvd2kz+FzTHVzdqI=
|
||||
github.com/google/safetext v0.0.0-20220905092116-b49f7bc46da2/go.mod h1:Tv1PlzqC9t8wNnpPdctvtSUOPUUg4SHeE6vR1Ir2hmg=
|
||||
github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510 h1:El6M4kTTCOh6aBiKaUGG7oYTSPP8MxqL4YI3kZKwcP4=
|
||||
github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510/go.mod h1:pupxD2MaaD3pAXIBCelhxNneeOaAeabZDe5s4K6zSpQ=
|
||||
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
||||
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
||||
github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8=
|
||||
github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
|
||||
github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY=
|
||||
github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y=
|
||||
github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
|
||||
|
|
@ -62,6 +73,8 @@ github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0
|
|||
github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw=
|
||||
github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0=
|
||||
github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc=
|
||||
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
|
||||
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
|
||||
github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
|
||||
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
|
||||
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
|
||||
|
|
@ -77,6 +90,8 @@ github.com/open-policy-agent/cert-controller v0.12.0 h1:RKXlBafMcCh+++I1geJetXo7
|
|||
github.com/open-policy-agent/cert-controller v0.12.0/go.mod h1:N5bCFXdAXMYx0PdS6ZQ9lrDQQMz+F6deoChym6VleXw=
|
||||
github.com/open-policy-agent/frameworks/constraint v0.0.0-20241101234656-e78c8abd754a h1:gQtOJ50XFyL2Xh3lDD9zP4KQ2PY4mZKQ9hDcWc81Sp8=
|
||||
github.com/open-policy-agent/frameworks/constraint v0.0.0-20241101234656-e78c8abd754a/go.mod h1:tI7nc6H6os2UYZRvSm9Y7bq4oMoXqhwA0WfnqKpoAgc=
|
||||
github.com/pelletier/go-toml v1.9.5 h1:4yBQzkHv+7BHq2PQUZF3Mx0IYxG7LsP222s7Agd3ve8=
|
||||
github.com/pelletier/go-toml v1.9.5/go.mod h1:u1nR/EPcESfeI/szUZKdtJ0xRNbUoANCkoOuaOx1Y+c=
|
||||
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
|
||||
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
|
|
@ -92,6 +107,9 @@ github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0leargg
|
|||
github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk=
|
||||
github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8=
|
||||
github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4=
|
||||
github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
|
||||
github.com/spf13/cobra v1.8.1 h1:e5/vxKd/rZsfSJMUX1agtjeTDf+qv1/JdBF8gg5k9ZM=
|
||||
github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y=
|
||||
github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA=
|
||||
github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
|
||||
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
|
||||
|
|
@ -135,6 +153,7 @@ golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
|
|||
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
|
||||
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.28.0 h1:Fksou7UEQUWlKvIdsqzJmUmCX3cZuD2+P3XyyzwMhlA=
|
||||
golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
||||
golang.org/x/term v0.27.0 h1:WP60Sv1nlK1T6SupCHbXzSaN0b9wUmsPoRS9b61A23Q=
|
||||
|
|
@ -196,6 +215,8 @@ sigs.k8s.io/jobset v0.8.0 h1:80cJcPld+IMdKFOqzEW4et3Y6lGAPcP8YmBZ+aiKGYA=
|
|||
sigs.k8s.io/jobset v0.8.0/go.mod h1:yitjuGOExl2p964nhyevQGIkfiPSRHcdC3zNBneKCT8=
|
||||
sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 h1:/Rv+M11QRah1itp8VhT6HoVx1Ray9eB4DBr+K+/sCJ8=
|
||||
sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3/go.mod h1:18nIHnGi6636UCz6m8i4DhaJ65T6EruyzmoQqI2BVDo=
|
||||
sigs.k8s.io/kind v0.27.0 h1:PQ3f0iAWNIj66LYkZ1ivhEg/+Zb6UPMbO+qVei/INZA=
|
||||
sigs.k8s.io/kind v0.27.0/go.mod h1:RZVFmy6qcwlSWwp6xeIUv7kXCPF3i8MXsEXxW/J+gJY=
|
||||
sigs.k8s.io/scheduler-plugins v0.30.6 h1:P4pViMVoyVNHWmkG96UtJ4LvxkUIeenIUKLZd09vDyw=
|
||||
sigs.k8s.io/scheduler-plugins v0.30.6/go.mod h1:EDYYqHmpHR//VYKAeud1TTQbTFSvpdGFeyEg9ejOmnI=
|
||||
sigs.k8s.io/structured-merge-diff/v4 v4.5.0 h1:nbCitCK2hfnhyiKo6uf2HxUPTCodY6Qaf85SbDIaMBk=
|
||||
|
|
|
|||
|
|
@ -14,63 +14,36 @@
|
|||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# TODO (andreyvelich): Refactor this script for Kubeflow Trainer V2
|
||||
|
||||
# This bash script is used to run the example notebooks
|
||||
# This shell is used to run Jupyter Notebook with Papermill.
|
||||
|
||||
set -o errexit
|
||||
set -o nounset
|
||||
set -o pipefail
|
||||
set -x
|
||||
|
||||
NOTEBOOK_INPUT=""
|
||||
NOTEBOOK_OUTPUT="-" # outputs to console
|
||||
NAMESPACE="default"
|
||||
TRAINING_PYTHON_SDK="./sdk/python"
|
||||
if [ -z "${NOTEBOOK_INPUT}" ]; then
|
||||
echo "NOTEBOOK_INPUT env variable must be set to run this script."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
usage() {
|
||||
echo "Usage: $0 -i <input_notebook> -o <output_notebook> [-p \"<param> <value>\"...] [-y <params.yaml>]"
|
||||
echo "Options:"
|
||||
echo " -i Input notebook (required)"
|
||||
echo " -o Output notebook (required)"
|
||||
echo " -k Kubeflow Training Operator Python SDK (optional)"
|
||||
echo " -n Kubernetes namespace used by tests (optional)"
|
||||
echo " -h Show this help message"
|
||||
echo "NOTE: papermill, jupyter and ipykernel are required Python dependencies to run Notebooks"
|
||||
exit 1
|
||||
if [ -z "${NOTEBOOK_OUTPUT}" ]; then
|
||||
echo "NOTEBOOK_OUTPUT env variable must be set to run this script."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ -z "${PAPERMILL_TIMEOUT}" ]; then
|
||||
echo "PAPERMILL_TIMEOUT env variable must be set to run this script."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
print_results() {
|
||||
kubectl get pods
|
||||
kubectl describe pod
|
||||
kubectl describe trainjob
|
||||
kubectl logs -n kubeflow-system -l app.kubernetes.io/name=trainer
|
||||
kubectl logs -l jobset.sigs.k8s.io/replicatedjob-name=trainer-node,batch.kubernetes.io/job-completion-index=0 --tail -1
|
||||
kubectl wait trainjob --for=condition=Complete --all --timeout 3s
|
||||
}
|
||||
|
||||
while getopts "i:o:p:k:n:r:d:h:" opt; do
|
||||
case "$opt" in
|
||||
i) NOTEBOOK_INPUT="$OPTARG" ;; # -i for notebook input path
|
||||
o) NOTEBOOK_OUTPUT="$OPTARG" ;; # -o for notebook output path
|
||||
k) TRAINING_PYTHON_SDK="$OPTARG" ;; # -k for training operator python sdk
|
||||
n) NAMESPACE="$OPTARG" ;; # -n for kubernetes namespace used by tests
|
||||
h) usage ;; # -h for help (usage)
|
||||
*)
|
||||
usage
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
if [ -z "$NOTEBOOK_INPUT" ]; then
|
||||
echo "Error: -i notebook input path is required."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
papermill_cmd="papermill $NOTEBOOK_INPUT $NOTEBOOK_OUTPUT -p training_python_sdk $TRAINING_PYTHON_SDK -p namespace $NAMESPACE"
|
||||
|
||||
if ! command -v papermill &>/dev/null; then
|
||||
echo "Error: papermill is not installed. Please install papermill to proceed."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "Running command: $papermill_cmd"
|
||||
$papermill_cmd
|
||||
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "Error: papermill execution failed." >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "Notebook execution completed successfully"
|
||||
(papermill "${NOTEBOOK_INPUT}" "${NOTEBOOK_OUTPUT}" --execution-timeout "${PAPERMILL_TIMEOUT}" && print_results) ||
|
||||
(print_results && exit 1)
|
||||
|
|
|
|||
|
|
@ -14,74 +14,67 @@
|
|||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# TODO (andreyvelich): Refactor this script for Kubeflow Trainer V2
|
||||
# This shell is used to setup Kind cluster for Kubeflow Trainer e2e tests.
|
||||
|
||||
set -o errexit
|
||||
set -o nounset
|
||||
set -o pipefail
|
||||
set -x
|
||||
|
||||
echo "Kind load newly locally built image"
|
||||
# use cluster name which is used in github actions kind create
|
||||
kind load docker-image ${TRAINING_CI_IMAGE} --name ${KIND_CLUSTER}
|
||||
# Configure variables.
|
||||
KIND=${KIND:-./bin/kind}
|
||||
K8S_VERSION=${K8S_VERSION:-1.32.0}
|
||||
KIND_NODE_VERSION=kindest/node:v${K8S_VERSION}
|
||||
NAMESPACE="kubeflow-system"
|
||||
TIMEOUT="5m"
|
||||
|
||||
echo "Update training operator manifest with newly built image"
|
||||
cd manifests/overlays/standalone
|
||||
kustomize edit set image kubeflow/training-operator=${TRAINING_CI_IMAGE}
|
||||
# Kubeflow Trainer images.
|
||||
# TODO (andreyvelich): Support initializers images.
|
||||
CONTROLLER_MANAGER_CI_IMAGE=trainer-controller-manager:test
|
||||
echo "Build Kubeflow Trainer images"
|
||||
docker build . -f cmd/trainer-controller-manager/Dockerfile -t ${CONTROLLER_MANAGER_CI_IMAGE}
|
||||
|
||||
echo "Installing training operator manifests"
|
||||
kustomize build . | kubectl apply --server-side -f -
|
||||
echo "Set the image in Kustomize overlay"
|
||||
cd manifests/overlays/manager
|
||||
kustomize edit set image kubeflow/trainer-controller-manager=${CONTROLLER_MANAGER_CI_IMAGE}
|
||||
|
||||
if [ "${GANG_SCHEDULER_NAME}" = "scheduler-plugins" ]; then
|
||||
SCHEDULER_PLUGINS_VERSION=$(go list -m -f "{{.Version}}" sigs.k8s.io/scheduler-plugins)
|
||||
git clone https://github.com/kubernetes-sigs/scheduler-plugins.git -b "${SCHEDULER_PLUGINS_VERSION}"
|
||||
echo "Create Kind cluster and load Kubeflow Trainer images"
|
||||
${KIND} create cluster --image "${KIND_NODE_VERSION}"
|
||||
${KIND} load docker-image ${CONTROLLER_MANAGER_CI_IMAGE}
|
||||
|
||||
echo "Installing Scheduler Plugins ${SCHEDULER_PLUGINS_VERSION}..."
|
||||
helm install scheduler-plugins scheduler-plugins/manifests/install/charts/as-a-second-scheduler/ --create-namespace \
|
||||
--namespace scheduler-plugins \
|
||||
--set controller.image="registry.k8s.io/scheduler-plugins/controller:${SCHEDULER_PLUGINS_VERSION}" \
|
||||
--set scheduler.image="registry.k8s.io/scheduler-plugins/kube-scheduler:${SCHEDULER_PLUGINS_VERSION}"
|
||||
echo "Deploy Kubeflow Trainer control plane"
|
||||
kubectl apply --server-side -k .
|
||||
|
||||
echo "Configure gang-scheduling using scheduler-plugins to training-operator"
|
||||
kubectl patch -n kubeflow deployments training-operator --type='json' \
|
||||
-p='[{"op": "add", "path": "/spec/template/spec/containers/0/command/1", "value": "--gang-scheduler-name=scheduler-plugins"}]'
|
||||
elif [ "${GANG_SCHEDULER_NAME}" = "volcano" ]; then
|
||||
VOLCANO_SCHEDULER_VERSION=$(go list -m -f "{{.Version}}" volcano.sh/apis)
|
||||
|
||||
# patch scheduler first so that it is ready when scheduler-deployment installing finished
|
||||
echo "Configure gang-scheduling using volcano to training-operator"
|
||||
kubectl patch -n kubeflow deployments training-operator --type='json' \
|
||||
-p='[{"op": "add", "path": "/spec/template/spec/containers/0/command/1", "value": "--gang-scheduler-name=volcano"}]'
|
||||
|
||||
echo "Installing volcano scheduler ${VOLCANO_SCHEDULER_VERSION}..."
|
||||
kubectl apply -f https://raw.githubusercontent.com/volcano-sh/volcano/${VOLCANO_SCHEDULER_VERSION}/installer/volcano-development.yaml
|
||||
fi
|
||||
|
||||
TIMEOUT=30
|
||||
until kubectl get pods -n kubeflow | grep training-operator | grep 1/1 || [[ $TIMEOUT -eq 1 ]]; do
|
||||
sleep 10
|
||||
TIMEOUT=$((TIMEOUT - 1))
|
||||
done
|
||||
if [ "${GANG_SCHEDULER_NAME}" = "scheduler-plugins" ]; then
|
||||
kubectl wait pods --for=condition=ready -n scheduler-plugins --timeout "${TIMEOUT}s" --all ||
|
||||
(
|
||||
kubectl get pods -n scheduler-plugins && kubectl describe pods -n scheduler-plugins
|
||||
# We should wait until Deployment is in Ready status.
|
||||
echo "Wait for Kubeflow Trainer to be ready"
|
||||
(kubectl wait deploy/kubeflow-trainer-controller-manager --for=condition=available -n ${NAMESPACE} --timeout ${TIMEOUT} &&
|
||||
kubectl wait pods --for=condition=ready -n ${NAMESPACE} --timeout ${TIMEOUT} --all) ||
|
||||
(
|
||||
echo "Failed to wait until Kubeflow Trainer is ready" &&
|
||||
kubectl get pods -n ${NAMESPACE} &&
|
||||
kubectl describe pods -n ${NAMESPACE} &&
|
||||
exit 1
|
||||
)
|
||||
fi
|
||||
)
|
||||
|
||||
# wait for volcano up
|
||||
if [ "${GANG_SCHEDULER_NAME}" = "volcano" ]; then
|
||||
kubectl rollout status deployment -n volcano-system volcano-admission --timeout "${TIMEOUT}s" &&
|
||||
kubectl rollout status deployment -n volcano-system volcano-scheduler --timeout "${TIMEOUT}s" &&
|
||||
kubectl rollout status deployment -n volcano-system volcano-controllers --timeout "${TIMEOUT}s" ||
|
||||
(
|
||||
kubectl get pods -n volcano-system && kubectl describe pods -n volcano-system
|
||||
exit 1
|
||||
)
|
||||
fi
|
||||
print_cluster_info() {
|
||||
kubectl version
|
||||
kubectl cluster-info
|
||||
kubectl get nodes
|
||||
kubectl get pods -n ${NAMESPACE}
|
||||
kubectl describe pod -n ${NAMESPACE}
|
||||
}
|
||||
|
||||
kubectl version
|
||||
kubectl cluster-info
|
||||
kubectl get nodes
|
||||
kubectl get pods -n kubeflow
|
||||
kubectl describe pods -n kubeflow
|
||||
# TODO (andreyvelich): Currently, we print manager logs due to flaky test.
|
||||
echo "Deploy Kubeflow Trainer runtimes"
|
||||
(cd ../runtimes && kubectl apply --server-side -k .) || (
|
||||
kubectl logs -n ${NAMESPACE} -l app.kubernetes.io/name=trainer &&
|
||||
print_cluster_info &&
|
||||
exit 1
|
||||
)
|
||||
|
||||
# TODO (andreyvelich): Discuss how we want to pre-load runtime images to the Kind cluster.
|
||||
TORCH_RUNTIME_IMAGE=pytorch/pytorch:2.5.0-cuda12.4-cudnn9-runtime
|
||||
docker pull ${TORCH_RUNTIME_IMAGE}
|
||||
${KIND} load docker-image ${TORCH_RUNTIME_IMAGE}
|
||||
|
||||
print_cluster_info
|
||||
|
|
|
|||
|
|
@ -18,4 +18,5 @@ package tools
|
|||
|
||||
import (
|
||||
_ "k8s.io/code-generator"
|
||||
_ "sigs.k8s.io/kind/cmd/kind/app"
|
||||
)
|
||||
|
|
|
|||
|
|
@ -14,8 +14,7 @@
|
|||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# This shell is used to auto generate some useful tools for k8s, such as clientset, lister, informer and so on.
|
||||
# We don't use this tool to generate deepcopy because kubebuilder (controller-tools) has covered that part.
|
||||
# This shell is used to auto generate some useful tools for k8s, such as clientset, lister, informer.
|
||||
|
||||
set -o errexit
|
||||
set -o nounset
|
||||
|
|
|
|||
|
|
@ -202,6 +202,12 @@ class TrainerClient:
|
|||
trainer.resources_per_node
|
||||
)
|
||||
|
||||
# Set numProcPerNode to the Trainer.
|
||||
if trainer and trainer.resources_per_node:
|
||||
trainer_crd.num_proc_per_node = utils.get_num_proc_per_node(
|
||||
trainer.resources_per_node
|
||||
)
|
||||
|
||||
# Add command and args to the Trainer if training function is set.
|
||||
if trainer and trainer.func:
|
||||
trainer_crd.command = constants.DEFAULT_COMMAND
|
||||
|
|
|
|||
|
|
@ -14,6 +14,7 @@
|
|||
|
||||
import inspect
|
||||
import json
|
||||
import math
|
||||
import os
|
||||
import queue
|
||||
import textwrap
|
||||
|
|
@ -122,6 +123,31 @@ def get_resources_per_node(resources_per_node: dict) -> client.V1ResourceRequire
|
|||
return resources
|
||||
|
||||
|
||||
# TODO (andreyvelich): Move this part to the Kubeflow Trainer torch plugins.
|
||||
# Ref issue: https://github.com/kubeflow/trainer/issues/2407
|
||||
# TODO (andreyvelich): Move this part to the Kubeflow Trainer torch plugins.
# Ref issue: https://github.com/kubeflow/trainer/issues/2407
def get_num_proc_per_node(resources_per_node: dict) -> object:
    """Get the Trainer numProcPerNode from the given resources.

    Args:
        resources_per_node: Resource requests per node, e.g.
            ``{"nvidia.com/gpu": 2}`` or ``{"cpu": "4"}``. Keys are matched
            case-insensitively.

    Returns:
        The GPU count if any resource key contains ``gpu``; otherwise the CPU
        count rounded up to a whole number; otherwise the string ``"auto"``.
    """
    # Normalize keys so "CPU", "cpu", and "nvidia.com/GPU" all match.
    resources = {k.lower(): v for k, v in resources_per_node.items()}

    # numProcPerNode is equal to the number of GPUs when any GPU is requested.
    for key, value in resources.items():
        if "gpu" in key:
            return value

    # Otherwise, fall back to the number of CPUs.
    for key, value in resources.items():
        if "cpu" in key:
            # Fix: the original used `math.ceil(int(value))`, which is a no-op
            # ceil and rejects fractional counts like "2.5" (int() raises).
            # Parse as float and round up so fractional CPUs map to a whole
            # process count. Quantities that are not plain numbers (e.g.
            # milliCPU values such as "250m") still cannot be converted, so we
            # fall through to "auto".
            try:
                return math.ceil(float(value))
            except (TypeError, ValueError):
                pass

    # No GPU or CPU request found: let torchrun decide.
    return "auto"
|
||||
|
||||
|
||||
def get_args_using_train_func(
|
||||
train_func: Callable,
|
||||
train_func_parameters: Optional[Dict[str, Any]] = None,
|
||||
|
|
|
|||
|
|
@ -31,8 +31,9 @@ classifiers = [
|
|||
]
|
||||
dependencies = [
|
||||
"kubernetes>=27.2.0",
|
||||
"pydantic>=2.10.0",
|
||||
"jobset @ git+https://github.com/kubernetes-sigs/jobset.git@v0.8.0#subdirectory=sdk/python",
|
||||
# TODO (andreyvelich): Update JobSet to v0.8.0 once this PR is merged: https://github.com/kubeflow/trainer/pull/2466
|
||||
# "pydantic>=2.10.0",
|
||||
"jobset @ git+https://github.com/kubernetes-sigs/jobset.git@v0.7.2#subdirectory=sdk/python",
|
||||
]
|
||||
|
||||
[project.urls]
|
||||
|
|
|
|||
|
|
@ -0,0 +1,78 @@
|
|||
package e2e
|
||||
|
||||
import (
|
||||
"github.com/onsi/ginkgo/v2"
|
||||
"github.com/onsi/gomega"
|
||||
corev1 "k8s.io/api/core/v1"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"sigs.k8s.io/controller-runtime/pkg/client"
|
||||
jobsetconsts "sigs.k8s.io/jobset/pkg/constants"
|
||||
|
||||
trainer "github.com/kubeflow/trainer/pkg/apis/trainer/v1alpha1"
|
||||
"github.com/kubeflow/trainer/pkg/constants"
|
||||
testingutil "github.com/kubeflow/trainer/pkg/util/testing"
|
||||
"github.com/kubeflow/trainer/test/util"
|
||||
)
|
||||
|
||||
var _ = ginkgo.Describe("TrainJob e2e", func() {
|
||||
// Each test runs in a separate namespace.
|
||||
var ns *corev1.Namespace
|
||||
|
||||
// Create test namespace before each test.
|
||||
ginkgo.BeforeEach(func() {
|
||||
ns = &corev1.Namespace{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
GenerateName: "e2e-",
|
||||
},
|
||||
}
|
||||
gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed())
|
||||
|
||||
// Wait for namespace to exist before proceeding with test.
|
||||
gomega.Eventually(func(g gomega.Gomega) {
|
||||
g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(ns), ns)).Should(gomega.Succeed())
|
||||
}, util.TimeoutE2E, util.Interval).Should(gomega.Succeed())
|
||||
})
|
||||
|
||||
// Delete test namespace after each test.
|
||||
ginkgo.AfterEach(func() {
|
||||
// Delete test namespace after each test.
|
||||
gomega.Expect(k8sClient.Delete(ctx, ns)).To(gomega.Succeed())
|
||||
})
|
||||
|
||||
// These tests create TrainJob that reference supported runtime without any additional changes.
|
||||
ginkgo.When("creating TrainJob", func() {
|
||||
// Verify `torch-distributed` ClusterTrainingRuntime.
|
||||
ginkgo.It("should create TrainJob with PyTorch runtime reference", func() {
|
||||
// Create a TrainJob.
|
||||
trainJob := testingutil.MakeTrainJobWrapper(ns.Name, "e2e-test").
|
||||
RuntimeRef(trainer.SchemeGroupVersion.WithKind(trainer.ClusterTrainingRuntimeKind), "torch-distributed").
|
||||
Obj()
|
||||
|
||||
ginkgo.By("Create a TrainJob with torch-distributed runtime reference", func() {
|
||||
gomega.Expect(k8sClient.Create(ctx, trainJob)).Should(gomega.Succeed())
|
||||
})
|
||||
|
||||
// Wait for TrainJob to be in Succeeded status.
|
||||
ginkgo.By("Wait for TrainJob to be in Succeeded status", func() {
|
||||
gomega.Eventually(func(g gomega.Gomega) {
|
||||
gotTrainJob := &trainer.TrainJob{}
|
||||
g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(trainJob), gotTrainJob)).Should(gomega.Succeed())
|
||||
g.Expect(gotTrainJob.Status.Conditions).Should(gomega.BeComparableTo([]metav1.Condition{
|
||||
{
|
||||
Type: trainer.TrainJobCreated,
|
||||
Status: metav1.ConditionTrue,
|
||||
Reason: trainer.TrainJobJobsCreationSucceededReason,
|
||||
Message: constants.TrainJobJobsCreationSucceededMessage,
|
||||
},
|
||||
{
|
||||
Type: trainer.TrainJobComplete,
|
||||
Status: metav1.ConditionTrue,
|
||||
Reason: jobsetconsts.AllJobsCompletedReason,
|
||||
Message: jobsetconsts.AllJobsCompletedMessage,
|
||||
},
|
||||
}, util.IgnoreConditions))
|
||||
}, util.TimeoutE2E, util.Interval).Should(gomega.Succeed())
|
||||
})
|
||||
})
|
||||
})
|
||||
})
|
||||
|
|
@ -0,0 +1,56 @@
|
|||
/*
|
||||
Copyright 2024 The Kubeflow Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package e2e
|
||||
|
||||
import (
|
||||
"context"
|
||||
"testing"
|
||||
|
||||
"github.com/onsi/ginkgo/v2"
|
||||
"github.com/onsi/gomega"
|
||||
"k8s.io/client-go/kubernetes/scheme"
|
||||
"sigs.k8s.io/controller-runtime/pkg/client"
|
||||
"sigs.k8s.io/controller-runtime/pkg/client/config"
|
||||
|
||||
trainer "github.com/kubeflow/trainer/pkg/apis/trainer/v1alpha1"
|
||||
)
|
||||
|
||||
var (
	// k8sClient is the controller-runtime client shared by all e2e specs to
	// talk to the cluster under test; initialized in BeforeSuite.
	k8sClient client.Client
	// ctx is the base context shared across the suite; initialized in BeforeSuite.
	ctx context.Context
)
|
||||
|
||||
// TestAPIs is the `go test` entry point: it wires Gomega failures into
// Ginkgo and hands control to the Ginkgo runner for the whole e2e suite.
func TestAPIs(t *testing.T) {
	gomega.RegisterFailHandler(ginkgo.Fail)
	ginkgo.RunSpecs(t, "Kubeflow Trainer E2E Suite")
}
|
||||
|
||||
var _ = ginkgo.BeforeSuite(func() {
|
||||
ctx = context.Background()
|
||||
// Get Kubernetes config.
|
||||
cfg := config.GetConfigOrDie()
|
||||
gomega.ExpectWithOffset(1, cfg).NotTo(gomega.BeNil())
|
||||
|
||||
// Add Trainer APIs.
|
||||
err := trainer.AddToScheme(scheme.Scheme)
|
||||
gomega.ExpectWithOffset(1, err).NotTo(gomega.HaveOccurred())
|
||||
|
||||
// Configure k8s client.
|
||||
k8sClient, err = client.New(cfg, client.Options{Scheme: scheme.Scheme})
|
||||
gomega.Expect(err).NotTo(gomega.HaveOccurred())
|
||||
gomega.Expect(k8sClient).NotTo(gomega.BeNil())
|
||||
})
|
||||
|
|
@ -17,14 +17,16 @@ limitations under the License.
|
|||
package util
|
||||
|
||||
import (
|
||||
"time"
|
||||
|
||||
"github.com/google/go-cmp/cmp"
|
||||
"github.com/google/go-cmp/cmp/cmpopts"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"time"
|
||||
)
|
||||
|
||||
const (
	// Timeout bounds asynchronous assertions in the non-e2e tests.
	Timeout = 5 * time.Second
	// TimeoutE2E bounds Eventually polling in the e2e tests.
	TimeoutE2E = 10 * time.Minute // E2Es require a longer timeout due to large image pull
	// ConsistentDuration is the window for Consistently-style checks
	// (not exercised in this chunk — confirm against callers).
	ConsistentDuration = time.Second
	// Interval is the polling interval for Eventually/Consistently assertions.
	Interval = time.Millisecond * 250
)
|
||||
|
|
|
|||
Loading…
Reference in New Issue