mirror of https://github.com/kubeflow/katib.git
Compare commits
5 Commits
| Author | SHA1 | Date |
|---|---|---|
| | f09dbf170b | |
| | d79b8d279d | |
| | 3d3fb391db | |
| | 7dcdde7af9 | |
| | 2daece483c | |
@ -1,5 +1,5 @@
|
|||
# Reusable workflows for publishing Katib images.
|
||||
name: Build And Publish Images
|
||||
name: Build and Publish Images
|
||||
|
||||
on:
|
||||
workflow_call:
|
||||
|
@ -21,31 +21,50 @@ on:
|
|||
|
||||
jobs:
|
||||
build-and-publish:
|
||||
name: Publish Image
|
||||
name: Build and Publish Images
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Docker Login
|
||||
# Trigger workflow only for kubeflow/katib repository with specific branch (master, release-.*) or tag (v.*).
|
||||
if: >-
|
||||
github.repository == 'kubeflow/katib' &&
|
||||
(github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release-') || startsWith(github.ref, 'refs/tags/v'))
|
||||
- name: Set Publish Condition
|
||||
id: publish-condition
|
||||
shell: bash
|
||||
run: |
|
||||
if [[ "${{ github.repository }}" == 'kubeflow/katib' && \
|
||||
( "${{ github.ref }}" == 'refs/heads/master' || \
|
||||
"${{ github.ref }}" =~ ^refs/heads/release- || \
|
||||
"${{ github.ref }}" =~ ^refs/tags/v ) ]]; then
|
||||
echo "should_publish=true" >> $GITHUB_OUTPUT
|
||||
else
|
||||
echo "should_publish=false" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
- name: GHCR Login
|
||||
if: steps.publish-condition.outputs.should_publish == 'true'
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
registry: ghcr.io
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: DockerHub Login
|
||||
if: steps.publish-condition.outputs.should_publish == 'true'
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
registry: docker.io
|
||||
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
||||
|
||||
- name: Publish Component ${{ inputs.component-name }}
|
||||
# Trigger workflow only for kubeflow/katib repository with specific branch (master, release-.*) or tag (v.*).
|
||||
if: >-
|
||||
github.repository == 'kubeflow/katib' &&
|
||||
(github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release-') || startsWith(github.ref, 'refs/tags/v'))
|
||||
if: steps.publish-condition.outputs.should_publish == 'true'
|
||||
id: publish
|
||||
uses: ./.github/workflows/template-publish-image
|
||||
with:
|
||||
image: docker.io/kubeflowkatib/${{ inputs.component-name }}
|
||||
image: |
|
||||
ghcr.io/kubeflow/katib/${{ inputs.component-name }}
|
||||
docker.io/kubeflowkatib/${{ inputs.component-name }}
|
||||
dockerfile: ${{ inputs.dockerfile }}
|
||||
platforms: ${{ inputs.platforms }}
|
||||
push: true
|
||||
|
@ -54,7 +73,9 @@ jobs:
|
|||
if: steps.publish.outcome == 'skipped'
|
||||
uses: ./.github/workflows/template-publish-image
|
||||
with:
|
||||
image: docker.io/kubeflowkatib/${{ inputs.component-name }}
|
||||
image: |
|
||||
ghcr.io/kubeflow/katib/${{ inputs.component-name }}
|
||||
docker.io/kubeflowkatib/${{ inputs.component-name }}
|
||||
dockerfile: ${{ inputs.dockerfile }}
|
||||
platforms: ${{ inputs.platforms }}
|
||||
push: false
|
||||
|
|
2
Makefile
2
Makefile
|
@ -5,7 +5,7 @@ HAS_SETUP_ENVTEST := $(shell command -v setup-envtest;)
|
|||
HAS_MOCKGEN := $(shell command -v mockgen;)
|
||||
|
||||
COMMIT := v1beta1-$(shell git rev-parse --short=7 HEAD)
|
||||
KATIB_REGISTRY := docker.io/kubeflowkatib
|
||||
KATIB_REGISTRY := ghcr.io/kubeflow/katib
|
||||
CPU_ARCH ?= linux/amd64,linux/arm64
|
||||
ENVTEST_K8S_VERSION ?= 1.31
|
||||
MOCKGEN_VERSION ?= $(shell grep 'go.uber.org/mock' go.mod | cut -d ' ' -f 2)
|
||||
|
|
|
@ -22,7 +22,7 @@ The following table shows images for the
|
|||
</tr>
|
||||
<tr align="center">
|
||||
<td>
|
||||
<code>docker.io/kubeflowkatib/katib-controller</code>
|
||||
<code>ghcr.io/kubeflow/katib/katib-controller</code>
|
||||
</td>
|
||||
<td>
|
||||
Katib Controller
|
||||
|
@ -33,7 +33,7 @@ The following table shows images for the
|
|||
</tr>
|
||||
<tr align="center">
|
||||
<td>
|
||||
<code>docker.io/kubeflowkatib/katib-ui</code>
|
||||
<code>ghcr.io/kubeflow/katib/katib-ui</code>
|
||||
</td>
|
||||
<td>
|
||||
Katib User Interface
|
||||
|
@ -44,7 +44,7 @@ The following table shows images for the
|
|||
</tr>
|
||||
<tr align="center">
|
||||
<td>
|
||||
<code>docker.io/kubeflowkatib/katib-db-manager</code>
|
||||
<code>ghcr.io/kubeflow/katib/katib-db-manager</code>
|
||||
</td>
|
||||
<td>
|
||||
Katib DB Manager
|
||||
|
@ -87,7 +87,7 @@ The following table shows images for the
|
|||
</tr>
|
||||
<tr align="center">
|
||||
<td>
|
||||
<code>docker.io/kubeflowkatib/file-metrics-collector</code>
|
||||
<code>ghcr.io/kubeflow/katib/file-metrics-collector</code>
|
||||
</td>
|
||||
<td>
|
||||
File Metrics Collector
|
||||
|
@ -98,7 +98,7 @@ The following table shows images for the
|
|||
</tr>
|
||||
<tr align="center">
|
||||
<td>
|
||||
<code>docker.io/kubeflowkatib/tfevent-metrics-collector</code>
|
||||
<code>ghcr.io/kubeflow/katib/tfevent-metrics-collector</code>
|
||||
</td>
|
||||
<td>
|
||||
Tensorflow Event Metrics Collector
|
||||
|
@ -131,7 +131,7 @@ and the [Katib Early Stopping algorithms](https://www.kubeflow.org/docs/componen
|
|||
</tr>
|
||||
<tr align="center">
|
||||
<td>
|
||||
<code>docker.io/kubeflowkatib/suggestion-hyperopt</code>
|
||||
<code>ghcr.io/kubeflow/katib/suggestion-hyperopt</code>
|
||||
</td>
|
||||
<td>
|
||||
<a href="https://github.com/hyperopt/hyperopt">Hyperopt</a> Suggestion
|
||||
|
@ -142,7 +142,7 @@ and the [Katib Early Stopping algorithms](https://www.kubeflow.org/docs/componen
|
|||
</tr>
|
||||
<tr align="center">
|
||||
<td>
|
||||
<code>docker.io/kubeflowkatib/suggestion-skopt</code>
|
||||
<code>ghcr.io/kubeflow/katib/suggestion-skopt</code>
|
||||
</td>
|
||||
<td>
|
||||
<a href="https://github.com/scikit-optimize/scikit-optimize">Skopt</a> Suggestion
|
||||
|
@ -153,7 +153,7 @@ and the [Katib Early Stopping algorithms](https://www.kubeflow.org/docs/componen
|
|||
</tr>
|
||||
<tr align="center">
|
||||
<td>
|
||||
<code>docker.io/kubeflowkatib/suggestion-optuna</code>
|
||||
<code>ghcr.io/kubeflow/katib/suggestion-optuna</code>
|
||||
</td>
|
||||
<td>
|
||||
<a href="https://github.com/optuna/optuna">Optuna</a> Suggestion
|
||||
|
@ -164,7 +164,7 @@ and the [Katib Early Stopping algorithms](https://www.kubeflow.org/docs/componen
|
|||
</tr>
|
||||
<tr align="center">
|
||||
<td>
|
||||
<code>docker.io/kubeflowkatib/suggestion-goptuna</code>
|
||||
<code>ghcr.io/kubeflow/katib/suggestion-goptuna</code>
|
||||
</td>
|
||||
<td>
|
||||
<a href="https://github.com/c-bata/goptuna">Goptuna</a> Suggestion
|
||||
|
@ -175,7 +175,7 @@ and the [Katib Early Stopping algorithms](https://www.kubeflow.org/docs/componen
|
|||
</tr>
|
||||
<tr align="center">
|
||||
<td>
|
||||
<code>docker.io/kubeflowkatib/suggestion-hyperband</code>
|
||||
<code>ghcr.io/kubeflow/katib/suggestion-hyperband</code>
|
||||
</td>
|
||||
<td>
|
||||
<a href="https://www.kubeflow.org/docs/components/katib/experiment/#hyperband">Hyperband</a> Suggestion
|
||||
|
@ -186,7 +186,7 @@ and the [Katib Early Stopping algorithms](https://www.kubeflow.org/docs/componen
|
|||
</tr>
|
||||
<tr align="center">
|
||||
<td>
|
||||
<code>docker.io/kubeflowkatib/suggestion-enas</code>
|
||||
<code>ghcr.io/kubeflow/katib/suggestion-enas</code>
|
||||
</td>
|
||||
<td>
|
||||
<a href="https://www.kubeflow.org/docs/components/katib/experiment/#enas">ENAS</a> Suggestion
|
||||
|
@ -197,7 +197,7 @@ and the [Katib Early Stopping algorithms](https://www.kubeflow.org/docs/componen
|
|||
</tr>
|
||||
<tr align="center">
|
||||
<td>
|
||||
<code>docker.io/kubeflowkatib/suggestion-darts</code>
|
||||
<code>ghcr.io/kubeflow/katib/suggestion-darts</code>
|
||||
</td>
|
||||
<td>
|
||||
<a href="https://www.kubeflow.org/docs/components/katib/experiment/#differentiable-architecture-search-darts">DARTS</a> Suggestion
|
||||
|
@ -208,7 +208,7 @@ and the [Katib Early Stopping algorithms](https://www.kubeflow.org/docs/componen
|
|||
</tr>
|
||||
<tr align="center">
|
||||
<td>
|
||||
<code>docker.io/kubeflowkatib/earlystopping-medianstop</code>
|
||||
<code>ghcr.io/kubeflow/katib/earlystopping-medianstop</code>
|
||||
</td>
|
||||
<td>
|
||||
<a href="https://www.kubeflow.org/docs/components/katib/early-stopping/#median-stopping-rule">Median Stopping Rule</a>
|
||||
|
@ -240,7 +240,7 @@ The following table shows images for training containers which are used in the
|
|||
</tr>
|
||||
<tr align="center">
|
||||
<td>
|
||||
<code>docker.io/kubeflowkatib/pytorch-mnist-cpu</code>
|
||||
<code>ghcr.io/kubeflow/katib/pytorch-mnist-cpu</code>
|
||||
</td>
|
||||
<td>
|
||||
PyTorch MNIST example with printing metrics to the file or StdOut with CPU support
|
||||
|
@ -251,7 +251,7 @@ The following table shows images for training containers which are used in the
|
|||
</tr>
|
||||
<tr align="center">
|
||||
<td>
|
||||
<code>docker.io/kubeflowkatib/pytorch-mnist-gpu</code>
|
||||
<code>ghcr.io/kubeflow/katib/pytorch-mnist-gpu</code>
|
||||
</td>
|
||||
<td>
|
||||
PyTorch MNIST example with printing metrics to the file or StdOut with GPU support
|
||||
|
@ -262,7 +262,7 @@ The following table shows images for training containers which are used in the
|
|||
</tr>
|
||||
<tr align="center">
|
||||
<td>
|
||||
<code>docker.io/kubeflowkatib/tf-mnist-with-summaries</code>
|
||||
<code>ghcr.io/kubeflow/katib/tf-mnist-with-summaries</code>
|
||||
</td>
|
||||
<td>
|
||||
Tensorflow MNIST example with saving metrics in the summaries
|
||||
|
@ -273,7 +273,7 @@ The following table shows images for training containers which are used in the
|
|||
</tr>
|
||||
<tr align="center">
|
||||
<td>
|
||||
<code>docker.io/kubeflowkatib/xgboost-lightgbm</code>
|
||||
<code>ghcr.io/kubeflow/katib/xgboost-lightgbm</code>
|
||||
</td>
|
||||
<td>
|
||||
Distributed LightGBM example for XGBoostJob
|
||||
|
@ -306,7 +306,7 @@ The following table shows images for training containers which are used in the
|
|||
</tr>
|
||||
<tr align="center">
|
||||
<td>
|
||||
<code>docker.io/kubeflowkatib/enas-cnn-cifar10-gpu</code>
|
||||
<code>ghcr.io/kubeflow/katib/enas-cnn-cifar10-gpu</code>
|
||||
</td>
|
||||
<td>
|
||||
Keras CIFAR-10 CNN example for ENAS with GPU support
|
||||
|
@ -317,7 +317,7 @@ The following table shows images for training containers which are used in the
|
|||
</tr>
|
||||
<tr align="center">
|
||||
<td>
|
||||
<code>docker.io/kubeflowkatib/enas-cnn-cifar10-cpu</code>
|
||||
<code>ghcr.io/kubeflow/katib/enas-cnn-cifar10-cpu</code>
|
||||
</td>
|
||||
<td>
|
||||
Keras CIFAR-10 CNN example for ENAS with CPU support
|
||||
|
@ -328,7 +328,7 @@ The following table shows images for training containers which are used in the
|
|||
</tr>
|
||||
<tr align="center">
|
||||
<td>
|
||||
<code>docker.io/kubeflowkatib/darts-cnn-cifar10-gpu</code>
|
||||
<code>ghcr.io/kubeflow/katib/darts-cnn-cifar10-gpu</code>
|
||||
</td>
|
||||
<td>
|
||||
PyTorch CIFAR-10 CNN example for DARTS with GPU support
|
||||
|
@ -339,7 +339,7 @@ The following table shows images for training containers which are used in the
|
|||
</tr>
|
||||
<tr align="center">
|
||||
<td>
|
||||
<code>docker.io/kubeflowkatib/darts-cnn-cifar10-cpu</code>
|
||||
<code>ghcr.io/kubeflow/katib/darts-cnn-cifar10-cpu</code>
|
||||
</td>
|
||||
<td>
|
||||
PyTorch CIFAR-10 CNN example for DARTS with CPU support
|
||||
|
|
|
@ -74,7 +74,7 @@ spec:
|
|||
- name: epochs
|
||||
container:
|
||||
name: model-training
|
||||
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest
|
||||
image: ghcr.io/kubeflow/katib/pytorch-mnist-cpu:v0.18.0
|
||||
command:
|
||||
- "python3"
|
||||
- "/opt/pytorch-mnist/mnist.py"
|
||||
|
|
|
@ -62,7 +62,7 @@ spec:
|
|||
spec:
|
||||
containers:
|
||||
- name: training-container
|
||||
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest
|
||||
image: ghcr.io/kubeflow/katib/pytorch-mnist-cpu:v0.18.0
|
||||
command:
|
||||
- "python3"
|
||||
- "/opt/pytorch-mnist/mnist.py"
|
||||
|
|
|
@ -52,7 +52,7 @@ spec:
|
|||
spec:
|
||||
containers:
|
||||
- name: training-container
|
||||
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest
|
||||
image: ghcr.io/kubeflow/katib/pytorch-mnist-cpu:v0.18.0
|
||||
command:
|
||||
- "python3"
|
||||
- "/opt/pytorch-mnist/mnist.py"
|
||||
|
|
|
@ -45,7 +45,7 @@ spec:
|
|||
spec:
|
||||
containers:
|
||||
- name: training-container
|
||||
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest
|
||||
image: ghcr.io/kubeflow/katib/pytorch-mnist-cpu:v0.18.0
|
||||
command:
|
||||
- "python3"
|
||||
- "/opt/pytorch-mnist/mnist.py"
|
||||
|
|
|
@ -45,7 +45,7 @@ spec:
|
|||
spec:
|
||||
containers:
|
||||
- name: training-container
|
||||
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest
|
||||
image: ghcr.io/kubeflow/katib/pytorch-mnist-cpu:v0.18.0
|
||||
command:
|
||||
- "python3"
|
||||
- "/opt/pytorch-mnist/mnist.py"
|
||||
|
|
|
@ -44,7 +44,7 @@ spec:
|
|||
spec:
|
||||
containers:
|
||||
- name: training-container
|
||||
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest
|
||||
image: ghcr.io/kubeflow/katib/pytorch-mnist-cpu:v0.18.0
|
||||
command:
|
||||
- "python3"
|
||||
- "/opt/pytorch-mnist/mnist.py"
|
||||
|
|
|
@ -57,7 +57,7 @@ spec:
|
|||
spec:
|
||||
containers:
|
||||
- name: training-container
|
||||
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest
|
||||
image: ghcr.io/kubeflow/katib/pytorch-mnist-cpu:v0.18.0
|
||||
command:
|
||||
- "python3"
|
||||
- "/opt/pytorch-mnist/mnist.py"
|
||||
|
|
|
@ -63,7 +63,7 @@ spec:
|
|||
spec:
|
||||
containers:
|
||||
- name: training-container
|
||||
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest
|
||||
image: ghcr.io/kubeflow/katib/pytorch-mnist-cpu:v0.18.0
|
||||
command:
|
||||
- "python3"
|
||||
- "/opt/pytorch-mnist/mnist.py"
|
||||
|
|
|
@ -42,7 +42,7 @@ spec:
|
|||
spec:
|
||||
containers:
|
||||
- name: training-container
|
||||
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest
|
||||
image: ghcr.io/kubeflow/katib/pytorch-mnist-cpu:v0.18.0
|
||||
command:
|
||||
- "python3"
|
||||
- "/opt/pytorch-mnist/mnist.py"
|
||||
|
|
|
@ -63,7 +63,7 @@ spec:
|
|||
spec:
|
||||
containers:
|
||||
- name: training-container
|
||||
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest
|
||||
image: ghcr.io/kubeflow/katib/pytorch-mnist-cpu:v0.18.0
|
||||
command:
|
||||
- "python3"
|
||||
- "/opt/pytorch-mnist/mnist.py"
|
||||
|
|
|
@ -42,7 +42,7 @@ spec:
|
|||
spec:
|
||||
containers:
|
||||
- name: training-container
|
||||
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest
|
||||
image: ghcr.io/kubeflow/katib/pytorch-mnist-cpu:v0.18.0
|
||||
command:
|
||||
- "python3"
|
||||
- "/opt/pytorch-mnist/mnist.py"
|
||||
|
|
|
@ -43,7 +43,7 @@ spec:
|
|||
spec:
|
||||
containers:
|
||||
- name: training-container
|
||||
image: docker.io/kubeflowkatib/simple-pbt:latest
|
||||
image: ghcr.io/kubeflow/katib/simple-pbt:v0.18.0
|
||||
command:
|
||||
- "python3"
|
||||
- "/opt/pbt/pbt_test.py"
|
||||
|
|
|
@ -42,7 +42,7 @@ spec:
|
|||
spec:
|
||||
containers:
|
||||
- name: training-container
|
||||
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest
|
||||
image: ghcr.io/kubeflow/katib/pytorch-mnist-cpu:v0.18.0
|
||||
command:
|
||||
- "python3"
|
||||
- "/opt/pytorch-mnist/mnist.py"
|
||||
|
|
|
@ -42,7 +42,7 @@ spec:
|
|||
spec:
|
||||
containers:
|
||||
- name: training-container
|
||||
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest
|
||||
image: ghcr.io/kubeflow/katib/pytorch-mnist-cpu:v0.18.0
|
||||
command:
|
||||
- "python3"
|
||||
- "/opt/pytorch-mnist/mnist.py"
|
||||
|
|
|
@ -46,7 +46,7 @@ spec:
|
|||
spec:
|
||||
containers:
|
||||
- name: pytorch
|
||||
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest
|
||||
image: ghcr.io/kubeflow/katib/pytorch-mnist-cpu:v0.18.0
|
||||
command:
|
||||
- "python3"
|
||||
- "/opt/pytorch-mnist/mnist.py"
|
||||
|
@ -61,7 +61,7 @@ spec:
|
|||
spec:
|
||||
containers:
|
||||
- name: pytorch
|
||||
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest
|
||||
image: ghcr.io/kubeflow/katib/pytorch-mnist-cpu:v0.18.0
|
||||
command:
|
||||
- "python3"
|
||||
- "/opt/pytorch-mnist/mnist.py"
|
||||
|
|
|
@ -56,7 +56,7 @@ spec:
|
|||
spec:
|
||||
containers:
|
||||
- name: tensorflow
|
||||
image: docker.io/kubeflowkatib/tf-mnist-with-summaries:latest
|
||||
image: ghcr.io/kubeflow/katib/tf-mnist-with-summaries:v0.18.0
|
||||
command:
|
||||
- "python"
|
||||
- "/opt/tf-mnist-with-summaries/mnist.py"
|
||||
|
|
|
@ -56,7 +56,7 @@ spec:
|
|||
spec:
|
||||
containers:
|
||||
- name: xgboost
|
||||
image: docker.io/kubeflowkatib/xgboost-lightgbm:1.0
|
||||
image: ghcr.io/kubeflow/katib/xgboost-lightgbm:1.0
|
||||
ports:
|
||||
- containerPort: 9991
|
||||
name: xgboostjob-port
|
||||
|
@ -90,7 +90,7 @@ spec:
|
|||
spec:
|
||||
containers:
|
||||
- name: xgboost
|
||||
image: docker.io/kubeflowkatib/xgboost-lightgbm:1.0
|
||||
image: ghcr.io/kubeflow/katib/xgboost-lightgbm:1.0
|
||||
ports:
|
||||
- containerPort: 9991
|
||||
name: xgboostjob-port
|
||||
|
|
|
@ -26,7 +26,7 @@ spec:
|
|||
- katib-db-manager.kubeflow:6789
|
||||
- -path
|
||||
- /katib/mnist.log
|
||||
image: kubeflowkatib/custom-metrics-collector:latest
|
||||
image: ghcr.io/kubeflow/katib/custom-metrics-collector:latest
|
||||
imagePullPolicy: Always
|
||||
name: custom-metrics-logger-and-collector
|
||||
env:
|
||||
|
@ -67,7 +67,7 @@ spec:
|
|||
spec:
|
||||
containers:
|
||||
- name: training-container
|
||||
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest
|
||||
image: ghcr.io/kubeflow/katib/pytorch-mnist-cpu:v0.18.0
|
||||
command:
|
||||
- "python3"
|
||||
- "/opt/pytorch-mnist/mnist.py"
|
||||
|
|
|
@ -52,7 +52,7 @@ spec:
|
|||
spec:
|
||||
containers:
|
||||
- name: training-container
|
||||
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest
|
||||
image: ghcr.io/kubeflow/katib/pytorch-mnist-cpu:v0.18.0
|
||||
command:
|
||||
- "python3"
|
||||
- "/opt/pytorch-mnist/mnist.py"
|
||||
|
|
|
@ -54,7 +54,7 @@ spec:
|
|||
spec:
|
||||
containers:
|
||||
- name: training-container
|
||||
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest
|
||||
image: ghcr.io/kubeflow/katib/pytorch-mnist-cpu:v0.18.0
|
||||
command:
|
||||
- "python3"
|
||||
- "/opt/pytorch-mnist/mnist.py"
|
||||
|
|
|
@ -49,7 +49,7 @@ spec:
|
|||
spec:
|
||||
containers:
|
||||
- name: training-container
|
||||
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest
|
||||
image: ghcr.io/kubeflow/katib/pytorch-mnist-cpu:v0.18.0
|
||||
command:
|
||||
- "python3"
|
||||
- "/opt/pytorch-mnist/mnist.py"
|
||||
|
|
|
@ -60,7 +60,7 @@ spec:
|
|||
spec:
|
||||
containers:
|
||||
- name: training-container
|
||||
image: docker.io/kubeflowkatib/darts-cnn-cifar10-cpu:latest
|
||||
image: ghcr.io/kubeflow/katib/darts-cnn-cifar10-cpu:v0.18.0
|
||||
command:
|
||||
- python3
|
||||
- run_trial.py
|
||||
|
|
|
@ -77,7 +77,7 @@ spec:
|
|||
spec:
|
||||
containers:
|
||||
- name: training-container
|
||||
image: docker.io/kubeflowkatib/darts-cnn-cifar10-gpu:latest
|
||||
image: ghcr.io/kubeflow/katib/darts-cnn-cifar10-gpu:v0.18.0
|
||||
command:
|
||||
- python3
|
||||
- run_trial.py
|
||||
|
|
|
@ -139,7 +139,7 @@ spec:
|
|||
spec:
|
||||
containers:
|
||||
- name: training-container
|
||||
image: docker.io/kubeflowkatib/enas-cnn-cifar10-cpu:latest
|
||||
image: ghcr.io/kubeflow/katib/enas-cnn-cifar10-cpu:v0.18.0
|
||||
command:
|
||||
- python3
|
||||
- -u
|
||||
|
|
|
@ -136,7 +136,7 @@ spec:
|
|||
spec:
|
||||
containers:
|
||||
- name: training-container
|
||||
image: docker.io/kubeflowkatib/enas-cnn-cifar10-gpu:latest
|
||||
image: ghcr.io/kubeflow/katib/enas-cnn-cifar10-gpu:v0.18.0
|
||||
command:
|
||||
- python3
|
||||
- -u
|
||||
|
|
|
@ -43,7 +43,7 @@ spec:
|
|||
spec:
|
||||
containers:
|
||||
- name: training-container
|
||||
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest
|
||||
image: ghcr.io/kubeflow/katib/pytorch-mnist-cpu:v0.18.0
|
||||
command:
|
||||
- "python3"
|
||||
- "/opt/pytorch-mnist/mnist.py"
|
||||
|
|
|
@ -43,7 +43,7 @@ spec:
|
|||
spec:
|
||||
containers:
|
||||
- name: training-container
|
||||
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest
|
||||
image: ghcr.io/kubeflow/katib/pytorch-mnist-cpu:v0.18.0
|
||||
command:
|
||||
- "python3"
|
||||
- "/opt/pytorch-mnist/mnist.py"
|
||||
|
|
|
@ -30,13 +30,13 @@ set this `nop` image to Metrics Collector image.
|
|||
|
||||
For example, if you are using
|
||||
[StdOut](https://www.kubeflow.org/docs/components/katib/experiment/#metrics-collector) Metrics Collector,
|
||||
`nop` image must be equal to `docker.io/kubeflowkatib/file-metrics-collector`.
|
||||
`nop` image must be equal to `ghcr.io/kubeflow/katib/file-metrics-collector`.
|
||||
|
||||
Run the following command to modify the `nop` image:
|
||||
|
||||
```bash
|
||||
kubectl patch deploy tekton-pipelines-controller -n tekton-pipelines --type='json' \
|
||||
-p='[{"op": "replace", "path": "/spec/template/spec/containers/0/args/9", "value": "docker.io/kubeflowkatib/file-metrics-collector"}]'
|
||||
-p='[{"op": "replace", "path": "/spec/template/spec/containers/0/args/9", "value": "ghcr.io/kubeflow/katib/file-metrics-collector"}]'
|
||||
```
|
||||
|
||||
Check that Tekton Pipelines Controller's pod was restarted:
|
||||
|
@ -54,7 +54,7 @@ Verify that `nop` image was modified:
|
|||
```bash
|
||||
$ kubectl get $(kubectl get pods -o name -n tekton-pipelines | grep tekton-pipelines-controller) -n tekton-pipelines -o yaml | grep katib
|
||||
|
||||
- docker.io/kubeflowkatib/file-metrics-collector
|
||||
- ghcr.io/kubeflow/katib/file-metrics-collector
|
||||
```
|
||||
|
||||
### Katib Controller
|
||||
|
|
|
@ -88,7 +88,7 @@ spec:
|
|||
description: Number of epochs
|
||||
steps:
|
||||
- name: model-training
|
||||
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest
|
||||
image: ghcr.io/kubeflow/katib/pytorch-mnist-cpu:v0.18.0
|
||||
command:
|
||||
- "python3"
|
||||
- "/opt/pytorch-mnist/mnist.py"
|
||||
|
|
|
@ -66,7 +66,7 @@ spec:
|
|||
spec:
|
||||
containers:
|
||||
- name: training-container
|
||||
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest
|
||||
image: ghcr.io/kubeflow/katib/pytorch-mnist-cpu:v0.18.0
|
||||
command:
|
||||
- "python3"
|
||||
- "/opt/pytorch-mnist/mnist.py"
|
||||
|
|
|
@ -23,7 +23,7 @@ spec:
|
|||
serviceAccountName: katib-controller
|
||||
containers:
|
||||
- name: katib-controller
|
||||
image: docker.io/kubeflowkatib/katib-controller
|
||||
image: ghcr.io/kubeflow/katib/katib-controller
|
||||
command: ["./katib-controller"]
|
||||
args:
|
||||
- --katib-config=/katib-config.yaml
|
||||
|
|
|
@ -15,7 +15,7 @@ data:
|
|||
spec:
|
||||
containers:
|
||||
- name: training-container
|
||||
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest
|
||||
image: ghcr.io/kubeflow/katib/pytorch-mnist-cpu:v0.18.0
|
||||
command:
|
||||
- "python3"
|
||||
- "/opt/pytorch-mnist/mnist.py"
|
||||
|
@ -33,7 +33,7 @@ data:
|
|||
spec:
|
||||
containers:
|
||||
- name: training-container
|
||||
image: docker.io/kubeflowkatib/enas-cnn-cifar10-cpu:latest
|
||||
image: ghcr.io/kubeflow/katib/enas-cnn-cifar10-cpu:v0.18.0
|
||||
command:
|
||||
- python3
|
||||
- -u
|
||||
|
@ -54,7 +54,7 @@ data:
|
|||
spec:
|
||||
containers:
|
||||
- name: pytorch
|
||||
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest
|
||||
image: ghcr.io/kubeflow/katib/pytorch-mnist-cpu:v0.18.0
|
||||
command:
|
||||
- "python3"
|
||||
- "/opt/pytorch-mnist/mnist.py"
|
||||
|
@ -68,7 +68,7 @@ data:
|
|||
spec:
|
||||
containers:
|
||||
- name: pytorch
|
||||
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest
|
||||
image: ghcr.io/kubeflow/katib/pytorch-mnist-cpu:v0.18.0
|
||||
command:
|
||||
- "python3"
|
||||
- "/opt/pytorch-mnist/mnist.py"
|
||||
|
|
|
@ -20,7 +20,7 @@ spec:
|
|||
spec:
|
||||
containers:
|
||||
- name: katib-db-manager
|
||||
image: docker.io/kubeflowkatib/katib-db-manager
|
||||
image: ghcr.io/kubeflow/katib/katib-db-manager
|
||||
env:
|
||||
- name: DB_NAME
|
||||
value: "mysql"
|
||||
|
|
|
@ -20,7 +20,7 @@ spec:
|
|||
spec:
|
||||
containers:
|
||||
- name: katib-ui
|
||||
image: docker.io/kubeflowkatib/katib-ui
|
||||
image: ghcr.io/kubeflow/katib/katib-ui
|
||||
command:
|
||||
- "./katib-ui"
|
||||
args:
|
||||
|
|
|
@ -13,40 +13,40 @@ init:
|
|||
runtime:
|
||||
metricsCollectors:
|
||||
- kind: StdOut
|
||||
image: docker.io/kubeflowkatib/file-metrics-collector:latest
|
||||
image: ghcr.io/kubeflow/katib/file-metrics-collector:v0.18.0
|
||||
- kind: File
|
||||
image: docker.io/kubeflowkatib/file-metrics-collector:latest
|
||||
image: ghcr.io/kubeflow/katib/file-metrics-collector:v0.18.0
|
||||
- kind: TensorFlowEvent
|
||||
image: docker.io/kubeflowkatib/tfevent-metrics-collector:latest
|
||||
image: ghcr.io/kubeflow/katib/tfevent-metrics-collector:v0.18.0
|
||||
resources:
|
||||
limits:
|
||||
memory: 1Gi
|
||||
suggestions:
|
||||
- algorithmName: random
|
||||
image: docker.io/kubeflowkatib/suggestion-hyperopt:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-hyperopt:v0.18.0
|
||||
- algorithmName: tpe
|
||||
image: docker.io/kubeflowkatib/suggestion-hyperopt:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-hyperopt:v0.18.0
|
||||
- algorithmName: grid
|
||||
image: docker.io/kubeflowkatib/suggestion-optuna:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-optuna:v0.18.0
|
||||
- algorithmName: hyperband
|
||||
image: docker.io/kubeflowkatib/suggestion-hyperband:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-hyperband:v0.18.0
|
||||
- algorithmName: bayesianoptimization
|
||||
image: docker.io/kubeflowkatib/suggestion-skopt:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-skopt:v0.18.0
|
||||
- algorithmName: cmaes
|
||||
image: docker.io/kubeflowkatib/suggestion-goptuna:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-goptuna:v0.18.0
|
||||
- algorithmName: sobol
|
||||
image: docker.io/kubeflowkatib/suggestion-goptuna:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-goptuna:v0.18.0
|
||||
- algorithmName: multivariate-tpe
|
||||
image: docker.io/kubeflowkatib/suggestion-optuna:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-optuna:v0.18.0
|
||||
- algorithmName: enas
|
||||
image: docker.io/kubeflowkatib/suggestion-enas:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-enas:v0.18.0
|
||||
resources:
|
||||
limits:
|
||||
memory: 400Mi
|
||||
- algorithmName: darts
|
||||
image: docker.io/kubeflowkatib/suggestion-darts:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-darts:v0.18.0
|
||||
- algorithmName: pbt
|
||||
image: docker.io/kubeflowkatib/suggestion-pbt:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-pbt:v0.18.0
|
||||
persistentVolumeClaimSpec:
|
||||
accessModes:
|
||||
- ReadWriteMany
|
||||
|
@ -55,4 +55,4 @@ runtime:
|
|||
storage: 5Gi
|
||||
earlyStoppings:
|
||||
- algorithmName: medianstop
|
||||
image: docker.io/kubeflowkatib/earlystopping-medianstop:latest
|
||||
image: ghcr.io/kubeflow/katib/earlystopping-medianstop:v0.18.0
|
||||
|
|
|
@ -20,15 +20,15 @@ resources:
|
|||
# Cert-manager certificate for webhooks
|
||||
- certificate.yaml
|
||||
images:
|
||||
- name: docker.io/kubeflowkatib/katib-controller
|
||||
newName: docker.io/kubeflowkatib/katib-controller
|
||||
newTag: latest
|
||||
- name: docker.io/kubeflowkatib/katib-db-manager
|
||||
newName: docker.io/kubeflowkatib/katib-db-manager
|
||||
newTag: latest
|
||||
- name: docker.io/kubeflowkatib/katib-ui
|
||||
newName: docker.io/kubeflowkatib/katib-ui
|
||||
newTag: latest
|
||||
- name: ghcr.io/kubeflow/katib/katib-controller
|
||||
newName: ghcr.io/kubeflow/katib/katib-controller
|
||||
newTag: v0.18.0
|
||||
- name: ghcr.io/kubeflow/katib/katib-db-manager
|
||||
newName: ghcr.io/kubeflow/katib/katib-db-manager
|
||||
newTag: v0.18.0
|
||||
- name: ghcr.io/kubeflow/katib/katib-ui
|
||||
newName: ghcr.io/kubeflow/katib/katib-ui
|
||||
newTag: v0.18.0
|
||||
|
||||
patchesStrategicMerge:
|
||||
- patches/katib-cert-injection.yaml
|
||||
|
|
|
@ -15,40 +15,40 @@ init:
|
|||
runtime:
|
||||
metricsCollectors:
|
||||
- kind: StdOut
|
||||
image: docker.io/kubeflowkatib/file-metrics-collector:latest
|
||||
image: ghcr.io/kubeflow/katib/file-metrics-collector:v0.18.0
|
||||
- kind: File
|
||||
image: docker.io/kubeflowkatib/file-metrics-collector:latest
|
||||
image: ghcr.io/kubeflow/katib/file-metrics-collector:v0.18.0
|
||||
- kind: TensorFlowEvent
|
||||
image: docker.io/kubeflowkatib/tfevent-metrics-collector:latest
|
||||
image: ghcr.io/kubeflow/katib/tfevent-metrics-collector:v0.18.0
|
||||
resources:
|
||||
limits:
|
||||
memory: 1Gi
|
||||
suggestions:
|
||||
- algorithmName: random
|
||||
image: docker.io/kubeflowkatib/suggestion-hyperopt:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-hyperopt:v0.18.0
|
||||
- algorithmName: tpe
|
||||
image: docker.io/kubeflowkatib/suggestion-hyperopt:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-hyperopt:v0.18.0
|
||||
- algorithmName: grid
|
||||
image: docker.io/kubeflowkatib/suggestion-optuna:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-optuna:v0.18.0
|
||||
- algorithmName: hyperband
|
||||
image: docker.io/kubeflowkatib/suggestion-hyperband:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-hyperband:v0.18.0
|
||||
- algorithmName: bayesianoptimization
|
||||
image: docker.io/kubeflowkatib/suggestion-skopt:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-skopt:v0.18.0
|
||||
- algorithmName: cmaes
|
||||
image: docker.io/kubeflowkatib/suggestion-goptuna:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-goptuna:v0.18.0
|
||||
- algorithmName: sobol
|
||||
image: docker.io/kubeflowkatib/suggestion-goptuna:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-goptuna:v0.18.0
|
||||
- algorithmName: multivariate-tpe
|
||||
image: docker.io/kubeflowkatib/suggestion-optuna:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-optuna:v0.18.0
|
||||
- algorithmName: enas
|
||||
image: docker.io/kubeflowkatib/suggestion-enas:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-enas:v0.18.0
|
||||
resources:
|
||||
limits:
|
||||
memory: 400Mi
|
||||
- algorithmName: darts
|
||||
image: docker.io/kubeflowkatib/suggestion-darts:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-darts:v0.18.0
|
||||
- algorithmName: pbt
|
||||
image: docker.io/kubeflowkatib/suggestion-pbt:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-pbt:v0.18.0
|
||||
persistentVolumeClaimSpec:
|
||||
accessModes:
|
||||
- ReadWriteMany
|
||||
|
@ -57,4 +57,4 @@ runtime:
|
|||
storage: 5Gi
|
||||
earlyStoppings:
|
||||
- algorithmName: medianstop
|
||||
image: docker.io/kubeflowkatib/earlystopping-medianstop:latest
|
||||
image: ghcr.io/kubeflow/katib/earlystopping-medianstop:v0.18.0
|
||||
|
|
|
@ -16,15 +16,15 @@ resources:
|
|||
# Katib webhooks.
|
||||
- ../../components/webhook/
|
||||
images:
|
||||
- name: docker.io/kubeflowkatib/katib-controller
|
||||
newName: docker.io/kubeflowkatib/katib-controller
|
||||
newTag: latest
|
||||
- name: docker.io/kubeflowkatib/katib-db-manager
|
||||
newName: docker.io/kubeflowkatib/katib-db-manager
|
||||
newTag: latest
|
||||
- name: docker.io/kubeflowkatib/katib-ui
|
||||
newName: docker.io/kubeflowkatib/katib-ui
|
||||
newTag: latest
|
||||
- name: ghcr.io/kubeflow/katib/katib-controller
|
||||
newName: ghcr.io/kubeflow/katib/katib-controller
|
||||
newTag: v0.18.0
|
||||
- name: ghcr.io/kubeflow/katib/katib-db-manager
|
||||
newName: ghcr.io/kubeflow/katib/katib-db-manager
|
||||
newTag: v0.18.0
|
||||
- name: ghcr.io/kubeflow/katib/katib-ui
|
||||
newName: ghcr.io/kubeflow/katib/katib-ui
|
||||
newTag: v0.18.0
|
||||
patchesStrategicMerge:
|
||||
- patches/db-manager.yaml
|
||||
# Modify katib-mysql-secrets with parameters for the DB.
|
||||
|
|
|
@ -16,40 +16,40 @@ init:
|
|||
runtime:
|
||||
metricsCollectors:
|
||||
- kind: StdOut
|
||||
image: docker.io/kubeflowkatib/file-metrics-collector:latest
|
||||
image: ghcr.io/kubeflow/katib/file-metrics-collector:v0.18.0
|
||||
- kind: File
|
||||
image: docker.io/kubeflowkatib/file-metrics-collector:latest
|
||||
image: ghcr.io/kubeflow/katib/file-metrics-collector:v0.18.0
|
||||
- kind: TensorFlowEvent
|
||||
image: docker.io/kubeflowkatib/tfevent-metrics-collector:latest
|
||||
image: ghcr.io/kubeflow/katib/tfevent-metrics-collector:v0.18.0
|
||||
resources:
|
||||
limits:
|
||||
memory: 1Gi
|
||||
suggestions:
|
||||
- algorithmName: random
|
||||
image: docker.io/kubeflowkatib/suggestion-hyperopt:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-hyperopt:v0.18.0
|
||||
- algorithmName: tpe
|
||||
image: docker.io/kubeflowkatib/suggestion-hyperopt:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-hyperopt:v0.18.0
|
||||
- algorithmName: grid
|
||||
image: docker.io/kubeflowkatib/suggestion-optuna:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-optuna:v0.18.0
|
||||
- algorithmName: hyperband
|
||||
image: docker.io/kubeflowkatib/suggestion-hyperband:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-hyperband:v0.18.0
|
||||
- algorithmName: bayesianoptimization
|
||||
image: docker.io/kubeflowkatib/suggestion-skopt:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-skopt:v0.18.0
|
||||
- algorithmName: cmaes
|
||||
image: docker.io/kubeflowkatib/suggestion-goptuna:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-goptuna:v0.18.0
|
||||
- algorithmName: sobol
|
||||
image: docker.io/kubeflowkatib/suggestion-goptuna:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-goptuna:v0.18.0
|
||||
- algorithmName: multivariate-tpe
|
||||
image: docker.io/kubeflowkatib/suggestion-optuna:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-optuna:v0.18.0
|
||||
- algorithmName: enas
|
||||
image: docker.io/kubeflowkatib/suggestion-enas:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-enas:v0.18.0
|
||||
resources:
|
||||
limits:
|
||||
memory: 400Mi
|
||||
- algorithmName: darts
|
||||
image: docker.io/kubeflowkatib/suggestion-darts:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-darts:v0.18.0
|
||||
- algorithmName: pbt
|
||||
image: docker.io/kubeflowkatib/suggestion-pbt:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-pbt:v0.18.0
|
||||
persistentVolumeClaimSpec:
|
||||
accessModes:
|
||||
- ReadWriteMany
|
||||
|
@ -58,4 +58,4 @@ runtime:
|
|||
storage: 5Gi
|
||||
earlyStoppings:
|
||||
- algorithmName: medianstop
|
||||
image: docker.io/kubeflowkatib/earlystopping-medianstop:latest
|
||||
image: ghcr.io/kubeflow/katib/earlystopping-medianstop:v0.18.0
|
||||
|
|
|
@ -13,40 +13,40 @@ init:
|
|||
runtime:
|
||||
metricsCollectors:
|
||||
- kind: StdOut
|
||||
image: docker.io/kubeflowkatib/file-metrics-collector:latest
|
||||
image: ghcr.io/kubeflow/katib/file-metrics-collector:v0.18.0
|
||||
- kind: File
|
||||
image: docker.io/kubeflowkatib/file-metrics-collector:latest
|
||||
image: ghcr.io/kubeflow/katib/file-metrics-collector:v0.18.0
|
||||
- kind: TensorFlowEvent
|
||||
image: docker.io/kubeflowkatib/tfevent-metrics-collector:latest
|
||||
image: ghcr.io/kubeflow/katib/tfevent-metrics-collector:v0.18.0
|
||||
resources:
|
||||
limits:
|
||||
memory: 1Gi
|
||||
suggestions:
|
||||
- algorithmName: random
|
||||
image: docker.io/kubeflowkatib/suggestion-hyperopt:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-hyperopt:v0.18.0
|
||||
- algorithmName: tpe
|
||||
image: docker.io/kubeflowkatib/suggestion-hyperopt:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-hyperopt:v0.18.0
|
||||
- algorithmName: grid
|
||||
image: docker.io/kubeflowkatib/suggestion-optuna:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-optuna:v0.18.0
|
||||
- algorithmName: hyperband
|
||||
image: docker.io/kubeflowkatib/suggestion-hyperband:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-hyperband:v0.18.0
|
||||
- algorithmName: bayesianoptimization
|
||||
image: docker.io/kubeflowkatib/suggestion-skopt:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-skopt:v0.18.0
|
||||
- algorithmName: cmaes
|
||||
image: docker.io/kubeflowkatib/suggestion-goptuna:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-goptuna:v0.18.0
|
||||
- algorithmName: sobol
|
||||
image: docker.io/kubeflowkatib/suggestion-goptuna:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-goptuna:v0.18.0
|
||||
- algorithmName: multivariate-tpe
|
||||
image: docker.io/kubeflowkatib/suggestion-optuna:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-optuna:v0.18.0
|
||||
- algorithmName: enas
|
||||
image: docker.io/kubeflowkatib/suggestion-enas:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-enas:v0.18.0
|
||||
resources:
|
||||
limits:
|
||||
memory: 400Mi
|
||||
- algorithmName: darts
|
||||
image: docker.io/kubeflowkatib/suggestion-darts:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-darts:v0.18.0
|
||||
- algorithmName: pbt
|
||||
image: docker.io/kubeflowkatib/suggestion-pbt:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-pbt:v0.18.0
|
||||
persistentVolumeClaimSpec:
|
||||
accessModes:
|
||||
- ReadWriteMany
|
||||
|
@ -55,4 +55,4 @@ runtime:
|
|||
storage: 5Gi
|
||||
earlyStoppings:
|
||||
- algorithmName: medianstop
|
||||
image: docker.io/kubeflowkatib/earlystopping-medianstop:latest
|
||||
image: ghcr.io/kubeflow/katib/earlystopping-medianstop:v0.18.0
|
||||
|
|
|
@ -28,15 +28,15 @@ resources:
|
|||
# Katib webhooks.
|
||||
- ../../components/webhook/
|
||||
images:
|
||||
- name: docker.io/kubeflowkatib/katib-controller
|
||||
newName: docker.io/kubeflowkatib/katib-controller
|
||||
newTag: latest
|
||||
- name: docker.io/kubeflowkatib/katib-db-manager
|
||||
newName: docker.io/kubeflowkatib/katib-db-manager
|
||||
newTag: latest
|
||||
- name: docker.io/kubeflowkatib/katib-ui
|
||||
newName: docker.io/kubeflowkatib/katib-ui
|
||||
newTag: latest
|
||||
- name: ghcr.io/kubeflow/katib/katib-controller
|
||||
newName: ghcr.io/kubeflow/katib/katib-controller
|
||||
newTag: v0.18.0
|
||||
- name: ghcr.io/kubeflow/katib/katib-db-manager
|
||||
newName: ghcr.io/kubeflow/katib/katib-db-manager
|
||||
newTag: v0.18.0
|
||||
- name: ghcr.io/kubeflow/katib/katib-ui
|
||||
newName: ghcr.io/kubeflow/katib/katib-ui
|
||||
newTag: v0.18.0
|
||||
|
||||
patchesJson6902:
|
||||
# Annotate Service to delegate TLS-secret generation to OpenShift service controller
|
||||
|
|
|
@ -15,40 +15,40 @@ init:
|
|||
runtime:
|
||||
metricsCollectors:
|
||||
- kind: StdOut
|
||||
image: docker.io/kubeflowkatib/file-metrics-collector:latest
|
||||
image: ghcr.io/kubeflow/katib/file-metrics-collector:v0.18.0
|
||||
- kind: File
|
||||
image: docker.io/kubeflowkatib/file-metrics-collector:latest
|
||||
image: ghcr.io/kubeflow/katib/file-metrics-collector:v0.18.0
|
||||
- kind: TensorFlowEvent
|
||||
image: docker.io/kubeflowkatib/tfevent-metrics-collector:latest
|
||||
image: ghcr.io/kubeflow/katib/tfevent-metrics-collector:v0.18.0
|
||||
resources:
|
||||
limits:
|
||||
memory: 1Gi
|
||||
suggestions:
|
||||
- algorithmName: random
|
||||
image: docker.io/kubeflowkatib/suggestion-hyperopt:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-hyperopt:v0.18.0
|
||||
- algorithmName: tpe
|
||||
image: docker.io/kubeflowkatib/suggestion-hyperopt:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-hyperopt:v0.18.0
|
||||
- algorithmName: grid
|
||||
image: docker.io/kubeflowkatib/suggestion-optuna:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-optuna:v0.18.0
|
||||
- algorithmName: hyperband
|
||||
image: docker.io/kubeflowkatib/suggestion-hyperband:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-hyperband:v0.18.0
|
||||
- algorithmName: bayesianoptimization
|
||||
image: docker.io/kubeflowkatib/suggestion-skopt:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-skopt:v0.18.0
|
||||
- algorithmName: cmaes
|
||||
image: docker.io/kubeflowkatib/suggestion-goptuna:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-goptuna:v0.18.0
|
||||
- algorithmName: sobol
|
||||
image: docker.io/kubeflowkatib/suggestion-goptuna:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-goptuna:v0.18.0
|
||||
- algorithmName: multivariate-tpe
|
||||
image: docker.io/kubeflowkatib/suggestion-optuna:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-optuna:v0.18.0
|
||||
- algorithmName: enas
|
||||
image: docker.io/kubeflowkatib/suggestion-enas:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-enas:v0.18.0
|
||||
resources:
|
||||
limits:
|
||||
memory: 400Mi
|
||||
- algorithmName: darts
|
||||
image: docker.io/kubeflowkatib/suggestion-darts:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-darts:v0.18.0
|
||||
- algorithmName: pbt
|
||||
image: docker.io/kubeflowkatib/suggestion-pbt:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-pbt:v0.18.0
|
||||
persistentVolumeClaimSpec:
|
||||
accessModes:
|
||||
- ReadWriteMany
|
||||
|
@ -57,4 +57,4 @@ runtime:
|
|||
storage: 5Gi
|
||||
earlyStoppings:
|
||||
- algorithmName: medianstop
|
||||
image: docker.io/kubeflowkatib/earlystopping-medianstop:latest
|
||||
image: ghcr.io/kubeflow/katib/earlystopping-medianstop:v0.18.0
|
||||
|
|
|
@ -18,15 +18,15 @@ resources:
|
|||
# Katib webhooks.
|
||||
- ../../components/webhook/
|
||||
images:
|
||||
- name: docker.io/kubeflowkatib/katib-controller
|
||||
newName: docker.io/kubeflowkatib/katib-controller
|
||||
newTag: latest
|
||||
- name: docker.io/kubeflowkatib/katib-db-manager
|
||||
newName: docker.io/kubeflowkatib/katib-db-manager
|
||||
newTag: latest
|
||||
- name: docker.io/kubeflowkatib/katib-ui
|
||||
newName: docker.io/kubeflowkatib/katib-ui
|
||||
newTag: latest
|
||||
- name: ghcr.io/kubeflow/katib/katib-controller
|
||||
newName: ghcr.io/kubeflow/katib/katib-controller
|
||||
newTag: v0.18.0
|
||||
- name: ghcr.io/kubeflow/katib/katib-db-manager
|
||||
newName: ghcr.io/kubeflow/katib/katib-db-manager
|
||||
newTag: v0.18.0
|
||||
- name: ghcr.io/kubeflow/katib/katib-ui
|
||||
newName: ghcr.io/kubeflow/katib/katib-ui
|
||||
newTag: v0.18.0
|
||||
patchesJson6902:
|
||||
- target:
|
||||
group: apps
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
---
|
||||
apiVersion: config.kubeflow.org/v1beta1
|
||||
kind: KatibConfig
|
||||
init:
|
||||
|
@ -15,40 +14,40 @@ init:
|
|||
runtime:
|
||||
metricsCollectors:
|
||||
- kind: StdOut
|
||||
image: docker.io/kubeflowkatib/file-metrics-collector:latest
|
||||
image: ghcr.io/kubeflow/katib/file-metrics-collector:v0.18.0
|
||||
- kind: File
|
||||
image: docker.io/kubeflowkatib/file-metrics-collector:latest
|
||||
image: ghcr.io/kubeflow/katib/file-metrics-collector:v0.18.0
|
||||
- kind: TensorFlowEvent
|
||||
image: docker.io/kubeflowkatib/tfevent-metrics-collector:latest
|
||||
image: ghcr.io/kubeflow/katib/tfevent-metrics-collector:v0.18.0
|
||||
resources:
|
||||
limits:
|
||||
memory: 1Gi
|
||||
suggestions:
|
||||
- algorithmName: random
|
||||
image: docker.io/kubeflowkatib/suggestion-hyperopt:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-hyperopt:v0.18.0
|
||||
- algorithmName: tpe
|
||||
image: docker.io/kubeflowkatib/suggestion-hyperopt:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-hyperopt:v0.18.0
|
||||
- algorithmName: grid
|
||||
image: docker.io/kubeflowkatib/suggestion-optuna:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-optuna:v0.18.0
|
||||
- algorithmName: hyperband
|
||||
image: docker.io/kubeflowkatib/suggestion-hyperband:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-hyperband:v0.18.0
|
||||
- algorithmName: bayesianoptimization
|
||||
image: docker.io/kubeflowkatib/suggestion-skopt:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-skopt:v0.18.0
|
||||
- algorithmName: cmaes
|
||||
image: docker.io/kubeflowkatib/suggestion-goptuna:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-goptuna:v0.18.0
|
||||
- algorithmName: sobol
|
||||
image: docker.io/kubeflowkatib/suggestion-goptuna:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-goptuna:v0.18.0
|
||||
- algorithmName: multivariate-tpe
|
||||
image: docker.io/kubeflowkatib/suggestion-optuna:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-optuna:v0.18.0
|
||||
- algorithmName: enas
|
||||
image: docker.io/kubeflowkatib/suggestion-enas:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-enas:v0.18.0
|
||||
resources:
|
||||
limits:
|
||||
memory: 400Mi
|
||||
- algorithmName: darts
|
||||
image: docker.io/kubeflowkatib/suggestion-darts:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-darts:v0.18.0
|
||||
- algorithmName: pbt
|
||||
image: docker.io/kubeflowkatib/suggestion-pbt:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-pbt:v0.18.0
|
||||
persistentVolumeClaimSpec:
|
||||
accessModes:
|
||||
- ReadWriteMany
|
||||
|
@ -57,4 +56,4 @@ runtime:
|
|||
storage: 5Gi
|
||||
earlyStoppings:
|
||||
- algorithmName: medianstop
|
||||
image: docker.io/kubeflowkatib/earlystopping-medianstop:latest
|
||||
image: ghcr.io/kubeflow/katib/earlystopping-medianstop:v0.18.0
|
||||
|
|
|
@ -18,15 +18,15 @@ resources:
|
|||
# Katib webhooks.
|
||||
- ../../components/webhook/
|
||||
images:
|
||||
- name: docker.io/kubeflowkatib/katib-controller
|
||||
newName: docker.io/kubeflowkatib/katib-controller
|
||||
newTag: latest
|
||||
- name: docker.io/kubeflowkatib/katib-db-manager
|
||||
newName: docker.io/kubeflowkatib/katib-db-manager
|
||||
newTag: latest
|
||||
- name: docker.io/kubeflowkatib/katib-ui
|
||||
newName: docker.io/kubeflowkatib/katib-ui
|
||||
newTag: latest
|
||||
- name: ghcr.io/kubeflow/katib/katib-controller
|
||||
newName: ghcr.io/kubeflow/katib/katib-controller
|
||||
newTag: v0.18.0
|
||||
- name: ghcr.io/kubeflow/katib/katib-db-manager
|
||||
newName: ghcr.io/kubeflow/katib/katib-db-manager
|
||||
newTag: v0.18.0
|
||||
- name: ghcr.io/kubeflow/katib/katib-ui
|
||||
newName: ghcr.io/kubeflow/katib/katib-ui
|
||||
newTag: v0.18.0
|
||||
configMapGenerator:
|
||||
- name: katib-config
|
||||
behavior: create
|
||||
|
|
|
@ -9,15 +9,15 @@ resources:
|
|||
- ui-virtual-service.yaml
|
||||
- istio-authorizationpolicy.yaml
|
||||
images:
|
||||
- name: docker.io/kubeflowkatib/katib-controller
|
||||
newName: docker.io/kubeflowkatib/katib-controller
|
||||
newTag: latest
|
||||
- name: docker.io/kubeflowkatib/katib-db-manager
|
||||
newName: docker.io/kubeflowkatib/katib-db-manager
|
||||
newTag: latest
|
||||
- name: docker.io/kubeflowkatib/katib-ui
|
||||
newName: docker.io/kubeflowkatib/katib-ui
|
||||
newTag: latest
|
||||
- name: ghcr.io/kubeflow/katib/katib-controller
|
||||
newName: ghcr.io/kubeflow/katib/katib-controller
|
||||
newTag: v0.18.0
|
||||
- name: ghcr.io/kubeflow/katib/katib-db-manager
|
||||
newName: ghcr.io/kubeflow/katib/katib-db-manager
|
||||
newTag: v0.18.0
|
||||
- name: ghcr.io/kubeflow/katib/katib-ui
|
||||
newName: ghcr.io/kubeflow/katib/katib-ui
|
||||
newTag: v0.18.0
|
||||
|
||||
patchesStrategicMerge:
|
||||
- patches/remove-namespace.yaml
|
||||
|
|
|
@ -483,7 +483,7 @@ func newFakeInstance() *experimentsv1beta1.Experiment {
|
|||
Containers: []corev1.Container{
|
||||
{
|
||||
Name: primaryContainer,
|
||||
Image: "docker.io/kubeflowkatib/pytorch-mnist-cpu",
|
||||
Image: "ghcr.io/kubeflow/katib/pytorch-mnist-cpu",
|
||||
Command: []string{
|
||||
"python3",
|
||||
"/opt/pytorch-mnist/mnist.py",
|
||||
|
@ -619,7 +619,7 @@ func newFakeBatchJob() *batchv1.Job {
|
|||
Containers: []corev1.Container{
|
||||
{
|
||||
Name: primaryContainer,
|
||||
Image: "docker.io/kubeflowkatib/pytorch-mnist-cpu",
|
||||
Image: "ghcr.io/kubeflow/katib/pytorch-mnist-cpu",
|
||||
Command: []string{
|
||||
"python3",
|
||||
"/opt/pytorch-mnist/mnist.py",
|
||||
|
|
|
@ -61,7 +61,7 @@ func TestGetRunSpecWithHP(t *testing.T) {
|
|||
Containers: []v1.Container{
|
||||
{
|
||||
Name: "training-container",
|
||||
Image: "docker.io/kubeflowkatib/pytorch-mnist-cpu",
|
||||
Image: "ghcr.io/kubeflow/katib/pytorch-mnist-cpu",
|
||||
Command: []string{
|
||||
"python3",
|
||||
"/opt/pytorch-mnist/mnist.py",
|
||||
|
@ -170,7 +170,7 @@ spec:
|
|||
spec:
|
||||
containers:
|
||||
- name: training-container
|
||||
image: docker.io/kubeflowkatib/pytorch-mnist-cpu
|
||||
image: ghcr.io/kubeflow/katib/pytorch-mnist-cpu
|
||||
command:
|
||||
- "python3"
|
||||
- "/opt/pytorch-mnist/mnist.py"
|
||||
|
@ -186,7 +186,7 @@ spec:
|
|||
spec:
|
||||
containers:
|
||||
- name: training-container
|
||||
image: docker.io/kubeflowkatib/pytorch-mnist-cpu
|
||||
image: ghcr.io/kubeflow/katib/pytorch-mnist-cpu
|
||||
command:
|
||||
- python3
|
||||
- /opt/pytorch-mnist/mnist.py
|
||||
|
@ -207,7 +207,7 @@ spec:
|
|||
spec:
|
||||
containers:
|
||||
- name: training-container
|
||||
image: docker.io/kubeflowkatib/pytorch-mnist-cpu
|
||||
image: ghcr.io/kubeflow/katib/pytorch-mnist-cpu
|
||||
command:
|
||||
- "python3"
|
||||
- "/opt/pytorch-mnist/mnist.py"
|
||||
|
@ -337,7 +337,7 @@ func newFakeInstance() *experimentsv1beta1.Experiment {
|
|||
Containers: []v1.Container{
|
||||
{
|
||||
Name: "training-container",
|
||||
Image: "docker.io/kubeflowkatib/pytorch-mnist-cpu",
|
||||
Image: "ghcr.io/kubeflow/katib/pytorch-mnist-cpu",
|
||||
Command: []string{
|
||||
"python3",
|
||||
"/opt/pytorch-mnist/mnist.py",
|
||||
|
|
|
@ -440,7 +440,7 @@ func newFakeTrialBatchJob(mcType commonv1beta1.CollectorKind, trialName string)
|
|||
Containers: []corev1.Container{
|
||||
{
|
||||
Name: primaryContainer,
|
||||
Image: "docker.io/kubeflowkatib/pytorch-mnist-cpu",
|
||||
Image: "ghcr.io/kubeflow/katib/pytorch-mnist-cpu",
|
||||
Command: []string{
|
||||
"python3",
|
||||
"/opt/pytorch-mnist/mnist.py",
|
||||
|
|
|
@ -30,11 +30,23 @@ import api_pb2
|
|||
import rfc3339
|
||||
import tensorflow as tf
|
||||
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
|
||||
from tensorboard.backend.event_processing.tag_types import TENSORS
|
||||
from tensorboard.backend.event_processing.tag_types import SCALARS, TENSORS
|
||||
|
||||
from pkg.metricscollector.v1beta1.common import const
|
||||
|
||||
|
||||
def _should_consider(tag: str, metric_name: str, tfefile: str) -> bool:
|
||||
tfefile_parent_dir = (
|
||||
os.path.dirname(metric_name)
|
||||
if len(metric_name.split("/")) >= 2
|
||||
else os.path.dirname(tfefile)
|
||||
)
|
||||
basedir_name = os.path.dirname(tfefile)
|
||||
return tag.startswith(metric_name.split("/")[-1]) and basedir_name.endswith(
|
||||
tfefile_parent_dir
|
||||
)
|
||||
|
||||
|
||||
class TFEventFileParser:
|
||||
def __init__(self, metric_names):
|
||||
self.metric_names = metric_names
|
||||
|
@ -47,31 +59,36 @@ class TFEventFileParser:
|
|||
|
||||
def parse_summary(self, tfefile):
|
||||
metric_logs = []
|
||||
event_accumulator = EventAccumulator(tfefile, size_guidance={TENSORS: 0})
|
||||
event_accumulator = EventAccumulator(
|
||||
tfefile, size_guidance={SCALARS: 0, TENSORS: 0}
|
||||
)
|
||||
event_accumulator.Reload()
|
||||
for tag in event_accumulator.Tags()[TENSORS]:
|
||||
tags = event_accumulator.Tags()
|
||||
for tag in tags[TENSORS]:
|
||||
for m in self.metric_names:
|
||||
tfefile_parent_dir = (
|
||||
os.path.dirname(m)
|
||||
if len(m.split("/")) >= 2
|
||||
else os.path.dirname(tfefile)
|
||||
)
|
||||
basedir_name = os.path.dirname(tfefile)
|
||||
if not tag.startswith(m.split("/")[-1]) or not basedir_name.endswith(
|
||||
tfefile_parent_dir
|
||||
):
|
||||
continue
|
||||
|
||||
for tensor in event_accumulator.Tensors(tag):
|
||||
ml = api_pb2.MetricLog(
|
||||
time_stamp=rfc3339.rfc3339(
|
||||
datetime.fromtimestamp(tensor.wall_time)
|
||||
),
|
||||
metric=api_pb2.Metric(
|
||||
name=m, value=str(tf.make_ndarray(tensor.tensor_proto))
|
||||
),
|
||||
)
|
||||
metric_logs.append(ml)
|
||||
if _should_consider(tag, m, tfefile):
|
||||
for tensor in event_accumulator.Tensors(tag):
|
||||
ml = api_pb2.MetricLog(
|
||||
time_stamp=rfc3339.rfc3339(
|
||||
datetime.fromtimestamp(tensor.wall_time)
|
||||
),
|
||||
metric=api_pb2.Metric(
|
||||
name=m, value=str(tf.make_ndarray(tensor.tensor_proto))
|
||||
),
|
||||
)
|
||||
metric_logs.append(ml)
|
||||
# Also support old-style TensorBoard scalar summaries (SCALARS tags).
|
||||
for tag in tags[SCALARS]:
|
||||
for m in self.metric_names:
|
||||
if _should_consider(tag, m, tfefile):
|
||||
for scalar in event_accumulator.Scalars(tag):
|
||||
ml = api_pb2.MetricLog(
|
||||
time_stamp=rfc3339.rfc3339(
|
||||
datetime.fromtimestamp(scalar.wall_time)
|
||||
),
|
||||
metric=api_pb2.Metric(name=m, value=str(scalar.value)),
|
||||
)
|
||||
metric_logs.append(ml)
|
||||
|
||||
return metric_logs
|
||||
|
||||
|
|
|
@ -8,15 +8,15 @@
|
|||
"Templates": [
|
||||
{
|
||||
"Path": "defaultTrialTemplate.yaml",
|
||||
"Yaml": "apiVersion: batch/v1\nkind: Job\nspec:\n template:\n spec:\n containers:\n - name: training-container\n image: docker.io/kubeflowkatib/pytorch-mnist:v1beta1-45c5727\n command:\n - \"python3\"\n - \"/opt/pytorch-mnist/mnist.py\"\n - \"--epochs=1\"\n - \"--lr=${trialParameters.learningRate}\"\n - \"--momentum=${trialParameters.momentum}\"\n restartPolicy: Never"
|
||||
"Yaml": "apiVersion: batch/v1\nkind: Job\nspec:\n template:\n spec:\n containers:\n - name: training-container\n image: ghcr.io/kubeflow/katib/pytorch-mnist:v1beta1-45c5727\n command:\n - \"python3\"\n - \"/opt/pytorch-mnist/mnist.py\"\n - \"--epochs=1\"\n - \"--lr=${trialParameters.learningRate}\"\n - \"--momentum=${trialParameters.momentum}\"\n restartPolicy: Never"
|
||||
},
|
||||
{
|
||||
"Path": "enasCPUTemplate",
|
||||
"Yaml": "apiVersion: batch/v1\nkind: Job\nspec:\n template:\n spec:\n containers:\n - name: training-container\n image: docker.io/kubeflowkatib/enas-cnn-cifar10-cpu:v1beta1-45c5727\n command:\n - python3\n - -u\n - RunTrial.py\n - --num_epochs=1\n - \"--architecture=\\\"${trialParameters.neuralNetworkArchitecture}\\\"\"\n - \"--nn_config=\\\"${trialParameters.neuralNetworkConfig}\\\"\"\n restartPolicy: Never"
|
||||
"Yaml": "apiVersion: batch/v1\nkind: Job\nspec:\n template:\n spec:\n containers:\n - name: training-container\n image: ghcr.io/kubeflow/katib/enas-cnn-cifar10-cpu:v1beta1-45c5727\n command:\n - python3\n - -u\n - RunTrial.py\n - --num_epochs=1\n - \"--architecture=\\\"${trialParameters.neuralNetworkArchitecture}\\\"\"\n - \"--nn_config=\\\"${trialParameters.neuralNetworkConfig}\\\"\"\n restartPolicy: Never"
|
||||
},
|
||||
{
|
||||
"Path": "pytorchJobTemplate",
|
||||
"Yaml": "apiVersion: \"kubeflow.org/v1\"\nkind: PyTorchJob\nspec:\n pytorchReplicaSpecs:\n Master:\n replicas: 1\n restartPolicy: OnFailure\n template:\n spec:\n containers:\n - name: pytorch\n image: docker.io/kubeflowkatib/pytorch-mnist:v1beta1-45c5727\n imagePullPolicy: Always\n command:\n - \"python3\"\n - \"/opt/pytorch-mnist/mnist.py\"\n - \"--epochs=1\"\n - \"--lr=${trialParameters.learningRate}\"\n - \"--momentum=${trialParameters.momentum}\"\n Worker:\n replicas: 2\n restartPolicy: OnFailure\n template:\n spec:\n containers:\n - name: pytorch\n image: docker.io/kubeflowkatib/pytorch-mnist:v1beta1-45c5727\n imagePullPolicy: Always\n command:\n - \"python3\"\n - \"/opt/pytorch-mnist/mnist.py\"\n - \"--epochs=1\"\n - \"--lr=${trialParameters.learningRate}\"\n - \"--momentum=${trialParameters.momentum}\""
|
||||
"Yaml": "apiVersion: \"kubeflow.org/v1\"\nkind: PyTorchJob\nspec:\n pytorchReplicaSpecs:\n Master:\n replicas: 1\n restartPolicy: OnFailure\n template:\n spec:\n containers:\n - name: pytorch\n image: ghcr.io/kubeflow/katib/pytorch-mnist:v1beta1-45c5727\n imagePullPolicy: Always\n command:\n - \"python3\"\n - \"/opt/pytorch-mnist/mnist.py\"\n - \"--epochs=1\"\n - \"--lr=${trialParameters.learningRate}\"\n - \"--momentum=${trialParameters.momentum}\"\n Worker:\n replicas: 2\n restartPolicy: OnFailure\n template:\n spec:\n containers:\n - name: pytorch\n image: ghcr.io/kubeflow/katib/pytorch-mnist:v1beta1-45c5727\n imagePullPolicy: Always\n command:\n - \"python3\"\n - \"/opt/pytorch-mnist/mnist.py\"\n - \"--epochs=1\"\n - \"--lr=${trialParameters.learningRate}\"\n - \"--momentum=${trialParameters.momentum}\""
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
|
@ -407,7 +407,7 @@ init:
|
|||
runtime:
|
||||
suggestions:
|
||||
- algorithmName: random
|
||||
image: docker.io/kubeflowkatib/suggestion-hyperopt:latest
|
||||
image: ghcr.io/kubeflow/katib/suggestion-hyperopt:latest
|
||||
`), os.FileMode(0600)); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
|
|
@ -1433,7 +1433,7 @@ func newFakeBatchJob() *batchv1.Job {
|
|||
Containers: []v1.Container{
|
||||
{
|
||||
Name: "training-container",
|
||||
Image: "docker.io/kubeflowkatib/pytorch-mnist-cpu",
|
||||
Image: "ghcr.io/kubeflow/katib/pytorch-mnist-cpu",
|
||||
Command: []string{
|
||||
"python3",
|
||||
"--epochs=1",
|
||||
|
|
|
@ -68,7 +68,7 @@ fi
|
|||
|
||||
# ------------------ Change image tag ------------------
|
||||
# Change Katib image tags to the new release tag.
|
||||
make update-images OLD_PREFIX="docker.io/kubeflowkatib/" NEW_PREFIX="docker.io/kubeflowkatib/" TAG="${TAG}"
|
||||
make update-images OLD_PREFIX="ghcr.io/kubeflow/katib/" NEW_PREFIX="ghcr.io/kubeflow/katib/" TAG="${TAG}"
|
||||
|
||||
# ------------------ Publish Katib SDK ------------------
|
||||
# Remove first "v" for the SDK version.
|
||||
|
|
|
@ -28,8 +28,8 @@
|
|||
# 5. Katib Trial training containers
|
||||
#
|
||||
# Run ./scripts/v1beta1/update-images.sh <OLD_PREFIX> <NEW_PREFIX> <TAG> to execute it.
|
||||
# For example, to update images from: docker.io/kubeflowkatib/ to: docker.io/private/ registry with tag: v0.12.0, run:
|
||||
# ./scripts/v1beta1/update-images.sh docker.io/kubeflowkatib/ docker.io/private/ v0.12.0
|
||||
# For example, to update images from: ghcr.io/kubeflow/katib/ to: ghcr.io/private/ registry with tag: v0.12.0, run:
|
||||
# ./scripts/v1beta1/update-images.sh ghcr.io/kubeflow/katib/ ghcr.io/private/ v0.12.0
|
||||
|
||||
set -o errexit
|
||||
set -o pipefail
|
||||
|
@ -42,8 +42,8 @@ TAG=${3:-""}
|
|||
if [[ -z "$OLD_PREFIX" || -z "$NEW_PREFIX" || -z "$TAG" ]]; then
|
||||
echo "Image old prefix, new prefix, and tag must be set"
|
||||
echo -e "Usage: $0 <OLD_PREFIX> <NEW_PREFIX> <TAG>\n" 1>&2
|
||||
echo "For example, to update images from: docker.io/kubeflowkatib/ to: docker.io/private/ registry with tag: v0.12.0, run:"
|
||||
echo "$0 docker.io/kubeflowkatib/ docker.io/private/ v0.12.0"
|
||||
echo "For example, to update images from: ghcr.io/kubeflow/katib/ to: ghcr.io/private/ registry with tag: v0.12.0, run:"
|
||||
echo "$0 ghcr.io/kubeflow/katib/ ghcr.io/private/ v0.12.0"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
|
|
@ -100,7 +100,7 @@ def generate_trial_template() -> V1beta1TrialTemplate:
|
|||
"containers": [
|
||||
{
|
||||
"name": "training-container",
|
||||
"image": "docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.14.0",
|
||||
"image": "ghcr.io/kubeflow/katib/pytorch-mnist-cpu:v0.14.0",
|
||||
"command": [
|
||||
"python3",
|
||||
"/opt/pytorch-mnist/mnist.py",
|
||||
|
|
|
@ -56,7 +56,7 @@ if os.path.exists(katib_grpc_svc_file):
|
|||
|
||||
setuptools.setup(
|
||||
name="kubeflow-katib",
|
||||
version="0.17.0",
|
||||
version="0.18.0",
|
||||
author="Kubeflow Authors",
|
||||
author_email="premnath.vel@gmail.com",
|
||||
license="Apache License Version 2.0",
|
||||
|
|
|
@ -32,7 +32,7 @@ kubectl version
|
|||
kubectl cluster-info
|
||||
|
||||
# Update Katib images with the current PULL SHA.
|
||||
make update-images OLD_PREFIX="docker.io/kubeflowkatib/" NEW_PREFIX="${ECR_REGISTRY}/${REPO_NAME}/v1beta1/" TAG="${PULL_PULL_SHA}"
|
||||
make update-images OLD_PREFIX="ghcr.io/kubeflow/katib/" NEW_PREFIX="${ECR_REGISTRY}/${REPO_NAME}/v1beta1/" TAG="${PULL_PULL_SHA}"
|
||||
|
||||
echo -e "\n The Katib will be deployed with the following configs"
|
||||
cat "manifests/v1beta1/installs/katib-standalone/kustomization.yaml"
|
||||
|
|
|
@ -30,7 +30,7 @@ TUNE_API=${2:-false}
|
|||
TRIAL_IMAGES=${3:-""}
|
||||
EXPERIMENTS=${4:-""}
|
||||
|
||||
REGISTRY="docker.io/kubeflowkatib"
|
||||
REGISTRY="ghcr.io/kubeflow/katib"
|
||||
TAG="e2e-test"
|
||||
VERSION="v1beta1"
|
||||
CMD_PREFIX="cmd"
|
||||
|
|
|
@ -30,7 +30,7 @@ TRAINING_OPERATOR_VERSION="v1.9.0"
|
|||
echo "Start to install Katib"
|
||||
|
||||
# Update Katib images with `e2e-test`.
|
||||
cd ../../../../../ && make update-images OLD_PREFIX="docker.io/kubeflowkatib/" NEW_PREFIX="docker.io/kubeflowkatib/" TAG="$E2E_TEST_IMAGE_TAG" && cd -
|
||||
cd ../../../../../ && make update-images OLD_PREFIX="ghcr.io/kubeflow/katib/" NEW_PREFIX="ghcr.io/kubeflow/katib/" TAG="$E2E_TEST_IMAGE_TAG" && cd -
|
||||
|
||||
# First, declare which kustomization file to use; by default, use MySQL.
|
||||
KUSTOMIZATION_FILE="../../../../../manifests/v1beta1/installs/katib-standalone/kustomization.yaml"
|
||||
|
|
|
@ -40,7 +40,7 @@ spec:
|
|||
spec:
|
||||
containers:
|
||||
- name: training-container
|
||||
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest
|
||||
image: ghcr.io/kubeflow/katib/pytorch-mnist-cpu:v0.18.0
|
||||
command:
|
||||
- "python3"
|
||||
- "/opt/pytorch-mnist/mnist.py"
|
||||
|
|
|
@ -40,7 +40,7 @@ spec:
|
|||
spec:
|
||||
containers:
|
||||
- name: training-container
|
||||
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest
|
||||
image: ghcr.io/kubeflow/katib/pytorch-mnist-cpu:v0.18.0
|
||||
command:
|
||||
- "python3"
|
||||
- "/opt/pytorch-mnist/mnist.py"
|
||||
|
|
|
@ -13,10 +13,19 @@
|
|||
# limitations under the License.
|
||||
|
||||
import os
|
||||
import tempfile
|
||||
import unittest
|
||||
|
||||
import tensorboardX
|
||||
import utils
|
||||
|
||||
METRIC_DIR_NAMES = ("train", "test")
|
||||
METRIC_NAMES = ("accuracy", "loss")
|
||||
QUALIFIED_METRIC_NAMES = tuple(
|
||||
f"{dir}/{metric}"
|
||||
for dir in METRIC_DIR_NAMES
|
||||
for metric in METRIC_NAMES
|
||||
)
|
||||
|
||||
class TestTFEventMetricsCollector(unittest.TestCase):
|
||||
def test_parse_file(self):
|
||||
|
@ -24,24 +33,47 @@ class TestTFEventMetricsCollector(unittest.TestCase):
|
|||
current_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
logs_dir = os.path.join(current_dir, "testdata/tfevent-metricscollector/logs")
|
||||
|
||||
# Metric format is "{{dirname}}/{{metrics name}}"
|
||||
metric_names = ["train/accuracy", "train/loss", "test/loss", "test/accuracy"]
|
||||
metric_logs = utils.get_metric_logs(logs_dir, metric_names)
|
||||
|
||||
metric_logs = utils.get_metric_logs(logs_dir, QUALIFIED_METRIC_NAMES)
|
||||
self.assertEqual(20, len(metric_logs))
|
||||
|
||||
for log in metric_logs:
|
||||
actual = log["metric"]["name"]
|
||||
self.assertIn(actual, metric_names)
|
||||
self.assertIn(actual, QUALIFIED_METRIC_NAMES)
|
||||
|
||||
# Metric format is "{{metrics name}}"
|
||||
metric_names = ["accuracy", "loss"]
|
||||
metrics_file_dir = os.path.join(logs_dir, "train")
|
||||
metric_logs = utils.get_metric_logs(metrics_file_dir, metric_names)
|
||||
self.assertEqual(10, len(metric_logs))
|
||||
train_metric_logs = utils.get_metric_logs(
|
||||
os.path.join(logs_dir, "train"), METRIC_NAMES)
|
||||
self.assertEqual(10, len(train_metric_logs))
|
||||
|
||||
for log in train_metric_logs:
|
||||
actual = log["metric"]["name"]
|
||||
self.assertIn(actual, METRIC_NAMES)
|
||||
|
||||
def test_parse_file_with_tensorboardX(self):
|
||||
logs_dir = tempfile.mkdtemp()
|
||||
num_iters = 3
|
||||
|
||||
for dir_name in METRIC_DIR_NAMES:
|
||||
with tensorboardX.SummaryWriter(os.path.join(logs_dir, dir_name)) as writer:
|
||||
for metric_name in METRIC_NAMES:
|
||||
for iter in range(num_iters):
|
||||
writer.add_scalar(metric_name, 0.1, iter)
|
||||
|
||||
|
||||
metric_logs = utils.get_metric_logs(logs_dir, QUALIFIED_METRIC_NAMES)
|
||||
self.assertEqual(num_iters * len(QUALIFIED_METRIC_NAMES), len(metric_logs))
|
||||
|
||||
for log in metric_logs:
|
||||
actual = log["metric"]["name"]
|
||||
self.assertIn(actual, metric_names)
|
||||
self.assertIn(actual, QUALIFIED_METRIC_NAMES)
|
||||
|
||||
train_metric_logs = utils.get_metric_logs(
|
||||
os.path.join(logs_dir, "train"), METRIC_NAMES)
|
||||
self.assertEqual(num_iters * len(METRIC_NAMES), len(train_metric_logs))
|
||||
|
||||
for log in train_metric_logs:
|
||||
actual = log["metric"]["name"]
|
||||
self.assertIn(actual, METRIC_NAMES)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
grpcio-testing==1.64.1
|
||||
pytest==7.2.0
|
||||
tensorboardX==2.6.2.2
|
||||
kubeflow-training[huggingface]==1.9.0
|
||||
|
|
Loading…
Reference in New Issue