Compare commits
165 Commits
Author | SHA1 | Date |
---|---|---|
|
fe7a35dffa | |
|
dd107108b5 | |
|
8e887b8719 | |
|
5d70808886 | |
|
ba2cf7d1ec | |
|
73b8c5c029 | |
|
5cd9592335 | |
|
9421f2322b | |
|
1ebd5e4453 | |
|
c9513c633d | |
|
dd4acfc2ce | |
|
349b571541 | |
|
8e965f11d8 | |
|
6578306795 | |
|
54764d6aa4 | |
|
db4b68bf56 | |
|
1f76bb3bbf | |
|
4884253067 | |
|
9e430ceaf5 | |
|
c18035e104 | |
|
3c88967299 | |
|
338a5c107b | |
|
302020c29e | |
|
7b4652058d | |
|
6389cbadf1 | |
|
c2b5b52762 | |
|
4d2a23073a | |
|
3e736dc54d | |
|
bf034636fa | |
|
741238d712 | |
|
28e466e1b8 | |
|
09523cdfad | |
|
0133983d4a | |
|
40e1e651f2 | |
|
2567939fc9 | |
|
f46cee565b | |
|
d87b41f4b0 | |
|
aa04cf4335 | |
|
59af784f50 | |
|
224aa9d7a0 | |
|
93bee4dc25 | |
|
0cab624e6e | |
|
1412c56059 | |
|
e5482959fc | |
|
3b554aaf64 | |
|
bf4a0b2c41 | |
|
eb8af4d502 | |
|
9889b33599 | |
|
9531372530 | |
|
336396436a | |
|
5212949244 | |
|
fce751a90e | |
|
3e3e0f8cdc | |
|
dc3398dbd4 | |
|
2b41ae62ab | |
|
706a6f2190 | |
|
0bc143ad1a | |
|
719ae382c1 | |
|
867c40a1b0 | |
|
bc09cfd412 | |
|
e251a07cb9 | |
|
a524f33830 | |
|
0e2ba6efc1 | |
|
4964d04208 | |
|
abd1c428c7 | |
|
2f5bda2da9 | |
|
4a385f515a | |
|
e9e6e0c0b1 | |
|
8eb0e86385 | |
|
b6f7cfd9a7 | |
|
51b246fa1c | |
|
6a17c3e35a | |
|
9a8c9d480f | |
|
ffc005855d | |
|
2c57522758 | |
|
a6c37e4f3a | |
|
a8840f26f8 | |
|
a3dd708541 | |
|
206fe1c106 | |
|
7be8b243f6 | |
|
0b4e7c1780 | |
|
33f60c8ac0 | |
|
da3238d310 | |
|
db17214cf0 | |
|
154a85b740 | |
|
f06906d338 | |
|
e83628bb49 | |
|
57ed828702 | |
|
7eb73b6b19 | |
|
8bbac200a8 | |
|
99ba1d58cf | |
|
5a0b7db651 | |
|
f8b8d8d484 | |
|
8a342460f2 | |
|
0d190b9437 | |
|
e6bd3e7b5b | |
|
b02aed8ec6 | |
|
4e4ce6f731 | |
|
7959ffd548 | |
|
d69d04e77e | |
|
2a9ffb169b | |
|
87aec69b9f | |
|
55e283ea1b | |
|
328bc5ca6a | |
|
199e8a41f5 | |
|
a1046db880 | |
|
c4c3eb5243 | |
|
8c9a33a2f7 | |
|
1551ca3975 | |
|
af900202c6 | |
|
ea46a7f2b7 | |
|
2d308b72c3 | |
|
21320b6d57 | |
|
025ce256a4 | |
|
1365e473c5 | |
|
086093fed7 | |
|
7df05c23a5 | |
|
9680b8c73f | |
|
8629a3ce05 | |
|
36150bc3e9 | |
|
250e9d176f | |
|
1df32f2b24 | |
|
0a5c9e5191 | |
|
b3e4715c33 | |
|
ec86f23311 | |
|
51c9350847 | |
|
ae894507c9 | |
|
6f372f6808 | |
|
5837b8a90e | |
|
679e6fb8f8 | |
|
61406a5397 | |
|
a2f3fcae55 | |
|
03a400128a | |
|
fc858d15dd | |
|
8df3c5c838 | |
|
19268062f1 | |
|
10f17fedfb | |
|
d92c168baa | |
|
bf9a1b09e9 | |
|
75ea35cc0f | |
|
4617346302 | |
|
f4c8861c81 | |
|
fbe7c786e9 | |
|
f62e40dbd3 | |
|
700e64e053 | |
|
d2e311fc03 | |
|
cf7fe2e47e | |
|
50a3f4110d | |
|
520a39701b | |
|
e3e0aa24ae | |
|
2843a814a6 | |
|
373f6e6d7d | |
|
ea27fa7fee | |
|
87a0161c2c | |
|
1f5fb48c6e | |
|
b107b2cf4e | |
|
2f3ffc7d23 | |
|
2ae992a111 | |
|
29887c13a0 | |
|
c33494bc8f | |
|
aa772b607d | |
|
1b68744276 | |
|
2ae3eb5adf | |
|
4dbb49f536 | |
|
7f0d9229fa |
|
@ -0,0 +1,4 @@
|
|||
[flake8]
|
||||
max-line-length = 100
|
||||
# E203 is ignored to avoid conflicts with Black's formatting, as it's not PEP 8 compliant
|
||||
extend-ignore = W503, E203
|
|
@ -1,26 +0,0 @@
|
|||
---
|
||||
name: Bug report
|
||||
about: Tell us about a problem you are experiencing
|
||||
---
|
||||
|
||||
/kind bug
|
||||
|
||||
**What steps did you take and what happened:**
|
||||
[A clear and concise description of what the bug is.]
|
||||
|
||||
**What did you expect to happen:**
|
||||
|
||||
**Anything else you would like to add:**
|
||||
[Miscellaneous information that will assist in solving the issue.]
|
||||
|
||||
**Environment:**
|
||||
|
||||
- Katib version (check the Katib controller image version):
|
||||
- Kubernetes version: (`kubectl version`):
|
||||
- OS (`uname -a`):
|
||||
|
||||
---
|
||||
|
||||
<!-- Don't delete this message to encourage users to support your issue! -->
|
||||
|
||||
Impacted by this bug? Give it a 👍 We prioritize the issues with the most 👍
|
|
@ -0,0 +1,50 @@
|
|||
name: Bug Report
|
||||
description: Tell us about a problem you are experiencing with Katib
|
||||
labels: ["kind/bug", "lifecycle/needs-triage"]
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: |
|
||||
Thanks for taking the time to fill out this Katib bug report!
|
||||
- type: textarea
|
||||
id: problem
|
||||
attributes:
|
||||
label: What happened?
|
||||
description: |
|
||||
Please provide as much info as possible. Not doing so may result in your bug not being
|
||||
addressed in a timely manner.
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
id: expected
|
||||
attributes:
|
||||
label: What did you expect to happen?
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
id: environment
|
||||
attributes:
|
||||
label: Environment
|
||||
value: |
|
||||
Kubernetes version:
|
||||
```bash
|
||||
$ kubectl version
|
||||
|
||||
```
|
||||
Katib controller version:
|
||||
```bash
|
||||
$ kubectl get pods -n kubeflow -l katib.kubeflow.org/component=controller -o jsonpath="{.items[*].spec.containers[*].image}"
|
||||
|
||||
```
|
||||
Katib Python SDK version:
|
||||
```bash
|
||||
$ pip show kubeflow-katib
|
||||
|
||||
```
|
||||
validations:
|
||||
required: true
|
||||
- type: input
|
||||
id: votes
|
||||
attributes:
|
||||
label: Impacted by this bug?
|
||||
value: Give it a 👍 We prioritize the issues with most 👍
|
|
@ -1,9 +1,12 @@
|
|||
blank_issues_enabled: false
|
||||
blank_issues_enabled: true
|
||||
|
||||
contact_links:
|
||||
- name: Katib Documentation
|
||||
url: https://www.kubeflow.org/docs/components/katib/
|
||||
about: Much help can be found in the docs
|
||||
- name: AutoML Slack Channel
|
||||
url: https://kubeflow.slack.com/archives/C018PMV53NW
|
||||
about: Ask the Katib community on Slack
|
||||
- name: Kubeflow Katib Slack Channel
|
||||
url: https://www.kubeflow.org/docs/about/community/#kubeflow-slack-channels
|
||||
about: Ask the Katib community on CNCF Slack
|
||||
- name: Kubeflow Katib Community Meeting
|
||||
url: https://bit.ly/2PWVCkV
|
||||
about: Join the Kubeflow AutoML working group meeting
|
||||
|
|
|
@ -1,18 +0,0 @@
|
|||
---
|
||||
name: Feature enhancement request
|
||||
about: Suggest an idea for this project
|
||||
---
|
||||
|
||||
/kind feature
|
||||
|
||||
**Describe the solution you'd like**
|
||||
[A clear and concise description of what you want to happen.]
|
||||
|
||||
**Anything else you would like to add:**
|
||||
[Miscellaneous information that will assist in solving the issue.]
|
||||
|
||||
---
|
||||
|
||||
<!-- Don't delete this message to encourage users to support your issue! -->
|
||||
|
||||
Love this feature? Give it a 👍 We prioritize the features with the most 👍
|
|
@ -0,0 +1,28 @@
|
|||
name: Feature Request
|
||||
description: Suggest an idea for Katib
|
||||
labels: ["kind/feature", "lifecycle/needs-triage"]
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: |
|
||||
Thanks for taking the time to fill out this Katib feature request!
|
||||
- type: textarea
|
||||
id: feature
|
||||
attributes:
|
||||
label: What you would like to be added?
|
||||
description: |
|
||||
A clear and concise description of what you want to add to Katib.
|
||||
Please consider to write Katib enhancement proposal if it is a large feature request.
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
id: rationale
|
||||
attributes:
|
||||
label: Why is this needed?
|
||||
validations:
|
||||
required: true
|
||||
- type: input
|
||||
id: votes
|
||||
attributes:
|
||||
label: Love this feature?
|
||||
value: Give it a 👍 We prioritize the features with most 👍
|
|
@ -1,6 +1,6 @@
|
|||
<!-- Thanks for sending a pull request! Here are some tips for you:
|
||||
1. If this is your first time, check our contributor guidelines https://www.kubeflow.org/docs/about/contributing
|
||||
2. To know more about Katib components, check developer guide https://github.com/kubeflow/katib/blob/master/docs/developer-guide.md
|
||||
2. To know more about Katib components, check developer guide https://github.com/kubeflow/katib/blob/master/CONTRIBUTING.md
|
||||
3. If you want *faster* PR reviews, check how: https://git.k8s.io/community/contributors/guide/pull-requests.md#best-practices-for-faster-reviews
|
||||
-->
|
||||
|
||||
|
|
|
@ -1,20 +0,0 @@
|
|||
# Configuration for stale probot https://probot.github.io/apps/stale/
|
||||
|
||||
# Number of days of inactivity before an issue becomes stale
|
||||
daysUntilStale: 90
|
||||
# Number of days of inactivity before a stale issue is closed
|
||||
daysUntilClose: 20
|
||||
# Issues with these labels will never be considered stale
|
||||
exemptLabels:
|
||||
- lifecycle/frozen
|
||||
# Label to use when marking an issue as stale
|
||||
staleLabel: lifecycle/stale
|
||||
# Comment to post when marking an issue as stale. Set to `false` to disable
|
||||
markComment: >
|
||||
This issue has been automatically marked as stale because it has not had
|
||||
recent activity. It will be closed if no further activity occurs. Thank you
|
||||
for your contributions.
|
||||
# Comment to post when closing a stale issue. Set to `false` to disable
|
||||
closeComment: >
|
||||
This issue has been automatically closed because it has not had recent
|
||||
activity. Please comment "/reopen" to reopen it.
|
|
@ -1,5 +1,5 @@
|
|||
# Reusable workflows for publishing Katib images.
|
||||
name: Build And Publish Images
|
||||
name: Build and Publish Images
|
||||
|
||||
on:
|
||||
workflow_call:
|
||||
|
@ -21,31 +21,50 @@ on:
|
|||
|
||||
jobs:
|
||||
build-and-publish:
|
||||
name: Publish Image
|
||||
runs-on: ubuntu-latest
|
||||
name: Build and Publish Images
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Docker Login
|
||||
# Trigger workflow only for kubeflow/katib repository with specific branch (master, release-.*) or tag (v.*).
|
||||
if: >-
|
||||
github.repository == 'kubeflow/katib' &&
|
||||
(github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release-') || startsWith(github.ref, 'refs/tags/v'))
|
||||
uses: docker/login-action@v2
|
||||
- name: Set Publish Condition
|
||||
id: publish-condition
|
||||
shell: bash
|
||||
run: |
|
||||
if [[ "${{ github.repository }}" == 'kubeflow/katib' && \
|
||||
( "${{ github.ref }}" == 'refs/heads/master' || \
|
||||
"${{ github.ref }}" =~ ^refs/heads/release- || \
|
||||
"${{ github.ref }}" =~ ^refs/tags/v ) ]]; then
|
||||
echo "should_publish=true" >> $GITHUB_OUTPUT
|
||||
else
|
||||
echo "should_publish=false" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
- name: GHCR Login
|
||||
if: steps.publish-condition.outputs.should_publish == 'true'
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
registry: ghcr.io
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: DockerHub Login
|
||||
if: steps.publish-condition.outputs.should_publish == 'true'
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
registry: docker.io
|
||||
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
||||
|
||||
- name: Publish Component ${{ inputs.component-name }}
|
||||
# Trigger workflow only for kubeflow/katib repository with specific branch (master, release-.*) or tag (v.*).
|
||||
if: >-
|
||||
github.repository == 'kubeflow/katib' &&
|
||||
(github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release-') || startsWith(github.ref, 'refs/tags/v'))
|
||||
if: steps.publish-condition.outputs.should_publish == 'true'
|
||||
id: publish
|
||||
uses: ./.github/workflows/template-publish-image
|
||||
with:
|
||||
image: docker.io/kubeflowkatib/${{ inputs.component-name }}
|
||||
image: |
|
||||
ghcr.io/kubeflow/katib/${{ inputs.component-name }}
|
||||
docker.io/kubeflowkatib/${{ inputs.component-name }}
|
||||
dockerfile: ${{ inputs.dockerfile }}
|
||||
platforms: ${{ inputs.platforms }}
|
||||
push: true
|
||||
|
@ -54,7 +73,9 @@ jobs:
|
|||
if: steps.publish.outcome == 'skipped'
|
||||
uses: ./.github/workflows/template-publish-image
|
||||
with:
|
||||
image: docker.io/kubeflowkatib/${{ inputs.component-name }}
|
||||
image: |
|
||||
ghcr.io/kubeflow/katib/${{ inputs.component-name }}
|
||||
docker.io/kubeflowkatib/${{ inputs.component-name }}
|
||||
dockerfile: ${{ inputs.dockerfile }}
|
||||
platforms: ${{ inputs.platforms }}
|
||||
push: false
|
||||
|
|
|
@ -11,17 +11,17 @@ concurrency:
|
|||
|
||||
jobs:
|
||||
e2e:
|
||||
runs-on: ubuntu-20.04
|
||||
runs-on: ubuntu-22.04
|
||||
timeout-minutes: 120
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Setup Test Env
|
||||
uses: ./.github/workflows/template-setup-e2e-test
|
||||
with:
|
||||
kubernetes-version: ${{ matrix.kubernetes-version }}
|
||||
python-version: "3.7"
|
||||
python-version: "3.11"
|
||||
|
||||
- name: Run e2e test with ${{ matrix.experiments }} experiments
|
||||
uses: ./.github/workflows/template-e2e-test
|
||||
|
@ -33,6 +33,6 @@ jobs:
|
|||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
kubernetes-version: ["v1.25.12", "v1.26.6", "v1.27.3"]
|
||||
kubernetes-version: ["v1.29.2", "v1.30.7", "v1.31.3"]
|
||||
# Comma Delimited
|
||||
experiments: ["darts-cpu"]
|
||||
|
|
|
@ -11,11 +11,11 @@ concurrency:
|
|||
|
||||
jobs:
|
||||
e2e:
|
||||
runs-on: ubuntu-20.04
|
||||
runs-on: ubuntu-22.04
|
||||
timeout-minutes: 120
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Setup Test Env
|
||||
uses: ./.github/workflows/template-setup-e2e-test
|
||||
|
@ -33,6 +33,6 @@ jobs:
|
|||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
kubernetes-version: ["v1.25.12", "v1.26.6", "v1.27.3"]
|
||||
kubernetes-version: ["v1.29.2", "v1.30.7", "v1.31.3"]
|
||||
# Comma Delimited
|
||||
experiments: ["enas-cpu"]
|
||||
|
|
|
@ -1,42 +0,0 @@
|
|||
name: E2E Test with mxnet-mnist
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
paths-ignore:
|
||||
- "pkg/ui/v1beta1/frontend/**"
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
e2e:
|
||||
runs-on: ubuntu-20.04
|
||||
timeout-minutes: 120
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Setup Test Env
|
||||
uses: ./.github/workflows/template-setup-e2e-test
|
||||
with:
|
||||
kubernetes-version: ${{ matrix.kubernetes-version }}
|
||||
python-version: "3.9"
|
||||
|
||||
- name: Run e2e test with ${{ matrix.experiments }} experiments
|
||||
uses: ./.github/workflows/template-e2e-test
|
||||
with:
|
||||
experiments: ${{ matrix.experiments }}
|
||||
# Comma Delimited
|
||||
trial-images: mxnet-mnist
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
kubernetes-version: ["v1.25.12", "v1.26.6", "v1.27.3"]
|
||||
# Comma Delimited
|
||||
experiments:
|
||||
# suggestion-hyperopt
|
||||
- "long-running-resume,from-volume-resume,median-stop"
|
||||
# others
|
||||
- "grid,bayesian-optimization,tpe,multivariate-tpe,cma-es,hyperband"
|
|
@ -11,11 +11,11 @@ concurrency:
|
|||
|
||||
jobs:
|
||||
e2e:
|
||||
runs-on: ubuntu-20.04
|
||||
runs-on: ubuntu-22.04
|
||||
timeout-minutes: 120
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Setup Test Env
|
||||
uses: ./.github/workflows/template-setup-e2e-test
|
||||
|
@ -34,8 +34,13 @@ jobs:
|
|||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
kubernetes-version: ["v1.25.12", "v1.26.6", "v1.27.3"]
|
||||
kubernetes-version: ["v1.29.2", "v1.30.7", "v1.31.3"]
|
||||
# Comma Delimited
|
||||
experiments:
|
||||
# suggestion-hyperopt
|
||||
- "long-running-resume,from-volume-resume,median-stop"
|
||||
# others
|
||||
- "grid,bayesian-optimization,tpe,multivariate-tpe,cma-es,hyperband"
|
||||
- "hyperopt-distribution,optuna-distribution"
|
||||
- "file-metrics-collector,pytorchjob-mnist"
|
||||
- "median-stop-with-json-format,file-metrics-collector-with-json-format"
|
||||
|
|
|
@ -11,11 +11,11 @@ concurrency:
|
|||
|
||||
jobs:
|
||||
e2e:
|
||||
runs-on: ubuntu-20.04
|
||||
runs-on: ubuntu-22.04
|
||||
timeout-minutes: 120
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Setup Test Env
|
||||
uses: ./.github/workflows/template-setup-e2e-test
|
||||
|
@ -33,6 +33,6 @@ jobs:
|
|||
fail-fast: false
|
||||
matrix:
|
||||
# Detail: https://hub.docker.com/r/kindest/node
|
||||
kubernetes-version: ["v1.25.12", "v1.26.6", "v1.27.3"]
|
||||
kubernetes-version: ["v1.29.2", "v1.30.7", "v1.31.3"]
|
||||
# Comma Delimited
|
||||
experiments: ["simple-pbt"]
|
||||
|
|
|
@ -11,11 +11,11 @@ concurrency:
|
|||
|
||||
jobs:
|
||||
e2e:
|
||||
runs-on: ubuntu-20.04
|
||||
runs-on: ubuntu-22.04
|
||||
timeout-minutes: 120
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Setup Test Env
|
||||
uses: ./.github/workflows/template-setup-e2e-test
|
||||
|
@ -33,6 +33,6 @@ jobs:
|
|||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
kubernetes-version: ["v1.25.12", "v1.26.6", "v1.27.3"]
|
||||
kubernetes-version: ["v1.29.2", "v1.30.7", "v1.31.3"]
|
||||
# Comma Delimited
|
||||
experiments: ["tfjob-mnist-with-summaries"]
|
||||
|
|
|
@ -0,0 +1,40 @@
|
|||
name: E2E Test with tune API
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
paths-ignore:
|
||||
- "pkg/ui/v1beta1/frontend/**"
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
e2e:
|
||||
runs-on: ubuntu-22.04
|
||||
timeout-minutes: 120
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Setup Test Env
|
||||
uses: ./.github/workflows/template-setup-e2e-test
|
||||
with:
|
||||
kubernetes-version: ${{ matrix.kubernetes-version }}
|
||||
|
||||
- name: Install Katib SDK with extra requires
|
||||
shell: bash
|
||||
run: |
|
||||
pip install --prefer-binary -e 'sdk/python/v1beta1[huggingface]'
|
||||
|
||||
- name: Run e2e test with tune API
|
||||
uses: ./.github/workflows/template-e2e-test
|
||||
with:
|
||||
tune-api: true
|
||||
training-operator: true
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
# Detail: https://hub.docker.com/r/kindest/node
|
||||
kubernetes-version: ["v1.29.2", "v1.30.7", "v1.31.3"]
|
|
@ -9,11 +9,11 @@ concurrency:
|
|||
|
||||
jobs:
|
||||
e2e:
|
||||
runs-on: ubuntu-20.04
|
||||
runs-on: ubuntu-22.04
|
||||
timeout-minutes: 120
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Setup Test Env
|
||||
uses: ./.github/workflows/template-setup-e2e-test
|
||||
|
@ -25,11 +25,11 @@ jobs:
|
|||
with:
|
||||
experiments: random
|
||||
# Comma Delimited
|
||||
trial-images: mxnet-mnist
|
||||
trial-images: pytorch-mnist-cpu
|
||||
katib-ui: true
|
||||
database-type: postgres
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
kubernetes-version: ["v1.25.12", "v1.26.6", "v1.27.3"]
|
||||
kubernetes-version: ["v1.29.2", "v1.30.7", "v1.31.3"]
|
||||
|
|
|
@ -0,0 +1,49 @@
|
|||
name: Free-Up Disk Space
|
||||
description: Remove Non-Essential Tools And Move Docker Data Directory to /mnt/docker
|
||||
|
||||
runs:
|
||||
using: composite
|
||||
steps:
|
||||
# This step is a Workaround to avoid the "No space left on device" error.
|
||||
# ref: https://github.com/actions/runner-images/issues/2840
|
||||
- name: Remove unnecessary files
|
||||
shell: bash
|
||||
run: |
|
||||
echo "Disk usage before cleanup:"
|
||||
df -hT
|
||||
|
||||
sudo rm -rf /usr/share/dotnet
|
||||
sudo rm -rf /opt/ghc
|
||||
sudo rm -rf /usr/local/share/boost
|
||||
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
|
||||
sudo rm -rf /usr/local/lib/android
|
||||
sudo rm -rf /usr/local/share/powershell
|
||||
sudo rm -rf /usr/share/swift
|
||||
|
||||
echo "Disk usage after cleanup:"
|
||||
df -hT
|
||||
|
||||
- name: Prune docker images
|
||||
shell: bash
|
||||
run: |
|
||||
docker image prune -a -f
|
||||
docker system df
|
||||
df -hT
|
||||
|
||||
- name: Move docker data directory
|
||||
shell: bash
|
||||
run: |
|
||||
echo "Stopping docker service ..."
|
||||
sudo systemctl stop docker
|
||||
DOCKER_DEFAULT_ROOT_DIR=/var/lib/docker
|
||||
DOCKER_ROOT_DIR=/mnt/docker
|
||||
echo "Moving ${DOCKER_DEFAULT_ROOT_DIR} -> ${DOCKER_ROOT_DIR}"
|
||||
sudo mv ${DOCKER_DEFAULT_ROOT_DIR} ${DOCKER_ROOT_DIR}
|
||||
echo "Creating symlink ${DOCKER_DEFAULT_ROOT_DIR} -> ${DOCKER_ROOT_DIR}"
|
||||
sudo ln -s ${DOCKER_ROOT_DIR} ${DOCKER_DEFAULT_ROOT_DIR}
|
||||
echo "$(sudo ls -l ${DOCKER_DEFAULT_ROOT_DIR})"
|
||||
echo "Starting docker service ..."
|
||||
sudo systemctl daemon-reload
|
||||
sudo systemctl start docker
|
||||
echo "Docker service status:"
|
||||
sudo systemctl --no-pager -l -o short status docker
|
|
@ -22,9 +22,6 @@ jobs:
|
|||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- trial-name: mxnet-mnist
|
||||
platforms: linux/amd64,linux/arm64
|
||||
dockerfile: examples/v1beta1/trial-images/mxnet-mnist/Dockerfile
|
||||
- trial-name: pytorch-mnist-cpu
|
||||
platforms: linux/amd64,linux/arm64
|
||||
dockerfile: examples/v1beta1/trial-images/pytorch-mnist/Dockerfile.cpu
|
||||
|
|
|
@ -0,0 +1,42 @@
|
|||
# This workflow warns and then closes issues and PRs that have had no activity for a specified amount of time.
|
||||
#
|
||||
# You can adjust the behavior by modifying this file.
|
||||
# For more information, see:
|
||||
# https://github.com/actions/stale
|
||||
name: Mark stale issues and pull requests
|
||||
|
||||
on:
|
||||
schedule:
|
||||
- cron: "0 */5 * * *"
|
||||
|
||||
jobs:
|
||||
stale:
|
||||
runs-on: ubuntu-22.04
|
||||
permissions:
|
||||
issues: write
|
||||
pull-requests: write
|
||||
|
||||
steps:
|
||||
- uses: actions/stale@v5
|
||||
with:
|
||||
repo-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
days-before-stale: 90
|
||||
days-before-close: 20
|
||||
stale-issue-message: >
|
||||
This issue has been automatically marked as stale because it has not had
|
||||
recent activity. It will be closed if no further activity occurs. Thank you
|
||||
for your contributions.
|
||||
close-issue-message: >
|
||||
This issue has been automatically closed because it has not had recent
|
||||
activity. Please comment "/reopen" to reopen it.
|
||||
stale-issue-label: lifecycle/stale
|
||||
exempt-issue-labels: lifecycle/frozen
|
||||
stale-pr-message: >
|
||||
This pull request has been automatically marked as stale because it has not had
|
||||
recent activity. It will be closed if no further activity occurs. Thank you
|
||||
for your contributions.
|
||||
close-pr-message: >
|
||||
This pull request has been automatically closed because it has not had recent
|
||||
activity. Please comment "/reopen" to reopen it.
|
||||
stale-pr-label: lifecycle/stale
|
||||
exempt-pr-labels: lifecycle/frozen
|
|
@ -4,15 +4,17 @@ description: Run e2e test using the minikube cluster
|
|||
|
||||
inputs:
|
||||
experiments:
|
||||
required: true
|
||||
required: false
|
||||
description: comma delimited experiment name
|
||||
default: ""
|
||||
training-operator:
|
||||
required: false
|
||||
description: whether to deploy training-operator or not
|
||||
default: false
|
||||
trial-images:
|
||||
required: true
|
||||
required: false
|
||||
description: comma delimited trial image name
|
||||
default: ""
|
||||
katib-ui:
|
||||
required: true
|
||||
description: whether to deploy katib-ui or not
|
||||
|
@ -21,13 +23,17 @@ inputs:
|
|||
required: false
|
||||
description: mysql or postgres
|
||||
default: mysql
|
||||
tune-api:
|
||||
required: true
|
||||
description: whether to execute tune-api test or not
|
||||
default: false
|
||||
|
||||
runs:
|
||||
using: composite
|
||||
steps:
|
||||
- name: Setup Minikube Cluster
|
||||
shell: bash
|
||||
run: ./test/e2e/v1beta1/scripts/gh-actions/setup-minikube.sh ${{ inputs.katib-ui }} ${{ inputs.trial-images }} ${{ inputs.experiments }}
|
||||
run: ./test/e2e/v1beta1/scripts/gh-actions/setup-minikube.sh ${{ inputs.katib-ui }} ${{ inputs.tune-api }} ${{ inputs.trial-images }} ${{ inputs.experiments }}
|
||||
|
||||
- name: Setup Katib
|
||||
shell: bash
|
||||
|
@ -35,4 +41,9 @@ runs:
|
|||
|
||||
- name: Run E2E Experiment
|
||||
shell: bash
|
||||
run: ./test/e2e/v1beta1/scripts/gh-actions/run-e2e-experiment.sh ${{ inputs.experiments }}
|
||||
run: |
|
||||
if "${{ inputs.tune-api }}"; then
|
||||
./test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.sh
|
||||
else
|
||||
./test/e2e/v1beta1/scripts/gh-actions/run-e2e-experiment.sh ${{ inputs.experiments }}
|
||||
fi
|
||||
|
|
|
@ -36,14 +36,14 @@ runs:
|
|||
df -h
|
||||
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v2
|
||||
uses: docker/setup-qemu-action@v3
|
||||
|
||||
- name: Set Up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v2
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Add Docker Tags
|
||||
id: meta
|
||||
uses: docker/metadata-action@v4
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: ${{ inputs.image }}
|
||||
tags: |
|
||||
|
@ -51,12 +51,12 @@ runs:
|
|||
type=sha,prefix=v1beta1-
|
||||
|
||||
- name: Build and Push
|
||||
uses: docker/build-push-action@v3
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
file: ${{ inputs.dockerfile }}
|
||||
push: ${{ inputs.push }}
|
||||
tags: ${{ steps.meta.outputs.tags }}
|
||||
cache-from: type=gha
|
||||
cache-to: type=gha,mode=max
|
||||
cache-to: type=gha,mode=max,ignore-error=true
|
||||
platforms: ${{ inputs.platforms }}
|
||||
|
|
|
@ -17,40 +17,29 @@ runs:
|
|||
steps:
|
||||
# This step is a Workaround to avoid the "No space left on device" error.
|
||||
# ref: https://github.com/actions/runner-images/issues/2840
|
||||
- name: Remove unnecessary files
|
||||
shell: bash
|
||||
run: |
|
||||
sudo rm -rf /usr/share/dotnet
|
||||
sudo rm -rf /opt/ghc
|
||||
sudo rm -rf "/usr/local/share/boost"
|
||||
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
|
||||
sudo rm -rf /usr/local/lib/android
|
||||
sudo rm -rf /usr/local/share/powershell
|
||||
sudo rm -rf /usr/share/swift
|
||||
|
||||
echo "Disk usage after cleanup:"
|
||||
df -h
|
||||
- name: Free-Up Disk Space
|
||||
uses: ./.github/workflows/free-up-disk-space
|
||||
|
||||
- name: Setup kubectl
|
||||
uses: azure/setup-kubectl@v3
|
||||
uses: azure/setup-kubectl@v4
|
||||
with:
|
||||
version: ${{ inputs.kubernetes-version }}
|
||||
|
||||
- name: Setup Minikube Cluster
|
||||
uses: medyagh/setup-minikube@v0.0.14
|
||||
uses: medyagh/setup-minikube@v0.0.18
|
||||
with:
|
||||
network-plugin: cni
|
||||
cni: flannel
|
||||
driver: none
|
||||
kubernetes-version: ${{ inputs.kubernetes-version }}
|
||||
minikube-version: 1.31.1
|
||||
minikube-version: 1.34.0
|
||||
start-args: --wait-timeout=120s
|
||||
|
||||
- name: Setup Docker Buildx
|
||||
uses: docker/setup-buildx-action@v2
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v4
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: ${{ inputs.python-version }}
|
||||
|
||||
|
|
|
@ -12,7 +12,7 @@ concurrency:
|
|||
jobs:
|
||||
generatetests:
|
||||
name: Generate And Format Test
|
||||
runs-on: ubuntu-latest
|
||||
runs-on: ubuntu-22.04
|
||||
env:
|
||||
GOPATH: ${{ github.workspace }}/go
|
||||
defaults:
|
||||
|
@ -20,21 +20,22 @@ jobs:
|
|||
working-directory: ${{ env.GOPATH }}/src/github.com/kubeflow/katib
|
||||
steps:
|
||||
- name: Check out code
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
path: ${{ env.GOPATH }}/src/github.com/kubeflow/katib
|
||||
|
||||
- name: Setup Go
|
||||
uses: actions/setup-go@v3
|
||||
uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version-file: ${{ env.GOPATH }}/src/github.com/kubeflow/katib/go.mod
|
||||
cache-dependency-path: ${{ env.GOPATH }}/src/github.com/kubeflow/katib/go.sum
|
||||
|
||||
- name: Check Go Modules, Generated Go/Python codes, and Format
|
||||
run: make check
|
||||
|
||||
unittests:
|
||||
name: Unit Test
|
||||
runs-on: ubuntu-latest
|
||||
runs-on: ubuntu-22.04
|
||||
env:
|
||||
GOPATH: ${{ github.workspace }}/go
|
||||
defaults:
|
||||
|
@ -42,14 +43,15 @@ jobs:
|
|||
working-directory: ${{ env.GOPATH }}/src/github.com/kubeflow/katib
|
||||
steps:
|
||||
- name: Check out code
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
path: ${{ env.GOPATH }}/src/github.com/kubeflow/katib
|
||||
|
||||
- name: Setup Go
|
||||
uses: actions/setup-go@v3
|
||||
uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version-file: ${{ env.GOPATH }}/src/github.com/kubeflow/katib/go.mod
|
||||
cache-dependency-path: ${{ env.GOPATH }}/src/github.com/kubeflow/katib/go.sum
|
||||
|
||||
- name: Run Go test
|
||||
run: go mod download && make test ENVTEST_K8S_VERSION=${{ matrix.kubernetes-version }}
|
||||
|
@ -59,9 +61,19 @@ jobs:
|
|||
with:
|
||||
path-to-profile: coverage.out
|
||||
working-directory: ${{ env.GOPATH }}/src/github.com/kubeflow/katib
|
||||
parallel: true
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
# Detail: `setup-envtest list`
|
||||
kubernetes-version: ["1.25.0", "1.26.1", "1.27.1"]
|
||||
kubernetes-version: ["1.29.3", "1.30.0", "1.31.0"]
|
||||
|
||||
# notifies that all test jobs are finished.
|
||||
finish:
|
||||
needs: unittests
|
||||
runs-on: ubuntu-22.04
|
||||
steps:
|
||||
- uses: shogo82148/actions-goveralls@v1
|
||||
with:
|
||||
parallel-finished: true
|
||||
|
|
|
@ -12,19 +12,19 @@ concurrency:
|
|||
jobs:
|
||||
lint:
|
||||
name: Lint
|
||||
runs-on: ubuntu-latest
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
steps:
|
||||
- name: Check out code
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v4
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: 3.9
|
||||
|
||||
- name: Check YAML files
|
||||
run: make yamllint
|
||||
|
||||
- name: Check shell scripts
|
||||
run: make shellcheck
|
||||
|
||||
- name: Run pre-commit
|
||||
uses: pre-commit/action@v3.0.1
|
||||
|
|
|
@ -12,16 +12,16 @@ concurrency:
|
|||
jobs:
|
||||
test:
|
||||
name: Code format and lint
|
||||
runs-on: ubuntu-latest
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
steps:
|
||||
- name: Check out code
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Setup Node
|
||||
uses: actions/setup-node@v3
|
||||
uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: 12.18.1
|
||||
node-version: 16.20.2
|
||||
|
||||
- name: Format katib code
|
||||
run: |
|
||||
|
@ -35,16 +35,16 @@ jobs:
|
|||
|
||||
frontend-unit-tests:
|
||||
name: Frontend Unit Tests
|
||||
runs-on: ubuntu-latest
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
steps:
|
||||
- name: Check out code
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Setup Node
|
||||
uses: actions/setup-node@v3
|
||||
uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: 12.18.1
|
||||
node-version: 16.20.2
|
||||
|
||||
- name: Fetch Kubeflow and install common code dependencies
|
||||
run: |
|
||||
|
@ -73,11 +73,11 @@ jobs:
|
|||
runs-on: ubuntu-22.04
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
- name: Setup node version to 12
|
||||
uses: actions/setup-node@v3
|
||||
uses: actions/checkout@v4
|
||||
- name: Setup node version to 16
|
||||
uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: 12
|
||||
node-version: 16
|
||||
|
||||
- name: Fetch Kubeflow and install common code dependencies
|
||||
run: |
|
||||
|
|
|
@ -12,16 +12,36 @@ concurrency:
|
|||
jobs:
|
||||
test:
|
||||
name: Test
|
||||
runs-on: ubuntu-latest
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
steps:
|
||||
- name: Check out code
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v4
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: 3.11
|
||||
|
||||
- name: Run Python test
|
||||
run: make pytest
|
||||
|
||||
# The skopt service doesn't work appropriately with Python 3.11.
|
||||
# So, we need to run the test with Python 3.9.
|
||||
# TODO (tenzen-y): Once we stop supporting skopt, we can remove this test.
|
||||
# REF: https://github.com/kubeflow/katib/issues/2280
|
||||
test-skopt:
|
||||
name: Test Skopt
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
steps:
|
||||
- name: Check out code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: 3.9
|
||||
|
||||
- name: Run Python test
|
||||
run: make pytest
|
||||
run: make pytest-skopt
|
||||
|
|
|
@ -78,3 +78,6 @@ $RECYCLE.BIN/
|
|||
|
||||
## Vendor dir
|
||||
vendor
|
||||
|
||||
# Jupyter Notebooks.
|
||||
**/.ipynb_checkpoints
|
||||
|
|
|
@ -0,0 +1,38 @@
|
|||
repos:
|
||||
- repo: https://github.com/pre-commit/pre-commit-hooks
|
||||
rev: v2.3.0
|
||||
hooks:
|
||||
- id: check-yaml
|
||||
args: [--allow-multiple-documents]
|
||||
- id: check-json
|
||||
- repo: https://github.com/pycqa/isort
|
||||
rev: 5.11.5
|
||||
hooks:
|
||||
- id: isort
|
||||
name: isort
|
||||
entry: isort --profile black
|
||||
- repo: https://github.com/psf/black
|
||||
rev: 24.2.0
|
||||
hooks:
|
||||
- id: black
|
||||
files: (sdk|examples|pkg)/.*
|
||||
- repo: https://github.com/pycqa/flake8
|
||||
rev: 7.1.1
|
||||
hooks:
|
||||
- id: flake8
|
||||
files: (sdk|examples|pkg)/.*
|
||||
exclude: |
|
||||
(?x)^(
|
||||
.*zz_generated.deepcopy.*|
|
||||
.*pb.go|
|
||||
pkg/apis/manager/.*pb2(?:_grpc)?.py(?:i)?|
|
||||
pkg/apis/v1beta1/openapi_generated.go|
|
||||
pkg/mock/.*|
|
||||
pkg/client/controller/.*|
|
||||
sdk/python/v1beta1/kubeflow/katib/configuration.py|
|
||||
sdk/python/v1beta1/kubeflow/katib/rest.py|
|
||||
sdk/python/v1beta1/kubeflow/katib/__init__.py|
|
||||
sdk/python/v1beta1/kubeflow/katib/exceptions.py|
|
||||
sdk/python/v1beta1/kubeflow/katib/api_client.py|
|
||||
sdk/python/v1beta1/kubeflow/katib/models/.*
|
||||
)$
|
541
CHANGELOG.md
|
@ -1,6 +1,507 @@
|
|||
# Changelog
|
||||
|
||||
## [v0.15.0](https://github.com/kubeflow/katib/tree/v0.15.0) (2023-03-22)
|
||||
# [v0.18.0](https://github.com/kubeflow/katib/tree/v0.18.0) (2025-03-25)
|
||||
|
||||
## Breaking Changes
|
||||
|
||||
- Move Katib manifest image references to ghcr ([#2535](https://github.com/kubeflow/katib/pull/2535) by [@saileshd1402](https://github.com/saileshd1402))
|
||||
- Migrate docker images to ghcr ([#2531](https://github.com/kubeflow/katib/pull/2531) by [@mahdikhashan](https://github.com/mahdikhashan))
|
||||
- Upgrade Kubernetes to v1.31.3 ([#2478](https://github.com/kubeflow/katib/pull/2478) by [@Electronic-Waste](https://github.com/Electronic-Waste))
|
||||
- Upgrade Kubernetes to v1.30.7 ([#2463](https://github.com/kubeflow/katib/pull/2463) by [@Electronic-Waste](https://github.com/Electronic-Waste))
|
||||
- Drop Python 3.7 and Support Python 3.11 in the SDK ([#2337](https://github.com/kubeflow/katib/pull/2337) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
|
||||
## New Features
|
||||
|
||||
### Hyperparameter Optimization for LLMs
|
||||
|
||||
- [DOCS] move llm hyperparameter optimisation design image to the proposal directory and rename it ([#2472](https://github.com/kubeflow/katib/pull/2472) by [@mahdikhashan](https://github.com/mahdikhashan))
|
||||
- [GSoC] Update `tune` API for LLM hyperparameters optimization ([#2393](https://github.com/kubeflow/katib/pull/2393) by [@helenxie-bit](https://github.com/helenxie-bit))
|
||||
- [GSoC] Create LLM Hyperparameters Optimization API Proposal ([#2333](https://github.com/kubeflow/katib/pull/2333) by [@helenxie-bit](https://github.com/helenxie-bit))
|
||||
|
||||
### Support for Advanced Distributions for HPO
|
||||
|
||||
- [GSOC] `optuna` suggestion service logic update ([#2446](https://github.com/kubeflow/katib/pull/2446) by [@shashank-iitbhu](https://github.com/shashank-iitbhu))
|
||||
- [GSOC] `hyperopt` suggestion service logic update ([#2412](https://github.com/kubeflow/katib/pull/2412) by [@shashank-iitbhu](https://github.com/shashank-iitbhu))
|
||||
- [GSOC] Add validator for feasible space distribution ([#2404](https://github.com/kubeflow/katib/pull/2404) by [@shashank-iitbhu](https://github.com/shashank-iitbhu))
|
||||
- [GSOC] added Unknown distribution and convertDistribution in suggestion client ([#2403](https://github.com/kubeflow/katib/pull/2403) by [@shashank-iitbhu](https://github.com/shashank-iitbhu))
|
||||
- [GSOC] Support for various Parameter distributions in Katib ([#2334](https://github.com/kubeflow/katib/pull/2334) by [@shashank-iitbhu](https://github.com/shashank-iitbhu))
|
||||
- [GSoC] Added `DistributionType` to Experiment API ([#2377](https://github.com/kubeflow/katib/pull/2377) by [@shashank-iitbhu](https://github.com/shashank-iitbhu))
|
||||
|
||||
### Push-based Metrics Collector
|
||||
|
||||
- [GSoC] Provide a PyTorch MNIST Example for Push-based Metrics Collection ([#2437](https://github.com/kubeflow/katib/pull/2437) by [@Electronic-Waste](https://github.com/Electronic-Waste))
|
||||
- [GSoC] Compatibility Changes in Trial Controller ([#2394](https://github.com/kubeflow/katib/pull/2394) by [@Electronic-Waste](https://github.com/Electronic-Waste))
|
||||
- [GSoC] New Interface `report_metrics` in Python SDK ([#2371](https://github.com/kubeflow/katib/pull/2371) by [@Electronic-Waste](https://github.com/Electronic-Waste))
|
||||
- [GSoC] KEP for Project 6: Push-based Metrics Collection for Katib ([#2328](https://github.com/kubeflow/katib/pull/2328) by [@Electronic-Waste](https://github.com/Electronic-Waste))
|
||||
- [GSoC] Add New Parameter in `tune` ([#2369](https://github.com/kubeflow/katib/pull/2369) by [@Electronic-Waste](https://github.com/Electronic-Waste))
|
||||
|
||||
### SDK Updates
|
||||
|
||||
- [SDK] Support PyTorchJob as a Trial Worker ([#2512](https://github.com/kubeflow/katib/pull/2512) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- [SDK] test: Add e2e test for tune function. ([#2399](https://github.com/kubeflow/katib/pull/2399) by [@Electronic-Waste](https://github.com/Electronic-Waste))
|
||||
- [SDK] improve PVC creation name error ([#2496](https://github.com/kubeflow/katib/pull/2496) by [@mahdikhashan](https://github.com/mahdikhashan))
|
||||
- [SDK] Fix empty list for env variables and numpy version ([#2360](https://github.com/kubeflow/katib/pull/2360) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- [SDK] Explain Python version support cycle ([#2354](https://github.com/kubeflow/katib/pull/2354) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
|
||||
## Bug Fixes
|
||||
|
||||
- fix(webhook): fix validation message in experiment webhook ([#2507](https://github.com/kubeflow/katib/pull/2507) by [@Electronic-Waste](https://github.com/Electronic-Waste))
|
||||
- Install typing-extensions v4.10.0 to fix Python test error ([#2504](https://github.com/kubeflow/katib/pull/2504) by [@helenxie-bit](https://github.com/helenxie-bit))
|
||||
- [SDK] Update `tune` API ([#2497](https://github.com/kubeflow/katib/pull/2497) by [@helenxie-bit](https://github.com/helenxie-bit))
|
||||
- fix(api): resolve all api violation exceptions in katib api ([#2482](https://github.com/kubeflow/katib/pull/2482) by [@truc0](https://github.com/truc0))
|
||||
- fix(trial): use propagated gomega to improve debuggability. ([#2432](https://github.com/kubeflow/katib/pull/2432) by [@Electronic-Waste](https://github.com/Electronic-Waste))
|
||||
- fix(ui): update None Collector with Push Collector. ([#2418](https://github.com/kubeflow/katib/pull/2418) by [@Electronic-Waste](https://github.com/Electronic-Waste))
|
||||
- fix: Resolve errors in e2e tests for cypress in Katib UI ([#2384](https://github.com/kubeflow/katib/pull/2384) by [@tariq-hasan](https://github.com/tariq-hasan))
|
||||
- doc(example): fix the broken link. ([#2433](https://github.com/kubeflow/katib/pull/2433) by [@Electronic-Waste](https://github.com/Electronic-Waste))
|
||||
- fix: remove remaining MXNet dependency. ([#2456](https://github.com/kubeflow/katib/pull/2456) by [@Electronic-Waste](https://github.com/Electronic-Waste))
|
||||
- Remove Dropout layer from ENAS Trial container to fix E2E tests ([#2455](https://github.com/kubeflow/katib/pull/2455) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- [SDK] fix grpc related bugs in Python SDK ([#2398](https://github.com/kubeflow/katib/pull/2398) by [@Electronic-Waste](https://github.com/Electronic-Waste))
|
||||
- [SDK] Fix types error ([#2424](https://github.com/kubeflow/katib/pull/2424) by [@helenxie-bit](https://github.com/helenxie-bit))
|
||||
- fix: remove the dependency of `protocmp` in `google.golang.org/protobuf/testing/protocmp`. ([#2391](https://github.com/kubeflow/katib/pull/2391) by [@Electronic-Waste](https://github.com/Electronic-Waste))
|
||||
- Fix TestReconcileBatchJob ([#2350](https://github.com/kubeflow/katib/pull/2350) by [@forsaken628](https://github.com/forsaken628))
|
||||
- Fix apple silicon rosetta error when building images from the source code ([#2342](https://github.com/kubeflow/katib/pull/2342) by [@helenxie-bit](https://github.com/helenxie-bit))
|
||||
- fix katib use crds token pipeline trial template guide ([#2330](https://github.com/kubeflow/katib/pull/2330) by [@Jerry-yz](https://github.com/Jerry-yz))
|
||||
- Fix Scikit-Learn Version for Skopt Tests ([#2336](https://github.com/kubeflow/katib/pull/2336) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
|
||||
## Misc
|
||||
|
||||
- Support old-style TensorFlow events (tensorboard) ([#2517](https://github.com/kubeflow/katib/pull/2517) by [@garymm](https://github.com/garymm))
|
||||
- Set experiment names at a max of 40 characters. ([#2468](https://github.com/kubeflow/katib/pull/2468) by [@AydanPirani](https://github.com/AydanPirani))
|
||||
- [CI] optimize katib ui dockerfile ([#2505](https://github.com/kubeflow/katib/pull/2505) by [@mahdikhashan](https://github.com/mahdikhashan))
|
||||
- Sort experiments by descending creation date by default in katib-ui ([#2498](https://github.com/kubeflow/katib/pull/2498) by [@Doris-xm](https://github.com/Doris-xm))
|
||||
- [GSoC] Add unit tests for `tune` API ([#2423](https://github.com/kubeflow/katib/pull/2423) by [@helenxie-bit](https://github.com/helenxie-bit))
|
||||
- Update MutatingWebhookConfiguration: Switch from objectSelector to AdmissionWebhookMatchConditions ([#2241](https://github.com/kubeflow/katib/pull/2241) by [@lianghao208](https://github.com/lianghao208))
|
||||
- chore: supporting the listen-address parameter on db-manager ([#2465](https://github.com/kubeflow/katib/pull/2465) by [@caiofralmeida](https://github.com/caiofralmeida))
|
||||
- Upgrade klog to v2 ([#2470](https://github.com/kubeflow/katib/pull/2470) by [@Doris-xm](https://github.com/Doris-xm))
|
||||
- Ignore cache exporting errors in the image building workflows ([#2487](https://github.com/kubeflow/katib/pull/2487) by [@Doris-xm](https://github.com/Doris-xm))
|
||||
- Upgrade grpcio version to v1.64.1 ([#2483](https://github.com/kubeflow/katib/pull/2483) by [@Electronic-Waste](https://github.com/Electronic-Waste))
|
||||
- docs: remove katib workflow ([#2443](https://github.com/kubeflow/katib/pull/2443) by [@gonmmarques](https://github.com/gonmmarques))
|
||||
- Migrate KatibCertGenerator to OPA CertController ([#2345](https://github.com/kubeflow/katib/pull/2345) by [@forsaken628](https://github.com/forsaken628))
|
||||
- Promote @Electronic-Waste and @helenxie-bit as Katib reviewers ([#2439](https://github.com/kubeflow/katib/pull/2439) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- Update README and out-of-date docs ([#2438](https://github.com/kubeflow/katib/pull/2438) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- Changes isort profile to black, to be fully compatible and adds 'pkg' dir for black and flake8 ([#2413](https://github.com/kubeflow/katib/pull/2413) by [@Ygnas](https://github.com/Ygnas))
|
||||
- Introduced error constants and replaced reflect with cmp ([#2289](https://github.com/kubeflow/katib/pull/2289) by [@tariq-hasan](https://github.com/tariq-hasan))
|
||||
- [Test] Refactor `inject_webhook_test.go` according to the Developer Guide ([#2401](https://github.com/kubeflow/katib/pull/2401) by [@Electronic-Waste](https://github.com/Electronic-Waste))
|
||||
- Enhance pre-commit hooks with flake8 and black ([#2407](https://github.com/kubeflow/katib/pull/2407) by [@Ygnas](https://github.com/Ygnas))
|
||||
- added `Distribution` field to feasibleSpace in `api.proto` ([#2397](https://github.com/kubeflow/katib/pull/2397) by [@shashank-iitbhu](https://github.com/shashank-iitbhu))
|
||||
- Begin enabling pre-commit hooks ([#2242](https://github.com/kubeflow/katib/pull/2242) by [@droctothorpe](https://github.com/droctothorpe))
|
||||
- Update Instructions for Argo Workflows ([#2382](https://github.com/kubeflow/katib/pull/2382) by [@jaffe-fly](https://github.com/jaffe-fly))
|
||||
- docs: update suggestion.md ([#2387](https://github.com/kubeflow/katib/pull/2387) by [@eltociear](https://github.com/eltociear))
|
||||
- Add command to re-run GitHub Actions tests ([#2385](https://github.com/kubeflow/katib/pull/2385) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- Bump Katib Python SDK to 0.17.0 version ([#2379](https://github.com/kubeflow/katib/pull/2379) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- Add Changelog for Katib v0.17.0 ([#2380](https://github.com/kubeflow/katib/pull/2380) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- Replaced hpcloud with nxadm for tail package in Go ([#2375](https://github.com/kubeflow/katib/pull/2375) by [@tariq-hasan](https://github.com/tariq-hasan))
|
||||
- Use ErrorList for experiment validator ([#2329](https://github.com/kubeflow/katib/pull/2329) by [@ckcd](https://github.com/ckcd))
|
||||
- Add Changelog for Katib v0.17.0-rc.1 ([#2370](https://github.com/kubeflow/katib/pull/2370) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- Remove default caBundle value ([#2368](https://github.com/kubeflow/katib/pull/2368) by [@vihangm](https://github.com/vihangm))
|
||||
- Bump Katib Python SDK to 0.17.0rc1 version ([#2365](https://github.com/kubeflow/katib/pull/2365) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- Add unit test for `create_experiment` in the `katib_client` module ([#2325](https://github.com/kubeflow/katib/pull/2325) by [@tariq-hasan](https://github.com/tariq-hasan))
|
||||
- Remove code generation from release script ([#2363](https://github.com/kubeflow/katib/pull/2363) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- Upgrade the protobuf version to >=4.21.12,<5 ([#2358](https://github.com/kubeflow/katib/pull/2358) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
- Replace gRPC code generation tool from Znly/protoc to Buf ([#2344](https://github.com/kubeflow/katib/pull/2344) by [@forsaken628](https://github.com/forsaken628))
|
||||
- Replace already closed github.com/golang/mock with go.uber.org/mock ([#2357](https://github.com/kubeflow/katib/pull/2357) by [@forsaken628](https://github.com/forsaken628))
|
||||
- Use cache-dependency-path in actions/setup-go for CI workflow ([#2355](https://github.com/kubeflow/katib/pull/2355) by [@forsaken628](https://github.com/forsaken628))
|
||||
- Update Slack Invitation ([#2349](https://github.com/kubeflow/katib/pull/2349) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- Update GitHub template to better triage Issues ([#2335](https://github.com/kubeflow/katib/pull/2335) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- Add Changelog for Katib v0.17.0-rc.0 ([#2319](https://github.com/kubeflow/katib/pull/2319) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- Update outdated actions ([#2324](https://github.com/kubeflow/katib/pull/2324) by [@Mersho](https://github.com/Mersho))
|
||||
- Make test fields private in Go unit tests ([#2316](https://github.com/kubeflow/katib/pull/2316) by [@tariq-hasan](https://github.com/tariq-hasan))
|
||||
- Bump Katib Python SDK to 0.17.0rc0 Version ([#2318](https://github.com/kubeflow/katib/pull/2318) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
|
||||
[Full Changelog](https://github.com/kubeflow/katib/compare/v0.17.0...v0.18.0)
|
||||
|
||||
# [v0.18.0-rc.0](https://github.com/kubeflow/katib/tree/v0.18.0-rc.0) (2025-02-13)
|
||||
|
||||
## Breaking Changes
|
||||
|
||||
- Upgrade Kubernetes to v1.31.3 ([#2478](https://github.com/kubeflow/katib/pull/2478) by [@Electronic-Waste](https://github.com/Electronic-Waste))
|
||||
- Upgrade Kubernetes to v1.30.7 ([#2463](https://github.com/kubeflow/katib/pull/2463) by [@Electronic-Waste](https://github.com/Electronic-Waste))
|
||||
- Drop Python 3.7 and Support Python 3.11 in the SDK ([#2337](https://github.com/kubeflow/katib/pull/2337) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
|
||||
## New Features
|
||||
|
||||
### Hyperparameter Optimization for LLMs
|
||||
|
||||
- [DOCS] move llm hyperparameter optimisation design image to the proposal directory and rename it ([#2472](https://github.com/kubeflow/katib/pull/2472) by [@mahdikhashan](https://github.com/mahdikhashan))
|
||||
- [GSoC] Update `tune` API for LLM hyperparameters optimization ([#2393](https://github.com/kubeflow/katib/pull/2393) by [@helenxie-bit](https://github.com/helenxie-bit))
|
||||
- [GSoC] Create LLM Hyperparameters Optimization API Proposal ([#2333](https://github.com/kubeflow/katib/pull/2333) by [@helenxie-bit](https://github.com/helenxie-bit))
|
||||
|
||||
### Support for Advanced Distributions for HPO
|
||||
|
||||
- [GSOC] `optuna` suggestion service logic update ([#2446](https://github.com/kubeflow/katib/pull/2446) by [@shashank-iitbhu](https://github.com/shashank-iitbhu))
|
||||
- [GSOC] `hyperopt` suggestion service logic update ([#2412](https://github.com/kubeflow/katib/pull/2412) by [@shashank-iitbhu](https://github.com/shashank-iitbhu))
|
||||
- [GSOC] Add validator for feasible space distribution ([#2404](https://github.com/kubeflow/katib/pull/2404) by [@shashank-iitbhu](https://github.com/shashank-iitbhu))
|
||||
- [GSOC] added Unknown distribution and convertDistribution in suggestion client ([#2403](https://github.com/kubeflow/katib/pull/2403) by [@shashank-iitbhu](https://github.com/shashank-iitbhu))
|
||||
- [GSOC] Support for various Parameter distributions in Katib ([#2334](https://github.com/kubeflow/katib/pull/2334) by [@shashank-iitbhu](https://github.com/shashank-iitbhu))
|
||||
- [GSoC] Added `DistributionType` to Experiment API ([#2377](https://github.com/kubeflow/katib/pull/2377) by [@shashank-iitbhu](https://github.com/shashank-iitbhu))
|
||||
|
||||
### Push-based Metrics Collector
|
||||
|
||||
- [GSoC] Provide a PyTorch MNIST Example for Push-based Metrics Collection ([#2437](https://github.com/kubeflow/katib/pull/2437) by [@Electronic-Waste](https://github.com/Electronic-Waste))
|
||||
- [GSoC] Compatibility Changes in Trial Controller ([#2394](https://github.com/kubeflow/katib/pull/2394) by [@Electronic-Waste](https://github.com/Electronic-Waste))
|
||||
- [GSoC] New Interface `report_metrics` in Python SDK ([#2371](https://github.com/kubeflow/katib/pull/2371) by [@Electronic-Waste](https://github.com/Electronic-Waste))
|
||||
- [GSoC] KEP for Project 6: Push-based Metrics Collection for Katib ([#2328](https://github.com/kubeflow/katib/pull/2328) by [@Electronic-Waste](https://github.com/Electronic-Waste))
|
||||
- [GSoC] Add New Parameter in `tune` ([#2369](https://github.com/kubeflow/katib/pull/2369) by [@Electronic-Waste](https://github.com/Electronic-Waste))
|
||||
|
||||
### SDK Updates
|
||||
|
||||
- [SDK] Support PyTorchJob as a Trial Worker ([#2512](https://github.com/kubeflow/katib/pull/2512) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- [SDK] test: Add e2e test for tune function. ([#2399](https://github.com/kubeflow/katib/pull/2399) by [@Electronic-Waste](https://github.com/Electronic-Waste))
|
||||
- [SDK] improve PVC creation name error ([#2496](https://github.com/kubeflow/katib/pull/2496) by [@mahdikhashan](https://github.com/mahdikhashan))
|
||||
- [SDK] Fix empty list for env variables and numpy version ([#2360](https://github.com/kubeflow/katib/pull/2360) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- [SDK] Explain Python version support cycle ([#2354](https://github.com/kubeflow/katib/pull/2354) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
|
||||
## Bug Fixes
|
||||
|
||||
- fix(webhook): fix validation message in experiment webhook ([#2507](https://github.com/kubeflow/katib/pull/2507) by [@Electronic-Waste](https://github.com/Electronic-Waste))
|
||||
- Install typing-extensions v4.10.0 to fix Python test error ([#2504](https://github.com/kubeflow/katib/pull/2504) by [@helenxie-bit](https://github.com/helenxie-bit))
|
||||
- [SDK] Update `tune` API ([#2497](https://github.com/kubeflow/katib/pull/2497) by [@helenxie-bit](https://github.com/helenxie-bit))
|
||||
- fix(api): resolve all api violation exceptions in katib api ([#2482](https://github.com/kubeflow/katib/pull/2482) by [@truc0](https://github.com/truc0))
|
||||
- fix(trial): use propagated gomega to improve debuggability. ([#2432](https://github.com/kubeflow/katib/pull/2432) by [@Electronic-Waste](https://github.com/Electronic-Waste))
|
||||
- fix(ui): update None Collector with Push Collector. ([#2418](https://github.com/kubeflow/katib/pull/2418) by [@Electronic-Waste](https://github.com/Electronic-Waste))
|
||||
- fix: Resolve errors in e2e tests for cypress in Katib UI ([#2384](https://github.com/kubeflow/katib/pull/2384) by [@tariq-hasan](https://github.com/tariq-hasan))
|
||||
- doc(example): fix the broken link. ([#2433](https://github.com/kubeflow/katib/pull/2433) by [@Electronic-Waste](https://github.com/Electronic-Waste))
|
||||
- fix: remove remaining MXNet dependency. ([#2456](https://github.com/kubeflow/katib/pull/2456) by [@Electronic-Waste](https://github.com/Electronic-Waste))
|
||||
- Remove Dropout layer from ENAS Trial container to fix E2E tests ([#2455](https://github.com/kubeflow/katib/pull/2455) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- [SDK] fix grpc related bugs in Python SDK ([#2398](https://github.com/kubeflow/katib/pull/2398) by [@Electronic-Waste](https://github.com/Electronic-Waste))
|
||||
- [SDK] Fix types error ([#2424](https://github.com/kubeflow/katib/pull/2424) by [@helenxie-bit](https://github.com/helenxie-bit))
|
||||
- fix: remove the dependency of `protocmp` in `google.golang.org/protobuf/testing/protocmp`. ([#2391](https://github.com/kubeflow/katib/pull/2391) by [@Electronic-Waste](https://github.com/Electronic-Waste))
|
||||
- Fix TestReconcileBatchJob ([#2350](https://github.com/kubeflow/katib/pull/2350) by [@forsaken628](https://github.com/forsaken628))
|
||||
- Fix apple silicon rosetta error when building images from the source code ([#2342](https://github.com/kubeflow/katib/pull/2342) by [@helenxie-bit](https://github.com/helenxie-bit))
|
||||
- fix katib use crds token pipeline trial template guide ([#2330](https://github.com/kubeflow/katib/pull/2330) by [@Jerry-yz](https://github.com/Jerry-yz))
|
||||
- Fix Scikit-Learn Version for Skopt Tests ([#2336](https://github.com/kubeflow/katib/pull/2336) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
|
||||
## Misc
|
||||
|
||||
- Set experiment names at a max of 40 characters. ([#2468](https://github.com/kubeflow/katib/pull/2468) by [@AydanPirani](https://github.com/AydanPirani))
|
||||
- [CI] optimize katib ui dockerfile ([#2505](https://github.com/kubeflow/katib/pull/2505) by [@mahdikhashan](https://github.com/mahdikhashan))
|
||||
- Sort experiments by descending creation date by default in katib-ui ([#2498](https://github.com/kubeflow/katib/pull/2498) by [@Doris-xm](https://github.com/Doris-xm))
|
||||
- [GSoC] Add unit tests for `tune` API ([#2423](https://github.com/kubeflow/katib/pull/2423) by [@helenxie-bit](https://github.com/helenxie-bit))
|
||||
- Update MutatingWebhookConfiguration: Switch from objectSelector to AdmissionWebhookMatchConditions ([#2241](https://github.com/kubeflow/katib/pull/2241) by [@lianghao208](https://github.com/lianghao208))
|
||||
- chore: supporting the listen-address parameter on db-manager ([#2465](https://github.com/kubeflow/katib/pull/2465) by [@caiofralmeida](https://github.com/caiofralmeida))
|
||||
- Upgrade klog to v2 ([#2470](https://github.com/kubeflow/katib/pull/2470) by [@Doris-xm](https://github.com/Doris-xm))
|
||||
- Ignore cache exporting errors in the image building workflows ([#2487](https://github.com/kubeflow/katib/pull/2487) by [@Doris-xm](https://github.com/Doris-xm))
|
||||
- Upgrade grpcio version to v1.64.1 ([#2483](https://github.com/kubeflow/katib/pull/2483) by [@Electronic-Waste](https://github.com/Electronic-Waste))
|
||||
- docs: remove katib workflow ([#2443](https://github.com/kubeflow/katib/pull/2443) by [@gonmmarques](https://github.com/gonmmarques))
|
||||
- Migrate KatibCertGenerator to OPA CertController ([#2345](https://github.com/kubeflow/katib/pull/2345) by [@forsaken628](https://github.com/forsaken628))
|
||||
- Promote @Electronic-Waste and @helenxie-bit as Katib reviewers ([#2439](https://github.com/kubeflow/katib/pull/2439) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- Update README and out-of-date docs ([#2438](https://github.com/kubeflow/katib/pull/2438) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- Changes isort profile to black, to be fully compatible and adds 'pkg' dir for black and flake8 ([#2413](https://github.com/kubeflow/katib/pull/2413) by [@Ygnas](https://github.com/Ygnas))
|
||||
- Introduced error constants and replaced reflect with cmp ([#2289](https://github.com/kubeflow/katib/pull/2289) by [@tariq-hasan](https://github.com/tariq-hasan))
|
||||
- [Test] Refactor `inject_webhook_test.go` according to the Developer Guide ([#2401](https://github.com/kubeflow/katib/pull/2401) by [@Electronic-Waste](https://github.com/Electronic-Waste))
|
||||
- Enhance pre-commit hooks with flake8 and black ([#2407](https://github.com/kubeflow/katib/pull/2407) by [@Ygnas](https://github.com/Ygnas))
|
||||
- added `Distribution` field to feasibleSpace in `api.proto` ([#2397](https://github.com/kubeflow/katib/pull/2397) by [@shashank-iitbhu](https://github.com/shashank-iitbhu))
|
||||
- Begin enabling pre-commit hooks ([#2242](https://github.com/kubeflow/katib/pull/2242) by [@droctothorpe](https://github.com/droctothorpe))
|
||||
- Update Instructions for Argo Workflows ([#2382](https://github.com/kubeflow/katib/pull/2382) by [@jaffe-fly](https://github.com/jaffe-fly))
|
||||
- docs: update suggestion.md ([#2387](https://github.com/kubeflow/katib/pull/2387) by [@eltociear](https://github.com/eltociear))
|
||||
- Add command to re-run GitHub Actions tests ([#2385](https://github.com/kubeflow/katib/pull/2385) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- Bump Katib Python SDK to 0.17.0 version ([#2379](https://github.com/kubeflow/katib/pull/2379) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- Add Changelog for Katib v0.17.0 ([#2380](https://github.com/kubeflow/katib/pull/2380) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- Replaced hpcloud with nxadm for tail package in Go ([#2375](https://github.com/kubeflow/katib/pull/2375) by [@tariq-hasan](https://github.com/tariq-hasan))
|
||||
- Use ErrorList for experiment validator ([#2329](https://github.com/kubeflow/katib/pull/2329) by [@ckcd](https://github.com/ckcd))
|
||||
- Add Changelog for Katib v0.17.0-rc.1 ([#2370](https://github.com/kubeflow/katib/pull/2370) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- Remove default caBundle value ([#2368](https://github.com/kubeflow/katib/pull/2368) by [@vihangm](https://github.com/vihangm))
|
||||
- Bump Katib Python SDK to 0.17.0rc1 version ([#2365](https://github.com/kubeflow/katib/pull/2365) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- Add unit test for `create_experiment` in the `katib_client` module ([#2325](https://github.com/kubeflow/katib/pull/2325) by [@tariq-hasan](https://github.com/tariq-hasan))
|
||||
- Remove code generation from release script ([#2363](https://github.com/kubeflow/katib/pull/2363) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- Upgrade the protobuf version to >=4.21.12,<5 ([#2358](https://github.com/kubeflow/katib/pull/2358) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
- Replace gRPC code generation tool from Znly/protoc to Buf ([#2344](https://github.com/kubeflow/katib/pull/2344) by [@forsaken628](https://github.com/forsaken628))
|
||||
- Replace already closed github.com/golang/mock with go.uber.org/mock ([#2357](https://github.com/kubeflow/katib/pull/2357) by [@forsaken628](https://github.com/forsaken628))
|
||||
- Use cache-dependency-path in actions/setup-go for CI workflow ([#2355](https://github.com/kubeflow/katib/pull/2355) by [@forsaken628](https://github.com/forsaken628))
|
||||
- Update Slack Invitation ([#2349](https://github.com/kubeflow/katib/pull/2349) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- Update GitHub template to better triage Issues ([#2335](https://github.com/kubeflow/katib/pull/2335) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- Add Changelog for Katib v0.17.0-rc.0 ([#2319](https://github.com/kubeflow/katib/pull/2319) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- Update outdated actions ([#2324](https://github.com/kubeflow/katib/pull/2324) by [@Mersho](https://github.com/Mersho))
|
||||
- Make test fields private in Go unit tests ([#2316](https://github.com/kubeflow/katib/pull/2316) by [@tariq-hasan](https://github.com/tariq-hasan))
|
||||
- Bump Katib Python SDK to 0.17.0rc0 Version ([#2318](https://github.com/kubeflow/katib/pull/2318) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
|
||||
[Full Changelog](https://github.com/kubeflow/katib/compare/v0.17.0...v0.18.0-rc.0)
|
||||
|
||||
# [v0.17.0](https://github.com/kubeflow/katib/tree/v0.17.0) (2024-07-12)
|
||||
|
||||
## Breaking Changes
|
||||
|
||||
- [SDK] Drop Python 3.7 and Support Python 3.11 ([#2337](https://github.com/kubeflow/katib/pull/2337) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
- [SDK] Upgrade the protobuf version to >=4.21.12,<5 ([#2358](https://github.com/kubeflow/katib/pull/2358) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
- Drop Kubernetes v1.26, and support Kubernetes v1.29 ([#2308](https://github.com/kubeflow/katib/pull/2308) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
- Drop Kubernetes v1.25, and Support Kubernetes v1.28 ([#2303](https://github.com/kubeflow/katib/pull/2303) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
- Remove MXNet examples ([#2267](https://github.com/kubeflow/katib/pull/2267) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
|
||||
## New Features
|
||||
|
||||
### Core Features
|
||||
|
||||
- Replace gRPC code generation tool from Znly/protoc to Buf ([#2344](https://github.com/kubeflow/katib/pull/2344) by [@forsaken628](https://github.com/forsaken628))
|
||||
- Support ARM64 arch for release images ([#2315](https://github.com/kubeflow/katib/pull/2315) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- DB: Add environment variable option to skip DB table creation ([#2245](https://github.com/kubeflow/katib/pull/2245) by [@lkaybob](https://github.com/lkaybob))
|
||||
- Add environment variable option to set postgres ssl mode ([#2266](https://github.com/kubeflow/katib/pull/2266) by [@ckcd](https://github.com/ckcd))
|
||||
- Upgrade TensorFlow version to v2.16.1 ([#2282](https://github.com/kubeflow/katib/pull/2282) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
- Upgrade PyTorch version to v2.2.1 ([#2279](https://github.com/kubeflow/katib/pull/2279) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
|
||||
### SDK Features
|
||||
|
||||
- [SDK] Generate Name functionality for creating experiments. ([#2272](https://github.com/kubeflow/katib/pull/2272) by [@bharathk005](https://github.com/bharathk005))
|
||||
- [SDK] Add `env` & `env_from` in client tune ([#2235](https://github.com/kubeflow/katib/pull/2235) by [@shipengcheng1230](https://github.com/shipengcheng1230))
|
||||
- [SDK] Add 'algorithm_settings' in client tune ([#2227](https://github.com/kubeflow/katib/pull/2227) by [@shipengcheng1230](https://github.com/shipengcheng1230))
|
||||
- [SDK] Raise more human-readable name conflict exception ([#2199](https://github.com/kubeflow/katib/pull/2199) by [@droctothorpe](https://github.com/droctothorpe))
|
||||
|
||||
## Bug Fixes
|
||||
|
||||
- Remove code generation from release script ([#2364](https://github.com/kubeflow/katib/pull/2364) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- [SDK] Fix empty list for env variables and numpy version ([#2360](https://github.com/kubeflow/katib/pull/2360) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- Use cache-dependency-path in actions/setup-go for CI workflow ([#2355](https://github.com/kubeflow/katib/pull/2355) by [@forsaken628](https://github.com/forsaken628))
|
||||
- Fix TestReconcileBatchJob ([#2350](https://github.com/kubeflow/katib/pull/2350) by [@forsaken628](https://github.com/forsaken628))
|
||||
- Fix Scikit-Learn Version for Skopt Tests ([#2336](https://github.com/kubeflow/katib/pull/2336) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- [SDK] Fix env per Trial parameter in tune API ([#2304](https://github.com/kubeflow/katib/pull/2304) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- Fix: clean up UTs for file metrics collector ([#2285](https://github.com/kubeflow/katib/pull/2285) by [@Electronic-Waste](https://github.com/Electronic-Waste))
|
||||
- Fix tensor devices for DARTS Trial ([#2273](https://github.com/kubeflow/katib/pull/2273) by [@sifa1024](https://github.com/sifa1024))
|
||||
- Typo fix stale.yaml ([#2257](https://github.com/kubeflow/katib/pull/2257) by [@tarilabs](https://github.com/tarilabs))
|
||||
- Fix Optuna Validation for CMA-ES ([#2240](https://github.com/kubeflow/katib/pull/2240) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
|
||||
## Misc
|
||||
|
||||
- Replace already closed github.com/golang/mock with go.uber.org/mock ([#2357](https://github.com/kubeflow/katib/pull/2357) by [@forsaken628](https://github.com/forsaken628))
|
||||
- Update outdated actions ([#2324](https://github.com/kubeflow/katib/pull/2324) by [@Mersho](https://github.com/Mersho))
|
||||
- Upgrade Go version to v1.22 ([#2309](https://github.com/kubeflow/katib/pull/2309) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
- CI: Enable parallel mode for the coveralls ([#2297](https://github.com/kubeflow/katib/pull/2297) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
- Upgrade Python version to 3.11 ([#2278](https://github.com/kubeflow/katib/pull/2278) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
- chore: add unit testcases for files in Text format. ([#2274](https://github.com/kubeflow/katib/pull/2274) by [@Electronic-Waste](https://github.com/Electronic-Waste))
|
||||
- Upgrade google/go-containerregistry/pkg/authn/k8schain ([#2252](https://github.com/kubeflow/katib/pull/2252) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
- Add Technical and style guide to the contribution guide ([#2250](https://github.com/kubeflow/katib/pull/2250) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
- Install typing-extensions v4.6.3 for Optuna ([#2251](https://github.com/kubeflow/katib/pull/2251) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
- Remove legacy BO code ([#2246](https://github.com/kubeflow/katib/pull/2246) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- Add Changelog for Katib v0.16.0 ([#2239](https://github.com/kubeflow/katib/pull/2239) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- Add Katib ROADMAP 2022/2023 ([#2153](https://github.com/kubeflow/katib/pull/2153) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- Update Ubuntu to 22.04 for E2E Tests ([#2222](https://github.com/kubeflow/katib/pull/2222) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- Run Stale Action Every 5th Hour ([#2221](https://github.com/kubeflow/katib/pull/2221) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- Add Stale GitHub Action ([#2220](https://github.com/kubeflow/katib/pull/2220) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- Add Changelog for Katib v0.16.0-rc.1 ([#2218](https://github.com/kubeflow/katib/pull/2218) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- Add Changelog for Katib v0.16.0-rc.0 ([#2204](https://github.com/kubeflow/katib/pull/2204) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- Use the controller-runtime logger in the cert-generator ([#2219](https://github.com/kubeflow/katib/pull/2219) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
|
||||
[Full Changelog](https://github.com/kubeflow/katib/compare/v0.16.0...v0.17.0)
|
||||
|
||||
# [v0.17.0-rc.1](https://github.com/kubeflow/katib/tree/v0.17.0-rc.1) (2024-06-20)
|
||||
|
||||
## Breaking Changes
|
||||
|
||||
- [SDK] Drop Python 3.7 and Support Python 3.11 ([#2337](https://github.com/kubeflow/katib/pull/2337) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
- [SDK] Upgrade the protobuf version to >=4.21.12,<5 ([#2358](https://github.com/kubeflow/katib/pull/2358) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
|
||||
## New Features
|
||||
|
||||
- Replace gRPC code generation tool from Znly/protoc to Buf ([#2344](https://github.com/kubeflow/katib/pull/2344) by [@forsaken628](https://github.com/forsaken628))
|
||||
|
||||
## Bug Fixes
|
||||
|
||||
- Remove code generation from release script ([#2364](https://github.com/kubeflow/katib/pull/2364) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- [SDK] Fix empty list for env variables and numpy version ([#2360](https://github.com/kubeflow/katib/pull/2360) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- Use cache-dependency-path in actions/setup-go for CI workflow ([#2355](https://github.com/kubeflow/katib/pull/2355) by [@forsaken628](https://github.com/forsaken628))
|
||||
- Fix TestReconcileBatchJob ([#2350](https://github.com/kubeflow/katib/pull/2350) by [@forsaken628](https://github.com/forsaken628))
|
||||
- Fix Scikit-Learn Version for Skopt Tests ([#2336](https://github.com/kubeflow/katib/pull/2336) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
|
||||
## Misc
|
||||
|
||||
- Replace already closed github.com/golang/mock with go.uber.org/mock ([#2357](https://github.com/kubeflow/katib/pull/2357) by [@forsaken628](https://github.com/forsaken628))
|
||||
- Update outdated actions ([#2324](https://github.com/kubeflow/katib/pull/2324) by [@Mersho](https://github.com/Mersho))
|
||||
|
||||
[Full Changelog](https://github.com/kubeflow/katib/compare/v0.17.0-rc.0...v0.17.0-rc.1)
|
||||
|
||||
# [v0.17.0-rc.0](https://github.com/kubeflow/katib/tree/v0.17.0-rc.0) (2024-04-29)
|
||||
|
||||
## Breaking Changes
|
||||
|
||||
- Drop Kubernetes v1.26, and support Kubernetes v1.29 ([#2308](https://github.com/kubeflow/katib/pull/2308) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
- Drop Kubernetes v1.25, and Support Kubernetes v1.28 ([#2303](https://github.com/kubeflow/katib/pull/2303) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
|
||||
## New Features
|
||||
|
||||
### Core Features
|
||||
|
||||
- Support ARM64 arch for release images ([#2315](https://github.com/kubeflow/katib/pull/2315) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- DB: Add environment variable option to skip DB table creation ([#2245](https://github.com/kubeflow/katib/pull/2245) by [@lkaybob](https://github.com/lkaybob))
|
||||
- Add environment variable option to set postgres ssl mode ([#2266](https://github.com/kubeflow/katib/pull/2266) by [@ckcd](https://github.com/ckcd))
|
||||
- Upgrade TensorFlow version to v2.16.1 ([#2282](https://github.com/kubeflow/katib/pull/2282) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
- Upgrade PyTorch version to v2.2.1 ([#2279](https://github.com/kubeflow/katib/pull/2279) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
|
||||
### SDK Features
|
||||
|
||||
- [SDK] Generate Name functionality for creating experiments. ([#2272](https://github.com/kubeflow/katib/pull/2272) by [@bharathk005](https://github.com/bharathk005))
|
||||
- [SDK] Add `env` & `env_from` in client tune ([#2235](https://github.com/kubeflow/katib/pull/2235) by [@shipengcheng1230](https://github.com/shipengcheng1230))
|
||||
- [SDK] Add 'algorithm_settings' in client tune ([#2227](https://github.com/kubeflow/katib/pull/2227) by [@shipengcheng1230](https://github.com/shipengcheng1230))
|
||||
- [SDK] Raise more human-readable name conflict exception ([#2199](https://github.com/kubeflow/katib/pull/2199) by [@droctothorpe](https://github.com/droctothorpe))
|
||||
|
||||
## Bug Fixes
|
||||
|
||||
- [SDK] Fix env per Trial parameter in tune API ([#2304](https://github.com/kubeflow/katib/pull/2304) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- Fix: clean up UTs for file metrics collector ([#2285](https://github.com/kubeflow/katib/pull/2285) by [@Electronic-Waste](https://github.com/Electronic-Waste))
|
||||
- Fix tensor devices for DARTS Trial ([#2273](https://github.com/kubeflow/katib/pull/2273) by [@sifa1024](https://github.com/sifa1024))
|
||||
- Typo fix stale.yaml ([#2257](https://github.com/kubeflow/katib/pull/2257) by [@tarilabs](https://github.com/tarilabs))
|
||||
- Fix Optuna Validation for CMA-ES ([#2240](https://github.com/kubeflow/katib/pull/2240) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
|
||||
## Misc
|
||||
|
||||
- Upgrade Go version to v1.22 ([#2309](https://github.com/kubeflow/katib/pull/2309) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
- CI: Enable parallel mode for the coveralls ([#2297](https://github.com/kubeflow/katib/pull/2297) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
- Upgrade Python version to 3.11 ([#2278](https://github.com/kubeflow/katib/pull/2278) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
- chore: add unit testcases for files in Text format. ([#2274](https://github.com/kubeflow/katib/pull/2274) by [@Electronic-Waste](https://github.com/Electronic-Waste))
|
||||
- Upgrade google/go-containerregistry/pkg/authn/k8schain ([#2252](https://github.com/kubeflow/katib/pull/2252) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
- Remove MXNet examples ([#2267](https://github.com/kubeflow/katib/pull/2267) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
- Add Technical and style guide to the contribution guide ([#2250](https://github.com/kubeflow/katib/pull/2250) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
- Install typing-extensions v4.6.3 for Optuna ([#2251](https://github.com/kubeflow/katib/pull/2251) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
- Remove legacy BO code ([#2246](https://github.com/kubeflow/katib/pull/2246) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- Add Changelog for Katib v0.16.0 ([#2239](https://github.com/kubeflow/katib/pull/2239) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- Add Katib ROADMAP 2022/2023 ([#2153](https://github.com/kubeflow/katib/pull/2153) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- Update Ubuntu to 22.04 for E2E Tests ([#2222](https://github.com/kubeflow/katib/pull/2222) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- Run Stale Action Every 5th Hour ([#2221](https://github.com/kubeflow/katib/pull/2221) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- Add Stale GitHub Action ([#2220](https://github.com/kubeflow/katib/pull/2220) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- Add Changelog for Katib v0.16.0-rc.1 ([#2218](https://github.com/kubeflow/katib/pull/2218) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- Add Changelog for Katib v0.16.0-rc.0 ([#2204](https://github.com/kubeflow/katib/pull/2204) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- Use the controller-runtime logger in the cert-generator ([#2219](https://github.com/kubeflow/katib/pull/2219) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
|
||||
[Full Changelog](https://github.com/kubeflow/katib/compare/v0.16.0...v0.17.0-rc.0)
|
||||
|
||||
# [v0.16.0](https://github.com/kubeflow/katib/tree/v0.16.0) (2023-10-31)
|
||||
|
||||
## Breaking Changes
|
||||
|
||||
- Implement KatibConfig API ([#2176](https://github.com/kubeflow/katib/pull/2176) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
- Drop Kubernetes v1.24 and support Kubernetes v1.27 ([#2182](https://github.com/kubeflow/katib/pull/2182) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
- Drop Kubernetes v1.23 and support Kubernetes v1.26 ([#2177](https://github.com/kubeflow/katib/pull/2177) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
- Change failurePolicy to Fail for Katib Webhooks ([#2018](https://github.com/kubeflow/katib/pull/2018) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
|
||||
## New Features
|
||||
|
||||
### Core Features
|
||||
|
||||
- Consolidate the Katib Cert Generator to the Katib Controller ([#2185](https://github.com/kubeflow/katib/pull/2185) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
- Containerize tests for Katib Conformance ([#2146](https://github.com/kubeflow/katib/pull/2146) by [@nagar-ajay](https://github.com/nagar-ajay))
|
||||
|
||||
### UI Improvements
|
||||
|
||||
- [UI] Default Resume Policy to never from UI ([#2195](https://github.com/kubeflow/katib/pull/2195) by [@mChowdhury-91](https://github.com/mChowdhury-91))
|
||||
- [UI] Remove Deprecated Katib UI ([#2179](https://github.com/kubeflow/katib/pull/2179) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- [UI] Fix Trial Logs when Kubernetes Job Fails ([#2164](https://github.com/kubeflow/katib/pull/2164) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- kwa(front): Support all namespaces ([#2119](https://github.com/kubeflow/katib/pull/2119) by [@elenzio9](https://github.com/elenzio9))
|
||||
- kwa(front): Update the use of SnackBarService ([#2113](https://github.com/kubeflow/katib/pull/2113) by [@orfeas-k](https://github.com/orfeas-k))
|
||||
- UI: Remove an unused import, EventV1beta1Api ([#2116](https://github.com/kubeflow/katib/pull/2116) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
|
||||
### SDK Improvements
|
||||
|
||||
- [SDK] Enable resource specification for trial containers ([#2192](https://github.com/kubeflow/katib/pull/2192) by [@droctothorpe](https://github.com/droctothorpe))
|
||||
- [SDK] Add namespace parameter to KatibClient ([#2183](https://github.com/kubeflow/katib/pull/2183) by [@droctothorpe](https://github.com/droctothorpe))
|
||||
- [SDK] Import all Kubernetes Models ([#2148](https://github.com/kubeflow/katib/pull/2148) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
|
||||
## Bug fixes
|
||||
|
||||
- Bug: Wait for the certs to be mounted inside the container ([#2213](https://github.com/kubeflow/katib/pull/2213) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
- Start waiting for certs to be ready before sending data to the channel ([#2215](https://github.com/kubeflow/katib/pull/2215) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
- E2E: Add additional checks to verify if the components are ready ([#2212](https://github.com/kubeflow/katib/pull/2212) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
- Remove a katib-webhook-cert Secret from components ([#2214](https://github.com/kubeflow/katib/pull/2214) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
- Skip to inject the metrics-collector pods to the Katib controller ([#2211](https://github.com/kubeflow/katib/pull/2211) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
- Sending an empty data to the certsReady channel ([#2196](https://github.com/kubeflow/katib/pull/2196) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
- Fix conformance docker image ([#2147](https://github.com/kubeflow/katib/pull/2147) by [@nagar-ajay](https://github.com/nagar-ajay))
|
||||
|
||||
## Documentation
|
||||
|
||||
- Add PITS Global Data Recovery Services to the adopters list ([#2160](https://github.com/kubeflow/katib/pull/2160) by [@ghost](https://github.com/ghost))
|
||||
- Add SDK Breaking Change to Changelog ([#2133](https://github.com/kubeflow/katib/pull/2133) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- Add Changelog for Katib v0.15.0 ([#2129](https://github.com/kubeflow/katib/pull/2129) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- Add Changelog for Katib v0.15.0-rc.1 ([#2123](https://github.com/kubeflow/katib/pull/2123) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- Add Changelog for Katib v0.15.0-rc.0 ([#2106](https://github.com/kubeflow/katib/pull/2106) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
|
||||
## Misc
|
||||
|
||||
- Upgrade Tensorflow version to v2.13.0 ([#2216](https://github.com/kubeflow/katib/pull/2216) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
- Upgrade Go version to v1.20 ([#2190](https://github.com/kubeflow/katib/pull/2190) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
- Replace grpc_health_probe with the built-in gRPC container probe feature ([#2189](https://github.com/kubeflow/katib/pull/2189) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
- Allow install binaries for the arm64 in the envtest ([#2188](https://github.com/kubeflow/katib/pull/2188) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
- Replace action to setup minikube with medyagh/setup-minikube ([#2178](https://github.com/kubeflow/katib/pull/2178) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
- Remove Charmed Operators for Katib ([#2161](https://github.com/kubeflow/katib/pull/2161) by [@ca-scribner](https://github.com/ca-scribner))
|
||||
- Namespace and trial pod annotations as CLI argument ([#2138](https://github.com/kubeflow/katib/pull/2138) by [@nagar-ajay](https://github.com/nagar-ajay))
|
||||
- Relax dependencies restriction for the gRPC libraries ([#2140](https://github.com/kubeflow/katib/pull/2140) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
- Add SDK Breaking Change to Changelog ([#2133](https://github.com/kubeflow/katib/pull/2133) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- Increase the free spaces in CI ([#2131](https://github.com/kubeflow/katib/pull/2131) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
- Reformat katib-operators ([#2114](https://github.com/kubeflow/katib/pull/2114) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
|
||||
[Full Changelog](https://github.com/kubeflow/katib/compare/v0.15.0...v0.16.0)
|
||||
|
||||
# [v0.16.0-rc.1](https://github.com/kubeflow/katib/tree/v0.16.0-rc.1) (2023-08-16)
|
||||
|
||||
## New Features
|
||||
|
||||
- Upgrade Tensorflow version to v2.13.0 ([#2216](https://github.com/kubeflow/katib/pull/2216) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
|
||||
## Bug Fixes
|
||||
|
||||
- Bug: Wait for the certs to be mounted inside the container ([#2213](https://github.com/kubeflow/katib/pull/2213) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
- Start waiting for certs to be ready before sending data to the channel ([#2215](https://github.com/kubeflow/katib/pull/2215) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
- E2E: Add additional checks to verify if the components are ready ([#2212](https://github.com/kubeflow/katib/pull/2212) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
- Remove a katib-webhook-cert Secret from components ([#2214](https://github.com/kubeflow/katib/pull/2214) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
- Skip to inject the metrics-collector pods to the Katib controller ([#2211](https://github.com/kubeflow/katib/pull/2211) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
|
||||
[Full Changelog](https://github.com/kubeflow/katib/compare/v0.16.0-rc.0...v0.16.0-rc.1)
|
||||
|
||||
# [v0.16.0-rc.0](https://github.com/kubeflow/katib/tree/v0.16.0-rc.0) (2023-08-05)
|
||||
|
||||
## Breaking Changes
|
||||
|
||||
- Implement KatibConfig API ([#2176](https://github.com/kubeflow/katib/pull/2176) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
- Drop Kubernetes v1.24 and support Kubernetes v1.27 ([#2182](https://github.com/kubeflow/katib/pull/2182) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
- Drop Kubernetes v1.23 and support Kubernetes v1.26 ([#2177](https://github.com/kubeflow/katib/pull/2177) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
- Change failurePolicy to Fail for Katib Webhooks ([#2018](https://github.com/kubeflow/katib/pull/2018) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
|
||||
## New Features
|
||||
|
||||
### Core Features
|
||||
|
||||
- Consolidate the Katib Cert Generator to the Katib Controller ([#2185](https://github.com/kubeflow/katib/pull/2185) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
- Containerize tests for Katib Conformance ([#2146](https://github.com/kubeflow/katib/pull/2146) by [@nagar-ajay](https://github.com/nagar-ajay))
|
||||
|
||||
### UI Improvements
|
||||
|
||||
- [UI] Default Resume Policy to never from UI ([#2195](https://github.com/kubeflow/katib/pull/2195) by [@mChowdhury-91](https://github.com/mChowdhury-91))
|
||||
- [UI] Remove Deprecated Katib UI ([#2179](https://github.com/kubeflow/katib/pull/2179) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- [UI] Fix Trial Logs when Kubernetes Job Fails ([#2164](https://github.com/kubeflow/katib/pull/2164) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- kwa(front): Support all namespaces ([#2119](https://github.com/kubeflow/katib/pull/2119) by [@elenzio9](https://github.com/elenzio9))
|
||||
- kwa(front): Update the use of SnackBarService ([#2113](https://github.com/kubeflow/katib/pull/2113) by [@orfeas-k](https://github.com/orfeas-k))
|
||||
- UI: Remove an unused import, EventV1beta1Api ([#2116](https://github.com/kubeflow/katib/pull/2116) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
|
||||
### SDK Improvements
|
||||
|
||||
- [SDK] Enable resource specification for trial containers ([#2192](https://github.com/kubeflow/katib/pull/2192) by [@droctothorpe](https://github.com/droctothorpe))
|
||||
- [SDK] Add namespace parameter to KatibClient ([#2183](https://github.com/kubeflow/katib/pull/2183) by [@droctothorpe](https://github.com/droctothorpe))
|
||||
- [SDK] Import all Kubernetes Models ([#2148](https://github.com/kubeflow/katib/pull/2148) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
|
||||
## Bug fixes
|
||||
|
||||
- Sending an empty data to the certsReady channel ([#2196](https://github.com/kubeflow/katib/pull/2196) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
- Fix conformance docker image ([#2147](https://github.com/kubeflow/katib/pull/2147) by [@nagar-ajay](https://github.com/nagar-ajay))
|
||||
|
||||
## Documentation
|
||||
|
||||
- Add PITS Global Data Recovery Services to the adopters list ([#2160](https://github.com/kubeflow/katib/pull/2160) by [@ghost](https://github.com/ghost))
|
||||
- Add SDK Breaking Change to Changelog ([#2133](https://github.com/kubeflow/katib/pull/2133) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- Add Changelog for Katib v0.15.0 ([#2129](https://github.com/kubeflow/katib/pull/2129) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- Add Changelog for Katib v0.15.0-rc.1 ([#2123](https://github.com/kubeflow/katib/pull/2123) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- Add Changelog for Katib v0.15.0-rc.0 ([#2106](https://github.com/kubeflow/katib/pull/2106) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
|
||||
## Misc
|
||||
|
||||
- Upgrade Go version to v1.20 ([#2190](https://github.com/kubeflow/katib/pull/2190) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
- Replace grpc_health_probe with the built-in gRPC container probe feature ([#2189](https://github.com/kubeflow/katib/pull/2189) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
- Allow install binaries for the arm64 in the envtest ([#2188](https://github.com/kubeflow/katib/pull/2188) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
- Replace action to setup minikube with medyagh/setup-minikube ([#2178](https://github.com/kubeflow/katib/pull/2178) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
- Remove Charmed Operators for Katib ([#2161](https://github.com/kubeflow/katib/pull/2161) by [@ca-scribner](https://github.com/ca-scribner))
|
||||
- Namespace and trial pod annotations as CLI argument ([#2138](https://github.com/kubeflow/katib/pull/2138) by [@nagar-ajay](https://github.com/nagar-ajay))
|
||||
- Relax dependencies restriction for the gRPC libraries ([#2140](https://github.com/kubeflow/katib/pull/2140) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
- Add SDK Breaking Change to Changelog ([#2133](https://github.com/kubeflow/katib/pull/2133) by [@andreyvelich](https://github.com/andreyvelich))
|
||||
- Increase the free spaces in CI ([#2131](https://github.com/kubeflow/katib/pull/2131) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
- Reformat katib-operators ([#2114](https://github.com/kubeflow/katib/pull/2114) by [@tenzen-y](https://github.com/tenzen-y))
|
||||
|
||||
[Full Changelog](https://github.com/kubeflow/katib/compare/v0.15.0...v0.16.0-rc.0)
|
||||
|
||||
# [v0.15.0](https://github.com/kubeflow/katib/tree/v0.15.0) (2023-03-22)
|
||||
|
||||
## Breaking Changes
|
||||
|
||||
|
@ -121,7 +622,7 @@
|
|||
|
||||
[Full Changelog](https://github.com/kubeflow/katib/compare/v0.14.0...v0.15.0)
|
||||
|
||||
## [v0.15.0-rc.1](https://github.com/kubeflow/katib/tree/v0.15.0-rc.1) (2023-02-15)
|
||||
# [v0.15.0-rc.1](https://github.com/kubeflow/katib/tree/v0.15.0-rc.1) (2023-02-15)
|
||||
|
||||
## New Features
|
||||
|
||||
|
@ -133,7 +634,7 @@
|
|||
|
||||
[Full Changelog](https://github.com/kubeflow/katib/compare/v0.15.0-rc.0...v0.15.0-rc.1)
|
||||
|
||||
## [v0.15.0-rc.0](https://github.com/kubeflow/katib/tree/v0.15.0-rc.0) (2023-01-27)
|
||||
# [v0.15.0-rc.0](https://github.com/kubeflow/katib/tree/v0.15.0-rc.0) (2023-01-27)
|
||||
|
||||
## Breaking Changes
|
||||
|
||||
|
@ -253,7 +754,7 @@
|
|||
|
||||
[Full Changelog](https://github.com/kubeflow/katib/compare/v0.14.0...v0.15.0-rc.0)
|
||||
|
||||
## [v0.14.0](https://github.com/kubeflow/katib/tree/v0.14.0) (2022-08-18)
|
||||
# [v0.14.0](https://github.com/kubeflow/katib/tree/v0.14.0) (2022-08-18)
|
||||
|
||||
## New Features
|
||||
|
||||
|
@ -314,7 +815,7 @@
|
|||
|
||||
[Full Changelog](https://github.com/kubeflow/katib/compare/v0.13.0...v0.14.0)
|
||||
|
||||
## [v0.13.0](https://github.com/kubeflow/katib/tree/v0.13.0) (2022-03-04)
|
||||
# [v0.13.0](https://github.com/kubeflow/katib/tree/v0.13.0) (2022-03-04)
|
||||
|
||||
## New Features
|
||||
|
||||
|
@ -389,7 +890,7 @@
|
|||
|
||||
[Full Changelog](https://github.com/kubeflow/katib/compare/v0.12.0...v0.13.0)
|
||||
|
||||
## [v0.13.0-rc.1](https://github.com/kubeflow/katib/tree/v0.13.0-rc.1) (2022-02-15)
|
||||
# [v0.13.0-rc.1](https://github.com/kubeflow/katib/tree/v0.13.0-rc.1) (2022-02-15)
|
||||
|
||||
## Bug fixes
|
||||
|
||||
|
@ -398,7 +899,7 @@
|
|||
|
||||
[Full Changelog](https://github.com/kubeflow/katib/compare/v0.13.0-rc.0...v0.13.0-rc.1)
|
||||
|
||||
## [v0.13.0-rc.0](https://github.com/kubeflow/katib/tree/v0.13.0-rc.0) (2022-01-25)
|
||||
# [v0.13.0-rc.0](https://github.com/kubeflow/katib/tree/v0.13.0-rc.0) (2022-01-25)
|
||||
|
||||
## New Features
|
||||
|
||||
|
@ -471,7 +972,7 @@
|
|||
|
||||
[Full Changelog](https://github.com/kubeflow/katib/compare/v0.12.0...v0.13.0-rc.0)
|
||||
|
||||
## [v0.12.0](https://github.com/kubeflow/katib/tree/v0.12.0) (2021-10-05)
|
||||
# [v0.12.0](https://github.com/kubeflow/katib/tree/v0.12.0) (2021-10-05)
|
||||
|
||||
## New Features
|
||||
|
||||
|
@ -527,7 +1028,7 @@
|
|||
|
||||
[Full Changelog](https://github.com/kubeflow/katib/compare/v0.11.1...v0.12.0)
|
||||
|
||||
## [v0.12.0-rc.1](https://github.com/kubeflow/katib/tree/v0.12.0-rc.1) (2021-09-07)
|
||||
# [v0.12.0-rc.1](https://github.com/kubeflow/katib/tree/v0.12.0-rc.1) (2021-09-07)
|
||||
|
||||
## Bug Fixes
|
||||
|
||||
|
@ -536,7 +1037,7 @@
|
|||
|
||||
[Full Changelog](https://github.com/kubeflow/katib/compare/v0.12.0-rc.0...v0.12.0-rc.1)
|
||||
|
||||
## [v0.12.0-rc.0](https://github.com/kubeflow/katib/tree/v0.12.0-rc.0) (2021-08-19)
|
||||
# [v0.12.0-rc.0](https://github.com/kubeflow/katib/tree/v0.12.0-rc.0) (2021-08-19)
|
||||
|
||||
## New Features
|
||||
|
||||
|
@ -590,7 +1091,7 @@
|
|||
|
||||
[Full Changelog](https://github.com/kubeflow/katib/compare/v0.11.1...v0.12.0-rc.0)
|
||||
|
||||
## [v0.11.1](https://github.com/kubeflow/katib/tree/v0.11.1) (2021-06-09)
|
||||
# [v0.11.1](https://github.com/kubeflow/katib/tree/v0.11.1) (2021-06-09)
|
||||
|
||||
## Bug fixes
|
||||
|
||||
|
@ -604,7 +1105,7 @@
|
|||
|
||||
[Full Changelog](https://github.com/kubeflow/katib/compare/v0.11.0...v0.11.1)
|
||||
|
||||
## [v0.11.0](https://github.com/kubeflow/katib/tree/v0.11.0) (2021-03-22)
|
||||
# [v0.11.0](https://github.com/kubeflow/katib/tree/v0.11.0) (2021-03-22)
|
||||
|
||||
## New Features
|
||||
|
||||
|
@ -661,7 +1162,7 @@
|
|||
|
||||
[Full Changelog](https://github.com/kubeflow/katib/compare/v0.10.1...v0.11.0)
|
||||
|
||||
## [v0.10.1](https://github.com/kubeflow/katib/tree/v0.10.1) (2021-03-02)
|
||||
# [v0.10.1](https://github.com/kubeflow/katib/tree/v0.10.1) (2021-03-02)
|
||||
|
||||
## Features and Bug Fixes
|
||||
|
||||
|
@ -695,7 +1196,7 @@
|
|||
|
||||
[Full Changelog](https://github.com/kubeflow/katib/compare/v0.10.0...v0.10.1)
|
||||
|
||||
## [v0.10.0](https://github.com/kubeflow/katib/tree/v0.10.0) (2020-11-07)
|
||||
# [v0.10.0](https://github.com/kubeflow/katib/tree/v0.10.0) (2020-11-07)
|
||||
|
||||
## New Features
|
||||
|
||||
|
@ -739,7 +1240,7 @@
|
|||
|
||||
[Full Changelog](https://github.com/kubeflow/katib/compare/v0.9.0...v0.10.0)
|
||||
|
||||
## [v0.9.0](https://github.com/kubeflow/katib/tree/v0.9.0) (2020-06-10)
|
||||
# [v0.9.0](https://github.com/kubeflow/katib/tree/v0.9.0) (2020-06-10)
|
||||
|
||||
## Features and Bug Fixes
|
||||
|
||||
|
@ -996,7 +1497,7 @@
|
|||
|
||||
[Full Changelog](https://github.com/kubeflow/katib/compare/v0.6.0-rc.0...v0.9.0)
|
||||
|
||||
## [v0.6.0-rc.0](https://github.com/kubeflow/katib/tree/v0.6.0-rc.0) (2019-06-28)
|
||||
# [v0.6.0-rc.0](https://github.com/kubeflow/katib/tree/v0.6.0-rc.0) (2019-06-28)
|
||||
|
||||
## Features and Bug Fixes
|
||||
|
||||
|
@ -1251,7 +1752,7 @@
|
|||
|
||||
[Full Changelog](https://github.com/kubeflow/katib/compare/826657c14602a3f36263f3d6769451af0a75d18a...v0.6.0-rc.0)
|
||||
|
||||
## [0.2](https://github.com/kubeflow/katib/tree/0.2) (2018-08-20)
|
||||
# [0.2](https://github.com/kubeflow/katib/tree/0.2) (2018-08-20)
|
||||
|
||||
## Features
|
||||
|
||||
|
@ -1278,7 +1779,7 @@
|
|||
|
||||
[Full Changelog](https://github.com/kubeflow/katib/compare/v0.1.2-alpha...826657c14602a3f36263f3d6769451af0a75d18a)
|
||||
|
||||
## [v0.1.2-alpha](https://github.com/kubeflow/katib/tree/v0.1.2-alpha) (2018-06-05)
|
||||
# [v0.1.2-alpha](https://github.com/kubeflow/katib/tree/v0.1.2-alpha) (2018-06-05)
|
||||
|
||||
[Full Changelog](https://github.com/kubeflow/katib/compare/v0.1.1-alpha...v0.1.2-alpha)
|
||||
|
||||
|
@ -1309,7 +1810,7 @@
|
|||
- Refine API [\#74](https://github.com/kubeflow/katib/pull/74) ([YujiOshima](https://github.com/YujiOshima))
|
||||
- worker: Rename worker_interface to worker [\#70](https://github.com/kubeflow/katib/pull/70) ([gaocegege](https://github.com/gaocegege))
|
||||
|
||||
## [v0.1.1-alpha](https://github.com/kubeflow/katib/tree/v0.1.1-alpha) (2018-04-24)
|
||||
# [v0.1.1-alpha](https://github.com/kubeflow/katib/tree/v0.1.1-alpha) (2018-04-24)
|
||||
|
||||
[Full Changelog](https://github.com/kubeflow/katib/compare/v0.1.0-alpha...v0.1.1-alpha)
|
||||
|
||||
|
@ -1347,7 +1848,7 @@
|
|||
- New db log schema [\#35](https://github.com/kubeflow/katib/pull/35) ([YujiOshima](https://github.com/YujiOshima))
|
||||
- Fix CI failures [\#27](https://github.com/kubeflow/katib/pull/27) ([gaocegege](https://github.com/gaocegege))
|
||||
|
||||
## [v0.1.0-alpha](https://github.com/kubeflow/katib/tree/v0.1.0-alpha) (2018-04-10)
|
||||
# [v0.1.0-alpha](https://github.com/kubeflow/katib/tree/v0.1.0-alpha) (2018-04-10)
|
||||
|
||||
**Closed issues:**
|
||||
|
||||
|
|
|
@ -0,0 +1,43 @@
|
|||
cff-version: 1.2.0
|
||||
message: "If you use Katib in your scientific publication, please cite it as below."
|
||||
authors:
|
||||
- family-names: "George"
|
||||
given-names: "Johnu"
|
||||
- family-names: "Gao"
|
||||
given-names: "Ce"
|
||||
- family-names: "Liu"
|
||||
given-names: "Richard"
|
||||
- family-names: "Liu"
|
||||
given-names: "Hou Gang"
|
||||
- family-names: "Tang"
|
||||
given-names: "Yuan"
|
||||
- family-names: "Pydipaty"
|
||||
given-names: "Ramdoot"
|
||||
- family-names: "Saha"
|
||||
given-names: "Amit Kumar"
|
||||
title: "Katib"
|
||||
type: software
|
||||
repository-code: "https://github.com/kubeflow/katib"
|
||||
preferred-citation:
|
||||
type: misc
|
||||
title: "A Scalable and Cloud-Native Hyperparameter Tuning System"
|
||||
authors:
|
||||
- family-names: "George"
|
||||
given-names: "Johnu"
|
||||
- family-names: "Gao"
|
||||
given-names: "Ce"
|
||||
- family-names: "Liu"
|
||||
given-names: "Richard"
|
||||
- family-names: "Liu"
|
||||
given-names: "Hou Gang"
|
||||
- family-names: "Tang"
|
||||
given-names: "Yuan"
|
||||
- family-names: "Pydipaty"
|
||||
given-names: "Ramdoot"
|
||||
- family-names: "Saha"
|
||||
given-names: "Amit Kumar"
|
||||
year: 2020
|
||||
url: "https://arxiv.org/abs/2006.02085"
|
||||
identifiers:
|
||||
- type: "other"
|
||||
value: "arXiv:2006.02085"
|
|
@ -2,31 +2,35 @@
|
|||
|
||||
This developer guide is for people who want to contribute to the Katib project.
|
||||
If you're interested in using Katib in your machine learning project,
|
||||
see the following user guides:
|
||||
see the following guides:
|
||||
|
||||
- [Concepts](https://www.kubeflow.org/docs/components/katib/overview/)
|
||||
in Katib, hyperparameter tuning, and neural architecture search.
|
||||
- [Getting started with Katib](https://kubeflow.org/docs/components/katib/hyperparameter/).
|
||||
- Detailed guide to [configuring and running a Katib
|
||||
experiment](https://kubeflow.org/docs/components/katib/experiment/).
|
||||
- [How to configure Katib Experiment](https://kubeflow.org/docs/components/katib/experiment/).
|
||||
- [Katib architecture and concepts](https://www.kubeflow.org/docs/components/katib/reference/architecture/)
|
||||
for hyperparameter tuning and neural architecture search.
|
||||
|
||||
## Requirements
|
||||
|
||||
- [Go](https://golang.org/) (1.20 or later)
|
||||
- [Docker](https://docs.docker.com/) (20.10 or later)
|
||||
- [Go](https://golang.org/) (1.22 or later)
|
||||
- [Docker](https://docs.docker.com/) (24.0 or later)
|
||||
- [Docker Buildx](https://docs.docker.com/build/buildx/) (0.8.0 or later)
|
||||
- [Java](https://docs.oracle.com/javase/8/docs/technotes/guides/install/install_overview.html) (8 or later)
|
||||
- [Python](https://www.python.org/) (3.10 or later)
|
||||
- [Python](https://www.python.org/) (3.11 or later)
|
||||
- [kustomize](https://kustomize.io/) (4.0.5 or later)
|
||||
- [pre-commit](https://pre-commit.com/)
|
||||
|
||||
## Build from source code
|
||||
|
||||
Check source code as follows:
|
||||
**Note** that your Docker Desktop should
|
||||
[enable containerd image store](https://docs.docker.com/desktop/containerd/#enable-the-containerd-image-store)
|
||||
to build multi-arch images. Check source code as follows:
|
||||
|
||||
```bash
|
||||
make build REGISTRY=<image-registry> TAG=<image-tag>
|
||||
```
|
||||
|
||||
If you are using an Apple Silicon machine and encounter the "rosetta error: bss_size overflow," go to Docker Desktop -> General and uncheck "Use Rosetta for x86_64/amd64 emulation on Apple Silicon."
|
||||
|
||||
To use your custom images for the Katib components, modify
|
||||
[Kustomization file](https://github.com/kubeflow/katib/blob/master/manifests/v1beta1/installs/katib-standalone/kustomization.yaml)
|
||||
and [Katib Config](https://github.com/kubeflow/katib/blob/master/manifests/v1beta1/installs/katib-standalone/katib-config.yaml)
|
||||
|
@ -43,6 +47,25 @@ You can undeploy Katib v1beta1 manifests from a Kubernetes cluster as follows:
|
|||
make undeploy
|
||||
```
|
||||
|
||||
## Technical and style guide
|
||||
|
||||
The following guidelines apply primarily to Katib,
|
||||
but other projects like [Training Operator](https://github.com/kubeflow/training-operator) might also adhere to them.
|
||||
|
||||
## Go Development
|
||||
|
||||
When coding:
|
||||
|
||||
- Follow [effective go](https://go.dev/doc/effective_go) guidelines.
|
||||
- Run locally [`make check`](https://github.com/kubeflow/katib/blob/46173463027e4fd2e604e25d7075b2b31a702049/Makefile#L31)
|
||||
to verify if changes follow best practices before submitting PRs.
|
||||
|
||||
Testing:
|
||||
|
||||
- Use [`cmp.Diff`](https://pkg.go.dev/github.com/google/go-cmp/cmp#Diff) instead of `reflect.Equal`, to provide useful comparisons.
|
||||
- Define test cases as maps instead of slices to avoid dependencies on the running order.
|
||||
The map key should be equal to the test case name.
|
||||
|
||||
## Modify controller APIs
|
||||
|
||||
If you want to modify Katib controller APIs, you have to
|
||||
|
@ -58,20 +81,17 @@ make generate
|
|||
Below is a list of command-line flags accepted by Katib controller:
|
||||
|
||||
| Name | Type | Default | Description |
|
||||
|--------------|--------|---------|----------------------------------------------------------------------------------------------------------------------------------|
|
||||
| ------------ | ------ | ------- | -------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| katib-config | string | "" | The katib-controller will load its initial configuration from this file. Omit this flag to use the default configuration values. |
|
||||
|
||||
## DB Manager Flags
|
||||
|
||||
Below is a list of command-line flags accepted by Katib DB Manager:
|
||||
|
||||
| Name | Type | Default | Description |
|
||||
| --------------- | ------------- | ------- | ------------------------------------------------------- |
|
||||
| connect-timeout | time.Duration | 60s | Timeout before calling error during database connection |
|
||||
|
||||
## Workflow design
|
||||
|
||||
Please see [workflow-design.md](./workflow-design.md).
|
||||
| Name | Type | Default | Description |
|
||||
| --------------- | ------------- | -------------| ------------------------------------------------------------------- |
|
||||
| connect-timeout | time.Duration | 60s | Timeout before calling error during database connection |
|
||||
| listen-address | string | 0.0.0.0:6789 | The network interface or IP address to receive incoming connections |
|
||||
|
||||
## Katib admission webhooks
|
||||
|
||||
|
@ -89,7 +109,7 @@ Katib uses three [Kubernetes admission webhooks](https://kubernetes.io/docs/refe
|
|||
1. `mutator.pod.katib.kubeflow.org` - Mutating admission webhook to inject the metrics
|
||||
collector sidecar container to the training pod. Learn more about the Katib's
|
||||
metrics collector in the
|
||||
[Kubeflow documentation](https://www.kubeflow.org/docs/components/katib/experiment/#metrics-collector).
|
||||
[Kubeflow documentation](https://www.kubeflow.org/docs/components/katib/user-guides/metrics-collector/).
|
||||
|
||||
You can find the YAMLs for the Katib webhooks
|
||||
[here](../manifests/v1beta1/components/webhook/webhooks.yaml).
|
||||
|
@ -107,7 +127,6 @@ Once Katib is deployed in the Kubernetes cluster, the `cert-generator` follows t
|
|||
- Generate the self-signed certificate and private key.
|
||||
|
||||
- Update a Kubernetes Secret with the self-signed TLS certificate and private key.
|
||||
|
||||
- Patch the webhooks with the `CABundle`.
|
||||
|
||||
Once the `cert-generator` has finished, the Katib controller starts registering controllers such as `experiment-controller` with the manager.
|
||||
|
@ -128,3 +147,21 @@ Please see [Katib UI README](../pkg/ui/v1beta1).
|
|||
## Design proposals
|
||||
|
||||
Please see [proposals](./proposals).
|
||||
|
||||
## Code Style
|
||||
|
||||
### pre-commit
|
||||
|
||||
Make sure to install [pre-commit](https://pre-commit.com/) (`pip install
|
||||
pre-commit`) and run `pre-commit install` from the root of the repository at
|
||||
least once before creating git commits.
|
||||
|
||||
The pre-commit [hooks](../.pre-commit-config.yaml) ensure code quality and
|
||||
consistency. They are executed in CI. PRs that fail to comply with the hooks
|
||||
will not be able to pass the corresponding CI gate. The hooks are only executed
|
||||
against staged files unless you run `pre-commit run --all`, in which case,
|
||||
they'll be executed against every file in the repository.
|
||||
|
||||
Specific programmatically generated files listed in the `exclude` field in
|
||||
[.pre-commit-config.yaml](../.pre-commit-config.yaml) are deliberately excluded
|
||||
from the hooks.
|
64
Makefile
|
@ -5,15 +5,13 @@ HAS_SETUP_ENVTEST := $(shell command -v setup-envtest;)
|
|||
HAS_MOCKGEN := $(shell command -v mockgen;)
|
||||
|
||||
COMMIT := v1beta1-$(shell git rev-parse --short=7 HEAD)
|
||||
KATIB_REGISTRY := docker.io/kubeflowkatib
|
||||
CPU_ARCH ?= amd64
|
||||
ENVTEST_K8S_VERSION ?= 1.27
|
||||
MOCKGEN_VERSION ?= $(shell grep 'github.com/golang/mock' go.mod | cut -d ' ' -f 2)
|
||||
KATIB_REGISTRY := ghcr.io/kubeflow/katib
|
||||
CPU_ARCH ?= linux/amd64,linux/arm64
|
||||
ENVTEST_K8S_VERSION ?= 1.31
|
||||
MOCKGEN_VERSION ?= $(shell grep 'go.uber.org/mock' go.mod | cut -d ' ' -f 2)
|
||||
GO_VERSION=$(shell grep '^go' go.mod | cut -d ' ' -f 2)
|
||||
GOPATH ?= $(shell go env GOPATH)
|
||||
|
||||
# for pytest
|
||||
PYTHONPATH := $(PYTHONPATH):$(CURDIR)/pkg/apis/manager/v1beta1/python:$(CURDIR)/pkg/apis/manager/health/python
|
||||
PYTHONPATH := $(PYTHONPATH):$(CURDIR)/pkg/metricscollector/v1beta1/common:$(CURDIR)/pkg/metricscollector/v1beta1/tfevent-metricscollector
|
||||
TEST_TENSORFLOW_EVENT_FILE_PATH ?= $(CURDIR)/test/unit/v1beta1/metricscollector/testdata/tfevent-metricscollector/logs
|
||||
|
||||
# Run tests
|
||||
|
@ -23,7 +21,7 @@ test: envtest
|
|||
|
||||
envtest:
|
||||
ifndef HAS_SETUP_ENVTEST
|
||||
go install sigs.k8s.io/controller-runtime/tools/setup-envtest@935faeba70039b5403616e73f109f4b6b1115b9f #v0.15.0
|
||||
go install sigs.k8s.io/controller-runtime/tools/setup-envtest@release-0.19
|
||||
$(info "setup-envtest has been installed")
|
||||
endif
|
||||
$(info "setup-envtest has already installed")
|
||||
|
@ -35,7 +33,7 @@ fmt:
|
|||
|
||||
lint:
|
||||
ifndef HAS_LINT
|
||||
go install github.com/golangci/golangci-lint/cmd/golangci-lint@v1.53.3
|
||||
go install github.com/golangci/golangci-lint/cmd/golangci-lint@v1.64.7
|
||||
$(info "golangci-lint has been installed")
|
||||
endif
|
||||
hack/verify-golangci-lint.sh
|
||||
|
@ -81,10 +79,14 @@ endif
|
|||
sync-go-mod:
|
||||
go mod tidy -go $(GO_VERSION)
|
||||
|
||||
.PHONY: go-mod-download
|
||||
go-mod-download:
|
||||
go mod download
|
||||
|
||||
CONTROLLER_GEN = $(shell pwd)/bin/controller-gen
|
||||
.PHONY: controller-gen
|
||||
controller-gen:
|
||||
@GOBIN=$(shell pwd)/bin GO111MODULE=on go install sigs.k8s.io/controller-tools/cmd/controller-gen@v0.10.0
|
||||
@GOBIN=$(shell pwd)/bin GO111MODULE=on go install sigs.k8s.io/controller-tools/cmd/controller-gen@v0.16.5
|
||||
|
||||
# Run this if you update any existing controller APIs.
|
||||
# 1. Generate deepcopy, clientset, listers, informers for the APIs (hack/update-codegen.sh)
|
||||
|
@ -92,18 +94,14 @@ controller-gen:
|
|||
# 3. Generate Python SDK for Katib (hack/gen-python-sdk/gen-sdk.sh)
|
||||
# 4. Generate gRPC manager APIs (pkg/apis/manager/v1beta1/build.sh and pkg/apis/manager/health/build.sh)
|
||||
# 5. Generate Go mock codes
|
||||
generate: controller-gen
|
||||
ifndef GOPATH
|
||||
$(error GOPATH not defined, please define GOPATH. Run "go help gopath" to learn more about GOPATH)
|
||||
endif
|
||||
generate: go-mod-download controller-gen
|
||||
ifndef HAS_MOCKGEN
|
||||
go install github.com/golang/mock/mockgen@$(MOCKGEN_VERSION)
|
||||
go install go.uber.org/mock/mockgen@$(MOCKGEN_VERSION)
|
||||
$(info "mockgen has been installed")
|
||||
endif
|
||||
go generate ./pkg/... ./cmd/...
|
||||
hack/gen-python-sdk/gen-sdk.sh
|
||||
pkg/apis/manager/v1beta1/build.sh
|
||||
pkg/apis/manager/health/build.sh
|
||||
hack/update-proto.sh
|
||||
hack/update-mockgen.sh
|
||||
|
||||
# Build images for the Katib v1beta1 components.
|
||||
|
@ -121,14 +119,12 @@ push-latest: generate
|
|||
bash scripts/v1beta1/push.sh $(KATIB_REGISTRY) $(COMMIT)
|
||||
|
||||
# Build and push Katib images for the given tag.
|
||||
push-tag: generate
|
||||
push-tag:
|
||||
ifeq ($(TAG),)
|
||||
$(error TAG must be set. Usage: make push-tag TAG=<release-tag>)
|
||||
endif
|
||||
bash scripts/v1beta1/build.sh $(KATIB_REGISTRY) $(TAG) $(CPU_ARCH)
|
||||
bash scripts/v1beta1/build.sh $(KATIB_REGISTRY) $(COMMIT) $(CPU_ARCH)
|
||||
bash scripts/v1beta1/push.sh $(KATIB_REGISTRY) $(TAG)
|
||||
bash scripts/v1beta1/push.sh $(KATIB_REGISTRY) $(COMMIT)
|
||||
|
||||
# Release a new version of Katib.
|
||||
release:
|
||||
|
@ -157,7 +153,6 @@ update-boilerplate:
|
|||
prepare-pytest:
|
||||
pip install --prefer-binary -r test/unit/v1beta1/requirements.txt
|
||||
pip install --prefer-binary -r cmd/suggestion/hyperopt/v1beta1/requirements.txt
|
||||
pip install --prefer-binary -r cmd/suggestion/skopt/v1beta1/requirements.txt
|
||||
pip install --prefer-binary -r cmd/suggestion/optuna/v1beta1/requirements.txt
|
||||
pip install --prefer-binary -r cmd/suggestion/hyperband/v1beta1/requirements.txt
|
||||
pip install --prefer-binary -r cmd/suggestion/nas/enas/v1beta1/requirements.txt
|
||||
|
@ -165,13 +160,34 @@ prepare-pytest:
|
|||
pip install --prefer-binary -r cmd/suggestion/pbt/v1beta1/requirements.txt
|
||||
pip install --prefer-binary -r cmd/earlystopping/medianstop/v1beta1/requirements.txt
|
||||
pip install --prefer-binary -r cmd/metricscollector/v1beta1/tfevent-metricscollector/requirements.txt
|
||||
# `TypeIs` was introduced in typing-extensions 4.10.0, and torch 2.6.0 requires typing-extensions>=4.10.0.
|
||||
# REF: https://github.com/kubeflow/katib/pull/2504
|
||||
# TODO (tenzen-y): Once we upgrade the libraries that depend on typing-extensions==4.5.0, we can remove this line.
|
||||
pip install typing-extensions==4.10.0
|
||||
|
||||
prepare-pytest-testdata:
|
||||
ifeq ("$(wildcard $(TEST_TENSORFLOW_EVENT_FILE_PATH))", "")
|
||||
python examples/v1beta1/trial-images/tf-mnist-with-summaries/mnist.py --epochs 5 --batch-size 200 --log-path $(TEST_TENSORFLOW_EVENT_FILE_PATH)
|
||||
endif
|
||||
|
||||
# TODO(Electronic-Waste): Remove the import rewrite when protobuf supports `python_package` option.
|
||||
# REF: https://github.com/protocolbuffers/protobuf/issues/7061
|
||||
pytest: prepare-pytest prepare-pytest-testdata
|
||||
PYTHONPATH=$(PYTHONPATH) pytest ./test/unit/v1beta1/suggestion
|
||||
PYTHONPATH=$(PYTHONPATH) pytest ./test/unit/v1beta1/earlystopping
|
||||
PYTHONPATH=$(PYTHONPATH) pytest ./test/unit/v1beta1/metricscollector
|
||||
pytest ./test/unit/v1beta1/suggestion --ignore=./test/unit/v1beta1/suggestion/test_skopt_service.py
|
||||
pytest ./test/unit/v1beta1/earlystopping
|
||||
pytest ./test/unit/v1beta1/metricscollector
|
||||
cp ./pkg/apis/manager/v1beta1/python/api_pb2.py ./sdk/python/v1beta1/kubeflow/katib/katib_api_pb2.py
|
||||
cp ./pkg/apis/manager/v1beta1/python/api_pb2_grpc.py ./sdk/python/v1beta1/kubeflow/katib/katib_api_pb2_grpc.py
|
||||
sed -i "s/api_pb2/kubeflow\.katib\.katib_api_pb2/g" ./sdk/python/v1beta1/kubeflow/katib/katib_api_pb2_grpc.py
|
||||
pytest ./sdk/python/v1beta1/kubeflow/katib
|
||||
rm ./sdk/python/v1beta1/kubeflow/katib/katib_api_pb2.py ./sdk/python/v1beta1/kubeflow/katib/katib_api_pb2_grpc.py
|
||||
|
||||
# The skopt service doesn't work appropriately with Python 3.11.
|
||||
# So, we need to run the test with Python 3.9.
|
||||
# TODO (tenzen-y): Once we stop supporting skopt, we can remove this test.
|
||||
# REF: https://github.com/kubeflow/katib/issues/2280
|
||||
pytest-skopt:
|
||||
pip install six
|
||||
pip install --prefer-binary -r test/unit/v1beta1/requirements.txt
|
||||
pip install --prefer-binary -r cmd/suggestion/skopt/v1beta1/requirements.txt
|
||||
pytest ./test/unit/v1beta1/suggestion/test_skopt_service.py
|
||||
|
|
4
OWNERS
|
@ -1,8 +1,10 @@
|
|||
approvers:
|
||||
- andreyvelich
|
||||
- gaocegege
|
||||
- tenzen-y
|
||||
- johnugeorge
|
||||
reviewers:
|
||||
- anencore94
|
||||
- c-bata
|
||||
- Electronic-Waste
|
||||
emeritus_approvers:
|
||||
- tenzen-y
|
||||
|
|
163
README.md
|
@ -1,15 +1,18 @@
|
|||
<h1 align="center">
|
||||
<img src="./docs/images/logo-title.png" alt="logo" width="200">
|
||||
<br>
|
||||
</h1>
|
||||
# Kubeflow Katib
|
||||
|
||||
[](https://github.com/kubeflow/katib/actions/workflows/test-go.yaml?branch=master)
|
||||
[](https://coveralls.io/github/kubeflow/katib?branch=master)
|
||||
[](https://goreportcard.com/report/github.com/kubeflow/katib)
|
||||
[](https://github.com/kubeflow/katib/releases)
|
||||
[](https://kubeflow.slack.com/archives/C018PMV53NW)
|
||||
[](https://www.kubeflow.org/docs/about/community/#kubeflow-slack-channels)
|
||||
[](https://www.bestpractices.dev/projects/9941)
|
||||
|
||||
Katib is a Kubernetes-native project for automated machine learning (AutoML).
|
||||
<h1 align="center">
|
||||
<img src="./docs/images/logo-title.png" alt="logo" width="200">
|
||||
<br>
|
||||
</h1>
|
||||
|
||||
Kubeflow Katib is a Kubernetes-native project for automated machine learning (AutoML).
|
||||
Katib supports
|
||||
[Hyperparameter Tuning](https://en.wikipedia.org/wiki/Hyperparameter_optimization),
|
||||
[Early Stopping](https://en.wikipedia.org/wiki/Early_stopping) and
|
||||
|
@ -18,8 +21,7 @@ Katib supports
|
|||
Katib is a project that is agnostic to machine learning (ML) frameworks.
|
||||
It can tune hyperparameters of applications written in any language of the
|
||||
users’ choice and natively supports many ML frameworks, such as
|
||||
[TensorFlow](https://www.tensorflow.org/), [Apache MXNet](https://mxnet.apache.org/),
|
||||
[PyTorch](https://pytorch.org/), [XGBoost](https://xgboost.readthedocs.io/en/latest/), and others.
|
||||
[TensorFlow](https://www.tensorflow.org/), [PyTorch](https://pytorch.org/), [XGBoost](https://xgboost.readthedocs.io/en/latest/), and others.
|
||||
|
||||
Katib can perform training jobs using any Kubernetes
|
||||
[Custom Resources](https://www.kubeflow.org/docs/components/katib/trial-template/)
|
||||
|
@ -29,13 +31,13 @@ and many more.
|
|||
|
||||
Katib stands for `secretary` in Arabic.
|
||||
|
||||
# Search Algorithms
|
||||
## Search Algorithms
|
||||
|
||||
Katib supports several search algorithms. Follow the
|
||||
[Kubeflow documentation](https://www.kubeflow.org/docs/components/katib/experiment/#search-algorithms-in-detail)
|
||||
[Kubeflow documentation](https://www.kubeflow.org/docs/components/katib/user-guides/hp-tuning/configure-algorithm/#hp-tuning-algorithms)
|
||||
to know more about each algorithm and check the
|
||||
[Suggestion service guide](/docs/new-algorithm-service.md) to implement your
|
||||
custom algorithm.
|
||||
[this guide](https://www.kubeflow.org/docs/components/katib/user-guides/hp-tuning/configure-algorithm/#use-custom-algorithm-in-katib)
|
||||
to implement your custom algorithm.
|
||||
|
||||
<table>
|
||||
<tbody>
|
||||
|
@ -137,141 +139,68 @@ custom algorithm.
|
|||
</tbody>
|
||||
</table>
|
||||
|
||||
To perform above algorithms Katib supports the following frameworks:
|
||||
To perform the above algorithms, Katib supports the following frameworks:
|
||||
|
||||
- [Goptuna](https://github.com/c-bata/goptuna)
|
||||
- [Hyperopt](https://github.com/hyperopt/hyperopt)
|
||||
- [Optuna](https://github.com/optuna/optuna)
|
||||
- [Scikit Optimize](https://github.com/scikit-optimize/scikit-optimize)
|
||||
|
||||
# Installation
|
||||
|
||||
For the various Katib installs check the
|
||||
[Kubeflow guide](https://www.kubeflow.org/docs/components/katib/hyperparameter/#katib-setup).
|
||||
Follow the next steps to install Katib standalone.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
This is the minimal requirements to install Katib:
|
||||
Please check [the official Kubeflow documentation](https://www.kubeflow.org/docs/components/katib/installation/#prerequisites)
|
||||
for prerequisites to install Katib.
|
||||
|
||||
- Kubernetes >= 1.25
|
||||
- `kubectl` >= 1.25
|
||||
## Installation
|
||||
|
||||
## Latest Version
|
||||
Please follow [the Kubeflow Katib guide](https://www.kubeflow.org/docs/components/katib/installation/#installing-katib)
|
||||
for the detailed instructions on how to install Katib.
|
||||
|
||||
For the latest Katib version run this command:
|
||||
### Installing the Control Plane
|
||||
|
||||
Run the following command to install the latest stable release of Katib control plane:
|
||||
|
||||
```
|
||||
kubectl apply -k "github.com/kubeflow/katib.git/manifests/v1beta1/installs/katib-standalone?ref=v0.17.0"
|
||||
```
|
||||
|
||||
Run the following command to install the latest changes of Katib control plane:
|
||||
|
||||
```
|
||||
kubectl apply -k "github.com/kubeflow/katib.git/manifests/v1beta1/installs/katib-standalone?ref=master"
|
||||
```
|
||||
|
||||
## Release Version
|
||||
|
||||
For the specific Katib release (for example `v0.14.0`) run this command:
|
||||
|
||||
```
|
||||
kubectl apply -k "github.com/kubeflow/katib.git/manifests/v1beta1/installs/katib-standalone?ref=v0.14.0"
|
||||
```
|
||||
|
||||
Make sure that all Katib components are running:
|
||||
|
||||
```
|
||||
$ kubectl get pods -n kubeflow
|
||||
|
||||
NAME READY STATUS RESTARTS AGE
|
||||
katib-controller-566595bdd8-hbxgf 1/1 Running 0 36s
|
||||
katib-db-manager-57cd769cdb-4g99m 1/1 Running 0 36s
|
||||
katib-mysql-7894994f88-5d4s5 1/1 Running 0 36s
|
||||
katib-ui-5767cfccdc-pwg2x 1/1 Running 0 36s
|
||||
```
|
||||
|
||||
For the Katib Experiments check the [complete examples list](./examples/v1beta1).
|
||||
|
||||
# Quickstart
|
||||
### Installing the Python SDK
|
||||
|
||||
You can run your first HyperParameter Tuning Experiment using [Katib Python SDK](./sdk/python/v1beta1).
|
||||
Katib implements [a Python SDK](https://pypi.org/project/kubeflow-katib/) to simplify creation of
|
||||
hyperparameter tuning jobs for Data Scientists.
|
||||
|
||||
In the following example we are going to maximize a simple objective function:
|
||||
$F(a,b) = 4a - b^2$. The bigger $a$ and the lesser $b$ value, the bigger the function value $F$.
|
||||
Run the following command to install the latest stable release of Katib SDK:
|
||||
|
||||
```python
|
||||
import kubeflow.katib as katib
|
||||
|
||||
# Step 1. Create an objective function.
|
||||
def objective(parameters):
|
||||
# Import required packages.
|
||||
import time
|
||||
time.sleep(5)
|
||||
# Calculate objective function.
|
||||
result = 4 * int(parameters["a"]) - float(parameters["b"]) ** 2
|
||||
# Katib parses metrics in this format: <metric-name>=<metric-value>.
|
||||
print(f"result={result}")
|
||||
|
||||
# Step 2. Create HyperParameter search space.
|
||||
parameters = {
|
||||
"a": katib.search.int(min=10, max=20),
|
||||
"b": katib.search.double(min=0.1, max=0.2)
|
||||
}
|
||||
|
||||
# Step 3. Create Katib Experiment.
|
||||
katib_client = katib.KatibClient()
|
||||
name = "tune-experiment"
|
||||
katib_client.tune(
|
||||
name=name,
|
||||
objective=objective,
|
||||
parameters=parameters,
|
||||
objective_metric_name="result",
|
||||
max_trial_count=12
|
||||
)
|
||||
|
||||
# Step 4. Get the best HyperParameters.
|
||||
print(katib_client.get_optimal_hyperparameters(name))
|
||||
```sh
|
||||
pip install -U kubeflow-katib
|
||||
```
|
||||
|
||||
# Documentation
|
||||
## Getting Started
|
||||
|
||||
- Check
|
||||
[the Katib getting started guide](https://www.kubeflow.org/docs/components/katib/hyperparameter/#example-using-random-search-algorithm).
|
||||
Please refer to [the getting started guide](https://www.kubeflow.org/docs/components/katib/getting-started/#getting-started-with-katib-python-sdk)
|
||||
to quickly create your first hyperparameter tuning Experiment using the Python SDK.
|
||||
|
||||
- Learn about Katib **Concepts** in this
|
||||
[guide](https://www.kubeflow.org/docs/components/katib/overview/#katib-concepts).
|
||||
## Community
|
||||
|
||||
- Learn about Katib **Interfaces** in this
|
||||
[guide](https://www.kubeflow.org/docs/components/katib/overview/#katib-interfaces).
|
||||
The following links provide information on how to get involved in the community:
|
||||
|
||||
- Learn about Katib **Components** in this
|
||||
[guide](https://www.kubeflow.org/docs/components/katib/hyperparameter/#katib-components).
|
||||
|
||||
- Know more about Katib in the [presentations and demos list](./docs/presentations.md).
|
||||
|
||||
# Community
|
||||
|
||||
We are always growing our community and invite new users and AutoML enthusiasts
|
||||
to contribute to the Katib project. The following links provide information
|
||||
about getting involved in the community:
|
||||
|
||||
- Subscribe to the
|
||||
[AutoML calendar](https://calendar.google.com/calendar/u/0/r?cid=ZDQ5bnNpZWZzbmZna2Y5MW8wdThoMmpoazRAZ3JvdXAuY2FsZW5kYXIuZ29vZ2xlLmNvbQ)
|
||||
to attend Working Group bi-weekly community meetings.
|
||||
|
||||
- Check the
|
||||
[AutoML and Training Working Group meeting notes](https://docs.google.com/document/d/1MChKfzrKAeFRtYqypFbMXL6ZIc_OgijjkvbqmwRV-64/edit).
|
||||
|
||||
- If you use Katib, please update [the adopters list](ADOPTERS.md).
|
||||
- Attend [the bi-weekly AutoML and Training Working Group](https://bit.ly/2PWVCkV)
|
||||
community meeting.
|
||||
- Join our [`#kubeflow-katib`](https://www.kubeflow.org/docs/about/community/#kubeflow-slack-channels)
|
||||
Slack channel.
|
||||
- Check out [who is using Katib](ADOPTERS.md) and [presentations about Katib project](docs/presentations.md).
|
||||
|
||||
## Contributing
|
||||
|
||||
Please feel free to test the system! [Developer guide](./docs/developer-guide.md)
|
||||
is a good starting point for our developers.
|
||||
|
||||
## Blog posts
|
||||
|
||||
- [Kubeflow Katib: Scalable, Portable and Cloud Native System for AutoML](https://blog.kubeflow.org/katib/)
|
||||
(by Andrey Velichkevich)
|
||||
|
||||
## Events
|
||||
|
||||
- [AutoML and Training WG Summit. 16th of July 2021](https://docs.google.com/document/d/1vGluSPHmAqEr8k9Dmm82RcQ-MVnqbYYSfnjMGB-aPuo/edit?usp=sharing)
|
||||
Please refer to the [CONTRIBUTING guide](CONTRIBUTING.md).
|
||||
|
||||
## Citation
|
||||
|
||||
|
|
44
ROADMAP.md
|
@ -1,3 +1,45 @@
|
|||
# Katib 2022/2023 Roadmap
|
||||
|
||||
## AutoML Features
|
||||
|
||||
- Support advanced HyperParameter tuning algorithms:
|
||||
|
||||
- Population Based Training (PBT) - [#1382](https://github.com/kubeflow/katib/issues/1382)
|
||||
- Tree of Parzen Estimators (TPE)
|
||||
- Multivariate TPE
|
||||
- Sobol’s Quasirandom Sequence
|
||||
- Asynchronous Successive Halving - [ASHA](https://arxiv.org/pdf/1810.05934.pdf)
|
||||
|
||||
- Support multi-objective optimization - [#1549](https://github.com/kubeflow/katib/issues/1549)
|
||||
- Support various HP distributions (log-uniform, uniform, normal) - [#1207](https://github.com/kubeflow/katib/issues/1207)
|
||||
- Support Auto Model Compression - [#460](https://github.com/kubeflow/katib/issues/460)
|
||||
- Support Auto Feature Engineering - [#475](https://github.com/kubeflow/katib/issues/475)
|
||||
- Improve Neural Architecture Search design
|
||||
|
||||
## Backend and API Enhancements
|
||||
|
||||
- Conformance tests for Katib - [#2044](https://github.com/kubeflow/katib/issues/2044)
|
||||
- Support push-based metrics collection in Katib - [#577](https://github.com/kubeflow/katib/issues/577)
|
||||
- Support PostgreSQL as a Katib DB - [#915](https://github.com/kubeflow/katib/issues/915)
|
||||
- Improve Katib scalability - [#1847](https://github.com/kubeflow/katib/issues/1847)
|
||||
- Promote Katib APIs to the `v1` version
|
||||
- Support multiple CRD versions (`v1beta1`, `v1`) with conversion webhook
|
||||
|
||||
## Improve Katib User Experience
|
||||
|
||||
- Simplify Katib Experiment creation with Katib SDK - [#1951](https://github.com/kubeflow/katib/pull/1951)
|
||||
- Fully migrate to a new Katib UI - [Project 1](https://github.com/kubeflow/katib/projects/1)
|
||||
- Expose Trial logs in Katib UI - [#971](https://github.com/kubeflow/katib/issues/971)
|
||||
- Enhance Katib UI visualization metrics for AutoML Experiments
|
||||
- Improve Katib Config UX - [#2150](https://github.com/kubeflow/katib/issues/2150)
|
||||
|
||||
## Integration with Kubeflow Components
|
||||
|
||||
- Kubeflow Pipeline as a Katib Trial target - [#1914](https://github.com/kubeflow/katib/issues/1914)
|
||||
- Improve data passing when Katib Experiment is part of Kubeflow Pipeline - [#1846](https://github.com/kubeflow/katib/issues/1846)
|
||||
|
||||
# History
|
||||
|
||||
# Katib 2021 Roadmap
|
||||
|
||||
## New Features
|
||||
|
@ -24,8 +66,6 @@
|
|||
- Support multiple CRD version with conversion webhook
|
||||
- MLMD integration with Katib Experiments
|
||||
|
||||
# History
|
||||
|
||||
# Katib 2020 Roadmap
|
||||
|
||||
## New Features
|
||||
|
|
|
@ -0,0 +1,64 @@
|
|||
# Security Policy
|
||||
|
||||
## Supported Versions
|
||||
|
||||
Kubeflow Katib versions are expressed as `vX.Y.Z`, where X is the major version,
|
||||
Y is the minor version, and Z is the patch version, following the
|
||||
[Semantic Versioning](https://semver.org/) terminology.
|
||||
|
||||
The Kubeflow Katib project maintains release branches for the most recent two minor releases.
|
||||
Applicable fixes, including security fixes, may be backported to those two release branches,
|
||||
depending on severity and feasibility.
|
||||
|
||||
Users are encouraged to stay updated with the latest releases to benefit from security patches and
|
||||
improvements.
|
||||
|
||||
## Reporting a Vulnerability
|
||||
|
||||
We're extremely grateful to the security researchers and users who report vulnerabilities to the
|
||||
Kubeflow Open Source Community. All reports are thoroughly investigated by the Kubeflow project owners.
|
||||
|
||||
You can use the following ways to report security vulnerabilities privately:
|
||||
|
||||
- Using the Kubeflow Katib repository [GitHub Security Advisory](https://github.com/kubeflow/katib/security/advisories/new).
|
||||
- Using our private Kubeflow Steering Committee mailing list: ksc@kubeflow.org.
|
||||
|
||||
Please provide detailed information to help us understand and address the issue promptly.
|
||||
|
||||
## Disclosure Process
|
||||
|
||||
**Acknowledgment**: We will acknowledge receipt of your report within 10 business days.
|
||||
|
||||
**Assessment**: The Kubeflow project owners will investigate the reported issue to determine its
|
||||
validity and severity.
|
||||
|
||||
**Resolution**: If the issue is confirmed, we will work on a fix and prepare a release.
|
||||
|
||||
**Notification**: Once a fix is available, we will notify the reporter and coordinate a public
|
||||
disclosure.
|
||||
|
||||
**Public Disclosure**: Details of the vulnerability and the fix will be published in the project's
|
||||
release notes and communicated through appropriate channels.
|
||||
|
||||
## Prevention Mechanisms
|
||||
|
||||
Kubeflow Katib employs several measures to prevent security issues:
|
||||
|
||||
**Code Reviews**: All code changes are reviewed by maintainers to ensure code quality and security.
|
||||
|
||||
**Dependency Management**: Regular updates and monitoring of dependencies (e.g. Dependabot) to
|
||||
address known vulnerabilities.
|
||||
|
||||
**Continuous Integration**: Automated testing and security checks are integrated into the CI/CD pipeline.
|
||||
|
||||
**Image Scanning**: Container images are scanned for vulnerabilities.
|
||||
|
||||
## Communication Channels
|
||||
|
||||
For general questions, please use the following resources:
|
||||
|
||||
- Kubeflow [Slack channels](https://www.kubeflow.org/docs/about/community/#kubeflow-slack-channels).
|
||||
|
||||
- Kubeflow discuss [mailing list](https://www.kubeflow.org/docs/about/community/#kubeflow-mailing-list).
|
||||
|
||||
Please **do not report** security vulnerabilities through public channels.
|
|
@ -28,14 +28,14 @@ import (
|
|||
api_pb "github.com/kubeflow/katib/pkg/apis/manager/v1beta1"
|
||||
db "github.com/kubeflow/katib/pkg/db/v1beta1"
|
||||
"github.com/kubeflow/katib/pkg/db/v1beta1/common"
|
||||
"k8s.io/klog"
|
||||
"k8s.io/klog/v2"
|
||||
|
||||
"google.golang.org/grpc"
|
||||
"google.golang.org/grpc/reflection"
|
||||
)
|
||||
|
||||
const (
|
||||
port = "0.0.0.0:6789"
|
||||
defaultListenAddress = "0.0.0.0:6789"
|
||||
defaultConnectTimeout = time.Second * 60
|
||||
)
|
||||
|
||||
|
@ -90,7 +90,9 @@ func (s *server) Check(ctx context.Context, in *health_pb.HealthCheckRequest) (*
|
|||
|
||||
func main() {
|
||||
var connectTimeout time.Duration
|
||||
var listenAddress string
|
||||
flag.DurationVar(&connectTimeout, "connect-timeout", defaultConnectTimeout, "Timeout before calling error during database connection. (e.g. 120s)")
|
||||
flag.StringVar(&listenAddress, "listen-address", defaultListenAddress, "The network interface or IP address to receive incoming connections. (e.g. 0.0.0.0:6789)")
|
||||
flag.Parse()
|
||||
|
||||
var err error
|
||||
|
@ -104,13 +106,13 @@ func main() {
|
|||
klog.Fatalf("Failed to open db connection: %v", err)
|
||||
}
|
||||
dbIf.DBInit()
|
||||
listener, err := net.Listen("tcp", port)
|
||||
listener, err := net.Listen("tcp", listenAddress)
|
||||
if err != nil {
|
||||
klog.Fatalf("Failed to listen: %v", err)
|
||||
}
|
||||
|
||||
size := 1<<31 - 1
|
||||
klog.Infof("Start Katib manager: %s", port)
|
||||
klog.Infof("Start Katib manager: %s", listenAddress)
|
||||
s := grpc.NewServer(grpc.MaxRecvMsgSize(size), grpc.MaxSendMsgSize(size))
|
||||
api_pb.RegisterDBManagerServer(s, &server{})
|
||||
health_pb.RegisterHealthServer(s, &server{})
|
||||
|
|
|
@ -20,7 +20,7 @@ import (
|
|||
"context"
|
||||
"testing"
|
||||
|
||||
"github.com/golang/mock/gomock"
|
||||
"go.uber.org/mock/gomock"
|
||||
|
||||
health_pb "github.com/kubeflow/katib/pkg/apis/manager/health"
|
||||
api_pb "github.com/kubeflow/katib/pkg/apis/manager/v1beta1"
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
FROM python:3.10-slim
|
||||
FROM python:3.11-slim
|
||||
|
||||
ARG TARGETARCH
|
||||
ENV TARGET_DIR /opt/katib
|
||||
|
|
|
@ -12,12 +12,14 @@
|
|||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import grpc
|
||||
import time
|
||||
import logging
|
||||
import time
|
||||
from concurrent import futures
|
||||
|
||||
import grpc
|
||||
|
||||
from pkg.apis.manager.v1beta1.python import api_pb2_grpc
|
||||
from pkg.earlystopping.v1beta1.medianstop.service import MedianStopService
|
||||
from concurrent import futures
|
||||
|
||||
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
|
||||
DEFAULT_PORT = "0.0.0.0:6788"
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
grpcio>=1.41.1
|
||||
protobuf>=3.19.5, <=3.20.3
|
||||
grpcio>=1.64.1
|
||||
protobuf>=4.21.12,<5
|
||||
googleapis-common-protos==1.6.0
|
||||
kubernetes==22.6.0
|
||||
cython>=0.29.24
|
||||
|
|
|
@ -26,13 +26,13 @@ import (
|
|||
"github.com/spf13/viper"
|
||||
"k8s.io/apimachinery/pkg/runtime"
|
||||
_ "k8s.io/client-go/plugin/pkg/client/auth/gcp"
|
||||
"sigs.k8s.io/controller-runtime/pkg/client/apiutil"
|
||||
"sigs.k8s.io/controller-runtime/pkg/client/config"
|
||||
"sigs.k8s.io/controller-runtime/pkg/healthz"
|
||||
logf "sigs.k8s.io/controller-runtime/pkg/log"
|
||||
"sigs.k8s.io/controller-runtime/pkg/log/zap"
|
||||
"sigs.k8s.io/controller-runtime/pkg/manager"
|
||||
"sigs.k8s.io/controller-runtime/pkg/manager/signals"
|
||||
metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
|
||||
"sigs.k8s.io/controller-runtime/pkg/webhook"
|
||||
|
||||
configv1beta1 "github.com/kubeflow/katib/pkg/apis/config/v1beta1"
|
||||
|
@ -110,15 +110,13 @@ func main() {
|
|||
|
||||
// Create a new katib controller to provide shared dependencies and start components
|
||||
mgr, err := manager.New(cfg, manager.Options{
|
||||
MetricsBindAddress: initConfig.ControllerConfig.MetricsAddr,
|
||||
Metrics: metricsserver.Options{
|
||||
BindAddress: initConfig.ControllerConfig.MetricsAddr,
|
||||
},
|
||||
HealthProbeBindAddress: initConfig.ControllerConfig.HealthzAddr,
|
||||
LeaderElection: initConfig.ControllerConfig.EnableLeaderElection,
|
||||
LeaderElectionID: initConfig.ControllerConfig.LeaderElectionID,
|
||||
Scheme: scheme,
|
||||
// TODO: Once the below issue is resolved, we need to switch discovery-client to the built-in one.
|
||||
// https://github.com/kubernetes-sigs/controller-runtime/issues/2354
|
||||
// https://github.com/kubernetes-sigs/controller-runtime/issues/2424
|
||||
MapperProvider: apiutil.NewDiscoveryRESTMapper,
|
||||
})
|
||||
if err != nil {
|
||||
log.Error(err, "Failed to create the manager")
|
||||
|
@ -136,6 +134,11 @@ func main() {
|
|||
ctx := signals.SetupSignalHandler()
|
||||
certsReady := make(chan struct{})
|
||||
defer close(certsReady)
|
||||
|
||||
// The setupControllers will register controllers to the manager
|
||||
// after generated certs for the admission webhooks.
|
||||
go setupControllers(mgr, certsReady, hookServer)
|
||||
|
||||
if initConfig.CertGeneratorConfig.Enable {
|
||||
if err = cert.AddToManager(mgr, initConfig.CertGeneratorConfig, certsReady); err != nil {
|
||||
log.Error(err, "Failed to set up cert-generator")
|
||||
|
@ -144,10 +147,6 @@ func main() {
|
|||
certsReady <- struct{}{}
|
||||
}
|
||||
|
||||
// The setupControllers will register controllers to the manager
|
||||
// after generated certs for the admission webhooks.
|
||||
go setupControllers(mgr, certsReady, hookServer)
|
||||
|
||||
log.Info("Setting up health checker.")
|
||||
if err := mgr.AddReadyzCheck("readyz", hookServer.StartedChecker()); err != nil {
|
||||
log.Error(err, "Unable to add readyz endpoint to the manager")
|
||||
|
|
|
@ -49,11 +49,11 @@ import (
|
|||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/hpcloud/tail"
|
||||
"github.com/nxadm/tail"
|
||||
psutil "github.com/shirou/gopsutil/v3/process"
|
||||
"google.golang.org/grpc"
|
||||
"google.golang.org/grpc/credentials/insecure"
|
||||
"k8s.io/klog"
|
||||
"k8s.io/klog/v2"
|
||||
|
||||
commonv1beta1 "github.com/kubeflow/katib/pkg/apis/controller/common/v1beta1"
|
||||
api "github.com/kubeflow/katib/pkg/apis/manager/v1beta1"
|
||||
|
@ -134,7 +134,11 @@ func printMetricsFile(mFile string) {
|
|||
checkMetricFile(mFile)
|
||||
|
||||
// Print lines from metrics file.
|
||||
t, _ := tail.TailFile(mFile, tail.Config{Follow: true})
|
||||
t, err := tail.TailFile(mFile, tail.Config{Follow: true, ReOpen: true})
|
||||
if err != nil {
|
||||
klog.Errorf("Failed to open metrics file: %v", err)
|
||||
}
|
||||
|
||||
for line := range t.Lines {
|
||||
klog.Info(line.Text)
|
||||
}
|
||||
|
@ -307,7 +311,7 @@ func watchMetricsFile(mFile string, stopRules stopRulesFlag, filters []string, f
|
|||
}
|
||||
|
||||
// Create connection and client for Early Stopping service.
|
||||
conn, err := grpc.Dial(*earlyStopServiceAddr, grpc.WithTransportCredentials(insecure.NewCredentials()))
|
||||
conn, err := grpc.NewClient(*earlyStopServiceAddr, grpc.WithTransportCredentials(insecure.NewCredentials()))
|
||||
if err != nil {
|
||||
klog.Fatalf("Could not connect to Early Stopping service, error: %v", err)
|
||||
}
|
||||
|
@ -429,7 +433,7 @@ func main() {
|
|||
|
||||
func reportMetrics(filters []string, fileFormat commonv1beta1.FileFormat) {
|
||||
|
||||
conn, err := grpc.Dial(*dbManagerServiceAddr, grpc.WithTransportCredentials(insecure.NewCredentials()))
|
||||
conn, err := grpc.NewClient(*dbManagerServiceAddr, grpc.WithTransportCredentials(insecure.NewCredentials()))
|
||||
if err != nil {
|
||||
klog.Fatalf("Could not connect to DB manager service, error: %v", err)
|
||||
}
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
FROM python:3.10-slim
|
||||
FROM python:3.11-slim
|
||||
|
||||
ARG TARGETARCH
|
||||
ENV TARGET_DIR /opt/katib
|
||||
|
|
|
@ -12,13 +12,15 @@
|
|||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import grpc
|
||||
import argparse
|
||||
from logging import INFO, StreamHandler, getLogger
|
||||
|
||||
import api_pb2
|
||||
from pns import WaitMainProcesses
|
||||
import api_pb2_grpc
|
||||
import const
|
||||
import grpc
|
||||
from pns import WaitMainProcesses
|
||||
from tfevent_loader import MetricsCollector
|
||||
from logging import getLogger, StreamHandler, INFO
|
||||
|
||||
timeout_in_seconds = 60
|
||||
|
||||
|
@ -55,25 +57,28 @@ if __name__ == '__main__':
|
|||
wait_all_processes = opt.wait_all_processes.lower() == "true"
|
||||
db_manager_server = opt.db_manager_server_addr.split(':')
|
||||
if len(db_manager_server) != 2:
|
||||
raise Exception("Invalid Katib DB manager service address: %s" %
|
||||
opt.db_manager_server_addr)
|
||||
raise Exception(
|
||||
f"Invalid Katib DB manager service address: {opt.db_manager_server_addr}"
|
||||
)
|
||||
|
||||
WaitMainProcesses(
|
||||
pool_interval=opt.poll_interval,
|
||||
timout=opt.timeout,
|
||||
wait_all=wait_all_processes,
|
||||
completed_marked_dir=opt.metrics_file_dir)
|
||||
completed_marked_dir=opt.metrics_file_dir,
|
||||
)
|
||||
|
||||
mc = MetricsCollector(opt.metric_names.split(';'))
|
||||
mc = MetricsCollector(opt.metric_names.split(";"))
|
||||
observation_log = mc.parse_file(opt.metrics_file_dir)
|
||||
|
||||
channel = grpc.beta.implementations.insecure_channel(
|
||||
db_manager_server[0], int(db_manager_server[1]))
|
||||
|
||||
with api_pb2.beta_create_DBManager_stub(channel) as client:
|
||||
logger.info("In " + opt.trial_name + " " +
|
||||
str(len(observation_log.metric_logs)) + " metrics will be reported.")
|
||||
client.ReportObservationLog(api_pb2.ReportObservationLogRequest(
|
||||
trial_name=opt.trial_name,
|
||||
observation_log=observation_log
|
||||
), timeout=timeout_in_seconds)
|
||||
with grpc.insecure_channel(opt.db_manager_server_addr) as channel:
|
||||
stub = api_pb2_grpc.DBManagerStub(channel)
|
||||
logger.info(
|
||||
f"In {opt.trial_name} {str(len(observation_log.metric_logs))} metrics will be reported."
|
||||
)
|
||||
stub.ReportObservationLog(
|
||||
api_pb2.ReportObservationLogRequest(
|
||||
trial_name=opt.trial_name, observation_log=observation_log
|
||||
),
|
||||
timeout=timeout_in_seconds,
|
||||
)
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
psutil==5.9.4
|
||||
rfc3339>=6.2
|
||||
grpcio>=1.41.1
|
||||
grpcio>=1.64.1
|
||||
googleapis-common-protos==1.6.0
|
||||
tensorflow==2.11.0
|
||||
tensorflow==2.16.1
|
||||
protobuf>=4.21.12,<5
|
||||
|
|
|
@ -24,7 +24,7 @@ import (
|
|||
api_v1_beta1 "github.com/kubeflow/katib/pkg/apis/manager/v1beta1"
|
||||
suggestion "github.com/kubeflow/katib/pkg/suggestion/v1beta1/goptuna"
|
||||
"google.golang.org/grpc"
|
||||
"k8s.io/klog"
|
||||
"k8s.io/klog/v2"
|
||||
)
|
||||
|
||||
const (
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
FROM python:3.10-slim
|
||||
FROM python:3.11-slim
|
||||
|
||||
ARG TARGETARCH
|
||||
ENV TARGET_DIR /opt/katib
|
||||
|
|
|
@ -12,13 +12,15 @@
|
|||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import grpc
|
||||
import time
|
||||
from pkg.apis.manager.v1beta1.python import api_pb2_grpc
|
||||
from pkg.apis.manager.health.python import health_pb2_grpc
|
||||
from pkg.suggestion.v1beta1.hyperband.service import HyperbandService
|
||||
from concurrent import futures
|
||||
|
||||
import grpc
|
||||
|
||||
from pkg.apis.manager.health.python import health_pb2_grpc
|
||||
from pkg.apis.manager.v1beta1.python import api_pb2_grpc
|
||||
from pkg.suggestion.v1beta1.hyperband.service import HyperbandService
|
||||
|
||||
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
|
||||
DEFAULT_PORT = "0.0.0.0:6789"
|
||||
|
||||
|
|
|
@ -1,9 +1,9 @@
|
|||
grpcio>=1.41.1
|
||||
grpcio>=1.64.1
|
||||
cloudpickle==0.5.6
|
||||
numpy>=1.20.0
|
||||
numpy>=1.25.2
|
||||
scikit-learn>=0.24.0
|
||||
scipy>=1.5.4
|
||||
forestci==0.3
|
||||
protobuf>=3.19.5, <=3.20.3
|
||||
protobuf>=4.21.12,<5
|
||||
googleapis-common-protos==1.6.0
|
||||
cython>=0.29.24
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
FROM python:3.10-slim
|
||||
FROM python:3.11-slim
|
||||
|
||||
ARG TARGETARCH
|
||||
ENV TARGET_DIR /opt/katib
|
||||
|
|
|
@ -12,13 +12,15 @@
|
|||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import grpc
|
||||
import time
|
||||
from pkg.apis.manager.v1beta1.python import api_pb2_grpc
|
||||
from pkg.apis.manager.health.python import health_pb2_grpc
|
||||
from pkg.suggestion.v1beta1.hyperopt.service import HyperoptService
|
||||
from concurrent import futures
|
||||
|
||||
import grpc
|
||||
|
||||
from pkg.apis.manager.health.python import health_pb2_grpc
|
||||
from pkg.apis.manager.v1beta1.python import api_pb2_grpc
|
||||
from pkg.suggestion.v1beta1.hyperopt.service import HyperoptService
|
||||
|
||||
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
|
||||
DEFAULT_PORT = "0.0.0.0:6789"
|
||||
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
grpcio>=1.41.1
|
||||
grpcio>=1.64.1
|
||||
cloudpickle==0.5.6
|
||||
numpy>=1.20.0
|
||||
numpy>=1.25.2
|
||||
scikit-learn>=0.24.0
|
||||
scipy>=1.5.4
|
||||
forestci==0.3
|
||||
protobuf>=3.19.5, <=3.20.3
|
||||
protobuf>=4.21.12,<5
|
||||
googleapis-common-protos==1.6.0
|
||||
hyperopt==0.2.5
|
||||
cython>=0.29.24
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
FROM python:3.10-slim
|
||||
FROM python:3.11-slim
|
||||
|
||||
ARG TARGETARCH
|
||||
ENV TARGET_DIR /opt/katib
|
||||
|
|
|
@ -12,13 +12,14 @@
|
|||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import grpc
|
||||
from concurrent import futures
|
||||
import time
|
||||
from pkg.apis.manager.v1beta1.python import api_pb2_grpc
|
||||
from pkg.apis.manager.health.python import health_pb2_grpc
|
||||
from pkg.suggestion.v1beta1.nas.darts.service import DartsService
|
||||
from concurrent import futures
|
||||
|
||||
import grpc
|
||||
|
||||
from pkg.apis.manager.health.python import health_pb2_grpc
|
||||
from pkg.apis.manager.v1beta1.python import api_pb2_grpc
|
||||
from pkg.suggestion.v1beta1.nas.darts.service import DartsService
|
||||
|
||||
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
|
||||
DEFAULT_PORT = "0.0.0.0:6789"
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
grpcio>=1.41.1
|
||||
protobuf>=3.19.5, <=3.20.3
|
||||
grpcio>=1.64.1
|
||||
protobuf>=4.21.12,<5
|
||||
googleapis-common-protos==1.6.0
|
||||
cython>=0.29.24
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
FROM python:3.10-slim
|
||||
FROM python:3.11-slim
|
||||
|
||||
ARG TARGETARCH
|
||||
ENV TARGET_DIR /opt/katib
|
||||
|
|
|
@ -12,15 +12,15 @@
|
|||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import grpc
|
||||
from concurrent import futures
|
||||
import time
|
||||
from concurrent import futures
|
||||
|
||||
import grpc
|
||||
|
||||
from pkg.apis.manager.v1beta1.python import api_pb2_grpc
|
||||
from pkg.apis.manager.health.python import health_pb2_grpc
|
||||
from pkg.apis.manager.v1beta1.python import api_pb2_grpc
|
||||
from pkg.suggestion.v1beta1.nas.enas.service import EnasService
|
||||
|
||||
|
||||
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
|
||||
DEFAULT_PORT = "0.0.0.0:6789"
|
||||
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
grpcio>=1.41.1
|
||||
grpcio>=1.64.1
|
||||
googleapis-common-protos==1.6.0
|
||||
cython>=0.29.24
|
||||
tensorflow==2.11.0
|
||||
tensorflow==2.16.1
|
||||
protobuf>=4.21.12,<5
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
FROM python:3.10-slim
|
||||
FROM python:3.11-slim
|
||||
|
||||
ARG TARGETARCH
|
||||
ENV TARGET_DIR /opt/katib
|
||||
|
|
|
@ -12,13 +12,15 @@
|
|||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import grpc
|
||||
import time
|
||||
from pkg.apis.manager.v1beta1.python import api_pb2_grpc
|
||||
from pkg.apis.manager.health.python import health_pb2_grpc
|
||||
from pkg.suggestion.v1beta1.optuna.service import OptunaService
|
||||
from concurrent import futures
|
||||
|
||||
import grpc
|
||||
|
||||
from pkg.apis.manager.health.python import health_pb2_grpc
|
||||
from pkg.apis.manager.v1beta1.python import api_pb2_grpc
|
||||
from pkg.suggestion.v1beta1.optuna.service import OptunaService
|
||||
|
||||
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
|
||||
DEFAULT_PORT = "0.0.0.0:6789"
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
grpcio>=1.41.1
|
||||
protobuf>=3.19.5, <=3.20.3
|
||||
grpcio>=1.64.1
|
||||
protobuf>=4.21.12,<5
|
||||
googleapis-common-protos==1.53.0
|
||||
optuna>=3.0.0
|
||||
optuna==3.3.0
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
FROM python:3.10-slim
|
||||
FROM python:3.11-slim
|
||||
|
||||
ARG TARGETARCH
|
||||
ENV TARGET_DIR /opt/katib
|
||||
|
|
|
@ -12,13 +12,15 @@
|
|||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import grpc
|
||||
import time
|
||||
from pkg.apis.manager.v1beta1.python import api_pb2_grpc
|
||||
from pkg.apis.manager.health.python import health_pb2_grpc
|
||||
from pkg.suggestion.v1beta1.pbt.service import PbtService
|
||||
from concurrent import futures
|
||||
|
||||
import grpc
|
||||
|
||||
from pkg.apis.manager.health.python import health_pb2_grpc
|
||||
from pkg.apis.manager.v1beta1.python import api_pb2_grpc
|
||||
from pkg.suggestion.v1beta1.pbt.service import PbtService
|
||||
|
||||
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
|
||||
DEFAULT_PORT = "0.0.0.0:6789"
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
grpcio>=1.41.1
|
||||
protobuf>=3.19.5, <=3.20.3
|
||||
grpcio>=1.64.1
|
||||
protobuf>=4.21.12,<5
|
||||
googleapis-common-protos==1.53.0
|
||||
numpy==1.22.2
|
||||
numpy==1.25.2
|
||||
|
|
|
@ -12,13 +12,15 @@
|
|||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import grpc
|
||||
import time
|
||||
from pkg.apis.manager.v1beta1.python import api_pb2_grpc
|
||||
from pkg.apis.manager.health.python import health_pb2_grpc
|
||||
from pkg.suggestion.v1beta1.skopt.service import SkoptService
|
||||
from concurrent import futures
|
||||
|
||||
import grpc
|
||||
|
||||
from pkg.apis.manager.health.python import health_pb2_grpc
|
||||
from pkg.apis.manager.v1beta1.python import api_pb2_grpc
|
||||
from pkg.suggestion.v1beta1.skopt.service import SkoptService
|
||||
|
||||
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
|
||||
DEFAULT_PORT = "0.0.0.0:6789"
|
||||
|
||||
|
|
|
@ -1,13 +1,13 @@
|
|||
grpcio>=1.41.1
|
||||
grpcio>=1.64.1
|
||||
cloudpickle==0.5.6
|
||||
# This is a workaround to avoid the following error.
|
||||
# AttributeError: module 'numpy' has no attribute 'int'
|
||||
# See more: https://github.com/numpy/numpy/pull/22607
|
||||
numpy==1.23.5
|
||||
scikit-learn>=0.24.0
|
||||
scikit-learn>=0.24.0, <=1.3.0
|
||||
scipy>=1.5.4
|
||||
forestci==0.3
|
||||
protobuf>=3.19.5, <=3.20.3
|
||||
protobuf>=4.21.12,<5
|
||||
googleapis-common-protos==1.6.0
|
||||
scikit-optimize>=0.9.0
|
||||
cython>=0.29.24
|
||||
|
|
|
@ -1,7 +1,5 @@
|
|||
# --- Clone the kubeflow/kubeflow code ---
|
||||
FROM ubuntu AS fetch-kubeflow-kubeflow
|
||||
|
||||
RUN apt-get update && apt-get install git -y
|
||||
FROM alpine/git AS fetch-kubeflow-kubeflow
|
||||
|
||||
WORKDIR /kf
|
||||
COPY ./pkg/ui/v1beta1/frontend/COMMIT ./
|
||||
|
@ -11,23 +9,37 @@ RUN git clone https://github.com/kubeflow/kubeflow.git && \
|
|||
git checkout $COMMIT
|
||||
|
||||
# --- Build the frontend kubeflow library ---
|
||||
FROM node:12 AS frontend-kubeflow-lib
|
||||
FROM node:16-alpine AS frontend-kubeflow-lib
|
||||
|
||||
WORKDIR /src
|
||||
|
||||
ARG LIB=/kf/kubeflow/components/crud-web-apps/common/frontend/kubeflow-common-lib
|
||||
COPY --from=fetch-kubeflow-kubeflow $LIB/package*.json ./
|
||||
RUN npm ci
|
||||
RUN npm config set fetch-retry-mintimeout 200000 && \
|
||||
npm config set fetch-retry-maxtimeout 1200000 && \
|
||||
npm config get registry && \
|
||||
npm config set registry https://registry.npmjs.org/ && \
|
||||
npm config delete https-proxy && \
|
||||
npm config set loglevel verbose && \
|
||||
npm cache clean --force && \
|
||||
npm ci --force --prefer-offline --no-audit
|
||||
|
||||
COPY --from=fetch-kubeflow-kubeflow $LIB/ ./
|
||||
RUN npm run build
|
||||
|
||||
# --- Build the frontend ---
|
||||
FROM node:12 AS frontend
|
||||
FROM node:16-alpine AS frontend
|
||||
|
||||
WORKDIR /src
|
||||
COPY ./pkg/ui/v1beta1/frontend/package*.json ./
|
||||
RUN npm ci
|
||||
RUN npm config set fetch-retry-mintimeout 200000 && \
|
||||
npm config set fetch-retry-maxtimeout 1200000 && \
|
||||
npm config get registry && \
|
||||
npm config set registry https://registry.npmjs.org/ && \
|
||||
npm config delete https-proxy && \
|
||||
npm config set loglevel verbose && \
|
||||
npm cache clean --force && \
|
||||
npm ci --force --prefer-offline --no-audit
|
||||
|
||||
COPY ./pkg/ui/v1beta1/frontend/ .
|
||||
COPY --from=frontend-kubeflow-lib /src/dist/kubeflow/ ./node_modules/kubeflow/
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
|
||||
# Run conformance test and generate test report.
|
||||
python test/e2e/v1beta1/scripts/gh-actions/run-e2e-experiment.py --experiment-path examples/v1beta1/hp-tuning/random.yaml --namespace kf-conformance \
|
||||
--trial-pod-annotations '{"sidecar.istio.io/inject": "false"}' | tee /tmp/katib-conformance.log
|
||||
--trial-pod-labels '{"sidecar.istio.io/inject": "false"}' | tee /tmp/katib-conformance.log
|
||||
|
||||
|
||||
# Create the done file.
|
||||
|
|
|
@ -0,0 +1,5 @@
|
|||
# Katib Documentation
|
||||
|
||||
Welcome to Kubeflow Katib!
|
||||
|
||||
The Katib documentation is available on [kubeflow.org](https://www.kubeflow.org/docs/components/katib/).
|
|
@ -5,7 +5,7 @@ Here you can find the location for images that are used in Katib.
|
|||
## Katib Components Images
|
||||
|
||||
The following table shows images for the
|
||||
[Katib components](https://www.kubeflow.org/docs/components/katib/hyperparameter/#katib-components).
|
||||
[Katib components](https://www.kubeflow.org/docs/components/katib/reference/architecture/#katib-control-plane-components).
|
||||
|
||||
<table>
|
||||
<tbody>
|
||||
|
@ -22,7 +22,7 @@ The following table shows images for the
|
|||
</tr>
|
||||
<tr align="center">
|
||||
<td>
|
||||
<code>docker.io/kubeflowkatib/katib-controller</code>
|
||||
<code>ghcr.io/kubeflow/katib/katib-controller</code>
|
||||
</td>
|
||||
<td>
|
||||
Katib Controller
|
||||
|
@ -33,7 +33,7 @@ The following table shows images for the
|
|||
</tr>
|
||||
<tr align="center">
|
||||
<td>
|
||||
<code>docker.io/kubeflowkatib/katib-ui</code>
|
||||
<code>ghcr.io/kubeflow/katib/katib-ui</code>
|
||||
</td>
|
||||
<td>
|
||||
Katib User Interface
|
||||
|
@ -44,7 +44,7 @@ The following table shows images for the
|
|||
</tr>
|
||||
<tr align="center">
|
||||
<td>
|
||||
<code>docker.io/kubeflowkatib/katib-db-manager</code>
|
||||
<code>ghcr.io/kubeflow/katib/katib-db-manager</code>
|
||||
</td>
|
||||
<td>
|
||||
Katib DB Manager
|
||||
|
@ -70,7 +70,7 @@ The following table shows images for the
|
|||
## Katib Metrics Collectors Images
|
||||
|
||||
The following table shows images for the
|
||||
[Katib Metrics Collectors](https://www.kubeflow.org/docs/components/katib/experiment/#metrics-collector).
|
||||
[Katib Metrics Collectors](https://www.kubeflow.org/docs/components/katib/user-guides/metrics-collector/).
|
||||
|
||||
<table>
|
||||
<tbody>
|
||||
|
@ -87,7 +87,7 @@ The following table shows images for the
|
|||
</tr>
|
||||
<tr align="center">
|
||||
<td>
|
||||
<code>docker.io/kubeflowkatib/file-metrics-collector</code>
|
||||
<code>ghcr.io/kubeflow/katib/file-metrics-collector</code>
|
||||
</td>
|
||||
<td>
|
||||
File Metrics Collector
|
||||
|
@ -98,7 +98,7 @@ The following table shows images for the
|
|||
</tr>
|
||||
<tr align="center">
|
||||
<td>
|
||||
<code>docker.io/kubeflowkatib/tfevent-metrics-collector</code>
|
||||
<code>ghcr.io/kubeflow/katib/tfevent-metrics-collector</code>
|
||||
</td>
|
||||
<td>
|
||||
TensorFlow Event Metrics Collector
|
||||
|
@ -113,8 +113,8 @@ The following table shows images for the
|
|||
## Katib Suggestions and Early Stopping Images
|
||||
|
||||
The following table shows images for the
|
||||
[Katib Suggestions](https://www.kubeflow.org/docs/components/katib/experiment/#search-algorithms-in-detail)
|
||||
and the [Katib Early Stopping algorithms](https://www.kubeflow.org/docs/components/katib/early-stopping/).
|
||||
[Katib Suggestion services](https://www.kubeflow.org/docs/components/katib/reference/architecture/#suggestion)
|
||||
and the [Katib Early Stopping algorithms](https://www.kubeflow.org/docs/components/katib/user-guides/early-stopping/#early-stopping-algorithms).
|
||||
|
||||
<table>
|
||||
<tbody>
|
||||
|
@ -131,7 +131,7 @@ and the [Katib Early Stopping algorithms](https://www.kubeflow.org/docs/componen
|
|||
</tr>
|
||||
<tr align="center">
|
||||
<td>
|
||||
<code>docker.io/kubeflowkatib/suggestion-hyperopt</code>
|
||||
<code>ghcr.io/kubeflow/katib/suggestion-hyperopt</code>
|
||||
</td>
|
||||
<td>
|
||||
<a href="https://github.com/hyperopt/hyperopt">Hyperopt</a> Suggestion
|
||||
|
@ -142,7 +142,7 @@ and the [Katib Early Stopping algorithms](https://www.kubeflow.org/docs/componen
|
|||
</tr>
|
||||
<tr align="center">
|
||||
<td>
|
||||
<code>docker.io/kubeflowkatib/suggestion-skopt</code>
|
||||
<code>ghcr.io/kubeflow/katib/suggestion-skopt</code>
|
||||
</td>
|
||||
<td>
|
||||
<a href="https://github.com/scikit-optimize/scikit-optimize">Skopt</a> Suggestion
|
||||
|
@ -153,7 +153,7 @@ and the [Katib Early Stopping algorithms](https://www.kubeflow.org/docs/componen
|
|||
</tr>
|
||||
<tr align="center">
|
||||
<td>
|
||||
<code>docker.io/kubeflowkatib/suggestion-optuna</code>
|
||||
<code>ghcr.io/kubeflow/katib/suggestion-optuna</code>
|
||||
</td>
|
||||
<td>
|
||||
<a href="https://github.com/optuna/optuna">Optuna</a> Suggestion
|
||||
|
@ -164,7 +164,7 @@ and the [Katib Early Stopping algorithms](https://www.kubeflow.org/docs/componen
|
|||
</tr>
|
||||
<tr align="center">
|
||||
<td>
|
||||
<code>docker.io/kubeflowkatib/suggestion-goptuna</code>
|
||||
<code>ghcr.io/kubeflow/katib/suggestion-goptuna</code>
|
||||
</td>
|
||||
<td>
|
||||
<a href="https://github.com/c-bata/goptuna">Goptuna</a> Suggestion
|
||||
|
@ -175,7 +175,7 @@ and the [Katib Early Stopping algorithms](https://www.kubeflow.org/docs/componen
|
|||
</tr>
|
||||
<tr align="center">
|
||||
<td>
|
||||
<code>docker.io/kubeflowkatib/suggestion-hyperband</code>
|
||||
<code>ghcr.io/kubeflow/katib/suggestion-hyperband</code>
|
||||
</td>
|
||||
<td>
|
||||
<a href="https://www.kubeflow.org/docs/components/katib/experiment/#hyperband">Hyperband</a> Suggestion
|
||||
|
@ -186,7 +186,7 @@ and the [Katib Early Stopping algorithms](https://www.kubeflow.org/docs/componen
|
|||
</tr>
|
||||
<tr align="center">
|
||||
<td>
|
||||
<code>docker.io/kubeflowkatib/suggestion-enas</code>
|
||||
<code>ghcr.io/kubeflow/katib/suggestion-enas</code>
|
||||
</td>
|
||||
<td>
|
||||
<a href="https://www.kubeflow.org/docs/components/katib/experiment/#enas">ENAS</a> Suggestion
|
||||
|
@ -197,7 +197,7 @@ and the [Katib Early Stopping algorithms](https://www.kubeflow.org/docs/componen
|
|||
</tr>
|
||||
<tr align="center">
|
||||
<td>
|
||||
<code>docker.io/kubeflowkatib/suggestion-darts</code>
|
||||
<code>ghcr.io/kubeflow/katib/suggestion-darts</code>
|
||||
</td>
|
||||
<td>
|
||||
<a href="https://www.kubeflow.org/docs/components/katib/experiment/#differentiable-architecture-search-darts">DARTS</a> Suggestion
|
||||
|
@ -208,7 +208,7 @@ and the [Katib Early Stopping algorithms](https://www.kubeflow.org/docs/componen
|
|||
</tr>
|
||||
<tr align="center">
|
||||
<td>
|
||||
<code>docker.io/kubeflowkatib/earlystopping-medianstop</code>
|
||||
<code>ghcr.io/kubeflow/katib/earlystopping-medianstop</code>
|
||||
</td>
|
||||
<td>
|
||||
<a href="https://www.kubeflow.org/docs/components/katib/early-stopping/#median-stopping-rule">Median Stopping Rule</a>
|
||||
|
@ -223,7 +223,7 @@ and the [Katib Early Stopping algorithms](https://www.kubeflow.org/docs/componen
|
|||
## Training Containers Images
|
||||
|
||||
The following table shows images for training containers which are used in the
|
||||
[Katib Trials](https://www.kubeflow.org/docs/components/katib/experiment/#packaging-your-training-code-in-a-container-image).
|
||||
[Katib Trials](https://www.kubeflow.org/docs/components/katib/reference/architecture/#trial).
|
||||
|
||||
<table>
|
||||
<tbody>
|
||||
|
@ -240,18 +240,7 @@ The following table shows images for training containers which are used in the
|
|||
</tr>
|
||||
<tr align="center">
|
||||
<td>
|
||||
<code>docker.io/kubeflowkatib/mxnet-mnist</code>
|
||||
</td>
|
||||
<td>
|
||||
MXNet MNIST example with collecting metrics time
|
||||
</td>
|
||||
<td>
|
||||
<a href="https://github.com/kubeflow/katib/blob/master/examples/v1beta1/trial-images/mxnet-mnist/Dockerfile">Dockerfile</a>
|
||||
</td>
|
||||
</tr>
|
||||
<tr align="center">
|
||||
<td>
|
||||
<code>docker.io/kubeflowkatib/pytorch-mnist-cpu</code>
|
||||
<code>ghcr.io/kubeflow/katib/pytorch-mnist-cpu</code>
|
||||
</td>
|
||||
<td>
|
||||
PyTorch MNIST example with printing metrics to the file or StdOut with CPU support
|
||||
|
@ -262,7 +251,7 @@ The following table shows images for training containers which are used in the
|
|||
</tr>
|
||||
<tr align="center">
|
||||
<td>
|
||||
<code>docker.io/kubeflowkatib/pytorch-mnist-gpu</code>
|
||||
<code>ghcr.io/kubeflow/katib/pytorch-mnist-gpu</code>
|
||||
</td>
|
||||
<td>
|
||||
PyTorch MNIST example with printing metrics to the file or StdOut with GPU support
|
||||
|
@ -273,7 +262,7 @@ The following table shows images for training containers which are used in the
|
|||
</tr>
|
||||
<tr align="center">
|
||||
<td>
|
||||
<code>docker.io/kubeflowkatib/tf-mnist-with-summaries</code>
|
||||
<code>ghcr.io/kubeflow/katib/tf-mnist-with-summaries</code>
|
||||
</td>
|
||||
<td>
|
||||
Tensorflow MNIST example with saving metrics in the summaries
|
||||
|
@ -284,18 +273,7 @@ The following table shows images for training containers which are used in the
|
|||
</tr>
|
||||
<tr align="center">
|
||||
<td>
|
||||
<code>docker.io/bytepsimage/mxnet</code>
|
||||
</td>
|
||||
<td>
|
||||
Distributed BytePS example for MXJob
|
||||
</td>
|
||||
<td>
|
||||
<a href="https://github.com/bytedance/byteps/blob/v0.2.5/docker/Dockerfile">Dockerfile</a>
|
||||
</td>
|
||||
</tr>
|
||||
<tr align="center">
|
||||
<td>
|
||||
<code>docker.io/kubeflowkatib/xgboost-lightgbm</code>
|
||||
<code>ghcr.io/kubeflow/katib/xgboost-lightgbm</code>
|
||||
</td>
|
||||
<td>
|
||||
Distributed LightGBM example for XGBoostJob
|
||||
|
@ -328,7 +306,7 @@ The following table shows images for training containers which are used in the
|
|||
</tr>
|
||||
<tr align="center">
|
||||
<td>
|
||||
<code>docker.io/kubeflowkatib/enas-cnn-cifar10-gpu</code>
|
||||
<code>ghcr.io/kubeflow/katib/enas-cnn-cifar10-gpu</code>
|
||||
</td>
|
||||
<td>
|
||||
Keras CIFAR-10 CNN example for ENAS with GPU support
|
||||
|
@ -339,7 +317,7 @@ The following table shows images for training containers which are used in the
|
|||
</tr>
|
||||
<tr align="center">
|
||||
<td>
|
||||
<code>docker.io/kubeflowkatib/enas-cnn-cifar10-cpu</code>
|
||||
<code>ghcr.io/kubeflow/katib/enas-cnn-cifar10-cpu</code>
|
||||
</td>
|
||||
<td>
|
||||
Keras CIFAR-10 CNN example for ENAS with CPU support
|
||||
|
@ -350,7 +328,7 @@ The following table shows images for training containers which are used in the
|
|||
</tr>
|
||||
<tr align="center">
|
||||
<td>
|
||||
<code>docker.io/kubeflowkatib/darts-cnn-cifar10-gpu</code>
|
||||
<code>ghcr.io/kubeflow/katib/darts-cnn-cifar10-gpu</code>
|
||||
</td>
|
||||
<td>
|
||||
PyTorch CIFAR-10 CNN example for DARTS with GPU support
|
||||
|
@ -361,7 +339,7 @@ The following table shows images for training containers which are used in the
|
|||
</tr>
|
||||
<tr align="center">
|
||||
<td>
|
||||
<code>docker.io/kubeflowkatib/darts-cnn-cifar10-cpu</code>
|
||||
<code>ghcr.io/kubeflow/katib/darts-cnn-cifar10-cpu</code>
|
||||
</td>
|
||||
<td>
|
||||
PyTorch CIFAR-10 CNN example for DARTS with CPU support
|
||||
|
|
Before Width: | Height: | Size: 102 KiB |
Before Width: | Height: | Size: 192 KiB |
|
@ -1,185 +0,0 @@
|
|||
# Document about how to add a new algorithm in Katib
|
||||
|
||||
## Implement a new algorithm and use it in Katib
|
||||
|
||||
### Implement the algorithm
|
||||
|
||||
The design of Katib follows the `ask-and-tell` pattern:
|
||||
|
||||
> They often follow a pattern a bit like this: 1. ask for a new set of parameters 1. walk to the Experiment and program in the new parameters 1. observe the outcome of running the Experiment 1. walk back to your laptop and tell the optimizer about the outcome 1. go to step 1
|
||||
|
||||
When an Experiment is created, one algorithm service will be created. Then Katib asks for new sets of parameters via the `GetSuggestions` GRPC call. After that, Katib creates new trials according to the sets and observes the outcome. When the trials are finished, Katib reports the metrics of the finished trials to the algorithm, and asks for another new set of parameters.
|
||||
|
||||
The new algorithm needs to implement `Suggestion` service defined in [api.proto](../pkg/apis/manager/v1beta1/api.proto). One sample algorithm looks like:
|
||||
|
||||
```python
|
||||
from pkg.apis.manager.v1beta1.python import api_pb2
|
||||
from pkg.apis.manager.v1beta1.python import api_pb2_grpc
|
||||
from pkg.suggestion.v1beta1.internal.search_space import HyperParameter, HyperParameterSearchSpace
|
||||
from pkg.suggestion.v1beta1.internal.trial import Trial, Assignment
|
||||
from pkg.suggestion.v1beta1.hyperopt.base_service import BaseHyperoptService
|
||||
from pkg.suggestion.v1beta1.internal.base_health_service import HealthServicer
|
||||
|
||||
|
||||
# Inherit SuggestionServicer and implement GetSuggestions.
|
||||
class HyperoptService(
|
||||
api_pb2_grpc.SuggestionServicer, HealthServicer):
|
||||
def ValidateAlgorithmSettings(self, request, context):
|
||||
# Optional, it is used to validate algorithm settings defined by users.
|
||||
pass
|
||||
def GetSuggestions(self, request, context):
|
||||
# Convert the Experiment in GRPC request to the search space.
|
||||
# search_space example:
|
||||
# HyperParameterSearchSpace(
|
||||
# goal: MAXIMIZE,
|
||||
# params: [HyperParameter(name: param-1, type: INTEGER, min: 1, max: 5, step: 0),
|
||||
# HyperParameter(name: param-2, type: CATEGORICAL, list: cat1, cat2, cat3),
|
||||
# HyperParameter(name: param-3, type: DISCRETE, list: 3, 2, 6),
|
||||
# HyperParameter(name: param-4, type: DOUBLE, min: 1, max: 5, step: )]
|
||||
# )
|
||||
search_space = HyperParameterSearchSpace.convert(request.experiment)
|
||||
# Convert the trials in GRPC request to the trials in algorithm side.
|
||||
# trials example:
|
||||
# [Trial(
|
||||
# assignment: [Assignment(name=param-1, value=2),
|
||||
# Assignment(name=param-2, value=cat1),
|
||||
# Assignment(name=param-3, value=2),
|
||||
# Assignment(name=param-4, value=3.44)],
|
||||
# target_metric: Metric(name="metric-2" value="5643"),
|
||||
# additional_metrics: [Metric(name=metric-1, value=435),
|
||||
# Metric(name=metric-3, value=5643)],
|
||||
# Trial(
|
||||
# assignment: [Assignment(name=param-1, value=3),
|
||||
# Assignment(name=param-2, value=cat2),
|
||||
# Assignment(name=param-3, value=6),
|
||||
# Assignment(name=param-4, value=4.44)],
|
||||
# target_metric: Metric(name="metric-2" value="3242"),
|
||||
# additional_metrics: [Metric(name=metric-1, value=123),
|
||||
# Metric(name=metric-3, value=543)],
|
||||
trials = Trial.convert(request.trials)
|
||||
#--------------------------------------------------------------
|
||||
# Your code here
|
||||
# Implement the logic to generate new assignments for the given current request number.
|
||||
# For example, if request.current_request_number is 2, you should return:
|
||||
# [
|
||||
# [Assignment(name=param-1, value=3),
|
||||
# Assignment(name=param-2, value=cat2),
|
||||
# Assignment(name=param-3, value=3),
|
||||
# Assignment(name=param-4, value=3.22)
|
||||
# ],
|
||||
# [Assignment(name=param-1, value=4),
|
||||
# Assignment(name=param-2, value=cat4),
|
||||
# Assignment(name=param-3, value=2),
|
||||
# Assignment(name=param-4, value=4.32)
|
||||
# ],
|
||||
# ]
|
||||
list_of_assignments = your_logic(search_space, trials, request.current_request_number)
|
||||
#--------------------------------------------------------------
|
||||
# Convert list_of_assignments to a GetSuggestionsReply.
|
||||
return api_pb2.GetSuggestionsReply(
|
||||
trials=Assignment.generate(list_of_assignments)
|
||||
)
|
||||
```
|
||||
|
||||
### Make a GRPC server for the algorithm
|
||||
|
||||
Create a package under [cmd/suggestion](../cmd/suggestion). Then create the main function and Dockerfile. The new GRPC server should serve in port 6789.
|
||||
|
||||
Here is an example: [cmd/suggestion/hyperopt](../cmd/suggestion/hyperopt).
|
||||
Then build the Docker image.
|
||||
|
||||
### Use the algorithm in Katib.
|
||||
|
||||
Update the [Katib config](../manifests/v1beta1/installs/katib-standalone/katib-config.yaml) with the new algorithm entity:
|
||||
|
||||
```diff
|
||||
runtime:
|
||||
suggestions:
|
||||
- algorithmName: random
|
||||
image: docker.io/kubeflowkatib/suggestion-hyperopt:$(KATIB_VERSION)
|
||||
- algorithmName: tpe
|
||||
image: docker.io/kubeflowkatib/suggestion-hyperopt:$(KATIB_VERSION)
|
||||
+ - algorithmName: <new-algorithm-name>
|
||||
+ image: "image built in the previous stage":$(KATIB_VERSION)
|
||||
```
|
||||
|
||||
Learn more about Katib config in the
|
||||
[Kubeflow documentation](https://www.kubeflow.org/docs/components/katib/katib-config/)
|
||||
|
||||
### Contribute the algorithm to Katib
|
||||
|
||||
If you want to contribute the algorithm to Katib, you could add unit test and/or
|
||||
e2e test for it in the CI and submit a PR.
|
||||
|
||||
#### Unit Test
|
||||
|
||||
Here is an example [test_hyperopt_service.py](../test/unit/v1beta1/suggestion/test_hyperopt_service.py):
|
||||
|
||||
```python
|
||||
import grpc
|
||||
import grpc_testing
|
||||
import unittest
|
||||
|
||||
from pkg.apis.manager.v1beta1.python import api_pb2_grpc
|
||||
from pkg.apis.manager.v1beta1.python import api_pb2
|
||||
|
||||
from pkg.suggestion.v1beta1.hyperopt.service import HyperoptService
|
||||
|
||||
class TestHyperopt(unittest.TestCase):
|
||||
def setUp(self):
|
||||
servicers = {
|
||||
api_pb2.DESCRIPTOR.services_by_name['Suggestion']: HyperoptService()
|
||||
}
|
||||
|
||||
self.test_server = grpc_testing.server_from_dictionary(
|
||||
servicers, grpc_testing.strict_real_time())
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
```
|
||||
|
||||
You can set up the GRPC server using `grpc_testing`, then define your own test cases.
|
||||
|
||||
#### E2E Test (Optional)
|
||||
|
||||
E2E tests help Katib verify that the algorithm works well.
|
||||
Follow the steps below to add your algorithm (Suggestion) to the Katib CI
|
||||
(replace `<name>` with your Suggestion name):
|
||||
|
||||
1. Submit a PR to add a new ECR private registry to the AWS
|
||||
[`ECR_Private_Registry_List`](https://github.com/kubeflow/testing/blob/master/aws/IaC/CDK/test-infra/config/static_config/ECR_Resources.py#L18).
|
||||
Registry name should follow the pattern: `katib/v1beta1/suggestion-<name>`
|
||||
|
||||
1. Create a new Experiment YAML in the [examples/v1beta1](../examples/v1beta1)
|
||||
with the new algorithm.
|
||||
|
||||
1. Update [`setup-katib.sh`](../test/e2e/v1beta1/scripts/setup-katib.sh)
|
||||
script to modify `katib-config.yaml` with the new test Suggestion image name.
|
||||
For example:
|
||||
|
||||
```sh
|
||||
sed -i -e "s@docker.io/kubeflowkatib/suggestion-<name>@${ECR_REGISTRY}/${REPO_NAME}/v1beta1/suggestion-<name>@" ${CONFIG_PATCH}
|
||||
```
|
||||
|
||||
1. Update the following variables in [`argo_workflow.py`](../test/e2e/v1beta1/argo_workflow.py):
|
||||
|
||||
- [`KATIB_IMAGES`](../test/e2e/v1beta1/argo_workflow.py#L43) with your Suggestion Dockerfile location:
|
||||
|
||||
```diff
|
||||
. . .
|
||||
"suggestion-goptuna": "cmd/suggestion/goptuna/v1beta1/Dockerfile",
|
||||
"suggestion-optuna": "cmd/suggestion/optuna/v1beta1/Dockerfile",
|
||||
+ "suggestion-<name>": "cmd/suggestion/<name>/v1beta1/Dockerfile",
|
||||
. . .
|
||||
```
|
||||
|
||||
- [`KATIB_EXPERIMENTS`](../test/e2e/v1beta1/argo_workflow.py#L69) with your Experiment YAML location:
|
||||
|
||||
```diff
|
||||
. . .
|
||||
"multivariate-tpe": "examples/v1beta1/hp-tuning/multivariate-tpe.yaml",
|
||||
"cmaes": "examples/v1beta1/hp-tuning/cma-es.yaml",
|
||||
+ "<algorithm-name>": "examples/v1beta1/hp-tuning/<algorithm-name>.yaml",
|
||||
. . .
|
||||
```
|
|
@ -1,4 +1,4 @@
|
|||
# Support custom CRD in Trial Job proposal
|
||||
# KEP-1214: Support custom CRD in Trial Job proposal
|
||||
|
||||
<!-- START doctoc generated TOC please keep comment here to allow auto update -->
|
||||
<!-- DON'T EDIT THIS SECTION, INSTEAD RE-RUN doctoc TO UPDATE -->
|
||||
|
@ -180,7 +180,7 @@ SucceededCondition: Succeeded
|
|||
Previously, we had problems with Istio sidecar containers,
|
||||
check [kubeflow/issue#1081](https://github.com/kubeflow/kubeflow/issues/4742).
|
||||
In some cases, it is unable to properly download datasets in training pod.
|
||||
It was fixed by adding annotation `sidecar.istio.io/inject: false` to appropriate Trial job in Katib controller.
|
||||
It was fixed by adding label `sidecar.istio.io/inject: false` to appropriate Trial job in Katib controller.
|
||||
|
||||
Various CRD can have unified design and it is hard to understand where annotation must be specified
|
||||
to disable Istio injection for the running pods.
|
|
@ -1,4 +1,4 @@
|
|||
# Conformance Test for AutoML and Training Working Group
|
||||
# KEP-2044: Conformance Test for AutoML and Training Working Group
|
||||
|
||||
Andrey Velichkevich ([@andreyvelich](https://github.com/andreyvelich))
|
||||
Johnu George ([@johnugeorge](https://github.com/johnugeorge))
|
||||
|
@ -61,7 +61,7 @@ the 3 category of tests:
|
|||
|
||||
## Design for the CRD-based tests
|
||||
|
||||

|
||||

|
||||
|
||||
The design is similar to the KFP conformance program for the API-based tests.
|
||||
|
Before Width: | Height: | Size: 77 KiB After Width: | Height: | Size: 77 KiB |
|
@ -0,0 +1,240 @@
|
|||
# KEP-2339: HyperParameter Optimization API for LLM Fine-Tuning
|
||||
|
||||
- [HyperParameter Optimization API for LLM Fine-Tuning](#hyperparameter-optimization-api-for-llm-fine-tuning)
|
||||
- [Links](#links)
|
||||
- [Motivation](#motivation)
|
||||
- [Goals](#goals)
|
||||
- [Non-Goals](#non-goals)
|
||||
- [Design for API](#design-for-api)
|
||||
- [Example](#example)
|
||||
- [Implementation](#implementation)
|
||||
|
||||
## Links
|
||||
|
||||
- [katib/issues#2291 (Tuning API in Katib for LLMs)](https://github.com/kubeflow/katib/issues/2291)
|
||||
|
||||
## Motivation
|
||||
|
||||
The rapid advancements and growing popularity of Large Language Models (LLMs) have driven an increased need for effective LLMOps in Kubernetes environments. To address this, we developed a [train API](https://www.kubeflow.org/docs/components/training/user-guides/fine-tuning/) within the Training Python SDK, simplifying the process of fine-tuning LLMs using distributed PyTorchJob workers. However, hyperparameter optimization remains a crucial yet labor-intensive task for enhancing model performance. Automating this tuning process through a dedicated API will facilitate efficient and scalable exploration of hyperparameters, ultimately improving model performance and reducing manual effort.
|
||||
|
||||
## Goals
|
||||
|
||||
Our goal is to develop a high-level API for tuning hyperparameters of LLMs that simplifies the process of hyperparameter optimization in Kubernetes. This API will seamlessly integrate with external platforms like HuggingFace and S3 for importing pretrained models and datasets. By specifying parameters for the training objective, trial configurations, and PyTorch worker configurations, the API will automate the creation of experiments and execution of trials. This abstraction of Kubernetes infrastructure complexities will enable data scientists to optimize hyperparameters efficiently and effectively.
|
||||
|
||||
## Non-Goals
|
||||
|
||||
1. Incorporate early stopping strategy into the API to optimize training efficiency.
|
||||
2. Expand support for distributed training in frameworks beyond PyTorch by leveraging their distributed training capabilities.
|
||||
3. Support adding custom providers through configmap or CRD approach to enhance flexibility.
|
||||
4. Enable users to deploy tuned models for inference within their applications or seamlessly integrate them into existing NLP pipelines for specialized tasks.
|
||||
|
||||
## Design for API
|
||||
|
||||

|
||||
|
||||
```python
|
||||
import kubeflow.katib as katib
|
||||
from kubeflow.katib import KatibClient
|
||||
|
||||
class KatibClient(object):
|
||||
|
||||
def tune(
|
||||
self,
|
||||
name: str,
|
||||
namespace: Optional[str] = None,
|
||||
model_provider_parameters: Optional[HuggingFaceModelParams] = None,
|
||||
dataset_provider_parameters: Optional[Union[HuggingFaceDatasetParams, S3DatasetParams]] = None,
|
||||
trainer_parameters: Union[HuggingFaceTrainerParams, Dict[str, Any]] = None,
|
||||
storage_config: Dict[str, Optional[Union[str, List[str]]]] = {
|
||||
"size": constants.PVC_DEFAULT_SIZE,
|
||||
"storage_class": None,
|
||||
"access_modes": constants.PVC_DEFAULT_ACCESS_MODES,
|
||||
},
|
||||
objective: Optional[Callable] = None,
|
||||
base_image: Optional[str] = None,
|
||||
algorithm_name: str = "random",
|
||||
algorithm_settings: Union[dict, List[models.V1beta1AlgorithmSetting], None] = None,
|
||||
objective_metric_name: str = "eval_accuracy",
|
||||
additional_metric_names: List[str] = [],
|
||||
objective_type: str = "maximize",
|
||||
objective_goal: float = None,
|
||||
max_trial_count: int = None,
|
||||
parallel_trial_count: int = None,
|
||||
max_failed_trial_count: int = None,
|
||||
resources_per_trial: Union[dict, client.V1ResourceRequirements, types.TrainerResources, None] = None,
|
||||
retain_trials: bool = False,
|
||||
env_per_trial: Optional[Union[Dict[str, str], List[Union[client.V1EnvVar, client.V1EnvFromSource]]]] = None,
|
||||
packages_to_install: List[str] = None,
|
||||
pip_index_url: str = "https://pypi.org/simple",
|
||||
):
|
||||
"""
|
||||
Initiates a hyperparameter tuning experiment in Katib.
|
||||
Model, dataset and parameters can be configured using one of the following options:
|
||||
- Using the Storage Initializer: Specify `model_provider_parameters`, `dataset_provider_parameters`, and `trainer_parameters`. This option downloads models and datasets from external platforms like HuggingFace and S3, and utilizes `Trainer.train()` in HuggingFace to train the model.
|
||||
- Defining a custom objective function: Specify the `objective` parameter to define your own objective function, and use the `base_image` parameter to execute the objective function.
|
||||
|
||||
Parameters:
|
||||
- name: Name for the experiment.
|
||||
- namespace: Namespace for the experiment. Defaults to the namespace of the 'KatibClient' object.
|
||||
- model_provider_parameters: Parameters for providing the model. Compatible with model providers like HuggingFace.
|
||||
- dataset_provider_parameters: Parameters for providing the dataset. Compatible with dataset providers like HuggingFace or S3.
|
||||
- trainer_parameters: Parameters for configuring the training process, including settings for hyperparameters search space.
|
||||
- storage_config: Configuration for Storage Initializer PVC to download pre-trained model and dataset.
|
||||
- objective: Objective function that Katib uses to train the model.
|
||||
- base_image: Image to use when executing the objective function.
|
||||
- algorithm_name: Tuning algorithm name (e.g., 'random', 'bayesian').
|
||||
- algorithm_settings: Settings for the tuning algorithm.
|
||||
- objective_metric_name: Primary metric to optimize.
|
||||
- additional_metric_names: List of additional metrics to collect.
|
||||
- objective_type: Optimization direction for the objective metric, "minimize" or "maximize".
|
||||
- objective_goal: Desired value of the objective metric.
|
||||
- max_trial_count: Maximum number of trials to run.
|
||||
- parallel_trial_count: Number of trials to run in parallel.
|
||||
- max_failed_trial_count: Maximum number of allowed failed trials.
|
||||
- resources_per_trial: Resources assigned to each trial, which can be specified using one of the following options:
|
||||
- Non-distributed Training: Specify a kubernetes.client.V1ResourceRequirements object or a dictionary that includes one or more of the following keys: `cpu`, `memory`, or `gpu` (other keys will be ignored).
|
||||
- Distributed Training in Pytorch: Specify a types.TrainerResources, which includes the following parameters:
|
||||
- num_workers: Number of PyTorchJob workers.
|
||||
- num_procs_per_worker: Number of processes per PyTorchJob worker.
|
||||
- resources_per_worker: Resources assigned to each PyTorchJob worker container, specified as either a kubernetes.client.V1ResourceRequirements object or a dictionary that includes one or more of the following keys: `cpu`, `memory`, or `gpu` (other keys will be ignored).
|
||||
- retain_trials: Whether to retain trial resources after completion.
|
||||
- env_per_trial: Environment variables for worker containers.
|
||||
- packages_to_install: Additional Python packages to install.
|
||||
- pip_index_url: URL of the PyPI index for installing packages.
|
||||
"""
|
||||
pass # Implementation logic for initiating the experiment
|
||||
```
|
||||
|
||||
### Example
|
||||
|
||||
```python
|
||||
import kubeflow.katib as katib
|
||||
from kubeflow.katib import KatibClient
|
||||
|
||||
import transformers
|
||||
from peft import LoraConfig
|
||||
|
||||
from kubeflow.storage_initializer.hugging_face import (
|
||||
HuggingFaceModelParams,
|
||||
HuggingFaceDatasetParams,
|
||||
HuggingFaceTrainerParams,
|
||||
)
|
||||
|
||||
# Create a Katib client.
|
||||
cl = KatibClient(namespace="kubeflow")
|
||||
|
||||
# Run the tuning experiment.
|
||||
exp_name = "llm-experiment1"
|
||||
cl.tune(
|
||||
name = exp_name,
|
||||
# BERT model URI and type of Transformer to train it.
|
||||
model_provider_parameters = HuggingFaceModelParams(
|
||||
model_uri = "hf://google-bert/bert-base-cased",
|
||||
transformer_type = transformers.AutoModelForSequenceClassification,
|
||||
),
|
||||
# Use 3000 samples from Yelp dataset.
|
||||
dataset_provider_parameters = HuggingFaceDatasetParams(
|
||||
repo_id = "yelp_review_full",
|
||||
split = "train[:3000]",
|
||||
),
|
||||
# Specify HuggingFace Trainer parameters.
|
||||
trainer_parameters = HuggingFaceTrainerParams(
|
||||
training_parameters = transformers.TrainingArguments(
|
||||
output_dir = "test_tune_api",
|
||||
save_strategy = "no",
|
||||
learning_rate = katib.search.double(min=1e-05, max=5e-05),
|
||||
num_train_epochs=3,
|
||||
),
|
||||
# Set LoRA config to reduce number of trainable model parameters.
|
||||
lora_config = LoraConfig(
|
||||
r = katib.search.int(min=8, max=32),
|
||||
lora_alpha = 8,
|
||||
lora_dropout = 0.1,
|
||||
bias = "none",
|
||||
),
|
||||
),
|
||||
objective_metric_name = "train_loss",
|
||||
objective_type = "minimize",
|
||||
algorithm_name = "random",
|
||||
max_trial_count = 10,
|
||||
parallel_trial_count = 2,
|
||||
resources_per_trial={
|
||||
"gpu": "2",
|
||||
"cpu": "4",
|
||||
"memory": "10G",
|
||||
},
|
||||
# For distributed training, please specify `resources_per_trial` using `types.TrainerResources` (To be implemented).
|
||||
)
|
||||
|
||||
# Wait until Katib Experiment is complete
|
||||
cl.wait_for_experiment_condition(name=exp_name)
|
||||
|
||||
# Get the best hyperparameters.
|
||||
print(cl.get_optimal_hyperparameters(exp_name))
|
||||
```
|
||||
|
||||
## Implementation
|
||||
|
||||
By passing the specified parameters, this API will automate hyperparameter optimization for LLMs. The implementation will focus on the following aspects:
|
||||
|
||||
**Model and Dataset Management**: We will leverage the [storage_initializer](https://github.com/kubeflow/training-operator/tree/master/sdk/python/kubeflow/storage_initializer) from the Training Python SDK for seamless integration of pretrained models and datasets from platforms like HuggingFace and S3. This component manages downloading and storing pretrained models and datasets via a PersistentVolumeClaim (PVC), which is shared across containers, ensuring efficient access to the pretrained model and dataset without redundant downloads.
|
||||
|
||||
**Hyperparameter Configuration**: Users specify training parameters and the hyperparameters to be optimized within `trainer_parameters`. The API will first traverse `trainer_parameters.training_parameters` and `trainer_parameters.lora_config` to identify tunable hyperparameters and set up their values for the Experiment and Trials. These parameters are then passed as `args` to the container spec of workers.
|
||||
|
||||
```python
|
||||
# Traverse and set up hyperparameters
|
||||
input_params = {}
|
||||
experiment_params = []
|
||||
trial_params = []
|
||||
|
||||
training_args = trainer_parameters.training_parameters
|
||||
for p_name, p_value in training_args.to_dict().items():
|
||||
if not hasattr(training_args, p_name):
|
||||
logger.warning(f"Training parameter {p_name} is not supported by the current transformer.")
|
||||
continue
|
||||
if isinstance(p_value, models.V1beta1ParameterSpec):
|
||||
value = f"${{trialParameters.{p_name}}}"
|
||||
setattr(training_args, p_name, value)
|
||||
p_value.name = p_name
|
||||
experiment_params.append(p_value)
|
||||
trial_params.append(models.V1beta1TrialParameterSpec(name=p_name, reference=p_name))
|
||||
elif p_value is not None:
|
||||
value = type(old_attr)(p_value)
|
||||
setattr(training_args, p_name, value)
|
||||
input_params['training_args'] = training_args
|
||||
|
||||
# Note: Repeat similar logic for `lora_config`
|
||||
|
||||
# create container spec of worker
|
||||
container_spec = client.V1Container(
|
||||
...
|
||||
args=[
|
||||
"--model_uri",
|
||||
model_provider_parameters.model_uri,
|
||||
"--transformer_type",
|
||||
model_provider_parameters.transformer_type.__name__,
|
||||
"--model_dir",
|
||||
"REPLACE_WITH_ACTUAL_MODEL_PATH",
|
||||
"--dataset_dir",
|
||||
"REPLACE_WITH_ACTUAL_DATASET_PATH",
|
||||
"--lora_config",
|
||||
json.dumps(input_params['lora_config'].__dict__, cls=utils.SetEncoder),
|
||||
"--training_parameters",
|
||||
json.dumps(input_params['training_args'].to_dict()),
|
||||
],
|
||||
...
|
||||
)
|
||||
```
|
||||
|
||||
**Hyperparameter Optimization**: This API will create an Experiment that defines the search space for identified tunable hyperparameters, the objective metric, optimization algorithm, etc. The Experiment will orchestrate the hyperparameter tuning process, generating Trials for each configuration. Each Trial will be implemented as a Kubernetes PyTorchJob, with the `trialTemplate` specifying the exact values for hyperparameters. The `trialTemplate` will also define master and worker containers, facilitating effective resource distribution and parallel execution of Trials. Trial results will then be fed back to the Experiment, which will evaluate the outcomes to identify the optimal set of hyperparameters.
|
||||
|
||||
**Dependencies Update**: To reuse existing assets from the Training Python SDK and integrate packages from HuggingFace, dependencies will be added to the `setup.py` of the Katib Python SDK as follows:
|
||||
|
||||
```python
|
||||
setuptools.setup(
|
||||
...  # Configurations of the package
|
||||
extras_require={
|
||||
"huggingface": ["kubeflow-training[huggingface]==1.8.0rc1"],
|
||||
},
|
||||
)
|
||||
```
|
After Width: | Height: | Size: 302 KiB |
|
@ -0,0 +1,185 @@
|
|||
# KEP-2340: Push-based Metrics Collection Proposal
|
||||
|
||||
## Links
|
||||
|
||||
- [katib/issues#577 ([Enhancement Request] Metrics Collector Push-based Implementation)](https://github.com/kubeflow/katib/issues/577)
|
||||
|
||||
## Motivation
|
||||
|
||||
[Katib](https://github.com/kubeflow/katib) is a Kubernetes-native project for automated machine learning (AutoML). It can not only tune hyperparameters of applications written in any language and natively supports many ML frameworks, but also supports features like early stopping and neural architecture search.
|
||||
|
||||
In the procedure of tuning hyperparameters, Metrics Collector, which is implemented as a sidecar container attached to each training container in the [current design](https://github.com/kubeflow/katib/blob/master/docs/proposals/metrics-collector.md), will collect training logs from Trials once the training is complete. Then, the Metrics Collector will parse training logs to get appropriate metrics like accuracy or loss and pass the evaluation results to the HyperParameter tuning algorithm.
|
||||
|
||||
However, the current implementation of the Metrics Collector is pull-based, raising some [design problems](https://github.com/kubeflow/training-operator/issues/722#issuecomment-405669269) such as determining the frequency at which we scrape the metrics, performance issues like the overhead caused by too many sidecar containers, and restrictions on development environments, which must support sidecar containers. Thus, we should implement a new API for the Katib Python SDK to offer users a push-based way to store metrics directly into the Katib DB and resolve the issues raised by pull-based metrics collection.
|
||||
|
||||

|
||||
|
||||
Fig.1 Architecture of the new design
|
||||
|
||||
### Goals
|
||||
|
||||
1. **A new parameter in Python SDK function `tune`**: allow users to specify the method of collecting metrics (push-based/pull-based).
|
||||
|
||||
2. **A new interface `report_metrics` in Python SDK**: push the metrics to Katib DB directly.
|
||||
|
||||
3. The final metrics of worker pods should be **pushed to Katib DB directly** in the push mode of metrics collection.
|
||||
|
||||
### Non-Goals
|
||||
|
||||
1. Implement authentication model for Katib DB to push metrics.
|
||||
|
||||
2. Support pushing data to different types of storage systems (Prometheus, self-defined interfaces, etc.).
|
||||
|
||||
## API
|
||||
|
||||
### New Parameter in Python SDK Function `tune`
|
||||
|
||||
We decided to add `metrics_collector_config` to `tune` function in Python SDK.
|
||||
|
||||
```Python
|
||||
def tune(
|
||||
self,
|
||||
name: str,
|
||||
objective: Callable,
|
||||
parameters: Dict[str, Any],
|
||||
base_image: str = constants.BASE_IMAGE_TENSORFLOW,
|
||||
namespace: Optional[str] = None,
|
||||
env_per_trial: Optional[Union[Dict[str, str], List[Union[client.V1EnvVar, client.V1EnvFromSource]]]] = None,
|
||||
algorithm_name: str = "random",
|
||||
algorithm_settings: Union[dict, List[models.V1beta1AlgorithmSetting], None] = None,
|
||||
objective_metric_name: str = None,
|
||||
additional_metric_names: List[str] = [],
|
||||
objective_type: str = "maximize",
|
||||
objective_goal: float = None,
|
||||
max_trial_count: int = None,
|
||||
parallel_trial_count: int = None,
|
||||
max_failed_trial_count: int = None,
|
||||
resources_per_trial: Union[dict, client.V1ResourceRequirements, None] = None,
|
||||
retain_trials: bool = False,
|
||||
packages_to_install: List[str] = None,
|
||||
pip_index_url: str = "https://pypi.org/simple",
|
||||
# The newly added parameter metrics_collector_config.
|
||||
# It specifies the config of metrics collector, for example,
|
||||
# metrics_collector_config={"kind": "Push"},
|
||||
metrics_collector_config: Dict[str, Any] = {"kind": "StdOut"},
|
||||
)
|
||||
```
|
||||
|
||||
### New Interface `report_metrics` in Python SDK
|
||||
|
||||
```Python
|
||||
"""Push Metrics Directly to Katib DB
|
||||
|
||||
[!!!] Trial name should always be passed into Katib Trials as env variable `KATIB_TRIAL_NAME`.
|
||||
|
||||
Args:
|
||||
metrics: Dict of metrics pushed to Katib DB.
|
||||
For example, `metrics = {"loss": 0.01, "accuracy": 0.99}`.
|
||||
db_manager_address: Address for the Katib DB Manager in this format: `ip-address:port`.
|
||||
timeout: Optional, gRPC API Server timeout in seconds to report metrics.
|
||||
|
||||
Raises:
|
||||
RuntimeError: Unable to push Trial metrics to Katib DB.
|
||||
"""
|
||||
def report_metrics(
|
||||
metrics: Dict[str, Any],
|
||||
db_manager_address: str = constants.DEFAULT_DB_MANAGER_ADDRESS,
|
||||
timeout: int = constants.DEFAULT_TIMEOUT,
|
||||
)
|
||||
```
|
||||
|
||||
### A Simple Example:
|
||||
|
||||
```Python
|
||||
import kubeflow.katib as katib
|
||||
|
||||
# Step 1. Create an objective function with push-based metrics collection.
|
||||
def objective(parameters):
|
||||
# Import required packages.
|
||||
import kubeflow.katib as katib
|
||||
# Calculate objective function.
|
||||
result = 4 * int(parameters["a"]) - float(parameters["b"]) ** 2
|
||||
# Push metrics to Katib DB.
|
||||
katib.report_metrics({"result": result})
|
||||
|
||||
# Step 2. Create HyperParameter search space.
|
||||
parameters = {
|
||||
"a": katib.search.int(min=10, max=20),
|
||||
"b": katib.search.double(min=0.1, max=0.2)
|
||||
}
|
||||
|
||||
# Step 3. Create Katib Experiment with 12 Trials and 2 GPUs per Trial.
|
||||
katib_client = katib.KatibClient(namespace="kubeflow")
|
||||
name = "tune-experiment"
|
||||
katib_client.tune(
|
||||
name=name,
|
||||
objective=objective,
|
||||
parameters=parameters,
|
||||
objective_metric_name="result",
|
||||
max_trial_count=12,
|
||||
resources_per_trial={"gpu": "2"},
|
||||
metrics_collector_config={"kind": "Push"},
|
||||
)
|
||||
|
||||
# Step 4. Get the best HyperParameters.
|
||||
print(katib_client.get_optimal_hyperparameters(name))
|
||||
```
|
||||
|
||||
## Implementation
|
||||
|
||||
### Add New Parameter in `tune`
|
||||
|
||||
As mentioned above, we decided to add `metrics_collector_config` to the tune function in Python SDK. Also, we have some changes to be made:
|
||||
|
||||
1. Configure the way of metrics collection: set the configuration `spec.metricsCollectorSpec.collector.kind` (which specifies the way of metrics collection) to `Push`.
|
||||
|
||||
2. Rename metrics collector from `None` to `Push`: It's not correct to call push-based metrics collection `None`. We should modify related code to rename it.
|
||||
|
||||
3. Write env variables into Trial spec: set `KATIB_TRIAL_NAME` for `report_metrics` function to dial db manager.
|
||||
|
||||
### New Interface `report_metrics` in Python SDK
|
||||
|
||||
We decide to implement this function to push metrics directly to Katib DB with the help of gRPC. Trial name should always be passed into Katib Trials (and then into this function) as env variable `KATIB_TRIAL_NAME`.
|
||||
|
||||
Also, the function is supposed to be implemented as **global function** because it is called in the user container.
|
||||
|
||||
Steps:
|
||||
|
||||
1. Wrap metrics into `katib_api_pb2.ReportObservationLogRequest`:
|
||||
|
||||
Firstly, convert metrics (in dict format) into `katib_api_pb2.ReportObservationLogRequest` type for the following grpc call, referring to https://github.com/kubeflow/katib/blob/master/pkg/apis/manager/v1beta1/gen-doc/api.md#reportobservationlogrequest
|
||||
|
||||
2. Dial Katib DBManager Service
|
||||
|
||||
We'll create a DBManager Stub and make a grpc call to report metrics to Katib DB.
|
||||
|
||||
### Compatibility Changes in Trial Controller
|
||||
|
||||
We need to make appropriate changes in the Trial controller to make sure we insert an unavailable value into Katib DB if the user accidentally does not report metrics. The current implementation handles unavailable metrics in:
|
||||
|
||||
```Golang
|
||||
// If observation is empty metrics collector doesn't finish.
|
||||
// For early stopping metrics collector are reported logs before Trial status is changed to EarlyStopped.
|
||||
if jobStatus.Condition == trialutil.JobSucceeded && instance.Status.Observation == nil {
|
||||
logger.Info("Trial job is succeeded but metrics are not reported, reconcile requeued")
|
||||
return errMetricsNotReported
|
||||
}
|
||||
```
|
||||
|
||||
1. Distinguish pull-based and push-based metrics collection
|
||||
|
||||
We decide to add an if-else statement in the code above to distinguish pull-based and push-based metrics collection. In the push-based collection, the Trial does not need to be requeued. Instead, we'll insert an unavailable value to Katib DB.
|
||||
|
||||
2. Update the status of Trial to `MetricsUnavailable`
|
||||
|
||||
In the current implementation of pull-based metrics collection, Trials will be re-queued when the metrics collector finds the `.Status.Observation` is empty. However, it's not compatible with push-based metrics collection because the forgotten metrics won't be reported in the new round of reconciliation. So, we need to update its status in the function `UpdateTrialStatusCondition` in accommodation with the pull-based metrics collection. The following code will be inserted into lines before [trial_controller_util.go#L69](https://github.com/kubeflow/katib/blob/7959ffd54851216dbffba791e1da13c8485d1085/pkg/controller.v1beta1/trial/trial_controller_util.go#L69)
|
||||
|
||||
```Golang
|
||||
else if instance.Spec.MetricsCollector.Collector.Kind == "Push" {
|
||||
... // Update the status of this Trial to `MetricsUnavailable` and output the reason.
|
||||
}
|
||||
```
|
||||
|
||||
### Collection of Final Metrics
|
||||
|
||||
The final metrics of worker pods should be pushed to Katib DB directly in the push mode of metrics collection.
|
After Width: | Height: | Size: 53 KiB |
|
@ -0,0 +1,169 @@
|
|||
# KEP-2374: Proposal for Supporting various parameter distributions in Katib
|
||||
|
||||
## Summary
|
||||
The goal of this project is to enhance the existing Katib Experiment APIs to support various parameter distributions such as uniform, log-uniform, and qlog-uniform. Then extend the suggestion services to be able to configure distributions for search space using libraries provided in each framework.
|
||||
|
||||
## Motivation
|
||||
Currently, [Katib](https://github.com/kubeflow/katib) is limited to supporting only uniform distribution for integer, float, and categorical hyperparameters. By introducing additional distributions, Katib will become more flexible and powerful in conducting hyperparameter optimization tasks.
|
||||
|
||||
A Data Scientist requires Katib to support multiple hyperparameter distributions, such as log-uniform, normal, and log-normal, in addition to the existing uniform distribution. This enhancement is crucial for more flexible and precise hyperparameter optimization. For instance, learning rates often benefit from a log-uniform distribution because small values can significantly impact performance. Similarly, normal distributions are useful for parameters that are expected to vary around a central value.
|
||||
|
||||
### Goals
|
||||
- Add `Distribution` field to `FeasibleSpace` alongside `ParameterType`.
|
||||
- Support for the log-uniform, normal, and log-normal Distributions.
|
||||
- Update the Experiment and gRPC API to support `Distribution`.
|
||||
- Update logic to handle the new parameter distributions for each suggestion service (e.g., Optuna, Hyperopt).
|
||||
- Extend the Python SDK to support the new `Distribution` field.
|
||||
### Non-Goals
|
||||
- This proposal does not aim to create a new version for CRD APIs.
|
||||
- This proposal does not aim to make the necessary Katib UI changes.
|
||||
- No changes will be made to the core optimization algorithms beyond supporting new distributions.
|
||||
|
||||
## Proposal
|
||||
|
||||
### Parameter Distribution Comparison Table
|
||||
|
||||
| Distribution Type | Hyperopt | Optuna | Ray Tune | Nevergrad |
|
||||
|-------------------------------|-----------------------|-------------------------------------------------|-----------------------|---------------------------------------------|
|
||||
| **Uniform Continuous** | `hp.uniform` | `FloatDistribution` | `tune.uniform` | `p.Scalar` with uniform transformation |
|
||||
| **Quantized Uniform** | `hp.quniform` | `DiscreteUniformDistribution` (deprecated) | `tune.quniform` | `p.Scalar` with uniform and step specified |
|
||||
| **Log Uniform** | `hp.loguniform` | `LogUniformDistribution` (deprecated) | `tune.loguniform` | `p.Log` with uniform transformation |
|
||||
| **Uniform Integer** | `hp.randint` or quantized distributions with step size `q` set to 1 | `IntDistribution` | `tune.randint` | `p.Scalar` with integer transformation |
|
||||
| **Categorical** | `hp.choice` | `CategoricalDistribution` | `tune.choice` | `p.Choice` |
|
||||
| **Quantized Log Uniform** | `hp.qloguniform` | Custom Implementation | `tune.qloguniform` | `p.Log` with uniform and step specified |
|
||||
| **Normal** | `hp.normal` | (Not directly supported) | `tune.randn` | (Not directly supported) |
|
||||
| **Quantized Normal** | `hp.qnormal` | (Not directly supported) | `tune.qrandn` | (Not directly supported) |
|
||||
| **Log Normal** | `hp.lognormal` | (Not directly supported) | (Use custom transformation in `tune.randn`) | (Not directly supported) |
|
||||
| **Quantized Log Normal** | `hp.qlognormal` | (Not directly supported) | (Use custom transformation in `tune.qrandn`) | (Not directly supported) |
|
||||
| **Quantized Integer** | `hp.quniformint` | `IntUniformDistribution` (deprecated) | | `p.Scalar` with integer and step specified |
|
||||
| **Log Integer** | | `IntLogUniformDistribution` (deprecated) | `tune.lograndint` | `p.Scalar` with log-integer transformation |
|
||||
|
||||
|
||||
- Note:
|
||||
In `Nevergrad`, parameter types like `p.Scalar`, `p.Log`, and `p.Choice` are mapped to corresponding `Hyperopt` search space definitions like `hp.uniform`, `hp.loguniform`, and `hp.choice` using internal functions to convert parameter bounds and distributions.
|
||||
|
||||
## API Design
|
||||
### FeasibleSpace
|
||||
Feasible space for optimization.
|
||||
Int and Double type use Max/Min.
|
||||
Discrete and Categorical type use List.
|
||||
|
||||
|
||||
| Field | Type | Label | Description |
|
||||
| ----- | ---- | ----- | ----------- |
|
||||
| max | [string](#string) | | Max Value |
|
||||
| min | [string](#string) | | Minimum Value |
|
||||
| list | [string](#string) | repeated | List of Values. |
|
||||
| step | [string](#string) | | Step for double or int parameter or q for quantization|
|
||||
| distribution | [Distribution](#api-v1-beta1-Distribution) | | Type of the Distribution. |
|
||||
|
||||
|
||||
<a name="api-v1-beta1-Distribution"></a>
|
||||
|
||||
### Distribution
|
||||
- Types of value for HyperParameter Distributions.
|
||||
- We add the `distribution` field to represent the hyperparameters search space rather than [`ParameterType`](https://github.com/kubeflow/katib/blob/2c575227586ff1c03cf6b5190d066e2f3061a404/pkg/apis/controller/experiments/v1beta1/experiment_types.go#L199-L207).
|
||||
- The `distribution` allows users to configure more granular search space customizations.
|
||||
- In this enhancement, we would propose the following 4 distributions:
|
||||
|
||||
| Name | Number | Description |
|
||||
| ---- | ------ | ----------- |
|
||||
| UNIFORM | 0 | Continuous uniform distribution. Samples values evenly between a minimum and maximum value. Use "Max/Min". Use "Step" for `q`. |
|
||||
| LOGUNIFORM | 1 | Samples values such that their logarithm is uniformly distributed. Use "Max/Min". Use "Step" for `q`. |
|
||||
| NORMAL | 2 | Normal (Gaussian) distribution type. Samples values according to a normal distribution characterized by a mean and standard deviation. Use "Max/Min". Use "Step" for `q`. |
|
||||
| LOGNORMAL | 3 | Log-normal distribution type. Samples values such that their logarithm is normally distributed. Use "Max/Min". Use "Step" for `q`. |
|
||||
|
||||
|
||||
## Experiment API changes
|
||||
Scope: `pkg/apis/controller/experiments/v1beta1/experiment_types.go`
|
||||
|
||||
```go
|
||||
type ParameterSpec struct {
|
||||
Name string `json:"name,omitempty"`
|
||||
ParameterType ParameterType `json:"parameterType,omitempty"`
|
||||
FeasibleSpace FeasibleSpace `json:"feasibleSpace,omitempty"`
|
||||
}
|
||||
```
|
||||
- Adding new field `Distribution` to `FeasibleSpace`
|
||||
|
||||
- The `Step` field can be used to define quantization steps for uniform or log-uniform distributions, effectively covering q-quantization requirements.
|
||||
|
||||
Updated `FeasibleSpace` struct
|
||||
```diff
|
||||
type FeasibleSpace struct {
|
||||
Max string `json:"max,omitempty"`
|
||||
Min string `json:"min,omitempty"`
|
||||
List []string `json:"list,omitempty"`
|
||||
Step string `json:"step,omitempty"` // Step can be used to define q-quantization
|
||||
+ Distribution Distribution `json:"distribution,omitempty"` // Added Distribution field
|
||||
}
|
||||
```
|
||||
- New Field Description: `Distribution`
|
||||
- Type: `Distribution`
|
||||
- Description: The Distribution field specifies the type of statistical distribution to be applied to the parameter. This allows the definition of various distributions, such as uniform, log-uniform, or other supported types.
|
||||
|
||||
- Defining `Distribution` type
|
||||
```go
|
||||
type Distribution string
|
||||
|
||||
const (
|
||||
DistributionUniform Distribution = "uniform"
|
||||
DistributionLogUniform Distribution = "logUniform"
|
||||
DistributionNormal Distribution = "normal"
|
||||
DistributionLogNormal Distribution = "logNormal"
|
||||
)
|
||||
```
|
||||
|
||||
## gRPC API changes
|
||||
Scope: `pkg/apis/manager/v1beta1/api.proto`
|
||||
- Add the `Distribution` field to the `FeasibleSpace` message
|
||||
```diff
|
||||
/**
|
||||
* Feasible space for optimization.
|
||||
* Int and Double type use Max/Min.
|
||||
* Discrete and Categorical type use List.
|
||||
*/
|
||||
message FeasibleSpace {
|
||||
string max = 1; /// Max Value
|
||||
string min = 2; /// Minimum Value
|
||||
repeated string list = 3; /// List of Values.
|
||||
string step = 4; /// Step for double or int parameter
|
||||
+ Distribution distribution = 5; // Distribution of the parameter.
|
||||
}
|
||||
```
|
||||
- Define the `Distribution` enum
|
||||
```
|
||||
/**
|
||||
* Distribution types for HyperParameter.
|
||||
*/
|
||||
enum Distribution {
|
||||
UNIFORM = 0;
|
||||
LOG_UNIFORM = 1;
|
||||
NORMAL = 2;
|
||||
LOG_NORMAL = 3;
|
||||
}
|
||||
```
|
||||
|
||||
## Suggestion Service Logic
|
||||
- For each suggestion service (e.g., Optuna, Hyperopt), the logic will be updated to handle the new parameter distributions.
|
||||
- This involves modifying the conversion functions to map Katib distributions to the corresponding framework-specific distributions.
|
||||
|
||||
#### Optuna
|
||||
ref: https://optuna.readthedocs.io/en/stable/reference/distributions.html
|
||||
|
||||
For example:
|
||||
- Update the `_get_optuna_search_space` for new Distributions.
|
||||
scope: `pkg/suggestion/v1beta1/optuna/base_service.py`
|
||||
|
||||
#### Goptuna
|
||||
ref: https://github.com/c-bata/goptuna/blob/2245ddd9e8d1edba750839893c8a618f852bc1cf/distribution.go
|
||||
|
||||
#### Hyperopt
|
||||
ref: http://hyperopt.github.io/hyperopt/getting-started/search_spaces/#parameter-expressions
|
||||
|
||||
#### Ray-tune
|
||||
ref: https://docs.ray.io/en/latest/tune/api/search_space.html
|
||||
|
||||
## Python SDK
|
||||
Extend the Python SDK to support the new `Distribution` field.
|
||||
|
|
@ -1,28 +1,27 @@
|
|||
# Suggestion CRD Design Document
|
||||
# KEP-507: Suggestion CRD Design Document
|
||||
|
||||
Table of Contents
|
||||
=================
|
||||
# Table of Contents
|
||||
|
||||
* [Suggestion CRD Design Document](#suggestion-crd-design-document)
|
||||
* [Table of Contents](#table-of-contents)
|
||||
* [Background](#background)
|
||||
* [Goals](#goals)
|
||||
* [Non-Goals](#non-goals)
|
||||
* [Design](#design)
|
||||
* [Kubernetes API](#kubernetes-api)
|
||||
* [GRPC API](#grpc-api)
|
||||
* [Workflow](#workflow)
|
||||
* [Example](#example)
|
||||
* [Algorithm Supports](#algorithm-supports)
|
||||
* [Random](#random)
|
||||
* [Grid](#grid)
|
||||
* [Bayes Optimization](#bayes-optimization)
|
||||
* [HyperBand](#hyperband)
|
||||
* [BOHB](#bohb)
|
||||
* [TPE](#tpe)
|
||||
* [SMAC](#smac)
|
||||
* [CMA-ES](#cma-es)
|
||||
* [Sobol](#sobol)
|
||||
- [Suggestion CRD Design Document](#suggestion-crd-design-document)
|
||||
- [Table of Contents](#table-of-contents)
|
||||
- [Background](#background)
|
||||
- [Goals](#goals)
|
||||
- [Non-Goals](#non-goals)
|
||||
- [Design](#design)
|
||||
- [Kubernetes API](#kubernetes-api)
|
||||
- [GRPC API](#grpc-api)
|
||||
- [Workflow](#workflow)
|
||||
- [Example](#example)
|
||||
- [Algorithm Supports](#algorithm-supports)
|
||||
- [Random](#random)
|
||||
- [Grid](#grid)
|
||||
- [Bayes Optimization](#bayes-optimization)
|
||||
- [HyperBand](#hyperband)
|
||||
- [BOHB](#bohb)
|
||||
- [TPE](#tpe)
|
||||
- [SMAC](#smac)
|
||||
- [CMA-ES](#cma-es)
|
||||
- [Sobol](#sobol)
|
||||
|
||||
Created by [gh-md-toc](https://github.com/ekalinin/github-markdown-toc)
|
||||
|
||||
|
@ -30,7 +29,7 @@ Created by [gh-md-toc](https://github.com/ekalinin/github-markdown-toc)
|
|||
|
||||
Katib makes suggestions long-running in v1alpha3. And the suggestions need to communicate with Katib DB manager to get experiments and trials from Katib db driver. This design hurts high availability.
|
||||
|
||||
Thus we proposed a new design to implement a CRD for suggestion and remove Katib db communication from main workflow. The new design simplifies the implmentation of experiment and trial controller, and makes Katib Kubernetes native.
|
||||
Thus we proposed a new design to implement a CRD for suggestion and remove Katib db communication from main workflow. The new design simplifies the implementation of experiment and trial controller, and makes Katib Kubernetes native.
|
||||
|
||||
This document is to illustrate the details of the new design.
|
||||
|
||||
|
@ -118,7 +117,7 @@ message ExperimentSpec {
|
|||
}
|
||||
|
||||
message ParameterSpecs {
|
||||
repeated ParameterSpec parameters = 1;
|
||||
repeated ParameterSpec parameters = 1;
|
||||
}
|
||||
|
||||
message AlgorithmSpec {
|
||||
|
@ -228,28 +227,28 @@ spec:
|
|||
algorithmName: random
|
||||
trialTemplate:
|
||||
goTemplate:
|
||||
rawTemplate: |-
|
||||
apiVersion: batch/v1
|
||||
kind: Job
|
||||
metadata:
|
||||
name: {{.Trial}}
|
||||
namespace: {{.NameSpace}}
|
||||
spec:
|
||||
template:
|
||||
spec:
|
||||
containers:
|
||||
- name: {{.Trial}}
|
||||
image: katib/mxnet-mnist-example
|
||||
command:
|
||||
- "python"
|
||||
- "/mxnet/example/image-classification/train_mnist.py"
|
||||
- "--batch-size=64"
|
||||
{{- with .HyperParameters}}
|
||||
{{- range .}}
|
||||
- "{{.Name}}={{.Value}}"
|
||||
{{- end}}
|
||||
{{- end}}
|
||||
restartPolicy: Never
|
||||
rawTemplate: |-
|
||||
apiVersion: batch/v1
|
||||
kind: Job
|
||||
metadata:
|
||||
name: {{.Trial}}
|
||||
namespace: {{.NameSpace}}
|
||||
spec:
|
||||
template:
|
||||
spec:
|
||||
containers:
|
||||
- name: {{.Trial}}
|
||||
image: katib/mxnet-mnist-example
|
||||
command:
|
||||
- "python"
|
||||
- "/mxnet/example/image-classification/train_mnist.py"
|
||||
- "--batch-size=64"
|
||||
{{- with .HyperParameters}}
|
||||
{{- range .}}
|
||||
- "{{.Name}}={{.Value}}"
|
||||
{{- end}}
|
||||
{{- end}}
|
||||
restartPolicy: Never
|
||||
parameters:
|
||||
- name: --lr
|
||||
parameterType: double
|
||||
|
@ -265,9 +264,9 @@ spec:
|
|||
parameterType: categorical
|
||||
feasibleSpace:
|
||||
list:
|
||||
- sgd
|
||||
- adam
|
||||
- ftrl
|
||||
- sgd
|
||||
- adam
|
||||
- ftrl
|
||||
```
|
||||
|
||||
Then, Experiment controller needs 3 parallel trials to run. It creates the Suggestions:
|
|
@ -1,4 +1,4 @@
|
|||
# Metrics Collector Proposal
|
||||
# KEP-685: Metrics Collector Proposal
|
||||
|
||||
- [Metrics Collector Proposal](#metrics-collector-proposal)
|
||||
- [Links](#links)
|
||||
|
@ -33,7 +33,7 @@ In the new design, Katib use mutating webhook to inject metrics collector contai
|
|||
The sidecar collects metrics of the master and then stores them on the persistent layer (e.g. katib-db-manager and metadata server).
|
||||
|
||||
<center>
|
||||
<img src="../images/metrics-collector-design.png" width="80%">
|
||||
<img src="./metrics-collector-design.png" width="80%">
|
||||
|
||||
Fig. 1 Architecture of the new design
|
||||
|
Before Width: | Height: | Size: 166 KiB After Width: | Height: | Size: 166 KiB |
|
@ -0,0 +1,6 @@
|
|||
# Proposals
|
||||
|
||||
Kubeflow uses the KEP process to document large scale changes to the project.
|
||||
|
||||
Details on the process (including the KEP template, recommendations, etc.) can be found at
|
||||
[kubeflow/community/proposals](https://github.com/kubeflow/community/blob/master/proposals/README.md)
|
|
@ -4,7 +4,7 @@ This is the instruction on how to make a new release for the Katib project.
|
|||
|
||||
## Prerequisite
|
||||
|
||||
- Tools, defined in the [Developer Guide](./../developer-guide.md#requirements).
|
||||
- Tools, defined in the [Contributing Guide](./../../CONTRIBUTING.md#requirements).
|
||||
|
||||
- [Write](https://docs.github.com/en/organizations/managing-access-to-your-organizations-repositories/repository-permission-levels-for-an-organization#permission-levels-for-repositories-owned-by-an-organization)
|
||||
permission for the Katib repository.
|
||||
|
@ -19,6 +19,16 @@ This is the instruction on how to make a new release for the Katib project.
|
|||
|
||||
- Install `twine` to publish the SDK package: `pip install twine==3.4.1`
|
||||
|
||||
- Create a [PyPI Token](https://pypi.org/help/#apitoken) to publish Katib SDK.
|
||||
|
||||
- Add the following config to your `~/.pypirc` file:
|
||||
|
||||
```
|
||||
[pypi]
|
||||
username = __token__
|
||||
password = <PYPI_TOKEN>
|
||||
```
|
||||
|
||||
## Release Process
|
||||
|
||||
### Versioning Policy
|
||||
|
@ -65,7 +75,10 @@ Follow these steps to cut a new Katib release:
|
|||
git clone git@github.com:kubeflow/katib.git $GOPATH/src/github.com/kubeflow/katib
|
||||
```
|
||||
|
||||
1. Make sure that you can build all Katib images:
|
||||
1. Make sure that you can build all Katib images. **Note** that
|
||||
your Docker Desktop should
|
||||
[enable containerd image store](https://docs.docker.com/desktop/containerd/#enable-the-containerd-image-store)
|
||||
to build multi-arch images:
|
||||
|
||||
```
|
||||
make build REGISTRY=private-registry TAG=latest
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
from github import Github
|
||||
import argparse
|
||||
|
||||
from github import Github
|
||||
|
||||
REPO_NAME = "kubeflow/katib"
|
||||
CHANGELOG_FILE = "CHANGELOG.md"
|
||||
|
||||
|
@ -50,7 +51,7 @@ for commit in reversed(commits):
|
|||
|
||||
change_log = [
|
||||
"# Changelog" "\n\n",
|
||||
"## [{}]({}) ({})".format(current_release, release_url, release_date),
|
||||
"# [{}]({}) ({})".format(current_release, release_url, release_date),
|
||||
"\n\n",
|
||||
"## TODO: Group PRs into Breaking Changes, New Features, Bug fixes, Documentation, etc. "
|
||||
+ "For example: [v0.11.0](https://github.com/kubeflow/katib/releases/tag/v0.11.0)",
|
||||
|
|
|
@ -1,423 +0,0 @@
|
|||
# How Katib v1beta1 tunes hyperparameters automatically in a Kubernetes native way
|
||||
|
||||
Follow the Kubeflow documentation guides:
|
||||
|
||||
- [Concepts](https://www.kubeflow.org/docs/components/katib/overview/)
|
||||
in Katib, hyperparameter tuning, and neural architecture search.
|
||||
- [Getting started with Katib](https://kubeflow.org/docs/components/katib/hyperparameter/).
|
||||
- Detailed guide to
|
||||
[configuring and running a Katib `Experiment`](https://kubeflow.org/docs/components/katib/experiment/).
|
||||
|
||||
## Example and Illustration
|
||||
|
||||
After installing Katib v1beta1, you can try the first Katib Experiment:
|
||||
|
||||
```
|
||||
kubectl apply -f https://raw.githubusercontent.com/kubeflow/katib/master/examples/v1beta1/hp-tuning/random.yaml
|
||||
```
|
||||
|
||||
### Experiment
|
||||
|
||||
When you want to tune hyperparameters for your machine learning model before
|
||||
training it further, you just need to create an `Experiment` CR. To
|
||||
learn what fields are included in the `Experiment.spec`, follow
|
||||
the detailed guide to
|
||||
[configuring and running a Katib `Experiment`](https://kubeflow.org/docs/components/katib/experiment/).
|
||||
Then you can get the new `Experiment` as below.
|
||||
Katib concepts are introduced based on this example.
|
||||
|
||||
```yaml
|
||||
$ kubectl get experiment random -n kubeflow -o yaml
|
||||
|
||||
apiVersion: kubeflow.org/v1beta1
|
||||
kind: Experiment
|
||||
metadata:
|
||||
...
|
||||
name: random
|
||||
namespace: kubeflow
|
||||
...
|
||||
spec:
|
||||
algorithm:
|
||||
algorithmName: random
|
||||
maxFailedTrialCount: 3
|
||||
maxTrialCount: 12
|
||||
metricsCollectorSpec:
|
||||
collector:
|
||||
kind: StdOut
|
||||
objective:
|
||||
additionalMetricNames:
|
||||
- Train-accuracy
|
||||
goal: 0.99
|
||||
metricStrategies:
|
||||
- name: Validation-accuracy
|
||||
value: max
|
||||
- name: Train-accuracy
|
||||
value: max
|
||||
objectiveMetricName: Validation-accuracy
|
||||
type: maximize
|
||||
parallelTrialCount: 3
|
||||
parameters:
|
||||
- feasibleSpace:
|
||||
max: "0.03"
|
||||
min: "0.01"
|
||||
name: lr
|
||||
parameterType: double
|
||||
- feasibleSpace:
|
||||
max: "5"
|
||||
min: "2"
|
||||
name: num-layers
|
||||
parameterType: int
|
||||
- feasibleSpace:
|
||||
list:
|
||||
- sgd
|
||||
- adam
|
||||
- ftrl
|
||||
name: optimizer
|
||||
parameterType: categorical
|
||||
resumePolicy: Never
|
||||
trialTemplate:
|
||||
failureCondition: status.conditions.#(type=="Failed")#|#(status=="True")#
|
||||
primaryContainerName: training-container
|
||||
successCondition: status.conditions.#(type=="Complete")#|#(status=="True")#
|
||||
trialParameters:
|
||||
- description: Learning rate for the training model
|
||||
name: learningRate
|
||||
reference: lr
|
||||
- description: Number of training model layers
|
||||
name: numberLayers
|
||||
reference: num-layers
|
||||
- description: Training model optimizer (sdg, adam or ftrl)
|
||||
name: optimizer
|
||||
reference: optimizer
|
||||
trialSpec:
|
||||
apiVersion: batch/v1
|
||||
kind: Job
|
||||
spec:
|
||||
template:
|
||||
spec:
|
||||
containers:
|
||||
- command:
|
||||
- python3
|
||||
- /opt/mxnet-mnist/mnist.py
|
||||
- --batch-size=64
|
||||
- --lr=${trialParameters.learningRate}
|
||||
- --num-layers=${trialParameters.numberLayers}
|
||||
- --optimizer=${trialParameters.optimizer}
|
||||
image: docker.io/kubeflowkatib/mxnet-mnist:v1beta1-45c5727
|
||||
name: training-container
|
||||
restartPolicy: Never
|
||||
status:
|
||||
completionTime: "2021-10-01T21:47:35Z"
|
||||
conditions:
|
||||
- lastTransitionTime: "2021-10-01T21:27:46Z"
|
||||
lastUpdateTime: "2021-10-01T21:27:46Z"
|
||||
message: Experiment is created
|
||||
reason: ExperimentCreated
|
||||
status: "True"
|
||||
type: Created
|
||||
- lastTransitionTime: "2021-10-01T21:47:35Z"
|
||||
lastUpdateTime: "2021-10-01T21:47:35Z"
|
||||
message: Experiment is running
|
||||
reason: ExperimentRunning
|
||||
status: "False"
|
||||
type: Running
|
||||
- lastTransitionTime: "2021-10-01T21:47:35Z"
|
||||
lastUpdateTime: "2021-10-01T21:47:35Z"
|
||||
message: Experiment has succeeded because max trial count has reached
|
||||
reason: ExperimentMaxTrialsReached
|
||||
status: "True"
|
||||
type: Succeeded
|
||||
currentOptimalTrial:
|
||||
bestTrialName: random-gh8psfcz
|
||||
observation:
|
||||
metrics:
|
||||
- latest: "0.977707"
|
||||
max: "0.979299"
|
||||
min: "0.955215"
|
||||
name: Validation-accuracy
|
||||
- latest: "0.993570"
|
||||
max: "0.993570"
|
||||
min: "0.907932"
|
||||
name: Train-accuracy
|
||||
parameterAssignments:
|
||||
- name: lr
|
||||
value: "0.014431754535687558"
|
||||
- name: num-layers
|
||||
value: "3"
|
||||
- name: optimizer
|
||||
value: sgd
|
||||
startTime: "2021-10-01T21:27:46Z"
|
||||
succeededTrialList:
|
||||
- random-ghvj6q8z
|
||||
- random-4z4kqr5l
|
||||
- random-8ssrzrzr
|
||||
- random-gw7xtn84
|
||||
- random-zlldw6v9
|
||||
- random-9jx47rsk
|
||||
- random-rzx6zcwb
|
||||
- random-46rqvb9k
|
||||
- random-nd8d2lmc
|
||||
- random-gw7wzdw2
|
||||
- random-hq2fghf6
|
||||
- random-gh8psfcz
|
||||
trials: 12
|
||||
trialsSucceeded: 12
|
||||
```
|
||||
|
||||
### Suggestion
|
||||
|
||||
Katib internally creates a `Suggestion` CR for each `Experiment` CR. The
|
||||
`Suggestion` CR includes the hyperparameter algorithm name by `algorithmName`
|
||||
field and how many sets of hyperparameter Katib asks to be generated by
|
||||
`requests` field. The `Suggestion` also traces all already generated sets of
|
||||
hyperparameter in `status.suggestions`. The `Suggestion` CR is used for internal
|
||||
logic control and end user can even ignore it.
|
||||
|
||||
```yaml
|
||||
$ kubectl get suggestion random -n kubeflow -o yaml
|
||||
|
||||
apiVersion: kubeflow.org/v1beta1
|
||||
kind: Suggestion
|
||||
metadata:
|
||||
...
|
||||
name: random
|
||||
namespace: kubeflow
|
||||
ownerReferences:
|
||||
- apiVersion: kubeflow.org/v1beta1
|
||||
blockOwnerDeletion: true
|
||||
controller: true
|
||||
kind: Experiment
|
||||
name: random
|
||||
uid: 355b05f5-6951-47b2-85f6-d0b9b8be5a64
|
||||
...
|
||||
spec:
|
||||
algorithm:
|
||||
algorithmName: random
|
||||
requests: 12
|
||||
resumePolicy: Never
|
||||
status:
|
||||
conditions:
|
||||
- lastTransitionTime: "2021-10-01T21:27:46Z"
|
||||
lastUpdateTime: "2021-10-01T21:27:46Z"
|
||||
message: Suggestion is created
|
||||
reason: SuggestionCreated
|
||||
status: "True"
|
||||
type: Created
|
||||
- lastTransitionTime: "2021-10-01T21:28:56Z"
|
||||
lastUpdateTime: "2021-10-01T21:28:56Z"
|
||||
message: Deployment is ready
|
||||
reason: DeploymentReady
|
||||
status: "True"
|
||||
type: DeploymentReady
|
||||
- lastTransitionTime: "2021-10-01T21:28:57Z"
|
||||
lastUpdateTime: "2021-10-01T21:28:57Z"
|
||||
message: Suggestion is running
|
||||
reason: SuggestionRunning
|
||||
status: "True"
|
||||
type: Running
|
||||
startTime: "2021-10-01T21:27:46Z"
|
||||
suggestionCount: 12
|
||||
suggestions:
|
||||
...
|
||||
- name: random-gw7wzdw2
|
||||
parameterAssignments:
|
||||
- name: lr
|
||||
value: "0.020202241839540558"
|
||||
- name: num-layers
|
||||
value: "4"
|
||||
- name: optimizer
|
||||
value: adam
|
||||
- name: random-hq2fghf6
|
||||
parameterAssignments:
|
||||
- name: lr
|
||||
value: "0.01841281609693181"
|
||||
- name: num-layers
|
||||
value: "3"
|
||||
- name: optimizer
|
||||
value: sgd
|
||||
- name: random-8ssrzrzr
|
||||
parameterAssignments:
|
||||
- name: lr
|
||||
value: "0.021473410597867483"
|
||||
- name: num-layers
|
||||
value: "2"
|
||||
- name: optimizer
|
||||
value: adam
|
||||
...
|
||||
```
|
||||
|
||||
### Trial
|
||||
|
||||
For each set of hyperparameters, Katib internally generates a `Trial` CR
|
||||
with the hyperparameters key-value pairs, `Worker Job` run specification with
|
||||
parameters instantiated and some other fields like below. The `Trial` CR
|
||||
is used for internal logic control and end user can even ignore it.
|
||||
|
||||
```yaml
|
||||
$ kubectl get trial -n kubeflow
|
||||
|
||||
NAME TYPE STATUS AGE
|
||||
random-46rqvb9k Succeeded True 20m
|
||||
random-4z4kqr5l Succeeded True 23m
|
||||
random-8ssrzrzr Succeeded True 14m
|
||||
random-9jx47rsk Succeeded True 23m
|
||||
random-gh8psfcz Succeeded True 8m15s
|
||||
random-ghvj6q8z Succeeded True 23m
|
||||
random-gw7wzdw2 Succeeded True 17m
|
||||
random-gw7xtn84 Succeeded True 12m
|
||||
random-hq2fghf6 Succeeded True 17m
|
||||
random-nd8d2lmc Succeeded True 17m
|
||||
random-rzx6zcwb Succeeded True 20m
|
||||
random-zlldw6v9 Succeeded True 11m
|
||||
|
||||
$ kubectl get trial random-gw7wzdw2 -o yaml -n kubeflow
|
||||
|
||||
apiVersion: kubeflow.org/v1beta1
|
||||
kind: Trial
|
||||
metadata:
|
||||
creationTimestamp: "2021-10-01T21:35:18Z"
|
||||
finalizers:
|
||||
- clean-metrics-in-db
|
||||
generation: 1
|
||||
labels:
|
||||
katib.kubeflow.org/experiment: random
|
||||
name: random-gw7wzdw2
|
||||
namespace: kubeflow
|
||||
ownerReferences:
|
||||
- apiVersion: kubeflow.org/v1beta1
|
||||
blockOwnerDeletion: true
|
||||
controller: true
|
||||
kind: Experiment
|
||||
name: random
|
||||
uid: 355b05f5-6951-47b2-85f6-d0b9b8be5a64
|
||||
...
|
||||
spec:
|
||||
failureCondition: status.conditions.#(type=="Failed")#|#(status=="True")#
|
||||
metricsCollector:
|
||||
collector:
|
||||
kind: StdOut
|
||||
objective:
|
||||
additionalMetricNames:
|
||||
- Train-accuracy
|
||||
goal: 0.99
|
||||
metricStrategies:
|
||||
- name: Validation-accuracy
|
||||
value: max
|
||||
- name: Train-accuracy
|
||||
value: max
|
||||
objectiveMetricName: Validation-accuracy
|
||||
type: maximize
|
||||
parameterAssignments:
|
||||
- name: lr
|
||||
value: "0.020202241839540558"
|
||||
- name: num-layers
|
||||
value: "4"
|
||||
- name: optimizer
|
||||
value: adam
|
||||
primaryContainerName: training-container
|
||||
runSpec:
|
||||
apiVersion: batch/v1
|
||||
kind: Job
|
||||
metadata:
|
||||
name: random-gw7wzdw2
|
||||
namespace: kubeflow
|
||||
spec:
|
||||
template:
|
||||
spec:
|
||||
containers:
|
||||
- command:
|
||||
- python3
|
||||
- /opt/mxnet-mnist/mnist.py
|
||||
- --batch-size=64
|
||||
- --lr=0.020202241839540558
|
||||
- --num-layers=4
|
||||
- --optimizer=adam
|
||||
image: docker.io/kubeflowkatib/mxnet-mnist:v1beta1-45c5727
|
||||
name: training-container
|
||||
restartPolicy: Never
|
||||
successCondition: status.conditions.#(type=="Complete")#|#(status=="True")#
|
||||
status:
|
||||
completionTime: "2021-10-01T21:40:59Z"
|
||||
conditions:
|
||||
- lastTransitionTime: "2021-10-01T21:35:18Z"
|
||||
lastUpdateTime: "2021-10-01T21:35:18Z"
|
||||
message: Trial is created
|
||||
reason: TrialCreated
|
||||
status: "True"
|
||||
type: Created
|
||||
- lastTransitionTime: "2021-10-01T21:40:59Z"
|
||||
lastUpdateTime: "2021-10-01T21:40:59Z"
|
||||
message: Trial is running
|
||||
reason: TrialRunning
|
||||
status: "False"
|
||||
type: Running
|
||||
- lastTransitionTime: "2021-10-01T21:40:59Z"
|
||||
lastUpdateTime: "2021-10-01T21:40:59Z"
|
||||
message: Trial has succeeded
|
||||
reason: TrialSucceeded
|
||||
status: "True"
|
||||
type: Succeeded
|
||||
observation:
|
||||
metrics:
|
||||
- latest: "0.949542"
|
||||
max: "0.949542"
|
||||
min: "0.938396"
|
||||
name: Validation-accuracy
|
||||
- latest: "0.943164"
|
||||
max: "0.944463"
|
||||
min: "0.911081"
|
||||
name: Train-accuracy
|
||||
startTime: "2021-10-01T21:35:18Z"
|
||||
```
|
||||
|
||||
## What happens after an `Experiment` CR is created
|
||||
|
||||
When a user creates an `Experiment` CR, the Katib `Experiment` controller,
|
||||
`Suggestion` controller, and `Trial` controller work together to achieve
|
||||
hyperparameter tuning for the user's machine learning model. The Experiment
|
||||
workflow looks as follows:
|
||||
|
||||
<center>
|
||||
<img width="100%" alt="image" src="images/katib-workflow.png">
|
||||
</center>
|
||||
|
||||
1. The `Experiment` CR is submitted to the Kubernetes API server. The Katib
|
||||
`Experiment` mutating and validating webhooks are called to set the default
|
||||
values for the `Experiment` CR and to validate the CR, respectively.
|
||||
|
||||
1. The `Experiment` controller creates the `Suggestion` CR.
|
||||
|
||||
1. The `Suggestion` controller creates the algorithm deployment and service
|
||||
based on the new `Suggestion` CR.
|
||||
|
||||
1. When the `Suggestion` controller verifies that the algorithm service is
|
||||
ready, it calls the service to generate
|
||||
`spec.request - len(status.suggestions)` sets of hyperparameters and appends
|
||||
them to `status.suggestions`.
|
||||
|
||||
1. The `Experiment` controller finds that the `Suggestion` CR has been updated and
|
||||
generates a `Trial` for each new hyperparameter set.
|
||||
|
||||
1. The `Trial` controller generates a `Worker Job` based on the `runSpec`
|
||||
from the `Trial` CR with the new hyperparameter set.
|
||||
|
||||
1. The related job controller
|
||||
(Kubernetes batch Job, Kubeflow TFJob, Tekton Pipeline, etc.) generates
|
||||
Kubernetes Pods.
|
||||
|
||||
1. The Katib Pod mutating webhook is called to inject the metrics collector sidecar
|
||||
container into the candidate Pods.
|
||||
|
||||
1. While the ML model container runs, the metrics collector container
|
||||
collects metrics from the injected pod and persists metrics to the Katib
|
||||
DB backend.
|
||||
|
||||
1. When the ML model training ends, the `Trial` controller updates status
|
||||
of the corresponding `Trial` CR.
|
||||
|
||||
1. When the `Trial` CR finishes, the `Experiment` controller increases the
|
||||
`request` field of the corresponding `Suggestion` CR if needed,
|
||||
and the workflow returns to `step 4`.
|
||||
If the `Trial` CRs meet one of the `end` conditions
|
||||
(exceeds `maxTrialCount`, `maxFailedTrialCount` or `goal`),
|
||||
the `Experiment` controller considers everything done.
|
|
@ -104,8 +104,6 @@ Check the following images for the Trial containers:
|
|||
|
||||
- [Tensorflow MNIST with summaries](./trial-images/tf-mnist-with-summaries)
|
||||
|
||||
- [MXNet MNIST](./trial-images/mxnet-mnist)
|
||||
|
||||
- [PyTorch MNIST](./trial-images/pytorch-mnist)
|
||||
|
||||
- [ENAS Keras CNN CIFAR-10](./trial-images/enas-cnn-cifar10)
|
||||
|
@ -124,8 +122,6 @@ Check the following examples for the various distributed operators:
|
|||
|
||||
- [PyTorchJob MNIST](./kubeflow-training-operator/pytorchjob-mnist.yaml)
|
||||
|
||||
- [MXJob BytePS](./kubeflow-training-operator/mxjob-byteps.yaml)
|
||||
|
||||
- [XGBoostJob LightGBM](./kubeflow-training-operator/xgboostjob-lightgbm.yaml)
|
||||
|
||||
- [MPIJob Horovod](./kubeflow-training-operator/mpijob-horovod.yaml)
|
||||
|
|
|
@ -79,15 +79,23 @@ kubectl patch ClusterRole katib-controller -n kubeflow --type=json \
|
|||
-p='[{"op": "add", "path": "/rules/-", "value": {"apiGroups":["argoproj.io"],"resources":["workflows"],"verbs":["get", "list", "watch", "create", "delete"]}}]'
|
||||
```
|
||||
|
||||
In addition to that, you have to modify Katib
|
||||
[Controller args](https://github.com/kubeflow/katib/blob/master/manifests/v1beta1/components/controller/controller.yaml#L27)
|
||||
with the new flag `--trial-resources`.
|
||||
|
||||
Run the following command to update Katib Controller args:
|
||||
Run the following command to update [Katib config](https://www.kubeflow.org/docs/components/katib/user-guides/katib-config/#katib-controller-parameters):
|
||||
|
||||
```bash
|
||||
kubectl patch Deployment katib-controller -n kubeflow --type=json \
|
||||
-p='[{"op": "add", "path": "/spec/template/spec/containers/0/args/-", "value": "--trial-resources=Workflow.v1alpha1.argoproj.io"}]'
|
||||
kubectl edit configMap katib-config -n kubeflow
|
||||
```
|
||||
|
||||
For example, to support Workflow Pipelines, add `Workflow.v1alpha1.argoproj.io` in `trialResources`:
|
||||
|
||||
```yaml
|
||||
trialResources:
|
||||
- Workflow.v1alpha1.argoproj.io
|
||||
```
|
||||
|
||||
After that, you need to restart the Katib controller Pod:
|
||||
|
||||
```bash
|
||||
kubectl delete pod -n kubeflow -l katib.kubeflow.org/component=controller
|
||||
```
|
||||
|
||||
Check that Katib Controller's pod was restarted:
|
||||
|
@ -107,7 +115,7 @@ Check logs from Katib Controller to verify Argo Workflow integration:
|
|||
```bash
|
||||
$ kubectl logs $(kubectl get pods -n kubeflow -o name | grep katib-controller) -n kubeflow | grep '"CRD Kind":"Workflow"'
|
||||
|
||||
{"level":"info","ts":1628032648.6285546,"logger":"trial-controller","msg":"Job watch added successfully","CRD Group":"argoproj.io","CRD Version":"v1alpha1","CRD Kind":"Workflow"}
|
||||
{"level":"info","ts":"2024-07-13T10:02:10Z","logger":"trial-controller","msg":"Job watch added successfully","CRD Group":"argoproj.io","CRD Version":"v1alpha1","CRD Kind":"Workflow"}
|
||||
```
|
||||
|
||||
If you ran the above steps successfully, you should be able to run Argo Workflow examples.
|
||||
|
|
|
@ -13,11 +13,9 @@ metadata:
|
|||
name: katib-argo-workflow
|
||||
spec:
|
||||
objective:
|
||||
type: maximize
|
||||
goal: 0.99
|
||||
objectiveMetricName: Validation-accuracy
|
||||
additionalMetricNames:
|
||||
- Train-accuracy
|
||||
type: minimize
|
||||
goal: 0.001
|
||||
objectiveMetricName: loss
|
||||
algorithm:
|
||||
algorithmName: random
|
||||
parallelTrialCount: 2
|
||||
|
@ -50,22 +48,22 @@ spec:
|
|||
- name: hp-workflow
|
||||
steps:
|
||||
- - name: data-preprocessing
|
||||
template: gen-num-examples
|
||||
template: gen-epochs
|
||||
- - name: model-training
|
||||
template: model-training
|
||||
arguments:
|
||||
parameters:
|
||||
- name: num-examples
|
||||
- name: epochs
|
||||
value: "{{steps.data-preprocessing.outputs.result}}"
|
||||
|
||||
- name: gen-num-examples
|
||||
- name: gen-epochs
|
||||
script:
|
||||
image: python:alpine3.6
|
||||
command:
|
||||
- python
|
||||
source: |
|
||||
import random
|
||||
print(60000//random.randint(10, 100))
|
||||
print(60000//random.randint(3000, 30000))
|
||||
|
||||
- name: model-training
|
||||
metadata:
|
||||
|
@ -73,12 +71,13 @@ spec:
|
|||
katib.kubeflow.org/model-training: "true"
|
||||
inputs:
|
||||
parameters:
|
||||
- name: num-examples
|
||||
- name: epochs
|
||||
container:
|
||||
name: model-training
|
||||
image: docker.io/kubeflowkatib/mxnet-mnist:latest
|
||||
image: ghcr.io/kubeflow/katib/pytorch-mnist-cpu:latest
|
||||
command:
|
||||
- "python3"
|
||||
- "/opt/mxnet-mnist/mnist.py"
|
||||
- "/opt/pytorch-mnist/mnist.py"
|
||||
- "--lr=${trialParameters.learningRate}"
|
||||
- "--num-examples={{inputs.parameters.num-examples}}"
|
||||
- "--epochs={{inputs.parameters.epochs}}"
|
||||
- "--batch-size=16"
|
||||
|
|
|
@ -62,7 +62,7 @@ spec:
|
|||
spec:
|
||||
containers:
|
||||
- name: training-container
|
||||
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest
|
||||
image: ghcr.io/kubeflow/katib/pytorch-mnist-cpu:latest
|
||||
command:
|
||||
- "python3"
|
||||
- "/opt/pytorch-mnist/mnist.py"
|
||||
|
|
|
@ -8,11 +8,9 @@ metadata:
|
|||
name: median-stop
|
||||
spec:
|
||||
objective:
|
||||
type: maximize
|
||||
goal: 0.99
|
||||
objectiveMetricName: Validation-accuracy
|
||||
additionalMetricNames:
|
||||
- Train-accuracy
|
||||
type: minimize
|
||||
goal: 0.001
|
||||
objectiveMetricName: loss
|
||||
algorithm:
|
||||
algorithmName: random
|
||||
earlyStopping:
|
||||
|
@ -30,12 +28,12 @@ spec:
|
|||
parameterType: double
|
||||
feasibleSpace:
|
||||
min: "0.01"
|
||||
max: "0.5"
|
||||
- name: num-epochs
|
||||
parameterType: int
|
||||
max: "0.05"
|
||||
- name: momentum
|
||||
parameterType: double
|
||||
feasibleSpace:
|
||||
min: "3"
|
||||
max: "4"
|
||||
min: "0.5"
|
||||
max: "0.9"
|
||||
trialTemplate:
|
||||
retain: true
|
||||
primaryContainerName: training-container
|
||||
|
@ -43,9 +41,9 @@ spec:
|
|||
- name: learningRate
|
||||
description: Learning rate for the training model
|
||||
reference: lr
|
||||
- name: numberEpochs
|
||||
description: Number of epochs to train the model
|
||||
reference: num-epochs
|
||||
- name: momentum
|
||||
description: Momentum for the training model
|
||||
reference: momentum
|
||||
trialSpec:
|
||||
apiVersion: batch/v1
|
||||
kind: Job
|
||||
|
@ -54,11 +52,12 @@ spec:
|
|||
spec:
|
||||
containers:
|
||||
- name: training-container
|
||||
image: docker.io/kubeflowkatib/mxnet-mnist:latest
|
||||
image: ghcr.io/kubeflow/katib/pytorch-mnist-cpu:latest
|
||||
command:
|
||||
- "python3"
|
||||
- "/opt/mxnet-mnist/mnist.py"
|
||||
- "--batch-size=64"
|
||||
- "/opt/pytorch-mnist/mnist.py"
|
||||
- "--epochs=1"
|
||||
- "--batch-size=16"
|
||||
- "--lr=${trialParameters.learningRate}"
|
||||
- "--num-epochs=${trialParameters.numberEpochs}"
|
||||
- "--momentum=${trialParameters.momentum}"
|
||||
restartPolicy: Never
|
||||
|
|
|
@ -6,11 +6,9 @@ metadata:
|
|||
name: bayesian-optimization
|
||||
spec:
|
||||
objective:
|
||||
type: maximize
|
||||
goal: 0.99
|
||||
objectiveMetricName: Validation-accuracy
|
||||
additionalMetricNames:
|
||||
- Train-accuracy
|
||||
type: minimize
|
||||
goal: 0.001
|
||||
objectiveMetricName: loss
|
||||
algorithm:
|
||||
algorithmName: bayesianoptimization
|
||||
algorithmSettings:
|
||||
|
@ -24,31 +22,21 @@ spec:
|
|||
parameterType: double
|
||||
feasibleSpace:
|
||||
min: "0.01"
|
||||
max: "0.03"
|
||||
- name: num-layers
|
||||
parameterType: int
|
||||
max: "0.05"
|
||||
- name: momentum
|
||||
parameterType: double
|
||||
feasibleSpace:
|
||||
min: "2"
|
||||
max: "5"
|
||||
- name: optimizer
|
||||
parameterType: categorical
|
||||
feasibleSpace:
|
||||
list:
|
||||
- sgd
|
||||
- adam
|
||||
- ftrl
|
||||
min: "0.5"
|
||||
max: "0.9"
|
||||
trialTemplate:
|
||||
primaryContainerName: training-container
|
||||
trialParameters:
|
||||
- name: learningRate
|
||||
description: Learning rate for the training model
|
||||
reference: lr
|
||||
- name: numberLayers
|
||||
description: Number of training model layers
|
||||
reference: num-layers
|
||||
- name: optimizer
|
||||
description: Training model optimizer (sdg, adam or ftrl)
|
||||
reference: optimizer
|
||||
- name: momentum
|
||||
description: Momentum for the training model
|
||||
reference: momentum
|
||||
trialSpec:
|
||||
apiVersion: batch/v1
|
||||
kind: Job
|
||||
|
@ -57,12 +45,12 @@ spec:
|
|||
spec:
|
||||
containers:
|
||||
- name: training-container
|
||||
image: docker.io/kubeflowkatib/mxnet-mnist:latest
|
||||
image: ghcr.io/kubeflow/katib/pytorch-mnist-cpu:latest
|
||||
command:
|
||||
- "python3"
|
||||
- "/opt/mxnet-mnist/mnist.py"
|
||||
- "--batch-size=64"
|
||||
- "/opt/pytorch-mnist/mnist.py"
|
||||
- "--epochs=1"
|
||||
- "--batch-size=16"
|
||||
- "--lr=${trialParameters.learningRate}"
|
||||
- "--num-layers=${trialParameters.numberLayers}"
|
||||
- "--optimizer=${trialParameters.optimizer}"
|
||||
- "--momentum=${trialParameters.momentum}"
|
||||
restartPolicy: Never
|
||||
|
|
|
@ -6,11 +6,9 @@ metadata:
|
|||
name: cmaes
|
||||
spec:
|
||||
objective:
|
||||
type: maximize
|
||||
goal: 0.99
|
||||
objectiveMetricName: Validation-accuracy
|
||||
additionalMetricNames:
|
||||
- Train-accuracy
|
||||
type: minimize
|
||||
goal: 0.001
|
||||
objectiveMetricName: loss
|
||||
algorithm:
|
||||
algorithmName: cmaes
|
||||
algorithmSettings:
|
||||
|
@ -24,31 +22,21 @@ spec:
|
|||
parameterType: double
|
||||
feasibleSpace:
|
||||
min: "0.01"
|
||||
max: "0.03"
|
||||
- name: num-layers
|
||||
parameterType: int
|
||||
max: "0.05"
|
||||
- name: momentum
|
||||
parameterType: double
|
||||
feasibleSpace:
|
||||
min: "2"
|
||||
max: "5"
|
||||
- name: optimizer
|
||||
parameterType: categorical
|
||||
feasibleSpace:
|
||||
list:
|
||||
- sgd
|
||||
- adam
|
||||
- ftrl
|
||||
min: "0.5"
|
||||
max: "0.9"
|
||||
trialTemplate:
|
||||
primaryContainerName: training-container
|
||||
trialParameters:
|
||||
- name: learningRate
|
||||
description: Learning rate for the training model
|
||||
reference: lr
|
||||
- name: numberLayers
|
||||
description: Number of training model layers
|
||||
reference: num-layers
|
||||
- name: optimizer
|
||||
description: Training model optimizer (sdg, adam or ftrl)
|
||||
reference: optimizer
|
||||
- name: momentum
|
||||
description: Momentum for the training model
|
||||
reference: momentum
|
||||
trialSpec:
|
||||
apiVersion: batch/v1
|
||||
kind: Job
|
||||
|
@ -57,12 +45,12 @@ spec:
|
|||
spec:
|
||||
containers:
|
||||
- name: training-container
|
||||
image: docker.io/kubeflowkatib/mxnet-mnist:latest
|
||||
image: ghcr.io/kubeflow/katib/pytorch-mnist-cpu:latest
|
||||
command:
|
||||
- "python3"
|
||||
- "/opt/mxnet-mnist/mnist.py"
|
||||
- "--batch-size=64"
|
||||
- "/opt/pytorch-mnist/mnist.py"
|
||||
- "--epochs=1"
|
||||
- "--batch-size=16"
|
||||
- "--lr=${trialParameters.learningRate}"
|
||||
- "--num-layers=${trialParameters.numberLayers}"
|
||||
- "--optimizer=${trialParameters.optimizer}"
|
||||
- "--momentum=${trialParameters.momentum}"
|
||||
restartPolicy: Never
|
||||
|
|
|
@ -6,11 +6,9 @@ metadata:
|
|||
name: grid
|
||||
spec:
|
||||
objective:
|
||||
type: maximize
|
||||
goal: 0.99
|
||||
objectiveMetricName: Validation-accuracy
|
||||
additionalMetricNames:
|
||||
- Train-accuracy
|
||||
type: minimize
|
||||
goal: 0.001
|
||||
objectiveMetricName: loss
|
||||
algorithm:
|
||||
algorithmName: grid
|
||||
parallelTrialCount: 3
|
||||
|
@ -20,33 +18,24 @@ spec:
|
|||
- name: lr
|
||||
parameterType: double
|
||||
feasibleSpace:
|
||||
min: "0.001"
|
||||
max: "0.01"
|
||||
step: "0.001"
|
||||
- name: num-layers
|
||||
parameterType: int
|
||||
min: "0.01"
|
||||
step: "0.005"
|
||||
max: "0.05"
|
||||
- name: momentum
|
||||
parameterType: double
|
||||
feasibleSpace:
|
||||
min: "2"
|
||||
max: "5"
|
||||
- name: optimizer
|
||||
parameterType: categorical
|
||||
feasibleSpace:
|
||||
list:
|
||||
- sgd
|
||||
- adam
|
||||
- ftrl
|
||||
min: "0.5"
|
||||
step: "0.1"
|
||||
max: "0.9"
|
||||
trialTemplate:
|
||||
primaryContainerName: training-container
|
||||
trialParameters:
|
||||
- name: learningRate
|
||||
description: Learning rate for the training model
|
||||
reference: lr
|
||||
- name: numberLayers
|
||||
description: Number of training model layers
|
||||
reference: num-layers
|
||||
- name: optimizer
|
||||
description: Training model optimizer (sdg, adam or ftrl)
|
||||
reference: optimizer
|
||||
- name: momentum
|
||||
description: Momentum for the training model
|
||||
reference: momentum
|
||||
trialSpec:
|
||||
apiVersion: batch/v1
|
||||
kind: Job
|
||||
|
@ -55,12 +44,12 @@ spec:
|
|||
spec:
|
||||
containers:
|
||||
- name: training-container
|
||||
image: docker.io/kubeflowkatib/mxnet-mnist:latest
|
||||
image: ghcr.io/kubeflow/katib/pytorch-mnist-cpu:latest
|
||||
command:
|
||||
- "python3"
|
||||
- "/opt/mxnet-mnist/mnist.py"
|
||||
- "--batch-size=64"
|
||||
- "/opt/pytorch-mnist/mnist.py"
|
||||
- "--epochs=1"
|
||||
- "--batch-size=16"
|
||||
- "--lr=${trialParameters.learningRate}"
|
||||
- "--num-layers=${trialParameters.numberLayers}"
|
||||
- "--optimizer=${trialParameters.optimizer}"
|
||||
- "--momentum=${trialParameters.momentum}"
|
||||
restartPolicy: Never
|
||||
|
|
|
@ -8,11 +8,9 @@ spec:
|
|||
parallelTrialCount: 2
|
||||
maxTrialCount: 2
|
||||
objective:
|
||||
type: maximize
|
||||
goal: 0.99
|
||||
objectiveMetricName: Validation-accuracy
|
||||
additionalMetricNames:
|
||||
- Train-accuracy
|
||||
type: minimize
|
||||
goal: 0.001
|
||||
objectiveMetricName: loss
|
||||
algorithm:
|
||||
algorithmName: hyperband
|
||||
algorithmSettings:
|
||||
|
@ -28,19 +26,12 @@ spec:
|
|||
parameterType: double
|
||||
feasibleSpace:
|
||||
min: "0.01"
|
||||
max: "0.03"
|
||||
- name: num-layers
|
||||
parameterType: int
|
||||
max: "0.05"
|
||||
- name: momentum
|
||||
parameterType: double
|
||||
feasibleSpace:
|
||||
min: "2"
|
||||
max: "5"
|
||||
- name: optimizer
|
||||
parameterType: categorical
|
||||
feasibleSpace:
|
||||
list:
|
||||
- sgd
|
||||
- adam
|
||||
- ftrl
|
||||
min: "0.5"
|
||||
max: "0.9"
|
||||
- name: num-epochs
|
||||
parameterType: int
|
||||
feasibleSpace:
|
||||
|
@ -52,12 +43,9 @@ spec:
|
|||
- name: learningRate
|
||||
description: Learning rate for the training model
|
||||
reference: lr
|
||||
- name: numberLayers
|
||||
description: Number of training model layers
|
||||
reference: num-layers
|
||||
- name: optimizer
|
||||
description: Training model optimizer (sdg, adam or ftrl)
|
||||
reference: optimizer
|
||||
- name: momentum
|
||||
description: Momentum for the training model
|
||||
reference: momentum
|
||||
- name: numberEpochs
|
||||
description: Number of epochs to train the model
|
||||
reference: num-epochs
|
||||
|
@ -69,13 +57,12 @@ spec:
|
|||
spec:
|
||||
containers:
|
||||
- name: training-container
|
||||
image: docker.io/kubeflowkatib/mxnet-mnist:latest
|
||||
image: ghcr.io/kubeflow/katib/pytorch-mnist-cpu:latest
|
||||
command:
|
||||
- "python3"
|
||||
- "/opt/mxnet-mnist/mnist.py"
|
||||
- "--batch-size=32"
|
||||
- "/opt/pytorch-mnist/mnist.py"
|
||||
- "--epochs=${trialParameters.numberEpochs}"
|
||||
- "--batch-size=16"
|
||||
- "--lr=${trialParameters.learningRate}"
|
||||
- "--num-layers=${trialParameters.numberLayers}"
|
||||
- "--optimizer=${trialParameters.optimizer}"
|
||||
- "--num-epochs=${trialParameters.numberEpochs}"
|
||||
- "--momentum=${trialParameters.momentum}"
|
||||
restartPolicy: Never
|
||||
|
|
|
@ -0,0 +1,74 @@
|
|||
---
|
||||
apiVersion: kubeflow.org/v1beta1
|
||||
kind: Experiment
|
||||
metadata:
|
||||
namespace: kubeflow
|
||||
name: hyperopt-distribution
|
||||
spec:
|
||||
objective:
|
||||
type: minimize
|
||||
goal: 0.05
|
||||
objectiveMetricName: loss
|
||||
algorithm:
|
||||
algorithmName: random
|
||||
parallelTrialCount: 3
|
||||
maxTrialCount: 12
|
||||
maxFailedTrialCount: 3
|
||||
parameters:
|
||||
- name: lr
|
||||
parameterType: double
|
||||
feasibleSpace:
|
||||
min: "0.01"
|
||||
max: "0.05"
|
||||
step: "0.01"
|
||||
distribution: normal
|
||||
- name: momentum
|
||||
parameterType: double
|
||||
feasibleSpace:
|
||||
min: "0.001"
|
||||
max: "1"
|
||||
distribution: uniform
|
||||
- name: epochs
|
||||
parameterType: int
|
||||
feasibleSpace:
|
||||
min: "1"
|
||||
max: "3"
|
||||
distribution: logUniform
|
||||
- name: batch_size
|
||||
parameterType: int
|
||||
feasibleSpace:
|
||||
min: "32"
|
||||
max: "64"
|
||||
distribution: logNormal
|
||||
trialTemplate:
|
||||
primaryContainerName: training-container
|
||||
trialParameters:
|
||||
- name: learningRate
|
||||
description: Learning rate for the training model
|
||||
reference: lr
|
||||
- name: momentum
|
||||
description: Momentum for the training model
|
||||
reference: momentum
|
||||
- name: epochs
|
||||
description: Epochs
|
||||
reference: epochs
|
||||
- name: batchSize
|
||||
description: Batch Size
|
||||
reference: batch_size
|
||||
trialSpec:
|
||||
apiVersion: batch/v1
|
||||
kind: Job
|
||||
spec:
|
||||
template:
|
||||
spec:
|
||||
containers:
|
||||
- name: training-container
|
||||
image: ghcr.io/kubeflow/katib/pytorch-mnist-cpu:latest
|
||||
command:
|
||||
- "python3"
|
||||
- "/opt/pytorch-mnist/mnist.py"
|
||||
- "--epochs=${trialParameters.epochs}"
|
||||
- "--batch-size=${trialParameters.batchSize}"
|
||||
- "--lr=${trialParameters.learningRate}"
|
||||
- "--momentum=${trialParameters.momentum}"
|
||||
restartPolicy: Never
|