Compare commits

...

9 Commits

Author SHA1 Message Date
Andrey Velichkevich fc6beec835 Katib official release v0.16.0 2023-10-31 19:53:25 +00:00
Andrey Velichkevich f299a22672 Katib official release v0.16.0-rc.1 2023-08-16 13:18:30 +01:00
Yuki Iwai f60c76f3de
Automated cherry pick of #2201: Upgrade Tensorflow version to v2.13.0 (#2216)
Signed-off-by: Yuki Iwai <yuki.iwai.tz@gmail.com>
2023-08-16 10:49:04 +00:00
Yuki Iwai 2116fc7340
Automated cherry pick of #2198: Bug: Wait for the certs to be mounted inside the container (#2213)
* Wait for the certs to be mounted inside the container



* Initialize fullServiceDomain when adding certgenerator to the manager



* Output logs every 15 seconds if the certs don't yet exist in the container



---------

Signed-off-by: Yuki Iwai <yuki.iwai.tz@gmail.com>
2023-08-16 10:01:04 +00:00
Yuki Iwai 1092dba86b
Automated cherry pick of #2209: Start waiting for certs to be ready before sending data to the channel (#2215)
Start waiting for certs to be ready before sending data to the channel

Signed-off-by: Yuki Iwai <yuki.iwai.tz@gmail.com>
2023-08-16 09:41:03 +00:00
Yuki Iwai 95e8ef16e3
Automated cherry pick of #2202: E2E: Add additional checks to verify if the components are ready (#2212)
Signed-off-by: Yuki Iwai <yuki.iwai.tz@gmail.com>
2023-08-16 09:34:03 +00:00
Yuki Iwai 945ae81623
Automated cherry pick of #2207: Remove a katib-webhook-cert Secret from components (#2214)
Signed-off-by: Yuki Iwai <yuki.iwai.tz@gmail.com>
2023-08-16 08:57:04 +00:00
Yuki Iwai 0dfb344d08
Automated cherry pick of #2203: Skip to inject the metrics-collector pods to the katib controller (#2211)
Signed-off-by: Yuki Iwai <yuki.iwai.tz@gmail.com>
2023-08-16 08:26:03 +00:00
Andrey Velichkevich bc5add92d5 Katib official release v0.16.0-rc.0 2023-08-05 20:13:33 +01:00
56 changed files with 303 additions and 172 deletions

View File

@ -136,6 +136,11 @@ func main() {
ctx := signals.SetupSignalHandler()
certsReady := make(chan struct{})
defer close(certsReady)
// setupControllers registers the controllers with the manager
// once the certs for the admission webhooks have been generated.
go setupControllers(mgr, certsReady, hookServer)
if initConfig.CertGeneratorConfig.Enable {
if err = cert.AddToManager(mgr, initConfig.CertGeneratorConfig, certsReady); err != nil {
log.Error(err, "Failed to set up cert-generator")
@ -144,10 +149,6 @@ func main() {
certsReady <- struct{}{}
}
// The setupControllers will register controllers to the manager
// after generated certs for the admission webhooks.
go setupControllers(mgr, certsReady, hookServer)
log.Info("Setting up health checker.")
if err := mgr.AddReadyzCheck("readyz", hookServer.StartedChecker()); err != nil {
log.Error(err, "Unable to add readyz endpoint to the manager")

View File

@ -2,4 +2,5 @@ psutil==5.9.4
rfc3339>=6.2
grpcio>=1.41.1
googleapis-common-protos==1.6.0
tensorflow==2.11.0
tensorflow==2.13.0
protobuf<=3.20.3

View File

@ -1,4 +1,5 @@
grpcio>=1.41.1
googleapis-common-protos==1.6.0
cython>=0.29.24
tensorflow==2.11.0
tensorflow==2.13.0
protobuf<=3.20.3

View File

@ -76,7 +76,7 @@ spec:
- name: num-examples
container:
name: model-training
image: docker.io/kubeflowkatib/mxnet-mnist:latest
image: docker.io/kubeflowkatib/mxnet-mnist:v0.16.0
command:
- "python3"
- "/opt/mxnet-mnist/mnist.py"

View File

@ -62,7 +62,7 @@ spec:
spec:
containers:
- name: training-container
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.16.0
command:
- "python3"
- "/opt/pytorch-mnist/mnist.py"

View File

@ -54,7 +54,7 @@ spec:
spec:
containers:
- name: training-container
image: docker.io/kubeflowkatib/mxnet-mnist:latest
image: docker.io/kubeflowkatib/mxnet-mnist:v0.16.0
command:
- "python3"
- "/opt/mxnet-mnist/mnist.py"

View File

@ -57,7 +57,7 @@ spec:
spec:
containers:
- name: training-container
image: docker.io/kubeflowkatib/mxnet-mnist:latest
image: docker.io/kubeflowkatib/mxnet-mnist:v0.16.0
command:
- "python3"
- "/opt/mxnet-mnist/mnist.py"

View File

@ -57,7 +57,7 @@ spec:
spec:
containers:
- name: training-container
image: docker.io/kubeflowkatib/mxnet-mnist:latest
image: docker.io/kubeflowkatib/mxnet-mnist:v0.16.0
command:
- "python3"
- "/opt/mxnet-mnist/mnist.py"

View File

@ -55,7 +55,7 @@ spec:
spec:
containers:
- name: training-container
image: docker.io/kubeflowkatib/mxnet-mnist:latest
image: docker.io/kubeflowkatib/mxnet-mnist:v0.16.0
command:
- "python3"
- "/opt/mxnet-mnist/mnist.py"

View File

@ -69,7 +69,7 @@ spec:
spec:
containers:
- name: training-container
image: docker.io/kubeflowkatib/mxnet-mnist:latest
image: docker.io/kubeflowkatib/mxnet-mnist:v0.16.0
command:
- "python3"
- "/opt/mxnet-mnist/mnist.py"

View File

@ -54,7 +54,7 @@ spec:
spec:
containers:
- name: training-container
image: docker.io/kubeflowkatib/mxnet-mnist:latest
image: docker.io/kubeflowkatib/mxnet-mnist:v0.16.0
command:
- "python3"
- "/opt/mxnet-mnist/mnist.py"

View File

@ -54,7 +54,7 @@ spec:
spec:
containers:
- name: training-container
image: docker.io/kubeflowkatib/mxnet-mnist:latest
image: docker.io/kubeflowkatib/mxnet-mnist:v0.16.0
command:
- "python3"
- "/opt/mxnet-mnist/mnist.py"

View File

@ -43,7 +43,7 @@ spec:
spec:
containers:
- name: training-container
image: docker.io/kubeflowkatib/simple-pbt:latest
image: docker.io/kubeflowkatib/simple-pbt:v0.16.0
command:
- "python3"
- "/opt/pbt/pbt_test.py"

View File

@ -54,7 +54,7 @@ spec:
spec:
containers:
- name: training-container
image: docker.io/kubeflowkatib/mxnet-mnist:latest
image: docker.io/kubeflowkatib/mxnet-mnist:v0.16.0
command:
- "python3"
- "/opt/mxnet-mnist/mnist.py"

View File

@ -54,7 +54,7 @@ spec:
spec:
containers:
- name: training-container
image: docker.io/kubeflowkatib/mxnet-mnist:latest
image: docker.io/kubeflowkatib/mxnet-mnist:v0.16.0
command:
- "python3"
- "/opt/mxnet-mnist/mnist.py"

View File

@ -46,7 +46,7 @@ spec:
spec:
containers:
- name: pytorch
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.16.0
command:
- "python3"
- "/opt/pytorch-mnist/mnist.py"
@ -61,7 +61,7 @@ spec:
spec:
containers:
- name: pytorch
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.16.0
command:
- "python3"
- "/opt/pytorch-mnist/mnist.py"

View File

@ -56,7 +56,7 @@ spec:
spec:
containers:
- name: tensorflow
image: docker.io/kubeflowkatib/tf-mnist-with-summaries:latest
image: docker.io/kubeflowkatib/tf-mnist-with-summaries:v0.16.0
command:
- "python"
- "/opt/tf-mnist-with-summaries/mnist.py"

View File

@ -67,7 +67,7 @@ spec:
spec:
containers:
- name: training-container
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.16.0
command:
- "python3"
- "/opt/pytorch-mnist/mnist.py"

View File

@ -52,7 +52,7 @@ spec:
spec:
containers:
- name: training-container
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.16.0
command:
- "python3"
- "/opt/pytorch-mnist/mnist.py"

View File

@ -54,7 +54,7 @@ spec:
spec:
containers:
- name: training-container
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.16.0
command:
- "python3"
- "/opt/pytorch-mnist/mnist.py"

View File

@ -59,7 +59,7 @@ spec:
spec:
containers:
- name: training-container
image: docker.io/kubeflowkatib/mxnet-mnist:latest
image: docker.io/kubeflowkatib/mxnet-mnist:v0.16.0
command:
- "python3"
- "/opt/mxnet-mnist/mnist.py"

View File

@ -60,7 +60,7 @@ spec:
spec:
containers:
- name: training-container
image: docker.io/kubeflowkatib/darts-cnn-cifar10-cpu:latest
image: docker.io/kubeflowkatib/darts-cnn-cifar10-cpu:v0.16.0
command:
- python3
- run_trial.py

View File

@ -77,7 +77,7 @@ spec:
spec:
containers:
- name: training-container
image: docker.io/kubeflowkatib/darts-cnn-cifar10-gpu:latest
image: docker.io/kubeflowkatib/darts-cnn-cifar10-gpu:v0.16.0
command:
- python3
- run_trial.py

View File

@ -139,7 +139,7 @@ spec:
spec:
containers:
- name: training-container
image: docker.io/kubeflowkatib/enas-cnn-cifar10-cpu:latest
image: docker.io/kubeflowkatib/enas-cnn-cifar10-cpu:v0.16.0
command:
- python3
- -u

View File

@ -136,7 +136,7 @@ spec:
spec:
containers:
- name: training-container
image: docker.io/kubeflowkatib/enas-cnn-cifar10-gpu:latest
image: docker.io/kubeflowkatib/enas-cnn-cifar10-gpu:v0.16.0
command:
- python3
- -u

View File

@ -55,7 +55,7 @@ spec:
spec:
containers:
- name: training-container
image: docker.io/kubeflowkatib/mxnet-mnist:latest
image: docker.io/kubeflowkatib/mxnet-mnist:v0.16.0
command:
- "python3"
- "/opt/mxnet-mnist/mnist.py"

View File

@ -55,7 +55,7 @@ spec:
spec:
containers:
- name: training-container
image: docker.io/kubeflowkatib/mxnet-mnist:latest
image: docker.io/kubeflowkatib/mxnet-mnist:v0.16.0
command:
- "python3"
- "/opt/mxnet-mnist/mnist.py"

View File

@ -89,7 +89,7 @@ spec:
description: Number of training examples
steps:
- name: model-training
image: docker.io/kubeflowkatib/mxnet-mnist:latest
image: docker.io/kubeflowkatib/mxnet-mnist:v0.16.0
command:
- "python3"
- "/opt/mxnet-mnist/mnist.py"

View File

@ -1,2 +1,2 @@
scipy>=1.7.2
tensorflow==2.11.0
tensorflow==2.13.0

View File

@ -1 +1 @@
tensorflow==2.11.0
tensorflow==2.13.0

View File

@ -60,7 +60,7 @@ spec:
spec:
containers:
- name: training-container
image: docker.io/kubeflowkatib/mxnet-mnist:latest
image: docker.io/kubeflowkatib/mxnet-mnist:v0.16.0
command:
- "python3"
- "/opt/mxnet-mnist/mnist.py"

View File

@ -15,6 +15,7 @@ spec:
metadata:
labels:
katib.kubeflow.org/component: controller
katib.kubeflow.org/metrics-collector-injection: disabled
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "8080"

View File

@ -15,7 +15,7 @@ data:
spec:
containers:
- name: training-container
image: docker.io/kubeflowkatib/mxnet-mnist:latest
image: docker.io/kubeflowkatib/mxnet-mnist:v0.16.0
command:
- "python3"
- "/opt/mxnet-mnist/mnist.py"
@ -33,7 +33,7 @@ data:
spec:
containers:
- name: training-container
image: docker.io/kubeflowkatib/enas-cnn-cifar10-cpu:latest
image: docker.io/kubeflowkatib/enas-cnn-cifar10-cpu:v0.16.0
command:
- python3
- -u
@ -54,7 +54,7 @@ data:
spec:
containers:
- name: pytorch
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.16.0
command:
- "python3"
- "/opt/pytorch-mnist/mnist.py"
@ -68,7 +68,7 @@ data:
spec:
containers:
- name: pytorch
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.16.0
command:
- "python3"
- "/opt/pytorch-mnist/mnist.py"

View File

@ -4,4 +4,3 @@ kind: Kustomization
resources:
- webhooks.yaml
- secret.yaml

View File

@ -1,5 +0,0 @@
---
apiVersion: v1
kind: Secret
metadata:
name: katib-webhook-cert

View File

@ -63,6 +63,16 @@ webhooks:
namespaceSelector:
matchLabels:
katib.kubeflow.org/metrics-collector-injection: enabled
# Once the AdmissionWebhookMatchConditions feature gate is enabled by default, we should switch to control based on userInfo.
# REF:
# - AdmissionWebhookMatchConditions: https://kubernetes.io/docs/reference/access-authn-authz/extensible-admission-controllers/#matching-requests-matchconditions
# - Tracking issue: https://github.com/kubeflow/katib/issues/2206
objectSelector:
matchExpressions:
- key: katib.kubeflow.org/metrics-collector-injection
operator: NotIn
values:
- disabled
rules:
- apiGroups:
- ""

View File

@ -14,40 +14,40 @@ init:
runtime:
metricsCollectors:
- kind: StdOut
image: docker.io/kubeflowkatib/file-metrics-collector:latest
image: docker.io/kubeflowkatib/file-metrics-collector:v0.16.0
- kind: File
image: docker.io/kubeflowkatib/file-metrics-collector:latest
image: docker.io/kubeflowkatib/file-metrics-collector:v0.16.0
- kind: TensorFlowEvent
image: docker.io/kubeflowkatib/tfevent-metrics-collector:latest
image: docker.io/kubeflowkatib/tfevent-metrics-collector:v0.16.0
resources:
limits:
memory: 1Gi
suggestions:
- algorithmName: random
image: docker.io/kubeflowkatib/suggestion-hyperopt:latest
image: docker.io/kubeflowkatib/suggestion-hyperopt:v0.16.0
- algorithmName: tpe
image: docker.io/kubeflowkatib/suggestion-hyperopt:latest
image: docker.io/kubeflowkatib/suggestion-hyperopt:v0.16.0
- algorithmName: grid
image: docker.io/kubeflowkatib/suggestion-optuna:latest
image: docker.io/kubeflowkatib/suggestion-optuna:v0.16.0
- algorithmName: hyperband
image: docker.io/kubeflowkatib/suggestion-hyperband:latest
image: docker.io/kubeflowkatib/suggestion-hyperband:v0.16.0
- algorithmName: bayesianoptimization
image: docker.io/kubeflowkatib/suggestion-skopt:latest
image: docker.io/kubeflowkatib/suggestion-skopt:v0.16.0
- algorithmName: cmaes
image: docker.io/kubeflowkatib/suggestion-goptuna:latest
image: docker.io/kubeflowkatib/suggestion-goptuna:v0.16.0
- algorithmName: sobol
image: docker.io/kubeflowkatib/suggestion-goptuna:latest
image: docker.io/kubeflowkatib/suggestion-goptuna:v0.16.0
- algorithmName: multivariate-tpe
image: docker.io/kubeflowkatib/suggestion-optuna:latest
image: docker.io/kubeflowkatib/suggestion-optuna:v0.16.0
- algorithmName: enas
image: docker.io/kubeflowkatib/suggestion-enas:latest
image: docker.io/kubeflowkatib/suggestion-enas:v0.16.0
resources:
limits:
memory: 200Mi
- algorithmName: darts
image: docker.io/kubeflowkatib/suggestion-darts:latest
image: docker.io/kubeflowkatib/suggestion-darts:v0.16.0
- algorithmName: pbt
image: docker.io/kubeflowkatib/suggestion-pbt:latest
image: docker.io/kubeflowkatib/suggestion-pbt:v0.16.0
persistentVolumeClaimSpec:
accessModes:
- ReadWriteMany
@ -56,4 +56,4 @@ runtime:
storage: 5Gi
earlyStoppings:
- algorithmName: medianstop
image: docker.io/kubeflowkatib/earlystopping-medianstop:latest
image: docker.io/kubeflowkatib/earlystopping-medianstop:v0.16.0

View File

@ -22,13 +22,13 @@ resources:
images:
- name: docker.io/kubeflowkatib/katib-controller
newName: docker.io/kubeflowkatib/katib-controller
newTag: latest
newTag: v0.16.0
- name: docker.io/kubeflowkatib/katib-db-manager
newName: docker.io/kubeflowkatib/katib-db-manager
newTag: latest
newTag: v0.16.0
- name: docker.io/kubeflowkatib/katib-ui
newName: docker.io/kubeflowkatib/katib-ui
newTag: latest
newTag: v0.16.0
patchesStrategicMerge:
- patches/katib-cert-injection.yaml

View File

@ -16,40 +16,40 @@ init:
runtime:
metricsCollectors:
- kind: StdOut
image: docker.io/kubeflowkatib/file-metrics-collector:latest
image: docker.io/kubeflowkatib/file-metrics-collector:v0.16.0
- kind: File
image: docker.io/kubeflowkatib/file-metrics-collector:latest
image: docker.io/kubeflowkatib/file-metrics-collector:v0.16.0
- kind: TensorFlowEvent
image: docker.io/kubeflowkatib/tfevent-metrics-collector:latest
image: docker.io/kubeflowkatib/tfevent-metrics-collector:v0.16.0
resources:
limits:
memory: 1Gi
suggestions:
- algorithmName: random
image: docker.io/kubeflowkatib/suggestion-hyperopt:latest
image: docker.io/kubeflowkatib/suggestion-hyperopt:v0.16.0
- algorithmName: tpe
image: docker.io/kubeflowkatib/suggestion-hyperopt:latest
image: docker.io/kubeflowkatib/suggestion-hyperopt:v0.16.0
- algorithmName: grid
image: docker.io/kubeflowkatib/suggestion-optuna:latest
image: docker.io/kubeflowkatib/suggestion-optuna:v0.16.0
- algorithmName: hyperband
image: docker.io/kubeflowkatib/suggestion-hyperband:latest
image: docker.io/kubeflowkatib/suggestion-hyperband:v0.16.0
- algorithmName: bayesianoptimization
image: docker.io/kubeflowkatib/suggestion-skopt:latest
image: docker.io/kubeflowkatib/suggestion-skopt:v0.16.0
- algorithmName: cmaes
image: docker.io/kubeflowkatib/suggestion-goptuna:latest
image: docker.io/kubeflowkatib/suggestion-goptuna:v0.16.0
- algorithmName: sobol
image: docker.io/kubeflowkatib/suggestion-goptuna:latest
image: docker.io/kubeflowkatib/suggestion-goptuna:v0.16.0
- algorithmName: multivariate-tpe
image: docker.io/kubeflowkatib/suggestion-optuna:latest
image: docker.io/kubeflowkatib/suggestion-optuna:v0.16.0
- algorithmName: enas
image: docker.io/kubeflowkatib/suggestion-enas:latest
image: docker.io/kubeflowkatib/suggestion-enas:v0.16.0
resources:
limits:
memory: 200Mi
- algorithmName: darts
image: docker.io/kubeflowkatib/suggestion-darts:latest
image: docker.io/kubeflowkatib/suggestion-darts:v0.16.0
- algorithmName: pbt
image: docker.io/kubeflowkatib/suggestion-pbt:latest
image: docker.io/kubeflowkatib/suggestion-pbt:v0.16.0
persistentVolumeClaimSpec:
accessModes:
- ReadWriteMany
@ -58,4 +58,4 @@ runtime:
storage: 5Gi
earlyStoppings:
- algorithmName: medianstop
image: docker.io/kubeflowkatib/earlystopping-medianstop:latest
image: docker.io/kubeflowkatib/earlystopping-medianstop:v0.16.0

View File

@ -18,13 +18,13 @@ resources:
images:
- name: docker.io/kubeflowkatib/katib-controller
newName: docker.io/kubeflowkatib/katib-controller
newTag: latest
newTag: v0.16.0
- name: docker.io/kubeflowkatib/katib-db-manager
newName: docker.io/kubeflowkatib/katib-db-manager
newTag: latest
newTag: v0.16.0
- name: docker.io/kubeflowkatib/katib-ui
newName: docker.io/kubeflowkatib/katib-ui
newTag: latest
newTag: v0.16.0
patchesStrategicMerge:
- patches/db-manager.yaml
# Modify katib-mysql-secrets with parameters for the DB.
@ -32,6 +32,10 @@ secretGenerator:
- name: katib-mysql-secrets
envs:
- secrets.env
# Secret for webhooks certs.
- name: katib-webhook-cert
options:
disableNameSuffixHash: true
configMapGenerator:
- name: katib-config
behavior: create

View File

@ -17,40 +17,40 @@ init:
runtime:
metricsCollectors:
- kind: StdOut
image: docker.io/kubeflowkatib/file-metrics-collector:latest
image: docker.io/kubeflowkatib/file-metrics-collector:v0.16.0
- kind: File
image: docker.io/kubeflowkatib/file-metrics-collector:latest
image: docker.io/kubeflowkatib/file-metrics-collector:v0.16.0
- kind: TensorFlowEvent
image: docker.io/kubeflowkatib/tfevent-metrics-collector:latest
image: docker.io/kubeflowkatib/tfevent-metrics-collector:v0.16.0
resources:
limits:
memory: 1Gi
suggestions:
- algorithmName: random
image: docker.io/kubeflowkatib/suggestion-hyperopt:latest
image: docker.io/kubeflowkatib/suggestion-hyperopt:v0.16.0
- algorithmName: tpe
image: docker.io/kubeflowkatib/suggestion-hyperopt:latest
image: docker.io/kubeflowkatib/suggestion-hyperopt:v0.16.0
- algorithmName: grid
image: docker.io/kubeflowkatib/suggestion-optuna:latest
image: docker.io/kubeflowkatib/suggestion-optuna:v0.16.0
- algorithmName: hyperband
image: docker.io/kubeflowkatib/suggestion-hyperband:latest
image: docker.io/kubeflowkatib/suggestion-hyperband:v0.16.0
- algorithmName: bayesianoptimization
image: docker.io/kubeflowkatib/suggestion-skopt:latest
image: docker.io/kubeflowkatib/suggestion-skopt:v0.16.0
- algorithmName: cmaes
image: docker.io/kubeflowkatib/suggestion-goptuna:latest
image: docker.io/kubeflowkatib/suggestion-goptuna:v0.16.0
- algorithmName: sobol
image: docker.io/kubeflowkatib/suggestion-goptuna:latest
image: docker.io/kubeflowkatib/suggestion-goptuna:v0.16.0
- algorithmName: multivariate-tpe
image: docker.io/kubeflowkatib/suggestion-optuna:latest
image: docker.io/kubeflowkatib/suggestion-optuna:v0.16.0
- algorithmName: enas
image: docker.io/kubeflowkatib/suggestion-enas:latest
image: docker.io/kubeflowkatib/suggestion-enas:v0.16.0
resources:
limits:
memory: 200Mi
- algorithmName: darts
image: docker.io/kubeflowkatib/suggestion-darts:latest
image: docker.io/kubeflowkatib/suggestion-darts:v0.16.0
- algorithmName: pbt
image: docker.io/kubeflowkatib/suggestion-pbt:latest
image: docker.io/kubeflowkatib/suggestion-pbt:v0.16.0
persistentVolumeClaimSpec:
accessModes:
- ReadWriteMany
@ -59,4 +59,4 @@ runtime:
storage: 5Gi
earlyStoppings:
- algorithmName: medianstop
image: docker.io/kubeflowkatib/earlystopping-medianstop:latest
image: docker.io/kubeflowkatib/earlystopping-medianstop:v0.16.0

View File

@ -14,40 +14,40 @@ init:
runtime:
metricsCollectors:
- kind: StdOut
image: docker.io/kubeflowkatib/file-metrics-collector:latest
image: docker.io/kubeflowkatib/file-metrics-collector:v0.16.0
- kind: File
image: docker.io/kubeflowkatib/file-metrics-collector:latest
image: docker.io/kubeflowkatib/file-metrics-collector:v0.16.0
- kind: TensorFlowEvent
image: docker.io/kubeflowkatib/tfevent-metrics-collector:latest
image: docker.io/kubeflowkatib/tfevent-metrics-collector:v0.16.0
resources:
limits:
memory: 1Gi
suggestions:
- algorithmName: random
image: docker.io/kubeflowkatib/suggestion-hyperopt:latest
image: docker.io/kubeflowkatib/suggestion-hyperopt:v0.16.0
- algorithmName: tpe
image: docker.io/kubeflowkatib/suggestion-hyperopt:latest
image: docker.io/kubeflowkatib/suggestion-hyperopt:v0.16.0
- algorithmName: grid
image: docker.io/kubeflowkatib/suggestion-optuna:latest
image: docker.io/kubeflowkatib/suggestion-optuna:v0.16.0
- algorithmName: hyperband
image: docker.io/kubeflowkatib/suggestion-hyperband:latest
image: docker.io/kubeflowkatib/suggestion-hyperband:v0.16.0
- algorithmName: bayesianoptimization
image: docker.io/kubeflowkatib/suggestion-skopt:latest
image: docker.io/kubeflowkatib/suggestion-skopt:v0.16.0
- algorithmName: cmaes
image: docker.io/kubeflowkatib/suggestion-goptuna:latest
image: docker.io/kubeflowkatib/suggestion-goptuna:v0.16.0
- algorithmName: sobol
image: docker.io/kubeflowkatib/suggestion-goptuna:latest
image: docker.io/kubeflowkatib/suggestion-goptuna:v0.16.0
- algorithmName: multivariate-tpe
image: docker.io/kubeflowkatib/suggestion-optuna:latest
image: docker.io/kubeflowkatib/suggestion-optuna:v0.16.0
- algorithmName: enas
image: docker.io/kubeflowkatib/suggestion-enas:latest
image: docker.io/kubeflowkatib/suggestion-enas:v0.16.0
resources:
limits:
memory: 200Mi
- algorithmName: darts
image: docker.io/kubeflowkatib/suggestion-darts:latest
image: docker.io/kubeflowkatib/suggestion-darts:v0.16.0
- algorithmName: pbt
image: docker.io/kubeflowkatib/suggestion-pbt:latest
image: docker.io/kubeflowkatib/suggestion-pbt:v0.16.0
persistentVolumeClaimSpec:
accessModes:
- ReadWriteMany
@ -56,4 +56,4 @@ runtime:
storage: 5Gi
earlyStoppings:
- algorithmName: medianstop
image: docker.io/kubeflowkatib/earlystopping-medianstop:latest
image: docker.io/kubeflowkatib/earlystopping-medianstop:v0.16.0

View File

@ -30,13 +30,13 @@ resources:
images:
- name: docker.io/kubeflowkatib/katib-controller
newName: docker.io/kubeflowkatib/katib-controller
newTag: latest
newTag: v0.16.0
- name: docker.io/kubeflowkatib/katib-db-manager
newName: docker.io/kubeflowkatib/katib-db-manager
newTag: latest
newTag: v0.16.0
- name: docker.io/kubeflowkatib/katib-ui
newName: docker.io/kubeflowkatib/katib-ui
newTag: latest
newTag: v0.16.0
patchesJson6902:
# Annotate Service to delegate TLS-secret generation to OpenShift service controller

View File

@ -16,40 +16,40 @@ init:
runtime:
metricsCollectors:
- kind: StdOut
image: docker.io/kubeflowkatib/file-metrics-collector:latest
image: docker.io/kubeflowkatib/file-metrics-collector:v0.16.0
- kind: File
image: docker.io/kubeflowkatib/file-metrics-collector:latest
image: docker.io/kubeflowkatib/file-metrics-collector:v0.16.0
- kind: TensorFlowEvent
image: docker.io/kubeflowkatib/tfevent-metrics-collector:latest
image: docker.io/kubeflowkatib/tfevent-metrics-collector:v0.16.0
resources:
limits:
memory: 1Gi
suggestions:
- algorithmName: random
image: docker.io/kubeflowkatib/suggestion-hyperopt:latest
image: docker.io/kubeflowkatib/suggestion-hyperopt:v0.16.0
- algorithmName: tpe
image: docker.io/kubeflowkatib/suggestion-hyperopt:latest
image: docker.io/kubeflowkatib/suggestion-hyperopt:v0.16.0
- algorithmName: grid
image: docker.io/kubeflowkatib/suggestion-optuna:latest
image: docker.io/kubeflowkatib/suggestion-optuna:v0.16.0
- algorithmName: hyperband
image: docker.io/kubeflowkatib/suggestion-hyperband:latest
image: docker.io/kubeflowkatib/suggestion-hyperband:v0.16.0
- algorithmName: bayesianoptimization
image: docker.io/kubeflowkatib/suggestion-skopt:latest
image: docker.io/kubeflowkatib/suggestion-skopt:v0.16.0
- algorithmName: cmaes
image: docker.io/kubeflowkatib/suggestion-goptuna:latest
image: docker.io/kubeflowkatib/suggestion-goptuna:v0.16.0
- algorithmName: sobol
image: docker.io/kubeflowkatib/suggestion-goptuna:latest
image: docker.io/kubeflowkatib/suggestion-goptuna:v0.16.0
- algorithmName: multivariate-tpe
image: docker.io/kubeflowkatib/suggestion-optuna:latest
image: docker.io/kubeflowkatib/suggestion-optuna:v0.16.0
- algorithmName: enas
image: docker.io/kubeflowkatib/suggestion-enas:latest
image: docker.io/kubeflowkatib/suggestion-enas:v0.16.0
resources:
limits:
memory: 200Mi
- algorithmName: darts
image: docker.io/kubeflowkatib/suggestion-darts:latest
image: docker.io/kubeflowkatib/suggestion-darts:v0.16.0
- algorithmName: pbt
image: docker.io/kubeflowkatib/suggestion-pbt:latest
image: docker.io/kubeflowkatib/suggestion-pbt:v0.16.0
persistentVolumeClaimSpec:
accessModes:
- ReadWriteMany
@ -58,4 +58,4 @@ runtime:
storage: 5Gi
earlyStoppings:
- algorithmName: medianstop
image: docker.io/kubeflowkatib/earlystopping-medianstop:latest
image: docker.io/kubeflowkatib/earlystopping-medianstop:v0.16.0

View File

@ -20,13 +20,13 @@ resources:
images:
- name: docker.io/kubeflowkatib/katib-controller
newName: docker.io/kubeflowkatib/katib-controller
newTag: latest
newTag: v0.16.0
- name: docker.io/kubeflowkatib/katib-db-manager
newName: docker.io/kubeflowkatib/katib-db-manager
newTag: latest
newTag: v0.16.0
- name: docker.io/kubeflowkatib/katib-ui
newName: docker.io/kubeflowkatib/katib-ui
newTag: latest
newTag: v0.16.0
patchesJson6902:
- target:
group: apps
@ -41,3 +41,8 @@ configMapGenerator:
- katib-config.yaml
options:
disableNameSuffixHash: true
# Secret for webhooks certs.
secretGenerator:
- name: katib-webhook-cert
options:
disableNameSuffixHash: true

View File

@ -16,40 +16,40 @@ init:
runtime:
metricsCollectors:
- kind: StdOut
image: docker.io/kubeflowkatib/file-metrics-collector:latest
image: docker.io/kubeflowkatib/file-metrics-collector:v0.16.0
- kind: File
image: docker.io/kubeflowkatib/file-metrics-collector:latest
image: docker.io/kubeflowkatib/file-metrics-collector:v0.16.0
- kind: TensorFlowEvent
image: docker.io/kubeflowkatib/tfevent-metrics-collector:latest
image: docker.io/kubeflowkatib/tfevent-metrics-collector:v0.16.0
resources:
limits:
memory: 1Gi
suggestions:
- algorithmName: random
image: docker.io/kubeflowkatib/suggestion-hyperopt:latest
image: docker.io/kubeflowkatib/suggestion-hyperopt:v0.16.0
- algorithmName: tpe
image: docker.io/kubeflowkatib/suggestion-hyperopt:latest
image: docker.io/kubeflowkatib/suggestion-hyperopt:v0.16.0
- algorithmName: grid
image: docker.io/kubeflowkatib/suggestion-optuna:latest
image: docker.io/kubeflowkatib/suggestion-optuna:v0.16.0
- algorithmName: hyperband
image: docker.io/kubeflowkatib/suggestion-hyperband:latest
image: docker.io/kubeflowkatib/suggestion-hyperband:v0.16.0
- algorithmName: bayesianoptimization
image: docker.io/kubeflowkatib/suggestion-skopt:latest
image: docker.io/kubeflowkatib/suggestion-skopt:v0.16.0
- algorithmName: cmaes
image: docker.io/kubeflowkatib/suggestion-goptuna:latest
image: docker.io/kubeflowkatib/suggestion-goptuna:v0.16.0
- algorithmName: sobol
image: docker.io/kubeflowkatib/suggestion-goptuna:latest
image: docker.io/kubeflowkatib/suggestion-goptuna:v0.16.0
- algorithmName: multivariate-tpe
image: docker.io/kubeflowkatib/suggestion-optuna:latest
image: docker.io/kubeflowkatib/suggestion-optuna:v0.16.0
- algorithmName: enas
image: docker.io/kubeflowkatib/suggestion-enas:latest
image: docker.io/kubeflowkatib/suggestion-enas:v0.16.0
resources:
limits:
memory: 200Mi
- algorithmName: darts
image: docker.io/kubeflowkatib/suggestion-darts:latest
image: docker.io/kubeflowkatib/suggestion-darts:v0.16.0
- algorithmName: pbt
image: docker.io/kubeflowkatib/suggestion-pbt:latest
image: docker.io/kubeflowkatib/suggestion-pbt:v0.16.0
persistentVolumeClaimSpec:
accessModes:
- ReadWriteMany
@ -58,4 +58,4 @@ runtime:
storage: 5Gi
earlyStoppings:
- algorithmName: medianstop
image: docker.io/kubeflowkatib/earlystopping-medianstop:latest
image: docker.io/kubeflowkatib/earlystopping-medianstop:v0.16.0

View File

@ -20,13 +20,13 @@ resources:
images:
- name: docker.io/kubeflowkatib/katib-controller
newName: docker.io/kubeflowkatib/katib-controller
newTag: latest
newTag: v0.16.0
- name: docker.io/kubeflowkatib/katib-db-manager
newName: docker.io/kubeflowkatib/katib-db-manager
newTag: latest
newTag: v0.16.0
- name: docker.io/kubeflowkatib/katib-ui
newName: docker.io/kubeflowkatib/katib-ui
newTag: latest
newTag: v0.16.0
configMapGenerator:
- name: katib-config
behavior: create
@ -34,3 +34,8 @@ configMapGenerator:
- katib-config.yaml
options:
disableNameSuffixHash: true
# Secret for webhooks certs.
secretGenerator:
- name: katib-webhook-cert
options:
disableNameSuffixHash: true

View File

@ -11,13 +11,13 @@ resources:
images:
- name: docker.io/kubeflowkatib/katib-controller
newName: docker.io/kubeflowkatib/katib-controller
newTag: latest
newTag: v0.16.0
- name: docker.io/kubeflowkatib/katib-db-manager
newName: docker.io/kubeflowkatib/katib-db-manager
newTag: latest
newTag: v0.16.0
- name: docker.io/kubeflowkatib/katib-ui
newName: docker.io/kubeflowkatib/katib-ui
newTag: latest
newTag: v0.16.0
patchesStrategicMerge:
- patches/remove-namespace.yaml

View File

@ -26,12 +26,15 @@ import (
"errors"
"fmt"
"math/big"
"os"
"path/filepath"
"strings"
"time"
admissionregistrationv1 "k8s.io/api/admissionregistration/v1"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/klog"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/manager"
@ -53,11 +56,11 @@ type CertGenerator struct {
namespace string
webhookServiceName string
webhookSecretName string
fullServiceDomain string
kubeClient client.Client
certsReady chan struct{}
certs *certificates
fullServiceDomain string
}
var _ manager.Runnable = &CertGenerator{}
@ -67,11 +70,50 @@ func (c *CertGenerator) Start(ctx context.Context) error {
if err := c.generate(ctx); err != nil {
return err
}
klog.Info("Waiting for certs to get ready.")
if err := wait.ExponentialBackoffWithContext(ctx, wait.Backoff{
Duration: time.Second,
Factor: 2,
Jitter: 1,
Steps: 10,
Cap: time.Minute * 5,
}, ensureCertMounted(time.Now())); err != nil {
return err
}
// Send an empty struct on certsReady to signal that the controllers can now be registered with the manager.
c.certsReady <- struct{}{}
return nil
}
// ensureCertMounted builds a poll condition that reports whether both the
// server cert and the server key are present under consts.CertDir.
//
// The condition never returns a non-nil error: any os.Stat failure is treated
// as "not mounted yet" and yields (false, nil). While a file is missing, an
// informational log line is written only when at least 15 seconds have passed
// since start; start is then moved forward so the next line is delayed again.
func ensureCertMounted(start time.Time) func(context.Context) (bool, error) {
	return func(_ context.Context) (bool, error) {
		current := time.Now()
		shouldLog := current.Sub(start) >= 15*time.Second
		if shouldLog {
			// Restart the interval from the moment logging was enabled.
			start = current
		}

		// The cert is checked first; the key is only inspected once the cert exists.
		certPath := filepath.Join(consts.CertDir, serverCertName)
		_, certErr := os.Stat(certPath)
		if certErr != nil {
			if shouldLog {
				klog.Infof("Public key file %q doesn't exist in the container yet", certPath)
			}
			return false, nil
		}

		keyPath := filepath.Join(consts.CertDir, serverKeyName)
		_, keyErr := os.Stat(keyPath)
		if keyErr != nil {
			if shouldLog {
				klog.Infof("Private key file %q doesn't exist in the container yet", keyPath)
			}
			return false, nil
		}

		klog.Info("Succeeded to be mounted certs inside the container.")
		return true, nil
	}
}
// NeedLeaderElection always returns false.
// NOTE(review): presumably consulted by the controller-runtime manager so that the
// cert generator is started without waiting for leader election — confirm.
func (c *CertGenerator) NeedLeaderElection() bool {
return false
}
@ -82,6 +124,11 @@ func AddToManager(mgr manager.Manager, config configv1beta1.CertGeneratorConfig,
namespace: consts.DefaultKatibNamespace,
webhookServiceName: config.WebhookServiceName,
webhookSecretName: config.WebhookSecretName,
fullServiceDomain: strings.Join([]string{
config.WebhookServiceName,
consts.DefaultKatibNamespace,
"svc",
}, "."),
kubeClient: mgr.GetClient(),
certsReady: certsReady,
})
@ -99,8 +146,6 @@ func (c *CertGenerator) generate(ctx context.Context) error {
return fmt.Errorf("%w: %v", errCertCheckFail, err)
}
if !certExist {
c.fullServiceDomain = strings.Join([]string{c.webhookServiceName, c.namespace, "svc"}, ".")
if err = c.createCert(); err != nil {
return fmt.Errorf("%w: %v", errCreateCertFail, err)
}

View File

@ -18,8 +18,11 @@ package certgenerator
import (
"context"
"os"
"path/filepath"
"strings"
"testing"
"time"
"github.com/google/go-cmp/cmp"
"github.com/google/go-cmp/cmp/cmpopts"
@ -31,6 +34,7 @@ import (
"sigs.k8s.io/controller-runtime/pkg/client/fake"
configv1beta1 "github.com/kubeflow/katib/pkg/apis/config/v1beta1"
"github.com/kubeflow/katib/pkg/controller.v1beta1/consts"
)
func TestGenerate(t *testing.T) {
@ -210,3 +214,61 @@ func buildFakeClient(kubeResources []client.Object) client.Client {
}
return fakeClientBuilder.Build()
}
// TestEnsureCertMounted checks that the condition function returned by
// ensureCertMounted reports true only when both the private key file and the
// certificate file exist in consts.CertDir, and false when either is missing.
func TestEnsureCertMounted(t *testing.T) {
	tests := map[string]struct {
		keyExist  bool
		certExist bool
		wantExist bool
	}{
		"key and cert exist": {
			keyExist:  true,
			certExist: true,
			wantExist: true,
		},
		"key doesn't exist": {
			keyExist:  false,
			certExist: true,
			wantExist: false,
		},
		"cert doesn't exist": {
			keyExist:  true,
			certExist: false,
			wantExist: false,
		},
		"all files doesn't exist": {
			keyExist:  false,
			certExist: false,
			wantExist: false,
		},
	}
	for name, tc := range tests {
		t.Run(name, func(t *testing.T) {
			// The directory is only needed when at least one file has to be
			// created; it is removed again so that cases don't affect each other.
			if tc.keyExist || tc.certExist {
				if err := os.MkdirAll(consts.CertDir, 0760); err != nil {
					t.Fatalf("Failed to set up directory: %v", err)
				}
				defer func() {
					if err := os.RemoveAll(consts.CertDir); err != nil {
						t.Fatalf("Failed to clean up directory: %v", err)
					}
				}()
			}
			// os.WriteFile creates the empty file and closes it; os.Create
			// would hand back an open *os.File that nothing closes.
			if tc.keyExist {
				if err := os.WriteFile(filepath.Join(consts.CertDir, serverKeyName), nil, 0600); err != nil {
					t.Fatalf("Failed to create tls.key: %v", err)
				}
			}
			if tc.certExist {
				if err := os.WriteFile(filepath.Join(consts.CertDir, serverCertName), nil, 0600); err != nil {
					t.Fatalf("Failed to create tls.crt: %v", err)
				}
			}
			ensureFunc := ensureCertMounted(time.Now())
			got, err := ensureFunc(context.Background())
			if err != nil {
				t.Fatalf("Unexpected error from ensureCertMounted: %v", err)
			}
			if tc.wantExist != got {
				t.Errorf("Unexpected value from ensureCertMounted: \n(want: %v, got: %v)\n", tc.wantExist, got)
			}
		})
	}
}

View File

@ -22,10 +22,11 @@
# https://github.com/kubeflow/katib/blob/master/examples/v1beta1/kubeflow-training-operator/tfjob-mnist-with-summaries.yaml#L16-L22
import tensorflow as tf
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator, TensorEvent
from tensorboard.backend.event_processing.tag_types import TENSORS
import os
from datetime import datetime
import rfc3339
from datetime import datetime
import api_pb2
from logging import getLogger, StreamHandler, INFO
from pkg.metricscollector.v1beta1.common import const
@ -43,9 +44,9 @@ class TFEventFileParser:
def parse_summary(self, tfefile):
metric_logs = []
event_accumulator = EventAccumulator(tfefile, size_guidance={'tensors': 0})
event_accumulator = EventAccumulator(tfefile, size_guidance={TENSORS: 0})
event_accumulator.Reload()
for tag in event_accumulator.Tags()['tensors']:
for tag in event_accumulator.Tags()[TENSORS]:
for m in self.metric_names:
tfefile_parent_dir = os.path.dirname(m) if len(m.split("/")) >= 2 else os.path.dirname(tfefile)
@ -53,12 +54,12 @@ class TFEventFileParser:
if not tag.startswith(m.split("/")[-1]) or not basedir_name.endswith(tfefile_parent_dir):
continue
for wall_time, step, tensor in event_accumulator.Tensors(tag):
for tensor in event_accumulator.Tensors(tag):
ml = api_pb2.MetricLog(
time_stamp=rfc3339.rfc3339(datetime.fromtimestamp(wall_time)),
time_stamp=rfc3339.rfc3339(datetime.fromtimestamp(tensor.wall_time)),
metric=api_pb2.Metric(
name=m,
value=str(tf.make_ndarray(tensor))
value=str(tf.make_ndarray(tensor.tensor_proto))
)
)
metric_logs.append(ml)

View File

@ -50,8 +50,8 @@ TRIAL_CONDITION_SUCCEEDED = "Succeeded"
# Supported base images for the Katib Trials.
# TODO (andreyvelich): Implement list_base_images function to get each image description.
BASE_IMAGE_TENSORFLOW = "docker.io/tensorflow/tensorflow:2.11.0"
BASE_IMAGE_TENSORFLOW_GPU = "docker.io/tensorflow/tensorflow:2.11.0-gpu"
BASE_IMAGE_TENSORFLOW = "docker.io/tensorflow/tensorflow:2.13.0"
BASE_IMAGE_TENSORFLOW_GPU = "docker.io/tensorflow/tensorflow:2.13.0-gpu"
BASE_IMAGE_PYTORCH = "docker.io/pytorch/pytorch:1.12.1-cuda11.3-cudnn8-runtime"
BASE_IMAGE_MXNET = "docker.io/mxnet/python:1.9.1_native_py3"

View File

@ -37,7 +37,7 @@ if os.path.exists(katib_grpc_api_file):
setuptools.setup(
name="kubeflow-katib",
version="0.15.0",
version="0.16.0",
author="Kubeflow Authors",
author_email="premnath.vel@gmail.com",
license="Apache License Version 2.0",

View File

@ -66,7 +66,8 @@ cd ../../../../../ && WITH_DATABASE_TYPE=$WITH_DATABASE_TYPE make deploy && cd -
# Wait until all Katib pods are running.
TIMEOUT=120s
kubectl wait --for=condition=ready --timeout=${TIMEOUT} -l "katib.kubeflow.org/component in ($WITH_DATABASE_TYPE,controller,db-manager,ui)" -n kubeflow pod ||
kubectl wait --for=condition=ContainersReady=True --timeout=${TIMEOUT} -l "katib.kubeflow.org/component in ($WITH_DATABASE_TYPE,controller,db-manager,ui)" -n kubeflow pod ||
(kubectl get pods -n kubeflow && kubectl describe pods -n kubeflow && exit 1)
echo "All Katib components are running."

View File

@ -52,7 +52,7 @@ spec:
spec:
containers:
- name: training-container
image: docker.io/kubeflowkatib/mxnet-mnist:latest
image: docker.io/kubeflowkatib/mxnet-mnist:v0.16.0
command:
- "python3"
- "/opt/mxnet-mnist/mnist.py"

View File

@ -52,7 +52,7 @@ spec:
spec:
containers:
- name: training-container
image: docker.io/kubeflowkatib/mxnet-mnist:latest
image: docker.io/kubeflowkatib/mxnet-mnist:v0.16.0
command:
- "python3"
- "/opt/mxnet-mnist/mnist.py"