mirror of https://github.com/kubeflow/examples.git
Create a deployment to run the HP/Katib controller for the GitHub issue example. (#161)
* Some of the code is copied over from https://github.com/kubeflow/katib/tree/master/examples/GKEDemo * I think it makes sense to centralize all the code in a single place. * Update the controller program (git-issue-summarize-demo.go) so that can specify the Docker image containing the training code. * Create a ksonnet deployment for running the controller on the cluster. * The HP tuning job isn't functional here's an incomplete list of issues * The training jobs launched fail because they don't have GCP credentials so they can't download the data. * We don't actually extract and report metrics back to Katib. Related to: kubeflow/katib#116
This commit is contained in:
parent
d692db36e8
commit
eaf0298590
|
|
@ -44,4 +44,9 @@ examples/.ipynb_checkpoints/
|
||||||
*.dpkl
|
*.dpkl
|
||||||
|
|
||||||
# Build directory
|
# Build directory
|
||||||
|
**/build
|
||||||
github_issue_summarization/build/
|
github_issue_summarization/build/
|
||||||
|
|
||||||
|
# Don't check in the go vendor directory
|
||||||
|
# We can just use the dep tool to install them.
|
||||||
|
github_issue_summarization/hp-tune/vendor
|
||||||
|
|
@ -0,0 +1,4 @@
|
||||||
|
FROM golang:1.9
|
||||||
|
|
||||||
|
RUN mkdir -p /opt/kubeflow
|
||||||
|
COPY ./build/git-issue-summarize-demo /opt/kubeflow
|
||||||
|
|
@ -0,0 +1,100 @@
|
||||||
|
# This file is autogenerated, do not edit; changes may be undone by the next 'dep ensure'.
|
||||||
|
|
||||||
|
|
||||||
|
[[projects]]
|
||||||
|
name = "github.com/golang/protobuf"
|
||||||
|
packages = [
|
||||||
|
"proto",
|
||||||
|
"ptypes",
|
||||||
|
"ptypes/any",
|
||||||
|
"ptypes/duration",
|
||||||
|
"ptypes/timestamp"
|
||||||
|
]
|
||||||
|
revision = "b4deda0973fb4c70b50d226b1af49f3da59f5265"
|
||||||
|
version = "v1.1.0"
|
||||||
|
|
||||||
|
[[projects]]
|
||||||
|
name = "github.com/kubeflow/katib"
|
||||||
|
packages = ["pkg/api"]
|
||||||
|
revision = "f24b520cc52920ae511aeea235636462ebc21d21"
|
||||||
|
version = "v0.1.2-alpha"
|
||||||
|
|
||||||
|
[[projects]]
|
||||||
|
branch = "master"
|
||||||
|
name = "golang.org/x/net"
|
||||||
|
packages = [
|
||||||
|
"context",
|
||||||
|
"http/httpguts",
|
||||||
|
"http2",
|
||||||
|
"http2/hpack",
|
||||||
|
"idna",
|
||||||
|
"internal/timeseries",
|
||||||
|
"trace"
|
||||||
|
]
|
||||||
|
revision = "4cb1c02c05b0e749b0365f61ae859a8e0cfceed9"
|
||||||
|
|
||||||
|
[[projects]]
|
||||||
|
name = "golang.org/x/text"
|
||||||
|
packages = [
|
||||||
|
"collate",
|
||||||
|
"collate/build",
|
||||||
|
"internal/colltab",
|
||||||
|
"internal/gen",
|
||||||
|
"internal/tag",
|
||||||
|
"internal/triegen",
|
||||||
|
"internal/ucd",
|
||||||
|
"language",
|
||||||
|
"secure/bidirule",
|
||||||
|
"transform",
|
||||||
|
"unicode/bidi",
|
||||||
|
"unicode/cldr",
|
||||||
|
"unicode/norm",
|
||||||
|
"unicode/rangetable"
|
||||||
|
]
|
||||||
|
revision = "f21a4dfb5e38f5895301dc265a8def02365cc3d0"
|
||||||
|
version = "v0.3.0"
|
||||||
|
|
||||||
|
[[projects]]
|
||||||
|
branch = "master"
|
||||||
|
name = "google.golang.org/genproto"
|
||||||
|
packages = ["googleapis/rpc/status"]
|
||||||
|
revision = "ff3583edef7de132f219f0efc00e097cabcc0ec0"
|
||||||
|
|
||||||
|
[[projects]]
|
||||||
|
name = "google.golang.org/grpc"
|
||||||
|
packages = [
|
||||||
|
".",
|
||||||
|
"balancer",
|
||||||
|
"balancer/base",
|
||||||
|
"balancer/roundrobin",
|
||||||
|
"codes",
|
||||||
|
"connectivity",
|
||||||
|
"credentials",
|
||||||
|
"encoding",
|
||||||
|
"encoding/proto",
|
||||||
|
"grpclog",
|
||||||
|
"internal",
|
||||||
|
"internal/backoff",
|
||||||
|
"internal/channelz",
|
||||||
|
"internal/grpcrand",
|
||||||
|
"keepalive",
|
||||||
|
"metadata",
|
||||||
|
"naming",
|
||||||
|
"peer",
|
||||||
|
"resolver",
|
||||||
|
"resolver/dns",
|
||||||
|
"resolver/passthrough",
|
||||||
|
"stats",
|
||||||
|
"status",
|
||||||
|
"tap",
|
||||||
|
"transport"
|
||||||
|
]
|
||||||
|
revision = "168a6198bcb0ef175f7dacec0b8691fc141dc9b8"
|
||||||
|
version = "v1.13.0"
|
||||||
|
|
||||||
|
[solve-meta]
|
||||||
|
analyzer-name = "dep"
|
||||||
|
analyzer-version = 1
|
||||||
|
inputs-digest = "3d9f4c7de4665d6a45accfb3d5a5a6a6ae9b98229cea14e0a8dfba942a4e49f8"
|
||||||
|
solver-name = "gps-cdcl"
|
||||||
|
solver-version = 1
|
||||||
|
|
@ -0,0 +1,38 @@
|
||||||
|
# Gopkg.toml example
|
||||||
|
#
|
||||||
|
# Refer to https://github.com/golang/dep/blob/master/docs/Gopkg.toml.md
|
||||||
|
# for detailed Gopkg.toml documentation.
|
||||||
|
#
|
||||||
|
# required = ["github.com/user/thing/cmd/thing"]
|
||||||
|
# ignored = ["github.com/user/project/pkgX", "bitbucket.org/user/project/pkgA/pkgY"]
|
||||||
|
#
|
||||||
|
# [[constraint]]
|
||||||
|
# name = "github.com/user/project"
|
||||||
|
# version = "1.0.0"
|
||||||
|
#
|
||||||
|
# [[constraint]]
|
||||||
|
# name = "github.com/user/project2"
|
||||||
|
# branch = "dev"
|
||||||
|
# source = "github.com/myfork/project2"
|
||||||
|
#
|
||||||
|
# [[override]]
|
||||||
|
# name = "github.com/x/y"
|
||||||
|
# version = "2.4.0"
|
||||||
|
#
|
||||||
|
# [prune]
|
||||||
|
# non-go = false
|
||||||
|
# go-tests = true
|
||||||
|
# unused-packages = true
|
||||||
|
|
||||||
|
|
||||||
|
[[constraint]]
|
||||||
|
name = "github.com/kubeflow/katib"
|
||||||
|
version = "0.1.2-alpha"
|
||||||
|
|
||||||
|
[[constraint]]
|
||||||
|
name = "google.golang.org/grpc"
|
||||||
|
version = "1.13.0"
|
||||||
|
|
||||||
|
[prune]
|
||||||
|
go-tests = true
|
||||||
|
unused-packages = true
|
||||||
|
|
@ -0,0 +1,53 @@
|
||||||
|
# Copyright 2017 The Kubernetes Authors.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
#
|
||||||
|
# Requirements:
|
||||||
|
# Make sure ${GOPATH}/src/github.com/kubeflow/examples
|
||||||
|
# points at a checked out version of the examples repository.
|
||||||
|
IMG = gcr.io/kubeflow-examples/gh-issue-hp-tuner
|
||||||
|
DIR := ${CURDIR}
|
||||||
|
|
||||||
|
# List any changed files.
|
||||||
|
# Restrict the dirty check to this example's own directory. The path is
# relative to the repository root; it must match where this Makefile lives,
# otherwise local edits are never detected and the image tag is not marked dirty.
CHANGED_FILES := $(shell git diff-files --relative=github_issue_summarization/hp-tune)
|
||||||
|
|
||||||
|
ifeq ($(strip $(CHANGED_FILES)),)
|
||||||
|
# Changed files is empty; not dirty
|
||||||
|
# Don't include --dirty because it could be dirty if files outside the ones we care
|
||||||
|
# about changed.
|
||||||
|
TAG := $(shell date +v%Y%m%d)-$(shell git describe --always)
|
||||||
|
else
|
||||||
|
TAG := $(shell date +v%Y%m%d)-$(shell git describe --always --dirty)-$(shell git diff | shasum -a256 | cut -c -6)
|
||||||
|
endif
|
||||||
|
|
||||||
|
# None of these targets produce a file named after the target. This matters
# for `build` in particular: its recipe creates a directory called "build",
# and without .PHONY make would treat that directory as the target and could
# skip the recipe (and let `push` push a tag that was never built).
.PHONY: all build push push-latest

all: build

# Compile the controller binary and bake it into the Docker image.
# To build without the cache set the environment variable
# export DOCKER_BUILD_OPTS=--no-cache
build: Dockerfile git-issue-summarize-demo.go
	mkdir -p build
	dep ensure
	go build -i -o ./build/git-issue-summarize-demo ${GOPATH}/src/github.com/kubeflow/examples/github_issue_summarization/hp-tune/git-issue-summarize-demo.go
	docker build ${DOCKER_BUILD_OPTS} -t $(IMG):$(TAG) .
	docker tag $(IMG):$(TAG) $(IMG):latest
	@echo Built $(IMG):$(TAG)

# Build but don't attach the latest tag. This allows manual testing/inspection of the image
# first.
push: build
	gcloud docker -- push $(IMG):$(TAG)
	@echo Pushed $(IMG) with :$(TAG) tags

# Push the image and then move the :latest tag in the registry to it.
push-latest: push
	gcloud container images add-tag --quiet $(IMG):$(TAG) $(IMG):latest --verbosity=info
	echo created $(IMG):latest
|
||||||
|
|
@ -0,0 +1,134 @@
|
||||||
|
# Experimental: HP Tuning for GitHub Issue Summarization
|
||||||
|
|
||||||
|
This directory contains experimental code for adding hyperparameter
|
||||||
|
tuning support to the GitHub issue summarization example using Katib.
|
||||||
|
|
||||||
|
## Instructions
|
||||||
|
|
||||||
|
1. Deploy Kubeflow
|
||||||
|
1. [Deploy Katib](https://github.com/kubeflow/kubeflow/blob/master/kubeflow/katib/README.md)
|
||||||
|
1. Create the katib namespace
|
||||||
|
|
||||||
|
```
|
||||||
|
kubectl create namespace katib
|
||||||
|
```
|
||||||
|
|
||||||
|
* This is a known issue [kubeflow/katib#134](https://github.com/kubeflow/katib/issues/134)
|
||||||
|
|
||||||
|
1. Deploy the hyperparameter tuning job
|
||||||
|
|
||||||
|
```
|
||||||
|
cd kubeflow/examples/github_issue_summarization/ks-kubeflow
|
||||||
|
ks apply ${ENVIRONMENT} -c hp-tune
|
||||||
|
```
|
||||||
|
|
||||||
|
## UI
|
||||||
|
|
||||||
|
You can check your Model with Web UI.
|
||||||
|
|
||||||
|
Open `http://${ENDPOINT}/katib/projects` in your browser.
|
||||||
|
|
||||||
|
* If you are using GKE and IAP then ENDPOINT is the endpoint you
|
||||||
|
are serving Kubeflow on
|
||||||
|
|
||||||
|
* Otherwise you can port-forward to one of the AMBASSADOR pods
|
||||||
|
and set ENDPOINT to the forwarded local address:
|
||||||
|
|
||||||
|
```
|
||||||
|
kubectl port-forward `kubectl get pods --selector=service=ambassador -o jsonpath='{.items[0].metadata.name}'` 8080:80
|
||||||
|
ENDPOINT=localhost:8080
|
||||||
|
```
|
||||||
|
|
||||||
|
The Results will be saved automatically.
|
||||||
|
|
||||||
|
## Description of git-issue-summarize-demo.go
|
||||||
|
You can generate hyperparameters and evaluate them through the Katib API.
|
||||||
|
The Katib API is exposed over gRPC, so you can use any language that gRPC supports (e.g. Go, Python, C++).
|
||||||
|
In a typical case, you call the APIs in the order below.
|
||||||
|
git-issue-summarize-demo.go then waits until the status of every worker is Completed.
|
||||||
|
|
||||||
|
### CreateStudy
|
||||||
|
First, you should create Study.
|
||||||
|
The input is StudyConfig.
|
||||||
|
It has Study name, owner, optimization info, and Parameter config(parameter name, min, and max).
|
||||||
|
This function generates a unique ID for your study and stores the config to DB.
|
||||||
|
Input:
|
||||||
|
* StudyConfig:
|
||||||
|
* Name: string
|
||||||
|
* Owner: string
|
||||||
|
* OptimizationType: enum(OptimizationType_MAXIMIZE, OptimizationType_MINIMIZE)
|
||||||
|
* OptimizationGoal: float
|
||||||
|
* DefaultSuggestionAlgorithm: string
|
||||||
|
* DefaultEarlyStoppingAlgorithm: string
|
||||||
|
* ObjectiveValueName: string
|
||||||
|
* Metrics: List of Metrics name
|
||||||
|
* ParameterConfigs: List of parameter config.
|
||||||
|
Return:
|
||||||
|
* StudyID
|
||||||
|
|
||||||
|
### SetSuggestionParameters
|
||||||
|
Hyperparameters are generated by suggestion services with Parameter config of Study.
|
||||||
|
You can set the specific config for each suggestion.
|
||||||
|
Input:
|
||||||
|
* StudyID: ID of your study.
|
||||||
|
* SuggestionAlgorithm: name of suggestion service (e.g. random, grid)
|
||||||
|
* SuggestionParameters: key-value pairs of parameters for the suggestion service. The expected keys differ for each suggestion algorithm.
|
||||||
|
Return:
|
||||||
|
* ParameterID
|
||||||
|
|
||||||
|
### GetSuggestions
|
||||||
|
This function will create Trials(set of Parameters).
|
||||||
|
Input:
|
||||||
|
* StudyID: ID of your study.
|
||||||
|
* SuggestionAlgorithm: name of suggestion service (e.g. random, grid)
|
||||||
|
* RequestNumber: the number of trials you want to evaluate.
|
||||||
|
* ParamID: ParameterID you got from SetSuggestionParameters func.
|
||||||
|
Return:
|
||||||
|
* List of Trials
|
||||||
|
* TrialID
|
||||||
|
* Parameter Sets
|
||||||
|
|
||||||
|
### RunTrial
|
||||||
|
Start to evaluate Trial.
|
||||||
|
When you use the kubernetes runtime, the pods are created with the specified config.
|
||||||
|
Input:
|
||||||
|
* StudyId: ID of your study.
|
||||||
|
* TrialId: ID of Trial.
|
||||||
|
* Runtime: worker type(e.g. kubernetes)
|
||||||
|
* WorkerConfig: runtime config
|
||||||
|
* Image: name of docker image
|
||||||
|
* Command: running commands
|
||||||
|
* GPU: number of GPU
|
||||||
|
* Scheduler: scheduler name
|
||||||
|
Return:
|
||||||
|
* List of WorkerID
|
||||||
|
|
||||||
|
### GetMetrics
|
||||||
|
Get metrics of running workers.
|
||||||
|
Input:
|
||||||
|
* StudyId: ID of your study.
|
||||||
|
* WorkerIDs: List of worker ID you want to get metrics from.
|
||||||
|
Return:
|
||||||
|
* List of Metrics
|
||||||
|
|
||||||
|
### SaveModel
|
||||||
|
Save the model data to KatibDB. After you call this function, you can view the model info in the Katib UI.
|
||||||
|
When you call this API multiple times, only Metrics will be updated.
|
||||||
|
Input:
|
||||||
|
* ModelInfo
|
||||||
|
* StudyName
|
||||||
|
* WorkerId
|
||||||
|
* Parameters: List of Parameter
|
||||||
|
* Metrics: List of Metrics
|
||||||
|
* ModelPath: path to model saved. (PVCname:mountpath)
|
||||||
|
* DataSet: information about the input data
|
||||||
|
* Name
|
||||||
|
* Path: path to input data.(PVCname:mountpath)
|
||||||
|
|
||||||
|
Return:
|
||||||
|
|
||||||
|
### GetWorkers
|
||||||
|
You can get worker list and status of workers.
|
||||||
|
Input:
|
||||||
|
Return:
|
||||||
|
* List of worker information
|
||||||
|
|
@ -0,0 +1,210 @@
|
||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"flag"
|
||||||
|
"log"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/kubeflow/katib/pkg/api"
|
||||||
|
"google.golang.org/grpc"
|
||||||
|
)
|
||||||
|
|
||||||
|
var studyConfig = api.StudyConfig{
|
||||||
|
Name: "grid-demo",
|
||||||
|
Owner: "katib",
|
||||||
|
OptimizationType: api.OptimizationType_MAXIMIZE,
|
||||||
|
OptimizationGoal: 0.99,
|
||||||
|
ObjectiveValueName: "Validation-accuracy",
|
||||||
|
Metrics: []string{
|
||||||
|
"accuracy",
|
||||||
|
},
|
||||||
|
ParameterConfigs: &api.StudyConfig_ParameterConfigs{
|
||||||
|
Configs: []*api.ParameterConfig{
|
||||||
|
&api.ParameterConfig{
|
||||||
|
Name: "--learning_rate",
|
||||||
|
ParameterType: api.ParameterType_DOUBLE,
|
||||||
|
Feasible: &api.FeasibleSpace{
|
||||||
|
Min: "0.005",
|
||||||
|
Max: "0.5",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
var gridConfig = []*api.SuggestionParameter{
|
||||||
|
&api.SuggestionParameter{
|
||||||
|
Name: "DefaultGrid",
|
||||||
|
Value: "4",
|
||||||
|
},
|
||||||
|
&api.SuggestionParameter{
|
||||||
|
Name: "--learning_rate",
|
||||||
|
Value: "2",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
var managerAddr = flag.String("katib_endpoint", "127.0.0.1:6789", "Endpoint of manager default 127.0.0.1:6789")
|
||||||
|
var trainerImage = flag.String("trainer_image", "gcr.io/kubeflow-dev/tf-job-issue-summarization:v20180425-e79f888", "The docker image containing the training code")
|
||||||
|
|
||||||
|
func main() {
|
||||||
|
flag.Parse()
|
||||||
|
conn, err := grpc.Dial(*managerAddr, grpc.WithInsecure())
|
||||||
|
if err != nil {
|
||||||
|
log.Fatalf("could not connect: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
workerConfig := api.WorkerConfig{
|
||||||
|
Image: *trainerImage,
|
||||||
|
Command: []string{
|
||||||
|
"python",
|
||||||
|
"/workdir/train.py",
|
||||||
|
"--sample_size",
|
||||||
|
"20000",
|
||||||
|
// "--input_data_gcs_bucket",
|
||||||
|
// "katib-gi-example",
|
||||||
|
// "--input_data_gcs_path",
|
||||||
|
// "github-issue-summarization-data/github-issues.zip",
|
||||||
|
// "--output_model_gcs_bucket",
|
||||||
|
// "katib-gi-example",
|
||||||
|
},
|
||||||
|
Gpu: 0,
|
||||||
|
Scheduler: "default-scheduler",
|
||||||
|
}
|
||||||
|
|
||||||
|
defer conn.Close()
|
||||||
|
ctx := context.Background()
|
||||||
|
c := api.NewManagerClient(conn)
|
||||||
|
createStudyreq := &api.CreateStudyRequest{
|
||||||
|
StudyConfig: &studyConfig,
|
||||||
|
}
|
||||||
|
createStudyreply, err := c.CreateStudy(ctx, createStudyreq)
|
||||||
|
if err != nil {
|
||||||
|
log.Fatalf("StudyConfig Error %v", err)
|
||||||
|
}
|
||||||
|
studyId := createStudyreply.StudyId
|
||||||
|
log.Printf("Study ID %s", studyId)
|
||||||
|
getStudyreq := &api.GetStudyRequest{
|
||||||
|
StudyId: studyId,
|
||||||
|
}
|
||||||
|
getStudyReply, err := c.GetStudy(ctx, getStudyreq)
|
||||||
|
if err != nil {
|
||||||
|
log.Fatalf("GetConfig Error %v", err)
|
||||||
|
}
|
||||||
|
log.Printf("Study ID %s StudyConf%v", studyId, getStudyReply.StudyConfig)
|
||||||
|
setSuggesitonParameterRequest := &api.SetSuggestionParametersRequest{
|
||||||
|
StudyId: studyId,
|
||||||
|
SuggestionAlgorithm: "grid",
|
||||||
|
SuggestionParameters: gridConfig,
|
||||||
|
}
|
||||||
|
setSuggesitonParameterReply, err := c.SetSuggestionParameters(ctx, setSuggesitonParameterRequest)
|
||||||
|
if err != nil {
|
||||||
|
log.Fatalf("SetConfig Error %v", err)
|
||||||
|
}
|
||||||
|
log.Printf("Grid Prameter ID %s", setSuggesitonParameterReply.ParamId)
|
||||||
|
getGridSuggestRequest := &api.GetSuggestionsRequest{
|
||||||
|
StudyId: studyId,
|
||||||
|
SuggestionAlgorithm: "grid",
|
||||||
|
RequestNumber: 0,
|
||||||
|
//RequestNumber=0 means get all grids.
|
||||||
|
ParamId: setSuggesitonParameterReply.ParamId,
|
||||||
|
}
|
||||||
|
getGridSuggestReply, err := c.GetSuggestions(ctx, getGridSuggestRequest)
|
||||||
|
if err != nil {
|
||||||
|
log.Fatalf("GetSuggestion Error %v", err)
|
||||||
|
}
|
||||||
|
log.Println("Get Grid Suggestions:")
|
||||||
|
for _, t := range getGridSuggestReply.Trials {
|
||||||
|
log.Printf("%v", t)
|
||||||
|
}
|
||||||
|
workerIds := make([]string, len(getGridSuggestReply.Trials))
|
||||||
|
workerParameter := make(map[string][]*api.Parameter)
|
||||||
|
for i, t := range getGridSuggestReply.Trials {
|
||||||
|
ws := workerConfig
|
||||||
|
rtr := &api.RunTrialRequest{
|
||||||
|
StudyId: studyId,
|
||||||
|
TrialId: t.TrialId,
|
||||||
|
Runtime: "kubernetes",
|
||||||
|
WorkerConfig: &ws,
|
||||||
|
}
|
||||||
|
rtr.WorkerConfig.Command = append(rtr.WorkerConfig.Command, "--output_model_gcs_path")
|
||||||
|
rtr.WorkerConfig.Command = append(rtr.WorkerConfig.Command, "github-issue-summarization-data/"+t.TrialId+"output_model.h5")
|
||||||
|
for _, p := range t.ParameterSet {
|
||||||
|
rtr.WorkerConfig.Command = append(rtr.WorkerConfig.Command, p.Name)
|
||||||
|
rtr.WorkerConfig.Command = append(rtr.WorkerConfig.Command, p.Value)
|
||||||
|
}
|
||||||
|
workerReply, err := c.RunTrial(ctx, rtr)
|
||||||
|
if err != nil {
|
||||||
|
log.Fatalf("RunTrial Error %v", err)
|
||||||
|
}
|
||||||
|
workerIds[i] = workerReply.WorkerId
|
||||||
|
workerParameter[workerReply.WorkerId] = t.ParameterSet
|
||||||
|
saveModelRequest := &api.SaveModelRequest{
|
||||||
|
Model: &api.ModelInfo{
|
||||||
|
StudyName: studyConfig.Name,
|
||||||
|
WorkerId: workerReply.WorkerId,
|
||||||
|
Parameters: t.ParameterSet,
|
||||||
|
Metrics: []*api.Metrics{},
|
||||||
|
ModelPath: "pvc:/Path/to/Model",
|
||||||
|
},
|
||||||
|
DataSet: &api.DataSetInfo{
|
||||||
|
Name: "GitHub",
|
||||||
|
Path: "/path/to/data",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
_, err = c.SaveModel(ctx, saveModelRequest)
|
||||||
|
if err != nil {
|
||||||
|
log.Fatalf("SaveModel Error %v", err)
|
||||||
|
}
|
||||||
|
log.Printf("WorkerID %s start\n", workerReply.WorkerId)
|
||||||
|
}
|
||||||
|
for true {
|
||||||
|
time.Sleep(10 * time.Second)
|
||||||
|
getMetricsRequest := &api.GetMetricsRequest{
|
||||||
|
StudyId: studyId,
|
||||||
|
WorkerIds: workerIds,
|
||||||
|
}
|
||||||
|
getMetricsReply, err := c.GetMetrics(ctx, getMetricsRequest)
|
||||||
|
if err != nil {
|
||||||
|
log.Printf("GetMetErr %v", err)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
for _, mls := range getMetricsReply.MetricsLogSets {
|
||||||
|
if len(mls.MetricsLogs) > 0 {
|
||||||
|
//Only Metrics can be updated.
|
||||||
|
saveModelRequest := &api.SaveModelRequest{
|
||||||
|
Model: &api.ModelInfo{
|
||||||
|
StudyName: studyConfig.Name,
|
||||||
|
WorkerId: mls.WorkerId,
|
||||||
|
Metrics: []*api.Metrics{},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
for _, ml := range mls.MetricsLogs {
|
||||||
|
if len(ml.Values) > 0 {
|
||||||
|
log.Printf("WorkerID %s :\t Metrics Name %s Value %v", mls.WorkerId, ml.Name, ml.Values[len(ml.Values)-1])
|
||||||
|
saveModelRequest.Model.Metrics = append(saveModelRequest.Model.Metrics, &api.Metrics{Name: ml.Name, Value: ml.Values[len(ml.Values)-1]})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_, err = c.SaveModel(ctx, saveModelRequest)
|
||||||
|
if err != nil {
|
||||||
|
log.Fatalf("SaveModel Error %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
getWorkerRequest := &api.GetWorkersRequest{StudyId: studyId}
|
||||||
|
getWorkerReply, err := c.GetWorkers(ctx, getWorkerRequest)
|
||||||
|
if err != nil {
|
||||||
|
log.Fatalf("GetWorker Error %v", err)
|
||||||
|
}
|
||||||
|
completeCount := 0
|
||||||
|
for _, w := range getWorkerReply.Workers {
|
||||||
|
if w.Status == api.State_COMPLETED {
|
||||||
|
completeCount++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if completeCount == len(getWorkerReply.Workers) {
|
||||||
|
log.Printf("All Worker Completed!")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,44 @@
|
||||||
|
// Run an HP Tuning job using Katib
|
||||||
|
//
|
||||||
|
// Experimental:
|
||||||
|
// This is experimental code looking at adding hp tuning using Katib
|
||||||
|
// to the GitHub issue summarization example. It doesn't work yet.
|
||||||
|
local env = std.extVar("__ksonnet/environments");
local overrideParams = std.extVar("__ksonnet/params").components["hp-tune"];
local k = import "k.libsonnet";

// Component defaults. Anything set for "hp-tune" in the ksonnet params
// takes precedence because overrideParams is merged in last.
local params = {
  // Image that ships the git-issue-summarize-demo tuner binary.
  tunerImage: "gcr.io/kubeflow-examples/gh-issue-hp-tuner:v20180629-b14b337-dirty-e6d4f9",
  name: "hp-tune",
  // host:port handed to the tuner via --katib_endpoint.
  katibEndpoint: "vizier-core:6789",
} + overrideParams;

// The single container of the job: runs the tuner binary against Katib.
local tunerContainer = {
  name: "tuner",
  image: params.tunerImage,
  command: [
    "/opt/kubeflow/git-issue-summarize-demo",
    "--katib_endpoint=" + params.katibEndpoint,
  ],
};

// Batch Job wrapping the tuner container.
local tunerJob = {
  apiVersion: "batch/v1",
  kind: "Job",
  metadata: {
    name: params.name,
    namespace: env.namespace,
  },
  spec: {
    backoffLimit: 4,
    template: {
      spec: {
        containers: [tunerContainer],
        restartPolicy: "Never",
      },
    },
  },
};

std.prune(k.core.v1.list.new([tunerJob]))
|
||||||
|
|
@ -95,5 +95,6 @@
|
||||||
"tfjob-pvc-v1alpha2": {
|
"tfjob-pvc-v1alpha2": {
|
||||||
name: "tfjob-pvc-v1alpha2",
|
name: "tfjob-pvc-v1alpha2",
|
||||||
},
|
},
|
||||||
|
"hp-tune": {},
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue