mirror of https://github.com/kubeflow/examples.git
				
				
				
			Create a deployment to run the HP/Katib controller for the GitHub issue example. (#161)
* Some of the code is copied over from https://github.com/kubeflow/katib/tree/master/examples/GKEDemo * I think it makes sense to centralize all the code in a single place. * Update the controller program (git-issue-summarize-demo.go) so that can specify the Docker image containing the training code. * Create a ksonnet deployment for running the controller on the cluster. * The HP tuning job isn't functional here's an incomplete list of issues * The training jobs launched fail because they don't have GCP credentials so they can't download the data. * We don't actually extract and report metrics back to Katib. Related to: kubeflow/katib#116
This commit is contained in:
		
							parent
							
								
									d692db36e8
								
							
						
					
					
						commit
						eaf0298590
					
				| 
						 | 
				
			
			@ -44,4 +44,9 @@ examples/.ipynb_checkpoints/
 | 
			
		|||
*.dpkl
 | 
			
		||||
 | 
			
		||||
# Build directory
 | 
			
		||||
github_issue_summarization/build/
 | 
			
		||||
**/build
 | 
			
		||||
github_issue_summarization/build/
 | 
			
		||||
 | 
			
		||||
# Don't check in the go vendor directory
 | 
			
		||||
# We can just use the dep tool to install them.
 | 
			
		||||
github_issue_summarization/hp-tune/vendor
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1,4 @@
 | 
			
		|||
FROM golang:1.9
 | 
			
		||||
 | 
			
		||||
RUN mkdir -p /opt/kubeflow
 | 
			
		||||
COPY ./build/git-issue-summarize-demo /opt/kubeflow
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1,100 @@
 | 
			
		|||
# This file is autogenerated, do not edit; changes may be undone by the next 'dep ensure'.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
[[projects]]
 | 
			
		||||
  name = "github.com/golang/protobuf"
 | 
			
		||||
  packages = [
 | 
			
		||||
    "proto",
 | 
			
		||||
    "ptypes",
 | 
			
		||||
    "ptypes/any",
 | 
			
		||||
    "ptypes/duration",
 | 
			
		||||
    "ptypes/timestamp"
 | 
			
		||||
  ]
 | 
			
		||||
  revision = "b4deda0973fb4c70b50d226b1af49f3da59f5265"
 | 
			
		||||
  version = "v1.1.0"
 | 
			
		||||
 | 
			
		||||
[[projects]]
 | 
			
		||||
  name = "github.com/kubeflow/katib"
 | 
			
		||||
  packages = ["pkg/api"]
 | 
			
		||||
  revision = "f24b520cc52920ae511aeea235636462ebc21d21"
 | 
			
		||||
  version = "v0.1.2-alpha"
 | 
			
		||||
 | 
			
		||||
[[projects]]
 | 
			
		||||
  branch = "master"
 | 
			
		||||
  name = "golang.org/x/net"
 | 
			
		||||
  packages = [
 | 
			
		||||
    "context",
 | 
			
		||||
    "http/httpguts",
 | 
			
		||||
    "http2",
 | 
			
		||||
    "http2/hpack",
 | 
			
		||||
    "idna",
 | 
			
		||||
    "internal/timeseries",
 | 
			
		||||
    "trace"
 | 
			
		||||
  ]
 | 
			
		||||
  revision = "4cb1c02c05b0e749b0365f61ae859a8e0cfceed9"
 | 
			
		||||
 | 
			
		||||
[[projects]]
 | 
			
		||||
  name = "golang.org/x/text"
 | 
			
		||||
  packages = [
 | 
			
		||||
    "collate",
 | 
			
		||||
    "collate/build",
 | 
			
		||||
    "internal/colltab",
 | 
			
		||||
    "internal/gen",
 | 
			
		||||
    "internal/tag",
 | 
			
		||||
    "internal/triegen",
 | 
			
		||||
    "internal/ucd",
 | 
			
		||||
    "language",
 | 
			
		||||
    "secure/bidirule",
 | 
			
		||||
    "transform",
 | 
			
		||||
    "unicode/bidi",
 | 
			
		||||
    "unicode/cldr",
 | 
			
		||||
    "unicode/norm",
 | 
			
		||||
    "unicode/rangetable"
 | 
			
		||||
  ]
 | 
			
		||||
  revision = "f21a4dfb5e38f5895301dc265a8def02365cc3d0"
 | 
			
		||||
  version = "v0.3.0"
 | 
			
		||||
 | 
			
		||||
[[projects]]
 | 
			
		||||
  branch = "master"
 | 
			
		||||
  name = "google.golang.org/genproto"
 | 
			
		||||
  packages = ["googleapis/rpc/status"]
 | 
			
		||||
  revision = "ff3583edef7de132f219f0efc00e097cabcc0ec0"
 | 
			
		||||
 | 
			
		||||
[[projects]]
 | 
			
		||||
  name = "google.golang.org/grpc"
 | 
			
		||||
  packages = [
 | 
			
		||||
    ".",
 | 
			
		||||
    "balancer",
 | 
			
		||||
    "balancer/base",
 | 
			
		||||
    "balancer/roundrobin",
 | 
			
		||||
    "codes",
 | 
			
		||||
    "connectivity",
 | 
			
		||||
    "credentials",
 | 
			
		||||
    "encoding",
 | 
			
		||||
    "encoding/proto",
 | 
			
		||||
    "grpclog",
 | 
			
		||||
    "internal",
 | 
			
		||||
    "internal/backoff",
 | 
			
		||||
    "internal/channelz",
 | 
			
		||||
    "internal/grpcrand",
 | 
			
		||||
    "keepalive",
 | 
			
		||||
    "metadata",
 | 
			
		||||
    "naming",
 | 
			
		||||
    "peer",
 | 
			
		||||
    "resolver",
 | 
			
		||||
    "resolver/dns",
 | 
			
		||||
    "resolver/passthrough",
 | 
			
		||||
    "stats",
 | 
			
		||||
    "status",
 | 
			
		||||
    "tap",
 | 
			
		||||
    "transport"
 | 
			
		||||
  ]
 | 
			
		||||
  revision = "168a6198bcb0ef175f7dacec0b8691fc141dc9b8"
 | 
			
		||||
  version = "v1.13.0"
 | 
			
		||||
 | 
			
		||||
[solve-meta]
 | 
			
		||||
  analyzer-name = "dep"
 | 
			
		||||
  analyzer-version = 1
 | 
			
		||||
  inputs-digest = "3d9f4c7de4665d6a45accfb3d5a5a6a6ae9b98229cea14e0a8dfba942a4e49f8"
 | 
			
		||||
  solver-name = "gps-cdcl"
 | 
			
		||||
  solver-version = 1
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1,38 @@
 | 
			
		|||
# Gopkg.toml example
 | 
			
		||||
#
 | 
			
		||||
# Refer to https://github.com/golang/dep/blob/master/docs/Gopkg.toml.md
 | 
			
		||||
# for detailed Gopkg.toml documentation.
 | 
			
		||||
#
 | 
			
		||||
# required = ["github.com/user/thing/cmd/thing"]
 | 
			
		||||
# ignored = ["github.com/user/project/pkgX", "bitbucket.org/user/project/pkgA/pkgY"]
 | 
			
		||||
#
 | 
			
		||||
# [[constraint]]
 | 
			
		||||
#   name = "github.com/user/project"
 | 
			
		||||
#   version = "1.0.0"
 | 
			
		||||
#
 | 
			
		||||
# [[constraint]]
 | 
			
		||||
#   name = "github.com/user/project2"
 | 
			
		||||
#   branch = "dev"
 | 
			
		||||
#   source = "github.com/myfork/project2"
 | 
			
		||||
#
 | 
			
		||||
# [[override]]
 | 
			
		||||
#   name = "github.com/x/y"
 | 
			
		||||
#   version = "2.4.0"
 | 
			
		||||
#
 | 
			
		||||
# [prune]
 | 
			
		||||
#   non-go = false
 | 
			
		||||
#   go-tests = true
 | 
			
		||||
#   unused-packages = true
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
[[constraint]]
 | 
			
		||||
  name = "github.com/kubeflow/katib"
 | 
			
		||||
  version = "0.1.2-alpha"
 | 
			
		||||
 | 
			
		||||
[[constraint]]
 | 
			
		||||
  name = "google.golang.org/grpc"
 | 
			
		||||
  version = "1.13.0"
 | 
			
		||||
 | 
			
		||||
[prune]
 | 
			
		||||
  go-tests = true
 | 
			
		||||
  unused-packages = true
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1,53 @@
 | 
			
		|||
# Copyright 2017 The Kubernetes Authors.
 | 
			
		||||
#
 | 
			
		||||
# Licensed under the Apache License, Version 2.0 (the "License");
 | 
			
		||||
# you may not use this file except in compliance with the License.
 | 
			
		||||
# You may obtain a copy of the License at
 | 
			
		||||
#
 | 
			
		||||
#     http://www.apache.org/licenses/LICENSE-2.0
 | 
			
		||||
#
 | 
			
		||||
# Unless required by applicable law or agreed to in writing, software
 | 
			
		||||
# distributed under the License is distributed on an "AS IS" BASIS,
 | 
			
		||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | 
			
		||||
# See the License for the specific language governing permissions and
 | 
			
		||||
# limitations under the License.
 | 
			
		||||
#
 | 
			
		||||
# Requirements:
 | 
			
		||||
#   Make sure ${GOPATH}/src/github.com/kubeflow/examples
 | 
			
		||||
#   points at a checked out version of the examples repository.
 | 
			
		||||
# Image repository that the tuner controller image is tagged with.
IMG = gcr.io/kubeflow-examples/gh-issue-hp-tuner
DIR := ${CURDIR}

# List any changed files under the directory make is run from.
# A bare --relative limits the diff to the current directory; the previous
# hard-coded path (examples/GKEDemo) does not exist in this repository, so
# the list was always empty and local edits never produced a dirty tag.
CHANGED_FILES := $(shell git diff-files --relative)

ifeq ($(strip $(CHANGED_FILES)),)
# Changed files is empty; not dirty
# Don't include --dirty because it could be dirty if files outside the ones we care
# about changed.
TAG := $(shell date +v%Y%m%d)-$(shell git describe --always)
else
# Local edits present: append --dirty plus a short hash of the diff so that
# different uncommitted states get different tags.
TAG := $(shell date +v%Y%m%d)-$(shell git describe --always --dirty)-$(shell git diff | shasum -a256 | cut -c -6)
endif
 | 
			
		||||
 | 
			
		||||
# None of these targets produce a file named after the target. `build` must be
# phony in particular: its recipe creates a ./build directory, which make would
# otherwise treat as the up-to-date output of the `build` target.
.PHONY: all build push push-latest

all: build

# Compile the controller binary into ./build and bake it into a local image
# tagged both $(IMG):$(TAG) and $(IMG):latest.
# To build without the cache set the environment variable
# export DOCKER_BUILD_OPTS=--no-cache
build: Dockerfile git-issue-summarize-demo.go
	mkdir -p build
	dep ensure
	go build -i -o ./build/git-issue-summarize-demo ${GOPATH}/src/github.com/kubeflow/examples/github_issue_summarization/hp-tune/git-issue-summarize-demo.go
	docker build ${DOCKER_BUILD_OPTS} -t $(IMG):$(TAG) .
	docker tag $(IMG):$(TAG) $(IMG):latest
	@echo Built $(IMG):$(TAG)

# Push only the versioned tag; the registry's latest tag is not moved here.
# This allows manual testing/inspection of the image first.
push: build
	gcloud docker -- push $(IMG):$(TAG)
	@echo Pushed $(IMG) with  :$(TAG) tags

# Point the registry's latest tag at the image that was just pushed.
push-latest: push
	gcloud container images add-tag --quiet $(IMG):$(TAG) $(IMG):latest --verbosity=info
	echo created $(IMG):latest
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1,134 @@
 | 
			
		|||
# Experimental: HP Tuning for GitHub Issue Summarization
 | 
			
		||||
 | 
			
		||||
This directory contains experimental code for adding hyperparameter
 | 
			
		||||
tuning support to the GitHub issue summarization example using Katib.
 | 
			
		||||
 | 
			
		||||
## Instructions
 | 
			
		||||
 | 
			
		||||
1. Deploy Kubeflow
 | 
			
		||||
1. [Deploy Katib](https://github.com/kubeflow/kubeflow/blob/master/kubeflow/katib/README.md)
 | 
			
		||||
1. Create the katib namespace
 | 
			
		||||
 | 
			
		||||
    ```
 | 
			
		||||
    kubectl create namespace katib
 | 
			
		||||
    ```
 | 
			
		||||
 | 
			
		||||
    * This is a known issue [kubeflow/katib#134](https://github.com/kubeflow/katib/issues/134)
 | 
			
		||||
 | 
			
		||||
1. Deploy the hyperparameter tuning job 
 | 
			
		||||
 | 
			
		||||
   ```
 | 
			
		||||
   cd kubeflow/examples/github_issue_summarization/ks-kubeflow
 | 
			
		||||
   ks apply ${ENVIRONMENT} -c hp-tune
 | 
			
		||||
   ```
 | 
			
		||||
 | 
			
		||||
## UI
 | 
			
		||||
 | 
			
		||||
You can check your Model with Web UI.
 | 
			
		||||
 | 
			
		||||
Open `http://${ENDPOINT}/katib/projects` in a browser.
 | 
			
		||||
 | 
			
		||||
    * If you are using GKE and IAP then ENDPOINT is the endpoint you
 | 
			
		||||
      are serving Kubeflow on
 | 
			
		||||
 | 
			
		||||
    * Otherwise you can port-forward to one of the AMBASSADOR pods
 | 
			
		||||
      and set ENDPOINT to the forwarded address
 | 
			
		||||
 | 
			
		||||
      ```
 | 
			
		||||
      kubectl port-forward `kubectl get pods --selector=service=ambassador -o jsonpath='{.items[0].metadata.name}'` 8080:80
 | 
			
		||||
      ENDPOINT=localhost:8080
 | 
			
		||||
      ```
 | 
			
		||||
 | 
			
		||||
The Results will be saved automatically.
 | 
			
		||||
 | 
			
		||||
## Description of git-issue-summarize-demo.go
 | 
			
		||||
You can generate hyperparameters and evaluate them through the Katib API.
 | 
			
		||||
The Katib API is gRPC-based, so you can use any language that gRPC supports (e.g. Go, Python, C++).
 | 
			
		||||
In a typical case, you call the APIs in the order below.
 | 
			
		||||
git-issue-summarize-demo.go then waits until the status of every worker is Completed.
 | 
			
		||||
 | 
			
		||||
### CreateStudy
 | 
			
		||||
First, you should create Study.
 | 
			
		||||
The input is StudyConfig.
 | 
			
		||||
It has Study name, owner, optimization info, and Parameter config(parameter name, min, and max).
 | 
			
		||||
This function generates a unique ID for your study and stores the config to DB.
 | 
			
		||||
Input:
 | 
			
		||||
* StudyConfig:
 | 
			
		||||
    * Name: string
 | 
			
		||||
    * Owner: string
 | 
			
		||||
    * OptimizationType: enum(OptimizationType_MAXIMIZE, OptimizationType_MINIMIZE)
 | 
			
		||||
    * OptimizationGoal: float
 | 
			
		||||
    * DefaultSuggestionAlgorithm: string
 | 
			
		||||
    * DefaultEarlyStoppingAlgorithm: string
 | 
			
		||||
    * ObjectiveValueName: string
 | 
			
		||||
    * Metrics: List of Metrics name
 | 
			
		||||
    * ParameterConfigs: List of parameter config.
 | 
			
		||||
Return:
 | 
			
		||||
* StudyID
 | 
			
		||||
 | 
			
		||||
### SetSuggestionParameters
 | 
			
		||||
Hyperparameters are generated by suggestion services with Parameter config of Study.
 | 
			
		||||
You can set the specific config for each suggestion.
 | 
			
		||||
Input: 
 | 
			
		||||
* StudyID: ID of your study.
 | 
			
		||||
* SuggestionAlgorithm: name of suggestion service (e.g. random, grid)
 | 
			
		||||
* SuggestionParameters: key-value pairs parameter for suggestions. The wanted key is different for each suggestion.
 | 
			
		||||
Return:
 | 
			
		||||
* ParameterID
 | 
			
		||||
 | 
			
		||||
### GetSuggestions
 | 
			
		||||
This function will create Trials(set of Parameters).
 | 
			
		||||
Input:
 | 
			
		||||
* StudyID: ID of your study.
 | 
			
		||||
* SuggestionAlgorithm: name of suggestion service (e.g. random, grid)
 | 
			
		||||
* RequestNumber: the number you want to evaluate.
 | 
			
		||||
* ParamID: ParameterID you got from SetSuggestionParameters func.
 | 
			
		||||
Return
 | 
			
		||||
* List of Trials
 | 
			
		||||
    * TrialID
 | 
			
		||||
    * Parameter Sets
 | 
			
		||||
 | 
			
		||||
### RunTrial
 | 
			
		||||
Start to evaluate Trial.
 | 
			
		||||
When you use the kubernetes runtime, the pods are created with the specified config.
 | 
			
		||||
Input:
 | 
			
		||||
* StudyId: ID of your study.
 | 
			
		||||
* TrialId: ID of Trial.
 | 
			
		||||
* Runtime: worker type(e.g. kubernetes)
 | 
			
		||||
* WorkerConfig: runtime config
 | 
			
		||||
    * Image: name of docker image
 | 
			
		||||
    * Command: running commands
 | 
			
		||||
    * GPU: number of GPU
 | 
			
		||||
    * Scheduler: scheduler name
 | 
			
		||||
Return:
 | 
			
		||||
* List of WorkerID
 | 
			
		||||
 | 
			
		||||
### GetMetrics
 | 
			
		||||
Get metrics of running workers.
 | 
			
		||||
Input:
 | 
			
		||||
* StudyId: ID of your study.
 | 
			
		||||
* WorkerIDs: List of worker ID you want to get metrics from.
 | 
			
		||||
Return:
 | 
			
		||||
* List of Metrics
 | 
			
		||||
 | 
			
		||||
### SaveModel
 | 
			
		||||
Save the model data to KatibDB. After you call this function, you can view the model info in the Katib UI.
 | 
			
		||||
When you call this API multiple times, only Metrics will be updated.
 | 
			
		||||
Input:
 | 
			
		||||
* ModelInfo
 | 
			
		||||
    * StudyName
 | 
			
		||||
    * WorkerId
 | 
			
		||||
    * Parameters: List of Parameter
 | 
			
		||||
    * Metrics: List of Metrics
 | 
			
		||||
    * ModelPath: path to model saved. (PVCname:mountpath)
 | 
			
		||||
* DataSet: information about the input data
 | 
			
		||||
    * Name
 | 
			
		||||
    * Path: path to input data.(PVCname:mountpath)
 | 
			
		||||
 | 
			
		||||
Return:
 | 
			
		||||
    
 | 
			
		||||
### GetWorkers
 | 
			
		||||
You can get worker list and status of workers.
 | 
			
		||||
Input:
 | 
			
		||||
Return:
 | 
			
		||||
* List of worker information
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1,210 @@
 | 
			
		|||
package main
 | 
			
		||||
 | 
			
		||||
import (
 | 
			
		||||
	"context"
 | 
			
		||||
	"flag"
 | 
			
		||||
	"log"
 | 
			
		||||
	"time"
 | 
			
		||||
 | 
			
		||||
	"github.com/kubeflow/katib/pkg/api"
 | 
			
		||||
	"google.golang.org/grpc"
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
var studyConfig = api.StudyConfig{
 | 
			
		||||
	Name:               "grid-demo",
 | 
			
		||||
	Owner:              "katib",
 | 
			
		||||
	OptimizationType:   api.OptimizationType_MAXIMIZE,
 | 
			
		||||
	OptimizationGoal:   0.99,
 | 
			
		||||
	ObjectiveValueName: "Validation-accuracy",
 | 
			
		||||
	Metrics: []string{
 | 
			
		||||
		"accuracy",
 | 
			
		||||
	},
 | 
			
		||||
	ParameterConfigs: &api.StudyConfig_ParameterConfigs{
 | 
			
		||||
		Configs: []*api.ParameterConfig{
 | 
			
		||||
			&api.ParameterConfig{
 | 
			
		||||
				Name:          "--learning_rate",
 | 
			
		||||
				ParameterType: api.ParameterType_DOUBLE,
 | 
			
		||||
				Feasible: &api.FeasibleSpace{
 | 
			
		||||
					Min: "0.005",
 | 
			
		||||
					Max: "0.5",
 | 
			
		||||
				},
 | 
			
		||||
			},
 | 
			
		||||
		},
 | 
			
		||||
	},
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
var gridConfig = []*api.SuggestionParameter{
 | 
			
		||||
	&api.SuggestionParameter{
 | 
			
		||||
		Name:  "DefaultGrid",
 | 
			
		||||
		Value: "4",
 | 
			
		||||
	},
 | 
			
		||||
	&api.SuggestionParameter{
 | 
			
		||||
		Name:  "--learning_rate",
 | 
			
		||||
		Value: "2",
 | 
			
		||||
	},
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
var managerAddr = flag.String("katib_endpoint", "127.0.0.1:6789", "Endpoint of manager default 127.0.0.1:6789")
 | 
			
		||||
var trainerImage = flag.String("trainer_image", "gcr.io/kubeflow-dev/tf-job-issue-summarization:v20180425-e79f888", "The docker image containing the training code")
 | 
			
		||||
 | 
			
		||||
func main() {
 | 
			
		||||
	flag.Parse()
 | 
			
		||||
	conn, err := grpc.Dial(*managerAddr, grpc.WithInsecure())
 | 
			
		||||
	if err != nil {
 | 
			
		||||
		log.Fatalf("could not connect: %v", err)
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	workerConfig := api.WorkerConfig{
 | 
			
		||||
		Image: *trainerImage,
 | 
			
		||||
		Command: []string{
 | 
			
		||||
			"python",
 | 
			
		||||
			"/workdir/train.py",
 | 
			
		||||
			"--sample_size",
 | 
			
		||||
			"20000",
 | 
			
		||||
			//		"--input_data_gcs_bucket",
 | 
			
		||||
			//		"katib-gi-example",
 | 
			
		||||
			//		"--input_data_gcs_path",
 | 
			
		||||
			//		"github-issue-summarization-data/github-issues.zip",
 | 
			
		||||
			//		"--output_model_gcs_bucket",
 | 
			
		||||
			//		"katib-gi-example",
 | 
			
		||||
		},
 | 
			
		||||
		Gpu:       0,
 | 
			
		||||
		Scheduler: "default-scheduler",
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	defer conn.Close()
 | 
			
		||||
	ctx := context.Background()
 | 
			
		||||
	c := api.NewManagerClient(conn)
 | 
			
		||||
	createStudyreq := &api.CreateStudyRequest{
 | 
			
		||||
		StudyConfig: &studyConfig,
 | 
			
		||||
	}
 | 
			
		||||
	createStudyreply, err := c.CreateStudy(ctx, createStudyreq)
 | 
			
		||||
	if err != nil {
 | 
			
		||||
		log.Fatalf("StudyConfig Error %v", err)
 | 
			
		||||
	}
 | 
			
		||||
	studyId := createStudyreply.StudyId
 | 
			
		||||
	log.Printf("Study ID %s", studyId)
 | 
			
		||||
	getStudyreq := &api.GetStudyRequest{
 | 
			
		||||
		StudyId: studyId,
 | 
			
		||||
	}
 | 
			
		||||
	getStudyReply, err := c.GetStudy(ctx, getStudyreq)
 | 
			
		||||
	if err != nil {
 | 
			
		||||
		log.Fatalf("GetConfig Error %v", err)
 | 
			
		||||
	}
 | 
			
		||||
	log.Printf("Study ID %s StudyConf%v", studyId, getStudyReply.StudyConfig)
 | 
			
		||||
	setSuggesitonParameterRequest := &api.SetSuggestionParametersRequest{
 | 
			
		||||
		StudyId:              studyId,
 | 
			
		||||
		SuggestionAlgorithm:  "grid",
 | 
			
		||||
		SuggestionParameters: gridConfig,
 | 
			
		||||
	}
 | 
			
		||||
	setSuggesitonParameterReply, err := c.SetSuggestionParameters(ctx, setSuggesitonParameterRequest)
 | 
			
		||||
	if err != nil {
 | 
			
		||||
		log.Fatalf("SetConfig Error %v", err)
 | 
			
		||||
	}
 | 
			
		||||
	log.Printf("Grid Prameter ID %s", setSuggesitonParameterReply.ParamId)
 | 
			
		||||
	getGridSuggestRequest := &api.GetSuggestionsRequest{
 | 
			
		||||
		StudyId:             studyId,
 | 
			
		||||
		SuggestionAlgorithm: "grid",
 | 
			
		||||
		RequestNumber:       0,
 | 
			
		||||
		//RequestNumber=0 means get all grids.
 | 
			
		||||
		ParamId: setSuggesitonParameterReply.ParamId,
 | 
			
		||||
	}
 | 
			
		||||
	getGridSuggestReply, err := c.GetSuggestions(ctx, getGridSuggestRequest)
 | 
			
		||||
	if err != nil {
 | 
			
		||||
		log.Fatalf("GetSuggestion Error %v", err)
 | 
			
		||||
	}
 | 
			
		||||
	log.Println("Get Grid Suggestions:")
 | 
			
		||||
	for _, t := range getGridSuggestReply.Trials {
 | 
			
		||||
		log.Printf("%v", t)
 | 
			
		||||
	}
 | 
			
		||||
	workerIds := make([]string, len(getGridSuggestReply.Trials))
 | 
			
		||||
	workerParameter := make(map[string][]*api.Parameter)
 | 
			
		||||
	for i, t := range getGridSuggestReply.Trials {
 | 
			
		||||
		ws := workerConfig
 | 
			
		||||
		rtr := &api.RunTrialRequest{
 | 
			
		||||
			StudyId:      studyId,
 | 
			
		||||
			TrialId:      t.TrialId,
 | 
			
		||||
			Runtime:      "kubernetes",
 | 
			
		||||
			WorkerConfig: &ws,
 | 
			
		||||
		}
 | 
			
		||||
		rtr.WorkerConfig.Command = append(rtr.WorkerConfig.Command, "--output_model_gcs_path")
 | 
			
		||||
		rtr.WorkerConfig.Command = append(rtr.WorkerConfig.Command, "github-issue-summarization-data/"+t.TrialId+"output_model.h5")
 | 
			
		||||
		for _, p := range t.ParameterSet {
 | 
			
		||||
			rtr.WorkerConfig.Command = append(rtr.WorkerConfig.Command, p.Name)
 | 
			
		||||
			rtr.WorkerConfig.Command = append(rtr.WorkerConfig.Command, p.Value)
 | 
			
		||||
		}
 | 
			
		||||
		workerReply, err := c.RunTrial(ctx, rtr)
 | 
			
		||||
		if err != nil {
 | 
			
		||||
			log.Fatalf("RunTrial Error %v", err)
 | 
			
		||||
		}
 | 
			
		||||
		workerIds[i] = workerReply.WorkerId
 | 
			
		||||
		workerParameter[workerReply.WorkerId] = t.ParameterSet
 | 
			
		||||
		saveModelRequest := &api.SaveModelRequest{
 | 
			
		||||
			Model: &api.ModelInfo{
 | 
			
		||||
				StudyName:  studyConfig.Name,
 | 
			
		||||
				WorkerId:   workerReply.WorkerId,
 | 
			
		||||
				Parameters: t.ParameterSet,
 | 
			
		||||
				Metrics:    []*api.Metrics{},
 | 
			
		||||
				ModelPath:  "pvc:/Path/to/Model",
 | 
			
		||||
			},
 | 
			
		||||
			DataSet: &api.DataSetInfo{
 | 
			
		||||
				Name: "GitHub",
 | 
			
		||||
				Path: "/path/to/data",
 | 
			
		||||
			},
 | 
			
		||||
		}
 | 
			
		||||
		_, err = c.SaveModel(ctx, saveModelRequest)
 | 
			
		||||
		if err != nil {
 | 
			
		||||
			log.Fatalf("SaveModel Error %v", err)
 | 
			
		||||
		}
 | 
			
		||||
		log.Printf("WorkerID %s start\n", workerReply.WorkerId)
 | 
			
		||||
	}
 | 
			
		||||
	for true {
 | 
			
		||||
		time.Sleep(10 * time.Second)
 | 
			
		||||
		getMetricsRequest := &api.GetMetricsRequest{
 | 
			
		||||
			StudyId:   studyId,
 | 
			
		||||
			WorkerIds: workerIds,
 | 
			
		||||
		}
 | 
			
		||||
		getMetricsReply, err := c.GetMetrics(ctx, getMetricsRequest)
 | 
			
		||||
		if err != nil {
 | 
			
		||||
			log.Printf("GetMetErr %v", err)
 | 
			
		||||
			continue
 | 
			
		||||
		}
 | 
			
		||||
		for _, mls := range getMetricsReply.MetricsLogSets {
 | 
			
		||||
			if len(mls.MetricsLogs) > 0 {
 | 
			
		||||
				//Only Metrics can be updated.
 | 
			
		||||
				saveModelRequest := &api.SaveModelRequest{
 | 
			
		||||
					Model: &api.ModelInfo{
 | 
			
		||||
						StudyName: studyConfig.Name,
 | 
			
		||||
						WorkerId:  mls.WorkerId,
 | 
			
		||||
						Metrics:   []*api.Metrics{},
 | 
			
		||||
					},
 | 
			
		||||
				}
 | 
			
		||||
				for _, ml := range mls.MetricsLogs {
 | 
			
		||||
					if len(ml.Values) > 0 {
 | 
			
		||||
						log.Printf("WorkerID %s :\t Metrics Name %s Value %v", mls.WorkerId, ml.Name, ml.Values[len(ml.Values)-1])
 | 
			
		||||
						saveModelRequest.Model.Metrics = append(saveModelRequest.Model.Metrics, &api.Metrics{Name: ml.Name, Value: ml.Values[len(ml.Values)-1]})
 | 
			
		||||
					}
 | 
			
		||||
				}
 | 
			
		||||
				_, err = c.SaveModel(ctx, saveModelRequest)
 | 
			
		||||
				if err != nil {
 | 
			
		||||
					log.Fatalf("SaveModel Error %v", err)
 | 
			
		||||
				}
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
		getWorkerRequest := &api.GetWorkersRequest{StudyId: studyId}
 | 
			
		||||
		getWorkerReply, err := c.GetWorkers(ctx, getWorkerRequest)
 | 
			
		||||
		if err != nil {
 | 
			
		||||
			log.Fatalf("GetWorker Error %v", err)
 | 
			
		||||
		}
 | 
			
		||||
		completeCount := 0
 | 
			
		||||
		for _, w := range getWorkerReply.Workers {
 | 
			
		||||
			if w.Status == api.State_COMPLETED {
 | 
			
		||||
				completeCount++
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
		if completeCount == len(getWorkerReply.Workers) {
 | 
			
		||||
			log.Printf("All Worker Completed!")
 | 
			
		||||
			break
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1,44 @@
 | 
			
		|||
// Run an HP Tuning job using Katib
 | 
			
		||||
//
 | 
			
		||||
// Experimental:
 | 
			
		||||
// This is experimental code looking at adding hp tuning using Katib
 | 
			
		||||
// to the GitHub issue summarization example. It doesn't work yet.
 | 
			
		||||
local env = std.extVar("__ksonnet/environments");
 | 
			
		||||
local overrideParams = std.extVar("__ksonnet/params").components["hp-tune"];
 | 
			
		||||
local k = import "k.libsonnet";
 | 
			
		||||
 | 
			
		||||
// Default settings for the hp-tune component.
local defaultParams = {
  // Name given to the tuner Job.
  name: "hp-tune",
  // Image the tuner container runs.
  tunerImage: "gcr.io/kubeflow-examples/gh-issue-hp-tuner:v20180629-b14b337-dirty-e6d4f9",
  // host:port passed to the tuner as --katib_endpoint.
  katibEndpoint: "vizier-core:6789",
};

// Effective parameters: any key set in overrideParams replaces the default.
local params = defaultParams + overrideParams;
 | 
			
		||||
 | 
			
		||||
// The single container of the Job: runs the tuner binary and points it at the
// configured Katib endpoint.
local tunerContainer = {
  name: "tuner",
  image: params.tunerImage,
  command: [
    "/opt/kubeflow/git-issue-summarize-demo",
    "--katib_endpoint=" + params.katibEndpoint,
  ],
};

// batch/v1 Job that runs the tuner container in the environment's namespace.
// The pod is never restarted in place; backoffLimit is set to 4.
local tuner = {
  apiVersion: "batch/v1",
  kind: "Job",
  metadata: {
    name: params.name,
    namespace: env.namespace,
  },
  spec: {
    backoffLimit: 4,
    template: {
      spec: {
        restartPolicy: "Never",
        containers: [tunerContainer],
      },
    },
  },
};
 | 
			
		||||
 | 
			
		||||
std.prune(k.core.v1.list.new([tuner]))
 | 
			
		||||
| 
						 | 
				
			
			@ -95,5 +95,6 @@
 | 
			
		|||
    "tfjob-pvc-v1alpha2": {
 | 
			
		||||
      name: "tfjob-pvc-v1alpha2",
 | 
			
		||||
    },
 | 
			
		||||
    "hp-tune": {},
 | 
			
		||||
  },
 | 
			
		||||
}
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue