Disable Distributed Training (#207)

* Upgrade TFJob and Ksonnet app

* Container name should be tensorflow. See #563.

* Working single node training and serving on Kubeflow

* Add issue link for fixme

* Remove redundant create secrets and use Kubeflow provided secrets
This commit is contained in:
Sanyam Kapoor 2018-08-02 23:02:05 -07:00 committed by k8s-ci-robot
parent 091eacb4f6
commit e9e844022e
18 changed files with 96 additions and 417 deletions

View File

@ -38,16 +38,6 @@ $ gcloud auth configure-docker
See [Google Cloud Docs](https://cloud.google.com/docs/) for more.
### Create Kubernetes Secrets
This is needed for deployed pods in the Kubernetes cluster to access Google Cloud resources.
```
$ PROJECT=my-project ./create_secrets.sh
```
**NOTE**: Use `create_secrets.sh -d` to remove any side-effects of the above step.
### Python Environment Setup
This demo needs multiple Python versions and `virtualenv` is an easy way to
@ -74,7 +64,7 @@ See [Virtualenv Docs](https://virtualenv.pypa.io/en/stable/) for more.
To install dependencies, run the following commands
```
(env2.7) $ pip install https://github.com/kubeflow/batch-predict/tarball/master
(env2.7) $ pip install https://github.com/activatedgeek/batch-predict/tarball/fix-value-provider
(env2.7) $ pip install src/
```

View File

@ -1,57 +0,0 @@
#!/usr/bin/env bash
##
# This script creates all the necessary service accounts and permissions
# needed for the training jobs to pull private images from
# Google Cloud Registry and access Google Cloud Storage. To
# undo all the changes made, add a "-d" flag while executing the
# script.
#
# Usage:
#   PROJECT=my-project ./create_secrets.sh      # create SA, key, and secrets
#   PROJECT=my-project ./create_secrets.sh -d   # remove all side effects
#
# -e: abort on the first failing command; -x: echo each command for debugging.
set -ex
# The target GCP project must be supplied by the caller; fail fast when unset.
export PROJECT=${PROJECT:-}
if [[ -z "${PROJECT}" ]]; then
echo "PROJECT environment variable missing!"
exit 1
fi
# Derived identifiers: service account short name, its full email, and the
# local file the downloaded JSON key is written to.
export SA_NAME=code-search-access
export SA_EMAIL=${SA_NAME}@${PROJECT}.iam.gserviceaccount.com
export SA_KEY_FILE=${SA_EMAIL}.key.json
# Teardown path ("-d"): remove the IAM binding, delete the service account
# and its local key file, then drop both Kubernetes secrets.
if [[ "${1}" = "-d" ]]; then
gcloud projects remove-iam-policy-binding ${PROJECT} \
--member=serviceAccount:${SA_EMAIL} \
--role=roles/storage.admin
gcloud iam service-accounts delete ${SA_EMAIL} --quiet
rm -f ${SA_KEY_FILE}
kubectl delete secret gcp-credentials gcp-registry-credentials
exit 0
fi
# Create the service account and grant it storage admin on the project so
# jobs can read/write Google Cloud Storage.
gcloud iam service-accounts create ${SA_NAME} --display-name ${SA_EMAIL}
gcloud projects add-iam-policy-binding ${PROJECT} \
--member=serviceAccount:${SA_EMAIL} \
--role=roles/storage.admin
# Download a JSON key for the service account into SA_KEY_FILE.
gcloud iam service-accounts keys create ${SA_KEY_FILE} \
--iam-account=${SA_EMAIL}
# Docker-registry secret used as an imagePullSecret for private GCR images.
kubectl create secret docker-registry gcp-registry-credentials \
--docker-server=https://gcr.io \
--docker-username=_json_key \
--docker-password="$(cat ${SA_KEY_FILE})" \
--docker-email=${SA_EMAIL}
# Generic secret holding key.json; pods mount this for GCP API access.
kubectl create secret generic gcp-credentials \
--from-file=key.json="${SA_KEY_FILE}"

View File

@ -8,23 +8,36 @@
set -ex
GPU=${GPU:-0}
BASE_IMAGE_TAG=$([[ "${GPU}" = "1" ]] && echo "1.8.0-gpu" || echo "1.8.0")
BUILD_IMAGE_UUID=$(python3 -c 'import uuid; print(uuid.uuid4().hex[:7]);')
BUILD_IMAGE_TAG="code-search:v$(date +%Y%m%d)$([[ ${GPU} = "1" ]] && echo '-gpu' || echo '')-${BUILD_IMAGE_UUID}"
BUILD_IMAGE_TAG="code-search:v$(date +%Y%m%d)-${BUILD_IMAGE_UUID}"
# Directory of this script used for path references
_SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
pushd "${_SCRIPT_DIR}"
docker build -f "${_SCRIPT_DIR}/Dockerfile" -t ${BUILD_IMAGE_TAG} --build-arg BASE_IMAGE_TAG=${BASE_IMAGE_TAG} "${_SCRIPT_DIR}/../.."
# Build CPU image
docker build -f "${_SCRIPT_DIR}/Dockerfile" \
-t ${BUILD_IMAGE_TAG} \
--build-arg BASE_IMAGE_TAG=1.8.0 \
"${_SCRIPT_DIR}/../.."
# Push image to GCR if PROJECT is available
# Build GPU image
docker build -f "${_SCRIPT_DIR}/Dockerfile" \
-t ${BUILD_IMAGE_TAG}-gpu \
--build-arg BASE_IMAGE_TAG=1.8.0-gpu \
"${_SCRIPT_DIR}/../.."
# Push images to GCR Project if available
PROJECT=${PROJECT:-}
if [[ ! -z "${PROJECT}" ]]; then
# Tag and push CPU image
docker tag ${BUILD_IMAGE_TAG} gcr.io/${PROJECT}/${BUILD_IMAGE_TAG}
docker push gcr.io/${PROJECT}/${BUILD_IMAGE_TAG}
# Tag and push GPU image
docker tag ${BUILD_IMAGE_TAG}-gpu gcr.io/${PROJECT}/${BUILD_IMAGE_TAG}-gpu
docker push gcr.io/${PROJECT}/${BUILD_IMAGE_TAG}-gpu
fi
popd

View File

@ -16,11 +16,13 @@ TF_CONFIG=${TF_CONFIG:-}
if [[ ! -z "${TF_CONFIG}" ]]; then
WORKER_ID=$(echo "${TF_CONFIG}" | jq ".task.index")
WORKER_TYPE=$(echo "${TF_CONFIG}" | jq -r ".task.type")
MASTER_INSTANCE=$(echo "${TF_CONFIG}" | jq -r ".cluster.master[0]")
MASTER_INSTANCE=$(echo "${TF_CONFIG}" | jq -r ".cluster.${WORKER_TYPE}[${WORKER_ID}]")
if [[ "${TARGET_BIN}" = "t2t-trainer" ]]; then
TARGET_BIN_OPTS="${TARGET_BIN_OPTS} --master=grpc://${MASTER_INSTANCE} --worker_id=${WORKER_ID}"
fi
# FIXME(sanyamkapoor): Distributed training hangs. See kubeflow/examples#208.
# if [[ "${TARGET_BIN}" = "t2t-trainer" ]]; then
# TARGET_BIN_OPTS="${TARGET_BIN_OPTS} --master=grpc://${MASTER_INSTANCE} --worker_id=${WORKER_ID}"
# fi
unset TF_CONFIG
fi
EVAL_CMD="${TARGET_BIN} ${TARGET_BIN_OPTS} ${@:2}"

View File

@ -1,37 +1,23 @@
apiVersion: 0.1.0
apiVersion: 0.2.0
environments:
code-search:
kf-cs:
destination:
namespace: kubeflow
server: https://35.193.190.6
server: https://35.232.164.190
k8sVersion: v1.9.6
path: code-search
path: kf-cs
kind: ksonnet.io/app
libraries:
tf-job:
gitVersion:
commitSha: d8e19a4762406bb454453331f52ed5a4433c0df9
refSpec: master
name: tf-job
registry: kubeflow
tf-serving:
gitVersion:
commitSha: e1b2aee865866b2e7e4f8c41b34ae03b4c4bb0db
refSpec: master
name: tf-serving
registry: kubeflow
version: e95f94a1a97a0974ada734895d590b5ba565fa77
name: kubeflow
registries:
incubator:
gitVersion:
commitSha: 40285d8a14f1ac5787e405e1023cf0c07f6aa28c
refSpec: master
protocol: github
uri: github.com/ksonnet/parts/tree/master/incubator
kubeflow:
gitVersion:
commitSha: d8e19a4762406bb454453331f52ed5a4433c0df9
refSpec: master
protocol: github
uri: github.com/kubeflow/kubeflow/tree/master/kubeflow
uri: https://github.com/kubeflow/kubeflow/tree/v0.2.2/kubeflow
version: 0.0.1

View File

@ -2,6 +2,7 @@
global: {
// User-defined global parameters; accessible to all components and environments, Ex:
// replicas: 4,
t2tWorkingDir: "gs://kubeflow-examples/t2t-code-search/20180802",
},
components: {
// Component-level parameters, defined initially from 'ks prototype use ...'
@ -9,8 +10,7 @@
"t2t-job": {
jobType: "trainer",
numMaster: 1,
numWorker: 0,
numWorker: 1,
numPs: 0,
numWorkerGpu: 0,
numPsGpu: 0,
@ -18,8 +18,8 @@
train_steps: 100,
eval_steps: 10,
image: "gcr.io/kubeflow-dev/code-search:v20180719-f04a4b7",
imageGpu: "gcr.io/kubeflow-dev/code-search:v20180719-gpu-9b8b4a8",
image: "gcr.io/kubeflow-dev/code-search:v20180802-c622aac",
imageGpu: "gcr.io/kubeflow-dev/code-search:v20180802-c622aac-gpu",
imagePullSecrets: [],
dataDir: "null",
@ -28,18 +28,23 @@
hparams_set: "null",
},
"t2t-code-search-datagen": {
jobType: "datagen",
name: "t2t-code-search-datagen",
problem: "github_function_docstring",
dataDir: $.global.t2tWorkingDir + "/data",
},
"t2t-code-search-trainer": {
jobType: "trainer",
numWorker: 2,
numPs: 1,
// numWorkerGpu: 1,
// numPsGpu: 1,
name: "t2t-code-search-trainer",
problem: "github_function_docstring",
dataDir: "gs://kubeflow-examples/t2t-code-search/data",
outputDir: "gs://kubeflow-examples/t2t-code-search/output",
dataDir: $.global.t2tWorkingDir + "/data",
outputDir: $.global.t2tWorkingDir + "/output",
model: "similarity_transformer",
hparams_set: "transformer_tiny",
},
@ -50,8 +55,8 @@
name: "t2t-code-search-exporter",
problem: "github_function_docstring",
dataDir: "gs://kubeflow-examples/t2t-code-search/data",
outputDir: "gs://kubeflow-examples/t2t-code-search/output",
dataDir: $.global.t2tWorkingDir + "/data",
outputDir: $.global.t2tWorkingDir + "/output",
model: "similarity_transformer",
hparams_set: "transformer_tiny",
},
@ -59,8 +64,8 @@
"t2t-code-search-serving": {
name: "t2t-code-search",
modelName: "t2t_code_search",
modelPath: "gs://kubeflow-examples/t2t-code-search/output/export/Servo",
modelName: "t2t-code-search",
modelPath: $.global.t2tWorkingDir + "/output/export/Servo",
modelServerImage: "gcr.io/kubeflow-images-public/tensorflow-serving-1.8:latest",
cloud: "gcp",
gcpCredentialSecretName: "gcp-credentials",

View File

@ -0,0 +1,7 @@
// Ksonnet component that emits the TFJob for the T2T datagen step.
local k = import "k.libsonnet";
local t2tJob = import "t2t-job.libsonnet";
// Environment (e.g. namespace) and per-component parameters injected by ksonnet.
local env = std.extVar("__ksonnet/environments");
local params = std.extVar("__ksonnet/params").components["t2t-code-search-datagen"];
// Wrap the TFJob in a v1 List; std.prune drops null/empty fields from the manifest.
std.prune(k.core.v1.list.new([t2tJob.parts(params, env).job]))

View File

@ -1,9 +1,9 @@
local tfJob = import "kubeflow/tf-job/tf-job.libsonnet";
local baseParams = std.extVar("__ksonnet/params").components["t2t-job"];
{
getDatagenCmd(params)::
[
"/usr/local/sbin/t2t-entrypoint",
"t2t-datagen",
"--problem=" + params.problem,
"--data_dir=" + params.dataDir,
@ -24,36 +24,32 @@ local baseParams = std.extVar("__ksonnet/params").components["t2t-job"];
local trainer = [
"/usr/local/sbin/t2t-entrypoint",
"t2t-trainer",
"--generate_data",
"--problem=" + params.problem,
"--model=" + params.model,
"--hparams_set=" + params.hparams_set,
"--data_dir=" + params.dataDir,
"--output_dir=" + params.outputDir,
"--train_steps=" + std.toString(params.train_steps),
"--eval_steps=" + std.toString(params.eval_steps),
"--t2t_usr_dir=/app/code_search/t2t",
],
local workerBase = trainer + [
worker: trainer,
worker_dist: trainer + [
"--schedule=train",
"--ps_gpu=" + std.toString(params.numPsGpu),
"--worker_gpu=" + std.toString(params.numWorkerGpu),
"--worker_replicas=" + std.toString(params.numWorker + params.numMaster),
"--worker_replicas=" + std.toString(params.numWorker),
"--ps_replicas=" + std.toString(params.numPs),
"--eval_steps=" + std.toString(params.eval_steps),
"--worker_job=/job:worker",
],
ps: trainer + [
"--schedule=run_std_server",
"--ps_job=/job:ps",
],
worker: workerBase + [
"--worker_job=/job:worker",
],
master: workerBase + [
"--worker_job=/job:master",
],
},
tfJobReplica(replicaType, number, args, image, numGpus=0, imagePullSecrets=[], env=[], volumes=[], volumeMounts=[])::
@ -61,9 +57,9 @@ local baseParams = std.extVar("__ksonnet/params").components["t2t-job"];
image: image,
name: "tensorflow",
[if std.length(args) > 0 then "args"]: args,
[if numGpus > 0 then "resources"]: {
resources: {
limits: {
"nvidia.com/gpu": numGpus,
[if numGpus > 0 then "nvidia.com/gpu"]: numGpus,
},
},
[if std.length(env) > 0 then "env"]: env,
@ -76,34 +72,26 @@ local baseParams = std.extVar("__ksonnet/params").components["t2t-job"];
containers: [ containerSpec ],
[if std.length(imagePullSecrets) > 0 then "imagePullSecrets"]: imagePullSecrets,
[if std.length(volumes) > 0 then "volumes"]: volumes,
restartPolicy: "OnFailure",
// restartPolicy: "OnFailure",
},
},
tfReplicaType: replicaType,
},
parts(newParams, env):: {
local params = baseParams + newParams,
local terminationPolicy = if params.numMaster == 1
then tfJob.parts.tfJobTerminationPolicy("MASTER", 0)
else tfJob.parts.tfJobTerminationPolicy("WORKER", 0),
local workerImage = if params.numWorkerGpu > 0 then params.imageGpu else params.image,
local workerImagePullSecrets = [
{ name: "gcp-registry-credentials" },
],
local workerEnv = [
{
name: "GOOGLE_APPLICATION_CREDENTIALS",
value: "/secret/gcp-credentials/key.json"
value: "/secret/gcp-credentials/user-gcp-sa.json"
},
],
local workerVolumes = [
{
name: "gcp-credentials",
secret: {
secretName: "gcp-credentials",
secretName: "user-gcp-sa",
},
},
],
@ -115,26 +103,32 @@ local baseParams = std.extVar("__ksonnet/params").components["t2t-job"];
],
local cmd = $.getTrainerCmd(params),
local workerCmd = if params.jobType == "exporter" then $.getExporterCmd(params)
else if params.jobType == "datagen" then $.getDatagenCmd(params)
else cmd.worker,
job::
tfJob.parts.tfJob(
params.name,
env.namespace,
if params.jobType == "exporter" then
[
$.tfJobReplica("MASTER", params.numMaster, $.getExporterCmd(params), workerImage, params.numWorkerGpu,
workerImagePullSecrets, workerEnv, workerVolumes, workerVolumeMounts),
]
else
[
$.tfJobReplica("MASTER", params.numMaster, cmd.master, workerImage, params.numWorkerGpu,
workerImagePullSecrets, workerEnv, workerVolumes, workerVolumeMounts),
$.tfJobReplica("WORKER", params.numWorker, cmd.worker, workerImage, params.numWorkerGpu,
workerImagePullSecrets, workerEnv, workerVolumes, workerVolumeMounts),
$.tfJobReplica("PS", params.numPs, cmd.ps, workerImage, params.numPsGpu,
workerImagePullSecrets, workerEnv, workerVolumes, workerVolumeMounts),
],
terminationPolicy
),
job:: {
apiVersion: "kubeflow.org/v1alpha2",
kind: "TFJob",
metadata: {
name: params.name,
namespace: env.namespace,
},
spec: {
tfReplicaSpecs: {
[if params.numPs > 0 then "PS"]: $.tfJobReplica("PS", params.numPs, cmd.ps, workerImage,
numGpus=params.numPsGpu,
env=workerEnv,
volumes=workerVolumes,
volumeMounts=workerVolumeMounts),
[if params.numWorker > 0 then "Worker"]: $.tfJobReplica("WORKER", params.numWorker,
workerCmd, workerImage,
numGpus=params.numPsGpu,
env=workerEnv,
volumes=workerVolumes,
volumeMounts=workerVolumeMounts),
},
},
},
},
}

View File

@ -1,91 +0,0 @@
<!-- START doctoc generated TOC please keep comment here to allow auto update -->
<!-- DON'T EDIT THIS SECTION, INSTEAD RE-RUN doctoc TO UPDATE -->
**Table of Contents** *generated with [DocToc](https://github.com/thlorenz/doctoc)*
- [tf-job](#tf-job)
- [Quickstart](#quickstart)
- [Using the library](#using-the-library)
- [io.ksonnet.pkg.tf-job](#ioksonnetpkgtf-job)
- [Example](#example)
- [Parameters](#parameters)
- [Example](#example-1)
- [Parameters](#parameters-1)
<!-- END doctoc generated TOC please keep comment here to allow auto update -->
# tf-job
> Prototypes for running TensorFlow jobs.
* [Quickstart](#quickstart)
* [Using Prototypes](#using-prototypes)
* [io.ksonnet.pkg.tf-job](#io.ksonnet.pkg.tf-job)
* [io.ksonnet.pkg.tf-cnn](#io.ksonnet.pkg.tf-cnn)
## Quickstart
*The following commands use the `io.ksonnet.pkg.tf-job` prototype to generate Kubernetes YAML for tf-job, and then deploys it to your Kubernetes cluster.*
First, create a cluster and install the ksonnet CLI (see root-level [README.md](rootReadme)).
If you haven't yet created a [ksonnet application](linkToSomewhere), do so using `ks init <app-name>`.
Finally, in the ksonnet application directory, run the following:
```shell
# Expand prototype as a Jsonnet file, place in a file in the
# `components/` directory. (YAML and JSON are also available.)
$ ks prototype use io.ksonnet.pkg.tf-job tf-job \
--namespace default \
--name tf-job
# Apply to server.
$ ks apply -f tf-job.jsonnet
```
## Using the library
The library files for tf-job define a set of relevant *parts* (_e.g._, deployments, services, secrets, and so on) that can be combined to configure tf-job for a wide variety of scenarios. For example, a database like Redis may need a secret to hold the user password, or it may have no password if it's acting as a cache.
This library provides a set of pre-fabricated "flavors" (or "distributions") of tf-job, each of which is configured for a different use case. These are captured as ksonnet *prototypes*, which allow users to interactively customize these distributions for their specific needs.
These prototypes, as well as how to use them, are enumerated below.
### io.ksonnet.pkg.tf-job
A TensorFlow job (could be training or evaluation).
#### Example
```shell
# Expand prototype as a Jsonnet file, place in a file in the
# `components/` directory. (YAML and JSON are also available.)
$ ks prototype use io.ksonnet.pkg.tf-job tf-job \
--name YOUR_NAME_HERE
```
#### Parameters
The available options to pass to the prototype are:
* `--name=<name>`: Name to give to each of the components [string]
### io.ksonnet.pkg.tf-cnn
A TensorFlow CNN Benchmarking job
#### Example
```shell
# Expand prototype as a Jsonnet file, place in a file in the
# `components/` directory. (YAML and JSON are also available.)
$ ks prototype use io.ksonnet.pkg.tf-cnn tf-job \
--name YOUR_NAME_HERE
```
#### Parameters
The available options to pass to the prototype are:
* `--name=<name>`: Name for the job. [string]
[rootReadme]: https://github.com/ksonnet/mixins

View File

@ -1,35 +0,0 @@
{
"name": "tf-job",
"apiVersion": "0.0.1",
"kind": "ksonnet.io/parts",
"description": "Prototypes for running TensorFlow jobs.\n",
"author": "kubeflow team <kubeflow-team@google.com>",
"contributors": [
{
"name": "Jeremy Lewi",
"email": "jlewi@google.com"
}
],
"repository": {
"type": "git",
"url": "https://github.com/kubeflow/kubeflow"
},
"bugs": {
"url": "https://github.com/kubeflow/kubeflow/issues"
},
"keywords": [
"kubeflow",
"tensorflow",
"database"
],
"quickStart": {
"prototype": "io.ksonnet.pkg.tf-job",
"componentName": "tf-job",
"flags": {
"name": "tf-job",
"namespace": "default"
},
"comment": "Run TensorFlow Job"
},
"license": "Apache 2.0"
}

View File

@ -1,65 +0,0 @@
// @apiVersion 0.1
// @name io.ksonnet.pkg.tf-job
// @description A TensorFlow job (could be training or evaluation).
// @shortDescription A TensorFlow job.
// @param name string Name to give to each of the components
// @optionalParam namespace string null Namespace to use for the components. It is automatically inherited from the environment if not set.
// @optionalParam args string null Comma separated list of arguments to pass to the job
// @optionalParam image string null The docker image to use for the job.
// @optionalParam image_gpu string null The docker image to use when using GPUs.
// @optionalParam image_pull_secrets string null Comma-delimited list of secret names to use credentials in pulling your docker images.
// @optionalParam num_masters number 1 The number of masters to use
// @optionalParam num_ps number 0 The number of ps to use
// @optionalParam num_workers number 0 The number of workers to use
// @optionalParam num_gpus number 0 The number of GPUs to attach to workers.
// TODO(https://github.com/ksonnet/ksonnet/issues/235): ks param set args won't work if the arg starts with "--".
local k = import "k.libsonnet";
local tfJob = import "kubeflow/tf-job/tf-job.libsonnet";
// NOTE(review): `params` and `env` are not declared here; they are injected
// by ksonnet when the prototype is expanded.
// updatedParams uses the environment namespace if
// the namespace parameter is not explicitly set
local updatedParams = params {
namespace: if params.namespace == "null" then env.namespace else params.namespace,
};
local name = import "param://name";
local namespace = updatedParams.namespace;
// Parse the comma-delimited args parameter; the string "null" means "no args".
local argsParam = import "param://args";
local args =
if argsParam == "null" then
[]
else
std.split(argsParam, ",");
local image = import "param://image";
local imageGpu = import "param://image_gpu";
local imagePullSecrets = import "param://image_pull_secrets";
local numMasters = import "param://num_masters";
local numPs = import "param://num_ps";
local numWorkers = import "param://num_workers";
local numGpus = import "param://num_gpus";
// With a single master, the MASTER replica is the chief; otherwise the
// first WORKER replica decides job completion.
local terminationPolicy = if numMasters == 1 then
tfJob.parts.tfJobTerminationPolicy("MASTER", 0)
else
tfJob.parts.tfJobTerminationPolicy("WORKER", 0);
// Workers use the GPU image and GPU limits only when GPUs were requested.
local workerSpec = if numGpus > 0 then
tfJob.parts.tfJobReplica("WORKER", numWorkers, args, imageGpu, imagePullSecrets, numGpus)
else
tfJob.parts.tfJobReplica("WORKER", numWorkers, args, image, imagePullSecrets);
// Assemble the TFJob manifest; std.prune removes replica specs that were
// returned as {} (i.e. replica count 0).
std.prune(k.core.v1.list.new([
tfJob.parts.tfJob(
name,
namespace,
[
tfJob.parts.tfJobReplica("MASTER", numMasters, args, image, imagePullSecrets),
workerSpec,
tfJob.parts.tfJobReplica("PS", numPs, args, image, imagePullSecrets),
],
terminationPolicy
),
]))

View File

@ -1,59 +0,0 @@
local k = import "k.libsonnet";
local util = import "util.libsonnet";
{
parts:: {
// Build one TFJob replica spec (e.g. MASTER/WORKER/PS).
// Returns {} when `number` is 0 so callers can prune it away.
tfJobReplica(replicaType, number, args, image, imagePullSecrets=[], numGpus=0)::
// The container must be named "tensorflow" for the TFJob operator.
local baseContainer = {
image: image,
name: "tensorflow",
};
// Only emit an "args" field when a non-empty argument list was given.
local containerArgs = if std.length(args) > 0 then
{
args: args,
}
else {};
// GPU resource limits, present only when GPUs were requested.
local resources = if numGpus > 0 then {
resources: {
limits: {
"nvidia.com/gpu": numGpus,
},
},
} else {};
if number > 0 then
{
replicas: number,
template: {
spec: {
// imagePullSecrets may arrive as a comma-delimited string; normalize it.
imagePullSecrets: [{ name: secret } for secret in util.toArray(imagePullSecrets)],
containers: [
baseContainer + containerArgs + resources,
],
restartPolicy: "OnFailure",
},
},
tfReplicaType: replicaType,
}
else {},
// Termination policy naming the chief replica that determines job completion.
tfJobTerminationPolicy(replicaName, replicaIndex):: {
chief: {
replicaName: replicaName,
replicaIndex: replicaIndex,
},
},
// Assemble a complete v1alpha1 TFJob manifest from replica specs and a
// termination policy.
tfJob(name, namespace, replicas, tp):: {
apiVersion: "kubeflow.org/v1alpha1",
kind: "TFJob",
metadata: {
name: name,
namespace: namespace,
},
spec: {
replicaSpecs: replicas,
terminationPolicy: tp,
},
},
},
}

View File

@ -1,7 +0,0 @@
{
  // Split a comma-delimited string into an array of substrings.
  // Non-string inputs, the literal string "null", and empty strings
  // all yield an empty array.
  toArray(str)::
    local usable =
      std.type(str) == "string" && str != "null" && std.length(str) > 0;
    if usable then std.split(str, ",") else [],
}

View File

@ -122,7 +122,6 @@
args: [
"/usr/bin/tensorflow_model_server",
"--port=9000",
"--rest_api_port=8000",
"--model_name=" + $.params.modelName,
"--model_base_path=" + $.params.modelPath,
],
@ -130,9 +129,6 @@
{
containerPort: 9000,
},
{
containerPort: 8000,
},
],
// TODO(jlewi): We should add readiness and liveness probes. I think the blocker is that
// model-server doesn't have something we can use out of the box.