mirror of https://github.com/kubeflow/examples.git
Disable Distributed Training (#207)
* Upgrade TFJob and Ksonnet app
* Container name should be tensorflow. See #563.
* Working single node training and serving on Kubeflow
* Add issue link for fixme
* Remove redundant create secrets and use Kubeflow provided secrets
This commit is contained in:
parent 091eacb4f6
commit e9e844022e
@@ -38,16 +38,6 @@ $ gcloud auth configure-docker
 See [Google Cloud Docs](https://cloud.google.com/docs/) for more.
 
-### Create Kubernetes Secrets
-
-This is needed for deployed pods in the Kubernetes cluster to access Google Cloud resources.
-
-```
-$ PROJECT=my-project ./create_secrets.sh
-```
-
-**NOTE**: Use `create_secrets.sh -d` to remove any side-effects of the above step.
-
 ### Python Environment Setup
 
 This demo needs multiple Python versions and `virtualenv` is an easy way to
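The removed step is superseded by the secrets a standard Kubeflow deployment already provisions; the `user-gcp-sa` secret referenced later in this commit is assumed to exist. A minimal sanity check, assuming the `kubeflow` namespace from this app's environment:

```
$ kubectl get secret user-gcp-sa -n kubeflow -o jsonpath='{.metadata.name}'
```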
@@ -74,7 +64,7 @@ See [Virtualenv Docs](https://virtualenv.pypa.io/en/stable/) for more.
 To install dependencies, run the following commands
 
 ```
-(env2.7) $ pip install https://github.com/kubeflow/batch-predict/tarball/master
+(env2.7) $ pip install https://github.com/activatedgeek/batch-predict/tarball/fix-value-provider
 (env2.7) $ pip install src/
 ```
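For context, the `(env2.7)` prompt implies a Python 2.7 virtualenv; a sketch of creating one, with the environment name inferred from the prompt:

```
$ virtualenv -p python2.7 env2.7
$ source env2.7/bin/activate
```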
@@ -1,57 +0,0 @@
-#!/usr/bin/env bash
-
-##
-# This script creates all the necessary service accounts and permissions
-# needed for the training jobs to pull private images from
-# Google Cloud Registry and access Google Cloud Storage. To
-# undo all the changes made, add a "-d" flag while executing the
-# script.
-#
-
-set -ex
-
-export PROJECT=${PROJECT:-}
-
-if [[ -z "${PROJECT}" ]]; then
-  echo "PROJECT environment variable missing!"
-  exit 1
-fi
-
-export SA_NAME=code-search-access
-export SA_EMAIL=${SA_NAME}@${PROJECT}.iam.gserviceaccount.com
-export SA_KEY_FILE=${SA_EMAIL}.key.json
-
-
-if [[ "${1}" = "-d" ]]; then
-  gcloud projects remove-iam-policy-binding ${PROJECT} \
-    --member=serviceAccount:${SA_EMAIL} \
-    --role=roles/storage.admin
-
-  gcloud iam service-accounts delete ${SA_EMAIL} --quiet
-
-  rm -f ${SA_KEY_FILE}
-
-  kubectl delete secret gcp-credentials gcp-registry-credentials
-
-  exit 0
-fi
-
-
-gcloud iam service-accounts create ${SA_NAME} --display-name ${SA_EMAIL}
-
-gcloud projects add-iam-policy-binding ${PROJECT} \
-  --member=serviceAccount:${SA_EMAIL} \
-  --role=roles/storage.admin
-
-gcloud iam service-accounts keys create ${SA_KEY_FILE} \
-  --iam-account=${SA_EMAIL}
-
-kubectl create secret docker-registry gcp-registry-credentials \
-  --docker-server=https://gcr.io \
-  --docker-username=_json_key \
-  --docker-password="$(cat ${SA_KEY_FILE})" \
-  --docker-email=${SA_EMAIL}
-
-kubectl create secret generic gcp-credentials \
-  --from-file=key.json="${SA_KEY_FILE}"
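For reference, the removed helper was driven by a single environment variable and an optional flag, as the old README documented:

```
$ PROJECT=my-project ./create_secrets.sh      # create the service account, key, and secrets
$ PROJECT=my-project ./create_secrets.sh -d   # undo every side effect
```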
@@ -8,23 +8,36 @@
 
 set -ex
 
-GPU=${GPU:-0}
-BASE_IMAGE_TAG=$([[ "${GPU}" = "1" ]] && echo "1.8.0-gpu" || echo "1.8.0")
 BUILD_IMAGE_UUID=$(python3 -c 'import uuid; print(uuid.uuid4().hex[:7]);')
-BUILD_IMAGE_TAG="code-search:v$(date +%Y%m%d)$([[ ${GPU} = "1" ]] && echo '-gpu' || echo '')-${BUILD_IMAGE_UUID}"
+BUILD_IMAGE_TAG="code-search:v$(date +%Y%m%d)-${BUILD_IMAGE_UUID}"
 
 # Directory of this script used for path references
 _SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 
 pushd "${_SCRIPT_DIR}"
 
-docker build -f "${_SCRIPT_DIR}/Dockerfile" -t ${BUILD_IMAGE_TAG} --build-arg BASE_IMAGE_TAG=${BASE_IMAGE_TAG} "${_SCRIPT_DIR}/../.."
+# Build CPU image
+docker build -f "${_SCRIPT_DIR}/Dockerfile" \
+             -t ${BUILD_IMAGE_TAG} \
+             --build-arg BASE_IMAGE_TAG=1.8.0 \
+             "${_SCRIPT_DIR}/../.."
 
-# Push image to GCR PROJECT available
+# Build GPU image
+docker build -f "${_SCRIPT_DIR}/Dockerfile" \
+             -t ${BUILD_IMAGE_TAG}-gpu \
+             --build-arg BASE_IMAGE_TAG=1.8.0-gpu \
+             "${_SCRIPT_DIR}/../.."
+
+# Push images to GCR Project if available
 PROJECT=${PROJECT:-}
 if [[ ! -z "${PROJECT}" ]]; then
+  # Tag and push CPU image
   docker tag ${BUILD_IMAGE_TAG} gcr.io/${PROJECT}/${BUILD_IMAGE_TAG}
   docker push gcr.io/${PROJECT}/${BUILD_IMAGE_TAG}
+
+  # Tag and push GPU image
+  docker tag ${BUILD_IMAGE_TAG}-gpu gcr.io/${PROJECT}/${BUILD_IMAGE_TAG}-gpu
+  docker push gcr.io/${PROJECT}/${BUILD_IMAGE_TAG}-gpu
 fi
 
 popd
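A sketch of how the updated script would be invoked; the script file name here is hypothetical (the diff does not show it), and `PROJECT` is only required for the push step:

```
$ PROJECT=my-gcp-project ./build_image.sh
```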
@@ -16,11 +16,13 @@ TF_CONFIG=${TF_CONFIG:-}
 if [[ ! -z "${TF_CONFIG}" ]]; then
   WORKER_ID=$(echo "${TF_CONFIG}" | jq ".task.index")
   WORKER_TYPE=$(echo "${TF_CONFIG}" | jq -r ".task.type")
-  MASTER_INSTANCE=$(echo "${TF_CONFIG}" | jq -r ".cluster.master[0]")
+  MASTER_INSTANCE=$(echo "${TF_CONFIG}" | jq -r ".cluster.${WORKER_TYPE}[${WORKER_ID}]")
 
-  if [[ "${TARGET_BIN}" = "t2t-trainer" ]]; then
-    TARGET_BIN_OPTS="${TARGET_BIN_OPTS} --master=grpc://${MASTER_INSTANCE} --worker_id=${WORKER_ID}"
-  fi
+  # FIXME(sanyamkapoor): Distributed training hangs. See kubeflow/examples#208.
+  # if [[ "${TARGET_BIN}" = "t2t-trainer" ]]; then
+  #   TARGET_BIN_OPTS="${TARGET_BIN_OPTS} --master=grpc://${MASTER_INSTANCE} --worker_id=${WORKER_ID}"
+  # fi
+  unset TF_CONFIG
 fi
 
 EVAL_CMD="${TARGET_BIN} ${TARGET_BIN_OPTS} ${@:2}"
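A worked example of the jq lookups above, against a hand-written `TF_CONFIG` (the shape follows the TFJob convention; host names are illustrative):

```
$ TF_CONFIG='{"cluster":{"worker":["cs-0:2222","cs-1:2222"]},"task":{"type":"worker","index":1}}'
$ echo "${TF_CONFIG}" | jq ".task.index"            # 1
$ echo "${TF_CONFIG}" | jq -r ".task.type"          # worker
$ echo "${TF_CONFIG}" | jq -r ".cluster.worker[1]"  # cs-1:2222
```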
@@ -1,37 +1,23 @@
-apiVersion: 0.1.0
+apiVersion: 0.2.0
 environments:
-  code-search:
+  kf-cs:
     destination:
       namespace: kubeflow
-      server: https://35.193.190.6
+      server: https://35.232.164.190
     k8sVersion: v1.9.6
-    path: code-search
+    path: kf-cs
 kind: ksonnet.io/app
 libraries:
-  tf-job:
-    gitVersion:
-      commitSha: d8e19a4762406bb454453331f52ed5a4433c0df9
-      refSpec: master
-    name: tf-job
-    registry: kubeflow
   tf-serving:
-    gitVersion:
-      commitSha: e1b2aee865866b2e7e4f8c41b34ae03b4c4bb0db
-      refSpec: master
     name: tf-serving
     registry: kubeflow
+    version: e95f94a1a97a0974ada734895d590b5ba565fa77
 name: kubeflow
 registries:
   incubator:
-    gitVersion:
-      commitSha: 40285d8a14f1ac5787e405e1023cf0c07f6aa28c
-      refSpec: master
     protocol: github
     uri: github.com/ksonnet/parts/tree/master/incubator
   kubeflow:
-    gitVersion:
-      commitSha: d8e19a4762406bb454453331f52ed5a4433c0df9
-      refSpec: master
     protocol: github
-    uri: github.com/kubeflow/kubeflow/tree/master/kubeflow
+    uri: https://github.com/kubeflow/kubeflow/tree/v0.2.2/kubeflow
 version: 0.0.1
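A sketch of how the renamed environment would be registered, with values taken from the file above (flags per standard ksonnet usage):

```
$ ks env add kf-cs --server=https://35.232.164.190 --namespace=kubeflow
```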
@@ -2,6 +2,7 @@
   global: {
     // User-defined global parameters; accessible to all component and environments, Ex:
     // replicas: 4,
+    t2tWorkingDir: "gs://kubeflow-examples/t2t-code-search/20180802",
   },
   components: {
     // Component-level parameters, defined initially from 'ks prototype use ...'
@@ -9,8 +10,7 @@
     "t2t-job": {
       jobType: "trainer",
 
-      numMaster: 1,
-      numWorker: 0,
+      numWorker: 1,
       numPs: 0,
       numWorkerGpu: 0,
       numPsGpu: 0,
@@ -18,8 +18,8 @@
       train_steps: 100,
       eval_steps: 10,
 
-      image: "gcr.io/kubeflow-dev/code-search:v20180719-f04a4b7",
-      imageGpu: "gcr.io/kubeflow-dev/code-search:v20180719-gpu-9b8b4a8",
+      image: "gcr.io/kubeflow-dev/code-search:v20180802-c622aac",
+      imageGpu: "gcr.io/kubeflow-dev/code-search:v20180802-c622aac-gpu",
       imagePullSecrets: [],
 
       dataDir: "null",
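These defaults are typically overridden per component with `ks param set`; a sketch using names from the file above:

```
$ ks param set t2t-job numWorker 1
$ ks param set t2t-job image gcr.io/kubeflow-dev/code-search:v20180802-c622aac
```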
@@ -28,18 +28,23 @@
       hparams_set: "null",
     },
 
     "t2t-code-search-datagen": {
       jobType: "datagen",
 
       name: "t2t-code-search-datagen",
 
       problem: "github_function_docstring",
+      dataDir: $.global.t2tWorkingDir + "/data",
     },
 
     "t2t-code-search-trainer": {
       jobType: "trainer",
+      numWorker: 2,
+      numPs: 1,
+      // numWorkerGpu: 1,
+      // numPsGpu: 1,
 
       name: "t2t-code-search-trainer",
 
       problem: "github_function_docstring",
-      dataDir: "gs://kubeflow-examples/t2t-code-search/data",
-      outputDir: "gs://kubeflow-examples/t2t-code-search/output",
+      dataDir: $.global.t2tWorkingDir + "/data",
+      outputDir: $.global.t2tWorkingDir + "/output",
       model: "similarity_transformer",
       hparams_set: "transformer_tiny",
     },
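With `t2tWorkingDir` set as above, the Jsonnet concatenations resolve to `gs://kubeflow-examples/t2t-code-search/20180802/data` and `.../output`. A quick check, assuming access to the bucket:

```
$ gsutil ls gs://kubeflow-examples/t2t-code-search/20180802/
```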
@@ -50,8 +55,8 @@
       name: "t2t-code-search-exporter",
 
       problem: "github_function_docstring",
-      dataDir: "gs://kubeflow-examples/t2t-code-search/data",
-      outputDir: "gs://kubeflow-examples/t2t-code-search/output",
+      dataDir: $.global.t2tWorkingDir + "/data",
+      outputDir: $.global.t2tWorkingDir + "/output",
       model: "similarity_transformer",
       hparams_set: "transformer_tiny",
     },
@@ -59,8 +64,8 @@
     "t2t-code-search-serving": {
       name: "t2t-code-search",
 
-      modelName: "t2t_code_search",
-      modelPath: "gs://kubeflow-examples/t2t-code-search/output/export/Servo",
+      modelName: "t2t-code-search",
+      modelPath: $.global.t2tWorkingDir + "/output/export/Servo",
       modelServerImage: "gcr.io/kubeflow-images-public/tensorflow-serving-1.8:latest",
       cloud: "gcp",
       gcpCredentialSecretName: "gcp-credentials",
@@ -0,0 +1,7 @@
+local k = import "k.libsonnet";
+local t2tJob = import "t2t-job.libsonnet";
+
+local env = std.extVar("__ksonnet/environments");
+local params = std.extVar("__ksonnet/params").components["t2t-code-search-datagen"];
+
+std.prune(k.core.v1.list.new([t2tJob.parts(params, env).job]))
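A sketch of submitting the new component, using the environment name from app.yaml:

```
$ ks apply kf-cs -c t2t-code-search-datagen
```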
@@ -1,9 +1,9 @@
-local tfJob = import "kubeflow/tf-job/tf-job.libsonnet";
 local baseParams = std.extVar("__ksonnet/params").components["t2t-job"];
 
 {
   getDatagenCmd(params)::
     [
       "/usr/local/sbin/t2t-entrypoint",
       "t2t-datagen",
       "--problem=" + params.problem,
       "--data_dir=" + params.dataDir,
@@ -24,36 +24,32 @@ local baseParams = std.extVar("__ksonnet/params").components["t2t-job"];
     local trainer = [
       "/usr/local/sbin/t2t-entrypoint",
       "t2t-trainer",
       "--generate_data",
       "--problem=" + params.problem,
       "--model=" + params.model,
       "--hparams_set=" + params.hparams_set,
       "--data_dir=" + params.dataDir,
       "--output_dir=" + params.outputDir,
       "--train_steps=" + std.toString(params.train_steps),
       "--eval_steps=" + std.toString(params.eval_steps),
       "--t2t_usr_dir=/app/code_search/t2t",
     ],
 
-    local workerBase = trainer + [
+    worker: trainer,
+
+    worker_dist: trainer + [
       "--schedule=train",
       "--ps_gpu=" + std.toString(params.numPsGpu),
       "--worker_gpu=" + std.toString(params.numWorkerGpu),
-      "--worker_replicas=" + std.toString(params.numWorker + params.numMaster),
+      "--worker_replicas=" + std.toString(params.numWorker),
       "--ps_replicas=" + std.toString(params.numPs),
       "--eval_steps=" + std.toString(params.eval_steps),
+      "--worker_job=/job:worker",
     ],
 
     ps: trainer + [
       "--schedule=run_std_server",
       "--ps_job=/job:ps",
     ],
 
-    worker: workerBase + [
-      "--worker_job=/job:worker",
-    ],
-
-    master: workerBase + [
-      "--worker_job=/job:master",
-    ],
   },
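Putting the list together: for the defaults above (`train_steps: 100`, `eval_steps: 10`) the plain `worker` command expands to roughly the following; the GCS paths are placeholders:

```
$ /usr/local/sbin/t2t-entrypoint t2t-trainer \
    --generate_data \
    --problem=github_function_docstring \
    --model=similarity_transformer \
    --hparams_set=transformer_tiny \
    --data_dir=gs://<working-dir>/data \
    --output_dir=gs://<working-dir>/output \
    --train_steps=100 --eval_steps=10 \
    --t2t_usr_dir=/app/code_search/t2t
```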
@@ -61,9 +57,9 @@ local baseParams = std.extVar("__ksonnet/params").components["t2t-job"];
       image: image,
       name: "tensorflow",
       [if std.length(args) > 0 then "args"]: args,
-      [if numGpus > 0 then "resources"]: {
+      resources: {
         limits: {
-          "nvidia.com/gpu": numGpus,
+          [if numGpus > 0 then "nvidia.com/gpu"]: numGpus,
         },
       },
       [if std.length(env) > 0 then "env"]: env,
@@ -76,34 +72,26 @@ local baseParams = std.extVar("__ksonnet/params").components["t2t-job"];
         containers: [ containerSpec ],
         [if std.length(imagePullSecrets) > 0 then "imagePullSecrets"]: imagePullSecrets,
         [if std.length(volumes) > 0 then "volumes"]: volumes,
-        restartPolicy: "OnFailure",
+        // restartPolicy: "OnFailure",
       },
     },
-    tfReplicaType: replicaType,
   },
 
   parts(newParams, env):: {
     local params = baseParams + newParams,
 
-    local terminationPolicy = if params.numMaster == 1
-                              then tfJob.parts.tfJobTerminationPolicy("MASTER", 0)
-                              else tfJob.parts.tfJobTerminationPolicy("WORKER", 0),
-
     local workerImage = if params.numWorkerGpu > 0 then params.imageGpu else params.image,
-    local workerImagePullSecrets = [
-      { name: "gcp-registry-credentials" },
-    ],
     local workerEnv = [
       {
         name: "GOOGLE_APPLICATION_CREDENTIALS",
-        value: "/secret/gcp-credentials/key.json"
+        value: "/secret/gcp-credentials/user-gcp-sa.json"
       },
     ],
     local workerVolumes = [
       {
         name: "gcp-credentials",
         secret: {
-          secretName: "gcp-credentials",
+          secretName: "user-gcp-sa",
         },
       },
     ],
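Inside a running pod the credentials should land where the env var points; a hedged check (the pod name here is a placeholder):

```
$ kubectl exec -n kubeflow <trainer-pod-name> -- \
    sh -c 'echo $GOOGLE_APPLICATION_CREDENTIALS && ls /secret/gcp-credentials'
```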
@@ -115,26 +103,32 @@ local baseParams = std.extVar("__ksonnet/params").components["t2t-job"];
     ],
 
     local cmd = $.getTrainerCmd(params),
+    local workerCmd = if params.jobType == "exporter" then $.getExporterCmd(params)
+                      else if params.jobType == "datagen" then $.getDatagenCmd(params)
+                      else cmd.worker,
 
-    job::
-      tfJob.parts.tfJob(
-        params.name,
-        env.namespace,
-        if params.jobType == "exporter" then
-          [
-            $.tfJobReplica("MASTER", params.numMaster, $.getExporterCmd(params), workerImage, params.numWorkerGpu,
-                           workerImagePullSecrets, workerEnv, workerVolumes, workerVolumeMounts),
-          ]
-        else
-          [
-            $.tfJobReplica("MASTER", params.numMaster, cmd.master, workerImage, params.numWorkerGpu,
-                           workerImagePullSecrets, workerEnv, workerVolumes, workerVolumeMounts),
-            $.tfJobReplica("WORKER", params.numWorker, cmd.worker, workerImage, params.numWorkerGpu,
-                           workerImagePullSecrets, workerEnv, workerVolumes, workerVolumeMounts),
-            $.tfJobReplica("PS", params.numPs, cmd.ps, workerImage, params.numPsGpu,
-                           workerImagePullSecrets, workerEnv, workerVolumes, workerVolumeMounts),
-          ],
-        terminationPolicy
-      ),
+    job:: {
+      apiVersion: "kubeflow.org/v1alpha2",
+      kind: "TFJob",
+      metadata: {
+        name: params.name,
+        namespace: env.namespace,
+      },
+      spec: {
+        tfReplicaSpecs: {
+          [if params.numPs > 0 then "PS"]: $.tfJobReplica("PS", params.numPs, cmd.ps, workerImage,
+                                                          numGpus=params.numPsGpu,
+                                                          env=workerEnv,
+                                                          volumes=workerVolumes,
+                                                          volumeMounts=workerVolumeMounts),
+          [if params.numWorker > 0 then "Worker"]: $.tfJobReplica("WORKER", params.numWorker,
+                                                                  workerCmd, workerImage,
+                                                                  numGpus=params.numPsGpu,
+                                                                  env=workerEnv,
+                                                                  volumes=workerVolumes,
+                                                                  volumeMounts=workerVolumeMounts),
+        },
+      },
+    },
   },
 }
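Once applied, the v1alpha2 TFJob is an ordinary custom resource; a sketch of inspecting it, assuming the `kubeflow` namespace:

```
$ kubectl get tfjobs -n kubeflow
$ kubectl describe tfjob t2t-code-search-trainer -n kubeflow
```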
@@ -1,91 +0,0 @@
-<!-- START doctoc generated TOC please keep comment here to allow auto update -->
-<!-- DON'T EDIT THIS SECTION, INSTEAD RE-RUN doctoc TO UPDATE -->
-**Table of Contents** *generated with [DocToc](https://github.com/thlorenz/doctoc)*
-
-- [tf-job](#tf-job)
-  - [Quickstart](#quickstart)
-  - [Using the library](#using-the-library)
-    - [io.ksonnet.pkg.tf-job](#ioksonnetpkgtf-job)
-      - [Example](#example)
-      - [Parameters](#parameters)
-      - [Example](#example-1)
-      - [Parameters](#parameters-1)
-
-<!-- END doctoc generated TOC please keep comment here to allow auto update -->
-
-# tf-job
-
-> Prototypes for running TensorFlow jobs.
-
-
-* [Quickstart](#quickstart)
-* [Using Prototypes](#using-prototypes)
-  * [io.ksonnet.pkg.tf-job](#io.ksonnet.pkg.tf-job)
-  * [io.ksonnet.pkg.tf-cnn](#io.ksonnet.pkg.tf-cnn)
-
-## Quickstart
-
-*The following commands use the `io.ksonnet.pkg.tf-job` prototype to generate Kubernetes YAML for tf-job, and then deploys it to your Kubernetes cluster.*
-
-First, create a cluster and install the ksonnet CLI (see root-level [README.md](rootReadme)).
-
-If you haven't yet created a [ksonnet application](linkToSomewhere), do so using `ks init <app-name>`.
-
-Finally, in the ksonnet application directory, run the following:
-
-```shell
-# Expand prototype as a Jsonnet file, place in a file in the
-# `components/` directory. (YAML and JSON are also available.)
-$ ks prototype use io.ksonnet.pkg.tf-job tf-job \
-  --namespace default \
-  --name tf-job
-
-# Apply to server.
-$ ks apply -f tf-job.jsonnet
-```
-
-## Using the library
-
-The library files for tf-job define a set of relevant *parts* (_e.g._, deployments, services, secrets, and so on) that can be combined to configure tf-job for a wide variety of scenarios. For example, a database like Redis may need a secret to hold the user password, or it may have no password if it's acting as a cache.
-
-This library provides a set of pre-fabricated "flavors" (or "distributions") of tf-job, each of which is configured for a different use case. These are captured as ksonnet *prototypes*, which allow users to interactively customize these distributions for their specific needs.
-
-These prototypes, as well as how to use them, are enumerated below.
-
-### io.ksonnet.pkg.tf-job
-
-A TensorFlow job (could be training or evaluation).
-#### Example
-
-```shell
-# Expand prototype as a Jsonnet file, place in a file in the
-# `components/` directory. (YAML and JSON are also available.)
-$ ks prototype use io.ksonnet.pkg.tf-job tf-job \
-  --name YOUR_NAME_HERE
-```
-
-#### Parameters
-
-The available options to pass prototype are:
-
-* `--name=<name>`: Name to give to each of the components [string]
-### io.ksonnet.pkg.tf-cnn
-
-A TensorFlow CNN Benchmarking job
-#### Example
-
-```shell
-# Expand prototype as a Jsonnet file, place in a file in the
-# `components/` directory. (YAML and JSON are also available.)
-$ ks prototype use io.ksonnet.pkg.tf-cnn tf-job \
-  --name YOUR_NAME_HERE
-```
-
-#### Parameters
-
-The available options to pass prototype are:
-
-* `--name=<name>`: Name for the job. [string]
-
-
-[rootReadme]: https://github.com/ksonnet/mixins
@@ -1,35 +0,0 @@
-{
-  "name": "tf-job",
-  "apiVersion": "0.0.1",
-  "kind": "ksonnet.io/parts",
-  "description": "Prototypes for running TensorFlow jobs.\n",
-  "author": "kubeflow team <kubeflow-team@google.com>",
-  "contributors": [
-    {
-      "name": "Jeremy Lewi",
-      "email": "jlewi@google.com"
-    }
-  ],
-  "repository": {
-    "type": "git",
-    "url": "https://github.com/kubeflow/kubeflow"
-  },
-  "bugs": {
-    "url": "https://github.com/kubeflow/kubeflow/issues"
-  },
-  "keywords": [
-    "kubeflow",
-    "tensorflow",
-    "database"
-  ],
-  "quickStart": {
-    "prototype": "io.ksonnet.pkg.tf-job",
-    "componentName": "tf-job",
-    "flags": {
-      "name": "tf-job",
-      "namespace": "default"
-    },
-    "comment": "Run TensorFlow Job"
-  },
-  "license": "Apache 2.0"
-}
@@ -1,65 +0,0 @@
-// @apiVersion 0.1
-// @name io.ksonnet.pkg.tf-job
-// @description A TensorFlow job (could be training or evaluation).
-// @shortDescription A TensorFlow job.
-// @param name string Name to give to each of the components
-// @optionalParam namespace string null Namespace to use for the components. It is automatically inherited from the environment if not set.
-// @optionalParam args string null Comma separated list of arguments to pass to the job
-// @optionalParam image string null The docker image to use for the job.
-// @optionalParam image_gpu string null The docker image to use when using GPUs.
-// @optionalParam image_pull_secrets string null Comma-delimited list of secret names to use credentials in pulling your docker images.
-// @optionalParam num_masters number 1 The number of masters to use
-// @optionalParam num_ps number 0 The number of ps to use
-// @optionalParam num_workers number 0 The number of workers to use
-// @optionalParam num_gpus number 0 The number of GPUs to attach to workers.
-
-// TODO(https://github.com/ksonnet/ksonnet/issues/235): ks param set args won't work if the arg starts with "--".
-
-local k = import "k.libsonnet";
-local tfJob = import "kubeflow/tf-job/tf-job.libsonnet";
-// updatedParams uses the environment namespace if
-// the namespace parameter is not explicitly set
-local updatedParams = params {
-  namespace: if params.namespace == "null" then env.namespace else params.namespace,
-};
-
-local name = import "param://name";
-local namespace = updatedParams.namespace;
-
-local argsParam = import "param://args";
-local args =
-  if argsParam == "null" then
-    []
-  else
-    std.split(argsParam, ",");
-
-local image = import "param://image";
-local imageGpu = import "param://image_gpu";
-local imagePullSecrets = import "param://image_pull_secrets";
-local numMasters = import "param://num_masters";
-local numPs = import "param://num_ps";
-local numWorkers = import "param://num_workers";
-local numGpus = import "param://num_gpus";
-
-local terminationPolicy = if numMasters == 1 then
-  tfJob.parts.tfJobTerminationPolicy("MASTER", 0)
-else
-  tfJob.parts.tfJobTerminationPolicy("WORKER", 0);
-
-local workerSpec = if numGpus > 0 then
-  tfJob.parts.tfJobReplica("WORKER", numWorkers, args, imageGpu, imagePullSecrets, numGpus)
-else
-  tfJob.parts.tfJobReplica("WORKER", numWorkers, args, image, imagePullSecrets);
-
-std.prune(k.core.v1.list.new([
-  tfJob.parts.tfJob(
-    name,
-    namespace,
-    [
-      tfJob.parts.tfJobReplica("MASTER", numMasters, args, image, imagePullSecrets),
-      workerSpec,
-      tfJob.parts.tfJobReplica("PS", numPs, args, image, imagePullSecrets),
-    ],
-    terminationPolicy
-  ),
-]))
@@ -1,59 +0,0 @@
-local k = import "k.libsonnet";
-local util = import "util.libsonnet";
-
-{
-  parts:: {
-    tfJobReplica(replicaType, number, args, image, imagePullSecrets=[], numGpus=0)::
-      local baseContainer = {
-        image: image,
-        name: "tensorflow",
-      };
-      local containerArgs = if std.length(args) > 0 then
-        {
-          args: args,
-        }
-      else {};
-      local resources = if numGpus > 0 then {
-        resources: {
-          limits: {
-            "nvidia.com/gpu": numGpus,
-          },
-        },
-      } else {};
-      if number > 0 then
-        {
-          replicas: number,
-          template: {
-            spec: {
-              imagePullSecrets: [{ name: secret } for secret in util.toArray(imagePullSecrets)],
-              containers: [
-                baseContainer + containerArgs + resources,
-              ],
-              restartPolicy: "OnFailure",
-            },
-          },
-          tfReplicaType: replicaType,
-        }
-      else {},
-
-    tfJobTerminationPolicy(replicaName, replicaIndex):: {
-      chief: {
-        replicaName: replicaName,
-        replicaIndex: replicaIndex,
-      },
-    },
-
-    tfJob(name, namespace, replicas, tp):: {
-      apiVersion: "kubeflow.org/v1alpha1",
-      kind: "TFJob",
-      metadata: {
-        name: name,
-        namespace: namespace,
-      },
-      spec: {
-        replicaSpecs: replicas,
-        terminationPolicy: tp,
-      },
-    },
-  },
-}
@@ -1,7 +0,0 @@
-{
-  // Convert a comma-delimited string to an array.
-  toArray(str)::
-    if std.type(str) == "string" && str != "null" && std.length(str) > 0 then
-      std.split(str, ",")
-    else [],
-}
@@ -122,7 +122,6 @@
       args: [
         "/usr/bin/tensorflow_model_server",
         "--port=9000",
-        "--rest_api_port=8000",
        "--model_name=" + $.params.modelName,
        "--model_base_path=" + $.params.modelPath,
      ],
@@ -130,9 +129,6 @@
      {
        containerPort: 9000,
      },
-     {
-       containerPort: 8000,
-     },
    ],
    // TODO(jlewi): We should add readiness and liveness probes. I think the blocker is that
    // model-server doesn't have something we can use out of the box.
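With the REST port removed, only gRPC on 9000 remains. A sketch of reaching it locally; the service name is assumed to match the serving component:

```
$ kubectl port-forward -n kubeflow svc/t2t-code-search 9000:9000
```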