diff --git a/code_search/README.md b/code_search/README.md
index c5ed5834..038af3d7 100644
--- a/code_search/README.md
+++ b/code_search/README.md
@@ -38,16 +38,6 @@ $ gcloud auth configure-docker
 
 See [Google Cloud Docs](https://cloud.google.com/docs/) for more.
 
-### Create Kubernetes Secrets
-
-This is needed for deployed pods in the Kubernetes cluster to access Google Cloud resources.
-
-```
-$ PROJECT=my-project ./create_secrets.sh
-```
-
-**NOTE**: Use `create_secrets.sh -d` to remove any side-effects of the above step.
-
 ### Python Environment Setup
 
 This demo needs multiple Python versions and `virtualenv` is an easy way to
@@ -74,7 +64,7 @@ See [Virtualenv Docs](https://virtualenv.pypa.io/en/stable/) for more.
 To install dependencies, run the following commands
 
 ```
-(env2.7) $ pip install https://github.com/kubeflow/batch-predict/tarball/master
+(env2.7) $ pip install https://github.com/activatedgeek/batch-predict/tarball/fix-value-provider
 (env2.7) $ pip install src/
 ```
 
diff --git a/code_search/create_secrets.sh b/code_search/create_secrets.sh
deleted file mode 100755
index 6ba834ba..00000000
--- a/code_search/create_secrets.sh
+++ /dev/null
@@ -1,57 +0,0 @@
-#!/usr/bin/env bash
-
-##
-# This script creates all the necessary service accounts and permissions
-# needed for the training jobs to pull private images from
-# Google Cloud Registry and access Google Cloud Storage. To
-# undo all the changes made, add a "-d" flag while executing the
-# script.
-#
-
-set -ex
-
-export PROJECT=${PROJECT:-}
-
-if [[ -z "${PROJECT}" ]]; then
-  echo "PROJECT environment variable missing!"
-  exit 1
-fi
-
-export SA_NAME=code-search-access
-export SA_EMAIL=${SA_NAME}@${PROJECT}.iam.gserviceaccount.com
-export SA_KEY_FILE=${SA_EMAIL}.key.json
-
-
-if [[ "${1}" = "-d" ]]; then
-  gcloud projects remove-iam-policy-binding ${PROJECT} \
-      --member=serviceAccount:${SA_EMAIL} \
-      --role=roles/storage.admin
-
-  gcloud iam service-accounts delete ${SA_EMAIL} --quiet
-
-  rm -f ${SA_KEY_FILE}
-
-  kubectl delete secret gcp-credentials gcp-registry-credentials
-
-  exit 0
-fi
-
-
-gcloud iam service-accounts create ${SA_NAME} --display-name ${SA_EMAIL}
-
-gcloud projects add-iam-policy-binding ${PROJECT} \
-    --member=serviceAccount:${SA_EMAIL} \
-    --role=roles/storage.admin
-
-gcloud iam service-accounts keys create ${SA_KEY_FILE} \
-    --iam-account=${SA_EMAIL}
-
-kubectl create secret docker-registry gcp-registry-credentials \
-    --docker-server=https://gcr.io \
-    --docker-username=_json_key \
-    --docker-password="$(cat ${SA_KEY_FILE})" \
-    --docker-email=${SA_EMAIL}
-
-kubectl create secret generic gcp-credentials \
-    --from-file=key.json="${SA_KEY_FILE}"
-
diff --git a/code_search/docker/t2t/build.sh b/code_search/docker/t2t/build.sh
index a766cb17..c209aaae 100755
--- a/code_search/docker/t2t/build.sh
+++ b/code_search/docker/t2t/build.sh
@@ -8,23 +8,36 @@
 set -ex
 
-GPU=${GPU:-0}
-BASE_IMAGE_TAG=$([[ "${GPU}" = "1" ]] && echo "1.8.0-gpu" || echo "1.8.0")
 BUILD_IMAGE_UUID=$(python3 -c 'import uuid; print(uuid.uuid4().hex[:7]);')
-BUILD_IMAGE_TAG="code-search:v$(date +%Y%m%d)$([[ ${GPU} = "1" ]] && echo '-gpu' || echo '')-${BUILD_IMAGE_UUID}"
+BUILD_IMAGE_TAG="code-search:v$(date +%Y%m%d)-${BUILD_IMAGE_UUID}"
 
 # Directory of this script used for path references
 _SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 
 pushd "${_SCRIPT_DIR}"
 
-docker build -f "${_SCRIPT_DIR}/Dockerfile" -t ${BUILD_IMAGE_TAG} --build-arg BASE_IMAGE_TAG=${BASE_IMAGE_TAG} "${_SCRIPT_DIR}/../.."
+# Build CPU image
+docker build -f "${_SCRIPT_DIR}/Dockerfile" \
+  -t ${BUILD_IMAGE_TAG} \
+  --build-arg BASE_IMAGE_TAG=1.8.0 \
+  "${_SCRIPT_DIR}/../.."
 
-# Push image to GCR PROJECT available
+# Build GPU image
+docker build -f "${_SCRIPT_DIR}/Dockerfile" \
+  -t ${BUILD_IMAGE_TAG}-gpu \
+  --build-arg BASE_IMAGE_TAG=1.8.0-gpu \
+  "${_SCRIPT_DIR}/../.."
+
+# Push images to GCR Project if available
 PROJECT=${PROJECT:-}
 if [[ ! -z "${PROJECT}" ]]; then
+  # Tag and push CPU image
   docker tag ${BUILD_IMAGE_TAG} gcr.io/${PROJECT}/${BUILD_IMAGE_TAG}
   docker push gcr.io/${PROJECT}/${BUILD_IMAGE_TAG}
+
+  # Tag and push GPU image
+  docker tag ${BUILD_IMAGE_TAG}-gpu gcr.io/${PROJECT}/${BUILD_IMAGE_TAG}-gpu
+  docker push gcr.io/${PROJECT}/${BUILD_IMAGE_TAG}-gpu
 fi
 
 popd
 
diff --git a/code_search/docker/t2t/t2t-entrypoint.sh b/code_search/docker/t2t/t2t-entrypoint.sh
index 09ae00ba..970c1bb3 100755
--- a/code_search/docker/t2t/t2t-entrypoint.sh
+++ b/code_search/docker/t2t/t2t-entrypoint.sh
@@ -16,11 +16,13 @@ TF_CONFIG=${TF_CONFIG:-}
 if [[ ! -z "${TF_CONFIG}" ]]; then
   WORKER_ID=$(echo "${TF_CONFIG}" | jq ".task.index")
   WORKER_TYPE=$(echo "${TF_CONFIG}" | jq -r ".task.type")
-  MASTER_INSTANCE=$(echo "${TF_CONFIG}" | jq -r ".cluster.master[0]")
+  MASTER_INSTANCE=$(echo "${TF_CONFIG}" | jq -r ".cluster.${WORKER_TYPE}[${WORKER_ID}]")
 
-  if [[ "${TARGET_BIN}" = "t2t-trainer" ]]; then
-    TARGET_BIN_OPTS="${TARGET_BIN_OPTS} --master=grpc://${MASTER_INSTANCE} --worker_id=${WORKER_ID}"
-  fi
+  # FIXME(sanyamkapoor): Distributed training hangs. See kubeflow/examples#208.
+  # if [[ "${TARGET_BIN}" = "t2t-trainer" ]]; then
+  #   TARGET_BIN_OPTS="${TARGET_BIN_OPTS} --master=grpc://${MASTER_INSTANCE} --worker_id=${WORKER_ID}"
+  # fi
+  unset TF_CONFIG
 fi
 
 EVAL_CMD="${TARGET_BIN} ${TARGET_BIN_OPTS} ${@:2}"
diff --git a/code_search/kubeflow/app.yaml b/code_search/kubeflow/app.yaml
index ace7ad86..66e69805 100644
--- a/code_search/kubeflow/app.yaml
+++ b/code_search/kubeflow/app.yaml
@@ -1,37 +1,23 @@
-apiVersion: 0.1.0
+apiVersion: 0.2.0
 environments:
-  code-search:
+  kf-cs:
     destination:
       namespace: kubeflow
-      server: https://35.193.190.6
+      server: https://35.232.164.190
     k8sVersion: v1.9.6
-    path: code-search
+    path: kf-cs
 kind: ksonnet.io/app
 libraries:
-  tf-job:
-    gitVersion:
-      commitSha: d8e19a4762406bb454453331f52ed5a4433c0df9
-      refSpec: master
-    name: tf-job
-    registry: kubeflow
   tf-serving:
-    gitVersion:
-      commitSha: e1b2aee865866b2e7e4f8c41b34ae03b4c4bb0db
-      refSpec: master
     name: tf-serving
     registry: kubeflow
+    version: e95f94a1a97a0974ada734895d590b5ba565fa77
 name: kubeflow
 registries:
   incubator:
-    gitVersion:
-      commitSha: 40285d8a14f1ac5787e405e1023cf0c07f6aa28c
-      refSpec: master
     protocol: github
    uri: github.com/ksonnet/parts/tree/master/incubator
   kubeflow:
-    gitVersion:
-      commitSha: d8e19a4762406bb454453331f52ed5a4433c0df9
-      refSpec: master
     protocol: github
-    uri: github.com/kubeflow/kubeflow/tree/master/kubeflow
+    uri: https://github.com/kubeflow/kubeflow/tree/v0.2.2/kubeflow
 version: 0.0.1
diff --git a/code_search/kubeflow/components/params.libsonnet b/code_search/kubeflow/components/params.libsonnet
index 12a3c667..cdea4357 100644
--- a/code_search/kubeflow/components/params.libsonnet
+++ b/code_search/kubeflow/components/params.libsonnet
@@ -2,6 +2,7 @@
   global: {
     // User-defined global parameters; accessible to all component and environments, Ex:
     // replicas: 4,
+    t2tWorkingDir: "gs://kubeflow-examples/t2t-code-search/20180802",
   },
   components: {
     // Component-level parameters, defined initially from 'ks prototype use ...'
@@ -9,8 +10,7 @@
     "t2t-job": {
       jobType: "trainer",
 
-      numMaster: 1,
-      numWorker: 0,
+      numWorker: 1,
       numPs: 0,
       numWorkerGpu: 0,
       numPsGpu: 0,
@@ -18,8 +18,8 @@
       train_steps: 100,
       eval_steps: 10,
 
-      image: "gcr.io/kubeflow-dev/code-search:v20180719-f04a4b7",
-      imageGpu: "gcr.io/kubeflow-dev/code-search:v20180719-gpu-9b8b4a8",
+      image: "gcr.io/kubeflow-dev/code-search:v20180802-c622aac",
+      imageGpu: "gcr.io/kubeflow-dev/code-search:v20180802-c622aac-gpu",
       imagePullSecrets: [],
 
       dataDir: "null",
@@ -28,18 +28,23 @@
       hparams_set: "null",
     },
 
+    "t2t-code-search-datagen": {
+      jobType: "datagen",
+
+      name: "t2t-code-search-datagen",
+
+      problem: "github_function_docstring",
+      dataDir: $.global.t2tWorkingDir + "/data",
+    },
+
     "t2t-code-search-trainer": {
       jobType: "trainer",
-      numWorker: 2,
-      numPs: 1,
-      // numWorkerGpu: 1,
-      // numPsGpu: 1,
 
       name: "t2t-code-search-trainer",
 
       problem: "github_function_docstring",
-      dataDir: "gs://kubeflow-examples/t2t-code-search/data",
-      outputDir: "gs://kubeflow-examples/t2t-code-search/output",
+      dataDir: $.global.t2tWorkingDir + "/data",
+      outputDir: $.global.t2tWorkingDir + "/output",
       model: "similarity_transformer",
       hparams_set: "transformer_tiny",
     },
@@ -50,8 +55,8 @@
       name: "t2t-code-search-exporter",
 
       problem: "github_function_docstring",
-      dataDir: "gs://kubeflow-examples/t2t-code-search/data",
-      outputDir: "gs://kubeflow-examples/t2t-code-search/output",
+      dataDir: $.global.t2tWorkingDir + "/data",
+      outputDir: $.global.t2tWorkingDir + "/output",
       model: "similarity_transformer",
       hparams_set: "transformer_tiny",
     },
@@ -59,8 +64,8 @@
     "t2t-code-search-serving": {
       name: "t2t-code-search",
 
-      modelName: "t2t_code_search",
-      modelPath: "gs://kubeflow-examples/t2t-code-search/output/export/Servo",
+      modelName: "t2t-code-search",
+      modelPath: $.global.t2tWorkingDir + "/output/export/Servo",
       modelServerImage: "gcr.io/kubeflow-images-public/tensorflow-serving-1.8:latest",
       cloud: "gcp",
       gcpCredentialSecretName: "gcp-credentials",
diff --git a/code_search/kubeflow/components/t2t-code-search-datagen.jsonnet b/code_search/kubeflow/components/t2t-code-search-datagen.jsonnet
new file mode 100644
index 00000000..f60167c6
--- /dev/null
+++ b/code_search/kubeflow/components/t2t-code-search-datagen.jsonnet
@@ -0,0 +1,7 @@
+local k = import "k.libsonnet";
+local t2tJob = import "t2t-job.libsonnet";
+
+local env = std.extVar("__ksonnet/environments");
+local params = std.extVar("__ksonnet/params").components["t2t-code-search-datagen"];
+
+std.prune(k.core.v1.list.new([t2tJob.parts(params, env).job]))
diff --git a/code_search/kubeflow/components/t2t-job.libsonnet b/code_search/kubeflow/components/t2t-job.libsonnet
index e9f5cd24..0da3a33c 100644
--- a/code_search/kubeflow/components/t2t-job.libsonnet
+++ b/code_search/kubeflow/components/t2t-job.libsonnet
@@ -1,9 +1,9 @@
-local tfJob = import "kubeflow/tf-job/tf-job.libsonnet";
 local baseParams = std.extVar("__ksonnet/params").components["t2t-job"];
 
 {
   getDatagenCmd(params):: [
+    "/usr/local/sbin/t2t-entrypoint",
     "t2t-datagen",
     "--problem=" + params.problem,
     "--data_dir=" + params.dataDir,
@@ -24,36 +24,32 @@ local baseParams = std.extVar("__ksonnet/params").components["t2t-job"];
     local trainer = [
       "/usr/local/sbin/t2t-entrypoint",
       "t2t-trainer",
-      "--generate_data",
       "--problem=" + params.problem,
       "--model=" + params.model,
       "--hparams_set=" + params.hparams_set,
       "--data_dir=" + params.dataDir,
       "--output_dir=" + params.outputDir,
       "--train_steps=" + std.toString(params.train_steps),
+      "--eval_steps=" + std.toString(params.eval_steps),
+      "--t2t_usr_dir=/app/code_search/t2t",
     ],
 
-    local workerBase = trainer + [
+    worker: trainer,
+
+    worker_dist: trainer + [
       "--schedule=train",
       "--ps_gpu=" + std.toString(params.numPsGpu),
       "--worker_gpu=" + std.toString(params.numWorkerGpu),
-      "--worker_replicas=" + std.toString(params.numWorker + params.numMaster),
+      "--worker_replicas=" + std.toString(params.numWorker),
       "--ps_replicas=" + std.toString(params.numPs),
       "--eval_steps=" + std.toString(params.eval_steps),
+      "--worker_job=/job:worker",
     ],
 
     ps: trainer + [
       "--schedule=run_std_server",
      "--ps_job=/job:ps",
     ],
-
-    worker: workerBase + [
-      "--worker_job=/job:worker",
-    ],
-
-    master: workerBase + [
-      "--worker_job=/job:master",
-    ],
   },
 
   tfJobReplica(replicaType, number, args, image, numGpus=0, imagePullSecrets=[], env=[], volumes=[], volumeMounts=[])::
@@ -61,9 +57,9 @@ local baseParams = std.extVar("__ksonnet/params").components["t2t-job"];
       image: image,
       name: "tensorflow",
       [if std.length(args) > 0 then "args"]: args,
-      [if numGpus > 0 then "resources"]: {
+      resources: {
         limits: {
-          "nvidia.com/gpu": numGpus,
+          [if numGpus > 0 then "nvidia.com/gpu"]: numGpus,
         },
       },
       [if std.length(env) > 0 then "env"]: env,
@@ -76,34 +72,26 @@ local baseParams = std.extVar("__ksonnet/params").components["t2t-job"];
         containers: [ containerSpec ],
         [if std.length(imagePullSecrets) > 0 then "imagePullSecrets"]: imagePullSecrets,
         [if std.length(volumes) > 0 then "volumes"]: volumes,
-        restartPolicy: "OnFailure",
+        // restartPolicy: "OnFailure",
       },
     },
-    tfReplicaType: replicaType,
   },
 
   parts(newParams, env):: {
     local params = baseParams + newParams,
 
-    local terminationPolicy = if params.numMaster == 1
-                              then tfJob.parts.tfJobTerminationPolicy("MASTER", 0)
-                              else tfJob.parts.tfJobTerminationPolicy("WORKER", 0),
-
     local workerImage = if params.numWorkerGpu > 0 then params.imageGpu else params.image,
-    local workerImagePullSecrets = [
-      { name: "gcp-registry-credentials" },
-    ],
     local workerEnv = [
       {
        name: "GOOGLE_APPLICATION_CREDENTIALS",
-        value: "/secret/gcp-credentials/key.json"
+        value: "/secret/gcp-credentials/user-gcp-sa.json"
       },
     ],
     local workerVolumes = [
      {
        name: "gcp-credentials",
        secret: {
-          secretName: "gcp-credentials",
+          secretName: "user-gcp-sa",
        },
      },
    ],
@@ -115,26 +103,32 @@ local baseParams = std.extVar("__ksonnet/params").components["t2t-job"];
     ],
 
     local cmd = $.getTrainerCmd(params),
+    local workerCmd = if params.jobType == "exporter" then $.getExporterCmd(params)
+                      else if params.jobType == "datagen" then $.getDatagenCmd(params)
+                      else cmd.worker,
 
-    job::
-      tfJob.parts.tfJob(
-        params.name,
-        env.namespace,
-        if params.jobType == "exporter" then
-          [
-            $.tfJobReplica("MASTER", params.numMaster, $.getExporterCmd(params), workerImage, params.numWorkerGpu,
-                           workerImagePullSecrets, workerEnv, workerVolumes, workerVolumeMounts),
-          ]
-        else
-          [
-            $.tfJobReplica("MASTER", params.numMaster, cmd.master, workerImage, params.numWorkerGpu,
-                           workerImagePullSecrets, workerEnv, workerVolumes, workerVolumeMounts),
-            $.tfJobReplica("WORKER", params.numWorker, cmd.worker, workerImage, params.numWorkerGpu,
-                           workerImagePullSecrets, workerEnv, workerVolumes, workerVolumeMounts),
-            $.tfJobReplica("PS", params.numPs, cmd.ps, workerImage, params.numPsGpu,
-                           workerImagePullSecrets, workerEnv, workerVolumes, workerVolumeMounts),
-          ],
-        terminationPolicy
-      ),
+    job:: {
+      apiVersion: "kubeflow.org/v1alpha2",
+      kind: "TFJob",
+      metadata: {
+        name: params.name,
+        namespace: env.namespace,
+      },
+      spec: {
+        tfReplicaSpecs: {
+          [if params.numPs > 0 then "PS"]: $.tfJobReplica("PS", params.numPs, cmd.ps, workerImage,
+                                                          numGpus=params.numPsGpu,
+                                                          env=workerEnv,
+                                                          volumes=workerVolumes,
+                                                          volumeMounts=workerVolumeMounts),
+          [if params.numWorker > 0 then "Worker"]: $.tfJobReplica("WORKER", params.numWorker,
+                                                                  workerCmd, workerImage,
+                                                                  numGpus=params.numPsGpu,
+                                                                  env=workerEnv,
+                                                                  volumes=workerVolumes,
+                                                                  volumeMounts=workerVolumeMounts),
+        },
+      },
+    },
   },
 }
diff --git a/code_search/kubeflow/vendor/kubeflow/tf-job/README.md b/code_search/kubeflow/vendor/kubeflow/tf-job/README.md
deleted file mode 100644
index 9825b264..00000000
--- a/code_search/kubeflow/vendor/kubeflow/tf-job/README.md
+++ /dev/null
@@ -1,91 +0,0 @@
-
-
-**Table of Contents**  *generated with [DocToc](https://github.com/thlorenz/doctoc)*
-
-- [tf-job](#tf-job)
-  - [Quickstart](#quickstart)
-  - [Using the library](#using-the-library)
-    - [io.ksonnet.pkg.tf-job](#ioksonnetpkgtf-job)
-      - [Example](#example)
-      - [Parameters](#parameters)
-      - [Example](#example-1)
-      - [Parameters](#parameters-1)
-
-
-
-# tf-job
-
-> Prototypes for running TensorFlow jobs.
-
-
-* [Quickstart](#quickstart)
-* [Using Prototypes](#using-prototypes)
-  * [io.ksonnet.pkg.tf-job](#io.ksonnet.pkg.tf-job)
-  * [io.ksonnet.pkg.tf-cnn](#io.ksonnet.pkg.tf-cnn)
-
-## Quickstart
-
-*The following commands use the `io.ksonnet.pkg.tf-job` prototype to generate Kubernetes YAML for tf-job, and then deploys it to your Kubernetes cluster.*
-
-First, create a cluster and install the ksonnet CLI (see root-level [README.md](rootReadme)).
-
-If you haven't yet created a [ksonnet application](linkToSomewhere), do so using `ks init `.
-
-Finally, in the ksonnet application directory, run the following:
-
-```shell
-# Expand prototype as a Jsonnet file, place in a file in the
-# `components/` directory. (YAML and JSON are also available.)
-$ ks prototype use io.ksonnet.pkg.tf-job tf-job \
-  --namespace default \
-  --name tf-job
-
-# Apply to server.
-$ ks apply -f tf-job.jsonnet
-```
-
-## Using the library
-
-The library files for tf-job define a set of relevant *parts* (_e.g._, deployments, services, secrets, and so on) that can be combined to configure tf-job for a wide variety of scenarios. For example, a database like Redis may need a secret to hold the user password, or it may have no password if it's acting as a cache.
-
-This library provides a set of pre-fabricated "flavors" (or "distributions") of tf-job, each of which is configured for a different use case. These are captured as ksonnet *prototypes*, which allow users to interactively customize these distributions for their specific needs.
-
-These prototypes, as well as how to use them, are enumerated below.
-
-### io.ksonnet.pkg.tf-job
-
-A TensorFlow job (could be training or evaluation).
-#### Example
-
-```shell
-# Expand prototype as a Jsonnet file, place in a file in the
-# `components/` directory. (YAML and JSON are also available.)
-$ ks prototype use io.ksonnet.pkg.tf-job tf-job \
-  --name YOUR_NAME_HERE
-```
-
-#### Parameters
-
-The available options to pass prototype are:
-
-* `--name=`: Name to give to each of the components [string]
-### io.ksonnet.pkg.tf-cnn
-
-A TensorFlow CNN Benchmarking job
-#### Example
-
-```shell
-# Expand prototype as a Jsonnet file, place in a file in the
-# `components/` directory. (YAML and JSON are also available.)
-$ ks prototype use io.ksonnet.pkg.tf-cnn tf-job \
-  --name YOUR_NAME_HERE
-```
-
-#### Parameters
-
-The available options to pass prototype are:
-
-* `--name=`: Name for the job. [string]
-
-
-[rootReadme]: https://github.com/ksonnet/mixins
diff --git a/code_search/kubeflow/vendor/kubeflow/tf-job/parts.yaml b/code_search/kubeflow/vendor/kubeflow/tf-job/parts.yaml
deleted file mode 100644
index 905ff0a9..00000000
--- a/code_search/kubeflow/vendor/kubeflow/tf-job/parts.yaml
+++ /dev/null
@@ -1,35 +0,0 @@
-{
-  "name": "tf-job",
-  "apiVersion": "0.0.1",
-  "kind": "ksonnet.io/parts",
-  "description": "Prototypes for running TensorFlow jobs.\n",
-  "author": "kubeflow team ",
-  "contributors": [
-    {
-      "name": "Jeremy Lewi",
-      "email": "jlewi@google.com"
-    }
-  ],
-  "repository": {
-    "type": "git",
-    "url": "https://github.com/kubeflow/kubeflow"
-  },
-  "bugs": {
-    "url": "https://github.com/kubeflow/kubeflow/issues"
-  },
-  "keywords": [
-    "kubeflow",
-    "tensorflow",
-    "database"
-  ],
-  "quickStart": {
-    "prototype": "io.ksonnet.pkg.tf-job",
-    "componentName": "tf-job",
-    "flags": {
-      "name": "tf-job",
-      "namespace": "default"
-    },
-    "comment": "Run TensorFlow Job"
-  },
-  "license": "Apache 2.0"
-}
diff --git a/code_search/kubeflow/vendor/kubeflow/tf-job/prototypes/tf-job.jsonnet b/code_search/kubeflow/vendor/kubeflow/tf-job/prototypes/tf-job.jsonnet
deleted file mode 100644
index f3e5e68b..00000000
--- a/code_search/kubeflow/vendor/kubeflow/tf-job/prototypes/tf-job.jsonnet
+++ /dev/null
@@ -1,65 +0,0 @@
-// @apiVersion 0.1
-// @name io.ksonnet.pkg.tf-job
-// @description A TensorFlow job (could be training or evaluation).
-// @shortDescription A TensorFlow job.
-// @param name string Name to give to each of the components
-// @optionalParam namespace string null Namespace to use for the components. It is automatically inherited from the environment if not set.
-// @optionalParam args string null Comma separated list of arguments to pass to the job
-// @optionalParam image string null The docker image to use for the job.
-// @optionalParam image_gpu string null The docker image to use when using GPUs.
-// @optionalParam image_pull_secrets string null Comma-delimited list of secret names to use credentials in pulling your docker images.
-// @optionalParam num_masters number 1 The number of masters to use
-// @optionalParam num_ps number 0 The number of ps to use
-// @optionalParam num_workers number 0 The number of workers to use
-// @optionalParam num_gpus number 0 The number of GPUs to attach to workers.
-
-// TODO(https://github.com/ksonnet/ksonnet/issues/235): ks param set args won't work if the arg starts with "--".
-
-local k = import "k.libsonnet";
-local tfJob = import "kubeflow/tf-job/tf-job.libsonnet";
-// updatedParams uses the environment namespace if
-// the namespace parameter is not explicitly set
-local updatedParams = params {
-  namespace: if params.namespace == "null" then env.namespace else params.namespace,
-};
-
-local name = import "param://name";
-local namespace = updatedParams.namespace;
-
-local argsParam = import "param://args";
-local args =
-  if argsParam == "null" then
-    []
-  else
-    std.split(argsParam, ",");
-
-local image = import "param://image";
-local imageGpu = import "param://image_gpu";
-local imagePullSecrets = import "param://image_pull_secrets";
-local numMasters = import "param://num_masters";
-local numPs = import "param://num_ps";
-local numWorkers = import "param://num_workers";
-local numGpus = import "param://num_gpus";
-
-local terminationPolicy = if numMasters == 1 then
-  tfJob.parts.tfJobTerminationPolicy("MASTER", 0)
-else
-  tfJob.parts.tfJobTerminationPolicy("WORKER", 0);
-
-local workerSpec = if numGpus > 0 then
-  tfJob.parts.tfJobReplica("WORKER", numWorkers, args, imageGpu, imagePullSecrets, numGpus)
-else
-  tfJob.parts.tfJobReplica("WORKER", numWorkers, args, image, imagePullSecrets);
-
-std.prune(k.core.v1.list.new([
-  tfJob.parts.tfJob(
-    name,
-    namespace,
-    [
-      tfJob.parts.tfJobReplica("MASTER", numMasters, args, image, imagePullSecrets),
-      workerSpec,
-      tfJob.parts.tfJobReplica("PS", numPs, args, image, imagePullSecrets),
-    ],
-    terminationPolicy
-  ),
-]))
diff --git a/code_search/kubeflow/vendor/kubeflow/tf-job/tf-job.libsonnet b/code_search/kubeflow/vendor/kubeflow/tf-job/tf-job.libsonnet
deleted file mode 100644
index 2ac2f744..00000000
--- a/code_search/kubeflow/vendor/kubeflow/tf-job/tf-job.libsonnet
+++ /dev/null
@@ -1,59 +0,0 @@
-local k = import "k.libsonnet";
-local util = import "util.libsonnet";
-
-{
-  parts:: {
-    tfJobReplica(replicaType, number, args, image, imagePullSecrets=[], numGpus=0)::
-      local baseContainer = {
-        image: image,
-        name: "tensorflow",
-      };
-      local containerArgs = if std.length(args) > 0 then
-        {
-          args: args,
-        }
-      else {};
-      local resources = if numGpus > 0 then {
-        resources: {
-          limits: {
-            "nvidia.com/gpu": numGpus,
-          },
-        },
-      } else {};
-      if number > 0 then
-        {
-          replicas: number,
-          template: {
-            spec: {
-              imagePullSecrets: [{ name: secret } for secret in util.toArray(imagePullSecrets)],
-              containers: [
-                baseContainer + containerArgs + resources,
-              ],
-              restartPolicy: "OnFailure",
-            },
-          },
-          tfReplicaType: replicaType,
-        }
-      else {},
-
-    tfJobTerminationPolicy(replicaName, replicaIndex):: {
-      chief: {
-        replicaName: replicaName,
-        replicaIndex: replicaIndex,
-      },
-    },
-
-    tfJob(name, namespace, replicas, tp):: {
-      apiVersion: "kubeflow.org/v1alpha1",
-      kind: "TFJob",
-      metadata: {
-        name: name,
-        namespace: namespace,
-      },
-      spec: {
-        replicaSpecs: replicas,
-        terminationPolicy: tp,
-      },
-    },
-  },
-}
diff --git a/code_search/kubeflow/vendor/kubeflow/tf-job/util.libsonnet b/code_search/kubeflow/vendor/kubeflow/tf-job/util.libsonnet
deleted file mode 100644
index d5458f40..00000000
--- a/code_search/kubeflow/vendor/kubeflow/tf-job/util.libsonnet
+++ /dev/null
@@ -1,7 +0,0 @@
-{
-  // Convert a comma-delimited string to an array.
-  toArray(str)::
-    if std.type(str) == "string" && str != "null" && std.length(str) > 0 then
-      std.split(str, ",")
-    else [],
-}
diff --git a/code_search/kubeflow/vendor/kubeflow/tf-serving/README.md b/code_search/kubeflow/vendor/kubeflow/tf-serving@e95f94a1a97a0974ada734895d590b5ba565fa77/README.md
similarity index 100%
rename from code_search/kubeflow/vendor/kubeflow/tf-serving/README.md
rename to code_search/kubeflow/vendor/kubeflow/tf-serving@e95f94a1a97a0974ada734895d590b5ba565fa77/README.md
diff --git a/code_search/kubeflow/vendor/kubeflow/tf-serving/parts.yaml b/code_search/kubeflow/vendor/kubeflow/tf-serving@e95f94a1a97a0974ada734895d590b5ba565fa77/parts.yaml
similarity index 100%
rename from code_search/kubeflow/vendor/kubeflow/tf-serving/parts.yaml
rename to code_search/kubeflow/vendor/kubeflow/tf-serving@e95f94a1a97a0974ada734895d590b5ba565fa77/parts.yaml
diff --git a/code_search/kubeflow/vendor/kubeflow/tf-serving/prototypes/tf-serving-all-features.jsonnet b/code_search/kubeflow/vendor/kubeflow/tf-serving@e95f94a1a97a0974ada734895d590b5ba565fa77/prototypes/tf-serving-all-features.jsonnet
similarity index 100%
rename from code_search/kubeflow/vendor/kubeflow/tf-serving/prototypes/tf-serving-all-features.jsonnet
rename to code_search/kubeflow/vendor/kubeflow/tf-serving@e95f94a1a97a0974ada734895d590b5ba565fa77/prototypes/tf-serving-all-features.jsonnet
diff --git a/code_search/kubeflow/vendor/kubeflow/tf-serving/tf-serving.libsonnet b/code_search/kubeflow/vendor/kubeflow/tf-serving@e95f94a1a97a0974ada734895d590b5ba565fa77/tf-serving.libsonnet
similarity index 99%
rename from code_search/kubeflow/vendor/kubeflow/tf-serving/tf-serving.libsonnet
rename to code_search/kubeflow/vendor/kubeflow/tf-serving@e95f94a1a97a0974ada734895d590b5ba565fa77/tf-serving.libsonnet
index 8c867452..61273cf7 100644
--- a/code_search/kubeflow/vendor/kubeflow/tf-serving/tf-serving.libsonnet
+++ b/code_search/kubeflow/vendor/kubeflow/tf-serving@e95f94a1a97a0974ada734895d590b5ba565fa77/tf-serving.libsonnet
@@ -122,7 +122,6 @@
           args: [
             "/usr/bin/tensorflow_model_server",
             "--port=9000",
-            "--rest_api_port=8000",
             "--model_name=" + $.params.modelName,
             "--model_base_path=" + $.params.modelPath,
           ],
@@ -130,9 +129,6 @@
            {
              containerPort: 9000,
            },
-            {
-              containerPort: 8000,
-            },
           ],
           // TODO(jlewi): We should add readiness and liveness probes. I think the blocker is that
           // model-server doesn't have something we can use out of the box.
diff --git a/code_search/kubeflow/vendor/kubeflow/tf-serving/util.libsonnet b/code_search/kubeflow/vendor/kubeflow/tf-serving@e95f94a1a97a0974ada734895d590b5ba565fa77/util.libsonnet
similarity index 100%
rename from code_search/kubeflow/vendor/kubeflow/tf-serving/util.libsonnet
rename to code_search/kubeflow/vendor/kubeflow/tf-serving@e95f94a1a97a0974ada734895d590b5ba565fa77/util.libsonnet