mirror of https://github.com/kubeflow/examples.git

Language task on kubeflow (#143)

* [WIP] initialize ksonnet app
* Push images to GCR
* Upgrade Docker container to run T2T entrypoint with appropriate env vars
* Add a tf-job based t2t-job
* Fix GPU parameters

parent 242c2e6d20, commit 4bd30a1e68

@@ -72,53 +72,82 @@ $ python preprocess/scripts/process_github_archive.py -i files/select_github_arc
## 2. Model Training

A `Dockerfile` based on TensorFlow is provided, which has all the dependencies for this part of the pipeline.
By default, it is based off the TensorFlow CPU 1.8.0 image for `Python3`, but this can be overridden when building the Docker image.
A build script is provided that builds and pushes the Docker image to Google Container Registry.

### 2.1 Build & Push images to GCR

**NOTE**: The images can be pushed to any registry of choice, but the rest of this guide assumes GCR.

* Authenticate with GCR
```
$ gcloud auth configure-docker
```

* Setup environment variables
```
$ export PROJECT=<your_project>             # (optional) project ID; if not set, the image is not pushed to GCR
$ export BUILD_IMAGE_TAG=code-search:devel  # (optional) to change the built image tag
$ export BASE_IMAGE_TAG=1.8.0-gpu-py3       # (optional) for a GPU base image
```

* Build and push the image
```
$ ./language_task/build_image.sh
```

See [GCR Pushing and Pulling Images](https://cloud.google.com/container-registry/docs/pushing-and-pulling) for more.
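
To verify the push, you can list the repository (a sketch, assuming `gcloud` is authenticated against `$PROJECT`):
```
$ gcloud container images list --repository=gcr.io/${PROJECT}
$ gcloud container images list-tags gcr.io/${PROJECT}/code-search
```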

### 2.2 Train Locally

**WARNING**: The container might run out of memory and be killed.

#### 2.2.1 Function Summarizer

This part generates a model to summarize functions into docstrings, using the data generated in the
previous step. It uses `tensor2tensor`.

* Generate `TFRecords` for training
```
$ export MOUNT_DATA_DIR=/path/to/data/folder
$ docker run --rm -it -v ${MOUNT_DATA_DIR}:/data ${BUILD_IMAGE_TAG} \
      t2t-datagen --problem=github_function_summarizer --data_dir=/data
```

* Train a transduction model using `Transformer Networks` and a base hyperparameter set
```
$ export MOUNT_DATA_DIR=/path/to/data/folder
$ export MOUNT_OUTPUT_DIR=/path/to/output/folder
$ docker run --rm -it -v ${MOUNT_DATA_DIR}:/data -v ${MOUNT_OUTPUT_DIR}:/output ${BUILD_IMAGE_TAG} \
      t2t-trainer --problem=github_function_summarizer --data_dir=/data --output_dir=/output \
      --model=transformer --hparams_set=transformer_base
```
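
To monitor training, you can point TensorBoard at the mounted output directory (a sketch; assumes TensorBoard is installed on the host):
```
$ tensorboard --logdir=${MOUNT_OUTPUT_DIR} --port=6006
# then browse to http://localhost:6006
```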
#### 2.2.2 Docstrings Language Model

This part trains a language model based on the docstrings in the dataset, also using `tensor2tensor`.

* Generate `TFRecords` for training
```
$ export MOUNT_DATA_DIR=/path/to/data/folder
$ docker run --rm -it -v ${MOUNT_DATA_DIR}:/data ${BUILD_IMAGE_TAG} \
      t2t-datagen --problem=github_docstring_language_model --data_dir=/data
```

* Train a language model using `Transformer Networks` and a custom hyperparameter set
```
$ export MOUNT_DATA_DIR=/path/to/data/folder
$ export MOUNT_OUTPUT_DIR=/path/to/output/folder
$ docker run --rm -it -v ${MOUNT_DATA_DIR}:/data -v ${MOUNT_OUTPUT_DIR}:/output ${BUILD_IMAGE_TAG} \
      t2t-trainer --problem=github_docstring_language_model --data_dir=/data --output_dir=/output \
      --model=transformer --hparams_set=transformer_gh_lm
```

### 2.3 Train on Kubeflow

TODO
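
Until this section is filled in, a plausible flow with the ksonnet app from this commit might be (a sketch only; the app directory name and the `default` environment are assumptions):
```
$ cd ks-app    # hypothetical ksonnet app directory
$ ks param set t2t-gh-summarizer outputDir gs://<your-bucket>/code-search/train
$ ks apply default -c t2t-gh-summarizer
```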

# Acknowledgements

This project derives from [hamelsmu/code_search](https://github.com/hamelsmu/code_search).

@@ -0,0 +1,5 @@

/lib
/.ksonnet/registries
/app.override.yaml
/.ks_environment
/environments

@@ -0,0 +1,31 @@

apiVersion: 0.1.0
environments:
  default:
    destination:
      namespace: kubeflow
      server: https://130.211.225.204
    k8sVersion: v1.9.6
    path: default
kind: ksonnet.io/app
libraries:
  tf-job:
    gitVersion:
      commitSha: d8e19a4762406bb454453331f52ed5a4433c0df9
      refSpec: master
    name: tf-job
    registry: kubeflow
name: kubeflow
registries:
  incubator:
    gitVersion:
      commitSha: 40285d8a14f1ac5787e405e1023cf0c07f6aa28c
      refSpec: master
    protocol: github
    uri: github.com/ksonnet/parts/tree/master/incubator
  kubeflow:
    gitVersion:
      commitSha: d8e19a4762406bb454453331f52ed5a4433c0df9
      refSpec: master
    protocol: github
    uri: github.com/kubeflow/kubeflow/tree/master/kubeflow
version: 0.0.1

@@ -0,0 +1,33 @@

{
  global: {
    // User-defined global parameters; accessible to all components and environments, e.g.:
    // replicas: 4,
  },
  components: {
    // Component-level parameters, defined initially from 'ks prototype use ...'
    // Each object below should correspond to a component in the components/ directory
    "t2t-job": {
      numWorker: 1,
      numMaster: 1,
      numPs: 1,
      numWorkerGpu: 0,
      numPsGpu: 0,

      train_steps: 100,
      eval_steps: 10,

      image: "gcr.io/kubeflow-dev/code-search:devel",
      imageGpu: "gcr.io/kubeflow-dev/code-search:gpu-devel",
      imagePullSecrets: [],
    },

    "t2t-gh-summarizer": {
      "name": "github_function_summarizer",
      "problem": "github_function_summarizer",
      "dataDir": "gs://kubeflow-dev/code-search/raw_data",
      "outputDir": "gs://kubeflow-dev/code-search/train",
      "model": "transformer",
      "hparams_set": "transformer_base"
    },
  },
}
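
These defaults can be overridden from the ksonnet CLI without editing the file, for example (run inside the ksonnet app directory):
```
$ ks param set t2t-job train_steps 1000
$ ks param set t2t-gh-summarizer outputDir gs://<your-bucket>/code-search/train
$ ks param list t2t-job
```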

@@ -0,0 +1,7 @@

local k = import "k.libsonnet";
local t2tJob = import "t2t-job.libsonnet";

local env = std.extVar("__ksonnet/environments");
local params = std.extVar("__ksonnet/params").components["t2t-gh-summarizer"];

std.prune(k.core.v1.list.new([t2tJob.parts(params, env).job]))
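
To inspect the TFJob manifest this component expands to before applying it (assuming an environment named `default`):
```
$ ks show default -c t2t-gh-summarizer
```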

@@ -0,0 +1,67 @@

local tfJob = import "kubeflow/tf-job/tf-job.libsonnet";
local baseParams = std.extVar("__ksonnet/params").components["t2t-job"];

{
  parts(newParams, env):: {
    local params = baseParams + newParams,

    local t2tCmd = {
      datagen: [
        "t2t-datagen",
        "--problem=" + params.problem,
        "--data_dir=" + params.dataDir,
      ],

      trainer: [
        "t2t-trainer",
        "--problem=" + params.problem,
        "--data_dir=" + params.dataDir,
        "--output_dir=" + params.outputDir,
        "--model=" + params.model,
        "--hparams_set=" + params.hparams_set,
        "--train_steps=" + std.toString(params.train_steps),
      ],

      workerBase: self.trainer + [
        "--schedule=train",
        "--ps_gpu=" + std.toString(params.numPsGpu),
        "--worker_gpu=" + std.toString(params.numWorkerGpu),
        "--worker_replicas=" + std.toString(params.numWorker + params.numMaster),
        "--ps_replicas=" + std.toString(params.numPs),
        "--eval_steps=" + std.toString(params.eval_steps),
      ],

      ps: self.trainer + [
        "--schedule=run_std_server",
        "--ps_job=/job:ps",
      ],

      worker: self.workerBase + [
        "--worker_job=/job:worker",
      ],

      master: self.workerBase + [
        "--worker_job=/job:master",
      ],
    },

    local terminationPolicy = if params.numMaster == 1
                              then tfJob.parts.tfJobTerminationPolicy("MASTER", 0)
                              else tfJob.parts.tfJobTerminationPolicy("WORKER", 0),

    local workerImage = if params.numWorkerGpu > 0 then params.imageGpu else params.image,
    local psImage = if params.numPsGpu > 0 then params.imageGpu else params.image,

    job::
      tfJob.parts.tfJob(
        params.name,
        env.namespace,
        [
          tfJob.parts.tfJobReplica("MASTER", params.numMaster, t2tCmd.master, workerImage, params.imagePullSecrets, params.numWorkerGpu),
          tfJob.parts.tfJobReplica("WORKER", params.numWorker, t2tCmd.worker, workerImage, params.imagePullSecrets, params.numWorkerGpu),
          tfJob.parts.tfJobReplica("PS", params.numPs, t2tCmd.ps, psImage, params.imagePullSecrets, params.numPsGpu),
        ],
        terminationPolicy
      ),
  },
}
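
With the default `t2t-job` parameters above merged with the `t2t-gh-summarizer` component, the generated worker command would look roughly like this (illustrative only; `worker_replicas` is `numWorker + numMaster`):
```
t2t-trainer --problem=github_function_summarizer \
    --data_dir=gs://kubeflow-dev/code-search/raw_data \
    --output_dir=gs://kubeflow-dev/code-search/train \
    --model=transformer --hparams_set=transformer_base --train_steps=100 \
    --schedule=train --ps_gpu=0 --worker_gpu=0 --worker_replicas=2 \
    --ps_replicas=1 --eval_steps=10 --worker_job=/job:worker
```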

@@ -0,0 +1,4 @@

local components = std.extVar("__ksonnet/components");
components + {
  // Insert user-specified overrides here.
}

@@ -0,0 +1,91 @@

<!-- START doctoc generated TOC please keep comment here to allow auto update -->
<!-- DON'T EDIT THIS SECTION, INSTEAD RE-RUN doctoc TO UPDATE -->
**Table of Contents** *generated with [DocToc](https://github.com/thlorenz/doctoc)*

- [tf-job](#tf-job)
  - [Quickstart](#quickstart)
  - [Using the library](#using-the-library)
    - [io.ksonnet.pkg.tf-job](#ioksonnetpkgtf-job)
      - [Example](#example)
      - [Parameters](#parameters)
      - [Example](#example-1)
      - [Parameters](#parameters-1)

<!-- END doctoc generated TOC please keep comment here to allow auto update -->

# tf-job

> Prototypes for running TensorFlow jobs.

* [Quickstart](#quickstart)
* [Using Prototypes](#using-prototypes)
  * [io.ksonnet.pkg.tf-job](#io.ksonnet.pkg.tf-job)
  * [io.ksonnet.pkg.tf-cnn](#io.ksonnet.pkg.tf-cnn)

## Quickstart

*The following commands use the `io.ksonnet.pkg.tf-job` prototype to generate Kubernetes YAML for tf-job, and then deploy it to your Kubernetes cluster.*

First, create a cluster and install the ksonnet CLI (see root-level [README.md](rootReadme)).

If you haven't yet created a [ksonnet application](linkToSomewhere), do so using `ks init <app-name>`.

Finally, in the ksonnet application directory, run the following:

```shell
# Expand prototype as a Jsonnet file, place in a file in the
# `components/` directory. (YAML and JSON are also available.)
$ ks prototype use io.ksonnet.pkg.tf-job tf-job \
  --namespace default \
  --name tf-job

# Apply to server.
$ ks apply -f tf-job.jsonnet
```

## Using the library

The library files for tf-job define a set of relevant *parts* (_e.g._, deployments, services, secrets, and so on) that can be combined to configure tf-job for a wide variety of scenarios. For example, a database like Redis may need a secret to hold the user password, or it may have no password if it's acting as a cache.

This library provides a set of pre-fabricated "flavors" (or "distributions") of tf-job, each of which is configured for a different use case. These are captured as ksonnet *prototypes*, which allow users to interactively customize these distributions for their specific needs.

These prototypes, as well as how to use them, are enumerated below.

### io.ksonnet.pkg.tf-job

A TensorFlow job (could be training or evaluation).

#### Example

```shell
# Expand prototype as a Jsonnet file, place in a file in the
# `components/` directory. (YAML and JSON are also available.)
$ ks prototype use io.ksonnet.pkg.tf-job tf-job \
  --name YOUR_NAME_HERE
```

#### Parameters

The available options to pass to the prototype are:

* `--name=<name>`: Name to give to each of the components. [string]

### io.ksonnet.pkg.tf-cnn

A TensorFlow CNN benchmarking job.

#### Example

```shell
# Expand prototype as a Jsonnet file, place in a file in the
# `components/` directory. (YAML and JSON are also available.)
$ ks prototype use io.ksonnet.pkg.tf-cnn tf-job \
  --name YOUR_NAME_HERE
```

#### Parameters

The available options to pass to the prototype are:

* `--name=<name>`: Name for the job. [string]


[rootReadme]: https://github.com/ksonnet/mixins

@@ -0,0 +1,35 @@

{
  "name": "tf-job",
  "apiVersion": "0.0.1",
  "kind": "ksonnet.io/parts",
  "description": "Prototypes for running TensorFlow jobs.\n",
  "author": "kubeflow team <kubeflow-team@google.com>",
  "contributors": [
    {
      "name": "Jeremy Lewi",
      "email": "jlewi@google.com"
    }
  ],
  "repository": {
    "type": "git",
    "url": "https://github.com/kubeflow/kubeflow"
  },
  "bugs": {
    "url": "https://github.com/kubeflow/kubeflow/issues"
  },
  "keywords": [
    "kubeflow",
    "tensorflow",
    "database"
  ],
  "quickStart": {
    "prototype": "io.ksonnet.pkg.tf-job",
    "componentName": "tf-job",
    "flags": {
      "name": "tf-job",
      "namespace": "default"
    },
    "comment": "Run TensorFlow Job"
  },
  "license": "Apache 2.0"
}

@@ -0,0 +1,65 @@

// @apiVersion 0.1
// @name io.ksonnet.pkg.tf-job
// @description A TensorFlow job (could be training or evaluation).
// @shortDescription A TensorFlow job.
// @param name string Name to give to each of the components
// @optionalParam namespace string null Namespace to use for the components. It is automatically inherited from the environment if not set.
// @optionalParam args string null Comma separated list of arguments to pass to the job
// @optionalParam image string null The docker image to use for the job.
// @optionalParam image_gpu string null The docker image to use when using GPUs.
// @optionalParam image_pull_secrets string null Comma-delimited list of secret names to use credentials in pulling your docker images.
// @optionalParam num_masters number 1 The number of masters to use
// @optionalParam num_ps number 0 The number of ps to use
// @optionalParam num_workers number 0 The number of workers to use
// @optionalParam num_gpus number 0 The number of GPUs to attach to workers.

// TODO(https://github.com/ksonnet/ksonnet/issues/235): ks param set args won't work if the arg starts with "--".

local k = import "k.libsonnet";
local tfJob = import "kubeflow/tf-job/tf-job.libsonnet";

// updatedParams uses the environment namespace if
// the namespace parameter is not explicitly set
local updatedParams = params {
  namespace: if params.namespace == "null" then env.namespace else params.namespace,
};

local name = import "param://name";
local namespace = updatedParams.namespace;

local argsParam = import "param://args";
local args =
  if argsParam == "null" then
    []
  else
    std.split(argsParam, ",");

local image = import "param://image";
local imageGpu = import "param://image_gpu";
local imagePullSecrets = import "param://image_pull_secrets";
local numMasters = import "param://num_masters";
local numPs = import "param://num_ps";
local numWorkers = import "param://num_workers";
local numGpus = import "param://num_gpus";

local terminationPolicy = if numMasters == 1 then
  tfJob.parts.tfJobTerminationPolicy("MASTER", 0)
else
  tfJob.parts.tfJobTerminationPolicy("WORKER", 0);

local workerSpec = if numGpus > 0 then
  tfJob.parts.tfJobReplica("WORKER", numWorkers, args, imageGpu, imagePullSecrets, numGpus)
else
  tfJob.parts.tfJobReplica("WORKER", numWorkers, args, image, imagePullSecrets);

std.prune(k.core.v1.list.new([
  tfJob.parts.tfJob(
    name,
    namespace,
    [
      tfJob.parts.tfJobReplica("MASTER", numMasters, args, image, imagePullSecrets),
      workerSpec,
      tfJob.parts.tfJobReplica("PS", numPs, args, image, imagePullSecrets),
    ],
    terminationPolicy
  ),
]))

@@ -0,0 +1,59 @@

local k = import "k.libsonnet";
local util = import "util.libsonnet";

{
  parts:: {
    tfJobReplica(replicaType, number, args, image, imagePullSecrets=[], numGpus=0)::
      local baseContainer = {
        image: image,
        name: "tensorflow",
      };
      local containerArgs = if std.length(args) > 0 then
        {
          args: args,
        }
      else {};
      local resources = if numGpus > 0 then {
        resources: {
          limits: {
            "nvidia.com/gpu": numGpus,
          },
        },
      } else {};
      if number > 0 then
        {
          replicas: number,
          template: {
            spec: {
              imagePullSecrets: [{ name: secret } for secret in util.toArray(imagePullSecrets)],
              containers: [
                baseContainer + containerArgs + resources,
              ],
              restartPolicy: "OnFailure",
            },
          },
          tfReplicaType: replicaType,
        }
      else {},

    tfJobTerminationPolicy(replicaName, replicaIndex):: {
      chief: {
        replicaName: replicaName,
        replicaIndex: replicaIndex,
      },
    },

    tfJob(name, namespace, replicas, tp):: {
      apiVersion: "kubeflow.org/v1alpha1",
      kind: "TFJob",
      metadata: {
        name: name,
        namespace: namespace,
      },
      spec: {
        replicaSpecs: replicas,
        terminationPolicy: tp,
      },
    },
  },
}

@@ -0,0 +1,7 @@

{
  // Convert a comma-delimited string to an array.
  toArray(str)::
    if std.type(str) == "string" && str != "null" && std.length(str) > 0 then
      std.split(str, ",")
    else [],
}
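
For example, evaluating the helper standalone (a sketch; assumes the `jsonnet` CLI, run from the directory containing `util.libsonnet`):
```
$ jsonnet -e '(import "util.libsonnet").toArray("secret-a,secret-b")'
# prints ["secret-a", "secret-b"] (output formatting may differ)
$ jsonnet -e '(import "util.libsonnet").toArray("null")'
# prints []
```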

@@ -6,8 +6,15 @@ FROM tensorflow/tensorflow:$BASE_IMAGE_TAG

ADD requirements.txt /

RUN pip3 --no-cache-dir install -r /requirements.txt &&\
    apt-get update && apt-get install -y jq &&\
    rm -rf /var/lib/apt/lists/*

VOLUME ["/data", "/output"]

ADD t2t_problems/* /t2t_problems/
ADD t2t-entrypoint.sh /usr/local/sbin/t2t-entrypoint

ENV T2T_USR_DIR=/t2t_problems

ENTRYPOINT ["/usr/local/sbin/t2t-entrypoint"]

@@ -2,8 +2,9 @@

set -e

PROJECT=${PROJECT:-}
BASE_IMAGE_TAG=${BASE_IMAGE_TAG:-1.8.0-py3}  # 1.8.0-gpu-py3 for GPU-based image
BUILD_IMAGE_TAG=${BUILD_IMAGE_TAG:-code-search:devel}

# Directory of this script used as docker context
_SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

@@ -12,4 +13,10 @@ pushd "$_SCRIPT_DIR"

docker build -t ${BUILD_IMAGE_TAG} --build-arg BASE_IMAGE_TAG=${BASE_IMAGE_TAG} .

# Push image to GCR if PROJECT is available
if [[ ! -z "${PROJECT}" ]]; then
  docker tag ${BUILD_IMAGE_TAG} gcr.io/${PROJECT}/${BUILD_IMAGE_TAG}
  docker push gcr.io/${PROJECT}/${BUILD_IMAGE_TAG}
fi

popd

@@ -1,36 +0,0 @@

#!/usr/bin/env bash

set -e

# Script Variables
IMAGE_TAG=${IMAGE_TAG:-semantic-code-search:devel}
DOCKER_ENTRYPOINT=${DOCKER_ENTRYPOINT:-}

MOUNT_DATA_DIR=${MOUNT_DATA_DIR:-}
MOUNT_OUTPUT_DIR=${MOUNT_OUTPUT_DIR:-}

DATA_DIR=${DATA_DIR:-/data}
OUTPUT_DIR=${OUTPUT_DIR:-/output}

# Internal Variables
_DOCKER_RUN_OPTS="-it --rm --entrypoint=${DOCKER_ENTRYPOINT}"
_DOCKER_CMD="${@} --t2t_usr_dir=/t2t_problems --tmp_dir=/tmp --data_dir=${DATA_DIR} --output_dir=${OUTPUT_DIR}"

if [[ -z ${DOCKER_ENTRYPOINT} ]]; then
  echo "ERROR: Missing DOCKER_ENTRYPOINT environment variable! Use 't2t-datagen' or 't2t-trainer'"
  exit 1
fi

# Mount local directories (if specified)
if [[ ! -z ${MOUNT_DATA_DIR} ]]; then
  _DOCKER_RUN_OPTS="${_DOCKER_RUN_OPTS} -v ${MOUNT_DATA_DIR}:${DATA_DIR}:rw"
fi

if [[ ! -z ${MOUNT_OUTPUT_DIR} ]]; then
  _DOCKER_RUN_OPTS="${_DOCKER_RUN_OPTS} -v ${MOUNT_OUTPUT_DIR}:${OUTPUT_DIR}:rw"
fi

_FINAL_CMD="docker run ${_DOCKER_RUN_OPTS} ${IMAGE_TAG} ${_DOCKER_CMD}"

echo "${_FINAL_CMD}"
eval "${_FINAL_CMD}"

@@ -0,0 +1,29 @@

#!/usr/bin/env bash

set -e

T2T_USR_DIR=${T2T_USR_DIR:-}
TARGET_BIN="${1}"
TARGET_BIN_OPTS="--tmp_dir=/tmp"

# Add T2T user directory for new problems
if [[ ! -z "${T2T_USR_DIR}" ]]; then
  TARGET_BIN_OPTS="${TARGET_BIN_OPTS} --t2t_usr_dir=${T2T_USR_DIR}"
fi

# Process TF_CONFIG to pass distributed training parameters to `t2t-trainer`
TF_CONFIG=${TF_CONFIG:-}
if [[ ! -z "${TF_CONFIG}" ]]; then
  WORKER_ID=$(echo "${TF_CONFIG}" | jq ".task.index")
  WORKER_TYPE=$(echo "${TF_CONFIG}" | jq -r ".task.type")
  MASTER_INSTANCE=$(echo "${TF_CONFIG}" | jq -r ".cluster.${WORKER_TYPE}[${WORKER_ID}]")

  if [[ "${TARGET_BIN}" = "t2t-trainer" ]]; then
    TARGET_BIN_OPTS="${TARGET_BIN_OPTS} --master=grpc://${MASTER_INSTANCE} --worker_id=${WORKER_ID}"
  fi
fi

EVAL_CMD="${TARGET_BIN} ${TARGET_BIN_OPTS} ${@:2}"

echo "Running command: '${EVAL_CMD}'"
eval "${EVAL_CMD}"
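
For illustration, given a TF_CONFIG like the following (hypothetical values, of the shape the TFJob operator sets), the jq extractions above resolve as shown:
```
$ export TF_CONFIG='{"task":{"type":"worker","index":1},"cluster":{"worker":["w0:2222","w1:2222"],"ps":["ps0:2222"]}}'
$ echo "${TF_CONFIG}" | jq ".task.index"            # -> 1
$ echo "${TF_CONFIG}" | jq -r ".task.type"          # -> worker
$ echo "${TF_CONFIG}" | jq -r ".cluster.worker[1]"  # -> w1:2222
```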