Create a component to submit the Dataflow job to compute embeddings for code search (#324)

* Create a component to submit the Dataflow job to compute embeddings for code search (see the usage sketch below).

* Update Beam to 2.8.0
* Remove nmslib from the Apache Beam requirements.txt; it's not needed and appears
  to have problems installing on the Dataflow workers.

* The spacy model download was failing on Dataflow workers; reinstalling the spacy
  package via pip first appears to fix this.

* Fix some bugs in the workflow for building the Docker images.

* Split requirements.txt into separate requirements files for the Dataflow
  workers and the UI.

* We don't want to install unnecessary dependencies on the Dataflow workers.
  Some unnecessary dependencies (e.g. nmslib) were also having problems
  installing on the workers.
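
For reference, a minimal sketch of how the new component might be driven with ksonnet once this change is in. The component and parameter names come from the diff below; the environment name and parameter values are hypothetical placeholders:

    # Point the component at a project and a trained model (placeholder values),
    # then create the Kubernetes Job that submits the Dataflow pipeline.
    ks param set submit-code-embeddings-job project my-gcp-project
    ks param set submit-code-embeddings-job modelDir gs://my-bucket/models/export/1234567890/
    ks param set submit-code-embeddings-job targetDataset code_search
    ks apply my-env -c submit-code-embeddings-job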
Jeremy Lewi 2018-11-15 05:45:09 +08:00 committed by k8s-ci-robot
parent 6c976342a3
commit 26c400a4cd
14 changed files with 165 additions and 21 deletions

@@ -30,7 +30,7 @@ build-cpu:
@echo Built $(IMG):$(TAG)
# TODO(jlewi): We could always use build.jsonnet and then just
# Parse out the docker build command.
build-gpu:
docker build -f "./docker/t2t/Dockerfile" \
-t $(IMG)-gpu:$(TAG) \
@@ -49,34 +49,34 @@ build-dataflow:
build: build-cpu build-gpu build-dataflow
# Build using GCB. This is useful if we are on a slow internet connection
# and don't want to pull
build-gcb:
mkdir -p build
jsonnet ./docker/t2t/build.jsonnet --ext-str gitVersion=$(GIT_VERSION) --ext-str tag=$(TAG) \
> ./build/build.json
cp -r ./docker ./build/
- cp -r ./src ../build/
+ cp -r ./src ./build/
rm -rf ./build/src/code_search/dataflow/cli/test_data
rm -rf ./build/src/code_search/t2t/test_data
- gcloud builds submit --project=kubeflow-ci --config=./build/build.json ./build
+ gcloud builds submit --machine-type=n1-highcpu-32 --project=kubeflow-ci --config=./build/build.json ./build
# Build but don't attach the latest tag. This allows manual testing/inspection of the image
# first.
push-cpu: build-cpu
gcloud docker --authorize-only
docker push $(IMG):$(TAG)
@echo Pushed $(IMG):$(TAG)
push-gpu: build-gpu
gcloud docker --authorize-only
docker push $(IMG)-gpu:$(TAG)
@echo Pushed $(IMG)-gpu:$(TAG)
push-trainer: push-cpu push-gpu
push-dataflow: build-dataflow
gcloud docker --authorize-only
docker push $(IMG)-dataflow:$(TAG)
@echo Pushed $(IMG)-dataflow:$(TAG)
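
For reference, hypothetical invocations of the affected targets, run from the directory containing this Makefile (target names are taken from the diff above):

    # Build all images with Google Cloud Build:
    make build-gcb
    # Or build the Dataflow image locally and push it:
    make push-dataflow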

@@ -7,8 +7,8 @@ FROM python:2.7-jessie
# so we need to install them for Python2.
# We do this before copying the code because we don't want to have to
# reinstall the requirements just because the code changed.
- COPY src/requirements.txt /tmp/requirements.txt
- RUN pip install -r /tmp/requirements.txt
+ COPY src/requirements.dataflow.txt /tmp/requirements.dataflow.txt
+ RUN pip install -r /tmp/requirements.dataflow.txt
RUN pip install https://github.com/kubeflow/batch-predict/tarball/master
# install the spacy model

@@ -1,21 +1,34 @@
// TODO(jlewi): We should tag the image latest and then
// use latest as a cache so that rebuilds are fast
// https://cloud.google.com/cloud-build/docs/speeding-up-builds#using_a_cached_docker_image
{
"steps": [
{
"name": "gcr.io/cloud-builders/docker",
"args": ["build", "-t", "gcr.io/kubeflow-examples/code-search:" + std.extVar("tag"),
"--label=git-versions=" + std.extVar("gitVersion"),
"--build-arg", "BASE_IMAGE_TAG=1.11.0",
"./docker/t2t"],
"--label=git-versions=" + std.extVar("gitVersion"),
"--build-arg", "BASE_IMAGE_TAG=1.11.0",
"--file=docker/t2t/Dockerfile", "."],
"waitFor": ["-"],
},
{
"name": "gcr.io/cloud-builders/docker",
"args": ["build", "-t", "gcr.io/kubeflow-examples/code-search-gpu:" + std.extVar("tag"),
"--label=git-versions=" + std.extVar("gitVersion"),
"--build-arg", "BASE_IMAGE_TAG=1.11.0-gpu",
"./docker/t2t"],
"--label=git-versions=" + std.extVar("gitVersion"),
"--build-arg", "BASE_IMAGE_TAG=1.11.0-gpu",
"--file=docker/t2t/Dockerfile", "."],
"waitFor": ["-"],
},
{
"name": "gcr.io/cloud-builders/docker",
"args": ["build", "-t", "gcr.io/kubeflow-examples/code-search-dataflow:" + std.extVar("tag"),
"--label=git-versions=" + std.extVar("gitVersion"),
"--file=docker/t2t/Dockerfile.dataflow", "."],
"waitFor": ["-"],
},
],
"images": ["gcr.io/kubeflow-examples/code-search:" + std.extVar("tag"),
"gcr.io/kubeflow-examples/code-search-gpu:" + std.extVar("tag")],
"gcr.io/kubeflow-examples/code-search-gpu:" + std.extVar("tag"),
"gcr.io/kubeflow-examples/code-search-dataflow:" + std.extVar("tag")],
}

@@ -7,5 +7,9 @@
train_steps: 200000,
eval_steps: 100,
hparams_set: "transformer_base",
project: "code-search-demo",
modelDir: "gs://code-search-demo/models/20181107-dist-sync-gpu/export/1541712907/",
problem: "kf_github_function_docstring",
model: "kf_similarity_transformer",
},
}

@@ -20,7 +20,7 @@
eval_steps: 10,
image: "gcr.io/kubeflow-examples/code-search:" + imageTag,
imageGpu: "gcr.io/kubeflow-examples/code-search-gpu:" + imageTag,
dataflowImage: "gcr.io/kubeflow-examples/code-search-dataflow:v20181106-v0.2-76-g611636c-dirty-860631",
dataflowImage: "gcr.io/kubeflow-examples/code-search-dataflow:v20181109-dc79384",
imagePullSecrets: [],
// TODO(jlewi): dataDir doesn't seem to be used.
@@ -106,6 +106,20 @@
numWorkers: 5,
project: "",
},
"submit-code-embeddings-job": {
name: "submit-code-embeddings-job",
image: $.components["t2t-job"].dataflowImage,
// BigQuery table where results will be written.
targetDataset: "code_search",
workingDir: $.components["t2t-code-search"].workingDir,
dataDir: self.workingDir + "/data",
// Directory where the model is stored.
modelDir: "",
jobName: "submit-code-embeddings-job",
workerMachineType: "n1-highcpu-32",
numWorkers: 5,
project: "",
},
tensorboard: {
image: "tensorflow/tensorflow:1.8.0",

@@ -0,0 +1,14 @@
// Submit a Dataflow job to compute the code embeddings using a trained model.
local k = import "k.libsonnet";
local experiments = import "experiments.libsonnet";
local lib = import "submit-code-embeddings-job.libsonnet";
local env = std.extVar("__ksonnet/environments");
local baseParams = std.extVar("__ksonnet/params").components["submit-code-embeddings-job"];
local experimentName = baseParams.experiment;
local params = baseParams + experiments[experimentName] + {
name: experimentName + "-embed-code",
};
std.prune(k.core.v1.list.new([lib.parts(params,env).job]))

@@ -0,0 +1,74 @@
{
parts(params, env):: {
// Submit a Dataflow job to compute the code embeddings using a trained model.
job :: {
apiVersion: "batch/v1",
kind: "Job",
metadata: {
name: params.name,
namespace: env.namespace,
labels: {
app: params.name,
},
},
spec: {
replicas: 1,
template: {
metadata: {
labels: {
app: params.name,
},
},
spec: {
// Don't restart because all the job should do is launch the Dataflow job.
restartPolicy: "Never",
containers: [
{
name: "dataflow",
image: params.image,
command: [
"python2",
"-m",
"code_search.dataflow.cli.create_function_embeddings",
"--runner=DataflowRunner",
"--project=" + params.project,
"--target_dataset=" + params.targetDataset,
"--data_dir=" + params.dataDir,
"--problem=" + params.problem,
"--job_name=" + params.jobName,
"--saved_model_dir=" + params.modelDir,
"--temp_location=" + params.workingDir + "/dataflow/temp",
"--staging_location=" + params.workingDir + "/dataflow/staging",
"--worker_machine_type=" + params.workerMachineType,
"--num_workers=" + params.numWorkers,
"--requirements_file=requirements.dataflow.txt",
],
env: [
{
name: "GOOGLE_APPLICATION_CREDENTIALS",
value: "/secret/gcp-credentials/user-gcp-sa.json",
},
],
workingDir: "/src",
volumeMounts: [
{
mountPath: "/secret/gcp-credentials",
name: "gcp-credentials",
},
], //volumeMounts
},
], // containers
volumes: [
{
name: "gcp-credentials",
secret: {
secretName: "user-gcp-sa",
},
},
],
}, // spec
},
},
}, // job
}, // parts
}

@@ -42,6 +42,7 @@ local jobSpec = {
"--staging_location=" + params.workingDir + "/dataflow/staging",
"--worker_machine_type=" + params.workerMachineType,
"--num_workers=" + params.numWorkers,
"--requirements_file=requirements.dataflow.txt",
],
env: [
{

@@ -5,4 +5,5 @@
workingDir: "gs://code-search-demo/20181104",
dataDir: "gs://code-search-demo/20181104/data",
project: "code-search-demo",
experiment: "demo-trainer-11-07-dist-sync-gpu",
}

@@ -1,3 +1,6 @@
"""Dataflow job to compute function embeddings."""
import logging
import apache_beam as beam
import code_search.dataflow.cli.arguments as arguments
@@ -45,9 +48,16 @@ def create_function_embeddings(argv=None):
)
result = pipeline.run()
logging.info("Submitted Dataflow job: %s", result)
if args.wait_until_finish:
result.wait_until_finish()
if __name__ == '__main__':
logging.basicConfig(level=logging.INFO,
format=('%(levelname)s|%(asctime)s'
'|%(pathname)s|%(lineno)d| %(message)s'),
datefmt='%Y-%m-%dT%H:%M:%S',
)
logging.getLogger().setLevel(logging.INFO)
create_function_embeddings()
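
For reference, a sketch of how this entry point might be invoked directly. The flags mirror those passed by the ksonnet component above; the project, dataset, and GCS paths are hypothetical placeholders:

    python2 -m code_search.dataflow.cli.create_function_embeddings \
      --runner=DataflowRunner \
      --project=my-gcp-project \
      --target_dataset=code_search \
      --problem=kf_github_function_docstring \
      --data_dir=gs://my-bucket/data \
      --saved_model_dir=gs://my-bucket/models/export/1234567890/ \
      --job_name=compute-function-embeddings \
      --temp_location=gs://my-bucket/dataflow/temp \
      --staging_location=gs://my-bucket/dataflow/staging \
      --worker_machine_type=n1-highcpu-32 \
      --num_workers=5 \
      --requirements_file=requirements.dataflow.txt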

@@ -0,0 +1,11 @@
# Requirements for the Dataflow jobs.
# We want to avoid unnecessary dependencies as these dependencies are installed on each
# worker.
astor~=0.7.0
apache-beam[gcp]~=2.8.0
nltk~=3.3.0
oauth2client~=4.1.0
spacy~=2.0.0
tensor2tensor~=1.9.0
tensorflow~=1.11.0
pybind11~=2.2.4

@@ -1,8 +1,6 @@
astor~=0.7.0
- apache-beam[gcp]~=2.6.0
+ apache-beam[gcp]~=2.8.0
Flask~=1.0.0
nltk~=3.3.0
nmslib~=1.7.0
oauth2client~=4.1.0
requests~=2.19.0
spacy~=2.0.0

@@ -9,6 +9,10 @@ with open('requirements.txt', 'r') as f:
install_requires = f.readlines()
CUSTOM_COMMANDS = [
# TODO(jlewi): python -m is complaining that the spacy module is not found even
# though it should be installed via requirements. Reinstalling
# it using a custom command appears to fix the problem.
['pip', 'install', 'spacy'],
['python', '-m', 'spacy', 'download', 'en'],
# TODO(sanyamkapoor): This isn't ideal but no other way for a seamless install right now.
['pip', 'install', 'https://github.com/kubeflow/batch-predict/tarball/master']