mirror of https://github.com/kubeflow/examples.git
Create a component to submit the Dataflow job to compute embeddings for code search (#324)
* Create a component to submit the Dataflow job to compute embeddings for code search.
* Update Beam to 2.8.0.
* Remove nmslib from the Apache Beam requirements.txt; it's not needed and appears to have problems installing on the Dataflow workers.
* The spacy download was failing on the Dataflow workers; reinstalling the spacy package as a pip package appears to fix this.
* Fix some bugs in the workflow for building the Docker images.
* Split requirements.txt into separate requirements for the Dataflow workers and the UI. We don't want to install unnecessary dependencies on the Dataflow workers; some unnecessary dependencies, e.g. nmslib, were also having problems being installed on the workers.
This commit is contained in:
parent 6c976342a3
commit 26c400a4cd
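The main motivation behind this change is keeping the Dataflow workers lean: Beam stages the file passed via --requirements_file and pip-installs it on every worker, so a separate requirements.dataflow.txt avoids shipping UI-only dependencies such as nmslib. As a hedged illustration (not code from this commit; the real entrypoint is code_search.dataflow.cli.create_function_embeddings), a minimal Beam pipeline might wire that flag up like this:

    # Minimal sketch, not code from this commit: how a Beam pipeline picks up a
    # slimmed-down requirements file so Dataflow workers only install what they need.
    import apache_beam as beam
    from apache_beam.options.pipeline_options import PipelineOptions, SetupOptions


    def run(argv=None):
      options = PipelineOptions(argv)
      # The requirements file is staged and pip-installed on every Dataflow worker,
      # which is why it should list only what the workers actually import.
      options.view_as(SetupOptions).requirements_file = "requirements.dataflow.txt"

      with beam.Pipeline(options=options) as p:
        _ = (p
             | "Create" >> beam.Create(["def add(a, b): return a + b"])
             | "Identity" >> beam.Map(lambda snippet: snippet))


    if __name__ == "__main__":
      run(["--runner=DirectRunner"])

The new ksonnet component below passes the same flag to the real pipeline as --requirements_file=requirements.dataflow.txt.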
@@ -30,7 +30,7 @@ build-cpu:
	@echo Built $(IMG):$(TAG)

# TODO(jlewi): We could always use build.jsonnet and then just
-# Parse out the docker build command.
+# Parse out the docker build command.
build-gpu:
	docker build -f "./docker/t2t/Dockerfile" \
		-t $(IMG)-gpu:$(TAG) \
@@ -49,34 +49,34 @@ build-dataflow:
build: build-cpu build-gpu build-dataflow

# Build using GCB. This is useful if we are on a slow internet connection
-# and don't want to pull
+# and don't want to pull
build-gcb:
	mkdir -p build
	jsonnet ./docker/t2t/build.jsonnet --ext-str gitVersion=$(GIT_VERSION) --ext-str tag=$(TAG) \
-		> ./build/build.json
+		> ./build/build.json
	cp -r ./docker ./build/
-	cp -r ./src ../build/
+	cp -r ./src ./build/
	rm -rf ./build/src/code_search/dataflow/cli/test_data
	rm -rf ./build/src/code_search/t2t/test_data
-	gcloud builds submit --project=kubeflow-ci --config=./build/build.json ./build
+	gcloud builds submit --machine-type=n1-highcpu-32 --project=kubeflow-ci --config=./build/build.json ./build


# Build but don't attach the latest tag. This allows manual testing/inspection of the image
# first.
push-cpu: build-cpu
-	gcloud docker --authorize-only
+	gcloud docker --authorize-only
	docker push $(IMG):$(TAG)
	@echo Pushed $(IMG):$(TAG)

push-gpu: build-gpu
-	gcloud docker --authorize-only
+	gcloud docker --authorize-only
	docker push $(IMG)-gpu:$(TAG)
	@echo Pushed $(IMG)-gpu:$(TAG)

push-trainer: push-cpu push-gpu

push-dataflow: build-dataflow
-	gcloud docker --authorize-only
+	gcloud docker --authorize-only
	docker push $(IMG)-dataflow:$(TAG)
	@echo Pushed $(IMG)-dataflow:$(TAG)

@@ -7,8 +7,8 @@ FROM python:2.7-jessie
# so we need to install them for Python2.
# We do this before copying the code because we don't want to have
# to reinstall the requirements just because the code changed.
-COPY src/requirements.txt /tmp/requirements.txt
-RUN pip install -r /tmp/requirements.txt
+COPY src/requirements.dataflow.txt /tmp/requirements.dataflow.txt
+RUN pip install -r /tmp/requirements.dataflow.txt
RUN pip install https://github.com/kubeflow/batch-predict/tarball/master

# install the spacy model
@@ -1,21 +1,34 @@
// TODO(jlewi): We should tag the image latest and then
// use latest as a cache so that rebuilds are fast
// https://cloud.google.com/cloud-build/docs/speeding-up-builds#using_a_cached_docker_image
{

  "steps": [
    {
      "name": "gcr.io/cloud-builders/docker",
      "args": ["build", "-t", "gcr.io/kubeflow-examples/code-search:" + std.extVar("tag"),
-              "--label=git-versions=" + std.extVar("gitVersion"),
-              "--build-arg", "BASE_IMAGE_TAG=1.11.0",
-              "./docker/t2t"],
+              "--label=git-versions=" + std.extVar("gitVersion"),
+              "--build-arg", "BASE_IMAGE_TAG=1.11.0",
+              "--file=docker/t2t/Dockerfile", "."],
      "waitFor": ["-"],
    },
    {
      "name": "gcr.io/cloud-builders/docker",
      "args": ["build", "-t", "gcr.io/kubeflow-examples/code-search-gpu:" + std.extVar("tag"),
-              "--label=git-versions=" + std.extVar("gitVersion"),
-              "--build-arg", "BASE_IMAGE_TAG=1.11.0-gpu",
-              "./docker/t2t"],
+              "--label=git-versions=" + std.extVar("gitVersion"),
+              "--build-arg", "BASE_IMAGE_TAG=1.11.0-gpu",
+              "--file=docker/t2t/Dockerfile", "."],
      "waitFor": ["-"],
    },
+   {
+     "name": "gcr.io/cloud-builders/docker",
+     "args": ["build", "-t", "gcr.io/kubeflow-examples/code-search-dataflow:" + std.extVar("tag"),
+              "--label=git-versions=" + std.extVar("gitVersion"),
+              "--file=docker/t2t/Dockerfile.dataflow", "."],
+     "waitFor": ["-"],
+   },
  ],
  "images": ["gcr.io/kubeflow-examples/code-search:" + std.extVar("tag"),
-            "gcr.io/kubeflow-examples/code-search-gpu:" + std.extVar("tag")],
+            "gcr.io/kubeflow-examples/code-search-gpu:" + std.extVar("tag"),
+            "gcr.io/kubeflow-examples/code-search-dataflow:" + std.extVar("tag")],
}
@@ -7,5 +7,9 @@
    train_steps: 200000,
    eval_steps: 100,
    hparams_set: "transformer_base",
+   project: "code-search-demo",
+   modelDir: "gs://code-search-demo/models/20181107-dist-sync-gpu/export/1541712907/",
+   problem: "kf_github_function_docstring",
+   model: "kf_similarity_transformer",
  },
}
@@ -20,7 +20,7 @@
    eval_steps: 10,
    image: "gcr.io/kubeflow-examples/code-search:" + imageTag,
    imageGpu: "gcr.io/kubeflow-examples/code-search-gpu:" + imageTag,
-   dataflowImage: "gcr.io/kubeflow-examples/code-search-dataflow:v20181106-v0.2-76-g611636c-dirty-860631",
+   dataflowImage: "gcr.io/kubeflow-examples/code-search-dataflow:v20181109-dc79384",

    imagePullSecrets: [],
    // TODO(jlewi): dataDir doesn't seem to be used.
@@ -106,6 +106,20 @@
      numWorkers: 5,
      project: "",
    },
+   "submit-code-embeddings-job": {
+     name: "submit-code-embeddings-job",
+     image: $.components["t2t-job"].dataflowImage,
+     // Big query table where results will be written.
+     targetDataset: "code_search",
+     workingDir: $.components["t2t-code-search"].workingDir,
+     dataDir: self.workingDir + "/data",
+     // Directory where the model is stored.
+     modelDir: "",
+     jobName: "submit-code-embeddings-job",
+     workerMachineType: "n1-highcpu-32",
+     numWorkers: 5,
+     project: "",
+   },

    tensorboard: {
      image: "tensorflow/tensorflow:1.8.0",
@@ -0,0 +1,14 @@
// Submit a Dataflow job to compute the code embeddings using a trained model.
local k = import "k.libsonnet";

local experiments = import "experiments.libsonnet";
local lib = import "submit-code-embeddings-job.libsonnet";
local env = std.extVar("__ksonnet/environments");
local baseParams = std.extVar("__ksonnet/params").components["submit-code-embeddings-job"];
local experimentName = baseParams.experiment;
local params = baseParams + experiments[experimentName] + {
  name: experimentName + "-embed-code",
};


std.prune(k.core.v1.list.new([lib.parts(params,env).job]))
@@ -0,0 +1,74 @@
{
  parts(params, env):: {
    // Submit a Dataflow job to compute the code embeddings using a trained model.
    job :: {
      apiVersion: "batch/v1",
      kind: "Job",
      metadata: {
        name: params.name,
        namespace: env.namespace,
        labels: {
          app: params.name,
        },
      },
      spec: {
        replicas: 1,
        template: {
          metadata: {
            labels: {
              app: params.name,
            },
          },
          spec: {
            // Don't restart because all the job should do is launch the Dataflow job.
            restartPolicy: "Never",
            containers: [
              {
                name: "dataflow",
                image: params.image,
                command: [
                  "python2",
                  "-m",
                  "code_search.dataflow.cli.create_function_embeddings",
                  "--runner=DataflowRunner",
                  "--project=" + params.project,
                  "--target_dataset=" + params.targetDataset,
                  "--data_dir=" + params.dataDir,
                  "--problem=" + params.problem,
                  "--job_name=" + params.jobName,
                  "--saved_model_dir=" + params.modelDir,
                  "--temp_location=" + params.workingDir + "/dataflow/temp",
                  "--staging_location=" + params.workingDir + "/dataflow/staging",
                  "--worker_machine_type=" + params.workerMachineType,
                  "--num_workers=" + params.numWorkers,
                  "--requirements_file=requirements.dataflow.txt",
                ],
                env: [
                  {
                    name: "GOOGLE_APPLICATION_CREDENTIALS",
                    value: "/secret/gcp-credentials/user-gcp-sa.json",
                  },
                ],
                workingDir: "/src",
                volumeMounts: [
                  {
                    mountPath: "/secret/gcp-credentials",
                    name: "gcp-credentials",
                  },
                ],  // volumeMounts
              },
            ],  // containers
            volumes: [
              {
                name: "gcp-credentials",
                secret: {
                  secretName: "user-gcp-sa",
                },
              },
            ],
          },  // spec
        },
      },
    },  // job
  },  // parts
}
@@ -42,6 +42,7 @@ local jobSpec = {
        "--staging_location=" + params.workingDir + "/dataflow/staging",
        "--worker_machine_type=" + params.workerMachineType,
        "--num_workers=" + params.numWorkers,
+       "--requirements_file=requirements.dataflow.txt",
      ],
      env: [
        {
@@ -5,4 +5,5 @@
    workingDir: "gs://code-search-demo/20181104",
    dataDir: "gs://code-search-demo/20181104/data",
    project: "code-search-demo",
+   experiment: "demo-trainer-11-07-dist-sync-gpu",
  }
@@ -1,3 +1,6 @@
+"""Dataflow job to compute function embeddings."""
+import logging
+
import apache_beam as beam

import code_search.dataflow.cli.arguments as arguments
@@ -45,9 +48,16 @@ def create_function_embeddings(argv=None):
  )

  result = pipeline.run()
  logging.info("Submitted Dataflow job: %s", result)
  if args.wait_until_finish:
    result.wait_until_finish()


if __name__ == '__main__':
  logging.basicConfig(level=logging.INFO,
                      format=('%(levelname)s|%(asctime)s'
                              '|%(pathname)s|%(lineno)d| %(message)s'),
                      datefmt='%Y-%m-%dT%H:%M:%S',
                      )
  logging.getLogger().setLevel(logging.INFO)
  create_function_embeddings()
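The args.wait_until_finish check above implies a corresponding command-line flag defined in code_search.dataflow.cli.arguments, which this diff does not show. A minimal sketch of how such a flag is typically declared with argparse (the actual definition may differ):

    # Hypothetical sketch of the --wait_until_finish flag; the real definition lives
    # in code_search.dataflow.cli.arguments, which this diff does not show.
    import argparse


    def add_wait_flag(parser):
      # When set, the launcher blocks until the Dataflow job completes instead of
      # returning as soon as the job has been submitted.
      parser.add_argument(
          "--wait_until_finish",
          action="store_true",
          default=False,
          help="Block until the Dataflow job finishes instead of exiting after submission.")
      return parser


    if __name__ == "__main__":
      args, _ = add_wait_flag(argparse.ArgumentParser()).parse_known_args()
      print(args.wait_until_finish)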
@@ -0,0 +1,11 @@
# Requirements for the Dataflow jobs.
# We want to avoid unnecessary dependencies as these dependencies are installed on each
# worker.
astor~=0.7.0
apache-beam[gcp]~=2.8.0
nltk~=3.3.0
oauth2client~=4.1.0
spacy~=2.0.0
tensor2tensor~=1.9.0
tensorflow~=1.11.0
pybind11~=2.2.4
@@ -1,8 +1,6 @@
astor~=0.7.0
-apache-beam[gcp]~=2.6.0
+apache-beam[gcp]~=2.8.0
Flask~=1.0.0
nltk~=3.3.0
nmslib~=1.7.0
oauth2client~=4.1.0
requests~=2.19.0
spacy~=2.0.0
@@ -9,6 +9,10 @@ with open('requirements.txt', 'r') as f:
  install_requires = f.readlines()

CUSTOM_COMMANDS = [
+    # TODO(jlewi): python -m is complaining that module spacy not found even
+    # though it should be installed due to requirements. Reinstalling
+    # it using a custom command appears to fix the problem.
+    ['pip', 'install', 'spacy'],
    ['python', '-m', 'spacy', 'download', 'en'],
    # TODO(sanyamkapoor): This isn't ideal but no other way for a seamless install right now.
    ['pip', 'install', 'https://github.com/kubeflow/batch-predict/tarball/master']
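CUSTOM_COMMANDS only takes effect because Dataflow builds this package on every worker via setup.py. The rest of the file is not shown in this hunk; the sketch below follows the standard Apache Beam pattern of chaining the commands onto a custom build step and is an assumption about, not a copy of, the repository's setup.py:

    # Hedged sketch of the usual Apache Beam setup.py pattern for running
    # CUSTOM_COMMANDS on each Dataflow worker; the repository's actual setup.py
    # may differ in names and metadata.
    import subprocess

    import setuptools
    from distutils.command.build import build as _build

    CUSTOM_COMMANDS = [
        ['pip', 'install', 'spacy'],
        ['python', '-m', 'spacy', 'download', 'en'],
    ]


    class CustomCommands(setuptools.Command):
      """Runs each entry in CUSTOM_COMMANDS when the package is built on a worker."""
      user_options = []

      def initialize_options(self):
        pass

      def finalize_options(self):
        pass

      def run(self):
        for command in CUSTOM_COMMANDS:
          subprocess.check_call(command)


    class build(_build):  # pylint: disable=invalid-name
      # Chain the custom commands onto the normal build step.
      sub_commands = _build.sub_commands + [('CustomCommands', None)]


    setuptools.setup(
        name='code-search-dataflow-example',  # hypothetical package name
        version='0.0.1',
        packages=setuptools.find_packages(),
        cmdclass={'build': build, 'CustomCommands': CustomCommands},
    )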