Create a component to submit the Dataflow job to compute embeddings for code search (#324)

* Create a component to submit the Dataflow job to compute embeddings for code search (see the usage sketch below).

* Update Beam to 2.8.0
* Remove nmslib from the Apache Beam requirements.txt; it's not needed and appears
  to have problems installing on the Dataflow workers.

* The spacy model download was failing on Dataflow workers; reinstalling the spacy
  package via pip first appears to fix this.

* Fix some bugs in the workflow for building the Docker images.

* Split requirements.txt into separate requirements files for the Dataflow
  workers and the UI.

* We don't want to install unnecessary dependencies on the Dataflow workers.
  Some unnecessary dependencies (e.g. nmslib) were also having problems
  installing on the workers.
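
For reference, a minimal sketch of how the new component might be driven with ksonnet once this change is in. The component and parameter names come from the diff below; the environment name and parameter values are hypothetical placeholders:

    # Point the component at a project and a trained model (placeholder values),
    # then create the Kubernetes Job that submits the Dataflow pipeline.
    ks param set submit-code-embeddings-job project my-gcp-project
    ks param set submit-code-embeddings-job modelDir gs://my-bucket/models/export/1234567890/
    ks param set submit-code-embeddings-job targetDataset code_search
    ks apply my-env -c submit-code-embeddings-job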
Jeremy Lewi 2018-11-15 05:45:09 +08:00 committed by k8s-ci-robot
parent 6c976342a3
commit 26c400a4cd
14 changed files with 165 additions and 21 deletions

@@ -30,7 +30,7 @@ build-cpu:
@echo Built $(IMG):$(TAG)
# TODO(jlewi): We could always use build.jsonnet and then just
# Parse out the docker build command.
build-gpu:
docker build -f "./docker/t2t/Dockerfile" \
-t $(IMG)-gpu:$(TAG) \
@@ -49,34 +49,34 @@ build-dataflow:
build: build-cpu build-gpu build-dataflow
# Build using GCB. This is useful if we are on a slow internet connection
# and don't want to pull
build-gcb:
mkdir -p build
jsonnet ./docker/t2t/build.jsonnet --ext-str gitVersion=$(GIT_VERSION) --ext-str tag=$(TAG) \
> ./build/build.json
cp -r ./docker ./build/
- cp -r ./src ../build/
+ cp -r ./src ./build/
rm -rf ./build/src/code_search/dataflow/cli/test_data
rm -rf ./build/src/code_search/t2t/test_data
- gcloud builds submit --project=kubeflow-ci --config=./build/build.json ./build
+ gcloud builds submit --machine-type=n1-highcpu-32 --project=kubeflow-ci --config=./build/build.json ./build
# Build but don't attach the latest tag. This allows manual testing/inspection of the image
# first.
push-cpu: build-cpu
gcloud docker --authorize-only
docker push $(IMG):$(TAG)
@echo Pushed $(IMG):$(TAG)
push-gpu: build-gpu
gcloud docker --authorize-only
docker push $(IMG)-gpu:$(TAG)
@echo Pushed $(IMG)-gpu:$(TAG)
push-trainer: push-cpu push-gpu
push-dataflow: build-dataflow
gcloud docker --authorize-only
docker push $(IMG)-dataflow:$(TAG)
@echo Pushed $(IMG)-dataflow:$(TAG)
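
For reference, hypothetical invocations of the affected targets, run from the directory containing this Makefile (target names are taken from the diff above):

    # Build all images with Google Cloud Build:
    make build-gcb
    # Or build the Dataflow image locally and push it:
    make push-dataflow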

@@ -7,8 +7,8 @@ FROM python:2.7-jessie
# so we need to install them for Python2.
# We do this before copying the code because we don't want to have to
# reinstall the requirements just because the code changed.
- COPY src/requirements.txt /tmp/requirements.txt
- RUN pip install -r /tmp/requirements.txt
+ COPY src/requirements.dataflow.txt /tmp/requirements.dataflow.txt
+ RUN pip install -r /tmp/requirements.dataflow.txt
RUN pip install https://github.com/kubeflow/batch-predict/tarball/master
# install the spacy model

@@ -1,21 +1,34 @@
// TODO(jlewi): We should tag the image latest and then
// use latest as a cache so that rebuilds are fast
// https://cloud.google.com/cloud-build/docs/speeding-up-builds#using_a_cached_docker_image
{
"steps": [
{
"name": "gcr.io/cloud-builders/docker",
"args": ["build", "-t", "gcr.io/kubeflow-examples/code-search:" + std.extVar("tag"),
"--label=git-versions=" + std.extVar("gitVersion"),
"--build-arg", "BASE_IMAGE_TAG=1.11.0",
"./docker/t2t"],
"--label=git-versions=" + std.extVar("gitVersion"),
"--build-arg", "BASE_IMAGE_TAG=1.11.0",
"--file=docker/t2t/Dockerfile", "."],
"waitFor": ["-"],
},
{
"name": "gcr.io/cloud-builders/docker",
"args": ["build", "-t", "gcr.io/kubeflow-examples/code-search-gpu:" + std.extVar("tag"),
"--label=git-versions=" + std.extVar("gitVersion"),
"--build-arg", "BASE_IMAGE_TAG=1.11.0-gpu",
"./docker/t2t"],
"--label=git-versions=" + std.extVar("gitVersion"),
"--build-arg", "BASE_IMAGE_TAG=1.11.0-gpu",
"--file=docker/t2t/Dockerfile", "."],
"waitFor": ["-"],
},
{
"name": "gcr.io/cloud-builders/docker",
"args": ["build", "-t", "gcr.io/kubeflow-examples/code-search-dataflow:" + std.extVar("tag"),
"--label=git-versions=" + std.extVar("gitVersion"),
"--file=docker/t2t/Dockerfile.dataflow", "."],
"waitFor": ["-"],
},
],
"images": ["gcr.io/kubeflow-examples/code-search:" + std.extVar("tag"),
"gcr.io/kubeflow-examples/code-search-gpu:" + std.extVar("tag")],
"gcr.io/kubeflow-examples/code-search-gpu:" + std.extVar("tag"),
"gcr.io/kubeflow-examples/code-search-dataflow:" + std.extVar("tag")],
}

@@ -7,5 +7,9 @@
train_steps: 200000,
eval_steps: 100,
hparams_set: "transformer_base",
project: "code-search-demo",
modelDir: "gs://code-search-demo/models/20181107-dist-sync-gpu/export/1541712907/",
problem: "kf_github_function_docstring",
model: "kf_similarity_transformer",
},
}

@@ -20,7 +20,7 @@
eval_steps: 10,
image: "gcr.io/kubeflow-examples/code-search:" + imageTag,
imageGpu: "gcr.io/kubeflow-examples/code-search-gpu:" + imageTag,
dataflowImage: "gcr.io/kubeflow-examples/code-search-dataflow:v20181106-v0.2-76-g611636c-dirty-860631",
dataflowImage: "gcr.io/kubeflow-examples/code-search-dataflow:v20181109-dc79384",
imagePullSecrets: [],
// TODO(jlewi): dataDir doesn't seem to be used.
@@ -106,6 +106,20 @@
numWorkers: 5,
project: "",
},
"submit-code-embeddings-job": {
name: "submit-code-embeddings-job",
image: $.components["t2t-job"].dataflowImage,
// BigQuery table where results will be written.
targetDataset: "code_search",
workingDir: $.components["t2t-code-search"].workingDir,
dataDir: self.workingDir + "/data",
// Directory where the model is stored.
modelDir: "",
jobName: "submit-code-embeddings-job",
workerMachineType: "n1-highcpu-32",
numWorkers: 5,
project: "",
},
tensorboard: {
image: "tensorflow/tensorflow:1.8.0",

@@ -0,0 +1,14 @@
// Submit a Dataflow job to compute the code embeddings using a trained model.
local k = import "k.libsonnet";
local experiments = import "experiments.libsonnet";
local lib = import "submit-code-embeddings-job.libsonnet";
local env = std.extVar("__ksonnet/environments");
local baseParams = std.extVar("__ksonnet/params").components["submit-code-embeddings-job"];
local experimentName = baseParams.experiment;
local params = baseParams + experiments[experimentName] + {
name: experimentName + "-embed-code",
};
std.prune(k.core.v1.list.new([lib.parts(params,env).job]))

@@ -0,0 +1,74 @@
{
parts(params, env):: {
// Submit a Dataflow job to compute the code embeddings using a trained model.
job :: {
apiVersion: "batch/v1",
kind: "Job",
metadata: {
name: params.name,
namespace: env.namespace,
labels: {
app: params.name,
},
},
spec: {
replicas: 1,
template: {
metadata: {
labels: {
app: params.name,
},
},
spec: {
// Don't restart because all the job should do is launch the Dataflow job.
restartPolicy: "Never",
containers: [
{
name: "dataflow",
image: params.image,
command: [
"python2",
"-m",
"code_search.dataflow.cli.create_function_embeddings",
"--runner=DataflowRunner",
"--project=" + params.project,
"--target_dataset=" + params.targetDataset,
"--data_dir=" + params.dataDir,
"--problem=" + params.problem,
"--job_name=" + params.jobName,
"--saved_model_dir=" + params.modelDir,
"--temp_location=" + params.workingDir + "/dataflow/temp",
"--staging_location=" + params.workingDir + "/dataflow/staging",
"--worker_machine_type=" + params.workerMachineType,
"--num_workers=" + params.numWorkers,
"--requirements_file=requirements.dataflow.txt",
],
env: [
{
name: "GOOGLE_APPLICATION_CREDENTIALS",
value: "/secret/gcp-credentials/user-gcp-sa.json",
},
],
workingDir: "/src",
volumeMounts: [
{
mountPath: "/secret/gcp-credentials",
name: "gcp-credentials",
},
], //volumeMounts
},
], // containers
volumes: [
{
name: "gcp-credentials",
secret: {
secretName: "user-gcp-sa",
},
},
],
}, // spec
},
},
}, // job
}, // parts
}

@@ -42,6 +42,7 @@ local jobSpec = {
"--staging_location=" + params.workingDir + "/dataflow/staging",
"--worker_machine_type=" + params.workerMachineType,
"--num_workers=" + params.numWorkers,
"--requirements_file=requirements.dataflow.txt",
],
env: [
{

@@ -5,4 +5,5 @@
workingDir: "gs://code-search-demo/20181104",
dataDir: "gs://code-search-demo/20181104/data",
project: "code-search-demo",
experiment: "demo-trainer-11-07-dist-sync-gpu",
}

@@ -1,3 +1,6 @@
"""Dataflow job to compute function embeddings."""
import logging
import apache_beam as beam
import code_search.dataflow.cli.arguments as arguments
@@ -45,9 +48,16 @@ def create_function_embeddings(argv=None):
)
result = pipeline.run()
logging.info("Submitted Dataflow job: %s", result)
if args.wait_until_finish:
result.wait_until_finish()
if __name__ == '__main__':
logging.basicConfig(level=logging.INFO,
format=('%(levelname)s|%(asctime)s'
'|%(pathname)s|%(lineno)d| %(message)s'),
datefmt='%Y-%m-%dT%H:%M:%S',
)
logging.getLogger().setLevel(logging.INFO)
create_function_embeddings()
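
For reference, a sketch of how this entry point might be invoked directly. The flags mirror those passed by the ksonnet component above; the project, dataset, and GCS paths are hypothetical placeholders:

    python2 -m code_search.dataflow.cli.create_function_embeddings \
      --runner=DataflowRunner \
      --project=my-gcp-project \
      --target_dataset=code_search \
      --problem=kf_github_function_docstring \
      --data_dir=gs://my-bucket/data \
      --saved_model_dir=gs://my-bucket/models/export/1234567890/ \
      --job_name=compute-function-embeddings \
      --temp_location=gs://my-bucket/dataflow/temp \
      --staging_location=gs://my-bucket/dataflow/staging \
      --worker_machine_type=n1-highcpu-32 \
      --num_workers=5 \
      --requirements_file=requirements.dataflow.txt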

@@ -0,0 +1,11 @@
# Requirements for the Dataflow jobs.
# We want to avoid unnecessary dependencies as these dependencies are installed on each
# worker.
astor~=0.7.0
apache-beam[gcp]~=2.8.0
nltk~=3.3.0
oauth2client~=4.1.0
spacy~=2.0.0
tensor2tensor~=1.9.0
tensorflow~=1.11.0
pybind11~=2.2.4

@@ -1,8 +1,6 @@
astor~=0.7.0
- apache-beam[gcp]~=2.6.0
+ apache-beam[gcp]~=2.8.0
Flask~=1.0.0
nltk~=3.3.0
nmslib~=1.7.0
oauth2client~=4.1.0
requests~=2.19.0
spacy~=2.0.0

@@ -9,6 +9,10 @@ with open('requirements.txt', 'r') as f:
install_requires = f.readlines()
CUSTOM_COMMANDS = [
# TODO(jlewi): python -m is complaining that the spacy module is not found even
# though it should be installed via requirements. Reinstalling
# it using a custom command appears to fix the problem.
['pip', 'install', 'spacy'],
['python', '-m', 'spacy', 'download', 'en'],
# TODO(sanyamkapoor): This isn't ideal but no other way for a seamless install right now.
['pip', 'install', 'https://github.com/kubeflow/batch-predict/tarball/master']