mirror of https://github.com/kubeflow/examples.git

Create a component to submit the Dataflow job to compute embeddings for code search (#324)

* Create a component to submit the Dataflow job to compute embeddings for code search.
* Update Beam to 2.8.0.
* Remove nmslib from the Apache Beam requirements.txt; it's not needed and appears to have problems installing on the Dataflow workers.
* The spacy download was failing on the Dataflow workers; reinstalling the spacy package as a pip package appears to fix this.
* Fix some bugs in the workflow for building the Docker images.
* Split requirements.txt into separate requirements files for the Dataflow workers and the UI. We don't want to install unnecessary dependencies on the Dataflow workers; some unnecessary dependencies, e.g. nmslib, were also having problems being installed on the workers.
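For orientation: the flags that the new Kubernetes Job passes to the Dataflow launcher (see the Job spec in the diff below) are standard Beam pipeline options. A minimal sketch of the mapping, assuming illustrative placeholder values for the project, job name, and GCS paths; this is not the actual code_search launcher:

    # Sketch: how the --flags passed by the Kubernetes Job below become Beam
    # pipeline options. Project, job name, and GCS paths are placeholders.
    import apache_beam as beam
    from apache_beam.options.pipeline_options import PipelineOptions

    options = PipelineOptions([
        "--runner=DataflowRunner",
        "--project=code-search-demo",                 # params.project
        "--job_name=submit-code-embeddings-job",      # params.jobName
        "--temp_location=gs://code-search-demo/20181104/dataflow/temp",
        "--staging_location=gs://code-search-demo/20181104/dataflow/staging",
        "--worker_machine_type=n1-highcpu-32",        # params.workerMachineType
        "--num_workers=5",                            # params.numWorkers
        "--requirements_file=requirements.dataflow.txt",
    ])

    with beam.Pipeline(options=options) as pipeline:
        # The real job wires up the function-embedding transforms here; a
        # trivial identity step stands in for them in this sketch.
        _ = pipeline | beam.Create(["example"]) | beam.Map(lambda x: x)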
This commit is contained in:
parent 6c976342a3
commit 26c400a4cd
@@ -55,10 +55,10 @@ build-gcb:
 	jsonnet ./docker/t2t/build.jsonnet --ext-str gitVersion=$(GIT_VERSION) --ext-str tag=$(TAG) \
 	  > ./build/build.json
 	cp -r ./docker ./build/
-	cp -r ./src ../build/
+	cp -r ./src ./build/
 	rm -rf ./build/src/code_search/dataflow/cli/test_data
 	rm -rf ./build/src/code_search/t2t/test_data
-	gcloud builds submit --project=kubeflow-ci --config=./build/build.json ./build
+	gcloud builds submit --machine-type=n1-highcpu-32 --project=kubeflow-ci --config=./build/build.json ./build


 # Build but don't attach the latest tag. This allows manual testing/inspection of the image
@@ -7,8 +7,8 @@ FROM python:2.7-jessie
 # so we need to install them for Python2.
 # We do this before copying the code because we don't want to have to
 # reinstall the requirements just because the code changed.
-COPY src/requirements.txt /tmp/requirements.txt
-RUN pip install -r /tmp/requirements.txt
+COPY src/requirements.dataflow.txt /tmp/requirements.dataflow.txt
+RUN pip install -r /tmp/requirements.dataflow.txt
 RUN pip install https://github.com/kubeflow/batch-predict/tarball/master

 # install the spacy model
@@ -1,3 +1,6 @@
+// TODO(jlewi): We should tag the image latest and then
+// use latest as a cache so that rebuilds are fast
+// https://cloud.google.com/cloud-build/docs/speeding-up-builds#using_a_cached_docker_image
 {

   "steps": [
@@ -6,16 +9,26 @@
       "args": ["build", "-t", "gcr.io/kubeflow-examples/code-search:" + std.extVar("tag"),
                "--label=git-versions=" + std.extVar("gitVersion"),
                "--build-arg", "BASE_IMAGE_TAG=1.11.0",
-               "./docker/t2t"],
+               "--file=docker/t2t/Dockerfile", "."],
       "waitFor": ["-"],
     },
     {
       "name": "gcr.io/cloud-builders/docker",
       "args": ["build", "-t", "gcr.io/kubeflow-examples/code-search-gpu:" + std.extVar("tag"),
                "--label=git-versions=" + std.extVar("gitVersion"),
                "--build-arg", "BASE_IMAGE_TAG=1.11.0-gpu",
-               "./docker/t2t"],
+               "--file=docker/t2t/Dockerfile", "."],
       "waitFor": ["-"],
     },
+    {
+      "name": "gcr.io/cloud-builders/docker",
+      "args": ["build", "-t", "gcr.io/kubeflow-examples/code-search-dataflow:" + std.extVar("tag"),
+               "--label=git-versions=" + std.extVar("gitVersion"),
+               "--file=docker/t2t/Dockerfile.dataflow", "."],
+      "waitFor": ["-"],
+    },
   ],
   "images": ["gcr.io/kubeflow-examples/code-search:" + std.extVar("tag"),
-             "gcr.io/kubeflow-examples/code-search-gpu:" + std.extVar("tag")],
+             "gcr.io/kubeflow-examples/code-search-gpu:" + std.extVar("tag"),
+             "gcr.io/kubeflow-examples/code-search-dataflow:" + std.extVar("tag")],
 }
@@ -7,5 +7,9 @@
     train_steps: 200000,
     eval_steps: 100,
     hparams_set: "transformer_base",
+    project: "code-search-demo",
+    modelDir: "gs://code-search-demo/models/20181107-dist-sync-gpu/export/1541712907/",
+    problem: "kf_github_function_docstring",
+    model: "kf_similarity_transformer",
   },
 }
@@ -20,7 +20,7 @@
       eval_steps: 10,
       image: "gcr.io/kubeflow-examples/code-search:" + imageTag,
       imageGpu: "gcr.io/kubeflow-examples/code-search-gpu:" + imageTag,
-      dataflowImage: "gcr.io/kubeflow-examples/code-search-dataflow:v20181106-v0.2-76-g611636c-dirty-860631",
+      dataflowImage: "gcr.io/kubeflow-examples/code-search-dataflow:v20181109-dc79384",

       imagePullSecrets: [],
       // TODO(jlewi): dataDir doesn't seem to be used.
@@ -106,6 +106,20 @@
       numWorkers: 5,
       project: "",
     },
+    "submit-code-embeddings-job": {
+      name: "submit-code-embeddings-job",
+      image: $.components["t2t-job"].dataflowImage,
+      // BigQuery dataset where results will be written.
+      targetDataset: "code_search",
+      workingDir: $.components["t2t-code-search"].workingDir,
+      dataDir: self.workingDir + "/data",
+      // Directory where the model is stored.
+      modelDir: "",
+      jobName: "submit-code-embeddings-job",
+      workerMachineType: "n1-highcpu-32",
+      numWorkers: 5,
+      project: "",
+    },

     tensorboard: {
       image: "tensorflow/tensorflow:1.8.0",
@@ -0,0 +1,14 @@
+// Submit a Dataflow job to compute the code embeddings using a trained model.
+local k = import "k.libsonnet";
+
+local experiments = import "experiments.libsonnet";
+local lib = import "submit-code-embeddings-job.libsonnet";
+local env = std.extVar("__ksonnet/environments");
+local baseParams = std.extVar("__ksonnet/params").components["submit-code-embeddings-job"];
+local experimentName = baseParams.experiment;
+local params = baseParams + experiments[experimentName] + {
+  name: experimentName + "-embed-code",
+};
+
+
+std.prune(k.core.v1.list.new([lib.parts(params, env).job]))
@@ -0,0 +1,74 @@
+{
+  parts(params, env):: {
+    // Submit a Dataflow job to compute the code embeddings using a trained model.
+    job:: {
+      apiVersion: "batch/v1",
+      kind: "Job",
+      metadata: {
+        name: params.name,
+        namespace: env.namespace,
+        labels: {
+          app: params.name,
+        },
+      },
+      spec: {
+        replicas: 1,
+        template: {
+          metadata: {
+            labels: {
+              app: params.name,
+            },
+          },
+          spec: {
+            // Don't restart because all the job should do is launch the Dataflow job.
+            restartPolicy: "Never",
+            containers: [
+              {
+                name: "dataflow",
+                image: params.image,
+                command: [
+                  "python2",
+                  "-m",
+                  "code_search.dataflow.cli.create_function_embeddings",
+                  "--runner=DataflowRunner",
+                  "--project=" + params.project,
+                  "--target_dataset=" + params.targetDataset,
+                  "--data_dir=" + params.dataDir,
+                  "--problem=" + params.problem,
+                  "--job_name=" + params.jobName,
+                  "--saved_model_dir=" + params.modelDir,
+                  "--temp_location=" + params.workingDir + "/dataflow/temp",
+                  "--staging_location=" + params.workingDir + "/dataflow/staging",
+                  "--worker_machine_type=" + params.workerMachineType,
+                  "--num_workers=" + params.numWorkers,
+                  "--requirements_file=requirements.dataflow.txt",
+                ],
+                env: [
+                  {
+                    name: "GOOGLE_APPLICATION_CREDENTIALS",
+                    value: "/secret/gcp-credentials/user-gcp-sa.json",
+                  },
+                ],
+                workingDir: "/src",
+                volumeMounts: [
+                  {
+                    mountPath: "/secret/gcp-credentials",
+                    name: "gcp-credentials",
+                  },
+                ],  // volumeMounts
+              },
+            ],  // containers
+            volumes: [
+              {
+                name: "gcp-credentials",
+                secret: {
+                  secretName: "user-gcp-sa",
+                },
+              },
+            ],
+          },  // spec
+        },
+      },
+    },  // job
+  },  // parts
+}
@@ -42,6 +42,7 @@ local jobSpec = {
               "--staging_location=" + params.workingDir + "/dataflow/staging",
               "--worker_machine_type=" + params.workerMachineType,
               "--num_workers=" + params.numWorkers,
+              "--requirements_file=requirements.dataflow.txt",
             ],
             env: [
               {
@@ -5,4 +5,5 @@
   workingDir: "gs://code-search-demo/20181104",
   dataDir: "gs://code-search-demo/20181104/data",
   project: "code-search-demo",
+  experiment: "demo-trainer-11-07-dist-sync-gpu",
 }
@@ -1,3 +1,6 @@
+"""Dataflow job to compute function embeddings."""
+import logging
+
 import apache_beam as beam

 import code_search.dataflow.cli.arguments as arguments
@@ -45,9 +48,16 @@ def create_function_embeddings(argv=None):
   )

   result = pipeline.run()
-  result.wait_until_finish()
+  logging.info("Submitted Dataflow job: %s", result)
+  if args.wait_until_finish:
+    result.wait_until_finish()


 if __name__ == '__main__':
+  logging.basicConfig(level=logging.INFO,
+                      format=('%(levelname)s|%(asctime)s'
+                              '|%(pathname)s|%(lineno)d| %(message)s'),
+                      datefmt='%Y-%m-%dT%H:%M:%S',
+                      )
+  logging.getLogger().setLevel(logging.INFO)
   create_function_embeddings()
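The wait_until_finish flag consulted above is defined in code_search.dataflow.cli.arguments, which is not shown in this diff. A hypothetical argparse sketch of such a flag, for illustration only:

    # Hypothetical sketch of the --wait_until_finish flag consulted above;
    # the real definition lives in code_search.dataflow.cli.arguments and
    # may differ.
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--wait_until_finish",
        action="store_true",
        default=False,
        help="Block until the Dataflow job finishes instead of returning "
             "as soon as it has been submitted.")

    args, _ = parser.parse_known_args(["--wait_until_finish"])
    assert args.wait_until_finish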
@@ -0,0 +1,11 @@
+# Requirements for the Dataflow jobs.
+# We want to avoid unnecessary dependencies as these dependencies are installed on each
+# worker.
+astor~=0.7.0
+apache-beam[gcp]~=2.8.0
+nltk~=3.3.0
+oauth2client~=4.1.0
+spacy~=2.0.0
+tensor2tensor~=1.9.0
+tensorflow~=1.11.0
+pybind11~=2.2.4
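When a pipeline is launched with --requirements_file, Beam stages this file and pip-installs it on every Dataflow worker. A small sketch of setting the same option programmatically, assuming default values for the other options:

    # Sketch: pointing Beam at the worker-only requirements file
    # programmatically, equivalent to passing --requirements_file on the
    # command line. Each Dataflow worker pip-installs these packages.
    from apache_beam.options.pipeline_options import PipelineOptions, SetupOptions

    options = PipelineOptions()
    options.view_as(SetupOptions).requirements_file = "requirements.dataflow.txt"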
@@ -1,8 +1,6 @@
 astor~=0.7.0
-apache-beam[gcp]~=2.6.0
+apache-beam[gcp]~=2.8.0
 Flask~=1.0.0
 nltk~=3.3.0
-nmslib~=1.7.0
 oauth2client~=4.1.0
 requests~=2.19.0
 spacy~=2.0.0
@@ -9,6 +9,10 @@ with open('requirements.txt', 'r') as f:
   install_requires = f.readlines()

 CUSTOM_COMMANDS = [
+  # TODO(jlewi): python -m is complaining that the spacy module is not found
+  # even though it should be installed due to requirements. Reinstalling
+  # it using a custom command appears to fix the problem.
+  ['pip', 'install', 'spacy'],
   ['python', '-m', 'spacy', 'download', 'en'],
   # TODO(sanyamkapoor): This isn't ideal but no other way for a seamless install right now.
   ['pip', 'install', 'https://github.com/kubeflow/batch-predict/tarball/master']
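CUSTOM_COMMANDS is typically executed at package-install time on each worker by a custom distutils build step, following the pattern from Beam's juliaset example. A condensed sketch of that pattern; the surrounding setup() details are illustrative, not this file's actual contents:

    # Condensed sketch of the usual Beam setup.py pattern (cf. Beam's
    # juliaset example) that runs CUSTOM_COMMANDS on each worker at install
    # time. The package name below is a placeholder.
    import subprocess
    import setuptools
    from distutils.command.build import build as _build

    CUSTOM_COMMANDS = [
        ['pip', 'install', 'spacy'],
        ['python', '-m', 'spacy', 'download', 'en'],
    ]

    class build(_build):
      """Adds the custom-command step to the standard build."""
      sub_commands = _build.sub_commands + [('CustomCommands', None)]

    class CustomCommands(setuptools.Command):
      """Runs each command in CUSTOM_COMMANDS in a subprocess."""

      def initialize_options(self):
        pass

      def finalize_options(self):
        pass

      def run(self):
        for command in CUSTOM_COMMANDS:
          subprocess.check_call(command)

    setuptools.setup(
        name='code-search-dataflow-sketch',  # placeholder name
        version='0.0.1',
        cmdclass={'build': build, 'CustomCommands': CustomCommands},
    )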