mirror of https://github.com/kubeflow/examples.git
			Fix the K8s job to create the nmslib index. (#338)
* Install nmslib in the Dataflow container so it is suitable for running the index creation job.
* Use command, not args, in the job specs.
* Dockerfile.dataflow should install nmslib so that we can use that Docker image to create the index.
* build.jsonnet should tag images as latest. We will use the latest images as a layer cache to speed up builds.
* Set the logging level to info for start_search_server.py and create_search_index.py.
* The create-search-index pod kept getting evicted because the node ran out of memory.
* Add a new node pool of n1-standard-32 nodes to the demo cluster. These have 120 GB of RAM, compared to 30 GB in our default pool of n1-standard-8 nodes.
* Set requests and limits on the search-index-creator pod.
* Move all the config for the search-index-creator job into search-index-creator.jsonnet. We need to customize the memory resources, so there is little value in trying to share config with other components.
parent a402db1ccc
commit d2b68f15d7
@@ -58,7 +58,8 @@ build-gcb:
	cp -r ./src ./build/
	rm -rf ./build/src/code_search/dataflow/cli/test_data
	rm -rf ./build/src/code_search/t2t/test_data
	gcloud builds submit --machine-type=n1-highcpu-32 --project=kubeflow-ci --config=./build/build.json ./build
	gcloud builds submit --machine-type=n1-highcpu-32 --project=kubeflow-ci --config=./build/build.json \
		--timeout=3600 ./build


# Build but don't attach the latest tag. This allows manual testing/inspection of the image

@@ -37,7 +37,7 @@ resources:
    # bump this if you want to modify the node pools.
    # This will cause existing node pools to be deleted and new ones to be created.
    # Use prefix v so it will be treated as a string.
    pool-version: v1
    pool-version: v2
    # Two is small enough to fit within default quota.
    cpu-pool-initialNodeCount: 2
    gpu-pool-initialNodeCount: 0

@@ -16,6 +16,7 @@ limitations under the License.
{% set CLUSTER_NAME = NAME_PREFIX %}
{% set CPU_POOL = NAME_PREFIX + '-cpu-pool-' + properties['pool-version'] %}
{% set GPU_POOL = NAME_PREFIX + '-gpu-pool-' + properties['pool-version'] %}
{% set LARGE_POOL = NAME_PREFIX + '-large-pool-' + properties['pool-version'] %}

{# Type names are the names to give to deployment manager type providers
   that will be created to represent Kubernetes objects.
@@ -152,6 +153,41 @@ resources:
    # We can only create 1 node pool at a time.
    - {{ CLUSTER_NAME }}

# Add a high memory pool because creating the search index requires a lot of memory.
- name: {{ LARGE_POOL }}
  {% if properties['gkeApiVersion'] == 'v1beta1' %}
  type: gcp-types/container-v1beta1:projects.locations.clusters.nodePools
  {% else %}
  type: container.v1.nodePool
  {% endif %}
  properties:
    parent: projects/{{ env['project'] }}/locations/{{ properties['zone'] }}/clusters/{{ CLUSTER_NAME }}
    project: {{ properties['securityConfig']['project'] }}
    zone: {{ properties['zone'] }}
    clusterId: {{ CLUSTER_NAME }}
    nodePool:
      name: large-pool
      initialNodeCount: 0
      autoscaling:
        enabled: true
        minNodeCount: 1
        maxNodeCount: 10
      config:
        {% if properties['securityConfig']['secureNodeMetadata'] %}
        workloadMetadataConfig:
          nodeMetadata: SECURE
        {% endif %}
        machineType: n1-standard-32
        serviceAccount: {{ KF_VM_SA_NAME }}@{{ env['project'] }}.iam.gserviceaccount.com
        oauthScopes: {{ VM_OAUTH_SCOPES }}
        # Set min cpu platform to ensure AVX2 is supported.
        minCpuPlatform: 'Intel Haswell'

  metadata:
    dependsOn:
    # We can only create 1 node pool at a time.
    - {{ GPU_POOL }}

{# Project defaults to the project of the deployment. #}
- name: {{ properties['ipName'] }}
  type: compute.v1.globalAddress

@@ -1,4 +1,5 @@
# Dockerfile suitable for submitting Dataflow jobs.
# Dockerfile suitable for submitting Dataflow jobs and for running the nmslib index creator.
#
# We don't use the Docker image used for running the training jobs
# because we have different versioning requirements.
FROM python:2.7-jessie
@@ -11,6 +12,10 @@ COPY src/requirements.dataflow.txt /tmp/requirements.dataflow.txt
RUN pip install -r /tmp/requirements.dataflow.txt
RUN pip install https://github.com/kubeflow/batch-predict/tarball/master

# Install nmslib requirements so that we can create the index
COPY src/requirements.nmslib.txt /tmp/requirements.nmslib.txt
RUN pip install -r /tmp/requirements.nmslib.txt

# install the spacy model
RUN python -m spacy download en

@@ -5,30 +5,81 @@

  "steps": [
    {
      "id": "pull-cpu",
      "name": "gcr.io/cloud-builders/docker",
      "args": ["pull", "gcr.io/kubeflow-examples/code-search:latest"],
      "waitFor": ["-"],
    },
    {
      "id": "build-cpu",
      "name": "gcr.io/cloud-builders/docker",
      "args": ["build", "-t", "gcr.io/kubeflow-examples/code-search:" + std.extVar("tag"),
               "--label=git-versions=" + std.extVar("gitVersion"),
               "--build-arg", "BASE_IMAGE_TAG=1.11.0",
               "--file=docker/t2t/Dockerfile", "."],
               "--file=docker/t2t/Dockerfile",
               "--cache-from=gcr.io/kubeflow-examples/code-search:latest",
               "."],
      "waitFor": ["pull-cpu"],
    },
    {
      "id": "tag-cpu",
      "name": "gcr.io/cloud-builders/docker",
      "args": ["tag", "gcr.io/kubeflow-examples/code-search:" + std.extVar("tag"),
               "gcr.io/kubeflow-examples/code-search:latest",],
      "waitFor": ["build-cpu"],
    },
    {
      "id": "pull-gpu",
      "name": "gcr.io/cloud-builders/docker",
      "args": ["pull", "gcr.io/kubeflow-examples/code-search-gpu:latest"],
      "waitFor": ["-"],
    },
    {
      "id": "build-gpu",
      "name": "gcr.io/cloud-builders/docker",
      "args": ["build", "-t", "gcr.io/kubeflow-examples/code-search-gpu:" + std.extVar("tag"),
               "--label=git-versions=" + std.extVar("gitVersion"),
               "--build-arg", "BASE_IMAGE_TAG=1.11.0-gpu",
               "--file=docker/t2t/Dockerfile", "."],
               "--file=docker/t2t/Dockerfile",
               "--cache-from=gcr.io/kubeflow-examples/code-search-gpu:latest",
               "."],
      "waitFor": ["pull-gpu"],
    },
    {
      "id": "tag-gpu",
      "name": "gcr.io/cloud-builders/docker",
      "args": ["tag", "gcr.io/kubeflow-examples/code-search-gpu:" + std.extVar("tag"),
               "gcr.io/kubeflow-examples/code-search-gpu:latest",],
      "waitFor": ["build-gpu"],
    },
    {
      "id": "pull-dataflow",
      "name": "gcr.io/cloud-builders/docker",
      "args": ["pull", "gcr.io/kubeflow-examples/code-search-dataflow:latest"],
      "waitFor": ["-"],
    },
    {
      "id": "build-dataflow",
      "name": "gcr.io/cloud-builders/docker",
      "args": ["build", "-t", "gcr.io/kubeflow-examples/code-search-dataflow:" + std.extVar("tag"),
               "--label=git-versions=" + std.extVar("gitVersion"),
               "--file=docker/t2t/Dockerfile.dataflow", "."],
      "waitFor": ["-"],
               "--file=docker/t2t/Dockerfile.dataflow",
               "--cache-from=gcr.io/kubeflow-examples/code-search-dataflow:latest",
               "."],
      "waitFor": ["pull-dataflow"],
    },
    {
      "id": "tag-dataflow",
      "name": "gcr.io/cloud-builders/docker",
      "args": ["tag", "gcr.io/kubeflow-examples/code-search-dataflow:" + std.extVar("tag"),
               "gcr.io/kubeflow-examples/code-search-dataflow:latest",],
      "waitFor": ["build-dataflow"],
    },
  ],
  "images": ["gcr.io/kubeflow-examples/code-search:" + std.extVar("tag"),
             "gcr.io/kubeflow-examples/code-search:latest",
             "gcr.io/kubeflow-examples/code-search-gpu:" + std.extVar("tag"),
             "gcr.io/kubeflow-examples/code-search-dataflow:" + std.extVar("tag")],
             "gcr.io/kubeflow-examples/code-search-gpu:latest",
             "gcr.io/kubeflow-examples/code-search-dataflow:" + std.extVar("tag"),
             "gcr.io/kubeflow-examples/code-search-dataflow:latest"],
}

@@ -11,5 +11,10 @@
    modelDir: "gs://code-search-demo/models/20181107-dist-sync-gpu/export/1541712907/",
    problem: "kf_github_function_docstring",
    model: "kf_similarity_transformer",

    // Location to write the index file for nmslib and the file to be used as the reverse lookup
    // with the index server.
    lookupFile: "gs://code-search-demo/20181104/code-embeddings-index/embedding-to-info.csv",
    indexFile: "gs://code-search-demo/20181104/code-embeddings-index/embeddings.index",
  },
}

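The two new parameters work as a pair: the index file holds the nmslib nearest-neighbor index, and the lookup CSV maps each integer id the index returns back to the function information served by the index server. The snippet below is only a minimal sketch of that reverse-lookup pattern; the local file names and the 128-dimensional query vector are placeholders, and the real serving logic lives in code_search.nmslib.cli.start_search_server.

import csv

import nmslib
import numpy as np

# Load a local copy of the index (params.indexFile) and the lookup CSV (params.lookupFile).
index = nmslib.init(method="hnsw", space="cosinesimil")
index.loadIndex("embeddings.index")
with open("embedding-to-info.csv") as f:
  lookup = list(csv.reader(f))

# Query with an embedding vector; nmslib returns row ids that index into the CSV.
query = np.random.rand(128).astype(np.float32)  # placeholder query embedding
ids, distances = index.knnQuery(query, k=5)
for i, dist in zip(ids, distances):
  print(dist, lookup[i])
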
@@ -62,7 +62,7 @@ local baseParams = std.extVar("__ksonnet/params").components["nmslib"];
  containerSpec(params, env=[], volumeMounts=[], ports=[]):: {
    name: params.name,
    image: params.image,
    args: params.args,
    command: params.command,
    ports: ports,
    env: env,
    volumeMounts: volumeMounts,
@@ -132,30 +132,10 @@ local baseParams = std.extVar("__ksonnet/params").components["nmslib"];
      },
    ],

    creator:: {
      local creatorParams = params + {
        args: [
          "-m",
          "code_search.nmslib.cli.create_search_index",
          "--data_dir=" + params.dataDir,
          "--lookup_file=" + params.lookupFile,
          "--index_file=" + params.indexFile,
        ],
      },

      all: [
        $.jobSpec(creatorParams, env,
                  [
                    $.containerSpec(creatorParams, env=containerEnv,
                                    volumeMounts=containerVolumeMounts)
                  ],
                  volumes=volumes),
      ],
    }.all,

    server:: {
      local serverParams = params + {
        args: [
        command: [
          "python",
          "-m",
          "code_search.nmslib.cli.start_search_server",
          "--problem=" + params.problem,

@@ -8,7 +8,7 @@
    // are not picked up by the individual components.
    // Need to see if we can find a way to fix this.

    local imageTag = "v20181108-004b5ad-dirty-eba459",
    local imageTag = "v20181117-3c030ae-dirty-4d809c",
    "t2t-job": {
      jobType: "trainer",
      numChief: 0,
@@ -20,7 +20,7 @@
      eval_steps: 10,
      image: "gcr.io/kubeflow-examples/code-search:" + imageTag,
      imageGpu: "gcr.io/kubeflow-examples/code-search-gpu:" + imageTag,
      dataflowImage: "gcr.io/kubeflow-examples/code-search-dataflow:v20181109-dc79384",
      dataflowImage: "gcr.io/kubeflow-examples/code-search-dataflow:" + imageTag,

      imagePullSecrets: [],
      // TODO(jlewi): dataDir doesn't seem to be used.

@@ -3,5 +3,82 @@ local nms = import "nms.libsonnet";

local env = std.extVar("__ksonnet/environments");
local params = std.extVar("__ksonnet/params").components["search-index-creator"];
local experiments = import "experiments.libsonnet";

std.prune(k.core.v1.list.new(nms.parts(params, env).creator))
local baseParams = std.extVar("__ksonnet/params").components["submit-code-embeddings-job"];
local experimentName = baseParams.experiment;
local params = baseParams + experiments[experimentName] + {
  name: experimentName + "-create-search-index",
};

local jobSpec = {
  apiVersion: "batch/v1",
  kind: "Job",
  metadata: {
    name: params.name,
    namespace: env.namespace,
    labels: {
      app: params.name,
    },
  },
  spec: {
    replicas: 1,
    template: {
      metadata: {
        labels: {
          app: params.name,
        },
      },
      spec: {
        // Don't restart because all the job should do is launch the Dataflow job.
        restartPolicy: "Never",
        containers: [
          {
            name: "dataflow",
            image: params.image,
            command: [
              "python",
              "-m",
              "code_search.nmslib.cli.create_search_index",
              "--data_dir=" + params.dataDir,
              "--lookup_file=" + params.lookupFile,
              "--index_file=" + params.indexFile,
            ],
            env: [
              {
                name: "GOOGLE_APPLICATION_CREDENTIALS",
                value: "/secret/gcp-credentials/user-gcp-sa.json",
              },
            ],
            // Creating the index requires a lot of memory.
            resources: {
              requests: {
                memory: "32Gi",
              },
              limits: {
                memory: "100Gi",
              },
            },
            workingDir: "/src",
            volumeMounts: [
              {
                mountPath: "/secret/gcp-credentials",
                name: "gcp-credentials",
              },
            ],  // volumeMounts
          },
        ],  // containers
        volumes: [
          {
            name: "gcp-credentials",
            secret: {
              secretName: "user-gcp-sa",
            },
          },
        ],
      },  // spec
    },
  },
};

std.prune(k.core.v1.list.new(jobSpec))

@@ -101,3 +101,4 @@ local deployment = {
};

std.prune(k.core.v1.list.new([service, deployment]))

@@ -1,4 +1,5 @@
import csv
import logging
import os
import numpy as np
import tensorflow as tf
@@ -23,6 +24,7 @@ def create_search_index(argv=None):
  args = arguments.parse_arguments(argv)

  if not os.path.isdir(args.tmp_dir):
    logging.info("Creating directory %s", args.tmp_dir)
    os.makedirs(args.tmp_dir)

  tmp_index_file = os.path.join(args.tmp_dir, os.path.basename(args.index_file))
@@ -34,7 +36,7 @@ def create_search_index(argv=None):
    lookup_writer = csv.writer(lookup_file)

    for csv_file_path in tf.gfile.Glob('{}/*index*.csv'.format(args.data_dir)):
      tf.logging.debug('Reading {}'.format(csv_file_path))
      logging.info('Reading %s', csv_file_path)

      with tf.gfile.Open(csv_file_path) as csv_file:
        reader = csv.reader(csv_file)
@@ -49,9 +51,19 @@ def create_search_index(argv=None):

  search_engine.CodeSearchEngine.create_index(embeddings_data, tmp_index_file)

  logging.info("Copying file %s to %s", tmp_lookup_file, args.lookup_file)
  tf.gfile.Copy(tmp_lookup_file, args.lookup_file)
  logging.info("Copying file %s to %s", tmp_index_file, args.index_file)
  tf.gfile.Copy(tmp_index_file, args.index_file)
  logging.info("Finished creating the index")


if __name__ == '__main__':
  logging.basicConfig(level=logging.INFO,
                      format=('%(levelname)s|%(asctime)s'
                              '|%(pathname)s|%(lineno)d| %(message)s'),
                      datefmt='%Y-%m-%dT%H:%M:%S',
                      )
  logging.getLogger().setLevel(logging.INFO)
  logging.info("Creating the search index")
  create_search_index()

@@ -85,4 +85,10 @@ def start_search_server(argv=None):



if __name__ == '__main__':
  logging.basicConfig(level=logging.INFO,
                      format=('%(levelname)s|%(asctime)s'
                              '|%(pathname)s|%(lineno)d| %(message)s'),
                      datefmt='%Y-%m-%dT%H:%M:%S',
                      )
  logging.getLogger().setLevel(logging.INFO)
  start_search_server()

@@ -0,0 +1,2 @@
# Requirements to run nmslib.
nmslib~=1.7.0
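
requirements.nmslib.txt pins nmslib, the library create_search_index uses to build the approximate nearest-neighbor index over the code embeddings. As a rough orientation, generic nmslib 1.7 index creation looks like the sketch below; this is not the repo's CodeSearchEngine.create_index, and the embedding array is a stand-in.

import nmslib
import numpy as np

embeddings = np.random.rand(1000, 128).astype(np.float32)  # stand-in for the embedding matrix

# Build an HNSW index over the vectors and write it to disk.
index = nmslib.init(method="hnsw", space="cosinesimil")
index.addDataPointBatch(embeddings)  # ids default to 0..n-1 when none are given
index.createIndex({"post": 2}, print_progress=True)
index.saveIndex("embeddings.index")  # the job then copies the file to the GCS indexFile path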