From d2b68f15d784796d0ea819d10ec632db94b48311 Mon Sep 17 00:00:00 2001
From: Jeremy Lewi
Date: Tue, 20 Nov 2018 12:53:09 -0800
Subject: [PATCH] Fix the K8s job to create the nmslib index. (#338)

* Install nmslib in the Dataflow container so it's suitable for running the index creation job. (Two short nmslib sketches follow the patch below.)

* Use command, not args, in the job specs.

* Dockerfile.dataflow should install nmslib so that we can use that Docker image to create the index.

* build.jsonnet should tag images as latest. We will use the latest images as a layer cache to speed up builds.

* Set the logging level to info for start_search_server.py and create_search_index.py.

* The create-search-index pod kept getting evicted because its node was running out of memory.

* Add a new node pool consisting of n1-standard-32 nodes to the demo cluster. These have 120 GB of RAM, compared to 30 GB in our default pool of n1-standard-8 nodes.

* Set requests and limits on the search-index-creator pod.

* Move all the config for the search-index-creator job into the search-index-creator.jsonnet file. We need to customize the memory resources, so there's little value in trying to share config with other components.
---
 code_search/Makefile                          |  3 +-
 .../gcp_config/cluster-kubeflow.yaml          |  2 +-
 .../cs-demo-1103/gcp_config/cluster.jinja     | 36 +++++++++
 code_search/docker/t2t/Dockerfile.dataflow    |  7 +-
 code_search/docker/t2t/build.jsonnet          | 65 +++++++++++++--
 .../kubeflow/components/experiments.libsonnet |  5 ++
 code_search/kubeflow/components/nms.libsonnet | 26 +-----
 .../kubeflow/components/params.libsonnet      |  4 +-
 .../components/search-index-creator.jsonnet   | 79 ++++++++++++++++++-
 .../kubeflow/components/tensorboard.jsonnet   |  1 +
 .../nmslib/cli/create_search_index.py         | 14 +++-
 .../nmslib/cli/start_search_server.py         |  6 ++
 code_search/src/requirements.nmslib.txt       |  2 +
 13 files changed, 213 insertions(+), 37 deletions(-)
 create mode 100644 code_search/src/requirements.nmslib.txt

diff --git a/code_search/Makefile b/code_search/Makefile
index 0cef8439..f5e39706 100644
--- a/code_search/Makefile
+++ b/code_search/Makefile
@@ -58,7 +58,8 @@ build-gcb:
 	cp -r ./src ./build/
 	rm -rf ./build/src/code_search/dataflow/cli/test_data
 	rm -rf ./build/src/code_search/t2t/test_data
-	gcloud builds submit --machine-type=n1-highcpu-32 --project=kubeflow-ci --config=./build/build.json ./build
+	gcloud builds submit --machine-type=n1-highcpu-32 --project=kubeflow-ci --config=./build/build.json \
+		--timeout=3600 ./build
 
 
 # Build but don't attach the latest tag. This allows manual testing/inspection of the image
diff --git a/code_search/demo/cs-demo-1103/gcp_config/cluster-kubeflow.yaml b/code_search/demo/cs-demo-1103/gcp_config/cluster-kubeflow.yaml
index ebb923d8..8d9fa077 100644
--- a/code_search/demo/cs-demo-1103/gcp_config/cluster-kubeflow.yaml
+++ b/code_search/demo/cs-demo-1103/gcp_config/cluster-kubeflow.yaml
@@ -37,7 +37,7 @@ resources:
   # bump this if you want to modify the node pools.
   # This will cause existing node pools to be deleted and new ones to be created.
   # Use prefix v so it will be treated as a string.
-  pool-version: v1
+  pool-version: v2
   # Two is small enough to fit within default quota.
   cpu-pool-initialNodeCount: 2
   gpu-pool-initialNodeCount: 0
diff --git a/code_search/demo/cs-demo-1103/gcp_config/cluster.jinja b/code_search/demo/cs-demo-1103/gcp_config/cluster.jinja
index 46114f9b..1ae000b1 100644
--- a/code_search/demo/cs-demo-1103/gcp_config/cluster.jinja
+++ b/code_search/demo/cs-demo-1103/gcp_config/cluster.jinja
@@ -16,6 +16,7 @@ limitations under the License.
 {% set CLUSTER_NAME = NAME_PREFIX %}
 {% set CPU_POOL = NAME_PREFIX + '-cpu-pool-' + properties['pool-version'] %}
 {% set GPU_POOL = NAME_PREFIX + '-gpu-pool-' + properties['pool-version'] %}
+{% set LARGE_POOL = NAME_PREFIX + '-large-pool-' + properties['pool-version'] %}
 
 {# Type names are the names to give to deployment manager type providers
    that will be created to represent Kubernetes objects.
@@ -152,6 +153,41 @@ resources:
       # We can only create 1 node pool at a time.
       - {{ CLUSTER_NAME }}
 
+# Add a high-memory pool because creating the search index requires a lot of memory.
+- name: {{ LARGE_POOL }}
+  {% if properties['gkeApiVersion'] == 'v1beta1' %}
+  type: gcp-types/container-v1beta1:projects.locations.clusters.nodePools
+  {% else %}
+  type: container.v1.nodePool
+  {% endif %}
+  properties:
+    parent: projects/{{ env['project'] }}/locations/{{ properties['zone'] }}/clusters/{{ CLUSTER_NAME }}
+    project: {{ properties['securityConfig']['project'] }}
+    zone: {{ properties['zone'] }}
+    clusterId: {{ CLUSTER_NAME }}
+    nodePool:
+      name: large-pool
+      initialNodeCount: 0
+      autoscaling:
+        enabled: true
+        minNodeCount: 1
+        maxNodeCount: 10
+      config:
+        {% if properties['securityConfig']['secureNodeMetadata'] %}
+        workloadMetadataConfig:
+          nodeMetadata: SECURE
+        {% endif %}
+        machineType: n1-standard-32
+        serviceAccount: {{ KF_VM_SA_NAME }}@{{ env['project'] }}.iam.gserviceaccount.com
+        oauthScopes: {{ VM_OAUTH_SCOPES }}
+        # Set min cpu platform to ensure AVX2 is supported.
+        minCpuPlatform: 'Intel Haswell'
+
+  metadata:
+    dependsOn:
+      # We can only create 1 node pool at a time.
+      - {{ GPU_POOL }}
+
 {# Project defaults to the project of the deployment. #}
 - name: {{ properties['ipName'] }}
   type: compute.v1.globalAddress
diff --git a/code_search/docker/t2t/Dockerfile.dataflow b/code_search/docker/t2t/Dockerfile.dataflow
index c74a24aa..06a12aee 100644
--- a/code_search/docker/t2t/Dockerfile.dataflow
+++ b/code_search/docker/t2t/Dockerfile.dataflow
@@ -1,4 +1,5 @@
-# Dockerfile suitable for submitting Dataflow jobs.
+# Dockerfile suitable for submitting Dataflow jobs and for running the nmslib index creator.
+#
 # We don't use the Docker image used for running the training jobs
 # because we have different versioning requirements.
FROM python:2.7-jessie @@ -11,6 +12,10 @@ COPY src/requirements.dataflow.txt /tmp/requirements.dataflow.txt RUN pip install -r /tmp/requirements.dataflow.txt RUN pip install https://github.com/kubeflow/batch-predict/tarball/master +# Install nmslib requirements so that we can create the index +COPY src/requirements.nmslib.txt /tmp/requirements.nmslib.txt +RUN pip install -r /tmp/requirements.nmslib.txt + # install the spacy model RUN python -m spacy download en diff --git a/code_search/docker/t2t/build.jsonnet b/code_search/docker/t2t/build.jsonnet index 711dca64..0131d624 100644 --- a/code_search/docker/t2t/build.jsonnet +++ b/code_search/docker/t2t/build.jsonnet @@ -3,32 +3,83 @@ // https://cloud.google.com/cloud-build/docs/speeding-up-builds#using_a_cached_docker_image { - "steps": [ + "steps": [ { + "id": "pull-cpu", + "name": "gcr.io/cloud-builders/docker", + "args": ["pull", "gcr.io/kubeflow-examples/code-search:latest"], + "waitFor": ["-"], + }, + { + "id": "build-cpu", "name": "gcr.io/cloud-builders/docker", "args": ["build", "-t", "gcr.io/kubeflow-examples/code-search:" + std.extVar("tag"), "--label=git-versions=" + std.extVar("gitVersion"), "--build-arg", "BASE_IMAGE_TAG=1.11.0", - "--file=docker/t2t/Dockerfile", "."], + "--file=docker/t2t/Dockerfile", + "--cache-from=gcr.io/kubeflow-examples/code-search:latest", + "."], + "waitFor": ["pull-cpu"], + }, + { + "id": "tag-cpu", + "name": "gcr.io/cloud-builders/docker", + "args": ["tag", "gcr.io/kubeflow-examples/code-search:" + std.extVar("tag"), + "gcr.io/kubeflow-examples/code-search:latest",], + "waitFor": ["build-cpu"], + }, + { + "id": "pull-gpu", + "name": "gcr.io/cloud-builders/docker", + "args": ["pull", "gcr.io/kubeflow-examples/code-search-gpu:latest"], "waitFor": ["-"], }, { - "name": "gcr.io/cloud-builders/docker", + "id": "build-gpu", + "name": "gcr.io/cloud-builders/docker", "args": ["build", "-t", "gcr.io/kubeflow-examples/code-search-gpu:" + std.extVar("tag"), "--label=git-versions=" + std.extVar("gitVersion"), "--build-arg", "BASE_IMAGE_TAG=1.11.0-gpu", - "--file=docker/t2t/Dockerfile", "."], + "--file=docker/t2t/Dockerfile", + "--cache-from=gcr.io/kubeflow-examples/code-search-gpu:latest", + "."], + "waitFor": ["pull-gpu"], + }, + { + "id": "tag-gpu", + "name": "gcr.io/cloud-builders/docker", + "args": ["tag", "gcr.io/kubeflow-examples/code-search-gpu:" + std.extVar("tag"), + "gcr.io/kubeflow-examples/code-search-gpu:latest",], + "waitFor": ["build-gpu"], + }, + { + "id": "pull-dataflow", + "name": "gcr.io/cloud-builders/docker", + "args": ["pull", "gcr.io/kubeflow-examples/code-search-dataflow:latest"], "waitFor": ["-"], }, { + "id": "build-dataflow", "name": "gcr.io/cloud-builders/docker", "args": ["build", "-t", "gcr.io/kubeflow-examples/code-search-dataflow:" + std.extVar("tag"), "--label=git-versions=" + std.extVar("gitVersion"), - "--file=docker/t2t/Dockerfile.dataflow", "."], - "waitFor": ["-"], + "--file=docker/t2t/Dockerfile.dataflow", + "--cache-from=gcr.io/kubeflow-examples/code-search-dataflow:latest", + "."], + "waitFor": ["pull-dataflow"], + }, + { + "id": "tag-dataflow", + "name": "gcr.io/cloud-builders/docker", + "args": ["tag", "gcr.io/kubeflow-examples/code-search-dataflow:" + std.extVar("tag"), + "gcr.io/kubeflow-examples/code-search-dataflow:latest",], + "waitFor": ["build-dataflow"], }, ], "images": ["gcr.io/kubeflow-examples/code-search:" + std.extVar("tag"), + "gcr.io/kubeflow-examples/code-search:latest", "gcr.io/kubeflow-examples/code-search-gpu:" + std.extVar("tag"), - 
"gcr.io/kubeflow-examples/code-search-dataflow:" + std.extVar("tag")], + "gcr.io/kubeflow-examples/code-search-gpu:latest", + "gcr.io/kubeflow-examples/code-search-dataflow:" + std.extVar("tag"), + "gcr.io/kubeflow-examples/code-search-dataflow:latest"], } \ No newline at end of file diff --git a/code_search/kubeflow/components/experiments.libsonnet b/code_search/kubeflow/components/experiments.libsonnet index 926e4656..f9a1e381 100644 --- a/code_search/kubeflow/components/experiments.libsonnet +++ b/code_search/kubeflow/components/experiments.libsonnet @@ -11,5 +11,10 @@ modelDir: "gs://code-search-demo/models/20181107-dist-sync-gpu/export/1541712907/", problem: "kf_github_function_docstring", model: "kf_similarity_transformer", + + // Location to write the index file for nmslib and the file to be used as the reverse lookup + // with the index server. + lookupFile: "gs://code-search-demo/20181104/code-embeddings-index/embedding-to-info.csv", + indexFile: "gs://code-search-demo/20181104/code-embeddings-index/embeddings.index", }, } diff --git a/code_search/kubeflow/components/nms.libsonnet b/code_search/kubeflow/components/nms.libsonnet index aaa1508e..cca6202f 100644 --- a/code_search/kubeflow/components/nms.libsonnet +++ b/code_search/kubeflow/components/nms.libsonnet @@ -62,7 +62,7 @@ local baseParams = std.extVar("__ksonnet/params").components["nmslib"]; containerSpec(params, env=[], volumeMounts=[], ports=[]):: { name: params.name, image: params.image, - args: params.args, + command: params.command, ports: ports, env: env, volumeMounts: volumeMounts, @@ -132,30 +132,10 @@ local baseParams = std.extVar("__ksonnet/params").components["nmslib"]; }, ], - creator:: { - local creatorParams = params + { - args: [ - "-m", - "code_search.nmslib.cli.create_search_index", - "--data_dir=" + params.dataDir, - "--lookup_file=" + params.lookupFile, - "--index_file=" + params.indexFile, - ], - }, - - all: [ - $.jobSpec(creatorParams, env, - [ - $.containerSpec(creatorParams, env=containerEnv, - volumeMounts=containerVolumeMounts) - ], - volumes=volumes), - ], - }.all, - server:: { local serverParams = params + { - args: [ + command: [ + "python", "-m", "code_search.nmslib.cli.start_search_server", "--problem=" + params.problem, diff --git a/code_search/kubeflow/components/params.libsonnet b/code_search/kubeflow/components/params.libsonnet index d74e7fa5..676ad98a 100644 --- a/code_search/kubeflow/components/params.libsonnet +++ b/code_search/kubeflow/components/params.libsonnet @@ -8,7 +8,7 @@ // are not picked up by the individual components. // Need to see if we can find a way to fix this. - local imageTag = "v20181108-004b5ad-dirty-eba459", + local imageTag = "v20181117-3c030ae-dirty-4d809c", "t2t-job": { jobType: "trainer", numChief: 0, @@ -20,7 +20,7 @@ eval_steps: 10, image: "gcr.io/kubeflow-examples/code-search:" + imageTag, imageGpu: "gcr.io/kubeflow-examples/code-search-gpu:" + imageTag, - dataflowImage: "gcr.io/kubeflow-examples/code-search-dataflow:v20181109-dc79384", + dataflowImage: "gcr.io/kubeflow-examples/code-search-dataflow:" + imageTag, imagePullSecrets: [], // TODO(jlewi): dataDir doesn't seem to be used. 
diff --git a/code_search/kubeflow/components/search-index-creator.jsonnet b/code_search/kubeflow/components/search-index-creator.jsonnet
index b66c72c3..86ab98ee 100644
--- a/code_search/kubeflow/components/search-index-creator.jsonnet
+++ b/code_search/kubeflow/components/search-index-creator.jsonnet
@@ -3,5 +3,82 @@ local nms = import "nms.libsonnet";
 local env = std.extVar("__ksonnet/environments");
 local params = std.extVar("__ksonnet/params").components["search-index-creator"];
+local experiments = import "experiments.libsonnet";
 
-std.prune(k.core.v1.list.new(nms.parts(params, env).creator))
+local baseParams = std.extVar("__ksonnet/params").components["submit-code-embeddings-job"];
+local experimentName = baseParams.experiment;
+local params = baseParams + experiments[experimentName] + {
+  name: experimentName + "-create-search-index",
+};
+
+local jobSpec = {
+  apiVersion: "batch/v1",
+  kind: "Job",
+  metadata: {
+    name: params.name,
+    namespace: env.namespace,
+    labels: {
+      app: params.name,
+    },
+  },
+  spec: {
+    completions: 1,
+    template: {
+      metadata: {
+        labels: {
+          app: params.name,
+        },
+      },
+      spec: {
+        // Don't restart; the job should run the index creation once to completion.
+        restartPolicy: "Never",
+        containers: [
+          {
+            name: "index-creator",
+            image: params.image,
+            command: [
+              "python",
+              "-m",
+              "code_search.nmslib.cli.create_search_index",
+              "--data_dir=" + params.dataDir,
+              "--lookup_file=" + params.lookupFile,
+              "--index_file=" + params.indexFile,
+            ],
+            env: [
+              {
+                name: "GOOGLE_APPLICATION_CREDENTIALS",
+                value: "/secret/gcp-credentials/user-gcp-sa.json",
+              },
+            ],
+            // Creating the index requires a lot of memory.
+            resources: {
+              requests: {
+                memory: "32Gi"
+              },
+              limits: {
+                memory: "100Gi"
+              },
+            },
+            workingDir: "/src",
+            volumeMounts: [
+              {
+                mountPath: "/secret/gcp-credentials",
+                name: "gcp-credentials",
+              },
+            ],  // volumeMounts
+          },
+        ],  // containers
+        volumes: [
+          {
+            name: "gcp-credentials",
+            secret: {
+              secretName: "user-gcp-sa",
+            },
+          },
+        ],
+      },  // spec
+    },
+  },
+};
+
+std.prune(k.core.v1.list.new(jobSpec))
diff --git a/code_search/kubeflow/components/tensorboard.jsonnet b/code_search/kubeflow/components/tensorboard.jsonnet
index c4311e74..2af0b60d 100644
--- a/code_search/kubeflow/components/tensorboard.jsonnet
+++ b/code_search/kubeflow/components/tensorboard.jsonnet
@@ -101,3 +101,4 @@ local deployment = {
 };
 
 std.prune(k.core.v1.list.new([service, deployment]))
+
diff --git a/code_search/src/code_search/nmslib/cli/create_search_index.py b/code_search/src/code_search/nmslib/cli/create_search_index.py
index ddb26520..56998004 100644
--- a/code_search/src/code_search/nmslib/cli/create_search_index.py
+++ b/code_search/src/code_search/nmslib/cli/create_search_index.py
@@ -1,4 +1,5 @@
 import csv
+import logging
 import os
 import numpy as np
 import tensorflow as tf
@@ -23,6 +24,7 @@ def create_search_index(argv=None):
   args = arguments.parse_arguments(argv)
 
   if not os.path.isdir(args.tmp_dir):
+    logging.info("Creating directory %s", args.tmp_dir)
     os.makedirs(args.tmp_dir)
 
   tmp_index_file = os.path.join(args.tmp_dir, os.path.basename(args.index_file))
@@ -34,7 +36,7 @@ def create_search_index(argv=None):
     lookup_writer = csv.writer(lookup_file)
 
     for csv_file_path in tf.gfile.Glob('{}/*index*.csv'.format(args.data_dir)):
-      tf.logging.debug('Reading {}'.format(csv_file_path))
+      logging.info('Reading %s', csv_file_path)
 
       with tf.gfile.Open(csv_file_path) as csv_file:
         reader = csv.reader(csv_file)
@@ -49,9 +51,19 @@
   search_engine.CodeSearchEngine.create_index(embeddings_data, tmp_index_file)
 
+  logging.info("Copying file %s to %s", tmp_lookup_file, args.lookup_file)
   tf.gfile.Copy(tmp_lookup_file, args.lookup_file)
+  logging.info("Copying file %s to %s", tmp_index_file, args.index_file)
   tf.gfile.Copy(tmp_index_file, args.index_file)
+  logging.info("Finished creating the index")
 
 
 if __name__ == '__main__':
+  logging.basicConfig(level=logging.INFO,
+                      format=('%(levelname)s|%(asctime)s'
+                              '|%(pathname)s|%(lineno)d| %(message)s'),
+                      datefmt='%Y-%m-%dT%H:%M:%S',
+                      )
+  logging.getLogger().setLevel(logging.INFO)
+  logging.info("Creating the search index")
   create_search_index()
diff --git a/code_search/src/code_search/nmslib/cli/start_search_server.py b/code_search/src/code_search/nmslib/cli/start_search_server.py
index 04970fc2..44b4a18e 100644
--- a/code_search/src/code_search/nmslib/cli/start_search_server.py
+++ b/code_search/src/code_search/nmslib/cli/start_search_server.py
@@ -85,4 +85,10 @@ def start_search_server(argv=None):
 
 
 if __name__ == '__main__':
+  logging.basicConfig(level=logging.INFO,
+                      format=('%(levelname)s|%(asctime)s'
+                              '|%(pathname)s|%(lineno)d| %(message)s'),
+                      datefmt='%Y-%m-%dT%H:%M:%S',
+                      )
+  logging.getLogger().setLevel(logging.INFO)
   start_search_server()
diff --git a/code_search/src/requirements.nmslib.txt b/code_search/src/requirements.nmslib.txt
new file mode 100644
index 00000000..7dd54f5f
--- /dev/null
+++ b/code_search/src/requirements.nmslib.txt
@@ -0,0 +1,2 @@
+# Requirements to run nmslib.
+nmslib~=1.7.0
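
Note on what the index-creation step actually does: search_engine.CodeSearchEngine.create_index is not part of this diff, but with nmslib~=1.7.0 the build amounts to the standard HNSW build-and-save sequence. A minimal sketch, assuming cosine similarity and a float32 embedding matrix; the parameter values are illustrative, not the repo's actual settings:

import nmslib
import numpy as np


def create_index(embeddings_data, index_file):
  """Build an HNSW index over the embeddings and save it to disk.

  embeddings_data: float32 array of shape [num_functions, embedding_dim];
  row i must correspond to row i of the lookup CSV.
  """
  index = nmslib.init(method='hnsw', space='cosinesimil')
  index.addDataPointBatch(embeddings_data)
  # 'post' enables post-construction optimization passes; higher values
  # improve recall at the cost of build time.
  index.createIndex({'post': 2}, print_progress=True)
  index.saveIndex(index_file)


if __name__ == '__main__':
  data = np.random.rand(1000, 128).astype(np.float32)
  create_index(data, '/tmp/embeddings.index')

The whole embedding matrix plus the HNSW graph must fit in RAM while the index is built, which is why the job above requests 32Gi and the demo cluster gains a high-memory n1-standard-32 pool.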
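
Similarly, the lookupFile/indexFile pair added to experiments.libsonnet exists because the nmslib index stores only vectors: at serving time the server must map a neighbor's integer id back to the function's metadata through the CSV. A sketch of that reverse-lookup pattern (hypothetical local file names; the real logic lives in code_search.nmslib, which this diff does not touch):

import csv

import nmslib
import numpy as np

# Load the index written by create_search_index.py.
index = nmslib.init(method='hnsw', space='cosinesimil')
index.loadIndex('embeddings.index')

# Row i of the lookup CSV describes the function whose embedding was
# inserted as data point i, so it serves as the reverse mapping.
with open('embedding-to-info.csv') as f:
  lookup = list(csv.reader(f))

# Embed the query (a placeholder random vector here), then fetch neighbors.
query = np.random.rand(128).astype(np.float32)
ids, distances = index.knnQuery(query, k=5)
for i, dist in zip(ids, distances):
  print(dist, lookup[i])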