mirror of https://github.com/kubeflow/examples.git
			Fix the K8s job to create the nmslib index. (#338)
* Install nmslib in the Dataflow container so it is suitable for running the index creation job.
* Use command, not args, in the job specs.
* Dockerfile.dataflow should install nmslib so that we can use that Docker image to create the index.
* build.jsonnet should tag images as latest. We will use the latest images as a layer cache to speed up builds.
* Set the logging level to info for start_search_server.py and create_search_index.py.
* The create-search-index pod kept getting evicted because the node ran out of memory.
* Add a new node pool of n1-standard-32 nodes to the demo cluster. These have 120 GB of RAM, compared to 30 GB in our default pool of n1-standard-8 nodes.
* Set requests and limits on the search-index-creator pod.
* Move all the config for the search-index-creator job into search-index-creator.jsonnet. We need to customize the memory resources, so there is little value in trying to share config with other components.
parent a402db1ccc
commit d2b68f15d7
@@ -58,7 +58,8 @@ build-gcb:
	cp -r ./src ./build/
	rm -rf ./build/src/code_search/dataflow/cli/test_data
	rm -rf ./build/src/code_search/t2t/test_data
	gcloud builds submit --machine-type=n1-highcpu-32 --project=kubeflow-ci --config=./build/build.json ./build
	gcloud builds submit --machine-type=n1-highcpu-32 --project=kubeflow-ci --config=./build/build.json \
		--timeout=3600 ./build


# Build but don't attach the latest tag. This allows manual testing/inspection of the image

@@ -37,7 +37,7 @@ resources:
    # bump this if you want to modify the node pools.
    # This will cause existing node pools to be deleted and new ones to be created.
    # Use prefix v so it will be treated as a string.
    pool-version: v1
    pool-version: v2
    # Two is small enough to fit within default quota.
    cpu-pool-initialNodeCount: 2
    gpu-pool-initialNodeCount: 0

@@ -16,6 +16,7 @@ limitations under the License.
{% set CLUSTER_NAME = NAME_PREFIX %}
{% set CPU_POOL = NAME_PREFIX + '-cpu-pool-' + properties['pool-version'] %}
{% set GPU_POOL = NAME_PREFIX + '-gpu-pool-' + properties['pool-version'] %}
{% set LARGE_POOL = NAME_PREFIX + '-large-pool-' + properties['pool-version'] %}

{# Type names are the names to give to deployment manager type providers
   that will be created to represent Kubernetes objects.
@@ -152,6 +153,41 @@ resources:
    # We can only create 1 node pool at a time.
    - {{ CLUSTER_NAME }}

# Add a high memory pool because creating the search index requires a lot of memory.
- name: {{ LARGE_POOL }}
  {% if properties['gkeApiVersion'] == 'v1beta1' %}
  type: gcp-types/container-v1beta1:projects.locations.clusters.nodePools
  {% else %}
  type: container.v1.nodePool
  {% endif %}
  properties:
    parent: projects/{{ env['project'] }}/locations/{{ properties['zone'] }}/clusters/{{ CLUSTER_NAME }}
    project: {{ properties['securityConfig']['project'] }}
    zone: {{ properties['zone'] }}
    clusterId: {{ CLUSTER_NAME }}
    nodePool:
      name: large-pool
      initialNodeCount: 0
      autoscaling:
        enabled: true
        minNodeCount: 1
        maxNodeCount: 10
      config:
        {% if properties['securityConfig']['secureNodeMetadata'] %}
        workloadMetadataConfig:
          nodeMetadata: SECURE
        {% endif %}
        machineType: n1-standard-32
        serviceAccount: {{ KF_VM_SA_NAME }}@{{ env['project'] }}.iam.gserviceaccount.com
        oauthScopes: {{ VM_OAUTH_SCOPES }}
        # Set min cpu platform to ensure AVX2 is supported.
        minCpuPlatform: 'Intel Haswell'

  metadata:
    dependsOn:
    # We can only create 1 node pool at a time.
    - {{ GPU_POOL }}

{# Project defaults to the project of the deployment. #}
- name: {{ properties['ipName'] }}
  type: compute.v1.globalAddress

@@ -1,4 +1,5 @@
# Dockerfile suitable for submitting Dataflow jobs.
# Dockerfile suitable for submitting Dataflow jobs and for running the nmslib index creator.
#
# We don't use the Docker image used for running the training jobs
# because we have different versioning requirements.
FROM python:2.7-jessie
@@ -11,6 +12,10 @@ COPY src/requirements.dataflow.txt /tmp/requirements.dataflow.txt
RUN pip install -r /tmp/requirements.dataflow.txt
RUN pip install https://github.com/kubeflow/batch-predict/tarball/master

# Install nmslib requirements so that we can create the index
COPY src/requirements.nmslib.txt /tmp/requirements.nmslib.txt
RUN pip install -r /tmp/requirements.nmslib.txt

# install the spacy model
RUN python -m spacy download en

@@ -5,30 +5,81 @@

  "steps": [
    {
      "id": "pull-cpu",
      "name": "gcr.io/cloud-builders/docker",
      "args": ["pull", "gcr.io/kubeflow-examples/code-search:latest"],
      "waitFor": ["-"],
    },
    {
      "id": "build-cpu",
      "name": "gcr.io/cloud-builders/docker",
      "args": ["build", "-t", "gcr.io/kubeflow-examples/code-search:" + std.extVar("tag"),
               "--label=git-versions=" + std.extVar("gitVersion"),
               "--build-arg", "BASE_IMAGE_TAG=1.11.0",
               "--file=docker/t2t/Dockerfile", "."],
               "--file=docker/t2t/Dockerfile",
               "--cache-from=gcr.io/kubeflow-examples/code-search:latest",
               "."],
      "waitFor": ["pull-cpu"],
    },
    {
      "id": "tag-cpu",
      "name": "gcr.io/cloud-builders/docker",
      "args": ["tag", "gcr.io/kubeflow-examples/code-search:" + std.extVar("tag"),
               "gcr.io/kubeflow-examples/code-search:latest",],
      "waitFor": ["build-cpu"],
    },
    {
      "id": "pull-gpu",
      "name": "gcr.io/cloud-builders/docker",
      "args": ["pull", "gcr.io/kubeflow-examples/code-search-gpu:latest"],
      "waitFor": ["-"],
    },
    {
      "id": "build-gpu",
      "name": "gcr.io/cloud-builders/docker",
      "args": ["build", "-t", "gcr.io/kubeflow-examples/code-search-gpu:" + std.extVar("tag"),
               "--label=git-versions=" + std.extVar("gitVersion"),
               "--build-arg", "BASE_IMAGE_TAG=1.11.0-gpu",
               "--file=docker/t2t/Dockerfile", "."],
               "--file=docker/t2t/Dockerfile",
               "--cache-from=gcr.io/kubeflow-examples/code-search-gpu:latest",
               "."],
      "waitFor": ["pull-gpu"],
    },
    {
      "id": "tag-gpu",
      "name": "gcr.io/cloud-builders/docker",
      "args": ["tag", "gcr.io/kubeflow-examples/code-search-gpu:" + std.extVar("tag"),
               "gcr.io/kubeflow-examples/code-search-gpu:latest",],
      "waitFor": ["build-gpu"],
    },
    {
      "id": "pull-dataflow",
      "name": "gcr.io/cloud-builders/docker",
      "args": ["pull", "gcr.io/kubeflow-examples/code-search-dataflow:latest"],
      "waitFor": ["-"],
    },
    {
      "id": "build-dataflow",
      "name": "gcr.io/cloud-builders/docker",
      "args": ["build", "-t", "gcr.io/kubeflow-examples/code-search-dataflow:" + std.extVar("tag"),
               "--label=git-versions=" + std.extVar("gitVersion"),
               "--file=docker/t2t/Dockerfile.dataflow", "."],
      "waitFor": ["-"],
               "--file=docker/t2t/Dockerfile.dataflow",
               "--cache-from=gcr.io/kubeflow-examples/code-search-dataflow:latest",
               "."],
      "waitFor": ["pull-dataflow"],
    },
    {
      "id": "tag-dataflow",
      "name": "gcr.io/cloud-builders/docker",
      "args": ["tag", "gcr.io/kubeflow-examples/code-search-dataflow:" + std.extVar("tag"),
               "gcr.io/kubeflow-examples/code-search-dataflow:latest",],
      "waitFor": ["build-dataflow"],
    },
  ],
  "images": ["gcr.io/kubeflow-examples/code-search:" + std.extVar("tag"),
             "gcr.io/kubeflow-examples/code-search:latest",
             "gcr.io/kubeflow-examples/code-search-gpu:" + std.extVar("tag"),
             "gcr.io/kubeflow-examples/code-search-dataflow:" + std.extVar("tag")],
             "gcr.io/kubeflow-examples/code-search-gpu:latest",
             "gcr.io/kubeflow-examples/code-search-dataflow:" + std.extVar("tag"),
             "gcr.io/kubeflow-examples/code-search-dataflow:latest"],
}

@@ -11,5 +11,10 @@
    modelDir: "gs://code-search-demo/models/20181107-dist-sync-gpu/export/1541712907/",
    problem: "kf_github_function_docstring",
    model: "kf_similarity_transformer",

    // Location to write the index file for nmslib and the file to be used as the reverse lookup
    // with the index server.
    lookupFile: "gs://code-search-demo/20181104/code-embeddings-index/embedding-to-info.csv",
    indexFile: "gs://code-search-demo/20181104/code-embeddings-index/embeddings.index",
  },
}

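The two new parameters work as a pair: the index file holds the nmslib nearest-neighbor index, and the lookup CSV maps each integer id the index returns back to the function information served by the index server. The snippet below is only a minimal sketch of that reverse-lookup pattern; the local file names and the 128-dimensional query vector are placeholders, and the real serving logic lives in code_search.nmslib.cli.start_search_server.

import csv

import nmslib
import numpy as np

# Load a local copy of the index (params.indexFile) and the lookup CSV (params.lookupFile).
index = nmslib.init(method="hnsw", space="cosinesimil")
index.loadIndex("embeddings.index")
with open("embedding-to-info.csv") as f:
  lookup = list(csv.reader(f))

# Query with an embedding vector; nmslib returns row ids that index into the CSV.
query = np.random.rand(128).astype(np.float32)  # placeholder query embedding
ids, distances = index.knnQuery(query, k=5)
for i, dist in zip(ids, distances):
  print(dist, lookup[i])
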
@@ -62,7 +62,7 @@ local baseParams = std.extVar("__ksonnet/params").components["nmslib"];
  containerSpec(params, env=[], volumeMounts=[], ports=[]):: {
    name: params.name,
    image: params.image,
    args: params.args,
    command: params.command,
    ports: ports,
    env: env,
    volumeMounts: volumeMounts,
@@ -132,30 +132,10 @@ local baseParams = std.extVar("__ksonnet/params").components["nmslib"];
      },
    ],

    creator:: {
      local creatorParams = params + {
        args: [
          "-m",
          "code_search.nmslib.cli.create_search_index",
          "--data_dir=" + params.dataDir,
          "--lookup_file=" + params.lookupFile,
          "--index_file=" + params.indexFile,
        ],
      },

      all: [
        $.jobSpec(creatorParams, env,
                  [
                    $.containerSpec(creatorParams, env=containerEnv,
                                    volumeMounts=containerVolumeMounts)
                  ],
                  volumes=volumes),
      ],
    }.all,

    server:: {
      local serverParams = params + {
        args: [
        command: [
          "python",
          "-m",
          "code_search.nmslib.cli.start_search_server",
          "--problem=" + params.problem,

@@ -8,7 +8,7 @@
    // are not picked up by the individual components.
    // Need to see if we can find a way to fix this.

    local imageTag = "v20181108-004b5ad-dirty-eba459",
    local imageTag = "v20181117-3c030ae-dirty-4d809c",
    "t2t-job": {
      jobType: "trainer",
      numChief: 0,
@@ -20,7 +20,7 @@
      eval_steps: 10,
      image: "gcr.io/kubeflow-examples/code-search:" + imageTag,
      imageGpu: "gcr.io/kubeflow-examples/code-search-gpu:" + imageTag,
      dataflowImage: "gcr.io/kubeflow-examples/code-search-dataflow:v20181109-dc79384",
      dataflowImage: "gcr.io/kubeflow-examples/code-search-dataflow:" + imageTag,

      imagePullSecrets: [],
      // TODO(jlewi): dataDir doesn't seem to be used.

@@ -3,5 +3,82 @@ local nms = import "nms.libsonnet";

local env = std.extVar("__ksonnet/environments");
local params = std.extVar("__ksonnet/params").components["search-index-creator"];
local experiments = import "experiments.libsonnet";

std.prune(k.core.v1.list.new(nms.parts(params, env).creator))
local baseParams = std.extVar("__ksonnet/params").components["submit-code-embeddings-job"];
local experimentName = baseParams.experiment;
local params = baseParams + experiments[experimentName] + {
  name: experimentName + "-create-search-index",
};

local jobSpec = {
  apiVersion: "batch/v1",
  kind: "Job",
  metadata: {
    name: params.name,
    namespace: env.namespace,
    labels: {
      app: params.name,
    },
  },
  spec: {
    replicas: 1,
    template: {
      metadata: {
        labels: {
          app: params.name,
        },
      },
      spec: {
        // Don't restart because all the job should do is launch the Dataflow job.
        restartPolicy: "Never",
        containers: [
          {
            name: "dataflow",
            image: params.image,
            command: [
              "python",
              "-m",
              "code_search.nmslib.cli.create_search_index",
              "--data_dir=" + params.dataDir,
              "--lookup_file=" + params.lookupFile,
              "--index_file=" + params.indexFile,
            ],
            env: [
              {
                name: "GOOGLE_APPLICATION_CREDENTIALS",
                value: "/secret/gcp-credentials/user-gcp-sa.json",
              },
            ],
            // Creating the index requires a lot of memory.
            resources: {
              requests: {
                memory: "32Gi",
              },
              limits: {
                memory: "100Gi",
              },
            },
            workingDir: "/src",
            volumeMounts: [
              {
                mountPath: "/secret/gcp-credentials",
                name: "gcp-credentials",
              },
            ],  // volumeMounts
          },
        ],  // containers
        volumes: [
          {
            name: "gcp-credentials",
            secret: {
              secretName: "user-gcp-sa",
            },
          },
        ],
      },  // spec
    },
  },
};

std.prune(k.core.v1.list.new(jobSpec))

@@ -101,3 +101,4 @@ local deployment = {
};

std.prune(k.core.v1.list.new([service, deployment]))

@@ -1,4 +1,5 @@
import csv
import logging
import os
import numpy as np
import tensorflow as tf
@@ -23,6 +24,7 @@ def create_search_index(argv=None):
  args = arguments.parse_arguments(argv)

  if not os.path.isdir(args.tmp_dir):
    logging.info("Creating directory %s", args.tmp_dir)
    os.makedirs(args.tmp_dir)

  tmp_index_file = os.path.join(args.tmp_dir, os.path.basename(args.index_file))
@@ -34,7 +36,7 @@ def create_search_index(argv=None):
    lookup_writer = csv.writer(lookup_file)

    for csv_file_path in tf.gfile.Glob('{}/*index*.csv'.format(args.data_dir)):
      tf.logging.debug('Reading {}'.format(csv_file_path))
      logging.info('Reading %s', csv_file_path)

      with tf.gfile.Open(csv_file_path) as csv_file:
        reader = csv.reader(csv_file)
@@ -49,9 +51,19 @@ def create_search_index(argv=None):

  search_engine.CodeSearchEngine.create_index(embeddings_data, tmp_index_file)

  logging.info("Copying file %s to %s", tmp_lookup_file, args.lookup_file)
  tf.gfile.Copy(tmp_lookup_file, args.lookup_file)
  logging.info("Copying file %s to %s", tmp_index_file, args.index_file)
  tf.gfile.Copy(tmp_index_file, args.index_file)
  logging.info("Finished creating the index")


if __name__ == '__main__':
  logging.basicConfig(level=logging.INFO,
                      format=('%(levelname)s|%(asctime)s'
                              '|%(pathname)s|%(lineno)d| %(message)s'),
                      datefmt='%Y-%m-%dT%H:%M:%S',
                      )
  logging.getLogger().setLevel(logging.INFO)
  logging.info("Creating the search index")
  create_search_index()

@@ -85,4 +85,10 @@ def start_search_server(argv=None):



if __name__ == '__main__':
  logging.basicConfig(level=logging.INFO,
                      format=('%(levelname)s|%(asctime)s'
                              '|%(pathname)s|%(lineno)d| %(message)s'),
                      datefmt='%Y-%m-%dT%H:%M:%S',
                      )
  logging.getLogger().setLevel(logging.INFO)
  start_search_server()

@@ -0,0 +1,2 @@
# Requirements to run nmslib.
nmslib~=1.7.0
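
requirements.nmslib.txt pins nmslib, the library create_search_index uses to build the approximate nearest-neighbor index over the code embeddings. As a rough orientation, generic nmslib 1.7 index creation looks like the sketch below; this is not the repo's CodeSearchEngine.create_index, and the embedding array is a stand-in.

import nmslib
import numpy as np

embeddings = np.random.rand(1000, 128).astype(np.float32)  # stand-in for the embedding matrix

# Build an HNSW index over the vectors and write it to disk.
index = nmslib.init(method="hnsw", space="cosinesimil")
index.addDataPointBatch(embeddings)  # ids default to 0..n-1 when none are given
index.createIndex({"post": 2}, print_progress=True)
index.saveIndex("embeddings.index")  # the job then copies the file to the GCS indexFile path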