From d2b68f15d784796d0ea819d10ec632db94b48311 Mon Sep 17 00:00:00 2001
From: Jeremy Lewi
Date: Tue, 20 Nov 2018 12:53:09 -0800
Subject: [PATCH] Fix the K8s job to create the nmslib index. (#338)

* Install nmslib in the Dataflow container so it's suitable for running the index creation job. (Two short nmslib sketches follow the patch below.)

* Use command, not args, in the job specs.

* Dockerfile.dataflow should install nmslib so that we can use that Docker image to create the index.

* build.jsonnet should tag images as latest. We will use the latest images as a layer cache to speed up builds.

* Set the logging level to info for start_search_server.py and create_search_index.py.

* The create-search-index pod kept getting evicted because its node was running out of memory.

* Add a new node pool consisting of n1-standard-32 nodes to the demo cluster. These have 120 GB of RAM, compared to 30 GB in our default pool of n1-standard-8 nodes.

* Set requests and limits on the search-index-creator pod.

* Move all the config for the search-index-creator job into the search-index-creator.jsonnet file. We need to customize the memory resources, so there's little value in trying to share config with other components.
---
 code_search/Makefile                          |  3 +-
 .../gcp_config/cluster-kubeflow.yaml          |  2 +-
 .../cs-demo-1103/gcp_config/cluster.jinja     | 36 +++++++++
 code_search/docker/t2t/Dockerfile.dataflow    |  7 +-
 code_search/docker/t2t/build.jsonnet          | 65 +++++++++++++--
 .../kubeflow/components/experiments.libsonnet |  5 ++
 code_search/kubeflow/components/nms.libsonnet | 26 +-----
 .../kubeflow/components/params.libsonnet      |  4 +-
 .../components/search-index-creator.jsonnet   | 79 ++++++++++++++++++-
 .../kubeflow/components/tensorboard.jsonnet   |  1 +
 .../nmslib/cli/create_search_index.py         | 14 +++-
 .../nmslib/cli/start_search_server.py         |  6 ++
 code_search/src/requirements.nmslib.txt       |  2 +
 13 files changed, 213 insertions(+), 37 deletions(-)
 create mode 100644 code_search/src/requirements.nmslib.txt

diff --git a/code_search/Makefile b/code_search/Makefile
index 0cef8439..f5e39706 100644
--- a/code_search/Makefile
+++ b/code_search/Makefile
@@ -58,7 +58,8 @@ build-gcb:
 	cp -r ./src ./build/
 	rm -rf ./build/src/code_search/dataflow/cli/test_data
 	rm -rf ./build/src/code_search/t2t/test_data
-	gcloud builds submit --machine-type=n1-highcpu-32 --project=kubeflow-ci --config=./build/build.json ./build
+	gcloud builds submit --machine-type=n1-highcpu-32 --project=kubeflow-ci --config=./build/build.json \
+		--timeout=3600 ./build
 
 
 # Build but don't attach the latest tag. This allows manual testing/inspection of the image
diff --git a/code_search/demo/cs-demo-1103/gcp_config/cluster-kubeflow.yaml b/code_search/demo/cs-demo-1103/gcp_config/cluster-kubeflow.yaml
index ebb923d8..8d9fa077 100644
--- a/code_search/demo/cs-demo-1103/gcp_config/cluster-kubeflow.yaml
+++ b/code_search/demo/cs-demo-1103/gcp_config/cluster-kubeflow.yaml
@@ -37,7 +37,7 @@ resources:
   # bump this if you want to modify the node pools.
   # This will cause existing node pools to be deleted and new ones to be created.
   # Use prefix v so it will be treated as a string.
-  pool-version: v1
+  pool-version: v2
   # Two is small enough to fit within default quota.
   cpu-pool-initialNodeCount: 2
   gpu-pool-initialNodeCount: 0
diff --git a/code_search/demo/cs-demo-1103/gcp_config/cluster.jinja b/code_search/demo/cs-demo-1103/gcp_config/cluster.jinja
index 46114f9b..1ae000b1 100644
--- a/code_search/demo/cs-demo-1103/gcp_config/cluster.jinja
+++ b/code_search/demo/cs-demo-1103/gcp_config/cluster.jinja
@@ -16,6 +16,7 @@ limitations under the License.
 {% set CLUSTER_NAME = NAME_PREFIX %}
 {% set CPU_POOL = NAME_PREFIX + '-cpu-pool-' + properties['pool-version'] %}
 {% set GPU_POOL = NAME_PREFIX + '-gpu-pool-' + properties['pool-version'] %}
+{% set LARGE_POOL = NAME_PREFIX + '-large-pool-' + properties['pool-version'] %}
 
 {# Type names are the names to give to deployment manager type providers
    that will be created to represent Kubernetes objects.
@@ -152,6 +153,41 @@ resources:
       # We can only create 1 node pool at a time.
       - {{ CLUSTER_NAME }}
 
+# Add a high-memory pool because creating the search index requires a lot of memory.
+- name: {{ LARGE_POOL }}
+  {% if properties['gkeApiVersion'] == 'v1beta1' %}
+  type: gcp-types/container-v1beta1:projects.locations.clusters.nodePools
+  {% else %}
+  type: container.v1.nodePool
+  {% endif %}
+  properties:
+    parent: projects/{{ env['project'] }}/locations/{{ properties['zone'] }}/clusters/{{ CLUSTER_NAME }}
+    project: {{ properties['securityConfig']['project'] }}
+    zone: {{ properties['zone'] }}
+    clusterId: {{ CLUSTER_NAME }}
+    nodePool:
+      name: large-pool
+      initialNodeCount: 0
+      autoscaling:
+        enabled: true
+        minNodeCount: 1
+        maxNodeCount: 10
+      config:
+        {% if properties['securityConfig']['secureNodeMetadata'] %}
+        workloadMetadataConfig:
+          nodeMetadata: SECURE
+        {% endif %}
+        machineType: n1-standard-32
+        serviceAccount: {{ KF_VM_SA_NAME }}@{{ env['project'] }}.iam.gserviceaccount.com
+        oauthScopes: {{ VM_OAUTH_SCOPES }}
+        # Set min cpu platform to ensure AVX2 is supported.
+        minCpuPlatform: 'Intel Haswell'
+
+  metadata:
+    dependsOn:
+      # We can only create 1 node pool at a time.
+      - {{ GPU_POOL }}
+
 {# Project defaults to the project of the deployment. #}
 - name: {{ properties['ipName'] }}
   type: compute.v1.globalAddress
diff --git a/code_search/docker/t2t/Dockerfile.dataflow b/code_search/docker/t2t/Dockerfile.dataflow
index c74a24aa..06a12aee 100644
--- a/code_search/docker/t2t/Dockerfile.dataflow
+++ b/code_search/docker/t2t/Dockerfile.dataflow
@@ -1,4 +1,5 @@
-# Dockerfile suitable for submitting Dataflow jobs.
+# Dockerfile suitable for submitting Dataflow jobs and for running the nmslib index creator.
+#
 # We don't use the Docker image used for running the training jobs
 # because we have different versioning requirements.
FROM python:2.7-jessie @@ -11,6 +12,10 @@ COPY src/requirements.dataflow.txt /tmp/requirements.dataflow.txt RUN pip install -r /tmp/requirements.dataflow.txt RUN pip install https://github.com/kubeflow/batch-predict/tarball/master +# Install nmslib requirements so that we can create the index +COPY src/requirements.nmslib.txt /tmp/requirements.nmslib.txt +RUN pip install -r /tmp/requirements.nmslib.txt + # install the spacy model RUN python -m spacy download en diff --git a/code_search/docker/t2t/build.jsonnet b/code_search/docker/t2t/build.jsonnet index 711dca64..0131d624 100644 --- a/code_search/docker/t2t/build.jsonnet +++ b/code_search/docker/t2t/build.jsonnet @@ -3,32 +3,83 @@ // https://cloud.google.com/cloud-build/docs/speeding-up-builds#using_a_cached_docker_image { - "steps": [ + "steps": [ { + "id": "pull-cpu", + "name": "gcr.io/cloud-builders/docker", + "args": ["pull", "gcr.io/kubeflow-examples/code-search:latest"], + "waitFor": ["-"], + }, + { + "id": "build-cpu", "name": "gcr.io/cloud-builders/docker", "args": ["build", "-t", "gcr.io/kubeflow-examples/code-search:" + std.extVar("tag"), "--label=git-versions=" + std.extVar("gitVersion"), "--build-arg", "BASE_IMAGE_TAG=1.11.0", - "--file=docker/t2t/Dockerfile", "."], + "--file=docker/t2t/Dockerfile", + "--cache-from=gcr.io/kubeflow-examples/code-search:latest", + "."], + "waitFor": ["pull-cpu"], + }, + { + "id": "tag-cpu", + "name": "gcr.io/cloud-builders/docker", + "args": ["tag", "gcr.io/kubeflow-examples/code-search:" + std.extVar("tag"), + "gcr.io/kubeflow-examples/code-search:latest",], + "waitFor": ["build-cpu"], + }, + { + "id": "pull-gpu", + "name": "gcr.io/cloud-builders/docker", + "args": ["pull", "gcr.io/kubeflow-examples/code-search-gpu:latest"], "waitFor": ["-"], }, { - "name": "gcr.io/cloud-builders/docker", + "id": "build-gpu", + "name": "gcr.io/cloud-builders/docker", "args": ["build", "-t", "gcr.io/kubeflow-examples/code-search-gpu:" + std.extVar("tag"), "--label=git-versions=" + std.extVar("gitVersion"), "--build-arg", "BASE_IMAGE_TAG=1.11.0-gpu", - "--file=docker/t2t/Dockerfile", "."], + "--file=docker/t2t/Dockerfile", + "--cache-from=gcr.io/kubeflow-examples/code-search-gpu:latest", + "."], + "waitFor": ["pull-gpu"], + }, + { + "id": "tag-gpu", + "name": "gcr.io/cloud-builders/docker", + "args": ["tag", "gcr.io/kubeflow-examples/code-search-gpu:" + std.extVar("tag"), + "gcr.io/kubeflow-examples/code-search-gpu:latest",], + "waitFor": ["build-gpu"], + }, + { + "id": "pull-dataflow", + "name": "gcr.io/cloud-builders/docker", + "args": ["pull", "gcr.io/kubeflow-examples/code-search-dataflow:latest"], "waitFor": ["-"], }, { + "id": "build-dataflow", "name": "gcr.io/cloud-builders/docker", "args": ["build", "-t", "gcr.io/kubeflow-examples/code-search-dataflow:" + std.extVar("tag"), "--label=git-versions=" + std.extVar("gitVersion"), - "--file=docker/t2t/Dockerfile.dataflow", "."], - "waitFor": ["-"], + "--file=docker/t2t/Dockerfile.dataflow", + "--cache-from=gcr.io/kubeflow-examples/code-search-dataflow:latest", + "."], + "waitFor": ["pull-dataflow"], + }, + { + "id": "tag-dataflow", + "name": "gcr.io/cloud-builders/docker", + "args": ["tag", "gcr.io/kubeflow-examples/code-search-dataflow:" + std.extVar("tag"), + "gcr.io/kubeflow-examples/code-search-dataflow:latest",], + "waitFor": ["build-dataflow"], }, ], "images": ["gcr.io/kubeflow-examples/code-search:" + std.extVar("tag"), + "gcr.io/kubeflow-examples/code-search:latest", "gcr.io/kubeflow-examples/code-search-gpu:" + std.extVar("tag"), - 
"gcr.io/kubeflow-examples/code-search-dataflow:" + std.extVar("tag")], + "gcr.io/kubeflow-examples/code-search-gpu:latest", + "gcr.io/kubeflow-examples/code-search-dataflow:" + std.extVar("tag"), + "gcr.io/kubeflow-examples/code-search-dataflow:latest"], } \ No newline at end of file diff --git a/code_search/kubeflow/components/experiments.libsonnet b/code_search/kubeflow/components/experiments.libsonnet index 926e4656..f9a1e381 100644 --- a/code_search/kubeflow/components/experiments.libsonnet +++ b/code_search/kubeflow/components/experiments.libsonnet @@ -11,5 +11,10 @@ modelDir: "gs://code-search-demo/models/20181107-dist-sync-gpu/export/1541712907/", problem: "kf_github_function_docstring", model: "kf_similarity_transformer", + + // Location to write the index file for nmslib and the file to be used as the reverse lookup + // with the index server. + lookupFile: "gs://code-search-demo/20181104/code-embeddings-index/embedding-to-info.csv", + indexFile: "gs://code-search-demo/20181104/code-embeddings-index/embeddings.index", }, } diff --git a/code_search/kubeflow/components/nms.libsonnet b/code_search/kubeflow/components/nms.libsonnet index aaa1508e..cca6202f 100644 --- a/code_search/kubeflow/components/nms.libsonnet +++ b/code_search/kubeflow/components/nms.libsonnet @@ -62,7 +62,7 @@ local baseParams = std.extVar("__ksonnet/params").components["nmslib"]; containerSpec(params, env=[], volumeMounts=[], ports=[]):: { name: params.name, image: params.image, - args: params.args, + command: params.command, ports: ports, env: env, volumeMounts: volumeMounts, @@ -132,30 +132,10 @@ local baseParams = std.extVar("__ksonnet/params").components["nmslib"]; }, ], - creator:: { - local creatorParams = params + { - args: [ - "-m", - "code_search.nmslib.cli.create_search_index", - "--data_dir=" + params.dataDir, - "--lookup_file=" + params.lookupFile, - "--index_file=" + params.indexFile, - ], - }, - - all: [ - $.jobSpec(creatorParams, env, - [ - $.containerSpec(creatorParams, env=containerEnv, - volumeMounts=containerVolumeMounts) - ], - volumes=volumes), - ], - }.all, - server:: { local serverParams = params + { - args: [ + command: [ + "python", "-m", "code_search.nmslib.cli.start_search_server", "--problem=" + params.problem, diff --git a/code_search/kubeflow/components/params.libsonnet b/code_search/kubeflow/components/params.libsonnet index d74e7fa5..676ad98a 100644 --- a/code_search/kubeflow/components/params.libsonnet +++ b/code_search/kubeflow/components/params.libsonnet @@ -8,7 +8,7 @@ // are not picked up by the individual components. // Need to see if we can find a way to fix this. - local imageTag = "v20181108-004b5ad-dirty-eba459", + local imageTag = "v20181117-3c030ae-dirty-4d809c", "t2t-job": { jobType: "trainer", numChief: 0, @@ -20,7 +20,7 @@ eval_steps: 10, image: "gcr.io/kubeflow-examples/code-search:" + imageTag, imageGpu: "gcr.io/kubeflow-examples/code-search-gpu:" + imageTag, - dataflowImage: "gcr.io/kubeflow-examples/code-search-dataflow:v20181109-dc79384", + dataflowImage: "gcr.io/kubeflow-examples/code-search-dataflow:" + imageTag, imagePullSecrets: [], // TODO(jlewi): dataDir doesn't seem to be used. 
diff --git a/code_search/kubeflow/components/search-index-creator.jsonnet b/code_search/kubeflow/components/search-index-creator.jsonnet
index b66c72c3..86ab98ee 100644
--- a/code_search/kubeflow/components/search-index-creator.jsonnet
+++ b/code_search/kubeflow/components/search-index-creator.jsonnet
@@ -3,5 +3,82 @@ local nms = import "nms.libsonnet";
 local env = std.extVar("__ksonnet/environments");
 local params = std.extVar("__ksonnet/params").components["search-index-creator"];
+local experiments = import "experiments.libsonnet";
 
-std.prune(k.core.v1.list.new(nms.parts(params, env).creator))
+local baseParams = std.extVar("__ksonnet/params").components["submit-code-embeddings-job"];
+local experimentName = baseParams.experiment;
+local params = baseParams + experiments[experimentName] + {
+  name: experimentName + "-create-search-index",
+};
+
+local jobSpec = {
+  apiVersion: "batch/v1",
+  kind: "Job",
+  metadata: {
+    name: params.name,
+    namespace: env.namespace,
+    labels: {
+      app: params.name,
+    },
+  },
+  spec: {
+    completions: 1,
+    template: {
+      metadata: {
+        labels: {
+          app: params.name,
+        },
+      },
+      spec: {
+        // Don't restart; the job should run the index creation once to completion.
+        restartPolicy: "Never",
+        containers: [
+          {
+            name: "index-creator",
+            image: params.image,
+            command: [
+              "python",
+              "-m",
+              "code_search.nmslib.cli.create_search_index",
+              "--data_dir=" + params.dataDir,
+              "--lookup_file=" + params.lookupFile,
+              "--index_file=" + params.indexFile,
+            ],
+            env: [
+              {
+                name: "GOOGLE_APPLICATION_CREDENTIALS",
+                value: "/secret/gcp-credentials/user-gcp-sa.json",
+              },
+            ],
+            // Creating the index requires a lot of memory.
+            resources: {
+              requests: {
+                memory: "32Gi"
+              },
+              limits: {
+                memory: "100Gi"
+              },
+            },
+            workingDir: "/src",
+            volumeMounts: [
+              {
+                mountPath: "/secret/gcp-credentials",
+                name: "gcp-credentials",
+              },
+            ],  // volumeMounts
+          },
+        ],  // containers
+        volumes: [
+          {
+            name: "gcp-credentials",
+            secret: {
+              secretName: "user-gcp-sa",
+            },
+          },
+        ],
+      },  // spec
+    },
+  },
+};
+
+std.prune(k.core.v1.list.new(jobSpec))
diff --git a/code_search/kubeflow/components/tensorboard.jsonnet b/code_search/kubeflow/components/tensorboard.jsonnet
index c4311e74..2af0b60d 100644
--- a/code_search/kubeflow/components/tensorboard.jsonnet
+++ b/code_search/kubeflow/components/tensorboard.jsonnet
@@ -101,3 +101,4 @@ local deployment = {
 };
 
 std.prune(k.core.v1.list.new([service, deployment]))
+
diff --git a/code_search/src/code_search/nmslib/cli/create_search_index.py b/code_search/src/code_search/nmslib/cli/create_search_index.py
index ddb26520..56998004 100644
--- a/code_search/src/code_search/nmslib/cli/create_search_index.py
+++ b/code_search/src/code_search/nmslib/cli/create_search_index.py
@@ -1,4 +1,5 @@
 import csv
+import logging
 import os
 import numpy as np
 import tensorflow as tf
@@ -23,6 +24,7 @@ def create_search_index(argv=None):
   args = arguments.parse_arguments(argv)
 
   if not os.path.isdir(args.tmp_dir):
+    logging.info("Creating directory %s", args.tmp_dir)
     os.makedirs(args.tmp_dir)
 
   tmp_index_file = os.path.join(args.tmp_dir, os.path.basename(args.index_file))
@@ -34,7 +36,7 @@ def create_search_index(argv=None):
     lookup_writer = csv.writer(lookup_file)
 
     for csv_file_path in tf.gfile.Glob('{}/*index*.csv'.format(args.data_dir)):
-      tf.logging.debug('Reading {}'.format(csv_file_path))
+      logging.info('Reading %s', csv_file_path)
 
       with tf.gfile.Open(csv_file_path) as csv_file:
         reader = csv.reader(csv_file)
@@ -49,9 +51,19 @@
   search_engine.CodeSearchEngine.create_index(embeddings_data, tmp_index_file)
 
+  logging.info("Copying file %s to %s", tmp_lookup_file, args.lookup_file)
   tf.gfile.Copy(tmp_lookup_file, args.lookup_file)
+  logging.info("Copying file %s to %s", tmp_index_file, args.index_file)
   tf.gfile.Copy(tmp_index_file, args.index_file)
+  logging.info("Finished creating the index")
 
 
 if __name__ == '__main__':
+  logging.basicConfig(level=logging.INFO,
+                      format=('%(levelname)s|%(asctime)s'
+                              '|%(pathname)s|%(lineno)d| %(message)s'),
+                      datefmt='%Y-%m-%dT%H:%M:%S',
+                      )
+  logging.getLogger().setLevel(logging.INFO)
+  logging.info("Creating the search index")
   create_search_index()
diff --git a/code_search/src/code_search/nmslib/cli/start_search_server.py b/code_search/src/code_search/nmslib/cli/start_search_server.py
index 04970fc2..44b4a18e 100644
--- a/code_search/src/code_search/nmslib/cli/start_search_server.py
+++ b/code_search/src/code_search/nmslib/cli/start_search_server.py
@@ -85,4 +85,10 @@ def start_search_server(argv=None):
 
 
 if __name__ == '__main__':
+  logging.basicConfig(level=logging.INFO,
+                      format=('%(levelname)s|%(asctime)s'
+                              '|%(pathname)s|%(lineno)d| %(message)s'),
+                      datefmt='%Y-%m-%dT%H:%M:%S',
+                      )
+  logging.getLogger().setLevel(logging.INFO)
   start_search_server()
diff --git a/code_search/src/requirements.nmslib.txt b/code_search/src/requirements.nmslib.txt
new file mode 100644
index 00000000..7dd54f5f
--- /dev/null
+++ b/code_search/src/requirements.nmslib.txt
@@ -0,0 +1,2 @@
+# Requirements to run nmslib.
+nmslib~=1.7.0
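
Note on what the index-creation step actually does: search_engine.CodeSearchEngine.create_index is not part of this diff, but with nmslib~=1.7.0 the build amounts to the standard HNSW build-and-save sequence. A minimal sketch, assuming cosine similarity and a float32 embedding matrix; the parameter values are illustrative, not the repo's actual settings:

import nmslib
import numpy as np


def create_index(embeddings_data, index_file):
  """Build an HNSW index over the embeddings and save it to disk.

  embeddings_data: float32 array of shape [num_functions, embedding_dim];
  row i must correspond to row i of the lookup CSV.
  """
  index = nmslib.init(method='hnsw', space='cosinesimil')
  index.addDataPointBatch(embeddings_data)
  # 'post' enables post-construction optimization passes; higher values
  # improve recall at the cost of build time.
  index.createIndex({'post': 2}, print_progress=True)
  index.saveIndex(index_file)


if __name__ == '__main__':
  data = np.random.rand(1000, 128).astype(np.float32)
  create_index(data, '/tmp/embeddings.index')

The whole embedding matrix plus the HNSW graph must fit in RAM while the index is built, which is why the job above requests 32Gi and the demo cluster gains a high-memory n1-standard-32 pool.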
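
Similarly, the lookupFile/indexFile pair added to experiments.libsonnet exists because the nmslib index stores only vectors: at serving time the server must map a neighbor's integer id back to the function's metadata through the CSV. A sketch of that reverse-lookup pattern (hypothetical local file names; the real logic lives in code_search.nmslib, which this diff does not touch):

import csv

import nmslib
import numpy as np

# Load the index written by create_search_index.py.
index = nmslib.init(method='hnsw', space='cosinesimil')
index.loadIndex('embeddings.index')

# Row i of the lookup CSV describes the function whose embedding was
# inserted as data point i, so it serves as the reverse mapping.
with open('embedding-to-info.csv') as f:
  lookup = list(csv.reader(f))

# Embed the query (a placeholder random vector here), then fetch neighbors.
query = np.random.rand(128).astype(np.float32)
ids, distances = index.knnQuery(query, k=5)
for i, dist in zip(ids, distances):
  print(dist, lookup[i])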