Fix the K8s job to create the nmslib index. (#338)

* Install nmslib in the Dataflow container so it's suitable for running
  the index creation job.

* Use command, not args, in the job specs.

* Dockerfile.dataflow should install nmslib so that we can use that Docker
  image to create the index.

* build.jsonnet should tag images as latest. We will use the latest images
  as a layer cache to speed up builds.

* Set logging level to info for start_search_server.py and
  create_search_index.py

* The create-search-index pod kept getting evicted because the node was
  running out of memory.

* Add a new node pool consisting of n1-standard-32 nodes to the demo cluster.
  These have 120 GB of RAM, compared to 30 GB in our default pool of
  n1-standard-8 nodes.

* Set requests and limits on the search-index-creator pod.

* Move all the config for the search-index-creator job into the
  search-index-creator.jsonnet file. We need to customize the memory resources,
  so there's not much value in trying to share config with other components.
Jeremy Lewi 2018-11-20 12:53:09 -08:00 committed by k8s-ci-robot
parent a402db1ccc
commit d2b68f15d7
13 changed files with 213 additions and 37 deletions


@@ -58,7 +58,8 @@ build-gcb:
cp -r ./src ./build/
rm -rf ./build/src/code_search/dataflow/cli/test_data
rm -rf ./build/src/code_search/t2t/test_data
gcloud builds submit --machine-type=n1-highcpu-32 --project=kubeflow-ci --config=./build/build.json ./build
gcloud builds submit --machine-type=n1-highcpu-32 --project=kubeflow-ci --config=./build/build.json \
--timeout=3600 ./build
# Build but don't attach the latest tag. This allows manual testing/inspection of the image


@@ -37,7 +37,7 @@ resources:
# bump this if you want to modify the node pools.
# This will cause existing node pools to be deleted and new ones to be created.
# Use prefix v so it will be treated as a string.
pool-version: v1
pool-version: v2
# Two is small enough to fit within default quota.
cpu-pool-initialNodeCount: 2
gpu-pool-initialNodeCount: 0


@@ -16,6 +16,7 @@ limitations under the License.
{% set CLUSTER_NAME = NAME_PREFIX %}
{% set CPU_POOL = NAME_PREFIX + '-cpu-pool-' + properties['pool-version'] %}
{% set GPU_POOL = NAME_PREFIX + '-gpu-pool-' + properties['pool-version'] %}
{% set LARGE_POOL = NAME_PREFIX + '-large-pool-' + properties['pool-version'] %}
{# Type names are the names to give to deployment manager type providers
that will be created to represent Kubernetes objects.
@@ -152,6 +153,41 @@ resources:
# We can only create 1 node pool at a time.
- {{ CLUSTER_NAME }}
# Add a high memory pool because creating the search index requires a lot of memory.
- name: {{ LARGE_POOL }}
{% if properties['gkeApiVersion'] == 'v1beta1' %}
type: gcp-types/container-v1beta1:projects.locations.clusters.nodePools
{% else %}
type: container.v1.nodePool
{% endif %}
properties:
parent: projects/{{ env['project'] }}/locations/{{ properties['zone'] }}/clusters/{{ CLUSTER_NAME }}
project: {{ properties['securityConfig']['project'] }}
zone: {{ properties['zone'] }}
clusterId: {{ CLUSTER_NAME }}
nodePool:
name: large-pool
initialNodeCount: 0
autoscaling:
enabled: true
minNodeCount: 1
maxNodeCount: 10
config:
{% if properties['securityConfig']['secureNodeMetadata'] %}
workloadMetadataConfig:
nodeMetadata: SECURE
{% endif %}
machineType: n1-standard-32
serviceAccount: {{ KF_VM_SA_NAME }}@{{ env['project'] }}.iam.gserviceaccount.com
oauthScopes: {{ VM_OAUTH_SCOPES }}
# Set min cpu platform to ensure AVX2 is supported.
minCpuPlatform: 'Intel Haswell'
metadata:
dependsOn:
# We can only create 1 node pool at a time.
- {{ GPU_POOL }}
{# Project defaults to the project of the deployment. #}
- name: {{ properties['ipName'] }}
type: compute.v1.globalAddress


@@ -1,4 +1,5 @@
# Dockerfile suitable for submitting Dataflow jobs.
# Dockerfile suitable for submitting Dataflow jobs and for running the nmslib index creator.
#
# We don't use the Docker image used for running the training jobs
# because we have different versioning requirements.
FROM python:2.7-jessie
@@ -11,6 +12,10 @@ COPY src/requirements.dataflow.txt /tmp/requirements.dataflow.txt
RUN pip install -r /tmp/requirements.dataflow.txt
RUN pip install https://github.com/kubeflow/batch-predict/tarball/master
# Install nmslib requirements so that we can create the index
COPY src/requirements.nmslib.txt /tmp/requirements.nmslib.txt
RUN pip install -r /tmp/requirements.nmslib.txt
# install the spacy model
RUN python -m spacy download en


@@ -3,32 +3,83 @@
// https://cloud.google.com/cloud-build/docs/speeding-up-builds#using_a_cached_docker_image
{
"steps": [
"steps": [
{
"id": "pull-cpu",
"name": "gcr.io/cloud-builders/docker",
"args": ["pull", "gcr.io/kubeflow-examples/code-search:latest"],
"waitFor": ["-"],
},
{
"id": "build-cpu",
"name": "gcr.io/cloud-builders/docker",
"args": ["build", "-t", "gcr.io/kubeflow-examples/code-search:" + std.extVar("tag"),
"--label=git-versions=" + std.extVar("gitVersion"),
"--build-arg", "BASE_IMAGE_TAG=1.11.0",
"--file=docker/t2t/Dockerfile", "."],
"--file=docker/t2t/Dockerfile",
"--cache-from=gcr.io/kubeflow-examples/code-search:latest",
"."],
"waitFor": ["pull-cpu"],
},
{
"id": "tag-cpu",
"name": "gcr.io/cloud-builders/docker",
"args": ["tag", "gcr.io/kubeflow-examples/code-search:" + std.extVar("tag"),
"gcr.io/kubeflow-examples/code-search:latest",],
"waitFor": ["build-cpu"],
},
{
"id": "pull-gpu",
"name": "gcr.io/cloud-builders/docker",
"args": ["pull", "gcr.io/kubeflow-examples/code-search-gpu:latest"],
"waitFor": ["-"],
},
{
"name": "gcr.io/cloud-builders/docker",
"id": "build-gpu",
"name": "gcr.io/cloud-builders/docker",
"args": ["build", "-t", "gcr.io/kubeflow-examples/code-search-gpu:" + std.extVar("tag"),
"--label=git-versions=" + std.extVar("gitVersion"),
"--build-arg", "BASE_IMAGE_TAG=1.11.0-gpu",
"--file=docker/t2t/Dockerfile", "."],
"--file=docker/t2t/Dockerfile",
"--cache-from=gcr.io/kubeflow-examples/code-search-gpu:latest",
"."],
"waitFor": ["pull-gpu"],
},
{
"id": "tag-gpu",
"name": "gcr.io/cloud-builders/docker",
"args": ["tag", "gcr.io/kubeflow-examples/code-search-gpu:" + std.extVar("tag"),
"gcr.io/kubeflow-examples/code-search-gpu:latest",],
"waitFor": ["build-gpu"],
},
{
"id": "pull-dataflow",
"name": "gcr.io/cloud-builders/docker",
"args": ["pull", "gcr.io/kubeflow-examples/code-search-dataflow:latest"],
"waitFor": ["-"],
},
{
"id": "build-dataflow",
"name": "gcr.io/cloud-builders/docker",
"args": ["build", "-t", "gcr.io/kubeflow-examples/code-search-dataflow:" + std.extVar("tag"),
"--label=git-versions=" + std.extVar("gitVersion"),
"--file=docker/t2t/Dockerfile.dataflow", "."],
"waitFor": ["-"],
"--file=docker/t2t/Dockerfile.dataflow",
"--cache-from=gcr.io/kubeflow-examples/code-search-dataflow:latest",
"."],
"waitFor": ["pull-dataflow"],
},
{
"id": "tag-dataflow",
"name": "gcr.io/cloud-builders/docker",
"args": ["tag", "gcr.io/kubeflow-examples/code-search-dataflow:" + std.extVar("tag"),
"gcr.io/kubeflow-examples/code-search-dataflow:latest",],
"waitFor": ["build-dataflow"],
},
],
"images": ["gcr.io/kubeflow-examples/code-search:" + std.extVar("tag"),
"gcr.io/kubeflow-examples/code-search:latest",
"gcr.io/kubeflow-examples/code-search-gpu:" + std.extVar("tag"),
"gcr.io/kubeflow-examples/code-search-dataflow:" + std.extVar("tag")],
"gcr.io/kubeflow-examples/code-search-gpu:latest",
"gcr.io/kubeflow-examples/code-search-dataflow:" + std.extVar("tag"),
"gcr.io/kubeflow-examples/code-search-dataflow:latest"],
}


@@ -11,5 +11,10 @@
modelDir: "gs://code-search-demo/models/20181107-dist-sync-gpu/export/1541712907/",
problem: "kf_github_function_docstring",
model: "kf_similarity_transformer",
// Location to write the index file for nmslib and the file to be used as the reverse lookup
// with the index server.
lookupFile: "gs://code-search-demo/20181104/code-embeddings-index/embedding-to-info.csv",
indexFile: "gs://code-search-demo/20181104/code-embeddings-index/embeddings.index",
},
}


@@ -62,7 +62,7 @@ local baseParams = std.extVar("__ksonnet/params").components["nmslib"];
containerSpec(params, env=[], volumeMounts=[], ports=[]):: {
name: params.name,
image: params.image,
args: params.args,
command: params.command,
ports: ports,
env: env,
volumeMounts: volumeMounts,
@@ -132,30 +132,10 @@ local baseParams = std.extVar("__ksonnet/params").components["nmslib"];
},
],
creator:: {
local creatorParams = params + {
args: [
"-m",
"code_search.nmslib.cli.create_search_index",
"--data_dir=" + params.dataDir,
"--lookup_file=" + params.lookupFile,
"--index_file=" + params.indexFile,
],
},
all: [
$.jobSpec(creatorParams, env,
[
$.containerSpec(creatorParams, env=containerEnv,
volumeMounts=containerVolumeMounts)
],
volumes=volumes),
],
}.all,
server:: {
local serverParams = params + {
args: [
command: [
"python",
"-m",
"code_search.nmslib.cli.start_search_server",
"--problem=" + params.problem,


@@ -8,7 +8,7 @@
// are not picked up by the individual components.
// Need to see if we can find a way to fix this.
local imageTag = "v20181108-004b5ad-dirty-eba459",
local imageTag = "v20181117-3c030ae-dirty-4d809c",
"t2t-job": {
jobType: "trainer",
numChief: 0,
@@ -20,7 +20,7 @@
eval_steps: 10,
image: "gcr.io/kubeflow-examples/code-search:" + imageTag,
imageGpu: "gcr.io/kubeflow-examples/code-search-gpu:" + imageTag,
dataflowImage: "gcr.io/kubeflow-examples/code-search-dataflow:v20181109-dc79384",
dataflowImage: "gcr.io/kubeflow-examples/code-search-dataflow:" + imageTag,
imagePullSecrets: [],
// TODO(jlewi): dataDir doesn't seem to be used.


@@ -3,5 +3,82 @@ local nms = import "nms.libsonnet";
local env = std.extVar("__ksonnet/environments");
local params = std.extVar("__ksonnet/params").components["search-index-creator"];
local experiments = import "experiments.libsonnet";
std.prune(k.core.v1.list.new(nms.parts(params, env).creator))
local baseParams = std.extVar("__ksonnet/params").components["submit-code-embeddings-job"];
local experimentName = baseParams.experiment;
local params = baseParams + experiments[experimentName] + {
name: experimentName + "-create-search-index",
};
local jobSpec = {
apiVersion: "batch/v1",
kind: "Job",
metadata: {
name: params.name,
namespace: env.namespace,
labels: {
app: params.name,
},
},
spec: {
replicas: 1,
template: {
metadata: {
labels: {
app: params.name,
},
},
spec: {
// Don't restart because all the job should do is launch the Dataflow job.
restartPolicy: "Never",
containers: [
{
name: "dataflow",
image: params.image,
command: [
"python",
"-m",
"code_search.nmslib.cli.create_search_index",
"--data_dir=" + params.dataDir,
"--lookup_file=" + params.lookupFile,
"--index_file=" + params.indexFile,
],
env: [
{
name: "GOOGLE_APPLICATION_CREDENTIALS",
value: "/secret/gcp-credentials/user-gcp-sa.json",
},
],
// Creating the index requires a lot of memory.
resources: {
requests: {
memory: "32Gi"
},
limits: {
memory: "100Gi"
},
},
workingDir: "/src",
volumeMounts: [
{
mountPath: "/secret/gcp-credentials",
name: "gcp-credentials",
},
], //volumeMounts
},
], // containers
volumes: [
{
name: "gcp-credentials",
secret: {
secretName: "user-gcp-sa",
},
},
],
}, // spec
},
},
};
std.prune(k.core.v1.list.new(jobSpec))


@@ -101,3 +101,4 @@ local deployment = {
};
std.prune(k.core.v1.list.new([service, deployment]))


@@ -1,4 +1,5 @@
import csv
import logging
import os
import numpy as np
import tensorflow as tf
@@ -23,6 +24,7 @@ def create_search_index(argv=None):
args = arguments.parse_arguments(argv)
if not os.path.isdir(args.tmp_dir):
logging.info("Creating directory %s", args.tmp_dir)
os.makedirs(args.tmp_dir)
tmp_index_file = os.path.join(args.tmp_dir, os.path.basename(args.index_file))
@@ -34,7 +36,7 @@ def create_search_index(argv=None):
lookup_writer = csv.writer(lookup_file)
for csv_file_path in tf.gfile.Glob('{}/*index*.csv'.format(args.data_dir)):
tf.logging.debug('Reading {}'.format(csv_file_path))
logging.info('Reading %s', csv_file_path)
with tf.gfile.Open(csv_file_path) as csv_file:
reader = csv.reader(csv_file)
@@ -49,9 +51,19 @@ def create_search_index(argv=None):
search_engine.CodeSearchEngine.create_index(embeddings_data, tmp_index_file)
logging.info("Copying file %s to %s", tmp_lookup_file, args.lookup_file)
tf.gfile.Copy(tmp_lookup_file, args.lookup_file)
logging.info("Copying file %s to %s", tmp_index_file, args.index_file)
tf.gfile.Copy(tmp_index_file, args.index_file)
logging.info("Finished creating the index")
if __name__ == '__main__':
logging.basicConfig(level=logging.INFO,
format=('%(levelname)s|%(asctime)s'
'|%(pathname)s|%(lineno)d| %(message)s'),
datefmt='%Y-%m-%dT%H:%M:%S',
)
logging.getLogger().setLevel(logging.INFO)
logging.info("Creating the search index")
create_search_index()
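
For readers unfamiliar with nmslib, here is a rough sketch of what an index-creation step such as search_engine.CodeSearchEngine.create_index might do. This sketch is not taken from the commit: the HNSW method, cosine space, parameter values, and paths are illustrative assumptions.

import numpy as np
import nmslib

def build_index(embeddings, index_path):
    # embeddings: float32 array of shape (num_functions, embedding_dim).
    # Assumed settings: HNSW over cosine similarity.
    index = nmslib.init(method="hnsw", space="cosinesimil")
    index.addDataPointBatch(embeddings)
    index.createIndex({"post": 2}, print_progress=True)
    # Serialize the index; in the job this is the file copied to --index_file.
    index.saveIndex(index_path)

# Placeholder data: 1000 random 128-dimensional embeddings.
build_index(np.random.rand(1000, 128).astype(np.float32), "/tmp/embeddings.index")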


@@ -85,4 +85,10 @@ def start_search_server(argv=None):
if __name__ == '__main__':
logging.basicConfig(level=logging.INFO,
format=('%(levelname)s|%(asctime)s'
'|%(pathname)s|%(lineno)d| %(message)s'),
datefmt='%Y-%m-%dT%H:%M:%S',
)
logging.getLogger().setLevel(logging.INFO)
start_search_server()
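
For context on how the index file and the reverse-lookup CSV are consumed, a minimal query sketch follows. Again, this is illustrative only: the paths, the CSV layout, and the 128-dimensional query vector are assumptions, not code from this repository.

import csv
import numpy as np
import nmslib

# Load the serialized index; method/space must match what was used to build it.
index = nmslib.init(method="hnsw", space="cosinesimil")
index.loadIndex("/tmp/embeddings.index")

# The lookup CSV maps an embedding's row id back to function metadata.
with open("/tmp/embedding-to-info.csv") as f:
    lookup = list(csv.reader(f))

query = np.random.rand(128).astype(np.float32)  # placeholder query embedding
ids, distances = index.knnQuery(query, k=5)
for i, dist in zip(ids, distances):
    print(dist, lookup[i])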


@@ -0,0 +1,2 @@
# Requirements to run nmslib.
nmslib~=1.7.0