mirror of https://github.com/kubeflow/examples.git
Fix the K8s job to create the nmslib index. (#338)
* Install nmslib in the Dataflow container so it's suitable for running the index creation job.
* Use command, not args, in the job specs.
* Dockerfile.dataflow should install nmslib so that we can use that Docker image to create the index.
* build.jsonnet should tag images as latest. We will use the latest images as a layer cache to speed up builds (see the sketch below).
* Set the logging level to info for start_search_server.py and create_search_index.py.
* The create-search-index pod kept getting evicted because the node runs out of memory.
* Add a new node pool consisting of n1-standard-32 nodes to the demo cluster. These have 120 GB of RAM compared to 30 GB in our default pool of n1-standard-8 nodes.
* Set requests and limits on the search index creator pod.
* Move all the config for the search-index-creator job into the search-index-creator.jsonnet file. We need to customize the memory resources, so there's not much value in trying to share config with other components.
parent a402db1ccc
commit d2b68f15d7
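The build.jsonnet change follows the usual Cloud Build layer-cache pattern: pull the previously pushed :latest image, pass it to docker build via --cache-from so unchanged layers are reused, then re-tag the new image as :latest and push both tags so the next build has a warm cache. Below is a minimal sketch of that pattern for a single image, distilled from the diff that follows; the step ids and file path mirror the CPU image steps and are illustrative, not the exact file contents.

    local image = "gcr.io/kubeflow-examples/code-search";
    {
      "steps": [
        // Pull the last published image; this assumes :latest already exists in the registry.
        {
          "id": "pull-cpu",
          "name": "gcr.io/cloud-builders/docker",
          "args": ["pull", image + ":latest"],
          "waitFor": ["-"],
        },
        // Build with --cache-from so layers unchanged since :latest are reused.
        {
          "id": "build-cpu",
          "name": "gcr.io/cloud-builders/docker",
          "args": ["build", "-t", image + ":" + std.extVar("tag"),
                   "--cache-from=" + image + ":latest",
                   "--file=docker/t2t/Dockerfile", "."],
          "waitFor": ["pull-cpu"],
        },
        // Re-tag the new image as :latest so it becomes the cache for the next build.
        {
          "id": "tag-cpu",
          "name": "gcr.io/cloud-builders/docker",
          "args": ["tag", image + ":" + std.extVar("tag"), image + ":latest"],
          "waitFor": ["build-cpu"],
        },
      ],
      // Push both tags; pushing :latest is what keeps the cache warm.
      "images": [image + ":" + std.extVar("tag"), image + ":latest"],
    }
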
@@ -58,7 +58,8 @@ build-gcb:
	cp -r ./src ./build/
	rm -rf ./build/src/code_search/dataflow/cli/test_data
	rm -rf ./build/src/code_search/t2t/test_data
	gcloud builds submit --machine-type=n1-highcpu-32 --project=kubeflow-ci --config=./build/build.json ./build
	gcloud builds submit --machine-type=n1-highcpu-32 --project=kubeflow-ci --config=./build/build.json \
		--timeout=3600 ./build

# Build but don't attach the latest tag. This allows manual testing/inspection of the image

@@ -37,7 +37,7 @@ resources:
# bump this if you want to modify the node pools.
# This will cause existing node pools to be deleted and new ones to be created.
# Use prefix v so it will be treated as a string.
pool-version: v1
pool-version: v2
# Two is small enough to fit within default quota.
cpu-pool-initialNodeCount: 2
gpu-pool-initialNodeCount: 0

@@ -16,6 +16,7 @@ limitations under the License.
{% set CLUSTER_NAME = NAME_PREFIX %}
{% set CPU_POOL = NAME_PREFIX + '-cpu-pool-' + properties['pool-version'] %}
{% set GPU_POOL = NAME_PREFIX + '-gpu-pool-' + properties['pool-version'] %}
{% set LARGE_POOL = NAME_PREFIX + '-large-pool-' + properties['pool-version'] %}

{# Type names are the names to give to deployment manager type providers
   that will be created to represent Kubernetes objects.
@@ -152,6 +153,41 @@ resources:
    # We can only create 1 node pool at a time.
    - {{ CLUSTER_NAME }}

# Add a high memory pool because creating the search index requires a lot of memory.
- name: {{ LARGE_POOL }}
  {% if properties['gkeApiVersion'] == 'v1beta1' %}
  type: gcp-types/container-v1beta1:projects.locations.clusters.nodePools
  {% else %}
  type: container.v1.nodePool
  {% endif %}
  properties:
    parent: projects/{{ env['project'] }}/locations/{{ properties['zone'] }}/clusters/{{ CLUSTER_NAME }}
    project: {{ properties['securityConfig']['project'] }}
    zone: {{ properties['zone'] }}
    clusterId: {{ CLUSTER_NAME }}
    nodePool:
      name: large-pool
      initialNodeCount: 0
      autoscaling:
        enabled: true
        minNodeCount: 1
        maxNodeCount: 10
      config:
        {% if properties['securityConfig']['secureNodeMetadata'] %}
        workloadMetadataConfig:
          nodeMetadata: SECURE
        {% endif %}
        machineType: n1-standard-32
        serviceAccount: {{ KF_VM_SA_NAME }}@{{ env['project'] }}.iam.gserviceaccount.com
        oauthScopes: {{ VM_OAUTH_SCOPES }}
        # Set min cpu platform to ensure AVX2 is supported.
        minCpuPlatform: 'Intel Haswell'

  metadata:
    dependsOn:
      # We can only create 1 node pool at a time.
      - {{ GPU_POOL }}

{# Project defaults to the project of the deployment. #}
- name: {{ properties['ipName'] }}
  type: compute.v1.globalAddress

@@ -1,4 +1,5 @@
# Dockerfile suitable for submitting Dataflow jobs.
# Dockerfile suitable for submitting Dataflow jobs and for running the nmslib index creator.
#
# We don't use the Docker image used for running the training jobs
# because we have different versioning requirements.
FROM python:2.7-jessie
@@ -11,6 +12,10 @@ COPY src/requirements.dataflow.txt /tmp/requirements.dataflow.txt
RUN pip install -r /tmp/requirements.dataflow.txt
RUN pip install https://github.com/kubeflow/batch-predict/tarball/master

# Install nmslib requirements so that we can create the index
COPY src/requirements.nmslib.txt /tmp/requirements.nmslib.txt
RUN pip install -r /tmp/requirements.nmslib.txt

# install the spacy model
RUN python -m spacy download en

@@ -3,32 +3,83 @@
// https://cloud.google.com/cloud-build/docs/speeding-up-builds#using_a_cached_docker_image
{

"steps": [
  "steps": [
    {
      "id": "pull-cpu",
      "name": "gcr.io/cloud-builders/docker",
      "args": ["pull", "gcr.io/kubeflow-examples/code-search:latest"],
      "waitFor": ["-"],
    },
    {
      "id": "build-cpu",
      "name": "gcr.io/cloud-builders/docker",
      "args": ["build", "-t", "gcr.io/kubeflow-examples/code-search:" + std.extVar("tag"),
               "--label=git-versions=" + std.extVar("gitVersion"),
               "--build-arg", "BASE_IMAGE_TAG=1.11.0",
               "--file=docker/t2t/Dockerfile", "."],
               "--file=docker/t2t/Dockerfile",
               "--cache-from=gcr.io/kubeflow-examples/code-search:latest",
               "."],
      "waitFor": ["pull-cpu"],
    },
    {
      "id": "tag-cpu",
      "name": "gcr.io/cloud-builders/docker",
      "args": ["tag", "gcr.io/kubeflow-examples/code-search:" + std.extVar("tag"),
               "gcr.io/kubeflow-examples/code-search:latest",],
      "waitFor": ["build-cpu"],
    },
    {
      "id": "pull-gpu",
      "name": "gcr.io/cloud-builders/docker",
      "args": ["pull", "gcr.io/kubeflow-examples/code-search-gpu:latest"],
      "waitFor": ["-"],
    },
    {
      "name": "gcr.io/cloud-builders/docker",
      "id": "build-gpu",
      "name": "gcr.io/cloud-builders/docker",
      "args": ["build", "-t", "gcr.io/kubeflow-examples/code-search-gpu:" + std.extVar("tag"),
               "--label=git-versions=" + std.extVar("gitVersion"),
               "--build-arg", "BASE_IMAGE_TAG=1.11.0-gpu",
               "--file=docker/t2t/Dockerfile", "."],
               "--file=docker/t2t/Dockerfile",
               "--cache-from=gcr.io/kubeflow-examples/code-search-gpu:latest",
               "."],
      "waitFor": ["pull-gpu"],
    },
    {
      "id": "tag-gpu",
      "name": "gcr.io/cloud-builders/docker",
      "args": ["tag", "gcr.io/kubeflow-examples/code-search-gpu:" + std.extVar("tag"),
               "gcr.io/kubeflow-examples/code-search-gpu:latest",],
      "waitFor": ["build-gpu"],
    },
    {
      "id": "pull-dataflow",
      "name": "gcr.io/cloud-builders/docker",
      "args": ["pull", "gcr.io/kubeflow-examples/code-search-dataflow:latest"],
      "waitFor": ["-"],
    },
    {
      "id": "build-dataflow",
      "name": "gcr.io/cloud-builders/docker",
      "args": ["build", "-t", "gcr.io/kubeflow-examples/code-search-dataflow:" + std.extVar("tag"),
               "--label=git-versions=" + std.extVar("gitVersion"),
               "--file=docker/t2t/Dockerfile.dataflow", "."],
      "waitFor": ["-"],
               "--file=docker/t2t/Dockerfile.dataflow",
               "--cache-from=gcr.io/kubeflow-examples/code-search-dataflow:latest",
               "."],
      "waitFor": ["pull-dataflow"],
    },
    {
      "id": "tag-dataflow",
      "name": "gcr.io/cloud-builders/docker",
      "args": ["tag", "gcr.io/kubeflow-examples/code-search-dataflow:" + std.extVar("tag"),
               "gcr.io/kubeflow-examples/code-search-dataflow:latest",],
      "waitFor": ["build-dataflow"],
    },
  ],
  "images": ["gcr.io/kubeflow-examples/code-search:" + std.extVar("tag"),
             "gcr.io/kubeflow-examples/code-search:latest",
             "gcr.io/kubeflow-examples/code-search-gpu:" + std.extVar("tag"),
             "gcr.io/kubeflow-examples/code-search-dataflow:" + std.extVar("tag")],
             "gcr.io/kubeflow-examples/code-search-gpu:latest",
             "gcr.io/kubeflow-examples/code-search-dataflow:" + std.extVar("tag"),
             "gcr.io/kubeflow-examples/code-search-dataflow:latest"],
}

@@ -11,5 +11,10 @@
    modelDir: "gs://code-search-demo/models/20181107-dist-sync-gpu/export/1541712907/",
    problem: "kf_github_function_docstring",
    model: "kf_similarity_transformer",

    // Location to write the index file for nmslib and the file to be used as the reverse lookup
    // with the index server.
    lookupFile: "gs://code-search-demo/20181104/code-embeddings-index/embedding-to-info.csv",
    indexFile: "gs://code-search-demo/20181104/code-embeddings-index/embeddings.index",
  },
}

@@ -62,7 +62,7 @@ local baseParams = std.extVar("__ksonnet/params").components["nmslib"];
containerSpec(params, env=[], volumeMounts=[], ports=[]):: {
  name: params.name,
  image: params.image,
  args: params.args,
  command: params.command,
  ports: ports,
  env: env,
  volumeMounts: volumeMounts,
@@ -132,30 +132,10 @@ local baseParams = std.extVar("__ksonnet/params").components["nmslib"];
    },
  ],

creator:: {
  local creatorParams = params + {
    args: [
      "-m",
      "code_search.nmslib.cli.create_search_index",
      "--data_dir=" + params.dataDir,
      "--lookup_file=" + params.lookupFile,
      "--index_file=" + params.indexFile,
    ],
  },

  all: [
    $.jobSpec(creatorParams, env,
      [
        $.containerSpec(creatorParams, env=containerEnv,
          volumeMounts=containerVolumeMounts)
      ],
      volumes=volumes),
  ],
}.all,

server:: {
  local serverParams = params + {
    args: [
    command: [
      "python",
      "-m",
      "code_search.nmslib.cli.start_search_server",
      "--problem=" + params.problem,

@@ -8,7 +8,7 @@
// are not picked up by the individual components.
// Need to see if we can find a way to fix this.

local imageTag = "v20181108-004b5ad-dirty-eba459",
local imageTag = "v20181117-3c030ae-dirty-4d809c",
"t2t-job": {
  jobType: "trainer",
  numChief: 0,
@@ -20,7 +20,7 @@
  eval_steps: 10,
  image: "gcr.io/kubeflow-examples/code-search:" + imageTag,
  imageGpu: "gcr.io/kubeflow-examples/code-search-gpu:" + imageTag,
  dataflowImage: "gcr.io/kubeflow-examples/code-search-dataflow:v20181109-dc79384",
  dataflowImage: "gcr.io/kubeflow-examples/code-search-dataflow:" + imageTag,

  imagePullSecrets: [],
  // TODO(jlewi): dataDir doesn't seem to be used.

@@ -3,5 +3,82 @@ local nms = import "nms.libsonnet";

local env = std.extVar("__ksonnet/environments");
local params = std.extVar("__ksonnet/params").components["search-index-creator"];
local experiments = import "experiments.libsonnet";

std.prune(k.core.v1.list.new(nms.parts(params, env).creator))
local baseParams = std.extVar("__ksonnet/params").components["submit-code-embeddings-job"];
local experimentName = baseParams.experiment;
local params = baseParams + experiments[experimentName] + {
  name: experimentName + "-create-search-index",
};

local jobSpec = {
  apiVersion: "batch/v1",
  kind: "Job",
  metadata: {
    name: params.name,
    namespace: env.namespace,
    labels: {
      app: params.name,
    },
  },
  spec: {
    replicas: 1,
    template: {
      metadata: {
        labels: {
          app: params.name,
        },
      },
      spec: {
        // Don't restart because all the job should do is launch the Dataflow job.
        restartPolicy: "Never",
        containers: [
          {
            name: "dataflow",
            image: params.image,
            command: [
              "python",
              "-m",
              "code_search.nmslib.cli.create_search_index",
              "--data_dir=" + params.dataDir,
              "--lookup_file=" + params.lookupFile,
              "--index_file=" + params.indexFile,
            ],
            env: [
              {
                name: "GOOGLE_APPLICATION_CREDENTIALS",
                value: "/secret/gcp-credentials/user-gcp-sa.json",
              },
            ],
            // Creating the index requires a lot of memory.
            resources: {
              requests: {
                memory: "32Gi"
              },
              limits: {
                memory: "100Gi"
              },
            },
            workingDir: "/src",
            volumeMounts: [
              {
                mountPath: "/secret/gcp-credentials",
                name: "gcp-credentials",
              },
            ], //volumeMounts
          },
        ], // containers
        volumes: [
          {
            name: "gcp-credentials",
            secret: {
              secretName: "user-gcp-sa",
            },
          },
        ],
      }, // spec
    },
  },
};

std.prune(k.core.v1.list.new(jobSpec))

@@ -101,3 +101,4 @@ local deployment = {
};

std.prune(k.core.v1.list.new([service, deployment]))

@@ -1,4 +1,5 @@
import csv
import logging
import os
import numpy as np
import tensorflow as tf
@@ -23,6 +24,7 @@ def create_search_index(argv=None):
  args = arguments.parse_arguments(argv)

  if not os.path.isdir(args.tmp_dir):
    logging.info("Creating directory %s", args.tmp_dir)
    os.makedirs(args.tmp_dir)

  tmp_index_file = os.path.join(args.tmp_dir, os.path.basename(args.index_file))
@@ -34,7 +36,7 @@ def create_search_index(argv=None):
    lookup_writer = csv.writer(lookup_file)

    for csv_file_path in tf.gfile.Glob('{}/*index*.csv'.format(args.data_dir)):
      tf.logging.debug('Reading {}'.format(csv_file_path))
      logging.info('Reading %s', csv_file_path)

      with tf.gfile.Open(csv_file_path) as csv_file:
        reader = csv.reader(csv_file)
@@ -49,9 +51,19 @@ def create_search_index(argv=None):

  search_engine.CodeSearchEngine.create_index(embeddings_data, tmp_index_file)

  logging.info("Copying file %s to %s", tmp_lookup_file, args.lookup_file)
  tf.gfile.Copy(tmp_lookup_file, args.lookup_file)
  logging.info("Copying file %s to %s", tmp_index_file, args.index_file)
  tf.gfile.Copy(tmp_index_file, args.index_file)
  logging.info("Finished creating the index")


if __name__ == '__main__':
  logging.basicConfig(level=logging.INFO,
                      format=('%(levelname)s|%(asctime)s'
                              '|%(pathname)s|%(lineno)d| %(message)s'),
                      datefmt='%Y-%m-%dT%H:%M:%S',
                      )
  logging.getLogger().setLevel(logging.INFO)
  logging.info("Creating the search index")
  create_search_index()

@@ -85,4 +85,10 @@ def start_search_server(argv=None):


if __name__ == '__main__':
  logging.basicConfig(level=logging.INFO,
                      format=('%(levelname)s|%(asctime)s'
                              '|%(pathname)s|%(lineno)d| %(message)s'),
                      datefmt='%Y-%m-%dT%H:%M:%S',
                      )
  logging.getLogger().setLevel(logging.INFO)
  start_search_server()

@@ -0,0 +1,2 @@
# Requirements to run nmslib.
nmslib~=1.7.0