Update the ks parameter (#394)

* refactor ks

* remove unnecessary params

* update ks

* address comments
This commit is contained in:
IronPan 2018-12-02 22:14:11 -08:00 committed by Kubernetes Prow Robot
parent 78fdc74b56
commit cea0ffde0d
7 changed files with 51 additions and 32 deletions

View File

@ -20,7 +20,7 @@ usage() {
--workflowId=<workflow id invoking the container>
--indexFile=<index file>
--lookupFile=<lookup file>
--dataDir=<data dir>
--functionEmbeddingsDir=<input function embedding dir>
--timeout=<timeout>
--namespace=<kubernetes namespace>
--cluster=<cluster to deploy job to>"
@ -33,7 +33,7 @@ source "${DIR}/parse_arguments.sh"
source "${DIR}/initialize_kubectl.sh"
# Apply parameters
ks param set ${component} dataDir ${dataDir} --env ${ksEnvName}
ks param set ${component} functionEmbeddingsDir ${functionEmbeddingsDir} --env ${ksEnvName}
ks param set ${component} jobNameSuffix ${workflowId} --env ${ksEnvName}
ks param set ${component} lookupFile ${lookupFile} --env ${ksEnvName}
ks param set ${component} indexFile ${indexFile} --env ${ksEnvName}

View File

@ -24,16 +24,19 @@ usage() {
--workflowId=<workflow id invoking the container>
--modelDir=<directory contains the model>
--dataDir=<data dir>
--functionEmbeddingsDir=<output function embedding dir>
--tokenPairsBQTable=<input token pairs BQ table>
--functionEmbeddingsBQTable=<output function embedding BQ table>
--numWorkers=<num of workers>
--project=<project>
--targetDataset=<target BQ dataset>
--workerMachineType=<worker machine type>
--workingDir=<working dir>
--cluster=<cluster to deploy job to>"
--cluster=<cluster to deploy job to>
--namespace=<kubernetes namespace>"
}
# List of required parameters
names=(dataDir modelDir targetDataset workingDir workflowId cluster)
names=(dataDir modelDir functionEmbeddingsDir tokenPairsBQTable functionEmbeddingsBQTable workingDir workflowId cluster namespace)
source "${DIR}/parse_arguments.sh"
source "${DIR}/initialize_kubectl.sh"
@ -41,9 +44,11 @@ source "${DIR}/initialize_kubectl.sh"
# Apply parameters
ks param set ${component} jobNameSuffix ${workflowId} --env ${ksEnvName}
ks param set ${component} dataDir ${dataDir} --env ${ksEnvName}
ks param set ${component} functionEmbeddingsDir ${functionEmbeddingsDir} --env ${ksEnvName}
ks param set ${component} tokenPairsBQTable ${tokenPairsBQTable} --env ${ksEnvName}
ks param set ${component} functionEmbeddingsBQTable ${functionEmbeddingsBQTable} --env ${ksEnvName}
ks param set ${component} modelDir ${modelDir} --env ${ksEnvName}
ks param set ${component} project ${project} --env ${ksEnvName}
ks param set ${component} targetDataset ${targetDataset} --env ${ksEnvName}
ks param set ${component} workingDir ${workingDir} --env ${ksEnvName}
ks param set ${component} numWorkers ${numWorkers} --env ${ksEnvName}
ks param set ${component} workerMachineType ${workerMachineType} --env ${ksEnvName}

View File

@ -22,7 +22,6 @@
bqSuffix: std.strReplace(self.jobNameSuffix, "-", "_"),
functionEmbeddingsBQTable: self.project + ":" + self.bqDataset + ".code_embeddings_" + self.bqSuffix,
// Location where the function embeddings should be written.
functionEmbeddingsDir: "gs://code-search-demo/20181130/code_embeddings",
@ -34,5 +33,8 @@
"pipeline": {
name: "pipeline",
problem: "kf_github_function_docstring",
project: "code-search-demo",
bqDataset: "code_search",
tokenPairsBQTable: self.project + ":" + self.bqDataset + ".token_pairs",
},
}

View File

@ -82,7 +82,7 @@
name: "search-index-creator",
jobNameSuffix: "null",
image: $.components["t2t-job"].dataflowImage,
dataDir: $.components["t2t-code-search"].workingDir + "/data",
functionEmbeddingsDir: "",
lookupFile: "null",
indexFile: "null",
},
@ -111,10 +111,6 @@
"submit-code-embeddings-job": {
name: "submit-code-embeddings-job",
image: $.components["t2t-job"].dataflowImage,
// Input table this should be of the form PROJECT:DATASET.table
inputTable: "",
// Big query table where results will be written.
targetDataset: "code_search",
// Directory where the model is stored.
modelDir: "",
jobName: "submit-code-embeddings-job",
@ -122,6 +118,11 @@
workerMachineType: "n1-highcpu-32",
numWorkers: 5,
waitUntilFinish: "false",
workingDir: $.components["t2t-code-search"].workingDir,
dataDir: self.workingDir + "/data",
functionEmbeddingsDir: self.workingDir + "/code_embeddings",
tokenPairsBQTable: "",
functionEmbeddingsBQTable: "",
},
tensorboard: {

View File

@ -39,7 +39,7 @@ local jobSpec = {
"python",
"-m",
"code_search.nmslib.cli.create_search_index",
"--data_dir=" + params.dataDir,
"--data_dir=" + params.functionEmbeddingsDir,
"--lookup_file=" + params.lookupFile,
"--index_file=" + params.indexFile,
],

View File

@ -30,7 +30,7 @@
"python2",
"-m",
"code_search.dataflow.cli.create_function_embeddings",
"--runner=DataflowRunner",
"--runner=DataflowRunner",
"--project=" + params.project,
"--token_pairs_table=" + params.tokenPairsBQTable,
"--function_embeddings_table=" + params.functionEmbeddingsBQTable,

View File

@ -1,4 +1,5 @@
from typing import Dict
import uuid
from kubernetes import client as k8s_client
import kfp.dsl as dsl
@ -59,39 +60,43 @@ def default_gcp_op(name: str, image: str, command: str = None,
)
def dataflow_function_embedding_op(
project: 'GcpProject', cluster_name: str, target_dataset: str, data_dir: 'GcsUri',
saved_model_dir: 'GcsUri', workflow_id: str, worker_machine_type: str,
num_workers: int, working_dir: str, step_name='dataflow_function_embedding'):
project: 'GcpProject', cluster_name: str, token_pairs_bq_table: str,
function_embeddings_bq_table: str, data_dir: 'GcsUri',
function_embeddings_dir: str, saved_model_dir: 'GcsUri', workflow_id: str,
worker_machine_type: str, num_workers: int, working_dir: str, namespace: str):
return default_gcp_op(
name=step_name,
image='gcr.io/kubeflow-examples/code-search/ks:v20181130-b807843',
name='dataflow_function_embedding',
image='gcr.io/kubeflow-examples/code-search/ks:v20181202-fbf5905-dirty-a8480a',
command=['/usr/local/src/submit_code_embeddings_job.sh'],
arguments=[
"--workflowId=%s" % workflow_id,
"--modelDir=%s" % saved_model_dir,
"--dataDir=%s" % data_dir,
"--functionEmbeddingsDir=%s" % function_embeddings_dir,
"--numWorkers=%s" % num_workers,
"--project=%s" % project,
"--targetDataset=%s" % target_dataset,
"--tokenPairsBQTable=%s" % token_pairs_bq_table,
"--functionEmbeddingsBQTable=%s" % function_embeddings_bq_table,
"--workerMachineType=%s" % worker_machine_type,
"--workingDir=%s" % working_dir,
'--cluster=%s' % cluster_name,
"--cluster=%s" % cluster_name,
"--namespace=%s" % namespace,
]
)
def search_index_creator_op(
index_file: str, lookup_file: str, data_dir: str,
index_file: str, lookup_file: str, function_embeddings_dir: str,
workflow_id: str, cluster_name: str, namespace: str):
return dsl.ContainerOp(
# use component name as step name
name='search_index_creator',
image='gcr.io/kubeflow-examples/code-search/ks:v20181130-b807843',
image='gcr.io/kubeflow-examples/code-search/ks:v20181202-fbf5905-dirty-a8480a',
command=['/usr/local/src/launch_search_index_creator_job.sh'],
arguments=[
'--indexFile=%s' % index_file,
'--lookupFile=%s' % lookup_file,
'--dataDir=%s' % data_dir,
'--functionEmbeddingsDir=%s' % function_embeddings_dir,
'--workflowId=%s' % workflow_id,
'--cluster=%s' % cluster_name,
'--namespace=%s' % namespace,
@ -105,7 +110,7 @@ def update_index_op(
return (
dsl.ContainerOp(
name='update_index',
image='gcr.io/kubeflow-examples/code-search/ks:v20181130-b807843',
image='gcr.io/kubeflow-examples/code-search/ks:v20181202-fbf5905-dirty-a8480a',
command=['/usr/local/src/update_index.sh'],
arguments=[
'--baseGitRepo=%s' % base_git_repo,
@ -162,22 +167,28 @@ def function_embedding_update(
fork_git_repo='IronPan/examples',
bot_email='kf.sample.bot@gmail.com'):
workflow_name = '{{workflow.name}}'
# Can't use workflow name as bq_suffix since BQ table doesn't accept '-' and
# workflow name is assigned at runtime. Pipeline might need to support
# replacing characters in workflow name.
bq_suffix = uuid.uuid4().hex[:6].upper()
working_dir = '%s/%s' % (working_dir, workflow_name)
lookup_file = '%s/code-embeddings-index/embedding-to-info.csv' % working_dir
index_file = '%s/code-embeddings-index/embeddings.index'% working_dir
function_embeddings_dir = '%s/%s' % (working_dir, "/code_embeddings")
token_pairs_bq_table = '%s:%s.token_pairs' %(project, target_dataset)
function_embeddings_bq_table = \
'%s:%s.function_embeddings_%s' % (project, target_dataset, bq_suffix)
function_embedding = dataflow_function_embedding_op(
project, cluster_name, target_dataset, data_dir,
saved_model_dir, workflow_name, worker_machine_type,
function_embedding_num_workers, working_dir)
project, cluster_name, token_pairs_bq_table, function_embeddings_bq_table,
data_dir, function_embeddings_dir, saved_model_dir, workflow_name,
worker_machine_type, function_embedding_num_workers, working_dir, namespace)
search_index_creator = search_index_creator_op(
index_file, lookup_file, data_dir, workflow_name, cluster_name, namespace)
index_file, lookup_file, function_embeddings_dir, workflow_name, cluster_name, namespace)
search_index_creator.after(function_embedding)
update_index_op(
base_git_repo, base_branch, app_dir, fork_git_repo,
index_file, lookup_file, workflow_name, bot_email)\
.after(search_index_creator)
index_file, lookup_file, workflow_name, bot_email).after(search_index_creator)
if __name__ == '__main__':