From cea0ffde0d5c6d7275c93f1bf12eef038a8b9328 Mon Sep 17 00:00:00 2001 From: IronPan Date: Sun, 2 Dec 2018 22:14:11 -0800 Subject: [PATCH] Update the ks parameter (#394) * refactor ks * remove unnecessary params * update ks * address comments --- .../ks/launch_search_index_creator_job.sh | 4 +- .../docker/ks/submit_code_embeddings_job.sh | 13 +++-- .../kubeflow/components/experiments.libsonnet | 4 +- .../kubeflow/components/params.libsonnet | 11 +++-- .../components/search-index-creator.jsonnet | 2 +- .../submit-code-embeddings-job.libsonnet | 2 +- code_search/pipeline/index_update_pipeline.py | 47 ++++++++++++------- 7 files changed, 51 insertions(+), 32 deletions(-) diff --git a/code_search/docker/ks/launch_search_index_creator_job.sh b/code_search/docker/ks/launch_search_index_creator_job.sh index 1a8a80a5..38dfa912 100755 --- a/code_search/docker/ks/launch_search_index_creator_job.sh +++ b/code_search/docker/ks/launch_search_index_creator_job.sh @@ -20,7 +20,7 @@ usage() { --workflowId= --indexFile= --lookupFile= - --dataDir= + --functionEmbeddingsDir= --timeout= --namespace= --cluster=" @@ -33,7 +33,7 @@ source "${DIR}/parse_arguments.sh" source "${DIR}/initialize_kubectl.sh" # Apply parameters -ks param set ${component} dataDir ${dataDir} --env ${ksEnvName} +ks param set ${component} functionEmbeddingsDir ${functionEmbeddingsDir} --env ${ksEnvName} ks param set ${component} jobNameSuffix ${workflowId} --env ${ksEnvName} ks param set ${component} lookupFile ${lookupFile} --env ${ksEnvName} ks param set ${component} indexFile ${indexFile} --env ${ksEnvName} diff --git a/code_search/docker/ks/submit_code_embeddings_job.sh b/code_search/docker/ks/submit_code_embeddings_job.sh index d2ee28c8..8881df19 100755 --- a/code_search/docker/ks/submit_code_embeddings_job.sh +++ b/code_search/docker/ks/submit_code_embeddings_job.sh @@ -24,16 +24,19 @@ usage() { --workflowId= --modelDir= --dataDir= + --functionEmbeddingsDir= + --tokenPairsBQTable= + 
--functionEmbeddingsBQTable= --numWorkers= --project= - --targetDataset= --workerMachineType= --workingDir= - --cluster=" + --cluster= + --namespace=" } # List of required parameters -names=(dataDir modelDir targetDataset workingDir workflowId cluster) +names=(dataDir modelDir functionEmbeddingsDir tokenPairsBQTable functionEmbeddingsBQTable workingDir workflowId cluster namespace) source "${DIR}/parse_arguments.sh" source "${DIR}/initialize_kubectl.sh" @@ -41,9 +44,11 @@ source "${DIR}/initialize_kubectl.sh" # Apply parameters ks param set ${component} jobNameSuffix ${workflowId} --env ${ksEnvName} ks param set ${component} dataDir ${dataDir} --env ${ksEnvName} +ks param set ${component} functionEmbeddingsDir ${functionEmbeddingsDir} --env ${ksEnvName} +ks param set ${component} tokenPairsBQTable ${tokenPairsBQTable} --env ${ksEnvName} +ks param set ${component} functionEmbeddingsBQTable ${functionEmbeddingsBQTable} --env ${ksEnvName} ks param set ${component} modelDir ${modelDir} --env ${ksEnvName} ks param set ${component} project ${project} --env ${ksEnvName} -ks param set ${component} targetDataset ${targetDataset} --env ${ksEnvName} ks param set ${component} workingDir ${workingDir} --env ${ksEnvName} ks param set ${component} numWorkers ${numWorkers} --env ${ksEnvName} ks param set ${component} workerMachineType ${workerMachineType} --env ${ksEnvName} diff --git a/code_search/kubeflow/components/experiments.libsonnet b/code_search/kubeflow/components/experiments.libsonnet index ce93168b..8d4fd3e0 100644 --- a/code_search/kubeflow/components/experiments.libsonnet +++ b/code_search/kubeflow/components/experiments.libsonnet @@ -22,7 +22,6 @@ bqSuffix: std.strReplace(self.jobNameSuffix, "-", "_"), functionEmbeddingsBQTable: self.project + ":" + self.bqDataset + ".code_embeddings_" + self.bqSuffix, - // Location where the function embeddings should be written. 
functionEmbeddingsDir: "gs://code-search-demo/20181130/code_embeddings", @@ -34,5 +33,8 @@ "pipeline": { name: "pipeline", problem: "kf_github_function_docstring", + project: "code-search-demo", + bqDataset: "code_search", + tokenPairsBQTable: self.project + ":" + self.bqDataset + ".token_pairs", }, } diff --git a/code_search/kubeflow/components/params.libsonnet b/code_search/kubeflow/components/params.libsonnet index 8ecf3854..4ee5fa5f 100644 --- a/code_search/kubeflow/components/params.libsonnet +++ b/code_search/kubeflow/components/params.libsonnet @@ -82,7 +82,7 @@ name: "search-index-creator", jobNameSuffix: "null", image: $.components["t2t-job"].dataflowImage, - dataDir: $.components["t2t-code-search"].workingDir + "/data", + functionEmbeddingsDir: "", lookupFile: "null", indexFile: "null", }, @@ -111,10 +111,6 @@ "submit-code-embeddings-job": { name: "submit-code-embeddings-job", image: $.components["t2t-job"].dataflowImage, - // Input table this should be of the form PROJECT:DATASET.table - inputTable: "", - // Big query table where results will be written. - targetDataset: "code_search", // Directory where the model is stored. 
modelDir: "", jobName: "submit-code-embeddings-job", @@ -122,6 +118,11 @@ workerMachineType: "n1-highcpu-32", numWorkers: 5, waitUntilFinish: "false", + workingDir: $.components["t2t-code-search"].workingDir, + dataDir: self.workingDir + "/data", + functionEmbeddingsDir: self.workingDir + "/code_embeddings", + tokenPairsBQTable: "", + functionEmbeddingsBQTable: "", }, tensorboard: { diff --git a/code_search/kubeflow/components/search-index-creator.jsonnet b/code_search/kubeflow/components/search-index-creator.jsonnet index 3ebe5c03..94c9295f 100644 --- a/code_search/kubeflow/components/search-index-creator.jsonnet +++ b/code_search/kubeflow/components/search-index-creator.jsonnet @@ -39,7 +39,7 @@ local jobSpec = { "python", "-m", "code_search.nmslib.cli.create_search_index", - "--data_dir=" + params.dataDir, + "--data_dir=" + params.functionEmbeddingsDir, "--lookup_file=" + params.lookupFile, "--index_file=" + params.indexFile, ], diff --git a/code_search/kubeflow/components/submit-code-embeddings-job.libsonnet b/code_search/kubeflow/components/submit-code-embeddings-job.libsonnet index d8d51813..e782a5b0 100644 --- a/code_search/kubeflow/components/submit-code-embeddings-job.libsonnet +++ b/code_search/kubeflow/components/submit-code-embeddings-job.libsonnet @@ -30,7 +30,7 @@ "python2", "-m", "code_search.dataflow.cli.create_function_embeddings", - "--runner=DataflowRunner", + "--runner=DataflowRunner", "--project=" + params.project, "--token_pairs_table=" + params.tokenPairsBQTable, "--function_embeddings_table=" + params.functionEmbeddingsBQTable, diff --git a/code_search/pipeline/index_update_pipeline.py b/code_search/pipeline/index_update_pipeline.py index 789ae626..cb5d1cfb 100644 --- a/code_search/pipeline/index_update_pipeline.py +++ b/code_search/pipeline/index_update_pipeline.py @@ -1,4 +1,5 @@ from typing import Dict +import uuid from kubernetes import client as k8s_client import kfp.dsl as dsl @@ -59,39 +60,43 @@ def default_gcp_op(name: str, image: 
str, command: str = None, ) def dataflow_function_embedding_op( - project: 'GcpProject', cluster_name: str, target_dataset: str, data_dir: 'GcsUri', - saved_model_dir: 'GcsUri', workflow_id: str, worker_machine_type: str, - num_workers: int, working_dir: str, step_name='dataflow_function_embedding'): + project: 'GcpProject', cluster_name: str, token_pairs_bq_table: str, + function_embeddings_bq_table: str, data_dir: 'GcsUri', + function_embeddings_dir: str, saved_model_dir: 'GcsUri', workflow_id: str, + worker_machine_type: str, num_workers: int, working_dir: str, namespace: str): return default_gcp_op( - name=step_name, - image='gcr.io/kubeflow-examples/code-search/ks:v20181130-b807843', + name='dataflow_function_embedding', + image='gcr.io/kubeflow-examples/code-search/ks:v20181202-fbf5905-dirty-a8480a', command=['/usr/local/src/submit_code_embeddings_job.sh'], arguments=[ "--workflowId=%s" % workflow_id, "--modelDir=%s" % saved_model_dir, "--dataDir=%s" % data_dir, + "--functionEmbeddingsDir=%s" % function_embeddings_dir, "--numWorkers=%s" % num_workers, "--project=%s" % project, - "--targetDataset=%s" % target_dataset, + "--tokenPairsBQTable=%s" % token_pairs_bq_table, + "--functionEmbeddingsBQTable=%s" % function_embeddings_bq_table, "--workerMachineType=%s" % worker_machine_type, "--workingDir=%s" % working_dir, - '--cluster=%s' % cluster_name, + "--cluster=%s" % cluster_name, + "--namespace=%s" % namespace, ] ) def search_index_creator_op( - index_file: str, lookup_file: str, data_dir: str, + index_file: str, lookup_file: str, function_embeddings_dir: str, workflow_id: str, cluster_name: str, namespace: str): return dsl.ContainerOp( # use component name as step name name='search_index_creator', - image='gcr.io/kubeflow-examples/code-search/ks:v20181130-b807843', + image='gcr.io/kubeflow-examples/code-search/ks:v20181202-fbf5905-dirty-a8480a', command=['/usr/local/src/launch_search_index_creator_job.sh'], arguments=[ '--indexFile=%s' % index_file, 
'--lookupFile=%s' % lookup_file, - '--dataDir=%s' % data_dir, + '--functionEmbeddingsDir=%s' % function_embeddings_dir, '--workflowId=%s' % workflow_id, '--cluster=%s' % cluster_name, '--namespace=%s' % namespace, @@ -105,7 +110,7 @@ def update_index_op( return ( dsl.ContainerOp( name='update_index', - image='gcr.io/kubeflow-examples/code-search/ks:v20181130-b807843', + image='gcr.io/kubeflow-examples/code-search/ks:v20181202-fbf5905-dirty-a8480a', command=['/usr/local/src/update_index.sh'], arguments=[ '--baseGitRepo=%s' % base_git_repo, @@ -162,22 +167,28 @@ def function_embedding_update( fork_git_repo='IronPan/examples', bot_email='kf.sample.bot@gmail.com'): workflow_name = '{{workflow.name}}' + # Can't use workflow name as bq_suffix since BQ table doesn't accept '-' and + # workflow name is assigned at runtime. Pipeline might need to support + # replacing characters in workflow name. + bq_suffix = uuid.uuid4().hex[:6].upper() working_dir = '%s/%s' % (working_dir, workflow_name) lookup_file = '%s/code-embeddings-index/embedding-to-info.csv' % working_dir index_file = '%s/code-embeddings-index/embeddings.index'% working_dir + function_embeddings_dir = '%s/%s' % (working_dir, "/code_embeddings") + token_pairs_bq_table = '%s:%s.token_pairs' %(project, target_dataset) + function_embeddings_bq_table = \ + '%s:%s.function_embeddings_%s' % (project, target_dataset, bq_suffix) function_embedding = dataflow_function_embedding_op( - project, cluster_name, target_dataset, data_dir, - saved_model_dir, workflow_name, worker_machine_type, - function_embedding_num_workers, working_dir) - + project, cluster_name, token_pairs_bq_table, function_embeddings_bq_table, + data_dir, function_embeddings_dir, saved_model_dir, workflow_name, + worker_machine_type, function_embedding_num_workers, working_dir, namespace) search_index_creator = search_index_creator_op( - index_file, lookup_file, data_dir, workflow_name, cluster_name, namespace) + index_file, lookup_file, 
function_embeddings_dir, workflow_name, cluster_name, namespace) search_index_creator.after(function_embedding) update_index_op( base_git_repo, base_branch, app_dir, fork_git_repo, - index_file, lookup_file, workflow_name, bot_email)\ - .after(search_index_creator) + index_file, lookup_file, workflow_name, bot_email).after(search_index_creator) if __name__ == '__main__':