mirror of https://github.com/kubeflow/examples.git
Update the ks parameter (#394)
* refactor ks * remove unnecessary params * update ks * address comments
This commit is contained in:
parent
78fdc74b56
commit
cea0ffde0d
|
|
@ -20,7 +20,7 @@ usage() {
|
||||||
--workflowId=<workflow id invoking the container>
|
--workflowId=<workflow id invoking the container>
|
||||||
--indexFile=<index file>
|
--indexFile=<index file>
|
||||||
--lookupFile=<lookup file>
|
--lookupFile=<lookup file>
|
||||||
--dataDir=<data dir>
|
--functionEmbeddingsDir=<input function embedding dir>
|
||||||
--timeout=<timeout>
|
--timeout=<timeout>
|
||||||
--namespace=<kubernetes namespace>
|
--namespace=<kubernetes namespace>
|
||||||
--cluster=<cluster to deploy job to>"
|
--cluster=<cluster to deploy job to>"
|
||||||
|
|
@ -33,7 +33,7 @@ source "${DIR}/parse_arguments.sh"
|
||||||
source "${DIR}/initialize_kubectl.sh"
|
source "${DIR}/initialize_kubectl.sh"
|
||||||
|
|
||||||
# Apply parameters
|
# Apply parameters
|
||||||
ks param set ${component} dataDir ${dataDir} --env ${ksEnvName}
|
ks param set ${component} functionEmbeddingsDir ${functionEmbeddingsDir} --env ${ksEnvName}
|
||||||
ks param set ${component} jobNameSuffix ${workflowId} --env ${ksEnvName}
|
ks param set ${component} jobNameSuffix ${workflowId} --env ${ksEnvName}
|
||||||
ks param set ${component} lookupFile ${lookupFile} --env ${ksEnvName}
|
ks param set ${component} lookupFile ${lookupFile} --env ${ksEnvName}
|
||||||
ks param set ${component} indexFile ${indexFile} --env ${ksEnvName}
|
ks param set ${component} indexFile ${indexFile} --env ${ksEnvName}
|
||||||
|
|
|
||||||
|
|
@ -24,16 +24,19 @@ usage() {
|
||||||
--workflowId=<workflow id invoking the container>
|
--workflowId=<workflow id invoking the container>
|
||||||
--modelDir=<directory contains the model>
|
--modelDir=<directory contains the model>
|
||||||
--dataDir=<data dir>
|
--dataDir=<data dir>
|
||||||
|
--functionEmbeddingsDir=<output function embedding dir>
|
||||||
|
--tokenPairsBQTable=<input token pairs BQ table>
|
||||||
|
--functionEmbeddingsBQTable=<output function embedding BQ table>
|
||||||
--numWorkers=<num of workers>
|
--numWorkers=<num of workers>
|
||||||
--project=<project>
|
--project=<project>
|
||||||
--targetDataset=<target BQ dataset>
|
|
||||||
--workerMachineType=<worker machine type>
|
--workerMachineType=<worker machine type>
|
||||||
--workingDir=<working dir>
|
--workingDir=<working dir>
|
||||||
--cluster=<cluster to deploy job to>"
|
--cluster=<cluster to deploy job to>
|
||||||
|
--namespace=<kubernetes namespace>"
|
||||||
}
|
}
|
||||||
|
|
||||||
# List of required parameters
|
# List of required parameters
|
||||||
names=(dataDir modelDir targetDataset workingDir workflowId cluster)
|
names=(dataDir modelDir functionEmbeddingsDir tokenPairsBQTable functionEmbeddingsBQTable workingDir workflowId cluster namespace)
|
||||||
|
|
||||||
source "${DIR}/parse_arguments.sh"
|
source "${DIR}/parse_arguments.sh"
|
||||||
source "${DIR}/initialize_kubectl.sh"
|
source "${DIR}/initialize_kubectl.sh"
|
||||||
|
|
@ -41,9 +44,11 @@ source "${DIR}/initialize_kubectl.sh"
|
||||||
# Apply parameters
|
# Apply parameters
|
||||||
ks param set ${component} jobNameSuffix ${workflowId} --env ${ksEnvName}
|
ks param set ${component} jobNameSuffix ${workflowId} --env ${ksEnvName}
|
||||||
ks param set ${component} dataDir ${dataDir} --env ${ksEnvName}
|
ks param set ${component} dataDir ${dataDir} --env ${ksEnvName}
|
||||||
|
ks param set ${component} functionEmbeddingsDir ${functionEmbeddingsDir} --env ${ksEnvName}
|
||||||
|
ks param set ${component} tokenPairsBQTable ${tokenPairsBQTable} --env ${ksEnvName}
|
||||||
|
ks param set ${component} functionEmbeddingsBQTable ${functionEmbeddingsBQTable} --env ${ksEnvName}
|
||||||
ks param set ${component} modelDir ${modelDir} --env ${ksEnvName}
|
ks param set ${component} modelDir ${modelDir} --env ${ksEnvName}
|
||||||
ks param set ${component} project ${project} --env ${ksEnvName}
|
ks param set ${component} project ${project} --env ${ksEnvName}
|
||||||
ks param set ${component} targetDataset ${targetDataset} --env ${ksEnvName}
|
|
||||||
ks param set ${component} workingDir ${workingDir} --env ${ksEnvName}
|
ks param set ${component} workingDir ${workingDir} --env ${ksEnvName}
|
||||||
ks param set ${component} numWorkers ${numWorkers} --env ${ksEnvName}
|
ks param set ${component} numWorkers ${numWorkers} --env ${ksEnvName}
|
||||||
ks param set ${component} workerMachineType ${workerMachineType} --env ${ksEnvName}
|
ks param set ${component} workerMachineType ${workerMachineType} --env ${ksEnvName}
|
||||||
|
|
|
||||||
|
|
@ -22,7 +22,6 @@
|
||||||
bqSuffix: std.strReplace(self.jobNameSuffix, "-", "_"),
|
bqSuffix: std.strReplace(self.jobNameSuffix, "-", "_"),
|
||||||
functionEmbeddingsBQTable: self.project + ":" + self.bqDataset + ".code_embeddings_" + self.bqSuffix,
|
functionEmbeddingsBQTable: self.project + ":" + self.bqDataset + ".code_embeddings_" + self.bqSuffix,
|
||||||
|
|
||||||
|
|
||||||
// Location where the function embeddings should be written.
|
// Location where the function embeddings should be written.
|
||||||
functionEmbeddingsDir: "gs://code-search-demo/20181130/code_embeddings",
|
functionEmbeddingsDir: "gs://code-search-demo/20181130/code_embeddings",
|
||||||
|
|
||||||
|
|
@ -34,5 +33,8 @@
|
||||||
"pipeline": {
|
"pipeline": {
|
||||||
name: "pipeline",
|
name: "pipeline",
|
||||||
problem: "kf_github_function_docstring",
|
problem: "kf_github_function_docstring",
|
||||||
|
project: "code-search-demo",
|
||||||
|
bqDataset: "code_search",
|
||||||
|
tokenPairsBQTable: self.project + ":" + self.bqDataset + ".token_pairs",
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -82,7 +82,7 @@
|
||||||
name: "search-index-creator",
|
name: "search-index-creator",
|
||||||
jobNameSuffix: "null",
|
jobNameSuffix: "null",
|
||||||
image: $.components["t2t-job"].dataflowImage,
|
image: $.components["t2t-job"].dataflowImage,
|
||||||
dataDir: $.components["t2t-code-search"].workingDir + "/data",
|
functionEmbeddingsDir: "",
|
||||||
lookupFile: "null",
|
lookupFile: "null",
|
||||||
indexFile: "null",
|
indexFile: "null",
|
||||||
},
|
},
|
||||||
|
|
@ -111,10 +111,6 @@
|
||||||
"submit-code-embeddings-job": {
|
"submit-code-embeddings-job": {
|
||||||
name: "submit-code-embeddings-job",
|
name: "submit-code-embeddings-job",
|
||||||
image: $.components["t2t-job"].dataflowImage,
|
image: $.components["t2t-job"].dataflowImage,
|
||||||
// Input table this should be of the form PROJECT:DATASET.table
|
|
||||||
inputTable: "",
|
|
||||||
// Big query table where results will be written.
|
|
||||||
targetDataset: "code_search",
|
|
||||||
// Directory where the model is stored.
|
// Directory where the model is stored.
|
||||||
modelDir: "",
|
modelDir: "",
|
||||||
jobName: "submit-code-embeddings-job",
|
jobName: "submit-code-embeddings-job",
|
||||||
|
|
@ -122,6 +118,11 @@
|
||||||
workerMachineType: "n1-highcpu-32",
|
workerMachineType: "n1-highcpu-32",
|
||||||
numWorkers: 5,
|
numWorkers: 5,
|
||||||
waitUntilFinish: "false",
|
waitUntilFinish: "false",
|
||||||
|
workingDir: $.components["t2t-code-search"].workingDir,
|
||||||
|
dataDir: self.workingDir + "/data",
|
||||||
|
functionEmbeddingsDir: self.workingDir + "/code_embeddings",
|
||||||
|
tokenPairsBQTable: "",
|
||||||
|
functionEmbeddingsBQTable: "",
|
||||||
},
|
},
|
||||||
|
|
||||||
tensorboard: {
|
tensorboard: {
|
||||||
|
|
|
||||||
|
|
@ -39,7 +39,7 @@ local jobSpec = {
|
||||||
"python",
|
"python",
|
||||||
"-m",
|
"-m",
|
||||||
"code_search.nmslib.cli.create_search_index",
|
"code_search.nmslib.cli.create_search_index",
|
||||||
"--data_dir=" + params.dataDir,
|
"--data_dir=" + params.functionEmbeddingsDir,
|
||||||
"--lookup_file=" + params.lookupFile,
|
"--lookup_file=" + params.lookupFile,
|
||||||
"--index_file=" + params.indexFile,
|
"--index_file=" + params.indexFile,
|
||||||
],
|
],
|
||||||
|
|
|
||||||
|
|
@ -30,7 +30,7 @@
|
||||||
"python2",
|
"python2",
|
||||||
"-m",
|
"-m",
|
||||||
"code_search.dataflow.cli.create_function_embeddings",
|
"code_search.dataflow.cli.create_function_embeddings",
|
||||||
"--runner=DataflowRunner",
|
"--runner=DataflowRunner",
|
||||||
"--project=" + params.project,
|
"--project=" + params.project,
|
||||||
"--token_pairs_table=" + params.tokenPairsBQTable,
|
"--token_pairs_table=" + params.tokenPairsBQTable,
|
||||||
"--function_embeddings_table=" + params.functionEmbeddingsBQTable,
|
"--function_embeddings_table=" + params.functionEmbeddingsBQTable,
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,5 @@
|
||||||
from typing import Dict
|
from typing import Dict
|
||||||
|
import uuid
|
||||||
from kubernetes import client as k8s_client
|
from kubernetes import client as k8s_client
|
||||||
import kfp.dsl as dsl
|
import kfp.dsl as dsl
|
||||||
|
|
||||||
|
|
@ -59,39 +60,43 @@ def default_gcp_op(name: str, image: str, command: str = None,
|
||||||
)
|
)
|
||||||
|
|
||||||
def dataflow_function_embedding_op(
|
def dataflow_function_embedding_op(
|
||||||
project: 'GcpProject', cluster_name: str, target_dataset: str, data_dir: 'GcsUri',
|
project: 'GcpProject', cluster_name: str, token_pairs_bq_table: str,
|
||||||
saved_model_dir: 'GcsUri', workflow_id: str, worker_machine_type: str,
|
function_embeddings_bq_table: str, data_dir: 'GcsUri',
|
||||||
num_workers: int, working_dir: str, step_name='dataflow_function_embedding'):
|
function_embeddings_dir: str, saved_model_dir: 'GcsUri', workflow_id: str,
|
||||||
|
worker_machine_type: str, num_workers: int, working_dir: str, namespace: str):
|
||||||
return default_gcp_op(
|
return default_gcp_op(
|
||||||
name=step_name,
|
name='dataflow_function_embedding',
|
||||||
image='gcr.io/kubeflow-examples/code-search/ks:v20181130-b807843',
|
image='gcr.io/kubeflow-examples/code-search/ks:v20181202-fbf5905-dirty-a8480a',
|
||||||
command=['/usr/local/src/submit_code_embeddings_job.sh'],
|
command=['/usr/local/src/submit_code_embeddings_job.sh'],
|
||||||
arguments=[
|
arguments=[
|
||||||
"--workflowId=%s" % workflow_id,
|
"--workflowId=%s" % workflow_id,
|
||||||
"--modelDir=%s" % saved_model_dir,
|
"--modelDir=%s" % saved_model_dir,
|
||||||
"--dataDir=%s" % data_dir,
|
"--dataDir=%s" % data_dir,
|
||||||
|
"--functionEmbeddingsDir=%s" % function_embeddings_dir,
|
||||||
"--numWorkers=%s" % num_workers,
|
"--numWorkers=%s" % num_workers,
|
||||||
"--project=%s" % project,
|
"--project=%s" % project,
|
||||||
"--targetDataset=%s" % target_dataset,
|
"--tokenPairsBQTable=%s" % token_pairs_bq_table,
|
||||||
|
"--functionEmbeddingsBQTable=%s" % function_embeddings_bq_table,
|
||||||
"--workerMachineType=%s" % worker_machine_type,
|
"--workerMachineType=%s" % worker_machine_type,
|
||||||
"--workingDir=%s" % working_dir,
|
"--workingDir=%s" % working_dir,
|
||||||
'--cluster=%s' % cluster_name,
|
"--cluster=%s" % cluster_name,
|
||||||
|
"--namespace=%s" % namespace,
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def search_index_creator_op(
|
def search_index_creator_op(
|
||||||
index_file: str, lookup_file: str, data_dir: str,
|
index_file: str, lookup_file: str, function_embeddings_dir: str,
|
||||||
workflow_id: str, cluster_name: str, namespace: str):
|
workflow_id: str, cluster_name: str, namespace: str):
|
||||||
return dsl.ContainerOp(
|
return dsl.ContainerOp(
|
||||||
# use component name as step name
|
# use component name as step name
|
||||||
name='search_index_creator',
|
name='search_index_creator',
|
||||||
image='gcr.io/kubeflow-examples/code-search/ks:v20181130-b807843',
|
image='gcr.io/kubeflow-examples/code-search/ks:v20181202-fbf5905-dirty-a8480a',
|
||||||
command=['/usr/local/src/launch_search_index_creator_job.sh'],
|
command=['/usr/local/src/launch_search_index_creator_job.sh'],
|
||||||
arguments=[
|
arguments=[
|
||||||
'--indexFile=%s' % index_file,
|
'--indexFile=%s' % index_file,
|
||||||
'--lookupFile=%s' % lookup_file,
|
'--lookupFile=%s' % lookup_file,
|
||||||
'--dataDir=%s' % data_dir,
|
'--functionEmbeddingsDir=%s' % function_embeddings_dir,
|
||||||
'--workflowId=%s' % workflow_id,
|
'--workflowId=%s' % workflow_id,
|
||||||
'--cluster=%s' % cluster_name,
|
'--cluster=%s' % cluster_name,
|
||||||
'--namespace=%s' % namespace,
|
'--namespace=%s' % namespace,
|
||||||
|
|
@ -105,7 +110,7 @@ def update_index_op(
|
||||||
return (
|
return (
|
||||||
dsl.ContainerOp(
|
dsl.ContainerOp(
|
||||||
name='update_index',
|
name='update_index',
|
||||||
image='gcr.io/kubeflow-examples/code-search/ks:v20181130-b807843',
|
image='gcr.io/kubeflow-examples/code-search/ks:v20181202-fbf5905-dirty-a8480a',
|
||||||
command=['/usr/local/src/update_index.sh'],
|
command=['/usr/local/src/update_index.sh'],
|
||||||
arguments=[
|
arguments=[
|
||||||
'--baseGitRepo=%s' % base_git_repo,
|
'--baseGitRepo=%s' % base_git_repo,
|
||||||
|
|
@ -162,22 +167,28 @@ def function_embedding_update(
|
||||||
fork_git_repo='IronPan/examples',
|
fork_git_repo='IronPan/examples',
|
||||||
bot_email='kf.sample.bot@gmail.com'):
|
bot_email='kf.sample.bot@gmail.com'):
|
||||||
workflow_name = '{{workflow.name}}'
|
workflow_name = '{{workflow.name}}'
|
||||||
|
# Can't use workflow name as bq_suffix since BQ table doesn't accept '-' and
|
||||||
|
# workflow name is assigned at runtime. Pipeline might need to support
|
||||||
|
# replacing characters in workflow name.
|
||||||
|
bq_suffix = uuid.uuid4().hex[:6].upper()
|
||||||
working_dir = '%s/%s' % (working_dir, workflow_name)
|
working_dir = '%s/%s' % (working_dir, workflow_name)
|
||||||
lookup_file = '%s/code-embeddings-index/embedding-to-info.csv' % working_dir
|
lookup_file = '%s/code-embeddings-index/embedding-to-info.csv' % working_dir
|
||||||
index_file = '%s/code-embeddings-index/embeddings.index'% working_dir
|
index_file = '%s/code-embeddings-index/embeddings.index'% working_dir
|
||||||
|
function_embeddings_dir = '%s/%s' % (working_dir, "/code_embeddings")
|
||||||
|
token_pairs_bq_table = '%s:%s.token_pairs' %(project, target_dataset)
|
||||||
|
function_embeddings_bq_table = \
|
||||||
|
'%s:%s.function_embeddings_%s' % (project, target_dataset, bq_suffix)
|
||||||
|
|
||||||
function_embedding = dataflow_function_embedding_op(
|
function_embedding = dataflow_function_embedding_op(
|
||||||
project, cluster_name, target_dataset, data_dir,
|
project, cluster_name, token_pairs_bq_table, function_embeddings_bq_table,
|
||||||
saved_model_dir, workflow_name, worker_machine_type,
|
data_dir, function_embeddings_dir, saved_model_dir, workflow_name,
|
||||||
function_embedding_num_workers, working_dir)
|
worker_machine_type, function_embedding_num_workers, working_dir, namespace)
|
||||||
|
|
||||||
search_index_creator = search_index_creator_op(
|
search_index_creator = search_index_creator_op(
|
||||||
index_file, lookup_file, data_dir, workflow_name, cluster_name, namespace)
|
index_file, lookup_file, function_embeddings_dir, workflow_name, cluster_name, namespace)
|
||||||
search_index_creator.after(function_embedding)
|
search_index_creator.after(function_embedding)
|
||||||
update_index_op(
|
update_index_op(
|
||||||
base_git_repo, base_branch, app_dir, fork_git_repo,
|
base_git_repo, base_branch, app_dir, fork_git_repo,
|
||||||
index_file, lookup_file, workflow_name, bot_email)\
|
index_file, lookup_file, workflow_name, bot_email).after(search_index_creator)
|
||||||
.after(search_index_creator)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue