mirror of https://github.com/kubeflow/examples.git
Update the ks parameter (#394)
* refactor ks * remove unnecessary params * update ks * address comments
This commit is contained in:
parent
78fdc74b56
commit
cea0ffde0d
|
|
@ -20,7 +20,7 @@ usage() {
|
|||
--workflowId=<workflow id invoking the container>
|
||||
--indexFile=<index file>
|
||||
--lookupFile=<lookup file>
|
||||
--dataDir=<data dir>
|
||||
--functionEmbeddingsDir=<input function embedding dir>
|
||||
--timeout=<timeout>
|
||||
--namespace=<kubernetes namespace>
|
||||
--cluster=<cluster to deploy job to>"
|
||||
|
|
@ -33,7 +33,7 @@ source "${DIR}/parse_arguments.sh"
|
|||
source "${DIR}/initialize_kubectl.sh"
|
||||
|
||||
# Apply parameters
|
||||
ks param set ${component} dataDir ${dataDir} --env ${ksEnvName}
|
||||
ks param set ${component} functionEmbeddingsDir ${functionEmbeddingsDir} --env ${ksEnvName}
|
||||
ks param set ${component} jobNameSuffix ${workflowId} --env ${ksEnvName}
|
||||
ks param set ${component} lookupFile ${lookupFile} --env ${ksEnvName}
|
||||
ks param set ${component} indexFile ${indexFile} --env ${ksEnvName}
|
||||
|
|
|
|||
|
|
@ -24,16 +24,19 @@ usage() {
|
|||
--workflowId=<workflow id invoking the container>
|
||||
--modelDir=<directory contains the model>
|
||||
--dataDir=<data dir>
|
||||
--functionEmbeddingsDir=<output function embedding dir>
|
||||
--tokenPairsBQTable=<input token pairs BQ table>
|
||||
--functionEmbeddingsBQTable=<output function embedding BQ table>
|
||||
--numWorkers=<num of workers>
|
||||
--project=<project>
|
||||
--targetDataset=<target BQ dataset>
|
||||
--workerMachineType=<worker machine type>
|
||||
--workingDir=<working dir>
|
||||
--cluster=<cluster to deploy job to>"
|
||||
--cluster=<cluster to deploy job to>
|
||||
--namespace=<kubernetes namespace>"
|
||||
}
|
||||
|
||||
# List of required parameters
|
||||
names=(dataDir modelDir targetDataset workingDir workflowId cluster)
|
||||
names=(dataDir modelDir functionEmbeddingsDir tokenPairsBQTable functionEmbeddingsBQTable workingDir workflowId cluster namespace)
|
||||
|
||||
source "${DIR}/parse_arguments.sh"
|
||||
source "${DIR}/initialize_kubectl.sh"
|
||||
|
|
@ -41,9 +44,11 @@ source "${DIR}/initialize_kubectl.sh"
|
|||
# Apply parameters
|
||||
ks param set ${component} jobNameSuffix ${workflowId} --env ${ksEnvName}
|
||||
ks param set ${component} dataDir ${dataDir} --env ${ksEnvName}
|
||||
ks param set ${component} functionEmbeddingsDir ${functionEmbeddingsDir} --env ${ksEnvName}
|
||||
ks param set ${component} tokenPairsBQTable ${tokenPairsBQTable} --env ${ksEnvName}
|
||||
ks param set ${component} functionEmbeddingsBQTable ${functionEmbeddingsBQTable} --env ${ksEnvName}
|
||||
ks param set ${component} modelDir ${modelDir} --env ${ksEnvName}
|
||||
ks param set ${component} project ${project} --env ${ksEnvName}
|
||||
ks param set ${component} targetDataset ${targetDataset} --env ${ksEnvName}
|
||||
ks param set ${component} workingDir ${workingDir} --env ${ksEnvName}
|
||||
ks param set ${component} numWorkers ${numWorkers} --env ${ksEnvName}
|
||||
ks param set ${component} workerMachineType ${workerMachineType} --env ${ksEnvName}
|
||||
|
|
|
|||
|
|
@ -22,7 +22,6 @@
|
|||
bqSuffix: std.strReplace(self.jobNameSuffix, "-", "_"),
|
||||
functionEmbeddingsBQTable: self.project + ":" + self.bqDataset + ".code_embeddings_" + self.bqSuffix,
|
||||
|
||||
|
||||
// Location where the function embeddings should be written.
|
||||
functionEmbeddingsDir: "gs://code-search-demo/20181130/code_embeddings",
|
||||
|
||||
|
|
@ -34,5 +33,8 @@
|
|||
"pipeline": {
|
||||
name: "pipeline",
|
||||
problem: "kf_github_function_docstring",
|
||||
project: "code-search-demo",
|
||||
bqDataset: "code_search",
|
||||
tokenPairsBQTable: self.project + ":" + self.bqDataset + ".token_pairs",
|
||||
},
|
||||
}
|
||||
|
|
|
|||
|
|
@ -82,7 +82,7 @@
|
|||
name: "search-index-creator",
|
||||
jobNameSuffix: "null",
|
||||
image: $.components["t2t-job"].dataflowImage,
|
||||
dataDir: $.components["t2t-code-search"].workingDir + "/data",
|
||||
functionEmbeddingsDir: "",
|
||||
lookupFile: "null",
|
||||
indexFile: "null",
|
||||
},
|
||||
|
|
@ -111,10 +111,6 @@
|
|||
"submit-code-embeddings-job": {
|
||||
name: "submit-code-embeddings-job",
|
||||
image: $.components["t2t-job"].dataflowImage,
|
||||
// Input table this should be of the form PROJECT:DATASET.table
|
||||
inputTable: "",
|
||||
// Big query table where results will be written.
|
||||
targetDataset: "code_search",
|
||||
// Directory where the model is stored.
|
||||
modelDir: "",
|
||||
jobName: "submit-code-embeddings-job",
|
||||
|
|
@ -122,6 +118,11 @@
|
|||
workerMachineType: "n1-highcpu-32",
|
||||
numWorkers: 5,
|
||||
waitUntilFinish: "false",
|
||||
workingDir: $.components["t2t-code-search"].workingDir,
|
||||
dataDir: self.workingDir + "/data",
|
||||
functionEmbeddingsDir: self.workingDir + "/code_embeddings",
|
||||
tokenPairsBQTable: "",
|
||||
functionEmbeddingsBQTable: "",
|
||||
},
|
||||
|
||||
tensorboard: {
|
||||
|
|
|
|||
|
|
@ -39,7 +39,7 @@ local jobSpec = {
|
|||
"python",
|
||||
"-m",
|
||||
"code_search.nmslib.cli.create_search_index",
|
||||
"--data_dir=" + params.dataDir,
|
||||
"--data_dir=" + params.functionEmbeddingsDir,
|
||||
"--lookup_file=" + params.lookupFile,
|
||||
"--index_file=" + params.indexFile,
|
||||
],
|
||||
|
|
|
|||
|
|
@ -30,7 +30,7 @@
|
|||
"python2",
|
||||
"-m",
|
||||
"code_search.dataflow.cli.create_function_embeddings",
|
||||
"--runner=DataflowRunner",
|
||||
"--runner=DataflowRunner",
|
||||
"--project=" + params.project,
|
||||
"--token_pairs_table=" + params.tokenPairsBQTable,
|
||||
"--function_embeddings_table=" + params.functionEmbeddingsBQTable,
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
from typing import Dict
|
||||
import uuid
|
||||
from kubernetes import client as k8s_client
|
||||
import kfp.dsl as dsl
|
||||
|
||||
|
|
@ -59,39 +60,43 @@ def default_gcp_op(name: str, image: str, command: str = None,
|
|||
)
|
||||
|
||||
def dataflow_function_embedding_op(
|
||||
project: 'GcpProject', cluster_name: str, target_dataset: str, data_dir: 'GcsUri',
|
||||
saved_model_dir: 'GcsUri', workflow_id: str, worker_machine_type: str,
|
||||
num_workers: int, working_dir: str, step_name='dataflow_function_embedding'):
|
||||
project: 'GcpProject', cluster_name: str, token_pairs_bq_table: str,
|
||||
function_embeddings_bq_table: str, data_dir: 'GcsUri',
|
||||
function_embeddings_dir: str, saved_model_dir: 'GcsUri', workflow_id: str,
|
||||
worker_machine_type: str, num_workers: int, working_dir: str, namespace: str):
|
||||
return default_gcp_op(
|
||||
name=step_name,
|
||||
image='gcr.io/kubeflow-examples/code-search/ks:v20181130-b807843',
|
||||
name='dataflow_function_embedding',
|
||||
image='gcr.io/kubeflow-examples/code-search/ks:v20181202-fbf5905-dirty-a8480a',
|
||||
command=['/usr/local/src/submit_code_embeddings_job.sh'],
|
||||
arguments=[
|
||||
"--workflowId=%s" % workflow_id,
|
||||
"--modelDir=%s" % saved_model_dir,
|
||||
"--dataDir=%s" % data_dir,
|
||||
"--functionEmbeddingsDir=%s" % function_embeddings_dir,
|
||||
"--numWorkers=%s" % num_workers,
|
||||
"--project=%s" % project,
|
||||
"--targetDataset=%s" % target_dataset,
|
||||
"--tokenPairsBQTable=%s" % token_pairs_bq_table,
|
||||
"--functionEmbeddingsBQTable=%s" % function_embeddings_bq_table,
|
||||
"--workerMachineType=%s" % worker_machine_type,
|
||||
"--workingDir=%s" % working_dir,
|
||||
'--cluster=%s' % cluster_name,
|
||||
"--cluster=%s" % cluster_name,
|
||||
"--namespace=%s" % namespace,
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
def search_index_creator_op(
|
||||
index_file: str, lookup_file: str, data_dir: str,
|
||||
index_file: str, lookup_file: str, function_embeddings_dir: str,
|
||||
workflow_id: str, cluster_name: str, namespace: str):
|
||||
return dsl.ContainerOp(
|
||||
# use component name as step name
|
||||
name='search_index_creator',
|
||||
image='gcr.io/kubeflow-examples/code-search/ks:v20181130-b807843',
|
||||
image='gcr.io/kubeflow-examples/code-search/ks:v20181202-fbf5905-dirty-a8480a',
|
||||
command=['/usr/local/src/launch_search_index_creator_job.sh'],
|
||||
arguments=[
|
||||
'--indexFile=%s' % index_file,
|
||||
'--lookupFile=%s' % lookup_file,
|
||||
'--dataDir=%s' % data_dir,
|
||||
'--functionEmbeddingsDir=%s' % function_embeddings_dir,
|
||||
'--workflowId=%s' % workflow_id,
|
||||
'--cluster=%s' % cluster_name,
|
||||
'--namespace=%s' % namespace,
|
||||
|
|
@ -105,7 +110,7 @@ def update_index_op(
|
|||
return (
|
||||
dsl.ContainerOp(
|
||||
name='update_index',
|
||||
image='gcr.io/kubeflow-examples/code-search/ks:v20181130-b807843',
|
||||
image='gcr.io/kubeflow-examples/code-search/ks:v20181202-fbf5905-dirty-a8480a',
|
||||
command=['/usr/local/src/update_index.sh'],
|
||||
arguments=[
|
||||
'--baseGitRepo=%s' % base_git_repo,
|
||||
|
|
@ -162,22 +167,28 @@ def function_embedding_update(
|
|||
fork_git_repo='IronPan/examples',
|
||||
bot_email='kf.sample.bot@gmail.com'):
|
||||
workflow_name = '{{workflow.name}}'
|
||||
# Can't use workflow name as bq_suffix since BQ table doesn't accept '-' and
|
||||
# workflow name is assigned at runtime. Pipeline might need to support
|
||||
# replacing characters in workflow name.
|
||||
bq_suffix = uuid.uuid4().hex[:6].upper()
|
||||
working_dir = '%s/%s' % (working_dir, workflow_name)
|
||||
lookup_file = '%s/code-embeddings-index/embedding-to-info.csv' % working_dir
|
||||
index_file = '%s/code-embeddings-index/embeddings.index'% working_dir
|
||||
function_embeddings_dir = '%s/%s' % (working_dir, "/code_embeddings")
|
||||
token_pairs_bq_table = '%s:%s.token_pairs' %(project, target_dataset)
|
||||
function_embeddings_bq_table = \
|
||||
'%s:%s.function_embeddings_%s' % (project, target_dataset, bq_suffix)
|
||||
|
||||
function_embedding = dataflow_function_embedding_op(
|
||||
project, cluster_name, target_dataset, data_dir,
|
||||
saved_model_dir, workflow_name, worker_machine_type,
|
||||
function_embedding_num_workers, working_dir)
|
||||
|
||||
project, cluster_name, token_pairs_bq_table, function_embeddings_bq_table,
|
||||
data_dir, function_embeddings_dir, saved_model_dir, workflow_name,
|
||||
worker_machine_type, function_embedding_num_workers, working_dir, namespace)
|
||||
search_index_creator = search_index_creator_op(
|
||||
index_file, lookup_file, data_dir, workflow_name, cluster_name, namespace)
|
||||
index_file, lookup_file, function_embeddings_dir, workflow_name, cluster_name, namespace)
|
||||
search_index_creator.after(function_embedding)
|
||||
update_index_op(
|
||||
base_git_repo, base_branch, app_dir, fork_git_repo,
|
||||
index_file, lookup_file, workflow_name, bot_email)\
|
||||
.after(search_index_creator)
|
||||
index_file, lookup_file, workflow_name, bot_email).after(search_index_creator)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
|
|
|||
Loading…
Reference in New Issue