mirror of https://github.com/kubeflow/examples.git
				
				
				
			Update the ks parameter (#394)
* refactor ks * remove unnecessary params * update ks * address comments
This commit is contained in:
		
							parent
							
								
									78fdc74b56
								
							
						
					
					
						commit
						cea0ffde0d
					
				|  | @ -20,7 +20,7 @@ usage() { | |||
| 	--workflowId=<workflow id invoking the container> | ||||
| 	--indexFile=<index file> | ||||
| 	--lookupFile=<lookup file> | ||||
| 	--dataDir=<data dir> | ||||
| 	--functionEmbeddingsDir=<input function embedding dir> | ||||
| 	--timeout=<timeout> | ||||
| 	--namespace=<kubernetes namespace> | ||||
| 	--cluster=<cluster to deploy job to>" | ||||
|  | @ -33,7 +33,7 @@ source "${DIR}/parse_arguments.sh" | |||
| source "${DIR}/initialize_kubectl.sh" | ||||
| 
 | ||||
| # Apply parameters | ||||
| ks param set ${component} dataDir ${dataDir} --env ${ksEnvName} | ||||
| ks param set ${component} functionEmbeddingsDir ${functionEmbeddingsDir} --env ${ksEnvName} | ||||
| ks param set ${component} jobNameSuffix ${workflowId} --env ${ksEnvName} | ||||
| ks param set ${component} lookupFile ${lookupFile} --env ${ksEnvName} | ||||
| ks param set ${component} indexFile ${indexFile} --env ${ksEnvName} | ||||
|  |  | |||
|  | @ -24,16 +24,19 @@ usage() { | |||
| 	--workflowId=<workflow id invoking the container> | ||||
| 	--modelDir=<directory contains the model> | ||||
| 	--dataDir=<data dir> | ||||
| 	--functionEmbeddingsDir=<output function embedding dir> | ||||
| 	--tokenPairsBQTable=<input token pairs BQ table> | ||||
| 	--functionEmbeddingsBQTable=<output function embedding BQ table> | ||||
| 	--numWorkers=<num of workers> | ||||
| 	--project=<project> | ||||
| 	--targetDataset=<target BQ dataset> | ||||
| 	--workerMachineType=<worker machine type> | ||||
| 	--workingDir=<working dir> | ||||
| 	--cluster=<cluster to deploy job to>" | ||||
| 	--cluster=<cluster to deploy job to> | ||||
|     --namespace=<kubernetes namespace>" | ||||
| } | ||||
| 
 | ||||
| # List of required parameters | ||||
| names=(dataDir modelDir targetDataset workingDir workflowId cluster) | ||||
| names=(dataDir modelDir functionEmbeddingsDir tokenPairsBQTable functionEmbeddingsBQTable workingDir workflowId cluster namespace) | ||||
| 
 | ||||
| source "${DIR}/parse_arguments.sh" | ||||
| source "${DIR}/initialize_kubectl.sh" | ||||
|  | @ -41,9 +44,11 @@ source "${DIR}/initialize_kubectl.sh" | |||
| # Apply parameters | ||||
| ks param set ${component} jobNameSuffix ${workflowId} --env ${ksEnvName} | ||||
| ks param set ${component} dataDir ${dataDir} --env ${ksEnvName} | ||||
| ks param set ${component} functionEmbeddingsDir ${functionEmbeddingsDir} --env ${ksEnvName} | ||||
| ks param set ${component} tokenPairsBQTable ${tokenPairsBQTable} --env ${ksEnvName} | ||||
| ks param set ${component} functionEmbeddingsBQTable ${functionEmbeddingsBQTable} --env ${ksEnvName} | ||||
| ks param set ${component} modelDir ${modelDir} --env ${ksEnvName} | ||||
| ks param set ${component} project ${project} --env ${ksEnvName} | ||||
| ks param set ${component} targetDataset ${targetDataset} --env ${ksEnvName} | ||||
| ks param set ${component} workingDir ${workingDir} --env ${ksEnvName} | ||||
| ks param set ${component} numWorkers ${numWorkers} --env ${ksEnvName} | ||||
| ks param set ${component} workerMachineType ${workerMachineType} --env ${ksEnvName} | ||||
|  |  | |||
|  | @ -22,7 +22,6 @@ | |||
|     bqSuffix: std.strReplace(self.jobNameSuffix, "-", "_"), | ||||
|     functionEmbeddingsBQTable: self.project + ":" + self.bqDataset + ".code_embeddings_" + self.bqSuffix, | ||||
| 
 | ||||
| 
 | ||||
|     // Location where the function embeddings should be written. | ||||
|     functionEmbeddingsDir: "gs://code-search-demo/20181130/code_embeddings", | ||||
| 
 | ||||
|  | @ -34,5 +33,8 @@ | |||
|   "pipeline": { | ||||
|     name: "pipeline", | ||||
|     problem: "kf_github_function_docstring", | ||||
|     project: "code-search-demo", | ||||
|     bqDataset: "code_search", | ||||
|     tokenPairsBQTable: self.project + ":" + self.bqDataset + ".token_pairs", | ||||
|   }, | ||||
| } | ||||
|  |  | |||
|  | @ -82,7 +82,7 @@ | |||
|       name: "search-index-creator", | ||||
|       jobNameSuffix: "null", | ||||
|       image: $.components["t2t-job"].dataflowImage, | ||||
|       dataDir: $.components["t2t-code-search"].workingDir + "/data", | ||||
|       functionEmbeddingsDir: "", | ||||
|       lookupFile: "null", | ||||
|       indexFile: "null", | ||||
|     }, | ||||
|  | @ -111,10 +111,6 @@ | |||
|     "submit-code-embeddings-job": { | ||||
|       name: "submit-code-embeddings-job", | ||||
|       image: $.components["t2t-job"].dataflowImage, | ||||
|       // Input table; this should be of the form PROJECT:DATASET.table | ||||
|       inputTable: "", | ||||
|       // BigQuery table where results will be written. | ||||
|       targetDataset: "code_search",       | ||||
|       // Directory where the model is stored. | ||||
|       modelDir: "", | ||||
|       jobName: "submit-code-embeddings-job", | ||||
|  | @ -122,6 +118,11 @@ | |||
|       workerMachineType: "n1-highcpu-32", | ||||
|       numWorkers: 5, | ||||
|       waitUntilFinish: "false", | ||||
|       workingDir: $.components["t2t-code-search"].workingDir, | ||||
|       dataDir: self.workingDir + "/data", | ||||
|       functionEmbeddingsDir: self.workingDir + "/code_embeddings", | ||||
|       tokenPairsBQTable: "", | ||||
|       functionEmbeddingsBQTable: "", | ||||
|     }, | ||||
| 
 | ||||
|     tensorboard: { | ||||
|  |  | |||
|  | @ -39,7 +39,7 @@ local jobSpec = { | |||
| 		          "python", | ||||
| 		          "-m", | ||||
| 		          "code_search.nmslib.cli.create_search_index", | ||||
| 		          "--data_dir=" + params.dataDir, | ||||
| 		          "--data_dir=" + params.functionEmbeddingsDir, | ||||
| 		          "--lookup_file=" + params.lookupFile, | ||||
| 		          "--index_file=" + params.indexFile, | ||||
| 		        ], | ||||
|  |  | |||
|  | @ -30,7 +30,7 @@ | |||
| 	              "python2", | ||||
| 	              "-m", | ||||
| 	              "code_search.dataflow.cli.create_function_embeddings", | ||||
| 	              "--runner=DataflowRunner",	               | ||||
| 	              "--runner=DataflowRunner", | ||||
| 	              "--project=" + params.project, | ||||
| 	              "--token_pairs_table=" + params.tokenPairsBQTable, | ||||
| 	              "--function_embeddings_table=" + params.functionEmbeddingsBQTable, | ||||
|  |  | |||
|  | @ -1,4 +1,5 @@ | |||
| from typing import Dict | ||||
| import uuid | ||||
| from kubernetes import client as k8s_client | ||||
| import kfp.dsl as dsl | ||||
| 
 | ||||
|  | @ -59,39 +60,43 @@ def default_gcp_op(name: str, image: str, command: str = None, | |||
|   ) | ||||
| 
 | ||||
| def dataflow_function_embedding_op( | ||||
|         project: 'GcpProject', cluster_name: str, target_dataset: str, data_dir: 'GcsUri', | ||||
|         saved_model_dir: 'GcsUri', workflow_id: str, worker_machine_type: str, | ||||
|         num_workers: int, working_dir: str, step_name='dataflow_function_embedding'): | ||||
|         project: 'GcpProject', cluster_name: str, token_pairs_bq_table: str, | ||||
|         function_embeddings_bq_table: str, data_dir: 'GcsUri', | ||||
|         function_embeddings_dir: str, saved_model_dir: 'GcsUri', workflow_id: str, | ||||
|         worker_machine_type: str, num_workers: int, working_dir: str, namespace: str): | ||||
|   return default_gcp_op( | ||||
|     name=step_name, | ||||
|     image='gcr.io/kubeflow-examples/code-search/ks:v20181130-b807843', | ||||
|     name='dataflow_function_embedding', | ||||
|     image='gcr.io/kubeflow-examples/code-search/ks:v20181202-fbf5905-dirty-a8480a', | ||||
|     command=['/usr/local/src/submit_code_embeddings_job.sh'], | ||||
|     arguments=[ | ||||
|       "--workflowId=%s" % workflow_id, | ||||
|       "--modelDir=%s" % saved_model_dir, | ||||
|       "--dataDir=%s" % data_dir, | ||||
|       "--functionEmbeddingsDir=%s" % function_embeddings_dir, | ||||
|       "--numWorkers=%s" % num_workers, | ||||
|       "--project=%s" % project, | ||||
|       "--targetDataset=%s" % target_dataset, | ||||
|       "--tokenPairsBQTable=%s" % token_pairs_bq_table, | ||||
|       "--functionEmbeddingsBQTable=%s" % function_embeddings_bq_table, | ||||
|       "--workerMachineType=%s" % worker_machine_type, | ||||
|       "--workingDir=%s" % working_dir, | ||||
|       '--cluster=%s' % cluster_name, | ||||
|       "--cluster=%s" % cluster_name, | ||||
|       "--namespace=%s" % namespace, | ||||
|     ] | ||||
|   ) | ||||
| 
 | ||||
| 
 | ||||
| def search_index_creator_op( | ||||
|         index_file: str, lookup_file: str, data_dir: str, | ||||
|         index_file: str, lookup_file: str, function_embeddings_dir: str, | ||||
|         workflow_id: str, cluster_name: str, namespace: str): | ||||
|   return dsl.ContainerOp( | ||||
|     # use component name as step name | ||||
|     name='search_index_creator', | ||||
|     image='gcr.io/kubeflow-examples/code-search/ks:v20181130-b807843', | ||||
|     image='gcr.io/kubeflow-examples/code-search/ks:v20181202-fbf5905-dirty-a8480a', | ||||
|     command=['/usr/local/src/launch_search_index_creator_job.sh'], | ||||
|     arguments=[ | ||||
|       '--indexFile=%s' % index_file, | ||||
|       '--lookupFile=%s' % lookup_file, | ||||
|       '--dataDir=%s' % data_dir, | ||||
|       '--functionEmbeddingsDir=%s' % function_embeddings_dir, | ||||
|       '--workflowId=%s' % workflow_id, | ||||
|       '--cluster=%s' % cluster_name, | ||||
|       '--namespace=%s' % namespace, | ||||
|  | @ -105,7 +110,7 @@ def update_index_op( | |||
|   return ( | ||||
|     dsl.ContainerOp( | ||||
|       name='update_index', | ||||
|       image='gcr.io/kubeflow-examples/code-search/ks:v20181130-b807843', | ||||
|       image='gcr.io/kubeflow-examples/code-search/ks:v20181202-fbf5905-dirty-a8480a', | ||||
|       command=['/usr/local/src/update_index.sh'], | ||||
|       arguments=[ | ||||
|         '--baseGitRepo=%s' % base_git_repo, | ||||
|  | @ -162,22 +167,28 @@ def function_embedding_update( | |||
|     fork_git_repo='IronPan/examples', | ||||
|     bot_email='kf.sample.bot@gmail.com'): | ||||
|   workflow_name = '{{workflow.name}}' | ||||
|   # Can't use workflow name as bq_suffix since BQ table doesn't accept '-' and | ||||
|   # workflow name is assigned at runtime. Pipeline might need to support | ||||
|   # replacing characters in workflow name. | ||||
|   bq_suffix = uuid.uuid4().hex[:6].upper() | ||||
|   working_dir = '%s/%s' % (working_dir, workflow_name) | ||||
|   lookup_file = '%s/code-embeddings-index/embedding-to-info.csv' % working_dir | ||||
|   index_file = '%s/code-embeddings-index/embeddings.index'% working_dir | ||||
|   function_embeddings_dir = '%s/%s' % (working_dir, "/code_embeddings") | ||||
|   token_pairs_bq_table = '%s:%s.token_pairs' %(project, target_dataset) | ||||
|   function_embeddings_bq_table = \ | ||||
|     '%s:%s.function_embeddings_%s' % (project, target_dataset, bq_suffix) | ||||
| 
 | ||||
|   function_embedding = dataflow_function_embedding_op( | ||||
|                             project, cluster_name, target_dataset, data_dir, | ||||
|                             saved_model_dir, workflow_name, worker_machine_type, | ||||
|                             function_embedding_num_workers, working_dir) | ||||
| 
 | ||||
|     project, cluster_name, token_pairs_bq_table, function_embeddings_bq_table, | ||||
|     data_dir, function_embeddings_dir, saved_model_dir, workflow_name, | ||||
|     worker_machine_type, function_embedding_num_workers, working_dir, namespace) | ||||
|   search_index_creator = search_index_creator_op( | ||||
|     index_file, lookup_file, data_dir, workflow_name, cluster_name, namespace) | ||||
|     index_file, lookup_file, function_embeddings_dir, workflow_name, cluster_name, namespace) | ||||
|   search_index_creator.after(function_embedding) | ||||
|   update_index_op( | ||||
|       base_git_repo, base_branch, app_dir, fork_git_repo, | ||||
|       index_file, lookup_file, workflow_name, bot_email)\ | ||||
|     .after(search_index_creator) | ||||
|       index_file, lookup_file, workflow_name, bot_email).after(search_index_creator) | ||||
| 
 | ||||
| 
 | ||||
| if __name__ == '__main__': | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue