From f9873e6ac4c818dc71ff1af8a823af40c54ae450 Mon Sep 17 00:00:00 2001 From: Sanyam Kapoor Date: Mon, 20 Aug 2018 16:35:07 -0700 Subject: [PATCH] Upgrade notebook commands and other relevant changes (#229) * Replace double quotes for field values (ks convention) * Recreate the ksonnet application from scratch * Fix pip commands to find requirements and redo installation, fix ks param set * Use sed replace instead of ks param set. * Add cells to first show JobSpec and then apply * Upgrade T2T, fix conflicting problem types * Update docker images * Reduce to 200k samples for vocab * Use Jupyter notebook service account * Add illustrative gsutil commands to show output files, specify index files glob explicitly * List files after index creation step * Use the model in current repository and not upstream t2t * Update Docker images * Expose TF Serving Rest API at 9001 * Spawn terminal from the notebooks ui, no need to go to lab --- code_search/README.md | 8 +- code_search/code-search.ipynb | 346 +++++++++++++++--- code_search/docker/t2t/Dockerfile | 4 +- code_search/docker/ui/Dockerfile | 2 +- code_search/kubeflow/app.yaml | 12 +- .../kubeflow/components/params.libsonnet | 29 +- .../README.md | 0 .../parts.yaml | 0 .../tf-serving-all-features.jsonnet | 0 .../tf-serving-with-request-log.jsonnet | 230 ++++++++++++ .../tf-serving.libsonnet | 31 +- .../util.libsonnet | 0 .../nmslib/cli/create_search_index.py | 2 +- .../src/code_search/t2t/function_docstring.py | 4 +- .../t2t/function_docstring_extended.py | 2 +- .../code_search/t2t/similarity_transformer.py | 2 +- code_search/src/requirements.txt | 2 +- 17 files changed, 570 insertions(+), 104 deletions(-) rename code_search/kubeflow/vendor/kubeflow/{tf-serving@e95f94a1a97a0974ada734895d590b5ba565fa77 => tf-serving@ab6084349673e6405ae486eb3be2141e3550643c}/README.md (100%) rename code_search/kubeflow/vendor/kubeflow/{tf-serving@e95f94a1a97a0974ada734895d590b5ba565fa77 => tf-serving@ab6084349673e6405ae486eb3be2141e3550643c}/parts.yaml (100%) rename code_search/kubeflow/vendor/kubeflow/{tf-serving@e95f94a1a97a0974ada734895d590b5ba565fa77 => tf-serving@ab6084349673e6405ae486eb3be2141e3550643c}/prototypes/tf-serving-all-features.jsonnet (100%) create mode 100644 code_search/kubeflow/vendor/kubeflow/tf-serving@ab6084349673e6405ae486eb3be2141e3550643c/prototypes/tf-serving-with-request-log.jsonnet rename code_search/kubeflow/vendor/kubeflow/{tf-serving@e95f94a1a97a0974ada734895d590b5ba565fa77 => tf-serving@ab6084349673e6405ae486eb3be2141e3550643c}/tf-serving.libsonnet (95%) rename code_search/kubeflow/vendor/kubeflow/{tf-serving@e95f94a1a97a0974ada734895d590b5ba565fa77 => tf-serving@ab6084349673e6405ae486eb3be2141e3550643c}/util.libsonnet (100%) diff --git a/code_search/README.md b/code_search/README.md index 5fb4f078..2a1c0afe 100644 --- a/code_search/README.md +++ b/code_search/README.md @@ -37,17 +37,15 @@ you replace this with the true FQDN of your Kubeflow cluster in any subsequent i * In the **Image** text field, enter `gcr.io/kubeflow-images-public/kubeflow-codelab-notebook:v20180808-v0.2-22-gcfdcb12`. This image contains all the pre-requisites needed for the demo. -* Once spawned, you should be redirected to the notebooks UI. We intend to go to the JupyterLab home - page which is available at the URL - **https://kubeflow.example.com/user//lab**. - **TIP**: Simply point the browser to **/lab** instead of the **/tree** path in the URL. - +* Once spawned, you should be redirected to the Jupyter Notebooks UI. 
+ * Spawn a new Terminal and run ``` $ git clone --branch=master --depth=1 https://github.com/kubeflow/examples ``` This will create an examples folder. It is safe to close the terminal now. -* Refresh the File Explorer (typically to the left) and navigate to `examples/code_search`. Open +* Navigate back to the Jupyter Notebooks UI and navigate to `examples/code_search`. Open the Jupyter notebook `code-search.ipynb` and follow it along. # Acknowledgements diff --git a/code_search/code-search.ipynb b/code_search/code-search.ipynb index 76236d7c..4e7d13af 100644 --- a/code_search/code-search.ipynb +++ b/code_search/code-search.ipynb @@ -21,7 +21,14 @@ "source": [ "## Install dependencies\n", "\n", - "Let us install all the Python dependencies. Note that everything must be done with `Python 2`. This will take a while and only needs to be run once." + "Let us install all the Python dependencies. Note that everything must be done with `Python 2`. This will take a while the first time." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Verify Version Information" ] }, { @@ -30,9 +37,30 @@ "metadata": {}, "outputs": [], "source": [ - "! pip2 install https://github.com/kubeflow/batch-predict/tarball/master\n", + "%%bash\n", "\n", - "! pip2 install -r src/requirements.txt" + "echo \"Pip Version Info: \" && python2 --version && python2 -m pip --version && echo\n", + "echo \"Google Cloud SDK Info: \" && gcloud --version && echo\n", + "echo \"Ksonnet Version Info: \" && ks version && echo\n", + "echo \"Kubectl Version Info: \" && kubectl version" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Install Pip Packages" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "! python2 -m pip install -U pip" ] }, { @@ -41,8 +69,9 @@ "metadata": {}, "outputs": [], "source": [ - "# Only for BigQuery cells\n", - "! pip2 install pandas-gbq" + "# Code Search dependencies\n", + "! python2 -m pip install --user https://github.com/kubeflow/batch-predict/tarball/master\n", + "! python2 -m pip install --user -r src/requirements.txt" ] }, { @@ -51,6 +80,17 @@ "metadata": {}, "outputs": [], "source": [ + "# BigQuery Cell Dependencies\n", + "! python2 -m pip install --user pandas-gbq" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# NOTE: The RuntimeWarnings (if any) are harmless. See ContinuumIO/anaconda-issues#6678.\n", "from pandas.io import gbq" ] }, @@ -72,10 +112,8 @@ "# Configuration Variables. Modify as desired.\n", "\n", "PROJECT = 'kubeflow-dev'\n", - "CLUSTER_NAME = 'kubeflow-latest'\n", - "CLUSTER_REGION = 'us-east1-d'\n", - "CLUSTER_NAMESPACE = 'kubeflow-latest'\n", "\n", + "# Dataflow Related Variables.\n", "TARGET_DATASET = 'code_search'\n", "WORKING_DIR = 'gs://kubeflow-examples/t2t-code-search/notebook-demo'\n", "WORKER_MACHINE_TYPE = 'n1-highcpu-32'\n", @@ -83,10 +121,6 @@ "\n", "# DO NOT MODIFY. 
These are environment variables to be used in a bash shell.\n", "%env PROJECT $PROJECT\n", - "%env CLUSTER_NAME $CLUSTER_NAME\n", - "%env CLUSTER_REGION $CLUSTER_REGION\n", - "%env CLUSTER_NAMESPACE $CLUSTER_NAMESPACE\n", - "\n", "%env TARGET_DATASET $TARGET_DATASET\n", "%env WORKING_DIR $WORKING_DIR\n", "%env WORKER_MACHINE_TYPE $WORKER_MACHINE_TYPE\n", @@ -99,7 +133,7 @@ "source": [ "### Setup Authorization\n", "\n", - "In a Kubeflow cluster, we already have the key credentials available with each pod and will re-use them to authenticate. This will allow us to submit `TFJob`s and execute `Dataflow` pipelines. We also set the new context for the Code Search Ksonnet application." + "In a Kubeflow cluster on GKE, we already have the Google Application Credentials mounted onto each Pod. We can simply point `gcloud` to activate that service account." ] }, { @@ -111,13 +145,35 @@ "%%bash\n", "\n", "# Activate Service Account provided by Kubeflow.\n", - "gcloud auth activate-service-account --key-file=${GOOGLE_APPLICATION_CREDENTIALS}\n", + "gcloud auth activate-service-account --key-file=${GOOGLE_APPLICATION_CREDENTIALS}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Additionally, to interact with the underlying cluster, we configure `kubectl`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", "\n", - "# Get KUBECONFIG for the desired cluster.\n", - "gcloud container clusters get-credentials ${CLUSTER_NAME} --region ${CLUSTER_REGION}\n", - "\n", - "# Set the namespace of the context.\n", - "kubectl config set contexts.$(kubectl config current-context).namespace ${CLUSTER_NAMESPACE}" + "kubectl config set-cluster kubeflow --server=https://kubernetes.default --certificate-authority=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt\n", + "kubectl config set-credentials jupyter --token \"$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)\"\n", + "kubectl config set-context kubeflow --cluster kubeflow --user jupyter\n", + "kubectl config use-context kubeflow" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Collectively, these allow us to interact with Google Cloud Services as well as the Kubernetes Cluster directly to submit `TFJob`s and execute `Dataflow` pipelines." ] }, { @@ -126,7 +182,7 @@ "source": [ "### Setup Ksonnet Application\n", "\n", - "This will use the context we've set above and provide it as a new environment to the Ksonnet application." + "We now point the Ksonnet application to the underlying Kubernetes cluster." 
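
Before wiring the ksonnet environment below, it can help to confirm that the mounted credentials actually resolve. A minimal sketch (not part of the patch; assumes the `google-auth` package present on the notebook image):

```python
# Sanity-check Application Default Credentials from inside the Pod.
# GOOGLE_APPLICATION_CREDENTIALS points at the key file mounted by Kubeflow.
import google.auth

credentials, project = google.auth.default()
print('Resolved service account credentials for project: {}'.format(project))
```

A `DefaultCredentialsError` here means the key file is not mounted where `GOOGLE_APPLICATION_CREDENTIALS` says it is, and the `gcloud` activation above will fail for the same reason.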
] }, { @@ -139,32 +195,15 @@ "\n", "cd kubeflow\n", "\n", - "# Update Ksonnet application to the context set earlier\n", - "ks env add code-search --context=$(kubectl config current-context)\n", + "# Update Ksonnet to point to the Kubernetes Cluster\n", + "ks env add code-search --context $(kubectl config current-context)\n", "\n", "# Update the Working Directory of the application\n", - "ks param set t2t-code-search workingDir ${WORKING_DIR}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Verify Version Information" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%bash\n", + "sed -i'' \"s,gs://example/prefix,${WORKING_DIR},\" components/params.libsonnet\n", "\n", - "echo \"Pip Version Info: \" && pip2 --version && echo\n", - "echo \"Google Cloud SDK Info: \" && gcloud --version && echo\n", - "echo \"Ksonnet Version Info: \" && ks version && echo\n", - "echo \"Kubectl Version Info: \" && kubectl version" + "# FIXME(sanyamkapoor): This command completely replaces previous configurations.\n", + "# Hence, using string replacement in file.\n", + "# ks param set t2t-code-search workingDir ${WORKING_DIR}" ] }, { @@ -173,7 +212,9 @@ "source": [ "## View Github Files\n", "\n", - "This is the query that is run as the first step of the Pre-Processing pipeline and is sent through a set of transformations. This is illustrative of the rows being processed in the pipeline we trigger next." + "This is the query that is run as the first step of the Pre-Processing pipeline and is sent through a set of transformations. This is illustrative of the rows being processed in the pipeline we trigger next.\n", + "\n", + "**WARNING**: The table is large and the query can take a few minutes to complete." ] }, { @@ -294,7 +335,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "scrolled": false + }, "outputs": [], "source": [ "query = \"\"\"\n", @@ -308,13 +351,48 @@ "gbq.read_gbq(query, dialect='standard', project_id=PROJECT)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This pipeline also writes a set of CSV files which contain function and docstring pairs delimited by a comma. Here, we list a subset of them." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "\n", + "LIMIT=10\n", + "\n", + "gsutil ls ${WORKING_DIR}/data/*.csv | head -n ${LIMIT}" + ] + }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Prepare Dataset for Training\n", "\n", - "In this step we will use `t2t-datagen` to convert the transformed data above into the `TFRecord` format. We will run this job on the Kubeflow cluster." + "We will use `t2t-datagen` to convert the transformed data above into the `TFRecord` format.\n", + "\n", + "**TIP**: Use `ks show` to view the Resource Spec submitted." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "\n", + "cd kubeflow\n", + "\n", + "ks show code-search -c t2t-code-search-datagen" ] }, { @@ -334,7 +412,43 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Execute Tensorflow Training" + "Once this job finishes, the data directory should have a vocabulary file and a list of `TFRecords` prefixed by the problem name which in our case is `github_function_docstring_extended`. Here, we list a subset of them." 
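
In addition to the `gsutil` listing that follows, a single record can be decoded to confirm the output is well-formed. An illustrative sketch (not in the patch) using the TF 1.8 file APIs this repo already relies on:

```python
import tensorflow as tf

# Decode the first serialized tf.train.Example from a training shard.
# WORKING_DIR is the configuration variable defined earlier in the notebook.
shard = tf.gfile.Glob('{}/data/*train*'.format(WORKING_DIR))[0]
record = next(tf.python_io.tf_record_iterator(shard))
example = tf.train.Example.FromString(record)
# Text2Text problems serialize 'inputs' and 'targets' feature lists.
print(example.features.feature.keys())
```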
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%bash\n",
+    "\n",
+    "LIMIT=10\n",
+    "\n",
+    "gsutil ls ${WORKING_DIR}/data/vocab*\n",
+    "gsutil ls ${WORKING_DIR}/data/*train* | head -n ${LIMIT}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Execute Tensorflow Training\n",
+    "\n",
+    "Once the `TFRecords` are generated, we will use `t2t-trainer` to execute the training."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%bash\n",
+    "\n",
+    "cd kubeflow\n",
+    "\n",
+    "ks show code-search -c t2t-code-search-trainer"
   ]
  },
@@ -354,7 +468,40 @@
    "ks apply code-search -c t2t-code-search-trainer"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "## Export Tensorflow Model"
+    "This will generate TensorFlow model checkpoints, which are illustrated below."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%bash\n",
+    "\n",
+    "gsutil ls ${WORKING_DIR}/output/*ckpt*"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Export Tensorflow Model\n",
+    "\n",
+    "We now use `t2t-exporter` to export the `TFModel`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%bash\n",
+    "\n",
+    "cd kubeflow\n",
+    "\n",
+    "ks show code-search -c t2t-code-search-exporter"
   ]
  },
@@ -370,6 +517,24 @@
    "ks apply code-search -c t2t-code-search-exporter"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Once completed, this will generate a TensorFlow `SavedModel`, which we will further use for both online inference (via `TF Serving`) and offline inference (via `Kubeflow Batch Prediction`)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%bash\n",
+    "\n",
+    "gsutil ls ${WORKING_DIR}/output/export/Servo"
+   ]
+  },
  {
   "cell_type": "markdown",
   "metadata": {},
@@ -398,7 +563,7 @@
   "source": [
    "### Configuration\n",
    "\n",
-    "First, select a Exported Model version from the `${WORKING_DIR}/output/export/Servo`. This should be name of a folder with UNIX Seconds Timestamp like `1533685294`. Below, we automatically do that by selecting the folder which represents the latest timestamp."
+    "First, select an Exported Model version from the `${WORKING_DIR}/output/export/Servo` as seen above. This should be the name of a folder with a UNIX seconds timestamp like `1533685294`. Below, we automatically do that by selecting the folder which represents the latest timestamp."
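
One way that selection can be written (a sketch; the notebook's own cell may differ in details):

```python
import tensorflow as tf

# Folders under export/Servo are named by UNIX-seconds timestamps, so the
# numerically largest entry is the most recent export.
export_dir = '{}/output/export/Servo'.format(WORKING_DIR)
versions = sorted(int(v.strip('/')) for v in tf.gfile.ListDirectory(export_dir))
MODEL_VERSION = str(versions[-1])
print(MODEL_VERSION)
```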
   ]
  },
  {
@@ -442,14 +607,17 @@
    "\n",
    "cd src\n",
    "\n",
+    "JOB_NAME=\"compute-function-embeddings-$(date +'%Y%m%d-%H%M%S')\"\n",
+    "PROBLEM=github_function_docstring_extended\n",
+    "\n",
     "python2 -m code_search.dataflow.cli.create_function_embeddings \\\n",
-    "    --runner DataflowRunner\n",
+    "    --runner DataflowRunner \\\n",
     "    --project \"${PROJECT}\" \\\n",
     "    --target_dataset \"${TARGET_DATASET}\" \\\n",
-    "    --problem github_function_docstring \\\n",
+    "    --problem \"${PROBLEM}\" \\\n",
     "    --data_dir \"${WORKING_DIR}/data\" \\\n",
     "    --saved_model_dir \"${WORKING_DIR}/output/export/Servo/${MODEL_VERSION}\" \\\n",
-    "    --job_name compute-function-embeddings\n",
+    "    --job_name \"${JOB_NAME}\" \\\n",
     "    --temp_location \"${WORKING_DIR}/dataflow/temp\" \\\n",
     "    --staging_location \"${WORKING_DIR}/dataflow/staging\" \\\n",
     "    --worker_machine_type \"${WORKER_MACHINE_TYPE}\" \\\n",
@@ -480,13 +648,46 @@
    "gbq.read_gbq(query, dialect='standard', project_id=PROJECT)"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The pipeline also generates a set of CSV files which will be used to generate the search index."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%bash\n",
+    "\n",
+    "LIMIT=10\n",
+    "\n",
+    "gsutil ls ${WORKING_DIR}/data/*index*.csv | head -n ${LIMIT}"
+   ]
+  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create Search Index\n",
    "\n",
-    "We now create the Search Index from the computed embeddings so that during a query we can do a k-Nearest Neighbor search to give out semantically similar results."
+    "We now create the Search Index from the computed embeddings. This facilitates a k-Nearest Neighbor search for semantically similar results."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%bash\n",
+    "\n",
+    "cd kubeflow\n",
+    "\n",
+    "ks show code-search -c search-index-creator"
   ]
  },
@@ -506,7 +707,18 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "Using the CSV files generated from the previous step, this creates an index using [NMSLib](https://github.com/nmslib/nmslib). A unified CSV file containing all the code examples for a human-readable reverse lookup during the query, is also created in the `WORKING_DIR`."
+    "Using the CSV files generated from the previous step, this creates an index using [NMSLib](https://github.com/nmslib/nmslib). A unified CSV file containing all the code examples for a human-readable reverse lookup during the query is also created."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%bash\n",
+    "\n",
+    "gsutil ls ${WORKING_DIR}/code_search_index*"
   ]
  },
@@ -518,6 +730,19 @@
    "We've seen offline inference during the computation of embeddings. For online inference, we deploy the exported Tensorflow model above using [Tensorflow Serving](https://www.tensorflow.org/serving/)."
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%bash\n",
+    "\n",
+    "cd kubeflow\n",
+    "\n",
+    "ks show code-search -c t2t-code-search-serving"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,
@@ -540,6 +765,19 @@
    "We finally deploy the Search UI which allows the user to input arbitrary strings and see a list of results corresponding to semantically similar Python functions. This internally uses the inference server we just deployed."
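
Because the serving component now exposes the TF Serving REST API on port 9001 (see `servingUrl` in `params.libsonnet`), it can be smoke-tested from any Pod in the cluster. A hedged sketch: the exact `instances` payload depends on the exported model's serving signature, so the body below is only a placeholder:

```python
import requests

# In-cluster smoke test of the TF Serving REST endpoint (placeholder payload).
url = 'http://t2t-code-search.kubeflow:9001/v1/models/t2t-code-search:predict'
payload = {'instances': [{'input': {'b64': '...'}}]}  # '...' intentionally left unfilled
response = requests.post(url, json=payload)
print(response.status_code, response.text[:200])
```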
] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "\n", + "cd kubeflow\n", + "\n", + "ks show code-search -c search-index-server" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/code_search/docker/t2t/Dockerfile b/code_search/docker/t2t/Dockerfile index ca6d1acc..141e5700 100644 --- a/code_search/docker/t2t/Dockerfile +++ b/code_search/docker/t2t/Dockerfile @@ -2,7 +2,7 @@ ARG BASE_IMAGE_TAG=1.8.0 FROM tensorflow/tensorflow:$BASE_IMAGE_TAG -RUN pip --no-cache-dir install tensor2tensor~=1.6.0 oauth2client~=4.1.0 &&\ +RUN pip --no-cache-dir install tensor2tensor~=1.7.0 oauth2client~=4.1.0 &&\ apt-get update && apt-get install -y jq &&\ rm -rf /var/lib/apt/lists/* @@ -15,6 +15,4 @@ ENV PYTHONIOENCODING=utf-8 T2T_USR_DIR=/app/code_search/t2t VOLUME ["/data", "/output"] -EXPOSE 8008 - ENTRYPOINT ["bash"] diff --git a/code_search/docker/ui/Dockerfile b/code_search/docker/ui/Dockerfile index ca8820f5..1adb85ab 100644 --- a/code_search/docker/ui/Dockerfile +++ b/code_search/docker/ui/Dockerfile @@ -9,7 +9,7 @@ RUN curl -sL https://deb.nodesource.com/setup_10.x | bash - &&\ numpy~=1.14.0 \ oauth2client~=4.1.0 \ requests~=2.18.0 \ - tensor2tensor~=1.6.0 &&\ + tensor2tensor~=1.7.0 &&\ rm -rf /var/lib/apt/lists/* ADD src/ /src diff --git a/code_search/kubeflow/app.yaml b/code_search/kubeflow/app.yaml index 66e69805..9bc420bd 100644 --- a/code_search/kubeflow/app.yaml +++ b/code_search/kubeflow/app.yaml @@ -1,17 +1,17 @@ apiVersion: 0.2.0 environments: - kf-cs: + default: destination: namespace: kubeflow - server: https://35.232.164.190 - k8sVersion: v1.9.6 - path: kf-cs + server: https://35.237.202.148 + k8sVersion: v1.9.7 + path: default kind: ksonnet.io/app libraries: tf-serving: name: tf-serving registry: kubeflow - version: e95f94a1a97a0974ada734895d590b5ba565fa77 + version: ab6084349673e6405ae486eb3be2141e3550643c name: kubeflow registries: incubator: @@ -19,5 +19,5 @@ registries: uri: github.com/ksonnet/parts/tree/master/incubator kubeflow: protocol: github - uri: https://github.com/kubeflow/kubeflow/tree/v0.2.2/kubeflow + uri: https://github.com/kubeflow/kubeflow/tree/master/kubeflow version: 0.0.1 diff --git a/code_search/kubeflow/components/params.libsonnet b/code_search/kubeflow/components/params.libsonnet index 41e39b34..da88e46d 100644 --- a/code_search/kubeflow/components/params.libsonnet +++ b/code_search/kubeflow/components/params.libsonnet @@ -9,8 +9,8 @@ numPsGpu: 0, train_steps: 100, eval_steps: 10, - image: 'gcr.io/kubeflow-dev/code-search:v20180814-66d27b9', - imageGpu: 'gcr.io/kubeflow-dev/code-search:v20180814-66d27b9-gpu', + image: 'gcr.io/kubeflow-dev/code-search:v20180817-732333a', + imageGpu: 'gcr.io/kubeflow-dev/code-search:v20180817-732333a-gpu', imagePullSecrets: [], dataDir: 'null', outputDir: 'null', @@ -18,34 +18,35 @@ hparams_set: 'null', }, "t2t-code-search": { + name: 't2t-code-search', workingDir: 'gs://example/prefix', - problem: 'github_function_docstring_extended', - model: 'similarity_transformer', + problem: 'cs_github_function_docstring', + model: 'cs_similarity_transformer', hparams_set: 'transformer_tiny', }, "t2t-code-search-datagen": { jobType: 'datagen', name: 't2t-code-search-datagen', - problem: $.components["t2t-code-search"].problem, + problem: $.components['t2t-code-search'].problem, dataDir: $.components['t2t-code-search'].workingDir + '/data', }, "t2t-code-search-trainer": { jobType: 'trainer', name: 't2t-code-search-trainer', - problem: 
$.components["t2t-code-search"].problem, + problem: $.components['t2t-code-search'].problem, dataDir: $.components['t2t-code-search'].workingDir + '/data', outputDir: $.components['t2t-code-search'].workingDir + '/output', - model: $.components["t2t-code-search"].model, - hparams_set: $.components["t2t-code-search"].hparams_set, + model: $.components['t2t-code-search'].model, + hparams_set: $.components['t2t-code-search']['hparams_set'], }, "t2t-code-search-exporter": { jobType: 'exporter', name: 't2t-code-search-exporter', - problem: $.components["t2t-code-search"].problem, + problem: $.components['t2t-code-search'].problem, dataDir: $.components['t2t-code-search'].workingDir + '/data', outputDir: $.components['t2t-code-search'].workingDir + '/output', - model: $.components["t2t-code-search"].model, - hparams_set: $.components["t2t-code-search"].hparams_set, + model: $.components['t2t-code-search'].model, + hparams_set: $.components['t2t-code-search']['hparams_set'], }, "t2t-code-search-serving": { name: 't2t-code-search', @@ -57,7 +58,7 @@ }, nmslib: { replicas: 1, - image: 'gcr.io/kubeflow-dev/code-search-ui:v20180806-7b0fcaa', + image: 'gcr.io/kubeflow-dev/code-search-ui:v20180817-0d4a60d', problem: 'null', dataDir: 'null', lookupFile: 'null', @@ -72,11 +73,11 @@ }, "search-index-server": { name: 'search-index-server', - problem: $.components["t2t-code-search"].problem, + problem: $.components['t2t-code-search'].problem, dataDir: $.components['t2t-code-search'].workingDir + '/data', lookupFile: $.components['t2t-code-search'].workingDir + '/code_search_index.csv', indexFile: $.components['t2t-code-search'].workingDir + '/code_search_index.nmslib', - servingUrl: 'http://t2t-code-search.kubeflow:8000/v1/models/t2t-code-search:predict', + servingUrl: 'http://t2t-code-search.kubeflow:9001/v1/models/t2t-code-search:predict', }, }, } diff --git a/code_search/kubeflow/vendor/kubeflow/tf-serving@e95f94a1a97a0974ada734895d590b5ba565fa77/README.md b/code_search/kubeflow/vendor/kubeflow/tf-serving@ab6084349673e6405ae486eb3be2141e3550643c/README.md similarity index 100% rename from code_search/kubeflow/vendor/kubeflow/tf-serving@e95f94a1a97a0974ada734895d590b5ba565fa77/README.md rename to code_search/kubeflow/vendor/kubeflow/tf-serving@ab6084349673e6405ae486eb3be2141e3550643c/README.md diff --git a/code_search/kubeflow/vendor/kubeflow/tf-serving@e95f94a1a97a0974ada734895d590b5ba565fa77/parts.yaml b/code_search/kubeflow/vendor/kubeflow/tf-serving@ab6084349673e6405ae486eb3be2141e3550643c/parts.yaml similarity index 100% rename from code_search/kubeflow/vendor/kubeflow/tf-serving@e95f94a1a97a0974ada734895d590b5ba565fa77/parts.yaml rename to code_search/kubeflow/vendor/kubeflow/tf-serving@ab6084349673e6405ae486eb3be2141e3550643c/parts.yaml diff --git a/code_search/kubeflow/vendor/kubeflow/tf-serving@e95f94a1a97a0974ada734895d590b5ba565fa77/prototypes/tf-serving-all-features.jsonnet b/code_search/kubeflow/vendor/kubeflow/tf-serving@ab6084349673e6405ae486eb3be2141e3550643c/prototypes/tf-serving-all-features.jsonnet similarity index 100% rename from code_search/kubeflow/vendor/kubeflow/tf-serving@e95f94a1a97a0974ada734895d590b5ba565fa77/prototypes/tf-serving-all-features.jsonnet rename to code_search/kubeflow/vendor/kubeflow/tf-serving@ab6084349673e6405ae486eb3be2141e3550643c/prototypes/tf-serving-all-features.jsonnet diff --git a/code_search/kubeflow/vendor/kubeflow/tf-serving@ab6084349673e6405ae486eb3be2141e3550643c/prototypes/tf-serving-with-request-log.jsonnet 
b/code_search/kubeflow/vendor/kubeflow/tf-serving@ab6084349673e6405ae486eb3be2141e3550643c/prototypes/tf-serving-with-request-log.jsonnet
new file mode 100644
index 00000000..44ba455a
--- /dev/null
+++ b/code_search/kubeflow/vendor/kubeflow/tf-serving@ab6084349673e6405ae486eb3be2141e3550643c/prototypes/tf-serving-with-request-log.jsonnet
@@ -0,0 +1,230 @@
+// @apiVersion 0.1
+// @name io.ksonnet.pkg.tf-serving-request-log
+// @description tf-serving with request logging
+// @shortDescription tf-serving with request logging
+// @param name string Name to give to each of the components
+// @param gcpProject string The gcp project for Bigquery dataset
+// @param dataset string The Bigquery dataset
+// @param table string The Bigquery table
+// @optionalParam modelBasePath string gs://kubeflow-examples-data/mnist The model path
+// @optionalParam modelName string mnist The model name
+
+local k = import "k.libsonnet";
+
+local namespace = "kubeflow";
+local appName = import "param://name";
+local image = "gcr.io/kubeflow-images-public/tf-model-server-cpu:v20180327-995786ec";
+local httpProxyImage = "gcr.io/kubeflow-images-public/tf-model-server-http-proxy:v20180723";
+local loggingImage = "gcr.io/kubeflow-images-public/tf-model-server-request-logger:v20180723";
+
+local gcpSecretName = "user-gcp-sa";
+
+local service = {
+  apiVersion: "v1",
+  kind: "Service",
+  metadata: {
+    labels: {
+      app: appName,
+    },
+    name: appName,
+    namespace: namespace,
+  },
+  spec: {
+    ports: [
+      {
+        name: "grpc-tf-serving",
+        port: 9000,
+        targetPort: 9000,
+      },
+      {
+        name: "http-tf-serving-proxy",
+        port: 8000,
+        targetPort: 8000,
+      },
+    ],
+    selector: {
+      app: appName,
+    },
+    type: "ClusterIP",
+  },
+};
+
+local configMap = {
+  apiVersion: "v1",
+  kind: "ConfigMap",
+  metadata: {
+    name: appName + "fluentd-config",
+    namespace: namespace,
+  },
+  data: {
+    "fluent.conf": std.format(|||
+      <source>
+        @type tail
+        path /tmp/logs/request.log
+        pos_file /tmp/logs/request.log.pos
+        <parse>
+          @type json
+        </parse>
+        tag dummy
+      </source>
+      <match dummy>
+        @type bigquery_insert
+        auth_method application_default
+        project %s
+        dataset %s
+        table %s
+        fetch_schema true
+      </match>
+    |||, [params.gcpProject, params.dataset, params.table]),
+  },
+};
+
+local deployment = {
+  apiVersion: "extensions/v1beta1",
+  kind: "Deployment",
+  metadata: {
+    labels: {
+      app: appName,
+    },
+    name: appName,
+    namespace: namespace,
+  },
+  spec: {
+    template: {
+      metadata: {
+        labels: {
+          app: appName,
+        },
+      },
+      spec: {
+        containers: [
+          // ModelServer
+          {
+            args: [
+              "/usr/bin/tensorflow_model_server",
+              "--port=9000",
+              "--model_name=" + params.modelName,
+              "--model_base_path=" + params.modelBasePath,
+            ],
+            image: image,
+            imagePullPolicy: "IfNotPresent",
+            name: "model-server",
+            ports: [
+              {
+                containerPort: 9000,
+              },
+            ],
+            resources: {
+              limits: {
+                cpu: "4",
+                memory: "4Gi",
+              },
+              requests: {
+                cpu: "1",
+                memory: "1Gi",
+              },
+            },
+          },
+          // Http proxy
+          {
+            name: "http-proxy",
+            image: httpProxyImage,
+            imagePullPolicy: "Always",
+            command: [
+              "python",
+              "/usr/src/app/server.py",
+              "--port=8000",
+              "--rpc_port=9000",
+              "--rpc_timeout=10.0",
+              "--log_request=true",
+            ],
+            env: [],
+            ports: [
+              {
+                containerPort: 8000,
+              },
+            ],
+            resources: {
+              requests: {
+                memory: "1Gi",
+                cpu: "1",
+              },
+              limits: {
+                memory: "4Gi",
+                cpu: "4",
+              },
+            },
+            securityContext: {
+              runAsUser: 1000,
+              fsGroup: 1000,
+            },
+            volumeMounts: [
+              {
+                name: "request-logs",
+                mountPath: "/tmp/logs",
+              },
+            ],
+          },
+          // TODO(lunkai): use admission controller to inject.
+          // Logging container.
+ { + name: "logging", + image: loggingImage, + imagePullPolicy: "Always", + env: [ + { name: "GOOGLE_APPLICATION_CREDENTIALS", value: "/secret/gcp-credentials/key.json" }, + ], + resources: { + requests: { + memory: "250Mi", + cpu: "0.25", + }, + limits: { + memory: "500Mi", + cpu: "0.5", + }, + }, + volumeMounts: [ + { + name: "request-logs", + mountPath: "/tmp/logs", + }, + { + name: "gcp-credentials", + mountPath: "/secret/gcp-credentials", + }, + { + name: "fluentd-config-volume", + mountPath: "/fluentd/etc/custom", + }, + ], + }, + ], + volumes: [ + { + name: "gcp-credentials", + secret: { + secretName: gcpSecretName, + }, + }, + { + name: "request-logs", + emptyDir: {}, + }, + { + configMap: { + name: "fluentd-config", + }, + name: "fluentd-config-volume", + }, + ], + }, + }, + }, +}; + +k.core.v1.list.new([ + service, + deployment, + configMap, +]) diff --git a/code_search/kubeflow/vendor/kubeflow/tf-serving@e95f94a1a97a0974ada734895d590b5ba565fa77/tf-serving.libsonnet b/code_search/kubeflow/vendor/kubeflow/tf-serving@ab6084349673e6405ae486eb3be2141e3550643c/tf-serving.libsonnet similarity index 95% rename from code_search/kubeflow/vendor/kubeflow/tf-serving@e95f94a1a97a0974ada734895d590b5ba565fa77/tf-serving.libsonnet rename to code_search/kubeflow/vendor/kubeflow/tf-serving@ab6084349673e6405ae486eb3be2141e3550643c/tf-serving.libsonnet index 6a1561be..0de66d8d 100644 --- a/code_search/kubeflow/vendor/kubeflow/tf-serving@e95f94a1a97a0974ada734895d590b5ba565fa77/tf-serving.libsonnet +++ b/code_search/kubeflow/vendor/kubeflow/tf-serving@ab6084349673e6405ae486eb3be2141e3550643c/tf-serving.libsonnet @@ -18,12 +18,7 @@ deployIstio: false, deployHttpProxy: false, - defaultHttpProxyImage: "gcr.io/kubeflow-images-public/tf-model-server-http-proxy:v20180606-9dfda4f2", - httpProxyImage: "", - httpProxyImageToUse: if $.params.httpProxyImage == "" then - $.params.defaultHttpProxyImage - else - $.params.httpProxyImage, + httpProxyImage: "gcr.io/kubeflow-images-public/tf-model-server-http-proxy:v20180606-9dfda4f2", serviceType: "ClusterIP", @@ -57,10 +52,10 @@ // Name of the k8s secrets containing S3 credentials s3SecretName: "", // Name of the key in the k8s secret containing AWS_ACCESS_KEY_ID. - s3SecretAccesskeyidKeyName: "", + s3SecretAccesskeyidKeyName: "AWS_ACCESS_KEY_ID", // Name of the key in the k8s secret containing AWS_SECRET_ACCESS_KEY. - s3SecretSecretaccesskeyKeyName: "", + s3SecretSecretaccesskeyKeyName: "AWS_SECRET_ACCESS_KEY", // S3 region s3AwsRegion: "us-west-1", @@ -122,7 +117,7 @@ args: [ "/usr/bin/tensorflow_model_server", "--port=9000", - "--rest_api_port=8000", + "--rest_api_port=9001", "--model_name=" + $.params.modelName, "--model_base_path=" + $.params.modelPath, ], @@ -130,6 +125,9 @@ { containerPort: 9000, }, + { + containerPort: 9001, + }, ], // TODO(jlewi): We should add readiness and liveness probes. I think the blocker is that // model-server doesn't have something we can use out of the box. 
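
An aside on the request-log prototype vendored above: the fluentd sidecar tails `/tmp/logs/request.log` into BigQuery, so once prediction traffic flows the sink can be inspected with the same `pandas-gbq` helper the notebook uses. The dataset and table below are hypothetical stand-ins for whatever was passed as the prototype's `dataset` and `table` parameters:

```python
from pandas.io import gbq

# Peek at a few logged serving requests (hypothetical dataset/table names).
query = """
SELECT * FROM `serving_logs.request_log` LIMIT 5
"""
gbq.read_gbq(query, dialect='standard', project_id=PROJECT)
```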
@@ -176,7 +174,7 @@ httpProxyContainer:: { name: $.params.name + "-http-proxy", - image: $.params.httpProxyImageToUse, + image: $.params.httpProxyImage, imagePullPolicy: "IfNotPresent", command: [ "python", @@ -193,12 +191,12 @@ ], resources: { requests: { - memory: "1Gi", - cpu: "1", + memory: "500Mi", + cpu: "0.5", }, limits: { - memory: "4Gi", - cpu: "4", + memory: "1Gi", + cpu: "1", }, }, securityContext: { @@ -274,6 +272,11 @@ port: 9000, targetPort: 9000, }, + { + name: "rest-tf-serving", + port: 9001, + targetPort: 9001, + }, { name: "http-tf-serving-proxy", port: 8000, diff --git a/code_search/kubeflow/vendor/kubeflow/tf-serving@e95f94a1a97a0974ada734895d590b5ba565fa77/util.libsonnet b/code_search/kubeflow/vendor/kubeflow/tf-serving@ab6084349673e6405ae486eb3be2141e3550643c/util.libsonnet similarity index 100% rename from code_search/kubeflow/vendor/kubeflow/tf-serving@e95f94a1a97a0974ada734895d590b5ba565fa77/util.libsonnet rename to code_search/kubeflow/vendor/kubeflow/tf-serving@ab6084349673e6405ae486eb3be2141e3550643c/util.libsonnet diff --git a/code_search/src/code_search/nmslib/cli/create_search_index.py b/code_search/src/code_search/nmslib/cli/create_search_index.py index 0df5c03b..ddb26520 100644 --- a/code_search/src/code_search/nmslib/cli/create_search_index.py +++ b/code_search/src/code_search/nmslib/cli/create_search_index.py @@ -33,7 +33,7 @@ def create_search_index(argv=None): with open(tmp_lookup_file, 'w') as lookup_file: lookup_writer = csv.writer(lookup_file) - for csv_file_path in tf.gfile.Glob('{}/*.csv'.format(args.data_dir)): + for csv_file_path in tf.gfile.Glob('{}/*index*.csv'.format(args.data_dir)): tf.logging.debug('Reading {}'.format(csv_file_path)) with tf.gfile.Open(csv_file_path) as csv_file: diff --git a/code_search/src/code_search/t2t/function_docstring.py b/code_search/src/code_search/t2t/function_docstring.py index a9c3a175..0fcc3ef4 100644 --- a/code_search/src/code_search/t2t/function_docstring.py +++ b/code_search/src/code_search/t2t/function_docstring.py @@ -4,11 +4,9 @@ from six import StringIO from tensor2tensor.data_generators import generator_utils from tensor2tensor.data_generators import text_problems from tensor2tensor.utils import metrics -from tensor2tensor.utils import registry import tensorflow as tf -@registry.register_problem class GithubFunctionDocstring(text_problems.Text2TextProblem): """Function and Docstring similarity Problem. @@ -67,7 +65,7 @@ class GithubFunctionDocstring(text_problems.Text2TextProblem): @property def max_samples_for_vocab(self): # FIXME(sanyamkapoor): This exists to handle memory explosion. - return int(3.5e5) + return int(2e5) def get_csv_files(self, _data_dir, tmp_dir, _dataset_split): return [ diff --git a/code_search/src/code_search/t2t/function_docstring_extended.py b/code_search/src/code_search/t2t/function_docstring_extended.py index 460647ca..8cf20119 100644 --- a/code_search/src/code_search/t2t/function_docstring_extended.py +++ b/code_search/src/code_search/t2t/function_docstring_extended.py @@ -2,7 +2,7 @@ from tensor2tensor.utils import registry from .function_docstring import GithubFunctionDocstring -@registry.register_problem +@registry.register_problem('cs_github_function_docstring') class GithubFunctionDocstringExtended(GithubFunctionDocstring): """Function Docstring problem with extended semantics. 
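
Note that the problem and model above are re-registered under explicit names (`cs_github_function_docstring`, `cs_similarity_transformer`) because tensor2tensor 1.7 ships conflicting upstream types, per the commit message. Lookup then happens by registry key. A minimal sketch, assuming the `code_search` package is importable (the Docker image arranges this via `T2T_USR_DIR` for the t2t CLIs):

```python
from tensor2tensor.utils import registry

# Importing the modules executes their @registry.register_* decorators.
import code_search.t2t.function_docstring_extended  # noqa: F401
import code_search.t2t.similarity_transformer  # noqa: F401

problem = registry.problem('cs_github_function_docstring')
model_cls = registry.model('cs_similarity_transformer')
print(type(problem).__name__, model_cls.__name__)
```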
diff --git a/code_search/src/code_search/t2t/similarity_transformer.py b/code_search/src/code_search/t2t/similarity_transformer.py index 9a22be31..72e430ea 100644 --- a/code_search/src/code_search/t2t/similarity_transformer.py +++ b/code_search/src/code_search/t2t/similarity_transformer.py @@ -7,7 +7,7 @@ from tensor2tensor.utils import t2t_model import tensorflow as tf -@registry.register_model +@registry.register_model('cs_similarity_transformer') class SimilarityTransformer(t2t_model.T2TModel): """Transformer Model for Similarity between two strings. diff --git a/code_search/src/requirements.txt b/code_search/src/requirements.txt index 6b8fc458..91271d0d 100644 --- a/code_search/src/requirements.txt +++ b/code_search/src/requirements.txt @@ -7,5 +7,5 @@ numpy~=1.14.0 oauth2client~=4.1.0 requests~=2.18.0 spacy~=2.0.0 -tensor2tensor~=1.6.0 +tensor2tensor~=1.7.0 tensorflow~=1.8.0
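
With the Docker images and `requirements.txt` all moving to `tensor2tensor~=1.7.0`, a stale install is the most likely source of registry conflicts, so a quick environment check is worthwhile. An illustrative snippet, not part of the patch:

```python
import pkg_resources

# Confirm the pinned tensor2tensor release is the one actually installed.
version = pkg_resources.get_distribution('tensor2tensor').version
assert version.startswith('1.7'), 'expected 1.7.x, found {}'.format(version)
print('tensor2tensor {}'.format(version))
```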