{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# Copyright (c) Facebook, Inc. and its affiliates.\n", "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", "# you may not use this file except in compliance with the License.\n", "# You may obtain a copy of the License at\n", "#\n", "# http://www.apache.org/licenses/LICENSE-2.0\n", "#\n", "# Unless required by applicable law or agreed to in writing, software\n", "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", "# See the License for the specific language governing permissions and\n", "# limitations under the License." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Bert Pipeline : PyTorch BERT News Classfication\n", "\n", "This notebook shows PyTorch BERT end-to-end news classification example using Kubeflow Pipelines.\n", "\n", "\n", "An example notebook that demonstrates how to:\n", "\n", "* Get different tasks needed for the pipeline\n", "* Create a Kubeflow pipeline\n", "* Include Pytorch KFP components to preprocess, train, visualize and deploy the model in the pipeline\n", "* Submit a job for execution\n", "* Query(prediction and explain) the final deployed model\n", "* Interpretation of the model using the Captum Insights\n" ] }, { "cell_type": "code", "execution_count": 44, "metadata": { "tags": [] }, "outputs": [], "source": [ "! pip uninstall -y kfp\n", "! pip install --no-cache-dir kfp captum" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'1.8.12'" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import kfp\n", "import json\n", "import os\n", "from kfp.onprem import use_k8s_secret\n", "from kfp import components\n", "from kfp.components import load_component_from_file, load_component_from_url\n", "from kfp import dsl\n", "from kfp import compiler\n", "\n", "kfp.__version__" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Enter your gateway and the cookie\n", "[Use this extension on chrome to get token]( https://chrome.google.com/webstore/detail/editthiscookie/fngmhnnpilhplaeedifhccceomclgfbg?hl=en)\n", "\n", "![image.png](./image.png)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Update values for the ingress gateway and auth session" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "INGRESS_GATEWAY='http://istio-ingressgateway.istio-system.svc.cluster.local'\n", "AUTH=\"\"\n", "NAMESPACE=\"kubeflow-user-example-com\"\n", "COOKIE=\"authservice_session=\"+AUTH\n", "EXPERIMENT=\"Default\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Set Log bucket and Tensorboard Image" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "MINIO_ENDPOINT=\"http://minio-service.kubeflow:9000\"\n", "LOG_BUCKET=\"mlpipeline\"\n", "TENSORBOARD_IMAGE=\"public.ecr.aws/pytorch-samples/tboard:latest\"" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "client = kfp.Client(host=INGRESS_GATEWAY+\"/pipeline\", cookies=COOKIE)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "Experiment details." ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "{'created_at': datetime.datetime(2022, 4, 21, 9, 45, 22, tzinfo=tzlocal()),\n", " 'description': None,\n", " 'id': 'b4bee8c3-381b-42a0-9494-bc81eb9aa359',\n", " 'name': 'Default',\n", " 'resource_references': [{'key': {'id': 'kubeflow-user-example-com',\n", " 'type': 'NAMESPACE'},\n", " 'name': None,\n", " 'relationship': 'OWNER'}],\n", " 'storage_state': 'STORAGESTATE_AVAILABLE'}" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "client.create_experiment(EXPERIMENT)\n", "experiments = client.list_experiments(namespace=NAMESPACE)\n", "my_experiment = experiments.experiments[0]\n", "my_experiment" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Set Inference parameters" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "DEPLOY_NAME=\"bertserve\"\n", "MODEL_NAME=\"bert\"" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Processing prediction_component.yaml\n", "Processing ax_complete_trials_component.yaml\n", "Processing preprocess_component.yaml\n", "Processing train_component.yaml\n", "Processing tensorboard_component.yaml\n", "Processing ax_generate_trials_component.yaml\n", "Processing minio_component.yaml\n", "Processing copy_component.yaml\n", "Processing ax_train_component.yaml\n" ] } ], "source": [ "! python utils/generate_templates.py bert/template_mapping.json" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "prepare_tensorboard_op = load_component_from_file(\"yaml/tensorboard_component.yaml\")\n", "prep_op = components.load_component_from_file(\n", " \"yaml/preprocess_component.yaml\"\n", ")\n", "train_op = components.load_component_from_file(\n", " \"yaml/train_component.yaml\"\n", ")\n", "deploy_op = load_component_from_file(\n", " \"../../../components/kserve/component.yaml\"\n", ")\n", "minio_op = components.load_component_from_file(\n", " \"yaml/minio_component.yaml\"\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Define pipeline" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "@dsl.pipeline(name=\"Training pipeline\", description=\"Sample training job test\")\n", "def pytorch_bert( # pylint: disable=too-many-arguments\n", " minio_endpoint=MINIO_ENDPOINT,\n", " log_bucket=LOG_BUCKET,\n", " log_dir=f\"tensorboard/logs/{dsl.RUN_ID_PLACEHOLDER}\",\n", " mar_path=f\"mar/{dsl.RUN_ID_PLACEHOLDER}/model-store\",\n", " config_prop_path=f\"mar/{dsl.RUN_ID_PLACEHOLDER}/config\",\n", " model_uri=f\"s3://mlpipeline/mar/{dsl.RUN_ID_PLACEHOLDER}\",\n", " tf_image=TENSORBOARD_IMAGE,\n", " deploy=DEPLOY_NAME,\n", " namespace=NAMESPACE,\n", " confusion_matrix_log_dir=f\"confusion_matrix/{dsl.RUN_ID_PLACEHOLDER}/\",\n", " num_samples=1000,\n", " max_epochs=1\n", "):\n", " \"\"\"Thid method defines the pipeline tasks and operations\"\"\"\n", " prepare_tb_task = prepare_tensorboard_op(\n", " log_dir_uri=f\"s3://{log_bucket}/{log_dir}\",\n", " image=tf_image,\n", " pod_template_spec=json.dumps({\n", " \"spec\": {\n", " \"containers\": [{\n", " \"env\": [\n", " {\n", " \"name\": \"AWS_ACCESS_KEY_ID\",\n", " \"valueFrom\": {\n", " \"secretKeyRef\": {\n", " \"name\": \"mlpipeline-minio-artifact\",\n", " \"key\": \"accesskey\",\n", " }\n", " },\n", " },\n", " {\n", " \"name\": \"AWS_SECRET_ACCESS_KEY\",\n", " \"valueFrom\": {\n", " \"secretKeyRef\": {\n", " \"name\": \"mlpipeline-minio-artifact\",\n", " \"key\": \"secretkey\",\n", " }\n", " },\n", " },\n", " {\n", " \"name\": \"AWS_REGION\",\n", " \"value\": \"minio\"\n", " },\n", " {\n", " \"name\": \"S3_ENDPOINT\",\n", " \"value\": f\"{minio_endpoint}\",\n", " },\n", " {\n", " \"name\": \"S3_USE_HTTPS\",\n", " \"value\": \"0\"\n", " },\n", " {\n", " \"name\": \"S3_VERIFY_SSL\",\n", " \"value\": \"0\"\n", " },\n", " ]\n", " }]\n", " }\n", " }),\n", " ).set_display_name(\"Visualization\")\n", "\n", " prep_task = (\n", " prep_op().after(prepare_tb_task\n", " ).set_display_name(\"Preprocess & Transform\")\n", " )\n", " confusion_matrix_url = f\"minio://{log_bucket}/{confusion_matrix_log_dir}\"\n", " script_args = f\"model_name=bert.pth,\" \\\n", " f\"num_samples={num_samples},\" \\\n", " f\"confusion_matrix_url={confusion_matrix_url}\"\n", " # For GPU , set device count and strategy type\n", " ptl_args = f\"max_epochs={max_epochs},accelerator=gpu,profiler=pytorch,devices=0,strategy=None\"\n", " train_task = (\n", " train_op(\n", " input_data=prep_task.outputs[\"output_data\"],\n", " script_args=script_args,\n", " ptl_arguments=ptl_args\n", " ).after(prep_task).set_display_name(\"Training\")\n", " # For allocating resources, uncomment below lines\n", " # .set_memory_request('600M')\n", " # .set_memory_limit('1200M')\n", " # .set_cpu_request('700m')\n", " # .set_cpu_limit('1400m')\n", " # For GPU uncomment below line and set GPU limit and node selector\n", " # .set_gpu_limit(1).add_node_selector_constraint('cloud.google.com/gke-accelerator','nvidia-tesla-p4')\n", " )\n", "\n", " (\n", " minio_op(\n", " bucket_name=\"mlpipeline\",\n", " folder_name=log_dir,\n", " input_path=train_task.outputs[\"tensorboard_root\"],\n", " filename=\"\",\n", " ).after(train_task).set_display_name(\"Tensorboard Events Pusher\")\n", " )\n", " minio_mar_upload = (\n", " minio_op(\n", " bucket_name=\"mlpipeline\",\n", " folder_name=mar_path,\n", " input_path=train_task.outputs[\"checkpoint_dir\"],\n", " filename=\"bert_test.mar\",\n", " ).after(train_task).set_display_name(\"Mar Pusher\")\n", " )\n", " (\n", " minio_op(\n", " bucket_name=\"mlpipeline\",\n", " folder_name=config_prop_path,\n", " input_path=train_task.outputs[\"checkpoint_dir\"],\n", " filename=\"config.properties\",\n", " ).after(train_task).set_display_name(\"Conifg Pusher\")\n", " )\n", "\n", " model_uri = str(model_uri)\n", " # pylint: disable=unused-variable\n", " isvc_yaml = \"\"\"\n", " apiVersion: \"serving.kserve.io/v1beta1\"\n", " kind: \"InferenceService\"\n", " metadata:\n", " name: {}\n", " namespace: {}\n", " spec:\n", " predictor:\n", " serviceAccountName: sa\n", " pytorch:\n", " protocolVersion: v2\n", " storageUri: {}\n", " resources:\n", " requests: \n", " cpu: 4\n", " memory: 8Gi\n", " limits:\n", " cpu: 4\n", " memory: 8Gi\n", " \"\"\".format(deploy, namespace, model_uri)\n", "\n", " # For GPU inference use below yaml with gpu count and accelerator\n", " gpu_count = \"1\"\n", " accelerator = \"nvidia-tesla-p4\"\n", " isvc_gpu_yaml = \"\"\"\n", " apiVersion: \"serving.kserve.io/v1beta1\"\n", " kind: \"InferenceService\"\n", " metadata:\n", " name: {}\n", " namespace: {}\n", " spec:\n", " predictor:\n", " serviceAccountName: sa\n", " pytorch:\n", " protocolVersion: v2\n", " storageUri: {}\n", " resources:\n", " requests: \n", " cpu: 4\n", " memory: 8Gi\n", " limits:\n", " cpu: 4\n", " memory: 8Gi\n", " nvidia.com/gpu: {}\n", " nodeSelector:\n", " cloud.google.com/gke-accelerator: {}\n", "\"\"\".format(deploy, namespace, model_uri, gpu_count, accelerator)\n", " # Update inferenceservice_yaml for GPU inference\n", " deploy_task = (\n", " deploy_op(action=\"apply\", inferenceservice_yaml=isvc_yaml\n", " ).after(minio_mar_upload).set_display_name(\"Deployer\")\n", " )\n", "\n", " dsl.get_pipeline_conf().add_op_transformer(\n", " use_k8s_secret(\n", " secret_name=\"mlpipeline-minio-artifact\",\n", " k8s_secret_key_to_env={\n", " \"secretkey\": \"MINIO_SECRET_KEY\",\n", " \"accesskey\": \"MINIO_ACCESS_KEY\",\n", " },\n", " )\n", " )" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "# Compile pipeline\n", "compiler.Compiler().compile(pytorch_bert, 'pytorch.tar.gz', type_check=True)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/html": [ "Run details." ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Execute pipeline\n", "run = client.run_pipeline(my_experiment.id, 'pytorch-bert', 'pytorch.tar.gz')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Wait for inference service below to go to `READY True` state." ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "NAME URL READY PREV LATEST PREVROLLEDOUTREVISION LATESTREADYREVISION AGE\n", "bertserve http://bertserve.kubeflow-user-example-com.example.com True 100 bertserve-predictor-default-00003 160m\n" ] } ], "source": [ "!kubectl get isvc $DEPLOY" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Get Inferenceservice name" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'bertserve.kubeflow-user-example-com.example.com'" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "INFERENCE_SERVICE_LIST = ! kubectl get isvc {DEPLOY_NAME} -n {NAMESPACE} -o json | python3 -c \"import sys, json; print(json.load(sys.stdin)['status']['url'])\"| tr -d '\"' | cut -d \"/\" -f 3\n", "INFERENCE_SERVICE_NAME = INFERENCE_SERVICE_LIST[0]\n", "INFERENCE_SERVICE_NAME" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Prediction Request" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\n", " \"id\": \"d3b15cad-50a2-4eaf-80ce-8b0a428bd298\",\n", " \"inputs\": [{\n", " \"name\": \"4b7c7d4a-51e4-43c8-af61-04639f6ef4bc\",\n", " \"shape\": -1,\n", " \"datatype\": \"BYTES\",\n", " \"data\": \"Bloomberg has reported on the economy\"\n", " }\n", " ]\n", "}" ] } ], "source": [ "! cat ./bert/sample.txt" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [], "source": [ "!curl -v -H \"Host: $INFERENCE_SERVICE_NAME\" -H \"Cookie: $COOKIE\" \"$INGRESS_GATEWAY/v2/models/$MODEL_NAME/infer\" -d @./bert/sample.txt > bert_prediction_output.json" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\"id\": \"d3b15cad-50a2-4eaf-80ce-8b0a428bd298\", \"model_name\": \"bert_test\", \"model_version\": \"1\", \"outputs\": [{\"name\": \"predict\", \"shape\": [], \"datatype\": \"BYTES\", \"data\": [\"\\\"Business\\\"\"]}]}" ] } ], "source": [ "! cat bert_prediction_output.json" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Explanation Request" ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "scrolled": true }, "outputs": [], "source": [ "!curl -v -H \"Host: $INFERENCE_SERVICE_NAME\" -H \"Cookie: $COOKIE\" \"$INGRESS_GATEWAY/v2/models/$MODEL_NAME/explain\" -d @./bert/sample.txt > bert_explaination_output.json" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\"id\": \"d3b15cad-50a2-4eaf-80ce-8b0a428bd298\", \"model_name\": \"bert_test\", \"model_version\": \"1\", \"outputs\": [{\"name\": \"explain\", \"shape\": [], \"datatype\": \"BYTES\", \"data\": [{\"words\": [\"bloomberg\", \"has\", \"reported\", \"on\", \"the\", \"economy\"], \"importances\": [0.2124089759942075, 0.3070123112652129, -0.3175794877732026, -0.4493290921520886, -0.23262562691072097, 0.7097589881393321], \"delta\": 0.01156902069987975}]}]}" ] } ], "source": [ "! cat bert_explaination_output.json" ] }, { "cell_type": "code", "execution_count": 43, "metadata": { "scrolled": true, "tags": [] }, "outputs": [], "source": [ "explanations_json = json.loads(open(\"./bert_explaination_output.json\", \"r\").read())\n", "explanations_json" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "tags": [] }, "outputs": [], "source": [ "prediction_json = json.loads(open(\"./bert_prediction_output.json\", \"r\").read())" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "tags": [] }, "outputs": [], "source": [ "import torch\n", "attributions = explanations_json[\"outputs\"][0][\"data\"][0]['importances']\n", "tokens = explanations_json[\"outputs\"][0][\"data\"][0]['words']\n", "delta = explanations_json[\"outputs\"][0][\"data\"][0]['delta']\n", "\n", "attributions = torch.tensor(attributions)\n", "pred_prob = 0.75\n", "pred_class = str(prediction_json[\"outputs\"][0][\"data\"][0]).strip('\"\"')\n", "true_class = \"Business\"\n", "attr_class =\"world\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Visualization of Predictions" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "tags": [] }, "outputs": [], "source": [ "from captum.attr import visualization\n", "vis_data_records =[]\n", "vis_data_records.append(visualization.VisualizationDataRecord(\n", " attributions,\n", " pred_prob,\n", " pred_class,\n", " true_class,\n", " attr_class,\n", " attributions.sum(), \n", " tokens,\n", " delta))" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
Legend: Negative Neutral Positive
True LabelPredicted LabelAttribution LabelAttribution ScoreWord Importance
BusinessBusiness (0.75)world0.23 bloomberg has reported on the economy
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "vis = visualization.visualize_text(vis_data_records)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### visualization appreas as below\n", "![viz1.png](./viz1.png)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Cleanup Script" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "scrolled": true }, "outputs": [], "source": [ "! kubectl delete --all isvc -n $NAMESPACE" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "! kubectl delete pod --field-selector=status.phase==Succeeded -n $NAMESPACE" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.10" } }, "nbformat": 4, "nbformat_minor": 4 }