{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Chicago Crime Prediction Pipeline\n",
    "\n",
    "An example notebook that demonstrates how to:\n",
    "* Download data from BigQuery\n",
    "* Create a Kubeflow pipeline\n",
    "* Include Google Cloud AI Platform components to train and deploy the model in the pipeline\n",
    "* Submit a job for execution\n",
    "\n",
    "The model forecasts how many crimes are expected to be reported the next day, based on how many were reported over the previous `n` days."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%capture\n",
    "\n",
    "# Install/upgrade the dependencies into the kernel's own environment (`%pip`, not `!pip3`).\n",
    "# This notebook uses the KFP v1 SDK API (e.g. `kfp.gcp`), so the SDK is capped below 2.0.\n",
    "%pip install --upgrade pip -q\n",
    "%pip install --upgrade \"kfp<2\" -q\n",
    "%pip install --upgrade pandas -q"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import time\n",
    "\n",
    "import kfp\n",
    "import kfp.compiler as compiler\n",
    "import kfp.components as comp\n",
    "import kfp.dsl as dsl\n",
    "import kfp.gcp as gcp\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Pipeline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Constants"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Required Parameters\n",
    "PROJECT_ID = ''  # Passed as `project_id` to the BigQuery, training and deployment steps\n",
    "GCS_WORKING_DIR = 'gs://' # No ending slash\n",
    "\n",
    "# Optional Parameters\n",
    "REGION = 'us-central1'\n",
    "RUNTIME_VERSION = '1.13'  # Used for both the training and the deployment step\n",
    "PACKAGE_URIS = json.dumps(['gs://chicago-crime/chicago_crime_trainer-0.0.tar.gz'])\n",
    "# Timestamped at cell execution, so re-running this cell yields a new output directory\n",
    "TRAINER_OUTPUT_GCS_PATH = GCS_WORKING_DIR + '/train/output/' + str(int(time.time())) + '/'\n",
    "DATA_GCS_PATH = GCS_WORKING_DIR + '/reports.csv'\n",
    "PYTHON_MODULE = 'trainer.task'\n",
    "# Component arguments are passed as JSON-encoded lists\n",
    "TRAINER_ARGS = json.dumps([\n",
    "    '--data-file-url', DATA_GCS_PATH,\n",
    "    '--job-dir', GCS_WORKING_DIR\n",
    "])\n",
    "EXPERIMENT_NAME = 'Chicago Crime Prediction'\n",
    "PIPELINE_NAME = 'Chicago Crime Prediction'\n",
    "PIPELINE_FILENAME_PREFIX = 'chicago'\n",
    "PIPELINE_DESCRIPTION = ''"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Download data\n",
    "\n",
    "Define a download function that uses the BigQuery component"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "bigquery_query_op = comp.load_component_from_url(\n",
    "    'https://raw.githubusercontent.com/kubeflow/pipelines/e598176c02f45371336ccaa819409e8ec83743df/components/gcp/bigquery/query/component.yaml')\n",
    "\n",
    "# Daily report counts, ordered by day\n",
    "QUERY = \"\"\"\n",
    " SELECT count(*) as count, TIMESTAMP_TRUNC(date, DAY) as day\n",
    " FROM `bigquery-public-data.chicago_crime.crime`\n",
    " GROUP BY day\n",
    " ORDER BY day\n",
    "\"\"\"\n",
    "\n",
    "def download(project_id, data_gcs_path):\n",
    "    \"\"\"Create the pipeline step that runs `QUERY` through the BigQuery component.\n",
    "\n",
    "    Args:\n",
    "        project_id: Forwarded to the component as `project_id`.\n",
    "        data_gcs_path: Forwarded to the component as `output_gcs_path`.\n",
    "\n",
    "    Returns:\n",
    "        The task created by `bigquery_query_op`, with the `user-gcp-sa`\n",
    "        GCP secret applied.\n",
    "    \"\"\"\n",
    "    query_task = bigquery_query_op(\n",
    "        query=QUERY,\n",
    "        project_id=project_id,\n",
    "        output_gcs_path=data_gcs_path\n",
    "    )\n",
    "    return query_task.apply(gcp.use_gcp_secret('user-gcp-sa'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Train the model\n",
    "\n",
    "Run training code that will pre-process the data and then submit a training job to the AI Platform."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "mlengine_train_op = comp.load_component_from_url(\n",
    "    'https://raw.githubusercontent.com/kubeflow/pipelines/e598176c02f45371336ccaa819409e8ec83743df/components/gcp/ml_engine/train/component.yaml')\n",
    "\n",
    "def train(project_id,\n",
    "          trainer_args,\n",
    "          package_uris,\n",
    "          trainer_output_gcs_path,\n",
    "          gcs_working_dir,\n",
    "          region,\n",
    "          python_module,\n",
    "          runtime_version):\n",
    "    \"\"\"Create the pipeline step that runs the ML Engine training component.\n",
    "\n",
    "    Args:\n",
    "        project_id: Forwarded to the component as `project_id`.\n",
    "        trainer_args: Forwarded to the component as `args`.\n",
    "        package_uris: Forwarded to the component as `package_uris`.\n",
    "        trainer_output_gcs_path: Forwarded to the component as `job_dir`.\n",
    "        gcs_working_dir: Not used by this function; kept so existing\n",
    "            positional callers continue to work.\n",
    "        region: Forwarded to the component as `region`.\n",
    "        python_module: Forwarded to the component as `python_module`.\n",
    "        runtime_version: Forwarded to the component as `runtime_version`.\n",
    "\n",
    "    Returns:\n",
    "        The task created by `mlengine_train_op`, with the `user-gcp-sa`\n",
    "        GCP secret applied.\n",
    "    \"\"\"\n",
    "    training_task = mlengine_train_op(\n",
    "        project_id=project_id,\n",
    "        python_module=python_module,\n",
    "        package_uris=package_uris,\n",
    "        region=region,\n",
    "        args=trainer_args,\n",
    "        job_dir=trainer_output_gcs_path,\n",
    "        runtime_version=runtime_version\n",
    "    )\n",
    "    return training_task.apply(gcp.use_gcp_secret('user-gcp-sa'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Deploy model\n",
    "\n",
    "Deploy the model with the ID given from the training step"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "mlengine_deploy_op = comp.load_component_from_url(\n",
    "    'https://raw.githubusercontent.com/kubeflow/pipelines/e598176c02f45371336ccaa819409e8ec83743df/components/gcp/ml_engine/deploy/component.yaml')\n",
    "\n",
    "def deploy(\n",
    "        project_id,\n",
    "        model_uri,\n",
    "        model_id,\n",
    "        runtime_version):\n",
    "    \"\"\"Create the pipeline step that runs the ML Engine deployment component.\n",
    "\n",
    "    The step is always created with `replace_existing_version=True` and\n",
    "    `set_default=True`.\n",
    "\n",
    "    Args:\n",
    "        project_id: Forwarded to the component as `project_id`.\n",
    "        model_uri: Forwarded to the component as `model_uri`.\n",
    "        model_id: Forwarded to the component as `model_id`.\n",
    "        runtime_version: Forwarded to the component as `runtime_version`.\n",
    "\n",
    "    Returns:\n",
    "        The task created by `mlengine_deploy_op`, with the `user-gcp-sa`\n",
    "        GCP secret applied.\n",
    "    \"\"\"\n",
    "    deployment_task = mlengine_deploy_op(\n",
    "        model_uri=model_uri,\n",
    "        project_id=project_id,\n",
    "        model_id=model_id,\n",
    "        runtime_version=runtime_version,\n",
    "        replace_existing_version=True,\n",
    "        set_default=True\n",
    "    )\n",
    "    return deployment_task.apply(gcp.use_gcp_secret('user-gcp-sa'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Define pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pipeline definition: download the data, train on it, then deploy the trained model.\n",
    "# Every parameter defaults to the matching constant from the \"Constants\" cell.\n",
    "@dsl.pipeline(\n",
    "    name=PIPELINE_NAME,\n",
    "    description=PIPELINE_DESCRIPTION\n",
    ")\n",
    "def pipeline(\n",
    "    data_gcs_path=DATA_GCS_PATH,\n",
    "    gcs_working_dir=GCS_WORKING_DIR,\n",
    "    project_id=PROJECT_ID,\n",
    "    python_module=PYTHON_MODULE,\n",
    "    region=REGION,\n",
    "    runtime_version=RUNTIME_VERSION,\n",
    "    package_uris=PACKAGE_URIS,\n",
    "    trainer_output_gcs_path=TRAINER_OUTPUT_GCS_PATH,\n",
    "    trainer_args=TRAINER_ARGS,\n",
    "):\n",
    "    # Step 1: run the BigQuery query, writing its result to `data_gcs_path`.\n",
    "    download_task = download(project_id,\n",
    "                             data_gcs_path)\n",
    "\n",
    "    # Step 2: train. No output of the download step is passed in here, so\n",
    "    # `.after` is what orders training behind the download.\n",
    "    train_task = train(project_id,\n",
    "                       trainer_args,\n",
    "                       package_uris,\n",
    "                       trainer_output_gcs_path,\n",
    "                       gcs_working_dir,\n",
    "                       region,\n",
    "                       python_module,\n",
    "                       runtime_version).after(download_task)\n",
    "\n",
    "    # Step 3: deploy, using the training step's `job_dir` output as the model URI\n",
    "    # and its `job_id` output as the model ID.\n",
    "    deploy_task = deploy(project_id,\n",
    "                         train_task.outputs['job_dir'],\n",
    "                         train_task.outputs['job_id'],\n",
    "                         runtime_version)\n",
    "    return True\n",
    "\n",
    "# Reference for invocation later\n",
    "pipeline_func = pipeline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Compile pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Compile the pipeline function into a package file next to the notebook\n",
    "pipeline_filename = PIPELINE_FILENAME_PREFIX + '.pipeline.zip'\n",
    "\n",
    "compiler.Compiler().compile(pipeline_func, pipeline_filename)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Submit the pipeline for execution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Specify pipeline argument values (empty: the pipeline's default values are used)\n",
    "arguments = {}\n",
    "\n",
    "# Get or create an experiment and submit a pipeline run\n",
    "client = kfp.Client()\n",
    "try:\n",
    "    experiment = client.get_experiment(experiment_name=EXPERIMENT_NAME)\n",
    "except Exception:\n",
    "    # Presumably the lookup failed because no experiment has this name yet, so create it.\n",
    "    # Catching `Exception` instead of using a bare `except` lets KeyboardInterrupt\n",
    "    # and SystemExit propagate.\n",
    "    experiment = client.create_experiment(EXPERIMENT_NAME)\n",
    "\n",
    "# Submit a pipeline run\n",
    "run_name = pipeline_func.__name__ + ' run'\n",
    "run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  },
  "pycharm": {
   "stem_cell": {
    "cell_type": "raw",
    "source": [],
    "metadata": {
     "collapsed": false
    }
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}