{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Chicago Crime Prediction Pipeline\n",
    "\n",
    "An example notebook that demonstrates how to:\n",
    "* Download data from BigQuery\n",
    "* Create a Kubeflow pipeline\n",
    "* Include Google Cloud AI Platform components to train and deploy the model in the pipeline\n",
    "* Submit a job for execution\n",
    "\n",
    "The model forecasts how many crimes are expected to be reported the next day, based on how many were reported over the previous `n` days (an illustrative sketch of this windowing follows the download step below)."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%capture\n",
    "\n",
    "# Install the SDK and its dependencies (skip this cell if they are already installed).\n",
    "# This notebook uses the v1 SDK (kfp.dsl, kfp.gcp), so kfp is pinned below 2.0.\n",
    "!pip3 install --upgrade pip -q\n",
    "!pip3 install 'kfp<2.0' --upgrade -q\n",
    "!pip3 install pandas --upgrade -q"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import time\n",
    "\n",
    "import kfp\n",
    "import kfp.compiler as compiler\n",
    "import kfp.components as comp\n",
    "import kfp.dsl as dsl\n",
    "import kfp.gcp as gcp\n",
    "\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Pipeline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Constants"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Required parameters\n",
    "PROJECT_ID = '<ADD GCP PROJECT HERE>'\n",
    "GCS_WORKING_DIR = 'gs://<ADD STORAGE LOCATION HERE>'  # No trailing slash\n",
    "\n",
    "# Optional parameters\n",
    "REGION = 'us-central1'\n",
    "RUNTIME_VERSION = '1.13'\n",
    "PACKAGE_URIS = json.dumps(['gs://chicago-crime/chicago_crime_trainer-0.0.tar.gz'])\n",
    "TRAINER_OUTPUT_GCS_PATH = GCS_WORKING_DIR + '/train/output/' + str(int(time.time())) + '/'\n",
    "DATA_GCS_PATH = GCS_WORKING_DIR + '/reports.csv'\n",
    "PYTHON_MODULE = 'trainer.task'\n",
    "TRAINER_ARGS = json.dumps([\n",
    "    '--data-file-url', DATA_GCS_PATH,\n",
    "    '--job-dir', GCS_WORKING_DIR\n",
    "])\n",
    "EXPERIMENT_NAME = 'Chicago Crime Prediction'\n",
    "PIPELINE_NAME = 'Chicago Crime Prediction'\n",
    "PIPELINE_FILENAME_PREFIX = 'chicago'\n",
    "PIPELINE_DESCRIPTION = ''"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Download data\n",
    "\n",
    "Define a download function that uses the BigQuery component to run the query and export the results to Cloud Storage."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "bigquery_query_op = comp.load_component_from_url(\n",
    "    'https://raw.githubusercontent.com/kubeflow/pipelines/e598176c02f45371336ccaa819409e8ec83743df/components/gcp/bigquery/query/component.yaml')\n",
    "\n",
    "# Daily counts of reported crimes\n",
    "QUERY = \"\"\"\n",
    "    SELECT count(*) as count, TIMESTAMP_TRUNC(date, DAY) as day\n",
    "    FROM `bigquery-public-data.chicago_crime.crime`\n",
    "    GROUP BY day\n",
    "    ORDER BY day\n",
    "\"\"\"\n",
    "\n",
    "def download(project_id, data_gcs_path):\n",
    "    # Run the query in BigQuery and export the result to Cloud Storage\n",
    "    return bigquery_query_op(\n",
    "        query=QUERY,\n",
    "        project_id=project_id,\n",
    "        output_gcs_path=data_gcs_path\n",
    "    ).apply(\n",
    "        gcp.use_gcp_secret('user-gcp-sa')\n",
    "    )"
   ]
  },
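  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The actual feature engineering happens inside the `chicago_crime_trainer` package referenced in the constants above, so it is not shown in this notebook. The next cell is only an illustrative sketch of the windowing idea described in the introduction: it builds supervised examples from a toy frame with the same `count`/`day` columns that the query above produces, using an arbitrary window of `n = 7` days."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Illustration only -- not part of the pipeline. Shows how daily counts can be\n",
    "# framed as a forecasting problem: the previous n counts are the features and\n",
    "# the next day's count is the label. The window size n = 7 is a hypothetical choice.\n",
    "n = 7\n",
    "\n",
    "toy = pd.DataFrame({\n",
    "    'day': pd.date_range('2019-01-01', periods=10, freq='D'),\n",
    "    'count': [70, 65, 80, 75, 72, 68, 90, 85, 77, 73],\n",
    "})\n",
    "\n",
    "# lag_i holds the count reported i days earlier; rows without a full window are dropped.\n",
    "lags = pd.concat({'lag_%d' % i: toy['count'].shift(i) for i in range(1, n + 1)}, axis=1)\n",
    "examples = lags.assign(label=toy['count']).dropna()\n",
    "examples.head()"
   ]
  },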
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Train the model\n",
    "\n",
    "Define a train function that submits a training job to AI Platform. The trainer code pre-processes the data and trains the model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "mlengine_train_op = comp.load_component_from_url(\n",
    "    'https://raw.githubusercontent.com/kubeflow/pipelines/e598176c02f45371336ccaa819409e8ec83743df/components/gcp/ml_engine/train/component.yaml')\n",
    "\n",
    "def train(project_id,\n",
    "          trainer_args,\n",
    "          package_uris,\n",
    "          trainer_output_gcs_path,\n",
    "          gcs_working_dir,\n",
    "          region,\n",
    "          python_module,\n",
    "          runtime_version):\n",
    "    return mlengine_train_op(\n",
    "        project_id=project_id,\n",
    "        python_module=python_module,\n",
    "        package_uris=package_uris,\n",
    "        region=region,\n",
    "        args=trainer_args,\n",
    "        job_dir=trainer_output_gcs_path,\n",
    "        runtime_version=runtime_version\n",
    "    ).apply(gcp.use_gcp_secret('user-gcp-sa'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Deploy model\n",
    "\n",
    "Deploy the trained model, using the job ID from the training step as the model ID."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "mlengine_deploy_op = comp.load_component_from_url(\n",
    "    'https://raw.githubusercontent.com/kubeflow/pipelines/e598176c02f45371336ccaa819409e8ec83743df/components/gcp/ml_engine/deploy/component.yaml')\n",
    "\n",
    "def deploy(\n",
    "        project_id,\n",
    "        model_uri,\n",
    "        model_id,\n",
    "        runtime_version):\n",
    "    return mlengine_deploy_op(\n",
    "        model_uri=model_uri,\n",
    "        project_id=project_id,\n",
    "        model_id=model_id,\n",
    "        runtime_version=runtime_version,\n",
    "        replace_existing_version=True,\n",
    "        set_default=True).apply(gcp.use_gcp_secret('user-gcp-sa'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Define pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "@dsl.pipeline(\n",
    "    name=PIPELINE_NAME,\n",
    "    description=PIPELINE_DESCRIPTION\n",
    ")\n",
    "def pipeline(\n",
    "        data_gcs_path=DATA_GCS_PATH,\n",
    "        gcs_working_dir=GCS_WORKING_DIR,\n",
    "        project_id=PROJECT_ID,\n",
    "        python_module=PYTHON_MODULE,\n",
    "        region=REGION,\n",
    "        runtime_version=RUNTIME_VERSION,\n",
    "        package_uris=PACKAGE_URIS,\n",
    "        trainer_output_gcs_path=TRAINER_OUTPUT_GCS_PATH,\n",
    "        trainer_args=TRAINER_ARGS,\n",
    "):\n",
    "    download_task = download(project_id,\n",
    "                             data_gcs_path)\n",
    "\n",
    "    train_task = train(project_id,\n",
    "                       trainer_args,\n",
    "                       package_uris,\n",
    "                       trainer_output_gcs_path,\n",
    "                       gcs_working_dir,\n",
    "                       region,\n",
    "                       python_module,\n",
    "                       runtime_version).after(download_task)\n",
    "\n",
    "    deploy_task = deploy(project_id,\n",
    "                         train_task.outputs['job_dir'],\n",
    "                         train_task.outputs['job_id'],\n",
    "                         runtime_version)\n",
    "    return True\n",
    "\n",
    "# Reference for invocation later\n",
    "pipeline_func = pipeline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Compile pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pipeline_filename = PIPELINE_FILENAME_PREFIX + '.pipeline.zip'\n",
    "\n",
    "compiler.Compiler().compile(pipeline_func, pipeline_filename)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Submit the pipeline for execution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Specify pipeline argument values\n",
    "arguments = {}\n",
    "\n",
    "# Get the experiment, or create it if it does not exist yet\n",
    "client = kfp.Client()\n",
    "try:\n",
    "    experiment = client.get_experiment(experiment_name=EXPERIMENT_NAME)\n",
    "except Exception:\n",
    "    experiment = client.create_experiment(EXPERIMENT_NAME)\n",
    "\n",
    "# Submit a pipeline run\n",
    "run_name = pipeline_func.__name__ + ' run'\n",
    "run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)"
   ]
  },
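  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The run can also be monitored from the Kubeflow Pipelines UI. The cell below is an optional sketch that waits for the run to finish from within the notebook and then previews the exported daily counts; it assumes the v1 SDK's `wait_for_run_completion` method and that `gcsfs` is installed so pandas can read the `gs://` path directly."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional: block until the run finishes, then preview the exported data.\n",
    "# Assumes the v1 SDK and that gcsfs is installed so pandas can read gs:// paths.\n",
    "client.wait_for_run_completion(run_result.id, timeout=60 * 60)\n",
    "\n",
    "reports = pd.read_csv(DATA_GCS_PATH)\n",
    "reports.tail()"
   ]
  }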
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  },
  "pycharm": {
   "stem_cell": {
    "cell_type": "raw",
    "source": [],
    "metadata": {
     "collapsed": false
    }
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}