{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# TFX on KubeFlow Pipelines Example\n",
"\n",
"This notebook should be run inside a KF Pipelines cluster.\n",
"\n",
"### Install TFX and KFP packages"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!pip3 install tfx==0.13.0 --upgrade\n",
"!pip3 install kfp --upgrade"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Enable DataFlow API for your GKE cluster\n",
"<https://console.developers.google.com/apis/api/dataflow.googleapis.com/overview>\n"
]
},
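{
"cell_type": "markdown",
"metadata": {},
"source": [
"The cell below is a minimal sketch of enabling the API from the notebook instead; it assumes the `gcloud` CLI is installed and authenticated for your project."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Enable the Dataflow API via gcloud (assumes gcloud is installed and\n",
"# authenticated; otherwise use the console link above).\n",
"!gcloud services enable dataflow.googleapis.com"
]
},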
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Get the TFX repo with sample pipeline\n"
]
},
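{
"cell_type": "markdown",
"metadata": {},
"source": [
"The cell below is one way to fetch the repo; it assumes `git` is available in the notebook environment and clones the default branch (you may prefer a branch or tag matching your TFX version)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Clone the TFX repo, which contains the Chicago Taxi sample pipeline\n",
"# referenced by the %load command below.\n",
"!git clone https://github.com/tensorflow/tfx"
]
},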
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"# Directory and data locations (uses Google Cloud Storage).\n",
"import os\n",
"_input_bucket = '<your gcs bucket>'\n",
"_output_bucket = '<your gcs bucket>'\n",
"_pipeline_root = os.path.join(_output_bucket, 'tfx')\n",
"\n",
"# Google Cloud Platform project id to use when deploying this pipeline.\n",
"_project_id = '<your project id>'"
]
},
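{
"cell_type": "markdown",
"metadata": {},
"source": [
"An optional guard cell: fail fast if the placeholders above were not replaced."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Fail fast if the placeholder values above were left unchanged.\n",
"for _v in (_input_bucket, _output_bucket, _project_id):\n",
"    assert not _v.startswith('<'), 'Replace the placeholder values above.'"
]
},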
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# copy the trainer code to a storage bucket as the TFX pipeline will need that code file in GCS\n",
"from tensorflow import gfile\n",
"gfile.Copy('utils/taxi_utils.py', _input_bucket + '/taxi_utils.py')"
]
},
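{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optionally, verify that the copy succeeded before building the pipeline; this sketch assumes the TF 1.x `gfile.Exists` API."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sanity check: the module file should now exist in GCS.\n",
"assert gfile.Exists(_input_bucket + '/taxi_utils.py')"
]
},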
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Configure the TFX pipeline example\n",
"\n",
"Reload this cell by running the load command to get the pipeline configuration file\n",
"```\n",
"%load tfx/examples/chicago_taxi_pipeline/taxi_pipeline_kubeflow.py\n",
"```\n",
"\n",
"Configure:\n",
"- Set `_input_bucket` to the GCS directory where you've copied taxi_utils.py. I.e. gs://<my bucket>/<path>/\n",
"- Set `_output_bucket` to the GCS directory where you've want the results to be written\n",
"- Set GCP project ID (replace my-gcp-project). Note that it should be project ID, not project name.\n",
"\n",
"The dataset in BigQuery has 100M rows, you can change the query parameters in WHERE clause to limit the number of rows used.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\"\"\"Chicago Taxi example using TFX DSL on Kubeflow.\"\"\"\n",
"\n",
"from __future__ import absolute_import\n",
"from __future__ import division\n",
"from __future__ import print_function\n",
"\n",
"import os\n",
"from tfx.components.evaluator.component import Evaluator\n",
"from tfx.components.example_gen.big_query_example_gen.component import BigQueryExampleGen\n",
"from tfx.components.example_validator.component import ExampleValidator\n",
"from tfx.components.model_validator.component import ModelValidator\n",
"from tfx.components.pusher.component import Pusher\n",
"from tfx.components.schema_gen.component import SchemaGen\n",
"from tfx.components.statistics_gen.component import StatisticsGen\n",
"from tfx.components.trainer.component import Trainer\n",
"from tfx.components.transform.component import Transform\n",
"from tfx.orchestration.kubeflow.runner import KubeflowRunner\n",
"from tfx.orchestration.pipeline import PipelineDecorator\n",
"from tfx.proto import evaluator_pb2\n",
"from tfx.proto import pusher_pb2\n",
"from tfx.proto import trainer_pb2\n",
"\n",
"# Python module file to inject customized logic into the TFX components. The\n",
"# Transform and Trainer both require user-defined functions to run successfully.\n",
"# Copy this from the current directory to a GCS bucket and update the location\n",
"# below.\n",
"_taxi_utils = os.path.join(_input_bucket, 'taxi_utils.py')\n",
"\n",
"# Path which can be listened to by the model server. Pusher will output the\n",
"# trained model here.\n",
"_serving_model_dir = os.path.join(_output_bucket, 'serving_model/taxi_bigquery')\n",
"\n",
"# Region to use for Dataflow jobs and CMLE training.\n",
"# Dataflow: https://cloud.google.com/dataflow/docs/concepts/regional-endpoints\n",
"# CMLE: https://cloud.google.com/ml-engine/docs/tensorflow/regions\n",
"_gcp_region = 'us-central1'\n",
"\n",
"# A dict which contains the training job parameters to be passed to Google\n",
"# Cloud ML Engine. For the full set of parameters supported by Google Cloud ML\n",
"# Engine, refer to\n",
"# https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#Job\n",
"_cmle_training_args = {\n",
" 'pythonModule': None, # Will be populated by TFX\n",
" 'args': None, # Will be populated by TFX\n",
" 'region': _gcp_region,\n",
" 'jobDir': os.path.join(_output_bucket, 'tmp'),\n",
" 'runtimeVersion': '1.12',\n",
" 'pythonVersion': '2.7',\n",
" 'project': _project_id,\n",
"}\n",
"\n",
"# A dict which contains the serving job parameters to be passed to Google\n",
"# Cloud ML Engine. For the full set of parameters supported by Google Cloud ML\n",
"# Engine, refer to\n",
"# https://cloud.google.com/ml-engine/reference/rest/v1/projects.models\n",
"_cmle_serving_args = {\n",
" 'model_name': 'chicago_taxi',\n",
" 'project_id': _project_id,\n",
" 'runtime_version': '1.12',\n",
"}\n",
"\n",
"# The rate at which to sample rows from the Chicago Taxi dataset using BigQuery.\n",
"# The full taxi dataset is > 120M record. In the interest of resource\n",
"# savings and time, we've set the default for this example to be much smaller.\n",
"# Feel free to crank it up and process the full dataset!\n",
"_query_sample_rate = 0.001 # Generate a 0.1% random sample.\n",
"\n",
"\n",
"# TODO(zhitaoli): Remove PipelineDecorator after 0.13.0.\n",
"@PipelineDecorator(\n",
" pipeline_name='chicago_taxi_pipeline_kubeflow',\n",
" log_root='/var/tmp/tfx/logs',\n",
" pipeline_root=_pipeline_root,\n",
" additional_pipeline_args={\n",
" 'beam_pipeline_args': [\n",
" '--runner=DataflowRunner',\n",
" '--experiments=shuffle_mode=auto',\n",
" '--project=' + _project_id,\n",
" '--temp_location=' + os.path.join(_output_bucket, 'tmp'),\n",
" '--region=' + _gcp_region,\n",
" ],\n",
" # Optional args:\n",
" # 'tfx_image': custom docker image to use for components. This is needed\n",
" # if TFX package is not installed from an RC or released version.\n",
" })\n",
"def _create_pipeline():\n",
" \"\"\"Implements the chicago taxi pipeline with TFX.\"\"\"\n",
"\n",
" query = \"\"\"\n",
" SELECT\n",
" pickup_community_area,\n",
" fare,\n",
" EXTRACT(MONTH FROM trip_start_timestamp) AS trip_start_month,\n",
" EXTRACT(HOUR FROM trip_start_timestamp) AS trip_start_hour,\n",
" EXTRACT(DAYOFWEEK FROM trip_start_timestamp) AS trip_start_day,\n",
" UNIX_SECONDS(trip_start_timestamp) AS trip_start_timestamp,\n",
" pickup_latitude,\n",
" pickup_longitude,\n",
" dropoff_latitude,\n",
" dropoff_longitude,\n",
" trip_miles,\n",
" pickup_census_tract,\n",
" dropoff_census_tract,\n",
" payment_type,\n",
" company,\n",
" trip_seconds,\n",
" dropoff_community_area,\n",
" tips\n",
" FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`\n",
" WHERE RAND() < {}\"\"\".format(_query_sample_rate)\n",
"\n",
" # Brings data into the pipeline or otherwise joins/converts training data.\n",
" example_gen = BigQueryExampleGen(query=query)\n",
"\n",
" # Computes statistics over data for visualization and example validation.\n",
" statistics_gen = StatisticsGen(input_data=example_gen.outputs.examples)\n",
"\n",
" # Generates schema based on statistics files.\n",
" infer_schema = SchemaGen(stats=statistics_gen.outputs.output)\n",
"\n",
" # Performs anomaly detection based on statistics and data schema.\n",
" validate_stats = ExampleValidator(\n",
" stats=statistics_gen.outputs.output, schema=infer_schema.outputs.output)\n",
"\n",
" # Performs transformations and feature engineering in training and serving.\n",
" transform = Transform(\n",
" input_data=example_gen.outputs.examples,\n",
" schema=infer_schema.outputs.output,\n",
" module_file=_taxi_utils)\n",
"\n",
" # Uses user-provided Python function that implements a model using TF-Learn.\n",
" trainer = Trainer(\n",
" module_file=_taxi_utils,\n",
" transformed_examples=transform.outputs.transformed_examples,\n",
" schema=infer_schema.outputs.output,\n",
" transform_output=transform.outputs.transform_output,\n",
" train_args=trainer_pb2.TrainArgs(num_steps=10000),\n",
" eval_args=trainer_pb2.EvalArgs(num_steps=5000),\n",
" custom_config={'cmle_training_args': _cmle_training_args})\n",
"\n",
" # Uses TFMA to compute a evaluation statistics over features of a model.\n",
" model_analyzer = Evaluator(\n",
" examples=example_gen.outputs.examples,\n",
" model_exports=trainer.outputs.output,\n",
" feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[\n",
" evaluator_pb2.SingleSlicingSpec(\n",
" column_for_slicing=['trip_start_hour'])\n",
" ]))\n",
"\n",
" # Performs quality validation of a candidate model (compared to a baseline).\n",
" model_validator = ModelValidator(\n",
" examples=example_gen.outputs.examples, model=trainer.outputs.output)\n",
"\n",
" # Checks whether the model passed the validation steps and pushes the model\n",
" # to a file destination if check passed.\n",
" pusher = Pusher(\n",
" model_export=trainer.outputs.output,\n",
" model_blessing=model_validator.outputs.blessing,\n",
" custom_config={'cmle_serving_args': _cmle_serving_args},\n",
" push_destination=pusher_pb2.PushDestination(\n",
" filesystem=pusher_pb2.PushDestination.Filesystem(\n",
" base_directory=_serving_model_dir)))\n",
"\n",
" return [\n",
" example_gen, statistics_gen, infer_schema, validate_stats, transform,\n",
" trainer, model_analyzer, model_validator, pusher\n",
" ]\n",
"\n",
"\n",
"pipeline = KubeflowRunner().run(_create_pipeline())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Compile the pipeline and submit a run to the Kubeflow cluster"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Get or create a new experiment\n",
"import kfp\n",
"client = kfp.Client()\n",
"experiment_name='TFX Examples'\n",
"try:\n",
" experiment_id = client.get_experiment(experiment_name=experiment_name).id\n",
"except:\n",
" experiment_id = client.create_experiment(experiment_name).id\n",
"\n",
"pipeline_filename = 'chicago_taxi_pipeline_kubeflow.tar.gz'\n",
"\n",
"#Submit a pipeline run\n",
"run_name = 'Run 1'\n",
"run_result = client.run_pipeline(experiment_id, run_name, pipeline_filename, {})\n"
]
},
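{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optionally, block the notebook until the run finishes. `wait_for_run_completion` is part of the `kfp.Client` API; the timeout below (in seconds) is an arbitrary choice."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Wait for the submitted run to complete (timeout in seconds).\n",
"client.wait_for_run_completion(run_result.id, timeout=3600)"
]
},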
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Connect to the ML Metadata Store"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!pip3 install ml_metadata"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from ml_metadata.metadata_store import metadata_store\n",
"from ml_metadata.proto import metadata_store_pb2\n",
"import os\n",
"\n",
"connection_config = metadata_store_pb2.ConnectionConfig()\n",
"connection_config.mysql.host = os.getenv('MYSQL_SERVICE_HOST')\n",
"connection_config.mysql.port = int(os.getenv('MYSQL_SERVICE_PORT'))\n",
"connection_config.mysql.database = 'mlmetadata'\n",
"connection_config.mysql.user = 'root'\n",
"store = metadata_store.MetadataStore(connection_config)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Get all output artifacts\n",
"store.get_artifacts()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Get a specific artifact type\n",
"\n",
"# TFX types \n",
"# types = ['ModelExportPath', 'ExamplesPath', 'ModelBlessingPath', 'ModelPushPath', 'TransformPath', 'SchemaPath']\n",
"\n",
"store.get_artifacts_by_type('ExamplesPath')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.7"
},
"pycharm": {
"stem_cell": {
"cell_type": "raw",
"source": [],
"metadata": {
"collapsed": false
}
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}