{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### TFX Components\n",
    "\n",
    "This notebook shows how to create a pipeline that uses TFX components:\n",
    "\n",
    "* CsvExampleGen\n",
    "* StatisticsGen\n",
    "* SchemaGen\n",
    "* ExampleValidator\n",
    "* Transform\n",
    "* Trainer\n",
    "* Evaluator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "import kfp\n",
    "from kfp.components import load_component_from_url"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Put your KFP cluster endpoint URL here if working from GCP notebooks (or local notebooks). ('https://xxxxx.notebooks.googleusercontent.com/')\n",
    "kfp_endpoint = 'https://XXXXX.notebooks.googleusercontent.com/'\n",
    "\n",
    "# Replace with your GCS bucket, project ID and GCP region.\n",
    "# Give root_output_uri WITHOUT a trailing slash: the code below appends '/...' itself.\n",
    "root_output_uri = ''\n",
    "project_id = ''\n",
    "gcp_region = ''\n",
    "\n",
    "# Apache Beam options handed to every component of the pipeline.\n",
    "beam_pipeline_args = [\n",
    "    '--runner=DataflowRunner',\n",
    "    '--experiments=shuffle_mode=auto',\n",
    "    '--project=' + project_id,\n",
    "    '--temp_location=' + root_output_uri + '/tmp',\n",
    "    '--region=' + gcp_region,\n",
    "    '--disk_size_gb=50',\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "input_data_uri = 'gs://ml-pipeline-playground/tensorflow-tfx-repo/tfx/components/testdata/external/csv'\n",
    "\n",
    "# Only S3/GCS is supported for now.\n",
    "module_file = 'gs://ml-pipeline-playground/tensorflow-tfx-repo/v0.21.4/tfx/examples/chicago_taxi_pipeline/taxi_utils.py'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Load the TFX components"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Every component spec is loaded from the same pinned commit of kubeflow/pipelines;\n",
    "# change the commit here to move all components together.\n",
    "COMPONENTS_BASE_URL = 'https://raw.githubusercontent.com/kubeflow/pipelines/0cc4bbd4/components/tfx/'\n",
    "\n",
    "CsvExampleGen_op = load_component_from_url(COMPONENTS_BASE_URL + 'ExampleGen/CsvExampleGen/with_URI_IO/component.yaml')\n",
    "StatisticsGen_op = load_component_from_url(COMPONENTS_BASE_URL + 'StatisticsGen/with_URI_IO/component.yaml')\n",
    "SchemaGen_op = load_component_from_url(COMPONENTS_BASE_URL + 'SchemaGen/with_URI_IO/component.yaml')\n",
    "ExampleValidator_op = load_component_from_url(COMPONENTS_BASE_URL + 'ExampleValidator/with_URI_IO/component.yaml')\n",
    "Transform_op = load_component_from_url(COMPONENTS_BASE_URL + 'Transform/with_URI_IO/component.yaml')\n",
    "Trainer_op = load_component_from_url(COMPONENTS_BASE_URL + 'Trainer/with_URI_IO/component.yaml')\n",
    "Evaluator_op = load_component_from_url(COMPONENTS_BASE_URL + 'Evaluator/with_URI_IO/component.yaml')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Define the pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def tfx_pipeline(\n",
    "    input_data_uri,\n",
    "    root_output_uri,\n",
    "):\n",
    "    \"\"\"Builds a KFP pipeline that chains the TFX components loaded above.\n",
    "\n",
    "    Flow: CsvExampleGen -> StatisticsGen -> SchemaGen -> ExampleValidator,\n",
    "    then Transform -> Trainer -> Evaluator.\n",
    "\n",
    "    Args:\n",
    "        input_data_uri: URI of the input data passed to CsvExampleGen;\n",
    "            files matching '*.csv' are used.\n",
    "        root_output_uri: Root URI, without a trailing slash, under which\n",
    "            the components write their outputs.\n",
    "\n",
    "    Note: `module_file` and `beam_pipeline_args` are not parameters; they are\n",
    "    read from the notebook's global scope when the pipeline is compiled.\n",
    "    \"\"\"\n",
    "    # root_output_uri carries no trailing slash (see '--temp_location' in the\n",
    "    # configuration cell), so the separator must be added explicitly. Without\n",
    "    # it the execution ID is glued onto the last path segment of the root URI\n",
    "    # (or onto the bucket name when the root is a bare bucket).\n",
    "    generated_output_uri = root_output_uri + ('/' + kfp.dsl.EXECUTION_ID_PLACEHOLDER)\n",
    "\n",
    "    examples_task = CsvExampleGen_op(\n",
    "        input_uri=input_data_uri,\n",
    "        input_config=json.dumps({\n",
    "            'splits': [\n",
    "                {'name': 'data', 'pattern': '*.csv'},\n",
    "            ]\n",
    "        }),\n",
    "        # Two hash buckets for 'train' and one for 'eval'.\n",
    "        output_config=json.dumps({\n",
    "            'splitConfig': {\n",
    "                'splits': [\n",
    "                    {'name': 'train', 'hash_buckets': 2},\n",
    "                    {'name': 'eval', 'hash_buckets': 1},\n",
    "                ]\n",
    "            }\n",
    "        }),\n",
    "        beam_pipeline_args=beam_pipeline_args,\n",
    "\n",
    "        output_examples_uri=generated_output_uri,\n",
    "    )\n",
    "\n",
    "    statistics_task = StatisticsGen_op(\n",
    "        examples_uri=examples_task.outputs['examples_uri'],\n",
    "        beam_pipeline_args=beam_pipeline_args,\n",
    "\n",
    "        output_statistics_uri=generated_output_uri,\n",
    "    )\n",
    "\n",
    "    schema_task = SchemaGen_op(\n",
    "        statistics_uri=statistics_task.outputs['statistics_uri'],\n",
    "        beam_pipeline_args=beam_pipeline_args,\n",
    "\n",
    "        output_schema_uri=generated_output_uri,\n",
    "    )\n",
    "\n",
    "    # Performs anomaly detection based on statistics and data schema.\n",
    "    validator_task = ExampleValidator_op(\n",
    "        statistics_uri=statistics_task.outputs['statistics_uri'],\n",
    "        schema_uri=schema_task.outputs['schema_uri'],\n",
    "        beam_pipeline_args=beam_pipeline_args,\n",
    "\n",
    "        output_anomalies_uri=generated_output_uri,\n",
    "    )\n",
    "\n",
    "    # Performs transformations and feature engineering in training and serving.\n",
    "    # This task has two outputs, so each one gets its own sub-path.\n",
    "    transform_task = Transform_op(\n",
    "        examples_uri=examples_task.outputs['examples_uri'],\n",
    "        schema_uri=schema_task.outputs['schema_uri'],\n",
    "        module_file=module_file,\n",
    "        beam_pipeline_args=beam_pipeline_args,\n",
    "\n",
    "        output_transform_graph_uri=generated_output_uri + '/transform_graph',\n",
    "        output_transformed_examples_uri=generated_output_uri + '/transformed_examples',\n",
    "    )\n",
    "\n",
    "    trainer_task = Trainer_op(\n",
    "        module_file=module_file,\n",
    "        examples_uri=transform_task.outputs['transformed_examples_uri'],\n",
    "        schema_uri=schema_task.outputs['schema_uri'],\n",
    "        transform_graph_uri=transform_task.outputs['transform_graph_uri'],\n",
    "        train_args=json.dumps({'num_steps': 10000}),\n",
    "        eval_args=json.dumps({'num_steps': 5000}),\n",
    "        beam_pipeline_args=beam_pipeline_args,\n",
    "\n",
    "        output_model_uri=generated_output_uri,\n",
    "    )\n",
    "\n",
    "    # Uses TFMA to compute evaluation statistics over features of a model.\n",
    "    evaluator_task = Evaluator_op(\n",
    "        examples_uri=examples_task.outputs['examples_uri'],\n",
    "        model_uri=trainer_task.outputs['model_uri'],\n",
    "        feature_slicing_spec=json.dumps({\n",
    "            'specs': [\n",
    "                {'column_for_slicing': ['trip_start_hour']},\n",
    "            ],\n",
    "        }),\n",
    "        beam_pipeline_args=beam_pipeline_args,\n",
    "\n",
    "        output_evaluation_uri=generated_output_uri + '/evaluation',\n",
    "        output_blessing_uri=generated_output_uri + '/blessing',\n",
    "    )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Submit the pipeline run"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "kfp.Client(host=kfp_endpoint).create_run_from_pipeline_func(\n",
    "    tfx_pipeline,\n",
    "    arguments=dict(\n",
    "        input_data_uri=input_data_uri,\n",
    "        root_output_uri=root_output_uri,\n",
    "    ),\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}