202 lines
7.5 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### TFX Components\n",
|
|
"\n",
|
|
"This notebook shows how to create a pipeline that uses TFX components:\n",
|
|
"\n",
|
|
"* CsvExampleGen\n",
|
|
"* StatisticsGen\n",
|
|
"* SchemaGen\n",
|
|
"* ExampleValidator\n",
|
|
"* Transform\n",
|
|
"* Trainer\n",
|
|
"* Evaluator"
|
|
]
|
|
},
|
|
# KFP cluster endpoint URL when working from GCP (or local) notebooks,
# e.g. 'https://xxxxx.notebooks.googleusercontent.com/'.
kfp_endpoint = 'https://XXXXX.notebooks.googleusercontent.com/'

# Fill these in before running: your GCS bucket, project ID and GCP region.
root_output_uri = '<your gcs bucket>'
project_id = '<your project id>'
gcp_region = '<your gcp region>'

# Options handed to every Beam-based component so the data processing
# runs on Dataflow in the configured project/region.
beam_pipeline_args = [
    '--runner=DataflowRunner',
    '--experiments=shuffle_mode=auto',
    '--project={}'.format(project_id),
    '--temp_location={}/tmp'.format(root_output_uri),
    '--region={}'.format(gcp_region),
    '--disk_size_gb=50',
]
|
|
# User module consumed by the Transform and Trainer components below.
# Only S3/GCS locations are supported for now.
module_file = 'gs://ml-pipeline-playground/tensorflow-tfx-repo/v0.21.4/tfx/examples/chicago_taxi_pipeline/taxi_utils.py'

# Sample CSV input data, hosted in a public GCS bucket.
input_data_uri = 'gs://ml-pipeline-playground/tensorflow-tfx-repo/tfx/components/testdata/external/csv'
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import kfp"
|
|
]
|
|
},
|
|
import json
from kfp.components import load_component_from_url

# Reusable TFX component definitions, pinned to a fixed commit of the
# kubeflow/pipelines repository so the pipeline stays reproducible.
CsvExampleGen_op = load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/0cc4bbd4/components/tfx/ExampleGen/CsvExampleGen/with_URI_IO/component.yaml')
StatisticsGen_op = load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/0cc4bbd4/components/tfx/StatisticsGen/with_URI_IO/component.yaml')
SchemaGen_op = load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/0cc4bbd4/components/tfx/SchemaGen/with_URI_IO/component.yaml')
ExampleValidator_op = load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/0cc4bbd4/components/tfx/ExampleValidator/with_URI_IO/component.yaml')
Transform_op = load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/0cc4bbd4/components/tfx/Transform/with_URI_IO/component.yaml')
Trainer_op = load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/0cc4bbd4/components/tfx/Trainer/with_URI_IO/component.yaml')
Evaluator_op = load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/0cc4bbd4/components/tfx/Evaluator/with_URI_IO/component.yaml')


def tfx_pipeline(
    input_data_uri,
    root_output_uri,
):
    """Chicago-taxi TFX pipeline: ingest CSV data, compute statistics,
    infer/validate a schema, transform features, train and evaluate a model.

    Args:
      input_data_uri: URI of the directory holding the input CSV files.
      root_output_uri: GCS prefix under which each run writes its artifacts
        (a per-run sub-directory named after the execution ID is appended).

    Uses `beam_pipeline_args` and `module_file` from the notebook's earlier
    cells.
    """
    # FIX: the original concatenated the execution ID directly onto the
    # bucket URI (yielding e.g. 'gs://bucket<id>'); a '/' separator is
    # required so each run's artifacts land in a sub-directory — matching
    # the "root_output_uri + '/tmp'" convention used for temp_location.
    generated_output_uri = root_output_uri + '/' + kfp.dsl.EXECUTION_ID_PLACEHOLDER

    # Ingests the external CSV files and splits them into train/eval sets
    # (2:1 split by hash buckets).
    examples_task = CsvExampleGen_op(
        input_uri=input_data_uri,
        input_config=json.dumps({
            "splits": [
                {'name': 'data', 'pattern': '*.csv'},
            ]
        }),
        output_config=json.dumps({
            "splitConfig": {
                "splits": [
                    {'name': 'train', 'hash_buckets': 2},
                    {'name': 'eval', 'hash_buckets': 1},
                ]
            }
        }),
        beam_pipeline_args=beam_pipeline_args,

        output_examples_uri=generated_output_uri,
    )

    # Computes statistics over the generated examples.
    statistics_task = StatisticsGen_op(
        examples_uri=examples_task.outputs['examples_uri'],
        beam_pipeline_args=beam_pipeline_args,

        output_statistics_uri=generated_output_uri,
    )

    # Infers a schema from the statistics.
    schema_task = SchemaGen_op(
        statistics_uri=statistics_task.outputs['statistics_uri'],
        beam_pipeline_args=beam_pipeline_args,

        output_schema_uri=generated_output_uri,
    )

    # Performs anomaly detection based on statistics and data schema.
    # (Leaf node: the task is registered with the pipeline as a side effect
    # of instantiation; its result is not consumed downstream.)
    validator_task = ExampleValidator_op(
        statistics_uri=statistics_task.outputs['statistics_uri'],
        schema_uri=schema_task.outputs['schema_uri'],
        beam_pipeline_args=beam_pipeline_args,

        output_anomalies_uri=generated_output_uri,
    )

    # Performs transformations and feature engineering in training and serving.
    transform_task = Transform_op(
        examples_uri=examples_task.outputs['examples_uri'],
        schema_uri=schema_task.outputs['schema_uri'],
        module_file=module_file,
        beam_pipeline_args=beam_pipeline_args,

        output_transform_graph_uri=generated_output_uri + '/transform_graph',
        output_transformed_examples_uri=generated_output_uri + '/transformed_examples',
    )

    # Trains the model defined in `module_file` on the transformed examples.
    trainer_task = Trainer_op(
        module_file=module_file,
        examples_uri=transform_task.outputs['transformed_examples_uri'],
        schema_uri=schema_task.outputs['schema_uri'],
        transform_graph_uri=transform_task.outputs['transform_graph_uri'],
        train_args=json.dumps({'num_steps': 10000}),
        eval_args=json.dumps({'num_steps': 5000}),
        beam_pipeline_args=beam_pipeline_args,

        output_model_uri=generated_output_uri,
    )

    # Uses TFMA to compute evaluation statistics over features of the model,
    # sliced by trip start hour. (Leaf node, like the validator.)
    model_analyzer = Evaluator_op(
        examples_uri=examples_task.outputs['examples_uri'],
        model_uri=trainer_task.outputs['model_uri'],
        feature_slicing_spec=json.dumps({
            'specs': [
                {'column_for_slicing': ['trip_start_hour']},
            ],
        }),
        beam_pipeline_args=beam_pipeline_args,

        output_evaluation_uri=generated_output_uri + '/evaluation',
        output_blessing_uri=generated_output_uri + '/blessing',
    )


# Compile the pipeline function and immediately launch one run on the
# cluster behind `kfp_endpoint`.
kfp.Client(host=kfp_endpoint).create_run_from_pipeline_func(
    tfx_pipeline,
    arguments=dict(
        input_data_uri=input_data_uri,
        root_output_uri=root_output_uri,
    ),
)
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.5.3"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 4
|
|
}
|