# Copyright 2020 kubeflow.org
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# %% [markdown]
# # Data passing tutorial
# Data passing is the most important aspect of Pipelines.
#
# In Kubeflow Pipelines, pipeline authors compose pipelines by creating component instances (tasks)
# and connecting them together.
#
# Components have inputs and outputs. They can consume and produce arbitrary data.
#
# Pipeline authors establish connections between component tasks by connecting their data inputs and outputs
# - by passing the output of one task as an argument to another task's input.
#
# The system takes care of storing the data produced by components and later passing that data
# to other components for consumption as instructed by the pipeline.
#
# This tutorial shows how to create Python components that produce, consume and transform data.
# It shows how to create data passing pipelines by instantiating components and connecting them together.
# %%
from kfp import dsl
from kfp.components import func_to_container_op, InputPath, OutputPath
from kfp_tekton.compiler import TektonCompiler
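# Override the compiler's unique-suffix generator so the generated Tekton resource
# names stay deterministic (assumed rationale: stable golden-file output for tests).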
class Coder:
    def empty(self):
        return ""


TektonCompiler._get_unique_id_code = Coder.empty

# %% [markdown]
# ## Small data
#
# Small data is data that you would be comfortable passing as a program's command-line argument.
# Its size should not exceed a few kilobytes.
#
# Typical examples of small data are a number, a URL, or a small string (e.g. a column name).
#
# Small lists, dictionaries and JSON structures are fine, but keep an eye on the size
# and consider switching to file-based data passing methods that are more suitable for
# big data (more than several kilobytes) or binary data.
#
# All small data outputs will be at some point serialized to strings
# and all small data input values will be at some point deserialized
# from strings (passed as command-line arguments).
# There are built-in serializers and deserializers for several common types
# (e.g. `str`, `int`, `float`, `bool`, `list`, `dict`).
# All other types of data need to be serialized manually before returning the data.
# Make sure to properly specify type annotations; otherwise there will be no automatic deserialization
# and the component function will receive strings instead of deserialized objects.
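#
# As a quick, hedged illustration of these notes (not part of the original pipelines in this file),
# the component below relies on the built-in serializers for its typed inputs and typed return value;
# the `add` name and its parameters are illustrative assumptions.
# %%
@func_to_container_op
def add(a: float, b: float) -> float:
    '''Return the sum of two small, typed arguments; the system passes them as serialized strings.'''
    return a + b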
# %% [markdown]
# ## Big data (files)
#
# Big data should be read from files and written to files.
#
# The paths for the input and output files are chosen by the system and are passed into the function (as strings).
#
# Use the `InputPath` parameter annotation to tell the system that the function wants to
# consume the corresponding input data as a file. The system will download the data,
# write it to a local file and then pass the **path** of that file to the function.
#
# Use the `OutputPath` parameter annotation to tell the system that the function wants to produce
# the corresponding output data as a file. The system will prepare and pass the **path** of a file
# where the function should write the output data. After the function exits,
# the system will upload the data to the storage system so that it can be passed to downstream components.
#
# You can specify the type of the consumed/produced data
# by specifying the type argument to `InputPath` and `OutputPath`.
# The type can be a python type or an arbitrary type name string.
# `OutputPath('TFModel')` means that the function states that the data it has written to a file has type 'TFModel'.
# `InputPath('TFModel')` means that the function states
# that it expects the data it reads from a file to have type 'TFModel'.
# When the pipeline author connects inputs to outputs, the system checks whether the types match.
#
# Note on input/output names: When the function is converted to a component,
# the input and output names generally follow the parameter names,
# but the "\_path" and "\_file" suffixes are stripped from file/path inputs and outputs.
# E.g. the `number_file_path: InputPath(int)` parameter becomes the `number: int` input.
# This makes the argument passing look more natural: `number=42` instead of `number_file_path=42`.
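#
# A minimal sketch of this naming rule (an assumed helper, not used by the pipelines below):
# the `number_file_path: InputPath(int)` parameter surfaces as a component input named `number`.
# %%
@func_to_container_op
def print_number(number_file_path: InputPath(int)):
    '''Read a number from a file and print it.'''
    with open(number_file_path, 'r') as reader:
        print(int(reader.read()))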
# %% [markdown]
#
# ### Writing and reading big data
# %%
# Writing big data
@func_to_container_op
def repeat_line(line: str, output_text_path: OutputPath(str), count: int = 10):
    '''Repeat the line specified number of times'''
    with open(output_text_path, 'w') as writer:
        for i in range(count):
            writer.write(line + '\n')

# Reading big data
@func_to_container_op
def print_text(
        text_path: InputPath()
):  # The "text" input is untyped so that any data can be printed
    '''Print text'''
    with open(text_path, 'r') as reader:
        for line in reader:
            print(line, end='')


def print_repeating_lines_pipeline():
    loop_args = [1, 2]
    repeat_lines_task = repeat_line(line='Hello', count=5000)
    with dsl.ParallelFor(loop_args=loop_args) as item2:
        with dsl.ParallelFor(loop_args=loop_args) as item:
            print_text(repeat_lines_task.output)  # Don't forget .output !

# %% [markdown]
# ### Processing big data
# %%
@func_to_container_op
def split_text_lines(source_path: InputPath(str),
                     odd_lines_path: OutputPath(str),
                     even_lines_path: OutputPath(str)):
    with open(source_path, 'r') as reader:
        with open(odd_lines_path, 'w') as odd_writer:
            with open(even_lines_path, 'w') as even_writer:
                while True:
                    line = reader.readline()
                    if line == "":
                        break
                    odd_writer.write(line)
                    line = reader.readline()
                    if line == "":
                        break
                    even_writer.write(line)


def text_splitting_pipeline():
    text = '\n'.join([
        'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine',
        'ten'
    ])
    split_text_task = split_text_lines(text)
    print_text(split_text_task.outputs['odd_lines'])
    print_text(split_text_task.outputs['even_lines'])

# %% [markdown]
# ### Example: Pipeline that generates then sums many numbers
# %%
# Writing many numbers
@func_to_container_op
def write_numbers(
        numbers_path: OutputPath(str), start: int = 0, count: int = 10):
    with open(numbers_path, 'w') as writer:
        for i in range(start, count):
            writer.write(str(i) + '\n')

# Reading and summing many numbers
@func_to_container_op
def sum_numbers(numbers_path: InputPath(str)) -> int:
    sum = 0
    with open(numbers_path, 'r') as reader:
        for line in reader:
            sum = sum + int(line)
    return sum

# Pipeline to sum 100000 numbers
def sum_pipeline(count: int = 100000):
    numbers_task = write_numbers(count=count)
    print_text(numbers_task.output)
    sum_task = sum_numbers(numbers_task.outputs['numbers'])
    print_text(sum_task.output)

# %% [markdown]
# ### Example: Pipeline with small data as input/output
# %%
# A small data output parameter used as an input parameter of another task.
@func_to_container_op
def gen_params() -> int:
    import random
    num = random.randint(0, 9)
    return num

# print the result
@func_to_container_op
def print_params(numbers_parm: int):
    print("The result number is: %d" % numbers_parm)

# Pipeline to generate and echo the result
def params_pipeline():
    gen_task = gen_params()
    print_params(gen_task.output)

# Combining all pipelines together in a single pipeline
@dsl.pipeline(name='Big data passing')
def file_passing_pipelines():
    print_repeating_lines_pipeline()
    text_splitting_pipeline()
    sum_pipeline()
    params_pipeline()

# Generate the pipeline YAML with kfp-tekton
if __name__ == '__main__':
    TektonCompiler().compile(file_passing_pipelines,
                             __file__.replace('.py', '.yaml'))