# Copyright 2020 kubeflow.org
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# %% [markdown]
# # Data passing tutorial
# Data passing is the most important aspect of Pipelines.
#
# In Kubeflow Pipelines, pipeline authors compose pipelines by creating component instances (tasks)
# and connecting them together.
#
# Components have inputs and outputs. They can consume and produce arbitrary data.
#
# Pipeline authors establish connections between component tasks by connecting their data inputs and outputs
# - by passing the output of one task as an argument to another task's input.
#
# The system takes care of storing the data produced by components and later passing that data
# to other components for consumption as instructed by the pipeline.
#
# This tutorial shows how to create python components that produce, consume and transform data.
# It shows how to create data passing pipelines by instantiating components and connecting them together.

# %%
from kfp import dsl
from kfp.components import func_to_container_op, InputPath, OutputPath
from kfp_tekton.compiler import TektonCompiler


class Coder:
    def empty(self):
        return ""


TektonCompiler._get_unique_id_code = Coder.empty

# %% [markdown]
# ## Small data
#
# Small data is data that you would be comfortable passing as a program's command-line argument.
# Its size should not exceed a few kilobytes.
#
# Some examples of typical types of small data are: a number, a URL, a small string (e.g. a column name).
#
# Small lists, dictionaries and JSON structures are fine, but keep an eye on the size
# and consider switching to the file-based data passing methods that are more suitable for
# big data (more than several kilobytes) or binary data.
#
# All small data outputs are at some point serialized to strings,
# and all small data input values are at some point deserialized
# from strings (passed as command-line arguments).
# There are built-in serializers and deserializers for several common types
# (e.g. `str`, `int`, `float`, `bool`, `list`, `dict`).
# All other types of data need to be serialized manually before returning the data.
# Make sure to properly specify type annotations; otherwise there will be no automatic deserialization
# and the component function will receive strings instead of deserialized objects.
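# %% [markdown]
# As a small illustration of the above (a sketch, not part of the original sample: the component
# and pipeline names below are made up, and this pipeline is not wired into the combined pipeline
# at the end), typed parameters and a typed return value are enough for the system to serialize
# and deserialize small values automatically:

# %%
@func_to_container_op
def add_numbers(a: int, b: int = 5) -> int:
    '''Add two small integer inputs and return a small integer output.'''
    return a + b


@func_to_container_op
def print_number(number: int):
    '''The int annotation makes the system deserialize the command-line string back into an int.'''
    print(number)


def small_data_example_pipeline():
    add_task = add_numbers(3, 7)
    print_number(add_task.output)  # .output refers to the single small output of add_numbers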
# %% [markdown]
# ## Big data (files)
#
# Big data should be read from files and written to files.
#
# The paths for the input and output files are chosen by the system and are passed into the function (as strings).
#
# Use the `InputPath` parameter annotation to tell the system that the function wants to
# consume the corresponding input data as a file. The system will download the data,
# write it to a local file and then pass the **path** of that file to the function.
#
# Use the `OutputPath` parameter annotation to tell the system that the function wants to produce
# the corresponding output data as a file. The system will prepare and pass the **path** of a file
# where the function should write the output data. After the function exits,
# the system will upload the data to the storage system so that it can be passed to downstream components.
#
# You can specify the type of the consumed/produced data
# by specifying the type argument to `InputPath` and `OutputPath`.
# The type can be a python type or an arbitrary type name string.
# `OutputPath('TFModel')` means that the function states that the data it has written to a file has type 'TFModel'.
# `InputPath('TFModel')` means that the function states
# that it expects the data it reads from a file to have type 'TFModel'.
# When the pipeline author connects inputs to outputs, the system checks whether the types match.
#
# Note on input/output names: when the function is converted to a component,
# the input and output names generally follow the parameter names,
# but the "\_path" and "\_file" suffixes are stripped from file/path inputs and outputs.
# E.g. the `number_file_path: InputPath(int)` parameter becomes the `number: int` input.
# This makes the argument passing look more natural: `number=42` instead of `number_file_path=42`.

# %% [markdown]
# ### Writing and reading big data

# %%
# Writing big data
@func_to_container_op
def repeat_line(line: str, output_text_path: OutputPath(str), count: int = 10):
    '''Repeat the line the specified number of times'''
    with open(output_text_path, 'w') as writer:
        for i in range(count):
            writer.write(line + '\n')


# Reading big data
@func_to_container_op
def print_text(text_path: InputPath()):  # The "text" input is untyped so that any data can be printed
    '''Print text'''
    with open(text_path, 'r') as reader:
        for line in reader:
            print(line, end='')


def print_repeating_lines_pipeline():
    loop_args = [1, 2]
    repeat_lines_task = repeat_line(line='Hello', count=5000)
    # Nested ParallelFor loops fan out the consumer so the same output is printed multiple times
    with dsl.ParallelFor(loop_args=loop_args) as item2:
        with dsl.ParallelFor(loop_args=loop_args) as item:
            print_text(repeat_lines_task.output)  # Don't forget .output !
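# %% [markdown]
# The following cell is a sketch (not part of the original sample, and not wired into the
# combined pipeline at the end) of the arbitrary type name strings mentioned earlier:
# the producer declares its output as 'CSV' and the consumer declares its input as 'CSV',
# so the system can check that the connection is type-consistent. The component names are made up.

# %%
@func_to_container_op
def write_csv(table_path: OutputPath('CSV')):
    '''Write a tiny CSV file; the output is declared to have the type name 'CSV'.'''
    with open(table_path, 'w') as writer:
        writer.write('id,value\n1,a\n2,b\n')


@func_to_container_op
def count_csv_rows(table_path: InputPath('CSV')) -> int:
    '''Consume data declared as 'CSV' and return the number of data rows as a small output.'''
    with open(table_path, 'r') as reader:
        return max(len(reader.readlines()) - 1, 0)


def csv_typed_pipeline():
    csv_task = write_csv()
    count_csv_rows(csv_task.output)  # 'CSV' output connected to 'CSV' input: the types match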
# %% [markdown]
# ### Processing big data

# %%
@func_to_container_op
def split_text_lines(source_path: InputPath(str),
                     odd_lines_path: OutputPath(str),
                     even_lines_path: OutputPath(str)):
    '''Split the source text into two files: odd lines and even lines'''
    with open(source_path, 'r') as reader:
        with open(odd_lines_path, 'w') as odd_writer:
            with open(even_lines_path, 'w') as even_writer:
                while True:
                    line = reader.readline()
                    if line == "":
                        break
                    odd_writer.write(line)
                    line = reader.readline()
                    if line == "":
                        break
                    even_writer.write(line)


def text_splitting_pipeline():
    text = '\n'.join([
        'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',
        'nine', 'ten'
    ])
    split_text_task = split_text_lines(text)
    print_text(split_text_task.outputs['odd_lines'])
    print_text(split_text_task.outputs['even_lines'])


# %% [markdown]
# ### Example: Pipeline that generates then sums many numbers

# %%
# Writing many numbers
@func_to_container_op
def write_numbers(numbers_path: OutputPath(str), start: int = 0, count: int = 10):
    '''Write the numbers from start up to count, one per line'''
    with open(numbers_path, 'w') as writer:
        for i in range(start, count):
            writer.write(str(i) + '\n')


# Reading and summing many numbers
@func_to_container_op
def sum_numbers(numbers_path: InputPath(str)) -> int:
    '''Sum the numbers read from a file, one per line'''
    total = 0
    with open(numbers_path, 'r') as reader:
        for line in reader:
            total = total + int(line)
    return total


# Pipeline to sum 100000 numbers
def sum_pipeline(count: int = 100000):
    numbers_task = write_numbers(count=count)
    print_text(numbers_task.output)

    sum_task = sum_numbers(numbers_task.outputs['numbers'])
    print_text(sum_task.output)


# %% [markdown]
# ### Example: Pipeline with small data as input/output

# %%
# A small data output parameter used as an input parameter of another task
@func_to_container_op
def gen_params() -> int:
    '''Generate a random small integer output'''
    import random
    num = random.randint(0, 9)
    return num


# Print the result
@func_to_container_op
def print_params(numbers_param: int):
    print("The result number is: %d" % numbers_param)


# Pipeline to generate and echo the result
def params_pipeline():
    gen_task = gen_params()
    print_params(gen_task.output)


# Combining all pipelines together in a single pipeline
@dsl.pipeline(name='Big data passing')
def file_passing_pipelines():
    print_repeating_lines_pipeline()
    text_splitting_pipeline()
    sum_pipeline()
    params_pipeline()


# Generate the Tekton pipeline YAML with the kfp-tekton compiler
if __name__ == '__main__':
    TektonCompiler().compile(file_passing_pipelines,
                             __file__.replace('.py', '.yaml'))
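# %% [markdown]
# As an alternative to compiling to YAML, the pipeline can also be submitted directly to a cluster.
# The helper below is a sketch and not part of the original sample: it assumes a reachable
# Kubeflow Pipelines endpoint (the `host` default is a placeholder you must replace) and uses
# `kfp_tekton.TektonClient`; the helper name `submit_to_cluster` is made up for illustration.

# %%
def submit_to_cluster(host: str = 'http://localhost:8080'):
    '''Submit the combined pipeline to a Kubeflow Pipelines endpoint (requires a running cluster).'''
    from kfp_tekton import TektonClient  # imported lazily so compiling to YAML does not need a cluster
    client = TektonClient(host=host)  # the host URL is an assumption; point it at your own endpoint
    return client.create_run_from_pipeline_func(file_passing_pipelines, arguments={})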