pipelines/sdk/python/kfp/components/_python_op.py

344 lines
14 KiB
Python

# Copyright 2018 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#Names exported by `from <this module> import *`.
#NOTE(review): func_to_component_file is defined in this module without a leading underscore but is not listed here - confirm whether that is intentional.
__all__ = [
'python_op',
'func_to_container_op',
'func_to_component_text',
]
from ._yaml_utils import dump_yaml
from ._components import _create_task_factory_from_component_spec
from ._structures import InputSpec, OutputSpec, ImplementationSpec, DockerContainerSpec, ComponentSpec
from pathlib import Path
from typing import TypeVar, Generic
#Type variable standing for the data type named inside InputFile[...] / OutputFile[...].
T = TypeVar('T')


#Presumably the intended annotation syntax allows nesting, e.g. OutputFile[GcsPath[Gzipped[Text]]]
class InputFile(Generic[T], str):
    '''Annotation marker: a parameter annotated with InputFile[SomeType] is treated as a 'File' argument and becomes a component input.'''


class OutputFile(Generic[T], str):
    '''Annotation marker: a parameter annotated with OutputFile[SomeType] is treated as an 'Output' argument and becomes a component output.'''
#Container image used when the caller does not pass `base_image` to the conversion functions in this module.
#TODO: Replace this image name with another name once people decide what to replace it with.
_default_base_image='tensorflow/tensorflow:1.11.0-py3'
def _python_function_name_to_component_name(name):
    '''Turns a function name such as 'add_two_numbers' into a display name such as 'Add two numbers'.

    Underscores become spaces, runs of spaces collapse to one, leading/trailing spaces are dropped,
    and the result is capitalized (first character upper-cased, the rest lower-cased).
    '''
    spaced_name = name.replace('_', ' ')
    #Dropping the empty pieces collapses repeated separators and trims both ends in one step.
    words = [word for word in spaced_name.split(' ') if word]
    return ' '.join(words).capitalize()
def _func_to_component_spec(func, extra_code='', base_image=_default_base_image) -> ComponentSpec:
    '''
    Converts a Python function to a ComponentSpec whose container command runs that function.

    The function source (with decorators removed and dedented) is embedded, together with a generated
    command-line wrapper, into a `python3 -c <program>` container command.

    Parameters annotated with OutputFile[...] become component outputs; all other parameters become component inputs.
    The return value becomes a single output named 'Output', or one output per field when the return annotation
    is a NamedTuple. The wrapper writes each returned value, converted with str(), to the corresponding output file.

    Args:
        func: The python function to convert. Its source code must be retrievable with inspect.getsourcelines.
        extra_code: Optional. Extra code to add before the function code. May contain imports and other functions.
        base_image: Optional. Docker container image used to run the generated program.

    Returns:
        The ComponentSpec describing the generated component.
    '''
    import inspect
    import re
    from collections import OrderedDict

    single_output_name_const = 'Output'
    single_output_pythonic_name_const = 'output'

    signature = inspect.signature(func)
    parameters = list(signature.parameters.values())
    parameter_to_type_name = OrderedDict()
    inputs = []
    outputs = []
    extra_output_names = []
    arguments = []

    def annotation_to_argument_kind_and_type_name(annotation):
        '''Maps an annotation to (argument kind, type name). The kind is 'Value', 'File' or 'Output'; the type name may be None.'''
        if annotation is inspect.Parameter.empty or not annotation:
            return ('Value', None)
        #Generic type. `__origin__` can be None (non-parameterized generics on old Python versions) and
        #some origins (e.g. typing.Union on Python 3.7-3.9) have no `__name__`, so every attribute is read defensively.
        origin = getattr(annotation, '__origin__', None)
        if origin is not None:
            origin_name = getattr(origin, '__name__', None)
            type_args = getattr(annotation, '__args__', None) or ()
            if type_args and origin_name in (InputFile.__name__, OutputFile.__name__):
                inner_type = type_args[0]
                #The inner type can be a forward reference (InputFile['Text']) or a nested generic without `__name__`.
                inner_type_name = (
                    getattr(inner_type, '__forward_arg__', None)
                    or getattr(inner_type, '__name__', None)
                    or str(inner_type)
                )
                if origin_name == InputFile.__name__:
                    return ('File', inner_type_name)
                return ('Output', inner_type_name)
        if isinstance(annotation, type):
            return ('Value', annotation.__name__)
        #!!! It's important to preserve string annotations as strings. Annotations that are neither types nor strings are converted to strings.
        #Materializer adds double quotes to the types it does not recognize. - fix it to not quote strings.
        #We need two kind of strings: we can use any type name for component YAML, but for generated Python code we must use valid python type annotations.
        return ('Value', "'" + str(annotation) + "'")

    for parameter in parameters:
        (argument_kind, parameter_type_name) = annotation_to_argument_kind_and_type_name(parameter.annotation)
        parameter_to_type_name[parameter.name] = parameter_type_name

        #TODO: Humanize the input/output names
        arguments.append([argument_kind, parameter.name])

        parameter_spec = OrderedDict([('name', parameter.name)])
        if parameter_type_name:
            parameter_spec['type'] = parameter_type_name

        if argument_kind in ('Value', 'File'):
            inputs.append(parameter_spec)
        elif argument_kind == 'Output':
            outputs.append(parameter_spec)
        else:
            #Cannot happen
            raise ValueError('Unrecognized argument kind {}.'.format(argument_kind))

    #Analyzing the return type annotations.
    return_ann = signature.return_annotation
    if hasattr(return_ann, '_fields'): #NamedTuple
        #`_field_types` was removed in Python 3.9; `__annotations__` holds the same name -> type mapping.
        #Plain collections.namedtuple classes have neither, so their outputs stay untyped.
        field_types = (
            getattr(return_ann, '_field_types', None)
            or getattr(return_ann, '__annotations__', None)
            or {}
        )
        for field_name in return_ann._fields:
            output_spec = OrderedDict([('name', field_name)])
            output_type = field_types.get(field_name, None)
            if output_type:
                if isinstance(output_type, type):
                    output_spec['type'] = output_type.__name__
                else:
                    output_spec['type'] = str(output_type)
            outputs.append(output_spec)
            extra_output_names.append(field_name)
            arguments.append(['Output', field_name])
    else:
        output_spec = OrderedDict([('name', single_output_name_const)])
        (_, output_type_name) = annotation_to_argument_kind_and_type_name(return_ann)
        if output_type_name:
            output_spec['type'] = output_type_name
        outputs.append(output_spec)
        extra_output_names.append(single_output_pythonic_name_const)
        arguments.append(['Output', single_output_name_const])

    func_name = func.__name__

    #Source code can include decorators like @python_op. Remove them.
    #A decorator call can span several lines, so everything before the `def` line is dropped
    #instead of only the lines that start with '@'.
    (source_lines, _) = inspect.getsourcelines(func)
    func_code_lines = list(source_lines)
    def_line_index = next(
        (idx for idx, line in enumerate(func_code_lines) if line.lstrip().startswith(('def ', 'async def '))),
        None,
    )
    if def_line_index is not None:
        del func_code_lines[:def_line_index]
    else:
        #No recognizable `def` line: fall back to stripping the leading single-line decorators only.
        while len(func_code_lines) > 1 and func_code_lines[0].lstrip().startswith('@'):
            del func_code_lines[0]

    #Function might be defined in some indented scope (e.g. in another function).
    #We need to handle this and properly dedent the function source code
    first_line = func_code_lines[0]
    indent = len(first_line) - len(first_line.lstrip())
    func_code = ''.join(line[indent:] for line in func_code_lines) #Lines retain their \n endings

    internal_args = [param.name for param in parameters]
    extra_output_external_names = [name + '_file' for name in extra_output_names]
    external_annotated_parameters = [
        name + ':' + type_name if type_name else name
        for name, type_name in parameter_to_type_name.items()
    ] + extra_output_external_names

    #The template is the program passed to `python3 -c`, so its text starts at column 0.
    #NOTE(review): a non-tuple return value is wrapped into a 1-tuple, but a single-output function
    #that returns a tuple gets only its first element written - confirm whether that is intended.
    full_source = \
'''\
from pathlib import Path
from typing import NamedTuple

{extra_code}

{func_code}

def {wrapper_func_name}({external_annotated_parameters}):
    outputs = {func_name}({internal_args})
    if not isinstance(outputs, tuple):
        outputs = (outputs,)
    for idx, filename in enumerate([{output_external_names}]):
        Path(filename).parent.mkdir(parents=True, exist_ok=True)
        Path(filename).write_text(str(outputs[idx]))

try:
    import fire
except ImportError:
    import subprocess
    import sys
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'fire==0.1.3'])
    import fire

fire.Fire({wrapper_func_name})
'''.format(
        wrapper_func_name=func_name + '_wrapper',
        func_name=func_name,
        external_annotated_parameters=', '.join(external_annotated_parameters),
        internal_args=', '.join(internal_args),
        output_external_names=', '.join(extra_output_external_names),
        func_code=func_code,
        extra_code=extra_code,
    )

    #Removing consecutive blank lines
    full_source = re.sub('\n\n\n+', '\n\n', full_source).strip('\n') + '\n'

    component_name = _python_function_name_to_component_name(func_name)
    description = func.__doc__.strip() + '\n' if func.__doc__ else None #Interesting: unlike ruamel.yaml, PyYaml cannot handle trailing spaces in the last line (' \n') and switches the style to double-quoted.

    component_spec = ComponentSpec(
        name=component_name,
        description=description,
        inputs=[InputSpec.from_struct(input_struct) for input_struct in inputs],
        outputs=[OutputSpec.from_struct(output_struct) for output_struct in outputs],
        implementation=ImplementationSpec(
            docker_container=DockerContainerSpec(
                image=base_image,
                command=['python3', '-c', full_source],
                arguments=arguments,
            )
        )
    )
    return component_spec
def _func_to_component_dict(func, extra_code='', base_image=_default_base_image):
    '''Converts a Python function to a component definition and returns the result of the spec's to_struct().'''
    component_spec = _func_to_component_spec(func, extra_code=extra_code, base_image=base_image)
    return component_spec.to_struct()
def func_to_component_text(func, extra_code='', base_image=_default_base_image):
    '''
    Builds a component definition from a Python function and returns it as text.

    The function docstring becomes the component description.
    The argument and return annotations become the component input/output types.

    A function with several return values is declared with the NamedTuple return annotation syntax:

        from typing import NamedTuple
        def add_multiply_two_numbers(a: float, b: float) -> NamedTuple('DummyName', [('sum', float), ('product', float)]):
            """Returns sum and product of two arguments"""
            return (a + b, a * b)

    Args:
        func: The python function to convert
        extra_code: Optional. Extra code to add before the function code. May contain imports and other functions.
        base_image: Optional. Specify a custom Docker container image to use in the component. For lightweight components, the image needs to have python and the fire package.

    Returns:
        Textual representation of a component definition
    '''
    component_spec = _func_to_component_spec(func, extra_code=extra_code, base_image=base_image)
    return dump_yaml(component_spec.to_struct())
def func_to_component_file(func, output_component_file, base_image=_default_base_image, extra_code='') -> None:
    '''
    Builds a component definition from a Python function and saves it to a file.

    The function docstring becomes the component description.
    The argument and return annotations become the component input/output types.

    A function with several return values is declared with the NamedTuple return annotation syntax:

        from typing import NamedTuple
        def add_multiply_two_numbers(a: float, b: float) -> NamedTuple('DummyName', [('sum', float), ('product', float)]):
            """Returns sum and product of two arguments"""
            return (a + b, a * b)

    Args:
        func: The python function to convert
        output_component_file: Path of the local file that receives the component definition. Can be used for sharing.
        base_image: Optional. Specify a custom Docker container image to use in the component. For lightweight components, the image needs to have python and the fire package.
        extra_code: Optional. Extra code to add before the function code. May contain imports and other functions.
    '''
    #Keyword arguments: the text function takes extra_code and base_image in a different order than this one.
    Path(output_component_file).write_text(
        func_to_component_text(func, extra_code=extra_code, base_image=base_image)
    )
def func_to_container_op(func, output_component_file=None, base_image=_default_base_image, extra_code=''):
    '''
    Turns a Python function into a component and returns the matching task (ContainerOp) factory.

    The function docstring becomes the component description.
    The argument and return annotations become the component input/output types.

    A function with several return values is declared with the NamedTuple return annotation syntax:

        from typing import NamedTuple
        def add_multiply_two_numbers(a: float, b: float) -> NamedTuple('DummyName', [('sum', float), ('product', float)]):
            """Returns sum and product of two arguments"""
            return (a + b, a * b)

    Args:
        func: The python function to convert
        output_component_file: Optional. Write a component definition to a local file. Can be used for sharing.
        base_image: Optional. Specify a custom Docker container image to use in the component. For lightweight components, the image needs to have python and the fire package.
        extra_code: Optional. Extra code to add before the function code. May contain imports and other functions.

    Returns:
        A factory function with a strongly-typed signature taken from the python function.
        Once called with the required arguments, the factory constructs a pipeline task instance (ContainerOp) that can run the original function in a container.
    '''
    spec = _func_to_component_spec(func, extra_code=extra_code, base_image=base_image)

    #The definition file is only written when the caller supplied a path.
    if output_component_file:
        Path(output_component_file).write_text(dump_yaml(spec.to_struct()))
        #TODO: assert ComponentSpec.from_struct(load_yaml(output_component_file)) == component_spec

    return _create_task_factory_from_component_spec(spec)
def python_op(func=None, base_image=_default_base_image, output_component_file=None, extra_code=''):
    '''
    Decorator that swaps a Python function for an equivalent task (ContainerOp) factory.

    Works both bare (@python_op) and with arguments (@python_op(base_image=...)).
    The function docstring becomes the component description.
    The argument and return annotations become the component input/output types.

    A function with several return values is declared with the NamedTuple return annotation syntax:

        from typing import NamedTuple
        @python_op(base_image='tensorflow/tensorflow:1.11.0-py3')
        def add_multiply_two_numbers_op(a: float, b: float) -> NamedTuple('DummyName', [('sum', float), ('product', float)]):
            """Returns sum and product of two arguments"""
            return (a + b, a * b)

    Args:
        func: The python function to convert
        base_image: Optional. Specify a custom Docker container image to use in the component. For lightweight components, the image needs to have python and the fire package.
        output_component_file: Optional. Write a component definition to a local file. Can be used for sharing.
        extra_code: Optional. Extra code to add before the function code. May contain imports and other functions.

    Returns:
        A factory function with a strongly-typed signature taken from the python function.
        Once called with the required arguments, the factory constructs a pipeline task instance (ContainerOp) that can run the original function in a container.
        When `func` is not given, a decorator that performs this conversion is returned instead.
    '''
    def convert(python_func):
        return func_to_container_op(python_func, output_component_file, base_image, extra_code)

    #No function yet means the decorator was called with arguments: hand back the converter itself.
    if not func:
        return convert
    return convert(func)