feat(sdk): support a Pythonic artifact authoring style (#9932)

* chore(sdk): test observability, refactorings, and cleanup

* implement unified i/o syntax

* clarify container component error message

* address review feedback

* add dill to packages_to_install

* update get_uri logic for KFP and tests

* update execution tests & cleanup
Connor McCarthy 2023-10-19 11:49:58 -07:00 committed by GitHub
parent e8fb6990df
commit 8d00d0eb9a
19 changed files with 1969 additions and 244 deletions


@ -16,6 +16,7 @@
# 2.3.0
## Features
* Support `PipelineTaskFinalStatus` in tasks that use `.ignore_upstream_failure()` [\#10010](https://github.com/kubeflow/pipelines/pull/10010)
* Add support for a Pythonic artifact authoring style [\#9932](https://github.com/kubeflow/pipelines/pull/9932)
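The new style can be sketched roughly as follows (the component and names below are illustrative, not taken from this PR): artifact classes are used directly as parameter and return annotations instead of being wrapped in `Input[...]`/`Output[...]` markers, and output artifacts are constructed and returned from the function body, typically using `dsl.get_uri()`.

from kfp import dsl
from kfp.dsl import Artifact, Dataset

@dsl.component
def train(data: Dataset) -> Artifact:
    # read the input artifact via its local path
    with open(data.path) as f:
        text = f.read()
    # construct the output artifact explicitly and return it
    return Artifact(
        uri=dsl.get_uri(suffix='model'),
        metadata={'num_chars': len(text)},
    )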
## Breaking changes


@ -33,6 +33,7 @@ from kfp.compiler import compiler
from kfp.compiler import compiler_utils
from kfp.dsl import Artifact
from kfp.dsl import ContainerSpec
from kfp.dsl import Dataset
from kfp.dsl import graph_component
from kfp.dsl import Input
from kfp.dsl import Model
@ -5279,5 +5280,415 @@ class TestDslOneOf(unittest.TestCase):
return dsl.OneOf(t3, t4.output)
class TestPythonicArtifactAuthoring(unittest.TestCase):
# python component
def test_pythonic_input_artifact(self):
@dsl.component
def pythonic_style(in_artifact: Artifact):
print(in_artifact)
self.assertEqual(
pythonic_style.pipeline_spec.components['comp-pythonic-style']
.input_definitions.artifacts['in_artifact'].artifact_type
.schema_title,
'system.Artifact',
)
self.assertFalse(
pythonic_style.pipeline_spec.components['comp-pythonic-style']
.input_definitions.parameters)
@dsl.component
def standard_style(in_artifact: Input[Artifact]):
print(in_artifact)
self.assertEqual(
pythonic_style.pipeline_spec.components['comp-pythonic-style']
.input_definitions.artifacts['in_artifact'].artifact_type
.schema_title,
standard_style.pipeline_spec.components['comp-standard-style']
.input_definitions.artifacts['in_artifact'].artifact_type
.schema_title,
)
def test_pythonic_input_artifact_optional(self):
@dsl.component
def pythonic_style(in_artifact: Optional[Artifact] = None):
print(in_artifact)
self.assertEqual(
pythonic_style.pipeline_spec.components['comp-pythonic-style']
.input_definitions.artifacts['in_artifact'].artifact_type
.schema_title,
'system.Artifact',
)
self.assertFalse(
pythonic_style.pipeline_spec.components['comp-pythonic-style']
.input_definitions.parameters)
@dsl.component
def standard_style(in_artifact: Optional[Input[Artifact]] = None):
print(in_artifact)
self.assertEqual(
pythonic_style.pipeline_spec.components['comp-pythonic-style']
.input_definitions.artifacts['in_artifact'].artifact_type
.schema_title,
standard_style.pipeline_spec.components['comp-standard-style']
.input_definitions.artifacts['in_artifact'].artifact_type
.schema_title,
)
def test_pythonic_input_list_of_artifacts(self):
@dsl.component
def pythonic_style(in_artifact: List[Artifact]):
print(in_artifact)
self.assertEqual(
pythonic_style.pipeline_spec.components['comp-pythonic-style']
.input_definitions.artifacts['in_artifact'].artifact_type
.schema_title,
'system.Artifact',
)
self.assertTrue(
pythonic_style.pipeline_spec.components['comp-pythonic-style']
.input_definitions.artifacts['in_artifact'].is_artifact_list)
self.assertFalse(
pythonic_style.pipeline_spec.components['comp-pythonic-style']
.input_definitions.parameters)
@dsl.component
def standard_style(in_artifact: Input[List[Artifact]]):
print(in_artifact)
self.assertEqual(
pythonic_style.pipeline_spec.components['comp-pythonic-style']
.input_definitions.artifacts['in_artifact'].artifact_type
.schema_title,
standard_style.pipeline_spec.components['comp-standard-style']
.input_definitions.artifacts['in_artifact'].artifact_type
.schema_title,
)
def test_pythonic_input_list_of_artifacts_optional(self):
@dsl.component
def pythonic_style(in_artifact: Optional[List[Artifact]] = None):
print(in_artifact)
self.assertEqual(
pythonic_style.pipeline_spec.components['comp-pythonic-style']
.input_definitions.artifacts['in_artifact'].artifact_type
.schema_title,
'system.Artifact',
)
self.assertTrue(
pythonic_style.pipeline_spec.components['comp-pythonic-style']
.input_definitions.artifacts['in_artifact'].is_artifact_list)
self.assertFalse(
pythonic_style.pipeline_spec.components['comp-pythonic-style']
.input_definitions.parameters)
@dsl.component
def standard_style(in_artifact: Optional[Input[List[Artifact]]] = None):
print(in_artifact)
self.assertEqual(
pythonic_style.pipeline_spec.components['comp-pythonic-style']
.input_definitions.artifacts['in_artifact'].artifact_type
.schema_title,
standard_style.pipeline_spec.components['comp-standard-style']
.input_definitions.artifacts['in_artifact'].artifact_type
.schema_title,
)
def test_pythonic_output_artifact(self):
@dsl.component
def pythonic_style() -> Artifact:
return Artifact(uri='gs://my_bucket/foo')
self.assertEqual(
pythonic_style.pipeline_spec.components['comp-pythonic-style']
.output_definitions.artifacts['Output'].artifact_type.schema_title,
'system.Artifact',
)
self.assertFalse(
pythonic_style.pipeline_spec.components['comp-pythonic-style']
.output_definitions.parameters)
@dsl.component
def standard_style(named_output: Output[Artifact]):
named_output.uri = 'gs://my_bucket/foo'
self.assertEqual(
pythonic_style.pipeline_spec.components['comp-pythonic-style']
.output_definitions.artifacts['Output'].artifact_type.schema_title,
standard_style.pipeline_spec.components['comp-standard-style']
.output_definitions.artifacts['named_output'].artifact_type
.schema_title,
)
def test_pythonic_output_artifact_multiple_returns(self):
@dsl.component
def pythonic_style() -> NamedTuple('outputs', a=Artifact, d=Dataset):
a = Artifact(uri='gs://my_bucket/foo/artifact')
d = Dataset(uri='gs://my_bucket/foo/dataset')
outputs = NamedTuple('outputs', a=Artifact, d=Dataset)
return outputs(a=a, d=d)
self.assertEqual(
pythonic_style.pipeline_spec.components['comp-pythonic-style']
.output_definitions.artifacts['a'].artifact_type.schema_title,
'system.Artifact',
)
self.assertEqual(
pythonic_style.pipeline_spec.components['comp-pythonic-style']
.output_definitions.artifacts['d'].artifact_type.schema_title,
'system.Dataset',
)
self.assertFalse(
pythonic_style.pipeline_spec.components['comp-pythonic-style']
.output_definitions.parameters)
@dsl.component
def standard_style(a: Output[Artifact], d: Output[Dataset]):
a.uri = 'gs://my_bucket/foo/artifact'
d.uri = 'gs://my_bucket/foo/dataset'
self.assertEqual(
pythonic_style.pipeline_spec.components['comp-pythonic-style']
.output_definitions.artifacts['a'].artifact_type.schema_title,
standard_style.pipeline_spec.components['comp-standard-style']
.output_definitions.artifacts['a'].artifact_type.schema_title,
)
self.assertEqual(
pythonic_style.pipeline_spec.components['comp-pythonic-style']
.output_definitions.artifacts['d'].artifact_type.schema_title,
standard_style.pipeline_spec.components['comp-standard-style']
.output_definitions.artifacts['d'].artifact_type.schema_title,
)
def test_pythonic_output_list_artifacts(self):
with self.assertRaisesRegex(
ValueError,
r"Output lists of artifacts are only supported for pipelines\. Got output list of artifacts for output parameter 'Output' of component 'pythonic-style'\."
):
@dsl.component
def pythonic_style() -> List[Artifact]:
pass
def test_mixed_component_authoring_styles(self):
# mixing styles is permitted, since the expected behavior is unambiguous
# input: traditional; output: pythonic
@dsl.component
def back_compat_style(in_artifact: Input[Artifact]) -> Artifact:
print(in_artifact)
return Artifact(uri='gs://my_bucket/foo')
self.assertTrue(back_compat_style.pipeline_spec)
# output: traditional; input: pythonic
@dsl.component
def mixed_style(in_artifact: Artifact, out_artifact: Output[Artifact]):
print(in_artifact)
out_artifact.uri = 'gs://my_bucket/foo'
self.assertTrue(mixed_style.pipeline_spec)
# pipeline
def test_pipeline_input_artifact(self):
@dsl.component
def pythonic_style(in_artifact: Artifact):
print(in_artifact)
@dsl.pipeline
def my_pipeline(in_artifact: Artifact):
pythonic_style(in_artifact=in_artifact)
self.assertEqual(
my_pipeline.pipeline_spec.root.input_definitions
.artifacts['in_artifact'].artifact_type.schema_title,
'system.Artifact',
)
self.assertFalse(
my_pipeline.pipeline_spec.root.input_definitions.parameters)
def test_pipeline_input_artifact_optional(self):
@dsl.component
def pythonic_style(in_artifact: Optional[Artifact] = None):
print(in_artifact)
@dsl.pipeline
def my_pipeline(in_artifact: Optional[Artifact] = None):
pythonic_style(in_artifact=in_artifact)
self.assertEqual(
my_pipeline.pipeline_spec.root.input_definitions
.artifacts['in_artifact'].artifact_type.schema_title,
'system.Artifact',
)
self.assertFalse(
my_pipeline.pipeline_spec.root.input_definitions.parameters)
def test_pipeline_input_list_of_artifacts(self):
@dsl.component
def pythonic_style(in_artifact: List[Artifact]):
print(in_artifact)
@dsl.pipeline
def my_pipeline(in_artifact: List[Artifact]):
pythonic_style(in_artifact=in_artifact)
self.assertEqual(
my_pipeline.pipeline_spec.root.input_definitions
.artifacts['in_artifact'].artifact_type.schema_title,
'system.Artifact',
)
self.assertTrue(my_pipeline.pipeline_spec.root.input_definitions
.artifacts['in_artifact'].is_artifact_list)
self.assertFalse(
my_pipeline.pipeline_spec.root.input_definitions.parameters)
def test_pipeline_input_list_of_artifacts_optional(self):
@dsl.component
def pythonic_style(in_artifact: Optional[List[Artifact]] = None):
print(in_artifact)
@dsl.pipeline
def my_pipeline(in_artifact: Optional[List[Artifact]] = None):
pythonic_style(in_artifact=in_artifact)
self.assertEqual(
my_pipeline.pipeline_spec.root.input_definitions
.artifacts['in_artifact'].artifact_type.schema_title,
'system.Artifact',
)
self.assertFalse(
my_pipeline.pipeline_spec.root.input_definitions.parameters)
def test_pipeline_output_artifact(self):
@dsl.component
def pythonic_style() -> Artifact:
return Artifact(uri='gs://my_bucket/foo')
@dsl.pipeline
def my_pipeline() -> Artifact:
return pythonic_style().output
self.assertEqual(
my_pipeline.pipeline_spec.root.output_definitions
.artifacts['Output'].artifact_type.schema_title, 'system.Artifact')
self.assertFalse(
my_pipeline.pipeline_spec.root.output_definitions.parameters)
def test_pipeline_output_list_of_artifacts(self):
@dsl.component
def noop() -> Artifact:
# write artifact
return Artifact(uri='gs://my_bucket/foo/bar')
@dsl.pipeline
def my_pipeline() -> List[Artifact]:
with dsl.ParallelFor([1, 2, 3]):
t = noop()
return dsl.Collected(t.output)
self.assertEqual(
my_pipeline.pipeline_spec.root.output_definitions
.artifacts['Output'].artifact_type.schema_title, 'system.Artifact')
self.assertTrue(my_pipeline.pipeline_spec.root.output_definitions
.artifacts['Output'].is_artifact_list)
self.assertFalse(
my_pipeline.pipeline_spec.root.output_definitions.parameters)
# container
def test_container_input_artifact(self):
with self.assertRaisesRegex(
TypeError,
r"Container Components must wrap input and output artifact annotations with Input/Output type markers \(Input\[<artifact>\] or Output\[<artifact>\]\)\. Got function input 'in_artifact' with annotation <class 'kfp\.dsl\.types\.artifact_types\.Artifact'>\."
):
@dsl.container_component
def comp(in_artifact: Artifact):
return dsl.ContainerSpec(image='alpine', command=['pwd'])
def test_container_input_artifact_optional(self):
with self.assertRaisesRegex(
TypeError,
r"Container Components must wrap input and output artifact annotations with Input/Output type markers \(Input\[<artifact>\] or Output\[<artifact>\]\)\. Got function input 'in_artifact' with annotation <class 'kfp\.dsl\.types\.artifact_types\.Artifact'>\."
):
@dsl.container_component
def comp(in_artifact: Optional[Artifact] = None):
return dsl.ContainerSpec(image='alpine', command=['pwd'])
def test_container_input_list_of_artifacts(self):
with self.assertRaisesRegex(
TypeError,
r"Container Components must wrap input and output artifact annotations with Input/Output type markers \(Input\[<artifact>\] or Output\[<artifact>\]\)\. Got function input 'in_artifact' with annotation typing\.List\[kfp\.dsl\.types\.artifact_types\.Artifact\]\."
):
@dsl.container_component
def comp(in_artifact: List[Artifact]):
return dsl.ContainerSpec(image='alpine', command=['pwd'])
def test_container_input_list_of_artifacts_optional(self):
with self.assertRaisesRegex(
TypeError,
r"Container Components must wrap input and output artifact annotations with Input/Output type markers \(Input\[<artifact>\] or Output\[<artifact>\]\)\. Got function input 'in_artifact' with annotation typing\.List\[kfp\.dsl\.types\.artifact_types\.Artifact\]\."
):
@dsl.container_component
def comp(in_artifact: Optional[List[Artifact]] = None):
return dsl.ContainerSpec(image='alpine', command=['pwd'])
def test_container_output_artifact(self):
with self.assertRaisesRegex(
TypeError,
r'Return annotation should be either ContainerSpec or omitted for container components\.'
):
@dsl.container_component
def comp() -> Artifact:
return dsl.ContainerSpec(image='alpine', command=['pwd'])
def test_container_output_list_of_artifact(self):
with self.assertRaisesRegex(
TypeError,
r'Return annotation should be either ContainerSpec or omitted for container components\.'
):
@dsl.container_component
def comp() -> List[Artifact]:
return dsl.ContainerSpec(image='alpine', command=['pwd'])
if __name__ == '__main__':
unittest.main()


@ -29,6 +29,7 @@ __all__ = [
'Metrics',
'Model',
'SlicedClassificationMetrics',
'get_uri',
'PIPELINE_JOB_NAME_PLACEHOLDER',
'PIPELINE_JOB_RESOURCE_NAME_PLACEHOLDER',
'PIPELINE_JOB_ID_PLACEHOLDER',
@ -44,6 +45,7 @@ from kfp.dsl.task_final_status import PipelineTaskFinalStatus
from kfp.dsl.types.artifact_types import Artifact
from kfp.dsl.types.artifact_types import ClassificationMetrics
from kfp.dsl.types.artifact_types import Dataset
from kfp.dsl.types.artifact_types import get_uri
from kfp.dsl.types.artifact_types import HTML
from kfp.dsl.types.artifact_types import Markdown
from kfp.dsl.types.artifact_types import Metrics


@ -17,7 +17,8 @@ import itertools
import pathlib
import re
import textwrap
from typing import Callable, List, Mapping, Optional, Tuple, Type, Union
from typing import (Any, Callable, Dict, List, Mapping, Optional, Tuple, Type,
Union)
import warnings
import docstring_parser
@ -192,7 +193,7 @@ def _get_function_source_definition(func: Callable) -> str:
return '\n'.join(func_code_lines)
def _maybe_make_unique(name: str, names: List[str]):
def maybe_make_unique(name: str, names: List[str]):
if name not in names:
return name
@ -204,6 +205,181 @@ def _maybe_make_unique(name: str, names: List[str]):
raise RuntimeError(f'Too many arguments with the name {name}')
def get_name_to_specs(
signature: inspect.Signature,
containerized: bool = False,
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
"""Returns two dictionaries.
The first is a mapping of input name to input annotation. The second
is a mapping of output name to output annotation.
"""
func_params = list(signature.parameters.values())
name_to_input_specs = {}
name_to_output_specs = {}
### handle function parameter annotations ###
for func_param in func_params:
name = func_param.name
if name == SINGLE_OUTPUT_NAME:
raise ValueError(
f'"{SINGLE_OUTPUT_NAME}" is an invalid parameter name.')
# Stripping Optional from Optional[<type>] is the only processing done
# on annotations in this flow. Other than that, we extract the raw
# annotation and process later.
annotation = type_annotations.maybe_strip_optional_from_annotation(
func_param.annotation)
# no annotation
if annotation == inspect._empty:
raise TypeError(f'Missing type annotation for argument: {name}')
# is Input[Artifact], Input[List[<Artifact>]], <param> (e.g., str), or InputPath(<param>)
elif (type_annotations.is_artifact_wrapped_in_Input(annotation) or
isinstance(
annotation,
type_annotations.InputPath,
) or type_utils.is_parameter_type(annotation)):
name_to_input_specs[maybe_make_unique(
name, list(name_to_input_specs))] = make_input_spec(
annotation, func_param)
# is Artifact annotation (e.g., Artifact, Dataset, etc.)
# or List[<Artifact>]
elif type_annotations.issubclass_of_artifact(
annotation) or type_annotations.is_list_of_artifacts(
annotation):
if containerized:
raise TypeError(
f"Container Components must wrap input and output artifact annotations with Input/Output type markers (Input[<artifact>] or Output[<artifact>]). Got function input '{name}' with annotation {annotation}."
)
name_to_input_specs[maybe_make_unique(
name, list(name_to_input_specs))] = make_input_spec(
annotation, func_param)
# is Output[Artifact] or OutputPath(<param>)
elif type_annotations.is_artifact_wrapped_in_Output(
annotation) or isinstance(annotation,
type_annotations.OutputPath):
name_to_output_specs[maybe_make_unique(
name,
list(name_to_output_specs))] = make_output_spec(annotation)
# parameter type
else:
type_string = type_utils._annotation_to_type_struct(annotation)
name_to_input_specs[maybe_make_unique(
name, list(name_to_input_specs))] = make_input_spec(
type_string, func_param)
### handle return annotations ###
return_ann = signature.return_annotation
# validate container component returns
if containerized:
if return_ann not in [
inspect.Parameter.empty,
structures.ContainerSpec,
]:
raise TypeError(
'Return annotation should be either ContainerSpec or omitted for container components.'
)
# ignore omitted returns
elif return_ann is None or return_ann == inspect.Parameter.empty:
pass
# is NamedTuple
elif hasattr(return_ann, '_fields'):
# Getting field type annotations.
# __annotations__ does not exist in python 3.5 and earlier
# _field_types does not exist in python 3.9 and later
field_annotations = getattr(return_ann, '__annotations__',
None) or getattr(return_ann, '_field_types')
for name in return_ann._fields:
annotation = field_annotations[name]
if not type_annotations.is_list_of_artifacts(
annotation) and not type_annotations.is_artifact_class(
annotation):
annotation = type_utils._annotation_to_type_struct(annotation)
name_to_output_specs[maybe_make_unique(
name,
list(name_to_output_specs))] = make_output_spec(annotation)
# is deprecated dict returns style
elif isinstance(return_ann, dict):
warnings.warn(
'The ability to specify multiple outputs using the dict syntax'
' has been deprecated. It will be removed soon after release'
' 0.1.32. Please use typing.NamedTuple to declare multiple'
' outputs.', DeprecationWarning)
for output_name, output_type_annotation in return_ann.items():
output_type = type_utils._annotation_to_type_struct(
output_type_annotation)
name_to_output_specs[maybe_make_unique(
output_name, list(name_to_output_specs))] = output_type
# is the simple single return case (can be `-> <param>` or `-> Artifact`)
# treated the same way, since processing is done in inner functions
else:
name_to_output_specs[maybe_make_unique(
SINGLE_OUTPUT_NAME,
list(name_to_output_specs))] = make_output_spec(return_ann)
return name_to_input_specs, name_to_output_specs
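As a rough illustration of the dispatch above (the component signature below is hypothetical and the spec fields are abbreviated), a pythonic signature yields one InputSpec per parameter and a single OutputSpec keyed 'Output':

import inspect

from kfp.dsl import Artifact, Dataset

def my_comp(in_artifact: Artifact, threshold: float) -> Dataset:
    ...

inputs, outputs = get_name_to_specs(inspect.signature(my_comp))
# inputs maps 'in_artifact' to an artifact InputSpec and 'threshold' to a
# parameter InputSpec; outputs maps the single return to the key 'Output'
# with an OutputSpec for a system.Dataset artifact.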
def canonicalize_annotation(annotation: Any):
"""Does cleaning on annotations that are common between input and output
annotations."""
if type_annotations.is_Input_Output_artifact_annotation(annotation):
annotation = type_annotations.strip_Input_or_Output_marker(annotation)
if isinstance(annotation,
(type_annotations.InputPath, type_annotations.OutputPath)):
annotation = annotation.type
return annotation
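For instance (illustrative calls only), the canonicalization above unwraps I/O markers and path wrappers down to the underlying type:

from kfp.dsl import Artifact, Input, InputPath, Output

canonicalize_annotation(Input[Artifact])      # -> Artifact
canonicalize_annotation(Output[Artifact])     # -> Artifact
canonicalize_annotation(InputPath('String'))  # -> 'String'
canonicalize_annotation(str)                  # -> str, unchanged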
def make_input_output_spec_args(annotation: Any) -> Dict[str, Any]:
"""Gets a dict of kwargs shared between InputSpec and OutputSpec."""
is_artifact_list = type_annotations.is_list_of_artifacts(annotation)
if is_artifact_list:
annotation = type_annotations.get_inner_type(annotation)
if type_annotations.issubclass_of_artifact(annotation):
typ = type_utils.create_bundled_artifact_type(annotation.schema_title,
annotation.schema_version)
else:
typ = type_utils._annotation_to_type_struct(annotation)
return {'type': typ, 'is_artifact_list': is_artifact_list}
def make_output_spec(annotation: Any) -> structures.OutputSpec:
annotation = canonicalize_annotation(annotation)
args = make_input_output_spec_args(annotation)
return structures.OutputSpec(**args)
def make_input_spec(annotation: Any,
inspect_param: inspect.Parameter) -> structures.InputSpec:
"""Makes an InputSpec from a cleaned output annotation."""
annotation = canonicalize_annotation(annotation)
input_output_spec_args = make_input_output_spec_args(annotation)
if (type_annotations.issubclass_of_artifact(annotation) or
input_output_spec_args['is_artifact_list']
) and inspect_param.default not in {None, inspect._empty}:
raise ValueError(
f'Optional Input artifacts may only have default value None. Got: {inspect_param.default}.'
)
default = None if inspect_param.default == inspect.Parameter.empty or type_annotations.issubclass_of_artifact(
annotation) else inspect_param.default
optional = inspect_param.default is not inspect.Parameter.empty or type_utils.is_task_final_status_type(
getattr(inspect_param.annotation, '__name__', ''))
return structures.InputSpec(
**input_output_spec_args,
default=default,
optional=optional,
)
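A short sketch (hypothetical function and annotations, abbreviated results) of what the two factories above produce:

import inspect
from typing import List, Optional

from kfp.dsl import Dataset

make_output_spec(Dataset)        # OutputSpec for a single system.Dataset artifact
make_output_spec(List[Dataset])  # OutputSpec with is_artifact_list=True

def f(ds: Optional[Dataset] = None):
    ...

# optional artifact input: optional=True, default stays None
make_input_spec(Dataset, inspect.signature(f).parameters['ds'])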
def extract_component_interface(
func: Callable,
containerized: bool = False,
@ -211,182 +387,6 @@ def extract_component_interface(
name: Optional[str] = None,
) -> structures.ComponentSpec:
signature = inspect.signature(func)
parameters = list(signature.parameters.values())
original_docstring = inspect.getdoc(func)
parsed_docstring = docstring_parser.parse(original_docstring)
inputs = {}
outputs = {}
input_names = set()
output_names = set()
for parameter in parameters:
parameter_type = type_annotations.maybe_strip_optional_from_annotation(
parameter.annotation)
passing_style = None
io_name = parameter.name
is_artifact_list = False
if type_annotations.is_Input_Output_artifact_annotation(parameter_type):
# passing_style is either type_annotations.InputAnnotation or
# type_annotations.OutputAnnotation.
passing_style = type_annotations.get_io_artifact_annotation(
parameter_type)
# parameter_type is a type like typing_extensions.Annotated[kfp.dsl.types.artifact_types.Artifact, <class 'kfp.dsl.types.type_annotations.OutputAnnotation'>] OR typing_extensions.Annotated[typing.List[kfp.dsl.types.artifact_types.Artifact], <class 'kfp.dsl.types.type_annotations.OutputAnnotation'>]
is_artifact_list = type_annotations.is_list_of_artifacts(
parameter_type.__origin__)
parameter_type = type_annotations.get_io_artifact_class(
parameter_type)
if not type_annotations.is_artifact_class(parameter_type):
raise ValueError(
f'Input[T] and Output[T] are only supported when T is an artifact or list of artifacts. Found `{io_name} with type {parameter_type}`'
)
if parameter.default is not inspect.Parameter.empty:
if passing_style in [
type_annotations.OutputAnnotation,
type_annotations.OutputPath,
]:
raise ValueError(
'Default values for Output artifacts are not supported.'
)
elif parameter.default is not None:
raise ValueError(
f'Optional Input artifacts may only have default value None. Got: {parameter.default}.'
)
elif isinstance(
parameter_type,
(type_annotations.InputPath, type_annotations.OutputPath)):
passing_style = type(parameter_type)
parameter_type = parameter_type.type
if parameter.default is not inspect.Parameter.empty and not (
passing_style == type_annotations.InputPath and
parameter.default is None):
raise ValueError(
'Path inputs only support default values of None. Default'
' values for outputs are not supported.')
type_struct = type_utils._annotation_to_type_struct(parameter_type)
if type_struct is None:
raise TypeError(
f'Missing type annotation for argument: {parameter.name}')
if passing_style in [
type_annotations.OutputAnnotation, type_annotations.OutputPath
]:
if io_name == SINGLE_OUTPUT_NAME:
raise ValueError(
f'"{SINGLE_OUTPUT_NAME}" is an invalid parameter name.')
io_name = _maybe_make_unique(io_name, output_names)
output_names.add(io_name)
if type_annotations.is_artifact_class(parameter_type):
schema_version = parameter_type.schema_version
output_spec = structures.OutputSpec(
type=type_utils.create_bundled_artifact_type(
type_struct, schema_version),
is_artifact_list=is_artifact_list)
else:
output_spec = structures.OutputSpec(type=type_struct)
outputs[io_name] = output_spec
else:
io_name = _maybe_make_unique(io_name, input_names)
input_names.add(io_name)
type_ = type_utils.create_bundled_artifact_type(
type_struct, parameter_type.schema_version
) if type_annotations.is_artifact_class(
parameter_type) else type_struct
default = None if parameter.default == inspect.Parameter.empty or type_annotations.is_artifact_class(
parameter_type) else parameter.default
optional = parameter.default is not inspect.Parameter.empty or type_utils.is_task_final_status_type(
type_struct)
input_spec = structures.InputSpec(
type=type_,
default=default,
optional=optional,
is_artifact_list=is_artifact_list,
)
inputs[io_name] = input_spec
#Analyzing the return type annotations.
return_ann = signature.return_annotation
if not containerized:
if hasattr(return_ann, '_fields'): #NamedTuple
# Getting field type annotations.
# __annotations__ does not exist in python 3.5 and earlier
# _field_types does not exist in python 3.9 and later
field_annotations = getattr(return_ann, '__annotations__',
None) or getattr(
return_ann, '_field_types', None)
for field_name in return_ann._fields:
output_name = _maybe_make_unique(field_name, output_names)
output_names.add(output_name)
type_var = field_annotations.get(field_name)
if type_annotations.is_list_of_artifacts(type_var):
artifact_cls = type_var.__args__[0]
output_spec = structures.OutputSpec(
type=type_utils.create_bundled_artifact_type(
artifact_cls.schema_title,
artifact_cls.schema_version),
is_artifact_list=True)
elif type_annotations.is_artifact_class(type_var):
output_spec = structures.OutputSpec(
type=type_utils.create_bundled_artifact_type(
type_var.schema_title, type_var.schema_version))
else:
type_struct = type_utils._annotation_to_type_struct(
type_var)
output_spec = structures.OutputSpec(type=type_struct)
outputs[output_name] = output_spec
# Deprecated dict-based way of declaring multiple outputs. Was only used by
# the @component decorator
elif isinstance(return_ann, dict):
warnings.warn(
'The ability to specify multiple outputs using the dict syntax'
' has been deprecated. It will be removed soon after release'
' 0.1.32. Please use typing.NamedTuple to declare multiple'
' outputs.')
for output_name, output_type_annotation in return_ann.items():
output_type_struct = type_utils._annotation_to_type_struct(
output_type_annotation)
output_spec = structures.OutputSpec(type=output_type_struct)
outputs[name] = output_spec
elif signature.return_annotation is not None and signature.return_annotation != inspect.Parameter.empty:
output_name = _maybe_make_unique(SINGLE_OUTPUT_NAME, output_names)
# Fixes exotic, but possible collision:
# `def func(output_path: OutputPath()) -> str: ...`
output_names.add(output_name)
return_ann = signature.return_annotation
if type_annotations.is_list_of_artifacts(return_ann):
artifact_cls = return_ann.__args__[0]
output_spec = structures.OutputSpec(
type=type_utils.create_bundled_artifact_type(
artifact_cls.schema_title, artifact_cls.schema_version),
is_artifact_list=True)
elif type_annotations.is_artifact_class(return_ann):
output_spec = structures.OutputSpec(
type=type_utils.create_bundled_artifact_type(
return_ann.schema_title, return_ann.schema_version),
is_artifact_list=False)
else:
type_struct = type_utils._annotation_to_type_struct(return_ann)
output_spec = structures.OutputSpec(type=type_struct)
outputs[output_name] = output_spec
elif return_ann != inspect.Parameter.empty and return_ann != structures.ContainerSpec:
raise TypeError(
'Return annotation should be either ContainerSpec or omitted for container components.'
)
component_name = name or _python_function_name_to_component_name(
func.__name__)
def assign_descriptions(
inputs_or_outputs: Mapping[str, Union[structures.InputSpec,
structures.OutputSpec]],
@ -417,23 +417,32 @@ def extract_component_interface(
return None
assign_descriptions(inputs, parsed_docstring.params)
signature = inspect.signature(func)
name_to_input_spec, name_to_output_spec = get_name_to_specs(
signature, containerized)
original_docstring = inspect.getdoc(func)
parsed_docstring = docstring_parser.parse(original_docstring)
assign_descriptions(name_to_input_spec, parsed_docstring.params)
modified_parsed_docstring = parse_docstring_with_return_as_args(
original_docstring)
if modified_parsed_docstring is not None:
assign_descriptions(outputs, modified_parsed_docstring.params)
assign_descriptions(name_to_output_spec,
modified_parsed_docstring.params)
description = get_pipeline_description(
decorator_description=description,
docstring=parsed_docstring,
)
component_name = name or _python_function_name_to_component_name(
func.__name__)
return structures.ComponentSpec(
name=component_name,
description=description,
inputs=inputs or None,
outputs=outputs or None,
inputs=name_to_input_spec or None,
outputs=name_to_output_spec or None,
implementation=structures.Implementation(),
)
@ -573,7 +582,7 @@ def make_input_for_parameterized_container_component_function(
Type[artifact_types.Artifact]]
) -> Union[placeholders.Placeholder, container_component_artifact_channel
.ContainerComponentArtifactChannel]:
if type_annotations.is_input_artifact(annotation):
if type_annotations.is_artifact_wrapped_in_Input(annotation):
if type_annotations.is_list_of_artifacts(annotation.__origin__):
return placeholders.InputListOfArtifactsPlaceholder(name)
@ -581,7 +590,7 @@ def make_input_for_parameterized_container_component_function(
return container_component_artifact_channel.ContainerComponentArtifactChannel(
io_type='input', var_name=name)
elif type_annotations.is_output_artifact(annotation):
elif type_annotations.is_artifact_wrapped_in_Output(annotation):
if type_annotations.is_list_of_artifacts(annotation.__origin__):
return placeholders.OutputListOfArtifactsPlaceholder(name)


@ -16,6 +16,7 @@ import json
import os
import re
from typing import Any, Callable, Dict, List, Optional, Union
import warnings
from kfp import dsl
from kfp.dsl import task_final_status
@ -39,6 +40,12 @@ class Executor:
self.func = function_to_execute
self.executor_input = executor_input
self.executor_output_path = self.executor_input['outputs']['outputFile']
# drop executor_output.json part from the outputFile path
artifact_types.CONTAINER_TASK_ROOT = os.path.split(
self.executor_output_path)[0]
self.input_artifacts: Dict[str, Union[dsl.Artifact,
List[dsl.Artifact]]] = {}
self.output_artifacts: Dict[str, dsl.Artifact] = {}
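For example (paths made up for illustration), an outputFile of '/gcs/my-bucket/my-pipeline/my-task/executor_output.json' yields a task root of '/gcs/my-bucket/my-pipeline/my-task', which dsl.get_uri later converts to a remote 'gs://...' URI:

import os

executor_output_path = '/gcs/my-bucket/my-pipeline/my-task/executor_output.json'
task_root = os.path.split(executor_output_path)[0]
# task_root == '/gcs/my-bucket/my-pipeline/my-task'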
@ -55,9 +62,14 @@ class Executor:
if list_of_artifact_proto_structs:
annotation = self.func.__annotations__[name]
# InputPath has no attribute __origin__ and also should be handled as a single artifact
if type_annotations.is_Input_Output_artifact_annotation(
annotation) and type_annotations.is_list_of_artifacts(
annotation.__origin__):
annotation = type_annotations.maybe_strip_optional_from_annotation(
annotation)
is_list_of_artifacts = (
type_annotations.is_Input_Output_artifact_annotation(
annotation) and
type_annotations.is_list_of_artifacts(annotation.__origin__)
) or type_annotations.is_list_of_artifacts(annotation)
if is_list_of_artifacts:
self.input_artifacts[name] = [
self.make_artifact(
msg,
@ -129,7 +141,7 @@ class Executor:
path = parameter.get('outputFile', None)
if path:
os.makedirs(os.path.dirname(path), exist_ok=True)
makedirs_recursively(path)
return path
def get_output_artifact_path(self, artifact_name: str) -> str:
@ -189,8 +201,29 @@ class Executor:
f'Function `{self.func.__name__}` returned value of type {type(return_value)}; want type {origin_type}'
)
self.write_output_parameter_value(output_name, return_value)
elif is_artifact(annotation_type):
self.write_output_artifact_payload(output_name, return_value)
if isinstance(return_value, artifact_types.Artifact):
# for -> Artifact annotations, where the user returns an artifact
artifact_name = self.executor_input['outputs']['artifacts'][
output_name]['artifacts'][0]['name']
# users should not override the artifact name on Vertex AI Pipelines:
# if the name is an empty string, fill it with the Vertex resource name;
# otherwise warn and prefer letting the backend raise the exception
running_on_vertex = 'VERTEX_AI_PIPELINES_RUN_LABELS' in os.environ
if running_on_vertex:
if return_value.name == '':
return_value.name = artifact_name
else:
# prefer letting the backend throw the runtime exception
warnings.warn(
f'If you are running your pipeline on Vertex AI Pipelines, you should not provide a name for your artifact. It will be set to the Vertex artifact resource name {artifact_name} by default. Got value for name: {return_value.name}.',
RuntimeWarning,
stacklevel=2)
self.output_artifacts[output_name] = return_value
else:
# for -> Artifact annotations, where the user returns some data that the executor should serialize
self.write_output_artifact_payload(output_name, return_value)
else:
raise RuntimeError(
f'Unknown return type: {annotation_type}. Must be one of the supported data types: https://www.kubeflow.org/docs/components/pipelines/v2/data-types/'
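In other words (hypothetical components for illustration): returning an artifact instance records its name, uri, and metadata directly in the executor output, while returning plain data for an artifact-annotated output still goes through payload serialization:

from kfp import dsl
from kfp.dsl import Artifact, Dataset

def returns_instance() -> Artifact:
    # stored in self.output_artifacts; the executor writes no payload
    return Artifact(uri=dsl.get_uri(suffix='foo'), metadata={'k': 'v'})

def returns_payload() -> Dataset:
    # serialized via write_output_artifact_payload to the output artifact's path
    return 'col_a,col_b\n1,2\n'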
@ -209,18 +242,6 @@ class Executor:
Returns:
Optional[str]: Returns the location of the executor_output file as a string if the file is written. Else, None.
"""
if self.output_artifacts:
self.excutor_output['artifacts'] = {}
for name, artifact in self.output_artifacts.items():
runtime_artifact = {
'name': artifact.name,
'uri': artifact.uri,
'metadata': artifact.metadata,
}
artifacts_list = {'artifacts': [runtime_artifact]}
self.excutor_output['artifacts'][name] = artifacts_list
if func_output is not None:
if is_parameter(self.return_annotation) or is_artifact(
@ -248,6 +269,19 @@ class Executor:
f'Unknown return type: {self.return_annotation}. Must be one of `str`, `int`, `float`, a subclass of `Artifact`, or a NamedTuple collection of these types.'
)
if self.output_artifacts:
self.excutor_output['artifacts'] = {}
for name, artifact in self.output_artifacts.items():
runtime_artifact = {
'name': artifact.name,
'uri': artifact.uri,
'metadata': artifact.metadata,
}
artifacts_list = {'artifacts': [runtime_artifact]}
self.excutor_output['artifacts'][name] = artifacts_list
# This check is to ensure only one worker (in a mirrored, distributed training/compute strategy) attempts to write to the same executor output file at the same time using gcsfuse, which enforces immutability of files.
write_file = True
@ -259,12 +293,10 @@ class Executor:
write_file = cluster_spec['task']['type'] in CHIEF_NODE_LABELS
if write_file:
executor_output_path = self.executor_input['outputs']['outputFile']
os.makedirs(os.path.dirname(executor_output_path), exist_ok=True)
with open(executor_output_path, 'w') as f:
makedirs_recursively(self.executor_output_path)
with open(self.executor_output_path, 'w') as f:
f.write(json.dumps(self.excutor_output))
return executor_output_path
return self.executor_output_path
return None
def execute(self) -> Optional[str]:
@ -300,17 +332,23 @@ class Executor:
error_message=value.get('error').get('message', None),
)
elif type_annotations.is_list_of_artifacts(v):
func_kwargs[k] = self.get_input_artifact(k)
elif is_parameter(v):
value = self.get_input_parameter_value(k)
if value is not None:
func_kwargs[k] = value
elif type_annotations.is_Input_Output_artifact_annotation(v):
if type_annotations.is_input_artifact(v):
if type_annotations.is_artifact_wrapped_in_Input(v):
func_kwargs[k] = self.get_input_artifact(k)
if type_annotations.is_output_artifact(v):
if type_annotations.is_artifact_wrapped_in_Output(v):
func_kwargs[k] = self.get_output_artifact(k)
elif is_artifact(v):
func_kwargs[k] = self.get_input_artifact(k)
elif isinstance(v, type_annotations.OutputPath):
if is_parameter(v.type):
func_kwargs[k] = self.get_output_parameter_path(k)


@ -13,6 +13,7 @@
# limitations under the License.
"""Tests for kfp.dsl.executor."""
import contextlib
import json
import os
import tempfile
@ -41,10 +42,21 @@ class ExecutorTest(parameterized.TestCase):
def setUp(cls):
cls.maxDiff = None
cls._test_dir = tempfile.mkdtemp()
cls.prev_gcs_prefix = artifact_types._GCS_LOCAL_MOUNT_PREFIX
cls.prev_minio_prefix = artifact_types._MINIO_LOCAL_MOUNT_PREFIX
cls.prev_s3_prefix = artifact_types._S3_LOCAL_MOUNT_PREFIX
artifact_types._GCS_LOCAL_MOUNT_PREFIX = cls._test_dir + '/'
artifact_types._MINIO_LOCAL_MOUNT_PREFIX = cls._test_dir + '/minio/'
artifact_types._S3_LOCAL_MOUNT_PREFIX = cls._test_dir + '/s3/'
@classmethod
def tearDown(cls):
artifact_types._GCS_LOCAL_MOUNT_PREFIX = cls.prev_gcs_prefix
artifact_types._MINIO_LOCAL_MOUNT_PREFIX = cls.prev_minio_prefix
artifact_types._S3_LOCAL_MOUNT_PREFIX = cls.prev_s3_prefix
def execute(self, func: Callable, executor_input: str) -> None:
executor_input_dict = json.loads(executor_input %
{'test_dir': self._test_dir})
@ -52,12 +64,12 @@ class ExecutorTest(parameterized.TestCase):
executor.Executor(
executor_input=executor_input_dict,
function_to_execute=func).execute()
return executor_input_dict['outputs']['outputFile']
def execute_and_load_output_metadata(self, func: Callable,
executor_input: str) -> dict:
self.execute(func, executor_input)
with open(os.path.join(self._test_dir, 'output_metadata.json'),
'r') as f:
output_file = self.execute(func, executor_input)
with open(output_file) as f:
return json.loads(f.read())
def test_input_and_output_parameters(self):
@ -1153,6 +1165,39 @@ class ExecutorTest(parameterized.TestCase):
input_artifact.name,
'projects/123/locations/us-central1/metadataStores/default/artifacts/input_artifact'
)
output_metadata = self.execute_and_load_output_metadata(
test_func, executor_input)
self.assertDictEqual(output_metadata, {})
def test_single_artifact_input_pythonic(self):
executor_input = """\
{
"inputs": {
"artifacts": {
"input_artifact": {
"artifacts": [
{
"metadata": {},
"name": "projects/123/locations/us-central1/metadataStores/default/artifacts/input_artifact",
"type": {
"schemaTitle": "system.Artifact"
},
"uri": "gs://some-bucket/output/input_artifact"
}
]
}
}
},
"outputs": {
"outputFile": "%(test_dir)s/output_metadata.json"
}
}
"""
def test_func(input_artifact: Artifact):
self.assertIsInstance(input_artifact, Artifact)
self.assertEqual(
input_artifact.name,
'projects/123/locations/us-central1/metadataStores/default/artifacts/input_artifact'
@ -1163,6 +1208,261 @@ class ExecutorTest(parameterized.TestCase):
self.assertDictEqual(output_metadata, {})
def test_single_artifact_input_pythonic_with_optional(self):
executor_input = """\
{
"inputs": {
"artifacts": {
"input_artifact": {
"artifacts": [
{
"metadata": {},
"name": "projects/123/locations/us-central1/metadataStores/default/artifacts/input_artifact",
"type": {
"schemaTitle": "system.Artifact"
},
"uri": "gs://some-bucket/output/input_artifact"
}
]
}
}
},
"outputs": {
"outputFile": "%(test_dir)s/output_metadata.json"
}
}
"""
def test_func(input_artifact: Optional[Artifact] = None):
self.assertIsInstance(input_artifact, Artifact)
self.assertEqual(
input_artifact.name,
'projects/123/locations/us-central1/metadataStores/default/artifacts/input_artifact'
)
output_metadata = self.execute_and_load_output_metadata(
test_func, executor_input)
self.assertDictEqual(output_metadata, {})
def test_single_artifact_output_pythonic(self):
executor_input = """\
{
"inputs": {},
"outputs": {
"artifacts": {
"Output": {
"artifacts": [
{
"metadata": {},
"name": "projects/123/locations/us-central1/metadataStores/default/artifacts/123",
"type": {
"schemaTitle": "system.Artifact"
},
"uri": "gs://some-bucket/output"
}
]
}
},
"outputFile": "%(test_dir)s/output_metadata.json"
}
}
"""
def test_func() -> Artifact:
return Artifact(
uri='gs://manually_specified_bucket/foo',
metadata={'data': 123},
)
with temporary_envvar('VERTEX_AI_PIPELINES_RUN_LABELS', '12325'):
output_metadata = self.execute_and_load_output_metadata(
test_func, executor_input)
self.assertDictEqual(
output_metadata, {
'artifacts': {
'Output': {
'artifacts': [{
'name':
'projects/123/locations/us-central1/metadataStores/default/artifacts/123',
'uri':
'gs://manually_specified_bucket/foo',
'metadata': {
'data': 123
}
}]
}
},
})
def test_single_artifact_output_pythonic_with_get_uri(self):
executor_input = """\
{
"inputs": {},
"outputs": {
"artifacts": {
"Output": {
"artifacts": [
{
"metadata": {},
"name": "projects/123/locations/us-central1/metadataStores/default/artifacts/123",
"type": {
"schemaTitle": "system.Artifact"
},
"uri": "gs://some-bucket/output"
}
]
}
},
"outputFile": "%(test_dir)s/another_bucket/output_metadata.json"
}
}
"""
def test_func() -> Artifact:
return Artifact(
uri=dsl.get_uri(suffix='my_artifact'),
metadata={'data': 123},
)
with temporary_envvar('VERTEX_AI_PIPELINES_RUN_LABELS', '12325'):
output_metadata = self.execute_and_load_output_metadata(
test_func, executor_input)
self.assertDictEqual(
output_metadata, {
'artifacts': {
'Output': {
'artifacts': [{
'name':
'projects/123/locations/us-central1/metadataStores/default/artifacts/123',
'uri':
'gs://another_bucket/my_artifact',
'metadata': {
'data': 123
}
}]
}
},
})
def test_multiple_artifact_output_pythonic_with_get_uri(self):
executor_input = """\
{
"inputs": {},
"outputs": {
"artifacts": {
"a": {
"artifacts": [
{
"metadata": {},
"name": "projects/123/locations/us-central1/metadataStores/default/artifacts/123",
"type": {
"schemaTitle": "system.Artifact"
},
"uri": "gs://some-bucket/output"
}
]
},
"d": {
"artifacts": [
{
"metadata": {},
"name": "projects/123/locations/us-central1/metadataStores/default/artifacts/321",
"type": {
"schemaTitle": "system.Dataset"
},
"uri": "gs://some-bucket/output"
}
]
}
},
"outputFile": "%(test_dir)s/another_bucket/output_metadata.json"
}
}
"""
def test_func() -> NamedTuple('outputs', a=Artifact, d=Dataset):
outputs = NamedTuple('outputs', a=Artifact, d=Dataset)
return outputs(
a=Artifact(
uri=dsl.get_uri(suffix='artifact'),
metadata={'data': 123},
),
d=Dataset(
uri=dsl.get_uri(suffix='dataset'),
metadata={},
))
with temporary_envvar('VERTEX_AI_PIPELINES_RUN_LABELS', '12325'):
output_metadata = self.execute_and_load_output_metadata(
test_func, executor_input)
self.assertDictEqual(
output_metadata, {
'artifacts': {
'a': {
'artifacts': [{
'name':
'projects/123/locations/us-central1/metadataStores/default/artifacts/123',
'uri':
'gs://another_bucket/artifact',
'metadata': {
'data': 123
}
}]
},
'd': {
'artifacts': [{
'name':
'projects/123/locations/us-central1/metadataStores/default/artifacts/321',
'uri':
'gs://another_bucket/dataset',
'metadata': {}
}]
}
},
})
def test_warns_if_artifact_name_for_vertex(self):
executor_input = """\
{
"inputs": {},
"outputs": {
"artifacts": {
"Output": {
"artifacts": [
{
"metadata": {},
"name": "projects/123/locations/us-central1/metadataStores/default/artifacts/123",
"type": {
"schemaTitle": "system.Artifact"
},
"uri": "gs://some-bucket/output"
}
]
}
},
"outputFile": "%(test_dir)s/output_metadata.json"
}
}
"""
def test_func() -> Artifact:
return Artifact(
name='illegal_custom_name',
uri='gs://manually_specified_bucket/foo',
metadata={'data': 123},
)
with temporary_envvar('VERTEX_AI_PIPELINES_RUN_LABELS', '12325'):
with self.assertWarnsRegex(
RuntimeWarning,
r'If you are running your pipeline on Vertex AI Pipelines, you should not provide a name for your artifact\. It will be set to the Vertex artifact resource name projects/123/locations/us-central1/metadataStores/default/artifacts/123 by default\. Got value for name: illegal_custom_name\.'
):
self.execute_and_load_output_metadata(test_func, executor_input)
def test_list_of_artifacts_input(self):
executor_input = """\
{
@ -1212,6 +1512,104 @@ class ExecutorTest(parameterized.TestCase):
self.assertDictEqual(output_metadata, {})
def test_list_of_artifacts_input_pythonic(self):
executor_input = """\
{
"inputs": {
"artifacts": {
"input_list": {
"artifacts": [
{
"metadata": {},
"name": "projects/123/locations/us-central1/metadataStores/default/artifacts/input_list/0",
"type": {
"schemaTitle": "system.Artifact"
},
"uri": "gs://some-bucket/output/input_list/0"
},
{
"metadata": {},
"name": "projects/123/locations/us-central1/metadataStores/default/artifacts/input_list/1",
"type": {
"schemaTitle": "system.Artifact"
},
"uri": "gs://some-bucket/output/input_list/1"
}
]
}
}
},
"outputs": {
"outputFile": "%(test_dir)s/output_metadata.json"
}
}
"""
def test_func(input_list: List[Artifact]):
self.assertEqual(len(input_list), 2)
self.assertEqual(
input_list[0].name,
'projects/123/locations/us-central1/metadataStores/default/artifacts/input_list/0'
)
self.assertEqual(
input_list[1].name,
'projects/123/locations/us-central1/metadataStores/default/artifacts/input_list/1'
)
output_metadata = self.execute_and_load_output_metadata(
test_func, executor_input)
self.assertDictEqual(output_metadata, {})
def test_list_of_artifacts_input_pythonic_with_optional(self):
executor_input = """\
{
"inputs": {
"artifacts": {
"input_list": {
"artifacts": [
{
"metadata": {},
"name": "projects/123/locations/us-central1/metadataStores/default/artifacts/input_list/0",
"type": {
"schemaTitle": "system.Artifact"
},
"uri": "gs://some-bucket/output/input_list/0"
},
{
"metadata": {},
"name": "projects/123/locations/us-central1/metadataStores/default/artifacts/input_list/1",
"type": {
"schemaTitle": "system.Artifact"
},
"uri": "gs://some-bucket/output/input_list/1"
}
]
}
}
},
"outputs": {
"outputFile": "%(test_dir)s/output_metadata.json"
}
}
"""
def test_func(input_list: Optional[List[Artifact]] = None):
self.assertEqual(len(input_list), 2)
self.assertEqual(
input_list[0].name,
'projects/123/locations/us-central1/metadataStores/default/artifacts/input_list/0'
)
self.assertEqual(
input_list[1].name,
'projects/123/locations/us-central1/metadataStores/default/artifacts/input_list/1'
)
output_metadata = self.execute_and_load_output_metadata(
test_func, executor_input)
self.assertDictEqual(output_metadata, {})
class TestDictToArtifact(parameterized.TestCase):
@ -1329,5 +1727,23 @@ class TestDictToArtifact(parameterized.TestCase):
executor.create_artifact_instance(runtime_artifact), expected_type)
@contextlib.contextmanager
def temporary_envvar(key: str, value: str) -> None:
# Save the old value if it exists
old_value = os.environ.get(key, None)
# Set the new value
os.environ[key] = value
try:
yield
finally:
# Restore the old value or delete the key if it didn't exist before
if old_value is not None:
os.environ[key] = old_value
else:
del os.environ[key]
if __name__ == '__main__':
unittest.main()


@ -11,9 +11,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Classes for input/output Artifacts in KFP SDK."""
"""Classes and utilities for using and creating artifacts in components."""
import os
from typing import Dict, List, Optional, Type
import warnings
_GCS_LOCAL_MOUNT_PREFIX = '/gcs/'
_MINIO_LOCAL_MOUNT_PREFIX = '/minio/'
@ -90,13 +92,17 @@ class Artifact:
return None
def _set_path(self, path: str) -> None:
if path.startswith(_GCS_LOCAL_MOUNT_PREFIX):
path = 'gs://' + path[len(_GCS_LOCAL_MOUNT_PREFIX):]
elif path.startswith(_MINIO_LOCAL_MOUNT_PREFIX):
path = 'minio://' + path[len(_MINIO_LOCAL_MOUNT_PREFIX):]
elif path.startswith(_S3_LOCAL_MOUNT_PREFIX):
path = 's3://' + path[len(_S3_LOCAL_MOUNT_PREFIX):]
self.uri = path
self.uri = convert_local_path_to_remote_path(path)
def convert_local_path_to_remote_path(path: str) -> str:
if path.startswith(_GCS_LOCAL_MOUNT_PREFIX):
return 'gs://' + path[len(_GCS_LOCAL_MOUNT_PREFIX):]
elif path.startswith(_MINIO_LOCAL_MOUNT_PREFIX):
return 'minio://' + path[len(_MINIO_LOCAL_MOUNT_PREFIX):]
elif path.startswith(_S3_LOCAL_MOUNT_PREFIX):
return 's3://' + path[len(_S3_LOCAL_MOUNT_PREFIX):]
return path
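For example, the prefix translation above behaves as follows (these cases mirror the parameterized test added later in this PR):

convert_local_path_to_remote_path('/gcs/foo/bar')      # 'gs://foo/bar'
convert_local_path_to_remote_path('/minio/foo/bar')    # 'minio://foo/bar'
convert_local_path_to_remote_path('/s3/foo/bar')       # 's3://foo/bar'
convert_local_path_to_remote_path('/tmp/kfp_outputs')  # unchanged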
class Model(Artifact):
@ -470,3 +476,38 @@ _SCHEMA_TITLE_TO_TYPE: Dict[str, Type[Artifact]] = {
Markdown,
]
}
CONTAINER_TASK_ROOT: Optional[str] = None
# The default suffix 'Output' must match the default output key of a
# single-output component; use the literal value (not the variable) so it
# renders correctly in reference docs.
def get_uri(suffix: str = 'Output') -> str:
"""Gets the task root URI, a unique object storage URI associated with the
current task. This function may only be called at task runtime.
Returns an empty string if the task root cannot be inferred from the runtime environment.
Args:
suffix: A suffix to append to the URI. This is helpful for creating unique subdirectories when the component has multiple outputs.
Returns:
The URI or empty string.
"""
if CONTAINER_TASK_ROOT is None:
raise RuntimeError(
f"'dsl.{get_uri.__name__}' can only be called at task runtime. The task root is unknown in the current environment."
)
UNSUPPORTED_KFP_PATH = '/tmp/kfp_outputs'
if CONTAINER_TASK_ROOT == UNSUPPORTED_KFP_PATH:
warnings.warn(
f'dsl.{get_uri.__name__} is not yet supported by the KFP backend. Please specify a URI explicitly.',
RuntimeWarning,
stacklevel=2,
)
# return an empty string, not None, to match the Artifact constructor,
# which immediately converts uri=None to uri=''; this way the .path
# property has fewer input types to handle
return ''
remote_task_root = convert_local_path_to_remote_path(CONTAINER_TASK_ROOT)
return os.path.join(remote_task_root, suffix)
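At task runtime this is typically called when constructing an output artifact inside a pythonic component, e.g. (illustrative component):

from kfp import dsl
from kfp.dsl import Model

@dsl.component
def train() -> Model:
    # e.g. task root '/gcs/bucket/pipeline/task' -> 'gs://bucket/pipeline/task/model'
    return Model(uri=dsl.get_uri(suffix='model'))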


@ -13,18 +13,20 @@
# limitations under the License.
"""Tests for kfp.components.types.artifact_types."""
import contextlib
import json
import os
import unittest
from absl.testing import parameterized
from kfp import dsl
from kfp.dsl.types import artifact_types
class ArtifactsTest(parameterized.TestCase):
class ArtifactsTest(unittest.TestCase):
def test_complex_metrics(self):
metrics = artifact_types.ClassificationMetrics()
metrics = dsl.ClassificationMetrics()
metrics.log_roc_data_point(threshold=0.1, tpr=98.2, fpr=96.2)
metrics.log_roc_data_point(threshold=24.3, tpr=24.5, fpr=98.4)
metrics.set_confusion_matrix_categories(['dog', 'cat', 'horses'])
@ -41,7 +43,7 @@ class ArtifactsTest(parameterized.TestCase):
self.assertEqual(expected_json, metrics.metadata)
def test_complex_metrics_bulk_loading(self):
metrics = artifact_types.ClassificationMetrics()
metrics = dsl.ClassificationMetrics()
metrics.log_roc_curve(
fpr=[85.1, 85.1, 85.1],
tpr=[52.6, 52.6, 52.6],
@ -57,5 +59,92 @@ class ArtifactsTest(parameterized.TestCase):
self.assertEqual(expected_json, metrics.metadata)
@contextlib.contextmanager
def set_temporary_task_root(task_root: str):
artifact_types.CONTAINER_TASK_ROOT = task_root
try:
yield
finally:
artifact_types.CONTAINER_TASK_ROOT = None
class TestGetUri(unittest.TestCase):
def test_raise_if_no_env_var(self):
with self.assertRaisesRegex(
RuntimeError,
r"'dsl\.get_uri' can only be called at task runtime\. The task root is unknown in the current environment\."
):
dsl.get_uri()
def test_default_gcs(self):
with set_temporary_task_root(
'/gcs/my_bucket/123456789/abc-09-14-2023-14-21-53/foo_123456789'
):
self.assertEqual(
'gs://my_bucket/123456789/abc-09-14-2023-14-21-53/foo_123456789/Output',
dsl.get_uri())
def test_default_s3(self):
with set_temporary_task_root(
'/s3/my_bucket/123456789/abc-09-14-2023-14-21-53/foo_123456789'
):
self.assertEqual(
's3://my_bucket/123456789/abc-09-14-2023-14-21-53/foo_123456789/Output',
dsl.get_uri())
def test_default_minio(self):
with set_temporary_task_root(
'/minio/my_bucket/123456789/abc-09-14-2023-14-21-53/foo_123456789'
):
self.assertEqual(
'minio://my_bucket/123456789/abc-09-14-2023-14-21-53/foo_123456789/Output',
dsl.get_uri())
def test_suffix_arg_gcs(self):
with set_temporary_task_root(
'/gcs/my_bucket/123456789/abc-09-14-2023-14-21-53/foo_123456789'
):
self.assertEqual(
'gs://my_bucket/123456789/abc-09-14-2023-14-21-53/foo_123456789/model',
dsl.get_uri('model'))
def test_suffix_arg_tmp_no_suffix(self):
with set_temporary_task_root('/tmp/kfp_outputs'):
with self.assertWarnsRegex(
RuntimeWarning,
r'dsl\.get_uri is not yet supported by the KFP backend\. Please specify a URI explicitly\.'
):
actual = dsl.get_uri()
self.assertEqual('', actual)
def test_suffix_arg_tmp_with_suffix(self):
with set_temporary_task_root('/tmp/kfp_outputs'):
with self.assertWarnsRegex(
RuntimeWarning,
r'dsl\.get_uri is not yet supported by the KFP backend\. Please specify a URI explicitly\.'
):
actual = dsl.get_uri('model')
self.assertEqual('', actual)
class TestConvertLocalPathToRemotePath(parameterized.TestCase):
@parameterized.parameters([{
'local_path': local_path,
'expected': expected
} for local_path, expected in [
('/gcs/foo/bar', 'gs://foo/bar'),
('/minio/foo/bar', 'minio://foo/bar'),
('/s3/foo/bar', 's3://foo/bar'),
('/tmp/kfp_outputs', '/tmp/kfp_outputs'),
('/some/random/path', '/some/random/path'),
]])
def test_gcs(self, local_path, expected):
actual = artifact_types.convert_local_path_to_remote_path(local_path)
self.assertEqual(actual, expected)
if __name__ == '__main__':
unittest.main()


@ -135,7 +135,7 @@ def is_Input_Output_artifact_annotation(typ) -> bool:
return True
def is_input_artifact(typ) -> bool:
def is_artifact_wrapped_in_Input(typ: Any) -> bool:
"""Returns True if typ is of type Input[T]."""
if not is_Input_Output_artifact_annotation(typ):
return False
@ -143,7 +143,7 @@ def is_input_artifact(typ) -> bool:
return typ.__metadata__[0] == InputAnnotation
def is_output_artifact(typ) -> bool:
def is_artifact_wrapped_in_Output(typ: Any) -> bool:
"""Returns True if typ is of type Output[T]."""
if not is_Input_Output_artifact_annotation(typ):
return False
@ -160,14 +160,19 @@ def get_io_artifact_class(typ):
return None
# extract inner type from list of artifacts
inner = typ.__args__[0]
inner = strip_Input_or_Output_marker(typ)
if hasattr(inner, '__origin__') and inner.__origin__ == list:
return inner.__args__[0]
return inner
def get_io_artifact_annotation(typ):
def strip_Input_or_Output_marker(typ: Any) -> artifact_types.Artifact:
return typ.__args__[0]
def get_input_or_output_marker(
typ) -> Optional[Union[InputAnnotation, OutputAnnotation]]:
if not is_Input_Output_artifact_annotation(typ):
return None
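Concretely, the renamed helpers behave as in the examples below (mirroring the updated tests later in this diff):

from typing import List

from kfp.dsl import Artifact, Input, Model, Output

is_artifact_wrapped_in_Input(Input[Artifact])       # True
is_artifact_wrapped_in_Output(Output[List[Model]])  # True
strip_Input_or_Output_marker(Input[Artifact])       # Artifact
get_input_or_output_marker(Output[Model])           # OutputAnnotation
get_input_or_output_marker(str)                     # None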


@ -58,21 +58,24 @@ class AnnotationsTest(parameterized.TestCase):
Input,
])
def test_is_input_artifact(self, annotation):
self.assertTrue(type_annotations.is_input_artifact(annotation))
self.assertTrue(
type_annotations.is_artifact_wrapped_in_Input(annotation))
@parameterized.parameters([
Output[Model],
Output,
])
def test_is_not_input_artifact(self, annotation):
self.assertFalse(type_annotations.is_input_artifact(annotation))
self.assertFalse(
type_annotations.is_artifact_wrapped_in_Input(annotation))
@parameterized.parameters([
Output[Model],
Output[List[Model]],
])
def test_is_output_artifact(self, annotation):
self.assertTrue(type_annotations.is_output_artifact(annotation))
self.assertTrue(
type_annotations.is_artifact_wrapped_in_Output(annotation))
@parameterized.parameters([
Input[Model],
@ -80,7 +83,8 @@ class AnnotationsTest(parameterized.TestCase):
Input,
])
def test_is_not_output_artifact(self, annotation):
self.assertFalse(type_annotations.is_output_artifact(annotation))
self.assertFalse(
type_annotations.is_artifact_wrapped_in_Output(annotation))
def test_get_io_artifact_class(self):
self.assertEqual(
@ -97,26 +101,26 @@ class AnnotationsTest(parameterized.TestCase):
def test_get_io_artifact_annotation(self):
self.assertEqual(
type_annotations.get_io_artifact_annotation(Output[Model]),
type_annotations.get_input_or_output_marker(Output[Model]),
OutputAnnotation)
self.assertEqual(
type_annotations.get_io_artifact_annotation(Output[List[Model]]),
type_annotations.get_input_or_output_marker(Output[List[Model]]),
OutputAnnotation)
self.assertEqual(
type_annotations.get_io_artifact_annotation(Input[Model]),
type_annotations.get_input_or_output_marker(Input[Model]),
InputAnnotation)
self.assertEqual(
type_annotations.get_io_artifact_annotation(Input[List[Model]]),
type_annotations.get_input_or_output_marker(Input[List[Model]]),
InputAnnotation)
self.assertEqual(
type_annotations.get_io_artifact_annotation(Input), InputAnnotation)
type_annotations.get_input_or_output_marker(Input), InputAnnotation)
self.assertEqual(
type_annotations.get_io_artifact_annotation(Output),
type_annotations.get_input_or_output_marker(Output),
OutputAnnotation)
self.assertEqual(
type_annotations.get_io_artifact_annotation(Model), None)
self.assertEqual(type_annotations.get_io_artifact_annotation(str), None)
type_annotations.get_input_or_output_marker(Model), None)
self.assertEqual(type_annotations.get_input_or_output_marker(str), None)
@parameterized.parameters(
{


@ -0,0 +1,58 @@
# Copyright 2023 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from kfp import dsl
from kfp.dsl import Dataset
from kfp.dsl import Model
@dsl.component(packages_to_install=['dill==0.3.7'])
def make_language_model(text_dataset: Dataset) -> Model:
# dill allows pickling objects belonging to a function's local namespace
import dill
with open(text_dataset.path) as f:
text = f.read()
# insert train on text here #
def dummy_model(x: str) -> str:
return x
model = Model(
uri=dsl.get_uri(suffix='model'),
metadata={'data': text_dataset.name},
)
with open(model.path, 'wb') as f:
dill.dump(dummy_model, f)
return model
@dsl.pipeline
def make_language_model_pipeline() -> Model:
importer = dsl.importer(
artifact_uri='gs://ml-pipeline-playground/shakespeare1.txt',
artifact_class=Dataset,
reimport=False,
metadata={'key': 'value'})
return make_language_model(text_dataset=importer.output).output
if __name__ == '__main__':
from kfp import compiler
compiler.Compiler().compile(
pipeline_func=make_language_model_pipeline,
package_path=__file__.replace('.py', '.yaml'))
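
(Illustrative only, not part of this commit: a minimal downstream consumer of the Model produced above, written in the same Pythonic input style. The component name and the prompt parameter are hypothetical; it assumes dill can unpickle the dummy model from the artifact's materialized path.)

from kfp import dsl
from kfp.dsl import Model


@dsl.component(packages_to_install=['dill==0.3.7'])
def use_language_model(model: Model, prompt: str) -> str:
    import dill

    # The KFP runtime materializes the input artifact at model.path.
    with open(model.path, 'rb') as f:
        dummy_model = dill.load(f)
    return dummy_model(prompt)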


@ -0,0 +1,123 @@
# PIPELINE DEFINITION
# Name: make-language-model-pipeline
# Outputs:
# Output: system.Model
components:
comp-importer:
executorLabel: exec-importer
inputDefinitions:
parameters:
uri:
parameterType: STRING
outputDefinitions:
artifacts:
artifact:
artifactType:
schemaTitle: system.Dataset
schemaVersion: 0.0.1
comp-make-language-model:
executorLabel: exec-make-language-model
inputDefinitions:
artifacts:
text_dataset:
artifactType:
schemaTitle: system.Dataset
schemaVersion: 0.0.1
outputDefinitions:
artifacts:
Output:
artifactType:
schemaTitle: system.Model
schemaVersion: 0.0.1
deploymentSpec:
executors:
exec-importer:
importer:
artifactUri:
constant: gs://ml-pipeline-playground/shakespeare1.txt
metadata:
key: value
typeSchema:
schemaTitle: system.Dataset
schemaVersion: 0.0.1
exec-make-language-model:
container:
args:
- --executor_input
- '{{$}}'
- --function_to_execute
- make_language_model
command:
- sh
- -c
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.2.0'\
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
\ python3 -m pip install --quiet --no-warn-script-location 'dill==0.3.7'\
\ && \"$0\" \"$@\"\n"
- sh
- -ec
- 'program_path=$(mktemp -d)
printf "%s" "$0" > "$program_path/ephemeral_component.py"
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef make_language_model(text_dataset: Dataset) -> Model:\n # dill\
\ allows pickling objects belonging to a function's local namespace\n \
\ import dill\n\n with open(text_dataset.path) as f:\n text =\
\ f.read()\n\n # insert train on text here #\n\n def dummy_model(x:\
\ str) -> str:\n return x\n\n model = Model(\n uri=dsl.get_uri(suffix='model'),\n\
\ metadata={'data': text_dataset.name},\n )\n\n with open(model.path,\
\ 'wb') as f:\n dill.dump(dummy_model, f)\n\n return model\n\n"
image: python:3.7
pipelineInfo:
name: make-language-model-pipeline
root:
dag:
outputs:
artifacts:
Output:
artifactSelectors:
- outputArtifactKey: Output
producerSubtask: make-language-model
tasks:
importer:
cachingOptions:
enableCache: true
componentRef:
name: comp-importer
inputs:
parameters:
uri:
runtimeValue:
constant: gs://ml-pipeline-playground/shakespeare1.txt
taskInfo:
name: importer
make-language-model:
cachingOptions:
enableCache: true
componentRef:
name: comp-make-language-model
dependentTasks:
- importer
inputs:
artifacts:
text_dataset:
taskOutputArtifact:
outputArtifactKey: artifact
producerTask: importer
taskInfo:
name: make-language-model
outputDefinitions:
artifacts:
Output:
artifactType:
schemaTitle: system.Model
schemaVersion: 0.0.1
schemaVersion: 2.1.0
sdkVersion: kfp-2.2.0


@ -0,0 +1,52 @@
# Copyright 2023 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from kfp import dsl
from kfp.dsl import Dataset
@dsl.component
def make_dataset(text: str) -> Dataset:
dataset = Dataset(uri=dsl.get_uri(), metadata={'length': len(text)})
with open(dataset.path, 'w') as f:
f.write(text)
return dataset
@dsl.component
def join_datasets(datasets: List[Dataset]) -> Dataset:
texts = []
for dataset in datasets:
with open(dataset.path, 'r') as f:
texts.append(f.read())
return ''.join(texts)
@dsl.pipeline
def make_and_join_datasets(
texts: List[str] = ['Hello', ',', ' ', 'world!']) -> Dataset:
with dsl.ParallelFor(texts) as text:
t1 = make_dataset(text=text)
return join_datasets(datasets=dsl.Collected(t1.output)).output
if __name__ == '__main__':
from kfp import compiler
compiler.Compiler().compile(
pipeline_func=make_and_join_datasets,
package_path=__file__.replace('.py', '.yaml'))
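
(Illustrative only, not part of this commit: the List[Dataset] parameter above should compile to the same artifact-list input as the pre-existing Input[...] annotation style, sketched below with a hypothetical component name for comparison.)

from typing import List

from kfp import dsl
from kfp.dsl import Dataset, Input


@dsl.component
def join_datasets_traditional(datasets: Input[List[Dataset]]) -> str:
    texts = []
    for dataset in datasets:
        # Each materialized dataset is read from its local path.
        with open(dataset.path, 'r') as f:
            texts.append(f.read())
    return ''.join(texts)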


@ -0,0 +1,187 @@
# PIPELINE DEFINITION
# Name: make-and-join-datasets
# Inputs:
# texts: list [Default: ['Hello', ',', ' ', 'world!']]
# Outputs:
# Output: system.Dataset
components:
comp-for-loop-1:
dag:
outputs:
artifacts:
pipelinechannel--make-dataset-Output:
artifactSelectors:
- outputArtifactKey: Output
producerSubtask: make-dataset
tasks:
make-dataset:
cachingOptions:
enableCache: true
componentRef:
name: comp-make-dataset
inputs:
parameters:
text:
componentInputParameter: pipelinechannel--texts-loop-item
taskInfo:
name: make-dataset
inputDefinitions:
parameters:
pipelinechannel--texts:
parameterType: LIST
pipelinechannel--texts-loop-item:
parameterType: STRING
outputDefinitions:
artifacts:
pipelinechannel--make-dataset-Output:
artifactType:
schemaTitle: system.Dataset
schemaVersion: 0.0.1
isArtifactList: true
comp-join-datasets:
executorLabel: exec-join-datasets
inputDefinitions:
artifacts:
datasets:
artifactType:
schemaTitle: system.Dataset
schemaVersion: 0.0.1
isArtifactList: true
outputDefinitions:
artifacts:
Output:
artifactType:
schemaTitle: system.Dataset
schemaVersion: 0.0.1
comp-make-dataset:
executorLabel: exec-make-dataset
inputDefinitions:
parameters:
text:
parameterType: STRING
outputDefinitions:
artifacts:
Output:
artifactType:
schemaTitle: system.Dataset
schemaVersion: 0.0.1
deploymentSpec:
executors:
exec-join-datasets:
container:
args:
- --executor_input
- '{{$}}'
- --function_to_execute
- join_datasets
command:
- sh
- -c
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.2.0'\
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' && \"\
$0\" \"$@\"\n"
- sh
- -ec
- 'program_path=$(mktemp -d)
printf "%s" "$0" > "$program_path/ephemeral_component.py"
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef join_datasets(datasets: List[Dataset]) -> Dataset:\n texts\
\ = []\n for dataset in datasets:\n with open(dataset.path, 'r')\
\ as f:\n texts.append(f.read())\n\n return ''.join(texts)\n\
\n"
image: python:3.7
exec-make-dataset:
container:
args:
- --executor_input
- '{{$}}'
- --function_to_execute
- make_dataset
command:
- sh
- -c
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.2.0'\
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' && \"\
$0\" \"$@\"\n"
- sh
- -ec
- 'program_path=$(mktemp -d)
printf "%s" "$0" > "$program_path/ephemeral_component.py"
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef make_dataset(text: str) -> Dataset:\n dataset = Dataset(uri=dsl.get_uri(),\
\ metadata={'length': len(text)})\n with open(dataset.path, 'w') as f:\n\
\ f.write(text)\n return dataset\n\n"
image: python:3.7
pipelineInfo:
name: make-and-join-datasets
root:
dag:
outputs:
artifacts:
Output:
artifactSelectors:
- outputArtifactKey: Output
producerSubtask: join-datasets
tasks:
for-loop-1:
componentRef:
name: comp-for-loop-1
inputs:
parameters:
pipelinechannel--texts:
componentInputParameter: texts
parameterIterator:
itemInput: pipelinechannel--texts-loop-item
items:
inputParameter: pipelinechannel--texts
taskInfo:
name: for-loop-1
join-datasets:
cachingOptions:
enableCache: true
componentRef:
name: comp-join-datasets
dependentTasks:
- for-loop-1
inputs:
artifacts:
datasets:
taskOutputArtifact:
outputArtifactKey: pipelinechannel--make-dataset-Output
producerTask: for-loop-1
taskInfo:
name: join-datasets
inputDefinitions:
parameters:
texts:
defaultValue:
- Hello
- ','
- ' '
- world!
isOptional: true
parameterType: LIST
outputDefinitions:
artifacts:
Output:
artifactType:
schemaTitle: system.Dataset
schemaVersion: 0.0.1
schemaVersion: 2.1.0
sdkVersion: kfp-2.2.0


@ -0,0 +1,93 @@
# Copyright 2023 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import NamedTuple
from kfp import dsl
from kfp.dsl import Artifact
from kfp.dsl import Dataset
@dsl.component
def dataset_splitter(
in_dataset: Dataset
) -> NamedTuple(
'outputs',
dataset1=Dataset,
dataset2=Dataset,
):
with open(in_dataset.path) as f:
in_data = f.read()
out_data1, out_data2 = in_data[:len(in_data) // 2], in_data[len(in_data) //
2:]
dataset1 = Dataset(
uri=dsl.get_uri(suffix='dataset1'),
metadata={'original_data': in_dataset.name},
)
with open(dataset1.path, 'w') as f:
f.write(out_data1)
dataset2 = Dataset(
uri=dsl.get_uri(suffix='dataset2'),
metadata={'original_data': in_dataset.name},
)
with open(dataset2.path, 'w') as f:
f.write(out_data2)
outputs = NamedTuple(
'outputs',
dataset1=Dataset,
dataset2=Dataset,
)
return outputs(dataset1=dataset1, dataset2=dataset2)
outputs = NamedTuple(
'outputs',
dataset1=Dataset,
dataset2=Dataset,
)
@dsl.pipeline
def splitter_pipeline(in_dataset: Dataset) -> outputs:
task = dataset_splitter(in_dataset=in_dataset)
return outputs(
task.outputs['dataset1'],
task.outputs['dataset1'],
)
@dsl.component
def make_dataset() -> Artifact:
artifact = Artifact(uri=dsl.get_uri('dataset'))
with open(artifact.path, 'w') as f:
f.write('Hello, world')
return artifact
@dsl.pipeline
def split_datasets_and_return_first() -> Dataset:
t1 = make_dataset()
return splitter_pipeline(in_dataset=t1.output).outputs['dataset1']
if __name__ == '__main__':
from kfp import compiler
compiler.Compiler().compile(
pipeline_func=split_datasets_and_return_first,
package_path=__file__.replace('.py', '.yaml'))
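
(Illustrative only, not part of this commit: for comparison, the same split written in the pre-existing Output[...] parameter style, where the runtime pre-creates the output artifacts instead of the function constructing and returning them. The component name is hypothetical.)

from kfp import dsl
from kfp.dsl import Dataset, Input, Output


@dsl.component
def dataset_splitter_traditional(
    in_dataset: Input[Dataset],
    dataset1: Output[Dataset],
    dataset2: Output[Dataset],
):
    with open(in_dataset.path) as f:
        in_data = f.read()
    half = len(in_data) // 2
    # Write each half to the paths backing the declared output artifacts.
    with open(dataset1.path, 'w') as f:
        f.write(in_data[:half])
    with open(dataset2.path, 'w') as f:
        f.write(in_data[half:])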


@ -0,0 +1,184 @@
# PIPELINE DEFINITION
# Name: split-datasets-and-return-first
# Outputs:
# Output: system.Dataset
components:
comp-dataset-splitter:
executorLabel: exec-dataset-splitter
inputDefinitions:
artifacts:
in_dataset:
artifactType:
schemaTitle: system.Dataset
schemaVersion: 0.0.1
outputDefinitions:
artifacts:
dataset1:
artifactType:
schemaTitle: system.Dataset
schemaVersion: 0.0.1
dataset2:
artifactType:
schemaTitle: system.Dataset
schemaVersion: 0.0.1
comp-make-dataset:
executorLabel: exec-make-dataset
outputDefinitions:
artifacts:
Output:
artifactType:
schemaTitle: system.Artifact
schemaVersion: 0.0.1
comp-splitter-pipeline:
dag:
outputs:
artifacts:
dataset1:
artifactSelectors:
- outputArtifactKey: dataset1
producerSubtask: dataset-splitter
dataset2:
artifactSelectors:
- outputArtifactKey: dataset1
producerSubtask: dataset-splitter
tasks:
dataset-splitter:
cachingOptions:
enableCache: true
componentRef:
name: comp-dataset-splitter
inputs:
artifacts:
in_dataset:
componentInputArtifact: in_dataset
taskInfo:
name: dataset-splitter
inputDefinitions:
artifacts:
in_dataset:
artifactType:
schemaTitle: system.Dataset
schemaVersion: 0.0.1
outputDefinitions:
artifacts:
dataset1:
artifactType:
schemaTitle: system.Dataset
schemaVersion: 0.0.1
dataset2:
artifactType:
schemaTitle: system.Dataset
schemaVersion: 0.0.1
deploymentSpec:
executors:
exec-dataset-splitter:
container:
args:
- --executor_input
- '{{$}}'
- --function_to_execute
- dataset_splitter
command:
- sh
- -c
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.2.0'\
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' && \"\
$0\" \"$@\"\n"
- sh
- -ec
- 'program_path=$(mktemp -d)
printf "%s" "$0" > "$program_path/ephemeral_component.py"
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef dataset_splitter(\n in_dataset: Dataset\n) -> NamedTuple(\n\
\ 'outputs',\n dataset1=Dataset,\n dataset2=Dataset,\n\
):\n\n with open(in_dataset.path) as f:\n in_data = f.read()\n\
\n out_data1, out_data2 = in_data[:len(in_data) // 2], in_data[len(in_data)\
\ //\n 2:]\n\
\n dataset1 = Dataset(\n uri=dsl.get_uri(suffix='dataset1'),\n\
\ metadata={'original_data': in_dataset.name},\n )\n with open(dataset1.path,\
\ 'w') as f:\n f.write(out_data1)\n\n dataset2 = Dataset(\n \
\ uri=dsl.get_uri(suffix='dataset2'),\n metadata={'original_data':\
\ in_dataset.name},\n )\n with open(dataset2.path, 'w') as f:\n \
\ f.write(out_data2)\n\n outputs = NamedTuple(\n 'outputs',\n\
\ dataset1=Dataset,\n dataset2=Dataset,\n )\n return\
\ outputs(dataset1=dataset1, dataset2=dataset2)\n\n"
image: python:3.7
exec-make-dataset:
container:
args:
- --executor_input
- '{{$}}'
- --function_to_execute
- make_dataset
command:
- sh
- -c
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.2.0'\
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' && \"\
$0\" \"$@\"\n"
- sh
- -ec
- 'program_path=$(mktemp -d)
printf "%s" "$0" > "$program_path/ephemeral_component.py"
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef make_dataset() -> Artifact:\n artifact = Artifact(uri=dsl.get_uri('dataset'))\n\
\ with open(artifact.path, 'w') as f:\n f.write('Hello, world')\n\
\ return artifact\n\n"
image: python:3.7
pipelineInfo:
name: split-datasets-and-return-first
root:
dag:
outputs:
artifacts:
Output:
artifactSelectors:
- outputArtifactKey: dataset1
producerSubtask: splitter-pipeline
tasks:
make-dataset:
cachingOptions:
enableCache: true
componentRef:
name: comp-make-dataset
taskInfo:
name: make-dataset
splitter-pipeline:
cachingOptions:
enableCache: true
componentRef:
name: comp-splitter-pipeline
dependentTasks:
- make-dataset
inputs:
artifacts:
in_dataset:
taskOutputArtifact:
outputArtifactKey: Output
producerTask: make-dataset
taskInfo:
name: splitter-pipeline
outputDefinitions:
artifacts:
Output:
artifactType:
schemaTitle: system.Dataset
schemaVersion: 0.0.1
schemaVersion: 2.1.0
sdkVersion: kfp-2.2.0


@ -180,6 +180,15 @@ pipelines:
- module: if_elif_else_with_oneof_parameters
name: outer_pipeline
execute: false
- module: pythonic_artifact_with_single_return
name: make_language_model_pipeline
execute: false
- module: pythonic_artifacts_with_multiple_returns
name: split_datasets_and_return_first
execute: false
- module: pythonic_artifacts_with_list_of_artifacts
name: make_and_join_datasets
execute: false
components:
test_data_dir: sdk/python/test_data/components
read: true


@ -1,4 +1,3 @@
sdk/python
pytest==7.1.3
pytest-asyncio-cooperative==0.28.0
pytest-mock==3.8.2


@ -65,7 +65,9 @@ def create_test_case_parameters() -> List[TestCase]:
return parameters
def wait(run_result: client.client.RunPipelineResult) -> kfp_server_api.V2beta1Run:
def wait(
run_result: client.client.RunPipelineResult
) -> kfp_server_api.V2beta1Run:
return kfp_client.wait_for_run_completion(
run_id=run_result.run_id, timeout=int(TIMEOUT_SECONDS))
@ -104,16 +106,14 @@ def get_kfp_package_path() -> str:
return path
partial_component_decorator = functools.partial(
dsl.component = functools.partial(
dsl.component, kfp_package_path=get_kfp_package_path())
@pytest.mark.asyncio_cooperative
@pytest.mark.parametrize('test_case', create_test_case_parameters())
async def test(test_case: TestCase, mocker) -> None:
async def test(test_case: TestCase) -> None:
"""Asynchronously runs all samples and test that they succeed."""
mocker.patch.object(dsl, 'component', partial_component_decorator)
event_loop = asyncio.get_running_loop()
try:
run_url, run_result = run(test_case)
@ -123,3 +123,7 @@ async def test(test_case: TestCase, mocker) -> None:
api_run = await event_loop.run_in_executor(None, wait, run_result)
assert api_run.state == 'SUCCEEDED', f'Pipeline {test_case.name} ended with incorrect status: {api_run.state}. More info: {run_url}'
if __name__ == '__main__':
pytest.main()