feat(sdk): Refactor v2 component building (#5163)

* add executor input placeholder and remove unused entrypoint function

* add executor input generator

* add tests and fixes

* add comments for v2 component args

* refactor output metadata path placeholder

* fix test and change yaml dumping

* fix tests
This commit is contained in:
Jiaxiao Zheng 2021-02-24 07:45:14 +08:00 committed by GitHub
parent 1fee4054a7
commit d3a7fbf1ca
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 112 additions and 305 deletions

View File

@ -210,7 +210,9 @@ PRODUCER_POD_NAME_PARAMETER = '{}-producer-pod-id-'
# Format of the input output port name placeholder.
INPUT_OUTPUT_NAME_PATTERN = '{{{{kfp.input-output-name.{}}}}}'
# Fixed name for per-task output metadata json file.
OUTPUT_METADATA_JSON = 'executor_output.json'
OUTPUT_METADATA_JSON = '/tmp/outputs/executor_output.json'
# Executor input placeholder.
_EXECUTOR_INPUT_PLACEHOLDER = '{{$}}'
def _generate_output_uri(port_name: str) -> str:
@ -249,11 +251,7 @@ def _generate_input_uri(port_name: str) -> str:
def _generate_output_metadata_path() -> str:
"""Generates the URI to write the output metadata JSON file."""
return str(pathlib.PurePosixPath(
OUTPUT_DIR_PLACEHOLDER,
RUN_ID_PLACEHOLDER,
OUTPUT_METADATA_JSON
))
return OUTPUT_METADATA_JSON
def _generate_input_metadata_path(port_name: str) -> str:
@ -278,6 +276,11 @@ def _generate_input_output_name(port_name: str) -> str:
return INPUT_OUTPUT_NAME_PATTERN.format(port_name)
def _generate_executor_input() -> str:
"""Generates the placeholder for serialized executor input."""
return _EXECUTOR_INPUT_PLACEHOLDER
def _react_to_incompatible_reference_type(
input_type,
argument_type,
@ -494,6 +497,7 @@ def _resolve_command_line_and_paths(
[], str] = _generate_output_metadata_path,
input_output_name_generator: Callable[
[str], str] = _generate_input_output_name,
executor_input_generator: Callable[[], str] = _generate_executor_input,
) -> _ResolvedCommandLineAndPaths:
"""Resolves the command line argument placeholders. Also produces the maps of the generated inpuit/output paths."""
argument_values = arguments
@ -521,7 +525,8 @@ def _resolve_command_line_and_paths(
return None
if isinstance(arg, (str, int, float, bool)):
return str(arg)
if isinstance(arg, ExecutorInputPlaceholder):
return executor_input_generator()
if isinstance(arg, InputValuePlaceholder):
input_name = arg.input_name
input_spec = inputs_dict[input_name]

View File

@ -24,6 +24,7 @@ __all__ = [
'InputMetadataPlaceholder',
'InputOutputPortNamePlaceholder',
'OutputMetadataPlaceholder',
'ExecutorInputPlaceholder',
'ConcatPlaceholder',
'IsPresentPlaceholder',
'IfPlaceholderStructure',
@ -208,12 +209,44 @@ class OutputMetadataPlaceholder(ModelBase): # Non-standard attr names
Only supported in v2 components.
"""
_serialized_names = {
'output_name': 'outputMetadata',
'output_metadata': 'outputMetadata',
}
def __init__(self, output_name):
def __init__(self, output_metadata: type(None) = None):
if output_metadata:
raise RuntimeError(
'Output metadata placeholder cannot be associated with key')
super().__init__(locals())
def to_dict(self) -> Mapping[str, Any]:
# Override parent implementation. Otherwise it always returns {}.
return {'outputMetadata': None}
class ExecutorInputPlaceholder(ModelBase): # Non-standard attr names
"""Represents the serialized ExecutorInput message at runtime.
This placeholder will be replaced by a serialized
[ExecutorInput](https://github.com/kubeflow/pipelines/blob/61f9c2c328d245d89c9d9b8c923f24dbbd08cdc9/api/v2alpha1/pipeline_spec.proto#L730)
proto message at runtime, which includes parameters of the task, artifact
URIs and metadata.
"""
_serialized_names = {
'executor_input': 'executorInput',
}
def __init__(self, executor_input: type(None) = None):
if executor_input:
raise RuntimeError(
'Executor input placeholder cannot be associated with input key'
'. Got %s' % executor_input)
super().__init__(locals())
def to_dict(self) -> Mapping[str, Any]:
# Override parent implementation. Otherwise it always returns {}.
return {'executorInput': None}
CommandlineArgumentType = Union[
str,
@ -225,6 +258,7 @@ CommandlineArgumentType = Union[
InputMetadataPlaceholder,
InputOutputPortNamePlaceholder,
OutputMetadataPlaceholder,
ExecutorInputPlaceholder,
'ConcatPlaceholder',
'IfPlaceholder',
]
@ -365,7 +399,8 @@ class ComponentSpec(ModelBase):
if arg is None:
pass
elif isinstance(
arg, (str, int, float, bool, OutputMetadataPlaceholder)):
arg, (str, int, float, bool,
OutputMetadataPlaceholder, ExecutorInputPlaceholder)):
pass
elif isinstance(arg, list):
for arg2 in arg:

View File

@ -37,6 +37,16 @@ def dump_yaml(data):
class OrderedDumper(Dumper):
pass
def _dict_representer(dumper, data):
# Special-case executorInput and outputMetadata to make the output
# YAML prettier.
if data == {'executorInput': None}:
return dumper.represent_scalar(
value='{executorInput}',
tag=yaml.resolver.BaseResolver.DEFAULT_SCALAR_TAG)
if data == {'outputMetadata': None}:
return dumper.represent_scalar(
value='{outputMetadata}',
tag=yaml.resolver.BaseResolver.DEFAULT_SCALAR_TAG)
return dumper.represent_mapping(
yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG,
data.items())

View File

@ -755,50 +755,6 @@ implementation:
resolved_cmd.command
)
def test_metadata_placeholder_resolving(self):
component_text = textwrap.dedent("""\
name: Example function
inputs:
- {name: a, type: Dataset}
- {name: c, type: String}
outputs:
- {name: b, type: Model}
implementation:
container:
image: python:3.7
command:
- python3
- -u
args:
- --a
- {inputMetadata: a}
- --c
- {inputValue: c}
- --b
- {inputOutputPortName: a}
- --metadata-location
- {outputMetadata}
""")
op = comp.load_component_from_text(text=component_text)
task = op(a='foo', c='bar')
resolved_cmd = _resolve_command_line_and_paths(
component_spec=task.component_ref.spec,
arguments=task.arguments
)
self.assertEqual(
['--a',
'{{pipelineparam:op=;name=pipeline-output-directory}}/{{kfp.run_uid}}/{{inputs.parameters.a-producer-pod-id-}}/executor_output.json',
'--c',
'bar',
'--b',
'{{kfp.input-output-name.a}}',
'--metadata-location',
'{{pipelineparam:op=;name=pipeline-output-directory}}/{{kfp.run_uid}}/executor_output.json'],
resolved_cmd.args
)
def test_check_type_validation_of_task_spec_outputs(self):
producer_component_text = '''\
outputs:
@ -1123,6 +1079,41 @@ implementation:
with self.assertRaises(TypeError):
b_task = task_factory_b(in1=a_task.outputs['out1'])
def test_convert_executor_input_and_output_metadata_placeholder(self):
test_component = textwrap.dedent("""\
inputs:
- {name: in1}
outputs:
- {name: out1}
implementation:
container:
image: busybox
command: [echo, {executorInput}, {outputMetadata}]
""")
task_factory = comp.load_component_from_text(test_component)
task = task_factory(in1='foo')
resolved_cmd = _resolve_command_line_and_paths(
component_spec=task.component_ref.spec,
arguments=task.arguments
)
self.assertListEqual(
['echo', '{{$}}', '/tmp/outputs/executor_output.json'],
resolved_cmd.command)
def test_fail_executor_input_with_key(self):
test_component = textwrap.dedent("""\
inputs:
- {name: in1}
outputs:
- {name: out1}
implementation:
container:
image: busybox
command: [echo, {executorInput: a_bad_key}]
""")
with self.assertRaises(TypeError):
_ = comp.load_component_from_text(test_component)
if __name__ == '__main__':
unittest.main()

View File

@ -314,78 +314,21 @@ def build_python_component(
# Override user program args for new-styled component.
# TODO: The actual program args will be changed after we support v2
# component on KFP.
program_args = []
for component_input in component_spec.inputs or []:
if component_input._passing_style == components.InputArtifact:
# For each input artifact, there'll be possibly 3 arguments passed to
# the user program:
# 1. {name of the artifact}_input_path: The actual path, or uri, of the
# input artifact.
# 2. {name of the artifact}_input_pod_name: The pod ID of the producer.
# 3. {name of the artifact}_input_output_name: The output name of the
# artifact, by which the artifact can be found in the producer
# metadata JSON file.
program_args.append('--{}{}'.format(
component_input.name,
entrypoint.INPUT_URI_SUFFIX
))
program_args.append(
_structures.InputUriPlaceholder(
input_name=component_input.name))
program_args.append('--{}{}'.format(
component_input.name,
entrypoint.PRODUCER_POD_ID_SUFFIX
))
program_args.append(
'{{{{inputs.parameters.{input}}}}}'.format(
input=_components.PRODUCER_POD_NAME_PARAMETER.format(
component_input.name)))
# TODO(numerology): Consider removing the need of output name
# placeholder by letting v2 component output two metadata files per
# output.
program_args.append('--{}{}'.format(
component_input.name,
entrypoint.OUTPUT_NAME_SUFFIX
))
program_args.append(_structures.InputOutputPortNamePlaceholder(
input_name=component_input.name))
elif component_input._passing_style is None:
program_args.append('--{}{}'.format(
component_input.name,
entrypoint.ARGO_PARAM_SUFFIX
))
program_args.append(_structures.InputValuePlaceholder(
input_name=component_input.name))
else:
raise TypeError(
'Only Input/OutputArtifact and parameter annotations '
'are supported in V2 components. '
'Got %s' % component_input._passing_style)
for component_output in component_spec.outputs or []:
if component_output._passing_style == components.OutputArtifact:
# For each output artifact, there'll be one argument passed to
# the user program:
# - {name of the artifact}_output_path: The actual path, or uri, of the
# output artifact.
program_args.append('--{}{}'.format(
component_output.name,
entrypoint.OUTPUT_ARTIFACT_PATH_SUFFIX
))
program_args.append(
_structures.OutputUriPlaceholder(
output_name=component_output.name))
elif component_output._passing_style is not None:
raise TypeError(
'Only Input/OutputArtifact and parameter annotations '
'are supported in V2 components. '
'Got %s' % component_output._passing_style)
program_args.append('--pipeline_context')
program_args.append(dsl.RUN_ID_PLACEHOLDER)
program_args.append('--{}'.format(entrypoint.FN_NAME_ARG))
program_args.append(component_func.__name__)
# For v2 component, the received command line args are fixed as follows:
# --executor_input_str
# {Executor input pb message at runtime}
# --function_name
# {The name of user defined function}
# --output_metadata_path
# {The place to write output metadata JSON file}
program_args = [
'--executor_input_str',
_structures.ExecutorInputPlaceholder(),
'--{}'.format(entrypoint.FN_NAME_ARG),
component_func.__name__,
'--output_metadata_path',
_structures.OutputMetadataPlaceholder()
]
component_spec.implementation.container.args = program_args
else:

View File

@ -231,169 +231,5 @@ def main(
output_metadata_path=output_metadata_path)
def main_2(**kwargs):
"""Container entrypoint used by KFP Python function based component.
This function has a dynamic signature, which will be interpreted according to
the I/O and data-passing contract of KFP Python function components. The
parameters will be received from the command line interface.
For each declared parameter input of the user function, three command line
arguments will be recognized:
1. {name of the parameter}_input_param_metadata_file: The metadata JSON file
path output by the producer.
2. {name of the parameter}_input_field_name: The output name of the parameter,
by which the parameter can be found in the producer metadata JSON file.
3. {name of the parameter}_input_argo_param: The actual runtime value of the
input parameter.
When the producer is a new-styled KFP Python component, 1 and 2 will be
populated, and when it's a conventional KFP Python component, 3 will be in
use.
For each declared artifact input of the user function, three command line args
will be recognized:
1. {name of the artifact}_input_path: The actual path, or uri, of the input
artifact.
2. {name of the artifact}_input_artifact_metadata_file: The metadata JSON file
path output by the producer.
3. {name of the artifact}_input_output_name: The output name of the artifact,
by which the artifact can be found in the producer metadata JSON file.
If the producer is a new-styled KFP Python component, 2+3 will be used to give
user code access to MLMD (custom) properties associated with this artifact;
if the producer is a conventional KFP Python component, 1 will be used to
construct an Artifact with only the URI populated.
For each declared artifact or parameter output of the user function, a command
line arg, namely, `{name of the artifact|parameter}_(artifact|parameter)_output_path`,
will be passed to specify the location where the output content is written to.
In addition, `executor_metadata_json_file` specifies the location where the
output metadata JSON file will be written.
"""
if METADATA_FILE_ARG not in kwargs:
raise RuntimeError('Must specify executor_metadata_json_file')
# Group arguments according to suffixes.
input_params_metadata = {}
input_params_field_name = {}
input_params_value = {}
input_artifacts_metadata = {}
input_artifacts_uri = {}
input_artifacts_output_name = {}
output_artifacts_uri = {}
output_params_path = {}
for k, v in kwargs.items():
if k.endswith(PARAM_METADATA_SUFFIX):
param_name = k[:-len(PARAM_METADATA_SUFFIX)]
input_params_metadata[param_name] = v
elif k.endswith(FIELD_NAME_SUFFIX):
param_name = k[:-len(FIELD_NAME_SUFFIX)]
input_params_field_name[param_name] = v
elif k.endswith(ARGO_PARAM_SUFFIX):
param_name = k[:-len(ARGO_PARAM_SUFFIX)]
input_params_value[param_name] = v
elif k.endswith(ARTIFACT_METADATA_SUFFIX):
artifact_name = k[:-len(ARTIFACT_METADATA_SUFFIX)]
input_artifacts_metadata[artifact_name] = v
elif k.endswith(INPUT_URI_SUFFIX):
artifact_name = k[:-len(INPUT_URI_SUFFIX)]
input_artifacts_uri[artifact_name] = v
elif k.endswith(OUTPUT_NAME_SUFFIX):
artifact_name = k[:-len(OUTPUT_NAME_SUFFIX)]
input_artifacts_output_name[artifact_name] = v
elif k.endswith(OUTPUT_PARAM_PATH_SUFFIX):
param_name = k[:-len(OUTPUT_PARAM_PATH_SUFFIX)]
output_params_path[param_name] = v
elif k.endswith(OUTPUT_ARTIFACT_PATH_SUFFIX):
artifact_name = k[:-len(OUTPUT_ARTIFACT_PATH_SUFFIX)]
output_artifacts_uri[artifact_name] = v
elif k not in (METADATA_FILE_ARG, FN_NAME_ARG):
logging.warning(
'Got unexpected command line argument: %s=%s Ignoring', k, v)
# Instantiate POD objects.
input_params = {}
for param_name in (
input_params_value.keys() |
input_params_field_name.keys() | input_params_metadata.keys()):
input_param = InputParam(
value=input_params_value.get(param_name),
metadata_file=input_params_metadata.get(param_name),
field_name=input_params_field_name.get(param_name))
input_params[param_name] = input_param
input_artifacts = {}
for artifact_name in (
input_artifacts_uri.keys() |
input_artifacts_metadata.keys() |
input_artifacts_output_name.keys()
):
input_artifact = InputArtifact(
uri=input_artifacts_uri.get(artifact_name),
metadata_file=input_artifacts_metadata.get(artifact_name),
output_name=input_artifacts_output_name.get(artifact_name))
input_artifacts[artifact_name] = input_artifact
# Import and invoke the user-provided function.
# Currently the actual user code is built into container as /ml/main.py
# which is specified in
# kfp.containers._component_builder.build_python_component.
# Also, determine a way to inspect the function signature to decide the type
# of output artifacts.
fn_name = kwargs[FN_NAME_ARG]
fn = entrypoint_utils.import_func_from_source(FN_SOURCE, fn_name)
# Get the output artifacts and combine them with the provided URIs.
output_artifacts = entrypoint_utils.get_output_artifacts(
fn, output_artifacts_uri)
invoking_kwargs = {}
for k, v in output_artifacts.items():
invoking_kwargs[k] = v
for k, v in input_params.items():
invoking_kwargs[k] = v.value
for k, v in input_artifacts.items():
invoking_kwargs[k] = v.get_artifact()
# Execute the user function. fn_res is expected to contain output parameters
# only. It's either a namedtuple or a single primitive value.
fn_res = fn(**invoking_kwargs)
if isinstance(fn_res, (int, float, str)) and len(output_params_path) != 1:
raise RuntimeError('For primitive output a single output param path is '
'expected. Got %s' % output_params_path)
if isinstance(fn_res, (int, float, str)):
output_name = list(output_params_path.keys())[0]
# Write the output to the provided path.
_gcs_helper.GCSHelper.write_to_gcs_path(
path=output_params_path[output_name],
content=str(fn_res))
else:
# When multiple outputs, we'll need to match each field to the output paths.
for idx, output_name in enumerate(fn_res._fields):
path = output_params_path[output_name]
_gcs_helper.GCSHelper.write_to_gcs_path(
path=path,
content=str(fn_res[idx]))
# Write output metadata JSON file.
output_parameters = {}
if isinstance(fn_res, (int, float, str)):
output_parameters['output'] = fn_res
else:
for idx, output_name in enumerate(fn_res._fields):
output_parameters[output_name] = fn_res[idx]
executor_output = entrypoint_utils.get_executor_output(
output_artifacts=output_artifacts,
output_params=output_parameters)
_gcs_helper.GCSHelper.write_to_gcs_path(
path=kwargs[METADATA_FILE_ARG],
content=json_format.MessageToJson(executor_output))
if __name__ == '__main__':
fire.Fire(main)

View File

@ -11,18 +11,5 @@ implementation:
container:
image: gcr.io/my-project/my-image:123456
command: [python, -m, kfp.containers.entrypoint]
args:
- --test_param_input_argo_param
- {inputValue: test_param}
- --test_artifact_input_uri
- {inputUri: test_artifact}
- --test_artifact_pod_id
- '{{inputs.parameters.test_artifact-producer-pod-id-}}'
- --test_artifact_input_output_name
- {inputOutputPortName: test_artifact}
- --test_output_artifact_output_uri
- {outputUri: test_output}
- --pipeline_context
- '{{workflow.uid}}'
- --function_name
- test_function
args: [--executor_input_str, '{executorInput}', --function_name, test_function,
--output_metadata_path, '{outputMetadata}']