# Copyright 2019 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import copy
import json
import re
import warnings
from collections import OrderedDict
from typing import Union, List, Any, Callable, TypeVar, Dict

import yaml

from kfp.deprecated import dsl
from kfp.deprecated.compiler._k8s_helper import convert_k8s_obj_to_json
from kfp.deprecated.dsl._container_op import BaseOp

# generics
T = TypeVar('T')


def _process_obj(obj: Any, map_to_tmpl_var: dict):
    """Recursively sanitize and replace any PipelineParam (instances and
    serialized strings) in the object with the corresponding template
    variables (i.e. '{{inputs.parameters.<param_name>}}').

    Args:
        obj: any object that may contain PipelineParams
        map_to_tmpl_var: a dict that maps an unsanitized pipeline
            param signature to a template var
    """
    # serialized str might be unsanitized
    if isinstance(obj, str):
        # get signature
        param_tuples = dsl.match_serialized_pipelineparam(obj)
        if not param_tuples:
            return obj
        # replace all unsanitized signatures with template vars
        for param_tuple in param_tuples:
            obj = re.sub(param_tuple.pattern,
                         map_to_tmpl_var[param_tuple.pattern], obj)

    # list
    if isinstance(obj, list):
        return [_process_obj(item, map_to_tmpl_var) for item in obj]

    # tuple
    if isinstance(obj, tuple):
        return tuple((_process_obj(item, map_to_tmpl_var) for item in obj))

    # dict
    if isinstance(obj, dict):
        return {
            _process_obj(key, map_to_tmpl_var):
                _process_obj(value, map_to_tmpl_var)
            for key, value in obj.items()
        }

    # pipelineparam
    if isinstance(obj, dsl.PipelineParam):
        # if not found in the unsanitized map, it is likely already sanitized
        return map_to_tmpl_var.get(
            str(obj), '{{inputs.parameters.%s}}' % obj.full_name)

    # k8s objects (generated from swaggercodegen)
    if hasattr(obj, 'attribute_map') and isinstance(obj.attribute_map, dict):
        # process everything inside recursively
        for key in obj.attribute_map.keys():
            setattr(obj, key, _process_obj(getattr(obj, key), map_to_tmpl_var))
        # return json representation of the k8s obj
        return convert_k8s_obj_to_json(obj)

    # do nothing
    return obj
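# Illustrative sketch of what _process_obj does to a serialized param string
# ('echo' and 'msg' are hypothetical op/param names). _process_base_ops below
# builds the map from each input's serialized signature to its template var:
#
#   map_ = {'{{pipelineparam:op=echo;name=msg}}':
#           '{{inputs.parameters.echo-msg}}'}
#   _process_obj('say {{pipelineparam:op=echo;name=msg}}', map_)
#   # -> 'say {{inputs.parameters.echo-msg}}'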
def _process_base_ops(op: BaseOp):
    """Recursively go through the attrs listed in `attrs_with_pipelineparams`
    and sanitize and replace pipeline params with template var strings.

    Returns a processed `BaseOp`.

    NOTE this is an in-place update to `BaseOp`'s attributes (i.e. the ones
    specified in `attrs_with_pipelineparams`; all `PipelineParam`s are
    replaced with the corresponding template variable strings).

    Args:
        op {BaseOp}: class that inherits from BaseOp

    Returns:
        BaseOp
    """

    # map param's (unsanitized pattern or serialized str pattern) -> input param var str
    map_to_tmpl_var = {
        (param.pattern or str(param)):
            '{{inputs.parameters.%s}}' % param.full_name
        for param in op.inputs
    }

    # process all attrs with PipelineParams except input and output parameters
    for key in op.attrs_with_pipelineparams:
        setattr(op, key, _process_obj(getattr(op, key), map_to_tmpl_var))

    return op


def _parameters_to_json(params: List[dsl.PipelineParam]):
    """Converts a list of PipelineParam into an argo `parameter` JSON obj."""
    _to_json = (lambda param: dict(name=param.full_name, value=param.value)
                if param.value else dict(name=param.full_name))
    params = [_to_json(param) for param in params]
    # Sort to make the results deterministic.
    params.sort(key=lambda x: x['name'])
    return params


def _inputs_to_json(
    inputs_params: List[dsl.PipelineParam],
    input_artifact_paths: Dict[str, str] = None,
    artifact_arguments: Dict[str, str] = None,
) -> Dict[str, Dict]:
    """Converts a list of PipelineParam into an argo `inputs` JSON obj."""
    parameters = _parameters_to_json(inputs_params)

    # Build the input artifacts section.
    artifacts = []
    for name, path in (input_artifact_paths or {}).items():
        artifact = {'name': name, 'path': path}
        # The arguments should be compiled as DAG task arguments, not the
        # template's default values, but in the current DSL-compiler
        # implementation it is too hard to make that work when passing
        # artifact references.
        if name in (artifact_arguments or {}):
            artifact['raw'] = {'data': str(artifact_arguments[name])}
        artifacts.append(artifact)
    # Stabilize the input artifact ordering.
    artifacts.sort(key=lambda x: x['name'])

    inputs_dict = {}
    if parameters:
        inputs_dict['parameters'] = parameters
    if artifacts:
        inputs_dict['artifacts'] = artifacts
    return inputs_dict


def _outputs_to_json(op: BaseOp, outputs: Dict[str, dsl.PipelineParam],
                     param_outputs: Dict[str, str],
                     output_artifacts: List[dict]):
    """Creates an argo `outputs` JSON obj."""
    if isinstance(op, dsl.ResourceOp):
        value_from_key = "jsonPath"
    else:
        value_from_key = "path"
    output_parameters = []
    for param in set(outputs.values()):  # set() dedupes output references
        output_parameters.append({
            'name': param.full_name,
            'valueFrom': {
                value_from_key: param_outputs[param.name]
            }
        })
    output_parameters.sort(key=lambda x: x['name'])
    ret = {}
    if output_parameters:
        ret['parameters'] = output_parameters
    if output_artifacts:
        ret['artifacts'] = output_artifacts
    return ret
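# Worked example for the parameter/input helpers above (comments only; the
# param and artifact names are hypothetical):
#
#   _parameters_to_json(
#       [dsl.PipelineParam('b'), dsl.PipelineParam('a', value='1')])
#   # -> [{'name': 'a', 'value': '1'}, {'name': 'b'}]
#
#   _inputs_to_json([dsl.PipelineParam('msg')],
#                   input_artifact_paths={'data': '/tmp/data'},
#                   artifact_arguments={'data': 'raw text'})
#   # -> {'parameters': [{'name': 'msg'}],
#   #     'artifacts': [{'name': 'data', 'path': '/tmp/data',
#   #                    'raw': {'data': 'raw text'}}]}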
# TODO: generate argo python classes from swagger and use convert_k8s_obj_to_json??
def _op_to_template(op: BaseOp):
    """Generates a template for an operator derived from BaseOp."""

    # Display name
    if op.display_name:
        op.add_pod_annotation('pipelines.kubeflow.org/task_display_name',
                              op.display_name)

    # Caching option
    op.add_pod_label('pipelines.kubeflow.org/enable_caching',
                     str(op.enable_caching).lower())

    # NOTE in-place update to BaseOp
    # replace all PipelineParams with template var strings
    processed_op = _process_base_ops(op)

    if isinstance(op, dsl.ContainerOp):
        output_artifact_paths = OrderedDict(op.output_artifact_paths)
        # This should have been as easy as
        # output_artifact_paths.update(op.file_outputs), but _outputs_to_json
        # renames the outputs, so the same renaming must happen here to keep
        # the names consistent.
        output_artifact_paths.update(
            sorted(((param.full_name, processed_op.file_outputs[param.name])
                    for param in processed_op.outputs.values()),
                   key=lambda x: x[0]))

        output_artifacts = [{
            'name': name,
            'path': path
        } for name, path in output_artifact_paths.items()]

        # workflow template
        template = {
            'name': processed_op.name,
            'container': convert_k8s_obj_to_json(processed_op.container)
        }
    elif isinstance(op, dsl.ResourceOp):
        # no output artifacts
        output_artifacts = []

        # workflow template
        processed_op.resource["manifest"] = yaml.dump(
            convert_k8s_obj_to_json(processed_op.k8s_resource),
            default_flow_style=False)
        template = {
            'name': processed_op.name,
            'resource': convert_k8s_obj_to_json(processed_op.resource)
        }

    # inputs
    input_artifact_paths = processed_op.input_artifact_paths if isinstance(
        processed_op, dsl.ContainerOp) else None
    artifact_arguments = processed_op.artifact_arguments if isinstance(
        processed_op, dsl.ContainerOp) else None
    inputs = _inputs_to_json(processed_op.inputs, input_artifact_paths,
                             artifact_arguments)
    if inputs:
        template['inputs'] = inputs

    # outputs
    if isinstance(op, dsl.ContainerOp):
        param_outputs = processed_op.file_outputs
    elif isinstance(op, dsl.ResourceOp):
        param_outputs = processed_op.attribute_outputs
    outputs_dict = _outputs_to_json(op, processed_op.outputs, param_outputs,
                                    output_artifacts)
    if outputs_dict:
        template['outputs'] = outputs_dict

    # pod spec used for runtime container settings
    podSpecPatch = {}

    # node selector
    if processed_op.node_selector:
        copy_node_selector = copy.deepcopy(processed_op.node_selector)
        for key, value in processed_op.node_selector.items():
            if re.match('^{{inputs.parameters.*}}$', key) or re.match(
                    '^{{inputs.parameters.*}}$', value):
                if 'nodeSelector' not in podSpecPatch:
                    podSpecPatch['nodeSelector'] = {}
                podSpecPatch["nodeSelector"][key] = value
                # avoid mutating the dict while iterating over it
                del copy_node_selector[key]
        if processed_op.node_selector:
            template['nodeSelector'] = copy_node_selector

    # tolerations
    if processed_op.tolerations:
        template['tolerations'] = processed_op.tolerations

    # affinity
    if processed_op.affinity:
        template['affinity'] = convert_k8s_obj_to_json(processed_op.affinity)

    # metadata
    if processed_op.pod_annotations or processed_op.pod_labels:
        template['metadata'] = {}
        if processed_op.pod_annotations:
            template['metadata']['annotations'] = processed_op.pod_annotations
        if processed_op.pod_labels:
            template['metadata']['labels'] = processed_op.pod_labels
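    # Shape sketch for the retry handling below (illustrative values): with
    # num_retries=3, retry_policy='Always', and backoff_duration='1m', the
    # emitted block is
    #   template['retryStrategy'] == {'limit': 3, 'retryPolicy': 'Always',
    #                                 'backoff': {'duration': '1m'}}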
    # retries
    if processed_op.num_retries or processed_op.retry_policy:
        template['retryStrategy'] = {}
        if processed_op.num_retries:
            template['retryStrategy']['limit'] = processed_op.num_retries
        if processed_op.retry_policy:
            template['retryStrategy'][
                'retryPolicy'] = processed_op.retry_policy
            if not processed_op.num_retries:
                warnings.warn('retry_policy is set, but num_retries is not')
        backoff_dict = {}
        if processed_op.backoff_duration:
            backoff_dict['duration'] = processed_op.backoff_duration
        if processed_op.backoff_factor:
            backoff_dict['factor'] = processed_op.backoff_factor
        if processed_op.backoff_max_duration:
            backoff_dict['maxDuration'] = processed_op.backoff_max_duration
        if backoff_dict:
            template['retryStrategy']['backoff'] = backoff_dict

    # timeout
    if processed_op.timeout:
        template['activeDeadlineSeconds'] = processed_op.timeout

    # initContainers
    if processed_op.init_containers:
        template['initContainers'] = processed_op.init_containers

    # sidecars
    if processed_op.sidecars:
        template['sidecars'] = processed_op.sidecars

    # volumes
    if processed_op.volumes:
        template['volumes'] = [
            convert_k8s_obj_to_json(volume) for volume in processed_op.volumes
        ]
        template['volumes'].sort(key=lambda x: x['name'])

    # Runtime resource requests
    if isinstance(op, dsl.ContainerOp) and ('resources'
                                            in op.container.keys()):
        for setting, val in op.container['resources'].items():
            for resource, param in val.items():
                if (resource in [
                        'cpu', 'memory', 'amd.com/gpu', 'nvidia.com/gpu'
                ] or re.match('^{{inputs.parameters.*}}$', resource)) \
                        and re.match('^{{inputs.parameters.*}}$', str(param)):
                    if 'containers' not in podSpecPatch:
                        podSpecPatch['containers'] = [{
                            'name': 'main',
                            'resources': {}
                        }]
                    if setting not in podSpecPatch['containers'][0][
                            'resources']:
                        podSpecPatch['containers'][0]['resources'][setting] = {
                            resource: param
                        }
                    else:
                        podSpecPatch['containers'][0]['resources'][setting][
                            resource] = param
                    del template['container']['resources'][setting][resource]
                    if not template['container']['resources'][setting]:
                        del template['container']['resources'][setting]

    if isinstance(op, dsl.ContainerOp) and op._metadata and not op.is_v2:
        template.setdefault('metadata', {}).setdefault(
            'annotations',
            {})['pipelines.kubeflow.org/component_spec'] = json.dumps(
                op._metadata.to_dict(), sort_keys=True)

    if hasattr(op, '_component_ref'):
        template.setdefault('metadata', {}).setdefault(
            'annotations',
            {})['pipelines.kubeflow.org/component_ref'] = json.dumps(
                op._component_ref.to_dict(), sort_keys=True)

    if hasattr(op, '_parameter_arguments') and op._parameter_arguments:
        template.setdefault('metadata', {}).setdefault(
            'annotations',
            {})['pipelines.kubeflow.org/arguments.parameters'] = json.dumps(
                op._parameter_arguments, sort_keys=True)

    if isinstance(op, dsl.ContainerOp) and op.execution_options:
        if op.execution_options.caching_strategy.max_cache_staleness:
            template.setdefault('metadata', {}).setdefault(
                'annotations',
                {})['pipelines.kubeflow.org/max_cache_staleness'] = str(
                    op.execution_options.caching_strategy.max_cache_staleness)

    if podSpecPatch:
        template['podSpecPatch'] = json.dumps(podSpecPatch)
    return template
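if __name__ == '__main__':
    # Minimal usage sketch, not part of the compiler proper. Assumes the
    # deprecated KFP v1 SDK is importable and that the op is built inside a
    # dsl.Pipeline context, the same way the compiler tests construct ops.
    # The pipeline name, op name, image, and command are illustrative values.
    with dsl.Pipeline('op-to-template-demo'):
        echo = dsl.ContainerOp(
            name='echo',
            image='alpine:3.16',
            command=['sh', '-c', 'echo hello'],
        )
    # Print the Argo workflow template generated for the op.
    print(json.dumps(_op_to_template(echo), indent=2, sort_keys=True))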