# Component spec for creating a SageMaker TrainingJob. It declares the
# component's inputs, its outputs, and the container command that receives
# every input as a `--<name> <value>` argument pair.
# NOTE(review): appears to be a Kubeflow Pipelines component definition
# (see the `inputValue` placeholders and the image name) - confirm.
#
# The `###...GENERATED SECTION BELOW/ABOVE...###` banners delimit generated
# content. Presumably a generator rewrites everything between them, so keep
# the marker text intact and make manual edits outside the banners - TODO
# confirm against the generator.
name: "Sagemaker - TrainingJob"
description: Create TrainingJob

inputs:
  # Only input without a default, so callers must supply it.
  - name: region
    type: String
    description: "The region to use for the training job"
  ###########################GENERATED SECTION BELOW############################
  # JSON-typed inputs default to the quoted strings '{}' / '[]' so the value
  # stays a string instead of being parsed as a YAML flow collection.
  - name: algorithm_specification
    type: JsonObject
    default: '{}'
    description: "The registry path of the Docker image that contains the training algorithm and algorithm-specific metadata, including the input mode."
  - name: checkpoint_config
    type: JsonObject
    default: '{}'
    description: "Contains information about the output location for managed spot training checkpoint data."
  - name: debug_hook_config
    type: JsonObject
    default: '{}'
    description: "Configuration information for the Amazon SageMaker Debugger hook parameters, metric and tensor collections, and storage paths."
  - name: debug_rule_configurations
    type: JsonArray
    default: '[]'
    description: "Configuration information for Amazon SageMaker Debugger rules for debugging output tensors."
  # Boolean defaults use canonical lowercase `false`.
  - name: enable_inter_container_traffic_encryption
    type: Bool
    default: false
    description: "To encrypt all communications between ML compute instances in distributed training, choose True."
  - name: enable_managed_spot_training
    type: Bool
    default: false
    description: "To train models using managed spot training, choose True."
  - name: enable_network_isolation
    type: Bool
    default: false
    description: "Isolates the training container."
  - name: environment
    type: JsonObject
    default: '{}'
    description: "The environment variables to set in the Docker container."
  - name: experiment_config
    type: JsonObject
    default: '{}'
    description: "Associates a SageMaker job as a trial component with an experiment and trial."
  - name: hyper_parameters
    type: JsonObject
    default: '{}'
    description: "Algorithm-specific parameters that influence the quality of the model."
  - name: input_data_config
    type: JsonArray
    default: '[]'
    description: "An array of Channel objects."
  - name: output_data_config
    type: JsonObject
    default: '{}'
    description: "Specifies the path to the S3 location where you want to store model artifacts."
  - name: profiler_config
    type: JsonObject
    default: '{}'
    description: "Configuration information for Amazon SageMaker Debugger system monitoring, framework profiling, and storage paths."
  - name: profiler_rule_configurations
    type: JsonArray
    default: '[]'
    description: "Configuration information for Amazon SageMaker Debugger rules for profiling system and framework metrics."
  - name: resource_config
    type: JsonObject
    default: '{}'
    description: "The resources, including the ML compute instances and ML storage volumes, to use for model training."
  - name: retry_strategy
    type: JsonObject
    default: '{}'
    description: "The number of times to retry the job when the job fails due to an InternalServerError."
  - name: role_arn
    type: String
    default: ''
    description: "The Amazon Resource Name (ARN) of an IAM role that SageMaker can assume to perform tasks on your behalf."
  - name: stopping_condition
    type: JsonObject
    default: '{}'
    description: "Specifies a limit to how long a model training job can run."
  - name: tags
    type: JsonArray
    default: '[]'
    description: "An array of key-value pairs."
  - name: tensor_board_output_config
    type: JsonObject
    default: '{}'
    description: "Configuration of storage locations for the Amazon SageMaker Debugger TensorBoard output data."
  - name: training_job_name
    type: String
    default: ''
    description: "The name of the training job."
  - name: vpc_config
    type: JsonObject
    default: '{}'
    description: "A VpcConfig object that specifies the VPC that you want your training job to connect to."
  ###########################GENERATED SECTION ABOVE############################

outputs:
  ###########################GENERATED SECTION BELOW############################
  # NOTE(review): the next two descriptions stop at "`Status." with an
  # unclosed backtick and differ in "CRs"/"CRS" casing. They look truncated
  # at the first period by the generator - confirm and fix at the source
  # rather than by hand here.
  - name: ack_resource_metadata
    type: JsonObject
    description: "All CRs managed by ACK have a common `Status."
  - name: conditions
    type: JsonArray
    description: "All CRS managed by ACK have a common `Status."
  - name: creation_time
    type: String
    description: "A timestamp that indicates when the training job was created."
  - name: debug_rule_evaluation_statuses
    type: JsonArray
    description: "Evaluation status of Amazon SageMaker Debugger rules for debugging on a training job."
  - name: failure_reason
    type: String
    description: "If the training job failed, the reason it failed."
  - name: last_modified_time
    type: String
    description: "A timestamp that indicates when the status of the training job was last modified."
  - name: model_artifacts
    type: JsonObject
    description: "Information about the Amazon S3 location that is configured for storing model artifacts."
  - name: profiler_rule_evaluation_statuses
    type: JsonArray
    description: "Evaluation status of Amazon SageMaker Debugger rules for profiling on a training job."
  - name: profiling_status
    type: String
    description: "Profiling status of a training job."
  - name: secondary_status
    type: String
    description: "Provides detailed information about the state of the training job."
  - name: training_job_status
    type: String
    description: "The status of the training job."
  - name: warm_pool_status
    type: JsonObject
    description: "The status of the warm pool associated with the training job."
  ###########################GENERATED SECTION ABOVE############################

implementation:
  container:
    image: public.ecr.aws/kubeflow-on-aws/aws-sagemaker-kfp-components:2.3.0
    command: [python3]
    # Each input is forwarded as a flag followed by an `inputValue`
    # placeholder that names the matching entry under `inputs`.
    args:
      - TrainingJob/src/TrainingJob_component.py
      - --region
      - { inputValue: region }
      ###########################GENERATED SECTION BELOW############################
      - --algorithm_specification
      - { inputValue: algorithm_specification }
      - --checkpoint_config
      - { inputValue: checkpoint_config }
      - --debug_hook_config
      - { inputValue: debug_hook_config }
      - --debug_rule_configurations
      - { inputValue: debug_rule_configurations }
      - --enable_inter_container_traffic_encryption
      - { inputValue: enable_inter_container_traffic_encryption }
      - --enable_managed_spot_training
      - { inputValue: enable_managed_spot_training }
      - --enable_network_isolation
      - { inputValue: enable_network_isolation }
      - --environment
      - { inputValue: environment }
      - --experiment_config
      - { inputValue: experiment_config }
      - --hyper_parameters
      - { inputValue: hyper_parameters }
      - --input_data_config
      - { inputValue: input_data_config }
      - --output_data_config
      - { inputValue: output_data_config }
      - --profiler_config
      - { inputValue: profiler_config }
      - --profiler_rule_configurations
      - { inputValue: profiler_rule_configurations }
      - --resource_config
      - { inputValue: resource_config }
      - --retry_strategy
      - { inputValue: retry_strategy }
      - --role_arn
      - { inputValue: role_arn }
      - --stopping_condition
      - { inputValue: stopping_condition }
      - --tags
      - { inputValue: tags }
      - --tensor_board_output_config
      - { inputValue: tensor_board_output_config }
      - --training_job_name
      - { inputValue: training_job_name }
      - --vpc_config
      - { inputValue: vpc_config }
      ###########################GENERATED SECTION ABOVE############################