# pipelines/components/aws/sagemaker/TrainingJob/component.yaml
#
# 266 lines
# 8.3 KiB
# YAML

# Component metadata: the component's name and a short description.
name: 'Sagemaker - TrainingJob'
description: 'Create TrainingJob'
# Component inputs.
#
# `region` is the only input without a default. Every input between the
# GENERATED SECTION markers has a default (empty JSON object '{}', empty JSON
# array '[]', empty string, or false) and is forwarded to the container as a
# `--<name> <value>` pair in `implementation.container.args`.
#
# NOTE(review): the marker comments suggest this section is produced by a code
# generator, so hand edits here may be overwritten on regeneration -- confirm.
inputs:
  - name: region
    type: String
    description: "The region to use for the training job"
  ###########################GENERATED SECTION BELOW############################
  - name: algorithm_specification
    type: JsonObject
    default: '{}'
    description: "The registry path of the Docker image that contains the training algorithm and algorithm-specific metadata, including the input mode."
  - name: checkpoint_config
    type: JsonObject
    default: '{}'
    description: "Contains information about the output location for managed spot training checkpoint data."
  - name: debug_hook_config
    type: JsonObject
    default: '{}'
    description: "Configuration information for the Amazon SageMaker Debugger hook parameters, metric and tensor collections, and storage paths."
  - name: debug_rule_configurations
    type: JsonArray
    default: '[]'
    description: "Configuration information for Amazon SageMaker Debugger rules for debugging output tensors."
  # Boolean defaults use the canonical lowercase YAML form.
  - name: enable_inter_container_traffic_encryption
    type: Bool
    default: false
    description: "To encrypt all communications between ML compute instances in distributed training, choose True."
  - name: enable_managed_spot_training
    type: Bool
    default: false
    description: "To train models using managed spot training, choose True."
  - name: enable_network_isolation
    type: Bool
    default: false
    description: "Isolates the training container."
  - name: environment
    type: JsonObject
    default: '{}'
    description: "The environment variables to set in the Docker container."
  - name: experiment_config
    type: JsonObject
    default: '{}'
    description: "Associates a SageMaker job as a trial component with an experiment and trial."
  - name: hyper_parameters
    type: JsonObject
    default: '{}'
    description: "Algorithm-specific parameters that influence the quality of the model."
  - name: input_data_config
    type: JsonArray
    default: '[]'
    description: "An array of Channel objects."
  - name: output_data_config
    type: JsonObject
    default: '{}'
    description: "Specifies the path to the S3 location where you want to store model artifacts."
  - name: profiler_config
    type: JsonObject
    default: '{}'
    description: "Configuration information for Amazon SageMaker Debugger system monitoring, framework profiling, and storage paths."
  - name: profiler_rule_configurations
    type: JsonArray
    default: '[]'
    description: "Configuration information for Amazon SageMaker Debugger rules for profiling system and framework metrics."
  - name: resource_config
    type: JsonObject
    default: '{}'
    description: "The resources, including the ML compute instances and ML storage volumes, to use for model training."
  - name: retry_strategy
    type: JsonObject
    default: '{}'
    description: "The number of times to retry the job when the job fails due to an InternalServerError."
  - name: role_arn
    type: String
    default: ''
    description: "The Amazon Resource Name (ARN) of an IAM role that SageMaker can assume to perform tasks on your behalf."
  - name: stopping_condition
    type: JsonObject
    default: '{}'
    description: "Specifies a limit to how long a model training job can run."
  - name: tags
    type: JsonArray
    default: '[]'
    description: "An array of key-value pairs."
  - name: tensor_board_output_config
    type: JsonObject
    default: '{}'
    description: "Configuration of storage locations for the Amazon SageMaker Debugger TensorBoard output data."
  - name: training_job_name
    type: String
    default: ''
    description: "The name of the training job."
  - name: vpc_config
    type: JsonObject
    default: '{}'
    description: "A VpcConfig object that specifies the VPC that you want your training job to connect to."
  ###########################GENERATED SECTION ABOVE############################
# Component outputs. None of them declares a default.
#
# NOTE(review): the descriptions of `ack_resource_metadata` and `conditions`
# were cut off right after "`Status." (apparently at the first period, which
# sits inside the field reference). They are completed here; if this section
# is regenerated the truncation may come back -- confirm against the generator.
outputs:
  ###########################GENERATED SECTION BELOW############################
  - name: ack_resource_metadata
    type: JsonObject
    description: "All CRs managed by ACK have a common `Status.ACKResourceMetadata` member that is used to contain resource sync state, account ownership, constructed ARN for the resource."
  - name: conditions
    type: JsonArray
    description: "All CRs managed by ACK have a common `Status.Conditions` member that contains a collection of `ackv1alpha1.Condition` objects that describe the various terminal states of the CR and its backend AWS service API resource."
  - name: creation_time
    type: String
    description: "A timestamp that indicates when the training job was created."
  - name: debug_rule_evaluation_statuses
    type: JsonArray
    description: "Evaluation status of Amazon SageMaker Debugger rules for debugging on a training job."
  - name: failure_reason
    type: String
    description: "If the training job failed, the reason it failed."
  - name: last_modified_time
    type: String
    description: "A timestamp that indicates when the status of the training job was last modified."
  - name: model_artifacts
    type: JsonObject
    description: "Information about the Amazon S3 location that is configured for storing model artifacts."
  - name: profiler_rule_evaluation_statuses
    type: JsonArray
    description: "Evaluation status of Amazon SageMaker Debugger rules for profiling on a training job."
  - name: profiling_status
    type: String
    description: "Profiling status of a training job."
  - name: secondary_status
    type: String
    description: "Provides detailed information about the state of the training job."
  - name: training_job_status
    type: String
    description: "The status of the training job."
  - name: warm_pool_status
    type: JsonObject
    description: "The status of the warm pool associated with the training job."
  ###########################GENERATED SECTION ABOVE############################
# Container invocation: runs `python3 TrainingJob/src/TrainingJob_component.py`
# inside the component image and passes each declared input as a
# `--<input_name>` flag followed by an `inputValue` placeholder.
#
# NOTE(review): no output placeholders are passed for the declared outputs;
# presumably the script writes them to default locations -- confirm.
implementation:
  container:
    image: 'public.ecr.aws/kubeflow-on-aws/aws-sagemaker-kfp-components:2.3.0'
    command:
      - python3
    args:
      - TrainingJob/src/TrainingJob_component.py
      - --region
      - inputValue: region
      ###########################GENERATED SECTION BELOW############################
      - --algorithm_specification
      - inputValue: algorithm_specification
      - --checkpoint_config
      - inputValue: checkpoint_config
      - --debug_hook_config
      - inputValue: debug_hook_config
      - --debug_rule_configurations
      - inputValue: debug_rule_configurations
      - --enable_inter_container_traffic_encryption
      - inputValue: enable_inter_container_traffic_encryption
      - --enable_managed_spot_training
      - inputValue: enable_managed_spot_training
      - --enable_network_isolation
      - inputValue: enable_network_isolation
      - --environment
      - inputValue: environment
      - --experiment_config
      - inputValue: experiment_config
      - --hyper_parameters
      - inputValue: hyper_parameters
      - --input_data_config
      - inputValue: input_data_config
      - --output_data_config
      - inputValue: output_data_config
      - --profiler_config
      - inputValue: profiler_config
      - --profiler_rule_configurations
      - inputValue: profiler_rule_configurations
      - --resource_config
      - inputValue: resource_config
      - --retry_strategy
      - inputValue: retry_strategy
      - --role_arn
      - inputValue: role_arn
      - --stopping_condition
      - inputValue: stopping_condition
      - --tags
      - inputValue: tags
      - --tensor_board_output_config
      - inputValue: tensor_board_output_config
      - --training_job_name
      - inputValue: training_job_name
      - --vpc_config
      - inputValue: vpc_config
      ###########################GENERATED SECTION ABOVE############################