# Listing metadata from the file viewer: 266 lines, 8.3 KiB, YAML.
# Display name and one-line summary of this component.
name: 'Sagemaker - TrainingJob'
description: 'Create TrainingJob'
# Inputs accepted by the component. Every entry below has a matching
# `--<name>` flag in implementation.container.args, which passes the value
# to the container via an `inputValue` placeholder.
#
# Conventions used in this list:
#   - JsonObject / JsonArray defaults are quoted JSON text ('{}' / '[]'), so
#     YAML loads them as strings rather than as YAML collections.
#   - Bool defaults use the canonical lowercase `false`.
#   - `region` is the only input declared without a default.
inputs:
  - name: region
    type: String
    description: "The region to use for the training job"

  ###########################GENERATED SECTION BELOW############################

  - name: algorithm_specification
    type: JsonObject
    default: '{}'
    description: "The registry path of the Docker image that contains the training algorithm and algorithm-specific metadata, including the input mode."
  - name: checkpoint_config
    type: JsonObject
    default: '{}'
    description: "Contains information about the output location for managed spot training checkpoint data."
  - name: debug_hook_config
    type: JsonObject
    default: '{}'
    description: "Configuration information for the Amazon SageMaker Debugger hook parameters, metric and tensor collections, and storage paths."
  - name: debug_rule_configurations
    type: JsonArray
    default: '[]'
    description: "Configuration information for Amazon SageMaker Debugger rules for debugging output tensors."
  - name: enable_inter_container_traffic_encryption
    type: Bool
    default: false
    description: "To encrypt all communications between ML compute instances in distributed training, choose True."
  - name: enable_managed_spot_training
    type: Bool
    default: false
    description: "To train models using managed spot training, choose True."
  - name: enable_network_isolation
    type: Bool
    default: false
    description: "Isolates the training container."
  - name: environment
    type: JsonObject
    default: '{}'
    description: "The environment variables to set in the Docker container."
  - name: experiment_config
    type: JsonObject
    default: '{}'
    description: "Associates a SageMaker job as a trial component with an experiment and trial."
  - name: hyper_parameters
    type: JsonObject
    default: '{}'
    description: "Algorithm-specific parameters that influence the quality of the model."
  - name: input_data_config
    type: JsonArray
    default: '[]'
    description: "An array of Channel objects."
  - name: output_data_config
    type: JsonObject
    default: '{}'
    description: "Specifies the path to the S3 location where you want to store model artifacts."
  - name: profiler_config
    type: JsonObject
    default: '{}'
    description: "Configuration information for Amazon SageMaker Debugger system monitoring, framework profiling, and storage paths."
  - name: profiler_rule_configurations
    type: JsonArray
    default: '[]'
    description: "Configuration information for Amazon SageMaker Debugger rules for profiling system and framework metrics."
  - name: resource_config
    type: JsonObject
    default: '{}'
    description: "The resources, including the ML compute instances and ML storage volumes, to use for model training."
  - name: retry_strategy
    type: JsonObject
    default: '{}'
    description: "The number of times to retry the job when the job fails due to an InternalServerError."
  - name: role_arn
    type: String
    default: ''
    description: "The Amazon Resource Name (ARN) of an IAM role that SageMaker can assume to perform tasks on your behalf."
  - name: stopping_condition
    type: JsonObject
    default: '{}'
    description: "Specifies a limit to how long a model training job can run."
  - name: tags
    type: JsonArray
    default: '[]'
    description: "An array of key-value pairs."
  - name: tensor_board_output_config
    type: JsonObject
    default: '{}'
    description: "Configuration of storage locations for the Amazon SageMaker Debugger TensorBoard output data."
  - name: training_job_name
    type: String
    default: ''
    description: "The name of the training job."
  - name: vpc_config
    type: JsonObject
    default: '{}'
    description: "A VpcConfig object that specifies the VPC that you want your training job to connect to."

  ###########################GENERATED SECTION ABOVE############################
# Outputs declared by the component. Each entry carries only a name, a type
# and a description; none has a default.
#
# NOTE(review): the descriptions of `ack_resource_metadata` and `conditions`
# were previously cut off at the first period ("... a common `Status."),
# leaving an unterminated backtick. They are completed here; this list is
# marked as generated, so if a generator owns it the truncation should be
# fixed there as well.
# NOTE(review): no `outputPath` placeholders appear in
# implementation.container.args — confirm how the container publishes these
# values.
outputs:
  ###########################GENERATED SECTION BELOW############################

  - name: ack_resource_metadata
    type: JsonObject
    description: "All CRs managed by ACK have a common `Status.ACKResourceMetadata` member that is used to contain resource sync state, account ownership, constructed ARN for the resource."
  - name: conditions
    type: JsonArray
    description: "All CRs managed by ACK have a common `Status.Conditions` member that contains a collection of `ackv1alpha1.Condition` objects that describe the various terminal states of the CR and its backend AWS service API resource."
  - name: creation_time
    type: String
    description: "A timestamp that indicates when the training job was created."
  - name: debug_rule_evaluation_statuses
    type: JsonArray
    description: "Evaluation status of Amazon SageMaker Debugger rules for debugging on a training job."
  - name: failure_reason
    type: String
    description: "If the training job failed, the reason it failed."
  - name: last_modified_time
    type: String
    description: "A timestamp that indicates when the status of the training job was last modified."
  - name: model_artifacts
    type: JsonObject
    description: "Information about the Amazon S3 location that is configured for storing model artifacts."
  - name: profiler_rule_evaluation_statuses
    type: JsonArray
    description: "Evaluation status of Amazon SageMaker Debugger rules for profiling on a training job."
  - name: profiling_status
    type: String
    description: "Profiling status of a training job."
  - name: secondary_status
    type: String
    description: "Provides detailed information about the state of the training job."
  - name: training_job_status
    type: String
    description: "The status of the training job."
  - name: warm_pool_status
    type: JsonObject
    description: "The status of the warm pool associated with the training job."

  ###########################GENERATED SECTION ABOVE############################
# Container invocation for the component: runs `python3` on
# TrainingJob/src/TrainingJob_component.py inside the pinned image.
# The argument list is a flat sequence of `--flag` / value pairs; each value
# is an `inputValue` placeholder naming one of the declared inputs.
implementation:
  container:
    image: public.ecr.aws/kubeflow-on-aws/aws-sagemaker-kfp-components:2.3.0
    command:
      - python3
    args:
      - TrainingJob/src/TrainingJob_component.py
      - --region
      - inputValue: region

      ###########################GENERATED SECTION BELOW############################

      - --algorithm_specification
      - inputValue: algorithm_specification
      - --checkpoint_config
      - inputValue: checkpoint_config
      - --debug_hook_config
      - inputValue: debug_hook_config
      - --debug_rule_configurations
      - inputValue: debug_rule_configurations
      - --enable_inter_container_traffic_encryption
      - inputValue: enable_inter_container_traffic_encryption
      - --enable_managed_spot_training
      - inputValue: enable_managed_spot_training
      - --enable_network_isolation
      - inputValue: enable_network_isolation
      - --environment
      - inputValue: environment
      - --experiment_config
      - inputValue: experiment_config
      - --hyper_parameters
      - inputValue: hyper_parameters
      - --input_data_config
      - inputValue: input_data_config
      - --output_data_config
      - inputValue: output_data_config
      - --profiler_config
      - inputValue: profiler_config
      - --profiler_rule_configurations
      - inputValue: profiler_rule_configurations
      - --resource_config
      - inputValue: resource_config
      - --retry_strategy
      - inputValue: retry_strategy
      - --role_arn
      - inputValue: role_arn
      - --stopping_condition
      - inputValue: stopping_condition
      - --tags
      - inputValue: tags
      - --tensor_board_output_config
      - inputValue: tensor_board_output_config
      - --training_job_name
      - inputValue: training_job_name
      - --vpc_config
      - inputValue: vpc_config

      ###########################GENERATED SECTION ABOVE############################