# pipelines/components/aws/sagemaker/batch_transform/component.yaml

name: 'SageMaker - Batch Transformation'
description: |
  Batch Transformation Jobs in SageMaker
inputs:
  - name: region
    description: 'The region where the batch transform job runs.'
  - name: job_name
    description: 'The name of the batch transform job.'
    default: ''
  - name: model_name
    description: 'The name of the model that you want to use for the transform job.'
  - name: max_concurrent
    description: 'The maximum number of parallel requests that can be sent to each instance in a transform job.'
    default: '0'
  - name: max_payload
    description: 'The maximum allowed size of the payload, in MB.'
    default: '6'
  - name: batch_strategy
    description: 'The number of records to include in a mini-batch for an HTTP inference request. Valid values: MultiRecord, SingleRecord.'
    default: ''
  - name: environment
    description: 'The environment variables to set in the Docker container. Up to 16 key-value entries in the map.'
    default: '{}'
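  # Example (hypothetical variable names): environment is passed as a
  # JSON-encoded map, e.g.
  #   environment: '{"LOG_LEVEL": "INFO", "MODEL_CACHE": "/opt/ml/cache"}'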
  - name: input_location
    description: 'The S3 location of the data source that is associated with a channel.'
  - name: data_type
    description: 'Data type of the input. Can be ManifestFile, S3Prefix, or AugmentedManifestFile.'
    default: 'S3Prefix'
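  # Example (hypothetical bucket): with the default data_type of S3Prefix,
  # input_location is a prefix such as 's3://example-bucket/transform-input/'
  # and every object under that prefix is used as input.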
  - name: content_type
    description: 'The Multipurpose Internet Mail Extensions (MIME) type of the data.'
    default: ''
  - name: split_type
    description: 'The method to use to split the transform job data files into smaller batches.'
    default: 'None'
  - name: compression_type
    description: 'If the transform data is compressed, the specification of the compression type.'
    default: 'None'
  - name: output_location
    description: 'The Amazon S3 path where you want Amazon SageMaker to store the results of the transform job.'
  - name: accept
    description: 'The MIME type used to specify the output data.'
    default: ''
  - name: assemble_with
    description: 'Defines how to assemble the results of the transform job as a single S3 object. Either None or Line.'
    default: ''
  - name: output_encryption_key
    description: 'The AWS Key Management Service ID of the key used to encrypt the output data.'
    default: ''
  - name: input_filter
    description: 'A JSONPath expression used to select a portion of the input data to pass to the algorithm.'
    default: ''
  - name: output_filter
    description: 'A JSONPath expression used to select a portion of the joined dataset to save in the output file for a batch transform job.'
    default: ''
  - name: join_source
    description: 'Specifies the source of the data to join with the transformed data. Valid values: Input, None.'
    default: 'None'
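  # Example (hypothetical JSONPath values): to drop a leading ID column before
  # inference, join the predictions back onto the input, and keep only the ID
  # and the prediction in the output, one might set
  #   input_filter:  '$[1:]'
  #   join_source:   'Input'
  #   output_filter: '$[0,-1]'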
  - name: instance_type
    description: 'The ML compute instance type.'
    default: 'ml.m4.xlarge'
  - name: instance_count
    description: 'The number of ML compute instances to use in the transform job.'
    default: '1'
  - name: resource_encryption_key
    description: 'The AWS KMS key that Amazon SageMaker uses to encrypt data on the storage volume attached to the ML compute instance(s).'
    default: ''
  - name: tags
    description: 'Key-value pairs to categorize AWS resources.'
    default: '{}'
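  # Example (hypothetical keys): tags is passed as a JSON-encoded map, e.g.
  #   tags: '{"team": "ml-platform", "env": "dev"}'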
outputs:
  - {name: output_location, description: 'S3 URI of the transform job results.'}
implementation:
  container:
    image: carowang/kubeflow-pipeline-aws-sm:20190809-02
    command: ['python']
    args: [
      batch_transform.py,
      --region, {inputValue: region},
      --job_name, {inputValue: job_name},
      --model_name, {inputValue: model_name},
      --max_concurrent, {inputValue: max_concurrent},
      --max_payload, {inputValue: max_payload},
      --batch_strategy, {inputValue: batch_strategy},
      --environment, {inputValue: environment},
      --input_location, {inputValue: input_location},
      --data_type, {inputValue: data_type},
      --content_type, {inputValue: content_type},
      --split_type, {inputValue: split_type},
      --compression_type, {inputValue: compression_type},
      --output_location, {inputValue: output_location},
      --accept, {inputValue: accept},
      --assemble_with, {inputValue: assemble_with},
      --output_encryption_key, {inputValue: output_encryption_key},
      --input_filter, {inputValue: input_filter},
      --output_filter, {inputValue: output_filter},
      --join_source, {inputValue: join_source},
      --instance_type, {inputValue: instance_type},
      --instance_count, {inputValue: instance_count},
      --resource_encryption_key, {inputValue: resource_encryption_key},
      --tags, {inputValue: tags},
      --output_location_file, {outputPath: output_location}
    ]
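
# Usage sketch (kept as comments so this file stays valid YAML). Assumes the
# kfp SDK; the pipeline name and the S3/model values below are hypothetical
# placeholders, not part of this component:
#
#   import kfp
#   from kfp import components
#
#   batch_transform_op = components.load_component_from_file('component.yaml')
#
#   @kfp.dsl.pipeline(name='sagemaker-batch-transform-example')
#   def pipeline():
#       batch_transform_op(
#           region='us-west-2',
#           model_name='my-model',
#           input_location='s3://example-bucket/input',
#           output_location='s3://example-bucket/output')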