# Component specification for running a SageMaker batch transform job.
# The layout (inputs / outputs / implementation.container with inputValue and
# outputPath placeholders) looks like the Kubeflow Pipelines component format
# -- NOTE(review): confirm against the consuming loader.
name: 'SageMaker - Batch Transformation'
description: |
  Batch Transformation Jobs in SageMaker

# Every input below is forwarded verbatim to batch_transform.py as a
# command-line flag of the same name (see implementation.container.args).
# Inputs that declare no `default` (region, model_name, input_location,
# output_location) have no fallback value in this file; presumably the caller
# must supply them.
# All defaults are quoted so they stay strings ('0', '6', '1', '{}', 'None')
# instead of being re-typed by the YAML parser.
inputs:
  - name: region
    description: 'The region where the cluster launches.'
  - name: job_name
    description: 'The name of the batch transform job.'
    default: ''
  - name: model_name
    description: 'The name of the model that you want to use for the transform job.'
  - name: max_concurrent
    description: >-
      The maximum number of parallel requests that can be sent to each
      instance in a transform job.
    default: '0'
  - name: max_payload
    description: 'The maximum allowed size of the payload, in MB.'
    default: '6'
  - name: batch_strategy
    description: 'The number of records to include in a mini-batch for an HTTP inference request.'
    default: ''
  - name: environment
    description: >-
      The environment variables to set in the Docker container. Up to 16
      key-value entries in the map.
    default: '{}'
  - name: input_location
    description: 'The S3 location of the data source that is associated with a channel.'
  - name: data_type
    description: 'Data type of the input. Can be ManifestFile, S3Prefix, or AugmentedManifestFile.'
    default: 'S3Prefix'
  - name: content_type
    description: 'The multipurpose internet mail extension (MIME) type of the data.'
    default: ''
  - name: split_type
    description: 'The method to use to split the transform job data files into smaller batches.'
    default: 'None'
  - name: compression_type
    description: 'If the transform data is compressed, the specification of the compression type.'
    default: 'None'
  - name: output_location
    description: >-
      The Amazon S3 path where you want Amazon SageMaker to store the results
      of the transform job.
  - name: accept
    description: 'The MIME type used to specify the output data.'
    default: ''
  - name: assemble_with
    description: >-
      Defines how to assemble the results of the transform job as a single S3
      object. Either None or Line.
    default: ''
  - name: output_encryption_key
    description: 'The AWS Key Management Service ID of the key used to encrypt the output data.'
    default: ''
  - name: input_filter
    description: >-
      A JSONPath expression used to select a portion of the input data to pass
      to the algorithm.
    default: ''
  - name: output_filter
    description: >-
      A JSONPath expression used to select a portion of the joined dataset to
      save in the output file for a batch transform job.
    default: ''
  - name: join_source
    description: 'Specifies the source of the data to join with the transformed data.'
    default: 'None'
  - name: instance_type
    description: 'The ML compute instance type.'
    default: 'ml.m4.xlarge'
  - name: instance_count
    # Wording fixed: this component runs a transform job, not a training job.
    description: 'The number of ML compute instances to use in the transform job.'
    default: '1'
  - name: resource_encryption_key
    description: >-
      The AWS KMS key that Amazon SageMaker uses to encrypt data on the storage
      volume attached to the ML compute instance(s).
    default: ''
  - name: tags
    description: 'Key-value pairs to categorize AWS resources.'
    default: '{}'

# Single output. It shares its name with the `output_location` input; the
# output side is wired to the --output_location_file argument below.
outputs:
  - name: output_location
    description: 'S3 URI of the transform job results.'

implementation:
  container:
    image: carowang/kubeflow-pipeline-aws-sm:20190809-02
    command:
      - python
    # Argument order is preserved from the original flow-style list: the
    # script name first, then one "--flag, value" pair per input, and finally
    # the output file path placeholder.
    args:
      - batch_transform.py
      - '--region'
      - inputValue: region
      - '--job_name'
      - inputValue: job_name
      - '--model_name'
      - inputValue: model_name
      - '--max_concurrent'
      - inputValue: max_concurrent
      - '--max_payload'
      - inputValue: max_payload
      - '--batch_strategy'
      - inputValue: batch_strategy
      - '--environment'
      - inputValue: environment
      - '--input_location'
      - inputValue: input_location
      - '--data_type'
      - inputValue: data_type
      - '--content_type'
      - inputValue: content_type
      - '--split_type'
      - inputValue: split_type
      - '--compression_type'
      - inputValue: compression_type
      - '--output_location'
      - inputValue: output_location
      - '--accept'
      - inputValue: accept
      - '--assemble_with'
      - inputValue: assemble_with
      - '--output_encryption_key'
      - inputValue: output_encryption_key
      - '--input_filter'
      - inputValue: input_filter
      - '--output_filter'
      - inputValue: output_filter
      - '--join_source'
      - inputValue: join_source
      - '--instance_type'
      - inputValue: instance_type
      - '--instance_count'
      - inputValue: instance_count
      - '--resource_encryption_key'
      - inputValue: resource_encryption_key
      - '--tags'
      - inputValue: tags
      # outputPath (not inputValue): presumably the runtime substitutes a file
      # path into which the script writes the `output_location` output value
      # -- TODO confirm against the component spec.
      - '--output_location_file'
      - outputPath: output_location