# pipelines/components/aws/sagemaker/batch_transform/component.yaml

name: 'SageMaker - Batch Transformation'
description: |
  Batch Transformation Jobs in SageMaker
inputs:
  - name: region
    description: 'The region where the batch transform job runs.'
  - name: job_name
    description: 'The name of the batch transform job.'
    default: ''
  - name: model_name
    description: 'The name of the model that you want to use for the transform job.'
  - name: max_concurrent
    description: 'The maximum number of parallel requests that can be sent to each instance in a transform job.'
    default: '0'
  - name: max_payload
    description: 'The maximum allowed size of the payload, in MB.'
    default: '6'
  - name: batch_strategy
    description: 'The number of records to include in a mini-batch for an HTTP inference request. Valid values: MultiRecord, SingleRecord.'
    default: ''
  - name: environment
    description: 'The environment variables to set in the Docker container. Up to 16 key-value entries in the map.'
    default: '{}'
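  # Example (hypothetical variable names): environment is passed as a
  # JSON-encoded map, e.g.
  #   environment: '{"LOG_LEVEL": "INFO", "MODEL_CACHE": "/opt/ml/cache"}'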
  - name: input_location
    description: 'The S3 location of the data source that is associated with a channel.'
  - name: data_type
    description: 'Data type of the input. Can be ManifestFile, S3Prefix, or AugmentedManifestFile.'
    default: 'S3Prefix'
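  # Example (hypothetical bucket): with the default data_type of S3Prefix,
  # input_location is a prefix such as 's3://example-bucket/transform-input/'
  # and every object under that prefix is used as input.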
  - name: content_type
    description: 'The Multipurpose Internet Mail Extensions (MIME) type of the data.'
    default: ''
  - name: split_type
    description: 'The method to use to split the transform job data files into smaller batches.'
    default: 'None'
  - name: compression_type
    description: 'If the transform data is compressed, the specification of the compression type.'
    default: 'None'
  - name: output_location
    description: 'The Amazon S3 path where you want Amazon SageMaker to store the results of the transform job.'
  - name: accept
    description: 'The MIME type used to specify the output data.'
    default: ''
  - name: assemble_with
    description: 'Defines how to assemble the results of the transform job as a single S3 object. Either None or Line.'
    default: ''
  - name: output_encryption_key
    description: 'The AWS Key Management Service ID of the key used to encrypt the output data.'
    default: ''
  - name: input_filter
    description: 'A JSONPath expression used to select a portion of the input data to pass to the algorithm.'
    default: ''
  - name: output_filter
    description: 'A JSONPath expression used to select a portion of the joined dataset to save in the output file for a batch transform job.'
    default: ''
  - name: join_source
    description: 'Specifies the source of the data to join with the transformed data. Valid values: Input, None.'
    default: 'None'
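  # Example (hypothetical JSONPath values): to drop a leading ID column before
  # inference, join the predictions back onto the input, and keep only the ID
  # and the prediction in the output, one might set
  #   input_filter:  '$[1:]'
  #   join_source:   'Input'
  #   output_filter: '$[0,-1]'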
  - name: instance_type
    description: 'The ML compute instance type.'
    default: 'ml.m4.xlarge'
  - name: instance_count
    description: 'The number of ML compute instances to use in the transform job.'
    default: '1'
  - name: resource_encryption_key
    description: 'The AWS KMS key that Amazon SageMaker uses to encrypt data on the storage volume attached to the ML compute instance(s).'
    default: ''
  - name: tags
    description: 'Key-value pairs to categorize AWS resources.'
    default: '{}'
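  # Example (hypothetical keys): tags is passed as a JSON-encoded map, e.g.
  #   tags: '{"team": "ml-platform", "env": "dev"}'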
outputs:
  - {name: output_location, description: 'S3 URI of the transform job results.'}
implementation:
  container:
    image: carowang/kubeflow-pipeline-aws-sm:20190809-02
    command: ['python']
    args: [
      batch_transform.py,
      --region, {inputValue: region},
      --job_name, {inputValue: job_name},
      --model_name, {inputValue: model_name},
      --max_concurrent, {inputValue: max_concurrent},
      --max_payload, {inputValue: max_payload},
      --batch_strategy, {inputValue: batch_strategy},
      --environment, {inputValue: environment},
      --input_location, {inputValue: input_location},
      --data_type, {inputValue: data_type},
      --content_type, {inputValue: content_type},
      --split_type, {inputValue: split_type},
      --compression_type, {inputValue: compression_type},
      --output_location, {inputValue: output_location},
      --accept, {inputValue: accept},
      --assemble_with, {inputValue: assemble_with},
      --output_encryption_key, {inputValue: output_encryption_key},
      --input_filter, {inputValue: input_filter},
      --output_filter, {inputValue: output_filter},
      --join_source, {inputValue: join_source},
      --instance_type, {inputValue: instance_type},
      --instance_count, {inputValue: instance_count},
      --resource_encryption_key, {inputValue: resource_encryption_key},
      --tags, {inputValue: tags},
      --output_location_file, {outputPath: output_location}
    ]
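
# Usage sketch (kept as comments so this file stays valid YAML). Assumes the
# kfp SDK; the pipeline name and the S3/model values below are hypothetical
# placeholders, not part of this component:
#
#   import kfp
#   from kfp import components
#
#   batch_transform_op = components.load_component_from_file('component.yaml')
#
#   @kfp.dsl.pipeline(name='sagemaker-batch-transform-example')
#   def pipeline():
#       batch_transform_op(
#           region='us-west-2',
#           model_name='my-model',
#           input_location='s3://example-bucket/input',
#           output_location='s3://example-bucket/output')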