# pipelines/components/deprecated/dataflow/tft/component.yaml

name: Transform using TF on Dataflow
description: Runs TensorFlow Transform on Google Cloud Dataflow
inputs:
- {name: Training data file pattern, type: GCSPath, description: 'GCS file pattern of the training data.'} # Also supports local CSV paths # type: {GCSPath: {data_type: CSV}}
- {name: Evaluation data file pattern, type: GCSPath, description: 'GCS file pattern of the evaluation data.'} # Also supports local CSV paths # type: {GCSPath: {data_type: CSV}}
- {name: Schema, type: GCSPath, description: 'GCS path of the JSON schema file.'} # type: {GCSPath: {data_type: JSON}}
- {name: GCP project, type: GCPProjectID, description: 'The GCP project in which to run the Dataflow job.'}
- {name: Run mode, type: String, default: local, description: 'Whether to run the job locally or on Cloud Dataflow. Valid values are "local" and "cloud".'}
- {name: Preprocessing module, type: GCSPath, default: '', description: 'GCS path to a Python file that defines the "preprocess" and "get_feature_columns" functions.'} # type: {GCSPath: {data_type: Python}}
- {name: Transformed data dir, type: GCSPath, description: 'GCS or local directory to write the transformed data to.'} # Also supports local paths # type: {GCSPath: {path_type: Directory}}
outputs:
- {name: Transformed data dir, type: GCSPath} # type: {GCSPath: {path_type: Directory}}
implementation:
  container:
    image: gcr.io/ml-pipeline/ml-pipeline-dataflow-tft:57d9f7f1cfd458e945d297957621716062d89a49
    command: [python2, /ml/transform.py]
    args: [
      --train, {inputValue: Training data file pattern},
      --eval, {inputValue: Evaluation data file pattern},
      --schema, {inputValue: Schema},
      --project, {inputValue: GCP project},
      --mode, {inputValue: Run mode},
      --preprocessing-module, {inputValue: Preprocessing module},
      --output, {inputValue: Transformed data dir},
    ]
    fileOutputs:
      Transformed data dir: /output.txt
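
For reference, a minimal sketch of loading this (deprecated) component with the KFP v1 SDK and wiring it into a pipeline. The `gs://` paths, project ID, and pipeline name are hypothetical placeholders; the argument names (e.g. `training_data_file_pattern`) follow the KFP v1 convention of lowercasing input names and replacing non-alphanumeric characters with underscores.

```python
# Illustrative only: consuming this component with the KFP v1 SDK.
# All gs:// paths and the project ID below are hypothetical placeholders.
import kfp
from kfp import components, dsl

# Load the spec from a local checkout (path assumed, not guaranteed).
dataflow_tf_transform_op = components.load_component_from_file(
    'pipelines/components/deprecated/dataflow/tft/component.yaml')

@dsl.pipeline(name='tft-transform-example')
def tft_pipeline(project: str = 'my-gcp-project'):
    transform = dataflow_tf_transform_op(
        training_data_file_pattern='gs://my-bucket/data/train*.csv',
        evaluation_data_file_pattern='gs://my-bucket/data/eval*.csv',
        schema='gs://my-bucket/data/schema.json',
        gcp_project=project,
        run_mode='cloud',
        preprocessing_module='gs://my-bucket/code/preprocessing.py',
        transformed_data_dir='gs://my-bucket/output/transformed')
    # The container writes the output directory to /output.txt, which the
    # fileOutputs mapping above surfaces as this task output:
    _ = transform.outputs['transformed_data_dir']

if __name__ == '__main__':
    kfp.compiler.Compiler().compile(tft_pipeline, 'tft_pipeline.yaml')
```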
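The "Preprocessing module" input expects a Python file defining `preprocess` and `get_feature_columns`. The exact signatures `/ml/transform.py` expects are not documented in this spec; the sketch below assumes the common tf.Transform shape, where `preprocess` acts as a preprocessing_fn over a dict of input tensors and `get_feature_columns` returns feature columns for a downstream trainer. The feature names (`age`, `occupation`) and vocabulary size are hypothetical.

```python
# Hypothetical preprocessing module, assuming a tf.Transform-style
# preprocessing_fn; verify against the actual /ml/transform.py contract.
import tensorflow as tf
import tensorflow_transform as tft

def preprocess(inputs):
    """Transforms raw input tensors into transformed feature tensors."""
    outputs = {}
    # Standardize a numeric feature to zero mean and unit variance.
    outputs['age_scaled'] = tft.scale_to_z_score(inputs['age'])
    # Map a string feature to integer IDs via a computed vocabulary.
    outputs['occupation_id'] = tft.compute_and_apply_vocabulary(
        inputs['occupation'])
    return outputs

def get_feature_columns():
    """Returns feature columns matching the transformed features above."""
    return [
        tf.feature_column.numeric_column('age_scaled'),
        tf.feature_column.categorical_column_with_identity(
            'occupation_id', num_buckets=1000),
    ]
```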