# Component definition: runs TensorFlow Transform, either locally or on
# Google Cloud Dataflow (selected by the "Run mode" input).
#
# The layout (inputs / outputs / implementation.container with `inputValue`
# placeholders and `fileOutputs`) follows the Kubeflow Pipelines component
# specification.

name: Transform using TF on Dataflow
description: Runs TensorFlow Transform on Google Cloud Dataflow

inputs:
  # Also supports local CSV.
  # Structured type noted by the author: {GCSPath: {data_type: CSV}}
  - name: Training data file pattern
    type: GCSPath
    description: 'GCS path of train file patterns.'

  # Also supports local CSV.
  # Structured type noted by the author: {GCSPath: {data_type: CSV}}
  - name: Evaluation data file pattern
    type: GCSPath
    description: 'GCS path of eval file patterns.'

  # Structured type noted by the author: {GCSPath: {data_type: JSON}}
  - name: Schema
    type: GCSPath
    description: 'GCS json schema file path.'

  - name: GCP project
    type: GCPProjectID
    description: 'The GCP project to run the dataflow job.'

  - name: Run mode
    type: String
    default: local
    description: 'Whether to run the job locally or in Cloud Dataflow. Valid values are "local" and "cloud".'

  # Structured type noted by the author: {GCSPath: {data_type: Python}}
  # The default is an explicit empty string, not null.
  - name: Preprocessing module
    type: GCSPath
    default: ''
    description: 'GCS path to a python file defining "preprocess" and "get_feature_columns" functions.'

  # Also supports local paths.
  # Structured type noted by the author: {GCSPath: {path_type: Directory}}
  - name: Transformed data dir
    type: GCSPath
    description: 'GCS or local directory'

outputs:
  # Shares its name with the "Transformed data dir" input above.
  # Structured type noted by the author: {GCSPath: {path_type: Directory}}
  - name: Transformed data dir
    type: GCSPath

implementation:
  container:
    # Image is pinned to a specific tag (a commit-hash-style string).
    image: gcr.io/ml-pipeline/ml-pipeline-dataflow-tft:57d9f7f1cfd458e945d297957621716062d89a49

    # NOTE(review): the entry point is invoked with python2; it presumably
    # has to match the interpreter shipped in the pinned image - confirm
    # before changing either one.
    command:
      - python2
      - /ml/transform.py

    # Each flag is followed by an `inputValue` placeholder naming the input
    # whose value is passed for that flag. Order is flag, value, flag, value.
    args:
      - --train
      - inputValue: Training data file pattern
      - --eval
      - inputValue: Evaluation data file pattern
      - --schema
      - inputValue: Schema
      - --project
      - inputValue: GCP project
      - --mode
      - inputValue: Run mode
      - --preprocessing-module
      - inputValue: Preprocessing module
      - --output
      - inputValue: Transformed data dir

    # Maps the declared output name to the file path inside the container
    # associated with that output.
    fileOutputs:
      Transformed data dir: /output.txt