# pipelines/components/dataflow/predict/component.yaml

# Component header: the display name and a free-text description.
name: Predict using TF on Dataflow
# Literal block scalar (`|`): the line break between the two sentences and the
# single trailing newline are part of the value.
description: |
  Runs TensorFlow prediction on Google Cloud Dataflow
  Input and output data is in GCS
# Component inputs. Every entry below is referenced by an `inputValue`
# placeholder in `implementation.container.args`.
inputs:
  # Structured type noted by the author: {GCSPath: {data_type: CSV}}
  - name: Data file pattern
    type: GCSPath
    description: 'GCS or local path of test file patterns.'

  # Structured type noted by the author:
  #   {GCSPath: {data_type: TFDV schema JSON}}
  - name: Schema
    type: GCSPath
    description: 'GCS json schema file path.'

  - name: Target column
    type: String
    description: 'Name of the column for prediction target.'

  # Models trained with estimator are exported to the
  # base/export/export/123456781 directory. Our trainer exports only one model.
  # TODO: Output single model from trainer.
  # Structured type noted by the author:
  #   {GCSPath: {path_type: Directory,
  #              data_type: Exported TensorFlow models dir}}
  - name: Model
    type: GCSPath
    description: 'GCS or local path of model trained with tft preprocessed data.'

  # The default is deliberately a quoted string, not a YAML integer.
  - name: Batch size
    type: Integer
    default: '32'
    description: 'Batch size used in prediction.'

  - name: Run mode
    type: String
    default: local
    description: 'Whether to run the job locally or in Cloud Dataflow. Valid values are "local" and "cloud".'

  - name: GCP project
    type: GCPProjectID
    description: 'The GCP project to run the dataflow job.'

  # Will contain prediction_results-* and schema.json files.
  # TODO: Split outputs and replace dir with single file.
  # Structured type noted by the author: {GCSPath: {path_type: Directory}}
  - name: Predictions dir
    type: GCSPath
    description: 'GCS or local directory.'
# Component outputs. The single output shares its name with the
# "Predictions dir" input.
outputs:
  # Will contain prediction_results-* and schema.json files.
  # TODO: Split outputs and replace dir with single file.
  # Structured type noted by the author: {GCSPath: {path_type: Directory}}
  - name: Predictions dir
    type: GCSPath
    description: 'GCS or local directory.'
# Container-based implementation: which image to run, the entry point, and how
# the declared inputs are turned into command-line arguments.
implementation:
  container:
    # The tag is a 40-character hex string (looks like a commit hash — confirm).
    image: gcr.io/ml-pipeline/ml-pipeline-dataflow-tf-predict:d4960d3379af4735fd04dc7167fab5fff82d0f22
    command:
      - python2
      - /ml/predict.py
    # Each flag is immediately followed by the placeholder for the input that
    # supplies its value; the order matches the original argument list.
    args:
      - --data
      - {inputValue: Data file pattern}
      - --schema
      - {inputValue: Schema}
      - --target
      - {inputValue: Target column}
      - --model
      - {inputValue: Model}
      - --mode
      - {inputValue: Run mode}
      - --project
      - {inputValue: GCP project}
      - --batchsize
      - {inputValue: Batch size}
      - --output
      - {inputValue: Predictions dir}
    # Associates the "Predictions dir" output with a file path; presumably the
    # program writes the output value there — confirm against predict.py.
    fileOutputs:
      Predictions dir: /output.txt