# pipelines/components/XGBoost/Predict/from_ApacheParquet/component.yaml

name: Xgboost predict
description: |-
  Make predictions using a trained XGBoost model.

      Args:
          data_path: Path for the feature data in Apache Parquet format.
          model_path: Path for the trained model in binary XGBoost format.
          predictions_path: Output path for the predictions.
          label_column_name: Optional. Name of the column containing the label data that is excluded during the prediction.

      Annotations:
          author: Alexey Volkov <alexey.volkov@ark-kun.com>
inputs:
# Feature table; handed to the program as a file path (`inputPath: data`).
- name: data
  type: ApacheParquet
# Trained model; handed to the program as a file path (`inputPath: model`).
- name: model
  type: XGBoostModel
# Passed by value, and only when the caller supplies it.
- name: label_column_name
  type: String
  optional: true
outputs:
# The program receives the destination path (`outputPath: predictions`).
- name: predictions
  type: Predictions
metadata:
  annotations:
    author: Alexey Volkov <alexey.volkov@ark-kun.com>
    canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/XGBoost/Predict/from_ApacheParquet/component.yaml'
implementation:
  container:
    # Plain Python base image; the libraries the program needs are installed
    # at container start by the shell step in `command` below.
    image: python:3.7
    command:
    # Bootstrap step. `sh -c <script> <rest...>` binds the entries that follow
    # the script to "$0" and "$@", i.e. "$0" is `python3` and "$@" starts with
    # `-u -c <inline program>` (presumably followed by the `args` entries that
    # the runtime appends -- confirm against the executor).
    - sh
    - -c
    # Install the pinned dependencies; if that fails, retry the same install
    # with --user. Only when an install succeeds is the remaining command line
    # run via "$0" "$@".
    - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
      'xgboost==1.1.1' 'pandas==1.0.5' 'pyarrow==0.17.1' || PIP_DISABLE_PIP_VERSION_CHECK=1
      python3 -m pip install --quiet --no-warn-script-location 'xgboost==1.1.1' 'pandas==1.0.5'
      'pyarrow==0.17.1' --user) && "$0" "$@"
    # The interpreter invocation the shell step hands over to: python3 with
    # unbuffered output (-u) running the inline program passed after -c.
    - python3
    - -u
    - -c
    - |
      def _make_parent_dirs_and_return_path(file_path: str):
          import os
          os.makedirs(os.path.dirname(file_path), exist_ok=True)
          return file_path

      def xgboost_predict(
          data_path,
          model_path,
          predictions_path,
          label_column_name = None,
      ):
          '''Score a Parquet feature table with a previously trained XGBoost model.

          Args:
              data_path: Path of the Apache Parquet file holding the feature data.
              model_path: Path of the trained XGBoost model file; it is loaded with
                  xgboost.Booster(model_file=...).
              predictions_path: Path of the file the predictions are written to with
                  numpy.savetxt. Missing parent directories are created.
              label_column_name: Optional. Name of the label column to drop from the
                  table before predicting. A falsy value (None or '') leaves the
                  table unchanged.

          Annotations:
              author: Alexey Volkov <alexey.volkov@ark-kun.com>
          '''
          from pathlib import Path

          import numpy
          import pandas
          import xgboost

          # Read the feature table and remove the label column, if one was named,
          # so that only features reach the model.
          features = pandas.read_parquet(data_path)
          if label_column_name:
              features = features.drop(columns=[label_column_name])
          feature_matrix = xgboost.DMatrix(data=features)

          # Restore the trained booster from disk and score the feature matrix.
          booster = xgboost.Booster(model_file=model_path)
          scores = booster.predict(feature_matrix)

          # Ensure the destination directory exists, then write the scores as text.
          output_file = Path(predictions_path)
          output_file.parent.mkdir(parents=True, exist_ok=True)
          numpy.savetxt(predictions_path, scores)

      import argparse
      # Command-line entry point: turn the container arguments into keyword
      # arguments for xgboost_predict and run it.
      _parser = argparse.ArgumentParser(
          prog='Xgboost predict',
          description='Make predictions using a trained XGBoost model.\n\n    Args:\n        data_path: Path for the feature data in Apache Parquet format.\n        model_path: Path for the trained model in binary XGBoost format.\n        predictions_path: Output path for the predictions.\n        label_column_name: Optional. Name of the column containing the label data that is excluded during the prediction.\n\n    Annotations:\n        author: Alexey Volkov <alexey.volkov@ark-kun.com>',
      )
      # One row per option: (flag, xgboost_predict keyword, converter, required).
      # default=argparse.SUPPRESS keeps an omitted option out of the parsed
      # namespace, so the function's own default applies instead of None.
      for _flag, _dest, _converter, _required in (
          ("--data", "data_path", str, True),
          ("--model", "model_path", str, True),
          ("--label-column-name", "label_column_name", str, False),
          ("--predictions", "predictions_path", _make_parent_dirs_and_return_path, True),
      ):
          _parser.add_argument(_flag, dest=_dest, type=_converter, required=_required, default=argparse.SUPPRESS)
      _parsed_args = vars(_parser.parse_args())

      _outputs = xgboost_predict(**_parsed_args)
    args:
    # Required inputs are passed as paths to local files.
    - --data
    - inputPath: data
    - --model
    - inputPath: model
    # The optional flag and its value are emitted only when the input is given.
    - if:
        cond:
          isPresent: label_column_name
        then:
        - --label-column-name
        - inputValue: label_column_name
    # Path the program must write its predictions to.
    - --predictions
    - outputPath: predictions