pipelines/components/XGBoost/Predict/from_ApacheParquet/component.yaml

103 lines
4.2 KiB
YAML

# Component interface for "Xgboost predict".
# Inputs: feature data (ApacheParquet), a trained model (XGBoostModel) and an
# optional label column name (String). Output: a single Predictions artifact.
name: Xgboost predict
description: |-
Make predictions using a trained XGBoost model.
Args:
data_path: Path for the feature data in Apache Parquet format.
model_path: Path for the trained model in binary XGBoost format.
predictions_path: Output path for the predictions.
label_column_name: Optional. Name of the column containing the label data that is excluded during the prediction.
Annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
inputs:
- {name: data, type: ApacheParquet}
- {name: model, type: XGBoostModel}
- {name: label_column_name, type: String, optional: true}
# The only output is the predictions artifact.
outputs:
- {name: predictions, type: Predictions}
# Annotations record the author and the canonical URL of this component file.
metadata:
annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/XGBoost/Predict/from_ApacheParquet/component.yaml'
# Container implementation: the component runs inside the python:3.7 image.
implementation:
container:
image: python:3.7
# The command chains two invocations. The sh wrapper installs the pinned
# xgboost/pandas/pyarrow versions (repeating the same install with --user if
# the first attempt fails) and then executes the remaining entries through
# "$0" "$@": python3 -u -c followed by the inline program in the literal block.
command:
- sh
- -c
- (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
'xgboost==1.1.1' 'pandas==1.0.5' 'pyarrow==0.17.1' || PIP_DISABLE_PIP_VERSION_CHECK=1
python3 -m pip install --quiet --no-warn-script-location 'xgboost==1.1.1' 'pandas==1.0.5'
'pyarrow==0.17.1' --user) && "$0" "$@"
- python3
- -u
- -c
- |
def _make_parent_dirs_and_return_path(file_path: str):
import os
os.makedirs(os.path.dirname(file_path), exist_ok=True)
return file_path
def xgboost_predict(
    data_path,
    model_path,
    predictions_path,
    label_column_name = None,
):
    '''Run a trained XGBoost model over a Parquet feature table and save the predictions.

    Args:
        data_path: Location of the feature data, stored as Apache Parquet.
        model_path: Location of the trained model in binary XGBoost format.
        predictions_path: File the predictions are written to.
        label_column_name: Optional. Column holding the label data; when given,
            it is dropped from the table before predicting.

    Annotations:
        author: Alexey Volkov <alexey.volkov@ark-kun.com>
    '''
    import pathlib
    import numpy
    import pandas
    import xgboost

    # Read the feature table; a named label column is not a feature and is removed.
    features = pandas.read_parquet(data_path)
    if label_column_name:
        features = features.drop(columns=[label_column_name])
    feature_matrix = xgboost.DMatrix(data=features)

    # Load the trained booster and run inference (no training happens here).
    booster = xgboost.Booster(model_file=model_path)
    predicted_values = booster.predict(feature_matrix)

    # Make sure the destination directory exists, then write the predictions as text.
    output_dir = pathlib.Path(predictions_path).parent
    output_dir.mkdir(parents=True, exist_ok=True)
    numpy.savetxt(predictions_path, predicted_values)
import argparse

# Command-line front end: parse the flags and forward them to xgboost_predict.
_parser = argparse.ArgumentParser(
    prog='Xgboost predict',
    description='Make predictions using a trained XGBoost model.\n\n Args:\n data_path: Path for the feature data in Apache Parquet format.\n model_path: Path for the trained model in binary XGBoost format.\n predictions_path: Output path for the predictions.\n label_column_name: Optional. Name of the column containing the label data that is excluded during the prediction.\n\n Annotations:\n author: Alexey Volkov <alexey.volkov@ark-kun.com>',
)

# One row per option, in registration order:
# (flag, keyword passed to xgboost_predict, converter, required).
_argument_specs = (
    ("--data", "data_path", str, True),
    ("--model", "model_path", str, True),
    ("--label-column-name", "label_column_name", str, False),
    ("--predictions", "predictions_path", _make_parent_dirs_and_return_path, True),
)
for _flag, _dest, _converter, _is_required in _argument_specs:
    # default=SUPPRESS keeps an omitted option out of the parsed namespace, so
    # xgboost_predict uses its own default for that parameter.
    _parser.add_argument(_flag, dest=_dest, type=_converter, required=_is_required, default=argparse.SUPPRESS)

_parsed_args = vars(_parser.parse_args())
_outputs = xgboost_predict(**_parsed_args)
# Arguments appended to the command; they match the flags declared by the
# argparse parser in the inline program.
args:
- --data
- {inputPath: data}
- --model
- {inputPath: model}
# --label-column-name is passed only when the optional label_column_name input
# is present.
- if:
cond: {isPresent: label_column_name}
then:
- --label-column-name
- {inputValue: label_column_name}
# File path for the predictions output.
- --predictions
- {outputPath: predictions}