name: Xgboost predict
description: |-
  Make predictions using a trained XGBoost model.

  Args:
      data_path: Path for the feature data in Apache Parquet format.
      model_path: Path for the trained model in binary XGBoost format.
      predictions_path: Output path for the predictions.
      label_column_name: Optional. Name of the column containing the label data that is excluded during the prediction.

  Annotations:
      author: Alexey Volkov <alexey.volkov@ark-kun.com>
inputs:
- {name: data, type: ApacheParquet}
- {name: model, type: XGBoostModel}
- {name: label_column_name, type: String, optional: true}
outputs:
- {name: predictions, type: Predictions}
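# The type names above (ApacheParquet, XGBoostModel, Predictions) are KFP artifact
# type annotations used for type checking when components are wired together in a
# pipeline; the data itself is exchanged as files at the paths passed below.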
metadata:
  annotations:
    author: Alexey Volkov <alexey.volkov@ark-kun.com>
    canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/XGBoost/Predict/from_ApacheParquet/component.yaml'
implementation:
  container:
    image: python:3.7
    command:
    - sh
    - -c
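    # Install the pinned Python dependencies at container start, falling back to a
    # --user install if the system site-packages are not writable, then run the
    # Python program passed as the remaining arguments.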
    - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
      'xgboost==1.1.1' 'pandas==1.0.5' 'pyarrow==0.17.1' || PIP_DISABLE_PIP_VERSION_CHECK=1
      python3 -m pip install --quiet --no-warn-script-location 'xgboost==1.1.1' 'pandas==1.0.5'
      'pyarrow==0.17.1' --user) && "$0" "$@"
    - python3
    - -u
    - -c
    - |
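      # This program runs inside the container: it reads the Parquet features,
      # applies the trained XGBoost model, and writes the predictions file.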
      def _make_parent_dirs_and_return_path(file_path: str):
          import os
          os.makedirs(os.path.dirname(file_path), exist_ok=True)
          return file_path

      def xgboost_predict(
          data_path,
          model_path,
          predictions_path,
          label_column_name = None,
      ):
          '''Make predictions using a trained XGBoost model.

          Args:
              data_path: Path for the feature data in Apache Parquet format.
              model_path: Path for the trained model in binary XGBoost format.
              predictions_path: Output path for the predictions.
              label_column_name: Optional. Name of the column containing the label data that is excluded during the prediction.

          Annotations:
              author: Alexey Volkov <alexey.volkov@ark-kun.com>
          '''
          from pathlib import Path

          import numpy
          import pandas
          import xgboost

          # Loading data
          df = pandas.read_parquet(data_path)
          if label_column_name:
              df = df.drop(columns=[label_column_name])

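          # Wrap the features in a DMatrix, the input structure XGBoost expects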
          evaluation_data = xgboost.DMatrix(
              data=df,
          )

          # Loading the trained model and computing predictions
          model = xgboost.Booster(model_file=model_path)

          predictions = model.predict(evaluation_data)

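          # Write the predictions to a plain-text file using numpy.savetxt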
          Path(predictions_path).parent.mkdir(parents=True, exist_ok=True)
          numpy.savetxt(predictions_path, predictions)

      import argparse
      _parser = argparse.ArgumentParser(prog='Xgboost predict', description='Make predictions using a trained XGBoost model.\n\n    Args:\n        data_path: Path for the feature data in Apache Parquet format.\n        model_path: Path for the trained model in binary XGBoost format.\n        predictions_path: Output path for the predictions.\n        label_column_name: Optional. Name of the column containing the label data that is excluded during the prediction.\n\n    Annotations:\n        author: Alexey Volkov <alexey.volkov@ark-kun.com>')
      _parser.add_argument("--data", dest="data_path", type=str, required=True, default=argparse.SUPPRESS)
      _parser.add_argument("--model", dest="model_path", type=str, required=True, default=argparse.SUPPRESS)
      _parser.add_argument("--label-column-name", dest="label_column_name", type=str, required=False, default=argparse.SUPPRESS)
      _parser.add_argument("--predictions", dest="predictions_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)
      _parsed_args = vars(_parser.parse_args())

      _outputs = xgboost_predict(**_parsed_args)
    args:
    - --data
    - {inputPath: data}
    - --model
    - {inputPath: model}
    - if:
        cond: {isPresent: label_column_name}
        then:
        - --label-column-name
        - {inputValue: label_column_name}
    - --predictions
    - {outputPath: predictions}
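
# A minimal usage sketch (not part of the component itself): loading this file in a
# KFP v1 pipeline with kfp.components.load_component_from_url. The upstream tasks
# (convert_task, train_task) and the 'tips' label column are hypothetical.
#
#     from kfp import components
#
#     xgboost_predict_op = components.load_component_from_url(
#         'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/XGBoost/Predict/from_ApacheParquet/component.yaml')
#
#     # Inside a pipeline function, wire the inputs to upstream outputs:
#     predict_task = xgboost_predict_op(
#         data=convert_task.outputs['output_data'],
#         model=train_task.outputs['model'],
#         label_column_name='tips',
#     )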