{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"tags": []
},
"source": [
"# 🪙 American Express - Default Prediction Competition Vanilla KFP Pipeline\n",
"\n",
"\n",
"---\n",
"\n",
"In this [Kaggle competition](https://www.kaggle.com/competitions/amex-default-prediction/overview), you'll use your machine learning expertise to predict credit default. This competition is hosted by American Express.\n",
"\n",
"> American Express is a globally integrated payments company. The largest payment card issuer in the world, they provide customers with access to products, insights, and experiences that enrich lives and build business success.\n",
"\n",
"The dataset provided is industrial-scale, with about 5.5 million rows. It has been pre-processed and converted to a lightweight version by raddar for easier training and better results. The dataset is available in [parquet format][1].\n",
"\n",
"[1]: https://www.kaggle.com/datasets/raddar/amex-data-integer-dtypes-parquet-format"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Install relevant libraries\n",
"\n",
"\n",
"> Update pip: `pip install --user --upgrade pip`\n",
"\n",
"> Install and upgrade the Kubeflow Pipelines SDK: `pip install kfp --upgrade --user --quiet`\n",
"\n",
"You may need to restart your notebook kernel after installing the KFP SDK."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: pip in /usr/local/lib/python3.6/dist-packages (21.3.1)\n"
]
}
],
"source": [
"!pip install --user --upgrade pip"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"!pip install kfp --upgrade --user --quiet"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Name: kfp\n",
"Version: 1.8.11\n",
"Summary: KubeFlow Pipelines SDK\n",
"Home-page: https://github.com/kubeflow/pipelines\n",
"Author: The Kubeflow Authors\n",
"Author-email: \n",
"License: UNKNOWN\n",
"Location: /home/jovyan/.local/lib/python3.6/site-packages\n",
"Requires: absl-py, click, cloudpickle, dataclasses, Deprecated, docstring-parser, fire, google-api-python-client, google-auth, google-cloud-storage, jsonschema, kfp-pipeline-spec, kfp-server-api, kubernetes, protobuf, pydantic, PyYAML, requests-toolbelt, strip-hints, tabulate, typer, typing-extensions, uritemplate\n",
"Required-by: kubeflow-kale\n"
]
}
],
"source": [
"# confirm the kfp sdk\n",
"! pip show kfp"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"tags": [
"imports"
]
},
"outputs": [],
"source": [
"import kfp\n",
"import kfp.components as comp\n",
"import kfp.dsl as dsl\n",
"from kfp.components import OutputPath\n",
"from typing import NamedTuple"
]
},
{
"cell_type": "markdown",
"metadata": {
"tags": []
},
"source": [
"# Kubeflow pipeline component creation\n",
"\n",
"## Download the dataset"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# download data step\n",
"def download_data(dataset, \n",
"                  data_path):\n",
"    \n",
"    # install the necessary libraries\n",
"    import os, sys, subprocess, zipfile\n",
"    subprocess.run([sys.executable, \"-m\", \"pip\", \"install\", \"--upgrade\", \"pip\"])\n",
"    subprocess.run([sys.executable, '-m', 'pip', 'install', 'kaggle'])\n",
"    subprocess.run([sys.executable, '-m', 'pip', 'install', 'wget'])\n",
"    \n",
"    # import libraries\n",
"    import wget\n",
"\n",
"    # setup kaggle environment for data download\n",
"    with open('/secret/kaggle-secret/password', 'r') as file:\n",
"        kaggle_key = file.read().rstrip()\n",
"    with open('/secret/kaggle-secret/username', 'r') as file:\n",
"        kaggle_user = file.read().rstrip()\n",
"    \n",
"    os.environ['KAGGLE_USERNAME'], os.environ['KAGGLE_KEY'] = kaggle_user, kaggle_key\n",
"    \n",
"    # create data_path directory\n",
"    if not os.path.exists(data_path):\n",
"        os.makedirs(data_path)\n",
"    \n",
"    # download kaggle's Amex-credit-prediction data\n",
"    subprocess.run([\"kaggle\", \"datasets\", \"download\", \"-d\", f'raddar/{dataset}'])\n",
"    \n",
"    # extract the downloaded dataset zip to data_path\n",
"    with zipfile.ZipFile(f\"{dataset}.zip\", \"r\") as zip_ref:\n",
"        zip_ref.extractall(data_path)\n",
"\n",
"    # download train_labels.zip from the kubeflow/examples repository\n",
"    download_link = \"https://github.com/kubeflow/examples/blob/master/american-express-default-kaggle-competition/data/train_labels.zip?raw=true\"\n",
"    \n",
"    wget.download(download_link, f'{data_path}/train_labels.zip')\n",
"\n",
"    # extract train_labels.zip to data_path\n",
"    with zipfile.ZipFile(f'{data_path}/train_labels.zip', 'r') as zip_ref:\n",
"        zip_ref.extractall(data_path)\n",
"\n",
"    # delete zip files\n",
"    subprocess.run(['rm', f'{dataset}.zip'])\n",
"    subprocess.run(['rm', f'{data_path}/train_labels.zip'])\n",
"    print('Done!')"
]
},
{
"cell_type": "markdown",
"metadata": {
"tags": []
},
"source": [
"## Load Data"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# load data step\n",
"def load_data(data_path):\n",
"    \n",
"    # install the necessary libraries\n",
"    import sys, subprocess, pickle\n",
"    subprocess.run([sys.executable, \"-m\", \"pip\", \"install\", \"--upgrade\", \"pip\"])\n",
"    subprocess.run([sys.executable, '-m', 'pip', 'install', 'pandas'])\n",
"    subprocess.run([sys.executable, '-m', 'pip', 'install', 'pyarrow'])\n",
"    subprocess.run([sys.executable, '-m', 'pip', 'install', 'fastparquet'])\n",
"    \n",
"    # import libraries\n",
"    import pandas as pd\n",
"\n",
"    TRAIN_PARQUET = f'{data_path}/train.parquet'\n",
"    TEST_PARQUET = f'{data_path}/test.parquet'\n",
"    TARGET_CSV = f'{data_path}/train_labels.csv'\n",
"    \n",
"    # read the train/test parquet files and the target csv\n",
"    df_train = pd.read_parquet(TRAIN_PARQUET)\n",
"    df_test = pd.read_parquet(TEST_PARQUET)\n",
"    target = pd.read_csv(TARGET_CSV).target.values\n",
"    print(f\"target shape: {target.shape}\")\n",
"    \n",
"    # Save all data as a pickle file to be used by the feature_engineering component.\n",
"    with open(f'{data_path}/df_data', 'wb') as f:\n",
"        pickle.dump((df_train, target, df_test), f)\n",
"    \n",
"    print('Done!')"
]
},
{
"cell_type": "markdown",
"metadata": {
"tags": []
},
"source": [
"## Feature Engineering"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# feature engineering step\n",
"\n",
"def feature_engineering(data_path):\n",
"    \n",
"    # install the necessary libraries\n",
"    import sys, subprocess\n",
"    subprocess.run([sys.executable, \"-m\", \"pip\", \"install\", \"--upgrade\", \"pip\"])\n",
"    subprocess.run([sys.executable, '-m', 'pip', 'install', 'pandas'])\n",
"    \n",
"    # import libraries\n",
"    import pickle, gc\n",
"    import numpy as np\n",
"    import pandas as pd\n",
"\n",
"    # loading data\n",
"    with open(f'{data_path}/df_data', 'rb') as f:\n",
"        df_train, target, df_test = pickle.load(f)\n",
"    \n",
"    # feature engineering adapted from https://www.kaggle.com/code/ambrosm/amex-lightgbm-quickstart\n",
"    def get_features(df, \n",
"                     features_avg, \n",
"                     features_min, \n",
"                     features_max, \n",
"                     features_last\n",
"                     ):\n",
"        '''\n",
"        Takes a dataframe with all features and returns the aggregated features grouped by customer ID.\n",
"\n",
"        df - dataframe\n",
"        '''\n",
"        cid = pd.Categorical(df.pop('customer_ID'), ordered=True)  # get customer id\n",
"        last = (cid != np.roll(cid, -1))  # mask for last statement of every customer\n",
"\n",
"        df_avg = (df\n",
"                  .groupby(cid)\n",
"                  .mean()[features_avg]\n",
"                  .rename(columns={f: f\"{f}_avg\" for f in features_avg})\n",
"                  )\n",
"\n",
"        df_min = (df\n",
"                  .groupby(cid)\n",
"                  .min()[features_min]\n",
"                  .rename(columns={f: f\"{f}_min\" for f in features_min})\n",
"                  )\n",
"        gc.collect()\n",
"        print('Computed min')\n",
"\n",
"        df_max = (df\n",
"                  .groupby(cid)\n",
"                  .max()[features_max]\n",
"                  .rename(columns={f: f\"{f}_max\" for f in features_max})\n",
"                  )\n",
"        gc.collect()\n",
"        print('Computed max')\n",
"\n",
"        df = (df.loc[last, features_last]\n",
"              .rename(columns={f: f\"{f}_last\" for f in features_last})\n",
"              .set_index(np.asarray(cid[last]))\n",
"              )\n",
"        gc.collect()\n",
"        print('Computed last')\n",
"\n",
"        df_ = pd.concat([df, df_min, df_max, df_avg], axis=1)\n",
"\n",
"        del df, df_avg, df_min, df_max, cid, last\n",
"\n",
"        return df_\n",
"    \n",
"    features_avg = ['B_1', 'B_2', 'B_3', 'B_4', 'B_5', 'B_6', 'B_8', 'B_9', 'B_10', 'B_11', 'B_12', 'B_13', 'B_14', 'B_15', \n",
"                    'B_16', 'B_17', 'B_18', 'B_19', 'B_20', 'B_21', 'B_22', 'B_23', 'B_24', 'B_25', 'B_28', 'B_29', 'B_30', \n",
"                    'B_32', 'B_33', 'B_37', 'B_38', 'B_39', 'B_40', 'B_41', 'B_42', 'D_39', 'D_41', 'D_42', 'D_43', 'D_44', \n",
"                    'D_45', 'D_46', 'D_47', 'D_48', 'D_50', 'D_51', 'D_53', 'D_54', 'D_55', 'D_58', 'D_59', 'D_60', 'D_61', \n",
"                    'D_62', 'D_65', 'D_66', 'D_69', 'D_70', 'D_71', 'D_72', 'D_73', 'D_74', 'D_75', 'D_76', 'D_77', 'D_78', \n",
"                    'D_80', 'D_82', 'D_84', 'D_86', 'D_91', 'D_92', 'D_94', 'D_96', 'D_103', 'D_104', 'D_108', 'D_112', 'D_113', \n",
"                    'D_114', 'D_115', 'D_117', 'D_118', 'D_119', 'D_120', 'D_121', 'D_122', 'D_123', 'D_124', 'D_125', 'D_126', \n",
"                    'D_128', 'D_129', 'D_131', 'D_132', 'D_133', 'D_134', 'D_135', 'D_136', 'D_140', 'D_141', 'D_142', 'D_144', \n",
"                    'D_145', 'P_2', 'P_3', 'P_4', 'R_1', 'R_2', 'R_3', 'R_7', 'R_8', 'R_9', 'R_10', 'R_11', 'R_14', 'R_15', 'R_16', \n",
"                    'R_17', 'R_20', 'R_21', 'R_22', 'R_24', 'R_26', 'R_27', 'S_3', 'S_5', 'S_6', 'S_7', 'S_9', 'S_11', 'S_12', 'S_13', \n",
"                    'S_15', 'S_16', 'S_18', 'S_22', 'S_23', 'S_25', 'S_26']\n",
"    features_min = ['B_2', 'B_4', 'B_5', 'B_9', 'B_13', 'B_14', 'B_15', 'B_16', 'B_17', 'B_19', 'B_20', 'B_28', 'B_29', 'B_33', 'B_36', \n",
"                    'B_42', 'D_39', 'D_41', 'D_42', 'D_45', 'D_46', 'D_48', 'D_50', 'D_51', 'D_53', 'D_55', 'D_56', 'D_58', 'D_59', \n",
"                    'D_60', 'D_62', 'D_70', 'D_71', 'D_74', 'D_75', 'D_78', 'D_83', 'D_102', 'D_112', 'D_113', 'D_115', 'D_118', 'D_119', \n",
"                    'D_121', 'D_122', 'D_128', 'D_132', 'D_140', 'D_141', 'D_144', 'D_145', 'P_2', 'P_3', 'R_1', 'R_27', 'S_3', 'S_5', \n",
"                    'S_7', 'S_9', 'S_11', 'S_12', 'S_23', 'S_25']\n",
"    features_max = ['B_1', 'B_2', 'B_3', 'B_4', 'B_5', 'B_6', 'B_7', 'B_8', 'B_9', 'B_10', 'B_12', 'B_13', 'B_14', 'B_15', 'B_16', 'B_17', \n",
"                    'B_18', 'B_19', 'B_21', 'B_23', 'B_24', 'B_25', 'B_29', 'B_30', 'B_33', 'B_37', 'B_38', 'B_39', 'B_40', 'B_42', 'D_39', \n",
"                    'D_41', 'D_42', 'D_43', 'D_44', 'D_45', 'D_46', 'D_47', 'D_48', 'D_49', 'D_50', 'D_52', 'D_55', 'D_56', 'D_58', 'D_59', \n",
"                    'D_60', 'D_61', 'D_63', 'D_64', 'D_65', 'D_70', 'D_71', 'D_72', 'D_73', 'D_74', 'D_76', 'D_77', 'D_78', 'D_80', 'D_82', \n",
"                    'D_84', 'D_91', 'D_102', 'D_105', 'D_107', 'D_110', 'D_111', 'D_112', 'D_115', 'D_116', 'D_117', 'D_118', 'D_119', \n",
"                    'D_121', 'D_122', 'D_123', 'D_124', 'D_125', 'D_126', 'D_128', 'D_131', 'D_132', 'D_133', 'D_134', 'D_135', 'D_136', \n",
"                    'D_138', 'D_140', 'D_141', 'D_142', 'D_144', 'D_145', 'P_2', 'P_3', 'P_4', 'R_1', 'R_3', 'R_5', 'R_6', 'R_7', 'R_8', \n",
"                    'R_10', 'R_11', 'R_14', 'R_17', 'R_20', 'R_26', 'R_27', 'S_3', 'S_5', 'S_7', 'S_8', 'S_11', 'S_12', 'S_13', 'S_15', 'S_16', \n",
"                    'S_22', 'S_23', 'S_24', 'S_25', 'S_26', 'S_27']\n",
"    features_last = ['B_1', 'B_2', 'B_3', 'B_4', 'B_5', 'B_6', 'B_7', 'B_8', 'B_9', 'B_10', 'B_11', 'B_12', 'B_13', 'B_14', 'B_15', 'B_16', \n",
"                     'B_17', 'B_18', 'B_19', 'B_20', 'B_21', 'B_22', 'B_23', 'B_24', 'B_25', 'B_26', 'B_28', 'B_29', 'B_30', 'B_32', 'B_33', \n",
"                     'B_36', 'B_37', 'B_38', 'B_39', 'B_40', 'B_41', 'B_42', 'D_39', 'D_41', 'D_42', 'D_43', 'D_44', 'D_45', 'D_46', 'D_47', \n",
"                     'D_48', 'D_49', 'D_50', 'D_51', 'D_52', 'D_53', 'D_54', 'D_55', 'D_56', 'D_58', 'D_59', 'D_60', 'D_61', 'D_62', 'D_63', \n",
"                     'D_64', 'D_65', 'D_69', 'D_70', 'D_71', 'D_72', 'D_73', 'D_75', 'D_76', 'D_77', 'D_78', 'D_79', 'D_80', 'D_81', 'D_82', \n",
"                     'D_83', 'D_86', 'D_91', 'D_96', 'D_105', 'D_106', 'D_112', 'D_114', 'D_119', 'D_120', 'D_121', 'D_122', 'D_124', 'D_125', \n",
"                     'D_126', 'D_127', 'D_130', 'D_131', 'D_132', 'D_133', 'D_134', 'D_138', 'D_140', 'D_141', 'D_142', 'D_145', 'P_2', 'P_3', \n",
"                     'P_4', 'R_1', 'R_2', 'R_3', 'R_4', 'R_5', 'R_6', 'R_7', 'R_8', 'R_9', 'R_10', 'R_11', 'R_12', 'R_13', 'R_14', 'R_15', \n",
"                     'R_19', 'R_20', 'R_26', 'R_27', 'S_3', 'S_5', 'S_6', 'S_7', 'S_8', 'S_9', 'S_11', 'S_12', 'S_13', 'S_16', 'S_19', 'S_20', \n",
"                     'S_22', 'S_23', 'S_24', 'S_25', 'S_26', 'S_27']\n",
"    \n",
"    # apply feature engineering function\n",
"    train = get_features(df_train, features_avg, features_min, features_max, features_last)\n",
"    test = get_features(df_test, features_avg, features_min, features_max, features_last)\n",
"\n",
"    # save the feature engineered data as a pickle file to be used by the modeling component.\n",
"    with open(f'{data_path}/features_df', 'wb') as f:\n",
"        pickle.dump((train, test, target), f)\n",
"    \n",
"    print('Done!')"
]
},
{
"cell_type": "markdown",
"metadata": {
"papermill": {
"duration": 0.01421,
"end_time": "2022-04-17T07:17:13.396620",
"exception": false,
"start_time": "2022-04-17T07:17:13.382410",
"status": "completed"
},
"tags": []
},
"source": [
"## Modelling"
]
},
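{
"cell_type": "markdown",
"metadata": {},
"source": [
"The cell below implements the competition metric from the linked discussion: with defaults (target = 1) weighted 1 and non-defaults weighted 20, it averages the normalized weighted Gini coefficient $G$ and the default rate captured in the top 4% of predictions, $D$:\n",
"\n",
"$$M = \\frac{1}{2}(G + D)$$"
]
},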
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# modeling step\n",
"\n",
"def modeling(data_path):\n",
"    \n",
"    # install the necessary libraries\n",
"    import sys, subprocess\n",
"    subprocess.run([sys.executable, \"-m\", \"pip\", \"install\", \"--upgrade\", \"pip\"])\n",
"    subprocess.run([sys.executable, '-m', 'pip', 'install', 'pandas'])\n",
"    subprocess.run([sys.executable, '-m', 'pip', 'install', 'scikit-learn'])\n",
"    subprocess.run([sys.executable, '-m', 'pip', 'install', 'lightgbm'])\n",
"    \n",
"    # import libraries\n",
"    import pickle, joblib, warnings\n",
"    import pandas as pd\n",
"    import numpy as np\n",
"    from sklearn.model_selection import StratifiedKFold\n",
"    from lightgbm import LGBMClassifier\n",
"    warnings.filterwarnings(\"ignore\")\n",
"    \n",
"    # loading data\n",
"    with open(f'{data_path}/features_df', 'rb') as f:\n",
"        train, test, target = pickle.load(f)\n",
"    \n",
"    # define the evaluation metric\n",
"    # From https://www.kaggle.com/competitions/amex-default-prediction/discussion/328020\n",
"    def amex_metric(y_true: np.array, y_pred: np.array) -> float:\n",
"\n",
"        # count of positives and negatives\n",
"        n_pos = y_true.sum()\n",
"        n_neg = y_true.shape[0] - n_pos\n",
"\n",
"        # sort by descending prediction values\n",
"        indices = np.argsort(y_pred)[::-1]\n",
"        preds, target = y_pred[indices], y_true[indices]\n",
"\n",
"        # filter the top 4% by cumulative row weights\n",
"        weight = 20.0 - target * 19.0\n",
"        cum_norm_weight = (weight / weight.sum()).cumsum()\n",
"        four_pct_filter = cum_norm_weight <= 0.04\n",
"\n",
"        # default rate captured at 4%\n",
"        d = target[four_pct_filter].sum() / n_pos\n",
"\n",
"        # weighted gini coefficient\n",
"        lorentz = (target / n_pos).cumsum()\n",
"        gini = ((lorentz - cum_norm_weight) * weight).sum()\n",
"\n",
"        # max weighted gini coefficient\n",
"        gini_max = 10 * n_neg * (1 - 19 / (n_pos + 20 * n_neg))\n",
"\n",
"        # normalized weighted gini coefficient\n",
"        g = gini / gini_max\n",
"\n",
"        return 0.5 * (g + d)\n",
"\n",
"    def lgb_amex_metric(y_true, y_pred):\n",
"        \"\"\"The competition metric with lightgbm's calling convention\"\"\"\n",
"        return ('amex_metric_score',\n",
"                amex_metric(y_true, y_pred),\n",
"                True)\n",
"    \n",
"    # Cross-validation\n",
"\n",
"    features = [f for f in train.columns if f != 'customer_ID' and f != 'target']\n",
"\n",
"    print(f\"{len(features)} features\")\n",
"\n",
"    score_list = []   # lgbm score per fold\n",
"    y_pred_list = []  # fold predictions list\n",
"\n",
"    # init StratifiedKFold\n",
"    kf = StratifiedKFold(n_splits=4)\n",
"\n",
"    for fold, (idx_tr, idx_va) in enumerate(kf.split(train, target)):\n",
"\n",
"        X_tr, X_va, y_tr, y_va, model = None, None, None, None, None\n",
"\n",
"        X_tr = train.iloc[idx_tr][features]\n",
"        X_va = train.iloc[idx_va][features]\n",
"        y_tr = target[idx_tr]\n",
"        y_va = target[idx_va]\n",
"\n",
"        # init model\n",
"        model = LGBMClassifier(n_estimators=30,\n",
"                               learning_rate=0.1, \n",
"                               num_leaves=100,\n",
"                               random_state=2022)\n",
"        # fit model\n",
"        model.fit(X_tr, y_tr,\n",
"                  eval_set=[(X_va, y_va)], \n",
"                  eval_metric=[lgb_amex_metric],\n",
"                  verbose=20,\n",
"                  early_stopping_rounds=30)\n",
"\n",
"        X_tr, y_tr = None, None\n",
"\n",
"        # fold validation set predictions\n",
"        y_va_pred = model.predict_proba(X_va, raw_score=True)\n",
"\n",
"        # model score\n",
"        score = amex_metric(y_va, y_va_pred)\n",
"\n",
"        print(f\"Score = {score}\")\n",
"        score_list.append(score)\n",
"\n",
"        # test set predictions\n",
"        y_pred_list.append(model.predict_proba(test[features], raw_score=True))\n",
"\n",
"        print(f\"Fold {fold}\")\n",
"\n",
"    # save model\n",
"    joblib.dump(model, f'{data_path}/lgb.jl')\n",
"    \n",
"    print('Done!')"
]
},
{
"cell_type": "markdown",
"metadata": {
"papermill": {
"duration": 0.01428,
"end_time": "2022-04-17T07:17:23.959655",
"exception": false,
"start_time": "2022-04-17T07:17:23.945375",
"status": "completed"
},
"tags": []
},
"source": [
"## Evaluation"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# evaluation step\n",
"\n",
"def evaluation_result(data_path, \n",
"                      metrics_path: OutputPath(str)) -> NamedTuple(\"EvaluationOutput\", [(\"mlpipeline_metrics\", \"Metrics\")]):\n",
"    \n",
"    # install the necessary libraries\n",
"    import sys, subprocess\n",
"    subprocess.run([sys.executable, \"-m\", \"pip\", \"install\", \"--upgrade\", \"pip\"])\n",
"    subprocess.run([sys.executable, '-m', 'pip', 'install', 'scikit-learn'])\n",
"    subprocess.run([sys.executable, '-m', 'pip', 'install', 'lightgbm'])\n",
"    \n",
"    # import libraries\n",
"    import json\n",
"    from collections import namedtuple\n",
"    import joblib\n",
"    import lightgbm as lgb\n",
"    \n",
"    # load model\n",
"    model = joblib.load(f'{data_path}/lgb.jl')\n",
"\n",
"    # model evaluation\n",
"    binary_logloss = model.booster_.best_score.get('valid_0').get('binary_logloss')\n",
"    amex_metric_score = model.booster_.best_score.get('valid_0').get('amex_metric_score')\n",
"    \n",
"    # create kubeflow metric metadata for UI\n",
"    metrics = {\n",
"        'metrics': [\n",
"            {'name': 'binary-logloss',\n",
"             'numberValue': binary_logloss,\n",
"             'format': 'RAW'},\n",
"            {'name': 'amex-metric-score',\n",
"             'numberValue': amex_metric_score,\n",
"             'format': 'RAW'}\n",
"        ]\n",
"    }\n",
"    \n",
"    with open(metrics_path, \"w\") as f:\n",
"        json.dump(metrics, f)\n",
"\n",
"    output_tuple = namedtuple(\"EvaluationOutput\", [\"mlpipeline_metrics\"])\n",
"\n",
"    return output_tuple(json.dumps(metrics))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create pipeline components\n",
"\n",
"using `create_component_from_func`"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# create lightweight components\n",
"download_op = comp.create_component_from_func(download_data, base_image=\"python:3.7.1\")\n",
"load_op = comp.create_component_from_func(load_data, base_image=\"python:3.7.1\")\n",
"feature_eng_op = comp.create_component_from_func(feature_engineering, base_image=\"python:3.7.1\")\n",
"modeling_op = comp.create_component_from_func(modeling, base_image=\"python:3.7.1\")\n",
"evaluation_op = comp.create_component_from_func(evaluation_result, base_image=\"python:3.7.1\")"
]
},
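{
"cell_type": "markdown",
"metadata": {},
"source": [
"As an aside, `create_component_from_func` also accepts a `packages_to_install` argument, so a component's dependencies can be declared once at creation time instead of `pip`-installed via `subprocess` inside each function. A minimal sketch (the alternative op below is illustrative and is not used by the pipeline):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# sketch: declare dependencies at component creation time (kfp v1)\n",
"load_op_alt = comp.create_component_from_func(\n",
"    load_data,\n",
"    base_image=\"python:3.7.1\",\n",
"    packages_to_install=[\"pandas\", \"pyarrow\", \"fastparquet\"])"
]
},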
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Kubeflow pipeline creation"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# define pipeline\n",
"@dsl.pipeline(name=\"american-express-default-prediction-pipeline\", \n",
"              description=\"predicting credit default.\")\n",
"\n",
"# Define parameters to be fed into pipeline\n",
"def american_express_default_prediction_pipeline(\n",
"    dataset: str,\n",
"    data_path: str\n",
"    ):\n",
"    # Define volume to share data between components.\n",
"    vop = dsl.VolumeOp(\n",
"        name=\"create_data_volume\",\n",
"        resource_name=\"data-volume\", \n",
"        size=\"24Gi\", \n",
"        modes=dsl.VOLUME_MODE_RWO)\n",
"    \n",
"    # Create download container.\n",
"    download_container = download_op(dataset, data_path)\\\n",
"        .add_pvolumes({data_path: vop.volume}).add_pod_label(\"kaggle-secret\", \"true\")\n",
"    # Create load container.\n",
"    load_container = load_op(data_path)\\\n",
"        .add_pvolumes({data_path: download_container.pvolume})\n",
"    # Create feature engineering container.\n",
"    feat_eng_container = feature_eng_op(data_path)\\\n",
"        .add_pvolumes({data_path: load_container.pvolume})\n",
"    # Create modeling container.\n",
"    modeling_container = modeling_op(data_path)\\\n",
"        .add_pvolumes({data_path: feat_eng_container.pvolume})\n",
"    # Create evaluation container.\n",
"    evaluation_container = evaluation_op(data_path).add_pvolumes({data_path: modeling_container.pvolume})"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# create a client to communicate with the Pipelines API server\n",
"client = kfp.Client()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"# arguments\n",
"dataset = \"amex-data-integer-dtypes-parquet-format\"\n",
"data_path = \"/mnt\""
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<a href=\"/pipeline/#/experiments/details/2bdff04a-d9f5-4abb-8578-2d8822c501e4\" target=\"_blank\" >Experiment details</a>."
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<a href=\"/pipeline/#/runs/details/e09d582e-7ffd-4530-ad00-9c654f28c00c\" target=\"_blank\" >Run details</a>."
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"pipeline_func = american_express_default_prediction_pipeline\n",
"\n",
"experiment_name = 'american_express_default_prediction_pipeline_lightweight'\n",
"run_name = pipeline_func.__name__ + ' run'\n",
"\n",
"arguments = {\n",
"    \"dataset\": dataset,\n",
"    \"data_path\": data_path\n",
"}\n",
"\n",
"# Compile pipeline to generate compressed YAML definition of the pipeline.\n",
"kfp.compiler.Compiler().compile(pipeline_func, \n",
"                                '{}.zip'.format(experiment_name))\n",
"\n",
"# Submit pipeline directly from pipeline function\n",
"run_result = client.create_run_from_pipeline_func(pipeline_func, \n",
"                                                  experiment_name=experiment_name, \n",
"                                                  run_name=run_name, \n",
"                                                  arguments=arguments\n",
"                                                  )\n"
]
},
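{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optionally, block until the submitted run finishes and check its final status. A minimal sketch using the kfp v1 client (the one-hour timeout is illustrative):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# wait for the run to complete and print its final status\n",
"run_detail = client.wait_for_run_completion(run_result.run_id, timeout=3600)\n",
"print(run_detail.run.status)"
]
},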
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"kubeflow_notebook": {
"autosnapshot": true,
"experiment": {
"id": "2efb8e27-3b2e-439b-a53c-b1f9d7b94cfc",
"name": "g-research-crypto-forecasting"
},
"experiment_name": "g-research-crypto-forecasting",
"katib_metadata": {
"algorithm": {
"algorithmName": "grid"
},
"maxFailedTrialCount": 3,
"maxTrialCount": 12,
"objective": {
"objectiveMetricName": "",
"type": "minimize"
},
"parallelTrialCount": 3,
"parameters": []
},
"katib_run": false,
"pipeline_description": "Predicting credit default.",
"pipeline_name": "american-express-default-prediction-pipeline",
"snapshot_volumes": true,
"steps_defaults": [
"label:access-ml-pipeline:true",
"label:kaggle-secret:true",
"label:access-rok:true"
],
"volume_access_mode": "rwm",
"volumes": [
{
"annotations": [],
"mount_point": "/home/jovyan",
"name": "test-workspace-qtvmt",
"size": 32,
"size_type": "Gi",
"snapshot": false,
"type": "clone"
}
]
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.9"
},
"papermill": {
"default_parameters": {},
"duration": 32.012084,
"end_time": "2022-04-17T07:17:25.053666",
"environment_variables": {},
"exception": null,
"input_path": "__notebook__.ipynb",
"output_path": "__notebook__.ipynb",
"parameters": {},
"start_time": "2022-04-17T07:16:53.041582",
"version": "2.3.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}