# 🪙 American Express - Default Prediction Competition Vanilla KFP Pipeline


---

In this [Kaggle competition](https://www.kaggle.com/competitions/amex-default-prediction/overview), you'll use your machine learning expertise to predict credit default. This competition is hosted by American Express.

> American Express is a globally integrated payments company. The largest payment card issuer in the world, they provide customers with access to products, insights, and experiences that enrich lives and build business success.

The dataset provided is an industrial-scale dataset of about 5.5 million rows. It has been pre-processed and converted to a lightweight version by raddar for easier training and better results. This lightweight version is available in [parquet format][1].

[1]: https://www.kaggle.com/datasets/raddar/amex-data-integer-dtypes-parquet-format

# Install relevant libraries


>Update pip `pip install --user --upgrade pip`

>Install and upgrade kubeflow sdk `pip install kfp --upgrade --user --quiet`

You may need to restart your notebook kernel after installing the KFP SDK.

In [1]:
!pip install --user --upgrade pip



In [2]:
!pip install kfp --upgrade --user --quiet

In [3]:
# confirm the kfp sdk
! pip show kfp

Name: kfp
Version: 1.8.11
Summary: KubeFlow Pipelines SDK
Home-page: https://github.com/kubeflow/pipelines
Author: The Kubeflow Authors
Author-email: 
License: UNKNOWN
Location: /home/jovyan/.local/lib/python3.6/site-packages
Requires: absl-py, click, cloudpickle, dataclasses, Deprecated, docstring-parser, fire, google-api-python-client, google-auth, google-cloud-storage, jsonschema, kfp-pipeline-spec, kfp-server-api, kubernetes, protobuf, pydantic, PyYAML, requests-toolbelt, strip-hints, tabulate, typer, typing-extensions, uritemplate
Required-by: kubeflow-kale


In [4]:
import kfp
import kfp.components as comp
import kfp.dsl as dsl
from kfp.components import OutputPath
from typing import NamedTuple

# Kubeflow pipeline component creation

## Download the dataset

In [5]:
# download data step
def download_data(dataset,
                  data_path):
    """Download the Amex parquet dataset and the training labels into ``data_path``.

    The parquet files are fetched with the ``kaggle`` CLI from the Kaggle
    dataset ``raddar/<dataset>``; the labels come from a zip archive hosted in
    the kubeflow/examples GitHub repository. Both archives are extracted into
    ``data_path`` and the zip files are deleted afterwards.

    Kaggle credentials are read from ``/secret/kaggle-secret/username`` and
    ``/secret/kaggle-secret/password``, so those files must be present.

    Args:
        dataset: Name of the Kaggle dataset owned by ``raddar``.
        data_path: Directory that receives the extracted files; created if missing.

    Raises:
        subprocess.CalledProcessError: If the ``kaggle`` download command fails.
    """
    # Imports and installs live inside the function because it is turned into
    # a standalone component with create_component_from_func.
    import os
    import subprocess
    import sys
    import zipfile

    # Install the necessary libraries with the interpreter that runs this step.
    # The pip self-upgrade is best effort, so its exit status is not checked.
    subprocess.run([sys.executable, "-m", "pip", "install", "--upgrade", "pip"])
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'kaggle'])
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'wget'])

    # Importable only after the install above.
    import wget

    # Set up the kaggle environment for the data download.
    with open('/secret/kaggle-secret/password', 'r') as file:
        kaggle_key = file.read().rstrip()
    with open('/secret/kaggle-secret/username', 'r') as file:
        kaggle_user = file.read().rstrip()

    os.environ['KAGGLE_USERNAME'] = kaggle_user
    os.environ['KAGGLE_KEY'] = kaggle_key

    # Create the data_path directory (no error if it already exists).
    os.makedirs(data_path, exist_ok=True)

    # Download the dataset archive into the working directory. Fail right here
    # if the CLI reports an error instead of tripping over a missing zip below.
    subprocess.run(["kaggle", "datasets", "download", "-d", f'raddar/{dataset}'],
                   check=True)

    # Extract <dataset>.zip to data_path.
    dataset_zip = f"{dataset}.zip"
    with zipfile.ZipFile(dataset_zip, "r") as zip_ref:
        zip_ref.extractall(data_path)

    # Download train_labels.zip.
    download_link = "https://github.com/kubeflow/examples/blob/master/american-express-default-kaggle-competition/data/train_labels.zip?raw=true"
    labels_zip = f'{data_path}/train_labels.zip'
    wget.download(download_link, labels_zip)

    # Extract train_labels.zip to data_path.
    with zipfile.ZipFile(labels_zip, 'r') as zip_ref:
        zip_ref.extractall(data_path)

    # Delete the zip files now that their contents are extracted.
    os.remove(dataset_zip)
    os.remove(labels_zip)

    print('Done!')

## Load Data

In [6]:
# load data step
def load_data(data_path):
    """Read the downloaded Amex files and bundle them into a single pickle.

    Reads ``train.parquet``, ``test.parquet`` and ``train_labels.csv`` from
    ``data_path`` and writes the tuple ``(df_train, target, df_test)`` to
    ``{data_path}/df_data``. ``target`` holds the values of the ``target``
    column of the labels file.

    Args:
        data_path: Directory holding the input files; also receives ``df_data``.
    """
    # Imports and installs live inside the function because it is turned into
    # a standalone component with create_component_from_func.
    import pickle
    import subprocess
    import sys

    # Install the necessary libraries with the interpreter that runs this step.
    subprocess.run([sys.executable, "-m", "pip", "install", "--upgrade", "pip"])
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'pandas'])
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'pyarrow'])
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'fastparquet'])

    # Importable only after the install above.
    import pandas as pd

    train_parquet = f'{data_path}/train.parquet'
    test_parquet = f'{data_path}/test.parquet'
    labels_csv = f'{data_path}/train_labels.csv'

    # Read the train/test parquet files and the label column.
    df_train = pd.read_parquet(train_parquet)
    df_test = pd.read_parquet(test_parquet)
    target = pd.read_csv(labels_csv).target.values
    print(f"target shape: {target.shape}")

    # Save all data as a pickle file to be used by the feature_engineering component.
    with open(f'{data_path}/df_data', 'wb') as f:
        pickle.dump((df_train, target, df_test), f)

    print('Done!')

## Feature Engineering

In [7]:
# feature engineering step

def feature_engineering(data_path):
    """Aggregate the per-statement data into one feature row per customer.

    Loads ``(df_train, target, df_test)`` from ``{data_path}/df_data``, builds
    avg/min/max/last aggregates per ``customer_ID`` for the train and test
    frames, and writes ``(train, test, target)`` to ``{data_path}/features_df``.

    Args:
        data_path: Directory holding ``df_data``; also receives ``features_df``.
    """
    # Imports and installs live inside the function because it is turned into
    # a standalone component with create_component_from_func.
    import gc
    import pickle
    import subprocess
    import sys

    # Install the necessary libraries with the interpreter that runs this step.
    subprocess.run([sys.executable, "-m", "pip", "install", "--upgrade", "pip"])
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'pandas'])

    # Importable only after the install above.
    import numpy as np
    import pandas as pd

    # Load the data written by the load_data step.
    with open(f'{data_path}/df_data', 'rb') as f:
        df_train, target, df_test = pickle.load(f)

    # feature engineering gotten from https://www.kaggle.com/code/ambrosm/amex-lightgbm-quickstart
    def get_features(df,
                     features_avg,
                     features_min,
                     features_max,
                     features_last
                     ):
        '''
        Return the aggregated features of ``df`` grouped by customer id.

        df - dataframe with a 'customer_ID' column and all feature columns;
             the 'customer_ID' column is removed from it in place
        features_avg / features_min / features_max - columns aggregated with
             mean / min / max per customer (suffixes _avg / _min / _max)
        features_last - columns taken from the last row of every customer
             (suffix _last)
        '''
        cid = pd.Categorical(df.pop('customer_ID'), ordered=True)  # get customer id
        # Mask for the last statement of every customer: a row is "last" when
        # the next row belongs to a different customer.
        # NOTE(review): assumes the rows of each customer are contiguous - confirm.
        last = (cid != np.roll(cid, -1))

        # Select the needed columns before aggregating so that only those
        # columns are reduced, instead of aggregating every column and
        # discarding most of the result.
        grouped = df.groupby(cid)

        df_avg = (grouped[features_avg]
                  .mean()
                  .rename(columns={f: f"{f}_avg" for f in features_avg})
                  )
        gc.collect()
        print('Computed avg')

        df_min = (grouped[features_min]
                  .min()
                  .rename(columns={f: f"{f}_min" for f in features_min})
                  )
        gc.collect()
        print('Computed min')

        df_max = (grouped[features_max]
                  .max()
                  .rename(columns={f: f"{f}_max" for f in features_max})
                  )
        gc.collect()
        print('Computed max')

        df_last = (df.loc[last, features_last]
                   .rename(columns={f: f"{f}_last" for f in features_last})
                   .set_index(np.asarray(cid[last]))
                   )
        gc.collect()
        print('Computed last')

        # Column order of the result: last, min, max, avg.
        return pd.concat([df_last, df_min, df_max, df_avg], axis=1)

    features_avg = ['B_1', 'B_2', 'B_3', 'B_4', 'B_5', 'B_6', 'B_8', 'B_9', 'B_10', 'B_11', 'B_12', 'B_13', 'B_14', 'B_15',
                    'B_16', 'B_17', 'B_18', 'B_19', 'B_20', 'B_21', 'B_22', 'B_23', 'B_24', 'B_25', 'B_28', 'B_29', 'B_30',
                    'B_32', 'B_33', 'B_37', 'B_38', 'B_39', 'B_40', 'B_41', 'B_42', 'D_39', 'D_41', 'D_42', 'D_43', 'D_44',
                    'D_45', 'D_46', 'D_47', 'D_48', 'D_50', 'D_51', 'D_53', 'D_54', 'D_55', 'D_58', 'D_59', 'D_60', 'D_61',
                    'D_62', 'D_65', 'D_66', 'D_69', 'D_70', 'D_71', 'D_72', 'D_73', 'D_74', 'D_75', 'D_76', 'D_77', 'D_78',
                    'D_80', 'D_82', 'D_84', 'D_86', 'D_91', 'D_92', 'D_94', 'D_96', 'D_103', 'D_104', 'D_108', 'D_112', 'D_113',
                    'D_114', 'D_115', 'D_117', 'D_118', 'D_119', 'D_120', 'D_121', 'D_122', 'D_123', 'D_124', 'D_125', 'D_126',
                    'D_128', 'D_129', 'D_131', 'D_132', 'D_133', 'D_134', 'D_135', 'D_136', 'D_140', 'D_141', 'D_142', 'D_144',
                    'D_145', 'P_2', 'P_3', 'P_4', 'R_1', 'R_2', 'R_3', 'R_7', 'R_8', 'R_9', 'R_10', 'R_11', 'R_14', 'R_15', 'R_16',
                    'R_17', 'R_20', 'R_21', 'R_22', 'R_24', 'R_26', 'R_27', 'S_3', 'S_5', 'S_6', 'S_7', 'S_9', 'S_11', 'S_12', 'S_13',
                    'S_15', 'S_16', 'S_18', 'S_22', 'S_23', 'S_25', 'S_26']
    features_min = ['B_2', 'B_4', 'B_5', 'B_9', 'B_13', 'B_14', 'B_15', 'B_16', 'B_17', 'B_19', 'B_20', 'B_28', 'B_29', 'B_33', 'B_36',
                    'B_42', 'D_39', 'D_41', 'D_42', 'D_45', 'D_46', 'D_48', 'D_50', 'D_51', 'D_53', 'D_55', 'D_56', 'D_58', 'D_59',
                    'D_60', 'D_62', 'D_70', 'D_71', 'D_74', 'D_75', 'D_78', 'D_83', 'D_102', 'D_112', 'D_113', 'D_115', 'D_118', 'D_119',
                    'D_121', 'D_122', 'D_128', 'D_132', 'D_140', 'D_141', 'D_144', 'D_145', 'P_2', 'P_3', 'R_1', 'R_27', 'S_3', 'S_5',
                    'S_7', 'S_9', 'S_11', 'S_12', 'S_23', 'S_25']
    features_max = ['B_1', 'B_2', 'B_3', 'B_4', 'B_5', 'B_6', 'B_7', 'B_8', 'B_9', 'B_10', 'B_12', 'B_13', 'B_14', 'B_15', 'B_16', 'B_17',
                    'B_18', 'B_19', 'B_21', 'B_23', 'B_24', 'B_25', 'B_29', 'B_30', 'B_33', 'B_37', 'B_38', 'B_39', 'B_40', 'B_42', 'D_39',
                    'D_41', 'D_42', 'D_43', 'D_44', 'D_45', 'D_46', 'D_47', 'D_48', 'D_49', 'D_50', 'D_52', 'D_55', 'D_56', 'D_58', 'D_59',
                    'D_60', 'D_61', 'D_63', 'D_64', 'D_65', 'D_70', 'D_71', 'D_72', 'D_73', 'D_74', 'D_76', 'D_77', 'D_78', 'D_80', 'D_82',
                    'D_84', 'D_91', 'D_102', 'D_105', 'D_107', 'D_110', 'D_111', 'D_112', 'D_115', 'D_116', 'D_117', 'D_118', 'D_119',
                    'D_121', 'D_122', 'D_123', 'D_124', 'D_125', 'D_126', 'D_128', 'D_131', 'D_132', 'D_133', 'D_134', 'D_135', 'D_136',
                    'D_138', 'D_140', 'D_141', 'D_142', 'D_144', 'D_145', 'P_2', 'P_3', 'P_4', 'R_1', 'R_3', 'R_5', 'R_6', 'R_7', 'R_8',
                    'R_10', 'R_11', 'R_14', 'R_17', 'R_20', 'R_26', 'R_27', 'S_3', 'S_5', 'S_7', 'S_8', 'S_11', 'S_12', 'S_13', 'S_15', 'S_16',
                    'S_22', 'S_23', 'S_24', 'S_25', 'S_26', 'S_27']
    features_last = ['B_1', 'B_2', 'B_3', 'B_4', 'B_5', 'B_6', 'B_7', 'B_8', 'B_9', 'B_10', 'B_11', 'B_12', 'B_13', 'B_14', 'B_15', 'B_16',
                     'B_17', 'B_18', 'B_19', 'B_20', 'B_21', 'B_22', 'B_23', 'B_24', 'B_25', 'B_26', 'B_28', 'B_29', 'B_30', 'B_32', 'B_33',
                     'B_36', 'B_37', 'B_38', 'B_39', 'B_40', 'B_41', 'B_42', 'D_39', 'D_41', 'D_42', 'D_43', 'D_44', 'D_45', 'D_46', 'D_47',
                     'D_48', 'D_49', 'D_50', 'D_51', 'D_52', 'D_53', 'D_54', 'D_55', 'D_56', 'D_58', 'D_59', 'D_60', 'D_61', 'D_62', 'D_63',
                     'D_64', 'D_65', 'D_69', 'D_70', 'D_71', 'D_72', 'D_73', 'D_75', 'D_76', 'D_77', 'D_78', 'D_79', 'D_80', 'D_81', 'D_82',
                     'D_83', 'D_86', 'D_91', 'D_96', 'D_105', 'D_106', 'D_112', 'D_114', 'D_119', 'D_120', 'D_121', 'D_122', 'D_124', 'D_125',
                     'D_126', 'D_127', 'D_130', 'D_131', 'D_132', 'D_133', 'D_134', 'D_138', 'D_140', 'D_141', 'D_142', 'D_145', 'P_2', 'P_3',
                     'P_4', 'R_1', 'R_2', 'R_3', 'R_4', 'R_5', 'R_6', 'R_7', 'R_8', 'R_9', 'R_10', 'R_11', 'R_12', 'R_13', 'R_14', 'R_15',
                     'R_19', 'R_20', 'R_26', 'R_27', 'S_3', 'S_5', 'S_6', 'S_7', 'S_8', 'S_9', 'S_11', 'S_12', 'S_13', 'S_16', 'S_19', 'S_20',
                     'S_22', 'S_23', 'S_24', 'S_25', 'S_26', 'S_27']

    # Apply the feature engineering function to both frames.
    train = get_features(df_train, features_avg, features_min, features_max, features_last)
    test = get_features(df_test, features_avg, features_min, features_max, features_last)

    # Save the feature engineered data as a pickle file to be used by the modeling component.
    with open(f'{data_path}/features_df', 'wb') as f:
        pickle.dump((train, test, target), f)

    print('Done!')

## Modelling
 

In [8]:
# modeling step

def modeling(data_path):
    """Cross-validate a LightGBM classifier and save the last fold's model.

    Loads ``(train, test, target)`` from ``{data_path}/features_df``, runs a
    4-fold stratified cross-validation with ``LGBMClassifier``, prints the
    competition metric of every fold and writes the model fitted in the last
    fold to ``{data_path}/lgb.jl``.

    Args:
        data_path: Directory holding ``features_df``; also receives ``lgb.jl``.
    """
    # Imports and installs live inside the function because it is turned into
    # a standalone component with create_component_from_func.
    import subprocess
    import sys

    # Install the necessary libraries with the interpreter that runs this step.
    subprocess.run([sys.executable, "-m", "pip", "install", "--upgrade", "pip"])
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'pandas'])
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'scikit-learn'])
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'lightgbm'])

    # Importable only after the installs above.
    import pickle
    import warnings

    import joblib
    import numpy as np
    from lightgbm import LGBMClassifier, early_stopping, log_evaluation
    from sklearn.model_selection import StratifiedKFold

    warnings.filterwarnings("ignore")

    # Load the data written by the feature_engineering step.
    with open(f'{data_path}/features_df', 'rb') as f:
        train, test, target = pickle.load(f)

    # define the evaluation metric
    # From https://www.kaggle.com/competitions/amex-default-prediction/discussion/328020
    def amex_metric(y_true: np.ndarray, y_pred: np.ndarray) -> float:
        """Return the mean of the normalized weighted gini and the default rate captured at 4%."""
        # count of positives and negatives
        n_pos = y_true.sum()
        n_neg = y_true.shape[0] - n_pos

        # labels sorted by descending prediction value
        indices = np.argsort(y_pred)[::-1]
        sorted_true = y_true[indices]

        # filter the top 4% by cumulative row weights
        # (weight is 20 for a label of 0 and 1 for a label of 1)
        weight = 20.0 - sorted_true * 19.0
        cum_norm_weight = (weight / weight.sum()).cumsum()
        four_pct_filter = cum_norm_weight <= 0.04

        # default rate captured at 4%
        d = sorted_true[four_pct_filter].sum() / n_pos

        # weighted gini coefficient
        lorentz = (sorted_true / n_pos).cumsum()
        gini = ((lorentz - cum_norm_weight) * weight).sum()

        # max weighted gini coefficient
        gini_max = 10 * n_neg * (1 - 19 / (n_pos + 20 * n_neg))

        # normalized weighted gini coefficient
        g = gini / gini_max

        return 0.5 * (g + d)

    def lgb_amex_metric(y_true, y_pred):
        """The competition metric with lightgbm's calling convention"""
        return ('amex_metric_score',
                amex_metric(y_true, y_pred),
                True)  # True: higher is better

    # Cross-validation

    features = [f for f in train.columns if f != 'customer_ID' and f != 'target']

    print(f"{len(features)} features")

    score_list = []   # lgbm score per fold
    # fold predictions list
    # NOTE(review): collected but never persisted or returned by this step.
    y_pred_list = []

    # init StratifiedKFold
    kf = StratifiedKFold(n_splits=4)

    for fold, (idx_tr, idx_va) in enumerate(kf.split(train, target)):

        # Drop the references to the previous fold's data before building new splits.
        X_tr, X_va, y_tr, y_va, model = None, None, None, None, None

        X_tr = train.iloc[idx_tr][features]
        X_va = train.iloc[idx_va][features]
        y_tr = target[idx_tr]
        y_va = target[idx_va]

        # init model
        model = LGBMClassifier(n_estimators=30,
                               learning_rate=0.1,
                               num_leaves=100,
                               random_state=2022)
        # fit model
        # Early stopping and periodic logging are passed as callbacks: the
        # `early_stopping_rounds` / `verbose` arguments of fit() are deprecated
        # and no longer accepted by newer LightGBM releases.
        model.fit(X_tr, y_tr,
                  eval_set=[(X_va, y_va)],
                  eval_metric=[lgb_amex_metric],
                  callbacks=[early_stopping(stopping_rounds=30),
                             log_evaluation(period=20)])

        X_tr, y_tr = None, None

        # fold validation set predictions
        y_va_pred = model.predict_proba(X_va, raw_score=True)

        # model score
        score = amex_metric(y_va, y_va_pred)

        print(f"Score = {score}")
        score_list.append(score)

        # test set predictions
        y_pred_list.append(model.predict_proba(test[features], raw_score=True))

        print(f"Fold {fold}")

    print(f"Mean CV score = {np.mean(score_list)}")

    # save model (only the model fitted in the last fold is kept)
    joblib.dump(model, f'{data_path}/lgb.jl')

    print('Done!')

## Evaluation

In [9]:
# evaluation step

def evaluation_result(data_path,
                      metrics_path: OutputPath(str)) -> NamedTuple("EvaluationOutput", [("mlpipeline_metrics", "Metrics")]):
    """Export the validation scores of the saved model as pipeline metrics.

    Loads the model from ``{data_path}/lgb.jl``, reads the ``binary_logloss``
    and ``amex_metric_score`` entries recorded for the ``valid_0`` evaluation
    set, writes them as JSON to ``metrics_path`` and returns the same JSON
    string as the ``mlpipeline_metrics`` output.

    Args:
        data_path: Directory holding ``lgb.jl``.
        metrics_path: File path the metrics JSON is written to.

    Returns:
        An ``EvaluationOutput`` tuple whose ``mlpipeline_metrics`` field is the
        metrics JSON string.
    """
    # Imports and installs live inside the function because it is turned into
    # a standalone component with create_component_from_func.
    import json
    import subprocess
    import sys
    from collections import namedtuple

    # Install the necessary libraries with the interpreter that runs this step.
    subprocess.run([sys.executable, "-m", "pip", "install", "--upgrade", "pip"])
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'lightgbm'])
    # joblib is imported below and scikit-learn is needed to unpickle the
    # LGBMClassifier; install both explicitly instead of relying on them
    # arriving as transitive dependencies of lightgbm.
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'scikit-learn'])
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'joblib'])

    # Importable only after the installs above.
    import joblib

    # load model
    model = joblib.load(f'{data_path}/lgb.jl')

    # model evaluation: scores recorded for the first validation set
    valid_scores = model.booster_.best_score.get('valid_0')
    binary_logloss = valid_scores.get('binary_logloss')
    amex_metric_score = valid_scores.get('amex_metric_score')

    # create kubeflow metric metadata for UI
    metrics = {
        'metrics': [
            {'name': 'binary-logloss',
             'numberValue': binary_logloss,
             'format': 'RAW'},
            {'name': 'amex-metric-score',
             'numberValue': amex_metric_score,
             'format': 'RAW'}
        ]
    }

    with open(metrics_path, "w") as f:
        json.dump(metrics, f)

    output_tuple = namedtuple("EvaluationOutput", ["mlpipeline_metrics"])

    return output_tuple(json.dumps(metrics))

## Create pipeline components 

using `create_component_from_func`

In [10]:
# Wrap every step function as a lightweight component; all of them use the
# same base image.
download_op = comp.create_component_from_func(
    func=download_data,
    base_image="python:3.7.1",
)
load_op = comp.create_component_from_func(
    func=load_data,
    base_image="python:3.7.1",
)
feature_eng_op = comp.create_component_from_func(
    func=feature_engineering,
    base_image="python:3.7.1",
)
modeling_op = comp.create_component_from_func(
    func=modeling,
    base_image="python:3.7.1",
)
evaluation_op = comp.create_component_from_func(
    func=evaluation_result,
    base_image="python:3.7.1",
)

## Kubeflow pipeline creation

In [11]:
# Pipeline definition: download -> load -> feature engineering -> modeling ->
# evaluation. Every step mounts the same volume at `data_path`.
@dsl.pipeline(
    name="american-express-default-prediction-pipeline",
    description="predicting credit default.",
)
def american_express_default_prediction_pipeline(
    dataset: str,
    data_path: str
):
    # Volume used to share data between the steps.
    volume_op = dsl.VolumeOp(
        name="create_data_volume",
        resource_name="data-volume",
        size="24Gi",
        modes=dsl.VOLUME_MODE_RWO,
    )

    # Download step; the pod carries the "kaggle-secret" label.
    download_task = (
        download_op(dataset, data_path)
        .add_pvolumes({data_path: volume_op.volume})
        .add_pod_label("kaggle-secret", "true")
    )

    # Load step, mounted on the volume as left by the download step.
    load_task = (
        load_op(data_path)
        .add_pvolumes({data_path: download_task.pvolume})
    )

    # Feature engineering step.
    feature_task = (
        feature_eng_op(data_path)
        .add_pvolumes({data_path: load_task.pvolume})
    )

    # Modeling step.
    modeling_task = (
        modeling_op(data_path)
        .add_pvolumes({data_path: feature_task.pvolume})
    )

    # Evaluation step.
    evaluation_task = (
        evaluation_op(data_path)
        .add_pvolumes({data_path: modeling_task.pvolume})
    )

In [12]:
# Create the client used to communicate with the Pipelines API server.
# It is constructed without arguments, i.e. with kfp.Client()'s default settings.
client = kfp.Client()

In [13]:
# Pipeline arguments:
#   dataset   - Kaggle dataset name; the download step fetches it as raddar/<dataset>
#   data_path - path handed to every step, also used as the mount path of the shared volume
dataset = "amex-data-integer-dtypes-parquet-format"
data_path = "/mnt"

In [14]:
pipeline_func = american_express_default_prediction_pipeline

experiment_name = 'american_express_default_prediction_pipeline_lightweight'
run_name = f"{pipeline_func.__name__} run"

# Values for the two pipeline parameters.
arguments = dict(dataset=dataset, data_path=data_path)

# Compile the pipeline into a compressed YAML definition named after the experiment.
kfp.compiler.Compiler().compile(
    pipeline_func=pipeline_func,
    package_path=f'{experiment_name}.zip',
)

# Submit a run straight from the pipeline function.
run_result = client.create_run_from_pipeline_func(
    pipeline_func,
    arguments=arguments,
    run_name=run_name,
    experiment_name=experiment_name,
)
