{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"tags": []
|
|
},
|
|
"source": [
|
|
"# Telco Customer Churn Prediction Kubeflow Pipeline\n",
|
|
"\n",
|
|
"In this [Kaggle competition](https://www.kaggle.com/datasets/blastchar/telco-customer-churn) \n",
|
|
"\n",
|
|
">In this competition, your goal is to analyze behavior to retain customers and predict churning. You can analyze all relevant customer data and develop focused customer retention programs."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"tags": []
|
|
},
|
|
"source": [
|
|
"# Install relevant libraries\n",
|
|
"\n",
|
|
"\n",
|
|
">Update pip `pip install --user --upgrade pip`\n",
|
|
"\n",
|
|
">Install and upgrade kubeflow sdk `pip install kfp --upgrade --user --quiet`\n",
|
|
"\n",
|
|
"You may need to restart your notebook kernel after installing the kfp sdk"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"metadata": {
|
|
"tags": [
|
|
"skip"
|
|
]
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Requirement already satisfied: pip in /usr/local/lib/python3.6/dist-packages (21.3.1)\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"!pip install --user --upgrade pip"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"!pip install kfp --upgrade --user --quiet"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Name: kfp\n",
|
|
"Version: 1.8.11\n",
|
|
"Summary: KubeFlow Pipelines SDK\n",
|
|
"Home-page: https://github.com/kubeflow/pipelines\n",
|
|
"Author: The Kubeflow Authors\n",
|
|
"Author-email: \n",
|
|
"License: UNKNOWN\n",
|
|
"Location: /home/jovyan/.local/lib/python3.6/site-packages\n",
|
|
"Requires: absl-py, click, cloudpickle, dataclasses, Deprecated, docstring-parser, fire, google-api-python-client, google-auth, google-cloud-storage, jsonschema, kfp-pipeline-spec, kfp-server-api, kubernetes, protobuf, pydantic, PyYAML, requests-toolbelt, strip-hints, tabulate, typer, typing-extensions, uritemplate\n",
|
|
"Required-by: kubeflow-kale\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# confirm the kfp sdk\n",
|
|
"! pip show kfp"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"tags": []
|
|
},
|
|
"source": [
|
|
"# Imports\n",
|
|
"\n",
|
|
"In this section we import the kfp methods we need for this example. Make it a habit to gather your imports in a single place."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import kfp\n",
|
|
"import kfp.components as comp\n",
|
|
"import kfp.dsl as dsl\n",
|
|
"from kfp.components import InputPath, OutputPath\n",
|
|
"from typing import NamedTuple"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Kubeflow pipeline component creation\n",
|
|
"\n",
|
|
"## Download and load the dataset"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# load data step\n",
|
|
"def load_data(download_link: str, data_path: OutputPath(str)):\n",
|
|
" \n",
|
|
" # install the necessary libraries\n",
|
|
" import os, sys, pickle, subprocess;\n",
|
|
" subprocess.run([\"python\", \"-m\", \"pip\", \"install\", \"--upgrade\", \"pip\"])\n",
|
|
" subprocess.run([sys.executable, '-m', 'pip', 'install','pandas'])\n",
|
|
" subprocess.run([sys.executable, \"-m\", \"pip\", \"install\", \"wget\"])\n",
|
|
" import wget\n",
|
|
" \n",
|
|
" # import libraries\n",
|
|
" import pandas as pd\n",
|
|
" \n",
|
|
" # create data_path directory\n",
|
|
" if not os.path.exists(data_path):\n",
|
|
" os.makedirs(data_path)\n",
|
|
"\n",
|
|
" # download data\n",
|
|
" wget.download(download_link, f'{data_path}/Telco-Customer-Churn.csv')\n",
|
|
" \n",
|
|
" # read data\n",
|
|
" data = pd.read_csv(f\"{data_path}/Telco-Customer-Churn.csv\")\n",
|
|
"\n",
|
|
" # Save data as a pickle file to be used by the tranform_data component.\n",
|
|
" with open(f'{data_path}/data', 'wb') as f:\n",
|
|
" pickle.dump(data, f)\n",
|
|
"\n",
|
|
" return(print('Done!'))"
|
|
]
|
|
},
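{
"cell_type": "markdown",
"metadata": {},
"source": [
"Before wiring this function into a pipeline component, it can be handy to call it directly in the notebook as a quick smoke test. A minimal sketch, assuming the raw CSV link used later in this notebook and an illustrative local directory:\n",
"\n",
"```python\n",
"# Hypothetical local smoke test of load_data; '/tmp/churn_data' is an illustrative path.\n",
"csv_link = (\"https://github.com/kubeflow/examples/blob/master/telco-customer-churn-kaggle-competition/\"\n",
"            \"data/WA_Fn-UseC_-Telco-Customer-Churn.csv?raw=true\")\n",
"load_data(download_link=csv_link, data_path=\"/tmp/churn_data\")\n",
"```"
]
},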
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Transform Data"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# transform data step\n",
|
|
"\n",
|
|
"def transform_data(data_path: InputPath(str), \n",
|
|
" transform_data_path: OutputPath(str)):\n",
|
|
" \n",
|
|
" # install the necessary libraries\n",
|
|
" import sys, subprocess;\n",
|
|
" subprocess.run([\"python\", \"-m\", \"pip\", \"install\", \"--upgrade\", \"pip\"])\n",
|
|
" subprocess.run([sys.executable, '-m', 'pip', 'install','pandas'])\n",
|
|
" subprocess.run([sys.executable, '-m', 'pip', 'install','scipy'])\n",
|
|
" \n",
|
|
" # import Libraries\n",
|
|
" import os, pickle;\n",
|
|
" import pandas as pd\n",
|
|
" import numpy as np\n",
|
|
"\n",
|
|
" \n",
|
|
" # load data from data_path\n",
|
|
" with open(f'{data_path}/data', 'rb') as f:\n",
|
|
" data = pickle.load(f)\n",
|
|
"\n",
|
|
" # remove rows with spaces in TotalCharges column\n",
|
|
" data = data[data['TotalCharges'] !=' '].copy()\n",
|
|
" \n",
|
|
" # convert TotalCharges column datatype to float \n",
|
|
" data['TotalCharges'] = data['TotalCharges'].astype(float)\n",
|
|
" \n",
|
|
" # convert Churn targe from string to integers\n",
|
|
" # replace no with 1 and yes with 0\n",
|
|
" data['Churn'] = data['Churn'].replace({'No':1, 'Yes':0})\n",
|
|
" \n",
|
|
" #creating the transform_data_path\n",
|
|
" os.makedirs(transform_data_path, exist_ok = True)\n",
|
|
" \n",
|
|
" #Save data as a pickle file to be used by the feature_engineering component.\n",
|
|
" with open(f'{transform_data_path}/data', 'wb') as f:\n",
|
|
" pickle.dump(data, f)\n",
|
|
" \n",
|
|
" return(print('Done!'))"
|
|
]
|
|
},
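{
"cell_type": "markdown",
"metadata": {},
"source": [
"The rows dropped above are the ones where `TotalCharges` holds a single blank string instead of a number, which is what would break the cast to float. A quick check you could run on the raw CSV (a sketch, not part of the pipeline; the path is illustrative):\n",
"\n",
"```python\n",
"import pandas as pd\n",
"\n",
"# Assumes the CSV downloaded by load_data is available locally (illustrative path).\n",
"raw = pd.read_csv('/tmp/churn_data/Telco-Customer-Churn.csv')\n",
"blank_rows = (raw['TotalCharges'] == ' ').sum()\n",
"print(f'{blank_rows} rows have a blank TotalCharges value')\n",
"```"
]
},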
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"id": "G_rB_8FLfP4x",
|
|
"tags": []
|
|
},
|
|
"source": [
|
|
"# Feature Engineering\n",
|
|
"\n",
|
|
"Grouping the tenure, monthly charge and total charge column into different segments"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# feature engineering step\n",
|
|
"\n",
|
|
"def feature_engineering(transform_data_path: InputPath(str), \n",
|
|
" feat_eng_path: OutputPath(str)):\n",
|
|
" \n",
|
|
" # install the necessary libraries\n",
|
|
" import sys, subprocess;\n",
|
|
" subprocess.run([\"python\", \"-m\", \"pip\", \"install\", \"--upgrade\", \"pip\"])\n",
|
|
" subprocess.run([sys.executable, '-m', 'pip', 'install','pandas'])\n",
|
|
" subprocess.run([sys.executable, '-m', 'pip', 'install','scikit-learn'])\n",
|
|
" \n",
|
|
" \n",
|
|
" \n",
|
|
" # import Library\n",
|
|
" import os, pickle;\n",
|
|
" import numpy as np\n",
|
|
" import pandas as pd\n",
|
|
" from sklearn.model_selection import train_test_split\n",
|
|
" from sklearn.preprocessing import MinMaxScaler\n",
|
|
" \n",
|
|
" # loading the data\n",
|
|
" with open(f'{transform_data_path}/data', 'rb') as f:\n",
|
|
" data = pickle.load(f)\n",
|
|
" \n",
|
|
" def yearly_tenure(tenure):\n",
|
|
" if tenure <= 12:\n",
|
|
" return 1\n",
|
|
" elif tenure >12 and tenure <=24:\n",
|
|
" return 2\n",
|
|
" elif tenure >24 and tenure <=36:\n",
|
|
" return 3\n",
|
|
" elif tenure >36 and tenure <=48:\n",
|
|
" return 4\n",
|
|
" elif tenure >48 and tenure <=60:\n",
|
|
" return 5\n",
|
|
" elif tenure > 60 and tenure <=72:\n",
|
|
" return 6\n",
|
|
"\n",
|
|
" def monthly_charge_plan(charge):\n",
|
|
" if charge <= 35:\n",
|
|
" return 'Basic'\n",
|
|
" elif charge>35 and charge <= 80:\n",
|
|
" return 'Advanced'\n",
|
|
" elif charge>80:\n",
|
|
" return 'Premium'\n",
|
|
" \n",
|
|
" def total_charge_status(charge):\n",
|
|
" if charge <= 250:\n",
|
|
" return 'V-low'\n",
|
|
" elif charge > 250 and charge <= 450:\n",
|
|
" return 'low'\n",
|
|
" elif charge > 450 and charge <= 1500:\n",
|
|
" return 'medium'\n",
|
|
" elif charge > 1500 and charge <= 3500:\n",
|
|
" return 'High'\n",
|
|
" elif charge > 3500:\n",
|
|
" return 'V-High'\n",
|
|
" \n",
|
|
" # generating new features\n",
|
|
" data['OnlineSecurityBackup'] = data['OnlineSecurity'] + data['OnlineBackup']\n",
|
|
" data['OnlineSecurityDevice'] = data['OnlineSecurity'] + data['DeviceProtection']\n",
|
|
" data['Streaming'] = data['StreamingMovies'] + data['StreamingTV']\n",
|
|
" \n",
|
|
" # applying grouping functions to create new features\n",
|
|
" data['yearly_tenure'] = data['tenure'].apply(yearly_tenure)\n",
|
|
" data['MonthlyChargesplan'] = data['MonthlyCharges'].apply(monthly_charge_plan)\n",
|
|
" data['TotalChargestatus'] = data['TotalCharges'].apply(total_charge_status)\n",
|
|
" \n",
|
|
" # drop customerID\n",
|
|
" data = data.drop('customerID', axis=1)\n",
|
|
" \n",
|
|
" # scale 'MonthlyCharges' and 'TotalCharges' columns\n",
|
|
" scaler = MinMaxScaler()\n",
|
|
" data[['MonthlyCharges', 'TotalCharges']] = scaler.fit_transform(data[['MonthlyCharges', 'TotalCharges']])\n",
|
|
" \n",
|
|
" # one-hot encode categorical variables \n",
|
|
" X = pd.get_dummies(data.drop(columns=['Churn']), drop_first=True)\n",
|
|
" y= data['Churn']\n",
|
|
" \n",
|
|
" # split data\n",
|
|
" X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=22, stratify=y)\n",
|
|
"\n",
|
|
" # creating the feat_eng_path\n",
|
|
" os.makedirs(feat_eng_path, exist_ok = True)\n",
|
|
" \n",
|
|
" # save the train_test_split data as a pickle file to be used by the modeling component.\n",
|
|
" with open(f'{feat_eng_path}/split_data', 'wb') as f:\n",
|
|
" pickle.dump((X_train, X_test, y_train, y_test), f)\n",
|
|
" \n",
|
|
" return(print('Done!')) "
|
|
]
|
|
},
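{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `yearly_tenure` helper above bins tenure (0-72 months) into one-year buckets with explicit `if`/`elif` branches. A more compact sketch of the same grouping using `pd.cut`, shown only as an alternative; the pipeline keeps the explicit function:\n",
"\n",
"```python\n",
"import pandas as pd\n",
"\n",
"# Toy frame standing in for the transformed dataframe used inside the component.\n",
"data = pd.DataFrame({'tenure': [1, 13, 40, 72]})\n",
"\n",
"# Bin tenure into six one-year buckets labelled 1..6 (equivalent to yearly_tenure).\n",
"data['yearly_tenure'] = pd.cut(data['tenure'],\n",
"                               bins=[0, 12, 24, 36, 48, 60, 72],\n",
"                               labels=[1, 2, 3, 4, 5, 6])\n",
"print(data)\n",
"```"
]
},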
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"id": "F0tD_7fKkw2J",
|
|
"tags": []
|
|
},
|
|
"source": [
|
|
"# Modelling\n",
|
|
"\n",
|
|
"## Catboost"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 36,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# catboost modeling step\n",
|
|
"\n",
|
|
"def catboost_modeling(feat_eng_path: InputPath(str), \n",
|
|
" cb_ensemble_path: OutputPath(str),\n",
|
|
" mlpipeline_ui_metadata_path: OutputPath(str)):\n",
|
|
" \n",
|
|
" # install the necessary libraries\n",
|
|
" import sys, subprocess;\n",
|
|
" subprocess.run([\"python\", \"-m\", \"pip\", \"install\", \"--upgrade\", \"pip\"])\n",
|
|
" subprocess.run([sys.executable, '-m', 'pip', 'install','pandas'])\n",
|
|
" subprocess.run([sys.executable, '-m', 'pip', 'install','catboost'])\n",
|
|
" subprocess.run([sys.executable, '-m', 'pip', 'install','scikit-learn'])\n",
|
|
" \n",
|
|
" # import Library\n",
|
|
" import os, json, pickle;\n",
|
|
" import numpy as np\n",
|
|
" import pandas as pd\n",
|
|
" from sklearn.utils import class_weight\n",
|
|
" from catboost import CatBoostClassifier\n",
|
|
" from sklearn.metrics import confusion_matrix\n",
|
|
" from collections import namedtuple\n",
|
|
"\n",
|
|
" #loading the new_feats data\n",
|
|
" with open(f'{feat_eng_path}/split_data', 'rb') as f:\n",
|
|
" X_train, X_test, y_train, y_test = pickle.load(f)\n",
|
|
" \n",
|
|
" y = np.append(y_train.values, y_test.values)\n",
|
|
" \n",
|
|
" # compute class weight to handle imbalance nature\n",
|
|
" class_weight = dict(zip(np.unique(y), class_weight.compute_class_weight('balanced',\n",
|
|
" classes=np.unique(y), y=y))) \n",
|
|
" \n",
|
|
" \n",
|
|
" #creating the cb_ensemble_path directory\n",
|
|
" os.makedirs(cb_ensemble_path, exist_ok = True)\n",
|
|
" \n",
|
|
" \n",
|
|
" # model initialization\n",
|
|
" cb=CatBoostClassifier(class_weights=class_weight, \n",
|
|
" n_estimators=150,\n",
|
|
" eval_metric='AUC', \n",
|
|
" learning_rate=0.1043242,\n",
|
|
" max_depth=5, \n",
|
|
" use_best_model=True, \n",
|
|
" random_state=22, \n",
|
|
" allow_writing_files=False, \n",
|
|
" metric_period=20)\n",
|
|
"\n",
|
|
" # fitting\n",
|
|
" cb.fit(X_train, y_train, eval_set=(X_test, y_test))\n",
|
|
" \n",
|
|
" # predict\n",
|
|
" cb_pred = cb.predict(X_test)\n",
|
|
" \n",
|
|
" #Save the predicted data as a pickle file to be used by the ensembling component.\n",
|
|
" with open(f'{cb_ensemble_path}/cb_pred', 'wb') as f:\n",
|
|
" pickle.dump(cb_pred, f)\n",
|
|
" \n",
|
|
" # plot confusion_matrix\n",
|
|
" cm = confusion_matrix(y_test, cb_pred)\n",
|
|
" vocab = list(np.unique(y_test))\n",
|
|
" \n",
|
|
" # confusion_matrix pair dataset \n",
|
|
" data = []\n",
|
|
" for target_index, target_row in enumerate(cm):\n",
|
|
" for predicted_index, count in enumerate(target_row):\n",
|
|
" data.append((vocab[target_index], vocab[predicted_index], count))\n",
|
|
" \n",
|
|
" # convert confusion_matrix pair dataset to dataframe\n",
|
|
" df = pd.DataFrame(data,columns=['target','predicted','count'])\n",
|
|
" \n",
|
|
" # change 'target', 'predicted' to integer strings\n",
|
|
" df[['target', 'predicted']] = (df[['target', 'predicted']].astype(int)).astype(str)\n",
|
|
" \n",
|
|
" # create kubeflow metric metadata for UI\n",
|
|
" metadata = {\n",
|
|
" \"outputs\": [\n",
|
|
" {\n",
|
|
" \"type\": \"confusion_matrix\",\n",
|
|
" \"format\": \"csv\",\n",
|
|
" \"schema\": [\n",
|
|
" {\n",
|
|
" \"name\": \"target\",\n",
|
|
" \"type\": \"CATEGORY\"\n",
|
|
" },\n",
|
|
" {\n",
|
|
" \"name\": \"predicted\",\n",
|
|
" \"type\": \"CATEGORY\"\n",
|
|
" },\n",
|
|
" {\n",
|
|
" \"name\": \"count\",\n",
|
|
" \"type\": \"NUMBER\"\n",
|
|
" }\n",
|
|
" ],\n",
|
|
" \"source\": df.to_csv(header=False, index=False),\n",
|
|
" \"storage\": \"inline\",\n",
|
|
" \"labels\": [\n",
|
|
" \"0\",\n",
|
|
" \"1\"\n",
|
|
" ]\n",
|
|
" }\n",
|
|
" ]\n",
|
|
" }\n",
|
|
" \n",
|
|
" with open(mlpipeline_ui_metadata_path, 'w') as metadata_file:\n",
|
|
" json.dump(metadata, metadata_file)\n",
|
|
"\n",
|
|
" conf_m_result = namedtuple('conf_m_result', ['mlpipeline_ui_metadata'])\n",
|
|
" \n",
|
|
" return conf_m_result(json.dumps(metadata))"
|
|
]
|
|
},
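{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `class_weights` passed to CatBoost come from scikit-learn's 'balanced' heuristic, which weights each class by `n_samples / (n_classes * n_samples_in_class)`. A small sketch of the same computation on toy labels (the real values depend on the split):\n",
"\n",
"```python\n",
"import numpy as np\n",
"from sklearn.utils import class_weight\n",
"\n",
"# Toy labels: three 1s and one 0 -> weight(0) = 4/(2*1) = 2.0, weight(1) = 4/(2*3) ~= 0.67\n",
"y_toy = np.array([1, 1, 1, 0])\n",
"weights = class_weight.compute_class_weight('balanced', classes=np.unique(y_toy), y=y_toy)\n",
"print(dict(zip(np.unique(y_toy), weights)))\n",
"```"
]
},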
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"id": "pAdRlNTmlBt6",
|
|
"tags": []
|
|
},
|
|
"source": [
|
|
"## Xgboost"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 37,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# xgboost modeling step\n",
|
|
"\n",
|
|
"def xgboost_modeling(feat_eng_path: InputPath(str), \n",
|
|
" xgb_ensemble_path: OutputPath(str),\n",
|
|
" mlpipeline_ui_metadata_path: OutputPath(str)):\n",
|
|
" \n",
|
|
" # install the necessary libraries\n",
|
|
" import sys, subprocess;\n",
|
|
" subprocess.run([\"python\", \"-m\", \"pip\", \"install\", \"--upgrade\", \"pip\"])\n",
|
|
" subprocess.run([sys.executable, '-m', 'pip', 'install','pandas'])\n",
|
|
" subprocess.run([sys.executable, '-m', 'pip', 'install','xgboost'])\n",
|
|
" subprocess.run([sys.executable, '-m', 'pip', 'install','scikit-learn'])\n",
|
|
" \n",
|
|
" # import Library\n",
|
|
" import os, json, pickle, joblib;\n",
|
|
" import numpy as np\n",
|
|
" import pandas as pd\n",
|
|
" from xgboost import XGBClassifier\n",
|
|
" from sklearn.metrics import confusion_matrix\n",
|
|
" from collections import namedtuple\n",
|
|
"\n",
|
|
" #loading the split_data data\n",
|
|
" with open(f'{feat_eng_path}/split_data', 'rb') as f:\n",
|
|
" X_train, X_test, y_train, y_test = pickle.load(f)\n",
|
|
" \n",
|
|
" #creating the ensemble_path directory\n",
|
|
" os.makedirs(xgb_ensemble_path, exist_ok = True)\n",
|
|
" \n",
|
|
" # model initialization\n",
|
|
" xgb=XGBClassifier(scale_pos_weight=0.3627, \n",
|
|
" max_depth=10, \n",
|
|
" learning_rate=0.1043242, \n",
|
|
" n_estimators=600, \n",
|
|
" colsample_bylevel=0.8, \n",
|
|
" reg_alpha=0.8,\n",
|
|
" silent=True, \n",
|
|
" metrics='auc', \n",
|
|
" random_state=22)\n",
|
|
" \n",
|
|
" # fitting\n",
|
|
" xgb.fit(X_train,y_train,eval_set=[(X_train,y_train),(X_test, y_test)], early_stopping_rounds=50,verbose=50)\n",
|
|
" \n",
|
|
" # predict\n",
|
|
" xgb_pred = xgb.predict(X_test)\n",
|
|
" \n",
|
|
" #Save the predicted data as a pickle file to be used by the ensembling component.\n",
|
|
" with open(f'{xgb_ensemble_path}/xgb_pred', 'wb') as f:\n",
|
|
" pickle.dump(xgb_pred, f) \n",
|
|
" \n",
|
|
" # plot confusion_matrix\n",
|
|
" cm = confusion_matrix(y_test, xgb_pred)\n",
|
|
" vocab = list(np.unique(y_test))\n",
|
|
" \n",
|
|
" # confusion_matrix pair dataset \n",
|
|
" data = []\n",
|
|
" for target_index, target_row in enumerate(cm):\n",
|
|
" for predicted_index, count in enumerate(target_row):\n",
|
|
" data.append((vocab[target_index], vocab[predicted_index], count))\n",
|
|
" \n",
|
|
" # convert confusion_matrix pair dataset to dataframe\n",
|
|
" df = pd.DataFrame(data,columns=['target','predicted','count'])\n",
|
|
" \n",
|
|
" # change 'target', 'predicted' to integer strings\n",
|
|
" df[['target', 'predicted']] = (df[['target', 'predicted']].astype(int)).astype(str)\n",
|
|
" \n",
|
|
" # create kubeflow metric metadata for UI\n",
|
|
" metadata = {\n",
|
|
" \"outputs\": [\n",
|
|
" {\n",
|
|
" \"type\": \"confusion_matrix\",\n",
|
|
" \"format\": \"csv\",\n",
|
|
" \"schema\": [\n",
|
|
" {\n",
|
|
" \"name\": \"target\",\n",
|
|
" \"type\": \"CATEGORY\"\n",
|
|
" },\n",
|
|
" {\n",
|
|
" \"name\": \"predicted\",\n",
|
|
" \"type\": \"CATEGORY\"\n",
|
|
" },\n",
|
|
" {\n",
|
|
" \"name\": \"count\",\n",
|
|
" \"type\": \"NUMBER\"\n",
|
|
" }\n",
|
|
" ],\n",
|
|
" \"source\": df.to_csv(header=False, index=False),\n",
|
|
" \"storage\": \"inline\",\n",
|
|
" \"labels\": [\n",
|
|
" \"0\",\n",
|
|
" \"1\"\n",
|
|
" ]\n",
|
|
" }\n",
|
|
" ]\n",
|
|
" }\n",
|
|
" \n",
|
|
" with open(mlpipeline_ui_metadata_path, 'w') as metadata_file:\n",
|
|
" json.dump(metadata, metadata_file)\n",
|
|
"\n",
|
|
" conf_m_result = namedtuple('conf_m_result', ['mlpipeline_ui_metadata'])\n",
|
|
" \n",
|
|
" return conf_m_result(json.dumps(metadata))"
|
|
]
|
|
},
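{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `scale_pos_weight=0.3627` above follows the common heuristic of dividing the number of negative samples by the number of positive samples. Because this notebook maps `Churn` 'No' to 1 and 'Yes' to 0, the positive class is the majority, so the ratio falls below one. A sketch with toy labels that roughly match the dataset's class balance (illustrative, not part of the component):\n",
"\n",
"```python\n",
"import numpy as np\n",
"\n",
"# scale_pos_weight heuristic: negatives / positives.\n",
"# With 'No' mapped to 1 (majority) and 'Yes' to 0 (minority), the ratio is below 1.\n",
"y_train_toy = np.array([1] * 73 + [0] * 27)   # roughly the dataset's class ratio\n",
"print((y_train_toy == 0).sum() / (y_train_toy == 1).sum())  # ~0.37\n",
"```"
]
},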
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"id": "OuZbR8G9mMsM",
|
|
"tags": []
|
|
},
|
|
"source": [
|
|
"## LightGBM"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 38,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# lightgbm modeling step\n",
|
|
"\n",
|
|
"def lightgbm_modeling(feat_eng_path: InputPath(str), \n",
|
|
" lgbm_ensemble_path: OutputPath(str),\n",
|
|
" mlpipeline_ui_metadata_path: OutputPath(str)):\n",
|
|
" \n",
|
|
" # install the necessary libraries\n",
|
|
" import sys, subprocess;\n",
|
|
" subprocess.run([\"python\", \"-m\", \"pip\", \"install\", \"--upgrade\", \"pip\"])\n",
|
|
" subprocess.run([sys.executable, '-m', 'pip', 'install','pandas'])\n",
|
|
" subprocess.run([sys.executable, '-m', 'pip', 'install','lightgbm'])\n",
|
|
" subprocess.run([sys.executable, '-m', 'pip', 'install','scikit-learn'])\n",
|
|
" \n",
|
|
" # import Library\n",
|
|
" import os, json, pickle;\n",
|
|
" import numpy as np\n",
|
|
" import pandas as pd\n",
|
|
" from lightgbm import LGBMClassifier\n",
|
|
" from sklearn.metrics import confusion_matrix\n",
|
|
" from collections import namedtuple\n",
|
|
"\n",
|
|
" #loading the new_feats data\n",
|
|
" with open(f'{feat_eng_path}/split_data', 'rb') as f:\n",
|
|
" X_train, X_test, y_train, y_test = pickle.load(f)\n",
|
|
" \n",
|
|
" \n",
|
|
" #creating the ensemble_path directory\n",
|
|
" os.makedirs(lgbm_ensemble_path, exist_ok = True)\n",
|
|
" \n",
|
|
" # model initialization\n",
|
|
" lgbm = LGBMClassifier(random_state=22,scale_pos_weight=0.362)\n",
|
|
"\n",
|
|
" # fitting\n",
|
|
" lgbm.fit(X_train, y_train, categorical_feature = 'auto', eval_set=(X_test, y_test),feature_name='auto', verbose=0)\n",
|
|
" \n",
|
|
" # predict\n",
|
|
" lgbm_pred = lgbm.predict(X_test)\n",
|
|
" \n",
|
|
" #Save the predicted data as a pickle file to be used by the ensembling component.\n",
|
|
" with open(f'{lgbm_ensemble_path}/lgbm_pred', 'wb') as f:\n",
|
|
" pickle.dump((y_test, lgbm_pred), f)\n",
|
|
" \n",
|
|
" # plot confusion_matrix\n",
|
|
" cm = confusion_matrix(y_test, lgbm_pred)\n",
|
|
" vocab = list(np.unique(y_test))\n",
|
|
" \n",
|
|
" # confusion_matrix pair dataset \n",
|
|
" data = []\n",
|
|
" for target_index, target_row in enumerate(cm):\n",
|
|
" for predicted_index, count in enumerate(target_row):\n",
|
|
" data.append((vocab[target_index], vocab[predicted_index], count))\n",
|
|
" \n",
|
|
" # convert confusion_matrix pair dataset to dataframe\n",
|
|
" df = pd.DataFrame(data,columns=['target','predicted','count'])\n",
|
|
" \n",
|
|
" # change 'target', 'predicted' to integer strings\n",
|
|
" df[['target', 'predicted']] = (df[['target', 'predicted']].astype(int)).astype(str)\n",
|
|
" \n",
|
|
" # create kubeflow metric metadata for UI\n",
|
|
" metadata = {\n",
|
|
" \"outputs\": [\n",
|
|
" {\n",
|
|
" \"type\": \"confusion_matrix\",\n",
|
|
" \"format\": \"csv\",\n",
|
|
" \"schema\": [\n",
|
|
" {\n",
|
|
" \"name\": \"target\",\n",
|
|
" \"type\": \"CATEGORY\"\n",
|
|
" },\n",
|
|
" {\n",
|
|
" \"name\": \"predicted\",\n",
|
|
" \"type\": \"CATEGORY\"\n",
|
|
" },\n",
|
|
" {\n",
|
|
" \"name\": \"count\",\n",
|
|
" \"type\": \"NUMBER\"\n",
|
|
" }\n",
|
|
" ],\n",
|
|
" \"source\": df.to_csv(header=False, index=False),\n",
|
|
" \"storage\": \"inline\",\n",
|
|
" \"labels\": [\n",
|
|
" \"0\",\n",
|
|
" \"1\"\n",
|
|
" ]\n",
|
|
" }\n",
|
|
" ]\n",
|
|
" }\n",
|
|
" \n",
|
|
" \n",
|
|
" \n",
|
|
" with open(mlpipeline_ui_metadata_path, 'w') as metadata_file:\n",
|
|
" json.dump(metadata, metadata_file)\n",
|
|
"\n",
|
|
" conf_m_result = namedtuple('conf_m_result', ['mlpipeline_ui_metadata'])\n",
|
|
" \n",
|
|
" return conf_m_result(json.dumps(metadata))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"id": "_Vnxkq9OmlqA",
|
|
"tags": []
|
|
},
|
|
"source": [
|
|
"## Ensembling"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 39,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# ensembling step\n",
|
|
"\n",
|
|
"def ensembling(lgbm_ensemble_path: InputPath(str),\n",
|
|
" xgb_ensemble_path: InputPath(str),\n",
|
|
" cb_ensemble_path: InputPath(str),\n",
|
|
" mlpipeline_ui_metadata_path: OutputPath(str)) -> NamedTuple('conf_m_result', [('mlpipeline_ui_metadata', 'UI_metadata')]):\n",
|
|
" \n",
|
|
" # install the necessary libraries\n",
|
|
" import sys, subprocess;\n",
|
|
" subprocess.run([\"python\", \"-m\", \"pip\", \"install\", \"--upgrade\", \"pip\"])\n",
|
|
" subprocess.run([sys.executable, '-m', 'pip', 'install','pandas'])\n",
|
|
" subprocess.run([sys.executable, '-m', 'pip', 'install','scipy'])\n",
|
|
" subprocess.run([sys.executable, '-m', 'pip', 'install','scikit-learn'])\n",
|
|
" \n",
|
|
" # import Library\n",
|
|
" import os, json, pickle;\n",
|
|
" import numpy as np\n",
|
|
" import pandas as pd\n",
|
|
" from scipy import stats\n",
|
|
" from sklearn.metrics import confusion_matrix\n",
|
|
" from collections import namedtuple\n",
|
|
" \n",
|
|
" #loading the new_feats data\n",
|
|
" with open(f'{lgbm_ensemble_path}/lgbm_pred', 'rb') as f:\n",
|
|
" (y_test, lgbm_pred) = pickle.load(f)\n",
|
|
" with open(f'{xgb_ensemble_path}/xgb_pred', 'rb') as g:\n",
|
|
" xgb_pred = pickle.load(g)\n",
|
|
" with open(f'{cb_ensemble_path}/cb_pred', 'rb') as h:\n",
|
|
" cb_pred = pickle.load(h)\n",
|
|
" \n",
|
|
" # create an array of all predictions\n",
|
|
" predictions = np.array([cb_pred, xgb_pred, lgbm_pred])\n",
|
|
" \n",
|
|
" # find the most frequent predicted value \n",
|
|
" pred_mode = stats.mode(predictions, axis=0)[0][0]\n",
|
|
" \n",
|
|
" # plot confusion_matrix\n",
|
|
" cm = confusion_matrix(y_test, pred_mode)\n",
|
|
" vocab = list(np.unique(y_test))\n",
|
|
" \n",
|
|
" # confusion_matrix pair dataset \n",
|
|
" data = []\n",
|
|
" for target_index, target_row in enumerate(cm):\n",
|
|
" for predicted_index, count in enumerate(target_row):\n",
|
|
" data.append((vocab[target_index], vocab[predicted_index], count))\n",
|
|
" \n",
|
|
" # convert confusion_matrix pair dataset to dataframe\n",
|
|
" df = pd.DataFrame(data,columns=['target','predicted','count'])\n",
|
|
" \n",
|
|
" # change 'target', 'predicted' to integer strings\n",
|
|
" df[['target', 'predicted']] = (df[['target', 'predicted']].astype(int)).astype(str)\n",
|
|
" \n",
|
|
" # create kubeflow metric metadata for UI\n",
|
|
" metadata = {\n",
|
|
" \"outputs\": [\n",
|
|
" {\n",
|
|
" \"type\": \"confusion_matrix\",\n",
|
|
" \"format\": \"csv\",\n",
|
|
" \"schema\": [\n",
|
|
" {\n",
|
|
" \"name\": \"target\",\n",
|
|
" \"type\": \"CATEGORY\"\n",
|
|
" },\n",
|
|
" {\n",
|
|
" \"name\": \"predicted\",\n",
|
|
" \"type\": \"CATEGORY\"\n",
|
|
" },\n",
|
|
" {\n",
|
|
" \"name\": \"count\",\n",
|
|
" \"type\": \"NUMBER\"\n",
|
|
" }\n",
|
|
" ],\n",
|
|
" \"source\": df.to_csv(header=False, index=False),\n",
|
|
" \"storage\": \"inline\",\n",
|
|
" \"labels\": [\n",
|
|
" \"0\",\n",
|
|
" \"1\"\n",
|
|
" ]\n",
|
|
" }\n",
|
|
" ]\n",
|
|
" }\n",
|
|
" \n",
|
|
" with open(mlpipeline_ui_metadata_path, 'w') as metadata_file:\n",
|
|
" json.dump(metadata, metadata_file)\n",
|
|
"\n",
|
|
" conf_m_result = namedtuple('conf_m_result', ['mlpipeline_ui_metadata'])\n",
|
|
" \n",
|
|
" return conf_m_result(json.dumps(metadata))"
|
|
]
|
|
},
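{
"cell_type": "markdown",
"metadata": {},
"source": [
"The ensemble above is a plain majority vote: for each test row, `stats.mode` returns the label predicted by at least two of the three models. A toy sketch of the same idea (the double indexing assumes an older SciPy, matching the component code):\n",
"\n",
"```python\n",
"import numpy as np\n",
"from scipy import stats\n",
"\n",
"# Three model predictions for four samples; the column-wise mode is the vote.\n",
"preds = np.array([[1, 0, 1, 1],\n",
"                  [1, 1, 0, 1],\n",
"                  [0, 0, 1, 1]])\n",
"majority = stats.mode(preds, axis=0)[0][0]\n",
"print(majority)  # expected: [1 0 1 1]\n",
"```"
]
},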
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"tags": []
|
|
},
|
|
"source": [
|
|
"## Create pipeline components \n",
|
|
"\n",
|
|
"using `create_component_from_func`"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 40,
|
|
"metadata": {
|
|
"tags": []
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# create light weight components\n",
|
|
"load_op = comp.create_component_from_func(load_data,base_image=\"python:3.7.1\")\n",
|
|
"transform_op = comp.create_component_from_func(transform_data,base_image=\"python:3.7.1\")\n",
|
|
"feature_eng_op = comp.create_component_from_func(feature_engineering,base_image=\"python:3.7.1\")\n",
|
|
"catboost_modeling_op = comp.create_component_from_func(catboost_modeling, base_image=\"python:3.7.1\")\n",
|
|
"lightgbm_modeling_op = comp.create_component_from_func(lightgbm_modeling, base_image=\"python:3.7.1\")\n",
|
|
"xgboost_modeling_op = comp.create_component_from_func(xgboost_modeling, base_image=\"python:3.7.1\")\n",
|
|
"ensembling_op = comp.create_component_from_func(ensembling, base_image=\"python:3.7.1\")"
|
|
]
|
|
},
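{
"cell_type": "markdown",
"metadata": {},
"source": [
"Each component function above installs its own dependencies with `subprocess` at run time. With the KFP v1 SDK you can usually pass `packages_to_install` to `create_component_from_func` instead, so the packages are installed before the function body runs. A sketch for the load step (the package list is illustrative):\n",
"\n",
"```python\n",
"load_op = comp.create_component_from_func(\n",
"    load_data,\n",
"    base_image=\"python:3.7.1\",\n",
"    packages_to_install=[\"pandas\", \"wget\"],\n",
")\n",
"```"
]
},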
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 66,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# define pipeline\n",
|
|
"@dsl.pipeline(name=\"telco-customer-churn\", \n",
|
|
" description=\"Predicting real future returns of around 2,000 stocks.\")\n",
|
|
"\n",
|
|
"# Define parameters to be fed into pipeline\n",
|
|
"def telco_customer_churn_pipeline(\n",
|
|
" download_link: str,\n",
|
|
" data_path: str,\n",
|
|
" transform_data_path: str, \n",
|
|
" feat_eng_data_path: str,\n",
|
|
" cb_ensemble_path:str,\n",
|
|
" xgb_ensemble_path:str,\n",
|
|
" lgbm_ensemble_path:str\n",
|
|
" ):\n",
|
|
"\n",
|
|
" # Create load container.\n",
|
|
" load_container = load_op(download_link)\n",
|
|
" # Create transform container.\n",
|
|
" transform_container = transform_op(load_container.output)\n",
|
|
" # Create feature engineering container.\n",
|
|
" feature_eng_container = feature_eng_op(transform_container.output)\n",
|
|
" # Create catboost modeling container.\n",
|
|
" cb_modeling_container = catboost_modeling_op(feature_eng_container.output)\n",
|
|
" # Create xgboost modeling container.\n",
|
|
" xgb_modeling_container = xgboost_modeling_op(feature_eng_container.output)\n",
|
|
" # Create lightgbm modeling container.\n",
|
|
" lgbm_modeling_container = lightgbm_modeling_op(feature_eng_container.output)\n",
|
|
" # Create ensemble container.\n",
|
|
" ensembling_container = ensembling_op(lgbm_modeling_container.outputs[\"lgbm_ensemble\"], \\\n",
|
|
" xgb_modeling_container.outputs[\"xgb_ensemble\"], \\\n",
|
|
" cb_modeling_container.outputs[\"cb_ensemble\"])"
|
|
]
|
|
},
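{
"cell_type": "markdown",
"metadata": {},
"source": [
"The keys used with `.outputs[...]` above come from the component function signatures: in KFP v1 lightweight components, an `OutputPath` parameter named `xyz_path` is exposed as an output named `xyz`. For example, inside the pipeline function above:\n",
"\n",
"```python\n",
"# lightgbm_modeling(..., lgbm_ensemble_path: OutputPath(str), mlpipeline_ui_metadata_path: OutputPath(str))\n",
"#   lgbm_ensemble_path          -> lgbm_modeling_container.outputs[\"lgbm_ensemble\"]\n",
"#   mlpipeline_ui_metadata_path -> lgbm_modeling_container.outputs[\"mlpipeline_ui_metadata\"]\n",
"```"
]
},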
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 67,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# create client that would enable communication with the Pipelines API server \n",
|
|
"client = kfp.Client()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 68,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# arguments\n",
|
|
"download_link = \"https://github.com/kubeflow/examples/blob/master/telco-customer-churn-kaggle-competition/data/WA_Fn-UseC_-Telco-Customer-Churn.csv?raw=true\"\n",
|
|
"data_path = \"data\"\n",
|
|
"transform_data_path = \"tdp\"\n",
|
|
"feat_eng_data_path = \"feat\"\n",
|
|
"cb_ensemble_path = \"cep\" \n",
|
|
"xgb_ensemble_path = \"xep\"\n",
|
|
"lgbm_ensemble_path = \"lep\""
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 69,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"{'data': {{pipelineparam:op=load-data;name=data}}}\n",
|
|
"{'data': {{pipelineparam:op=load-data;name=data}}}\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<a href=\"/pipeline/#/experiments/details/9ba10a1d-5deb-4082-bb10-4573e9fce164\" target=\"_blank\" >Experiment details</a>."
|
|
],
|
|
"text/plain": [
|
|
"<IPython.core.display.HTML object>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<a href=\"/pipeline/#/runs/details/293e1031-e9fc-433a-8d6f-0f35cc5a78f5\" target=\"_blank\" >Run details</a>."
|
|
],
|
|
"text/plain": [
|
|
"<IPython.core.display.HTML object>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
|
"pipeline_func = telco_customer_churn_pipeline\n",
|
|
"\n",
|
|
"experiment_name = 'telco_customer_churn_pipeline_lightweight'\n",
|
|
"run_name = pipeline_func.__name__ + ' run'\n",
|
|
"\n",
|
|
"arguments = {\n",
|
|
" \"download_link\": download_link,\n",
|
|
" \"data_path\": data_path,\n",
|
|
" \"transform_data_path\": transform_data_path,\n",
|
|
" \"feat_eng_data_path\": feat_eng_data_path,\n",
|
|
" \"cb_ensemble_path\": cb_ensemble_path,\n",
|
|
" \"xgb_ensemble_path\": xgb_ensemble_path,\n",
|
|
" \"lgbm_ensemble_path\": lgbm_ensemble_path\n",
|
|
" }\n",
|
|
"\n",
|
|
"# Compile pipeline to generate compressed YAML definition of the pipeline.\n",
|
|
"kfp.compiler.Compiler().compile(pipeline_func, \n",
|
|
" '{}.zip'.format(experiment_name))\n",
|
|
"\n",
|
|
"# Submit pipeline directly from pipeline function\n",
|
|
"run_result = client.create_run_from_pipeline_func(pipeline_func, \n",
|
|
" experiment_name=experiment_name, \n",
|
|
" run_name=run_name, \n",
|
|
" arguments=arguments\n",
|
|
" )\n"
|
|
]
|
|
},
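{
"cell_type": "markdown",
"metadata": {},
"source": [
"`create_run_from_pipeline_func` returns as soon as the run is submitted. If you want the notebook to block until the pipeline finishes, the returned result object can usually be polled, for example (the timeout value is illustrative):\n",
"\n",
"```python\n",
"# Wait up to 30 minutes for the submitted run to finish.\n",
"run_result.wait_for_run_completion(timeout=1800)\n",
"```"
]
},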
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"colab": {
|
|
"collapsed_sections": [],
|
|
"name": "telco-customer-churn-orig.ipynb",
|
|
"provenance": []
|
|
},
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"kubeflow_notebook": {
|
|
"autosnapshot": true,
|
|
"experiment": {
|
|
"id": "new",
|
|
"name": "telco"
|
|
},
|
|
"experiment_name": "telco",
|
|
"katib_metadata": {
|
|
"algorithm": {
|
|
"algorithmName": "grid"
|
|
},
|
|
"maxFailedTrialCount": 3,
|
|
"maxTrialCount": 12,
|
|
"objective": {
|
|
"objectiveMetricName": "",
|
|
"type": "minimize"
|
|
},
|
|
"parallelTrialCount": 3,
|
|
"parameters": []
|
|
},
|
|
"katib_run": false,
|
|
"pipeline_description": "analyze behavior to retain customers and predict churning",
|
|
"pipeline_name": "telco-customer-churn-pipeline",
|
|
"snapshot_volumes": true,
|
|
"steps_defaults": [
|
|
"label:access-ml-pipeline:true",
|
|
"label:access-rok:true"
|
|
],
|
|
"volume_access_mode": "rwm",
|
|
"volumes": [
|
|
{
|
|
"annotations": [],
|
|
"mount_point": "/home/jovyan",
|
|
"name": "demo-workspace-jtjff",
|
|
"size": 5,
|
|
"size_type": "Gi",
|
|
"snapshot": false,
|
|
"type": "clone"
|
|
}
|
|
]
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.6.9"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 4
|
|
}
|