{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"tags": []
|
|
},
|
|
"source": [
|
|
"# Telco Customer Churn Prediction Kubeflow Pipeline\n",
|
|
"\n",
|
|
"In this [Kaggle competition](https://www.kaggle.com/datasets/blastchar/telco-customer-churn) \n",
|
|
"\n",
|
|
">In this competition, your goal is to analyze behavior to retain customers and predict churning. You can analyze all relevant customer data and develop focused customer retention programs."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"tags": []
|
|
},
|
|
"source": [
|
|
"# Install relevant libraries\n",
|
|
"\n",
|
|
"\n",
|
|
">Update pip `pip install --user --upgrade pip`\n",
|
|
"\n",
|
|
">Install and upgrade kubeflow sdk `pip install kfp --upgrade --user --quiet`\n",
|
|
"\n",
|
|
"You may need to restart your notebook kernel after installing the kfp sdk"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"metadata": {
|
|
"tags": [
|
|
"skip"
|
|
]
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Requirement already satisfied: pip in /usr/local/lib/python3.6/dist-packages (21.3.1)\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"!pip install --user --upgrade pip"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"!pip install kfp --upgrade --user --quiet"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Name: kfp\n",
|
|
"Version: 1.8.11\n",
|
|
"Summary: KubeFlow Pipelines SDK\n",
|
|
"Home-page: https://github.com/kubeflow/pipelines\n",
|
|
"Author: The Kubeflow Authors\n",
|
|
"Author-email: \n",
|
|
"License: UNKNOWN\n",
|
|
"Location: /home/jovyan/.local/lib/python3.6/site-packages\n",
|
|
"Requires: absl-py, click, cloudpickle, dataclasses, Deprecated, docstring-parser, fire, google-api-python-client, google-auth, google-cloud-storage, jsonschema, kfp-pipeline-spec, kfp-server-api, kubernetes, protobuf, pydantic, PyYAML, requests-toolbelt, strip-hints, tabulate, typer, typing-extensions, uritemplate\n",
|
|
"Required-by: kubeflow-kale\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# confirm the kfp sdk\n",
|
|
"! pip show kfp"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"tags": []
|
|
},
|
|
"source": [
|
|
"# Imports\n",
|
|
"\n",
|
|
"In this section we import the kfp methods we need for this example. Make it a habit to gather your imports in a single place."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import kfp\n",
|
|
"import kfp.components as comp\n",
|
|
"import kfp.dsl as dsl\n",
|
|
"from kfp.components import InputPath, OutputPath\n",
|
|
"from typing import NamedTuple"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Kubeflow pipeline component creation\n",
|
|
"\n",
|
|
"## Download and load the dataset"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# load data step\n",
|
|
"def load_data(download_link: str, data_path: OutputPath(str)):\n",
|
|
" \n",
|
|
" # install the necessary libraries\n",
|
|
" import os, sys, pickle, subprocess;\n",
|
|
" subprocess.run([\"python\", \"-m\", \"pip\", \"install\", \"--upgrade\", \"pip\"])\n",
|
|
" subprocess.run([sys.executable, '-m', 'pip', 'install','pandas'])\n",
|
|
" subprocess.run([sys.executable, \"-m\", \"pip\", \"install\", \"wget\"])\n",
|
|
" import wget\n",
|
|
" \n",
|
|
" # import libraries\n",
|
|
" import pandas as pd\n",
|
|
" \n",
|
|
" # create data_path directory\n",
|
|
" if not os.path.exists(data_path):\n",
|
|
" os.makedirs(data_path)\n",
|
|
"\n",
|
|
" # download data\n",
|
|
" wget.download(download_link, f'{data_path}/Telco-Customer-Churn.csv')\n",
|
|
" \n",
|
|
" # read data\n",
|
|
" data = pd.read_csv(f\"{data_path}/Telco-Customer-Churn.csv\")\n",
|
|
"\n",
|
|
" # Save data as a pickle file to be used by the tranform_data component.\n",
|
|
" with open(f'{data_path}/data', 'wb') as f:\n",
|
|
" pickle.dump(data, f)\n",
|
|
"\n",
|
|
" return(print('Done!'))"
|
|
]
|
|
},
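{
"cell_type": "markdown",
"metadata": {},
"source": [
"Before wiring this function into a pipeline component, it can be handy to call it directly in the notebook as a quick smoke test. A minimal sketch, assuming the raw CSV link used later in this notebook and an illustrative local directory:\n",
"\n",
"```python\n",
"# Hypothetical local smoke test of load_data; '/tmp/churn_data' is an illustrative path.\n",
"csv_link = (\"https://github.com/kubeflow/examples/blob/master/telco-customer-churn-kaggle-competition/\"\n",
"            \"data/WA_Fn-UseC_-Telco-Customer-Churn.csv?raw=true\")\n",
"load_data(download_link=csv_link, data_path=\"/tmp/churn_data\")\n",
"```"
]
},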
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Transform Data"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# transform data step\n",
|
|
"\n",
|
|
"def transform_data(data_path: InputPath(str), \n",
|
|
" transform_data_path: OutputPath(str)):\n",
|
|
" \n",
|
|
" # install the necessary libraries\n",
|
|
" import sys, subprocess;\n",
|
|
" subprocess.run([\"python\", \"-m\", \"pip\", \"install\", \"--upgrade\", \"pip\"])\n",
|
|
" subprocess.run([sys.executable, '-m', 'pip', 'install','pandas'])\n",
|
|
" subprocess.run([sys.executable, '-m', 'pip', 'install','scipy'])\n",
|
|
" \n",
|
|
" # import Libraries\n",
|
|
" import os, pickle;\n",
|
|
" import pandas as pd\n",
|
|
" import numpy as np\n",
|
|
"\n",
|
|
" \n",
|
|
" # load data from data_path\n",
|
|
" with open(f'{data_path}/data', 'rb') as f:\n",
|
|
" data = pickle.load(f)\n",
|
|
"\n",
|
|
" # remove rows with spaces in TotalCharges column\n",
|
|
" data = data[data['TotalCharges'] !=' '].copy()\n",
|
|
" \n",
|
|
" # convert TotalCharges column datatype to float \n",
|
|
" data['TotalCharges'] = data['TotalCharges'].astype(float)\n",
|
|
" \n",
|
|
" # convert Churn targe from string to integers\n",
|
|
" # replace no with 1 and yes with 0\n",
|
|
" data['Churn'] = data['Churn'].replace({'No':1, 'Yes':0})\n",
|
|
" \n",
|
|
" #creating the transform_data_path\n",
|
|
" os.makedirs(transform_data_path, exist_ok = True)\n",
|
|
" \n",
|
|
" #Save data as a pickle file to be used by the feature_engineering component.\n",
|
|
" with open(f'{transform_data_path}/data', 'wb') as f:\n",
|
|
" pickle.dump(data, f)\n",
|
|
" \n",
|
|
" return(print('Done!'))"
|
|
]
|
|
},
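{
"cell_type": "markdown",
"metadata": {},
"source": [
"The rows dropped above are the ones where `TotalCharges` holds a single blank string instead of a number, which is what would break the cast to float. A quick check you could run on the raw CSV (a sketch, not part of the pipeline; the path is illustrative):\n",
"\n",
"```python\n",
"import pandas as pd\n",
"\n",
"# Assumes the CSV downloaded by load_data is available locally (illustrative path).\n",
"raw = pd.read_csv('/tmp/churn_data/Telco-Customer-Churn.csv')\n",
"blank_rows = (raw['TotalCharges'] == ' ').sum()\n",
"print(f'{blank_rows} rows have a blank TotalCharges value')\n",
"```"
]
},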
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"id": "G_rB_8FLfP4x",
|
|
"tags": []
|
|
},
|
|
"source": [
|
|
"# Feature Engineering\n",
|
|
"\n",
|
|
"Grouping the tenure, monthly charge and total charge column into different segments"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# feature engineering step\n",
|
|
"\n",
|
|
"def feature_engineering(transform_data_path: InputPath(str), \n",
|
|
" feat_eng_path: OutputPath(str)):\n",
|
|
" \n",
|
|
" # install the necessary libraries\n",
|
|
" import sys, subprocess;\n",
|
|
" subprocess.run([\"python\", \"-m\", \"pip\", \"install\", \"--upgrade\", \"pip\"])\n",
|
|
" subprocess.run([sys.executable, '-m', 'pip', 'install','pandas'])\n",
|
|
" subprocess.run([sys.executable, '-m', 'pip', 'install','scikit-learn'])\n",
|
|
" \n",
|
|
" \n",
|
|
" \n",
|
|
" # import Library\n",
|
|
" import os, pickle;\n",
|
|
" import numpy as np\n",
|
|
" import pandas as pd\n",
|
|
" from sklearn.model_selection import train_test_split\n",
|
|
" from sklearn.preprocessing import MinMaxScaler\n",
|
|
" \n",
|
|
" # loading the data\n",
|
|
" with open(f'{transform_data_path}/data', 'rb') as f:\n",
|
|
" data = pickle.load(f)\n",
|
|
" \n",
|
|
" def yearly_tenure(tenure):\n",
|
|
" if tenure <= 12:\n",
|
|
" return 1\n",
|
|
" elif tenure >12 and tenure <=24:\n",
|
|
" return 2\n",
|
|
" elif tenure >24 and tenure <=36:\n",
|
|
" return 3\n",
|
|
" elif tenure >36 and tenure <=48:\n",
|
|
" return 4\n",
|
|
" elif tenure >48 and tenure <=60:\n",
|
|
" return 5\n",
|
|
" elif tenure > 60 and tenure <=72:\n",
|
|
" return 6\n",
|
|
"\n",
|
|
" def monthly_charge_plan(charge):\n",
|
|
" if charge <= 35:\n",
|
|
" return 'Basic'\n",
|
|
" elif charge>35 and charge <= 80:\n",
|
|
" return 'Advanced'\n",
|
|
" elif charge>80:\n",
|
|
" return 'Premium'\n",
|
|
" \n",
|
|
" def total_charge_status(charge):\n",
|
|
" if charge <= 250:\n",
|
|
" return 'V-low'\n",
|
|
" elif charge > 250 and charge <= 450:\n",
|
|
" return 'low'\n",
|
|
" elif charge > 450 and charge <= 1500:\n",
|
|
" return 'medium'\n",
|
|
" elif charge > 1500 and charge <= 3500:\n",
|
|
" return 'High'\n",
|
|
" elif charge > 3500:\n",
|
|
" return 'V-High'\n",
|
|
" \n",
|
|
" # generating new features\n",
|
|
" data['OnlineSecurityBackup'] = data['OnlineSecurity'] + data['OnlineBackup']\n",
|
|
" data['OnlineSecurityDevice'] = data['OnlineSecurity'] + data['DeviceProtection']\n",
|
|
" data['Streaming'] = data['StreamingMovies'] + data['StreamingTV']\n",
|
|
" \n",
|
|
" # applying grouping functions to create new features\n",
|
|
" data['yearly_tenure'] = data['tenure'].apply(yearly_tenure)\n",
|
|
" data['MonthlyChargesplan'] = data['MonthlyCharges'].apply(monthly_charge_plan)\n",
|
|
" data['TotalChargestatus'] = data['TotalCharges'].apply(total_charge_status)\n",
|
|
" \n",
|
|
" # drop customerID\n",
|
|
" data = data.drop('customerID', axis=1)\n",
|
|
" \n",
|
|
" # scale 'MonthlyCharges' and 'TotalCharges' columns\n",
|
|
" scaler = MinMaxScaler()\n",
|
|
" data[['MonthlyCharges', 'TotalCharges']] = scaler.fit_transform(data[['MonthlyCharges', 'TotalCharges']])\n",
|
|
" \n",
|
|
" # one-hot encode categorical variables \n",
|
|
" X = pd.get_dummies(data.drop(columns=['Churn']), drop_first=True)\n",
|
|
" y= data['Churn']\n",
|
|
" \n",
|
|
" # split data\n",
|
|
" X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=22, stratify=y)\n",
|
|
"\n",
|
|
" # creating the feat_eng_path\n",
|
|
" os.makedirs(feat_eng_path, exist_ok = True)\n",
|
|
" \n",
|
|
" # save the train_test_split data as a pickle file to be used by the modeling component.\n",
|
|
" with open(f'{feat_eng_path}/split_data', 'wb') as f:\n",
|
|
" pickle.dump((X_train, X_test, y_train, y_test), f)\n",
|
|
" \n",
|
|
" return(print('Done!')) "
|
|
]
|
|
},
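{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `yearly_tenure` helper above bins tenure (0-72 months) into one-year buckets with explicit `if`/`elif` branches. A more compact sketch of the same grouping using `pd.cut`, shown only as an alternative; the pipeline keeps the explicit function:\n",
"\n",
"```python\n",
"import pandas as pd\n",
"\n",
"# Toy frame standing in for the transformed dataframe used inside the component.\n",
"data = pd.DataFrame({'tenure': [1, 13, 40, 72]})\n",
"\n",
"# Bin tenure into six one-year buckets labelled 1..6 (equivalent to yearly_tenure).\n",
"data['yearly_tenure'] = pd.cut(data['tenure'],\n",
"                               bins=[0, 12, 24, 36, 48, 60, 72],\n",
"                               labels=[1, 2, 3, 4, 5, 6])\n",
"print(data)\n",
"```"
]
},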
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"id": "F0tD_7fKkw2J",
|
|
"tags": []
|
|
},
|
|
"source": [
|
|
"# Modelling\n",
|
|
"\n",
|
|
"## Catboost"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 36,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# catboost modeling step\n",
|
|
"\n",
|
|
"def catboost_modeling(feat_eng_path: InputPath(str), \n",
|
|
" cb_ensemble_path: OutputPath(str),\n",
|
|
" mlpipeline_ui_metadata_path: OutputPath(str)):\n",
|
|
" \n",
|
|
" # install the necessary libraries\n",
|
|
" import sys, subprocess;\n",
|
|
" subprocess.run([\"python\", \"-m\", \"pip\", \"install\", \"--upgrade\", \"pip\"])\n",
|
|
" subprocess.run([sys.executable, '-m', 'pip', 'install','pandas'])\n",
|
|
" subprocess.run([sys.executable, '-m', 'pip', 'install','catboost'])\n",
|
|
" subprocess.run([sys.executable, '-m', 'pip', 'install','scikit-learn'])\n",
|
|
" \n",
|
|
" # import Library\n",
|
|
" import os, json, pickle;\n",
|
|
" import numpy as np\n",
|
|
" import pandas as pd\n",
|
|
" from sklearn.utils import class_weight\n",
|
|
" from catboost import CatBoostClassifier\n",
|
|
" from sklearn.metrics import confusion_matrix\n",
|
|
" from collections import namedtuple\n",
|
|
"\n",
|
|
" #loading the new_feats data\n",
|
|
" with open(f'{feat_eng_path}/split_data', 'rb') as f:\n",
|
|
" X_train, X_test, y_train, y_test = pickle.load(f)\n",
|
|
" \n",
|
|
" y = np.append(y_train.values, y_test.values)\n",
|
|
" \n",
|
|
" # compute class weight to handle imbalance nature\n",
|
|
" class_weight = dict(zip(np.unique(y), class_weight.compute_class_weight('balanced',\n",
|
|
" classes=np.unique(y), y=y))) \n",
|
|
" \n",
|
|
" \n",
|
|
" #creating the cb_ensemble_path directory\n",
|
|
" os.makedirs(cb_ensemble_path, exist_ok = True)\n",
|
|
" \n",
|
|
" \n",
|
|
" # model initialization\n",
|
|
" cb=CatBoostClassifier(class_weights=class_weight, \n",
|
|
" n_estimators=150,\n",
|
|
" eval_metric='AUC', \n",
|
|
" learning_rate=0.1043242,\n",
|
|
" max_depth=5, \n",
|
|
" use_best_model=True, \n",
|
|
" random_state=22, \n",
|
|
" allow_writing_files=False, \n",
|
|
" metric_period=20)\n",
|
|
"\n",
|
|
" # fitting\n",
|
|
" cb.fit(X_train, y_train, eval_set=(X_test, y_test))\n",
|
|
" \n",
|
|
" # predict\n",
|
|
" cb_pred = cb.predict(X_test)\n",
|
|
" \n",
|
|
" #Save the predicted data as a pickle file to be used by the ensembling component.\n",
|
|
" with open(f'{cb_ensemble_path}/cb_pred', 'wb') as f:\n",
|
|
" pickle.dump(cb_pred, f)\n",
|
|
" \n",
|
|
" # plot confusion_matrix\n",
|
|
" cm = confusion_matrix(y_test, cb_pred)\n",
|
|
" vocab = list(np.unique(y_test))\n",
|
|
" \n",
|
|
" # confusion_matrix pair dataset \n",
|
|
" data = []\n",
|
|
" for target_index, target_row in enumerate(cm):\n",
|
|
" for predicted_index, count in enumerate(target_row):\n",
|
|
" data.append((vocab[target_index], vocab[predicted_index], count))\n",
|
|
" \n",
|
|
" # convert confusion_matrix pair dataset to dataframe\n",
|
|
" df = pd.DataFrame(data,columns=['target','predicted','count'])\n",
|
|
" \n",
|
|
" # change 'target', 'predicted' to integer strings\n",
|
|
" df[['target', 'predicted']] = (df[['target', 'predicted']].astype(int)).astype(str)\n",
|
|
" \n",
|
|
" # create kubeflow metric metadata for UI\n",
|
|
" metadata = {\n",
|
|
" \"outputs\": [\n",
|
|
" {\n",
|
|
" \"type\": \"confusion_matrix\",\n",
|
|
" \"format\": \"csv\",\n",
|
|
" \"schema\": [\n",
|
|
" {\n",
|
|
" \"name\": \"target\",\n",
|
|
" \"type\": \"CATEGORY\"\n",
|
|
" },\n",
|
|
" {\n",
|
|
" \"name\": \"predicted\",\n",
|
|
" \"type\": \"CATEGORY\"\n",
|
|
" },\n",
|
|
" {\n",
|
|
" \"name\": \"count\",\n",
|
|
" \"type\": \"NUMBER\"\n",
|
|
" }\n",
|
|
" ],\n",
|
|
" \"source\": df.to_csv(header=False, index=False),\n",
|
|
" \"storage\": \"inline\",\n",
|
|
" \"labels\": [\n",
|
|
" \"0\",\n",
|
|
" \"1\"\n",
|
|
" ]\n",
|
|
" }\n",
|
|
" ]\n",
|
|
" }\n",
|
|
" \n",
|
|
" with open(mlpipeline_ui_metadata_path, 'w') as metadata_file:\n",
|
|
" json.dump(metadata, metadata_file)\n",
|
|
"\n",
|
|
" conf_m_result = namedtuple('conf_m_result', ['mlpipeline_ui_metadata'])\n",
|
|
" \n",
|
|
" return conf_m_result(json.dumps(metadata))"
|
|
]
|
|
},
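{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `class_weights` passed to CatBoost come from scikit-learn's 'balanced' heuristic, which weights each class by `n_samples / (n_classes * n_samples_in_class)`. A small sketch of the same computation on toy labels (the real values depend on the split):\n",
"\n",
"```python\n",
"import numpy as np\n",
"from sklearn.utils import class_weight\n",
"\n",
"# Toy labels: three 1s and one 0 -> weight(0) = 4/(2*1) = 2.0, weight(1) = 4/(2*3) ~= 0.67\n",
"y_toy = np.array([1, 1, 1, 0])\n",
"weights = class_weight.compute_class_weight('balanced', classes=np.unique(y_toy), y=y_toy)\n",
"print(dict(zip(np.unique(y_toy), weights)))\n",
"```"
]
},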
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"id": "pAdRlNTmlBt6",
|
|
"tags": []
|
|
},
|
|
"source": [
|
|
"## Xgboost"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 37,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# xgboost modeling step\n",
|
|
"\n",
|
|
"def xgboost_modeling(feat_eng_path: InputPath(str), \n",
|
|
" xgb_ensemble_path: OutputPath(str),\n",
|
|
" mlpipeline_ui_metadata_path: OutputPath(str)):\n",
|
|
" \n",
|
|
" # install the necessary libraries\n",
|
|
" import sys, subprocess;\n",
|
|
" subprocess.run([\"python\", \"-m\", \"pip\", \"install\", \"--upgrade\", \"pip\"])\n",
|
|
" subprocess.run([sys.executable, '-m', 'pip', 'install','pandas'])\n",
|
|
" subprocess.run([sys.executable, '-m', 'pip', 'install','xgboost'])\n",
|
|
" subprocess.run([sys.executable, '-m', 'pip', 'install','scikit-learn'])\n",
|
|
" \n",
|
|
" # import Library\n",
|
|
" import os, json, pickle, joblib;\n",
|
|
" import numpy as np\n",
|
|
" import pandas as pd\n",
|
|
" from xgboost import XGBClassifier\n",
|
|
" from sklearn.metrics import confusion_matrix\n",
|
|
" from collections import namedtuple\n",
|
|
"\n",
|
|
" #loading the split_data data\n",
|
|
" with open(f'{feat_eng_path}/split_data', 'rb') as f:\n",
|
|
" X_train, X_test, y_train, y_test = pickle.load(f)\n",
|
|
" \n",
|
|
" #creating the ensemble_path directory\n",
|
|
" os.makedirs(xgb_ensemble_path, exist_ok = True)\n",
|
|
" \n",
|
|
" # model initialization\n",
|
|
" xgb=XGBClassifier(scale_pos_weight=0.3627, \n",
|
|
" max_depth=10, \n",
|
|
" learning_rate=0.1043242, \n",
|
|
" n_estimators=600, \n",
|
|
" colsample_bylevel=0.8, \n",
|
|
" reg_alpha=0.8,\n",
|
|
" silent=True, \n",
|
|
" metrics='auc', \n",
|
|
" random_state=22)\n",
|
|
" \n",
|
|
" # fitting\n",
|
|
" xgb.fit(X_train,y_train,eval_set=[(X_train,y_train),(X_test, y_test)], early_stopping_rounds=50,verbose=50)\n",
|
|
" \n",
|
|
" # predict\n",
|
|
" xgb_pred = xgb.predict(X_test)\n",
|
|
" \n",
|
|
" #Save the predicted data as a pickle file to be used by the ensembling component.\n",
|
|
" with open(f'{xgb_ensemble_path}/xgb_pred', 'wb') as f:\n",
|
|
" pickle.dump(xgb_pred, f) \n",
|
|
" \n",
|
|
" # plot confusion_matrix\n",
|
|
" cm = confusion_matrix(y_test, xgb_pred)\n",
|
|
" vocab = list(np.unique(y_test))\n",
|
|
" \n",
|
|
" # confusion_matrix pair dataset \n",
|
|
" data = []\n",
|
|
" for target_index, target_row in enumerate(cm):\n",
|
|
" for predicted_index, count in enumerate(target_row):\n",
|
|
" data.append((vocab[target_index], vocab[predicted_index], count))\n",
|
|
" \n",
|
|
" # convert confusion_matrix pair dataset to dataframe\n",
|
|
" df = pd.DataFrame(data,columns=['target','predicted','count'])\n",
|
|
" \n",
|
|
" # change 'target', 'predicted' to integer strings\n",
|
|
" df[['target', 'predicted']] = (df[['target', 'predicted']].astype(int)).astype(str)\n",
|
|
" \n",
|
|
" # create kubeflow metric metadata for UI\n",
|
|
" metadata = {\n",
|
|
" \"outputs\": [\n",
|
|
" {\n",
|
|
" \"type\": \"confusion_matrix\",\n",
|
|
" \"format\": \"csv\",\n",
|
|
" \"schema\": [\n",
|
|
" {\n",
|
|
" \"name\": \"target\",\n",
|
|
" \"type\": \"CATEGORY\"\n",
|
|
" },\n",
|
|
" {\n",
|
|
" \"name\": \"predicted\",\n",
|
|
" \"type\": \"CATEGORY\"\n",
|
|
" },\n",
|
|
" {\n",
|
|
" \"name\": \"count\",\n",
|
|
" \"type\": \"NUMBER\"\n",
|
|
" }\n",
|
|
" ],\n",
|
|
" \"source\": df.to_csv(header=False, index=False),\n",
|
|
" \"storage\": \"inline\",\n",
|
|
" \"labels\": [\n",
|
|
" \"0\",\n",
|
|
" \"1\"\n",
|
|
" ]\n",
|
|
" }\n",
|
|
" ]\n",
|
|
" }\n",
|
|
" \n",
|
|
" with open(mlpipeline_ui_metadata_path, 'w') as metadata_file:\n",
|
|
" json.dump(metadata, metadata_file)\n",
|
|
"\n",
|
|
" conf_m_result = namedtuple('conf_m_result', ['mlpipeline_ui_metadata'])\n",
|
|
" \n",
|
|
" return conf_m_result(json.dumps(metadata))"
|
|
]
|
|
},
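{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `scale_pos_weight=0.3627` above follows the common heuristic of dividing the number of negative samples by the number of positive samples. Because this notebook maps `Churn` 'No' to 1 and 'Yes' to 0, the positive class is the majority, so the ratio falls below one. A sketch with toy labels that roughly match the dataset's class balance (illustrative, not part of the component):\n",
"\n",
"```python\n",
"import numpy as np\n",
"\n",
"# scale_pos_weight heuristic: negatives / positives.\n",
"# With 'No' mapped to 1 (majority) and 'Yes' to 0 (minority), the ratio is below 1.\n",
"y_train_toy = np.array([1] * 73 + [0] * 27)   # roughly the dataset's class ratio\n",
"print((y_train_toy == 0).sum() / (y_train_toy == 1).sum())  # ~0.37\n",
"```"
]
},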
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"id": "OuZbR8G9mMsM",
|
|
"tags": []
|
|
},
|
|
"source": [
|
|
"## LightGBM"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 38,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# lightgbm modeling step\n",
|
|
"\n",
|
|
"def lightgbm_modeling(feat_eng_path: InputPath(str), \n",
|
|
" lgbm_ensemble_path: OutputPath(str),\n",
|
|
" mlpipeline_ui_metadata_path: OutputPath(str)):\n",
|
|
" \n",
|
|
" # install the necessary libraries\n",
|
|
" import sys, subprocess;\n",
|
|
" subprocess.run([\"python\", \"-m\", \"pip\", \"install\", \"--upgrade\", \"pip\"])\n",
|
|
" subprocess.run([sys.executable, '-m', 'pip', 'install','pandas'])\n",
|
|
" subprocess.run([sys.executable, '-m', 'pip', 'install','lightgbm'])\n",
|
|
" subprocess.run([sys.executable, '-m', 'pip', 'install','scikit-learn'])\n",
|
|
" \n",
|
|
" # import Library\n",
|
|
" import os, json, pickle;\n",
|
|
" import numpy as np\n",
|
|
" import pandas as pd\n",
|
|
" from lightgbm import LGBMClassifier\n",
|
|
" from sklearn.metrics import confusion_matrix\n",
|
|
" from collections import namedtuple\n",
|
|
"\n",
|
|
" #loading the new_feats data\n",
|
|
" with open(f'{feat_eng_path}/split_data', 'rb') as f:\n",
|
|
" X_train, X_test, y_train, y_test = pickle.load(f)\n",
|
|
" \n",
|
|
" \n",
|
|
" #creating the ensemble_path directory\n",
|
|
" os.makedirs(lgbm_ensemble_path, exist_ok = True)\n",
|
|
" \n",
|
|
" # model initialization\n",
|
|
" lgbm = LGBMClassifier(random_state=22,scale_pos_weight=0.362)\n",
|
|
"\n",
|
|
" # fitting\n",
|
|
" lgbm.fit(X_train, y_train, categorical_feature = 'auto', eval_set=(X_test, y_test),feature_name='auto', verbose=0)\n",
|
|
" \n",
|
|
" # predict\n",
|
|
" lgbm_pred = lgbm.predict(X_test)\n",
|
|
" \n",
|
|
" #Save the predicted data as a pickle file to be used by the ensembling component.\n",
|
|
" with open(f'{lgbm_ensemble_path}/lgbm_pred', 'wb') as f:\n",
|
|
" pickle.dump((y_test, lgbm_pred), f)\n",
|
|
" \n",
|
|
" # plot confusion_matrix\n",
|
|
" cm = confusion_matrix(y_test, lgbm_pred)\n",
|
|
" vocab = list(np.unique(y_test))\n",
|
|
" \n",
|
|
" # confusion_matrix pair dataset \n",
|
|
" data = []\n",
|
|
" for target_index, target_row in enumerate(cm):\n",
|
|
" for predicted_index, count in enumerate(target_row):\n",
|
|
" data.append((vocab[target_index], vocab[predicted_index], count))\n",
|
|
" \n",
|
|
" # convert confusion_matrix pair dataset to dataframe\n",
|
|
" df = pd.DataFrame(data,columns=['target','predicted','count'])\n",
|
|
" \n",
|
|
" # change 'target', 'predicted' to integer strings\n",
|
|
" df[['target', 'predicted']] = (df[['target', 'predicted']].astype(int)).astype(str)\n",
|
|
" \n",
|
|
" # create kubeflow metric metadata for UI\n",
|
|
" metadata = {\n",
|
|
" \"outputs\": [\n",
|
|
" {\n",
|
|
" \"type\": \"confusion_matrix\",\n",
|
|
" \"format\": \"csv\",\n",
|
|
" \"schema\": [\n",
|
|
" {\n",
|
|
" \"name\": \"target\",\n",
|
|
" \"type\": \"CATEGORY\"\n",
|
|
" },\n",
|
|
" {\n",
|
|
" \"name\": \"predicted\",\n",
|
|
" \"type\": \"CATEGORY\"\n",
|
|
" },\n",
|
|
" {\n",
|
|
" \"name\": \"count\",\n",
|
|
" \"type\": \"NUMBER\"\n",
|
|
" }\n",
|
|
" ],\n",
|
|
" \"source\": df.to_csv(header=False, index=False),\n",
|
|
" \"storage\": \"inline\",\n",
|
|
" \"labels\": [\n",
|
|
" \"0\",\n",
|
|
" \"1\"\n",
|
|
" ]\n",
|
|
" }\n",
|
|
" ]\n",
|
|
" }\n",
|
|
" \n",
|
|
" \n",
|
|
" \n",
|
|
" with open(mlpipeline_ui_metadata_path, 'w') as metadata_file:\n",
|
|
" json.dump(metadata, metadata_file)\n",
|
|
"\n",
|
|
" conf_m_result = namedtuple('conf_m_result', ['mlpipeline_ui_metadata'])\n",
|
|
" \n",
|
|
" return conf_m_result(json.dumps(metadata))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"id": "_Vnxkq9OmlqA",
|
|
"tags": []
|
|
},
|
|
"source": [
|
|
"## Ensembling"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 39,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# ensembling step\n",
|
|
"\n",
|
|
"def ensembling(lgbm_ensemble_path: InputPath(str),\n",
|
|
" xgb_ensemble_path: InputPath(str),\n",
|
|
" cb_ensemble_path: InputPath(str),\n",
|
|
" mlpipeline_ui_metadata_path: OutputPath(str)) -> NamedTuple('conf_m_result', [('mlpipeline_ui_metadata', 'UI_metadata')]):\n",
|
|
" \n",
|
|
" # install the necessary libraries\n",
|
|
" import sys, subprocess;\n",
|
|
" subprocess.run([\"python\", \"-m\", \"pip\", \"install\", \"--upgrade\", \"pip\"])\n",
|
|
" subprocess.run([sys.executable, '-m', 'pip', 'install','pandas'])\n",
|
|
" subprocess.run([sys.executable, '-m', 'pip', 'install','scipy'])\n",
|
|
" subprocess.run([sys.executable, '-m', 'pip', 'install','scikit-learn'])\n",
|
|
" \n",
|
|
" # import Library\n",
|
|
" import os, json, pickle;\n",
|
|
" import numpy as np\n",
|
|
" import pandas as pd\n",
|
|
" from scipy import stats\n",
|
|
" from sklearn.metrics import confusion_matrix\n",
|
|
" from collections import namedtuple\n",
|
|
" \n",
|
|
" #loading the new_feats data\n",
|
|
" with open(f'{lgbm_ensemble_path}/lgbm_pred', 'rb') as f:\n",
|
|
" (y_test, lgbm_pred) = pickle.load(f)\n",
|
|
" with open(f'{xgb_ensemble_path}/xgb_pred', 'rb') as g:\n",
|
|
" xgb_pred = pickle.load(g)\n",
|
|
" with open(f'{cb_ensemble_path}/cb_pred', 'rb') as h:\n",
|
|
" cb_pred = pickle.load(h)\n",
|
|
" \n",
|
|
" # create an array of all predictions\n",
|
|
" predictions = np.array([cb_pred, xgb_pred, lgbm_pred])\n",
|
|
" \n",
|
|
" # find the most frequent predicted value \n",
|
|
" pred_mode = stats.mode(predictions, axis=0)[0][0]\n",
|
|
" \n",
|
|
" # plot confusion_matrix\n",
|
|
" cm = confusion_matrix(y_test, pred_mode)\n",
|
|
" vocab = list(np.unique(y_test))\n",
|
|
" \n",
|
|
" # confusion_matrix pair dataset \n",
|
|
" data = []\n",
|
|
" for target_index, target_row in enumerate(cm):\n",
|
|
" for predicted_index, count in enumerate(target_row):\n",
|
|
" data.append((vocab[target_index], vocab[predicted_index], count))\n",
|
|
" \n",
|
|
" # convert confusion_matrix pair dataset to dataframe\n",
|
|
" df = pd.DataFrame(data,columns=['target','predicted','count'])\n",
|
|
" \n",
|
|
" # change 'target', 'predicted' to integer strings\n",
|
|
" df[['target', 'predicted']] = (df[['target', 'predicted']].astype(int)).astype(str)\n",
|
|
" \n",
|
|
" # create kubeflow metric metadata for UI\n",
|
|
" metadata = {\n",
|
|
" \"outputs\": [\n",
|
|
" {\n",
|
|
" \"type\": \"confusion_matrix\",\n",
|
|
" \"format\": \"csv\",\n",
|
|
" \"schema\": [\n",
|
|
" {\n",
|
|
" \"name\": \"target\",\n",
|
|
" \"type\": \"CATEGORY\"\n",
|
|
" },\n",
|
|
" {\n",
|
|
" \"name\": \"predicted\",\n",
|
|
" \"type\": \"CATEGORY\"\n",
|
|
" },\n",
|
|
" {\n",
|
|
" \"name\": \"count\",\n",
|
|
" \"type\": \"NUMBER\"\n",
|
|
" }\n",
|
|
" ],\n",
|
|
" \"source\": df.to_csv(header=False, index=False),\n",
|
|
" \"storage\": \"inline\",\n",
|
|
" \"labels\": [\n",
|
|
" \"0\",\n",
|
|
" \"1\"\n",
|
|
" ]\n",
|
|
" }\n",
|
|
" ]\n",
|
|
" }\n",
|
|
" \n",
|
|
" with open(mlpipeline_ui_metadata_path, 'w') as metadata_file:\n",
|
|
" json.dump(metadata, metadata_file)\n",
|
|
"\n",
|
|
" conf_m_result = namedtuple('conf_m_result', ['mlpipeline_ui_metadata'])\n",
|
|
" \n",
|
|
" return conf_m_result(json.dumps(metadata))"
|
|
]
|
|
},
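{
"cell_type": "markdown",
"metadata": {},
"source": [
"The ensemble above is a plain majority vote: for each test row, `stats.mode` returns the label predicted by at least two of the three models. A toy sketch of the same idea (the double indexing assumes an older SciPy, matching the component code):\n",
"\n",
"```python\n",
"import numpy as np\n",
"from scipy import stats\n",
"\n",
"# Three model predictions for four samples; the column-wise mode is the vote.\n",
"preds = np.array([[1, 0, 1, 1],\n",
"                  [1, 1, 0, 1],\n",
"                  [0, 0, 1, 1]])\n",
"majority = stats.mode(preds, axis=0)[0][0]\n",
"print(majority)  # expected: [1 0 1 1]\n",
"```"
]
},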
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"tags": []
|
|
},
|
|
"source": [
|
|
"## Create pipeline components \n",
|
|
"\n",
|
|
"using `create_component_from_func`"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 40,
|
|
"metadata": {
|
|
"tags": []
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# create light weight components\n",
|
|
"load_op = comp.create_component_from_func(load_data,base_image=\"python:3.7.1\")\n",
|
|
"transform_op = comp.create_component_from_func(transform_data,base_image=\"python:3.7.1\")\n",
|
|
"feature_eng_op = comp.create_component_from_func(feature_engineering,base_image=\"python:3.7.1\")\n",
|
|
"catboost_modeling_op = comp.create_component_from_func(catboost_modeling, base_image=\"python:3.7.1\")\n",
|
|
"lightgbm_modeling_op = comp.create_component_from_func(lightgbm_modeling, base_image=\"python:3.7.1\")\n",
|
|
"xgboost_modeling_op = comp.create_component_from_func(xgboost_modeling, base_image=\"python:3.7.1\")\n",
|
|
"ensembling_op = comp.create_component_from_func(ensembling, base_image=\"python:3.7.1\")"
|
|
]
|
|
},
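{
"cell_type": "markdown",
"metadata": {},
"source": [
"Each component function above installs its own dependencies with `subprocess` at run time. With the KFP v1 SDK you can usually pass `packages_to_install` to `create_component_from_func` instead, so the packages are installed before the function body runs. A sketch for the load step (the package list is illustrative):\n",
"\n",
"```python\n",
"load_op = comp.create_component_from_func(\n",
"    load_data,\n",
"    base_image=\"python:3.7.1\",\n",
"    packages_to_install=[\"pandas\", \"wget\"],\n",
")\n",
"```"
]
},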
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 66,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# define pipeline\n",
|
|
"@dsl.pipeline(name=\"telco-customer-churn\", \n",
|
|
" description=\"Predicting real future returns of around 2,000 stocks.\")\n",
|
|
"\n",
|
|
"# Define parameters to be fed into pipeline\n",
|
|
"def telco_customer_churn_pipeline(\n",
|
|
" download_link: str,\n",
|
|
" data_path: str,\n",
|
|
" transform_data_path: str, \n",
|
|
" feat_eng_data_path: str,\n",
|
|
" cb_ensemble_path:str,\n",
|
|
" xgb_ensemble_path:str,\n",
|
|
" lgbm_ensemble_path:str\n",
|
|
" ):\n",
|
|
"\n",
|
|
" # Create load container.\n",
|
|
" load_container = load_op(download_link)\n",
|
|
" # Create transform container.\n",
|
|
" transform_container = transform_op(load_container.output)\n",
|
|
" # Create feature engineering container.\n",
|
|
" feature_eng_container = feature_eng_op(transform_container.output)\n",
|
|
" # Create catboost modeling container.\n",
|
|
" cb_modeling_container = catboost_modeling_op(feature_eng_container.output)\n",
|
|
" # Create xgboost modeling container.\n",
|
|
" xgb_modeling_container = xgboost_modeling_op(feature_eng_container.output)\n",
|
|
" # Create lightgbm modeling container.\n",
|
|
" lgbm_modeling_container = lightgbm_modeling_op(feature_eng_container.output)\n",
|
|
" # Create ensemble container.\n",
|
|
" ensembling_container = ensembling_op(lgbm_modeling_container.outputs[\"lgbm_ensemble\"], \\\n",
|
|
" xgb_modeling_container.outputs[\"xgb_ensemble\"], \\\n",
|
|
" cb_modeling_container.outputs[\"cb_ensemble\"])"
|
|
]
|
|
},
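{
"cell_type": "markdown",
"metadata": {},
"source": [
"The keys used with `.outputs[...]` above come from the component function signatures: in KFP v1 lightweight components, an `OutputPath` parameter named `xyz_path` is exposed as an output named `xyz`. For example, inside the pipeline function above:\n",
"\n",
"```python\n",
"# lightgbm_modeling(..., lgbm_ensemble_path: OutputPath(str), mlpipeline_ui_metadata_path: OutputPath(str))\n",
"#   lgbm_ensemble_path          -> lgbm_modeling_container.outputs[\"lgbm_ensemble\"]\n",
"#   mlpipeline_ui_metadata_path -> lgbm_modeling_container.outputs[\"mlpipeline_ui_metadata\"]\n",
"```"
]
},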
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 67,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# create client that would enable communication with the Pipelines API server \n",
|
|
"client = kfp.Client()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 68,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# arguments\n",
|
|
"download_link = \"https://github.com/kubeflow/examples/blob/master/telco-customer-churn-kaggle-competition/data/WA_Fn-UseC_-Telco-Customer-Churn.csv?raw=true\"\n",
|
|
"data_path = \"data\"\n",
|
|
"transform_data_path = \"tdp\"\n",
|
|
"feat_eng_data_path = \"feat\"\n",
|
|
"cb_ensemble_path = \"cep\" \n",
|
|
"xgb_ensemble_path = \"xep\"\n",
|
|
"lgbm_ensemble_path = \"lep\""
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 69,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"{'data': {{pipelineparam:op=load-data;name=data}}}\n",
|
|
"{'data': {{pipelineparam:op=load-data;name=data}}}\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<a href=\"/pipeline/#/experiments/details/9ba10a1d-5deb-4082-bb10-4573e9fce164\" target=\"_blank\" >Experiment details</a>."
|
|
],
|
|
"text/plain": [
|
|
"<IPython.core.display.HTML object>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<a href=\"/pipeline/#/runs/details/293e1031-e9fc-433a-8d6f-0f35cc5a78f5\" target=\"_blank\" >Run details</a>."
|
|
],
|
|
"text/plain": [
|
|
"<IPython.core.display.HTML object>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
|
"pipeline_func = telco_customer_churn_pipeline\n",
|
|
"\n",
|
|
"experiment_name = 'telco_customer_churn_pipeline_lightweight'\n",
|
|
"run_name = pipeline_func.__name__ + ' run'\n",
|
|
"\n",
|
|
"arguments = {\n",
|
|
" \"download_link\": download_link,\n",
|
|
" \"data_path\": data_path,\n",
|
|
" \"transform_data_path\": transform_data_path,\n",
|
|
" \"feat_eng_data_path\": feat_eng_data_path,\n",
|
|
" \"cb_ensemble_path\": cb_ensemble_path,\n",
|
|
" \"xgb_ensemble_path\": xgb_ensemble_path,\n",
|
|
" \"lgbm_ensemble_path\": lgbm_ensemble_path\n",
|
|
" }\n",
|
|
"\n",
|
|
"# Compile pipeline to generate compressed YAML definition of the pipeline.\n",
|
|
"kfp.compiler.Compiler().compile(pipeline_func, \n",
|
|
" '{}.zip'.format(experiment_name))\n",
|
|
"\n",
|
|
"# Submit pipeline directly from pipeline function\n",
|
|
"run_result = client.create_run_from_pipeline_func(pipeline_func, \n",
|
|
" experiment_name=experiment_name, \n",
|
|
" run_name=run_name, \n",
|
|
" arguments=arguments\n",
|
|
" )\n"
|
|
]
|
|
},
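{
"cell_type": "markdown",
"metadata": {},
"source": [
"`create_run_from_pipeline_func` returns as soon as the run is submitted. If you want the notebook to block until the pipeline finishes, the returned result object can usually be polled, for example (the timeout value is illustrative):\n",
"\n",
"```python\n",
"# Wait up to 30 minutes for the submitted run to finish.\n",
"run_result.wait_for_run_completion(timeout=1800)\n",
"```"
]
},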
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"colab": {
|
|
"collapsed_sections": [],
|
|
"name": "telco-customer-churn-orig.ipynb",
|
|
"provenance": []
|
|
},
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"kubeflow_notebook": {
|
|
"autosnapshot": true,
|
|
"experiment": {
|
|
"id": "new",
|
|
"name": "telco"
|
|
},
|
|
"experiment_name": "telco",
|
|
"katib_metadata": {
|
|
"algorithm": {
|
|
"algorithmName": "grid"
|
|
},
|
|
"maxFailedTrialCount": 3,
|
|
"maxTrialCount": 12,
|
|
"objective": {
|
|
"objectiveMetricName": "",
|
|
"type": "minimize"
|
|
},
|
|
"parallelTrialCount": 3,
|
|
"parameters": []
|
|
},
|
|
"katib_run": false,
|
|
"pipeline_description": "analyze behavior to retain customers and predict churning",
|
|
"pipeline_name": "telco-customer-churn-pipeline",
|
|
"snapshot_volumes": true,
|
|
"steps_defaults": [
|
|
"label:access-ml-pipeline:true",
|
|
"label:access-rok:true"
|
|
],
|
|
"volume_access_mode": "rwm",
|
|
"volumes": [
|
|
{
|
|
"annotations": [],
|
|
"mount_point": "/home/jovyan",
|
|
"name": "demo-workspace-jtjff",
|
|
"size": 5,
|
|
"size_type": "Gi",
|
|
"snapshot": false,
|
|
"type": "clone"
|
|
}
|
|
]
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.6.9"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 4
|
|
}
|