{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Train Telecom Customer Churn Prediction with XGBoost"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This tutorial is based on [this Kaggle notebook](https://www.kaggle.com/pavanraj159/telecom-customer-churn-prediction/comments#6.-Model-Performances) and [this Feast notebook](https://github.com/gojek/feast/tree/master/examples/feast-xgboost-churn-prediction-tutorial)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
    "_kg_hide-input": false,
    "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Starting Spark application\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<table>\n",
       "<tr><th>ID</th><th>YARN Application ID</th><th>Kind</th><th>State</th><th>Spark UI</th><th>Driver log</th></tr><tr><td>13</td><td>application_1592283535818_0014</td><td>pyspark</td><td>idle</td><td><a target=\"_blank\" href=\"http://resourcemanager.service.consul:8088/proxy/application_1592283535818_0014/\">Link</a></td><td><a target=\"_blank\" href=\"http://ip-10-0-0-154.eu-west-1.compute.internal:8042/node/containerlogs/container_e02_1592283535818_0014_01_000001/telecom__meb10179\">Link</a></td></tr></table>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "SparkSession available as 'spark'.\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from hops import featurestore, hdfs\n",
    "from hops import numpy_helper as numpy\n",
    "from hops import pandas_helper as pandas\n",
    "import os\n",
    "import itertools\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "import io\n",
    "import statsmodels, yellowbrick\n",
    "import sklearn # Tested with 0.22.1\n",
    "import imblearn\n",
    "from slugify import slugify"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1.1 Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0",
    "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Running sql: use telecom_featurestore against offline feature store\n",
      "SQL string for the query created successfully\n",
      "Running sql: SELECT * FROM telcom_featuregroup_1 against offline feature store\n",
      "   churn  ...  tenure_group_tenure_gt_60\n",
      "0      0  ...                          0\n",
      "1      1  ...                          0\n",
      "2      1  ...                          0\n",
      "3      0  ...                          0\n",
      "4      1  ...                          0\n",
      "\n",
      "[5 rows x 47 columns]"
     ]
    }
   ],
   "source": [
    "# Name of the feature group that holds the prepared churn features\n",
    "FEATUREGROUP_NAME = \"telcom_featuregroup\"\n",
    "\n",
    "# Read the feature group from the feature store as a pandas DataFrame\n",
    "telecom_df = featurestore.get_featuregroup(\n",
    "    FEATUREGROUP_NAME,\n",
    "    dataframe_type=\"pandas\",\n",
    ")\n",
    "telecom_df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "_uuid": "f944336cbe67efb3422b79864d9478e2cfbdc860"
   },
   "source": [
    "### 1.6 Data Preparation for Training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.metrics import confusion_matrix,accuracy_score,classification_report\n",
    "# `scorer` is intentionally not imported: sklearn.metrics.scorer is a private\n",
    "# module (deprecated in scikit-learn 0.22, removed in 0.24) and nothing in this\n",
    "# notebook uses it.\n",
    "from sklearn.metrics import roc_auc_score,roc_curve\n",
    "from sklearn.metrics import f1_score\n",
    "import statsmodels.api as sm\n",
    "from sklearn.metrics import precision_score,recall_score\n",
    "from yellowbrick.classifier import DiscriminationThreshold\n",
    "\n",
    "# Columns that must not be fed to the model as features\n",
    "Id_col     = ['customer_id']\n",
    "target_col = [\"churn\"]\n",
    "\n",
    "# Split into a train and test set: 25% of the rows are held out for testing,\n",
    "# and the fixed random_state keeps the split reproducible across runs\n",
    "train, test = train_test_split(telecom_df, test_size=.25, random_state=111)\n",
    "\n",
    "# Separating dependent and independent variables\n",
    "excluded_cols = Id_col + target_col\n",
    "cols    = [i for i in telecom_df.columns if i not in excluded_cols]\n",
    "training_x = train[cols]\n",
    "training_y = train[target_col]\n",
    "testing_x  = test[cols]\n",
    "testing_y  = test[target_col]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1.7 Training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "from xgboost import XGBClassifier\n",
    "\n",
    "# Hyperparameters of the boosted-tree classifier, kept in one place\n",
    "xgb_params = dict(\n",
    "    base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bytree=1,\n",
    "    gamma=0, learning_rate=0.9, max_delta_step=0, max_depth=7,\n",
    "    min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,\n",
    "    nthread=None, objective='binary:logistic', random_state=0,\n",
    "    reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,\n",
    "    silent=True, subsample=1,\n",
    ")\n",
    "xgb_model = XGBClassifier(**xgb_params)\n",
    "\n",
    "# Fit on the training split, then predict on the test split\n",
    "xgb_model.fit(training_x, training_y)\n",
    "predictions = xgb_model.predict(testing_x)\n",
    "probabilities = xgb_model.predict_proba(testing_x)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1.8 Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n",
      "              colsample_bynode=1, colsample_bytree=1, gamma=0,\n",
      "              learning_rate=0.9, max_delta_step=0, max_depth=7,\n",
      "              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,\n",
      "              nthread=None, objective='binary:logistic', random_state=0,\n",
      "              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,\n",
      "              silent=True, subsample=1, verbosity=1)\n",
      "\n",
      " Classification report : \n",
      "               precision    recall  f1-score   support\n",
      "\n",
      "           0       0.82      0.86      0.84      1282\n",
      "           1       0.57      0.50      0.54       476\n",
      "\n",
      "    accuracy                           0.76      1758\n",
      "   macro avg       0.70      0.68      0.69      1758\n",
      "weighted avg       0.76      0.76      0.76      1758\n",
      "\n",
      "Accuracy   Score :  0.7639362912400455"
     ]
    }
   ],
   "source": [
    "# Pair every feature name with its importance score, highest score first\n",
    "coef_sumry = (\n",
    "    pd.DataFrame({\"coefficients\": xgb_model.feature_importances_,\n",
    "                  \"features\": cols})\n",
    "    .sort_values(by=\"coefficients\", ascending=False)\n",
    ")\n",
    "\n",
    "# Evaluate the test-set predictions against the true labels\n",
    "acc = accuracy_score(testing_y, predictions)\n",
    "report = classification_report(testing_y, predictions)\n",
    "\n",
    "print(xgb_model)\n",
    "print(\"\\n Classification report : \\n\", report)\n",
    "print(\"Accuracy   Score : \", acc)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Started copying local path xgb_reg.pkl to hdfs path hdfs://rpc.namenode.service.consul:8020/Projects/telecom/Resources/xgboost_model/xgb_reg.pkl\n",
      "\n",
      "Finished copying\n",
      "\n",
      "Exported model XGBoost_Churn_Classifier as version 1 successfully.\n",
      "Polling XGBoost_Churn_Classifier version 1 for model availability.\n",
      "Model now available."
     ]
    }
   ],
   "source": [
    "from hops import model\n",
    "import pickle\n",
    "\n",
    "MODEL_NAME = \"XGBoost_Churn_Classifier\"\n",
    "file_name = \"xgb_reg.pkl\"\n",
    "hdfs_path = \"Resources/xgboost_model\"\n",
    "\n",
    "# Serialize the trained model to a local file. The context manager closes\n",
    "# (and therefore flushes) the file before it is copied to HDFS.\n",
    "with open(file_name, \"wb\") as model_file:\n",
    "    pickle.dump(xgb_model, model_file)\n",
    "hdfs.mkdir(hdfs_path)\n",
    "hdfs.copy_to_hdfs(file_name, hdfs_path, overwrite=True)\n",
    "\n",
    "# Test that we can load and use the model: the reloaded model must reproduce\n",
    "# every test-set prediction of the in-memory model, not only the first one.\n",
    "# NOTE: pickle.load can execute arbitrary code; only unpickle trusted files\n",
    "# (this one was written a few lines above).\n",
    "with open(file_name, \"rb\") as model_file:\n",
    "    xgb_model_loaded = pickle.load(model_file)\n",
    "loaded_predictions = xgb_model_loaded.predict(testing_x)\n",
    "assert (loaded_predictions == xgb_model.predict(testing_x)).all(), \"Reloaded model does not reproduce the original predictions\"\n",
    "\n",
    "# Save to the model registry, recording the test accuracy as a metric\n",
    "model.export(hdfs_path, MODEL_NAME, metrics={'accuracy': acc})"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "PySpark",
   "language": "python",
   "name": "pysparkkernel"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "python",
    "version": 3
   },
   "mimetype": "text/x-python",
   "name": "pyspark",
   "pygments_lexer": "python3"
  },
  "pycharm": {
   "stem_cell": {
    "cell_type": "raw",
    "metadata": {
     "collapsed": false
    },
    "source": []
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}