{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Train a Telecom Customer Churn Prediction Model with XGBoost" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This tutorial is based on [this Kaggle notebook](https://www.kaggle.com/pavanraj159/telecom-customer-churn-prediction/comments#6.-Model-Performances) and [this Feast notebook](https://github.com/gojek/feast/tree/master/examples/feast-xgboost-churn-prediction-tutorial)." ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", "_kg_hide-input": false, "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Starting Spark application\n" ] }, { "data": { "text/html": [ "\n", "
IDYARN Application IDKindStateSpark UIDriver log
13application_1592283535818_0014pysparkidleLinkLink
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "SparkSession available as 'spark'.\n" ] } ], "source": [ "import numpy as np\n", "import pandas as pd\n", "from hops import featurestore, hdfs\n", "from hops import numpy_helper as numpy\n", "from hops import pandas_helper as pandas\n", "import os\n", "import itertools\n", "import warnings\n", "warnings.filterwarnings(\"ignore\")\n", "import io\n", "import statsmodels, yellowbrick\n", "import sklearn # Tested with 0.22.1\n", "import imblearn\n", "from slugify import slugify" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 1.1 Data" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0", "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Running sql: use telecom_featurestore against offline feature store\n", "SQL string for the query created successfully\n", "Running sql: SELECT * FROM telcom_featuregroup_1 against offline feature store\n", " churn ... tenure_group_tenure_gt_60\n", "0 0 ... 0\n", "1 1 ... 0\n", "2 1 ... 0\n", "3 0 ... 0\n", "4 1 ... 
0\n", "\n", "[5 rows x 47 columns]" ] } ], "source": [
 "telecom_df = featurestore.get_featuregroup(\"telcom_featuregroup\", dataframe_type=\"pandas\")\n",
 "telecom_df.head()" ] },
 { "cell_type": "markdown", "metadata": { "_uuid": "f944336cbe67efb3422b79864d9478e2cfbdc860" }, "source": [ "### 1.6 Data Preparation for Training" ] },
 { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [
 "from sklearn.model_selection import train_test_split\n",
 "from sklearn.linear_model import LogisticRegression\n",
 "from sklearn.metrics import confusion_matrix,accuracy_score,classification_report\n",
 "# NOTE: `sklearn.metrics.scorer` is deliberately not imported here. It is a private module that\n",
 "# was deprecated in scikit-learn 0.22 and removed in 0.24, and nothing in this notebook uses it.\n",
 "from sklearn.metrics import roc_auc_score,roc_curve\n",
 "from sklearn.metrics import f1_score\n",
 "import statsmodels.api as sm\n",
 "from sklearn.metrics import precision_score,recall_score\n",
 "from yellowbrick.classifier import DiscriminationThreshold\n",
 "\n",
 "Id_col = ['customer_id']\n",
 "target_col = [\"churn\"]\n",
 "# Split into a train and test set (fixed random_state keeps the split reproducible)\n",
 "train, test = train_test_split(telecom_df, test_size=.25, random_state=111)\n",
 "\n",
 "# Separating dependent and independent variables\n",
 "cols = [i for i in telecom_df.columns if i not in Id_col + target_col]\n",
 "training_x = train[cols]\n",
 "testing_x = test[cols]\n",
 "# Flatten the single-column target frames to 1-D arrays: scikit-learn-style estimators and\n",
 "# metrics expect y of shape (n_samples,) and warn when handed a column vector.\n",
 "training_y = train[target_col].values.ravel()\n",
 "testing_y = test[target_col].values.ravel()" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "### 1.7 Training" ] },
 { "cell_type": "code", "execution_count": 4, "metadata": { "scrolled": true }, "outputs": [], "source": [
 "from xgboost import XGBClassifier\n",
 "\n",
 "xgb_model = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n",
 "                          colsample_bytree=1, gamma=0, learning_rate=0.9, max_delta_step=0,\n",
 "                          max_depth=7, min_child_weight=1, missing=None, n_estimators=100,\n",
 "                          n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,\n",
 "                          reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,\n",
 "                          silent=True, subsample=1)\n",
 "\n",
 "# Train model\n",
 "xgb_model.fit(training_x, training_y)\n",
 "predictions = xgb_model.predict(testing_x)\n",
 "probabilities = xgb_model.predict_proba(testing_x)" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "### 1.8 Analysis" ] },
 { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n", " colsample_bynode=1, colsample_bytree=1, gamma=0,\n", " learning_rate=0.9, max_delta_step=0, max_depth=7,\n", " min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,\n", " nthread=None, objective='binary:logistic', random_state=0,\n", " reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,\n", " silent=True, subsample=1, verbosity=1)\n", "\n", " Classification report : \n", " precision recall f1-score support\n", "\n", " 0 0.82 0.86 0.84 1282\n", " 1 0.57 0.50 0.54 476\n", "\n", " accuracy 0.76 1758\n", " macro avg 0.70 0.68 0.69 1758\n", "weighted avg 0.76 0.76 0.76 1758\n", "\n", "Accuracy Score : 0.7639362912400455" ] } ], "source": [
 "# Rank the input features by the importance score the trained model assigned to them.\n",
 "# The column is still called \"coefficients\" to keep the original table layout, although the\n",
 "# values are the model's feature_importances_, not linear coefficients.\n",
 "coef_sumry = pd.DataFrame({\"coefficients\": xgb_model.feature_importances_,\n",
 "                           \"features\": cols})\n",
 "coef_sumry = coef_sumry.sort_values(by=\"coefficients\", ascending=False)\n",
 "\n",
 "# `acc` is reused by the export cell below as the metric stored with the model\n",
 "acc = accuracy_score(testing_y, predictions)\n",
 "print(xgb_model)\n",
 "print(\"\\n Classification report : \\n\", classification_report(testing_y, predictions))\n",
 "print(\"Accuracy Score : \", acc)\n" ] },
 { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Started copying local path xgb_reg.pkl to hdfs path hdfs://rpc.namenode.service.consul:8020/Projects/telecom/Resources/xgboost_model/xgb_reg.pkl\n", "\n", "Finished copying\n", "\n", "Exported model 
XGBoost_Churn_Classifier as version 1 successfully.\n", "Polling XGBoost_Churn_Classifier version 1 for model availability.\n", "Model now available." ] } ], "source": [
 "from hops import model\n",
 "import pickle\n",
 "MODEL_NAME = \"XGBoost_Churn_Classifier\"\n",
 "file_name = \"xgb_reg.pkl\"\n",
 "hdfs_path = \"Resources/xgboost_model\"\n",
 "\n",
 "# Serialize inside a context manager so the pickle is flushed and closed\n",
 "# before the file is copied to HDFS.\n",
 "with open(file_name, \"wb\") as model_file:\n",
 "    pickle.dump(xgb_model, model_file)\n",
 "hdfs.mkdir(hdfs_path)\n",
 "hdfs.copy_to_hdfs(file_name, hdfs_path, overwrite=True)\n",
 "\n",
 "# Test that we can load and use the model: the reloaded copy must reproduce every\n",
 "# prediction of the in-memory model, otherwise stop before exporting it.\n",
 "# (Only unpickle files you created yourself; unpickling can execute arbitrary code.)\n",
 "with open(file_name, \"rb\") as model_file:\n",
 "    xgb_model_loaded = pickle.load(model_file)\n",
 "assert np.array_equal(xgb_model_loaded.predict(testing_x), xgb_model.predict(testing_x)), \\\n",
 "    \"reloaded model predictions differ from the in-memory model\"\n",
 "\n",
 "# save to the model registry\n",
 "model.export(hdfs_path, MODEL_NAME, metrics={'accuracy': acc})" ] } ],
 "metadata": { "kernelspec": { "display_name": "PySpark", "language": "python", "name": "pysparkkernel" }, "language_info": { "codemirror_mode": { "name": "python", "version": 3 }, "mimetype": "text/x-python", "name": "pyspark", "pygments_lexer": "python3" }, "pycharm": { "stem_cell": { "cell_type": "raw", "metadata": { "collapsed": false }, "source": [] } } }, "nbformat": 4, "nbformat_minor": 4 }