{ "cells": [ { "cell_type": "raw", "metadata": {}, "source": [ "---\n", "title: \"Snowflake Ingestion (2) Create Feature groups in Python\"\n", "date: 2021-02-24\n", "type: technical_note\n", "draft: false\n", "---" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Telecom Customer Churn Prediction Feature Engineering" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This tutorial is based on [this](https://www.kaggle.com/pavanraj159/telecom-customer-churn-prediction/comments#6.-Model-Performances) Kaggle notebook and [this](https://github.com/gojek/feast/tree/master/examples/feast-xgboost-churn-prediction-tutorial) Feast notebook" ] }, { "cell_type": "markdown", "metadata": { "_uuid": "fa7e507381a982dfb9bcba253537c50ecd956230" }, "source": [ "## 1. Churn Feature Engineering" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", "_kg_hide-input": false, "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Starting Spark application\n" ] }, { "data": { "text/html": [ "\n", "
IDYARN Application IDKindStateSpark UIDriver log
51application_1598866185540_0015pysparkidleLinkLink
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "SparkSession available as 'spark'.\n" ] } ], "source": [ "from hops import pandas_helper as pandas\n", "import pandas as pd\n", "import numpy as np\n", "import os\n", "import sklearn # Tested with 0.22.1\n", "from slugify import slugify\n", "import snowflake.connector # tested on snowflake-connector-python==2.3.1 " ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import hsfs\n", "connection = hsfs.connection()\n", "# get a reference to the feature store, you can access also shared feature stores by providing the feature store name\n", "fs = connection.get_feature_store()\n", "connector = fs.get_storage_connector(\"snowflake_spark_connector\", \"JDBC\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 1.1 Data" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0", "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " customer_id gender senior_citizen partner dependents tenure phone_service multiple_lines internet_service online_security online_backup device_protection tech_support streaming_tv streaming_movies contract paperless_billing payment_method monthly_charges total_charges churn\n", "0 7590-VHVEG Female 0 Yes No 1 No No phone service DSL No Yes No No No No Month-to-month Yes Electronic check 29.85 29.85 No\n", "1 5575-GNVDE Male 0 No No 34 Yes No DSL Yes No Yes No No No One year No Mailed check 56.95 1889.5 No\n", "2 3668-QPYBK Male 0 No No 2 Yes No DSL Yes Yes No No No No Month-to-month Yes Mailed check 53.85 108.15 Yes\n", "3 7795-CFOCW Male 0 No No 45 No No phone service DSL Yes No Yes Yes No No One year No Bank transfer (automatic) 42.30 1840.75 No\n", "4 9237-HQITU Female 0 No No 2 Yes No Fiber optic No No No No No No Month-to-month Yes Electronic check 70.70 151.65 Yes" ] } ], "source": [ "ctx = snowflake.connector.connect(\n", " user='HOPSWORKS',\n", " password=connector.arguments,\n", " account='ra96958.eu-central-1',\n", " warehouse='HOPSWORKS_WH',\n", " database='ML_WORKSHOP',\n", " schema='PUBLIC'\n", ")\n", "\n", "# Query Snowflake Data\n", "cs=ctx.cursor()\n", "allrows=cs.execute(\"\"\"select * from CUSTOMER_CHURN \"\"\").fetchall()\n", "telcom = pd.DataFrame(allrows)\n", "telcom.columns = ['customer_id', 'gender', 'senior_citizen', 'partner', 'dependents',\n", " 'tenure', 'phone_service', 'multiple_lines', 'internet_service',\n", " 'online_security', 'online_backup', 'device_protection', 'tech_support',\n", " 'streaming_tv', 'streaming_movies', 'contract', 'paperless_billing',\n", " 'payment_method', 'monthly_charges', 'total_charges', 'churn']\n", "\n", "pd.set_option('display.max_columns', 500)\n", "pd.set_option('display.width', 2000)\n", "telcom.head(5)" ] }, { "cell_type": "markdown", "metadata": { "_uuid": "c8c010f36e29c116c6662301b08b0b0019d6e22e" }, "source": [ "### 1.2 Data Manipulation" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "_uuid": "8b10c13086dff7182e399b849e31bc03df54a14e" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " customer_id gender senior_citizen partner dependents tenure phone_service multiple_lines internet_service online_security online_backup device_protection tech_support streaming_tv streaming_movies contract paperless_billing payment_method monthly_charges total_charges churn tenure_group\n", "0 7590-VHVEG Female No Yes No 1 No No phone service DSL No Yes No No No No Month-to-month Yes Electronic check 29.85 29.85 No Tenure_0-12\n", "1 5575-GNVDE Male No No No 34 Yes No DSL Yes No Yes No No No One year No Mailed check 56.95 1889.50 No Tenure_24-48\n", "2 3668-QPYBK Male No No No 2 Yes No DSL Yes Yes No No No No Month-to-month Yes Mailed check 53.85 108.15 Yes Tenure_0-12\n", "3 7795-CFOCW Male No No No 45 No No phone service DSL Yes No Yes Yes No No One year No Bank transfer (automatic) 42.30 1840.75 No Tenure_24-48\n", "4 9237-HQITU Female No No No 2 Yes No Fiber optic No No No No No No Month-to-month Yes Electronic check 70.70 151.65 Yes Tenure_0-12" ] } ], "source": [ "# Replacing spaces with null values in total charges column\n", "telcom['total_charges'] = telcom[\"total_charges\"].replace(\" \",np.nan)\n", "\n", "# Dropping null values from total charges column which contain .15% missing data \n", "telcom = telcom[telcom[\"total_charges\"].notnull()]\n", "telcom = telcom.reset_index()[telcom.columns]\n", "\n", "# Convert to float type\n", "telcom[\"total_charges\"] = telcom[\"total_charges\"].astype(float)\n", "\n", "# Replace 'No internet service' to No for the following columns\n", "replace_cols = [ 'online_security', 'online_backup', 'device_protection',\n", " 'tech_support','streaming_tv', 'streaming_movies']\n", "for i in replace_cols : \n", " telcom[i] = telcom[i].replace({'internet_service' : 'No'})\n", " \n", "# Replace binary values with strings\n", "telcom[\"senior_citizen\"] = telcom[\"senior_citizen\"].replace({1:\"Yes\",0:\"No\"})\n", "\n", "# Tenure to categorical column\n", "def tenure_lab(telcom) :\n", " \n", " if telcom[\"tenure\"] <= 12 :\n", " return \"Tenure_0-12\"\n", " elif (telcom[\"tenure\"] > 12) & (telcom[\"tenure\"] <= 24 ):\n", " return \"Tenure_12-24\"\n", " elif (telcom[\"tenure\"] > 24) & (telcom[\"tenure\"] <= 48) :\n", " return \"Tenure_24-48\"\n", " elif (telcom[\"tenure\"] > 48) & (telcom[\"tenure\"] <= 60) :\n", " return \"Tenure_48-60\"\n", " elif telcom[\"tenure\"] > 60 :\n", " return \"Tenure_gt_60\"\n", "telcom[\"tenure_group\"] = telcom.apply(lambda telcom:tenure_lab(telcom),\n", " axis = 1)\n", "\n", "# Separating churn and non churn customers\n", "churn = telcom[telcom[\"churn\"] == \"Yes\"]\n", "not_churn = telcom[telcom[\"churn\"] == \"No\"]\n", "\n", "# Separating catagorical and numerical columns\n", "Id_col = ['customer_id']\n", "target_col = [\"churn\"]\n", "cat_cols = telcom.nunique()[telcom.nunique() < 6].keys().tolist()\n", "cat_cols = [x for x in cat_cols if x not in target_col]\n", "num_cols = [x for x in telcom.columns if x not in cat_cols + target_col + Id_col]\n", "\n", "telcom.head()" ] }, { "cell_type": "markdown", "metadata": { "_uuid": "6dfa77b43fe1a1a301bab65186c2a9f90245ab7d" }, "source": [ "### 1.3 Data Processing" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "_uuid": "8921591320c5e336ec5a2e1efc5ed3cb0f9ec1b2" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " customer_id gender senior_citizen partner dependents phone_service paperless_billing churn multiple_lines_no multiple_lines_no_phone_service multiple_lines_yes internet_service_dsl internet_service_fiber_optic internet_service_no online_security_no online_security_no_internet_service online_security_yes online_backup_no online_backup_no_internet_service online_backup_yes device_protection_no device_protection_no_internet_service device_protection_yes tech_support_no tech_support_no_internet_service tech_support_yes streaming_tv_no streaming_tv_no_internet_service streaming_tv_yes streaming_movies_no streaming_movies_no_internet_service streaming_movies_yes contract_month_to_month contract_one_year contract_two_year payment_method_bank_transfer_automatic payment_method_credit_card_automatic payment_method_electronic_check payment_method_mailed_check tenure_group_tenure_0_12 tenure_group_tenure_12_24 tenure_group_tenure_24_48 tenure_group_tenure_48_60 tenure_group_tenure_gt_60 tenure monthly_charges total_charges\n", "0 7590-VHVEG 0 0 1 0 0 1 0 0 1 0 1 0 0 1 0 0 0 0 1 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 0 0 1 0 1 0 0 0 0 -1.280248 -1.161694 -0.994194\n", "1 5575-GNVDE 1 0 0 0 1 0 0 1 0 0 1 0 0 0 0 1 1 0 0 0 0 1 1 0 0 1 0 0 1 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0.064303 -0.260878 -0.173740\n", "2 3668-QPYBK 1 0 0 0 1 1 1 1 0 0 1 0 0 0 0 1 0 0 1 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 0 0 0 1 1 0 0 0 0 -1.239504 -0.363923 -0.959649\n", "3 7795-CFOCW 1 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 1 0 0 0 0 1 0 0 1 1 0 0 1 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0.512486 -0.747850 -0.195248\n", "4 9237-HQITU 0 0 0 0 1 1 1 1 0 0 0 1 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 0 0 1 0 1 0 0 0 0 -1.239504 0.196178 -0.940457" ] } ], "source": [ "from sklearn.preprocessing import LabelEncoder\n", "from sklearn.preprocessing import StandardScaler\n", "\n", "# Customer id col\n", "Id_col = ['customer_id']\n", "\n", "# Target columns\n", "target_col = [\"churn\"]\n", "\n", "# Categorical columns\n", "cat_cols = telcom.nunique()[telcom.nunique() < 6].keys().tolist()\n", "cat_cols = [x for x in cat_cols if x not in target_col]\n", "\n", "# Numerical columns\n", "num_cols = [x for x in telcom.columns if x not in cat_cols + target_col + Id_col]\n", "\n", "# Binary columns with 2 values\n", "bin_cols = telcom.nunique()[telcom.nunique() == 2].keys().tolist()\n", "\n", "# Columns more than 2 values\n", "multi_cols = [i for i in cat_cols if i not in bin_cols]\n", "\n", "# Label encoding Binary columns\n", "le = LabelEncoder()\n", "for i in bin_cols :\n", " telcom[i] = le.fit_transform(telcom[i])\n", " \n", "# Duplicating columns for multi value columns\n", "telcom = pd.get_dummies(data = telcom,columns = multi_cols )\n", "\n", "# Scaling Numerical columns\n", "std = StandardScaler()\n", "scaled = std.fit_transform(telcom[num_cols])\n", "scaled = pd.DataFrame(scaled,columns=num_cols)\n", "\n", "# Dropping original values merging scaled values for numerical columns\n", "df_telcom_og = telcom.copy()\n", "telcom = telcom.drop(columns = num_cols,axis = 1)\n", "telcom = telcom.merge(scaled,left_index=True,right_index=True,how = \"left\")\n", "\n", "# Clean up column names\n", "telcom.columns = [slugify(col, lowercase=True, separator='_') for col in telcom.columns]\n", "telcom.columns = map(str.lower, telcom.columns)\n", "telcom.head()" ] }, { "cell_type": "markdown", "metadata": { "_uuid": "9ec25cff71c0eb0f0c839a726cb06cb43462a53f" }, "source": [ "### 1.4 Create FeatureGroups" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "val telcoFg = (fs.createFeatureGroup()\n", " .name(\"telco_customer_features\")\n", " .version(2)\n", " .description(\"Telecom customer features\")\n", " .onlineEnabled(true)\n", " .timeTravelFormat(TimeTravelFormat.HUDI)\n", " .primaryKeys(Seq(\"customer_id\"))\n", " .statisticsEnabled(true)\n", " .build())\n", "\n", "telcoFg.save(telcom_churn_features)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.9" }, "pycharm": { "stem_cell": { "cell_type": "raw", "metadata": { "collapsed": false }, "source": [] } } }, "nbformat": 4, "nbformat_minor": 4 }