From 3a50d987c2f4fd3353839fdfcd66392c6c59c042 Mon Sep 17 00:00:00 2001 From: Alaa Date: Wed, 23 Oct 2024 18:55:07 +0100 Subject: [PATCH 1/3] initial cleaning --- data/Dirty Data.ipynb | 3196 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 3183 insertions(+), 13 deletions(-) diff --git a/data/Dirty Data.ipynb b/data/Dirty Data.ipynb index de8964f..c224a42 100644 --- a/data/Dirty Data.ipynb +++ b/data/Dirty Data.ipynb @@ -2,56 +2,3226 @@ "cells": [ { "cell_type": "code", - "execution_count": 19, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", - "import numpy as np" + "import matplotlib.pyplot as plt\n", + "import seaborn as sn\n", + "import numpy as np\n", + "import plotly.graph_objs as go\n", + "from plotly.offline import iplot\n" ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ - "dataset = pd.read_csv('BankChurners.csv')" + "# Read the dataset\n", + "dataset = pd.read_csv('BankChurners_v2.csv')" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(10127, 23)\n" + ] + } + ], + "source": [ + "# Check the shape of the dataset\n", + "print(dataset.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['CLIENTNUM', 'Attrition_Flag', 'Customer_Age', 'Gender',\n", + " 'Dependent_count', 'Education_Level', 'Marital_Status',\n", + " 'Income_Category', 'Card_Category', 'Months_on_book',\n", + " 'Total_Relationship_Count', 'Months_Inactive_12_mon',\n", + " 'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal',\n", + " 'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',\n", + " 'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio',\n", + " 
'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',\n", + " 'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2'],\n", + " dtype='object')" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Display the features of the dataset\n", + "dataset.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CLIENTNUMAttrition_FlagCustomer_AgeGenderDependent_countEducation_LevelMarital_StatusIncome_CategoryCard_CategoryMonths_on_book...Credit_LimitTotal_Revolving_BalAvg_Open_To_BuyTotal_Amt_Chng_Q4_Q1Total_Trans_AmtTotal_Trans_CtTotal_Ct_Chng_Q4_Q1Avg_Utilization_RatioNaive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2
090032Existing Customer45M3High SchoolMarried$60K - $80KBlue39...12691.077711914.01.3351144421.6250.0610.0000930.99991
190033Existing Customer49F5GraduateSingleLess than $40KBlue44...8256.08647392.01.5411291333.7140.1050.0000570.99994
290034Existing Customer51M3GraduateMarried$80K - $120KBlue36...3418.003418.02.5941887202.3330.0000.0000210.99998
390035Existing Customer40F4High SchoolNaNLess than $40KBlue34...3313.02517796.01.4051171202.3330.7600.0001340.99987
490036Existing Customer40M3UneducatedMarried$60K - $80KBlue21...4716.004716.02.175816282.5000.0000.0000220.99998
\n", + "

5 rows × 23 columns

\n", + "
" + ], + "text/plain": [ + " CLIENTNUM Attrition_Flag Customer_Age Gender Dependent_count \\\n", + "0 90032 Existing Customer 45 M 3 \n", + "1 90033 Existing Customer 49 F 5 \n", + "2 90034 Existing Customer 51 M 3 \n", + "3 90035 Existing Customer 40 F 4 \n", + "4 90036 Existing Customer 40 M 3 \n", + "\n", + " Education_Level Marital_Status Income_Category Card_Category \\\n", + "0 High School Married $60K - $80K Blue \n", + "1 Graduate Single Less than $40K Blue \n", + "2 Graduate Married $80K - $120K Blue \n", + "3 High School NaN Less than $40K Blue \n", + "4 Uneducated Married $60K - $80K Blue \n", + "\n", + " Months_on_book ... Credit_Limit Total_Revolving_Bal Avg_Open_To_Buy \\\n", + "0 39 ... 12691.0 777 11914.0 \n", + "1 44 ... 8256.0 864 7392.0 \n", + "2 36 ... 3418.0 0 3418.0 \n", + "3 34 ... 3313.0 2517 796.0 \n", + "4 21 ... 4716.0 0 4716.0 \n", + "\n", + " Total_Amt_Chng_Q4_Q1 Total_Trans_Amt Total_Trans_Ct Total_Ct_Chng_Q4_Q1 \\\n", + "0 1.335 1144 42 1.625 \n", + "1 1.541 1291 33 3.714 \n", + "2 2.594 1887 20 2.333 \n", + "3 1.405 1171 20 2.333 \n", + "4 2.175 816 28 2.500 \n", + "\n", + " Avg_Utilization_Ratio \\\n", + "0 0.061 \n", + "1 0.105 \n", + "2 0.000 \n", + "3 0.760 \n", + "4 0.000 \n", + "\n", + " Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1 \\\n", + "0 0.000093 \n", + "1 0.000057 \n", + "2 0.000021 \n", + "3 0.000134 \n", + "4 0.000022 \n", + "\n", + " Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2 \n", + "0 0.99991 \n", + "1 0.99994 \n", + "2 0.99998 \n", + "3 0.99987 \n", + "4 0.99998 \n", + "\n", + "[5 rows x 23 columns]" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Display the first 5 rows of the dataset\n", + "dataset.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ 
+ "# Cleaning " + ] + }, + { + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "dataset = dataset.replace('Unknown', np.nan)" + "Duplication" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "10127" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Data Duplication\n", + "# Check the original number of rows \n", + "dataset.shape[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There are two ways to check if there are duplications. " + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "10127" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# First way\n", + "# CLIENTNUM is a unique number, so it should not be duplicated\n", + "# Check if all the CLIENTNUM values are unique\n", + "dataset['CLIENTNUM'].nunique()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "No duplication in the dataset " + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(10127, 23)" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Second way \n", + "# Drop every row whose CLIENTNUM is duplicated (keep=False removes all copies, not just the extras)\n", + "dataset.drop_duplicates(subset= 'CLIENTNUM', keep= False, inplace=True )\n", + "\n", + "# Re-check the number of rows after dropping any duplicates\n", + "dataset.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Both ways showed no duplicated rows" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Drop irrelevant columns" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, "outputs": [], "source": [ - "dataset['CLIENTNUM'] = [x + 90032 for x in
list(range(len(dataset)))]" + "dataset.drop(columns=['Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',\n", + " 'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2'], inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index(['CLIENTNUM', 'Attrition_Flag', 'Customer_Age', 'Gender',\n", + " 'Dependent_count', 'Education_Level', 'Marital_Status',\n", + " 'Income_Category', 'Card_Category', 'Months_on_book',\n", + " 'Total_Relationship_Count', 'Months_Inactive_12_mon',\n", + " 'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal',\n", + " 'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',\n", + " 'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio'],\n", + " dtype='object')\n", + "(10127, 21)\n" + ] + } + ], + "source": [ + "# Check the columns\n", + "print(dataset.columns)\n", + "# Check the number of columns\n", + "print(dataset.shape)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Features' Datatypes" ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "CLIENTNUM int64\n", + "Attrition_Flag object\n", + "Customer_Age int64\n", + "Gender object\n", + "Dependent_count int64\n", + "Education_Level object\n", + "Marital_Status object\n", + "Income_Category object\n", + "Card_Category object\n", + "Months_on_book int64\n", + "Total_Relationship_Count int64\n", + "Months_Inactive_12_mon int64\n", + "Contacts_Count_12_mon int64\n", + "Credit_Limit float64\n", + "Total_Revolving_Bal int64\n", + "Avg_Open_To_Buy float64\n", + "Total_Amt_Chng_Q4_Q1 float64\n", + "Total_Trans_Amt int64\n", + "Total_Trans_Ct int64\n", + "Total_Ct_Chng_Q4_Q1 
float64\n", + "Avg_Utilization_Ratio float64\n", + "dtype: object" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Check the datatype of the variables\n", + "dataset.dtypes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Missing Values" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "CLIENTNUM 0\n", + "Attrition_Flag 0\n", + "Customer_Age 0\n", + "Gender 0\n", + "Dependent_count 0\n", + "Education_Level 1519\n", + "Marital_Status 749\n", + "Income_Category 1112\n", + "Card_Category 0\n", + "Months_on_book 0\n", + "Total_Relationship_Count 0\n", + "Months_Inactive_12_mon 0\n", + "Contacts_Count_12_mon 0\n", + "Credit_Limit 0\n", + "Total_Revolving_Bal 0\n", + "Avg_Open_To_Buy 0\n", + "Total_Amt_Chng_Q4_Q1 0\n", + "Total_Trans_Amt 0\n", + "Total_Trans_Ct 0\n", + "Total_Ct_Chng_Q4_Q1 0\n", + "Avg_Utilization_Ratio 0\n", + "dtype: int64" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Check if there are any missing values\n", + "dataset.isna().sum() # OR dataset.isnull().sum()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Three variables have missing values. These variables are categorical." + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CLIENTNUMAttrition_FlagCustomer_AgeGenderDependent_countEducation_LevelMarital_StatusIncome_CategoryCard_CategoryMonths_on_book...Months_Inactive_12_monContacts_Count_12_monCredit_LimitTotal_Revolving_BalAvg_Open_To_BuyTotal_Amt_Chng_Q4_Q1Total_Trans_AmtTotal_Trans_CtTotal_Ct_Chng_Q4_Q1Avg_Utilization_Ratio
690038Existing Customer51M4NaNMarried$120K +Gold46...1334516.0226432252.01.9751330310.7220.066
1190043Existing Customer65M1NaNMarried$40K - $60KBlue54...239095.015877508.01.4331314261.3640.174
1590047Existing Customer44M4NaNNaN$80K - $120KBlue37...124234.09723262.01.7071348271.7000.230
1790049Existing Customer41M3NaNMarried$80K - $120KBlue34...4113535.0129112244.00.6531028211.6250.095
2390055Existing Customer47F4NaNSingleLess than $40KBlue36...322492.01560932.00.5731126230.3530.626
..................................................................
10090100122Existing Customer36F3NaNMarried$40K - $60KBlue22...3312958.0227310685.00.60815681960.6270.175
10094100126Existing Customer59M1NaNSingle$60K - $80KBlue48...127288.007288.00.640148731200.7140.000
10095100127Existing Customer46M3NaNMarried$80K - $120KBlue33...1334516.0109933417.00.816154901100.6180.032
10118100150Attrited Customer50M1NaNNaN$80K - $120KBlue36...349959.09529007.00.82510310631.1000.096
10123100155Attrited Customer41M2NaNDivorced$40K - $60KBlue25...234277.021862091.00.8048764690.6830.511
\n", + "

1519 rows × 21 columns

\n", + "
" + ], + "text/plain": [ + " CLIENTNUM Attrition_Flag Customer_Age Gender Dependent_count \\\n", + "6 90038 Existing Customer 51 M 4 \n", + "11 90043 Existing Customer 65 M 1 \n", + "15 90047 Existing Customer 44 M 4 \n", + "17 90049 Existing Customer 41 M 3 \n", + "23 90055 Existing Customer 47 F 4 \n", + "... ... ... ... ... ... \n", + "10090 100122 Existing Customer 36 F 3 \n", + "10094 100126 Existing Customer 59 M 1 \n", + "10095 100127 Existing Customer 46 M 3 \n", + "10118 100150 Attrited Customer 50 M 1 \n", + "10123 100155 Attrited Customer 41 M 2 \n", + "\n", + " Education_Level Marital_Status Income_Category Card_Category \\\n", + "6 NaN Married $120K + Gold \n", + "11 NaN Married $40K - $60K Blue \n", + "15 NaN NaN $80K - $120K Blue \n", + "17 NaN Married $80K - $120K Blue \n", + "23 NaN Single Less than $40K Blue \n", + "... ... ... ... ... \n", + "10090 NaN Married $40K - $60K Blue \n", + "10094 NaN Single $60K - $80K Blue \n", + "10095 NaN Married $80K - $120K Blue \n", + "10118 NaN NaN $80K - $120K Blue \n", + "10123 NaN Divorced $40K - $60K Blue \n", + "\n", + " Months_on_book ... Months_Inactive_12_mon Contacts_Count_12_mon \\\n", + "6 46 ... 1 3 \n", + "11 54 ... 2 3 \n", + "15 37 ... 1 2 \n", + "17 34 ... 4 1 \n", + "23 36 ... 3 2 \n", + "... ... ... ... ... \n", + "10090 22 ... 3 3 \n", + "10094 48 ... 1 2 \n", + "10095 33 ... 1 3 \n", + "10118 36 ... 3 4 \n", + "10123 25 ... 2 3 \n", + "\n", + " Credit_Limit Total_Revolving_Bal Avg_Open_To_Buy \\\n", + "6 34516.0 2264 32252.0 \n", + "11 9095.0 1587 7508.0 \n", + "15 4234.0 972 3262.0 \n", + "17 13535.0 1291 12244.0 \n", + "23 2492.0 1560 932.0 \n", + "... ... ... ... 
\n", + "10090 12958.0 2273 10685.0 \n", + "10094 7288.0 0 7288.0 \n", + "10095 34516.0 1099 33417.0 \n", + "10118 9959.0 952 9007.0 \n", + "10123 4277.0 2186 2091.0 \n", + "\n", + " Total_Amt_Chng_Q4_Q1 Total_Trans_Amt Total_Trans_Ct \\\n", + "6 1.975 1330 31 \n", + "11 1.433 1314 26 \n", + "15 1.707 1348 27 \n", + "17 0.653 1028 21 \n", + "23 0.573 1126 23 \n", + "... ... ... ... \n", + "10090 0.608 15681 96 \n", + "10094 0.640 14873 120 \n", + "10095 0.816 15490 110 \n", + "10118 0.825 10310 63 \n", + "10123 0.804 8764 69 \n", + "\n", + " Total_Ct_Chng_Q4_Q1 Avg_Utilization_Ratio \n", + "6 0.722 0.066 \n", + "11 1.364 0.174 \n", + "15 1.700 0.230 \n", + "17 1.625 0.095 \n", + "23 0.353 0.626 \n", + "... ... ... \n", + "10090 0.627 0.175 \n", + "10094 0.714 0.000 \n", + "10095 0.618 0.032 \n", + "10118 1.100 0.096 \n", + "10123 0.683 0.511 \n", + "\n", + "[1519 rows x 21 columns]" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Have a look at the missing values in Education_Level\n", + "dataset[dataset.Education_Level.isna()]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "No trend is seen. There are also other missing values in other variables in the rows above." + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CLIENTNUMAttrition_FlagCustomer_AgeGenderDependent_countEducation_LevelMarital_StatusIncome_CategoryCard_CategoryMonths_on_book...Months_Inactive_12_monContacts_Count_12_monCredit_LimitTotal_Revolving_BalAvg_Open_To_BuyTotal_Amt_Chng_Q4_Q1Total_Trans_AmtTotal_Trans_CtTotal_Ct_Chng_Q4_Q1Avg_Utilization_Ratio
390035Existing Customer40F4High SchoolNaNLess than $40KBlue34...413313.02517796.01.4051171202.3330.760
790039Existing Customer32M0High SchoolNaN$60K - $80KSilver27...2229081.0139627685.02.2041538360.7140.048
1090042Existing Customer42M5UneducatedNaN$120K +Blue31...326748.014675281.00.8311201420.6800.217
1390045Existing Customer35M3GraduateNaN$60K - $80KBlue30...138547.016666881.01.1631311332.0000.195
1590047Existing Customer44M4NaNNaN$80K - $120KBlue37...124234.09723262.01.7071348271.7000.230
..................................................................
10070100102Existing Customer47M3High SchoolNaN$80K - $120KSilver40...3234516.0137133145.00.691159301230.8360.040
10100100132Existing Customer39M2GraduateNaN$60K - $80KSilver36...2229808.0029808.00.669160981280.6840.000
10101100133Existing Customer42M2GraduateNaN$40K - $60KBlue30...253735.017232012.00.59514501920.8400.461
10118100150Attrited Customer50M1NaNNaN$80K - $120KBlue36...349959.09529007.00.82510310631.1000.096
10125100157Attrited Customer30M2GraduateNaN$40K - $60KBlue36...335281.005281.00.5358395620.7220.000
\n", + "

749 rows × 21 columns

\n", + "
" + ], + "text/plain": [ + " CLIENTNUM Attrition_Flag Customer_Age Gender Dependent_count \\\n", + "3 90035 Existing Customer 40 F 4 \n", + "7 90039 Existing Customer 32 M 0 \n", + "10 90042 Existing Customer 42 M 5 \n", + "13 90045 Existing Customer 35 M 3 \n", + "15 90047 Existing Customer 44 M 4 \n", + "... ... ... ... ... ... \n", + "10070 100102 Existing Customer 47 M 3 \n", + "10100 100132 Existing Customer 39 M 2 \n", + "10101 100133 Existing Customer 42 M 2 \n", + "10118 100150 Attrited Customer 50 M 1 \n", + "10125 100157 Attrited Customer 30 M 2 \n", + "\n", + " Education_Level Marital_Status Income_Category Card_Category \\\n", + "3 High School NaN Less than $40K Blue \n", + "7 High School NaN $60K - $80K Silver \n", + "10 Uneducated NaN $120K + Blue \n", + "13 Graduate NaN $60K - $80K Blue \n", + "15 NaN NaN $80K - $120K Blue \n", + "... ... ... ... ... \n", + "10070 High School NaN $80K - $120K Silver \n", + "10100 Graduate NaN $60K - $80K Silver \n", + "10101 Graduate NaN $40K - $60K Blue \n", + "10118 NaN NaN $80K - $120K Blue \n", + "10125 Graduate NaN $40K - $60K Blue \n", + "\n", + " Months_on_book ... Months_Inactive_12_mon Contacts_Count_12_mon \\\n", + "3 34 ... 4 1 \n", + "7 27 ... 2 2 \n", + "10 31 ... 3 2 \n", + "13 30 ... 1 3 \n", + "15 37 ... 1 2 \n", + "... ... ... ... ... \n", + "10070 40 ... 3 2 \n", + "10100 36 ... 2 2 \n", + "10101 30 ... 2 5 \n", + "10118 36 ... 3 4 \n", + "10125 36 ... 3 3 \n", + "\n", + " Credit_Limit Total_Revolving_Bal Avg_Open_To_Buy \\\n", + "3 3313.0 2517 796.0 \n", + "7 29081.0 1396 27685.0 \n", + "10 6748.0 1467 5281.0 \n", + "13 8547.0 1666 6881.0 \n", + "15 4234.0 972 3262.0 \n", + "... ... ... ... 
\n", + "10070 34516.0 1371 33145.0 \n", + "10100 29808.0 0 29808.0 \n", + "10101 3735.0 1723 2012.0 \n", + "10118 9959.0 952 9007.0 \n", + "10125 5281.0 0 5281.0 \n", + "\n", + " Total_Amt_Chng_Q4_Q1 Total_Trans_Amt Total_Trans_Ct \\\n", + "3 1.405 1171 20 \n", + "7 2.204 1538 36 \n", + "10 0.831 1201 42 \n", + "13 1.163 1311 33 \n", + "15 1.707 1348 27 \n", + "... ... ... ... \n", + "10070 0.691 15930 123 \n", + "10100 0.669 16098 128 \n", + "10101 0.595 14501 92 \n", + "10118 0.825 10310 63 \n", + "10125 0.535 8395 62 \n", + "\n", + " Total_Ct_Chng_Q4_Q1 Avg_Utilization_Ratio \n", + "3 2.333 0.760 \n", + "7 0.714 0.048 \n", + "10 0.680 0.217 \n", + "13 2.000 0.195 \n", + "15 1.700 0.230 \n", + "... ... ... \n", + "10070 0.836 0.040 \n", + "10100 0.684 0.000 \n", + "10101 0.840 0.461 \n", + "10118 1.100 0.096 \n", + "10125 0.722 0.000 \n", + "\n", + "[749 rows x 21 columns]" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Investigate the missing values in Marital_Status\n", + "dataset[dataset['Marital_Status'].isna()]" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CLIENTNUMAttrition_FlagCustomer_AgeGenderDependent_countEducation_LevelMarital_StatusIncome_CategoryCard_CategoryMonths_on_book...Months_Inactive_12_monContacts_Count_12_monCredit_LimitTotal_Revolving_BalAvg_Open_To_BuyTotal_Amt_Chng_Q4_Q1Total_Trans_AmtTotal_Trans_CtTotal_Ct_Chng_Q4_Q1Avg_Utilization_Ratio
1990051Existing Customer45F2GraduateMarriedNaNBlue37...1214470.0115713313.00.9661207210.9090.080
2890060Existing Customer44F3UneducatedSingleNaNBlue34...2210100.0010100.00.5251052181.5710.000
3990071Attrited Customer66F0DoctorateMarriedNaNBlue56...437882.06057277.01.052704160.1430.077
4490076Existing Customer38F4GraduateSingleNaNBlue28...339830.020557775.00.9771042230.9170.209
5890090Existing Customer44F5GraduateMarriedNaNBlue35...126273.09785295.02.2751359251.0830.156
..................................................................
10021100053Attrited Customer30F1GraduateMarriedNaNBlue18...144377.025171860.00.9418759740.6090.575
10040100072Attrited Customer50F3DoctorateSingleNaNBlue36...335173.005173.00.9128757680.7890.000
10083100115Existing Customer42F4UneducatedMarriedNaNBlue23...128348.008348.00.695159051110.7080.000
10092100124Attrited Customer40F3GraduateMarriedNaNBlue25...236888.018785010.01.0599038640.8290.273
10119100151Attrited Customer55F3UneducatedSingleNaNBlue47...3314657.0251712140.00.1666009530.5140.172
\n", + "

1112 rows × 21 columns

\n", + "
" + ], + "text/plain": [ + " CLIENTNUM Attrition_Flag Customer_Age Gender Dependent_count \\\n", + "19 90051 Existing Customer 45 F 2 \n", + "28 90060 Existing Customer 44 F 3 \n", + "39 90071 Attrited Customer 66 F 0 \n", + "44 90076 Existing Customer 38 F 4 \n", + "58 90090 Existing Customer 44 F 5 \n", + "... ... ... ... ... ... \n", + "10021 100053 Attrited Customer 30 F 1 \n", + "10040 100072 Attrited Customer 50 F 3 \n", + "10083 100115 Existing Customer 42 F 4 \n", + "10092 100124 Attrited Customer 40 F 3 \n", + "10119 100151 Attrited Customer 55 F 3 \n", + "\n", + " Education_Level Marital_Status Income_Category Card_Category \\\n", + "19 Graduate Married NaN Blue \n", + "28 Uneducated Single NaN Blue \n", + "39 Doctorate Married NaN Blue \n", + "44 Graduate Single NaN Blue \n", + "58 Graduate Married NaN Blue \n", + "... ... ... ... ... \n", + "10021 Graduate Married NaN Blue \n", + "10040 Doctorate Single NaN Blue \n", + "10083 Uneducated Married NaN Blue \n", + "10092 Graduate Married NaN Blue \n", + "10119 Uneducated Single NaN Blue \n", + "\n", + " Months_on_book ... Months_Inactive_12_mon Contacts_Count_12_mon \\\n", + "19 37 ... 1 2 \n", + "28 34 ... 2 2 \n", + "39 56 ... 4 3 \n", + "44 28 ... 3 3 \n", + "58 35 ... 1 2 \n", + "... ... ... ... ... \n", + "10021 18 ... 1 4 \n", + "10040 36 ... 3 3 \n", + "10083 23 ... 1 2 \n", + "10092 25 ... 2 3 \n", + "10119 47 ... 3 3 \n", + "\n", + " Credit_Limit Total_Revolving_Bal Avg_Open_To_Buy \\\n", + "19 14470.0 1157 13313.0 \n", + "28 10100.0 0 10100.0 \n", + "39 7882.0 605 7277.0 \n", + "44 9830.0 2055 7775.0 \n", + "58 6273.0 978 5295.0 \n", + "... ... ... ... 
\n", + "10021 4377.0 2517 1860.0 \n", + "10040 5173.0 0 5173.0 \n", + "10083 8348.0 0 8348.0 \n", + "10092 6888.0 1878 5010.0 \n", + "10119 14657.0 2517 12140.0 \n", + "\n", + " Total_Amt_Chng_Q4_Q1 Total_Trans_Amt Total_Trans_Ct \\\n", + "19 0.966 1207 21 \n", + "28 0.525 1052 18 \n", + "39 1.052 704 16 \n", + "44 0.977 1042 23 \n", + "58 2.275 1359 25 \n", + "... ... ... ... \n", + "10021 0.941 8759 74 \n", + "10040 0.912 8757 68 \n", + "10083 0.695 15905 111 \n", + "10092 1.059 9038 64 \n", + "10119 0.166 6009 53 \n", + "\n", + " Total_Ct_Chng_Q4_Q1 Avg_Utilization_Ratio \n", + "19 0.909 0.080 \n", + "28 1.571 0.000 \n", + "39 0.143 0.077 \n", + "44 0.917 0.209 \n", + "58 1.083 0.156 \n", + "... ... ... \n", + "10021 0.609 0.575 \n", + "10040 0.789 0.000 \n", + "10083 0.708 0.000 \n", + "10092 0.829 0.273 \n", + "10119 0.514 0.172 \n", + "\n", + "[1112 rows x 21 columns]" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Investigate the missing values in Income_Category\n", + "dataset[dataset['Income_Category'].isna()]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Fill the missing values with \"Unknown\" in the three variables."
+ ] + }, + { + "cell_type": "code", + "execution_count": 37, "metadata": {}, "outputs": [], "source": [ - "dataset.to_csv('BankChurners_v2.csv', index=False)" + "# Impute the missing values with Unknown\n", + "dataset['Education_Level'] = dataset['Education_Level'].fillna(\"Unknown\")\n", + "dataset['Marital_Status'] = dataset['Marital_Status'].fillna(\"Unknown\")\n", + "dataset['Income_Category'] = dataset['Income_Category'].fillna(\"Unknown\")\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "CLIENTNUM 0\n", + "Attrition_Flag 0\n", + "Customer_Age 0\n", + "Gender 0\n", + "Dependent_count 0\n", + "Education_Level 0\n", + "Marital_Status 0\n", + "Income_Category 0\n", + "Card_Category 0\n", + "Months_on_book 0\n", + "Total_Relationship_Count 0\n", + "Months_Inactive_12_mon 0\n", + "Contacts_Count_12_mon 0\n", + "Credit_Limit 0\n", + "Total_Revolving_Bal 0\n", + "Avg_Open_To_Buy 0\n", + "Total_Amt_Chng_Q4_Q1 0\n", + "Total_Trans_Amt 0\n", + "Total_Trans_Ct 0\n", + "Total_Ct_Chng_Q4_Q1 0\n", + "Avg_Utilization_Ratio 0\n", + "dtype: int64" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Check that there is no missing values\n", + "dataset.isna().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CLIENTNUMAttrition_FlagCustomer_AgeGenderDependent_countEducation_LevelMarital_StatusIncome_CategoryCard_CategoryMonths_on_book...Months_Inactive_12_monContacts_Count_12_monCredit_LimitTotal_Revolving_BalAvg_Open_To_BuyTotal_Amt_Chng_Q4_Q1Total_Trans_AmtTotal_Trans_CtTotal_Ct_Chng_Q4_Q1Avg_Utilization_Ratio
690038Existing Customer51M4UnknownMarried$120K +Gold46...1334516.0226432252.01.9751330310.7220.066
1190043Existing Customer65M1UnknownMarried$40K - $60KBlue54...239095.015877508.01.4331314261.3640.174
1590047Existing Customer44M4UnknownUnknown$80K - $120KBlue37...124234.09723262.01.7071348271.7000.230
1790049Existing Customer41M3UnknownMarried$80K - $120KBlue34...4113535.0129112244.00.6531028211.6250.095
2390055Existing Customer47F4UnknownSingleLess than $40KBlue36...322492.01560932.00.5731126230.3530.626
..................................................................
10090100122Existing Customer36F3UnknownMarried$40K - $60KBlue22...3312958.0227310685.00.60815681960.6270.175
10094100126Existing Customer59M1UnknownSingle$60K - $80KBlue48...127288.007288.00.640148731200.7140.000
10095100127Existing Customer46M3UnknownMarried$80K - $120KBlue33...1334516.0109933417.00.816154901100.6180.032
10118100150Attrited Customer50M1UnknownUnknown$80K - $120KBlue36...349959.09529007.00.82510310631.1000.096
10123100155Attrited Customer41M2UnknownDivorced$40K - $60KBlue25...234277.021862091.00.8048764690.6830.511
\n", + "

1519 rows × 21 columns

\n", + "
" + ], + "text/plain": [ + " CLIENTNUM Attrition_Flag Customer_Age Gender Dependent_count \\\n", + "6 90038 Existing Customer 51 M 4 \n", + "11 90043 Existing Customer 65 M 1 \n", + "15 90047 Existing Customer 44 M 4 \n", + "17 90049 Existing Customer 41 M 3 \n", + "23 90055 Existing Customer 47 F 4 \n", + "... ... ... ... ... ... \n", + "10090 100122 Existing Customer 36 F 3 \n", + "10094 100126 Existing Customer 59 M 1 \n", + "10095 100127 Existing Customer 46 M 3 \n", + "10118 100150 Attrited Customer 50 M 1 \n", + "10123 100155 Attrited Customer 41 M 2 \n", + "\n", + " Education_Level Marital_Status Income_Category Card_Category \\\n", + "6 Unknown Married $120K + Gold \n", + "11 Unknown Married $40K - $60K Blue \n", + "15 Unknown Unknown $80K - $120K Blue \n", + "17 Unknown Married $80K - $120K Blue \n", + "23 Unknown Single Less than $40K Blue \n", + "... ... ... ... ... \n", + "10090 Unknown Married $40K - $60K Blue \n", + "10094 Unknown Single $60K - $80K Blue \n", + "10095 Unknown Married $80K - $120K Blue \n", + "10118 Unknown Unknown $80K - $120K Blue \n", + "10123 Unknown Divorced $40K - $60K Blue \n", + "\n", + " Months_on_book ... Months_Inactive_12_mon Contacts_Count_12_mon \\\n", + "6 46 ... 1 3 \n", + "11 54 ... 2 3 \n", + "15 37 ... 1 2 \n", + "17 34 ... 4 1 \n", + "23 36 ... 3 2 \n", + "... ... ... ... ... \n", + "10090 22 ... 3 3 \n", + "10094 48 ... 1 2 \n", + "10095 33 ... 1 3 \n", + "10118 36 ... 3 4 \n", + "10123 25 ... 2 3 \n", + "\n", + " Credit_Limit Total_Revolving_Bal Avg_Open_To_Buy \\\n", + "6 34516.0 2264 32252.0 \n", + "11 9095.0 1587 7508.0 \n", + "15 4234.0 972 3262.0 \n", + "17 13535.0 1291 12244.0 \n", + "23 2492.0 1560 932.0 \n", + "... ... ... ... 
\n", + "10090 12958.0 2273 10685.0 \n", + "10094 7288.0 0 7288.0 \n", + "10095 34516.0 1099 33417.0 \n", + "10118 9959.0 952 9007.0 \n", + "10123 4277.0 2186 2091.0 \n", + "\n", + " Total_Amt_Chng_Q4_Q1 Total_Trans_Amt Total_Trans_Ct \\\n", + "6 1.975 1330 31 \n", + "11 1.433 1314 26 \n", + "15 1.707 1348 27 \n", + "17 0.653 1028 21 \n", + "23 0.573 1126 23 \n", + "... ... ... ... \n", + "10090 0.608 15681 96 \n", + "10094 0.640 14873 120 \n", + "10095 0.816 15490 110 \n", + "10118 0.825 10310 63 \n", + "10123 0.804 8764 69 \n", + "\n", + " Total_Ct_Chng_Q4_Q1 Avg_Utilization_Ratio \n", + "6 0.722 0.066 \n", + "11 1.364 0.174 \n", + "15 1.700 0.230 \n", + "17 1.625 0.095 \n", + "23 0.353 0.626 \n", + "... ... ... \n", + "10090 0.627 0.175 \n", + "10094 0.714 0.000 \n", + "10095 0.618 0.032 \n", + "10118 1.100 0.096 \n", + "10123 0.683 0.511 \n", + "\n", + "[1519 rows x 21 columns]" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Check that the imputation is working well. Compare the rows before and after imputation\n", + "dataset[dataset['Education_Level'] == \"Unknown\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CLIENTNUMAttrition_FlagCustomer_AgeGenderDependent_countEducation_LevelMarital_StatusIncome_CategoryCard_CategoryMonths_on_book...Months_Inactive_12_monContacts_Count_12_monCredit_LimitTotal_Revolving_BalAvg_Open_To_BuyTotal_Amt_Chng_Q4_Q1Total_Trans_AmtTotal_Trans_CtTotal_Ct_Chng_Q4_Q1Avg_Utilization_Ratio
1990051Existing Customer45F2GraduateMarriedUnknownBlue37...1214470.0115713313.00.9661207210.9090.080
2890060Existing Customer44F3UneducatedSingleUnknownBlue34...2210100.0010100.00.5251052181.5710.000
3990071Attrited Customer66F0DoctorateMarriedUnknownBlue56...437882.06057277.01.052704160.1430.077
4490076Existing Customer38F4GraduateSingleUnknownBlue28...339830.020557775.00.9771042230.9170.209
5890090Existing Customer44F5GraduateMarriedUnknownBlue35...126273.09785295.02.2751359251.0830.156
..................................................................
10021100053Attrited Customer30F1GraduateMarriedUnknownBlue18...144377.025171860.00.9418759740.6090.575
10040100072Attrited Customer50F3DoctorateSingleUnknownBlue36...335173.005173.00.9128757680.7890.000
10083100115Existing Customer42F4UneducatedMarriedUnknownBlue23...128348.008348.00.695159051110.7080.000
10092100124Attrited Customer40F3GraduateMarriedUnknownBlue25...236888.018785010.01.0599038640.8290.273
10119100151Attrited Customer55F3UneducatedSingleUnknownBlue47...3314657.0251712140.00.1666009530.5140.172
\n", + "

1112 rows × 21 columns

\n", + "
" + ], + "text/plain": [ + " CLIENTNUM Attrition_Flag Customer_Age Gender Dependent_count \\\n", + "19 90051 Existing Customer 45 F 2 \n", + "28 90060 Existing Customer 44 F 3 \n", + "39 90071 Attrited Customer 66 F 0 \n", + "44 90076 Existing Customer 38 F 4 \n", + "58 90090 Existing Customer 44 F 5 \n", + "... ... ... ... ... ... \n", + "10021 100053 Attrited Customer 30 F 1 \n", + "10040 100072 Attrited Customer 50 F 3 \n", + "10083 100115 Existing Customer 42 F 4 \n", + "10092 100124 Attrited Customer 40 F 3 \n", + "10119 100151 Attrited Customer 55 F 3 \n", + "\n", + " Education_Level Marital_Status Income_Category Card_Category \\\n", + "19 Graduate Married Unknown Blue \n", + "28 Uneducated Single Unknown Blue \n", + "39 Doctorate Married Unknown Blue \n", + "44 Graduate Single Unknown Blue \n", + "58 Graduate Married Unknown Blue \n", + "... ... ... ... ... \n", + "10021 Graduate Married Unknown Blue \n", + "10040 Doctorate Single Unknown Blue \n", + "10083 Uneducated Married Unknown Blue \n", + "10092 Graduate Married Unknown Blue \n", + "10119 Uneducated Single Unknown Blue \n", + "\n", + " Months_on_book ... Months_Inactive_12_mon Contacts_Count_12_mon \\\n", + "19 37 ... 1 2 \n", + "28 34 ... 2 2 \n", + "39 56 ... 4 3 \n", + "44 28 ... 3 3 \n", + "58 35 ... 1 2 \n", + "... ... ... ... ... \n", + "10021 18 ... 1 4 \n", + "10040 36 ... 3 3 \n", + "10083 23 ... 1 2 \n", + "10092 25 ... 2 3 \n", + "10119 47 ... 3 3 \n", + "\n", + " Credit_Limit Total_Revolving_Bal Avg_Open_To_Buy \\\n", + "19 14470.0 1157 13313.0 \n", + "28 10100.0 0 10100.0 \n", + "39 7882.0 605 7277.0 \n", + "44 9830.0 2055 7775.0 \n", + "58 6273.0 978 5295.0 \n", + "... ... ... ... 
\n", + "10021 4377.0 2517 1860.0 \n", + "10040 5173.0 0 5173.0 \n", + "10083 8348.0 0 8348.0 \n", + "10092 6888.0 1878 5010.0 \n", + "10119 14657.0 2517 12140.0 \n", + "\n", + " Total_Amt_Chng_Q4_Q1 Total_Trans_Amt Total_Trans_Ct \\\n", + "19 0.966 1207 21 \n", + "28 0.525 1052 18 \n", + "39 1.052 704 16 \n", + "44 0.977 1042 23 \n", + "58 2.275 1359 25 \n", + "... ... ... ... \n", + "10021 0.941 8759 74 \n", + "10040 0.912 8757 68 \n", + "10083 0.695 15905 111 \n", + "10092 1.059 9038 64 \n", + "10119 0.166 6009 53 \n", + "\n", + " Total_Ct_Chng_Q4_Q1 Avg_Utilization_Ratio \n", + "19 0.909 0.080 \n", + "28 1.571 0.000 \n", + "39 0.143 0.077 \n", + "44 0.917 0.209 \n", + "58 1.083 0.156 \n", + "... ... ... \n", + "10021 0.609 0.575 \n", + "10040 0.789 0.000 \n", + "10083 0.708 0.000 \n", + "10092 0.829 0.273 \n", + "10119 0.514 0.172 \n", + "\n", + "[1112 rows x 21 columns]" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset[dataset['Income_Category'] == \"Unknown\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CLIENTNUMAttrition_FlagCustomer_AgeGenderDependent_countEducation_LevelMarital_StatusIncome_CategoryCard_CategoryMonths_on_book...Months_Inactive_12_monContacts_Count_12_monCredit_LimitTotal_Revolving_BalAvg_Open_To_BuyTotal_Amt_Chng_Q4_Q1Total_Trans_AmtTotal_Trans_CtTotal_Ct_Chng_Q4_Q1Avg_Utilization_Ratio
390035Existing Customer40F4High SchoolUnknownLess than $40KBlue34...413313.02517796.01.4051171202.3330.760
790039Existing Customer32M0High SchoolUnknown$60K - $80KSilver27...2229081.0139627685.02.2041538360.7140.048
1090042Existing Customer42M5UneducatedUnknown$120K +Blue31...326748.014675281.00.8311201420.6800.217
1390045Existing Customer35M3GraduateUnknown$60K - $80KBlue30...138547.016666881.01.1631311332.0000.195
1590047Existing Customer44M4UnknownUnknown$80K - $120KBlue37...124234.09723262.01.7071348271.7000.230
..................................................................
10070100102Existing Customer47M3High SchoolUnknown$80K - $120KSilver40...3234516.0137133145.00.691159301230.8360.040
10100100132Existing Customer39M2GraduateUnknown$60K - $80KSilver36...2229808.0029808.00.669160981280.6840.000
10101100133Existing Customer42M2GraduateUnknown$40K - $60KBlue30...253735.017232012.00.59514501920.8400.461
10118100150Attrited Customer50M1UnknownUnknown$80K - $120KBlue36...349959.09529007.00.82510310631.1000.096
10125100157Attrited Customer30M2GraduateUnknown$40K - $60KBlue36...335281.005281.00.5358395620.7220.000
\n", + "

749 rows × 21 columns

\n", + "
" + ], + "text/plain": [ + " CLIENTNUM Attrition_Flag Customer_Age Gender Dependent_count \\\n", + "3 90035 Existing Customer 40 F 4 \n", + "7 90039 Existing Customer 32 M 0 \n", + "10 90042 Existing Customer 42 M 5 \n", + "13 90045 Existing Customer 35 M 3 \n", + "15 90047 Existing Customer 44 M 4 \n", + "... ... ... ... ... ... \n", + "10070 100102 Existing Customer 47 M 3 \n", + "10100 100132 Existing Customer 39 M 2 \n", + "10101 100133 Existing Customer 42 M 2 \n", + "10118 100150 Attrited Customer 50 M 1 \n", + "10125 100157 Attrited Customer 30 M 2 \n", + "\n", + " Education_Level Marital_Status Income_Category Card_Category \\\n", + "3 High School Unknown Less than $40K Blue \n", + "7 High School Unknown $60K - $80K Silver \n", + "10 Uneducated Unknown $120K + Blue \n", + "13 Graduate Unknown $60K - $80K Blue \n", + "15 Unknown Unknown $80K - $120K Blue \n", + "... ... ... ... ... \n", + "10070 High School Unknown $80K - $120K Silver \n", + "10100 Graduate Unknown $60K - $80K Silver \n", + "10101 Graduate Unknown $40K - $60K Blue \n", + "10118 Unknown Unknown $80K - $120K Blue \n", + "10125 Graduate Unknown $40K - $60K Blue \n", + "\n", + " Months_on_book ... Months_Inactive_12_mon Contacts_Count_12_mon \\\n", + "3 34 ... 4 1 \n", + "7 27 ... 2 2 \n", + "10 31 ... 3 2 \n", + "13 30 ... 1 3 \n", + "15 37 ... 1 2 \n", + "... ... ... ... ... \n", + "10070 40 ... 3 2 \n", + "10100 36 ... 2 2 \n", + "10101 30 ... 2 5 \n", + "10118 36 ... 3 4 \n", + "10125 36 ... 3 3 \n", + "\n", + " Credit_Limit Total_Revolving_Bal Avg_Open_To_Buy \\\n", + "3 3313.0 2517 796.0 \n", + "7 29081.0 1396 27685.0 \n", + "10 6748.0 1467 5281.0 \n", + "13 8547.0 1666 6881.0 \n", + "15 4234.0 972 3262.0 \n", + "... ... ... ... 
\n", + "10070 34516.0 1371 33145.0 \n", + "10100 29808.0 0 29808.0 \n", + "10101 3735.0 1723 2012.0 \n", + "10118 9959.0 952 9007.0 \n", + "10125 5281.0 0 5281.0 \n", + "\n", + " Total_Amt_Chng_Q4_Q1 Total_Trans_Amt Total_Trans_Ct \\\n", + "3 1.405 1171 20 \n", + "7 2.204 1538 36 \n", + "10 0.831 1201 42 \n", + "13 1.163 1311 33 \n", + "15 1.707 1348 27 \n", + "... ... ... ... \n", + "10070 0.691 15930 123 \n", + "10100 0.669 16098 128 \n", + "10101 0.595 14501 92 \n", + "10118 0.825 10310 63 \n", + "10125 0.535 8395 62 \n", + "\n", + " Total_Ct_Chng_Q4_Q1 Avg_Utilization_Ratio \n", + "3 2.333 0.760 \n", + "7 0.714 0.048 \n", + "10 0.680 0.217 \n", + "13 2.000 0.195 \n", + "15 1.700 0.230 \n", + "... ... ... \n", + "10070 0.836 0.040 \n", + "10100 0.684 0.000 \n", + "10101 0.840 0.461 \n", + "10118 1.100 0.096 \n", + "10125 0.722 0.000 \n", + "\n", + "[749 rows x 21 columns]" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset[dataset['Marital_Status'] == \"Unknown\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Data Transformation: Binning" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "count 10127.000000\n", + "mean 46.325960\n", + "std 8.016814\n", + "min 26.000000\n", + "25% 41.000000\n", + "50% 46.000000\n", + "75% 52.000000\n", + "max 73.000000\n", + "Name: Customer_Age, dtype: float64" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Get the min. and max. age\n", + "dataset['Customer_Age'].describe()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can use cut function in pandas to create bins for the customer age. However, another way is done below where a function is created and apply function is used later to execute it." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 49, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "def get_age_bins(x):\n", + " if x >=20 and x<30:\n", + " return \"20s\"\n", + " if x >= 30 and x<40:\n", + " return \"30s\"\n", + " if x>=40 and x<50:\n", + " return \"40s\"\n", + " if x>=50 and x<60:\n", + " return \"50s\"\n", + " if x>=60 and x<70:\n", + " return \"60s\"\n", + " if x>=70 and x<80:\n", + " return \"70s\"\n", + " else:\n", + " return \">=80\"\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [], + "source": [ + "dataset['customer_age_bin'] = dataset['Customer_Age'].apply(get_age_bins)" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 45\n", + "1 49\n", + "2 51\n", + "3 40\n", + "4 40\n", + "Name: Customer_Age, dtype: int64" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset['Customer_Age'].head()" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 40s\n", + "1 40s\n", + "2 50s\n", + "3 40s\n", + "4 40s\n", + "Name: customer_age_bin, dtype: object" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset['customer_age_bin'].head()" + ] } ], "metadata": { @@ -70,7 +3240,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.10.11" } }, "nbformat": 4, From 72b828ee4d450456fdea253b0bd675bfc9a71819 Mon Sep 17 00:00:00 2001 From: Alaa Date: Wed, 6 Nov 2024 13:12:29 +0000 Subject: [PATCH 2/3] EDA_Part_1 --- data/Dirty Data.ipynb | 367 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 326 insertions(+), 41 deletions(-) diff --git a/data/Dirty Data.ipynb b/data/Dirty Data.ipynb index c224a42..08a08c5 100644 --- 
a/data/Dirty Data.ipynb +++ b/data/Dirty Data.ipynb @@ -16,7 +16,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -26,7 +26,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -44,7 +44,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -62,7 +62,7 @@ " dtype='object')" ] }, - "execution_count": 18, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -74,7 +74,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -300,7 +300,7 @@ "[5 rows x 23 columns]" ] }, - "execution_count": 17, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -326,7 +326,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -335,7 +335,7 @@ "10127" ] }, - "execution_count": 22, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -355,7 +355,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -364,7 +364,7 @@ "10127" ] }, - "execution_count": 24, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -385,7 +385,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -394,7 +394,7 @@ "(10127, 23)" ] }, - "execution_count": 26, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -424,7 +424,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -434,7 +434,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -469,7 +469,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 12, 
"metadata": {}, "outputs": [ { @@ -499,7 +499,7 @@ "dtype: object" ] }, - "execution_count": 29, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -518,7 +518,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -548,7 +548,7 @@ "dtype: int64" ] }, - "execution_count": 30, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -567,7 +567,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -966,7 +966,7 @@ "[1519 rows x 21 columns]" ] }, - "execution_count": 33, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -985,7 +985,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -1384,7 +1384,7 @@ "[749 rows x 21 columns]" ] }, - "execution_count": 35, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -1396,7 +1396,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -1795,7 +1795,7 @@ "[1112 rows x 21 columns]" ] }, - "execution_count": 36, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -1814,7 +1814,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -1826,7 +1826,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -1856,7 +1856,7 @@ "dtype: int64" ] }, - "execution_count": 38, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -1868,7 +1868,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -2267,7 +2267,7 @@ "[1519 rows x 21 columns]" ] }, - "execution_count": 39, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -2279,7 +2279,7 @@ 
}, { "cell_type": "code", - "execution_count": 40, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -2678,7 +2678,7 @@ "[1112 rows x 21 columns]" ] }, - "execution_count": 40, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -2689,7 +2689,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -3088,7 +3088,7 @@ "[749 rows x 21 columns]" ] }, - "execution_count": 41, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -3106,7 +3106,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -3123,7 +3123,7 @@ "Name: Customer_Age, dtype: float64" ] }, - "execution_count": 42, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -3142,7 +3142,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ @@ -3166,7 +3166,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ @@ -3175,7 +3175,7 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 25, "metadata": {}, "outputs": [ { @@ -3189,7 +3189,7 @@ "Name: Customer_Age, dtype: int64" ] }, - "execution_count": 52, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -3200,7 +3200,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 26, "metadata": {}, "outputs": [ { @@ -3214,7 +3214,7 @@ "Name: customer_age_bin, dtype: object" ] }, - "execution_count": 53, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -3222,6 +3222,291 @@ "source": [ "dataset['customer_age_bin'].head()" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "EDA" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + 
"Attrition_Flag\n", + "Existing Customer 8500\n", + "Attrited Customer 1627\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset[\"Attrition_Flag\"].value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Calculate the percentage of the churned customers" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "16.07 % of our customers have churned\n" + ] + } + ], + "source": [ + "# Calculate the churn percentage\n", + "churned_percent = round(dataset[\"Attrition_Flag\"].value_counts()['Attrited Customer']/ dataset.shape[0] *100 , 2)\n", + "# Print the percentage of the churn\n", + "print(f\"{churned_percent} % of our customers have churned\")\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Summary Statistics" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The average number of relationship of the customers in years was around 4 years (3.81)." + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The mean of the toata relatioanship count is :3.81 and the medain is 4.0\n" + ] + } + ], + "source": [ + "# The mean and median of the \n", + "print(f\"The mean of the toata relatioanship count is :{round(dataset['Total_Relationship_Count'].mean(), 2)} and the medain is {np.median(dataset['Total_Relationship_Count'])}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The longest number of relationship of the customers was 6 years." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The maximum number of relatioanship count is :6\n" + ] + } + ], + "source": [ + "print(f\"The maximum number of relatioanship count is :{round(np.max(dataset['Total_Relationship_Count']), 2)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The average credit limit of customers is 8631.95, while the median is 4549." + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The average credit limit is : 8631.95 while the median is 4549.0\n" + ] + } + ], + "source": [ + "print(f\"The average credit limit is : {round(np.mean(dataset['Credit_Limit']), 2)} while the median is {round(np.median(dataset['Credit_Limit']), 2)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The difference between the mean and the median of Credit_Limit suggests the existence of outliers (or a skewed distribution) in this variable." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Histograms" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.hist(dataset['Customer_Age']);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Based on the histogram above, customer age is approximately normally distributed." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, display a histogram showing the frequency of months on book." + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.hist(dataset['Months_on_book']);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The above figure shows a roughly normally distributed variable with a pronounced spike: the largest group of customers (~3500) has 36 months on book. More investigation is needed to understand this spike. It could be linked to a company campaign 36 months ago, or 36 may be assigned to customers as a default Months_on_book value." + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.hist(dataset['Credit_Limit']);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The figure shows a right-skewed distribution. Most customers have a credit limit below 5000, while a smaller number of customers have a higher limit. This is consistent with the difference between the median and the mean of Credit_Limit observed earlier." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's show the mean and the median on the histogram." + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(3549.0, 2550, 'Median')" + ] + }, + "execution_count": 92, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=(10,7))\n", + "# Use 30 bins for a better representation for the distribution\n", + "plt.hist(dataset['Credit_Limit'], bins=30)\n", + "plt.vlines(dataset['Credit_Limit'].mean(), 0, 2500, colors='Black')\n", + "plt.vlines(dataset['Credit_Limit'].median(), 0, 2500,colors='Black')\n", + "\n", + "# Print the mean and median text over the vertical lines\n", + "plt.text(dataset['Credit_Limit'].mean()-1000, 2500+50, 'Mean')\n", + "plt.text(dataset['Credit_Limit'].median()-1000, 2500+50, 'Median')" + ] } ], "metadata": { From cf4d24c0b6fd9d7f275e61c71b95eb75d1d0e0b4 Mon Sep 17 00:00:00 2001 From: Alaa Date: Sun, 10 Nov 2024 12:31:22 +0000 Subject: [PATCH 3/3] Data transformation --- data/Dirty Data.ipynb | 514 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 461 insertions(+), 53 deletions(-) diff --git a/data/Dirty Data.ipynb b/data/Dirty Data.ipynb index 08a08c5..af86618 100644 --- a/data/Dirty Data.ipynb +++ b/data/Dirty Data.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": 61, "metadata": {}, "outputs": [], "source": [ @@ -16,7 +16,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 62, "metadata": {}, "outputs": [], "source": [ @@ -26,7 +26,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 63, "metadata": {}, "outputs": [ { @@ -44,7 +44,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 64, "metadata": {}, "outputs": [ { @@ -62,7 +62,7 @@ " dtype='object')" ] }, - "execution_count": 5, + "execution_count": 64, "metadata": {}, "output_type": "execute_result" } @@ -74,7 +74,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 65, "metadata": {}, "outputs": [ { @@ -300,7 +300,7 @@ "[5 rows x 23 columns]" ] }, - "execution_count": 6, + "execution_count": 65, "metadata": {}, "output_type": "execute_result" } @@ 
-326,7 +326,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 66, "metadata": {}, "outputs": [ { @@ -335,7 +335,7 @@ "10127" ] }, - "execution_count": 7, + "execution_count": 66, "metadata": {}, "output_type": "execute_result" } @@ -355,7 +355,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 67, "metadata": {}, "outputs": [ { @@ -364,7 +364,7 @@ "10127" ] }, - "execution_count": 8, + "execution_count": 67, "metadata": {}, "output_type": "execute_result" } @@ -385,7 +385,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 68, "metadata": {}, "outputs": [ { @@ -394,7 +394,7 @@ "(10127, 23)" ] }, - "execution_count": 9, + "execution_count": 68, "metadata": {}, "output_type": "execute_result" } @@ -424,7 +424,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 69, "metadata": {}, "outputs": [], "source": [ @@ -434,7 +434,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 70, "metadata": {}, "outputs": [ { @@ -469,7 +469,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 71, "metadata": {}, "outputs": [ { @@ -499,7 +499,7 @@ "dtype: object" ] }, - "execution_count": 12, + "execution_count": 71, "metadata": {}, "output_type": "execute_result" } @@ -518,7 +518,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 72, "metadata": {}, "outputs": [ { @@ -548,7 +548,7 @@ "dtype: int64" ] }, - "execution_count": 13, + "execution_count": 72, "metadata": {}, "output_type": "execute_result" } @@ -567,7 +567,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 73, "metadata": {}, "outputs": [ { @@ -966,7 +966,7 @@ "[1519 rows x 21 columns]" ] }, - "execution_count": 14, + "execution_count": 73, "metadata": {}, "output_type": "execute_result" } @@ -985,7 +985,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 74, "metadata": {}, "outputs": [ { @@ -1384,7 
+1384,7 @@ "[749 rows x 21 columns]" ] }, - "execution_count": 15, + "execution_count": 74, "metadata": {}, "output_type": "execute_result" } @@ -1396,7 +1396,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 75, "metadata": {}, "outputs": [ { @@ -1795,7 +1795,7 @@ "[1112 rows x 21 columns]" ] }, - "execution_count": 16, + "execution_count": 75, "metadata": {}, "output_type": "execute_result" } @@ -1814,7 +1814,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 76, "metadata": {}, "outputs": [], "source": [ @@ -1826,7 +1826,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 77, "metadata": {}, "outputs": [ { @@ -1856,7 +1856,7 @@ "dtype: int64" ] }, - "execution_count": 18, + "execution_count": 77, "metadata": {}, "output_type": "execute_result" } @@ -1868,7 +1868,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 78, "metadata": {}, "outputs": [ { @@ -2267,7 +2267,7 @@ "[1519 rows x 21 columns]" ] }, - "execution_count": 19, + "execution_count": 78, "metadata": {}, "output_type": "execute_result" } @@ -2279,7 +2279,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 79, "metadata": {}, "outputs": [ { @@ -2678,7 +2678,7 @@ "[1112 rows x 21 columns]" ] }, - "execution_count": 20, + "execution_count": 79, "metadata": {}, "output_type": "execute_result" } @@ -2689,7 +2689,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 80, "metadata": {}, "outputs": [ { @@ -3088,7 +3088,7 @@ "[749 rows x 21 columns]" ] }, - "execution_count": 21, + "execution_count": 80, "metadata": {}, "output_type": "execute_result" } @@ -3106,7 +3106,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 81, "metadata": {}, "outputs": [ { @@ -3123,7 +3123,7 @@ "Name: Customer_Age, dtype: float64" ] }, - "execution_count": 22, + "execution_count": 81, "metadata": {}, "output_type": "execute_result" } @@ -3142,7 +3142,7 @@ 
}, { "cell_type": "code", - "execution_count": 23, + "execution_count": 82, "metadata": {}, "outputs": [], "source": [ @@ -3166,7 +3166,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 83, "metadata": {}, "outputs": [], "source": [ @@ -3175,7 +3175,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 84, "metadata": {}, "outputs": [ { @@ -3189,7 +3189,7 @@ "Name: Customer_Age, dtype: int64" ] }, - "execution_count": 25, + "execution_count": 84, "metadata": {}, "output_type": "execute_result" } @@ -3200,7 +3200,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 85, "metadata": {}, "outputs": [ { @@ -3214,7 +3214,7 @@ "Name: customer_age_bin, dtype: object" ] }, - "execution_count": 26, + "execution_count": 85, "metadata": {}, "output_type": "execute_result" } @@ -3232,7 +3232,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 86, "metadata": {}, "outputs": [ { @@ -3244,7 +3244,7 @@ "Name: count, dtype: int64" ] }, - "execution_count": 41, + "execution_count": 86, "metadata": {}, "output_type": "execute_result" } @@ -3262,7 +3262,7 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 87, "metadata": {}, "outputs": [ { @@ -3297,7 +3297,7 @@ }, { "cell_type": "code", - "execution_count": 74, + "execution_count": 88, "metadata": {}, "outputs": [ { @@ -3322,7 +3322,7 @@ }, { "cell_type": "code", - "execution_count": 76, + "execution_count": 89, "metadata": {}, "outputs": [ { @@ -3346,7 +3346,7 @@ }, { "cell_type": "code", - "execution_count": 81, + "execution_count": 90, "metadata": {}, "outputs": [ { @@ -3377,7 +3377,7 @@ }, { "cell_type": "code", - "execution_count": 83, + "execution_count": 91, "metadata": {}, "outputs": [ { @@ -3411,7 +3411,7 @@ }, { "cell_type": "code", - "execution_count": 84, + "execution_count": 92, "metadata": {}, "outputs": [ { @@ -3438,7 +3438,7 @@ }, { "cell_type": "code", - "execution_count": 85, + "execution_count": 
93, "metadata": {}, "outputs": [ { @@ -3472,7 +3472,7 @@ }, { "cell_type": "code", - "execution_count": 92, + "execution_count": 94, "metadata": {}, "outputs": [ { @@ -3481,7 +3481,7 @@ "Text(3549.0, 2550, 'Median')" ] }, - "execution_count": 92, + "execution_count": 94, "metadata": {}, "output_type": "execute_result" }, @@ -3507,6 +3507,414 @@ "plt.text(dataset['Credit_Limit'].mean()-1000, 2500+50, 'Mean')\n", "plt.text(dataset['Credit_Limit'].median()-1000, 2500+50, 'Median')" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Data Transformation: Normalization and Log" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Two ways to transform skewed distribution into more normal one. Normalisation using min-max scaler and using log transformation." + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": {}, + "outputs": [], + "source": [ + "# Define a funbction for min-max scaler\n", + "def get_normalised_value(column):\n", + " max_val = column.max()\n", + " min_val = column.min()\n", + " y = (column-min_val)/(max_val-min_val)\n", + " return y\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Since credit_limit variable is skewed, we will try using the normalisation method to solve the problem. This method will make the values range between 0 and 1." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": {}, + "outputs": [], + "source": [ + "# Create a new column holding the normalised value of Credit_Limit\n", + "dataset['Credit_Limit_normalised'] = get_normalised_value(dataset['Credit_Limit'])" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.0\n", + "1.0\n", + "0.21747744547036094\n", + "0.09404220970623714\n" + ] + } + ], + "source": [ + "# Check the min, max, mean and median of the new column\n", + "print(dataset['Credit_Limit_normalised'].min())\n", + "print(dataset['Credit_Limit_normalised'].max())\n", + "print(dataset['Credit_Limit_normalised'].mean())\n", + "print(dataset['Credit_Limit_normalised'].median())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After normalisation the absolute gap between the mean and the median is smaller, but only because every value was rescaled into the range 0 to 1. Min-max scaling is a linear transformation, so the shape of the distribution (and its skewness) is unchanged." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "NumPy's log is applied to the same variable so that the two methods can be compared. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": {}, + "outputs": [], + "source": [ + "# Data transformation using log \n", + "dataset['Credit_Limit_log']= np.log(dataset['Credit_Limit'])" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "7.271217139609844\n", + "10.449178263628198\n", + "8.60341207921584\n", + "8.422662707570003\n" + ] + } + ], + "source": [ + "# Check the min and max values of the new columnn\n", + "print(dataset['Credit_Limit_log'].min())\n", + "print(dataset['Credit_Limit_log'].max())\n", + "print(dataset['Credit_Limit_log'].mean())\n", + "print(dataset['Credit_Limit_log'].median())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We noticed that the difference between the mean and the median is too small after the log transformation." + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\alaah\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\seaborn\\_oldcore.py:1119: FutureWarning:\n", + "\n", + "use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.\n", + "\n", + "C:\\Users\\alaah\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\seaborn\\_oldcore.py:1119: FutureWarning:\n", + "\n", + "use_inf_as_na option is deprecated and will be removed in a future version. 
Convert inf values to NaN before operating instead.\n", + "\n", + "C:\\Users\\alaah\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\seaborn\\_oldcore.py:1119: FutureWarning:\n", + "\n", + "use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.\n", + "\n", + "C:\\Users\\alaah\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\seaborn\\_oldcore.py:1119: FutureWarning:\n", + "\n", + "use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.\n", + "\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig, axes = plt.subplots(2,2, figsize = (15,10))\n", + "fig.suptitle(\"Before and After Transformation\")\n", + "sn.histplot(dataset , x= \"Credit_Limit\", ax= axes[0,0])\n", + "sn.histplot(dataset , x= \"Credit_Limit_normalised\", ax = axes[0,1])\n", + "sn.histplot(dataset , x= \"Credit_Limit\",ax = axes[1,0])\n", + "sn.histplot(dataset , x= \"Credit_Limit_log\", ax = axes[1,1]);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The plots above show that the log transformation brings Credit_Limit much closer to a normal distribution, while min-max normalisation only rescales the axis and leaves the shape unchanged. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Other Distribution Plots " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Box and Whisker plots" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# The box plot helps to easily identify the median, quartiles and outliers\n", + "sn.boxplot(x = dataset[\"Gender\"] , y= dataset[\"Total_Trans_Ct\"]);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The above plot shows that the median transaction count is higher among females than among males. There are outliers in the total transaction count for both males and females." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Pyramid plot" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Gendercustomer_age_binCLIENTNUM
0F20s93
1F30s956
2F40s2410
3F50s1619
4F60s280
5M20s102
6M30s885
7M40s2151
8M50s1379
9M60s250
10M70s2
\n", + "
" + ], + "text/plain": [ + " Gender customer_age_bin CLIENTNUM\n", + "0 F 20s 93\n", + "1 F 30s 956\n", + "2 F 40s 2410\n", + "3 F 50s 1619\n", + "4 F 60s 280\n", + "5 M 20s 102\n", + "6 M 30s 885\n", + "7 M 40s 2151\n", + "8 M 50s 1379\n", + "9 M 60s 250\n", + "10 M 70s 2" + ] + }, + "execution_count": 104, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pyramid = dataset.groupby([\"Gender\", \"customer_age_bin\"])['CLIENTNUM'].nunique().reset_index()\n", + "pyramid" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "metadata": {}, + "outputs": [ + { + "ename": "ValueError", + "evalue": "Mime type rendering requires nbformat>=4.2.0 but it is not installed", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[106], line 31\u001b[0m\n\u001b[0;32m 6\u001b[0m layout \u001b[38;5;241m=\u001b[39m go\u001b[38;5;241m.\u001b[39mLayout(yaxis\u001b[38;5;241m=\u001b[39mgo\u001b[38;5;241m.\u001b[39mlayout\u001b[38;5;241m.\u001b[39mYAxis(title\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mAge\u001b[39m\u001b[38;5;124m'\u001b[39m),\n\u001b[0;32m 7\u001b[0m xaxis\u001b[38;5;241m=\u001b[39mgo\u001b[38;5;241m.\u001b[39mlayout\u001b[38;5;241m.\u001b[39mXAxis(\n\u001b[0;32m 8\u001b[0m \u001b[38;5;28mrange\u001b[39m\u001b[38;5;241m=\u001b[39m[\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m3000\u001b[39m, \u001b[38;5;241m3000\u001b[39m], \n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 12\u001b[0m barmode\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124moverlay\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[0;32m 13\u001b[0m bargap\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0.1\u001b[39m)\n\u001b[0;32m 15\u001b[0m data \u001b[38;5;241m=\u001b[39m [go\u001b[38;5;241m.\u001b[39mBar(y\u001b[38;5;241m=\u001b[39my,\n\u001b[0;32m 
16\u001b[0m x\u001b[38;5;241m=\u001b[39mmen_bins,\n\u001b[0;32m 17\u001b[0m orientation\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mh\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 28\u001b[0m marker\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mdict\u001b[39m(color\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mseagreen\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m 29\u001b[0m )]\n\u001b[1;32m---> 31\u001b[0m \u001b[43miplot\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mdict\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mdata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlayout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlayout\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\plotly\\offline\\offline.py:387\u001b[0m, in \u001b[0;36miplot\u001b[1;34m(figure_or_data, show_link, link_text, validate, image, filename, image_width, image_height, config, auto_play, animation_opts)\u001b[0m\n\u001b[0;32m 382\u001b[0m post_script \u001b[38;5;241m=\u001b[39m build_save_image_post_script(\n\u001b[0;32m 383\u001b[0m image, filename, image_height, image_width, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124miplot\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 384\u001b[0m )\n\u001b[0;32m 386\u001b[0m \u001b[38;5;66;03m# Show figure\u001b[39;00m\n\u001b[1;32m--> 387\u001b[0m \u001b[43mpio\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mshow\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 388\u001b[0m \u001b[43m \u001b[49m\u001b[43mfigure\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 389\u001b[0m \u001b[43m \u001b[49m\u001b[43mvalidate\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mvalidate\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 390\u001b[0m 
\u001b[43m \u001b[49m\u001b[43mconfig\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconfig\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 391\u001b[0m \u001b[43m \u001b[49m\u001b[43mauto_play\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mauto_play\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 392\u001b[0m \u001b[43m \u001b[49m\u001b[43mpost_script\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpost_script\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 393\u001b[0m \u001b[43m \u001b[49m\u001b[43manimation_opts\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43manimation_opts\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 394\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\plotly\\io\\_renderers.py:394\u001b[0m, in \u001b[0;36mshow\u001b[1;34m(fig, renderer, validate, **kwargs)\u001b[0m\n\u001b[0;32m 389\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m 390\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMime type rendering requires ipython but it is not installed\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 391\u001b[0m )\n\u001b[0;32m 393\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m nbformat \u001b[38;5;129;01mor\u001b[39;00m Version(nbformat\u001b[38;5;241m.\u001b[39m__version__) \u001b[38;5;241m<\u001b[39m Version(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m4.2.0\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m--> 394\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m 395\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMime type rendering requires nbformat>=4.2.0 but it is not installed\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 396\u001b[0m )\n\u001b[0;32m 398\u001b[0m 
ipython_display\u001b[38;5;241m.\u001b[39mdisplay(bundle, raw\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[0;32m 400\u001b[0m \u001b[38;5;66;03m# external renderers\u001b[39;00m\n", + "\u001b[1;31mValueError\u001b[0m: Mime type rendering requires nbformat>=4.2.0 but it is not installed" + ] + } + ], + "source": [ + "women_bins = np.array(-1 *pyramid[pyramid['Gender'] == 'F']['CLIENTNUM'])\n", + "men_bins = np.array(pyramid[pyramid['Gender'] == 'M']['CLIENTNUM'])\n", + "\n", + "y = list(range(20, 100, 10))\n", + "\n", + "layout = go.Layout(yaxis=go.layout.YAxis(title='Age'),\n", + " xaxis=go.layout.XAxis(\n", + " range=[-3000, 3000], \n", + " tickvals=[-2500, -2000, -1500, -1000, -500, 0, 500, 1000, 1500, 2000, 2500],\n", + " ticktext=[2500, 2000, 1500, 1000, 500, 0, 500, 1000, 1500, 2000, 2500],\n", + " title='Customers'),\n", + " barmode='overlay',\n", + " bargap=0.1)\n", + "\n", + "data = [go.Bar(y=y,\n", + " x=men_bins,\n", + " orientation='h',\n", + " name='Men',\n", + " hoverinfo='x',\n", + " marker=dict(color='powderblue')\n", + " ),\n", + " go.Bar(y=y,\n", + " x=women_bins,\n", + " orientation='h',\n", + " name='Women',\n", + " text=-1 * women_bins.astype('int'),\n", + " hoverinfo='text',\n", + " marker=dict(color='seagreen')\n", + " )]\n", + "\n", + "iplot(dict(data=data, layout=layout))" + ] } ], "metadata": {