# Basic packages: dataframes, numerics, plotting, stats, deep copies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as spy
%matplotlib inline
import copy
# Suppress warnings so notebook output stays readable
import warnings
warnings.filterwarnings('ignore')
# Pandas display settings - columns
# Display all columns of wide dataframes (no truncation)
pd.set_option("display.max_columns", None)
# Load dataset: the 'premium' sheet of the Excel workbook
data = pd.read_excel("InsurancePremiumDefault.xlsx",sheet_name='premium')
Preview dataset
# Preview the dataset: the first 5, last 5, and a random 10 rows.
divider = '--' * 55
print('First five rows', divider)
display(data.head())
print('Last five rows', divider)
display(data.tail())
print('Random ten rows', divider)
# Fix the RNG seed so the random sample is reproducible across runs
np.random.seed(1)
display(data.sample(n=10))
First five rows --------------------------------------------------------------------------------------------------------------
Output:
id | perc_premium_paid_by_cash_credit | age_in_days | Income | Count_3-6_months_late | Count_6-12_months_late | Count_more_than_12_months_late | Marital Status | Veh_Owned | No_of_dep | Accomodation | risk_score | no_of_premiums_paid | sourcing_channel | residence_area_type | premium | default | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0.317 | 11330 | 90050 | 0 | 0 | 0 | 0 | 3 | 3 | 1 | 98.810 | 8 | A | Rural | 5400 | 1 |
1 | 2 | 0.000 | 30309 | 156080 | 0 | 0 | 0 | 1 | 3 | 1 | 1 | 99.066 | 3 | A | Urban | 11700 | 1 |
2 | 3 | 0.015 | 16069 | 145020 | 1 | 0 | 0 | 0 | 1 | 1 | 1 | 99.170 | 14 | C | Urban | 18000 | 1 |
3 | 4 | 0.000 | 23733 | 187560 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 99.370 | 13 | A | Urban | 13800 | 1 |
4 | 5 | 0.888 | 19360 | 103050 | 7 | 3 | 4 | 0 | 2 | 1 | 0 | 98.800 | 15 | A | Urban | 7500 | 0 |
Last five rows --------------------------------------------------------------------------------------------------------------
Output:
id | perc_premium_paid_by_cash_credit | age_in_days | Income | Count_3-6_months_late | Count_6-12_months_late | Count_more_than_12_months_late | Marital Status | Veh_Owned | No_of_dep | Accomodation | risk_score | no_of_premiums_paid | sourcing_channel | residence_area_type | premium | default | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
79848 | 79849 | 0.249 | 25555 | 64420 | 0 | 0 | 0 | 1 | 2 | 4 | 0 | 99.08 | 10 | A | Urban | 5700 | 1 |
79849 | 79850 | 0.003 | 16797 | 660040 | 1 | 0 | 0 | 0 | 2 | 1 | 0 | 99.65 | 9 | B | Urban | 28500 | 1 |
79850 | 79851 | 0.012 | 24835 | 227760 | 0 | 0 | 0 | 0 | 2 | 3 | 0 | 99.66 | 11 | A | Rural | 11700 | 1 |
79851 | 79852 | 0.190 | 10959 | 153060 | 1 | 0 | 0 | 0 | 2 | 1 | 0 | 99.46 | 24 | A | Urban | 11700 | 1 |
79852 | 79853 | 0.000 | 19720 | 324030 | 0 | 0 | 0 | 0 | 3 | 3 | 0 | 99.80 | 7 | D | Rural | 3300 | 1 |
Random ten rows --------------------------------------------------------------------------------------------------------------
Output:
id | perc_premium_paid_by_cash_credit | age_in_days | Income | Count_3-6_months_late | Count_6-12_months_late | Count_more_than_12_months_late | Marital Status | Veh_Owned | No_of_dep | Accomodation | risk_score | no_of_premiums_paid | sourcing_channel | residence_area_type | premium | default | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
53658 | 53659 | 0.150 | 22643 | 171080 | 0 | 0 | 0 | 1 | 2 | 2 | 0 | 98.78 | 12 | A | Urban | 11700 | 1 |
25309 | 25310 | 0.201 | 10232 | 75090 | 0 | 0 | 0 | 1 | 3 | 4 | 0 | 99.61 | 7 | A | Rural | 1200 | 1 |
26163 | 26164 | 0.010 | 19719 | 144120 | 0 | 0 | 0 | 1 | 3 | 1 | 0 | 99.89 | 7 | A | Urban | 11700 | 1 |
55134 | 55135 | 0.000 | 23734 | 350070 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 99.51 | 9 | A | Urban | 1200 | 1 |
29706 | 29707 | 0.094 | 27023 | 69110 | 0 | 0 | 0 | 1 | 3 | 4 | 0 | 99.83 | 8 | A | Urban | 5400 | 1 |
11087 | 11088 | 0.123 | 28844 | 180030 | 0 | 0 | 0 | 0 | 2 | 1 | 1 | 99.47 | 8 | A | Rural | 9600 | 1 |
71267 | 71268 | 0.990 | 12785 | 55640 | 1 | 0 | 0 | 0 | 1 | 2 | 1 | 99.66 | 7 | A | Rural | 3300 | 1 |
4389 | 4390 | 0.051 | 22276 | 171080 | 0 | 0 | 0 | 0 | 2 | 2 | 0 | 99.69 | 11 | A | Rural | 7500 | 1 |
47666 | 47667 | 0.000 | 19717 | 219430 | 0 | 0 | 0 | 1 | 2 | 3 | 0 | 99.27 | 11 | D | Urban | 11700 | 1 |
31240 | 31241 | 0.000 | 15708 | 150110 | 0 | 0 | 0 | 0 | 3 | 2 | 0 | 99.71 | 4 | C | Urban | 11700 | 1 |
Initial observations
id
is row identifier, which does not add any value. This variable can be removed later.perc_premium_paid_by_cash_credit
is a continuous,ratio numerical variable.age_in_days
,Income
,Veh_Owned
,No_of_dep
,no_of_premiums_paid
&premium
are discrete,interval numerical variables.Income
is a discrete,interval numerical variable.Count_3-6_months_late
,Count_6-12_months_late
&Count_more_than_12_months_late
are interval and discrete numerical variables.Marital Status
is binary (0 - Unmarried, 1 - Married)Accomodation
is binary (0 - Rented, 1 - Owned)risk_score
is a continuous, numerical variable.sourcing_channel
&residence_area_type
are categorical nominal variables.default
is the Target variable. It is a binary (0 - defaulter, 1 - non_defaulter) variable.
# Show every variable (column) name present in the dataset
variable_list = list(data.columns)
print(variable_list)
['id', 'perc_premium_paid_by_cash_credit', 'age_in_days', 'Income', 'Count_3-6_months_late', 'Count_6-12_months_late', 'Count_more_than_12_months_late', 'Marital Status', 'Veh_Owned', 'No_of_dep', 'Accomodation', 'risk_score', 'no_of_premiums_paid', 'sourcing_channel', 'residence_area_type', 'premium', 'default']
Let's rename the variables for ease of programming
# Mapping of original column names to cleaner, consistently-cased names
renamed_columns = {
    'id': 'ID',
    'perc_premium_paid_by_cash_credit': 'Perc_premium_paid_in_cash',
    'age_in_days': 'Age_in_days',
    'Count_3-6_months_late': 'Late_premium_payment_3-6_months',
    'Count_6-12_months_late': 'Late_premium_payment_6-12_months',
    'Count_more_than_12_months_late': 'Late_premium_payment_>12_months',
    'Marital Status': 'Marital_Status',
    'Veh_Owned': 'Vehicles_Owned',
    'No_of_dep': 'No_of_dependents',
    'risk_score': 'Risk_score',
    'no_of_premiums_paid': 'No_of_premiums_paid',
    'sourcing_channel': 'Sourcing_channel',
    'residence_area_type': 'Customer_demographic',
    'premium': 'Premium_payment',
    'default': 'Default',
}
# Apply the new column names
data = data.rename(columns=renamed_columns)
# Confirm the rename took effect
variable_list = list(data.columns)
print(variable_list)
['ID', 'Perc_premium_paid_in_cash', 'Age_in_days', 'Income', 'Late_premium_payment_3-6_months', 'Late_premium_payment_6-12_months', 'Late_premium_payment_>12_months', 'Marital_Status', 'Vehicles_Owned', 'No_of_dependents', 'Accomodation', 'Risk_score', 'No_of_premiums_paid', 'Sourcing_channel', 'Customer_demographic', 'Premium_payment', 'Default']
# Report the dataframe dimensions (rows x columns)
shape = data.shape
n_rows, n_cols = shape
print(f"The Dataframe consists of '{n_rows}' rows and '{n_cols}' columns")
The Dataframe consists of '79853' rows and '17' columns
Data types
# Check the data types of every column
data.dtypes
ID int64
Perc_premium_paid_in_cash float64
Age_in_days int64
Income int64
Late_premium_payment_3-6_months int64
Late_premium_payment_6-12_months int64
Late_premium_payment_>12_months int64
Marital_Status int64
Vehicles_Owned int64
No_of_dependents int64
Accomodation int64
Risk_score float64
No_of_premiums_paid int64
Sourcing_channel object
Customer_demographic object
Premium_payment int64
Default int64
dtype: object
Data info
# Get info of the dataframe columns: dtypes, non-null counts, memory usage
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79853 entries, 0 to 79852
Data columns (total 17 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ID 79853 non-null int64
1 Perc_premium_paid_in_cash 79853 non-null float64
2 Age_in_days 79853 non-null int64
3 Income 79853 non-null int64
4 Late_premium_payment_3-6_months 79853 non-null int64
5 Late_premium_payment_6-12_months 79853 non-null int64
6 Late_premium_payment_>12_months 79853 non-null int64
7 Marital_Status 79853 non-null int64
8 Vehicles_Owned 79853 non-null int64
9 No_of_dependents 79853 non-null int64
10 Accomodation 79853 non-null int64
11 Risk_score 79853 non-null float64
12 No_of_premiums_paid 79853 non-null int64
13 Sourcing_channel 79853 non-null object
14 Customer_demographic 79853 non-null object
15 Premium_payment 79853 non-null int64
16 Default 79853 non-null int64
dtypes: float64(2), int64(13), object(2)
memory usage: 10.4+ MB
- Two (2) variables have been identified as the Pandas `object` type. These shall be converted to the `category` type.
Convert Pandas Objects to Category type
# Convert every "object"-typed variable to the "category" type,
# which stores repeated string values far more compactly.
for col in data.select_dtypes(include="object").columns:
    data[col] = data[col].astype("category")
# Confirm that no variables with "object" type remain
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79853 entries, 0 to 79852
Data columns (total 17 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ID 79853 non-null int64
1 Perc_premium_paid_in_cash 79853 non-null float64
2 Age_in_days 79853 non-null int64
3 Income 79853 non-null int64
4 Late_premium_payment_3-6_months 79853 non-null int64
5 Late_premium_payment_6-12_months 79853 non-null int64
6 Late_premium_payment_>12_months 79853 non-null int64
7 Marital_Status 79853 non-null int64
8 Vehicles_Owned 79853 non-null int64
9 No_of_dependents 79853 non-null int64
10 Accomodation 79853 non-null int64
11 Risk_score 79853 non-null float64
12 No_of_premiums_paid 79853 non-null int64
13 Sourcing_channel 79853 non-null category
14 Customer_demographic 79853 non-null category
15 Premium_payment 79853 non-null int64
16 Default 79853 non-null int64
dtypes: category(2), float64(2), int64(13)
memory usage: 9.3 MB
The memory usage has decreased from 10.4+ MB to 9.3 MB
Missing value summary function
def missing_val_chk(data):
    """
    Check the dataframe for missing values and print a summary.

    If any column contains nulls, display a dataframe holding the count
    and percentage of missing values per affected column, sorted in
    descending order; otherwise print a confirmation that none exist.
    """
    # Truthiness check directly on the boolean — no `== True` comparison
    if data.isnull().values.any():
        # Count of missing values per column, largest first; name the
        # column explicitly instead of renaming positional column 0
        missing_vals = (data.isnull().sum()
                        .sort_values(ascending=False)
                        .to_frame(name='# missing'))
        # Percentage of rows missing, rounded to 3 decimal places
        missing_vals['percent'] = ((missing_vals['# missing'] / len(data)) *
                                   100).round(decimals=3)
        # Keep only the columns that actually have missing values
        # (the original's trailing .dropna() was a no-op here)
        missing_vals = missing_vals[missing_vals['# missing'] != 0]
        # display missing value dataframe
        print("The missing values summary")
        display(missing_vals)
    else:
        print("There are NO missing values in the dataset")
# Applying the missing value summary function to the full dataset
missing_val_chk(data)
There are NO missing values in the dataset
Before we check the 5 Point numerical summary, let's verify that ID
is row identifier.
If this is the case then the number of unique values will equal the number of rows.
# Verify ID is a unique row identifier: the number of unique values
# must equal the number of rows.
if data.ID.nunique() == data.shape[0]:
    print("ID is a row identifier, we shall drop ID variable")
else:
    # Fixed: the original else branch printed the same affirmative
    # message as the if branch, so a non-unique ID went unreported.
    print("ID is NOT a row identifier")
ID is a row identifier, we shall drop ID variable
Dropping ID variable
data.drop(columns="ID", axis=1, inplace=True)
Numerical type Summary
# Five point summary (plus count/mean/std) of all numerical type variables
data.describe().T
Output:
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
Perc_premium_paid_in_cash | 79853.0 | 0.314288 | 0.334915 | 0.0 | 0.034 | 0.167 | 0.538 | 1.00 |
Age_in_days | 79853.0 | 18846.696906 | 5208.719136 | 7670.0 | 14974.000 | 18625.000 | 22636.000 | 37602.00 |
Income | 79853.0 | 208847.171177 | 496582.597257 | 24030.0 | 108010.000 | 166560.000 | 252090.000 | 90262600.00 |
Late_premium_payment_3-6_months | 79853.0 | 0.248369 | 0.691102 | 0.0 | 0.000 | 0.000 | 0.000 | 13.00 |
Late_premium_payment_6-12_months | 79853.0 | 0.078093 | 0.436251 | 0.0 | 0.000 | 0.000 | 0.000 | 17.00 |
Late_premium_payment_>12_months | 79853.0 | 0.059935 | 0.311840 | 0.0 | 0.000 | 0.000 | 0.000 | 11.00 |
Marital_Status | 79853.0 | 0.498679 | 0.500001 | 0.0 | 0.000 | 0.000 | 1.000 | 1.00 |
Vehicles_Owned | 79853.0 | 1.998009 | 0.817248 | 1.0 | 1.000 | 2.000 | 3.000 | 3.00 |
No_of_dependents | 79853.0 | 2.503012 | 1.115901 | 1.0 | 2.000 | 3.000 | 3.000 | 4.00 |
Accomodation | 79853.0 | 0.501296 | 0.500001 | 0.0 | 0.000 | 1.000 | 1.000 | 1.00 |
Risk_score | 79853.0 | 99.067243 | 0.725892 | 91.9 | 98.830 | 99.180 | 99.520 | 99.89 |
No_of_premiums_paid | 79853.0 | 10.863887 | 5.170687 | 2.0 | 7.000 | 10.000 | 14.000 | 60.00 |
Premium_payment | 79853.0 | 10924.507533 | 9401.676542 | 1200.0 | 5400.000 | 7500.000 | 13800.000 | 60000.00 |
Default | 79853.0 | 0.937410 | 0.242226 | 0.0 | 1.000 | 1.000 | 1.000 | 1.00 |
-
Perc_premium_paid_in_cash
is highly right skewed as the mean is almost twice the median. Also the standard deviation is greater than the mean implying a wide spread. -
Age_in_days
is fairly symmetrical with mean and median being very close but there is some right skew as the difference between Q3 & Q4 is larger than other quartiles. -
Income
highly right skewed as the mean is greater than the median. Also the standard deviation is more than twice the mean implying a wide spread. -
Late_premium_payment_3-6_months
,Late_premium_payment_6-12_months
&Late_premium_payment_>12_months
are categorical ordinal variables. -
Marital_Status
is a binary variable with approximately 50% of the rows having a value of 1 (50% of the customers are married). -
Vehicles_Owned
&No_of_dependents
are categorical ordinal variables. -
Accomodation
is a binary variable with approximately 50% of the rows having a value of 1 (50% of the customers owned their homes). -
Risk_score
is fairly symmetrical with mean and median being very close. -
No_of_premiums_paid
is fairly symmetrical with mean and median being very close but there is some right skew as the difference between Q3 & Q4 is larger than other quartiles. -
Premium_payment
is highly right skewed as the mean is greater than the median. Also the standard deviation is close to the mean implying a wide spread. -
Default
is a binary variable with approximately 94% of the rows having a value of 1 (94% of the customers are non-defaulters). This target variable is heavily imbalanced as only 6% of the rows are defaulters.
Categorical type Summary
data.describe(include=['category']).T
Output:
count | unique | top | freq | |
---|---|---|---|---|
Sourcing_channel | 79853 | 5 | A | 43134 |
Customer_demographic | 79853 | 2 | Urban | 48183 |
Sourcing_channel
there are 5 different states in which "A" is the most frequent.Customer_demographic
there are 2 different states in which "Urban" is the most frequent.
This dataset will be skewed to policy holders sourced from Channel A and Urban residences
Number of unique states for all variables
# Check the number of unique values (distinct states) per variable
data.nunique().to_frame()
Output:
0 | |
---|---|
Perc_premium_paid_in_cash | 1001 |
Age_in_days | 833 |
Income | 24165 |
Late_premium_payment_3-6_months | 14 |
Late_premium_payment_6-12_months | 17 |
Late_premium_payment_>12_months | 10 |
Marital_Status | 2 |
Vehicles_Owned | 3 |
No_of_dependents | 4 |
Accomodation | 2 |
Risk_score | 673 |
No_of_premiums_paid | 57 |
Sourcing_channel | 5 |
Customer_demographic | 2 |
Premium_payment | 30 |
Default | 2 |
Categorical Variable Identification
Although the following variables are numerical in nature, they represent categorical variables:
Late_premium_payment_3-6_months
Late_premium_payment_6-12_months
Late_premium_payment_>12_months
Vehicles_Owned
No_of_dependents
Create a list of numerical variables
# Variables treated as genuinely numerical for distribution analysis
numerical_vars = [
    'Perc_premium_paid_in_cash',
    'Age_in_days',
    'Income',
    'Risk_score',
    'No_of_premiums_paid',
    'Premium_payment',
]
Create a list of categorical variables
# Variables treated as categorical (including numeric-coded ordinals)
categorical_vars = [
    'Late_premium_payment_3-6_months',
    'Late_premium_payment_6-12_months',
    'Late_premium_payment_>12_months',
    'Marital_Status',
    'Vehicles_Owned',
    'No_of_dependents',
    'Accomodation',
    'Sourcing_channel',
    'Customer_demographic',
    'Default',
]
data[numerical_vars].describe().T
Output:
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
Perc_premium_paid_in_cash | 79853.0 | 0.314288 | 0.334915 | 0.0 | 0.034 | 0.167 | 0.538 | 1.00 |
Age_in_days | 79853.0 | 18846.696906 | 5208.719136 | 7670.0 | 14974.000 | 18625.000 | 22636.000 | 37602.00 |
Income | 79853.0 | 208847.171177 | 496582.597257 | 24030.0 | 108010.000 | 166560.000 | 252090.000 | 90262600.00 |
Risk_score | 79853.0 | 99.067243 | 0.725892 | 91.9 | 98.830 | 99.180 | 99.520 | 99.89 |
No_of_premiums_paid | 79853.0 | 10.863887 | 5.170687 | 2.0 | 7.000 | 10.000 | 14.000 | 60.00 |
Premium_payment | 79853.0 | 10924.507533 | 9401.676542 | 1200.0 | 5400.000 | 7500.000 | 13800.000 | 60000.00 |
# Display the skew summary for the numerical variables.
# Thresholds: |skew| > 1  -> highly skewed,
#             |skew| > 0.5 -> moderately skewed,
#             otherwise    -> fairly symmetrical.
# Checking from largest magnitude downward fixes the original chained
# `(var_skew > 0.5) & (var_skew < 1)` test, which silently classified a
# skew of exactly 1.0 (or -1.0) as "fairly symmetrical".
for var in data[numerical_vars].columns:
    var_skew = data[var].skew()
    if var_skew > 1:
        print(f"The '{var}' distribution is highly right skewed.\n")
    elif var_skew < -1:
        print(f"The '{var}' distribution is highly left skewed.\n")
    elif var_skew > 0.5:
        print(f"The '{var}' distribution is moderately right skewed.\n")
    elif var_skew < -0.5:
        print(f"The '{var}' distribution is moderately left skewed.\n")
    else:
        print(f"The '{var}' distribution is fairly symmetrical.\n")
The 'Perc_premium_paid_in_cash' distribution is moderately right skewed.
The 'Age_in_days' distribution is fairly symmetrical.
The 'Income' distribution is highly right skewed.
The 'Risk_score' distribution is highly left skewed.
The 'No_of_premiums_paid' distribution is highly right skewed.
The 'Premium_payment' distribution is highly right skewed.
Outlier check function
# Outlier check
def outlier_count(data):
    """
    Report lower and upper outlier counts for every numerical variable.

    A data point counts as an outlier when it lies either:
    - Greater than `1.5*IQR` above the 75th percentile
    - Less than `1.5*IQR` below the 25th percentile
    Variables with no outliers at all are omitted from the report.
    """
    for col in data.select_dtypes(include=np.number).columns.to_list():
        series = data[col]
        # Inter-quartile range for the column, ignoring NaNs
        iqr_val = spy.stats.iqr(series, nan_policy="omit")
        # Boxplot fences: 1.5*IQR beyond the quartiles
        upper_fence = series.quantile(0.75) + 1.5 * iqr_val
        lower_fence = series.quantile(0.25) - 1.5 * iqr_val
        # Count the points lying beyond each fence
        n_upper = series[series > upper_fence].count()
        n_lower = series[series < lower_fence].count()
        # Skip variables that have no outliers on either side
        if n_upper == 0 and n_lower == 0:
            continue
        print(
            f"The '{series.name}' distribution has '{n_lower}' lower outliers and '{n_upper}' upper outliers.\n"
        )
# Applying the Outlier check function to the sub-dataframe of numerical variables
outlier_count(data[numerical_vars])
The 'Age_in_days' distribution has '0' lower outliers and '44' upper outliers.
The 'Income' distribution has '0' lower outliers and '3428' upper outliers.
The 'Risk_score' distribution has '3784' lower outliers and '0' upper outliers.
The 'No_of_premiums_paid' distribution has '0' lower outliers and '1426' upper outliers.
The 'Premium_payment' distribution has '0' lower outliers and '4523' upper outliers.
Variable | Skew | Outliers |
---|---|---|
Perc_premium_paid_in_cash | Moderately right skewed | No Outliers |
Age_in_days | Fairly symmetrical | 44 Upper Outliers |
Income | Highly right skewed | 3428 Upper Outliers |
Risk_score | Highly left skewed | 3784 Lower Outliers |
No_of_premiums_paid | Highly right skewed | 1426 Upper Outliers |
Premium_payment | Highly right skewed | 4523 Upper Outliers |
Detailed investigation of unique values
# Display the full value-count breakdown for each categorical variable
for col in categorical_vars:
    print('Unique values in', col, 'are :')
    print(data[col].value_counts())
    print('--' * 55)
Unique values in Late_premium_payment_3-6_months are :
Late_premium_payment_3-6_months
0 66898
1 8826
2 2519
3 954
4 374
5 168
6 68
7 23
8 15
9 4
13 1
12 1
10 1
11 1
Name: count, dtype: int64
--------------------------------------------------------------------------------------------------------------
Unique values in Late_premium_payment_6-12_months are :
Late_premium_payment_6-12_months
0 75928
1 2680
2 693
3 317
4 130
5 46
6 26
7 11
8 5
9 4
10 4
11 2
14 2
13 2
15 1
17 1
12 1
Name: count, dtype: int64
--------------------------------------------------------------------------------------------------------------
Unique values in Late_premium_payment_>12_months are :
Late_premium_payment_>12_months
0 76135
1 2996
2 498
3 151
4 48
5 13
6 6
7 3
8 2
11 1
Name: count, dtype: int64
--------------------------------------------------------------------------------------------------------------
Unique values in Marital_Status are :
Marital_Status
0 40032
1 39821
Name: count, dtype: int64
--------------------------------------------------------------------------------------------------------------
Unique values in Vehicles_Owned are :
Vehicles_Owned
1 26746
3 26587
2 26520
Name: count, dtype: int64
--------------------------------------------------------------------------------------------------------------
Unique values in No_of_dependents are :
No_of_dependents
3 20215
2 19902
4 19896
1 19840
Name: count, dtype: int64
--------------------------------------------------------------------------------------------------------------
Unique values in Accomodation are :
Accomodation
1 40030
0 39823
Name: count, dtype: int64
--------------------------------------------------------------------------------------------------------------
Unique values in Sourcing_channel are :
Sourcing_channel
A 43134
B 16512
C 12039
D 7559
E 609
Name: count, dtype: int64
--------------------------------------------------------------------------------------------------------------
Unique values in Customer_demographic are :
Customer_demographic
Urban 48183
Rural 31670
Name: count, dtype: int64
--------------------------------------------------------------------------------------------------------------
Unique values in Default are :
Default
1 74855
0 4998
Name: count, dtype: int64
--------------------------------------------------------------------------------------------------------------
Late_premium_payment_6-12_months
- there is discontinuity in the unique values as state 16 is missing in the range.Late_premium_payment_>12_months
- there is discontinuity in the unique values as states 9 & 10 are missing in the range.
There are categorical variables in the numeric format.
Variable | Type | Range |
---|---|---|
Late_premium_payment_3-6_months | Ordinal | 14 states |
Late_premium_payment_6-12_months | Ordinal | 17 states |
Late_premium_payment_>12_months | Ordinal | 10 states |
Marital_Status | Nominal | 2 states |
Vehicles_Owned | Ordinal | 3 states |
No_of_dependents | Ordinal | 4 states |
Accomodation | Nominal | 2 states |
Sourcing_channel | Nominal | 5 states |
Customer_demographic | Nominal | 2 states |
Default | Nominal | 2 states |
Target variable is Default
# Checking the distribution of the target variable "Default":
# absolute counts side-by-side with percentages.
# (.T on a Series is a no-op, so the transposes were dropped.)
count = data["Default"].value_counts()
percentage = data['Default'].value_counts(normalize=True) * 100
# Combine the two series into one summary dataframe
target_dist = pd.concat([count, percentage], axis=1)
target_dist.columns = ['count', 'percentage']
target_dist.index.name = "Default"
# Display target distribution dataframe
target_dist
Output:
count | percentage | |
---|---|---|
Default | ||
1 | 74855 | 93.740999 |
0 | 4998 | 6.259001 |
Out of the 79853 policy holders, only 6.26% defaulted
The Target variable is Heavily Imbalanced