Skip to content

Latest commit

 

History

History
2288 lines (1347 loc) · 46.3 KB

File metadata and controls

2288 lines (1347 loc) · 46.3 KB
Importing Libraries & Data ingestion
# Import all the necessary libraries
# Basic packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as spy
%matplotlib inline
import copy
import math
# Suppress warnings
import warnings
warnings.filterwarnings('ignore')
# Pandas display settings - columns

# Display all columns
pd.set_option("display.max_columns", None)

Data ingestion

# Read the "premium" worksheet of the Excel workbook into a dataframe
data = pd.read_excel(
    "InsurancePremiumDefault.xlsx",
    sheet_name='premium',
)

Let's rename the variables for ease of programming

# Mapping of raw column name -> name used throughout this notebook
renamed_columns = {
    # Identifier and customer attributes
    "id": "ID",
    "age_in_days": "Age_in_days",
    "Marital Status": "Marital_Status",
    "Veh_Owned": "Vehicles_Owned",
    "No_of_dep": "No_of_dependents",
    "residence_area_type": "Customer_demographic",
    # Payment behaviour
    "perc_premium_paid_by_cash_credit": "Perc_premium_paid_in_cash",
    "Count_3-6_months_late": "Late_premium_payment_3-6_months",
    "Count_6-12_months_late": "Late_premium_payment_6-12_months",
    "Count_more_than_12_months_late": "Late_premium_payment_>12_months",
    "no_of_premiums_paid": "No_of_premiums_paid",
    "premium": "Premium_payment",
    # Policy attributes and target
    "risk_score": "Risk_score",
    "sourcing_channel": "Sourcing_channel",
    "default": "Default",
}

# Apply the mapping to the dataframe's column labels
data = data.rename(columns=renamed_columns)

Convert Pandas Objects to Category type

# Collect the columns stored as "object", then recast each one as "category"
object_columns = [
    column for column in data.columns if data[column].dtypes == "object"
]
for column in object_columns:
    data[column] = data[column].astype("category")

# Confirm that no variables of "object" type remain
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79853 entries, 0 to 79852
Data columns (total 17 columns):
 #   Column                            Non-Null Count  Dtype   
---  ------                            --------------  -----   
 0   ID                                79853 non-null  int64   
 1   Perc_premium_paid_in_cash         79853 non-null  float64 
 2   Age_in_days                       79853 non-null  int64   
 3   Income                            79853 non-null  int64   
 4   Late_premium_payment_3-6_months   79853 non-null  int64   
 5   Late_premium_payment_6-12_months  79853 non-null  int64   
 6   Late_premium_payment_>12_months   79853 non-null  int64   
 7   Marital_Status                    79853 non-null  int64   
 8   Vehicles_Owned                    79853 non-null  int64   
 9   No_of_dependents                  79853 non-null  int64   
 10  Accomodation                      79853 non-null  int64   
 11  Risk_score                        79853 non-null  float64 
 12  No_of_premiums_paid               79853 non-null  int64   
 13  Sourcing_channel                  79853 non-null  category
 14  Customer_demographic              79853 non-null  category
 15  Premium_payment                   79853 non-null  int64   
 16  Default                           79853 non-null  int64   
dtypes: category(2), float64(2), int64(13)
memory usage: 9.3 MB

Dropping ID variable

# `columns=` already identifies the axis, so no separate `axis` argument is needed
data.drop(columns="ID", inplace=True)

Create a list of numerical variables

# Columns treated as continuous measurements in the analysis below
numerical_vars = [
    'Perc_premium_paid_in_cash',
    'Age_in_days',
    'Income',
    'Risk_score',
    'No_of_premiums_paid',
    'Premium_payment',
]

Create a list of categorical variables

# Columns treated as categories in the analysis below (includes the target)
categorical_vars = [
    'Late_premium_payment_3-6_months',
    'Late_premium_payment_6-12_months',
    'Late_premium_payment_>12_months',
    'Marital_Status',
    'Vehicles_Owned',
    'No_of_dependents',
    'Accomodation',
    'Sourcing_channel',
    'Customer_demographic',
    'Default',
]

Exploratory Data Analysis

Univariate Analysis

Numerical Variables

Histogram Overview

Let's get an overview of the distributions of the numerical variables.

def histogram_overview(data):
    """
    Histogram Overview function

    Draws one figure containing a `histogram` for every numerical
    column of the input, so the distributions can be compared at a glance.

    * The function takes a Pandas dataframe as the input.
    * Only the numerical columns of the dataframe are plotted; if there are
      none, the function returns without drawing anything.
    * Each histogram gets a vertical `line` for the `mean`, the `median` and
      the (first) `mode`.
    * `subplot_ncols` - `ceil(sqrt(n))` columns for `n` numerical variables
    * `subplot_nrows` - just enough rows to hold all `n` plots
      (same grid rule as `boxplot_overview`)
    * `figsize` - indicates the size of the plot
    * `sns.set_style` - sets the Seaborn theme while plotting
    * `sns.reset_defaults()` - resets Seaborn themes and settings to default

    """
    num_vars = data.select_dtypes(include=np.number).columns.to_list()
    if not num_vars:
        # Nothing to plot; also avoids a division by zero in the grid maths
        return

    # Grid shape depends only on the number of variables, so compute it once
    subplot_ncols = math.ceil(math.sqrt(len(num_vars)))
    subplot_nrows = math.ceil(len(num_vars) / subplot_ncols)

    plt.figure(figsize=(10, 10))
    # Set seaborn theme
    sns.set_style("darkgrid")

    for position, column in enumerate(num_vars, start=1):
        values = data[column]
        plt.subplot(subplot_nrows, subplot_ncols, position)
        plt.hist(values)
        # Plot vertical line for the mean
        plt.axvline(values.mean(), color='green', linestyle='--', label="mean")
        # Plot vertical line for the median
        plt.axvline(values.median(), color='red', linestyle='-', label="median")
        # Plot vertical line for the mode (first one when there are ties)
        plt.axvline(values.mode()[0], color='black', linestyle='-', label="mode")
        plt.legend()
        plt.title(column, fontsize=16)

    # Lay out once, after every title exists, so no title is left out
    plt.tight_layout()
    plt.show()
    # Reset seaborn theme
    sns.reset_defaults()
# Histogram grid for the columns listed in `numerical_vars`
histogram_overview(data[numerical_vars])

png

Observation:

  • Age_in_days - Somewhat normal as the mean and median are very close.

  • All other variables are skewed.

Boxplot overview

Let's get a boxplot overview across each numerical variable

def boxplot_overview(data):
    """
    Draws one figure containing a `box plot` for every numerical column
    of the input, showing each distribution together with its outliers.

    * The function takes a Pandas dataframe as the input.
    * Only the numerical columns of the dataframe are plotted; if there are
      none, the function returns without drawing anything.
    * It shows the `mean` in each boxplot (`showmeans=True`).
    * `subplot_ncols` - `ceil(sqrt(n))` columns for `n` numerical variables
    * `subplot_nrows` - just enough rows to hold all `n` plots
    * `figsize` - indicates the size of the plot
    * `sns.set_style` - sets the Seaborn theme while plotting
    * `sns.reset_defaults()` - resets Seaborn themes and settings to default
    """
    num_vars = data.select_dtypes(include=np.number).columns.to_list()
    if not num_vars:
        # Nothing to plot; also avoids a division by zero in the grid maths
        return

    # Grid shape depends only on the number of variables, so compute it once
    subplot_ncols = math.ceil(math.sqrt(len(num_vars)))
    subplot_nrows = math.ceil(len(num_vars) / subplot_ncols)

    plt.figure(figsize=(10, 10))
    # Set seaborn theme
    sns.set_style("darkgrid")

    for position, column in enumerate(num_vars, start=1):
        plt.subplot(subplot_nrows, subplot_ncols, position)
        sns.boxplot(y=data[column], width=0.3, showmeans=True)
        plt.title(column, fontsize=16)

    # Lay out once, after every title exists, so no title is left out
    plt.tight_layout()
    plt.show()
    # Reset seaborn theme
    sns.reset_defaults()
# Box plot grid for the columns listed in `numerical_vars`
boxplot_overview(data[numerical_vars])

png

Observation:

  • Perc_premium_paid_in_cash has no outliers.

  • Age_in_days , Income, No_of_premiums_paid & Premium_payment - have upper outliers.

  • Risk_score has lower outliers

Histogram Distribution

Let's generate a histogram for each numerical variable and visually inspect its distribution.

def hist_box(data):
    """
    This function generates a `box plot` stacked above a `histogram`
    showing the `distribution of the numerical variable input`, then
    prints how many `outliers` lie below / above the 1.5 * IQR fences.

    * The function takes a Pandas series as the input; its `name` is used
      in the figure title and in the printed message.
    * It creates a two-row `subplot` with the `box plot` on top and the
      `histogram` (15 bins) below.
    * It draws a vertical `line` for the `mean`, `median` and (first) `mode`
      on the histogram.
    * The Inter Quartile Range is taken from the same Pandas quantiles that
      define the fences, so missing values are ignored consistently.
    * `sns.set_style` - sets the Seaborn theme (it stays active afterwards;
      this function does not reset it)
    * `nrows` - sets the shape of the subplot
    * `gridspec_kw` - configures the ratio of the size of the plots

    """
    # Get name of series
    name = data.name
    sns.set_style("darkgrid")
    _, (box_ax, hist_ax) = plt.subplots(nrows=2,
                                        gridspec_kw={"height_ratios": (1, 3)})

    # Pass the series as `x` explicitly so both panels are horizontal
    # regardless of how the installed seaborn treats positional arguments
    sns.boxplot(x=data, showmeans=True, color='m', ax=box_ax)
    # `histplot` is seaborn's replacement for the deprecated `distplot`
    sns.histplot(x=data, bins=15, color='deepskyblue', ax=hist_ax)

    hist_ax.axvline(data.mean(), color='green', linestyle='--', label="mean")
    hist_ax.axvline(data.median(), color='red', linestyle='-', label="median")
    hist_ax.axvline(data.mode()[0], color='black', linestyle='-', label="mode")
    # Build the legend from the labels set on the three lines above
    hist_ax.legend(fontsize=12)

    plt.suptitle("Distribution of {}".format(name), fontsize=22)
    plt.tight_layout()
    plt.show()

    # Outlier check
    first_quartile = data.quantile(0.25)
    third_quartile = data.quantile(0.75)
    IQR = third_quartile - first_quartile
    upper_fence = third_quartile + 1.5 * IQR
    lower_fence = first_quartile - 1.5 * IQR
    upper_outliers = int((data > upper_fence).sum())
    lower_outliers = int((data < lower_fence).sum())
    print(
        f"The '{name}' distribution has '{lower_outliers}' lower outliers and '{upper_outliers}' upper outliers."
    )

    # Line separator
    print('--' * 55)

Plot the distribution of all numerical variables

# hist_box() creates and shows its own figure, so no extra figure is opened
# here (doing so only produced an empty "Figure ... with 0 Axes" output).
for each_var in data[numerical_vars].columns:
    hist_box(data=data[each_var])

png

The 'Perc_premium_paid_in_cash' distribution has '0' lower outliers and '0' upper outliers.
--------------------------------------------------------------------------------------------------------------



<Figure size 640x480 with 0 Axes>

png

The 'Age_in_days' distribution has '0' lower outliers and '44' upper outliers.
--------------------------------------------------------------------------------------------------------------



<Figure size 640x480 with 0 Axes>

png

The 'Income' distribution has '0' lower outliers and '3428' upper outliers.
--------------------------------------------------------------------------------------------------------------



<Figure size 640x480 with 0 Axes>

png

The 'Risk_score' distribution has '3784' lower outliers and '0' upper outliers.
--------------------------------------------------------------------------------------------------------------



<Figure size 640x480 with 0 Axes>

png

The 'No_of_premiums_paid' distribution has '0' lower outliers and '1426' upper outliers.
--------------------------------------------------------------------------------------------------------------



<Figure size 640x480 with 0 Axes>

png

The 'Premium_payment' distribution has '0' lower outliers and '4523' upper outliers.
--------------------------------------------------------------------------------------------------------------



<Figure size 640x480 with 0 Axes>

Observation:

  • Perc_premium_paid_in_cash

    • The bulk of the policy holders paid between 5%-55% of their policy in cash.
  • Age_in_days

    • The bulk of the policy holders are between 15,000-22,500 days old (41-62 years old), i.e., middle aged to senior adults.
  • Income

    • Income is greatly skewed in the dataset with some policy holders making over 10,000,000.
  • Risk_score

    • The policy holders' risk scores typically range from 98-100% with outliers as far as 92%
  • No_of_premiums_paid

    • The number of premiums paid by policy holders typically range from 0-24 with the bulk being within 7-14.
  • Premium_payment

    • The bulk of the insurance premiums range from 5,000 to 15,000.
    • There are some policies in excess of 30,000

Categorical Variables

We shall use bar chart to represent the categorical variables.

def bar_chart(data):
    """
    This function generates a horizontal `bar chart` showing
    the `distribution of the categorical variable input`.

    * The function takes a Pandas series as the input; its `name` is used
      in the chart title.
    * It `computes the frequency of each unique element` (via
      `sns.countplot`) and displays one horizontal bar per element, in the
      order seaborn chooses (the bars are not re-sorted by frequency).
    * The `percentage of each bar` relative to the length of the series is
      calculated and written just past the right end of the bar.
    * `sns.despine()` - removes the upper and right border of the chart
    * The chart is drawn on the current figure, shown, and followed by a
      printed separator line.

    """
    # Create a horizontal count plot
    g = sns.countplot(y=data)
    # Remove the top and right spines from plot
    sns.despine()
    # Title is set once, independent of how many bars were drawn
    g.set_title("Distribution of {}".format(data.name),
                loc="center",
                fontsize=22)

    # length of the column
    col_length = len(data)
    for p in g.patches:
        bar_length = p.get_width()
        # percentage of each class of the category
        percentage = '{:.1f}%'.format(100 * bar_length / col_length)
        # right end of the bar and its vertical centre, in data coordinates
        x = p.get_x() + bar_length
        y = p.get_y() + p.get_height() / 2
        # annotate the percentage a few points to the right of the bar;
        # a point offset stays visible whatever the scale of the count axis
        g.annotate(percentage, (x, y),
                   xytext=(3, 0),
                   textcoords="offset points",
                   va="center",
                   size=12)
    plt.show()
    # Line separator
    print('--' * 55)

Plot the distribution of all categorical variables

# One fresh figure per categorical variable; bar_chart draws on it and shows it
for each_var in data[categorical_vars]:
    plt.figure()
    bar_chart(data=data[each_var])

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------

Observations:

  • Policy holders typically pay their premiums on time, as the majority of them were not late on payments.
    This is indicative of the policy holders having an active policy to exercise claims.
  • The policy holders in this dataset are more or less evenly balanced across Marital Status, as approximately 50% are Married and 50% are Unmarried.
  • The distribution of Number of vehicles owned across policy holders is also evenly balanced, as approximately 33% each have one (1), two (2) and three (3) vehicles.
  • The distribution of Number of dependents across policy holders is also evenly balanced, as approximately 25% each have one (1), two (2), three (3) and four (4) dependents.
  • The policy holders in this dataset are more or less evenly balanced across Accomodation, as approximately 50% own and 50% rent their place of residence.
  • The bulk of the policy holders were sourced using Channel A (54%) while the least was Channel E
  • The majority of the policy holders reside in Urban residence types (~60%)
  • 6.3% of the policy holders defaulted on their premium payments. The data is heavily imbalanced

Numerical Correlation Analysis

Let's check whether there are correlations between the numerical variables.

Since Marital Status, Number of vehicles owned, Number of dependents & Accomodation were observed to be evenly balanced across the dataset, these variables are unlikely to provide any meaningful correlations when exploring the heatmaps.

Therefore let's create a subset of the variables for numerical correlation analysis.

# Evenly balanced variables that are left out of the correlation analysis
variables_to_ignore = [
    'Marital_Status',
    'Vehicles_Owned',
    'No_of_dependents',
    'Accomodation',
]
# Start from every column of the dataframe and strike out the ignored ones
num_corr_vars = data.columns.tolist()
for variables in variables_to_ignore:
    num_corr_vars.remove(variables)
print(num_corr_vars)
['Perc_premium_paid_in_cash', 'Age_in_days', 'Income', 'Late_premium_payment_3-6_months', 'Late_premium_payment_6-12_months', 'Late_premium_payment_>12_months', 'Risk_score', 'No_of_premiums_paid', 'Sourcing_channel', 'Customer_demographic', 'Premium_payment', 'Default']

Heat Map

Pearson

# One-hot encode the categorical columns of the correlation subset.
# The category columns are looked up on the subset itself, so a categorical
# column that was excluded from `num_corr_vars` cannot cause a KeyError.
corr_subset = data[num_corr_vars]
onehot = pd.get_dummies(
    corr_subset,
    columns=corr_subset.select_dtypes(include=['category']).columns.tolist())

oh_corr = onehot.corr(method='pearson')

annot_kws = {"fontsize": 12}

# Boolean mask hiding the upper triangle (above the diagonal); the
# correlation matrix is symmetric, so those cells repeat the lower half
mask = np.triu(np.ones(oh_corr.shape, dtype=bool), k=1)

plt.figure(figsize=(16, 12))
sns.heatmap(oh_corr, annot=True, fmt=".2f", mask=mask,
            cmap='coolwarm', square=True, annot_kws=annot_kws)
plt.yticks(rotation=0)
plt.show()

png

Observation:

  • There are no significantly correlated variables according to the Pearson analysis.

Since there are many outliers in the data, let's run the Spearman correlation analysis which is not sensitive to outliers.

Spearman

# One-hot encode the categorical columns of the correlation subset.
# The category columns are looked up on the subset itself, so a categorical
# column that was excluded from `num_corr_vars` cannot cause a KeyError.
corr_subset = data[num_corr_vars]
onehot = pd.get_dummies(
    corr_subset,
    columns=corr_subset.select_dtypes(include=['category']).columns.tolist())

# Rank-based correlation
oh_corr = onehot.corr(method='spearman')

annot_kws = {"fontsize": 12}

# Boolean mask hiding the upper triangle (above the diagonal); the
# correlation matrix is symmetric, so those cells repeat the lower half
mask = np.triu(np.ones(oh_corr.shape, dtype=bool), k=1)

plt.figure(figsize=(16, 12))
sns.heatmap(oh_corr, annot=True, fmt=".2f", mask=mask,
            cmap='coolwarm', square=True, annot_kws=annot_kws)
plt.yticks(rotation=0)
plt.show()

png

Observation:

  • There is a highly positive correlation with Income and Premium_payment

Observations:

The Spearman correlation resulted in only one pair of correlated variables as the data has many outliers.

Variable1 Variable2 Correlation
Premium_payment Income highly correlated

Bivariate Analysis

Bivariate Scatter Plots

Let's generate a pairplot of the numerical variables before we dive into the Numerical Variable Bivariate Analysis

# sns.pairplot(data,corner=True, hue="Default", markers="o");

There are no clear linear relationships among all the variables.

Let's use the variables from the numerical correlation analysis

# sns.pairplot(data[num_corr_vars],corner=True, hue="Default", markers="o");

Income and Premium_payment were observed to be highly correlated.
Let's visualize using a scatterplot

Income vs Premium_payment

# Income against premium amount, coloured by the Default flag
sns.scatterplot(
    x='Premium_payment',
    y='Income',
    hue='Default',
    data=data,
)
plt.show()

png

Observation:

  • There appears to be a linear relationship which is being masked by the presence of outliers.
    In Part 2 of the Capstone, the outliers will be removed and further analysis will be conducted.

Continuous Variable Exploration

Numerical - Categorical

Let's define a function to generate numerical and categorical plots

def num_cat_plots(numerical_variable):
    """
    Draw box plots of one numerical variable against each
    categorical variable, split by the target.

    The categorical variables come from the module-level
    `categorical_vars` list with the target `Default` removed.
    For each of them a `sns.catplot` box plot of the module-level
    `data` is shown (means marked, `Default` as hue), followed by
    a printed separator line.

    Only the `name` of `numerical_variable` (a Pandas series) is
    used; the values are read from `data`.
    """
    predictors = data[categorical_vars].columns.to_list()
    predictors.remove('Default')

    # Options shared by every plot
    box_options = dict(kind="box",
                       hue="Default",
                       showmeans=True,
                       height=2,
                       aspect=3)

    for predictor in predictors:
        sns.catplot(data=data,
                    x=predictor,
                    y=numerical_variable.name,
                    **box_options)
        plt.show()
        # Line separator
        print('--' * 55)

Perc_premium_paid_in_cash

# Box plots of Perc_premium_paid_in_cash by each categorical variable, split by Default
num_cat_plots(data.Perc_premium_paid_in_cash)

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------
  • Perc_premium_paid_in_cash
    • Policy holders who default on their insurance premiums tend to pay the bulk of their premium with cash.
      More data is needed to confirm this, but a reasonable assumption is that the policy holders who default work at jobs which pay their income in cash.

Age_in_days

# Box plots of Age_in_days by each categorical variable, split by Default
num_cat_plots(data.Age_in_days)

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------
  • Age_in_days
    • The average Age_in_days of policy holders who default on their premium payments is lower than that of those who don't default.

Income

# Box plots of Income by each categorical variable, split by Default
num_cat_plots(data.Income)

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------
  • Income
    • The presence of many outliers makes it visually difficult to discern any difference in income between defaulters and non-defaulters.
      In the Capstone Part 2, outlier treatment will be conducted to rectify this and allow any trends to be visually observed.

Risk_score

# Box plots of Risk_score by each categorical variable, split by Default
num_cat_plots(data.Risk_score)

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------
  • Risk_score
    • The average Risk_score of policy holders who default on their premium payments is marginally lower than those who don't default.
    • There is a significant number of outliers, which prevents a clearer distinction between defaulters and non-defaulters. This will be rectified in Capstone Part 2.

No_of_premiums_paid

# Box plots of No_of_premiums_paid by each categorical variable, split by Default
num_cat_plots(data.No_of_premiums_paid)

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------
  • No_of_premiums_paid
    • The average No_of_premiums_paid of policy holders who default on their premium payments is marginally lower than that of those who don't default. There is a significant number of outliers, which prevents a clearer distinction between defaulters and non-defaulters. This will be rectified in Capstone Part 2.

Premium_payment

# Box plots of Premium_payment by each categorical variable, split by Default
num_cat_plots(data.Premium_payment)

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------
  • Premium_payment
    • The average Premium_payment of policy holders who default on their premium payments is lower than those who don't default.

Default by Age_in_days

# Default flag as a function of age; the title goes on the axes that
# lineplot returns (the trailing semicolon hides the notebook echo)
age_axes = sns.lineplot(data=data, x='Age_in_days', y='Default')
age_axes.set_title("Default by Age_in_days");

Observations:

  • Senior Adults (80+ years) have a higher probability of defaulting on their insurance premiums.

Categorical Variable Exploration

Let's define a function to generate categorical variables vs target variable plots

def categ_target_plots(target_variable):
    """
    Draw count plots of every categorical variable, split by
    the target variable.

    The categorical variables come from the module-level
    `categorical_vars` list with the target itself removed.
    For each of them a horizontal `sns.catplot` count plot of
    the module-level `data` is shown with the target as hue,
    followed by a printed separator line.

    Only the `name` of `target_variable` (a Pandas series) is used.
    """
    target_name = target_variable.name
    other_vars = data[categorical_vars].columns.to_list()
    other_vars.remove(target_name)

    for other_var in other_vars:
        sns.catplot(kind="count",
                    y=other_var,
                    hue=target_name,
                    data=data)
        heading = "{} by {}".format(other_var, target_name)
        plt.title(heading, loc="center", fontsize=16)
        plt.show()
        # Line separator
        print('--' * 55)
# Count plots of every categorical variable, split by Default
categ_target_plots(data.Default)

png

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------

png

--------------------------------------------------------------------------------------------------------------

def stacked_plot(x, flag=True):
    """
    Show how the target `Default` is distributed within each level of `x`.

    * `x` - Pandas series aligned with the module-level `data`
    * `flag` - when truthy, the count cross-tabulation of `x` against
      `data['Default']` (with `All` margins) is displayed first
    * A stacked bar chart of the row-normalised cross-tabulation is then
      drawn, i.e. each bar shows the share of every `Default` value within
      one level of `x`.
    * `sns.set` - switches the Seaborn palette to `nipy_spectral`
      (the setting stays active afterwards)
    * A separator line is printed after the plot.
    """
    sns.set(palette='nipy_spectral')
    if flag:
        # `display` is provided by the IPython / Jupyter environment
        display(pd.crosstab(x, data['Default'], margins=True))

    table_norm = pd.crosstab(x, data['Default'], normalize='index')
    table_norm.plot(kind='bar', stacked=True, figsize=(8, 5))
    # Single legend, placed outside the axes so it never covers the bars
    plt.legend(loc="upper left", bbox_to_anchor=(1, 1))
    plt.xticks(rotation=45)
    plt.show()
    print('-' * 80)
# Categorical variables, without the target itself
categ_list = data[categorical_vars].columns.to_list()
categ_list.remove('Default')

# stacked_plot() creates and shows its own figure, so no extra figure is
# opened here (doing so only produced an empty "Figure ... with 0 Axes" output).
for each_var in categ_list:
    stacked_plot(data[each_var])

Output:

Default 0 1 All
Late_premium_payment_3-6_months
0 2687 64211 66898
1 1153 7673 8826
2 592 1927 2519
3 288 666 954
4 158 216 374
5 67 101 168
6 31 37 68
7 10 13 23
8 6 9 15
9 2 2 4
10 1 0 1
11 1 0 1
12 1 0 1
13 1 0 1
All 4998 74855 79853

png

--------------------------------------------------------------------------------



<Figure size 640x480 with 0 Axes>

Output:

Default 0 1 All
Late_premium_payment_6-12_months
0 3505 72423 75928
1 828 1852 2680
2 334 359 693
3 185 132 317
4 85 45 130
5 30 16 46
6 13 13 26
7 7 4 11
8 3 2 5
9 2 2 4
10 1 3 4
11 1 1 2
12 1 0 1
13 1 1 2
14 1 1 2
15 0 1 1
17 1 0 1
All 4998 74855 79853

png

--------------------------------------------------------------------------------



<Figure size 640x480 with 0 Axes>

Output:

Default 0 1 All
Late_premium_payment_>12_months
0 3810 72325 76135
1 835 2161 2996
2 228 270 498
3 85 66 151
4 25 23 48
5 7 6 13
6 4 2 6
7 2 1 3
8 1 1 2
11 1 0 1
All 4998 74855 79853

png

--------------------------------------------------------------------------------



<Figure size 640x480 with 0 Axes>

Output:

Default 0 1 All
Marital_Status
0 2571 37461 40032
1 2427 37394 39821
All 4998 74855 79853

png

--------------------------------------------------------------------------------



<Figure size 640x480 with 0 Axes>

Output:

Default 0 1 All
Vehicles_Owned
1 1668 25078 26746
2 1678 24842 26520
3 1652 24935 26587
All 4998 74855 79853

png

--------------------------------------------------------------------------------



<Figure size 640x480 with 0 Axes>

Output:

Default 0 1 All
No_of_dependents
1 1190 18650 19840
2 1258 18644 19902
3 1283 18932 20215
4 1267 18629 19896
All 4998 74855 79853

png

--------------------------------------------------------------------------------



<Figure size 640x480 with 0 Axes>

Output:

Default 0 1 All
Accomodation
0 2453 37370 39823
1 2545 37485 40030
All 4998 74855 79853

png

--------------------------------------------------------------------------------



<Figure size 640x480 with 0 Axes>

Output:

Default 0 1 All
Sourcing_channel
A 2349 40785 43134
B 1066 15446 16512
C 903 11136 12039
D 634 6925 7559
E 46 563 609
All 4998 74855 79853

png

--------------------------------------------------------------------------------



<Figure size 640x480 with 0 Axes>

Output:

Default 0 1 All
Customer_demographic
Rural 1998 29672 31670
Urban 3000 45183 48183
All 4998 74855 79853

png

--------------------------------------------------------------------------------



<Figure size 640x480 with 0 Axes>

The later the insurance premium payment, the higher probability of the policy holder being a defaulter.