Predicting Conversion on a Personal Loan

The Case

A large Canadian bank currently has a surplus of liability customers, i.e., depositors, and would like to grow its asset customer base, i.e., borrowers, through a targeted marketing campaign. A previous campaign, held in Toronto, achieved a conversion rate of roughly 10%, where a conversion was a sale of a personal loan product to an existing liability customer. That campaign produced a data set of 4499 customers, which will be used to improve the efficiency of this year's campaign. The efficiency boost will come from designing an algorithm to predict which clients are most likely to convert on the personal loan, so that marketing can focus on this group. All sampled customers have a financial advisor/planner with the bank. The financial planners are mobile and can come to customers on request; the data set lists the branches each planner most often serves. Financial advisors earn commission when they advise customers to buy a personal loan product.

Data Cleaning and Basic Data Exploration

# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Import dataset
bank = pd.read_csv("Bank.csv")

bank.shape # ensure rows (number of customers) and columns (number of variables) are correct
bank.head() # look at data

bank.info() # ensure no missing values, look at data types
# Issue: CCAvg should be a float

bank.describe()
# Issue: 'Experience' has an invalid minimum value (-3.00)

# Check that every 'CCAvg' entry can be cast to float; print any that cannot
for obj in bank['CCAvg']:
    try:
        float(obj)
    except ValueError:
        print('Invalid:', obj) # One invalid entry: '2,00'

bank['CCAvg'] = bank['CCAvg'].replace(['2,00'], '2.00') # Replace invalid object '2,00' with value 2.00
bank['CCAvg'] = bank['CCAvg'].astype(float) # Make each object in series a float
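A quick dtype check (a small addition, not in the original notebook) confirms the cast succeeded:

# Confirm 'CCAvg' is now numeric
print(bank['CCAvg'].dtype) # expect float64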


# Replace invalid (negative) Experience values with NaN
# Note: Series.replace(val, None) can silently forward-fill in older pandas,
# so set the values explicitly instead of looping with replace(inplace=True)
bank.loc[bank['Experience'] < 0, 'Experience'] = np.nan
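A quick sanity check (not in the original notebook) confirms no negatives remain and shows how many entries were affected:

# Verify the cleanup: expect zero negatives, plus a count of the NaNs introduced
print((bank['Experience'] < 0).sum())
print(bank['Experience'].isna().sum())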

# Create two mortgage variables: presence/absence and mortgage amount
bank["MortgageAmount"] = bank['Mortgage'].replace(0, np.nan)
# A vectorized indicator is simpler (and avoids the replace/inplace pitfalls
# of looping over the column): 1 if a mortgage is present, 0 otherwise
bank["MortgagePresent"] = (bank["Mortgage"] > 0).astype(int)
bank["MortgagePresent"].value_counts() # examine ratio of presence/absence

# Examine and transform skewed data
contSub = bank[['Age', 'Experience', 'Income','CCAvg', 'MortgageAmount']] # Create a subset of continuous variables
skew = contSub.skew()
kurt = contSub.kurt()
print(skew) # CCAvg and MortgageAmount are highly skewed; Income is moderately skewed
print(kurt) # MortgageAmount has heavier tails than a normal distribution
# Examine outliers using box plots 
contSub.plot(kind='box', subplots=True, layout=(2,3), sharex=False, sharey=False, figsize=(20,20), fontsize = 20)
# Examine distributions using density plots
contSub.plot(kind = 'density', subplots = True, layout = (3,2), sharex = False, sharey = False, figsize=(20,20), fontsize = 20)
plt.rc('legend', fontsize=20) # set the legend font size before drawing the legend
plt.legend()
# SQRT transform for moderate positive skew
bank["income_sqrt"] = np.sqrt(bank["Income"])

# log transform for substantial positive skew
bank["CCAvg_log"] = np.log1p(bank["CCAvg"])

# log transform for substantial positive skew
bank["mortgage_log"] = np.log1p(bank["MortgageAmount"])
# Examine box plots for transformed data
tranSub = bank[["mortgage_log", "CCAvg_log", "income_sqrt"]]
tranSub.plot(kind='box', subplots=True, layout=(1,3), sharex=False, sharey=False, figsize=(20,20), fontsize = 20)
# Examine density plots for transformed data
tranSub.plot(kind = 'density', subplots = True, layout = (3,2), sharex = False, sharey = False, figsize=(20,20), fontsize = 20)
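To confirm the transforms actually reduced the skew, it is worth re-running the skewness check on the transformed columns (a small addition to the original flow):

# Skewness should now be much closer to 0 than for the raw variables
print(tranSub.skew())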

Exploratory Data Analysis

# Summarize the personal loan variable
bank["Personal Loan"].value_counts()

# Calculate the proportion of converters from last year's marketing campaign
bank["Personal Loan"].value_counts(normalize = True) * 100

# Bar chart of Personal Loan variable
PL = sns.countplot(x = "Personal Loan", data = bank) # pass the column by keyword; newer seaborn requires it
PL.set_title("Converted Customers")
# Examine correlations among variables
# Data subset removing non-transformed variables
bankNew = bank[["Age", "Experience", "Family", "Personal Loan", "Brokerage Account", "GIC Account", "Online",
                "CreditCard", "MortgagePresent", "income_sqrt", "CCAvg_log", "mortgage_log"]]
# Print correlation matrix
correlations = bankNew.corr()
print(correlations)

# Plot correlation matrix
fig = plt.figure()
ax = fig.add_subplot(111)
cax= ax.matshow(correlations, vmin=-1, vmax=1)
fig.colorbar(cax)

# Change tick labels
ticks = range(0,12,1)
ax.set_xticks(ticks)
ax.set_yticks(ticks)
ax.set_xticklabels(bankNew.columns, rotation=90)
ax.set_yticklabels(bankNew.columns)

# Highest correlations with the variable of interest (Personal Loan): income_sqrt, CCAvg_log, mortgage_log, GIC Account
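Rather than eyeballing the matrix, the same ranking can be pulled out programmatically (a small convenience added here):

# Rank predictors by absolute correlation with the target
print(correlations["Personal Loan"].abs().sort_values(ascending = False))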

Exploring variables of interest: income, credit card spending average, mortgage amount, GIC account

For exploration of the other variables in the dataset, see my GitHub.

Income

# Separate the variable income_sqrt by the two values of the target
loan_income_T = bank[bank["Personal Loan"] == 1]["income_sqrt"]
loan_income_F = bank[bank["Personal Loan"] == 0]["income_sqrt"]

# Create a stacked histogram of the two variables
plt.hist([loan_income_T, loan_income_F], bins = 10, stacked = True)
plt.legend(['loan = True', 'loan = False'], prop={'size': 12})
plt.title('Histogram of Income with Personal Loan Overlay')
plt.xlabel('Income (sqrt)')
plt.ylabel('Frequency')

# Recompute the bin counts without stacking: with stacked=True, plt.hist returns
# cumulative counts, which would throw off the normalization below
(n, bins, patches) = plt.hist([loan_income_T, loan_income_F], bins = 10)

# Normalized for better interpretation of the relationship
n_table = np.column_stack((n[0], n[1]))
n_norm = n_table / n_table.sum(axis=1)[:, None]
ourbins = np.column_stack((bins[0:10], bins[1:11]))
plt.bar(x = ourbins[:,0], height = n_norm[:,0], width = ourbins[:,1] - ourbins[:,0], align = 'edge')
plt.bar(x = ourbins[:,0], height = n_norm[:,1], width = ourbins[:,1] - ourbins[:,0],
        bottom = n_norm[:,0], align = 'edge')

plt.legend(['loan = True', 'loan = False'], prop={'size': 12})
plt.title('Normalized Histogram of Income with Personal Loan Overlay')
plt.xlabel('Income (sqrt)')
plt.ylabel('Frequency')
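The overlay suggests converters sit at the higher end of the income scale. A quick numeric check (an addition, not in the original) quantifies the gap:

# Compare mean transformed income between converters (1) and non-converters (0)
print(bank.groupby("Personal Loan")["income_sqrt"].mean())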

Credit Card Spending Average

# Separate the variable CCAvg_log by the two values of the target
loan_CCAvg_T = bank[bank["Personal Loan"] == 1]["CCAvg_log"]
loan_CCAvg_F = bank[bank["Personal Loan"] == 0]["CCAvg_log"]

# Create a stacked histogram of the two variables
plt.hist([loan_CCAvg_T, loan_CCAvg_F], bins = 10, stacked = True)
plt.legend(['loan = True', 'loan = False'], prop={'size': 12})
plt.title('Histogram of Credit Card Average with Personal Loan Overlay')
plt.xlabel('Credit Card Average (log)')
plt.ylabel('Frequency')

# Recompute the bin counts without stacking (stacked=True returns cumulative counts)
(n, bins, patches) = plt.hist([loan_CCAvg_T, loan_CCAvg_F], bins = 10)

# Normalized
n_table = np.column_stack((n[0], n[1]))
n_norm = n_table / n_table.sum(axis=1)[:, None]
ourbins = np.column_stack((bins[0:10], bins[1:11]))
plt.bar(x = ourbins[:,0], height = n_norm[:,0], width = ourbins[:,1] - ourbins[:,0], align = 'edge')
plt.bar(x = ourbins[:,0], height = n_norm[:,1], width = ourbins[:,1] - ourbins[:,0],
        bottom = n_norm[:,0], align = 'edge')

plt.legend(['loan = True', 'loan = False'], prop={'size': 12})
plt.title('Normalized Histogram of Credit Card Average with Personal Loan Overlay')
plt.xlabel('Credit Card Average (log)')
plt.ylabel('Frequency')

Mortgage Amount

# Separate the variable mortgage_log by the two values of the target
loan_mortgage_T = bank[bank["Personal Loan"] == 1]["mortgage_log"]
loan_mortgage_F = bank[bank["Personal Loan"] == 0]["mortgage_log"]

# Create a stacked histogram of the two variables
plt.hist([loan_mortgage_T, loan_mortgage_F], bins = 10, stacked = True)
plt.legend(['loan = True', 'loan = False'], prop={'size': 12})
plt.title('Histogram of Mortgage with Personal Loan Overlay')
plt.xlabel('Mortgage Amount (log)')
plt.ylabel('Frequency')

# Recompute the bin counts without stacking (stacked=True returns cumulative counts)
(n, bins, patches) = plt.hist([loan_mortgage_T, loan_mortgage_F], bins = 10)

# Normalized
n_table = np.column_stack((n[0], n[1]))
n_norm = n_table / n_table.sum(axis=1)[:, None]
ourbins = np.column_stack((bins[0:10], bins[1:11]))
plt.bar(x = ourbins[:,0], height = n_norm[:,0], width = ourbins[:,1] - ourbins[:,0], align = 'edge')
plt.bar(x = ourbins[:,0], height = n_norm[:,1], width = ourbins[:,1] - ourbins[:,0],
        bottom = n_norm[:,0], align = 'edge')

plt.legend(['loan = True', 'loan = False'], prop={'size': 12})
plt.title('Normalized Histogram of Mortgage Amount with Personal Loan Overlay')
plt.xlabel('Mortgage Amount (log)')
plt.ylabel('Frequency')

GIC Account

GIC_crosstab = pd.crosstab(bank["Personal Loan"], bank["GIC Account"], margins = False)
print(GIC_crosstab)

GIC_crosstab_norm= GIC_crosstab.div(GIC_crosstab.sum(axis=1), axis=0)
print(GIC_crosstab_norm)
GIC_crosstab_norm.plot(kind = 'bar', stacked = True, fontsize = 16)
plt.xlabel('Personal Loan', fontsize=16)
plt.legend(loc='center',prop={'size': 12}, title = "Presence of \nGIC Account") # looks like those who have a GIC account are more likely to convert
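The stacked bars suggest customers with a GIC account convert at a higher rate. A chi-square test of independence can back up the visual (a small addition, assuming scipy is installed):

from scipy.stats import chi2_contingency

# Test whether Personal Loan and GIC Account are independent
chi2, p, dof, expected = chi2_contingency(GIC_crosstab)
print("chi2 = %.2f, p-value = %.4f" % (chi2, p)) # a small p-value means the association is unlikely to be chance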

Business Analytics

Who are the most successful financial advisors?

If the company's goal is to convert customers on a personal loan, one measure of advisor success is the raw number of customers each advisor has converted.

advisor_crosstab = pd.crosstab(bank["Personal Loan"], bank["Advisor Name"], margins = True)

mostSuccessfulRaw = (advisor_crosstab.iloc[1]).sort_values(ascending = False) # row for Personal Loan == 1
mostSuccessfulRaw = mostSuccessfulRaw[1:] # drop the 'All' margins total, which sorts first
print(mostSuccessfulRaw.head())
print(mostSuccessfulRaw.describe())
top5Raw = mostSuccessfulRaw.head()
top5Raw.plot(kind = 'bar', stacked = True)
plt.ylabel("Number of Customers Who Accepted a Personal Loan")
plt.ylim((0,20))

Some advisors simply have more customers than others, so success could also be defined as a rate: the number of customers converted on a personal loan divided by the advisor's total number of customers.

# Frequency of advisors by customer
print(bank["Advisor Name"].value_counts())
mostSuccessfulTotal = ((mostSuccessfulRaw / bank["Advisor Name"].value_counts()).sort_values(ascending = False))*100

top5Total = mostSuccessfulTotal.head()
top5Total.plot(kind = 'bar', stacked = True)
plt.ylabel("Percent of Customers Who Accepted a Personal Loan")
plt.ylim((0,20))

Machine Learning: Predicting Conversion on a Personal Loan

from sklearn.model_selection import train_test_split 
from sklearn.model_selection import KFold 
from sklearn.model_selection import cross_val_score 
from sklearn.metrics import classification_report 
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 
from sklearn.naive_bayes import GaussianNB 
from sklearn.svm import SVC 

# class distribution
print(bank.groupby('Personal Loan').size()) # 0: 4019; 1 (conversion): 480
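With only 480 conversions out of 4499 customers, the classes are imbalanced, so the accuracy scores below should be read against the majority-class baseline (a quick check added here for context):

# A model that always predicts "no loan" would already be ~89% accurate
print(bank["Personal Loan"].value_counts(normalize = True).max()) # ~0.893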

MLdata = bank[["Personal Loan", "Age", "Experience", "income_sqrt", "Family", "CCAvg_log",
               "Brokerage Account", "GIC Account", "CreditCard", "MortgagePresent", "mortgage_log"]].copy()

# scikit-learn cannot fit on NaN: a missing mortgage_log means no mortgage
# (log1p(0) = 0), and the few invalid Experience values get the median
MLdata["mortgage_log"] = MLdata["mortgage_log"].fillna(0)
MLdata["Experience"] = MLdata["Experience"].fillna(MLdata["Experience"].median())

# get values of the dataframe (drop row indices and column names)
array = MLdata.values

# features = predictor/independent/explanatory variables
X = array[:,1:11] # all ten predictors are in the last ten columns (1:10 would silently drop mortgage_log)

# target variable = dependent/response variable
Y = array[:,0] # vertical split to separate x (explanatory) from y (target)

# Split off test dataset- 25% of the data will be used for testing
test_size = 0.25
seed = 12

# select 25% randomly and split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=seed) # generate four subsets

# Algorithms
models = []
# models.append(('LR', LogisticRegression(max_iter = 300))) # excluded: the solver kept hitting max_iter even after it was increased
models.append(('LDA', LinearDiscriminantAnalysis())) 
models.append(('KNN', KNeighborsClassifier())) 
models.append(('CART', DecisionTreeClassifier())) 
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))

# evaluate each model in turn
results = []
names = []

for name, model in models:
    # define how to split off validation data within the training set
    kfold = KFold(n_splits = 10, shuffle = True, random_state = seed) # fold the data into 10 pieces; recent scikit-learn requires shuffle=True when random_state is set
    # score the model on each of the 10 folds
    cv_results = cross_val_score(model, X_train, Y_train, cv = kfold, scoring = 'accuracy')
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

# CART performs best, at roughly 94% cross-validated accuracy

# Compare Algorithms
print(results) # cross-validation results for every algorithm, 10 scores each
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()
# Make predictions on test dataset
# instantiate learning model
cart = DecisionTreeClassifier(random_state = seed) # fix the random seed so the fitted tree is reproducible

# fitting the model
cart.fit(X_train, Y_train) # Learn on all of the training data

# predict the response
predictions = cart.predict(X_test) # "final exam"- how does it perform on data it has never seen before
print(predictions)
print(Y_test)

# evaluate accuracy
print(accuracy_score(Y_test, predictions))
print(confusion_matrix(Y_test, predictions))
print(classification_report(Y_test, predictions))
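Since the point of the campaign is deciding whom to target, it is worth seeing which predictors drive the tree's decisions. A minimal sketch using the fitted model's feature_importances_ (an addition, not part of the original analysis):

# Rank predictors by the fitted tree's impurity-based importance
featureNames = MLdata.columns[1:] # predictor names, in the same order as the columns of X
importances = pd.Series(cart.feature_importances_, index = featureNames)
print(importances.sort_values(ascending = False)) # given the correlations above, income_sqrt and CCAvg_log would be expected near the top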