Credit Worthiness Analysis#

This project aims to build classification models that analyze credit applicants’ creditworthiness. Since the costs of a false positive outweigh the benefits of a true positive by a factor of 5 (Shmueli et al., 2017), the models are to be optimized to minimize the average misclassification costs in addition to improve accuracy. The dataset used for this project contains 30 variables and 1,000 records where each credit applicant had been rated as “good credit” (700 cases) or “bad credit” (300 cases), and it was divided into training (80%) and validation (20%) sets.

As seen in Table 1, five models were built as the final models: two logistic regression models with full and selected independent variables and three classification trees with default settings without pruning; with pruning using a complexity parameter; a loss matrix that penalizes false positives. Statistically significant variables (p-value < 0.05.) were selected for the regression model with fewer variables based on the performance of the full model, and a few levels in the selected variables were combined into new variables to reduce dimensions. Confusion matrices were used to evaluate models’ performance. A cutoff value optimized to maximize the classification accuracy is not adequate for the classification problems with different misclassification costs (Calabrese, 2014), which applies to this project. Therefore, the cutoff value to evaluate these regression models was set to 0.7761. The “cutpointr” function was used to identify the optimal cutoff value that minimizes the misclassification cost (Thiele, 2021).

AUG values indicate models’ predictive capabilities (Muschelli, 2019), and the logistic regression full model has the highest AUG value. However, the model has an overfitting risk since it used all 30 target variables. Therefore, the logistic regression model with fewer variables might be the better model, and its AUG value is 0.738, which is the second to the highest. The Loss Matrix classification tree model performs the best in terms of the average misclassification costs; however, its model accuracy is only 0.575. Two other classification trees have high accuracy, but the average misclassification costs are also higher. The classification tree - CP has the highest accuracy among all models.

knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE, fig.width = 12, fig.height = 9)

  Cell In[1], line 1
    knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE, fig.width = 12, fig.height = 9)
          ^
SyntaxError: invalid syntax

Preparation#

#load packages
library(ggplot2)
library(reshape)
library(lattice)
library(caret)
library(dplyr)
library(scales)
library(gridExtra)
library(pROC)
library(cutpointr)
library(rpart)
library(rpart.plot)

#Display options
options(scipen = 999)
par(pty = "s") #remove the extra spacing by setting the plot type to "square"
options(digits = 3)
#Import data
credit <- read.csv("GermanCredit.csv")
credit <- credit[, -1] #drop the observation column

#based on the information in the textbook, there are 6 numerical variables:
# - DURATION
# - AMOUNT
# - INSTALL_RATE
# - AGE
# - NUM_CREDITS
# - NUM_DEPENDENTS

numeric_val = c("DURATION", "AMOUNT", "INSTALL_RATE", "AGE", "NUM_CREDITS", "NUM_DEPENDENTS")

#Convert categorical variables to factors
credit[,-which(names(credit) %in% numeric_val)] <-  lapply(credit[,-which(names(credit) %in% numeric_val)], factor)
str(credit)

Data Exploration#

summary(credit)
dim(credit)
str(credit)
head(credit)

#check the proportion of "RESPONSE"
table(credit$RESPONSE)
#0 is bad credit, 1 is good credit

#show the unique values of each variables
data.frame(unique.values = sapply(credit, n_distinct)) 

#Check missing values
data.frame(missing.val = sapply(credit, function(x) sum(length(which(is.na(x)))))) 

#summer statistics for the numeric variables
summary.stats <-  data.frame(
  mean = sapply(credit[numeric_val], mean),
  median = sapply(credit[numeric_val], median),
  min = sapply(credit[numeric_val], min),
  max = sapply(credit[numeric_val], max),
  sd = sapply(credit[numeric_val], sd),
  sum = sapply(credit[numeric_val], sum),
  length = sapply(credit[numeric_val], length)
)
summary.stats

Visualize the dataset#

#Visualize the distribution of numeric variables
duration.hist <- ggplot(data=credit)+geom_histogram(mapping = aes(DURATION), binwidth = 3)
amount.hist <- ggplot(data=credit)+geom_histogram(mapping = aes(AMOUNT), binwidth = 500)
install.rate.bar <- ggplot(data=credit)+geom_bar(mapping = aes(factor(INSTALL_RATE)))+xlab("Install Rate")
age.hist <- ggplot(data=credit)+geom_histogram(mapping = aes(AGE), binwidth = 5)
num.credit.bar <- ggplot(data=credit)+geom_bar(mapping = aes(x=factor(NUM_CREDITS)))+xlab("Number of existing credits")
num.dependent.bar <- ggplot(data=credit)+geom_bar(mapping = aes(x=factor(NUM_DEPENDENTS)))+xlab("Numer of dependents")

#show the plots in 3x2 grid
grid.arrange(duration.hist,amount.hist,install.rate.bar,age.hist,num.credit.bar, num.dependent.bar, ncol=2, nrow=3)


#detect multicollinearity
#heatmap
cor.mat <-  round(cor(credit[numeric_val]),2) #rounded correlation matrix
melted.cor.mat <-  melt(cor.mat)
ggplot(melted.cor.mat, mapping = aes(x= X1, y=X2, fill = value))+
  geom_tile()+
  geom_text(aes(x=X1, y= X2, label = value))

#from caret package
findCorrelation(x=cor(credit[numeric_val]), cutoff = 0.6, verbose = FALSE)

Data Partitioning#

set.seed(42)
train.index <- sample(c(1:dim(credit)[1]), dim(credit)[1]*0.8)
train.df <- credit[train.index,]
valid.df <- credit[-train.index,]

Data Mining#

Logistic Regression Models#

Full model#

#Full model
logit.reg.full <- glm(RESPONSE ~ ., data=train.df, family = "binomial")
summary(logit.reg.full)
round(data.frame(summary(logit.reg.full)$coefficients, odds = exp(coef(logit.reg.full))),3)

#fit the model
logit.reg.full.pred.train <- predict(logit.reg.full, train.df, type = "response")
confusionMatrix(factor(ifelse(logit.reg.full.pred.train > 0.5, 1, 0)), factor(train.df$RESPONSE), positive = "1")

Evaluate the model performance – using default cut-off value of 0.5#

#Evaluate the model performance – using default cut-off value of 0.5
logit.reg.full.pred <- predict(logit.reg.full, valid.df, type = "response")
confusionMatrix(factor(ifelse(logit.reg.full.pred > 0.5, 1, 0)), factor(valid.df$RESPONSE), positive = "1")

#roc curve and AUC
r <- pROC::roc(valid.df$RESPONSE, logit.reg.full.pred, plot=TRUE, main="ROC curve", print.auc=TRUE,
         legacy.axes = TRUE, ylab = "Sensitivity (True Positive rate)", xlab = "1-Specificity (False positive rate)")

The costs of a false positive (incorrectly classifying an applicants with a bad rating as good ) outweigh the benefits of a true positive (correctly classifying the an applicant with a good credit as good) by a factor of 5.

  • Cost of a false positive $500

  • Opportunity cost of False negative $100

Therefore, change the cutoff value to minimize the misclassificaiton cost.

Buidling a custom function to calculate average misclassification costs#

cost <-  function(act, p){
  weight1 = 5  #false positive weight
  weight0 = 1  #false negative weight
  c1 = (act==0)&(p==1) #TRUE if false positive - actual0 (bad) but predicted 1 (good)
  c0 = (act==1)&(p==0) #TRUE if false negative - actual1 (good) but predicted 0 (bad)
  return(mean(weight1*c1+weight0*c0))
}

Identifying the optimal cutoff value that minimizes the misclassification costs using ‘cutpointr’ package#

cp <- cutpointr(valid.df, logit.reg.full.pred, RESPONSE, method = minimize_metric, metric= misclassification_cost, cost_fp = 5, cost_fn = 1)
summary(cp)
plot(cp)
plot_metric(cp)

Changing the cutoff value#

confusionMatrix(factor(ifelse(logit.reg.full.pred > 0.7661 , 1, 0)), factor(valid.df$RESPONSE), positive = "1" )

#Compare the average misclassification costs of the cut-off value 0.5 vs 0.7661
#cut-off value: 0.5 (default)
cost(valid.df$RESPONSE, (factor(ifelse(logit.reg.full.pred > 0.5, 1, 0))))

#cut-off value: 0.7661
cost(valid.df$RESPONSE, (factor(ifelse(logit.reg.full.pred > 0.7661, 1, 0))))

Logstic Regression with selected variables#

#create new variables that contains significant factors only
credit$HISTORY.DELAY <- credit$HISTORY %in% c("4") 
credit$SAV_ACCT3.4 <- credit$SAV_ACCT %in% c("3","4")
credit$CHK_ACCT2.3 <- credit$CHK_ACCT %in% c("2", "3")
credit$PRESENT_RESIDENT2 <- credit$PRESENT_RESIDENT %in% c("2")

#includes significant variables only
selected_vals <- c( "RESPONSE","HISTORY.DELAY","CHK_ACCT2.3","DURATION", "NEW_CAR", "EDUCATION","AMOUNT",
                    "INSTALL_RATE","SAV_ACCT3.4","PRESENT_RESIDENT2","PROP_UNKN_NONE","OTHER_INSTALL", "FOREIGN")

set.seed(42)
train.index <- sample(c(1:dim(credit)[1]), dim(credit)[1]*0.8)
train.df <- credit[train.index,]
valid.df <- credit[-train.index,]

logit.reg <- glm(RESPONSE ~ ., data = train.df[,selected_vals], family = "binomial")

#fit the model
logit.reg.pred.train <- predict(logit.reg, train.df[,selected_vals], type = "response")
confusionMatrix(factor(ifelse(logit.reg.pred.train > 0.7661, 1, 0)), factor(train.df$RESPONSE), positive = "1" )

Evaluate the model performance#

#Evaluate the model performance
logit.reg.pred <- predict(logit.reg, valid.df[,selected_vals], type = "response")
confusionMatrix(factor(ifelse(logit.reg.pred > 0.7661, 1, 0)), factor(valid.df$RESPONSE), positive = "1" )

#roc curve and AUC
r <- pROC::roc(valid.df$RESPONSE, logit.reg.pred, plot=TRUE, main="ROC curve", print.auc = TRUE,
         legacy.axes = TRUE, ylab = "Sensitivity (True Positive rate)", xlab = "1-Specificity (False positive rate)") #legacy.axes show the 1-specificity (=False positive rate)

#calculate the average misclassification cost
cost(valid.df$RESPONSE, (factor(ifelse(logit.reg.pred > 0.7661, 1, 0))))

Classification Tree#

Base Model#

tree.base <- rpart(RESPONSE ~., data = train.df, method = "class") 
printcp(tree.base)
plotcp(tree.base)
rpart.plot(tree.base)

#Make a prediction
tree.base.pred.train <- predict(tree.base, train.df, type = "class") #set type to "class" to generate predicted class membership
confusionMatrix(tree.base.pred.train,train.df$RESPONSE, positive = "1")

Evaluate the model performance#

#evaluate the model with the validation set
tree.base.valid <- predict(tree.base, valid.df, type = "class")
confusionMatrix(tree.base.valid, valid.df$RESPONSE, positive = "1")
tree.base.valid.prob <- predict(tree.base, valid.df, type = "prob")[,2] #set class to "prob" to get probability for the ROC curve

r <- pROC::roc(valid.df$RESPONSE, tree.base.valid.prob, plot=TRUE, main="ROC curve",print.auc=TRUE,
         legacy.axes = TRUE, ylab = "Sensitivity (True Positive rate)", xlab = "1-Specificity (False positive rate)") #legacy.axes show the 1-specificity (=False positive rate)
#calculate misclassification cost
cost(valid.df$RESPONSE, tree.base.valid)

Classification Tree - Pruning using complexity Parameter#

printcp(tree.base)
plotcp(tree.base)

#set cp = 0.029
class.tree.cp <- rpart(RESPONSE ~., data = train.df, method = "class", control = rpart.control(cp=0.029)) 
rpart.plot(class.tree.cp)

Evaluate the model performance#

#evaluate the model with validation set
class.tree.cp.valid <- predict(class.tree.cp, valid.df, type = "class")
confusionMatrix(class.tree.cp.valid, valid.df$RESPONSE, positive = "1")
class.tree.cp.valid.prob <- predict(class.tree.cp, valid.df, type = "prob")[,2] 

r <- pROC::roc(valid.df$RESPONSE, class.tree.cp.valid.prob, plot=TRUE, main="ROC curve",print.auc=TRUE,
         legacy.axes = TRUE, ylab = "Sensitivity (True Positive rate)", xlab = "1-Specificity (False positive rate)") 

#calculate misclassification cost
cost(valid.df$RESPONSE, class.tree.cp.valid)

Classification Tree - Pruning using a loss matrix to penalize false positive#

loss.matrix <-  matrix(c(0,1,5,0), nrow=2) #the loss matrix is structured with actual on the rows and prediction on the columns
class.tree.lossmt <- rpart(RESPONSE ~., data = train.df, parms = list(loss=loss.matrix),method = "class", control = rpart.control(cp=0))
rpart.plot(class.tree.lossmt)

Evaluate the model performance#

#evaluate the model
class.tree.lossmt.valid <- predict(class.tree.lossmt, valid.df, type = "class")
confusionMatrix(class.tree.lossmt.valid, valid.df$RESPONSE, positive = "1")
class.tree.lossmt.valid.prob <- predict(class.tree.lossmt, valid.df, type = "prob")[,2]

#roc curve and AUC
r <- pROC::roc(valid.df$RESPONSE, class.tree.lossmt.valid.prob, plot=TRUE, main="ROC curve", print.auc = TRUE,
         legacy.axes = TRUE, ylab = "Sensitivity (True Positive rate)", xlab = "1-Specificity (False positive rate)") 

#calculate misclassification cost
cost(valid.df$RESPONSE, class.tree.lossmt.valid.prob)

Model Comparison#

Summarizing the model performance#

pROC::roc(valid.df$RESPONSE, logit.reg.pred, plot=TRUE, main="ROC curve", print.auc = TRUE,col="#4daf4a", lwd=4,
         legacy.axes = TRUE, ylab = "Sensitivity (True Positive rate)", xlab = "1-Specificity (False positive rate)")

plot.roc(valid.df$RESPONSE, class.tree.lossmt.valid.prob, col="#377eb8", lwd=4, print.auc=TRUE, add=TRUE, print.auc.y=0.4)

plot.roc(valid.df$RESPONSE, class.tree.cp.valid.prob, col="#dd1c77", lwd=4, print.auc=TRUE, add=TRUE, print.auc.y=0.3) 

legend("bottomright", legend=c("Logistic Regression - Selected variables", "Classification tree - Loss Matrix","Classification tree - CP"), col=c("#4daf4a", "#377eb8", "#dd1c77"), lwd=4)


models <- c("Logistic Regression full model", "Logistic Regression with selected variables",
            "Classification Tree base model", "Classification Tree - CP", "Classification Tree - Loss Matrix")
accuracy <- c(0.645, 0.62, 0.71, 0.72, 0.575)
avg.cost <- c(0.595, 0.62, 0.84, 0.82, 0.125)
AUC <- c(0.748, 0.738, 0.712, 0.701, 0.731)

performance_summary <- data.frame(models, accuracy, avg.cost, AUC)

The model performance summary#

knitr::kable(
performance_summary,
caption = "Table 1 The model performance summary"
)