Sei sulla pagina 1di 29

5/7/2017

Logistic Regression

Logistic Regression

Peter Caya April 25, 2017

# This is a code block library(ggplot2) # Data visualization library(readr) # CSV file I/O, e.g. the read_csv function library(caret)

## Loading required package: lattice

library(dplyr)

## ## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':

##

##

filter, lag

## The following objects are masked from 'package:base':

##

##

intersect, setdiff, setequal, union

[1] Working with the Data

a. Cleaning the Data

The data was loaded. While there were no missing values, the data was cleaned in order to remove the following:

*

5/7/2017

Logistic Regression

# This R environment comes with all of CRAN preinstalled, as well as many other helpful packages

# The environment is defined by the kaggle/rstats docker image: https://github.com/kagg le/docker-rstats

# For example, here's several helpful packages to load in

library(ggplot2) # Data visualization library(readr) # CSV file I/O, e.g. the read_csv function library(dplyr) library(caret)

library(reshape2)

library(knitr)

set.seed(13)

raw_data <- read.csv("UCI_Credit_Card.csv")

# Input data files are available in the "

# For example, running this (by clicking run or pressing Shift+Enter) will list the fil

es in the input directory

/input/"

directory.

# The purpose of this script is to do the following:

# [1] Identify any duplications or missing observations in the dataset.

# [2] Observe the way that payment and limit variables change based on the variables

of

# - Sex

# - Education

# - Marriage

# - Age groups

# [3] Observe the amount of defaults that occur based on the previously mentioned va riables.

# [4] Identify pattens in Pay, Bill and Pay_AMT variables for defaulted and undefaul

ted datasets.

# [5] Get percentiles for different variables.

# [1] ---------------------------------------------------------------------

# 25 variables with 30,000 observations. No duplicates were observed. There were al

so no missing values so this appears to be a rather

# clean dataset.

print(dim(raw_data))

## [1] 30000

25

print(sum(duplicated(raw_data)))

5/7/2017

Logistic Regression

count_missing <- apply(X = raw_data,MARGIN = 2,FUN = function(X){sum(is.na(X))}) sum(count_missing)

## [1] 0

apply(X = raw_data, MARGIN = 2,FUN = class)

##

ID

LIMIT_BAL

##

"numeric"

"numeric"

##

SEX

EDUCATION

##

"numeric"

"numeric"

##

MARRIAGE

AGE

##

"numeric"

"numeric"

##

PAY_0

PAY_2

##

"numeric"

"numeric"

##

PAY_3

PAY_4

##

"numeric"

"numeric"

##

PAY_5

PAY_6

##

"numeric"

"numeric"

##

BILL_AMT1

BILL_AMT2

##

"numeric"

"numeric"

##

BILL_AMT3

BILL_AMT4

##

"numeric"

"numeric"

##

BILL_AMT5

BILL_AMT6

##

"numeric"

"numeric"

##

PAY_AMT1

PAY_AMT2

##

"numeric"

"numeric"

##

PAY_AMT3

PAY_AMT4

##

"numeric"

"numeric"

##

PAY_AMT5

PAY_AMT6

##

"numeric"

"numeric"

## default.payment.next.month

##

"numeric"

Note something: The “PAY_#” variables contain observations of -2 which are not mentioned in the data dictionary. If you look at the data you can further see that we have fewer instances of 1 than we would expect - in the case of PAY_5 and PAY_6 there are no observations! I am guessing that these were mislabeled somehow and correct the mistake below along with some other transformations on the data.

pay_names <- grep(names(raw_data),pattern = "PAY_\\d")

for(i in pay_names){ raw_data[,i] <- ifelse(raw_data[,i]==-2,1,raw_data[,i])

}

5/7/2017

Logistic Regression

bill_names <- grep(names(raw_data),pattern = "BILL_AMT\\d")

for(i in bill_names){ raw_data[,i] <- ifelse(raw_data[,i]<0, -raw_data[,i] ,raw_data[,i])

}

exp_data <- raw_data %>% mutate(SEX = factor(ifelse(SEX ==1,"Male","Female"))) %>% filter(EDUCATION !=0 & EDUCATION != 4 & EDUCATION != 5 & EDUCATION != 6) %>% mutate(EDUCATION = factor( ifelse(EDUCATION==1,"GS",ifelse(EDUCATION==2,"Uni","HS")))) %>% filter(MARRIAGE == 1 | MARRIAGE == 2 ) %>% mutate(MARRIAGE = factor( ifelse(MARRIAGE == 1,"Married","Single"))) %>% mutate( Default = factor(ifelse(default.payment.next.month==1,"Default","Reg"))) %>% mutate_at(.cols = grep(names(raw_data),pattern = "PAY_\\d"), .funs = factor)

exp_data <- exp_data %>% mutate( AGE_Range = factor(

"20-30",

<= 40 , "30-40",

0 & AGE <= 50,"40-50",

AGE >= 50 & AGE <= 60, "50-60", "70-80"))))))

ifelse(

AGE >= 20 & AGE <= 30 ,

ifelse( AGE >= 30 & AGE

ifelse( AGE >= 4

ifelse(

raw_data <- raw_data %>% # filter(EDUCATION !=0 & EDUCATION != 4 & EDUCATION != 5 & EDUCATION != 6) %

>%

mutate(EDUCATION = ifelse(EDUCATION == 0 , 4,ifelse(EDUCATION >=4 , 4,EDUCATI

ON))) %>%

filter(MARRIAGE == 1 | MARRIAGE == 2 ) %>% mutate(MARRIAGE=as.factor(MARRIAGE)) %>% mutate(SEX = as.factor(SEX)) %>% mutate(EDUCATION = as.factor(EDUCATION)) %>% select(-ID) %>% mutate(Default = as.factor(default.payment.next.month)) %>% select(-default.payment.next.month)

for_testing <- raw_data

NOTE TO SELF: Create one variable from the payment data showing the number of months that were missed as of payment 4!

5/7/2017

Logistic Regression

bill_data <- exp_data %>% select( SEX,Default , contains( "BILL")

pmt_data <- exp_data %>% select(

SEX ,Default , contains( "PAY_AMT")

)

)

ggplot(melt(bill_data,id.vars = c("SEX","Default"))) + geom_boxplot(aes(x=variable, y= value))+

~ Default)

xlab("Gender")+ylab("Amount")+ facet_grid(SEX

facet_grid(SEX ggplot(melt(pmt_data,id.vars =

5/7/2017

Logistic Regression

5/7/2017 Logistic Regression bill_data <- exp_data %>% select( pmt_data <- exp_data %>% select(

bill_data <- exp_data %>% select( pmt_data <- exp_data %>% select(

EDUCATION ,Default, contains( "BILL") EDUCATION ,Default, contains( "PAY_AMT")

)

)

ggplot(melt(bill_data,id.vars = c("EDUCATION","Default" ) )) + geom_boxplot(aes(x=varia ble, y = value ))+ xlab("Education")+ylab("Amount") + facet_grid(EDUCATION~Default)

5/7/2017

Logistic Regression

5/7/2017 Logistic Regression ggplot(melt(pmt_data,id.vars = c("EDUCATION","Default" ))) +

ggplot(melt(pmt_data,id.vars = c("EDUCATION","Default" ))) + geom_boxplot(aes(x=variabl e, y = value ))+ xlab("Education")+ylab("Amount") + facet_grid(EDUCATION~Default)

5/7/2017

Logistic Regression

5/7/2017 Logistic Regression bill_data <- exp_data %>% select( pmt_data <- exp_data %>% select(

bill_data <- exp_data %>% select( pmt_data <- exp_data %>% select(

MARRIAGE,Default , contains( "BILL") MARRIAGE ,Default , contains( "PAY_AMT")

)

)

ggplot(melt(bill_data,id.vars = c("MARRIAGE","Default" ) )) + geom_boxplot(aes(x=varia ble, y = value ))+ xlab("MARRIAGE")+ylab("Amount") + facet_grid(MARRIAGE~Default)

5/7/2017

Logistic Regression

5/7/2017 Logistic Regression ggplot(melt(pmt_data,id.vars = c("MARRIAGE","Default" ))) +

ggplot(melt(pmt_data,id.vars = c("MARRIAGE","Default" ))) + geom_boxplot(aes(x=variabl e, y = value ))+ xlab("MARRIAGE")+ylab("Amount") + facet_grid(MARRIAGE~Default)

5/7/2017

Logistic Regression

5/7/2017 Logistic Regression # Summarize age: summary(exp_data$AGE) ## Min. 1st Qu. Median Mean 3rd Qu.

# Summarize age:

summary(exp_data$AGE)

##

Min. 1st Qu.

Median

Mean 3rd Qu.

Max.

##

21.00

28.00

34.00

35.39

41.00

79.00

5/7/2017

Logistic Regression

5/7/2017 Logistic Regression bill_data <- exp_data %>% select( pmt_data <- exp_data %>% select(

bill_data <- exp_data %>% select( pmt_data <- exp_data %>% select(

AGE_Range,Default , contains( "BILL") AGE_Range ,Default , contains( "PAY_AMT")

)

)

ggplot(melt(bill_data,id.vars = c("AGE_Range","Default" ))) + geom_boxplot(aes(x=varia ble, y = value ))+ xlab("AGE_Range")+ylab("Amount") + facet_grid(AGE_Range~Default)

5/7/2017

Logistic Regression

5/7/2017 Logistic Regression ggplot(melt(bill_data,id.vars = c("AGE_Range","Default" ))) +

ggplot(melt(bill_data,id.vars = c("AGE_Range","Default" ))) + geom_boxplot(aes(x=varia ble, y = value ))+ xlab("AGE_Range")+ylab("Amount") + facet_grid(AGE_Range~Default)

5/7/2017

Logistic Regression

5/7/2017 Logistic Regression # [3] --------------------------------------------------------------------- # We see that

# [3] ---------------------------------------------------------------------

# We see that bankruptcies make up ~22% of the data.

sum(exp_data$default.payment.next.month==1)/nrow(exp_data)

## [1] 0.2233652

5/7/2017

Logistic Regression

## Source: local data frame [2 x 4] ## Groups: SEX [2] ##

##

SEX Default `Default Status Breakdown` `P(Default|Sex)`

## <fctr> <fctr>

<int>

<dbl>

## 1 Female Default

3697

0.2102001

## 2

Male Default

2817

0.2433693

exp_data %>% group_by(EDUCATION,Default) %>% summarise("Default Status Breakdown"=n()) %>% mutate( `P(Default|Education)` = `Default Status Breakdown`/sum(`Default St atus Breakdown`)) %>% filter(Default == "Default")

## Source: local data frame [3 x 4] ## Groups: EDUCATION [3] ##

##

EDUCATION Default `Default Status Breakdown` `P(Default|Education)`

##

<fctr> <fctr>

<int>

<dbl>

## 1

GS Default

2023

0.1920995

## 2

HS Default

1206

0.2528302

## 3

Uni Default

3285

0.2369788

exp_data %>% group_by(MARRIAGE,Default) %>% summarise("Default Status Breakdown"=n()) %>% mutate( `P(Default|MARRIAGE)` = `Default Status Breakdown`/sum(`Default S tatus Breakdown`)) %>% filter(Default == "Default")

## Source: local data frame [2 x 4] ## Groups: MARRIAGE [2] ##

##

MARRIAGE Default `Default Status Breakdown` `P(Default|MARRIAGE)`

##

<fctr> <fctr>

<int>

<dbl>

## 1 Married Default

3189

0.2375419

## 2

Single Default

3325

0.2112721

ggplot(exp_data)+geom_bar(aes(Default))+ coord_flip()

5/7/2017

Logistic Regression

5/7/2017 Logistic Regression ggplot(exp_data)+geom_bar(aes(Default))+ facet_grid( SEX ~. ) + coord_flip() + ylab("Ob

ggplot(exp_data)+geom_bar(aes(Default))+ facet_grid( SEX ~. ) + coord_flip() + ylab("Ob servations") + xlab("Status")

5/7/2017

Logistic Regression

5/7/2017 Logistic Regression ggplot(exp_data)+geom_bar(aes(Default))+ facet_grid( EDUCATION ~. ) + coord_flip()+ yla

ggplot(exp_data)+geom_bar(aes(Default))+ facet_grid( EDUCATION ~. ) + coord_flip()+ yla b("Observations") + xlab("Status")

5/7/2017

Logistic Regression

5/7/2017 Logistic Regression ggplot(exp_data)+geom_bar(aes(Default))+ facet_grid( MARRIAGE ~. ) + coord_flip()+

ggplot(exp_data)+geom_bar(aes(Default))+ facet_grid( MARRIAGE ~. ) + coord_flip()+ ylab("Observations") + xlab("Status")

5/7/2017

Logistic Regression

5/7/2017 Logistic Regression ggplot(exp_data)+geom_bar(aes(Default))+ facet_grid( PAY_0 ~. ) + coord_flip()+ ylab("O

ggplot(exp_data)+geom_bar(aes(Default))+ facet_grid( PAY_0 ~. ) + coord_flip()+ ylab("O bservations") + xlab("Status")

5/7/2017

Logistic Regression

5/7/2017 Logistic Regression ggplot(exp_data)+geom_bar(aes(Default))+ facet_grid( PAY_2 ~. ) + coord_flip()+ ylab("O

ggplot(exp_data)+geom_bar(aes(Default))+ facet_grid( PAY_2 ~. ) + coord_flip()+ ylab("O bservations") + xlab("Status")

5/7/2017

Logistic Regression

5/7/2017 Logistic Regression ggplot(exp_data)+geom_bar(aes(Default))+ facet_grid( PAY_3 ~. ) + coord_flip()+ ylab("O

ggplot(exp_data)+geom_bar(aes(Default))+ facet_grid( PAY_3 ~. ) + coord_flip()+ ylab("O bservations") + xlab("Status")

5/7/2017

Logistic Regression

5/7/2017 Logistic Regression ggplot(exp_data)+geom_bar(aes(Default))+ facet_grid( PAY_4 ~. ) + coord_flip()+ ylab("O

ggplot(exp_data)+geom_bar(aes(Default))+ facet_grid( PAY_4 ~. ) + coord_flip()+ ylab("O bservations") + xlab("Status")

5/7/2017

Logistic Regression

5/7/2017 Logistic Regression ggplot(exp_data)+geom_bar(aes(Default))+ facet_grid( PAY_5 ~. ) + coord_flip()+ ylab("O

ggplot(exp_data)+geom_bar(aes(Default))+ facet_grid( PAY_5 ~. ) + coord_flip()+ ylab("O bservations") + xlab("Status")

5/7/2017

Logistic Regression

5/7/2017 Logistic Regression ggplot(exp_data)+geom_bar(aes(Default))+ facet_grid( PAY_6 ~. ) + coord_flip()+ ylab("O

ggplot(exp_data)+geom_bar(aes(Default))+ facet_grid( PAY_6 ~. ) + coord_flip()+ ylab("O bservations") + xlab("Status")

5/7/2017

Logistic Regression

5/7/2017 Logistic Regression file:///media/petercaya/B192-0759/Kaggle/CC_Default/Logistic_Regression_for_CC_Fraud.html

5/7/2017

Logistic Regression

# [4] ---------------------------------------------------------------------

# Create probability functions:

q1 <- function(x){quantile(x=x, probs = .2)} q2 <- function(x){quantile(x=x, probs = .4)}

q3 <- function(x){quantile(x=x, probs = .6)} q4 <- function(x){quantile(x=x, probs = .8)}

hold <- exp_data %>% group_by(Default) %>% summarise_if(.predicate = is.numeric, funs(m in,q1,q2,median,q3,q4,max) ) hold <- melt(hold,id.vars = "Default")

pay_names <- names(exp_data)[grep(names(exp_data),pattern = "PAY_\\d")]

# exp_data %>% group_by_(i,"Default") %>% summarise(Observations = n() ) %>% group_by_ (i) %>% mutate(pct = Observations/sum(Observations))

k <- 1 pay_table <- list() for(i in pay_names){ cur_table <- exp_data %>% group_by_(i,"Default") %>% summarise(Observations = n() ) %>% group_by_(i) %>% mutate(pct = Observations/sum(Observations)) %>% filter(Default == "Default") names(cur_table)[1] <- c("Category") pay_table[[k]] <- cur_table[c(1,4)]

k = k+1

}

for( i in 1:length(pay_table)){ if(i==1){ disp_table <- pay_table[[i]]

names(disp_table)[ncol(disp_table)]

<- pay_names[i] }else{

disp_table <- merge(disp_table,pay_table[[i]],by = "Category",all = TRUE)

names(disp_table)[ncol(disp_table)]

<- pay_names[i]}

}

kable(disp_table)

Category

PAY_0

PAY_2

PAY_3

PAY_4

PAY_5

PAY_6

0

0.1285844

0.1598030

0.1759120

0.1846231

0.1896783

0.1896923

1

0.2537885

0.1853156

0.1881761

0.1956470

0.1992302

0.2032040

5/7/2017

Logistic Regression

Category

PAY_0

PAY_2

PAY_3

PAY_4

PAY_5

PAY_6

-1

0.1691203

0.1619289

0.1581315

0.1622744

0.1647538

0.1718219

2

0.6968187

0.5585492

0.5159659

0.5220470

0.5427911

0.5089974

3

0.7667732

0.6172840

0.5751073

0.6193182

0.6363636

0.6448087

4

0.6842105

0.5052632

0.5866667

0.6666667

0.6071429

0.6250000

5

0.5416667

0.6000000

0.5714286

0.5142857

0.5882353

0.5384615

6

0.5454545

0.7500000

0.6086957

0.4000000

0.7500000

0.7222222

7

0.7777778

0.6000000

0.8076923

0.8214286

0.8214286

0.8222222

8

0.5789474

NA

0.6666667

0.5000000

1.0000000

1.0000000

b. Exploratory Analysis

In the exploratory analysis I will attempt to answer a few questions I have regarding this dataset.

How much of a determinant are the payment status variables in determining default behavior?

The variable description documentation states that PAY_0 is: > Repayment status in September, 2005 (-1=pay duly, 1=payment delay for one month, 2=payment delay for two months, … 8=payment delay for eight months, 9=payment delay for nine months and above)

PAY_2, PAY_3, PAY_4, PAY_5, and PAY_6 represent the same metric for the months of August, July, June, May, and April respectively. How does the number of defaults change as we move from the earliest month to the latest?

[2] Applying Logistic Regression.

train_portion = .8

train_index <- createDataPartition(for_testing$Default,p = train_portion, list = FALSE,times = 1) training_data <- for_testing[train_index,] testing_data <- for_testing[-train_index,]

#

training_data <- upSample(x = training_data[-24],training_data[,24],list = FALSE,ynam

e

= "Default")

To start with this analysis I will apply logistic regression using R’s caret package. In this

5/7/2017

Logistic Regression

fitControl <- trainControl(method = "repeatedcv", number = 10, repeats = 10)

training_data$AGE <- as.numeric( training_data$AGE )

def <- training_data$Default

#

fact <- training_data$BILL_AMT1

#

fact2 <- training_data$BILL_AMT2

#

fact3 <- training_data$SEX

#

fact4 <- training_data$EDUCATION

#

#

hold1 = glm(data = training_data, formula = Default ~. , family = binomial(link = "l

 

ogit") )

#

#

#

hold2 = glm(data = training_data, formula = Default ~. , family = binomial(link = "l

ogit") )

#

#

predict(object = hold, )

#

#

log_model <- train(Default ~. , data = training_data,

method="glm", family = binomial(link = "logit"), trControl = fitControl)

#

#

#

vars <- training_data[1:11]

#

#

#

y = training_data$Default

#

#

nb_model <- train(vars,y,'nb',trControl=trainControl(method='cv',number=10))

#

#

log_model <- train(

,

#

method="nb",trControl = fitControl)

#

#

#

#

[3] Model Accuracy.

5/7/2017

Logistic Regression

library(mlbench)

#

#

train_preds <- predict(log_model,newdata = training_data) test_preds <- predict(log_model,newdata = testing_data)

#

confusionMatrix(train_preds,training_data$Default)

## Confusion Matrix and Statistics ##

##

Reference

 

## Prediction

0

1

##

0 17998

4057

##

1

463

1181

##

##

Accuracy : 0.8093

##

95% CI : (0.8042, 0.8143)

##

No Information Rate : 0.779

##

P-Value [Acc > NIR] : < 2.2e-16

##

##

Kappa : 0.2657

## Mcnemar's Test P-Value : < 2.2e-16 ##

##

Sensitivity : 0.9749

##

Specificity : 0.2255

##

Pos Pred Value : 0.8161

##

Neg Pred Value : 0.7184

##

Prevalence : 0.7790

##

Detection Rate : 0.7594

##

Detection Prevalence : 0.9306

##

Balanced Accuracy : 0.6002

##

##

'Positive' Class : 0

##

confusionMatrix(test_preds,testing_data$Default)

5/7/2017

Logistic Regression

## Confusion Matrix and Statistics ##

##

Reference

## Prediction

0

1

##

0 4493 1005

##

1

122

304

##

##

Accuracy : 0.8098

##

95% CI : (0.7995, 0.8197)

##

No Information Rate : 0.779

##

P-Value [Acc > NIR] : 3.697e-09

##

##

Kappa : 0.2714

## Mcnemar's Test P-Value : < 2.2e-16

##

##

Sensitivity : 0.9736

##

Specificity : 0.2322

##

Pos Pred Value : 0.8172

##

Neg Pred Value : 0.7136

##

Prevalence : 0.7790

##

Detection Rate : 0.7584

##

Detection Prevalence : 0.9281

##

Balanced Accuracy : 0.6029

##

##

'Positive' Class : 0

##

[4] Summary of Results