## Warning: package 'knitr' was built under R version 3.5.2
## Warning: package 'rmdformats' was built under R version 3.5.3
Modeling Fraud Dtection looking for max AIC
Importing packages
library(corrplot)
library(RColorBrewer)
library(forecast)
library(gains)
library(caret)
library(ROCR)
library(leaps)
library(bestglm)
library(dplyr)
library(ggplot2)
Importing and balancing the data
set.seed(3997)
fraud.df<-read.csv(file.choose())
isNotFraud.df<-fraud.df %>%
filter(isFraud==0)
isFraud.df<-fraud.df %>%
filter(isFraud==1)
selected.index<-sample(c(1:276241),800)
isNotFraud.df<-isNotFraud.df[selected.index,]
fraud.df<-rbind(isNotFraud.df,isFraud.df)
Factoring and adding variables
fraud.df$type<-as.factor(fraud.df$type)
fraud.df$isFlaggedFraud<-factor(fraud.df$isFlaggedFraud,
levels = c(0,1))
fraud.df$isFraud<-factor(fraud.df$isFraud,
levels=c(0,1))
fraud.df$diff.orig<-fraud.df$oldbalanceOrg-(fraud.df$newbalanceOrig)
fraud.df$diff.dest<-fraud.df$newbalanceDest-(fraud.df$oldbalanceDest)
fraud.df$Hours<-(fraud.df$step)%%24
Deleting some variables
fraud.df$step<-NULL
fraud.df$isFlaggedFraud<-NULL
fraud.df$nameOrig<-NULL
fraud.df$nameDest<-NULL
Corr. of variables
mat_cor <- cor(x= fraud.df [,sapply(fraud.df, is.numeric)])
corrplot(mat_cor,
type="upper",
order="hclust",
method = "number",
col=brewer.pal(n=8, name="RdYlBu"))
## Preliminar Analysis
summary(fraud.df)
type amount oldbalanceOrg newbalanceOrig
CASH_OUT:1047 Min. : 0 Min. : 0 Min. : 0
TRANSFER: 553 1st Qu.: 88764 1st Qu.: 136 1st Qu.: 0
Median : 224704 Median : 61726 Median : 0
Mean : 921945 Mean : 997917 Mean : 212344
3rd Qu.: 638990 3rd Qu.: 448641 3rd Qu.: 0
Max. :21102432 Max. :59585040 Max. :49585040
oldbalanceDest newbalanceDest isFraud diff.orig
Min. : 0 Min. : 0 0:800 Min. : 0
1st Qu.: 0 1st Qu.: 0 1:800 1st Qu.: 136
Median : 139076 Median : 487248 Median : 52942
Mean : 1128079 Mean : 1640097 Mean : 785574
3rd Qu.: 951566 3rd Qu.: 1749240 3rd Qu.: 438383
Max. :28658510 Max. :50738796 Max. :10000000
diff.dest Hours
Min. : -401284 Min. : 0.00
1st Qu.: 0 1st Qu.:10.00
Median : 114672 Median :14.00
Mean : 512018 Mean :13.61
3rd Qu.: 337569 3rd Qu.:18.00
Max. :22417267 Max. :23.00
Some graphics with distributions
ggplot(fraud.df, aes(isFraud,amount))+
geom_boxplot() + ggtitle("Distribution of Amount filtered by isFraud")
ggplot(fraud.df, aes(Hours, amount))+
geom_point() + ggtitle("Distribution of Amount filtered by Hours")
ggplot(fraud.df, aes(isFraud,Hours))+
geom_boxplot() + ggtitle("Cross of isFraud and Hours")
ggplot(fraud.df, aes(isFraud, newbalanceDest))+
geom_boxplot()+ggtitle("Cross of isFraud and newbalanceDest")
Tables
table(fraud.df$type,fraud.df$isFraud)
0 1
CASH_OUT 654 393
TRANSFER 146 407
table(fraud.df$Hours,fraud.df$isFraud)
0 1
0 2 30
1 3 26
2 0 26
3 1 30
4 0 25
5 0 44
6 0 46
7 0 37
8 2 37
9 31 25
10 58 37
11 55 27
12 76 27
13 54 33
14 50 46
15 52 26
16 59 31
17 60 36
18 81 32
19 88 30
20 74 42
21 25 30
22 15 45
23 14 32
Creating Partition of data
set.seed(3997)
train.index <- sample(x= c(1:nrow(fraud.df)),
size= nrow(fraud.df)*0.8)
train.df <- fraud.df[train.index,]
valid.df <- fraud.df[-train.index,]
Looking for max AIC
train.df <-
train.df[, c("Hours","type","amount","diff.dest","diff.orig","isFraud")]
res.bestglm <- bestglm(Xy = train.df,
family = binomial,
IC = "AIC",
method = "exhaustive")
res.bestglm$BestModels
options(scipen = 999)
summary(res.bestglm$BestModel)
Call:
glm(formula = y ~ ., family = family, data = Xi, weights = weights)
Deviance Residuals:
Min 1Q Median 3Q Max
-3.6580 -0.1356 0.0000 0.0992 8.4904
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 1.187488049 0.341130114 3.481 0.000499 ***
Hours -0.145233242 0.022302740 -6.512 0.00000000007421 ***
typeTRANSFER 2.120225486 0.298790917 7.096 0.00000000000128 ***
amount -0.000024351 0.000004158 -5.857 0.00000000472271 ***
diff.dest -0.000003434 0.000002027 -1.694 0.090195 .
diff.orig 0.000037677 0.000003603 10.457 < 0.0000000000000002 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1774.43 on 1279 degrees of freedom
Residual deviance: 464.75 on 1274 degrees of freedom
AIC: 476.75
Number of Fisher Scoring iterations: 11
Modeling the data with Logit
logit.reg <- glm(isFraud ~ Hours+type+amount+diff.dest+diff.orig,
data = train.df,
family = "binomial")
options(scipen=999)
summary(logit.reg)
Call:
glm(formula = isFraud ~ Hours + type + amount + diff.dest + diff.orig,
family = "binomial", data = train.df)
Deviance Residuals:
Min 1Q Median 3Q Max
-3.6580 -0.1356 0.0000 0.0992 8.4904
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 1.187488049 0.341130114 3.481 0.000499 ***
Hours -0.145233242 0.022302740 -6.512 0.00000000007421 ***
typeTRANSFER 2.120225486 0.298790917 7.096 0.00000000000128 ***
amount -0.000024351 0.000004158 -5.857 0.00000000472271 ***
diff.dest -0.000003434 0.000002027 -1.694 0.090195 .
diff.orig 0.000037677 0.000003603 10.457 < 0.0000000000000002 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1774.43 on 1279 degrees of freedom
Residual deviance: 464.75 on 1274 degrees of freedom
AIC: 476.75
Number of Fisher Scoring iterations: 11
Odds og the model
round(data.frame(summary(logit.reg)$coefficients, odds = exp(coef(logit.reg))), 5)
Estimate Std..Error z.value Pr...z.. odds
(Intercept) 1.18749 0.34113 3.48104 0.0005 3.27883
Hours -0.14523 0.02230 -6.51190 0.0000 0.86482
typeTRANSFER 2.12023 0.29879 7.09602 0.0000 8.33302
amount -0.00002 0.00000 -5.85666 0.0000 0.99998
diff.dest 0.00000 0.00000 -1.69437 0.0902 1.00000
diff.orig 0.00004 0.00000 10.45695 0.0000 1.00004
R2 of the logit model
basereg <- glm(isFraud ~ 1, data = train.df, family = "binomial")
1-(logLik(logit.reg))/(logLik(basereg))
'log Lik.' 0.7380872 (df=6)
ROC Curve
logit.reg.pred <- predict(logit.reg, valid.df, type = "response")
pred <- prediction(logit.reg.pred, valid.df$isFraud)
perf <- performance(pred, measure = "tpr", x.measure = "fpr")
par(mar=c(5,5,2,2),xaxs = "i",yaxs = "i",cex.axis=1.3,cex.lab=1.4)
plot(perf, col=rainbow(10))
abline(a=0, b= 1)
auc <- round(as.numeric(performance(pred,"auc")@y.values),2)
auc_value <- paste("AUC= ", auc, sep="")
legend(0.7,0.2,auc_value,border="white",cex=1,box.col = "white")
grid(nx = 10, ny = NULL, col = "lightgray", lty = "dotted")
Cutoff, Recall and other measures
pred <- prediction(logit.reg.pred, valid.df$isFraud)
perf <- performance(pred, x.measure = "prec", measure = "rec")
opt.cut = function(perf, pred){
cut.ind = mapply(FUN=function(x, y, p){
d = (2*(x*y))/(x+y)
ind = min(which(d == max(d, na.rm=TRUE)))
c(recall = y[[ind]], precision = x[[ind]],
cutoff = p[[ind]], f1score= max(d, na.rm=TRUE))
}, perf@x.values, perf@y.values, pred@cutoffs)
}
print(opt.cut(perf, pred))
[,1]
recall 0.9554140
precision 0.9433962
cutoff 0.4586503
f1score 0.9493671
Confussion Matrix
cutoff <- opt.cut(perf, pred)[3]
CM <- confusionMatrix(data= as.factor(ifelse(logit.reg.pred > cutoff, 1, 0)),
reference= as.factor(valid.df$isFraud),
positive= "1",
mode= "prec_recall")
t(CM$table)
Prediction
Reference 0 1
0 154 9
1 8 149
options(scipen=999)
CM$byClass
Sensitivity Specificity Pos Pred Value
0.9490446 0.9447853 0.9430380
Neg Pred Value Precision Recall
0.9506173 0.9430380 0.9490446
F1 Prevalence Detection Rate
0.9460317 0.4906250 0.4656250
Detection Prevalence Balanced Accuracy
0.4937500 0.9469149