## Warning: package 'knitr' was built under R version 3.5.2
## Warning: package 'rmdformats' was built under R version 3.5.3

Modeling Fraud Dtection looking for max AIC

Importing packages

library(corrplot)
library(RColorBrewer)
library(forecast)
library(gains)
library(caret)
library(ROCR)
library(leaps)
library(bestglm)
library(dplyr)
library(ggplot2)

Importing and balancing the data

set.seed(3997)
fraud.df<-read.csv(file.choose())
isNotFraud.df<-fraud.df %>%
  filter(isFraud==0)
isFraud.df<-fraud.df %>%
  filter(isFraud==1)
selected.index<-sample(c(1:276241),800)
isNotFraud.df<-isNotFraud.df[selected.index,]

fraud.df<-rbind(isNotFraud.df,isFraud.df)

Factoring and adding variables

fraud.df$type<-as.factor(fraud.df$type)
fraud.df$isFlaggedFraud<-factor(fraud.df$isFlaggedFraud,
                                levels = c(0,1))
fraud.df$isFraud<-factor(fraud.df$isFraud,
                         levels=c(0,1))
fraud.df$diff.orig<-fraud.df$oldbalanceOrg-(fraud.df$newbalanceOrig)
fraud.df$diff.dest<-fraud.df$newbalanceDest-(fraud.df$oldbalanceDest)
fraud.df$Hours<-(fraud.df$step)%%24

Deleting some variables

fraud.df$step<-NULL
fraud.df$isFlaggedFraud<-NULL
fraud.df$nameOrig<-NULL
fraud.df$nameDest<-NULL

Corr. of variables

mat_cor <- cor(x= fraud.df [,sapply(fraud.df, is.numeric)])
corrplot(mat_cor,
         type="upper",
         order="hclust",
         method = "number", 
         col=brewer.pal(n=8, name="RdYlBu"))

## Preliminar Analysis

summary(fraud.df)
       type          amount         oldbalanceOrg      newbalanceOrig    
 CASH_OUT:1047   Min.   :       0   Min.   :       0   Min.   :       0  
 TRANSFER: 553   1st Qu.:   88764   1st Qu.:     136   1st Qu.:       0  
                 Median :  224704   Median :   61726   Median :       0  
                 Mean   :  921945   Mean   :  997917   Mean   :  212344  
                 3rd Qu.:  638990   3rd Qu.:  448641   3rd Qu.:       0  
                 Max.   :21102432   Max.   :59585040   Max.   :49585040  
 oldbalanceDest     newbalanceDest     isFraud   diff.orig       
 Min.   :       0   Min.   :       0   0:800   Min.   :       0  
 1st Qu.:       0   1st Qu.:       0   1:800   1st Qu.:     136  
 Median :  139076   Median :  487248           Median :   52942  
 Mean   : 1128079   Mean   : 1640097           Mean   :  785574  
 3rd Qu.:  951566   3rd Qu.: 1749240           3rd Qu.:  438383  
 Max.   :28658510   Max.   :50738796           Max.   :10000000  
   diff.dest            Hours      
 Min.   : -401284   Min.   : 0.00  
 1st Qu.:       0   1st Qu.:10.00  
 Median :  114672   Median :14.00  
 Mean   :  512018   Mean   :13.61  
 3rd Qu.:  337569   3rd Qu.:18.00  
 Max.   :22417267   Max.   :23.00  

Some graphics with distributions

ggplot(fraud.df, aes(isFraud,amount))+
  geom_boxplot() + ggtitle("Distribution of Amount filtered by isFraud")

ggplot(fraud.df, aes(Hours, amount))+
  geom_point() + ggtitle("Distribution of Amount filtered by Hours")

ggplot(fraud.df, aes(isFraud,Hours))+
  geom_boxplot() + ggtitle("Cross of isFraud and Hours")

ggplot(fraud.df, aes(isFraud, newbalanceDest))+
  geom_boxplot()+ggtitle("Cross of isFraud and newbalanceDest")

Tables

table(fraud.df$type,fraud.df$isFraud)
          
             0   1
  CASH_OUT 654 393
  TRANSFER 146 407
table(fraud.df$Hours,fraud.df$isFraud)
    
      0  1
  0   2 30
  1   3 26
  2   0 26
  3   1 30
  4   0 25
  5   0 44
  6   0 46
  7   0 37
  8   2 37
  9  31 25
  10 58 37
  11 55 27
  12 76 27
  13 54 33
  14 50 46
  15 52 26
  16 59 31
  17 60 36
  18 81 32
  19 88 30
  20 74 42
  21 25 30
  22 15 45
  23 14 32

Creating Partition of data

set.seed(3997)
train.index <- sample(x= c(1:nrow(fraud.df)), 
                      size= nrow(fraud.df)*0.8)

train.df <- fraud.df[train.index,]
valid.df <- fraud.df[-train.index,]

Looking for max AIC

train.df <-
  train.df[, c("Hours","type","amount","diff.dest","diff.orig","isFraud")]
res.bestglm <- bestglm(Xy = train.df,
                       family = binomial,
                       IC = "AIC",                 
                       method = "exhaustive")
res.bestglm$BestModels
options(scipen = 999)
summary(res.bestglm$BestModel)

Call:
glm(formula = y ~ ., family = family, data = Xi, weights = weights)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-3.6580  -0.1356   0.0000   0.0992   8.4904  

Coefficients:
                 Estimate   Std. Error z value             Pr(>|z|)    
(Intercept)   1.187488049  0.341130114   3.481             0.000499 ***
Hours        -0.145233242  0.022302740  -6.512     0.00000000007421 ***
typeTRANSFER  2.120225486  0.298790917   7.096     0.00000000000128 ***
amount       -0.000024351  0.000004158  -5.857     0.00000000472271 ***
diff.dest    -0.000003434  0.000002027  -1.694             0.090195 .  
diff.orig     0.000037677  0.000003603  10.457 < 0.0000000000000002 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1774.43  on 1279  degrees of freedom
Residual deviance:  464.75  on 1274  degrees of freedom
AIC: 476.75

Number of Fisher Scoring iterations: 11

Modeling the data with Logit

logit.reg <- glm(isFraud ~ Hours+type+amount+diff.dest+diff.orig, 
                 data = train.df, 
                 family = "binomial")
options(scipen=999)
summary(logit.reg)

Call:
glm(formula = isFraud ~ Hours + type + amount + diff.dest + diff.orig, 
    family = "binomial", data = train.df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-3.6580  -0.1356   0.0000   0.0992   8.4904  

Coefficients:
                 Estimate   Std. Error z value             Pr(>|z|)    
(Intercept)   1.187488049  0.341130114   3.481             0.000499 ***
Hours        -0.145233242  0.022302740  -6.512     0.00000000007421 ***
typeTRANSFER  2.120225486  0.298790917   7.096     0.00000000000128 ***
amount       -0.000024351  0.000004158  -5.857     0.00000000472271 ***
diff.dest    -0.000003434  0.000002027  -1.694             0.090195 .  
diff.orig     0.000037677  0.000003603  10.457 < 0.0000000000000002 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1774.43  on 1279  degrees of freedom
Residual deviance:  464.75  on 1274  degrees of freedom
AIC: 476.75

Number of Fisher Scoring iterations: 11

Odds og the model

round(data.frame(summary(logit.reg)$coefficients, odds = exp(coef(logit.reg))), 5)
             Estimate Std..Error  z.value Pr...z..    odds
(Intercept)   1.18749    0.34113  3.48104   0.0005 3.27883
Hours        -0.14523    0.02230 -6.51190   0.0000 0.86482
typeTRANSFER  2.12023    0.29879  7.09602   0.0000 8.33302
amount       -0.00002    0.00000 -5.85666   0.0000 0.99998
diff.dest     0.00000    0.00000 -1.69437   0.0902 1.00000
diff.orig     0.00004    0.00000 10.45695   0.0000 1.00004

R2 of the logit model

basereg <- glm(isFraud ~ 1, data = train.df, family = "binomial")
1-(logLik(logit.reg))/(logLik(basereg))
'log Lik.' 0.7380872 (df=6)

ROC Curve

logit.reg.pred <- predict(logit.reg, valid.df, type = "response")
pred <- prediction(logit.reg.pred, valid.df$isFraud)
perf <- performance(pred, measure = "tpr", x.measure = "fpr") 
par(mar=c(5,5,2,2),xaxs = "i",yaxs = "i",cex.axis=1.3,cex.lab=1.4)
plot(perf, col=rainbow(10))
abline(a=0, b= 1)
auc <- round(as.numeric(performance(pred,"auc")@y.values),2)
auc_value <- paste("AUC= ", auc, sep="")
legend(0.7,0.2,auc_value,border="white",cex=1,box.col = "white")
grid(nx = 10, ny = NULL, col = "lightgray", lty = "dotted")

Cutoff, Recall and other measures

pred <- prediction(logit.reg.pred, valid.df$isFraud)
perf <- performance(pred, x.measure = "prec", measure = "rec")

opt.cut = function(perf, pred){
  cut.ind = mapply(FUN=function(x, y, p){
    d = (2*(x*y))/(x+y)
    ind = min(which(d == max(d, na.rm=TRUE)))
    c(recall = y[[ind]], precision = x[[ind]], 
      cutoff = p[[ind]], f1score= max(d, na.rm=TRUE))
  }, perf@x.values, perf@y.values, pred@cutoffs)
}
print(opt.cut(perf, pred))
               [,1]
recall    0.9554140
precision 0.9433962
cutoff    0.4586503
f1score   0.9493671

Confussion Matrix

cutoff <- opt.cut(perf, pred)[3]
CM <- confusionMatrix(data= as.factor(ifelse(logit.reg.pred > cutoff, 1, 0)), 
                      reference= as.factor(valid.df$isFraud),
                      positive= "1",
                      mode= "prec_recall")
t(CM$table)
         Prediction
Reference   0   1
        0 154   9
        1   8 149
options(scipen=999)
CM$byClass
         Sensitivity          Specificity       Pos Pred Value 
           0.9490446            0.9447853            0.9430380 
      Neg Pred Value            Precision               Recall 
           0.9506173            0.9430380            0.9490446 
                  F1           Prevalence       Detection Rate 
           0.9460317            0.4906250            0.4656250 
Detection Prevalence    Balanced Accuracy 
           0.4937500            0.9469149