The data preparation here is intended to understand the indians diabete data in terms of the distributions of exogeneous variables and their correlations. It is important to cote that this step is important to us because it determines how we will fit our logistic model.
We downloaded the diabete data from the kaggle platform manually and we will be reading it from our file system. As we know, the data of indians diabete data which has these variables:
#' load readr package for data import from csv file
#'
library(readr)
diabete <- read_csv("diabetes.csv")
Rows: 768 Columns: 9
-- Column specification ----------------------------------------------------------------------------------------------
Delimiter: ","
dbl (9): Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, BMI, DiabetesPedigreeFunction, Age, Outcome
i Use `spec()` to retrieve the full column specification for this data.
i Specify the column types or set `show_col_types = FALSE` to quiet this message.
#' Transform @diabete as r dataframe
#'
diabete=as.data.frame(diabete)
#' Rename the dataset to be understandable
#'
varnames=c('Gestite','Glucose','TensionA','PliCutane','Insuline','Imc','Index','Age','TestMaladie')
names(diabete)=varnames
#' make a copy of the data set to conserve the original data
#'
diabete1=diabete
#' Transform the target variable
#'
diabete1$TestMaladie=factor(diabete1$TestMaladie)
head(diabete1)
NA
#' Features types
#'
str(diabete1)
'data.frame': 768 obs. of 9 variables:
$ Gestite : num 6 1 8 1 0 5 3 10 2 8 ...
$ Glucose : num 148 85 183 89 137 116 78 115 197 125 ...
$ TensionA : num 72 66 64 66 40 74 50 0 70 96 ...
$ PliCutane : num 35 29 0 23 35 0 32 0 45 0 ...
$ Insuline : num 0 0 0 94 168 0 88 0 543 0 ...
$ Imc : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
$ Index : num 0.627 0.351 0.672 0.167 2.288 ...
$ Age : num 50 31 32 21 33 30 26 29 53 54 ...
$ TestMaladie: Factor w/ 2 levels "0","1": 2 1 2 1 2 1 2 1 2 2 ...
#' features plots
#'
par(mfrow=c(3,3))
for(j in names(diabete1)){
if(class(diabete1[,j])=="numeric"){
hist(diabete1[,j],main = paste0("Histogramm of ",j),xlab=j)
}else{
barplot(table(diabete1[,j]),main = paste0("Bar plot of ",j))
}
}
#' Features summary
summary(diabete1)
Gestite Glucose TensionA PliCutane Insuline Imc Index
Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.00 Min. : 0.0 Min. : 0.00 Min. :0.0780
1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 62.00 1st Qu.: 0.00 1st Qu.: 0.0 1st Qu.:27.30 1st Qu.:0.2437
Median : 3.000 Median :117.0 Median : 72.00 Median :23.00 Median : 30.5 Median :32.00 Median :0.3725
Mean : 3.845 Mean :120.9 Mean : 69.11 Mean :20.54 Mean : 79.8 Mean :31.99 Mean :0.4719
3rd Qu.: 6.000 3rd Qu.:140.2 3rd Qu.: 80.00 3rd Qu.:32.00 3rd Qu.:127.2 3rd Qu.:36.60 3rd Qu.:0.6262
Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00 Max. :846.0 Max. :67.10 Max. :2.4200
Age TestMaladie
Min. :21.00 0:500
1st Qu.:24.00 1:268
Median :29.00
Mean :33.24
3rd Qu.:41.00
Max. :81.00
It can be seen that feature TensionA has 0 values. This is wrong because a the blodd pressure of person can not be 0. The same outcome is observed for Glucose,Insuline and Imc. In the following step, for the purpose of the diabete modelling, we will be deleting these values in the dataset.
#' set the TensionA 0 values as NA
#'
diabete1$TensionA[diabete$TensionA==0]=NA
diabete1$Glucose[diabete$Glucose==0]=NA
diabete1$Insuline[diabete$Insuline==0]=NA
diabete1$Imc[diabete$Imc==0]=NA
#' Remove NA in the dataset
dim(na.omit(diabete1)) #392 9
[1] 392 9
str(diabete1)
'data.frame': 768 obs. of 9 variables:
$ Gestite : num 6 1 8 1 0 5 3 10 2 8 ...
$ Glucose : num 148 85 183 89 137 116 78 115 197 125 ...
$ TensionA : num 72 66 64 66 40 74 50 NA 70 96 ...
$ PliCutane : num 35 29 0 23 35 0 32 0 45 0 ...
$ Insuline : num NA NA NA 94 168 NA 88 NA 543 NA ...
$ Imc : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 NA ...
$ Index : num 0.627 0.351 0.672 0.167 2.288 ...
$ Age : num 50 31 32 21 33 30 26 29 53 54 ...
$ TestMaladie: Factor w/ 2 levels "0","1": 2 1 2 1 2 1 2 1 2 2 ...
diabete.c <- na.omit(diabete1)
#'compute prior probability
#'
prior_proba <- prop.table(table(diabete.c$TestMaladie))
prior_proba
0 1
0.6683673 0.3316327
#'compute prior probability
#'
# par(mfrow=c(2,1))
# couleurs=as.numeric(diabete.c$TestMaladie)+1
p1=plotly::plot_ly(diabete.c,x=~Age,y=~TensionA,z=~TestMaladie,color=~TestMaladie)
# p2=plotly::plot_ly(diabete.c,x=~Age,y=~Glucose,z=~TestMaladie,color=~TestMaladie)
plotly::add_markers(p1)
Warning in RColorBrewer::brewer.pal(N, "Set2") :
minimal value for n is 3, returning requested palette with 3 different levels
Warning in RColorBrewer::brewer.pal(N, "Set2") :
minimal value for n is 3, returning requested palette with 3 different levels
Warning in RColorBrewer::brewer.pal(N, "Set2") :
minimal value for n is 3, returning requested palette with 3 different levels
Warning in RColorBrewer::brewer.pal(N, "Set2") :
minimal value for n is 3, returning requested palette with 3 different levels
# plotly::add_markers(p2)
p2=plotly::plot_ly(diabete.c,x=~Age,y=~Glucose,z=~TestMaladie,color=~TestMaladie)
plotly::add_markers(p2)
Warning in RColorBrewer::brewer.pal(N, "Set2") :
minimal value for n is 3, returning requested palette with 3 different levels
Warning in RColorBrewer::brewer.pal(N, "Set2") :
minimal value for n is 3, returning requested palette with 3 different levels
Warning in RColorBrewer::brewer.pal(N, "Set2") :
minimal value for n is 3, returning requested palette with 3 different levels
Warning in RColorBrewer::brewer.pal(N, "Set2") :
minimal value for n is 3, returning requested palette with 3 different levels
#' analyse of correlation with age
#'
age_classes <- hist(diabete.c$Age,breaks=c(21, 24, 27, 30, 34, 37, 40, 44, 47, 57, 87),plot = F)$breaks
age_prop <- prop.table(table(cut(x=diabete.c$Age,breaks=age_classes),diabete.c$TestMaladie),margin=1)
age_prop
0 1
(21,24] 0.8627451 0.1372549
(24,27] 0.7500000 0.2500000
(27,30] 0.6000000 0.4000000
(30,34] 0.4864865 0.5135135
(34,37] 0.5238095 0.4761905
(37,40] 0.6250000 0.3750000
(40,44] 0.3636364 0.6363636
(44,47] 0.3846154 0.6153846
(47,57] 0.3200000 0.6800000
(57,87] 0.5000000 0.5000000
age_log_logit <- log((prior_proba[2]*age_prop[,2])/(prior_proba[1]*age_prop[,1]))
plot(age_log_logit,type="b",xlab="Age classes")
# analyse of correlation with Glucose
Glucose_classes <- hist(diabete.c$Glucose,plot = F)$breaks
Glucose_prop <- prop.table(table(cut(x=diabete.c$Glucose,breaks=Glucose_classes),diabete.c$TestMaladie),margin=1)
Glucose_prop
0 1
(40,60] 1.00000000 0.00000000
(60,80] 0.88235294 0.11764706
(80,100] 0.90816327 0.09183673
(100,120] 0.80459770 0.19540230
(120,140] 0.61904762 0.38095238
(140,160] 0.50980392 0.49019608
(160,180] 0.18750000 0.81250000
(180,200] 0.13636364 0.86363636
Glucose_log_logit <- log((prior_proba[2]*Glucose_prop[,2])/(prior_proba[1]*Glucose_prop[,1]))
plot(Glucose_log_logit,type="b",xlab="Glucose classes")
NA
NA
# analyse of correlation with TensionA
TensionA_classes <- hist(diabete.c$TensionA,plot = F)$breaks
TensionA_prop <- prop.table(table(cut(x=diabete.c$TensionA,
breaks=TensionA_classes),
diabete.c$TestMaladie),
margin=1)
TensionA_prop
0 1
(20,30] 0.6666667 0.3333333
(30,40] 0.5000000 0.5000000
(40,50] 0.6666667 0.3333333
(50,60] 0.8666667 0.1333333
(60,70] 0.6694915 0.3305085
(70,80] 0.6814159 0.3185841
(80,90] 0.5223881 0.4776119
(90,100] 0.5000000 0.5000000
(100,110] 0.2000000 0.8000000
TensionA_log_logit <- log((prior_proba[2]*TensionA_prop[,2])/(prior_proba[1]*TensionA_prop[,1]))
plot(TensionA_log_logit,type="b",xlab="TensionA classes")
#' Logistic regression
#'
diab_glm0 <- glm(formula=TestMaladie~.++I(Age^2)+I(TensionA^2),family=binomial(),data=diabete.c)
diab_summary_glm0 <- summary(diab_glm0)
diab_summary_glm0
Call:
glm(formula = TestMaladie ~ . + +I(Age^2) + I(TensionA^2), family = binomial(),
data = diabete.c)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.8996 -0.6453 -0.3203 0.6227 2.5167
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.202e+01 3.430e+00 -3.505 0.000457 ***
Gestite 4.020e-02 5.672e-02 0.709 0.478510
Glucose 3.773e-02 5.837e-03 6.464 1.02e-10 ***
TensionA -7.216e-02 7.694e-02 -0.938 0.348280
PliCutane 1.153e-02 1.749e-02 0.659 0.509607
Insuline -2.270e-04 1.343e-03 -0.169 0.865750
Imc 5.669e-02 2.870e-02 1.975 0.048290 *
Index 1.074e+00 4.325e-01 2.482 0.013063 *
Age 3.181e-01 1.040e-01 3.058 0.002226 **
I(Age^2) -3.575e-03 1.322e-03 -2.705 0.006838 **
I(TensionA^2) 4.855e-04 5.471e-04 0.887 0.374849
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 498.1 on 391 degrees of freedom
Residual deviance: 333.1 on 381 degrees of freedom
AIC: 355.1
Number of Fisher Scoring iterations: 5
#' Logistic regression after removing some features
#'
diab_glm1 <- update(object=diab_glm0,formula.=.~.-Gestite-TensionA-PliCutane-Insuline-I(TensionA^2))
diab_summary_glm1 <- summary(diab_glm1)
best_full_model=diab_summary_glm1
diab_summary_glm1
Call:
glm(formula = TestMaladie ~ Glucose + Imc + Index + Age + I(Age^2),
family = binomial(), data = diabete.c)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.9191 -0.6412 -0.3253 0.6411 2.6131
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -14.988815 2.030534 -7.382 1.56e-13 ***
Glucose 0.036648 0.005072 7.225 5.02e-13 ***
Imc 0.068166 0.020464 3.331 0.000865 ***
Index 1.076931 0.424170 2.539 0.011120 *
Age 0.343708 0.096455 3.563 0.000366 ***
I(Age^2) -0.003797 0.001270 -2.990 0.002786 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 498.10 on 391 degrees of freedom
Residual deviance: 334.81 on 386 degrees of freedom
AIC: 346.81
Number of Fisher Scoring iterations: 5
anova(diab_glm0,diab_glm1)
Analysis of Deviance Table
Model 1: TestMaladie ~ Gestite + Glucose + TensionA + PliCutane + Insuline +
Imc + Index + Age + +I(Age^2) + I(TensionA^2)
Model 2: TestMaladie ~ Glucose + Imc + Index + Age + I(Age^2)
Resid. Df Resid. Dev Df Deviance
1 381 333.10
2 386 334.81 -5 -1.7036
#' We divide the dataset into two part as learning and testin data
#'
var_name= setdiff(names(diabete.c),c("TestMaladie"))
diabete.c0=diabete.c
target_levels <- unique(diabete.c[,"TestMaladie"])
nobs <- nrow(diabete.c0)
MatPerf=matrix(data=0,nrow=20,ncol=2)
# for(w in 1:100){
#' Diabete persons data
#'
diabete.c_pos0 <- diabete.c0[diabete.c0$TestMaladie==target_levels[2],]
spl_pos <- sample(x=nrow(diabete.c_pos0),size=125,replace=F)
diabete.c_pos <- diabete.c_pos0[spl_pos,]
diabete.c_pos_test <- diabete.c_pos0[-spl_pos,]
#' Non diabete persons data
#'
diabete.c_neg0 <- diabete.c0[diabete.c0$TestMaladie==target_levels[1],]
spl_neg <- sample(x=nrow(diabete.c_neg0),size=242,replace=F)
diabete.c_neg <- diabete.c_neg0[spl_neg,]
diabete.c_neg_test <- diabete.c_neg0[-spl_neg,]
#' Learning data
#'
logistic_data <- rbind.data.frame(diabete.c_pos,diabete.c_neg)
spl_l <- sample(x=nrow(logistic_data),replace=F)
logistic_data <- logistic_data[spl_l,]
logistic_data$TestMaladie <- as.factor(logistic_data$TestMaladie )
#' Testing data
#'
logistic_data_test <- rbind.data.frame(diabete.c_pos_test,diabete.c_neg_test)
spl_t <- sample(x=nrow(logistic_data_test),replace=F)
logistic_data_test <- logistic_data_test[spl_t,]
logistic_data_test$TestMaladie <- as.factor(logistic_data_test$TestMaladie )
#' AUC function
roc_auc <- function(TPR,FPR){
AUC <- numeric(length(TPR))
for(j in 1:(length(TPR)-1)){
AUC[j] <- abs(FPR[j+1]-FPR[j])*((TPR[j+1]+TPR[j])/2)
}
# AUC[length(TPR)] <- abs(1-FPR[length(FPR)])*((1+TPR[length(TPR)])/2)
sum(AUC)
}
#' Model performance function
#'
compute_perf <- function(threshold_proba,data,cv,nobs,seed,adjusted_model,target_column,spl,target_levels){
set.seed(seed) # Always use the same random values
tpr <- 0
fpr <-0
sens <- 0
spec <-0
err <- 0
n1 <-0
n2 <- 0
k <- floor(nobs/cv+0.5)
n1 <- n2+1
n2 <- n2+k
thrs_tpr <- numeric(cv)
thrs_fpr <- numeric(cv)
j <- 0
while(n2<=nobs){
j <- j+1
training_data <- data[-spl[(n1:n2)],]
test_data <- data[spl[(n1:n2)],-target_column]
y_obs <- data[spl[(n1:n2)],target_column]
diab_glmcv <- update(object=adjusted_model,formula.=.~.,data=training_data)
pred_proba <- predict(diab_glmcv,newdata=test_data,type="response")
pred_y <- as.numeric(pred_proba>threshold_proba)
pred_class <- sapply((1 + pred_y) , function(r) target_levels[r])
# classifier's TPR and FPR target_levels[2]
# target_levels[2]
tpr <- tpr+sum((pred_class==target_levels[2])&( y_obs ==target_levels[2]))/sum( y_obs ==target_levels[2])
fpr <- fpr+sum((pred_class==target_levels[2])&( y_obs ==target_levels[1]))/sum( y_obs ==target_levels[1])
sens <- tpr
spec <- spec+(1-sum((pred_class==target_levels[2])&( y_obs ==target_levels[1]))/sum( y_obs ==target_levels[1]))
err <- err+sum(pred_class!= y_obs)/(length(y_obs))
# classifier's sample tpr and fpr
# thrs_tpr[j] <- sum((pred_class==target_levels[2])&( y_obs ==target_levels[2]))/sum( y_obs ==target_levels[2])
# thrs_fpr[j] <- sum((pred_class==target_levels[2])&( y_obs ==target_levels[1]))/sum( y_obs ==target_levels[1])
#next step
n1 <- n2+1
n2 <- n2+k
}
# thrs_auc <- roc_auc(TPR=thrs_tpr,FPR=thrs_fpr)
return(c(thrs=threshold_proba,tpr=tpr/cv,fpr=fpr/cv,err=err/cv,auc=0.5*((sens+spec)/cv),sens=sens/cv,spec=spec/cv))
}
#' Perfomance measurement function
backtest_perffunc <- function(logistic_data,factor_target_var,nobs,ncv,optim_model,target_levels){
target_column <- which(names(logistic_data)==factor_target_var)
p0 <- predict(optim_model, type = 'response')
p0 <- sort(unique(p0 [p0< 1]))
p0 <- unique(round(.5 * (p0 [-length(p0 )] + p0 [-1]),2))
spl<-rank(runif(nobs)) #random sampling vector
# p0 <- seq(from=0,to=1,by=0.01)
# p0 <- p0[p0!=0&p0!=1]
performance_table <- sapply(X=p0,
FUN=compute_perf,
data=logistic_data,
cv=ncv,nobs=nobs,
seed=1,
adjusted_model=optim_model,
target_column=target_column,
spl=spl,
target_levels)
performance_table <- t(performance_table)
performance_table <- data.frame(performance_table)
rownames(performance_table) <- NULL
r=which.max(performance_table$auc)
# r=which.max(performance_table$thrs_auc)
par(mfrow=c(1,2))
plot(x=performance_table$fpr,y=performance_table$tpr,type="b",cex=0.8,xlab="FPR",ylab="TPR",main="Classifiers' ROC curve",cex.main=0.8)
lines(x=c(0,1),y=c(0,1),col=2,lwd=2)
points(x=performance_table$fpr[r],y=performance_table$tpr[r],col=2,lwd=2)
plot(x=performance_table$thrs,y=performance_table$sens,type="b",col=1,cex=0.8,xlab="Classification probability threshold",ylab="Sensibility/Specificity",main="Sensibility and Specificity curves",cex.main=0.8)
points(x=performance_table$thrs,y=performance_table$spec,type="b",col=2)
abline(v=performance_table$thrs[r],col=3,lwd=2)
optim_thrs <- max(performance_table$thrs[performance_table$sens>performance_table$spec])
abline(v=optim_thrs,col=4,lwd=2)
res_class <- rbind.data.frame(performance_table[r,],performance_table[performance_table$thrs==optim_thrs,])
legend("right",
legend=c("Sensibility","Specificity","Max AUC probability threshold","Optimal probability threshold"),
col=c(1,2,3,4),
lty=c(1,1,1,1),
pch=c(1,1,NA,NA),
cex=0.8,
lwd=2)
text(x=res_class$thrs,y=c(0.3,0.3),labels=res_class$thrs,pos=c(2,4),col=c(3,4),lwd=2,cex=0.8)
rownames(res_class) <- c("Max AUC probability threshold","Optimal probability threshold")
list(performance_table=performance_table,optim_performance=res_class)
}
#' Apply functions
#'
best_full_model=diab_glm1 #best model
nobs <- nrow(logistic_data)
ncv <- 20
#Type of variable
class_of_var <- sapply(logistic_data, class)
numeric_var <- names(class_of_var)[class_of_var%in%c("numeric","int")]
char_factor_var <- setdiff(names(class_of_var),numeric_var)
factor_target_var <- "TestMaladie" #"Class"
target_levels <- levels(logistic_data[,factor_target_var])
# levels(logistic_data[,factor_target_var])<-c(0,1)
prior_proba <- prop.table(table(logistic_data[,factor_target_var]))
backtest_stat_perf <- backtest_perffunc(logistic_data=logistic_data,
factor_target_var=factor_target_var,
nobs=nobs,
ncv=ncv,
optim_model=best_full_model,
target_levels=target_levels)
#' Forecasting function
#'
predict_logistic <- function(best_threshold_proba,test_data,seed=NULL,adjusted_model,target_levels){
if(!is.null(seed))
set.seed(seed)
pred_proba <- predict(adjusted_model,newdata=test_data,type="response")
pred_y <- as.numeric(pred_proba>best_threshold_proba)
pred_class <- sapply( pred_y+1, function(r) target_levels[r])
cbind.data.frame(predicted_class=pred_class,predicted_proba=pred_proba)
}
pred_class<- predict_logistic(best_threshold_proba=backtest_stat_perf$optim_performance$thrs[1],
test_data=logistic_data_test,seed=1,
adjusted_model=best_full_model,
target_levels=target_levels)
pred_class
perfo_proba <- prop.table(table(logistic_data_test[,factor_target_var],pred_class$predicted_class),margin=1)
#' Confuse matrix
#'
perfo_proba*100
0 1
0 90 10
1 20 80