Unit 5 Classification and Regression Trees (CART)
5.1 Introduction to trees - Judge, Jury, and Classifier
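The stevens data frame is assumed to be loaded along these lines (the file name here is an assumption):
stevens <- read.csv("stevens.csv") #file name assumed
str(stevens)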
## 'data.frame': 566 obs. of 9 variables:
## $ Docket : Factor w/ 566 levels "00-1011","00-1045",..: 63 69 70 145 97 181 242 289 334 436 ...
## $ Term : int 1994 1994 1994 1994 1995 1995 1996 1997 1997 1999 ...
## $ Circuit : Factor w/ 13 levels "10th","11th",..: 4 11 7 3 9 11 13 11 12 2 ...
## $ Issue : Factor w/ 11 levels "Attorneys","CivilRights",..: 5 5 5 5 9 5 5 5 5 3 ...
## $ Petitioner: Factor w/ 12 levels "AMERICAN.INDIAN",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ Respondent: Factor w/ 12 levels "AMERICAN.INDIAN",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ LowerCourt: Factor w/ 2 levels "conser","liberal": 2 2 2 1 1 1 1 1 1 1 ...
## $ Unconst : int 0 0 0 0 0 1 0 1 0 0 ...
## $ Reverse : int 1 1 1 1 1 0 1 1 1 1 ...
...
#Split data into Training and Testing data set
library(caTools)
set.seed(3000)
spl <- sample.split(stevens$Reverse, SplitRatio = 0.7)
Train <- subset(stevens, spl == TRUE)
Test <- subset(stevens, spl == FALSE)
library(rpart)
library(rpart.plot)
#Build classification Tree
stevensTree <- rpart(Reverse~ Circuit+ Issue + Petitioner + Respondent + LowerCourt + Unconst, data = Train, method = "class", minbucket=25)
#Plotting rpart trees
prp(stevensTree)
#Predict with tree
predictCART <- predict(stevensTree, newdata = Test, type = "class")
table(Test$Reverse, predictCART) #Confusion matrix
## predictCART
## 0 1
## 0 41 36
## 1 22 71
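Accuracy can be read off this confusion matrix; a quick sketch of the calculation:
(41+71)/(41+36+22+71) #test-set accuracy of the CART model, about 0.659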
#Generate ROC Curve
library(ROCR) #load ROCR library
#Probability Prediction
PredictROC <- predict(stevensTree, newdata=Test) #ROC prediction
pred<- prediction(PredictROC[,2], Test$Reverse) #Use second column as we are trying to predict Reverse = 1.
perf <- performance(pred, "tpr","fpr") # performance Object to be plotted
plot(perf) #Plot ROC Curve
as.numeric(performance(pred, "auc")@y.values) #AUC of the CART model
## [1] 0.6927105
5.1.1 Random Forests
Random forests are designed to improve the prediction accuracy of CART. They work by building a large number of CART trees, and a prediction is made by having each tree vote on the outcome (see the voting sketch after the confusion matrix below).
library(randomForest)
#For classification problems, make sure dependent variable is a factor
Train$Reverse <- as.factor(Train$Reverse)
Test$Reverse <- as.factor(Test$Reverse)
#create random forests
StevensForest <- randomForest(Reverse~ Circuit+ Issue + Petitioner + Respondent + LowerCourt + Unconst, data = Train, nodesize=25, ntree=200)
PredictForest <- predict(StevensForest, newdata = Test)# prediction using random forest
table(Test$Reverse, PredictForest) # confusion Matrix
## PredictForest
## 0 1
## 0 43 34
## 1 16 77
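From this matrix the forest's accuracy is (43+77)/170, about 0.706, an improvement over the single tree's roughly 0.659. To see the per-tree voting described above, a minimal sketch using the predict.all argument of randomForest's predict method:
PredictAllVotes <- predict(StevensForest, newdata = Test, predict.all = TRUE) #keep every tree's prediction
table(PredictAllVotes$individual[1, ]) #how the 200 trees voted for the first test case
PredictAllVotes$aggregate[1] #the forest's prediction is the majority vote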
5.1.2 Cross validation
The minbucket parameter should not be chosen simply by picking the value with the best training accuracy, because that does not translate into prediction accuracy on new data.
5.1.2.1 K-fold cross validation
Split the training set into k pieces (folds). For each candidate parameter value, build the model on k-1 folds, measure accuracy on the held-out fold, rotate through all k folds, and average the results; the parameter value with the best average accuracy is selected.
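A sketch of the procedure for a single candidate value (cp = 0.2 and the seed are illustrative choices; the train() call below automates this over a whole grid of cp values):
library(caret)
set.seed(3000)
folds <- createFolds(Train$Reverse, k = 10) #list of held-out row indices, one element per fold
cvAccuracy <- sapply(folds, function(idx) {
  fit <- rpart(Reverse ~ Circuit + Issue + Petitioner + Respondent + LowerCourt + Unconst,
               data = Train[-idx, ], method = "class", cp = 0.2) #build on the other 9 folds
  pred <- predict(fit, newdata = Train[idx, ], type = "class")
  mean(pred == Train[idx, "Reverse"]) #accuracy on the held-out fold
})
mean(cvAccuracy) #cross-validated accuracy estimate for cp = 0.2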
library(caret)
library(e1071)
numFolds <- trainControl(method="cv", number=10) # cv = cross-validation; number = number of folds (buckets)
cpGrid <- expand.grid(.cp=seq(0.01,0.5,0.01)) #cp parameters to test: values from 0.01 to 0.5 in increments of 0.01
train(Reverse~ Circuit+ Issue + Petitioner + Respondent + LowerCourt + Unconst, data = Train, method="rpart", trControl=numFolds, tuneGrid = cpGrid)
## CART
##
## 396 samples
## 6 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 356, 356, 357, 357, 356, 356, ...
## Resampling results across tuning parameters:
...
#New CART model using the cross-validated cp value instead of minbucket
StevensTreeCV <- rpart(Reverse ~ Circuit + Issue + Petitioner + Respondent + LowerCourt + Unconst, data = Train, method = "class", cp = 0.19)
PredictCV <- predict(StevensTreeCV, newdata= Test, type="class")
table(Test$Reverse, PredictCV) #Confusion Matrix shows increased accuracy to 0.7235294
## PredictCV
## 0 1
## 0 59 18
## 1 29 64
5.2 The D2Hawkeye Story
Predict the cost bucket a patient falls into in 2009 using a CART model.
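The Claims data frame is assumed to be loaded along these lines (the file name is an assumption):
Claims <- read.csv("ClaimsData.csv") #file name assumed
str(Claims)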
## 'data.frame': 458005 obs. of 16 variables:
## $ age : int 85 59 67 52 67 68 75 70 67 67 ...
## $ alzheimers : int 0 0 0 0 0 0 0 0 0 0 ...
## $ arthritis : int 0 0 0 0 0 0 0 0 0 0 ...
## $ cancer : int 0 0 0 0 0 0 0 0 0 0 ...
## $ copd : int 0 0 0 0 0 0 0 0 0 0 ...
## $ depression : int 0 0 0 0 0 0 0 0 0 0 ...
## $ diabetes : int 0 0 0 0 0 0 0 0 0 0 ...
## $ heart.failure : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ihd : int 0 0 0 0 0 0 0 0 0 0 ...
...
table(Claims$bucket2009)/nrow(Claims) #proportion of patients in each 2009 cost bucket
##
## 1 2 3 4 5
## 0.671267781 0.190170413 0.089466272 0.043324855 0.005770679
#Split data into training and testing sets
library(caTools) #sample.split splits the data randomly while preserving the ratio of the outcome variable in both sets
# Put 60% of data in training set
set.seed(88)
spl <- sample.split(Claims$bucket2009, SplitRatio = 0.6)
ClaimsTrain <- subset(Claims, spl == TRUE)
ClaimsTest <- subset(Claims, spl == FALSE)
5.2.1 Baseline method and Penalty Matrix
The baseline method predicts that a patient's cost bucket in 2009 will be the same as it was in 2008.
#Classification matrix
ClassificationMatrix <- table(ClaimsTest$bucket2009, ClaimsTest$bucket2008) #Outcome = bucket2009, prediction = bucket2008
ClassificationMatrix
##
## 1 2 3 4 5
## 1 110138 7787 3427 1452 174
## 2 16000 10721 4629 2931 559
## 3 7006 4629 2774 1621 360
## 4 2688 1943 1415 1539 352
## 5 293 191 160 309 104
sum(diag(ClassificationMatrix))/nrow(ClaimsTest) #baseline accuracy
## [1] 0.6838135
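An equivalent one-liner for the baseline accuracy (a sketch):
mean(ClaimsTest$bucket2009 == ClaimsTest$bucket2008) #share of patients whose 2009 bucket matches their 2008 bucket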
PenaltyMatrix <- matrix(c(0,1,2,3,4,2,0,1,2,3,4,2,0,1,2,6,4,2,0,1,8,6,4,2,0),byrow = T, nrow=5)
PenaltyMatrix
## [,1] [,2] [,3] [,4] [,5]
## [1,] 0 1 2 3 4
## [2,] 2 0 1 2 3
## [3,] 4 2 0 1 2
## [4,] 6 4 2 0 1
## [5,] 8 6 4 2 0
#Penalty error = sum(penalty matrix * classification matrix) / number of observations
#Rows are the actual bucket and columns are the predicted bucket; under-predicting cost is penalized more heavily than over-predicting it
sum(PenaltyMatrix*ClassificationMatrix)/nrow(ClaimsTest)
## [1] 0.7386055
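To see where the penalty comes from, one cell worked out as a sketch: the 293 patients actually in bucket 5 but predicted as bucket 1 each incur a penalty of 8.
PenaltyMatrix[5,1]*ClassificationMatrix[5,1] #8 * 293 = 2344 penalty points from this single cell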
Now, we’ll create a CART model to improve the penalty error and accuracy
library(rpart)
library(rpart.plot)
#The cp value was selected via cross-validation (omitted here to save time)
ClaimsTree <- rpart(bucket2009 ~ age + arthritis + alzheimers + cancer + copd + depression + diabetes + heart.failure + ihd + kidney + osteoporosis + stroke + bucket2008 + reimbursement2008, data= ClaimsTrain, method = "class", cp=0.00005 )
prp(ClaimsTree) #display tree
PredictTest <- predict (ClaimsTree, newdata = ClaimsTest, type="class")
ClassificationMatrix <- table(ClaimsTest$bucket2009, PredictTest)
ClassificationMatrix
## PredictTest
## 1 2 3 4 5
## 1 114987 7884 0 107 0
## 2 18692 16051 0 97 0
## 3 8188 8120 0 82 0
## 4 3176 4567 0 194 0
## 5 349 671 0 37 0
## [1] 0.7126669
sum(PenaltyMatrix*ClassificationMatrix)/nrow(ClaimsTest) #penalty error
## [1] 0.7591238
The CART model rarely predicts buckets 3, 4, and 5 because the data has few observations in those classes. This can be rectified by supplying the penalty matrix to the model as a loss matrix.
ClaimsTree <- rpart(bucket2009 ~ age + arthritis + alzheimers + cancer + copd + depression + diabetes + heart.failure + ihd + kidney + osteoporosis + stroke + bucket2008 + reimbursement2008, data= ClaimsTrain, method = "class", cp=0.00005, parms=list(loss=PenaltyMatrix) )
prp(ClaimsTree)
PredictTest <- predict (ClaimsTree, newdata = ClaimsTest, type="class")
ClassificationMatrix <- table(ClaimsTest$bucket2009, PredictTest)
ClassificationMatrix
## PredictTest
## 1 2 3 4 5
## 1 93651 26529 2571 227 0
## 2 6913 20324 7049 554 0
## 3 3499 8184 4370 337 0
## 4 1264 3403 2702 568 0
## 5 129 375 411 142 0
sum(diag(ClassificationMatrix))/nrow(ClaimsTest) #accuracy
## [1] 0.6490813
sum(PenaltyMatrix*ClassificationMatrix)/nrow(ClaimsTest) #penalty error
## [1] 0.6377987
Accuracy falls relative to the unweighted CART model, but the penalty error improves, which is the metric of interest here.
5.3 Recitation - Boston Housing Data (Regression Trees)
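The boston data frame is assumed to be loaded along these lines (the file name is an assumption):
boston <- read.csv("boston.csv") #file name assumed
str(boston)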
## 'data.frame': 506 obs. of 16 variables:
## $ TOWN : Factor w/ 92 levels "Arlington","Ashland",..: 54 77 77 46 46 46 69 69 69 69 ...
## $ TRACT : int 2011 2021 2022 2031 2032 2033 2041 2042 2043 2044 ...
## $ LON : num -71 -71 -70.9 -70.9 -70.9 ...
## $ LAT : num 42.3 42.3 42.3 42.3 42.3 ...
## $ MEDV : num 24 21.6 34.7 33.4 36.2 28.7 22.9 22.1 16.5 18.9 ...
## $ CRIM : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ ZN : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ INDUS : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ CHAS : int 0 0 0 0 0 0 0 0 0 0 ...
...
plot(boston$LON, boston$LAT)
#See which points lie along the Charles River
points(boston$LON[boston$CHAS ==1], boston$LAT[boston$CHAS == 1], col="blue", pch=19)
points(boston$LON[boston$TRACT ==3531], boston$LAT[boston$TRACT == 3531], col="red", pch=19) #tract 3531 (MIT) shown in red
summary(boston$NOX) #air pollution distribution
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.3850 0.4490 0.5380 0.5547 0.6240 0.8710
summary(boston$MEDV) #median home value (in $1000s) distribution
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.00 17.02 21.20 22.53 25.00 50.00
5.3.1 Geographical Predictions
latlonlm <- lm(MEDV ~ LAT + LON, data = boston) #linear regression using only location
summary(latlonlm)
##
## Call:
## lm(formula = MEDV ~ LAT + LON, data = boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.460 -5.590 -1.299 3.695 28.129
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
...
#Census tracts with above-median home values
plot(boston$LON, boston$LAT)
points(boston$LON[boston$MEDV >21.2], boston$LAT[boston$MEDV >21.2], col="red", pch=19)
#See performance of linear model
points(boston$LON[latlonlm$fitted.values >21.2], boston$LAT[latlonlm$fitted.values >21.2], col="blue", pch="$")
#Regression Trees
library(rpart)
library(rpart.plot)
latlontree<- rpart(MEDV ~ LAT + LON, data = boston)
prp(latlontree) #Regression trees predict the average of the dependent variable (MEDV, median home value) within each leaf (bucket)
#See performance of Regression tree
plot(boston$LON, boston$LAT)
points(boston$LON[boston$MEDV >21.2], boston$LAT[boston$MEDV >21.2], col="red", pch=19)
fittedvalues <- predict(latlontree)
points(boston$LON[fittedvalues >21.2], boston$LAT[fittedvalues >21.2], col="blue", pch="$")
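As a sketch of the point made in the comment above, each leaf's prediction should equal the mean MEDV of the observations routed to that leaf, which latlontree$where records:
tapply(boston$MEDV, latlontree$where, mean) #mean MEDV per leaf; should match the values printed by prp(latlontree)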
The tree is much better at identifying the above-median-value tracts, although it is still complex. We'll try a simpler tree by specifying a minimum bucket size and observe its performance.
latlontree<- rpart(MEDV ~ LAT + LON, data = boston, minbucket=50)
prp(latlontree) #Tree has only 5 splits now
plot(boston$LON, boston$LAT)
abline(v=-71.07)# First split on tree
abline(h=42.21)# Second split on tree to the left
abline(h =42.17)# third left split
points(boston$LON[boston$MEDV >21.2], boston$LAT[boston$MEDV >21.2], col="red", pch=19)
The regression tree has carved out the low-value area in the center, something linear regression is incapable of doing.
Now we'll use more variables than longitude and latitude to build a more accurate model.
set.seed(123)
split <- sample.split(boston$MEDV, SplitRatio = 0.7)
train <- subset(boston, split==T)
test <- subset(boston, split==F)
linreg <- lm(MEDV ~ . , data = train[!(names(train) %in% c('TOWN','TRACT'))])
summary(linreg)
##
## Call:
## lm(formula = MEDV ~ ., data = train[!(names(train) %in% c("TOWN",
## "TRACT"))])
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.511 -2.712 -0.676 1.793 36.883
##
## Coefficients:
...
linreg.pred <- predict(linreg, newdata = test)
linreg.sse <- sum((linreg.pred - test$MEDV)^2) #Sum of squared errors
linreg.sse
## [1] 3037.088
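To put this error on the scale of MEDV (thousands of dollars), a sketch of the corresponding RMSE:
sqrt(linreg.sse/nrow(test)) #root mean squared error of the linear model on the test set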
#Can we improve the SSE with a regression tree?
tree<- rpart(MEDV ~ . , data = train[!(names(train) %in% c('TOWN','TRACT'))])
prp(tree)
## Warning: Bad 'data' field in model 'call' (expected a data.frame or a matrix).
## To silence this warning:
## Call prp with roundint=FALSE,
## or rebuild the rpart model with model=TRUE.
tree.pred <- predict(tree, newdata = test)
tree.sse <- sum((tree.pred -test$MEDV)^2)
tree.sse #SSE is higher than the linear model's
## [1] 4328.988
5.3.2 Cross validation
library(e1071)
library(caret)
tr.control <-trainControl(method = "cv", number=10)
cp.grid <- expand.grid(.cp=(0:10)*0.001) #cp range of values to be tried
tr <- train(MEDV ~ . , data = train[!(names(train) %in% c('TOWN','TRACT'))], method = "rpart", trControl = tr.control, tuneGrid = cp.grid)
tr #cp of 0.001 was the best
## CART
##
## 364 samples
## 13 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 327, 326, 328, 329, 328, 328, ...
## Resampling results across tuning parameters:
##
...
best.tree = tr$finalModel #the tree refit by caret with the best cp value
best.tree.pred = predict(best.tree, newdata=test)
#Sum of squared errors
best.tree.sse = sum((best.tree.pred - test$MEDV)^2)
best.tree.sse
## [1] 3660.149
The cross-validated tree improves on the default regression tree (SSE 4329) but still does not beat the linear regression model (SSE 3037) on this test set.
5.4 Assignment
5.4.1 Part 1 - Understanding why people vote
5.4.1.1 Problem 1 - Exploration and Logistic Regression
#1.1 - What proportion of people in this dataset voted in this election?
gerber <- read.csv("week4/gerber.csv")
summary(gerber) # Answer - 0.3159
## sex yob voting hawthorne
## Min. :0.0000 Min. :1900 Min. :0.0000 Min. :0.000
## 1st Qu.:0.0000 1st Qu.:1947 1st Qu.:0.0000 1st Qu.:0.000
## Median :0.0000 Median :1956 Median :0.0000 Median :0.000
## Mean :0.4993 Mean :1956 Mean :0.3159 Mean :0.111
## 3rd Qu.:1.0000 3rd Qu.:1965 3rd Qu.:1.0000 3rd Qu.:0.000
## Max. :1.0000 Max. :1986 Max. :1.0000 Max. :1.000
## civicduty neighbors self control
## Min. :0.0000 Min. :0.000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.0000
...
#1.2 - Which of the four "treatment groups" had the largest percentage of people who actually voted (voting = 1)?
table(gerber$voting, gerber$civicduty)
##
## 0 1
## 0 209191 26197
## 1 96675 12021
summary(subset(gerber, voting == 1)) #distribution of treatment groups among people who voted. Answer - neighbors
## sex yob voting hawthorne civicduty
## Min. :0.0000 Min. :1900 Min. :1 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:1945 1st Qu.:1 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :1954 Median :1 Median :0.0000 Median :0.0000
## Mean :0.4898 Mean :1953 Mean :1 Mean :0.1133 Mean :0.1106
## 3rd Qu.:1.0000 3rd Qu.:1962 3rd Qu.:1 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1986 Max. :1 Max. :1.0000 Max. :1.0000
## neighbors self control
## Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
...
#1.3 - Logistic regression model for voting. Which coefficients are significant?
logModel <- glm(voting ~ hawthorne + civicduty +neighbors +self, data = gerber, family = "binomial")
summary(logModel)
##
## Call:
## glm(formula = voting ~ hawthorne + civicduty + neighbors + self,
## family = "binomial", data = gerber)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.9744 -0.8691 -0.8389 1.4586 1.5590
##
## Coefficients:
...
#1.4 - Accuracy of model
votingLogPrediction <- predict(logModel, type = "response")
table(gerber$voting, votingLogPrediction > 0.3)
##
## FALSE TRUE
## 0 134513 100875
## 1 56730 51966
(134513+51966)/nrow(gerber) #accuracy with a threshold of 0.3
## [1] 0.5419578
#1.6 AUC
library(ROCR)
pred<- prediction(votingLogPrediction, gerber$voting)
perf <- performance(pred, "tpr","fpr") # performance Object to be plotted
plot(perf) #Plot ROC Curve
as.numeric(performance(pred, "auc")@y.values) #AUC
## [1] 0.5308461
5.4.1.2 Problem 2 - Trees
library(rpart)
library(rpart.plot)
#2.1
CARTmodel = rpart(voting ~ civicduty + hawthorne + self + neighbors, data=gerber)
prp(CARTmodel)
#2.2
CARTmodel2 = rpart(voting ~ civicduty + hawthorne + self + neighbors, data=gerber, cp=0.0)
prp(CARTmodel2)
#2.4 - Make a new tree that includes the "sex" variable, again with cp = 0.0
CARTmodel3 = rpart(voting ~ civicduty + hawthorne + self + neighbors + sex, data=gerber, cp=0.0)
prp(CARTmodel3)
5.4.1.3 Problem 3 - Interaction Terms
#3.1 - Tree with only the control variable, and a tree with control and sex variables
CARTmodel = rpart(voting ~ control, data=gerber, cp=0.0)
prp(CARTmodel, digits = 6)
abs(0.34 - 0.296638) #difference in predicted probability of voting: not in control group (0.34) vs in control group (0.296638), read from the tree
## [1] 0.043362
#3.4
Possibilities = data.frame(sex=c(0,0,1,1),control=c(0,1,0,1))
logModelSex <- glm(voting ~ control + sex, data = gerber, family = "binomial")
summary(logModelSex)
##
## Call:
## glm(formula = voting ~ control + sex, family = "binomial", data = gerber)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.9220 -0.9012 -0.8290 1.4564 1.5717
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
...
predict(logModelSex, newdata=Possibilities, type="response") #predicted voting probability for each (sex, control) combination
## 1 2 3 4
## 0.3462559 0.3024455 0.3337375 0.2908065
#3.5 - How do you interpret the coefficient for the new variable in isolation? That is, how does it relate to the dependent variable?
LogModel2 = glm(voting ~ sex + control + sex:control, data=gerber, family="binomial") # sex:control is an interaction term: it is 1 only when both sex and control are 1
summary(LogModel2) # Ans - If a person is a woman and in the control group, the chance that she voted goes down
##
## Call:
## glm(formula = voting ~ sex + control + sex:control, family = "binomial",
## data = gerber)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.9213 -0.9019 -0.8284 1.4573 1.5724
##
## Coefficients:
...
predict(LogModel2, newdata=Possibilities, type="response") #predicted probabilities with the interaction term
## 1 2 3 4
## 0.3458183 0.3027947 0.3341757 0.2904558
5.4.2 Part 2 - Letter Recognition
5.4.2.1 Problem 1 - Predicting B or not B
letters <- read.csv("week4/letters_ABPR.csv")
letters$isB <- as.factor(letters$letter == "B")
#Split data into Train and Test set by 50%
library(caTools)
set.seed(1000)
spl <- sample.split(letters$isB, SplitRatio = 0.5)
lettersTrain <- subset(letters, spl == TRUE)
lettersTest <- subset(letters, spl == FALSE)
#1.1 - Baseline model always not B accuracy
table(lettersTest$isB)
##
## FALSE TRUE
## 1175 383
1175/(1175+383) #baseline accuracy: always predict "not B"
## [1] 0.754172
#1.2 Classification tree accuracy
CARTb = rpart(isB ~ . - letter, data=lettersTrain, method="class")
prp(CARTb)
predictCART <- predict(CARTb, newdata = lettersTest, type = "class")
table(lettersTest$isB, predictCART) #Confusion matrix
## predictCART
## FALSE TRUE
## FALSE 1118 57
## TRUE 43 340
(1118+340)/nrow(lettersTest) #CART accuracy
## [1] 0.9358151
#1.3 - Random Forest accuracy
library(randomForest)
set.seed(1000)
lettersForest <- randomForest(isB ~ . - letter, data=lettersTrain)
predictForest <- predict(lettersForest, newdata = lettersTest)# prediction using random forest
table(lettersTest$isB, predictForest) # confusion Matrix
## predictForest
## FALSE TRUE
## FALSE 1163 12
## TRUE 9 374
(1163+374)/nrow(lettersTest) #random forest accuracy
## [1] 0.9865212
5.4.2.2 Problem 2 - Predicting the letters A, B, P, R
letters$letter <- as.factor(letters$letter)
set.seed(2000)
spl <- sample.split(letters$letter, SplitRatio = 0.5)
lettersTrain <- subset(letters, spl == TRUE)
lettersTest <- subset(letters, spl == FALSE)
#2.1 - What is the baseline accuracy?
table(lettersTest$letter)
##
## A B P R
## 395 383 401 379
401/nrow(lettersTest) #baseline accuracy: always predict the most frequent class (P)
## [1] 0.2573813
#2.2 What is the test set accuracy of CART model?
CARTb = rpart(letter ~ . - isB, data=lettersTrain, method="class")
prp(CARTb)
predictCART <- predict(CARTb, newdata = lettersTest, type = "class")
table(lettersTest$letter, predictCART)
## predictCART
## A B P R
## A 348 4 0 43
## B 8 318 12 45
## P 2 21 363 15
## R 10 24 5 340
(348+318+363+340)/nrow(lettersTest) #CART accuracy
## [1] 0.8786906
#2.3 - Random Forest
library(randomForest)
set.seed(1000)
lettersForest <- randomForest(letter ~ . - isB, data=lettersTrain)
predictForest <- predict(lettersForest, newdata = lettersTest)# prediction using random forest
table(lettersTest$letter, predictForest) # confusion Matrix
## predictForest
## A B P R
## A 391 0 3 1
## B 0 380 1 2
## P 0 6 394 1
## R 3 14 0 362
(391+380+394+362)/nrow(lettersTest) #random forest accuracy
## [1] 0.9801027
5.4.3 Part 3 - Predicting Earnings from Census Data
5.4.3.1 Problem 1 - A Logistic Regression Model
census <- read.csv("week4/census.csv")
set.seed(2000)
spl <- sample.split(census$over50k, SplitRatio = 0.6)
censusTrain <- subset(census, spl == TRUE)
censusTest <- subset(census, spl == FALSE)
#1.1 - build a logistic regression model to predict the dependent variable "over50k", using all of the other variables in the dataset
logCensusModel <- glm(over50k ~ ., data= censusTrain, family="binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(logCensusModel)
##
## Call:
## glm(formula = over50k ~ ., family = "binomial", data = censusTrain)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.1065 -0.5037 -0.1804 -0.0008 3.3383
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
...
#1.2 - Model accuracy
logPrediction <- predict(logCensusModel, newdata = censusTest, type="response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
summary(logPrediction)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.01388 0.10074 0.24115 0.39293 1.00000
table(censusTest$over50k, logPrediction > 0.5) #confusion matrix at a threshold of 0.5
##
## FALSE TRUE
## <=50K 9051 662
## >50K 1190 1888
(9051+1888)/nrow(censusTest) #logistic regression accuracy
## [1] 0.8552107
#Baseline method: always predict "<=50K"
table(censusTest$over50k)
##
## <=50K >50K
## 9713 3078
9713/nrow(censusTest) #baseline accuracy
## [1] 0.7593621
5.4.3.2 Problem 2 - A CART model
#2.4
#Classification tree with default parameters
censusTree <- rpart(over50k ~ ., data = censusTrain, method = "class")
predictCART <- predict(censusTree, newdata = censusTest, type = "class")
summary(predictCART)
## <=50K >50K
## 10725 2066
table(censusTest$over50k, predictCART) #confusion matrix
## predictCART
## <=50K >50K
## <=50K 9243 470
## >50K 1482 1596
(9243+1596)/nrow(censusTest) #CART accuracy
## [1] 0.8473927
#Predicted class probabilities from the classification tree; thresholding them at 0.5 reproduces the class predictions
predictCART2 <- predict(censusTree, newdata = censusTest)
summary(predictCART2)
## <=50K >50K
## Min. :0.01286 Min. :0.05099
## 1st Qu.:0.69728 1st Qu.:0.05099
## Median :0.94901 Median :0.05099
## Mean :0.75905 Mean :0.24095
## 3rd Qu.:0.94901 3rd Qu.:0.30272
## Max. :0.94901 Max. :0.98714
head(predictCART2) #per-class probabilities for the first few test observations
## <=50K >50K
## 2 0.2794982 0.72050176
## 5 0.2794982 0.72050176
## 7 0.9490143 0.05098572
## 8 0.6972807 0.30271934
## 11 0.6972807 0.30271934
## 12 0.2794982 0.72050176
table(censusTest$over50k, predictCART2[,2] > 0.5) #thresholding the probabilities at 0.5 gives the same confusion matrix
##
## FALSE TRUE
## <=50K 9243 470
## >50K 1482 1596
#2.6 AUC
ROCRpred = prediction(predictCART2[,2], censusTest$over50k)
as.numeric(performance(ROCRpred, "auc")@y.values)
## [1] 0.8470256
5.4.3.3 Problem 3 - A random forest model
#3.1 - model accuracy
set.seed(1)
trainSmall = censusTrain[sample(nrow(censusTrain), 2000), ]
set.seed(1)
smallCensusForest <- randomForest(over50k ~ . , data=trainSmall)
predictForest <- predict(smallCensusForest, newdata = censusTest)# prediction using random forest
table(censusTest$over50k, predictForest) # confusion Matrix
## predictForest
## <=50K >50K
## <=50K 8955 758
## >50K 1125 1953
## [1] 0.8480963
#3.2 - Forest metrics
vu = varUsed(smallCensusForest, count=TRUE)
vusorted = sort(vu, decreasing = FALSE, index.return = TRUE)
dotchart(vusorted$x, names(smallCensusForest$forest$xlevels[vusorted$ix])) #dot chart of how many times each variable was selected for a split, aggregated over all trees in the forest
summary(smallCensusForest) #components of the random forest object
## Length Class Mode
## call 3 -none- call
## type 1 -none- character
## predicted 2000 factor numeric
## err.rate 1500 -none- numeric
## confusion 6 -none- numeric
## votes 4000 matrix numeric
## oob.times 2000 -none- numeric
## classes 2 -none- character
## importance 12 -none- numeric
...
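Another commonly inspected forest metric, sketched here, is impurity-based variable importance:
varImpPlot(smallCensusForest) #mean decrease in Gini impurity for each predictor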
5.4.3.4 Problem 4 - Cross validation
library(e1071)
library(caret)
set.seed(2)
#4.1 - Which value of cp does the train function recommend?
censusTrainControl <-trainControl(method = "cv", number=10) # k = 10 folds
cartGrid = expand.grid( .cp = seq(0.002,0.1,0.002))
trained <- train(over50k ~ . , data = censusTrain, method = "rpart", trControl = censusTrainControl, tuneGrid = cartGrid)
trained
## CART
##
## 19187 samples
## 12 predictor
## 2 classes: ' <=50K', ' >50K'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 17268, 17269, 17269, 17268, 17268, 17268, ...
## Resampling results across tuning parameters:
...
#4.2 - Fit a CART model to the training data using this value of cp. What is the prediction accuracy on the test set?
censusTree <- rpart(over50k ~ ., data= censusTrain, method = "class", cp=0.002) #new tree with cp
prp(censusTree)
cpCensusPrediction <- predict(censusTree, newdata = censusTest, type="class")
table(censusTest$over50k, cpCensusPrediction)
## cpCensusPrediction
## <=50K >50K
## <=50K 9178 535
## >50K 1240 1838
(9178+1838)/nrow(censusTest) #accuracy with cp = 0.002
## [1] 0.8612306