Unit 7 Clustering
7.1 Introduction to Clustering
# Read in the movie data (pipe-separated text file)
movies <- read.table("week6/movieLens.txt", header = FALSE, sep = "|", quote = "\"")
str(movies)
## 'data.frame': 1682 obs. of 24 variables:
## $ V1 : int 1 2 3 4 5 6 7 8 9 10 ...
## $ V2 : Factor w/ 1664 levels "'Til There Was You (1997)",..: 1525 617 554 593 343 1318 1545 110 390 1240 ...
## $ V3 : Factor w/ 241 levels "","01-Aug-1997",..: 71 71 71 71 71 71 71 71 71 182 ...
## $ V4 : logi NA NA NA NA NA NA ...
## $ V5 : Factor w/ 1661 levels "","http://us.imdb.com/M/title-exact/Independence%20(1997)",..: 1431 565 505 543 310 1661 1453 103 357 1183 ...
## $ V6 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ V7 : int 0 1 0 1 0 0 0 0 0 0 ...
## $ V8 : int 0 1 0 0 0 0 0 0 0 0 ...
## $ V9 : int 1 0 0 0 0 0 0 0 0 0 ...
...
# Add column names
colnames(movies) <- c("ID", "Title", "ReleaseDate", "VideoReleaseDate", "IMDB", "Unknown", "Action", "Adventure", "Animation", "Childrens", "Comedy", "Crime", "Documentary", "Drama", "Fantasy", "FilmNoir", "Horror", "Musical", "Mystery", "Romance", "SciFi", "Thriller", "War", "Western")
str(movies)
## 'data.frame': 1682 obs. of 24 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Title : Factor w/ 1664 levels "'Til There Was You (1997)",..: 1525 617 554 593 343 1318 1545 110 390 1240 ...
## $ ReleaseDate : Factor w/ 241 levels "","01-Aug-1997",..: 71 71 71 71 71 71 71 71 71 182 ...
## $ VideoReleaseDate: logi NA NA NA NA NA NA ...
## $ IMDB : Factor w/ 1661 levels "","http://us.imdb.com/M/title-exact/Independence%20(1997)",..: 1431 565 505 543 310 1661 1453 103 357 1183 ...
## $ Unknown : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Action : int 0 1 0 1 0 0 0 0 0 0 ...
## $ Adventure : int 0 1 0 0 0 0 0 0 0 0 ...
## $ Animation : int 1 0 0 0 0 0 0 0 0 0 ...
...
# Remove unnecessary variables
movies$ID <- NULL
movies$ReleaseDate <- NULL
movies$VideoReleaseDate <- NULL
movies$IMDB <- NULL
# Remove duplicates
movies <- unique(movies)
7.1.1 Hierarchical Clustering
# Hierarchical clustering in R
# Cluster on the genre variables, not the title
distances <- dist(movies[2:20], method = "euclidean") # use Euclidean distance
clusterMovies <- hclust(distances, method = "ward.D") # Ward's method accounts for both the distance between clusters (centroid distance) and the variance within each cluster
plot(clusterMovies)
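# Before committing to a cut, it can help to outline the candidate clusters
# directly on the dendrogram. A sketch using base R's rect.hclust (the same
# function used in the recitation below); k = 10 matches the cut used next:
rect.hclust(clusterMovies, k = 10, border = "red")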
# Assign each movie to one of 10 clusters
clusterGroups <- cutree(clusterMovies, k = 10)
clusterGroups[1:6]
## 1 2 3 4 5 6
## 1 2 3 1 3 4
# Fraction of movies in each cluster that are Action movies
tapply(movies$Action, clusterGroups, mean)
## 1 2 3 4 5 6 7 8
## 0.1784512 0.7839196 0.1238532 0.0000000 0.0000000 0.1015625 0.0000000 0.0000000
## 9 10
## 0.0000000 0.0000000
# Fraction of movies in each cluster that are Romance movies
tapply(movies$Romance, clusterGroups, mean)
## 1 2 3 4 5 6 7
## 0.10437710 0.04522613 0.03669725 0.00000000 0.00000000 1.00000000 1.00000000
## 8 9 10
## 0.00000000 0.00000000 0.00000000
# Which cluster does Men in Black (1997) fall into?
subset(movies, Title == "Men in Black (1997)")
## Title Unknown Action Adventure Animation Childrens Comedy
## 257 Men in Black (1997) 0 1 1 0 0 1
## Crime Documentary Drama Fantasy FilmNoir Horror Musical Mystery Romance
## 257 0 0 0 0 0 0 0 0 0
## SciFi Thriller War Western
## 257 1 0 0 0
# Men in Black is row 257; look up its cluster assignment
clusterGroups[257]
## 257
## 2
# Create a new data set with just the movies from cluster 2
cluster2 <- subset(movies, clusterGroups==2)
# Look at the first 20 titles in this cluster:
cluster2$Title[1:20]
## [1] GoldenEye (1995)
## [2] Bad Boys (1995)
## [3] Apollo 13 (1995)
## [4] Net, The (1995)
## [5] Natural Born Killers (1994)
## [6] Outbreak (1995)
## [7] Stargate (1994)
## [8] Fugitive, The (1993)
## [9] Jurassic Park (1993)
## [10] Robert A. Heinlein's The Puppet Masters (1994)
...
# A more scalable approach to inspecting cluster centroids
cG <- cutree(clusterMovies, k = 2) # this time with only 2 clusters
spl <- split(movies[2:20], cG) # split the genre variables by cluster
lapply(spl, colMeans) # the centroid of each cluster
## $`1`
## Unknown Action Adventure Animation Childrens Comedy
## 0.001545595 0.192426584 0.102782071 0.032457496 0.092735703 0.387944359
## Crime Documentary Drama Fantasy FilmNoir Horror
## 0.082689335 0.038639876 0.267387944 0.017001546 0.018547141 0.069551777
## Musical Mystery Romance SciFi Thriller War
## 0.043276662 0.046367852 0.188562597 0.077279753 0.191653787 0.054868624
## Western
## 0.020865533
##
...
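# A compact variant of the same idea (a sketch): sapply simplifies the list of
# per-cluster means into a single genre-by-cluster matrix, which is easier to scan.
round(sapply(spl, colMeans), 3)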
7.2 Recitation - Segmenting Images
7.2.1 Flower Image
flower <- read.csv("week6/flower.csv", header = F)
flowerMatrix <- as.matrix(flower)
flowerVector <- as.vector(flowerMatrix) # convert to a vector to compute Euclidean distances for clustering
distance <- dist(flowerVector, method = "euclidean")
# Hierarchical clustering
clusterIntensity <- hclust(distance, method = "ward.D") # Ward's method is a minimum-variance method
plot(clusterIntensity)
# Select 3 clusters
rect.hclust(clusterIntensity, k = 3, border = "red")
# Assign each pixel to one of the 3 clusters and check mean intensity per cluster
flowerClusters <- cutree(clusterIntensity, k = 3)
tapply(flowerVector, flowerClusters, mean)
## 1 2 3
## 0.08574315 0.50826255 0.93147713
#plot the image and clusters
dim(flowerClusters) <- c(50, 50) # reshape the cluster vector back into the 50x50 image grid
image(flowerClusters, axes = FALSE)
7.2.2 MRI Image
healthy <- read.csv("week6/healthy.csv", header = F)
healthyMatrix <- as.matrix(healthy)
#Plot image
image(healthyMatrix,axes=FALSE,col=grey(seq(0,1,length=256)))
7.2.3 K-Means Clustering
# K-means clustering on the MRI intensity values
healthyVector <- as.vector(healthyMatrix)
k <- 5
set.seed(1)
KMC <- kmeans(healthyVector, centers = k, iter.max = 1000)
str(KMC)
## List of 9
## $ cluster : int [1:365636] 5 5 5 5 5 5 5 5 5 5 ...
## $ centers : num [1:5, 1] 0.1166 0.2014 0.4914 0.3299 0.0219
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:5] "1" "2" "3" "4" ...
## .. ..$ : NULL
## $ totss : num 5775
## $ withinss : num [1:5] 65.9 54.4 81.1 47.4 53.3
## $ tot.withinss: num 302
## $ betweenss : num 5473
...
# Extract clusters
healthyClusters <- KMC$cluster
# Plot the image with the clusters
dim(healthyClusters) <- c(nrow(healthyMatrix), ncol(healthyMatrix)) # reshape to the original image dimensions
image(healthyClusters, axes = FALSE, col=rainbow(k))
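# A quick sanity check on the choice of k = 5 (a sketch; sse is an illustrative
# name): plot the total within-cluster sum of squares for a range of k and look
# for an "elbow" where the improvement levels off.
sse <- sapply(2:10, function(numClusters) {
  set.seed(1)
  kmeans(healthyVector, centers = numClusters, iter.max = 1000)$tot.withinss
})
plot(2:10, sse, type = "b", xlab = "Number of clusters k", ylab = "Total within-cluster SSE")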
#Detecting Tumors
tumor <- read.csv("week6/tumor.csv", header=FALSE)
tumorMatrix <- as.matrix(tumor)
tumorVector <- as.vector(tumorMatrix)
# Apply the clusters trained on the healthy image to the tumor image
library(flexclust) # provides as.kcca() and a predict() method for kmeans fits
KMC.kcca <- as.kcca(KMC, healthyVector) # convert the kmeans output to a kcca object
tumorClusters <- predict(KMC.kcca, newdata = tumorVector)
# Visualize the clusters
dim(tumorClusters) <- c(nrow(tumorMatrix), ncol(tumorMatrix))
image(tumorClusters, axes = FALSE, col=rainbow(k))
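# To eyeball the segmentation against the raw scan, the two images can be drawn
# side by side (a sketch using base graphics):
par(mfrow = c(1, 2))
image(tumorMatrix, axes = FALSE, col = grey(seq(0, 1, length = 256)))
image(tumorClusters, axes = FALSE, col = rainbow(k))
par(mfrow = c(1, 1)) # reset the plotting layout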
7.3 Assignment
7.3.1 Part 1 - Document Clustering with Daily Kos
7.3.1.1 Problem 1 - Hierarchical Clustering
# Read in the data (file name assumed, following the week6/ convention used above)
dailyKos <- read.csv("week6/dailykos.csv")
str(dailyKos)
## 'data.frame': 3430 obs. of 1545 variables:
## $ abandon : int 0 0 0 0 0 0 0 0 0 0 ...
## $ abc : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ability : int 0 0 0 0 0 0 0 0 0 0 ...
## $ abortion : int 0 0 0 0 0 0 0 0 0 0 ...
## $ absolute : int 0 0 0 0 0 0 0 0 0 0 ...
## $ abstain : int 0 0 1 0 0 0 0 0 0 0 ...
## $ abu : int 0 0 0 0 0 0 0 0 0 0 ...
## $ abuse : int 0 0 0 0 0 0 0 0 0 0 ...
## $ accept : int 0 0 0 0 0 0 0 0 0 0 ...
...
distances <- dist(dailyKos, method = "euclidean")
clusterKos <- hclust(distances, method = "ward.D")
plot(clusterKos)
#1.4 Use the cutree function to split the data into 7 clusters. How many observations are in cluster 3?
clusterGroups <- cutree(clusterKos, k = 7)
spl <- split(dailyKos, clusterGroups) # split the articles by cluster
lapply(spl, nrow)
## $`1`
## [1] 1266
##
## $`2`
## [1] 321
##
## $`3`
## [1] 374
##
## $`4`
...
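# The same cluster sizes in one line, without splitting the data frame (a sketch):
table(clusterGroups)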
#1.5 Look at the top 6 words in each cluster
# sort() orders in increasing value; tail() keeps the last 6, i.e. the most frequent words
f <- function(x){
  tail(sort(colMeans(x)))
}
lapply(spl, f)
## $`1`
## state republican poll democrat kerry bush
## 0.7575039 0.7590837 0.9036335 0.9194313 1.0624013 1.7053712
##
## $`2`
## bush democrat challenge vote poll november
## 2.847352 2.850467 4.096573 4.398754 4.847352 10.339564
##
## $`3`
## elect parties state republican democrat bush
...
7.3.1.2 Problem 2 - K-Means Clustering
#2.1 - Run k-means with 7 clusters (the assignment specifies set.seed(1000))
set.seed(1000)
kmKos <- kmeans(dailyKos, centers = 7)
str(kmKos)
## List of 9
## $ cluster : int [1:3430] 4 4 2 4 1 4 3 4 4 4 ...
## $ centers : num [1:7, 1:1545] 0.0333 0.0152 0.03 0.0138 0.0556 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:7] "1" "2" "3" "4" ...
## .. ..$ : chr [1:1545] "abandon" "abc" "ability" "abortion" ...
## $ totss : num 896461
## $ withinss : num [1:7] 77738 77220 75280 245827 53122 ...
## $ tot.withinss: num 730614
## $ betweenss : num 165847
...
# How many observations are in each k-means cluster?
splKos <- split(dailyKos, kmKos$cluster)
lapply(splKos, nrow)
## $`1`
## [1] 150
##
## $`2`
## [1] 329
##
## $`3`
## [1] 300
##
## $`4`
...
# Top 6 words in each k-means cluster, reusing f() from above
lapply(splKos, f)
## $`1`
## time iraq kerry administration presided
## 1.586667 1.640000 1.653333 2.620000 2.726667
## bush
## 11.333333
##
## $`2`
## democrat bush challenge vote poll november
## 2.899696 2.960486 4.121581 4.446809 4.872340 10.370821
##
...
7.3.2 Part 2 - Market Segmentation for Airlines
7.3.2.1 Problem 1 - Normalizing the Data
airlines <- read.csv("week6/AirlinesCluster.csv") # file name assumed, following the week6/ convention above
summary(airlines)
## Balance QualMiles BonusMiles BonusTrans
## Min. : 0 Min. : 0.0 Min. : 0 Min. : 0.0
## 1st Qu.: 18528 1st Qu.: 0.0 1st Qu.: 1250 1st Qu.: 3.0
## Median : 43097 Median : 0.0 Median : 7171 Median :12.0
## Mean : 73601 Mean : 144.1 Mean : 17145 Mean :11.6
## 3rd Qu.: 92404 3rd Qu.: 0.0 3rd Qu.: 23801 3rd Qu.:17.0
## Max. :1704838 Max. :11148.0 Max. :263685 Max. :86.0
## FlightMiles FlightTrans DaysSinceEnroll
## Min. : 0.0 Min. : 0.000 Min. : 2
## 1st Qu.: 0.0 1st Qu.: 0.000 1st Qu.:2330
...
#1.3 In the normalized data, which variable has the largest maximum value?
library(caret) # preProcess() comes from the caret package
preproc <- preProcess(airlines) # by default, centers and scales each variable
airlinesNorm <- predict(preproc, airlines)
summary(airlinesNorm)
## Balance QualMiles BonusMiles BonusTrans
## Min. :-0.7303 Min. :-0.1863 Min. :-0.7099 Min. :-1.20805
## 1st Qu.:-0.5465 1st Qu.:-0.1863 1st Qu.:-0.6581 1st Qu.:-0.89568
## Median :-0.3027 Median :-0.1863 Median :-0.4130 Median : 0.04145
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 0.1866 3rd Qu.:-0.1863 3rd Qu.: 0.2756 3rd Qu.: 0.56208
## Max. :16.1868 Max. :14.2231 Max. :10.2083 Max. : 7.74673
## FlightMiles FlightTrans DaysSinceEnroll
## Min. :-0.3286 Min. :-0.36212 Min. :-1.99336
## 1st Qu.:-0.3286 1st Qu.:-0.36212 1st Qu.:-0.86607
...
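# For intuition: preProcess() with default arguments centers and scales each
# variable, which matches base R's scale(). A sketch (airlinesNormManual is an
# illustrative name, not part of the assignment):
airlinesNormManual <- as.data.frame(scale(airlines))
summary(airlinesNormManual$Balance) # matches the normalized Balance above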
7.3.2.2 Problem 2 - Hierarchical Clustering
distanceAir <- dist(airlinesNorm, method = "euclidean")
clusterAir <- hclust(distanceAir, method = "ward.D")
plot(clusterAir)
#2.2 - How many data points are in Cluster 1?
clusterGroupsAir <- cutree(clusterAir, k = 5)
spl <- split(airlines, clusterGroupsAir) # split the (unnormalized) airlines data by cluster
lapply(spl, nrow)
## $`1`
## [1] 776
##
## $`2`
## [1] 519
##
## $`3`
## [1] 494
##
## $`4`
...
# Compare the average variable values across clusters (unnormalized data)
lapply(spl, colMeans)
## $`1`
## Balance QualMiles BonusMiles BonusTrans FlightMiles
## 5.786690e+04 6.443299e-01 1.036012e+04 1.082345e+01 8.318428e+01
## FlightTrans DaysSinceEnroll
## 3.028351e-01 6.235365e+03
##
## $`2`
## Balance QualMiles BonusMiles BonusTrans FlightMiles
## 1.106693e+05 1.065983e+03 2.288176e+04 1.822929e+01 2.613418e+03
## FlightTrans DaysSinceEnroll
...
7.3.2.3 Problem 3 - K-Means Clustering
# Run k-means with 5 clusters (the assignment specifies set.seed(88) and iter.max = 1000)
set.seed(88)
KMCAir <- kmeans(airlinesNorm, centers = 5, iter.max = 1000)
str(KMCAir)
## List of 9
## $ cluster : int [1:3999] 4 4 4 4 1 4 1 4 3 1 ...
## $ centers : num [1:5, 1:7] 0.787 0.401 1.172 -0.161 -0.352 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:5] "1" "2" "3" "4" ...
## .. ..$ : chr [1:7] "Balance" "QualMiles" "BonusMiles" "BonusTrans" ...
## $ totss : num 27986
## $ withinss : num [1:5] 4440 704 3571 2422 2374
## $ tot.withinss: num 13511
## $ betweenss : num 14475
...
#3.1 - How many clusters have more than 1,000 observations?
splK <- split(airlines, KMCAir$cluster)
lapply(splK, nrow)
## $`1`
## [1] 776
##
## $`2`
## [1] 57
##
## $`3`
## [1] 143
##
## $`4`
...
# K-means cluster centroids (in normalized units)
KMCAir$centers
## Balance QualMiles BonusMiles BonusTrans FlightMiles FlightTrans
## 1 0.7866778 -0.08547306 1.40214847 1.01145492 0.01417749 0.02211363
## 2 0.4009981 6.97876620 0.08495883 0.07250717 0.34260322 0.38252097
## 3 1.1722573 0.42324832 0.66036647 1.74459327 3.78734636 4.05877606
## 4 -0.1606061 -0.11505046 -0.34731771 -0.26038231 -0.17602747 -0.19180853
## 5 -0.3517815 -0.14183200 -0.43059094 -0.41272355 -0.20026359 -0.21576712
## DaysSinceEnroll
## 1 0.3859187
## 2 -0.1193065
## 3 0.2652869
...
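# To see how the k-means segments line up with the hierarchical ones, the two
# assignment vectors can be cross-tabulated (a sketch):
table(clusterGroupsAir, KMCAir$cluster)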
7.3.3 Part 3 - Predicting Stock Returns with Cluster-Then-Predict
7.3.3.1 Problem 1 - Exploring the Dataset
#1.1
stocks <- read.csv("week6/StocksCluster.csv", stringsAsFactors=FALSE)
#1.2, 1.3, 1.4
summary(stocks)
## ReturnJan ReturnFeb ReturnMar
## Min. :-0.7616205 Min. :-0.690000 Min. :-0.712994
## 1st Qu.:-0.0691663 1st Qu.:-0.077748 1st Qu.:-0.046389
## Median : 0.0009965 Median :-0.010626 Median : 0.009878
## Mean : 0.0126316 Mean :-0.007605 Mean : 0.019402
## 3rd Qu.: 0.0732606 3rd Qu.: 0.043600 3rd Qu.: 0.077066
## Max. : 3.0683060 Max. : 6.943694 Max. : 4.008621
## ReturnApr ReturnMay ReturnJune
## Min. :-0.826503 Min. :-0.92207 Min. :-0.717920
## 1st Qu.:-0.054468 1st Qu.:-0.04640 1st Qu.:-0.063966
...
7.3.3.2 Problem 2 - Initial Logistic Regression Model
#2.1 - What is the overall accuracy on the training set, using a threshold of 0.5?
library(caTools) # for sample.split()
set.seed(144)
splitStocks <- sample.split(stocks$PositiveDec, SplitRatio = 0.7)
stocksTrain <- subset(stocks, splitStocks == TRUE)
stocksTest <- subset(stocks, splitStocks == FALSE)
StocksModel <- glm(PositiveDec ~ ., data = stocksTrain, family = "binomial")
stocksLogPrediction <- predict(StocksModel, type="response")
table(stocksTrain$PositiveDec, stocksLogPrediction > 0.5)
##
## FALSE TRUE
## 0 990 2689
## 1 787 3640
# Accuracy = correct predictions / total observations
(990 + 3640) / (990 + 2689 + 787 + 3640)
## [1] 0.5711818
#2.2 Now obtain test-set predictions from StocksModel. What is the overall accuracy of the model on the test set, again using a threshold of 0.5?
stocksLogPrediction <- predict(StocksModel, newdata = stocksTest, type="response")
table(stocksTest$PositiveDec, stocksLogPrediction > 0.5)
##
## FALSE TRUE
## 0 417 1160
## 1 344 1553
(417 + 1553) / (417 + 1160 + 344 + 1553)
## [1] 0.5670697
7.3.3.3 Problem 3 - Clustering Stocks
#3.1 Why do we need to remove the dependent variable in the clustering phase of the cluster-then-predict methodology?
# Ans - Because that's the variable we're trying to predict; using it to form the clusters would leak the outcome into the features.
limitedTrain <- stocksTrain
limitedTrain$PositiveDec <- NULL
limitedTest <- stocksTest
limitedTest$PositiveDec <- NULL
#3.3
preproc <- preProcess(limitedTrain)
normTrain <- predict(preproc, limitedTrain) # normalize the training data
normTest <- predict(preproc, limitedTest) # normalize the test data using the training-set means and standard deviations
summary(normTest)
## ReturnJan ReturnFeb ReturnMar ReturnApr
## Min. :-3.743836 Min. :-3.251044 Min. :-4.07731 Min. :-4.47865
## 1st Qu.:-0.485690 1st Qu.:-0.348951 1st Qu.:-0.40662 1st Qu.:-0.51121
## Median :-0.066856 Median :-0.006860 Median :-0.05674 Median :-0.11414
## Mean :-0.000419 Mean :-0.003862 Mean : 0.00583 Mean :-0.03638
## 3rd Qu.: 0.357729 3rd Qu.: 0.264647 3rd Qu.: 0.35653 3rd Qu.: 0.32742
## Max. : 8.412973 Max. : 9.552365 Max. : 9.00982 Max. : 6.84589
## ReturnMay ReturnJune ReturnJuly ReturnAug
## Min. :-5.84445 Min. :-4.73628 Min. :-5.201454 Min. :-4.62097
## 1st Qu.:-0.43819 1st Qu.:-0.44968 1st Qu.:-0.512039 1st Qu.:-0.51546
...
## ReturnJan ReturnFeb ReturnMar ReturnApr
## Min. :-4.57682 Min. :-3.43004 Min. :-4.54609 Min. :-5.0227
## 1st Qu.:-0.48271 1st Qu.:-0.35589 1st Qu.:-0.40758 1st Qu.:-0.4757
## Median :-0.07055 Median :-0.01875 Median :-0.05778 Median :-0.1104
## Mean : 0.00000 Mean : 0.00000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.35898 3rd Qu.: 0.25337 3rd Qu.: 0.36106 3rd Qu.: 0.3400
## Max. :18.06234 Max. :34.92751 Max. :24.77296 Max. :14.6959
## ReturnMay ReturnJune ReturnJuly ReturnAug
## Min. :-4.96759 Min. :-4.82957 Min. :-5.19139 Min. :-5.60378
## 1st Qu.:-0.43045 1st Qu.:-0.45602 1st Qu.:-0.51832 1st Qu.:-0.47163
...
#3.4 Run k-means clustering with 3 clusters on normTrain. Which cluster has the largest number of observations?
set.seed(144)
km <- kmeans(normTrain, centers = 3)
splitTrain <- split(normTrain, km$cluster)
lapply(splitTrain, nrow) #Ans - Cluster 2
## $`1`
## [1] 2479
##
## $`2`
## [1] 4731
##
## $`3`
## [1] 896
#3.5
km.kcca <- as.kcca(km, normTrain) # convert for use with flexclust's predict()
clusterTrain <- predict(km.kcca) # training-set cluster assignments
clusterTest <- predict(km.kcca, newdata = normTest)
table(clusterTest)
## clusterTest
## 1 2 3
## 1058 2029 387
7.3.3.4 Problem 4 - Cluster-Specific Predictions
#4.1 - Which training set data frame has the highest average value of the dependent variable?
trainList <- split(stocksTrain, clusterTrain)
testList <- split(stocksTest, clusterTest)
lapply(trainList, colMeans)
## $`1`
## ReturnJan ReturnFeb ReturnMar ReturnApr ReturnMay ReturnJune
## -0.06884016 -0.05313853 0.05623781 0.09255203 0.08198363 -0.04521849
## ReturnJuly ReturnAug ReturnSep ReturnOct ReturnNov PositiveDec
## 0.11018728 0.02208732 -0.01220414 -0.05505619 -0.01892736 0.61032674
##
## $`2`
## ReturnJan ReturnFeb ReturnMar ReturnApr ReturnMay
## 0.0251634954 0.0350812572 -0.0002492554 -0.0178523286 -0.0067017781
## ReturnJune ReturnJuly ReturnAug ReturnSep ReturnOct
...
#4.2 - Build a logistic regression model on each training-set cluster
fmod <- function(x){
  glm(PositiveDec ~ ., data = x, family = "binomial")
}
trainModelList <- lapply(trainList, fmod)
lapply(trainModelList, summary)
## $`1`
##
## Call:
## glm(formula = PositiveDec ~ ., family = "binomial", data = x)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7220 -1.2879 0.8679 1.0096 1.7170
##
## Coefficients:
...
fpredict <- function(modelInput, newdataInput){
  predict(modelInput, type = "response", newdata = newdataInput)
}
# SIMPLIFY = FALSE keeps the result as a list of prediction vectors
predList <- mapply(fpredict, trainModelList, testList, SIMPLIFY = FALSE)
str(predList[[1]])
## Named num [1:1058] 0.638 0.54 0.49 0.589 0.531 ...
## - attr(*, "names")= chr [1:1058] "5" "29" "67" "86" ...
#4.3 - Test-set confusion matrix and accuracy for each cluster, threshold 0.5
for (i in 1:3) {
  confMat <- table(testList[[i]]$PositiveDec, predList[[i]] > 0.5)
  print(confMat)
  print(sum(diag(confMat)) / sum(confMat)) # accuracy
}
##
## FALSE TRUE
## 0 43 350
## 1 26 639
## [1] 0.6194145
##
## FALSE TRUE
## 0 277 719
## 1 221 812
## [1] 0.5504808
##
## FALSE TRUE
## 0 119 69
## 1 76 123
## [1] 0.6458333
AllPredictions <- c(predList[[1]], predList[[2]], predList[[3]])
AllOutcomes <- do.call("rbind", testList)
table(AllOutcomes$PositiveDec, AllPredictions > 0.5)
##
## FALSE TRUE
## 0 439 1138
## 1 323 1574
# Overall test-set accuracy of the cluster-then-predict approach
mean((AllPredictions > 0.5) == AllOutcomes$PositiveDec)
## [1] 0.5788716
# Slightly better than the 0.5670697 test accuracy of the single logistic regression model above