Unit 8 Visualization

8.1 Scatterplots - Visualizing the world

8.1.1 Basic scatterplots using ggplot

# Read in data
WHO <- read.csv("week7/WHO.csv")
str(WHO)

## 'data.frame':    194 obs. of  13 variables:
##  $ Country                      : Factor w/ 194 levels "Afghanistan",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Region                       : Factor w/ 6 levels "Africa","Americas",..: 3 4 1 4 1 2 2 4 6 4 ...
##  $ Population                   : int  29825 3162 38482 78 20821 89 41087 2969 23050 8464 ...
##  $ Under15                      : num  47.4 21.3 27.4 15.2 47.6 ...
##  $ Over60                       : num  3.82 14.93 7.17 22.86 3.84 ...
##  $ FertilityRate                : num  5.4 1.75 2.83 NA 6.1 2.12 2.2 1.74 1.89 1.44 ...
##  $ LifeExpectancy               : int  60 74 73 82 51 75 76 71 82 81 ...
##  $ ChildMortality               : num  98.5 16.7 20 3.2 163.5 ...
##  $ CellularSubscribers          : num  54.3 96.4 99 75.5 48.4 ...
...

library(ggplot2)
# Create the ggplot object with the data and the aesthetic mapping:
scatterplot <- ggplot(WHO, aes(x = GNI, y = FertilityRate))
# Add the geom_point geometry
scatterplot + geom_point()

## Warning: Removed 35 rows containing missing values (geom_point).

# Redo the plot with blue triangles instead of circles:
scatterplot + geom_point(color = "blue", size = 3, shape = 17)

## Warning: Removed 35 rows containing missing values (geom_point).

scatterplot + geom_point(color = "darkred", size = 3, shape = 8)

## Warning: Removed 35 rows containing missing values (geom_point).

# Add a title to the plot:
scatterplot + geom_point(colour = "blue", size = 3, shape = 17) + ggtitle("Fertility Rate vs. Gross National Income")

## Warning: Removed 35 rows containing missing values (geom_point).

# Color the points by region: 
ggplot(WHO, aes(x = GNI, y = FertilityRate, color = Region)) + geom_point()

## Warning: Removed 35 rows containing missing values (geom_point).

# Color the points according to life expectancy:
ggplot(WHO, aes(x = GNI, y = FertilityRate, color = LifeExpectancy)) + geom_point()

## Warning: Removed 35 rows containing missing values (geom_point).

# Is the fertility rate of a country was a good predictor of the percentage of the population under 15?
ggplot(WHO, aes(x = FertilityRate, y = Under15)) + geom_point()

## Warning: Removed 11 rows containing missing values (geom_point).

# Let's try a log transformation:
ggplot(WHO, aes(x = log(FertilityRate), y = Under15)) + geom_point()

## Warning: Removed 11 rows containing missing values (geom_point).

# Simple linear regression model to predict the percentage of the population under 15, using the log of the fertility rate:
mod = lm(Under15 ~ log(FertilityRate), data = WHO)
summary(mod)

## 
## Call:
## lm(formula = Under15 ~ log(FertilityRate), data = WHO)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.3131  -1.7742   0.0446   1.7440   7.7174 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
...

# Add this regression line to our plot:
ggplot(WHO, aes(x = log(FertilityRate), y = Under15)) + geom_point() + stat_smooth(method = "lm")

## Warning: Removed 11 rows containing non-finite values (stat_smooth).

## Warning: Removed 11 rows containing missing values (geom_point).

# 99% confidence interval
ggplot(WHO, aes(x = log(FertilityRate), y = Under15)) + geom_point() + stat_smooth(method = "lm", level = 0.99)

## Warning: Removed 11 rows containing non-finite values (stat_smooth).

## Warning: Removed 11 rows containing missing values (geom_point).

# No confidence interval in the plot
ggplot(WHO, aes(x = log(FertilityRate), y = Under15)) + geom_point() + stat_smooth(method = "lm", se = FALSE)

## Warning: Removed 11 rows containing non-finite values (stat_smooth).

## Warning: Removed 11 rows containing missing values (geom_point).

# Change the color of the regression line:
ggplot(WHO, aes(x = log(FertilityRate), y = Under15)) + geom_point() + stat_smooth(method = "lm", colour = "orange")

## Warning: Removed 11 rows containing non-finite values (stat_smooth).

## Warning: Removed 11 rows containing missing values (geom_point).

8.2 Visualizing crime over time

mvt <- read.csv("week7/mvt.csv")
# Convert the Date variable to a format that R will recognize:
mvt$Date <- strptime(mvt$Date, format="%m/%d/%y %H:%M")
# Extract the hour and the day of the week:
mvt$Weekday <- weekdays(mvt$Date)
mvt$Hour <- mvt$Date$hour

# Create a simple line plot - need the total number of crimes on each day of the week. We can get this information by creating a table:
table(mvt$Weekday)

## 
##    Friday    Monday  Saturday    Sunday  Thursday   Tuesday Wednesday 
##     29284     27397     27118     26316     27319     26791     27416

# Save this table as a data frame:
WeekdayCounts <- as.data.frame(table(mvt$Weekday))
str(WeekdayCounts)

## 'data.frame':    7 obs. of  2 variables:
##  $ Var1: Factor w/ 7 levels "Friday","Monday",..: 1 2 3 4 5 6 7
##  $ Freq: int  29284 27397 27118 26316 27319 26791 27416

# Create our plot
ggplot(WeekdayCounts, aes(x=Var1, y=Freq)) + geom_line(aes(group=1))

# Make the "Var1" variable an ORDERED factor variable
WeekdayCounts$Var1 <- factor(WeekdayCounts$Var1, ordered=TRUE, levels=c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday","Saturday"))
# Try again:
ggplot(WeekdayCounts, aes(x=Var1, y=Freq)) + geom_line(aes(group=1))

# Change our x and y labels:
ggplot(WeekdayCounts, aes(x=Var1, y=Freq)) + geom_line(aes(group=1)) + xlab("Day of the Week") + ylab("Total Motor Vehicle Thefts")

8.2.1 Heatmaps

# Create a counts table for the weekday and hour:
table(mvt$Weekday, mvt$Hour)

##            
##                0    1    2    3    4    5    6    7    8    9   10   11   12
##   Friday    1873  932  743  560  473  602  839 1203 1268 1286  938  822 1207
##   Monday    1900  825  712  527  415  542  772 1123 1323 1235  971  737 1129
##   Saturday  2050 1267  985  836  652  508  541  650  858 1039  946  789 1204
##   Sunday    2028 1236 1019  838  607  461  478  483  615  864  884  787 1192
##   Thursday  1856  816  696  508  400  534  799 1135 1298 1301  932  731 1093
##   Tuesday   1691  777  603  464  414  520  845 1118 1175 1174  948  786 1108
##   Wednesday 1814  790  619  469  396  561  862 1140 1329 1237  947  763 1225
##            
...

# Save this to a data frame:
DayHourCounts <- as.data.frame(table(mvt$Weekday, mvt$Hour))
str(DayHourCounts)

## 'data.frame':    168 obs. of  3 variables:
##  $ Var1: Factor w/ 7 levels "Friday","Monday",..: 1 2 3 4 5 6 7 1 2 3 ...
##  $ Var2: Factor w/ 24 levels "0","1","2","3",..: 1 1 1 1 1 1 1 2 2 2 ...
##  $ Freq: int  1873 1900 2050 2028 1856 1691 1814 932 825 1267 ...

# Convert the second variable, Var2, to numbers and call it Hour:
DayHourCounts$Hour <- as.numeric(as.character(DayHourCounts$Var2))

# Create out plot:
ggplot(DayHourCounts, aes(x=Hour, y=Freq)) + geom_line(aes(group=Var1))

# Change the colors
ggplot(DayHourCounts, aes(x=Hour, y=Freq)) + geom_line(aes(group=Var1, color=Var1), size=2)

# Separate the weekends from the weekdays:
DayHourCounts$Type = ifelse((DayHourCounts$Var1 == "Sunday") | (DayHourCounts$Var1 == "Saturday"), "Weekend", "Weekday")

# Redo our plot, this time coloring by Type:
ggplot(DayHourCounts, aes(x=Hour, y=Freq)) + geom_line(aes(group=Var1, color=Type), size=2)

# Make the lines a little transparent:
ggplot(DayHourCounts, aes(x=Hour, y=Freq)) + geom_line(aes(group=Var1, color=Type), size=2, alpha=0.5)

# Fix the order of the days:
DayHourCounts$Var1 = factor(DayHourCounts$Var1, ordered=TRUE, levels=c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"))
# Make a heatmap:
ggplot(DayHourCounts, aes(x = Hour, y = Var1)) + geom_tile(aes(fill = Freq))

# Change the label on the legend, and get rid of the y-label:
ggplot(DayHourCounts, aes(x = Hour, y = Var1)) + geom_tile(aes(fill = Freq)) + scale_fill_gradient(name="Total MV Thefts") + theme(axis.title.y = element_blank())

# Change the color scheme
ggplot(DayHourCounts, aes(x = Hour, y = Var1)) + geom_tile(aes(fill = Freq)) + scale_fill_gradient(name="Total MV Thefts", low="white", high="red") + theme(axis.title.y = element_blank())

8.2.2 A Geographical Hot Spot Map

library(maps)
library(ggmap)
library(tmaptools)

# Load a map of Chicago into R:
chicago <- get_stamenmap(rbind(as.numeric(paste(geocode_OSM("Chicago")$bbox))), zoom = 11)

## Source : http://tile.stamen.com/terrain/11/523/760.png

## Source : http://tile.stamen.com/terrain/11/524/760.png

## Source : http://tile.stamen.com/terrain/11/525/760.png

## Source : http://tile.stamen.com/terrain/11/526/760.png

## Source : http://tile.stamen.com/terrain/11/523/761.png

## Source : http://tile.stamen.com/terrain/11/524/761.png

## Source : http://tile.stamen.com/terrain/11/525/761.png

## Source : http://tile.stamen.com/terrain/11/526/761.png

## Source : http://tile.stamen.com/terrain/11/523/762.png

## Source : http://tile.stamen.com/terrain/11/524/762.png

## Source : http://tile.stamen.com/terrain/11/525/762.png

## Source : http://tile.stamen.com/terrain/11/526/762.png

# Look at the map
ggmap(chicago)

# Plot the first 100 motor vehicle thefts:
ggmap(chicago) + geom_point(data = mvt[1:100,], aes(x = Longitude, y = Latitude))

## Warning: Removed 4 rows containing missing values (geom_point).

# Round our latitude and longitude to 2 digits of accuracy, and create a crime counts data frame for each area:
LatLonCounts <- as.data.frame(table(round(mvt$Longitude,2), round(mvt$Latitude,2)))
str(LatLonCounts)

## 'data.frame':    1638 obs. of  3 variables:
##  $ Var1: Factor w/ 42 levels "-87.93","-87.92",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Var2: Factor w/ 39 levels "41.64","41.65",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Freq: int  0 0 0 0 0 0 0 0 0 0 ...

# Convert our Longitude and Latitude variable to numbers:
LatLonCounts$Long <- as.numeric(as.character(LatLonCounts$Var1))
LatLonCounts$Lat <- as.numeric(as.character(LatLonCounts$Var2))
# Plot these points on our map:
ggmap(chicago) + geom_point(data = LatLonCounts, aes(x = Long, y = Lat, color = Freq, size=Freq))

## Warning: Removed 80 rows containing missing values (geom_point).

# Change the color scheme:
ggmap(chicago) + geom_point(data = LatLonCounts, aes(x = Long, y = Lat, color = Freq, size=Freq)) + scale_colour_gradient(low="yellow", high="red")

## Warning: Removed 80 rows containing missing values (geom_point).

# We can also use the geom_tile geometry
ggmap(chicago) + geom_tile(data = LatLonCounts, aes(x = Long, y = Lat, alpha = Freq), fill="red")

## Warning: Removed 80 rows containing missing values (geom_tile).

8.2.3 A Heatmap of the United States

murders <- read.csv("week7/murders.csv")
str(murders)

## 'data.frame':    51 obs. of  6 variables:
##  $ State            : Factor w/ 51 levels "Alabama","Alaska",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Population       : int  4779736 710231 6392017 2915918 37253956 5029196 3574097 897934 601723 19687653 ...
##  $ PopulationDensity: num  94.65 1.26 57.05 56.43 244.2 ...
##  $ Murders          : int  199 31 352 130 1811 117 131 48 131 987 ...
##  $ GunMurders       : int  135 19 232 93 1257 65 97 38 99 669 ...
##  $ GunOwnership     : num  0.517 0.578 0.311 0.553 0.213 0.347 0.167 0.255 0.036 0.245 ...

# Load the map of the US
statesMap <- map_data("state")
str(statesMap)

## 'data.frame':    15537 obs. of  6 variables:
##  $ long     : num  -87.5 -87.5 -87.5 -87.5 -87.6 ...
##  $ lat      : num  30.4 30.4 30.4 30.3 30.3 ...
##  $ group    : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ order    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region   : chr  "alabama" "alabama" "alabama" "alabama" ...
##  $ subregion: chr  NA NA NA NA ...

# Plot the map:
ggplot(statesMap, aes(x = long, y = lat, group = group)) + geom_polygon(fill = "white", color = "black")

# Create a new variable called region with the lowercase names to match the statesMap:
murders$region <- tolower(murders$State)
# Join the statesMap data and the murders data into one dataframe:
murderMap <- merge(statesMap, murders, by="region")
str(murderMap)

## 'data.frame':    15537 obs. of  12 variables:
##  $ region           : chr  "alabama" "alabama" "alabama" "alabama" ...
##  $ long             : num  -87.5 -87.5 -87.5 -87.5 -87.6 ...
##  $ lat              : num  30.4 30.4 30.4 30.3 30.3 ...
##  $ group            : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ order            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ subregion        : chr  NA NA NA NA ...
##  $ State            : Factor w/ 51 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Population       : int  4779736 4779736 4779736 4779736 4779736 4779736 4779736 4779736 4779736 4779736 ...
##  $ PopulationDensity: num  94.7 94.7 94.7 94.7 94.7 ...
...

# Plot the number of murder on our map of the United States:
ggplot(murderMap, aes(x = long, y = lat, group = group, fill = Murders)) + geom_polygon(color = "black") + scale_fill_gradient(low = "black", high = "red", guide = "legend")

# Plot a map of the population:
ggplot(murderMap, aes(x = long, y = lat, group = group, fill = Population)) + geom_polygon(color = "black") + scale_fill_gradient(low = "black", high = "red", guide = "legend")

# Create a new variable that is the number of murders per 100,000 population:
murderMap$MurderRate <- murderMap$Murders / murderMap$Population * 100000

# Redo our plot with murder rate:
ggplot(murderMap, aes(x = long, y = lat, group = group, fill = MurderRate)) + geom_polygon(color = "black") + scale_fill_gradient(low = "black", high = "red", guide = "legend")

# Redo the plot, removing any states with murder rates above 10:
ggplot(murderMap, aes(x = long, y = lat, group = group, fill = MurderRate)) + geom_polygon(color = "black") + scale_fill_gradient(low = "black", high = "red", guide = "legend", limits = c(0,10))

8.3 Recitation

8.3.1 Bar Charts

intl <- read.csv("week7/intl.csv")
str(intl)

## 'data.frame':    8 obs. of  2 variables:
##  $ Region       : Factor w/ 8 levels "Africa","Asia",..: 2 3 6 4 5 1 7 8
##  $ PercentOfIntl: num  0.531 0.201 0.098 0.09 0.054 0.02 0.015 0.002

# We want to make a bar plot with region on the X axis
# and Percentage on the y-axis.
ggplot(intl, aes(x=Region, y=PercentOfIntl)) +
  geom_bar(stat="identity") + #
  geom_text(aes(label=PercentOfIntl))

#Make region ordered by decreasing percentage
intl <- transform(intl, Region = reorder(Region, -PercentOfIntl))
intl$PercentOfIntl = intl$PercentOfIntl * 100 #normalize
# Make the plot
ggplot(intl, aes(x=Region, y=PercentOfIntl)) +
geom_bar(stat="identity", fill="dark blue") +
geom_text(aes(label=PercentOfIntl), vjust=-0.4) +
ylab("Percent of International Students") +
theme(axis.title.x = element_blank(), axis.text.x = element_text(angle = 45, hjust = 1))

#World Map
intlall <- read.csv("week7/intlall.csv",stringsAsFactors=FALSE)
intlall[is.na(intlall)] = 0#replace NAs with 0
world_map <- map_data("world")
str(world_map)

## 'data.frame':    99338 obs. of  6 variables:
##  $ long     : num  -69.9 -69.9 -69.9 -70 -70.1 ...
##  $ lat      : num  12.5 12.4 12.4 12.5 12.5 ...
##  $ group    : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ order    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region   : chr  "Aruba" "Aruba" "Aruba" "Aruba" ...
##  $ subregion: chr  NA NA NA NA ...

#Merge data by region on worldmap and citizenship on intlall
world_map <- merge(world_map, intlall, by.x ="region", by.y = "Citizenship")
str(world_map)

## 'data.frame':    63634 obs. of  12 variables:
##  $ region         : chr  "Albania" "Albania" "Albania" "Albania" ...
##  $ long           : num  20.5 20.4 19.5 20.5 20.4 ...
##  $ lat            : num  41.3 39.8 42.5 40.1 41.5 ...
##  $ group          : num  6 6 6 6 6 6 6 6 6 6 ...
##  $ order          : int  789 822 870 815 786 821 818 779 879 795 ...
##  $ subregion      : chr  NA NA NA NA ...
##  $ UG             : num  3 3 3 3 3 3 3 3 3 3 ...
##  $ G              : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ SpecialUG      : num  0 0 0 0 0 0 0 0 0 0 ...
...

ggplot(world_map, aes(x=long, y=lat, group=group)) +
  geom_polygon(fill="white", color="black") +
  coord_map("mercator", xlim = c(-180,180))

# Reorder the data
world_map <- world_map[order(world_map$group, world_map$order),]

# Redo the plot
ggplot(world_map, aes(x=long, y=lat, group=group)) +
  geom_polygon(fill="white", color="black") +
  coord_map("mercator", xlim = c(-180,180))

intlall$Citizenship[intlall$Citizenship=="China (People's Republic Of)"] <- "China" #Fix China Name.
#Remerge and order
world_map <- merge(map_data("world"), intlall, 
                  by.x ="region",
                  by.y = "Citizenship")
world_map <- world_map[order(world_map$group, world_map$order),]

ggplot(world_map, aes(x=long, y=lat, group=group)) +
  geom_polygon(aes(fill=Total), color="black") +
  coord_map("mercator", xlim = c(-180,180))

#Other Projections
ggplot(world_map, aes(x=long, y=lat, group=group)) +
  geom_polygon(aes(fill=Total), color="black") +
  coord_map("ortho", orientation=c(20, 30, 0))

ggplot(world_map, aes(x=long, y=lat, group=group)) +
  geom_polygon(aes(fill=Total), color="black") +
  coord_map("ortho", orientation=c(-37, 175, 0))

### Line Charts

#Using Line Charts
households <- read.csv("week7/households.csv")
library(reshape2)
households[,1:2]#First 2 columns

##   Year MarriedWChild
## 1 1970          40.3
## 2 1980          30.9
## 3 1990          26.3
## 4 1995          25.5
## 5 2000          24.1
## 6 2005          22.9
## 7 2010          20.9
## 8 2012          19.6

head(melt(households, id="Year"))

##   Year      variable value
## 1 1970 MarriedWChild  40.3
## 2 1980 MarriedWChild  30.9
## 3 1990 MarriedWChild  26.3
## 4 1995 MarriedWChild  25.5
## 5 2000 MarriedWChild  24.1
## 6 2005 MarriedWChild  22.9

melt(households, id="Year")[1:10,3]

##  [1] 40.3 30.9 26.3 25.5 24.1 22.9 20.9 19.6 30.3 29.9

melt(households, id="Year")[1:10,]

##    Year       variable value
## 1  1970  MarriedWChild  40.3
## 2  1980  MarriedWChild  30.9
## 3  1990  MarriedWChild  26.3
## 4  1995  MarriedWChild  25.5
## 5  2000  MarriedWChild  24.1
## 6  2005  MarriedWChild  22.9
## 7  2010  MarriedWChild  20.9
## 8  2012  MarriedWChild  19.6
## 9  1970 MarriedWOChild  30.3
...

ggplot(melt(households, id="Year"),       
       aes(x=Year, y=value, color=variable)) +
  geom_line(size=2) + geom_point(size=5) +  
  ylab("Percentage of Households")

8.4 Assignment

8.4.1 Part 1 - Election Forecasting Revisited

8.4.1.1 Problem 1 - Drawing a Map of the US

library(caTools)
library(rpart)
library(rpart.plot)
library(ROCR)
library(caret)
library(maps)
library(ggmap)
library(tmaptools)
library(mapproj)
#1.1 How many different groups are there? 
statesMap <- map_data("state")
str(statesMap)

## 'data.frame':    15537 obs. of  6 variables:
##  $ long     : num  -87.5 -87.5 -87.5 -87.5 -87.6 ...
##  $ lat      : num  30.4 30.4 30.4 30.3 30.3 ...
##  $ group    : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ order    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region   : chr  "alabama" "alabama" "alabama" "alabama" ...
##  $ subregion: chr  NA NA NA NA ...

table(statesMap$group) #Ans - 63

## 
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16 
##  202  149  312  516   79   91   94   10  872  381  233  329  257  256  113  397 
##   17   18   19   20   21   22   23   24   25   26   27   28   29   30   31   32 
##  650  399  566   36  220   30  460  370  373  382  315  238  208   70  125  205 
##   33   34   35   36   37   38   39   40   41   42   43   44   45   46   47   48 
##   78   16  290   21  168   37  733   12  105  238  284  236  172   66  304  166 
##   49   50   51   52   53   54   55   56   57   58   59   60   61   62   63 
##  289 1088   59  129   96   15  623   17   17   19   44  448  373  388   68

#1.2
ggplot(statesMap, aes(x = long, y = lat, group = group)) + 
  geom_polygon(fill = "white", color = "black")

8.4.1.2 Problem 2 - Coloring the States by Predictions

#2.1
polling <- read.csv("week7/PollingImputed.csv")
Train <- subset(polling, Year == 2004 | Year==2008)
Test <- subset(polling, Year == 2012)

mod2 <- glm(Republican~SurveyUSA+DiffCount, data=Train, family="binomial")
TestPrediction <- predict(mod2, newdata=Test, type="response")
TestPredictionBinary <- as.numeric(TestPrediction > 0.5)

predictionDataFrame <- data.frame(TestPrediction, TestPredictionBinary, Test$State)
summary(predictionDataFrame$TestPredictionBinary)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.4889  1.0000  1.0000

#2.2
predictionDataFrame$region <- tolower(predictionDataFrame$Test.State)
predictionMap <- merge(statesMap, predictionDataFrame, by = "region")
predictionMap <- predictionMap[order(predictionMap$order),]#order

#2.4
ggplot(predictionMap, aes(x = long, y = lat, group = group, fill = TestPredictionBinary)) + 
  geom_polygon(color = "black")

#2.5
ggplot(predictionMap, aes(x = long, y = lat, group = group, fill = TestPredictionBinary))+ geom_polygon(color = "black") + 
  scale_fill_gradient(low = "blue", high = "red", guide = "legend", breaks= c(0,1), labels = c("Democrat", "Republican"), name = "Prediction 2012")

8.4.1.3 Problem 3 - Understanding the Predictions

#3.2 What was our predicted probability for the state of Florida?
tapply(predictionMap$TestPrediction,predictionMap$Test.State, mean)

##        Alabama         Alaska        Arizona       Arkansas     California 
##             NA             NA   9.739028e-01   9.994949e-01   9.261522e-05 
##       Colorado    Connecticut       Delaware        Florida        Georgia 
##   9.432967e-03   3.431627e-05             NA   9.640395e-01   9.901680e-01 
##         Hawaii          Idaho       Illinois        Indiana           Iowa 
##             NA   9.996372e-01   9.262188e-05   9.992970e-01   6.486672e-02 
##         Kansas       Kentucky      Louisiana          Maine       Maryland 
##   9.506137e-01   9.901659e-01   9.994949e-01   9.382536e-04   2.431748e-06 
##  Massachusetts       Michigan      Minnesota    Mississippi       Missouri 
##   1.236970e-07   1.770169e-05   4.843047e-04   9.325489e-01   9.990219e-01 
...

8.4.2 Part 2 - Visualizing Network Data

8.4.2.1 Problem 1 - Summarizing the Data

edges <- read.csv("week7/edges.csv")
users <- read.csv("week7/users.csv")
str(edges)

## 'data.frame':    146 obs. of  2 variables:
##  $ V1: int  4019 4023 4023 4027 3988 3982 3994 3998 3993 3982 ...
##  $ V2: int  4026 4031 4030 4032 4021 3986 3998 3999 3995 4021 ...

str(users)

## 'data.frame':    59 obs. of  4 variables:
##  $ id    : int  3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 ...
##  $ gender: Factor w/ 3 levels "","A","B": 2 3 3 3 3 3 2 3 3 2 ...
##  $ school: Factor w/ 3 levels "","A","AB": 2 1 1 1 1 2 1 1 2 1 ...
##  $ locale: Factor w/ 3 levels "","A","B": 3 3 3 3 3 3 2 3 3 2 ...

#1.2 - Out of all the students who listed a school, what was the most common locale?
table(users$locale)

## 
##     A  B 
##  3  6 50

#1.3 
table(users$gender)

## 
##     A  B 
##  2 15 42

8.4.2.2 Problem 2 - Creating a network

library(igraph)
g <- graph.data.frame(edges,FALSE,users)

#2.2
plot(g, vertex.size=5, vertex.label=NA)

#2.3 How many users are friends with 10 or more other Facebook users in this network?
table(degree(g) >=10)

## 
## FALSE  TRUE 
##    50     9

V(g)$size <- degree(g)/2+2 #Normalize node sie to friends
plot(g, vertex.label=NA)

summary(V(g)$size)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.000   2.500   3.500   4.475   6.000  11.000

8.4.2.3 Problem 3 - Coloring Vertices

V(g)$color <- "black"
#Color by gender
V(g)$color[V(g)$gender == "A"] <- "red"
V(g)$color[V(g)$gender == "B"] <- "gray"
plot(g, vertex.label=NA)

#Color by school
V(g)$color <- "black"
V(g)$color[V(g)$school == "A"] <- "red"
V(g)$color[V(g)$school == "AB"] <- "blue"
plot(g, vertex.label=NA)

#Color by locale
V(g)$color <- "black"
V(g)$color[V(g)$locale == "A"] <- "red"
V(g)$color[V(g)$locale == "B"] <- "blue"
plot(g, vertex.label=NA)