################################
#Visualizing Model Results in R#
##########Alex Kirss############
################################

# Clear your R
rm(list = ls())

# Some helpful packages you might not have already downloaded
install.packages(c("dotwhisker", "margins", "forecast", "lfe", "gridExtra", "stargazer", "xtable"))

# If you are a R n00b then you might need to install these also
install.packages(c("foreign", "haven", "tidyverse", "dplyr", "broom"))

# load packages
library(foreign)
library(haven)
library(tidyverse)
library(broom)
library(dplyr)
library(gridExtra)
library(dotwhisker)
library(margins)
library(lfe)
library(forecast)
library(stargazer)
library(xtable)


# Load Data
data<-read_dta("/Users/alexanderkirss/Desktop/Ongoing Projects/StatsWorkshops/181128ModelViz/togd.dta")

# Today we will be using the "Other Guys Database" compiled by Emrich and Kirss
# Data comprises all Senators in presidential election years from 1972-2016 from non-incumbent parties
# Level of analysis is the "senator-opportunity"


###############
#Linear Models#
###############

# Run a simple linear model of presidential run as a function of age, party and foreign policy committee membership

model1<-glm(PresRun ~ ForeignCom + Party + Age + AgeQuad, data = data)

# We can present these results in tabular form using the 'stargazer' or 'xtable' package

stargazer(model1)
xtable(model1)

# These are both customizable *to an extent*

# Problems with tables: 
# 1) hard to interpret, especially with interaction terms
# 2) not very flexible, only work with certain model classes, such as 'glm' models

# Solution: PRESENT RESULTS VISUALLY

# General process: 
# 1) "crack" your model open and extract the useful parameters
# 2) "pack" these parameters into a new data frame
# 3) use a graphical wrapper to present your new data frame

# Let's do this with our linear model

# "Crack" by extracting coefficients and standard errors
# In RMarkdown you can often click on the model in the "Global Environment" box to inspect elements
# Or use "View" function

View(model1)

# Extract the coefficients

model1coef<- as.data.frame(c(model1[["coefficients"]]))

# For standard errors and confidence intervals we want to crack the model summary

summary(model1)
View(summary(model1))

# Specifically we want the second column of the coefficient frame

model1se1 <- summary(model1)$coefficients[, 2]

# Alternatively calculate using the variance-covariance matrix

model1se2 <- sqrt(diag(vcov(model1)))


# ALWAYS CHECK THAT YOU'RE EXTRACTING THE RIGHT VALUES, I.E. THEY MATCH A TABULAR OUTPUT

# We now need to "pack" these values into the same data frame
# We want one column with variable names, one with coefficient, and one with standard error

model1names <- row.names(model1coef)
model1frame<- cbind(model1names, model1coef, model1se1)

# To wrap these use the 'dotwhisker' package, which is based off of ggplot2
# Need a dataframe with following columns: term, estimate, standard error
# "Term" is variable/model name, "estimate" is coefficient/marginal effect, "std.error" is standard error
# Let's clean up our data frame and rename the columns

names(model1frame) <- c("term", "estimate", "std.error")

# And then plot

model1plot<- dwplot(model1frame)
model1plot

# This looks kind of dumpy, so let's clean it up
# Look at the 'dotwhisker' manual and vignette (https://cran.r-project.org/web/packages/dotwhisker/vignettes/dotwhisker-vignette.html) for ideas
# Alternatively look at 'ggplot2' vignettes for general commands

model1plot<- dwplot(model1frame) + xlab("Coefficient Estimate") + ylab("") + geom_vline(xintercept = 0, colour = "grey60", linetype = 2) + 
  ggtitle("Model 1 Results") + 
  theme(plot.title = element_text(face = "bold"), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank(), legend.position = "none") + 
  scale_colour_grey(start = 0, end = 0)
model1plot


#EDITED: 11/30
############################

# Thanks to Colin Emrich, there is a *far* easier way to crack and pack these models
# Using the 'tidy' function from the 'broom' package leaves you with a similar data frame

tidymodel1<-tidy(model1)

# We can plot this the same way we did the previous data frame

# Simple disgusting plot

model1plotb<- dwplot(tidymodel1)
model1plotb

# Clean and beautiful plot

model1plotb<- dwplot(tidymodel1) + xlab("Coefficient Estimate") + ylab("") + geom_vline(xintercept = 0, colour = "grey60", linetype = 2) + 
  ggtitle("Model 1 Results") + 
  theme(plot.title = element_text(face = "bold"), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank(), legend.position = "none") + 
  scale_colour_grey(start = 0, end = 0)
model1plotb


###########################
#Comparing Multiple Models#
###########################

# Sometimes we want to compare the results of multiple models, so let's run some more

model2 <- glm(PresRun ~ ForeignCom + Party + Age + AgeQuad + Chamberseni, data = data)
model3 <- glm(PresRun ~ ForeignCom + Party + Age + AgeQuad + AveragePresApproval, data = data)
model4 <- glm(PresRun ~ ForeignCom + Party + Age + AgeQuad + Chamberseni + AveragePresApproval, data = data)

# Crack and Pack these
model2coef<- as.data.frame(c(model2[["coefficients"]]))
model3coef<- as.data.frame(c(model3[["coefficients"]]))
model4coef<- as.data.frame(c(model4[["coefficients"]]))

model2se <- sqrt(diag(vcov(model2)))
model3se <- sqrt(diag(vcov(model3)))
model4se <- sqrt(diag(vcov(model4)))

model2names <- row.names(model2coef)
model3names <- row.names(model3coef)
model4names <- row.names(model4coef)

model2frame<- cbind(model2names, model2coef, model2se)
model3frame<- cbind(model3names, model3coef, model3se)
model4frame<- cbind(model4names, model4coef, model4se)

names(model2frame) <- c("term", "estimate", "std.error")
names(model3frame) <- c("term", "estimate", "std.error")
names(model4frame) <- c("term", "estimate", "std.error")

model2plot<- dwplot(model2frame) + xlab("Coefficient Estimate") + ylab("") + geom_vline(xintercept = 0, colour = "grey60", linetype = 2) + 
  ggtitle("Model 2 Results") + 
  theme(plot.title = element_text(face = "bold"), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank(), legend.position = "none") + 
  scale_colour_grey(start = 0, end = 0)
model2plot

model3plot<- dwplot(model3frame) + xlab("Coefficient Estimate") + ylab("") + geom_vline(xintercept = 0, colour = "grey60", linetype = 2) + 
  ggtitle("Model 3 Results") + 
  theme(plot.title = element_text(face = "bold"), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank(), legend.position = "none") + 
  scale_colour_grey(start = 0, end = 0)
model3plot

model4plot<- dwplot(model4frame) + xlab("Coefficient Estimate") + ylab("") + geom_vline(xintercept = 0, colour = "grey60", linetype = 2) + 
  ggtitle("Model 4 Results") + 
  theme(plot.title = element_text(face = "bold"), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank(), legend.position = "none") + 
  scale_colour_grey(start = 0, end = 0)
model4plot

# Can present these plots together using 'gridExtra' package

Figure1<- grid.arrange(model1plot, model2plot, model3plot, model4plot, ncol = 2,
                        top = "Figure 1: Determinants of Presidential Candidacy")
Figure1

# Maybe we want to pull out a single coefficient for comparison
# Can either "filter" a data frame of full results, or build a data frame with only the coefficents you want
# Here, we'll do the former, because you generally will want the full results in your appendix anyways

forcom1<-filter(model1frame, term == "ForeignCom") %>% 
  mutate(model = "Model 1")
forcom2<-filter(model2frame, term == "ForeignCom") %>% 
  mutate(model = "Model 2")
forcom3<-filter(model3frame, term == "ForeignCom") %>% 
  mutate(model = "Model 3")
forcom4<-filter(model4frame, term == "ForeignCom") %>% 
  mutate(model = "Model 4")

forcomframe<-rbind(forcom1, forcom2, forcom3, forcom4)

# Because we want to label by model, need to make THAT the term command

forcomframe<- select(forcomframe, -term) %>%
  rename(term = model)

forcomplot<- dwplot(forcomframe) + xlab("Coefficient Estimate") + ylab("") + geom_vline(xintercept = 0, colour = "grey60", linetype = 2) + 
  ggtitle("Figure 2: Effect of SFRC Membership on Presidential Candidacy") + 
  theme(plot.title = element_text(face = "bold"), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank(), legend.position = "none") + 
  scale_colour_grey(start = 0, end = 0)
forcomplot

###################
#Non Linear Models#
###################

# But WAIT...our DV is binary, so shouldn't we be using a nonlinear model? Sure!
# Estimate nonlinear models

model1a <-glm(PresRun ~ ForeignCom + Party + Age + AgeQuad, data = data, family = binomial(link = "logit"))
model2a <- glm(PresRun ~ ForeignCom + Party + Age + AgeQuad + Chamberseni, data = data, family = binomial(link = "logit"))
model3a <- glm(PresRun ~ ForeignCom + Party + Age + AgeQuad + AveragePresApproval, data = data, family = binomial(link = "logit"))
model4a <- glm(PresRun ~ ForeignCom + Party + Age + AgeQuad + Chamberseni + AveragePresApproval, data = data, family = binomial(link = "logit"))

# Crack

model1acoef<- as.data.frame(c(model1a[["coefficients"]]))
model2acoef<- as.data.frame(c(model2a[["coefficients"]]))
model3acoef<- as.data.frame(c(model3a[["coefficients"]]))
model4acoef<- as.data.frame(c(model4a[["coefficients"]]))

model1ase <- sqrt(diag(vcov(model1a)))
model2ase <- sqrt(diag(vcov(model2a)))
model3ase <- sqrt(diag(vcov(model3a)))
model4ase <- sqrt(diag(vcov(model4a)))

model1anames <- row.names(model1acoef)
model2anames <- row.names(model2acoef)
model3anames <- row.names(model3acoef)
model4anames <- row.names(model4acoef)

# Pack

model1aframe<- cbind(model1anames, model1acoef, model1ase)
model2aframe<- cbind(model2anames, model2acoef, model2ase)
model3aframe<- cbind(model3anames, model3acoef, model3ase)
model4aframe<- cbind(model4anames, model4acoef, model4ase)

names(model1aframe) <- c("term", "estimate", "std.error")
names(model2aframe) <- c("term", "estimate", "std.error")
names(model3aframe) <- c("term", "estimate", "std.error")
names(model4aframe) <- c("term", "estimate", "std.error")

# Here I just want the Foreign Committee Variable, labeled by model #

forcom1a<-filter(model1aframe, term == "ForeignCom") %>% 
	select(-term) %>%
	mutate(term = "Model 1")
forcom2a<-filter(model2aframe, term == "ForeignCom") %>% 
  select(-term) %>%
  mutate(term = "Model 2")
forcom3a<-filter(model3aframe, term == "ForeignCom") %>% 
	select(-term) %>%
	mutate(term = "Model 3")
forcom4a<-filter(model4aframe, term == "ForeignCom") %>% 
	select(-term) %>%
	mutate(term = "Model 4")

forcomframe2<-rbind(forcom1a, forcom2a, forcom3a, forcom4a)

# Plot 

forcomplot2<- dwplot(forcomframe2) + xlab("Coefficient Estimate") + ylab("") + geom_vline(xintercept = 0, colour = "grey60", linetype = 2) + 
  ggtitle("Figure 3: Effect of SFRC Membership on Presidential Candidacy, Logit Models") + 
  theme(plot.title = element_text(face = "bold"), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank(), legend.position = "none") + 
  scale_colour_grey(start = 0, end = 0)
forcomplot2

#####################
#Visualizing Margins#
#####################

# Do YOU know how to interpret logit coefficients? I don't.
# We can do a similar plot, however, with the marginal effects
# Just crack, pack, and plot

margins1<-margins(model1a)
summary(margins1)

# Trick is figuring out how to extract the parameters you want
# Let's filter out the Foreign Com variable

margins1<-filter(summary(margins(model1a)), factor == "ForeignCom") %>% 
  mutate(term = "Model 1")
margins2<-filter(summary(margins(model2a)), factor == "ForeignCom") %>% 
  mutate(term = "Model 2")
margins3<-filter(summary(margins(model3a)), factor == "ForeignCom") %>% 
  mutate(term = "Model 3")
margins4<-filter(summary(margins(model4a)), factor == "ForeignCom") %>% 
  mutate(term = "Model 4")

forcommargins<-rbind(margins1, margins2, margins3, margins4) %>% 
  select(AME, SE, term)
names(forcommargins) <- c("estimate", "std.error", "term")

# Plot

forcomplot3<- dwplot(forcommargins) + xlab("Coefficient Estimate") + ylab("") + geom_vline(xintercept = 0, colour = "grey60", linetype = 2) + 
  ggtitle("Figure 4: AME of SFRC Membership on Presidential Candidacy, Logistic Models") + 
  theme(plot.title = element_text(face = "bold"), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank(), legend.position = "none") + 
  scale_colour_grey(start = 0, end = 0)
forcomplot3

######################
#Fixed Effects Models#
######################

# Things can get more complicated with higher level link functions
# The procedure, however, remains the same: Crack, Pack and Plot

# Let's run some fixed effects models, clustering by senator

model1b <-felm(PresRun ~ ForeignCom + Party + Age + AgeQuad | ICPSR, data = data)
model2b <- felm(PresRun ~ ForeignCom + Party + Age + AgeQuad + Chamberseni | ICPSR, data = data)
model3b <- felm(PresRun ~ ForeignCom + Party + Age + AgeQuad + AveragePresApproval | ICPSR, data = data)
model4b <- felm(PresRun ~ ForeignCom + Party + Age + AgeQuad + Chamberseni + AveragePresApproval | ICPSR, data = data)

# These are NOT pure glm objects, so cracking and packing is different

model1bcoef<- as.data.frame(c(model1b[["coefficients"]]))
model2bcoef<- as.data.frame(c(model2b[["coefficients"]]))
model3bcoef<- as.data.frame(c(model3b[["coefficients"]]))
model4bcoef<- as.data.frame(c(model4b[["coefficients"]]))

# e.g. already have standard errors calculated

model1bse <- as.data.frame(c(model1b[["se"]]))
model2bse <- as.data.frame(c(model2b[["se"]]))
model3bse <- as.data.frame(c(model3b[["se"]]))
model4bse <- as.data.frame(c(model4b[["se"]]))

# Harder to extract the variable names, so build them from scratch

model1bnames <- c("ForeignCom", "Party", "Age", "AgeQuad")
model2bnames <- c("ForeignCom", "Party", "Age", "AgeQuad", "Chamberseni")
model3bnames <- c("ForeignCom", "Party", "Age", "AgeQuad", "AveragePresApproval")
model4bnames <- c("ForeignCom", "Party", "Age", "AgeQuad", "Chamberseni", "AveragePresApproval")

# Pack

model1bframe<- cbind(model1bcoef, model1bse, model1bnames)
model2bframe<- cbind(model2bcoef, model2bse, model2bnames)
model3bframe<- cbind(model3bcoef, model3bse, model3bnames)
model4bframe<- cbind(model4bcoef, model4bse, model4bnames)

names(model1bframe) <- c("estimate", "std.error", "term")
names(model2bframe) <- c("estimate", "std.error", "term")
names(model3bframe) <- c("estimate", "std.error", "term")
names(model4bframe) <- c("estimate", "std.error", "term")

model1bforcom<- filter(model1bframe, term == "ForeignCom") %>%
  mutate(model = "Model 1")
model2bforcom<- filter(model2bframe, term == "ForeignCom") %>%
  mutate(model = "Model 2")
model3bforcom<- filter(model3bframe, term == "ForeignCom") %>%
  mutate(model = "Model 3")
model4bforcom<- filter(model4bframe, term == "ForeignCom") %>%
  mutate(model = "Model 4")

forcomfe<- rbind(model1bforcom, model2bforcom, model3bforcom, model4bforcom) %>%
  select(-term) %>%
  rename(term = model)

# Plot

forcomplot4<- dwplot(forcomfe) + xlab("Coefficient Estimate") + ylab("") + geom_vline(xintercept = 0, colour = "grey60", linetype = 2) + 
  ggtitle("Figure 5: Effect of SFRC Membership on Presidential Candidacy, FE Models") + 
  theme(plot.title = element_text(face = "bold"), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank(), legend.position = "none") + 
  scale_colour_grey(start = 0, end = 0)
forcomplot4

#################################
#Comparing Different Model Types#
#################################

# What if we want to compare different model types?
# Need all parameters in the same data frame, but categorized by both model number and link function

# Let's compare OLS and FE specification

# Re-categorize both frames individually

forcomframeA<- rename(forcomframe, model = term) %>%
  mutate(term = "OLS")
forcomfeA<- rename(forcomfe, model = term) %>%
  mutate(term = "FE")

comparison<-rbind(forcomframeA, forcomfeA)

# Tweak the plot slightly to differentiate models

forcomplot5<- dwplot(comparison) + xlab("Coefficient Estimate") + ylab("") + geom_vline(xintercept = 0, colour = "grey60", linetype = 2) + 
  ggtitle("Figure 6: Effect of SFRC Membership, OLS vs. FE Models") + 
  theme(plot.title = element_text(face = "bold"), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank()) + 
  scale_colour_grey(start = 0, end = .75)
forcomplot5

##############################
#More Complicated Model Types#
##############################

# What about even MORE complicated model types?
# Just requires more effort to crack and pack

# Example with new data: let's calculate some auto-regressive integrated moving average (ARIMA) models
# Model stock price returns as a function of impending threat of war in Europe, 1930-1941
# Data: DV is weekly stock price data for five Dow companies from CRSP at Wharton, IV is measure of war threat from Chadefaux (2014)

dow <- read_dta("/Users/alexanderkirss/Desktop/Ongoing Projects/StatsWorkshops/181128ModelViz/DowExample.dta")
View(dow)

# Split the data into separate time series
splitdata <- split(dow, dow$numcorp)

# Calculate ARIMA models

set.seed(21586634)
arima67 <- auto.arima(splitdata[["67"]]$lnavgret, xreg = splitdata[["67"]]$news_lag)
arima112 <- auto.arima(splitdata[["112"]]$lnavgret, xreg = splitdata[["112"]]$news_lag)
arima145 <- auto.arima(splitdata[["145"]]$lnavgret, xreg = splitdata[["145"]]$news_lag)
arima353 <- auto.arima(splitdata[["353"]]$lnavgret, xreg = splitdata[["353"]]$news_lag)
arima530 <- auto.arima(splitdata[["530"]]$lnavgret, xreg = splitdata[["530"]]$news_lag)

# These are WEIRD outputs

summary(arima67)
View(arima67)

# Extract Standard Errors

arima.se67 <- sqrt(diag(vcov(arima67)))[4]
arima.se112 <- sqrt(diag(vcov(arima112)))[2]
arima.se145 <- sqrt(diag(vcov(arima145)))[2]
arima.se353 <- sqrt(diag(vcov(arima353)))[2]
arima.se530 <- sqrt(diag(vcov(arima530)))[2]

# Bind Coefs and SEs

arima_coef67 <- cbind(67, arima67[["coef"]][["xreg"]], arima.se67)
arima_coef112 <- cbind(112, arima112[["coef"]][["xreg"]], arima.se112)
arima_coef145 <- cbind(145, arima145[["coef"]][["xreg"]], arima.se145)
arima_coef353 <- cbind(353, arima353[["coef"]][["xreg"]], arima.se353)
arima_coef530 <- cbind(530, arima530[["coef"]][["xreg"]], arima.se530)

arima_coefsfinal<- as.data.frame(rbind(arima_coef67, arima_coef112, arima_coef145, arima_coef353, arima_coef530))
names(arima_coefsfinal) <- c("numcorp", "estimate", "std.error")

corpnames<-c("Bethlehem Steel Corp", "Coca Cola Co", "Du Pont", "Sears Roebuck & Co", "Goodyear Tire & Rubber")
arima_final<-as.data.frame(cbind(arima_coefsfinal, corpnames)) %>%
  rename(term = corpnames)

# Plot

dowplot <- dwplot(arima_final, dodge_size = 0) + xlab("Coefficient Estimate") + ylab("") + 
  geom_vline(xintercept = 0, colour = "grey60", linetype = 2) + ggtitle("Effect of Conflict Tensions on Stock Price: 1930-1941") + 
  theme(plot.title = element_text(face = "bold"), panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(), panel.background = element_blank(), legend.position = "none") + 
  scale_colour_grey(start = 0, end = 0)
dowplot

# The possibilities are essentially endless
# Be sure to look at the reference manuals for these packages for more info, especially for 'dotwhisker' and 'gridExtra'
# Some model types are easier to run in Stata, for instance fixed effects logit
# We'll likely cover visualizing models in Stata in a future session