################################ #Visualizing Model Results in R# ##########Alex Kirss############ ################################ # Clear your R rm(list = ls()) # Some helpful packages you might not have already downloaded install.packages(c("dotwhisker", "margins", "forecast", "lfe", "gridExtra", "stargazer", "xtable")) # If you are a R n00b then you might need to install these also install.packages(c("foreign", "haven", "tidyverse", "dplyr", "broom")) # load packages library(foreign) library(haven) library(tidyverse) library(broom) library(dplyr) library(gridExtra) library(dotwhisker) library(margins) library(lfe) library(forecast) library(stargazer) library(xtable) # Load Data data<-read_dta("/Users/alexanderkirss/Desktop/Ongoing Projects/StatsWorkshops/181128ModelViz/togd.dta") # Today we will be using the "Other Guys Database" compiled by Emrich and Kirss # Data comprises all Senators in presidential election years from 1972-2016 from non-incumbent parties # Level of analysis is the "senator-opportunity" ############### #Linear Models# ############### # Run a simple linear model of presidential run as a function of age, party and foreign policy committee membership model1<-glm(PresRun ~ ForeignCom + Party + Age + AgeQuad, data = data) # We can present these results in tabular form using the 'stargazer' or 'xtable' package stargazer(model1) xtable(model1) # These are both customizable *to an extent* # Problems with tables: # 1) hard to interpret, especially with interaction terms # 2) not very flexible, only work with certain model classes, such as 'glm' models # Solution: PRESENT RESULTS VISUALLY # General process: # 1) "crack" your model open and extract the useful parameters # 2) "pack" these parameters into a new data frame # 3) use a graphical wrapper to present your new data frame # Let's do this with our linear model # "Crack" by extracting coefficients and standard errors # In RMarkdown you can often click on the model in the "Global Environment" box to inspect elements # Or use "View" function View(model1) # Extract the coefficients model1coef<- as.data.frame(c(model1[["coefficients"]])) # For standard errors and confidence intervals we want to crack the model summary summary(model1) View(summary(model1)) # Specifically we want the second column of the coefficient frame model1se1 <- summary(model1)$coefficients[, 2] # Alternatively calculate using the variance-covariance matrix model1se2 <- sqrt(diag(vcov(model1))) # ALWAYS CHECK THAT YOU'RE EXTRACTING THE RIGHT VALUES, I.E. THEY MATCH A TABULAR OUTPUT # We now need to "pack" these values into the same data frame # We want one column with variable names, one with coefficient, and one with standard error model1names <- row.names(model1coef) model1frame<- cbind(model1names, model1coef, model1se1) # To wrap these use the 'dotwhisker' package, which is based off of ggplot2 # Need a dataframe with following columns: term, estimate, standard error # "Term" is variable/model name, "estimate" is coefficient/marginal effect, "std.error" is standard error # Let's clean up our data frame and rename the columns names(model1frame) <- c("term", "estimate", "std.error") # And then plot model1plot<- dwplot(model1frame) model1plot # This looks kind of dumpy, so let's clean it up # Look at the 'dotwhisker' manual and vignette (https://cran.r-project.org/web/packages/dotwhisker/vignettes/dotwhisker-vignette.html) for ideas # Alternatively look at 'ggplot2' vignettes for general commands model1plot<- dwplot(model1frame) + xlab("Coefficient Estimate") + ylab("") + geom_vline(xintercept = 0, colour = "grey60", linetype = 2) + ggtitle("Model 1 Results") + theme(plot.title = element_text(face = "bold"), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank(), legend.position = "none") + scale_colour_grey(start = 0, end = 0) model1plot #EDITED: 11/30 ############################ # Thanks to Colin Emrich, there is a *far* easier way to crack and pack these models # Using the 'tidy' function from the 'broom' package leaves you with a similar data frame tidymodel1<-tidy(model1) # We can plot this the same way we did the previous data frame # Simple disgusting plot model1plotb<- dwplot(tidymodel1) model1plotb # Clean and beautiful plot model1plotb<- dwplot(tidymodel1) + xlab("Coefficient Estimate") + ylab("") + geom_vline(xintercept = 0, colour = "grey60", linetype = 2) + ggtitle("Model 1 Results") + theme(plot.title = element_text(face = "bold"), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank(), legend.position = "none") + scale_colour_grey(start = 0, end = 0) model1plotb ########################### #Comparing Multiple Models# ########################### # Sometimes we want to compare the results of multiple models, so let's run some more model2 <- glm(PresRun ~ ForeignCom + Party + Age + AgeQuad + Chamberseni, data = data) model3 <- glm(PresRun ~ ForeignCom + Party + Age + AgeQuad + AveragePresApproval, data = data) model4 <- glm(PresRun ~ ForeignCom + Party + Age + AgeQuad + Chamberseni + AveragePresApproval, data = data) # Crack and Pack these model2coef<- as.data.frame(c(model2[["coefficients"]])) model3coef<- as.data.frame(c(model3[["coefficients"]])) model4coef<- as.data.frame(c(model4[["coefficients"]])) model2se <- sqrt(diag(vcov(model2))) model3se <- sqrt(diag(vcov(model3))) model4se <- sqrt(diag(vcov(model4))) model2names <- row.names(model2coef) model3names <- row.names(model3coef) model4names <- row.names(model4coef) model2frame<- cbind(model2names, model2coef, model2se) model3frame<- cbind(model3names, model3coef, model3se) model4frame<- cbind(model4names, model4coef, model4se) names(model2frame) <- c("term", "estimate", "std.error") names(model3frame) <- c("term", "estimate", "std.error") names(model4frame) <- c("term", "estimate", "std.error") model2plot<- dwplot(model2frame) + xlab("Coefficient Estimate") + ylab("") + geom_vline(xintercept = 0, colour = "grey60", linetype = 2) + ggtitle("Model 2 Results") + theme(plot.title = element_text(face = "bold"), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank(), legend.position = "none") + scale_colour_grey(start = 0, end = 0) model2plot model3plot<- dwplot(model3frame) + xlab("Coefficient Estimate") + ylab("") + geom_vline(xintercept = 0, colour = "grey60", linetype = 2) + ggtitle("Model 3 Results") + theme(plot.title = element_text(face = "bold"), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank(), legend.position = "none") + scale_colour_grey(start = 0, end = 0) model3plot model4plot<- dwplot(model4frame) + xlab("Coefficient Estimate") + ylab("") + geom_vline(xintercept = 0, colour = "grey60", linetype = 2) + ggtitle("Model 4 Results") + theme(plot.title = element_text(face = "bold"), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank(), legend.position = "none") + scale_colour_grey(start = 0, end = 0) model4plot # Can present these plots together using 'gridExtra' package Figure1<- grid.arrange(model1plot, model2plot, model3plot, model4plot, ncol = 2, top = "Figure 1: Determinants of Presidential Candidacy") Figure1 # Maybe we want to pull out a single coefficient for comparison # Can either "filter" a data frame of full results, or build a data frame with only the coefficents you want # Here, we'll do the former, because you generally will want the full results in your appendix anyways forcom1<-filter(model1frame, term == "ForeignCom") %>% mutate(model = "Model 1") forcom2<-filter(model2frame, term == "ForeignCom") %>% mutate(model = "Model 2") forcom3<-filter(model3frame, term == "ForeignCom") %>% mutate(model = "Model 3") forcom4<-filter(model4frame, term == "ForeignCom") %>% mutate(model = "Model 4") forcomframe<-rbind(forcom1, forcom2, forcom3, forcom4) # Because we want to label by model, need to make THAT the term command forcomframe<- select(forcomframe, -term) %>% rename(term = model) forcomplot<- dwplot(forcomframe) + xlab("Coefficient Estimate") + ylab("") + geom_vline(xintercept = 0, colour = "grey60", linetype = 2) + ggtitle("Figure 2: Effect of SFRC Membership on Presidential Candidacy") + theme(plot.title = element_text(face = "bold"), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank(), legend.position = "none") + scale_colour_grey(start = 0, end = 0) forcomplot ################### #Non Linear Models# ################### # But WAIT...our DV is binary, so shouldn't we be using a nonlinear model? Sure! # Estimate nonlinear models model1a <-glm(PresRun ~ ForeignCom + Party + Age + AgeQuad, data = data, family = binomial(link = "logit")) model2a <- glm(PresRun ~ ForeignCom + Party + Age + AgeQuad + Chamberseni, data = data, family = binomial(link = "logit")) model3a <- glm(PresRun ~ ForeignCom + Party + Age + AgeQuad + AveragePresApproval, data = data, family = binomial(link = "logit")) model4a <- glm(PresRun ~ ForeignCom + Party + Age + AgeQuad + Chamberseni + AveragePresApproval, data = data, family = binomial(link = "logit")) # Crack model1acoef<- as.data.frame(c(model1a[["coefficients"]])) model2acoef<- as.data.frame(c(model2a[["coefficients"]])) model3acoef<- as.data.frame(c(model3a[["coefficients"]])) model4acoef<- as.data.frame(c(model4a[["coefficients"]])) model1ase <- sqrt(diag(vcov(model1a))) model2ase <- sqrt(diag(vcov(model2a))) model3ase <- sqrt(diag(vcov(model3a))) model4ase <- sqrt(diag(vcov(model4a))) model1anames <- row.names(model1acoef) model2anames <- row.names(model2acoef) model3anames <- row.names(model3acoef) model4anames <- row.names(model4acoef) # Pack model1aframe<- cbind(model1anames, model1acoef, model1ase) model2aframe<- cbind(model2anames, model2acoef, model2ase) model3aframe<- cbind(model3anames, model3acoef, model3ase) model4aframe<- cbind(model4anames, model4acoef, model4ase) names(model1aframe) <- c("term", "estimate", "std.error") names(model2aframe) <- c("term", "estimate", "std.error") names(model3aframe) <- c("term", "estimate", "std.error") names(model4aframe) <- c("term", "estimate", "std.error") # Here I just want the Foreign Committee Variable, labeled by model # forcom1a<-filter(model1aframe, term == "ForeignCom") %>% select(-term) %>% mutate(term = "Model 1") forcom2a<-filter(model2aframe, term == "ForeignCom") %>% select(-term) %>% mutate(term = "Model 2") forcom3a<-filter(model3aframe, term == "ForeignCom") %>% select(-term) %>% mutate(term = "Model 3") forcom4a<-filter(model4aframe, term == "ForeignCom") %>% select(-term) %>% mutate(term = "Model 4") forcomframe2<-rbind(forcom1a, forcom2a, forcom3a, forcom4a) # Plot forcomplot2<- dwplot(forcomframe2) + xlab("Coefficient Estimate") + ylab("") + geom_vline(xintercept = 0, colour = "grey60", linetype = 2) + ggtitle("Figure 3: Effect of SFRC Membership on Presidential Candidacy, Logit Models") + theme(plot.title = element_text(face = "bold"), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank(), legend.position = "none") + scale_colour_grey(start = 0, end = 0) forcomplot2 ##################### #Visualizing Margins# ##################### # Do YOU know how to interpret logit coefficients? I don't. # We can do a similar plot, however, with the marginal effects # Just crack, pack, and plot margins1<-margins(model1a) summary(margins1) # Trick is figuring out how to extract the parameters you want # Let's filter out the Foreign Com variable margins1<-filter(summary(margins(model1a)), factor == "ForeignCom") %>% mutate(term = "Model 1") margins2<-filter(summary(margins(model2a)), factor == "ForeignCom") %>% mutate(term = "Model 2") margins3<-filter(summary(margins(model3a)), factor == "ForeignCom") %>% mutate(term = "Model 3") margins4<-filter(summary(margins(model4a)), factor == "ForeignCom") %>% mutate(term = "Model 4") forcommargins<-rbind(margins1, margins2, margins3, margins4) %>% select(AME, SE, term) names(forcommargins) <- c("estimate", "std.error", "term") # Plot forcomplot3<- dwplot(forcommargins) + xlab("Coefficient Estimate") + ylab("") + geom_vline(xintercept = 0, colour = "grey60", linetype = 2) + ggtitle("Figure 4: AME of SFRC Membership on Presidential Candidacy, Logistic Models") + theme(plot.title = element_text(face = "bold"), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank(), legend.position = "none") + scale_colour_grey(start = 0, end = 0) forcomplot3 ###################### #Fixed Effects Models# ###################### # Things can get more complicated with higher level link functions # The procedure, however, remains the same: Crack, Pack and Plot # Let's run some fixed effects models, clustering by senator model1b <-felm(PresRun ~ ForeignCom + Party + Age + AgeQuad | ICPSR, data = data) model2b <- felm(PresRun ~ ForeignCom + Party + Age + AgeQuad + Chamberseni | ICPSR, data = data) model3b <- felm(PresRun ~ ForeignCom + Party + Age + AgeQuad + AveragePresApproval | ICPSR, data = data) model4b <- felm(PresRun ~ ForeignCom + Party + Age + AgeQuad + Chamberseni + AveragePresApproval | ICPSR, data = data) # These are NOT pure glm objects, so cracking and packing is different model1bcoef<- as.data.frame(c(model1b[["coefficients"]])) model2bcoef<- as.data.frame(c(model2b[["coefficients"]])) model3bcoef<- as.data.frame(c(model3b[["coefficients"]])) model4bcoef<- as.data.frame(c(model4b[["coefficients"]])) # e.g. already have standard errors calculated model1bse <- as.data.frame(c(model1b[["se"]])) model2bse <- as.data.frame(c(model2b[["se"]])) model3bse <- as.data.frame(c(model3b[["se"]])) model4bse <- as.data.frame(c(model4b[["se"]])) # Harder to extract the variable names, so build them from scratch model1bnames <- c("ForeignCom", "Party", "Age", "AgeQuad") model2bnames <- c("ForeignCom", "Party", "Age", "AgeQuad", "Chamberseni") model3bnames <- c("ForeignCom", "Party", "Age", "AgeQuad", "AveragePresApproval") model4bnames <- c("ForeignCom", "Party", "Age", "AgeQuad", "Chamberseni", "AveragePresApproval") # Pack model1bframe<- cbind(model1bcoef, model1bse, model1bnames) model2bframe<- cbind(model2bcoef, model2bse, model2bnames) model3bframe<- cbind(model3bcoef, model3bse, model3bnames) model4bframe<- cbind(model4bcoef, model4bse, model4bnames) names(model1bframe) <- c("estimate", "std.error", "term") names(model2bframe) <- c("estimate", "std.error", "term") names(model3bframe) <- c("estimate", "std.error", "term") names(model4bframe) <- c("estimate", "std.error", "term") model1bforcom<- filter(model1bframe, term == "ForeignCom") %>% mutate(model = "Model 1") model2bforcom<- filter(model2bframe, term == "ForeignCom") %>% mutate(model = "Model 2") model3bforcom<- filter(model3bframe, term == "ForeignCom") %>% mutate(model = "Model 3") model4bforcom<- filter(model4bframe, term == "ForeignCom") %>% mutate(model = "Model 4") forcomfe<- rbind(model1bforcom, model2bforcom, model3bforcom, model4bforcom) %>% select(-term) %>% rename(term = model) # Plot forcomplot4<- dwplot(forcomfe) + xlab("Coefficient Estimate") + ylab("") + geom_vline(xintercept = 0, colour = "grey60", linetype = 2) + ggtitle("Figure 5: Effect of SFRC Membership on Presidential Candidacy, FE Models") + theme(plot.title = element_text(face = "bold"), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank(), legend.position = "none") + scale_colour_grey(start = 0, end = 0) forcomplot4 ################################# #Comparing Different Model Types# ################################# # What if we want to compare different model types? # Need all parameters in the same data frame, but categorized by both model number and link function # Let's compare OLS and FE specification # Re-categorize both frames individually forcomframeA<- rename(forcomframe, model = term) %>% mutate(term = "OLS") forcomfeA<- rename(forcomfe, model = term) %>% mutate(term = "FE") comparison<-rbind(forcomframeA, forcomfeA) # Tweak the plot slightly to differentiate models forcomplot5<- dwplot(comparison) + xlab("Coefficient Estimate") + ylab("") + geom_vline(xintercept = 0, colour = "grey60", linetype = 2) + ggtitle("Figure 6: Effect of SFRC Membership, OLS vs. FE Models") + theme(plot.title = element_text(face = "bold"), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank()) + scale_colour_grey(start = 0, end = .75) forcomplot5 ############################## #More Complicated Model Types# ############################## # What about even MORE complicated model types? # Just requires more effort to crack and pack # Example with new data: let's calculate some auto-regressive integrated moving average (ARIMA) models # Model stock price returns as a function of impending threat of war in Europe, 1930-1941 # Data: DV is weekly stock price data for five Dow companies from CRSP at Wharton, IV is measure of war threat from Chadefaux (2014) dow <- read_dta("/Users/alexanderkirss/Desktop/Ongoing Projects/StatsWorkshops/181128ModelViz/DowExample.dta") View(dow) # Split the data into separate time series splitdata <- split(dow, dow$numcorp) # Calculate ARIMA models set.seed(21586634) arima67 <- auto.arima(splitdata[["67"]]$lnavgret, xreg = splitdata[["67"]]$news_lag) arima112 <- auto.arima(splitdata[["112"]]$lnavgret, xreg = splitdata[["112"]]$news_lag) arima145 <- auto.arima(splitdata[["145"]]$lnavgret, xreg = splitdata[["145"]]$news_lag) arima353 <- auto.arima(splitdata[["353"]]$lnavgret, xreg = splitdata[["353"]]$news_lag) arima530 <- auto.arima(splitdata[["530"]]$lnavgret, xreg = splitdata[["530"]]$news_lag) # These are WEIRD outputs summary(arima67) View(arima67) # Extract Standard Errors arima.se67 <- sqrt(diag(vcov(arima67)))[4] arima.se112 <- sqrt(diag(vcov(arima112)))[2] arima.se145 <- sqrt(diag(vcov(arima145)))[2] arima.se353 <- sqrt(diag(vcov(arima353)))[2] arima.se530 <- sqrt(diag(vcov(arima530)))[2] # Bind Coefs and SEs arima_coef67 <- cbind(67, arima67[["coef"]][["xreg"]], arima.se67) arima_coef112 <- cbind(112, arima112[["coef"]][["xreg"]], arima.se112) arima_coef145 <- cbind(145, arima145[["coef"]][["xreg"]], arima.se145) arima_coef353 <- cbind(353, arima353[["coef"]][["xreg"]], arima.se353) arima_coef530 <- cbind(530, arima530[["coef"]][["xreg"]], arima.se530) arima_coefsfinal<- as.data.frame(rbind(arima_coef67, arima_coef112, arima_coef145, arima_coef353, arima_coef530)) names(arima_coefsfinal) <- c("numcorp", "estimate", "std.error") corpnames<-c("Bethlehem Steel Corp", "Coca Cola Co", "Du Pont", "Sears Roebuck & Co", "Goodyear Tire & Rubber") arima_final<-as.data.frame(cbind(arima_coefsfinal, corpnames)) %>% rename(term = corpnames) # Plot dowplot <- dwplot(arima_final, dodge_size = 0) + xlab("Coefficient Estimate") + ylab("") + geom_vline(xintercept = 0, colour = "grey60", linetype = 2) + ggtitle("Effect of Conflict Tensions on Stock Price: 1930-1941") + theme(plot.title = element_text(face = "bold"), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank(), legend.position = "none") + scale_colour_grey(start = 0, end = 0) dowplot # The possibilities are essentially endless # Be sure to look at the reference manuals for these packages for more info, especially for 'dotwhisker' and 'gridExtra' # Some model types are easier to run in Stata, for instance fixed effects logit # We'll likely cover visualizing models in Stata in a future session