# Analysis R Us: A Journey Into the Wonders of R
# 10/24/2018
# Presented by Alexes Beck and Kirss
# Adapted from "Introduction to R" by Samer Anabtawi, Colin Emrich, and
# Jack Hasler

######################
#Basics for Beginners#
######################

# You can use R as a calculator
1 + 1

# Don't use R as a calculator. Only professional R users and serial killers
# use R as a calculator (and you're probably neither). You can also make data
# frames in R, but most of the time you will be drawing on existing data sets.
# Since the point of this seminar is to focus on functionality and playing
# with data, we'll be working with an existing data set from here onwards.
# For some more on the basics, see a resource such as:
# http://stat545.com/block002_hello-r-workspace-wd-project.html

##### Differences between R and Stata
## The core functions of R include a lot less than Stata. Instead, we install
## and import the packages we need as necessary. Installations only have to
## happen once. Importing must take place every time.

## Set working directory
# Edit this path so it points at the folder where you saved the workshop files.
setwd("C:/Users/alexr/Desktop/Methods Fellow/Methods Workshop")

## Install packages
# Only packages that are not installed yet are downloaded, so re-running this
# script does not reinstall everything each time.
workshop_packages <- c(
  "readr", "tidyverse", "ggThemeAssist", "haven", "foreign", "stargazer",
  "xtable"
)
missing_packages <- setdiff(workshop_packages, rownames(installed.packages()))
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

## Load packages (this has to happen in every new R session)
library(readr)
library(tidyverse)
library(ggThemeAssist)
library(haven)
library(foreign)
library(xtable)
library(stargazer)

## Load data from a .csv
auto_data <- read_csv(
  "C:/Users/alexr/Desktop/Methods Fellow/Methods Workshop/auto_data.csv"
)
View(auto_data)

# Data in a different format?
# The 'haven' and 'foreign' packages have commands for importing different
# types of data. For instance, the "read.dta" function from 'foreign' and
# "read_dta" from 'haven' can both import Stata files.
# (read.dta only understands older Stata file formats -- see ?read.dta -- so
# read_dta is usually the safer choice for recently saved files.)
auto_data2 <- read.dta(
  "C:/Users/alexr/Desktop/Methods Fellow/Methods Workshop/auto_data.dta"
)
auto_data3 <- read_dta(
  "C:/Users/alexr/Desktop/Methods Fellow/Methods Workshop/auto_data.dta"
)

## Making and manipulating objects in R
# As you can hopefully see, one of the main differences between R and Stata is
# that you can load *multiple* objects at once.
# In Stata we can only have one data frame at a time; in R we can have many.
# An "object" in R can be virtually anything: a data frame, a vector/list,
# model results, a function.
# Assign objects using an arrow "<-" like we've done with the data.
# Let's create a vector that tells us the truth, that Alexes are great.
# We use the "combine" function, denoted by 'c()'.
alexes <- c("great")
View(alexes)

## R, like Stata, is a VERY finicky language...make sure that you close
## parentheses, quotation marks, etc.
## Stray commas or capitalization errors and your code won't run.
## When naming objects, you can write in a variety of ways: snake_case,
## per.iods, or CamelCase are all fine.
alexes_are <- c("great")
alexes.are <- c("great")
AlexesAre <- c("great")

# Want to remove objects? Use the "rm" command
rm(alexes, alexes_are, alexes.are, AlexesAre)

# Want to remove EVERYTHING in your workspace? Use the line below.
# Note: it deletes every object (including auto_data) but does NOT unload
# packages attached with library(). It is left commented out here because the
# rest of this script still needs auto_data.
# rm(list = ls())

################################
#Cleaning and Manipulating Data#
################################

# Let's look at our data
View(auto_data)

# Not sure if your object is a data frame?
# Turn it into one with as.data.frame()
auto_data <- as.data.frame(auto_data)

## Basic summary statistics
dim(auto_data)
summary(auto_data)

## Hadley Wickham's 'tidyverse' package is one of the newest (and best) ways
## to clean and manipulate data in R.
# His book, "R for Data Science", is incredibly helpful, and available online
# FOR FREE: http://r4ds.had.co.nz/
# We'll show you just a few basic functions here from the 'dplyr' package:
# 'mutate', 'rename', 'select', 'arrange', and 'filter'.

# Use 'mutate' to create new variables that are based on manipulations of
# current variables,
# e.g. create a new variable, "lwratio", that is equal to a car's
# length-to-weight ratio.
auto_data <- mutate(auto_data, lwratio = length / weight)

# Can rename variables using 'rename' (new_name = old_name)
auto_data <- rename(auto_data, lw_ratio = lwratio)

# Want to drop variables? Use the 'select' function to either choose which
# variables to keep or which ones to drop.
# Drop using -varname
auto_data <- select(auto_data, -lw_ratio)
# Keep using varname
auto_data <- select(
  auto_data,
  man, price, mpg, rep78, headroom, trunk, weight, length, turn,
  displacement, gear_ratio, foreign
)

# Want to sort rows by the values of a particular variable? Use 'arrange',
# e.g. order by ascending weight.
auto_data <- arrange(auto_data, weight)
# What about descending?
auto_data <- arrange(auto_data, desc(weight))

# Want to subset data by particular values or delete particular rows? Use
# 'filter'.
# For instance, let's create a dataset of *only* domestic cars.
domestic <- filter(auto_data, foreign == "Domestic")

## USING THE "PIPE"
# The "pipe" is a special operator, %>%, that allows you to link together
# multiple functions in a chain.
# This cleans up your code tremendously...it allows you to avoid repeating the
# data frame in each line.
# What if I wanted to filter only domestic vehicles and drop the now useless
# "foreign" variable at the same time?
domestic <- auto_data %>%
  filter(foreign == "Domestic") %>%
  select(-foreign)

## More Complicated Mutations
# Oftentimes you want to do more than simply create new variables from old
# variables. Combining a diverse set of tools such as "ifelse" and "grepl" can
# get you there.
# If-else basically runs a test for a condition: if the condition is true it
# does one thing, if false something else.
# For instance, I want to create a dummy variable "heavy" with value 1 if the
# weight is > 3000 pounds, otherwise 0.
auto_data <- mutate(auto_data, heavy = ifelse(weight > 3000, 1, 0))

# This works great for when you have numeric ifelse statements, but poorly if
# you need character-based (i.e. word) statements.
# Use the grepl command to work around this.
# For instance, I want to create a dummy variable "ford" for all Ford vehicles.
# fixed = TRUE makes grepl match the text literally instead of as a regular
# expression (where e.g. "." would match any character).
auto_data <- mutate(
  auto_data,
  ford = ifelse(grepl("Ford", man, fixed = TRUE), 1, 0)
)

# What if I want to code muscle cars by various producers? Can stack ifelse
# statements.
auto_data <- mutate(
  auto_data,
  muscle = ifelse(
    grepl("Ford Mustang", man, fixed = TRUE), 1,
    ifelse(grepl("Pont. Firebird", man, fixed = TRUE), 1, 0)
  )
)
# More info on ifelse and grepl, see:
# https://rstudio-pubs-static.s3.amazonaws.com/116317_e6922e81e72e4e3f83995485ce686c14.html

## Finding and Dealing with Missing Data; R generally codes missing data as NA
# Can use mutate to recode a variable as missing, or missing values as values.
# Let's say we don't know if Datsuns or the Mercury Cougar XR7 are muscle cars
# or not.
auto_data <- mutate(
  auto_data,
  muscle = ifelse(
    grepl("Ford Mustang", man, fixed = TRUE), 1,
    ifelse(
      grepl("Pont. Firebird", man, fixed = TRUE), 1,
      ifelse(
        grepl("Datsun", man, fixed = TRUE), NA,
        ifelse(grepl("Merc. XR-7", man, fixed = TRUE), NA, 0)
      )
    )
  )
)

# We then realize we've made a mistake and that the XR7 IS a muscle car...we
# can then use mutate again; every other row keeps its current value.
auto_data <- mutate(
  auto_data,
  muscle = ifelse(grepl("Merc. XR-7", man, fixed = TRUE), 1, muscle)
)

# To omit missing values from calculations use "na.omit".
# For instance, we can't take the mean of the rep78 variable directly: the
# first call returns NA, the second drops the missing values first
# (same result as mean(auto_data$rep78, na.rm = TRUE)).
mean(auto_data$rep78)
mean(na.omit(auto_data$rep78))

# N.B.
# The "$" specifies in which data set to search for your variable.
# Without it R looks for a free-standing object called rep78, so this won't
# work. try() lets the error message print without stopping the script.
try(mean(na.omit(rep78)))

# Can also create a data frame of only complete cases, although be careful
# with this: why might the data be missing?
complete <- filter(auto_data, complete.cases(auto_data))

#########################
#Basic Regression Models#
#########################

## Some basic linear models
# The general format of a regression model in R is y ~ x1 + x2

# A basic linear model
auto_mod1 <- lm(price ~ displacement + mpg + weight, data = auto_data)
summary(auto_mod1)

auto_mod2 <- lm(price ~ displacement + mpg + log(weight), data = auto_data)
summary(auto_mod2)

# Including an interaction
auto_mod3 <- lm(price ~ displacement + mpg * weight, data = auto_data)
summary(auto_mod3)
# Note that for interaction terms, ONLY the interaction term is required for R.
# That is, the constituent components will automatically be added (unlike
# Stata): mpg * weight expands to mpg + weight + mpg:weight.

### Dummy Variables (specified in the same fashion as other variables)
auto_mod4 <- lm(price ~ displacement + mpg + foreign, data = auto_data)
summary(auto_mod4)

# Finding out what else is contained in the saved regression
names(auto_mod1)
auto_mod1$coefficients
auto_mod1$residuals
# ...and so on.
# This can be VERY helpful when using stranger types of models (e.g. not
# lm/glm) and you have to build your own tables.

### Creating regression tables
# Marek Hlavac's 'stargazer' package makes really cool tables that can be
# exported to LaTeX or RMarkdown.
library(stargazer)
stargazer(auto_mod3)
# See https://www.jakeruss.com/cheatsheets/stargazer/ for an excellent rundown
# of basic stargazer commands.
# Pluses of stargazer: easy to use, looks cool.
# Minuses of stargazer: only works with certain types of objects.
# An alternative way is to use the 'xtable' function.
# Besides tabulating regression results, it can also create simple LaTeX
# tables directly from data frames.
library(xtable)
xtable(auto_mod3)

## Some nonlinear/more complicated models
# Generally use the 'glm' function for generalized linear models, e.g.
# logit/probit.
# NOTE: "heavy" was built from weight (1 if weight > 3000), so using weight as
# a predictor separates the outcome perfectly. Expect glm() warnings here;
# these models demonstrate the syntax and are not meant to be interpreted.
auto_logit <- glm(
  heavy ~ price + mpg + weight,
  family = binomial(link = "logit"),
  data = auto_data
)
auto_probit <- glm(
  heavy ~ price + mpg + weight,
  family = binomial(link = "probit"),
  data = auto_data
)
stargazer(auto_logit)
summary(auto_logit)

######################
#Graphing With GGplot#
######################

## Basic scatterplot
ggplot(data = auto_data) +
  geom_point(mapping = aes(x = displacement, y = price))

## Let's add some variety - foreign vs. domestic
ggplot(data = auto_data) +
  geom_point(mapping = aes(x = displacement, y = price, color = foreign))

## How else might we separate variables by type?
ggplot(data = auto_data) +
  geom_point(mapping = aes(x = displacement, y = price)) +
  facet_wrap(~ foreign, nrow = 1)

ggplot(data = auto_data) +
  geom_point(mapping = aes(x = displacement, y = price)) +
  facet_wrap(~ rep78, nrow = 2)

## More: a smoothed trend line drawn underneath the points
ggplot(data = auto_data) +
  geom_smooth(mapping = aes(x = displacement, y = price)) +
  geom_point(mapping = aes(x = displacement, y = price))

## How about a fancy bar graph?
ggplot(data = auto_data) +
  geom_bar(mapping = aes(x = mpg, fill = foreign), position = "dodge")

## Mapping our models
ggplot(auto_mod1, aes(mpg, price)) +
  geom_point() +
  geom_smooth(mapping = aes(x = mpg, y = price)) +
  theme_minimal() +
  xlab("Miles per Gallon") +
  ylab("Price of Vehicle") +
  ggtitle("R Workshop - Effect of Miles per Gallon on Price")

## Quiz: Make the above an object

##### Challenge: make the following chart #####

## Try out ggThemeAssist

#############
#Wrapping Up#
#############

# What do you do if you need some help and neither of us is available?
# Option 1: "?function"
?lm
# Option 2: "help()"
help(lm)
?help
# Overview of everything in a package
help(package = "foreign")
# Option 3: help.search("keyword")
help.search("foreign")
help.search("stata")
help.search("vcov")
# Option 4: GOOGLE IT!
# There are a lot of helpful threads on StackOverflow.
# The CRAN repository also has reference manuals for individual packages
# (e.g. stargazer).

#############################################################
#############################################################
#############################################################
#############################################################
# Quit
quit()