#Analysis R Us: A Journey Into the Wonders of R
#10/24/2018
#Presented by Alexes Beck and Kirss
#Adapted from "Introduction to R" by Samer Anabtawi, Colin Emrich, and Jack Hasler


######################
#Basics for Beginners#
######################

# R can evaluate arithmetic directly, just like a calculator
1 + 1
#Don't use R as a calculator. Only professional R users and serial killers use R as a calculator
#(and you're probably neither). You can also make data frames in R, but most of the time you will be
#drawing on existing data sets. Since the point of this seminar is to focus on functionality and playing with
#data, we'll be working with an existing data set from here onwards. For some more on the basics, see a
#resource such as:  http://stat545.com/block002_hello-r-workspace-wd-project.html


#####Differences between R and Stata
##The core functions of R include a lot less than Stata's. Instead, we install and import the packages we need 
##as necessary. Installations only have to happen once. Importing must take place every time.

## Set working directory
# NOTE: this path is specific to the presenter's machine. Edit it to point at
# the folder where you saved the workshop files before running the script.
setwd("C:/Users/alexr/Desktop/Methods Fellow/Methods Workshop")

## Install packages
# Installation only has to happen once per machine, so install just the
# packages that are not available yet instead of re-downloading all of them
# every time this script is run.
workshop_packages <- c(
  "readr", "tidyverse", "ggThemeAssist", "haven", "foreign", "stargazer",
  "xtable"
)
is_installed <- vapply(
  workshop_packages, requireNamespace, logical(1),
  quietly = TRUE
)
if (!all(is_installed)) {
  install.packages(workshop_packages[!is_installed])
}

## Load packages
# Loading, unlike installing, must be repeated in every new R session.
library(readr)
library(tidyverse)
library(ggThemeAssist)
library(haven)
library(foreign)
library(xtable)
library(stargazer)

## Load data from a .csv file
auto_data <- read_csv(
  file = "C:/Users/alexr/Desktop/Methods Fellow/Methods Workshop/auto_data.csv"
)
View(auto_data)


# Is your data stored in a different format? The 'haven' and 'foreign'
# packages provide functions for importing other file types. For instance,
# both "read.dta" (from 'foreign') and "read_dta" (from 'haven') can import
# Stata files.
auto_data2 <- read.dta(
  file = "C:/Users/alexr/Desktop/Methods Fellow/Methods Workshop/auto_data.dta"
)
auto_data3 <- read_dta(
  file = "C:/Users/alexr/Desktop/Methods Fellow/Methods Workshop/auto_data.dta"
)

##Making and manipulating objects in R

#As you can hopefully see, one of the main differences between R and Stata is that you can load *multiple* objects at once
#In Stata we can only have one data frame at a time, in R we can have many
#An "object" in R can be virtually anything, a data frame, a vector/list, model results, a function
#Assign objects using an arrow "<-" like we've done with the data

# Let's create a vector that tells us the truth: Alexes are great.
# We build it with the "combine" function, written 'c()'.
alexes <- c("great")
View(alexes)

## R, like Stata, is a VERY finicky language... make sure that you close
## parentheses, quotation marks, etc. A stray comma or a capitalization error
## is enough to stop your code from running.
## Object names can follow several conventions: snake_case, per.iods, or
## CamelCase are all fine.
alexes_are <- c("great")
alexes.are <- c("great")
AlexesAre <- c("great")

# Want to remove objects? Use the "rm" function
rm(alexes, alexes_are, alexes.are, AlexesAre)

# Want to clear out (almost) everything at once? "rm(list = ls())" deletes
# EVERY object in the workspace -- including 'auto_data', which the rest of
# this script still needs -- so here we keep that one data frame and remove
# all the other objects.
# Note that rm() only removes objects; packages loaded with library() stay
# loaded.
rm(list = setdiff(ls(), "auto_data"))

################################
#Cleaning and Manipulating Data#
################################

# Let's look at our data
View(auto_data)

# Not sure whether your object is a data frame? Convert it into one.
auto_data <- as.data.frame(x = auto_data)

## Basic summary statistics
# dim() reports the number of rows and columns; summary() describes each column
dim(auto_data)
summary(object = auto_data)

## Hadley Wickham's 'tidyverse' package is one of the newest (and best) ways
## to clean and manipulate data in R.
# His book, "R for Data Science", is incredibly helpful and available online
# FOR FREE: http://r4ds.had.co.nz/
# We'll show you just a few basic functions from the 'dplyr' package here:
# 'mutate', 'rename', 'select', 'arrange', and 'filter'.

# Use 'mutate' to create new variables built from existing variables,
# e.g. a new variable "lwratio" equal to a car's length-to-weight ratio.
auto_data <- mutate(.data = auto_data, lwratio = length / weight)

# Variables can be renamed with 'rename' (new name on the left)
auto_data <- rename(.data = auto_data, lw_ratio = lwratio)

# Want to drop variables? Use 'select' either to choose which variables to
# keep or to name the ones to drop.
# Drop using -varname
auto_data <- select(.data = auto_data, -lw_ratio)
# Keep using varname
auto_data <- select(
  .data = auto_data,
  man, price, mpg, rep78, headroom, trunk, weight, length, turn,
  displacement, gear_ratio, foreign
)

# Want to sort the rows by the values of a particular variable? Use 'arrange',
# e.g. order by ascending weight
auto_data <- arrange(.data = auto_data, weight)
# What about descending?
auto_data <- arrange(.data = auto_data, desc(weight))

# Want to subset data by particular values or delete particular rows? Use
# 'filter'. For instance, let's create a dataset of *only* domestic cars.
domestic <- filter(.data = auto_data, foreign == "Domestic")

## USING THE "PIPE"
# The "pipe" is a special operator, %>%, that lets you chain several
# functions together: the result on its left becomes the first argument of
# the function on its right.
# This cleans up your code tremendously, because you no longer repeat the
# data frame in every line.
# What if I wanted to keep only domestic vehicles and drop the now useless
# "foreign" variable at the same time?
domestic <- auto_data %>%
  filter(foreign == "Domestic") %>%
  select(-foreign)

## More Complicated Mutations

# Often you want to do more than simply create new variables from old ones.
# Combining a diverse set of tools such as "ifelse" and "grepl" can get you
# there.
# ifelse tests a condition: where the condition is true it returns one value,
# where it is false it returns another.
# For instance, I want to create a dummy variable "heavy" with value 1 if the
# weight is > 3000 pounds, otherwise 0.
auto_data <- mutate(
  .data = auto_data,
  heavy = ifelse(test = weight > 3000, yes = 1, no = 0)
)

# This works great for numeric conditions, but poorly if you need
# character-based (i.e. word) conditions.
# Use grepl, which reports whether a pattern is found in each value, to work
# around that.
# For instance, I want to create a dummy variable "ford" for all Ford vehicles.
auto_data <- mutate(
  .data = auto_data,
  ford = ifelse(test = grepl("Ford", man), yes = 1, no = 0)
)

# What if I want to code muscle cars by various producers? ifelse statements
# can be nested inside each other.
auto_data <- mutate(
  .data = auto_data,
  muscle = ifelse(
    grepl("Ford Mustang", man), 1,
    ifelse(grepl("Pont. Firebird", man), 1, 0)
  )
)

# For more info on ifelse and grepl, see:
# https://rstudio-pubs-static.s3.amazonaws.com/116317_e6922e81e72e4e3f83995485ce686c14.html

## Finding and Dealing with Missing Data; R generally codes missing data as "NA"

# Can use mutate to recode a variable as missing, or missing values as values
# Let's say we don't know if Datsuns or the Mercury Cougar XR7 are muscle cars or not
auto_data <- mutate(
  auto_data,
  muscle = ifelse(
    grepl("Ford Mustang", man), 1,
    ifelse(
      grepl("Pont. Firebird", man), 1,
      ifelse(
        grepl("Datsun", man), NA,
        ifelse(grepl("Merc. XR-7", man), NA, 0)
      )
    )
  )
)

# We then realize we've made a mistake and that the XR7 IS a muscle car...
# we can use mutate again, leaving every other row's value untouched
auto_data <- mutate(auto_data, muscle = ifelse(grepl("Merc. XR-7", man), 1, muscle))

# To omit missing VALUES from calculations use "na.omit"
# (many functions, including mean(), also accept the argument na.rm = TRUE)
# For instance, mean() returns NA when the variable contains missing values,
# so we can't take the mean of the rep78 variable directly
mean(auto_data$rep78)
mean(na.omit(auto_data$rep78))

# N.B. The "$" specifies in which data set to search for your variable.
# Without it, R looks for a free-standing object called rep78 and raises an
# error. The call is wrapped in try() so the error message is still shown but
# the rest of the script keeps running when it is executed from top to bottom.
try(mean(na.omit(rep78))) # this won't work

# Can also create a data frame of only complete cases, although be careful
# with this: why might the data be missing?
complete <- filter(auto_data, complete.cases(auto_data))



#########################
#Basic Regression Models#
#########################


## Some basic linear models
# A regression formula in R generally takes the form y ~ x1 + x2
auto_mod1 <- lm(
  formula = price ~ displacement + mpg + weight,
  data = auto_data
) # A basic linear model
summary(auto_mod1)

# Variables can be transformed inside the formula, e.g. with log()
auto_mod2 <- lm(
  formula = price ~ displacement + mpg + log(weight),
  data = auto_data
)
summary(auto_mod2)

auto_mod3 <- lm(
  formula = price ~ displacement + mpg * weight,
  data = auto_data
) # Including an interaction
summary(auto_mod3)
# Note that for interaction terms written with "*", ONLY the interaction has
# to be specified in R. That is, the constituent components are added
# automatically (unlike Stata).

### Dummy Variables (specified in the same fashion as other variables)
auto_mod4 <- lm(
  formula = price ~ displacement + mpg + foreign,
  data = auto_data
)
summary(auto_mod4)

# Finding out what else is stored in the saved regression object
names(auto_mod1)
auto_mod1$coefficients
auto_mod1$residuals # and so on.

# This can be VERY helpful when using stranger types of models (e.g. not
# lm/glm) and you have to build your own tables

#This can be VERY helpful when using stranger types of models (e.g. not lm/glm) and you have to build your own tables

### Creating regression tables
# Marek Hlavac's 'stargazer' package makes really cool tables that can be
# exported to LaTeX or RMarkdown
library(stargazer)
stargazer(auto_mod3)
# See https://www.jakeruss.com/cheatsheets/stargazer/ for an excellent
# rundown of basic stargazer commands.

# Pluses of stargazer: easy to use, tables look cool.
# Minuses of stargazer: only works with certain types of objects.
# An alternative is the 'xtable' function.
# Besides tabulating regression results, it can also create simple LaTeX
# tables directly from data frames.
library(xtable)
xtable(auto_mod3)

## Some nonlinear/more complicated models
# Generally use the 'glm' function for generalized linear models, e.g.
# logit/probit; the 'family' argument selects the model type.
# NOTE: "heavy" was constructed directly from "weight" (1 if weight > 3000,
# otherwise 0), so "weight" must not appear as a predictor: it would predict
# the outcome perfectly (perfect separation), and glm would warn that fitted
# probabilities of 0 or 1 occurred and report meaningless estimates.
auto_logit <- glm(
  heavy ~ price + mpg,
  family = binomial(link = "logit"),
  data = auto_data
)
auto_probit <- glm(
  heavy ~ price + mpg,
  family = binomial(link = "probit"),
  data = auto_data
)

stargazer(auto_logit)
summary(auto_logit)

######################
#Graphing With GGplot#
######################

## Basic scatterplot
# The aesthetic mapping given to ggplot() is inherited by every layer
ggplot(auto_data, aes(x = displacement, y = price)) +
  geom_point()

## Let's add some variety - foreign vs. domestic
ggplot(auto_data, aes(x = displacement, y = price, color = foreign)) +
  geom_point()

## How else might we separate observations by type?
# One panel per value of "foreign", laid out in a single row
ggplot(auto_data, aes(x = displacement, y = price)) +
  geom_point() +
  facet_wrap(~ foreign, nrow = 1)

# One panel per value of "rep78", laid out in two rows
ggplot(auto_data, aes(x = displacement, y = price)) +
  geom_point() +
  facet_wrap(~ rep78, nrow = 2)

## More: a smoothed trend line drawn underneath the points
ggplot(auto_data, aes(x = displacement, y = price)) +
  geom_smooth() +
  geom_point()


## How about a fancy bar graph?
# Counts of cars at each mpg value, with side-by-side bars per "foreign" group
ggplot(auto_data, aes(x = mpg, fill = foreign)) +
  geom_bar(position = "dodge")

## Mapping our models
# Plot from the data frame itself rather than handing the fitted lm object to
# ggplot(): a model object is only accepted through ggplot2's fortify()
# conversion, which the ggplot2 documentation advises against relying on.
# The mapping is declared once in ggplot() and inherited by both layers.
# Note that geom_smooth() draws its default smoother through the raw data.
ggplot(auto_data, aes(x = mpg, y = price)) +
  geom_point() +
  geom_smooth() +
  theme_minimal() +
  xlab("Miles per Gallon") +
  ylab("Price of Vehicle") +
  ggtitle("R Workshop - Effect of Miles per Gallon on Price")

## Quiz:  Make the above an object

##### Challenge:  make the following chart #####

##try out ggThemeAssist


#############
#Wrapping Up#
#############

# What do you do if you need some help and neither of us is available?

# Option 1: "?function"

?lm

# Option 2: "help()"

help("lm")

?help

# Leaving the topic out and naming a package shows that package's help index
help(package = "foreign")

# Option 3: "help.search("keyword")"

help.search(pattern = "foreign")

help.search(pattern = "stata")

help.search(pattern = "vcov")

# Option 4: GOOGLE IT!

#There are a lot of helpful threads on StackOverflow
#The CRAN repository also has reference manuals for individual packages (e.g. stargazer)

#############################################################
#############################################################

#############################################################
#############################################################
# Quit: end the R session (the default setting decides whether you are asked
# to save the workspace)

quit(save = "default")