##############################################################
#Code for Web Scraping Workshop: George Washington University#
##############################################################

# April 4, 2019
# Code by Alex Kirss, based on earlier work by Jack Hasler

# Required packages: rvest, foreign, dplyr, tidyr
install.packages("rvest")
install.packages("foreign")
install.packages("dplyr")
install.packages("tidyr")

# Additional software: add the 'selectorgadget' extension to your browser
# (I use Google Chrome); for instructions see Hadley Wickham's vignette
vignette("selectorgadget")

# Time to get to work!

# Clear out your list of objects and load the packages
rm(list = ls())
library(rvest)
library(foreign)
library(dplyr)
library(tidyr)

# The important workflow to keep in mind for scraping is:
# 1) identify the website; 2) read the website into R;
# 3) extract the data you need; 4) loop if necessary
# Here I set up a loop to scrape multiple years' worth of data on
# county-level drug overdoses

# Identifying the website ##############################

# Create an object holding the partial URL, so you can then add onto it
# while you loop
partial_url <- "https://www.cdc.gov/drugoverdose/maps/rxcounty"

# This vector contains the various URL "endings" that I need to append
# in order to get a working link
years <- c('2006.html', '2007.html', '2008.html', '2009.html', '2010.html',
           '2011.html', '2012.html', '2013.html', '2014.html', '2015.html',
           '2016.html')

# Reading the website into R ################################

# Set up a blank object with the number of columns that corresponds to
# your scraped variables and identifiers
data <- matrix(, nrow = 0, ncol = 5)

# Set up a 'for' loop to go through the various webpages in sequence
# Here we use the length of our vector of URL endings as our counter
# Once you set up the loop, think of your code as a SEQUENCE for each
# individual page; at the end of the sequence you just loop back to the
# top for the next page
for(s in 1:length(years)){

  # paste the precise ending of the partial URL--based on the loop counter
  # and the endings vector--to get the website you want
  url <- paste0(partial_url, years[s])

  # read the website into R using the 'read_html' command
  # this is enclosed in a 'try' command so it will spit out an error
  # if it can't execute
  try(page <- read_html(url))
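
  # (Added note, not in the original workshop code) When scraping many pages
  # in a loop, it is polite to pause briefly between requests so you don't
  # hammer the server; the one-second delay below is an arbitrary choice
  Sys.sleep(1)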
  # identify which data/variables you want on the web page using
  # selectorgadget; you can then read the data into R using the 'html_text'
  # command wrapped around an 'html_nodes' command
  # here I scrape: 1) county name; 2) state; 3) FIPS identifier;
  # and 4) opioid prescribing rate
  county <- html_text(html_nodes(page, 'td:nth-child(1)'))
  state <- html_text(html_nodes(page, 'td:nth-child(2)'))
  fipscounty <- html_text(html_nodes(page, 'td:nth-child(3)'))
  prescribingrate <- html_text(html_nodes(page, 'td:nth-child(4)'))

  # because you're looping, you want to bind this data to a temporary data
  # frame that can then be merged with the master blank frame
  # as you loop you will overwrite this temporary data frame, while the
  # master gets steadily added to
  # note: I wanted to add in a year counter, which was harder to get in
  # than you would think, since I wasn't scraping it!
  tmp <- as.data.frame(cbind(county, state, fipscounty, prescribingrate)) %>%
    mutate(year = s)
  data <- rbind(data, tmp)

  # generally I add in a ticker for my loops, so I know where it is
  print(s)
}

# Cleaning your data #########################

# As with a lot of coding, it is harder to clean your web-scraped data than
# it is to actually use the data for modelling
# Most errors with web scraping can be caught at this stage, but sometimes
# they can be difficult to find
# JUST BECAUSE YOUR LOOP RUNS DOESN'T MEAN THAT YOU HAVE SCRAPED THE DATA
# CORRECTLY

# Here, for instance, my year counter wasn't numeric and held the value of
# the loop count rather than the actual year
data$year <- as.numeric(data$year)
data2 <- data %>%
  mutate(year2 = ifelse(year == 1, 2006,
                 ifelse(year == 2, 2007,
                 ifelse(year == 3, 2008,
                 ifelse(year == 4, 2009,
                 ifelse(year == 5, 2010,
                 ifelse(year == 6, 2011,
                 ifelse(year == 7, 2012,
                 ifelse(year == 8, 2013,
                 ifelse(year == 9, 2014,
                 ifelse(year == 10, 2015,
                 ifelse(year == 11, 2016, 0))))))))))),
         rate = prescribingrate) %>%
  select(-year) %>%
  rename(year = year2)

# Strangely, the website format CHANGED from 2016 to 2017, which threw off
# the loop when I tried to include it
# Checking the levels of your data to make sure there aren't weird things
# in it is a good idea
levels(data2$state)

# My rate data was also funky...it wasn't numeric, and a naive as.numeric()
# on a factor returns the underlying level codes rather than the rates, so
# convert via the levels instead
data2$rate <- as.numeric(levels(data2$rate))[data2$rate]

# Now that your data is clean, save it out in a usable format
write.dta(data2, "/Users/alexanderkirss/Desktop/Ongoing Projects/StatsWorkshops/190402WebScraping/scraping.dta")

# More info: Hadley Wickham's GitHub page for 'rvest' has a bunch of other
# tricks of the trade
# Other packages exist, e.g. 'Rcrawler' and 'scrapeR'
# More advanced scraping can be done using Python
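
# (Added sketch, not part of the original workshop code) As an alternative
# to pulling each column separately with html_nodes(), rvest's html_table()
# can grab an entire table in one call, and the year can be computed
# directly from the loop counter instead of being recoded afterwards.
# This assumes each page stores its data in the first <table> element
# (confirm with selectorgadget or your browser's inspector) and that the
# table headers are consistent across years; as noted above, the format
# changed in 2017
data_alt <- data.frame()
for(s in 1:length(years)){
  url <- paste0(partial_url, years[s])
  try(page <- read_html(url))
  tab <- html_table(html_node(page, "table"), fill = TRUE)
  tab$year <- 2005 + s  # years[] covers 2006-2016, so the counter maps directly
  data_alt <- rbind(data_alt, tab)
}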