##############################################################
#Code for Web Scraping Workshop: George Washington University#
##############################################################

# April 4, 2019
# Code by Alex Kirss, based on earlier work by Jack Hasler

# Required packages: rvest, foreign, dplyr, tidyr
install.packages("rvest")
install.packages("foreign")
install.packages("dplyr")
install.packages("tidyr")

# Additional software: add the 'selectorgadget' extension to your browser
# (I use Google Chrome); for instructions see Hadley Wickham's vignette
vignette("selectorgadget")

# Time to get to work!

# Clear out your list of objects and load the packages
rm(list = ls())
library(rvest)
library(foreign)
library(dplyr)
library(tidyr)

# The important workflow to keep in mind for scraping is:
# 1) identify the website; 2) read the website into R;
# 3) extract the data you need; 4) loop if necessary
# Here I set up a loop to scrape multiple years' worth of data on
# county-level drug overdoses

# Identifying the website ##############################

# Create an object holding the partial URL, so you can then add onto it
# while you loop
partial_url <- "https://www.cdc.gov/drugoverdose/maps/rxcounty"

# This vector contains the various URL "endings" that I need to append
# in order to get a working link
years <- c('2006.html', '2007.html', '2008.html', '2009.html', '2010.html',
           '2011.html', '2012.html', '2013.html', '2014.html', '2015.html',
           '2016.html')

# Reading the website into R ################################

# Set up a blank object with the number of columns that corresponds to
# your scraped variables and identifiers
data <- matrix(, nrow = 0, ncol = 5)

# Set up a 'for' loop to go through the various webpages in sequence
# Here we use the length of our vector of URL endings as our counter
# Once you set up the loop, think of your code as a SEQUENCE for each
# individual page; at the end of the sequence you just loop back to the
# top for the next page
for(s in 1:length(years)){

  # paste the precise ending of the partial URL--based on the loop counter
  # and the endings vector--to get the website you want
  url <- paste0(partial_url, years[s])

  # read the website into R using the 'read_html' command
  # this is enclosed in a 'try' command so it will spit out an error
  # if it can't execute
  try(page <- read_html(url))
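
  # (Added note, not in the original workshop code) When scraping many pages
  # in a loop, it is polite to pause briefly between requests so you don't
  # hammer the server; the one-second delay below is an arbitrary choice
  Sys.sleep(1)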
  # identify which data/variables you want on the web page using
  # selectorgadget; you can then read the data into R using the 'html_text'
  # command wrapped around an 'html_nodes' command
  # here I scrape: 1) county name; 2) state; 3) FIPS identifier;
  # and 4) opioid prescribing rate
  county <- html_text(html_nodes(page, 'td:nth-child(1)'))
  state <- html_text(html_nodes(page, 'td:nth-child(2)'))
  fipscounty <- html_text(html_nodes(page, 'td:nth-child(3)'))
  prescribingrate <- html_text(html_nodes(page, 'td:nth-child(4)'))

  # because you're looping, you want to bind this data to a temporary data
  # frame that can then be merged with the master blank frame
  # as you loop you will overwrite this temporary data frame, while the
  # master gets steadily added to
  # note: I wanted to add in a year counter, which was harder to get in
  # than you would think, since I wasn't scraping it!
  tmp <- as.data.frame(cbind(county, state, fipscounty, prescribingrate)) %>%
    mutate(year = s)
  data <- rbind(data, tmp)

  # generally I add in a ticker for my loops, so I know where it is
  print(s)
}

# Cleaning your data #########################

# As with a lot of coding, it is harder to clean your web-scraped data than
# it is to actually use the data for modelling
# Most errors with web scraping can be caught at this stage, but sometimes
# they can be difficult to find
# JUST BECAUSE YOUR LOOP RUNS DOESN'T MEAN THAT YOU HAVE SCRAPED THE DATA
# CORRECTLY

# Here, for instance, my year counter wasn't numeric and held the value of
# the loop count rather than the actual year
data$year <- as.numeric(data$year)
data2 <- data %>%
  mutate(year2 = ifelse(year == 1, 2006,
                 ifelse(year == 2, 2007,
                 ifelse(year == 3, 2008,
                 ifelse(year == 4, 2009,
                 ifelse(year == 5, 2010,
                 ifelse(year == 6, 2011,
                 ifelse(year == 7, 2012,
                 ifelse(year == 8, 2013,
                 ifelse(year == 9, 2014,
                 ifelse(year == 10, 2015,
                 ifelse(year == 11, 2016, 0))))))))))),
         rate = prescribingrate) %>%
  select(-year) %>%
  rename(year = year2)

# Strangely, the website format CHANGED from 2016 to 2017, which threw off
# the loop when I tried to include it
# Checking the levels of your data to make sure there aren't weird things
# in it is a good idea
levels(data2$state)

# My rate data was also funky...it wasn't numeric, and a naive as.numeric()
# on a factor returns the underlying level codes rather than the rates, so
# convert via the levels instead
data2$rate <- as.numeric(levels(data2$rate))[data2$rate]

# Now that your data is clean, save it out in a usable format
write.dta(data2, "/Users/alexanderkirss/Desktop/Ongoing Projects/StatsWorkshops/190402WebScraping/scraping.dta")

# More info: Hadley Wickham's GitHub page for 'rvest' has a bunch of other
# tricks of the trade
# Other packages exist, e.g. 'Rcrawler' and 'scrapeR'
# More advanced scraping can be done using Python
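
# (Added sketch, not part of the original workshop code) As an alternative
# to pulling each column separately with html_nodes(), rvest's html_table()
# can grab an entire table in one call, and the year can be computed
# directly from the loop counter instead of being recoded afterwards.
# This assumes each page stores its data in the first <table> element
# (confirm with selectorgadget or your browser's inspector) and that the
# table headers are consistent across years; as noted above, the format
# changed in 2017
data_alt <- data.frame()
for(s in 1:length(years)){
  url <- paste0(partial_url, years[s])
  try(page <- read_html(url))
  tab <- html_table(html_node(page, "table"), fill = TRUE)
  tab$year <- 2005 + s  # years[] covers 2006-2016, so the counter maps directly
  data_alt <- rbind(data_alt, tab)
}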