This is a follow-up to my question here.
The code provided does give the desired output; however, there is an issue when a page does not exist, and I am trying to use tryCatch to skip those errors and continue.
For example, I am specifying all dates with the following:
month <- c('02')
year <- c('2024')
day <- c('220','270','280')
team <- c('CHI')
This is fine, as Chicago played home games on all of these days, hence the following URLs all work:
https://www.basketball-reference.com/boxscores/202402220CHI.html
https://www.basketball-reference.com/boxscores/202402270CHI.html
https://www.basketball-reference.com/boxscores/202402280CHI.html
But if I add another day and/or month like so:
month <- c('02')
year <- c('2024')
day <- c('210','220','270','280')
team <- c('CHI')
Chicago did not play a home game on 21 Feb 2024, so this URL doesn't exist:
https://www.basketball-reference.com/boxscores/202402210CHI.html
I tried adding a tryCatch to the code here:
page <- tryCatch(read_html(url), error = function(err) "error 404")
But then I received this message:
no applicable method for 'xml_find_first' applied to an object of class "character"
How can I skip the pages that don't exist and just return the values from those that do?
Full code:
library(rvest)
library(dplyr)
library(tidyr)
##sample only - ultimately this will include all teams and all months and days
month <- c('02')
year <- c('2024')
day <- c('220','270','280')
team <- c('CHI')
make_url <- function(team, year, month, day) {
  paste0('https://www.basketball-reference.com/boxscores/', year, month, day, team, '.html')
}
dates <- expand.grid(team = team, year = year, month = month, day = day)
urls <- dates |>
  mutate(url = make_url(team, year, month, day),
         team = team,
         date = paste(year, month, gsub('.{1}$', '', day), sep = '-'),
         .keep = 'unused'
  )
getPageTable <- function(url) {
  #read the page
  page <- read_html(url)
  #get the game's date
  gamedate <- page %>% html_element("div.scorebox_meta div") %>% html_text2()
  #get game title
  gameInfo <- page %>% html_elements("div.box h1") %>% html_text()
  #get the table headings
  headings <- page %>% html_elements("div.section_wrapper") %>% html_element("h2") %>% html_text()
  #find the quarter scores
  quarters <- grep("Q[1|2|3|4]", headings)
  #retrieve the tables from the page
  tables <- page %>% html_elements("div.section_wrapper") %>% html_element("table")
  #select the desired headings and tables
  headings <- headings[quarters]
  tables <- tables[quarters] %>% html_table(header=FALSE)
  #add game date and team name/quarter to the results
  tables <- lapply(1:length(tables), function(i) {
    #set column titles to second row
    names(tables[[i]]) <- tables[[i]][2,]
    tables[[i]] <- tables[[i]][-c(1:2),]
    tables[[i]]$gamedate <- gamedate
    tables[[i]]$team <- headings[i]
    tables[[i]]$title <- gameInfo
    tables[[i]]
  })
  #merge the quarterly stats into 1 dataframe
  df <- bind_rows(tables)
  df <- df %>% filter(Starters != "Reserves" & Starters != "Team Totals")
  df
}
#loop through the URLs
dfs <- lapply(urls$url, getPageTable)
#merge into one big table
finalResult <- bind_rows(dfs)
finalResult <- finalResult %>% separate("team", into=c("team", "quarter"), " \\(")
finalResult$quarter <- sub("\\)", "", finalResult$quarter)
Here is a solution. Wrap the call to read_html in tryCatch and return the error condition if anything goes wrong, then test that condition right after the read instruction. (The error message in the question appears because the handler returns the character string "error 404", which the next html_element call then tries to parse.) This way the result list will contain both data (for URLs that are OK) and error conditions (for URLs that are not), and you can check which is which outside the function. Here is the function, corrected.
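It mirrors getPageTable from the question and only adds the tryCatch wrapper plus the early return of the error condition; something along these lines should work:

getPageTable <- function(url) {
  #read the page, returning the error condition instead of stopping on failure
  page <- tryCatch(read_html(url), error = function(err) err)
  #if the read failed, hand the condition back to the caller and skip the parsing
  if (inherits(page, "error")) {
    return(page)
  }
  #get the game's date
  gamedate <- page %>% html_element("div.scorebox_meta div") %>% html_text2()
  #get game title
  gameInfo <- page %>% html_elements("div.box h1") %>% html_text()
  #get the table headings
  headings <- page %>% html_elements("div.section_wrapper") %>% html_element("h2") %>% html_text()
  #find the quarter scores
  quarters <- grep("Q[1|2|3|4]", headings)
  #retrieve the tables from the page
  tables <- page %>% html_elements("div.section_wrapper") %>% html_element("table")
  #select the desired headings and tables
  headings <- headings[quarters]
  tables <- tables[quarters] %>% html_table(header=FALSE)
  #add game date and team name/quarter to the results
  tables <- lapply(1:length(tables), function(i) {
    #set column titles to second row
    names(tables[[i]]) <- tables[[i]][2,]
    tables[[i]] <- tables[[i]][-c(1:2),]
    tables[[i]]$gamedate <- gamedate
    tables[[i]]$team <- headings[i]
    tables[[i]]$title <- gameInfo
    tables[[i]]
  })
  #merge the quarterly stats into 1 dataframe
  df <- bind_rows(tables)
  df <- df %>% filter(Starters != "Reserves" & Starters != "Team Totals")
  df
}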
Call the function above, check for valid return values, and decide what to do with the errors. In the case below, the bad URLs and the corresponding errors are printed as messages.
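A sketch of that calling code, reusing urls and the rest of the pipeline from the question, might look like this:

#loop through the URLs; failed reads come back as error conditions
dfs <- lapply(urls$url, getPageTable)
#flag which elements are error conditions rather than data frames
ok <- !vapply(dfs, inherits, logical(1), what = "error")
#report the URLs that could not be read, e.g. the missing 21 Feb 2024 page
for (i in which(!ok)) {
  message("Skipping ", urls$url[i], ": ", conditionMessage(dfs[[i]]))
}
#merge only the successful pages into one big table
finalResult <- bind_rows(dfs[ok])
finalResult <- finalResult %>% separate("team", into = c("team", "quarter"), " \\(")
finalResult$quarter <- sub("\\)", "", finalResult$quarter)

Because the errors are kept in dfs rather than thrown, you can also inspect them afterwards or retry just the failed URLs.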