SEC Schools (11/14 Teams)
Scraping
All of the scraping for this project was done in R, using the rvest package with the read_html() and html_table() functions. These conveniently take a webpage and automatically turn its HTML tables into a list of R dataframes, allowing for easy, simple scraping. Here's an example:
# Scraping data for the University of Missouri's rosters for all sports in 2023
library(rvest)    # read_html(), html_table()
library(stringr)  # str_split_1(), str_trim()
library(dplyr)    # %>%, select()
# The sports that the school offers
sports = c("baseball",
           "softball",
           "mens-basketball",
           "womens-basketball",
           "cross-country",
           "football",
           "mens-golf",
           "womens-golf",
           "mens-swimming-and-diving",
           "womens-swimming-and-diving",
           "womens-gymnastics",
           "womens-soccer",
           "track-and-field",
           "wrestling",
           "womens-tennis",
           "womens-volleyball")
# "Type" determines the format of the year in each sport's URL: if the URLs are
# formatted as 2023, 2024, etc., the type is 0; if formatted as 2023-24, it is 1.
types = c(0,0,1,1,0,0,1,1,0,0,0,0,0,1,1,0)
sports_df = data.frame(Sport = sports, Type = types) # Dataframe combining sports with their types
# Helper function taking in a column of Hometown data. Some columns are presented
# as "Hometown/High School"; we only want Hometown, so we split on the "/"
# character and take the first element.
hometown_split = function(L) {
  new_list = c()
  for (i in 1:length(L)) {
    new_list = c(new_list, str_trim(str_split_1(L[i], "/")[1]))
  }
  return(new_list)
}
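# For example (hypothetical input):
#   hometown_split(c("Dolton, Ill. / Thornridge HS", "St. Louis, Mo."))
# returns c("Dolton, Ill.", "St. Louis, Mo.")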
# Scraping
BASE_URL = "https://mutigers.com"
years = 2023:2023 # List of years that we want to look at. Normally this would be something like 2013:2023.
college_df = data.frame() # Initializing an empty dataframe that we can add to.
for (i in 1:nrow(sports_df)) {
  sport = sports_df$Sport[i] # Easier for readability
  for (y in 1:length(years)) {
    year = years[y] # Easier for readability
    # If the sport's URL presents the year as 2023-24 (type 1), convert the year
    # in our for loop to that format.
    if (sports_df$Type[i] == 1) {
      next_year = as.character(years[y] + 1)
      year_str = paste(years[y], "-", substr(next_year, nchar(next_year) - 1, nchar(next_year)), sep="")
    } else { # Otherwise, keep the year as is.
      year_str = as.character(years[y])
    }
    # Put together the pieces of the url
    url = paste(BASE_URL, "/sports/", sport, "/roster/", year_str, sep="")
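    # For example, this yields "https://mutigers.com/sports/baseball/roster/2023"
    # for a type-0 sport, or "https://mutigers.com/sports/mens-basketball/roster/2023-24"
    # for a type-1 sport.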
    # Read the data.
    tables = read_html(url) %>% html_table()
    # Which element of the tables list the roster is on depends on the school.
    roster = data.frame(tables[3])
    # Some sports list the men's and women's rosters in two separate tables;
    # this adds the extra table in.
    if (sport %in% c("cross-country", "track-and-field")) {
      roster = rbind(roster, data.frame(tables[4]))
    }
    # Add columns for the year and the sport
    roster$Year = rep(year, nrow(roster))
    roster$Sport = rep(sport, nrow(roster))
    # Check for a column whose name includes "hometown"; if there is one, add a
    # Hometown column using the hometown_split() function. (The loop variable is j
    # so that it doesn't clobber the outer loop's index i.)
    for (j in 1:length(colnames(roster))) {
      if (length(grep("[hH]ometown", colnames(roster)[j])) > 0) {
        roster$Hometown = hometown_split(roster[[colnames(roster)[j]]])
      }
      # If there is a column whose name includes "Name", copy it into a Name column.
      if (length(grep("[nN]ame", colnames(roster)[j])) > 0) {
        roster$Name = roster[[colnames(roster)[j]]]
      }
    }
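    # For example, a column named "Hometown/High School" matches "[hH]ometown" and
    # is split down to just the hometown; a column named "Full Name" (hypothetical)
    # would match "[nN]ame".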
    # If we don't find a Hometown column, this url likely doesn't contain any
    # roster data; move on to the next year/sport.
    if (!("Hometown" %in% colnames(roster))) {
      next
    }
    # Get the columns we want from the roster, and add them to the overall college dataframe.
    roster = roster %>% select(Name, Hometown, Year, Sport)
    college_df = rbind(college_df, roster)
  }
  # For monitoring purposes
  # print(paste("Finished with ", sport))
}
# Typically, a csv file is written for each school so that they can all be combined
# during cleaning. I will not do that here, but instead show a snapshot of the dataframe.
# write.csv(college_df, "MIZZOU.csv", row.names=F)
Here is a snapshot of MIZZOU.csv (generated with the full range of years rather than just 2023):
| Name | Hometown | Year | Sport |
|---|---|---|---|
| Jerry Houston | Dolton, Ill. | 2013 | baseball |
| Brett Peel | St. Charles, Mo. | 2013 | baseball |
| Shane Benes | Town and Country, Mo. | 2013 | baseball |
| Case Munson | St. Charles, Mo. | 2013 | baseball |
| Brett Bond | St. Louis, Mo. | 2013 | baseball |
| Trey Harris | Powder Springs, Ga. | 2013 | baseball |
Data Cleaning
The way I did the scraping, we are now left with 11 different directories, each containing a .csv file and a .R script that scrapes the data and generates that .csv file.
Reading and Combining
The approach I took is to collect those .csv files recursively from the home directory that houses all of these subdirectories.
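For concreteness, the layout looks something like this (MIZZOU.csv matches the script above; the other file names are illustrative):

Alabama/
    scrape.R
    ALABAMA.csv
Missouri/
    scrape.R
    MIZZOU.csv
...
Vanderbilt/
    scrape.R
    VANDY.csv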
library(readr)   # read_csv()
library(stringr) # str_split_1()

# Get all files from this directory and its subdirectories
files = list.files(recursive=T)
# Keep only the .csv files; this removes the .R scripts
csv_files = files[endsWith(files, ".csv")]
all_schools = data.frame()
for (f in csv_files) {
  file = read_csv(f)
  file$School = rep(str_split_1(f, "/")[1], nrow(file))
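  # The School column comes from the directory name: e.g., a path like
  # "Missouri/MIZZOU.csv" (illustrative) would give School = "Missouri".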
  all_schools = rbind(all_schools, file)
}
We can see the head and tail of this dataset, showing we have combined all the data for all schools.
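These previews correspond to the usual inspection calls, something like:

head(all_schools)
tail(all_schools)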
| Name | Hometown | Year | Sport | School |
|---|---|---|---|---|
| Mikey White | Birmingham, Ala. | 2014 | baseball | Alabama |
| Daniel Cucjen | Shreveport, La. | 2014 | baseball | Alabama |
| Kyle Overstreet | Boaz, Ala. | 2014 | baseball | Alabama |
| Ben Moore | Cullman, Ala. | 2014 | baseball | Alabama |
| Casey Hughston | Mobile, Ala. | 2014 | baseball | Alabama |
| Wade Wass | Pensacola, Fla. | 2014 | baseball | Alabama |
| Name | Hometown | Year | Sport | School |
|---|---|---|---|---|
| Molly Finlay | Oakton, Va. | 2023 | wlax | Vanderbilt |
| Ashley Sampone | Rochester, N.Y. | 2023 | wlax | Vanderbilt |
| Taylor Sampone | Rochester, N.Y. | 2023 | wlax | Vanderbilt |
| Kristin Gruber | Woodbine, Md. | 2023 | wlax | Vanderbilt |
| Molly Krestinski | Bronxville, N.Y. | 2023 | wlax | Vanderbilt |
| Remy Ricciardi | Ridgewood, N.J. | 2023 | wlax | Vanderbilt |