# Load dependencies
library(tidyverse)
library(tidytuesdayR)
Fatal Car Crashes on 4/20
# Load data
<- tidytuesdayR::tt_load('2025-04-22')
tuesdata names(tuesdata)
[1] "daily_accidents" "daily_accidents_420"
# Extract tt_data
<- tuesdata$daily_accidents
accidents .420 <- tuesdata$daily_accidents_420
accidents
head(accidents)
Cleaning
# Pull all 4/20 rows
.420[month(accidents.420$date) == 4 & day(accidents.420$date) == 20, ] accidents
# Check for NAs in e420
.420[is.na(accidents.420$e420), ] accidents
# Pull all rows where e420 is true
.420[which(accidents.420$e420 == TRUE), ] accidents
For 4/20 specifically, the accidents.420 dataframe has multiple rows. FALSE is before 4:20pm and TRUE is after. Strange to see NAs in this column.
There are 13 rows where e420 is NA. I will remove because it’s unclear what they mean. In practice, would go to whomever created the dataset and ask why this is happening.
Limiting to e420 = TRUE is misleading because there were other fatalities prior to 4:20pm and I don’t think all the people who “celebrate” 4/20 are waiting until 4pm to get high.
For simplicity, I’m going to focus on the accidents dataset which only includes fatalities by day.
Exploration
# Summary stats
summary(accidents)
date fatalities_count
Min. :1992-01-01 Min. : 47.0
1st Qu.:1998-04-01 1st Qu.:121.0
Median :2004-07-01 Median :142.0
Mean :2004-07-01 Mean :145.1
3rd Qu.:2010-10-01 3rd Qu.:166.0
Max. :2016-12-31 Max. :299.0
print(paste("Standard deviation of fatalities = ", sd(accidents$fatalities_count)))
[1] "Standard deviation of fatalities = 33.2602881853278"
# Histogram of Fatalities
ggplot(accidents, aes(x = fatalities_count)) +
geom_histogram(binwidth = 25, fill = "lightblue", color = "black")
Slightly right skewed with a mean of 145 fatalities per day. Range of 47 to 299 fatalities.
# Time series plot
ggplot(accidents, aes(x = date, y = fatalities_count)) +
geom_line()
# Group by month
$week <- floor_date(accidents$date, "week")
accidents$month <- floor_date(accidents$date, "month")
accidents$year <- floor_date(accidents$date, "year")
accidents
head(accidents)
420 is interesting on its own, but I’d like to extend the analysis to look at other “party” holidays to see if fatalities are higher. I can also compare the rate of fatalities to national holidays and non-holiday weekdays and weekends.
Party Holidays:
Super Bowl Sunday
Mardi Gras
St. Patty’s
4/20
Cinco de Mayo
Halloween
Thanksgiving Eve
New Year’s Eve
National Holidays:
New Year’s Day
MLK Day
President’s Day
Memorial Day
Independence Day
Labor Day
Columbus Day
Veterans Day
Thanksgiving Day
Christmas Day
Let’s bring in a dataset I made using ChatGPT. This dataset has indicators for the holiday. This is not a good practice because I can’t replicate the creation of that dataset. Will look for a more programmatic solution for future projects.
Note that in the code below, I am converting “National Holiday (Observed)” to “National Holiday”. Some National Holidays fall on different dates every year (floating), and sometimes the holiday is observed on a Friday or Monday when the actual day falls on Saturday/Sunday. This is an imperfect solution but makes for a cleaner visualization.
# Load holiday data
<- read_csv("data/holidays.csv",
holidays col_types = cols(date = col_date(format = "%m/%d/%y")))
# Join to accidents dataset
<- left_join(accidents, holidays, by = "date")
accidents_with_holidays
# Change "National Holiday (Observed)" to "National Holiday"
<- accidents_with_holidays |>
accidents_with_holidays mutate(day_type = case_when(
== "National Holiday (Observed)" ~ "National Holiday",
day_type TRUE ~ day_type # Keep the rest unchanged
))
# Convert day_type and holiday to factors
<- accidents_with_holidays |>
accidents_with_holidays mutate(
across(c(day_type, holiday), as.factor),
day_of_week = factor(day_of_week,
levels = c("Sunday", "Monday", "Tuesday", "Wednesday",
"Thursday", "Friday", "Saturday"))
)
head(accidents_with_holidays)
# Summary table of Day Type
<- accidents_with_holidays |>
avg_fatalities_day_type group_by(day_type) |>
summarize(avg_fatalities = round(mean(fatalities_count),1)) |>
arrange(desc(avg_fatalities))
avg_fatalities_day_type
# Summary table of Holiday
<- accidents_with_holidays |>
avg_fatalities_holiday group_by(holiday) |>
summarize(
day_type = first(day_type),
avg_fatalities = round(mean(fatalities_count), 1)
|>
) na.omit(holiday) |> # removes non-holiday weekdays/weekends
arrange(desc(avg_fatalities))
avg_fatalities_holiday
Visualization
# Create side by side Bar Plot
library(patchwork)
library(showtext)
# Load font
<- "Lato"
family font_add(family = family,
regular = "Lato-Regular.ttf",
bold = "Lato-Bold.ttf")
showtext_auto()
showtext_opts(dpi = 300)
# Colors
<- c(
my_colors "Non-Holiday Weekday" = "#76B7B2",
"Non-Holiday Weekend" = "#F28E2B",
"National Holiday" = "#4E79A7",
"Party Holiday" = "#E15759"
)
# Reorder factor levels in descending order
<- avg_fatalities_day_type |>
avg_fatalities_day_type mutate(day_type = reorder(day_type, avg_fatalities))
<- avg_fatalities_holiday |>
avg_fatalities_holiday mutate(holiday = reorder(holiday, avg_fatalities))
# Plot 1
<- ggplot(avg_fatalities_day_type, aes(x = avg_fatalities, y = day_type, fill = day_type)) +
p1 geom_bar(stat = "identity") +
geom_text(aes(label = sprintf("%.1f", avg_fatalities)), hjust = 1.1, color = "white", size = 3.5, family = family) +
labs(title = "By Day Type", x = NULL, y = NULL) +
scale_fill_manual(values = my_colors) +
theme_minimal() +
theme(
legend.position = "none",
panel.grid = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
text = element_text(family = family)
)
# Plot 2
<- ggplot(avg_fatalities_holiday, aes(x = avg_fatalities, y = holiday, fill = day_type)) +
p2 geom_bar(stat = "identity") +
geom_text(aes(label = sprintf("%.1f", avg_fatalities)), hjust = 1.1, color = "white", size = 3.5, family = family) +
labs(title = "By Holiday", x = NULL, y = NULL) +
scale_fill_manual(values = my_colors) +
theme_minimal() +
theme(
legend.position = "none",
panel.grid = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
text = element_text(family = family)
)
# Plot Title, Subtitle, and Caption
<- plot_annotation(
plot_title title = "Daily Fatal Car Crashes in the U.S. (1992-2016)",
subtitle = "As a whole, holidays that are associated with heavy drug/alcohol use (like 4/20) did not result in \nthe highest rate of fatal car crashes. As the authors found in the original study, Independence Day is \na particularly dangerous day to drive.",
caption = "Source: Originally studied by Harper S, Palayew A \"The annual cannabis holiday and fatal traffic crashes.\"\nChart produced by Steven Villalon for Tidy Tuesday exercise on April 22, 2025",
theme = theme(text = element_text(family = family),
plot.caption.position = "plot",
plot.caption = element_text(hjust = 0),
plot.title = element_text(face = "bold"))
)
+ p2 + plot_title p1