# Load dependencies
library(tidyverse)
Base R Penguins
Scatterplot
Decision Tree
R
# Load data: read the TidyTuesday (2025-04-15) penguins CSV from GitHub
penguins <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2025/2025-04-15/penguins.csv")
Cleaning
# Count the missing values in each column
penguins |>
  is.na() |>
  colSums()
species island bill_len bill_dep flipper_len body_mass
0 0 2 2 2 2
sex year
11 0
# Drop every row that has a missing value in any column
penguins <- na.omit(penguins)

# Convert categorical variables to factors
penguins <- penguins |>
  mutate(across(c(species, island, sex), as.factor))

# Add column with filename of penguin image (one image file per species)
penguins <- penguins |>
  mutate(image = case_when(
    species == "Adelie" ~ "misc/adelie.jpg",
    species == "Chinstrap" ~ "misc/chinstrap.jpg",
    species == "Gentoo" ~ "misc/gentoo.jpg"
  ))

# Preview the cleaned data
head(penguins)
Exploration
# Per-column summary statistics of the cleaned data
penguins |>
  summary()
species island bill_len bill_dep flipper_len
Adelie :146 Biscoe :163 Min. :32.10 Min. :13.10 Min. :172
Chinstrap: 68 Dream :123 1st Qu.:39.50 1st Qu.:15.60 1st Qu.:190
Gentoo :119 Torgersen: 47 Median :44.50 Median :17.30 Median :197
Mean :43.99 Mean :17.16 Mean :201
3rd Qu.:48.60 3rd Qu.:18.70 3rd Qu.:213
Max. :59.60 Max. :21.50 Max. :231
body_mass sex year image
Min. :2700 female:165 Min. :2007 Length:333
1st Qu.:3550 male :168 1st Qu.:2007 Class :character
Median :4050 Median :2008 Mode :character
Mean :4207 Mean :2008
3rd Qu.:4775 3rd Qu.:2009
Max. :6300 Max. :2009
# Get counts by year
# count(..., name = "count") is the one-step equivalent of
# group_by(...) |> summarize(count = n(), .groups = "drop")
penguins |>
  count(year, name = "count")

# Get counts by species and island
penguins |>
  count(species, island, name = "count")

# Get counts by year, species, and island
penguins |>
  count(year, species, island, name = "count")

# Get counts by species and sex
penguins |>
  count(species, sex, name = "count")
# Get avgs of numeric variables, one row per species.
# The image path is carried along (it is constant within a species) so the
# summary table can later be plotted with one picture per species.
penguins_avg <- penguins |>
  group_by(species) |>
  summarize(
    image = first(image),
    avg_bill_len = mean(bill_len),
    avg_bill_dep = mean(bill_dep),
    avg_flipper_len = mean(flipper_len),
    avg_body_mass = mean(body_mass)
  )

penguins_avg
# Bill length vs. bill depth, coloured by species
penguins |>
  ggplot(aes(x = bill_len, y = bill_dep, color = species)) +
  geom_point()

# Flipper length vs. body mass, coloured by species
penguins |>
  ggplot(aes(x = flipper_len, y = body_mass, color = species)) +
  geom_point()

# Bill length vs. bill depth, coloured by sex
penguins |>
  ggplot(aes(x = bill_len, y = bill_dep, color = sex)) +
  geom_point()

# Flipper length vs. body mass, coloured by sex
penguins |>
  ggplot(aes(x = flipper_len, y = body_mass, color = sex)) +
  geom_point()

# Distribution of body mass for each sex
penguins |>
  ggplot(aes(x = sex, y = body_mass)) +
  geom_boxplot()
Modeling
# Test/training split: hold out a random 20% of the rows for testing
set.seed(123) # fixed seed so the split is reproducible
n <- nrow(penguins)
test_index <- sample.int(n, size = round(0.2 * n))
train_data <- penguins[-test_index, ] # every row not sampled for testing
test_data <- penguins[test_index, ]
# Fit decision tree predicting species from the four body measurements,
# using the training rows only
library(rpart)
library(partykit)
tree <- rpart(
  species ~ bill_len + bill_dep + flipper_len + body_mass,
  data = train_data
)
# Plot the fitted tree; as.party() converts the rpart fit before plotting
plot(as.party(tree),
main = "Decision Tree for Penguin Species",
gp = gpar(fontsize = 6)) # draw the plot text at font size 6
# Check whether pruning is worthwhile: plotcp() only plots the tree's
# complexity-parameter results; it does not prune the tree itself
plotcp(tree)
Since none of the simpler trees (sizes 1 or 2) fall below the dotted line, there’s no obvious benefit to pruning based on the 1-SE rule.
# Predict test data: store the predicted species label for each held-out row
test_data$tree_preds <- predict(tree, newdata = test_data, type = "class")

# Generate confusion matrix of predicted vs. actual species
library(caret)
cm_tree <- confusionMatrix(
  data = test_data$tree_preds,
  reference = test_data$species,
  dnn = c("predicted", "actual")
)
cm_tree
Confusion Matrix and Statistics
actual
predicted Adelie Chinstrap Gentoo
Adelie 28 0 0
Chinstrap 5 9 1
Gentoo 1 2 21
Overall Statistics
Accuracy : 0.8657
95% CI : (0.7603, 0.9367)
No Information Rate : 0.5075
P-Value [Acc > NIR] : 7.061e-10
Kappa : 0.788
Mcnemar's Test P-Value : 0.09647
Statistics by Class:
Class: Adelie Class: Chinstrap Class: Gentoo
Sensitivity 0.8235 0.8182 0.9545
Specificity 1.0000 0.8929 0.9333
Pos Pred Value 1.0000 0.6000 0.8750
Neg Pred Value 0.8462 0.9615 0.9767
Prevalence 0.5075 0.1642 0.3284
Detection Rate 0.4179 0.1343 0.3134
Detection Prevalence 0.4179 0.2239 0.3582
Balanced Accuracy 0.9118 0.8555 0.9439
# Create scatterplot with images: one penguin picture per species, positioned
# at that species' average bill length and average flipper length
# Load dependencies
library(ggimage)
library(ggrepel)
library(showtext)

# Load custom font and have showtext render text in subsequent plots
font_add_google("Lato", "lato")
showtext_auto()

# Generate plot
penguin_plot <- ggplot(penguins_avg, aes(x = avg_bill_len, y = avg_flipper_len)) +
  geom_image(aes(image = image), size = 0.3) +
  # Large negative vjust lifts the species label above its image
  geom_text(aes(label = species), vjust = -5.75, hjust = 0.5, size = 4, fontface = "bold") +
  theme_minimal(base_family = "lato") +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    plot.subtitle = element_text(face = "plain", size = 12)
  ) +
  labs(
    title = "Penguin Species by Average Bill and Flipper Length",
    subtitle = "Decision tree model indicated that these two features were best at distinguishing \nbetween species. Tree splits showed flipper length greater than 210mm likely Gentoo.\nSmaller flippers and bill lengths less than 42mm likely Adelie.",
    x = "Bill Length (in mm)",
    y = "Flipper Length (in mm)"
  ) +
  # Pad the axis limits so the images and labels are not clipped at the edges
  xlim(min(penguins_avg$avg_bill_len) - 5, max(penguins_avg$avg_bill_len) + 5) +
  ylim(min(penguins_avg$avg_flipper_len) - 5, max(penguins_avg$avg_flipper_len) + 10)

penguin_plot