Base R Penguins

Scatterplot

Decision Tree

Author

Steven Villalon

Published

April 15, 2025

# Load dependencies
library(tidyverse)

# Load data: read the penguins CSV from the tidytuesday GitHub repository
# (2025-04-15 folder)
penguins <- read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2025/2025-04-15/penguins.csv"
)

Cleaning

# Count the missing values in each column
penguins |>
  is.na() |>
  colSums()

    species      island    bill_len    bill_dep flipper_len   body_mass 
          0           0           2           2           2           2 
        sex        year 
         11           0

# Clean the data in one pass:
#   1. drop every row that has a missing value in any column
#   2. store the categorical columns (species, island, sex) as factors
#   3. add an `image` column holding the image file path for each species
penguins <- penguins |>
  na.omit() |>
  mutate(
    across(c(species, island, sex), as.factor),
    image = case_when(
      species == "Adelie" ~ "misc/adelie.jpg",
      species == "Chinstrap" ~ "misc/chinstrap.jpg",
      species == "Gentoo" ~ "misc/gentoo.jpg"
    )
  )

# Preview the first rows of the cleaned data
penguins |>
  head()

Exploration

# Per-column summary statistics of the cleaned data
penguins |>
  summary()

      species          island       bill_len        bill_dep      flipper_len 
 Adelie   :146   Biscoe   :163   Min.   :32.10   Min.   :13.10   Min.   :172  
 Chinstrap: 68   Dream    :123   1st Qu.:39.50   1st Qu.:15.60   1st Qu.:190  
 Gentoo   :119   Torgersen: 47   Median :44.50   Median :17.30   Median :197  
                                 Mean   :43.99   Mean   :17.16   Mean   :201  
                                 3rd Qu.:48.60   3rd Qu.:18.70   3rd Qu.:213  
                                 Max.   :59.60   Max.   :21.50   Max.   :231  
   body_mass        sex           year         image          
 Min.   :2700   female:165   Min.   :2007   Length:333        
 1st Qu.:3550   male  :168   1st Qu.:2007   Class :character  
 Median :4050                Median :2008   Mode  :character  
 Mean   :4207                Mean   :2008                     
 3rd Qu.:4775                3rd Qu.:2009                     
 Max.   :6300                Max.   :2009

# Row counts by year
penguins |>
  count(year, name = "count")

# Row counts by species and island
penguins |>
  count(species, island, name = "count")

# Row counts by year, species, and island
penguins |>
  count(year, species, island, name = "count")

# Row counts by species and sex
penguins |>
  count(species, sex, name = "count")

# Per-species means of the four numeric measurements (output columns are
# prefixed with "avg_"), keeping the first image path found for each species
penguins_avg <- penguins |>
  group_by(species) |>
  summarize(
    image = first(image),
    across(
      c(bill_len, bill_dep, flipper_len, body_mass),
      mean,
      .names = "avg_{.col}"
    )
  )

penguins_avg

# Bill length vs. bill depth, points coloured by species
penguins |>
  ggplot(aes(bill_len, bill_dep, colour = species)) +
  geom_point()

# Flipper length vs. body mass, points coloured by species
penguins |>
  ggplot(aes(flipper_len, body_mass, colour = species)) +
  geom_point()

# Bill length vs. bill depth, points coloured by sex
penguins |>
  ggplot(aes(bill_len, bill_dep, colour = sex)) +
  geom_point()

# Flipper length vs. body mass, points coloured by sex
penguins |>
  ggplot(aes(flipper_len, body_mass, colour = sex)) +
  geom_point()

# Distribution of body mass for each sex
penguins |>
  ggplot(aes(sex, body_mass)) +
  geom_boxplot()

Modeling

# Test/training split: hold out a random 20% of the rows (rounded to a whole
# number) as the test set. The fixed seed makes the draw reproducible.
set.seed(123)
n <- nrow(penguins)
test_index <- sample.int(n, size = round(0.2 * n))

# Sampled rows form the test set; every remaining row is used for training
test_data <- penguins[test_index, ]
train_data <- penguins[-test_index, ]

# Fit a decision tree on the training rows, predicting species from the four
# numeric measurements
library(rpart)
library(partykit)
tree <- rpart(
  species ~ bill_len + bill_dep + flipper_len + body_mass,
  data = train_data
)

# Plot the fitted tree after converting it to a partykit object
tree |>
  as.party() |>
  plot(
    main = "Decision Tree for Penguin Species",
    gp = gpar(fontsize = 6)
  )

# Check whether pruning is worthwhile: plot the cross-validation results
# against the complexity parameter (this only draws the chart; it does not
# modify the tree)
plotcp(tree)

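The dotted line in the plot sits one standard error above the lowest cross-validated error. Since neither of the simpler trees (sizes 1 or 2) falls below that line, the 1-SE rule offers no obvious benefit to pruning, so the full tree is kept.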

# Predict a species class for every row of the test set and store it alongside
# the true labels
test_data$tree_preds <- predict(tree, newdata = test_data, type = "class")

# Cross-tabulate predicted vs. actual species and compute accuracy statistics
library(caret)
cm_tree <- confusionMatrix(
  data = test_data$tree_preds,
  reference = test_data$species,
  dnn = c("predicted", "actual")
)
cm_tree

Confusion Matrix and Statistics

           actual
predicted   Adelie Chinstrap Gentoo
  Adelie        28         0      0
  Chinstrap      5         9      1
  Gentoo         1         2     21

Overall Statistics
                                          
               Accuracy : 0.8657          
                 95% CI : (0.7603, 0.9367)
    No Information Rate : 0.5075          
    P-Value [Acc > NIR] : 7.061e-10       
                                          
                  Kappa : 0.788           
                                          
 Mcnemar's Test P-Value : 0.09647         

Statistics by Class:

                     Class: Adelie Class: Chinstrap Class: Gentoo
Sensitivity                 0.8235           0.8182        0.9545
Specificity                 1.0000           0.8929        0.9333
Pos Pred Value              1.0000           0.6000        0.8750
Neg Pred Value              0.8462           0.9615        0.9767
Prevalence                  0.5075           0.1642        0.3284
Detection Rate              0.4179           0.1343        0.3134
Detection Prevalence        0.4179           0.2239        0.3582
Balanced Accuracy           0.9118           0.8555        0.9439

# Create scatterplot with images

# Load dependencies
library(ggimage)
library(ggrepel)
library(showtext)

# Register the Google font "Lato" under the family name "lato" and switch on
# showtext rendering for the plots that follow
font_add_google("Lato", "lato")
showtext_auto()

# Build the final chart: one image per species positioned at its average bill
# length (x) and average flipper length (y), with the species name drawn as a
# bold label
penguin_plot <- penguins_avg |>
  ggplot(aes(avg_bill_len, avg_flipper_len)) +
  geom_image(aes(image = image), size = 0.3) +
  geom_text(
    aes(label = species),
    vjust = -5.75,
    hjust = 0.5,
    size = 4,
    fontface = "bold"
  ) +
  theme_minimal(base_family = "lato") +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    plot.subtitle = element_text(face = "plain", size = 12)
  ) +
  labs(
    title = "Penguin Species by Average Bill and Flipper Length",
    subtitle = "Decision tree model indicated that these two features were best at distinguishing \nbetween species. Tree splits showed flipper length greater than 210mm likely Gentoo.\nSmaller flippers and bill lengths less than 42mm likely Adelie.",
    x = "Bill Length (in mm)",
    y = "Flipper Length (in mm)"
  ) +
  # Widen the axis limits beyond the range of the averages: 5 units on each
  # side of x, 5 below and 10 above on y
  xlim(range(penguins_avg$avg_bill_len) + c(-5, 5)) +
  ylim(range(penguins_avg$avg_flipper_len) + c(-5, 10))

penguin_plot