Project Gutenberg

Analysis revealed that more than a thousand books in Project Gutenberg have a subject related to Latin American countries.

String Manipulation
Maps
R
Author

Steven Villalon

Published

June 3, 2025


Final plot

Question of Interest

How many books in Project Gutenberg have a Latin American country as one of their subjects?

Goal: make a map with the number of books as a label.

1. Packages & Dependencies

Show Code
# Load packages
library(tidyverse)
library(tidytuesdayR)
library(here)
library(rnaturalearth)
library(sf)
library(ggrepel)
library(showtext)
library(ggtext)

# Load helper functions
# Runs a project-local script resolved relative to the project root via here::here().
# NOTE(review): presumably this is where save_tt_plots() (called in the export
# step) is defined -- confirm against the helper script.
source(here::here("R/utils/tidy_tuesday_helpers.R"))

# Set project title
# `title` is passed to save_tt_plots() when exporting; `tt_date` is passed both
# to tt_load() when fetching the data and to save_tt_plots().
title <- "Project Gutenberg"
tt_date <- "2025-06-03"

2. Load Data

Show Code
# Fetch the TidyTuesday release for `tt_date` (tidytuesdayR is attached above)
tuesdata <- tt_load(tt_date)

# Give each table in the download its own top-level object
gutenberg_subjects <- tuesdata$gutenberg_subjects
gutenberg_metadata <- tuesdata$gutenberg_metadata
gutenberg_languages <- tuesdata$gutenberg_languages
gutenberg_authors <- tuesdata$gutenberg_authors

# The combined download is not needed once the tables are extracted
rm(list = "tuesdata")

3. Examine Data

Show Code
# Preview the first rows of the authors table
gutenberg_authors |> head()
# A tibble: 6 × 7
  gutenberg_author_id author         alias birthdate deathdate wikipedia aliases
                <dbl> <chr>          <chr>     <dbl>     <dbl> <chr>     <chr>  
1                   1 United States  U.S.…        NA        NA https://… U.S.A. 
2                   3 Lincoln, Abra… <NA>       1809      1865 https://… United…
3                   4 Henry, Patrick <NA>       1736      1799 https://… <NA>   
4                   5 Adam, Paul     <NA>       1849      1931 https://… <NA>   
5                   7 Carroll, Lewis Dodg…      1832      1898 https://… Dodgso…
6                   8 United States… <NA>         NA        NA https://… Agency…
Show Code
# Preview the first rows of the languages table
gutenberg_languages |> head()
# A tibble: 6 × 3
  gutenberg_id language total_languages
         <dbl> <chr>              <dbl>
1            1 en                     1
2            2 en                     1
3            3 en                     1
4            4 en                     1
5            5 en                     1
6            6 en                     1
Show Code
# Preview the first rows of the book metadata table
gutenberg_metadata |> head()
# A tibble: 6 × 8
  gutenberg_id title     author gutenberg_author_id language gutenberg_bookshelf
         <dbl> <chr>     <chr>                <dbl> <chr>    <chr>              
1            1 "The Dec… Jeffe…                1638 en       Politics/American …
2            2 "The Uni… Unite…                   1 en       Politics/American …
3            3 "John F.… Kenne…                1666 en       Browsing: History …
4            4 "Lincoln… Linco…                   3 en       US Civil War/Brows…
5            5 "The Uni… Unite…                   1 en       United States/Poli…
6            6 "Give Me… Henry…                   4 en       American Revolutio…
# ℹ 2 more variables: rights <chr>, has_text <lgl>
Show Code
# Preview the first rows of the subjects table
gutenberg_subjects |> head()
# A tibble: 6 × 3
  gutenberg_id subject_type subject                                             
         <dbl> <chr>        <chr>                                               
1            1 lcsh         United States -- History -- Revolution, 1775-1783 -…
2            1 lcsh         United States. Declaration of Independence          
3            1 lcc          E201                                                
4            1 lcc          JK                                                  
5            2 lcsh         Civil rights -- United States -- Sources            
6            2 lcsh         United States. Constitution. 1st-10th Amendments    

4. Cleaning

Show Code
# Build one row per book: all of its subject strings glued together,
# separated by " | "
pg_subjects <- summarise(
  group_by(gutenberg_subjects, gutenberg_id),
  subjects = str_flatten(subject, collapse = " | "),
  .groups = "drop"
)
pg_subjects |> head()
# A tibble: 6 × 2
  gutenberg_id subjects                                                         
         <dbl> <chr>                                                            
1            1 United States -- History -- Revolution, 1775-1783 -- Sources | U…
2            2 Civil rights -- United States -- Sources | United States. Consti…
3            3 United States -- Foreign relations -- 1961-1963 | Presidents -- …
4            4 Consecration of cemeteries -- Pennsylvania -- Gettysburg | Soldi…
5            5 United States -- Politics and government -- 1783-1789 -- Sources…
6            6 Speeches, addresses, etc., American | United States -- Politics …
Show Code
# Attach the flattened subjects to each book, keep only the columns used
# downstream, and drop any row that has a missing value in one of them
pg_clean <- gutenberg_metadata |>
  left_join(pg_subjects, by = join_by(gutenberg_id)) |>
  select(gutenberg_id, title, language, subjects) |>
  na.omit()
pg_clean |> head()
# A tibble: 6 × 4
  gutenberg_id title                                           language subjects
         <dbl> <chr>                                           <chr>    <chr>   
1            1 "The Declaration of Independence of the United… en       United …
2            2 "The United States Bill of Rights\r\nThe Ten O… en       Civil r…
3            3 "John F. Kennedy's Inaugural Address"           en       United …
4            4 "Lincoln's Gettysburg Address\r\nGiven Novembe… en       Consecr…
5            5 "The United States Constitution"                en       United …
6            6 "Give Me Liberty or Give Me Death"              en       Speeche…
Show Code
# Countries to search for in the subject strings, grouped by region.
# The order matters: it is the tie-break order when counts are sorted later.
latam_caribbean_countries <- c(
  # South America
  c(
    "Argentina", "Bolivia", "Brazil", "Chile", "Colombia", "Ecuador",
    "Guyana", "Paraguay", "Peru", "Suriname", "Uruguay", "Venezuela"
  ),
  # Central America
  c(
    "Costa Rica", "El Salvador", "Guatemala",
    "Honduras", "Nicaragua", "Panama"
  ),
  # Caribbean
  c("Cuba", "Dominican Republic", "Puerto Rico"),
  # North America (Spanish-speaking)
  c("Mexico")
)
Show Code
# Subject strings with a known look-alike place name removed before matching.
# A plain substring search for "Mexico" also hits the U.S. state "New Mexico",
# which would credit Mexico with books that are not about the country. A book
# whose subjects mention both still counts, because only the "New Mexico"
# phrase itself is stripped.
subjects_for_matching <- str_remove_all(pg_clean$subjects, fixed("New Mexico"))

# For each country, count the books whose subjects mention it at least once
# (literal, case-sensitive substring match)
book_counts <- map_int(
  latam_caribbean_countries,
  \(country) sum(str_detect(subjects_for_matching, fixed(country)))
)

# One row per country, binned for the map legend and sorted descending
cnts_by_country <- data.frame(
  country = latam_caribbean_countries,
  book_count = book_counts
) |>
  mutate(
    cnt_group = cut(
      book_count,
      breaks = c(-Inf, 50, 100, Inf),
      labels = c("0 - 49", "50 - 99", "100+"),
      right = FALSE, # left-closed bins: < 50, 50 to 99, 100 and up
      ordered_result = TRUE
    )
  ) |>
  arrange(desc(book_count))
cnts_by_country
              country book_count cnt_group
1              Mexico        327      100+
2              Brazil        133      100+
3           Argentina        122      100+
4                Cuba         73   50 - 99
5                Peru         72   50 - 99
6              Panama         52   50 - 99
7               Chile         43    0 - 49
8           Nicaragua         29    0 - 49
9             Uruguay         23    0 - 49
10        Puerto Rico         22    0 - 49
11           Paraguay         21    0 - 49
12             Guyana         17    0 - 49
13          Venezuela         17    0 - 49
14            Bolivia         16    0 - 49
15           Suriname         13    0 - 49
16          Guatemala         13    0 - 49
17           Colombia         10    0 - 49
18            Ecuador          9    0 - 49
19           Honduras          6    0 - 49
20         Costa Rica          3    0 - 49
21 Dominican Republic          2    0 - 49
22        El Salvador          1    0 - 49

5. Mapping Parameters

Show Code
# Set lat/lon parameters for plotting area (passed to coord_sf() in the plot)
long_min <- -125
long_max <- -30
lat_min <- -60
lat_max <- 35

# Load country shapes
world <- ne_countries(scale = "medium", returnclass = "sf")

# Join country shapes to cnts_by_country.
# Natural Earth's `name` column abbreviates some countries (the Dominican
# Republic is believed to be stored as "Dominican Rep." -- confirm against the
# data), and an inner join on `name` alone silently drops those shapes from the
# map. Match on the spelled-out `name_long` when it is one of our countries,
# otherwise fall back to `name`. `name` itself is kept for the plot labels.
world_counts <- world |>
  mutate(
    join_name = if_else(
      name_long %in% cnts_by_country$country,
      name_long,
      name
    )
  ) |>
  inner_join(cnts_by_country, by = c("join_name" = "country"))

# Report any country that still has no shape instead of losing it quietly
unmatched_countries <- setdiff(cnts_by_country$country, world_counts$join_name)
if (length(unmatched_countries) > 0) {
  warning(
    "No map shape found for: ",
    paste(unmatched_countries, collapse = ", "),
    call. = FALSE
  )
}

# Extract lat/lon from sf object.
# st_point_on_surface() gives a point on each country's surface to anchor its
# label; only that call's warnings are suppressed.
world_counts <- world_counts |>
  mutate(label_point = suppressWarnings(st_point_on_surface(geometry))) |>
  mutate(
    lon = st_coordinates(label_point)[, 1],
    lat = st_coordinates(label_point)[, 2]
  )

6. Visualization

Show Code
# Load Lato font from Google Fonts and render text through showtext at the
# same dpi used for export
font_add_google("Lato", "lato")
showtext_auto()
showtext_opts(dpi = 300)

# Make plot: country outlines, plus a repelled label (name over book count)
# for every country with at least 10 books, filled by count group
final_plot <-
  ggplot(world_counts) +
  geom_sf(color = "gray40") +
  coord_sf(
    xlim = c(long_min, long_max),
    ylim = c(lat_min, lat_max)
  ) +
  geom_label_repel(
    data = world_counts |> filter(book_count >= 10),
    aes(
      x = lon,
      y = lat,
      # paste0() so no stray spaces are added around the line break
      # (paste() would produce "Mexico \n 327")
      label = paste0(name, "\n", book_count),
      fill = cnt_group
    ),
    family = "lato",
    size = 3,
    alpha = 0.9,
    fontface = "bold",
    label.size = 0.2,
    max.overlaps = 30
  ) +
  scale_fill_manual(values = c(
    "0 - 49" = "white",
    "50 - 99" = "#E6C36D",
    "100+" = "#A8C7A1"
  )) +
  labs(
    title = "How many books in the Gutenberg online \nlibrary are about Latin America?",
    subtitle = "Mexico was the sure bet to have the most books, but surprised to see so few for Puerto Rico, \nColombia, and Venezuela. Books in the library are at least 95 years old and are not currently \nunder copyright.",
    caption = "Chart produced by Steven Villalon for Tidy Tuesday exercise on June 3, 2025.",
    fill = "Book Counts"
  ) +
  theme_minimal(base_family = "lato") +
  theme(
    # Black canvas everywhere, including the legend
    plot.background = element_rect(fill = "black", color = NA),
    panel.background = element_rect(fill = "black", color = NA),
    legend.background = element_rect(fill = "black", color = NA),
    legend.key = element_rect(fill = "black", color = NA),
    # Legend sits inside the panel, left of centre
    legend.position = "inside",
    legend.position.inside = c(0.15, 0.5),
    legend.justification = c("left", "center"),
    text = element_text(color = "white"),
    # Hide axes and grid lines
    axis.text = element_blank(),
    axis.title = element_blank(),
    axis.ticks = element_blank(),
    panel.grid = element_blank(),
    plot.caption = element_text(color = "white", hjust = 0),
    plot.title = element_text(color = "#E8A6A1", face = "bold", size = 20),
    plot.subtitle = element_text(color = "gray90", size = 10)
  ) +
  guides(fill = guide_legend(override.aes = list(label = ""))) # Remove "a" from legend

7. Export Visualization

Show Code
# File types to write for the finished chart
formats_to_export <- c("png", "svg")

# Hand the plot to the project's save helper (a custom R script, not a
# package function); all arguments are passed by name
save_tt_plots(
  plot = final_plot,
  title = title,
  date = tt_date,
  formats = formats_to_export,
  output_folder = "output",
  width = 6,
  height = 8,
  dpi = 300
)

8. Session Info

R version 4.4.1 (2024-06-14)
Platform: x86_64-apple-darwin20
Running under: macOS Ventura 13.7.6

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/lib/libRblas.0.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: America/New_York
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] ggtext_0.1.2        showtext_0.9-7      showtextdb_3.0     
 [4] sysfonts_0.8.9      ggrepel_0.9.6       sf_1.0-21          
 [7] rnaturalearth_1.0.1 here_1.0.1          tidytuesdayR_1.2.1 
[10] lubridate_1.9.4     forcats_1.0.0       stringr_1.5.1      
[13] dplyr_1.1.4         purrr_1.0.4         readr_2.1.5        
[16] tidyr_1.3.1         tibble_3.2.1        ggplot2_3.5.2      
[19] tidyverse_2.0.0    

loaded via a namespace (and not attached):
 [1] gtable_0.3.6            xfun_0.52               httr2_1.1.2            
 [4] htmlwidgets_1.6.4       gh_1.5.0                tzdb_0.5.0             
 [7] vctrs_0.6.5             tools_4.4.1             generics_0.1.4         
[10] parallel_4.4.1          curl_6.2.3              proxy_0.4-27           
[13] pkgconfig_2.0.3         KernSmooth_2.23-26      RColorBrewer_1.1-3     
[16] lifecycle_1.0.4         compiler_4.4.1          farver_2.1.2           
[19] textshaping_1.0.1       terra_1.8-54            codetools_0.2-20       
[22] htmltools_0.5.8.1       class_7.3-23            yaml_2.3.10            
[25] crayon_1.5.3            pillar_1.10.2           classInt_0.4-11        
[28] rnaturalearthdata_1.0.0 tidyselect_1.2.1        digest_0.6.37          
[31] stringi_1.8.7           rprojroot_2.0.4         fastmap_1.2.0          
[34] grid_4.4.1              cli_3.6.5               magrittr_2.0.3         
[37] utf8_1.2.5              e1071_1.7-16            withr_3.0.2            
[40] rappdirs_0.3.3          scales_1.4.0            bit64_4.6.0-1          
[43] timechange_0.3.0        rmarkdown_2.29          httr_1.4.7             
[46] gitcreds_0.1.2          bit_4.6.0               ragg_1.4.0             
[49] hms_1.1.3               evaluate_1.0.3          knitr_1.50             
[52] rlang_1.1.6             gridtext_0.1.5          Rcpp_1.0.14            
[55] glue_1.8.0              DBI_1.2.3               xml2_1.3.8             
[58] svglite_2.2.1           vroom_1.6.5             jsonlite_2.0.0         
[61] R6_2.6.1                systemfonts_1.2.3       units_0.8-7            

9. Github

Back to top