# Load dependencies
library(tidyverse)
library(tidytuesdayR)
Tags: useR! Conference, Barplot, Text Analysis, LDA, R
The materials at the following link were very helpful for learning about text analysis, topic modeling, and LDA. https://www.tidytextmining.com/topicmodeling#per-document
Load Data
# Load data
tuesdata <- tidytuesdayR::tt_load('2025-04-29')
user2025 <- tuesdata$user2025

# View(user2025)
Explore Data
summary(user2025)
id session date time
Min. : 3.00 Length:128 Min. :2025-08-01 Length:128
1st Qu.: 50.75 Class :character 1st Qu.:2025-08-08 Class :character
Median : 98.50 Mode :character Median :2025-08-09 Mode :character
Mean : 94.50 Mean :2025-08-07
3rd Qu.:138.25 3rd Qu.:2025-08-10
Max. :175.00 Max. :2025-08-10
room title content video_recording
Length:128 Length:128 Length:128 Length:128
Class :character Class :character Class :character Class :character
Mode :character Mode :character Mode :character Mode :character
keywords speakers co_authors
Length:128 Length:128 Length:128
Class :character Class :character Class :character
Mode :character Mode :character Mode :character
# Look for particular strings
user2025 |>
  filter(str_detect(content, "clinical trial"))
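It can also help to quantify how common a phrase is before modeling. A minimal sketch (an assumed addition, not part of the original post) that counts the sessions mentioning the phrase:

# Assumed example: count how many session descriptions mention the phrase
user2025 |>
  summarize(n_clinical_trial = sum(str_detect(content, "clinical trial"), na.rm = TRUE))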
LDA Topic Model
library(tidytext)
library(textdata)
library(textstem)
Loading required package: koRpus.lang.en
Loading required package: koRpus
Loading required package: sylly
For information on available language packages for 'koRpus', run
available.koRpus.lang()
and see ?install.koRpus.lang()
Attaching package: 'koRpus'
The following object is masked from 'package:readr':
tokenize
# Unnest words from session keywords
tidy_words <- user2025 |>
  unnest_tokens(word, keywords) |>
  select(id, title, word)
# Load stopwords and add custom stopwords
custom_stop_words <- bind_rows(
  tidytext::stop_words,
  tibble(word = c("http", "https", "1", "data", "package", "packages"))
)
# Remove stop words and lemmatize remaining words (removes plurals, suffixes, etc)
tidy_words_nostop <- tidy_words |>
  anti_join(custom_stop_words, by = "word") |>
  mutate(word = lemmatize_words(word))

head(tidy_words_nostop)
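To make the lemmatization step concrete, here is a small assumed illustration (not from the original post); the exact mappings depend on textstem's lemma dictionary:

# Assumed illustration of lemmatize_words(); results depend on the lemma dictionary
lemmatize_words(c("packages", "models", "ran", "analyses"))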
# LDA model
library(topicmodels)
# Create document-term matrix (DTM)
dtm <- tidy_words_nostop |>
  count(id, word) |>
  cast_dtm(document = id, term = word, value = n)

# Fit LDA model with 3 topics
user2025_lda <- LDA(dtm, k = 3, control = list(seed = 1234))
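The choice of k = 3 is a judgment call. One common way to sanity-check it (an assumed sketch, not part of the original analysis) is to compare perplexity across a few candidate values of k using topicmodels::perplexity():

# Assumed sketch: refit with a few candidate k values and compare perplexity (lower is better)
candidate_k <- c(2, 3, 4, 5)
perplexities <- sapply(candidate_k, function(k) {
  fit <- LDA(dtm, k = k, control = list(seed = 1234))
  perplexity(fit)
})
data.frame(k = candidate_k, perplexity = perplexities)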
# Pull betas (probabilities) for each word in each topic
user2025_topics <- tidy(user2025_lda, matrix = "beta")

head(user2025_topics)
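As a quick sanity check (an assumed addition), each topic's betas form a probability distribution over the vocabulary, so they should sum to 1 within a topic:

# Assumed check: betas are per-topic word probabilities, so each topic should sum to 1
user2025_topics |>
  group_by(topic) |>
  summarize(total_beta = sum(beta))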
# Get top 10 words per topic
user_top_terms <- user2025_topics |>
  group_by(topic) |>
  slice_max(beta, n = 10) |>
  ungroup() |>
  arrange(topic, -beta)

user_top_terms
# Simple visualization
user_top_terms |>
  mutate(term = reorder_within(term, beta, topic)) |>
  ggplot(aes(beta, term, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  scale_y_reordered()
Output
library(showtext)
Loading required package: sysfonts
Loading required package: showtextdb
library(ggthemes)
# Load Lato font
font_add_google("Lato", "lato")
showtext_auto()
# Define colors
<- c("#1b9e77", "#d95f02", "#7570b3", "#e7298a", "#66a61e")
my_colors
# Plot
user_top_terms |>
  mutate(term = reorder_within(term, beta, topic)) |>
  ggplot(aes(beta, term, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free_y") +
  scale_y_reordered() +
  scale_fill_manual(values = my_colors) +
  labs(
    title = "Uncovering the 3 main topics at useR! Conference 2025",
    subtitle = "LDA method used to fit a topic model on session descriptions.",
    x = "Probability of Term Being in Topic",
    y = "Term",
    caption = "\nChart produced by Steven Villalon for Tidy Tuesday exercise on April 29, 2025"
  ) +
  theme_minimal(
    base_size = 14,
    base_family = "lato") +
  theme(
    plot.title.position = "plot",
    plot.caption.position = "plot",
    plot.title = element_text(face = "bold", size = 18, hjust = 0),
    plot.subtitle = element_text(size = 14, margin = margin(b = 10), hjust = 0),
    plot.caption = element_text(
      size = 10,
      color = "gray40",
      hjust = 0,
      face = "plain"),
    strip.text = element_text(face = "bold", size = 14),
    axis.text.y = element_text(size = 12),
    axis.text.x = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )
Other Analyses
TF-IDF
# Pull the gammas (per-document probabilities of belonging to each topic)
doc_topic_gammas <- tidy(user2025_lda, matrix = "gamma")

doc_topic_gammas
# Get the top topic for each document id
assigned_topics <- doc_topic_gammas |>
  group_by(document) |>
  slice_max(gamma, n = 1, with_ties = FALSE) |>
  mutate(document = as.double(document)) |>
  ungroup()

assigned_topics
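A quick follow-up (assumed, not in the original post) is to see how many sessions land in each topic:

# Assumed check: number of sessions assigned to each topic
assigned_topics |>
  count(topic)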
# Join back to original dataset
user2025_with_topics <- user2025 |>
  left_join(assigned_topics, by = c("id" = "document"))
# Unnest words from session descriptions (column = content)
tidy_words_with_topics <- user2025_with_topics |>
  unnest_tokens(word, content) |>
  select(id, topic, title, word)
# Remove stop words and lemmatize remaining words (remove plurals, suffixes, etc)
tidy_words_with_topics_nostop <- tidy_words_with_topics |>
  anti_join(custom_stop_words, by = "word") |>
  mutate(word = lemmatize_words(word))

head(tidy_words_with_topics_nostop)
# Get counts by word and topic
word_counts <- tidy_words_with_topics_nostop |>
  count(topic, word, sort = TRUE)

word_counts
# Compute tf-idfs
tfidf_words <- word_counts |>
  bind_tf_idf(word, topic, n)

tfidf_words
# Filter to top 10 tf-idfs for each topic
top_tfidf_words <- tfidf_words |>
  group_by(topic) |>
  arrange(desc(tf_idf)) |>
  slice_head(n = 10) |>
  ungroup()

top_tfidf_words
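The tf-idf results can be plotted the same way as the beta plot above. A minimal sketch (assumed, reusing tidytext's reorder_within() helper):

# Assumed sketch: plot top tf-idf terms per topic, mirroring the earlier beta plot
top_tfidf_words |>
  mutate(word = reorder_within(word, tf_idf, topic)) |>
  ggplot(aes(tf_idf, word, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free_y") +
  scale_y_reordered() +
  labs(x = "tf-idf", y = "Term")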
Bigrams
library(tidytext)
library(dplyr)
library(ggplot2)
# Step 1: Tokenize session descriptions (content column) into bigrams
talks_bigrams <- user2025 %>%
  unnest_tokens(bigram, content, token = "ngrams", n = 2)
# Step 2: Separate the bigrams into two words
talks_bigrams_separated <- talks_bigrams %>%
  separate(bigram, into = c("word1", "word2"), sep = " ")
# Step 3: Remove stopwords
data("stop_words")
talks_bigrams_filtered <- talks_bigrams_separated %>%
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word)
# Step 4: Count the bigrams
bigram_counts <- talks_bigrams_filtered %>%
  count(word1, word2, sort = TRUE)
# Step 5: Visualize Top 15 Bigrams
bigram_counts %>%
  top_n(15) %>%
  mutate(bigram = paste(word1, word2, sep = " ")) %>%
  ggplot(aes(x = reorder(bigram, n), y = n)) +
  geom_col() +
  coord_flip() +
  labs(title = "Top 15 Bigrams in Talks",
       x = "Bigram",
       y = "Frequency")

Selecting by n
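As a possible extension (assumed here, following the approach in the tidytextmining book linked above), the same bigram counts can be drawn as a word network with igraph and ggraph:

# Assumed extension: turn the more frequent bigrams into a co-occurrence network
library(igraph)
library(ggraph)

bigram_graph <- bigram_counts %>%
  filter(n > 2) %>%
  graph_from_data_frame()

set.seed(2025)
ggraph(bigram_graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1, size = 3) +
  theme_void()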