Going to reuse this old project, https://evamaerey.github.io/flipbooks/federalist/federalist#1, and hope to look at LLM classification with the mall package and at PCA/t-SNE/UMAP!
It connects the documents that are most correlated, based on ‘stop word’ usage, and labels them by author…
On slide 44:
# Fetch the Federalist corpus (.rda) and load it into the workspace.
# The download is skipped when a local copy already exists, so re-running the
# script does not hit the network again.
federalist_rda <- "federalist.rda"
if (!file.exists(federalist_rda)) {
  download.file(
    "https://github.com/patperry/r-corpus/raw/refs/heads/master/data/federalist.rda",
    destfile = federalist_rda,
    mode = "wb" # .rda is a binary format; request a binary transfer explicitly
  )
}
library(tidyverse)
# load() restores the objects stored in the file; the `federalist` data frame
# used below is expected to come from it.
load(federalist_rda)
# Print as a tibble for a compact preview.
federalist |> tibble()
## # A tibble: 85 × 6
## name title venue date author text
## <chr> <chr> <chr> <date> <chr> <chr>
## 1 Federalist No. 1 General Introduction For … NA Hamil… "To …
## 2 Federalist No. 2 Concerning Dangers from Fore… For … NA Jay "To …
## 3 Federalist No. 3 The Same Subject Continued (… For … NA Jay "To …
## 4 Federalist No. 4 The Same Subject Continued (… For … NA Jay "To …
## 5 Federalist No. 5 The Same Subject Continued (… For … NA Jay "To …
## 6 Federalist No. 6 Concerning Dangers from Diss… For … NA Hamil… "To …
## 7 Federalist No. 7 The Same Subject Continued (… For … NA Hamil… "To …
## 8 Federalist No. 8 The Consequences of Hostilit… From… 1787-11-20 Hamil… "To …
## 9 Federalist No. 9 The Union as a Safeguard Aga… For … NA Hamil… "To …
## 10 Federalist No. 10 The Same Subject Continued (… From… 1787-11-23 Madis… "To …
## # ℹ 75 more rows
library(mall)
# Store an ellmer Gemini chat object in the `.mall_chat` option
# (presumably what mall picks up as its LLM backend - confirm in mall docs).
options(.mall_chat = ellmer::chat_google_gemini())

# Classify one randomly chosen paper into one of four author labels; the
# prediction is written to a new `pred_author` column.
federalist |>
  # slice_sample() is the current replacement for the superseded sample_n()
  slice_sample(n = 1) |>
  # keep only the first run of 50 characters matched by `.` in the text
  mutate(text = str_extract(text, ".{50}")) |>
  select(name, text, author) |>
  # NOTE(review): the classification runs on `name`, not on `text` -
  # confirm this is the intended input column.
  llm_classify(
    col = name,
    labels = c("Madison", "Hamilton", "Jay", "Publius"),
    pred_name = "pred_author"
  )
# Count how many of the built-in colour names equal "transparent".
sum(colors() %in% "transparent")
# Fill missing authors with "Disputed", then rebuild `name` as
# "<author>-<name with 'eralist' removed>" so each document name carries its
# author as a prefix.
federalist <- federalist |>
  mutate(
    author = replace_na(author, "Disputed"),
    name = paste(author, str_remove(name, "eralist"), sep = "-")
  )
# Build a per-document token-count table: corpus -> tokens (numbers and
# punctuation removed) -> document-feature matrix -> plain data frame.
# Document ids are taken from the `name` column.
federalist_word_count <- federalist |>
  quanteda::corpus(docid_field = "name", text_field = "text") |>
  quanteda::tokens(remove_punct = TRUE, remove_numbers = TRUE) |>
  quanteda::dfm() |>
  quanteda::convert(to = "data.frame") |>
  # NOTE(review): the reason for dropping these three token columns is not
  # visible here - confirm with the author.
  select(-c("in", "for", "or"))
# Peek at the last few column names of the count table.
tail(names(federalist_word_count))
## [1] "prodigy" "completion" "trembling" "recommence" "hume's"
## [6] "essays"
library(ggdims)
# Plot the documents with ggdims' geom_pca(), using the columns from `the`
# through `an` as the input dimensions.
ggplot(data = federalist_word_count) +
  aes(dims = dims(the:an)) +
  geom_pca()
## Warning: Using `as.character()` on a quosure is deprecated as of rlang 0.3.0. Please use
## `as_label()` or `as_name()` instead.
## This warning is displayed once every 8 hours.
# Re-draw the previous plot with fill mapped to the doc_id prefix
# (everything up to and including the last hyphen).
last_plot() + aes(fill = str_extract(doc_id, ".+-"))
library(ggdims)
# Plot the documents with ggdims' geom_tsne() (perplexity 25), using the
# columns from `the` through `an` as the input dimensions.
ggplot(data = federalist_word_count) +
  aes(dims = dims(the:an)) +
  geom_tsne(perplexity = 25)
## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if
## `.name_repair` is omitted as of tibble 2.0.0.
## ℹ Using compatibility `.name_repair`.
## ℹ The deprecated feature was likely used in the ggdims package.
## Please report the issue to the authors.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Re-draw the previous plot with fill mapped to the doc_id prefix
# (everything up to and including the last hyphen).
last_plot() + aes(fill = str_extract(doc_id, ".+-"))
library(ggdims)
# Plot the documents with ggdims' geom_umap(), using the columns from `the`
# through `an` as the input dimensions.
ggplot(data = federalist_word_count) +
  aes(dims = dims(the:an)) +
  geom_umap()
# Re-draw the previous plot with fill mapped to the doc_id prefix
# (everything up to and including the last hyphen).
last_plot() + aes(fill = str_extract(doc_id, ".+-"))
Created on 2026-01-27 with reprex v2.1.1 ```