Going to reuse this old project, https://evamaerey.github.io/flipbooks/federalist/federalist#1, and hope to look at mall LLM classification and PCA/t-SNE/UMAP!
It connects the documents that are most correlated based on ‘stop word’ usage, and labels them by author…
On slide 44:
# Fetch the Federalist Papers corpus once and cache it next to this document,
# so re-knitting does not hit the network again.
federalist_rda <- "federalist.rda"
if (!file.exists(federalist_rda)) {
  download.file(
    "https://github.com/patperry/r-corpus/raw/refs/heads/master/data/federalist.rda",
    destfile = federalist_rda,
    mode = "wb" # .rda is a binary file; never write it in text mode
  )
}
library(tidyverse)
# load() restores the `federalist` data frame stored in the .rda file.
load(federalist_rda)
# Print it as a tibble for a compact preview.
tibble(federalist)
## # A tibble: 85 × 6
## name title venue date author text
## <chr> <chr> <chr> <date> <chr> <chr>
## 1 Federalist No. 1 General Introduction For … NA Hamil… "To …
## 2 Federalist No. 2 Concerning Dangers from Fore… For … NA Jay "To …
## 3 Federalist No. 3 The Same Subject Continued (… For … NA Jay "To …
## 4 Federalist No. 4 The Same Subject Continued (… For … NA Jay "To …
## 5 Federalist No. 5 The Same Subject Continued (… For … NA Jay "To …
## 6 Federalist No. 6 Concerning Dangers from Diss… For … NA Hamil… "To …
## 7 Federalist No. 7 The Same Subject Continued (… For … NA Hamil… "To …
## 8 Federalist No. 8 The Consequences of Hostilit… From… 1787-11-20 Hamil… "To …
## 9 Federalist No. 9 The Union as a Safeguard Aga… For … NA Hamil… "To …
## 10 Federalist No. 10 The Same Subject Continued (… From… 1787-11-23 Madis… "To …
## # ℹ 75 more rows
library(mall)
# Route mall's llm_*() calls through a Gemini chat object created by ellmer.
options(.mall_chat = ellmer::chat_google_gemini())
federalist |>
  # just a demo - let's use fewer tokens: five random papers ...
  sample_n(5) |>
  # ... and only a 50-character snippet of each (`.` does not match a newline,
  # so this is the first run of 50 characters on a single line)
  mutate(text = str_extract(text, ".{50}")) |>
  # and focus on a few columns
  select(name, text, author) |>
  # Classify the snippet itself. At this point `name` only holds
  # "Federalist No. N", so classifying it would ignore the text trimmed above.
  llm_classify(
    col = text,
    labels = c("Madison", "Hamilton", "Jay", "Publius"),
    pred_name = "pred_author"
  )
# Count how many of R's built-in colour names are exactly "transparent".
sum(colors() == "transparent")
# Label papers with no recorded author as "Disputed", then prefix each
# shortened paper name ("Fed No. N") with its author, e.g. "Jay-Fed No. 2".
# NOTE: this overwrites `federalist`; running it again stacks another prefix.
federalist <- federalist |>
  mutate(
    author = replace_na(author, "Disputed"),
    name = paste0(author, "-", str_remove(name, "eralist"))
  )
# Build `fed`: one row per paper (`doc_name`, `author`) and one column per
# stop word holding how often that word occurs in the paper.
fed <- federalist |>
  # no-op when the relabelling step above has run; keeps the label consistent
  mutate(author = replace_na(author, "Disputed")) |>
  select(doc_name = name, author, text) |>
  tidytext::unnest_tokens(output = word, input = text) |>
  # stop words are good for stylometry - keep them only.
  # semi_join() filters without duplicating rows. tidytext::stop_words lists
  # many words once per lexicon, so an inner_join() repeats each token once per
  # matching lexicon and inflates the counts unevenly across words.
  semi_join(tidytext::stop_words, by = "word") |>
  count(doc_name, word, author) |>
  # most frequent first: pivot_wider() creates columns in order of first
  # appearance, which fixes the column order that ranges like `the:their` use
  arrange(desc(n)) |>
  pivot_wider(names_from = word, values_from = n, values_fill = 0) |>
  # NOTE(review): presumably dropped because these names collide with R
  # reserved words / common functions in later column selections - confirm.
  select(-all_of(c(
    "in", "as", "for", "any", "with", "all", "if",
    "while", "which", "else", "each", "let", "get"
  )))
## Joining with `by = join_by(word)`
## Warning in inner_join(., tidytext::stop_words): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 434 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
# Column position of the stop word "whom" in `fed`.
which(names(fed) == "whom")
## [1] 144
# List every column of `fed`: the two id columns, then the stop words.
colnames(fed)
## [1] "doc_name" "author" "the" "of"
## [5] "to" "a" "and" "that"
## [9] "is" "be" "it" "by"
## [13] "would" "this" "have" "are"
## [17] "or" "been" "not" "our"
## [21] "his" "their" "on" "they"
## [25] "i" "was" "from" "no"
## [29] "will" "he" "between" "but"
## [33] "at" "we" "its" "an"
## [37] "had" "more" "has" "there"
## [41] "other" "these" "them" "new"
## [45] "were" "who" "may" "under"
## [49] "than" "those" "most" "against"
## [53] "states" "shall" "cases" "should"
## [57] "must" "her" "same" "upon"
## [61] "could" "such" "some" "can"
## [65] "one" "us" "only" "state"
## [69] "what" "right" "into" "very"
## [73] "your" "you" "both" "necessary"
## [77] "ought" "every" "when" "so"
## [81] "my" "different" "too" "might"
## [85] "causes" "within" "without" "itself"
## [89] "still" "him" "because" "cannot"
## [93] "where" "either" "another" "number"
## [97] "being" "few" "over" "how"
## [101] "great" "therefore" "general" "also"
## [105] "though" "members" "many" "much"
## [109] "two" "first" "fact" "now"
## [113] "said" "well" "among" "do"
## [117] "up" "during" "himself" "out"
## [121] "again" "enough" "own" "less"
## [125] "nor" "man" "nothing" "use"
## [129] "interests" "several" "themselves" "case"
## [133] "men" "greater" "does" "she"
## [137] "far" "want" "even" "next"
## [141] "whose" "others" "often" "whom"
## [145] "once" "certain" "me" "just"
## [149] "whole" "did" "interest" "further"
## [153] "then" "here" "through" "seems"
## [157] "three" "become" "last" "possible"
## [161] "always" "likely" "take" "ever"
## [165] "four" "best" "however" "yet"
## [169] "done" "given" "whether" "after"
## [173] "before" "years" "having" "why"
## [177] "making" "course" "myself" "until"
## [181] "part" "high" "latter" "says"
## [185] "known" "least" "former" "per"
## [189] "seem" "kind" "thus" "wants"
## [193] "down" "never" "second" "think"
## [197] "doing" "alone" "see" "like"
## [201] "made" "make" "good" "small"
## [205] "already" "rather" "according" "old"
## [209] "side" "year" "five" "particular"
## [213] "thing" "present" "became" "about"
## [217] "perhaps" "taken" "am" "almost"
## [221] "herself" "ourselves" "becomes" "above"
## [225] "better" "certainly" "know" "need"
## [229] "everywhere" "clearly" "ask" "end"
## [233] "seen" "gives" "little" "place"
## [237] "eight" "six" "order" "value"
## [241] "appear" "kept" "point" "places"
## [245] "wish" "long" "re" "tried"
## [249] "whatever" "nine" "able" "concerning"
## [253] "large" "important" "neither" "example"
## [257] "took" "keep" "seven" "numbers"
## [261] "say" "smaller" "knows" "probably"
## [265] "way" "non" "give" "hence"
## [269] "something" "somewhere" "come" "go"
## [273] "clear" "otherwise" "except" "generally"
## [277] "sometimes" "together" "felt" "third"
## [281] "cause" "ours" "contains" "work"
## [285] "although" "find" "throughout" "fully"
## [289] "everything" "off" "soon" "name"
## [293] "self" "whenever" "particularly" "nearly"
## [297] "unless" "interested" "consequently" "facts"
## [301] "immediate" "instead" "things" "member"
## [305] "towards" "beyond" "mean" "parts"
## [309] "various" "asked" "thought" "secondly"
## [313] "fifth" "consider" "believe" "full"
## [317] "contain" "points" "longer" "greatest"
## [321] "therein" "followed" "indeed" "serious"
## [325] "useful" "around" "associated" "came"
## [329] "ended" "saw" "seemed" "went"
## [333] "put" "since" "turn" "usually"
## [337] "actually" "later" "open" "sure"
## [341] "thoroughly" "away" "below" "merely"
## [345] "regards" "used" "following" "truly"
## [349] "afterwards" "needs" "really" "sensible"
## [353] "behind" "interesting" "becoming" "hardly"
## [357] "placed" "room" "theirs" "toward"
## [361] "ways" "turned" "anything" "besides"
## [365] "along" "keeps" "anywhere" "seriously"
## [369] "willing" "going" "nowhere" "reasonably"
## [373] "entirely" "gets" "quite" "especially"
## [377] "gave" "v" "provides" "seeming"
## [381] "tends" "ones" "using" "co"
## [385] "show" "happens" "specified" "none"
## [389] "etc" "ex" "p" "novel"
## [393] "thereby" "exactly" "formerly" "yourselves"
## [397] "allow" "respectively" "whence" "changes"
## [401] "whereas" "forth" "hers" "higher"
## [405] "wherever" "pointed" "sides" "accordingly"
## [409] "elsewhere" "nevertheless" "early" "lately"
## [413] "near" "presents" "trying" "wanting"
## [417] "ends" "considering" "face" "somewhat"
## [421] "unfortunately" "moreover" "shows" "young"
## [425] "e" "longest" "look" "t"
## [429] "turning" "allows" "comes" "corresponding"
## [433] "hereafter" "problem" "smallest" "brief"
## [437] "differ" "highest" "apart" "turns"
## [441] "looking" "finds" "indicated" "try"
## [445] "back" "ending" "yes" "appreciate"
## [449] "indicate" "sent" "unlikely" "wanted"
## [453] "mr" "orders" "beforehand" "getting"
## [457] "looks" "whoever" "saying" "thence"
## [461] "showing" "containing" "regarding" "amongst"
## [465] "seeing" "twice" "parted" "please"
## [469] "differently" "tell" "whither" "ordered"
## [473] "aside" "follows" "goods" "thorough"
## [477] "described" "gone" "group" "et"
## [481] "wonder" "inasmuch" "indicates" "puts"
## [485] "opened" "tries" "presented" "specify"
## [489] "began" "help" "thinks" "viz"
## [493] "opens" "works" "appropriate" "pointing"
## [497] "area" "rooms" "sees" "presenting"
## [501] "needed" "specifying" "goes" "lest"
# Column listing of `fed`, printed a second time.
colnames(fed)
## [1] "doc_name" "author" "the" "of"
## [5] "to" "a" "and" "that"
## [9] "is" "be" "it" "by"
## [13] "would" "this" "have" "are"
## [17] "or" "been" "not" "our"
## [21] "his" "their" "on" "they"
## [25] "i" "was" "from" "no"
## [29] "will" "he" "between" "but"
## [33] "at" "we" "its" "an"
## [37] "had" "more" "has" "there"
## [41] "other" "these" "them" "new"
## [45] "were" "who" "may" "under"
## [49] "than" "those" "most" "against"
## [53] "states" "shall" "cases" "should"
## [57] "must" "her" "same" "upon"
## [61] "could" "such" "some" "can"
## [65] "one" "us" "only" "state"
## [69] "what" "right" "into" "very"
## [73] "your" "you" "both" "necessary"
## [77] "ought" "every" "when" "so"
## [81] "my" "different" "too" "might"
## [85] "causes" "within" "without" "itself"
## [89] "still" "him" "because" "cannot"
## [93] "where" "either" "another" "number"
## [97] "being" "few" "over" "how"
## [101] "great" "therefore" "general" "also"
## [105] "though" "members" "many" "much"
## [109] "two" "first" "fact" "now"
## [113] "said" "well" "among" "do"
## [117] "up" "during" "himself" "out"
## [121] "again" "enough" "own" "less"
## [125] "nor" "man" "nothing" "use"
## [129] "interests" "several" "themselves" "case"
## [133] "men" "greater" "does" "she"
## [137] "far" "want" "even" "next"
## [141] "whose" "others" "often" "whom"
## [145] "once" "certain" "me" "just"
## [149] "whole" "did" "interest" "further"
## [153] "then" "here" "through" "seems"
## [157] "three" "become" "last" "possible"
## [161] "always" "likely" "take" "ever"
## [165] "four" "best" "however" "yet"
## [169] "done" "given" "whether" "after"
## [173] "before" "years" "having" "why"
## [177] "making" "course" "myself" "until"
## [181] "part" "high" "latter" "says"
## [185] "known" "least" "former" "per"
## [189] "seem" "kind" "thus" "wants"
## [193] "down" "never" "second" "think"
## [197] "doing" "alone" "see" "like"
## [201] "made" "make" "good" "small"
## [205] "already" "rather" "according" "old"
## [209] "side" "year" "five" "particular"
## [213] "thing" "present" "became" "about"
## [217] "perhaps" "taken" "am" "almost"
## [221] "herself" "ourselves" "becomes" "above"
## [225] "better" "certainly" "know" "need"
## [229] "everywhere" "clearly" "ask" "end"
## [233] "seen" "gives" "little" "place"
## [237] "eight" "six" "order" "value"
## [241] "appear" "kept" "point" "places"
## [245] "wish" "long" "re" "tried"
## [249] "whatever" "nine" "able" "concerning"
## [253] "large" "important" "neither" "example"
## [257] "took" "keep" "seven" "numbers"
## [261] "say" "smaller" "knows" "probably"
## [265] "way" "non" "give" "hence"
## [269] "something" "somewhere" "come" "go"
## [273] "clear" "otherwise" "except" "generally"
## [277] "sometimes" "together" "felt" "third"
## [281] "cause" "ours" "contains" "work"
## [285] "although" "find" "throughout" "fully"
## [289] "everything" "off" "soon" "name"
## [293] "self" "whenever" "particularly" "nearly"
## [297] "unless" "interested" "consequently" "facts"
## [301] "immediate" "instead" "things" "member"
## [305] "towards" "beyond" "mean" "parts"
## [309] "various" "asked" "thought" "secondly"
## [313] "fifth" "consider" "believe" "full"
## [317] "contain" "points" "longer" "greatest"
## [321] "therein" "followed" "indeed" "serious"
## [325] "useful" "around" "associated" "came"
## [329] "ended" "saw" "seemed" "went"
## [333] "put" "since" "turn" "usually"
## [337] "actually" "later" "open" "sure"
## [341] "thoroughly" "away" "below" "merely"
## [345] "regards" "used" "following" "truly"
## [349] "afterwards" "needs" "really" "sensible"
## [353] "behind" "interesting" "becoming" "hardly"
## [357] "placed" "room" "theirs" "toward"
## [361] "ways" "turned" "anything" "besides"
## [365] "along" "keeps" "anywhere" "seriously"
## [369] "willing" "going" "nowhere" "reasonably"
## [373] "entirely" "gets" "quite" "especially"
## [377] "gave" "v" "provides" "seeming"
## [381] "tends" "ones" "using" "co"
## [385] "show" "happens" "specified" "none"
## [389] "etc" "ex" "p" "novel"
## [393] "thereby" "exactly" "formerly" "yourselves"
## [397] "allow" "respectively" "whence" "changes"
## [401] "whereas" "forth" "hers" "higher"
## [405] "wherever" "pointed" "sides" "accordingly"
## [409] "elsewhere" "nevertheless" "early" "lately"
## [413] "near" "presents" "trying" "wanting"
## [417] "ends" "considering" "face" "somewhat"
## [421] "unfortunately" "moreover" "shows" "young"
## [425] "e" "longest" "look" "t"
## [429] "turning" "allows" "comes" "corresponding"
## [433] "hereafter" "problem" "smallest" "brief"
## [437] "differ" "highest" "apart" "turns"
## [441] "looking" "finds" "indicated" "try"
## [445] "back" "ending" "yes" "appreciate"
## [449] "indicate" "sent" "unlikely" "wanted"
## [453] "mr" "orders" "beforehand" "getting"
## [457] "looks" "whoever" "saying" "thence"
## [461] "showing" "containing" "regarding" "amongst"
## [465] "seeing" "twice" "parted" "please"
## [469] "differently" "tell" "whither" "ordered"
## [473] "aside" "follows" "goods" "thorough"
## [477] "described" "gone" "group" "et"
## [481] "wonder" "inasmuch" "indicates" "puts"
## [485] "opened" "tries" "presented" "specify"
## [489] "began" "help" "thinks" "viz"
## [493] "opens" "works" "appropriate" "pointing"
## [497] "area" "rooms" "sees" "presenting"
## [501] "needed" "specifying" "goes" "lest"
library(ggdims)
# PCA layer over the stop-word count columns `the` through `their`,
# with fill mapped to the paper's author.
ggplot(fed) +
  aes(dims = dims(the:their)) +
  geom_pca() +
  aes(fill = author)
## Warning: Using `as.character()` on a quosure is deprecated as of rlang 0.3.0. Please use
## `as_label()` or `as_name()` instead.
## This warning is displayed once every 8 hours.
# t-SNE layer (perplexity 20) over the same stop-word columns,
# with fill mapped to the paper's author.
ggplot(fed) +
  aes(dims = dims(the:their)) +
  geom_tsne(perplexity = 20) +
  aes(fill = author)
## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if
## `.name_repair` is omitted as of tibble 2.0.0.
## ℹ Using compatibility `.name_repair`.
## ℹ The deprecated feature was likely used in the ggdims package.
## Please report the issue to the authors.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# UMAP layer over the same stop-word columns,
# with fill mapped to the paper's author.
ggplot(fed) +
  aes(dims = dims(the:their)) +
  geom_umap() +
  aes(fill = author)
# Full-vocabulary word counts built with quanteda: papers are identified by
# `name`, tokenised without numbers or punctuation, counted into a
# document-feature matrix and converted to a data frame. The "in", "for" and
# "or" columns are then removed.
federalist_word_count <- federalist |>
  quanteda::corpus(text_field = "text", docid_field = "name") |>
  quanteda::tokens(remove_numbers = TRUE, remove_punct = TRUE) |>
  quanteda::dfm() |>
  quanteda::convert(to = "data.frame") |>
  select(-all_of(c("in", "for", "or")))
# Peek at the last few column names of the word-count table.
tail(names(federalist_word_count))
## [1] "prodigy" "completion" "trembling" "recommence" "hume's"
## [6] "essays"
library(ggdims)
# PCA layer over the word-count columns `the` through `an`.
ggplot(federalist_word_count) +
  aes(dims = dims(the:an)) +
  geom_pca()
# Same plot again, filled by the author prefix of `doc_id`: everything up to
# and including the last "-", e.g. "Hamilton-".
last_plot() +
  aes(fill = str_extract(doc_id, ".+-"))
library(ggdims)
# t-SNE layer (perplexity 25) over the word-count columns `the` through `an`.
ggplot(federalist_word_count) +
  aes(dims = dims(the:an)) +
  geom_tsne(perplexity = 25)
# Same plot again, filled by the author prefix of `doc_id`: everything up to
# and including the last "-", e.g. "Hamilton-".
last_plot() +
  aes(fill = str_extract(doc_id, ".+-"))
library(ggdims)
# UMAP layer over the word-count columns `the` through `an`.
ggplot(federalist_word_count) +
  aes(dims = dims(the:an)) +
  geom_umap()
# Same plot again, filled by the author prefix of `doc_id`: everything up to
# and including the last "-", e.g. "Hamilton-".
last_plot() +
  aes(fill = str_extract(doc_id, ".+-"))
library(embed)
## Loading required package: recipes
##
## Attaching package: 'recipes'
## The following object is masked from 'package:stringr':
##
## fixed
## The following object is masked from 'package:stats':
##
## step
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.4.1 ──
## ✔ broom 1.0.11 ✔ tailor 0.1.0
## ✔ dials 1.4.2 ✔ tune 2.0.1
## ✔ infer 1.1.0 ✔ workflows 1.3.0
## ✔ modeldata 1.5.1 ✔ workflowsets 1.1.1
## ✔ parsnip 1.4.1 ✔ yardstick 1.3.2
## ✔ rsample 1.3.2
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
# UMAP via a recipe: `author` and `doc_name` are given the "id" role so they
# are not predictors; every remaining column is normalized and then embedded.
# The prepped training data is plotted on the first two UMAP components,
# coloured by author.
recipe(~ ., data = fed) |>
  update_role(author, doc_name, new_role = "id") |>
  step_normalize(all_predictors()) |>
  step_umap(all_predictors()) |>
  prep() |>
  bake(new_data = NULL) |>
  ggplot(aes(x = UMAP1, y = UMAP2, color = author)) +
  geom_point()
# PCA via a recipe: `author` and `doc_name` are given the "id" role so they
# are not predictors; every remaining column is normalized and then projected.
# The prepped training data is plotted on the first two principal components,
# coloured by author.
recipe(~ ., data = fed) |>
  update_role(author, doc_name, new_role = "id") |>
  step_normalize(all_predictors()) |>
  step_pca(all_predictors()) |>
  prep() |>
  bake(new_data = NULL) |>
  ggplot(aes(x = PC1, y = PC2, color = author)) +
  geom_point()