Intro Thoughts

I'm going to reuse this old project, https://evamaerey.github.io/flipbooks/federalist/federalist#1, and hope to look at mall LLM classification and PCA/t-SNE/UMAP!

It connects the documents that are most correlated based on ‘stop word’ usage, and labels them by author…

On slide 44:

Status Quo

library(tidyverse)

# Fetch the Federalist Papers corpus once and reuse the local copy on later
# runs instead of downloading it again every time the document is rendered.
federalist_url <- paste0(
  "https://github.com/patperry/r-corpus/raw/refs/heads/master/",
  "data/federalist.rda"
)
federalist_file <- "federalist.rda"

if (!file.exists(federalist_file)) {
  # .rda is a binary format; "wb" keeps the bytes intact on every platform.
  download.file(federalist_url, destfile = federalist_file, mode = "wb")
}

# load() restores the `federalist` object into the workspace.
load(federalist_file)

# Print as a tibble for a compact preview.
federalist |> tibble()
## # A tibble: 85 × 6
##    name              title                         venue date       author text 
##    <chr>             <chr>                         <chr> <date>     <chr>  <chr>
##  1 Federalist No. 1  General Introduction          For … NA         Hamil… "To …
##  2 Federalist No. 2  Concerning Dangers from Fore… For … NA         Jay    "To …
##  3 Federalist No. 3  The Same Subject Continued (… For … NA         Jay    "To …
##  4 Federalist No. 4  The Same Subject Continued (… For … NA         Jay    "To …
##  5 Federalist No. 5  The Same Subject Continued (… For … NA         Jay    "To …
##  6 Federalist No. 6  Concerning Dangers from Diss… For … NA         Hamil… "To …
##  7 Federalist No. 7  The Same Subject Continued (… For … NA         Hamil… "To …
##  8 Federalist No. 8  The Consequences of Hostilit… From… 1787-11-20 Hamil… "To …
##  9 Federalist No. 9  The Union as a Safeguard Aga… For … NA         Hamil… "To …
## 10 Federalist No. 10 The Same Subject Continued (… From… 1787-11-23 Madis… "To …
## # ℹ 75 more rows

Mall categorization…

library(mall)

# Chat backend that mall should use: Google Gemini, created through ellmer.
options(.mall_chat = ellmer::chat_google_gemini())

# Demo only: work on five randomly sampled papers to keep token usage low.
# NOTE(review): the classifier is pointed at `name`, not at the shortened
# `text` -- confirm which column is meant to be sent to the model.
federalist |>
  sample_n(5) |>
  # focus on a few columns
  select(name, text, author) |>
  # shorten each essay to the first 50-character run matched by `.{50}`
  mutate(text = str_extract(text, ".{50}")) |>
  llm_classify(
    col = name,
    labels = c("Madison", "Hamilton", "Jay", "Publius"),
    pred_name = "pred_author"
  )

# Count how many of R's built-in colour names equal "transparent".
sum(colors() %in% "transparent")

# Papers with a missing author become "Disputed"; each paper name is then
# prefixed with its author, with "eralist" stripped from the original name.
federalist <- federalist %>%
  mutate(
    author = replace_na(author, "Disputed"),
    name = paste(author, str_remove(name, "eralist"), sep = "-")
  )


# Build the stylometry table `fed`: one row per paper, one column per stop
# word, each cell holding how often that word occurs in the paper.
fed <- federalist %>%
  # Papers with no recorded author are the disputed ones.
  mutate(author = replace_na(author, "Disputed")) %>%
  select(doc_name = name, author, text) %>%
  tidytext::unnest_tokens(output = word, input = text) %>%
  # Stop words are good for stylometry - keep only them. `stop_words` repeats
  # a word for every lexicon that lists it, so de-duplicate before joining;
  # otherwise a token found in several lexicons is counted several times.
  inner_join(distinct(tidytext::stop_words, word), by = "word") %>%
  count(doc_name, word, author) %>%
  # Largest counts first, so the widened columns start with the words that
  # reach the highest per-paper counts.
  arrange(desc(n)) %>%
  pivot_wider(names_from = word, values_from = n, values_fill = 0) %>%
  # Drop a fixed set of words. Several are R reserved words (in, for, if,
  # while, else) -- presumably awkward as column names; TODO confirm reason.
  select(-all_of(c(
    "in", "as", "for", "any", "with", "all", "if",
    "while", "which", "else", "each", "let", "get"
  )))
## Joining with `by = join_by(word)`
## Warning in inner_join(., tidytext::stop_words): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 434 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
# Column position of the stop word "whom" in `fed`.
which(names(fed) == "whom")
## [1] 144
# All column names of `fed`.
colnames(fed)
##   [1] "doc_name"      "author"        "the"           "of"           
##   [5] "to"            "a"             "and"           "that"         
##   [9] "is"            "be"            "it"            "by"           
##  [13] "would"         "this"          "have"          "are"          
##  [17] "or"            "been"          "not"           "our"          
##  [21] "his"           "their"         "on"            "they"         
##  [25] "i"             "was"           "from"          "no"           
##  [29] "will"          "he"            "between"       "but"          
##  [33] "at"            "we"            "its"           "an"           
##  [37] "had"           "more"          "has"           "there"        
##  [41] "other"         "these"         "them"          "new"          
##  [45] "were"          "who"           "may"           "under"        
##  [49] "than"          "those"         "most"          "against"      
##  [53] "states"        "shall"         "cases"         "should"       
##  [57] "must"          "her"           "same"          "upon"         
##  [61] "could"         "such"          "some"          "can"          
##  [65] "one"           "us"            "only"          "state"        
##  [69] "what"          "right"         "into"          "very"         
##  [73] "your"          "you"           "both"          "necessary"    
##  [77] "ought"         "every"         "when"          "so"           
##  [81] "my"            "different"     "too"           "might"        
##  [85] "causes"        "within"        "without"       "itself"       
##  [89] "still"         "him"           "because"       "cannot"       
##  [93] "where"         "either"        "another"       "number"       
##  [97] "being"         "few"           "over"          "how"          
## [101] "great"         "therefore"     "general"       "also"         
## [105] "though"        "members"       "many"          "much"         
## [109] "two"           "first"         "fact"          "now"          
## [113] "said"          "well"          "among"         "do"           
## [117] "up"            "during"        "himself"       "out"          
## [121] "again"         "enough"        "own"           "less"         
## [125] "nor"           "man"           "nothing"       "use"          
## [129] "interests"     "several"       "themselves"    "case"         
## [133] "men"           "greater"       "does"          "she"          
## [137] "far"           "want"          "even"          "next"         
## [141] "whose"         "others"        "often"         "whom"         
## [145] "once"          "certain"       "me"            "just"         
## [149] "whole"         "did"           "interest"      "further"      
## [153] "then"          "here"          "through"       "seems"        
## [157] "three"         "become"        "last"          "possible"     
## [161] "always"        "likely"        "take"          "ever"         
## [165] "four"          "best"          "however"       "yet"          
## [169] "done"          "given"         "whether"       "after"        
## [173] "before"        "years"         "having"        "why"          
## [177] "making"        "course"        "myself"        "until"        
## [181] "part"          "high"          "latter"        "says"         
## [185] "known"         "least"         "former"        "per"          
## [189] "seem"          "kind"          "thus"          "wants"        
## [193] "down"          "never"         "second"        "think"        
## [197] "doing"         "alone"         "see"           "like"         
## [201] "made"          "make"          "good"          "small"        
## [205] "already"       "rather"        "according"     "old"          
## [209] "side"          "year"          "five"          "particular"   
## [213] "thing"         "present"       "became"        "about"        
## [217] "perhaps"       "taken"         "am"            "almost"       
## [221] "herself"       "ourselves"     "becomes"       "above"        
## [225] "better"        "certainly"     "know"          "need"         
## [229] "everywhere"    "clearly"       "ask"           "end"          
## [233] "seen"          "gives"         "little"        "place"        
## [237] "eight"         "six"           "order"         "value"        
## [241] "appear"        "kept"          "point"         "places"       
## [245] "wish"          "long"          "re"            "tried"        
## [249] "whatever"      "nine"          "able"          "concerning"   
## [253] "large"         "important"     "neither"       "example"      
## [257] "took"          "keep"          "seven"         "numbers"      
## [261] "say"           "smaller"       "knows"         "probably"     
## [265] "way"           "non"           "give"          "hence"        
## [269] "something"     "somewhere"     "come"          "go"           
## [273] "clear"         "otherwise"     "except"        "generally"    
## [277] "sometimes"     "together"      "felt"          "third"        
## [281] "cause"         "ours"          "contains"      "work"         
## [285] "although"      "find"          "throughout"    "fully"        
## [289] "everything"    "off"           "soon"          "name"         
## [293] "self"          "whenever"      "particularly"  "nearly"       
## [297] "unless"        "interested"    "consequently"  "facts"        
## [301] "immediate"     "instead"       "things"        "member"       
## [305] "towards"       "beyond"        "mean"          "parts"        
## [309] "various"       "asked"         "thought"       "secondly"     
## [313] "fifth"         "consider"      "believe"       "full"         
## [317] "contain"       "points"        "longer"        "greatest"     
## [321] "therein"       "followed"      "indeed"        "serious"      
## [325] "useful"        "around"        "associated"    "came"         
## [329] "ended"         "saw"           "seemed"        "went"         
## [333] "put"           "since"         "turn"          "usually"      
## [337] "actually"      "later"         "open"          "sure"         
## [341] "thoroughly"    "away"          "below"         "merely"       
## [345] "regards"       "used"          "following"     "truly"        
## [349] "afterwards"    "needs"         "really"        "sensible"     
## [353] "behind"        "interesting"   "becoming"      "hardly"       
## [357] "placed"        "room"          "theirs"        "toward"       
## [361] "ways"          "turned"        "anything"      "besides"      
## [365] "along"         "keeps"         "anywhere"      "seriously"    
## [369] "willing"       "going"         "nowhere"       "reasonably"   
## [373] "entirely"      "gets"          "quite"         "especially"   
## [377] "gave"          "v"             "provides"      "seeming"      
## [381] "tends"         "ones"          "using"         "co"           
## [385] "show"          "happens"       "specified"     "none"         
## [389] "etc"           "ex"            "p"             "novel"        
## [393] "thereby"       "exactly"       "formerly"      "yourselves"   
## [397] "allow"         "respectively"  "whence"        "changes"      
## [401] "whereas"       "forth"         "hers"          "higher"       
## [405] "wherever"      "pointed"       "sides"         "accordingly"  
## [409] "elsewhere"     "nevertheless"  "early"         "lately"       
## [413] "near"          "presents"      "trying"        "wanting"      
## [417] "ends"          "considering"   "face"          "somewhat"     
## [421] "unfortunately" "moreover"      "shows"         "young"        
## [425] "e"             "longest"       "look"          "t"            
## [429] "turning"       "allows"        "comes"         "corresponding"
## [433] "hereafter"     "problem"       "smallest"      "brief"        
## [437] "differ"        "highest"       "apart"         "turns"        
## [441] "looking"       "finds"         "indicated"     "try"          
## [445] "back"          "ending"        "yes"           "appreciate"   
## [449] "indicate"      "sent"          "unlikely"      "wanted"       
## [453] "mr"            "orders"        "beforehand"    "getting"      
## [457] "looks"         "whoever"       "saying"        "thence"       
## [461] "showing"       "containing"    "regarding"     "amongst"      
## [465] "seeing"        "twice"         "parted"        "please"       
## [469] "differently"   "tell"          "whither"       "ordered"      
## [473] "aside"         "follows"       "goods"         "thorough"     
## [477] "described"     "gone"          "group"         "et"           
## [481] "wonder"        "inasmuch"      "indicates"     "puts"         
## [485] "opened"        "tries"         "presented"     "specify"      
## [489] "began"         "help"          "thinks"        "viz"          
## [493] "opens"         "works"         "appropriate"   "pointing"     
## [497] "area"          "rooms"         "sees"          "presenting"   
## [501] "needed"        "specifying"    "goes"          "lest"
# Column names of `fed`, printed once more.
names(fed)
##   [1] "doc_name"      "author"        "the"           "of"           
##   [5] "to"            "a"             "and"           "that"         
##   [9] "is"            "be"            "it"            "by"           
##  [13] "would"         "this"          "have"          "are"          
##  [17] "or"            "been"          "not"           "our"          
##  [21] "his"           "their"         "on"            "they"         
##  [25] "i"             "was"           "from"          "no"           
##  [29] "will"          "he"            "between"       "but"          
##  [33] "at"            "we"            "its"           "an"           
##  [37] "had"           "more"          "has"           "there"        
##  [41] "other"         "these"         "them"          "new"          
##  [45] "were"          "who"           "may"           "under"        
##  [49] "than"          "those"         "most"          "against"      
##  [53] "states"        "shall"         "cases"         "should"       
##  [57] "must"          "her"           "same"          "upon"         
##  [61] "could"         "such"          "some"          "can"          
##  [65] "one"           "us"            "only"          "state"        
##  [69] "what"          "right"         "into"          "very"         
##  [73] "your"          "you"           "both"          "necessary"    
##  [77] "ought"         "every"         "when"          "so"           
##  [81] "my"            "different"     "too"           "might"        
##  [85] "causes"        "within"        "without"       "itself"       
##  [89] "still"         "him"           "because"       "cannot"       
##  [93] "where"         "either"        "another"       "number"       
##  [97] "being"         "few"           "over"          "how"          
## [101] "great"         "therefore"     "general"       "also"         
## [105] "though"        "members"       "many"          "much"         
## [109] "two"           "first"         "fact"          "now"          
## [113] "said"          "well"          "among"         "do"           
## [117] "up"            "during"        "himself"       "out"          
## [121] "again"         "enough"        "own"           "less"         
## [125] "nor"           "man"           "nothing"       "use"          
## [129] "interests"     "several"       "themselves"    "case"         
## [133] "men"           "greater"       "does"          "she"          
## [137] "far"           "want"          "even"          "next"         
## [141] "whose"         "others"        "often"         "whom"         
## [145] "once"          "certain"       "me"            "just"         
## [149] "whole"         "did"           "interest"      "further"      
## [153] "then"          "here"          "through"       "seems"        
## [157] "three"         "become"        "last"          "possible"     
## [161] "always"        "likely"        "take"          "ever"         
## [165] "four"          "best"          "however"       "yet"          
## [169] "done"          "given"         "whether"       "after"        
## [173] "before"        "years"         "having"        "why"          
## [177] "making"        "course"        "myself"        "until"        
## [181] "part"          "high"          "latter"        "says"         
## [185] "known"         "least"         "former"        "per"          
## [189] "seem"          "kind"          "thus"          "wants"        
## [193] "down"          "never"         "second"        "think"        
## [197] "doing"         "alone"         "see"           "like"         
## [201] "made"          "make"          "good"          "small"        
## [205] "already"       "rather"        "according"     "old"          
## [209] "side"          "year"          "five"          "particular"   
## [213] "thing"         "present"       "became"        "about"        
## [217] "perhaps"       "taken"         "am"            "almost"       
## [221] "herself"       "ourselves"     "becomes"       "above"        
## [225] "better"        "certainly"     "know"          "need"         
## [229] "everywhere"    "clearly"       "ask"           "end"          
## [233] "seen"          "gives"         "little"        "place"        
## [237] "eight"         "six"           "order"         "value"        
## [241] "appear"        "kept"          "point"         "places"       
## [245] "wish"          "long"          "re"            "tried"        
## [249] "whatever"      "nine"          "able"          "concerning"   
## [253] "large"         "important"     "neither"       "example"      
## [257] "took"          "keep"          "seven"         "numbers"      
## [261] "say"           "smaller"       "knows"         "probably"     
## [265] "way"           "non"           "give"          "hence"        
## [269] "something"     "somewhere"     "come"          "go"           
## [273] "clear"         "otherwise"     "except"        "generally"    
## [277] "sometimes"     "together"      "felt"          "third"        
## [281] "cause"         "ours"          "contains"      "work"         
## [285] "although"      "find"          "throughout"    "fully"        
## [289] "everything"    "off"           "soon"          "name"         
## [293] "self"          "whenever"      "particularly"  "nearly"       
## [297] "unless"        "interested"    "consequently"  "facts"        
## [301] "immediate"     "instead"       "things"        "member"       
## [305] "towards"       "beyond"        "mean"          "parts"        
## [309] "various"       "asked"         "thought"       "secondly"     
## [313] "fifth"         "consider"      "believe"       "full"         
## [317] "contain"       "points"        "longer"        "greatest"     
## [321] "therein"       "followed"      "indeed"        "serious"      
## [325] "useful"        "around"        "associated"    "came"         
## [329] "ended"         "saw"           "seemed"        "went"         
## [333] "put"           "since"         "turn"          "usually"      
## [337] "actually"      "later"         "open"          "sure"         
## [341] "thoroughly"    "away"          "below"         "merely"       
## [345] "regards"       "used"          "following"     "truly"        
## [349] "afterwards"    "needs"         "really"        "sensible"     
## [353] "behind"        "interesting"   "becoming"      "hardly"       
## [357] "placed"        "room"          "theirs"        "toward"       
## [361] "ways"          "turned"        "anything"      "besides"      
## [365] "along"         "keeps"         "anywhere"      "seriously"    
## [369] "willing"       "going"         "nowhere"       "reasonably"   
## [373] "entirely"      "gets"          "quite"         "especially"   
## [377] "gave"          "v"             "provides"      "seeming"      
## [381] "tends"         "ones"          "using"         "co"           
## [385] "show"          "happens"       "specified"     "none"         
## [389] "etc"           "ex"            "p"             "novel"        
## [393] "thereby"       "exactly"       "formerly"      "yourselves"   
## [397] "allow"         "respectively"  "whence"        "changes"      
## [401] "whereas"       "forth"         "hers"          "higher"       
## [405] "wherever"      "pointed"       "sides"         "accordingly"  
## [409] "elsewhere"     "nevertheless"  "early"         "lately"       
## [413] "near"          "presents"      "trying"        "wanting"      
## [417] "ends"          "considering"   "face"          "somewhat"     
## [421] "unfortunately" "moreover"      "shows"         "young"        
## [425] "e"             "longest"       "look"          "t"            
## [429] "turning"       "allows"        "comes"         "corresponding"
## [433] "hereafter"     "problem"       "smallest"      "brief"        
## [437] "differ"        "highest"       "apart"         "turns"        
## [441] "looking"       "finds"         "indicated"     "try"          
## [445] "back"          "ending"        "yes"           "appreciate"   
## [449] "indicate"      "sent"          "unlikely"      "wanted"       
## [453] "mr"            "orders"        "beforehand"    "getting"      
## [457] "looks"         "whoever"       "saying"        "thence"       
## [461] "showing"       "containing"    "regarding"     "amongst"      
## [465] "seeing"        "twice"         "parted"        "please"       
## [469] "differently"   "tell"          "whither"       "ordered"      
## [473] "aside"         "follows"       "goods"         "thorough"     
## [477] "described"     "gone"          "group"         "et"           
## [481] "wonder"        "inasmuch"      "indicates"     "puts"         
## [485] "opened"        "tries"         "presented"     "specify"      
## [489] "began"         "help"          "thinks"        "viz"          
## [493] "opens"         "works"         "appropriate"   "pointing"     
## [497] "area"          "rooms"         "sees"          "presenting"   
## [501] "needed"        "specifying"    "goes"          "lest"
library(ggdims)
# PCA view of the papers, computed from the stop-word count columns `the`
# through `their`; points are filled by author.
ggplot(fed) +
  aes(dims = dims(the:their)) +
  geom_pca() +
  aes(fill = author)
## Warning: Using `as.character()` on a quosure is deprecated as of rlang 0.3.0. Please use
## `as_label()` or `as_name()` instead.
## This warning is displayed once every 8 hours.

# t-SNE view (perplexity 20) from the stop-word count columns `the` through
# `their`; points are filled by author.
# NOTE(review): t-SNE is stochastic -- presumably a set.seed() beforehand is
# needed for a repeatable layout; confirm against ggdims.
ggplot(fed) +
  aes(dims = dims(the:their)) +
  geom_tsne(perplexity = 20) +
  aes(fill = author)
## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if
## `.name_repair` is omitted as of tibble 2.0.0.
## ℹ Using compatibility `.name_repair`.
## ℹ The deprecated feature was likely used in the ggdims package.
##   Please report the issue to the authors.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# UMAP view from the stop-word count columns `the` through `their`; points
# are filled by author.
# NOTE(review): UMAP is stochastic -- presumably a set.seed() beforehand is
# needed for a repeatable layout; confirm against ggdims.
ggplot(fed) +
  aes(dims = dims(the:their)) +
  geom_umap() +
  aes(fill = author)

# Full word-count table built with quanteda: the papers are tokenized with
# numbers and punctuation removed, turned into a document-feature matrix and
# converted to a data frame; the columns "in", "for" and "or" are dropped.
federalist_word_count <- federalist |>
  quanteda::corpus(docid_field = "name", text_field = "text") |>
  quanteda::tokens(remove_punct = TRUE, remove_numbers = TRUE) |>
  quanteda::dfm() |>
  quanteda::convert(to = "data.frame") |>
  select(-"in", -"for", -"or")

# Peek at the last few column names of the word-count table.
tail(names(federalist_word_count))
## [1] "prodigy"    "completion" "trembling"  "recommence" "hume's"    
## [6] "essays"
library(ggdims)
# PCA view from the word-count columns `the` through `an`.
ggplot(federalist_word_count) +
  aes(dims = dims(the:an)) +
  geom_pca()

# Same plot again, filled by the "<author>-" prefix extracted from doc_id.
last_plot() +
  aes(fill = str_extract(doc_id, ".+-"))

library(ggdims)
# t-SNE view (perplexity 25) from the word-count columns `the` through `an`.
ggplot(federalist_word_count) +
  aes(dims = dims(the:an)) +
  geom_tsne(perplexity = 25)

# Same plot again, filled by the "<author>-" prefix extracted from doc_id.
last_plot() +
  aes(fill = str_extract(doc_id, ".+-"))

library(ggdims)
# UMAP view from the word-count columns `the` through `an`.
ggplot(federalist_word_count) +
  aes(dims = dims(the:an)) +
  geom_umap()

# Same plot again, filled by the "<author>-" prefix extracted from doc_id.
last_plot() +
  aes(fill = str_extract(doc_id, ".+-"))

library(embed)
## Loading required package: recipes
## 
## Attaching package: 'recipes'
## The following object is masked from 'package:stringr':
## 
##     fixed
## The following object is masked from 'package:stats':
## 
##     step
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.4.1 ──
## ✔ broom        1.0.11     ✔ tailor       0.1.0 
## ✔ dials        1.4.2      ✔ tune         2.0.1 
## ✔ infer        1.1.0      ✔ workflows    1.3.0 
## ✔ modeldata    1.5.1      ✔ workflowsets 1.1.1 
## ✔ parsnip      1.4.1      ✔ yardstick    1.3.2 
## ✔ rsample      1.3.2
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
# UMAP of the standardized stop-word counts, built as a recipes pipeline.
# step_umap() takes its default seed from the session's random-number stream,
# so fix that stream first to get the same embedding on every run.
set.seed(1787)
recipe(~., data = fed) |>
  # author and doc_name identify a paper; they are not features.
  update_role(author, doc_name, new_role = "id") |>
  step_normalize(all_predictors()) |>
  step_umap(all_predictors()) |>
  prep() |>
  # bake(new_data = NULL) returns the processed training data; it is the
  # current replacement for the superseded juice().
  bake(new_data = NULL) |>
  ggplot() +
  aes(x = UMAP1, y = UMAP2, color = author) +
  geom_point()

# PCA of the standardized stop-word counts, built as a recipes pipeline;
# the first two components are plotted and coloured by author.
recipe(~., data = fed) |>
  # author and doc_name identify a paper; they are not features.
  update_role(author, doc_name, new_role = "id") |>
  step_normalize(all_predictors()) |>
  step_pca(all_predictors()) |>
  prep() |>
  # bake(new_data = NULL) returns the processed training data; it is the
  # current replacement for the superseded juice().
  bake(new_data = NULL) |>
  ggplot() +
  aes(x = PC1, y = PC2, color = author) +
  geom_point()

Closing remarks, Other Relevant Work, Caveats