https://stackoverflow.com/questions/44837536/how-to-use-ggplot-to-plot-t-sne-clustering

https://satijalab.org/seurat/articles/seurat5_sketch_analysis

Intro Thoughts

Status Quo

library(tidyverse)

vars_pack <- function(...) {
  
  varnames <- as.character(ensyms(...))
  vars <- list(...)
  listvec <- asplit(do.call(cbind, vars), 1)
  structure(listvec, varnames = varnames)

  }

vars_unpack <- function(x) {
  pca_vars <- x
  df <- do.call(rbind, pca_vars)
  colnames(df) <- attr(pca_vars, "varnames")
  as.data.frame(df)
  
}


palmerpenguins::penguins %>% 
  mutate(outcome = species, 
         vars = vars_pack(bill_length_mm, species, sex)) %>% 
  select(outcome, vars) ->
data

data |> head()
## # A tibble: 6 × 2
##   outcome vars      
##   <fct>   <list[1d]>
## 1 Adelie  <dbl [3]> 
## 2 Adelie  <dbl [3]> 
## 3 Adelie  <dbl [3]> 
## 4 Adelie  <dbl [3]> 
## 5 Adelie  <dbl [3]> 
## 6 Adelie  <dbl [3]>
head(data$vars)
## [[1]]
## [1] 39.1  1.0  2.0
## 
## [[2]]
## [1] 39.5  1.0  1.0
## 
## [[3]]
## [1] 40.3  1.0  1.0
## 
## [[4]]
## [1] NA  1 NA
## 
## [[5]]
## [1] 36.7  1.0  1.0
## 
## [[6]]
## [1] 39.3  1.0  2.0
data %>%
  mutate(vars_unpack(vars)) %>% 
  select(-vars) ->
data

data |> head()
## # A tibble: 6 × 4
##   outcome bill_length_mm species   sex
##   <fct>            <dbl>   <dbl> <dbl>
## 1 Adelie            39.1       1     2
## 2 Adelie            39.5       1     1
## 3 Adelie            40.3       1     1
## 4 Adelie            NA         1    NA
## 5 Adelie            36.7       1     1
## 6 Adelie            39.3       1     2
compute_tsne2 <- function(data, scales, perplexity = 30){
  
  set.seed(1345)
  
data  
  
# identify duplicates just based on tsne data
data |>
  select(tsne_vars) |>
  mutate(vars_unpack(tsne_vars)) |>
  select(-tsne_vars) ->
data_unpacked ; data_unpacked

names_predictors <- names(data_unpacked); names_predictors

data_unpacked |>
   duplicated() ->
dups ; dups
# #
# # #
data_unpacked |>
    bind_cols(data) |>
     _[!dups,] |> 
  remove_missing() ->
clean_data ; clean_data
# # # 
clean_data |>
  _[names_predictors] |>
  as.matrix() |>
  Rtsne::Rtsne(perplexity = perplexity) |>
  _$Y |>
  as_tibble() |>
 rename(x = V1, y = V2) |>
 bind_cols(clean_data)
#   

}


iris |> 
  mutate(tsne_vars = vars_pack(Sepal.Length, Sepal.Width, 
                Petal.Length, Petal.Width)) |>
  select(tsne_vars) |>
  compute_tsne2()
## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if
## `.name_repair` is omitted as of tibble 2.0.0.
## ℹ Using compatibility `.name_repair`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## # A tibble: 149 × 7
##        x     y Sepal.Length Sepal.Width Petal.Length Petal.Width tsne_vars 
##    <dbl> <dbl>        <dbl>       <dbl>        <dbl>       <dbl> <list[1d]>
##  1  16.6 -15.0          5.1         3.5          1.4         0.2 <dbl [4]> 
##  2  16.6 -12.7          4.9         3            1.4         0.2 <dbl [4]> 
##  3  15.5 -12.7          4.7         3.2          1.3         0.2 <dbl [4]> 
##  4  15.6 -12.5          4.6         3.1          1.5         0.2 <dbl [4]> 
##  5  16.2 -15.3          5           3.6          1.4         0.2 <dbl [4]> 
##  6  16.8 -17.0          5.4         3.9          1.7         0.4 <dbl [4]> 
##  7  15.0 -13.1          4.6         3.4          1.4         0.3 <dbl [4]> 
##  8  16.2 -14.5          5           3.4          1.5         0.2 <dbl [4]> 
##  9  15.8 -11.7          4.4         2.9          1.4         0.2 <dbl [4]> 
## 10  16.4 -13.2          4.9         3.1          1.5         0.1 <dbl [4]> 
## # ℹ 139 more rows
StatTsne2 <- ggproto("StatTsne2", Stat, 
                     compute_panel = compute_tsne2)

GeomPointFill <- ggproto("GeomPointFill", 
                         GeomPoint,
                         default_aes = 
                           modifyList(GeomPoint$default_aes, 
                                      aes(shape = 21, 
                                          color = from_theme(paper),
                                          size = from_theme(pointsize * 2.5),
                                          alpha = .7,
                                          fill = from_theme(ink))))

geom_tsne <- make_constructor(GeomPointFill, stat = StatTsne2)


iris |> 
  ggplot() + 
  aes(tsne_vars = 
        vars_pack(Sepal.Length, Sepal.Width, 
                Petal.Length, Petal.Width),
      fill = Species) + 
  geom_tsne(perplexity = 2)

# not sure what is going on with last_plot_wipe. 
# last_plot_wipe() + 
#   geom_tsne(perplexity = 5)

iris |> 
  ggplot() + 
  aes(tsne_vars = 
        vars_pack(Sepal.Length, Sepal.Width, 
                Petal.Length, Petal.Width),
      fill = Species) + 
  geom_tsne(perplexity = 5)

iris |> 
  ggplot() + 
  aes(tsne_vars = 
        vars_pack(Sepal.Length, Sepal.Width, 
                Petal.Length, Petal.Width),
      fill = Species) + 
  geom_tsne(perplexity = 12)

iris |> 
  ggplot() + 
  aes(tsne_vars = 
        vars_pack(Sepal.Length, Sepal.Width, 
                Petal.Length, Petal.Width),
      fill = Species) + 
  geom_tsne(perplexity = 20)

iris |> 
  ggplot() + 
  aes(tsne_vars = 
        vars_pack(Sepal.Length, Sepal.Width, 
                Petal.Length, Petal.Width),
      fill = Species) + 
  geom_tsne(perplexity = 30)

## ! perplexity is too large for the number of samples
# ggwipe::last_plot_wipe() + 
#  geom_tsne(perplexity = 50)

# to great for sample
# ggwipe::last_plot_wipe() + 
#   geom_tsne(perplexity = 100)

aes_tsne <- function(...){aes(vars = vars_pack(vars(...)))}


palmerpenguins::penguins |>
   ggplot() + 
   aes(tsne_vars = vars_pack(bill_length_mm, bill_depth_mm, 
                        flipper_length_mm)) + 
   geom_tsne() 
## Warning: Removed 1 row containing missing values or values outside the scale
## range.

last_plot() + 
   aes(fill = species)
## Warning: Removed 1 row containing missing values or values outside the scale
## range.

last_plot() + 
  aes(shape = sex)
## Warning: Removed 10 rows containing missing values or values outside the scale
## range.

last_plot() + 
  facet_wrap(~island)
## Warning: Removed 5 rows containing missing values or values outside the scale
## range.
## Warning: Removed 1 row containing missing values or values outside the scale
## range.
## Warning: Removed 5 rows containing missing values or values outside the scale
## range.
## Warning: Computation failed in `stat_tsne2()`.
## Caused by error in `.check_tsne_params()`:
## ! perplexity is too large for the number of samples


Experiment

library(tidyverse)
library(Rtsne)
iris_unique <- unique(iris) # Remove duplicates
iris_matrix <- as.matrix(iris_unique[,1:4])
set.seed(42) # Set a seed if you want reproducible results
tsne_out <- Rtsne(iris_matrix)


library(ggplot2)
tsne_plot <- data.frame(x = tsne_out$Y[,1], 
                        y = tsne_out$Y[,2], 
                        col = iris_unique$Species)

ggplot(tsne_plot) + 
  geom_point(aes(x=x, y=y, color=col))

compute_tsne <- function(data, scales, vars){
  
# identify duplicates just based on tsne data
data |>
  select(all_of(vars)) |>
  duplicated() ->
dups
   
#  
data |>
    _[!dups,] |>
  remove_missing() ->
clean_data

clean_data |>
  select(all_of(vars)) |>
  as.matrix() |>
  Rtsne() |> 
  _$Y |> 
  as_tibble() |> 
  rename(x = V1, y = V2) |>
  bind_cols(clean_data)
  
}

StatTsne <- ggproto("StatTsne", Stat, 
                    compute_panel = compute_tsne)

geom_tsne <- make_constructor(GeomPoint, stat = StatTsne)



library(ggplyr)
iris |>
  ggplot() +
  aes_from_data() + 
  geom_tsne(vars = 
              c("Sepal.Length", "Sepal.Width", 
                "Petal.Length", "Petal.Width")) + 
  aes(color = Species)

library(tidyverse)
library(quanteda)
## Warning: package 'quanteda' was built under R version 4.4.1
## Package version: 4.3.1
## Unicode version: 14.0
## ICU version: 71.1
## Parallel computing: disabled
## See https://quanteda.io for tutorials and examples.
# ggplot2.extension.scrapers::cran_gg_w_ggplot2_depends_or_imports |>
#   select(package, description) |>
#   filter(row_number() == 1, .by = package) |>
#   tidytext::unnest_ngrams(word, description, n = 1) |> 
#   mutate(exists = 1) |>
#   pivot_wider(names_from = word, 
#               values_from = exists)



ggplot2.extension.scrapers::cran_gg_w_ggplot2_depends_or_imports |> 
  select(package, description) |>
  mutate(doc_id = package) |>  
  select(doc_id, description) |>  
  corpus(text_field = "description", docid_field = "doc_id") |>  
  tokens(remove_punct = TRUE, remove_numbers = TRUE) |>  
  tokens_tolower() |>  
  tokens_remove(stopwords("en")) |>
  dfm() |>  # document-feature matrix
  convert(to = "data.frame") |>
  select(-doc_id) |>
  Rtsne::Rtsne(perplexity = 25) |>
  _$Y |>
  as_data_frame() |>
  bind_cols(ggplot2.extension.scrapers::cran_gg_w_ggplot2_depends_or_imports["package"]) |>
  ggplot() +
  aes(V1, V2, label = package) +
  geom_point(color = "grey") +
  geom_text(check_overlap = T, size = 3)
## Warning: `as_data_frame()` was deprecated in tibble 2.0.0.
## ℹ Please use `as_tibble()` (with slightly different semantics) to convert to a
##   tibble, or `as.data.frame()` to convert to a data frame.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

library(tidyverse)
library(quanteda)
set.seed(12456)
tools::CRAN_package_db() |> 
  janitor::clean_names() |>
  select(package, description) |>
  sample_frac(.04) |>
  mutate(doc_id = package) |>  
  mutate(row_number = row_number(), .by = package) |>
  filter(row_number == 1) ->
doc_text_df
  
doc_text_df |>  
  select(doc_id, description) |>  
  corpus(text_field = "description", docid_field = "doc_id") |>  
  tokens(remove_punct = TRUE, remove_numbers = TRUE) |>  
  tokens_tolower() |>  
  tokens_remove(stopwords("en")) |>
  dfm() |>  # document-feature matrix
  convert(to = "data.frame") |>
  select(-doc_id) |>
  Rtsne::Rtsne(perplexity = 25) |>
  _$Y |>
  as_data_frame() |>
  bind_cols(doc_text_df |> select(package)) |>
  ggplot() +
  aes(V1, V2, label = package) +
  geom_point() +
  aes(color = package |> str_detect("gg|GG|geom"))

  # geom_text(check_overlap = T, size = 3)

Closing remarks, Other Relevant Work, Caveats