Intro Thoughts

Status Quo

library(tidyverse)

mvars <- function(...) {
  
  varnames <- as.character(ensyms(...))
  vars <- list(...)
  listvec <- asplit(do.call(cbind, vars), 1)
  structure(listvec, varnames = varnames)

  }

vars_unpack <- function(x) {
  pca_vars <- x
  df <- do.call(rbind, pca_vars)
  colnames(df) <- attr(pca_vars, "varnames")
  as.data.frame(df)
  
}



compute_tsne2 <- function(data, scales, perplexity = 20){
  
  set.seed(1345)
  
# data  
  
# identify duplicates just based on tsne data
data |>
  select(dims) |>
  mutate(vars_unpack(dims)) |>
  select(-dims) ->
data_unpacked ; data_unpacked

names_predictors <- names(data_unpacked); names_predictors

data_unpacked |>
   duplicated() ->
dups ; dups
# #
# # #
data_unpacked |>
    bind_cols(data) |>
     _[!dups,] |> 
  remove_missing() ->
clean_data ; clean_data
# # # 
clean_data |>
  _[names_predictors] |>
  as.matrix() |>
  Rtsne::Rtsne(perplexity = perplexity) |>
  _$Y |>
  as_tibble() |>
 rename(x = V1, y = V2) |>
 bind_cols(clean_data)
#   

}


iris |> 
  mutate(dims = mvars(Sepal.Length, Sepal.Width, 
                Petal.Length, Petal.Width)) |>
  select(dims) |>
  compute_tsne2()
## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if
## `.name_repair` is omitted as of tibble 2.0.0.
## ℹ Using compatibility `.name_repair`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## # A tibble: 149 × 7
##         x     y Sepal.Length Sepal.Width Petal.Length Petal.Width dims      
##     <dbl> <dbl>        <dbl>       <dbl>        <dbl>       <dbl> <list[1d]>
##  1 -10.3  -19.3          5.1         3.5          1.4         0.2 <dbl [4]> 
##  2 -11.2  -15.4          4.9         3            1.4         0.2 <dbl [4]> 
##  3  -9.61 -14.7          4.7         3.2          1.3         0.2 <dbl [4]> 
##  4 -10.1  -14.5          4.6         3.1          1.5         0.2 <dbl [4]> 
##  5  -9.57 -19.5          5           3.6          1.4         0.2 <dbl [4]> 
##  6  -9.59 -22.1          5.4         3.9          1.7         0.4 <dbl [4]> 
##  7  -8.80 -14.9          4.6         3.4          1.4         0.3 <dbl [4]> 
##  8  -9.98 -18.2          5           3.4          1.5         0.2 <dbl [4]> 
##  9 -10.4  -13.4          4.4         2.9          1.4         0.2 <dbl [4]> 
## 10 -10.8  -15.9          4.9         3.1          1.5         0.1 <dbl [4]> 
## # ℹ 139 more rows
StatTsne2 <- ggproto("StatTsne2", Stat, 
                     compute_panel = compute_tsne2)

GeomPointFill <- ggproto("GeomPointFill", 
                         GeomPoint,
                         default_aes = 
                           modifyList(GeomPoint$default_aes, 
                                      aes(shape = 21, 
                                          color = from_theme(paper),
                                          size = from_theme(pointsize * 2.5),
                                          alpha = .7,
                                          fill = from_theme(ink))))

geom_tsne <- make_constructor(GeomPointFill, stat = StatTsne2, perplexity = 30)
compute_umap <- function(data, scales, n_components = 2, random_state = 15){
  
set.seed(1345)
  
# identify duplicates just based on tsne data
data |>
  select(dims) |>
  mutate(vars_unpack(dims)) |>
  select(-dims) ->
data_unpacked ; data_unpacked

names_predictors <- names(data_unpacked); names_predictors

data_unpacked |>
    bind_cols(data) |>
  remove_missing() ->
clean_data ; clean_data

# # # 
clean_data |>
  _[names_predictors] |>
  umap::umap(n_components = n_components, random_state = random_state)  |>
  _$layout |>
  as_tibble() |>
 rename(x = V1, y = V2) |>
 bind_cols(clean_data)
#   

}

iris |> 
  mutate(dims = 
        mvars(Sepal.Length, Sepal.Width, 
                Petal.Length, Petal.Width)) |>
  select(color = Species, dims) |>
  compute_umap()
## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if
## `.name_repair` is omitted as of tibble 2.0.0.
## ℹ Using compatibility `.name_repair`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## # A tibble: 150 × 8
##        x     y Sepal.Length Sepal.Width Petal.Length Petal.Width color  dims    
##    <dbl> <dbl>        <dbl>       <dbl>        <dbl>       <dbl> <fct>  <list[1>
##  1  15.7 -4.39          5.1         3.5          1.4         0.2 setosa <dbl[…]>
##  2  13.7 -4.54          4.9         3            1.4         0.2 setosa <dbl[…]>
##  3  14.1 -5.12          4.7         3.2          1.3         0.2 setosa <dbl[…]>
##  4  13.8 -5.22          4.6         3.1          1.5         0.2 setosa <dbl[…]>
##  5  15.4 -4.27          5           3.6          1.4         0.2 setosa <dbl[…]>
##  6  15.9 -3.28          5.4         3.9          1.7         0.4 setosa <dbl[…]>
##  7  14.2 -5.35          4.6         3.4          1.4         0.3 setosa <dbl[…]>
##  8  15.3 -4.62          5           3.4          1.5         0.2 setosa <dbl[…]>
##  9  13.5 -5.33          4.4         2.9          1.4         0.2 setosa <dbl[…]>
## 10  13.8 -4.81          4.9         3.1          1.5         0.1 setosa <dbl[…]>
## # ℹ 140 more rows
StatUmap <- ggproto("StatUmap", Stat, 
                     compute_panel = compute_umap)

GeomPointFill <- ggproto("GeomPointFill", 
                         GeomPoint,
                         default_aes = 
                           modifyList(GeomPoint$default_aes, 
                                      aes(shape = 21, 
                                          color = from_theme(paper),
                                          size = from_theme(pointsize * 2.5),
                                          alpha = .7,
                                          fill = from_theme(ink))))

geom_umap <- make_constructor(GeomPointFill, stat = StatUmap, random_state = 15, n_components = 4)


iris |> 
  ggplot() + 
  aes(dims = 
        mvars(Sepal.Length, Sepal.Width, 
                Petal.Length, Petal.Width),
      fill = Species) + 
  geom_umap()

iris |> 
  ggplot() + 
  aes(dims = 
        mvars(Sepal.Length, Sepal.Width, 
                Petal.Length, Petal.Width),
      fill = Species) +
  geom_tsne()

palmerpenguins::penguins |> 
  ggplot() + 
  aes(dims = 
        mvars(bill_length_mm, bill_depth_mm, 
              flipper_length_mm, body_mass_g),
      fill = species,
      alpha = sex) +
  geom_tsne(perplexity = 30)
## Warning: Using alpha for a discrete variable is not advised.
## Warning: Removed 10 rows containing missing values or values outside the scale
## range.

palmerpenguins::penguins |> 
  ggplot() + 
  aes(dims = 
        mvars(bill_length_mm, bill_depth_mm, 
              flipper_length_mm, body_mass_g),
      fill = species,
      alpha = sex) +
  geom_umap()
## Warning: Using alpha for a discrete variable is not advised.
## Warning: Removed 11 rows containing missing values or values outside the scale
## range.

palmerpenguins::penguins |> 
  ggplot() + 
  aes(dims = 
        mvars(bill_length_mm, bill_depth_mm, 
              flipper_length_mm, body_mass_g),
      fill = species,
      alpha = sex) +
  geom_umap()
## Warning: Using alpha for a discrete variable is not advised.
## Removed 11 rows containing missing values or values outside the scale range.

unvotes::un_votes |> 
  mutate(num_vote = case_when(vote == "yes" ~ 1,
                              vote == "abstain" ~ .5,
                              vote == "no" ~ 0,
                              TRUE ~ 0 )) |>
  # filter(rcid %in% 1:30) |>
  pivot_wider(id_cols = c(country, country_code),
    names_from = rcid, 
              values_from = num_vote,
              values_fill = .5
            ) |>
  mutate(continent = country_code |> 
           countrycode::countrycode(origin = "iso2c", destination = "continent")) |>
  mutate(continent = continent |> is.na() |> ifelse("unknown", continent)) ->
un_ga_country_wide_rcid
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `continent = countrycode::countrycode(country_code, origin =
##   "iso2c", destination = "continent")`.
## Caused by warning:
## ! Some values were not matched unambiguously: CS, DD, YD, YU
unvotes::un_votes$rcid |> max()
## [1] 9147
un_ga_country_wide_rcid |>
  ggplot() + 
  ggplyr::aes_dims(`3`:`9147`) + 
  geom_umap() +
  aes(fill = continent)

last_plot() + 
  geom_text(stat = StatUmap, check_overlap = T, aes(fill = NULL)) +
  aes(label = country)

# layer_data(i = 2)
library(ggplyr)
snapshot <- ggplyr::intercept
un_ga_country_wide_rcid |>
  ggplot() + 
  aes_dims(`9001`:`9147`) + 
  geom_tsne(perplexity = 5) + snapshot("p1") +
  aes(fill = continent) + snapshot("p2") + 
  geom_text(stat = StatTsne2, 
            check_overlap = T, 
            aes(fill = NULL, label = country), # do this in the Stat?
            perplexity = 5) + snapshot("p3") -> hide
## p1
## p2
## p3
library(patchwork)
p1 / p2 / p3 + plot_annotation(title = "TSNE dimention reduction", tag_levels = 1,
                               subtitle = 
                               "    1. Use TSNE to reduce many dimensions to two dimensions,   \n    2. fill by continent, \n    3. label some countries")
## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if
## `.name_repair` is omitted as of tibble 2.0.0.
## ℹ Using compatibility `.name_repair`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Closing remarks, Other Relevant Work, Caveats