Status Quo
library(tidyverse)
mvars <- function(...) {
varnames <- as.character(ensyms(...))
vars <- list(...)
listvec <- asplit(do.call(cbind, vars), 1)
structure(listvec, varnames = varnames)
}
vars_unpack <- function(x) {
pca_vars <- x
df <- do.call(rbind, pca_vars)
colnames(df) <- attr(pca_vars, "varnames")
as.data.frame(df)
}
compute_tsne2 <- function(data, scales, perplexity = 20){
set.seed(1345)
# data
# identify duplicates just based on tsne data
data |>
select(dims) |>
mutate(vars_unpack(dims)) |>
select(-dims) ->
data_unpacked ; data_unpacked
names_predictors <- names(data_unpacked); names_predictors
data_unpacked |>
duplicated() ->
dups ; dups
# #
# # #
data_unpacked |>
bind_cols(data) |>
_[!dups,] |>
remove_missing() ->
clean_data ; clean_data
# # #
clean_data |>
_[names_predictors] |>
as.matrix() |>
Rtsne::Rtsne(perplexity = perplexity) |>
_$Y |>
as_tibble() |>
rename(x = V1, y = V2) |>
bind_cols(clean_data)
#
}
iris |>
mutate(dims = mvars(Sepal.Length, Sepal.Width,
Petal.Length, Petal.Width)) |>
select(dims) |>
compute_tsne2()
## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if
## `.name_repair` is omitted as of tibble 2.0.0.
## ℹ Using compatibility `.name_repair`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## # A tibble: 149 × 7
## x y Sepal.Length Sepal.Width Petal.Length Petal.Width dims
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <list[1d]>
## 1 -10.3 -19.3 5.1 3.5 1.4 0.2 <dbl [4]>
## 2 -11.2 -15.4 4.9 3 1.4 0.2 <dbl [4]>
## 3 -9.61 -14.7 4.7 3.2 1.3 0.2 <dbl [4]>
## 4 -10.1 -14.5 4.6 3.1 1.5 0.2 <dbl [4]>
## 5 -9.57 -19.5 5 3.6 1.4 0.2 <dbl [4]>
## 6 -9.59 -22.1 5.4 3.9 1.7 0.4 <dbl [4]>
## 7 -8.80 -14.9 4.6 3.4 1.4 0.3 <dbl [4]>
## 8 -9.98 -18.2 5 3.4 1.5 0.2 <dbl [4]>
## 9 -10.4 -13.4 4.4 2.9 1.4 0.2 <dbl [4]>
## 10 -10.8 -15.9 4.9 3.1 1.5 0.1 <dbl [4]>
## # ℹ 139 more rows
StatTsne2 <- ggproto("StatTsne2", Stat,
compute_panel = compute_tsne2)
GeomPointFill <- ggproto("GeomPointFill",
GeomPoint,
default_aes =
modifyList(GeomPoint$default_aes,
aes(shape = 21,
color = from_theme(paper),
size = from_theme(pointsize * 2.5),
alpha = .7,
fill = from_theme(ink))))
geom_tsne <- make_constructor(GeomPointFill, stat = StatTsne2, perplexity = 30)
compute_umap <- function(data, scales, n_components = 2, random_state = 15){
set.seed(1345)
# identify duplicates just based on tsne data
data |>
select(dims) |>
mutate(vars_unpack(dims)) |>
select(-dims) ->
data_unpacked ; data_unpacked
names_predictors <- names(data_unpacked); names_predictors
data_unpacked |>
bind_cols(data) |>
remove_missing() ->
clean_data ; clean_data
# # #
clean_data |>
_[names_predictors] |>
umap::umap(n_components = n_components, random_state = random_state) |>
_$layout |>
as_tibble() |>
rename(x = V1, y = V2) |>
bind_cols(clean_data)
#
}
iris |>
mutate(dims =
mvars(Sepal.Length, Sepal.Width,
Petal.Length, Petal.Width)) |>
select(color = Species, dims) |>
compute_umap()
## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if
## `.name_repair` is omitted as of tibble 2.0.0.
## ℹ Using compatibility `.name_repair`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## # A tibble: 150 × 8
## x y Sepal.Length Sepal.Width Petal.Length Petal.Width color dims
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <list[1>
## 1 15.7 -4.39 5.1 3.5 1.4 0.2 setosa <dbl[…]>
## 2 13.7 -4.54 4.9 3 1.4 0.2 setosa <dbl[…]>
## 3 14.1 -5.12 4.7 3.2 1.3 0.2 setosa <dbl[…]>
## 4 13.8 -5.22 4.6 3.1 1.5 0.2 setosa <dbl[…]>
## 5 15.4 -4.27 5 3.6 1.4 0.2 setosa <dbl[…]>
## 6 15.9 -3.28 5.4 3.9 1.7 0.4 setosa <dbl[…]>
## 7 14.2 -5.35 4.6 3.4 1.4 0.3 setosa <dbl[…]>
## 8 15.3 -4.62 5 3.4 1.5 0.2 setosa <dbl[…]>
## 9 13.5 -5.33 4.4 2.9 1.4 0.2 setosa <dbl[…]>
## 10 13.8 -4.81 4.9 3.1 1.5 0.1 setosa <dbl[…]>
## # ℹ 140 more rows
StatUmap <- ggproto("StatUmap", Stat,
compute_panel = compute_umap)
GeomPointFill <- ggproto("GeomPointFill",
GeomPoint,
default_aes =
modifyList(GeomPoint$default_aes,
aes(shape = 21,
color = from_theme(paper),
size = from_theme(pointsize * 2.5),
alpha = .7,
fill = from_theme(ink))))
geom_umap <- make_constructor(GeomPointFill, stat = StatUmap, random_state = 15, n_components = 4)
iris |>
ggplot() +
aes(dims =
mvars(Sepal.Length, Sepal.Width,
Petal.Length, Petal.Width),
fill = Species) +
geom_umap()

iris |>
ggplot() +
aes(dims =
mvars(Sepal.Length, Sepal.Width,
Petal.Length, Petal.Width),
fill = Species) +
geom_tsne()

palmerpenguins::penguins |>
remove_missing() |>
ggplot() +
aes(dims =
mvars(bill_length_mm, bill_depth_mm,
flipper_length_mm, body_mass_g),
# fill = species,
# alpha = sex
) +
geom_tsne(perplexity = 30)
## Warning: Removed 11 rows containing missing values or values outside the scale
## range.

palmerpenguins::penguins |>
ggplot() +
aes(dims =
mvars(bill_length_mm, bill_depth_mm,
flipper_length_mm, body_mass_g),
fill = species,
alpha = sex) +
geom_umap()
## Warning: Using alpha for a discrete variable is not advised.
## Warning: Removed 11 rows containing missing values or values outside the scale
## range.

palmerpenguins::penguins |>
ggplot() +
aes(dims =
mvars(bill_length_mm, bill_depth_mm,
flipper_length_mm, body_mass_g),
fill = species,
alpha = sex) +
geom_umap()
## Warning: Using alpha for a discrete variable is not advised.
## Removed 11 rows containing missing values or values outside the scale range.

unvotes::un_votes |>
mutate(num_vote = case_when(vote == "yes" ~ 1,
vote == "abstain" ~ .5,
vote == "no" ~ 0,
TRUE ~ 0 )) |>
# filter(rcid %in% 1:30) |>
pivot_wider(id_cols = c(country, country_code),
names_from = rcid,
values_from = num_vote,
values_fill = .5
) |>
mutate(continent = country_code |>
countrycode::countrycode(origin = "iso2c", destination = "continent")) |>
mutate(continent = continent |> is.na() |> ifelse("unknown", continent)) ->
un_ga_country_wide_rcid
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `continent = countrycode::countrycode(country_code, origin =
## "iso2c", destination = "continent")`.
## Caused by warning:
## ! Some values were not matched unambiguously: CS, DD, YD, YU
unvotes::un_votes$rcid |> max()
## [1] 9147
un_ga_country_wide_rcid |>
ggplot() +
ggplyr::aes_dims(`3`:`9147`) +
geom_umap() +
aes(fill = continent)

last_plot() +
geom_text(stat = StatUmap, check_overlap = T, aes(fill = NULL)) +
aes(label = country)

# layer_data(i = 2)
ggplyr::aes_dims
## function (vars)
## {
## structure(list(data_spec = data, vars_spec = rlang::enquo(vars)),
## class = "aes_dims")
## }
## <bytecode: 0x7fb49bf7aaf0>
## <environment: namespace:ggplyr>
library(ggplyr)
snapshot <- ggplyr::intercept
un_ga_country_wide_rcid |>
ggplot() +
aes_dims(`9001`:`9147`) +
geom_tsne(perplexity = 5) + snapshot("p1") +
aes(fill = continent) + snapshot("p2") +
geom_text(stat = StatTsne2,
check_overlap = T,
aes(fill = NULL, label = country), # do this in the Stat?
perplexity = 5) + snapshot("p3") -> hide
## p1
## p2
## p3
library(patchwork)
p1 / p2 / p3 + plot_annotation(title = "TSNE dimention reduction", tag_levels = 1,
subtitle =
" 1. Use TSNE to reduce many dimensions to two dimensions, \n 2. fill by continent, \n 3. label some countries")
## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if
## `.name_repair` is omitted as of tibble 2.0.0.
## ℹ Using compatibility `.name_repair`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
