https://stackoverflow.com/questions/44837536/how-to-use-ggplot-to-plot-t-sne-clustering
https://satijalab.org/seurat/articles/seurat5_sketch_analysis
library(tidyverse)
vars_pack <- function(...) {
varnames <- as.character(ensyms(...))
vars <- list(...)
listvec <- asplit(do.call(cbind, vars), 1)
structure(listvec, varnames = varnames)
}
vars_unpack <- function(x) {
pca_vars <- x
df <- do.call(rbind, pca_vars)
colnames(df) <- attr(pca_vars, "varnames")
as.data.frame(df)
}
palmerpenguins::penguins %>%
mutate(outcome = species,
vars = vars_pack(bill_length_mm, species, sex)) %>%
select(outcome, vars) ->
data
data |> head()
## # A tibble: 6 × 2
## outcome vars
## <fct> <list[1d]>
## 1 Adelie <dbl [3]>
## 2 Adelie <dbl [3]>
## 3 Adelie <dbl [3]>
## 4 Adelie <dbl [3]>
## 5 Adelie <dbl [3]>
## 6 Adelie <dbl [3]>
head(data$vars)
## [[1]]
## [1] 39.1 1.0 2.0
##
## [[2]]
## [1] 39.5 1.0 1.0
##
## [[3]]
## [1] 40.3 1.0 1.0
##
## [[4]]
## [1] NA 1 NA
##
## [[5]]
## [1] 36.7 1.0 1.0
##
## [[6]]
## [1] 39.3 1.0 2.0
data %>%
mutate(vars_unpack(vars)) %>%
select(-vars) ->
data
data |> head()
## # A tibble: 6 × 4
## outcome bill_length_mm species sex
## <fct> <dbl> <dbl> <dbl>
## 1 Adelie 39.1 1 2
## 2 Adelie 39.5 1 1
## 3 Adelie 40.3 1 1
## 4 Adelie NA 1 NA
## 5 Adelie 36.7 1 1
## 6 Adelie 39.3 1 2
compute_tsne2 <- function(data, scales, perplexity = 30){
set.seed(1345)
data
# identify duplicates just based on tsne data
data |>
select(tsne_vars) |>
mutate(vars_unpack(tsne_vars)) |>
select(-tsne_vars) ->
data_unpacked ; data_unpacked
names_predictors <- names(data_unpacked); names_predictors
data_unpacked |>
duplicated() ->
dups ; dups
# #
# # #
data_unpacked |>
bind_cols(data) |>
_[!dups,] |>
remove_missing() ->
clean_data ; clean_data
# # #
clean_data |>
_[names_predictors] |>
as.matrix() |>
Rtsne::Rtsne(perplexity = perplexity) |>
_$Y |>
as_tibble() |>
rename(x = V1, y = V2) |>
bind_cols(clean_data)
#
}
iris |>
mutate(tsne_vars = vars_pack(Sepal.Length, Sepal.Width,
Petal.Length, Petal.Width)) |>
select(tsne_vars) |>
compute_tsne2()
## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if
## `.name_repair` is omitted as of tibble 2.0.0.
## ℹ Using compatibility `.name_repair`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## # A tibble: 149 × 7
## x y Sepal.Length Sepal.Width Petal.Length Petal.Width tsne_vars
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <list[1d]>
## 1 16.6 -15.0 5.1 3.5 1.4 0.2 <dbl [4]>
## 2 16.6 -12.7 4.9 3 1.4 0.2 <dbl [4]>
## 3 15.5 -12.7 4.7 3.2 1.3 0.2 <dbl [4]>
## 4 15.6 -12.5 4.6 3.1 1.5 0.2 <dbl [4]>
## 5 16.2 -15.3 5 3.6 1.4 0.2 <dbl [4]>
## 6 16.8 -17.0 5.4 3.9 1.7 0.4 <dbl [4]>
## 7 15.0 -13.1 4.6 3.4 1.4 0.3 <dbl [4]>
## 8 16.2 -14.5 5 3.4 1.5 0.2 <dbl [4]>
## 9 15.8 -11.7 4.4 2.9 1.4 0.2 <dbl [4]>
## 10 16.4 -13.2 4.9 3.1 1.5 0.1 <dbl [4]>
## # ℹ 139 more rows
StatTsne2 <- ggproto("StatTsne2", Stat,
compute_panel = compute_tsne2)
GeomPointFill <- ggproto("GeomPointFill",
GeomPoint,
default_aes =
modifyList(GeomPoint$default_aes,
aes(shape = 21,
color = from_theme(paper),
size = from_theme(pointsize * 2.5),
alpha = .7,
fill = from_theme(ink))))
geom_tsne <- make_constructor(GeomPointFill, stat = StatTsne2)
iris |>
ggplot() +
aes(tsne_vars =
vars_pack(Sepal.Length, Sepal.Width,
Petal.Length, Petal.Width),
fill = Species) +
geom_tsne(perplexity = 2)
# not sure what is going on with last_plot_wipe.
# last_plot_wipe() +
# geom_tsne(perplexity = 5)
iris |>
ggplot() +
aes(tsne_vars =
vars_pack(Sepal.Length, Sepal.Width,
Petal.Length, Petal.Width),
fill = Species) +
geom_tsne(perplexity = 5)
iris |>
ggplot() +
aes(tsne_vars =
vars_pack(Sepal.Length, Sepal.Width,
Petal.Length, Petal.Width),
fill = Species) +
geom_tsne(perplexity = 12)
iris |>
ggplot() +
aes(tsne_vars =
vars_pack(Sepal.Length, Sepal.Width,
Petal.Length, Petal.Width),
fill = Species) +
geom_tsne(perplexity = 20)
iris |>
ggplot() +
aes(tsne_vars =
vars_pack(Sepal.Length, Sepal.Width,
Petal.Length, Petal.Width),
fill = Species) +
geom_tsne(perplexity = 30)
## ! perplexity is too large for the number of samples
# ggwipe::last_plot_wipe() +
# geom_tsne(perplexity = 50)
# to great for sample
# ggwipe::last_plot_wipe() +
# geom_tsne(perplexity = 100)
aes_tsne <- function(...){aes(vars = vars_pack(vars(...)))}
palmerpenguins::penguins |>
ggplot() +
aes(tsne_vars = vars_pack(bill_length_mm, bill_depth_mm,
flipper_length_mm)) +
geom_tsne()
## Warning: Removed 1 row containing missing values or values outside the scale
## range.
last_plot() +
aes(fill = species)
## Warning: Removed 1 row containing missing values or values outside the scale
## range.
last_plot() +
aes(shape = sex)
## Warning: Removed 10 rows containing missing values or values outside the scale
## range.
last_plot() +
facet_wrap(~island)
## Warning: Removed 5 rows containing missing values or values outside the scale
## range.
## Warning: Removed 1 row containing missing values or values outside the scale
## range.
## Warning: Removed 5 rows containing missing values or values outside the scale
## range.
## Warning: Computation failed in `stat_tsne2()`.
## Caused by error in `.check_tsne_params()`:
## ! perplexity is too large for the number of samples
library(tidyverse)
library(Rtsne)
iris_unique <- unique(iris) # Remove duplicates
iris_matrix <- as.matrix(iris_unique[,1:4])
set.seed(42) # Set a seed if you want reproducible results
tsne_out <- Rtsne(iris_matrix)
library(ggplot2)
tsne_plot <- data.frame(x = tsne_out$Y[,1],
y = tsne_out$Y[,2],
col = iris_unique$Species)
ggplot(tsne_plot) +
geom_point(aes(x=x, y=y, color=col))
compute_tsne <- function(data, scales, vars){
# identify duplicates just based on tsne data
data |>
select(all_of(vars)) |>
duplicated() ->
dups
#
data |>
_[!dups,] |>
remove_missing() ->
clean_data
clean_data |>
select(all_of(vars)) |>
as.matrix() |>
Rtsne() |>
_$Y |>
as_tibble() |>
rename(x = V1, y = V2) |>
bind_cols(clean_data)
}
StatTsne <- ggproto("StatTsne", Stat,
compute_panel = compute_tsne)
geom_tsne <- make_constructor(GeomPoint, stat = StatTsne)
library(ggplyr)
iris |>
ggplot() +
aes_from_data() +
geom_tsne(vars =
c("Sepal.Length", "Sepal.Width",
"Petal.Length", "Petal.Width")) +
aes(color = Species)
library(tidyverse)
library(quanteda)
## Warning: package 'quanteda' was built under R version 4.4.1
## Package version: 4.3.1
## Unicode version: 14.0
## ICU version: 71.1
## Parallel computing: disabled
## See https://quanteda.io for tutorials and examples.
# ggplot2.extension.scrapers::cran_gg_w_ggplot2_depends_or_imports |>
# select(package, description) |>
# filter(row_number() == 1, .by = package) |>
# tidytext::unnest_ngrams(word, description, n = 1) |>
# mutate(exists = 1) |>
# pivot_wider(names_from = word,
# values_from = exists)
ggplot2.extension.scrapers::cran_gg_w_ggplot2_depends_or_imports |>
select(package, description) |>
mutate(doc_id = package) |>
select(doc_id, description) |>
corpus(text_field = "description", docid_field = "doc_id") |>
tokens(remove_punct = TRUE, remove_numbers = TRUE) |>
tokens_tolower() |>
tokens_remove(stopwords("en")) |>
dfm() |> # document-feature matrix
convert(to = "data.frame") |>
select(-doc_id) |>
Rtsne::Rtsne(perplexity = 25) |>
_$Y |>
as_data_frame() |>
bind_cols(ggplot2.extension.scrapers::cran_gg_w_ggplot2_depends_or_imports["package"]) |>
ggplot() +
aes(V1, V2, label = package) +
geom_point(color = "grey") +
geom_text(check_overlap = T, size = 3)
## Warning: `as_data_frame()` was deprecated in tibble 2.0.0.
## ℹ Please use `as_tibble()` (with slightly different semantics) to convert to a
## tibble, or `as.data.frame()` to convert to a data frame.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
library(tidyverse)
library(quanteda)
set.seed(12456)
tools::CRAN_package_db() |>
janitor::clean_names() |>
select(package, description) |>
sample_frac(.04) |>
mutate(doc_id = package) |>
mutate(row_number = row_number(), .by = package) |>
filter(row_number == 1) ->
doc_text_df
doc_text_df |>
select(doc_id, description) |>
corpus(text_field = "description", docid_field = "doc_id") |>
tokens(remove_punct = TRUE, remove_numbers = TRUE) |>
tokens_tolower() |>
tokens_remove(stopwords("en")) |>
dfm() |> # document-feature matrix
convert(to = "data.frame") |>
select(-doc_id) |>
Rtsne::Rtsne(perplexity = 25) |>
_$Y |>
as_data_frame() |>
bind_cols(doc_text_df |> select(package)) |>
ggplot() +
aes(V1, V2, label = package) +
geom_point() +
aes(color = package |> str_detect("gg|GG|geom"))
# geom_text(check_overlap = T, size = 3)