2026-06-28 @corybrunson I’ve mentioned that I wanted to share with you on this, and I’ve thought about rehashing and emailing or putting on discussions but on my walk today, I felt like @ - ing might be a less resource intense way. So just doing that. 🤷♀️ Can @ be pushed to github? As I am doing? Another way?
Here, exploring a pivot to preprocessing or different in-processing features() instead.
ggdims feels a little more engineered than it might be. See https://github.com/EvaMaeRey/ggdims. So Let’s look at a slightly different approach:
Target:
data |>
make_dims_column() |>
ggplot() +
aes(dims = .dims_column) + # list column... each row has unique data for the
geom_pca()
library(tidyverse)
declare_features <- function(data, ...){
data$features <- list(NA, length(data))
features_df <- data |> select(...)
# names(features_df) <- paste0(".f", 1:ncol(features_df))
for (i in 1:nrow(data)){
data$features[[i]] <- features_df |> slice(i)
}
data |> as_tibble()
}
iris |>
declare_features(Sepal.Length:Petal.Length, Petal.Width)
## # A tibble: 150 × 6
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species features
## <dbl> <dbl> <dbl> <dbl> <fct> <list>
## 1 5.1 3.5 1.4 0.2 setosa <df [1 × 4]>
## 2 4.9 3 1.4 0.2 setosa <df [1 × 4]>
## 3 4.7 3.2 1.3 0.2 setosa <df [1 × 4]>
## 4 4.6 3.1 1.5 0.2 setosa <df [1 × 4]>
## 5 5 3.6 1.4 0.2 setosa <df [1 × 4]>
## 6 5.4 3.9 1.7 0.4 setosa <df [1 × 4]>
## 7 4.6 3.4 1.4 0.3 setosa <df [1 × 4]>
## 8 5 3.4 1.5 0.2 setosa <df [1 × 4]>
## 9 4.4 2.9 1.4 0.2 setosa <df [1 × 4]>
## 10 4.9 3.1 1.5 0.1 setosa <df [1 × 4]>
## # ℹ 140 more rows
iris |>
declare_features(Sepal.Length:Petal.Length, Petal.Width) |>
ggplot() +
aes(features = features) + ## do features() or do geometry = geometry...
geom_blank()
compute_panel_pca <- function(data, scales){
dims <- data |> select(features) |> unnest(cols = features)
pca_ed <- dims |>
ordr::ordinate(model = ~ prcomp(., scale. = TRUE)) |>
_[[5]] |>
as_tibble()
pca_ed |>
bind_cols(data)
}
iris |>
declare_features(Sepal.Length:Petal.Length, Petal.Width) |>
select(features) |>
compute_panel_pca()
## # A tibble: 150 × 5
## PC1 PC2 PC3 PC4 features
## <dbl> <dbl> <dbl> <dbl> <list>
## 1 -2.26 -0.478 0.127 0.0241 <df [1 × 4]>
## 2 -2.07 0.672 0.234 0.103 <df [1 × 4]>
## 3 -2.36 0.341 -0.0441 0.0283 <df [1 × 4]>
## 4 -2.29 0.595 -0.0910 -0.0657 <df [1 × 4]>
## 5 -2.38 -0.645 -0.0157 -0.0358 <df [1 × 4]>
## 6 -2.07 -1.48 -0.0269 0.00659 <df [1 × 4]>
## 7 -2.44 -0.0475 -0.334 -0.0367 <df [1 × 4]>
## 8 -2.23 -0.222 0.0884 -0.0245 <df [1 × 4]>
## 9 -2.33 1.11 -0.145 -0.0268 <df [1 × 4]>
## 10 -2.18 0.467 0.253 -0.0398 <df [1 × 4]>
## # ℹ 140 more rows
library(statexpress)
iris |>
declare_features(Sepal.Length:Petal.Length, Petal.Width) |>
ggplot() +
aes(features = features) +
qlayer(stat = qstat_panel(compute_panel_pca,
default_aes = aes(x = after_stat(PC1),
y = after_stat(PC2)))) +
aes(color = Species)
p <- last_plot()
features <- function(){
aes(features = features)
}
iris |>
declare_features(Sepal.Length:Petal.Length, Petal.Width) |>
ggplot() +
features() +
qlayer(stat = qstat_panel(compute_panel_pca,
default_aes = aes(x = after_stat(PC1),
y = after_stat(PC2)))) +
aes(color = Species)
StatPCA <- ggproto(`_class` = "StatPCA", `_inherit` = Stat,
compute_panel = compute_panel_pca,
required_aes = "features",
default_aes = aes(x = after_stat(PC1),
y = after_stat(PC2)))
geom_pca0 <- make_constructor(GeomPoint, stat = StatPCA)
iris |>
declare_features(Sepal.Length:Petal.Length, Petal.Width) |>
ggplot() +
aes(features = features) +
geom_pca0()
last_plot() +
aes(color = Species)
geom_pca <- function(...){
list(
geom_pca0(...),
aes(features = features)
)
}
iris |>
declare_features(Sepal.Length:Petal.Length, Petal.Width) |>
ggplot() +
geom_pca()
last_plot() +
aes(color = Species)
features() function that
can be declared within - fails with selections with commas (other tidy
select moves?)#' @export
features0 <- function(select0) {
structure(
list(
select_spec = rlang::enquo(select0)
),
class = "features0"
)
}
#' @import ggplot2
#' @importFrom ggplot2 ggplot_add
#' @export
ggplot_add.features0 <- function(object, plot, object_name) {
plot$data <- plot$data |>
declare_features(!!object$select_spec)
plot <- plot + aes(features = features)
plot
}
features <- function(select){
features0({{select}})
}
library(statexpress)
iris |>
ggplot() +
features(Sepal.Length:Petal.Length) +
geom_pca0() +
aes(color = Species)
library(statexpress)
iris |>
ggplot() +
features(Sepal.Length:Petal.Length, Petal.Width) +
geom_pca0() +
aes(color = Species)
## Error in features(Sepal.Length:Petal.Length, Petal.Width): unused argument (Petal.Width)
ggdims <- function(data, mapping, ...){
}
library(ggVennDiagram)
##
## Attaching package: 'ggVennDiagram'
## The following object is masked from 'package:tidyr':
##
## unite
y <- list(
A = sample(letters, 8),
B = sample(letters, 8),
C = sample(letters, 8),
D = sample(letters, 8)
)
as.data.frame(y)
## A B C D
## 1 d x p p
## 2 i o x h
## 3 p g k n
## 4 k i e f
## 5 c s j q
## 6 f y o l
## 7 r f m a
## 8 j c a c
ggVennDiagram(y)
process_region_data(Venn(y))
## # A tibble: 15 × 4
## id name item count
## <chr> <chr> <list> <int>
## 1 1 A <chr [2]> 2
## 2 2 B <chr [3]> 3
## 3 3 C <chr [2]> 2
## 4 4 D <chr [4]> 4
## 5 1/2 A/B <chr [1]> 1
## 6 1/3 A/C <chr [2]> 2
## 7 1/4 A/D <chr [0]> 0
## 8 2/3 B/C <chr [2]> 2
## 9 2/4 B/D <chr [0]> 0
## 10 3/4 C/D <chr [1]> 1
## 11 1/2/3 A/B/C <chr [0]> 0
## 12 1/2/4 A/B/D <chr [2]> 2
## 13 1/3/4 A/C/D <chr [1]> 1
## 14 2/3/4 B/C/D <chr [0]> 0
## 15 1/2/3/4 A/B/C/D <chr [0]> 0
genes <- paste("gene",1:1000,sep="")
set.seed(20231214)
x <- list(A=sample(genes,300),
B=sample(genes,525),
C=sample(genes,440),
D=sample(genes,350))
ggVennDiagram
## function (x, category.names = names(x), show_intersect = FALSE,
## set_color = "black", set_size = NA, label = c("both", "count",
## "percent", "none"), label_alpha = 0.5, label_font = "sans",
## label_bigInterval = 3L, label_bigMark = ",", label_geom = c("label",
## "text"), label_color = "black", label_size = NA, label_percent_digit = 0,
## label_txtWidth = 40, edge_lty = "solid", edge_size = 1, force_upset = FALSE,
## nintersects = 20, order.intersect.by = c("size", "name",
## "none"), order.set.by = c("size", "name", "none"), relative_height = 3,
## relative_width = 0.3, shape_id = NULL, ...)
## {
## if (!is.list(x)) {
## stop(simpleError("ggVennDiagram() requires at least a list."))
## }
## names(x) = category.names
## dimension = length(x)
## venn = Venn(x)
## label = match.arg(label)
## label_geom = match.arg(label_geom)
## if (dimension <= 7 & !force_upset) {
## data = process_data(venn, shape_id = shape_id)
## plot_venn(data, show_intersect = show_intersect, set_color = set_color,
## set_size = set_size, label = label, label_alpha = label_alpha,
## label_font = label_font, label_geom = label_geom,
## label_color = label_color, label_size = label_size,
## label_bigMark = label_bigMark, label_bigInterval = label_bigInterval,
## label_percent_digit = label_percent_digit, label_txtWidth = label_txtWidth,
## edge_lty = edge_lty, edge_size = edge_size, ...)
## }
## else {
## if (!force_upset)
## warning("Only support 2-7 dimension Venn diagram. Will give a plain upset plot instead.")
## plot_upset(venn, nintersects = nintersects, order.intersect.by = order.intersect.by,
## order.set.by = order.set.by, relative_height = relative_height,
## relative_width = relative_width, ...)
## }
## }
## <bytecode: 0x12da4d6d0>
## <environment: namespace:ggVennDiagram>
ggVennDiagram::plot_venn
## function (data, show_intersect = FALSE, set_color = "black",
## set_size = NA, label = "both", label_geom = "label", label_alpha = 0.5,
## label_font = "sans", label_color = "black", label_size = NA,
## label_percent_digit = 0, label_bigMark = ",", label_bigInterval = 3L,
## label_txtWidth = 40, edge_lty = "solid", edge_size = 1, ...)
## {
## setedge.params = list(data = get_shape_setedge(data, color = set_color,
## linetype = edge_lty, linewidth = as.numeric(edge_size)),
## mapping = aes(color = I(.data$color), group = .data$id,
## linetype = I(.data$linetype), linewidth = I(.data$linewidth)),
## show.legend = FALSE)
## setlabel.params = list(data = get_shape_setlabel(data, size = as.numeric(set_size),
## color = set_color), mapping = aes(label = .data$name,
## size = I(.data$size), color = I(.data$color)), family = label_font,
## show.legend = FALSE)
## region.params = list(data = dplyr::left_join(get_shape_regionedge(data),
## venn_region(data), by = "id"), mapping = aes(fill = .data$count,
## group = .data$id))
## setedge.layer = do.call("geom_path", setedge.params)
## setlabel.layer = do.call("geom_text", setlabel.params)
## region.layer = do.call("geom_polygon", region.params)
## p = ggplot(mapping = aes(.data$X, .data$Y))
## p_nonlabel = p + region.layer + setedge.layer + setlabel.layer +
## theme_void() + coord_equal()
## if (label == "none") {
## return(p_nonlabel)
## }
## region_label = get_shape_regionlabel(data)
## if (show_intersect) {
## check_package("plotly")
## region_label = dplyr::mutate(dplyr::rowwise(region_label),
## item = yulab.utils::str_wrap(paste0(.data$item, collapse = " "),
## width = label_txtWidth))
## p_plotly = p_nonlabel + geom_text(aes(label = .data$count,
## text = .data$item), data = region_label) + theme(legend.position = "none")
## ax = list(showline = FALSE)
## p_plotly = plotly::layout(plotly::ggplotly(p_plotly,
## tooltip = c("text")), xaxis = ax, yaxis = ax)
## return(p_plotly)
## }
## region_label = dplyr::mutate(region_label, percent = paste(round(.data$count *
## 100/sum(.data$count), digits = label_percent_digit),
## "%", sep = ""), both = paste(format(.data$count, big.mark = label_bigMark,
## big.interval = label_bigInterval), paste0("(", .data$percent,
## ")"), sep = "\n"))
## if (label_geom == "label") {
## p_label = p_nonlabel + geom_label(aes(label = .data[[label]]),
## data = region_label, alpha = label_alpha, family = label_font,
## color = label_color, size = label_size, lineheight = 0.85,
## label.size = NA, linewidth = 0, fill = "white")
## return(p_label)
## }
## if (label_geom == "text") {
## p_label = p_nonlabel + geom_text(aes(label = .data[[label]]),
## data = region_label, alpha = label_alpha, family = label_font,
## color = label_color, size = label_size, lineheight = 0.85)
## return(p_label)
## }
## }
## <bytecode: 0x12ef4a510>
## <environment: namespace:ggVennDiagram>
compute_venn_list <- function(data){
df <- data |>
select(features) |>
unnest(features)
venn_list <- list(NA)
# names(venn_list) <- names(df)
for(i in 1:ncol(df)){
venn_list[[i]] <- (1:nrow(df))[df[,i] |> pull() |> as.logical()]
}
names(venn_list) <- names(df)
venn_list
}
# raw TF data
recipes_characteristics_logical <- data.frame(
tasty = sample(0:1, 100, replace = T),
easy = sample(0:1, 100, replace = T)) |>
mutate(id = paste("item", row_number()), .before = 1)
recipes_characteristics_logical |>
# declare features and select only features
declare_features(tasty, easy) |>
select(features) |>
# compute that happens internally in an geom_venn()
compute_venn_list() |>
ggVennDiagram() +
labs(title = "Recipe characteristics: What percent are easy and tasty?")
ggVennDiagram:::plot_shapes()
# data |>
# declare_features |>
# ggplot() +
# aes(features = features) +
# geom_venn()