2026-06-28 @corybrunson I’ve mentioned that I wanted to share with you on this, and I’ve thought about rehashing and emailing or putting on discussions but on my walk today, I felt like @ - ing might be a less resource intense way. So just doing that. 🤷‍♀️ Can @ be pushed to github? As I am doing? Another way?

Here, exploring a pivot to preprocessing or different in-processing features() instead.

Intro Thoughts

ggdims feels a little more engineered than it might be. See https://github.com/EvaMaeRey/ggdims. So Let’s look at a slightly different approach:

Target:

data |> 
  make_dims_column() |>
  ggplot() + 
  aes(dims = .dims_column) + # list column... each row has unique data for the 
  geom_pca()

Experiment

library(tidyverse)
declare_features <- function(data, ...){

data$features <- list(NA, length(data))

features_df <- data |> select(...)
# names(features_df) <- paste0(".f", 1:ncol(features_df))

for (i in 1:nrow(data)){
  
  data$features[[i]] <- features_df |> slice(i)
  
}             

data |> as_tibble()

}


iris |> 
  declare_features(Sepal.Length:Petal.Length, Petal.Width) 
## # A tibble: 150 × 6
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species features    
##           <dbl>       <dbl>        <dbl>       <dbl> <fct>   <list>      
##  1          5.1         3.5          1.4         0.2 setosa  <df [1 × 4]>
##  2          4.9         3            1.4         0.2 setosa  <df [1 × 4]>
##  3          4.7         3.2          1.3         0.2 setosa  <df [1 × 4]>
##  4          4.6         3.1          1.5         0.2 setosa  <df [1 × 4]>
##  5          5           3.6          1.4         0.2 setosa  <df [1 × 4]>
##  6          5.4         3.9          1.7         0.4 setosa  <df [1 × 4]>
##  7          4.6         3.4          1.4         0.3 setosa  <df [1 × 4]>
##  8          5           3.4          1.5         0.2 setosa  <df [1 × 4]>
##  9          4.4         2.9          1.4         0.2 setosa  <df [1 × 4]>
## 10          4.9         3.1          1.5         0.1 setosa  <df [1 × 4]>
## # ℹ 140 more rows
iris |> 
  declare_features(Sepal.Length:Petal.Length, Petal.Width) |>
  ggplot() + 
  aes(features = features) + ## do features() or do geometry = geometry...
  geom_blank()

make it interact with compute via a stat

compute_panel_pca <- function(data, scales){
  
  dims <- data |> select(features) |> unnest(cols = features)
    
  pca_ed <- dims |>
    ordr::ordinate(model = ~ prcomp(., scale. = TRUE)) |> 
    _[[5]] |> 
    as_tibble()
  
  pca_ed |> 
    bind_cols(data)
  
}

iris |> 
  declare_features(Sepal.Length:Petal.Length, Petal.Width) |> 
  select(features) |>
  compute_panel_pca() 
## # A tibble: 150 × 5
##      PC1     PC2     PC3      PC4 features    
##    <dbl>   <dbl>   <dbl>    <dbl> <list>      
##  1 -2.26 -0.478   0.127   0.0241  <df [1 × 4]>
##  2 -2.07  0.672   0.234   0.103   <df [1 × 4]>
##  3 -2.36  0.341  -0.0441  0.0283  <df [1 × 4]>
##  4 -2.29  0.595  -0.0910 -0.0657  <df [1 × 4]>
##  5 -2.38 -0.645  -0.0157 -0.0358  <df [1 × 4]>
##  6 -2.07 -1.48   -0.0269  0.00659 <df [1 × 4]>
##  7 -2.44 -0.0475 -0.334  -0.0367  <df [1 × 4]>
##  8 -2.23 -0.222   0.0884 -0.0245  <df [1 × 4]>
##  9 -2.33  1.11   -0.145  -0.0268  <df [1 × 4]>
## 10 -2.18  0.467   0.253  -0.0398  <df [1 × 4]>
## # ℹ 140 more rows
library(statexpress)
iris |> 
  declare_features(Sepal.Length:Petal.Length, Petal.Width) |> 
  ggplot() + 
  aes(features = features) + 
  qlayer(stat = qstat_panel(compute_panel_pca,
                      default_aes = aes(x = after_stat(PC1), 
                                        y = after_stat(PC2)))) + 
  aes(color = Species)

p <- last_plot()

features <- function(){
  
  aes(features = features)
  
}


iris |> 
  declare_features(Sepal.Length:Petal.Length, Petal.Width) |> 
  ggplot() + 
  features() + 
  qlayer(stat = qstat_panel(compute_panel_pca,
                      default_aes = aes(x = after_stat(PC1), 
                                        y = after_stat(PC2)))) + 
  aes(color = Species)

write a proper Stat and user facing function…

StatPCA <- ggproto(`_class` = "StatPCA", `_inherit` = Stat,
                   compute_panel = compute_panel_pca,
                   required_aes = "features",
                   default_aes = aes(x = after_stat(PC1), 
                                        y = after_stat(PC2)))


geom_pca0 <- make_constructor(GeomPoint, stat = StatPCA)

iris |> 
  declare_features(Sepal.Length:Petal.Length, Petal.Width) |> 
  ggplot() + 
  aes(features = features) +
  geom_pca0()

last_plot() +
  aes(color = Species)

automatically look for the column like geom_sf looks for geometry (how does that managed?)

geom_pca <- function(...){
  
  list(
  geom_pca0(...),
  aes(features = features)
  )
  
}

iris |> 
  declare_features(Sepal.Length:Petal.Length, Petal.Width) |> 
  ggplot() + 
  geom_pca()

last_plot() + 
  aes(color = Species)

Preferred … wish list: add a features() function that can be declared within - fails with selections with commas (other tidy select moves?)

#' @export
features0 <- function(select0) {

  structure(
    list(
      select_spec = rlang::enquo(select0)
    ), 
    class = "features0"
    )

}


#' @import ggplot2
#' @importFrom ggplot2 ggplot_add
#' @export
ggplot_add.features0 <- function(object, plot, object_name) {
  
  plot$data <- plot$data |> 
    declare_features(!!object$select_spec)

  plot <- plot + aes(features = features)

  plot

}


features <- function(select){
  
  features0({{select}})
  
}


library(statexpress)
iris |> 
  ggplot() + 
  features(Sepal.Length:Petal.Length) + 
  geom_pca0() + 
  aes(color = Species)

library(statexpress)
iris |> 
  ggplot() + 
  features(Sepal.Length:Petal.Length, Petal.Width) + 
  geom_pca0() + 
  aes(color = Species)
## Error in features(Sepal.Length:Petal.Length, Petal.Width): unused argument (Petal.Width)

not preferred, though I’m having trouble articulating why…

ggdims <- function(data, mapping, ...){
  
  
  
  
  
}

a little venn exploration

library(ggVennDiagram)
## 
## Attaching package: 'ggVennDiagram'
## The following object is masked from 'package:tidyr':
## 
##     unite
y <- list(
  A = sample(letters, 8),
  B = sample(letters, 8),
  C = sample(letters, 8),
  D = sample(letters, 8)
) 

as.data.frame(y)
##   A B C D
## 1 d x p p
## 2 i o x h
## 3 p g k n
## 4 k i e f
## 5 c s j q
## 6 f y o l
## 7 r f m a
## 8 j c a c
ggVennDiagram(y)

process_region_data(Venn(y))
## # A tibble: 15 × 4
##    id      name    item      count
##    <chr>   <chr>   <list>    <int>
##  1 1       A       <chr [2]>     2
##  2 2       B       <chr [3]>     3
##  3 3       C       <chr [2]>     2
##  4 4       D       <chr [4]>     4
##  5 1/2     A/B     <chr [1]>     1
##  6 1/3     A/C     <chr [2]>     2
##  7 1/4     A/D     <chr [0]>     0
##  8 2/3     B/C     <chr [2]>     2
##  9 2/4     B/D     <chr [0]>     0
## 10 3/4     C/D     <chr [1]>     1
## 11 1/2/3   A/B/C   <chr [0]>     0
## 12 1/2/4   A/B/D   <chr [2]>     2
## 13 1/3/4   A/C/D   <chr [1]>     1
## 14 2/3/4   B/C/D   <chr [0]>     0
## 15 1/2/3/4 A/B/C/D <chr [0]>     0
genes <- paste("gene",1:1000,sep="")
set.seed(20231214)
x <- list(A=sample(genes,300),
          B=sample(genes,525),
          C=sample(genes,440),
          D=sample(genes,350))


ggVennDiagram
## function (x, category.names = names(x), show_intersect = FALSE, 
##     set_color = "black", set_size = NA, label = c("both", "count", 
##         "percent", "none"), label_alpha = 0.5, label_font = "sans", 
##     label_bigInterval = 3L, label_bigMark = ",", label_geom = c("label", 
##         "text"), label_color = "black", label_size = NA, label_percent_digit = 0, 
##     label_txtWidth = 40, edge_lty = "solid", edge_size = 1, force_upset = FALSE, 
##     nintersects = 20, order.intersect.by = c("size", "name", 
##         "none"), order.set.by = c("size", "name", "none"), relative_height = 3, 
##     relative_width = 0.3, shape_id = NULL, ...) 
## {
##     if (!is.list(x)) {
##         stop(simpleError("ggVennDiagram() requires at least a list."))
##     }
##     names(x) = category.names
##     dimension = length(x)
##     venn = Venn(x)
##     label = match.arg(label)
##     label_geom = match.arg(label_geom)
##     if (dimension <= 7 & !force_upset) {
##         data = process_data(venn, shape_id = shape_id)
##         plot_venn(data, show_intersect = show_intersect, set_color = set_color, 
##             set_size = set_size, label = label, label_alpha = label_alpha, 
##             label_font = label_font, label_geom = label_geom, 
##             label_color = label_color, label_size = label_size, 
##             label_bigMark = label_bigMark, label_bigInterval = label_bigInterval, 
##             label_percent_digit = label_percent_digit, label_txtWidth = label_txtWidth, 
##             edge_lty = edge_lty, edge_size = edge_size, ...)
##     }
##     else {
##         if (!force_upset) 
##             warning("Only support 2-7 dimension Venn diagram. Will give a plain upset plot instead.")
##         plot_upset(venn, nintersects = nintersects, order.intersect.by = order.intersect.by, 
##             order.set.by = order.set.by, relative_height = relative_height, 
##             relative_width = relative_width, ...)
##     }
## }
## <bytecode: 0x12da4d6d0>
## <environment: namespace:ggVennDiagram>
ggVennDiagram::plot_venn
## function (data, show_intersect = FALSE, set_color = "black", 
##     set_size = NA, label = "both", label_geom = "label", label_alpha = 0.5, 
##     label_font = "sans", label_color = "black", label_size = NA, 
##     label_percent_digit = 0, label_bigMark = ",", label_bigInterval = 3L, 
##     label_txtWidth = 40, edge_lty = "solid", edge_size = 1, ...) 
## {
##     setedge.params = list(data = get_shape_setedge(data, color = set_color, 
##         linetype = edge_lty, linewidth = as.numeric(edge_size)), 
##         mapping = aes(color = I(.data$color), group = .data$id, 
##             linetype = I(.data$linetype), linewidth = I(.data$linewidth)), 
##         show.legend = FALSE)
##     setlabel.params = list(data = get_shape_setlabel(data, size = as.numeric(set_size), 
##         color = set_color), mapping = aes(label = .data$name, 
##         size = I(.data$size), color = I(.data$color)), family = label_font, 
##         show.legend = FALSE)
##     region.params = list(data = dplyr::left_join(get_shape_regionedge(data), 
##         venn_region(data), by = "id"), mapping = aes(fill = .data$count, 
##         group = .data$id))
##     setedge.layer = do.call("geom_path", setedge.params)
##     setlabel.layer = do.call("geom_text", setlabel.params)
##     region.layer = do.call("geom_polygon", region.params)
##     p = ggplot(mapping = aes(.data$X, .data$Y))
##     p_nonlabel = p + region.layer + setedge.layer + setlabel.layer + 
##         theme_void() + coord_equal()
##     if (label == "none") {
##         return(p_nonlabel)
##     }
##     region_label = get_shape_regionlabel(data)
##     if (show_intersect) {
##         check_package("plotly")
##         region_label = dplyr::mutate(dplyr::rowwise(region_label), 
##             item = yulab.utils::str_wrap(paste0(.data$item, collapse = " "), 
##                 width = label_txtWidth))
##         p_plotly = p_nonlabel + geom_text(aes(label = .data$count, 
##             text = .data$item), data = region_label) + theme(legend.position = "none")
##         ax = list(showline = FALSE)
##         p_plotly = plotly::layout(plotly::ggplotly(p_plotly, 
##             tooltip = c("text")), xaxis = ax, yaxis = ax)
##         return(p_plotly)
##     }
##     region_label = dplyr::mutate(region_label, percent = paste(round(.data$count * 
##         100/sum(.data$count), digits = label_percent_digit), 
##         "%", sep = ""), both = paste(format(.data$count, big.mark = label_bigMark, 
##         big.interval = label_bigInterval), paste0("(", .data$percent, 
##         ")"), sep = "\n"))
##     if (label_geom == "label") {
##         p_label = p_nonlabel + geom_label(aes(label = .data[[label]]), 
##             data = region_label, alpha = label_alpha, family = label_font, 
##             color = label_color, size = label_size, lineheight = 0.85, 
##             label.size = NA, linewidth = 0, fill = "white")
##         return(p_label)
##     }
##     if (label_geom == "text") {
##         p_label = p_nonlabel + geom_text(aes(label = .data[[label]]), 
##             data = region_label, alpha = label_alpha, family = label_font, 
##             color = label_color, size = label_size, lineheight = 0.85)
##         return(p_label)
##     }
## }
## <bytecode: 0x12ef4a510>
## <environment: namespace:ggVennDiagram>
compute_venn_list <- function(data){
  
  df <- data |>
    select(features) |> 
    unnest(features) 
  
  venn_list <- list(NA)
  # names(venn_list) <- names(df)
    
  for(i in 1:ncol(df)){
    
    venn_list[[i]] <- (1:nrow(df))[df[,i] |> pull() |> as.logical()]
    
  }
  
  names(venn_list) <-  names(df)

  venn_list
  
  }


# raw TF data
recipes_characteristics_logical <- data.frame(
  tasty = sample(0:1, 100, replace = T), 
  easy = sample(0:1, 100, replace = T)) |>
  mutate(id = paste("item", row_number()), .before = 1)


recipes_characteristics_logical |> 
  # declare features and select only features
  declare_features(tasty, easy) |> 
  select(features) |>
  # compute that happens internally in an geom_venn()
  compute_venn_list() |> 
  ggVennDiagram() + 
  labs(title = "Recipe characteristics: What percent are easy and tasty?")

ggVennDiagram:::plot_shapes()

# data |> 
#   declare_features |> 
#   ggplot() + 
#     aes(features = features) + 
#     geom_venn()

Closing remarks, Other Relevant Work, Caveats