Intro Thoughts

Status Quo

library(tidyverse)

library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.4.1 ──
## ✔ broom        1.0.11     ✔ rsample      1.3.2 
## ✔ dials        1.4.2      ✔ tailor       0.1.0 
## ✔ infer        1.1.0      ✔ tune         2.0.1 
## ✔ modeldata    1.5.1      ✔ workflows    1.3.0 
## ✔ parsnip      1.4.1      ✔ workflowsets 1.1.1 
## ✔ recipes      1.3.1      ✔ yardstick    1.3.2
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
set.seed(27)

centers <- tibble(
  cluster = factor(1:3), 
  num_points = c(100, 150, 50),  # number points in each cluster
  x1 = c(5, 0, -3),              # x1 coordinate of cluster center
  x2 = c(-1, 1, -2)              # x2 coordinate of cluster center
)

labelled_points <- 
  centers %>%
  mutate(
    x1 = map2(num_points, x1, rnorm),
    x2 = map2(num_points, x2, rnorm)
  ) %>% 
  select(-num_points) %>% 
  unnest(cols = c(x1, x2))

ggplot(labelled_points, aes(x1, x2, color = cluster)) +
  geom_point(alpha = 0.3)

points <- 
  labelled_points %>% 
  select(-cluster)

kclust <- kmeans(points, centers = 3)
kclust
## K-means clustering with 3 clusters of sizes 148, 51, 101
## 
## Cluster means:
##            x1        x2
## 1  0.08853475  1.045461
## 2 -3.14292460 -2.000043
## 3  5.00401249 -1.045811
## 
## Clustering vector:
##   [1] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
##  [38] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
##  [75] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1
## [112] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [149] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [186] 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [223] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2
## [260] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [297] 2 2 2 2
## 
## Within cluster sum of squares by cluster:
## [1] 298.9415 108.8112 243.2092
##  (between_SS / total_SS =  82.5 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
augment(kclust, points)
## # A tibble: 300 × 3
##       x1     x2 .cluster
##    <dbl>  <dbl> <fct>   
##  1  6.91 -2.74  3       
##  2  6.14 -2.45  3       
##  3  4.24 -0.946 3       
##  4  3.54  0.287 3       
##  5  3.91  0.408 3       
##  6  5.30 -1.58  3       
##  7  5.01 -1.77  3       
##  8  6.16 -1.68  3       
##  9  7.13 -2.17  3       
## 10  5.24 -2.42  3       
## # ℹ 290 more rows
tidy(kclust)
## # A tibble: 3 × 5
##        x1    x2  size withinss cluster
##     <dbl> <dbl> <int>    <dbl> <fct>  
## 1  0.0885  1.05   148     299. 1      
## 2 -3.14   -2.00    51     109. 2      
## 3  5.00   -1.05   101     243. 3
data_prep <- penguins |> 
  select(bill_length_mm, bill_depth_mm) |> 
  remove_missing() 
## Warning: Removed 2 rows containing missing values or values outside the scale
## range.
data_prep |>
  kmeans(centers = 3) |> 
  augment(data_prep) |> 
  ggplot() + 
  aes(bill_length_mm, bill_depth_mm) + 
  geom_point() + 
  aes(color = .cluster)

compute_panel_kmeans <- function(data, scales, centers = 3, seed = 1234){
  
  set.seed(seed)
  
  data_prep <- data |> 
    mutate(row_id = row_number()) |>
    select(x, y) |> 
    remove_missing() 
  
  # unmodeled <- data |> # do this more carefully
  #   select(-x, -y) |> 
  #   slice(data_prep$row_id)

  data_prep |>
  kmeans(centers = centers) |> 
  augment(data_prep) #|>
  # bind_cols(unmodeled)
  
}

penguins |> 
  select(x = bill_length_mm, y = bill_depth_mm) |> 
  compute_panel_kmeans()
## Warning: Removed 2 rows containing missing values or values outside the scale
## range.
## # A tibble: 342 × 3
##        x     y .cluster
##    <dbl> <dbl> <fct>   
##  1  39.1  18.7 3       
##  2  39.5  17.4 3       
##  3  40.3  18   3       
##  4  36.7  19.3 3       
##  5  39.3  20.6 3       
##  6  38.9  17.8 3       
##  7  39.2  19.6 3       
##  8  34.1  18.1 3       
##  9  42    20.2 3       
## 10  37.8  17.1 3       
## # ℹ 332 more rows
compute_panel_kmeans_tidy <- function(data, scales, centers = 3, seed = 1234){
  
  set.seed(seed)
  
  data_prep <- data |> 
    mutate(row_id = row_number()) |>
    select(x, y) |> 
    remove_missing() 
  
  # unmodeled <- data |> # do this more carefully
  #   select(-x, -y) |> 
  #   slice(data_prep$row_id)

  data_prep |>
  kmeans(centers = centers) |> 
  tidy() |> 
  rename(.size = size) #|>
  # bind_cols(unmodeled)
  
}
compute_panel_kmeans_lengths <- function(data, scales, centers = 3, seed = 1234){
  
  set.seed(seed)
  
  data_prep <- data |> 
    mutate(row_id = row_number()) |>
    select(x, y) |> 
    remove_missing() 
  
  # unmodeled <- data |> # do this more carefully
  #   select(-x, -y) |> 
  #   slice(data_prep$row_id)

  kmeaned <- data_prep |>
    kmeans(centers = centers)
  
  points <- kmeaned |> 
  augment(data_prep) #|>
  # bind_cols(unmodeled)
  
  kmeaned |> 
  tidy() |> 
  rename(.size = size) |>
  rename(.cluster = cluster) |>
  rename(xend = x, yend = y) |>
  right_join(points)
  
}


StatKmeansSegments <- ggproto("StatKmeansSegments", Stat,
                              compute_panel = compute_panel_kmeans_lengths,
                              default_aes = aes(color = after_stat(.cluster)))

geom_kmeans_lengths <- make_constructor(GeomSegment, stat = StatKmeansSegments)

penguins |> 
  remove_missing() |>
  select(x = bill_length_mm, y = bill_depth_mm) |> 
  compute_panel_kmeans_tidy()
## Warning: Removed 11 rows containing missing values or values outside the scale
## range.
## # A tibble: 3 × 5
##       x     y .size withinss cluster
##   <dbl> <dbl> <int>    <dbl> <fct>  
## 1  50.9  17.3    85     618. 1      
## 2  45.5  15.7   112     742. 2      
## 3  38.4  18.3   136     905. 3
penguins_clean <- penguins |> 
  remove_missing()
## Warning: Removed 11 rows containing missing values or values outside the scale
## range.
penguins_clean |> 
  ggplot() + 
  aes(x = bill_length_mm, 
      y = bill_depth_mm) + 
  geom_kmeans_lengths() + 
  geom_point()
## Joining with `by = join_by(.cluster)`

StatKmeans <- ggproto("StatKmeans", Stat,
                      compute_panel = compute_panel_kmeans,
                      default_aes = aes(color = after_stat(.cluster)))

geom_kmeans <- make_constructor(GeomPoint, stat = StatKmeans)

StatKmeansCenters <- ggproto("StatKmeansCenters", Stat,
                      compute_panel = compute_panel_kmeans_tidy)

GeomPointX <- ggproto("GeomPointX", GeomPoint,
                      default_aes = GeomPoint$default_aes |> 
                        modifyList(aes(shape = 13, size = 7), 
                                   keep.null = T)
                      )

geom_kmeans_center <- make_constructor(GeomPointX, stat = StatKmeansCenters)


penguins_clean <- penguins |>
  remove_missing() 
## Warning: Removed 11 rows containing missing values or values outside the scale
## range.
penguins_clean |> 
  ggplot() + 
  aes(bill_length_mm, bill_depth_mm) + 
  geom_point() 

last_plot() + 
  geom_kmeans()

last_plot() +
  geom_kmeans_center()

penguins_clean |> 
  ggplot() + 
  aes(bill_length_mm, bill_depth_mm) + 
  geom_point() + 
  aes(color = species)

penguins_clean |> 
  ggplot() + 
  aes(bill_length_mm, bill_depth_mm) + 
  geom_kmeans(centers = 2) +
  geom_kmeans_center(centers = 2)

penguins_clean |> 
  ggplot() + 
  aes(bill_length_mm, bill_depth_mm) + 
  geom_kmeans(centers = 3) +
  geom_kmeans_center(centers = 3)

library(tictoc)
tic()

penguins_clean |> 
  ggplot() + 
  aes(bill_length_mm, bill_depth_mm) + 
  geom_kmeans(centers = 4) +
  geom_kmeans_center(centers = 4)

toc()
## 0.091 sec elapsed
penguins_clean |> 
  ggplot() + 
  aes(bill_length_mm, bill_depth_mm) + 
  geom_kmeans(centers = 5) +
  geom_kmeans_center(centers = 5)

PAIR_code_umap_mammoth_url <- "https://raw.githubusercontent.com/PAIR-code/understanding-umap/refs/heads/master/raw_data/mammoth_3d.json"

library(ggdims)
library(tictoc)



mammoth_df <- PAIR_code_umap_mammoth_url |>
  jsonlite::fromJSON() |>
  as.data.frame() 

tic()

 mammoth_df |> 
  ggplot() + 
  aes(V1, V2) + 
  geom_kmeans(centers = 4) +
  geom_kmeans_center(centers = 4)

toc() 
## 0.171 sec elapsed
mammoth_df |> 
  ggplot() + 
  aes(V1, V2) + 
  geom_kmeans(centers = 4) +
  geom_kmeans_lengths(centers = 4,
    linewidth = .2, alpha = .2) + 
  geom_kmeans_center(centers = 4)
## Joining with `by = join_by(.cluster)`

layer_three <- layer_data(i = 3)
## Joining with `by = join_by(.cluster)`
last_plot() +
  ggforce::geom_voronoi_segment(data = layer_three,  # geom_kmeans_partitian
                                aes(x = x, 
                                    y = y)) +
  ggforce::geom_voronoi_tile(data = layer_three |> rename(.cluster = cluster),
                                aes(x = x, 
                                    y = y,
                                    fill = .cluster))
## Joining with `by = join_by(.cluster)`
## Warning: Computation failed in `stat_voronoi_tile()`.
## Caused by error in `deldir::deldir()`:
## ! The x-range of the points is zero, whence a rectangular window
##   cannot be inferred from the data.  You must specify the rectangular
##   window explicitly.

library(ggplot2)
library(ggforce)
# Set a max radius
ggplot(iris, 
       aes(Sepal.Length, Sepal.Width, group = -1L)) +
  geom_voronoi_tile(aes(fill = Species), 
                    colour = 'black', max.radius = 0.25)
## Warning: `stat_voronoi_tile()` is dropping duplicated points

library(tidyverse)

ind_complete <- complete.cases(penguins)

penguins |> 
  select(bill_len, bill_dep) |> 
  filter(ind_complete) |>
  kmeans(centers = 4) |>
  broom::tidy()

penguins |> 
  select(bill_len, bill_dep) |> 
  filter(ind_complete) |>
  kmeans(centers = 4) |>
  broom::augment(
    penguins |> 
      filter(ind_complete)
    )

Closing remarks, Other Relevant Work, Caveats