Status Quo
library(tidyverse)
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.4.1 ──
## ✔ broom 1.0.11 ✔ rsample 1.3.2
## ✔ dials 1.4.2 ✔ tailor 0.1.0
## ✔ infer 1.1.0 ✔ tune 2.0.1
## ✔ modeldata 1.5.1 ✔ workflows 1.3.0
## ✔ parsnip 1.4.1 ✔ workflowsets 1.1.1
## ✔ recipes 1.3.1 ✔ yardstick 1.3.2
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
set.seed(27)
centers <- tibble(
cluster = factor(1:3),
num_points = c(100, 150, 50), # number points in each cluster
x1 = c(5, 0, -3), # x1 coordinate of cluster center
x2 = c(-1, 1, -2) # x2 coordinate of cluster center
)
labelled_points <-
centers %>%
mutate(
x1 = map2(num_points, x1, rnorm),
x2 = map2(num_points, x2, rnorm)
) %>%
select(-num_points) %>%
unnest(cols = c(x1, x2))
ggplot(labelled_points, aes(x1, x2, color = cluster)) +
geom_point(alpha = 0.3)

points <-
labelled_points %>%
select(-cluster)
kclust <- kmeans(points, centers = 3)
kclust
## K-means clustering with 3 clusters of sizes 148, 51, 101
##
## Cluster means:
## x1 x2
## 1 0.08853475 1.045461
## 2 -3.14292460 -2.000043
## 3 5.00401249 -1.045811
##
## Clustering vector:
## [1] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [38] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [75] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1
## [112] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [149] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [186] 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [223] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2
## [260] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [297] 2 2 2 2
##
## Within cluster sum of squares by cluster:
## [1] 298.9415 108.8112 243.2092
## (between_SS / total_SS = 82.5 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
augment(kclust, points)
## # A tibble: 300 × 3
## x1 x2 .cluster
## <dbl> <dbl> <fct>
## 1 6.91 -2.74 3
## 2 6.14 -2.45 3
## 3 4.24 -0.946 3
## 4 3.54 0.287 3
## 5 3.91 0.408 3
## 6 5.30 -1.58 3
## 7 5.01 -1.77 3
## 8 6.16 -1.68 3
## 9 7.13 -2.17 3
## 10 5.24 -2.42 3
## # ℹ 290 more rows
tidy(kclust)
## # A tibble: 3 × 5
## x1 x2 size withinss cluster
## <dbl> <dbl> <int> <dbl> <fct>
## 1 0.0885 1.05 148 299. 1
## 2 -3.14 -2.00 51 109. 2
## 3 5.00 -1.05 101 243. 3
data_prep <- penguins |>
select(bill_length_mm, bill_depth_mm) |>
remove_missing()
## Warning: Removed 2 rows containing missing values or values outside the scale
## range.
data_prep |>
kmeans(centers = 3) |>
augment(data_prep) |>
ggplot() +
aes(bill_length_mm, bill_depth_mm) +
geom_point() +
aes(color = .cluster)

compute_panel_kmeans <- function(data, scales, centers = 3, seed = 1234){
set.seed(seed)
data_prep <- data |>
mutate(row_id = row_number()) |>
select(x, y) |>
remove_missing()
# unmodeled <- data |> # do this more carefully
# select(-x, -y) |>
# slice(data_prep$row_id)
data_prep |>
kmeans(centers = centers) |>
augment(data_prep) #|>
# bind_cols(unmodeled)
}
penguins |>
select(x = bill_length_mm, y = bill_depth_mm) |>
compute_panel_kmeans()
## Warning: Removed 2 rows containing missing values or values outside the scale
## range.
## # A tibble: 342 × 3
## x y .cluster
## <dbl> <dbl> <fct>
## 1 39.1 18.7 3
## 2 39.5 17.4 3
## 3 40.3 18 3
## 4 36.7 19.3 3
## 5 39.3 20.6 3
## 6 38.9 17.8 3
## 7 39.2 19.6 3
## 8 34.1 18.1 3
## 9 42 20.2 3
## 10 37.8 17.1 3
## # ℹ 332 more rows
compute_panel_kmeans_tidy <- function(data, scales, centers = 3, seed = 1234){
set.seed(seed)
data_prep <- data |>
mutate(row_id = row_number()) |>
select(x, y) |>
remove_missing()
# unmodeled <- data |> # do this more carefully
# select(-x, -y) |>
# slice(data_prep$row_id)
data_prep |>
kmeans(centers = centers) |>
tidy() |>
rename(.size = size) #|>
# bind_cols(unmodeled)
}
compute_panel_kmeans_lengths <- function(data, scales, centers = 3, seed = 1234){
set.seed(seed)
data_prep <- data |>
mutate(row_id = row_number()) |>
select(x, y) |>
remove_missing()
# unmodeled <- data |> # do this more carefully
# select(-x, -y) |>
# slice(data_prep$row_id)
kmeaned <- data_prep |>
kmeans(centers = centers)
points <- kmeaned |>
augment(data_prep) #|>
# bind_cols(unmodeled)
kmeaned |>
tidy() |>
rename(.size = size) |>
rename(.cluster = cluster) |>
rename(xend = x, yend = y) |>
right_join(points)
}
StatKmeansSegments <- ggproto("StatKmeansSegments", Stat,
compute_panel = compute_panel_kmeans_lengths,
default_aes = aes(color = after_stat(.cluster)))
geom_kmeans_lengths <- make_constructor(GeomSegment, stat = StatKmeansSegments)
penguins |>
remove_missing() |>
select(x = bill_length_mm, y = bill_depth_mm) |>
compute_panel_kmeans_tidy()
## Warning: Removed 11 rows containing missing values or values outside the scale
## range.
## # A tibble: 3 × 5
## x y .size withinss cluster
## <dbl> <dbl> <int> <dbl> <fct>
## 1 50.9 17.3 85 618. 1
## 2 45.5 15.7 112 742. 2
## 3 38.4 18.3 136 905. 3
penguins_clean <- penguins |>
remove_missing()
## Warning: Removed 11 rows containing missing values or values outside the scale
## range.
penguins_clean |>
ggplot() +
aes(x = bill_length_mm,
y = bill_depth_mm) +
geom_kmeans_lengths() +
geom_point()
## Joining with `by = join_by(.cluster)`

StatKmeans <- ggproto("StatKmeans", Stat,
compute_panel = compute_panel_kmeans,
default_aes = aes(color = after_stat(.cluster)))
geom_kmeans <- make_constructor(GeomPoint, stat = StatKmeans)
StatKmeansCenters <- ggproto("StatKmeansCenters", Stat,
compute_panel = compute_panel_kmeans_tidy)
GeomPointX <- ggproto("GeomPointX", GeomPoint,
default_aes = GeomPoint$default_aes |>
modifyList(aes(shape = 13, size = 7),
keep.null = T)
)
geom_kmeans_center <- make_constructor(GeomPointX, stat = StatKmeansCenters)
penguins_clean <- penguins |>
remove_missing()
## Warning: Removed 11 rows containing missing values or values outside the scale
## range.
penguins_clean |>
ggplot() +
aes(bill_length_mm, bill_depth_mm) +
geom_point()

last_plot() +
geom_kmeans()

last_plot() +
geom_kmeans_center()

penguins_clean |>
ggplot() +
aes(bill_length_mm, bill_depth_mm) +
geom_point() +
aes(color = species)

penguins_clean |>
ggplot() +
aes(bill_length_mm, bill_depth_mm) +
geom_kmeans(centers = 2) +
geom_kmeans_center(centers = 2)

penguins_clean |>
ggplot() +
aes(bill_length_mm, bill_depth_mm) +
geom_kmeans(centers = 3) +
geom_kmeans_center(centers = 3)

library(tictoc)
tic()
penguins_clean |>
ggplot() +
aes(bill_length_mm, bill_depth_mm) +
geom_kmeans(centers = 4) +
geom_kmeans_center(centers = 4)

toc()
## 0.091 sec elapsed
penguins_clean |>
ggplot() +
aes(bill_length_mm, bill_depth_mm) +
geom_kmeans(centers = 5) +
geom_kmeans_center(centers = 5)

PAIR_code_umap_mammoth_url <- "https://raw.githubusercontent.com/PAIR-code/understanding-umap/refs/heads/master/raw_data/mammoth_3d.json"
library(ggdims)
library(tictoc)
mammoth_df <- PAIR_code_umap_mammoth_url |>
jsonlite::fromJSON() |>
as.data.frame()
tic()
mammoth_df |>
ggplot() +
aes(V1, V2) +
geom_kmeans(centers = 4) +
geom_kmeans_center(centers = 4)

toc()
## 0.171 sec elapsed
mammoth_df |>
ggplot() +
aes(V1, V2) +
geom_kmeans(centers = 4) +
geom_kmeans_lengths(centers = 4,
linewidth = .2, alpha = .2) +
geom_kmeans_center(centers = 4)
## Joining with `by = join_by(.cluster)`

layer_three <- layer_data(i = 3)
## Joining with `by = join_by(.cluster)`
last_plot() +
ggforce::geom_voronoi_segment(data = layer_three, # geom_kmeans_partitian
aes(x = x,
y = y)) +
ggforce::geom_voronoi_tile(data = layer_three |> rename(.cluster = cluster),
aes(x = x,
y = y,
fill = .cluster))
## Joining with `by = join_by(.cluster)`
## Warning: Computation failed in `stat_voronoi_tile()`.
## Caused by error in `deldir::deldir()`:
## ! The x-range of the points is zero, whence a rectangular window
## cannot be inferred from the data. You must specify the rectangular
## window explicitly.

library(ggplot2)
library(ggforce)
# Set a max radius
ggplot(iris,
aes(Sepal.Length, Sepal.Width, group = -1L)) +
geom_voronoi_tile(aes(fill = Species),
colour = 'black', max.radius = 0.25)
## Warning: `stat_voronoi_tile()` is dropping duplicated points

library(tidyverse)
ind_complete <- complete.cases(penguins)
penguins |>
select(bill_len, bill_dep) |>
filter(ind_complete) |>
kmeans(centers = 4) |>
broom::tidy()
penguins |>
select(bill_len, bill_dep) |>
filter(ind_complete) |>
kmeans(centers = 4) |>
broom::augment(
penguins |>
filter(ind_complete)
)