ordr PCA statistical story telling
library(tidyverse)
library(ordr)
(penguins_pca <- ordinate(palmerpenguins::penguins %>% select(c(1, 3:6)) %>%
remove_missing(),
cols = 2:5, model = ~ prcomp(., scale. = TRUE)))
## Warning: Removed 2 rows containing missing values or values outside the scale
## range.
## # A tbl_ord of class 'prcomp': (342 x 4) x (4 x 4)'
## # 4 coordinates: PC1, PC2, ..., PC4
## #
## # Rows (principal): [ 342 x 4 | 1 ]
## PC1 PC2 PC3 ... | species
## | <fct>
## 1 -1.84 -0.0476 0.232 | 1 Adelie
## 2 -1.30 0.428 0.0295 ... | 2 Adelie
## 3 -1.37 0.154 -0.198 | 3 Adelie
## 4 -1.88 0.00205 0.618 | 4 Adelie
## 5 -1.91 -0.828 0.686 | 5 Adelie
## # ℹ 337 more rows
## #
## # Columns (standard): [ 4 x 4 | 3 ]
## PC1 PC2 PC3 ... | name center scale
## | <chr> <dbl> <dbl>
## 1 0.455 -0.597 -0.644 | 1 bill_length_mm 43.9 5.46
## 2 -0.400 -0.798 0.418 ... | 2 bill_depth_mm 17.2 1.97
## 3 0.576 -0.00228 0.232 | 3 flipper_length_mm 201. 14.1
## 4 0.548 -0.0844 0.597 | 4 body_mass_g 4202. 802.
ggbiplot(penguins_pca, sec.axes = "cols", scale.factor = 2) +
aes(color = species, shape = species) +
geom_rows_point() +
stat_rows_ellipse(alpha = .5, level = .99) +
geom_cols_vector() +
geom_cols_text_radiate(aes(label = name)) +
expand_limits(y = c(-3.5, NA)) +
labs(title = "PCA of penguins") +
labs(subtitle = "99% confidence ellipses; variables use top & right axes")
ggbiplot(penguins_pca, axis.type = "predictive", axis.percents = FALSE) +
aes(color = species, shape = species) +
geom_rows_point() +
stat_rows_center(size = 5, alpha = .5, fun.data = mean_se) +
aes(label = name, center = center, scale = scale) +
geom_cols_axis() +
labs(subtitle = "Predictive biplot of Anderson's iris measurements")
labs(caption = "Project a marker onto an axis to approximate its measurement") +
theme_biplot()
## NULL
a ggplot() + stat approach
compute_panel_pca <- function(data, scales, what = "rows"){
data %>%
select(outcome, a, b, c) %>% # as proof of concept 3 predictors a b c
remove_missing() %>%
ordinate(cols = 2:4, model = ~ prcomp(., scale. = TRUE)) ->
pca
if(what == "rows"){
pca$x %>%
as.data.frame() %>%
bind_cols(data["outcome"]) ->
out
}
out
}
penguins_pca$rotation %>% data.frame() %>%
rownames_to_column() %>%
dplyr::rename(var_name = rowname)
## var_name PC1 PC2 PC3 PC4
## 1 bill_length_mm 0.4552503 -0.597031143 -0.6443012 0.1455231
## 2 bill_depth_mm -0.4003347 -0.797766572 0.4184272 -0.1679860
## 3 flipper_length_mm 0.5760133 -0.002282201 0.2320840 -0.7837987
## 4 body_mass_g 0.5483502 -0.084362920 0.5966001 0.5798821
palmerpenguins::penguins %>%
remove_missing() %>%
rename(outcome = species, a = bill_length_mm, b = bill_depth_mm, c = flipper_length_mm) %>%
compute_panel_pca() %>%
head()
## Warning: Removed 11 rows containing missing values or values outside the scale
## range.
## PC1 PC2 PC3 outcome
## 1 -1.8312137 0.05018344 0.2860721 Adelie
## 2 -1.2184054 0.48737657 0.3278652 Adelie
## 3 -0.8697485 0.13788110 -0.2039821 Adelie
## 4 -1.6639864 0.07602012 -0.7088374 Adelie
## 5 -1.8801939 -0.72572749 -0.5742510 Adelie
## 6 -1.6179709 0.41909356 0.4553261 Adelie
StatPcarows <- ggproto(`_class` = "StatPcarows",
`_inherit` = Stat,
required_aes = c("outcome", "a", "b", "c"),
default_aes = aes(color = after_stat(outcome),
x = after_stat(PC1),
y = after_stat(PC2)),
compute_panel = compute_panel_pca
)
ggplot(palmerpenguins::penguins) +
aes(outcome = species,
a = bill_length_mm,
b = bill_depth_mm,
c = flipper_length_mm) +
layer(geom = "point",
position = "identity",
stat = StatPcarows)
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_pcarows()`).
stat_pca <- function(
mapping = NULL,
geom = "point",
data = NULL,
position = "identity",
required_aes = NULL,
default_aes = NULL,
dropped_aes = NULL,
...,
show.legend = NA,
inherit.aes = TRUE,
computation_scope = "group"
) {
ggplot2::layer(
mapping = mapping,
data = data,
stat = StatPcarows,
geom = geom,
position = position,
show.legend = show.legend,
inherit.aes = inherit.aes,
check.aes = FALSE,
check.param = FALSE,
params = list(
na.rm = FALSE,
...
)
)
}
geom_point_pca <- stat_pca
ggplot(palmerpenguins::penguins) +
aes(outcome = species,
a = bill_length_mm,
b = bill_depth_mm,
c = flipper_length_mm) +
geom_point_pca() +
stat_pca()
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_pcarows()`).
## Removed 2 rows containing non-finite outside the scale range
## (`stat_pcarows()`).
statexpress + w/ ggplot() as plot entry point…
library(tidyverse)
compute_panel_pca_rows <- function(data){
data %>% remove_missing() ->
data
data %>%
select(outcome, a, b, c) %>% # as proof of concept 3 predictors a b c
ordr::ordinate(cols = 2:4, model = ~ prcomp(., scale. = TRUE)) %>%
.$x %>%
as.data.frame() %>%
bind_cols(data["outcome"])
}
palmerpenguins::penguins %>%
rename(outcome = species, a = bill_length_mm, b = bill_depth_mm, c = flipper_length_mm) %>%
compute_panel_pca_rows() %>%
head()
## Warning: Removed 11 rows containing missing values or values outside the scale
## range.
## PC1 PC2 PC3 outcome
## 1 -1.8312137 0.05018344 0.2860721 Adelie
## 2 -1.2184054 0.48737657 0.3278652 Adelie
## 3 -0.8697485 0.13788110 -0.2039821 Adelie
## 4 -1.6639864 0.07602012 -0.7088374 Adelie
## 5 -1.8801939 -0.72572749 -0.5742510 Adelie
## 6 -1.6179709 0.41909356 0.4553261 Adelie
compute_panel_pca_col <- function(data){
data %>% remove_missing() ->
data
data %>%
select(outcome, a, b, c) %>% # as proof of concept 3 predictors a b c
ordr::ordinate(cols = 2:4, model = ~ prcomp(., scale. = TRUE)) %>%
.$rotation %>%
as.data.frame() %>%
rownames_to_column()
}
palmerpenguins::penguins %>%
rename(outcome = species, a = bill_length_mm, b = bill_depth_mm, c = flipper_length_mm) %>%
compute_panel_pca_col()
## Warning: Removed 11 rows containing missing values or values outside the scale
## range.
## rowname PC1 PC2 PC3
## 1 a 0.5513631 -0.65493615 0.5167759
## 2 b -0.5107043 -0.75478128 -0.4116872
## 3 c 0.6596816 -0.03693055 -0.7506373
geom_pca_point <- function(...){
statexpress::stat_panel(compute_panel_pca_rows, "point",
default_aes = aes(x = after_stat(PC1),
y = after_stat(PC2),
color = after_stat(outcome)),
...)}
stat_pca_cols <- function(geom = "text", ...){
statexpress::stat_panel(fun = compute_panel_pca_col,
geom = geom,
default_aes = aes(x = after_stat(PC1),
y = after_stat(PC2),
xend = after_stat(0),
yend = after_stat(0),
label = paste("Variable", after_stat(rowname))),
...)}
palmerpenguins::penguins %>%
ggplot() +
aes(outcome = species, a = bill_length_mm, b = bill_depth_mm, c = flipper_length_mm) +
geom_pca_point() +
stat_pca_cols(hjust = "outward") +
stat_pca_cols(geom = "segment",
arrow = arrow(ends = "first"))
## Warning: Removed 2 rows containing missing values or values outside the scale
## range.
## Warning: Removed 2 rows containing missing values or values outside the scale range.
## Removed 2 rows containing missing values or values outside the scale range.
knitr::knit_exit()