Intro Thoughts

Status Quo

library(tidyverse)

ordr PCA statistical story telling

library(tidyverse)
library(ordr)

(penguins_pca <- ordinate(palmerpenguins::penguins %>% select(c(1, 3:6)) %>%
                            remove_missing(), 
                      cols = 2:5, model = ~ prcomp(., scale. = TRUE)))
## Warning: Removed 2 rows containing missing values or values outside the scale
## range.
## # A tbl_ord of class 'prcomp': (342 x 4) x (4 x 4)'
## # 4 coordinates: PC1, PC2, ..., PC4
## # 
## # Rows (principal): [ 342 x 4 | 1 ]
##     PC1      PC2     PC3 ... |   species
##                              |   <fct>  
## 1 -1.84 -0.0476   0.232      | 1 Adelie 
## 2 -1.30  0.428    0.0295 ... | 2 Adelie 
## 3 -1.37  0.154   -0.198      | 3 Adelie 
## 4 -1.88  0.00205  0.618      | 4 Adelie 
## 5 -1.91 -0.828    0.686      | 5 Adelie 
## # ℹ 337 more rows
## # 
## # Columns (standard): [ 4 x 4 | 3 ]
##      PC1      PC2    PC3 ... |   name              center  scale
##                              |   <chr>              <dbl>  <dbl>
## 1  0.455 -0.597   -0.644     | 1 bill_length_mm      43.9   5.46
## 2 -0.400 -0.798    0.418 ... | 2 bill_depth_mm       17.2   1.97
## 3  0.576 -0.00228  0.232     | 3 flipper_length_mm  201.   14.1 
## 4  0.548 -0.0844   0.597     | 4 body_mass_g       4202.  802.
ggbiplot(penguins_pca, sec.axes = "cols", scale.factor = 2) +
  aes(color = species, shape = species) +
  geom_rows_point() +
  stat_rows_ellipse(alpha = .5, level = .99) +
  geom_cols_vector() +
  geom_cols_text_radiate(aes(label = name)) +
  expand_limits(y = c(-3.5, NA)) +
  labs(title = "PCA of penguins") +
  labs(subtitle = "99% confidence ellipses; variables use top & right axes")

ggbiplot(penguins_pca, axis.type = "predictive", axis.percents = FALSE) +
  aes(color = species, shape = species) +
  geom_rows_point() +
  stat_rows_center(size = 5, alpha = .5, fun.data = mean_se) +
  aes(label = name, center = center, scale = scale) +
  geom_cols_axis() +
  labs(subtitle = "Predictive biplot of Anderson's iris measurements")

  labs(caption = "Project a marker onto an axis to approximate its measurement") +
  theme_biplot()
## NULL

a ggplot() + stat approach

compute_panel_pca <- function(data, scales, what = "rows"){
  
  data %>% 
    select(outcome, a, b, c) %>% # as proof of concept 3 predictors a b c
    remove_missing() %>% 
    ordinate(cols = 2:4, model = ~ prcomp(., scale. = TRUE)) ->
  pca
  
  if(what == "rows"){
  
  pca$x %>% 
    as.data.frame() %>% 
    bind_cols(data["outcome"]) ->
  out
  
  }
  
  
  out
    
}

penguins_pca$rotation %>% data.frame() %>% 
  rownames_to_column() %>% 
      dplyr::rename(var_name = rowname)
##            var_name        PC1          PC2        PC3        PC4
## 1    bill_length_mm  0.4552503 -0.597031143 -0.6443012  0.1455231
## 2     bill_depth_mm -0.4003347 -0.797766572  0.4184272 -0.1679860
## 3 flipper_length_mm  0.5760133 -0.002282201  0.2320840 -0.7837987
## 4       body_mass_g  0.5483502 -0.084362920  0.5966001  0.5798821
palmerpenguins::penguins %>%
  remove_missing() %>%
  rename(outcome = species, a = bill_length_mm, b = bill_depth_mm, c = flipper_length_mm) %>%
  compute_panel_pca() %>%
  head()
## Warning: Removed 11 rows containing missing values or values outside the scale
## range.
##          PC1         PC2        PC3 outcome
## 1 -1.8312137  0.05018344  0.2860721  Adelie
## 2 -1.2184054  0.48737657  0.3278652  Adelie
## 3 -0.8697485  0.13788110 -0.2039821  Adelie
## 4 -1.6639864  0.07602012 -0.7088374  Adelie
## 5 -1.8801939 -0.72572749 -0.5742510  Adelie
## 6 -1.6179709  0.41909356  0.4553261  Adelie
StatPcarows <- ggproto(`_class` = "StatPcarows", 
                       `_inherit` = Stat,
                       required_aes = c("outcome", "a", "b", "c"),
                       default_aes = aes(color = after_stat(outcome),
                                         x = after_stat(PC1),
                                         y = after_stat(PC2)),
                       compute_panel = compute_panel_pca
                       )


ggplot(palmerpenguins::penguins)  + 
  aes(outcome = species, 
      a = bill_length_mm,
      b = bill_depth_mm,
      c = flipper_length_mm) + 
  layer(geom = "point", 
        position = "identity",
        stat = StatPcarows)
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_pcarows()`).

stat_pca <- function(
                       mapping = NULL, 
                       geom = "point", 
                       data = NULL,
                       position = "identity",
                       required_aes = NULL, 
                       default_aes = NULL, 
                       dropped_aes = NULL,
                      ...,
                      show.legend = NA,
                      inherit.aes = TRUE,
                      computation_scope = "group"
                  ) {
    ggplot2::layer(
       mapping = mapping,
       data = data,
       stat = StatPcarows,
       geom = geom,
       position = position,
       show.legend = show.legend,
       inherit.aes = inherit.aes,
       check.aes = FALSE,
       check.param = FALSE,
       params = list(
          na.rm = FALSE,
          ...
       )
    )
}
                       
geom_point_pca <- stat_pca

ggplot(palmerpenguins::penguins)  + 
  aes(outcome = species, 
      a = bill_length_mm,
      b = bill_depth_mm,
      c = flipper_length_mm) + 
  geom_point_pca() + 
  stat_pca()
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_pcarows()`).
## Removed 2 rows containing non-finite outside the scale range
## (`stat_pcarows()`).

statexpress + w/ ggplot() as plot entry point…

library(tidyverse)

compute_panel_pca_rows <- function(data){
  
  data %>%  remove_missing() ->
    data

  data %>% 
    select(outcome, a, b, c) %>% # as proof of concept 3 predictors a b c
    ordr::ordinate(cols = 2:4, model = ~ prcomp(., scale. = TRUE)) %>% 
  .$x %>% 
    as.data.frame() %>% 
    bind_cols(data["outcome"])
    
}

palmerpenguins::penguins %>% 
  rename(outcome = species, a = bill_length_mm, b = bill_depth_mm, c = flipper_length_mm) %>%
  compute_panel_pca_rows() %>% 
  head()
## Warning: Removed 11 rows containing missing values or values outside the scale
## range.
##          PC1         PC2        PC3 outcome
## 1 -1.8312137  0.05018344  0.2860721  Adelie
## 2 -1.2184054  0.48737657  0.3278652  Adelie
## 3 -0.8697485  0.13788110 -0.2039821  Adelie
## 4 -1.6639864  0.07602012 -0.7088374  Adelie
## 5 -1.8801939 -0.72572749 -0.5742510  Adelie
## 6 -1.6179709  0.41909356  0.4553261  Adelie
compute_panel_pca_col <- function(data){
  
  data %>%  remove_missing() ->
    data

  data %>% 
    select(outcome, a, b, c) %>% # as proof of concept 3 predictors a b c
    ordr::ordinate(cols = 2:4, model = ~ prcomp(., scale. = TRUE)) %>% 
    .$rotation %>% 
    as.data.frame() %>% 
    rownames_to_column()
    
}

palmerpenguins::penguins %>% 
  rename(outcome = species, a = bill_length_mm, b = bill_depth_mm, c = flipper_length_mm) %>%
  compute_panel_pca_col()
## Warning: Removed 11 rows containing missing values or values outside the scale
## range.
##   rowname        PC1         PC2        PC3
## 1       a  0.5513631 -0.65493615  0.5167759
## 2       b -0.5107043 -0.75478128 -0.4116872
## 3       c  0.6596816 -0.03693055 -0.7506373
geom_pca_point <- function(...){
  statexpress::stat_panel(compute_panel_pca_rows, "point", 
                          default_aes = aes(x = after_stat(PC1), 
                                            y = after_stat(PC2), 
                                            color = after_stat(outcome)),
                          ...)}


stat_pca_cols <- function(geom = "text", ...){
  statexpress::stat_panel(fun = compute_panel_pca_col, 
                          geom = geom, 
                          default_aes = aes(x = after_stat(PC1), 
                                            y = after_stat(PC2), 
                                            xend = after_stat(0),
                                            yend = after_stat(0), 
                                            label = paste("Variable", after_stat(rowname))), 
                          ...)}


palmerpenguins::penguins %>% 
  ggplot() + 
  aes(outcome = species, a = bill_length_mm, b = bill_depth_mm, c = flipper_length_mm) +
  geom_pca_point() + 
  stat_pca_cols(hjust = "outward") + 
  stat_pca_cols(geom = "segment", 
                arrow = arrow(ends = "first"))
## Warning: Removed 2 rows containing missing values or values outside the scale
## range.
## Warning: Removed 2 rows containing missing values or values outside the scale range.
## Removed 2 rows containing missing values or values outside the scale range.

knitr::knit_exit()