Intro Thoughts

Status Quo

https://evamaerey.github.io/statistics/covariance_correlation.html#56

ggxmean non-express…

Experiment

library(tidyverse)

# express Stat setup.

qstat <- function(compute_group, ...){
  ggproto("StatTemp", Stat, compute_group = compute_group, ...)
  }

qgeom_mod_default_aes <- function(`_inherit` = GeomPoint, default_aes = GeomPoint$default_aes, ...){
  
  ggproto("GeomTemp", `_inherit` = `_inherit`, default_aes = default_aes, ...)

}


qlayer <- function (mapping = NULL, data = NULL, geom = "point", stat = "identity", position = "identity", 
    ..., na.rm = FALSE, show.legend = NA, inherit.aes = TRUE) 
{
    layer(data = data, mapping = mapping, stat = stat, geom = geom, 
        position = position, show.legend = show.legend, inherit.aes = inherit.aes, 
        params = rlang::list2(na.rm = na.rm, ...))
}
# 

# mysetseed(199402)
# create_x_y(relationship = .5) %>%
# data_create_scatterplot() %>%
# plot_draw_mean_x() %>%
# plot_draw_mean_y() %>%
# plot_draw_differences_x() %>%
# plot_draw_differences_y() %>%
# plot_multiply_differences() %>%
# plot_take_average_rectangle()

# Assume qlayer() and qstat() exists:

# Layer 0 (actually don't use in var/sd/cov/corr/R^2 but demos qstat simplest usage)
compute_group_xymean <- function(data, scales){data |> mutate(x = mean(x), y = mean(y))}

geom_xy_means <- function(...){geom_point(stat = qstat(compute_group_xymean), ...)}


# Layer 1
compute_group_xmean <- function(data, scales){data |> summarize(xintercept = mean(x))}

geom_xmean <- function(...){
  
  QStat <- qstat(compute_group_xmean, dropped_aes = c("x", "y"))
  
  qlayer(geom = GeomVline, stat = QStat, ...)
  
  
  }

# Layer 2
compute_group_ymean <- function(data, scales){data |> summarize(yintercept = mean(y))}

geom_ymean <- function(...){
  
  QStat <- qstat(compute_group_ymean, dropped_aes = c("x", "y"))
  
  qlayer(geom = GeomHline, stat = QStat, ...)
  
  }

# Layer 3
compute_group_xmeandiff <- function(data, scales){
  
  data |> mutate(xend = mean(x), 
                 yend = y, 
                 xdiff = x - mean(x), 
                 sign = factor(sign(xdiff)))
  
  }

geom_xmeandiff <- function(...){
  
  QStat <- qstat(compute_group_xmeandiff, 
              default_aes = aes(color = after_stat(sign)))
  
  geom_segment(stat = QStat, ...)
  
  }

# Layer 4
compute_group_ymeandiff <- function(data, scales){
  
  data |> mutate(yend = mean(y), 
                 xend = x, 
                 ydiff = y - mean(y), 
                 sign = factor(sign(ydiff)))
  }

geom_ymeandiff <- function(...){
  
  QStat <- qstat(compute_group_ymeandiff, 
              default_aes = aes(color = after_stat(sign)))
  
  geom_segment(stat = QStat, ...)
  
  }


# Layer 5
compute_group_xymeandiffs <- function(data, scales){data |> mutate(xmin = mean(x), ymin = mean(y), xmax = x, ymax = y, area = (xmax-xmin)*(ymax-ymin), sign = factor(sign(area)))}

geom_xydiffs <- function(alpha = .2, ...){geom_rect(
  stat = qstat(compute_group_xymeandiffs, 
               default_aes = aes(fill = after_stat(sign))), alpha = alpha, ...)}

# Layer 6 & 7
compute_covariance <- function(data, scales){
   
  xmean = mean(data$x)
  ymean = mean(data$y)
  xsd = sd(data$x)
  
  data |> 
    mutate(xdiff = x - mean(x),                                      ydiff = y - mean(y),   
           area = xdiff * ydiff) %>% 
    summarize(mean_area = sum(area)/(n()-1)) %>% 
    pull(mean_area) ->
    mean_area
  
  data.frame(
             xmin = xmean, ymin = ymean,
             xmax = xmean + xsd, ymax = ymean + mean_area/xsd,
             covariance = mean_area,
             sign = factor(sign(mean_area))) |>
    mutate(x = (xmin + xmax)/2,
          y = (ymin + ymax)/2)
  
}
  
  
geom_covariance <- function(...){
  
  QStat <- qstat(compute_covariance,
                 default_aes = aes(fill = after_stat(sign)))
  
  GeomRect2 <- qgeom_mod_default_aes(GeomRect, aes(color = "black", 
                              fill = "grey35",
                              linewidth = .5,
                              linetype = 1, 
                              alpha = NA))
  
  qlayer(stat = QStat, geom = GeomRect2, ...)
  
}  



geom_mean_xdiffXydiff <- geom_covariance


geom_covariance_label <- function(...){
  
  QStat <- qstat(compute_covariance,
                 default_aes = 
                   aes(label = round(after_stat(covariance), 2))
                 )
                                   
  
  geom_label(stat = QStat,
             show.legend = F,
                         ...)
  
}
  

# SDs
compute_sdx <- function(data, scales){
  
  ymean <- mean(data$y)
  xmean <- mean(data$x)
  xsd <- sd(data$x)
  
  data.frame(xintercept = xmean + xsd*c(-2,-1,1,2)) %>% 
    mutate(xstart = xmean,
            ystart = ymean,
            xend = xstart + xsd,
            yend = ystart)

    }

geom_x_sd <- function(...){
  
  QS <- qstat(compute_sdx, dropped_aes = c("x", "y"))
  
  # GeomVline$default_aes
  GeomVline2 <- ggproto("GeomVline2", GeomVline, 
                        default_aes = aes(colour = "black",
                                          linewidth = .5,
                                          linetype = "dotted",
                                          alpha = NA))
  
  qlayer(geom = GeomVline2, stat = QS, ...)
  
}

# SDs
compute_sdy <- function(data, scales){
  
  ymean <- mean(data$y)
  ysd <- sd(data$y)
  
  data.frame(yintercept = ymean + ysd*c(-2,-1,1,2))

    }


geom_y_sd <- function(...){
  
  QS <- qstat(compute_sdy, 
              dropped_aes = c("x", "y"))
  
  # GeomHline$default_aes
  GeomHline2 <- ggproto("GeomHline2", GeomHline, 
                        default_aes = aes(colour = "black",
                                          linewidth = .5,
                                          linetype = "dotted",
                                          alpha = NA))
  
  qlayer(geom = GeomHline2, stat = QS, ...)
  
}




cars %>% 
  ggplot() + 
  aes(x = speed, y = dist) +
  geom_point() + # x, y  0
  geom_xmean() + # x-bar 1
  geom_ymean() + # y-bar 2
  geom_xmeandiff() + # 3
  geom_ymeandiff() + # 4
  geom_xydiffs() + # 5
  geom_covariance() + #6
  geom_covariance_label() #7

cov(cars$speed, cars$dist)
## [1] 109.9469
# variance speed
last_plot() + 
  aes(x = speed, y = speed) + 
  coord_equal() + 
  geom_x_sd(linetype = "dotted") + 
  geom_y_sd()

var(cars$speed)
## [1] 27.95918
sd(cars$speed)
## [1] 5.287644
# variance dist 
last_plot() + 
  aes(x = dist, y = dist)

# returning to dist/speed
last_plot() +
  coord_cartesian() +
  aes(y = dist, x = speed)
## Coordinate system already present. Adding new coordinate system, which will
## replace the existing one.

last_plot() + 
  aes(y = (dist-mean(dist))/sd(dist), 
      x = (speed-mean(speed))/sd(speed)) + 
  coord_equal()
## Coordinate system already present. Adding new coordinate system, which will
## replace the existing one.

cor(cars$speed, cars$dist)
## [1] 0.8068949
# last_plot() + 
#   geom_smooth(method = lm)
# 
# 
# lm(cars$dist ~ cars$speed) %>% 
#   summary()
# 
# cor(cars$speed, cars$dist)^2 # variance explained
# 


post <- function(data, scales){
  
  data |> 
    mutate(xend = x) |>
    mutate(yend = 0)
}

Closing remarks, Other Relevant Work, Caveats