Status quo options for doing that in ggplot2
“image a line that drops down from the observation to the model line”
library(tidyverse, warn.conflicts = F)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.0 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.1 ✔ tibble 3.2.0
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
mtcars %>%
ggplot() +
aes(wt, mpg) +
geom_point() +
geom_smooth()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
First a bit of under-the-hood thinking about geom_smooth/stat_smooth.
mtcars %>%
ggplot() +
aes(wt, mpg) +
geom_smooth(n = 12) +
stat_smooth(geom = "point",
color = "blue",
n = 12)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
Also, you can specify xseq… Almost surely new to you (and probably more interesting to stats instructors): predicting at observed values of x
xseq is not advertised, but possibly of interest.. https://ggplot2.tidyverse.org/reference/geom_smooth.html
# fit where the support is in the data...
mtcars %>%
ggplot() +
aes(wt, mpg) +
geom_point() +
geom_smooth() +
stat_smooth(geom = "point",
color = "blue",
xseq = mtcars$wt)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
mtcars %>%
ggplot() +
aes(wt, mpg) +
geom_smooth_fit()
last_plot() +
geom_smooth_error()
Key take away: this function allows you to set values of x with the xseq argument. Although the default is to create an evenly spaced sequence.
ggplot2::StatSmooth$compute_group
## <ggproto method>
## <Wrapper function>
## function (...)
## compute_group(...)
##
## <Inner function (f)>
## function (data, scales, method = NULL, formula = NULL, se = TRUE,
## n = 80, span = 0.75, fullrange = FALSE, xseq = NULL, level = 0.95,
## method.args = list(), na.rm = FALSE, flipped_aes = NA)
## {
## data <- flip_data(data, flipped_aes)
## if (length(unique0(data$x)) < 2) {
## return(data_frame0())
## }
## if (is.null(data$weight))
## data$weight <- 1
## if (is.null(xseq)) {
## if (is.integer(data$x)) {
## if (fullrange) {
## xseq <- scales$x$dimension()
## }
## else {
## xseq <- sort(unique0(data$x))
## }
## }
## else {
## if (fullrange) {
## range <- scales$x$dimension()
## }
## else {
## range <- range(data$x, na.rm = TRUE)
## }
## xseq <- seq(range[1], range[2], length.out = n)
## }
## }
## if (identical(method, "loess")) {
## method.args$span <- span
## }
## if (is.character(method)) {
## if (identical(method, "gam")) {
## method <- mgcv::gam
## }
## else {
## method <- match.fun(method)
## }
## }
## if (identical(method, mgcv::gam) && is.null(method.args$method)) {
## method.args$method <- "REML"
## }
## model <- inject(method(formula, data = data, weights = weight,
## !!!method.args))
## prediction <- predictdf(model, xseq, se, level)
## prediction$flipped_aes <- flipped_aes
## flip_data(prediction, flipped_aes)
## }
library(dplyr)
mtcars %>%
rename(x = wt, y = mpg, cat = am) %>%
StatSmooth$compute_group(method = lm, formula = y ~ x, n = 7)
## x y ymin ymax se flipped_aes
## 1 1.513000 29.198941 26.963760 31.43412 1.0944578 NA
## 2 2.164833 25.715236 24.086350 27.34412 0.7975850 NA
## 3 2.816667 22.231531 21.040552 23.42251 0.5831635 NA
## 4 3.468500 18.747827 17.611376 19.88428 0.5564635 NA
## 5 4.120333 15.264122 13.756629 16.77161 0.7381448 NA
## 6 4.772167 11.780417 9.692002 13.86883 1.0225936 NA
## 7 5.424000 8.296712 5.547468 11.04596 1.3461693 NA
Here we’ll piggy back on StatSmooth$compute_group, to create a function, compute_group_smooth_fit. We ask that function to compute predictions at the values of x observed in our data set. We also preserve the values of y (as yend) so that we can draw in the residual error.
xend and yend are computed to draw the segments visualizing the error.
compute_group_smooth_fit <- function(data, scales, method = NULL, formula = NULL,
se = TRUE, n = 80, span = 0.75, fullrange = FALSE,
level = 0.95, method.args = list(),
na.rm = FALSE, flipped_aes = NA){
StatSmooth$compute_group(data = data, scales = scales,
method = method, formula = formula,
se = FALSE, n= n, span = span, fullrange = fullrange,
xseq = data$x,
level = .95, method.args = method.args,
na.rm = na.rm, flipped_aes = flipped_aes) %>%
mutate(xend = data$x,
yend = data$y
)
}
We’ll also create compute_group_smooth_sq_error, further piggybacking, this time on the function we just build. This creates the ymin, ymax, xmin and xmax columns needed to show the squared error. Initially, I’d included this computation above, but the plot results can be bad, as the ‘flags’ that come off of the residuals effect the plot spacing even when they aren’t used. Preferring to avoid this side-effect, we create two functions (and later two ggproto objects). Note too that xmax is computed in the units of y, and initial plotting can yield squares that do not look like squares. Standardizing both variables, with coord_equal will get us to squares.
compute_group_smooth_sq_error <- function(data, scales, method = NULL, formula = NULL,
se = TRUE, n = 80, span = 0.75, fullrange = FALSE,
level = 0.95, method.args = list(),
na.rm = FALSE, flipped_aes = NA){
compute_group_smooth_fit(data = data, scales = scales,
method = method, formula = formula,
se = FALSE, n= n, span = span, fullrange = fullrange,
level = .95, method.args = method.args,
na.rm = na.rm, flipped_aes = flipped_aes) %>%
mutate(ymin = y,
xmin = x,
ymax = yend,
xmax = x + (ymax - ymin))
}
mtcars %>%
slice(1:10) %>%
rename(x = wt, y = mpg) %>%
compute_group_smooth_fit(method = lm, formula = y ~ x)
## x y flipped_aes xend yend
## 1 2.620 22.54689 NA 2.620 21.0
## 2 2.875 21.45416 NA 2.875 21.0
## 3 2.320 23.83246 NA 2.320 22.8
## 4 3.215 19.99719 NA 3.215 21.4
## 5 3.440 19.03301 NA 3.440 18.7
## 6 3.460 18.94731 NA 3.460 18.1
## 7 3.570 18.47593 NA 3.570 14.3
## 8 3.190 20.10432 NA 3.190 24.4
## 9 3.150 20.27573 NA 3.150 22.8
## 10 3.440 19.03301 NA 3.440 19.2
mtcars %>%
slice(1:10) %>%
rename(x = wt, y = mpg) %>%
compute_group_smooth_sq_error(method = lm, formula = y ~ x)
## x y flipped_aes xend yend ymin xmin ymax xmax
## 1 2.620 22.54689 NA 2.620 21.0 22.54689 2.620 21.0 1.0731060
## 2 2.875 21.45416 NA 2.875 21.0 21.45416 2.875 21.0 2.4208382
## 3 2.320 23.83246 NA 2.320 22.8 23.83246 2.320 22.8 1.2875387
## 4 3.215 19.99719 NA 3.215 21.4 19.99719 3.215 21.4 4.6178145
## 5 3.440 19.03301 NA 3.440 18.7 19.03301 3.440 18.7 3.1069900
## 6 3.460 18.94731 NA 3.460 18.1 18.94731 3.460 18.1 2.6126945
## 7 3.570 18.47593 NA 3.570 14.3 18.47593 3.570 14.3 -0.6059308
## 8 3.190 20.10432 NA 3.190 24.4 20.10432 3.190 24.4 7.4856839
## 9 3.150 20.27573 NA 3.150 22.8 20.27573 3.150 22.8 5.6742749
## 10 3.440 19.03301 NA 3.440 19.2 19.03301 3.440 19.2 3.6069900
StatSmoothFit <- ggplot2::ggproto("StatSmoothFit", ggplot2::Stat,
setup_params = StatSmooth$setup_params,
extra_params = c("na.rm", "orientation"),
compute_group = compute_group_smooth_fit,
dropped_aes = c("weight"),
required_aes = c("x", "y")
)
StatSmoothErrorSq <- ggplot2::ggproto("StatSmoothErrorSq", ggplot2::Stat,
setup_params = StatSmooth$setup_params,
extra_params = c("na.rm", "orientation"),
compute_group = compute_group_smooth_sq_error,
dropped_aes = c("weight"),
required_aes = c("x", "y")
)
stat_fit <- function(mapping = NULL, data = NULL,
geom = "point", position = "identity",
...,
method = NULL,
formula = NULL,
se = TRUE,
n = 80,
span = 0.75,
fullrange = FALSE,
level = 0.95,
method.args = list(),
na.rm = FALSE,
orientation = NA,
show.legend = NA,
inherit.aes = TRUE) {
layer(
data = data,
mapping = mapping,
stat = StatSmoothFit,
geom = geom,
position = position,
show.legend = show.legend,
inherit.aes = inherit.aes,
params = rlang::list2(
method = method,
formula = formula,
se = se,
n = n,
fullrange = fullrange,
level = level,
na.rm = na.rm,
orientation = orientation,
method.args = method.args,
span = span,
...
)
)
}
stat_errorsq <- function(mapping = NULL, data = NULL,
geom = "rect", position = "identity",
...,
method = NULL,
formula = NULL,
se = TRUE,
n = 80,
span = 0.75,
fullrange = FALSE,
level = 0.95,
method.args = list(),
na.rm = FALSE,
orientation = NA,
show.legend = NA,
inherit.aes = TRUE) {
layer(
data = data,
mapping = mapping,
stat = StatSmoothErrorSq,
geom = geom,
position = position,
show.legend = show.legend,
inherit.aes = inherit.aes,
params = rlang::list2(
method = method,
formula = formula,
se = se,
n = n,
fullrange = fullrange,
level = level,
na.rm = na.rm,
orientation = orientation,
method.args = method.args,
span = span,
...
)
)
}
mtcars %>%
ggplot() +
aes(wt, mpg) +
geom_point() +
geom_smooth(alpha = .2, se = FALSE) +
stat_fit(color = "blue") + # wrap as geom_smooth_fit()
stat_fit(geom = "segment") # geom_smooth_error()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
For best results, use standardized x, y and coord_equal() as shown below
stdz <- function(x){
var_mean <- mean(x)
var_sd <- sd(x)
(x-var_mean)/var_sd
}
last_plot() +
stat_errorsq(geom = "rect", alpha = .1) + # geom_smooth_error_sq() +
aes(stdz(wt), stdz(mpg)) +
coord_equal()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
mtcars %>%
ggplot() +
aes(wt, mpg) +
geom_point() +
geom_smooth(alpha = .2, se = FALSE, method = lm) +
stat_fit(geom = "point", color = "blue", method = lm) + # wrap as geom_smooth_fit()
stat_fit(geom = "segment", method = lm)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
mtcars %>%
ggplot() +
aes(stdz(wt), stdz(mpg)) +
geom_point() +
geom_smooth(alpha = .2, se = FALSE, method = lm, formula = y ~ 1) +
stat_fit(geom = "point", color = "blue", method = lm, formula = y ~ 1) + # wrap as geom_smooth_fit()
stat_fit(geom = "segment", method = lm, formula = y ~ 1) +
stat_errorsq(geom = "rect", alpha = .1, method = lm, formula = y ~ 1) +
coord_equal()
geom_smooth_fit <- function(...){stat_fit(color = "blue",...)} # wrap as geom_smooth_fit()
geom_smooth_residual <- function(...){stat_fit(geom = "segment", color = "darkred", ...)} # wrap as geom_smooth_fit()
mtcars %>%
ggplot() +
aes(wt, mpg) +
geom_point() +
geom_smooth(alpha = .2, se = FALSE) +
geom_smooth_fit(color = "blue") #+ # wrap as geom_smooth_fit()
## Warning: Duplicated aesthetics after name standardisation: colour
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
geom_smooth_residual(geom = "segment")
## Error in stat_fit(geom = "segment", color = "darkred", ...): formal argument "geom" matched by multiple actual arguments