library(tidyverse)
library(Lahman)
Teams %>%
filter(yearID == 2000) %>%
select(teamID, franchID,
divID, lgID, W, L, R, G, RA) %>%
rename(wins = W, losses = L,
runs = R, games = G,
runs_opponent = RA) %>%
mutate(win_pct = wins/games) %>%
mutate(runs_differential = runs - runs_opponent) ->
my_data
my_data %>%
head()
## teamID franchID divID lgID wins losses runs games runs_opponent win_pct
## 1 ANA ANA W AL 82 80 864 162 869 0.5061728
## 2 ARI ARI W NL 85 77 792 162 754 0.5246914
## 3 ATL ATL E NL 95 67 810 162 714 0.5864198
## 4 BAL BAL E AL 74 88 794 162 913 0.4567901
## 5 BOS BOS E AL 85 77 792 162 745 0.5246914
## 6 CHA CHW C AL 95 67 978 162 839 0.5864198
## runs_differential
## 1 -5
## 2 38
## 3 96
## 4 -119
## 5 47
## 6 139
my_data %>%
ggplot() +
aes(y = wins) +
aes(x = runs_differential) +
geom_rug() +
geom_point() +
geom_smooth(method = lm, se = F) +
ggxmean::geom_lm_formula() +
aes(color = lgID) +
facet_wrap(facets = vars(lgID))
## `geom_smooth()` using formula 'y ~ x'
my_data %>%
lm(formula = wins ~
runs_differential + lgID +
lgID*runs_differential, data = .) %>%
summary
##
## Call:
## lm(formula = wins ~ runs_differential + lgID + lgID * runs_differential,
## data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.988 -1.740 -0.246 2.116 6.368
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 81.38362 0.90885 89.55 < 2e-16 ***
## runs_differential 0.08065 0.00906 8.91 2.2e-09 ***
## lgIDNL -0.78447 1.24443 -0.63 0.534
## runs_differential:lgIDNL 0.02127 0.01242 1.71 0.099 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.4 on 26 degrees of freedom
## Multiple R-squared: 0.896, Adjusted R-squared: 0.884
## F-statistic: 74.8 on 3 and 26 DF, p-value: 6.51e-13