Code Annex
Generic code for various statistical tests in R.
Single Categorical Variable
Summary Statistic
# Frequency table for one categorical variable: counts, proportions, and a totals row.
dataframe %>%
  count(variable) %>%
  mutate(prop = n / sum(n)) %>% # share of all observations in each category
  adorn_totals()
Check whether or not validity conditions are met.
Bargraph
# Bar graph of the counts in each category of `variable`.
dataframe %>%
  ggplot(aes(x = variable)) +
  geom_bar()
One-proportion z-test
Standard Deviation of the null distribution:
\(sd_{null}=\sqrt{\frac{\pi_0(1-\pi_0)}{n}}\)
# One-proportion z-test: standardized statistic.
# Fill in every placeholder before running; an assignment left blank makes R
# read the following line as its value.
null = # Enter the value of your Null Hypothesis Parameter
n = # Enter the sample size
stat = # Enter the value of your statistic
sd = sqrt(null*(1 - null)/n) # Standard deviation of the null distribution
standardized_stat = (stat - null)/sd # Standardized Statistic, z
Choose the \(p\)-value code that corresponds to the alternative hypothesis:
Less than hypothesis
# Run only the line that matches your alternative hypothesis.
pvalue = pnorm(standardized_stat) # lower-tail area
Greater than hypothesis
# lower.tail = FALSE returns the upper-tail area directly, so very small
# p-values are not rounded to 0 the way 1 - pnorm(z) is for large z.
pvalue = pnorm(standardized_stat, lower.tail = FALSE)
Two sided hypothesis
pvalue = 2*pnorm(abs(standardized_stat), lower.tail = FALSE) # both tails
Confidence Interval
# Confidence interval for one proportion; uses `stat` and `n` from the test above.
siglevel = # Enter your significance level (alpha)
multiplier = qnorm(1 - siglevel/2) # normal quantile for a (1 - alpha) interval
se = sqrt(stat*(1 - stat)/n) # Standard Error
CI = c(stat - multiplier*se, stat + multiplier*se) # Confidence Interval
\(se=\sqrt{\frac{\hat{p}(1-\hat{p})}{n}}\)
Single Quantitative Variable
Summary Statistic
# Summary statistics for one quantitative variable: mean, standard deviation, sample size.
dataframe %>%
  summarise(
    mean = mean(variable),
    s = sd(variable),
    n = n()
  )
Histogram
# Histogram of `variable`.
dataframe %>%
  ggplot(aes(x = variable)) +
  geom_histogram()
Check whether or not validity conditions are met.
One-sample t-test
# One-sample t-test: standardized statistic.
# Fill in every placeholder before running; an assignment left blank makes R
# read the following line as its value.
null = # Enter the value of your Null Hypothesis Parameter
n = # Enter the sample size
stat = # Enter the value of your statistic
s = # Sample standard deviation (from summary statistics)
sd = s/sqrt(n) # Standard deviation of the null distribution
standardized_stat = (stat - null)/sd # Standardized statistic, t
Standard Deviation of the null distribution:
\(sd_{null}=s/\sqrt{n}\)
Choose the \(p\)-value code that corresponds to the alternative hypothesis:
Less than hypothesis
# Run only the line that matches your alternative hypothesis (df = n - 1).
pvalue = pt(standardized_stat, n - 1) # lower-tail area
Greater than hypothesis
# lower.tail = FALSE returns the upper-tail area directly, so very small
# p-values are not lost to rounding as they can be with 1 - pt(t, df).
pvalue = pt(standardized_stat, n - 1, lower.tail = FALSE)
Two sided hypothesis
pvalue = 2*pt(abs(standardized_stat), n - 1, lower.tail = FALSE) # both tails
Confidence Interval
# Confidence interval for one mean; uses `stat`, `s` and `n` from the test above.
siglevel = # Enter your significance level (alpha)
multiplier = qt(1 - siglevel/2, n - 1) # t quantile with n - 1 degrees of freedom
se = s/sqrt(n) # Standard error
CI = c(stat - multiplier*se, stat + multiplier*se) # Confidence Interval
Standard Error:
\(se=s/\sqrt{n}\)
Comparing Two Proportions
Categorical Explanatory Variable, Categorical Response Variable
Summary Statistic
# Two-way table of counts: response categories in rows, explanatory groups in
# columns, with row and column totals.
dataframe <- dataframe %>%
  mutate(
    explanatoryvariable = as.factor(explanatoryvariable),
    responsevariable = as.factor(responsevariable)
  ) # convert the categorical variables to factors
dataframe %>%
  count(explanatoryvariable, responsevariable) %>%
  pivot_wider(
    names_from = explanatoryvariable,
    values_from = n
  ) %>%
  adorn_totals(c("row", "col"))
Check whether or not validity conditions are met.
Segmented Bar Graph
# Segmented bar graph: one bar per explanatory group, filled by response
# category; position_fill() scales every bar to the same height.
dataframe %>%
  ggplot(aes(x = explanatoryvariable, fill = responsevariable)) +
  geom_bar(position = position_fill())
Two-proportion z-test
# Two-proportion z-test: standardized statistic.
# Fill in every placeholder before running; an assignment left blank makes R
# read the following line as its value.
# (The statistic is computed below from the group counts, so it is not entered
# by hand, and no overall sample size is needed in this section.)
null = # Enter the value of your Null Hypothesis Parameter
successes_1 = # number of successes in group 1
successes_2 = # number of successes in group 2
n_1 = # sample size of group 1
n_2 = # sample size of group 2
phat_1 = successes_1/n_1 # sample proportion, group 1
phat_2 = successes_2/n_2 # sample proportion, group 2
phat_t = (successes_1 + successes_2)/(n_1 + n_2) # pooled proportion
stat = phat_1 - phat_2 # ensure this matches your null hypothesis order
sd = sqrt(phat_t*(1 - phat_t)*(1/n_1 + 1/n_2)) # sd of the null distribution
standardized_stat = (stat - null)/sd # standardized statistic, z
Statistic:
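\(\textrm{stat}=\hat{p}_1-\hat{p}_2\)
Standard Deviation of the Null Distribution:
\(sd_{null}=\sqrt{\hat{p}_t(1-\hat{p}_t)\left(\frac{1}{n_1}+\frac{1}{n_2}\right)}\), where \(\hat{p}_t\) is the pooled proportion of successes across both groups (phat_t in the code).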
Choose the \(p\)-value code that corresponds to the alternative hypothesis:
Less than hypothesis
# Run only the line that matches your alternative hypothesis.
pvalue = pnorm(standardized_stat) # lower-tail area
Greater than hypothesis
# lower.tail = FALSE returns the upper-tail area directly, so very small
# p-values are not rounded to 0 the way 1 - pnorm(z) is for large z.
pvalue = pnorm(standardized_stat, lower.tail = FALSE)
Two sided hypothesis
pvalue = 2*pnorm(abs(standardized_stat), lower.tail = FALSE) # both tails
Confidence Interval
# Confidence interval for a difference in proportions; uses `stat`, `phat_1`,
# `phat_2`, `n_1` and `n_2` from the test above. The standard error is unpooled.
siglevel = # Enter your significance level (alpha)
multiplier = qnorm(1 - siglevel/2) # normal quantile for a (1 - alpha) interval
se = sqrt(phat_1*(1 - phat_1)/n_1 + phat_2*(1 - phat_2)/n_2) # Standard Error
CI = c(stat - multiplier*se, stat + multiplier*se) # Confidence Interval
\(se=\sqrt{\frac{\hat{p_1}*(1 - \hat{p_1})}{n_1}+\frac{\hat{p_2}*(1 - \hat{p_2})}{n_2}}\)
Comparing Two Means
Categorical Explanatory Variable, Quantitative Response Variable
Summary Statistic
# Per-group summary statistics: mean, standard deviation, and size of each group.
dataframe <- dataframe %>%
  mutate(catvariable = as.factor(catvariable)) # convert the categorical variable to a factor
dataframe %>%
  group_by(catvariable) %>%
  summarise(
    mean = mean(quantvariable),
    s = sd(quantvariable),
    n = n()
  )
Split Histogram
# Histogram of the quantitative variable, one panel (row) per group.
dataframe %>%
  ggplot(aes(x = quantvariable)) +
  geom_histogram() +
  facet_grid(catvariable ~ .)
Check whether or not validity conditions are met.
Two-sample t-test
Statistic:
\(\textrm{stat}=\bar{x}_1-\bar{x}_2\)
# Two-sample t-test: standardized statistic.
# Fill in every placeholder before running; an assignment left blank makes R
# read the following line as its value.
null = # Enter the value of your Null Hypothesis Parameter
xbar_1 = # sample mean of group 1
xbar_2 = # sample mean of group 2
s_1 = # sample standard deviation of group 1
s_2 = # sample standard deviation of group 2
n_1 = # sample size of group 1
n_2 = # sample size of group 2
# Combined sample size; the p-value and confidence-interval code uses n - 2 as
# the degrees of freedom.
# NOTE(review): assumes n is meant to be n_1 + n_2 - confirm against the course convention.
n = n_1 + n_2
stat = xbar_1 - xbar_2
sd = sqrt(s_1^2/n_1 + s_2^2/n_2) # sd of the null distribution
standardized_stat = (stat - null)/sd # standardized statistic, t
Standard Deviation of the Null Distribution:
\(sd_{null}=\sqrt{\frac{s_1^2}{n_1}+\frac{s_2^2}{n_2}}\)
Choose the \(p\)-value code that corresponds to the alternative hypothesis:
Less than hypothesis
# Run only the line that matches your alternative hypothesis (df = n - 2).
pvalue = pt(standardized_stat, n - 2) # lower-tail area
Greater than hypothesis
# lower.tail = FALSE returns the upper-tail area directly, so very small
# p-values are not lost to rounding as they can be with 1 - pt(t, df).
pvalue = pt(standardized_stat, n - 2, lower.tail = FALSE)
Two sided hypothesis
pvalue = 2*pt(abs(standardized_stat), n - 2, lower.tail = FALSE) # both tails
Confidence Interval
# Confidence interval for a difference in means; uses `stat`, `s_1`, `s_2`,
# `n_1`, `n_2` and `n` from the test above.
siglevel = # Enter your significance level (alpha)
multiplier = qt(1 - siglevel/2, n - 2) # t quantile with n - 2 degrees of freedom
se = sqrt(s_1^2/n_1 + s_2^2/n_2) # standard error
CI = c(stat - multiplier*se, stat + multiplier*se) # confidence interval
\(se=\sqrt{\frac{s_1^2}{n_1}+\frac{s_2^2}{n_2}}\)
Linear Regression
The model below is a linear regression without any interaction terms. Add as many explanatory variables to the formula as your analysis needs.
# Additive model: each explanatory variable enters on its own (no interaction).
# `data = .` passes the piped data frame to lm()'s data argument.
lrmodel <- dataframe %>%
  lm(responsevar ~ var1 + var2, data = .)
summary(lrmodel)
The model below evaluates an interaction between the two variables.
# Interaction model: var1 * var2 expands to var1 + var2 + var1:var2.
lrmodel <- dataframe %>%
  lm(responsevar ~ var1 * var2, data = .)
summary(lrmodel)
Testing Validity Conditions
Validating Linearity and Equal Variance
# Residuals against fitted values, with a horizontal reference line at 0.
# fortify() turns the fitted model into a data frame whose .fitted and .resid
# columns are plotted here.
lrmodel %>%
  fortify(lrmodel$model) %>%
  ggplot(aes(x = .fitted, y = .resid)) +
  geom_point() +
  geom_hline(yintercept = 0) +
  labs(
    x = "Predicted Values",
    y = "Residuals",
    title = "Residuals vs. predicted values"
  )
Validating the Independence condition
# Residuals against row number, with a horizontal reference line at 0.
# The row number stands in for the order in which observations occurred.
lrmodel %>%
  fortify(lrmodel$model) %>%
  mutate(row = row_number()) %>%
  ggplot(aes(x = row, y = .resid)) +
  geom_point() +
  geom_hline(yintercept = 0) +
  labs(
    x = "Order of Occurrence",
    y = "Residuals",
    title = "Residuals in Order of Occurrence"
  )
Validating the Normality condition
# Histogram of the model residuals.
lrmodel %>%
  fortify(lrmodel$model) %>%
  ggplot(aes(x = .resid)) +
  geom_histogram() +
  labs(
    x = "Residuals",
    title = "Histogram of residuals"
  )
Visualizations
Bargraph
# Bar graph; as.factor() makes ggplot treat `variable` as categorical.
dataframe %>%
  ggplot(aes(x = as.factor(variable))) +
  geom_bar()
Histogram
# Histogram of a quantitative variable.
dataframe %>%
  ggplot(aes(x = variable)) +
  geom_histogram()
Split Histogram
Note that the method below can be used to split any plot type up by a categorical variable’s values. This example is for a histogram.
# Histogram split into one panel (row) per level of the categorical variable.
dataframe %>%
  ggplot(aes(x = quantvariable)) +
  geom_histogram() +
  facet_grid(as.factor(catvariable) ~ .)
Segmented Bar Graph
# Segmented bar graph: one bar per explanatory group, filled by response
# category; position_fill() scales every bar to the same height.
dataframe %>%
  ggplot(aes(
    x = as.factor(explanatoryvariable),
    fill = as.factor(responsevariable)
  )) +
  geom_bar(position = position_fill())
Boxplot
# Boxplot of var2 (y-axis) against var1 (x-axis).
dataframe %>%
  ggplot(aes(x = var1, y = var2)) +
  geom_boxplot()
Scatterplot
# Scatterplot of var2 (y-axis) against var1 (x-axis).
dataframe %>%
  ggplot(aes(x = var1, y = var2)) +
  geom_point()
Scatterplot with Linear Regression Line
# Scatterplot with a fitted straight line overlaid (method = "lm").
dataframe %>%
  ggplot(aes(x = var1, y = var2)) +
  geom_point() +
  geom_smooth(method = "lm")
Scatterplot with Linear Regression Interaction
# Scatterplot coloured by var3; because colour is mapped in the top-level
# aes(), geom_smooth() draws a separate fitted line for each var3 group.
ggplot(
  data = dataframe,
  mapping = aes(x = var1, y = var2, color = var3)
) +
  geom_point() +
  geom_smooth(method = "lm")