Statistics Formulas in R

Descriptive Statistics

# Sample data used throughout this section
x <- c(23, 29, 20, 32, 25, 28, 31, 24)

mean(x)              # 26.5
median(x)            # 26.5
sd(x)                # 4.175  (sample SD, n - 1 denominator)
var(x)               # 17.43  (sample variance = sd^2)
IQR(x)               # 5.75 = Q3 - Q1 (default type-7 quantiles: 29.5 - 23.75)

quantile(x)          # 0% 25% 50% 75% 100%
quantile(x, 0.9)     # 90th percentile

range(x)             # 20 32 (min and max, not the spread)
diff(range(x))       # 12 (max - min)

# Summary in one call: Min, 1st Qu., Median, Mean, 3rd Qu., Max
summary(x)

# Weighted mean: here observations 3 and 4 count double
weighted.mean(x, w = c(1,1,2,2,1,1,1,1))

# Skewness & kurtosis (moments package)
# skewness: > 0 right-skewed, < 0 left-skewed, ~0 symmetric
# NOTE: moments::kurtosis() reports raw (Pearson) kurtosis, where a
# normal distribution scores 3 -- not excess kurtosis (normal = 0)
library(moments)
skewness(x)
kurtosis(x)

t-Tests

# One-sample: is mean different from mu?
t.test(x, mu = 25)
# H0: true mean = 25
# Returns: t, df, p-value, 95% CI

# Two-sample (independent)
# Formula form requires `group` to have exactly two levels
t.test(score ~ group, data = df)
# or: t.test(group_a, group_b)

# Welch's t-test (default, unequal var)
# var.equal = FALSE is already the default; shown explicitly for clarity
t.test(x, y, var.equal = FALSE)

# Equal variance assumed (classic Student's t; pools the two variances)
t.test(x, y, var.equal = TRUE)

# Paired t-test (before/after)
# `before` and `after` must be equal length, in matching subject order
t.test(before, after, paired = TRUE)

# One-sided tests
t.test(x, mu = 25, alternative = "greater")  # H1: true mean > 25
t.test(x, mu = 25, alternative = "less")     # H1: true mean < 25

# Extract values from the returned htest object
result <- t.test(x, mu = 25)
result$statistic   # t value
result$p.value     # p-value
result$conf.int    # confidence interval

Chi-Square Test

# Test of independence
# H0: variables are independent
tbl <- table(df$gender, df$preference)
chisq.test(tbl)

# With simulated p-value (small samples)
# Monte Carlo p-value from B random tables; use when the chi-square
# approximation is doubtful (low expected counts)
chisq.test(tbl, simulate.p.value = TRUE,
           B = 10000)

# Goodness of fit
# H0: observed matches expected proportions
observed <- c(30, 45, 25)
expected <- c(1/3, 1/3, 1/3)   # proportions must sum to 1
chisq.test(observed, p = expected)

# Fisher's exact test (small cell counts)
# Preferred when any expected cell count is below ~5
fisher.test(tbl)

# Cramér's V (effect size)
# Ranges 0 (no association) to 1 (perfect association)
library(vcd)
assocstats(tbl)$cramer

ANOVA

# One-way ANOVA
# H0: all group means are equal
model <- aov(score ~ treatment, data = df)
summary(model)   # F-statistic and p-value for the treatment factor

# Post-hoc pairwise comparisons
# Tukey HSD controls the family-wise error rate across all pairs
TukeyHSD(model)
# Gives: diff, lwr, upr, p adj for each pair

# Two-way ANOVA
# `*` expands to both main effects plus their interaction
model2 <- aov(score ~ treatment * gender,
              data = df)
summary(model2)  # main effects + interaction

# Effect size (eta-squared): proportion of variance explained by factor
library(effectsize)
eta_squared(model)

# Non-parametric alternative
# Kruskal-Wallis (when normality violated)
kruskal.test(score ~ treatment, data = df)

# Pairwise Wilcoxon (post-hoc)
# Bonferroni adjustment guards against inflated Type I error
pairwise.wilcox.test(df$score, df$treatment,
  p.adjust.method = "bonferroni")

Correlation

# Pearson correlation (default; measures linear association)
cor(x, y)
cor(df[, c("x","y","z")])  # matrix

# Spearman (non-parametric / ordinal; rank-based)
cor(x, y, method = "spearman")

# Kendall (robust, small samples)
cor(x, y, method = "kendall")

# Significance test
# H0: true correlation is 0
cor.test(x, y)
# Returns: r, p-value, 95% CI

# Handle missing values
# "complete.obs" drops any case containing an NA before computing
cor(x, y, use = "complete.obs")

# Correlation matrix with p-values
library(Hmisc)
rcorr(as.matrix(df[, 1:4]))

# Visualize correlation matrix
library(corrplot)
M <- cor(mtcars[, 1:7])
corrplot(M, method = "circle",
         type = "upper")   # upper triangle only (matrix is symmetric)

Simple Linear Regression

# Fit model: y = b0 + b1*x
model <- lm(mpg ~ wt, data = mtcars)
summary(model)
# Key output:
#   Coefficients (Estimate, Std.Error, t, p)
#   R-squared, Adjusted R-squared
#   F-statistic and p-value

# Confidence intervals for coefficients
confint(model)           # 95% (default)
confint(model, level = 0.99)

# Predict new values
# New data must use the same column name(s) as the model formula
new_data <- data.frame(wt = c(2.5, 3.0, 3.5))
predict(model, new_data)
predict(model, new_data,
        interval = "confidence")   # CI for the mean response
predict(model, new_data,
        interval = "prediction")   # wider: interval for a single new obs

Multiple Regression

# Multiple predictors
model <- lm(mpg ~ wt + hp + cyl, data=mtcars)
summary(model)

# Interaction terms
lm(mpg ~ wt * hp, data = mtcars)
# Equivalent: mpg ~ wt + hp + wt:hp

# Polynomial regression
# I() protects ^ from being interpreted as formula syntax
lm(mpg ~ wt + I(wt^2), data = mtcars)

# All predictors
lm(mpg ~ ., data = mtcars)

# Exclude a predictor
lm(mpg ~ . - am, data = mtcars)

# Compare models
# anova() F-test is only valid for nested models (model1 inside model2)
model1 <- lm(mpg ~ wt, data = mtcars)
model2 <- lm(mpg ~ wt + hp, data = mtcars)
anova(model1, model2)  # F-test
AIC(model1, model2)    # lower = better
BIC(model1, model2)

# Stepwise selection (AIC-based; interpret resulting p-values with caution)
step(lm(mpg ~ ., data = mtcars),
     direction = "both")

Checking Assumptions

# Diagnostic plots (4 key plots)
# Save the current layout first: par() mutates global graphics state,
# so restore it afterwards instead of leaving a 2x2 grid behind
op <- par(mfrow = c(2, 2))
plot(model)
par(op)  # restore previous graphics settings
# 1: Residuals vs Fitted (linearity)
# 2: Q-Q plot (normality)
# 3: Scale-Location (homoscedasticity)
# 4: Residuals vs Leverage (outliers)

# Normality of residuals
shapiro.test(residuals(model))
# H0: data is normally distributed
# p > 0.05 -> cannot reject normality

# Homoscedasticity (constant variance)
library(lmtest)
bptest(model)   # Breusch-Pagan test; H0: constant variance

# Multicollinearity
library(car)
vif(model)      # VIF > 5 is concerning (> 10 is often called serious)

# Autocorrelation (mainly relevant for time-ordered data)
dwtest(model)   # Durbin-Watson test

# Outliers & influence
cooks.distance(model)      # large values flag influential points
hatvalues(model)           # leverage of each observation
influence.measures(model)  # all influence diagnostics in one call

Confidence Intervals & p-values

# CI for a mean
t.test(x)$conf.int         # 95% CI
t.test(x, conf.level=0.99)$conf.int

# CI for a proportion
# prop.test uses a normal approximation with continuity correction;
# binom.test is exact and preferable for small n
prop.test(x = 45, n = 100) # 45/100 success
binom.test(45, 100)        # exact binomial

# Interpreting p-values
# p < 0.05  -> reject H0 (significant)
# p >= 0.05 -> fail to reject H0
# p-value is NOT the probability H0 is true!

# Adjusting for multiple comparisons
# Returns adjusted p-values; compare these to the usual alpha
p_vals <- c(0.01, 0.04, 0.03, 0.20)
p.adjust(p_vals, method = "bonferroni")  # most conservative
p.adjust(p_vals, method = "fdr") # BH method: controls false discovery rate
p.adjust(p_vals, method = "holm")        # uniformly more powerful than bonferroni