# Descriptive Statistics ----
x <- c(23, 29, 20, 32, 25, 28, 31, 24)
mean(x) # 26.5
median(x) # 26.5
sd(x) # 4.175 (sample sd, n - 1 denominator)
var(x) # 17.43 (sample variance = sd^2)
IQR(x) # 5.75 (Q3 - Q1 = 29.5 - 23.75, type-7 quantiles)
quantile(x) # 0% 25% 50% 75% 100%
quantile(x, 0.9) # 90th percentile
range(x) # 20 32
diff(range(x)) # 12 (max - min)
# Summary in one call: min, quartiles, mean, max
summary(x)
# Weighted mean: sum(w * x) / sum(w) -> 26.4
weighted.mean(x, w = c(1,1,2,2,1,1,1,1))
# Skewness & kurtosis (moments package, not base R)
library(moments)
skewness(x) # > 0 = right-skewed, < 0 = left-skewed
kurtosis(x) # normal distribution has kurtosis = 3
# t-Tests ----
# One-sample: is mean different from mu?
t.test(x, mu = 25)
# H0: true mean = 25
# Returns: t, df, p-value, 95% CI
# Two-sample (independent), formula interface:
# numeric outcome ~ two-level grouping factor
t.test(score ~ group, data = df)
# or: t.test(group_a, group_b)
# Welch's t-test (default, unequal var)
t.test(x, y, var.equal = FALSE)
# Equal variance assumed (classic Student's t)
t.test(x, y, var.equal = TRUE)
# Paired t-test (before/after on same subjects)
t.test(before, after, paired = TRUE)
# One-sided tests (direction of the alternative H1)
t.test(x, mu = 25, alternative = "greater")
t.test(x, mu = 25, alternative = "less")
# Extract values from the returned htest object
result <- t.test(x, mu = 25)
result$statistic # t value
result$p.value # p-value
result$conf.int # confidence interval
# Chi-Square Test ----
# Test of independence
# H0: variables are independent
tbl <- table(df$gender, df$preference)
chisq.test(tbl)
# With simulated p-value (Monte Carlo; use when
# expected cell counts are small), B = replicates
chisq.test(tbl, simulate.p.value = TRUE,
B = 10000)
# Goodness of fit
# H0: observed matches expected proportions
observed <- c(30, 45, 25)
# p = hypothesized proportions (must sum to 1)
expected <- c(1/3, 1/3, 1/3)
chisq.test(observed, p = expected)
# Fisher's exact test (small cell counts)
fisher.test(tbl)
# Cramér's V (effect size, 0 to 1; vcd package)
library(vcd)
assocstats(tbl)$cramer
# ANOVA ----
# One-way ANOVA
# H0: all group means are equal
model <- aov(score ~ treatment, data = df)
summary(model) # F statistic and overall p-value
# Post-hoc pairwise comparisons (Tukey's HSD)
TukeyHSD(model)
# Gives: diff, lwr, upr, p adj for each pair
# Two-way ANOVA: * fits both main effects
# plus their interaction
model2 <- aov(score ~ treatment * gender,
data = df)
summary(model2) # main effects + interaction
# Effect size (eta-squared; effectsize package)
library(effectsize)
eta_squared(model)
# Non-parametric alternative
# Kruskal-Wallis (when normality violated)
kruskal.test(score ~ treatment, data = df)
# Pairwise Wilcoxon (post-hoc), p-values
# adjusted for multiple comparisons
pairwise.wilcox.test(df$score, df$treatment,
p.adjust.method = "bonferroni")
# Correlation ----
# Pearson correlation (default; linear association)
cor(x, y)
cor(df[, c("x","y","z")]) # pairwise correlation matrix
# Spearman (non-parametric / ordinal; rank-based)
cor(x, y, method = "spearman")
# Kendall (robust, small samples)
cor(x, y, method = "kendall")
# Significance test (H0: true correlation = 0)
cor.test(x, y)
# Returns: r, p-value, 95% CI
# Handle missing values (drop incomplete pairs)
cor(x, y, use = "complete.obs")
# Correlation matrix with p-values (Hmisc package)
library(Hmisc)
rcorr(as.matrix(df[, 1:4]))
# Visualize correlation matrix (corrplot package)
library(corrplot)
M <- cor(mtcars[, 1:7])
corrplot(M, method = "circle",
type = "upper")
# Simple Linear Regression ----
# Fit model: y = b0 + b1*x
model <- lm(mpg ~ wt, data = mtcars)
summary(model)
# Key output:
# Coefficients (Estimate, Std.Error, t, p)
# R-squared, Adjusted R-squared
# F-statistic and p-value
# Confidence intervals for coefficients
confint(model) # 95% (default)
confint(model, level = 0.99)
# Predict new values (column names must match
# the predictors used when fitting)
new_data <- data.frame(wt = c(2.5, 3.0, 3.5))
predict(model, new_data)
# "confidence" = CI for the mean response
predict(model, new_data,
interval = "confidence")
# "prediction" = wider interval for a single new obs
predict(model, new_data,
interval = "prediction")
# Multiple Regression ----
# Multiple predictors (additive effects)
model <- lm(mpg ~ wt + hp + cyl, data=mtcars)
summary(model)
# Interaction terms
lm(mpg ~ wt * hp, data = mtcars)
# Equivalent: mpg ~ wt + hp + wt:hp
# Polynomial regression (I() keeps ^ arithmetic,
# not formula syntax)
lm(mpg ~ wt + I(wt^2), data = mtcars)
# All predictors (. = every other column)
lm(mpg ~ ., data = mtcars)
# Exclude a predictor
lm(mpg ~ . - am, data = mtcars)
# Compare models
model1 <- lm(mpg ~ wt, data = mtcars)
model2 <- lm(mpg ~ wt + hp, data = mtcars)
anova(model1, model2) # F-test (nested models only)
AIC(model1, model2) # lower = better
BIC(model1, model2) # like AIC, larger size penalty
# Stepwise selection by AIC
step(lm(mpg ~ ., data = mtcars),
direction = "both")
# Checking Assumptions ----
# Diagnostic plots (4 key plots)
par(mfrow = c(2, 2)) # 2x2 plotting grid
plot(model)
# 1: Residuals vs Fitted (linearity)
# 2: Q-Q plot (normality)
# 3: Scale-Location (homoscedasticity)
# 4: Residuals vs Leverage (outliers)
# Normality of residuals (Shapiro-Wilk)
shapiro.test(residuals(model))
# H0: data is normally distributed
# p > 0.05 -> cannot reject normality
# Homoscedasticity (constant variance)
library(lmtest)
bptest(model) # Breusch-Pagan test
# Multicollinearity (car package)
library(car)
vif(model) # VIF > 5 is concerning
# Autocorrelation of residuals (lmtest package)
dwtest(model) # Durbin-Watson test
# Outliers & influence diagnostics
cooks.distance(model) # influence of each obs on fit
hatvalues(model) # leverage of each obs
influence.measures(model) # all measures at once
# Confidence Intervals & p-values ----
# CI for a mean
t.test(x)$conf.int # 95% CI
t.test(x, conf.level=0.99)$conf.int
# CI for a proportion
prop.test(x = 45, n = 100) # 45/100 success
binom.test(45, 100) # exact binomial
# Interpreting p-values
# p < 0.05 -> reject H0 (significant)
# p >= 0.05 -> fail to reject H0
# p-value is NOT the probability H0 is true!
# Adjusting for multiple comparisons
p_vals <- c(0.01, 0.04, 0.03, 0.20)
p.adjust(p_vals, method = "bonferroni") # p * 4, capped at 1
p.adjust(p_vals, method = "fdr") # BH method
p.adjust(p_vals, method = "holm")
# Effect sizes (effectsize package)
library(effectsize)
cohens_d(score ~ group, data = df)
# Small: 0.2 Medium: 0.5 Large: 0.8