The Pipe
library(dplyr)
# Native pipe (R 4.1+)
mtcars |> filter(mpg > 20) |> head()
# magrittr pipe
library(magrittr)
mtcars %>% filter(mpg > 20) %>% head()
# Pipes pass left side as first argument
# x |> f(y) is the same as f(x, y)
filter() — Pick Rows
# Rows where conditions are TRUE
starwars |>
filter(species == "Human", height > 170)
# Multiple conditions
starwars |>
filter(mass > 50 & mass < 100)
starwars |>
filter(skin_color == "fair" |
skin_color == "light")
# %in% for matching multiple values
starwars |>
filter(eye_color %in% c("blue","brown"))
# Filter out NAs
starwars |>
filter(!is.na(height))
select() — Pick Columns
# By name
starwars |> select(name, height, mass)
# Drop columns
starwars |> select(-birth_year, -gender)
# Ranges
starwars |> select(name:mass)
# Helper functions
starwars |> select(starts_with("s"))
starwars |> select(ends_with("color"))
starwars |> select(contains("ei"))
starwars |> select(where(is.numeric))
# Rename while selecting
starwars |> select(nm = name, ht = height)
# Or just rename (keeps all cols)
starwars |> rename(nm = name)
mutate() — Create / Modify Columns
# Add new column
starwars |>
mutate(bmi = mass / (height/100)^2)
# Modify existing
starwars |>
mutate(height = height / 100) # m to cm
# Multiple columns at once
starwars |>
mutate(
bmi = mass / (height/100)^2,
tall = height > 180,
name_upper = toupper(name)
)
# case_when — vectorized if/else
starwars |>
mutate(
size = case_when(
height > 200 ~ "tall",
height > 150 ~ "medium",
TRUE ~ "short"
)
)
# Keep only new columns
starwars |>
transmute(name, bmi = mass / (height/100)^2)
summarise() — Aggregate
starwars |>
summarise(
avg_height = mean(height, na.rm = TRUE),
max_mass = max(mass, na.rm = TRUE),
n = n(),
n_species = n_distinct(species)
)
# Useful summary functions:
# n() count rows
# n_distinct(x) count unique values
# mean(), median(), sd(), var()
# min(), max(), sum()
# first(), last(), nth(x, 3)
group_by() — Split, Apply, Combine
# Group then summarise
starwars |>
group_by(species) |>
summarise(
count = n(),
avg_height = mean(height, na.rm = TRUE)
) |>
arrange(desc(count))
# Group then mutate (window functions)
starwars |>
group_by(species) |>
mutate(
height_rank = row_number(desc(height)),
pct_of_group = mass / sum(mass, na.rm=T)
)
# Multiple grouping variables
mtcars |>
group_by(cyl, am) |>
summarise(avg_mpg = mean(mpg), .groups="drop")
# Remove grouping
df |> ungroup()
arrange() & count()
# Sort ascending (default)
starwars |> arrange(height)
# Sort descending
starwars |> arrange(desc(mass))
# Multiple sort keys
starwars |> arrange(species, desc(height))
# Count occurrences (shortcut)
starwars |> count(species, sort = TRUE)
# Equivalent to:
# group_by(species) |> summarise(n = n()) |>
# arrange(desc(n))
Joins
orders <- tibble(
id = 1:4,
cust = c("A","B","C","D"),
amount = c(100, 200, 150, 300)
)
info <- tibble(
cust = c("A","B","C","E"),
city = c("NYC","LA","CHI","SF")
)
# Keep all rows from left table
left_join(orders, info, by = "cust")
# Keep all rows from right table
right_join(orders, info, by = "cust")
# Keep only matching rows
inner_join(orders, info, by = "cust")
# Keep all rows from both
full_join(orders, info, by = "cust")
# Different column names
left_join(x, y, by = c("id" = "cust_id"))
# Filtering joins (no new columns)
semi_join(orders, info, by = "cust") # keep matches
anti_join(orders, info, by = "cust") # keep non-matches
slice() — Pick Rows by Position
# First / last N rows
starwars |> slice_head(n = 5)
starwars |> slice_tail(n = 3)
# Top / bottom by value
starwars |> slice_max(height, n = 3)
starwars |> slice_min(mass, n = 5)
# Random sample
starwars |> slice_sample(n = 10)
starwars |> slice_sample(prop = 0.2)
# Works with groups!
starwars |>
group_by(species) |>
slice_max(height, n = 1) # tallest per species
Useful Patterns
# across() — apply function to many cols
starwars |>
summarise(across(
where(is.numeric),
list(mean = ~mean(.x, na.rm=T),
sd = ~sd(.x, na.rm=T))
))
# Relocate columns
starwars |> relocate(species, .before = name)
# Distinct rows
starwars |> distinct(species, .keep_all = TRUE)
# Pull a column as vector
starwars |> pull(name)
# Row-wise operations
df |>
rowwise() |>
mutate(total = sum(c_across(q1:q4)))