dplyr Data Wrangling

The Pipe

library(dplyr)

# Native pipe (R 4.1+)
mtcars |> filter(mpg > 20) |> head()

# magrittr pipe
library(magrittr)
mtcars %>% filter(mpg > 20) %>% head()

# Pipes pass left side as first argument
# x |> f(y)  is the same as  f(x, y)

filter() — Pick Rows

# Rows where conditions are TRUE
starwars |>
  filter(species == "Human", height > 170)

# Multiple conditions
starwars |>
  filter(mass > 50 & mass < 100)

starwars |>
  filter(skin_color == "fair" |
         skin_color == "light")

# %in% for matching multiple values
starwars |>
  filter(eye_color %in% c("blue","brown"))

# Filter out NAs
starwars |>
  filter(!is.na(height))

select() — Pick Columns

# By name
starwars |> select(name, height, mass)

# Drop columns
starwars |> select(-birth_year, -gender)

# Ranges
starwars |> select(name:mass)

# Helper functions
starwars |> select(starts_with("s"))
starwars |> select(ends_with("color"))
starwars |> select(contains("ei"))
starwars |> select(where(is.numeric))

# Rename while selecting
starwars |> select(nm = name, ht = height)

# Or just rename (keeps all cols)
starwars |> rename(nm = name)

mutate() — Create / Modify Columns

# Add new column
starwars |>
  mutate(bmi = mass / (height/100)^2)

# Modify existing
starwars |>
  mutate(height = height / 100)  # m to cm

# Multiple columns at once
starwars |>
  mutate(
    bmi = mass / (height/100)^2,
    tall = height > 180,
    name_upper = toupper(name)
  )

# case_when — vectorized if/else
starwars |>
  mutate(
    size = case_when(
      height > 200 ~ "tall",
      height > 150 ~ "medium",
      TRUE          ~ "short"
    )
  )

# Keep only new columns
starwars |>
  transmute(name, bmi = mass / (height/100)^2)

summarise() — Aggregate

starwars |>
  summarise(
    avg_height = mean(height, na.rm = TRUE),
    max_mass   = max(mass, na.rm = TRUE),
    n          = n(),
    n_species  = n_distinct(species)
  )

# Useful summary functions:
# n()             count rows
# n_distinct(x)   count unique values
# mean(), median(), sd(), var()
# min(), max(), sum()
# first(), last(), nth(x, 3)

group_by() — Split, Apply, Combine

# Group then summarise
starwars |>
  group_by(species) |>
  summarise(
    count = n(),
    avg_height = mean(height, na.rm = TRUE)
  ) |>
  arrange(desc(count))

# Group then mutate (window functions)
starwars |>
  group_by(species) |>
  mutate(
    height_rank = row_number(desc(height)),
    pct_of_group = mass / sum(mass, na.rm=T)
  )

# Multiple grouping variables
mtcars |>
  group_by(cyl, am) |>
  summarise(avg_mpg = mean(mpg), .groups="drop")

# Remove grouping
df |> ungroup()

arrange() & count()

# Sort ascending (default)
starwars |> arrange(height)

# Sort descending
starwars |> arrange(desc(mass))

# Multiple sort keys
starwars |> arrange(species, desc(height))

# Count occurrences (shortcut)
starwars |> count(species, sort = TRUE)

# Equivalent to:
# group_by(species) |> summarise(n = n()) |>
#   arrange(desc(n))

Joins

orders <- tibble(
  id = 1:4,
  cust = c("A","B","C","D"),
  amount = c(100, 200, 150, 300)
)
info <- tibble(
  cust = c("A","B","C","E"),
  city = c("NYC","LA","CHI","SF")
)

# Keep all rows from left table
left_join(orders, info, by = "cust")

# Keep all rows from right table
right_join(orders, info, by = "cust")

# Keep only matching rows
inner_join(orders, info, by = "cust")

# Keep all rows from both
full_join(orders, info, by = "cust")

# Different column names
left_join(x, y, by = c("id" = "cust_id"))

# Filtering joins (no new columns)
semi_join(orders, info, by = "cust") # keep matches
anti_join(orders, info, by = "cust") # keep non-matches

slice() — Pick Rows by Position

# First / last N rows
starwars |> slice_head(n = 5)
starwars |> slice_tail(n = 3)

# Top / bottom by value
starwars |> slice_max(height, n = 3)
starwars |> slice_min(mass, n = 5)

# Random sample
starwars |> slice_sample(n = 10)
starwars |> slice_sample(prop = 0.2)

# Works with groups!
starwars |>
  group_by(species) |>
  slice_max(height, n = 1)  # tallest per species

Useful Patterns

# across() — apply function to many cols
starwars |>
  summarise(across(
    where(is.numeric),
    list(mean = ~mean(.x, na.rm=T),
         sd   = ~sd(.x, na.rm=T))
  ))

# Relocate columns
starwars |> relocate(species, .before = name)

# Distinct rows
starwars |> distinct(species, .keep_all = TRUE)

# Pull a column as vector
starwars |> pull(name)

# Row-wise operations
df |>
  rowwise() |>
  mutate(total = sum(c_across(q1:q4)))