tidyr Reshaping

pivot_longer() — Wide to Long

Columns become rows. Use when variables are stored as column names.

# BEFORE (wide)
#   name  math  sci
#   Ali    88   92
#   Bo     79   85
# AFTER (long)
#   name  subject  score
#   Ali   math      88
#   Ali   sci       92
#   Bo    math      79
#   Bo    sci       85
library(tidyr)

wide_df |>
  pivot_longer(
    cols      = math:sci,     # columns to pivot
    names_to  = "subject",    # new key column
    values_to = "score"       # new value column
  )

# Selecting columns
pivot_longer(cols = -name)        # all except name
pivot_longer(cols = starts_with("q"))
pivot_longer(cols = where(is.numeric))

# Parse column names with patterns
# Cols: bp_before, bp_after, hr_before, hr_after
df |> pivot_longer(
  cols = -id,
  names_to = c("measure", "time"),
  names_sep = "_"
)

pivot_wider() — Long to Wide

Rows become columns. Opposite of pivot_longer.

# BEFORE (long)
#   name  subject  score
#   Ali   math      88
#   Ali   sci       92
#   Bo    math      79
#   Bo    sci       85
# AFTER (wide)
#   name  math  sci
#   Ali    88   92
#   Bo     79   85
long_df |>
  pivot_wider(
    names_from  = subject,    # column for names
    values_from = score       # column for values
  )

# Fill missing values
pivot_wider(
  names_from = subject,
  values_from = score,
  values_fill = 0             # NA -> 0
)

# Multiple value columns
pivot_wider(
  names_from = year,
  values_from = c(revenue, profit)
)

separate() — Split a Column

# BEFORE
#   date
#   2024-03-15
#   2024-07-22
# AFTER
#   year  month  day
#   2024  03     15
#   2024  07     22
df |>
  separate(
    col  = date,
    into = c("year", "month", "day"),
    sep  = "-"
  )

# Keep original column
separate(date, into = c("y","m","d"),
         sep = "-", remove = FALSE)

# Convert types automatically
separate(date, into = c("y","m","d"),
         sep = "-", convert = TRUE)

# By position (character index)
separate(code, into = c("prefix","num"),
         sep = 3)  # split after 3rd char

# Newer alternative: separate_wider_delim()
df |> separate_wider_delim(
  date, delim = "-",
  names = c("year", "month", "day")
)

unite() — Combine Columns

# BEFORE
#   year  month  day
#   2024  03     15
#   2024  07     22
# AFTER
#   date
#   2024-03-15
#   2024-07-22
df |>
  unite(
    col = "date",       # new column name
    year, month, day,   # columns to combine
    sep = "-"           # separator
  )

# Keep original columns
unite("full_name", first, last,
      sep = " ", remove = FALSE)

Missing Values

# Drop rows with any NA
df |> drop_na()

# Drop rows where specific columns are NA
df |> drop_na(score, grade)

# Replace NA with a value
df |> replace_na(list(
  score = 0,
  grade = "Unknown"
))

# Fill NA with previous/next value
#   quarter  revenue
#   Q1       100
#   NA       120     <- should be Q1
#   Q2       90
df |> fill(quarter, .direction = "down")
# .direction: "down", "up", "downup", "updown"

# Complete: make implicit NAs explicit
df |> complete(name, year)
# Ensures every name x year combination
# exists, filling missing with NA

df |> complete(name, year, fill = list(val=0))

nest() & unnest()

# Nest: collapse rows into list-columns
nested <- mtcars |>
  group_by(cyl) |>
  nest()
# Result: tibble with 3 rows (4,6,8 cyl)
# each row has a "data" list-column holding
# the subset data frame

# Useful with purrr for models per group
library(purrr)
nested |>
  mutate(
    model = map(data, ~ lm(mpg ~ wt, data=.x)),
    rsq = map_dbl(model, ~ summary(.x)$r.squared)
  )

# Unnest: expand list-columns back to rows
nested |> unnest(cols = data)

# Unnest a column of vectors
tibble(
  group = c("a", "b"),
  values = list(1:3, 4:5)
) |> unnest(values)
#   group  values
#   a      1
#   a      2
#   a      3
#   b      4
#   b      5

Common Reshaping Recipes

# Survey data: one row per student per Q
survey_wide |>
  pivot_longer(q1:q20,
    names_to = "question",
    names_prefix = "q",       # strip "q"
    names_transform = as.integer,
    values_to = "response")

# Summary table for reporting
results |>
  group_by(region, year) |>
  summarise(revenue = sum(rev)) |>
  pivot_wider(
    names_from = year,
    values_from = revenue
  )

# Correlation-ready format
df |>
  pivot_wider(
    id_cols = date,
    names_from = stock,
    values_from = price
  )