STAT1003 – Pivotting data

Evolving language

Earlier efforts to transform data from wide to long was in the reshape, first released on CRAN in 2005-08-05.
It was then superseded by reshape2 released on CRAN in 2010-09-10.
Then finally tidyr released on CRAN in 2014-07-21 ^*v1.0.0 released 2019-09-12.

Wide to long

reshape::melt
reshape2::melt
tidyr::gather
tidyr::pivot_longer^*

Long to wide

reshape::cast
reshape2::dcast
tidyr::spread
tidyr::pivot_wider^*

Hadley Wickham (2020). tidyr: Tidy Messy Data. R package version 1.1.2.

Hadley Wickham (2007). Reshaping Data with the reshape Package. Journal of Statistical Software, 21(12), 1-20

Pivotting data with `tidyr` Part 1

Data

df_wide <- tibble::tibble(state = c("NSW", "VIC", "ACT"),
                          `2019` = c(8130159, 6655284, 427892),
                          `2018` = c(80366651, 6528601, 423169),
                          `2017` = c(7919815, 6387081, 415874))

df_long <- data.frame(state = c("NSW", "NSW", "NSW", "VIC", "VIC", "VIC", "ACT", "ACT", "ACT"),
                      year = c("2019", "2018", "2017", "2019", 
                               "2018", "2017", "2019", "2018", "2017"), 
                      population = c(8130159, 80366651, 7919815, 6655284, 6528601, 6387081, 427892, 423169, 415874))

df_wide
state	2019	2018	2017
NSW	8130159	80366651	7919815
VIC	6655284	6528601	6387081
ACT	427892	423169	415874

df_long
state	year	population
NSW	2019	8130159
NSW	2018	80366651
NSW	2017	7919815
VIC	2019	6655284
VIC	2018	6528601
VIC	2017	6387081
ACT	2019	427892
ACT	2018	423169
ACT	2017	415874

Pivotting data with `tidyr` Part 2

Data

yield_long <- data.frame(year = c(1900, 1900, 2000, 1900, 1900, 2000, 2000),
                         state = c("Iowa", "Kansas", "Kansas", "Iowa", "Kansas", "Iowa", "Kansas"),
                         crop = c("barley", "barley", "barley", "wheat", "wheat", "wheat", "wheat"),
                         yield = c(28.5, 18, 35, 14.4, 18.2, 47, 37))

yield_long
year	state	crop	yield
1900	Iowa	barley	28.5
1900	Kansas	barley	18.0
2000	Kansas	barley	35.0
1900	Iowa	wheat	14.4
1900	Kansas	wheat	18.2
2000	Iowa	wheat	47.0
2000	Kansas	wheat	37.0

yield_wide
year	state	barley_yield	wheat_yield
1900	Iowa	28.5	14.4
1900	Kansas	18.0	18.2
2000	Kansas	35.0	37.0
2000	Iowa	NA	47.0

yield_long yield_wide

Pivotting data with `tidyr` Part 3

Data

yield_wide <- data.frame(year = c(1900, 1900, 2000, 2000), 
                         state = c("Iowa", "Kansas", "Kansas", "Iowa"),
                         barley_yield = c(28.5, 18, 35, NA), 
                         wheat_yield = c(14.4, 18.2, 37, 47))

yield_wide
year	state	barley_yield	wheat_yield
1900	Iowa	28.5	14.4
1900	Kansas	18.0	18.2
2000	Kansas	35.0	37.0
2000	Iowa	NA	47.0

yield_long
year	state	crop	yield
1900	Iowa	barley	28.5
1900	Kansas	barley	18.0
2000	Kansas	barley	35.0
1900	Iowa	wheat	14.4
1900	Kansas	wheat	18.2
2000	Iowa	wheat	47.0
2000	Kansas	wheat	37.0

yield_wide yield_long

Pivotting data with `tidyr` Part 4

Data

crop_long <- data.frame(year = c(1900L, 1900L, 1900L, 1900L, 2000L, 2000L, 1900L, 1900L, 1900L, 1900L, 2000L, 2000L, 2000L, 2000L),
                        state = c("Iowa", "Iowa", "Kansas", "Kansas", "Kansas", "Kansas", "Iowa", "Iowa", "Kansas", "Kansas", "Iowa", "Iowa", "Kansas", "Kansas"),
                        crop = c("barley", "barley", "barley", "barley", "barley", "barley", "wheat", "wheat", "wheat", "wheat", "wheat", "wheat", "wheat", "wheat"),
                        metric = c("yield", "acres", "yield", "acres", "yield", "acres", "yield", "acres", "yield", "acres", "yield", "acres", "yield", "acres"),
                        value = c(28.5, 620000, 18, 127000, 35, 7000, 14.4, 1450000, 18.2, 4290000, 47, 18000, 37, 9400000))

crop_long
year	state	crop	metric	value
1900	Iowa	barley	yield	28.5
1900	Iowa	barley	acres	620,000.0
1900	Kansas	barley	yield	18.0
1900	Kansas	barley	acres	127,000.0
2000	Kansas	barley	yield	35.0
2000	Kansas	barley	acres	7,000.0
1900	Iowa	wheat	yield	14.4
1900	Iowa	wheat	acres	1,450,000.0
1900	Kansas	wheat	yield	18.2
1900	Kansas	wheat	acres	4,290,000.0
2000	Iowa	wheat	yield	47.0
2000	Iowa	wheat	acres	18,000.0
2000	Kansas	wheat	yield	37.0
2000	Kansas	wheat	acres	9,400,000.0

crop_wide
year	state	barley_yield	wheat_yield	barley_acres	wheat_acres
1900	Iowa	28.5	14.4	620000	1450000
1900	Kansas	18.0	18.2	127000	4290000
2000	Kansas	35.0	37.0	7000	9400000
2000	Iowa	NA	47.0	NA	18000

crop_long crop_wide

Pivotting data with `tidyr` Part 5

Data

crop_wide <- data.frame(year = c(1900L, 1900L, 2000L, 2000L), 
                        state = c("Iowa", "Kansas", "Kansas", "Iowa"), 
                        barley_yield = c(28.5, 18, 35, NA), 
                        wheat_yield = c(14.4, 18.2, 37, 47), 
                        barley_acres = c(620000, 127000, 7000, NA), 
                        wheat_acres = c(1450000, 4290000, 9400000, 18000))

crop_wide
year	state	barley_yield	wheat_yield	barley_acres	wheat_acres
1900	Iowa	28.5	14.4	620000	1450000
1900	Kansas	18.0	18.2	127000	4290000
2000	Kansas	35.0	37.0	7000	9400000
2000	Iowa	NA	47.0	NA	18000

crop_wide crop_long

crop_long
year	state	crop	metric	value
1900	Iowa	barley	yield	28.5
1900	Iowa	barley	acres	620,000.0
1900	Kansas	barley	yield	18.0
1900	Kansas	barley	acres	127,000.0
2000	Kansas	barley	yield	35.0
2000	Kansas	barley	acres	7,000.0
1900	Iowa	wheat	yield	14.4
1900	Iowa	wheat	acres	1,450,000.0
1900	Kansas	wheat	yield	18.2
1900	Kansas	wheat	acres	4,290,000.0
2000	Iowa	wheat	yield	47.0
2000	Iowa	wheat	acres	18,000.0
2000	Kansas	wheat	yield	37.0
2000	Kansas	wheat	acres	9,400,000.0

Pivotting data with `tidyr` Part 6

Data

crop_long <- data.frame(year = c(1900L, 1900L, 1900L, 1900L, 2000L, 2000L, 1900L, 1900L, 1900L, 1900L, 2000L, 2000L, 2000L, 2000L),
                        state = c("Iowa", "Iowa", "Kansas", "Kansas", "Kansas", "Kansas", "Iowa", "Iowa", "Kansas", "Kansas", "Iowa", "Iowa", "Kansas", "Kansas"),
                        crop = c("barley", "barley", "barley", "barley", "barley", "barley", "wheat", "wheat", "wheat", "wheat", "wheat", "wheat", "wheat", "wheat"),
                        metric = c("yield", "acres", "yield", "acres", "yield", "acres", "yield", "acres", "yield", "acres", "yield", "acres", "yield", "acres"),
                        value = c(28.5, 620000, 18, 127000, 35, 7000, 14.4, 1450000, 18.2, 4290000, 47, 18000, 37, 9400000))

crop_long
year	state	crop	metric	value
1900	Iowa	barley	yield	2.85e+01
1900	Iowa	barley	acres	6.20e+05
1900	Kansas	barley	yield	1.80e+01
1900	Kansas	barley	acres	1.27e+05
2000	Kansas	barley	yield	3.50e+01
2000	Kansas	barley	acres	7.00e+03
1900	Iowa	wheat	yield	1.44e+01
1900	Iowa	wheat	acres	1.45e+06
1900	Kansas	wheat	yield	1.82e+01
1900	Kansas	wheat	acres	4.29e+06
2000	Iowa	wheat	yield	4.70e+01
2000	Iowa	wheat	acres	1.80e+04
2000	Kansas	wheat	yield	3.70e+01
2000	Kansas	wheat	acres	9.40e+06

crop_tidy
year	state	crop	yield	acres
1900	Iowa	barley	28.5	620000
1900	Kansas	barley	18.0	127000
2000	Kansas	barley	35.0	7000
1900	Iowa	wheat	14.4	1450000
1900	Kansas	wheat	18.2	4290000
2000	Iowa	wheat	47.0	18000
2000	Kansas	wheat	37.0	9400000

crop_long crop_tidy

Separate values into columns

Data

pkg_dat <- data.frame(package = c("dplyr", "magrittr", "rlang", "stringr", "tibble", "tidyr", "tidyselect"), 
                      maintainer = c("Hadley Wickham", "Lionel Henry", "Lionel Henry", "Hadley Wickham", "Kirill Müller", "Hadley Wickham", "Lionel Henry"))

pkg_dat
package	maintainer
dplyr	Hadley Wickham
magrittr	Lionel Henry
rlang	Lionel Henry
stringr	Hadley Wickham
tibble	Kirill Müller
tidyr	Hadley Wickham
tidyselect	Lionel Henry

🎯 separate maintainer name to columns, first name and last name

Separate values into rows

Data

author_dat <- data.frame(package = c("dplyr", "magrittr", "rlang", "stringr", "tibble", "tidyr", "tidyselect"), 
                         author = c("Hadley Wickham, Romain François, Lionel Henry, Kirill Müller", "Lionel Henry, Stefan Milton Bache, Hadley Wickham", "Lionel Henry, Hadley Wickham", "Hadley Wickham", "Kirill Müller, Hadley Wickham", "Hadley Wickham", "Lionel Henry, Hadley Wickham"))

author_dat
package	author
dplyr	Hadley Wickham, Romain François, Lionel Henry, Kirill Müller
magrittr	Lionel Henry, Stefan Milton Bache, Hadley Wickham
rlang	Lionel Henry, Hadley Wickham
stringr	Hadley Wickham
tibble	Kirill Müller, Hadley Wickham
tidyr	Hadley Wickham
tidyselect	Lionel Henry, Hadley Wickham

Summary

tidyr package provides tools for data cleaning and reshaping.
pivot_longer() and pivot_wider() are used to convert data between long and wide formats.
separate_wider_delim() and separate_longer_delim() functions are used to split a single column into multiple columns or rows.

Pivotting data

Evolving language

Pivotting data with tidyr Part 1

Pivotting data with tidyr Part 2

Pivotting data with tidyr Part 3

Pivotting data with tidyr Part 4

Pivotting data with tidyr Part 5

Pivotting data with tidyr Part 6

Separate values into columns

Separate values into rows

Summary

stringr cheatsheet

Pivotting data with `tidyr` Part 1

Pivotting data with `tidyr` Part 2

Pivotting data with `tidyr` Part 3

Pivotting data with `tidyr` Part 4

Pivotting data with `tidyr` Part 5

Pivotting data with `tidyr` Part 6

`stringr` cheatsheet