Part 1 Exercise 2: Taming data and multiples

Push the knit button!

library(tidyverse) # contains ggplot2, dplyr, tidyr, etc
library(agridat) # for `nass.wheat`, `nass.corn`, `nass.soybean` datasets
library(catdata) # for `heart` data

`nass.wheat`, `nass.corn`, `nass.soybean` dataset

glimpse(nass.wheat)

## Rows: 5,963
## Columns: 4
## $ year  <int> 1866, 1866, 1866, 1866, 1866, 1866, 1866, 1866, 1866, 1866, 1866…
## $ state <fct> Alabama, Arkansas, California, Connecticut, Delaware, Georgia, I…
## $ acres <dbl> 125000, 50000, 650000, 2000, 59000, 245000, 2300000, 1550000, 11…
## $ yield <dbl> 5.0, 6.5, 18.0, 17.5, 11.0, 4.0, 10.5, 10.0, 13.0, 19.0, 6.5, 13…

glimpse(nass.corn)

## Rows: 6,381
## Columns: 4
## $ year  <int> 1866, 1866, 1866, 1866, 1866, 1866, 1866, 1866, 1866, 1866, 1866…
## $ state <fct> Alabama, Arkansas, California, Connecticut, Delaware, Florida, G…
## $ acres <dbl> 1050000, 280000, 42000, 57000, 200000, 125000, 1770000, 4925000,…
## $ yield <dbl> 9.0, 18.0, 28.0, 34.0, 23.0, 9.0, 6.0, 29.0, 36.5, 32.0, 28.0, 2…

glimpse(nass.soybean)

## Rows: 2,528
## Columns: 4
## $ year  <int> 1924, 1924, 1924, 1924, 1924, 1924, 1924, 1924, 1924, 1924, 1924…
## $ state <fct> Alabama, Arkansas, Delaware, Georgia, Illinois, Indiana, Iowa, K…
## $ acres <dbl> 3000, 3000, 12000, 10000, 115000, 66000, 10000, 2000, 9000, 8000…
## $ yield <dbl> 6.5, 6.5, 11.0, 5.5, 12.0, 9.9, 12.0, 11.0, 9.5, 8.0, 11.8, 13.0…

Exercise 2.1

Hint: the curve is the loess curve from geom_smooth

# fill all ... and change eval = FALSE to eval = TRUE when done

df <- bind_rows(mutate(nass.wheat, crop = "Wheat"),
                mutate(nass.corn, crop = "Corn"),
                mutate(nass.soybean, crop = "Soy")) %>% 
  filter(state %in% c("New York", "Michigan", "Indiana", "Montana", "Washington"))

ggplot(df, aes(..., ...)) + 
  geom_...() + 
  ...() + 
  facet_grid(... ~ ...)

Exercise 2.2

This is using nass.corn data.
Hint: you can use the cut function to divide a numerical variable into intervals.

# fill all ... and change eval = FALSE to eval = TRUE when done
ggplot(nass.corn, aes(..., fill = cut(..., 4, dig.lab = 4))) + 
  geom_...(alpha = 0.6) +
  labs(fill = "Period", x = "Average Yield", y = "Density")

Exercise 2.3

Hint: the density is plotted using geom_density_2d

# fill all ... and change eval = FALSE to eval = TRUE when done
df2 <- left_join(rename(nass.wheat, 
                        yield_wheat = yield, acres_wheat = acres), 
                 rename(nass.corn, 
                        yield_corn = yield, acres_corn = acres),
                 by = c("year", "state"))

ggplot(df2, aes(..., ...)) + 
  geom_...(aes(color = ...)) +
  geom_density_2d(color = "black") + 
  labs(x = "Yield of corn", y = "Yield of wheat", color = "Year")

Exercise 2.4

# fill all ... and change eval = FALSE to eval = TRUE when done
df3 <- bind_rows(mutate(nass.wheat, crop = "Wheat"),
                mutate(nass.corn, crop = "Corn"),
                mutate(nass.soybean, crop = "Soy")) 

ggplot(df3, aes(..., ...)) + 
  geom_...() + 
  geom_...(width = 0.1)

Exercise 2.5

Note: this is the yield of wheat for year 2011.

# fill all ... and change eval = FALSE to eval = TRUE when done
df4 <- nass.wheat %>% 
  filter(year == 2011) 
df4highlight <- filter(df4, acres > 4000000 | yield > 80)

ggplot(df4, aes(..., yield)) + 
  ...() + 
  ...(data = df4highlight, color = "red") + 
  geom_text(data = df4highlight, aes(label = ...), nudge_y = 5) +
  ggtitle("Year 2011")

`glass` dataset

data("glass")
glimpse(glass)

## Rows: 214
## Columns: 10
## $ RI   <dbl> 1.52101, 1.51761, 1.51618, 1.51766, 1.51742, 1.51596, 1.51743, 1.…
## $ Na   <dbl> 13.64, 13.89, 13.53, 13.21, 13.27, 12.79, 13.30, 13.15, 14.04, 13…
## $ Mg   <dbl> 4.49, 3.60, 3.55, 3.69, 3.62, 3.61, 3.60, 3.61, 3.58, 3.60, 3.46,…
## $ Al   <dbl> 1.10, 1.36, 1.54, 1.29, 1.24, 1.62, 1.14, 1.05, 1.37, 1.36, 1.56,…
## $ Si   <dbl> 71.78, 72.73, 72.99, 72.61, 73.08, 72.97, 73.09, 73.24, 72.08, 72…
## $ K    <dbl> 0.06, 0.48, 0.39, 0.57, 0.55, 0.64, 0.58, 0.57, 0.56, 0.57, 0.67,…
## $ Ca   <dbl> 8.75, 7.83, 7.78, 8.22, 8.07, 8.07, 8.17, 8.24, 8.30, 8.40, 8.09,…
## $ Ba   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ Fe   <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.26, 0.00, 0.00, 0.00, 0.11, 0.24,…
## $ type <fct> type1, type1, type1, type1, type1, type1, type1, type1, type1, ty…

Exercise 2.6

Remember that you can find more about the glass dataset after loaded by executing ?glass in R.

# fill all ... and change eval = FALSE to eval = TRUE when done
df5 <- glass %>% 
  mutate(id = paste0("glass", 1:n())) %>% 
  pivot_longer(-c(RI, type, id), names_to = "element", values_to = "oxide")

ggplot(df5, aes(..., ...)) + 
  geom_point(aes(color = ...)) + 
  facet_wrap(~..., scale = "free_y") + 
  labs(x = "Refractive index", y = "Oxide content", color = "Glass")

Exercise 2.7

You can reuse the wrangled data from Exercise 2.6.

# fill all ... and change eval = FALSE to eval = TRUE when done
ggplot(df5, aes(..., ...)) + 
  geom_...() + 
  facet_wrap(~..., scale = "free_y")

Exercise 2.8

This is a pairwise scatterplot in the lower triangle and upper triangle shows the pairwise correlation coefficient.
This is quite hard! There is an extension package (GGally) that does easily but try to challenge yourself without using it.

# fill all ... and change eval = FALSE to eval = TRUE when done
elements <- sort(c("Na", "Mg", "Al", "Si", "K", "Ca", "Ba", "Fe"))
nele <- length(elements)
# this uses `purrr` but many other ways to do this
# below is like using is two for loops, 
# but `purrrr::map_dfr` ensures the return type is data frame appended by row
df6 <- map_dfr(1:(nele - 1),  
               function(i) {
                 map_dfr((i + 1):nele, function(j) {
                   ele1 <- elements[i]
                   ele2 <- elements[j]
                   mutate(glass, 
                          element1 = ele1, element2 = ele2, 
                          oxide1 = glass[[ele1]], oxide2 = glass[[ele2]]) %>% 
                   select(RI, type, oxide1, oxide2, element1, element2)
                })}) 

cordat <- cor(select(glass, all_of(elements))) %>% 
  as_tibble() %>% 
  mutate(element1 = colnames(.)) %>% 
  pivot_longer(-element1, 
               names_to = "element2",
               values_to = "cor") %>% 
  mutate(i1 = as.integer(factor(element1)),
         i2 = as.integer(factor(element2))) %>% 
  filter(i1 > i2) %>% 
  # get the center points
  rowwise() %>% 
  mutate(oxide1 = mean(range(glass[[element1]])),
         oxide2 = mean(range(glass[[element2]])))

# also check out `GGally::ggpairs(select(glass, all_of(elements)))`
  
ggplot(df6, aes(..., ...)) + 
  geom_point() + 
  # remember formula is y ~ x
  facet_grid(... ~ ..., scales = "free") +
  geom_text(data = cordat, aes(label = round(cor, 2)), color = "red", size = 3)

Exercise 2.9

This shows the QQ-plot of the residuals from the fit of a simple linear model Al ~ Ba.
Note: you can use broom::augment(lm(Al ~ Ba, data = glass)) to get a data frame with residuals easily.

# fill all ... and change eval = FALSE to eval = TRUE when done
fit <- lm(Al ~ Ba, data = glass) %>% 
  broom::augment()
ggplot(fit, aes(sample = ...)) + 
  geom_qq() + 
  geom_qq_line(color = "red")

Exercise 2.10

This is the residual plot from fit of a simple linear model Al ~ Ba.

# fill all ... and change eval = FALSE to eval = TRUE when done
ggplot(fit, aes(Ba, .resid)) + 
  geom_...(yintercept = 0, linetype = "dashed") +
  ...() +
  ...(y = "Residual")

Part 1 Exercise 2: Taming data and multiples

nass.wheat, nass.corn, nass.soybean dataset

Exercise 2.1

Exercise 2.2

Exercise 2.3

Exercise 2.4

Exercise 2.5

glass dataset

Exercise 2.6

Exercise 2.7

Exercise 2.8

Exercise 2.9

Exercise 2.10

`nass.wheat`, `nass.corn`, `nass.soybean` dataset

`glass` dataset