Push the knit button!

library(tidyverse) # contains ggplot2, dplyr, tidyr, etc
library(agridat) # for `nass.wheat`, `nass.corn`, `nass.soybean` datasets
library(catdata) # for `heart` data

nass.wheat, nass.corn, nass.soybean dataset

glimpse(nass.wheat)
## Rows: 5,963
## Columns: 4
## $ year  <int> 1866, 1866, 1866, 1866, 1866, 1866, 1866, 1866, 1866, 1866, 1866…
## $ state <fct> Alabama, Arkansas, California, Connecticut, Delaware, Georgia, I…
## $ acres <dbl> 125000, 50000, 650000, 2000, 59000, 245000, 2300000, 1550000, 11…
## $ yield <dbl> 5.0, 6.5, 18.0, 17.5, 11.0, 4.0, 10.5, 10.0, 13.0, 19.0, 6.5, 13…
glimpse(nass.corn)
## Rows: 6,381
## Columns: 4
## $ year  <int> 1866, 1866, 1866, 1866, 1866, 1866, 1866, 1866, 1866, 1866, 1866…
## $ state <fct> Alabama, Arkansas, California, Connecticut, Delaware, Florida, G…
## $ acres <dbl> 1050000, 280000, 42000, 57000, 200000, 125000, 1770000, 4925000,…
## $ yield <dbl> 9.0, 18.0, 28.0, 34.0, 23.0, 9.0, 6.0, 29.0, 36.5, 32.0, 28.0, 2…
glimpse(nass.soybean)
## Rows: 2,528
## Columns: 4
## $ year  <int> 1924, 1924, 1924, 1924, 1924, 1924, 1924, 1924, 1924, 1924, 1924…
## $ state <fct> Alabama, Arkansas, Delaware, Georgia, Illinois, Indiana, Iowa, K…
## $ acres <dbl> 3000, 3000, 12000, 10000, 115000, 66000, 10000, 2000, 9000, 8000…
## $ yield <dbl> 6.5, 6.5, 11.0, 5.5, 12.0, 9.9, 12.0, 11.0, 9.5, 8.0, 11.8, 13.0…

Exercise 2.1

  • Hint: the curve is the loess curve from geom_smooth
# fill all ... and change eval = FALSE to eval = TRUE when done

df <- bind_rows(mutate(nass.wheat, crop = "Wheat"),
                mutate(nass.corn, crop = "Corn"),
                mutate(nass.soybean, crop = "Soy")) %>% 
  filter(state %in% c("New York", "Michigan", "Indiana", "Montana", "Washington"))

ggplot(df, aes(..., ...)) + 
  geom_...() + 
  ...() + 
  facet_grid(... ~ ...)

Exercise 2.2

  • This is using nass.corn data.
  • Hint: you can use the cut function to divide a numerical variable into intervals.
# fill all ... and change eval = FALSE to eval = TRUE when done
ggplot(nass.corn, aes(..., fill = cut(..., 4, dig.lab = 4))) + 
  geom_...(alpha = 0.6) +
  labs(fill = "Period", x = "Average Yield", y = "Density")

Exercise 2.3

  • Hint: the density is plotted using geom_density_2d
# fill all ... and change eval = FALSE to eval = TRUE when done
df2 <- left_join(rename(nass.wheat, 
                        yield_wheat = yield, acres_wheat = acres), 
                 rename(nass.corn, 
                        yield_corn = yield, acres_corn = acres),
                 by = c("year", "state"))

ggplot(df2, aes(..., ...)) + 
  geom_...(aes(color = ...)) +
  geom_density_2d(color = "black") + 
  labs(x = "Yield of corn", y = "Yield of wheat", color = "Year")

Exercise 2.4

# fill all ... and change eval = FALSE to eval = TRUE when done
df3 <- bind_rows(mutate(nass.wheat, crop = "Wheat"),
                mutate(nass.corn, crop = "Corn"),
                mutate(nass.soybean, crop = "Soy")) 

ggplot(df3, aes(..., ...)) + 
  geom_...() + 
  geom_...(width = 0.1)

Exercise 2.5

  • Note: this is the yield of wheat for year 2011.
# fill all ... and change eval = FALSE to eval = TRUE when done
df4 <- nass.wheat %>% 
  filter(year == 2011) 
df4highlight <- filter(df4, acres > 4000000 | yield > 80)

ggplot(df4, aes(..., yield)) + 
  ...() + 
  ...(data = df4highlight, color = "red") + 
  geom_text(data = df4highlight, aes(label = ...), nudge_y = 5) +
  ggtitle("Year 2011")

glass dataset

data("glass")
glimpse(glass)
## Rows: 214
## Columns: 10
## $ RI   <dbl> 1.52101, 1.51761, 1.51618, 1.51766, 1.51742, 1.51596, 1.51743, 1.…
## $ Na   <dbl> 13.64, 13.89, 13.53, 13.21, 13.27, 12.79, 13.30, 13.15, 14.04, 13…
## $ Mg   <dbl> 4.49, 3.60, 3.55, 3.69, 3.62, 3.61, 3.60, 3.61, 3.58, 3.60, 3.46,…
## $ Al   <dbl> 1.10, 1.36, 1.54, 1.29, 1.24, 1.62, 1.14, 1.05, 1.37, 1.36, 1.56,…
## $ Si   <dbl> 71.78, 72.73, 72.99, 72.61, 73.08, 72.97, 73.09, 73.24, 72.08, 72…
## $ K    <dbl> 0.06, 0.48, 0.39, 0.57, 0.55, 0.64, 0.58, 0.57, 0.56, 0.57, 0.67,…
## $ Ca   <dbl> 8.75, 7.83, 7.78, 8.22, 8.07, 8.07, 8.17, 8.24, 8.30, 8.40, 8.09,…
## $ Ba   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ Fe   <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.26, 0.00, 0.00, 0.00, 0.11, 0.24,…
## $ type <fct> type1, type1, type1, type1, type1, type1, type1, type1, type1, ty…

Exercise 2.6

  • Remember that you can find more about the glass dataset after loaded by executing ?glass in R.
# fill all ... and change eval = FALSE to eval = TRUE when done
df5 <- glass %>% 
  mutate(id = paste0("glass", 1:n())) %>% 
  pivot_longer(-c(RI, type, id), names_to = "element", values_to = "oxide")

ggplot(df5, aes(..., ...)) + 
  geom_point(aes(color = ...)) + 
  facet_wrap(~..., scale = "free_y") + 
  labs(x = "Refractive index", y = "Oxide content", color = "Glass")

Exercise 2.7

  • You can reuse the wrangled data from Exercise 2.6.
# fill all ... and change eval = FALSE to eval = TRUE when done
ggplot(df5, aes(..., ...)) + 
  geom_...() + 
  facet_wrap(~..., scale = "free_y")

Exercise 2.8

  • This is a pairwise scatterplot in the lower triangle and upper triangle shows the pairwise correlation coefficient.
  • This is quite hard! There is an extension package (GGally) that does easily but try to challenge yourself without using it.
# fill all ... and change eval = FALSE to eval = TRUE when done
elements <- sort(c("Na", "Mg", "Al", "Si", "K", "Ca", "Ba", "Fe"))
nele <- length(elements)
# this uses `purrr` but many other ways to do this
# below is like using is two for loops, 
# but `purrrr::map_dfr` ensures the return type is data frame appended by row
df6 <- map_dfr(1:(nele - 1),  
               function(i) {
                 map_dfr((i + 1):nele, function(j) {
                   ele1 <- elements[i]
                   ele2 <- elements[j]
                   mutate(glass, 
                          element1 = ele1, element2 = ele2, 
                          oxide1 = glass[[ele1]], oxide2 = glass[[ele2]]) %>% 
                   select(RI, type, oxide1, oxide2, element1, element2)
                })}) 

cordat <- cor(select(glass, all_of(elements))) %>% 
  as_tibble() %>% 
  mutate(element1 = colnames(.)) %>% 
  pivot_longer(-element1, 
               names_to = "element2",
               values_to = "cor") %>% 
  mutate(i1 = as.integer(factor(element1)),
         i2 = as.integer(factor(element2))) %>% 
  filter(i1 > i2) %>% 
  # get the center points
  rowwise() %>% 
  mutate(oxide1 = mean(range(glass[[element1]])),
         oxide2 = mean(range(glass[[element2]])))

# also check out `GGally::ggpairs(select(glass, all_of(elements)))`
  
ggplot(df6, aes(..., ...)) + 
  geom_point() + 
  # remember formula is y ~ x
  facet_grid(... ~ ..., scales = "free") +
  geom_text(data = cordat, aes(label = round(cor, 2)), color = "red", size = 3)

Exercise 2.9

  • This shows the QQ-plot of the residuals from the fit of a simple linear model Al ~ Ba.
  • Note: you can use broom::augment(lm(Al ~ Ba, data = glass)) to get a data frame with residuals easily.
# fill all ... and change eval = FALSE to eval = TRUE when done
fit <- lm(Al ~ Ba, data = glass) %>% 
  broom::augment()
ggplot(fit, aes(sample = ...)) + 
  geom_qq() + 
  geom_qq_line(color = "red")

Exercise 2.10

  • This is the residual plot from fit of a simple linear model Al ~ Ba.
# fill all ... and change eval = FALSE to eval = TRUE when done
ggplot(fit, aes(Ba, .resid)) + 
  geom_...(yintercept = 0, linetype = "dashed") +
  ...() +
  ...(y = "Residual")