Push the knit
button!
library(tidyverse) # contains ggplot2, dplyr, tidyr, etc
# Read the tuberculosis dataset and reshape it to one row per
# country / year / age group / sex.
tb <- read_csv(here::here("data/TB_notifications_2020-07-01.csv")) %>%
  dplyr::select(country, iso3, year, new_sp_m04:new_sp_fu) %>%
  # Stack the new_sp_* columns: each column name goes into `sexage`,
  # each cell value into `count`.
  pivot_longer(
    cols = new_sp_m04:new_sp_fu,
    names_to = "sexage",
    values_to = "count"
  ) %>%
  mutate(
    sexage = str_replace(sexage, "new_sp_", ""),
    # The first character is the sex code; the rest is the age code.
    sex = str_sub(sexage, 1, 1),
    # With no `end`, str_sub() runs to the end of each string, so the result
    # does not depend on how many rows the table has.
    age = str_sub(sexage, 2)
  ) %>%
  dplyr::select(-sexage) %>%
  filter(!(age %in% c("04", "014", "514", "u"))) %>%
  filter(year > 1996, year < 2013) %>%
  # factor() sorts the remaining age codes and applies the labels in that
  # sorted order, so exactly six distinct codes must be left at this point.
  mutate(
    age_group = factor(
      age,
      labels = c("15-24", "25-34", "35-44", "45-54", "55-64", "65-")
    )
  ) %>%
  dplyr::select(country, year, age_group, sex, count)
# Keep only the rows recorded for Australia
tb_oz <- filter(tb, country == "Australia")
The side-by-side bar chart reveals the spikes better. It allows the bar heights to be compared directly because the bars sit next to each other. The faceted bar chart might be better still.
“Is the proportion of TB incidence in males relative to females increasing with age?”
# Pie charts of the 2012 age-group composition, one panel per sex.
# Bars are filled to a constant height and then wrapped onto polar
# coordinates, so each slice shows an age group's share of the counts.
tb_oz %>%
  filter(year == 2012) %>%
  ggplot(aes(x = 1, y = count, fill = age_group)) +
  geom_col(position = "fill") +
  facet_wrap(vars(sex), ncol = 6) +
  scale_fill_brewer(name = "", palette = "Dark2") +
  labs(x = "", y = "") +
  coord_polar(theta = "y")
# 2012 counts against age group, with a straight-line fit per sex.
tb_oz %>%
  filter(year == 2012) %>%
  # Convert the factor to its integer codes so a line can be fitted over it.
  mutate(age_group = as.numeric(age_group)) %>%
  ggplot(aes(x = age_group, y = count, colour = sex)) +
  geom_point() +
  # Spell out FALSE: `F` is an ordinary variable that can be reassigned.
  geom_smooth(method = "lm", se = FALSE) +
  scale_colour_brewer("", palette = "Dark2")
Because it shows proportions by sex, this plot loses some of the difference in trend between the sexes over the years. Fix it.
# Proportion of male counts per year, one panel per age group, with a
# white reference line at 0.5.
tb_oz %>%
  group_by(year, age_group) %>%
  # Male count divided by the total count within each year / age group.
  # Drop the grouping afterwards so no leftover grouping reaches the plot.
  summarise(p = count[sex == "m"] / sum(count), .groups = "drop") %>%
  ggplot(aes(x = year, y = p)) +
  geom_hline(yintercept = 0.50, colour = "white", size = 2) +
  geom_point() +
  # Spell out FALSE: `F` is an ordinary variable that can be reassigned.
  geom_smooth(se = FALSE) +
  facet_wrap(~age_group, ncol = 6) +
  ylab("proportion of males")
# Counts over the years, coloured by sex, one panel per age group, with a
# straight-line fit for each sex.
tb_oz %>%
  ggplot(aes(x = year, y = count, colour = sex)) +
  geom_point() +
  # Spell out FALSE: `F` is an ordinary variable that can be reassigned.
  geom_smooth(method = "lm", se = FALSE) +
  facet_wrap(~age_group, ncol = 6) +
  scale_colour_brewer("", palette = "Dark2")