```{r setup, echo = FALSE, message = FALSE, include = FALSE, warning=FALSE} # run setup script #install.packages("remotes") #library(remotes) #remotes::install_github("clauswilke/dviz.supp") #devtools::install_github("clauswilke/dviz.supp") library(colorspace) library(dplyr) library(tidyverse) library(ggforce) library(ggridges) library(treemapify) library(forcats) library(statebins) library(sf) library(cowplot) options(digits = 3) knitr::opts_chunk$set( echo = FALSE, message = FALSE, warning = FALSE, cache = FALSE, #dpi = 105, # not sure why, but need to divide this by 2 to get 210 at 6in, # which is 300 at 4.2in fig.align = 'center', fig.width = 6, fig.asp = 0.618, # 1 / phi fig.show = "hold" ) options(dplyr.print_min = 6, dplyr.print_max = 6) ``` # Introduction This note is compiled based on an online post of Professor Claus Wilke (at UT Austin) that provides a quick visual overview of the various plots and charts that are commonly used to visualize data. Wilke defined various functions based on ggplot2 and used them to make many aesthetically pleasant plots particularly the beautiful ridge plots. Since the graphs generated in this note involves significant coding, the source code are not included in this note. ```{r} ## general setup code # line_size = 0.6 # theme theme_plot_icon <- function(bg_color = "#F5F8EA", line_color = "#243400", line_size = .5, font_size = 14) { theme_dviz_open() %+replace% theme( axis.text.x = element_blank(), axis.text.y = element_blank(), axis.title.x = element_blank(), axis.title.y = element_blank(), #axis.line.x = element_blank(), #axis.line.y = element_blank(), #axis.ticks = element_blank(), axis.line.x = element_line(size = line_size, color = line_color), axis.line.y = element_line(size = line_size, color = line_color), axis.ticks = element_line(size = line_size, color = line_color), axis.ticks.length = grid::unit(4, "pt"), legend.position = "none", plot.margin = margin( font_size*8/14, font_size, font_size*10/14, font_size ), plot.title = element_text( hjust = 0.5, #family = dviz_font_family_bold, family = dviz_font_family_condensed, color = line_color, size = font_size, margin = margin(0, 0, font_size*6/14, 0) ), plot.background = element_rect(fill = bg_color) ) } theme_plot_icon_hgrid <- function(bg_color = "#F5F8EA", line_color = "#243400", line_size = .5, font_size = 14) { theme_plot_icon(bg_color, line_color, line_size, font_size) %+replace% theme( # make grid lines #panel.grid.major.y = element_line(colour = paste0(line_color, "30"), # size = 0.5), # remove x axis axis.ticks.x = element_blank(), axis.line.x = element_blank() ) } theme_plot_icon_vgrid <- function(bg_color = "#F5F8EA", line_color = "#243400", line_size = .5, font_size = 14) { theme_plot_icon(bg_color, line_color, line_size, font_size) %+replace% theme( # make grid lines #panel.grid.major.x = element_line(colour = paste0(line_color, "30"), # size = 0.5), # remove y axis axis.ticks.y = element_blank(), axis.line.y = element_blank() ) } theme_plot_icon_blank <- function(bg_color = "#F5F8EA", line_color = "#243400", line_size = .5, font_size = 14) { theme_plot_icon(bg_color, line_color, line_size, font_size) %+replace% theme( axis.ticks = element_blank(), axis.line.x = element_blank(), axis.line.y = element_blank(), axis.ticks.length = grid::unit(0, "pt") ) } # data sets set.seed(5142) n <- 15 x <- rnorm(n) y <- .4*x + .6*rnorm(n) df_scatter_xy <- data.frame(x, y) df_one_dist <- data.frame(x = c(rnorm(1000, 1., 1.6), rnorm(300, 4, .4))) df_one_normal <- data.frame(x = rnorm(20)) df_fractions <- data.frame(y = c(.3, .39, .48, .6, .25, .13, .22, .24, .45, .48, .3, .16), x = factor(rep(1:4, 3)), type = rep(c("A", "B", "C"), each = 4)) set.seed(2474) n <- 8 x <- rnorm(n) y <- .4*x + .6*rnorm(n) z <- .5*x + .3*rnorm(n) z <- (z - min(z) + 0.1)^2 df_scatter_xyz <- data.frame(x, y, z) set.seed(5012) df_multi_amounts <- mutate(df_fractions, y = c(1.0, 1.1, 1.4, 1.2)[x]*y) n <- 70 df_multi_dist <- data.frame(y = c(rnorm(n, 1, .8), rnorm(n, 2, .7), rnorm(n, 0, .5)), type = rep(c("A", "B", "C"), each = n), number = rep(c(2, 1, 3), each = n)) df_props = data.frame(value = c(55, 30, 15), group = c("A", "B", "C")) df_multi_props <- data.frame( var1 = rep(c("C", "B", "A"), 3), var2 = rep(c("A", "B", "C"), each = 3), count = c(4, 1, 2, 12, 9, 5, 4, 5, 4) ) %>% group_by(var2) %>% mutate(group_count = sum(count)) df_multi_props2 <- data.frame( var1 = rep(c("B", "A"), 9), var2 = rep(c("E", "E", "D", "D", "C", "C"), 3), var3 = rep(c("H", "G", "F"), each = 6), count = c(5, 8, 0, 0, 0, 0, 0, 3, 2, 7, 0, 0, 4, 0, 4, 2, 7, 4) ) df_sets <- gather_set_data(df_multi_props2, 1:3) df_one_line <- data.frame( x = 1:5, y = c(3.1, 3.3, 4.0, 3.8, 4.4) ) set.seed(9681) n1 <- 1500/5 n2 <- 800/5 x1 <- rnorm(n1, 0, .7) y1 <- 2 * x1 + rnorm(n1, 0, .8) x2 <- rnorm(n2, 0, 0.4) y2 <- 1.5 * x2 + rnorm(n2, .5, .8) df_dense_scatter <- na.omit( data.frame( x = scales::censor(c(x1, x2 + 2.2), c(-2, 4)), y = scales::censor(c(y1, y2 + 1.5), c(-3.5, 4.5)) ) ) y1 <- 2 * x1 + rnorm(n1, 0, 1.6) y2 <- 1.5 * x2 + rnorm(n2, .5, 1.6) df_dense_scatter_sample <- na.omit( data.frame( x = scales::censor(c(x1, x2 + 2.2), c(-2, 4)), y = scales::censor(c(y1, y2 + 1.5), c(-3.5, 4.5)) ) ) %>% sample_n(50) df_connected_scatter <- data.frame( x = c(1.9, 1.5, 2.2, 3, 3.3, 2.7, 1.7, 1), y = c(0.3, -1, -2.0, -0.9, .6, 1.8, 2, 0.7), t = 1:8 ) df_paired <- data.frame( y = c(6, 5.3, 3.8, 2.8, 2, 4.3, 6.1, 5.1, 3.3, 2.4), x = rep(c("A", "B"), each = 5), group = rep(1:5, 2) ) df_uncertain <- data.frame( type = c("A", "B", "C"), x = c(1.5, 2.2, 3.4), y = c(3.2, 5.1, 3.9), dx = c(.25, .3, .35), dy = c(.5, .4, .6) ) # palettes npal <- 5 # earth-brown (Amounts) pal_earth_brown <- sequential_hcl(n = npal, h1 = 71, c1 = 80, c2 = 10, l1 = 18, l2 = 97, p1 = 1.5) # brown-green (Proportions) pal_brown_green <- sequential_hcl(n = npal, h1 = 86, c1 = 80, c2 = 10, l1 = 18, l2 = 97, p1 = 1.5) # green-brown (Geospatial data) pal_green_brown <- sequential_hcl(n = npal, h1 = -265, c1 = 80, c2 = 10, l1 = 18, l2 = 97, p1 = 1.5) # burgundy-red pal_red_brown <- sequential_hcl(n = npal, h1 = 28, c1 = 80, c2 = 10, l1 = 18, l2 = 97, p1 = 1.5) # brown-red (Uncertainty) pal_brown_red <- sequential_hcl(n = npal, h1 = 41, c1 = 80, c2 = 10, l1 = 18, l2 = 97, p1 = 1.5) # ocean-blue (Distributions) pal_ocean_blue <- sequential_hcl(n = npal, h1 = 241, c1 = 80, c2 = 10, l1 = 18, l2 = 97, p1 = 1.5) # steel-blue (x-y relationships) pal_steel_blue <- sequential_hcl(n = npal, h1 = 257, c1 = 80, c2 = 10, l1 = 18, l2 = 97, p1 = 1.5) pal_steel_blue_inv <- sequential_hcl(n = npal, h1 = 257-180, c1 = 80, c2 = 10, l1 = 18, l2 = 97, p1 = 1.5) ``` ```{r} #' dviz.supp #' #' Supporting materials for Claus Wilke's data visualization book. #' @name dviz.supp #' @docType package #' @import dplyr #' @import cowplot #' @import colorspace #' @import colorblindr # ************************************************* # Setup # ************************************************* .onAttach <- function(libname, pkgname) { # switch the cowplot null device cowplot::set_null_device("png") } #' @noRd #' @usage NULL #' @export dviz_font_family <- "Myriad Pro" #' @noRd #' @usage NULL #' @export dviz_font_family_bold <- "Myriad Pro Semibold" #' @noRd #' @usage NULL #' @export dviz_font_family_condensed <- "Myriad Pro Condensed" #' @noRd #' @usage NULL #' @export dviz_font_family_bold_condensed <- "Myriad Pro Bold Condensed" ``` ```{r} #' Themes for data viz book #' #' The themes used in the data visualization book. The default font for these #' themes is Myriad Pro, which needs to be installed on the target system for #' these themes to work. #' #' @param font_size Overall font size #' @param font_family Font family for plot title, axis titles and labels, legend texts, etc. #' @param line_size Line size for axis lines #' @param rel_small Relative size of small text (e.g., axis tick labels) #' @param rel_tiny Relative size of tiny text (e.g., caption) #' @param rel_large Relative size of large text (e.g., title) #' @export theme_dviz_open <- function(font_size = 14, font_family = dviz_font_family, line_size = .5, rel_small = 12/14, rel_tiny = 11/14, rel_large = 16/14) { half_line <- font_size / 2 cowplot::theme_half_open(font_size = font_size, font_family = font_family, line_size = line_size, rel_small = rel_small, rel_tiny = rel_tiny, rel_large = rel_large) %+replace% theme( plot.margin = margin(half_line/2, 1.5, half_line/2, 1.5), complete = TRUE ) } #' @rdname theme_dviz_open #' @param colour Color of grid lines #' @export theme_dviz_grid <- function(font_size = 14, font_family = dviz_font_family, line_size = .5, rel_small = 12/14, rel_tiny = 11/14, rel_large = 16/14, colour = "grey90") { half_line <- font_size / 2 cowplot::theme_minimal_grid(font_size = font_size, font_family = font_family, line_size = line_size, rel_small = rel_small, rel_tiny = rel_tiny, rel_large = rel_large, colour = colour) %+replace% theme( plot.margin = margin(half_line/2, 1.5, half_line/2, 1.5), complete = TRUE ) } #' @rdname theme_dviz_open #' @export theme_dviz_hgrid <- function(font_size = 14, font_family = dviz_font_family, line_size = .5, rel_small = 12/14, rel_tiny = 11/14, rel_large = 16/14, colour = "grey90") { half_line <- font_size / 2 cowplot::theme_minimal_hgrid(font_size = font_size, font_family = font_family, line_size = line_size, rel_small = rel_small, rel_tiny = rel_tiny, rel_large = rel_large, colour = colour) %+replace% theme( plot.margin = margin(half_line/2, 1.5, half_line/2, 1.5), complete = TRUE ) } #' @rdname theme_dviz_open #' @export theme_dviz_vgrid <- function(font_size = 14, font_family = dviz_font_family, line_size = .5, rel_small = 12/14, rel_tiny = 11/14, rel_large = 16/14, colour = "grey90") { half_line <- font_size / 2 cowplot::theme_minimal_vgrid(font_size = font_size, font_family = font_family, line_size = line_size, rel_small = rel_small, rel_tiny = rel_tiny, rel_large = rel_large, colour = colour) %+replace% theme( plot.margin = margin(half_line/2, 1.5, half_line/2, 1.5), complete = TRUE ) } #' @rdname theme_dviz_open #' @export theme_dviz_map <- function(font_size = 14, font_family = dviz_font_family, line_size = .5, rel_small = 12/14, rel_tiny = 11/14, rel_large = 16/14) { half_line <- font_size / 2 cowplot::theme_map(font_size = font_size, font_family = font_family, line_size = line_size, rel_small = rel_small, rel_tiny = rel_tiny, rel_large = rel_large) %+replace% theme( plot.margin = margin(half_line/2, 1.5, half_line/2, 1.5), complete = TRUE ) } ``` # Amounts ```{r amounts, fig.width = 8, fig.asp = 1/4} palette <- pal_earth_brown p1 <- ggplot(df_props, aes(x = group, y = value)) + geom_col( position="identity", color = palette[npal], fill = palette[3], width = 0.8 ) + scale_y_continuous(limits = c(0, 66), expand = c(0, 0)) + scale_fill_manual(values = palette[2:4]) + labs(title = "Bars") + theme_plot_icon_hgrid(palette[npal], palette[1]) p2 <- ggplot(df_props, aes(x = fct_rev(group), y = value)) + geom_col(position="identity", color = palette[npal], fill = palette[3], width = .8) + scale_y_continuous(limits = c(0, 66), expand = c(0, 0)) + scale_fill_manual(values = palette[2:4]) + coord_flip() + labs(title = "Bars") + theme_plot_icon_vgrid(palette[npal], palette[1]) p3 <- ggplot(filter(df_multi_amounts, x!=4), aes(x, y, fill=factor(type, levels = c("A", "C", "B")))) + geom_col(position="dodge", color = palette[npal], width = .7) + scale_y_continuous(expand = c(0, 0), limits = c(0, .7)) + scale_fill_manual(values = palette[2:4]) + labs(title = "Grouped Bars") + theme_plot_icon_hgrid(palette[npal], palette[1]) p4 <- ggplot(filter(df_multi_amounts, x!=4), aes(x, y, fill=factor(type, levels = c("B", "C", "A")))) + geom_col(position="dodge", color = palette[npal], width = .7) + scale_y_continuous(expand = c(0, 0), limits = c(0, .7)) + scale_fill_manual(values = rev(palette[2:4])) + coord_flip() + labs(title = "Grouped Bars") + theme_plot_icon_vgrid(palette[npal], palette[1]) p5 <- ggplot(df_multi_amounts, aes(x, y, fill=factor(type, levels = c("B", "C", "A")))) + geom_col(position="stack", color = palette[npal]) + scale_y_continuous(limits = c(0, 1.55), expand = c(0, 0)) + scale_fill_manual(values = rev(palette[2:4])) + labs(title = "Stacked Bars") + theme_plot_icon_hgrid(palette[npal], palette[1]) p6 <- p5 + coord_flip() + theme_plot_icon_vgrid(palette[npal], palette[1]) p7 <- ggplot(df_props, aes(x = fct_rev(group), y = value)) + geom_point(color = palette[2], size = 2) + scale_y_continuous(limits = c(0, 66), expand = c(0, 0)) + coord_flip() + labs(title = "Dots") + theme_plot_icon_vgrid(palette[npal], palette[1]) p8 <- ggplot(filter(df_multi_amounts, x != 1), aes(x, y = factor(type, levels = c("A", "C", "B")), fill = y)) + geom_tile(color = palette[5], size = 1.5) + scale_fill_continuous_sequential( h1 = 71, c1 = 80, c2 = 10, l1 = 18, l2 = 97, p1 = 1.5, begin = 0.2, end = 0.75, rev = FALSE ) + labs(title = "Heatmap") + theme_plot_icon_blank(palette[npal], palette[1]) plot_grid(p1, p2, p7, ncol = 4, scale = .9) ``` The most common approach to visualizing amounts (i.e., numerical values shown for some set of categories) is using bars, either vertically or horizontally arranged. However, instead of using bars, we can also place dots at the location where the corresponding bar would end. ```{r amounts_multi, fig.width = 5*6/4.2, fig.asp = 1/2} plot_grid(p3, p4, p5, p6, p8, ncol = 4, scale = .9) ``` If there are two or more sets of categories for which we want to show amounts, we can group or stack the bars. We can also map the categories onto the *x* and *y* axes and show amounts by color, via a heatmap. # Distributions ```{r single-distributions, fig.width = 8, fig.asp = 1/4} palette <- pal_ocean_blue p1 <- ggplot(df_one_dist, aes(x)) + geom_histogram(fill = palette[3], color = palette[npal], binwidth = 1, center = 0) + scale_x_continuous(limits = c(-4.8, 6.8), expand = c(0, 0)) + scale_y_continuous(limits = c(0, 350), expand = c(0, 0)) + labs(title = "Histogram") + theme_plot_icon(palette[npal], palette[1]) p2 <- ggplot(df_one_dist, aes(x)) + geom_density(fill = palette[3], color = palette[npal], bw = .35) + scale_x_continuous(limits = c(-4.8, 6.8), expand = c(0, 0)) + scale_y_continuous(limits = c(0, .27), expand = c(0, 0)) + labs(title = "Density Plot") + theme_plot_icon(palette[npal], palette[1]) p3 <- ggplot(df_one_normal, aes(x)) + stat_ecdf(color = palette[2], size = .7) + scale_x_continuous(expand = c(0.05, 0)) + scale_y_continuous(limits = c(0, 1.08), expand = c(0, 0)) + labs(title = "Cumulative Density") + theme_plot_icon(palette[npal], palette[1]) p4 <- ggplot(df_one_normal, aes(sample = x)) + geom_abline(intercept = 0, slope = 1, color = palette[3]) + geom_qq(color = palette[1], size = 0.8) + labs(title = "Quantile-Quantile Plot") + theme_plot_icon(palette[npal], palette[1]) plot_grid(p1, p2, p3, p4, ncol = 4, scale = .9) ``` Histograms and density plots provide the most intuitive visualizations of a distribution, but both require arbitrary parameter choices and can be misleading. Cumulative densities and quantile-quantile (q-q) plots always represent the data faithfully but can be more difficult to interpret. ```{r} #' ggplot2 stat that creates sina plots #' #' This stat closely mirrors [`stat_ydensity()`] from ggplot2. This enables the #' user to plot sina plots on top of violin plots and have them match perfectly. #' #' @inheritParams ggplot2::layer #' @inheritParams ggplot2::geom_point #' @inheritParams ggplot2::stat_density #' @param scale if "area" (default), all violins have the same area (before trimming #' the tails). If "count", areas are scaled proportionally to the number of #' observations. If "width", all violins have the same maximum width. #' @seealso [geom_violin()] #' @examples #' ggplot(iris, aes(Species, Sepal.Length)) + #' geom_violin(color = NA) + #' stat_sina() #' @export stat_sina <- function(mapping = NULL, data = NULL, geom = "point", position = "identity", ..., bw = "nrd0", adjust = 1, kernel = "gaussian", trim = TRUE, scale = "area", na.rm = FALSE, show.legend = NA, inherit.aes = TRUE) { scale <- match.arg(scale, c("area", "count", "width")) layer( data = data, mapping = mapping, stat = StatSina, geom = geom, position = position, show.legend = show.legend, inherit.aes = inherit.aes, params = list( bw = bw, adjust = adjust, kernel = kernel, trim = trim, scale = scale, na.rm = na.rm, ... ) ) } #' @rdname stat_sina #' @format NULL #' @usage NULL #' @export StatSina <- ggproto("StatSina", Stat, required_aes = c("x", "y"), non_missing_aes = "weight", compute_group = function(data, scales, width = NULL, bw = "nrd0", adjust = 1, kernel = "gaussian", trim = TRUE, na.rm = FALSE) { if (nrow(data) < 3) return(data.frame()) range <- range(data$y, na.rm = TRUE) modifier <- if (trim) 0 else 3 bw <- calc_bw(data$y, bw) dens <- ggplot2:::compute_density(data$y, data$w, from = range[1] - modifier*bw, to = range[2] + modifier*bw, bw = bw, adjust = adjust, kernel = kernel) densf <- approxfun(dens$x, dens$density, rule = 2) # Compute width if x has multiple values if (length(unique(data$x)) > 1) { width <- diff(range(data$x)) * 0.9 } data$width <- width data$density <- densf(data$y) data$x <- mean(range(data$x)) data }, compute_panel = function(self, data, scales, width = NULL, bw = "nrd0", adjust = 1, kernel = "gaussian", trim = TRUE, na.rm = FALSE, scale = "area") { data <- ggproto_parent(Stat, self)$compute_panel( data, scales, width = width, bw = bw, adjust = adjust, kernel = kernel, trim = trim, na.rm = na.rm ) # choose how violins are scaled relative to each other data$violinwidth <- switch(scale, # area : keep the original densities but scale them to a max width of 1 # for plotting purposes only area = data$density / max(data$density), # count: use the original densities scaled to a maximum of 1 (as above) # and then scale them according to the number of observations count = data$density / max(data$density) * data$n / max(data$n), # width: constant width (density scaled to a maximum of 1) width = data$scaled ) data$x <- data$x + runif(nrow(data), min = -1, max = 1) * 0.9*data$violinwidth/2 data } ) calc_bw <- function(x, bw) { if (is.character(bw)) { if (length(x) < 2) stop("need at least 2 points to select a bandwidth automatically", call. = FALSE) bw <- switch( tolower(bw), nrd0 = stats::bw.nrd0(x), nrd = stats::bw.nrd(x), ucv = stats::bw.ucv(x), bcv = stats::bw.bcv(x), sj = , `sj-ste` = stats::bw.SJ(x, method = "ste"), `sj-dpi` = stats::bw.SJ(x, method = "dpi"), stop("unknown bandwidth rule") ) } bw } ``` ```{r multiple-distributions, fig.width = 8, fig.asp = 1/2} palette <- pal_ocean_blue p1 <- ggplot(df_multi_dist, aes(x = type, y = y)) + geom_boxplot(color = palette[1], fill = palette[4]) + labs(title = "Boxplots") + theme_plot_icon_hgrid(palette[npal], palette[1]) p2 <- ggplot(df_multi_dist, aes(x = type, y = y)) + geom_violin(color = palette[npal], fill = palette[2], size = 0) + labs(title = "Violins") + theme_plot_icon_hgrid(palette[npal], palette[1]) df_multi_dist_small <- group_by(df_multi_dist, type) %>% sample_n(50) p3 <- ggplot(df_multi_dist_small, aes(x = type, y = y)) + geom_jitter(color = palette[1], width = 0.15, height = 0, size = .3) + labs(title = "Strip Charts") + theme_plot_icon_hgrid(palette[npal], palette[1]) p4 <- ggplot(df_multi_dist_small, aes(x = type, y = y)) + stat_sina(color = palette[1], size = 0.3) + labs(title = "Sina Plots") + theme_plot_icon_hgrid(palette[npal], palette[1]) p5 <- ggplot(df_multi_dist, aes(x = y, fill = factor(type, levels = c("C", "A", "B")))) + geom_histogram(color = palette[npal], binwidth = 0.5, center = 0) + scale_fill_manual(values = palette[2:4]) + labs(title = "Stacked Histograms") + scale_x_continuous() + scale_y_continuous(limits = c(0, 49), expand = c(0, 0)) + theme_plot_icon(palette[npal], palette[1]) p6 <- ggplot(df_multi_dist, aes(x = y, fill = factor(type, levels = c("C", "A", "B")))) + geom_density(alpha = 0.7, color = palette[npal]) + scale_fill_manual(values = palette[1:3]) + labs(title = "Overlapping Densities") + scale_x_continuous() + scale_y_continuous(limits = c(0, 1.1), expand = c(0, 0)) + theme_plot_icon(palette[npal], palette[1]) p7 <- ggplot(df_multi_dist, aes(x = y, y = number, group = number)) + geom_density_ridges(alpha = 0.7, color = palette[npal], fill = palette[2], scale = 2.5) + labs(title = "Ridgeline Plot") + scale_x_continuous(expand = c(0, 0)) + scale_y_continuous(limits = c(1, 6.5), expand = c(0, 0)) + theme_plot_icon(palette[npal], palette[1]) plot_grid(p1, p2, p3, p4, p5, p6, p7, ncol = 4, scale = .9) ``` Box-plots, violins, strip charts, and sina plots are useful when we want to visualize many distributions at once and/or if we are primarily interested in overall shifts among the distributions. Stacked histograms and overlapping densities allow a more in-depth comparison of a smaller number of distributions, though stacked histograms can be difficult to interpret and are best avoided. Ridgeline plots can be a useful alternative to violin plots and are often useful when visualizing very large numbers of distributions or changes in distributions over time. # Proportions ```{r proportions, fig.width = 8, fig.asp = 1/4} palette <- pal_brown_green p1_main <- ggplot(df_props, aes(x = 1, y = value, fill = group)) + geom_col(position = "stack", color = palette[npal]) + coord_polar(theta = "y") + scale_y_continuous(breaks = NULL, name = "") + scale_x_continuous(breaks = NULL, name = "") + scale_fill_manual(values = palette[2:4]) + theme_plot_icon_blank(palette[npal], palette[1]) + theme(plot.margin = margin(0, 0, 0, 0)) # make sure plot background is fully filled, as in the other plots p1 <- ggdraw(p1_main) + labs(title = "Pie Chart") + theme_plot_icon_blank(palette[npal], palette[1]) p2 <- ggplot(df_props, aes(x = factor(1), y = value, fill = group)) + geom_col(position = position_stack(reverse = TRUE), width = .45, color = palette[npal]) + scale_y_continuous(limits = c(0, 108), expand = c(0, 0)) + scale_fill_manual(values = palette[2:4]) + labs(title = "Stacked Bars") + theme_plot_icon_hgrid(palette[npal], palette[1]) p3 <- ggplot(df_props, aes(x = factor(1), y = value, fill = group)) + geom_col(position = position_stack(reverse = TRUE), width = .45, color = palette[npal]) + #scale_y_continuous(limits = c(0, 110), expand = c(0, 0), position = "right") + scale_y_continuous(limits = c(0, 110), expand = c(0, 0)) + coord_flip() + scale_fill_manual(values = palette[2:4]) + labs(title = "Stacked Bars") + theme_plot_icon_vgrid(palette[npal], palette[1]) p4 <- ggplot(df_props, aes(x = group, y = value, fill = group)) + geom_col(position="identity", color = palette[npal], width = .8) + scale_y_continuous(limits = c(0, 66), expand = c(0, 0)) + scale_fill_manual(values = palette[2:4]) + labs(title = "Bars") + theme_plot_icon_hgrid(palette[npal], palette[1]) p5 <- ggplot(df_props, aes(x = fct_rev(group), y = value, fill = group)) + geom_col(position="identity", color = palette[npal], width = .8) + scale_y_continuous(limits = c(0, 66), expand = c(0, 0)) + scale_fill_manual(values = palette[2:4]) + coord_flip() + labs(title = "Bars") + theme_plot_icon_vgrid(palette[npal], palette[1]) plot_grid(p1, p4, p5, p2, ncol = 4, scale = .9) ``` Proportions can be visualized as pie charts, side-by-side bars, or stacked bars, and as in the case for amounts, bars can be arranged either vertically or horizontally. Pie charts emphasize that the individual parts add up to a whole and highlight simple fractions. However, the individual pieces are more easily compared in side-by-side bars. Stacked bars look awkward for a single set of proportions, but can be useful when comparing multiple sets of proportions (see below). ```{r proportions-comp, fig.width = 8, fig.asp = 1/4} p5 <- ggplot(filter(df_fractions, x!=4), aes(x, y, fill=factor(type, levels = c("A", "C", "B")))) + geom_col(position="dodge", color = palette[npal], width = .7) + scale_y_continuous(expand = c(0, 0), limits = c(0, .58)) + scale_fill_manual(values = palette[2:4]) + labs(title = "Grouped Bars") + theme_plot_icon_hgrid(palette[npal], palette[1]) p6 <- ggplot(df_fractions, aes(x, y, fill=type)) + geom_col(position="stack", color = palette[npal]) + scale_y_continuous(limits = c(0, 1.08), expand = c(0, 0)) + scale_fill_manual(values = palette[2:4]) + labs(title = "Stacked Bars") + theme_plot_icon_hgrid(palette[npal], palette[1]) p7 <- ggplot(df_multi_dist, aes(x = y, fill = factor(type, levels = c("C", "A", "B")))) + geom_density(color = palette[npal], position = "fill") + scale_fill_manual(values = palette[2:4]) + scale_x_continuous(expand = c(0.04, 0)) + scale_y_continuous(limits = c(0, 1.08), expand = c(0, 0)) + labs(title = "Stacked Densities") + theme_plot_icon(palette[npal], palette[1]) p8_a <- ggplot(filter(df_fractions, x==1), aes(x = 1, y = y, fill = type)) + geom_col(position = "stack", color = palette[npal]) + coord_polar(theta = "y") + scale_y_continuous(breaks = NULL, name = "") + scale_x_continuous(breaks = NULL, name = "") + scale_fill_manual(values = palette[c(2, 1, 3)]) + theme_plot_icon_blank(palette[npal], palette[1], font_size = 5) + theme( plot.background = element_blank(), plot.margin = margin(0, 0, 0, 0) ) p8_b <- ggplot(filter(df_fractions, x==2), aes(x = 1, y = y, fill = type)) + geom_col(position = "stack", color = palette[npal]) + coord_polar(theta = "y") + scale_y_continuous(breaks = NULL, name = "") + scale_x_continuous(breaks = NULL, name = "") + scale_fill_manual(values = palette[c(2, 1, 3)]) + theme_plot_icon_blank(palette[npal], palette[1], font_size = 5) + theme( plot.background = element_blank(), plot.margin = margin(0, 0, 0, 0) ) p8_c <- ggplot(filter(df_fractions, x==3), aes(x = 1, y = y, fill = type)) + geom_col(position = "stack", color = palette[npal]) + coord_polar(theta = "y") + scale_y_continuous(breaks = NULL, name = "") + scale_x_continuous(breaks = NULL, name = "") + scale_fill_manual(values = palette[c(2, 1, 3)]) + theme_plot_icon_blank(palette[npal], palette[1], font_size = 5) + theme( plot.background = element_blank(), plot.margin = margin(0, 0, 0, 0) ) # combine p8 <- plot_grid(p8_a, p8_b, p8_c, ncol = 3, scale = 1.1) + labs(title = "Multiple Pie Charts") + theme_plot_icon_blank(palette[npal], palette[1]) plot_grid(p8, p5, p6, p7, ncol = 4, scale = .9) ``` When visualizing multiple sets of proportions or changes in proportions across conditions, pie charts tend to be space-inefficient and often obscure relationships. Grouped bars work well as long as the number of conditions compared is moderate, and stacked bars can work for large numbers of conditions. Stacked densities are appropriate when the proportions change along a continuous variable. ```{r proportions-multi, fig.width = 8, fig.asp = 1/4} p1 <- ggplot(df_multi_props, aes(x = var2, y = count, fill = var1, width = group_count)) + geom_bar(stat = "identity", position = "fill", colour = palette[npal], size = 0.5) + facet_grid(~var2, scales = "free_x", space = "free_x") + scale_x_discrete(name = NULL, breaks = NULL) + scale_y_continuous(name = NULL, breaks = NULL, expand = c(0, 0)) + scale_fill_manual(values = palette[4:2], guide = "none") + coord_cartesian(clip = "off") + labs(title = "Mosaic Plot") + theme_plot_icon_blank(palette[npal], palette[1]) + theme( strip.text = element_blank(), panel.spacing.x = unit(0, "pt") ) p2 <- ggplot(df_multi_props, aes(area = count, subgroup = var2, fill = var2)) + geom_treemap(color = palette[npal], size = 0.5*.pt, alpha = NA) + geom_treemap_subgroup_border(color = palette[npal], size = 1.5*.pt) + scale_fill_manual(values = palette[4:2], guide = "none") + coord_cartesian(clip = "off") + labs(title = "Treemap") + theme_plot_icon_blank(palette[npal], palette[1]) p3 <- ggplot(df_sets, aes(x, id = id, split = y, value = count)) + geom_parallel_sets(aes(fill = var1), alpha = 0.7, axis.width = 0.15) + geom_parallel_sets_axes(axis.width = 0.06, fill = palette[2], color = palette[2]) + scale_x_discrete( name = NULL, breaks = NULL, expand = c(0, 0.15/2) ) + scale_y_continuous(breaks = NULL, expand = c(0, 0)) + scale_fill_manual(values = c(palette[3], palette[2]), guide = "none") + labs(title = "Parallel Sets") + theme_plot_icon_blank(palette[npal], palette[1]) plot_grid(p1, p2, p3, ncol = 3, scale = .9) ``` When proportions are specified according to multiple grouping variables, then mosaic plots, treemaps, or parallel sets are useful visualization approaches. Mosaic plots assume that every level of one grouping variable can be combined with every level of another grouping variable, whereas treemaps do not make such an assumption. Treemaps work well even if the subdivisions of one group are entirely distinct from the subdivisions of another. Parallel sets work better than either mosaic plots or treemaps when there are more than two grouping variables. # Relationships There are different ways to visualize the relationship between two variables. This section focused primarily two numerical variables. ## Scatter Plots ```{r basic-scatter, fig.width = 8, fig.asp = 1/4} palette <- pal_steel_blue p1 <- ggplot(df_scatter_xy, aes(x, y)) + geom_point(fill = palette[2], color = palette[npal], pch = 21, size = 2.4) + scale_x_continuous(expand = c(.2, 0)) + scale_y_continuous(expand = c(.2, 0)) + labs(title = "Scatterplot") + theme_plot_icon(palette[npal], palette[1]) p2 <- ggplot(df_scatter_xyz, aes(x, y, size = z)) + geom_point(fill = palette[2], color = palette[npal], pch = 21, alpha = 0.7) + scale_x_continuous(expand = c(.2, 0)) + scale_y_continuous(expand = c(.2, 0)) + scale_radius(range = c(2, 8)) + labs(title = "Bubble Chart") + theme_plot_icon(palette[npal], palette[1]) p3 <- ggplot(spread(df_paired, x, y), aes(A, B)) + geom_abline(slope = 1, intercept = 0, color = palette[3], size = 0.3) + geom_point( shape = 21, size = 2.4, stroke = 1, fill = palette[2], color = palette[npal] ) + scale_x_continuous(limits = c(1.5, 6.5)) + scale_y_continuous(limits = c(1.5, 6.5)) + labs(title = "Paired Scatterplot") + theme_plot_icon(palette[npal], palette[1]) p4 <- ggplot(df_paired, aes(x, y, group = group)) + geom_line(color = palette[1]) + geom_point( shape = 21, size = 2.4, stroke = 1, fill = palette[2], color = palette[npal] ) + scale_x_discrete(expand = c(0, 0.4)) + scale_y_continuous(limits = c(1.5, 6.5)) + labs(title = "Slopegraph") + theme_plot_icon(palette[npal], palette[1]) + theme( axis.line.x = element_blank(), axis.ticks.x = element_blank() ) plot_grid(p1, p2, p3, p4, ncol = 4, scale = .9) ``` Scatterplots represent the archetypical visualization when we want to show one quantitative variable relative to another. If we have three quantitative variables, we can map one onto the dot size, creating a variant of the scatterplot called bubble chart. For paired data, where the variables along the *x* and the *y* axes are measured in the same units, it is generally helpful to add a line indicating *x* = *y*. Paired data can also be shown as a slope graph of paired points connected by straight lines. ## Density-based Plots ```{r xy-binning, fig.width = 8, fig.asp = 1/4} p5 <- ggplot(df_dense_scatter, aes(x, y)) + geom_density2d(binwidth = 0.02, color = palette[1]) + scale_x_continuous(limits = c(-2, 3.6), expand = c(0, 0)) + scale_y_continuous(limits = c(-4, 5), expand = c(0, 0)) + labs(title = "Density Contours") + theme_plot_icon(palette[npal], palette[1]) p6 <- ggplot(df_dense_scatter, aes(x, y)) + geom_bin2d(bins = 12, color = palette[npal], size = 0.5) + scale_x_continuous(limits = c(-2, 3.6), expand = c(0, 0)) + scale_y_continuous(limits = c(-4, 5), expand = c(0, 0)) + scale_fill_gradientn(colors = palette[1:(npal-1)]) + labs(title = "2D Bins") + theme_plot_icon(palette[npal], palette[1]) p7 <- ggplot(df_dense_scatter, aes(x, y)) + geom_hex(bins = 12, color = palette[npal], size = 0.5) + scale_x_continuous(limits = c(-2, 3.6), expand = c(0, 0)) + scale_y_continuous(limits = c(-4, 5), expand = c(0, 0)) + scale_fill_gradientn(colors = palette[1:(npal-1)]) + labs(title = "Hex Bins") + theme_plot_icon(palette[npal], palette[1]) cm <- cor(select(mtcars, mpg, hp, drat, wt, qsec)) df_wide <- as.data.frame(cm) df_long <- stack(df_wide) names(df_long) <- c("cor", "var1") df_long <- cbind(df_long, var2 = rep(rownames(cm), length(rownames(cm)))) clust <- hclust(as.dist(1-cm), method="average") levels <- clust$labels[clust$order] df_long$var1 <- factor(df_long$var1, levels = levels) df_long$var2 <- factor(df_long$var2, levels = levels) p8 <- ggplot(filter(df_long, as.integer(var1) < as.integer(var2)), aes(var1, var2, fill=cor, size = abs(cor))) + geom_point(shape = 21, stroke = 0) + scale_x_discrete(position = "top", name = NULL, expand = c(0, 0.5)) + scale_y_discrete(name = NULL, expand = c(0, 0.5)) + scale_size_area(max_size = 8, limits = c(0, 0.9), guide = "none") + scale_fill_gradient2(high = palette[2], mid = palette[npal], low = pal_steel_blue_inv[2], guide = "none") + labs(title = "Correlogram") + theme_plot_icon(palette[npal], palette[1]) plot_grid(p5, p6, p7, p8, ncol = 4, scale = .9) ``` For large numbers of points, regular scatter-plots can become uninformative due to overplotting. In this case, contour lines, 2D bins, or hex bins may provide an alternative. When we want to visualize more than two quantities, on the other hand, we may choose to plot correlation coefficients in the form of a correlogram instead of the underlying raw data. ## Serial Line Plots (for Trends) ```{r xy-lines, fig.width = 5*6/4.2, fig.asp = 1/4} p1 <- ggplot(df_one_line, aes(x, y)) + geom_line(color = palette[1]) + geom_point( shape = 21, size = 2.4, stroke = 1, fill = palette[2], color = palette[npal] ) + scale_x_continuous(limits = c(0.5, 5.5), breaks = c(1, 3, 5)) + scale_y_continuous(limits = c(2.8, 4.8)) + labs(title = "Line Graph") + theme_plot_icon(palette[npal], palette[1]) p2 <- ggplot(df_connected_scatter, aes(x, y, color = t, fill = t)) + geom_path() + geom_point( shape = 21, size = 2.4, stroke = 1, color = palette[npal] ) + scale_color_gradientn( aesthetics = c("colour", "fill"), colors = palette[(npal-2):1] ) + scale_x_continuous(limits = c(0.3, 3.7)) + scale_y_continuous(limits = c(-2.5, 2.5)) + labs(title = "Connected Scatterplot") + theme_plot_icon(palette[npal], palette[1]) p3 <- ggplot(df_dense_scatter_sample, aes(x, y)) + geom_point(color = palette[2], size = 0.3, alpha = 1/2) + geom_smooth( color = palette[1], fill = palette[npal-2], size = 0.5, se = FALSE ) + scale_y_continuous(limits = c(-5, 5)) + labs(title = "Smooth Line Graph") + theme_plot_icon(palette[npal], palette[1]) plot_grid(p1, p2, p3, ncol = 3, scale = .9) ``` When the *x* axis represents time or a strictly increasing quantity such as a treatment dose, we commonly draw line graphs. If we have a temporal sequence of two response variables, we can draw a connected scatterplot where we first plot the two response variables in a scatterplot and then connect dots corresponding to adjacent time points. We can use smooth lines to represent trends in a larger dataset. # Geospatial Information ```{r geospatial, fig.width = 8, fig.asp = 1/4} load(file = "US_income.rda") load(file = "US_income_cartogram.rda") palette <- pal_green_brown lower48 <- mutate( US_income, income_bins = cut( ifelse(is.na(median_income), 25000, median_income), # hide missing value breaks = c(0, 40000, 50000, 60000, 70000, 80000) ) ) %>% filter(!name %in% c("Alaska", "Hawaii", "District of Columbia")) p1_main <- ggplot(lower48) + geom_sf(color = palette[1], fill = palette[4], size = 0.3) + coord_sf(datum = NA, expand = FALSE) + scale_x_continuous(limits = c(-2500000, 100000)) + scale_y_continuous(limits = c(-900000, 1558935)) + theme_plot_icon_blank(palette[npal], palette[1]) + theme( plot.margin = margin(2, 5, 3, 5) ) # make sure plot background is fully filled, as in the other plots p1 <- ggdraw(p1_main) + labs(title = "Map") + theme_plot_icon_blank(palette[npal], palette[1]) p2_main <- ggplot(lower48, aes(fill = income_bins)) + geom_sf(color = palette[1], size = 0.2) + coord_sf(datum = NA, expand = FALSE) + scale_x_continuous(limits = c(-2500000, 100000)) + scale_y_continuous(limits = c(-900000, 1558935)) + scale_fill_manual(values = palette) + theme_plot_icon_blank(palette[npal], palette[1]) + theme( plot.margin = margin(2, 5, 3, 5) ) p2 <- ggdraw(p2_main) + labs(title = "Choropleth") + theme_plot_icon_blank(palette[npal], palette[1]) lower48_carto <- mutate( US_income_cartogram, income_bins = cut( ifelse(is.na(median_income), 25000, median_income), # hide missing value breaks = c(0, 40000, 50000, 60000, 70000, 80000) ) ) %>% filter(!name %in% c("Alaska", "Hawaii", "District of Columbia")) p3_main <- ggplot(lower48_carto, aes(fill = income_bins)) + geom_sf(color = palette[1], size = 0.2) + coord_sf(datum = NA, expand = FALSE) + scale_x_continuous(limits = c(-2500000, 100000)) + scale_y_continuous(limits = c(-1000000, 1458935)) + scale_fill_manual(values = palette) + theme_plot_icon_blank(palette[npal], palette[1]) + theme( plot.margin = margin(2, 5, 3, 5) ) p3 <- ggdraw(p3_main) + labs(title = "Cartogram") + theme_plot_icon_blank(palette[npal], palette[1]) lower48_small <- filter(lower48, GEOID %in% c( "04", "06", "08", "16", "20", "30", "31", "32", "35", "38", "41", "46", "49", "53", "56")) p4_main <- ggplot(lower48_small, aes(state = name, fill = income_bins)) + geom_statebins( family = dviz_font_family, lbl_size = 8/.pt, border_size = 1., border_col = palette[npal] ) + coord_equal(xlim = c(1.5, 5.5), ylim = c(-2.5, -6.5), expand = FALSE, clip = "off") + scale_fill_manual(values = palette[2:5]) + theme_plot_icon_blank(palette[npal], palette[1]) + theme( plot.margin = margin(2, 0, 0, 7) ) p4 <- ggdraw(p4_main) + labs(title = "Cartogram Heatmap") + theme_plot_icon_blank(palette[npal], palette[1]) plot_grid(p1, p2, p3, p4, scale = 0.9, nrow = 1) ``` The primary mode of showing geospatial data is in the form of a map. A map takes coordinates on the globe and projects them onto a flat surface, such that shapes and distances on the globe are approximately represented by shapes and distances in the 2D representation. In addition, we can show data values in different regions by coloring those regions in the map according to the data. Such a map is called a choropleth. In some cases, it may be helpful to distort the different regions according to some other quantity (e.g., population number) or simplify each region into a square. Such visualizations are called cartograms. # Uncertainty ```{r errorbars, fig.width = 8, fig.asp = 1/4} palette <- pal_brown_red p1 <- ggplot(df_uncertain, aes(y, type)) + geom_errorbarh( aes(xmin = y-dy, xmax = y+dy), color = palette[1], height = 0.2, size = 0.5 ) + geom_point( color = palette[1], size = 2 ) + labs(title = "Error Bars") + theme_plot_icon(palette[npal], palette[1]) + theme( axis.line.y = element_blank(), axis.ticks.y = element_blank() ) p2 <- ggplot(df_uncertain, aes(type, y)) + geom_col(fill = palette[3], width = 0.8) + geom_segment( aes(xend = type, y = y-dy, yend = y+dy), color = palette[1], size = 0.7 ) + scale_y_continuous(limits = c(0, 6), expand = c(0, 0)) + labs(title = "Error Bars") + theme_plot_icon(palette[npal], palette[1]) + theme( axis.line.x = element_blank(), axis.ticks.x = element_blank() ) p3 <- ggplot(df_uncertain, aes(y, type)) + geom_errorbarh( aes(xmin = y-2.58*dy, xmax = y+2.58*dy), # 99% CI color = palette[3], height = 0, size = 0.5 ) + geom_errorbarh( aes(xmin = y-1.96*dy, xmax = y+1.96*dy), # 95% CI color = palette[2], height = 0, size = 1 ) + geom_errorbarh( aes(xmin = y-1.28*dy, xmax = y+1.28*dy), # 80% CI color = palette[1], height = 0, size = 1.5 ) + #geom_errorbarh( # aes(xmin = y-dy, xmax = y+dy), # color = palette[1], height = 0.1, size = 0.5 #) + geom_point( color = palette[1], size = 2 ) + labs(title = "Graded Error Bars") + theme_plot_icon(palette[npal], palette[1]) + theme( axis.line.y = element_blank(), axis.ticks.y = element_blank() ) p4 <- ggplot(df_uncertain, aes(x, y)) + geom_point(color = palette[1], size = 2) + geom_segment( aes(xend = x, y = y-dy, yend = y+dy), color = palette[1], size = 0.7 ) + geom_segment( aes(yend = y, x = x-dx, xend = x+dx), color = palette[1], size = 0.7 ) + scale_x_continuous(limits = c(1, 4)) + scale_y_continuous(limits = c(2, 6)) + labs(title = "2D Error Bars") + theme_plot_icon(palette[npal], palette[1]) plot_grid(p1, p2, p4, p3, ncol = 4, scale = .9) ``` Error bars are meant to indicate the range of likely values for some estimate or measurement. They extend horizontally and/or vertically from some reference point representing the estimate or measurement. Reference points can be shown in various ways, such as by dots or by bars. Graded error bars show multiple ranges at the same time, where each range corresponds to a different degree of confidence. They are in effect multiple error bars with different line thicknesses plotted on top of each other. ```{r} #' Confidence density distributions generated from estimate and margin of error #' #' This stat generates normal densities from provided estimates plus margins #' of error (at a specified confidence level). It can be used to estimate #' the confidence density that underlies a given parameter estimate with #' given margin of error. #' #' @inheritParams ggplot2::layer #' @param ... Other arguments passed on to [`layer()`]. These are #' often aesthetics, used to set an aesthetic to a fixed value, like #' `colour = "red"` or `size = 3`. They may also be parameters #' to the paired geom/stat. #' @param confidence The confidence level used to calculate the `moe` statistic. #' This defaults to 0.95 (`moe` corresponds to 95\% confidence interval). #' @param xlim Numeric vector of two numbers setting the range of x values to be #' covered by the confidence density. If not supplied, is taken from the x scale. #' @param n Number of equally spaced points at which the density is calculated. #' @param na.rm If `FALSE`, the default, missing values are removed with #' a warning. If `TRUE`, missing values are silently removed. #' #' @section Details: #' #' The following aesthetics are understood by this stat (required aesthetics #' are in bold): #' * **`x`**: The estimate whose uncertainty is to be displayed #' * **`moe`**: Margin of error #' * `confidence`: Confidence level used to calculate the `moe` statistic. #' This defaults to 0.95 (`moe` corresponds to 95\% confidence interval). #' #' @source #' Adrian W. Bowman. Graphs for Uncertainty. J. R. Statist. Soc. A 182:1-16, 2018. #' \url{http://www.rss.org.uk/Images/PDF/events/2018/Bowman-5-Sept-2018.pdf} #' @examples #' library(ggplot2) #' library(dplyr) #' #' cacao_small <- cacao %>% #' filter(location %in% c("Switzerland", "Canada", "U.S.A.", "Belgium")) #' #' cacao_summary <- cacao_small %>% #' group_by(location) %>% #' summarize( #' sd = sd(rating), #' moe = sd*1.96, #' rating = mean(rating) #' ) #' #' ggplot(cacao_summary, aes(x = rating, y = location)) + #' stat_confidence_density(aes(moe = moe, fill = stat(ndensity)), height = 0.8) + #' geom_point(data = cacao_small, position = position_jitter(width = 0.05), size = 0.3) + #' geom_errorbarh( #' aes(xmin = rating - sd, xmax = rating + sd), #' height = 0.3, color = "darkred", size = 1 #' ) + #' geom_point(size = 3, color = "darkred") + #' theme_minimal() #' #' #' library(ggridges) #' #' cacao_se <- cacao_small %>% #' group_by(location) %>% #' summarize( #' se = sd(rating)/sqrt(n()), #' moe = se*1.96, #' rating = mean(rating) #' ) #' #' ggplot(cacao_se, aes(x = rating, y = location)) + #' stat_confidence_density( #' geom = "ridgeline", #' aes(moe = moe, height = stat(density)), #' alpha = NA, xlim = c(2.5, 3.75), scale = 0.08 #' ) + #' theme_minimal() #' @export stat_confidence_density <- function(mapping = NULL, data = NULL, geom = "tile", position = "identity", ..., confidence = 0.95, xlim = NULL, n = 501, na.rm = FALSE, show.legend = FALSE, inherit.aes = TRUE) { l <- layer( data = data, mapping = mapping, stat = StatConfidenceDensity, geom = geom, position = position, show.legend = show.legend, inherit.aes = inherit.aes, params = list( confidence = confidence, n = n, na.rm = na.rm, xlim = xlim, ... ) ) list(l, scale_alpha_identity()) } #' @rdname stat_confidence_density #' @usage NULL #' @format NULL #' @export StatConfidenceDensity <- ggproto("StatConfidenceDensity", Stat, required_aes = c("x", "moe"), default_aes = aes( alpha = stat(ndensity), confidence = 0.95 ), compute_group = function(data, scales, xlim = NULL, n = 501, confidence = 0.95) { # assume confidence level is 0.95 if not provided if (is.null(data$confidence)) { data$confidence <- confidence } # Check that confidence density parameters are constant within group params <- unique(data[c("x", "moe", "confidence")]) if (nrow(params) > 1) { stop("Confidence density parameters can not vary within data groups", call. = FALSE) } params <- as.list(params) range <- xlim %||% scales$x$dimension() xseq <- seq(range[1], range[2], length.out = n) if (scales$x$is_discrete()) { x_trans <- xseq } else { # For continuous scales, need to back transform from transformed range # to original values x_trans <- scales$x$trans$inverse(xseq) params$x <- scales$x$trans$inverse(params$x) params$statistic <- scales$x$trans$inverse(params$statistic) } fun <- do.call(fit_normal, params) density <- fun(x_trans) data.frame( x = xseq, density = density, ndensity = density/max(density) ) } ) fit_normal <- function(x, moe, confidence) { # convert to two-tailed value confidence <- 1-(1-confidence)/2 function(z) stats::dnorm(z, mean = x, sd = moe/stats::qnorm(confidence)) } ``` ```{r confidence-dists, fig.width = 8, fig.asp = 1/4} p1 <- ggplot(df_uncertain, aes(y, type)) + stat_confidence_density(aes(moe = dy), fill = palette[3], height = 0.6, confidence = 0.68) + scale_x_continuous(limits = c(1.6, 6.4), expand = c(0, 0)) + scale_y_discrete(expand = c(0, 1)) + labs(title = "Confidence Strips") + theme_plot_icon(palette[npal], palette[1]) + theme( axis.line.y = element_blank(), axis.ticks.y = element_blank() ) p2 <- ggplot(df_uncertain, aes(y, type)) + geom_ribbon( data = filter(df_uncertain, type == "A"), aes(moe = dy, ymin = 1 - .5*stat(density), ymax = 1 + .5*stat(density)), stat = "confidence_density", fill = palette[3], color = NA, alpha = NA, confidence = 0.68 ) + geom_ribbon( data = filter(df_uncertain, type == "B"), aes(moe = dy, ymin = 2 - .5*stat(density), ymax = 2 + .5*stat(density)), stat = "confidence_density", fill = palette[3], color = NA, alpha = NA, confidence = 0.68 ) + geom_ribbon( data = filter(df_uncertain, type == "C"), aes(moe = dy, ymin = 3 - .5*stat(density), ymax = 3 + .5*stat(density)), stat = "confidence_density", fill = palette[3], color = NA, alpha = NA, confidence = 0.68 ) + geom_errorbarh( aes(xmin = y-1.28*dy, xmax = y+1.28*dy), color = palette[1], height = 0, size = 0.5 ) + geom_point( color = palette[1], size = 2 ) + scale_x_continuous(limits = c(1.6, 6.4), expand = c(0, 0)) + scale_y_discrete(expand = expand_scale(add = c(0.8, 0.8))) + labs(title = "Eyes") + theme_plot_icon(palette[npal], palette[1]) + theme( axis.line.y = element_blank(), axis.ticks.y = element_blank() ) p3 <- ggplot(df_uncertain, aes(y, type)) + stat_confidence_density( aes(moe = dy, height = .9*stat(density)), geom = "ridgeline", fill = palette[3], color = NA, alpha = NA, confidence = 0.68 ) + geom_errorbarh( aes(xmin = y-1.28*dy, xmax = y+1.28*dy), color = palette[1], height = 0, size = 0.5 ) + geom_point( color = palette[1], size = 2 ) + scale_x_continuous(limits = c(1.6, 6.4), expand = c(0, 0)) + scale_y_discrete(expand = expand_scale(add = c(0.2, 0.8))) + labs(title = "Half-Eyes") + theme_plot_icon(palette[npal], palette[1]) + theme( axis.line.y = element_blank(), axis.ticks.y = element_blank() ) df_norm <- data.frame( x = seq(-3, 3, length.out = 100), y = dnorm(seq(-3, 3, length.out = 100)) ) df_q <- data.frame(x = qnorm(ppoints(20))) p4 <- ggplot(df_q, aes(x)) + geom_line(data = df_norm, aes(x, .36*y), color = palette[1], na.rm = FALSE, size = 0.25) + # factor .36 manually determined geom_dotplot(binwidth = .4, fill = palette[3], color = palette[1]) + scale_x_continuous( limits = c(-2.8, 2.8), expand = c(0, 0) ) + scale_y_continuous( expand = c(0.02, 0), limits = c(0, 0.4) ) + labs(title = "Quantile Dot Plot") + theme_plot_icon(palette[npal], palette[1]) + theme( axis.line.y = element_blank(), axis.ticks.y = element_blank() ) plot_grid(p1, p2, p3, p4, ncol = 4, scale = .9) ``` To achieve a more detailed visualization than is possible with error bars or graded error bars, we can visualize the actual confidence or posterior distributions. Confidence strips provide a clear visual sense of uncertainty but are difficult to read accurately. Eyes and half-eyes combine error bars with approaches to visualize distributions (violins and ridgelines, respectively), and thus show both precise ranges for some confidence levels and the overall uncertainty distribution. A quantile dot plot can serve as an alternative visualization of an uncertainty distribution. By showing the distribution in discrete units, the quantile dot plot is not as precise but can be easier to read than the continuous distribution shown by a violin or ridgeline plot. ```{r} # file based on stat-smooth.R from ggplot2 #' Generate outcome draws from a smooth fit #' #' Generate outcome draws from a smooth fit. This stat is similar to [`stat_smooth()`], #' but there are a few important differences. First, there is no `method` argument. #' Only smooth fits fitted via [`mgcv::gam()`] are currently supported. If you want a #' linear fit, set a linear formula via `formula = y ~ x`. Second, there is no `se` #' argument. This stat cannot draw confidence bands. See [`confidence_band()`] for a #' workaround if you want to add confidence bands. Internally, the stat uses the #' function [`sample_outcomes()`] to calculate outcomes. #' #' This stat fits the gam with Restricted Maximum Likelihood (REML) and uses the #' smoothing parameter uncertainty corrected covariance matrix to generate outcomes #' (`unconditional = TRUE` in [`sample_outcomes()`]). If you choose a different gam #' fitting method the stat sets `unconditional = FALSE`. #' #' Note that for static plots, you will generally have to set the `group` #' aesthetic appropriately (e.g., `aes(group = stat(.draw))`). However, for #' animated plots you will normally not want to set the group aesthetic in #' this way. To enable animations by default, `stat_smooth_draws()` does not #' set a group aesthetic. See examples for further details. #' #' @inheritParams ggplot2::stat_smooth #' @param times Number of outcomes to draw. #' @param formula Formula to use in smoothing function. Default is #' a cubic spline, `y ~ s(x, bs = "cs")`. To generate a linear fit, #' set `formula = y ~ x`. #' @param gam.args List of additional arguments passed on to the #' GAM call. #' @examples #' library(ggplot2) #' #' # static plots, need to set group aesthetic manually #' ggplot(mtcars, aes(hp, mpg)) + #' geom_point() + #' stat_smooth_draws(aes(group = stat(.draw)), size = 0.5) + #' theme_bw() #' #' # if we want to group by multiple variables, we have to use their #' # mapped name (here, `colour` instead of `Species`) because we're #' # creating the groups after after initial data mapping #' ggplot(iris, aes(Sepal.Length, Sepal.Width, colour = Species)) + #' geom_point() + #' stat_smooth_draws( #' formula = y ~ x, #' aes(group = interaction(stat(.draw), colour)), #' size = 0.5 #' ) + #' theme_bw() #' #' \dontrun{ #' #' # animated plots #' library(gganimate) #' #' ggplot(mtcars, aes(hp, mpg)) + #' geom_point() + #' stat_smooth_draws(size = 0.5) + #' transition_states(stat(.draw), 1, 2) #' #' ggplot(iris, aes(Sepal.Length, Sepal.Width, colour = Species)) + #' geom_point() + #' stat_smooth_draws(formula = y ~ x, times = 20, size = 0.5) + #' transition_states(stat(.draw), 1, 2) #' } #' @export stat_smooth_draws <- function(mapping = NULL, times = 10, data = NULL, geom = "smooth", position = "identity", ..., formula = y ~ s(x, bs = "cs"), n = 80, fullrange = FALSE, gam.args = list(method = "REML"), na.rm = FALSE, show.legend = NA, inherit.aes = TRUE) { layer( data = data, mapping = mapping, stat = StatSmoothdraws, geom = geom, position = position, show.legend = show.legend, inherit.aes = inherit.aes, params = list( times = times, formula = formula, n = n, fullrange = fullrange, na.rm = na.rm, gam.args = gam.args, ... ) ) } #' @rdname stat_smooth_draws #' @format NULL #' @usage NULL #' @export StatSmoothdraws <- ggproto("StatSmoothdraws", Stat, # Setting the group aesthetic by default is good for static plots but # bad for animations. We keep it unset by default to make animations # easier. #default_aes = aes(group = stat(.draw)), compute_group = function(data, scales, times = 10, formula = y ~ s(x, bs = "cs"), se = FALSE, n = 80, fullrange = FALSE, xseq = NULL, level = 0.95, gam.args = list(method = "REML"), na.rm = FALSE) { if (length(unique(data$x)) < 2) { # Not enough data to perform fit return(new_data_frame()) } if (is.null(data$weight)) data$weight <- 1 if (is.null(xseq)) { if (is.integer(data$x)) { if (fullrange) { xseq <- scales$x$dimension() } else { xseq <- sort(unique(data$x)) } } else { if (fullrange) { range <- scales$x$dimension() } else { range <- range(data$x, na.rm = TRUE) } xseq <- seq(range[1], range[2], length.out = n) } } #base.args <- list(quote(formula), data = quote(data), weights = quote(weight)) base.args <- list(quote(formula), data = quote(data)) model <- do.call(mgcv::gam, c(base.args, gam.args)) unconditional <- FALSE if (gam.args$method == "REML") { unconditional <- TRUE } sample_outcomes(model, data.frame(x = xseq), times = times, unconditional = unconditional) }, required_aes = c("x", "y") ) ``` ```{r confidence-bands, fig.width = 8, fig.asp = 1/4} p1 <- ggplot(df_dense_scatter_sample, aes(x, y)) + geom_smooth( color = palette[1], fill = palette[npal-2], size = 0.5, level = 0.95 ) + scale_y_continuous(limits = c(-5, 5)) + labs(title = "Confidence Band") + theme_plot_icon(palette[npal], palette[1]) p2 <- ggplot(df_dense_scatter_sample, aes(x, y)) + geom_smooth(color = NA, fill = palette[npal-1], level = 0.99) + geom_smooth(color = NA, fill = palette[npal-2], level = 0.95) + geom_smooth( color = palette[1], fill = palette[npal-3], size = 0.5, level = 0.8 ) + scale_y_continuous(limits = c(-5, 5)) + labs(title = "Graded Confidence Band") + theme_plot_icon(palette[npal], palette[1]) p3 <- ggplot(df_dense_scatter_sample, aes(x, y)) + stat_smooth_draws( times = 8, aes(group = stat(.draw)), color = palette[1], size = 0.15 ) + scale_y_continuous(limits = c(-5, 5)) + labs(title = "Fitted Draws") + theme_plot_icon(palette[npal], palette[1]) plot_grid(p1, p2, p3, ncol = 3, scale = .9) ``` For smooth line graphs, the equivalent of an error bar is a confidence band. It shows a range of values the line might pass through at a given confidence level. As in the case of error bars, we can draw graded confidence bands that show multiple confidence levels at once. We can also show individual fitted draws in lieu of or in addition to the confidence bands.