Visualizing Practice

Critiquing Poor Figures

This week, I was hoping to go through critiquing poor figures in an attempt to promote the creation of stronger figures to show the primary point without being misleading.

Much of this content comes from best practices in data visualization written by Claus Wilke in his book Fundamentals of Data Visualization.

Principle of Proportional Ink

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.3     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   2.0.1     ✓ forcats 0.5.1
## Warning: package 'readr' was built under R version 4.1.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggformula)
## Loading required package: ggstance
## 
## Attaching package: 'ggstance'
## The following objects are masked from 'package:ggplot2':
## 
##     geom_errorbarh, GeomErrorbarh
## Loading required package: scales
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
## Loading required package: ggridges
## 
## New to ggformula?  Try the tutorials: 
##  learnr::run_tutorial("introduction", package = "ggformula")
##  learnr::run_tutorial("refining", package = "ggformula")
library(nycflights13)

theme_set(theme_bw(base_size = 18))

head(flights)
## # A tibble: 6 × 19
##    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
## 1  2013     1     1      517            515         2      830            819
## 2  2013     1     1      533            529         4      850            830
## 3  2013     1     1      542            540         2      923            850
## 4  2013     1     1      544            545        -1     1004           1022
## 5  2013     1     1      554            600        -6      812            837
## 6  2013     1     1      554            558        -4      740            728
## # … with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
avg_dist <- flights %>%
  group_by(carrier) %>%
  summarise(avg_dist = mean(distance, na.rm = TRUE),
            sd_dist = sd(distance, na.rm = TRUE)) %>%
  left_join(airlines) %>%
  filter(avg_dist > 1000 & avg_dist < 4000)
## Joining, by = "carrier"
avg_dist
## # A tibble: 7 × 4
##   carrier avg_dist sd_dist name                  
##   <chr>      <dbl>   <dbl> <chr>                 
## 1 AA         1340.   638.  American Airlines Inc.
## 2 AS         2402      0   Alaska Airlines Inc.  
## 3 B6         1069.   704.  JetBlue Airways       
## 4 DL         1237.   660.  Delta Air Lines Inc.  
## 5 F9         1620      0   Frontier Airlines Inc.
## 6 UA         1529.   799.  United Air Lines Inc. 
## 7 VX         2499.    88.0 Virgin America
ggplot(avg_dist, aes(x = avg_dist, y = name)) +
  geom_col() +
  xlab("Average Distance Flown") + 
  ylab("Airline Name") + 
  coord_cartesian(xlim = c(1000, 2500))

Pitfalls of color use

avg_dist <- flights %>%
  group_by(carrier) %>%
  summarise(avg_dist = mean(distance, na.rm = TRUE),
            sd_dist = sd(distance, na.rm = TRUE),
            avg_delay = mean(arr_delay, na.rm = TRUE)) %>%
  left_join(airlines) 
## Joining, by = "carrier"
avg_dist
## # A tibble: 16 × 5
##    carrier avg_dist sd_dist avg_delay name                       
##    <chr>      <dbl>   <dbl>     <dbl> <chr>                      
##  1 9E          530.   322.      7.38  Endeavor Air Inc.          
##  2 AA         1340.   638.      0.364 American Airlines Inc.     
##  3 AS         2402      0      -9.93  Alaska Airlines Inc.       
##  4 B6         1069.   704.      9.46  JetBlue Airways            
##  5 DL         1237.   660.      1.64  Delta Air Lines Inc.       
##  6 EV          563.   287.     15.8   ExpressJet Airlines Inc.   
##  7 F9         1620      0      21.9   Frontier Airlines Inc.     
##  8 FL          665.   161.     20.1   AirTran Airways Corporation
##  9 HA         4983      0      -6.92  Hawaiian Airlines Inc.     
## 10 MQ          570.   226.     10.8   Envoy Air                  
## 11 OO          501.   206.     11.9   SkyWest Airlines Inc.      
## 12 UA         1529.   799.      3.56  United Air Lines Inc.      
## 13 US          553.   584.      2.13  US Airways Inc.            
## 14 VX         2499.    88.0     1.76  Virgin America             
## 15 WN          996.   410.      9.65  Southwest Airlines Co.     
## 16 YV          375.   160.     15.6   Mesa Airlines Inc.
ggplot(avg_dist, aes(x = avg_dist, y = avg_delay)) + 
  geom_point(aes(color = name), size = 4) + 
  xlab("Average Distance Flown") + 
  ylab("Average Arrival Delay")

ggplot(flights, aes(x = distance, y = arr_delay, color = carrier)) + 
  geom_point(size = 4) + 
  xlab("Distance Flown") +
  ylab("Arrival Delay")
## Warning: Removed 9430 rows containing missing values (geom_point).

Visualize Uncertainty

ggplot(avg_dist, aes(x = avg_dist, y = name)) + 
  geom_col() + 
  xlab("Average Distance Flown") + 
  ylab("")

Previous
Next