Loading [MathJax]/jax/output/CommonHTML/fonts/TeX/fontdata.js
+ - 0:00:00
Notes for current slide
Notes for next slide

Data Processing in R

With the {tidyverse}

Michael Jones

2021-08-25

1 / 35

A quick note
on Composition

2 / 35

$f(g(x)) = (f \circ g)(x)$

3 / 35
  • f(g(x)) = g(x) %>% f()
4 / 35
  • f(g(x)) = g(x) %>% f()
  • f(g(x)) = x %>% g() %>% f()
4 / 35
  • f(g(x)) = g(x) %>% f()
  • f(g(x)) = x %>% g() %>% f()
  • f(x, y) = x %>% f(y)
4 / 35
  • f(g(x)) = g(x) %>% f()
  • f(g(x)) = x %>% g() %>% f()
  • f(x, y) = x %>% f(y)
  • f(x, y) = x |> f(y)
4 / 35

Traditional Cake

# Nested-call version of the recipe: evaluation runs inside-out, so it has to
# be read from the innermost call (mix) outwards to the outermost (sprinkle).
# Illustrative pseudo-code: `in` is a reserved word in R, so a runnable
# version would need a different argument name (or backticks around it).
sprinkle(
  sandwich(
    bake(
      mix(
        what = ingredients,
        in = "bowl"
      ),
      in = "oven",
      at = 180
    ),
    between = "jam"
  ),
  with = "sugar"
)
5 / 35

Saving your intermediate steps

batter <- mix(what = ingredients, in = "bowl")
sponge <- bake(batter, in = "oven", at = 180)
assembled_cake <- sandwich(sponge, between = "jam")
decorated_cake <- sprinkle(assembled_cake, with = "sugar")
6 / 35

Piped Cake

mix(what = ingredients, in = "bowl") %>%
bake(in = "oven", at = 180) %>%
sandwich(between = "jam") %>%
sprinkle(with = "sugar")
7 / 35

{tidyverse}

8 / 35
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.4 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 2.0.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
9 / 35

Other useful packages

library(readxl)
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
10 / 35

Reading in
a CSV

11 / 35

Old Fashioned

read.csv("my_data.csv")
## id_column group fruit rating sampled
## 1 1 A apple 4 2021-07-05
## 2 2 B apricot 3 2021-07-13
## 3 3 A avocado 5 2021-07-09
## 4 4 B banana 1 2021-07-26
## 5 5 A bell pepper 4 2021-07-22
## 6 6 B bilberry 1 2021-07-31
## 7 7 A blackberry 5 2021-07-01
## 8 8 B blackcurrant 1 2021-07-18
12 / 35

Tidyverse Way

read_csv("my_data.csv")
## Rows: 8 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): group, fruit
## dbl (2): id_column, rating
## date (1): sampled
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 8 × 5
## id_column group fruit rating sampled
## <dbl> <chr> <chr> <dbl> <date>
## 1 1 A apple 4 2021-07-05
## 2 2 B apricot 3 2021-07-13
## 3 3 A avocado 5 2021-07-09
## 4 4 B banana 1 2021-07-26
## 5 5 A bell pepper 4 2021-07-22
## 6 6 B bilberry 1 2021-07-31
## 7 7 A blackberry 5 2021-07-01
## 8 8 B blackcurrant 1 2021-07-18
13 / 35

Tell R what you expect

ratings <- read_csv("my_data.csv",
col_types = cols(
id_column = col_double(),
group = col_character(),
fruit = col_character(),
rating = col_double(),
sampled = col_date(format = "")
))
## # A tibble: 8 × 5
## id_column group fruit rating sampled
## <dbl> <chr> <chr> <dbl> <date>
## 1 1 A apple 4 2021-07-05
## 2 2 B apricot 3 2021-07-13
## 3 3 A avocado 5 2021-07-09
## 4 4 B banana 1 2021-07-26
## 5 5 A bell pepper 4 2021-07-22
## 6 6 B bilberry 1 2021-07-31
## 7 7 A blackberry 5 2021-07-01
## 8 8 B blackcurrant 1 2021-07-18
14 / 35

Tell R what you expect

ratings_bad <- read_csv("my_data.csv",
col_types = cols(
id_column = col_double(),
group = col_character(),
fruit = col_double(),
rating = col_double(),
sampled = col_date(format = "")
))
15 / 35

Taking a quick look at the data

ratings
## # A tibble: 8 × 5
## id_column group fruit rating sampled
## <dbl> <chr> <chr> <dbl> <date>
## 1 1 A apple 4 2021-07-05
## 2 2 B apricot 3 2021-07-13
## 3 3 A avocado 5 2021-07-09
## 4 4 B banana 1 2021-07-26
## 5 5 A bell pepper 4 2021-07-22
## 6 6 B bilberry 1 2021-07-31
## 7 7 A blackberry 5 2021-07-01
## 8 8 B blackcurrant 1 2021-07-18
16 / 35

Taking a quick look at the data

glimpse(ratings)
## Rows: 8
## Columns: 5
## $ id_column <dbl> 1, 2, 3, 4, 5, 6, 7, 8
## $ group <chr> "A", "B", "A", "B", "A", "B", "A", "B"
## $ fruit <chr> "apple", "apricot", "avocado", "banana", "bell pepper", "bil…
## $ rating <dbl> 4, 3, 5, 1, 4, 1, 5, 1
## $ sampled <date> 2021-07-05, 2021-07-13, 2021-07-09, 2021-07-26, 2021-07-22, …
17 / 35

Key idea of
the tidyverse

18 / 35

Functions accept a data frame

Functions return a data frame

19 / 35

Selecting Columns

select(<dataframe>, <columns>) → data frame

20 / 35

Selecting Columns

ratings %>%
select(id_column)
## # A tibble: 8 × 1
## id_column
## <dbl>
## 1 1
## 2 2
## 3 3
## 4 4
## 5 5
## 6 6
## 7 7
## 8 8
20 / 35

Selecting Columns

ratings %>%
select(id_column, group)
## # A tibble: 8 × 2
## id_column group
## <dbl> <chr>
## 1 1 A
## 2 2 B
## 3 3 A
## 4 4 B
## 5 5 A
## 6 6 B
## 7 7 A
## 8 8 B
20 / 35

Selecting Columns

ratings %>%
select(id_column:rating)
## # A tibble: 8 × 4
## id_column group fruit rating
## <dbl> <chr> <chr> <dbl>
## 1 1 A apple 4
## 2 2 B apricot 3
## 3 3 A avocado 5
## 4 4 B banana 1
## 5 5 A bell pepper 4
## 6 6 B bilberry 1
## 7 7 A blackberry 5
## 8 8 B blackcurrant 1
20 / 35

Selecting Columns

ratings %>%
select(-id_column)
## # A tibble: 8 × 4
## group fruit rating sampled
## <chr> <chr> <dbl> <date>
## 1 A apple 4 2021-07-05
## 2 B apricot 3 2021-07-13
## 3 A avocado 5 2021-07-09
## 4 B banana 1 2021-07-26
## 5 A bell pepper 4 2021-07-22
## 6 B bilberry 1 2021-07-31
## 7 A blackberry 5 2021-07-01
## 8 B blackcurrant 1 2021-07-18
20 / 35
  • Yes: ratings %>% select(id_column)
  • No: ratings %>% select("id_column")
21 / 35

Filtering Rows

filter(<dataframe>, <predicates>) → data frame

22 / 35

Filtering Rows

ratings %>%
filter(group == "A")
## # A tibble: 4 × 5
## id_column group fruit rating sampled
## <dbl> <chr> <chr> <dbl> <date>
## 1 1 A apple 4 2021-07-05
## 2 3 A avocado 5 2021-07-09
## 3 5 A bell pepper 4 2021-07-22
## 4 7 A blackberry 5 2021-07-01
22 / 35

Filtering Rows

ratings %>%
filter(rating >= 3)
## # A tibble: 5 × 5
## id_column group fruit rating sampled
## <dbl> <chr> <chr> <dbl> <date>
## 1 1 A apple 4 2021-07-05
## 2 2 B apricot 3 2021-07-13
## 3 3 A avocado 5 2021-07-09
## 4 5 A bell pepper 4 2021-07-22
## 5 7 A blackberry 5 2021-07-01
22 / 35

Filtering Rows

ratings %>%
filter(sampled > as.Date("2021-07-15"))
## # A tibble: 4 × 5
## id_column group fruit rating sampled
## <dbl> <chr> <chr> <dbl> <date>
## 1 4 B banana 1 2021-07-26
## 2 5 A bell pepper 4 2021-07-22
## 3 6 B bilberry 1 2021-07-31
## 4 8 B blackcurrant 1 2021-07-18
22 / 35

Sorting Rows

arrange(<dataframe>, <columns>) → data frame

23 / 35

Sorting Rows

ratings %>%
arrange(rating)
## # A tibble: 8 × 5
## id_column group fruit rating sampled
## <dbl> <chr> <chr> <dbl> <date>
## 1 4 B banana 1 2021-07-26
## 2 6 B bilberry 1 2021-07-31
## 3 8 B blackcurrant 1 2021-07-18
## 4 2 B apricot 3 2021-07-13
## 5 1 A apple 4 2021-07-05
## 6 5 A bell pepper 4 2021-07-22
## 7 3 A avocado 5 2021-07-09
## 8 7 A blackberry 5 2021-07-01
23 / 35

Sorting Rows

ratings %>%
arrange(desc(rating))
## # A tibble: 8 × 5
## id_column group fruit rating sampled
## <dbl> <chr> <chr> <dbl> <date>
## 1 3 A avocado 5 2021-07-09
## 2 7 A blackberry 5 2021-07-01
## 3 1 A apple 4 2021-07-05
## 4 5 A bell pepper 4 2021-07-22
## 5 2 B apricot 3 2021-07-13
## 6 4 B banana 1 2021-07-26
## 7 6 B bilberry 1 2021-07-31
## 8 8 B blackcurrant 1 2021-07-18
23 / 35

Sorting Rows

ratings %>%
arrange(group, sampled)
## # A tibble: 8 × 5
## id_column group fruit rating sampled
## <dbl> <chr> <chr> <dbl> <date>
## 1 7 A blackberry 5 2021-07-01
## 2 1 A apple 4 2021-07-05
## 3 3 A avocado 5 2021-07-09
## 4 5 A bell pepper 4 2021-07-22
## 5 2 B apricot 3 2021-07-13
## 6 8 B blackcurrant 1 2021-07-18
## 7 4 B banana 1 2021-07-26
## 8 6 B bilberry 1 2021-07-31
23 / 35

Changing Columns

mutate(<dataframe>, <new_column> = <function>) → data frame

24 / 35

Changing Columns

ratings %>%
mutate(rating = rating * 20)
## # A tibble: 8 × 5
## id_column group fruit rating sampled
## <dbl> <chr> <chr> <dbl> <date>
## 1 1 A apple 80 2021-07-05
## 2 2 B apricot 60 2021-07-13
## 3 3 A avocado 100 2021-07-09
## 4 4 B banana 20 2021-07-26
## 5 5 A bell pepper 80 2021-07-22
## 6 6 B bilberry 20 2021-07-31
## 7 7 A blackberry 100 2021-07-01
## 8 8 B blackcurrant 20 2021-07-18
24 / 35

Changing Columns

ratings %>%
mutate(fruit = str_to_upper(fruit))
## # A tibble: 8 × 5
## id_column group fruit rating sampled
## <dbl> <chr> <chr> <dbl> <date>
## 1 1 A APPLE 4 2021-07-05
## 2 2 B APRICOT 3 2021-07-13
## 3 3 A AVOCADO 5 2021-07-09
## 4 4 B BANANA 1 2021-07-26
## 5 5 A BELL PEPPER 4 2021-07-22
## 6 6 B BILBERRY 1 2021-07-31
## 7 7 A BLACKBERRY 5 2021-07-01
## 8 8 B BLACKCURRANT 1 2021-07-18
24 / 35

Changing Columns

ratings %>%
mutate(sampled = format(sampled, "%b %d"), rating = rating * 20)
## # A tibble: 8 × 5
## id_column group fruit rating sampled
## <dbl> <chr> <chr> <dbl> <chr>
## 1 1 A apple 80 Jul 05
## 2 2 B apricot 60 Jul 13
## 3 3 A avocado 100 Jul 09
## 4 4 B banana 20 Jul 26
## 5 5 A bell pepper 80 Jul 22
## 6 6 B bilberry 20 Jul 31
## 7 7 A blackberry 100 Jul 01
## 8 8 B blackcurrant 20 Jul 18
24 / 35

Changing Columns

ratings %>%
  # row_number() numbers the rows 1..n as integers whatever the row count,
  # instead of hard-coding the current number of rows (1:8).
  mutate(new_col = row_number())
## # A tibble: 8 × 6
## id_column group fruit rating sampled new_col
## <dbl> <chr> <chr> <dbl> <date> <int>
## 1 1 A apple 4 2021-07-05 1
## 2 2 B apricot 3 2021-07-13 2
## 3 3 A avocado 5 2021-07-09 3
## 4 4 B banana 1 2021-07-26 4
## 5 5 A bell pepper 4 2021-07-22 5
## 6 6 B bilberry 1 2021-07-31 6
## 7 7 A blackberry 5 2021-07-01 7
## 8 8 B blackcurrant 1 2021-07-18 8
24 / 35

Chaining Functions Together

ratings
## # A tibble: 8 × 5
## id_column group fruit rating sampled
## <dbl> <chr> <chr> <dbl> <date>
## 1 1 A apple 4 2021-07-05
## 2 2 B apricot 3 2021-07-13
## 3 3 A avocado 5 2021-07-09
## 4 4 B banana 1 2021-07-26
## 5 5 A bell pepper 4 2021-07-22
## 6 6 B bilberry 1 2021-07-31
## 7 7 A blackberry 5 2021-07-01
## 8 8 B blackcurrant 1 2021-07-18
24 / 35

Chaining Functions Together

ratings %>%
filter(group == "A")
## # A tibble: 4 × 5
## id_column group fruit rating sampled
## <dbl> <chr> <chr> <dbl> <date>
## 1 1 A apple 4 2021-07-05
## 2 3 A avocado 5 2021-07-09
## 3 5 A bell pepper 4 2021-07-22
## 4 7 A blackberry 5 2021-07-01
24 / 35

Chaining Functions Together

ratings %>%
filter(group == "A") %>%
select(fruit, sampled)
## # A tibble: 4 × 2
## fruit sampled
## <chr> <date>
## 1 apple 2021-07-05
## 2 avocado 2021-07-09
## 3 bell pepper 2021-07-22
## 4 blackberry 2021-07-01
24 / 35

Chaining Functions Together

ratings %>%
filter(group == "A") %>%
select(fruit, sampled) %>%
mutate(month_name = format(sampled, "%b"))
## # A tibble: 4 × 3
## fruit sampled month_name
## <chr> <date> <chr>
## 1 apple 2021-07-05 Jul
## 2 avocado 2021-07-09 Jul
## 3 bell pepper 2021-07-22 Jul
## 4 blackberry 2021-07-01 Jul
24 / 35

Chaining Functions Together

ratings %>%
filter(group == "A") %>%
select(fruit, sampled) %>%
mutate(month_name = format(sampled, "%b")) %>%
arrange(desc(fruit))
## # A tibble: 4 × 3
## fruit sampled month_name
## <chr> <date> <chr>
## 1 blackberry 2021-07-01 Jul
## 2 bell pepper 2021-07-22 Jul
## 3 avocado 2021-07-09 Jul
## 4 apple 2021-07-05 Jul
24 / 35

Grouping and Summarising

25 / 35

Grouping and Summarising

group_by(<dataframe>, <columns>) → data frame

summarise(<dataframe>, <new column> = <function>) → data frame

26 / 35

Grouping and Summarising

ratings %>% head(n = 2)
## # A tibble: 2 × 5
## id_column group fruit rating sampled
## <dbl> <chr> <chr> <dbl> <date>
## 1 1 A apple 4 2021-07-05
## 2 2 B apricot 3 2021-07-13
ratings %>% group_by(group) %>% head(n = 2)
## # A tibble: 2 × 5
## # Groups: group [2]
## id_column group fruit rating sampled
## <dbl> <chr> <chr> <dbl> <date>
## 1 1 A apple 4 2021-07-05
## 2 2 B apricot 3 2021-07-13
27 / 35

Grouping and Summarising

ratings %>% summarise(mean = mean(rating))
## # A tibble: 1 × 1
## mean
## <dbl>
## 1 3
ratings %>% group_by(group) %>% summarise(mean = mean(rating))
## # A tibble: 2 × 2
## group mean
## <chr> <dbl>
## 1 A 4.5
## 2 B 1.5
28 / 35

Counting

ratings %>% group_by(group) %>% summarise(count = n())
## # A tibble: 2 × 2
## group count
## <chr> <int>
## 1 A 4
## 2 B 4
ratings %>% count(group, name = "count")
## # A tibble: 2 × 2
## group count
## <chr> <int>
## 1 A 4
## 2 B 4
29 / 35

tibbles within tibbles

30 / 35

Nesting Data

ratings
## # A tibble: 8 × 5
## id_column group fruit rating sampled
## <dbl> <chr> <chr> <dbl> <date>
## 1 1 A apple 4 2021-07-05
## 2 2 B apricot 3 2021-07-13
## 3 3 A avocado 5 2021-07-09
## 4 4 B banana 1 2021-07-26
## 5 5 A bell pepper 4 2021-07-22
## 6 6 B bilberry 1 2021-07-31
## 7 7 A blackberry 5 2021-07-01
## 8 8 B blackcurrant 1 2021-07-18
30 / 35

Nesting Data

ratings %>%
group_by(group)
## # A tibble: 8 × 5
## # Groups: group [2]
## id_column group fruit rating sampled
## <dbl> <chr> <chr> <dbl> <date>
## 1 1 A apple 4 2021-07-05
## 2 2 B apricot 3 2021-07-13
## 3 3 A avocado 5 2021-07-09
## 4 4 B banana 1 2021-07-26
## 5 5 A bell pepper 4 2021-07-22
## 6 6 B bilberry 1 2021-07-31
## 7 7 A blackberry 5 2021-07-01
## 8 8 B blackcurrant 1 2021-07-18
30 / 35

Nesting Data

ratings %>%
group_by(group) %>%
nest()
## # A tibble: 2 × 2
## # Groups: group [2]
## group data
## <chr> <list>
## 1 A <tibble [4 × 4]>
## 2 B <tibble [4 × 4]>
30 / 35

Nesting Data

(subset <- ratings %>%
filter(group == "A"))
## # A tibble: 4 × 5
## id_column group fruit rating sampled
## <dbl> <chr> <chr> <dbl> <date>
## 1 1 A apple 4 2021-07-05
## 2 3 A avocado 5 2021-07-09
## 3 5 A bell pepper 4 2021-07-22
## 4 7 A blackberry 5 2021-07-01
30 / 35

Nesting Data

(subset <- ratings %>%
filter(group == "A"))
#' Describe the highest-rated fruit in a data frame
#'
#' @param df A data frame with a `rating` column and a `fruit` column.
#' @return A character string: "The highest rated fruit is <fruit>".
my_func <- function(df) {
  f <- df %>%
    # na.rm = TRUE: a single missing rating would otherwise make max() NA,
    # the filter would keep no rows and no fruit would be reported.
    filter(rating == max(rating, na.rm = TRUE)) %>%
    # Several fruits can share the top rating: keep only the first such row.
    slice(1) %>%
    pull(fruit)
  paste("The highest rated fruit is", f)
}
## # A tibble: 4 × 5
## id_column group fruit rating sampled
## <dbl> <chr> <chr> <dbl> <date>
## 1 1 A apple 4 2021-07-05
## 2 3 A avocado 5 2021-07-09
## 3 5 A bell pepper 4 2021-07-22
## 4 7 A blackberry 5 2021-07-01
30 / 35

Nesting Data

(subset <- ratings %>%
filter(group == "A"))
#' Describe the highest-rated fruit in a data frame
#'
#' @param df A data frame with a `rating` column and a `fruit` column.
#' @return A character string: "The highest rated fruit is <fruit>".
my_func <- function(df) {
  f <- df %>%
    # na.rm = TRUE: a single missing rating would otherwise make max() NA,
    # the filter would keep no rows and no fruit would be reported.
    filter(rating == max(rating, na.rm = TRUE)) %>%
    # Several fruits can share the top rating: keep only the first such row.
    slice(1) %>%
    pull(fruit)
  paste("The highest rated fruit is", f)
}
my_func(subset)
## # A tibble: 4 × 5
## id_column group fruit rating sampled
## <dbl> <chr> <chr> <dbl> <date>
## 1 1 A apple 4 2021-07-05
## 2 3 A avocado 5 2021-07-09
## 3 5 A bell pepper 4 2021-07-22
## 4 7 A blackberry 5 2021-07-01
## [1] "The highest rated fruit is avocado"
30 / 35

Nesting Data

#' Describe the highest-rated fruit in a data frame
#'
#' @param df A data frame with a `rating` column and a `fruit` column.
#' @return A character string: "The highest rated fruit is <fruit>".
my_func <- function(df) {
  f <- df %>%
    # na.rm = TRUE: a single missing rating would otherwise make max() NA,
    # the filter would keep no rows and no fruit would be reported.
    filter(rating == max(rating, na.rm = TRUE)) %>%
    # Several fruits can share the top rating: keep only the first such row.
    slice(1) %>%
    pull(fruit)
  paste("The highest rated fruit is", f)
}
30 / 35

Nesting Data

#' Describe the highest-rated fruit in a data frame
#'
#' @param df A data frame with a `rating` column and a `fruit` column.
#' @return A character string: "The highest rated fruit is <fruit>".
my_func <- function(df) {
  f <- df %>%
    # na.rm = TRUE: a single missing rating would otherwise make max() NA,
    # the filter would keep no rows and no fruit would be reported.
    filter(rating == max(rating, na.rm = TRUE)) %>%
    # Several fruits can share the top rating: keep only the first such row.
    slice(1) %>%
    pull(fruit)
  paste("The highest rated fruit is", f)
}
ratings %>%
group_by(group) %>%
nest()
## # A tibble: 2 × 2
## # Groups: group [2]
## group data
## <chr> <list>
## 1 A <tibble [4 × 4]>
## 2 B <tibble [4 × 4]>
30 / 35

Nesting Data

#' Describe the highest-rated fruit in a data frame
#'
#' @param df A data frame with a `rating` column and a `fruit` column.
#' @return A character string: "The highest rated fruit is <fruit>".
my_func <- function(df) {
  f <- df %>%
    # na.rm = TRUE: a single missing rating would otherwise make max() NA,
    # the filter would keep no rows and no fruit would be reported.
    filter(rating == max(rating, na.rm = TRUE)) %>%
    # Several fruits can share the top rating: keep only the first such row.
    slice(1) %>%
    pull(fruit)
  paste("The highest rated fruit is", f)
}
ratings %>%
group_by(group) %>%
nest() %>%
mutate(description = map(data, my_func))
## # A tibble: 2 × 3
## # Groups: group [2]
## group data description
## <chr> <list> <list>
## 1 A <tibble [4 × 4]> <chr [1]>
## 2 B <tibble [4 × 4]> <chr [1]>
30 / 35

Nesting Data

#' Describe the highest-rated fruit in a data frame
#'
#' @param df A data frame with a `rating` column and a `fruit` column.
#' @return A character string: "The highest rated fruit is <fruit>".
my_func <- function(df) {
  f <- df %>%
    # na.rm = TRUE: a single missing rating would otherwise make max() NA,
    # the filter would keep no rows and no fruit would be reported.
    filter(rating == max(rating, na.rm = TRUE)) %>%
    # Several fruits can share the top rating: keep only the first such row.
    slice(1) %>%
    pull(fruit)
  paste("The highest rated fruit is", f)
}
ratings %>%
group_by(group) %>%
nest() %>%
mutate(description = map(data, my_func)) %>%
unnest(description)
## # A tibble: 2 × 3
## # Groups: group [2]
## group data description
## <chr> <list> <chr>
## 1 A <tibble [4 × 4]> The highest rated fruit is avocado
## 2 B <tibble [4 × 4]> The highest rated fruit is apricot
30 / 35

More involved nesting

library(gapminder)
30 / 35

More involved nesting

library(gapminder)
lm_fit <- function(df) {
lm(lifeExp ~ year, data = df)
}
30 / 35

More involved nesting

library(gapminder)
lm_fit <- function(df) {
lm(lifeExp ~ year, data = df)
}
plot_graph <- function(df) {
ggplot(df, aes(x = year, y = lifeExp)) +
geom_point() +
geom_smooth(method = "lm")
}
30 / 35

More involved nesting

library(gapminder)
lm_fit <- function(df) {
lm(lifeExp ~ year, data = df)
}
plot_graph <- function(df) {
ggplot(df, aes(x = year, y = lifeExp)) +
geom_point() +
geom_smooth(method = "lm")
}
gapminder
## # A tibble: 1,704 × 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 1952 28.8 8425333 779.
## 2 Afghanistan Asia 1957 30.3 9240934 821.
## 3 Afghanistan Asia 1962 32.0 10267083 853.
## 4 Afghanistan Asia 1967 34.0 11537966 836.
## 5 Afghanistan Asia 1972 36.1 13079460 740.
## 6 Afghanistan Asia 1977 38.4 14880372 786.
## 7 Afghanistan Asia 1982 39.9 12881816 978.
## 8 Afghanistan Asia 1987 40.8 13867957 852.
## 9 Afghanistan Asia 1992 41.7 16317921 649.
## 10 Afghanistan Asia 1997 41.8 22227415 635.
## # … with 1,694 more rows
30 / 35

More involved nesting

library(gapminder)
lm_fit <- function(df) {
lm(lifeExp ~ year, data = df)
}
plot_graph <- function(df) {
ggplot(df, aes(x = year, y = lifeExp)) +
geom_point() +
geom_smooth(method = "lm")
}
gapminder %>%
group_by(country) %>%
nest()
## # A tibble: 142 × 2
## # Groups: country [142]
## country data
## <fct> <list>
## 1 Afghanistan <tibble [12 × 5]>
## 2 Albania <tibble [12 × 5]>
## 3 Algeria <tibble [12 × 5]>
## 4 Angola <tibble [12 × 5]>
## 5 Argentina <tibble [12 × 5]>
## 6 Australia <tibble [12 × 5]>
## 7 Austria <tibble [12 × 5]>
## 8 Bahrain <tibble [12 × 5]>
## 9 Bangladesh <tibble [12 × 5]>
## 10 Belgium <tibble [12 × 5]>
## # … with 132 more rows
30 / 35

More involved nesting

library(gapminder)
lm_fit <- function(df) {
lm(lifeExp ~ year, data = df)
}
plot_graph <- function(df) {
ggplot(df, aes(x = year, y = lifeExp)) +
geom_point() +
geom_smooth(method = "lm")
}
# Keep the nested result: the following slides query `gapminder_modified`.
# The outer parentheses make the assignment also print its value.
(gapminder_modified <- gapminder %>%
  group_by(country) %>%
  nest() %>%
  # One fitted model and one plot per country, stored as list-columns.
  mutate(
    model = map(data, lm_fit),
    plot = map(data, plot_graph)
  ))
## # A tibble: 142 × 4
## # Groups: country [142]
## country data model plot
## <fct> <list> <list> <list>
## 1 Afghanistan <tibble [12 × 5]> <lm> <gg>
## 2 Albania <tibble [12 × 5]> <lm> <gg>
## 3 Algeria <tibble [12 × 5]> <lm> <gg>
## 4 Angola <tibble [12 × 5]> <lm> <gg>
## 5 Argentina <tibble [12 × 5]> <lm> <gg>
## 6 Australia <tibble [12 × 5]> <lm> <gg>
## 7 Austria <tibble [12 × 5]> <lm> <gg>
## 8 Bahrain <tibble [12 × 5]> <lm> <gg>
## 9 Bangladesh <tibble [12 × 5]> <lm> <gg>
## 10 Belgium <tibble [12 × 5]> <lm> <gg>
## # … with 132 more rows
30 / 35

More involved nesting

gapminder_modified %>%
filter(country == "United Kingdom") %>%
pull(model)
## [[1]]
##
## Call:
## lm(formula = lifeExp ~ year, data = df)
##
## Coefficients:
## (Intercept) year
## -294.197 0.186
30 / 35

More involved nesting

gapminder_modified %>%
filter(country == "United Kingdom") %>%
pull(plot)
## [[1]]
## `geom_smooth()` using formula 'y ~ x'

30 / 35

Reshape your data

31 / 35

Pivoting Longer

gapminder %>%
select(-c(continent))
## # A tibble: 1,704 × 5
## country year lifeExp pop gdpPercap
## <fct> <int> <dbl> <int> <dbl>
## 1 Afghanistan 1952 28.8 8425333 779.
## 2 Afghanistan 1957 30.3 9240934 821.
## 3 Afghanistan 1962 32.0 10267083 853.
## 4 Afghanistan 1967 34.0 11537966 836.
## 5 Afghanistan 1972 36.1 13079460 740.
## 6 Afghanistan 1977 38.4 14880372 786.
## 7 Afghanistan 1982 39.9 12881816 978.
## 8 Afghanistan 1987 40.8 13867957 852.
## 9 Afghanistan 1992 41.7 16317921 649.
## 10 Afghanistan 1997 41.8 22227415 635.
## # … with 1,694 more rows
31 / 35

Pivoting Longer

gapminder %>%
select(-c(continent)) %>%
pivot_longer(cols = -c(country, year),
names_to = "stat",
values_to = "value")
## # A tibble: 5,112 × 4
## country year stat value
## <fct> <int> <chr> <dbl>
## 1 Afghanistan 1952 lifeExp 28.8
## 2 Afghanistan 1952 pop 8425333
## 3 Afghanistan 1952 gdpPercap 779.
## 4 Afghanistan 1957 lifeExp 30.3
## 5 Afghanistan 1957 pop 9240934
## 6 Afghanistan 1957 gdpPercap 821.
## 7 Afghanistan 1962 lifeExp 32.0
## 8 Afghanistan 1962 pop 10267083
## 9 Afghanistan 1962 gdpPercap 853.
## 10 Afghanistan 1967 lifeExp 34.0
## # … with 5,102 more rows
31 / 35

Pivoting Wider

gapminder %>%
select(country, year, pop)
## # A tibble: 1,704 × 3
## country year pop
## <fct> <int> <int>
## 1 Afghanistan 1952 8425333
## 2 Afghanistan 1957 9240934
## 3 Afghanistan 1962 10267083
## 4 Afghanistan 1967 11537966
## 5 Afghanistan 1972 13079460
## 6 Afghanistan 1977 14880372
## 7 Afghanistan 1982 12881816
## 8 Afghanistan 1987 13867957
## 9 Afghanistan 1992 16317921
## 10 Afghanistan 1997 22227415
## # … with 1,694 more rows
31 / 35

Pivoting Wider

gapminder %>%
select(country, year, pop) %>%
pivot_wider(names_from = "year",
values_from = "pop")
## # A tibble: 142 × 13
## country `1952` `1957` `1962` `1967` `1972` `1977` `1982` `1987` `1992` `1997`
## <fct> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
## 1 Afghan… 8.43e6 9.24e6 1.03e7 1.15e7 1.31e7 1.49e7 1.29e7 1.39e7 1.63e7 2.22e7
## 2 Albania 1.28e6 1.48e6 1.73e6 1.98e6 2.26e6 2.51e6 2.78e6 3.08e6 3.33e6 3.43e6
## 3 Algeria 9.28e6 1.03e7 1.10e7 1.28e7 1.48e7 1.72e7 2.00e7 2.33e7 2.63e7 2.91e7
## 4 Angola 4.23e6 4.56e6 4.83e6 5.25e6 5.89e6 6.16e6 7.02e6 7.87e6 8.74e6 9.88e6
## 5 Argent… 1.79e7 1.96e7 2.13e7 2.29e7 2.48e7 2.70e7 2.93e7 3.16e7 3.40e7 3.62e7
## 6 Austra… 8.69e6 9.71e6 1.08e7 1.19e7 1.32e7 1.41e7 1.52e7 1.63e7 1.75e7 1.86e7
## 7 Austria 6.93e6 6.97e6 7.13e6 7.38e6 7.54e6 7.57e6 7.57e6 7.58e6 7.91e6 8.07e6
## 8 Bahrain 1.20e5 1.39e5 1.72e5 2.02e5 2.31e5 2.97e5 3.78e5 4.55e5 5.29e5 5.99e5
## 9 Bangla… 4.69e7 5.14e7 5.68e7 6.28e7 7.08e7 8.04e7 9.31e7 1.04e8 1.14e8 1.23e8
## 10 Belgium 8.73e6 8.99e6 9.22e6 9.56e6 9.71e6 9.82e6 9.86e6 9.87e6 1.00e7 1.02e7
## # … with 132 more rows, and 2 more variables: 2002 <int>, 2007 <int>
31 / 35

Combining Data

32 / 35

New data

group_details
## # A tibble: 4 × 2
## group time
## <chr> <chr>
## 1 A morning
## 2 B lunchtime
## 3 C afternoon
## 4 D evening

More groups than in ratings

fruit_details
## # A tibble: 3 × 2
## fruit price
## <chr> <dbl>
## 1 apple 2
## 2 banana 4
## 3 blackberry 6

Fewer fruits than in ratings

33 / 35

Left Join

ratings
## # A tibble: 8 × 5
## id_column group fruit rating sampled
## <dbl> <chr> <chr> <dbl> <date>
## 1 1 A apple 4 2021-07-05
## 2 2 B apricot 3 2021-07-13
## 3 3 A avocado 5 2021-07-09
## 4 4 B banana 1 2021-07-26
## 5 5 A bell pepper 4 2021-07-22
## 6 6 B bilberry 1 2021-07-31
## 7 7 A blackberry 5 2021-07-01
## 8 8 B blackcurrant 1 2021-07-18
33 / 35

Left Join

ratings %>%
left_join(group_details, by = "group")
## # A tibble: 8 × 6
## id_column group fruit rating sampled time
## <dbl> <chr> <chr> <dbl> <date> <chr>
## 1 1 A apple 4 2021-07-05 morning
## 2 2 B apricot 3 2021-07-13 lunchtime
## 3 3 A avocado 5 2021-07-09 morning
## 4 4 B banana 1 2021-07-26 lunchtime
## 5 5 A bell pepper 4 2021-07-22 morning
## 6 6 B bilberry 1 2021-07-31 lunchtime
## 7 7 A blackberry 5 2021-07-01 morning
## 8 8 B blackcurrant 1 2021-07-18 lunchtime
33 / 35

Left Join

ratings
## # A tibble: 8 × 5
## id_column group fruit rating sampled
## <dbl> <chr> <chr> <dbl> <date>
## 1 1 A apple 4 2021-07-05
## 2 2 B apricot 3 2021-07-13
## 3 3 A avocado 5 2021-07-09
## 4 4 B banana 1 2021-07-26
## 5 5 A bell pepper 4 2021-07-22
## 6 6 B bilberry 1 2021-07-31
## 7 7 A blackberry 5 2021-07-01
## 8 8 B blackcurrant 1 2021-07-18
33 / 35

Left Join

ratings %>%
left_join(fruit_details, by = "fruit")
## # A tibble: 8 × 6
## id_column group fruit rating sampled price
## <dbl> <chr> <chr> <dbl> <date> <dbl>
## 1 1 A apple 4 2021-07-05 2
## 2 2 B apricot 3 2021-07-13 NA
## 3 3 A avocado 5 2021-07-09 NA
## 4 4 B banana 1 2021-07-26 4
## 5 5 A bell pepper 4 2021-07-22 NA
## 6 6 B bilberry 1 2021-07-31 NA
## 7 7 A blackberry 5 2021-07-01 6
## 8 8 B blackcurrant 1 2021-07-18 NA
33 / 35

Inner Join

ratings
## # A tibble: 8 × 5
## id_column group fruit rating sampled
## <dbl> <chr> <chr> <dbl> <date>
## 1 1 A apple 4 2021-07-05
## 2 2 B apricot 3 2021-07-13
## 3 3 A avocado 5 2021-07-09
## 4 4 B banana 1 2021-07-26
## 5 5 A bell pepper 4 2021-07-22
## 6 6 B bilberry 1 2021-07-31
## 7 7 A blackberry 5 2021-07-01
## 8 8 B blackcurrant 1 2021-07-18
33 / 35

Inner Join

ratings %>%
inner_join(fruit_details, by = "fruit")
## # A tibble: 3 × 6
## id_column group fruit rating sampled price
## <dbl> <chr> <chr> <dbl> <date> <dbl>
## 1 1 A apple 4 2021-07-05 2
## 2 4 B banana 1 2021-07-26 4
## 3 7 A blackberry 5 2021-07-01 6
33 / 35

Full Join

ratings
## # A tibble: 8 × 5
## id_column group fruit rating sampled
## <dbl> <chr> <chr> <dbl> <date>
## 1 1 A apple 4 2021-07-05
## 2 2 B apricot 3 2021-07-13
## 3 3 A avocado 5 2021-07-09
## 4 4 B banana 1 2021-07-26
## 5 5 A bell pepper 4 2021-07-22
## 6 6 B bilberry 1 2021-07-31
## 7 7 A blackberry 5 2021-07-01
## 8 8 B blackcurrant 1 2021-07-18
33 / 35

Full Join

ratings %>%
full_join(group_details, by = "group")
## # A tibble: 10 × 6
## id_column group fruit rating sampled time
## <dbl> <chr> <chr> <dbl> <date> <chr>
## 1 1 A apple 4 2021-07-05 morning
## 2 2 B apricot 3 2021-07-13 lunchtime
## 3 3 A avocado 5 2021-07-09 morning
## 4 4 B banana 1 2021-07-26 lunchtime
## 5 5 A bell pepper 4 2021-07-22 morning
## 6 6 B bilberry 1 2021-07-31 lunchtime
## 7 7 A blackberry 5 2021-07-01 morning
## 8 8 B blackcurrant 1 2021-07-18 lunchtime
## 9 NA C <NA> NA NA afternoon
## 10 NA D <NA> NA NA evening
33 / 35

Key Takeaways

34 / 35
  • The tidyverse is big
  • Composition is a key part
  • Grouping and nesting are powerful
  • Writing tidyverse style functions is a little more involved
35 / 35

A quick note
on Composition

2 / 35
Paused

Help

Keyboard shortcuts

↑, ←, Pg Up, k Go to previous slide
↓, →, Pg Dn, Space, j Go to next slide
Home Go to first slide
End Go to last slide
Number + Return Go to specific slide
b / m / f Toggle blackout / mirrored / fullscreen mode
c Clone slideshow
p Toggle presenter mode
t Restart the presentation timer
?, h Toggle this help
o Tile View: Overview of Slides
Esc Back to slideshow