Chapter 5 Working with relational databases
library(palmerpenguins)
library(tidyverse)
penguins %>%
tidyr::pivot_longer(contains("_"), # measurement cols
names_to = c("variable_name"),
values_to = "value")
## # A tibble: 1,376 × 6
## species island sex year variable_name value
## <fct> <fct> <fct> <int> <chr> <dbl>
## 1 Adelie Torgersen male 2007 bill_length_mm 39.1
## 2 Adelie Torgersen male 2007 bill_depth_mm 18.7
## 3 Adelie Torgersen male 2007 flipper_length_mm 181
## 4 Adelie Torgersen male 2007 body_mass_g 3750
## 5 Adelie Torgersen female 2007 bill_length_mm 39.5
## 6 Adelie Torgersen female 2007 bill_depth_mm 17.4
## 7 Adelie Torgersen female 2007 flipper_length_mm 186
## 8 Adelie Torgersen female 2007 body_mass_g 3800
## 9 Adelie Torgersen female 2007 bill_length_mm 40.3
## 10 Adelie Torgersen female 2007 bill_depth_mm 18
## # ℹ 1,366 more rows
penguins_long <- penguins %>%
tidyr::pivot_longer(contains("_"), # measurement cols
names_to = c("part", "measure", "unit"),
names_sep = "_")
penguins_long
## # A tibble: 1,376 × 8
## species island sex year part measure unit value
## <fct> <fct> <fct> <int> <chr> <chr> <chr> <dbl>
## 1 Adelie Torgersen male 2007 bill length mm 39.1
## 2 Adelie Torgersen male 2007 bill depth mm 18.7
## 3 Adelie Torgersen male 2007 flipper length mm 181
## 4 Adelie Torgersen male 2007 body mass g 3750
## 5 Adelie Torgersen female 2007 bill length mm 39.5
## 6 Adelie Torgersen female 2007 bill depth mm 17.4
## 7 Adelie Torgersen female 2007 flipper length mm 186
## 8 Adelie Torgersen female 2007 body mass g 3800
## 9 Adelie Torgersen female 2007 bill length mm 40.3
## 10 Adelie Torgersen female 2007 bill depth mm 18
## # ℹ 1,366 more rows
penguins_long %>%
tidyr::pivot_wider(names_from = c("part", "measure", "unit"), # pivot these columns
values_from = "value", # take the values from here
names_sep = "_") # combine col names using an underscore
## Warning: Values from `value` are not uniquely
## identified; output will contain
## list-cols.
## • Use `values_fn = list` to suppress
## this warning.
## • Use `values_fn = {summary_fun}` to
## summarise duplicates.
## • Use the following dplyr code to
## identify duplicates.
## {data} |>
## dplyr::summarise(n = dplyr::n(), .by =
## c(species, island, sex, year, part,
## measure, unit)) |>
## dplyr::filter(n > 1L)
## # A tibble: 35 × 8
## species island sex year bill_length_mm bill_depth_mm flipper_length_mm
## <fct> <fct> <fct> <int> <list> <list> <list>
## 1 Adelie Torgersen male 2007 <dbl [7]> <dbl [7]> <dbl [7]>
## 2 Adelie Torgersen female 2007 <dbl [8]> <dbl [8]> <dbl [8]>
## 3 Adelie Torgersen <NA> 2007 <dbl [5]> <dbl [5]> <dbl [5]>
## 4 Adelie Biscoe female 2007 <dbl [5]> <dbl [5]> <dbl [5]>
## 5 Adelie Biscoe male 2007 <dbl [5]> <dbl [5]> <dbl [5]>
## 6 Adelie Dream female 2007 <dbl [9]> <dbl [9]> <dbl [9]>
## 7 Adelie Dream male 2007 <dbl [10]> <dbl [10]> <dbl [10]>
## 8 Adelie Dream <NA> 2007 <dbl [1]> <dbl [1]> <dbl [1]>
## 9 Adelie Biscoe female 2008 <dbl [9]> <dbl [9]> <dbl [9]>
## 10 Adelie Biscoe male 2008 <dbl [9]> <dbl [9]> <dbl [9]>
## # ℹ 25 more rows
## # ℹ 1 more variable: body_mass_g <list>
# give each measured penguin a number
penguins_with_id <- penguins %>% mutate(id=seq(nrow(.)))
# make the data long again
penguins_long_ided <- penguins_with_id %>%
tidyr::pivot_longer(contains("_"), # measurement cols
names_to = c("part", "measure", "unit"),
names_sep = "_")
# and wide again
penguins_long_ided2 <- penguins_long_ided %>%
tidyr::pivot_wider(names_from = c("part", "measure", "unit"),
values_from = "value",
names_sep = "_")
## # A tibble: 6 × 8
## species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## <fct> <fct> <dbl> <dbl> <int> <int>
## 1 Adelie Torgersen 39.1 18.7 181 3750
## 2 Adelie Torgersen 39.5 17.4 186 3800
## 3 Adelie Torgersen 40.3 18 195 3250
## 4 Adelie Torgersen NA NA NA NA
## 5 Adelie Torgersen 36.7 19.3 193 3450
## 6 Adelie Torgersen 39.3 20.6 190 3650
## # ℹ 2 more variables: sex <fct>, year <int>
## # A tibble: 344 × 9
## penguin_id species island bill_length_mm bill_depth_mm flipper_length_mm
## <int> <fct> <fct> <dbl> <dbl> <int>
## 1 1 Adelie Torgersen 39.1 18.7 181
## 2 2 Adelie Torgersen 39.5 17.4 186
## 3 3 Adelie Torgersen 40.3 18 195
## 4 4 Adelie Torgersen NA NA NA
## 5 5 Adelie Torgersen 36.7 19.3 193
## 6 6 Adelie Torgersen 39.3 20.6 190
## 7 7 Adelie Torgersen 38.9 17.8 181
## 8 8 Adelie Torgersen 39.2 19.6 195
## 9 9 Adelie Torgersen 34.1 18.1 193
## 10 10 Adelie Torgersen 42 20.2 190
## # ℹ 334 more rows
## # ℹ 3 more variables: body_mass_g <int>, sex <fct>, year <int>
food_preference <-
tibble(
penguin_species = c("Adelie", "Chinstrap", "Gentoo"),
favorite_food = c("peanut butter", "cheese", "chocolate")
)
## # A tibble: 344 × 9
## species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## <chr> <fct> <dbl> <dbl> <int> <int>
## 1 Adelie Torgersen 39.1 18.7 181 3750
## 2 Adelie Torgersen 39.5 17.4 186 3800
## 3 Adelie Torgersen 40.3 18 195 3250
## 4 Adelie Torgersen NA NA NA NA
## 5 Adelie Torgersen 36.7 19.3 193 3450
## 6 Adelie Torgersen 39.3 20.6 190 3650
## 7 Adelie Torgersen 38.9 17.8 181 3625
## 8 Adelie Torgersen 39.2 19.6 195 4675
## 9 Adelie Torgersen 34.1 18.1 193 3475
## 10 Adelie Torgersen 42 20.2 190 4250
## # ℹ 334 more rows
## # ℹ 3 more variables: sex <fct>, year <int>, favorite_food <chr>
## # A tibble: 344 × 9
## penguin_id species island bill_length_mm bill_depth_mm flipper_length_mm
## <int> <fct> <fct> <dbl> <dbl> <int>
## 1 1 Adelie Torgersen 39.1 18.7 181
## 2 2 Adelie Torgersen 39.5 17.4 186
## 3 3 Adelie Torgersen 40.3 18 195
## 4 4 Adelie Torgersen NA NA NA
## 5 5 Adelie Torgersen 36.7 19.3 193
## 6 6 Adelie Torgersen 39.3 20.6 190
## 7 7 Adelie Torgersen 38.9 17.8 181
## 8 8 Adelie Torgersen 39.2 19.6 195
## 9 9 Adelie Torgersen 34.1 18.1 193
## 10 10 Adelie Torgersen 42 20.2 190
## # ℹ 334 more rows
## # ℹ 3 more variables: body_mass_g <int>, sex <fct>, year <int>
Let’s refresh your memory. Take the penguins dataset.
As we all know, Adelie penguins love peanut butter, Chinstrap penguins prefer cheese and Gentoo penguins do anything for chocolate.
Here is a table describing food preference:
food_preference <-
tibble(
penguin_species = c("Adelie", "Chinstrap", "Gentoo"),
favorite_food = c("peanut butter", "cheese", "chocolate")
)
Now make a new tibble containing only the information on penguin species, sex and bill length measured in 2009 using filter and select.
#penguins_long_ided #species island sex year id bill_length_mm bill_depth_mm
##filter(year=="2009")
# species island sex year id bill_length_mm bill_depth_mm
head(penguins_long_ided2)
## # A tibble: 6 × 9
## species island sex year id bill_length_mm bill_depth_mm
## <fct> <fct> <fct> <int> <int> <dbl> <dbl>
## 1 Adelie Torgersen male 2007 1 39.1 18.7
## 2 Adelie Torgersen female 2007 2 39.5 17.4
## 3 Adelie Torgersen female 2007 3 40.3 18
## 4 Adelie Torgersen <NA> 2007 4 NA NA
## 5 Adelie Torgersen female 2007 5 36.7 19.3
## 6 Adelie Torgersen male 2007 6 39.3 20.6
## # ℹ 2 more variables: flipper_length_mm <dbl>, body_mass_g <dbl>
new_tbl <- penguins_long_ided2 %>% filter(year==2009) %>% select(species,sex,bill_length_mm)
head(new_tbl)
## # A tibble: 6 × 3
## species sex bill_length_mm
## <fct> <fct> <dbl>
## 1 Adelie female 35
## 2 Adelie male 41
## 3 Adelie female 37.7
## 4 Adelie male 37.8
## 5 Adelie female 37.9
## 6 Adelie male 39.7