for (i in some_vector) {
# Code to do stuff with i
}
Iterating Over Functions
In this unit, you’ll learn tools for iteration—repeatedly performing the same action on different objects. Iteration in R generally tends to look a bit different from other programming languages. Much of iteration we get for free! For example, if you want to double a numeric vector `x` in R, you can just write `2 * x`, whereas in many other languages you would need to explicitly double each element of `x` using some sort of `for` loop.
In R, there are generally two methods for iteration—`for()` loops and functionals. We will start with a review of `for()` loops before hopping over to functionals.
`for` loops
In R, `for` loops have the following general structure:
for (i in some_vector) {
  # Code to do stuff with i
}
`some_vector` can be any vector, including:
- An indexing vector: `1:3`
- A character vector: `c("group1", "group2", "group3")`
- A vector of any other class
groups <- c("group1", "group2", "group3")
for (i in 1:3) {
print(groups[i])
}
[1] "group1"
[1] "group2"
[1] "group3"
for (g in groups) {
print(g)
}
[1] "group1"
[1] "group2"
[1] "group3"
The `seq_along()` function generates an integer sequence from 1 to the length of the vector supplied. A nice feature of `seq_along()` is that it generates an empty iteration vector if the vector you’re iterating over itself has length 0.
seq_along(groups)
[1] 1 2 3
no_groups <- c()
seq_along(no_groups)
integer(0)
for (i in seq_along(groups)) {
print(groups[i])
}
[1] "group1"
[1] "group2"
[1] "group3"
for (i in seq_along(no_groups)) {
print(no_groups[i])
}
Closely related to `seq_along()` is `seq_len()`. While `seq_along(x)` generates an integer sequence from 1 to `length(x)`, `seq_len(x)` takes `x` itself to be a length:
seq_len(3)
[1] 1 2 3
seq_len(0)
integer(0)
for (i in seq_len(length(groups))) {
print(groups[i])
}
[1] "group1"
[1] "group2"
[1] "group3"
for (i in seq_len(length(no_groups))) {
print(no_groups[i])
}
`seq_len()` is useful for iterating over the rows of a data frame because `seq_along()` would iterate over columns:
small_data <- tibble(a = 1:2,
                     b = 2:3,
                     c = 4:5)
for (col in small_data) {
print(col)
}
[1] 1 2
[1] 2 3
[1] 4 5
for (r in seq_len(nrow(small_data))) {
print(r)
}
[1] 1
[1] 2
Often we’ll want to store output created during a `for` loop. We can create storage containers with the `vector()` function:
char_storage <- vector("character",
                       length = 3)
char_storage
[1] "" "" ""
num_storage <- vector("numeric",
                      length = 3)
num_storage
[1] 0 0 0
list_storage <- vector("list",
                       length = 3)
list_storage
[[1]]
NULL
[[2]]
NULL
[[3]]
NULL
for (i in seq_len(3)) {
  char_storage[i] <- str_c("Number: ", i)
  num_storage[i] <- 2*i
  list_storage[[i]] <- i # Note the [[ for subsetting here
}
char_storage
[1] "Number: 1" "Number: 2" "Number: 3"
num_storage
[1] 2 4 6
list_storage
[[1]]
[1] 1
[[2]]
[1] 2
[[3]]
[1] 3
Exercises
Write `for()`-loops that do each of the following:
- Print the even numbers from `1:20`.
  - Produce the same output with the `seq()` function!
Solutions
for (i in seq_len(10)) {
print(2*i)
}
[1] 2
[1] 4
[1] 6
[1] 8
[1] 10
[1] 12
[1] 14
[1] 16
[1] 18
[1] 20
seq(from = 2, to = 20, by = 2)
[1] 2 4 6 8 10 12 14 16 18 20
- Iterate over the `month.name` vector (built into base R) and store a character vector of output containing strings like “Month 1: January”, “Month 2: February”.
  - Produce the same output with `str_c()` only!
Solutions
month_strings <- vector("character", length = length(month.name))

for (i in seq_along(month.name)) {
  month_strings[i] <- str_c("Month ", i, ": ", month.name[i])
}
month_strings
[1] "Month 1: January" "Month 2: February" "Month 3: March"
[4] "Month 4: April" "Month 5: May" "Month 6: June"
[7] "Month 7: July" "Month 8: August" "Month 9: September"
[10] "Month 10: October" "Month 11: November" "Month 12: December"
str_c("Month ", 1:12, ": ", month.name)
[1] "Month 1: January" "Month 2: February" "Month 3: March"
[4] "Month 4: April" "Month 5: May" "Month 6: June"
[7] "Month 7: July" "Month 8: August" "Month 9: September"
[10] "Month 10: October" "Month 11: November" "Month 12: December"
- Store the `class()` (type) of every column in the `mtcars` data frame.
Solution
col_classes <- vector("character", ncol(mtcars))

# Data frames are **lists** of columns, so this loop iterates over the columns
for (i in seq_along(mtcars)) {
  col_classes[i] <- class(mtcars[[i]])
}
Iteration with Functionals
A functional is a function that takes a function as an input and returns a vector as output. - Hadley Wickham
📖 Required Reading: Functionals
`purrr` is a `tidyverse` package that provides several useful functions for iteration. The main advantages of `purrr` include:
- Improved readability of R code
- Reduction in the “overhead” in writing a `for` loop (creating storage containers and writing the `for (i in ...)`)
In `purrr`, we can use the family of `map()` functions to apply a function to each element of a list or vector. We can think of this as mapping an input (a list or vector) to a new output via a function. Let’s look at the `purrr` cheatsheet to see graphical representations of how these functions work.
- `map()` returns a list
- `map_chr()` returns a character vector
- `map_lgl()` returns a logical vector
- `map_int()` returns an integer vector
- `map_dbl()` returns a numeric vector
- `map_vec()` returns a vector of a different (non-atomic) type (like dates)
To get the `class()` of each data frame column, `map_chr()` is sensible because classes are strings:
map_chr(mtcars, .f = class)
mpg cyl disp hp drat wt qsec vs
"numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
am gear carb
"numeric" "numeric" "numeric"
Let’s get the class of each variable in `diamonds`:
map_chr(diamonds, class)
Error in `map_chr()`:
ℹ In index: 2.
ℹ With name: cut.
Caused by error:
! Result must be length 1, not 2.
What happened!? `map_chr()` was expecting to create a character vector with one element per element (column) in `diamonds`. But something happened in column 2 with the `cut` variable. Let’s figure out what happened:
class(diamonds$cut)
[1] "ordered" "factor"
Ah! `cut` has two classes. In this case, `map()` (which returns a list) is the best option because some variables have multiple classes:
map(diamonds, class)
$carat
[1] "numeric"
$cut
[1] "ordered" "factor"
$color
[1] "ordered" "factor"
$clarity
[1] "ordered" "factor"
$depth
[1] "numeric"
$table
[1] "numeric"
$price
[1] "integer"
$x
[1] "numeric"
$y
[1] "numeric"
$z
[1] "numeric"
The error we encountered with `map_chr()` is a nice feature of `purrr` because it allows us to be very sure of the type of output we are getting. Failing loudly is vastly preferable to getting unexpected outputs silently because we can catch errors earlier!
We can combine `map_*()` functions with tidy selection for some powerful variable summaries that require much less code than `for()` loops.
diamonds %>%
  select(where(is.numeric)) %>%
  map_dbl(.f = mean)
carat depth table price x y
0.7979397 61.7494049 57.4571839 3932.7997219 5.7311572 5.7345260
z
3.5387338
diamonds %>%
  select(!where(is.numeric)) %>%
  map_int(.f = n_distinct)
cut color clarity
5 7 8
Exercises
- Write a function called `summ_stats()` that takes a numeric vector `x` as input and returns the mean, median, standard deviation, and IQR as a data frame. You can use `tibble()` to create the data frame.
  - Example: `tibble(a = 1:2, b = 2:3)` creates a data frame with variables `a` and `b`.
Solution
summ_stats <- function(x) {
  tibble(
    mean = mean(x, na.rm = TRUE),
    median = median(x, na.rm = TRUE),
    sd = sd(x, na.rm = TRUE),
    iqr = IQR(x, na.rm = TRUE)
  )
}
- Use `map()` to apply your `summ_stats()` function to the numeric columns in the `diamonds` dataset.
Look up the `bind_rows()` documentation from `dplyr` to combine summary statistics for all quantitative variables into one data frame. The `.id` argument will be especially helpful in adding the variable names!
The output would be easier for the reader to understand if the summary statistics were rows and the variable names were columns.
Solution
diamonds_num <- diamonds %>%
  select(where(is.numeric))
map(diamonds_num, .f = summ_stats) %>%
bind_rows(.id = "variable") %>%
pivot_longer(cols = -variable,
names_to = "statistic",
values_to = "value") %>%
pivot_wider(names_from = variable, values_from = value)
# A tibble: 4 × 8
statistic carat depth table price x y z
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 mean 0.798 61.7 57.5 3933. 5.73 5.73 3.54
2 median 0.7 61.8 57 2401 5.7 5.71 3.53
3 sd 0.474 1.43 2.23 3989. 1.12 1.14 0.706
4 iqr 0.64 1.5 3 4374. 1.83 1.82 1.13
- Write a `for()` loop to achieve the same result. Which do you prefer in terms of ease of code writing and readability?
Solution
diamonds_summ_stats2 <- vector("list", length = ncol(diamonds_num))

for (i in seq_along(diamonds_num)) {
  diamonds_summ_stats2[[i]] <- summ_stats(diamonds_num[[i]])
}
diamonds_summ_stats2 %>%
  set_names(colnames(diamonds_num)) %>%
  bind_rows(.id = "variable")
# A tibble: 7 × 5
variable mean median sd iqr
<chr> <dbl> <dbl> <dbl> <dbl>
1 carat 0.798 0.7 0.474 0.64
2 depth 61.7 61.8 1.43 1.5
3 table 57.5 57 2.23 3
4 price 3933. 2401 3989. 4374.
5 x 5.73 5.7 1.12 1.83
6 y 5.73 5.71 1.14 1.82
7 z 3.54 3.53 0.706 1.13
Multiple Inputs
`purrr` also offers the `pmap()` family of functions, which take multiple inputs and loop over them simultaneously. Let’s look at the `purrr` cheatsheet to see graphical representations of how these functions work.
string_data <- tibble(
  string = c("apple", "banana", "cherry"),
  pattern = c("p", "n", "h"),
  replacement = c("P", "N", "H")
)
string_data
# A tibble: 3 × 3
string pattern replacement
<chr> <chr> <chr>
1 apple p P
2 banana n N
3 cherry h H
pmap_chr(string_data, .f = str_replace_all)
[1] "aPPle" "baNaNa" "cHerry"
Note how the column names in `string_data` exactly match the argument names in `str_replace_all()`. The iteration that is happening is across rows, and the multiple arguments in `str_replace_all()` are being matched by name.
We can also use `pmap()` to specify variations in some arguments but leave some arguments constant across the iterations:
string_data <- tibble(
  pattern = c("p", "n", "h"),
  replacement = c("P", "N", "H")
)
pmap_chr(string_data, str_replace_all, string = "ppp nnn hhh")
[1] "PPP nnn hhh" "ppp NNN hhh" "ppp nnn HHH"
Exercises
- Create 2 small examples that show how `pmap()` works with `str_sub()`. Your examples should:
  - Use different arguments for `string`, `start`, and `end`
  - Use different arguments for `start` and `end` but a fixed `string`
Solution
string_data <- tibble(
  string = c("apple", "banana", "cherry"),
  start = c(1, 2, 4),
  end = c(2, 3, 5)
)
pmap_chr(string_data, str_sub)
[1] "ap" "an" "rr"
string_data <- tibble(
  start = c(1, 2, 4),
  end = c(2, 3, 5)
)
pmap_chr(string_data, str_sub, string = "abcde")
[1] "ab" "bc" "de"