 
      
     
      
     
  Saskia A. Otto
Postdoctoral Researcher
 
 

Scoping is the set of rules that govern how R looks up the value of a symbol.
x <- 10 
x 
## [1] 10
f <- function() { 
  x <- 2 
  y <- 3 
  x * y
}
f()
## [1] 6
x <- 2 
g <- function() { 
  y <- 3 
  x + y
}
g()
## [1] 5


add_one <- function(x) { 
  add_this <- 1
  x + add_this
}
add_one(5) 
## [1] 6
add_this
## Error in eval(expr, envir, enclos): object 'add_this' not found

x and add_this are not present in the global environmentadd_one()

n <- function(x) {
  x / 2
}
o <- function() { 
  n <- 10 
  n(n) 
} 
o()
## [1] 5
 
 
Try to avoid this whenever possible.

What gets returned each time?
j()
a <- j()
j()
j()
a <- j()
j()

What gets returned each time?
j()
## [1] 1
a <- j()
j()
## [1] 2
j()
## [1] 2
a <- j()
j()
## [1] 3
R looks for values when the function is run, not when it’s created.
my_mean <- function(x) {
    sum(x) / count
}
count <- 10
my_mean(rep(2, times = 10)) 
## [1] 2
my_mean(rep(2, times = 5))
## [1] 1
→ Hard to spot because occurring errors depend on the global environment.
func1 <- function(y) {
   func2(z = y)
}
func2 <- function(z) {
   3 * z
}
func1(4) 
z <- 3 
func1(4) 
y <- 3
func <- function(x, y) {
   x ^ y
}
func(2, 2)
x
y
func(3)
x1 <- 10
func <- function() {
   x1 * 2
}
func() 
x1 <- 5
func()
f <- function(x) { 
   f <- function(x) { 
      f <- function(x) { 
         x ^ 2 
      } 
      f(x) + 1 
   } 
   f(x) * 2 
} 
x <- 10
f(x)
Every function should be...
selfcontained,
able to solve one particular problem,
as small as possible and as complex as needed,
properly documented
func1 <- function(y) {
   func2(z = y)
}
func2 <- function(z) {
   3 * z
}
func1(4) 
z <- 3 
func1(4) 
y <- 3
func <- function(x, y) {
   x ^ y
}
func(2, 2)
x
y
func(3)
x1 <- 10
func <- function() {
   x1 * 2
}
func() 
x1 <- 5
func()
f <- function(x) { 
   f <- function(x) { 
      f <- function(x) { 
         x ^ 2 
      } 
      f(x) + 1 
   } 
   f(x) * 2 
} 
x <- 10
f(x)
df <- data.frame(replicate(6, sample(c(1:10, -99), 6, rep = TRUE)))
names(df) <- letters[1:6]
df
##    a   b   c   d  e  f
## 1  3 -99   8   5  3  6
## 2  5   8   5   9  5  7
## 3  7   7   9 -99  1  6
## 4 10   1   6   3  5  3
## 5  3   3   8   8 10 10
## 6 10   2 -99   2  4  8
df$a[df$a == -99] <- NA
df$b[df$b == -99] <- NA
df$c[df$b == -99] <- NA
df$d[df$d == -99] <- NA
df$e[df$e == -99] <- NA
df$e[df$f == -98] <- NA
Can you spot all mistakes?

Test your function
replace_value(c(2, 5, -99, 3, -99))
## [1]  2  5 NA  3 NA
Use your function
df$a <- replace_value(df$a)
df$b <- replace_value(df$b)
df$c <- replace_value(df$c)
df$d <- replace_value(df$d)
df$e <- replace_value(df$e)
df$f <- replace_value(df$f)
Still prone to error but much better!
Imagine the value changes from -99 to -999
##    a    b    c    d  e  f
## 1  3 -999    8    5  3  6
## 2  5    8    5    9  5  7
## 3  7    7    9 -999  1  6
## 4 10    1    6    3  5  3
## 5  3    3    8    8 10 10
## 6 10    2 -999    2  4  8
Simply change your function
replace_value2 <- function(x) {
   x[x == -999] <- NA
   x
}
Or add an additional argument
replace_value2 <- function(x, rep_na) {
   x[x == rep_na] <- NA
   x
}
replace_value <- function(x) {
   x[x == -99] <- NA
   return(x)
}
replace_value(c(2, 5, -99, 3, -99))
## [1]  2  5 NA  3 NA
replace_value <- function(x) {
   x[x == -99] <- NA
   invisible(x)
}
replace_value(c(2, 5, -99, 3, -99))
No output in the console due to the invisible() call within the function

complex_function <- function(x) {
   out_mean <- mean(x)
   out_median <- median(x)
   out_min <- min(x)
   out_max <- max(x)
   result <- list(out_mean, out_median, 
     out_min, out_max)
   return(result) 
}
is used here to combine intermediate results to one returned list object.
complex_function(1:10)
## [[1]]
## [1] 5.5
## 
## [[2]]
## [1] 5.5
## 
## [[3]]
## [1] 1
## 
## [[4]]
## [1] 10
 
 
for loop family: execute for a prescribed number of times, as controlled by a counter or an index, incremented at each iteration cyclewhile or repeat family of loops: are based on the onset and verification of a logical condition. The condition is tested at the start or the end of the loop construct. 
 source: www.datacamp.com/community/tutorials/tutorial-on-loops-in-r (licensed under CC-BY-NC-ND 4.0)
for loop
for loop
for loop - Styles
seq_along(x)→ a safe version of the familiar 1:length(x)
x <- c(2, 4, 7)
y <- numeric(0)
for (i in 1:length(x)) {
   print(x[i])
}
1:length(x)
## [1] 1 2 3
1:length(y)
## [1] 1 0
→ 1:length() iterates at least once!
for (i in seq_along(x)) {
   print(x[i])
}
seq_along(x)
## [1] 1 2 3
seq_along(y)
## integer(0)
→ If the counter is NULL seq_along() does not execute any iteration!
for loop at each iteration using c() (for example), your for loop will be very slow:# Grow objects
grow_obj <- function(x){
  y <- numeric()
  for (i in 1:x) { 
    y <- c(y, i)
  }
}
# Better: Indexing
index_obj <- function(x){
  y <- vector("integer", length(x))
  for (i in 1:x){ 
    y[i] <- i
  }
}
Here, an empty vector with the length of the counter is created before the loop runs.
# Let's test the speed of both functions
microbenchmark::microbenchmark(unit = "ms", times = 5,
    grow_obj(500), grow_obj(5000), index_obj(500), index_obj(5000) )
## Unit: milliseconds
##             expr       min        lq       mean    median        uq
##    grow_obj(500)  1.032632  1.042913  1.2216912  1.078211  1.429168
##   grow_obj(5000) 58.062709 58.829260 69.5195706 59.844666 62.812168
##   index_obj(500)  0.110443  0.118197  0.1541738  0.161138  0.185150
##  index_obj(5000)  1.523043  1.608496  2.7793866  1.657436  2.270148
##         max neval cld
##    1.525532     5  a 
##  108.049050     5   b
##    0.195941     5  a 
##    6.837810     5  a
Look at the mean column: As you can see, indexing is much faster than growing objects, particularly when iterating many times!
Replace a specific value in a vector!
replace_value <- function(x) {
   x[ x == -99] <- NA
   x
}
Use your function:
df$a <- replace_value(df$a)
df$b <- replace_value(df$b)
df$c <- replace_value(df$c)
df$d <- replace_value(df$d)
df$e <- replace_value(df$e)
df$f <- replace_value(df$f)
Still prone to error but much better!
Replace a specific value in a data frame!

replace_value(df)
##    a  b  c  d  e  f
## 1  3 NA  8  5  3  6
## 2  5  8  5  9  5  7
## 3  7  7  9 NA  1  6
## 4 10  1  6  3  5  3
## 5  3  3  8  8 10 10
## 6 10  2 NA  2  4  8
What happens if the input is a matrix and not a data frame?
mat <- matrix(c(1:5, -99), ncol=3)
replace_value(mat)
## Error in df[, i]: subscript out of bounds
seq_along(mat)
## [1] 1 2 3 4 5 6
seq_along() counts each element in matrices and not the columns!replace_value <- function(df) {
  if(is.data.frame(df)) {
    for (i in seq_along(df)) {
      df[df[, i] == -99, i] <- NA
    }
    return(df)
  } else {
    stop("Input has to be a data frame.")
  }
}
replace_value(mat) 
## Error in replace_value(mat): Input has to be a data frame.
Instead of using a loop within the replace function you can combine 2 functions:
replace_value <- function(x) {
  x[x == -99] <- NA
  x
}
apply_to_column <- function(df) {
  for (i in seq_along(df)) {
    df[, i] <- replace_value(df[, i])
  }
  return(df)
}

apply_to_column(df)
##    a  b  c  d  e  f
## 1  3 NA  8  5  3  6
## 2  5  8  5  9  5  7
## 3  7  7  9 NA  1  6
## 4 10  1  6  3  5  3
## 5  3  3  8  8 10 10
## 6 10  2 NA  2  4  8
for loops!FSA::Mirex datasetFSA::PikeNY datasetThink about the output, sequence, and body before you start writing the loop.

functions: function(), return(), invisible()
loops: for(var in seq) expr, while(cond) expr, repeat expr, seq_along()
conditions: if(cond) expr, if(cond) cons.expr  else  alt.expr
exists(), microbenchmark::microbenchmark()

Try out the exercises and read chapter 21 on iterations in R for Data Science.

We just scratched the surface of R functional programming. If you want to learn more on how to write R functions I highly recommend the book Advanced R by Hadley Wickham.
Then go grab a coffee, lean back and enjoy the rest of the day...!

For more information contact me: saskia.otto@uni-hamburg.de
      
        http://www.researchgate.net/profile/Saskia_Otto
      
        http://www.github.com/saskiaotto
    
    
      
        This work is licensed under a 
        Creative Commons Attribution-ShareAlike 4.0 International License except for the 
        borrowed and mentioned with proper source: statements. 
        Image on title and end slide: Section of an infrared satallite image showing the Larsen C
        ice shelf on the Antarctic 
        Peninsula - USGS/NASA Landsat: 
        A Crack of Light in the Polar Dark, Landsat 8 - TIRS, June 17, 2017 
        (under CC0 license)
    
The following code demonstrates one approach for exercise 3 in which the slopes and intercepts are computed together in one function:
files <- stringr::str_c("data/functions/dummyfile_", 1:100, ".csv")
# The iteration will be here part of the function:
get_coefs <- function(filenames) {
  # Create empty output vectors 
  intercept <- vector("double", length = length(filenames))
  slopes <- vector("double", length = length(filenames))
  for (i in seq_along(files)) {
    dat <- readr::read_csv(filenames[i]) # import single file 
    m_dat <- lm(y ~ x, data = dat) # fit linear model
    intercept[[i]] <- coef(m_dat)[1] # save the intercept
    slopes[[i]] <- coef(m_dat)[2] # save the slope
  }
  # Since output can be only 1 object:
  out <- tibble(a = intercept, b = slopes)
  return(out)
}
# Apply function
all_coefs <- get_coefs(files) %>% 
  print(n =5)
## # A tibble: 100 x 2
##       a     b
##   <dbl> <dbl>
## 1  3.12 0.794
## 2  2.64 0.800
## 3  1.95 0.828
## 4  3.08 0.755
## 5  1.07 0.922
## # ... with 95 more rows

# Create histograms
p <- ggplot(all_coefs) + 
  theme_classic()
p_a <- p + geom_histogram(aes(x = a), 
    bins = 10, fill = "white", 
  colour = "black") + 
  ggtitle("Histogram of intercepts") 
p_b <- p + geom_histogram(aes(x = b), 
    bins = 10, fill = "white", 
  colour = "black") + 
  ggtitle("Histogram of slopes") 
gridExtra::grid.arrange(p_a, p_b, 
  ncol=2)