Live code:

Live code

Bagging implementations

Published

March 31, 2023

library(tidyverse)
library(vegan)
library(tree)
data(mite)
data(mite.env)
mite_dat <- mite.env %>%
  add_column(abundance = mite$LRUG)

Test/train set approach for test error

n <- nrow(mite_dat)
set.seed(18) 
train_ids <- sample(1:n, 0.8*n)
n_train <- length(train_ids)
n_test <- n - n_train

train_dat <- mite_dat[train_ids,]
test_dat <- mite_dat[-train_ids,]
B <- 10

The two following options differ only in how I keep track of my predictions. I will set some seeds to see if we indeed get the same predictions:

Option 1:

Here, I create a vector pred_sums that is in essence, a vector of cumulative sums of predictions.

set.seed(1)
pred_sums <- rep(0, n_test)

for (b in 1:B){
  # obtain bootstrap sample
  boot_ids <- sample(1:n_train, n_train, replace = TRUE)
  boot_samp <- train_dat[boot_ids,]
  
  # fit tree to bootstrap sample
  boot_tree <- tree(abundance ~ . , data = boot_samp)
  
  # obtain predictions for test set
  predictions <- predict(boot_tree, newdata = test_dat)
  
  # store predictions for test set
  pred_sums <- pred_sums +  predictions
}
pred_sums/B

         2          3          7         15         16         19         22 
11.3331103 16.5589754 14.4339754  3.2265457  0.7085599 11.6994505  0.7085599 
        25         35         36         47         58         59         70 
 3.2353446  9.8865567 12.4727839  9.8865567 11.6536153  8.7849145 12.8079772

Option 2:

Here, I create a B x n_test matrix that will hold each prediction:

set.seed(1)
pred_mat <- matrix(0, nrow = B, ncol = n_test)

for (b in 1:B){
  # obtain bootstrap sample
  boot_ids <- sample(1:n_train, n_train, replace = TRUE)
  boot_samp <- train_dat[boot_ids,]
  
  # fit tree to bootstrap sample
  boot_tree <- tree(abundance ~ . , data = boot_samp)
  
  # obtain predictions for test set
  predictions <- predict(boot_tree, newdata = test_dat)
  
  # store predictions for test set
  pred_mat[b,] <- predictions
}

apply(pred_mat, 2, mean)

 [1] 11.3331103 16.5589754 14.4339754  3.2265457  0.7085599 11.6994505
 [7]  0.7085599  3.2353446  9.8865567 12.4727839  9.8865567 11.6536153
[13]  8.7849145 12.8079772

# this function does the same: colMeans(pred_mat)

OOB approach to for test error

set.seed(1)
B <- 10
pred_sum <- rep(0, n)
n_oob <- rep(0,n)
for (b in 1:B){
  boot_ids <- sample(1:n, n, replace = T)
  oob_ids <- (1:n)[-unique(boot_ids)]
  
  tree <- tree(abundance ~ ., data = mite_dat[boot_ids,])
  pred_sum[oob_ids] <-  pred_sum[oob_ids] + predict(tree, newdata = mite_dat[oob_ids,])
  n_oob[oob_ids] <- n_oob[oob_ids] + 1
  
}

pred_sum

 [1]   7.183861   3.448718  16.514286  10.061905   2.909420   3.336120
 [7]  27.066026   6.227226   7.176942   4.099206  14.439985  20.536264
[13]   4.401139  14.272283  29.132601  10.387198   5.800528   6.690494
[19]  92.931735   4.783861   3.901139   4.150000  15.628571   6.005617
[25]   3.313043  29.230159   5.773318  19.901810   7.956410  89.646886
[31]  29.593407  14.483333  58.656410  16.960317  54.034091  54.856410
[37]  79.111722  57.837662   1.888889  50.145299  45.600000  46.754579
[43]  26.825397  26.909091  79.837662 120.722436  26.599076  41.107692
[49] 151.181818  83.603996  57.186039  55.200000 170.867244  71.357143
[55]  65.155556  66.600000  26.361905  47.000000  28.799603  48.935165
[61]  85.018182  66.714286 132.310490  67.510490  15.285714  50.018182
[67]  24.301587  97.774451  45.155311  20.731502

n_oob

 [1] 3 2 2 3 3 3 4 5 5 4 5 3 4 4 4 6 5 6 5 4 4 3 4 5 2 5 5 3 3 5 2 4 4 3 2 4 5 5
[39] 1 3 2 3 4 2 6 6 6 2 4 4 4 4 6 5 3 3 6 3 5 3 4 3 7 4 1 3 6 5 5 3

oob_preds <- pred_sum/n_oob
sqrt(mean((oob_preds - mite_dat$abundance)^2))

[1] 11.03221