Live code:

Live code
LOOCV
Published

March 9, 2023

Data

library(tidyverse)
library(vegan)
# Load the `mite` and `mite.env` datasets into the workspace.
data(mite, mite.env)

# Build the modelling table: the columns of `mite.env` plus the `LRUG`
# column of `mite`, stored as the response `abundance`.
mite_dat <- add_column(mite.env, abundance = mite[["LRUG"]])

LOOCV

# Leave-one-out cross-validation (LOOCV) for the model abundance ~ WatrCont.
# Each observation is held out once, the model is fit on the remaining
# n - 1 rows, and the held-out observation is predicted.
n <- nrow(mite_dat)

# Per-fold error. With a single held-out observation, the fold's RMSE
# reduces to the absolute prediction error |y_i - yhat_i|.
rmses <- rep(NA_real_, n)
for (i in seq_len(n)) {
  train_dat <- mite_dat[-i, ]
  test_dat <- mite_dat[i, ]
  mod <- lm(abundance ~ WatrCont, data = train_dat)
  pred <- predict(mod, newdata = test_dat)
  rmses[i] <- abs(test_dat$abundance - pred)
}

# Pool the squared errors before taking the root, so the result is an RMSE
# on the same scale and definition as a validation-set RMSE,
# sqrt(mean(squared errors)). Averaging the per-fold values directly would
# give the mean absolute error instead, which is never larger than the RMSE.
# NOTE: any printed value shown after this chunk that was produced with
# mean(rmses) (e.g. 9.513248) is a mean absolute error; re-run to refresh it.
loocv_err <- sqrt(mean(rmses^2))
loocv_err
[1] 9.513248

How does this compare to the estimate I get from the usual validation set approach?

# Validation set approach: one random 70/30 split of the rows of mite_dat.
set.seed(2)
n_train <- floor(0.7 * n)  # sample() truncates a fractional size the same way
train_ids <- sample(n, n_train)
test_ids <- setdiff(seq_len(n), train_ids)  # held-out rows, ascending order

train_dat <- mite_dat[train_ids, ]
test_dat <- mite_dat[test_ids, ]

# Fit on the training rows only, then predict the held-out rows.
mod <- lm(abundance ~ WatrCont, data = train_dat)
pred <- predict(mod, newdata = test_dat)

# Test RMSE: root of the mean squared prediction error on the held-out rows.
test_resid <- test_dat$abundance - pred
rmse_val <- sqrt(mean(test_resid^2))
rmse_val
[1] 10.73352

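Also, if you re-run the validation set approach with different seeds, you will get different estimated RMSEs, because the estimate depends on which observations happen to land in the training set! LOOCV involves no random split, so it gives the same estimate every time.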