test_that("Basic cross validation works", { skip_on_ci() skip_on_cran() d <- pcod spde <- make_mesh(d, c("X", "Y"), cutoff = 15) set.seed(2) # library(future) # for parallel processing # plan(multisession) # for parallel processing x <- sdmTMB_cv( density ~ 0 + depth_scaled + depth_scaled2 + as.factor(year), data = d, mesh = spde, spatial = "off", spatiotemporal = "off", family = tweedie(link = "log"), time = "year", k_folds = 2 ) print(x) expect_equal(class(x$sum_loglik), "numeric") expect_equal(x$sum_loglik, sum(x$data$cv_loglik)) expect_equal(x$sum_loglik, sum(x$fold_loglik)) expect_true("data.frame" %in% class(x$data)) expect_equal(class(x$models[[1]]), "sdmTMB") # Use fold_ids: x <- sdmTMB_cv( density ~ 1, data = d, mesh = spde, spatial = "off", family = tweedie(link = "log"), fold_ids = rep(seq(1, 2), nrow(d))[seq(1, nrow(d))]) expect_equal(class(x$models[[1]]), "sdmTMB") # student-t: was broken at one time, must deal with `df` d <- subset(d, density > 0) d$log_density <- log(d$density) spde <- make_mesh(d, c("X", "Y"), cutoff = 15) x <- sdmTMB_cv( log_density ~ 0 + depth_scaled + depth_scaled2 + as.factor(year), data = d, mesh = spde, spatial = "off", family = sdmTMB::student(df = 9), time = "year", k_folds = 2 ) expect_equal(class(x$models[[1]]), "sdmTMB") # Try passing family as a variable -- this is per Issue #127 fam <- gaussian(link = "identity") x <- sdmTMB_cv( log_density ~ 0 + depth_scaled + depth_scaled2 + as.factor(year), data = d, mesh = spde, spatial = "off", spatiotemporal = "off", family = fam, time = "year", k_folds = 2 ) expect_equal(class(x$models[[1]]), "sdmTMB") }) test_that("Cross validation in parallel with globals", { skip_on_cran() skip_on_ci() # https://github.com/sdmTMB/sdmTMB/issues/127 d <- pcod spde <- make_mesh(d, c("X", "Y"), cutoff = 15) set.seed(2) future::plan(future::multisession, workers = 2L) fam <- tweedie(link = "log") x <- sdmTMB_cv( density ~ 0 + depth_scaled + depth_scaled2 + as.factor(year), data = d, mesh = spde, family = fam, time = "year", k_folds = 2L, future_globals = 'fam' ) expect_s3_class(x$models[[1]], "sdmTMB") future::plan(future::sequential) }) test_that("Leave future out cross validation works", { skip_on_cran() skip_on_ci() x <- sdmTMB_cv( present ~ 1, data = pcod_2011, mesh = pcod_mesh_2011, lfo = TRUE, lfo_forecast = 1, lfo_validations = 2, family = binomial(), time = "year" ) expect_equal(class(x$sum_loglik), "numeric") expect_equal(x$sum_loglik, sum(x$data$cv_loglik)) expect_equal(x$sum_loglik, sum(x$fold_loglik)) expect_true("data.frame" %in% class(x$data)) expect_true(inherits(x$models[[1]], "sdmTMB")) # Can see how the folds are assigned with table(x$models[[1]]$data$cv_fold, x$models[[1]]$data$year) expect_equal(length(x$models), 2) expect_equal(length(x$fold_loglik), 2) expect_equal(length(x$max_gradients), 2) expect_equal(cor(x$data$cv_fold[x$data$cv_fold!=1], x$data$year[x$data$cv_fold!=1]), 1.0) }) test_that("LFO fold assignments follow documented structure", { skip_on_cran() skip_on_ci() # Test the documented example from help: 9 time steps, lfo_forecast = 2, lfo_validations = 3 # Expected: # - Fit data to time steps 1 to 5, predict and validate step 7. # - Fit data to time steps 1 to 6, predict and validate step 8. # - Fit data to time steps 1 to 7, predict and validate step 9. # Use pcod data which has 9 time steps: 2003, 2004, 2005, 2007, 2009, 2011, 2013, 2015, 2017 time_steps <- sort(unique(pcod$year)) expect_equal(length(time_steps), 9) # This should work if we have enough validation periods # With lfo_forecast = 2 and lfo_validations = 2 instead of 3 mesh <- make_mesh(pcod, c("X", "Y"), cutoff = 25) x <- sdmTMB_cv( present ~ 1, data = pcod, mesh = mesh, lfo = TRUE, lfo_forecast = 2, lfo_validations = 2, # Reduced to 2 to fit in 9 time steps family = binomial(), spatiotemporal = "off", time = "year" ) # Check fold structure fold_table <- table(x$data$cv_fold, x$data$year) # Validation 1: Should train on years 2003-2011 (time steps 1-5), validate on 2015 (step 7) # Validation 2: Should train on years 2003-2013 (time steps 1-6), validate on 2017 (step 8) # Check that validation years have the right fold assignments # For lfo_forecast = 2, validation data should be in folds k + lfo_forecast expect_true(all(x$data$cv_fold[x$data$year == 2015] == 3)) # fold 1 + 2 expect_true(all(x$data$cv_fold[x$data$year == 2017] == 4)) # fold 2 + 2 # Check that training years are properly assigned # For validation 1: years 2003-2011 should have fold <= 1 train_years_1 <- c(2003, 2004, 2005, 2007, 2009, 2011) expect_true(all(x$data$cv_fold[x$data$year %in% train_years_1] <= 1)) # For validation 2: years 2003-2013 should have fold <= 2 train_years_2 <- c(2003, 2004, 2005, 2007, 2009, 2011, 2013) expect_true(all(x$data$cv_fold[x$data$year %in% train_years_2] <= 2)) }) test_that("LFO parameter validation works", { skip_on_cran() skip_on_ci() mesh <- make_mesh(pcod_2011, c("X", "Y"), cutoff = 25) # Should error when lfo_validations + lfo_forecast > number of time steps expect_error( sdmTMB_cv( present ~ 1, data = pcod_2011, # Only 2 time steps mesh = mesh, lfo = TRUE, lfo_forecast = 2, lfo_validations = 3, # 3 + 2 = 5 > 2 time steps family = binomial(), time = "year" ), "Not enough time steps" ) }) test_that("Cross validation with offsets works", { skip_on_cran() skip_on_ci() skip_if_not_installed("future") skip_if_not_installed("future.apply") set.seed(1) d <- pcod_2011 d$log_effort <- rnorm(nrow(d)) # library(future) # future::plan(future::multisession) expect_error( fit_cv_off1 <- sdmTMB_cv( density ~ 1, data = d, mesh = pcod_mesh_2011, family = tweedie(), spatial = "off", offset = d$log_effort, #< k_folds = 2, parallel = TRUE #< )) set.seed(1) fit_cv_off1 <- sdmTMB_cv( density ~ 1, data = d, mesh = pcod_mesh_2011, family = tweedie(), spatial = "off", offset = "log_effort", #< k_folds = 2, parallel = TRUE #< ) set.seed(1) fit_cv_off2 <- sdmTMB_cv( density ~ 1, data = d, mesh = pcod_mesh_2011, family = tweedie(), spatial = "off", offset = "log_effort", #< k_folds = 2, parallel = FALSE #< ) set.seed(1) fit_cv_off3 <- sdmTMB_cv( density ~ 1, data = d, mesh = pcod_mesh_2011, family = tweedie(), spatial = "off", offset = "log_effort", #< k_folds = 2, use_initial_fit = TRUE, #< parallel = TRUE #< ) set.seed(1) fit_cv_off4 <- sdmTMB_cv( density ~ 1, data = d, mesh = pcod_mesh_2011, family = tweedie(), spatial = "off", offset = "log_effort", #< k_folds = 2, use_initial_fit = TRUE, #< parallel = FALSE #< ) # now without offset set.seed(1) fit_cv_off5 <- sdmTMB_cv( density ~ 1, data = d, mesh = pcod_mesh_2011, family = tweedie(), spatial = "off", k_folds = 2, use_initial_fit = TRUE, #< parallel = TRUE #< ) set.seed(1) fit_cv_off6 <- sdmTMB_cv( density ~ 1, data = d, mesh = pcod_mesh_2011, family = tweedie(), spatial = "off", k_folds = 2, use_initial_fit = TRUE, #< parallel = FALSE #< ) set.seed(1) fit_cv_off7 <- sdmTMB_cv( density ~ 1, data = d, mesh = pcod_mesh_2011, family = tweedie(), spatial = "off", k_folds = 2, use_initial_fit = FALSE, #< parallel = FALSE #< ) set.seed(1) fit_cv_off8 <- sdmTMB_cv( density ~ 1, data = d, mesh = pcod_mesh_2011, family = tweedie(), spatial = "off", k_folds = 2, use_initial_fit = TRUE, #< parallel = FALSE #< ) expect_equal(round(fit_cv_off1$models[[1]]$model$par, 4), round(fit_cv_off2$models[[1]]$model$par, 4)) expect_equal(round(fit_cv_off1$models[[1]]$model$par, 4), round(fit_cv_off3$models[[1]]$model$par, 4)) expect_equal(round(fit_cv_off1$models[[1]]$model$par, 4), round(fit_cv_off4$models[[1]]$model$par, 4)) # with/without offset: expect_false(identical(fit_cv_off1$models[[1]]$model$par, fit_cv_off5$models[[1]]$model$par)) expect_equal(round(fit_cv_off5$models[[1]]$model$par, 4), round(fit_cv_off6$models[[1]]$model$par, 4)) expect_equal(round(fit_cv_off5$models[[1]]$model$par, 4), round(fit_cv_off7$models[[1]]$model$par, 4)) expect_equal(round(fit_cv_off5$models[[1]]$model$par, 4), round(fit_cv_off8$models[[1]]$model$par, 4)) # future::plan(future::sequential) }) test_that("Delta model cross validation works", { skip_on_cran() skip_on_ci() set.seed(1) out_tw <- sdmTMB_cv( density ~ depth_scaled, data = pcod_2011, mesh = pcod_mesh_2011, spatial = "off", family = tweedie(), k_folds = 2 ) set.seed(1) out_dg <- sdmTMB_cv( density ~ depth_scaled, data = pcod_2011, mesh = pcod_mesh_2011, spatial = "off", family = delta_gamma(), k_folds = 2 ) diff_ll <- out_tw$sum_loglik - out_dg$sum_loglik expect_equal(round(diff_ll, 4), round(-19.8025, 4)) set.seed(1) out_dpg <- sdmTMB_cv( density ~ depth_scaled, data = pcod_2011, mesh = pcod_mesh_2011, spatial = "off", family = delta_gamma(type = "poisson-link"), k_folds = 2 ) diff_ll <- out_dpg$sum_loglik - out_dg$sum_loglik expect_equal(round(diff_ll, 4), round(-4.5477, 4)) }) test_that("Cross validation with weights works", { skip_on_cran() skip_on_ci() # Test with constant mesh set.seed(123) d <- pcod_2011 # Create some example weights weights_vec <- runif(nrow(d), 0.5, 1.5) # CV with weights set.seed(1) x_weighted <- sdmTMB_cv( density ~ depth_scaled, data = d, mesh = pcod_mesh_2011, family = tweedie(), spatial = "off", k_folds = 2, weights = weights_vec ) # CV without weights (default) set.seed(1) x_unweighted <- sdmTMB_cv( density ~ depth_scaled, data = d, mesh = pcod_mesh_2011, family = tweedie(), spatial = "off", k_folds = 2 ) # Check that CV runs successfully with weights expect_s3_class(x_weighted, "sdmTMB_cv") expect_equal(class(x_weighted$sum_loglik), "numeric") expect_equal(x_weighted$sum_loglik, sum(x_weighted$data$cv_loglik)) expect_equal(x_weighted$sum_loglik, sum(x_weighted$fold_loglik)) expect_true("data.frame" %in% class(x_weighted$data)) expect_equal(class(x_weighted$models[[1]]), "sdmTMB") # Weighted and unweighted CV should give different results expect_false(isTRUE(all.equal(x_weighted$sum_loglik, x_unweighted$sum_loglik))) # Test that predictions exist expect_true("cv_predicted" %in% names(x_weighted$data)) expect_true("cv_loglik" %in% names(x_weighted$data)) # Test with use_initial_fit = TRUE set.seed(1) x_weighted_init <- sdmTMB_cv( density ~ depth_scaled, data = d, mesh = pcod_mesh_2011, family = tweedie(), spatial = "off", k_folds = 2, weights = weights_vec, use_initial_fit = TRUE ) expect_s3_class(x_weighted_init, "sdmTMB_cv") expect_equal(class(x_weighted_init$sum_loglik), "numeric") # Test error handling for mismatched weights length expect_error( sdmTMB_cv( density ~ depth_scaled, data = d, mesh = pcod_mesh_2011, family = tweedie(), spatial = "off", k_folds = 2, weights = c(1, 2, 3) # Wrong length ), "same length as the number of rows" ) # Test error handling for non-positive weights expect_error( sdmTMB_cv( density ~ depth_scaled, data = d, mesh = pcod_mesh_2011, family = tweedie(), spatial = "off", k_folds = 2, weights = c(rep(1, nrow(d) - 1), 0) # Zero weight ), "must be positive" ) expect_error( sdmTMB_cv( density ~ depth_scaled, data = d, mesh = pcod_mesh_2011, family = tweedie(), spatial = "off", k_folds = 2, weights = c(rep(1, nrow(d) - 1), -1) # Negative weight ), "must be positive" ) }) test_that("Cross validation returns deviance residuals", { skip_on_cran() skip_on_ci() set.seed(123) d <- pcod_2011 mesh <- make_mesh(d, c("X", "Y"), cutoff = 10) # Test with Gaussian family (simple case) d_pos <- subset(d, density > 0) d_pos$log_density <- log(d_pos$density) x_gauss <- sdmTMB_cv( log_density ~ depth_scaled, data = d_pos, mesh = mesh, family = gaussian(), spatial = "off", k_folds = 2 ) # Check that cv_deviance_resid column exists expect_true("cv_deviance_resid" %in% names(x_gauss$data)) # Check that deviance residuals are numeric and not all NA expect_true(is.numeric(x_gauss$data$cv_deviance_resid)) expect_false(all(is.na(x_gauss$data$cv_deviance_resid))) # Check that we have the same number of residuals as rows expect_equal(length(x_gauss$data$cv_deviance_resid), nrow(d_pos)) # Test with Tweedie family set.seed(123) x_tweedie <- sdmTMB_cv( density ~ depth_scaled, data = d, mesh = mesh, family = tweedie(), spatial = "off", k_folds = 2 ) expect_true("cv_deviance_resid" %in% names(x_tweedie$data)) expect_false(all(is.na(x_tweedie$data$cv_deviance_resid))) # Test with binomial family set.seed(123) x_binom <- sdmTMB_cv( present ~ depth_scaled, data = d, mesh = mesh, family = binomial(), spatial = "off", k_folds = 2 ) expect_true("cv_deviance_resid" %in% names(x_binom$data)) expect_false(all(is.na(x_binom$data$cv_deviance_resid))) }) test_that("Deviance residuals work with different families", { skip_on_cran() skip_on_ci() set.seed(456) d <- pcod_2011 mesh <- make_mesh(d, c("X", "Y"), cutoff = 10) # Gamma family d_pos <- subset(d, density > 0) x_gamma <- sdmTMB_cv( density ~ depth_scaled, data = d_pos, mesh = mesh, family = Gamma(link = "log"), spatial = "off", k_folds = 2 ) expect_true("cv_deviance_resid" %in% names(x_gamma$data)) expect_false(all(is.na(x_gamma$data$cv_deviance_resid))) # Lognormal family x_lognormal <- sdmTMB_cv( density ~ depth_scaled, data = d_pos, mesh = mesh, family = lognormal(), spatial = "off", k_folds = 2 ) expect_true("cv_deviance_resid" %in% names(x_lognormal$data)) expect_false(all(is.na(x_lognormal$data$cv_deviance_resid))) # Negative binomial (nbinom2) d$count <- round(d$density) x_nbinom2 <- sdmTMB_cv( count ~ depth_scaled, data = d, mesh = mesh, family = nbinom2(), spatial = "off", k_folds = 2 ) expect_true("cv_deviance_resid" %in% names(x_nbinom2$data)) expect_false(all(is.na(x_nbinom2$data$cv_deviance_resid))) }) test_that("Deviance residuals work with delta models", { skip_on_cran() skip_on_ci() set.seed(789) d <- pcod_2011 mesh <- make_mesh(d, c("X", "Y"), cutoff = 10) # Delta-gamma model x_delta <- sdmTMB_cv( density ~ depth_scaled, data = d, mesh = mesh, family = delta_gamma(), spatial = "off", k_folds = 2 ) # Check that cv_deviance_resid column exists for delta models expect_true("cv_deviance_resid" %in% names(x_delta$data)) expect_false(all(is.na(x_delta$data$cv_deviance_resid))) # Delta-lognormal model x_delta_lnorm <- sdmTMB_cv( density ~ depth_scaled, data = d, mesh = mesh, family = delta_lognormal(), spatial = "off", k_folds = 2 ) expect_true("cv_deviance_resid" %in% names(x_delta_lnorm$data)) expect_false(all(is.na(x_delta_lnorm$data$cv_deviance_resid))) }) test_that("Deviance residuals work with non-constant mesh", { skip_on_cran() skip_on_ci() set.seed(321) d <- pcod_2011 # Use mesh_args instead of mesh to create non-constant mesh x_nc <- sdmTMB_cv( density ~ depth_scaled, data = d, mesh_args = list(xy_cols = c("X", "Y"), cutoff = 10), family = tweedie(), spatial = "off", k_folds = 2 ) # Check that cv_deviance_resid column exists with non-constant mesh expect_true("cv_deviance_resid" %in% names(x_nc$data)) expect_false(all(is.na(x_nc$data$cv_deviance_resid))) }) test_that("Deviance residuals work with LFO cross-validation", { skip_on_cran() skip_on_ci() set.seed(654) x_lfo <- sdmTMB_cv( present ~ 1, data = pcod_2011, mesh = pcod_mesh_2011, lfo = TRUE, lfo_forecast = 1, lfo_validations = 2, family = binomial(), spatial = "off", time = "year" ) # Check that cv_deviance_resid exists for LFO expect_true("cv_deviance_resid" %in% names(x_lfo$data)) # For LFO, deviance residuals might not be available (uses old method) # Just check the column exists }) test_that("Deviance residuals match single-model residuals", { skip_on_cran() skip_on_ci() set.seed(111) d <- pcod_2011 mesh <- make_mesh(d, c("X", "Y"), cutoff = 10) # Fit a single model fit <- sdmTMB( density ~ depth_scaled, data = d, mesh = mesh, family = tweedie(), spatial = "off" ) # Get deviance residuals from the fit dev_res_fit <- residuals(fit, type = "deviance") # Do CV with k_folds = 2 set.seed(111) x_cv <- sdmTMB_cv( density ~ depth_scaled, data = d, mesh = mesh, family = tweedie(), spatial = "off", k_folds = 2 ) # The CV deviance residuals should be calculated from models # fitted without the held-out fold, so they won't exactly match # the full-data residuals. But they should be in a similar range # and have similar properties. expect_true(is.numeric(x_cv$data$cv_deviance_resid)) expect_false(all(is.na(x_cv$data$cv_deviance_resid))) # Check that the range is reasonable (not wildly different) # This is a soft check - just ensuring they're in the same ballpark expect_true( max(abs(x_cv$data$cv_deviance_resid), na.rm = TRUE) < max(abs(dev_res_fit)) * 3 ) })