context("predict_mi") library(dplyr) library(mice) library(testthat) suppressWarnings(RNGversion("3.5.0")) # load data dat <- mice::nhanes # add indicator training and test # first 20 for training, last 5 for testing dat$set <- c(rep("train", 20), rep("test", 5)) # Make prediction matrix and ensure that set is not used as a predictor predmat <- mice::make.predictorMatrix(dat) predmat[,"set"] <- 0 # train imputation model imp <- mice(dat, m = 5, maxit = 5 ,seed = 1, predictorMatrix = predmat, ignore = ifelse(dat$set == "test", TRUE, FALSE)) # extract the training and test data sets impdats <- mice::complete(imp, "all") traindats <- lapply(impdats, function(dat) subset(dat, set == "train", select = -set)) testdats <- lapply(impdats, function(dat) subset(dat, set == "test", select = -c(set))) # predict age with other variables with training datasets fits <- lapply(traindats, function(dat) lm(age ~ bmi + hyp + chl, data = dat)) # pool the predictions with function pool_preds <- predict_mi(object = fits, newdata = testdats, pool = TRUE, interval = "prediction", level = 0.95) # obtain the imputed value and the width of the prediction interval predfunc <- function(model, data, level) { summfit <- summary(model) sigma <- summfit$sigma xmat <- model.matrix(summfit$terms, data = as.data.frame(data)) se <- sqrt(1 + rowSums((xmat %*% summfit$cov.unscaled) * xmat)) xfit <- xmat %*% model$coefficients # variance var_pred = se^2 * sigma^2 return(cbind(model = xfit, var_pred = var_pred)) } # obtain predictions for the different imputations preds_all <- Map(predfunc, model = fits, data = testdats, level = 0.95) # change the list to a df preds <- unlist(preds_all) %>% array(dim = c(nrow(testdats[[1]]), 2, 5)) # our estimand is the predicted value # so Q bar is the mean over the predictions Q_bar <- apply(preds[, 1, ], 1, mean) # The uncertainty estimate is the width of the prediction interval U_bar <- apply(preds[, 2, ], 1, mean) # The between imputation variance is the variance of the predictions B <- apply(preds[, 1, ], 1, var) # The total variance is T + U_bar + B + B/M (with m the number of imputations) T_var <- U_bar + B + B / 5 # calculate the degrees of freedom for all observations df_vector <- numeric(length(Q_bar)) t_vector <- numeric(length(Q_bar)) for (i in 1:length(Q_bar)) { df_vector[i] <- mice:::barnard.rubin(5, B[i], T_var[i]) t_vector[i] <- qt(1 - (1-0.95)/2, df_vector[i]) } # Calculate bounds using individual t-values lwr <- Q_bar - t_vector * sqrt(T_var) upr <- Q_bar + t_vector * sqrt(T_var) # check if the output structure is as expected test_that("Output class is correct", { expect_type(pool_preds, "double") # checks storage type expect_true(is.matrix(pool_preds)) # checks it's a matrix }) # check if result is the same, for by hand or in function # Note pool_preds function depends on mice::pool.scalar test_that("retains same numerical result", { expect_equal(unname(pool_preds[, 1]), Q_bar, tolerance = 0.00001) expect_equal(unname(pool_preds[, 2]), lwr, tolerance = 0.00001) expect_equal(unname(pool_preds[, 3]), upr, tolerance = 0.00001) })