# Tests for imputation.R library(testthat) library(rwevidence) test_that("rwe_impute works with mean imputation", { set.seed(123) n <- 100 test_data <- data.frame( id = 1:n, age = rnorm(n, 60, 10), bmi = rnorm(n, 25, 5), lab_value = rnorm(n, 100, 20) ) # Introduce missing values test_data$age[sample(n, 10)] <- NA test_data$bmi[sample(n, 15)] <- NA imputed <- rwe_impute( data = test_data, method = "mean", seed = 123 ) expect_s3_class(imputed, "rwe_imputation") expect_true("imputed_data" %in% names(imputed)) expect_equal(nrow(imputed$imputed_data), n) # Check no missing values in imputed variables expect_equal(sum(is.na(imputed$imputed_data$age)), 0) expect_equal(sum(is.na(imputed$imputed_data$bmi)), 0) # Check imputed values are reasonable (should be close to mean) original_mean_age <- mean(test_data$age, na.rm = TRUE) imputed_age_values <- imputed$imputed_data$age[is.na(test_data$age)] expect_true(all(abs(imputed_age_values - original_mean_age) < 0.01)) }) test_that("rwe_impute works with median imputation", { set.seed(456) n <- 100 test_data <- data.frame( id = 1:n, age = rnorm(n, 60, 10), weight = rnorm(n, 70, 15) ) # Introduce missing values test_data$age[sample(n, 12)] <- NA imputed <- rwe_impute( data = test_data, method = "median", vars = "age", seed = 456 ) expect_s3_class(imputed, "rwe_imputation") expect_equal(sum(is.na(imputed$imputed_data$age)), 0) # Check imputed values equal the median original_median_age <- median(test_data$age, na.rm = TRUE) imputed_age_values <- imputed$imputed_data$age[is.na(test_data$age)] expect_true(all(abs(imputed_age_values - original_median_age) < 0.01)) }) test_that("rwe_impute works with mode imputation", { set.seed(789) n <- 100 test_data <- data.frame( id = 1:n, category = sample(c("A", "B", "C"), n, replace = TRUE), value = rnorm(n, 50, 10) ) # Introduce missing values in categorical variable test_data$category[sample(n, 15)] <- NA imputed <- rwe_impute( data = test_data, method = "mode", vars = "category", seed = 789 ) expect_s3_class(imputed, "rwe_imputation") expect_equal(sum(is.na(imputed$imputed_data$category)), 0) }) test_that("rwe_impute auto-detects variables with missing data", { set.seed(123) n <- 50 test_data <- data.frame( complete = 1:n, missing1 = rnorm(n), missing2 = rnorm(n), missing3 = rnorm(n) ) test_data$missing1[sample(n, 5)] <- NA test_data$missing2[sample(n, 8)] <- NA test_data$missing3[sample(n, 3)] <- NA imputed <- rwe_impute( data = test_data, method = "mean", seed = 123 ) # Should impute all three variables with missing data expect_length(imputed$vars_imputed, 3) expect_true(all(c("missing1", "missing2", "missing3") %in% imputed$vars_imputed)) # No missing values should remain expect_equal(sum(is.na(imputed$imputed_data[, imputed$vars_imputed])), 0) }) test_that("rwe_impute handles data with no missing values", { test_data <- data.frame( id = 1:50, age = rnorm(50, 60, 10), bmi = rnorm(50, 25, 5) ) imputed <- rwe_impute( data = test_data, method = "mean" ) expect_s3_class(imputed, "rwe_imputation") expect_equal(imputed$n_imputed, 0) expect_length(imputed$vars_imputed, 0) expect_identical(imputed$imputed_data, test_data) }) test_that("rwe_impute validates inputs", { test_data <- data.frame( age = c(50, 60, NA, 70) ) # Invalid method expect_error( rwe_impute(test_data, method = "invalid_method"), "'arg' should be one of" ) }) test_that("rwe_impute respects seed for reproducibility", { set.seed(999) n <- 100 test_data <- data.frame( age = rnorm(n, 60, 10), bmi = rnorm(n, 25, 5) ) test_data$age[sample(n, 20)] <- NA imputed1 <- rwe_impute(test_data, method = "mean", seed = 42) imputed2 <- rwe_impute(test_data, method = "mean", seed = 42) imputed3 <- rwe_impute(test_data, method = "mean", seed = 99) # Same seed should give same results expect_identical(imputed1$imputed_data, imputed2$imputed_data) # Different seed might give different results (though for mean it won't) # This is more relevant for stochastic methods expect_equal(nrow(imputed3$imputed_data), n) }) test_that("rwe_impute creates missingness statistics", { set.seed(123) n <- 100 test_data <- data.frame( age = rnorm(n), bmi = rnorm(n) ) test_data$age[sample(n, 15)] <- NA test_data$bmi[sample(n, 10)] <- NA imputed <- rwe_impute(test_data, method = "mean", seed = 123) expect_true("missingness_before" %in% names(imputed)) expect_true("missingness_after" %in% names(imputed)) # Before imputation should have missing values expect_true(any(imputed$missingness_before$n_missing > 0)) # After imputation should have no missing values expect_equal(sum(imputed$missingness_after$n_missing), 0) }) test_that("rwe_impute creates diagnostics", { set.seed(123) test_data <- data.frame( age = c(50, 60, NA, 70, NA, 55), bmi = c(25, NA, 27, 28, 26, NA) ) imputed <- rwe_impute(test_data, method = "mean", seed = 123) expect_true("diagnostics" %in% names(imputed)) expect_type(imputed$diagnostics, "list") }) test_that("rwe_impute creates audit trail", { set.seed(123) test_data <- data.frame( age = c(50, 60, NA, 70), bmi = c(25, NA, 27, 28) ) imputed <- rwe_impute(test_data, method = "mean", seed = 123) expect_true("audit_trail" %in% names(imputed)) expect_s3_class(imputed$audit_trail, "rwe_audit_trail") }) test_that("rwe_impute works with rwe_data objects", { set.seed(123) test_df <- data.frame( age = c(50, 60, NA, 70, NA), bmi = c(25, 26, 27, NA, 29) ) rwe_obj <- new_rwe_data(test_df, metadata = list(source = "test")) imputed <- rwe_impute(rwe_obj, method = "mean", seed = 123) expect_s3_class(imputed, "rwe_imputation") expect_equal(nrow(imputed$imputed_data), 5) expect_equal(sum(is.na(imputed$imputed_data)), 0) }) test_that("rwe_impute only imputes specified variables", { set.seed(123) test_data <- data.frame( age = c(50, NA, 60, NA), bmi = c(25, NA, 27, NA), weight = c(70, NA, 75, NA) ) imputed <- rwe_impute( test_data, method = "mean", vars = c("age", "bmi"), # Don't impute weight seed = 123 ) expect_length(imputed$vars_imputed, 2) expect_equal(sum(is.na(imputed$imputed_data$age)), 0) expect_equal(sum(is.na(imputed$imputed_data$bmi)), 0) expect_equal(sum(is.na(imputed$imputed_data$weight)), 2) # Still has NAs }) test_that("rwe_impute handles mixed data types", { set.seed(123) test_data <- data.frame( id = 1:10, age = c(50, 60, NA, 70, NA, 55, 65, NA, 75, 80), category = c("A", "B", NA, "A", "B", NA, "C", "A", NA, "B"), score = c(10, NA, 15, 20, NA, 25, 30, NA, 35, 40), stringsAsFactors = FALSE ) # Mean imputation should handle numeric variables imputed <- rwe_impute( test_data, method = "mean", vars = c("age", "score"), seed = 123 ) expect_equal(sum(is.na(imputed$imputed_data$age)), 0) expect_equal(sum(is.na(imputed$imputed_data$score)), 0) # Category still has NAs (mean doesn't work on characters) expect_true(sum(is.na(imputed$imputed_data$category)) > 0) }) test_that("impute_mean only imputes numeric variables", { test_data <- data.frame( numeric_var = c(1, 2, NA, 4), char_var = c("a", "b", NA, "d"), stringsAsFactors = FALSE ) # Should skip character variable imputed <- impute_mean(test_data, c("numeric_var", "char_var")) expect_equal(sum(is.na(imputed$numeric_var)), 0) expect_equal(sum(is.na(imputed$char_var)), 1) # Still has NA }) test_that("impute_median only imputes numeric variables", { test_data <- data.frame( numeric_var = c(1, 2, NA, 4, 5), char_var = c("a", "b", NA, "d", "e"), stringsAsFactors = FALSE ) imputed <- impute_median(test_data, c("numeric_var", "char_var")) expect_equal(sum(is.na(imputed$numeric_var)), 0) expect_equal(sum(is.na(imputed$char_var)), 1) # Still has NA }) test_that("calculate_missingness returns correct structure", { test_data <- data.frame( var1 = c(1, 2, NA, 4, NA), var2 = c(10, NA, 30, 40, 50), var3 = c(100, 200, 300, 400, 500) # No missing ) result <- calculate_missingness(test_data, c("var1", "var2", "var3")) expect_s3_class(result, "data.frame") expect_equal(nrow(result), 3) expect_true(all(c("variable", "n_missing", "pct_missing") %in% names(result))) expect_equal(result$n_missing[result$variable == "var1"], 2) expect_equal(result$n_missing[result$variable == "var2"], 1) expect_equal(result$n_missing[result$variable == "var3"], 0) expect_equal(result$pct_missing[result$variable == "var1"], 0.4) expect_equal(result$pct_missing[result$variable == "var2"], 0.2) expect_equal(result$pct_missing[result$variable == "var3"], 0.0) }) test_that("rwe_impute records number of imputed values", { set.seed(123) test_data <- data.frame( age = c(50, 60, NA, 70, NA, 55), bmi = c(25, NA, 27, 28, 26, NA) ) imputed <- rwe_impute(test_data, method = "mean", seed = 123) # Should have imputed 4 values total (2 age + 2 bmi) expect_equal(imputed$n_imputed, 4) }) test_that("rwe_impute stores method information", { test_data <- data.frame( age = c(50, 60, NA, 70) ) imputed_mean <- rwe_impute(test_data, method = "mean") imputed_median <- rwe_impute(test_data, method = "median") expect_equal(imputed_mean$method, "mean") expect_equal(imputed_median$method, "median") }) test_that("rwe_impute handles edge case with all missing values", { test_data <- data.frame( age = c(NA, NA, NA, NA), bmi = c(25, 26, 27, 28) ) # Mean of all NAs is NaN imputed <- rwe_impute(test_data, method = "mean", vars = "age") # Should not crash but result might be NaN expect_s3_class(imputed, "rwe_imputation") }) test_that("rwe_impute handles single row data", { test_data <- data.frame( age = NA, bmi = 25 ) # Can't really impute with single row imputed <- rwe_impute(test_data, method = "mean") expect_s3_class(imputed, "rwe_imputation") }) test_that("rwe_impute works with knn method", { skip_if_not_installed("FNN") set.seed(321) n <- 150 age_vals <- rnorm(n, 60, 8) test_data <- data.frame( age = age_vals, bmi = rnorm(n, 28, 4), glucose = 0.5 * age_vals + rnorm(n, 0, 5) ) test_data$glucose[sample(n, 20)] <- NA imputed <- rwe_impute( data = test_data, method = "knn", vars = "glucose", predictors = c("age", "bmi"), k = 3, seed = 321 ) expect_s3_class(imputed, "rwe_imputation") expect_equal(sum(is.na(imputed$imputed_data$glucose)), 0) }) test_that("rwe_impute works with xgboost method", { skip_if_not_installed("xgboost") set.seed(654) n <- 120 age_vals <- rnorm(n, 55, 7) weight_vals <- rnorm(n, 75, 12) test_data <- data.frame( age = age_vals, weight = weight_vals, cholesterol = 0.2 * age_vals + 0.5 * weight_vals + rnorm(n, 0, 10) ) test_data$cholesterol[sample(n, 18)] <- NA imputed <- rwe_impute( data = test_data, method = "xgboost", vars = "cholesterol", predictors = c("age", "weight"), xgb_nrounds = 20, seed = 654 ) expect_s3_class(imputed, "rwe_imputation") expect_equal(sum(is.na(imputed$imputed_data$cholesterol)), 0) }) test_that("print.rwe_imputation method exists", { set.seed(123) test_data <- data.frame( age = c(50, 60, NA, 70) ) imputed <- rwe_impute(test_data, method = "mean", seed = 123) # Should have a print method expect_output(print(imputed)) })