# Tests for Data Validation Functions
# Tests comprehensive data quality checks

context("Data Validation")

# Create valid test data
set.seed(456)
valid_data <- data.frame(
  id = rep(1:100, each = 5),
  time = rep(0:4, 100),
  outcome = rnorm(500, mean = 10, sd = 3),
  stratum = rep(1:4, each = 125),
  cluster = rep(1:20, each = 25),
  weight = runif(500, 0.5, 2),
  covariate1 = rnorm(500),
  covariate2 = rnorm(500)
)


test_that("validate_survey_data accepts valid data", {
  result <- validate_survey_data(
    data = valid_data,
    id = "id",
    time = "time",
    outcome = "outcome",
    strata = "stratum",
    cluster = "cluster",
    weights = "weight",
    verbose = FALSE
  )

  expect_s4_class(result, "DataValidation")
  expect_true(result@is_valid)
  expect_equal(length(result@errors), 0)
})

test_that("validate_survey_data detects missing variables", {
  result <- validate_survey_data(
    data = valid_data,
    id = "id",
    time = "time",
    outcome = "nonexistent_variable",  # Missing
    verbose = FALSE
  )

  expect_false(result@is_valid)
  expect_true(any(grepl("Missing required variables", result@errors)))
})

test_that("validate_survey_data detects duplicate ID-time combinations", {
  bad_data <- valid_data
  # Add duplicate
  bad_data <- rbind(bad_data, bad_data[1, ])

  result <- validate_survey_data(
    data = bad_data,
    id = "id",
    time = "time",
    outcome = "outcome",
    verbose = FALSE
  )

  expect_false(result@is_valid)
  expect_true(any(grepl("duplicate", result@errors, ignore.case = TRUE)))
})

test_that("validate_survey_data checks cluster nesting", {
  # Create non-nested clusters (cluster 1 in both stratum 1 and 2)
  bad_data <- valid_data
  bad_data$cluster[bad_data$stratum == 2][1:10] <- bad_data$cluster[bad_data$stratum == 1][1]

  result <- validate_survey_data(
    data = bad_data,
    id = "id",
    time = "time",
    outcome = "outcome",
    strata = "stratum",
    cluster = "cluster",
    verbose = FALSE
  )

  expect_false(result@is_valid)
  expect_true(any(grepl("nested", result@errors, ignore.case = TRUE)))
})

test_that("validate_survey_data detects weight issues", {
  # Missing weights
  bad_data <- valid_data
  bad_data$weight[1:10] <- NA

  result <- validate_survey_data(
    data = bad_data,
    id = "id",
    time = "time",
    outcome = "outcome",
    weights = "weight",
    verbose = FALSE
  )

  expect_false(result@is_valid)
  expect_true(any(grepl("missing weight", result@errors, ignore.case = TRUE)))

  # Non-positive weights
  bad_data <- valid_data
  bad_data$weight[1:5] <- -1

  result <- validate_survey_data(
    data = bad_data,
    id = "id",
    time = "time",
    outcome = "outcome",
    weights = "weight",
    verbose = FALSE
  )

  expect_false(result@is_valid)
  expect_true(any(grepl("non-positive", result@errors, ignore.case = TRUE)))
})

test_that("validate_survey_data warns about extreme weights", {
  bad_data <- valid_data
  # Add extreme weight
  bad_data$weight[1] <- 100  # Very large

  result <- validate_survey_data(
    data = bad_data,
    id = "id",
    time = "time",
    outcome = "outcome",
    weights = "weight",
    verbose = FALSE
  )

  # Should have warning about extreme weights
  expect_true(any(grepl("extreme weight", result@warnings, ignore.case = TRUE)))
})

test_that("validate_survey_data checks for minimum observations", {
  bad_data <- valid_data
  # Remove most observations for ID 1
  bad_data <- bad_data[!(bad_data$id == 1 & bad_data$time > 1), ]

  result <- validate_survey_data(
    data = bad_data,
    id = "id",
    time = "time",
    outcome = "outcome",
    min_observations = 3,
    verbose = FALSE
  )

  # Should warn about insufficient observations
  expect_true(any(grepl("< 3 observations", result@warnings, ignore.case = TRUE)) ||
             any(grepl("insufficient", result@warnings, ignore.case = TRUE)))
})

test_that("validate_survey_data analyzes missing data patterns", {
  # Create data with missing outcomes
  missing_data <- valid_data
  missing_data$outcome[sample(1:500, 50)] <- NA

  result <- validate_survey_data(
    data = missing_data,
    id = "id",
    time = "time",
    outcome = "outcome",
    verbose = FALSE
  )

  # Should report missing percentage
  expect_true(!is.null(result@info$outcome_missing_pct))
  expect_true(result@info$outcome_missing_pct > 0)
  expect_true(result@info$outcome_missing_pct <= 100)
})

test_that("validate_survey_data detects monotone missing patterns", {
  # Create monotone dropout pattern
  monotone_data <- valid_data
  # ID 1-10 drop out at different times
  for (i in 1:10) {
    dropout_time <- i %% 5
    monotone_data$outcome[monotone_data$id == i & monotone_data$time >= dropout_time] <- NA
  }

  result <- validate_survey_data(
    data = monotone_data,
    id = "id",
    time = "time",
    outcome = "outcome",
    verbose = FALSE
  )

  # Check that missing pattern is assessed
  expect_true(!is.null(result@info$missing_pattern))
})

test_that("validate_survey_data checks outcome distribution", {
  result <- validate_survey_data(
    data = valid_data,
    id = "id",
    time = "time",
    outcome = "outcome",
    verbose = FALSE
  )

  # Should have outcome summary statistics
  expect_true(!is.null(result@info$outcome_summary))
  expect_true(!is.null(result@info$outcome_skewness))
  expect_true(!is.null(result@info$outcome_kurtosis))
})

test_that("validate_survey_data detects outliers", {
  outlier_data <- valid_data
  # Add extreme outliers
  outlier_data$outcome[1:5] <- c(100, 105, -80, -90, 110)

  result <- validate_survey_data(
    data = outlier_data,
    id = "id",
    time = "time",
    outcome = "outcome",
    verbose = FALSE
  )

  # Should warn about outliers
  expect_true(any(grepl("outlier", result@warnings, ignore.case = TRUE)))
})

test_that("validate_survey_data checks time variable", {
  result <- validate_survey_data(
    data = valid_data,
    id = "id",
    time = "time",
    outcome = "outcome",
    verbose = FALSE
  )

  # Should report time points
  expect_true(!is.null(result@info$time_points))
  expect_true(!is.null(result@info$n_time_points))
  expect_equal(result@info$n_time_points, 5)
})

test_that("validate_survey_data warns about few time points", {
  two_time_data <- valid_data[valid_data$time %in% c(0, 1), ]

  result <- validate_survey_data(
    data = two_time_data,
    id = "id",
    time = "time",
    outcome = "outcome",
    verbose = FALSE
  )

  # Should warn about limited time points
  expect_true(any(grepl("2 time points|limited", result@warnings, ignore.case = TRUE)))
})

test_that("validate_survey_data checks covariate multicollinearity", {
  # Create highly correlated covariates
  corr_data <- valid_data
  corr_data$covariate2 <- corr_data$covariate1 * 1.01 + rnorm(500, 0, 0.01)

  result <- validate_survey_data(
    data = corr_data,
    id = "id",
    time = "time",
    outcome = "outcome",
    covariates = c("covariate1", "covariate2"),
    verbose = FALSE
  )

  # Should warn about high correlations
  expect_true(any(grepl("correlation", result@warnings, ignore.case = TRUE)))
})

test_that("validate_survey_data calculates design effect", {
  result <- validate_survey_data(
    data = valid_data,
    id = "id",
    time = "time",
    outcome = "outcome",
    weights = "weight",
    verbose = FALSE
  )

  # Should calculate design effect
  expect_true(!is.null(result@info$design_effect))
  expect_true(result@info$design_effect > 0)
})

test_that("validate_survey_data provides recommendations", {
  # Use problematic data
  problem_data <- valid_data
  problem_data$weight[1] <- 50  # Extreme weight
  problem_data$outcome[sample(1:500, 150)] <- NA  # High missingness

  result <- validate_survey_data(
    data = problem_data,
    id = "id",
    time = "time",
    outcome = "outcome",
    weights = "weight",
    verbose = FALSE
  )

  # Should have recommendations
  expect_true(length(result@recommendations) > 0)
})

test_that("DataValidation print method works", {
  result <- validate_survey_data(
    data = valid_data,
    id = "id",
    time = "time",
    outcome = "outcome",
    verbose = FALSE
  )

  expect_output(show(result), "VALIDATION")
  expect_output(show(result), "PASSED|FAILED")
})

test_that("Helper functions work correctly", {
  # Test skewness calculation
  x <- c(1, 2, 3, 4, 5, 100)  # Right-skewed
  skew <- .calculate_skewness(x)
  expect_true(skew > 0)

  x_symmetric <- c(-2, -1, 0, 1, 2)
  skew_sym <- .calculate_skewness(x_symmetric)
  expect_true(abs(skew_sym) < 0.1)

  # Test kurtosis calculation
  kurt <- .calculate_kurtosis(rnorm(1000))
  expect_true(abs(kurt) < 1)  # Normal should be near 0 (excess)
})

test_that("Monotone missing check works", {
  # Create monotone pattern
  mono_data <- data.frame(
    id = rep(1:10, each = 5),
    time = rep(1:5, 10),
    outcome = rnorm(50)
  )

  # Add monotone missingness for some individuals
  mono_data$outcome[mono_data$id == 1 & mono_data$time >= 4] <- NA
  mono_data$outcome[mono_data$id == 2 & mono_data$time >= 3] <- NA

  is_mono <- .check_monotone_missing(mono_data, "id", "time", "outcome")
  # Should detect monotone pattern (most individuals have it)
  expect_true(is.logical(is_mono))

  # Create non-monotone pattern
  nonmono_data <- mono_data
  nonmono_data$outcome[nonmono_data$id == 3 & nonmono_data$time == 3] <- NA
  nonmono_data$outcome[nonmono_data$id == 3 & nonmono_data$time == 5] <- 5  # Not monotone

  is_nonmono <- .check_monotone_missing(nonmono_data, "id", "time", "outcome")
  # Pattern should be less clearly monotone
  expect_true(is.logical(is_nonmono))
})