# Helper function to create a simple dataset
create_dataset <- function(n_samples, n_features = 5L, n_targets = 1L, coef = NULL, noise = 0L) {
  X <- matrix(stats::runif(n_samples * n_features), nrow = n_samples)
  colnames(X) <- paste0("Feature", seq_len(n_features))

  if (is.null(coef)) {
    coef <- matrix(stats::runif(n_features * n_targets), nrow = n_features)
  } else if (is.vector(coef)) {
    coef <- matrix(coef, ncol = 1L)
  }

  # Normalize coefficients
  coef <- apply(coef, 2L, function(col) col / sum(abs(col)))

  Y <- X %*% coef + matrix(stats::rnorm(n_samples * n_targets, 0.0, noise), nrow = n_samples)
  colnames(Y) <- letters[seq_len(ncol(Y))]

  if (n_targets > 1L) {
    # Ensure all values are positive
    Y <- Y - min(Y) + 1e-6
    # Normalize rows to sum to 1
    Y <- Y / rowSums(Y)
  }

  list(X = X, Y = Y, coef = coef)
}

# Create datasets for reuse
set.seed(42L)
N <- 100L
regression_data <- create_dataset(N, noise = 0.1)
multi_regression_data <- create_dataset(N, n_targets = 3L)

Y_binary <- matrix(as.integer(regression_data$Y > mean(regression_data$Y)), nrow = N)
Y_multi_binary <- matrix(as.integer(multi_regression_data$Y > mean(multi_regression_data$Y)), nrow = N)
colnames(Y_multi_binary) <- letters[seq_len(ncol(Y_multi_binary))]

# Test for regression (one col)
testthat::test_that("greedyMSE works for regression", {
  model <- greedyMSE(regression_data$X, regression_data$Y)
  testthat::expect_lt(model$RMSE, stats::sd(regression_data$Y)) # Model should be better than baseline
  # High correlation with true values
  testthat::expect_gt(stats::cor(predict(model, regression_data$X), regression_data$Y), 0.8)

  testthat::expect_output(print(model), "Greedy MSE")
  testthat::expect_output(print(model), "RMSE")
  testthat::expect_output(print(model), "Weights")
})

# Test for binary classification (one col)
testthat::test_that("greedyMSE works for binary classification", {
  model <- greedyMSE(regression_data$X, Y_binary)
  predictions <- predict(model, regression_data$X)
  accuracy <- mean((predictions > 0.5) == Y_binary)
  testthat::expect_gt(accuracy, 0.7) # Accuracy should be better than random guessing

  testthat::expect_output(print(model), "Greedy MSE")
  testthat::expect_output(print(model), "RMSE")
  testthat::expect_output(print(model), "Weights")
})

# Test for multiple regression (many cols)
testthat::test_that("greedyMSE works for multiple regression", {
  model <- greedyMSE(multi_regression_data$X, multi_regression_data$Y)

  testthat::expect_lt(model$RMSE, 0.3)
  predictions <- predict(model, multi_regression_data$X)
  correlations <- diag(stats::cor(predictions, multi_regression_data$Y))
  testthat::expect_gt(min(correlations), 0.7) # High correlation for all targets
})

# Test for multiclass classification (many cols)
testthat::test_that("greedyMSE works for multiclass classification", {
  model <- greedyMSE(multi_regression_data$X, Y_multi_binary)
  predictions <- predict(model, multi_regression_data$X)
  accuracy <- mean(apply(predictions, 1L, which.max) == apply(multi_regression_data$Y, 1L, which.max))
  testthat::expect_gt(accuracy, 0.6) # Accuracy should be better than random guessing
})

# Edge cases
testthat::test_that("greedyMSE handles edge cases", {
  # 1. Single feature
  data <- create_dataset(100L, 1L, 1L)
  testthat::expect_equal(greedyMSE(data$X, data$Y)$RMSE, 0.0, tolerance = 1e-6)

  # 2. Perfect multicollinearity
  X <- matrix(1L, nrow = 100L, ncol = 2L)
  Y <- X[, 1L] + stats::rnorm(100L, 0.0, 0.1)
  testthat::expect_equal(greedyMSE(data$X, data$Y)$RMSE, 0.0, tolerance = 1e-6)

  # 3. All zero features
  X <- matrix(0L, nrow = 100L, ncol = 5L)
  Y <- matrix(stats::rnorm(100L), ncol = 1L)
  model <- greedyMSE(X, Y)
  testthat::expect_equal(model$RMSE, stats::sd(Y), tolerance = 1e-2)

  # 4. Constant target
  X <- matrix(stats::runif(500L), nrow = 100L)
  Y <- matrix(rep(1L, 100L), ncol = 1L)
  model <- greedyMSE(X, Y)
  testthat::expect_equal(model$RMSE, 0.5, tolerance = 0.1)

  # 5. Very large values
  X <- matrix(stats::runif(500L, 1.0e6, 1.0e7), nrow = 100L)
  Y <- matrix(rowSums(X) + stats::rnorm(100L, 0L, 1.0e5), ncol = 1L)
  model <- greedyMSE(X, Y)
  pred <- predict(model, X)
  testthat::expect_gt(cor(pred, Y), 0.4)
})

# Regression ensembling test
testthat::test_that("greedyMSE can be used for regression ensembling with GLM, rpart, and RF", {
  X <- regression_data$X
  Y <- regression_data$Y

  # GLM
  glm_model <- stats::glm(Y ~ X, family = stats::gaussian())
  glm_pred <- stats::predict(glm_model, newdata = data.frame(X))

  # rpart
  rpart_model <- rpart::rpart(Y ~ X)
  rpart_pred <- stats::predict(rpart_model, newdata = data.frame(X))

  # greedyMSE
  greedy_model <- greedyMSE(X, Y)
  greedy_pred <- predict(greedy_model, X)

  # Ensemble
  ensemble_X <- cbind(glm_pred, rpart_pred, greedy_pred)
  ensemble_model <- greedyMSE(ensemble_X, Y)

  # Check if ensemble performs better than the best individual model
  individual_rmse <- c(
    sqrt(mean((Y - glm_pred)^2L)),
    sqrt(mean((Y - rpart_pred)^2L)),
    greedy_model$RMSE
  )
  testthat::expect_lte(ensemble_model$RMSE, min(individual_rmse))
})

# Classification ensembling test
testthat::test_that("greedyMSE can be used for classification ensembling with GLM, rpart, and RF", {
  X <- regression_data$X
  Y_binary <- as.integer(regression_data$Y > stats::median(regression_data$Y))

  # GLM (logistic regression)
  glm_model <- stats::glm(Y_binary ~ X, family = stats::binomial())
  glm_pred <- stats::predict(glm_model, newdata = data.frame(X), type = "response")

  # rpart
  rpart_model <- rpart::rpart(Y_binary ~ X, method = "class")
  rpart_pred <- stats::predict(rpart_model, newdata = data.frame(X), type = "prob")[, 2L]

  # Random Forest
  rf_model <- randomForest::randomForest(X, as.factor(Y_binary))
  rf_pred <- stats::predict(rf_model, newdata = X, type = "prob")[, 2L]

  # greedyMSE
  greedy_model <- greedyMSE(X, matrix(Y_binary, ncol = 1L))
  greedy_pred <- predict(greedy_model, X)

  # Ensemble
  ensemble_X <- cbind(glm_pred, rpart_pred, rf_pred, greedy_pred)
  ensemble_model <- greedyMSE(ensemble_X, matrix(Y_binary, ncol = 1L))

  # Check if ensemble performs better than the best individual model
  individual_rmse <- c(
    sqrt(mean((Y_binary - glm_pred)^2L)),
    sqrt(mean((Y_binary - rpart_pred)^2L)),
    sqrt(mean((Y_binary - rf_pred)^2L)),
    greedy_model$RMSE
  )
  testthat::expect_lte(ensemble_model$RMSE, min(individual_rmse))
})

# Test fitting and predicting with factor response (2 levels)
testthat::test_that("greedyMSE works with 2-level factor response", {
  lev <- c("Low", "High")
  y_factor <- cut(regression_data$Y, breaks = 2L, labels = lev)
  model <- greedyMSE(regression_data$X, y_factor)
  pred <- predict(model, regression_data$X, return_labels = TRUE)

  testthat::expect_is(pred, "factor")
  testthat::expect_identical(levels(pred), levels(y_factor))
  testthat::expect_gt(mean(pred == y_factor), 0.60) # 0.50 is random guessing
})

# Test fitting and predicting with factor response (3 levels)
testthat::test_that("greedyMSE works with 3-level factor response", {
  lev <- c("Low", "Medium", "High")
  y_factor <- cut(regression_data$Y, breaks = 3L, labels = lev)
  model <- greedyMSE(regression_data$X, y_factor)
  pred <- predict(model, regression_data$X, return_labels = TRUE)

  testthat::expect_is(pred, "factor")
  testthat::expect_identical(levels(pred), levels(y_factor))
  testthat::expect_gt(mean(pred == y_factor), 0.4) # 0.33 is random guessing
})

# Test varImp functionality
testthat::test_that("varImp works for greedyMSE", {
  model <- greedyMSE(regression_data$X, regression_data$Y)
  importance <- caret::varImp(model)

  testthat::expect_is(importance, "data.frame")
  testthat::expect_identical(nrow(importance), ncol(regression_data$X))
  testthat::expect_true(all(importance$Overall >= 0.0 & importance$Overall <= 1.0))
  testthat::expect_equal(sum(importance$Overall), 1.0, tolerance = 1e-6)
})

# Test predict with data.frame input
testthat::test_that("predict works with data.frame input", {
  model <- greedyMSE(regression_data$X, regression_data$Y)
  newdata_df <- as.data.frame(regression_data$X)
  pred <- predict(model, newdata_df)

  testthat::expect_is(pred, "numeric")
  testthat::expect_length(pred, nrow(newdata_df))
})

# Test predict with label return for classification
testthat::test_that("predict returns labels for classification", {
  y_factor <- factor(ifelse(regression_data$Y > median(regression_data$Y), "High", "Low"))
  model <- greedyMSE(regression_data$X, y_factor)
  pred_prob <- predict(model, regression_data$X, return_labels = FALSE)
  pred_label <- predict(model, regression_data$X, return_labels = TRUE)

  testthat::expect_is(pred_prob, "matrix")
  testthat::expect_is(pred_label, "factor")
  testthat::expect_identical(levels(pred_label), levels(y_factor))
})

# More realistic test
testthat::test_that("greedyMSE handles highly correlated predictors", {
  set.seed(123L)
  n <- 1000L
  base <- rnorm(n)
  X <- matrix(
    cbind(
      base + rnorm(n, 0.0, 0.01),
      base + rnorm(n, 0.0, 0.01),
      base + rnorm(n, 0.0, 0.01),
      base + rnorm(n, 0.0, 0.01),
      base + rnorm(n, 0.0, 0.01),
      rnorm(n)
    ),
    ncol = 6L
  )
  colnames(X) <- c("x1", "x2", "x3", "x4", "x5", "x6")
  coef <- c(10.0, 10.0, 5.0, 5.0, 0.0, 0.0)
  coef <- coef / sum(coef)
  Y <- X %*% coef + rnorm(n, 0.0, 0.1)
  Y <- Y - mean(Y)
  Y <- Y / sd(Y)

  model <- greedyMSE(X, Y)
  pred <- predict(model, X)

  rmse_model <- sqrt(mean((pred - Y)^2L))
  rmse_mean <- sqrt(mean((mean(Y) - Y)^2L))

  testthat::expect_lt(rmse_model, rmse_mean)
})

testthat::test_that("greedyMSE works with caret::train for regression", {
  set.seed(42L)
  n <- 1000L
  X1 <- stats::rnorm(n)
  X2 <- 0.5 * X1 + sqrt(1L - 0.5^2L) * stats::rnorm(n)
  Y <- 0.9 * X1 + 0.9 * X2 + 0.1 * stats::rnorm(n)
  Y <- (Y - mean(Y)) / stats::sd(Y) # Standardize Y

  # Split data
  train_indices <- sample.int(n, 0.7 * n)
  X_train <- cbind(X1, X2)[train_indices, ]
  X_test <- cbind(X1, X2)[-train_indices, ]
  Y_train <- Y[train_indices]
  Y_test <- Y[-train_indices]

  # Train model
  model <- caret::train(
    X_train,
    Y_train,
    tuneLength = 1L,
    method = greedyMSE_caret()
  )

  # Make predictions
  predictions <- predict(model, newdata = X_test)

  # Compute RMSE
  rmse_model <- sqrt(mean((predictions - Y_test)^2L))
  rmse_mean <- sqrt(mean((mean(Y_train) - Y_test)^2L))

  # Check model performance
  testthat::expect_lt(rmse_model, rmse_mean)

  # Check variable importance
  w <- model$finalModel$model_weights
  expect_equal(sum(w), 1.0, tol = 1.0e-6)
  expect_equal(w[1L], w[2L], tol = 0.2)
})

testthat::test_that("greedyMSE works with caret::train for binary classification", {
  set.seed(42L)

  # Make Y
  n <- 1000L
  X1_pos <- stats::runif(n)
  X2_pos <- 0.5 * X1_pos + sqrt(1L - 0.5^2L) * stats::runif(n)
  Y <- 0.9 * X1_pos + 0.9 * X2_pos + 0.05 * stats::rnorm(n)
  Y <- cut(Y, breaks = 2L, labels = c("Low", "High"))

  # Add negative classes
  # THIS IS IMPORTANT!
  # We need probabilities from ALL classes for ALL the input models
  X1_neg <- 1.0 - X1_pos
  X2_neg <- 1.0 - X2_pos

  X <- cbind(X1_pos, X2_pos, X1_neg, X2_neg)

  # Split data
  train_indices <- sample.int(n, 0.7 * n)
  X_train <- X[train_indices, ]
  X_test <- X[-train_indices, ]
  Y_train <- Y[train_indices]
  Y_test <- Y[-train_indices]

  # Train model
  model <- caret::train(
    X_train,
    Y_train,
    tuneLength = 1L,
    method = greedyMSE_caret()
  )

  # Make predictions
  predictions <- predict(model, newdata = X_test, type = "prob")

  # Compute AUC
  auc <- mean(caTools::colAUC(predictions, Y_test))
  benchmark_auc <- max(caTools::colAUC(X_test, Y_test))

  # Check model performance
  testthat::expect_gt(auc, benchmark_auc)
})

testthat::test_that("greedyMSE works with caret::train for multiclass classification", {
  set.seed(42L)
  n <- 1000L

  # Generate base correlated variables
  Z1 <- stats::rnorm(n)
  Z2 <- 0.5 * Z1 + sqrt(2L - 0.5^2L) * stats::rnorm(n)

  # Generate Y with high correlation to Z1 and Z2
  Y_numeric <- 0.9 * Z1 + 0.9 * Z2 + stats::rnorm(n)

  # Convert Y to categorical
  Y <- cut(Y_numeric, breaks = 3L, labels = c("Low", "Medium", "High"))

  # Create probability matrices for X1 and X2
  create_prob_matrix <- function(z) {
    probs <- stats::pnorm(z, mean = mean(z), sd = sd(z))
    raw_matrix <- matrix(c(1L - probs, probs - 0.5, 0.5 * probs), ncol = 3L)
    exp_matrix <- exp(raw_matrix)
    exp_matrix / rowSums(exp_matrix)
  }

  X1 <- create_prob_matrix(Z1)
  X2 <- create_prob_matrix(Z2)

  # Combine X1 and X2
  X <- cbind(X1, X2)
  colnames(X) <- c("X1_Low", "X1_Medium", "X1_High", "X2_Low", "X2_Medium", "X2_High")

  # Split data
  train_indices <- sample.int(n, 0.7 * n)
  X_train <- X[train_indices, ]
  X_test <- X[-train_indices, ]
  Y_train <- Y[train_indices]
  Y_test <- Y[-train_indices]

  # Train model
  model <- caret::train(
    X_train,
    Y_train,
    tuneLength = 1L,
    method = greedyMSE_caret()
  )

  # Make predictions
  predictions <- predict(model, newdata = X_test, type = "prob")

  # Compute AUC
  auc <- rowMeans(caTools::colAUC(predictions, Y_test))
  benchmark_auc <- rowMeans(caTools::colAUC(X_test, Y_test))

  # Check model performance
  testthat::expect_true(all(auc > benchmark_auc))
})