test_that("unnecessary columns are removed from model objects", {
  # Create a dataset with many unnecessary columns
  set.seed(123)
  n_rows <- 50
  n_extra_cols <- 20

  # Create base data (what we actually need)
  base_data <- data.frame(
    y = rnorm(n_rows),
    x1 = rnorm(n_rows),
    x2 = rnorm(n_rows),
    control1 = rnorm(n_rows)
  )

  # Add many unnecessary columns
  extra_data <- replicate(n_extra_cols, rnorm(n_rows), simplify = FALSE)
  names(extra_data) <- paste0("extra_col_", 1:n_extra_cols)
  full_data <- cbind(base_data, extra_data)

  # Run modeling pipeline
  result <- br_pipeline(
    full_data,
    y = "y",
    x = c("x1", "x2"),
    x2 = "control1",
    method = "gaussian"
  )

  # Test that original data is preserved in breg object
  expect_equal(ncol(result@data), ncol(full_data) + 1) # +1 for .row_names

  # Test that individual models only have necessary columns
  model1 <- result@models[["x1"]]
  model2 <- result@models[["x2"]]

  # Each model should only have y + focal_variable + control variables
  # For x1: y, x1, control1 (3 columns)
  expect_equal(ncol(model1$model), 3)
  expect_equal(sort(colnames(model1$model)), sort(c("y", "x1", "control1")))

  # For x2: y, x2, control1 (3 columns)
  expect_equal(ncol(model2$model), 3)
  expect_equal(sort(colnames(model2$model)), sort(c("y", "x2", "control1")))

  # Test that results are still correct
  manual_model1 <- lm(y ~ x1 + control1, data = base_data)
  expect_equal(coef(manual_model1), coef(model1), tolerance = 1e-10)

  manual_model2 <- lm(y ~ x2 + control1, data = base_data)
  expect_equal(coef(manual_model2), coef(model2), tolerance = 1e-10)
})

test_that("necessary columns are identified correctly", {
  # Test the utility function directly
  y <- c("response")
  x <- c("focal1", "focal2", "poly(focal3, 2)")
  x2 <- c("control1", "I(control2^2)")
  group_by <- c("group_var")
  available_cols <- c(
    "response", "focal1", "focal2", "focal3", "control1", "control2",
    "group_var", "extra1", "extra2", ".row_names"
  )

  necessary <- get_necessary_columns(y, x, x2, group_by, available_cols)

  # Should include all variables referenced in y, x, x2, group_by
  expected <- c(
    "response", "focal1", "focal2", "focal3", "control1", "control2",
    "group_var", ".row_names"
  )
  expect_setequal(necessary, expected)

  # Should not include extra columns
  expect_false("extra1" %in% necessary)
  expect_false("extra2" %in% necessary)
})

test_that("optimization works with group_by", {
  set.seed(456)
  n_rows <- 40

  # Create test data with group variable
  test_data <- data.frame(
    y = rnorm(n_rows),
    x1 = rnorm(n_rows),
    control1 = rnorm(n_rows),
    group_var = rep(c("A", "B"), each = n_rows / 2),
    extra1 = rnorm(n_rows),
    extra2 = rnorm(n_rows),
    extra3 = rnorm(n_rows)
  )

  # Run with group_by directly in pipeline
  result <- breg(test_data) |>
    br_set_y("y") |>
    br_set_x("x1") |>
    br_set_x2("control1") |>
    br_set_model("gaussian") |>
    br_run(group_by = "group_var")

  # Check that models only contain necessary columns
  # Should have y, x1, control1 (3 columns) - group_var is used for splitting, not in model
  for (model in result@models) {
    expect_equal(ncol(model$model), 3)
    expect_setequal(colnames(model$model), c("y", "x1", "control1"))
  }
})