# Tests comparing propensity package categorical weights to WeightIt and PSweight

test_that("categorical weights match WeightIt for all estimands", {
  skip_if_not_installed("WeightIt")
  skip_if_not_installed("nnet")
  skip_on_cran()

  # Note: WeightIt uses its own multinom_weightit() implementation for multi-category
  # treatments while we use nnet::multinom. Small differences are expected due to
  # different optimization algorithms.

  # Create test dataset with 3-category treatment
  set.seed(123)
  n <- 200
  x1 <- rnorm(n)
  x2 <- rnorm(n)
  x3 <- rbinom(n, 1, 0.5)

  # Generate treatment with known propensity scores
  z1 <- 0.5 * x1 + 0.3 * x2 + 0.5 * x3 + rnorm(n, sd = 0.5)
  z2 <- -0.3 * x1 + 0.5 * x2 - 0.3 * x3 + rnorm(n, sd = 0.5)
  p1 <- exp(z1) / (1 + exp(z1) + exp(z2))
  p2 <- exp(z2) / (1 + exp(z1) + exp(z2))
  p3 <- 1 - p1 - p2

  # Sample treatment
  u <- runif(n)
  trt <- factor(ifelse(u < p1, "A", ifelse(u < p1 + p2, "B", "C")))

  # Create data frame
  test_data <- data.frame(
    trt = trt,
    x1 = x1,
    x2 = x2,
    x3 = x3
  )

  # Fit propensity score model
  ps_model <- nnet::multinom(
    trt ~ x1 + x2 + x3,
    data = test_data,
    trace = FALSE
  )
  ps_matrix <- predict(ps_model, type = "probs")

  # Ensure matrix has columns for all levels in correct order
  ps_matrix <- ps_matrix[, levels(trt)]

  # Test ATE weights
  w_ate_propensity <- wt_ate(ps_matrix, trt, exposure_type = "categorical")
  w_ate_weightit <- WeightIt::weightit(
    trt ~ x1 + x2 + x3,
    data = test_data,
    method = "ps",
    estimand = "ATE"
  )$weights

  expect_equal(as.numeric(w_ate_propensity), w_ate_weightit, tolerance = 1e-5)

  # Test ATT weights for each focal category
  for (focal in levels(trt)) {
    w_att_propensity <- wt_att(
      ps_matrix,
      trt,
      .focal_level = focal,
      exposure_type = "categorical"
    )
    w_att_weightit <- WeightIt::weightit(
      trt ~ x1 + x2 + x3,
      data = test_data,
      method = "ps",
      estimand = "ATT",
      focal = focal
    )$weights

    expect_equal(
      as.numeric(w_att_propensity),
      w_att_weightit,
      tolerance = 1e-5,
      label = paste("ATT weights with focal =", focal)
    )
  }

  # Test ATO weights
  w_ato_propensity <- wt_ato(ps_matrix, trt, exposure_type = "categorical")
  w_ato_weightit <- WeightIt::weightit(
    trt ~ x1 + x2 + x3,
    data = test_data,
    method = "ps",
    estimand = "ATO"
  )$weights

  expect_equal(as.numeric(w_ato_propensity), w_ato_weightit, tolerance = 1e-5)

  # Test ATM weights
  w_atm_propensity <- wt_atm(ps_matrix, trt, exposure_type = "categorical")
  w_atm_weightit <- WeightIt::weightit(
    trt ~ x1 + x2 + x3,
    data = test_data,
    method = "ps",
    estimand = "ATM"
  )$weights

  expect_equal(as.numeric(w_atm_propensity), w_atm_weightit, tolerance = 1e-5)
})

test_that("categorical weights produce same treatment effect estimates as PSweight", {
  skip_if_not_installed("PSweight")
  skip_if_not_installed("nnet")
  skip_on_cran()

  # Create test dataset with 3-category treatment
  set.seed(456)
  n <- 200
  x1 <- rnorm(n)
  x2 <- rnorm(n)
  x3 <- rbinom(n, 1, 0.5)

  # Generate treatment
  z1 <- 0.5 * x1 + 0.3 * x2 + 0.5 * x3 + rnorm(n, sd = 0.5)
  z2 <- -0.3 * x1 + 0.5 * x2 - 0.3 * x3 + rnorm(n, sd = 0.5)
  p1 <- exp(z1) / (1 + exp(z1) + exp(z2))
  p2 <- exp(z2) / (1 + exp(z1) + exp(z2))
  p3 <- 1 - p1 - p2

  u <- runif(n)
  trt <- ifelse(u < p1, 1, ifelse(u < p1 + p2, 2, 3))

  # Generate outcome for PSweight
  y <- 2 + 1 * (trt == 1) + 3 * (trt == 2) + x1 + x2 + x3 + rnorm(n)

  # Create data frame - PSweight needs numeric treatment
  test_data <- data.frame(
    y = y,
    trt = trt,
    x1 = x1,
    x2 = x2,
    x3 = x3
  )

  # Get propensity scores using our standard approach
  ps_model <- nnet::multinom(
    trt ~ x1 + x2 + x3,
    data = test_data,
    trace = FALSE
  )
  ps_matrix <- predict(ps_model, type = "probs")
  trt_factor <- factor(trt)

  # Test ATE estimation
  w_ate_propensity <- wt_ate(
    ps_matrix,
    trt_factor,
    exposure_type = "categorical"
  )

  # Calculate potential outcomes using our weights
  po_our <- numeric(3)
  for (g in 1:3) {
    po_our[g] <- sum(y[trt == g] * as.numeric(w_ate_propensity)[trt == g]) /
      sum(as.numeric(w_ate_propensity)[trt == g])
  }

  # PSweight ATE estimation
  psw_ate <- PSweight::PSweight(
    ps.formula = trt ~ x1 + x2 + x3,
    yname = "y",
    data = test_data,
    weight = "IPW"
  )

  # Compare potential outcome estimates
  expect_equal(po_our, as.numeric(psw_ate$muhat), tolerance = 1e-3)

  # Test entropy weights
  w_entropy_propensity <- wt_entropy(
    ps_matrix,
    trt_factor,
    exposure_type = "categorical"
  )

  # Calculate potential outcomes using entropy weights
  po_entropy_our <- numeric(3)
  for (g in 1:3) {
    po_entropy_our[g] <- sum(
      y[trt == g] * as.numeric(w_entropy_propensity)[trt == g]
    ) /
      sum(as.numeric(w_entropy_propensity)[trt == g])
  }

  psw_entropy <- PSweight::PSweight(
    ps.formula = trt ~ x1 + x2 + x3,
    yname = "y",
    data = test_data,
    weight = "entropy"
  )

  expect_equal(po_entropy_our, as.numeric(psw_entropy$muhat), tolerance = 1e-3)

  # Test overlap (ATO) weights
  w_ato_propensity <- wt_ato(
    ps_matrix,
    trt_factor,
    exposure_type = "categorical"
  )

  # Calculate potential outcomes using ATO weights
  po_ato_our <- numeric(3)
  for (g in 1:3) {
    po_ato_our[g] <- sum(y[trt == g] * as.numeric(w_ato_propensity)[trt == g]) /
      sum(as.numeric(w_ato_propensity)[trt == g])
  }

  psw_overlap <- PSweight::PSweight(
    ps.formula = trt ~ x1 + x2 + x3,
    yname = "y",
    data = test_data,
    weight = "overlap"
  )

  expect_equal(po_ato_our, as.numeric(psw_overlap$muhat), tolerance = 1e-3)
})

test_that("categorical weights handle 4+ categories correctly", {
  skip_if_not_installed("WeightIt")
  skip_if_not_installed("nnet")
  skip_on_cran()

  # Create test dataset with 4-category treatment
  set.seed(789)
  n <- 300
  x1 <- rnorm(n)
  x2 <- rnorm(n)

  # Generate 4-category treatment
  z1 <- 0.5 * x1 + 0.3 * x2 + rnorm(n, sd = 0.5)
  z2 <- -0.3 * x1 + 0.5 * x2 + rnorm(n, sd = 0.5)
  z3 <- 0.2 * x1 - 0.4 * x2 + rnorm(n, sd = 0.5)

  denom <- 1 + exp(z1) + exp(z2) + exp(z3)
  p1 <- exp(z1) / denom
  p2 <- exp(z2) / denom
  p3 <- exp(z3) / denom
  p4 <- 1 - p1 - p2 - p3

  # Sample treatment
  u <- runif(n)
  trt <- factor(
    ifelse(
      u < p1,
      "A",
      ifelse(u < p1 + p2, "B", ifelse(u < p1 + p2 + p3, "C", "D"))
    )
  )

  test_data <- data.frame(trt = trt, x1 = x1, x2 = x2)

  # Fit propensity score model
  ps_model <- nnet::multinom(trt ~ x1 + x2, data = test_data, trace = FALSE)
  ps_matrix <- predict(ps_model, type = "probs")
  ps_matrix <- ps_matrix[, levels(trt)]

  # Test ATE weights
  w_ate_propensity <- wt_ate(ps_matrix, trt, exposure_type = "categorical")
  w_ate_weightit <- WeightIt::weightit(
    trt ~ x1 + x2,
    data = test_data,
    method = "ps",
    estimand = "ATE"
  )$weights

  expect_equal(as.numeric(w_ate_propensity), w_ate_weightit, tolerance = 1e-5)

  # Check that weights have correct attributes
  expect_equal(attr(w_ate_propensity, "n_categories"), 4)
  expect_equal(attr(w_ate_propensity, "category_names"), levels(trt))
})

test_that("stabilized categorical ATE weights match WeightIt", {
  skip_if_not_installed("WeightIt")
  skip_if_not_installed("nnet")
  skip_on_cran()

  set.seed(101)
  n <- 200
  x1 <- rnorm(n)
  x2 <- rnorm(n)

  # Generate 3-category treatment
  z1 <- 0.5 * x1 + 0.3 * x2 + rnorm(n, sd = 0.5)
  z2 <- -0.3 * x1 + 0.5 * x2 + rnorm(n, sd = 0.5)
  p1 <- exp(z1) / (1 + exp(z1) + exp(z2))
  p2 <- exp(z2) / (1 + exp(z1) + exp(z2))
  p3 <- 1 - p1 - p2

  u <- runif(n)
  trt <- factor(ifelse(u < p1, "A", ifelse(u < p1 + p2, "B", "C")))

  test_data <- data.frame(trt = trt, x1 = x1, x2 = x2)

  # Fit propensity score model
  ps_model <- nnet::multinom(trt ~ x1 + x2, data = test_data, trace = FALSE)
  ps_matrix <- predict(ps_model, type = "probs")
  ps_matrix <- ps_matrix[, levels(trt)]

  # Test stabilized ATE weights
  w_ate_stab_propensity <- wt_ate(
    ps_matrix,
    trt,
    exposure_type = "categorical",
    stabilize = TRUE
  )

  # WeightIt stabilized weights
  # Note: WeightIt uses s.weights for stabilization
  w_obj <- WeightIt::weightit(
    trt ~ x1 + x2,
    data = test_data,
    method = "ps",
    estimand = "ATE",
    stabilize = TRUE
  )
  w_ate_stab_weightit <- w_obj$weights

  expect_equal(
    as.numeric(w_ate_stab_propensity),
    w_ate_stab_weightit,
    tolerance = 1e-5
  )
  expect_true(is_stabilized(w_ate_stab_propensity))
})

test_that("categorical weights work with parsnip models", {
  skip_if_not_installed("parsnip")
  skip_if_not_installed("nnet")
  skip_on_cran()

  set.seed(202)
  n <- 200
  x1 <- rnorm(n)
  x2 <- rnorm(n)
  x3 <- rbinom(n, 1, 0.5)

  # Generate 3-category treatment
  z1 <- 0.5 * x1 + 0.3 * x2 + 0.5 * x3 + rnorm(n, sd = 0.5)
  z2 <- -0.3 * x1 + 0.5 * x2 - 0.3 * x3 + rnorm(n, sd = 0.5)
  p1 <- exp(z1) / (1 + exp(z1) + exp(z2))
  p2 <- exp(z2) / (1 + exp(z1) + exp(z2))
  p3 <- 1 - p1 - p2

  u <- runif(n)
  trt <- factor(ifelse(u < p1, "A", ifelse(u < p1 + p2, "B", "C")))

  test_data <- data.frame(
    trt = trt,
    x1 = x1,
    x2 = x2,
    x3 = x3
  )

  # Fit propensity score model using parsnip
  ps_spec <- parsnip::multinom_reg() %>%
    parsnip::set_engine("nnet") %>%
    parsnip::set_mode("classification")

  ps_fit <- ps_spec %>%
    parsnip::fit(trt ~ x1 + x2 + x3, data = test_data)

  # Get predictions as probabilities (data frame)
  ps_probs <- predict(ps_fit, new_data = test_data, type = "prob")

  # Test that we can calculate weights directly with the data frame
  expect_no_error(
    w_ate <- wt_ate(ps_probs, trt, exposure_type = "categorical")
  )
  expect_no_error(
    w_att <- wt_att(
      ps_probs,
      trt,
      .focal_level = "A",
      exposure_type = "categorical"
    )
  )
  expect_no_error(
    w_ato <- wt_ato(ps_probs, trt, exposure_type = "categorical")
  )

  # Check basic properties
  expect_s3_class(w_ate, "psw")
  expect_equal(length(w_ate), n)
  expect_equal(attr(w_ate, "n_categories"), 3)

  # Compare to direct nnet fit
  ps_nnet <- nnet::multinom(trt ~ x1 + x2 + x3, data = test_data, trace = FALSE)
  ps_matrix_nnet <- predict(ps_nnet, type = "probs")
  ps_matrix_nnet <- ps_matrix_nnet[, levels(trt)]

  w_ate_nnet <- wt_ate(ps_matrix_nnet, trt, exposure_type = "categorical")

  # Should be very close (may have slight differences due to fitting algorithm)
  expect_equal(as.numeric(w_ate), as.numeric(w_ate_nnet), tolerance = 0.01)
})