test_that("Calculations are correct - two class", {
  # roc_curv <- pROC::roc(
  #   two_class_example$truth,
  #   two_class_example$Class1,
  #   levels = rev(levels(two_class_example$truth)),
  #   direction = "<"
  # )
  #
  # lvls <- levels(two_class_example$truth)
  # roc_val <- as.numeric(roc_curv$auc)
  roc_val <- 0.939313857389967

  expect_equal(
    roc_auc_vec(two_class_example$truth, two_class_example$Class1),
    roc_val
  )
})

test_that("Calculations are correct - multi class", {
  # HandTill2001::auc(
  #   HandTill2001::multcap(hpc_cv2$obs, as.matrix(select(hpc_cv2, VF:L)))
  # )

  # HPC_CV takes too long
  hpc_cv2 <- dplyr::filter(
    hpc_cv,
    Resample %in% c("Fold06", "Fold07", "Fold08", "Fold09", "Fold10")
  )

  expect_equal(
    roc_auc(hpc_cv2, obs, VF:L)[[".estimate"]],
    0.827387699597311
  )

  hpc_f1 <- data_hpc_fold1()

  expect_equal(
    roc_auc(hpc_f1, obs, VF:L, estimator = "macro")[[".estimate"]],
    hpc_fold1_macro_metric(roc_auc_binary)
  )
  expect_equal(
    roc_auc(hpc_f1, obs, VF:L, estimator = "macro_weighted")[[".estimate"]],
    hpc_fold1_macro_weighted_metric(roc_auc_binary)
  )
})

test_that("Calculations handles NAs", {
  # HandTill2001::auc(
  #   HandTill2001::multcap(hpc_cv2$obs, as.matrix(select(hpc_cv2, VF:L)))
  # )

  # HPC_CV takes too long
  hpc_cv2 <- dplyr::filter(
    hpc_cv,
    Resample %in% c("Fold06", "Fold07", "Fold08", "Fold09", "Fold10")
  )
  hpc_cv2$VF[1:10] <- NA

  expect_equal(
    roc_auc(hpc_cv2, obs, VF:L)[[".estimate"]],
    0.82746116
  )

  expect_equal(
    roc_auc(hpc_cv2, obs, VF:L, na_rm = FALSE)[[".estimate"]],
    NA_real_
  )
})

test_that("Case weights calculations are correct", {
  sklearn <- read_pydata("py-roc-auc")

  two_class_example$weight <- read_weights_two_class_example()

  expect_equal(
    roc_auc(two_class_example, truth, Class1, case_weights = weight)[[
      ".estimate"
    ]],
    sklearn$case_weight$binary
  )

  sklearn <- read_pydata("py-roc-auc")

  hpc_cv$weight <- read_weights_hpc_cv()

  expect_equal(
    roc_auc(hpc_cv, obs, VF:L, estimator = "macro", case_weights = weight)[[
      ".estimate"
    ]],
    sklearn$case_weight$macro
  )
  expect_equal(
    roc_auc(
      hpc_cv,
      obs,
      VF:L,
      estimator = "macro_weighted",
      case_weights = weight
    )[[".estimate"]],
    sklearn$case_weight$macro_weighted
  )

  # No support for hand_till + case weights

  hpc_cv$weight <- rep(1, times = nrow(hpc_cv))
  hpc_cv$weight[c(100, 200, 150, 2)] <- 5

  hpc_cv <- dplyr::group_by(hpc_cv, Resample)

  hpc_cv_expanded <- hpc_cv[
    vec_rep_each(seq_len(nrow(hpc_cv)), times = hpc_cv$weight),
  ]

  expect_identical(
    roc_auc(hpc_cv, obs, VF:L, case_weights = weight, estimator = "macro"),
    roc_auc(hpc_cv_expanded, obs, VF:L, estimator = "macro")
  )

  expect_identical(
    roc_auc(
      hpc_cv,
      obs,
      VF:L,
      case_weights = weight,
      estimator = "macro_weighted"
    ),
    roc_auc(hpc_cv_expanded, obs, VF:L, estimator = "macro_weighted")
  )
})

test_that("works with hardhat case weights", {
  df <- two_class_example

  imp_wgt <- hardhat::importance_weights(seq_len(nrow(df)))
  freq_wgt <- hardhat::frequency_weights(seq_len(nrow(df)))

  expect_no_error(
    roc_auc_vec(df$truth, df$Class1, case_weights = imp_wgt)
  )

  expect_no_error(
    roc_auc_vec(df$truth, df$Class1, case_weights = freq_wgt)
  )
})

test_that("errors with class_pred input", {
  skip_if_not_installed("probably")

  cp_truth <- probably::as_class_pred(two_class_example$truth, which = 1)
  fct_truth <- two_class_example$truth
  fct_truth[1] <- NA

  estimate <- two_class_example$Class1

  expect_snapshot(
    error = TRUE,
    roc_auc_vec(cp_truth, estimate)
  )
})

test_that("na_rm argument check", {
  expect_snapshot(
    error = TRUE,
    roc_auc_vec(1, 1, na_rm = "yes")
  )
})

test_that("`event_level = 'second'` works", {
  df <- two_class_example

  df_rev <- df
  df_rev$truth <- stats::relevel(df_rev$truth, "Class2")

  expect_equal(
    roc_auc_vec(df$truth, df$Class1),
    roc_auc_vec(df_rev$truth, df_rev$Class1, event_level = "second")
  )
})

test_that("`options` is defunct", {
  expect_snapshot(
    error = TRUE,
    roc_auc(two_class_example, truth, Class1, options = 1)
  )

  expect_snapshot(
    error = TRUE,
    roc_auc_vec(
      truth = two_class_example$truth,
      estimate = two_class_example$Class1,
      options = 1
    )
  )
})

test_that("sklearn equivalent", {
  sklearn <- read_pydata("py-roc-auc")

  expect_equal(
    roc_auc(two_class_example, truth, Class1)[[".estimate"]],
    sklearn$binary
  )

  sklearn <- read_pydata("py-roc-auc")

  expect_equal(
    roc_auc(hpc_cv, obs, VF:L, estimator = "macro")[[".estimate"]],
    sklearn$macro
  )
  expect_equal(
    roc_auc(hpc_cv, obs, VF:L, estimator = "macro_weighted")[[".estimate"]],
    sklearn$macro_weighted
  )
  expect_equal(
    roc_auc(hpc_cv, obs, VF:L, estimator = "hand_till")[[".estimate"]],
    sklearn$hand_till
  )
})

test_that("roc_auc() - can calculate Hand Till when prob matrix column names are different from level values", {
  # HPC_CV takes too long
  hpc_cv2 <- dplyr::filter(
    hpc_cv,
    Resample %in% c("Fold06", "Fold07", "Fold08", "Fold09", "Fold10")
  )

  # Mimic how parsnip returns names
  colnames(hpc_cv2) <- c(
    "obs",
    "pred",
    ".pred_VF",
    ".pred_F",
    ".pred_M",
    ".pred_L",
    "Resample"
  )

  expect_equal(
    roc_auc(hpc_cv2, obs, .pred_VF:.pred_L)[[".estimate"]],
    0.827387699597311
  )
})

test_that("roc_auc() - hand-till method ignores levels with 0 observations with a warning (#123)", {
  # Generally we return `NA_real_` for macro/macro-weighted/micro multiclass,
  # but pROC and HandTill2001 both ignore levels with 0 observations, so we do
  # too for consistency
  truth <- factor(c("x", "x", "z"), levels = c("x", "y", "z"))

  estimate <- c(
    c(0.8, 0.5, 0.6),
    c(0.1, 0.1, 0.1),
    c(0.1, 0.4, 0.3)
  )

  estimate <- matrix(estimate, ncol = 3)
  colnames(estimate) <- c("x", "y", "z")

  # HandTill2001::auc(HandTill2001::multcap(truth, estimate))
  expect_snapshot(
    expect_identical(roc_auc_vec(truth, estimate), 0.5)
  )
})

test_that("roc_auc() - binary roc auc uses equivalent of pROC `direction = <`", {
  # In yardstick we do events (or cases) as the first level
  truth <- factor(c("control", "case", "case"), levels = c("case", "control"))

  # Make really bad predictions
  # This would force `direction = "auto"` to choose `>`,
  # which would be incorrect. We are required to force `direction = <` for
  # our purposes of having `estimate` match the event
  estimate <- c(0.8, 0.2, 0.1)

  # # pROC() expects levels to be in the order of control, then event.
  # auc <- pROC::auc(
  #   truth,
  #   estimate,
  #   levels = c("control", "case"),
  #   direction = "<"
  # )
  # auc <- as.numeric(auc)
  auc <- 0

  expect_identical(roc_auc_vec(truth, estimate), auc)
})

test_that("roc_auc() - equivalent of `direction = <` is forced when individual binary AUCs are computed (#123)", {
  truth <- factor(
    c(
      "c",
      "c",
      "c",
      "d",
      "d",
      "a",
      "d",
      "c",
      "a",
      "a",
      "d",
      "b",
      "d",
      "a",
      "d",
      "a",
      "d",
      "d"
    ),
    levels = c("a", "b", "c", "d")
  )

  estimate <- c(
    c(
      c(
        0.5,
        0.4,
        0.8,
        0.5,
        0.8,
        1,
        0.8,
        0,
        0.5,
        2 / 3,
        0,
        1,
        0.4,
        1 / 6,
        0.4,
        0.8,
        0.5,
        0.6
      )
    ),
    c(0, 0.2, 0, 0, 0, 0, 0, 0.6, 0, 1 / 3, 0, 0, 0, 5 / 6, 0.6, 0, 0, 0),
    c(0, 0.2, 0, 0, 0, 0, 0, 0.2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
    c(
      0.5,
      0.2,
      0.2,
      0.5,
      0.2,
      0,
      0.2,
      0.2,
      0.5,
      0,
      1,
      0,
      0.6,
      0,
      0,
      0.2,
      0.5,
      0.4
    )
  )

  estimate <- matrix(
    estimate,
    ncol = 4,
    dimnames = list(NULL, c("a", "b", "c", "d"))
  )

  # HandTill2001::auc(HandTill2001::multcap(truth, estimate))
  expect_equal(
    roc_auc_vec(truth, estimate),
    0.5890625
  )
})

test_that("roc_auc() - warning is thrown when missing events", {
  no_event <- dplyr::filter(two_class_example, truth == "Class2")

  expect_snapshot(out <- roc_auc(no_event, truth, Class1)[[".estimate"]])

  expect_identical(out, NA_real_)
})

test_that("roc_auc() - warning is thrown when missing controls", {
  no_control <- dplyr::filter(two_class_example, truth == "Class1")

  expect_snapshot(out <- roc_auc(no_control, truth, Class1)[[".estimate"]])

  expect_identical(out, NA_real_)
})

test_that("roc_auc() - multiclass one-vs-all approach results in multiple warnings", {
  no_event <- dplyr::filter(two_class_example, truth == "Class2")

  expect_snapshot(
    out <- roc_auc(no_event, truth, Class1, Class2, estimator = "macro")[[
      ".estimate"
    ]]
  )
  expect_identical(out, NA_real_)

  expect_snapshot(
    out <- roc_auc(no_event, truth, Class1, Class2, estimator = "macro")[[
      ".estimate"
    ]]
  )
  expect_identical(out, NA_real_)

  expect_snapshot(
    out <- roc_auc(
      no_event,
      truth,
      Class1,
      Class2,
      estimator = "macro_weighted"
    )[[".estimate"]]
  )
  expect_identical(out, NA_real_)

  expect_snapshot(
    out <- roc_auc(
      no_event,
      truth,
      Class1,
      Class2,
      estimator = "macro_weighted"
    )[[".estimate"]]
  )
  expect_identical(out, NA_real_)
})

test_that("roc_auc() - hand till approach throws warning and returns `NaN` when only 1 level has observations", {
  x <- factor(c("x", "x", "x"), levels = c("x", "y"))

  estimate <- c(
    c(0.8, 0.5, 0.6),
    c(0.2, 0.5, 0.4)
  )

  estimate <- matrix(estimate, ncol = 2)
  colnames(estimate) <- c("x", "y")

  # With two levels -> one
  expect_snapshot(
    out <- roc_auc_vec(x, estimate, estimator = "hand_till")
  )
  expect_identical(out, NaN)

  x <- factor(c("x", "x", "x"), levels = c("x", "y", "z"))

  estimate <- c(
    c(0.8, 0.5, 0.6),
    c(0.1, 0.1, 0.1),
    c(0.1, 0.4, 0.3)
  )

  estimate <- matrix(estimate, ncol = 3)
  colnames(estimate) <- c("x", "y", "z")

  # With three levels -> one
  expect_snapshot(
    out <- roc_auc_vec(x, estimate, estimator = "hand_till")
  )
  expect_identical(out, NaN)
})

test_that("roc_auc() -  df method - presense of case weights affects default multiclass estimator", {
  hpc_cv$weight <- read_weights_hpc_cv()

  hpc_cv_estimate_matrix <- as.matrix(hpc_cv[c("VF", "F", "M", "L")])

  # Generally hand-till
  out <- roc_auc(hpc_cv, obs, VF:L)
  expect_identical(out$.estimator, "hand_till")
  expect_identical(
    out$.estimate,
    roc_auc_vec(hpc_cv$obs, hpc_cv_estimate_matrix, estimator = "hand_till")
  )

  # Unless case weights are supplied
  out <- roc_auc(hpc_cv, obs, VF:L, case_weights = weight)
  expect_identical(out$.estimator, "macro")
  expect_identical(
    out$.estimate,
    roc_auc_vec(
      hpc_cv$obs,
      hpc_cv_estimate_matrix,
      estimator = "macro",
      case_weights = hpc_cv$weight
    )
  )
})

test_that("roc_auc() - vec method - presense of case weights affects default multiclass estimator", {
  hpc_cv$weight <- read_weights_hpc_cv()

  hpc_cv_estimate_matrix <- as.matrix(hpc_cv[c("VF", "F", "M", "L")])

  # Generally hand-till
  expect_identical(
    roc_auc_vec(hpc_cv$obs, hpc_cv_estimate_matrix),
    roc_auc_vec(hpc_cv$obs, hpc_cv_estimate_matrix, estimator = "hand_till")
  )

  # Unless case weights are supplied
  expect_identical(
    roc_auc_vec(
      hpc_cv$obs,
      hpc_cv_estimate_matrix,
      case_weights = hpc_cv$weight
    ),
    roc_auc_vec(
      hpc_cv$obs,
      hpc_cv_estimate_matrix,
      estimator = "macro",
      case_weights = hpc_cv$weight
    )
  )
})

test_that("roc_auc() - can't use case weights and hand-till method", {
  hpc_cv$weight <- read_weights_hpc_cv()

  expect_snapshot(error = TRUE, {
    roc_auc(hpc_cv, obs, VF:L, estimator = "hand_till", case_weights = weight)
  })
})

test_that("range values are correct", {
  direction <- metric_direction(roc_auc)
  range <- metric_range(roc_auc)
  perfect <- ifelse(direction == "minimize", range[1], range[2])
  worst <- ifelse(direction == "minimize", range[2], range[1])

  df <- tibble::tibble(
    truth = factor(c("a", "a", "a", "b", "b"), levels = c("a", "b")),
    perfect = c(1, 1, 1, 0, 0),
    off = c(0.5, 0.5, 0.5, 0.5, 0.5)
  )

  expect_equal(roc_auc_vec(df$truth, df$perfect), perfect)

  if (direction == "minimize") {
    expect_gt(roc_auc_vec(df$truth, df$off), perfect)
    expect_lte(roc_auc_vec(df$truth, df$off), worst)
  }
  if (direction == "maximize") {
    expect_lt(roc_auc_vec(df$truth, df$off), perfect)
    expect_gte(roc_auc_vec(df$truth, df$off), worst)
  }
})