test_that("result matches reference implementation (fairlearn)", {
  # Compares equalized_odds() output against stored reference values, both
  # without and with case weights.
  # NOTE(review): the reference values are presumably precomputed with
  # Python's fairlearn (per the test name and the "py-" prefix) -- confirm
  # against the helper that generates them.
  data("hpc_cv")
  reference <- read_pydata("py-equalized_odds")

  # Reduce the outcome and the prediction to logical "is VF" factors, and
  # attach the case weights supplied by the test helper.
  hpc_cv$obs_vf <- as.factor(hpc_cv$obs == "VF")
  hpc_cv$pred_vf <- as.factor(hpc_cv$pred == "VF")
  hpc_cv$case_weights <- read_weights_hpc_cv()

  # Metric function built with `Resample` as its argument.
  eo_by_resample <- equalized_odds(Resample)

  # Unweighted estimate. `event_level = "second"` selects the second factor
  # level as the event ("TRUE" when both logical values occur).
  unweighted <- eo_by_resample(
    hpc_cv,
    truth = obs_vf,
    estimate = pred_vf,
    event_level = "second"
  )
  expect_equal(unweighted$.estimate, reference$binary)

  # Same call, now with case weights.
  weighted <- eo_by_resample(
    hpc_cv,
    truth = obs_vf,
    estimate = pred_vf,
    event_level = "second",
    case_weights = case_weights
  )
  expect_equal(weighted$.estimate, reference$weighted)
})