# Copyright 2025 Observational Health Data Sciences and Informatics
#
# This file is part of PatientLevelPrediction
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

test_that("evaluatePlp", {
  skip_if_offline()
  eval <- evaluatePlp(
    prediction = plpResult$prediction,
    typeColumn = "evaluationType"
  )
  expect_equal(class(eval), "plpEvaluation")
  expect_equal(names(eval), c(
    "evaluationStatistics", "thresholdSummary", "demographicSummary",
    "calibrationSummary", "predictionDistribution"
  ))
})

test_that("modelBasedConcordance", {
  skip_if_offline()
  concordance <- PatientLevelPrediction::modelBasedConcordance(
    prediction = plpResult$prediction
  )
  expect_type(concordance, "double")
})

test_that("evaluatePlp_survival", {
  n <- 100
  plpResultSurvivalPred <- data.frame(
    rowId = 1:n,
    ageYear = sample(100, n, replace = TRUE),
    gender = sample(c("8507", "8532"), n, replace = TRUE),
    outcomeCount = c(rep(1, n * 0.1), rep(0, n * 0.9)),
    value = runif(n, max = 0.1),
    evaluationType = rep("Train", n),
    survivalTime = sample(2000, n, replace = TRUE)
  )
  attr(plpResultSurvivalPred, "metaData")$modelType <- "survival"
  attr(plpResultSurvivalPred, "metaData")$timepoint <- 365

  eval <- evaluatePlp(
    prediction = plpResultSurvivalPred,
    typeColumn = "evaluationType"
  )
  expect_equal(class(eval), "plpEvaluation")
  expect_equal(
    sum(names(eval) %in% c(
      "evaluationStatistics", "demographicSummary", "calibrationSummary",
      "thresholdSummary", "predictionDistribution"
    )),
    5
  )
})

test_that("AUROC", {
  ePrediction <- data.frame(value = runif(100), outcomeCount = round(runif(100)))
  attr(ePrediction, "metaData") <- list(modelType = "binary")
  procAuc <- pROC::roc(ePrediction$outcomeCount, ePrediction$value,
    algorithm = 3,
    direction = "<"
  )
  tolerance <- 0.001
  plpAUC <- computeAuc(ePrediction, confidenceInterval = FALSE)
  expect_equal(as.numeric(procAuc$auc), plpAUC, tolerance = tolerance)
})

test_that("AUPRC", {
  ePrediction <- data.frame(value = runif(100), outcomeCount = round(runif(100)))
  positive <- ePrediction$value[ePrediction$outcomeCount == 1]
  negative <- ePrediction$value[ePrediction$outcomeCount == 0]
  pr <- PRROC::pr.curve(scores.class0 = positive, scores.class1 = negative)
  auprc <- pr$auc.integral
  # area under the precision-recall curve must be between 0 and 1
  expect_gte(auprc, 0)
  expect_lte(auprc, 1)
})

test_that("Brierscore", {
  skip_if_not_installed("scoring")
  ePrediction <- data.frame(value = runif(100), outcomeCount = round(runif(100)))
  ePrediction$dummy <- 1
  brierScoring <- scoring::brierscore(
    outcomeCount ~ value,
    data = ePrediction,
    group = "dummy"
  )$brieravg
  brierPlp <- brierScore(ePrediction)$brier
  expect_equal(as.double(brierScoring), brierPlp)
})

test_that("Average precision", {
  skip_if_not_installed("Metrics")
  ePrediction <- data.frame(value = runif(100), outcomeCount = round(runif(100)))
  avePMetrics <- Metrics::apk(
    nrow(ePrediction),
    which(ePrediction$outcomeCount == 1),
    (1:nrow(ePrediction))[order(-ePrediction$value)]
  )
  avePPlp <- averagePrecision(ePrediction)
  expect_equal(as.double(avePMetrics), avePPlp)
})
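# Added sketch (not part of the original suite): average precision can also be
# checked against a direct implementation of its definition,
# AP = (1 / P) * sum over ranks k of precision@k * rel(k), where P is the
# number of positives and rel(k) is 1 when the k-th ranked subject is a
# positive. This assumes averagePrecision() takes the same value/outcomeCount
# data frame as the test above and follows the apk-style definition.
test_that("Average precision matches a direct computation", {
  ePrediction <- data.frame(value = runif(100), outcomeCount = round(runif(100)))
  # outcome labels ordered by decreasing predicted risk
  ranked <- ePrediction$outcomeCount[order(-ePrediction$value)]
  # precision@k for every cutoff k
  precisionAtK <- cumsum(ranked) / seq_along(ranked)
  # average the precision over the ranks at which positives occur
  aveDirect <- sum(precisionAtK * ranked) / sum(ranked)
  expect_equal(averagePrecision(ePrediction), aveDirect)
})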
test_that("Calibration metrics", {
  skip_if_not_installed("ResourceSelection")
  ePrediction <- data.frame(
    rowId = 1:100,
    value = c(rep(0, 50), rep(1, 50)),
    outcomeCount = c(rep(0, 50), rep(1, 50))
  )
  # perfectly calibrated predictions should give an intercept of 0 and a
  # gradient of 1
  calibrationTest1 <- calibrationLine(ePrediction, numberOfStrata = 2)
  expect_equal(calibrationTest1$lm["Intercept"][[1]], 0)
  expect_equal(calibrationTest1$lm["Gradient"][[1]], 1)
  expect_equal(nrow(calibrationTest1$aggregateLmData), 2)

  # calibrationLine() should return all three components:
  ## lm              # the 'Intercept' and 'Gradient' of the calibration line
  ## aggregateLmData # observed vs predicted values per stratum
  ## hosmerlemeshow  # the Hosmer-Lemeshow test results
  ePrediction2 <- data.frame(
    rowId = 1:100,
    value = c(0.1 + runif(50) * 0.9, runif(50) * 0.6),
    outcomeCount = c(rep(1, 50), rep(0, 50))
  )
  hsExist2 <- ResourceSelection::hoslem.test(
    ePrediction2$outcomeCount,
    ePrediction2$value,
    g = 10
  )
  calibrationTest2 <- calibrationLine(ePrediction2, numberOfStrata = 10)
  # compare plp values against ResourceSelection::hoslem.test
  expect_equal(
    calibrationTest2$hosmerlemeshow["Xsquared"][[1]],
    hsExist2$statistic[[1]]
  )
  expect_equal(
    calibrationTest2$hosmerlemeshow["df"][[1]],
    hsExist2$parameter[[1]]
  )
  expect_equal(
    calibrationTest2$hosmerlemeshow["pvalue"][[1]],
    hsExist2$p.value
  )
})

test_that("E statistics binary", {
  prediction <- data.frame(
    value = c(seq(.1, .5, length.out = 5), NA, .2),
    outcomeCount = c(0, 0, 0, 1, 1, 0, NA)
  )
  eStatsBinary <- PatientLevelPrediction:::calculateEStatisticsBinary(prediction)
  expect_equal(
    eStatsBinary,
    c(Eavg = .34, E90 = .56, Emax = .6)
  )
})

# TODO: test pref scores
# test computePreferenceScore(prediction) - a hedged sketch follows below
#############################################################################
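# Added sketch for the TODO above (hypothetical, not part of the original
# suite): the comment suggests testing computePreferenceScore(prediction).
# The sketch below assumes the function takes a prediction data frame and
# returns it with an added preferenceScore column, following the preference
# score of Walker et al. (2013):
# logit(preferenceScore) = logit(value) - logit(outcome proportion).
# If the real return shape differs, the expectations need adjusting.
test_that("computePreferenceScore sketch", {
  prediction <- data.frame(
    rowId = 1:100,
    value = runif(100),
    outcomeCount = round(runif(100))
  )
  scored <- computePreferenceScore(prediction)
  expect_true("preferenceScore" %in% colnames(scored))
  # a preference score is a rescaled probability, so it stays in [0, 1]
  expect_true(all(scored$preferenceScore >= 0 & scored$preferenceScore <= 1))
})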