test_that("threshold application works", { gold <- tibble::tribble( ~doc_id, ~label_id, "A", "a", "A", "b", "A", "c", "B", "a", "B", "d", "C", "a", "C", "b", "C", "d", "C", "f" ) pred <- tibble::tribble( ~doc_id, ~label_id, ~score, ~rank, "A", "a", 0.9, 1L, "A", "d", 0.7, 2L, "A", "f", 0.3, 3L, "A", "c", 0.1, 4L, "B", "a", 0.8, 1L, "B", "e", 0.6, 2L, "B", "d", 0.1, 3L, "C", "f", 0.1, 1L, "C", "c", 0.2, 2L, "C", "e", 0.2, 2L ) base_compare <- casimir:::create_comparison(pred, gold) # applying zero as threshold should not change anything res_0 <- casimir:::apply_threshold(threshold = 0, base_compare = base_compare) expect_equal(res_0, base_compare) res_1 <- casimir:::apply_threshold(base_compare = base_compare, threshold = 1) # expect that no labels are suggested for threshold 1 expect_true(sum(res_1$suggested) == 0) # all gold labels should still be present in dataset expect_equal( dplyr::select(res_1, "doc_id", "label_id"), gold ) # expect error if threshold is above 1 expect_error( casimir:::apply_threshold(base_compare = base_compare, threshold = 1.3), regexp = "threshold >= 0 & threshold <= 1 is not TRUE" ) # expect some detailed specific results for the test case res_0p5 <- casimir:::apply_threshold( base_compare = base_compare, threshold = 0.5 ) |> dplyr::select(-rank) exp_0p5 <- tibble::tribble( ~doc_id, ~label_id, ~gold, ~score, ~suggested, ~relevance, "A", "a", TRUE, 0.9, TRUE, 0, "A", "b", TRUE, NA, FALSE, 0, "A", "c", TRUE, 0.1, FALSE, 0, "B", "a", TRUE, 0.8, TRUE, 0, "B", "d", TRUE, 0.1, FALSE, 0, "C", "a", TRUE, NA, FALSE, 0, "C", "b", TRUE, NA, FALSE, 0, "C", "d", TRUE, NA, FALSE, 0, "C", "f", TRUE, 0.1, FALSE, 0, "A", "d", FALSE, 0.7, TRUE, 0, "B", "e", FALSE, 0.6, TRUE, 0 ) expect_equal(res_0p5, exp_0p5) pred2 <- tibble::tribble( ~doc_id, ~label_id, ~score, "A", "a", 0.9, "A", "d", 0.7, "A", "f", 1.3, "A", "c", 0.1, "B", "a", 0.8, "B", "e", 0.6, "B", "d", 0.1, "C", "f", 0.1, "C", "c", 0.2, "C", "e", 0.2 ) compare2 <- casimir:::create_comparison(pred2, gold) expect_error( apply_threshold(base_compare = compare2, limit = 0.5), regexp = 'sum(base_compare[["score"]] > 1, na.rm = TRUE) == 0 is not TRUE', fixed = TRUE ) }) test_that("limits are applied correctly", { base_compare <- casimir:::create_comparison( dnb_gold_standard, create_rank_col(dnb_test_predictions) ) res <- apply_threshold( base_compare = base_compare, threshold = 0, limit = 10L ) expect_equal( nrow(dplyr::filter(res, .data$suggested, .data$rank > 10L)), 0 ) # inputting a limit requires a rank column expect_error( apply_threshold( threshold = 0.1, base_compare = dplyr::select(base_compare, -rank), limit = 5 ), regexp = '.*\\"rank\\" %in% colnames\\(base_compare\\) is not TRUE' ) }) test_that("handling of NULL and inf limits works", { base_compare <- casimir:::create_comparison( dnb_gold_standard, create_rank_col(dnb_test_predictions) ) # NULL is not an appropriate input to limit expect_error( apply_threshold( threshold = 0.1, base_compare = dplyr::select(base_compare, -rank), limit = NULL ), regexp = "is.numeric\\(limit\\) is not TRUE" ) # limit = Inf requires a rank column, as any other limit does expect_error( apply_threshold( threshold = 0.1, base_compare = dplyr::select(base_compare, -rank), limit = Inf ), regexp = '\\"rank\\" %in% colnames\\(base_compare\\) is not TRUE' ) # limit = Inf is an appropriate input expect_silent( apply_threshold( threshold = 0.1, base_compare = base_compare, limit = Inf ) ) })