# tests/testthat/test-cohort-reference.R test_that("cohort_reference calculates percentiles correctly", { # Create multi-athlete test data dates <- rep(seq(as.Date("2024-01-01"), by = "day", length.out = 30), times = 10) cohort_data <- data.frame( date = dates, athlete_id = rep(paste0("athlete", 1:10), each = 30), sport = "Run", acwr_smooth = rnorm(300, mean = 1.0, sd = 0.2), stringsAsFactors = FALSE ) # Calculate reference percentiles result <- calculate_cohort_reference( cohort_data, metric = "acwr_smooth", by = c("sport"), probs = c(0.25, 0.5, 0.75), min_athletes = 5 ) # Check structure expect_s3_class(result, "data.frame") expect_contains(colnames(result), c("date", "percentile", "value", "n_athletes")) # Check that we have 3 percentiles per date dates_count <- unique(result$date) expect_length(unique(result$percentile), 3) # Check that n_athletes is correct (should be 10) expect_equal(unique(result$n_athletes), 10) }) test_that("cohort_reference respects min_athletes threshold", { # Create data with only 3 athletes cohort_data <- data.frame( date = rep(as.Date("2024-01-01"), 3), athlete_id = paste0("athlete", 1:3), sport = "Run", acwr_smooth = c(0.8, 1.0, 1.2), stringsAsFactors = FALSE ) # With min_athletes = 5, should error or return empty expect_error( calculate_cohort_reference( cohort_data, metric = "acwr_smooth", min_athletes = 5 ), "No groups have at least.*athletes" ) # With min_athletes = 3, should work result <- calculate_cohort_reference( cohort_data, metric = "acwr_smooth", min_athletes = 3 ) expect_gt(nrow(result), 0) }) test_that("cohort_reference handles grouping variables", { # Create data with multiple sports cohort_data <- data.frame( date = rep(seq(as.Date("2024-01-01"), by = "day", length.out = 10), times = 10), athlete_id = rep(paste0("athlete", 1:10), each = 10), sport = rep(c("Run", "Ride"), each = 50), acwr_smooth = rnorm(100, mean = 1.0, sd = 0.2), stringsAsFactors = FALSE ) # Group by sport result <- calculate_cohort_reference( cohort_data, metric = "acwr_smooth", by = c("sport"), min_athletes = 3 ) # Should have both sports expect_contains(colnames(result), "sport") expect_setequal(unique(result$sport), c("Run", "Ride")) }) test_that("cohort_reference validates input", { # Empty data expect_error( calculate_cohort_reference(data.frame()), "data.*empty" ) # Missing required column bad_data <- data.frame( date = as.Date("2024-01-01"), athlete_id = "athlete1" ) expect_error( calculate_cohort_reference(bad_data, metric = "acwr_smooth"), "Missing required columns" ) }) test_that("plot_with_reference creates valid plot", { # Create individual data individual_data <- data.frame( date = seq(as.Date("2024-01-01"), by = "day", length.out = 30), acwr_smooth = rnorm(30, mean = 1.1, sd = 0.15) ) # Create reference data reference_data <- data.frame( date = rep(seq(as.Date("2024-01-01"), by = "day", length.out = 30), times = 5), percentile = rep(c("p05", "p25", "p50", "p75", "p95"), each = 30), value = c( rnorm(30, 0.6, 0.05), # p05 rnorm(30, 0.8, 0.05), # p25 rnorm(30, 1.0, 0.05), # p50 rnorm(30, 1.2, 0.05), # p75 rnorm(30, 1.4, 0.05) # p95 ), n_athletes = 10 ) # Create plot p <- plot_with_reference( individual = individual_data, reference = reference_data, metric = "acwr_smooth" ) # Check plot components (plot_with_reference not in vdiffr, validate structure) expect_gt(length(p$layers), 0) }) test_that("add_reference_bands adds layers to plot", { # Create base plot plot_data <- data.frame( date = seq(as.Date("2024-01-01"), by = "day", length.out = 30), value = rnorm(30, 1.0, 0.1) ) base_plot <- ggplot2::ggplot(plot_data, ggplot2::aes(x = date, y = value)) + ggplot2::geom_line() # Create reference data reference_data <- data.frame( date = rep(seq(as.Date("2024-01-01"), by = "day", length.out = 30), times = 5), percentile = rep(c("p05", "p25", "p50", "p75", "p95"), each = 30), value = c( rep(0.6, 30), rep(0.8, 30), rep(1.0, 30), rep(1.2, 30), rep(1.4, 30) ), n_athletes = 10 ) # Add bands plot_with_bands <- add_reference_bands(base_plot, reference_data) # Check that layers were added (add_reference_bands not in vdiffr) expect_gt(length(plot_with_bands$layers), length(base_plot$layers)) }) test_that("reference plotting rejects unfiltered grouped reference data", { dates <- seq(as.Date("2024-01-01"), by = "day", length.out = 3) grouped_reference <- data.frame( date = rep(dates, times = 10), percentile = rep(rep(c("p05", "p25", "p50", "p75", "p95"), each = 3), times = 2), value = runif(30), sport = rep(c("Run", "Ride"), each = 15), stringsAsFactors = FALSE ) individual <- data.frame( date = dates, acwr_smooth = c(0.9, 1.0, 1.1), stringsAsFactors = FALSE ) base_plot <- ggplot2::ggplot(individual, ggplot2::aes(x = date, y = acwr_smooth)) + ggplot2::geom_line() expect_error( add_reference_bands(base_plot, grouped_reference), "multiple groups per date/percentile" ) expect_error( plot_with_reference(individual, grouped_reference), "multiple groups per date/percentile" ) run_reference <- dplyr::filter(grouped_reference, .data$sport == "Run") expect_s3_class(add_reference_bands(base_plot, run_reference), "ggplot") expect_s3_class(plot_with_reference(individual, run_reference), "ggplot") }) test_that("README cohort reference example is runnable with two athletes", { data("sample_acwr", package = "Athlytics") cohort_acwr <- dplyr::bind_rows( dplyr::mutate(sample_acwr, athlete_id = "A1"), dplyr::mutate(sample_acwr, athlete_id = "A2", acwr_smooth = acwr_smooth * 1.05 ) ) reference_bands <- calculate_cohort_reference( cohort_acwr, metric = "acwr_smooth", by = character(0), min_athletes = 2 ) expect_s3_class(reference_bands, "data.frame") expect_gt(nrow(reference_bands), 0) }) test_that("cohort_reference handles missing grouping variables gracefully", { cohort_data <- data.frame( date = rep(as.Date("2024-01-01"), 10), athlete_id = paste0("athlete", 1:10), acwr_smooth = rnorm(10, 1.0, 0.2), stringsAsFactors = FALSE ) # Request grouping by a non-existent variable expect_warning( result <- calculate_cohort_reference( cohort_data, metric = "acwr_smooth", by = c("sport", "nonexistent_var"), # "sport" doesn't exist min_athletes = 5 ), "Grouping variable.*not found" ) # Should still work, just without grouping expect_s3_class(result, "data.frame") }) test_that("cohort_reference is deprecated but remains available", { dates <- rep(seq(as.Date("2024-01-01"), by = "day", length.out = 10), times = 10) cohort_data <- data.frame( date = dates, athlete_id = rep(paste0("athlete", 1:10), each = 10), sport = "Run", acwr_smooth = rnorm(100, mean = 1.0, sd = 0.2), stringsAsFactors = FALSE ) expect_warning( out <- cohort_reference(cohort_data, metric = "acwr_smooth", by = "sport", min_athletes = 3), "deprecated" ) expect_s3_class(out, "data.frame") }) # ============================================================ # Regression tests — bug-fix guards grouped by version # ============================================================ # --- Athlete-weighted percentiles (v1.0.4) ------------------------------ test_that("calculate_cohort_reference is athlete-weighted, not row-weighted (regression)", { # Cohort where one athlete has 10x more rows than another on the same # date. Under the old row-weighted behaviour that athlete dominates the # percentile; under the fix each athlete contributes a single # representative value. dates <- rep(as.Date("2024-06-01"), 11) athlete_ids <- c(rep("A", 10), "B") values <- c(rep(0.5, 10), 5.0) # athlete B is an outlier with ACWR = 5.0 cohort <- data.frame( date = dates, athlete_id = athlete_ids, sport = "Run", acwr_smooth = values, stringsAsFactors = FALSE ) ref <- suppressMessages(calculate_cohort_reference( cohort, metric = "acwr_smooth", by = "sport", probs = c(0.5), min_athletes = 2 )) # Median across two athletes (A=0.5, B=5.0) should be the average = 2.75, # not 0.5 (which would imply row-weighted behaviour). p50_row <- ref$value[ref$percentile == "p50"] expect_length(p50_row, 1) expect_gt(p50_row, 0.6) expect_lt(p50_row, 5.0) }) # --- athlete_id requirement (v1.0.5) ------------------------------------ test_that("calculate_cohort_reference errors when athlete_id is missing by default (regression)", { dat <- data.frame( date = seq(as.Date("2024-01-01"), by = "day", length.out = 10), sport = "Run", acwr_smooth = rnorm(10, 1, 0.1), stringsAsFactors = FALSE ) expect_error( calculate_cohort_reference(dat, metric = "acwr_smooth", by = "sport"), "athlete_id.*is required" ) }) test_that("calculate_cohort_reference tolerates missing athlete_id when opted in (regression)", { dat <- data.frame( date = rep(seq(as.Date("2024-01-01"), by = "day", length.out = 10), times = 2), sport = "Run", acwr_smooth = rnorm(20, 1, 0.1), stringsAsFactors = FALSE ) expect_warning( res <- calculate_cohort_reference( dat, metric = "acwr_smooth", by = "sport", min_athletes = 1, allow_unknown_athlete = TRUE ), "pooling all rows under a single synthetic" ) expect_s3_class(res, "data.frame") })