# Sample thresholds test_thresholds <- dplyr::tibble( parameter = c("TestParam1", "TestParam2"), datatype = c("TypeA", "TypeA"), extreme_upper = c(10, 100), mild_upper = c(5, 50) ) # Sample data test_data <- dplyr::tibble( station_name = c("S1", "S2", "S3"), sample_date = as.Date(c("2025-01-01", "2025-01-02", "2025-01-03")), sample_id = 1:3, shark_sample_id_md5 = letters[1:3], sample_min_depth_m = c(0, 5, 10), sample_max_depth_m = c(1, 6, 11), parameter = c("TestParam1", "TestParam1", "TestParam1"), value = c(8, 12, 9), delivery_datatype = c("TypeA", "TypeA", "TypeA") ) test_that("check_outliers returns datatable for outliers", { out <- check_outliers( data = test_data, parameter = "TestParam1", datatype = "TypeA", threshold_col = "extreme_upper", thresholds = test_thresholds ) expect_s3_class(out, "datatables") expect_equal(nrow(out$x$data), 1) # Only the second row exceeds threshold expect_equal(out$x$data$value, 12) }) test_that("check_outliers prints message if no outliers", { data_no_outliers <- test_data %>% mutate(value = value - 10) expect_message( check_outliers( data = data_no_outliers, parameter = "TestParam1", datatype = "TypeA", threshold_col = "extreme_upper", thresholds = test_thresholds ), "is within the extreme_upper range" ) }) test_that("check_outliers errors if threshold_col is missing", { expect_warning( check_outliers( data = test_data, parameter = "TestParam1", datatype = "TypeA", threshold_col = "nonexistent_col", thresholds = test_thresholds ), "Column nonexistent_col not found in thresholds dataframe" ) }) test_that("check_outliers errors if parameter not in thresholds", { expect_warning( check_outliers( data = test_data, parameter = "NonexistentParam", datatype = "TypeA", threshold_col = "extreme_upper", thresholds = test_thresholds ), "No thresholds found for NonexistentParam and TypeA" ) }) test_that("check_outliers only filters by datatype", { data_mixed <- test_data %>% dplyr::add_row( station_name = "S4", sample_date = as.Date("2025-01-04"), sample_id = 4, shark_sample_id_md5 = "d", sample_min_depth_m = 2, sample_max_depth_m = 3, parameter = "TestParam1", value = 200, delivery_datatype = "TypeB" ) out <- check_outliers( data = data_mixed, parameter = "TestParam1", datatype = "TypeA", threshold_col = "extreme_upper", thresholds = test_thresholds ) # Only TypeA row above threshold should be flagged expect_equal(nrow(out$x$data), 1) expect_equal(out$x$data$delivery_datatype, "TypeA") }) # List the outlier-check functions you want to test: outlier_checks <- list( bacterial_production = check_bacterial_production, bacterial_concentration = check_bacterial_concentration, bacterial_carbon = check_bacterial_carbon, chlorophyll_conc = check_chlorophyll_conc, picoplankton_abundance = check_picoplankton_abundance, picoplankton_biovol = check_picoplankton_biovol, picoplankton_carbon = check_picoplankton_carbon, picoplankton_counted = check_picoplankton_counted, zooplankton_abund = check_zooplankton_abund, zooplankton_counted = check_zooplankton_counted, zooplankton_length_mean = check_zooplankton_length_mean, zooplankton_length_median = check_zooplankton_length_median, zooplankton_wetweight = check_zooplankton_wetweight, zooplankton_carbon = check_zooplankton_carbon, zooplankton_wetweight_volume = check_zooplankton_wetweight_volume, zooplankton_wetweight_area = check_zooplankton_wetweight_area, phytoplankton_abund = check_phytoplankton_abund, phytoplankton_biovol = check_phytoplankton_biovol, phytoplankton_carbon = check_phytoplankton_carbon, phytoplankton_counted = check_phytoplankton_counted, primaryproduction_carbonprod = check_primaryproduction_carbonprod, primaryproduction_carbonprodlight = check_primaryproduction_carbonprodlight, primaryproduction_carbonprod_hour = check_primaryproduction_carbonprod_hour, epibenthos_counted = check_epibenthos_counted, epibenthos_dryweight = check_epibenthos_dryweight, epibenthos_specdistr_maxdepth = check_epibenthos_specdistr_maxdepth, epibenthos_specdistr_mindepth = check_epibenthos_specdistr_mindepth, harbourseal_counted = check_harbourseal_counted, greyseal_counted = check_greyseal_counted, zoobenthos_BQIm = check_zoobenthos_BQIm, zoobenthos_abund = check_zoobenthos_abund, zoobenthos_counted = check_zoobenthos_counted, zoobenthos_wetweight = check_zoobenthos_wetweight, ringedseal_calccounted = check_ringedseal_calccounted, harbporp_positivemin = check_harbporp_positivemin ) manual_param_map <- list( bacterial_production = "Bacterial carbon production", bacterial_concentration = "Bacterial abundance", bacterial_carbon = "Bacterial cell carbon content", chlorophyll_conc = "Chlorophyll-a", picoplankton_abundance = "Abundance", picoplankton_biovol = "Biovolume concentration", picoplankton_carbon = "Carbon concentration", picoplankton_counted = "# counted", zooplankton_abund = "Abundance", zooplankton_counted = "# counted", zooplankton_length_mean = "Length (mean)", zooplankton_length_median = "Length (median)", zooplankton_wetweight = "Wet weight", zooplankton_carbon = "Carbon content", zooplankton_wetweight_volume = "Wet weight/volume", zooplankton_wetweight_area = "Wet weight/area", phytoplankton_abund = "Abundance", phytoplankton_biovol = "Biovolume concentration", phytoplankton_carbon = "Carbon concentration", phytoplankton_counted = "# counted", primaryproduction_carbonprod = "Carbon production", primaryproduction_carbonprodlight = "Carbon prod in light", primaryproduction_carbonprod_hour = "Carbon production/hour", epibenthos_counted = "# counted", epibenthos_dryweight = "Dry weight", epibenthos_specdistr_maxdepth = "Species distribution max depth", epibenthos_specdistr_mindepth = "Species distribution min depth", harbourseal_counted = "# counted", greyseal_counted = "# counted", zoobenthos_BQIm = "BQIm", zoobenthos_abund = "Abundance", zoobenthos_counted = "# counted", zoobenthos_wetweight = "Wet weight", ringedseal_calccounted = "Calculated # counted", harbporp_positivemin = "Porpoise positive minutes" ) # Run tests for each function for (fname in names(outlier_checks)) { fn <- outlier_checks[[fname]] print(fname) # for debugging test_that(paste0("deprecated ", fname, " function still works"), { # extreme_threshold <- get_extreme_threshold(fn, fname, manual_map = manual_map) extreme_threshold <- .threshold_values$extreme_upper[.threshold_values$parameter == manual_param_map[fname][[1]]] if (is.null(extreme_threshold) || !is.numeric(extreme_threshold)) { skip(paste("No numeric extreme threshold for", fname)) } # param_string <- get_parameter_string(fn, fname, manual_map = manual_param_map) param_string <- .threshold_values$parameter[.threshold_values$parameter == manual_param_map[fname][[1]]] datatype_string <- .threshold_values$datatype[.threshold_values$parameter == manual_param_map[fname][[1]]] # Build a small tibble with the columns these functions select/expect df_trigger <- dplyr::tibble( station_name = "S1", sample_date = as.Date("2020-01-01"), sample_id = "id1", shark_sample_id_md5 = "md5_1", sample_min_depth_m = 1, sample_max_depth_m = 2, parameter = param_string, delivery_datatype = datatype_string, # make value exceed the extreme threshold by 1 (works for numeric thresholds) value = as.numeric(extreme_threshold) + 1 ) # Expect deprecated warning lifecycle::expect_deprecated( out <- fn(df_trigger) ) }) } test_that("check_outliers stops if custom_group missing from data", { expect_error( check_outliers( data = test_data, parameter = "TestParam1", datatype = "TypeA", thresholds = test_thresholds, custom_group = "nonexistent_column" ), "Grouping column nonexistent_column not found in data" ) }) test_that("check_outliers works with custom_group", { # Add a grouping column to data and thresholds data_grouped <- test_data %>% mutate(group = c("A", "A", "B")) thresholds_grouped <- test_thresholds %>% mutate(group = c("A", "B")) out <- check_outliers( data = data_grouped, parameter = "TestParam1", datatype = "TypeA", thresholds = thresholds_grouped, threshold_col = "extreme_upper", custom_group = "group", return_df = TRUE ) # Only the second row in group A exceeds threshold expect_equal(nrow(out), 1) expect_equal(out$value, 12) }) test_that("check_outliers returns data.frame if return_df = TRUE", { out <- check_outliers( data = test_data, parameter = "TestParam1", datatype = "TypeA", threshold_col = "extreme_upper", thresholds = test_thresholds, return_df = TRUE ) expect_s3_class(out, "data.frame") })