# Skip on CRAN and CI, as this test takes about a minute.
testthat::skip_on_cran()
testthat::skip_on_ci()

# Create random number generator stream for reproducibility.
r <- familiar:::.start_random_number_stream(seed = 1863)

# Subsampling ------------------------------------------------------------------

for (outcome_type in c("binomial", "multinomial", "continuous", "count", "survival")) {
  data <- familiar:::test_create_synthetic_series_data(
    outcome_type = outcome_type,
    rare_outcome = TRUE,
    rstream_object = r
  )

  n_rep <- 3L

  if (outcome_type %in% c("binomial", "multinomial", "survival")) {
    available_stratify_options <- c(FALSE, TRUE)
  } else {
    available_stratify_options <- FALSE
  }

  for (stratify in available_stratify_options) {
    testthat::test_that(paste0(
      "Subsampling ", ifelse(stratify, "(stratified) ", ""),
      "for ", outcome_type, " functions correctly."
    ), {
      # Create subsample.
      subsample_data <- familiar:::.create_subsample(
        data = data@data,
        n_iter = 20,
        size = 20,
        stratify = stratify,
        outcome_type = outcome_type,
        rstream_object = r
      )

      # Check that none of the in-bag datasets are the same.
      for (ii in 1:(length(subsample_data$train_list) - 1)) {
        for (jj in (ii + 1):length(subsample_data$train_list)) {
          testthat::expect_false(data.table::fsetequal(
            subsample_data$train_list[[ii]],
            subsample_data$train_list[[jj]]
          ))
        }
      }

      for (ii in seq_along(subsample_data$train_list)) {
        # Check that there is no overlap between in-bag and out-of-bag data.
        testthat::expect_equal(
          nrow(data.table::fintersect(
            unique(subsample_data$train_list[[ii]]),
            unique(subsample_data$valid_list[[ii]]))),
          0L)

        # Check that the combination of in-bag and out-of-bag data is the same
        # as the input dataset.
        testthat::expect_true(data.table::fsetequal(
          unique(rbind(subsample_data$train_list[[ii]], subsample_data$valid_list[[ii]])),
          unique(data@data[, mget(familiar:::get_id_columns(id_depth = "series"))])
        ))

        # Check that sampling creates a dataset identical to the development
        # dataset.
        train_data <- familiar:::select_data_from_samples(
          data = data,
          samples = subsample_data$train_list[[ii]])

        # Test that the samples and series are selected.
        testthat::expect_true(data.table::fsetequal(
          train_data@data[repetition_id == 1L, mget(familiar:::get_id_columns(id_depth = "series"))],
          subsample_data$train_list[[ii]]))

        # Test that repetitions are likewise selected.
        testthat::expect_equal(
          nrow(train_data@data),
          n_rep * nrow(subsample_data$train_list[[ii]]))

        # Test that the subsample has the required size.
        testthat::expect_equal(
          data.table::uniqueN(train_data@data, by = c("batch_id", "sample_id")),
          20)

        # Check that sampling creates a dataset identical to the validation
        # dataset.
        validation_data <- familiar:::select_data_from_samples(
          data = data,
          samples = subsample_data$valid_list[[ii]])

        # Test that the samples and series are selected.
        testthat::expect_true(data.table::fsetequal(
          validation_data@data[repetition_id == 1L, mget(familiar:::get_id_columns(id_depth = "series"))],
          subsample_data$valid_list[[ii]]))

        # Test that repetitions are likewise selected.
        testthat::expect_equal(
          nrow(validation_data@data),
          n_rep * nrow(subsample_data$valid_list[[ii]]))

        # If stratified, check that the occurrence of events or outcome
        # categories is similar between the in-bag data and the entire dataset.
        if (stratify && outcome_type %in% c("binomial", "multinomial")) {
          # Determine the frequency of outcome classes in the original dataset.
          input_frequency <- data@data[, list(
            "frequency_original" = .N / nrow(data@data)),
            by = "outcome"]

          train_frequency <- train_data@data[, list(
            "frequency_bootstrap" = .N / nrow(train_data@data)),
            by = "outcome"]

          # Update the frequency table.
          frequency_table <- merge(
            x = input_frequency,
            y = train_frequency,
            by = "outcome")

          # Check that the data is correctly stratified.
          frequency_table[, "similar" := data.table::between(
            frequency_bootstrap,
            lower = frequency_original - 0.05,
            upper = frequency_original + 0.05
          )]

          testthat::expect_true(all(frequency_table$similar))

        } else if (stratify && outcome_type %in% c("survival")) {
          # Determine the frequency of censored data points and events in the
          # original dataset.
          input_frequency <- data@data[, list(
            "frequency_original" = .N / nrow(data@data)),
            by = "outcome_event"]

          train_frequency <- train_data@data[, list(
            "frequency_bootstrap" = .N / nrow(train_data@data)),
            by = "outcome_event"]

          # Update the frequency table.
          frequency_table <- merge(
            x = input_frequency,
            y = train_frequency,
            by = "outcome_event")

          # Check that the data is correctly stratified.
          frequency_table[, "similar" := data.table::between(
            frequency_bootstrap,
            lower = frequency_original - 0.05,
            upper = frequency_original + 0.05
          )]

          testthat::expect_true(all(frequency_table$similar))
        }

        # Check that the rare outcome is found in the training data. This
        # prevents issues when training models.
        if (outcome_type == "multinomial") {
          testthat::expect_gt(nrow(train_data@data[outcome == "3"]), 0)
        }

        # Assert that all outcome levels in the validation folds also appear in
        # the training folds.
        if (outcome_type %in% c("binomial", "multinomial", "survival")) {
          testthat::expect_equal(
            length(setdiff(
              unique(validation_data@data$outcome),
              unique(train_data@data$outcome))),
            0)
        }
      }
    })
  }
}


for (outcome_type in c("binomial", "multinomial", "continuous", "count", "survival")) {
  # Create synthetic dataset with one outcome.
  if (outcome_type %in% c("binomial", "multinomial", "survival")) {
    available_stratify_options <- c(FALSE, TRUE)
  } else {
    available_stratify_options <- FALSE
  }

  for (stratify in available_stratify_options) {
    testthat::test_that(paste0(
      "Subsampling ", ifelse(stratify, "(stratified) ", ""),
      "for ", outcome_type, " with odd data functions correctly."
    ), {
      # Data with a single outcome value.
      data <- familiar:::test_create_synthetic_series_one_outcome(
        outcome_type = outcome_type,
        rstream_object = r)

      # Create subsample.
      subsample_data <- familiar:::.create_subsample(
        data = data@data,
        n_iter = 20,
        size = 20,
        stratify = stratify,
        outcome_type = outcome_type,
        rstream_object = r)

      # Expect a list. This is mostly a placeholder check: subsampling should
      # work even when the outcome value is singular.
      testthat::expect_type(subsample_data, "list")

      # Data with a single sample.
      data <- familiar:::test_create_synthetic_series_one_sample_data(
        outcome_type = outcome_type,
        rstream_object = r)

      # Create subsample. Unlike cross-validation, we don't expect an error,
      # because subsampling can still proceed with a single sample.
      subsample_data <- familiar:::.create_subsample(
        data = data@data,
        n_iter = 20,
        size = 20,
        stratify = stratify,
        outcome_type = outcome_type,
        rstream_object = r)

      testthat::expect_type(subsample_data, "list")
    })
  }
}

# Full undersampling -----------------------------------------------------------

for (outcome_type in c("binomial", "multinomial")) {
  # Create synthetic dataset.
  data <- familiar:::test_create_synthetic_series_data(
    outcome_type = outcome_type,
    rare_outcome = FALSE,
    rstream_object = r)

  n_rep <- 3L

  testthat::test_that(paste0(
    "Full undersampling for correcting outcome imbalances for ", outcome_type,
    " functions correctly."
  ), {
    # Create subsample.
    subsample_data <- suppressWarnings(familiar:::.create_balanced_partitions(
      data = data@data,
      outcome_type = outcome_type,
      imbalance_method = "full_undersampling",
      rstream_object = r))

    # Check that none of the training folds are the same.
    if (length(subsample_data) > 1) {
      for (ii in 1:(length(subsample_data) - 1)) {
        for (jj in (ii + 1):length(subsample_data)) {
          testthat::expect_false(data.table::fsetequal(
            subsample_data[[ii]],
            subsample_data[[jj]]))
        }
      }
    }

    for (ii in seq_along(subsample_data)) {
      # Assert that all samples in the subsample are unique (not duplicated).
      testthat::expect_equal(anyDuplicated(subsample_data[[ii]]), 0)

      # Check that sampling creates a dataset identical to the development
      # subsample.
      train_data <- familiar:::select_data_from_samples(
        data = data,
        samples = subsample_data[[ii]])

      # Test that the samples and series are selected.
      testthat::expect_true(data.table::fsetequal(
        train_data@data[repetition_id == 1L, mget(familiar:::get_id_columns(id_depth = "series"))],
        subsample_data[[ii]]))

      # Test that repetitions are likewise selected.
      testthat::expect_equal(
        nrow(train_data@data),
        n_rep * nrow(subsample_data[[ii]]))

      # Assert that the minority class is now selected at least as often as
      # the other classes.
      original_table <- unique(data@data, by = familiar:::get_id_columns(id_depth = "series"))
      original_table <- original_table[, list("n" = .N), by = "outcome"][order(n)]

      minority_class_n <- min(original_table$n)
      minority_class <- original_table[n == minority_class_n]$outcome[1]

      frequency_table <- unique(train_data@data, by = familiar:::get_id_columns(id_depth = "series"))
      frequency_table <- frequency_table[, list("partition_occurrence" = .N), by = "outcome"]

      # Assert that all instances of the minority class are selected.
      testthat::expect_equal(
        frequency_table[outcome == minority_class]$partition_occurrence,
        minority_class_n)

      # Assert that no class occurs more often than the minority class.
      testthat::expect_true(all(frequency_table$partition_occurrence <= minority_class_n))
    }
  })
}


for (outcome_type in c("binomial", "multinomial")) {
  # Create synthetic dataset.
  data <- familiar:::test_create_synthetic_series_data(
    outcome_type = outcome_type,
    n_series = 1L,
    n_samples = 30,
    rare_outcome = FALSE,
    rstream_object = r
  )

  n_rep <- 3L

  testthat::test_that(paste0(
    "Full undersampling for correcting outcome imbalances for ", outcome_type,
    " without multiple series functions correctly."
  ), {
    # Create subsample.
    subsample_data <- suppressWarnings(familiar:::.create_balanced_partitions(
      data = data@data,
      outcome_type = outcome_type,
      imbalance_method = "full_undersampling",
      rstream_object = r))

    # Check that none of the training folds are the same.
    if (length(subsample_data) > 1) {
      for (ii in 1:(length(subsample_data) - 1)) {
        for (jj in (ii + 1):length(subsample_data)) {
          testthat::expect_false(data.table::fsetequal(
            subsample_data[[ii]],
            subsample_data[[jj]]))
        }
      }
    }

    # The union of the datasets is the original dataset.
    testthat::expect_true(data.table::fsetequal(
      unique(data.table::rbindlist(subsample_data)),
      unique(data@data[, mget(familiar:::get_id_columns(id_depth = "series"))])))

    for (ii in seq_along(subsample_data)) {
      # Assert that all samples in the subsample are unique (not duplicated).
      testthat::expect_equal(anyDuplicated(subsample_data[[ii]]), 0)

      # Check that sampling creates a dataset identical to the development
      # subsample.
      train_data <- familiar:::select_data_from_samples(
        data = data,
        samples = subsample_data[[ii]])

      # Test that the samples and series are selected.
      testthat::expect_true(data.table::fsetequal(
        train_data@data[repetition_id == 1L, mget(familiar:::get_id_columns(id_depth = "series"))],
        subsample_data[[ii]]))

      # Test that repetitions are likewise selected.
      testthat::expect_equal(
        nrow(train_data@data),
        n_rep * nrow(subsample_data[[ii]]))

      # Assert that the minority class is now selected at least as often as
      # the other classes.
      original_table <- unique(data@data, by = familiar:::get_id_columns(id_depth = "series"))
      original_table <- original_table[, list("n" = .N), by = "outcome"][order(n)]

      minority_class_n <- min(original_table$n)
      minority_class <- original_table[n == minority_class_n]$outcome[1]

      frequency_table <- unique(train_data@data, by = familiar:::get_id_columns(id_depth = "series"))
      frequency_table <- frequency_table[, list("partition_occurrence" = .N), by = "outcome"]

      # Assert that all instances of the minority class are selected.
      testthat::expect_equal(
        frequency_table[outcome == minority_class]$partition_occurrence,
        minority_class_n)

      # Assert that each class is represented exactly as often as the minority
      # class.
      testthat::expect_true(all(frequency_table$partition_occurrence == minority_class_n))
    }
  })
}


for (outcome_type in c("binomial", "multinomial")) {
  # Create synthetic dataset with one outcome.
  testthat::test_that(paste0(
    "Full undersampling for correcting outcome imbalances for ", outcome_type,
    " with odd data functions correctly."
  ), {
    # Data with a single outcome value.
    data <- familiar:::test_create_synthetic_series_one_outcome(
      outcome_type = outcome_type,
      rstream_object = r)

    # Create subsample.
    subsample_data <- suppressWarnings(familiar:::.create_balanced_partitions(
      data = data@data,
      outcome_type = outcome_type,
      imbalance_method = "full_undersampling",
      rstream_object = r))

    # Expect a list. This is mostly a placeholder check: partitioning should
    # still work.
    testthat::expect_type(subsample_data, "list")

    # Data with a single sample.
    data <- familiar:::test_create_synthetic_series_one_sample_data(
      outcome_type = outcome_type,
      rstream_object = r)

    # Create subsample.
    subsample_data <- suppressWarnings(familiar:::.create_balanced_partitions(
      data = data@data,
      outcome_type = outcome_type,
      imbalance_method = "full_undersampling",
      rstream_object = r))

    # Expect a list. This is mostly a placeholder check: partitioning should
    # still work.
    testthat::expect_type(subsample_data, "list")
  })
}

# Random undersampling ---------------------------------------------------------

for (outcome_type in c("binomial", "multinomial")) {
  # Create synthetic dataset.
  data <- familiar:::test_create_synthetic_series_data(
    outcome_type = outcome_type,
    rare_outcome = FALSE,
    rstream_object = r)

  n_rep <- 3L

  testthat::test_that(paste0(
    "Random undersampling for correcting outcome imbalances for ", outcome_type,
    " functions correctly."
  ), {
    # Create subsample.
    subsample_data <- suppressWarnings(familiar:::.create_balanced_partitions(
      data = data@data,
      outcome_type = outcome_type,
      imbalance_n_partitions = 3L,
      imbalance_method = "random_undersampling",
      rstream_object = r))

    # Check that none of the training folds are the same.
    if (length(subsample_data) > 1) {
      for (ii in 1:(length(subsample_data) - 1)) {
        for (jj in (ii + 1):length(subsample_data)) {
          testthat::expect_false(data.table::fsetequal(
            subsample_data[[ii]],
            subsample_data[[jj]]))
        }
      }
    }

    # Check that at most 3 (the number specified) partitions are created.
    testthat::expect_lte(length(subsample_data), 3L)

    for (ii in seq_along(subsample_data)) {
      # Assert that all samples in the subsample are unique (not duplicated).
      testthat::expect_equal(anyDuplicated(subsample_data[[ii]]), 0)

      # Check that sampling creates a dataset identical to the development
      # subsample.
      train_data <- familiar:::select_data_from_samples(
        data = data,
        samples = subsample_data[[ii]])

      # Test that the samples and series are selected.
      testthat::expect_true(data.table::fsetequal(
        train_data@data[repetition_id == 1L, mget(familiar:::get_id_columns(id_depth = "series"))],
        subsample_data[[ii]]
      ))

      # Test that repetitions are likewise selected.
      testthat::expect_equal(
        nrow(train_data@data),
        n_rep * nrow(subsample_data[[ii]]))

      # Assert that the minority class is now selected at least as often as
      # the other classes.
      original_table <- unique(data@data, by = familiar:::get_id_columns(id_depth = "series"))
      original_table <- original_table[, list("n" = .N), by = "outcome"][order(n)]

      minority_class_n <- min(original_table$n)
      minority_class <- original_table[n == minority_class_n]$outcome[1]

      frequency_table <- unique(train_data@data, by = familiar:::get_id_columns(id_depth = "series"))
      frequency_table <- frequency_table[, list("partition_occurrence" = .N), by = "outcome"]

      # Assert that all instances of the minority class are selected.
      testthat::expect_equal(
        frequency_table[outcome == minority_class]$partition_occurrence,
        minority_class_n)

      # Assert that no class occurs more often than the minority class.
      testthat::expect_true(all(frequency_table$partition_occurrence <= minority_class_n))
    }
  })
}


for (outcome_type in c("binomial", "multinomial")) {
  # Create synthetic dataset.
  data <- familiar:::test_create_synthetic_series_data(
    outcome_type = outcome_type,
    n_series = 1L,
    n_samples = 30,
    rare_outcome = FALSE,
    rstream_object = r)

  n_rep <- 3L

  testthat::test_that(paste0(
    "Random undersampling for correcting outcome imbalances for ", outcome_type,
    " without multiple series functions correctly."
  ), {
    # Create subsample.
    subsample_data <- suppressWarnings(familiar:::.create_balanced_partitions(
      data = data@data,
      outcome_type = outcome_type,
      imbalance_n_partitions = 3L,
      imbalance_method = "random_undersampling",
      rstream_object = r))

    # Check that none of the training folds are the same.
    if (length(subsample_data) > 1) {
      for (ii in 1:(length(subsample_data) - 1)) {
        for (jj in (ii + 1):length(subsample_data)) {
          testthat::expect_false(data.table::fsetequal(
            subsample_data[[ii]],
            subsample_data[[jj]]))
        }
      }
    }

    for (ii in seq_along(subsample_data)) {
      # Assert that all samples in the subsample are unique (not duplicated).
      testthat::expect_equal(anyDuplicated(subsample_data[[ii]]), 0)

      # Check that sampling creates a dataset identical to the development
      # subsample.
      train_data <- familiar:::select_data_from_samples(
        data = data,
        samples = subsample_data[[ii]])

      # Test that the samples and series are selected.
      testthat::expect_true(data.table::fsetequal(
        train_data@data[repetition_id == 1L, mget(familiar:::get_id_columns(id_depth = "series"))],
        subsample_data[[ii]]))

      # Test that repetitions are likewise selected.
      testthat::expect_equal(
        nrow(train_data@data),
        n_rep * nrow(subsample_data[[ii]]))

      # Assert that the minority class is now selected at least as often as
      # the other classes.
      original_table <- unique(data@data, by = familiar:::get_id_columns(id_depth = "series"))
      original_table <- original_table[, list("n" = .N), by = "outcome"][order(n)]

      minority_class_n <- min(original_table$n)
      minority_class <- original_table[n == minority_class_n]$outcome[1]

      frequency_table <- unique(train_data@data, by = familiar:::get_id_columns(id_depth = "series"))
      frequency_table <- frequency_table[, list("partition_occurrence" = .N), by = "outcome"]

      # Assert that all instances of the minority class are selected.
      testthat::expect_equal(
        frequency_table[outcome == minority_class]$partition_occurrence,
        minority_class_n)

      # Assert that each class is represented exactly as often as the minority
      # class.
      testthat::expect_true(all(frequency_table$partition_occurrence == minority_class_n))
    }
  })
}


for (outcome_type in c("binomial", "multinomial")) {
  # Create synthetic dataset with one outcome.
  testthat::test_that(paste0(
    "Random undersampling for correcting outcome imbalances for ", outcome_type,
    " with odd data functions correctly."
  ), {
    # Data with a single outcome value.
    data <- familiar:::test_create_synthetic_series_one_outcome(
      outcome_type = outcome_type,
      rstream_object = r)

    # Create subsample.
    subsample_data <- suppressWarnings(familiar:::.create_balanced_partitions(
      data = data@data,
      outcome_type = outcome_type,
      imbalance_n_partitions = 3L,
      imbalance_method = "random_undersampling",
      rstream_object = r))

    # Expect a list. This is mostly a placeholder check: partitioning should
    # still work.
    testthat::expect_type(subsample_data, "list")

    # Data with a single sample.
    data <- familiar:::test_create_synthetic_series_one_sample_data(
      outcome_type = outcome_type,
      rstream_object = r)

    # Create subsample.
    subsample_data <- suppressWarnings(familiar:::.create_balanced_partitions(
      data = data@data,
      outcome_type = outcome_type,
      imbalance_n_partitions = 3L,
      imbalance_method = "random_undersampling",
      rstream_object = r))

    # Expect a list. This is mostly a placeholder check: partitioning should
    # still work.
    testthat::expect_type(subsample_data, "list")
  })
}

# Cross-validation -------------------------------------------------------------

for (outcome_type in c("binomial", "multinomial", "continuous", "count", "survival")) {
  data <- familiar:::test_create_synthetic_series_data(
    outcome_type = outcome_type,
    rare_outcome = TRUE,
    rstream_object = r)

  n_rep <- 3L
  n_folds <- 3L

  if (outcome_type %in% c("binomial", "multinomial", "survival")) {
    available_stratify_options <- c(FALSE, TRUE)
  } else {
    available_stratify_options <- FALSE
  }

  for (stratify in available_stratify_options) {
    testthat::test_that(paste0(
      "Cross-validation ", ifelse(stratify, "(stratified) ", ""),
      "for ", outcome_type, " functions correctly."
    ), {
      # Create subsample.
      subsample_data <- familiar:::.create_cv(
        data = data@data,
        n_folds = n_folds,
        stratify = stratify,
        outcome_type = outcome_type,
        rstream_object = r)

      # Check that none of the training folds are the same.
      for (ii in 1:(length(subsample_data$train_list) - 1)) {
        for (jj in (ii + 1):length(subsample_data$train_list)) {
          testthat::expect_false(data.table::fsetequal(
            subsample_data$train_list[[ii]],
            subsample_data$train_list[[jj]]
          ))
        }
      }

      for (ii in seq_along(subsample_data$train_list)) {
        # Check that there is no overlap between training folds and the
        # validation fold.
        testthat::expect_equal(
          nrow(data.table::fintersect(
            unique(subsample_data$train_list[[ii]]),
            unique(subsample_data$valid_list[[ii]]))),
          0L)

        # Check that the union of training and validation folds is the input
        # dataset.
        testthat::expect_true(data.table::fsetequal(
          unique(rbind(subsample_data$train_list[[ii]], subsample_data$valid_list[[ii]])),
          unique(data@data[, mget(familiar:::get_id_columns(id_depth = "series"))])
        ))

        # Assert that all samples in the training and validation folds are
        # unique (not duplicated).
        testthat::expect_equal(anyDuplicated(subsample_data$train_list[[ii]]), 0)
        testthat::expect_equal(anyDuplicated(subsample_data$valid_list[[ii]]), 0)

        # Check that sampling creates a dataset identical to the development
        # subsample.
        train_data <- familiar:::select_data_from_samples(
          data = data,
          samples = subsample_data$train_list[[ii]])

        # Test that the samples and series are selected.
        testthat::expect_true(data.table::fsetequal(
          train_data@data[repetition_id == 1L, mget(familiar:::get_id_columns(id_depth = "series"))],
          subsample_data$train_list[[ii]]))

        # Test that repetitions are likewise selected.
        testthat::expect_equal(
          nrow(train_data@data),
          n_rep * nrow(subsample_data$train_list[[ii]]))

        # Test that the size of the training folds is about (n_folds - 1) /
        # n_folds of the complete set.
        testthat::expect_gte(nrow(train_data@data), nrow(data@data) * (n_folds - 1) / n_folds - 9)
        testthat::expect_lte(nrow(train_data@data), nrow(data@data) * (n_folds - 1) / n_folds + 9)

        # Check that sampling creates a dataset identical to the validation
        # subsample.
        validation_data <- familiar:::select_data_from_samples(
          data = data,
          samples = subsample_data$valid_list[[ii]])

        # Test that the samples and series are selected.
        testthat::expect_true(data.table::fsetequal(
          validation_data@data[repetition_id == 1L, mget(familiar:::get_id_columns(id_depth = "series"))],
          subsample_data$valid_list[[ii]]))

        # Test that repetitions are likewise selected.
        testthat::expect_equal(
          nrow(validation_data@data),
          n_rep * nrow(subsample_data$valid_list[[ii]]))

        # Test that the size of the validation fold is about 1 / n_folds of
        # the complete set.
        testthat::expect_gte(nrow(validation_data@data), nrow(data@data) * 1 / n_folds - 9)
        testthat::expect_lte(nrow(validation_data@data), nrow(data@data) * 1 / n_folds + 9)

        # Assert that data are correctly stratified.
        if (stratify && outcome_type %in% c("binomial", "multinomial")) {
          # Determine the frequency of outcome classes in the original dataset.
          input_frequency <- data@data[, list(
            "frequency_original" = .N / nrow(data@data)),
            by = "outcome"]

          train_frequency <- train_data@data[, list(
            "frequency_bootstrap" = .N / nrow(train_data@data)),
            by = "outcome"]

          # Update the frequency table.
          frequency_table <- merge(
            x = input_frequency,
            y = train_frequency,
            by = "outcome")

          # Check that the data is correctly stratified.
          frequency_table[, "similar" := data.table::between(
            frequency_bootstrap,
            lower = frequency_original - 0.05,
            upper = frequency_original + 0.05
          )]

          testthat::expect_true(all(frequency_table$similar))

        } else if (stratify && outcome_type %in% c("survival")) {
          # Determine the frequency of censored data points and events in the
          # original dataset.
          input_frequency <- data@data[, list(
            "frequency_original" = .N / nrow(data@data)),
            by = "outcome_event"]

          train_frequency <- train_data@data[, list(
            "frequency_bootstrap" = .N / nrow(train_data@data)),
            by = "outcome_event"]

          # Update the frequency table.
          frequency_table <- merge(
            x = input_frequency,
            y = train_frequency,
            by = "outcome_event")

          # Check that the data is correctly stratified.
          frequency_table[, "similar" := data.table::between(
            frequency_bootstrap,
            lower = frequency_original - 0.05,
            upper = frequency_original + 0.05
          )]

          testthat::expect_true(all(frequency_table$similar))
        }

        # Check that the rare outcome is found in the training data. This
        # prevents issues when training models.
        if (outcome_type == "multinomial") {
          testthat::expect_gt(nrow(train_data@data[outcome == "3"]), 0)
        }

        # Assert that all outcome levels in the validation folds also appear in
        # the training folds.
        if (outcome_type %in% c("binomial", "multinomial", "survival")) {
          testthat::expect_equal(
            length(setdiff(
              unique(validation_data@data$outcome),
              unique(train_data@data$outcome))),
            0)
        }
      }
    })
  }
}


for (outcome_type in c("binomial", "multinomial", "continuous", "count", "survival")) {
  # Create synthetic dataset with one outcome.
  if (outcome_type %in% c("binomial", "multinomial", "survival")) {
    available_stratify_options <- c(FALSE, TRUE)
  } else {
    available_stratify_options <- FALSE
  }

  n_folds <- 3L

  for (stratify in available_stratify_options) {
    testthat::test_that(paste0(
      "Cross-validation ", ifelse(stratify, "(stratified) ", ""),
      "for ", outcome_type, " with odd data functions correctly."
    ), {
      # Data with a single outcome value.
      data <- familiar:::test_create_synthetic_series_one_outcome(
        outcome_type = outcome_type,
        rstream_object = r)

      # Create subsample.
      subsample_data <- suppressWarnings(familiar:::.create_cv(
        data = data@data,
        n_folds = n_folds,
        stratify = stratify,
        outcome_type = outcome_type,
        rstream_object = r))

      # Expect a list. This is mostly a placeholder check: cross-validation
      # should work even when the outcome value is singular.
      testthat::expect_type(subsample_data, "list")

      # Data with a single sample.
      data <- familiar:::test_create_synthetic_series_one_sample_data(
        outcome_type = outcome_type,
        rstream_object = r)

      # Create subsample. We expect an error because cross-validation is not
      # possible with a single sample.
      subsample_data <- testthat::expect_error(suppressWarnings(familiar:::.create_cv(
        data = data@data,
        n_folds = n_folds,
        stratify = stratify,
        outcome_type = outcome_type,
        rstream_object = r)))
    })
  }
}

# Repeated cross-validation ----------------------------------------------------

for (outcome_type in c("binomial", "multinomial", "continuous", "count", "survival")) {
  data <- familiar:::test_create_synthetic_series_data(
    outcome_type = outcome_type,
    rare_outcome = TRUE,
    rstream_object = r)

  n_rep <- 3L
  n_folds <- 3L

  if (outcome_type %in% c("binomial", "multinomial", "survival")) {
    available_stratify_options <- c(FALSE, TRUE)
  } else {
    available_stratify_options <- FALSE
  }

  for (stratify in available_stratify_options) {
    testthat::test_that(paste0(
      "Repeated cross-validation ", ifelse(stratify, "(stratified) ", ""),
      "for ", outcome_type, " functions correctly."
    ), {
      # Create subsample.
      subsample_data <- familiar:::.create_repeated_cv(
        data = data@data,
        n_rep = 3L,
        n_folds = n_folds,
        stratify = stratify,
        outcome_type = outcome_type,
        rstream_object = r)

      # Check that none of the training folds are the same.
      for (ii in 1:(length(subsample_data$train_list) - 1)) {
        for (jj in (ii + 1):length(subsample_data$train_list)) {
          testthat::expect_false(data.table::fsetequal(
            subsample_data$train_list[[ii]],
            subsample_data$train_list[[jj]]))
        }
      }

      for (ii in seq_along(subsample_data$train_list)) {
        # Check that there is no overlap between training folds and the
        # validation fold.
        testthat::expect_equal(
          nrow(data.table::fintersect(
            unique(subsample_data$train_list[[ii]]),
            unique(subsample_data$valid_list[[ii]]))),
          0L)

        # Check that the union of training and validation folds is the input
        # dataset.
        testthat::expect_true(data.table::fsetequal(
          unique(rbind(subsample_data$train_list[[ii]], subsample_data$valid_list[[ii]])),
          unique(data@data[, mget(familiar:::get_id_columns(id_depth = "series"))])))

        # Assert that all samples in the training and validation folds are
        # unique (not duplicated).
        testthat::expect_equal(anyDuplicated(subsample_data$train_list[[ii]]), 0)
        testthat::expect_equal(anyDuplicated(subsample_data$valid_list[[ii]]), 0)

        # Check that sampling creates a dataset identical to the development
        # subsample.
        train_data <- familiar:::select_data_from_samples(
          data = data,
          samples = subsample_data$train_list[[ii]])

        # Test that the samples and series are selected.
        testthat::expect_true(data.table::fsetequal(
          train_data@data[repetition_id == 1L, mget(familiar:::get_id_columns(id_depth = "series"))],
          subsample_data$train_list[[ii]]))

        # Test that repetitions are likewise selected.
        testthat::expect_equal(
          nrow(train_data@data),
          n_rep * nrow(subsample_data$train_list[[ii]]))

        # Test that the size of the training folds is about (n_folds - 1) /
        # n_folds of the complete set.
        testthat::expect_gte(nrow(train_data@data), nrow(data@data) * (n_folds - 1) / n_folds - 9)
        testthat::expect_lte(nrow(train_data@data), nrow(data@data) * (n_folds - 1) / n_folds + 9)

        # Check that sampling creates a dataset identical to the validation
        # subsample.
        validation_data <- familiar:::select_data_from_samples(
          data = data,
          samples = subsample_data$valid_list[[ii]])

        # Test that the samples and series are selected.
        testthat::expect_true(data.table::fsetequal(
          validation_data@data[repetition_id == 1L, mget(familiar:::get_id_columns(id_depth = "series"))],
          subsample_data$valid_list[[ii]]))

        # Test that repetitions are likewise selected.
        testthat::expect_equal(
          nrow(validation_data@data),
          n_rep * nrow(subsample_data$valid_list[[ii]]))

        # Test that the size of the validation fold is about 1 / n_folds of the
        # complete set.
        testthat::expect_gte(nrow(validation_data@data), nrow(data@data) * 1 / n_folds - 9)
        testthat::expect_lte(nrow(validation_data@data), nrow(data@data) * 1 / n_folds + 9)

        # Assert that data are correctly stratified.
        if (stratify && outcome_type %in% c("binomial", "multinomial")) {
          # Determine the frequency of outcome classes in the original dataset.
          input_frequency <- data@data[, list(
            "frequency_original" = .N / nrow(data@data)),
            by = "outcome"]

          train_frequency <- train_data@data[, list(
            "frequency_bootstrap" = .N / nrow(train_data@data)),
            by = "outcome"]

          # Update the frequency table.
          frequency_table <- merge(
            x = input_frequency,
            y = train_frequency,
            by = "outcome")

          # Check that the data is correctly stratified.
          frequency_table[, "similar" := data.table::between(
            frequency_bootstrap,
            lower = frequency_original - 0.05,
            upper = frequency_original + 0.05
          )]

          testthat::expect_true(all(frequency_table$similar))

        } else if (stratify && outcome_type %in% c("survival")) {
          # Determine the frequency of censored data points and events in the
          # original dataset.
          input_frequency <- data@data[, list(
            "frequency_original" = .N / nrow(data@data)),
            by = "outcome_event"]

          train_frequency <- train_data@data[, list(
            "frequency_bootstrap" = .N / nrow(train_data@data)),
            by = "outcome_event"]

          # Update the frequency table.
          frequency_table <- merge(
            x = input_frequency,
            y = train_frequency,
            by = "outcome_event")

          # Check that the data is correctly stratified.
          frequency_table[, "similar" := data.table::between(
            frequency_bootstrap,
            lower = frequency_original - 0.05,
            upper = frequency_original + 0.05
          )]

          testthat::expect_true(all(frequency_table$similar))
        }

        # Check that the rare outcome is found in the training data. This
        # prevents issues when training models.
        if (outcome_type == "multinomial") {
          testthat::expect_gt(nrow(train_data@data[outcome == "3"]), 0)
        }

        # Assert that all outcome levels in the validation folds also appear in
        # the training folds.
        if (outcome_type %in% c("binomial", "multinomial", "survival")) {
          testthat::expect_equal(
            length(setdiff(
              unique(validation_data@data$outcome),
              unique(train_data@data$outcome))),
            0)
        }
      }
    })
  }
}


for (outcome_type in c("binomial", "multinomial", "continuous", "count", "survival")) {
  # Create synthetic dataset with one outcome.
  if (outcome_type %in% c("binomial", "multinomial", "survival")) {
    available_stratify_options <- c(FALSE, TRUE)
  } else {
    available_stratify_options <- FALSE
  }

  n_folds <- 3L

  for (stratify in available_stratify_options) {
    testthat::test_that(paste0(
      "Repeated cross-validation ", ifelse(stratify, "(stratified) ", ""),
      "for ", outcome_type, " with odd data functions correctly."
    ), {
      # Data with a single outcome value.
      data <- familiar:::test_create_synthetic_series_one_outcome(
        outcome_type = outcome_type,
        rstream_object = r)

      # Create subsample.
      subsample_data <- suppressWarnings(familiar:::.create_repeated_cv(
        data = data@data,
        n_rep = 3L,
        n_folds = n_folds,
        stratify = stratify,
        outcome_type = outcome_type,
        rstream_object = r))

      # Expect a list. This is mostly a placeholder check: cross-validation
      # should work even when the outcome value is singular.
      testthat::expect_type(subsample_data, "list")

      # Data with a single sample.
      data <- familiar:::test_create_synthetic_series_one_sample_data(
        outcome_type = outcome_type,
        rstream_object = r)

      # Create subsample. We expect an error because cross-validation is not
      # possible with a single sample.
      subsample_data <- testthat::expect_error(suppressWarnings(familiar:::.create_repeated_cv(
        data = data@data,
        n_rep = 3L,
        n_folds = n_folds,
        stratify = stratify,
        outcome_type = outcome_type,
        rstream_object = r)))
    })
  }
}

# Leave-one-out cross-validation -----------------------------------------------

for (outcome_type in c("binomial", "multinomial", "continuous", "count", "survival")) {
  data <- familiar:::test_create_synthetic_series_data(
    outcome_type = outcome_type,
    rare_outcome = TRUE,
    rstream_object = r)

  n_rep <- 3L

  testthat::test_that(paste0(
    "Leave-one-out cross-validation for ", outcome_type, " functions correctly."
  ), {
    # Create subsample.
    subsample_data <- familiar:::.create_loocv(
      data = data@data,
      outcome_type = outcome_type,
      rstream_object = r)

    # Check that none of the training folds are the same.
    for (ii in 1:(length(subsample_data$train_list) - 1)) {
      for (jj in (ii + 1):length(subsample_data$train_list)) {
        testthat::expect_false(data.table::fsetequal(
          subsample_data$train_list[[ii]],
          subsample_data$train_list[[jj]]))
      }
    }

    # Assert (for LOOCV) that the number of run pairs is the number of samples,
    # or the number of samples - 1 for multinomial (due to pre-assignment of a
    # sample with the rare outcome level).
    if (outcome_type == "multinomial") {
      testthat::expect_equal(
        length(subsample_data$train_list),
        data.table::uniqueN(
          data@data,
          by = familiar:::get_id_columns(id_depth = "sample")) - 1L)
    } else {
      testthat::expect_equal(
        length(subsample_data$train_list),
        data.table::uniqueN(
          data@data,
          by = familiar:::get_id_columns(id_depth = "sample")))
    }

    # Iterate over the subsamples.
    for (ii in seq_along(subsample_data$train_list)) {
      # Check that there is no overlap between training folds and the
      # validation fold.
      testthat::expect_equal(
        nrow(data.table::fintersect(
          unique(subsample_data$train_list[[ii]]),
          unique(subsample_data$valid_list[[ii]]))),
        0L)

      # Check that the union of training and validation folds is the input
      # dataset.
      testthat::expect_true(data.table::fsetequal(
        unique(rbind(subsample_data$train_list[[ii]], subsample_data$valid_list[[ii]])),
        unique(data@data[, mget(familiar:::get_id_columns(id_depth = "series"))])))

      # Assert that all samples in the training fold are unique (not
      # duplicated).
      testthat::expect_equal(anyDuplicated(subsample_data$train_list[[ii]]), 0)

      # Assert that there is only one sample in the validation fold.
      testthat::expect_equal(
        data.table::uniqueN(subsample_data$valid_list[[ii]], by = familiar:::get_id_columns(id_depth = "sample")),
        1)

      # Check that sampling creates a dataset identical to the development
      # subsample.
      train_data <- familiar:::select_data_from_samples(
        data = data,
        samples = subsample_data$train_list[[ii]])

      # Test that the samples and series are selected.
      testthat::expect_true(data.table::fsetequal(
        train_data@data[repetition_id == 1L, mget(familiar:::get_id_columns(id_depth = "series"))],
        subsample_data$train_list[[ii]]))

      # Test that repetitions are likewise selected.
      testthat::expect_equal(
        nrow(train_data@data),
        n_rep * nrow(subsample_data$train_list[[ii]]))

      # Assert that the size of the training set is equal to the number of
      # samples - 1.
      testthat::expect_equal(
        data.table::uniqueN(train_data@data, by = familiar:::get_id_columns(id_depth = "sample")),
        data.table::uniqueN(data@data, by = familiar:::get_id_columns(id_depth = "sample")) - 1L)

      # Check that sampling creates a dataset identical to the validation
      # subsample.
      validation_data <- familiar:::select_data_from_samples(
        data = data,
        samples = subsample_data$valid_list[[ii]])

      # Test that the samples and series are selected.
      testthat::expect_true(data.table::fsetequal(
        validation_data@data[repetition_id == 1L, mget(familiar:::get_id_columns(id_depth = "series"))],
        subsample_data$valid_list[[ii]]))

      # Test that repetitions are likewise selected.
      testthat::expect_equal(
        nrow(validation_data@data),
        n_rep * nrow(subsample_data$valid_list[[ii]]))

      # Test that the number of samples in the validation fold is 1.
      testthat::expect_equal(
        data.table::uniqueN(validation_data@data, by = familiar:::get_id_columns(id_depth = "sample")),
        1L)

      # Check that the rare outcome is found in the training data. This
      # prevents issues when training models.
      if (outcome_type == "multinomial") {
        testthat::expect_gt(nrow(train_data@data[outcome == "3"]), 0)
      }

      # Assert that all outcome levels in the validation folds also appear in
      # the training folds.
      if (outcome_type %in% c("binomial", "multinomial", "survival")) {
        testthat::expect_equal(
          length(setdiff(
            unique(validation_data@data$outcome),
            unique(train_data@data$outcome))),
          0)
      }
    }
  })
}


for (outcome_type in c("binomial", "multinomial", "continuous", "count", "survival")) {
  testthat::test_that(paste0(
    "Leave-one-out cross-validation for ", outcome_type,
    " with odd data functions correctly."
  ), {
    # Data with a single outcome value.
    data <- familiar:::test_create_synthetic_series_one_outcome(
      outcome_type = outcome_type,
      rstream_object = r)

    # Create subsample.
    subsample_data <- suppressWarnings(familiar:::.create_loocv(
      data = data@data,
      outcome_type = outcome_type,
      rstream_object = r))

    # Expect a list. This is mostly a placeholder check: cross-validation
    # should work even when the outcome value is singular.
    testthat::expect_type(subsample_data, "list")

    # Data with a single sample.
    data <- familiar:::test_create_synthetic_series_one_sample_data(
      outcome_type = outcome_type,
      rstream_object = r)

    # Create subsample. We expect an error because cross-validation is not
    # possible with a single sample.
    subsample_data <- testthat::expect_error(suppressWarnings(familiar:::.create_loocv(
      data = data@data,
      outcome_type = outcome_type,
      rstream_object = r)))
  })
}

# Bootstraps -------------------------------------------------------------------

for (outcome_type in c("binomial", "multinomial", "continuous", "count", "survival")) {
  data <- familiar:::test_create_synthetic_series_data(
    outcome_type = outcome_type,
    rare_outcome = TRUE,
    rstream_object = r)

  n_rep <- 3L

  if (outcome_type %in% c("binomial", "multinomial", "survival")) {
    available_stratify_options <- c(FALSE, TRUE)
  } else {
    available_stratify_options <- FALSE
  }

  for (stratify in available_stratify_options) {
    testthat::test_that(paste0(
      "Bootstrap resampling ", ifelse(stratify, "(stratified) ", ""),
      "for ", outcome_type, " functions correctly."
    ), {
      # Create subsample.
      subsample_data <- familiar:::.create_bootstraps(
        data = data@data,
        n_iter = 20,
        stratify = stratify,
        outcome_type = outcome_type,
        rstream_object = r)

      # Check that none of the in-bag datasets are the same.
      for (ii in 1:(length(subsample_data$train_list) - 1)) {
        for (jj in (ii + 1):length(subsample_data$train_list)) {
          testthat::expect_false(data.table::fsetequal(
            subsample_data$train_list[[ii]],
            subsample_data$train_list[[jj]]))
        }
      }

      for (ii in seq_along(subsample_data$train_list)) {
        # Check that there is no overlap between in-bag and out-of-bag data.
        testthat::expect_equal(
          nrow(data.table::fintersect(
            unique(subsample_data$train_list[[ii]]),
            unique(subsample_data$valid_list[[ii]]))),
          0L)

        # Check that the combination of in-bag and out-of-bag data is the same
        # as the input dataset.
        testthat::expect_true(data.table::fsetequal(
          unique(rbind(subsample_data$train_list[[ii]], subsample_data$valid_list[[ii]])),
          unique(data@data[, mget(familiar:::get_id_columns(id_depth = "series"))])))

        # Check that sampling creates a dataset identical to the development
        # dataset.
        train_data <- familiar:::select_data_from_samples(
          data = data,
          samples = subsample_data$train_list[[ii]])

        # Test that the samples and series are selected.
        testthat::expect_true(data.table::fsetequal(
          train_data@data[repetition_id == 1L, mget(familiar:::get_id_columns(id_depth = "series"))],
          subsample_data$train_list[[ii]]))

        # Test that repetitions are likewise selected.
        testthat::expect_equal(
          nrow(train_data@data),
          n_rep * nrow(subsample_data$train_list[[ii]]))

        # Test that the subsample has the same size as (or is slightly larger
        # than) the original dataset.
        # It can be slightly larger because samples with rare outcomes are
        # added to the in-bag data.
        testthat::expect_gte(nrow(train_data@data), nrow(data@data) - 30)
        testthat::expect_lte(nrow(train_data@data), nrow(data@data) + 9)

        # Check that sampling creates a dataset identical to the validation
        # dataset.
        validation_data <- familiar:::select_data_from_samples(
          data = data,
          samples = subsample_data$valid_list[[ii]])

        # Test that the samples and series are selected.
        testthat::expect_true(data.table::fsetequal(
          validation_data@data[repetition_id == 1L, mget(familiar:::get_id_columns(id_depth = "series"))],
          subsample_data$valid_list[[ii]]))

        # Test that repetitions are likewise selected.
        testthat::expect_equal(
          nrow(validation_data@data),
          n_rep * nrow(subsample_data$valid_list[[ii]]))

        # If stratified, check that the occurrence of events or outcome
        # categories is similar between the in-bag data and the entire dataset.
        if (stratify && outcome_type %in% c("binomial", "multinomial")) {
          # Determine the frequency of outcome classes in the original dataset.
          input_frequency <- data@data[, list(
            "frequency_original" = .N / nrow(data@data)),
            by = "outcome"]

          train_frequency <- train_data@data[, list(
            "frequency_bootstrap" = .N / nrow(train_data@data)),
            by = "outcome"]

          # Update the frequency table.
          frequency_table <- merge(
            x = input_frequency,
            y = train_frequency,
            by = "outcome")

          # Check that the data is correctly stratified.
          frequency_table[, "similar" := data.table::between(
            frequency_bootstrap,
            lower = frequency_original - 0.05,
            upper = frequency_original + 0.05
          )]

          testthat::expect_true(all(frequency_table$similar))

        } else if (stratify && outcome_type %in% c("survival")) {
          # Determine the frequency of censored data points and events in the
          # original dataset.
          input_frequency <- data@data[, list(
            "frequency_original" = .N / nrow(data@data)),
            by = "outcome_event"]

          train_frequency <- train_data@data[, list(
            "frequency_bootstrap" = .N / nrow(train_data@data)),
            by = "outcome_event"]

          # Update the frequency table.
          frequency_table <- merge(
            x = input_frequency,
            y = train_frequency,
            by = "outcome_event")

          # Check that the data is correctly stratified.
          frequency_table[, "similar" := data.table::between(
            frequency_bootstrap,
            lower = frequency_original - 0.05,
            upper = frequency_original + 0.05
          )]

          testthat::expect_true(all(frequency_table$similar))
        }

        # Check that the rare outcome is found in the training data. This
        # prevents issues when training models.
        if (outcome_type == "multinomial") {
          testthat::expect_gt(nrow(train_data@data[outcome == "3"]), 0)
        }

        # Assert that all outcome levels in the validation folds also appear in
        # the training folds.
        if (outcome_type %in% c("binomial", "multinomial", "survival")) {
          testthat::expect_equal(
            length(setdiff(
              unique(validation_data@data$outcome),
              unique(train_data@data$outcome))),
            0)
        }
      }
    })
  }
}


for (outcome_type in c("binomial", "multinomial", "continuous", "count", "survival")) {
  # Create synthetic dataset with one outcome.
  if (outcome_type %in% c("binomial", "multinomial", "survival")) {
    available_stratify_options <- c(FALSE, TRUE)
  } else {
    available_stratify_options <- FALSE
  }

  for (stratify in available_stratify_options) {
    testthat::test_that(paste0(
      "Bootstrap ", ifelse(stratify, "(stratified) ", ""),
      "for ", outcome_type, " with odd data functions correctly."
    ), {
      # Data with a single outcome value.
      data <- familiar:::test_create_synthetic_series_one_outcome(
        outcome_type = outcome_type,
        rstream_object = r)

      # Create subsample.
      subsample_data <- familiar:::.create_bootstraps(
        data = data@data,
        n_iter = 20,
        stratify = stratify,
        outcome_type = outcome_type,
        rstream_object = r)

      # Expect a list. This is mostly a placeholder check: bootstrap resampling
      # should work even when the outcome value is singular.
      testthat::expect_type(subsample_data, "list")

      # Data with a single sample.
      data <- familiar:::test_create_synthetic_series_one_sample_data(
        outcome_type = outcome_type,
        rstream_object = r)

      # Create subsample. We expect an error because bootstrap resampling is
      # not possible with a single sample.
      subsample_data <- testthat::expect_error(familiar:::.create_bootstraps(
        data = data@data,
        n_iter = 20,
        stratify = stratify,
        outcome_type = outcome_type,
        rstream_object = r))
    })
  }
}