# Tests for the integration helpers: dimensionality reduction, cluster
# features, semi-supervised learning, anomaly-aware modelling and
# stratified (per-cluster) models.

test_that("tl_reduce_dimensions works with PCA", {
  result <- tl_reduce_dimensions(
    iris,
    response = "Species",
    method = "pca",
    n_components = 3
  )

  expect_type(result, "list")
  expect_true("data" %in% names(result))
  expect_true("reduction_model" %in% names(result))

  # Check transformed data has PC columns
  expect_true(any(grepl("PC", names(result$data))))

  # Response should be preserved
  expect_true("Species" %in% names(result$data))
  expect_equal(result$data$Species, iris$Species)

  # Should have requested number of components
  pc_cols <- sum(grepl("^PC\\d+$", names(result$data)))
  expect_equal(pc_cols, 3)
})

test_that("tl_reduce_dimensions works without response", {
  result <- tl_reduce_dimensions(
    iris[, 1:4],
    method = "pca",
    n_components = 2
  )

  expect_type(result, "list")
  expect_true("data" %in% names(result))

  # Should have PC columns
  pc_cols <- sum(grepl("^PC", names(result$data)))
  expect_gte(pc_cols, 2)
})

test_that("tl_add_cluster_features adds cluster columns", {
  data_with_clusters <- tl_add_cluster_features(
    iris,
    response = "Species",
    method = "kmeans",
    k = 3
  )

  # Look the cluster columns up once and reuse the result below.
  cluster_cols <- grep("cluster_", names(data_with_clusters), value = TRUE)

  # Should have cluster column
  expect_true(length(cluster_cols) > 0)

  # Original columns should be preserved
  expect_true(all(names(iris) %in% names(data_with_clusters)))

  # Every cluster column should be a factor. Checking the columns one at a
  # time keeps `[[` on a single name, so zero or several matches produce a
  # clean expectation result instead of a subscript error.
  for (cluster_col in cluster_cols) {
    expect_s3_class(data_with_clusters[[cluster_col]], "factor")
  }
})

test_that("tl_add_cluster_features works with different clustering methods", {
  # K-means
  data_kmeans <- tl_add_cluster_features(
    iris,
    response = "Species",
    method = "kmeans",
    k = 3
  )
  expect_true("cluster_kmeans" %in% names(data_kmeans))

  # PAM (only exercised when the optional 'cluster' package is installed)
  skip_if_not_installed("cluster")
  data_pam <- tl_add_cluster_features(
    iris,
    response = "Species",
    method = "pam",
    k = 3
  )
  expect_true("cluster_pam" %in% names(data_pam))
})

test_that("tl_semisupervised performs label propagation", {
  # Use only 10% of labels
  set.seed(123)
  labeled_idx <- sample(nrow(iris), size = 15)

  model <- tl_semisupervised(
    iris,
    Species ~ .,
    labeled_indices = labeled_idx,
    cluster_method = "kmeans",
    supervised_method = "logistic"
  )

  expect_s3_class(model, "tidylearn_semisupervised")
  expect_s3_class(model, "tidylearn_supervised")

  # Should have semisupervised info
  expect_true("semisupervised_info" %in% names(model))
  expect_equal(model$semisupervised_info$labeled_indices, labeled_idx)

  # Can predict
  preds <- predict(model)
  expect_equal(nrow(preds), nrow(iris))
})

test_that("tl_anomaly_aware detects and handles outliers", {
  skip_if_not_installed("dbscan")

  # Flag anomalies
  model_flag <- tl_anomaly_aware(
    iris,
    Species ~ .,
    response = "Species",
    anomaly_method = "dbscan",
    action = "flag",
    supervised_method = "logistic"
  )

  expect_s3_class(model_flag, "tidylearn_anomaly_aware")
  expect_true("anomaly_info" %in% names(model_flag))
  expect_equal(model_flag$anomaly_info$action, "flag")

  # Remove anomalies
  model_remove <- tl_anomaly_aware(
    iris,
    Species ~ .,
    response = "Species",
    anomaly_method = "dbscan",
    action = "remove",
    supervised_method = "logistic"
  )

  expect_s3_class(model_remove, "tidylearn_anomaly_aware")
  expect_true("anomalies_removed" %in% names(model_remove))
})

test_that("tl_stratified_models creates cluster-specific models", {
  models <- tl_stratified_models(
    mtcars,
    mpg ~ .,
    cluster_method = "kmeans",
    k = 3,
    supervised_method = "linear"
  )

  expect_s3_class(models, "tidylearn_stratified")
  expect_true("cluster_model" %in% names(models))
  expect_true("supervised_models" %in% names(models))

  # Should have one model per cluster: at least one, at most k = 3
  expect_gte(length(models$supervised_models), 1)
  expect_lte(length(models$supervised_models), 3)
})

test_that("predict.tidylearn_stratified assigns to clusters and predicts", {
  models <- tl_stratified_models(
    mtcars,
    mpg ~ .,
    cluster_method = "kmeans",
    k = 2,
    supervised_method = "linear"
  )

  # Predict on training data
  preds <- predict(models)
  expect_equal(nrow(preds), nrow(mtcars))
  expect_true(".pred" %in% names(preds))
  expect_true(".cluster" %in% names(preds))

  # Predict on new data
  preds_new <- predict(models, new_data = mtcars[1:10, ])
  expect_equal(nrow(preds_new), 10)
})

test_that("integration functions validate inputs", {
  # Invalid response variable
  expect_error(
    tl_reduce_dimensions(iris, response = "InvalidColumn", method = "pca"),
    "Response variable.*not found"
  )

  expect_error(
    tl_add_cluster_features(
      iris,
      response = "InvalidColumn",
      method = "kmeans",
      k = 3
    ),
    "Response variable.*not found"
  )
})

test_that("reduced data can be used for supervised learning", {
  # Reduce dimensions
  reduced <- tl_reduce_dimensions(
    iris,
    response = "Species",
    method = "pca",
    n_components = 3
  )

  # Train model on reduced data
  model <- tl_model(reduced$data, Species ~ ., method = "logistic")
  expect_s3_class(model, "tidylearn_logistic")

  # Can predict
  preds <- predict(model)
  expect_equal(nrow(preds), nrow(iris))
})

test_that("cluster features improve model", {
  # This is more of an integration test to ensure the workflow works; it
  # does not compare model quality with and without the cluster features.
  data_clustered <- tl_add_cluster_features(
    iris,
    response = "Species",
    method = "kmeans",
    k = 3
  )

  # Train model with cluster features
  model <- tl_model(data_clustered, Species ~ ., method = "logistic")
  expect_s3_class(model, "tidylearn_logistic")

  # Can predict
  preds <- predict(model)
  expect_equal(nrow(preds), nrow(data_clustered))
})