# Test: Masking and Zero-Handling # Verify masking behavior and zero treatment options(RcppML.verbose = FALSE) test_that("mask='zeros' excludes zero entries from optimization", { skip_on_cran() set.seed(42) data <- simulateNMF(60, 50, k = 4, noise = 0.1, dropout = 0.6, seed = 123) A_sparse <- as(data$A, "dgCMatrix") # Fit with mask='zeros' model_masked <- nmf(A_sparse, 4, mask = "zeros", maxit = 100, tol = 1e-6, seed = 456, verbose = FALSE) # Fit without mask (zeros are observed) model_unmasked <- nmf(A_sparse, 4, maxit = 100, tol = 1e-6, seed = 456, verbose = FALSE) # Both models should converge to similar solutions for sparse matrices # (sparse NMF naturally focuses on non-zeros) # But MSE computation should differ # Both should produce valid non-negative factors expect_true(is_nonnegative(model_masked$w)) expect_true(is_nonnegative(model_masked$h)) expect_true(is_nonnegative(model_unmasked$w)) expect_true(is_nonnegative(model_unmasked$h)) # Both should converge to reasonable loss values expect_lt(model_masked$loss, 5.0) expect_lt(model_unmasked$loss, 5.0) }) test_that("mask='zeros' loss only considers non-zero entries", { skip_on_cran() set.seed(42) data <- simulateNMF(60, 50, k = 4, noise = 0.1, dropout = 0.6, seed = 123) A_sparse <- as(data$A, "dgCMatrix") model <- nmf(A_sparse, 4, mask = "zeros", maxit = 100, tol = 1e-6, seed = 456, verbose = FALSE) # Compute loss manually only on non-zero entries A_recon <- model$w %*% model$h nonzero_idx <- which(A_sparse != 0) mse_nonzero <- mean((A_sparse[nonzero_idx] - A_recon[nonzero_idx])^2) # Compute using helper (should match) loss_computed <- compute_nmf_loss(A_sparse, model$w, model$h, loss_type = "mse", mask = "zeros")$data_loss expect_equal(mse_nonzero, loss_computed, tolerance = 1e-10) }) test_that("mask='zeros' works with different loss types", { skip_on_cran() set.seed(42) data <- simulateNMF(50, 40, k = 3, noise = 0.1, dropout = 0.6, seed = 123) A_sparse <- as(data$A, "dgCMatrix") loss_types <- c("mse") for (loss in loss_types) { model <- nmf(A_sparse, 3, mask = "zeros", loss = loss, maxit = 50, tol = 1e-6, seed = 456, verbose = FALSE) # Should converge without errors expect_true(!is.null(model$w)) expect_true(!is.null(model$h)) expect_true(is.finite(model$loss)) expect_gt(model$loss, 0) } }) test_that("mask='zeros' with regularization", { skip_on_cran() set.seed(42) data <- simulateNMF(60, 50, k = 4, noise = 0.1, dropout = 0.6, seed = 123) A_sparse <- as(data$A, "dgCMatrix") model <- nmf(A_sparse, 4, mask = "zeros", L1 = c(0.05, 0.05), L2 = c(0.05, 0.05), maxit = 100, tol = 1e-6, seed = 456, verbose = FALSE) # Should produce sparse, regularized factors expect_gt(compute_sparsity(model$w), 0.05) expect_gt(compute_sparsity(model$h), 0.05) expect_true(is_nonnegative(model$w)) expect_true(is_nonnegative(model$h)) }) test_that("Dense matrices: zeros are always observed", { skip_on_cran() set.seed(42) data <- simulateNMF(50, 40, k = 3, noise = 0.1, dropout = 0.3, seed = 123) A_dense <- data$A # Set some entries to exactly zero A_dense[1:5, 1:5] <- 0 # Fit NMF (no mask - zeros are observed) model <- nmf(A_dense, 3, maxit = 100, tol = 1e-6, seed = 456, verbose = FALSE) # Reconstruction should fit zeros too A_recon <- model$w %*% model$h # Check that zero region has low reconstruction values zero_region_recon <- A_recon[1:5, 1:5] # Should be close to zero (but not necessarily exactly zero) expect_lt(mean(abs(zero_region_recon)), 0.5) }) test_that("Sparse vs dense: different treatment of zeros", { skip_on_cran() set.seed(42) data <- simulateNMF(50, 40, k = 3, noise = 0.1, dropout = 0.6, seed = 123) A_sparse <- as(data$A, "dgCMatrix") A_dense <- as.matrix(A_sparse) # Fit sparse with mask='zeros' (zeros unobserved) model_sparse_masked <- nmf(A_sparse, 3, mask = "zeros", maxit = 80, tol = 1e-6, seed = 456, verbose = FALSE) # Fit dense (zeros observed) model_dense <- nmf(A_dense, 3, maxit = 80, tol = 1e-6, seed = 456, verbose = FALSE) # Both should produce valid non-negative factors expect_true(is_nonnegative(model_sparse_masked$w)) expect_true(is_nonnegative(model_sparse_masked$h)) expect_true(is_nonnegative(model_dense$w)) expect_true(is_nonnegative(model_dense$h)) # Both should converge expect_true(!is.null(model_sparse_masked$loss)) expect_true(!is.null(model_dense$loss)) }) test_that("mask='zeros' improves fit on non-zero entries", { skip_on_cran() set.seed(42) data <- simulateNMF(60, 50, k = 4, noise = 0.08, dropout = 0.7, seed = 123) A_sparse <- as(data$A, "dgCMatrix") # Fit with mask='zeros' model_masked <- nmf(A_sparse, 4, mask = "zeros", maxit = 100, tol = 1e-6, seed = 456, verbose = FALSE) # Fit without mask model_unmasked <- nmf(A_sparse, 4, maxit = 100, tol = 1e-6, seed = 456, verbose = FALSE) # Compute MSE on non-zero entries only nonzero_idx <- which(A_sparse != 0) recon_masked <- model_masked$w %*% model_masked$h recon_unmasked <- model_unmasked$w %*% model_unmasked$h mse_masked <- mean((A_sparse[nonzero_idx] - recon_masked[nonzero_idx])^2) mse_unmasked <- mean((A_sparse[nonzero_idx] - recon_unmasked[nonzero_idx])^2) # Masked version focuses on non-zeros, so should fit them better (usually) # This is not guaranteed but is expected for highly sparse data expect_true(is.finite(mse_masked)) expect_true(is.finite(mse_unmasked)) expect_gt(mse_masked, 0) expect_gt(mse_unmasked, 0) }) test_that("Highly sparse data: mask='zeros' vs standard NMF", { skip_on_cran() set.seed(42) # Generate very sparse data (90% zeros) data <- simulateNMF(60, 50, k = 4, noise = 0.05, dropout = 0.9, seed = 123) A_sparse <- as(data$A, "dgCMatrix") # With such high sparsity, mask='zeros' is more appropriate model <- nmf(A_sparse, 4, mask = "zeros", maxit = 80, tol = 1e-6, seed = 456, verbose = FALSE) # Should converge to reasonable solution expect_true(!is.null(model$w)) expect_true(!is.null(model$h)) expect_true(is.finite(model$loss)) # Compute error on non-zeros nonzero_idx <- which(A_sparse != 0) recon <- model$w %*% model$h mse_nonzero <- mean((A_sparse[nonzero_idx] - recon[nonzero_idx])^2) # For very sparse data, use absolute MSE threshold instead of variance-based # With high dropout and low noise, even small absolute errors can exceed variance expect_lt(mse_nonzero, 0.1, label = sprintf("MSE on non-zeros (%.4f)", mse_nonzero)) }) test_that("mask='zeros' respects non-negativity", { skip_on_cran() set.seed(42) data <- simulateNMF(60, 50, k = 4, noise = 0.1, dropout = 0.6, seed = 123) A_sparse <- as(data$A, "dgCMatrix") # Strong regularization with mask model <- nmf(A_sparse, 4, mask = "zeros", L1 = c(0.2, 0.2), L2 = c(0.2, 0.2), maxit = 100, tol = 1e-6, seed = 456, verbose = FALSE) # Must remain non-negative expect_true(is_nonnegative(model$w, tol = -1e-10)) expect_true(is_nonnegative(model$h, tol = -1e-10)) }) test_that("mask='zeros' with very few non-zeros still works", { skip_on_cran() set.seed(42) # Extreme sparsity (95% zeros) data <- simulateNMF(50, 40, k = 3, noise = 0.05, dropout = 0.95, seed = 123) A_sparse <- as(data$A, "dgCMatrix") # Should still converge (though may need more iterations) model <- nmf(A_sparse, 3, mask = "zeros", maxit = 150, tol = 1e-6, seed = 456, verbose = FALSE) # Basic sanity checks expect_true(!is.null(model$w)) expect_true(!is.null(model$h)) expect_true(is.finite(model$loss)) expect_true(is_nonnegative(model$w)) expect_true(is_nonnegative(model$h)) }) # ============================================================================ # NA Masking Tests for Dense Matrices # ============================================================================ test_that("Dense matrices: NA values are auto-detected and masked", { skip_on_cran() set.seed(42) A_dense <- matrix(runif(100 * 50), 100, 50) # Introduce NA values A_dense[1:5, 1:5] <- NA na_count <- sum(is.na(A_dense)) # Should auto-detect NAs and create a mask (warning is expected) expect_warning( model <- nmf(A_dense, k = 3, maxit = 20, verbose = FALSE), "Detected.*NA values" ) # Model should fit successfully expect_s4_class(model, "nmf") expect_true(is.finite(model@misc$loss)) # Factors should be non-negative expect_true(all(model$w >= 0)) expect_true(all(model$h >= 0)) }) test_that("Dense matrices: explicit mask='NA' works", { skip_on_cran() set.seed(42) A_dense <- matrix(runif(80 * 40), 80, 40) A_dense[1:3, 1:3] <- NA # Explicit mask='NA' should work without message model <- suppressMessages( nmf(A_dense, k = 2, mask = "NA", maxit = 15, verbose = FALSE) ) expect_s4_class(model, "nmf") expect_true(is.finite(model@misc$loss)) }) test_that("Dense matrices: NA mask does not affect non-NA regions", { skip_on_cran() set.seed(42) A_dense <- matrix(runif(60 * 40), 60, 40) original_values <- A_dense[10:15, 10:15] # Save non-NA region # Add NAs in different region A_dense[1:5, 1:5] <- NA # Fit model model <- suppressMessages( nmf(A_dense, k = 3, maxit = 30, verbose = FALSE) ) # Reconstruction should fit non-NA region well recon <- model$w %*% diag(model$d) %*% model$h mse_non_na <- mean((original_values - recon[10:15, 10:15])^2) # MSE on non-NA region should be low (good fit) expect_lt(mse_non_na, 0.5) # Reasonable threshold })