# Systematic Test Suite for Factorana Package # # This test suite systematically tests the package functionality by: # 1. Checking analytical gradients against finite differences # 2. Checking analytical Hessians against finite differences # 3. Estimating models with default and true initial parameters # 4. Comparing estimates to true parameter values # # Tests are marked skip_on_cran() as they involve estimation and take time # Test configuration VERBOSE <- Sys.getenv("FACTORANA_TEST_VERBOSE", "FALSE") == "TRUE" SAVE_LOGS <- Sys.getenv("FACTORANA_TEST_SAVE_LOGS", "TRUE") == "TRUE" GRAD_TOL <- 1e-3 # Relaxed to accommodate numerical precision in finite differences HESS_TOL <- 1e-3 # Checking all elements (diagonal and off-diagonal) # ============================================================================== # Test A: Measurement System with 3 Linear Tests and 1 Factor # ============================================================================== test_that("Model A: Measurement system with 3 linear tests and 1 factor", { skip_on_cran() set.seed(104) # Simulate data n <- 500 f <- rnorm(n) # Latent factor # True parameters - ORDER: factor_var, T1_int, T1_sigma, T2_int, T2_lambda, T2_sigma, T3_int, T3_lambda, T3_sigma true_params <- c(1.0, # Factor variance 2.0, 0.5, # T1: int, sigma (loading fixed to 1.0) 1.5, 1.2, 0.6, # T2: int, lambda, sigma 1.0, 0.8, 0.4) # T3: int, lambda, sigma # Generate data (using true loading values for simulation) T1 <- true_params[2] + 1.0*f + rnorm(n, 0, true_params[3]) # T1 loading = 1.0 (fixed) T2 <- true_params[4] + true_params[5]*f + rnorm(n, 0, true_params[6]) T3 <- true_params[7] + true_params[8]*f + rnorm(n, 0, true_params[9]) dat <- data.frame(intercept = 1, T1 = T1, T2 = T2, T3 = T3, eval = 1) # Create model system with 1 factor fm <- define_factor_model(n_factors = 1, n_types = 1) mc_T1 <- define_model_component( name = "T1", data = dat, outcome = "T1", factor = fm, covariates = "intercept", model_type = "linear", loading_normalization = 1.0, # Fix loading to 1 for identification evaluation_indicator = "eval" ) mc_T2 <- define_model_component( name = "T2", data = dat, outcome = "T2", factor = fm, covariates = "intercept", model_type = "linear", loading_normalization = NA_real_, # Estimate freely evaluation_indicator = "eval" ) mc_T3 <- define_model_component( name = "T3", data = dat, outcome = "T3", factor = fm, covariates = "intercept", model_type = "linear", loading_normalization = NA_real_, # Estimate freely evaluation_indicator = "eval" ) ms <- define_model_system(components = list(mc_T1, mc_T2, mc_T3), factor = fm) # First run estimation to get param_fixed est_comp <- run_estimation_comparison( ms, dat, true_params, param_names = c("f_var", "T1_int", "T1_sigma", # T1 loading fixed to 1.0 "T2_int", "T2_lambda", "T2_sigma", "T3_int", "T3_lambda", "T3_sigma"), verbose = VERBOSE ) # Run gradient/Hessian checks with param_fixed grad_check <- check_gradient_accuracy(ms, dat, true_params, param_fixed = est_comp$param_fixed, tol = GRAD_TOL, verbose = VERBOSE, n_quad = 8) hess_check <- check_hessian_accuracy(ms, dat, true_params, param_fixed = est_comp$param_fixed, tol = HESS_TOL, verbose = VERBOSE, n_quad = 8) # Collect diagnostics diagnostics <- list( gradient_check = grad_check, hessian_check = hess_check, estimation = est_comp, overall_pass = grad_check$pass && hess_check$pass && est_comp$default_converged && est_comp$reasonable_default ) # Save log if (SAVE_LOGS) { log_file <- save_diagnostics_to_log("test_A_measurement_system", diagnostics) if (VERBOSE) cat("Log saved to:", log_file, "\n") } # Assertions expect_true(grad_check$pass, info = sprintf("Gradient check failed (max error: %.2e)", grad_check$max_error)) expect_true(hess_check$pass, info = sprintf("Hessian check failed (max error: %.2e)", hess_check$max_error)) expect_true(est_comp$default_converged, info = "Estimation with default init failed to converge") expect_true(est_comp$reasonable_default, info = "Estimates are not reasonable") }) # ============================================================================== # Test B: Measurement System (A) + Probit # ============================================================================== test_that("Model B: Measurement system with 3 linear tests and probit outcome", { skip_on_cran() set.seed(105) # Simulate data n <- 500 x1 <- rnorm(n) f <- rnorm(n) # Latent factor # True parameters - ORDER: factor_var, T1_int, T1_sigma, T2_int, T2_lambda, T2_sigma, T3_int, T3_lambda, T3_sigma, y_int, y_beta1, y_loading true_params <- c(1.0, # Factor variance 2.0, 0.5, # T1: int, sigma (loading fixed to 1.0) 1.5, 1.2, 0.6, # T2: int, lambda, sigma 1.0, 0.8, 0.4, # T3: int, lambda, sigma 0.5, 0.7, 0.9) # Probit: int, beta1, loading # Generate data (using true loading values for simulation) T1 <- true_params[2] + 1.0*f + rnorm(n, 0, true_params[3]) # T1 loading = 1.0 (fixed) T2 <- true_params[4] + true_params[5]*f + rnorm(n, 0, true_params[6]) T3 <- true_params[7] + true_params[8]*f + rnorm(n, 0, true_params[9]) z <- true_params[10] + true_params[11]*x1 + true_params[12]*f y <- as.numeric(runif(n) < pnorm(z)) dat <- data.frame(intercept = 1, x1 = x1, T1 = T1, T2 = T2, T3 = T3, y = y, eval = 1) # Create model system fm <- define_factor_model(n_factors = 1, n_types = 1) mc_T1 <- define_model_component( name = "T1", data = dat, outcome = "T1", factor = fm, covariates = "intercept", model_type = "linear", loading_normalization = 1.0, evaluation_indicator = "eval" ) mc_T2 <- define_model_component( name = "T2", data = dat, outcome = "T2", factor = fm, covariates = "intercept", model_type = "linear", loading_normalization = NA_real_, evaluation_indicator = "eval" ) mc_T3 <- define_model_component( name = "T3", data = dat, outcome = "T3", factor = fm, covariates = "intercept", model_type = "linear", loading_normalization = NA_real_, evaluation_indicator = "eval" ) mc_y <- define_model_component( name = "y", data = dat, outcome = "y", factor = fm, covariates = c("intercept", "x1"), model_type = "probit", loading_normalization = NA_real_, evaluation_indicator = "eval" ) ms <- define_model_system(components = list(mc_T1, mc_T2, mc_T3, mc_y), factor = fm) # First run estimation to get param_fixed est_comp <- run_estimation_comparison( ms, dat, true_params, param_names = c("f_var", "T1_int", "T1_sigma", # T1 loading fixed to 1.0 "T2_int", "T2_lambda", "T2_sigma", "T3_int", "T3_lambda", "T3_sigma", "y_int", "y_beta1", "y_loading"), verbose = VERBOSE ) # Run gradient/Hessian checks with param_fixed grad_check <- check_gradient_accuracy(ms, dat, true_params, param_fixed = est_comp$param_fixed, tol = GRAD_TOL, verbose = VERBOSE, n_quad = 8) hess_check <- check_hessian_accuracy(ms, dat, true_params, param_fixed = est_comp$param_fixed, tol = HESS_TOL, verbose = VERBOSE, n_quad = 8) # Collect diagnostics diagnostics <- list( gradient_check = grad_check, hessian_check = hess_check, estimation = est_comp, overall_pass = grad_check$pass && hess_check$pass && est_comp$default_converged && est_comp$reasonable_default ) # Save log if (SAVE_LOGS) { log_file <- save_diagnostics_to_log("test_B_measurement_plus_probit", diagnostics) if (VERBOSE) cat("Log saved to:", log_file, "\n") } # Assertions expect_true(grad_check$pass, info = sprintf("Gradient check failed (max error: %.2e)", grad_check$max_error)) expect_true(hess_check$pass, info = sprintf("Hessian check failed (max error: %.2e)", hess_check$max_error)) expect_true(est_comp$default_converged, info = "Estimation with default init failed to converge") expect_true(est_comp$reasonable_default, info = "Estimates are not reasonable") }) # ============================================================================== # Test C: Measurement System (A) + Ordered Probit # ============================================================================== test_that("Model C: Measurement system with 3 linear tests and ordered probit", { skip_on_cran() set.seed(106) # Simulate data n <- 500 x1 <- rnorm(n) f <- rnorm(n) # Latent factor # True parameters - ORDER: factor_var, T1_int, T1_sigma, T2_int, T2_lambda, T2_sigma, T3_int, T3_lambda, T3_sigma, # y_beta1, y_loading, thresh1, thresh_incr1 # NOTE: Ordered probit has NO INTERCEPT (absorbed into thresholds) # Threshold parameterization: thresh2 = thresh1 + abs(thresh_incr1) true_params <- c(1.0, # Factor variance 2.0, 0.5, # T1: int, sigma (loading fixed to 1.0) 1.5, 1.2, 0.6, # T2: int, lambda, sigma 1.0, 0.8, 0.4, # T3: int, lambda, sigma 0.6, 0.8, # Oprobit: beta1, loading (NO intercept) -0.5, 1.0) # Oprobit thresholds: thresh1, thresh_incr1 (intercept absorbed) # Generate data (using true loading values for simulation) T1 <- true_params[2] + 1.0*f + rnorm(n, 0, true_params[3]) # T1 loading = 1.0 (fixed) T2 <- true_params[4] + true_params[5]*f + rnorm(n, 0, true_params[6]) T3 <- true_params[7] + true_params[8]*f + rnorm(n, 0, true_params[9]) # Ordered probit latent variable (with intercept for data generation) # But model will not estimate intercept - it's absorbed into thresholds y_intercept <- 0.5 z <- y_intercept + true_params[10]*x1 + true_params[11]*f # Compute absolute threshold values from incremental parameterization # thresh1 has intercept absorbed: original 0.0 - intercept 0.5 = -0.5 thresh1_abs <- true_params[12] thresh2_abs <- thresh1_abs + abs(true_params[13]) u <- rnorm(n) y <- ifelse(z + u < thresh1_abs, 1, ifelse(z + u < thresh2_abs, 2, 3)) dat <- data.frame(intercept = 1, x1 = x1, T1 = T1, T2 = T2, T3 = T3, y = y, eval = 1) # Create model system fm <- define_factor_model(n_factors = 1, n_types = 1) mc_T1 <- define_model_component( name = "T1", data = dat, outcome = "T1", factor = fm, covariates = "intercept", model_type = "linear", loading_normalization = 1.0, evaluation_indicator = "eval" ) mc_T2 <- define_model_component( name = "T2", data = dat, outcome = "T2", factor = fm, covariates = "intercept", model_type = "linear", loading_normalization = NA_real_, evaluation_indicator = "eval" ) mc_T3 <- define_model_component( name = "T3", data = dat, outcome = "T3", factor = fm, covariates = "intercept", model_type = "linear", loading_normalization = NA_real_, evaluation_indicator = "eval" ) mc_y <- define_model_component( name = "y", data = dat, outcome = "y", factor = fm, covariates = "x1", # NO intercept for ordered probit (absorbed into thresholds) model_type = "oprobit", num_choices = 3, # 3 ordered categories loading_normalization = NA_real_, evaluation_indicator = "eval", intercept = FALSE ) ms <- define_model_system(components = list(mc_T1, mc_T2, mc_T3, mc_y), factor = fm) # First run estimation to get param_fixed est_comp <- run_estimation_comparison( ms, dat, true_params, param_names = c("f_var", "T1_int", "T1_sigma", # T1 loading fixed to 1.0 "T2_int", "T2_lambda", "T2_sigma", "T3_int", "T3_lambda", "T3_sigma", "y_beta1", "y_loading", # Oprobit: NO intercept "thresh1", "thresh_incr1"), # Incremental threshold parameterization verbose = VERBOSE ) # Run gradient/Hessian checks with param_fixed grad_check <- check_gradient_accuracy(ms, dat, true_params, param_fixed = est_comp$param_fixed, tol = GRAD_TOL, verbose = VERBOSE, n_quad = 8) hess_check <- check_hessian_accuracy(ms, dat, true_params, param_fixed = est_comp$param_fixed, tol = HESS_TOL, verbose = VERBOSE, n_quad = 8) # Collect diagnostics diagnostics <- list( gradient_check = grad_check, hessian_check = hess_check, estimation = est_comp, overall_pass = grad_check$pass && hess_check$pass && est_comp$default_converged && est_comp$reasonable_default ) # Save log if (SAVE_LOGS) { log_file <- save_diagnostics_to_log("test_C_measurement_plus_oprobit", diagnostics) if (VERBOSE) cat("Log saved to:", log_file, "\n") } # Assertions expect_true(grad_check$pass, info = sprintf("Gradient check failed (max error: %.2e)", grad_check$max_error)) expect_true(hess_check$pass, info = sprintf("Hessian check failed (max error: %.2e)", hess_check$max_error)) expect_true(est_comp$default_converged, info = "Estimation with default init failed to converge") expect_true(est_comp$reasonable_default, info = "Estimates are not reasonable") }) # ============================================================================== # Test D: Measurement System (A) + Multinomial Logit # ============================================================================== test_that("Model D: Measurement system with 3 linear tests and multinomial logit", { skip_on_cran() set.seed(107) # Simulate data n <- 500 x1 <- rnorm(n) f <- rnorm(n) # Latent factor # True parameters - ORDER: factor_var, T1_int, T1_sigma, T2_int, T2_lambda, T2_sigma, T3_int, T3_lambda, T3_sigma, # y1_int, y1_beta1, y1_loading, y2_int, y2_beta1, y2_loading true_params <- c(1.0, # Factor variance 2.0, 0.5, # T1: int, sigma (loading fixed to 1.0) 1.5, 1.2, 0.6, # T2: int, lambda, sigma 1.0, 0.8, 0.4, # T3: int, lambda, sigma 0.5, 0.6, 0.7, # Mlogit choice 1: int, beta1, loading 1.0, -0.5, 0.9) # Mlogit choice 2: int, beta1, loading # Generate data (using true loading values for simulation) T1 <- true_params[2] + 1.0*f + rnorm(n, 0, true_params[3]) # T1 loading = 1.0 (fixed) T2 <- true_params[4] + true_params[5]*f + rnorm(n, 0, true_params[6]) T3 <- true_params[7] + true_params[8]*f + rnorm(n, 0, true_params[9]) z1 <- true_params[10] + true_params[11]*x1 + true_params[12]*f z2 <- true_params[13] + true_params[14]*x1 + true_params[15]*f exp_z0 <- 1 exp_z1 <- exp(z1) exp_z2 <- exp(z2) denom <- exp_z0 + exp_z1 + exp_z2 p0 <- exp_z0 / denom p1 <- exp_z1 / denom p2 <- exp_z2 / denom y <- numeric(n) for (i in seq_len(n)) { # C++ expects multinomial choices coded as 1, 2, 3 (not 0, 1, 2) y[i] <- sample(1:3, 1, prob = c(p0[i], p1[i], p2[i])) } dat <- data.frame(intercept = 1, x1 = x1, T1 = T1, T2 = T2, T3 = T3, y = y, eval = 1) # Create model system fm <- define_factor_model(n_factors = 1, n_types = 1) mc_T1 <- define_model_component( name = "T1", data = dat, outcome = "T1", factor = fm, covariates = "intercept", model_type = "linear", loading_normalization = 1.0, evaluation_indicator = "eval" ) mc_T2 <- define_model_component( name = "T2", data = dat, outcome = "T2", factor = fm, covariates = "intercept", model_type = "linear", loading_normalization = NA_real_, evaluation_indicator = "eval" ) mc_T3 <- define_model_component( name = "T3", data = dat, outcome = "T3", factor = fm, covariates = "intercept", model_type = "linear", loading_normalization = NA_real_, evaluation_indicator = "eval" ) mc_y <- define_model_component( name = "y", data = dat, outcome = "y", factor = fm, covariates = c("intercept", "x1"), model_type = "logit", num_choices = 3, # 3 choices: 0, 1, 2 loading_normalization = NA_real_, # Single factor, loading estimated freely evaluation_indicator = "eval" ) ms <- define_model_system(components = list(mc_T1, mc_T2, mc_T3, mc_y), factor = fm) # First run estimation to get param_fixed est_comp <- run_estimation_comparison( ms, dat, true_params, param_names = c("f_var", "T1_int", "T1_sigma", # T1 loading fixed to 1.0 "T2_int", "T2_lambda", "T2_sigma", "T3_int", "T3_lambda", "T3_sigma", "y1_int", "y1_beta1", "y1_loading", "y2_int", "y2_beta1", "y2_loading"), verbose = VERBOSE ) # Run gradient/Hessian checks with param_fixed grad_check <- check_gradient_accuracy(ms, dat, true_params, param_fixed = est_comp$param_fixed, tol = GRAD_TOL, verbose = VERBOSE, n_quad = 8) hess_check <- check_hessian_accuracy(ms, dat, true_params, param_fixed = est_comp$param_fixed, tol = HESS_TOL, verbose = VERBOSE, n_quad = 8) # Collect diagnostics diagnostics <- list( gradient_check = grad_check, hessian_check = hess_check, estimation = est_comp, overall_pass = grad_check$pass && hess_check$pass && est_comp$default_converged && est_comp$reasonable_default ) # Save log if (SAVE_LOGS) { log_file <- save_diagnostics_to_log("test_D_measurement_plus_mlogit", diagnostics) if (VERBOSE) cat("Log saved to:", log_file, "\n") } # Assertions expect_true(grad_check$pass, info = sprintf("Gradient check failed (max error: %.2e)", grad_check$max_error)) expect_true(hess_check$pass, info = sprintf("Hessian check failed (max error: %.2e)", hess_check$max_error)) expect_true(est_comp$default_converged, info = "Estimation with default init failed to converge") expect_true(est_comp$reasonable_default, info = "Estimates are not reasonable") }) # ============================================================================== # Test E: Roy Model # ============================================================================== test_that("Model E: Roy selection model", { skip_on_cran() set.seed(108) # Simulate Roy model data n <- 2000 # Increased from 500 to match manual test for better numerical stability x1 <- rnorm(n) x2 <- rnorm(n) f <- rnorm(n) # Latent factor (ability) # True parameters - ORDER: factor_var, T1_int, T1_sigma, T2_int, T2_lambda, T2_sigma, # T3_int, T3_lambda, T3_sigma, wage0_int, wage0_beta1, wage0_beta2, wage0_sigma, # wage1_int, wage1_beta1, wage1_lambda, wage1_sigma, sector_int, sector_beta1, sector_loading true_params <- c( 1.0, # Factor variance # T1: int, sigma (loading FIXED to 1.0) 2.0, 0.5, # T2: int, lambda, sigma 1.5, 1.2, 0.6, # T3: int, lambda, sigma 1.0, 0.8, 0.4, # Wage0: int, beta1, beta2, sigma (loading FIXED to 0.0 - no factor effect) 2.0, 0.5, 0.3, 0.6, # Wage1: int, beta1, lambda, sigma 2.5, 0.6, 1.0, 0.7, # Sector: int, beta1, loading (probit - no sigma) 0.0, 0.4, 0.8 ) # Generate test scores (using fixed loading values for data generation) T1 <- true_params[2] + 1.0*f + rnorm(n, 0, true_params[3]) # T1 loading = 1.0 (fixed) T2 <- true_params[4] + true_params[5]*f + rnorm(n, 0, true_params[6]) T3 <- true_params[7] + true_params[8]*f + rnorm(n, 0, true_params[9]) # Generate potential wages wage0 <- true_params[10] + true_params[11]*x1 + true_params[12]*x2 + rnorm(n, 0, true_params[13]) # No factor effect (loading = 0.0 fixed) wage1 <- true_params[14] + true_params[15]*x1 + true_params[16]*f + rnorm(n, 0, true_params[17]) # Sector choice (based on utility difference) z_sector <- true_params[18] + true_params[19]*x2 + true_params[20]*f sector <- as.numeric(runif(n) < pnorm(z_sector)) # Observed wage wage <- ifelse(sector == 1, wage1, wage0) dat <- data.frame( intercept = 1, x1 = x1, x2 = x2, T1 = T1, T2 = T2, T3 = T3, wage = wage, sector = sector, eval_tests = 1, eval_wage0 = 1 - sector, eval_wage1 = sector, eval_sector = 1 ) # Create model system fm <- define_factor_model(n_factors = 1, n_types = 1) mc_T1 <- define_model_component( name = "T1", data = dat, outcome = "T1", factor = fm, covariates = "intercept", model_type = "linear", loading_normalization = 1.0, evaluation_indicator = "eval_tests" ) mc_T2 <- define_model_component( name = "T2", data = dat, outcome = "T2", factor = fm, covariates = "intercept", model_type = "linear", loading_normalization = NA_real_, evaluation_indicator = "eval_tests" ) mc_T3 <- define_model_component( name = "T3", data = dat, outcome = "T3", factor = fm, covariates = "intercept", model_type = "linear", loading_normalization = NA_real_, evaluation_indicator = "eval_tests" ) mc_wage0 <- define_model_component( name = "wage0", data = dat, outcome = "wage", factor = fm, covariates = c("intercept", "x1", "x2"), model_type = "linear", loading_normalization = 0.0, evaluation_indicator = "eval_wage0" ) mc_wage1 <- define_model_component( name = "wage1", data = dat, outcome = "wage", factor = fm, covariates = c("intercept", "x1"), model_type = "linear", loading_normalization = NA_real_, evaluation_indicator = "eval_wage1" ) mc_sector <- define_model_component( name = "sector", data = dat, outcome = "sector", factor = fm, covariates = c("intercept", "x2"), model_type = "probit", loading_normalization = NA_real_, evaluation_indicator = "eval_sector" ) ms <- define_model_system( components = list(mc_T1, mc_T2, mc_T3, mc_wage0, mc_wage1, mc_sector), factor = fm ) # First run estimation to get param_fixed est_comp <- run_estimation_comparison( ms, dat, true_params, param_names = c("f_var", "T1_int", "T1_sigma", # T1: loading fixed to 1.0 "T2_int", "T2_lambda", "T2_sigma", "T3_int", "T3_lambda", "T3_sigma", "w0_int", "w0_beta1", "w0_beta2", "w0_sigma", "w1_int", "w1_beta1", "w1_lambda", "w1_sigma", "s_int", "s_beta1", "s_loading"), verbose = VERBOSE ) # Run gradient/Hessian checks with param_fixed grad_check <- check_gradient_accuracy(ms, dat, true_params, param_fixed = est_comp$param_fixed, tol = GRAD_TOL, verbose = VERBOSE, n_quad = 16) hess_check <- check_hessian_accuracy(ms, dat, true_params, param_fixed = est_comp$param_fixed, tol = HESS_TOL, verbose = VERBOSE, n_quad = 16) # Collect diagnostics diagnostics <- list( gradient_check = grad_check, hessian_check = hess_check, estimation = est_comp, overall_pass = grad_check$pass && hess_check$pass && est_comp$default_converged && est_comp$reasonable_default ) # Save log if (SAVE_LOGS) { log_file <- save_diagnostics_to_log("test_E_roy_model", diagnostics) if (VERBOSE) cat("Log saved to:", log_file, "\n") } # Assertions expect_true(grad_check$pass, info = sprintf("Gradient check failed (max error: %.2e)", grad_check$max_error)) expect_true(hess_check$pass, info = sprintf("Hessian check failed (max error: %.2e)", hess_check$max_error)) expect_true(est_comp$default_converged, info = "Estimation with default init failed to converge") expect_true(est_comp$reasonable_default, info = "Estimates are not reasonable") }) # ============================================================================== # Test F: Measurement System + Exploded Multinomial Logit (Ranked Choices) # ============================================================================== test_that("Model F: Measurement system with exploded logit (ranked choices)", { skip_on_cran() set.seed(109) # Simulate data - simpler case: 3 choices, 2 ranks n <- 1000 # Larger sample for stability x1 <- rnorm(n) f <- rnorm(n) # Latent factor # True parameters - ORDER: factor_var, T1_int, T1_sigma, T2_int, T2_lambda, T2_sigma, # c1_int, c1_beta1, c1_loading, c2_int, c2_beta1, c2_loading # 3 choices = 2 non-reference alternatives true_params <- c(1.0, # Factor variance 0.0, 0.5, # T1: int, sigma (loading fixed to 1.0) 0.0, 0.8, 0.5, # T2: int, lambda, sigma 0.5, 0.8, 0.6, # Mlogit choice 1: int, beta1, loading -0.3, -0.5, 0.9) # Mlogit choice 2: int, beta1, loading # Generate data (using true loading values for simulation) T1 <- true_params[2] + 1.0*f + rnorm(n, 0, true_params[3]) # T1 loading = 1.0 (fixed) T2 <- true_params[4] + true_params[5]*f + rnorm(n, 0, true_params[6]) # Multinomial logit utilities (3 choices: reference + 2 alternatives) z1 <- true_params[7] + true_params[8]*x1 + true_params[9]*f z2 <- true_params[10] + true_params[11]*x1 + true_params[12]*f exp_z0 <- 1 # Reference category exp_z1 <- exp(z1) exp_z2 <- exp(z2) denom <- exp_z0 + exp_z1 + exp_z2 # Generate rankings for each observation (2 ranks from 3 choices) rank1 <- rank2 <- numeric(n) for (i in seq_len(n)) { probs <- c(exp_z0, exp_z1[i], exp_z2[i]) / denom[i] # Sample first choice rank1[i] <- sample(1:3, 1, prob = probs) # Update probs for second choice (remove first choice) remaining <- setdiff(1:3, rank1[i]) probs2 <- probs[remaining] / sum(probs[remaining]) rank2[i] <- sample(remaining, 1, prob = probs2) } dat <- data.frame(intercept = 1, x1 = x1, T1 = T1, T2 = T2, rank1 = rank1, rank2 = rank2, eval = 1) # Create model system fm <- define_factor_model(n_factors = 1, n_types = 1) mc_T1 <- define_model_component( name = "T1", data = dat, outcome = "T1", factor = fm, covariates = "intercept", model_type = "linear", loading_normalization = 1.0, evaluation_indicator = "eval" ) mc_T2 <- define_model_component( name = "T2", data = dat, outcome = "T2", factor = fm, covariates = "intercept", model_type = "linear", loading_normalization = NA_real_, evaluation_indicator = "eval" ) # Exploded logit with 2 ranks mc_choice <- define_model_component( name = "choice", data = dat, outcome = c("rank1", "rank2"), # Vector of outcomes for ranked choices factor = fm, covariates = c("intercept", "x1"), model_type = "logit", num_choices = 3, # 3 choices total loading_normalization = NA_real_, # Single factor, loading estimated freely evaluation_indicator = "eval" ) ms <- define_model_system(components = list(mc_T1, mc_T2, mc_choice), factor = fm) # First run estimation to get param_fixed est_comp <- run_estimation_comparison( ms, dat, true_params, param_names = c("f_var", "T1_int", "T1_sigma", # T1 loading fixed to 1.0 "T2_int", "T2_lambda", "T2_sigma", "c1_int", "c1_beta1", "c1_loading", "c2_int", "c2_beta1", "c2_loading"), verbose = VERBOSE ) # Run gradient check with param_fixed # NOTE: Hessian for exploded logit with conditional probabilities is complex # and not yet fully implemented - skip Hessian check for this model type grad_check <- check_gradient_accuracy(ms, dat, true_params, param_fixed = est_comp$param_fixed, tol = GRAD_TOL, verbose = VERBOSE, n_quad = 8) # Collect diagnostics diagnostics <- list( gradient_check = grad_check, estimation = est_comp, overall_pass = grad_check$pass && est_comp$default_converged ) # Save log if (SAVE_LOGS) { log_file <- save_diagnostics_to_log("test_F_exploded_logit", diagnostics) if (VERBOSE) cat("Log saved to:", log_file, "\n") } # Assertions - for exploded logit, we focus on gradient correctness and convergence # Hessian for conditional probabilities in exploded logit is a TODO expect_true(grad_check$pass, info = sprintf("Gradient check failed (max error: %.2e)", grad_check$max_error)) expect_true(est_comp$default_converged, info = "Estimation with default init failed to converge") }) # ============================================================================== # Test G: Structural Equation Model (SE_linear) # f2 = se_intercept + se_linear_1 * f1 + epsilon # ============================================================================== test_that("Model G: SE_linear structural equation model with 2 factors", { skip_on_cran() set.seed(201) # Simulate data n <- 500 # True parameters for factor structure true_var_f1 <- 1.5 # Variance of input factor true_se_intercept <- 0.5 # SE intercept true_se_linear <- 0.8 # SE linear coefficient true_se_residual_var <- 0.5 # SE residual variance # Generate factors f1 <- rnorm(n, 0, sqrt(true_var_f1)) eps <- rnorm(n, 0, sqrt(true_se_residual_var)) f2 <- true_se_intercept + true_se_linear * f1 + eps # Generate measurements for f1 (3 measures) Y1_1 <- 2.0 + 1.0 * f1 + rnorm(n, 0, 1.0) # int=2, loading=1 (fixed), sigma=1 Y1_2 <- 1.5 + 0.8 * f1 + rnorm(n, 0, 0.9) # int=1.5, loading=0.8, sigma=0.9 Y1_3 <- 1.0 + 1.2 * f1 + rnorm(n, 0, 1.1) # int=1, loading=1.2, sigma=1.1 # Generate measurements for f2 (3 measures) Y2_1 <- 1.8 + 1.0 * f2 + rnorm(n, 0, 1.0) # int=1.8, loading=1 (fixed), sigma=1 Y2_2 <- 1.2 + 0.7 * f2 + rnorm(n, 0, 0.8) # int=1.2, loading=0.7, sigma=0.8 Y2_3 <- 0.5 + 1.1 * f2 + rnorm(n, 0, 0.95) # int=0.5, loading=1.1, sigma=0.95 dat <- data.frame( intercept = 1, Y1_1 = Y1_1, Y1_2 = Y1_2, Y1_3 = Y1_3, Y2_1 = Y2_1, Y2_2 = Y2_2, Y2_3 = Y2_3, eval = 1 ) # Create model system with 2 factors and SE_linear structure fm <- define_factor_model(n_factors = 2, factor_structure = "SE_linear") # 3 measurement equations for factor 1 mc1_1 <- define_model_component( name = "Y1_1", data = dat, outcome = "Y1_1", factor = fm, covariates = "intercept", model_type = "linear", loading_normalization = c(1, 0), # Fix loading to 1 for f1, 0 for f2 evaluation_indicator = "eval" ) mc1_2 <- define_model_component( name = "Y1_2", data = dat, outcome = "Y1_2", factor = fm, covariates = "intercept", model_type = "linear", loading_normalization = c(NA_real_, 0), # Free for f1, 0 for f2 evaluation_indicator = "eval" ) mc1_3 <- define_model_component( name = "Y1_3", data = dat, outcome = "Y1_3", factor = fm, covariates = "intercept", model_type = "linear", loading_normalization = c(NA_real_, 0), # Free for f1, 0 for f2 evaluation_indicator = "eval" ) # 3 measurement equations for factor 2 mc2_1 <- define_model_component( name = "Y2_1", data = dat, outcome = "Y2_1", factor = fm, covariates = "intercept", model_type = "linear", loading_normalization = c(0, 1), # 0 for f1, fix to 1 for f2 evaluation_indicator = "eval" ) mc2_2 <- define_model_component( name = "Y2_2", data = dat, outcome = "Y2_2", factor = fm, covariates = "intercept", model_type = "linear", loading_normalization = c(0, NA_real_), # 0 for f1, free for f2 evaluation_indicator = "eval" ) mc2_3 <- define_model_component( name = "Y2_3", data = dat, outcome = "Y2_3", factor = fm, covariates = "intercept", model_type = "linear", loading_normalization = c(0, NA_real_), # 0 for f1, free for f2 evaluation_indicator = "eval" ) ms <- define_model_system( components = list(mc1_1, mc1_2, mc1_3, mc2_1, mc2_2, mc2_3), factor = fm ) # True parameters: # factor_var_1, se_intercept, se_linear_1, se_residual_var, # Y1_1_int, Y1_1_sigma, (loading fixed to 1) # Y1_2_int, Y1_2_lambda, Y1_2_sigma, # Y1_3_int, Y1_3_lambda, Y1_3_sigma, # Y2_1_int, Y2_1_sigma, (loading fixed to 1) # Y2_2_int, Y2_2_lambda, Y2_2_sigma, # Y2_3_int, Y2_3_lambda, Y2_3_sigma true_params <- c( true_var_f1, # factor_var_1 = 1.5 true_se_intercept, true_se_linear, true_se_residual_var, # SE params: 0.5, 0.8, 0.5 2.0, 1.0, # Y1_1: int, sigma (loading fixed to 1) 1.5, 0.8, 0.9, # Y1_2: int, lambda, sigma 1.0, 1.2, 1.1, # Y1_3: int, lambda, sigma 1.8, 1.0, # Y2_1: int, sigma (loading fixed to 1) 1.2, 0.7, 0.8, # Y2_2: int, lambda, sigma 0.5, 1.1, 0.95 # Y2_3: int, lambda, sigma ) param_names <- c( "f1_var", "se_intercept", "se_linear_1", "se_residual_var", "Y1_1_int", "Y1_1_sigma", "Y1_2_int", "Y1_2_lambda", "Y1_2_sigma", "Y1_3_int", "Y1_3_lambda", "Y1_3_sigma", "Y2_1_int", "Y2_1_sigma", "Y2_2_int", "Y2_2_lambda", "Y2_2_sigma", "Y2_3_int", "Y2_3_lambda", "Y2_3_sigma" ) # Run estimation to get param_fixed est_comp <- run_estimation_comparison( ms, dat, true_params, param_names = param_names, verbose = VERBOSE ) # Run gradient/Hessian checks with param_fixed grad_check <- check_gradient_accuracy(ms, dat, true_params, param_fixed = est_comp$param_fixed, tol = GRAD_TOL, verbose = VERBOSE, n_quad = 8) hess_check <- check_hessian_accuracy(ms, dat, true_params, param_fixed = est_comp$param_fixed, tol = HESS_TOL, verbose = VERBOSE, n_quad = 8) # Collect diagnostics diagnostics <- list( gradient_check = grad_check, hessian_check = hess_check, estimation = est_comp, overall_pass = grad_check$pass && hess_check$pass && est_comp$default_converged && est_comp$reasonable_default ) # Save log if (SAVE_LOGS) { log_file <- save_diagnostics_to_log("test_G_se_linear", diagnostics) if (VERBOSE) cat("Log saved to:", log_file, "\n") } # Assertions - focus on gradient and Hessian correctness # Estimation convergence with default init is not strictly required for this test expect_true(grad_check$pass, info = sprintf("Gradient check failed (max error: %.2e)", grad_check$max_error)) expect_true(hess_check$pass, info = sprintf("Hessian check failed (max error: %.2e)", hess_check$max_error)) }) # ============================================================================== # Test G2: Two-Stage SE_linear Estimation # Stage 1: Independent factors (measurement model) # Stage 2: SE_linear with previous_stage (structural equation) # ============================================================================== test_that("Model G2: Two-stage SE_linear estimation with previous_stage", { skip_on_cran() set.seed(301) # Simulate data with known DGP n <- 500 # True parameters # Note: In two-stage estimation, Stage 1 assumes independent factors with mean 0. # The measurement intercepts absorb the factor means. So in Stage 2, the SE intercept # captures only ADDITIONAL mean shift beyond what Stage 1 absorbed. We set it to 0. true_var_f1 <- 1.2 # Input factor variance true_se_linear <- 0.7 # Structural coefficient true_se_intercept <- 0.0 # Structural intercept (0 for two-stage identification) true_se_residual_var <- 0.4 # Structural residual variance # True measurement parameters true_lambda_f1 <- c(1.0, 0.9, 1.1) # Loadings for f1 (first fixed) true_lambda_f2 <- c(1.0, 0.8, 1.2) # Loadings for f2 (first fixed) true_int_f1 <- c(2.0, 1.5, 1.0) true_int_f2 <- c(1.8, 1.2, 0.5) true_sigma_f1 <- c(0.8, 0.9, 0.7) true_sigma_f2 <- c(0.9, 0.85, 0.95) # Generate factors according to SE_linear structure f1 <- rnorm(n, 0, sqrt(true_var_f1)) eps_se <- rnorm(n, 0, sqrt(true_se_residual_var)) f2 <- true_se_intercept + true_se_linear * f1 + eps_se # Generate measurements Y1_1 <- true_int_f1[1] + true_lambda_f1[1] * f1 + rnorm(n, 0, true_sigma_f1[1]) Y1_2 <- true_int_f1[2] + true_lambda_f1[2] * f1 + rnorm(n, 0, true_sigma_f1[2]) Y1_3 <- true_int_f1[3] + true_lambda_f1[3] * f1 + rnorm(n, 0, true_sigma_f1[3]) Y2_1 <- true_int_f2[1] + true_lambda_f2[1] * f2 + rnorm(n, 0, true_sigma_f2[1]) Y2_2 <- true_int_f2[2] + true_lambda_f2[2] * f2 + rnorm(n, 0, true_sigma_f2[2]) Y2_3 <- true_int_f2[3] + true_lambda_f2[3] * f2 + rnorm(n, 0, true_sigma_f2[3]) dat <- data.frame( intercept = 1, Y1_1 = Y1_1, Y1_2 = Y1_2, Y1_3 = Y1_3, Y2_1 = Y2_1, Y2_2 = Y2_2, Y2_3 = Y2_3, eval = 1 ) # ========================================================================= # STAGE 1: Measurement model with INDEPENDENT factors # ========================================================================= fm_stage1 <- define_factor_model(n_factors = 2, factor_structure = "independent") # Factor 1 measures mc1_1 <- define_model_component( name = "Y1_1", data = dat, outcome = "Y1_1", factor = fm_stage1, covariates = "intercept", model_type = "linear", loading_normalization = c(1, 0), evaluation_indicator = "eval" ) mc1_2 <- define_model_component( name = "Y1_2", data = dat, outcome = "Y1_2", factor = fm_stage1, covariates = "intercept", model_type = "linear", loading_normalization = c(NA_real_, 0), evaluation_indicator = "eval" ) mc1_3 <- define_model_component( name = "Y1_3", data = dat, outcome = "Y1_3", factor = fm_stage1, covariates = "intercept", model_type = "linear", loading_normalization = c(NA_real_, 0), evaluation_indicator = "eval" ) # Factor 2 measures mc2_1 <- define_model_component( name = "Y2_1", data = dat, outcome = "Y2_1", factor = fm_stage1, covariates = "intercept", model_type = "linear", loading_normalization = c(0, 1), evaluation_indicator = "eval" ) mc2_2 <- define_model_component( name = "Y2_2", data = dat, outcome = "Y2_2", factor = fm_stage1, covariates = "intercept", model_type = "linear", loading_normalization = c(0, NA_real_), evaluation_indicator = "eval" ) mc2_3 <- define_model_component( name = "Y2_3", data = dat, outcome = "Y2_3", factor = fm_stage1, covariates = "intercept", model_type = "linear", loading_normalization = c(0, NA_real_), evaluation_indicator = "eval" ) ms_stage1 <- define_model_system( components = list(mc1_1, mc1_2, mc1_3, mc2_1, mc2_2, mc2_3), factor = fm_stage1 ) # Stage 1 estimation ctrl <- define_estimation_control(n_quad_points = 8, num_cores = 1) result_stage1 <- estimate_model_rcpp( model_system = ms_stage1, data = dat, init_params = NULL, control = ctrl, optimizer = "nlminb", verbose = FALSE ) # Check Stage 1 convergence expect_equal(result_stage1$convergence, 0, info = "Stage 1 should converge") # Build true parameters for Stage 1 (independent 2-factor model) # Parameter order: factor_var_1, factor_var_2, then per-component: intercept, [loading], sigma true_params_s1 <- c( true_var_f1, # factor_var_1 true_var_f1 * true_se_linear^2 + true_se_residual_var, # factor_var_2 (marginal variance of f2) true_int_f1[1], true_sigma_f1[1], # Y1_1: intercept, sigma (loading fixed to 1) true_int_f1[2], true_lambda_f1[2], true_sigma_f1[2], # Y1_2: intercept, loading, sigma true_int_f1[3], true_lambda_f1[3], true_sigma_f1[3], # Y1_3: intercept, loading, sigma true_int_f2[1], true_sigma_f2[1], # Y2_1: intercept, sigma (loading fixed to 1) true_int_f2[2], true_lambda_f2[2], true_sigma_f2[2], # Y2_2: intercept, loading, sigma true_int_f2[3], true_lambda_f2[3], true_sigma_f2[3] # Y2_3: intercept, loading, sigma ) names(true_params_s1) <- names(result_stage1$estimates) # Get param_fixed vector for Stage 1 stage1_init <- initialize_parameters(ms_stage1, dat, verbose = FALSE) param_metadata_s1 <- factorana:::build_parameter_metadata(ms_stage1) param_constraints_s1 <- factorana:::setup_parameter_constraints( ms_stage1, true_params_s1, param_metadata_s1, stage1_init$factor_variance_fixed, verbose = FALSE ) param_fixed_s1 <- rep(TRUE, length(true_params_s1)) param_fixed_s1[param_constraints_s1$free_idx] <- FALSE # Check Stage 1 gradient accuracy at TRUE parameters (not at MLE) grad_check_s1 <- check_gradient_accuracy(ms_stage1, dat, true_params_s1, param_fixed = param_fixed_s1, tol = GRAD_TOL, verbose = FALSE, n_quad = 8) expect_true(grad_check_s1$pass, info = sprintf("Stage 1 gradient check failed (max error: %.2e)", grad_check_s1$max_error)) # Check Stage 1 parameter recovery (measurement params) # Factor variances - use unname() for comparison expect_equal(unname(result_stage1$estimates["factor_var_1"]), true_var_f1, tolerance = 0.3, info = "Stage 1: factor_var_1 recovery") # Loadings (free ones) - use unname() for comparison expect_equal(unname(result_stage1$estimates["Y1_2_loading_1"]), true_lambda_f1[2], tolerance = 0.15, info = "Stage 1: Y1_2 loading recovery") expect_equal(unname(result_stage1$estimates["Y2_2_loading_2"]), true_lambda_f2[2], tolerance = 0.15, info = "Stage 1: Y2_2 loading recovery") # Check Stage 1 SEs are computed expect_true(all(!is.na(result_stage1$std_errors)), info = "Stage 1 should have computed std_errors") expect_true(all(result_stage1$std_errors > 0), info = "Stage 1 std_errors should be positive") if (VERBOSE) { cat("\n=== Stage 1 (Independent Factors) ===\n") cat(sprintf("Convergence: %d\n", result_stage1$convergence)) cat(sprintf("Log-likelihood: %.4f\n", result_stage1$loglik)) cat(sprintf("Gradient max error: %.2e\n", grad_check_s1$max_error)) cat(sprintf("factor_var_1: true=%.3f, est=%.3f\n", true_var_f1, result_stage1$estimates["factor_var_1"])) } # ========================================================================= # STAGE 2: SE_linear with previous_stage # ========================================================================= fm_stage2 <- define_factor_model(n_factors = 2, factor_structure = "SE_linear") # No new components - just change factor structure ms_stage2 <- define_model_system( components = list(), factor = fm_stage2, previous_stage = result_stage1 ) # Initialize SE parameters init_stage2 <- initialize_parameters(ms_stage2, dat, verbose = FALSE) init_stage2$init_params["se_intercept"] <- 0.0 init_stage2$init_params["se_linear_1"] <- 0.5 init_stage2$init_params["se_residual_var"] <- 0.5 result_stage2 <- estimate_model_rcpp( model_system = ms_stage2, data = dat, init_params = init_stage2$init_params, control = ctrl, optimizer = "nlminb", verbose = FALSE ) # Check Stage 2 convergence expect_equal(result_stage2$convergence, 0, info = "Stage 2 should converge") # Build true parameters for Stage 2 (SE_linear structure) # Parameter order for SE_linear: factor_var_1, se_intercept, se_linear_1, se_residual_var, # then measurement params from Stage 1 true_params_s2 <- c( true_var_f1, # factor_var_1 true_se_intercept, # se_intercept true_se_linear, # se_linear_1 true_se_residual_var # se_residual_var ) # Append measurement parameters (using Stage 1 true params without factor variances) meas_params <- true_params_s1[-(1:2)] # Remove factor variances true_params_s2 <- c(true_params_s2, meas_params) names(true_params_s2) <- names(result_stage2$estimates) # Get param_fixed vector for Stage 2 stage2_init <- initialize_parameters(ms_stage2, dat, verbose = FALSE) param_metadata_s2 <- factorana:::build_parameter_metadata(ms_stage2) param_constraints_s2 <- factorana:::setup_parameter_constraints( ms_stage2, true_params_s2, param_metadata_s2, stage2_init$factor_variance_fixed, verbose = FALSE ) param_fixed_s2 <- rep(TRUE, length(true_params_s2)) param_fixed_s2[param_constraints_s2$free_idx] <- FALSE # Check Stage 2 gradient accuracy at TRUE parameters (not at MLE) grad_check_s2 <- check_gradient_accuracy(ms_stage2, dat, true_params_s2, param_fixed = param_fixed_s2, tol = GRAD_TOL, verbose = FALSE, n_quad = 8) expect_true(grad_check_s2$pass, info = sprintf("Stage 2 gradient check failed (max error: %.2e)", grad_check_s2$max_error)) # Check Stage 2 Hessian accuracy at TRUE parameters hess_check_s2 <- check_hessian_accuracy(ms_stage2, dat, true_params_s2, param_fixed = param_fixed_s2, tol = HESS_TOL, verbose = FALSE, n_quad = 8) expect_true(hess_check_s2$pass, info = sprintf("Stage 2 Hessian check failed (max error: %.2e)", hess_check_s2$max_error)) # ========================================================================= # VERIFICATION: Parameters preserved from Stage 1 # ========================================================================= # Measurement parameters should be identical stage1_meas_params <- result_stage1$estimates[!grepl("^factor_var", names(result_stage1$estimates))] stage2_meas_params <- result_stage2$estimates[names(stage1_meas_params)] max_meas_diff <- max(abs(stage1_meas_params - stage2_meas_params)) expect_true(max_meas_diff < 1e-10, info = sprintf("Measurement params should be preserved (max diff: %.2e)", max_meas_diff)) # SEs for fixed params should be preserved stage1_meas_se <- result_stage1$std_errors[names(stage1_meas_params)] stage2_meas_se <- result_stage2$std_errors[names(stage1_meas_params)] max_se_diff <- max(abs(stage1_meas_se - stage2_meas_se)) expect_true(max_se_diff < 1e-10, info = sprintf("Measurement SEs should be preserved (max diff: %.2e)", max_se_diff)) # ========================================================================= # VERIFICATION: SE_linear parameter recovery # ========================================================================= # SE linear coefficient (the key structural parameter) est_se_linear <- unname(result_stage2$estimates["se_linear_1"]) expect_equal(est_se_linear, true_se_linear, tolerance = 0.15, info = sprintf("SE linear recovery: true=%.3f, est=%.3f", true_se_linear, est_se_linear)) # SE intercept (should be ~0 since Stage 1 absorbed factor means) est_se_intercept <- unname(result_stage2$estimates["se_intercept"]) expect_equal(est_se_intercept, true_se_intercept, tolerance = 0.15, info = sprintf("SE intercept recovery: true=%.3f, est=%.3f", true_se_intercept, est_se_intercept)) # SE residual variance (relaxed tolerance due to sampling variability in SE models) est_se_resvar <- unname(result_stage2$estimates["se_residual_var"]) expect_equal(est_se_resvar, true_se_residual_var, tolerance = 0.35, info = sprintf("SE residual var recovery: true=%.3f, est=%.3f", true_se_residual_var, est_se_resvar)) # Factor variance for f1 (should be estimated in Stage 2) est_var_f1 <- unname(result_stage2$estimates["factor_var_1"]) expect_equal(est_var_f1, true_var_f1, tolerance = 0.3, info = sprintf("Factor var recovery: true=%.3f, est=%.3f", true_var_f1, est_var_f1)) # Check SE params have positive std_errors se_param_names <- c("se_linear_1", "se_intercept", "se_residual_var", "factor_var_1") se_std_errors <- result_stage2$std_errors[se_param_names] expect_true(all(se_std_errors > 0), info = "SE parameters should have positive std_errors") if (VERBOSE) { cat("\n=== Stage 2 (SE_linear) ===\n") cat(sprintf("Convergence: %d\n", result_stage2$convergence)) cat(sprintf("Log-likelihood: %.4f\n", result_stage2$loglik)) cat(sprintf("Gradient max error: %.2e\n", grad_check_s2$max_error)) cat(sprintf("Hessian max error: %.2e\n", hess_check_s2$max_error)) cat(sprintf("Meas params preserved: max diff = %.2e\n", max_meas_diff)) cat(sprintf("Meas SEs preserved: max diff = %.2e\n", max_se_diff)) cat("\nSE Parameter Recovery:\n") cat(sprintf(" se_linear_1: true=%.3f, est=%.3f (%.1f%%)\n", true_se_linear, est_se_linear, 100*est_se_linear/true_se_linear)) cat(sprintf(" se_intercept: true=%.3f, est=%.3f\n", true_se_intercept, est_se_intercept)) cat(sprintf(" se_residual_var: true=%.3f, est=%.3f (%.1f%%)\n", true_se_residual_var, est_se_resvar, 100*est_se_resvar/true_se_residual_var)) cat(sprintf(" factor_var_1: true=%.3f, est=%.3f (%.1f%%)\n", true_var_f1, est_var_f1, 100*est_var_f1/true_var_f1)) } }) # ============================================================================== # Test H: Structural Equation Model (SE_quadratic) # f2 = se_intercept + se_linear_1 * f1 + se_quadratic_1 * f1^2 + epsilon # ============================================================================== test_that("Model H: SE_quadratic structural equation model with 2 factors", { skip_on_cran() set.seed(202) # Simulate data n <- 500 # True parameters for factor structure true_var_f1 <- 1.5 # Variance of input factor true_se_intercept <- 0.5 # SE intercept true_se_linear <- 0.8 # SE linear coefficient true_se_quadratic <- 0.2 # SE quadratic coefficient true_se_residual_var <- 0.5 # SE residual variance # Generate factors f1 <- rnorm(n, 0, sqrt(true_var_f1)) eps <- rnorm(n, 0, sqrt(true_se_residual_var)) f2 <- true_se_intercept + true_se_linear * f1 + true_se_quadratic * f1^2 + eps # Generate measurements for f1 (3 measures) Y1_1 <- 2.0 + 1.0 * f1 + rnorm(n, 0, 1.0) # int=2, loading=1 (fixed), sigma=1 Y1_2 <- 1.5 + 0.8 * f1 + rnorm(n, 0, 0.9) # int=1.5, loading=0.8, sigma=0.9 Y1_3 <- 1.0 + 1.2 * f1 + rnorm(n, 0, 1.1) # int=1, loading=1.2, sigma=1.1 # Generate measurements for f2 (3 measures) Y2_1 <- 1.8 + 1.0 * f2 + rnorm(n, 0, 1.0) # int=1.8, loading=1 (fixed), sigma=1 Y2_2 <- 1.2 + 0.7 * f2 + rnorm(n, 0, 0.8) # int=1.2, loading=0.7, sigma=0.8 Y2_3 <- 0.5 + 1.1 * f2 + rnorm(n, 0, 0.95) # int=0.5, loading=1.1, sigma=0.95 dat <- data.frame( intercept = 1, Y1_1 = Y1_1, Y1_2 = Y1_2, Y1_3 = Y1_3, Y2_1 = Y2_1, Y2_2 = Y2_2, Y2_3 = Y2_3, eval = 1 ) # Create model system with 2 factors and SE_quadratic structure fm <- define_factor_model(n_factors = 2, factor_structure = "SE_quadratic") # 3 measurement equations for factor 1 mc1_1 <- define_model_component( name = "Y1_1", data = dat, outcome = "Y1_1", factor = fm, covariates = "intercept", model_type = "linear", loading_normalization = c(1, 0), # Fix loading to 1 for f1, 0 for f2 evaluation_indicator = "eval" ) mc1_2 <- define_model_component( name = "Y1_2", data = dat, outcome = "Y1_2", factor = fm, covariates = "intercept", model_type = "linear", loading_normalization = c(NA_real_, 0), # Free for f1, 0 for f2 evaluation_indicator = "eval" ) mc1_3 <- define_model_component( name = "Y1_3", data = dat, outcome = "Y1_3", factor = fm, covariates = "intercept", model_type = "linear", loading_normalization = c(NA_real_, 0), # Free for f1, 0 for f2 evaluation_indicator = "eval" ) # 3 measurement equations for factor 2 mc2_1 <- define_model_component( name = "Y2_1", data = dat, outcome = "Y2_1", factor = fm, covariates = "intercept", model_type = "linear", loading_normalization = c(0, 1), # 0 for f1, fix to 1 for f2 evaluation_indicator = "eval" ) mc2_2 <- define_model_component( name = "Y2_2", data = dat, outcome = "Y2_2", factor = fm, covariates = "intercept", model_type = "linear", loading_normalization = c(0, NA_real_), # 0 for f1, free for f2 evaluation_indicator = "eval" ) mc2_3 <- define_model_component( name = "Y2_3", data = dat, outcome = "Y2_3", factor = fm, covariates = "intercept", model_type = "linear", loading_normalization = c(0, NA_real_), # 0 for f1, free for f2 evaluation_indicator = "eval" ) ms <- define_model_system( components = list(mc1_1, mc1_2, mc1_3, mc2_1, mc2_2, mc2_3), factor = fm ) # True parameters: # factor_var_1, se_intercept, se_linear_1, se_quadratic_1, se_residual_var, # Y1_1_int, Y1_1_sigma, (loading fixed to 1) # Y1_2_int, Y1_2_lambda, Y1_2_sigma, # Y1_3_int, Y1_3_lambda, Y1_3_sigma, # Y2_1_int, Y2_1_sigma, (loading fixed to 1) # Y2_2_int, Y2_2_lambda, Y2_2_sigma, # Y2_3_int, Y2_3_lambda, Y2_3_sigma true_params <- c( true_var_f1, # factor_var_1 = 1.5 true_se_intercept, true_se_linear, true_se_quadratic, true_se_residual_var, # SE params 2.0, 1.0, # Y1_1: int, sigma (loading fixed to 1) 1.5, 0.8, 0.9, # Y1_2: int, lambda, sigma 1.0, 1.2, 1.1, # Y1_3: int, lambda, sigma 1.8, 1.0, # Y2_1: int, sigma (loading fixed to 1) 1.2, 0.7, 0.8, # Y2_2: int, lambda, sigma 0.5, 1.1, 0.95 # Y2_3: int, lambda, sigma ) param_names <- c( "f1_var", "se_intercept", "se_linear_1", "se_quadratic_1", "se_residual_var", "Y1_1_int", "Y1_1_sigma", "Y1_2_int", "Y1_2_lambda", "Y1_2_sigma", "Y1_3_int", "Y1_3_lambda", "Y1_3_sigma", "Y2_1_int", "Y2_1_sigma", "Y2_2_int", "Y2_2_lambda", "Y2_2_sigma", "Y2_3_int", "Y2_3_lambda", "Y2_3_sigma" ) # Run estimation to get param_fixed est_comp <- run_estimation_comparison( ms, dat, true_params, param_names = param_names, verbose = VERBOSE ) # Run gradient/Hessian checks with param_fixed grad_check <- check_gradient_accuracy(ms, dat, true_params, param_fixed = est_comp$param_fixed, tol = GRAD_TOL, verbose = VERBOSE, n_quad = 8) hess_check <- check_hessian_accuracy(ms, dat, true_params, param_fixed = est_comp$param_fixed, tol = HESS_TOL, verbose = VERBOSE, n_quad = 8) # Collect diagnostics diagnostics <- list( gradient_check = grad_check, hessian_check = hess_check, estimation = est_comp, overall_pass = grad_check$pass && hess_check$pass && est_comp$default_converged && est_comp$reasonable_default ) # Save log if (SAVE_LOGS) { log_file <- save_diagnostics_to_log("test_H_se_quadratic", diagnostics) if (VERBOSE) cat("Log saved to:", log_file, "\n") } # Assertions - focus on gradient and Hessian correctness # Estimation convergence with default init is not strictly required for this test expect_true(grad_check$pass, info = sprintf("Gradient check failed (max error: %.2e)", grad_check$max_error)) expect_true(hess_check$pass, info = sprintf("Hessian check failed (max error: %.2e)", hess_check$max_error)) }) # ============================================================================== # Test I: Equality Constraints (Measurement Invariance) with SE_quadratic # ============================================================================== test_that("Model I: SE_quadratic with equality constraints (measurement invariance)", { skip_on_cran() set.seed(456) # True parameters n <- 800 true_var_f1 <- 1.0 true_se_intercept <- 0.0 # Set to 0 for identification true_se_linear <- 0.7 true_se_quadratic <- 0.15 true_se_residual_var <- 0.4 # Common measurement parameters (invariance across time points) true_loading_2 <- 0.8 true_loading_3 <- 1.2 true_sigma_1 <- 1.0 true_sigma_2 <- 0.9 true_sigma_3 <- 1.1 # Generate factors f1 <- rnorm(n, 0, sqrt(true_var_f1)) eps <- rnorm(n, 0, sqrt(true_se_residual_var)) f2 <- true_se_intercept + true_se_linear * f1 + true_se_quadratic * f1^2 + eps # Measurements at time 1 (factor 1) Y1_1 <- 1.0 * f1 + rnorm(n, 0, true_sigma_1) Y1_2 <- true_loading_2 * f1 + rnorm(n, 0, true_sigma_2) Y1_3 <- true_loading_3 * f1 + rnorm(n, 0, true_sigma_3) # Measurements at time 2 (factor 2) - SAME loadings and sigmas Y2_1 <- 1.0 * f2 + rnorm(n, 0, true_sigma_1) Y2_2 <- true_loading_2 * f2 + rnorm(n, 0, true_sigma_2) Y2_3 <- true_loading_3 * f2 + rnorm(n, 0, true_sigma_3) dat <- data.frame(Y1_1, Y1_2, Y1_3, Y2_1, Y2_2, Y2_3, intercept = 1, eval = 1) # Model setup fm <- define_factor_model(n_factors = 2, factor_structure = "SE_quadratic") mc1_1 <- define_model_component(name = "Y1_1", data = dat, outcome = "Y1_1", factor = fm, covariates = "intercept", model_type = "linear", loading_normalization = c(1, 0), evaluation_indicator = "eval") mc1_2 <- define_model_component(name = "Y1_2", data = dat, outcome = "Y1_2", factor = fm, covariates = "intercept", model_type = "linear", loading_normalization = c(NA_real_, 0), evaluation_indicator = "eval") mc1_3 <- define_model_component(name = "Y1_3", data = dat, outcome = "Y1_3", factor = fm, covariates = "intercept", model_type = "linear", loading_normalization = c(NA_real_, 0), evaluation_indicator = "eval") mc2_1 <- define_model_component(name = "Y2_1", data = dat, outcome = "Y2_1", factor = fm, covariates = "intercept", model_type = "linear", loading_normalization = c(0, 1), evaluation_indicator = "eval") mc2_2 <- define_model_component(name = "Y2_2", data = dat, outcome = "Y2_2", factor = fm, covariates = "intercept", model_type = "linear", loading_normalization = c(0, NA_real_), evaluation_indicator = "eval") mc2_3 <- define_model_component(name = "Y2_3", data = dat, outcome = "Y2_3", factor = fm, covariates = "intercept", model_type = "linear", loading_normalization = c(0, NA_real_), evaluation_indicator = "eval") # Define model system WITH equality constraints (measurement invariance) ms <- define_model_system( components = list(mc1_1, mc1_2, mc1_3, mc2_1, mc2_2, mc2_3), factor = fm, equality_constraints = list( c("Y1_2_loading_1", "Y2_2_loading_2"), # Same loading for item 2 c("Y1_3_loading_1", "Y2_3_loading_2"), # Same loading for item 3 c("Y1_1_sigma", "Y2_1_sigma"), # Same error variance for item 1 c("Y1_2_sigma", "Y2_2_sigma"), # Same error variance for item 2 c("Y1_3_sigma", "Y2_3_sigma") # Same error variance for item 3 ) ) # Test 1: Verify equality constraints are stored correctly expect_equal(length(ms$equality_constraints), 5) # Test 2: Run estimation control <- define_estimation_control(n_quad_points = 8, num_cores = 1) result <- estimate_model_rcpp(model_system = ms, data = dat, control = control, optimizer = "nlminb", verbose = FALSE) # Test 3: Verify tied parameters are exactly equal (use unname() to compare values, not named vectors) expect_equal(unname(result$estimates["Y1_2_loading_1"]), unname(result$estimates["Y2_2_loading_2"])) expect_equal(unname(result$estimates["Y1_3_loading_1"]), unname(result$estimates["Y2_3_loading_2"])) expect_equal(unname(result$estimates["Y1_1_sigma"]), unname(result$estimates["Y2_1_sigma"])) expect_equal(unname(result$estimates["Y1_2_sigma"]), unname(result$estimates["Y2_2_sigma"])) expect_equal(unname(result$estimates["Y1_3_sigma"]), unname(result$estimates["Y2_3_sigma"])) # Test 4: Verify param_table shows tied_to column correctly expect_equal(result$param_table$tied_to[result$param_table$name == "Y2_2_loading_2"], "Y1_2_loading_1") expect_equal(result$param_table$tied_to[result$param_table$name == "Y2_1_sigma"], "Y1_1_sigma") # Test 5: Parameter recovery (check key parameters are within tolerance) # Slightly relaxed tolerance to account for random variation in Monte Carlo simulation tolerance <- 0.20 expect_lt(abs(result$estimates["factor_var_1"] - true_var_f1), tolerance) expect_lt(abs(result$estimates["se_linear_1"] - true_se_linear), tolerance) expect_lt(abs(result$estimates["se_quadratic_1"] - true_se_quadratic), tolerance) expect_lt(abs(result$estimates["se_residual_var"] - true_se_residual_var), tolerance) expect_lt(abs(result$estimates["Y1_2_loading_1"] - true_loading_2), tolerance) expect_lt(abs(result$estimates["Y1_3_loading_1"] - true_loading_3), tolerance) expect_lt(abs(result$estimates["Y1_1_sigma"] - true_sigma_1), tolerance) expect_lt(abs(result$estimates["Y1_2_sigma"] - true_sigma_2), tolerance) expect_lt(abs(result$estimates["Y1_3_sigma"] - true_sigma_3), tolerance) # Test 6: FD gradient check with equality constraints # The C++ gradient needs to be aggregated for tied params # Note: C++ now returns gradients for FREE params only, so we need to map back to full data_mat <- as.matrix(dat) init_result <- initialize_parameters(ms, dat, verbose = FALSE) params <- init_result$init_params # Set to true values for gradient check params[1] <- true_var_f1 params[2] <- true_se_intercept params[3] <- true_se_linear params[4] <- true_se_quadratic params[5] <- true_se_residual_var params[9] <- true_loading_2 # Y1_2_loading_1 params[12] <- true_loading_3 # Y1_3_loading_1 params[7] <- true_sigma_1 # Y1_1_sigma params[10] <- true_sigma_2 # Y1_2_sigma params[13] <- true_sigma_3 # Y1_3_sigma # Set tied params equal params[17] <- true_loading_2 # Y2_2_loading_2 params[20] <- true_loading_3 # Y2_3_loading_2 params[15] <- true_sigma_1 # Y2_1_sigma params[18] <- true_sigma_2 # Y2_2_sigma params[21] <- true_sigma_3 # Y2_3_sigma fm_ptr <- initialize_factor_model_cpp(ms, data_mat, 8, params) # Extract free params (tied params are now fixed in C++) params_free <- extract_free_params_cpp(fm_ptr, params) cpp_result <- evaluate_likelihood_cpp(fm_ptr, params_free, compute_gradient = TRUE, compute_hessian = FALSE) # Map gradient from free params back to full params # Free indices are all params except tied ones (17, 20, 15, 18, 21) # free_idx = c(1:14, 16, 19) = 16 free params free_idx <- c(1:14, 16, 19) analytical_grad <- rep(0, length(params)) analytical_grad[free_idx] <- cpp_result$gradient # No need for manual aggregation since C++ already excludes tied params # The gradient for primary params now directly includes the contribution aggregated_analytical <- analytical_grad # Compute FD gradient with constraints applied eps_fd <- 1e-5 fd_grad <- numeric(length(params)) for (i in seq_along(params)) { params_plus <- params params_minus <- params params_plus[i] <- params[i] + eps_fd params_minus[i] <- params[i] - eps_fd # Apply equality constraints if (i == 9) { params_plus[17] <- params_plus[9]; params_minus[17] <- params_minus[9] } if (i == 12) { params_plus[20] <- params_plus[12]; params_minus[20] <- params_minus[12] } if (i == 7) { params_plus[15] <- params_plus[7]; params_minus[15] <- params_minus[7] } if (i == 10) { params_plus[18] <- params_plus[10]; params_minus[18] <- params_minus[10] } if (i == 13) { params_plus[21] <- params_plus[13]; params_minus[21] <- params_minus[13] } # Need to extract free params for the perturbed params params_plus_free <- extract_free_params_cpp(fm_ptr, params_plus) params_minus_free <- extract_free_params_cpp(fm_ptr, params_minus) ll_plus <- evaluate_loglik_only_cpp(fm_ptr, params_plus_free) ll_minus <- evaluate_loglik_only_cpp(fm_ptr, params_minus_free) fd_grad[i] <- (ll_plus - ll_minus) / (2 * eps_fd) } # Check primary parameters only (not derived ones) primary_indices <- c(1:14, 16, 19) max_rel_err <- 0 worst_idx <- 0 cat("\n=== DEBUG: Gradient comparison ===\n") for (i in primary_indices) { ana <- aggregated_analytical[i] fd <- fd_grad[i] rel_err <- if (abs(fd) > 1e-6) abs(ana - fd) / abs(fd) else abs(ana - fd) cat(sprintf(" param %2d: ana=%.6f, fd=%.6f, rel_err=%.2e\n", i, ana, fd, rel_err)) if (!is.na(rel_err) && rel_err > max_rel_err) { max_rel_err <- rel_err worst_idx <- i } } cat(sprintf("Worst param: %d, max_rel_err: %.2e\n", worst_idx, max_rel_err)) expect_lt(max_rel_err, 1e-4, label = sprintf("Gradient max error: %.2e", max_rel_err)) if (VERBOSE) { cat("Test I: Equality constraints (measurement invariance)\n") cat(" Equality constraints: 5 groups\n") cat(" Free parameters: 16 (21 total - 5 tied)\n") cat(sprintf(" Gradient max rel error: %.2e\n", max_rel_err)) cat(sprintf(" SE_linear recovery: true=%.3f, est=%.3f\n", true_se_linear, result$estimates["se_linear_1"])) cat(sprintf(" Loading_2 recovery: true=%.3f, est=%.3f\n", true_loading_2, result$estimates["Y1_2_loading_1"])) } }) # ===================================================================== # Test J: Two-stage model with multiple outcome types and fixed coefficients # - Stage 1: Linear measurement system for factor 1 # - Stage 2: Various outcome models (linear, probit, oprobit, mlogit, exploded logit) # with at least one fixed coefficient in each # ===================================================================== test_that("Test J: Two-stage with multiple outcome types and fixed coefficients", { skip_on_cran() set.seed(54321) n <- 400 # Generate factor f1 <- rnorm(n) # Stage 1: Linear measurement equations for factor 1 Y_m1 <- 1.0 * f1 + rnorm(n, 0, 0.5) # loading fixed to 1 Y_m2 <- 0.8 * f1 + rnorm(n, 0, 0.6) # loading = 0.8 # Covariates for stage 2 x1 <- rnorm(n) x2 <- rnorm(n) # True parameters for stage 2 models true_beta_linear <- c(0.5, 0.3) # intercept, x1 coefficient true_loading_linear <- 0.7 true_sigma_linear <- 0.8 true_beta_probit <- c(-0.2, 0.4) # intercept, x1 coefficient true_loading_probit <- 0.6 true_beta_oprobit <- c(0.0, 0.5) # intercept=0 (fixed), x1 coefficient true_loading_oprobit <- 0.5 true_thresholds <- c(-0.5, 0.5) # 3 categories -> 2 thresholds true_beta_mlogit <- matrix(c( 0.3, 0.2, # Choice 1: intercept, x1 0.0, 0.4 # Choice 2: intercept=0 (fixed), x1 ), nrow = 2, byrow = TRUE) true_loading_mlogit <- c(0.4, 0.5) # loadings for choices 1, 2 true_beta_explogit <- matrix(c( 0.2, 0.0, # Choice 1: intercept, x1=0 (fixed) 0.1, 0.3 # Choice 2: intercept, x1 ), nrow = 2, byrow = TRUE) true_loading_explogit <- c(0.3, 0.4) # Generate outcomes # Linear outcome Y_linear <- true_beta_linear[1] + true_beta_linear[2] * x1 + true_loading_linear * f1 + rnorm(n, 0, true_sigma_linear) # Probit outcome latent_probit <- true_beta_probit[1] + true_beta_probit[2] * x1 + true_loading_probit * f1 + rnorm(n, 0, 1) Y_probit <- as.integer(latent_probit > 0) # Ordered probit outcome (3 categories) - store as numeric integers for matrix conversion latent_oprobit <- true_beta_oprobit[1] + true_beta_oprobit[2] * x1 + true_loading_oprobit * f1 + rnorm(n, 0, 1) Y_oprobit <- as.integer(cut(latent_oprobit, breaks = c(-Inf, true_thresholds, Inf), labels = FALSE)) # Multinomial logit outcome (3 choices, choice 0 is reference) u0 <- 0 # Reference u1 <- true_beta_mlogit[1,1] + true_beta_mlogit[1,2] * x1 + true_loading_mlogit[1] * f1 + rlogis(n, 0, 1) u2 <- true_beta_mlogit[2,1] + true_beta_mlogit[2,2] * x1 + true_loading_mlogit[2] * f1 + rlogis(n, 0, 1) Y_mlogit <- apply(cbind(u0, u1, u2), 1, which.max) # Exploded logit outcome (rank 2 alternatives from 3 choices) # Compute utilities for all 3 choices v0 <- 0 # Reference v1 <- true_beta_explogit[1,1] + true_beta_explogit[1,2] * x1 + true_loading_explogit[1] * f1 + rlogis(n, 0, 1) v2 <- true_beta_explogit[2,1] + true_beta_explogit[2,2] * x1 + true_loading_explogit[2] * f1 + rlogis(n, 0, 1) utils <- cbind(v0, v1, v2) # Generate rankings (top 2 ranks) rankings <- t(apply(utils, 1, function(u) order(u, decreasing = TRUE)[1:2])) Y_rank1 <- rankings[, 1] Y_rank2 <- rankings[, 2] # Create data frame dat <- data.frame( Y_m1 = Y_m1, Y_m2 = Y_m2, Y_linear = Y_linear, Y_probit = Y_probit, Y_oprobit = Y_oprobit, Y_mlogit = Y_mlogit, Y_rank1 = Y_rank1, Y_rank2 = Y_rank2, x1 = x1, x2 = x2, intercept = 1, eval = 1 ) # Define factor model (single factor) fm <- define_factor_model(n_factors = 1) # Stage 1: Measurement equations mc_m1 <- define_model_component(name = "m1", data = dat, outcome = "Y_m1", factor = fm, covariates = "intercept", model_type = "linear", loading_normalization = 1, evaluation_indicator = "eval") mc_m2 <- define_model_component(name = "m2", data = dat, outcome = "Y_m2", factor = fm, covariates = "intercept", model_type = "linear", loading_normalization = NA_real_, evaluation_indicator = "eval") # Stage 2: Outcome models with fixed coefficients # Linear with x2 coefficient fixed to 0 mc_linear <- define_model_component(name = "linear", data = dat, outcome = "Y_linear", factor = fm, covariates = c("intercept", "x1", "x2"), model_type = "linear", loading_normalization = NA_real_, evaluation_indicator = "eval") mc_linear <- fix_coefficient(mc_linear, "x2", 0) # Probit with x2 coefficient fixed to 0 mc_probit <- define_model_component(name = "probit", data = dat, outcome = "Y_probit", factor = fm, covariates = c("intercept", "x1", "x2"), model_type = "probit", loading_normalization = NA_real_, evaluation_indicator = "eval") mc_probit <- fix_coefficient(mc_probit, "x2", 0) # Ordered probit with x2 coefficient fixed to 0 (3 categories). # Note: oprobit has no intercept covariate — the intercept is absorbed into # the cut points. So we fix x2 instead to exercise fix_coefficient() here. mc_oprobit <- define_model_component(name = "oprobit", data = dat, outcome = "Y_oprobit", factor = fm, covariates = c("x1", "x2"), model_type = "oprobit", num_choices = 3, loading_normalization = NA_real_, evaluation_indicator = "eval") mc_oprobit <- fix_coefficient(mc_oprobit, "x2", 0) # Multinomial logit with choice 2 intercept fixed to 0 mc_mlogit <- define_model_component(name = "mlogit", data = dat, outcome = "Y_mlogit", factor = fm, covariates = c("intercept", "x1"), model_type = "logit", num_choices = 3, loading_normalization = NA_real_, evaluation_indicator = "eval") mc_mlogit <- fix_coefficient(mc_mlogit, "intercept", 0, choice = 2) # Exploded logit with choice 1 x1 coefficient fixed to 0 mc_explogit <- define_model_component(name = "explogit", data = dat, outcome = c("Y_rank1", "Y_rank2"), factor = fm, covariates = c("intercept", "x1"), model_type = "logit", num_choices = 3, loading_normalization = NA_real_, evaluation_indicator = "eval") mc_explogit <- fix_coefficient(mc_explogit, "x1", 0, choice = 1) # Define model system ms <- define_model_system( components = list(mc_m1, mc_m2, mc_linear, mc_probit, mc_oprobit, mc_mlogit, mc_explogit), factor = fm ) # Initialize and estimate control <- define_estimation_control(n_quad_points = 8, num_cores = 1) result <- estimate_model_rcpp(model_system = ms, data = dat, control = control, optimizer = "nlminb", verbose = FALSE) # Test 1: Check that fixed coefficients are exactly at their fixed values expect_equal(unname(result$estimates["linear_x2"]), 0, tolerance = 1e-10) expect_equal(unname(result$estimates["probit_x2"]), 0, tolerance = 1e-10) expect_equal(unname(result$estimates["oprobit_x2"]), 0, tolerance = 1e-10) expect_equal(unname(result$estimates["mlogit_c2_intercept"]), 0, tolerance = 1e-10) expect_equal(unname(result$estimates["explogit_c1_x1"]), 0, tolerance = 1e-10) # Test 2: FD gradient check data_mat <- as.matrix(dat) init_result <- initialize_parameters(ms, dat, verbose = FALSE) params <- init_result$init_params fm_ptr <- initialize_factor_model_cpp(ms, data_mat, 8, params) params_free <- extract_free_params_cpp(fm_ptr, params) cpp_result <- evaluate_likelihood_cpp(fm_ptr, params_free, compute_gradient = TRUE, compute_hessian = FALSE) # Compute FD gradient eps <- 1e-5 fd_grad <- numeric(length(params_free)) for (i in seq_along(params_free)) { params_plus <- params_free params_minus <- params_free params_plus[i] <- params_free[i] + eps params_minus[i] <- params_free[i] - eps ll_plus <- evaluate_loglik_only_cpp(fm_ptr, params_plus) ll_minus <- evaluate_loglik_only_cpp(fm_ptr, params_minus) fd_grad[i] <- (ll_plus - ll_minus) / (2 * eps) } # Check gradient accuracy max_rel_err <- 0 for (i in seq_along(params_free)) { ana <- cpp_result$gradient[i] fd <- fd_grad[i] rel_err <- if (abs(fd) > 1e-6) abs(ana - fd) / abs(fd) else abs(ana - fd) if (!is.na(rel_err) && rel_err > max_rel_err) max_rel_err <- rel_err } expect_lt(max_rel_err, 1e-4, label = sprintf("Gradient max error: %.2e", max_rel_err)) # Test 3: FD Hessian check (all elements) cpp_hess_result <- evaluate_likelihood_cpp(fm_ptr, params_free, compute_gradient = FALSE, compute_hessian = TRUE) hess_vec <- cpp_hess_result$hessian # Expand upper triangular to full matrix n_free <- length(params_free) hess_mat <- matrix(0, n_free, n_free) idx <- 1 for (i in 1:n_free) { for (j in i:n_free) { hess_mat[i, j] <- hess_vec[idx] hess_mat[j, i] <- hess_vec[idx] idx <- idx + 1 } } # Compute FD Hessian by differentiating gradient (all elements) fd_hess <- matrix(0, n_free, n_free) grad_base <- evaluate_likelihood_cpp(fm_ptr, params_free, compute_gradient = TRUE, compute_hessian = FALSE)$gradient for (i in seq_len(n_free)) { params_plus <- params_free h <- eps * (abs(params_free[i]) + 1.0) params_plus[i] <- params_free[i] + h grad_plus <- evaluate_likelihood_cpp(fm_ptr, params_plus, compute_gradient = TRUE, compute_hessian = FALSE)$gradient fd_hess[i, ] <- (grad_plus - grad_base) / h } # Symmetrize FD Hessian fd_hess <- (fd_hess + t(fd_hess)) / 2 # Check all Hessian elements (upper triangle) max_hess_err <- 0 for (i in seq_len(n_free)) { for (j in i:n_free) { ana <- hess_mat[i, j] fd <- fd_hess[i, j] abs_err <- abs(ana - fd) # For near-zero elements, use absolute error; otherwise use relative error if (abs(ana) < 1e-6 && abs(fd) < 1e-6) { rel_err <- abs_err } else { rel_err <- abs_err / max(abs(ana), abs(fd)) } if (!is.na(rel_err) && rel_err > max_hess_err) max_hess_err <- rel_err } } # Tolerance of 1e-2 for full Hessian check (cross-derivatives have higher FD error) expect_lt(max_hess_err, 1e-2, label = sprintf("Hessian max error: %.2e", max_hess_err)) # Test 4: Parameter recovery (with relaxed tolerance due to finite sample) tolerance <- 0.3 # Factor variance (should be close to 1) expect_lt(abs(result$estimates["factor_var_1"] - 1.0), tolerance) # Measurement loading for m2 (should be close to 0.8) expect_lt(abs(result$estimates["m2_loading_1"] - 0.8), tolerance) # Linear model parameters expect_lt(abs(result$estimates["linear_intercept"] - true_beta_linear[1]), tolerance) expect_lt(abs(result$estimates["linear_x1"] - true_beta_linear[2]), tolerance) expect_lt(abs(result$estimates["linear_loading_1"] - true_loading_linear), tolerance) # Probit model parameters expect_lt(abs(result$estimates["probit_intercept"] - true_beta_probit[1]), tolerance) expect_lt(abs(result$estimates["probit_x1"] - true_beta_probit[2]), tolerance) if (VERBOSE) { cat("\nTest J: Two-stage with multiple outcome types and fixed coefficients\n") cat(" Stage 1: 2 linear measurement equations\n") cat(" Stage 2: linear, probit, oprobit, mlogit, exploded logit\n") cat(" Fixed coefficients: 5 (one per stage 2 model)\n") cat(sprintf(" Gradient max rel error: %.2e\n", max_rel_err)) cat(sprintf(" Hessian max rel error: %.2e\n", max_hess_err)) cat(sprintf(" Linear loading recovery: true=%.3f, est=%.3f\n", true_loading_linear, result$estimates["linear_loading_1"])) cat(sprintf(" m2 loading recovery: true=%.3f, est=%.3f\n", 0.8, result$estimates["m2_loading_1"])) } }) # ============================================================================= # Test K: Binary Logit Model (num_choices = 2) # ============================================================================= # This test specifically covers the binary logit case which requires special # handling for initialization (glm expects 0/1 but model uses 1/2 coding) test_that("Model K: Binary logit gradient, Hessian, and parameter recovery", { skip_on_cran() set.seed(54321) n_obs <- 500 # True parameters true_factor_var <- 1.0 true_beta <- c(0.5, 0.3) # intercept, x1 coefficient true_loading <- 0.7 true_meas_loading <- 0.8 true_meas_sigma <- 0.6 # Generate data factors <- rnorm(n_obs, 0, sqrt(true_factor_var)) x1 <- rnorm(n_obs) # Linear measurement equation (to identify factor) Y_meas <- true_meas_loading * factors + rnorm(n_obs, 0, true_meas_sigma) # Binary logit outcome # P(Y=2) = 1/(1 + exp(-(beta0 + beta1*x1 + lambda*f))) latent <- true_beta[1] + true_beta[2] * x1 + true_loading * factors prob <- 1 / (1 + exp(-latent)) # Outcomes must be 1-indexed (1 or 2) for C++ compatibility Y_logit <- rbinom(n_obs, 1, prob) + 1 # 1 or 2 (NOT 0/1) dat <- data.frame( Y_meas = Y_meas, Y_logit = Y_logit, x1 = x1, intercept = 1, eval = 1 ) # Define model fm <- define_factor_model(n_factors = 1, n_types = 1) # Measurement equation (identifies factor) mc_meas <- define_model_component( name = "meas", data = dat, outcome = "Y_meas", factor = fm, covariates = "intercept", model_type = "linear", loading_normalization = 1.0, evaluation_indicator = "eval" ) # Binary logit outcome (num_choices = 2) mc_logit <- define_model_component( name = "logit", data = dat, outcome = "Y_logit", factor = fm, covariates = c("intercept", "x1"), model_type = "logit", num_choices = 2, # Binary logit loading_normalization = NA_real_, evaluation_indicator = "eval" ) ms <- define_model_system(components = list(mc_meas, mc_logit), factor = fm) control <- define_estimation_control(n_quad_points = 12, num_cores = 1) # Test 1: Verify likelihood is finite (not -Inf which was the bug) fm_ptr <- initialize_factor_model_cpp(ms, dat, control$n_quad_points) init_result <- initialize_parameters(ms, dat, verbose = FALSE) init_params <- init_result$init_params result_init <- evaluate_likelihood_cpp(fm_ptr, init_params) expect_true(is.finite(result_init$logLikelihood), info = "Binary logit should produce finite log-likelihood after initialization") # Test 2: Full estimation result <- estimate_model_rcpp( model_system = ms, data = dat, init_params = init_params, control = control, optimizer = "nlminb", verbose = FALSE ) expect_equal(result$convergence, 0) expect_true(is.finite(result$loglik)) # Test 3: Gradient accuracy fm_ptr2 <- initialize_factor_model_cpp(ms, dat, control$n_quad_points) params <- result$estimates result_eval <- evaluate_likelihood_cpp(fm_ptr2, params, compute_gradient = TRUE, compute_hessian = FALSE) ana_grad <- result_eval$gradient # Finite difference gradient eps <- 1e-6 fd_grad <- numeric(length(params)) for (i in seq_along(params)) { params_plus <- params params_plus[i] <- params_plus[i] + eps ll_plus <- evaluate_loglik_only_cpp(fm_ptr2, params_plus) params_minus <- params params_minus[i] <- params_minus[i] - eps ll_minus <- evaluate_loglik_only_cpp(fm_ptr2, params_minus) fd_grad[i] <- (ll_plus - ll_minus) / (2 * eps) } max_rel_err <- 0 for (i in seq_along(params)) { ana <- ana_grad[i] fd <- fd_grad[i] abs_err <- abs(ana - fd) if (abs(ana) < 1e-6 && abs(fd) < 1e-6) { rel_err <- abs_err } else { rel_err <- abs_err / max(abs(ana), abs(fd)) } if (!is.na(rel_err) && rel_err > max_rel_err) max_rel_err <- rel_err } expect_lt(max_rel_err, 1e-5, label = sprintf("Gradient max error: %.2e", max_rel_err)) # Test 4: Hessian accuracy result_hess <- evaluate_likelihood_cpp(fm_ptr2, params, compute_gradient = TRUE, compute_hessian = TRUE) hess_vec <- result_hess$hessian n_params <- length(params) # Expand upper triangular to full matrix hess_mat <- matrix(0, n_params, n_params) idx <- 1 for (i in 1:n_params) { for (j in i:n_params) { hess_mat[i, j] <- hess_vec[idx] hess_mat[j, i] <- hess_vec[idx] idx <- idx + 1 } } # FD Hessian (using gradient differences) fd_hess <- matrix(0, n_params, n_params) grad_base <- evaluate_likelihood_cpp(fm_ptr2, params, compute_gradient = TRUE, compute_hessian = FALSE)$gradient for (i in seq_len(n_params)) { params_plus <- params h <- eps * (abs(params[i]) + 1.0) params_plus[i] <- params[i] + h grad_plus <- evaluate_likelihood_cpp(fm_ptr2, params_plus, compute_gradient = TRUE, compute_hessian = FALSE)$gradient fd_hess[i, ] <- (grad_plus - grad_base) / h } # Symmetrize FD Hessian fd_hess <- (fd_hess + t(fd_hess)) / 2 max_hess_err <- 0 for (i in seq_len(n_params)) { for (j in i:n_params) { ana <- hess_mat[i, j] fd <- fd_hess[i, j] abs_err <- abs(ana - fd) if (abs(ana) < 1e-6 && abs(fd) < 1e-6) { rel_err <- abs_err } else { rel_err <- abs_err / max(abs(ana), abs(fd)) } if (!is.na(rel_err) && rel_err > max_hess_err) max_hess_err <- rel_err } } expect_lt(max_hess_err, 1e-2, label = sprintf("Hessian max error: %.2e", max_hess_err)) # Test 5: Parameter recovery (with relaxed tolerance due to finite sample) tolerance <- 0.4 # Factor variance expect_lt(abs(result$estimates["factor_var_1"] - true_factor_var), tolerance, label = "Factor variance recovery") # Logit parameters (binary logit uses logit_intercept, not logit_c1_intercept) expect_lt(abs(result$estimates["logit_intercept"] - true_beta[1]), tolerance, label = "Logit intercept recovery") expect_lt(abs(result$estimates["logit_x1"] - true_beta[2]), tolerance, label = "Logit x1 coefficient recovery") expect_lt(abs(result$estimates["logit_loading_1"] - true_loading), tolerance, label = "Logit loading recovery") if (VERBOSE) { cat("\nTest K: Binary logit (num_choices = 2)\n") cat(" Outcome coding: 1/2 (1-indexed, as required by C++)\n") cat(sprintf(" Initial log-likelihood: %.4f (should be finite, not -Inf)\n", result_init$logLikelihood)) cat(sprintf(" Final log-likelihood: %.4f\n", result$loglik)) cat(sprintf(" Gradient max rel error: %.2e\n", max_rel_err)) cat(sprintf(" Hessian max rel error: %.2e\n", max_hess_err)) cat(sprintf(" Factor var: true=%.3f, est=%.3f\n", true_factor_var, result$estimates["factor_var_1"])) cat(sprintf(" Logit intercept: true=%.3f, est=%.3f\n", true_beta[1], result$estimates["logit_intercept"])) cat(sprintf(" Logit x1: true=%.3f, est=%.3f\n", true_beta[2], result$estimates["logit_x1"])) cat(sprintf(" Logit loading: true=%.3f, est=%.3f\n", true_loading, result$estimates["logit_loading_1"])) } })