# ============================================================================= # Test: HJV-style 3-Stage Estimation with Adaptive Quadrature # ============================================================================= # Stage 1: Measurement system (2 factors, 3 measures each) # Stage 2: Factor score estimation # Stage 3: Outcome equations (linear, probit, logit, oprobit) # Compare standard vs adaptive quadrature likelihoods test_that("HJV-style 3-stage estimation: adaptive vs standard quadrature", { skip_on_cran() set.seed(42) n <- 1000 # ========================================================================== # True Parameters # ========================================================================== # Factor variances true_var_f1 <- 1.0 true_var_f2 <- 0.8 # Measurement loadings (factor 1: measures 1-3, factor 2: measures 4-6) # First loading for each factor normalized to 1 true_loadings_f1 <- c(1.0, 0.8, 0.7) true_loadings_f2 <- c(1.0, 0.9, 0.6) # Measurement error SDs true_sigma_m <- c(0.5, 0.6, 0.7, 0.5, 0.55, 0.65) # Outcome parameters # Linear outcome true_linear_intercept <- 0.5 true_linear_beta <- 0.3 true_linear_loading_f1 <- 0.6 true_linear_loading_f2 <- 0.4 true_linear_sigma <- 0.4 # Probit outcome true_probit_intercept <- -0.2 true_probit_beta <- 0.4 true_probit_loading_f1 <- 0.5 true_probit_loading_f2 <- 0.3 # Logit outcome (3 choices) true_logit_intercept <- c(0.3, -0.1) # 2 non-reference choices true_logit_beta <- c(0.2, 0.3) true_logit_loading_f1 <- c(0.4, 0.5) true_logit_loading_f2 <- c(0.3, 0.2) # ========================================================================== # Simulate Data # ========================================================================== # Generate factors f1 <- rnorm(n, 0, sqrt(true_var_f1)) f2 <- rnorm(n, 0, sqrt(true_var_f2)) # Covariate x <- rnorm(n, 0, 1) # Measurement equations (factor 1) m1 <- true_loadings_f1[1] * f1 + rnorm(n, 0, true_sigma_m[1]) m2 <- true_loadings_f1[2] * f1 + rnorm(n, 0, true_sigma_m[2]) m3 <- true_loadings_f1[3] * f1 + rnorm(n, 0, true_sigma_m[3]) # Measurement equations (factor 2) m4 <- true_loadings_f2[1] * f2 + rnorm(n, 0, true_sigma_m[4]) m5 <- true_loadings_f2[2] * f2 + rnorm(n, 0, true_sigma_m[5]) m6 <- true_loadings_f2[3] * f2 + rnorm(n, 0, true_sigma_m[6]) # Linear outcome y_linear <- true_linear_intercept + true_linear_beta * x + true_linear_loading_f1 * f1 + true_linear_loading_f2 * f2 + rnorm(n, 0, true_linear_sigma) # Probit outcome latent_probit <- true_probit_intercept + true_probit_beta * x + true_probit_loading_f1 * f1 + true_probit_loading_f2 * f2 + rnorm(n, 0, 1) y_probit <- as.integer(latent_probit > 0) # Logit outcome (3 choices: 0, 1, 2) # V_j = intercept_j + beta_j * x + loading_f1_j * f1 + loading_f2_j * f2 V0 <- 0 # Reference category V1 <- true_logit_intercept[1] + true_logit_beta[1] * x + true_logit_loading_f1[1] * f1 + true_logit_loading_f2[1] * f2 V2 <- true_logit_intercept[2] + true_logit_beta[2] * x + true_logit_loading_f1[2] * f1 + true_logit_loading_f2[2] * f2 # Add type 1 extreme value errors and take argmax eps_logit <- matrix(-log(-log(runif(n * 3))), ncol = 3) U <- cbind(V0 + eps_logit[,1], V1 + eps_logit[,2], V2 + eps_logit[,3]) y_logit <- apply(U, 1, which.max) # 1, 2, or 3 (1-indexed) # Create data frame (excluding oprobit for now to simplify debugging) dat <- data.frame( m1 = m1, m2 = m2, m3 = m3, m4 = m4, m5 = m5, m6 = m6, x = x, y_linear = y_linear, y_probit = y_probit, y_logit = y_logit, intercept = 1 ) # ========================================================================== # Stage 1: Estimate Measurement System # ========================================================================== message("\n========== STAGE 1: Measurement System ==========") fm <- define_factor_model(n_factors = 2) # Factor 1 measures (loading normalized on m1) mc_m1 <- define_model_component( name = "m1", data = dat, outcome = "m1", factor = fm, covariates = "intercept", model_type = "linear", loading_normalization = c(1, 0) # Fix loading_f1=1, loading_f2=0 ) mc_m2 <- define_model_component( name = "m2", data = dat, outcome = "m2", factor = fm, covariates = "intercept", model_type = "linear", loading_normalization = c(NA_real_, 0) # Free loading_f1, loading_f2=0 ) mc_m3 <- define_model_component( name = "m3", data = dat, outcome = "m3", factor = fm, covariates = "intercept", model_type = "linear", loading_normalization = c(NA_real_, 0) ) # Factor 2 measures (loading normalized on m4) mc_m4 <- define_model_component( name = "m4", data = dat, outcome = "m4", factor = fm, covariates = "intercept", model_type = "linear", loading_normalization = c(0, 1) # loading_f1=0, fix loading_f2=1 ) mc_m5 <- define_model_component( name = "m5", data = dat, outcome = "m5", factor = fm, covariates = "intercept", model_type = "linear", loading_normalization = c(0, NA_real_) ) mc_m6 <- define_model_component( name = "m6", data = dat, outcome = "m6", factor = fm, covariates = "intercept", model_type = "linear", loading_normalization = c(0, NA_real_) ) ms_stage1 <- define_model_system( components = list(mc_m1, mc_m2, mc_m3, mc_m4, mc_m5, mc_m6), factor = fm ) control_stage1 <- define_estimation_control(n_quad_points = 12, num_cores = 1) result_stage1 <- estimate_model_rcpp( model_system = ms_stage1, data = dat, control = control_stage1, optimizer = "nlminb", parallel = FALSE, verbose = TRUE ) expect_equal(result_stage1$convergence, 0, label = "Stage 1 converged") message("\nStage 1 estimates:") print(round(result_stage1$estimates, 4)) # ========================================================================== # Stage 2: Factor Score Estimation # ========================================================================== message("\n========== STAGE 2: Factor Score Estimation ==========") fscores <- estimate_factorscores_rcpp( result = result_stage1, data = dat, control = control_stage1, verbose = TRUE ) # Extract matrices factor_scores_mat <- as.matrix(fscores[, c("factor_1", "factor_2")]) factor_ses_mat <- as.matrix(fscores[, c("se_factor_1", "se_factor_2")]) # Extract factor variances from Stage 1 factor_var_1 <- as.numeric(result_stage1$estimates["factor_var_1"]) factor_var_2 <- as.numeric(result_stage1$estimates["factor_var_2"]) factor_vars_vec <- c(factor_var_1, factor_var_2) message("\nFactor variances from Stage 1:") message(sprintf(" factor_var_1 = %.4f (true = %.4f)", factor_var_1, true_var_f1)) message(sprintf(" factor_var_2 = %.4f (true = %.4f)", factor_var_2, true_var_f2)) message("\nFactor score summary:") message(sprintf(" Factor 1: mean=%.3f, sd=%.3f, SE range=[%.3f, %.3f]", mean(factor_scores_mat[,1]), sd(factor_scores_mat[,1]), min(factor_ses_mat[,1]), max(factor_ses_mat[,1]))) message(sprintf(" Factor 2: mean=%.3f, sd=%.3f, SE range=[%.3f, %.3f]", mean(factor_scores_mat[,2]), sd(factor_scores_mat[,2]), min(factor_ses_mat[,2]), max(factor_ses_mat[,2]))) # Correlation with true factors cor_f1 <- cor(factor_scores_mat[,1], f1) cor_f2 <- cor(factor_scores_mat[,2], f2) message(sprintf("\nCorrelation with true factors: f1=%.3f, f2=%.3f", cor_f1, cor_f2)) expect_true(cor_f1 > 0.8, label = "Factor 1 scores correlate with truth") expect_true(cor_f2 > 0.8, label = "Factor 2 scores correlate with truth") # ========================================================================== # Stage 3: Outcome Equations # ========================================================================== message("\n========== STAGE 3: Outcome Equations ==========") # Define outcome components mc_linear <- define_model_component( name = "out_linear", data = dat, outcome = "y_linear", factor = fm, covariates = c("intercept", "x"), model_type = "linear", loading_normalization = c(NA_real_, NA_real_) ) mc_probit <- define_model_component( name = "out_probit", data = dat, outcome = "y_probit", factor = fm, covariates = c("intercept", "x"), model_type = "probit", loading_normalization = c(NA_real_, NA_real_) ) mc_logit <- define_model_component( name = "out_logit", data = dat, outcome = "y_logit", factor = fm, covariates = c("intercept", "x"), model_type = "logit", num_choices = 3, loading_normalization = c(NA_real_, NA_real_) ) # Model system with previous_stage (linear, probit, logit) ms_stage3 <- define_model_system( components = list(mc_linear, mc_probit, mc_logit), factor = fm, previous_stage = result_stage1 ) # ========================================================================== # Test A: Standard Quadrature # ========================================================================== message("\n----- Test A: Standard Quadrature -----") n_quad <- 8 control_standard <- define_estimation_control( n_quad_points = n_quad, num_cores = 1, adaptive_integration = FALSE ) result_standard <- estimate_model_rcpp( model_system = ms_stage3, data = dat, control = control_standard, optimizer = "nlminb", parallel = FALSE, verbose = TRUE ) expect_equal(result_standard$convergence, 0, label = "Standard quadrature converged") message(sprintf("\nStandard quadrature loglik: %.4f", result_standard$loglik)) # ========================================================================== # Test B: Adaptive Quadrature # ========================================================================== message("\n----- Test B: Adaptive Quadrature -----") threshold <- 0.3 control_adaptive <- define_estimation_control( n_quad_points = n_quad, num_cores = 1, adaptive_integration = TRUE, adapt_int_thresh = threshold ) result_adaptive <- estimate_model_rcpp( model_system = ms_stage3, data = dat, control = control_adaptive, optimizer = "nlminb", parallel = FALSE, verbose = TRUE, factor_scores = factor_scores_mat, factor_ses = factor_ses_mat, factor_vars = factor_vars_vec ) expect_equal(result_adaptive$convergence, 0, label = "Adaptive quadrature converged") message(sprintf("\nAdaptive quadrature loglik: %.4f", result_adaptive$loglik)) # ========================================================================== # Compare Results # ========================================================================== message("\n========== COMPARISON ==========") ll_standard <- result_standard$loglik ll_adaptive <- result_adaptive$loglik ll_ratio <- ll_adaptive / ll_standard ll_diff <- abs(ll_adaptive - ll_standard) ll_diff_pct <- 100 * ll_diff / abs(ll_standard) message(sprintf("\nLog-likelihood comparison:")) message(sprintf(" Standard: %.4f", ll_standard)) message(sprintf(" Adaptive: %.4f", ll_adaptive)) message(sprintf(" Ratio: %.6f", ll_ratio)) message(sprintf(" Abs diff: %.4f", ll_diff)) message(sprintf(" Pct diff: %.2f%%", ll_diff_pct)) # The key test: likelihoods should be similar (within 20% for this approximation) # If adaptive is working correctly, they should be close expect_true(ll_ratio > 0.5 && ll_ratio < 2.0, label = sprintf("Likelihood ratio %.4f should be between 0.5 and 2.0", ll_ratio)) # Compare parameter estimates (Stage 3 only) n_stage1_params <- length(result_stage1$estimates) stage3_names <- names(result_standard$estimates)[(n_stage1_params + 1):length(result_standard$estimates)] est_standard <- result_standard$estimates[(n_stage1_params + 1):length(result_standard$estimates)] est_adaptive <- result_adaptive$estimates[(n_stage1_params + 1):length(result_adaptive$estimates)] message("\nParameter estimates (Stage 3 only):") comparison_df <- data.frame( Parameter = stage3_names, Standard = round(est_standard, 4), Adaptive = round(est_adaptive, 4), Diff = round(est_adaptive - est_standard, 4) ) print(comparison_df) # Parameters should be reasonably close max_param_diff <- max(abs(est_adaptive - est_standard)) message(sprintf("\nMax parameter difference: %.4f", max_param_diff)) # ========================================================================== # Direct likelihood comparison at initial parameters # ========================================================================== message("\n========== DIRECT LIKELIHOOD COMPARISON ==========") message("(Evaluating at the SAME parameters to isolate quadrature difference)") # Use Stage 1 estimates + initial Stage 3 estimates # Get initial parameters for Stage 3 init_result <- initialize_parameters(ms_stage3, dat, verbose = FALSE) full_init_params <- init_result$init_params # Initialize FactorModel for standard quadrature fm_ptr_std <- initialize_factor_model_cpp(ms_stage3, as.matrix(dat), n_quad, full_init_params) # Get free parameters free_params <- extract_free_params_cpp(fm_ptr_std, full_init_params) # Evaluate standard quadrature likelihood ll_std_direct <- evaluate_loglik_only_cpp(fm_ptr_std, free_params) message(sprintf("\nAt initial parameters:")) message(sprintf(" Standard quadrature loglik: %.4f", ll_std_direct)) # Initialize FactorModel for adaptive quadrature fm_ptr_adapt <- initialize_factor_model_cpp(ms_stage3, as.matrix(dat), n_quad, full_init_params) # Set adaptive quadrature set_adaptive_quadrature_cpp( fm_ptr_adapt, factor_scores_mat, factor_ses_mat, factor_vars_vec, threshold = threshold, max_quad = n_quad, verbose = TRUE ) # Evaluate adaptive quadrature likelihood ll_adapt_direct <- evaluate_loglik_only_cpp(fm_ptr_adapt, free_params) message(sprintf(" Adaptive quadrature loglik: %.4f", ll_adapt_direct)) message(sprintf(" Ratio: %.6f", ll_adapt_direct / ll_std_direct)) message(sprintf(" Difference: %.4f", ll_adapt_direct - ll_std_direct)) # This is the critical test - at the same parameters, the likelihoods # should be similar if adaptive quadrature is working correctly direct_ratio <- ll_adapt_direct / ll_std_direct message(sprintf("\n*** CRITICAL TEST: Direct likelihood ratio = %.6f ***", direct_ratio)) if (abs(direct_ratio - 1.0) > 0.5) { message("WARNING: Large discrepancy between standard and adaptive quadrature!") message("This suggests a bug in the adaptive quadrature implementation.") } expect_true(direct_ratio > 0.5 && direct_ratio < 2.0, label = sprintf("Direct likelihood ratio %.4f should be between 0.5 and 2.0", direct_ratio)) })