#' Generate random responses #' #' @param n_data Number of data points #' @param n_outcomes Vector of integers, number of outcomes for each category #' @param prob_na Probability of missing data #' @param na_encode Value to use for missing data #' #' @return Data frame of responses, dim 1 for each data point, dim 2 for each #' category random_response <- function(n_data, n_outcomes, prob_na = 0, na_encode = 0) { responses <- matrix(0, nrow = n_data, ncol = length(n_outcomes)) for (i in seq_len(length(n_outcomes))) { responses[, i] <- sample(n_outcomes[i], n_data, replace = TRUE) } rand <- stats::runif(n_data * length(n_outcomes)) rand <- matrix(rand, nrow = n_data, ncol = length(n_outcomes)) responses[rand < prob_na] <- na_encode responses <- as.data.frame(responses) return(responses) } #' Generate random features #' #' Generate random Normally distributed random features. Column names of the #' data frame have prefix U, this ensures the column names are different #' enough from the responses returned by the function random_response() #' #' @param n_data Number of data points #' @param n_feature Number of features #' #' @return Data frame of features, dim 1 for each data point, dim 2 for each #' feature random_features <- function(n_data, n_feature) { features <- as.data.frame(matrix(stats::rnorm(n_data * n_feature), nrow = n_data, ncol = n_feature )) colnames(features) <- paste0(rep("U", n_feature), paste(seq_len(n_feature))) return(features) } #' Generate the non-regression formula to pass to poLCA() #' #' Generate the non-regression formula to pass to poLCA(). The formula uses #' all columns in the passed parameter responses #' #' @param responses Data frame of responses, dim 1 for each data point, dim 2 #' for each category #' #' @return formula to pass to poLCA() get_non_regression_formula <- function(responses) { f <- formula( paste0("cbind(", paste(colnames(responses), collapse = ","), ")~1") ) return(f) } #' Generate the regression formula to pass to poLCA() #' #' Generate the regression formula to pass to poLCA(). The formula uses all #' columns in the passed parameters responses and features. A linear #' relationship is used #' #' @param responses Data frame of responses, dim 1 for each data point, dim 2 #' for each category #' #' @return formula to pass to poLCA() get_regression_formula <- function(responses, features) { f <- formula( paste0( "cbind(", paste(colnames(responses), collapse = ","), ")~", paste0(colnames(features), collapse = "+"), "" ) ) return(f) } #' Generate random outcome probabilities #' #' Generate random outcome probabilities in an unvectorized form #' #' @param n_outcomes Vector of integers, number of outcomes for each category #' @param n_cluster Number of clusters fitted #' #' @return list of three items (`vecprobs`, `numChoices`, `classes`) where #' * `vecprobs`: vector of outcome probabilities conditioned on the #' manifest/category and the class/cluster. Imagine a nested loop, from inner #' to outer, or a flatten column-major matrix, the probabilities are arranged #' in the following order: #' * dim 1: for each outcome #' * dim 2: for each manifest/category #' * dim 3: for each class/cluster #' * `numChoices`: integer vector, number of outcomes for each category/manifest #' variable #' * `classes`: integer, number of latent classes (or clusters) random_unvectorized_probs <- function(n_outcomes, n_cluster) { probs <- list( vecprobs = poLCAParallel:::random_vectorized_probs(n_outcomes, n_cluster), numChoices = n_outcomes, classes = n_cluster ) probs <- poLCAParallel:::poLCAParallel.unvectorize(probs) return(probs) } #' Generate random cluster probabilities #' #' @param n_data Number of data points #' @param n_cluster Number of clusters fitted #' #' @return Matrix of cluster probabilities, dim 1 for each data point, dim 2 for #' each cluster random_cluster_probs <- function(n_data, n_cluster) { probs <- stats::runif(n_data * n_cluster) probs <- matrix(probs, nrow = n_data, ncol = n_cluster) probs <- probs / matrix( rep(rowSums(probs), n_cluster), nrow = n_data, ncol = n_cluster ) return(probs) } #' Test the cluster probabilities #' #' Test the prior and posterior probabilities. Tests the shape of the provided #' matrix of probabilities, they are in [0, 1] and sum to 1 #' #' @param probs Matrix of cluster probabilities, can be prior or posterior, dim #' 1 for each data point, dim 2 for each cluster #' @param n_data Number of data points #' @param n_cluster Number of clusters fitted test_cluster_probs <- function(probs, n_data, n_cluster) { expect_identical(nrow(probs), as.integer(n_data)) expect_identical(ncol(probs), as.integer(n_cluster)) expect_gte(min(probs), 0) expect_lte(max(probs), 1) prob_sum <- rowSums(probs) for (i in seq_len(length(prob_sum))) { expect_equal(prob_sum[i], 1) } } #' Test the outcome probabilities #' #' Test the outcome probabilities, they have the correct shape, are in [0, 1] #' and sum to 1 #' #' @param probs list for each category/manifest variable. For the `i`th entry, #' it contains a matrix of outcome probabilities with dimensions `n_class` x #' `n_outcomes[i]` #' @param n_outcomes Vector of integers, number of outcomes for each category #' @param n_cluster Number of clusters fitted test_outcome_probs <- function(probs, n_outcomes, n_cluster) { expect_identical(length(probs), as.integer(length(n_outcomes))) tol <- 1e-12 i_category <- 1 for (i in names(probs)) { probs_i <- probs[[i]] expect_identical(nrow(probs_i), as.integer(n_cluster)) expect_identical(ncol(probs_i), as.integer(n_outcomes[i_category])) expect_gte(min(probs_i), 0) expect_lte(max(probs_i), 1 + tol) prob_sum <- rowSums(probs_i) for (j in seq_len(length(prob_sum))) { expect_equal(prob_sum[j][[1]], 1) } i_category <- i_category + 1 } } #' Test the fitted model's standard error #' #' Test the fitted model's (or a list which mocks it) standard error for the #' prior probabilities and outcome probabilities #' #' @param lc A model object estimated using the `poLCA` function (or a list #' which mocks it) #' @param n_outcomes Vector of integers, number of outcomes for each category #' @param n_cluster Number of clusters fitted test_standard_error <- function(lc, n_outcomes, n_cluster) { expect_identical("P.se" %in% names(lc), TRUE) expect_identical("probs.se" %in% names(lc), TRUE) expect_identical(length(lc$P.se), as.integer(n_cluster)) expect_identical(all(lc$P.se >= 0), TRUE) expect_identical(length(lc$probs.se), as.integer(length(n_outcomes))) for (i in seq_len(length(n_outcomes))) { expect_identical(nrow(lc$probs.se[[i]]), as.integer(n_cluster)) expect_identical(ncol(lc$probs.se[[i]]), as.integer(n_outcomes[i])) expect_identical(all(lc$probs.se[[i]] >= 0), TRUE) } } #' Test the fitted model's regression coefficient standard error #' #' Test the fitted model's (or a list which mocks it) regression coefficient #' standard error #' #' @param lc A model object estimated using the `poLCA` function (or a list #' which mocks it) #' @param n_feature Number of features #' @param n_cluster Number of clusters fitted test_standard_coeff_error <- function(lc, n_feature, n_cluster) { n_parameter <- n_feature * (n_cluster - 1) expect_identical(ncol(lc$coeff.V), as.integer(n_parameter)) expect_identical(nrow(lc$coeff.V), as.integer(n_parameter)) } #' Test the fitted model's goodness of fit #' #' Test the fitted model's (or a list which mocks it) goodness of fit outputs #' such as the table of expected and observed frequencies (predcell) and #' statistics such as Gsq and Chisq #' #' @param lc A model object estimated using the `poLCA` function (or a list #' which mocks it) #' @param n_outcomes Vector of integers, number of outcomes for each category test_polca_goodnessfit <- function(lc, n_outcomes) { expect_identical("predcell" %in% names(lc), TRUE) expect_identical("Gsq" %in% names(lc), TRUE) expect_identical("Chisq" %in% names(lc), TRUE) unique_responses <- as.matrix(lc$predcell)[, seq_len(length(n_outcomes))] expect_identical(all(unique_responses > 0), TRUE) for (i in seq_len(length(n_outcomes))) { expect_identical(all(unique_responses[, i] <= n_outcomes[i]), TRUE) } expect_identical(sum(as.integer(lc$predcell$observed)), lc$Nobs) expect_lte(sum(lc$expected), lc$Nobs) }