// poLCAParallel // Copyright (C) 2025 Sherman Lo // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation; either version 2 of the License, or // (at your option) any later version. // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // You should have received a copy of the GNU General Public License along // with this program; if not, write to the Free Software Foundation, Inc., // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. #ifndef POLCAPARALLEL_TESTS_UTIL_TEST_H_ #define POLCAPARALLEL_TESTS_UTIL_TEST_H_ #include #include #include #include #include #include "em_algorithm.h" #include "em_algorithm_array.h" #include "util.h" namespace polca_parallel_test { /** Tolerance for equality of probabilities */ inline constexpr double kTolerance = 1e-12; /** Tolerance for arma::Mat::is_symmetric() */ inline constexpr double kSymmetricTolerance = 1e-15; /** * Calculate the number of fully observed responses * * Calculate (or count) the number of fully observed responses. Unobserved * responses are coded as zero * * @param responses Design matrix transposed of responses, matrix * containing outcomes/responses for each category as integers 1, 2, 3, .... * Missing values may be encoded as 0. The matrix has dimensions *
    *
  • dim 0: for each category
  • *
  • dim 1: for each data point
  • *
* @param n_data Number of data points * @param n_category Number of categories in each response * @return std::size_t Number of fully observed responses */ [[nodiscard]] std::size_t CalcNObs(std::span responses, std::size_t n_data, std::size_t n_category); /** * Generate random responses * * Generate random responses using random priors and random outcome * probabilities. Provide a rng and the resulting random responses are returned * * @param n_data Number of data points * @param n_outcomes Number of outcomes for each category * @param rng Random number generator * @return std::vector The generated responses in matrix form, design * matrix transposed of responses, matrix containing outcomes/responses * for each category as integers 1, 2, 3, .... The matrix has dimensions *
    *
  • dim 0: for each category
  • *
  • dim 1: for each data point
  • *
*/ std::vector RandomMarginal(std::size_t n_data, polca_parallel::NOutcomes n_outcomes, std::mt19937_64& rng); /** * Set missing data at random to the responses * * Set missing data at random to the responses by setting them to zero * * @param missing_prob Probability a data point is set to zero or missing * @param rng Random number generator * @param responses Matrix of responses to modify */ void SetMissingAtRandom(double missing_prob, std::mt19937_64& rng, std::span responses); /** * Instantiate a rng from an array of numbers * * @param seed_array Vector of seeds to init a rng * @return std::mt19937_64 random number generator */ std::mt19937_64 InitRng(std::vector& seed_array); /** * Instantiate a rng from a seed_seq * * @param seed_seq Seed sequence to init a rng * @return std::mt19937_64 random number generator */ std::mt19937_64 InitRng(std::seed_seq& seed_seq); /** * Create a random polca_parallel::NOutcomes * * Create a random polca_parallel::NOutcomes. This is used to set a random * number for the number of outcomes for each category * * To use this function, init a std::vector of length * n_category, then pass it as an argument of * n_outcomes_vec. The vector is modified with the random * n_outcomes and the corresponding polca_parallel::NOutcomes * object is returned * * @param max_n_outcome Maximum number of outcome for every outcome * @param n_outcomes_vec Modified To store the number of outcomes for * each category * @param rng Random number generator * @return polca_parallel::NOutcomes The number of outcomes for each category */ polca_parallel::NOutcomes RandomNOutcomes( std::size_t max_n_outcome, std::vector& n_outcomes_vec, std::mt19937_64& rng); /** * Create random probabilities for each cluster * * Create random probabilities for each cluster which can be used for the prior * and/or posterior * * @param n_data Number of data points * @param n_cluster Number of clusters * @param rng Random number generator * @return arma::Mat matrix with size n_data x * n_cluster, each row has normalised probabilites for each cluster */ arma::Mat RandomClusterProbs(std::size_t n_data, std::size_t n_cluster, std::mt19937_64& rng); /** * Allocate memory for storing the outputs or results * * Allocate memory for storing the resulting posterior, * prior, estiamted_prob and * regress_coeff * * @param n_data Number of data points * @param n_feature Number of features * @param n_outcomes Number of outcomes for each category * @param n_cluster Number of clusters * @return std::tuple, std::vector, * std::vector, std::vector> Allocated memory for the * posterior, prior, estimated_prob and * regress_coeff respectively */ std::tuple, std::vector, std::vector, std::vector> InitOutputs(std::size_t n_data, std::size_t n_feature, polca_parallel::NOutcomes n_outcomes, std::size_t n_cluster); /** * Test the outcome probabilities * * Test the outcome probabilities are in [0.0, 1.0] and the outcome * probabilities, for a given category and cluster, sums to 1.0 * * @param n_outcomes Number of outcomes for each category * @param n_cluster Number of clusters * @param probs Vector of outcome probabilities for each outcome, category and * cluster, flatten list in the following order *
    *
  • dim 0: for each outcome
  • *
  • dim 1: for each category
  • *
  • dim 2: for each cluster
  • *
*/ void TestOutcomeProbs(polca_parallel::NOutcomes n_outcomes, std::size_t n_cluster, std::span probs); /** * Test the cluster (prior/posterior) probabilities * * Test the cluster (prior/posterior) probabilities are in [0.0, 1.0] and the * cluster probabilities, for each data point or row, sums to 1.0 * * @param cluster_probs Design matrix of probabilities, the matrix has the * following dimensions *
    *
  • dim 0: for each data
  • *
  • dim 1: for each cluster
  • *
* @param n_data Number of data points, ie number of rows in * cluster_probs * @param n_cluster Number of clusters, ie number of columns in * cluster_probs */ void TestClusterProbs(std::span cluster_probs, std::size_t n_data, std::size_t n_cluster); /** * Test the default outputs of polca_parallel::EmAlgorithm * * Test the default outputs of polca_parallel::EmAlgorithm. They are the * posterior, prior and estimated_prob. * For the regression problem, the output regress_coeff is also * tested * * Test the probabilities in posterior, prior and * estimated_prob are in * [0.0, 1.0] and they are correctly normalised. Also checks if * regress_coeff is a number if applicable * * EmAlgorithmType is used to determine to test * regress_coeff or not. The output regress_coeff is * only tested for regression problems * * @tparam EmAlgorithmType The type of polca_parallel::EmAlgorithm to test, this * determines what to test, eg regress_coeff is tested only in * regression problems, polca_parallel::EmAlgorithmNan supports missing values * @param n_data Number of data points * @param n_outcomes Number of outcomes for each category * @param n_cluster Number of clusters * @param posterior Design matrix of posterior probabilities. The matrix has the * following dimensions *
    *
  • dim 0: for each data
  • *
  • dim 1: for each cluster
  • *
* @param prior Design matrix of prior probabilities. The matrix has the * following dimensions *
    *
  • dim 0: for each data
  • *
  • dim 1: for each cluster
  • *
* @param estimated_prob Vector of outcome probabilities for each outcome, * category and cluster, flatten list in the following order *
    *
  • dim 0: for each outcome
  • *
  • dim 1: for each category
  • *
  • dim 2: for each cluster
  • *
* @param regress_coeff Matrix of regression coefficients with the following * dimensions *
    *
  • dim 0: n_feature
  • *
  • dim 1: n_cluster - 1
  • *
*/ template void TestEmAlgorithmDefaultOutputs(std::size_t n_data, polca_parallel::NOutcomes n_outcomes, std::size_t n_cluster, std::span posterior, std::span prior, std::span estimated_prob, std::span regress_coeff); /** * Test the optional outputs from polca_parallel::EmAlgorithm * * Test the outputs of polca_parallel::EmAlgorithm::get_ln_l() and * polca_parallel::EmAlgorithm::get_n_iter() * * @param fitter polca_parallel::EmAlgorithm object to test * @param max_iter max_iter argument passed to fitter */ void TestEmAlgorithmOptionalOutputs(polca_parallel::EmAlgorithm& fitter, std::size_t max_iter); /** * Black box test for polca_parallel::EmAlgorithm and their subclasses * * Black box test for polca_parallel::EmAlgorithm and their subclasses. Provided * simulated data and the polca_parallel::EmAlgorithm are initalised within the * function for testing * * Sections: * *
    *
  • * Test the outputs: posterior, prior, * estimated_prob, regress_coeff, * polca_parallel::EmAlgorithm::get_ln_l() and * polca_parallel::EmAlgorithm::get_n_iter() *
  • *
  • * Same as above but also calls * polca_parallel::EmAlgorithm::set_best_initial_prob() and test it *
  • *
  • * Tests if the results can be reproduced again when given the equivalent * rng *
  • *
  • * Tests if the resulting state from polca_parallel::EmAlgorithm::move_rng() * can be reproduced *
  • *
* * @tparam EmAlgorithmType The type of polca_parallel::EmAlgorithm to test, this * determines what to test, eg regress_coeff is tested only in * regression problems, polca_parallel::EmAlgorithmNan supports missing values * @param features Design matrix of features, matrix with dimensions *
    *
  • dim 0: for each data point
  • *
  • dim 1: for each feature
  • *
* Can be empty for the non-regression problem * @param responses Design matrix transposed of responses, matrix * containing outcomes/responses for each category as integers 1, 2, 3, .... If * supported, 0 can be used to indicate a missing value. The matrix has * dimensions *
    *
  • dim 0: for each category
  • *
  • dim 1: for each data point
  • *
* @param initial_prob Vector of initial response probabilities for each * outcome, conditioned on the category and cluster. A flatten list in the * following order *
    *
  • dim 0: for each outcome
  • *
  • dim 1: for each category
  • *
  • dim 2: for each cluster
  • *
* @param n_data Number of data points * @param n_feature Number of features, set to 1 for the non-regression problem * @param n_outcomes Number of outcomes for each category * @param n_cluster Number of clusters * @param max_iter Maximum number of iterations for EM algorithm * @param tolerance Tolerance for difference in log-likelihood, used for * stopping condition * @param seed For seeding the polca_parallel::EmAlgorithm * @param is_full_constructor true if to use the constructor which * requires all parameters, false to use the overloaded * constructor which has fewer parameters */ template void BlackBoxTestEmAlgorithm(std::span features, std::span responses, std::span initial_prob, std::size_t n_data, std::size_t n_feature, polca_parallel::NOutcomes n_outcomes, std::size_t n_cluster, unsigned int max_iter, double tolerance, unsigned int seed, bool is_full_constructor); /** * Test the optional outputs from polca_parallel::EmAlgorithmArray * * Test the outputs of polca_parallel::EmAlgorithmArray::get_best_rep_index() * and polca_parallel::EmAlgorithmArray::get_n_iter() * * @param fitter polca_parallel::EmAlgorithmArray to test * @param n_rep n_rep argument to pass to fitter * @param max_iter max_iter argument to pass fitter */ void TestEmAlgorithmArrayOptionalOutputs( std::unique_ptr& fitter, std::size_t n_rep, std::size_t max_iter); /** * Black box test for polca_parallel::EmAlgorithmArray and their subclasses * * Black box test for polca_parallel::EmAlgorithmArray and their subclasses. * Provided simulated data and the polca_parallel::EmAlgorithmArray are * initalised within the function for testing * * Sections: * *
    *
  • * Test the outputs: posterior, prior, * estimated_prob, regress_coeff, * polca_parallel::EmAlgorithmArray::get_best_rep_index() and * polca_parallel::EmAlgorithmArray::get_n_iter() *
  • *
  • * Same as above but also calls * polca_parallel::EmAlgorithmArray::set_best_initial_prob() and * polca_parallel::EmAlgorithmArray::set_ln_l_array() before fitting. The * resulting best_initial_prob and ln_l_array are * tested *
  • *
  • * Test if results can be reproduced again when given the same * seed_seq and using one thread *
  • *
* * @tparam EmAlgorithmArrayType Either polca_parallel::EmAlgorithmArray or * polca_parallel::EmAlgorithmArray to test * @tparam EmAlgorithmType The type to pass to * polca_parallel::EmAlgorithmArray::Fit<>(), this specifies if the problem is a * regression problem or not, and if missing data is in the data or not * @param features Design matrix of features, matrix with dimensions *
    *
  • dim 0: for each data point
  • *
  • dim 1: for each feature
  • *
* Can be empty for the non-regression problem * @param responses Design matrix transposed of responses, matrix * containing outcomes/responses for each category as integers 1, 2, 3, .... If * supported, 0 can be used to indicate a missing value. The matrix has * dimensions *
    *
  • dim 0: for each category
  • *
  • dim 1: for each data point
  • *
* @param initial_prob Vector of initial response probabilities for each * outcome, conditioned on the category and cluster. A flatten list in the * following order *
    *
  • dim 0: for each outcome
  • *
  • dim 1: for each category
  • *
  • dim 2: for each cluster
  • *
* @param n_data Number of data points * @param n_feature Number of features, set to 1 for the non-regression problem * @param n_outcomes Number of outcomes for each category * @param n_cluster Number of clusters * @param n_rep Number of initial values to try out * @param n_thread Number of threads to use * @param max_iter Maximum number of iterations for EM algorithm * @param tolerance Tolerance for difference in log-likelihood, used for * stopping condition * @param seed_seq For seeding polca_parallel::EmAlgorithmArray * @param is_full_constructor true if to use the constructor which * requires all parameters, false to use the overloaded * constructor which has fewer parameters */ template void BlackBoxTestEmAlgorithmArray(std::span features, std::span responses, std::span initial_prob, std::size_t n_data, std::size_t n_feature, polca_parallel::NOutcomes n_outcomes, std::size_t n_cluster, std::size_t n_rep, std::size_t n_thread, unsigned int max_iter, double tolerance, std::seed_seq& seed_seq, bool is_full_constructor); /** * Black box test for polca_parallel::StandardError and their subclasses * * Black box test for polca_parallel::StandardError and their subclasses. * Provided simulated data and the polca_parallel::StandardError are initalised * within the function for testing. * * Test if the errors are positive. For the regression problem, test if the * regression coefficient covariance matrix is symmetric * * @tparam StandardErrorType The type to test, polca_parallel::StandardError or * their subclass * @param features Design matrix of features, matrix with dimensions *
    *
  • dim 0: for each data point
  • *
  • dim 1: for each feature
  • *
* Can be empty for the non-regression problem * @param responses Design matrix transposed of responses, matrix * containing outcomes/responses for each category as integers 1, 2, 3, .... If * supported, 0 can be used to indicate a missing value. The matrix has * dimensions *
    *
  • dim 0: for each category
  • *
  • dim 1: for each data point
  • *
* @param probs Vector of response probabilities for each outcome, conditioned * on the category and cluster. A flatten list in the following order *
    *
  • dim 0: for each outcome
  • *
  • dim 1: for each category
  • *
  • dim 2: for each cluster
  • *
* @param posterior Design matrix of posterior probabilities. The matrix has the * following dimensions *
    *
  • dim 0: for each data
  • *
  • dim 1: for each cluster
  • *
* @param prior Design matrix of prior probabilities. The matrix has the * following dimensions *
    *
  • dim 0: for each data
  • *
  • dim 1: for each cluster
  • *
* @param n_data Number of data points * @param n_feature Number of features, set to 1 for the non-regression problem * @param n_outcomes Number of outcomes for each category * @param n_cluster Number of clusters * @param is_full_constructor true if to use the constructor which * requires all parameters, false to use the overloaded * constructor which has fewer parameters */ template void BlackBoxTestStandardError(std::span features, std::span responses, std::span probs, const arma::Mat& posterior, const arma::Mat& prior, std::size_t n_data, std::size_t n_feature, polca_parallel::NOutcomes n_outcomes, std::size_t n_cluster, bool is_full_constructor); } // namespace polca_parallel_test #endif // POLCAPARALLEL_TESTS_UTIL_TEST_H_