testthat::test_that( "confidence interval experiment", { testthat::skip("This test is intended to be run manually.") set.seed(1) kDimension <- 1 # nolint: Google Style Guide change_point_locations <- NULL for (experiment_id in seq_len(20)) { data <- rbind( mvtnorm::rmvnorm( 300, mean = rep(0, kDimension), sigma = diag(100, kDimension) ), mvtnorm::rmvnorm( 400, mean = rep(50, kDimension), sigma = diag(100, kDimension) ), mvtnorm::rmvnorm( 300, mean = rep(2, kDimension), sigma = diag(100, kDimension) ) ) segment_count_guess <- 10 block_size <- max(floor(sqrt(nrow(data)) / (segment_count_guess + 1)), 2) block_count <- floor(nrow(data) / block_size) data_all_vars <- rep(0, block_count) for (block_index in seq_len(block_count)) { block_start <- (block_index - 1) * block_size + 1 block_end <- if (block_index < block_count) { block_index * block_size } else { nrow(data) } data_all_vars[block_index] <- var(data[block_start:block_end, ]) } data_all_var <- mean(data_all_vars) mean_loss <- function(data) { n <- nrow(data) (norm(data, type = "F")^2 - colSums(data)^2 / n) / 2 / data_all_var + n / 2 * (log(data_all_var) + log(2 * pi)) } mean_loss_result <- fastcpd::fastcpd( formula = ~ . - 1, data = data.frame(data), beta = (kDimension + 1) * log(nrow(data)) / 2, p = kDimension, cost = mean_loss ) change_point_locations <- c(change_point_locations, mean_loss_result@cp_set) } cps_cookie_bucket <- NULL cookie_bucket_id_list <- sample.int(n = 20, size = 1000, replace = TRUE) all_data <- data for (cookie_bucket_id in seq_len(20)) { data <- all_data[cookie_bucket_id_list != cookie_bucket_id, , drop = FALSE] segment_count_guess <- 10 block_size <- max(floor(sqrt(nrow(data)) / (segment_count_guess + 1)), 2) block_count <- floor(nrow(data) / block_size) data_all_vars <- rep(0, block_count) for (block_index in seq_len(block_count)) { block_start <- (block_index - 1) * block_size + 1 block_end <- if (block_index < block_count) { block_index * block_size } else { nrow(data) } data_all_vars[block_index] <- var(data[block_start:block_end, ]) } data_all_var <- mean(data_all_vars) mean_loss <- function(data) { n <- nrow(data) (norm(data, type = "F")^2 - colSums(data)^2 / n) / 2 / data_all_var + n / 2 * (log(data_all_var) + log(2 * pi)) } mean_loss_result <- fastcpd::fastcpd( formula = ~ . - 1, data = data.frame(data), beta = (kDimension + 1) * log(nrow(data)) / 2, p = kDimension, cost = mean_loss ) for (cp in mean_loss_result@cp_set) { ordinal_mapped_cp <- which( cumsum(cookie_bucket_id_list != cookie_bucket_id) == cp )[1] cps_cookie_bucket <- c(cps_cookie_bucket, ordinal_mapped_cp) } } table_change_point_locations <- table(change_point_locations) testthat::expect_equal( rownames(table_change_point_locations), c("269", "299", "300", "700") ) testthat::expect_equal( unname(table_change_point_locations), c(1, 1, 19, 20), ignore_attr = TRUE ) table_cp_cookie_bucket <- table(cps_cookie_bucket) testthat::expect_equal( rownames(table_cp_cookie_bucket), c("299", "300", "697", "700") ) testthat::expect_equal( unname(table_cp_cookie_bucket), c(1, 19, 1, 19), ignore_attr = TRUE ) } ) testthat::test_that( "confidence interval experiment with one change point", { testthat::skip("This test is intended to be run manually.") set.seed(1) kDimension <- 1 # nolint: Google Style Guide change_point_locations <- list() cps_cookie_bucket <- rep(list(rep(list(NULL), 20)), 500) containing_change_point <- matrix(NA, 500, 5) # 500 experiments and count the number of times change point will be inside # the confidence interval to verify the logics of confidence interval. for (experiment_id in seq_len(500)) { data <- rbind( mvtnorm::rmvnorm( 130, mean = rep(0, kDimension), sigma = diag(100, kDimension) ), mvtnorm::rmvnorm( 130, mean = rep(50, kDimension), sigma = diag(100, kDimension) ) ) segment_count_guess <- 10 block_size <- max(floor(sqrt(nrow(data)) / (segment_count_guess + 1)), 2) block_count <- floor(nrow(data) / block_size) data_all_vars <- rep(0, block_count) for (block_index in seq_len(block_count)) { block_start <- (block_index - 1) * block_size + 1 block_end <- if (block_index < block_count) { block_index * block_size } else { nrow(data) } data_all_vars[block_index] <- var(data[block_start:block_end, ]) } data_all_var <- mean(data_all_vars) mean_loss <- function(data) { n <- nrow(data) n / 2 * ( log(data_all_var) + log(2 * pi) + sum((data - colMeans(data))^2 / data_all_var) / n ) } mean_loss_result <- fastcpd::fastcpd( formula = ~ . - 1, data = data.frame(data), beta = (kDimension + 1) * log(nrow(data)) / 2, p = kDimension, cost = mean_loss ) # Store the change point locations for each experiment as a baseline for # cookie bucket experiments. change_point_locations[[experiment_id]] <- mean_loss_result@cp_set # Randomly assign 20 cookie buckets to each experiment. cookie_bucket_id_list <- sample.int(n = 20, size = 130 + 130, replace = TRUE) all_data <- data # Do the cookie bucket experiment for each cookie bucket. for (cookie_bucket_id in seq_len(20)) { # Exclude the data from the cookie bucket. data <- all_data[cookie_bucket_id_list != cookie_bucket_id, , drop = FALSE] segment_count_guess <- 10 block_size <- max(floor(sqrt(nrow(data)) / (segment_count_guess + 1)), 2) block_count <- floor(nrow(data) / block_size) data_all_vars <- rep(0, block_count) for (block_index in seq_len(block_count)) { block_start <- (block_index - 1) * block_size + 1 block_end <- if (block_index < block_count) { block_index * block_size } else { nrow(data) } data_all_vars[block_index] <- var(data[block_start:block_end, ]) } data_all_var <- mean(data_all_vars) mean_loss <- function(data) { n <- nrow(data) n / 2 * ( log(data_all_var) + log(2 * pi) + sum((data - colMeans(data))^2 / data_all_var) / n ) } # Obtain the change points for the cookie bucket experiment. mean_loss_result <- fastcpd::fastcpd( formula = ~ . - 1, data = data.frame(data), beta = (kDimension + 1) * log(nrow(data)) / 2, p = kDimension, cost = mean_loss ) # Map the change points to the original index. for (cp in mean_loss_result@cp_set) { ordinal_mapped_cp <- which(cumsum(cookie_bucket_id_list != cookie_bucket_id) == cp)[1] cps_cookie_bucket[[experiment_id]][[cookie_bucket_id]] <- ordinal_mapped_cp } } # Concatenate the change points from all cookie bucket experiments. cp_for_eid <- Reduce("c", cps_cookie_bucket[[experiment_id]]) # Calculate the mean as the center of confidence interval. d_capital <- mean(cp_for_eid) # Concatenate the change ponints from all cookie bucket experiments # except the current one. d_j <- rep(list(NULL), 20) for (j in seq_len(20)) { for (cookie_bucket_id in seq_len(20)) { if (j != cookie_bucket_id) { d_j[[j]] <- c( d_j[[j]], cps_cookie_bucket[[experiment_id]][[cookie_bucket_id]] ) } } } d_j_bar <- sapply(d_j, mean) # Calculate the jackknife cookie bucket change point. d_capital_j <- 20 * d_capital - 19 * d_j_bar d_bar <- mean(d_capital_j) sd_2 <- sum((d_capital_j - d_bar)^2) / 19 # Following t-distribution with 19 degrees of freedom. containing_change_point[experiment_id, ] <- c( # 70% confidence interval. ceiling(d_capital - 1.066 * sqrt(sd_2) / sqrt(20)) <= 130 && floor(d_capital + 1.066 * sqrt(sd_2) / sqrt(20)) >= 130, # 80% confidence interval. ceiling(d_capital - 1.328 * sqrt(sd_2) / sqrt(20)) <= 130 && floor(d_capital + 1.328 * sqrt(sd_2) / sqrt(20)) >= 130, # 90% confidence interval. ceiling(d_capital - 1.729 * sqrt(sd_2) / sqrt(20)) <= 130 && floor(d_capital + 1.729 * sqrt(sd_2) / sqrt(20)) >= 130, # 95% confidence interval. ceiling(d_capital - 2.093 * sqrt(sd_2) / sqrt(20)) <= 130 && floor(d_capital + 2.093 * sqrt(sd_2) / sqrt(20)) >= 130, # 98% confidence interval. ceiling(d_capital - 2.539 * sqrt(sd_2) / sqrt(20)) <= 130 && floor(d_capital + 2.539 * sqrt(sd_2) / sqrt(20)) >= 130 ) } testthat::expect_equal( colSums(containing_change_point), c(433, 436, 442, 449, 460) ) } ) testthat::test_that( "confidence interval experiment with one change point for linear model", { testthat::skip("This test is intended to be run manually.") set.seed(1) kDimension <- 3 # nolint: Google Style Guide change_point_locations <- list() cps_cookie_bucket <- rep(list(rep(list(NULL), 20)), 500) containing_change_point <- matrix(NA, 500, 5) # 500 experiments and count the number of times change point will be inside # the confidence interval to verify the logics of confidence interval. for (experiment_id in seq_len(500)) { x <- mvtnorm::rmvnorm(800, rep(0, p), diag(p)) theta_0 <- rbind(c(1, 1.2, -1), c(-1, 0, 0.5)) y <- c( x[1:500, ] %*% theta_0[1, ] + rnorm(100, 0, 1), x[501:800, ] %*% theta_0[2, ] + rnorm(100, 0, 1) ) data <- data.frame(y = y, x = x) result <- fastcpd::fastcpd( formula = y ~ . - 1, data = data, family = "gaussian" ) # Store the change point locations for each experiment as a baseline for # cookie bucket experiments. change_point_locations[[experiment_id]] <- result@cp_set # Randomly assign 20 cookie buckets to each experiment. cookie_bucket_id_list <- sample.int(n = 20, size = 800, replace = TRUE) all_data <- data # Do the cookie bucket experiment for each cookie bucket. for (cookie_bucket_id in seq_len(20)) { # Exclude the data from the cookie bucket. data <- all_data[cookie_bucket_id_list != cookie_bucket_id, , drop = FALSE] # Obtain the change points for the cookie bucket experiment. result <- fastcpd::fastcpd( formula = y ~ . - 1, data = data, family = "gaussian" ) # Map the change points to the original index. for (cp in result@cp_set) { ordinal_mapped_cp <- which(cumsum(cookie_bucket_id_list != cookie_bucket_id) == cp)[1] cps_cookie_bucket[[experiment_id]][[cookie_bucket_id]] <- ordinal_mapped_cp } } # Concatenate the change points from all cookie bucket experiments. cp_for_eid <- Reduce("c", cps_cookie_bucket[[experiment_id]]) # Calculate the mean as the center of confidence interval. d_capital <- mean(cp_for_eid) # Concatenate the change ponints from all cookie bucket experiments # except the current one. d_j <- rep(list(NULL), 20) for (j in seq_len(20)) { for (cookie_bucket_id in seq_len(20)) { if (j != cookie_bucket_id) { d_j[[j]] <- c( d_j[[j]], cps_cookie_bucket[[experiment_id]][[cookie_bucket_id]] ) } } } d_j_bar <- sapply(d_j, mean) # Calculate the jackknife cookie bucket change point. d_capital_j <- 20 * d_capital - 19 * d_j_bar d_bar <- mean(d_capital_j) sd_2 <- sum((d_capital_j - d_bar)^2) / 19 # Following t-distribution with 19 degrees of freedom. containing_change_point[experiment_id, ] <- c( # 70% confidence interval. ceiling(d_capital + 1.066 * sqrt(sd_2) / sqrt(20)) >= 500 && floor(d_capital - 1.066 * sqrt(sd_2) / sqrt(20)) <= 500, # 80% confidence interval. ceiling(d_capital + 1.328 * sqrt(sd_2) / sqrt(20)) >= 500 && floor(d_capital - 1.328 * sqrt(sd_2) / sqrt(20)) <= 500, # 90% confidence interval. ceiling(d_capital + 1.729 * sqrt(sd_2) / sqrt(20)) >= 500 && floor(d_capital - 1.729 * sqrt(sd_2) / sqrt(20)) <= 500, # 95% confidence interval. ceiling(d_capital + 2.093 * sqrt(sd_2) / sqrt(20)) >= 500 && floor(d_capital - 2.093 * sqrt(sd_2) / sqrt(20)) <= 500, # 98% confidence interval. ceiling(d_capital + 2.539 * sqrt(sd_2) / sqrt(20)) >= 500 && floor(d_capital - 2.539 * sqrt(sd_2) / sqrt(20)) <= 500 ) } testthat::expect_equal( colSums(containing_change_point), c(371, 379, 390, 393, 399) ) } ) testthat::test_that( "all examples in the documentation", { testthat::skip("This test is intended to be run manually.") fastcpd_documentation <- readLines("R/fastcpd.R") examples_index_start <- which(fastcpd_documentation == "#' # Linear regression") examples_index_end <- which(fastcpd_documentation == "#' summary(huber_regression_result)") examples <- fastcpd_documentation[examples_index_start:examples_index_end] for (i in seq_along(examples)) { if (fastcpd_documentation[i] == "#'") { fastcpd_documentation[i] <- "" } else { fastcpd_documentation[i] <- substr(fastcpd_documentation[i], 4, nchar(fastcpd_documentation[i])) } } source(textConnection(paste( fastcpd_documentation[examples_index_start:examples_index_end], collapse = "\n" ))) } ) # Everything in this script is provided as is. The purpose of this script is to # do a sanity check on the C++ implementation of `fastcpd`. # nolint start: script provided as is #' Cost function for Logistic regression, i.e. binomial family in GLM. #' #' @param data Data to be used to calculate the cost values. The last column is #' the response variable. #' @param family Family of the distribution. #' @keywords internal #' #' @noRd #' @return Cost value for the corresponding segment of data. cost_glm_binomial <- function(data, family = "binomial") { data <- as.matrix(data) p <- dim(data)[2] - 1 out <- fastglm::fastglm( as.matrix(data[, 1:p]), data[, p + 1], family = family ) return(out$deviance / 2) } #' Implementation of vanilla PELT for logistic regression type data. #' #' @param data Data to be used for change point detection. #' @param beta Penalty coefficient for the number of change points. #' @param cost Cost function to be used to calculate cost values. #' @keywords internal #' #' @noRd #' @return A list consisting of two: change point locations and negative log #' likelihood values for each segment. pelt_vanilla_binomial <- function(data, beta, cost = cost_glm_binomial) { n <- dim(data)[1] p <- dim(data)[2] - 1 Fobj <- c(-beta, 0) cp_set <- list(NULL, 0) set <- c(0, 1) for (t in 2:n) { m <- length(set) cval <- rep(NA, m) for (i in 1:m) { k <- set[i] + 1 cval[i] <- 0 if (t - k >= p - 1) cval[i] <- suppressWarnings(cost(data[k:t, ])) } obj <- cval + Fobj[set + 1] + beta min_val <- min(obj) ind <- which(obj == min_val)[1] cp_set_add <- c(cp_set[[set[ind] + 1]], set[ind]) cp_set <- append(cp_set, list(cp_set_add)) ind2 <- (cval + Fobj[set + 1]) <= min_val set <- c(set[ind2], t) Fobj <- c(Fobj, min_val) } cp <- cp_set[[n + 1]] nLL <- 0 cp_loc <- unique(c(0, cp, n)) for (i in 1:(length(cp_loc) - 1)) { seg <- (cp_loc[i] + 1):cp_loc[i + 1] data_seg <- data[seg, ] out <- fastglm::fastglm( as.matrix(data_seg[, 1:p]), data_seg[, p + 1], family = "binomial" ) nLL <- out$deviance / 2 + nLL } output <- list(cp, nLL) names(output) <- c("cp", "nLL") return(output) } #' Function to update the coefficients using gradient descent. #' #' @param data_new New data point used to update the coeffient. #' @param coef Previous coeffient to be updated. #' @param cum_coef Summation of all the past coefficients to be used in #' averaging. #' @param cmatrix Hessian matrix in gradient descent. #' @param epsilon Small adjustment to avoid singularity when doing inverse on #' the Hessian matrix. #' @keywords internal #' #' @noRd #' @return A list of values containing the new coefficients, summation of #' coefficients so far and all the Hessian matrices. cost_logistic_update <- function( data_new, coef, cum_coef, cmatrix, epsilon = 1e-10) { p <- length(data_new) - 1 X_new <- data_new[1:p] Y_new <- data_new[p + 1] eta <- X_new %*% coef mu <- 1 / (1 + exp(-eta)) cmatrix <- cmatrix + (X_new %o% X_new) * as.numeric((1 - mu) * mu) lik_dev <- as.numeric(-(Y_new - mu)) * X_new coef <- coef - solve(cmatrix + epsilon * diag(1, p), lik_dev) cum_coef <- cum_coef + coef return(list(coef, cum_coef, cmatrix)) } #' Calculate negative log likelihood given data segment and guess of #' coefficient. #' #' @param data Data to be used to calculate the negative log likelihood. #' @param b Guess of the coefficient. #' @keywords internal #' #' @noRd #' @return Negative log likelihood. neg_log_lik_binomial <- function(data, b) { p <- dim(data)[2] - 1 X <- data[, 1:p, drop = FALSE] Y <- data[, p + 1, drop = FALSE] u <- as.numeric(X %*% b) L <- -Y * u + log(1 + exp(u)) return(sum(L)) } #' Find change points using dynamic programming with pruning and SeGD. #' #' @param data Data used to find change points. #' @param beta Penalty coefficient for the number of change points. #' @param B Initial guess on the number of change points. #' @param trim Propotion of the data to ignore the change points at the #' beginning, ending and between change points. #' @keywords internal #' #' @noRd #' @return A list containing potential change point locations and negative log #' likelihood for each segment based on the change points guess. segd_binomial <- function(data, beta, B = 10, trim = 0.025) { n <- dim(data)[1] p <- dim(data)[2] - 1 Fobj <- c(-beta, 0) cp_set <- list(NULL, 0) set <- c(0, 1) # choose the initial values based on pre-segmentation index <- rep(1:B, rep(n / B, B)) coef.int <- matrix(NA, B, p) for (i in 1:B) { out <- fastglm::fastglm( as.matrix(data[index == i, 1:p]), data[index == i, p + 1], family = "binomial" ) coef.int[i, ] <- coef(out) } X1 <- data[1, 1:p] cum_coef <- coef <- matrix(coef.int[1, ], p, 1) e_eta <- exp(coef %*% X1) const <- e_eta / (1 + e_eta)^2 cmatrix <- array((X1 %o% X1) * as.numeric(const), c(p, p, 1)) for (t in 2:n) { m <- length(set) cval <- rep(NA, m) for (i in 1:(m - 1)) { coef_c <- coef[, i] cum_coef_c <- cum_coef[, i] cmatrix_c <- cmatrix[, , i] out <- cost_logistic_update(data[t, ], coef_c, cum_coef_c, cmatrix_c) coef[, i] <- out[[1]] cum_coef[, i] <- out[[2]] cmatrix[, , i] <- out[[3]] k <- set[i] + 1 cval[i] <- 0 if (t - k >= p - 1) { cval[i] <- neg_log_lik_binomial(data[k:t, ], cum_coef[, i] / (t - k + 1)) } } # the choice of initial values requires further investigation cval[m] <- 0 Xt <- data[t, 1:p] cum_coef_add <- coef_add <- coef.int[index[t], ] e_eta_t <- exp(coef_add %*% Xt) const <- e_eta_t / (1 + e_eta_t)^2 cmatrix_add <- (Xt %o% Xt) * as.numeric(const) coef <- cbind(coef, coef_add) cum_coef <- cbind(cum_coef, cum_coef_add) cmatrix <- abind::abind(cmatrix, cmatrix_add, along = 3) # Adding a momentum term (TBD) obj <- cval + Fobj[set + 1] + beta min_val <- min(obj) ind <- which(obj == min_val)[1] cp_set_add <- c(cp_set[[set[ind] + 1]], set[ind]) cp_set <- append(cp_set, list(cp_set_add)) ind2 <- (cval + Fobj[set + 1]) <= min_val set <- c(set[ind2], t) coef <- coef[, ind2, drop = FALSE] cum_coef <- cum_coef[, ind2, drop = FALSE] cmatrix <- cmatrix[, , ind2, drop = FALSE] Fobj <- c(Fobj, min_val) } # Remove change-points close to the boundaries cp <- cp_set[[n + 1]] if (length(cp) > 0) { ind3 <- (seq_len(length(cp)))[(cp < trim * n) | (cp > (1 - trim) * n)] cp <- cp[-ind3] } nLL <- 0 cp_loc <- unique(c(0, cp, n)) for (i in 1:(length(cp_loc) - 1)) { seg <- (cp_loc[i] + 1):cp_loc[i + 1] data_seg <- data[seg, ] out <- fastglm::fastglm( as.matrix(data_seg[, 1:p]), data_seg[, p + 1], family = "binomial" ) nLL <- out$deviance / 2 + nLL } output <- list(cp, nLL) names(output) <- c("cp", "nLL") return(output) } testthat::test_that("logistic regression", { testthat::skip("This test is intended to be run manually.") # This is the same example with `fastcpd` documentation. Please keep it in # sync if the documentation ever changes. set.seed(1) kChangePointLocation <- 125 kNumberOfDataPoints <- 300 kDimension <- 5 # There are 300 five-dimensional data points. x <- matrix(rnorm(kNumberOfDataPoints * kDimension, 0, 1), ncol = kDimension) # Randomly generate coefficients with different means. theta <- rbind(rnorm(kDimension, 0, 1), rnorm(kDimension, 2, 1)) # Randomly generate response variables based on the segmented data and # corresponding coefficients y <- c( rbinom( kChangePointLocation, 1, 1 / (1 + exp(-x[1:kChangePointLocation, ] %*% theta[1, ])) ), rbinom( kNumberOfDataPoints - kChangePointLocation, 1, 1 / (1 + exp( -x[(kChangePointLocation + 1):kNumberOfDataPoints, ] %*% theta[2, ] )) ) ) change_points_binomial_fastcpd <- suppressWarnings(fastcpd::fastcpd( formula = y ~ . - 1, data = data.frame(y = y, x = x), family = "binomial", segment_count = 5 ))@cp_set change_points_binomial_fastcpd_sanity <- segd_binomial( cbind(x, y), (kDimension + 1) * log(kNumberOfDataPoints) / 2, B = 5 )$cp testthat::expect_equal( change_points_binomial_fastcpd, kChangePointLocation ) testthat::expect_equal( change_points_binomial_fastcpd, change_points_binomial_fastcpd_sanity ) warning_messages <- testthat::capture_warnings( change_points_binomial_fastcpd_vanilla <- fastcpd::fastcpd( formula = y ~ . - 1, data = data.frame(y = y, x = x), family = "binomial", segment_count = 5, vanilla_percentage = 1 )@cp_set ) testthat::expect_equal( sort(warning_messages), rep(c( "fit_glm: algorithm did not converge", "fit_glm: fitted probabilities numerically 0 or 1 occurred" ), c(6, 3984)) ) change_points_binomial_fastcpd_vanilla_sanity <- pelt_vanilla_binomial( cbind(x, y), (kDimension + 1) * log(kNumberOfDataPoints) / 2 )$cp testthat::expect_equal( change_points_binomial_fastcpd_vanilla, kChangePointLocation ) testthat::expect_equal( c(0, change_points_binomial_fastcpd_vanilla), change_points_binomial_fastcpd_vanilla_sanity ) }) #' Cost function for Poisson regression. #' #' @param data Data to be used to calculate the cost values. The last column is #' the response variable. #' @param family Family of the distribution. #' @keywords internal #' #' @noRd #' @return Cost value for the corresponding segment of data. cost_glm_poisson <- function(data, family = "poisson") { data <- as.matrix(data) p <- dim(data)[2] - 1 out <- fastglm(as.matrix(data[, 1:p]), data[, p + 1], family = family) return(out$deviance / 2) } #' Implementation of vanilla PELT for poisson regression type data. #' #' @param data Data to be used for change point detection. #' @param beta Penalty coefficient for the number of change points. #' @param cost Cost function to be used to calculate cost values. #' @keywords internal #' #' @noRd #' @return A list consisting of two: change point locations and negative log #' likelihood values for each segment. pelt_vanilla_poisson <- function(data, beta, cost = cost_glm_poisson) { n <- dim(data)[1] p <- dim(data)[2] - 1 Fobj <- c(-beta, 0) cp_set <- list(NULL, 0) set <- c(0, 1) for (t in 2:n) { m <- length(set) cval <- rep(NA, m) for (i in 1:m) { k <- set[i] + 1 if (t - k >= p - 1) cval[i] <- suppressWarnings(cost(data[k:t, ])) else cval[i] <- 0 } obj <- cval + Fobj[set + 1] + beta min_val <- min(obj) ind <- which(obj == min_val)[1] cp_set_add <- c(cp_set[[set[ind] + 1]], set[ind]) cp_set <- append(cp_set, list(cp_set_add)) ind2 <- (cval + Fobj[set + 1]) <= min_val set <- c(set[ind2], t) Fobj <- c(Fobj, min_val) # if (t %% 100 == 0) print(t) } cp <- cp_set[[n + 1]] output <- list(cp) names(output) <- c("cp") return(output) } #' Function to update the coefficients using gradient descent. #' #' @param data_new New data point used to update the coeffient. #' @param coef Previous coeffient to be updated. #' @param cum_coef Summation of all the past coefficients to be used in #' averaging. #' @param cmatrix Hessian matrix in gradient descent. #' @param epsilon Small adjustment to avoid singularity when doing inverse on #' the Hessian matrix. #' @param G Upper bound for the coefficient. #' @param L Winsorization lower bound. #' @param H Winsorization upper bound. #' @keywords internal #' #' @noRd #' @return A list of values containing the new coefficients, summation of #' coefficients so far and all the Hessian matrices. cost_poisson_update <- function(data_new, coef, cum_coef, cmatrix, epsilon = 0.001, G = 10^10, L = -20, H = 20) { p <- length(data_new) - 1 X_new <- data_new[1:p] Y_new <- data_new[p + 1] eta <- X_new %*% coef mu <- exp(eta) cmatrix <- cmatrix + (X_new %o% X_new) * min(as.numeric(mu), G) lik_dev <- as.numeric(-(Y_new - mu)) * X_new coef <- coef - solve(cmatrix + epsilon * diag(1, p), lik_dev) coef <- Winsorize(coef, minval = L, maxval = H) cum_coef <- cum_coef + coef return(list(coef, cum_coef, cmatrix)) } #' Calculate negative log likelihood given data segment and guess of #' coefficient. #' #' @param data Data to be used to calculate the negative log likelihood. #' @param b Guess of the coefficient. #' @keywords internal #' #' @noRd #' @return Negative log likelihood. neg_log_lik_poisson <- function(data, b) { p <- dim(data)[2] - 1 X <- data[, 1:p, drop = FALSE] Y <- data[, p + 1, drop = FALSE] u <- as.numeric(X %*% b) L <- -Y * u + exp(u) + lfactorial(Y) return(sum(L)) } #' Find change points using dynamic programming with pruning and SeGD. #' #' @param data Data used to find change points. #' @param beta Penalty coefficient for the number of change points. #' @param B Initial guess on the number of change points. #' @param trim Propotion of the data to ignore the change points at the #' beginning, ending and between change points. #' @param epsilon Small adjustment to avoid singularity when doing inverse on #' the Hessian matrix. #' @param G Upper bound for the coefficient. #' @param L Winsorization lower bound. #' @param H Winsorization upper bound. #' @keywords internal #' #' @noRd #' @return A list containing potential change point locations and negative log #' likelihood for each segment based on the change points guess. segd_poisson <- function(data, beta, B = 10, trim = 0.03, epsilon = 0.001, G = 10^10, L = -20, H = 20) { n <- dim(data)[1] p <- dim(data)[2] - 1 Fobj <- c(-beta, 0) cp_set <- list(NULL, 0) set <- c(0, 1) # choose the initial values based on pre-segmentation index <- rep(1:B, rep(n / B, B)) coef.int <- matrix(NA, B, p) for (i in 1:B) { out <- fastglm(x = as.matrix(data[index == i, 1:p]), y = data[index == i, p + 1], family = "poisson") coef.int[i, ] <- coef(out) } X1 <- data[1, 1:p] cum_coef <- coef <- Winsorize(matrix(coef.int[1, ], p, 1), minval = L, maxval = H) e_eta <- exp(coef %*% X1) const <- e_eta cmatrix <- array((X1 %o% X1) * as.numeric(const), c(p, p, 1)) for (t in 2:n) { m <- length(set) cval <- rep(NA, m) for (i in 1:(m - 1)) { coef_c <- coef[, i] cum_coef_c <- cum_coef[, i] cmatrix_c <- cmatrix[, , i] out <- cost_poisson_update(data[t, ], coef_c, cum_coef_c, cmatrix_c, epsilon = epsilon, G = G, L = L, H = H) coef[, i] <- out[[1]] cum_coef[, i] <- out[[2]] cmatrix[, , i] <- out[[3]] k <- set[i] + 1 cum_coef_win <- Winsorize(cum_coef[, i] / (t - k + 1), minval = L, maxval = H) if (t - k >= p - 1) cval[i] <- neg_log_lik_poisson(data[k:t, ], cum_coef_win) else cval[i] <- 0 } # the choice of initial values requires further investigation cval[m] <- 0 Xt <- data[t, 1:p] cum_coef_add <- coef_add <- Winsorize(coef.int[index[t], ], minval = L, maxval = H) #### e_eta_t <- exp(coef_add %*% Xt) const <- e_eta_t cmatrix_add <- (Xt %o% Xt) * as.numeric(const) coef <- cbind(coef, coef_add) cum_coef <- cbind(cum_coef, cum_coef_add) cmatrix <- abind::abind(cmatrix, cmatrix_add, along = 3) # Adding a momentum term (TBD) obj <- cval + Fobj[set + 1] + beta min_val <- min(obj) ind <- which(obj == min_val)[1] cp_set_add <- c(cp_set[[set[ind] + 1]], set[ind]) cp_set <- append(cp_set, list(cp_set_add)) ind2 <- (cval + Fobj[set + 1]) <= min_val set <- c(set[ind2], t) coef <- coef[, ind2, drop = FALSE] cum_coef <- cum_coef[, ind2, drop = FALSE] cmatrix <- cmatrix[, , ind2, drop = FALSE] Fobj <- c(Fobj, min_val) } # Remove change-points close to the boundaries cp <- cp_set[[n + 1]] if (length(cp) > 0) { ind3 <- (1:length(cp))[(cp < trim * n) | (cp > (1 - trim) * n)] if (length(ind3) > 0) cp <- cp[-ind3] } cp <- sort(unique(c(0, cp))) index <- which((diff(cp) < trim * n) == TRUE) if (length(index) > 0) cp <- floor((cp[-(index + 1)] + cp[-index]) / 2) cp <- cp[cp > 0] # nLL <- 0 # cp_loc <- unique(c(0,cp,n)) # for(i in 1:(length(cp_loc)-1)) # { # seg <- (cp_loc[i]+1):cp_loc[i+1] # data_seg <- data[seg,] # out <- fastglm(as.matrix(data_seg[, 1:p]), data_seg[, p+1], family="Poisson") # nLL <- out$deviance/2 + nLL # } # output <- list(cp, nLL) # names(output) <- c("cp", "nLL") output <- list(cp) names(output) <- c("cp") return(output) } # Generate data from poisson regression models with change-points #' @param n Number of observations. #' @param d Dimension of the covariates. #' @param true.coef True regression coefficients. #' @param true.cp.loc True change-point locations. #' @param Sigma Covariance matrix of the covariates. #' @keywords internal #' #' @noRd #' @return A list containing the generated data and the true cluster #' assignments. data_gen_poisson <- function(n, d, true.coef, true.cp.loc, Sigma) { loc <- unique(c(0, true.cp.loc, n)) if (dim(true.coef)[2] != length(loc) - 1) stop("true.coef and true.cp.loc do not match") x <- mvtnorm::rmvnorm(n, mean = rep(0, d), sigma = Sigma) y <- NULL for (i in 1:(length(loc) - 1)) { mu <- exp(x[(loc[i] + 1):loc[i + 1], , drop = FALSE] %*% true.coef[, i, drop = FALSE]) group <- rpois(length(mu), mu) y <- c(y, group) } data <- cbind(x, y) true_cluster <- rep(1:(length(loc) - 1), diff(loc)) result <- list(data, true_cluster) return(result) } testthat::test_that("poisson regression", { testthat::skip("This test is intended to be run manually.") set.seed(1) n <- 1500 d <- 5 rho <- 0.9 Sigma <- array(0, c(d, d)) for (i in 1:d) { Sigma[i, ] <- rho^(abs(i - (1:d))) } delta <- c(5, 7, 9, 11, 13) a.sq <- 1 delta.new <- delta * sqrt(a.sq) / sqrt(as.numeric(t(delta) %*% Sigma %*% delta)) true.cp.loc <- c(375, 750, 1125) # regression coefficients true.coef <- matrix(0, nrow = d, ncol = length(true.cp.loc) + 1) true.coef[, 1] <- c(1, 1.2, -1, 0.5, -2) true.coef[, 2] <- true.coef[, 1] + delta.new true.coef[, 3] <- true.coef[, 1] true.coef[, 4] <- true.coef[, 3] - delta.new out <- data_gen_poisson(n, d, true.coef, true.cp.loc, Sigma) data <- out[[1]] g_tr <- out[[2]] beta <- log(n) * (d + 1) / 2 change_points_poisson_fastcpd <- fastcpd::fastcpd( formula = y ~ . - 1, data = data.frame(y = data[, d + 1], x = data[, 1:d]), epsilon = 0.001, family = "poisson", segment_count = 10 )@cp_set change_points_poisson_fastcpd_sanity <- segd_poisson(data, beta, trim = 0.03, B = 10, epsilon = 0.001, G = 10^10, L = -20, H = 20)$cp warning_messages <- testthat::capture_warnings( change_points_poisson_fastcpd_vanilla <- fastcpd::fastcpd( formula = y ~ . - 1, data = data.frame(y = data[, d + 1], x = data[, 1:d]), family = "poisson", segment_count = 10, vanilla_percentage = 1 )@cp_set ) testthat::expect_equal( warning_messages, rep("fit_glm: fitted rates numerically 0 occurred", 1539) ) change_points_poisson_fastcpd_vanilla_sanity <- pelt_vanilla_poisson(data, beta)$cp testthat::expect_equal( change_points_poisson_fastcpd, c(380, 751, 1136, 1251) ) testthat::expect_equal( change_points_poisson_fastcpd, change_points_poisson_fastcpd_sanity ) testthat::expect_equal( change_points_poisson_fastcpd_vanilla, c(374, 752, 1133) ) testthat::expect_equal( c(0, change_points_poisson_fastcpd_vanilla), change_points_poisson_fastcpd_vanilla_sanity ) }) #' Cost function for penalized linear regression. #' #' @param data Data to be used to calculate the cost values. The last column is #' the response variable. #' @param lambda Penalty coefficient. #' @param family Family of the distribution. #' @keywords internal #' #' @noRd #' @return Cost value for the corresponding segment of data. cost_lasso <- function(data, lambda, family = "gaussian") { data <- as.matrix(data) n <- dim(data)[1] p <- dim(data)[2] - 1 out <- glmnet(as.matrix(data[, 1:p]), data[, p + 1], family = family, lambda = lambda) return(deviance(out) / 2) } #' Implementation of vanilla PELT for penalized linear regression type data. #' #' @param data Data to be used for change point detection. #' @param beta Penalty coefficient for the number of change points. #' @param B Initial guess on the number of change points. #' @param cost Cost function to be used to calculate cost values. #' @param family Family of the distribution. #' @keywords internal #' #' @noRd #' @return A list consisting of two: change point locations and negative log #' likelihood values for each segment. pelt_vanilla_lasso <- function(data, beta, B = 10, cost = cost_lasso, family = "gaussian") { n <- dim(data)[1] p <- dim(data)[2] - 1 index <- rep(1:B, rep(n / B, B)) err_sd <- act_num <- rep(NA, B) for (i in 1:B) { cvfit <- cv.glmnet(as.matrix(data[index == i, 1:p]), data[index == i, p + 1], family = family) coef <- coef(cvfit, s = "lambda.1se")[-1] resi <- data[index == i, p + 1] - as.matrix(data[index == i, 1:p]) %*% as.numeric(coef) err_sd[i] <- sqrt(mean(resi^2)) act_num[i] <- sum(abs(coef) > 0) } err_sd_mean <- mean(err_sd) # only works if error sd is unchanged. act_num_mean <- mean(act_num) beta <- (act_num_mean + 1) * beta # seems to work but there might be better choices Fobj <- c(-beta, 0) cp_set <- list(NULL, 0) set <- c(0, 1) for (t in 2:n) { m <- length(set) cval <- rep(NA, m) for (i in 1:m) { k <- set[i] + 1 if (t - k >= 1) cval[i] <- suppressWarnings(cost(data[k:t, ], lambda = err_sd_mean * sqrt(2 * log(p) / (t - k + 1)))) else cval[i] <- 0 } obj <- cval + Fobj[set + 1] + beta min_val <- min(obj) ind <- which(obj == min_val)[1] cp_set_add <- c(cp_set[[set[ind] + 1]], set[ind]) cp_set <- append(cp_set, list(cp_set_add)) ind2 <- (cval + Fobj[set + 1]) <= min_val set <- c(set[ind2], t) Fobj <- c(Fobj, min_val) if (t %% 100 == 0) print(t) } cp <- cp_set[[n + 1]] # nLL <- 0 # cp_loc <- unique(c(0,cp,n)) # for(i in 1:(length(cp_loc)-1)) # { # seg <- (cp_loc[i]+1):cp_loc[i+1] # data_seg <- data[seg,] # out <- glmnet(as.matrix(data_seg[, 1:p]), data_seg[, p+1], lambda=lambda, family=family) # nLL <- deviance(out)/2 + nLL # } # output <- list(cp, nLL) output <- list(cp) names(output) <- c("cp") return(output) } #' Function to update the coefficients using gradient descent. #' @param a Coefficient to be updated. #' @param lambda Penalty coefficient. #' @keywords internal #' #' @noRd #' @return Updated coefficient. soft_threshold <- function(a, lambda) { sign(a) * pmax(abs(a) - lambda, 0) } #' Function to update the coefficients using gradient descent. #' #' @param data_new New data point used to update the coeffient. #' @param coef Previous coeffient to be updated. #' @param cum_coef Summation of all the past coefficients to be used in #' averaging. #' @param cmatrix Hessian matrix in gradient descent. #' @param lambda Penalty coefficient. #' @keywords internal #' #' @noRd #' @return A list of values containing the new coefficients, summation of #' coefficients so far and all the Hessian matrices. cost_lasso_update <- function(data_new, coef, cum_coef, cmatrix, lambda) { p <- length(data_new) - 1 X_new <- data_new[1:p] Y_new <- data_new[p + 1] mu <- X_new %*% coef cmatrix <- cmatrix + X_new %o% X_new # B <- as.vector(cmatrix_inv%*%X_new) # cmatrix_inv <- cmatrix_inv - B%o%B/(1+sum(X_new*B)) lik_dev <- as.numeric(-(Y_new - mu)) * X_new coef <- coef - solve(cmatrix, lik_dev) nc <- norm(cmatrix, type = "F") # the choice of norm affects the speed. Spectral norm is more accurate but slower than F norm. coef <- soft_threshold(coef, lambda / nc) cum_coef <- cum_coef + coef return(list(coef, cum_coef, cmatrix)) } #' Calculate negative log likelihood given data segment and guess of #' coefficient. #' #' @param data Data to be used to calculate the negative log likelihood. #' @param b Guess of the coefficient. #' @param lambda Penalty coefficient. #' @keywords internal #' #' @noRd #' @return Negative log likelihood. neg_log_lik_lasso <- function(data, b, lambda) { p <- dim(data)[2] - 1 X <- data[, 1:p, drop = FALSE] Y <- data[, p + 1, drop = FALSE] resi <- Y - X %*% b L <- sum(resi^2) / 2 + lambda * sum(abs(b)) return(L) } #' Find change points using dynamic programming with pruning and SeGD. #' #' @param data Data used to find change points. #' @param beta Penalty coefficient for the number of change points. #' @param B Initial guess on the number of change points. #' @param trim Propotion of the data to ignore the change points at the #' beginning, ending and between change points. #' @param epsilon Small adjustment to avoid singularity when doing inverse on #' the Hessian matrix. #' @param family Family of the distribution. #' @keywords internal #' #' @noRd #' @return A list containing potential change point locations and negative log #' likelihood for each segment based on the change points guess. segd_lasso <- function(data, beta, B = 10, trim = 0.025, epsilon = 1e-5, family = "gaussian") { n <- dim(data)[1] p <- dim(data)[2] - 1 Fobj <- c(-beta, 0) cp_set <- list(NULL, 0) set <- c(0, 1) # choose the initial values based on pre-segmentation index <- rep(1:B, rep(n / B, B)) coef.int <- matrix(NA, B, p) err_sd <- act_num <- rep(NA, B) for (i in 1:B) { cvfit <- cv.glmnet(as.matrix(data[index == i, 1:p]), data[index == i, p + 1], family = family) coef.int[i, ] <- coef(cvfit, s = "lambda.1se")[-1] resi <- data[index == i, p + 1] - as.matrix(data[index == i, 1:p]) %*% as.numeric(coef.int[i, ]) err_sd[i] <- sqrt(mean(resi^2)) act_num[i] <- sum(abs(coef.int[i, ]) > 0) } err_sd_mean <- mean(err_sd) # only works if error sd is unchanged. act_num_mean <- mean(act_num) beta <- (act_num_mean + 1) * beta # seems to work but there might be better choices X1 <- data[1, 1:p] cum_coef <- coef <- matrix(coef.int[1, ], p, 1) eta <- coef %*% X1 # c_int <- diag(1/epsilon,p) - X1%o%X1/epsilon^2/(1+sum(X1^2)/epsilon) # cmatrix_inv <- array(c_int, c(p,p,1)) cmatrix <- array(X1 %o% X1 + epsilon * diag(1, p), c(p, p, 1)) for (t in 2:n) { m <- length(set) cval <- rep(NA, m) for (i in 1:(m - 1)) { coef_c <- coef[, i] cum_coef_c <- cum_coef[, i] # cmatrix_inv_c <- cmatrix_inv[,,i] cmatrix_c <- cmatrix[, , i] k <- set[i] + 1 out <- cost_lasso_update(data[t, ], coef_c, cum_coef_c, cmatrix_c, lambda = err_sd_mean * sqrt(2 * log(p) / (t - k + 1))) coef[, i] <- out[[1]] cum_coef[, i] <- out[[2]] # cmatrix_inv[,,i] <- out[[3]] cmatrix[, , i] <- out[[3]] if (t - k >= 2) cval[i] <- neg_log_lik_lasso(data[k:t, ], cum_coef[, i] / (t - k + 1), lambda = err_sd_mean * sqrt(2 * log(p) / (t - k + 1))) else cval[i] <- 0 } # the choice of initial values requires further investigation cval[m] <- 0 Xt <- data[t, 1:p] cum_coef_add <- coef_add <- coef.int[index[t], ] # cmatrix_inv_add <- diag(1/epsilon,p) - Xt%o%Xt/epsilon^2/(1+sum(Xt^2)/epsilon) coef <- cbind(coef, coef_add) cum_coef <- cbind(cum_coef, cum_coef_add) # cmatrix_inv <- abind::abind(cmatrix_inv, cmatrix_inv_add, along=3) cmatrix <- abind::abind(cmatrix, Xt %o% Xt + epsilon * diag(1, p), along = 3) obj <- cval + Fobj[set + 1] + beta min_val <- min(obj) ind <- which(obj == min_val)[1] cp_set_add <- c(cp_set[[set[ind] + 1]], set[ind]) cp_set <- append(cp_set, list(cp_set_add)) ind2 <- (cval + Fobj[set + 1]) <= min_val set <- c(set[ind2], t) coef <- coef[, ind2, drop = FALSE] cum_coef <- cum_coef[, ind2, drop = FALSE] cmatrix <- cmatrix[, , ind2, drop = FALSE] # cmatrix_inv <- cmatrix_inv[,,ind2,drop=FALSE] Fobj <- c(Fobj, min_val) } # Remove change-points close to the boundaries and merge change-points cp <- cp_set[[n + 1]] if (length(cp) > 0) { ind3 <- (1:length(cp))[(cp < trim * n) | (cp > (1 - trim) * n)] if (length(ind3) > 0) cp <- cp[-ind3] } cp <- sort(unique(c(0, cp))) index <- which((diff(cp) < trim * n) == TRUE) if (length(index) > 0) cp <- floor((cp[-(index + 1)] + cp[-index]) / 2) cp <- cp[cp > 0] # nLL <- 0 # cp_loc <- unique(c(0,cp,n)) # for(i in 1:(length(cp_loc)-1)) # { # seg <- (cp_loc[i]+1):cp_loc[i+1] # data_seg <- data[seg,] # out <- fastglm(as.matrix(data_seg[, 1:p]), data_seg[, p+1], family="binomial") # nLL <- out$deviance/2 + nLL # } output <- list(cp) names(output) <- c("cp") return(output) } # Generate data from penalized linear regression models with change-points #' @param n Number of observations. #' @param d Dimension of the covariates. #' @param true.coef True regression coefficients. #' @param true.cp.loc True change-point locations. #' @param Sigma Covariance matrix of the covariates. #' @param evar Error variance. #' @keywords internal #' #' @noRd #' @return A list containing the generated data and the true cluster #' assignments. data_gen_lasso <- function(n, d, true.coef, true.cp.loc, Sigma, evar) { loc <- unique(c(0, true.cp.loc, n)) if (dim(true.coef)[2] != length(loc) - 1) stop("true.coef and true.cp.loc do not match") x <- mvtnorm::rmvnorm(n, mean = rep(0, d), sigma = Sigma) y <- NULL for (i in 1:(length(loc) - 1)) { Xb <- x[(loc[i] + 1):loc[i + 1], , drop = FALSE] %*% true.coef[, i, drop = FALSE] add <- Xb + rnorm(length(Xb), sd = sqrt(evar)) y <- c(y, add) } data <- cbind(x, y) true_cluster <- rep(1:(length(loc) - 1), diff(loc)) result <- list(data, true_cluster) return(result) } testthat::test_that("penalized linear regression", { testthat::skip("This test is intended to be run manually.") set.seed(1) n <- 1000 s <- 3 d <- 50 evar <- 0.5 Sigma <- diag(1, d) true.cp.loc <- c(100, 300, 500, 800, 900) seg <- length(true.cp.loc) + 1 true.coef <- matrix(rnorm(seg * s), s, seg) true.coef <- rbind(true.coef, matrix(0, d - s, seg)) out <- data_gen_lasso(n, d, true.coef, true.cp.loc, Sigma, evar) data <- out[[1]] beta <- log(n) / 2 # beta here has different meaning change_points_lasso_fastcpd <- fastcpd::fastcpd( formula = y ~ . - 1, data = data.frame(y = data[, d + 1], x = data[, 1:d]), family = "lasso" )@cp_set change_points_lasso_fastcpd_sanity <- segd_lasso(data, beta, B = 10, trim = 0.025)$cp # TODO(doccstat): Deal with the bugs. # change_points_lasso_fastcpd_vanilla <- fastcpd::fastcpd( # formula = y ~ . - 1, # data = data.frame(y = data[, d + 1], x = data[, 1:d]), # family = "lasso", # vanilla_percentage = 1 # )@cp_set change_points_lasso_fastcpd_vanilla_sanity <- pelt_vanilla_lasso(data, beta, cost = cost_lasso)$cp testthat::expect_equal( change_points_lasso_fastcpd, c(315, 798) ) # TODO(doccstat): Check why the results are different. testthat::expect_equal( change_points_lasso_fastcpd_sanity, c(100, 300, 520, 800, 901) ) # testthat::expect_equal( # change_points_lasso_fastcpd_vanilla, # c(374, 752, 1133) # ) testthat::expect_equal( c(0, 103, 299, 510, 800, 900), change_points_lasso_fastcpd_vanilla_sanity ) }) # nolint end testthat::test_that( "build-in binomial performance on large data set with n = 10^4, p = 5", { testthat::skip("This test is intended to be run manually.") set.seed(1) n <- 10^4 p <- 5 segment_count <- n / 500 theta_mean <- 5 x <- mvtnorm::rmvnorm(n, mean = rep(0, p), sigma = diag(1, p)) theta <- matrix(NA, segment_count, p) for (segment_count_index in seq_len(segment_count)) { theta[segment_count_index, ] <- rnorm(p, theta_mean, 5) theta_mean <- -theta_mean } y <- matrix(NA, n, 1) for (segment_count_index in seq_len(segment_count)) { segment_index <- (segment_count_index - 1) * 500 + seq_len(500) segment_theta <- theta[segment_count_index, ] y[segment_index] <- rbinom(500, 1, 1 / (1 + exp(-x[segment_index, ] %*% segment_theta))) } warning_messages <- testthat::capture_warnings( runtime <- system.time( result <- fastcpd::fastcpd( formula = y ~ . - 1, data = data.frame(y = y, x = x), family = "binomial" ) ) ) testthat::expect_equal( warning_messages, rep("fit_glm: fitted probabilities numerically 0 or 1 occurred", 16) ) # Discard the runtime value since it is different depending on the machine. # The run time is less than 10 seconds on a GitHub codespace with 2 cores. invisible(runtime) true_change_points <- 500 * seq_len(segment_count - 1) testthat::expect_equal( result@cp_set, true_change_points + c(1, 1, 0, 5, 0, 2, 0, -1, 2, 2, 1, 1, 2, 2, -4, 2, 10, 2, 0) ) } ) testthat::test_that( "build-in binomial performance on large data set with n = 10^4, p = 10", { testthat::skip("This test is intended to be run manually.") set.seed(1) n <- 10^4 p <- 10 segment_count <- n / 500 theta_mean <- 5 x <- mvtnorm::rmvnorm(n, mean = rep(0, p), sigma = diag(1, p)) theta <- matrix(NA, segment_count, p) for (segment_count_index in seq_len(segment_count)) { theta[segment_count_index, ] <- rnorm(p, theta_mean, 5) theta_mean <- -theta_mean } y <- matrix(NA, n, 1) for (segment_count_index in seq_len(segment_count)) { segment_index <- (segment_count_index - 1) * 500 + seq_len(500) segment_theta <- theta[segment_count_index, ] y[segment_index] <- rbinom(500, 1, 1 / (1 + exp(-x[segment_index, ] %*% segment_theta))) } warning_messages <- testthat::capture_warnings( runtime <- system.time( result <- fastcpd::fastcpd( formula = y ~ . - 1, data = data.frame(y = y, x = x), family = "binomial" ) ) ) testthat::expect_equal( warning_messages, rep("fit_glm: fitted probabilities numerically 0 or 1 occurred", 14) ) # Discard the runtime value since it is different depending on the machine. # The run time is less than 10 seconds on a GitHub codespace with 2 cores. invisible(runtime) # user system elapsed # 10.444 0.106 7.765 true_change_points <- 500 * seq_len(segment_count - 1) testthat::expect_equal( result@cp_set, true_change_points + c(5, 6, 5, 6, 5, 4, -4, 4, -1, 2, 1, 0, 2, 1, 5, 1, 4, 9, -1) ) } ) testthat::test_that( "build-in binomial performance on large data set with n = 10^4, p = 20", { testthat::skip("This test is intended to be run manually.") set.seed(1) n <- 10^4 p <- 20 segment_count <- n / 500 theta_mean <- 5 x <- mvtnorm::rmvnorm(n, mean = rep(0, p), sigma = diag(1, p)) theta <- matrix(NA, segment_count, p) for (segment_count_index in seq_len(segment_count)) { theta[segment_count_index, ] <- rnorm(p, theta_mean, 5) theta_mean <- -theta_mean } y <- matrix(NA, n, 1) for (segment_count_index in seq_len(segment_count)) { segment_index <- (segment_count_index - 1) * 500 + seq_len(500) segment_theta <- theta[segment_count_index, ] y[segment_index] <- rbinom(500, 1, 1 / (1 + exp(-x[segment_index, ] %*% segment_theta))) } warning_messages <- testthat::capture_warnings( runtime <- system.time( result <- fastcpd::fastcpd( formula = y ~ . - 1, data = data.frame(y = y, x = x), family = "binomial" ) ) ) testthat::expect_equal( warning_messages, rep("fit_glm: fitted probabilities numerically 0 or 1 occurred", 7) ) # Discard the runtime value since it is different depending on the machine. # The run time is less than 10 seconds on a GitHub codespace with 2 cores. invisible(runtime) # user system elapsed # 36.672 14.923 28.899 true_change_points <- 500 * seq_len(segment_count - 1) testthat::expect_equal( result@cp_set, true_change_points + c(11, -1, 8, 9, 12, 5, 9, -16, 28, 0, 0, 66, 26, 38, 7, 36, 8, 9, -3) ) } ) testthat::test_that( "build-in binomial performance on large data set with n = 3 * 10^4, p = 30", { testthat::skip("This test is intended to be run manually.") set.seed(1) n <- 3 * 10^4 p <- 30 segment_count <- n / 1000 theta_mean <- 5 x <- mvtnorm::rmvnorm(n, mean = rep(0, p), sigma = diag(1, p)) theta <- matrix(NA, segment_count, p) for (segment_count_index in seq_len(segment_count)) { theta[segment_count_index, ] <- rnorm(p, theta_mean, 5) theta_mean <- -theta_mean } y <- matrix(NA, n, 1) for (segment_count_index in seq_len(segment_count)) { segment_index <- (segment_count_index - 1) * 1000 + seq_len(1000) segment_theta <- theta[segment_count_index, ] y[segment_index] <- rbinom(1000, 1, 1 / (1 + exp(-x[segment_index, ] %*% segment_theta))) } warning_messages <- testthat::capture_warnings( runtime <- system.time( result <- fastcpd::fastcpd( formula = y ~ . - 1, data = data.frame(y = y, x = x), family = "binomial" ) ) ) testthat::expect_equal( warning_messages, rep("fit_glm: fitted probabilities numerically 0 or 1 occurred", 8) ) # Discard the runtime value since it is different depending on the machine. # The run time is less than 10 seconds on a GitHub codespace with 2 cores. invisible(runtime) # user system elapsed # 683.446 356.068 576.744 true_change_points <- 1000 * seq_len(segment_count - 1) testthat::expect_equal( result@cp_set, na.exclude(true_change_points + c( 21, 50, 3, 91, 12, 27, NA, -317, 36, 168, 19, 79, 43, 97, 0, 18, 5, 9, 63, 16, 26, 95, 4, -4, 24, 30, 57, 19, 19 )), ignore_attr = TRUE ) } )