testthat::test_that( "invalid family provided", { testthat::expect_error( fastcpd( data = data.frame(y = 0, x = 0), family = "bin0mial" ), r"[ The family should be one of "gaussian", "binomial", "poisson", "lasso", "ar", "var", "custom" or `NULL` while the provided family is bin0mial.]" ) } ) testthat::test_that( "custom family but no cost function provided", { testthat::expect_error( fastcpd( data = data.frame(y = 0, x = 0) ), "cost function must be specified for custom family" ) } ) testthat::test_that( "invalid order for ar family", { testthat::expect_error( fastcpd( formula = ~ x - 1, data = data.frame(x = 0), family = "ar", ), "Please specify the order of the AR model as the parameter `p`." ) } ) testthat::test_that( "example linear regression", { testthat::skip_if_not_installed("mvtnorm") set.seed(1) p <- 3 x <- mvtnorm::rmvnorm(300, rep(0, p), diag(p)) theta_0 <- rbind(c(1, 1.2, -1), c(-1, 0, 0.5), c(0.5, -0.3, 0.2)) y <- c( x[1:100, ] %*% theta_0[1, ] + rnorm(100, 0, 1), x[101:200, ] %*% theta_0[2, ] + rnorm(100, 0, 1), x[201:300, ] %*% theta_0[3, ] + rnorm(100, 0, 1) ) result <- fastcpd( formula = y ~ . - 1, data = data.frame(y = y, x = x), family = "gaussian" ) testthat::expect_equal(result@cp_set, c(98, 202)) } ) testthat::test_that( "linear regression without change points", { result <- fastcpd( formula = y ~ . - 1, data = data.frame(y = seq_len(100), x = seq_len(100)), family = "gaussian" ) testthat::expect_length(result@cp_set, 0) } ) testthat::test_that( "example linear regression with noise variance not equal to 1", { set.seed(1) p <- 4 n <- 300 cp <- c(100, 200) x <- mvtnorm::rmvnorm(n, rep(0, p), diag(p)) theta_0 <- rbind(c(1, 3.2, -1, 0), c(-1, -0.5, 2.5, -2), c(0.8, -0.3, 1, 1)) y <- c( x[1:cp[1], ] %*% theta_0[1, ] + rnorm(cp[1], 0, sd = 3), x[(cp[1] + 1):cp[2], ] %*% theta_0[2, ] + rnorm(cp[2] - cp[1], 0, sd = 3), x[(cp[2] + 1):n, ] %*% theta_0[3, ] + rnorm(n - cp[2], 0, sd = 3) ) # Compute the variance estimation for each block and then take the average. block_size <- 5 variance_estimation <- rep(NA, n - block_size) for (i in 1:(n - block_size)) { block_index <- seq_len(block_size) + i - 1 block_index_lagged <- seq_len(block_size) + i y_block <- y[block_index] x_block <- x[block_index, ] y_block_lagged <- y[block_index_lagged] x_block_lagged <- x[block_index_lagged, ] x_t_x <- crossprod(x_block) x_t_x_lagged <- crossprod(x_block_lagged) block_slope <- solve(crossprod(x_block), crossprod(x_block, y_block)) block_lagged_slope <- solve( crossprod(x_block_lagged), crossprod(x_block_lagged, y_block_lagged) ) x_t_x_inv <- solve(x_t_x) x_t_x_inv_lagged <- solve(x_t_x_lagged) inv_product <- x_t_x_inv %*% x_t_x_inv_lagged cross_term <- inv_product %*% crossprod(x_block[-1, ], x_block_lagged[-block_size, ]) delta_numerator <- crossprod(block_slope - block_lagged_slope) delta_denominator <- sum(diag(x_t_x_inv + x_t_x_inv_lagged - 2 * cross_term)) variance_estimation[i] <- delta_numerator / delta_denominator } testthat::expect_equal(mean(variance_estimation), 10.472932) result <- fastcpd( data = data.frame(y = y, x = x), beta = (p + 1) * log(n) / 2 * mean(variance_estimation), family = "gaussian", ) testthat::expect_equal(result@cp_set, c(100, 201)) result_internal_variance <- fastcpd( data = data.frame(y = y, x = x), family = "gaussian", ) testthat::expect_equal( result_internal_variance@cp_set, c(100, 201) ) } ) testthat::test_that( "example linear regression with one-dimensional covariate", { testthat::skip_if_not_installed("mvtnorm") set.seed(1) p <- 1 x <- mvtnorm::rmvnorm(300, rep(0, p), diag(p)) theta_0 <- matrix(c(1, -1, 0.5)) y <- c( x[1:100, ] * theta_0[1, ] + rnorm(100, 0, 1), x[101:200, ] * theta_0[2, ] + rnorm(100, 0, 1), x[201:300, ] * theta_0[3, ] + rnorm(100, 0, 1) ) result <- fastcpd( formula = y ~ . - 1, data = data.frame(y = y, x = x), family = "gaussian" ) testthat::expect_equal(result@cp_set, c(100, 194)) } ) testthat::test_that( "linear regression with multi-dimensional responses", { testthat::skip_if_not_installed("mvtnorm") set.seed(1) n <- 300 p <- 3 y_count <- 2 x <- mvtnorm::rmvnorm(n, rep(0, p), diag(p)) theta_0 <- array(NA, dim = c(3, y_count, 3)) theta_0[, , 1] <- cbind(c(1, 1.2, -1), c(-1, 0, 0.5)) theta_0[, , 2] <- cbind(c(-1, 0, 0.5), c(0.5, -0.3, 0.2)) theta_0[, , 3] <- cbind(c(0.5, -0.3, 0.2), c(1, 1.2, -1)) y <- rbind( x[1:100, ] %*% theta_0[, , 1], x[101:200, ] %*% theta_0[, , 2], x[201:n, ] %*% theta_0[, , 3] ) + matrix(rnorm(n * y_count), ncol = y_count) multi_response_linear_loss <- function(data) { x <- data[, (ncol(data) - p + 1):ncol(data)] y <- data[, 1:(ncol(data) - p)] if (nrow(data) <= p) { x_t_x <- diag(p) } else { x_t_x <- crossprod(x) } norm(y - x %*% solve(x_t_x, t(x)) %*% y, type = "F")^2 / 2 } result <- fastcpd( formula = y ~ x - 1, data = data.frame(y = y, x = x), beta = (2 * p + 1) * log(n) / 2, cost = multi_response_linear_loss, cp_only = TRUE ) testthat::expect_equal(result@cp_set, c(102, 195)) } ) testthat::test_that( "example logistic regression", { set.seed(1) x <- matrix(rnorm(1500, 0, 1), ncol = 5) theta <- rbind(rnorm(5, 0, 1), rnorm(5, 2, 1)) y <- c( rbinom(125, 1, 1 / (1 + exp(-x[1:125, ] %*% theta[1, ]))), rbinom(175, 1, 1 / (1 + exp(-x[126:300, ] %*% theta[2, ]))) ) result <- suppressWarnings(fastcpd( formula = y ~ . - 1, data = data.frame(y = y, x = x), family = "binomial" )) testthat::expect_equal(result@cp_set, 126) } ) testthat::test_that( "logistic regression with warm start", { set.seed(1) x <- matrix(rnorm(1500, 0, 1), ncol = 5) theta <- rbind(rnorm(5, 0, 1), rnorm(5, 2, 1)) y <- c( rbinom(125, 1, 1 / (1 + exp(-x[1:125, ] %*% theta[1, ]))), rbinom(175, 1, 1 / (1 + exp(-x[126:300, ] %*% theta[2, ]))) ) result <- suppressWarnings(fastcpd( formula = y ~ . - 1, data = data.frame(y = y, x = x), family = "binomial", vanilla_percentage = 1, warm_start = TRUE )) testthat::expect_equal(result@cp_set, 121) } ) testthat::test_that( "example poisson regression", { testthat::skip_if_not_installed("mvtnorm") set.seed(1) p <- 3 x <- mvtnorm::rmvnorm(1500, rep(0, p), diag(p)) delta <- rnorm(p) theta_0 <- c(1, 1.2, -1) y <- c( rpois(300, exp(x[1:300, ] %*% theta_0)), rpois(400, exp(x[301:700, ] %*% (theta_0 + delta))), rpois(300, exp(x[701:1000, ] %*% theta_0)), rpois(100, exp(x[1001:1100, ] %*% (theta_0 - delta))), rpois(200, exp(x[1101:1300, ] %*% theta_0)), rpois(200, exp(x[1301:1500, ] %*% (theta_0 + delta))) ) result <- fastcpd( formula = y ~ . - 1, data = data.frame(y = y, x = x), beta = (p + 1) * log(1500) / 2, k = function(x) 0, family = "poisson", epsilon = 1e-5 ) result_two_epochs <- fastcpd( formula = y ~ . - 1, data = data.frame(y = y, x = x), beta = (p + 1) * log(1500) / 2, k = function(x) 1, family = "poisson", epsilon = 1e-4 ) testthat::expect_equal(result@cp_set, c(329, 728, 1021, 1107, 1325)) testthat::expect_equal( result_two_epochs@cp_set, c(328, 716, 1020, 1102, 1323) ) } ) testthat::test_that( "example penalized linear regression", { testthat::skip_if_not_installed("mvtnorm") set.seed(1) n <- 1500 p_true <- 6 p <- 50 x <- mvtnorm::rmvnorm(1500, rep(0, p), diag(p)) theta_0 <- rbind( runif(p_true, -5, -2), runif(p_true, -3, 3), runif(p_true, 2, 5), runif(p_true, -5, 5) ) theta_0 <- cbind(theta_0, matrix(0, ncol = p - p_true, nrow = 4)) y <- c( x[1:300, ] %*% theta_0[1, ] + rnorm(300, 0, 1), x[301:700, ] %*% theta_0[2, ] + rnorm(400, 0, 1), x[701:1000, ] %*% theta_0[3, ] + rnorm(300, 0, 1), x[1001:1500, ] %*% theta_0[4, ] + rnorm(500, 0, 1) ) result <- fastcpd( formula = y ~ . - 1, data = data.frame(y = y, x = x), family = "lasso" ) testthat::expect_equal(result@cp_set, c(300, 700, 1000)) testthat::expect_equal( unname(sqrt(colSums((t(theta_0[, 1:6]) - result@thetas[1:6, ])^2))), c(0.5801021, 0.5439067, 0.4677495, 0.4898553), tolerance = 1e-7, ignore_attr = TRUE ) } ) testthat::test_that( "penalized linear regression with smaller sample size", { testthat::skip_if_not_installed("mvtnorm") set.seed(1) n <- 1500 p_true <- 6 p <- 50 x <- mvtnorm::rmvnorm(480, rep(0, p), diag(p)) theta_0 <- rbind( runif(p_true, -5, -2), runif(p_true, -3, 3), runif(p_true, 2, 5), runif(p_true, -5, 5) ) theta_0 <- cbind(theta_0, matrix(0, ncol = p - p_true, nrow = 4)) y <- c( x[1:80, ] %*% theta_0[1, ] + rnorm(80, 0, 1), x[81:200, ] %*% theta_0[2, ] + rnorm(120, 0, 1), x[201:320, ] %*% theta_0[3, ] + rnorm(120, 0, 1), x[321:480, ] %*% theta_0[4, ] + rnorm(160, 0, 1) ) result <- fastcpd( formula = y ~ . - 1, data = data.frame(y = y, x = x), family = "lasso" ) testthat::expect_equal(result@cp_set, c(79, 201, 325)) result_multiple_epochs <- fastcpd( formula = y ~ . - 1, data = data.frame(y = y, x = x), family = "lasso", k = function(x) if (x < 20) 1 else 0 ) testthat::expect_equal(result_multiple_epochs@cp_set, c(80, 200, 320)) result_vanilla_percentage <- fastcpd( formula = y ~ . - 1, data = data.frame(y = y, x = x), family = "lasso", vanilla_percentage = 0.2 ) testthat::expect_equal(result_vanilla_percentage@cp_set, c(79, 201, 321)) } ) testthat::test_that( "example custom logistic regression", { set.seed(1) p <- 5 x <- matrix(rnorm(375 * p, 0, 1), ncol = p) theta <- rbind(rnorm(p, 0, 1), rnorm(p, 2, 1)) y <- c( rbinom(200, 1, 1 / (1 + exp(-x[1:200, ] %*% theta[1, ]))), rbinom(175, 1, 1 / (1 + exp(-x[201:375, ] %*% theta[2, ]))) ) data <- data.frame(y = y, x = x) warning_messages <- testthat::capture_warnings( result_builtin <- fastcpd( formula = y ~ . - 1, data = data, family = "binomial" ) ) logistic_loss <- function(data, theta) { x <- data[, -1] y <- data[, 1] u <- x %*% theta nll <- -y * u + log(1 + exp(u)) nll[u > 10] <- -y[u > 10] * u[u > 10] + u[u > 10] sum(nll) } logistic_loss_gradient <- function(data, theta) { x <- data[nrow(data), -1] y <- data[nrow(data), 1] c(-(y - 1 / (1 + exp(-x %*% theta)))) * x } logistic_loss_hessian <- function(data, theta) { x <- data[nrow(data), -1] prob <- 1 / (1 + exp(-x %*% theta)) (x %o% x) * c((1 - prob) * prob) } result_custom <- fastcpd( formula = y ~ . - 1, data = data, epsilon = 1e-5, cost = logistic_loss, cost_gradient = logistic_loss_gradient, cost_hessian = logistic_loss_hessian ) result_custom_two_epochs <- fastcpd( formula = y ~ . - 1, data = data, k = function(x) 1, epsilon = 1e-5, cost = logistic_loss, cost_gradient = logistic_loss_gradient, cost_hessian = logistic_loss_hessian ) testthat::expect_equal( warning_messages, rep("fit_glm: fitted probabilities numerically 0 or 1 occurred", 3) ) testthat::expect_equal(result_builtin@cp_set, 200) testthat::expect_equal(result_custom@cp_set, 201) testthat::expect_equal(result_custom_two_epochs@cp_set, 200) } ) testthat::test_that( "custom one-dimensional logistic regression", { set.seed(1) p <- 1 x <- matrix(rnorm(375 * p, 0, 1), ncol = p) theta <- rbind(rnorm(p, 0, 1), rnorm(p, 2, 1)) y <- c( rbinom(200, 1, 1 / (1 + exp(-x[1:200, ] * theta[1, ]))), rbinom(175, 1, 1 / (1 + exp(-x[201:375, ] * theta[2, ]))) ) data <- data.frame(y = y, x = x) result_builtin <- fastcpd( formula = y ~ . - 1, data = data, family = "binomial" ) logistic_loss <- function(data, theta) { x <- data[, -1] y <- data[, 1] u <- x * c(theta) nll <- -y * u + log(1 + exp(u)) nll[u > 10] <- -y[u > 10] * u[u > 10] + u[u > 10] sum(nll) } logistic_loss_gradient <- function(data, theta) { x <- data[nrow(data), -1] y <- data[nrow(data), 1] c(-(y - 1 / (1 + exp(-x * theta)))) * x } logistic_loss_hessian <- function(data, theta) { x <- data[nrow(data), -1] prob <- 1 / (1 + exp(-x * theta)) (x %o% x) * c((1 - prob) * prob) } result_custom <- fastcpd( formula = y ~ . - 1, data = data, epsilon = 1e-5, cost = logistic_loss, cost_gradient = logistic_loss_gradient, cost_hessian = logistic_loss_hessian ) testthat::expect_equal(result_builtin@cp_set, 198) testthat::expect_equal(result_custom@cp_set, 200) } ) testthat::test_that( "example custom cost function mean change", { testthat::skip_if_not_installed("mvtnorm") set.seed(1) p <- 1 data <- rbind( mvtnorm::rmvnorm(300, mean = rep(0, p), sigma = diag(100, p)), mvtnorm::rmvnorm(400, mean = rep(50, p), sigma = diag(100, p)), mvtnorm::rmvnorm(300, mean = rep(2, p), sigma = diag(100, p)) ) segment_count_guess <- 10 block_size <- max(floor(sqrt(nrow(data)) / (segment_count_guess + 1)), 2) block_count <- floor(nrow(data) / block_size) data_all_vars <- rep(0, block_count) for (block_index in seq_len(block_count)) { block_start <- (block_index - 1) * block_size + 1 block_end <- if (block_index < block_count) { block_index * block_size } else { nrow(data) } data_all_vars[block_index] <- var(data[block_start:block_end, ]) } data_all_var <- mean(data_all_vars) mean_loss <- function(data) { n <- nrow(data) n / 2 * ( log(data_all_var) + log(2 * pi) + sum((data - colMeans(data))^2 / data_all_var) / n ) } mean_loss_result <- fastcpd( formula = ~ . - 1, data = data.frame(data), beta = (p + 1) * log(nrow(data)) / 2, p = p, cost = mean_loss ) testthat::expect_equal(mean_loss_result@cp_set, c(300, 700)) } ) testthat::test_that( "example custom cost function multivariate mean change", { testthat::skip_if_not_installed("mvtnorm") set.seed(1) p <- 3 data <- rbind( mvtnorm::rmvnorm(300, mean = rep(0, p), sigma = diag(100, p)), mvtnorm::rmvnorm(400, mean = rep(50, p), sigma = diag(100, p)), mvtnorm::rmvnorm(300, mean = rep(2, p), sigma = diag(100, p)) ) segment_count_guess <- 5 block_size <- max(floor(sqrt(nrow(data)) / (segment_count_guess + 1)), 2) block_count <- floor(nrow(data) / block_size) data_all_covs <- array(NA, dim = c(block_count, p, p)) for (block_index in seq_len(block_count)) { block_start <- (block_index - 1) * block_size + 1 block_end <- if (block_index < block_count) { block_index * block_size } else { nrow(data) } data_all_covs[block_index, , ] <- cov(data[block_start:block_end, ]) } data_all_cov <- colMeans(data_all_covs) mean_loss <- function(data) { n <- nrow(data) demeaned_data <- sweep(data, 2, colMeans(data)) n / 2 * ( log(det(data_all_cov)) + p * log(2 * pi) + sum(diag(solve(data_all_cov, crossprod(demeaned_data)))) / n ) } mean_loss_result <- fastcpd( formula = ~ . - 1, data = data.frame(data), beta = (p + 1) * log(nrow(data)) / 2, p = p, cost = mean_loss ) testthat::expect_equal(mean_loss_result@cp_set, c(300, 700)) } ) testthat::test_that( "example custom cost function variance change", { testthat::skip_if_not_installed("mvtnorm") set.seed(1) p <- 1 data <- rbind.data.frame( mvtnorm::rmvnorm(300, mean = rep(0, p), sigma = diag(1, p)), mvtnorm::rmvnorm(400, mean = rep(0, p), sigma = diag(50, p)), mvtnorm::rmvnorm(300, mean = rep(0, p), sigma = diag(2, p)) ) data_all_mean <- colMeans(data) var_loss <- function(data) { n <- nrow(data) data_cov <- crossprod(sweep(data, 2, data_all_mean)) / (n - 1) n / 2 * (log(data_cov) + log(2 * pi) + (n - 1) / n) } var_loss_result <- fastcpd( formula = ~ . - 1, data = data, beta = (p + 1) * log(nrow(data)) / 2, p = p, cost = var_loss ) testthat::expect_equal(var_loss_result@cp_set, c(300, 699)) } ) testthat::test_that( "example custom cost function multivariate variance change", { testthat::skip_if_not_installed("mvtnorm") set.seed(1) p <- 3 data <- rbind.data.frame( mvtnorm::rmvnorm( 300, rep(0, p), crossprod(matrix(runif(p^2) * 2 - 1, p)) ), mvtnorm::rmvnorm( 400, rep(0, p), crossprod(matrix(runif(p^2) * 2 - 1, p)) ), mvtnorm::rmvnorm( 300, rep(0, p), crossprod(matrix(runif(p^2) * 2 - 1, p)) ) ) data_all_mean <- colMeans(data) var_loss <- function(data) { n <- nrow(data) p <- ncol(data) if (n < p) { data_cov <- diag(p) } else { data_cov <- crossprod(sweep(data, 2, data_all_mean)) / (n - 1) } n / 2 * (log(det(data_cov)) + p * log(2 * pi) + p * (n - 1) / n) } var_loss_result <- fastcpd( formula = ~ . - 1, data = data, beta = (p^2 + 1) * log(nrow(data)) / 2, trim = 0.1, p = p^2, cost = var_loss ) testthat::expect_equal(var_loss_result@cp_set, c(300, 700)) } ) testthat::test_that( "example custom cost function mean or variance change", { testthat::skip_if_not_installed("mvtnorm") set.seed(1) p <- 1 data <- rbind.data.frame( mvtnorm::rmvnorm(300, mean = rep(0, p), sigma = diag(1, p)), mvtnorm::rmvnorm(400, mean = rep(10, p), sigma = diag(1, p)), mvtnorm::rmvnorm(300, mean = rep(0, p), sigma = diag(50, p)), mvtnorm::rmvnorm(300, mean = rep(0, p), sigma = diag(1, p)), mvtnorm::rmvnorm(400, mean = rep(10, p), sigma = diag(1, p)), mvtnorm::rmvnorm(300, mean = rep(10, p), sigma = diag(50, p)) ) meanvar_loss <- function(data) { n <- nrow(data) data_cov <- 1 if (n > 1) { data_cov <- var(data) } n / 2 * (log(data_cov) + log(2 * pi) + (n - 1) / n) } meanvar_loss_result <- fastcpd( formula = ~ . - 1, data = data, beta = (p^2 + p + 1) * log(nrow(data)) / 2, p = p^2 + p, cost = meanvar_loss ) testthat::expect_equal( meanvar_loss_result@cp_set, c(300, 700, 1000, 1300, 1700) ) } ) testthat::test_that( "example custom cost function multivariate mean or variance change", { testthat::skip_if_not_installed("mvtnorm") set.seed(1) p <- 3 data <- rbind.data.frame( mvtnorm::rmvnorm(300, mean = rep(0, p), sigma = diag(1, p)), mvtnorm::rmvnorm(400, mean = rep(10, p), sigma = diag(1, p)), mvtnorm::rmvnorm(300, mean = rep(0, p), sigma = diag(50, p)), mvtnorm::rmvnorm(300, mean = rep(0, p), sigma = diag(1, p)), mvtnorm::rmvnorm(400, mean = rep(10, p), sigma = diag(1, p)), mvtnorm::rmvnorm(300, mean = rep(10, p), sigma = diag(50, p)) ) meanvar_loss <- function(data) { n <- nrow(data) p <- ncol(data) if (n <= p) { data_cov <- diag(p) } else { data_cov <- cov(data) } n / 2 * (log(det(data_cov)) + p * log(2 * pi) + p * (n - 1) / n) } meanvar_loss_result <- fastcpd( formula = ~ . - 1, data = data, beta = (p^2 + p + 1) * log(nrow(data)) / 2, trim = 0.01, p = p^2 + p, cost = meanvar_loss ) testthat::expect_equal( meanvar_loss_result@cp_set, c(300, 700, 1000, 1300, 1700) ) } ) testthat::test_that( "custom cost function varince change in mean or var change data", { testthat::skip_if_not_installed("mvtnorm") set.seed(1) p <- 3 data <- rbind.data.frame( mvtnorm::rmvnorm(300, mean = rep(0, p), sigma = diag(1, p)), mvtnorm::rmvnorm(400, mean = rep(10, p), sigma = diag(1, p)), mvtnorm::rmvnorm(300, mean = rep(0, p), sigma = diag(50, p)), mvtnorm::rmvnorm(300, mean = rep(0, p), sigma = diag(1, p)), mvtnorm::rmvnorm(400, mean = rep(10, p), sigma = diag(1, p)), mvtnorm::rmvnorm(300, mean = rep(10, p), sigma = diag(50, p)) ) data_all_mean <- colMeans(data) var_loss <- function(data) { n <- nrow(data) p <- ncol(data) if (n < p) { data_cov <- diag(p) } else { data_cov <- crossprod(sweep(data, 2, data_all_mean)) / (n - 1) } n / 2 * (log(det(data_cov)) + p * log(2 * pi) + p * (n - 1) / n) } var_loss_result <- fastcpd( formula = ~ . - 1, data = data, beta = (p^2 + 1) * log(nrow(data)) / 2, trim = 0.1, p = p^2, cost = var_loss ) testthat::expect_equal(var_loss_result@cp_set, c(700, 1000, 1700)) } ) testthat::test_that( "example custom cost function huber regression", { testthat::skip_if_not_installed("mvtnorm") set.seed(1) n <- 400 + 300 + 500 p <- 5 x <- mvtnorm::rmvnorm(n, mean = rep(0, p), sigma = diag(p)) theta <- rbind( mvtnorm::rmvnorm(1, mean = rep(0, p - 3), sigma = diag(p - 3)), mvtnorm::rmvnorm(1, mean = rep(5, p - 3), sigma = diag(p - 3)), mvtnorm::rmvnorm(1, mean = rep(9, p - 3), sigma = diag(p - 3)) ) theta <- cbind(theta, matrix(0, 3, 3)) theta <- theta[rep(seq_len(3), c(400, 300, 500)), ] y_true <- rowSums(x * theta) factor <- c( 2 * stats::rbinom(400, size = 1, prob = 0.95) - 1, 2 * stats::rbinom(300, size = 1, prob = 0.95) - 1, 2 * stats::rbinom(500, size = 1, prob = 0.95) - 1 ) y <- factor * y_true + stats::rnorm(n) data <- cbind.data.frame(y, x) huber_threshold <- 1 huber_loss <- function(data, theta) { residual <- data[, 1] - data[, -1, drop = FALSE] %*% theta indicator <- abs(residual) <= huber_threshold sum( residual^2 / 2 * indicator + huber_threshold * ( abs(residual) - huber_threshold / 2 ) * (1 - indicator) ) } huber_loss_gradient <- function(data, theta) { residual <- c(data[nrow(data), 1] - data[nrow(data), -1] %*% theta) if (abs(residual) <= huber_threshold) { -residual * data[nrow(data), -1] } else { -huber_threshold * sign(residual) * data[nrow(data), -1] } } huber_loss_hessian <- function(data, theta) { residual <- c(data[nrow(data), 1] - data[nrow(data), -1] %*% theta) if (abs(residual) <= huber_threshold) { outer(data[nrow(data), -1], data[nrow(data), -1]) } else { 0.01 * diag(length(theta)) } } huber_regression_result <- fastcpd( formula = y ~ . - 1, data = data, beta = (p + 1) * log(n) / 2, cost = huber_loss, cost_gradient = huber_loss_gradient, cost_hessian = huber_loss_hessian ) testthat::expect_equal(huber_regression_result@cp_set, c(401, 726)) } ) testthat::test_that( "huber regression with 0.1 vanilla", { testthat::skip_if_not_installed("mvtnorm") set.seed(1) n <- 400 + 300 + 500 p <- 5 x <- mvtnorm::rmvnorm(n, mean = rep(0, p), sigma = diag(p)) theta <- rbind( mvtnorm::rmvnorm(1, mean = rep(0, p - 3), sigma = diag(p - 3)), mvtnorm::rmvnorm(1, mean = rep(5, p - 3), sigma = diag(p - 3)), mvtnorm::rmvnorm(1, mean = rep(9, p - 3), sigma = diag(p - 3)) ) theta <- cbind(theta, matrix(0, 3, 3)) theta <- theta[rep(seq_len(3), c(400, 300, 500)), ] y_true <- rowSums(x * theta) factor <- c( 2 * stats::rbinom(400, size = 1, prob = 0.95) - 1, 2 * stats::rbinom(300, size = 1, prob = 0.95) - 1, 2 * stats::rbinom(500, size = 1, prob = 0.95) - 1 ) y <- factor * y_true + stats::rnorm(n) data <- cbind.data.frame(y, x) huber_threshold <- 1 huber_loss <- function(data, theta) { residual <- data[, 1] - data[, -1, drop = FALSE] %*% theta indicator <- abs(residual) <= huber_threshold sum( residual^2 / 2 * indicator + huber_threshold * ( abs(residual) - huber_threshold / 2 ) * (1 - indicator) ) } huber_loss_gradient <- function(data, theta) { residual <- c(data[nrow(data), 1] - data[nrow(data), -1] %*% theta) if (abs(residual) <= huber_threshold) { -residual * data[nrow(data), -1] } else { -huber_threshold * sign(residual) * data[nrow(data), -1] } } huber_loss_hessian <- function(data, theta) { residual <- c(data[nrow(data), 1] - data[nrow(data), -1] %*% theta) if (abs(residual) <= huber_threshold) { outer(data[nrow(data), -1], data[nrow(data), -1]) } else { 0.01 * diag(length(theta)) } } huber_regression_result <- fastcpd( formula = y ~ . - 1, data = data, beta = (p + 1) * log(n) / 2, cost = huber_loss, cost_gradient = huber_loss_gradient, cost_hessian = huber_loss_hessian, vanilla_percentage = 0.1 ) testthat::expect_equal(huber_regression_result@cp_set, c(401, 726)) } ) testthat::test_that( "example ar(1) model", { set.seed(1) n <- 1000 p <- 1 x <- rep(0, n + 1) for (i in 1:600) { x[i + 1] <- 0.6 * x[i] + rnorm(1) } for (i in 601:1000) { x[i + 1] <- 0.3 * x[i] + rnorm(1) } result <- fastcpd( formula = ~ . - 1, data = data.frame(x = x), p = 1, family = "ar" ) testthat::expect_equal(result@cp_set, 609) } ) testthat::test_that( "example ar(3) model with innovation standard deviation 3", { set.seed(1) n <- 1000 p <- 1 x <- rep(0, n + 3) for (i in 1:600) { x[i + 3] <- 0.6 * x[i + 2] - 0.2 * x[i + 1] + 0.1 * x[i] + rnorm(1, 0, 3) } for (i in 601:1000) { x[i + 1] <- 0.3 * x[i + 2] + 0.4 * x[i + 1] + 0.2 * x[i] + rnorm(1, 0, 3) } result <- fastcpd( formula = ~ . - 1, data = data.frame(x = x), p = 3, family = "ar" ) testthat::expect_equal(result@cp_set, 615) } ) testthat::test_that( "example var(2) model with p = 2 with innovation sd = 1", { set.seed(1) n <- 1000 p <- 2 theta_1 <- matrix(c(-0.3, 0.6, -0.5, 0.4, 0.2, 0.1, 0.1, -0.2), nrow = p) theta_2 <- matrix(c(-0.1, -0.2, 0.1, 0.05, 0.1, -0.2, -0.2, 0.1), nrow = p) x <- matrix(0, n + 2, p) for (i in 1:600) { x[i + 2, ] <- theta_1 %*% c(x[i + 1, ], x[i, ]) + rnorm(p, 0, 1) } for (i in 601:1000) { x[i + 2, ] <- theta_2 %*% c(x[i + 1, ], x[i, ]) + rnorm(p, 0, 1) } data <- matrix(NA, n, p * 3) data[, seq_len(p)] <- x[p + seq_len(n), ] for (p_i in seq_len(p)) { data[, p + (p * p_i - 1):(p * p_i)] <- x[p - p_i + seq_len(n), ] } multi_response_linear_loss <- function(data) { x <- data[, (ncol(data) - p + 1):ncol(data)] y <- data[, 1:(ncol(data) - p)] if (nrow(data) <= p + 1) { x_t_x <- diag(p) } else { x_t_x <- crossprod(x) } norm(y - x %*% solve(x_t_x, t(x)) %*% y, type = "F")^2 / 2 } result <- fastcpd( formula = y.1 + y.2 ~ x.1 + x.2 + x.3 + x.4 - 1, data = data.frame(y = data[, 1:2], x = data[, 3:6]), beta = (2 * 4 + 1) * log(n) / 2, p = 4, cost = multi_response_linear_loss ) testthat::expect_equal(result@cp_set, 612) result_var <- fastcpd( formula = ~ . - 1, data = data.frame(x = x), beta = (2 * 4 + 1) * log(n) / 2, p = 2, family = "var" ) testthat::expect_equal(result_var@cp_set, 611) } ) testthat::test_that( "ar(1) model", { set.seed(1) n <- 1000 p <- 1 time_series <- rep(0, n + 1) for (i in 1:600) { time_series[i + 1] <- 0.6 * time_series[i] + rnorm(1) } for (i in 601:1000) { time_series[i + 1] <- 0.3 * time_series[i] + rnorm(1) } ar1_loss <- function(data) { n <- nrow(data) optim_result <- optim( par = 0, fn = function(data, theta) { n <- nrow(data) unconditional_sum_of_squares <- (1 - theta^2) * data[1]^2 + sum( (data[2:n] - theta * data[1:(n - 1)])^2 ) log(unconditional_sum_of_squares) - log(1 - theta^2) / n }, method = "Brent", data = data, lower = -0.999, upper = 0.999 ) n / 2 * (optim_result$value - log(n) + log(2 * pi) + 1) } result <- fastcpd( formula = ~ . - 1, data = data.frame(x = time_series[-1]), beta = (2 * p + 1) * log(n) / 2, p = 2 * p, cost = ar1_loss ) testthat::expect_equal(result@cp_set, 614) } ) testthat::test_that( "ar(1) model with `forecast::Arima` calculation of ML", { testthat::skip_if_not_installed("forecast") set.seed(1) n <- 400 p <- 1 time_series <- rep(0, n + 1) for (i in 1:200) { time_series[i + 1] <- 0.6 * time_series[i] + rnorm(1) } for (i in 201:400) { time_series[i + 1] <- 0.3 * time_series[i] + rnorm(1) } ar1_loss <- function(data) { tryCatch( expr = -forecast::Arima( c(data), order = c(1, 0, 0), include.mean = FALSE, method = "ML" )$loglik, error = function(e) { 0 } ) } result <- fastcpd( formula = ~ . - 1, data = data.frame(x = time_series[-1]), beta = (2 * p + 1) * log(n) / 2, p = p, cost = ar1_loss ) testthat::expect_equal(result@cp_set, 178) } ) testthat::test_that( "var(1) model with p = 2", { set.seed(1) n <- 1000 p <- 2 theta_0 <- array(NA, dim = c(p, p, 2)) theta_0[, , 1] <- cbind(c(-0.3, 0.6), c(-0.4, 0.5)) theta_0[, , 2] <- cbind(c(-0.1, -0.2), c(0.1, 0.05)) x <- matrix(0, n + 1, p) for (i in 1:600) { x[i + 1, ] <- theta_0[, , 1] %*% x[i, ] + rnorm(p) } for (i in 601:1000) { x[i + 1, ] <- theta_0[, , 2] %*% x[i, ] + rnorm(p) } y <- x[3:1001, ] x <- x[2:1000, ] multi_response_linear_loss <- function(data) { x <- data[, (ncol(data) - p + 1):ncol(data)] y <- data[, 1:(ncol(data) - p)] if (nrow(data) <= p) { x_t_x <- diag(p) } else { x_t_x <- crossprod(x) } norm(y - x %*% solve(x_t_x, t(x)) %*% y, type = "F")^2 / 2 } result <- fastcpd( formula = y ~ x - 1, data = data.frame(y = y, x = x), beta = (2 * p + 1) * log(n) / 2, cost = multi_response_linear_loss ) testthat::expect_equal(result@cp_set, 593) } )