build_data_set <- function(n, seed = 0) { withr::local_seed(seed) blink <- stats::binomial() covariate <- data.frame(y = runif(n)) coeffs <- matrix(c(-1, 2, 1, -2), ncol = 2) scores <- cbind(1, covariate$y) %*% coeffs probs <- apply(scores, 1, blink$linkinv) x <- rep(0, n) x[1] <- sample(c(0, 1), 1) for (k in 2:n) { prob <- probs[x[k - 1] + 1, k - 1] x[k] <- sample(c(0, 1), 1, prob = c(1 - prob, prob)) } list(x = x, covariate = covariate) } build_data_set_2 <- function(seed = 0) { withr::local_seed(seed) y <- as.factor(rep(rep(1:4, each = 50), 2)) z <- runif(length(y)) dts_cov <- data.frame(y = y, z = z) cov_dep <- matrix(runif(length(levels(y)) * 4, min = -1, max = 1), nrow = length(levels(y))) cov_dep[cov_dep > 0.5] <- 1 cov_dep[cov_dep < -0.5] <- -1 cov_dep[cov_dep <= 0.5 & cov_dep >= -0.5] <- 0 x <- rep(0, length(y)) x[2] <- sample(c(0, 1), 1) probs <- rep(NA, length(y)) for (k in 3:length(y)) { probs[k] <- 1 / (1 + exp(-sum(cov_dep[as.integer(y[k - 1]), ] * c(1, x[k - 1], z[c(k - 1, k - 2)])))) x[k] <- ifelse(runif(1) <= probs[k], 1, 0) } x <- as.factor(x) list(x = x, covariate = dts_cov) } build_data_set_3_model <- function(n, seed = 0, alpha = 0.1) { withr::local_seed(seed) x3 <- sample(c("A", "B", "C"), n, replace = TRUE) y3 <- ifelse(runif(length(x3)) > 0.5, c(x3[-1], sample(c("A", "B", "C"), 1)), c(x3[-c(1, 2)], sample(c("A", "B", "C"), 2, replace = TRUE))) y3 <- as.factor(ifelse(runif(length(x3)) > 0.2, y3, sample(c("A", "B", "C"), n, replace = TRUE))) z3 <- data.table::fcase( x3[-1] == "A", runif(length(x3) - 1), x3[-1] == "B", 0.5 + runif(length(x3) - 1), x3[-1] == "C", -0.5 + runif(length(x3) - 1) ) z3 <- c(0, z3) df_full <- data.frame(y = y3, z = z3) model <- covlmc(x3, df_full, max_depth = 5, min_size = 5, alpha = alpha) list(model = model, dts = x3, cov = df_full) } extract_p_value <- function(tree) { recurse_extract_p_value <- function(tree) { if (is.null(tree[["children"]])) { if (is.null(tree[["model"]])) { # nothing there (should not happen) NULL } else { p_value <- NA if (!is.null(tree$model[["p_value"]])) { p_value <- tree$model[["p_value"]] } data.frame(p_value = p_value, nb_coeffs = length(tree$model[["coefficients"]])) } } else { df <- NULL for (v in seq_along(tree[["children"]])) { sub_p <- recurse_extract_p_value(tree$children[[v]]) if (!is.null(sub_p)) { df <- rbind(df, sub_p) } } if (!is.null(tree[["merged_model"]])) { if (!is.na(tree[["merged_model"]]$p_value)) { sub_p <- data.frame( p_value = tree[["merged_model"]]$p_value, nb_coeffs = length(tree[["merged_model"]]$coefficients) ) df <- rbind(df, sub_p) } } if (!is.null(tree$p_value)) { sub_p <- data.frame( p_value = tree$p_value, nb_coeffs = NA ) df <- rbind(df, sub_p) } if (!is.null(tree$merged_p_value)) { sub_p <- data.frame( p_value = tree$merged_p_value, nb_coeffs = NA ) df <- rbind(df, sub_p) } df } } recurse_extract_p_value(tree) } compare_covlmc_node <- function(n1, n2) { for (what in c("f_by", "match", "prunable", "vals", "depth", "nb_ctx", "cov_names", "alpha", "merged")) { if (!identical(n1[[what]], n2[[what]])) { return(FALSE) } } if (!is.null(n1[["model"]])) { if (is.null(n2[["model"]])) { return(FALSE) } if (!identical(n1[["model"]][["coefficents"]], n2[["model"]][["coefficents"]])) { return(FALSE) } } if (!is.null(n1[["merged_model"]])) { if (is.null(n2[["merged_model"]])) { return(FALSE) } if (!identical(n1[["merged_model"]][["coefficents"]], n2[["merged_model"]][["coefficents"]])) { return(FALSE) } } if (!is.null(n1[["extended_model"]])) { if (is.null(n2[["extended_model"]])) { return(FALSE) } if (!identical(n1[["extended_model"]][["coefficents"]], n2[["extended_model"]][["coefficents"]])) { return(FALSE) } } TRUE } compare_covlmc <- function(m1, m2) { rec_compare_covlmc <- function(m1, m2) { if (!compare_covlmc_node(m1, m2)) { return(FALSE) } if (is.null(m1[["children"]])) { if (!is.null(m2[["children"]])) { return(FALSE) } return(TRUE) } else { if (is.null(m2[["children"]])) { return(FALSE) } if (length(m1[["children"]]) != length(m2[["children"]])) { return(FALSE) } for (v in seq_along(m1[["children"]])) { if (!rec_compare_covlmc(m1[["children"]][[v]], m2[["children"]][[v]])) { return(FALSE) } } return(TRUE) } } rec_compare_covlmc(m1, m2) } build_degenerate_elec_model <- function(with_new_data = FALSE) { pc_week_15_16 <- powerconsumption[powerconsumption$week %in% c(15, 16), ] elec <- pc_week_15_16$active_power elec_dts <- cut(elec, breaks = c(0, 0.4, 2, 8), labels = c("low", "typical", "high")) elec_cov <- data.frame(day = (pc_week_15_16$hour >= 7 & pc_week_15_16$hour <= 18)) elec_tune <- tune_covlmc(elec_dts, elec_cov, min_size = 5) elec_model <- prune(as_covlmc(elec_tune), alpha = 3.961e-10) result <- list(model = elec_model, dts = elec_dts, cov = elec_cov) if (with_new_data) { pc_week_17_18 <- powerconsumption[powerconsumption$week %in% c(17, 18), ] elec_new_dts <- cut(pc_week_17_18$active_power, breaks = c(0, 0.4, 2, 8), labels = c("low", "typical", "high")) elec_new_cov <- data.frame(day = (pc_week_17_18$hour >= 7 & pc_week_17_18$hour <= 18)) result$new_cov <- elec_new_cov result$new_dts <- elec_new_dts } result } ## same calculations as in slow_loglikelihood but for COVLMC co_slow_loglikelihood <- function(model, newdata, initial = c("truncated", "specific", "extended"), ignore, newcov, verbose = FALSE) { initial <- match.arg(initial) if (missing(ignore)) { if (initial == "truncated") { ignore <- depth(model) } else { ignore <- 0L } } else if (initial == "truncated" && ignore < depth(model)) { stop("Must ignore at least ", depth(model), " for truncated likelihood") } nx <- to_dts(newdata, model$vals) if (length(model$vals) > 2) { newdata <- nx$fx } else { newdata <- nx$ix } x <- nx$ix + 1 ctx <- c() max_depth <- depth(model) if (max_depth == 0) { ## fixed model (and no covariate!) glmdata <- prepare_glm(newcov, model$match, 0, newdata, 0) result <- glm_likelihood(model$model$model, glmdata$local_mm, glmdata$target) } else { result <- 0 discarded <- 0 for (i in seq_along(newdata)) { subtree <- match_context_co(model, ctx) the_model <- NULL to_keep <- (initial != "specific" && i > ignore) || (initial == "specific" && i > max(max_depth, ignore)) ## do we have a true match or an extended one? if (i >= max_depth + 1) { ## true match if (subtree$merged) { if (verbose) { cat("merged model\n") } the_model <- subtree$tree$merged_model } else if (!is.null(subtree$tree[["model"]])) { the_model <- subtree$tree$model } else { stop("No match???") } } else { ## extended match if (subtree$merged) { if (verbose) { cat("merged model\n") } the_model <- subtree$tree$merged_model } else if (!is.null(subtree$tree[["model"]])) { if (verbose) { cat("extended use of context\n") } the_model <- subtree$tree$model } else if (!is.null(subtree$tree[["extended_model"]])) { if (verbose) { cat("extended context\n") } the_model <- subtree$tree$extended_model } else { stop("No extended match???") } } mm <- prepare_covariate(newcov, i - subtree$depth - 1, d = the_model$hsize, from = subtree$depth - the_model$hsize ) ll <- glm_likelihood(the_model$model, mm, newdata[i]) if (verbose) { cat( "[", paste(rev(model$vals[ctx]), collapse = "-"), "] ->", paste(model$vals[newdata[i]]), ll, the_model$coefficients, "\n" ) } if (to_keep) { result <- result + ll } else { discarded <- discarded + ll } ## ctx <- rev(x[(i-max_depth+1):i]) j <- max(i - max_depth + 1, 1) ctx <- x[i:j] # print(paste(i,j)) } result <- as.numeric(result) } if (verbose) { print(as.numeric(discarded)) } attr(result, "nobs") <- max(0, length(x) - ignore) if (initial == "truncated") { attr(result, "df") <- count_parameters(model, FALSE) } else if (initial == "specific") { attr(result, "df") <- count_parameters(model, FALSE) + max_depth } else { attr(result, "df") <- count_parameters(model, TRUE) } result } create_demo_covlmc <- function() { withr::local_seed(0) x3 <- sample(c("A", "B", "C"), 1000, replace = TRUE) y3 <- ifelse(runif(length(x3)) > 0.5, c(x3[-1], sample(c("A", "B", "C"), 1)), c(x3[-c(1, 2)], sample(c("A", "B", "C"), 2, replace = TRUE))) y3 <- as.factor(ifelse(runif(length(x3)) > 0.2, y3, sample(c("A", "B", "C"), 1000, replace = TRUE))) z3 <- data.table::fcase( x3[-1] == "A", runif(length(x3) - 1), x3[-1] == "B", 0.5 + runif(length(x3) - 1), x3[-1] == "C", -0.5 + runif(length(x3) - 1) ) z3 <- c(0, z3) df_y3 <- data.frame(y = y3) x3_covlmc <- covlmc(x3, df_y3, max_depth = 5, min_size = 5, alpha = 0.01) m_cuts <- cutoff(x3_covlmc) m_current <- x3_covlmc for (k in seq_along(m_cuts[1:4])) { m_current <- prune(m_current, m_cuts[k]) } list(model = m_current, dts = x3, cov = df_y3, full_model = x3_covlmc) } create_merged_dataset <- function(n = 1000, seed = 0) { withr::local_seed(seed) y <- rnorm(n) z <- sample(c("K", "L", "M"), n, replace = TRUE, prob = sample(1:4, 3, replace = TRUE)) cov <- data.frame(y = y, z = z) mcov <- model.matrix(~ y + z, cov) alpha_1 <- matrix(rnorm(2 * ncol(mcov)), ncol = 2) alpha_2 <- matrix(rnorm(2 * ncol(mcov)), ncol = 2) prob_1 <- t(apply(mcov %*% alpha_1, 1, \(x) c(1, exp(x)) / sum(exp(x) + 1))) prob_2 <- t(apply(mcov %*% alpha_2, 1, \(x) c(1, exp(x)) / sum(exp(x) + 1))) res <- rep(sample(1:3, 1), n) for (k in 2:n) { if (res[k - 1] == 1) { res[k] <- sample(1:3, 1, prob = prob_1[k - 1, ]) } else { res[k] <- sample(1:3, 1, prob = prob_2[k - 1, ]) } } list(dts = x, cov = cov) }