R Under development (unstable) (2024-08-16 r87026 ucrt) -- "Unsuffered Consequences" Copyright (C) 2024 The R Foundation for Statistical Computing Platform: x86_64-w64-mingw32/x64 R is free software and comes with ABSOLUTELY NO WARRANTY. You are welcome to redistribute it under certain conditions. Type 'license()' or 'licence()' for distribution details. R is a collaborative project with many contributors. Type 'contributors()' for more information and 'citation()' on how to cite R or R packages in publications. Type 'demo()' for some demos, 'help()' for on-line help, or 'help.start()' for an HTML browser interface to help. Type 'q()' to quit R. > suppressWarnings(RNGversion("3.5.2")) > > library("partykit") Loading required package: grid Loading required package: libcoin Loading required package: mvtnorm > > set.seed(29) > n <- 1000 > x <- runif(n) > z <- runif(n) > y <- rnorm(n, mean = x * c(-1, 1)[(z > 0.7) + 1], sd = 3) > z_noise <- factor(sample(1:3, size = n, replace = TRUE)) > d <- data.frame(y = y, x = x, z = z, z_noise = z_noise) > > > fmla <- as.formula("y ~ x | z + z_noise") > fmly <- gaussian() > fit <- partykit:::glmfit > > # versions of the data > d1 <- d > d1$z <- signif(d1$z, digits = 1) > > k <- 20 > zs_noise <- matrix(rnorm(n*k), nrow = n) > colnames(zs_noise) <- paste0("z_noise_", 1:k) > d2 <- cbind(d, zs_noise) > fmla2 <- as.formula(paste("y ~ x | z + z_noise +", + paste0("z_noise_", 1:k, collapse = " + "))) > > > d3 <- d2 > d3$z <- factor(sample(1:3, size = n, replace = TRUE, prob = c(0.1, 0.5, 0.4))) > d3$y <- rnorm(n, mean = x * c(-1, 1)[(d3$z == 2) + 1], sd = 3) > > ## check weights > w <- rep(1, n) > w[1:10] <- 2 > (mw1 <- glmtree(formula = fmla, data = d, weights = w)) Generalized linear model tree (family: gaussian) Model formula: y ~ x | z + z_noise Fitted party: [1] root | [2] z <= 0.70311: n = 706 | (Intercept) x | -0.1447422 -0.8138701 | [3] z > 0.70311: n = 304 | (Intercept) x | 0.07006626 0.73278593 Number of inner nodes: 1 Number of terminal nodes: 2 Number of parameters per node: 2 Objective function (negative log-likelihood): 2551.48 > (mw2 <- glmtree(formula = fmla, data = d, weights = w, caseweights = FALSE)) Generalized linear model tree (family: gaussian) Model formula: y ~ x | z + z_noise Fitted party: [1] root | [2] z <= 0.70311: n = 704 | (Intercept) x | -0.1447422 -0.8138701 | [3] z > 0.70311: n = 296 | (Intercept) x | 0.07006626 0.73278593 Number of inner nodes: 1 Number of terminal nodes: 2 Number of parameters per node: 2 Objective function (negative log-likelihood): 2551.48 > > > > ## check dfsplit > (mmfluc2 <- mob(formula = fmla, data = d, fit = partykit:::glmfit)) Model-based recursive partitioning (partykit:::glmfit) Model formula: y ~ x | z + z_noise Fitted party: [1] root | [2] z <= 0.70311: n = 704 | (Intercept) x | -0.1619978 -0.7896293 | [3] z > 0.70311: n = 296 | (Intercept) x | 0.08683535 0.65598287 Number of inner nodes: 1 Number of terminal nodes: 2 Number of parameters per node: 2 Objective function: 2551.673 > (mmfluc3 <- glmtree(formula = fmla, data = d)) Generalized linear model tree (family: gaussian) Model formula: y ~ x | z + z_noise Fitted party: [1] root | [2] z <= 0.70311: n = 704 | (Intercept) x | -0.1619978 -0.7896293 | [3] z > 0.70311: n = 296 | (Intercept) x | 0.08683535 0.65598287 Number of inner nodes: 1 Number of terminal nodes: 2 Number of parameters per node: 2 Objective function (negative log-likelihood): 2551.673 > (mmfluc3_dfsplit <- glmtree(formula = fmla, data = d, dfsplit = 10)) Generalized linear model tree (family: gaussian) Model formula: y ~ x | z + z_noise Fitted party: [1] root | [2] z <= 0.70311: n = 704 | (Intercept) x | -0.1619978 -0.7896293 | [3] z > 0.70311: n = 296 | (Intercept) x | 0.08683535 0.65598287 Number of inner nodes: 1 Number of terminal nodes: 2 Number of parameters per node: 2 Objective function (negative log-likelihood): 2551.673 > > > ## check tests > if (require("strucchange")) + print(sctest(mmfluc3, node = 1)) # does not yet work Loading required package: strucchange Loading required package: zoo Attaching package: 'zoo' The following objects are masked from 'package:base': as.Date, as.Date.numeric Loading required package: sandwich z z_noise statistic 2.292499e+01 0.6165335 p.value 7.780038e-04 0.9984952 > > x <- mmfluc3 > (tst3 <- nodeapply(x, ids = nodeids(x), function(n) n$info$criterion)) $`1` NULL $`2` NULL $`3` NULL > > > > > ## check logLik and AIC > logLik(mmfluc2) 'log Lik.' -2551.673 (df=7) > logLik(mmfluc3) 'log Lik.' -2551.673 (df=7) > logLik(mmfluc3_dfsplit) 'log Lik.' -2551.673 (df=16) > logLik(glm(y ~ x, data = d)) 'log Lik.' -2563.694 (df=3) > > AIC(mmfluc3) [1] 5117.347 > AIC(mmfluc3_dfsplit) [1] 5135.347 > > ## check pruning > pr2 <- prune.modelparty(mmfluc2) > AIC(mmfluc2) [1] 5117.347 > AIC(pr2) [1] 5117.347 > > mmfluc_dfsplit3 <- glmtree(formula = fmla, data = d, alpha = 0.5, dfsplit = 3) > mmfluc_dfsplit4 <- glmtree(formula = fmla, data = d, alpha = 0.5, dfsplit = 4) > pr_dfsplit3 <- prune.modelparty(mmfluc_dfsplit3) > pr_dfsplit4 <- prune.modelparty(mmfluc_dfsplit4) > AIC(mmfluc_dfsplit3) [1] 5142.774 > AIC(mmfluc_dfsplit4) [1] 5156.774 > AIC(pr_dfsplit3) [1] 5142.774 > AIC(pr_dfsplit4) [1] 5124.456 > > width(mmfluc_dfsplit3) [1] 8 > width(mmfluc_dfsplit4) [1] 8 > width(pr_dfsplit3) [1] 8 > width(pr_dfsplit4) [1] 3 > > ## check inner and terminal > options <- list(NULL, + "object", + "estfun", + c("object", "estfun")) > > arguments <- list("inner", + "terminal", + c("inner", "terminal")) > > > for (o in options) { + print(o) + x <- glmtree(formula = fmla, data = d, inner = o) + str(nodeapply(x, ids = nodeids(x), function(n) n$info[c("object", "estfun")]), 2) + } NULL List of 3 $ 1:List of 2 ..$ NA: NULL ..$ NA: NULL $ 2:List of 2 ..$ object:List of 24 .. ..- attr(*, "class")= chr [1:2] "glm" "lm" ..$ NA : NULL $ 3:List of 2 ..$ object:List of 24 .. ..- attr(*, "class")= chr [1:2] "glm" "lm" ..$ NA : NULL [1] "object" List of 3 $ 1:List of 2 ..$ object:List of 24 .. ..- attr(*, "class")= chr [1:2] "glm" "lm" ..$ NA : NULL $ 2:List of 2 ..$ object:List of 24 .. ..- attr(*, "class")= chr [1:2] "glm" "lm" ..$ NA : NULL $ 3:List of 2 ..$ object:List of 24 .. ..- attr(*, "class")= chr [1:2] "glm" "lm" ..$ NA : NULL [1] "estfun" List of 3 $ 1:List of 2 ..$ NA : NULL ..$ estfun: num [1:1000, 1:2] -0.1375 -0.0583 -0.0553 0.1043 -0.0744 ... .. ..- attr(*, "dimnames")=List of 2 $ 2:List of 2 ..$ object:List of 24 .. ..- attr(*, "class")= chr [1:2] "glm" "lm" ..$ NA : NULL $ 3:List of 2 ..$ object:List of 24 .. ..- attr(*, "class")= chr [1:2] "glm" "lm" ..$ NA : NULL [1] "object" "estfun" List of 3 $ 1:List of 2 ..$ object:List of 24 .. ..- attr(*, "class")= chr [1:2] "glm" "lm" ..$ estfun: num [1:1000, 1:2] -0.1375 -0.0583 -0.0553 0.1043 -0.0744 ... .. ..- attr(*, "dimnames")=List of 2 $ 2:List of 2 ..$ object:List of 24 .. ..- attr(*, "class")= chr [1:2] "glm" "lm" ..$ NA : NULL $ 3:List of 2 ..$ object:List of 24 .. ..- attr(*, "class")= chr [1:2] "glm" "lm" ..$ NA : NULL > > for (o in options) { + print(o) + x <- glmtree(formula = fmla, data = d, terminal = o) + str(nodeapply(x, ids = nodeids(x), function(n) n$info[c("object", "estfun")]), 2) + } NULL List of 3 $ 1:List of 2 ..$ NA: NULL ..$ NA: NULL $ 2:List of 2 ..$ NA: NULL ..$ NA: NULL $ 3:List of 2 ..$ NA: NULL ..$ NA: NULL [1] "object" List of 3 $ 1:List of 2 ..$ object:List of 24 .. ..- attr(*, "class")= chr [1:2] "glm" "lm" ..$ NA : NULL $ 2:List of 2 ..$ object:List of 24 .. ..- attr(*, "class")= chr [1:2] "glm" "lm" ..$ NA : NULL $ 3:List of 2 ..$ object:List of 24 .. ..- attr(*, "class")= chr [1:2] "glm" "lm" ..$ NA : NULL [1] "estfun" List of 3 $ 1:List of 2 ..$ NA : NULL ..$ estfun: num [1:1000, 1:2] -0.1375 -0.0583 -0.0553 0.1043 -0.0744 ... .. ..- attr(*, "dimnames")=List of 2 $ 2:List of 2 ..$ NA : NULL ..$ estfun: num [1:704, 1:2] -0.1291 0.5104 -0.0603 -0.1868 -0.0981 ... .. ..- attr(*, "dimnames")=List of 2 $ 3:List of 2 ..$ NA : NULL ..$ estfun: num [1:296, 1:2] -0.1053 -0.0877 0.0544 -0.1581 0.43 ... .. ..- attr(*, "dimnames")=List of 2 [1] "object" "estfun" List of 3 $ 1:List of 2 ..$ object:List of 24 .. ..- attr(*, "class")= chr [1:2] "glm" "lm" ..$ estfun: num [1:1000, 1:2] -0.1375 -0.0583 -0.0553 0.1043 -0.0744 ... .. ..- attr(*, "dimnames")=List of 2 $ 2:List of 2 ..$ object:List of 24 .. ..- attr(*, "class")= chr [1:2] "glm" "lm" ..$ estfun: num [1:704, 1:2] -0.1291 0.5104 -0.0603 -0.1868 -0.0981 ... .. ..- attr(*, "dimnames")=List of 2 $ 3:List of 2 ..$ object:List of 24 .. ..- attr(*, "class")= chr [1:2] "glm" "lm" ..$ estfun: num [1:296, 1:2] -0.1053 -0.0877 0.0544 -0.1581 0.43 ... .. ..- attr(*, "dimnames")=List of 2 > > > ## check model > m_mt <- glmtree(formula = fmla, data = d, model = TRUE) > m_mf <- glmtree(formula = fmla, data = d, model = FALSE) > > dim(m_mt$data) [1] 1000 4 > dim(m_mf$data) [1] 0 4 > > > ## check multiway > (m_mult <- glmtree(formula = fmla2, data = d3, catsplit = "multiway", minsize = 80)) Generalized linear model tree (family: gaussian) Model formula: y ~ x | z + z_noise + z_noise_1 + z_noise_2 + z_noise_3 + z_noise_4 + z_noise_5 + z_noise_6 + z_noise_7 + z_noise_8 + z_noise_9 + z_noise_10 + z_noise_11 + z_noise_12 + z_noise_13 + z_noise_14 + z_noise_15 + z_noise_16 + z_noise_17 + z_noise_18 + z_noise_19 + z_noise_20 Fitted party: [1] root | [2] z in 1: n = 76 | (Intercept) x | 0.9859847 -3.2600047 | [3] z in 2: n = 537 | (Intercept) x | -0.06970187 1.12305074 | [4] z in 3: n = 387 | (Intercept) x | 0.3824392 -1.8337151 Number of inner nodes: 1 Number of terminal nodes: 3 Number of parameters per node: 2 Objective function (negative log-likelihood): 2511.927 > > > ## check parm > fmla_p <- as.formula("y ~ x + z_noise + z_noise_1 | z + z_noise_2") > (m_interc <- glmtree(formula = fmla_p, data = d2, parm = 1)) Generalized linear model tree (family: gaussian) Model formula: y ~ x + z_noise + z_noise_1 | z + z_noise_2 Fitted party: [1] root | [2] z <= 0.65035: n = 644 | (Intercept) x z_noise2 z_noise3 z_noise_1 | -0.05585503 -1.01257554 0.34044520 -0.16384987 0.24197601 | [3] z > 0.65035: n = 356 | (Intercept) x z_noise2 z_noise3 z_noise_1 | 0.06411865 0.78733976 -0.67811149 -0.14240432 -0.01239154 Number of inner nodes: 1 Number of terminal nodes: 2 Number of parameters per node: 5 Objective function (negative log-likelihood): 2548.32 > > (m_p3 <- glmtree(formula = fmla_p, data = d2, parm = 3)) Generalized linear model tree (family: gaussian) Model formula: y ~ x + z_noise + z_noise_1 | z + z_noise_2 Fitted party: [1] root: n = 1000 (Intercept) x z_noise2 z_noise3 z_noise_1 -0.058855295 -0.340314311 -0.008404682 -0.109839080 0.154798281 Number of inner nodes: 0 Number of terminal nodes: 1 Number of parameters per node: 5 Objective function (negative log-likelihood): 2562.32 > > > ## check trim > (m_tt <- glmtree(formula = fmla, data = d, trim = 0.2)) Generalized linear model tree (family: gaussian) Model formula: y ~ x | z + z_noise Fitted party: [1] root | [2] z <= 0.70311: n = 704 | (Intercept) x | -0.1619978 -0.7896293 | [3] z > 0.70311: n = 296 | (Intercept) x | 0.08683535 0.65598287 Number of inner nodes: 1 Number of terminal nodes: 2 Number of parameters per node: 2 Objective function (negative log-likelihood): 2551.673 > > (m_tf <- glmtree(formula = fmla, data = d, trim = 300, minsize = 300)) Generalized linear model tree (family: gaussian) Model formula: y ~ x | z + z_noise Fitted party: [1] root | [2] z <= 0.6892: n = 691 | (Intercept) x | -0.1778199 -0.7692901 | [3] z > 0.6892: n = 309 | (Intercept) x | 0.1065746 0.5562243 Number of inner nodes: 1 Number of terminal nodes: 2 Number of parameters per node: 2 Objective function (negative log-likelihood): 2552.12 > > > > ## check breakties > m_bt <- glmtree(formula = fmla, data = d1, breakties = TRUE) > m_df <- glmtree(formula = fmla, data = d1, breakties = FALSE) > > all.equal(m_bt, m_df, check.environment = FALSE) [1] "Component \"node\": Component \"kids\": Component 1: Component 5: Component 6: Mean relative difference: 0.1237503" [2] "Component \"node\": Component \"kids\": Component 2: Component 5: Component 5: Mean relative difference: 0.1746109" [3] "Component \"node\": Component \"kids\": Component 2: Component 5: Component 6: Mean relative difference: 0.0443985" [4] "Component \"node\": Component \"info\": Component \"p.value\": Mean relative difference: 1.100407" [5] "Component \"node\": Component \"info\": Component \"test\": Mean relative difference: 0.07721086" [6] "Component \"info\": Component \"call\": target, current do not match when deparsed" [7] "Component \"info\": Component \"control\": Component \"breakties\": 1 element mismatch" > > unclass(m_bt)$node$info$criterion NULL > unclass(m_df)$node$info$criterion NULL > > > ### example from mob vignette > data("PimaIndiansDiabetes", package = "mlbench") > > logit <- function(y, x, start = NULL, weights = NULL, offset = NULL, ...) { + glm(y ~ 0 + x, family = binomial, start = start, ...) + } > > pid_formula <- diabetes ~ glucose | pregnant + pressure + triceps + + insulin + mass + pedigree + age > > pid_tree <- mob(pid_formula, data = PimaIndiansDiabetes, fit = logit) > pid_tree Model-based recursive partitioning (logit) Model formula: diabetes ~ glucose | pregnant + pressure + triceps + insulin + mass + pedigree + age Fitted party: [1] root | [2] mass <= 26.3: n = 167 | x(Intercept) xglucose | -9.95150963 0.05870786 | [3] mass > 26.3 | | [4] age <= 30: n = 304 | | x(Intercept) xglucose | | -6.70558554 0.04683748 | | [5] age > 30: n = 297 | | x(Intercept) xglucose | | -2.77095386 0.02353582 Number of inner nodes: 2 Number of terminal nodes: 3 Number of parameters per node: 2 Objective function: 355.4578 > nodeapply(pid_tree, ids = nodeids(pid_tree), function(n) n$info$criterion) $`1` NULL $`2` NULL $`3` NULL $`4` NULL $`5` NULL > > > > > proc.time() user system elapsed 20.98 5.51 26.48