R Under development (unstable) (2024-07-17 r86903 ucrt) -- "Unsuffered Consequences" Copyright (C) 2024 The R Foundation for Statistical Computing Platform: x86_64-w64-mingw32/x64 R is free software and comes with ABSOLUTELY NO WARRANTY. You are welcome to redistribute it under certain conditions. Type 'license()' or 'licence()' for distribution details. R is a collaborative project with many contributors. Type 'contributors()' for more information and 'citation()' on how to cite R or R packages in publications. Type 'demo()' for some demos, 'help()' for on-line help, or 'help.start()' for an HTML browser interface to help. Type 'q()' to quit R. > ### R code from vignette source 'constparty.Rnw' > > ### test here after removal of RWeka dependent code > > ################################################### > ### code chunk number 1: setup > ################################################### > options(width = 70) > library("partykit") Loading required package: grid Loading required package: libcoin Loading required package: mvtnorm > set.seed(290875) > > > ################################################### > ### code chunk number 2: Titanic > ################################################### > data("Titanic", package = "datasets") > ttnc <- as.data.frame(Titanic) > ttnc <- ttnc[rep(1:nrow(ttnc), ttnc$Freq), 1:4] > names(ttnc)[2] <- "Gender" > > > ################################################### > ### code chunk number 3: rpart > ################################################### > library("rpart") > (rp <- rpart(Survived ~ ., data = ttnc, model = TRUE)) n= 2201 node), split, n, loss, yval, (yprob) * denotes terminal node 1) root 2201 711 No (0.6769650 0.3230350) 2) Gender=Male 1731 367 No (0.7879838 0.2120162) 4) Age=Adult 1667 338 No (0.7972406 0.2027594) * 5) Age=Child 64 29 No (0.5468750 0.4531250) 10) Class=3rd 48 13 No (0.7291667 0.2708333) * 11) Class=1st,2nd 16 0 Yes (0.0000000 1.0000000) * 3) Gender=Female 470 126 Yes (0.2680851 0.7319149) 6) Class=3rd 196 90 No (0.5408163 0.4591837) * 7) Class=1st,2nd,Crew 274 20 Yes (0.0729927 0.9270073) * > > > ################################################### > ### code chunk number 4: rpart-party > ################################################### > (party_rp <- as.party(rp)) Model formula: Survived ~ Class + Gender + Age Fitted party: [1] root | [2] Gender in Male | | [3] Age in Adult: No (n = 1667, err = 20.3%) | | [4] Age in Child | | | [5] Class in 3rd: No (n = 48, err = 27.1%) | | | [6] Class in 1st, 2nd: Yes (n = 16, err = 0.0%) | [7] Gender in Female | | [8] Class in 3rd: No (n = 196, err = 45.9%) | | [9] Class in 1st, 2nd, Crew: Yes (n = 274, err = 7.3%) Number of inner nodes: 4 Number of terminal nodes: 5 > > > ################################################### > ### code chunk number 5: rpart-plot-orig > ################################################### > plot(rp) > text(rp) > > > ################################################### > ### code chunk number 6: rpart-plot > ################################################### > plot(party_rp) > > > ################################################### > ### code chunk number 7: rpart-pred > ################################################### > all.equal(predict(rp), predict(party_rp, type = "prob"), + check.attributes = FALSE) [1] TRUE > > > ################################################### > ### code chunk number 8: rpart-fitted > ################################################### > str(fitted(party_rp)) 'data.frame': 2201 obs. of 2 variables: $ (fitted) : int 5 5 5 5 5 5 5 5 5 5 ... $ (response): Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ... > > > ################################################### > ### code chunk number 9: rpart-prob > ################################################### > prop.table(do.call("table", fitted(party_rp)), 1) (response) (fitted) No Yes 3 0.7972406 0.2027594 5 0.7291667 0.2708333 6 0.0000000 1.0000000 8 0.5408163 0.4591837 9 0.0729927 0.9270073 > > > ################################################### > ### code chunk number 10: J48 > ################################################### > #if (require("RWeka")) { > # j48 <- J48(Survived ~ ., data = ttnc) > #} else { > # j48 <- rpart(Survived ~ ., data = ttnc) > #} > #print(j48) > # > # > #################################################### > #### code chunk number 11: J48-party > #################################################### > #(party_j48 <- as.party(j48)) > # > # > #################################################### > #### code chunk number 12: J48-plot > #################################################### > #plot(party_j48) > # > # > #################################################### > #### code chunk number 13: J48-pred > #################################################### > #all.equal(predict(j48, type = "prob"), predict(party_j48, type = "prob"), > # check.attributes = FALSE) > > > ################################################### > ### code chunk number 14: PMML-Titantic > ################################################### > ttnc_pmml <- file.path(system.file("pmml", package = "partykit"), + "ttnc.pmml") > (ttnc_quest <- pmmlTreeModel(ttnc_pmml)) Loading required namespace: XML Model formula: Survived ~ Gender + Class + Age Fitted party: [1] root | [2] Gender in Female | | [3] Class in 3rd, Crew: Yes (n = 219, err = 49.8%) | | [4] Class in 1st, 2nd | | | [5] Class in 2nd: Yes (n = 106, err = 12.3%) | | | [6] Class in 1st: Yes (n = 145, err = 2.8%) | [7] Gender in Male | | [8] Class in 3rd, 2nd, Crew | | | [9] Age in Child: No (n = 59, err = 40.7%) | | | [10] Age in Adult | | | | [11] Class in 3rd, Crew | | | | | [12] Class in Crew: No (n = 862, err = 22.3%) | | | | | [13] Class in 3rd: No (n = 462, err = 16.2%) | | | | [14] Class in 2nd: No (n = 168, err = 8.3%) | | [15] Class in 1st: No (n = 180, err = 34.4%) Number of inner nodes: 7 Number of terminal nodes: 8 > > > ################################################### > ### code chunk number 15: PMML-Titanic-plot1 > ################################################### > plot(ttnc_quest) > > > ################################################### > ### code chunk number 16: ttnc2-reorder > ################################################### > ttnc2 <- ttnc[, names(ttnc_quest$data)] > for(n in names(ttnc2)) { + if(is.factor(ttnc2[[n]])) ttnc2[[n]] <- factor( + ttnc2[[n]], levels = levels(ttnc_quest$data[[n]])) + } > > > ################################################### > ### code chunk number 17: PMML-Titanic-augmentation > ################################################### > ttnc_quest2 <- party(ttnc_quest$node, + data = ttnc2, + fitted = data.frame( + "(fitted)" = predict(ttnc_quest, ttnc2, type = "node"), + "(response)" = ttnc2$Survived, + check.names = FALSE), + terms = terms(Survived ~ ., data = ttnc2) + ) > ttnc_quest2 <- as.constparty(ttnc_quest2) > > > ################################################### > ### code chunk number 18: PMML-Titanic-plot2 > ################################################### > plot(ttnc_quest2) > > > ################################################### > ### code chunk number 19: PMML-write > ################################################### > library("pmml") Loading required package: XML > tfile <- tempfile() > write(toString(pmml(rp)), file = tfile) > > > ################################################### > ### code chunk number 20: PMML-read > ################################################### > (party_pmml <- pmmlTreeModel(tfile)) Model formula: Survived ~ Class + Gender + Age Fitted party: [1] root | [2] Gender in Male | | [3] Age in Adult: No (n = 1667, err = 20.3%) | | [4] Age in Child | | | [5] Class in 3rd: No (n = 48, err = 27.1%) | | | [6] Class in 1st, 2nd: Yes (n = 16, err = 0.0%) | [7] Gender in Female | | [8] Class in 3rd: No (n = 196, err = 45.9%) | | [9] Class in 1st, 2nd, Crew: Yes (n = 274, err = 7.3%) Number of inner nodes: 4 Number of terminal nodes: 5 > all.equal(predict(party_rp, newdata = ttnc, type = "prob"), + predict(party_pmml, newdata = ttnc, type = "prob"), + check.attributes = FALSE) [1] TRUE > > > ################################################### > ### code chunk number 21: mytree-1 > ################################################### > findsplit <- function(response, data, weights, alpha = 0.01) { + + ## extract response values from data + y <- factor(rep(data[[response]], weights)) + + ## perform chi-squared test of y vs. x + mychisqtest <- function(x) { + x <- factor(x) + if(length(levels(x)) < 2) return(NA) + ct <- suppressWarnings(chisq.test(table(y, x), correct = FALSE)) + pchisq(ct$statistic, ct$parameter, log = TRUE, lower.tail = FALSE) + } + xselect <- which(names(data) != response) + logp <- sapply(xselect, function(i) mychisqtest(rep(data[[i]], weights))) + names(logp) <- names(data)[xselect] + + ## Bonferroni-adjusted p-value small enough? + if(all(is.na(logp))) return(NULL) + minp <- exp(min(logp, na.rm = TRUE)) + minp <- 1 - (1 - minp)^sum(!is.na(logp)) + if(minp > alpha) return(NULL) + + ## for selected variable, search for split minimizing p-value + xselect <- xselect[which.min(logp)] + x <- rep(data[[xselect]], weights) + + ## set up all possible splits in two kid nodes + lev <- levels(x[drop = TRUE]) + if(length(lev) == 2) { + splitpoint <- lev[1] + } else { + comb <- do.call("c", lapply(1:(length(lev) - 2), + function(x) combn(lev, x, simplify = FALSE))) + xlogp <- sapply(comb, function(q) mychisqtest(x %in% q)) + splitpoint <- comb[[which.min(xlogp)]] + } + + ## split into two groups (setting groups that do not occur to NA) + splitindex <- !(levels(data[[xselect]]) %in% splitpoint) + splitindex[!(levels(data[[xselect]]) %in% lev)] <- NA_integer_ + splitindex <- splitindex - min(splitindex, na.rm = TRUE) + 1L + + ## return split as partysplit object + return(partysplit(varid = as.integer(xselect), + index = splitindex, + info = list(p.value = 1 - (1 - exp(logp))^sum(!is.na(logp))))) + } > > > ################################################### > ### code chunk number 22: mytree-2 > ################################################### > growtree <- function(id = 1L, response, data, weights, minbucket = 30) { + + ## for less than 30 observations stop here + if (sum(weights) < minbucket) return(partynode(id = id)) + + ## find best split + sp <- findsplit(response, data, weights) + ## no split found, stop here + if (is.null(sp)) return(partynode(id = id)) + + ## actually split the data + kidids <- kidids_split(sp, data = data) + + ## set up all daugther nodes + kids <- vector(mode = "list", length = max(kidids, na.rm = TRUE)) + for (kidid in 1:length(kids)) { + ## select observations for current node + w <- weights + w[kidids != kidid] <- 0 + ## get next node id + if (kidid > 1) { + myid <- max(nodeids(kids[[kidid - 1]])) + } else { + myid <- id + } + ## start recursion on this daugther node + kids[[kidid]] <- growtree(id = as.integer(myid + 1), response, data, w) + } + + ## return nodes + return(partynode(id = as.integer(id), split = sp, kids = kids, + info = list(p.value = min(info_split(sp)$p.value, na.rm = TRUE)))) + } > > > ################################################### > ### code chunk number 23: mytree-3 > ################################################### > mytree <- function(formula, data, weights = NULL) { + + ## name of the response variable + response <- all.vars(formula)[1] + ## data without missing values, response comes last + data <- data[complete.cases(data), c(all.vars(formula)[-1], response)] + ## data is factors only + stopifnot(all(sapply(data, is.factor))) + + if (is.null(weights)) weights <- rep(1L, nrow(data)) + ## weights are case weights, i.e., integers + stopifnot(length(weights) == nrow(data) & + max(abs(weights - floor(weights))) < .Machine$double.eps) + + ## grow tree + nodes <- growtree(id = 1L, response, data, weights) + + ## compute terminal node number for each observation + fitted <- fitted_node(nodes, data = data) + ## return rich constparty object + ret <- party(nodes, data = data, + fitted = data.frame("(fitted)" = fitted, + "(response)" = data[[response]], + "(weights)" = weights, + check.names = FALSE), + terms = terms(formula)) + as.constparty(ret) + } > > > ################################################### > ### code chunk number 24: mytree-4 > ################################################### > (myttnc <- mytree(Survived ~ Class + Age + Gender, data = ttnc)) Model formula: Survived ~ Class + Age + Gender Fitted party: [1] root | [2] Gender in Male | | [3] Class in 1st | | | [4] Age in Child: Yes (n = 5, err = 0.0%) | | | [5] Age in Adult: No (n = 175, err = 32.6%) | | [6] Class in 2nd, 3rd, Crew | | | [7] Age in Child | | | | [8] Class in 2nd: Yes (n = 11, err = 0.0%) | | | | [9] Class in 3rd: No (n = 48, err = 27.1%) | | | [10] Age in Adult | | | | [11] Class in Crew: No (n = 862, err = 22.3%) | | | | [12] Class in 2nd, 3rd: No (n = 630, err = 14.1%) | [13] Gender in Female | | [14] Class in 3rd: No (n = 196, err = 45.9%) | | [15] Class in 1st, 2nd, Crew: Yes (n = 274, err = 7.3%) Number of inner nodes: 7 Number of terminal nodes: 8 > > > ################################################### > ### code chunk number 25: mytree-5 > ################################################### > plot(myttnc) > > > ################################################### > ### code chunk number 26: mytree-pval > ################################################### > nid <- nodeids(myttnc) > iid <- nid[!(nid %in% nodeids(myttnc, terminal = TRUE))] > (pval <- unlist(nodeapply(myttnc, ids = iid, + FUN = function(n) info_node(n)$p.value))) 1 2 3 6 7 0.000000e+00 2.965383e-06 1.756527e-03 6.933623e-05 8.975754e-06 10 13 2.992870e-05 0.000000e+00 > > > ################################################### > ### code chunk number 27: mytree-nodeprune > ################################################### > myttnc2 <- nodeprune(myttnc, ids = iid[pval > 1e-5]) > > > ################################################### > ### code chunk number 28: mytree-nodeprune-plot > ################################################### > plot(myttnc2) > > > ################################################### > ### code chunk number 29: mytree-glm > ################################################### > logLik(glm(Survived ~ Class + Age + Gender, data = ttnc, + family = binomial())) 'log Lik.' -1105.031 (df=6) > > > ################################################### > ### code chunk number 30: mytree-bs > ################################################### > bs <- rmultinom(25, nrow(ttnc), rep(1, nrow(ttnc)) / nrow(ttnc)) > > > ################################################### > ### code chunk number 31: mytree-ll > ################################################### > bloglik <- function(prob, weights) + sum(weights * dbinom(ttnc$Survived == "Yes", size = 1, + prob[,"Yes"], log = TRUE)) > > > ################################################### > ### code chunk number 32: mytree-bsll > ################################################### > f <- function(w) { + tr <- mytree(Survived ~ Class + Age + Gender, data = ttnc, weights = w) + bloglik(predict(tr, newdata = ttnc, type = "prob"), as.numeric(w == 0)) + } > apply(bs, 2, f) [1] -418.2675 -398.0958 -418.6230 -404.1996 -410.8889 -411.7570 [7] -374.9353 -412.6712 -421.7616 -393.9068 -400.0987 -373.6991 [13] -395.1191 -422.8247 -429.5351 -384.3696 -391.9081 -388.7349 [19] -399.7435 -409.1937 -391.9392 -409.1083 -399.7312 -391.7226 [25] -391.5488 > > > ################################################### > ### code chunk number 33: mytree-node > ################################################### > nttnc <- expand.grid(Class = levels(ttnc$Class), + Gender = levels(ttnc$Gender), Age = levels(ttnc$Age)) > nttnc Class Gender Age 1 1st Male Child 2 2nd Male Child 3 3rd Male Child 4 Crew Male Child 5 1st Female Child 6 2nd Female Child 7 3rd Female Child 8 Crew Female Child 9 1st Male Adult 10 2nd Male Adult 11 3rd Male Adult 12 Crew Male Adult 13 1st Female Adult 14 2nd Female Adult 15 3rd Female Adult 16 Crew Female Adult > > > ################################################### > ### code chunk number 34: mytree-prob > ################################################### > predict(myttnc, newdata = nttnc, type = "node") 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 4 8 9 8 15 15 14 15 5 12 12 11 15 15 14 15 > predict(myttnc, newdata = nttnc, type = "response") 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 Yes Yes No No Yes Yes No Yes No No No No Yes Yes No Yes Levels: No Yes > predict(myttnc, newdata = nttnc, type = "prob") No Yes 1 0.0000000 1.0000000 2 0.0000000 1.0000000 3 0.7291667 0.2708333 4 0.0000000 1.0000000 5 0.0729927 0.9270073 6 0.0729927 0.9270073 7 0.5408163 0.4591837 8 0.0729927 0.9270073 9 0.6742857 0.3257143 10 0.8587302 0.1412698 11 0.8587302 0.1412698 12 0.7772622 0.2227378 13 0.0729927 0.9270073 14 0.0729927 0.9270073 15 0.5408163 0.4591837 16 0.0729927 0.9270073 > > > ################################################### > ### code chunk number 35: mytree-FUN > ################################################### > predict(myttnc, newdata = nttnc, FUN = function(y, w) + rank(table(rep(y, w)))) No Yes 1 1 2 2 1 2 3 2 1 4 2 1 5 1 2 6 1 2 7 2 1 8 1 2 9 2 1 10 2 1 11 2 1 12 2 1 13 1 2 14 1 2 15 2 1 16 1 2 > > > > proc.time() user system elapsed 4.73 0.48 5.20