R Under development (unstable) (2024-08-15 r87022 ucrt) -- "Unsuffered Consequences" Copyright (C) 2024 The R Foundation for Statistical Computing Platform: x86_64-w64-mingw32/x64 R is free software and comes with ABSOLUTELY NO WARRANTY. You are welcome to redistribute it under certain conditions. Type 'license()' or 'licence()' for distribution details. R is a collaborative project with many contributors. Type 'contributors()' for more information and 'citation()' on how to cite R or R packages in publications. Type 'demo()' for some demos, 'help()' for on-line help, or 'help.start()' for an HTML browser interface to help. Type 'q()' to quit R. > ## Test implementation of M-S estimator > require(robustbase) Loading required package: robustbase > source(system.file("xtraR/m-s_fns.R", package = "robustbase", mustWork=TRUE)) > source(system.file("xtraR/ex-funs.R", package = "robustbase", mustWork=TRUE)) > source(system.file("xtraR/test-tools.R", package = "robustbase")) # assert.EQ > > ## dataset with factors and continuous variables: > data(education) > education <- within(education, Region <- factor(Region)) > ## for testing purposes: > education2 <- within(education, Group <- factor(rep(1:3, length.out=length(Region)))) > > ## Test splitFrame (type fii is the only problematic type) > testFun <- function(formula, x1.idx) { + obj <- lm(formula, education2) + mf <- obj$model + ret <- splitFrame(mf, type="fii") + if (missing(x1.idx)) { + print(ret$x1.idx) + return(which(unname(ret$x1.idx))) + } + stopifnot(identical(x1.idx, which(unname(ret$x1.idx)))) + } > testFun(Y ~ 1, integer(0)) > testFun(Y ~ X1*X2*X3, integer(0)) > testFun(Y ~ Region + X1 + X2 + X3, 1:4) > testFun(Y ~ 0 + Region + X1 + X2 + X3, 1:4) > testFun(Y ~ Region*X1 + X2 + X3, c(1:5, 8:10)) > testFun(Y ~ Region*X1 + X2 + X3 + Region*Group, c(1:5, 8:18)) > testFun(Y ~ Region*X1 + X2 + X3 + Region*Group*X2, c(1:6, 8:29)) > testFun(Y ~ Region*X1 + X2 + Region*Group*X2, 1:28) > testFun(Y ~ Region*X1 + X2 + Region:Group:X2, 1:21) > testFun(Y ~ Region*X1 + X2*X3 + Region:Group:X2, c(1:6, 8:10, 12:23)) > testFun(Y ~ (X1+X2+X3+Region)^2, c(1:7,10:12,14:19)) > testFun(Y ~ (X1+X2+X3+Region)^3, c(1:19, 21:29)) > testFun(Y ~ (X1+X2+X3+Region)^4, 1:32) > testFun(Y ~ Region:X1:X2 + X1*X2, c(1:1, 4:7)) > > > control <- lmrob.control() > cntrlT1 <- lmrob.control(trace.lev=1) > f.lm <- lm(Y ~ Region + X1 + X2 + X3, education) > splt <- splitFrame(f.lm$model) > stopifnot(identical(names(splt$x1.idx), names(coef(f.lm))), + unname(splt$x1.idx) == c(rep(TRUE, 4), rep(FALSE, 3)) + ) > y <- education$Y > > ## test orthogonalizing > x1 <- splt$x1 > x2 <- splt$x2 > tmp <- lmrob.lar(x1, y, control) > y.tilde <- tmp$resid > t1 <- tmp$coef > x2.tilde <- x2 > T2 <- matrix(0, nrow=ncol(x1), ncol=ncol(x2)) > for (i in 1:ncol(x2)) { + tmp <- lmrob.lar(x1, x2[,i], control) + x2.tilde[,i] <- tmp$resid + T2[,i] <- tmp$coef + } > T2 [,1] [,2] [,3] [1,] 774 4894 307 [2,] -113 14 17 [3,] -171 -774 16 [4,] -48 -81 26 > > set.seed(10) > mss1 <- m_s_subsample(x1, x2.tilde, y.tilde, cntrlT1, orth = FALSE) lmrob_M_S(n = 50, nRes = 500, (p1,p2)=(4,3), (orth,subs,desc)=(0,1,0)) Finished M-S subsampling with scale = 30.81835 > mss1 <- within(mss1, b1 <- drop(t1 + b1 - T2 %*% b2)) > stopifnot(all.equal(30.81835, mss1$scale, tol=1e-7)) > set.seed(10) > mss2 <- m_s_subsample(x1, x2, y, cntrlT1, orth = TRUE) lmrob_M_S(n = 50, nRes = 500, (p1,p2)=(4,3), (orth,subs,desc)=(1,1,0)) Finished M-S subsampling with scale = 30.81835 > stopifnot(all.equal(mss1, mss2)) > > res <- vector("list", 100) > set.seed(0) > time <- system.time(for (i in seq_along(res)) { + tmp <- m_s_subsample(x1, x2.tilde, y.tilde, control, FALSE) + res[[i]] <- unlist(within(tmp, b1 <- drop(t1 + b1 - T2 %*% b2))) + }) > cat('Time elapsed in subsampling: ', time,'\n') Time elapsed in subsampling: 0.33 0 0.33 NA NA > ## show a summary of the results {"FIXME": output is platform dependent} > summary(res1 <- do.call(rbind, res)) b11 b12 b13 b14 Min. :-316.24 Min. :-33.92 Min. :-35.8704 Min. :16.43 1st Qu.:-223.37 1st Qu.:-23.66 1st Qu.: -8.9603 1st Qu.:29.92 Median :-163.19 Median :-21.37 Median : -7.2929 Median :32.20 Mean :-161.11 Mean :-22.02 Mean : -8.0727 Mean :32.15 3rd Qu.:-103.36 3rd Qu.:-18.42 3rd Qu.: -5.9055 3rd Qu.:35.72 Max. : 61.83 Max. :-12.03 Max. : 0.7015 Max. :42.23 b21 b22 b23 scale Min. :-0.03808 Min. :0.02111 Min. :0.2555 Min. :29.79 1st Qu.:-0.00397 1st Qu.:0.03927 1st Qu.:0.4956 1st Qu.:30.43 Median : 0.02160 Median :0.04716 Median :0.6381 Median :30.91 Mean : 0.02734 Mean :0.04618 Mean :0.6219 Mean :30.95 3rd Qu.: 0.05561 3rd Qu.:0.05224 3rd Qu.:0.7473 3rd Qu.:31.35 Max. : 0.10427 Max. :0.06938 Max. :0.9172 Max. :32.18 > ## compare with fast S solution > fmS <- lmrob(Y ~ Region + X1 + X2 + X3, education, init="S") > coef(fmS) (Intercept) Region2 Region3 Region4 X1 -135.72592554 -20.64576283 -9.84881727 24.58013066 0.03405591 X2 X3 0.04327562 0.57895741 > fmS$scale [1] 26.40386 > > ### Comparing m-s_descent implementations() {our C and R} : ------------------- > > ctrl <- control > #ctrl$trace.lev <- 5 > ctrl$k.max <- 1 > mC <- m_s_descent (x1, x2, y, ctrl, mss2$b1, mss2$b2, mss2$scale+10) > mR <- m_s_descent_Ronly(x1, x2, y, ctrl, mss2$b1, mss2$b2, mss2$scale+10) > nm <- c("b1","b2", "scale", "res") > stopifnot(all.equal(mC[nm], mR[nm], check.attributes = FALSE, tolerance = 4e-14)) > # seen 5.567e-15 in OpenBLAS ^^^^^ > > ## control$k.m_s <- 100 > res3 <- vector("list", 100) > time <- system.time(for (i in seq_along(res3)) { + ri <- res[[i]] + res3[[i]] <- unlist(m_s_descent(x1, x2, y, control, + ri[1:4], ri[5:7], ri[8])) + }) > cat('Time elapsed in descent proc: ', time,'\n') Time elapsed in descent proc: 0.06 0 0.06 NA NA > > ## show a summary of the results {"FIXME": output is platform dependent} > res4 <- do.call(rbind, res3) > summary(res4[,1:8]) b11 b12 b13 b14 Min. :-316.3 Min. :-30.56 Min. :-36.501 Min. :16.43 1st Qu.:-222.4 1st Qu.:-23.09 1st Qu.: -8.960 1st Qu.:27.96 Median :-160.7 Median :-21.00 Median : -7.868 Median :30.46 Mean :-158.2 Mean :-20.60 Mean : -9.046 Mean :30.84 3rd Qu.:-102.7 3rd Qu.:-17.40 3rd Qu.: -6.842 3rd Qu.:32.75 Max. : 101.7 Max. :-12.24 Max. : -4.032 Max. :42.23 b21 b22 b23 scale Min. :-0.02141 Min. :0.01459 Min. :0.2034 Min. :29.79 1st Qu.: 0.02048 1st Qu.:0.03873 1st Qu.:0.5007 1st Qu.:30.37 Median : 0.04169 Median :0.04271 Median :0.6381 Median :30.57 Mean : 0.03911 Mean :0.04359 Mean :0.6270 Mean :30.70 3rd Qu.: 0.06102 3rd Qu.:0.04798 3rd Qu.:0.7460 3rd Qu.:30.96 Max. : 0.09102 Max. :0.06367 Max. :0.9172 Max. :31.84 > > stopifnot(all.equal( # 'test', not only plot: + res1[, "scale"], res4[,"scale"], tol = 0.03), + res1[, "scale"] >= res4[,"scale"] - 1e-7 ) # 1e-7 just in case > plot(res1[, "scale"], res4[,"scale"]) > abline(0,1, col=adjustcolor("gray", 0.5)) > > ## Test lmrob.M.S > x <- model.matrix(fmS) > control$trace.lev <- 3 > ## --------- -- > set.seed(1003) > fMS <- lmrob.M.S(x, y, control, fmS$model) lmrob_M_S(n = 50, nRes = 500, (p1,p2)=(4,3), (orth,subs,desc)=(1,1,1)) orthogonalized: SIGMA=10.3443 Starting M-S subsampling procedure(p1=4, p2=3; ini.sc=1.03443e+21) .. [setup Ok] Sample[ 0]: new candidate with sc = 43.2268777 in 46 iter Sample[ 1]: new candidate with sc = 33.1250480 in 49 iter Sample[ 18]: new candidate with sc = 31.8558584 in 42 iter Sample[136]: new candidate with sc = 31.7408095 in 35 iter Sample[338]: new candidate with sc = 31.5118791 in 37 iter Finished M-S subsampling with scale = 31.51188 b1: -5.44507 7.82526 1.75671 7.6568 b2: 0.0484424 0.0376332 0.591759 Starting descent procedure... Scale: 31.51188 Refinement step 1: better fit, scale: 31.486 Refinement step 2: no improvement, scale: 31.496 Refinement step 3: no improvement, scale: 31.498 Refinement step 4: better fit, scale: 31.388 Refinement step 5: better fit, scale: 31.343 Refinement step 6: better fit, scale: 31.329 Refinement step 7: no improvement, scale: 31.331 Refinement step 8: no improvement, scale: 31.342 Refinement step 9: no improvement, scale: 31.358 Refinement step 10: no improvement, scale: 31.377 Refinement step 11: no improvement, scale: 31.398 Refinement step 12: no improvement, scale: 31.421 Refinement step 13: no improvement, scale: 31.445 Refinement step 14: no improvement, scale: 31.469 Refinement step 15: no improvement, scale: 31.495 Refinement step 16: no improvement, scale: 31.521 Refinement step 17: no improvement, scale: 31.549 Refinement step 18: no improvement, scale: 31.577 Refinement step 19: no improvement, scale: 31.607 Refinement step 20: no improvement, scale: 31.637 Refinement step 21: no improvement, scale: 31.668 Refinement step 22: no improvement, scale: 31.700 Refinement step 23: no improvement, scale: 31.734 Refinement step 24: no improvement, scale: 31.768 Refinement step 25: no improvement, scale: 31.804 Refinement step 26: no improvement, scale: 31.841 Descent procedure: not converged (best scale: 31.329, last step: 31.841) The procedure stopped after 27 steps because there was no improvement in the last 20 steps. To increase this number, use the control parameter 'k.m_s'. b1: -113.529 -14.8582 -9.42637 27.4853 b2: 0.0668455 0.0361344 0.540701 Warning message: In lmrob.M.S(x, y, control, fmS$model) : M-S estimator did *not* converge > resid <- drop(y - x %*% fMS$coef) > assert.EQ(resid, fMS$resid, check.attributes=FALSE, tol = 1e-12) > > ## Test direct call to lmrob > ## 1. trace_lev output: > set.seed(17) > fMS <- lmrob(Y ~ Region + X1 + X2 + X3, education, init = "M-S", trace.lev=2) lmrob_M_S(n = 50, nRes = 500, (p1,p2)=(4,3), (orth,subs,desc)=(1,1,1)) orthogonalized: SIGMA=10.3443 Starting M-S subsampling procedure(p1=4, p2=3; ini.sc=1.03443e+21) .. [setup Ok] Sample[ 0]: new candidate with sc = 720.839768 in 42 iter Sample[ 1]: new candidate with sc = 32.1056576 in 48 iter Sample[ 69]: new candidate with sc = 31.8561665 in 40 iter Sample[128]: new candidate with sc = 31.1223584 in 47 iter Finished M-S subsampling with scale = 31.12236 Starting descent procedure... Refinement step 1: better fit, scale: 30.963 Refinement step 4: better fit, scale: 30.888 Descent procedure: not converged (best scale: 30.888, last step: 31.134) The procedure stopped after 25 steps because there was no improvement in the last 20 steps. To increase this number, use the control parameter 'k.m_s'. init converged (remaining method = "M") -> coef= [1] -236.33813174 -22.50174637 -7.69929975 25.25388208 0.05220654 [6] 0.04769691 0.78984802 lmrob_MM(): rwls(): rwls() used 19 it.; last ||b0 - b1||_1 = 1.76872e-05, L(b1) = 0.146343135734; convergence lmrob..MM..fit(*, obj) --> updating .. qr(x * rweights) -> rank= 7 step "M" -> new coef= (Intercept) Region2 Region3 Region4 X1 -150.21716170 -12.68711627 -10.64944153 21.92534635 0.04153037 X2 X3 0.04337025 0.61139401 Warning message: In lmrob.M.S(x, y, control, mf = mf) : M-S estimator did *not* converge > > set.seed(13) > fiMS <- lmrob(Y ~ Region + X1 + X2 + X3, education, init = "M-S") > out2 <- capture.output(summary(fiMS)) > writeLines(out2) Call: lmrob(formula = Y ~ Region + X1 + X2 + X3, data = education, init = "M-S") \--> method = "M-SM" Residuals: Min 1Q Median 3Q Max -62.729 -15.529 -1.572 23.392 174.750 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) -150.07630 143.09300 -1.049 0.30013 Region2 -12.76767 16.63758 -0.767 0.44704 Region3 -10.63954 15.92865 -0.668 0.50774 Region4 21.95445 16.96484 1.294 0.20253 X1 0.04146 0.05040 0.823 0.41525 X2 0.04337 0.01373 3.159 0.00289 ** X3 0.61106 0.35153 1.738 0.08932 . --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Robust residual standard error: 30.82 Multiple R-squared: 0.5695, Adjusted R-squared: 0.5095 Convergence in 19 IRWLS iterations Robustness weights: observation 50 is an outlier with |weight| = 0 ( < 0.002); 7 weights are ~= 1. The remaining 42 ones are summarized as Min. 1st Qu. Median Mean 3rd Qu. Max. 0.2884 0.8904 0.9508 0.8890 0.9867 0.9985 Algorithmic parameters: tuning.chi bb tuning.psi rel.tol 1.548e+00 5.000e-01 4.685e+00 1.000e-07 scale.tol solve.tol zero.tol eps.outlier 1.000e-10 1.000e-07 1.000e-10 2.000e-03 eps.x warn.limit.reject warn.limit.meanrw 1.071e-08 5.000e-01 5.000e-01 nResample max.it k.max maxit.scale k.m_s 500 50 200 200 20 trace.lev mts compute.rd fast.s.large.n 0 1000 0 2000 psi subsampling cov "bisquare" "nonsingular" ".vcov.w" split.type compute.outlier.stats "f" "SM" seed : int(0) > > set.seed(13) > fiM.S <- lmrob(Y ~ Region + X1 + X2 + X3, education, init=lmrob.M.S) > out3 <- capture.output(summary(fiM.S)) > > ## must be the same {apart from the "init=" in the call}: > i <- 3 > stopifnot(identical(out2[-i], out3[-i])) > ## the difference: > c(rbind(out2[i], out3[i])) [1] "lmrob(formula = Y ~ Region + X1 + X2 + X3, data = education, init = \"M-S\")" [2] "lmrob(formula = Y ~ Region + X1 + X2 + X3, data = education, init = lmrob.M.S)" > > > ### "Skipping design matrix equilibration" warning can arise for reasonable designs ----- > set.seed(1) > x2 <- matrix(rnorm(2*30), 30, 2) > data <- data.frame(y = rnorm(30), group = rep(letters[1:3], each=10), x2) > > obj <- lmrob(y ~ ., data, init="M-S", trace.lev=1) lmrob_M_S(n = 30, nRes = 500, (p1,p2)=(3,2), (orth,subs,desc)=(1,1,1)) Finished M-S subsampling with scale = 0.93330 Descent procedure: converged (best scale: 0.92746, last step: 0.92746) init converged (remaining method = "M") -> coef= [1] 0.14412998 -0.16709564 -0.01045825 -0.18551426 0.25915102 lmrob_MM(): rwls(): rwls() used 12 it.; last ||b0 - b1||_1 = 4.61816e-08, L(b1) = 0.0907731911388; convergence lmrob..MM..fit(*, obj) --> updating .. qr(x * rweights) -> rank= 5 step "M" -> new coef= (Intercept) groupb groupc X1 X2 0.41147162 -0.74117537 -0.36321634 0.02379646 0.51727653 Warning message: In lmrob.M.S(x, y, control, mf = mf) : Skipping design matrix equilibration (DGEEQU): row 23 is exactly zero. > > ## illustration: the zero row is introduced during the orthogonalization of x2 wrt x1 > ## l1 regression always produces p zero residuals > ## by chance, the zero residuals of multiple columns happen to be on the same row > sf <- splitFrame(obj$model) > x1 <- sf$x1 > x2 <- sf$x2 > control <- obj$control > > ## orthogonalize > x2.tilde <- x2 > > for(i in 1:ncol(x2)) { + tmp <- lmrob.lar(x1, x2[,i], control) + x2.tilde[,i] <- tmp$resid + } > x2.tilde == 0 X1 X2 1 FALSE FALSE 2 TRUE FALSE 3 FALSE FALSE 4 FALSE TRUE 5 FALSE FALSE 6 FALSE FALSE 7 FALSE FALSE 8 FALSE FALSE 9 FALSE FALSE 10 FALSE FALSE 11 FALSE FALSE 12 TRUE FALSE 13 FALSE FALSE 14 FALSE FALSE 15 FALSE FALSE 16 FALSE FALSE 17 FALSE TRUE 18 FALSE FALSE 19 FALSE FALSE 20 FALSE FALSE 21 FALSE FALSE 22 FALSE FALSE 23 TRUE TRUE 24 FALSE FALSE 25 FALSE FALSE 26 FALSE FALSE 27 FALSE FALSE 28 FALSE FALSE 29 FALSE FALSE 30 FALSE FALSE > > > ## Specifying init="M-S" for a model without categorical variables > ## used to cause a segfault; now uses "S" > lmrob(LNOx ~ LNOxEm, NOxEmissions[1:10,], init="M-S") Call: lmrob(formula = LNOx ~ LNOxEm, data = NOxEmissions[1:10, ], init = "M-S") \--> method = "MM" Coefficients: (Intercept) LNOxEm 1.5118 0.4978 Warning message: In lmrob.M.S(x, y, control, mf = mf) : No categorical variables found in model. Reverting to S-estimator. > > ## Now an ANOVA model with *only* categorical variables > n <- 64 # multiple of 16 > stopifnot(n %% 16 == 0) > d.AOV <- data.frame(y = round(100*rnorm(64)), + A=gl(4,n/4), B=gl(2,8, n), C=gl(2,4,n)) > fm <- lmrob(y ~ A*B*C, data = d.AOV, init = "M-S", trace.lev=2) init converged (remaining method = "M") -> coef= [1] 19 6 58 -161 91 -30 -121 -176 1 45 -75 113 -111 132 187 [16] 0 lmrob_MM(): rwls(): rwls() used 30 it.; last ||b0 - b1||_1 = 5.76455e-05, L(b1) = 0.173788300102; convergence lmrob..MM..fit(*, obj) --> updating .. qr(x * rweights) -> rank= 16 step "M" -> new coef= (Intercept) A2 A3 A4 B2 C2 6.3072125 -12.1364596 69.2442724 -71.0775753 35.0078234 19.5279997 A2:B2 A3:B2 A4:B2 A2:C2 A3:C2 A4:C2 -24.1727568 -87.0419714 -0.7811905 73.9279841 -77.3141570 28.1145939 B2:C2 A2:B2:C2 A3:B2:C2 A4:B2:C2 -74.7935878 46.9640876 66.0934400 -7.4911099 Warning message: In lmrob.M.S(x, y, control, mf = mf) : No continuous variables found in model. Reverting to L1-estimator. > > ## lmrob_M_S(n = 64, nRes = 500, (p1,p2)=(16,0), (orth,subs,desc)=(1,1,1)) > ## Starting subsampling procedure.. Error in lmrob.M.S(x, y, control, mf) : > ## 'Calloc' could not allocate memory (18446744073709551616 of 4 bytes) > > ## BTW: Can we compute an M-estimate (instead of MM-*) as we > ## --- cannot have any x-outliers in such an ANOVA! > > proc.time() user system elapsed 1.00 0.12 1.10