R Under development (unstable) (2024-08-17 r87027 ucrt) -- "Unsuffered Consequences"
Copyright (C) 2024 The R Foundation for Statistical Computing
Platform: x86_64-w64-mingw32/x64

R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.

R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.

Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.

> ## Test implementation of M-S estimator
> require(robustbase)
Loading required package: robustbase
> source(system.file("xtraR/m-s_fns.R", package = "robustbase", mustWork=TRUE))
> source(system.file("xtraR/ex-funs.R", package = "robustbase", mustWork=TRUE))
> source(system.file("xtraR/test-tools.R", package = "robustbase")) # assert.EQ
> 
> ## dataset with factors and continuous variables:
> data(education)
> education <- within(education, Region <- factor(Region))
> ## for testing purposes:
> education2 <- within(education, Group <- factor(rep(1:3, length.out=length(Region))))
> 
> ## Test splitFrame (type fii is the only problematic type)
> testFun <- function(formula, x1.idx) {
+     obj <- lm(formula, education2)
+     mf <- obj$model
+     ret <- splitFrame(mf, type="fii")
+     if (missing(x1.idx)) {
+         print(ret$x1.idx)
+         return(which(unname(ret$x1.idx)))
+     }
+     stopifnot(identical(x1.idx, which(unname(ret$x1.idx))))
+ }
> testFun(Y ~ 1, integer(0))
> testFun(Y ~ X1*X2*X3, integer(0))
> testFun(Y ~ Region + X1 + X2 + X3, 1:4)
> testFun(Y ~ 0 + Region + X1 + X2 + X3, 1:4)
> testFun(Y ~ Region*X1 + X2 + X3, c(1:5, 8:10))
> testFun(Y ~ Region*X1 + X2 + X3 + Region*Group, c(1:5, 8:18))
> testFun(Y ~ Region*X1 + X2 + X3 + Region*Group*X2, c(1:6, 8:29))
> testFun(Y ~ Region*X1 + X2 + Region*Group*X2, 1:28)
> testFun(Y ~ Region*X1 + X2 + Region:Group:X2, 1:21)
> testFun(Y ~ Region*X1 + X2*X3 + Region:Group:X2, c(1:6, 8:10, 12:23))
> testFun(Y ~ (X1+X2+X3+Region)^2, c(1:7,10:12,14:19))
> testFun(Y ~ (X1+X2+X3+Region)^3, c(1:19, 21:29))
> testFun(Y ~ (X1+X2+X3+Region)^4, 1:32)
> testFun(Y ~ Region:X1:X2 + X1*X2, c(1:1, 4:7))
> 
> 
> control <- lmrob.control()
> cntrlT1 <- lmrob.control(trace.lev=1)
> f.lm <- lm(Y ~ Region + X1 + X2 + X3, education)
> splt <- splitFrame(f.lm$model)
> stopifnot(identical(names(splt$x1.idx), names(coef(f.lm))),
+           unname(splt$x1.idx) == c(rep(TRUE, 4), rep(FALSE, 3))
+           )
> y <- education$Y
> 
> ## test orthogonalizing
> x1 <- splt$x1
> x2 <- splt$x2
> tmp <- lmrob.lar(x1, y, control)
> y.tilde <- tmp$resid
> t1 <- tmp$coef
> x2.tilde <- x2
> T2 <- matrix(0, nrow=ncol(x1), ncol=ncol(x2))
> for (i in 1:ncol(x2)) {
+     tmp <- lmrob.lar(x1, x2[,i], control)
+     x2.tilde[,i] <- tmp$resid
+     T2[,i] <- tmp$coef
+ }
> T2
     [,1] [,2] [,3]
[1,]  774 4894  307
[2,] -113   14   17
[3,] -171 -774   16
[4,]  -48  -81   26
> 
> set.seed(10)
> mss1 <- m_s_subsample(x1, x2.tilde, y.tilde, cntrlT1, orth = FALSE)
lmrob_M_S(n = 50, nRes = 500, (p1,p2)=(4,3), (orth,subs,desc)=(0,1,0))
 Finished M-S subsampling with scale = 30.81835
> mss1 <- within(mss1, b1 <- drop(t1 + b1 - T2 %*% b2))
> stopifnot(all.equal(30.81835, mss1$scale, tol=1e-7))
> set.seed(10)
> mss2 <- m_s_subsample(x1, x2,       y,       cntrlT1, orth = TRUE)
lmrob_M_S(n = 50, nRes = 500, (p1,p2)=(4,3), (orth,subs,desc)=(1,1,0))
 Finished M-S subsampling with scale = 30.81835
> stopifnot(all.equal(mss1, mss2))
> 
> res <- vector("list", 100)
> set.seed(0)
> time <- system.time(for (i in seq_along(res)) {
+     tmp <- m_s_subsample(x1, x2.tilde, y.tilde, control, FALSE)
+     res[[i]] <- unlist(within(tmp, b1 <- drop(t1 + b1 - T2 %*% b2)))
+ })
> cat('Time elapsed in subsampling: ', time,'\n')
Time elapsed in subsampling:  0.38 0 0.37 NA NA 
> ## show a summary of the results  {"FIXME": output is platform dependent}
> summary(res1 <- do.call(rbind, res))
      b11               b12              b13                b14       
 Min.   :-316.24   Min.   :-33.92   Min.   :-35.8704   Min.   :16.43  
 1st Qu.:-223.37   1st Qu.:-23.66   1st Qu.: -8.9603   1st Qu.:29.92  
 Median :-163.19   Median :-21.37   Median : -7.2929   Median :32.20  
 Mean   :-161.11   Mean   :-22.02   Mean   : -8.0727   Mean   :32.15  
 3rd Qu.:-103.36   3rd Qu.:-18.42   3rd Qu.: -5.9055   3rd Qu.:35.72  
 Max.   :  61.83   Max.   :-12.03   Max.   :  0.7015   Max.   :42.23  
      b21                b22               b23             scale      
 Min.   :-0.03808   Min.   :0.02111   Min.   :0.2555   Min.   :29.79  
 1st Qu.:-0.00397   1st Qu.:0.03927   1st Qu.:0.4956   1st Qu.:30.43  
 Median : 0.02160   Median :0.04716   Median :0.6381   Median :30.91  
 Mean   : 0.02734   Mean   :0.04618   Mean   :0.6219   Mean   :30.95  
 3rd Qu.: 0.05561   3rd Qu.:0.05224   3rd Qu.:0.7473   3rd Qu.:31.35  
 Max.   : 0.10427   Max.   :0.06938   Max.   :0.9172   Max.   :32.18  
> ## compare with fast S solution
> fmS <- lmrob(Y ~ Region + X1 + X2 + X3, education, init="S")
> coef(fmS)
  (Intercept)       Region2       Region3       Region4            X1 
-135.72592554  -20.64576283   -9.84881727   24.58013066    0.03405591 
           X2            X3 
   0.04327562    0.57895741 
> fmS$scale
[1] 26.40386
> 
> ###  Comparing m-s_descent implementations()  {our C and R} : -------------------
> 
> ctrl <- control
> #ctrl$trace.lev <- 5
> ctrl$k.max <- 1
> mC <- m_s_descent      (x1, x2, y, ctrl, mss2$b1, mss2$b2, mss2$scale+10)
> mR <- m_s_descent_Ronly(x1, x2, y, ctrl, mss2$b1, mss2$b2, mss2$scale+10)
> nm <- c("b1","b2", "scale", "res")
> stopifnot(all.equal(mC[nm], mR[nm], check.attributes = FALSE, tolerance = 4e-14))
>                                         # seen 5.567e-15 in OpenBLAS      ^^^^^
> 
> ## control$k.m_s <- 100
> res3 <- vector("list", 100)
> time <- system.time(for (i in seq_along(res3)) {
+     ri <- res[[i]]
+     res3[[i]] <- unlist(m_s_descent(x1, x2, y, control,
+ 				    ri[1:4], ri[5:7], ri[8]))
+ })
> cat('Time elapsed in descent proc: ', time,'\n')
Time elapsed in descent proc:  0.06 0 0.07 NA NA 
> 
> ## show a summary of the results   {"FIXME": output is platform dependent}
> res4 <- do.call(rbind, res3)
> summary(res4[,1:8])
      b11              b12              b13               b14       
 Min.   :-316.3   Min.   :-30.56   Min.   :-36.501   Min.   :16.43  
 1st Qu.:-222.4   1st Qu.:-23.09   1st Qu.: -8.960   1st Qu.:27.96  
 Median :-160.7   Median :-21.00   Median : -7.868   Median :30.46  
 Mean   :-158.2   Mean   :-20.60   Mean   : -9.046   Mean   :30.84  
 3rd Qu.:-102.7   3rd Qu.:-17.40   3rd Qu.: -6.842   3rd Qu.:32.75  
 Max.   : 101.7   Max.   :-12.24   Max.   : -4.032   Max.   :42.23  
      b21                b22               b23             scale      
 Min.   :-0.02141   Min.   :0.01459   Min.   :0.2034   Min.   :29.79  
 1st Qu.: 0.02048   1st Qu.:0.03873   1st Qu.:0.5007   1st Qu.:30.37  
 Median : 0.04169   Median :0.04271   Median :0.6381   Median :30.57  
 Mean   : 0.03911   Mean   :0.04359   Mean   :0.6270   Mean   :30.70  
 3rd Qu.: 0.06102   3rd Qu.:0.04798   3rd Qu.:0.7460   3rd Qu.:30.96  
 Max.   : 0.09102   Max.   :0.06367   Max.   :0.9172   Max.   :31.84  
> 
> stopifnot(all.equal( # 'test', not only plot:
+ 	  res1[, "scale"],   res4[,"scale"], tol = 0.03),
+ 	  res1[, "scale"] >= res4[,"scale"] - 1e-7 ) # 1e-7 just in case
>      plot(res1[, "scale"],   res4[,"scale"])
> abline(0,1, col=adjustcolor("gray", 0.5))
> 
> ## Test lmrob.M.S
> x <- model.matrix(fmS)
> control$trace.lev <- 3
> ##      ---------   --
> set.seed(1003)
> fMS <- lmrob.M.S(x, y, control, fmS$model)
lmrob_M_S(n = 50, nRes = 500, (p1,p2)=(4,3), (orth,subs,desc)=(1,1,1))
  orthogonalized: SIGMA=10.3443
 Starting M-S subsampling procedure(p1=4, p2=3; ini.sc=1.03443e+21) ..  [setup Ok]
  Sample[  0]: new candidate with sc =      43.2268777 in 46 iter
  Sample[  1]: new candidate with sc =      33.1250480 in 49 iter
  Sample[ 18]: new candidate with sc =      31.8558584 in 42 iter
  Sample[136]: new candidate with sc =      31.7408095 in 35 iter
  Sample[338]: new candidate with sc =      31.5118791 in 37 iter
 Finished M-S subsampling with scale = 31.51188
  b1: -5.44507 7.82526 1.75671 7.6568 
  b2: 0.0484424 0.0376332 0.591759 
 Starting descent procedure...
  Scale: 31.51188
  Refinement step   1: better fit, scale:     31.486
  Refinement step   2: no improvement, scale:     31.496
  Refinement step   3: no improvement, scale:     31.498
  Refinement step   4: better fit, scale:     31.388
  Refinement step   5: better fit, scale:     31.343
  Refinement step   6: better fit, scale:     31.329
  Refinement step   7: no improvement, scale:     31.331
  Refinement step   8: no improvement, scale:     31.342
  Refinement step   9: no improvement, scale:     31.358
  Refinement step  10: no improvement, scale:     31.377
  Refinement step  11: no improvement, scale:     31.398
  Refinement step  12: no improvement, scale:     31.421
  Refinement step  13: no improvement, scale:     31.445
  Refinement step  14: no improvement, scale:     31.469
  Refinement step  15: no improvement, scale:     31.495
  Refinement step  16: no improvement, scale:     31.521
  Refinement step  17: no improvement, scale:     31.549
  Refinement step  18: no improvement, scale:     31.577
  Refinement step  19: no improvement, scale:     31.607
  Refinement step  20: no improvement, scale:     31.637
  Refinement step  21: no improvement, scale:     31.668
  Refinement step  22: no improvement, scale:     31.700
  Refinement step  23: no improvement, scale:     31.734
  Refinement step  24: no improvement, scale:     31.768
  Refinement step  25: no improvement, scale:     31.804
  Refinement step  26: no improvement, scale:     31.841
 Descent procedure: not converged (best scale: 31.329, last step: 31.841)
  The procedure stopped after 27 steps because there was no improvement in the last 20 steps.
  To increase this number, use the control parameter 'k.m_s'.
  b1: -113.529 -14.8582 -9.42637 27.4853 
  b2: 0.0668455 0.0361344 0.540701 
Warning message:
In lmrob.M.S(x, y, control, fmS$model) : M-S estimator did *not* converge
> resid <- drop(y - x %*% fMS$coef)
> assert.EQ(resid, fMS$resid, check.attributes=FALSE, tol = 1e-12)
> 
> ## Test direct call to lmrob
> ## 1. trace_lev output:
> set.seed(17)
> fMS <- lmrob(Y ~ Region + X1 + X2 + X3, education, init = "M-S", trace.lev=2)
lmrob_M_S(n = 50, nRes = 500, (p1,p2)=(4,3), (orth,subs,desc)=(1,1,1))
  orthogonalized: SIGMA=10.3443
 Starting M-S subsampling procedure(p1=4, p2=3; ini.sc=1.03443e+21) ..  [setup Ok]
  Sample[  0]: new candidate with sc =      720.839768 in 42 iter
  Sample[  1]: new candidate with sc =      32.1056576 in 48 iter
  Sample[ 69]: new candidate with sc =      31.8561665 in 40 iter
  Sample[128]: new candidate with sc =      31.1223584 in 47 iter
 Finished M-S subsampling with scale = 31.12236
 Starting descent procedure...
  Refinement step   1: better fit, scale:     30.963
  Refinement step   4: better fit, scale:     30.888
 Descent procedure: not converged (best scale: 30.888, last step: 31.134)
  The procedure stopped after 25 steps because there was no improvement in the last 20 steps.
  To increase this number, use the control parameter 'k.m_s'.
init converged (remaining method = "M") -> coef=
[1] -236.33813174  -22.50174637   -7.69929975   25.25388208    0.05220654
[6]    0.04769691    0.78984802
lmrob_MM(): rwls():
 rwls() used 19 it.; last ||b0 - b1||_1 = 1.76872e-05, L(b1) = 0.146343135734; convergence
lmrob..MM..fit(*, obj) --> updating ..  qr(x * rweights) -> rank= 7
step "M" -> new coef=
  (Intercept)       Region2       Region3       Region4            X1 
-150.21716170  -12.68711627  -10.64944153   21.92534635    0.04153037 
           X2            X3 
   0.04337025    0.61139401 
Warning message:
In lmrob.M.S(x, y, control, mf = mf) : M-S estimator did *not* converge
> 
> set.seed(13)
> fiMS <- lmrob(Y ~ Region + X1 + X2 + X3, education, init = "M-S")
> out2 <- capture.output(summary(fiMS))
> writeLines(out2)

Call:
lmrob(formula = Y ~ Region + X1 + X2 + X3, data = education, init = "M-S")
 \--> method = "M-SM"
Residuals:
    Min      1Q  Median      3Q     Max 
-62.729 -15.529  -1.572  23.392 174.750 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)   
(Intercept) -150.07630  143.09300  -1.049  0.30013   
Region2      -12.76767   16.63758  -0.767  0.44704   
Region3      -10.63954   15.92865  -0.668  0.50774   
Region4       21.95445   16.96484   1.294  0.20253   
X1             0.04146    0.05040   0.823  0.41525   
X2             0.04337    0.01373   3.159  0.00289 **
X3             0.61106    0.35153   1.738  0.08932 . 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Robust residual standard error: 30.82 
Multiple R-squared:  0.5695,	Adjusted R-squared:  0.5095 
Convergence in 19 IRWLS iterations

Robustness weights: 
 observation 50 is an outlier with |weight| = 0 ( < 0.002); 
 7 weights are ~= 1. The remaining 42 ones are summarized as
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.2884  0.8904  0.9508  0.8890  0.9867  0.9985 
Algorithmic parameters: 
       tuning.chi                bb        tuning.psi           rel.tol 
        1.548e+00         5.000e-01         4.685e+00         1.000e-07 
        scale.tol         solve.tol          zero.tol       eps.outlier 
        1.000e-10         1.000e-07         1.000e-10         2.000e-03 
            eps.x warn.limit.reject warn.limit.meanrw 
        1.071e-08         5.000e-01         5.000e-01 
     nResample         max.it          k.max    maxit.scale          k.m_s 
           500             50            200            200             20 
     trace.lev            mts     compute.rd fast.s.large.n 
             0           1000              0           2000 
                  psi           subsampling                   cov 
           "bisquare"         "nonsingular"             ".vcov.w" 
           split.type compute.outlier.stats 
                  "f"                  "SM" 
seed : int(0) 
> 
> set.seed(13)
> fiM.S <- lmrob(Y ~ Region + X1 + X2 + X3, education, init=lmrob.M.S)
> out3 <- capture.output(summary(fiM.S))
> 
> ## must be the same {apart from the "init=" in the call}:
> i <- 3
> stopifnot(identical(out2[-i], out3[-i]))
> ## the difference:
> c(rbind(out2[i], out3[i]))
[1] "lmrob(formula = Y ~ Region + X1 + X2 + X3, data = education, init = \"M-S\")"  
[2] "lmrob(formula = Y ~ Region + X1 + X2 + X3, data = education, init = lmrob.M.S)"
> 
> 
> ###  "Skipping design matrix equilibration" warning can arise for reasonable designs -----
> set.seed(1)
> x2 <- matrix(rnorm(2*30), 30, 2)
> data <- data.frame(y = rnorm(30), group = rep(letters[1:3], each=10), x2)
> 
> obj <- lmrob(y ~ ., data, init="M-S", trace.lev=1)
lmrob_M_S(n = 30, nRes = 500, (p1,p2)=(3,2), (orth,subs,desc)=(1,1,1))
 Finished M-S subsampling with scale = 0.93330
 Descent procedure: converged (best scale: 0.92746, last step: 0.92746)
init converged (remaining method = "M") -> coef=
[1]  0.14412998 -0.16709564 -0.01045825 -0.18551426  0.25915102
lmrob_MM(): rwls():
 rwls() used 12 it.; last ||b0 - b1||_1 = 4.61816e-08, L(b1) = 0.0907731911388; convergence
lmrob..MM..fit(*, obj) --> updating ..  qr(x * rweights) -> rank= 5
step "M" -> new coef=
(Intercept)      groupb      groupc          X1          X2 
 0.41147162 -0.74117537 -0.36321634  0.02379646  0.51727653 
Warning message:
In lmrob.M.S(x, y, control, mf = mf) :
   Skipping design matrix equilibration (DGEEQU): row 23 is exactly zero.
> 
> ## illustration: the zero row is introduced during the orthogonalization of x2 wrt x1
> ## l1 regression always produces p zero residuals
> ## by chance, the zero residuals of multiple columns happen to be on the same row
> sf <- splitFrame(obj$model)
> x1 <- sf$x1
> x2 <- sf$x2
> control <- obj$control
> 
> ## orthogonalize
> x2.tilde <- x2
> 
> for(i in 1:ncol(x2)) {
+     tmp <- lmrob.lar(x1, x2[,i], control)
+     x2.tilde[,i] <- tmp$resid
+ }
> x2.tilde == 0
      X1    X2
1  FALSE FALSE
2   TRUE FALSE
3  FALSE FALSE
4  FALSE  TRUE
5  FALSE FALSE
6  FALSE FALSE
7  FALSE FALSE
8  FALSE FALSE
9  FALSE FALSE
10 FALSE FALSE
11 FALSE FALSE
12  TRUE FALSE
13 FALSE FALSE
14 FALSE FALSE
15 FALSE FALSE
16 FALSE FALSE
17 FALSE  TRUE
18 FALSE FALSE
19 FALSE FALSE
20 FALSE FALSE
21 FALSE FALSE
22 FALSE FALSE
23  TRUE  TRUE
24 FALSE FALSE
25 FALSE FALSE
26 FALSE FALSE
27 FALSE FALSE
28 FALSE FALSE
29 FALSE FALSE
30 FALSE FALSE
> 
> 
> ## Specifying init="M-S" for a model without categorical variables
> ## used to cause a segfault; now uses "S"
> lmrob(LNOx ~ LNOxEm, NOxEmissions[1:10,], init="M-S")

Call:
lmrob(formula = LNOx ~ LNOxEm, data = NOxEmissions[1:10, ], init = "M-S")
 \--> method = "MM"
Coefficients:
(Intercept)       LNOxEm  
     1.5118       0.4978  

Warning message:
In lmrob.M.S(x, y, control, mf = mf) :
  No categorical variables found in model. Reverting to S-estimator.
> 
> ## Now an ANOVA model with *only* categorical variables
> n <- 64 # multiple of 16
> stopifnot(n %% 16 == 0)
> d.AOV <- data.frame(y = round(100*rnorm(64)),
+ 		    A=gl(4,n/4), B=gl(2,8, n), C=gl(2,4,n))
> fm <- lmrob(y ~ A*B*C, data = d.AOV, init = "M-S", trace.lev=2)
init converged (remaining method = "M") -> coef=
 [1]   19    6   58 -161   91  -30 -121 -176    1   45  -75  113 -111  132  187
[16]    0
lmrob_MM(): rwls():
 rwls() used 30 it.; last ||b0 - b1||_1 = 5.76455e-05, L(b1) = 0.173788300102; convergence
lmrob..MM..fit(*, obj) --> updating ..  qr(x * rweights) -> rank= 16
step "M" -> new coef=
(Intercept)          A2          A3          A4          B2          C2 
  6.3072125 -12.1364596  69.2442724 -71.0775753  35.0078234  19.5279997 
      A2:B2       A3:B2       A4:B2       A2:C2       A3:C2       A4:C2 
-24.1727568 -87.0419714  -0.7811905  73.9279841 -77.3141570  28.1145939 
      B2:C2    A2:B2:C2    A3:B2:C2    A4:B2:C2 
-74.7935878  46.9640876  66.0934400  -7.4911099 
Warning message:
In lmrob.M.S(x, y, control, mf = mf) :
  No continuous variables found in model. Reverting to L1-estimator.
> 
> ## lmrob_M_S(n = 64, nRes = 500, (p1,p2)=(16,0), (orth,subs,desc)=(1,1,1))
> ##  Starting subsampling procedure.. Error in lmrob.M.S(x, y, control, mf) :
> ##   'Calloc' could not allocate memory (18446744073709551616 of 4 bytes)
> 
> ## BTW: Can we compute an  M-estimate (instead of MM-*) as we
> ## ---  cannot have any x-outliers in such an ANOVA!
> 
> proc.time()
   user  system elapsed 
   1.21    0.10    1.31