context("Test formulas") elt <- function(a, ep) testthat::expect_lt(max(abs(a)), ep) test_that("HC1 and HC2 formulas match sandwich", { y <- 1:10 x <- cbind(sin(1:10), sin(2:11)) fm <- lm(y~x) HC1 <- sandwich::vcovHC(fm, type="HC1") HC2 <- sandwich::vcovHC(fm, type="HC2") r <- dfadjustSE(fm) elt(r$vcov-HC2, 10^3*.Machine$double.eps) elt(r$coefficients[, "HC1 se"]-sqrt(diag(HC1)), 10^3*.Machine$double.eps) elt(r$coefficients[, "df"]-c(4.94624731, 3.90754737, 4.77265484), 1e-8) }) test_that("clustervar must be a factor", { y <- 1:10 x <- cbind(sin(1:10), sin(2:11)) fm <- lm(y~x) expect_error(dfadjustSE(fm, clustervar=1:10)) }) ## Compute the inverse square root of a symmetric matrix MatSqrtInverse <- function(A) { ei <- eigen(A, symmetric=TRUE) d <- pmax(ei$values, 0) d2 <- 1/sqrt(d) d2[d == 0] <- 0 ## diag(d2) is d2 x d2 identity if d2 is scalar, instead we want 1x1 matrix ei$vectors %*% (if (length(d2)==1) d2 else diag(d2)) %*% t(ei$vectors) } BMlmSE <- function(model, clustervar=NULL, ell=NULL, IK=TRUE) { X <- model.matrix(model) sum.model <- summary.lm(model) n <- sum(sum.model$df[1:2]) K <- model$rank XXinv <- sum.model$cov.unscaled # XX^{-1} u <- residuals(model) ## Compute DoF given G'*Omega*G without calling eigen as suggested by ## Winston Lin df <- function(GG) sum(diag(GG))^2 / sum(GG * GG) ## Previously: ## lam <- eigen(GG, only.values=TRUE)$values ## sum(lam)^2/sum(lam^2) ## no clustering if (is.null(clustervar)) { Vhat <- sandwich::vcovHC(model, type="HC2") VhatStata <- Vhat*NA M <- diag(n)-X %*% XXinv %*% t(X) # annihilator matrix ## G'*Omega*G GOG <- function(ell) { Xtilde <- drop(X %*% XXinv %*% ell / sqrt(diag(M))) crossprod(M * Xtilde) } } else { if (!is.factor(clustervar)) stop("'clustervar' must be a factor") ## Stata S <- length(levels(clustervar)) # number clusters uj <- apply(u*X, 2, function(x) tapply(x, clustervar, sum)) VhatStata <- S / (S-1) * (n-1) / (n-K) * sandwich::sandwich(model, meat = crossprod(uj)/n) ## HC2 tXs <- function(s) { Xs <- X[clustervar==s, , drop=FALSE] #nolint MatSqrtInverse(diag(NROW(Xs))-Xs%*% XXinv %*% t(Xs)) %*% Xs } tX <- lapply(levels(clustervar), tXs) # list of matrices tu <- split(u, clustervar) tutX <- vapply(seq_along(tu), function(i) crossprod(tu[[i]], tX[[i]]), numeric(K)) Vhat <- sandwich::sandwich(model, meat = tcrossprod(tutX)/n) ## DOF adjustment tHs <- function(s) { Xs <- X[clustervar==s, , drop=FALSE] #nolint index <- which(clustervar==s) ss <- outer(rep(0, n), index) # n x ns matrix of 0 ss[cbind(index, seq_along(index))] <- 1 ss-X %*% XXinv %*% t(Xs) } tH <- lapply(levels(clustervar), tHs) # list of matrices Moulton <- function() { ## Moulton estimates ns <- tapply(u, clustervar, length) ssr <- sum(u^2) s1 <- vapply(seq_along(tu), function(i) sum(tu[[i]] %o% tu[[i]]), numeric(1)) rho <- max((sum(s1)-ssr) / (sum(ns^2)-n), 0) c(sig.eps=max(ssr/n - rho, 0), rho=rho) } GOG <- function(ell) { G <- sapply(seq_along(tX), function(i) tH[[i]] %*% tX[[i]] %*% XXinv %*% ell) GG <- crossprod(G) ## IK method if (IK==TRUE) { Gsums <- apply(G, 2, function(x) tapply(x, clustervar, sum)) # Z'*G GG <- Moulton()[1]*GG + Moulton()[2]*crossprod(Gsums) } GG } } if (!is.null(ell)) { se <- drop(sqrt(crossprod(ell, Vhat) %*% ell)) dof <- df(GOG(ell)) seStata <- drop(sqrt(crossprod(ell, VhatStata) %*% ell)) } else { se <- sqrt(diag(Vhat)) dof <- vapply(seq_len(K), function(k) df(GOG(diag(K)[, k])), numeric(1)) seStata <- sqrt(diag(VhatStata)) } names(dof) <- names(se) list(vcov=Vhat, dof=dof, adj.se=se*qt(0.975, df=dof)/qnorm(0.975), se=se, 
test_that("New implementation matches old", {
    cl3 <- c(rep(1, 60), rep(2, 20), rep(3, 20))
    d0 <- list(data.frame(y=tan(1:10), x=cbind(sin(1:10), cos(2:11)),
                          cl=as.factor(c(rep(1, 6), rep(2, 2), rep(3, 2)))),
               data.frame(y=1:100, x=cbind(sin(1:100), (1:100)^3),
                          cl=as.factor(c(rep(1, 60), rep(2, 20), rep(3, 20)))),
               data.frame(y = (1:100)^2+cl3, x=cbind(sin(1:100), 1:100),
                          cl=as.factor(cl3)))
    ep <- 10^3*.Machine$double.eps
    ## Increase numerical tolerance if the platform doesn't have long double.
    ## BLAS/LAPACK 3.10.1 in R-devel also seems to be creating issues on M1 Mac
    bigep <- if (capabilities("long.double")) 10*ep else 10^4*ep

    for (j in seq_along(d0)) {
        fm <- lm(y~x.1+x.2, data=d0[[j]])
        ## No clustering
        r <- dfadjustSE(fm)
        ro <- BMlmSE(fm)
        elt((r$vcov-ro$vcov)/r$vcov, ep)
        elt((r$coefficients[, "df"]-ro$dof)/ro$dof, bigep)
        elt((r$coefficients[, "Adj. se"]-ro$adj.se)/ro$adj.se, bigep)
        elt((r$coefficients[, "HC2 se"]-ro$se)/ro$se, ep)
        ## If each observation is in its own cluster, we get HC2
        cl <- as.factor(seq_along(fm$model$y))
        elt(dfadjustSE(fm, cl, IK=FALSE)$coefficients-r$coefficients, bigep)

        ## Clustering
        r <- dfadjustSE(fm, d0[[j]]$cl, IK=FALSE)
        ro <- BMlmSE(fm, d0[[j]]$cl, IK=FALSE)
        elt((r$vcov-ro$vcov)/r$vcov, bigep)
        elt((r$coefficients[, "HC2 se"]-ro$se)/ro$se, bigep)
        elt((r$coefficients[, "df"]-ro$dof)/ro$dof, 10*bigep)
        elt((r$coefficients[, "Adj. se"]-ro$adj.se) /
                r$coefficients[, "Adj. se"], 10*bigep)
        elt((r$coefficients[, "HC1 se"]-ro$seStata)/ro$seStata, ep)

        r <- dfadjustSE(fm, d0[[j]]$cl, IK=TRUE, rho0=TRUE)
        ro <- BMlmSE(fm, d0[[j]]$cl, IK=TRUE)
        elt((r$vcov-ro$vcov)/r$vcov, bigep)
        elt((r$coefficients[, "HC2 se"]-ro$se)/ro$se, bigep)
        elt(r$coefficients[, "df"]-ro$dof, ep)
        elt((r$coefficients[, "Adj. se"]-ro$adj.se)/ro$adj.se, bigep)
        elt((r$coefficients[, "HC1 se"]-ro$seStata)/ro$seStata, ep)

        r <- dfadjustSE(fm, d0[[j]]$cl, IK=TRUE, ell=c(1, 1, 0), rho0=TRUE)
        ro <- BMlmSE(fm, d0[[j]]$cl, IK=TRUE, ell=c(1, 1, 0))
        elt((r$vcov-ro$vcov)/r$vcov, bigep)
        elt((r$coefficients[, "HC2 se"]-ro$se)/ro$se, bigep)
        elt(r$coefficients[, "df"]-ro$dof, ep)
        elt((r$coefficients[, "Adj. se"]-ro$adj.se)/ro$adj.se, bigep)
        elt((r$coefficients[, "HC1 se"]-ro$seStata)/ro$seStata, ep)
    }

    r <- dfadjustSE(lm(d0[[3]]$y~1))
    ro <- BMlmSE(lm(d0[[3]]$y~1))
    elt((r$vcov-ro$vcov)/ro$vcov, bigep)
    elt(r$coefficients[, "df"]-ro$dof, bigep)
    elt((r$coefficients[, "Adj. se"]-ro$adj.se)/ro$adj.se, ep)
    elt((r$coefficients[, "HC2 se"]-ro$se)/ro$se, ep)
    ## p-value was previously incorrectly reported as 9.05e-37
    expect_equal(capture.output(print(r, digits=3))[4],
                 "(Intercept) 3385 303 303 306 99 2.83e-19")

    ## Noninvertible cases
    d1 <- d0[[3]]
    d1$x.3 <- FALSE
    d1$x.3[1] <- TRUE
    fm1 <- lm(y~x.1+x.2+x.3, data=d1)
    rq <- dfadjustSE(fm1)
    r0 <- dfadjustSE(lm(y~x.1+x.2, data=d1))
    elt(rq$coefficients[, 4]-c(188.97371028, 107.8865573, 3.36194461,
                               211.67556014), 1e-8)
    elt(r0$coefficients[, 4]-c(188.164883779, 109.636001038, 3.371563406),
        1e-9)
    expect_warning(rr <- BMlmSE(fm1)$se)
    expect_true(all(is.nan(rr)))

    ## Fixed effects: here the minimum eigenvalue is only numerically positive
    d2 <- d0[[1]]
    d2$x <- c(rep(1, 4), rep(0, 5), 1)
    fm2 <- lm(y~x+cl, data=d2)
    p1 <- dfadjustSE(fm2, d2$cl)
    p2 <- BMlmSE(fm2, d2$cl)
    elt(p2$adj.se - p1$coefficients[, "Adj. se"], 1e-6)
se"], 1e-6) ## P-values expect_equal(capture.output(print(p1, digits=3))[5], "x 1.71 0.3501 0.3705 2.4 1 0.1362") ## Test scaling p3 <- dfadjustSE(lm.fit(x=model.matrix(fm2), y=1e-8*fm2$model$y), d2$cl) p4 <- dfadjustSE(lm.fit(x=model.matrix(fm2), y=1e8*fm2$model$y), d2$cl) expect_lt(max(abs(p3$vcov*1e16-p1$vcov)), 1e-12) expect_lt(max(abs(p4$vcov*1e-16-p1$vcov)), 1e-12) p5 <- dfadjustSE(lm.fit(x=model.matrix(fm2)*1e8, y=fm2$model$y), d2$cl) p6 <- dfadjustSE(lm.fit(x=model.matrix(fm2)*1e-8, y=fm2$model$y), d2$cl) expect_lt(max(abs(p5$vcov*1e16-p1$vcov)), 1e-12) expect_lt(max(abs(p6$vcov*1e-16-p1$vcov)), 1e-12) })