context("Estimator - lm_robust, non-clustered") test_that("lm robust se", { set.seed(42) N <- 40 dat <- data.frame(Y = rnorm(N), Z = rbinom(N, 1, .5), X = rnorm(N), W = runif(N)) tidy(lm_robust(Y ~ Z, data = dat)) lm_robust(Y ~ Z, se_type = "none", data = dat) lm_robust(Y ~ Z + X, data = dat) lm_robust(Y ~ Z * X, data = dat) expect_equivalent( coef(lm_robust(Y ~ 1, data = dat))[1], mean(dat$Y) ) expect_error( lm_robust(Y ~ Z + X, data = dat, se_type = "not_a_real_one"), "`se_type` must be either 'HC0', 'HC1', 'stata', 'HC2', 'HC3'," ) # Works with subset lmsub <- lm_robust(Y ~ Z + X, data = dat, subset = W > 0.5) lmbool <- lm_robust(Y ~ Z + X, data = dat[dat$W > 0.5, ]) expect_equal( rmcall(lmsub), rmcall(lmbool) ) lm_robust(Y ~ Z, weights = W, data = dat) # matches. # commarobust::commarobust(lm(Y ~ Z, weights = W, data = dat)) # To easily do with and without weights test_lm_robust_variance <- function(w) { # Test other estimators lm_hc0 <- lm_robust(Y ~ Z + X, data = dat, weights = w, se_type = "HC0") lm_hc1 <- lm_robust(Y ~ Z + X, data = dat, weights = w, se_type = "HC1") lm_hc2 <- lm_robust(Y ~ Z + X, data = dat, weights = w, se_type = "HC2") lm_hc3 <- lm_robust(Y ~ Z + X, data = dat, weights = w, se_type = "HC3") lm_stata <- lm_robust(Y ~ Z + X, data = dat, weights = w, se_type = "stata") # Stata is the same as HC1 expect_equal( rmcall(lm_hc1), rmcall(lm_stata) ) expect_false(all(lm_hc0$std.error == lm_hc1$std.error)) expect_false(all(lm_hc0$std.error == lm_hc2$std.error)) expect_false(all(lm_hc0$std.error == lm_hc3$std.error)) expect_false(all(lm_hc1$std.error == lm_hc2$std.error)) expect_false(all(lm_hc1$std.error == lm_hc3$std.error)) expect_false(all(lm_hc2$std.error == lm_hc3$std.error)) expect_equivalent(lm_hc0$df,lm_hc1$df) expect_equivalent(lm_hc0$df,lm_hc2$df) expect_equivalent(lm_hc0$df,lm_hc3$df) expect_equivalent(lm_hc0$df,lm_stata$df) expect_equivalent( lm_hc0$std.error ^ 2, lm_hc1$std.error ^ 2 * ((N - length(coef(lm_hc1))) / N) ) } # No weights first test_lm_robust_variance(NULL) test_lm_robust_variance(dat$W) # works with formula in a variable (always worked) form <- Y ~ Z lm_form <- lm_robust(form, data = dat) # works with formula inside a function (didn't work before 0.4.0) f <- function(data) { form2 <- Y ~ Z return(lm_robust(form2, data = data)) } lm_f_form <- f(dat) expect_equal( rmcall(lm_form), rmcall(lm_f_form) ) # Drops unused levels appropriately dat$Z <- as.factor(sample(LETTERS[1:3], nrow(dat), replace = TRUE)) lmall <- lm_robust(Y ~ Z, data = dat) lm1 <- lm_robust(Y ~ Z, data = dat[dat$Z %in% c("A", "B"), ]) lm2 <- lm_robust(Y ~ Z, data = dat, subset = Z %in% c("A", "B")) expect_equal( rmcall(lm1), rmcall(lm2) ) # pvals and cis diff because dof are diff expect_equal( tidy(lmall)[1:2, 1:3], tidy(lm1)[, 1:3] ) # rlang works my_w_vec <- rlang::sym("W") expect_equal( tidy(lm_robust(Y ~ Z + X, data = dat, weights = !!my_w_vec, se_type = "HC2")), tidy(lm_robust(Y ~ Z + X, data = dat, weights = W, se_type = "HC2")) ) my_dat <- rlang::sym("dat") expect_equal( tidy(lm_robust(Y ~ Z + X, data = !!my_dat, weights = W, se_type = "HC2")), tidy(lm_robust(Y ~ Z + X, data = dat, weights = W, se_type = "HC2")) ) my_y <- rlang::sym("Y") expect_equal( tidy(lm_robust(!!my_y ~ Z + X, data = dat, weights = W, se_type = "HC2")), tidy(lm_robust(Y ~ Z + X, data = dat, weights = W, se_type = "HC2")) ) my_formula <- Y ~ Z + X expect_equal( tidy(lm_robust(!!my_formula, data = dat, weights = W, se_type = "HC2")), tidy(lm_robust(Y ~ Z + X, data = dat, weights = W, se_type = "HC2")) ) }) test_that("lm robust F-tests are correct", { skip_if_not_installed("car") skip_if_not_installed("clubSandwich") co <- lm_robust(mpg ~ hp + am, data = mtcars, se_type = "classical") caro <- car::linearHypothesis(co, c("hp = 0", "am = 0"), test = "F") carolm <- car::linearHypothesis(lm(mpg ~ hp + am, data = mtcars), c("hp = 0", "am = 0"), test = "F") expect_equivalent( co$fstatistic, c(caro$F[2], caro$Df[2], caro$Res.Df[2]) ) expect_equivalent( co$fstatistic, c(carolm$F[2], carolm$Df[2], carolm$Res.Df[2]) ) cow <- lm_robust(mpg ~ hp + am, data = mtcars, weights = wt, se_type = "classical") caro <- car::linearHypothesis(cow, c("hp = 0", "am = 0"), test = "F") expect_equivalent( cow$fstatistic, c(caro$F[2], caro$Df[2], caro$Res.Df[2]) ) for (se_type in setdiff(se_types, "classical")) { lmr <- lm_robust(mpg ~ hp + am, data = mtcars, se_type = se_type) caro <- car::linearHypothesis(lmr, c("hp = 0", "am = 0"), test = "F") carolm <- car::linearHypothesis(lm(mpg ~ hp + am, data = mtcars), c("hp = 0", "am = 0"), test = "F", white.adjust = tolower(se_type)) expect_equivalent( lmr$fstatistic, c(caro$F[2], caro$Df[2], caro$Res.Df[2]) ) expect_equivalent( lmr$fstatistic, c(carolm$F[2], carolm$Df[2], carolm$Res.Df[2]) ) lmrw <- lm_robust(mpg ~ hp + am, data = mtcars, weights = wt, se_type = se_type) carow <- car::linearHypothesis(lmrw, c("hp = 0", "am = 0"), test = "F") carolmw <- car::linearHypothesis(lm(mpg ~ hp + am, weights = wt, data = mtcars), c("hp = 0", "am = 0"), test = "F", white.adjust = tolower(se_type)) expect_equivalent( lmrw$fstatistic, c(carow$F[2], carow$Df[2], carow$Res.Df[2]) ) expect_equivalent( lmrw$fstatistic, c(carolmw$F[2], carolmw$Df[2], carolmw$Res.Df[2]) ) } for (se_type in cr_se_types) { lmcr <- lm_robust(mpg ~ hp + am, data = mtcars, clusters = carb, se_type = se_type) caro <- clubSandwich::Wald_test(lm(mpg ~ hp + am, data = mtcars), cluster = mtcars$carb, constraints = clubSandwich::constrain_zero(2:3), vcov = ifelse(se_type == "stata", "CR1S", se_type), test = "Naive-F") lmcrw <- lm_robust(mpg ~ hp + am, data = mtcars, clusters = carb, weights = wt, se_type = se_type) carow <- clubSandwich::Wald_test(lm(mpg ~ hp + am, weights = wt, data = mtcars), cluster = mtcars$carb, constraints = clubSandwich::constrain_zero(2:3), vcov = ifelse(se_type == "stata", "CR1S", se_type), test = "Naive-F") expect_equivalent( lmcr$fstatistic[c(1, 3)], c(caro$Fstat, caro$df_denom) ) expect_equivalent( lmcrw$fstatistic[c(1, 3)], c(carow$Fstat, carow$df_denom) ) } }) test_that("lm robust mlm gets right fstats", { for (se_type in se_types) { lmcyl <- lm_robust(cyl ~ hp + am, data = mtcars, se_type = se_type) lmmpg <- lm_robust(mpg ~ hp + am, data = mtcars, se_type = se_type) lm2 <- lm_robust(cbind(cyl, mpg) ~ hp + am, data = mtcars, se_type = se_type) expect_equivalent( lm2$fstatistic[1:2], c(lmcyl$fstatistic[1], lmmpg$fstatistic[1]) ) lmwcyl <- lm_robust(cyl ~ hp + am, data = mtcars, weights = wt, se_type = se_type) lmwmpg <- lm_robust(mpg ~ hp + am, data = mtcars, weights = wt, se_type = se_type) lmw2 <- lm_robust(cbind(cyl, mpg) ~ hp + am, data = mtcars, weights = wt, se_type = se_type) expect_equivalent( lmw2$fstatistic[1:2], c(lmwcyl$fstatistic[1], lmwmpg$fstatistic[1]) ) } for (se_type in cr_se_types) { lmccyl <- lm_robust(cyl ~ hp + am, data = mtcars, cluster = carb, se_type = se_type) lmcmpg <- lm_robust(mpg ~ hp + am, data = mtcars, cluster = carb, se_type = se_type) lmc2 <- lm_robust(cbind(cyl, mpg) ~ hp + am, data = mtcars, cluster = carb, se_type = se_type) expect_equivalent( lmc2$fstatistic[1:2], c(lmccyl$fstatistic[1], lmcmpg$fstatistic[1]) ) lmcwcyl <- lm_robust(cyl ~ hp + am, data = mtcars, weights = wt, cluster = carb, se_type = se_type) lmcwmpg <- lm_robust(mpg ~ hp + am, data = mtcars, weights = wt, cluster = carb, se_type = se_type) lmcw2 <- lm_robust(cbind(cyl, mpg) ~ hp + am, data = mtcars, weights = wt, cluster = carb, se_type = se_type) expect_equivalent( lmcw2$fstatistic[1:2], c(lmcwcyl$fstatistic[1], lmcwmpg$fstatistic[1]) ) } }) test_that("lm robust works with missingness", { skip_if_not_installed("sandwich") dat <- data.frame( Y = rnorm(100), Z = rbinom(100, 1, .5), X = rnorm(100), W = runif(100) ) dat$X[23] <- NA expect_equal( rmcall(lm_robust(Y ~ Z + X, data = dat)), rmcall(lm_robust(Y ~ Z + X, data = dat[-23, ])) ) lm_robust(Y ~ Z + X, data = dat) lm_robust(Y ~ Z * X, data = dat) ## Outcome missingness dat$Y[35] <- NA estimatr_missout_out <- lm_robust(Y ~ Z + X, data = dat) lm_missout_out <- lm(Y ~ Z + X, data = dat) lm_missout_hc2 <- cbind( coef(lm_missout_out), sqrt(diag(sandwich::vcovHC(lm_missout_out, type = "HC2"))) ) expect_equivalent( as.matrix(tidy(estimatr_missout_out)[, c("estimate", "std.error")]), lm_missout_hc2 ) # nested DFs dat$Y2 <- matrix(dat$Y) expect_equivalent( tidy(lm_robust(Y ~ Z + X, data = dat))[, 1:6], tidy(lm_robust(Y2 ~ Z + X, data = dat))[, 1:6] ) }) test_that("lm_robust doesn't include aux variables when . is used", { n <- 10 dat <- data.frame(y = rnorm(n), x = rnorm(n)) # not in data.frame clust <- rep(1:5, each = 2) expect_equal( rmcall(lm_robust(y ~ ., clusters = clust, data = dat)), rmcall(lm_robust(y ~ x, clusters = clust, data = dat)) ) }) test_that("lm robust works with weights", { skip_if_not_installed("sandwich") N <- 100 dat <- data.frame( Y = rnorm(N), Z = rbinom(N, 1, .5), X = rnorm(N), W = runif(N) ) ## Make sure weighting works expect_error( estimatr_out <- lm_robust(Y ~ Z * X, weights = W, data = dat), NA ) expect_true( any(grepl("Weighted", capture.output(summary(estimatr_out)))) ) # Compare to lm output lm_out <- lm(Y ~ Z * X, weights = W, data = dat) lmo_hc2 <- cbind( coef(lm_out), sqrt(diag(sandwich::vcovHC(lm_out, type = "HC2"))) ) expect_equivalent( as.matrix(tidy(estimatr_out)[, c("estimate", "std.error")]), lmo_hc2 ) ## Make sure weighting works with missingness dat$W[39] <- NA expect_warning( estimatr_miss_out <- lm_robust(Y ~ Z * X, weights = W, data = dat), "missing" ) expect_equal( rmcall(estimatr_miss_out), rmcall(lm_robust(Y ~ Z * X, weights = W, data = dat[-39, ])) ) # Compare to lm output lm_miss_out <- lm(Y ~ Z * X, weights = W, data = dat) lmo_miss_hc2 <- cbind( coef(lm_miss_out), sqrt(diag(sandwich::vcovHC(lm_miss_out, type = "HC2"))) ) expect_equivalent( as.matrix(tidy(estimatr_miss_out)[, c("estimate", "std.error")]), lmo_miss_hc2 ) expect_error( lm_robust(Y ~ Z, data = dat, weights = c(-0.5, runif(N - 1))), "`weights` must not be negative" ) }) test_that("lm_robust_fit adds column names", { n <- 10 y <- rnorm(n) X <- matrix(rnorm(n * 3), ncol = 3) lm_o <- lm_robust_fit( y = y, X = X, weights = NULL, cluster = NULL, ci = TRUE, se_type = "classical", alpha = 0.05, return_vcov = TRUE, try_cholesky = TRUE, has_int = FALSE, iv_stage = list(0) ) expect_equal( lm_o$term, c("X1", "X2", "X3") ) }) test_that("lm robust works with large data", { N <- 75000 dat <- data.frame( Y = rbinom(N, 1, .5), X1 = rnorm(N), X2 = rnorm(N), X3 = rnorm(N) ) expect_error( lm_robust(Y ~ X1 + X2 + X3, data = dat, se_type = "none"), NA ) }) set.seed(42) N <- 100 dat <- data.frame( Y = rbinom(N, 1, .5), X1 = rnorm(N), X2 = rnorm(N), X3 = rnorm(N) ) test_that("lm robust works with rank-deficient X", { dat$Z1 <- dat$X1 sum_lm <- summary(lm(Y ~ X1 + X2 + Z1 + X3, data = dat)) ## manually build vector of coefficients, can't extract from summary.lm out_sumlm <- matrix(NA, nrow = length(sum_lm$aliased), ncol = 2) j <- 1 for (i in seq_along(sum_lm$aliased)) { if (!sum_lm$aliased[i]) { out_sumlm[i, ] <- coef(sum_lm)[j, 1:2] j <- j + 1 } } ## order sometimes is different! Not stable order! # expect_equivalent( # as.matrix(tidy(lm_robust(Y ~ X1 + X2 + Z1 + X3, data = dat, se_type = 'classical'))[, c('estimate', 'std.error')]), # out_sumlm # ) dat$Z1 <- dat$X1 + 5 ## Not the same as LM! Different QR decompositions when dependency isn't just equivalency expect_equivalent( as.matrix(tidy(lm_robust(Y ~ X1 + X2 + Z1 + X3, data = dat, se_type = "classical"))[, c("estimate", "std.error")]), as.matrix(RcppEigen:::summary.fastLm(RcppEigen::fastLm(Y ~ X1 + X2 + Z1 + X3, data = dat))$coefficients[, 1:2]) ) # trigger cascade to QR from try_chol; set seed above because try_cholesky # sometimes will work! expect_equivalent( tidy(lm_robust(Y ~ X1 + X2 + Z1 + X3, data = dat)), tidy(lm_robust(Y ~ X1 + X2 + Z1 + X3, data = dat, try_cholesky = TRUE)) ) # Weighted rank deficient dat$w <- 1 expect_equivalent( tidy(lm_robust(Y ~ X1 + X2 + Z1 + X3, data = dat)), tidy(lm_robust(Y ~ X1 + X2 + Z1 + X3, data = dat, weights = w)) ) expect_true( any(grepl( "not defined because the design matrix is rank deficient", capture.output(summary(lm_robust(Y ~ X1 + X2 + Z1 + X3, data = dat))) )) ) }) test_that("r squared is right", { lmo <- summary(lm(mpg ~ hp, mtcars)) lmow <- summary(lm(mpg ~ hp, mtcars, weights = wt)) lmon <- summary(lm(mpg ~ hp - 1, mtcars)) lmown <- summary(lm(mpg ~ hp - 1, mtcars, weights = wt)) lmro <- lm_robust(mpg ~ hp, mtcars) lmrow <- lm_robust(mpg ~ hp, mtcars, weights = wt) lmroclust <- lm_robust(mpg ~ hp, mtcars, clusters = carb) lmrowclust <- lm_robust(mpg ~ hp, mtcars, weights = wt, clusters = carb) lmron <- lm_robust(mpg ~ hp - 1, mtcars) lmrown <- lm_robust(mpg ~ hp - 1, mtcars, weights = wt) lmrclust <- lm_robust(mpg ~ hp - 1, mtcars, clusters = carb) lmrwclust <- lm_robust(mpg ~ hp - 1, mtcars, weights = wt, clusters = carb) # Use equivalent instead of equal because we change the name of the fstat value expect_equivalent( c(lmo$r.squared, lmo$adj.r.squared), c(lmro$r.squared, lmro$adj.r.squared) ) expect_equivalent( c(lmow$r.squared, lmow$adj.r.squared), c(lmrow$r.squared, lmrow$adj.r.squared) ) expect_equivalent( c(lmon$r.squared, lmon$adj.r.squared), c(lmron$r.squared, lmron$adj.r.squared) ) expect_equivalent( c(lmown$r.squared, lmown$adj.r.squared), c(lmrown$r.squared, lmrown$adj.r.squared) ) expect_equal( c(lmon$r.squared, lmon$adj.r.squared), c(lmrclust$r.squared, lmrclust$adj.r.squared) ) expect_equal( c(lmown$r.squared, lmown$adj.r.squared), c(lmrwclust$r.squared, lmrwclust$adj.r.squared) ) expect_equal( c(lmo$r.squared, lmo$adj.r.squared), c(lmroclust$r.squared, lmroclust$adj.r.squared) ) expect_equal( c(lmow$r.squared, lmow$adj.r.squared), c(lmrowclust$r.squared, lmrowclust$adj.r.squared) ) # multiple outcomes lmro <- lm_robust(cbind(mpg, hp) ~ cyl, data = mtcars) lmmpg <- lm_robust(mpg ~ cyl, data = mtcars) lmhp <- lm_robust(hp ~ cyl, data = mtcars) expect_equivalent(lmro$r.squared[1], lmmpg$r.squared) expect_equivalent(lmro$r.squared[2], lmhp$r.squared) }) test_that("multiple outcomes", { lmo <- lm(cbind(mpg, hp) ~ cyl, data = mtcars) lmro <- lm_robust(cbind(mpg, hp) ~ cyl, data = mtcars, se_type = "classical") mo <- tidy(lmro) expect_identical( mo$term, c("(Intercept)", "cyl", "(Intercept)", "cyl") ) expect_equal( coef(lmro), coef(lmo) ) expect_equal( vcov(lmo), vcov(lmro) ) for (se_type in setdiff(se_types, "classical")) { expect_equal( sandwich::vcovHC(lmo, type = se_type), vcov(lm_robust(cbind(mpg, hp) ~ cyl, data = mtcars, se_type = se_type)) ) } # with weights lmo <- lm(cbind(mpg, hp) ~ cyl, data = mtcars, weights = wt) lmro <- lm_robust(cbind(mpg, hp) ~ cyl, data = mtcars, weights = wt, se_type = "classical") mo <- tidy(lmro) expect_identical( mo$term, c("(Intercept)", "cyl", "(Intercept)", "cyl") ) expect_equal( coef(lmro), coef(lmo) ) expect_equivalent( sapply(summary(lmo)[[1]][c("r.squared", "adj.r.squared", "fstatistic")], `[`, 1), sapply(lmro[c("r.squared", "adj.r.squared", "fstatistic")], `[`, 1) ) expect_equivalent( sapply(summary(lmo)[[2]][c("r.squared", "adj.r.squared", "fstatistic")], `[`, 1), sapply(lmro[c("r.squared", "adj.r.squared", "fstatistic")], `[`, 2) ) # with missingness mtcarsmiss <- mtcars mtcarsmiss$hp[10] <- NA lmo <- lm(cbind(mpg, hp) ~ cyl, data = mtcarsmiss) lmro <- lm_robust(cbind(mpg, hp) ~ cyl, data = mtcarsmiss, se_type = "classical") expect_equivalent( do.call(rbind, lapply(summary(lmo), function(x) x$coefficients[, 1:4])), as.matrix(tidy(lmro)[, c("estimate", "std.error", "statistic", "p.value")]) ) expect_equivalent( sapply(summary(lmo)[[1]][c("r.squared", "adj.r.squared", "fstatistic")], `[`, 1), sapply(lmro[c("r.squared", "adj.r.squared", "fstatistic")], `[`, 1) ) expect_equivalent( sapply(summary(lmo)[[2]][c("r.squared", "adj.r.squared", "fstatistic")], `[`, 1), sapply(lmro[c("r.squared", "adj.r.squared", "fstatistic")], `[`, 2) ) })