# Double Machine Learning for Partially Linear Regression. dml_plr = function(data, y, d, n_folds, ml_l, ml_m, ml_g, dml_procedure, score, n_rep = 1, smpls = NULL, params_l = NULL, params_m = NULL, params_g = NULL) { if (is.null(smpls)) { smpls = lapply(1:n_rep, function(x) sample_splitting(n_folds, data)) } all_thetas = all_ses = rep(NA_real_, n_rep) all_preds = list() for (i_rep in 1:n_rep) { this_smpl = smpls[[i_rep]] stopifnot(length(this_smpl$train_ids) == length(this_smpl$test_ids)) if (length(this_smpl$train_ids) == 1) { dml_procedure = "dml1" } res_single_split = fit_plr_single_split( data, y, d, n_folds, ml_l, ml_m, ml_g, dml_procedure, score, this_smpl, params_l, params_m, params_g) all_preds[[i_rep]] = res_single_split$all_preds all_thetas[i_rep] = res_single_split$theta all_ses[i_rep] = res_single_split$se } theta = stats::median(all_thetas) if (length(this_smpl$train_ids) > 1) { n = nrow(data) } else { n = length(this_smpl$test_ids[[1]]) } se = se_repeated(all_ses * sqrt(n), all_thetas, theta) / sqrt(n) t = theta / se pval = 2 * stats::pnorm(-abs(t)) names(theta) = names(se) = names(t) = names(pval) = d res = list( coef = theta, se = se, t = t, pval = pval, thetas = all_thetas, ses = all_ses, all_preds = all_preds, smpls = smpls) return(res) } dml_plr_multitreat = function(data, y, d, n_folds, ml_l, ml_m, ml_g, dml_procedure, score, n_rep = 1, smpls = NULL, params_l = NULL, params_m = NULL, params_g = NULL) { if (is.null(smpls)) { smpls = lapply(1:n_rep, function(x) sample_splitting(n_folds, data)) } all_preds = all_thetas = all_ses = list() n_d = length(d) for (i_rep in 1:n_rep) { this_smpl = smpls[[i_rep]] thetas_this_rep = ses_this_rep = rep(NA_real_, n_d) all_preds_this_rep = list() for (i_d in seq(n_d)) { if (!is.null(params_l)) { this_params_l = params_l[[i_d]] } else { this_params_l = NULL } if (!is.null(params_g)) { this_params_g = params_g[[i_d]] } else { this_params_g = NULL } if (!is.null(params_m)) { this_params_m = params_m[[i_d]] } else { this_params_m = NULL } res_single_split = fit_plr_single_split( data, y, d[i_d], n_folds, ml_l, ml_m, ml_g, dml_procedure, score, this_smpl, this_params_l, this_params_m, this_params_g) all_preds_this_rep[[i_d]] = res_single_split$all_preds thetas_this_rep[i_d] = res_single_split$theta ses_this_rep[i_d] = res_single_split$se } all_preds[[i_rep]] = all_preds_this_rep all_thetas[[i_rep]] = thetas_this_rep all_ses[[i_rep]] = ses_this_rep } theta = se = t = pval = rep(NA_real_, n_d) if (length(this_smpl$train_ids) > 1) { n = nrow(data) } else { n = length(smpls[[1]]$test_ids[[1]]) } for (i_d in seq(n_d)) { theta_vec = unlist(lapply(all_thetas, function(x) x[i_d])) se_vec = unlist(lapply(all_ses, function(x) x[i_d])) theta[i_d] = stats::median(theta_vec) se[i_d] = se_repeated(se_vec * sqrt(n), theta_vec, theta[i_d]) / sqrt(n) t[i_d] = theta[i_d] / se[i_d] pval[i_d] = 2 * stats::pnorm(-abs(t[i_d])) } names(theta) = names(se) = names(t) = names(pval) = d res = list( coef = theta, se = se, t = t, pval = pval, thetas = all_thetas, ses = all_ses, all_preds = all_preds, smpls = smpls) return(res) } fit_plr_single_split = function(data, y, d, n_folds, ml_l, ml_m, ml_g, dml_procedure, score, smpl, params_l, params_m, params_g) { train_ids = smpl$train_ids test_ids = smpl$test_ids fit_g = (score == "IV-type") | is.function(score) all_preds = fit_nuisance_plr( data, y, d, ml_l, ml_m, ml_g, n_folds, smpl, fit_g, params_l, params_m, params_g) residuals = compute_plr_residuals( data, y, d, n_folds, smpl, all_preds) y_minus_l_hat = residuals$y_minus_l_hat d_minus_m_hat = residuals$d_minus_m_hat y_minus_g_hat = residuals$y_minus_g_hat D = data[, d] Y = data[, y] # DML 1 if (dml_procedure == "dml1") { thetas = rep(NA_real_, n_folds) for (i in 1:n_folds) { test_index = test_ids[[i]] orth_est = orth_plr_dml( y_minus_l_hat = y_minus_l_hat[test_index], d_minus_m_hat = d_minus_m_hat[test_index], y_minus_g_hat = y_minus_g_hat[test_index], d = D[test_index], score = score) thetas[i] = orth_est$theta } theta = mean(thetas, na.rm = TRUE) if (length(train_ids) == 1) { D = D[test_index] y_minus_l_hat = y_minus_l_hat[test_index] d_minus_m_hat = d_minus_m_hat[test_index] y_minus_g_hat = y_minus_g_hat[test_index] } } if (dml_procedure == "dml2") { orth_est = orth_plr_dml( y_minus_l_hat = y_minus_l_hat, d_minus_m_hat = d_minus_m_hat, y_minus_g_hat = y_minus_g_hat, d = D, score = score) theta = orth_est$theta } se = sqrt(var_plr( theta = theta, d = D, y_minus_l_hat = y_minus_l_hat, d_minus_m_hat = d_minus_m_hat, y_minus_g_hat = y_minus_g_hat, score = score)) res = list( theta = theta, se = se, all_preds = all_preds) return(res) } fit_nuisance_plr = function(data, y, d, ml_l, ml_m, ml_g, n_folds, smpls, fit_g, params_l, params_m, params_g) { train_ids = smpls$train_ids test_ids = smpls$test_ids # nuisance l l_indx = names(data) != d data_l = data[, l_indx, drop = FALSE] task_l = mlr3::TaskRegr$new( id = paste0("nuis_l_", d), backend = data_l, target = y) resampling_l = mlr3::rsmp("custom") resampling_l$instantiate(task_l, train_ids, test_ids) if (!is.null(params_l)) { ml_l$param_set$values = params_l } r_l = mlr3::resample(task_l, ml_l, resampling_l, store_models = TRUE) l_hat_list = lapply(r_l$predictions(), function(x) x$response) # nuisance m if (!is.null(params_m)) { ml_m$param_set$values = params_m } m_indx = names(data) != y data_m = data[, m_indx, drop = FALSE] if (ml_m$task_type == "regr") { task_m = mlr3::TaskRegr$new(id = paste0("nuis_m_", d), backend = data_m, target = d) resampling_m = mlr3::rsmp("custom") resampling_m$instantiate(task_m, train_ids, test_ids) r_m = mlr3::resample(task_m, ml_m, resampling_m, store_models = TRUE) m_hat_list = lapply(r_m$predictions(), function(x) x$response) } else if ((ml_m$task_type == "classif")) { ml_m$predict_type = "prob" data_m[[d]] = factor(data_m[[d]]) task_m = mlr3::TaskClassif$new( id = paste0("nuis_m_", d), backend = data_m, target = d, positive = "1") resampling_m = mlr3::rsmp("custom") resampling_m$instantiate(task_m, train_ids, test_ids) r_m = mlr3::resample(task_m, ml_m, resampling_m, store_models = TRUE) m_hat_list = lapply(r_m$predictions(), function(x) as.data.table(x)$prob.1) } if (fit_g) { # nuisance g residuals = compute_plr_residuals( data, y, d, n_folds, smpls, list( l_hat_list = l_hat_list, g_hat_list = NULL, m_hat_list = m_hat_list)) y_minus_l_hat = residuals$y_minus_l_hat d_minus_m_hat = residuals$d_minus_m_hat psi_a = -d_minus_m_hat * d_minus_m_hat psi_b = d_minus_m_hat * y_minus_l_hat theta_initial = -mean(psi_b, na.rm = TRUE) / mean(psi_a, na.rm = TRUE) D = data[, d] Y = data[, y] g_indx = names(data) != y & names(data) != d y_minus_theta_d = Y - theta_initial * D data_g = cbind(data[, g_indx, drop = FALSE], y_minus_theta_d) task_g = mlr3::TaskRegr$new( id = paste0("nuis_g_", d), backend = data_g, target = "y_minus_theta_d") resampling_g = mlr3::rsmp("custom") resampling_g$instantiate(task_g, train_ids, test_ids) if (!is.null(params_g)) { ml_g$param_set$values = params_g } r_g = mlr3::resample(task_g, ml_g, resampling_g, store_models = TRUE) g_hat_list = lapply(r_g$predictions(), function(x) x$response) } else { g_hat_list = NULL } all_preds = list( l_hat_list = l_hat_list, m_hat_list = m_hat_list, g_hat_list = g_hat_list) return(all_preds) } compute_plr_residuals = function(data, y, d, n_folds, smpls, all_preds) { test_ids = smpls$test_ids l_hat_list = all_preds$l_hat_list m_hat_list = all_preds$m_hat_list g_hat_list = all_preds$g_hat_list n = nrow(data) D = data[, d] Y = data[, y] y_minus_l_hat = d_minus_m_hat = y_minus_g_hat = rep(NA_real_, n) for (i in 1:n_folds) { test_index = test_ids[[i]] l_hat = l_hat_list[[i]] m_hat = m_hat_list[[i]] y_minus_l_hat[test_index] = Y[test_index] - l_hat d_minus_m_hat[test_index] = D[test_index] - m_hat if (!is.null(g_hat_list)) { g_hat = g_hat_list[[i]] y_minus_g_hat[test_index] = Y[test_index] - g_hat } } residuals = list( y_minus_l_hat = y_minus_l_hat, d_minus_m_hat = d_minus_m_hat, y_minus_g_hat = y_minus_g_hat) return(residuals) } # Orthogonalized Estimation of Coefficient in PLR orth_plr_dml = function(y_minus_l_hat, d_minus_m_hat, y_minus_g_hat, d, score) { theta = NA_real_ if (score == "partialling out") { res_fit = stats::lm(y_minus_l_hat ~ 0 + d_minus_m_hat) theta = stats::coef(res_fit) } else if (score == "IV-type") { theta = mean(d_minus_m_hat * y_minus_g_hat) / mean(d_minus_m_hat * d) } else { stop("Inference framework for orthogonal estimation unknown") } res = list(theta = theta) return(res) } # Variance estimation for DML estimator in the partially linear regression model var_plr = function(theta, d, y_minus_l_hat, d_minus_m_hat, y_minus_g_hat, score) { n = length(d) if (score == "partialling out") { var = 1 / n * 1 / (mean(d_minus_m_hat^2))^2 * mean(((y_minus_l_hat - d_minus_m_hat * theta) * d_minus_m_hat)^2) } else if (score == "IV-type") { var = 1 / n * 1 / mean(d_minus_m_hat * d)^2 * mean(((y_minus_g_hat - d * theta) * d_minus_m_hat)^2) } return(c(var)) } # Bootstrap Implementation for Partially Linear Regression Model bootstrap_plr = function(thetas, ses, data, y, d, n_folds, smpls, all_preds, bootstrap, n_rep_boot, score, n_rep = 1) { for (i_rep in 1:n_rep) { n = nrow(data) weights = draw_bootstrap_weights(bootstrap, n_rep_boot, n) this_res = boot_plr_single_split( thetas[i_rep], ses[i_rep], data, y, d, n_folds, smpls[[i_rep]], all_preds[[i_rep]], weights, n_rep_boot, score) if (i_rep == 1) { boot_res = this_res } else { boot_res$boot_coef = cbind(boot_res$boot_coef, this_res$boot_coef) boot_res$boot_t_stat = cbind(boot_res$boot_t_stat, this_res$boot_t_stat) } } return(boot_res) } boot_plr_multitreat = function(thetas, ses, data, y, d, n_folds, smpls, all_preds, bootstrap, n_rep_boot, score, n_rep = 1) { n_d = length(d) for (i_rep in 1:n_rep) { n = nrow(data) weights = draw_bootstrap_weights(bootstrap, n_rep_boot, n) boot_theta = boot_t_stat = matrix(NA_real_, nrow = n_d, ncol = n_rep_boot) for (i_d in seq(n_d)) { this_res = boot_plr_single_split( thetas[[i_rep]][i_d], ses[[i_rep]][i_d], data, y, d[i_d], n_folds, smpls[[i_rep]], all_preds[[i_rep]][[i_d]], weights, n_rep_boot, score) boot_theta[i_d, ] = this_res$boot_coef boot_t_stat[i_d, ] = this_res$boot_t_stat } this_res = list( boot_coef = boot_theta, boot_t_stat = boot_t_stat) if (i_rep == 1) { boot_res = this_res } else { boot_res$boot_coef = cbind(boot_res$boot_coef, this_res$boot_coef) boot_res$boot_t_stat = cbind(boot_res$boot_t_stat, this_res$boot_t_stat) } } return(boot_res) } boot_plr_single_split = function(theta, se, data, y, d, n_folds, smpl, all_preds, weights, n_rep_boot, score) { residuals = compute_plr_residuals( data, y, d, n_folds, smpl, all_preds) y_minus_l_hat = residuals$y_minus_l_hat d_minus_m_hat = residuals$d_minus_m_hat y_minus_g_hat = residuals$y_minus_g_hat D = data[, d] if (score == "partialling out") { psi = (y_minus_l_hat - d_minus_m_hat * theta) * d_minus_m_hat psi_a = -d_minus_m_hat * d_minus_m_hat } else if (score == "IV-type") { psi = (y_minus_g_hat - D * theta) * d_minus_m_hat psi_a = -d_minus_m_hat * D } res = functional_bootstrap( theta, se, psi, psi_a, n_folds, smpl, n_rep_boot, weights) return(res) }