################################################################################ context("LDPRED2") ################################################################################ test_that("LDpred2 works", { skip_if(is_cran) skip_if_offline("raw.githubusercontent.com") bedfile <- file.path(tempdir(), "tmp-data/public-data3.bed") if (!file.exists(rdsfile <- sub_bed(bedfile, ".rds"))) { zip <- tempfile(fileext = ".zip") download.file( "https://github.com/privefl/bigsnpr/blob/master/data-raw/public-data3.zip?raw=true", destfile = zip, mode = "wb") unzip(zip, exdir = tempdir()) rds <- snp_readBed(bedfile) expect_identical(normalizePath(rds), normalizePath(rdsfile)) } obj.bigSNP <- snp_attach(rdsfile) G <- obj.bigSNP$genotypes y <- obj.bigSNP$fam$affection POS2 <- obj.bigSNP$map$genetic.dist + 1000 * obj.bigSNP$map$chromosome sumstats <- bigreadr::fread2(file.path(tempdir(), "tmp-data/public-data3-sumstats.txt")) sumstats$n_eff <- sumstats$N map <- setNames(obj.bigSNP$map[-3], c("chr", "rsid", "pos", "a1", "a0")) df_beta <- snp_match(sumstats, map, join_by_pos = FALSE) ind_var <- df_beta$`_NUM_ID_` corr0 <- snp_cor(G, ind.col = ind_var, size = 3 / 1000, infos.pos = POS2[ind_var], ncores = 2) ld <- bigsnpr:::sp_colSumsSq_sym(p = corr0@p, i = corr0@i, x = corr0@x) corr <- as_SFBM(corr0, compact = sample(c(TRUE, FALSE), 1)) rm(corr0) # LD score regression (ldsc <- with(df_beta, snp_ldsc(ld, length(ld), chi2 = (beta / beta_se)^2, sample_size = n_eff, blocks = NULL))) (ldsc2 <- snp_ldsc2(corr, df_beta, intercept = NULL)) expect_equal(ldsc2, ldsc) # LDpred2-inf beta_inf <- snp_ldpred2_inf(corr, df_beta, h2 = ldsc[["h2"]]) pred_inf <- big_prodVec(G, beta_inf, ind.col = df_beta[["_NUM_ID_"]]) expect_gt(cor(pred_inf, y), 0.2) # LDpred2-gibbs p_seq <- signif(seq_log(1e-3, 1, length.out = 7), 1) params <- expand.grid(p = p_seq, h2 = ldsc[["h2"]], sparse = c(FALSE, TRUE)) expect_equal(dim(params), c(14, 3)) expect_equal(dim(snp_ldpred2_grid(corr, df_beta, params[1, ])), c(nrow(df_beta), 1)) beta_grid <- snp_ldpred2_grid(corr, df_beta, params, ncores = 2) pred_grid <- big_prodMat(G, beta_grid, ind.col = df_beta[["_NUM_ID_"]]) expect_gt(max(cor(pred_grid, y)), 0.4) # LDpred2-uncertainty expect_error(snp_ldpred2_grid(corr, df_beta, params, return_sampling_betas = TRUE), "Only one set of parameters is allowed") beta_sample <- snp_ldpred2_grid(corr, df_beta, params[3, ], num_iter = 200, return_sampling_betas = TRUE) expect_equal(dim(beta_sample), c(nrow(df_beta), 200)) expect_gt(cor(rowMeans(beta_sample), beta_grid[, 3]), 0.9) # LDpred2-auto beta_auto <- snp_ldpred2_auto(corr, df_beta, h2_init = ldsc[["h2"]], burn_in = 200, num_iter = 200, sparse = TRUE, shrink_corr = runif(1, 0.9, 1)) expect_length(beta_auto, 1) mod <- beta_auto[[1]] expect_null(dim(mod$beta_est)) pred_auto <- big_prodVec(G, mod$beta_est, ind.col = df_beta[["_NUM_ID_"]]) expect_gt(cor(pred_auto, y), 0.4) expect_gt(cor(mod$beta_est_sparse, mod$beta_est), 0.9) expect_lt(mean(mod$beta_est == 0), 0.001) expect_gt(mean(mod$beta_est_sparse == 0), 0.5) expect_equal(mod$h2_init, ldsc[["h2"]]) expect_equal(mod$p_init, 0.1) expect_equal(mod$h2_est, mean(tail(mod$path_h2_est, 200))) expect_equal(mod$p_est, mean(tail(mod$path_p_est, 200))) expect_equal(mod$alpha_est, mean(tail(mod$path_alpha_est, 200))) expect_length(mod$postp_est, length(mod$beta_est)) expect_null(dim(mod$postp_est)) expect_equal(mean(mod$postp_est), mod$p_est, tolerance = 0.01) expect_equal(dim(mod$sample_beta), c(ncol(corr), 0)) beta_hat <- with(df_beta, beta / sqrt(n_eff * beta_se^2 + beta^2)) expect_gt(cor(mod$corr_est, beta_hat), 0.3) beta_auto2 <- snp_ldpred2_auto(corr, df_beta, h2_init = ldsc[["h2"]], burn_in = 200, num_iter = 200, vec_p_init = 1, allow_jump_sign = FALSE) expect_gt(cor(beta_auto2[[1]]$beta_est, mod$beta_est), 0.9) expect_lt(beta_auto2[[1]]$p_est, 0.1) # Sampling betas beta_auto <- snp_ldpred2_auto(corr, df_beta, h2_init = ldsc[["h2"]], burn_in = 200, num_iter = 200, report_step = 10) bsamp <- beta_auto[[1]]$sample_beta expect_equal(dim(bsamp), c(ncol(corr), 20)) expect_s4_class(bsamp, "dgCMatrix") h2_est <- apply(bsamp, 2, function(x) crossprod(x, bigsparser::sp_prodVec(corr, x))) expect_equal(h2_est, beta_auto[[1]]$path_h2_est[seq(210, 400, by = 10)]) # alpha bounds beta_auto <- snp_ldpred2_auto(corr, df_beta, h2_init = ldsc[["h2"]], burn_in = 200, num_iter = 200, alpha_bounds = c(-1, -1)) expect_equal(beta_auto[[1]]$path_alpha_est, rep(-1, 400)) beta_auto <- snp_ldpred2_auto(corr, df_beta, h2_init = ldsc[["h2"]], burn_in = 200, num_iter = 200, alpha_bounds = c(-0.5, 0)) expect_true(all(beta_auto[[1]]$path_alpha_est >= -0.5)) expect_true(all(beta_auto[[1]]$path_alpha_est <= 0)) # Errors expect_error(snp_ldpred2_inf(corr, df_beta[-6]), "'df_beta' should have element 'beta'.") expect_error(snp_ldpred2_grid(corr, df_beta[-6], params), "'df_beta' should have element 'beta'.") expect_error(snp_ldpred2_auto(corr, df_beta[-6]), "'df_beta' should have element 'beta'.") # Parallelism all_beta <- replicate( n = 10, simplify = FALSE, snp_ldpred2_grid(corr, df_beta, params[c(1, 8), ], ncores = 2)) all_beta <- replicate( n = 5, simplify = FALSE, snp_ldpred2_auto(corr, df_beta, h2_init = ldsc[["h2"]], vec_p_init = seq_log(1e-4, 0.9, 6), burn_in = 100, num_iter = 100, sparse = TRUE, ncores = 2)) # Reproducibility set.seed(1) grid1 <- snp_ldpred2_grid(corr, df_beta, params, ncores = 2) grid2 <- snp_ldpred2_grid(corr, df_beta, params, ncores = 2) expect_false(identical(grid2, grid1)) set.seed(1) grid3 <- snp_ldpred2_grid(corr, df_beta, params, ncores = 2) expect_identical(grid3, grid1) set.seed(1) beta1 <- snp_ldpred2_grid(corr, df_beta, params[3, ], return_sampling_betas = TRUE) set.seed(1) beta2 <- snp_ldpred2_grid(corr, df_beta, params[3, ], return_sampling_betas = TRUE) expect_identical(beta2, beta1) set.seed(1) auto1 <- snp_ldpred2_auto(corr, df_beta, burn_in = 50, num_iter = 100, sparse = TRUE, h2_init = 0.3, vec_p_init = seq_log(1e-5, 1, 8), report_step = 5, ncores = 2) set.seed(1) auto2 <- snp_ldpred2_auto(corr, df_beta, burn_in = 50, num_iter = 100, sparse = TRUE, h2_init = 0.3, vec_p_init = seq_log(1e-5, 1, 8), report_step = 5, ncores = 2) expect_identical(auto2, auto1) inf1 <- snp_ldpred2_inf(corr, df_beta, h2 = 0.3) inf2 <- snp_ldpred2_inf(corr, df_beta, h2 = 0.3) expect_identical(inf2, inf1) # no sampling, so reproducible }) ################################################################################ test_that("MLE_alpha works", { skip_if(is_cran) bigsnp <- snp_attachExtdata() G <- bigsnp$genotypes FUN <- function(x, log_var, beta2) { S <- 1 + x[[1]]; sigma2 <- x[[2]] S * sum(log_var) + length(log_var) * log(sigma2) + sum(beta2 / exp(S * log_var)) / sigma2 } DER <- function(x, log_var, beta2) { S <- 1 + x[[1]]; sigma2 <- x[[2]] res1 <- sum(log_var) - sum(log_var * beta2 / exp(S * log_var)) / sigma2 res2 <- length(log_var) / sigma2 - sum(beta2 / exp(S * log_var)) / sigma2^2 c(res1, res2) } alpha <- 0 simu <- snp_simuPheno(G, 0.2, 500, alpha = alpha) log_var <- log(big_colstats(G, ind.col = simu$set)$var) beta2 <- simu$effects^2 # without bootstrap res1 <- optim(par = c(-0.5, 0.2 / 500), fn = FUN, gr = DER, method = "L-BFGS-B", lower = c(-1.5, 0.2 / 5000), upper = c(0.5, 0.2 / 50), log_var = log_var, beta2 = beta2)$par res2 <- bigsnpr:::MLE_alpha(par = c(-0.5, res1[2] * runif(1, 0.7, 1.4)), log_var = log_var, curr_beta = simu$effects, ind_causal = 0:499, alpha_bounds = c(-0.5, 1.5)) expect_equal(res2[1:2] - 1:0, res1, tolerance = 1e-4) # with bootstrap all_est <- replicate(1000, { ind <- sample(500, replace = TRUE) optim(par = c(-0.5, 0.2 / 500), fn = FUN, gr = DER, method = "L-BFGS-B", lower = c(-1.5, 0.2 / 5000), upper = c(0.5, 0.2 / 50), log_var = log_var[ind], beta2 = beta2[ind])$par }) all_est2 <- replicate(1000, { bigsnpr:::MLE_alpha(par = c(-0.5, mean(all_est[2, ])), ind_causal = 0:499, log_var = log_var, curr_beta = simu$effects, alpha_bounds = c(-0.5, 1.5), boot = TRUE)[1:2] - 1:0 }) Q <- ppoints(10) expect_equal(quantile(all_est[1, ], Q), quantile(all_est2[1, ], Q), tolerance = 0.1) expect_equal(quantile(all_est[2, ], Q), quantile(all_est2[2, ], Q), tolerance = 0.1) }) ################################################################################ test_that("ind.corr works", { skip_if(is_cran) skip_if_offline("raw.githubusercontent.com") bedfile <- file.path(tempdir(), "tmp-data/public-data3.bed") if (!file.exists(rdsfile <- sub_bed(bedfile, ".rds"))) { zip <- tempfile(fileext = ".zip") download.file( "https://github.com/privefl/bigsnpr/blob/master/data-raw/public-data3.zip?raw=true", destfile = zip, mode = "wb") unzip(zip, exdir = tempdir()) rds <- snp_readBed(bedfile) expect_identical(normalizePath(rds), normalizePath(rdsfile)) } obj.bigSNP <- snp_attach(rdsfile) G <- obj.bigSNP$genotypes y <- obj.bigSNP$fam$affection POS2 <- obj.bigSNP$map$genetic.dist + 1000 * obj.bigSNP$map$chromosome sumstats <- bigreadr::fread2(file.path(tempdir(), "tmp-data/public-data3-sumstats.txt")) sumstats$n_eff <- sumstats$N map <- setNames(obj.bigSNP$map[-3], c("chr", "rsid", "pos", "a1", "a0")) df_beta <- snp_match(sumstats, map, join_by_pos = FALSE) ind_var <- df_beta$`_NUM_ID_` corr0 <- snp_cor(G, ind.col = ind_var, size = 3 / 1000, infos.pos = POS2[ind_var], ncores = 2) ld <- bigsnpr:::sp_colSumsSq_sym(p = corr0@p, i = corr0@i, x = corr0@x) corr <- as_SFBM(corr0, compact = sample(c(TRUE, FALSE), 1)) rm(corr0) ind.sub <- sample(ncol(corr), 30e3) # LDpred2-gibbs p_seq <- signif(seq_log(1e-3, 1, length.out = 7), 1) params <- expand.grid(p = p_seq, h2 = 0.3, sparse = c(FALSE, TRUE)) set.seed(1) system.time( beta_grid <- snp_ldpred2_grid(as_SFBM(corr[ind.sub, ind.sub]), df_beta[ind.sub, ], params, ncores = 2) ) set.seed(1) system.time( beta_grid2 <- snp_ldpred2_grid(corr, ind.corr = ind.sub, df_beta[ind.sub, ], params, ncores = 2) ) expect_equal(beta_grid2, beta_grid) # LDpred2-uncertainty set.seed(1) beta_grid <- snp_ldpred2_grid(as_SFBM(corr[ind.sub, ind.sub]), df_beta[ind.sub, ], params[3, ], return_sampling_betas = TRUE) set.seed(1) beta_grid2 <- snp_ldpred2_grid(corr, ind.corr = ind.sub, df_beta[ind.sub, ], params[3, ], return_sampling_betas = TRUE) expect_equal(beta_grid2, beta_grid) # LDpred2-auto set.seed(1) beta_auto <- snp_ldpred2_auto(as_SFBM(corr[ind.sub, ind.sub]), df_beta[ind.sub, ], h2_init = 0.3, sparse = TRUE, shrink_corr = 0.95, burn_in = 100, num_iter = 100) set.seed(1) beta_auto2 <- snp_ldpred2_auto(corr, ind.corr = ind.sub, df_beta[ind.sub, ], h2_init = 0.3, sparse = TRUE, shrink_corr = 0.95, burn_in = 100, num_iter = 100) expect_equal(beta_auto2, beta_auto) # LD Score regression ld <- Matrix::colSums(corr[]^2) ldsc <- snp_ldsc(ld_score = ld[ind.sub], ld_size = ncol(corr), chi2 = with(df_beta[ind.sub, ], (beta / beta_se)^2), sample_size = df_beta$N[ind.sub]) ldsc2 <- snp_ldsc2(corr, ind.beta = ind.sub, df_beta[ind.sub, ], intercept = NULL, blocks = 200) expect_equal(ldsc2, ldsc) }) ################################################################################ test_that("p_bounds works in LDpred2-auto", { skip_if(is_cran) skip_if_offline("raw.githubusercontent.com") bedfile <- file.path(tempdir(), "tmp-data/public-data3.bed") if (!file.exists(rdsfile <- sub_bed(bedfile, ".rds"))) { zip <- tempfile(fileext = ".zip") download.file( "https://github.com/privefl/bigsnpr/blob/master/data-raw/public-data3.zip?raw=true", destfile = zip, mode = "wb") unzip(zip, exdir = tempdir()) rds <- snp_readBed(bedfile) expect_identical(normalizePath(rds), normalizePath(rdsfile)) } obj.bigSNP <- snp_attach(rdsfile) G <- obj.bigSNP$genotypes y <- obj.bigSNP$fam$affection POS2 <- obj.bigSNP$map$genetic.dist + 1000 * obj.bigSNP$map$chromosome sumstats <- bigreadr::fread2(file.path(tempdir(), "tmp-data/public-data3-sumstats.txt")) sumstats$n_eff <- sumstats$N map <- setNames(obj.bigSNP$map[-3], c("chr", "rsid", "pos", "a1", "a0")) df_beta <- snp_match(sumstats, map, join_by_pos = FALSE) ind_var <- df_beta$`_NUM_ID_` corr0 <- snp_cor(G, ind.col = ind_var, size = 3 / 1000, infos.pos = POS2[ind_var], ncores = 2) corr <- as_SFBM(corr0, compact = sample(c(TRUE, FALSE), 1)) rm(corr0) # LDpred2-auto with bounds for p p_bounds <- sort(10^runif(2, -5, 0)) beta_auto <- snp_ldpred2_auto(corr, df_beta, p_bounds = p_bounds, h2_init = 0.3, shrink_corr = 0.95, burn_in = 100, num_iter = 100) all_p_est <- beta_auto[[1]]$path_p_est sapply(all_p_est, function(p_est) expect_gte(p_est, p_bounds[1])) sapply(all_p_est, function(p_est) expect_lte(p_est, p_bounds[2])) beta_auto2 <- snp_ldpred2_auto(corr, df_beta, p_bounds = p_bounds[c(1, 1)], h2_init = 0.3, shrink_corr = 0.95, burn_in = 100, num_iter = 100) sapply(beta_auto2[[1]]$path_p_est, function(p_est) expect_equal(p_est, p_bounds[1])) }) ################################################################################