#checkes various statistics of example 1 context("examples") test_that("checked_example1", { sink(file = "test_example1.log") hh <- data2haplohh( hap_file = "example1.hap", map_file = "example1.map", allele_coding = "map" ) res <- calc_ehh( hh, mrk = "rs6", limehh = 0, discard_integration_at_border = FALSE, include_nhaplo = TRUE ) expect_equivalent(res$freq, c(0.5, 0.5)) expected_ehh_ancestral <- c(0, 0, 0, 1 / 6, 1 / 2, 1, 1 / 3, 1 / 6, 0, 0, 0) expected_ehh_derived <- c(1 / 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 / 3) expect_identical(res$ehh$EHH_A, expected_ehh_ancestral) expect_identical(res$ehh$EHH_D, expected_ehh_derived) expect_identical(res$ehh$NHAPLO_A, c(0L, 0L, rep(4L, 7), 0L, 0L)) expect_identical(res$ehh$NHAPLO_D, rep(4L, 11)) expected_ihh_A <- sum(expected_ehh_ancestral[2:10] * 10000) + sum(expected_ehh_ancestral[c(1, 11)] * 5000) expected_ihh_D <- sum(expected_ehh_derived[2:10] * 10000) + sum(expected_ehh_derived[c(1, 11)] * 5000) expect_equivalent(res$ihh, cbind(expected_ihh_A, expected_ihh_D)) scan <- scan_hh(hh, discard_integration_at_border = FALSE) expected <- data.frame( CHR = rep("chr1", 11), POSITION = c( 10000, 20000, 30000, 40000, 50000, 60000, 70000, 80000, 90000, 1e+05, 110000 ), FREQ_A = c( 0.875, 0.875, 0.75, 0.75, 0.875, 0.5, 0.75, 0.875, 0.75, 0.875, 0.5 ) , FREQ_D = c( 0.125, 0.125, 0.25, 0.25, 0.125, 0.5, 0.25, 0.125, 0.25, 0.125, 0.5 ) , NHAPLO_A = c(7L, 7L, 6L, 6L, 7L, 4L, 6L, 7L, 6L, 7L, 4L), NHAPLO_D = c(1L, 1L, 2L, 2L, 1L, 4L, 2L, 1L, 2L, 1L, 4L), IHH_A = c( 21992.2619047619, 36190.4761904762, 45000, 47666.6666666667, 33333.3333333333, 18816.6666666667, 45333.3333333333, 38571.4285714286, 50666.6666666667, 35000, 25075 ), IHH_D = c( 0, 0, 23512.5, 28025, 0, 89166.6666666667, 28025, 0, 23512.5, 0, 25833.3333333333 ), IES = c( 15295.2380952381, 25892.8571428571, 22678.5714285714, 24285.7142857143, 23750, 19821.4285714286, 23035.7142857143, 27678.5714285714, 25714.2857142857, 25000, 8032.14285714286 ), INES = c( 21992.2619047619, 36190.4761904762, 43437.5, 46250, 33333.3333333333, 52916.6666666667, 44062.5, 38571.4285714286, 48750, 35000, 25416.6666666667 ) ) row.names(expected) <- row.names <- c("rs1", "rs2", "rs3", "rs4", "rs5", "rs6", "rs7", "rs8", "rs9", "rs10", "rs11") expect_equal(scan, expected) # test extract_regions regions <- data.frame( CHR = c("chr1", "chr2", "chr1", "chr1"), START = c(100000, 10000, 50000, 55000), END = c(110000, 120000, 61000, 70000) ) scan_regions <- extract_regions(scan, regions) expect_identical(scan_regions, scan[c(6, 7, 11), ]) # check integration over stepwise constant rehh curve ehh <- calc_ehh( hh, mrk = "rs6", discard_integration_at_border = FALSE, phased = FALSE, limehh = 0.05, interpolate = FALSE ) expected_ihh_A <- 25000 - 0.05 * 30000 expected_ihh_D <- 100000 - 0.05 * 100000 expect_equivalent(ehh$ihh, cbind(expected_ihh_A, expected_ihh_D)) ehh <- calc_ehh( hh, mrk = "rs6", discard_integration_at_border = FALSE, phased = FALSE, limehh = 0, interpolate = FALSE ) expected_ihh_A <- 25000 expected_ihh_D <- 100000 expect_equivalent(ehh$ihh, cbind(expected_ihh_A, expected_ihh_D)) # check that min_maf excludes mrks hh <- data2haplohh( hap_file = "example1.hap", map_file = "example1.map", allele_coding = "map", min_maf = 0.2 ) res <- calc_ehh( hh, mrk = "rs6", limehh = 0, discard_integration_at_border = FALSE, include_nhaplo = TRUE ) expect_equivalent(res$freq, c(0.5, 0.5)) expected_ehh_ancestral <- c(0, 1 / 3, 1, 1 / 3, 0, 0) expected_ehh_derived <- c(1, 1, 1, 1, 1, 1 / 3) expect_identical(res$ehh$EHH_A, expected_ehh_ancestral) expect_identical(res$ehh$EHH_D, expected_ehh_derived) expect_identical(res$ehh$NHAPLO_A, c(rep(4L, 5), 0L)) expect_identical(res$ehh$NHAPLO_D, rep(4L, 6)) expected_ihh_A <- (1 / 3 * 10000 + (1 / 3 + 1) * 20000 + (1 + 1 / 3) * 10000 + 1 / 3 * 20000) / 2 expected_ihh_D <- ((1 + 1) * 10000 + (1 + 1) * 20000 + (1 + 1) * 30000 + (1 + 1 / 3) * 20000) / 2 expect_equivalent(res$ihh, cbind(expected_ihh_A, expected_ihh_D)) # check that scalegap reduces gaps ehh <- calc_ehh( hh, mrk = "rs6", limehh = 0, scalegap = 15000, discard_integration_at_border = FALSE, include_nhaplo = TRUE ) expected_ihh_A <- (1 / 3 * 10000 + (1 / 3 + 1) * 15000 + (1 + 1 / 3) * 10000 + 1 / 3 * 15000) / 2 expected_ihh_D <- ((1 + 1) * 10000 + (1 + 1) * 15000 + (1 + 1) * 25000 + (1 + 1 / 3) * 15000) / 2 expect_equivalent(ehh$ihh, cbind(expected_ihh_A, expected_ihh_D)) # check that maxgap cuts integration ehh <- calc_ehh( hh, mrk = "rs6", limehh = 0, discard_integration_at_border = FALSE, maxgap = 15000 ) expected_ihh_A <- ((1 + 1 / 3) * 10000) / 2 expected_ihh_D <- ((1 + 1) * 10000) / 2 expect_equivalent(ehh$ihh, cbind(expected_ihh_A, expected_ihh_D)) sink() file.remove("test_example1.log") }) test_that("checked_example2", { sink(file = "test_example2.log") hh <- data2haplohh( hap_file = "example2.hap", map_file = "example2.map", allele_coding = "01", min_perc_geno.hap = 0, min_perc_geno.mrk = 1 ) res <- calc_ehh( hh, mrk = "rs6", limehh = 0, discard_integration_at_border = FALSE, include_nhaplo = TRUE ) expect_equivalent(res$freq, c(4 / 11, 3 / 11, 4 / 11)) expected_ehh_ancestral <- c(0, 0, 0, 1 / 6, 1 / 2, 1, 1 / 3, 1 / 6, 0, 0, 0) expected_ehh_derived1 <- c(rep(1, 10), 0) expected_ehh_derived2 <- c(0, 0, 1 / 3, 1 / 3, 1, 1, 1, 1, 0, 0, 0) expect_identical(res$ehh$EHH_A, expected_ehh_ancestral) expect_identical(res$ehh$EHH_D1, expected_ehh_derived1) expect_identical(res$ehh$EHH_D2, expected_ehh_derived2) expect_identical(res$ehh$NHAPLO_A, c(0L, 0L, 2L, rep(4L, 6), 0L, 0L)) expect_identical(res$ehh$NHAPLO_D1, c(rep(3L, 9), 2L, 2L)) expect_identical(res$ehh$NHAPLO_D2, c(0L, 2L, rep(3L, 3), rep(4L, 3), 3L, 0L, 0L)) expected_ihh_A <- sum(expected_ehh_ancestral[2:10] * 10000) + sum(expected_ehh_ancestral[c(1, 11)] * 5000) expected_ihh_D1 <- sum(expected_ehh_derived1[2:10] * 10000) + sum(expected_ehh_derived1[c(1, 11)] * 5000) expected_ihh_D2 <- sum(expected_ehh_derived2[2:10] * 10000) + sum(expected_ehh_derived2[c(1, 11)] * 5000) expect_equivalent(res$ihh, cbind(expected_ihh_A, expected_ihh_D1, expected_ihh_D2)) ## check scan scan <- scan_hh(hh, discard_integration_at_border = FALSE) expected <- data.frame( CHR = rep("chr1", 11), POSITION = c( 10000, 20000, 30000, 40000, 50000, 60000, 70000, 80000, 90000, 1e+05, 110000 ), FREQ_A = c( 0.916666666666667, 0.909090909090909, 0.9, 0.75, 0.909090909090909, 0.363636363636364, 0.833333333333333, 0.916666666666667, 0.7, 0.777777777777778, 0.666666666666667 ) , FREQ_D = c( 0.0833333333333333, 0.0909090909090909, 0.1, 0.25, 0.0909090909090909, 0.363636363636364, 0.166666666666667, 0.0833333333333333, 0.2, 0.222222222222222, 0.333333333333333 ), NHAPLO_A = c(11L, 10L, 9L, 9L, 10L, 4L, 10L, 11L, 7L, 7L, 8L), NHAPLO_D = c(1L, 1L, 1L, 3L, 1L, 4L, 2L, 1L, 2L, 2L, 4L) , IHH_A = c( 25045.2380952381, 32750.9920634921, 39881.9444444445, 38453.373015873, 29680.1587301587, 18816.6666666667, 28960.5158730159, 27370.1298701299, 40142.8571428571, 24452.380952381, 14291.6666666667 ), IHH_D = c( 0, 0, 0, 21383.3333333333, 0, 43216.6666666667, 28025, 0, 33012.5, 9025, 13037.5 ), IES = c( 19362.1212121212, 25023.5930735931, 30057.1428571429, 23654.6717171717, 22727.2005772006, 9582.1608946609, 17869.696969697, 21479.797979798, 15730.1587301587, 11326.9841269841, 5890.83694083694 ), INES = c( 25045.2380952381, 32750.9920634921, 39881.9444444445, 38145.5069124424, 29680.1587301587, 49005.9523809524, 28743.6766567576, 27370.1298701299, 39857.9545454545, 23125, 14158.2264957265 ) ) row.names(expected) <- row.names <- c("rs1", "rs2", "rs3", "rs4", "rs5", "rs6", "rs7", "rs8", "rs9", "rs10", "rs11") expect_equal(scan, expected) sink() file.remove("test_example2.log") }) test_that("checked_unpolarized", { sink(file = "test_unpolarized.log") hh <- data2haplohh(hap_file = "example1.hap", map_file = "example1.map", allele_coding = "map") ihh_1 <- scan_hh(hh, discard_integration_at_border = FALSE, polarized = TRUE) ihh_2 <- scan_hh(hh, discard_integration_at_border = FALSE, polarized = FALSE) ihs1 <- ihh2ihs(ihh_1, freqbin = 1) ihs2 <- ihh2ihs(ihh_2, freqbin = 1) expect_identical(ihs1, ihs2) # test that allele coding is irrelevant for unpolarized data, # (as long as the order is not changed, since for equal frequency, # the "major" allele is the one with lower encoding hh@haplo[hh@haplo == 0] <- 4L # arbitrary integer number hh@haplo[hh@haplo == 1] <- 7L # arbitrary, bigger integer number ihh_3 <- scan_hh(hh, discard_integration_at_border = FALSE, polarized = FALSE) expect_identical(ihh_2, ihh_3) sink() file.remove("test_unpolarized.log") }) test_that("checked_limhaplo", { sink(file = "test_limhaplo.log") hh <- data2haplohh( hap_file = "example2.hap", map_file = "example2.map", allele_coding = "map", min_perc_geno.mrk = 1 ) expected_ihh <- c(IHH_A = 18816.6666666667, IHH_D1 = 80512.5, IHH_D2 = 43216.6666666667) ehh <- calc_ehh( hh, mrk = "rs6", limhaplo = 3, discard_integration_at_border = FALSE ) expect_equal(ehh$ihh, expected_ihh) expected_ihh <- c(IHH_A = 18816.6666666667, IHH_D1 = 0, IHH_D2 = 28025) ehh <- calc_ehh( hh, mrk = "rs6", limhaplo = 4, discard_integration_at_border = FALSE ) expect_equal(ehh$ihh, expected_ihh) sink() file.remove("test_limhaplo.log") })