# Count the rows of `pairs` whose `gs_id` equals `set_id`.
#
# Intended for the two-column tables produced by `distinct()` in the tests
# below (gs_id plus one gene identifier column). `filter()` resolves names in
# the data first, so a column literally named `set_id` would shadow the
# argument; those tables have no such column.
count_members <- function(pairs, set_id) {
  nrow(filter(pairs, gs_id == set_id))
}

# Checks on the table returned for target_species = "HS": class, row count
# bounds, leading column names, absence of empty-string identifiers,
# collection counts, gene set size distribution, and a few spot checks.
test_that("msigdbdf() human", {
  hs <- msigdbdf(target_species = "HS")
  expect_s3_class(hs, "tbl_df")
  expect_gt(nrow(hs), 4400000)
  expect_lt(nrow(hs), 4500000)
  leading_cols <- c(
    "db_gene_symbol",
    "db_ncbi_gene",
    "db_ensembl_gene",
    "source_gene",
    "gs_id",
    "gs_name"
  )
  expect_identical(names(hs)[seq_len(6)], leading_cols)
  expect_equal(n_distinct(hs$db_version), 1)
  expect_identical(unique(hs$db_target_species), "HS")
  expect_equal(nrow(filter(hs, db_gene_symbol == "")), 0)
  expect_equal(nrow(filter(hs, db_ncbi_gene == "")), 0)
  expect_equal(nrow(filter(hs, db_ensembl_gene == "")), 0)
  expect_equal(nrow(filter(hs, gs_collection == "")), 0)

  # overall dimensions
  expect_equal(n_distinct(hs$gs_collection), 9)
  expect_equal(n_distinct(hs$gs_subcollection), 22)
  expect_gt(n_distinct(hs$gs_id), 34000)
  expect_gt(n_distinct(hs$db_gene_symbol), 40000)
  expect_gt(n_distinct(hs$db_ncbi_gene), 40000)
  expect_gt(n_distinct(hs$db_ensembl_gene), 40000)

  # gene set sizes (rows per gs_id, tabulated once and reused)
  rows_per_set <- table(hs$gs_id)
  expect_equal(min(rows_per_set), 5)
  expect_lt(max(rows_per_set), 2500)
  expect_lt(quantile(rows_per_set, 0.999), 2000)
  expect_lt(quantile(rows_per_set, 0.98), 1000)
  expect_lt(quantile(rows_per_set, 0.9), 300)
  expect_gt(quantile(rows_per_set, 0.9), 200)
  expect_gt(quantile(rows_per_set, 0.5), 40)
  expect_gt(quantile(rows_per_set, 0.2), 10)

  # sizes after de-duplicating on (gene set, gene symbol)
  hs_by_symbol <- distinct(hs, gs_id, db_gene_symbol)
  symbols_per_set <- table(hs_by_symbol$gs_id)
  expect_equal(min(symbols_per_set), 5)
  expect_lt(max(symbols_per_set), 2001)

  # sizes after de-duplicating on (gene set, Ensembl gene)
  hs_by_ensembl <- distinct(hs, gs_id, db_ensembl_gene)
  ensembl_per_set <- table(hs_by_ensembl$gs_id)
  expect_equal(min(ensembl_per_set), 5)
  expect_lt(max(ensembl_per_set), 2001)

  # specific gene set sizes
  expect_equal(count_members(hs_by_symbol, "M39207"), 5)
  expect_equal(count_members(hs_by_ensembl, "M39207"), 5)
  expect_equal(count_members(hs_by_symbol, "M32651"), 5)
  expect_equal(count_members(hs_by_ensembl, "M32651"), 5)
  expect_equal(count_members(hs_by_symbol, "M22010"), 8)
  expect_equal(count_members(hs_by_ensembl, "M22010"), 8)
  expect_equal(count_members(hs_by_symbol, "M5902"), 161)
  expect_equal(count_members(hs_by_ensembl, "M5902"), 161)
  expect_equal(count_members(hs_by_symbol, "M5903"), 32)
  expect_equal(count_members(hs_by_ensembl, "M5903"), 32)
  expect_equal(count_members(hs_by_symbol, "M40020"), 12)
  expect_equal(count_members(hs_by_ensembl, "M40020"), 12)
  expect_equal(count_members(hs_by_symbol, "M40180"), 91)
  expect_equal(count_members(hs_by_ensembl, "M40180"), 91)
  expect_equal(count_members(hs_by_symbol, "M6220"), 33)
  expect_equal(count_members(hs_by_ensembl, "M6220"), 33)
  expect_equal(count_members(hs_by_symbol, "M30053"), 1957)
  expect_equal(count_members(hs_by_ensembl, "M30053"), 1957)
  # NOTE(review): symbol and Ensembl counts differ for this set (1902 vs
  # 1920); kept exactly as in the original test - confirm this is intended.
  expect_equal(count_members(hs_by_symbol, "M3458"), 1902)
  expect_equal(count_members(hs_by_ensembl, "M3458"), 1920)

  # specific genes present in specific gene sets
  expect_equal(
    nrow(filter(hs, gs_id == "M30055", db_gene_symbol == "FOS")),
    1
  )
  expect_equal(
    nrow(filter(hs, gs_id == "M30055", db_ncbi_gene == "2353")),
    1
  )
  expect_equal(
    nrow(filter(hs, gs_id == "M30055", db_ensembl_gene == "ENSG00000170345")),
    1
  )
  expect_equal(
    nrow(filter(hs, gs_id == "M40827", db_gene_symbol == "ABCA11P")),
    1
  )
  expect_equal(
    nrow(filter(hs, gs_id == "M40827", db_ncbi_gene == "79963")),
    1
  )
  expect_equal(
    nrow(filter(hs, gs_id == "M40827", db_ensembl_gene == "ENSG00000251595")),
    1
  )
  expect_equal(
    nrow(filter(hs, gs_id == "M8918", db_gene_symbol == "NEPNP")),
    1
  )
  expect_equal(
    nrow(filter(hs, gs_id == "M8918", db_ncbi_gene == "442253")),
    1
  )
  expect_equal(
    nrow(filter(hs, gs_id == "M8918", db_ensembl_gene == "ENSG00000218233")),
    1
  )

  # specific Ensembl mappings
  # The number of distinct (symbol, Ensembl) pairs must stay below 1.05 times
  # the number of distinct symbols.
  hs_symbol_ensembl <- distinct(hs, db_gene_symbol, db_ensembl_gene)
  expect_lt(
    nrow(hs_symbol_ensembl),
    n_distinct(hs_symbol_ensembl$db_gene_symbol) * 1.05
  )
  # Selecting by symbol and by Ensembl ID must return identical rows.
  expect_identical(
    filter(hs, db_gene_symbol == "CDK1"),
    filter(hs, db_ensembl_gene == "ENSG00000170312")
  )
  expect_identical(
    filter(hs, db_gene_symbol == "TP53"),
    filter(hs, db_ensembl_gene == "ENSG00000141510")
  )
  expect_identical(
    filter(hs, db_gene_symbol == "SRSF1"),
    filter(hs, db_ensembl_gene == "ENSG00000136450")
  )
})

# Same family of checks for target_species = "MM", with mouse-specific
# bounds, gene set IDs and gene identifiers.
test_that("msigdbdf() mouse", {
  mm <- msigdbdf(target_species = "MM")
  expect_s3_class(mm, "tbl_df")
  expect_gt(nrow(mm), 1600000)
  expect_lt(nrow(mm), 1700000)
  expect_equal(n_distinct(mm$db_version), 1)
  expect_identical(unique(mm$db_target_species), "MM")
  expect_equal(nrow(filter(mm, db_gene_symbol == "")), 0)
  expect_equal(nrow(filter(mm, db_ncbi_gene == "")), 0)
  expect_equal(nrow(filter(mm, db_ensembl_gene == "")), 0)
  expect_equal(nrow(filter(mm, gs_collection == "")), 0)

  # overall dimensions
  expect_equal(n_distinct(mm$gs_collection), 6)
  expect_equal(n_distinct(mm$gs_subcollection), 11)
  expect_gt(n_distinct(mm$gs_id), 16000)
  expect_gt(n_distinct(mm$db_gene_symbol), 40000)
  expect_gt(n_distinct(mm$db_ncbi_gene), 40000)
  expect_gt(n_distinct(mm$db_ensembl_gene), 40000)

  # gene set sizes (rows per gs_id, tabulated once and reused)
  rows_per_set <- table(mm$gs_id)
  expect_equal(min(rows_per_set), 5)
  expect_lt(max(rows_per_set), 2900)
  expect_lt(quantile(rows_per_set, 0.999), 2000)
  expect_lt(quantile(rows_per_set, 0.98), 1000)
  expect_lt(quantile(rows_per_set, 0.9), 300)
  expect_gt(quantile(rows_per_set, 0.9), 200)
  expect_gt(quantile(rows_per_set, 0.5), 20)
  expect_gt(quantile(rows_per_set, 0.1), 5)

  # sizes after de-duplicating on (gene set, gene symbol)
  mm_by_symbol <- distinct(mm, gs_id, db_gene_symbol)
  symbols_per_set <- table(mm_by_symbol$gs_id)
  expect_equal(min(symbols_per_set), 5)
  expect_lt(max(symbols_per_set), 2001)

  # sizes after de-duplicating on (gene set, Ensembl gene)
  mm_by_ensembl <- distinct(mm, gs_id, db_ensembl_gene)
  ensembl_per_set <- table(mm_by_ensembl$gs_id)
  expect_equal(min(ensembl_per_set), 5)
  expect_lt(max(ensembl_per_set), 2001)

  # specific gene set sizes
  expect_equal(count_members(mm_by_symbol, "MM3871"), 200)
  expect_equal(count_members(mm_by_ensembl, "MM3871"), 200)
  expect_equal(count_members(mm_by_symbol, "MM3634"), 294)
  expect_equal(count_members(mm_by_ensembl, "MM3634"), 294)
  expect_equal(count_members(mm_by_symbol, "MM10270"), 12)
  expect_equal(count_members(mm_by_ensembl, "MM10270"), 12)

  # specific genes present in specific gene sets
  expect_equal(
    nrow(filter(mm, gs_id == "MM1067", db_gene_symbol == "Zfas1")),
    1
  )
  expect_equal(
    nrow(filter(mm, gs_id == "MM1067", db_ncbi_gene == "68949")),
    1
  )
  expect_equal(
    nrow(
      filter(mm, gs_id == "MM1067", db_ensembl_gene == "ENSMUSG00000074578")
    ),
    1
  )

  # specific Ensembl mappings
  # The number of distinct (symbol, Ensembl) pairs must stay below 1.05 times
  # the number of distinct symbols.
  mm_symbol_ensembl <- distinct(mm, db_gene_symbol, db_ensembl_gene)
  expect_lt(
    nrow(mm_symbol_ensembl),
    n_distinct(mm_symbol_ensembl$db_gene_symbol) * 1.05
  )
  # Selecting by symbol and by Ensembl ID must return identical rows.
  expect_identical(
    filter(mm, db_gene_symbol == "Cdk1"),
    filter(mm, db_ensembl_gene == "ENSMUSG00000019942")
  )
  expect_identical(
    filter(mm, db_gene_symbol == "Trp53"),
    filter(mm, db_ensembl_gene == "ENSMUSG00000059552")
  )
  expect_identical(
    filter(mm, db_gene_symbol == "Srsf1"),
    filter(mm, db_ensembl_gene == "ENSMUSG00000018379")
  )
})

# The upper-case and capitalised spellings of each species code must return
# identical tables.
test_that("msigdbdf() target_species variants", {
  expect_identical(
    msigdbdf(target_species = "HS"),
    msigdbdf(target_species = "Hs")
  )
  expect_identical(
    msigdbdf(target_species = "MM"),
    msigdbdf(target_species = "Mm")
  )
})