library(dplyr)

# Tests for msigdbr(): database/species selection, collection filtering,
# argument validation, and the deprecated `category`/`subcategory` arguments.

test_that("species", {
  # Human database, human genes: the defaults must match both the full
  # species name and the short database/species aliases.
  m_hs_hs <- msigdbr()
  expect_identical(m_hs_hs, msigdbr(species = "Homo sapiens"))
  expect_identical(m_hs_hs, msigdbr(db_species = "hs", species = "human"))

  # Result class, gene identifier column types, and leading column order
  hs_cols <- names(m_hs_hs)
  expect_s3_class(m_hs_hs, "tbl_df")
  expect_type(m_hs_hs$gene_symbol, "character")
  expect_type(m_hs_hs$ncbi_gene, "character")
  expect_type(m_hs_hs$ensembl_gene, "character")
  expect_identical(
    hs_cols[1:3],
    c("gene_symbol", "ncbi_gene", "ensembl_gene")
  )
  expect_identical(
    hs_cols[4:8],
    c(
      "db_gene_symbol", "db_ncbi_gene", "db_ensembl_gene",
      "source_gene", "gs_id"
    )
  )
  expect_equal(n_distinct(m_hs_hs$db_version), 1)
  expect_identical(unique(m_hs_hs$db_target_species), "HS")

  # Row count and number of distinct collections, gene sets, and genes
  expect_gt(nrow(m_hs_hs), 4400000)
  expect_lt(nrow(m_hs_hs), 4900000)
  expect_equal(n_distinct(m_hs_hs$gs_collection), 9)
  expect_equal(n_distinct(m_hs_hs$gs_subcollection), 22)
  expect_gt(n_distinct(m_hs_hs$gs_id), 34000)
  expect_gt(n_distinct(m_hs_hs$gene_symbol), 40000)
  expect_gt(n_distinct(m_hs_hs$ncbi_gene), 40000)
  expect_gt(n_distinct(m_hs_hs$ensembl_gene), 40000)

  # Rows per gene set: tabulate once and reuse for every size check
  hs_set_sizes <- table(m_hs_hs$gs_id)
  expect_equal(min(hs_set_sizes), 5)
  expect_lt(max(hs_set_sizes), 2500)
  expect_lt(quantile(hs_set_sizes, 0.999), 2000)
  expect_lt(quantile(hs_set_sizes, 0.98), 1000)
  expect_lt(quantile(hs_set_sizes, 0.9), 300)
  expect_gt(quantile(hs_set_sizes, 0.9), 200)
  expect_gt(quantile(hs_set_sizes, 0.5), 40)
  expect_gt(quantile(hs_set_sizes, 0.2), 10)

  # Same bounds after de-duplicating on database gene symbol / Ensembl ID
  hs_symbol_sizes <- table(distinct(m_hs_hs, gs_id, db_gene_symbol)$gs_id)
  expect_equal(min(hs_symbol_sizes), 5)
  expect_lt(max(hs_symbol_sizes), 2001)
  hs_ensembl_sizes <- table(distinct(m_hs_hs, gs_id, db_ensembl_gene)$gs_id)
  expect_equal(min(hs_ensembl_sizes), 5)
  expect_lt(max(hs_ensembl_sizes), 2001)

  # Human database, mouse genes
  m_hs_mm <- msigdbr(species = "Mus musculus")
  expect_s3_class(m_hs_mm, "tbl_df")
  expect_type(m_hs_mm$gene_symbol, "character")
  expect_type(m_hs_mm$ncbi_gene, "character")
  expect_type(m_hs_mm$ensembl_gene, "character")
  expect_identical(m_hs_mm, msigdbr(db_species = "hs", species = "mouse"))

  # Human database, rat genes
  m_hs_rn <- msigdbr(species = "Rattus norvegicus")
  expect_s3_class(m_hs_rn, "tbl_df")
  expect_type(m_hs_rn$gene_symbol, "character")
  expect_type(m_hs_rn$ncbi_gene, "character")
  expect_type(m_hs_rn$ensembl_gene, "character")
  expect_equal(max(m_hs_rn$num_ortholog_sources), 10)

  # The first 19 column names must agree across species; ortholog output
  # carries extra columns, so only this shared prefix is compared.
  shared_cols <- 1:19
  mm_cols <- names(m_hs_mm)
  expect_identical(hs_cols[shared_cols], mm_cols[shared_cols])
  expect_identical(mm_cols[shared_cols], names(m_hs_rn)[shared_cols])

  # Ortholog conversion should keep more than 90% of the human row count
  min_ortholog_rows <- nrow(m_hs_hs) * 0.9
  expect_gt(nrow(m_hs_mm), min_ortholog_rows)
  expect_gt(nrow(m_hs_rn), min_ortholog_rows)

  # Mouse database, mouse genes
  m_mm_mm <- msigdbr(db_species = "mm", species = "Mus musculus")
  expect_s3_class(m_mm_mm, "data.frame")

  # Shared column-name prefix must match the human-database mouse output
  expect_identical(names(m_mm_mm)[shared_cols], mm_cols[shared_cols])

  # Single database version and target species code
  expect_equal(n_distinct(m_mm_mm$db_version), 1)
  expect_identical(unique(m_mm_mm$db_target_species), "MM")

  # Row count and number of distinct collections, gene sets, and genes
  expect_gt(nrow(m_mm_mm), 1600000)
  expect_lt(nrow(m_mm_mm), 1900000)
  expect_gt(n_distinct(m_mm_mm$gs_collection), 5)
  expect_gt(n_distinct(m_mm_mm$gs_subcollection), 10)
  expect_gt(n_distinct(m_mm_mm$gs_id), 16000)
  expect_gt(n_distinct(m_mm_mm$db_gene_symbol), 40000)
  expect_gt(n_distinct(m_mm_mm$db_ncbi_gene), 40000)
  expect_gt(n_distinct(m_mm_mm$db_ensembl_gene), 40000)

  # Rows per gene set: tabulate once and reuse for every size check
  mm_set_sizes <- table(m_mm_mm$gs_id)
  expect_equal(min(mm_set_sizes), 5)
  expect_lt(max(mm_set_sizes), 2900)
  expect_lt(quantile(mm_set_sizes, 0.999), 2000)
  expect_lt(quantile(mm_set_sizes, 0.98), 1000)
  expect_lt(quantile(mm_set_sizes, 0.9), 300)
  expect_gt(quantile(mm_set_sizes, 0.9), 200)
  expect_gt(quantile(mm_set_sizes, 0.5), 20)
  expect_gt(quantile(mm_set_sizes, 0.1), 5)

  # Same bounds after de-duplicating on database gene symbol / Ensembl ID
  mm_symbol_sizes <- table(distinct(m_mm_mm, gs_id, db_gene_symbol)$gs_id)
  expect_equal(min(mm_symbol_sizes), 5)
  expect_lt(max(mm_symbol_sizes), 2001)
  mm_ensembl_sizes <- table(distinct(m_mm_mm, gs_id, db_ensembl_gene)$gs_id)
  expect_equal(min(mm_ensembl_sizes), 5)
  expect_lt(max(mm_ensembl_sizes), 2001)
})

test_that("collections and subcollections", {
  # Filtering to one collection/subcollection must leave exactly one of each
  m_rn_bp <- msigdbr(
    species = "Rattus norvegicus",
    collection = "C5",
    subcollection = "BP"
  )
  expect_s3_class(m_rn_bp, "data.frame")
  expect_gt(nrow(m_rn_bp), 25)
  expect_gt(n_distinct(m_rn_bp$gene_symbol), 10)
  expect_gt(n_distinct(m_rn_bp$ncbi_gene), 10)
  expect_gt(n_distinct(m_rn_bp$ensembl_gene), 10)
  expect_equal(n_distinct(m_rn_bp$gs_collection), 1)
  expect_equal(n_distinct(m_rn_bp$gs_subcollection), 1)
  expect_gt(n_distinct(m_rn_bp$gs_id), 1)
})

test_that("msigdbr() subcollection partial match", {
  # Subcollection given as "GO:MF"
  m_mm_gomf <- msigdbr(
    species = "mouse",
    collection = "C5",
    subcollection = "GO:MF"
  )
  expect_s3_class(m_mm_gomf, "data.frame")
  expect_gt(nrow(m_mm_gomf), 25)

  # Subcollection given as "MF" only
  m_mm_mf <- msigdbr(
    species = "mouse",
    collection = "C5",
    subcollection = "MF"
  )
  expect_s3_class(m_mm_mf, "data.frame")
  expect_gt(nrow(m_mm_mf), 25)

  # Both spellings must return the same table
  expect_equal(nrow(m_mm_gomf), nrow(m_mm_mf))
  expect_identical(m_mm_gomf, m_mm_mf)
})

test_that("wrong parameters", {
  # Database species values that must be rejected
  expect_error(msigdbr(db_species = "X"))
  expect_error(msigdbr(db_species = "RN"))

  # Species values that must be rejected: unknown, multiple, empty, missing
  expect_error(msigdbr(species = "test"))
  expect_error(msigdbr(species = c("Homo sapiens", "Mus musculus")))
  expect_error(msigdbr(species = ""))
  expect_error(msigdbr(species = NA))

  # Collection/subcollection values that must be rejected
  expect_error(msigdbr(species = "Homo sapiens", collection = "X"))
  expect_error(
    msigdbr(species = "Homo sapiens", collection = "X", subcollection = "X")
  )
  expect_error(
    msigdbr(species = "Homo sapiens", collection = "H", subcollection = "H")
  )
  expect_error(msigdbr(species = "Homo sapiens", collection = c("C1", "C2")))
  expect_error(
    msigdbr(
      species = "Homo sapiens",
      collection = "C2",
      subcollection = c("CGP", "CP")
    )
  )

  # Mouse database combined with a non-mouse species must be rejected
  expect_error(msigdbr(db_species = "mm", species = "Homo sapiens"))
  expect_error(msigdbr(db_species = "mm", species = "human"))
  expect_error(msigdbr(db_species = "mm", species = "Rattus norvegicus"))
})

test_that("deprecated parameters", {
  # Supplying `category` or `subcategory` must raise a warning
  expect_warning(msigdbr(species = "Homo sapiens", category = "H"))
  expect_warning(msigdbr(species = "Homo sapiens", subcategory = "CGP"))

  # Explicit NULL for either argument must not error
  expect_no_error(msigdbr(species = "Homo sapiens", category = NULL))
  expect_no_error(msigdbr(species = "Homo sapiens", subcategory = NULL))
  expect_identical(
    nrow(msigdbr(species = "human")),
    nrow(msigdbr(species = "human", category = NULL))
  )

  # Columns expected in the output when `category` is used
  m_hs <- msigdbr(species = "Homo sapiens", category = "H")
  expect_contains(
    colnames(m_hs),
    c("gene_symbol", "entrez_gene", "ensembl_gene", "gs_cat", "gs_subcat")
  )
})