# in first month 2 patients, 2 codes, one twice # second two patients, one code # third one patient 2 codes # devtools::load_all() df_ehr = data.frame(Month = c(1, 1, 1, 2, 2, 3, 3, 4, 4), Patient = c(1, 1, 2, 1, 2, 1, 1, 3, 4), Parent_Code = c('C1', 'C2', 'C2', 'C1', 'C1', 'C1', 'C2', 'C3', 'C4'), Count = 1:9) spm_cooc = build_df_cooc(df_ehr) test_build_df_cooc = function() { expect_true(inherits(spm_cooc, 'sparseMatrix')) # in diagonal, no NAs, no 0s code_freqs = Matrix::diag(spm_cooc) expect_true(!any(is.na(code_freqs))) expect_true(!any(code_freqs == 0)) # in coocs, none above the diag above_diag = sapply(seq_along(code_freqs), function(idx) { any(spm_cooc[, idx] > code_freqs[idx]) }) expect_true(!any(above_diag)) # try with uniq_codes df_ehr_batch1 = data.frame(Month = c(1, 1, 1, 2, 2, 3, 3, 4, 4), Patient = c(1, 1, 2, 1, 2, 1, 1, 3, 4), Parent_Code = c('C1', 'C2', 'C2', 'C1', 'C1', 'C1', 'C2', 'C3', 'C4'), Count = 1:9) df_ehr_batch2 = data.frame(Month = c(1, 1, 1, 2, 2, 3, 3, 4, 4), Patient = c(5, 5, 6, 5, 6, 5, 5, 7, 8), Parent_Code = c('C5', 'C6', 'C6', 'C5', 'C5', 'C1', 'C2', 'C3', 'C4'), Count = 1:9) df_ehr_batch3 = data.frame(Month = c(1, 1, 1, 2, 2, 3, 3, 4, 4), Patient = c(5, 5, 6, 5, 6, 5, 5, 7, 8), Parent_Code = c('C5', 'C6', 'C6', 'C5', 'C5', 'C1', 'C2', 'C7', 'C8'), Count = 1:9) uniq_codes = paste0('C', 1:8) spm_cooc_batch = build_df_cooc(df_ehr_batch1, uniq_codes, min_code_freq = 0) spm_cooc_batch = spm_cooc_batch + build_df_cooc(df_ehr_batch2, uniq_codes, min_code_freq = 0) spm_cooc_batch + build_df_cooc(df_ehr_batch3, uniq_codes, min_code_freq = 0) # there's your error # also just with build_df_cooc(df_ehr_batch3, uniq_codes) # only affects code freqs, not code coocs df_ehr_batch1 = data.frame(Month = c(1, 1, 1, 2, 2, 3, 3, 4, 4), Patient = c(1, 1, 2, 1, 2, 1, 1, 3, 4), Parent_Code = c('C7', 'C8', 'C8', 'C7', 'C7', 'C7', 'C8', 'C3', 'C4'), Count = 1:9) build_df_cooc(df_ehr_batch1, uniq_codes) } test_that('build_df_cooc', test_build_df_cooc()) test_build_df_cooc_cuis = function() { # with cuis # devtools::load_all() #5 x 5 sparse Matrix of class "dgCMatrix" # BACITRACIN C0000726 C0000729 C0000731 C0000737 #BACITRACIN 15 8 1 1 2 #C0000726 . 707 16 27 79 #C0000729 . . 19 3 5 #C0000731 . . . 31 10 #C0000737 . . . . 110 df_ehr = get(load('df_ehr_10_patients_cuis.rds')) #library(profvis) #p = profvis({ df_cooc = build_df_cooc(df_ehr) }) #htmlwidgets::saveWidget(p, file = 'profiling.html') system.time({ df_cooc = build_df_cooc(df_ehr, n_cores = 3) }) spm_cooc = df_cooc code_freqs = Matrix::diag(spm_cooc) expect_true(!any(is.na(code_freqs))) expect_true(!any(code_freqs == 0)) } #test_that('build_df_cooc_cuis', test_build_df_cooc_cuis()) test_spm_to_df = function() { # obj = "structure(list(V1 = c(\"C1\", \"C1\", \"C2\", \"C3\", \"C4\"), # V2 = c(\"C1\", \"C2\", \"C2\", \"C3\", \"C4\"), # value = c(16, 7, 12, 8, 9)), row.names = c(NA, -5L), # header = \"4 x 4 sparse Matrix of class \\\"dgCMatrix\\\", with 5 entries\", # class = \"data.frame\")" obj = "structure(list(V1 = c(\"C1\", \"C1\", \"C2\", \"C3\", \"C4\"), V2 = c(\"C1\", \"C2\", \"C2\", \"C3\", \"C4\"), value = c(16, 7, 12, 8, 9)), row.names = c(NA, -5L), class = \"data.frame\")" expect_equal(spm_to_df(spm_cooc), eval(parse(text = obj))) } test_that('spm_to_df', test_spm_to_df()) if (FALSE) { # larger tests # devtools::load_all() #system.time({ set.seed(1) n_rows = 2e3 * 4 * 12 / 2 df_ehr = data.frame(Parent_Code = sample(paste0('C', 1:2e3), n_rows, replace = TRUE), Patient = sample(1:4, n_rows, replace = TRUE), Month = sample(1:12, n_rows, replace = TRUE), Count = sample(1:9, n_rows, replace = TRUE)) df_ehr = subset(df_ehr, !duplicated(paste0('P', Patient, 'M', Month, 'C', Parent_Code))) df_cooc = build_df_cooc(df_ehr) #library(profvis) #p = profvis({df_cooc = build_df_cooc(df_ehr)}) #htmlwidgets::saveWidget(p, file = 'profiling.html') #}) # devtools::load_all() set.seed(1) n_rows = 2e3 * 4 * 12 / 2 df_ehr = data.frame(Parent_Code = sample(paste0('C', 1:2e3), n_rows, replace = TRUE), Patient = sample(1:4, n_rows, replace = TRUE), Month = sample(1:12, n_rows, replace = TRUE), Count = sample(1:9, n_rows, replace = TRUE)) df_ehr = subset(df_ehr, !duplicated(paste0('P', Patient, 'M', Month, 'C', Parent_Code))) system.time({ df_cooc = build_df_cooc(df_ehr) }) df_cooc[1:5, 1:5] df_cooc = build_df_cooc(df_ehr,n_cores = 4) # 1 patient set.seed(1) n_rows = 2e3 * 4 * 12 / 2 df_ehr = data.frame(Parent_Code = sample(paste0('C', 1:2e3), n_rows, replace = TRUE), Patient = sample(1, n_rows, replace = TRUE), Count = sample(1:9, n_rows, replace = TRUE), Month = sample(1:12, n_rows, replace = TRUE)) df_cooc = build_df_cooc(df_ehr) df_cooc[1:5, 1:5] # 1 month set.seed(1) n_rows = 2e3 * 4 * 12 / 2 df_ehr = data.frame(Parent_Code = sample(paste0('C', 1:2e3), n_rows, replace = TRUE), Patient = sample(1, n_rows, replace = TRUE), Count = sample(1:9, n_rows, replace = TRUE), Month = sample(1, n_rows, replace = TRUE)) df_cooc = build_df_cooc(df_ehr) df_cooc[1:5, 1:5] }