# comprehensive tests for download utilities and internal helpers

# --- inep_base_url -----------------------------------------------------------

test_that("inep_base_url returns correct URL", {
  base_url <- educabR:::inep_base_url()
  expect_equal(base_url, "https://download.inep.gov.br")
})

# --- detect_delim ------------------------------------------------------------

# Each test writes a two-line file with a single delimiter and checks that
# detect_delim() reports that delimiter. local_tempfile() removes the file
# when the test finishes.

test_that("detect_delim detects semicolon delimiter", {
  csv_path <- withr::local_tempfile(fileext = ".csv")
  writeLines("a;b;c\n1;2;3", csv_path)
  expect_equal(educabR:::detect_delim(csv_path), ";")
})

test_that("detect_delim detects comma delimiter", {
  csv_path <- withr::local_tempfile(fileext = ".csv")
  writeLines("a,b,c\n1,2,3", csv_path)
  expect_equal(educabR:::detect_delim(csv_path), ",")
})

test_that("detect_delim detects pipe delimiter", {
  csv_path <- withr::local_tempfile(fileext = ".csv")
  writeLines("a|b|c\n1|2|3", csv_path)
  expect_equal(educabR:::detect_delim(csv_path), "|")
})

test_that("detect_delim detects tab delimiter", {
  csv_path <- withr::local_tempfile(fileext = ".csv")
  writeLines("a\tb\tc\n1\t2\t3", csv_path)
  expect_equal(educabR:::detect_delim(csv_path), "\t")
})

# --- detect_encoding ---------------------------------------------------------

test_that("detect_encoding detects UTF-8 file", {
  csv_path <- withr::local_tempfile(fileext = ".csv")
  # pure ASCII content, which is also valid UTF-8
  writeBin(charToRaw("col1;col2\nabc;def\n"), csv_path)
  expect_equal(educabR:::detect_encoding(csv_path), "UTF-8")
})

test_that("detect_encoding detects Latin1 file", {
  csv_path <- withr::local_tempfile(fileext = ".csv")
  # Bytes that are valid Latin1 but invalid UTF-8:
  # 0xE9 (e-acute in Latin1) and 0xE7 (c-cedilla in Latin1) are UTF-8 lead
  # bytes that require continuation bytes, but they are followed by 0x3B (";")
  # and 0x0A (newline), so the file cannot be decoded as UTF-8.
  header_bytes <- charToRaw("col1;col2\n")
  body_bytes <- as.raw(c(0xE9, 0x3B, 0xE7, 0x0A))
  writeBin(c(header_bytes, body_bytes), csv_path)
  expect_equal(educabR:::detect_encoding(csv_path), "latin1")
})

# --- find_data_files ---------------------------------------------------------

test_that("find_data_files finds CSV files in directory", {
  data_dir <- withr::local_tempdir(pattern = "find_data_")
  file.create(file.path(data_dir, "data.csv"))
  file.create(file.path(data_dir, "readme.txt"))

  found <- educabR:::find_data_files(data_dir)
  expect_true(any(grepl("data\\.csv$", found)))
  expect_true(any(grepl("readme\\.txt$", found)))
})

test_that("find_data_files finds files in subdirectories", {
  data_dir <- withr::local_tempdir(pattern = "find_data_sub_")
  nested_dir <- file.path(data_dir, "DADOS")
  dir.create(nested_dir, recursive = TRUE)
  file.create(file.path(nested_dir, "microdados.CSV"))

  found <- educabR:::find_data_files(data_dir)
  expect_length(found, 1)
  expect_true(grepl("microdados\\.CSV$", found))
})

test_that("find_data_files errors when no files match", {
  data_dir <- withr::local_tempdir(pattern = "find_data_empty_")
  file.create(file.path(data_dir, "data.xlsx"))

  expect_error(educabR:::find_data_files(data_dir), "no data files found")
})

test_that("find_data_files uses custom pattern", {
  data_dir <- withr::local_tempdir(pattern = "find_data_pattern_")
  file.create(file.path(data_dir, "data.xlsx"))
  file.create(file.path(data_dir, "data.csv"))

  found <- educabR:::find_data_files(data_dir, pattern = "\\.xlsx$")
  expect_length(found, 1)
  expect_true(grepl("xlsx$", found))
})

# --- build_inep_url ----------------------------------------------------------

test_that("build_inep_url builds censo_escolar URL", {
  url <- build_inep_url("censo_escolar", 2023)
  expect_match(url, "download\\.inep\\.gov\\.br")
  expect_match(url, "microdados_censo_escolar_2023\\.zip$")
})

test_that("build_inep_url builds enem URL", {
  url <- build_inep_url("enem", 2023)
  expect_match(url, "microdados_enem_2023\\.zip$")
})

test_that("build_inep_url builds saeb URL for non-2021 year", {
  url <- build_inep_url("saeb", 2019)
  expect_match(url, "microdados_saeb_2019\\.zip$")
  # the level suffix is only expected on the 2021 URLs tested below
  expect_false(grepl("fundamental|infantil", url))
})

test_that("build_inep_url builds saeb 2021 URL with default level", {
  url <- build_inep_url("saeb", 2021)
  expect_match(url, "microdados_saeb_2021_ensino_fundamental_e_medio\\.zip$")
})

test_that("build_inep_url builds saeb 2021 URL with educacao_infantil level", {
  url <- build_inep_url("saeb", 2021, level = "educacao_infantil")
  expect_match(url, "microdados_saeb_2021_educacao_infantil\\.zip$")
})

test_that("build_inep_url builds censo_superior URL", {
  url <- build_inep_url("censo_superior", 2023)
  expect_match(url, "microdados_censo_da_educacao_superior_2023\\.zip$")
})

test_that("build_inep_url builds enade URL", {
  url <- build_inep_url("enade", 2023)
  expect_match(url, "microdados_enade_2023\\.zip$")
})

test_that("build_inep_url builds encceja URL", {
  url <- build_inep_url("encceja", 2023)
  expect_match(url, "microdados_encceja_2023\\.zip$")
})

test_that("build_inep_url builds enem_escola URL (fixed URL)", {
  url <- build_inep_url("enem_escola", 2015)
  expect_match(
    url,
    "enem_por_escola/2005_a_2015/microdados_enem_por_escola\\.zip$"
  )
})

test_that("build_inep_url builds idd URL with zip for year >= 2021", {
  url <- build_inep_url("idd", 2021)
  expect_match(url, "microdados_IDD_2021\\.zip$")
})

test_that("build_inep_url builds idd URL with 7z for year < 2021", {
  url <- build_inep_url("idd", 2019)
  expect_match(url, "microdados_IDD_2019\\.7z$")
})

test_that("build_inep_url builds ideb URL", {
  url <- build_inep_url("ideb", 2021)
  expect_match(url, "ideb/2021/$")
})

test_that("build_inep_url errors for unknown dataset", {
  expect_error(build_inep_url("invalid_dataset", 2023), "unknown dataset")
})

# --- validate_year -----------------------------------------------------------

test_that("validate_year accepts valid years for all datasets", {
  expect_silent(validate_year(2023, "censo_escolar"))
  expect_silent(validate_year(1995, "censo_escolar"))
  expect_silent(validate_year(2024, "enem"))
  expect_silent(validate_year(1998, "enem"))
  expect_silent(validate_year(2021, "saeb"))
  expect_silent(validate_year(2023, "censo_superior"))
  expect_silent(validate_year(2023, "enade"))
  expect_silent(validate_year(2023, "encceja"))
  expect_silent(validate_year(2023, "idd"))
  expect_silent(validate_year(2023, "cpc"))
  expect_silent(validate_year(2023, "igc"))
  expect_silent(validate_year(2023, "capes"))
  expect_silent(validate_year(2023, "ideb"))
  expect_silent(validate_year(2026, "fundeb"))
})

test_that("validate_year rejects invalid years", {
  expect_error(validate_year(1990, "censo_escolar"), "not available")
  expect_error(validate_year(1990, "enem"), "not available")
  expect_error(validate_year(2020, "saeb"), "not available")
  expect_error(validate_year(2008, "censo_superior"), "not available")
  expect_error(validate_year(2003, "enade"), "not available")
  expect_error(validate_year(2013, "encceja"), "not available")
  expect_error(validate_year(2020, "idd"), "not available")
  expect_error(validate_year(2020, "cpc"), "not available")
  expect_error(validate_year(2020, "igc"), "not available")
  expect_error(validate_year(2012, "capes"), "not available")
  expect_error(validate_year(2016, "ideb"), "not available")
  expect_error(validate_year(2006, "fundeb"), "not available")
})

test_that("validate_year returns year invisibly on success", {
  # check the visibility promised by the test title, then the value itself
  expect_invisible(validate_year(2023, "censo_escolar"))
  returned_year <- validate_year(2023, "censo_escolar")
  expect_equal(returned_year, 2023)
})

# --- fallback_years / available_years ----------------------------------------

test_that("fallback_years returns correct years for all datasets", {
  expect_equal(fallback_years("censo_escolar"), 1995:2024)
  expect_equal(fallback_years("enem"), 1998:2024)
  expect_equal(
    fallback_years("saeb"),
    c(2011L, 2013L, 2015L, 2017L, 2019L, 2021L, 2023L)
  )
  expect_equal(fallback_years("censo_superior"), 2009:2024)
  expect_equal(fallback_years("enade"), c(2004L:2019L, 2021L:2023L))
  expect_equal(fallback_years("encceja"), 2014:2024)
  expect_equal(fallback_years("idd"), c(2014L:2019L, 2021L:2023L))
  expect_equal(fallback_years("cpc"), c(2007L:2019L, 2021L:2023L))
  expect_equal(fallback_years("igc"), c(2007L:2019L, 2021L:2023L))
  expect_equal(fallback_years("fundeb_enrollment"), 2017:2018)
  expect_false(2020 %in% fallback_years("idd"))
  expect_false(2020 %in% fallback_years("enade"))
})

test_that("available_years returns non-empty sorted integer vector", {
  # available_years may use dynamic discovery or fallback depending on network
  # so we only test general properties, not exact values
  datasets <- c(
    "censo_escolar", "enem", "saeb", "censo_superior", "enade",
    "encceja", "idd", "cpc", "igc", "capes", "ideb", "fundeb"
  )
  for (ds in datasets) {
    ds_label <- paste("dataset:", ds)
    yrs <- available_years(ds)
    expect_true(length(yrs) > 0, info = ds_label)
    expect_true(is.numeric(yrs), info = ds_label)
    expect_equal(yrs, sort(yrs), info = ds_label)
  }
})

test_that("available_years accepts fundeb_enrollment as dataset", {
  yrs <- available_years("fundeb_enrollment")
  expect_true(length(yrs) > 0)
  expect_true(is.numeric(yrs))
})

test_that("available_years rejects invalid dataset", {
  # message-agnostic check: only requires that some error is raised
  expect_error(available_years("nonexistent"))
})

test_that("available_years returns correct years for capes", {
  yrs <- available_years("capes")
  expect_true(2013 %in% yrs)
  expect_true(2024 %in% yrs)
})

test_that("available_years returns correct years for ideb", {
  yrs <- available_years("ideb")
  expect_equal(yrs, c(2017L, 2019L, 2021L, 2023L))
})

test_that("available_years returns correct years for fundeb", {
  yrs <- available_years("fundeb")
  expect_true(2007 %in% yrs)
  expect_true(2026 %in% yrs)
})

test_that("available_years errors for invalid dataset", {
  expect_error(available_years("nonexistent"), "should be one of")
})

# --- standardize_names -------------------------------------------------------

test_that("standardize_names converts to lowercase", {
  input <- data.frame(UPPERCASE = 1, MixedCase = 2, check.names = FALSE)
  cleaned <- standardize_names(input)
  expect_equal(names(cleaned), c("uppercase", "mixedcase"))
})

test_that("standardize_names replaces spaces with underscores", {
  input <- data.frame(
    `Column Name` = 1,
    `Another Col` = 2,
    check.names = FALSE
  )
  cleaned <- standardize_names(input)
  expect_equal(names(cleaned), c("column_name", "another_col"))
})

test_that("standardize_names replaces dots with underscores", {
  input <- data.frame(
    col.name = 1,
    another.col.here = 2,
    check.names = FALSE
  )
  cleaned <- standardize_names(input)
  expect_equal(names(cleaned), c("col_name", "another_col_here"))
})

test_that("standardize_names removes accents", {
  input <- data.frame(
    "\u00e9cole" = 1,
    "institui\u00e7\u00e3o" = 2,
    check.names = FALSE
  )
  cleaned <- standardize_names(input)
  expect_true(all(!grepl("[\u00e9\u00e7\u00e3]", names(cleaned))))
  expect_match(names(cleaned)[1], "^[a-z_]+$")
})

test_that("standardize_names collapses consecutive underscores", {
  # both names contain a run of separator characters (two spaces, three
  # dashes) so that each column exercises the collapsing behaviour
  input <- data.frame(`a  b` = 1, `c---d` = 2, check.names = FALSE)
  cleaned <- standardize_names(input)
  expect_false(any(grepl("__", names(cleaned))))
})

test_that("standardize_names removes leading and trailing underscores", {
  input <- data.frame(` leading` = 1, `trailing ` = 2, check.names = FALSE)
  cleaned <- standardize_names(input)
  expect_false(any(grepl("^_|_$", names(cleaned))))
})

test_that("standardize_names handles mixed special characters", {
  input <- data.frame(
    `Col (Name) #1!` = 1,
    check.names = FALSE
  )
  cleaned <- standardize_names(input)
  expect_match(names(cleaned), "^[a-z0-9_]+$")
})

# --- uf_to_code --------------------------------------------------------------

test_that("uf_to_code converts all 27 UF abbreviations", {
  # expected numeric code for each UF abbreviation
  uf_expected <- c(
    RO = 11, AC = 12, AM = 13, RR = 14, PA = 15, AP = 16, TO = 17,
    MA = 21, PI = 22, CE = 23, RN = 24, PB = 25, PE = 26, AL = 27,
    SE = 28, BA = 29, MG = 31, ES = 32, RJ = 33, SP = 35, PR = 41,
    SC = 42, RS = 43, MS = 50, MT = 51, GO = 52, DF = 53
  )

  for (uf_name in names(uf_expected)) {
    expect_equal(
      uf_to_code(uf_name),
      uf_expected[[uf_name]],
      info = paste("UF:", uf_name)
    )
  }
})

test_that("uf_to_code is case insensitive", {
  expect_equal(uf_to_code("sp"), 35)
  expect_equal(uf_to_code("Sp"), 35)
  expect_equal(uf_to_code("SP"), 35)
})

test_that("uf_to_code passes through numeric codes", {
  expect_equal(uf_to_code(35), 35)
  expect_equal(uf_to_code(33), 33)
  expect_equal(uf_to_code(11), 11)
})

test_that("uf_to_code rejects invalid UF", {
  expect_error(uf_to_code("XX"), "invalid UF")
  expect_error(uf_to_code("ZZ"), "invalid UF")
})

# --- parse_sas_dates ---------------------------------------------------------

test_that("parse_sas_dates converts dt_ columns with SAS datetime format", {
  input <- data.frame(
    dt_criacao = c("12FEB2024:00:00:00", "01JAN2023:00:00:00"),
    co_entidade = c(1, 2),
    stringsAsFactors = FALSE
  )
  parsed <- educabR:::parse_sas_dates(input)
  expect_s3_class(parsed$dt_criacao, "Date")
  expect_equal(parsed$dt_criacao[1], as.Date("2024-02-12"))
  expect_equal(parsed$dt_criacao[2], as.Date("2023-01-01"))
})

test_that("parse_sas_dates converts dh_ columns", {
  input <- data.frame(
    dh_alteracao = c("15MAR2023:10:30:00"),
    no_escola = "test",
    stringsAsFactors = FALSE
  )
  parsed <- educabR:::parse_sas_dates(input)
  expect_s3_class(parsed$dh_alteracao, "Date")
  expect_equal(parsed$dh_alteracao, as.Date("2023-03-15"))
})

test_that("parse_sas_dates leaves non-date columns unchanged", {
  input <- data.frame(
    co_escola = 12345,
    no_escola = "Test School",
    dt_data = "01JAN2023:00:00:00",
    stringsAsFactors = FALSE
  )
  parsed <- educabR:::parse_sas_dates(input)
  expect_type(parsed$co_escola, "double")
  expect_type(parsed$no_escola, "character")
  expect_equal(parsed$no_escola, "Test School")
})

test_that("parse_sas_dates skips non-character dt_ columns", {
  input <- data.frame(
    dt_numeric = 12345,
    co_escola = 1,
    stringsAsFactors = FALSE
  )
  parsed <- educabR:::parse_sas_dates(input)
  # numeric dt_ column should remain numeric
  expect_type(parsed$dt_numeric, "double")
  expect_equal(parsed$dt_numeric, 12345)
})

test_that("parse_sas_dates handles df with no date columns", {
  input <- data.frame(co_escola = 1, no_escola = "A", stringsAsFactors = FALSE)
  parsed <- educabR:::parse_sas_dates(input)
  expect_equal(parsed, input)
})

# --- find_censo_file ---------------------------------------------------------
# Every test below builds a throw-away directory tree; local_tempdir() creates
# it and removes it (recursively) when the test finishes.

test_that("find_censo_file finds file matching year pattern", {
  root_dir <- withr::local_tempdir(pattern = "censo_find_")
  dados_dir <- file.path(root_dir, "DADOS")
  dir.create(dados_dir, recursive = TRUE)
  file.create(file.path(dados_dir, "microdados_ed_basica_2023.csv"))
  file.create(file.path(dados_dir, "suplemento_2023.csv"))

  picked <- educabR:::find_censo_file(root_dir, 2023)
  expect_match(basename(picked), "microdados_ed_basica_2023\\.csv")
})

test_that("find_censo_file prefers non-suplemento file", {
  root_dir <- withr::local_tempdir(pattern = "censo_pref_")
  dados_dir <- file.path(root_dir, "DADOS")
  dir.create(dados_dir, recursive = TRUE)
  file.create(file.path(dados_dir, "microdados_ed_basica_2023.csv"))
  file.create(file.path(dados_dir, "microdados_ed_basica_suplemento_2023.csv"))

  picked <- educabR:::find_censo_file(root_dir, 2023)
  is_suplemento <- grepl("suplemento", basename(picked), ignore.case = TRUE)
  expect_false(is_suplemento)
})

test_that("find_censo_file falls back to broader pattern", {
  root_dir <- withr::local_tempdir(pattern = "censo_fallback_")
  dados_dir <- file.path(root_dir, "DADOS")
  dir.create(dados_dir, recursive = TRUE)
  # no year-specific file, but a generic microdados file
  file.create(file.path(dados_dir, "microdados_ed_basica.csv"))

  picked <- educabR:::find_censo_file(root_dir, 2023)
  expect_match(basename(picked), "microdados_ed_basica\\.csv")
})

test_that("find_censo_file falls back to just 'microdados' pattern", {
  root_dir <- withr::local_tempdir(pattern = "censo_micro_")
  dados_dir <- file.path(root_dir, "DADOS")
  dir.create(dados_dir, recursive = TRUE)
  file.create(file.path(dados_dir, "microdados.csv"))

  picked <- educabR:::find_censo_file(root_dir, 2023)
  expect_match(basename(picked), "microdados\\.csv")
})

test_that("find_censo_file errors when no files found", {
  root_dir <- withr::local_tempdir(pattern = "censo_empty_")
  file.create(file.path(root_dir, "readme.txt"))

  expect_error(educabR:::find_censo_file(root_dir, 2023), "no data file found")
})

# --- clean_dash_values (from get-censo-superior.R) ---------------------------
test_that("clean_dash_values replaces hyphens with NA", {
  input <- data.frame(
    a = c("value", "-", "other"),
    b = c(1, 2, 3),
    stringsAsFactors = FALSE
  )
  cleaned <- educabR:::clean_dash_values(input)
  expect_true(is.na(cleaned$a[2]))
  expect_equal(cleaned$a[1], "value")
  expect_equal(cleaned$a[3], "other")
})

test_that("clean_dash_values replaces en-dash and em-dash", {
  input <- data.frame(
    a = c("\u2013", "\u2014", "ok"),
    stringsAsFactors = FALSE
  )
  cleaned <- educabR:::clean_dash_values(input)
  expect_true(is.na(cleaned$a[1]))
  expect_true(is.na(cleaned$a[2]))
  expect_equal(cleaned$a[3], "ok")
})

test_that("clean_dash_values leaves non-character columns unchanged", {
  input <- data.frame(
    num_col = c(1, 2, 3),
    char_col = c("-", "a", "b"),
    stringsAsFactors = FALSE
  )
  cleaned <- educabR:::clean_dash_values(input)
  expect_equal(cleaned$num_col, c(1, 2, 3))
  expect_true(is.na(cleaned$char_col[1]))
})

# --- clean_ideb_values (from get-ideb.R) -------------------------------------

test_that("clean_ideb_values replaces - and ND with NA in vl_ columns", {
  input <- data.frame(
    vl_nota = c("-", "ND", "5,5", "7,2"),
    sg_uf = c("SP", "RJ", "MG", "BA"),
    stringsAsFactors = FALSE
  )
  cleaned <- educabR:::clean_ideb_values(input)
  expect_true(is.na(cleaned$vl_nota[1]))
  expect_true(is.na(cleaned$vl_nota[2]))
  expect_equal(cleaned$vl_nota[3], 5.5)
  expect_equal(cleaned$vl_nota[4], 7.2)
})

test_that("clean_ideb_values fixes comma decimal separator", {
  input <- data.frame(
    vl_ideb = c("6,5", "7,8"),
    stringsAsFactors = FALSE
  )
  cleaned <- educabR:::clean_ideb_values(input)
  expect_type(cleaned$vl_ideb, "double")
  expect_equal(cleaned$vl_ideb, c(6.5, 7.8))
})

test_that("clean_ideb_values leaves non-vl columns unchanged", {
  input <- data.frame(
    vl_score = c("5,0"),
    no_escola = c("Test School"),
    co_escola = 12345,
    stringsAsFactors = FALSE
  )
  cleaned <- educabR:::clean_ideb_values(input)
  expect_equal(cleaned$no_escola, "Test School")
  expect_equal(cleaned$co_escola, 12345)
  expect_equal(cleaned$vl_score, 5.0)
})

# --- convert_faixa_columns (from get-cpc.R) ----------------------------------

test_that("convert_faixa_columns converts faixa columns to numeric", {
  input <- data.frame(
    cpc_faixa = c("3", "4", "5"),
    nome = c("a", "b", "c"),
    stringsAsFactors = FALSE
  )
  converted <- educabR:::convert_faixa_columns(input)
  expect_type(converted$cpc_faixa, "double")
  expect_equal(converted$cpc_faixa, c(3, 4, 5))
})

test_that("convert_faixa_columns converts SC to NA", {
  input <- data.frame(
    cpc_faixa = c("3", "SC", "5"),
    stringsAsFactors = FALSE
  )
  converted <- educabR:::convert_faixa_columns(input)
  expect_equal(converted$cpc_faixa[1], 3)
  expect_true(is.na(converted$cpc_faixa[2]))
  expect_equal(converted$cpc_faixa[3], 5)
})

test_that("convert_faixa_columns only affects _faixa columns", {
  input <- data.frame(
    enade_faixa = c("4"),
    nome_curso = c("Eng"),
    stringsAsFactors = FALSE
  )
  converted <- educabR:::convert_faixa_columns(input)
  expect_type(converted$enade_faixa, "double")
  expect_type(converted$nome_curso, "character")
})

test_that("convert_faixa_columns handles df without faixa columns", {
  input <- data.frame(a = 1, b = "x", stringsAsFactors = FALSE)
  converted <- educabR:::convert_faixa_columns(input)
  expect_equal(converted, input)
})

# --- read_excel_safe (from get-cpc.R) ----------------------------------------

test_that("read_excel_safe errors on invalid file", {
  expect_error(
    educabR:::read_excel_safe("nonexistent_file.xlsx"),
    "failed to read Excel file"
  )
})

# --- read_ideb_excel (from get-ideb.R) ---------------------------------------

test_that("read_ideb_excel errors on nonexistent file", {
  skip_if_not_installed("readxl")
  # Only the failure path is covered here: the path does not exist, so the
  # read must raise an error. The skip/header offset used for real IDEB
  # spreadsheets is not exercised by this test.
  expect_error(
    educabR:::read_ideb_excel("nonexistent_file.xlsx")
  )
})

# --- list_ideb_available (from get-ideb.R) -----------------------------------

test_that("list_ideb_available returns expected structure", {
  listing <- list_ideb_available()
  expect_s3_class(listing, "tbl_df")
  expect_true(all(c("year", "level", "stage") %in% names(listing)))
  expect_gt(nrow(listing), 0)
})