# Tests for `clean_data()` and for the equivalent step-by-step pipeline built
# from the individual cleaning functions. The fixtures are the example data
# frame and dictionary shipped in the package's `extdata` folder.

test_data <- readRDS(
  system.file("extdata", "test_df.RDS", package = "cleanepi")
)
test_dictionary <- readRDS(
  system.file("extdata", "test_dictionary.RDS", package = "cleanepi")
)

test_that("clean_data works as expected with the default parameters", {
  cleaned_data <- clean_data(
    data = test_data,
    params = NULL
  )
  expect_s3_class(cleaned_data, "data.frame")
  expect_identical(nrow(cleaned_data), 10L)
  expect_identical(ncol(cleaned_data), 5L)
})

# DEFINING THE CLEANING PARAMETERS ----
# Each list below holds the arguments for one cleaning operation; they are
# assembled into `params` and handed to `clean_data()` in the tests that follow.
use_na <- list(target_columns = NULL, na_strings = "-99")
standardize_col_names <- list(keep = NULL, rename = NULL)
remove_duplicates <- list(target_columns = NULL)
standardize_dates <- list(
  target_columns = NULL,
  error_tolerance = 0.4,
  format = NULL,
  timeframe = as.Date(c("1973-05-29", "2023-05-29")),
  orders = list(
    named_months = c("Ybd", "dby"),
    digit_months = c("dmy", "Ymd"),
    US_formats = c("Omdy", "YOmd")
  ),
  modern_excel = TRUE
)
standardize_subject_ids <- list(
  target_columns = "study_id",
  prefix = "PS",
  suffix = "P2",
  range = c(1L, 100L),
  nchar = 7L
)
to_numeric <- list(target_columns = "sex", lang = "en")
check_date_sequence <- list(
  target_columns = c("date_first_pcr_positive_test", "date.of.admission")
)

params <- list(
  standardize_column_names = standardize_col_names,
  remove_constants = list(cutoff = 1.0),
  replace_missing_values = use_na,
  remove_duplicates = remove_duplicates,
  standardize_dates = standardize_dates,
  standardize_subject_ids = standardize_subject_ids,
  to_numeric = to_numeric,
  dictionary = test_dictionary,
  check_date_sequence = check_date_sequence
)

test_that("clean_data works as expected", {
  cleaned_data <- clean_data(
    data = test_data,
    params = params
  )
  expect_s3_class(cleaned_data, "data.frame")
  expect_identical(nrow(cleaned_data), 10L)
  expect_identical(ncol(cleaned_data), 5L)
  # The "-99" missing-value marker must not appear in any cell of the output.
  expect_false("-99" %in% as.vector(as.matrix(cleaned_data)))
})

test_that("cleaned_data works in a pipable way", {
  cleaned_data <- test_data |>
    standardize_column_names(keep = NULL, rename = NULL) |>
    replace_missing_values(target_columns = NULL, na_strings = "-99") |>
    remove_constants(cutoff = 1.0) |>
    remove_duplicates(target_columns = NULL) |>
    standardize_dates(
      target_columns = NULL,
      error_tolerance = 0.4,
      format = NULL,
      timeframe = as.Date(c("1973-05-29", "2023-05-29"))
    ) |>
    check_subject_ids(
      target_columns = "study_id",
      prefix = "PS",
      suffix = "P2",
      range = c(1L, 100L),
      nchar = 7L
    ) |>
    convert_to_numeric(target_columns = "sex", lang = "en") |>
    clean_using_dictionary(dictionary = test_dictionary)

  expect_s3_class(cleaned_data, "data.frame")
  expect_identical(nrow(cleaned_data), 10L)
  expect_identical(ncol(cleaned_data), 5L)
  expect_false("-99" %in% as.vector(as.matrix(cleaned_data)))
})

test_that(
  "cleaned_data works in a pipable way even when old column names are used",
  {
    # `dateOfBirth` is renamed to `DOB` first, yet the original name is still
    # passed to `standardize_dates()`; the converted column is read back as
    # `DOB` below.
    cleaned_data <- test_data |>
      standardize_column_names(keep = NULL, rename = c(DOB = "dateOfBirth")) |>
      standardize_dates(
        target_columns = c("dateOfBirth", "date_of_admission")
      )

    expect_s3_class(cleaned_data, "data.frame")
    expect_identical(nrow(cleaned_data), 10L)
    expect_identical(class(cleaned_data[["DOB"]]), "Date")
    expect_identical(class(cleaned_data[["date_of_admission"]]), "Date")
  }
)

test_that("clean_data fails as expected", {
  # Assigning NULL drops `target_columns` from this test's local copy of
  # `params`; the file-level `params` object is left untouched.
  params[["standardize_subject_ids"]][["target_columns"]] <- NULL

  # The pattern is passed as a plain string. It was previously wrapped in
  # `cat()`, which prints its argument and returns NULL, so the error message
  # was never actually compared. `fixed = TRUE` matches the quotes and the
  # trailing period literally.
  # NOTE(review): confirm this text matches the message raised by clean_data().
  expect_error(
    clean_data(data = test_data, params = params),
    regexp = "'target_columns' must be provided.",
    fixed = TRUE
  )

  # NOTE(review): no message is checked here, so any error raised anywhere in
  # the pipeline (including one triggered by the `rename` argument) satisfies
  # this expectation.
  expect_error(
    test_data |>
      standardize_column_names(keep = NULL, rename = "dateOfBirth = DOB") |>
      standardize_dates(
        target_columns = c(
          "dateOfBirth",
          "fake_column_name",
          "date_of_admission"
        )
      )
  )
})