test_data <- readRDS( system.file("extdata", "test_df.RDS", package = "cleanepi") ) test_dictionary <- readRDS( system.file("extdata", "test_dictionary.RDS", package = "cleanepi") ) test_that("clean_data works as expected with the default parameters", { cleaned_data <- clean_data( data = test_data ) expect_s3_class(cleaned_data, "data.frame") expect_identical(nrow(cleaned_data), 10L) expect_identical(ncol(cleaned_data), 5L) expect_identical( names(cleaned_data), c("study_id", "date_of_admission", "date_of_birth", "date_first_pcr_positive_test", "sex") ) }) # DEFINING THE CLEANING PARAMETERS replace_missing_values <- list(target_columns = NULL, na_strings = "-99") standardize_column_names <- list(keep = NULL, rename = NULL) remove_duplicates <- list(target_columns = NULL) standardize_dates <- list( target_columns = NULL, error_tolerance = 0.4, format = NULL, timeframe = as.Date(c("1973-05-29", "2023-05-29")), orders = list( named_months = c("Ybd", "dby"), digit_months = c("dmy", "Ymd"), US_formats = c("Omdy", "YOmd") ) ) standardize_subject_ids <- list( target_columns = "study_id", prefix = "PS", suffix = "P2", range = c(1L, 100L), nchar = 7L ) to_numeric <- list(target_columns = "sex", lang = "en") remove_constants <- list(cutoff = 1.0) check_date_sequence <- list( target_columns = c("date_first_pcr_positive_test", "date.of.admission") ) test_that("clean_data works as expected", { cleaned_data <- clean_data( data = test_data, standardize_column_names = standardize_column_names, remove_constants = list(cutoff = 1.0), replace_missing_values = replace_missing_values, remove_duplicates = remove_duplicates, standardize_dates = standardize_dates, standardize_subject_ids = standardize_subject_ids, to_numeric = to_numeric, dictionary = test_dictionary, check_date_sequence = check_date_sequence ) expect_s3_class(cleaned_data, "data.frame") expect_identical(nrow(cleaned_data), 10L) expect_identical(ncol(cleaned_data), 5L) expect_false("-99" %in% as.vector(as.matrix(cleaned_data))) }) test_that("cleaned_data works in a pipable way", { cleaned_data <- test_data %>% standardize_column_names(keep = NULL, rename = NULL) %>% replace_missing_values(target_columns = NULL, na_strings = "-99") %>% remove_constants(cutoff = 1.0) %>% remove_duplicates(target_columns = NULL) %>% standardize_dates( target_columns = NULL, error_tolerance = 0.4, format = NULL, timeframe = as.Date(c("1973-05-29", "2023-05-29")) ) %>% check_subject_ids( target_columns = "study_id", prefix = "PS", suffix = "P2", range = c(1L, 100L), nchar = 7L ) %>% convert_to_numeric(target_columns = "sex", lang = "en") %>% clean_using_dictionary(dictionary = test_dictionary) expect_s3_class(cleaned_data, "data.frame") expect_identical(nrow(cleaned_data), 10L) expect_identical(ncol(cleaned_data), 5L) expect_false("-99" %in% as.vector(as.matrix(cleaned_data))) }) test_that("cleaned_data works in a pipable way even when old column names are used", { cleaned_data <- test_data %>% standardize_column_names( keep = NULL, rename = c(DOB = "dateOfBirth") ) %>% standardize_dates( target_columns = c("dateOfBirth", "date_of_admission") ) expect_s3_class(cleaned_data, "data.frame") expect_identical(nrow(cleaned_data), 10L) expect_identical(class(cleaned_data[["DOB"]]), "Date") expect_identical(class(cleaned_data[["date_of_admission"]]), "Date") }) test_that("clean_data fails as expected", { standardize_subject_ids[["target_columns"]] <- NULL expect_error( clean_data( data = test_data, standardize_subject_ids = standardize_subject_ids ), regexp = cat("'target_columns' must be provided.") ) expect_error( test_data %>% standardize_column_names( keep = NULL, rename = "dateOfBirth = DOB" ) %>% standardize_dates( target_columns = c("dateOfBirth", "fake_column_name", "date_of_admission") ) ) })