library(dplyr) library(testthat) # options("DEBUG_DATABASECONNECTOR_DBPLYR" = TRUE) testDbplyrFunctions <- function(connectionDetails, cdmDatabaseSchema) { assertTempEmulationSchemaSet(connectionDetails$dbms) connection <- connect(connectionDetails) on.exit(dropEmulatedTempTables(connection)) on.exit(disconnect(connection), add = TRUE) if ("person" %in% getTableNames(connection, cdmDatabaseSchema, cast = "none")) { personTableName <- "person" observationPeriodTableName <- "observation_period" } else if ("PERSON" %in% getTableNames(connection, cdmDatabaseSchema, cast = "none")) { personTableName <- "PERSON" observationPeriodTableName <- "OBSERVATION_PERIOD" } else { stop("person table not found in cdm schema!") } person <- tbl(connection, inDatabaseSchema(cdmDatabaseSchema, personTableName)) %>% rename_all(tolower) observationPeriod <- tbl(connection, inDatabaseSchema(cdmDatabaseSchema, observationPeriodTableName)) %>% rename_all(tolower) # Test filter, arrange, relocate, distinct ----------------------------------- nMales <- person %>% filter(.data$gender_concept_id == 8507) %>% count() %>% pull() expect_gt(nMales, 1) nMales2 <- person %>% filter(gender_concept_id %in% c(8507)) %>% count() %>% pull() expect_gt(nMales2, 1) # longestObsPeriod <- observationPeriod %>% # mutate(duration = dateDiff("day", observation_period_start_date, observation_period_end_date)) %>% # arrange(desc(duration)) %>% # # relocate(duration) %>% # relocate of field containing custom function no longer works in dbplr 2.3.1. # head(1) %>% # collect() # # expect_gt(longestObsPeriod$duration, 1) # translation of year() not correct on sqlite if (!(dbms(connection) %in% c("sqlite"))) { topAges <- person %>% inner_join(observationPeriod, by = "person_id") %>% mutate(age = year(observation_period_start_date) - year_of_birth) %>% distinct(age) %>% rename(person_age = age) %>% arrange(desc(person_age)) %>% head(10) %>% collect() expect_gt(nrow(topAges), 1) } # Test copy_inline ----------------------------------------------------------- # wrong translation, we can solve it locally (see backend-DatabaseConnector, commented out lines), but solution needs to be included in dbplyr if (!(dbms(connection) %in% c("redshift", "oracle", "sql server", "snowflake", "bigquery"))) { rows <- dbplyr::copy_inline(connection, mtcars) %>% filter(hp > 200) %>% arrange(wt, mpg) %>% collect() rows2 <- mtcars %>% filter(hp > 200) %>% arrange(wt, mpg) %>% collect() expect_equivalent(rows, rows2, tolerance = 1e-6) } # Test slicing --------------------------------------------------------------- personSample <- person %>% slice_sample(n = 10) %>% relocate(care_site_id) %>% collect() expect_equal(nrow(personSample), 10) expect_equal(which(names(personSample) == "care_site_id"), 1) # Test ifelse ---------------------------------------------------------------- sexString <- person %>% mutate(sex = ifelse(.data$gender_concept_id == 8507, "Male", ifelse(.data$gender_concept_id == 8532, "Female", NA) )) %>% select("person_id", "sex") %>% head() %>% collect() expect_true(all(sexString$sex %in% c("Male", "Female"))) # Test creation of temp tables ----------------------------------------------- # issues with temp emulation with oracle and sql server when Analyze happens as part of copy_to if (!(dbms(connection) %in% c("oracle", "sql server", "snowflake", "spark", "bigquery", "postgresql"))) { cars2 <- copy_to(connection, cars, overwrite = TRUE) cars2 <- cars2 %>% collect() expect_equivalent(arrange(cars, speed, dist), arrange(cars2, speed, dist)) tempTable <- person %>% filter(gender_concept_id == 8507) %>% compute() nMales2 <- tempTable %>% count() %>% pull() expect_gt(nMales2, 1) dataWithNa <- cars dataWithNa$speed[2] <- NA dataWithNa <- copy_to(connection, dataWithNa, overwrite = TRUE) filteredRow <- dataWithNa %>% filter(is.na(speed)) %>% collect() expect_equal(nrow(filteredRow), 1) aPersonId <- person %>% head(1) %>% pull(person_id) localTable = tibble(person_id = aPersonId, person_name = "Pedro") remoteTable <- copy_to(connection, localTable, overwrite = TRUE) result <- remoteTable %>% left_join(person, by = join_by(person_id)) %>% collect() expect_equal(result$person_name, "Pedro") } # Test joins and unions ------------------------------------------------------ # Casting duration to numeric because platforms like SQL Server compute the mean by first computing the sum, which # will not fit in an integer: # durationDist <- person %>% # inner_join(observationPeriod, by = "person_id") %>% # mutate(duration = as.numeric(dateDiff("day", observation_period_start_date, observation_period_end_date))) %>% # group_by(gender_concept_id) %>% # summarize(mean_duration = mean(duration, na.rm = TRUE), # min_duration = min(duration, na.rm = TRUE), # max_duration = max(duration, na.rm = TRUE), # count_duration = n()) %>% # collect() # # expect_equal(nrow(durationDist), 2) resultOfAntiJoin <- observationPeriod %>% anti_join( person %>% filter(!is.null(race_concept_id)), by = "person_id" ) %>% group_by(period_type_concept_id) %>% summarize(value_count = n()) %>% collect() expect_s3_class(resultOfAntiJoin, "data.frame") personTwice <- person %>% union_all(person) %>% count() %>% collect() expect_gt(personTwice$n, 1) # Skipping until DatabaseConnector 7 or a new dbplyr version. See #271 # tripleJoin <- person %>% # left_join(observationPeriod, by = join_by(person_id)) %>% # left_join(observationPeriod %>% # select(person_id, dummy = observation_period_start_date), # by = join_by(person_id)) %>% # collect() # expect_gt(nrow(tripleJoin), 0) # Test row_number ------------------------------------------------------------ top10PersonsHardWay <- person %>% mutate(rn = row_number(person_id)) %>% filter(rn <= 10) %>% collect() expect_equal(nrow(top10PersonsHardWay), 10) # Test date functions -------------------------------------------------------- # nObsOverOneYear <- observationPeriod %>% # filter(dateDiff("day", observation_period_start_date, observation_period_end_date) > 365) %>% # count() %>% # pull() # # expect_gt(nObsOverOneYear, 1) ## In redshift eoMonth() must become LAST_DAY(), is not translated correctly. as.integer() is also needed to cast the float to integer for dateAdd. # testData <- observationPeriod %>% # mutate(plus_one_date = dateAdd("day", as.integer(1), observation_period_start_date), # end_of_month_date = eoMonth(observation_period_start_date), # obs_year = year(observation_period_start_date), # obs_month = month(observation_period_start_date), # obs_day = day(observation_period_start_date)) %>% # mutate(is_later = if_else(plus_one_date > observation_period_start_date, 1, 0)) %>% # head(1) %>% # collect() # # expect_equal(as.Date(testData$plus_one_date), dateAdd("day", 1, testData$observation_period_start_date)) # expect_equal(testData$end_of_month_date, eoMonth(testData$observation_period_start_date)) # expect_equal(testData$obs_year, year(testData$observation_period_start_date)) # expect_equal(testData$obs_month, month(testData$observation_period_start_date)) # expect_equal(testData$obs_day, day(testData$observation_period_start_date)) # expect_equal(testData$is_later, 1) # dumbNameCars <- cars # names(dumbNameCars) <- c("Car speed", "Dist. to Stop") # copy_to(connection, dumbNameCars, name = "dn_cars") # disconnect(connection) invisible(NULL) }