set.seed(123) d <- sample.int(10, size = 500, replace = TRUE) test_that("recode median", { expect_identical(categorize(d), ifelse(d >= median(d), 2, 1)) expect_identical(categorize(d, lowest = 0), as.numeric(d >= median(d))) }) test_that("recode mean", { expect_identical(categorize(d, split = "mean"), ifelse(d >= mean(d), 2, 1)) expect_identical(categorize(d, split = "mean", lowest = 0), as.numeric(d >= mean(d))) }) test_that("recode quantile", { expect_error(categorize(d, split = "quantile")) q <- quantile(d, probs = c(1 / 3, 2 / 3, 1)) f <- cut(d, breaks = unique(c(min(d), q, max(d))), include.lowest = TRUE, right = FALSE) levels(f) <- 1:nlevels(f) expect_identical(categorize(d, split = "quantile", n_groups = 3), as.numeric(f)) expect_identical(categorize(d, split = "quantile", n_groups = 3, lowest = 0), as.numeric(f) - 1) }) set.seed(123) d <- sample.int(100, size = 1000, replace = TRUE) test_that("recode range", { expect_error(categorize(d, split = "range")) d2 <- d d2[d <= 20] <- 1 d2[d > 20 & d <= 40] <- 2 d2[d > 40 & d <= 60] <- 3 d2[d > 60 & d <= 80] <- 4 d2[d > 80] <- 5 expect_equal(table(categorize(d, split = "equal_range", range = 20)), table(d2), ignore_attr = TRUE) expect_equal( table(categorize( d, split = "equal_range", range = 20, lowest = 1 )), table(d2), ignore_attr = TRUE ) d2 <- d d2[d < 20] <- 0 d2[d >= 20 & d < 40] <- 1 d2[d >= 40 & d < 60] <- 2 d2[d >= 60 & d < 80] <- 3 d2[d >= 80] <- 4 expect_equal( table(categorize( d, split = "equal_range", range = 20, lowest = 0 )), table(d2), ignore_attr = TRUE ) }) test_that("recode length", { expect_error(categorize(d, split = "equal_length")) d2 <- d d2[d <= 20] <- 1 d2[d > 20 & d <= 40] <- 2 d2[d > 40 & d <= 60] <- 3 d2[d > 60 & d <= 80] <- 4 d2[d > 80] <- 5 expect_equal(table(categorize(d, split = "equal_length", n_groups = 5)), table(d2), ignore_attr = TRUE) expect_equal( table(categorize( d, split = "equal_length", n_groups = 5, lowest = 1 )), table(d2), ignore_attr = TRUE ) }) set.seed(123) x <- sample.int(10, size = 30, replace = TRUE) test_that("recode factor labels", { expect_type(categorize(x, "equal_length", n_groups = 3), "double") expect_s3_class(categorize(x, "equal_length", n_groups = 3, labels = c("low", "mid", "high")), "factor") expect_identical( levels(categorize( x, "equal_length", n_groups = 3, labels = c("low", "mid", "high") )), c("low", "mid", "high") ) t1 <- table(categorize(x, "equal_length", n_groups = 3)) t2 <- table(categorize(x, "equal_length", n_groups = 3, labels = c("low", "mid", "high"))) expect_equal(t1, t2, ignore_attr = TRUE) }) test_that("recode data frame", { data(iris) x <- iris out <- categorize(x, split = "median", select = c("Sepal.Length", "Sepal.Width")) expect_s3_class(out, "data.frame") expect_identical(out$Sepal.Length, ifelse(iris$Sepal.Length >= median(iris$Sepal.Length), 2, 1)) expect_identical(out$Petal.Length, iris$Petal.Length) out <- categorize(x, split = "median", select = starts_with("Sepal")) expect_s3_class(out, "data.frame") expect_identical(out$Sepal.Length, ifelse(iris$Sepal.Length >= median(iris$Sepal.Length), 2, 1)) expect_identical(out$Petal.Length, iris$Petal.Length) out <- categorize(x, split = "median", select = ~ Sepal.Width + Sepal.Length) expect_s3_class(out, "data.frame") expect_identical(out$Sepal.Length, ifelse(iris$Sepal.Length >= median(iris$Sepal.Length), 2, 1)) expect_identical(out$Petal.Length, iris$Petal.Length) out <- categorize(x, split = "median", select = Sepal.Length) expect_s3_class(out, "data.frame") expect_identical(out$Sepal.Length, ifelse(iris$Sepal.Length >= median(iris$Sepal.Length), 2, 1)) expect_identical(out$Petal.Length, iris$Petal.Length) expect_warning( expect_warning( out <- categorize(x, split = "median", select = c("sepal.Length", "sepal.Width"), ignore_case = FALSE), "not found" ), "not found" ) expect_identical(out$Sepal.Length, iris$Sepal.Length) out <- categorize(x, split = "median", select = starts_with("sepal"), ignore_case = TRUE) expect_s3_class(out, "data.frame") expect_identical(out$Sepal.Length, ifelse(iris$Sepal.Length >= median(iris$Sepal.Length), 2, 1)) expect_identical(out$Petal.Length, iris$Petal.Length) out <- categorize(x, split = "median", select = starts_with("sepal"), ignore_case = FALSE) expect_identical(out$Sepal.Length, iris$Sepal.Length) out <- categorize(x, split = "median", select = starts_with("sepal"), ignore_case = TRUE, append = "_r") expect_identical(colnames(out), c( "Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width", "Species", "Sepal.Length_r", "Sepal.Width_r" )) out <- categorize(iris, split = "median", select = starts_with("Sepal")) expect_identical( out$Sepal.Length, c( 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 2, 1, 2, 1, 2, 1, 1, 2, 2, 2, 1, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 2, 2, 1, 2, 2, 2, 1, 1, 1, 2, 2, 1, 1, 1, 1, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 ) ) skip_if_not_installed("poorman") x <- poorman::group_by(iris, Species) out <- categorize(x, split = "median", select = starts_with("Sepal")) expect_identical( out$Sepal.Length, c( 2, 1, 1, 1, 2, 2, 1, 2, 1, 1, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 1, 1, 2, 2, 2, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 2, 1, 2, 2, 2, 2, 2, 1, 2, 1, 2, 1, 2, 1, 1, 2, 2, 2, 1, 2, 1, 1, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 1, 2, 2, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 2, 2, 1, 2, 2, 2, 2, 1, 2, 1, 1, 1, 2, 2, 2, 1, 2, 1, 2, 1, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 1, 1 ) ) }) test_that("recode all NA", { x <- rep(NA, 10) expect_message( y <- categorize(x), "can't be recoded" ) expect_identical(y, x) x <- rep(NA_real_, 10) expect_message( y <- categorize(x), "only missing values" ) expect_identical(y, x) }) test_that("recode numeric", { expect_identical( categorize(mtcars$hp, split = c(100, 150)), c( 2, 2, 1, 2, 3, 2, 3, 1, 1, 2, 2, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 3, 3, 3, 3, 1, 1, 2, 3, 3, 3, 2 ) ) x <- mtcars$hp x[mtcars$hp < 100] <- 1 x[mtcars$hp >= 100 & mtcars$hp < 150] <- 2 x[mtcars$hp >= 150] <- 3 expect_identical(categorize(mtcars$hp, split = c(100, 150)), x) expect_identical(categorize(mtcars$hp, split = c(100, 150), lowest = NULL), x) expect_identical( categorize(mtcars$hp, split = "equal_range", range = 50, lowest = NULL), c( 2, 2, 1, 2, 3, 2, 4, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 1, 1, 1, 1, 2, 2, 4, 3, 1, 1, 2, 5, 3, 6, 2 ) ) }) # select helpers ------------------------------ test_that("categorize regex", { expect_identical( categorize(mtcars, select = "pg", regex = TRUE), categorize(mtcars, select = "mpg") ) }) # labelling ranges ------------------------------ test_that("categorize labelling ranged", { data(mtcars) expect_snapshot(categorize(mtcars$mpg, "equal_length", n_groups = 5)) expect_snapshot(categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "range")) expect_snapshot(categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "observed")) }) test_that("categorize breaks", { data(mtcars) expect_snapshot(categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "range", breaks = "inclusive")) expect_error( categorize(mtcars$mpg, "equal_length", n_groups = 5, breaks = "something"), regex = "should be one of" ) })