library(testthat) library(nc) library(data.table) context("multiple") source(system.file("test_engines.R", package="nc", mustWork=TRUE), local=TRUE) test_engines("only one input variable per column value is an error", { expect_error({ nc::capture_melt_multiple(iris, column=".*[.].*", other=".*") }, "only one input variable for each value captured in column group; typically this happens when the column group matches the entire input column name; fix by changing regex so that column group matches a strict substring (not the entire input column names)", fixed=TRUE) }) iris.dt <- data.table(i=1:nrow(iris), iris) iris.dims <- capture_melt_multiple( iris.dt, part=".*?", "[.]", column=".*") iris.dims.wide <- dcast( iris.dims, i + Species ~ part, value.var=c("Length", "Width")) names(iris.dims.wide) <- sub("(.*?)_(.*)", "\\2.\\1", names(iris.dims.wide)) should.be.orig <- iris.dims.wide[, names(iris.dt), with=FALSE] setkeyv(should.be.orig, key(iris.dt)) test_engines("orig iris.dt and recast data are the same", { expect_equal(should.be.orig, iris.dt) }) iris.dt[, chr := paste(Species)] compare.cols <- c( "Sepal.Length", "Petal.Length", "Sepal.Width", "Petal.Width", "chr") set.seed(1) iris.rand <- iris.dt[sample(.N)] iris.wide <- cbind(treatment=iris.rand[1:75], control=iris.rand[76:150]) test_engines("iris multiple column matches", { nc.melted <- capture_melt_multiple( iris.wide, group="[^.]+", "[.]", column=".*") expect_identical(nc.melted$group, rep(c("control", "treatment"), each=75)) nc.melted.orig <- nc.melted[order(i), ..compare.cols] expect_identical(nc.melted.orig, iris.dt[, ..compare.cols]) }) set.seed(1) iris.wide.rand <- iris.wide[, sample(names(iris.wide)), with=FALSE] test_engines("iris multiple rand column matches", { rand.melted <- capture_melt_multiple( iris.wide.rand, group="[^.]+", "[.]", column=".*" )[order(i)] expect_identical(rand.melted[, ..compare.cols], iris.dt[, ..compare.cols]) }) iris.wide.bad <- data.table(iris.wide) names(iris.wide.bad)[2] <- "treatment.Sepal.bad" test_engines("iris multiple rand column matches", { expect_error({ capture_melt_multiple( iris.wide.bad, group="[^.]+", "[.]", column=".*") }, "need column=same count for each value") }) set.seed(45) DT <- data.table( i_1 = c(1:5, NA), i_2 = c(NA,6:10), f_1 = factor(sample(c(letters[1:3], NA), 6, TRUE)), f_2 = factor(c("z", "a", "x", "c", "x", "x"), ordered=TRUE), c_1 = sample(c(letters[1:3], NA), 6, TRUE), d_1 = as.Date(c(1:3,NA,4:5), origin="2013-09-01"), d_2 = as.Date(6:1, origin="2012-01-01")) ## add a couple of list cols DT[, l_1 := DT[, list(c=list(rep(i_1, sample(5,1)))), by = i_1]$c] DT[, l_2 := DT[, list(c=list(rep(c_1, sample(5,1)))), by = i_1]$c] test_engines("melt multiple column types error if no other group", { expect_error({ capture_melt_multiple(DT, column="^[^c]") }, "need at least one group other than column") }) test_engines("melt multiple column types", { result <- capture_melt_multiple( DT, column="^[^c]", "_", .number="[0-9]", as.integer) expect_identical(result$.number, rep(1:2, each=nrow(DT))) expect_is(result$c_1, "character") expect_is(result$i, "integer") expect_is(result$f, "character") expect_is(result$d, "Date") expect_is(result$l, "list") expect_identical(nrow(result), 12L) }) family.dt <- fread(text="family_id age_mother dob_child1 dob_child2 dob_child3 gender_child1 gender_child2 gender_child3 1 30 1998-11-26 2000-01-29 NA 1 2 NA 2 27 1996-06-22 NA NA 2 NA NA 3 26 2002-07-11 2004-04-05 2007-09-02 2 2 1 4 32 2004-10-10 2009-08-27 2012-07-21 1 1 1 5 29 2000-12-05 2005-02-28 NA 2 1 NA") test_engines("gender dob example", { na.children <- capture_melt_multiple( family.dt, column="[^_]+", between="_child", number="[1-3]", na.rm=FALSE) expect_is(na.children$family_id, "integer") expect_is(na.children$age_mother, "integer") expect_is(na.children$dob, "IDate") expect_is(na.children$gender, "integer") expect_equal(sum(is.na(na.children$dob)), 4) expect_equal(nrow(na.children), 15) }) test_engines("error for unequal number of children", { bad.groups <- data.table(family.dt, dob_child0="1999-01-01", gender_child4=2) expect_error({ capture_melt_multiple( bad.groups, column="[^_]+", between="_child", number="[0-9]") }, "need between,number=same count for each value") }) test_engines("error for column conversion to factor", { expect_error({ capture_melt_multiple( family.dt, column="[^_]+", as.factor, between="_child", number="[0-9]") }, "column group must be character, but conversion function returned factor") }) test_engines("error for column conversion to integer", { expect_error({ expect_warning({ capture_melt_multiple( family.dt, column="[^_]+", as.integer, between="_child", number="[0-9]") }, "NAs introduced by coercion") }, "column group must be character, but conversion function returned integer") }) test_engines("error for unequal number of columns", { bad.cols <- data.table(family.dt, bar_child0="1999-01-01", foo_child0=2) expect_error({ capture_melt_multiple( bad.cols, column="[^_]+", between="_child", number="[0-9]") }, "need column=same count for each value") }) test_engines("gender dob example na.rm=TRUE", { children <- capture_melt_multiple( family.dt, column="[^_]+", between="_child", number="[1-3]", na.rm=TRUE) expect_is(children$family_id, "integer") expect_is(children$age_mother, "integer") expect_is(children$dob, "IDate") expect_is(children$gender, "integer") expect_equal(sum(is.na(children$dob)), 0) expect_equal(nrow(children), 11) }) test_engines("variable group ok in melt_multiple", { result <- capture_melt_multiple( family.dt, column="[^_]+", variable="_child", number="[1-3]") exp.names <- c( "family_id", "age_mother", "variable", "number", "dob", "gender") expect_identical(names(result), exp.names) }) test_engines("multiple error if subject not df", { expect_error({ capture_melt_multiple("foobar") }, "first argument (subject) must be a data.frame", fixed=TRUE) }) test_engines("multiple error if no arg named variable", { expect_error({ capture_melt_multiple(family.dt, baz="family") }, "pattern must define group named column") }) test_engines("multiple error if no matching column names", { expect_error({ capture_melt_multiple(family.dt, column="foobar") }, "no column names match regex") }) ## what if input df has repeated names? bad.dt <- data.table(family_id=LETTERS[1:5], family.dt) test_engines("multiple df with same col names is an error", { expect_error({ capture_melt_multiple( bad.dt, column=".*", "_", field("child", "", "[0-9]")) }, "input must have columns with unique names, problems: family_id") }) ## what if there are two groups with the same name? test_engines("multiple groups with the same name is an error", { expect_error({ capture_melt_multiple( family.dt, column=".*", child="_", field("child", "", "[0-9]")) }, "duplicate capture group names are only allowed in alternatives, problem: child") }) ## what if a capture group has the same name as an input column? test_engines("err mult capture group same as input col", { expect_error({ capture_melt_multiple( family.dt, column=".*", family_id="_", field("child", "", "[0-9]")) }, "some capture group names (family_id) are the same as input column names that did not match the pattern; please change either the pattern or the capture group names so that all output column names will be unique", fixed=TRUE) }) ## what if value.name is the same as input col? bad2 <- data.table(family.dt) names(bad2)[1] <- "dob" test_engines("err change value.name if same as input col", { expect_error({ capture_melt_multiple( bad2, column=".*", family_id="_", field("child", "", "[0-9]")) }, "unable to create unique output column names; some values (dob) captured by the regex group named column are the same as input column names which do not match the pattern; please change either the pattern or the input column names which do not match the pattern so that output column names will be unique", fixed=TRUE) }) test_engines("err mult value.name same as group names", { expect_error({ capture_melt_multiple( family.dt, column=".*", dob="_", field("child", "", "[0-9]")) }, "unable to create unique output column names; some values (dob) captured by the regex group named column are the same as other regex group names; please change either the pattern or the other regex group names so that output column names will be unique", fixed=TRUE) }) i.vec <- 1:10000 c.vec <- paste(i.vec) one.row <- data.frame(I=t(i.vec), C=t(c.vec), stringsAsFactors=FALSE) test_engines("multiple melting lots of columns is OK", { out <- capture_melt_multiple( one.row, column=".", "[.]", int="[0-9]+", as.integer) expect_identical(out$int, i.vec) expect_identical(out$I, i.vec) expect_identical(out$C, c.vec) }) wide.metrics <- data.table( FP.possible=8202, FN.possible=1835, FP.count=0, FN.count=1835) test_engines("count is either 0 or 1835", { tall.metrics <- capture_melt_multiple( wide.metrics, metric=".*?", "[.]", column=".*")[order(metric)] expect_identical(tall.metrics$metric, c("FN", "FP")) expect_identical(tall.metrics$count, c(1835, 0)) expect_identical(tall.metrics$possible, c(1835, 8202)) }) test_engines("melt multiple with count group ok", { iris.count <- nc::capture_melt_multiple( iris, column=".*?", "[.]", count=".*") expect_identical(names(iris.count), c("Species", "count", "Petal", "Sepal")) }) ## apparently there are some data with missing columns ## https://github.com/Rdatatable/data.table/issues/4027 wide.input <- data.table( id = 1, a.1 = 1, a.3 = 3, b.1 = 1, b.2 = 2, b.3 = 3) ix.pattern <- list(column="[a-z]", "[.]", ix="[0-9]", as.integer) test_engines("error by default for missing column", { expect_error({ capture_melt_multiple(wide.input, ix.pattern) }, "need ix=same count for each value, but have: 1=2 2=1 3=2; please change pattern, edit input column names, or use fill=TRUE to output missing values") }) ##But what I really want is the ix of a.3 to be 3, not 2. ## id ix a b ## 1: 1 1 1 1 ## 2: 1 2 NA 2 ## 3: 1 3 3 3 tall.expected <- data.table( id=1, ix=1:3, a=c(1, NA, 3), b=1:3) test_engines("NA for missing column when fill=TRUE", { tall.output <- capture_melt_multiple( wide.input, ix.pattern, fill=TRUE) expect_equal(data.frame(tall.output), data.frame(tall.expected)) }) test_engines("NA for missing columns as in data table example", { DT.missing.cols <- DT[, .(d_1, d_2, c_1, f_2)] computed <- capture_melt_multiple( DT.missing.cols, column="[a-z]", "_", int="[0-9]", as.integer, fill=TRUE) expected <- DT.missing.cols[, data.table( int=rep(1:2, each=.N), c=c(c_1, rep(NA, .N)), d=c(d_1, d_2), f=c(rep(NA, .N), paste(f_2)))] expect_identical(data.frame(computed), data.frame(expected)) }) family4.dt <- data.table(family.dt) family4.dt[, `:=`(dob_child4=NA_character_, gender_child4=NA_integer_)] test_engines("no families with 4 children", { child.pattern <- list( column="[^_]+", between="_child", number="[1-4]") na.rm.T <- suppressWarnings({ capture_melt_multiple( family4.dt, child.pattern, na.rm=TRUE) }) expect_equal(sum(is.na(na.rm.T$dob)), 0) }) test_engines("lots of missing columns ok with chr capture col small", { tall.dt <- data.table( family=1, child=10:1, num=c(10, 9, rep(NA, 8)), chr=c("ten", "nine", rep(NA, 8))) wide.dt <- dcast( tall.dt, family ~ child, value.var=c("num", "chr")) tall.again <- capture_melt_multiple( wide.dt, column=".*", "_", child="[0-9]+", na.rm=TRUE)[order(num)] expect_identical(tall.again$child, paste(9:10)) }) test_engines("lots of missing columns ok with chr capture col big", { PROVEDIt.csv <- system.file( "extdata", "RD12-0002_PP16HS_5sec_GM_F_1P.csv", package="nc", mustWork=TRUE) PROVEDIt.wide <- data.table::fread(PROVEDIt.csv) PROVEDIt.tall <- suppressWarnings({ nc::capture_melt_multiple( PROVEDIt.wide, column=".*", " ", peak="[0-9]+", na.rm=TRUE) }) peak.int <- sort(as.integer(unique(PROVEDIt.tall$peak))) expect_identical(peak.int, seq_along(peak.int)) }) test_engines("lots of missing columns chr col big na.rm=FALSE", { PROVEDIt.csv <- system.file( "extdata", "RD12-0002_PP16HS_5sec_GM_F_1P.csv", package="nc", mustWork=TRUE) PROVEDIt.wide <- data.table::fread(PROVEDIt.csv) PROVEDIt.tall <- suppressWarnings({ nc::capture_melt_multiple( PROVEDIt.wide, column=".*", " ", peak="[0-9]+", na.rm=FALSE)[!is.na(Size)] }) ## Why does this test pass? No missing values are removed so data ## table assigns peak numbers 1-100 correctly and then we filter the ## NAs. peak.int <- sort(as.integer(unique(PROVEDIt.tall$peak))) expect_identical(peak.int, seq_along(peak.int)) }) test_engines("lots of missing columns int col big na.rm=TRUE", { PROVEDIt.csv <- system.file( "extdata", "RD12-0002_PP16HS_5sec_GM_F_1P.csv", package="nc", mustWork=TRUE) PROVEDIt.wide <- data.table::fread(PROVEDIt.csv) PROVEDIt.tall <- suppressWarnings({ nc::capture_melt_multiple( PROVEDIt.wide, column=".*", " ", peak="[0-9]+", as.integer, na.rm=TRUE) }) ## Why does this test pass? With buggy data.table the missing values ## are removed and then the peak IDs are assigned. Because the ## missing values are at the end (for peak>36) and the data are ## sorted numerically by integers, the assigned IDs match the peak ## numbers exactly. peak.int <- sort(as.integer(unique(PROVEDIt.tall$peak))) expect_identical(peak.int, seq_along(peak.int)) }) test_engines("missing first child", { wide.dt <- data.table( family=1, age_childA=NA_real_, sex_childA=NA_character_, age_childB=5, sex_childB="m") child.pattern <- list( column=".*", "_", nc::field("child", "", "[A-Z]")) tall.dt <- nc::capture_melt_multiple(wide.dt, child.pattern, na.rm=TRUE) expect_identical(tall.dt$child, "B") expect_identical(tall.dt$age, 5) expect_identical(tall.dt$sex, "m") })