library(testthat) test_that("adjust_frm merges by INCHIKEY when available", { # Load pretrained model for speed-up frm <- readRDS(pkg_file("extdata/RP_lasso_model.rds")) # Add an artificial INCHIKEY column to the training data. Then introduce # some duplicate SMILES+INCHIKEY entries with slightly different RTs. n <- nrow(frm$df) frm$df$INCHIKEY <- sprintf("IK%05d", seq_len(n)) frm$df$INCHIKEY[1:5] <- frm$df$INCHIKEY[1] frm$df$SMILES[1:5] <- frm$df$SMILES[1] frm$df$RT[1:5] <- frm$df$RT[1] + c(0, 0.1, 0.2, 0.3, 0.4) # Build new_data from a small subset and change some NAMES to avoid # NAME-matching. Duplicate the first entry, so we have the most complex # case. Multiple new entries (2) matching multiple old entries (5). Also # make sure that the duplicated entry is at different positions within # new_data. idx <- c(seq(1, 401, 40), 1) cols <- intersect(c("NAME", "SMILES", "RT", "INCHIKEY"), colnames(frm$df)) new <- frm$df[idx, cols] new$RT <- new$RT + rnorm(nrow(new), mean = 3, sd = 0.5) # Slightly perturb RTs # Run adjustment; this should use INCHIKEY+SMILES and not error afm <- adjust_frm(frm, new, nfolds = 2, verbose = 0, seed = 42, predictors = 1) afm42 <- adjust_frm(frm, new, nfolds = 2, verbose = 0, seed = 42, predictors = 1) afm99 <- adjust_frm(frm, new, nfolds = 2, verbose = 0, seed = 99, predictors = 1) # Basic checks expect_true("adj" %in% names(afm)) expect_true(is.list(afm$adj)) expect_true("INCHIKEY" %in% colnames(afm$adj$df)) expect_equal(nrow(afm$adj$df), nrow(new)) expect_true(isTRUE(all.equal(afm, afm42))) expect_false(isFALSE(all.equal(afm, afm99))) expect_equal(new$RT, afm$adj$df$RT_ADJ) # Ensure that afm$adj$df$RT is calculated correctly by mapping to frm$df via # INCHIKEY+SMILES and averaging RT over duplicates. n <- length(idx) old <- afm$df[idx, ] # (1) old$RT[c(1, n)] <- mean(afm$df$RT[1:5]) # (2) expect_equal(length(unique(c(new$SMILES[c(1, n)], afm$df$SMILES[1:5]))), 1) # (3) expect_equal(length(unique(c(new$INCHIKEY[c(1, n)], afm$df$INCHIKEY[1:5]))), 1) # (3) expect_equal(afm$adj$df$RT_ADJ, new$RT) expect_equal(afm$adj$df$RT, old$RT) # (1) We generated `new` from `afm$df[idx, ]`, so naivly, we would expect # these elements to be picked as corresponding "old" measurements, that will # be adjusted. # (2) However, because the first and last entry in `new` have the same # INCHIKEY+SMILES combination as the first five entries in `afm$df` (3), our # algorithm should calculate the average retention time over `afm$df[1:5,]` # and then map both `new[1,]` and `new[n,]` to that average. # Now remove some INCHIKEYs from the new data to force fallback to # SMILES+NAME mapping new$INCHIKEY[8:10] <- NA afm2 <- adjust_frm(frm, new, nfolds = 2, verbose = 0, seed = 42, predictors = 1) expect_identical(afm$model, afm2$model) # Now change a few NAMES as well, so that even the SMILES+NAME fallback # cannot find matches for all entries new$NAME[8:10] <- NA expect_error( afm3 <- adjust_frm(frm, new, nfolds = 2, verbose = 0, seed = 42, predictors = 1), "Could not map 3 new entries" ) # Plot during interactive sessions if (identical(environment(), .GlobalEnv)) { par(mfrow = c(3, 1)) plot(new$RT, frm$df$RT[idx], xlab = "New RT", ylab = "Old RT") plot_frm(afm, type = "scatter.train.adj") plot_frm(afm, type = "scatter.cv.adj") par(mfrow = c(1, 1)) } }) test_that("adjust_frm works with do_cv = FALSE", { # Load pretrained model for speed-up frm <- readRDS(pkg_file("extdata/RP_lasso_model.rds")) # Build new_data from a small subset idx <- seq(1, 401, 40) new <- frm$df[idx, c("NAME", "SMILES", "RT")] new$RT <- new$RT + rnorm(nrow(new), mean = 3, sd = 0.5) # Test with do_cv=FALSE afm1 <- adjust_frm(frm, new, nfolds = 2, verbose = 0, seed = 42, predictors = 1:2, do_cv = FALSE) # Test with do_cv=TRUE afm2 <- adjust_frm(frm, new, nfolds = 2, verbose = 0, seed = 42, predictors = 1:2, do_cv = TRUE) # Model with do_cv=FALSE should have NULL cv element expect_true(is.list(afm1$adj)) expect_equal(names(afm1$adj), c("model", "df", "cv", "args")) expect_null(afm1$adj$cv) # Model with do_cv=TRUE should have cv element expect_true(is.list(afm2$adj)) expect_equal(names(afm1$adj), c("model", "df", "cv", "args")) expect_true(is.list(afm2$adj$cv)) expect_equal(names(afm2$adj$cv), c("folds", "models", "stats", "preds")) # The actual adjustment models should be the same (only cv differs) afm2_nocv <- afm2 afm2_nocv$adj["cv"] <- list(NULL) afm2_nocv$adj$args$do_cv <- FALSE expect_equal(afm1, afm2_nocv) }) test_that("adjust_frm works with lm, lasso, ridge, gbtree", { frm <- readRDS(pkg_file("extdata/RP_lasso_model.rds")) idx <- seq(1, 401, by = 20) cols <- intersect(c("NAME", "SMILES", "RT", "INCHIKEY"), colnames(frm$df)) new <- frm$df[idx, cols] new$RT <- new$RT + rnorm(nrow(new), sd = 0.25) withr::local_options(FastRet.adj_gbtree_nrounds = 5) adj_lasso <- adjust_frm(frm = frm, new_data = new, nfolds = 2, verbose = 0, seed = 7, adj_type = "lasso") adj_gbtre <- adjust_frm(frm = frm, new_data = new, nfolds = 2, verbose = 0, seed = 7, adj_type = "gbtree") expect_s3_class(adj_lasso$adj$model, "glmnet") expect_true(inherits(adj_gbtre$adj$model, "xgb.Booster")) expect_identical(adj_lasso$adj$args$adj_type, "lasso") expect_identical(adj_gbtre$adj$args$adj_type, "gbtree") yhat_lasso <- predict(adj_lasso, df = new, adjust = TRUE, verbose = 0) yhat_gbtre <- predict(adj_gbtre, df = new, adjust = TRUE, verbose = 0) expect_equal(length(yhat_lasso), length(idx)) expect_equal(length(yhat_gbtre), length(idx)) expect_error(adjust_frm(frm, new, adj_type = "foo")) })