context("generators")

test_that("Checking the generator for the Fasta files", {
  testthat::skip_if_not_installed("tensorflow")
  testthat::skip_if_not(reticulate::py_module_available("tensorflow"))

  # One-hot rows for vocabulary c("a", "c", "g", "t"); nt_0 is an all-zero row.
  nt_a <- c(1, 0, 0, 0)
  nt_c <- c(0, 1, 0, 0)
  nt_g <- c(0, 0, 1, 0)
  nt_t <- c(0, 0, 0, 1)
  nt_0 <- c(0, 0, 0, 0)

  # Compare consecutive positions of sample `row` in the 3D array `x` against
  # the vectors in `expected`, starting at position `first`. One expectation is
  # emitted per position, labelled with the index that was checked.
  expect_positions <- function(x, row, expected, first = 1) {
    for (k in seq_along(expected)) {
      pos <- first + k - 1
      expect_equivalent(
        x[row, pos, ], expected[[k]],
        label = paste0("x[", row, ", ", pos, ", ]")
      )
    }
  }

  testpath <- file.path("fasta_2")
  vocabulary <- c("a", "c", "g", "t")
  batch_size <- 5
  maxlen <- 3
  gen <- generator_fasta_lm(
    path_corpus = testpath, batch_size = batch_size, maxlen = maxlen,
    vocabulary = vocabulary
  )

  # Output is a list of two arrays: input (batch, maxlen, vocabulary) and
  # target (batch, vocabulary).
  arrays <- gen()
  expect_equivalent(dim(arrays[[1]])[1], batch_size)
  expect_equivalent(dim(arrays[[1]])[2], maxlen)
  expect_equivalent(dim(arrays[[1]])[3], length(vocabulary))
  expect_equivalent(dim(arrays[[2]])[1], batch_size)
  expect_equivalent(dim(arrays[[2]])[2], length(vocabulary))
  expect_equivalent(length(arrays), 2)

  # a.fasta file starts with aaccggtt
  expect_positions(arrays[[1]], 1, list(nt_a, nt_a, nt_c)) # a, a, c
  expect_equivalent(arrays[[2]][1, ], nt_c) # c

  arrays_2 <- gen()
  expect_positions(arrays_2[[1]], 2, list(nt_a, nt_a, nt_a)) # a, a, a
  expect_equivalent(arrays_2[[2]][2, ], nt_t) # t

  # test transition to second fasta file
  gen <- generator_fasta_lm(
    path_corpus = testpath, batch_size = batch_size, maxlen = maxlen,
    vocabulary = vocabulary
  )
  for (i in 1:5) {
    arrays <- gen()
  }
  # samples start at beginning of b.fasta
  expect_positions(arrays[[1]], 1, list(nt_a, nt_a, nt_a)) # a, a, a
  expect_equivalent(arrays[[2]][1, ], nt_a) # a
  expect_positions(arrays[[1]], 5, list(nt_a, nt_a, nt_a)) # a, a, a
  expect_equivalent(arrays[[2]][5, ], nt_a) # a

  # complete one iteration (100 samples)
  gen <- generator_fasta_lm(
    path_corpus = testpath, batch_size = batch_size, maxlen = maxlen,
    vocabulary = vocabulary
  )
  for (i in 1:9) {
    arrays <- gen()
  }
  # start from a.fasta again
  expect_positions(arrays[[1]], 1, list(nt_a, nt_a, nt_c)) # a, a, c
  expect_equivalent(arrays[[2]][1, ], nt_c) # c

  ###################
  # test for different step size
  gen <- generator_fasta_lm(
    path_corpus = testpath, batch_size = 4, maxlen = 3, step = 2
  )
  arrays <- gen()
  expect_positions(arrays[[1]], 3, list(nt_g, nt_g, nt_t))
  expect_equivalent(arrays[[2]][3, ], nt_t)
  expect_positions(arrays[[1]], 4, list(nt_a, nt_a, nt_a))
  expect_equivalent(arrays[[2]][4, ], nt_a)

  ####
  # tests with chars outside vocabulary, vocabulary does not contain "A"
  gen <- generator_fasta_lm(
    path_corpus = testpath, batch_size = 5, maxlen = 3, step = 2,
    vocabulary = c("c", "g", "t")
  )
  arrays <- gen()
  expect_positions(
    arrays[[1]], 1,
    list(c(0, 0, 0), c(0, 0, 0), c(1, 0, 0)) # a, a, c
  )
  expect_equivalent(arrays[[2]][1, ], c(1, 0, 0)) # c

  ####
  # test padding
  gen <- generator_fasta_lm(
    path_corpus = testpath, batch_size = 1, maxlen = 10, step = 4,
    vocabulary = c("a", "c", "g", "t")
  )
  arrays <- gen()
  expect_positions(
    arrays[[1]], 1,
    list(nt_0, nt_0, nt_0, nt_a, nt_a, nt_c, nt_c, nt_g, nt_g, nt_t)
  )
  expect_equivalent(arrays[[2]][1, ], nt_t)

  # no padding
  testpath <- file.path("fasta_3")
  gen <- generator_fasta_lm(
    path_corpus = testpath, batch_size = 2, maxlen = 12, step = 1,
    vocabulary = c("a", "c", "g", "t"), padding = FALSE
  )
  arrays <- gen()
  expect_positions(arrays[[1]], 1, list(nt_a, nt_a, nt_0, nt_c, nt_c))
  expect_positions(arrays[[1]], 2, list(nt_a, nt_a, nt_0, nt_c, nt_c))

  ####
  # argument validation and return types
  testpath <- file.path("fasta_2")
  expect_error(generator_fasta_lm())
  expect_error(generator_fasta_lm(""))
  expect_is(
    generator_fasta_lm(testpath, batch_size = batch_size, maxlen = maxlen),
    "function"
  )
  expect_is(gen(), "list")
  expect_is(gen()[[1]], "array")
  expect_is(gen()[[2]], "matrix")
  expect_silent(
    generator_fasta_lm(testpath, batch_size = batch_size, maxlen = maxlen)
  )
  expect_type(gen()[[1]], "double")
  expect_type(gen()[[2]], "double")

  ############# Test label generator (header) #############
  # Label rows below are one-hot over vocabulary_label c("w", "x", "y").
  testpath <- file.path("fasta_2")
  gen <- generator_fasta_label_header_csv(
    path_corpus = testpath, batch_size = 5, maxlen = 3, step = 2,
    vocabulary = c("a", "c", "g", "t"), reverse_complement = FALSE,
    vocabulary_label = c("w", "x", "y")
  )
  arrays <- gen()
  expect_positions(arrays[[1]], 1, list(nt_a, nt_a, nt_c)) # A, A, C
  expect_equivalent(arrays[[2]][1, ], c(1, 0, 0)) # W
  expect_positions(arrays[[1]], 5, list(nt_a, nt_a, nt_t)) # A, A, T
  expect_equivalent(arrays[[2]][5, ], c(0, 1, 0)) # X

  gen <- generator_fasta_label_header_csv(
    path_corpus = testpath, batch_size = 5, maxlen = 8, step = 2,
    vocabulary = c("a", "c", "g", "t"), reverse_complement = FALSE,
    vocabulary_label = c("w", "x", "y")
  )
  arrays <- gen()
  expect_equivalent(arrays[[2]][1, ], c(1, 0, 0))
  expect_equivalent(arrays[[2]][2, ], c(0, 1, 0))
  expect_equivalent(arrays[[2]][3, ], c(0, 0, 1))
  expect_equivalent(arrays[[2]][4, ], c(0, 1, 0))
  expect_equivalent(arrays[[2]][5, ], c(0, 1, 0))

  arrays <- gen()
  expect_positions(
    arrays[[1]], 5,
    list(nt_g, nt_g, nt_g, nt_g, nt_t, nt_t, nt_t, nt_t)
  )
  expect_equivalent(arrays[[2]][1, ], c(0, 0, 1))
  expect_equivalent(arrays[[2]][2, ], c(0, 0, 1))
  expect_equivalent(arrays[[2]][3, ], c(1, 0, 0))
  expect_equivalent(arrays[[2]][4, ], c(0, 1, 0))
  expect_equivalent(arrays[[2]][5, ], c(0, 0, 1))

  gen <- generator_fasta_label_header_csv(
    path_corpus = testpath, batch_size = 8, maxlen = 7, step = 2,
    vocabulary = c("a", "c", "g", "t"), reverse_complement = FALSE,
    vocabulary_label = c("w", "x", "y")
  )
  arrays <- gen()
  # go through a/b.fasta once discard samples with target z
  expect_positions(arrays[[1]], 8, list(nt_a, nt_a, nt_c)) # A, A, C
  expect_equivalent(arrays[[2]][8, ], c(1, 0, 0)) # W

  ############# Test label generator (folder) #############
  directories <- c("label_folder/x", "label_folder/y", "label_folder/z")
  val <- FALSE
  gen_list <- generator_initialize(
    directories = directories, val = val, format = "fasta", batch_size = 6,
    maxlen = 2, vocabulary = c("a", "c", "g", "t"), step = 2
  )
  gen <- generator_fasta_label_folder_wrapper(
    val = val, path = directories, gen_list = gen_list
  )
  arrays <- gen()
  expect_positions(arrays[[1]], 1, list(nt_a, nt_c))
  expect_positions(arrays[[1]], 2, list(nt_a, nt_c))
  expect_positions(arrays[[1]], 3, list(nt_g, nt_c))
  expect_positions(arrays[[1]], 4, list(nt_g, nt_c))
  expect_positions(arrays[[1]], 5, list(nt_t, nt_t))
  expect_positions(arrays[[1]], 6, list(nt_t, nt_t))
  expect_equivalent(arrays[[2]][1, ], c(1, 0, 0))
  expect_equivalent(arrays[[2]][2, ], c(1, 0, 0))
  expect_equivalent(arrays[[2]][3, ], c(0, 1, 0))
  expect_equivalent(arrays[[2]][4, ], c(0, 1, 0))
  expect_equivalent(arrays[[2]][5, ], c(0, 0, 1))
  expect_equivalent(arrays[[2]][6, ], c(0, 0, 1))

  # test skipping file
  for (i in 1:2) {
    arrays <- gen()
  }
  expect_positions(arrays[[1]], 1, list(nt_a, nt_c))
  expect_positions(arrays[[1]], 2, list(nt_a, nt_g))
  expect_positions(arrays[[1]], 3, list(nt_c, nt_g))
  expect_positions(arrays[[1]], 4, list(nt_c, nt_g))
  expect_positions(arrays[[1]], 5, list(nt_a, nt_a))
  expect_positions(arrays[[1]], 6, list(nt_a, nt_a))
  expect_equivalent(arrays[[2]][1, ], c(1, 0, 0))
  expect_equivalent(arrays[[2]][2, ], c(1, 0, 0))
  expect_equivalent(arrays[[2]][3, ], c(0, 1, 0))
# Remaining label checks for the batch fetched after skipping a file.
  expect_equivalent(arrays[[2]][4, ], c(0, 1, 0))
  expect_equivalent(arrays[[2]][5, ], c(0, 0, 1))
  expect_equivalent(arrays[[2]][6, ], c(0, 0, 1))

  # One-hot rows for vocabulary c("a", "c", "g", "t"); nt_eq is the uniform row
  # expected for an ambiguous nucleotide when ambiguous_nuc = "equal".
  nt_a <- c(1, 0, 0, 0)
  nt_c <- c(0, 1, 0, 0)
  nt_g <- c(0, 0, 1, 0)
  nt_t <- c(0, 0, 0, 1)
  nt_eq <- rep(1 / 4, 4)

  # Compare consecutive positions of sample `row` in the 3D array `x` against
  # the vectors in `expected`, starting at position `first`. One expectation is
  # emitted per position, labelled with the index that was checked.
  expect_positions <- function(x, row, expected, first = 1) {
    for (k in seq_along(expected)) {
      pos <- first + k - 1
      expect_equivalent(
        x[row, pos, ], expected[[k]],
        label = paste0("x[", row, ", ", pos, ", ]")
      )
    }
  }

  #
  arrays <- gen()
  expect_positions(arrays[[1]], 1, list(nt_a, nt_g))
  expect_positions(arrays[[1]], 2, list(nt_a, nt_g))
  expect_positions(arrays[[1]], 3, list(nt_g, nt_c))
  expect_positions(arrays[[1]], 4, list(nt_g, nt_c))
  expect_positions(arrays[[1]], 5, list(nt_a, nt_a))
  expect_positions(arrays[[1]], 6, list(nt_a, nt_a))
  expect_equivalent(arrays[[2]][1, ], c(1, 0, 0))
  expect_equivalent(arrays[[2]][2, ], c(1, 0, 0))
  expect_equivalent(arrays[[2]][3, ], c(0, 1, 0))
  expect_equivalent(arrays[[2]][4, ], c(0, 1, 0))
  expect_equivalent(arrays[[2]][5, ], c(0, 0, 1))
  expect_equivalent(arrays[[2]][6, ], c(0, 0, 1))

  ####### Test discard ambiguous nucleotides ###########
  testpath <- file.path("fasta_3")
  vocabulary <- c("a", "c", "g", "t")
  batch_size <- 6
  maxlen <- 3
  step <- 2
  gen <- generator_fasta_lm(
    path_corpus = testpath, batch_size = batch_size, maxlen = maxlen,
    vocabulary = vocabulary, ambiguous_nuc = "discard", step = step
  )
  arrays <- gen()
  expect_positions(arrays[[1]], 1, list(nt_t, nt_g, nt_a))
  expect_equivalent(arrays[[2]][1, ], nt_a)
  expect_positions(arrays[[1]], 2, list(nt_a, nt_a, nt_a))
  expect_equivalent(arrays[[2]][2, ], nt_a)
  expect_positions(arrays[[1]], 3, list(nt_t, nt_t, nt_t))
  expect_equivalent(arrays[[2]][3, ], nt_t)
  expect_positions(arrays[[1]], 4, list(nt_g, nt_g, nt_g))
  expect_equivalent(arrays[[2]][4, ], nt_g)
  expect_positions(arrays[[1]], 5, list(nt_g, nt_g, nt_t))
  expect_equivalent(arrays[[2]][5, ], nt_t)
  expect_positions(arrays[[1]], 6, list(nt_a, nt_c, nt_g))
  expect_equivalent(arrays[[2]][6, ], nt_t)

  # label header
  gen <- generator_fasta_label_header_csv(
    path_corpus = testpath, batch_size = batch_size, maxlen = maxlen,
    vocabulary = vocabulary, ambiguous_nuc = "discard", step = step,
    reverse_complement = FALSE, vocabulary_label = c("X", "Y")
  )
  arrays <- gen()
  expect_positions(arrays[[1]], 1, list(nt_t, nt_g, nt_a))
  expect_equivalent(arrays[[2]][1, ], c(1, 0))
  expect_positions(arrays[[1]], 2, list(nt_a, nt_a, nt_a))
  expect_equivalent(arrays[[2]][2, ], c(1, 0))
  expect_positions(arrays[[1]], 3, list(nt_t, nt_t, nt_t))
  expect_equivalent(arrays[[2]][3, ], c(1, 0))
  expect_positions(arrays[[1]], 4, list(nt_g, nt_g, nt_g))
  expect_equivalent(arrays[[2]][4, ], c(0, 1))
  expect_positions(arrays[[1]], 5, list(nt_g, nt_g, nt_t))
  expect_equivalent(arrays[[2]][5, ], c(0, 1))
  expect_positions(arrays[[1]], 6, list(nt_t, nt_t, nt_t))
  expect_equivalent(arrays[[2]][6, ], c(0, 1))

  # label folder
  directories <- c("fasta_2", "fasta_3")
  gen <- get_generator(
    val = FALSE, train_type = "label_folder", path = directories,
    format = "fasta", batch_size = 6, maxlen = 3, ambiguous_nuc = "discard",
    vocabulary = c("a", "c", "g", "t"), reverse_complement = FALSE, step = 2
  )
  arrays <- gen()
  expect_positions(arrays[[1]], 1, list(nt_a, nt_a, nt_c))
  expect_equivalent(arrays[[2]][1, ], c(1, 0))
  expect_positions(arrays[[1]], 2, list(nt_c, nt_c, nt_g))
  expect_equivalent(arrays[[2]][2, ], c(1, 0))
  expect_positions(arrays[[1]], 3, list(nt_g, nt_g, nt_t))
  expect_equivalent(arrays[[2]][3, ], c(1, 0))
  expect_positions(arrays[[1]], 4, list(nt_t, nt_g, nt_a))
  expect_equivalent(arrays[[2]][4, ], c(0, 1))
  expect_positions(arrays[[1]], 5, list(nt_a, nt_a, nt_a))
  expect_equivalent(arrays[[2]][5, ], c(0, 1))
  expect_positions(arrays[[1]], 6, list(nt_t, nt_t, nt_t))
  expect_equivalent(arrays[[2]][6, ], c(0, 1))

  ####### Test ambiguous nucleotides as 1/length(vocabulary) ###########
  testpath <- file.path("fasta_3")
  vocabulary <- c("a", "c", "g", "t")
  batch_size <- 4
  maxlen <- 3
  step <- 2
  gen <- generator_fasta_lm(
    path_corpus = testpath, batch_size = batch_size, maxlen = maxlen,
    vocabulary = vocabulary, ambiguous_nuc = "equal", step = step
  )
  arrays <- gen()
  expect_positions(arrays[[1]], 1, list(nt_a, nt_a, nt_eq))
  expect_equivalent(arrays[[2]][1, ], nt_c)
  expect_positions(arrays[[1]], 2, list(nt_eq, nt_c, nt_c))
  expect_equivalent(arrays[[2]][2, ], nt_eq)
  expect_positions(arrays[[1]], 3, list(nt_c, nt_eq, nt_g))
  expect_equivalent(arrays[[2]][3, ], nt_g)
  expect_positions(arrays[[1]], 4, list(nt_g, nt_g, nt_eq))
  expect_equivalent(arrays[[2]][4, ], nt_t)

  # label header
  gen <- generator_fasta_label_header_csv(
    path_corpus = testpath, batch_size = batch_size, maxlen = maxlen,
    vocabulary = vocabulary, ambiguous_nuc = "equal", step = step,
    reverse_complement = FALSE, vocabulary_label = c("X", "Y")
  )
  arrays <- gen()
  expect_positions(arrays[[1]], 1, list(nt_a, nt_a, nt_eq))
  expect_equivalent(arrays[[2]][1, ], c(1, 0))
  expect_positions(arrays[[1]], 2, list(nt_eq, nt_c, nt_c))
  expect_equivalent(arrays[[2]][2, ], c(1, 0))
  expect_positions(arrays[[1]], 3, list(nt_c, nt_eq, nt_g))
  expect_equivalent(arrays[[2]][3, ], c(1, 0))
  expect_positions(arrays[[1]], 4, list(nt_g, nt_g, nt_eq))
  expect_equivalent(arrays[[2]][4, ], c(1, 0))

  # label folder
  directories <- c("fasta_2", "fasta_3")
  gen <- get_generator(
    train_type = "label_folder", val = FALSE, path = directories,
    format = "fasta", batch_size = 4, maxlen = 3,
    vocabulary = c("a", "c", "g", "t"), reverse_complement = FALSE,
    ambiguous_nuc = "equal", step = 2
  )
  arrays <- gen()
  expect_positions(arrays[[1]], 1, list(nt_a, nt_a, nt_c))
  expect_equivalent(arrays[[2]][1, ], c(1, 0))
  expect_positions(arrays[[1]], 2, list(nt_c, nt_c, nt_g))
  expect_equivalent(arrays[[2]][2, ], c(1, 0))
  expect_positions(arrays[[1]], 3, list(nt_a, nt_a, nt_eq))
  expect_equivalent(arrays[[2]][3, ], c(0, 1))
  expect_positions(arrays[[1]], 4, list(nt_eq, nt_c, nt_c))
  expect_equivalent(arrays[[2]][4, ], c(0, 1))

  ####### Test ambiguous nucleotides as "empirical" ###########
  # LM
  testpath <- file.path("fasta_3")
  vocabulary <- c("a", "c", "g", "t")
  batch_size <- 4
  maxlen <- 3
  step <- 2
  gen <- generator_fasta_lm(
    path_corpus = testpath, batch_size = batch_size, maxlen = maxlen,
    vocabulary = vocabulary, ambiguous_nuc = "empirical", step = step
  )
  arrays <- gen()
  nuc_dist <- 1 / 18 * c(8, 2, 3, 5)
  expect_positions(arrays[[1]], 1, list(nt_a, nt_a, nuc_dist))
  expect_equivalent(arrays[[2]][1, ], nt_c)
  expect_positions(arrays[[1]], 2, list(nuc_dist, nt_c, nt_c))
  expect_equivalent(arrays[[2]][2, ], nuc_dist)
  expect_positions(arrays[[1]], 3, list(nt_c, nuc_dist, nt_g))
  expect_equivalent(arrays[[2]][3, ], nt_g)
  expect_positions(arrays[[1]], 4, list(nt_g, nt_g, nuc_dist))
  expect_equivalent(arrays[[2]][4, ], nt_t)

  # LM second file
  testpath <- file.path("fasta_3")
  vocabulary <- c("a", "c", "g", "t")
  batch_size <- 4
  maxlen <- 3
  step <- 20
  gen <- generator_fasta_lm(
    path_corpus = testpath, batch_size = batch_size, maxlen = maxlen,
    vocabulary = vocabulary, ambiguous_nuc = "empirical", step = step
  )
  arrays <- gen()
  nuc_dist_1 <- 1 / 18 * c(8, 2, 3, 5)
  nuc_dist_2 <- 1 / 17 * c(3, 2, 6, 6)
  # Assert against nuc_dist_1 defined for this section rather than the
  # same-valued nuc_dist left over from the previous section.
  expect_positions(arrays[[1]], 1, list(nt_a, nt_a, nuc_dist_1))
  expect_equivalent(arrays[[2]][1, ], nt_c)
  expect_positions(arrays[[1]], 2, list(nt_a, nt_a, nt_a))
  expect_equivalent(arrays[[2]][2, ], nt_a)
  expect_positions(arrays[[1]], 3, list(nuc_dist_2, nt_g, nt_g))
  expect_equivalent(arrays[[2]][3, ], nt_g)
  expect_positions(arrays[[1]], 4, list(nt_a, nt_c, nt_g))
  expect_equivalent(arrays[[2]][4, ], nt_t)

  # label header
  testpath <- file.path("fasta_3")
  vocabulary <- c("a", "c", "g", "t")
  batch_size <- 4
  maxlen <- 3
  step <- 2
  gen <- generator_fasta_label_header_csv(
    path_corpus = testpath, batch_size = batch_size, maxlen = maxlen,
    vocabulary = vocabulary, ambiguous_nuc = "empirical", step = step,
    reverse_complement = FALSE, vocabulary_label = c("X", "Y")
  )
  arrays <- gen()
  nuc_dist <- 1 / 18 * c(8, 2, 3, 5)
  expect_positions(arrays[[1]], 1, list(nt_a, nt_a, nuc_dist))
  expect_equivalent(arrays[[2]][1, ], c(1, 0))
  expect_positions(arrays[[1]], 2, list(nuc_dist, nt_c, nt_c))
  expect_equivalent(arrays[[2]][2, ], c(1, 0))
  expect_positions(arrays[[1]], 3, list(nt_c, nuc_dist, nt_g))
  expect_equivalent(arrays[[2]][3, ], c(1, 0))
  expect_positions(arrays[[1]], 4, list(nt_g, nt_g, nuc_dist))
  expect_equivalent(arrays[[2]][4, ], c(1, 0))

  # label folder
  directories <- c("fasta_2", "fasta_3")
  gen <- get_generator(
    path = directories, val = FALSE, train_type = "label_folder",
    format = "fasta", batch_size = 4, maxlen = 3,
    vocabulary = c("a", "c", "g", "t"), reverse_complement = FALSE,
    ambiguous_nuc = "empirical", step = 2
  )
  arrays <- gen()
  nuc_dist <- 1 / 18 * c(8, 2, 3, 5)
  expect_positions(arrays[[1]], 1, list(nt_a, nt_a, nt_c))
  expect_equivalent(arrays[[2]][1, ], c(1, 0))
  expect_positions(arrays[[1]], 2, list(nt_c, nt_c, nt_g))
  expect_equivalent(arrays[[2]][2, ], c(1, 0))
  expect_positions(arrays[[1]], 3, list(nt_a, nt_a, nuc_dist))
  expect_equivalent(arrays[[2]][3, ], c(0, 1))
  expect_positions(arrays[[1]], 4, list(nuc_dist, nt_c, nt_c))
expect_equivalent(arrays[[2]][4, ], c(0, 1))

  ############# padding/amb nucleotide LM ############
  # One-hot rows for the four-letter nucleotide encoding; nt_0 is an all-zero
  # row.
  nt_a <- c(1, 0, 0, 0)
  nt_c <- c(0, 1, 0, 0)
  nt_g <- c(0, 0, 1, 0)
  nt_t <- c(0, 0, 0, 1)
  nt_0 <- c(0, 0, 0, 0)

  # Compare consecutive positions of sample `row` in the 3D array `x` against
  # the vectors in `expected`, starting at position `first`. One expectation is
  # emitted per position, labelled with the index that was checked.
  expect_positions <- function(x, row, expected, first = 1) {
    for (k in seq_along(expected)) {
      pos <- first + k - 1
      expect_equivalent(
        x[row, pos, ], expected[[k]],
        label = paste0("x[", row, ", ", pos, ", ]")
      )
    }
  }

  gen <- generator_fasta_lm(
    path_corpus = "fasta_3", batch_size = 3, maxlen = 15, step = 1,
    ambiguous_nuc = "equal"
  )
  arrays <- gen()
  equal_vector <- rep(0.25, 4)

  # Sample 1: three all-zero rows, then the sequence with equal_vector at the
  # ambiguous positions.
  expect_positions(
    arrays[[1]], 1,
    list(
      nt_0, nt_0, nt_0,
      nt_a, nt_a, equal_vector,
      nt_c, nt_c, equal_vector,
      nt_g, nt_g, equal_vector,
      nt_t, nt_g, nt_a
    )
  )
  expect_equivalent(arrays[[2]][1, ], nt_a)

  # Sample 3: six all-zero rows before the sequence starts.
  expect_positions(
    arrays[[1]], 3,
    list(
      nt_0, nt_0, nt_0, nt_0, nt_0, nt_0,
      equal_vector, nt_g, nt_g, nt_g, nt_g,
      nt_t, nt_t, nt_t, equal_vector
    )
  )
  expect_equivalent(arrays[[2]][3, ], nt_t)
############# padding/amb nucleotide, label_header ############
  # One-hot rows for the four-letter nucleotide encoding; nt_0 is an all-zero
  # row.
  nt_a <- c(1, 0, 0, 0)
  nt_c <- c(0, 1, 0, 0)
  nt_g <- c(0, 0, 1, 0)
  nt_t <- c(0, 0, 0, 1)
  nt_0 <- c(0, 0, 0, 0)

  # Compare consecutive positions of sample `row` in the 3D array `x` against
  # the vectors in `expected`, starting at position `first`. One expectation is
  # emitted per position, labelled with the index that was checked.
  expect_positions <- function(x, row, expected, first = 1) {
    for (k in seq_along(expected)) {
      pos <- first + k - 1
      expect_equivalent(
        x[row, pos, ], expected[[k]],
        label = paste0("x[", row, ", ", pos, ", ]")
      )
    }
  }

  gen <- generator_fasta_label_header_csv(
    path_corpus = "fasta_3", batch_size = 3, maxlen = 15, step = 1,
    vocabulary_label = c("X", "Y"), reverse_complement = FALSE,
    ambiguous_nuc = "empirical"
  )
  nuc_dist_1 <- 1 / 18 * c(8, 2, 3, 5)
  nuc_dist_2 <- 1 / 17 * c(3, 2, 6, 6)
  arrays <- gen()

  # Sample 1. The last entry checks position 15; the original asserted
  # position 14 twice and never looked at 15.
  # NOTE(review): the expected value at 15 is carried over from the duplicated
  # assertion; confirm against the fasta_3 fixture.
  expect_positions(
    arrays[[1]], 1,
    list(
      nt_0, nt_0,
      nt_a, nt_a, nuc_dist_1,
      nt_c, nt_c, nuc_dist_1,
      nt_g, nt_g, nuc_dist_1,
      nt_t, nt_g, nt_a, nt_a
    )
  )
  expect_equivalent(arrays[[2]][1, ], c(1, 0))

  expect_positions(
    arrays[[1]], 3,
    list(
      nt_0, nt_0, nt_0, nt_0, nt_0,
      nuc_dist_2, nt_g, nt_g, nt_g, nt_g,
      nt_t, nt_t, nt_t, nuc_dist_2, nt_t
    )
  )
  expect_equivalent(arrays[[2]][3, ], c(0, 1))

  ############# padding/amb nucleotide, label_folder ############
  directories <- c("fasta_2", "fasta_3")
  gen <- get_generator(
    path = directories, val = FALSE, train_type = "label_folder",
    padding = TRUE, format = "fasta", batch_size = 6, maxlen = 15,
    ambiguous_nuc = "equal", vocabulary = c("a", "c", "g", "t"),
    reverse_complement = FALSE, step = 1
  )
  equal_vector <- rep(0.25, 4)
  arrays <- gen()

  # Sample 1: seven all-zero rows followed by a, a, c, c, g, g, t, t. The last
  # entry checks position 15; the original asserted position 14 twice.
  expect_positions(
    arrays[[1]], 1,
    list(
      nt_0, nt_0, nt_0, nt_0, nt_0, nt_0, nt_0,
      nt_a, nt_a, nt_c, nt_c, nt_g, nt_g, nt_t, nt_t
    )
  )
  expect_equivalent(arrays[[2]][1, ], c(1, 0))

  expect_positions(
    arrays[[1]], 6,
    list(
      nt_0, nt_0, nt_0, nt_0, nt_0,
      equal_vector, nt_g, nt_g, nt_g, nt_g,
      nt_t, nt_t, nt_t, equal_vector, nt_t
    )
  )
  expect_equivalent(arrays[[2]][6, ], c(0, 1))

  ###### more than 2 files in one batch ######
  # LM
  gen <- generator_fasta_lm(
    path_corpus = "fasta_3", batch_size = 8, maxlen = 12, max_iter = 10000,
    step = 50, ambiguous_nuc = "empirical"
  )
  nuc_dist_1 <- 1 / 18 * c(8, 2, 3, 5)
  nuc_dist_2 <- 1 / 17 * c(3, 2, 6, 6)
  arrays <- gen()
  expect_positions(arrays[[1]], 1, list(nt_a, nt_a, nuc_dist_1, nt_c))
  expect_positions(
    arrays[[1]], 2, list(nt_0, nt_0, nt_0, nt_a, nt_a, nt_a)
  )
  expect_positions(
    arrays[[1]], 3, list(nt_0, nt_0, nt_0, nuc_dist_2, nt_g, nt_g)
  )
  expect_positions(
    arrays[[1]], 4,
    list(nt_0, nt_0, nt_0, nt_0, nt_0, nt_a, nt_c, nt_g)
  )
  expect_positions(
    arrays[[1]], 5, list(nt_0, nt_0, nuc_dist_2, nuc_dist_2, nuc_dist_2)
  )
# Multi-file LM batch, continued: sample 5 from position 6 onwards.
  # One-hot rows for the four-letter nucleotide encoding; nt_0 is an all-zero
  # row.
  nt_a <- c(1, 0, 0, 0)
  nt_c <- c(0, 1, 0, 0)
  nt_g <- c(0, 0, 1, 0)
  nt_0 <- c(0, 0, 0, 0)

  # Compare consecutive positions of sample `row` in the 3D array `x` against
  # the vectors in `expected`, starting at position `first`. One expectation is
  # emitted per position, labelled with the index that was checked.
  expect_positions <- function(x, row, expected, first = 1) {
    for (k in seq_along(expected)) {
      pos <- first + k - 1
      expect_equivalent(
        x[row, pos, ], expected[[k]],
        label = paste0("x[", row, ", ", pos, ", ]")
      )
    }
  }

  expect_positions(
    arrays[[1]], 5,
    list(nuc_dist_2, nuc_dist_2, nt_a, nuc_dist_2),
    first = 6
  )
  expect_positions(arrays[[1]], 6, list(nt_a, nt_a, nuc_dist_1, nt_c))
  expect_positions(
    arrays[[1]], 7, list(nt_0, nt_0, nt_0, nt_a, nt_a, nt_a)
  )
  expect_positions(
    arrays[[1]], 8, list(nt_0, nt_0, nt_0, nuc_dist_2, nt_g, nt_g)
  )

  # Second batch from the same generator.
  arrays <- gen()
  expect_positions(
    arrays[[1]], 1,
    list(nt_0, nt_0, nt_0, nt_0, nt_0, nt_a, nt_c, nt_g)
  )
  expect_positions(
    arrays[[1]], 2,
    list(
      nt_0, nt_0,
      nuc_dist_2, nuc_dist_2, nuc_dist_2, nuc_dist_2, nuc_dist_2,
      nt_a, nuc_dist_2
    )
  )
# Multi-file LM generator, second batch (continued). `arrays`, `nuc_dist_1`
# and `nuc_dist_2` come from the statements preceding this span.

# sample 3, positions 1-4
expect_equivalent(arrays[[1]][3, 1, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][3, 2, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][3, 3, ], nuc_dist_1)
expect_equivalent(arrays[[1]][3, 4, ], c(0, 1, 0, 0))

# sample 4, positions 1-6
expect_equivalent(arrays[[1]][4, 1, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][4, 2, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][4, 3, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][4, 4, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][4, 5, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][4, 6, ], c(1, 0, 0, 0))

# sample 5, positions 1-6
expect_equivalent(arrays[[1]][5, 1, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][5, 2, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][5, 3, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][5, 4, ], nuc_dist_2)
expect_equivalent(arrays[[1]][5, 5, ], c(0, 0, 1, 0))
expect_equivalent(arrays[[1]][5, 6, ], c(0, 0, 1, 0))

# sample 6, positions 1-8
expect_equivalent(arrays[[1]][6, 1, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][6, 2, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][6, 3, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][6, 4, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][6, 5, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][6, 6, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][6, 7, ], c(0, 1, 0, 0))
expect_equivalent(arrays[[1]][6, 8, ], c(0, 0, 1, 0))

# sample 7, positions 1-9
expect_equivalent(arrays[[1]][7, 1, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][7, 2, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][7, 3, ], nuc_dist_2)
expect_equivalent(arrays[[1]][7, 4, ], nuc_dist_2)
expect_equivalent(arrays[[1]][7, 5, ], nuc_dist_2)
expect_equivalent(arrays[[1]][7, 6, ], nuc_dist_2)
expect_equivalent(arrays[[1]][7, 7, ], nuc_dist_2)
expect_equivalent(arrays[[1]][7, 8, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][7, 9, ], nuc_dist_2)

# sample 8, positions 1-4
expect_equivalent(arrays[[1]][8, 1, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][8, 2, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][8, 3, ], nuc_dist_1)
expect_equivalent(arrays[[1]][8, 4, ], c(0, 1, 0, 0))

# label header: same "fasta_3" corpus read through the header/csv label
# generator with label vocabulary X/Y.
gen <- generator_fasta_label_header_csv(
  path_corpus = "fasta_3",
  batch_size = 8,
  maxlen = 12,
  max_iter = 10000,
  step = 50,
  ambiguous_nuc = "empirical",
  reverse_complement = FALSE,
  vocabulary_label = c("X", "Y")
)
# Expected rows for positions encoded as a distribution rather than one-hot
# (presumably ambiguous nucleotides under ambiguous_nuc = "empirical" --
# TODO confirm the 18/17 denominators against the fixture sequences).
nuc_dist_1 <- 1 / 18 * c(8, 2, 3, 5)
nuc_dist_2 <- 1 / 17 * c(3, 2, 6, 6)

# First batch of the label-header generator.
arrays <- gen()

# sample 1, positions 1-4
expect_equivalent(arrays[[1]][1, 1, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][1, 2, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][1, 3, ], nuc_dist_1)
expect_equivalent(arrays[[1]][1, 4, ], c(0, 1, 0, 0))

# sample 2, positions 1-5
expect_equivalent(arrays[[1]][2, 1, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][2, 2, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][2, 3, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][2, 4, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][2, 5, ], c(1, 0, 0, 0))

# sample 3, positions 1-5
expect_equivalent(arrays[[1]][3, 1, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][3, 2, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][3, 3, ], nuc_dist_2)
expect_equivalent(arrays[[1]][3, 4, ], c(0, 0, 1, 0))
expect_equivalent(arrays[[1]][3, 5, ], c(0, 0, 1, 0))

# sample 4, positions 1-7
expect_equivalent(arrays[[1]][4, 1, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][4, 2, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][4, 3, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][4, 4, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][4, 5, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][4, 6, ], c(0, 1, 0, 0))
expect_equivalent(arrays[[1]][4, 7, ], c(0, 0, 1, 0))

# sample 5, positions 1-8
expect_equivalent(arrays[[1]][5, 1, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][5, 2, ], nuc_dist_2)
expect_equivalent(arrays[[1]][5, 3, ], nuc_dist_2)
expect_equivalent(arrays[[1]][5, 4, ], nuc_dist_2)
expect_equivalent(arrays[[1]][5, 5, ], nuc_dist_2)
expect_equivalent(arrays[[1]][5, 6, ], nuc_dist_2)
expect_equivalent(arrays[[1]][5, 7, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][5, 8, ], nuc_dist_2)

# sample 6, positions 1-3
expect_equivalent(arrays[[1]][6, 1, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][6, 2, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][6, 3, ], nuc_dist_1)
# Label-header generator, first batch (continued). `gen`, `arrays`,
# `nuc_dist_1` and `nuc_dist_2` are defined by the statements preceding this
# span.

# sample 6, position 4
expect_equivalent(arrays[[1]][6, 4, ], c(0, 1, 0, 0))

# sample 7, positions 1-5
expect_equivalent(arrays[[1]][7, 1, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][7, 2, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][7, 3, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][7, 4, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][7, 5, ], c(1, 0, 0, 0))

# sample 8, positions 1-5
expect_equivalent(arrays[[1]][8, 1, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][8, 2, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][8, 3, ], nuc_dist_2)
expect_equivalent(arrays[[1]][8, 4, ], c(0, 0, 1, 0))
expect_equivalent(arrays[[1]][8, 5, ], c(0, 0, 1, 0))

# Second call to the same generator: the checks below apply to the next batch.
arrays <- gen()

# sample 1, positions 1-7
expect_equivalent(arrays[[1]][1, 1, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][1, 2, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][1, 3, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][1, 4, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][1, 5, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][1, 6, ], c(0, 1, 0, 0))
expect_equivalent(arrays[[1]][1, 7, ], c(0, 0, 1, 0))

# sample 2, positions 1-8
expect_equivalent(arrays[[1]][2, 1, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][2, 2, ], nuc_dist_2)
expect_equivalent(arrays[[1]][2, 3, ], nuc_dist_2)
expect_equivalent(arrays[[1]][2, 4, ], nuc_dist_2)
expect_equivalent(arrays[[1]][2, 5, ], nuc_dist_2)
expect_equivalent(arrays[[1]][2, 6, ], nuc_dist_2)
expect_equivalent(arrays[[1]][2, 7, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][2, 8, ], nuc_dist_2)

# sample 3, positions 1-4
expect_equivalent(arrays[[1]][3, 1, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][3, 2, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][3, 3, ], nuc_dist_1)
expect_equivalent(arrays[[1]][3, 4, ], c(0, 1, 0, 0))

# sample 4, positions 1-5
expect_equivalent(arrays[[1]][4, 1, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][4, 2, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][4, 3, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][4, 4, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][4, 5, ], c(1, 0, 0, 0))

# sample 5, positions 1-2
expect_equivalent(arrays[[1]][5, 1, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][5, 2, ], c(0, 0, 0, 0))
# Label-header generator, second batch (continued). `arrays`, `nuc_dist_1`
# and `nuc_dist_2` come from the statements preceding this span.

# sample 5, positions 3-5
expect_equivalent(arrays[[1]][5, 3, ], nuc_dist_2)
expect_equivalent(arrays[[1]][5, 4, ], c(0, 0, 1, 0))
expect_equivalent(arrays[[1]][5, 5, ], c(0, 0, 1, 0))

# sample 6, positions 1-7
expect_equivalent(arrays[[1]][6, 1, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][6, 2, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][6, 3, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][6, 4, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][6, 5, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][6, 6, ], c(0, 1, 0, 0))
expect_equivalent(arrays[[1]][6, 7, ], c(0, 0, 1, 0))

# sample 7, positions 1-8
expect_equivalent(arrays[[1]][7, 1, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][7, 2, ], nuc_dist_2)
expect_equivalent(arrays[[1]][7, 3, ], nuc_dist_2)
expect_equivalent(arrays[[1]][7, 4, ], nuc_dist_2)
expect_equivalent(arrays[[1]][7, 5, ], nuc_dist_2)
expect_equivalent(arrays[[1]][7, 6, ], nuc_dist_2)
expect_equivalent(arrays[[1]][7, 7, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][7, 8, ], nuc_dist_2)

# sample 8, positions 1-4
expect_equivalent(arrays[[1]][8, 1, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][8, 2, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][8, 3, ], nuc_dist_1)
expect_equivalent(arrays[[1]][8, 4, ], c(0, 1, 0, 0))

# label folder: one class per directory; arrays[[2]] is checked below as a
# two-column label row (c(1, 0) or c(0, 1)).
# Assignment uses `<-` (was `=`) to match the other assignments in this file.
directories <- c("fasta_2", "fasta_3")
gen <- get_generator(
  path = directories,
  train_type = "label_folder",
  batch_size = 20,
  maxlen = 12,
  val = FALSE,
  padding = TRUE,
  ambiguous_nuc = "empirical",
  vocabulary = c("a", "c", "g", "t"),
  reverse_complement = FALSE,
  step = 1
)
# Expected rows for positions encoded as a distribution rather than one-hot
# (presumably ambiguous nucleotides under ambiguous_nuc = "empirical" --
# TODO confirm against the fixture sequences).
nuc_dist_1 <- 1 / 18 * c(8, 2, 3, 5)
nuc_dist_2 <- 1 / 17 * c(3, 2, 6, 6)

# First batch.
arrays <- gen()

# sample 9: four all-zero rows (presumably padding, padding = TRUE -- confirm)
# followed by one-hot rows; label row c(1, 0).
expect_equivalent(arrays[[1]][9, 1, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][9, 2, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][9, 3, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][9, 4, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][9, 5, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][9, 6, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][9, 7, ], c(0, 1, 0, 0))
expect_equivalent(arrays[[1]][9, 8, ], c(0, 1, 0, 0))
expect_equivalent(arrays[[1]][9, 9, ], c(0, 0, 1, 0))
expect_equivalent(arrays[[1]][9, 10, ], c(0, 0, 1, 0))
expect_equivalent(arrays[[1]][9, 11, ], c(0, 0, 0, 1))
expect_equivalent(arrays[[1]][9, 12, ], c(0, 0, 0, 1))
expect_equivalent(arrays[[2]][9, ], c(1, 0))

# sample 12: all 12 positions; label row c(0, 1).
expect_equivalent(arrays[[1]][12, 1, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][12, 2, ], nuc_dist_1)
expect_equivalent(arrays[[1]][12, 3, ], c(0, 1, 0, 0))
expect_equivalent(arrays[[1]][12, 4, ], c(0, 1, 0, 0))
expect_equivalent(arrays[[1]][12, 5, ], nuc_dist_1)
expect_equivalent(arrays[[1]][12, 6, ], c(0, 0, 1, 0))
expect_equivalent(arrays[[1]][12, 7, ], c(0, 0, 1, 0))
expect_equivalent(arrays[[1]][12, 8, ], nuc_dist_1)
expect_equivalent(arrays[[1]][12, 9, ], c(0, 0, 0, 1))
expect_equivalent(arrays[[1]][12, 10, ], c(0, 0, 1, 0))
expect_equivalent(arrays[[1]][12, 11, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][12, 12, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[2]][12, ], c(0, 1))

# sample 18: same expected rows and label as sample 12.
expect_equivalent(arrays[[1]][18, 1, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][18, 2, ], nuc_dist_1)
expect_equivalent(arrays[[1]][18, 3, ], c(0, 1, 0, 0))
expect_equivalent(arrays[[1]][18, 4, ], c(0, 1, 0, 0))
expect_equivalent(arrays[[1]][18, 5, ], nuc_dist_1)
expect_equivalent(arrays[[1]][18, 6, ], c(0, 0, 1, 0))
expect_equivalent(arrays[[1]][18, 7, ], c(0, 0, 1, 0))
expect_equivalent(arrays[[1]][18, 8, ], nuc_dist_1)
expect_equivalent(arrays[[1]][18, 9, ], c(0, 0, 0, 1))
expect_equivalent(arrays[[1]][18, 10, ], c(0, 0, 1, 0))
expect_equivalent(arrays[[1]][18, 11, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][18, 12, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[2]][18, ], c(0, 1))

# Second call to the same generator: the checks below apply to the next batch.
arrays <- gen()

# sample 7, positions 1-6
expect_equivalent(arrays[[1]][7, 1, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][7, 2, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][7, 3, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][7, 4, ], c(0, 0, 0, 0))
expect_equivalent(arrays[[1]][7, 5, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][7, 6, ], c(1, 0, 0, 0))
# Label-folder generator, second batch (continued). `arrays` and `nuc_dist_1`
# come from the statements preceding this span.

# sample 7, positions 7-12; label row c(1, 0)
expect_equivalent(arrays[[1]][7, 7, ], c(0, 1, 0, 0))
expect_equivalent(arrays[[1]][7, 8, ], c(0, 1, 0, 0))
expect_equivalent(arrays[[1]][7, 9, ], c(0, 0, 1, 0))
expect_equivalent(arrays[[1]][7, 10, ], c(0, 0, 1, 0))
expect_equivalent(arrays[[1]][7, 11, ], c(0, 0, 0, 1))
expect_equivalent(arrays[[1]][7, 12, ], c(0, 0, 0, 1))
expect_equivalent(arrays[[2]][7, ], c(1, 0))

# sample 14: all 12 positions; label row c(0, 1)
expect_equivalent(arrays[[1]][14, 1, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][14, 2, ], nuc_dist_1)
expect_equivalent(arrays[[1]][14, 3, ], c(0, 1, 0, 0))
expect_equivalent(arrays[[1]][14, 4, ], c(0, 1, 0, 0))
expect_equivalent(arrays[[1]][14, 5, ], nuc_dist_1)
expect_equivalent(arrays[[1]][14, 6, ], c(0, 0, 1, 0))
expect_equivalent(arrays[[1]][14, 7, ], c(0, 0, 1, 0))
expect_equivalent(arrays[[1]][14, 8, ], nuc_dist_1)
expect_equivalent(arrays[[1]][14, 9, ], c(0, 0, 0, 1))
expect_equivalent(arrays[[1]][14, 10, ], c(0, 0, 1, 0))
expect_equivalent(arrays[[1]][14, 11, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][14, 12, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[2]][14, ], c(0, 1))

# sample 20: same expected rows and label as sample 14
expect_equivalent(arrays[[1]][20, 1, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][20, 2, ], nuc_dist_1)
expect_equivalent(arrays[[1]][20, 3, ], c(0, 1, 0, 0))
expect_equivalent(arrays[[1]][20, 4, ], c(0, 1, 0, 0))
expect_equivalent(arrays[[1]][20, 5, ], nuc_dist_1)
expect_equivalent(arrays[[1]][20, 6, ], c(0, 0, 1, 0))
expect_equivalent(arrays[[1]][20, 7, ], c(0, 0, 1, 0))
expect_equivalent(arrays[[1]][20, 8, ], nuc_dist_1)
expect_equivalent(arrays[[1]][20, 9, ], c(0, 0, 0, 1))
expect_equivalent(arrays[[1]][20, 10, ], c(0, 0, 1, 0))
expect_equivalent(arrays[[1]][20, 11, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][20, 12, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[2]][20, ], c(0, 1))

# test quality scores LM: fastq input with use_quality_score = TRUE.
gen <- generator_fasta_lm(
  path_corpus = "fastq",
  format = "fastq",
  batch_size = 10,
  maxlen = 3,
  max_iter = 10000,
  vocabulary = c("a", "c", "g", "t"),
  verbose = FALSE,
  shuffle_file_order = FALSE,
  step = 2,
  seed = 1234,
  shuffle_input = FALSE,
  file_limit = NULL,
  path_file_log = NULL,
  reverse_complement = FALSE,
  output_format = "target_right",
  ambiguous_nuc = "zeros",
  use_quality_score = TRUE,
  proportion_per_seq = NULL,
  padding = FALSE
)
# Expected encodings, one per vocabulary position 1-4, built from the quality
# characters "J", "C", "G" and "?". The names `c` and `t` shadow base::c and
# base::t as variables only; calls such as c(1, 0) still find the function,
# because R skips non-function bindings when resolving a call.
a <- create_quality_vector(prob = quality_to_probability("J"), pos = 1, voc_length = 4)
c <- create_quality_vector(prob = quality_to_probability("C"), pos = 2, voc_length = 4)
g <- create_quality_vector(prob = quality_to_probability("G"), pos = 3, voc_length = 4)
t <- create_quality_vector(prob = quality_to_probability("?"), pos = 4, voc_length = 4)

arrays <- gen()

# Inputs: samples 1-8, three positions each.
expect_equivalent(arrays[[1]][1, 1, ], a)
expect_equivalent(arrays[[1]][1, 2, ], a)
expect_equivalent(arrays[[1]][1, 3, ], c)
expect_equivalent(arrays[[1]][2, 1, ], c)
expect_equivalent(arrays[[1]][2, 2, ], c)
expect_equivalent(arrays[[1]][2, 3, ], g)
expect_equivalent(arrays[[1]][3, 1, ], a)
expect_equivalent(arrays[[1]][3, 2, ], c)
expect_equivalent(arrays[[1]][3, 3, ], g)
expect_equivalent(arrays[[1]][4, 1, ], g)
expect_equivalent(arrays[[1]][4, 2, ], t)
expect_equivalent(arrays[[1]][4, 3, ], a)
expect_equivalent(arrays[[1]][5, 1, ], c)
expect_equivalent(arrays[[1]][5, 2, ], g)
expect_equivalent(arrays[[1]][5, 3, ], t)
expect_equivalent(arrays[[1]][6, 1, ], t)
expect_equivalent(arrays[[1]][6, 2, ], c)
expect_equivalent(arrays[[1]][6, 3, ], g)
expect_equivalent(arrays[[1]][7, 1, ], a)
expect_equivalent(arrays[[1]][7, 2, ], t)
expect_equivalent(arrays[[1]][7, 3, ], a)
expect_equivalent(arrays[[1]][8, 1, ], a)
expect_equivalent(arrays[[1]][8, 2, ], a)
expect_equivalent(arrays[[1]][8, 3, ], c)

# Targets for samples 1-8 are compared against the same quality vectors.
expect_equivalent(arrays[[2]][1, ], c)
expect_equivalent(arrays[[2]][2, ], g)
expect_equivalent(arrays[[2]][3, ], t)
expect_equivalent(arrays[[2]][4, ], c)
expect_equivalent(arrays[[2]][5, ], c)
expect_equivalent(arrays[[2]][6, ], t)
expect_equivalent(arrays[[2]][7, ], t)
expect_equivalent(arrays[[2]][8, ], c)

# test quality scores label: same fastq corpus through the label-folder
# generator (num_targets = 2, ones_column = 1).
gen <- generator_fasta_label_folder(
  path_corpus = "fastq",
  format = "fastq",
  batch_size = 10,
  maxlen = 3,
  max_iter = 10000,
  vocabulary = c("a", "c", "g", "t"),
  verbose = FALSE,
  shuffle_file_order = FALSE,
  step = 2,
  seed = 1234,
  shuffle_input = FALSE,
  file_limit = NULL,
  path_file_log = NULL,
  reverse_complement = FALSE,
  ambiguous_nuc = "zeros",
  use_quality_score = TRUE,
  proportion_per_seq = NULL,
  num_targets = 2,
  ones_column = 1,
  padding = FALSE
)
# Expected quality encodings, rebuilt for this section.
a <- create_quality_vector(prob = quality_to_probability("J"), pos = 1, voc_length = 4)
c <- create_quality_vector(prob = quality_to_probability("C"), pos = 2, voc_length = 4)
g <- create_quality_vector(prob = quality_to_probability("G"), pos = 3, voc_length = 4)
t <- create_quality_vector(prob = quality_to_probability("?"), pos = 4, voc_length = 4)

arrays <- gen()

# Inputs: samples 1-9 (three positions each) and sample 10, position 1.
expect_equivalent(arrays[[1]][1, 1, ], a)
expect_equivalent(arrays[[1]][1, 2, ], a)
expect_equivalent(arrays[[1]][1, 3, ], c)
expect_equivalent(arrays[[1]][2, 1, ], c)
expect_equivalent(arrays[[1]][2, 2, ], c)
expect_equivalent(arrays[[1]][2, 3, ], g)
expect_equivalent(arrays[[1]][3, 1, ], a)
expect_equivalent(arrays[[1]][3, 2, ], c)
expect_equivalent(arrays[[1]][3, 3, ], g)
expect_equivalent(arrays[[1]][4, 1, ], g)
expect_equivalent(arrays[[1]][4, 2, ], t)
expect_equivalent(arrays[[1]][4, 3, ], a)
expect_equivalent(arrays[[1]][5, 1, ], a)
expect_equivalent(arrays[[1]][5, 2, ], c)
expect_equivalent(arrays[[1]][5, 3, ], g)
expect_equivalent(arrays[[1]][6, 1, ], c)
expect_equivalent(arrays[[1]][6, 2, ], g)
expect_equivalent(arrays[[1]][6, 3, ], t)
expect_equivalent(arrays[[1]][7, 1, ], t)
expect_equivalent(arrays[[1]][7, 2, ], c)
expect_equivalent(arrays[[1]][7, 3, ], g)
expect_equivalent(arrays[[1]][8, 1, ], a)
expect_equivalent(arrays[[1]][8, 2, ], t)
expect_equivalent(arrays[[1]][8, 3, ], a)
expect_equivalent(arrays[[1]][9, 1, ], a)
expect_equivalent(arrays[[1]][9, 2, ], t)
expect_equivalent(arrays[[1]][9, 3, ], a)
expect_equivalent(arrays[[1]][10, 1, ], a)
# Quality-score label test (continued). `arrays` and the quality vectors
# `a`, `c`, `g`, `t` come from the statements preceding this span. `c` is a
# numeric vector here, yet c(1, 0) still calls base::c, because R skips
# non-function bindings when resolving a call.
expect_equivalent(arrays[[1]][10, 2, ], a)
expect_equivalent(arrays[[1]][10, 3, ], c)
expect_equivalent(arrays[[2]][1, ], c(1, 0))
expect_equivalent(arrays[[2]][10, ], c(1, 0))

## test read data with quality: read_data = TRUE; the input is checked below
## as a list of two tensors, arrays[[1]][[1]] and arrays[[1]][[2]].
gen <- generator_fasta_label_folder(
  path_corpus = "read_data",
  format = "fastq",
  batch_size = 5,
  maxlen = 12,
  max_iter = 10000,
  vocabulary = c("a", "c", "g", "t"),
  verbose = FALSE,
  shuffle_file_order = FALSE,
  step = 2,
  seed = 1234,
  shuffle_input = FALSE,
  file_limit = NULL,
  path_file_log = NULL,
  read_data = TRUE,
  reverse_complement = FALSE,
  ambiguous_nuc = "zeros",
  use_quality_score = TRUE,
  proportion_per_seq = NULL,
  num_targets = 2,
  ones_column = 1,
  padding = FALSE
)
# Expected quality encodings, rebuilt for this section.
a <- create_quality_vector(prob = quality_to_probability("J"), pos = 1, voc_length = 4)
c <- create_quality_vector(prob = quality_to_probability("C"), pos = 2, voc_length = 4)
g <- create_quality_vector(prob = quality_to_probability("G"), pos = 3, voc_length = 4)
t <- create_quality_vector(prob = quality_to_probability("?"), pos = 4, voc_length = 4)

# First batch: each sample is compared against six stacked rows per tensor.
arrays <- gen()
expect_equivalent(arrays[[1]][[1]][1, , ], rbind(a, a, a, c, c, c))
expect_equivalent(arrays[[1]][[2]][1, , ], rbind(c, c, c, g, g, g))
expect_equivalent(arrays[[1]][[1]][2, , ], rbind(a, c, a, c, a, c))
expect_equivalent(arrays[[1]][[2]][2, , ], rbind(c, g, c, g, c, g))
expect_equivalent(arrays[[1]][[1]][3, , ], rbind(g, g, g, t, t, t))
expect_equivalent(arrays[[1]][[2]][3, , ], rbind(t, t, t, g, g, g))
expect_equivalent(arrays[[1]][[1]][4, , ], rbind(g, t, g, t, g, t))
expect_equivalent(arrays[[1]][[2]][4, , ], rbind(t, g, t, g, t, g))
expect_equivalent(arrays[[1]][[1]][5, , ], rbind(a, a, a, c, c, c))
expect_equivalent(arrays[[1]][[2]][5, , ], rbind(c, c, c, g, g, g))

# Second batch from the same generator.
arrays <- gen()
expect_equivalent(arrays[[1]][[1]][1, , ], rbind(a, c, a, c, a, c))
expect_equivalent(arrays[[1]][[2]][1, , ], rbind(c, g, c, g, c, g))
expect_equivalent(arrays[[1]][[1]][2, , ], rbind(g, g, g, t, t, t))
expect_equivalent(arrays[[1]][[2]][2, , ], rbind(t, t, t, g, g, g))
expect_equivalent(arrays[[1]][[1]][3, , ], rbind(g, t, g, t, g, t))
expect_equivalent(arrays[[1]][[2]][3, , ], rbind(t, g, t, g, t, g))

# additional input LM: added_label_path = "label.csv" supplies an extra input;
# below, arrays[[1]][[1]] is checked as that added input and arrays[[1]][[2]]
# as the sequence tensor.
gen <- generator_fasta_lm(
  path_corpus = "fasta_3",
  format = "fasta",
  batch_size = 10,
  maxlen = 5,
  vocabulary = c("a", "c", "g", "t"),
  shuffle_file_order = FALSE,
  step = 4,
  shuffle_input = FALSE,
  reverse_complement = FALSE,
  output_format = "target_right",
  ambiguous_nuc = "zeros",
  added_label_path = "label.csv",
  add_input_as_seq = FALSE,
  padding = FALSE
)
arrays <- gen()
expect_equivalent(arrays[[1]][[1]][1, ], c(0, 0, 1, 0))
expect_equivalent(arrays[[1]][[1]][2, ], c(0, 0, 1, 0))
expect_equivalent(arrays[[1]][[1]][3, ], c(0, 0, 1, 0))
expect_equivalent(arrays[[1]][[1]][4, ], c(0, 0, 1, 0))
expect_equivalent(arrays[[1]][[1]][5, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][[1]][6, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][[1]][7, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][[1]][8, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][[1]][9, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][[1]][10, ], c(0, 0, 1, 0))
expect_equivalent(arrays[[1]][[2]][10, 1, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][[2]][10, 3, ], c(0, 0, 0, 0))

# additional input label_folder: three class directories, built through
# generator_initialize() and wrapped by generator_fasta_label_folder_wrapper().
dir <- c("label_folder/x", "label_folder/y", "label_folder/z")
gen_list <- generator_initialize(
  directories = dir,
  format = "fasta",
  batch_size = 15,
  maxlen = 4,
  step = 2,
  val = FALSE,
  padding = FALSE,
  added_label_path = "label.csv",
  add_input_as_seq = FALSE
)
gen <- generator_fasta_label_folder_wrapper(val = FALSE, path = dir, gen_list = gen_list)
arrays <- gen()

# Added-input rows for samples 1-8.
expect_equivalent(arrays[[1]][[1]][1, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][[1]][2, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][[1]][3, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][[1]][4, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][[1]][5, ], c(1, 0, 0, 1))
expect_equivalent(arrays[[1]][[1]][6, ], c(0, 1, 0, 0))
expect_equivalent(arrays[[1]][[1]][7, ], c(0, 1, 0, 0))
expect_equivalent(arrays[[1]][[1]][8, ], c(0, 1, 0, 1))
# Additional-input label_folder test, first batch (continued). `arrays`,
# `dir` and `gen_list` come from the statements preceding this span.

# Added-input rows for samples 9-15.
expect_equivalent(arrays[[1]][[1]][9, ], c(0, 1, 0, 1))
expect_equivalent(arrays[[1]][[1]][10, ], c(0, 1, 0, 0))
expect_equivalent(arrays[[1]][[1]][11, ], c(0, 0, 1, 0))
expect_equivalent(arrays[[1]][[1]][12, ], c(0, 0, 1, 0))
expect_equivalent(arrays[[1]][[1]][13, ], c(0, 0, 1, 1))
expect_equivalent(arrays[[1]][[1]][14, ], c(0, 0, 1, 1))
expect_equivalent(arrays[[1]][[1]][15, ], c(0, 0, 1, 0))

# Sequence tensor: positions 1-2 of samples 5, 10 and 15.
expect_equivalent(arrays[[1]][[2]][5, 1, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][[2]][5, 2, ], c(0, 0, 1, 0))
expect_equivalent(arrays[[1]][[2]][10, 1, ], c(0, 0, 1, 0))
expect_equivalent(arrays[[1]][[2]][10, 2, ], c(0, 1, 0, 0))
expect_equivalent(arrays[[1]][[2]][15, 1, ], c(0, 0, 0, 1))
expect_equivalent(arrays[[1]][[2]][15, 2, ], c(0, 0, 0, 1))

# The wrapper is rebuilt around the same `gen_list`; the expected rows below
# differ from the first batch (presumably because the generators in `gen_list`
# keep their position -- confirm).
gen <- generator_fasta_label_folder_wrapper(val = FALSE, path = dir, gen_list = gen_list)
arrays <- gen()
expect_equivalent(arrays[[1]][[1]][1, ], c(1, 0, 0, 1))
expect_equivalent(arrays[[1]][[1]][2, ], c(1, 0, 0, 1))
expect_equivalent(arrays[[1]][[1]][3, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][[1]][4, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][[1]][5, ], c(1, 0, 0, 0))
expect_equivalent(arrays[[1]][[1]][6, ], c(0, 1, 0, 0))
expect_equivalent(arrays[[1]][[1]][7, ], c(0, 1, 0, 1))
expect_equivalent(arrays[[1]][[1]][8, ], c(0, 1, 0, 1))
expect_equivalent(arrays[[1]][[1]][9, ], c(0, 1, 0, 0))
expect_equivalent(arrays[[1]][[1]][10, ], c(0, 1, 0, 0))
expect_equivalent(arrays[[1]][[1]][11, ], c(0, 0, 1, 0))
expect_equivalent(arrays[[1]][[1]][12, ], c(0, 0, 1, 1))
expect_equivalent(arrays[[1]][[1]][13, ], c(0, 0, 1, 1))
expect_equivalent(arrays[[1]][[1]][14, ], c(0, 0, 1, 0))
expect_equivalent(arrays[[1]][[1]][15, ], c(0, 0, 1, 0))

## read data with quality and 2 classes
# NOTE(review): ambiguous_nuc is spelled "zero" here but "zeros" in other
# generator calls in this file -- confirm which value the generator expects.
gen <- get_generator(
  path = c("read_data_2/label_a", "read_data_2/label_b"),
  train_type = "label_folder",
  format = "fastq",
  batch_size = 4,
  maxlen = 12,
  vocabulary = c("a", "c", "g", "t"),
  verbose = FALSE,
  shuffle_file_order = FALSE,
  step = 1,
  seed = 1234,
  shuffle_input = FALSE,
  file_limit = NULL,
  path_file_log = NULL,
  reverse_complement = FALSE,
  val = FALSE,
  ambiguous_nuc = "zero",
  proportion_per_seq = NULL,
  read_data = TRUE,
  use_quality_score = TRUE,
  padding = FALSE,
  added_label_path = NULL,
  skip_amb_nuc = NULL
)
# NOTE(review): gen() is called twice before any expectation, so the first
# batch is discarded and only the second is checked -- confirm this is
# intended.
arrays <- gen()
# Expected quality encodings for vocabulary positions 1-4.
a <- create_quality_vector(prob = quality_to_probability("J"), pos = 1, voc_length = 4)
c <- create_quality_vector(prob = quality_to_probability("C"), pos = 2, voc_length = 4)
g <- create_quality_vector(prob = quality_to_probability("G"), pos = 3, voc_length = 4)
t <- create_quality_vector(prob = quality_to_probability("?"), pos = 4, voc_length = 4)
arrays <- gen()
expect_equivalent(arrays[[1]][[1]][1, , ], rbind(a, a, a, a, a, a))
expect_equivalent(arrays[[1]][[2]][1, , ], rbind(c, c, c, c, c, c))
expect_equivalent(arrays[[1]][[1]][2, , ], rbind(a, a, a, a, a, a))
expect_equivalent(arrays[[1]][[2]][2, , ], rbind(c, c, c, c, c, c))
expect_equivalent(arrays[[1]][[1]][3, , ], rbind(g, g, g, g, g, g))
expect_equivalent(arrays[[1]][[2]][3, , ], rbind(t, t, t, t, t, t))
expect_equivalent(arrays[[1]][[1]][4, , ], rbind(g, g, g, g, g, g))
expect_equivalent(arrays[[1]][[2]][4, , ], rbind(t, t, t, t, t, t))

### get output tensor from csv file + concat
# Targets are read from "output_label.csv" (target_from_csv); the vocabulary
# carries a fifth symbol "Z" and concat_seq = "ZZ".
testpath <- file.path("fasta_2")
label_from_csv <- "output_label.csv"
gen <- generator_fasta_label_header_csv(
  path_corpus = testpath,
  batch_size = 5,
  maxlen = 10,
  step = 10,
  vocabulary = c("a", "c", "g", "t", "Z"),
  reverse_complement = FALSE,
  vocabulary_label = c("w", "x", "y"),
  format = "fasta",
  max_iter = 10000,
  verbose = FALSE,
  shuffle_file_order = FALSE,
  seed = 1234,
  shuffle_input = FALSE,
  file_limit = NULL,
  path_file_log = NULL,
  ambiguous_nuc = "zero",
  proportion_per_seq = NULL,
  read_data = FALSE,
  use_quality_score = FALSE,
  padding = TRUE,
  skip_amb_nuc = NULL,
  max_samples = NULL,
  concat_seq = "ZZ",
  added_label_path = NULL,
  add_input_as_seq = NULL,
  target_from_csv = label_from_csv
)
arrays <- gen()
expect_equivalent(arrays[[1]][1, 8, ], c(0, 0, 0, 1, 0))
# csv-target test, first batch (continued). `gen` and `arrays` come from the
# statements preceding this span. The fifth one-hot column is the "Z" symbol
# of the 5-letter vocabulary; positions 9-10 presumably hold the "ZZ"
# concat_seq -- TODO confirm.
expect_equivalent(arrays[[1]][1, 9, ], c(0, 0, 0, 0, 1))
expect_equivalent(arrays[[1]][1, 10, ], c(0, 0, 0, 0, 1))
expect_equivalent(arrays[[1]][4, 1, ], c(1, 0, 0, 0, 0))
expect_equivalent(arrays[[1]][4, 2, ], c(1, 0, 0, 0, 0))
expect_equivalent(arrays[[1]][4, 3, ], c(1, 0, 0, 0, 0))
expect_equivalent(arrays[[1]][4, 4, ], c(1, 0, 0, 0, 0))

# Target rows are either 1:4 or 11:14 (presumably one csv row per fasta file
# or header -- confirm against output_label.csv).
expect_equivalent(arrays[[2]][1, ], 1:4)
expect_equivalent(arrays[[2]][2, ], 1:4)
expect_equivalent(arrays[[2]][3, ], 1:4)
expect_equivalent(arrays[[2]][4, ], 11:14)
expect_equivalent(arrays[[2]][5, ], 11:14)

# Second batch from the same generator.
arrays <- gen()
expect_equivalent(arrays[[1]][1, 8, ], c(1, 0, 0, 0, 0))
expect_equivalent(arrays[[1]][1, 9, ], c(0, 0, 0, 0, 1))
expect_equivalent(arrays[[1]][1, 10, ], c(0, 0, 0, 0, 1))
expect_equivalent(arrays[[1]][2, 1, ], c(1, 0, 0, 0, 0))
expect_equivalent(arrays[[1]][2, 2, ], c(1, 0, 0, 0, 0))
expect_equivalent(arrays[[1]][2, 3, ], c(0, 1, 0, 0, 0))
expect_equivalent(arrays[[2]][1, ], 11:14)
expect_equivalent(arrays[[2]][2, ], 1:4)
expect_equivalent(arrays[[2]][3, ], 1:4)
expect_equivalent(arrays[[2]][4, ], 1:4)
expect_equivalent(arrays[[2]][5, ], 11:14)

## 2 added input files LM: "label.csv" is added as a plain input and
## "add_seq.csv" as a sequence input (add_input_as_seq = c(FALSE, TRUE)).
gen <- generator_fasta_lm(
  path_corpus = "fasta_3",
  format = "fasta",
  batch_size = 10,
  maxlen = 5,
  vocabulary = c("a", "c", "g", "t"),
  shuffle_file_order = FALSE,
  step = 4,
  shuffle_input = FALSE,
  reverse_complement = FALSE,
  output_format = "target_right",
  ambiguous_nuc = "zeros",
  added_label_path = c("label.csv", "add_seq.csv"),
  add_input_as_seq = c(FALSE, TRUE),
  padding = FALSE
)
# Expected added-input vectors (v1, v2) and 4 x 4 matrices (m1, m2).
v1 <- c(0, 0, 1, 0)
v2 <- c(1, 0, 0, 0)
m1 <- matrix(
  c(1, 0, 0, 0,
    0, 1, 0, 0,
    0, 0, 1, 0,
    0, 0, 0, 1),
  byrow = TRUE, ncol = 4
)
m2 <- matrix(
  c(0, 0, 0, 1,
    0, 0, 0, 0,
    0, 1, 0, 0,
    0, 0, 0, 0),
  byrow = TRUE, ncol = 4
)

# First batch: vector input for samples 1-5.
arrays <- gen()
expect_equivalent(arrays[[1]][[1]][1, ], v1)
expect_equivalent(arrays[[1]][[1]][2, ], v1)
expect_equivalent(arrays[[1]][[1]][3, ], v1)
expect_equivalent(arrays[[1]][[1]][4, ], v1)
expect_equivalent(arrays[[1]][[1]][5, ], v2)
expect_equivalent(arrays[[1]][[1]][6, ], v2) expect_equivalent(arrays[[1]][[1]][7, ], v2) expect_equivalent(arrays[[1]][[1]][8, ], v2) expect_equivalent(arrays[[1]][[1]][9, ], v2) expect_equivalent(arrays[[1]][[1]][10, ], v1) expect_equivalent(arrays[[1]][[2]][1, , ], m1) expect_equivalent(arrays[[1]][[2]][2, , ], m1) expect_equivalent(arrays[[1]][[2]][3, , ], m1) expect_equivalent(arrays[[1]][[2]][4, , ], m1) expect_equivalent(arrays[[1]][[2]][5, , ], m2) expect_equivalent(arrays[[1]][[2]][6, , ], m2) expect_equivalent(arrays[[1]][[2]][7, , ], m2) expect_equivalent(arrays[[1]][[2]][8, , ], m2) expect_equivalent(arrays[[1]][[2]][9, , ], m2) expect_equivalent(arrays[[1]][[2]][10, , ], m1) arrays <- gen() expect_equivalent(arrays[[1]][[1]][1, ], v1) expect_equivalent(arrays[[1]][[1]][2, ], v1) expect_equivalent(arrays[[1]][[1]][3, ], v1) expect_equivalent(arrays[[1]][[1]][4, ], v2) expect_equivalent(arrays[[1]][[1]][5, ], v2) expect_equivalent(arrays[[1]][[1]][6, ], v2) expect_equivalent(arrays[[1]][[1]][7, ], v2) expect_equivalent(arrays[[1]][[1]][8, ], v2) expect_equivalent(arrays[[1]][[1]][9, ], v1) expect_equivalent(arrays[[1]][[1]][10, ], v1) expect_equivalent(arrays[[1]][[2]][1, , ], m1) expect_equivalent(arrays[[1]][[2]][2, , ], m1) expect_equivalent(arrays[[1]][[2]][3, , ], m1) expect_equivalent(arrays[[1]][[2]][4, , ], m2) expect_equivalent(arrays[[1]][[2]][5, , ], m2) expect_equivalent(arrays[[1]][[2]][6, , ], m2) expect_equivalent(arrays[[1]][[2]][7, , ], m2) expect_equivalent(arrays[[1]][[2]][8, , ], m2) expect_equivalent(arrays[[1]][[2]][9, , ], m1) expect_equivalent(arrays[[1]][[2]][10, , ], m1) ## 2 added input files, label_folder dir <- c("label_folder/x", "label_folder/y", "label_folder/z") gen <- get_generator(path = dir, train_type = "label_folder", format = "fasta", batch_size = 15, maxlen = 4, step = 2, val = FALSE, padding = FALSE, added_label_path = c("label.csv", "add_seq.csv"), add_input_as_seq = c(FALSE, TRUE) ) x1 <- c(1, 0, 0, 0) x2 <- 
c(1, 0, 0, 1) y1 <- c(0, 1, 0, 0) y2 <- c(0, 1, 0, 1) z1 <- c(0, 0, 1, 0) z2 <- c(0, 0, 1, 1) mx1 <- matrix(c(1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0), byrow = TRUE, ncol = 4) mx2 <- matrix(c(1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0), byrow = TRUE, ncol = 4) my1 <- matrix(c(0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0), byrow = TRUE, ncol = 4) my2 <- matrix(c(0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0), byrow = TRUE, ncol = 4) mz1 <- matrix(c(0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0), byrow = TRUE, ncol = 4) mz2 <- matrix(c(0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1), byrow = TRUE, ncol = 4) arrays <- gen() expect_equivalent(arrays[[1]][[1]][1, ], x1) expect_equivalent(arrays[[1]][[1]][2, ], x1) expect_equivalent(arrays[[1]][[1]][3, ], x1) expect_equivalent(arrays[[1]][[1]][4, ], x1) expect_equivalent(arrays[[1]][[1]][5, ], x2) expect_equivalent(arrays[[1]][[1]][6, ], y1) expect_equivalent(arrays[[1]][[1]][7, ], y1) expect_equivalent(arrays[[1]][[1]][8, ], y2) expect_equivalent(arrays[[1]][[1]][9, ], y2) expect_equivalent(arrays[[1]][[1]][10, ], y1) expect_equivalent(arrays[[1]][[1]][11, ], z1) expect_equivalent(arrays[[1]][[1]][12, ], z1) expect_equivalent(arrays[[1]][[1]][13, ], z2) expect_equivalent(arrays[[1]][[1]][14, ], z2) expect_equivalent(arrays[[1]][[1]][15, ], z1) expect_equivalent(arrays[[1]][[2]][1, , ], mx1) expect_equivalent(arrays[[1]][[2]][2, , ], mx1) expect_equivalent(arrays[[1]][[2]][3, , ], mx1) expect_equivalent(arrays[[1]][[2]][4, , ], mx1) expect_equivalent(arrays[[1]][[2]][5, , ], mx2) expect_equivalent(arrays[[1]][[2]][6, , ], my1) expect_equivalent(arrays[[1]][[2]][7, , ], my1) expect_equivalent(arrays[[1]][[2]][8, , ], my2) expect_equivalent(arrays[[1]][[2]][9, , ], my2) expect_equivalent(arrays[[1]][[2]][10, , ], my1) expect_equivalent(arrays[[1]][[2]][11, , ], mz1) expect_equivalent(arrays[[1]][[2]][12, , ], mz1) expect_equivalent(arrays[[1]][[2]][13, , ], mz2) expect_equivalent(arrays[[1]][[2]][14, , ], 
mz2) expect_equivalent(arrays[[1]][[2]][15, , ], mz1) arrays <- gen() expect_equivalent(arrays[[1]][[1]][1, ], x2) expect_equivalent(arrays[[1]][[1]][2, ], x2) expect_equivalent(arrays[[1]][[1]][3, ], x1) expect_equivalent(arrays[[1]][[1]][4, ], x1) expect_equivalent(arrays[[1]][[1]][5, ], x1) expect_equivalent(arrays[[1]][[1]][6, ], y1) expect_equivalent(arrays[[1]][[1]][7, ], y2) expect_equivalent(arrays[[1]][[1]][8, ], y2) expect_equivalent(arrays[[1]][[1]][9, ], y1) expect_equivalent(arrays[[1]][[1]][10, ], y1) expect_equivalent(arrays[[1]][[1]][11, ], z1) expect_equivalent(arrays[[1]][[1]][12, ], z2) expect_equivalent(arrays[[1]][[1]][13, ], z2) expect_equivalent(arrays[[1]][[1]][14, ], z1) expect_equivalent(arrays[[1]][[1]][15, ], z1) expect_equivalent(arrays[[1]][[2]][1, , ], mx2) expect_equivalent(arrays[[1]][[2]][2, , ], mx2) expect_equivalent(arrays[[1]][[2]][3, , ], mx1) expect_equivalent(arrays[[1]][[2]][4, , ], mx1) expect_equivalent(arrays[[1]][[2]][5, , ], mx1) expect_equivalent(arrays[[1]][[2]][6, , ], my1) expect_equivalent(arrays[[1]][[2]][7, , ], my2) expect_equivalent(arrays[[1]][[2]][8, , ], my2) expect_equivalent(arrays[[1]][[2]][9, , ], my1) expect_equivalent(arrays[[1]][[2]][10, , ], my1) expect_equivalent(arrays[[1]][[2]][11, , ], mz1) expect_equivalent(arrays[[1]][[2]][12, , ], mz2) expect_equivalent(arrays[[1]][[2]][13, , ], mz2) expect_equivalent(arrays[[1]][[2]][14, , ], mz1) expect_equivalent(arrays[[1]][[2]][15, , ], mz1) # 3 targets, target right gen <- generator_fasta_lm(path_corpus = "fasta_3", batch_size = 5, maxlen = 4, step = 5, output_format = "target_right", padding = FALSE, target_len = 3) arrays <- gen() expect_equivalent(arrays[[1]][1, , ], matrix( c(1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0), byrow = TRUE, ncol = 4 )) expect_equivalent(arrays[[1]][5, , ], matrix( c(1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1), byrow = TRUE, ncol = 4 )) m1 <- matrix( c(0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0), 
byrow = TRUE, ncol = 4 ) m2 <- matrix( c(0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0), byrow = TRUE, ncol = 4) m3 <- matrix( c(0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0), byrow = TRUE, ncol = 4) expect_equivalent(arrays[[2]][ ,1 , ], m1) expect_equivalent(arrays[[2]][ ,2 , ], m2) expect_equivalent(arrays[[2]][ ,3 , ], m3) # 3 targets, target middle cnn gen <- generator_fasta_lm(path_corpus = "fasta_3", batch_size = 5, maxlen = 4, step = 5, output_format = "target_middle_cnn", padding = FALSE, target_len = 3) arrays <- gen() expect_equivalent(arrays[[1]][1, , ], matrix( c(1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0), byrow = TRUE, ncol = 4 )) expect_equivalent(arrays[[1]][5, , ], matrix( c(1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0), byrow = TRUE, ncol = 4 )) m1 <- matrix( c(0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0), byrow = TRUE, ncol = 4 ) m2 <- matrix( c(0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1), byrow = TRUE, ncol = 4) m3 <- matrix( c(0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0), byrow = TRUE, ncol = 4) expect_equivalent(arrays[[2]][ ,1 , ], m1) expect_equivalent(arrays[[2]][ ,2 , ], m2) expect_equivalent(arrays[[2]][ ,3 , ], m3) # 3 targets, target middle lstm gen <- generator_fasta_lm(path_corpus = "fasta_3", batch_size = 5, maxlen = 4, step = 5, output_format = "target_middle_lstm", padding = FALSE, target_len = 3) arrays <- gen() expect_equivalent(arrays[[1]][[1]][1, , ], matrix( c(1, 0, 0, 0, 1, 0, 0, 0), byrow = TRUE, ncol = 4 )) expect_equivalent(arrays[[1]][[2]][1, , ], matrix( c(0, 0, 1, 0, 0, 0, 0, 0), byrow = TRUE, ncol = 4 )) expect_equivalent(arrays[[1]][[1]][5, , ], matrix( c(1, 0, 0, 0, 0, 1, 0, 0), byrow = TRUE, ncol = 4 )) expect_equivalent(arrays[[1]][[2]][5, , ], matrix( c(0, 0, 1, 0, 0, 1, 0, 0), byrow = TRUE, ncol = 4 )) m1 <- matrix( c(0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0), byrow = TRUE, ncol = 4 ) m2 <- matrix( c(0, 1, 0, 0, 0, 
0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1), byrow = TRUE, ncol = 4) m3 <- matrix( c(0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0), byrow = TRUE, ncol = 4) expect_equivalent(arrays[[2]][ ,1 , ], m1) expect_equivalent(arrays[[2]][ ,2 , ], m2) expect_equivalent(arrays[[2]][ ,3 , ], m3) # coverage + set learning for label_folder directories <- c("coverage_data/x", "coverage_data/y") val <- FALSE batch_size <- 6 samples_per_target <- 3 #new_batch_size <- batch_size/samples_per_target path <- directories voc_len <- 4 maxlen <- 7 reshape_mode <- "time_dist" set_learning <- list(reshape_mode = reshape_mode, maxlen = maxlen, samples_per_target = samples_per_target) gen <- get_generator(path = directories, train_type = "label_folder", val = FALSE, padding = TRUE, format = "fasta", batch_size = batch_size, maxlen = maxlen, vocabulary = c("a", "c", "g", "t"), step = 4, use_coverage = 1, set_learning = set_learning) arrays <- gen() expect_equivalent(arrays[[1]][1, 1, , ], matrix( c(7,0,0,0, 7,0,0,0, 0,0,0,0, 0,7,0,0, 0,7,0,0, 0,0,0,0, 0,0,7,0), byrow = TRUE, ncol = 4 )) expect_equivalent(arrays[[1]][1, 3, , ], matrix( c(11,0,0,0, 11,0,0,0, 11,0,0,0, 11,0,0,0, 0,0,0,0, 0,0,0,11, 0,0,0,11), byrow = TRUE, ncol = 4 )) expect_equivalent(arrays[[1]][2, 1, , ], matrix( c(0,0,0,0, 0,0,1,0, 0,0,1,0, 0,0,1,0, 0,0,1,0, 0,0,0,1, 0,0,0,1), byrow = TRUE, ncol = 4 )) expect_equivalent(arrays[[1]][3, 1, , ], matrix( c(0,0,0,0, 17,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0), byrow = TRUE, ncol = 4 )) expect_equivalent(arrays[[1]][3, 2, , ], matrix( c(7,0,0,0, 7,0,0,0, 0,0,0,0, 0,7,0,0, 0,7,0,0, 0,0,0,0, 0,0,7,0), byrow = TRUE, ncol = 4 )) expect_equivalent(arrays[[1]][4, 1, , ], matrix( c(2,0,0,0, 0,2,0,0, 0,0,2,0, 0,0,0,2, 2,0,0,0, 2,0,0,0, 0,2,0,0), byrow = TRUE, ncol = 4 )) expect_equivalent(arrays[[1]][5, 3, , ], matrix( c(0,0,0,0, 17,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0), byrow = TRUE, ncol = 4 )) expect_equivalent(arrays[[1]][6, 3, , ], matrix( 
c(0,0,0,0, 0,0,1,0, 0,0,1,0, 0,0,1,0, 0,0,1,0, 0,0,0,1, 0,0,0,1), byrow = TRUE, ncol = 4 )) expect_equivalent(arrays[[2]], matrix( c(1,0, 1,0, 1,0, 0,1, 0,1, 0,1), byrow = TRUE, ncol = 2 )) arrays <- gen() expect_equivalent(arrays[[1]][1, 1, , ], matrix( c(11,0,0,0, 11,0,0,0, 11,0,0,0, 11,0,0,0, 0,0,0,0, 0,0,0,11, 0,0,0,11), byrow = TRUE, ncol = 4 )) expect_equivalent(arrays[[1]][4, 3, , ], matrix( c(0,0,0,0, 17,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0), byrow = TRUE, ncol = 4 )) expect_equivalent(arrays[[2]], matrix( c(1,0, 1,0, 1,0, 0,1, 0,1, 0,1), byrow = TRUE, ncol = 2 )) # coverage + set learning for label_folder + normalizing input tensor directories <- c("coverage_data/x", "coverage_data/y") val <- FALSE batch_size <- 6 samples_per_target <- 3 #new_batch_size <- batch_size/samples_per_target path <- directories voc_len <- 4 maxlen <- 7 use_coverage <- 17 reshape_mode <- "time_dist" set_learning <- list(reshape_mode = reshape_mode, maxlen = maxlen, samples_per_target = samples_per_target) gen <- get_generator(path = directories, train_type = "label_folder", val = FALSE, padding = TRUE, format = "fasta", batch_size = batch_size, maxlen = maxlen, vocabulary = c("a", "c", "g", "t"), step = 4, use_coverage = use_coverage, set_learning = set_learning) arrays <- gen() expect_equivalent(arrays[[1]][1, 1, , ], matrix( c(7,0,0,0, 7,0,0,0, 0,0,0,0, 0,7,0,0, 0,7,0,0, 0,0,0,0, 0,0,7,0), byrow = TRUE, ncol = 4 )/use_coverage) expect_equivalent(arrays[[1]][1, 3, , ], matrix( c(11,0,0,0, 11,0,0,0, 11,0,0,0, 11,0,0,0, 0,0,0,0, 0,0,0,11, 0,0,0,11), byrow = TRUE, ncol = 4 )/use_coverage) expect_equivalent(arrays[[1]][2, 1, , ], matrix( c(0,0,0,0, 0,0,1,0, 0,0,1,0, 0,0,1,0, 0,0,1,0, 0,0,0,1, 0,0,0,1), byrow = TRUE, ncol = 4 )/use_coverage) expect_equivalent(arrays[[1]][3, 1, , ], matrix( c(0,0,0,0, 17,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0), byrow = TRUE, ncol = 4 )/use_coverage) expect_equivalent(arrays[[1]][3, 2, , ], matrix( c(7,0,0,0, 7,0,0,0, 0,0,0,0, 
0,7,0,0, 0,7,0,0, 0,0,0,0, 0,0,7,0), byrow = TRUE, ncol = 4 )/use_coverage) expect_equivalent(arrays[[1]][4, 1, , ], matrix( c(2,0,0,0, 0,2,0,0, 0,0,2,0, 0,0,0,2, 2,0,0,0, 2,0,0,0, 0,2,0,0), byrow = TRUE, ncol = 4 )/use_coverage) expect_equivalent(arrays[[1]][5, 3, , ], matrix( c(0,0,0,0, 17,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0), byrow = TRUE, ncol = 4 )/use_coverage) expect_equivalent(arrays[[1]][6, 3, , ], matrix( c(0,0,0,0, 0,0,1,0, 0,0,1,0, 0,0,1,0, 0,0,1,0, 0,0,0,1, 0,0,0,1), byrow = TRUE, ncol = 4 )/use_coverage) expect_equivalent(arrays[[2]], matrix( c(1,0, 1,0, 1,0, 0,1, 0,1, 0,1), byrow = TRUE, ncol = 2 )) arrays <- gen() expect_equivalent(arrays[[1]][1, 1, , ], matrix( c(11,0,0,0, 11,0,0,0, 11,0,0,0, 11,0,0,0, 0,0,0,0, 0,0,0,11, 0,0,0,11), byrow = TRUE, ncol = 4 )/use_coverage) expect_equivalent(arrays[[1]][4, 3, , ], matrix( c(0,0,0,0, 17,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0), byrow = TRUE, ncol = 4 )/use_coverage) expect_equivalent(arrays[[2]], matrix( c(1,0, 1,0, 1,0, 0,1, 0,1, 0,1), byrow = TRUE, ncol = 2 )) # rds label generator gen <- generator_rds(rds_folder = "rds", batch_size = 1) l_x <- list() l_y <- list() for (i in 1:40) { z <- gen() l_x[[i]] <- z[[1]][1,1,1] l_y[[i]] <- which.max(z[[2]]) } expect_equivalent(sort(unlist(l_x)), rep(1:20, each=2)) expect_equivalent(sort(unlist(l_y)), rep(1:20, each=2)) gen <- generator_rds(rds_folder = "rds", batch_size = 10) l_x <- list() l_y <- list() for (i in 1:4) { z <- gen() l_x[[i]] <- z[[1]][,1,1] l_y[[i]] <- apply(z[[2]], 1, which.max) } expect_equivalent(sort(unlist(l_x)), rep(1:20, each = 2)) expect_equivalent(sort(unlist(l_y)), rep(1:20, each=2)) # rds lm generator target_len <- 3 batch_size <- 1 gen <- generator_rds(rds_folder = "rds_lm", batch_size = batch_size, target_len = target_len) for (one_iter in 1:3) { first_input <- 1 + (100*(0:4)) for (i in 1:5) { z <- gen() expect_equivalent(dim(z[[1]]), c(batch_size, 7 - target_len, 4)) l_x <- z[[1]][1,1,1] first_input <- 
setdiff(first_input, l_x) l_y <- NULL for (j in 1:target_len) { l_y[[j]] <- z[[2]][[j]][1,1] } expect_equivalent(l_y, l_x + 3 + (1:target_len)) } expect_equivalent(length(first_input), 0) } batch_size <- 5 gen <- generator_rds(rds_folder = "rds_lm", batch_size = batch_size, target_len = target_len) for (one_iter in 1:3) { first_input <- 1 + (100*(0:4)) z <- gen() expect_equivalent(dim(z[[1]]), c(batch_size, 7 - target_len, 4)) l_x <- z[[1]][ , 1, 1] first_input <- setdiff(first_input, l_x) l_y <- NULL for (j in 1:target_len) { l_y[[j]] <- z[[2]][[j]][,1] } expect_equivalent(sort(l_y[[1]]), 5 + (100*(0:4))) expect_equivalent(sort(l_y[[2]]), 6 + (100*(0:4))) expect_equivalent(sort(l_y[[3]]), 7 + (100*(0:4))) expect_equivalent(length(first_input), 0) } # n-gram rds n_gram <- 3 gen <- generator_rds(rds_folder = "n_gram_rds", batch_size = 1, target_len = 6, n_gram = n_gram, n_gram_stride = n_gram) arrays <- gen() y <- arrays[[2]] y_1_n_gram <- apply(y[[1]], 1, which.max) y_2_n_gram <- apply(y[[2]], 1, which.max) int_seq <- c(1,2,0) expect_equivalent(y_1_n_gram[1], 1 + sum(4^((n_gram-1):0) * (int_seq))) # cga int_seq <- c(0,0,1) expect_equivalent(y_2_n_gram[1], 1 + sum(4^((n_gram-1):0) * (int_seq))) # aac # set learning concat with coverage encoding directories <- c("coverage_data/x", "coverage_data/y") val <- FALSE batch_size <- 8 samples_per_target <- 3 #new_batch_size <- batch_size/samples_per_target path <- directories voc_len <- 4 maxlen <- 6 use_coverage <- 17 reshape_mode <- "concat" set_learning <- list(reshape_mode = reshape_mode, maxlen = maxlen, buffer_len = NULL, samples_per_target = samples_per_target) buffer_size <- 0 concat_maxlen <- (maxlen * samples_per_target) + (buffer_size * (samples_per_target - 1)) gen <- get_generator(path = directories, train_type = "label_folder", val = FALSE, padding = TRUE, format = "fasta", batch_size = batch_size, maxlen = maxlen, vocabulary = c("a", "c", "g", "t"), step = maxlen, use_coverage = use_coverage, set_learning = 
set_learning) m <- matrix( c(0,0,0,0, 0,0,1/17,0, 0,0,1/17,0, 0,0,1/17,0, 0,0,1/17,0, 0,0,0,1/17, 13/17,0,0,0, 0,13/17,0,0, 0,0,13/17,0, 0,0,0,13/17, 13/17,0,0,0, 0,13/17,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 1,0,0,0), byrow = TRUE, ncol = 4 ) m2 <- matrix( c(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 1,0,0,0, 2/17,0,0,0, 0,2/17,0,0, 0,0,2/17,0, 0,0,0,2/17, 2/17,0,0,0, 2/17,0,0,0, 0,0,3/17,0, 0,0,3/17,0, 0,0,3/17,0, 0,0,3/17,0, 0,0,0,3/17, 0,0,0,3/17), byrow = TRUE, ncol = 4 ) y <- matrix(c(1,0,1,0,1,0,1,0,0,1,0,1,0,1,0,1), ncol = 2, byrow = TRUE) arrays <- gen() expect_true(all(arrays[[1]][1,,] == arrays[[1]][3,,])) expect_true(all(arrays[[1]][2,,] == arrays[[1]][4,,])) expect_equivalent(arrays[[2]], y) expect_equivalent(arrays[[1]][4, , ], m) expect_equivalent(arrays[[1]][8, , ], m2) arrays <- gen() expect_true(all(arrays[[1]][1,,] == arrays[[1]][3,,])) expect_true(all(arrays[[1]][2,,] == arrays[[1]][4,,])) expect_equivalent(arrays[[2]], y) expect_equivalent(arrays[[1]][4, , ], m) arrays <- gen() expect_true(all(arrays[[1]][1,,] == arrays[[1]][3,,])) expect_true(all(arrays[[1]][2,,] == arrays[[1]][4,,])) expect_equivalent(arrays[[2]], y) expect_equivalent(arrays[[1]][4, , ], m) expect_equivalent(arrays[[1]][5, , ], m2) # rds generator with multi inputs/outputs x1 <- array(0, dim = c(9,5,4)) x2 <- array(0, dim = c(9,5,3)) y1 <- array(0, dim = c(9,2)) y2 <- array(0, dim = c(9,6)) for (i in 1:dim(x1)[1]) { x1[i,,] <- i y1[i, ] <- i x2[i,,] <- i + 10 y2[i, ] <- i + 10 } index_1 <- 1:5 index_2 <- 6:9 x_list_1 <- list(x1[index_1, , ], x2[index_1, , ]) x_list_2 <- list(x1[index_2, , ], x2[index_2, , ]) y_list_1 <- list(y1[index_1, ], y2[index_1, ]) y_list_2 <- list(y1[index_2, ], y2[index_2, ]) z1 <- list(x = x_list_1, y = y_list_1) z2 <- list(x = x_list_2, y = y_list_2) temp_dir <- tempfile() dir.create(temp_dir) saveRDS(z1, paste0(temp_dir, "/file_1.rds")) saveRDS(z2, paste0(temp_dir, "/file_2.rds")) gen <- generator_rds(rds_folder = temp_dir, batch_size 
= 10, path_file_log = NULL, max_samples = NULL, proportion_per_seq = NULL, target_len = NULL, seed = 1, reverse_complement = FALSE, sample_by_file_size = FALSE, n_gram = NULL, n_gram_stride = 1, reverse_complement_encoding = FALSE, add_noise = NULL) for (k in 1:5) { z <- gen() x1 <- z[[1]][[1]] %>% as.array() x2 <- z[[1]][[2]] %>% as.array() y1 <- z[[2]][[1]] %>% as.array() y2 <- z[[2]][[2]] %>% as.array() for (i in 1:dim(x1)[1]) { expect_equivalent(min(x1[i,,]), max(y1[i,])) expect_equivalent(min(x1[i,,]) + 10, max(x2[i,,])) expect_equivalent(max(x2[i,,]), min(y2[i,])) expect_equivalent(max(y1[i,]) + 10, min(y2[i,])) } } # integer encoding label header # testpath <- file.path("fasta_2") gen <- generator_fasta_label_header_csv(path_corpus = testpath, batch_size = 5, maxlen = 3, step = 2, vocabulary = c("a", "c", "g", "t"), reverse_complement = FALSE, vocabulary_label = c("w", "x", "y"), return_int = TRUE) arrays <- gen() expect_equivalent(arrays[[1]][1, 1], 1) # A expect_equivalent(arrays[[1]][1, 2], 1) # A expect_equivalent(arrays[[1]][1, 3], 2) # C expect_equivalent(arrays[[2]][1, ], c(1, 0, 0)) # W expect_equivalent(arrays[[1]][5, 1], 1) # A expect_equivalent(arrays[[1]][5, 2], 1) # A expect_equivalent(arrays[[1]][5, 3], 4) # T expect_equivalent(arrays[[2]][5, ], c(0, 1, 0)) # W gen <- generator_fasta_label_header_csv(path_corpus = testpath, batch_size = 5, maxlen = 8, step = 2, vocabulary = c("a", "c", "g", "t"), reverse_complement = FALSE, vocabulary_label = c("w", "x", "y"), return_int = TRUE) arrays <- gen() expect_equivalent(arrays[[2]][1, ], c(1, 0, 0)) expect_equivalent(arrays[[2]][2, ], c(0, 1, 0)) expect_equivalent(arrays[[2]][3, ], c(0, 0, 1)) expect_equivalent(arrays[[2]][4, ], c(0, 1, 0)) expect_equivalent(arrays[[2]][5, ], c(0, 1, 0)) arrays <- gen() expect_equivalent(arrays[[1]][5, 1], 3) expect_equivalent(arrays[[1]][5, 2], 3) expect_equivalent(arrays[[1]][5, 3], 3) expect_equivalent(arrays[[1]][5, 4], 3) expect_equivalent(arrays[[1]][5, 5], 4) 
# Continue checking the batch currently held in `arrays`:
# positions 6 to 8 of sample 5 must all carry the integer code 4.
for (pos in 6:8) {
  expect_equivalent(arrays[[1]][5, pos], 4)
}

# One-hot labels of the same batch, one row per sample.
expected_labels <- rbind(
  c(0, 0, 1),
  c(0, 0, 1),
  c(1, 0, 0),
  c(0, 1, 0),
  c(0, 0, 1)
)
for (row_idx in seq_len(nrow(expected_labels))) {
  expect_equivalent(arrays[[2]][row_idx, ], expected_labels[row_idx, ])
}

# A batch of 8 with maxlen 7 goes through a/b.fasta once;
# samples with target z are discarded.
gen <- generator_fasta_label_header_csv(
  path_corpus = testpath,
  batch_size = 8,
  maxlen = 7,
  step = 2,
  vocabulary = c("a", "c", "g", "t"),
  reverse_complement = FALSE,
  vocabulary_label = c("w", "x", "y"),
  return_int = TRUE
)
arrays <- gen()

last_sample_codes <- c(1, 1, 2) # A, A, C
for (pos in seq_along(last_sample_codes)) {
  expect_equivalent(arrays[[1]][8, pos], last_sample_codes[pos])
}
expect_equivalent(arrays[[2]][8, ], c(1, 0, 0)) # W

# label folder with integer encoding
directories <- c("label_folder/x", "label_folder/y", "label_folder/z")
gen <- get_generator(
  path = directories,
  train_type = "label_folder",
  val = FALSE,
  padding = TRUE,
  format = "fasta",
  batch_size = 6,
  maxlen = 2,
  return_int = TRUE,
  vocabulary = c("a", "c", "g", "t"),
  step = 2
)
arrays <- gen()

# Expected integer codes: one row per sample, one column per position.
expected_codes <- rbind(
  c(1, 2),
  c(1, 2),
  c(3, 2),
  c(3, 2),
  c(4, 4),
  c(4, 4)
)
for (row_idx in seq_len(nrow(expected_codes))) {
  for (col_idx in seq_len(ncol(expected_codes))) {
    expect_equivalent(
      arrays[[1]][row_idx, col_idx],
      expected_codes[row_idx, col_idx]
    )
  }
}

# Rows 1-2 belong to the first class, rows 3-4 to the second,
# rows 5-6 to the third.
expected_labels <- rbind(
  c(1, 0, 0),
  c(1, 0, 0),
  c(0, 1, 0),
  c(0, 1, 0),
  c(0, 0, 1),
  c(0, 0, 1)
)
for (row_idx in seq_len(nrow(expected_labels))) {
  expect_equivalent(arrays[[2]][row_idx, ], expected_labels[row_idx, ])
}

# test skipping file: draw two more batches, only the last one is kept
for (i in seq_len(2)) {
  arrays <- gen()
}
# Checks on the batch currently held in `arrays` (drawn by the preceding
# statements): integer codes, one row per sample, one column per position.
expected_codes <- rbind(
  c(1, 2),
  c(1, 3),
  c(2, 3),
  c(2, 3),
  c(1, 1),
  c(1, 1)
)
for (row_idx in seq_len(nrow(expected_codes))) {
  for (col_idx in seq_len(ncol(expected_codes))) {
    expect_equivalent(
      arrays[[1]][row_idx, col_idx],
      expected_codes[row_idx, col_idx]
    )
  }
}

# One-hot labels: rows 1-2 first class, rows 3-4 second, rows 5-6 third.
expected_labels <- rbind(
  c(1, 0, 0),
  c(1, 0, 0),
  c(0, 1, 0),
  c(0, 1, 0),
  c(0, 0, 1),
  c(0, 0, 1)
)
for (row_idx in seq_len(nrow(expected_labels))) {
  expect_equivalent(arrays[[2]][row_idx, ], expected_labels[row_idx, ])
}

# n-gram integer encoding, label folder
# directories <- c("label_folder/x", "label_folder/y", "label_folder/z")
gen <- get_generator(
  path = directories,
  train_type = "label_folder",
  batch_size = 6,
  maxlen = 12,
  padding = TRUE,
  n_gram = 3,
  n_gram_stride = 2,
  return_int = TRUE,
  vocabulary = c("a", "c", "g", "t"),
  step = 2
)
arrays <- gen()
x <- arrays[[1]]
y <- arrays[[2]]

expect_equivalent(dim(x), c(6, 5))
# sample 1: padding first, then ACA
expect_equivalent(x[1, 1], 0)
expect_equivalent(x[1, 2], 5)
# sample 5: four padded positions, then TTT = 4^3
expect_equivalent(unique(x[5, 1:4]), 0)
expect_equivalent(x[5, 5], 64)

# n-gram one-hot encoding, label folder
# directories <- c("label_folder/x", "label_folder/y", "label_folder/z")
gen <- get_generator(
  path = directories,
  train_type = "label_folder",
  batch_size = 6,
  maxlen = 12,
  padding = TRUE,
  n_gram = 3,
  n_gram_stride = 2,
  return_int = FALSE,
  vocabulary = c("a", "c", "g", "t"),
  step = 2
)
arrays <- gen()
x <- arrays[[1]]
y <- arrays[[2]]

expect_equivalent(dim(x), c(6, 5, 64))
# sample 1: padding first, then ACA
expect_equivalent(unique(x[1, 1, ]), 0)
expect_equivalent(which.max(x[1, 2, ]), 5)
# sample 5: four padded positions, then TTT = 4^3
expect_equivalent(unique(as.vector(x[5, 1:4, ])), 0)
expect_equivalent(which.max(x[5, 5, ]), 64)

##### masked lm #####
# Masked language model generator on a fastq file, integer encoded.
testpath <- file.path("a.fastq")
masked_lm <- list(
  mask_rate = 0.25,
  random_rate = 0.25,
  identity_rate = 0.25,
  include_sw = TRUE
)
gen <- get_generator(
  path = testpath,
  train_type = "masked_lm",
  masked_lm = masked_lm,
  batch_size = 1,
  maxlen = 200,
  format = "fastq",
  padding = TRUE,
  return_int = TRUE
)
z <- gen()
x <- z[[1]]
y <- z[[2]]
sw <- z[[3]]

padded_cols <- 1:12
expect_equivalent(x[1, padded_cols], rep(0, 12)) # padding
# no sample weights in padding region
expect_equivalent(sw[1, padded_cols], rep(0, 12))

weighted_idx <- which(sw[1, ] == 1)
random_idx <- which(x[1, ] %in% c(2, 3, 4))
masked_idx <- which(x[1, ] == 5)
# masked and random positions must have sw 1
expect_contains(weighted_idx, random_idx)
expect_contains(weighted_idx, masked_idx)

###
# Same masking configuration (`masked_lm` is unchanged), now on a fasta file
# with unshuffled input and three samples per batch.
testpath <- file.path("fasta_2/b.fasta")
gen <- get_generator(
  path = testpath,
  train_type = "masked_lm",
  shuffle_input = FALSE,
  masked_lm = masked_lm,
  batch_size = 3,
  maxlen = 10,
  padding = TRUE,
  return_int = TRUE
)
z <- gen()
x <- z[[1]]
y <- z[[2]]
sw <- z[[3]]

expect_equivalent(sum(x[, 1:2]), 0) # padding
expect_equivalent(sum(sw[, 1:2]), 0) # no sample weights in padding region
for (sample_idx in seq_len(3)) {
  weighted_idx <- which(sw[sample_idx, ] == 1)
  masked_idx <- which(x[sample_idx, ] == 5)
  # masked positions must have sw 1
  expect_contains(weighted_idx, masked_idx)
}

#### test reshape ####

# `reshape_xy` with only an `x` entry whose function takes a single argument:
# creating the generator is expected to raise an error.
# NOTE(review): presumably rejected by argument validation - confirm in
# get_generator.
directories <- c("fasta_2", "fasta_3")
fx <- function(x) x
reshape_xy <- list(x = fx)
expect_error(
  gen <- get_generator(
    path = directories,
    reshape_xy = reshape_xy,
    train_type = "label_folder",
    batch_size = 4,
    maxlen = 3
  )
)

# Valid reshape functions: the x output is the input plus one, the y output
# is the unmodified input (`directories` is unchanged from above).
fx <- function(x = NULL, y = NULL) {
  x + 1
}
fy <- function(x = NULL, y = NULL) {
  x
}
reshape_xy <- list(x = fx, y = fy)
gen <- get_generator(
  path = directories,
  reshape_xy = reshape_xy,
  val = FALSE,
  train_type = "label_folder",
  format = "fasta",
  batch_size = 4,
  maxlen = 3,
  vocabulary = c("a", "c", "g", "t"),
  reverse_complement = FALSE,
  ambiguous_nuc = "zero",
  step = 2
)
arrays <- gen()
arrays[[1]][1, , ] # result is not used
y <- arrays[[2]]

# One-hot rows (one per position) expected for samples 1 and 4 before the
# reshape functions are applied.
expected_one_hot <- list(
  "1" = rbind(c(1, 0, 0, 0), c(1, 0, 0, 0), c(0, 1, 0, 0)),
  "4" = rbind(rep(0, 4), c(0, 1, 0, 0), c(0, 1, 0, 0))
)
for (sample_idx in c(1, 4)) {
  sample_rows <- expected_one_hot[[as.character(sample_idx)]]
  for (pos in seq_len(3)) {
    expect_equivalent(arrays[[1]][sample_idx, pos, ], sample_rows[pos, ] + 1)
  }
  for (pos in seq_len(3)) {
    expect_equivalent(arrays[[2]][sample_idx, pos, ], sample_rows[pos, ])
  }
}

# Header/csv generator with reshape functions that swap the roles:
# the x output is the target plus three, the y output is the input plus two.
testpath <- file.path("fasta_2")
label_from_csv <- "output_label.csv"
fx <- function(x = NULL, y = NULL) {
  y + 3
}
fy <- function(x = NULL, y = NULL) {
  x + 2
}
reshape_xy <- list(x = fx, y = fy)
gen <- generator_fasta_label_header_csv(
  path_corpus = testpath,
  batch_size = 5,
  reshape_xy = reshape_xy,
  maxlen = 10,
  step = 10,
  vocabulary = c("a", "c", "g", "t", "Z"),
  reverse_complement = FALSE,
  vocabulary_label = c("w", "x", "y"),
  shuffle_file_order = FALSE,
  seed = 1234,
  shuffle_input = FALSE,
  padding = TRUE,
  concat_seq = "ZZ",
  target_from_csv = label_from_csv
)
arrays <- gen()

# One-hot vector over the five vocabulary entries, shifted by the +2 of `fy`.
shifted_one_hot <- function(code) {
  as.numeric(seq_len(5) == code) + 2
}

# Columns: sample, position, expected vocabulary index.
first_batch_checks <- rbind(
  c(1, 8, 4),
  c(1, 9, 5),
  c(1, 10, 5),
  c(4, 3, 1),
  c(4, 4, 1)
)
for (check_idx in seq_len(nrow(first_batch_checks))) {
  check <- first_batch_checks[check_idx, ]
  expect_equivalent(
    arrays[[2]][check[1], check[2], ],
    shifted_one_hot(check[3])
  )
}

for (row_idx in 1:3) {
  expect_equivalent(arrays[[1]][row_idx, ], 1:4 + 3)
}
for (row_idx in 4:5) {
  expect_equivalent(arrays[[1]][row_idx, ], 11:14 + 3)
}

arrays <- gen()
expect_equivalent(arrays[[2]][1, 8, ], shifted_one_hot(1))
expect_equivalent(arrays[[2]][2, 3, ], shifted_one_hot(2))
# Remaining checks on the batch currently held in `arrays`.
for (row_idx in c(1, 5)) {
  expect_equivalent(arrays[[1]][row_idx, ], 11:14 + 3)
}

# set learning
directories <- c("fasta_2", "fasta_3")
maxlen <- 3
samples_per_target <- 3
set_learning <- list(
  reshape_mode = "time_dist",
  maxlen = maxlen,
  samples_per_target = samples_per_target
)
gen <- get_generator(
  val = FALSE,
  set_learning = set_learning,
  train_type = "label_folder",
  path = directories,
  format = "fasta",
  batch_size = 2,
  maxlen = maxlen,
  ambiguous_nuc = "discard",
  vocabulary = c("a", "c", "g", "t"),
  step = 2
)
arrays <- gen()

# add axis to previous test: x is indexed as [sample, set, position, ]

# One-hot vector over the four vocabulary entries (1 = a, 2 = c, 3 = g, 4 = t).
one_hot_nt <- function(code) {
  as.numeric(seq_len(4) == code)
}

# Expected vocabulary index per sample (list element), set (row) and
# position (column).
expected_codes <- list(
  rbind(c(1, 1, 2), c(2, 2, 3), c(3, 3, 4)),
  rbind(c(4, 3, 1), c(1, 1, 1), c(4, 4, 4))
)
for (sample_idx in seq_along(expected_codes)) {
  for (set_idx in seq_len(3)) {
    for (pos in seq_len(3)) {
      expect_equivalent(
        arrays[[1]][sample_idx, set_idx, pos, ],
        one_hot_nt(expected_codes[[sample_idx]][set_idx, pos])
      )
    }
    if (sample_idx == 1 && set_idx == 1) {
      # label of the first sample, checked right after its first set
      expect_equivalent(arrays[[2]][1, ], c(1, 0))
    }
  }
}

expect_equivalent(arrays[[2]][1, ], c(1, 0))
expect_equivalent(arrays[[2]][2, ], c(0, 1))
})