#---------------------------------------------------------------------------------------
# data
#
# Input fixtures are looked up in the current working directory; everything the
# tests write goes below 'save_model_vecs'.
#
# NOTE(review): the tests in this file are order-dependent. The training tests
# create 'word_vectors.bin', which the later predict / test / quantize / print-* /
# nn / analogies / dump tests load, and the training tests expect the output
# folder to contain exactly the four files listed in 'expected_model_files'.

path_read <- file.path(getwd(), "example_text.txt")
default_write_path <- file.path(getwd(), "save_model_vecs")
path_write_vecs <- file.path(default_write_path, "word_vectors")
path_write_logs <- file.path(default_write_path, "model_logs.txt")
path_supervised <- file.path(getwd(), "cooking_supervised.txt")
path_lang_identify <- file.path(getwd(), "declaration_human_rights_english.txt")
pre_train_ftz <- system.file("language_identification/lid.176.ftz", package = "fastText")

# paths that several tests share
path_model_bin <- file.path(default_write_path, "word_vectors.bin")
path_valid_data <- file.path(getwd(), "cooking_valid.txt")
path_scratch_out <- file.path(default_write_path, "preds_valid.txt")

# the only files that may be present in the output folder after a training run
expected_model_files <- c(
  "DONT_DELETE_THIS_FILE.txt",
  "model_logs.txt",
  "word_vectors.bin",
  "word_vectors.vec"
)

#---------------------------------------------------------------------------------------
# helpers

# Read a whitespace-separated output file with the same 'read.table' settings in
# every test ('#' is not a comment character, '"' is the only quote character).
# Additional arguments are forwarded to 'utils::read.table'.
read_output <- function(path, ...) {
  utils::read.table(path, quote = "\"", comment.char = "", ...)
}

# Expect that the output folder holds exactly four files and that all of them
# are among 'expected_model_files'.
expect_only_model_files <- function() {
  out <- list.files(default_write_path, full.names = FALSE)
  testthat::expect_true(length(out) == 4 && all(out %in% expected_model_files))
}

#---------------------------------------------------------------------------------------

context('tests for all functions')


#=========================
# print usage of functions
#=========================

testthat::test_that("it prints information for the 'printDumpUsage' function", {
  testthat::expect_output(printDumpUsage())
})

testthat::test_that("it prints information for the 'printNNUsage' function", {
  testthat::expect_output(printNNUsage())
})

testthat::test_that("it prints information for the 'printPredictUsage' function", {
  testthat::expect_output(printPredictUsage())
})

testthat::test_that("it prints information for the 'printPrintNgramsUsage' function", {
  testthat::expect_output(printPrintNgramsUsage())
})

testthat::test_that("it prints information for the 'printPrintSentenceVectorsUsage' function", {
  testthat::expect_output(printPrintSentenceVectorsUsage())
})

testthat::test_that("it prints information for the 'printPrintWordVectorsUsage' function", {
  testthat::expect_output(printPrintWordVectorsUsage())
})

testthat::test_that("it prints information for the 'printQuantizeUsage' function", {
  testthat::expect_output(printQuantizeUsage())
})

testthat::test_that("it prints information for the 'printTestLabelUsage' function", {
  testthat::expect_output(printTestLabelUsage())
})

testthat::test_that("it prints information for the 'printTestUsage' function", {
  testthat::expect_output(printTestUsage())
})

testthat::test_that("it prints information for the 'printUsage' function", {
  testthat::expect_output(printUsage())
})

testthat::test_that("it prints information about the parameters of a specified command", {
  testthat::expect_output(print_parameters(command = "supervised"))
})


#==============================
# 'fasttext_interface' function     [ 'expect_true' and 'expect_output' ]
#==============================

testthat::test_that("the 'fasttext_interface' function writes output to folder when using the 'cbow' command", {

  list_params <- list(
    command = "cbow",
    lr = 0.1,
    dim = 5,
    input = path_read,
    output = path_write_vecs,
    verbose = 2,
    thread = 1
  )

  fasttext_interface(list_params, path_output = path_write_logs, MilliSecs = 100)

  expect_only_model_files()
})


testthat::test_that("the 'fasttext_interface' function writes output to folder when using the 'skipgram' command", {

  list_params <- list(
    command = "skipgram",
    lr = 0.1,
    dim = 5,
    input = path_read,
    output = path_write_vecs,
    verbose = 2,
    thread = 1
  )

  fasttext_interface(list_params, path_output = path_write_logs, MilliSecs = 100)

  expect_only_model_files()
})


testthat::test_that("the 'fasttext_interface' function writes output to folder when using the 'supervised' command", {

  list_params <- list(
    command = "supervised",
    lr = 0.1,
    dim = 5,
    input = path_supervised,
    output = path_write_vecs,
    verbose = 2,
    thread = 1
  )

  fasttext_interface(list_params, path_output = path_write_logs, MilliSecs = 100)

  expect_only_model_files()
})


testthat::test_that("the 'fasttext_interface' function writes output to folder when using the 'predict' and 'predict-prob' command", {

  # 'predict' : the output file must exist and hold a single column
  list_params <- list(
    command = "predict",
    model = path_model_bin,
    test_data = path_valid_data,
    k = 1,
    th = 0.0
  )
  fasttext_interface(list_params, path_output = path_scratch_out)

  out_preds <- "preds_valid.txt" %in% list.files(default_write_path, full.names = FALSE)
  ncol_valid <- ncol(read_output(path_scratch_out)) == 1          # single column output

  # 'predict-prob' : same output file, now with 2 columns
  list_params <- list(
    command = "predict-prob",
    model = path_model_bin,
    test_data = path_valid_data,
    k = 1,
    th = 0.0
  )
  fasttext_interface(list_params, path_output = path_scratch_out)

  out_preds_prob <- "preds_valid.txt" %in% list.files(default_write_path, full.names = FALSE)
  ncol_valid_prob <- ncol(read_output(path_scratch_out)) == 2     # 2-column output (probabilities, too)

  testthat::expect_true(
    all(c(out_preds, out_preds_prob)) && all(ncol_valid, ncol_valid_prob)
  )
})


testthat::test_that("the 'fasttext_interface' function writes output to folder when using the 'test-label' command", {

  list_params <- list(
    command = "test-label",
    model = path_model_bin,
    test_data = path_valid_data,
    k = 5,
    th = 0.0
  )
  fasttext_interface(list_params, path_output = path_scratch_out)

  out_preds_prob <- "preds_valid.txt" %in% list.files(default_write_path, full.names = FALSE)
  ncol_valid_prob <- ncol(read_output(path_scratch_out)) == 10    # 10-column output (precision & recall, too)

  # the existence flag was previously computed but never checked
  testthat::expect_true(out_preds_prob)
  testthat::expect_true(ncol_valid_prob)
})


testthat::test_that("the 'fasttext_interface' function prints information to the R session (precision, recall) when using the 'test' command", {

  list_params <- list(
    command = "test",
    model = path_model_bin,
    test_data = path_valid_data,
    k = 5,
    th = 0.0
  )

  # it prints precision, recall to the R session (only)
  testthat::expect_output(fasttext_interface(list_params))
})


testthat::test_that("the 'fasttext_interface' function will create an .ftz file when using the 'quantize' command", {

  pth_in_bin <- path_model_bin
  pth_out_ftz <- file.path(default_write_path, "word_vectors.ftz")

  list_params <- list(command = "quantize", input = pth_in_bin, output = pth_out_ftz)
  fasttext_interface(list_params)

  ftz_exists <- file.exists(pth_out_ftz)
  ftz_smaller_size_than_bin <- file.size(pth_in_bin) > file.size(pth_out_ftz)

  # '&&' short-circuits, so a missing file (size NA) is reported as a plain FALSE
  testthat::expect_true(ftz_exists && ftz_smaller_size_than_bin)
})


testthat::test_that("the 'fasttext_interface' function writes output to folder when using the 'print-word-vectors' command", {

  list_params <- list(command = "print-word-vectors", model = path_model_bin)
  out_data <- path_scratch_out

  fasttext_interface(
    list_params,
    path_input = file.path(getwd(), "queries.txt"),
    path_output = out_data
  )

  read_word_vecs <- read_output(out_data)

  testthat::expect_true(nrow(read_word_vecs) == 5 && ncol(read_word_vecs) == 6)
})


testthat::test_that("the 'fasttext_interface' function writes output to folder when using the 'print-sentence-vectors' command", {

  list_params <- list(command = "print-sentence-vectors", model = path_model_bin)
  out_data <- path_scratch_out

  fasttext_interface(
    list_params,
    path_input = file.path(getwd(), "text_sentence.txt"),
    path_output = out_data
  )

  read_word_vecs <- read_output(out_data)

  # the 3rd and 4th rows must give the same output because they are the same sentences
  testthat::expect_true(
    nrow(read_word_vecs) == 5 &&
      ncol(read_word_vecs) == 5 &&
      all(read_word_vecs[3, ] == read_word_vecs[4, ])
  )
})


testthat::test_that("the 'fasttext_interface' function writes output to folder when using the 'print-ngrams' command", {

  # re-train the skipgram model with minn = maxn = 2 before printing the ngrams
  list_params <- list(
    command = "skipgram",
    lr = 0.1,
    dim = 5,
    input = path_read,
    output = path_write_vecs,
    verbose = 2,
    thread = 1,
    minn = 2,
    maxn = 2
  )
  fasttext_interface(list_params, path_output = path_write_logs, MilliSecs = 100)

  list_params <- list(command = "print-ngrams", model = path_model_bin, word = "word")
  out_data <- path_scratch_out
  fasttext_interface(list_params, path_output = out_data)

  read_ngrams <- read_output(out_data)

  testthat::expect_true(all(dim(read_ngrams) == c(5, 6)))

  # 'print-ngrams' prints to the R session too, just use :
  #     res = fasttext_interface(list_params, path_output = "")
})


testthat::test_that("the 'fasttext_interface' function writes output to folder when using the 'nn' command", {

  list_params <- list(command = "nn", model = path_model_bin, k = 5, query_word = "word")
  out_data <- path_scratch_out

  fasttext_interface(list_params, path_output = out_data)

  read_nn <- read_output(out_data)

  # one row per requested neighbour, 2 columns
  testthat::expect_true(nrow(read_nn) == list_params[["k"]] && ncol(read_nn) == 2)
})


testthat::test_that("the 'fasttext_interface' function writes output to folder when using the 'analogies' command", {

  list_params <- list(command = "analogies", model = path_model_bin, k = 5)
  out_data <- path_scratch_out

  fasttext_interface(
    list_params,
    path_input = file.path(getwd(), "analogy_queries.txt"),
    path_output = out_data
  )

  # The 'analogy_queries.txt' file contains 4 triplets and 5 analogies are requested
  # for each triplet. Therefore the output file should contain 4 * 5 + 4 = 24 rows
  # (the extra 4 because an empty line is added after each group of k analogies),
  # which is why blank lines must not be skipped when reading the file.
  read_analogies <- read_output(out_data, blank.lines.skip = FALSE)

  testthat::expect_true(nrow(read_analogies) == (4 * 5 + 4) && ncol(read_analogies) == 2)
})


testthat::test_that("the 'fasttext_interface' function writes output to folder when using the 'dump' command", {

  list_params <- list(command = "dump", model = path_model_bin, option = "args")
  out_data <- path_scratch_out

  fasttext_interface(list_params, path_output = out_data, remove_previous_file = TRUE)

  read_dump <- read_output(out_data)

  testthat::expect_true(all(dim(read_dump) == c(13, 2)))
})


#===================================
# 'language_identification' function
#===================================

testthat::test_that("the 'language_identification' function gives an error if the 'input_obj' parameter is neither a character vector consisting of character string(s) nor a valid path to a file", {

  lst_invalid <- list(1, 2, 3)

  testthat::expect_error(
    language_identification(
      input_obj = lst_invalid,
      pre_trained_language_model_path = pre_train_ftz,
      k = 1,
      th = 0.0,
      verbose = TRUE
    )
  )
})


testthat::test_that("the 'language_identification' function gives an error if the 'pre_trained_language_model_path' parameter does not point to a valid pre-trained weights file", {

  vec_txt <- c(
    "Incapaz de distinguir la luna y la cara de esta chica, Las estrellas se ponen nerviosas en el cielo.",
    "Unable to tell apart the moon and this girl's face, Stars are flustered up in the sky."
  )
  file_pretrained <- "INVALID_pre_trained_weights"

  testthat::expect_error(
    language_identification(
      input_obj = vec_txt,
      pre_trained_language_model_path = file_pretrained,
      k = 1,
      th = 0.0,
      verbose = TRUE
    )
  )
})


testthat::test_that("the 'language_identification' function returns the correct output when it takes a character vector of character strings as input", {

  vec_txt <- c(
    "Incapaz de distinguir la luna y la cara de esta chica, Las estrellas se ponen nerviosas en el cielo.",
    "Unable to tell apart the moon and this girl's face, Stars are flustered up in the sky."
  )

  res_out <- language_identification(
    input_obj = vec_txt,
    pre_trained_language_model_path = pre_train_ftz,
    k = 1,
    th = 0.0,
    verbose = TRUE
  )

  # one output row per input string
  testthat::expect_true(inherits(res_out, "data.table") && nrow(res_out) == 2)
})


testthat::test_that("the 'language_identification' function returns the correct output when it takes a valid path to a text file as input", {

  res_out <- language_identification(
    input_obj = path_lang_identify,
    pre_trained_language_model_path = pre_train_ftz,
    k = 1,
    th = 0.0,
    verbose = TRUE
  )

  testthat::expect_true(
    inherits(res_out, "data.table") &&
      nrow(res_out) > 1 &&
      length(unique(res_out$iso_lang_1)) >= 1
  )
})


testthat::test_that("the 'language_identification' function returns the correct output if the input object is 'data' (see Github issue https://github.com/mlampros/fastText/issues/3)", {

  # the literal string "data" is passed as the text to classify
  res_out <- language_identification(
    input_obj = "data",
    pre_trained_language_model_path = pre_train_ftz,
    k = 1,
    th = 0.0,
    verbose = TRUE
  )

  testthat::expect_true(inherits(res_out, "data.table") && nrow(res_out) == 1)
})