skip_on_ci()
skip_spark_min_version(4.0)

test_that("Binarizer works", {
  sc <- use_test_spark_connect()
  expect_snapshot(class(ft_binarizer(sc, "a", "b")))
  expect_snapshot(class(ft_binarizer(ml_pipeline(sc), "a", "b")))
  expect_snapshot(
    use_test_table_mtcars() |>
      ft_binarizer("mpg", "mpg_new", threshold = 20) |>
      use_test_pull(TRUE)
  )
})

test_that("Bucket Random Projection LSH works", {
  sc <- use_test_spark_connect()
  expect_snapshot(class(ft_bucketed_random_projection_lsh(sc)))
  expect_snapshot(
    class(ft_bucketed_random_projection_lsh(ml_pipeline(sc)))
  )
  expect_snapshot(
    use_test_mtcars_va() |>
      ft_bucketed_random_projection_lsh("vec_x", "lsh_x", bucket_length = 1) |>
      use_test_pull()
  )
})

test_that("Bucketizer works", {
  sc <- use_test_spark_connect()
  expect_snapshot(class(ft_bucketizer(sc, "a", "b", c(1, 2, 3))))
  expect_snapshot(class(ft_bucketizer(ml_pipeline(sc), "a", "b", c(1, 2, 3))))
  expect_snapshot(
    use_test_table_mtcars() |>
      ft_bucketizer("mpg", "mpg_new", splits = c(0, 10, 20, 30, 40)) |>
      use_test_pull(TRUE)
  )
})

test_that("Count vectorizer works", {
  sc <- use_test_spark_connect()
  expect_snapshot(class(ft_count_vectorizer(sc)))
  expect_snapshot(class(ft_count_vectorizer(ml_pipeline(sc))))
  expect_snapshot(
    use_test_table_reviews() |>
      ft_tokenizer(input_col = "x", output_col = "token_x") |>
      ft_count_vectorizer("token_x", "cv_x") |>
      use_test_pull()
  )
})

test_that("DCT works", {
  sc <- use_test_spark_connect()
  expect_snapshot(class(ft_dct(sc)))
  expect_snapshot(class(ft_dct(ml_pipeline(sc))))
  expect_snapshot(
    use_test_mtcars_va() |>
      ft_dct("vec_x", "dct_x") |>
      use_test_pull()
  )
})

test_that("Discrete Cosine works", {
  sc <- use_test_spark_connect()
  expect_snapshot(class(ft_discrete_cosine_transform(sc)))
  expect_snapshot(class(ft_discrete_cosine_transform(ml_pipeline(sc))))
  expect_snapshot(
    use_test_mtcars_va() |>
      ft_discrete_cosine_transform("vec_x", "dct_x") |>
      use_test_pull()
  )
})

test_that("Elementwise Product works", {
  sc <- use_test_spark_connect()
  expect_snapshot(class(ft_elementwise_product(sc)))
  expect_snapshot(class(ft_elementwise_product(ml_pipeline(sc))))
  expect_snapshot(
    use_test_mtcars_va() |>
      ft_elementwise_product("vec_x", "elm_x", scaling_vec = c(1:3)) |>
      use_test_pull()
  )
})

test_that("Feature Hasher works", {
  sc <- use_test_spark_connect()
  expect_snapshot(class(ft_feature_hasher(sc)))
  expect_snapshot(class(ft_feature_hasher(ml_pipeline(sc))))
  expect_snapshot(
    use_test_table_mtcars() |>
      ft_feature_hasher(c("mpg", "wt", "cyl")) |>
      use_test_pull()
  )
})

test_that("Hashing TF works", {
  sc <- use_test_spark_connect()
  expect_snapshot(class(ft_hashing_tf(ml_pipeline(sc))))
  expect_snapshot(class(ft_hashing_tf(sc)))
  expect_snapshot(
    use_test_table_reviews() |>
      ft_tokenizer(input_col = "x", output_col = "token_x") |>
      ft_hashing_tf(
        input_col = "token_x",
        output_col = "hashed_x",
        binary = TRUE,
        num_features = 1024
      ) |>
      use_test_pull()
  )
})

test_that("IDF works", {
  sc <- use_test_spark_connect()
  expect_snapshot(class(ft_idf(sc)))
  expect_snapshot(class(ft_idf(ml_pipeline(sc))))
  expect_snapshot(
    use_test_mtcars_va() |>
      ft_idf("vec_x", "idf_x") |>
      use_test_pull()
  )
})

test_that("Imputer works", {
  sc <- use_test_spark_connect()
  expect_snapshot(class(ft_imputer(sc)))
  expect_snapshot(class(ft_imputer(ml_pipeline(sc))))
  expect_snapshot(
    use_test_table_simple() |>
      ft_imputer(list(c("x")), list(c("new_x"))) |>
      use_test_pull()
  )
})

test_that("Index-to-string works", {
  sc <- use_test_spark_connect()
  expect_snapshot(class(ft_index_to_string(ml_pipeline(sc))))
  expect_snapshot(class(ft_index_to_string(sc)))
  expect_snapshot(
    use_test_table_iris() |>
      ft_string_indexer("Species", "species_idx") |>
      ft_index_to_string("species_idx", "species_x") |>
      use_test_pull(TRUE)
  )
})

test_that("Interaction works", {
  sc <- use_test_spark_connect()
  expect_snapshot(class(ft_interaction(sc)))
  expect_snapshot(class(ft_interaction(ml_pipeline(sc))))
  expect_snapshot(
    use_test_table_mtcars() |>
      ft_interaction(c("mpg", "wt"), c("mpg_wt")) |>
      use_test_pull()
  )
})

test_that("Max Abs Scaler works", {
  sc <- use_test_spark_connect()
  expect_snapshot(class(ft_max_abs_scaler(sc)))
  expect_snapshot(class(ft_max_abs_scaler(ml_pipeline(sc))))
  expect_snapshot(
    use_test_mtcars_va() |>
      ft_max_abs_scaler("vec_x", "rs_x") |>
      use_test_pull()
  )
})

test_that("Min Hash LSH works", {
  sc <- use_test_spark_connect()
  expect_snapshot(class(ft_minhash_lsh(sc)))
  expect_snapshot(class(ft_minhash_lsh(ml_pipeline(sc))))
  expect_snapshot(
    use_test_iris_va() |>
      ft_minhash_lsh("vec_x", "hash_x") |>
      use_test_pull() |>
      round() |>
      table()
  )
})

test_that("N-gram works", {
  sc <- use_test_spark_connect()
  expect_snapshot(class(ft_ngram(sc)))
  expect_snapshot(class(ft_ngram(ml_pipeline(sc))))
  expect_snapshot(
    use_test_table_reviews() |>
      ft_tokenizer("x", "token_x") |>
      ft_ngram("token_x", "ngram_x") |>
      dplyr::pull()
  )
})

test_that("Normalizer works", {
  sc <- use_test_spark_connect()
  expect_snapshot(class(ft_normalizer(ml_pipeline(sc))))
  expect_snapshot(class(ft_normalizer(sc)))
  expect_snapshot(
    use_test_table_reviews() |>
      ft_tokenizer(input_col = "x", output_col = "token_x") |>
      ft_stop_words_remover(input_col = "token_x", output_col = "stop_x") |>
      ft_hashing_tf(
        input_col = "stop_x",
        output_col = "hashed_x",
        binary = TRUE,
        num_features = 1024
      ) |>
      ft_normalizer(
        input_col = "hashed_x",
        output_col = "normal_x"
      ) |>
      use_test_pull()
  )
})

test_that("One hot encoder works", {
  sc <- use_test_spark_connect()
  expect_snapshot(class(ft_one_hot_encoder(sc)))
  expect_snapshot(class(ft_one_hot_encoder(ml_pipeline(sc))))
  expect_snapshot(
    use_test_table_simple() |>
      ft_one_hot_encoder(list(c("y")), list(c("ohe_x"))) |>
      use_test_pull()
  )
})

test_that("PCA works", {
  sc <- use_test_spark_connect()
  expect_snapshot(class(ft_pca(sc)))
  expect_snapshot(class(ft_pca(ml_pipeline(sc))))
  expect_snapshot(
    use_test_mtcars_va() |>
      ft_pca("vec_x", "pca_x", k = 2) |>
      use_test_pull()
  )
})

test_that("Polynomial expansion works", {
  sc <- use_test_spark_connect()
  expect_snapshot(class(ft_polynomial_expansion(sc)))
  expect_snapshot(class(ft_polynomial_expansion(ml_pipeline(sc))))
  expect_snapshot(
    use_test_mtcars_va() |>
      ft_polynomial_expansion("vec_x", "pe_x", degree = 2) |>
      use_test_pull()
  )
})

test_that("Quantile discretizer works", {
  sc <- use_test_spark_connect()
  expect_snapshot(class(ft_quantile_discretizer(sc)))
  expect_snapshot(class(ft_quantile_discretizer(ml_pipeline(sc))))
  expect_snapshot(
    use_test_table_simple() |>
      ft_quantile_discretizer(c("y"), c("ohe_x")) |>
      use_test_pull()
  )
})

test_that("R Formula works", {
  sc <- use_test_spark_connect()
  expect_snapshot(class(ft_r_formula(ml_pipeline(sc))))
  expect_snapshot(class(ft_r_formula(sc)))
  expect_snapshot(
    use_test_table_mtcars() |>
      ft_r_formula(mpg ~ ., features_col = "test") |>
      dplyr::select(test) |>
      use_test_pull()
  )
})

test_that("Regex Tokenizer works", {
  sc <- use_test_spark_connect()
  expect_snapshot(class(ft_regex_tokenizer(sc)))
  expect_snapshot(class(ft_regex_tokenizer(ml_pipeline(sc))))
  expect_snapshot(
    use_test_table_reviews() |>
      ft_regex_tokenizer("x", "new_x") |>
      dplyr::pull()
  )
})
test_that("Standard Scaler works", { sc <- use_test_spark_connect() expect_snapshot(class(ft_standard_scaler(sc))) expect_snapshot(class(ft_standard_scaler(ml_pipeline(sc)))) expect_snapshot( use_test_mtcars_va() |> ft_standard_scaler("vec_x", "rs_x") |> use_test_pull() ) }) test_that("Robust Scaler works", { sc <- use_test_spark_connect() expect_snapshot(class(ft_robust_scaler(sc))) expect_snapshot(class(ft_robust_scaler(ml_pipeline(sc)))) expect_snapshot( use_test_mtcars_va() |> ft_robust_scaler("vec_x", "rs_x") |> use_test_pull() ) }) test_that("SQL transformer works", { sc <- use_test_spark_connect() expect_snapshot(class(ft_sql_transformer(sc))) expect_snapshot(class(ft_sql_transformer(ml_pipeline(sc)))) expect_snapshot( use_test_mtcars_va() |> ft_sql_transformer("select * from __THIS__ where mpg > 20") |> use_test_pull() ) }) test_that("Stop words remover works", { sc <- use_test_spark_connect() expect_snapshot(class(ft_tokenizer(ml_pipeline(sc)))) expect_snapshot(class(ft_tokenizer(sc))) expect_snapshot( use_test_table_reviews() |> ft_tokenizer(input_col = "x", output_col = "token_x") |> ft_stop_words_remover(input_col = "token_x", output_col = "stop_x") |> dplyr::pull() ) }) test_that("String indexer works", { sc <- use_test_spark_connect() expect_snapshot(class(ft_string_indexer(ml_pipeline(sc)))) expect_snapshot(class(ft_string_indexer(sc))) expect_snapshot( use_test_table_iris() |> ft_string_indexer("Species", "species_idx") |> use_test_pull(TRUE) ) }) test_that("Tokenizer works", { sc <- use_test_spark_connect() expect_snapshot(class(ft_tokenizer(ml_pipeline(sc)))) expect_snapshot(class(ft_tokenizer(sc))) expect_snapshot( use_test_table_reviews() |> ft_tokenizer(input_col = "x", output_col = "token_x") |> dplyr::pull() ) }) test_that("Vector assembler works", { sc <- use_test_spark_connect() expect_snapshot(class(ft_vector_assembler(ml_pipeline(sc)))) expect_snapshot(class(ft_vector_assembler(sc))) expect_snapshot( use_test_table_mtcars() |> ft_vector_assembler( input_cols = c("mpg", "wt", "cyl"), output_col = "vec_x" ) |> use_test_pull() ) }) test_that("Vector indexer works", { sc <- use_test_spark_connect() expect_snapshot(class(ft_vector_indexer(ml_pipeline(sc)))) expect_snapshot(class(ft_vector_indexer(sc))) expect_equal( use_test_mtcars_va() |> ft_vector_indexer("vec_x", "index_x") |> use_test_pull() |> nrow(), 32 ) }) test_that("Vector slicer works", { sc <- use_test_spark_connect() expect_snapshot(class(ft_vector_slicer(ml_pipeline(sc)))) expect_snapshot(class(ft_vector_slicer(sc))) expect_snapshot( use_test_mtcars_va() |> ft_vector_slicer("vec_x", "index_x", indices = list(1L)) |> use_test_pull() ) }) test_that("Word2Vec works", { sc <- use_test_spark_connect() expect_snapshot(class(ft_word2vec(ml_pipeline(sc)))) expect_snapshot(class(ft_word2vec(sc))) expect_snapshot( use_test_table_reviews() |> ft_tokenizer("x", "token_x") |> ft_word2vec("token_x", "word_x", min_count = 1) |> use_test_pull() ) })