context("feature_specs") # Set up ------------------------------------------------------------------ skip_if_not_tf <- function() { skip_if_no_tensorflow(required_version = "2.0") } df <- list( a = letters, b = 1:length(letters), c = runif(length(letters)), d = LETTERS, y = runif(length(letters)) ) dataset <- df %>% tensor_slices_dataset() %>% dataset_batch(2) get_features <- function(df, feature_columns) { if (tensorflow::tf$executing_eagerly()) example <- reticulate::iter_next(reticulate::as_iterator(df)) else { example <- make_iterator_one_shot(df) example <- iterator_get_next(example) } skip("layer_dense_features()/tfestimators deprecated") # use keras3::layer_feature_space() # k <- keras::layer_dense_features(feature_columns = feature_columns) if (tensorflow::tf$executing_eagerly()) return(k(example)) else { res <- k(example) sess <- tf$Session() return(sess$run(res)) } } # Tests ------------------------------------------------------------------- test_that("Can create a feature_spec", { skip_if_not_tf() spec <- feature_spec(dataset, y ~ a+b+c+d) expect_equal(sort(spec$feature_names()), sort(names(df)[-which(names(df) == "y")])) }) test_that("Can create numeric columns", { skip_if_not_tf() spec <- feature_spec(dataset, y ~ a+b+c+d) %>% step_numeric_column(b, c) spec$fit() #TODO use the fit S3 method when available expect_length(spec$features(), 2) expect_named(spec$features(), c("b", "c")) expect_s3_class(spec$features()[[1]], "tensorflow.python.feature_column.feature_column._DenseColumn") expect_s3_class(spec$features()[[2]], "tensorflow.python.feature_column.feature_column._DenseColumn") }) test_that("Can create categorical columns with vocabulary list", { skip_if_not_tf() spec <- feature_spec(dataset, y ~ a + b + c + d) %>% step_categorical_column_with_vocabulary_list(a, d) spec$fit() expect_length(spec$features(), 2) expect_named(spec$features(), c("a", "d")) expect_s3_class(spec$features()[[1]], "tensorflow.python.feature_column.feature_column._CategoricalColumn") expect_s3_class(spec$features()[[2]], "tensorflow.python.feature_column.feature_column._CategoricalColumn") spec <- feature_spec(dataset, y ~ a+b+c+d) %>% step_categorical_column_with_vocabulary_list(a, vocabulary_list = letters[1:5]) spec$fit() expect_length(spec$features(), 1) expect_length(spec$dense_features(), 0) }) test_that("Can create categorical columns with hash_bucket", { skip_if_not_tf() spec <- feature_spec(dataset, y ~ a + b + c + d) %>% step_categorical_column_with_hash_bucket(a, d, hash_bucket_size = 10) spec$fit() expect_length(spec$features(), 2) expect_named(spec$features(), c("a", "d")) expect_s3_class(spec$features()[[1]], "tensorflow.python.feature_column.feature_column._CategoricalColumn") expect_s3_class(spec$features()[[2]], "tensorflow.python.feature_column.feature_column._CategoricalColumn") }) test_that("Can create categorical columns with identity", { skip_if_not_tf() spec <- feature_spec(dataset, y ~ a + b + c + d) %>% step_categorical_column_with_identity(a, num_buckets = 10) spec$fit() expect_length(spec$features(), 1) expect_named(spec$features(), c("a")) expect_s3_class(spec$features()[[1]], "tensorflow.python.feature_column.feature_column._CategoricalColumn") }) test_that("Can create categorical columns with vocabulary file", { skip_if_not_tf() tmp <- tempfile() writeLines(tmp, text = letters) spec <- feature_spec(dataset, y ~ a + b + c + d) %>% step_categorical_column_with_vocabulary_file(a, vocabulary_file = tmp) spec$fit() expect_length(spec$features(), 1) 
expect_named(spec$features(), c("a")) expect_s3_class(spec$features()[[1]], "tensorflow.python.feature_column.feature_column._CategoricalColumn") }) test_that("Can create indicator variables", { skip_if_not_tf() spec <- feature_spec(dataset, y ~ a+b+c+d) %>% step_categorical_column_with_vocabulary_list(a, d) %>% step_indicator_column(a, d) spec$fit() expect_length(spec$dense_features(), 2) expect_named(spec$dense_features(), c("indicator_a", "indicator_d")) expect_s3_class(spec$dense_features()[[1]], "tensorflow.python.feature_column.feature_column_v2.IndicatorColumn") spec <- feature_spec(dataset, y ~ a+b+c+d) %>% step_categorical_column_with_vocabulary_list(a, d) %>% step_indicator_column(ind_a = a) spec$fit() expect_named(spec$dense_features(), c("ind_a")) }) test_that("Can create embedding columns", { skip_if_not_tf() spec <- feature_spec(dataset, y ~ a+b+c+d) %>% step_categorical_column_with_vocabulary_list(a, d) %>% step_embedding_column(a, d, dimension = 5) spec$fit() expect_length(spec$dense_features(), 2) expect_named(spec$dense_features(), c("embedding_a", "embedding_d")) expect_s3_class(spec$dense_features()[[1]], "tensorflow.python.feature_column.feature_column_v2.EmbeddingColumn") spec <- feature_spec(dataset, y ~ a+b+c+d) %>% step_categorical_column_with_vocabulary_list(a, d) %>% step_embedding_column(emb_a = a, dimension = 5) spec$fit() expect_named(spec$dense_features(), c("emb_a")) spec <- feature_spec(dataset, y ~ a+b+c+d) %>% step_categorical_column_with_vocabulary_list(a, d) %>% step_embedding_column(a, d) spec$fit() expect_length(spec$dense_features(), 2) expect_named(spec$dense_features(), c("embedding_a", "embedding_d")) expect_s3_class(spec$dense_features()[[1]], "tensorflow.python.feature_column.feature_column_v2.EmbeddingColumn") }) test_that("Can create crossed columns", { skip_if_not_tf() spec <- feature_spec(dataset, y ~ a+b+c+d) %>% step_categorical_column_with_vocabulary_list(a, d) %>% step_crossed_column(c(a, d), hash_bucket_size = 100) %>% step_indicator_column(crossed_a_d) spec$fit() expect_named(spec$dense_features(), "indicator_crossed_a_d") expect_s3_class(spec$dense_features()[[1]], "tensorflow.python.feature_column.feature_column_v2.IndicatorColumn") expect_s3_class(spec$features()$crossed_a_d, "tensorflow.python.feature_column.feature_column_v2.CrossedColumn") }) test_that("Can create bucketized columns", { skip_if_not_tf() spec <- feature_spec(dataset, y ~ a+b+c+d) %>% step_numeric_column(b) %>% step_bucketized_column(b, boundaries = c(5, 10, 15)) spec$fit() expect_s3_class(spec$dense_features()$bucketized_b, "tensorflow.python.feature_column.feature_column_v2.BucketizedColumn") }) test_that("Can remove columns", { skip_if_not_tf() spec <- feature_spec(dataset, y ~ a+b+c+d) %>% step_numeric_column(b) %>% step_bucketized_column(b, boundaries = c(5, 10, 15)) %>% step_remove_column(b) spec$fit() expect_length(spec$features(), 1) }) test_that("Using with layer_dense_features", { skip_if_not_tf() skip("layer_dense_features") spec <- feature_spec(dataset, y ~ a+b+c+d) %>% step_numeric_column(b, c) %>% step_categorical_column_with_vocabulary_list(a, d) %>% step_indicator_column(a, d) spec$fit() skip("layer_dense_features()/tfestimators deprecated") # use keras3::layer_feature_space() # lyr <- keras::layer_dense_features(feature_columns = spec$dense_features()) ds <- reticulate::as_iterator(dataset) x <- lyr(reticulate::iter_next(ds)) if (tensorflow::tf$executing_eagerly()) expect_equal(x$shape$as_list(), c(2, 2 + 2*26)) else 
test_that("Using with layer_dense_features", {
  skip_if_not_tf()
  skip("layer_dense_features()/tfestimators deprecated") # use keras3::layer_feature_space()
  spec <- feature_spec(dataset, y ~ a + b + c + d) %>%
    step_numeric_column(b, c) %>%
    step_categorical_column_with_vocabulary_list(a, d) %>%
    step_indicator_column(a, d)
  spec$fit()
  # lyr <- keras::layer_dense_features(feature_columns = spec$dense_features())
  ds <- reticulate::as_iterator(dataset)
  x <- lyr(reticulate::iter_next(ds))
  if (tensorflow::tf$executing_eagerly())
    expect_equal(x$shape$as_list(), c(2, 2 + 2 * 26))
  else
    expect_equal(x$shape$as_list()[[2]], 2 + 2 * 26)
})

test_that("Recipes are correctly cloned/immutable", {
  skip_if_not_tf()
  spec <- feature_spec(dataset, y ~ a + b + c + d) %>%
    step_numeric_column(b, c) %>%
    step_categorical_column_with_vocabulary_list(a, d)

  spec1 <- spec %>% step_indicator_column(a, d)
  spec2 <- spec %>% step_indicator_column(a, d)

  spec1$fit()
  expect_length(spec1$features(), 6)
  expect_error(spec2$features())
  expect_error(spec$features())

  spec <- feature_spec(dataset, y ~ a + b + c + d) %>%
    step_numeric_column(b, c) %>%
    step_categorical_column_with_vocabulary_list(a, d) %>%
    step_indicator_column(a, d)
  spec_prep <- fit(spec)
  expect_length(spec_prep$features(), 6)
  expect_error(spec$features())
})

test_that("Recipes column types", {
  skip_if_not_tf()
  spec <- feature_spec(dataset, y ~ a + b + c + d) %>%
    step_numeric_column(b) %>%
    step_categorical_column_with_vocabulary_list(a, d) %>%
    step_indicator_column(a, d)
  expect_equal(
    spec$feature_types(),
    c("float32", "string", "string", "float32", "float32", "float32")
  )
})

test_that("Fit feature_spec", {
  skip_if_not_tf()
  spec <- feature_spec(dataset, y ~ a + b + c + d) %>%
    step_numeric_column(b) %>%
    step_categorical_column_with_vocabulary_list(a, d) %>%
    step_indicator_column(a, d)
  spec_prep <- fit(spec)
  expect_error(dataset_use_spec(dataset, spec))
  expect_s3_class(dataset_use_spec(dataset, spec_prep), "tensorflow.python.data.ops.dataset_ops.DatasetV2")
})

test_that("Prep with different dataset", {
  skip_if_not_tf()
  spec <- feature_spec(dataset, y ~ a + b + c + d) %>%
    step_numeric_column(b) %>%
    step_categorical_column_with_vocabulary_list(a, d) %>%
    step_indicator_column(a, d)
  ds <- df %>%
    tensor_slices_dataset() %>%
    dataset_take(10)
  spec_prep <- fit(spec, ds)
  expect_s3_class(dataset_use_spec(ds, spec_prep), "tensorflow.python.data.ops.dataset_ops.DatasetV2")
})

test_that("Can select with has_type", {
  skip_if_not_tf()
  spec <- feature_spec(dataset, y ~ a + b + c + d) %>%
    step_numeric_column(has_type("float32")) %>%
    step_numeric_column(has_type("int32"))
  expect_length(spec$steps, 2)

  spec <- feature_spec(dataset, y ~ a + b + c + d) %>%
    step_numeric_column(has_type("float32")) %>%
    step_numeric_column(has_type("int32")) %>%
    step_categorical_column_with_vocabulary_list(has_type("string")) %>%
    step_indicator_column(has_type("string"))
  expect_length(spec$steps, 6)
  expect_error(spec %>% step_indicator_column(a = has_type("string")))

  spec <- feature_spec(dataset, y ~ a + b + c + d) %>%
    step_numeric_column(all_numeric()) %>%
    step_categorical_column_with_vocabulary_list(has_type("string")) %>%
    step_indicator_column(all_nominal())
  expect_length(spec$steps, 6)
})

test_that("Can remove variables using -", {
  skip_if_not_tf()
  spec <- feature_spec(dataset, y ~ a + b + c + d) %>%
    step_numeric_column(all_numeric(), -b) %>%
    step_categorical_column_with_vocabulary_list(all_nominal()) %>%
    step_indicator_column(all_nominal(), -a)
  spec <- fit(spec)
  expect_length(spec$dense_features(), 2)
  expect_named(spec$dense_features(), c("c", "indicator_d"))
})

test_that("StandardScaler works as expected", {
  x <- runif(100)
  sc <- StandardScaler$new()
  splits <- split(x, rep(1:10, each = 10))
  invisible(lapply(splits, sc$fit_batch))
  sc$fit_resume()
  expect_equal(sc$mean, mean(x))
  expect_equal(sc$sd, sd(x))
})
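# A minimal sketch (not tfdatasets' internals) of what fit_batch()/fit_resume()
# accomplish above: each batch only needs to contribute its length, sum and
# sum of squares, and the global mean/sd can be pooled from those summaries.
# `pool_stats` is a hypothetical helper, shown for illustration only.
pool_stats <- function(batches) {
  n  <- sum(lengths(batches))
  s  <- sum(vapply(batches, sum, numeric(1)))
  sq <- sum(vapply(batches, function(b) sum(b^2), numeric(1)))
  m  <- s / n
  # sum((x - m)^2) == sum(x^2) - n * m^2, so sample sd needs only the summaries
  list(mean = m, sd = sqrt((sq - n * m^2) / (n - 1)))
}
# e.g. pool_stats(split(x, rep(1:10, each = 10))) agrees with mean(x) and sd(x)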
test_that("Can use a scaler_standard", {
  skip_if_not_tf()
  spec <- feature_spec(dataset, y ~ a + b + c + d) %>%
    step_numeric_column(all_numeric(), normalizer_fn = scaler_standard())
  spec <- fit(spec)
  value <- as.matrix(get_features(dataset, spec$dense_features()))
  normalized_c <- (df$c - mean(df$c)) / sd(df$c)
  normalized_b <- (df$b - mean(df$b)) / sd(df$b)
  expect_equal(as.numeric(value[, 2]), normalized_c[1:2], tolerance = 1e-6)
  expect_equal(as.numeric(value[, 1]), normalized_b[1:2], tolerance = 1e-6)
})

test_that("MinMaxScaler works as expected", {
  x <- runif(100)
  sc <- MinMaxScaler$new()
  splits <- split(x, rep(1:10, each = 10))
  invisible(lapply(splits, sc$fit_batch))
  sc$fit_resume()
  expect_equal(sc$min, min(x))
  expect_equal(sc$max, max(x))
})

test_that("Can use a scaler_min_max", {
  skip_if_not_tf()
  spec <- feature_spec(dataset, y ~ a + b + c + d) %>%
    step_numeric_column(all_numeric(), normalizer_fn = scaler_min_max())
  spec <- fit(spec)
  value <- as.matrix(get_features(dataset, spec$dense_features()))
  normalized_c <- (df$c - min(df$c)) / (max(df$c) - min(df$c))
  normalized_b <- (df$b - min(df$b)) / (max(df$b) - min(df$b))
  expect_equal(as.numeric(value[, 2]), normalized_c[1:2], tolerance = 1e-6)
  expect_equal(as.numeric(value[, 1]), normalized_b[1:2], tolerance = 1e-6)
})
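# `normalizer_fn` accepts any function of a single tensor, not only the
# built-in scalers tested above. A hypothetical log transform, for
# illustration only (the function body is an assumption about what a user
# might supply, not part of these tests):
#
# spec <- feature_spec(dataset, y ~ a + b + c + d) %>%
#   step_numeric_column(b, normalizer_fn = function(x) {
#     tf$math$log1p(tf$cast(x, tf$float32))
#   })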
test_that("Can use layer_input_from_dataset with TF datasets", {
  skip_if_not_tf()
  skip("layer_dense_features()/tfestimators deprecated") # use keras3::layer_feature_space()
  spec <- feature_spec(dataset, y ~ a + b + c + d) %>%
    step_numeric_column(all_numeric(), normalizer_fn = scaler_min_max())
  spec <- fit(spec)
  ds <- dataset_use_spec(dataset, spec)
  input <- layer_input_from_dataset(ds)
  # output <- input %>%
  #   keras::layer_dense_features(spec$dense_features())
  # model <- keras::keras_model(inputs = input, outputs = output)
  expect_length(input, 4)
  if (tf$executing_eagerly())
    expect_equal(dim(as.matrix(model(next_batch(ds)[[1]]))), c(2, 2))
})

test_that("Can use layer_input_from_dataset with TF data frames", {
  skip_if_not_tf()
  skip("layer_dense_features()/tfestimators deprecated") # use keras3::layer_feature_space()
  spec <- feature_spec(as.data.frame(df), y ~ a + b + c + d) %>%
    step_numeric_column(all_numeric(), normalizer_fn = scaler_min_max())
  spec <- fit(spec)
  input <- layer_input_from_dataset(as.data.frame(df)[, 1:4])
  # output <- input %>%
  #   keras::layer_dense_features(spec$dense_features()) %>%
  #   keras::layer_dense(units = 1)
  # model <- keras::keras_model(inputs = input, outputs = output)
  # keras::compile(model, loss = "mse", optimizer = "adam")
  # hist <- keras::fit(model, x = df, y = df$y, verbose = 0)
  expect_s3_class(hist, "keras_training_history")
})

test_that("Can use data.frames", {
  skip_if_not_tf()
  spec <- feature_spec(hearts, target ~ .) %>%
    step_numeric_column(
      all_numeric(), -cp, -restecg, -exang, -sex, -fbs,
      normalizer_fn = scaler_standard()
    ) %>%
    step_categorical_column_with_vocabulary_list(thal) %>%
    step_bucketized_column(age, boundaries = c(18, 25, 30, 35, 40, 45, 50, 55, 60, 65)) %>%
    step_indicator_column(thal) %>%
    step_embedding_column(thal, dimension = 2) %>%
    step_crossed_column(c(thal, bucketized_age), hash_bucket_size = 10) %>%
    step_indicator_column(crossed_thal_bucketized_age) %>%
    fit()
  expect_length(spec$dense_features(), 11)
})

test_that("Correctly creates indicator vars", {
  skip_if_not_tf()
  skip("layer_dense_features()/tfestimators deprecated") # use keras3::layer_feature_space()
  x <- data.frame(
    y = runif(5),
    x = c("a", "aã", "b", "c", "d"),
    b = runif(5),
    stringsAsFactors = FALSE
  )
  spec <- feature_spec(x, y ~ x) %>%
    step_categorical_column_with_vocabulary_list(x) %>%
    step_indicator_column(x)
  spec <- fit(spec)
  # k <- keras::layer_dense_features(feature_columns = spec$dense_features())
  res <- as.matrix(k(list(x = x$x)))
  expect_equal(res, diag(nrow(res)))
})

test_that("feature_spec works with make_csv_dataset", {
  skip_if_not_tf()
  TRAIN_DATA_URL <- "https://storage.googleapis.com/tf-datasets/titanic/train.csv"
  train_file_path <- keras3::get_file("train_csv", TRAIN_DATA_URL)
  train_dataset <- make_csv_dataset(
    train_file_path,
    field_delim = ",",
    batch_size = 5,
    num_epochs = 1
  )
  spec <- feature_spec(train_dataset, survived ~ .)
  expect_s3_class(spec, class = "FeatureSpec")
})

test_that("can create image embedding steps", {
  skip_if_not_tf()
  skip("layer_dense_features()/tfestimators deprecated") # use keras3::layer_feature_space()
  if (tensorflow::tf$executing_eagerly())
    skip("Needs non-eager execution.")
  df <- list(img = array(0, dim = c(1, 192, 192, 3)))
  df <- tensor_slices_dataset(df)
  spec <- feature_spec(df, x = c(img)) %>%
    step_image_embedding_column(
      img,
      module_spec = "https://tfhub.dev/google/imagenet/mobilenet_v1_075_192/quantops/feature_vector/3"
    )
  spec <- spec %>% fit()
  # layer <- keras::layer_dense_features(feature_columns = spec$dense_features())
  x <- layer(list(img = array(0, dim = c(1, 192, 192, 3))))
  expect_equal(x$get_shape()$as_list(), c(1L, 768L))
})

test_that("can create text embedding columns", {
  # TODO: this was removed in tfhub; delete this test
  skip_if_not_tf()
  skip("layer_dense_features()/tfestimators deprecated") # use keras3::layer_feature_space()
  if (tensorflow::tf$executing_eagerly())
    skip("Needs non-eager execution.")
  df <- list(txt = c("hello world", "hello world"))
  df <- tensor_slices_dataset(df)
  spec <- feature_spec(df, x = c(txt)) %>%
    step_text_embedding_column(txt, module_spec = "https://tfhub.dev/google/nnlm-en-dim50/1")
  spec <- spec %>% fit()
  # layer <- keras::layer_dense_features(feature_columns = spec$dense_features())
  x <- layer(list(txt = c("hello world", "hello world")))
  expect_equal(x$get_shape()$as_list(), list(NULL, 50L))
})
test_that("can save and reload models that use a normalizer_fn", {
  data <- data.frame(
    y = runif(5),
    x = runif(5),
    b = runif(5)
  )
  spec <- feature_spec(data, y ~ .) %>%
    step_numeric_column(x, normalizer_fn = scaler_standard()) %>%
    fit()

  skip("layer_dense_features()/tfestimators deprecated") # use keras3::layer_feature_space()

  input <- layer_input_from_dataset(data[-1])
  output <- input %>%
    layer_dense_features(dense_features(spec)) %>%
    layer_dense(units = 1, activation = "sigmoid")
  model <- keras_model(input, output)
  model %>% compile(
    loss = "binary_crossentropy",
    optimizer = "adam",
    metrics = "binary_accuracy"
  )

  tmp <- tempfile("model")
  rds <- tempfile("rds")
  save_model_weights_tf(model, tmp)
  saveRDS(spec, rds)

  reloaded_spec <- readRDS(rds)
  input <- layer_input_from_dataset(data[-1])
  output <- input %>%
    layer_dense_features(dense_features(reloaded_spec)) %>%
    layer_dense(units = 1, activation = "sigmoid")
  new_model <- keras_model(input, output)
  load_model_weights_tf(new_model, tmp)

  expect_equal(
    predict(model, data[-1]),
    predict(new_model, data[-1])
  )
})
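# Note: save_model_weights_tf()/load_model_weights_tf() come from the retired
# keras package. A sketch of the same weights round trip under keras3
# (assumed API, shown for orientation only; Keras 3 requires weight files to
# end in ".weights.h5"):
#
# tmp <- tempfile(fileext = ".weights.h5")
# keras3::save_model_weights(model, tmp)
# keras3::load_model_weights(new_model, tmp)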