# In tests/testthat/test-data_preprocessing.R

test_that("detrending and retrending works as expected", {
  # Minimal parameter set handed to prepare_data_for_modelling().
  params <- list(
    target = "NO2",
    meteo_variables = c("TMP"),
    lightgbm = list(
      nrounds = 5,
      eta = 0.1,
      num_leaves = 8
    )
  )

  # Both components are observed at the same five timestamps.
  dates <- as.POSIXct(c(
    "2021-01-01 00:00:00",
    "2021-03-01 00:00:00",
    "2021-06-01 00:00:00",
    "2022-01-01 01:00:00",
    "2023-01-01 02:00:00"
  ))

  # Mock observations in long format: one table per component.
  env_data1 <- data.table::data.table(
    date = dates,
    Station = "TEST001",
    part = "test",
    Komponente = "TMP",
    Komponente_txt = "Temperature",
    Wert = c(10, 15, 20, 20, 30)
  )
  env_data2 <- data.table::data.table(
    date = dates,
    Station = "TEST001",
    part = "test",
    Komponente = "NO2",
    Komponente_txt = "Stickstoffdioxid",
    Wert = c(10, 20, 30, 40, 50)
  )
  env_data <- rbind(env_data1, env_data2)

  # Run clean_data with daily aggregation. The result is not asserted on;
  # the call is kept from the original test as a smoke check.
  cleaned_data <- clean_data(env_data, "TEST001", aggregate_daily = TRUE)

  application_start <- lubridate::ymd("20211201")
  application_end <- lubridate::ymd("20220901")

  dt_prepared <- prepare_data_for_modelling(env_data, params)
  split_data <- split_data_counterfactual(
    dt_prepared,
    application_start,
    application_end
  )
  detrended_data <- detrend(split_data)

  expect_true(inherits(split_data, "list"))
  # Only the 2022-01-01 observation lies inside the application window.
  expect_equal(nrow(split_data$apply), 1)
  expect_s3_class(detrended_data$train, "data.table")
  expect_s3_class(detrended_data$apply, "data.table")

  # The detrended training values must sum to (numerically) zero. abs() is
  # required: without it any large negative sum would also satisfy the check.
  # NOTE(review): despite the test name, no retrend step is exercised here --
  # consider adding a detrend/retrend round-trip assertion.
  expect_lt(abs(sum(detrended_data$train$value)), 1e-8)
})

# Shared, read-only fixtures for the scaling tests below. `value` is included
# because the rescale_predictions expectations look up the "value" entry of
# the scaling parameters.
train_data <- data.frame(
  var1 = c(1, 2, 3, 4, 5),
  var2 = c(5, 6, 7, 8, 9),
  value = c(10, 20, 30, 40, 50),
  var3 = c("A", "B", "C", "D", "E") # Non-numeric column
)
apply_data <- data.frame(
  var1 = c(6, 7, 8),
  var2 = c(10, 11, 12),
  value = c(60, 70, 80),
  var3 = c("F", "G", "H") # Non-numeric column
)

test_that("scale_data works with scaling", {
  result <- scale_data(train_data, apply_data)

  # Manually calculate means and standard deviations of the numeric columns.
  # vapply() keeps the column names and avoids apply()'s matrix coercion.
  numeric_train <- train_data[vapply(train_data, is.numeric, logical(1))]
  means <- colMeans(numeric_train)
  sds <- vapply(numeric_train, sd, numeric(1))

  # Manually scale train_data for comparison
  train_scaled_var1 <- (train_data$var1 - means["var1"]) / sds["var1"]
  train_scaled_var2 <- (train_data$var2 - means["var2"]) / sds["var2"]

  # Check if the train data has been scaled correctly
  expect_equal(round(result$train$var1, 2), round(train_scaled_var1, 2))
  expect_equal(round(result$train$var2, 2), round(train_scaled_var2, 2))

  # Check if apply data has been scaled using the means and sds of train data
  apply_scaled_var1 <- (apply_data$var1 - means["var1"]) / sds["var1"]
  apply_scaled_var2 <- (apply_data$var2 - means["var2"]) / sds["var2"]
  expect_equal(round(result$apply$var1, 2), round(apply_scaled_var1, 2))
  expect_equal(round(result$apply$var2, 2), round(apply_scaled_var2, 2))

  # Check if the non-numeric column is not affected
  expect_equal(result$train$var3, train_data$var3)
  expect_equal(result$apply$var3, apply_data$var3)

  # Check that means and sds are returned correctly
  expect_equal(result$means, means)
  expect_equal(result$sds, sds)
})

test_that("rescale_predictions rescales correctly using scaling parameters", {
  # Build the scaling parameters inside the test so that a failure in
  # scale_data() is reported as a test failure, not a file-level error.
  scale_result <- scale_data(train_data, apply_data)

  # Standardized predictions used as input for rescale_predictions
  dt_predictions <- data.frame(
    prediction = c(-1.2649111, 0, 1.2649111), # Standardized predictions
    prediction_lower = c(-1.5, 0, 1.5), # Standardized min predictions
    prediction_upper = c(-1.0, 0, 1.0), # Standardized max predictions
    var1 = scale_result$apply$var1, # Standardized var1 from apply_data
    var2 = scale_result$apply$var2 # Standardized var2 from apply_data
  )

  rescaled <- rescale_predictions(scale_result, dt_predictions)

  # Manually rescale with the "value" parameters. `[[` errors when the name
  # is missing, whereas `[` would silently yield NA and turn the comparisons
  # below into NA-versus-NA checks.
  center <- scale_result$means[["value"]]
  spread <- scale_result$sds[["value"]]
  expected_prediction <- dt_predictions$prediction * spread + center
  expected_lower <- dt_predictions$prediction_lower * spread + center
  expected_upper <- dt_predictions$prediction_upper * spread + center

  # The rescaled columns must contain real numbers, not NA.
  expect_false(anyNA(rescaled$prediction))
  expect_false(anyNA(rescaled$prediction_lower))
  expect_false(anyNA(rescaled$prediction_upper))

  # Compare rescaled predictions with dynamically calculated expected values
  expect_equal(
    round(rescaled$prediction, 6),
    round(expected_prediction, 6)
  )
  expect_equal(
    round(rescaled$prediction_lower, 6),
    round(expected_lower, 6)
  )
  expect_equal(
    round(rescaled$prediction_upper, 6),
    round(expected_upper, 6)
  )
})