test_that("local_autocast works", { x <- torch_randn(5, 5, dtype = torch_float32()) y <- torch_randn(5, 5, dtype = torch_float32()) foo <- function(x, y) { local_autocast(device_type = "cpu") z <- torch_mm(x, y) w <- torch_mm(z, x) w } out <- foo(x, y) expect_equal(out$dtype$.type(), "BFloat16") a <- torch_mm(x, out$float()) expect_true(a$dtype == torch_float()) }) test_that("with autocast works", { x <- torch_randn(5, 5, dtype = torch_float32()) y <- torch_randn(5, 5, dtype = torch_float32()) with_autocast(device_type="cpu", { z <- torch_mm(x, y) w <- torch_mm(z, x) }) expect_equal(w$dtype$.type(), "BFloat16") a <- torch_mm(x, w$float()) expect_true(a$dtype == torch_float()) }) test_that("works on gpu", { skip_if_cuda_not_available() x <- torch_randn(5, 5, dtype = torch_float32(), device="cuda") y <- torch_randn(5, 5, dtype = torch_float32(), device="cuda") with_autocast(device_type="cuda", { z <- torch_mm(x, y) w <- torch_mm(z, x) }) expect_equal(w$dtype$.type(), "Half") expect_true(w$device == torch_device("cuda", 0)) a <- torch_mm(x, w$float()) expect_true(a$dtype == torch_float()) }) test_that("unscale skipping works", { skip_if_cuda_not_available() model <- nn_linear(2, 2)$cuda() x <- lapply(1:50, function(x) torch_randn(2, 2, dtype = torch_float32(), device="cuda")) y <- lapply(1:50, function(x) torch_randn(2, 2, dtype = torch_float32(), device="cuda")) loss_fn <- nn_mse_loss() orig_params <- lapply(model$parameters, function(x) x$clone()$detach()) optimizer <- optim_sgd(model$parameters, lr=0.001) scaler <- cuda_amp_grad_scaler(enabled=TRUE, init_scale=128.0) for(i in seq_along(x)) { with_autocast(device_type="cuda", dtype=torch_float16(), { output <- model(x[[i]]) loss <- loss_fn(output, y[[i]]) }) scaler$scale(loss)$backward() scaler$unscale_(optimizer) # deliberately break grads model$parameters[[1]]$grad$copy_(torch_tensor(Inf)$cuda()) model$parameters[[2]]$grad$copy_(torch_tensor(NaN)$cuda()) scaler$step(optimizer) scaler$update() } expect_equal_to_tensor(model$parameters[[1]]$cpu(), orig_params[[1]]$cpu()) expect_equal_to_tensor(model$parameters[[2]]$cpu(), orig_params[[2]]$cpu()) }) test_that("loss is scaled correctly", { skip_if_cuda_not_available() model <- nn_linear(2, 2)$cuda() x <- torch_randn(2, 2, device="cuda") y <- torch_randn(2, 2, device="cuda") loss_fn <- nn_mse_loss()$cuda() scaler <- cuda_amp_grad_scaler(init_scale = 1000) with_autocast( device_type="cuda", dtype=torch_float16(), { output <- model(x) loss <- loss_fn(output, y) } ) scaled_loss <- scaler$scale(loss) expect_equal((scaled_loss/loss)$item(), scaler$.scale$item(), tolerance = 1e-4) }) test_that("scaling the loss works", { skip_if_cuda_not_available() model <- nn_linear(2, 2)$cuda() for(par in model$parameters) { # initialize parameters with 0 so gradients should also be small nn_init_constant_(par, 0) } x <- torch_randn(2048, 2, device="cuda")/1e3 y <- torch_randn(2048, 2, device="cuda")/1e3 loss_fn <- nn_mse_loss()$cuda() with_autocast( device_type="cuda", dtype=torch_float16(), { output <- model(x) loss <- loss_fn(output, y) } ) loss$backward() # gradients are so small that they become 0 expect_equal( as.matrix(model$weight$grad$cpu()), array(rep(0, 4), dim = c(2,2)), tolerance = 1e-6 ) # now we scale the loss and gradients scaler <- cuda_amp_grad_scaler() with_autocast( device_type="cuda", dtype=torch_float16(), { output <- model(x) loss <- loss_fn(output, y) } ) scaler$scale(loss)$backward() model$weight$grad expect_true(!any(as.matrix(model$weight$grad$cpu()) == 0)) }) test_that("internal 
cpp_amp_check works", { skip_if_cuda_not_available() net <- nn_linear(2, 2)$cuda() x <- torch_randn(2, 2, device="cuda") y <- torch_randn(2, 2, device="cuda") loss_fn <- nn_mse_loss()$cuda() loss <- loss_fn(net(x), y) loss$backward() dummy_found_inf <- torch_full(list(), 0, device="cuda") inv_scale <- torch_full(list(), 1, device="cuda") found_inf <- cpp_amp_foreach_non_finite_check_and_unscale(net$parameters, dummy_found_inf, inv_scale) expect_equal(found_inf, 0) net$weight$grad$copy_(torch_tensor(Inf)$cuda()) found_inf <- cpp_amp_foreach_non_finite_check_and_unscale(net$parameters, dummy_found_inf, inv_scale) expect_equal(found_inf, 1) net$bias$grad$copy_(torch_tensor(NaN)$cuda()) found_inf <- cpp_amp_foreach_non_finite_check_and_unscale(net$parameters, dummy_found_inf, inv_scale) expect_equal(found_inf, 2) }) test_that("grad scalers work correctly", { skip_if_cuda_not_available() make_model <- function(in_size, out_size, num_layers) { layers <- list() for (i in seq_len(num_layers-1)) { layers <- c(layers, list(nn_linear(in_size, in_size), nn_relu())) } layers <- c(layers, list(nn_linear(in_size, out_size))) nn_sequential(!!!layers)$cuda() } torch_manual_seed(1) batch_size = 512 # Try, for example, 128, 256, 513. in_size = 4096 out_size = 4096 num_layers = 3 num_batches = 50 epochs = 3 # Creates data in default precision. # The same data is used for both default and mixed precision trials below. # You don't need to manually change inputs' dtype when enabling mixed precision. data <- lapply(1:num_batches, function(x) torch_randn(batch_size, in_size, device="cuda")) targets <- lapply(1:num_batches, function(x) torch_randn(batch_size, in_size, device="cuda")) loss_fn <- nn_mse_loss()$cuda() use_amp <- TRUE use_scaling <- TRUE net <- make_model(in_size, out_size, num_layers) opt <- optim_sgd(net$parameters, lr=0.1) scaler <- cuda_amp_grad_scaler(enabled=use_scaling) for (epoch in seq_len(epochs)) { for (i in seq_along(data)) { with_autocast(device_type="cuda", enabled=use_amp, { output <- net(data[[i]]) loss <- loss_fn(output, targets[[i]]) }) scaled_loss <- scaler$scale(loss) scaled_loss$backward() scaler$step(opt) scaler$update() opt$zero_grad() } } # got the same value as obtained from pytorch expect_equal( sprintf("%1.6f", loss$item()), sprintf("%1.6f", 1.003786) ) })