# See http://cs231n.github.io/neural-networks-3/#gradcheck

# Function to calculate numerical and analytical gradient
calcGradients <- function(X, y, regression, ...) {

  # Set epsilon (for calculating numerical gradient), determine dimensions
  epsilon <- 1e-5
  n_rows  <- nrow(X)
  n_cols  <- ncol(X)

  # Construct NN
  NN <- neuralnetwork(X = X, y = y, standardize = TRUE, regression = regression,
                      batch.size = n_rows, val.prop = 0, n.epochs = 100,
                      verbose = FALSE, step.k = 30, ...)

  # Prepare targets for regression/classification
  if (regression) {
    y <- as.matrix(y)
  } else {
    y <- 1 * outer(y, NN$Rcpp_ANN$getMeta()$y_names, '==')
  }
  y <- NN$Rcpp_ANN$scale_y(y, FALSE)

  # Prepare features - same for regression and classification
  X <- as.matrix(X)
  X <- NN$Rcpp_ANN$scale_X(X, FALSE)

  #### NUMERICAL GRADIENT
  # Construct feature matrices with small perturbations (positive and negative)
  # at one element per row. For this element the numerical and analytical
  # gradients are compared
  X_eps_neg <- X_eps_pos <- X
  idx_vec <- sample(n_cols, n_rows, replace = TRUE)
  for (i in 1:n_rows) {
    idx <- idx_vec[i]
    X_eps_pos[i, idx] <- X_eps_pos[i, idx] + epsilon
    X_eps_neg[i, idx] <- X_eps_neg[i, idx] - epsilon
  }

  # Function to evaluate the loss for a feature matrix X and targets y
  eval_loss <- function(X, y) {
    y_fit <- NN$Rcpp_ANN$forwardPass(X)
    return( NN$Rcpp_ANN$evalLoss(y, y_fit) )
  }

  # Calculate numerical gradient with the central difference
  # (f(x + eps) - f(x - eps)) / (2 * eps)
  E_num <- eval_loss(X_eps_pos, y) - eval_loss(X_eps_neg, y)
  E_num <- E_num / (2 * epsilon)
  E_num <- rowSums(E_num)

  #### ANALYTICAL GRADIENT
  y_fit <- NN$Rcpp_ANN$forwardPass(X)
  E     <- NN$Rcpp_ANN$backwardPass(y, y_fit)
  E_ana <- sapply(1:n_rows, function(i) E[i, idx_vec[i]])

  return( list(numeric = E_num, analytic = E_ana) )
}

# Function to check equality between observed and expected by inspecting their
# distance to one another relative to the largest absolute value of the two
relCompare <- function(observed, expected, tolerance) {

  # Elementwise maximum of absolute values
  max_elem <- pmax(abs(expected), abs(observed))

  # Absolute relative difference
  rel_diff <- abs(observed - expected) / max_elem

  # Check equality given tolerance
  check_vec <- (rel_diff < tolerance)

  # Check for edge case: both elements close to zero
  check_vec[max_elem < 1e-8] <- TRUE

  # Pass check if all numerical grads are equal to their analytical counterparts
  return( all(check_vec) )
}
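# Illustrative sanity check of the central-difference scheme and relCompare
# (added for exposition, not part of the test suite, hence commented out):
# for f(x) = x^2 the analytical gradient is 2x, and the central difference
# recovers it to near machine precision, so relCompare should pass even at a
# tight tolerance.
#   eps <- 1e-5
#   x   <- c(-1, 0.5, 2)
#   num <- ((x + eps)^2 - (x - eps)^2) / (2 * eps)
#   ana <- 2 * x
#   stopifnot(relCompare(num, ana, 1e-7))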
context("Full gradient check for regression")

## LOSS FUNCTIONS
test_that("the full gradient is correct for all loss functions", {
  skip_on_cran()

  # Set data and relative tolerances for equality check
  data <- iris[sample(nrow(iris), size = 4), 1:4]
  rel_tol_smooth <- 1e-7
  rel_tol_kinks  <- 1e-5

  # Function to check gradient for current cases
  gradCheck <- function(loss.type, rel_tol) {
    grads <- calcGradients(X = data[, 1:3], y = data[, 4],
                           hidden.layers = c(5, 5), activ.functions = "tanh",
                           regression = TRUE, loss.type = loss.type)
    return( relCompare(grads$analytic, grads$numeric, rel_tol) )
  }

  # Run tests on gradients
  # The gradient checks sometimes fail by chance (especially with functions
  # that contain kinks such as the ramp, step and relu) so do not run on CRAN
  expect_true(gradCheck("squared", rel_tol_smooth))
  expect_true(gradCheck("absolute", rel_tol_kinks))
  expect_true(gradCheck("huber", rel_tol_kinks))
  expect_true(gradCheck("pseudo-huber", rel_tol_kinks))
})

## ACTIVATION FUNCTIONS
test_that("the full gradient is correct for all activation functions", {
  skip_on_cran()

  # Set data and relative tolerances for equality check
  data <- iris[sample(nrow(iris), size = 4), 1:4]
  rel_tol_smooth <- 1e-7
  rel_tol_kinks  <- 1e-5

  # Function to check gradient for current cases
  gradCheck <- function(activ.functions, rel_tol) {
    grads <- calcGradients(X = data[, 1:3], y = data[, 4],
                           hidden.layers = c(5),
                           activ.functions = activ.functions,
                           regression = TRUE, loss.type = 'squared')
    return( relCompare(grads$analytic, grads$numeric, rel_tol) )
  }

  # Run tests on gradients
  # The gradient checks sometimes fail by chance (especially with functions
  # that contain kinks such as the ramp, step and relu) so do not run on CRAN
  expect_true(gradCheck("tanh", rel_tol_smooth))
  expect_true(gradCheck("sigmoid", rel_tol_smooth))
  expect_true(gradCheck("linear", rel_tol_smooth))
  expect_true(gradCheck("ramp", rel_tol_kinks))
  expect_true(gradCheck("step", rel_tol_kinks))
  expect_true(gradCheck("relu", rel_tol_kinks))
})

context("Full gradient check for classification")

test_that("the full gradient is correct for all activation functions", {
  skip_on_cran()

  # Set data and relative tolerances for equality check
  data <- iris[sample(nrow(iris), size = 4), ]
  rel_tol_smooth <- 1e-7
  rel_tol_kinks  <- 1e-4

  # Function to check gradient for current cases
  gradCheck <- function(activ.functions, rel_tol) {
    grads <- calcGradients(X = data[, 1:4], y = data[, 5],
                           hidden.layers = c(5),
                           activ.functions = activ.functions,
                           regression = FALSE, loss.type = 'log')
    return( relCompare(grads$analytic, grads$numeric, rel_tol) )
  }

  # Run tests on gradients
  # The gradient checks sometimes fail by chance (especially with functions
  # that contain kinks such as the ramp, step and relu) so do not run on CRAN
  expect_true(gradCheck("tanh", rel_tol_smooth))
  expect_true(gradCheck("sigmoid", rel_tol_smooth))
  expect_true(gradCheck("linear", rel_tol_smooth))
  expect_true(gradCheck("ramp", rel_tol_kinks))
  expect_true(gradCheck("step", rel_tol_kinks))
  expect_true(gradCheck("relu", rel_tol_kinks))
})
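# Why kinked activations get a looser tolerance (illustrative comment, not
# part of the test suite): a central difference that straddles a kink averages
# the slopes on both sides, so it can disagree with the analytical
# (sub)gradient even when the backward pass is correct. For ReLU near 0:
#   relu <- function(x) pmax(x, 0)
#   eps  <- 1e-5
#   x0   <- 1e-6                                           # just past the kink
#   num  <- (relu(x0 + eps) - relu(x0 - eps)) / (2 * eps)  # ~0.55
#   ana  <- as.numeric(x0 > 0)                             # 1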