context("vi-optimization")

test_succeeds("vi_fit_surrogate_posterior works", {
  skip_if_tfp_below("0.9")

  if (!tf$compat$v1$resource_variables_enabled()) tf$compat$v1$enable_resource_variables()

  # 1: Normal-Normal model
  # We'll first consider a simple model `z ~ N(0, 1)`, `x ~ N(z, 1)`,
  # where we suppose we are interested in the posterior `p(z | x=5)`:

  log_prob <-
    function(z, x)
      tfd_normal(0, 1) %>% tfd_log_prob(z) + tfd_normal(z, 1) %>% tfd_log_prob(x)

  conditioned_log_prob <- function(z)
    log_prob(z, x = 5)

  # The posterior is itself normal by and can be computed analytically (it's `N(loc=5/2., scale=1/sqrt(2)`).
  # But suppose we don't want to bother doing the math: we can use variational inference instead!
  # Note that we ensure positive scale by using a softplus transformation of
  # the underlying variable, invoked via `DeferredTensor`. Deferring the
  # transformation causes it to be performed at runtime of the distribution's
  # methods, creating a gradient to the underlying variable. If we
  # had simply specified `scale=tf.nn.softplus(scale_var)` directly,
  # without the `DeferredTensor`, fitting would fail because calls to
  # `q.log_prob` and `q.sample` would never access the underlying variable. In
  # general, transformations of trainable parameters must be deferred to runtime,
  # using either `DeferredTensor` or by the callable mechanisms available in
  # joint distribution classes.
  q_z <- tfd_normal(
    loc = tf$Variable(0, name = 'q_z_loc'),
    scale = tfp$util$TransformedVariable(
      initial_value = 1,
      bijector = tfp$bijectors$Softplus(),
      name = 'q_z_scale'),
    name = 'q_z'
  )

  losses <- vi_fit_surrogate_posterior(
    target_log_prob_fn = conditioned_log_prob,
    surrogate_posterior = q_z,
    optimizer = tf$optimizers$Adam(learning_rate = 0.1),
    num_steps = 100
  )

  if (tf$executing_eagerly()) {

    optimized_mean <- q_z %>% tfd_mean()
    optimized_sd <- q_z %>% tfd_stddev()

  } else {
    with (tf$control_dependencies(list(losses)), {
      # tf$identity ensures we create a new op to capture the dependency
      optimized_mean <- tf$identity(q_z %>% tfd_mean())
      optimized_sd <- tf$identity(q_z %>% tfd_stddev())
    })
  }

  expect_equal(optimized_mean %>% tensor_value() %>% length(), 1)

  # 2: Custom loss function

  q_z2 <- tfd_normal(
    loc = tf$Variable(0., name = 'q_z2_loc'),
    scale = tfp$util$TransformedVariable(
      initial_value = 1,
      bijector = tfp$bijectors$Softplus(),
      name = 'q_z2_scale'),
    name = 'q_z2'
  )

  #forward_kl_loss <- purrr::partial(vi_monte_carlo_variational_loss, ... =, discrepancy_fn = vi_kl_forward)
  forward_kl_loss <- function(target_log_prob_fn,
                              surrogate_posterior,
                              sample_size = 1,
                              seed = NULL,
                              name = NULL)
    vi_monte_carlo_variational_loss(
      target_log_prob_fn,
      surrogate_posterior,
      sample_size,
      discrepancy_fn = vi_kl_forward,
      seed = seed,
      name = name
    )


  skip("variational_loss_fn arg in vi_fit_surrogate_posterior deprecated, needs updating")
  losses2 <- vi_fit_surrogate_posterior(
    target_log_prob_fn = conditioned_log_prob,
    surrogate_posterior = q_z2,
    optimizer = tf$optimizers$Adam(learning_rate = 0.1),
    num_steps = 100,
    variational_loss_fn = forward_kl_loss # deprecated, needs removing
  )

  if (tf$executing_eagerly()) {

    optimized_mean <- q_z2 %>% tfd_mean()
    optimized_sd <- q_z2 %>% tfd_stddev()

  } else {
    with (tf$control_dependencies(list(losses2)), {
      # tf$identity ensures we create a new op to capture the dependency
      optimized_mean <- tf$identity(q_z2 %>% tfd_mean())
      optimized_sd <- tf$identity(q_z2 %>% tfd_stddev())
    })
  }

  expect_equal(optimized_mean %>% tensor_value() %>% length(), 1)

  # 3: Inhomogeneous Poisson Process
  # TBD

})