# test functions

# set the seed and flush the graph before running tests
if (check_tf_version()) {
  tensorflow::tf$compat$v1$reset_default_graph()
}

set.seed(2020 - 02 - 11)

rng_seed <- function() {
  get(".Random.seed", envir = .GlobalEnv)
}

# evaluate a greta_array, node, or tensor
grab <- function(x, dag = NULL) {

  if (inherits(x, "node")) {
    x <- as.greta_array(x)
  }

  if (inherits(x, "greta_array")) {
    node <- get_node(x)
    dag <- dag_class$new(list(x))
    dag$define_tf()
    tensor_name <- dag$tf_name(node)
  } else {
    # x is a tensor; stash it in the dag's tf_environment so it can be run by
    # name, the same way as for a greta array
    tensor_name <- "grab_target"
    assign(tensor_name, x, envir = dag$tf_environment)
  }

  dag$set_tf_data_list("batch_size", 1L)
  dag$build_feed_dict()
  out <- dag$tf_sess_run(tensor_name, as_text = TRUE)
  drop_first_dim(out)
}

set_distribution <- function(dist, data) {
  # fix the value of dist
  dist_node <- get_node(dist)
  data_node <- get_node(data)
  distrib <- dist_node$distribution
  distrib$value(data_node$value())
  distrib$.fixed_value <- TRUE
  data_node$set_distribution(distrib)
  data_node$register()
}

# evaluate the (unadjusted) density of a distribution greta array at some data
get_density <- function(distrib, data) {

  x <- as_data(data)
  distribution(x) <- distrib

  # create dag and define the density
  dag <- dag_class$new(list(x))
  get_node(x)$distribution$define_tf(dag)

  # get the log density as a vector
  tensor_name <- dag$tf_name(get_node(distrib)$distribution)
  tensor <- get(tensor_name, envir = dag$tf_environment)
  as.vector(grab(tensor, dag))
}

compare_distribution <- function(greta_fun, r_fun, parameters, x,
                                 dim = NULL, multivariate = FALSE,
                                 tolerance = 1e-4) {
  # calculate the absolute difference in the log density of some data between
  # greta and an R benchmark.
  # 'greta_fun' is the greta distribution constructor function (e.g. normal())
  # 'r_fun' is the R density function, which must have argument 'log'
  # both of these functions must take the same parameters in the same order
  # 'parameters' is an optionally named list of numeric parameter values
  # 'x' is the vector of values at which to evaluate the log density

  # define greta distribution, with fixed values
  greta_log_density <- greta_density(
    greta_fun, parameters, x, dim, multivariate
  )

  # get R version
  r_log_density <- log(do.call(r_fun, c(list(x), parameters)))

  # return absolute difference
  compare_op(r_log_density, greta_log_density, tolerance)
}

# evaluate the log density of x, given 'parameters' and a distribution
# constructor function 'fun'
greta_density <- function(fun, parameters, x,
                          dim = NULL, multivariate = FALSE) {

  if (is.null(dim)) {
    dim <- NROW(x)
  }

  # add the output dimension to the arguments list
  dim_list <- list(dim = dim)

  # if it's a multivariate distribution, name it n_realisations
  if (multivariate) {
    names(dim_list) <- "n_realisations"
  }

  # don't add it for wishart & lkj, which don't have multiple realisations
  is_wishart <- identical(names(parameters), c("df", "Sigma"))
  is_lkj <- identical(names(parameters), c("eta", "dimension"))
  if (is_wishart || is_lkj) {
    dim_list <- list()
  }

  parameters <- c(parameters, dim_list)

  # evaluate greta distribution
  dist <- do.call(fun, parameters)
  distrib_node <- get_node(dist)$distribution

  # set density
  x_ <- as.greta_array(x)
  distrib_node$remove_target()
  distrib_node$add_target(get_node(x_))

  # create dag
  dag <- dag_class$new(list(x_))
  dag$define_tf()
  dag$set_tf_data_list("batch_size", 1L)
  dag$build_feed_dict()

  # get the log density as a vector
  dag$on_graph(
    result <- dag$evaluate_density(distrib_node, get_node(x_))
  )
  assign("test_density", result, dag$tf_environment)
  density <- dag$tf_sess_run(test_density)
  as.vector(density)
}
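# example usage of compare_distribution (a minimal sketch; the parameter
# values are hypothetical, and the block is wrapped in if (FALSE) so it never
# runs when this helper file is sourced; assumes greta is attached with a
# working TensorFlow install)
if (FALSE) {
  # check greta's normal() against stats::dnorm() at some random values
  compare_distribution(
    greta_fun = normal,
    r_fun = stats::dnorm,
    parameters = list(mean = -2, sd = 3),
    x = rnorm(100, -2, 3)
  )
}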
# execute a call via greta, swapping the objects named in 'swap' for greta
# arrays, then converting the result back to R. 'swap_scope' tells eval() how
# many environments to go up to get the objects for the swap; 1 would be the
# environment above the function, 2 would be the environment above that, etc.
with_greta <- function(call, swap = c("x"), swap_scope = 1) {

  swap_entries <- paste0(swap, " = as_data(", swap, ")")
  swap_text <- paste0(
    "list(",
    paste(swap_entries, collapse = ", "),
    ")"
  )
  swap_list <- eval(
    parse(text = swap_text),
    envir = parent.frame(n = swap_scope)
  )

  greta_result <- with(
    swap_list,
    eval(call)
  )
  result <- grab(greta_result)

  # account for the fact that greta outputs are 1D arrays; convert them back
  # to R vectors
  if (is.array(result) && length(dim(result)) == 2 && dim(result)[2] == 1) {
    result <- as.vector(result)
  }

  result
}

# check an expression is equivalent when done in R, and when done on greta
# arrays with results ported back to R
# e.g. check_expr(a[1:3], swap = 'a')
check_expr <- function(expr, swap = c("x"), tolerance = 1e-4) {

  call <- substitute(expr)

  r_out <- eval(expr)
  greta_out <- with_greta(
    call,
    swap = swap,
    swap_scope = 2
  )

  compare_op(r_out, greta_out, tolerance)
}

# generate a random string describing a binary operation on two variables;
# apply an op selected from 'ops' to an arg sampled from 'args' and to 'init'
add_op_string <- function(init = "a",
                          args = c("a", "b"),
                          ops = c("+", "-", "*", "/")) {
  op <- sample(ops, 1)
  arg <- sample(args, 1)
  sprintf("(%s %s %s)", arg, op, init)
}

# generate a random function that combines two variables together in a string
# of (n) increasingly bizarre operations
gen_opfun <- function(n, ops) {

  string <- "a"
  for (i in seq_len(n)) {
    string <- add_op_string(string, ops = ops)
  }

  fun_string <- sprintf("function(a, b) {%s}", string)
  eval(parse(text = fun_string))
}

# sample n values from a distribution by HMC and check they all fall within
# the expected support; the greta array is defined as a stochastic in the call
sample_distribution <- function(greta_array, n = 10,
                                lower = -Inf, upper = Inf,
                                warmup = 1) {

  m <- model(greta_array, precision = "double")
  draws <- mcmc(m, n_samples = n, warmup = warmup, verbose = FALSE)
  samples <- as.matrix(draws)

  vectorised <- length(lower) > 1 || length(upper) > 1

  if (vectorised) {
    above_lower <- sweep(samples, 2, lower, `>=`)
    below_upper <- sweep(samples, 2, upper, `<=`)
  } else {
    above_lower <- samples >= lower
    below_upper <- samples <= upper
  }

  expect_true(all(above_lower & below_upper))
}
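# example usage of check_expr and gen_opfun (a minimal sketch with
# hypothetical inputs, never executed on source; assumes greta is attached)
if (FALSE) {
  # check that matrix subsetting behaves the same in R and in greta
  x <- matrix(rnorm(12), 4, 3)
  check_expr(x[1:2, ], swap = "x")

  # build a random 5-step arithmetic function and compare R vs greta results
  op_fun <- gen_opfun(5, ops = c("+", "-", "*"))
  a <- rnorm(3)
  b <- rnorm(3)
  r_result <- op_fun(a, b)
  greta_result <- grab(op_fun(as_data(a), as_data(b)))
}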
compare_truncated_distribution <- function(greta_fun, which, parameters,
                                           truncation, tolerance = 1e-4) {
  # calculate the absolute difference in the log density of some data between
  # greta and an R benchmark, for an implied truncated distribution.
  # 'greta_fun' is the greta distribution constructor function, 'which' names
  # the distribution for the truncdist package (e.g. "norm"), 'parameters' is
  # a named list of parameter values, and 'truncation' gives the lower and
  # upper truncation bounds

  x <- do.call(
    truncdist::rtrunc,
    c(
      n = 100,
      spec = which,
      a = truncation[1],
      b = truncation[2],
      parameters
    )
  )

  # create truncated R function and evaluate it
  r_fun <- truncfun(which, parameters, truncation)
  r_log_density <- log(r_fun(x))

  greta_log_density <- greta_density(
    fun = greta_fun,
    parameters = c(parameters, list(truncation = truncation)),
    x = x,
    dim = 1
  )

  # return absolute difference
  compare_op(r_log_density, greta_log_density, tolerance)
}

# use the truncdist package to create a truncated distribution function for
# use in compare_truncated_distribution
truncfun <- function(which = "norm", parameters, truncation) {

  args <- c(
    spec = which,
    a = truncation[1],
    b = truncation[2],
    parameters
  )

  function(x) {
    arg_list <- c(x = list(x), args)
    do.call(truncdist::dtrunc, arg_list)
  }
}

# R distribution functions for the location-scale Student t distribution
dt_ls <- function(x, df, location, scale, log = FALSE) {
  ans <- stats::dt((x - location) / scale, df) / scale
  if (log) {
    ans <- log(ans)
  }
  ans
}

pt_ls <- function(q, df, location, scale, log.p = FALSE) {
  ans <- stats::pt((q - location) / scale, df)
  if (log.p) {
    ans <- log(ans)
  }
  ans
}

qt_ls <- function(p, df, location, scale, log.p = FALSE) {
  ans <- stats::qt(p, df) * scale + location
  if (log.p) {
    ans <- log(ans)
  }
  ans
}

# mock up the progress bar to force its output to stdout for testing
cpb <- eval(parse(text = capture.output(dput(create_progress_bar))))
mock_create_progress_bar <- function(...) {
  cpb(..., stream = stdout())
}

# capture messages in a testthat block to get the output of the progress bar;
# copied from progress' test suite:
# https://github.com/r-lib/progress/blob/master/tests/testthat/helper.R
get_output <- function(expr) {
  msgs <- character()
  i <- 0
  suppressMessages(withCallingHandlers(
    expr,
    message = function(e) msgs[[i <<- i + 1]] <<- conditionMessage(e)
  ))
  paste0(msgs, collapse = "")
}

# mock up mcmc progress bar output for neurotic testing
mock_mcmc <- function(n_samples = 1010) {
  pb <- create_progress_bar(
    "sampling",
    c(0, n_samples),
    pb_update = 10,
    width = 50
  )
  iterate_progress_bar(pb, n_samples, rejects = 10, chains = 1)
}

# random lkj draws, code from the rethinking package (can't load the package
# because of stan/Travis/compiler issues)
rlkjcorr <- function(n, eta = 1, dimension = 2) {

  k <- dimension
  stopifnot(is.numeric(k), k >= 2, k == as.integer(k))
  stopifnot(eta > 0)

  f <- function() {
    alpha <- eta + (k - 2) / 2
    r12 <- 2 * stats::rbeta(1, alpha, alpha) - 1
    r <- matrix(0, k, k)
    r[1, 1] <- 1
    r[1, 2] <- r12
    r[2, 2] <- sqrt(1 - r12^2)
    if (k > 2) {
      for (m in 2:(k - 1)) {
        alpha <- alpha - 0.5
        y <- rbeta(1, m / 2, alpha)
        z <- rnorm(m, 0, 1)
        z <- z / sqrt(crossprod(z)[1])
        r[1:m, m + 1] <- sqrt(y) * z
        r[m + 1, m + 1] <- sqrt(1 - y)
      }
    }
    crossprod(r)
  }

  r <- replicate(n, f())

  if (dim(r)[3] == 1) {
    r <- r[, , 1]
  } else {
    r <- aperm(r, c(3, 1, 2))
  }

  r
}

# helper RNG functions
rmvnorm <- function(n, mean, Sigma) { # nolint
  mvtnorm::rmvnorm(n = n, mean = mean, sigma = Sigma)
}

rwish <- function(n, df, Sigma) { # nolint
  draws <- stats::rWishart(n = n, df = df, Sigma = Sigma)
  aperm(draws, c(3, 1, 2))
}

rmulti <- function(n, size, prob) {
  draws <- stats::rmultinom(n = n, size = size, prob = prob)
  t(draws)
}

rcat <- function(n, prob) {
  rmulti(n, 1, prob)
}

rtnorm <- function(n, mean, sd, truncation) {
  truncdist::rtrunc(
    n,
    "norm",
    a = truncation[1],
    b = truncation[2],
    mean = mean,
    sd = sd
  )
}
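# example usage of compare_truncated_distribution and the location-scale
# Student t helpers (a minimal sketch with hypothetical parameter values,
# never executed on source; assumes greta and truncdist are available)
if (FALSE) {
  # compare greta's truncated normal against truncdist's dtrunc
  compare_truncated_distribution(
    greta_fun = normal,
    which = "norm",
    parameters = list(mean = 1, sd = 2),
    truncation = c(0, 5)
  )

  # dt_ls shifts and scales the standard t density:
  # dt_ls(x, df, location, scale) == dt((x - location) / scale, df) / scale
  dt_ls(0.5, df = 4, location = 1, scale = 2)
}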
rtlnorm <- function(n, meanlog, sdlog, truncation) {
  truncdist::rtrunc(
    n,
    "lnorm",
    a = truncation[1],
    b = truncation[2],
    meanlog = meanlog,
    sdlog = sdlog
  )
}

rtweibull <- function(n, shape, scale, truncation) {
  truncdist::rtrunc(
    n,
    "weibull",
    a = truncation[1],
    b = truncation[2],
    shape = shape,
    scale = scale
  )
}

rtf <- function(n, df1, df2, truncation) {
  truncdist::rtrunc(
    n,
    "f",
    a = truncation[1],
    b = truncation[2],
    df1 = df1,
    df2 = df2
  )
}

# joint testing functions
joint_normals <- function(...) {
  params_list <- list(...)
  components <- lapply(params_list, function(par) do.call(normal, par))
  do.call(joint, components)
}

rjnorm <- function(n, ...) {
  params_list <- list(...)
  args_list <- lapply(params_list, function(par) c(n, par))
  sims <- lapply(args_list, function(par) do.call(stats::rnorm, par))
  do.call(cbind, sims)
}

rjtnorm <- function(n, ...) {
  params_list <- list(...)
  args_list <- lapply(params_list, function(par) c(n, par))
  sims <- lapply(args_list, function(par) do.call(rtnorm, par))
  do.call(cbind, sims)
}

# mixture testing functions
mixture_normals <- function(...) {
  args <- list(...)
  is_weights <- names(args) == "weights"
  params_list <- args[!is_weights]
  components <- lapply(params_list, function(par) do.call(normal, par))
  do.call(mixture, c(components, args[is_weights]))
}

mixture_multivariate_normals <- function(...) {
  args <- list(...)
  is_weights <- names(args) == "weights"
  params_list <- args[!is_weights]
  components <- lapply(
    params_list,
    function(par) {
      do.call(multivariate_normal, par)
    }
  )
  do.call(mixture, c(components, args[is_weights]))
}

rmixnorm <- function(n, ...) {
  args <- list(...)
  is_weights <- names(args) == "weights"
  params_list <- args[!is_weights]
  weights <- args[[which(is_weights)]]
  args_list <- lapply(params_list, function(par) c(n, par))
  sims <- lapply(args_list, function(par) do.call(rnorm, par))
  draws <- do.call(cbind, sims)
  components <- sample.int(length(sims), n, prob = weights, replace = TRUE)
  idx <- cbind(seq_len(n), components)
  draws[idx]
}

rmixtnorm <- function(n, ...) {
  args <- list(...)
  is_weights <- names(args) == "weights"
  params_list <- args[!is_weights]
  weights <- args[[which(is_weights)]]
  args_list <- lapply(params_list, function(par) c(n, par))
  sims <- lapply(args_list, function(par) do.call(rtnorm, par))
  draws <- do.call(cbind, sims)
  components <- sample.int(length(sims), n, prob = weights, replace = TRUE)
  idx <- cbind(seq_len(n), components)
  draws[idx]
}
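# example usage of the joint and mixture helpers (a minimal sketch with
# hypothetical parameter values, never executed on source; assumes greta is
# attached). each `...` argument is an unnamed list of parameters for one
# component
if (FALSE) {
  # joint of two independent normals, and matching iid simulations
  x <- joint_normals(list(0, 1), list(2, 0.5))
  sims <- rjnorm(100, list(0, 1), list(2, 0.5))

  # two-component normal mixture with weights 0.3 and 0.7
  y <- mixture_normals(list(0, 1), list(5, 2), weights = c(0.3, 0.7))
  mix_sims <- rmixnorm(100, list(0, 1), list(5, 2), weights = c(0.3, 0.7))
}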
rmixmvnorm <- function(n, ...) {
  args <- list(...)
  is_weights <- names(args) == "weights"
  params_list <- args[!is_weights]
  weights <- args[[which(is_weights)]]
  args_list <- lapply(params_list, function(par) c(n, par))
  sims <- lapply(args_list, function(par) do.call(rmvnorm, par))
  components <- sample.int(length(sims), n, prob = weights, replace = TRUE)

  # loop through the n observations, pulling out the corresponding slice
  draws_out <- array(NA, dim(sims[[1]]))
  for (i in seq_len(n)) {
    draws_out[i, ] <- sims[[components[i]]][i, ]
  }
  draws_out
}

# a form of two-sample chi squared test for discrete multivariate
# distributions
combined_chisq_test <- function(x, y) {
  stats::chisq.test(
    x = colSums(x),
    y = colSums(y)
  )
}

# flatten the unique part of a symmetric matrix
get_upper_tri <- function(x, diag) {
  x[upper.tri(x, diag = diag)]
}

# compare iid samples from a greta distribution (using calculate) against a
# comparison R RNG function
compare_iid_samples <- function(greta_fun,
                                r_fun,
                                parameters,
                                nsim = 200,
                                p_value_threshold = 0.001) {

  greta_array <- do.call(greta_fun, parameters)

  # get information about the distribution
  distribution <- get_node(greta_array)$distribution
  multivariate <- distribution$multivariate
  discrete <- distribution$discrete
  name <- distribution$distribution_name

  greta_samples <- calculate(greta_array, nsim = nsim)[[1]]
  r_samples <- do.call(r_fun, c(n = nsim, parameters))

  # reshape to matrix or vector
  if (multivariate) {

    # if it's a symmetric matrix, take only a triangle and flatten it
    if (name %in% c("wishart", "lkj_correlation")) {
      include_diag <- name == "wishart"
      t_greta_samples <- apply(greta_samples, 1, get_upper_tri, include_diag)
      t_r_samples <- apply(r_samples, 1, get_upper_tri, include_diag)
      greta_samples <- t(t_greta_samples)
      r_samples <- t(t_r_samples)
    } else {
      dim <- dim(greta_samples)
      new_dim <- c(dim[1], dim[2] * dim[3])
      dim(greta_samples) <- new_dim
      dim(r_samples) <- new_dim
    }
  } else {
    greta_samples <- as.vector(greta_samples)
  }

  # find a vaguely appropriate two-sample test
  if (discrete) {
    test <- if (multivariate) combined_chisq_test else stats::chisq.test
  } else {
    test <- if (multivariate) cramer::cramer.test else stats::ks.test
  }

  # run the two-sample test on the samples
  suppressWarnings(test_result <- test(greta_samples, r_samples))
  testthat::expect_gte(test_result$p.value, p_value_threshold)
}

# is this a release candidate?
skip_if_not_release <- function() {
  if (identical(Sys.getenv("RELEASE_CANDIDATE"), "true")) {
    return(invisible(TRUE))
  }
  skip("Not a Release Candidate")
}
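# example usage of compare_iid_samples (a minimal sketch with hypothetical
# parameter values, never executed on source; assumes greta is attached and
# the cramer and truncdist packages are installed)
if (FALSE) {
  # univariate continuous: greta's normal() vs stats::rnorm()
  compare_iid_samples(
    normal,
    rnorm,
    parameters = list(mean = -1, sd = 2)
  )

  # multivariate: greta's wishart() vs the rwish() helper above
  compare_iid_samples(
    wishart,
    rwish,
    parameters = list(df = 4, Sigma = diag(3))
  )
}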
# run a Geweke test on a greta model, providing: 'sampler' a greta sampler
# (e.g. hmc()), a greta 'model', a 'data' greta array used as the target in
# the model, the two iid random number generators for the data generating
# function ('p_theta' = generator for the prior, 'p_x_bar_theta' = generator
# for the likelihood), and 'niter' the number of MCMC samples to compare
check_geweke <- function(sampler, model, data, p_theta, p_x_bar_theta,
                         niter = 2000, warmup = 1000,
                         title = "Geweke test") {

  # sample independently
  target_theta <- p_theta(niter)

  # sample with Markov chain
  greta_theta <- p_theta_greta(
    niter = niter,
    model = model,
    data = data,
    p_theta = p_theta,
    p_x_bar_theta = p_x_bar_theta,
    sampler = sampler,
    warmup = warmup
  )

  # visualise correspondence
  quants <- (1:99) / 100
  q1 <- stats::quantile(target_theta, quants)
  q2 <- stats::quantile(greta_theta, quants)
  plot(q2, q1, main = title)
  graphics::abline(0, 1)

  # do a formal hypothesis test
  suppressWarnings(stat <- stats::ks.test(target_theta, greta_theta))
  testthat::expect_gte(stat$p.value, 0.005)
}

# sample from a prior on theta the long way round, for use in a Geweke test:
# Gibbs sampling the posterior p(theta | x) and the data generating function
# p(x | theta). Only retain the samples of theta from the joint distribution.
p_theta_greta <- function(niter, model, data,
                          p_theta, p_x_bar_theta,
                          sampler = hmc(),
                          warmup = 1000) {

  # set up and initialize the trace
  theta <- rep(NA, niter)
  theta[1] <- p_theta(1)

  # set up and tune the sampler
  draws <- mcmc(
    model,
    warmup = warmup,
    n_samples = 1,
    chains = 1,
    sampler = sampler,
    verbose = FALSE
  )

  # now loop through, sampling and updating x and returning theta
  for (i in 2:niter) {

    # sample x given theta
    x <- p_x_bar_theta(theta[i - 1])

    # put x in the data list
    dag <- model$dag
    target_name <- dag$tf_name(get_node(data))
    x_array <- array(x, dim = c(1, dim(data)))
    dag$tf_environment$data_list[[target_name]] <- x_array

    # put theta in the free state
    sampler <- attr(draws, "model_info")$samplers[[1]]
    sampler$free_state <- as.matrix(theta[i - 1])

    draws <- extra_samples(
      draws,
      n_samples = 1,
      verbose = FALSE
    )
    theta[i] <- tail(as.numeric(draws[[1]]), 1)
  }

  theta
}

# test mcmc for models with analytic posteriors
not_finished <- function(draws, target_samples = 5000) {
  neff <- coda::effectiveSize(draws)
  rhats <- coda::gelman.diag(
    x = draws,
    multivariate = FALSE,
    autoburnin = FALSE
  )
  rhats <- rhats$psrf[, 1]
  converged <- all(rhats < 1.01)
  enough_samples <- all(neff >= target_samples)
  !(converged && enough_samples)
}

new_samples <- function(draws, target_samples = 5000) {
  neff <- min(coda::effectiveSize(draws))
  efficiency <- neff / coda::niter(draws)
  1.2 * (target_samples - neff) / efficiency
}

not_timed_out <- function(start_time, time_limit = 300) {
  elapsed <- Sys.time() - start_time
  elapsed < time_limit
}

get_enough_draws <- function(model,
                             sampler = hmc(),
                             n_effective = 5000,
                             time_limit = 300,
                             verbose = TRUE,
                             one_by_one = FALSE) {

  start_time <- Sys.time()
  draws <- mcmc(
    model,
    sampler = sampler,
    verbose = verbose,
    one_by_one = one_by_one
  )

  while (not_finished(draws, n_effective) &&
    not_timed_out(start_time, time_limit)) {
    n_samples <- new_samples(draws, n_effective)
    draws <- extra_samples(
      draws,
      n_samples,
      verbose = verbose,
      one_by_one = one_by_one
    )
  }

  if (not_finished(draws, n_effective)) {
    stop("could not draw enough effective samples within the time limit")
  }

  draws
}
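# example usage of check_geweke (a minimal sketch of a normal-normal
# conjugate model with hypothetical prior parameters, never executed on
# source; assumes greta is attached)
if (FALSE) {
  # theta ~ normal(0, 2); x[i] ~ normal(theta, 1)
  n <- 10
  x <- as_data(rep(0, n))
  theta <- normal(0, 2)
  distribution(x) <- normal(theta, 1, dim = n)
  m <- model(theta)

  check_geweke(
    sampler = hmc(),
    model = m,
    data = x,
    p_theta = function(n) rnorm(n, 0, 2),
    p_x_bar_theta = function(theta) rnorm(n, theta, 1),
    niter = 200,
    warmup = 100
  )
}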
# Monte Carlo standard error (using batch means)
mcse <- function(draws) {

  n <- nrow(draws)
  b <- floor(sqrt(n))
  a <- floor(n / b)

  group <- function(k) {
    idx <- ((k - 1) * b + 1):(k * b)
    colMeans(draws[idx, , drop = FALSE])
  }

  bm <- vapply(
    seq_len(a),
    group,
    draws[1, ]
  )

  if (is.null(dim(bm))) {
    bm <- t(bm)
  }

  mu_hat <- as.matrix(colMeans(draws))
  ss <- sweep(t(bm), 2, mu_hat, "-")^2
  var_hat <- b * colSums(ss) / (a - 1)
  sqrt(var_hat / n)
}

# absolute error of the estimate, scaled by the Monte Carlo standard error
scaled_error <- function(draws, expectation) {
  draws <- as.matrix(draws)
  se <- mcse(draws)
  est <- colMeans(draws)
  abs(est - expectation) / se
}

# given a sampler (e.g. hmc()) and a minimum number of effective samples,
# ensure that the sampler can draw correct samples from a bivariate normal
# distribution
check_mvn_samples <- function(sampler, n_effective = 3000) {

  # get multivariate normal samples
  mu <- as_data(t(rnorm(2, 0, 5)))
  sigma <- stats::rWishart(1, 3, diag(2))[, , 1]
  x <- multivariate_normal(mu, sigma)
  m <- model(x, precision = "single")

  draws <- get_enough_draws(
    m,
    sampler = sampler,
    n_effective = n_effective,
    verbose = FALSE
  )

  # get MCMC samples for statistics of the samples (value, variance and
  # correlation of error wrt the mean)
  err <- x - mu
  var <- err^2
  corr <- prod(err) / prod(sqrt(diag(sigma)))
  err_var_corr <- c(err, var, corr)
  stat_draws <- calculate(err_var_corr, values = draws)

  # get true values of these; on average the error should be 0, and the
  # variance and correlation of the errors should be encoded in sigma
  stat_truth <- c(
    rep(0, 2),
    diag(sigma),
    cov2cor(sigma)[1, 2]
  )

  # get absolute errors between the posterior means and the true values, and
  # scale them by the time-series Monte Carlo standard errors (the expected
  # amount of uncertainty in the MCMC estimate), to give the number of
  # standard errors away from truth. There's a 1/100 chance of any one of
  # these scaled errors being greater than qnorm(0.99) if the sampler is
  # correct
  errors <- scaled_error(stat_draws, stat_truth)
  expect_lte(max(errors), stats::qnorm(0.99))
}

# sample values of greta array 'x' (which must follow a distribution), and
# compare the samples with iid samples returned by iid_function (which takes
# the number of samples as its sole argument), producing a labelled qqplot,
# and running a KS test for differences between the two samples
check_samples <- function(x,
                          iid_function,
                          sampler = hmc(),
                          n_effective = 3000,
                          title = NULL,
                          one_by_one = FALSE) {

  m <- model(x, precision = "single")
  draws <- get_enough_draws(
    m,
    sampler = sampler,
    n_effective = n_effective,
    verbose = FALSE,
    one_by_one = one_by_one
  )

  neff <- coda::effectiveSize(draws)
  iid_samples <- iid_function(neff)
  mcmc_samples <- as.matrix(draws)

  # plot
  if (is.null(title)) {
    distrib <- get_node(x)$distribution$distribution_name
    sampler_name <- class(sampler)[1]
    title <- paste(distrib, "with", sampler_name)
  }
  stats::qqplot(mcmc_samples, iid_samples, main = title)
  graphics::abline(0, 1)

  # do a formal hypothesis test
  suppressWarnings(stat <- ks.test(mcmc_samples, iid_samples))
  testthat::expect_gte(stat$p.value, 0.01)
}
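# example usage of mcse and check_samples (a minimal sketch with hypothetical
# inputs, never executed on source; assumes greta is attached)
if (FALSE) {
  # the batch-means MCSE of iid standard normal draws should be close to
  # sd / sqrt(n) = 1 / sqrt(10000) = 0.01
  draws <- matrix(rnorm(10000), ncol = 1)
  mcse(draws)

  # check that HMC recovers a lognormal distribution
  x <- lognormal(0, 1)
  check_samples(x, function(n) rlnorm(n, 0, 1), sampler = hmc())
}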