# Check that a PREDICTS extract has the expected columns and factor levels.
#
# Args:
#   predicts: the extract to check; it is inspected with `ncol()`, `names()`
#     and `$` column access.
#   full: when TRUE (the default), also check the number of levels of the
#     SS, SSS, SSB and SSBS factors.
#
# Every check is made through `expect_equal()`, so a mismatch is reported as
# a failed expectation.
CheckPREDICTSData <- function(predicts, full = TRUE) {
  # The expected column names, split into chunks purely for readability; the
  # concatenated order below is what gets compared against `names(predicts)`.
  study_columns <- c(
    "Source_ID", "Reference", "Study_number", "Study_name", "SS",
    "Diversity_metric", "Diversity_metric_unit", "Diversity_metric_type",
    "Diversity_metric_is_effort_sensitive",
    "Diversity_metric_is_suitable_for_Chao", "Sampling_method",
    "Sampling_effort_unit", "Study_common_taxon", "Rank_of_study_common_taxon"
  )
  site_columns <- c(
    "Site_number", "Site_name", "Block", "SSS", "SSB", "SSBS",
    "Sample_start_earliest", "Sample_end_latest", "Sample_midpoint",
    "Sample_date_resolution", "Max_linear_extent_metres",
    "Habitat_patch_area_square_metres", "Sampling_effort",
    "Rescaled_sampling_effort", "Habitat_as_described", "Predominant_land_use",
    "Source_for_predominant_land_use", "Use_intensity",
    "Km_to_nearest_edge_of_habitat", "Years_since_fragmentation_or_conversion",
    "Transect_details", "Coordinates_method"
  )
  location_columns <- c(
    "Longitude", "Latitude", "Country_distance_metres", "Country",
    "UN_subregion", "UN_region", "Ecoregion_distance_metres", "Ecoregion",
    "Biome", "Realm", "Hotspot", "Wilderness_area"
  )
  taxon_columns <- c(
    "Taxon_number", "Taxon_name_entered", "Indication", "Parsed_name",
    "Taxon", "COL_ID", "Name_status", "Rank", "Kingdom", "Phylum", "Class",
    "Order", "Family", "Genus", "Species", "Best_guess_binomial",
    "Higher_taxon", "Measurement", "Effort_corrected_measurement"
  )
  expected_columns <- c(
    study_columns, site_columns, location_columns, taxon_columns
  )

  # every extract is expected to carry 67 columns ...
  expect_equal(ncol(predicts), 67)
  # ... with exactly these names, in exactly this order
  expect_equal(names(predicts), expected_columns)

  # the metric type factor must have these levels, in this order
  metric_type_levels <- levels(predicts$Diversity_metric_type)
  expect_equal(
    metric_type_levels,
    c("Abundance", "Occurrence", "Species richness")
  )

  # land use factors: only the number of levels is checked
  expect_equal(nlevels(predicts$Predominant_land_use), 10)
  expect_equal(nlevels(predicts$Use_intensity), 4)

  # the study/site identifier factors should have the same level counts for
  # the 2016 data and for the 2016+2022 data; skipped when `full` is FALSE
  if (full) {
    expect_equal(nlevels(predicts$SS), 993)
    expect_equal(nlevels(predicts$SSS), 50032)
    expect_equal(nlevels(predicts$SSB), 6098)
    expect_equal(nlevels(predicts$SSBS), 50032)
  }
}

# Check that the site-level summary has the expected shape, identifier counts
# and factor levels.
#
# Args:
#   sls: the site-level data to check; it is inspected with `nrow()`,
#     `ncol()`, `names()` and `$` column access.
#
# Every check is made through `expect_equal()` or `expect_setequal()`, so a
# mismatch is reported as a failed expectation.
CheckSitelevelData <- function(sls) {
  # number of distinct values in a column (unused factor levels do not count)
  count_distinct <- function(values) length(unique(values))

  site_level_columns <- c(
    "Source_ID", "Reference", "Study_number", "Study_name", "SS",
    "Diversity_metric", "Diversity_metric_unit", "Diversity_metric_type",
    "Diversity_metric_is_effort_sensitive",
    "Diversity_metric_is_suitable_for_Chao", "Sampling_method",
    "Sampling_effort_unit", "Study_common_taxon", "Rank_of_study_common_taxon",
    "Site_number", "Site_name", "Block", "SSS", "SSB", "SSBS",
    "Sample_start_earliest", "Sample_end_latest", "Sample_midpoint",
    "Sample_date_resolution", "Max_linear_extent_metres",
    "Habitat_patch_area_square_metres", "Sampling_effort",
    "Rescaled_sampling_effort", "Habitat_as_described", "Predominant_land_use",
    "Source_for_predominant_land_use", "Use_intensity",
    "Km_to_nearest_edge_of_habitat", "Years_since_fragmentation_or_conversion",
    "Transect_details", "Coordinates_method", "Longitude", "Latitude",
    "Country_distance_metres", "Country", "UN_subregion", "UN_region",
    "Ecoregion_distance_metres", "Ecoregion", "Biome", "Realm", "Hotspot",
    "Wilderness_area"
  )
  summary_columns <- c("N_samples", "Higher_taxa")

  metric_types <- c("Abundance", "Occurrence", "Species richness")
  intensities <- c("Minimal use", "Light use", "Intense use", "Cannot decide")
  metrics <- c(
    "abundance", "species richness", "effort-corrected abundance", "density",
    "group abundance", "occurrence", "percent cover",
    "sign relative abundance", "occurrence frequency", "sign density",
    "biomass", "biovolume", "relative abundance",
    "effort-corrected sign abundance", "sign abundance"
  )

  # overall shape: one row per site, 50 columns in a fixed order
  expect_equal(nrow(sls), 35738)
  expect_equal(ncol(sls), 50)
  expect_equal(names(sls), c(site_level_columns, summary_columns))

  # the number of distinct sources should be the figure quoted on the website
  expect_equal(count_distinct(sls$Source_ID), 595)

  # studies: distinct values present in the data vs. levels of the factor
  expect_equal(count_distinct(sls$SS), 817)
  expect_equal(nlevels(sls$SS), 993)

  # sites: every row is a distinct site; the factor carries more levels
  expect_equal(count_distinct(sls$SSBS), 35738)
  expect_equal(nlevels(sls$SSBS), 53008)

  # NOTE(review): this check was previously described as "94 countries from
  # the webpage", yet the pinned count is 101 -- confirm which is intended
  expect_equal(count_distinct(sls$Country), 101)

  # factor levels, compared as sets (order is not checked)
  expect_setequal(levels(sls$Diversity_metric_type), metric_types)
  expect_setequal(levels(sls$Use_intensity), intensities)
  expect_setequal(levels(sls$Diversity_metric), metrics)
}