# Phase 1: File Validation Tests
# Tests for data quality checks and validation functions

test_that("sample data generator creates valid structure", {
  # Ask for a fixed number of rows, then inspect the shape and column set.
  n_rows <- 50
  sample_df <- create_sample_lead_data(n = n_rows)
  col_names <- names(sample_df)

  # Result must be a data frame with exactly the requested row count.
  expect_s3_class(sample_df, "data.frame")
  expect_equal(nrow(sample_df), n_rows)

  # One expectation per column so a failure names the missing column.
  expect_true("geoid" %in% col_names)
  expect_true("income_bracket" %in% col_names)
  expect_true("income" %in% col_names)
  expect_true("energy_cost" %in% col_names)
  expect_true("households" %in% col_names)
})

test_that("sample AMI data has correct income brackets", {
  ami_df <- create_sample_lead_data(n = 100, dataset = "ami")

  # Labels accepted for the "ami" dataset variant.
  ami_labels <- c("0-30%", "30-60%", "60-80%", "80-100%", "100%+")

  # Every generated bracket must be one of the accepted labels.
  has_known_label <- ami_df$income_bracket %in% ami_labels
  expect_true(all(has_known_label))
})

test_that("sample FPL data has correct income brackets", {
  fpl_df <- create_sample_lead_data(n = 100, dataset = "fpl")

  # Labels accepted for the "fpl" dataset variant.
  fpl_labels <- c("0-100%", "100-150%", "150-200%", "200%+")

  # Every generated bracket must be one of the accepted labels.
  has_known_label <- fpl_df$income_bracket %in% fpl_labels
  expect_true(all(has_known_label))
})

test_that("energy cost correlates positively with income", {
  sample_df <- create_sample_lead_data(n = 200)

  # Correlation over the rows where both values are present.
  income_cost_cor <- cor(
    sample_df$income,
    sample_df$energy_cost,
    use = "complete.obs"
  )

  # Strictly between 0 and 1: positive, but not a perfect correlation.
  expect_gt(income_cost_cor, 0)
  expect_lt(income_cost_cor, 1)
})

test_that("corrupted data has all-NA income_bracket", {
  data <- create_corrupted_fpl_data(n = 100)

  # Assert the column exists before checking its contents: for a missing
  # column data$income_bracket is NULL, is.na(NULL) is logical(0), and
  # all(logical(0)) is TRUE, so the all-NA check alone would pass vacuously.
  expect_true("income_bracket" %in% names(data))

  # The corrupted fixture keeps the column but every value in it is NA.
  expect_true(all(is.na(data$income_bracket)))

  # Row count and the income column are otherwise intact.
  expect_equal(nrow(data), 100)
  expect_true("income" %in% names(data))
})

test_that("incomplete schema data is missing required column", {
  partial_df <- create_incomplete_schema_data(n = 50)
  present_cols <- names(partial_df)

  # The fixture is expected to lack "income" while keeping "income_bracket".
  expect_false("income" %in% present_cols)
  expect_true("income_bracket" %in% present_cols)
})

test_that("edge case data includes problematic values", {
  edge_df <- create_edge_case_data()

  # Each flag records whether the fixture holds one kind of awkward value.
  has_zero_income <- any(edge_df$income == 0)
  has_zero_cost <- any(edge_df$energy_cost == 0)
  has_zero_households <- any(edge_df$households == 0)
  has_missing_tenure <- any(is.na(edge_df$housing_tenure))

  # Zero income
  expect_true(has_zero_income)

  # Zero energy cost
  expect_true(has_zero_cost)

  # Zero (invalid) household count
  expect_true(has_zero_households)

  # Missing housing tenure
  expect_true(has_missing_tenure)
})

test_that("NER calculation handles zero income correctly", {
  # Zero income with a positive energy cost is expected to give exactly -1.
  expect_equal(ner_func(0, 1000), -1)
})

test_that("NER calculation handles zero energy cost correctly", {
  # Positive income with zero energy cost is expected to give Inf.
  expect_equal(ner_func(50000, 0), Inf)
})

test_that("NER calculation handles negative income", {
  ner_value <- ner_func(-1000, 2000)

  # A negative income must not produce NA/NaN/Inf ...
  expect_true(is.finite(ner_value))

  # ... and the exact value is (-1000 - 2000) / 2000 = -1.5.
  expect_equal(ner_value, -1.5)
})

test_that("energy burden calculation handles edge cases", {
  # Zero income: burden is Inf.
  burden_zero_income <- energy_burden_func(0, 1000)
  expect_equal(burden_zero_income, Inf)

  # Zero cost: burden is 0.
  burden_zero_cost <- energy_burden_func(50000, 0)
  expect_equal(burden_zero_cost, 0)

  # Ordinary inputs: 5000 out of 50000 gives 0.1.
  burden_typical <- energy_burden_func(50000, 5000)
  expect_equal(burden_typical, 0.1)
})

test_that("energy burden and NER are mathematically consistent", {
  income_value <- 50000
  cost_value <- 5000

  burden <- energy_burden_func(income_value, cost_value)
  net_ratio <- ner_func(income_value, cost_value)

  # Each metric must be recoverable from the other:
  #   burden = 1 / (ner + 1)   and   ner = (1 / burden) - 1
  burden_from_ner <- 1 / (net_ratio + 1)
  expect_equal(burden, burden_from_ner, tolerance = 1e-10)

  ner_from_burden <- (1 / burden) - 1
  expect_equal(net_ratio, ner_from_burden, tolerance = 1e-10)
})

test_that("household counts validation catches negative values", {
  # NOTE(review): no validation function is called here; the test only
  # confirms that the hand-built fixture contains a negative count.
  counts_df <- data.frame(households = c(100, 200, -50, 300))

  has_negative <- any(counts_df$households < 0)
  expect_true(has_negative)
})

test_that("household counts validation catches zero values", {
  # NOTE(review): no validation function is called here; the test only
  # confirms that the hand-built fixture contains a zero count.
  counts_df <- data.frame(households = c(100, 0, 200, 300))

  has_zero <- any(counts_df$households == 0)
  expect_true(has_zero)
})

test_that("income validation catches negative values", {
  # NOTE(review): this checks the edge-case fixture itself (it must contain
  # at least one negative income); no validation function is invoked.
  edge_incomes <- create_edge_case_data()$income

  has_negative_income <- any(edge_incomes < 0)
  expect_true(has_negative_income)
})

test_that("test CSV writing works", {
  data <- create_sample_lead_data(n = 10)
  filepath <- write_test_csv(data, "test_sample.csv")

  # Register cleanup right after the file is created so it is removed even
  # when a later step errors (e.g. read.csv() fails) and the test body is
  # abandoned before reaching its end. unlink() is silent if the file is
  # already gone.
  on.exit(unlink(filepath), add = TRUE)

  expect_true(file.exists(filepath))

  # Round-trip: the file read back must have the same number of rows.
  read_data <- read.csv(filepath, stringsAsFactors = FALSE)
  expect_equal(nrow(read_data), 10)
})

test_that("test cache directory creation works", {
  # The helper returns the path of the directory it created.
  tmp_cache <- create_test_cache()

  dir_present <- dir.exists(tmp_cache)
  expect_true(dir_present)

  # Remove the directory and anything inside it.
  unlink(tmp_cache, recursive = TRUE)
})

test_that("cleanup function removes test files", {
  # Create two files and one directory for the cleanup helper to remove.
  # Each artefact gets an on.exit() safety net as soon as it exists, so
  # nothing is left on disk if cleanup_test_files() errors or fails to remove
  # it. unlink() is a silent no-op for paths that are already gone, so the
  # safety net does not interfere with the assertions below.
  test_file1 <- write_test_csv(create_sample_lead_data(10), "cleanup_test1.csv")
  on.exit(unlink(test_file1), add = TRUE)
  test_file2 <- write_test_csv(create_sample_lead_data(10), "cleanup_test2.csv")
  on.exit(unlink(test_file2), add = TRUE)
  test_dir <- create_test_cache()
  on.exit(unlink(test_dir, recursive = TRUE), add = TRUE)

  # Precondition: everything exists before cleanup runs.
  expect_true(file.exists(test_file1))
  expect_true(file.exists(test_file2))
  expect_true(dir.exists(test_dir))

  # Function under test.
  cleanup_test_files(c(test_file1, test_file2, test_dir))

  # Postcondition: files and directory are gone.
  expect_false(file.exists(test_file1))
  expect_false(file.exists(test_file2))
  expect_false(dir.exists(test_dir))
})

test_that("required column check works", {
  required_cols <- c("geoid", "income", "energy_cost", "income_bracket", "households")
  full_df <- create_sample_lead_data(n = 20)

  # A freshly generated sample carries every required column.
  all_present <- all(required_cols %in% names(full_df))
  expect_true(all_present)

  # After removing "income" from a copy, the column is no longer listed.
  trimmed_df <- full_df
  trimmed_df[["income"]] <- NULL
  expect_false("income" %in% names(trimmed_df))
})

test_that("income bracket validation for AMI data", {
  ami_brackets <- c("0-30%", "30-60%", "60-80%", "80-100%", "100%+")
  brackets <- create_sample_lead_data(n = 100, dataset = "ami")$income_bracket

  # Nothing outside the AMI label set may appear.
  expect_true(all(brackets %in% ami_brackets))

  # The sample must contain more than one distinct bracket.
  n_distinct_brackets <- length(unique(brackets))
  expect_gt(n_distinct_brackets, 1)
})

test_that("income bracket validation for FPL data", {
  fpl_brackets <- c("0-100%", "100-150%", "150-200%", "200%+")
  brackets <- create_sample_lead_data(n = 100, dataset = "fpl")$income_bracket

  # Nothing outside the FPL label set may appear.
  expect_true(all(brackets %in% fpl_brackets))

  # The sample must contain more than one distinct bracket.
  n_distinct_brackets <- length(unique(brackets))
  expect_gt(n_distinct_brackets, 1)
})

test_that("vintage field is set correctly", {
  # One sample per vintage; each must carry only the vintage it was asked for.
  sample_2018 <- create_sample_lead_data(n = 50, vintage = "2018")
  sample_2022 <- create_sample_lead_data(n = 50, vintage = "2022")

  vintages_2018 <- unique(sample_2018$vintage)
  vintages_2022 <- unique(sample_2022$vintage)

  expect_equal(vintages_2018, "2018")
  expect_equal(vintages_2022, "2022")
})

test_that("geoid format is valid", {
  geoids <- create_sample_lead_data(n = 100)$geoid

  # Every geoid begins with "37", the NC FIPS code.
  has_nc_prefix <- startsWith(geoids, "37")
  expect_true(all(has_nc_prefix))

  # Every geoid is exactly 11 characters long (FIPS code format).
  id_lengths <- nchar(geoids)
  expect_true(all(id_lengths == 11))
})

test_that("housing tenure values are valid", {
  tenure <- create_sample_lead_data(n = 100)$housing_tenure

  # A value is acceptable when it is missing or one of the known labels.
  tenure_ok <- is.na(tenure) | tenure %in% c("OWNER", "RENTER")
  expect_true(all(tenure_ok))
})

test_that("primary heating fuel values are realistic", {
  fuel <- create_sample_lead_data(n = 100)$primary_heating_fuel

  # A value is acceptable when it is missing or one of the known fuels.
  known_fuels <- c("Electricity", "Natural gas", "Fuel oil", "Propane")
  fuel_ok <- is.na(fuel) | fuel %in% known_fuels
  expect_true(all(fuel_ok))
})

test_that("building type values are valid", {
  building <- create_sample_lead_data(n = 100)$building_type

  # A value is acceptable when it is missing or one of the known types.
  known_types <- c("Single-Family", "Multi-Family")
  building_ok <- is.na(building) | building %in% known_types
  expect_true(all(building_ok))
})

test_that("derived metrics are calculated correctly", {
  sample_df <- create_sample_lead_data(n = 50)

  # Recompute each derived column from the two base columns and compare.
  income <- sample_df$income
  cost <- sample_df$energy_cost

  # net_income = income - energy_cost
  expect_equal(sample_df$net_income, income - cost)

  # ner = (income - energy_cost) / energy_cost
  ner_expected <- (income - cost) / cost
  expect_equal(sample_df$ner, ner_expected)

  # energy_burden = energy_cost / income
  burden_expected <- cost / income
  expect_equal(sample_df$energy_burden, burden_expected)
})