context("manual data")

manualData <- file.path(tempdir(), "manual.sqlite")
on.exit(file.remove(manualData), add = TRUE)

manualData2 <- file.path(tempdir(), "manual2.sqlite")
on.exit(file.remove(manualData2), add = TRUE)

test_that("manual data runCharacterizationAnalyses", {
  # this test creates made-up OMOP CDM data
  # and runs runCharacterizationAnalyses on the data
  # to check whether the results are as expected
  connectionDetails <- DatabaseConnector::createConnectionDetails(
    dbms = "sqlite",
    server = manualData
  )
  con <- DatabaseConnector::connect(connectionDetails = connectionDetails)
  schema <- "main"

  # add persons  - aggregate covs (age)
  persons <- data.frame(
    person_id = 1:10,
    gender_concept_id = rep(8532, 10),
    year_of_birth = rep(2000, 10),
    race_concept_id = rep(1, 10),
    ethnicity_concept_id = rep(1, 10),
    location_id = rep(1, 10),
    provider_id = rep(1, 10),
    care_site_id = rep(1, 10),
    person_source_value = 1:10,
    gender_source_value = rep("female", 10),
    race_source_value = rep("na", 10),
    ethnicity_source_value = rep("na", 10)
  )
  DatabaseConnector::insertTable(
    connection = con,
    databaseSchema = schema,
    tableName = "person",
    data = persons
  )

  # observation period
  obs_period <- data.frame(
    observation_period_id = 1:10,
    person_id = 1:10,
    observation_period_start_date = rep("2000-12-31", 10),
    observation_period_end_date = c("2000-12-31", rep("2020-12-31", 9)),
    period_type_concept_id = rep(1, 10)
  )
  obs_period$observation_period_start_date <- as.Date(obs_period$observation_period_start_date)
  obs_period$observation_period_end_date <- as.Date(obs_period$observation_period_end_date)
  DatabaseConnector::insertTable(
    connection = con,
    databaseSchema = schema,
    tableName = "observation_period",
    data = obs_period
  )
  # person 1 has 1 day obs
  # person 2-6 has no events
  # person 7 has diabetes at 10, headache at 12
  # person 8 has diabetes at 13
  # person 9 has headache multiple times
  # person 10 has diabetes at 14
  # add conditions - aggregate covs (conditions)

  condition_era <- data.frame(
    condition_era_id = 1:7,
    person_id = c(7, 7, 8, 9, 9, 9, 10),
    condition_concept_id = c(201820, 378253, 201820, 378253, 378253, 378253, 201820),
    condition_era_start_date = c(
      "2011-01-01", "2013-04-03", "2016-01-01",
      "2006-01-04", "2014-08-02", "2014-08-04",
      "2013-01-04"
    ),
    condition_era_end_date = c(
      "2011-01-01", "2013-04-03", "2016-01-01",
      "2006-01-04", "2014-08-02", "2014-08-04",
      "2013-01-04"
    ),
    condition_occurrence_count = rep(1, 7)
  )
  condition_era$condition_era_start_date <- as.Date(condition_era$condition_era_start_date)
  condition_era$condition_era_end_date <- as.Date(condition_era$condition_era_end_date)

  DatabaseConnector::insertTable(
    connection = con,
    databaseSchema = schema,
    tableName = "condition_era",
    data = condition_era
  )

  # add concept
  concept <- data.frame(
    concept_id = c(201820, 378253),
    concept_name = c("diabetes", "hypertension"),
    domain_id = rep(1, 2),
    vocabulary_id = rep(1, 2),
    concept_class_id = c("Condition", "Condition"),
    standard_concept = rep("S", 2),
    concept_code = rep("Snowmed", 2)
    # ,valid_start_date = NULL,
    # valid_end_date = NULL,
    # invalid_reason = NULL
  )
  DatabaseConnector::insertTable(
    connection = con,
    databaseSchema = schema,
    tableName = "concept",
    data = concept
  )

  # add cohort  - tte/dechal/rechal
  cohort <- data.frame(
    subject_id = c(
      1:10,
      7, 8, 10,
      c(3, 6, 7, 8, 10),
      c(7)
    ),
    cohort_definition_id = c(
      rep(1, 10),
      rep(1, 3),
      rep(2, 5),
      2
    ),
    cohort_start_date = c(
      rep("2018-01-01", 10),
      rep("2018-05-01", 3),
      "2018-01-13", "2018-01-03", rep("2018-01-06", 3),
      "2018-05-24"
    ),
    cohort_end_date = c(
      rep("2018-02-01", 10),
      rep("2018-06-01", 3),
      "2018-02-02", "2018-02-04", rep("2018-02-08", 3),
      "2018-06-05"
    )
  )
  cohort$cohort_start_date <- as.Date(cohort$cohort_start_date)
  cohort$cohort_end_date <- as.Date(cohort$cohort_end_date)
  DatabaseConnector::insertTable(
    connection = con,
    databaseSchema = schema,
    tableName = "cohort",
    data = cohort
  )

  # create settings and run
  characterizationSettings <- Characterization::createCharacterizationSettings(
    timeToEventSettings = Characterization::createTimeToEventSettings(
      targetIds = 1,
      outcomeIds = 2
    ),
    dechallengeRechallengeSettings = Characterization::createDechallengeRechallengeSettings(
      targetIds = 1,
      outcomeIds = 2
    ),
    aggregateCovariateSettings = Characterization::createAggregateCovariateSettings(
      targetIds = 1,
      outcomeIds = 2,
      minPriorObservation = 365,
      outcomeWashoutDays = 30,
      riskWindowStart = 1,
      riskWindowEnd = 90,
      covariateSettings = FeatureExtraction::createCovariateSettings(
        useDemographicsAge = T,
        useDemographicsGender = T,
        useConditionEraAnyTimePrior = T
      ),
      caseCovariateSettings = Characterization::createDuringCovariateSettings(useConditionEraDuring = T),
      casePreTargetDuration = 365 * 5
    )
  )
  Characterization::runCharacterizationAnalyses(
    connectionDetails = connectionDetails,
    targetDatabaseSchema = schema,
    targetTable = "cohort",
    outcomeDatabaseSchema = schema,
    outcomeTable = "cohort",
    cdmDatabaseSchema = schema,
    characterizationSettings = characterizationSettings,
    outputDirectory = file.path(tempdir(), "result"),
    executionPath = file.path(tempdir(), "execution"),
    csvFilePrefix = "c_",
    databaseId = "1",
    incremental = T,
    threads = 1,
    minCharacterizationMean = 0.0001,
    minCellCount = NULL,
    showSubjectId = T
  )

  # check csv results are as expected

  tte <- read.csv(file.path(tempdir(), "result", "c_time_to_event.csv"))

  # check counts - 1-day subsequent missing?
  testthat::expect_true(5 == sum(tte$num_events[tte$outcome_type == "first" & tte$time_scale == "per 1-day"]))
  # subsequent is > 100 days after first drug so not in the 1-day count
  testthat::expect_true(0 == sum(tte$num_events[tte$outcome_type == "subsequent" & tte$time_scale == "per 1-day"]))
  testthat::expect_true(5 == sum(tte$num_events[tte$outcome_type == "first" & tte$time_scale == "per 30-day"]))
  testthat::expect_true(1 == sum(tte$num_events[tte$outcome_type == "subsequent" & tte$time_scale == "per 30-day"]))
  testthat::expect_true(5 == sum(tte$num_events[tte$outcome_type == "first" & tte$time_scale == "per 365-day"]))
  testthat::expect_true(1 == sum(tte$num_events[tte$outcome_type == "subsequent" & tte$time_scale == "per 365-day"]))

  # check times
  testthat::expect_true(sum(c(2, 5, 12) %in% tte$time_to_event[tte$outcome_type == "first" & tte$time_scale == "per 1-day"]) == 3)

  # TODO: check in code whether minCellCount < or <=

  dechal <- read.csv(file.path(tempdir(), "result", "c_dechallenge_rechallenge.csv"))
  testthat::expect_true(dechal$num_exposure_eras == 13)
  testthat::expect_true(dechal$num_persons_exposed == 10)
  testthat::expect_true(dechal$num_cases == 6)
  testthat::expect_true(dechal$dechallenge_attempt == 5)
  testthat::expect_true(dechal$dechallenge_success == 5)
  testthat::expect_true(dechal$rechallenge_attempt == 3)

  # one person has a rechal and event stops when second drug exposure stops
  testthat::expect_true(dechal$rechallenge_fail == 1)
  testthat::expect_true(dechal$rechallenge_success == 2)
  testthat::expect_true(dechal$pct_rechallenge_fail == 0.3333333)

  failed <- read.csv(file.path(tempdir(), "result", "c_rechallenge_fail_case_series.csv"))
  testthat::expect_true(nrow(failed) == 1)
  testthat::expect_true(failed$subject_id == 7)
  testthat::expect_true(failed$dechallenge_exposure_end_date_offset == 31)
  testthat::expect_true(failed$dechallenge_outcome_start_date_offset == 5)
  testthat::expect_true(failed$rechallenge_exposure_start_date_offset == 120)
  testthat::expect_true(failed$rechallenge_outcome_start_date_offset == 143)

  # Aggregate covs
  # =======
  counts <- read.csv(file.path(tempdir(), "result", "c_cohort_counts.csv"))
  # when restricted to first exposure 5 people have outcome
  testthat::expect_true(counts$row_count[counts$cohort_type == "Cases"] == 5)
  # target is 9 because 1 has insufficient min prior obs
  testthat::expect_true(counts$row_count[counts$cohort_type == "Target" &
    counts$target_cohort_id == 1] == 9)
  # make sure outcome is there a has count of 5
  testthat::expect_true(counts$row_count[counts$cohort_type == "Target" &
    counts$target_cohort_id == 2] == 5)

  # Tall should not have first restriction
  testthat::expect_true(counts$row_count[counts$cohort_type == "Tall" &
    counts$target_cohort_id == 1] == 13)
  testthat::expect_true(counts$person_count[counts$cohort_type == "Tall" &
    counts$target_cohort_id == 1] == 10)
  # make sure outcome is there a has count of 6 and 5
  testthat::expect_true(counts$row_count[counts$cohort_type == "Tall" &
    counts$target_cohort_id == 2] == 6)
  testthat::expect_true(counts$person_count[counts$cohort_type == "Tall" &
    counts$target_cohort_id == 2] == 5)

  covs <- read.csv(file.path(tempdir(), "result", "c_covariates.csv"))

  # checks all females
  testthat::expect_true(covs$average_value[covs$covariate_id == 8532001 & covs$cohort_type == "Cases"] == 1)
  testthat::expect_true(covs$average_value[covs$covariate_id == 8532001 & covs$cohort_type == "Target" &
    covs$target_cohort_id == 1] == 1)
  testthat::expect_true(covs$average_value[covs$covariate_id == 8532001 & covs$cohort_type == "Target" &
    covs$target_cohort_id == 2] == 1)

  ## TODO: check diabetes and hypertensions
  # covs$covariate_id
  # 201820 7,8 and 10 have in history and all are cases
  ind <- covs$covariate_id == 201820201 & covs$target_cohort_id == 1 &
    covs$cohort_type == "Cases"
  testthat::expect_true(covs$sum_value[ind] == 3)
  testthat::expect_true(covs$average_value[ind] == 3 / 5)

  ind <- covs$covariate_id == 201820201 & covs$target_cohort_id == 1 &
    covs$cohort_type == "Target"
  testthat::expect_true(covs$sum_value[ind] == 3)
  testthat::expect_equal(covs$average_value[ind], 3 / 9, tolerance = 0.01)

  # 378253 7,9 (9 multiple times) but 9 not a case
  ind <- covs$covariate_id == 378253201 & covs$target_cohort_id == 1 &
    covs$cohort_type == "Cases"
  testthat::expect_true(covs$sum_value[ind] == 1)
  testthat::expect_true(covs$average_value[ind] == 1 / 5)

  ind <- covs$covariate_id == 378253201 & covs$target_cohort_id == 1 &
    covs$cohort_type == "Target"
  testthat::expect_true(covs$sum_value[ind] == 2)
  testthat::expect_equal(covs$average_value[ind], 2 / 9, tolerance = 0.01)


  covs_cont <- read.csv(file.path(tempdir(), "result", "c_covariates_continuous.csv"))

  # checks age in years
  testthat::expect_true(covs_cont$average_value[covs_cont$covariate_id == 1002 & covs_cont$cohort_type == "Cases"] == 18)
  testthat::expect_true(covs_cont$count_value[covs_cont$covariate_id == 1002 & covs_cont$cohort_type == "Cases"] == 5)
  testthat::expect_true(covs_cont$average_value[covs_cont$covariate_id == 1002 & covs_cont$cohort_type == "Target" &
    covs_cont$target_cohort_id == 1] == 18)
  testthat::expect_true(covs_cont$count_value[covs_cont$covariate_id == 1002 & covs_cont$cohort_type == "Target" &
    covs_cont$target_cohort_id == 1] == 9)
  testthat::expect_true(covs_cont$average_value[covs_cont$covariate_id == 1002 & covs_cont$cohort_type == "Target" &
    covs_cont$target_cohort_id == 2] == 18)
  testthat::expect_true(covs_cont$count_value[covs_cont$covariate_id == 1002 & covs_cont$cohort_type == "Target" &
    covs_cont$target_cohort_id == 2] == 5)
})


test_that("manual data checking exclude count works", {
  # this test creates made-up OMOP CDM data
  # and runs runCharacterizationAnalyses on the data
  # to check whether the results are as expected
  connectionDetails <- DatabaseConnector::createConnectionDetails(
    dbms = "sqlite",
    server = manualData2
  )
  con <- DatabaseConnector::connect(connectionDetails = connectionDetails)
  schema <- "main"

  # add persons  - aggregate covs (age)
  persons <- data.frame(
    person_id = 1:10,
    gender_concept_id = rep(8532, 10),
    year_of_birth = rep(2000, 10),
    race_concept_id = rep(1, 10),
    ethnicity_concept_id = rep(1, 10),
    location_id = rep(1, 10),
    provider_id = rep(1, 10),
    care_site_id = rep(1, 10),
    person_source_value = 1:10,
    gender_source_value = rep("female", 10),
    race_source_value = rep("na", 10),
    ethnicity_source_value = rep("na", 10)
  )
  DatabaseConnector::insertTable(
    connection = con,
    databaseSchema = schema,
    tableName = "person",
    data = persons
  )

  # observation period
  obs_period <- data.frame(
    observation_period_id = 1:10,
    person_id = 1:10,
    observation_period_start_date = rep("2000-12-31", 10),
    observation_period_end_date = c("2000-12-31", rep("2020-12-31", 9)),
    period_type_concept_id = rep(1, 10)
  )
  obs_period$observation_period_start_date <- as.Date(obs_period$observation_period_start_date)
  obs_period$observation_period_end_date <- as.Date(obs_period$observation_period_end_date)
  DatabaseConnector::insertTable(
    connection = con,
    databaseSchema = schema,
    tableName = "observation_period",
    data = obs_period
  )
  # person 1 has 1 day obs
  # person 2-6 has no events
  # person 7 has diabetes at 10, headache at 12
  # person 8 has diabetes at 13
  # person 9 has headache multiple times
  # person 10 has diabetes at 14
  # add conditions - aggregate covs (conditions)

  condition_era <- data.frame(
    condition_era_id = 1:7,
    person_id = c(7, 7, 8, 9, 9, 9, 10),
    condition_concept_id = c(201820, 378253, 201820, 378253, 378253, 378253, 201820),
    condition_era_start_date = c(
      "2011-01-01", "2013-04-03", "2016-01-01",
      "2006-01-04", "2014-08-02", "2014-08-04",
      "2013-01-04"
    ),
    condition_era_end_date = c(
      "2011-01-01", "2013-04-03", "2016-01-01",
      "2006-01-04", "2014-08-02", "2014-08-04",
      "2013-01-04"
    ),
    condition_occurrence_count = rep(1, 7)
  )
  condition_era$condition_era_start_date <- as.Date(condition_era$condition_era_start_date)
  condition_era$condition_era_end_date <- as.Date(condition_era$condition_era_end_date)

  DatabaseConnector::insertTable(
    connection = con,
    databaseSchema = schema,
    tableName = "condition_era",
    data = condition_era
  )

  # add concept
  concept <- data.frame(
    concept_id = c(201820, 378253),
    concept_name = c("diabetes", "hypertension"),
    domain_id = rep(1, 2),
    vocabulary_id = rep(1, 2),
    concept_class_id = c("Condition", "Condition"),
    standard_concept = rep("S", 2),
    concept_code = rep("Snowmed", 2)
    # ,valid_start_date = NULL,
    # valid_end_date = NULL,
    # invalid_reason = NULL
  )
  DatabaseConnector::insertTable(
    connection = con,
    databaseSchema = schema,
    tableName = "concept",
    data = concept
  )

  # add cohort  - tte/dechal/rechal
  # person 6 has the outcome just before the exposure
  cohort <- data.frame(
    subject_id = c(
      1:10,
      7, 8, 10,
      c(3, 6, 7, 8, 10),
      c(7),
      6
    ),
    cohort_definition_id = c(
      rep(1, 10),
      rep(1, 3),
      rep(2, 5),
      2,
      2
    ),
    cohort_start_date = c(
      rep("2018-01-01", 10),
      rep("2018-05-01", 3),
      "2018-01-13", "2018-01-03", rep("2018-01-06", 3),
      "2018-05-24",
      "2017-12-29"
    ),
    cohort_end_date = c(
      rep("2018-02-01", 10),
      rep("2018-06-01", 3),
      "2018-02-02", "2018-02-04", rep("2018-02-08", 3),
      "2018-06-05",
      "2017-12-29"
    )
  )
  cohort$cohort_start_date <- as.Date(cohort$cohort_start_date)
  cohort$cohort_end_date <- as.Date(cohort$cohort_end_date)
  DatabaseConnector::insertTable(
    connection = con,
    databaseSchema = schema,
    tableName = "cohort",
    data = cohort
  )

  # create settings and run
  characterizationSettings <- Characterization::createCharacterizationSettings(
    timeToEventSettings = Characterization::createTimeToEventSettings(
      targetIds = 1,
      outcomeIds = 2
    ),
    dechallengeRechallengeSettings = Characterization::createDechallengeRechallengeSettings(
      targetIds = 1,
      outcomeIds = 2
    ),
    aggregateCovariateSettings = Characterization::createAggregateCovariateSettings(
      targetIds = 1,
      outcomeIds = 2,
      minPriorObservation = 365,
      outcomeWashoutDays = 30,
      riskWindowStart = 1,
      riskWindowEnd = 90,
      covariateSettings = FeatureExtraction::createCovariateSettings(
        useDemographicsAge = T,
        useDemographicsGender = T,
        useConditionEraAnyTimePrior = T
      ),
      caseCovariateSettings = Characterization::createDuringCovariateSettings(useConditionEraDuring = T),
      casePreTargetDuration = 365 * 5
    )
  )
  Characterization::runCharacterizationAnalyses(
    connectionDetails = connectionDetails,
    targetDatabaseSchema = schema,
    targetTable = "cohort",
    outcomeDatabaseSchema = schema,
    outcomeTable = "cohort",
    cdmDatabaseSchema = schema,
    characterizationSettings = characterizationSettings,
    outputDirectory = file.path(tempdir(), "result2"),
    executionPath = file.path(tempdir(), "execution2"),
    csvFilePrefix = "c_",
    databaseId = "1",
    incremental = T,
    threads = 1,
    minCharacterizationMean = 0.0001,
    minCellCount = NULL,
    showSubjectId = T
  )

  # load the cohort counts to make sure the exclude is there
  counts <- read.csv(file.path(tempdir(), "result2", "c_cohort_counts.csv"))
  # when restricted to first exposure 5 people have outcome
  testthat::expect_true(counts$row_count[counts$cohort_type == "Cases"] == 4)
  testthat::expect_true(counts$row_count[counts$cohort_type == "Exclude"] == 1)
})