context("manual data")

manualData <- file.path(tempdir(), "manual.sqlite")
on.exit(file.remove(manualData), add = TRUE)

manualData2 <- file.path(tempdir(), "manual2.sqlite")
on.exit(file.remove(manualData2), add = TRUE)

test_that("manual data runCharacterizationAnalyses", {
  # this test creates made-up OMOP CDM data
  # and runs runCharacterizationAnalyses on the data
  # to check whether the results are as expected
  connectionDetailsManual <- DatabaseConnector::createConnectionDetails(
    dbms = "sqlite",
    server = manualData
  )
  con <- DatabaseConnector::connect(connectionDetails = connectionDetailsManual)
  on.exit(DatabaseConnector::disconnect(con))
  schema <- "main"

  # add persons  - aggregate covs (age)
  persons <- data.frame(
    person_id = 1:10,
    gender_concept_id = rep(8532, 10),
    year_of_birth = rep(1984, 10)+sample(10,10),
    race_concept_id = rep(1, 10),
    ethnicity_concept_id = rep(1, 10),
    location_id = rep(1, 10),
    provider_id = rep(1, 10),
    care_site_id = rep(1, 10),
    person_source_value = 1:10,
    gender_source_value = rep("female", 10),
    race_source_value = rep("na", 10),
    ethnicity_source_value = rep("na", 10)
  )
  DatabaseConnector::insertTable(
    connection = con,
    databaseSchema = schema,
    tableName = "person",
    data = persons
  )

  # observation period
  obs_period <- data.frame(
    observation_period_id = 1:10,
    person_id = 1:10,
    observation_period_start_date = rep("2000-12-31", 10),
    observation_period_end_date = c("2000-12-31", rep("2020-12-31", 9)),
    period_type_concept_id = rep(1, 10)
  )
  obs_period$observation_period_start_date <- as.Date(obs_period$observation_period_start_date)
  obs_period$observation_period_end_date <- as.Date(obs_period$observation_period_end_date)
  DatabaseConnector::insertTable(
    connection = con,
    databaseSchema = schema,
    tableName = "observation_period",
    data = obs_period
  )
  # person 1 has 1 day obs
  # person 2-6 has no events
  # person 7 has diabetes at 10, headache at 12
  # person 8 has diabetes at 13
  # person 9 has headache multiple times
  # person 10 has diabetes at 14
  # add conditions - aggregate covs (conditions)

  condition_era <- data.frame(
    condition_era_id = 1:7,
    person_id = c(7, 7, 8, 9, 9, 9, 10),
    condition_concept_id = c(201820, 378253, 201820, 378253, 378253, 378253, 201820),
    condition_era_start_date = c(
      "2011-01-01", "2013-04-03", "2016-01-01",
      "2006-01-04", "2014-08-02", "2014-08-04",
      "2013-01-04"
    ),
    condition_era_end_date = c(
      "2011-01-01", "2013-04-03", "2016-01-01",
      "2006-01-04", "2014-08-02", "2014-08-04",
      "2013-01-04"
    ),
    condition_occurrence_count = rep(1, 7)
  )
  condition_era$condition_era_start_date <- as.Date(condition_era$condition_era_start_date)
  condition_era$condition_era_end_date <- as.Date(condition_era$condition_era_end_date)

  DatabaseConnector::insertTable(
    connection = con,
    databaseSchema = schema,
    tableName = "condition_era",
    data = condition_era
  )

  # add concept
  concept <- data.frame(
    concept_id = c(201820, 378253),
    concept_name = c("diabetes", "hypertension"),
    domain_id = rep(1, 2),
    vocabulary_id = rep(1, 2),
    concept_class_id = c("Condition", "Condition"),
    standard_concept = rep("S", 2),
    concept_code = rep("Snowmed", 2)
    # ,valid_start_date = NULL,
    # valid_end_date = NULL,
    # invalid_reason = NULL
  )
  DatabaseConnector::insertTable(
    connection = con,
    databaseSchema = schema,
    tableName = "concept",
    data = concept
  )

  # add cohort  - tte/dechal/rechal
  cohort <- data.frame(
    subject_id = c(
      1:10,
      7, 8, 10,
      c(3, 6, 7, 8, 10),
      c(7)
    ),
    cohort_definition_id = c(
      rep(1, 10),
      rep(1, 3),
      rep(2, 5),
      2
    ),
    cohort_start_date = c(
      rep("2018-01-01", 10),
      rep("2018-05-01", 3),
      "2018-01-13", "2018-01-03", rep("2018-01-06", 3),
      "2018-05-24"
    ),
    cohort_end_date = c(
      rep("2018-02-01", 10),
      rep("2018-06-01", 3),
      "2018-02-02", "2018-02-04", rep("2018-02-08", 3),
      "2018-06-05"
    )
  )
  cohort$cohort_start_date <- as.Date(cohort$cohort_start_date)
  cohort$cohort_end_date <- as.Date(cohort$cohort_end_date)
  DatabaseConnector::insertTable(
    connection = con,
    databaseSchema = schema,
    tableName = "cohort",
    data = cohort
  )

  # create settings and run
  characterizationSettings <- createCharacterizationSettings(
    timeToEventSettings = createTimeToEventSettings(
      targetIds = 1,
      outcomeIds = 2
    ),
    dechallengeRechallengeSettings = createDechallengeRechallengeSettings(
      targetIds = 1,
      outcomeIds = 2
    ),
    targetBaselineSettings = createTargetBaselineSettings(
      targetIds = 1,
      limitToFirstInNDays = 99999,
      minPriorObservation = 365,
      covariateSettings = FeatureExtraction::createCovariateSettings(
        useDemographicsAge = TRUE,
        useDemographicsGender = TRUE,
        useConditionEraAnyTimePrior = TRUE
      )
    ),

    riskFactorSettings = createRiskFactorSettings(
      targetIds = 1,
      outcomeIds = 2,
      limitToFirstInNDays = 99999,
      minPriorObservation = 365,
      outcomeWashoutDays = 30,
      riskWindowStart = 1,
      riskWindowEnd = 90,
      covariateSettings = FeatureExtraction::createCovariateSettings(
        useDemographicsAge = TRUE,
        useDemographicsGender = TRUE,
        useConditionEraAnyTimePrior = TRUE
      )
      ),

    caseSeriesSettings = createCaseSeriesSettings(
      targetIds = 1,
      outcomeIds = 2,
      limitToFirstInNDays = 99999,
      minPriorObservation = 365,
      outcomeWashoutDays = 30,
      riskWindowStart = 1,
      riskWindowEnd = 90,
      caseCovariateSettings = createDuringCovariateSettings(
        useConditionEraDuring = TRUE
      )
    )
  )
  runCharacterizationAnalyses(
    connectionDetails = connectionDetailsManual,
    targetDatabaseSchema = schema,
    targetTable = "cohort",
    outcomeDatabaseSchema = schema,
    outcomeTable = "cohort",
    cdmDatabaseSchema = schema,
    characterizationSettings = characterizationSettings,
    outputDirectory = file.path(tempdir(), "result"),
    outputDatabaseSchema = schema,
    outputTable = 'test_char_manual_cohort',
    executionPath = file.path(tempdir(), "execution"),
    csvFilePrefix = "c_",
    databaseId = "1",
    incremental = TRUE,
    threads = 1,
    nTargetJobs = 1,
    minCharacterizationMean = 0,
    minCellCount = 0,
    minSMD = 0,
    minCovariateCount = 0,
    mode = 'PatientLevelPrediction',
    showSubjectId = TRUE
  )

  # check csv results are as expected

  tte <- utils::read.csv(file.path(tempdir(), "result", "c_time_to_event.csv"))

  # check counts - 1-day subsequent missing?
  testthat::expect_true(5 == sum(tte$num_events[tte$outcome_type == "first" & tte$time_scale == "per 1-day"]))
  # subsequent is > 100 days after first drug so not in the 1-day count
  testthat::expect_true(0 == sum(tte$num_events[tte$outcome_type == "subsequent" & tte$time_scale == "per 1-day"]))
  testthat::expect_true(5 == sum(tte$num_events[tte$outcome_type == "first" & tte$time_scale == "per 30-day"]))
  testthat::expect_true(1 == sum(tte$num_events[tte$outcome_type == "subsequent" & tte$time_scale == "per 30-day"]))
  testthat::expect_true(5 == sum(tte$num_events[tte$outcome_type == "first" & tte$time_scale == "per 365-day"]))
  testthat::expect_true(1 == sum(tte$num_events[tte$outcome_type == "subsequent" & tte$time_scale == "per 365-day"]))

  # check times
  testthat::expect_true(sum(c(2, 5, 12) %in% tte$time_to_event[tte$outcome_type == "first" & tte$time_scale == "per 1-day"]) == 3)

  # TODO: check in code whether minCellCount < or <=

  dechal <- utils::read.csv(file.path(tempdir(), "result", "c_dechallenge_rechallenge.csv"))
  testthat::expect_true(dechal$num_exposure_eras == 13)
  testthat::expect_true(dechal$num_persons_exposed == 10)
  testthat::expect_true(dechal$num_cases == 6)
  testthat::expect_true(dechal$dechallenge_attempt == 5)
  testthat::expect_true(dechal$dechallenge_success == 5)
  testthat::expect_true(dechal$rechallenge_attempt == 3)

  # one person has a rechal and event stops when second drug exposure stops
  testthat::expect_true(dechal$rechallenge_fail == 1)
  testthat::expect_true(dechal$rechallenge_success == 2)
  testthat::expect_true(dechal$pct_rechallenge_fail == 0.3333333)

  failed <- utils::read.csv(file.path(tempdir(), "result", "c_rechallenge_fail_case_series.csv"))
  testthat::expect_true(nrow(failed) == 1)
  testthat::expect_true(failed$subject_id == 7)
  testthat::expect_true(failed$dechallenge_exposure_end_date_offset == 31)
  testthat::expect_true(failed$dechallenge_outcome_start_date_offset == 5)
  testthat::expect_true(failed$rechallenge_exposure_start_date_offset == 120)
  testthat::expect_true(failed$rechallenge_outcome_start_date_offset == 143)

  # Target baseline covs
  # =======

  eset <- utils::read.csv(file.path(tempdir(), "result", "c_execution_settings.csv"))
  testthat::expect_true(nrow(eset) == 1)


  # targetId = 1, limitToFirstInNDays = 99999,minPriorObservation = 365,
  tset <- utils::read.csv(file.path(tempdir(), "result", "c_target_settings.csv"))
  testthat::expect_true(tset$target_id == 1)
  testthat::expect_true(tset$limit_to_first_in_n_days == 99999)
  testthat::expect_true(tset$min_prior_observation == 365)
  testthat::expect_true(tset$characterization_target_id == tset$target_id*10)

  attrition <- utils::read.csv(file.path(tempdir(), "result", "c_attrition.csv"))
  # there should be 9 people as the first subject has cohort date outside observation
  testthat::expect_true(attrition$n[attrition$cohort_definition_id==10] == 9)

  # useDemographicsAge = TRUE, useDemographicsGender = TRUE, useConditionEraAnyTimePrior = TRUE
  covs <- utils::read.csv(file.path(tempdir(), "result", "c_target_covariates.csv"))
  # gender 8532001
  testthat::expect_true(8532001 %in% covs$covariate_id)

  maxCount <- length(unique(cohort$subject_id[cohort$cohort_definition_id == 1]))
  testthat::expect_true(sum(covs$sum_value <= maxCount) == nrow(covs))

  # data is all female so make sure female cov has average_value of 1
  testthat::expect_true(covs$average_value[covs$covariate_id == 8532001] == 1)
  testthat::expect_true(covs$sum_value[covs$covariate_id == 8532001] == attrition$n[attrition$cohort_definition_id==10])

  covs_cont <- utils::read.csv(file.path(tempdir(), "result", "c_target_covariates_continuous.csv"))
  testthat::expect_true(1002 %in% covs_cont$covariate_id)
  testthat::expect_true(covs_cont$count_value == 9)


  # risk factor - check SMD
  rf_covs <- utils::read.csv(file.path(tempdir(), "result", "c_risk_factor_covariates.csv"))
  # 378253 - 7 : 2013-04-03, 9 : 2006-01-04/2014-08-02/2014-08-04
  # 7 had outcome 5 days after index and ~5 months after index
  # 9 does not have outcome

  testthat::expect_true(rf_covs$non_case_sum_value[rf_covs$covariate_id == 378253201] == 1)
  testthat::expect_true(rf_covs$case_sum_value[rf_covs$covariate_id == 378253201] == 1)
  # 5 cases and 4 non-cases
  testthat::expect_true(rf_covs$non_case_average_value[rf_covs$covariate_id == 378253201] == 1/4)
  testthat::expect_true(rf_covs$case_average_value[rf_covs$covariate_id == 378253201] == 1/5)

  # now make sure SMD is correct
  # 4 non-cases and 1 has outcome, 5-cases and 1 has outcome
  nonCases <- 4
  nonCaseMean <- rf_covs$non_case_average_value[rf_covs$covariate_id == 378253201]
  nonCaseOnes <- 1
  nonCaseZeros <- 3
  cases <- 5
  caseMean <- rf_covs$case_average_value[rf_covs$covariate_id == 378253201]
  caseOnes <- 1
  caseZeros <- 4

  meanDiff <- caseMean - nonCaseMean
  nonCaseSD <- sqrt((nonCaseOnes*(1-nonCaseMean)^2 + nonCaseZeros*(0-nonCaseMean)^2)/(nonCases-1))
  caseSD <- sqrt((caseOnes*(1-caseMean)^2 + caseZeros*(0-caseMean)^2)/(cases-1))
  pooledSD <- sqrt((caseSD^2+nonCaseSD^2)/2)
  manualSMD <- meanDiff/pooledSD

  testthat::expect_equal(
    rf_covs$standardized_mean_difference[rf_covs$covariate_id == 378253201],
    manualSMD, tolerance = 0.01
    )

  # test when a non-case or case is 0

  nonCases <- 4
  nonCaseMean <- rf_covs$non_case_average_value[rf_covs$covariate_id == 201820201]
  nonCaseOnes <- 0
  nonCaseZeros <- 4
  cases <- 5
  caseMean <- rf_covs$case_average_value[rf_covs$covariate_id == 201820201]
  caseOnes <- 3
  caseZeros <- 2

  meanDiff <- caseMean - nonCaseMean
  nonCaseSD <- sqrt((nonCaseOnes*(1-nonCaseMean)^2 + nonCaseZeros*(0-nonCaseMean)^2)/(nonCases-1))
  caseSD <- sqrt((caseOnes*(1-caseMean)^2 + caseZeros*(0-caseMean)^2)/(cases-1))
  pooledSD <- sqrt((caseSD^2+nonCaseSD^2)/2)
  manualSMD <- meanDiff/pooledSD

  testthat::expect_equal(
    rf_covs$standardized_mean_difference[rf_covs$covariate_id == 201820201],
    manualSMD, tolerance = 0.01
  )


  rf_cont <- utils::read.csv(file.path(tempdir(), "result", "c_risk_factor_covariates_continuous.csv"))

  nonCases <- 4
  nonCaseMean <- rf_cont$non_case_average_value[rf_cont$covariate_id == 1002]
  nonCaseSd <- rf_cont$non_case_standard_deviation[rf_cont$covariate_id == 1002]
  cases <- 5
  caseMean <- rf_cont$case_average_value[rf_cont$covariate_id == 1002]
  caseSd <- rf_cont$case_standard_deviation[rf_cont$covariate_id == 1002]

  meanDiff <- caseMean - nonCaseMean
  pooledSD <- sqrt((caseSd^2+nonCaseSd^2)/2)
  manualSMD <- meanDiff/pooledSD

  testthat::expect_equal(
    rf_cont$standardized_mean_difference[rf_cont$covariate_id == 1002],
    manualSMD, tolerance = 0.01
  )


})


test_that("manual data checking ... works", {
  # this test creates made-up OMOP CDM data
  # and runs runCharacterizationAnalyses on the data
  # to check whether the results are as expected
  connectionDetailsManual <- DatabaseConnector::createConnectionDetails(
    dbms = "sqlite",
    server = manualData2
  )
  con <- DatabaseConnector::connect(connectionDetails = connectionDetailsManual)
  on.exit(DatabaseConnector::disconnect(con))
  schema <- "main"

  # add persons  - aggregate covs (age)
  persons <- data.frame(
    person_id = 1:10,
    gender_concept_id = rep(8532, 10),
    year_of_birth = rep(2000, 10),
    race_concept_id = rep(1, 10),
    ethnicity_concept_id = rep(1, 10),
    location_id = rep(1, 10),
    provider_id = rep(1, 10),
    care_site_id = rep(1, 10),
    person_source_value = 1:10,
    gender_source_value = rep("female", 10),
    race_source_value = rep("na", 10),
    ethnicity_source_value = rep("na", 10)
  )
  DatabaseConnector::insertTable(
    connection = con,
    databaseSchema = schema,
    tableName = "person",
    data = persons
  )

  # observation period
  obs_period <- data.frame(
    observation_period_id = 1:10,
    person_id = 1:10,
    observation_period_start_date = rep("2000-12-31", 10),
    observation_period_end_date = c("2000-12-31", rep("2020-12-31", 9)),
    period_type_concept_id = rep(1, 10)
  )
  obs_period$observation_period_start_date <- as.Date(obs_period$observation_period_start_date)
  obs_period$observation_period_end_date <- as.Date(obs_period$observation_period_end_date)
  DatabaseConnector::insertTable(
    connection = con,
    databaseSchema = schema,
    tableName = "observation_period",
    data = obs_period
  )
  # person 1 has 1 day obs
  # person 2-6 has no events
  # person 7 has diabetes at 10, headache at 12
  # person 8 has diabetes at 13
  # person 9 has headache multiple times
  # person 10 has diabetes at 14
  # add conditions - aggregate covs (conditions)

  condition_era <- data.frame(
    condition_era_id = 1:7,
    person_id = c(7, 7, 8, 9, 9, 9, 10),
    condition_concept_id = c(201820, 378253, 201820, 378253, 378253, 378253, 201820),
    condition_era_start_date = c(
      "2011-01-01", "2013-04-03", "2016-01-01",
      "2006-01-04", "2014-08-02", "2014-08-04",
      "2013-01-04"
    ),
    condition_era_end_date = c(
      "2011-01-01", "2013-04-03", "2016-01-01",
      "2006-01-04", "2014-08-02", "2014-08-04",
      "2013-01-04"
    ),
    condition_occurrence_count = rep(1, 7)
  )
  condition_era$condition_era_start_date <- as.Date(condition_era$condition_era_start_date)
  condition_era$condition_era_end_date <- as.Date(condition_era$condition_era_end_date)

  DatabaseConnector::insertTable(
    connection = con,
    databaseSchema = schema,
    tableName = "condition_era",
    data = condition_era
  )

  # add concept
  concept <- data.frame(
    concept_id = c(201820, 378253),
    concept_name = c("diabetes", "hypertension"),
    domain_id = rep(1, 2),
    vocabulary_id = rep(1, 2),
    concept_class_id = c("Condition", "Condition"),
    standard_concept = rep("S", 2),
    concept_code = rep("Snowmed", 2)
    # ,valid_start_date = NULL,
    # valid_end_date = NULL,
    # invalid_reason = NULL
  )
  DatabaseConnector::insertTable(
    connection = con,
    databaseSchema = schema,
    tableName = "concept",
    data = concept
  )

  # add cohort  - tte/dechal/rechal
  # person 6 has the outcome just before the exposure
  cohort <- data.frame(
    subject_id = c(
      1:10,
      7, 8, 10,
      c(3, 6, 7, 8, 10),
      c(7),
      6
    ),
    cohort_definition_id = c(
      rep(1, 10),
      rep(1, 3),
      rep(2, 5),
      2,
      2
    ),
    cohort_start_date = c(
      rep("2018-01-01", 10),
      rep("2018-05-01", 3),
      "2018-01-13", "2018-01-03", rep("2018-01-06", 3),
      "2018-05-24",
      "2017-12-29"
    ),
    cohort_end_date = c(
      rep("2018-02-01", 10),
      rep("2018-06-01", 3),
      "2018-02-02", "2018-02-04", rep("2018-02-08", 3),
      "2018-06-05",
      "2017-12-29"
    )
  )
  cohort$cohort_start_date <- as.Date(cohort$cohort_start_date)
  cohort$cohort_end_date <- as.Date(cohort$cohort_end_date)
  DatabaseConnector::insertTable(
    connection = con,
    databaseSchema = schema,
    tableName = "cohort",
    data = cohort
  )

  # add checks here

})