context("Datastep Performance Tests")

base_path <-  file.path(getwd(), "tests\\testthat\\data")

base_path <- "./data"

DEV <- FALSE


test_that("perf01: add_autos() function works as expected", {
  
  libname(dat, file.path(base_path, "SDTM"), "sas7bdat")

  tm <- Sys.time()
  
  dat3 <- add_autos(dat$vs, c("USUBJID", "VSTESTCD", "VSORRES"))
  
  
  tmdiff <- Sys.time() - tm
  tmdiff
  
  expect_equal("first." %in% names(dat3), TRUE)
  expect_equal("last." %in% names(dat3), TRUE)

})

test_that("perf02: sort check works as expected", {

  dat3 <- add_autos(mtcars, c("am"), sort_check = FALSE)
  

  expect_equal("first." %in% names(dat3), TRUE)
  expect_equal("last." %in% names(dat3), TRUE)
  expect_error( add_autos(mtcars, c("am"), sort_check = TRUE))
})


# Baseline of 10 sec on condition
test_that("perf03: datastep() performance is good", {
  
  if (DEV) {
    
    #profvis::profvis({
    libname(dat, file.path(base_path, "SDTM"), "sas7bdat")
    
    dt <- dat$lb
    
    tm <- Sys.time()
    
    res <- datastep(dt, 
                    {
                      if (is.na(LBBLFL))
                        blisna <- TRUE
                      else
                        blisna <- FALSE
                    
                    })
    
    tmdiff <- Sys.time() - tm
    tmdiff
    
   # })
    
    expect_equal(tmdiff < 10, TRUE)
    
  } else
    expect_equal(TRUE, TRUE)
  
})

# Jumps to 14 seconds when retain added
test_that("perf04: datastep() performance with retain is good", {

  
  
  if (DEV) {
    
    libname(dat, file.path(base_path, "SDTM"), "sas7bdat")
    
    
    tm <- Sys.time()
    
    res <- datastep(dat$lb, retain = list(rnum = 0),
                    {
                      if (is.na(LBBLFL))
                        subjstart <- TRUE
                      else
                        subjstart <- FALSE
                      
                      rnum <- 2 + 1
                      
                    })
    
    tmdiff <- Sys.time() - tm
    tmdiff
    
    expect_equal(tmdiff < 14, TRUE)
    
    # Currently 6.7 seconds 2026-03-23
    
  } else
    expect_equal(TRUE, TRUE)
  
})

# Still less than < 11 seconds when group by added
test_that("perf05: datastep() performance with by group is good", {
  
  if (DEV) {
    
    libname(dat, file.path(base_path, "SDTM"), "sas7bdat")
    
    
    tm <- Sys.time()
    
    res <- datastep(dat$lb, 
                    by = c("USUBJID"),
                    {
                      if (first.)
                        subjstart <- TRUE
                      else
                        subjstart <- FALSE
                      
                    #  rnum <- rnum + 1
                      
                    })
    
    tmdiff <- Sys.time() - tm
    tmdiff
    
    expect_equal(tmdiff < 11, TRUE)
    
  } else
    expect_equal(TRUE, TRUE)
  
})

# Jumps to 14 seconds when retain and group by added
test_that("perf06: datastep() performance with output and retain is good", {
  
  if (DEV) {
    
    libname(dat, file.path(base_path, "SDTM"), "sas7bdat")
    
    tm <- Sys.time()
  
    
    res <- datastep(dat$lb, retain = list(rnum = 0),
                    by = c("USUBJID", "LBCAT", "LBTESTCD"),
                    {
                      if (first.)
                        subjstart <- TRUE
                      else
                        subjstart <- FALSE
                      
                      rnum <- rnum + 1
                      
                      rnum2 <- n.
                      
                      if (subjstart == TRUE)
                        output()
                      
                    }, sort_check = TRUE)
    
    tmdiff <- Sys.time() - tm
    tmdiff
    
    expect_equal(tmdiff < 16, TRUE)
    
  } else
    expect_equal(TRUE, TRUE)
  
})


test_that("perf07: datastep() with group_by performance is good", {
  
  if (DEV) {
    
    library(dplyr)
    
    scs <- specs(PE = import_spec(PESTAT = "character"))
    
    libname(dat, file.path(base_path, "SDTM"), "csv", import_specs = scs)
    
    tm <- Sys.time()
    
    prep <- dat$DM |> 
      left_join(dat$VS, by = c("USUBJID" = "USUBJID")) |> 
      select(USUBJID, VSTESTCD, VISIT, VISITNUM, VSSTRESN, ARM, VSBLFL) |> 
      filter(VSTESTCD %in% c("PULSE", "RESP", "TEMP", "DIABP", "SYSBP"), 
             !(VISIT == "SCREENING" & VSBLFL != "Y")) |> 
      arrange(USUBJID, VSTESTCD, VISITNUM) |> 
      group_by(USUBJID, VSTESTCD) |> 
      #datastep(by = c("USUBJID", "VSTESTCD"), retain = list(BSTRESN = 0), {
      datastep(retain = list(BSTRESN = 0), {
        
        # Combine treatment groups
        # And distingish baseline time points
        if (ARM == "ARM A") {
          
          if (VSBLFL %eq% "Y") {
            GRP <- "A_BASE"
          } else {
            GRP <- "A_TRT"
          }
          
        } else {
          
          if (VSBLFL %eq% "Y") {
            GRP <- "O_BASE"
          } else {
            GRP <- "O_TRT"
          }
          
        }
        
        # Populate baseline value
        if (first.)
          BSTRESN = VSSTRESN
        
      })
    
    tmdiff <- Sys.time() - tm 
    tmdiff
    
    expect_equal(tmdiff < 3, TRUE)
    
  } else 
    expect_equal(TRUE, TRUE)
  
})


test_that("perf08: 100,000 row datastep on data.frame is good.", {
  
  if (DEV) {
    
    l <- 1000000
    
    df <- data.frame(C1 = seq_len(l), C2 = runif(l), 
                     C3 = runif(l), C4 = runif(l), stringsAsFactors = FALSE)
    
    tm <- Sys.time()
    
    res <- datastep(df, attrib = list(C5 = 0, C6 = 0),
                    {
                      C5 <- C2 + C3 + C4
                      C6 <- max(C2, C3, C4)
                      
                    })
    
    
    tmdiff <- Sys.time() - tm
    tmdiff
    
    res[1:10, ]
    
    expect_equal(tmdiff < 30, TRUE)
    
    # 100,000 rows is 20 seconds
    # 1,000,000 rows is 3.6 minutes
    # 1,000,000 rows 5.36 minutes
    
  } else
    expect_equal(TRUE, TRUE)
  
})


test_that("perf09: 100,000 row datastep on tibble is good.", {
  
  if (DEV) {
    
    library(tibble)
    
    l <- 100000
    
    df <- tibble(C1 = seq_len(l), C2 = runif(l), 
                     C3 = runif(l), C4 = runif(l))
    
    tm <- Sys.time()
    
    res <- datastep(df, attrib = list(C5 = 0, C6 = 0),
                    {
                      C5 <- C2 + C3 + C4
                      C6 <- max(C2, C3, C4)
                      
                    })
    
    
    tmdiff <- Sys.time() - tm
    tmdiff
    
    res[1:10, ]
    
    expect_equal(tmdiff < 30, TRUE)
    
    # 100,000 rows is 21.2 seconds
    # 1,000,000 rows is 3.8 minutes
    
  } else
    expect_equal(TRUE, TRUE)
  
})



test_that("perf10: 100,000 row datastep on data.table is good.", {
  
  if (DEV) {
    
    library(data.table)
    
    l <- 100000
    
    df <- data.table(C1 = seq_len(l), C2 = runif(l), 
                 C3 = runif(l), C4 = runif(l))
    
    tm <- Sys.time()
    
    res <- datastep(df, attrib = list(C5 = 0, C6 = 0),
                    {
                      C5 <- C2 + C3 + C4
                      C6 <- max(C2, C3, C4)
                      
                    })
    
    
    tmdiff <- Sys.time() - tm
    tmdiff
    
    res[1:10, ]
    
    expect_equal(tmdiff < 30, TRUE)
    
    # 100,000 rows is 1.3 minutes with out modification to datastep
    # 100,000 rows is 22.3 seconds after modification 
    # 1,000,000 rows not going to try
    
  } else
    expect_equal(TRUE, TRUE)
  
})


test_that("perf11: complex datastep() performance is good", {
  
  if (DEV) {
    
    library(dplyr)
    
    scs <- specs(PE = import_spec(PESTAT = "character"))
    
    libname(dat, file.path(base_path, "SDTM"), "csv", import_specs = scs, 
            quiet = TRUE)
    
    tm <- Sys.time()
    
    prep <- dat$DM |> 
      left_join(dat$VS, by = c("USUBJID" = "USUBJID")) |> 
      select(USUBJID, VSTESTCD, VISIT, VISITNUM, VSSTRESN, ARM, VSBLFL) |> 
      filter(VSTESTCD %in% c("PULSE", "RESP", "TEMP", "DIABP", "SYSBP"), 
             !(VISIT == "SCREENING" & VSBLFL != "Y")) |> 
      arrange(USUBJID, VSTESTCD, VISITNUM) |> 
      group_by(USUBJID, VSTESTCD) |> 
      datastep(by = c("USUBJID", "VSTESTCD"), retain = list(BSTRESN = 0), {
        
        # Combine treatment groups
        # And distinguish baseline time points
        if (ARM == "ARM A") {
          
          if (VSBLFL %eq% "Y") {
            GRP <- "A_BASE"
          } else {
            GRP <- "A_TRT"
          }
          
        } else {
          
          if (VSBLFL %eq% "Y") {
            GRP <- "O_BASE"
          } else {
            GRP <- "O_TRT"
          }
          
        }
        
        # Populate baseline value
        if (first.)
          BSTRESN = VSSTRESN
        
      })
    
    tmdiff <- Sys.time() - tm 
    tmdiff
    
    # 1.6 seconds
    
    expect_equal(tmdiff < 3, TRUE)
    
  } else 
    expect_equal(TRUE, TRUE)
  
})

test_that("perf12: Wide and long table performance is good.", {
  
  if (DEV) {
  
  library(fmtr)
  
    libname(dat, file.path(base_path, "SDTM"), "sas7bdat")
    
    rfmt <- value(condition(x == "ASIAN", "Asian"),
                  condition(x == "BLACK OR AFRICAN AMERICAN", "Black or African American"),
                  condition(x == "WHITE", "White"),
                  condition(x == "UNKNOWN", "Unknown"))
    
    tm <- Sys.time()
    dm <- datastep(dat$dm, merge = dat$lb,
                   merge_by = c("STUDYID", "USUBJID"),
                   merge_in = c("ina", "inb"),
                   {
                     if (ina) {
                       HASADSL <- TRUE
                     }
                     
                     if (inb) {
                       HASLB <- TRUE
                       
                     }
                     
                     RACEC <- fapply(RACE, rfmt)
                     
                    # output()
                     
                   })
    
    print(Sys.time() - tm)
    
    # 14.14 seconds
  
  } else {
    expect_equal(TRUE, TRUE) 
  }
  
})

test_that("perf13: Really long table performance is good.", {
  
  if (DEV) {
    
    libname(dat, file.path(base_path, "SDTM"), "sas7bdat")

    
    tm <- Sys.time()
    dm <- datastep(dat$lb, set = list(dat$lb, dat$lb, dat$lb),
                   {
                     if (is.na(LBBLFL))
                       subjstart <- TRUE
                     else
                       subjstart <- FALSE
                     
                     
                     mynum <- n.
                     
                   })
    
    print(Sys.time() - tm)
    
    # 27.7 seconds
    
  } else {
    expect_equal(TRUE, TRUE) 
  }
  
})


test_that("perf14: Really long table performance is good with retain.", {
  
  if (DEV) {
    
    libname(dat, file.path(base_path, "SDTM"), "sas7bdat")
    
    
    tm <- Sys.time()
    dm <- datastep(dat$lb, set = list(dat$lb, dat$lb, dat$lb),
                   retain = list(A = 0, B = "hork", C = 0),
                   {
                     if (!is.na(LBBLFL)) {
                       output()
                     }
                    
                     mynum <- n.
                     
                     A <- A + 1
                     C <- C - 1
                     # B <- paste(B, n.) 
                     
                   })
    
    print(Sys.time() - tm)
    
    # Without output
    # 35.31 seconds, 74304 rows
    
    # With output
    # 2.764391 mins
    
  } else {
    expect_equal(TRUE, TRUE) 
  }
  
})