# Copyright 2025 Observational Health Data Sciences and Informatics # # This file is part of PatientLevelPrediction # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. test_that("Create existing sklearn works", { skip_if_not_installed("reticulate") skip_on_cran() expect_error(createSklearnModel("existing")) # create a file model.pkl for testing file.create("model.pkl") covariateSettings <- FeatureExtraction::createCovariateSettings(useDemographicsAge = TRUE) populationSettings <- createStudyPopulationSettings() # dataframe wrong type expect_error(createSklearnModel( modelLocation = "model.pkl", covariateMap = list( columnId = "columnId", covariateId = c(1) ), covariateSettings = covariateSettings, populationSettings = populationSettings )) # dataframe wrong column names expect_error(createSklearnModel( modelLocation = "model.pkl", covariateMap = data.frame( columnId = c(1), notCovariateId = c(1002), ), covariateSettings = covariateSettings, populationSettings = populationSettings )) # dataframe wrong column types expect_error(createSklearnModel( modelLocation = "model.pkl", covariateMap = data.frame( columnId = 1, covariateId = "2" ), covariateSettings = covariateSettings, populationSettings = populationSettings )) model <- createSklearnModel( modelLocation = "model.pkl", covariateMap = data.frame( columnId = c(1, 2), covariateId = c(1002, 1003) ), covariateSettings = covariateSettings, populationSettings = populationSettings ) expect_equal(attr(model, "modelType"), "binary") expect_equal(attr(model, "saveType"), "file") expect_equal(attr(model, "predictionFunction"), "predictPythonSklearn") expect_equal(attr(model, "saveToJson"), FALSE) expect_equal(class(model), "plpModel") unlink("model.pkl") }) test_that("existing sklearn model works", { skip_if_not_installed("reticulate") skip_on_cran() skip_if_offline() # fit a simple sklearn model with plp modelSettings <- setDecisionTree( criterion = list("gini"), splitter = list("best"), maxDepth = list(as.integer(4)), minSamplesSplit = list(2), minSamplesLeaf = list(10), minWeightFractionLeaf = list(0), maxFeatures = list("sqrt"), maxLeafNodes = list(NULL), minImpurityDecrease = list(10^-7), classWeight = list(NULL), seed = sample(1000000, 1) ) plpModel <- fitPlp( trainData = tinyTrainData, modelSettings = modelSettings, analysisId = "DecisionTree", analysisPath = tempdir() ) # load model json and save as pickle with joblib model <- sklearnFromJson(file.path(plpModel$model, "model.json")) joblib <- reticulate::import("joblib") joblib$dump(model, file.path(plpModel$model, "model.pkl")) # extract covariateMap from plpModel covariateMap <- plpModel$covariateImportance %>% dplyr::select(columnId, covariateId) existingModel <- createSklearnModel( modelLocation = file.path(plpModel$model), covariateMap = covariateMap, covariateSettings = plpModel$modelDesign$covariateSettings, populationSettings = plpModel$modelDesign$populationSettings ) prediction <- predictPlp(plpModel, testData, testData$labels) predictionNew <- predictPlp(existingModel, testData, testData$labels) expect_correct_predictions(prediction, testData) expect_equal(prediction$value, predictionNew$value) }) test_that("Externally trained sklearn model works", { skip_if_not_installed("reticulate") skip_on_cran() skip_if_offline() # change map to be some random order covariateIds <- tinyTrainData$covariateData$covariates %>% dplyr::pull(.data$covariateId) %>% unique() map <- data.frame( columnId = sample(1:20, length(covariateIds)), covariateId = sample(covariateIds, length(covariateIds)) ) matrixData <- toSparseM(tinyTrainData, map = map) matrix <- matrixData$dataMatrix %>% Matrix::as.matrix() # fit with sklearn xMatrix <- reticulate::r_to_py(matrix) y <- reticulate::r_to_py(tinyTrainData$labels$outcomeCount) sklearn <- reticulate::import("sklearn") classifier <- sklearn$tree$DecisionTreeClassifier() classifier <- classifier$fit(xMatrix, y) testMatrix <- toSparseM(testData, map = matrixData$covariateMap) xTest <- reticulate::r_to_py(testMatrix$dataMatrix %>% Matrix::as.matrix()) yTest <- reticulate::r_to_py(testData$labels$outcomeCount) externalPredictions <- classifier$predict_proba(xTest)[, 2] auc <- sklearn$metrics$roc_auc_score(yTest, externalPredictions) joblib <- reticulate::import("joblib") path <- tempfile() createDir(path) joblib$dump(classifier, file.path(path, "model.pkl")) plpModel <- createSklearnModel( model = path, covariateMap = matrixData$covariateMap, covariateSettings = FeatureExtraction::createCovariateSettings( useDemographicsAge = TRUE ), populationSettings = populationSettings ) prediction <- predictPlp(plpModel, testData, testData$labels) expect_equal(mean(prediction$value), mean(externalPredictions)) expect_correct_predictions(prediction, testData) }) test_that("Create existing GLM model works", { expect_error(createGlmModel(coefficients = data.frame( weights = c(1, 2), covariateId = c(1, 2) ))) expect_error(createGlmModel(coefficients = data.frame( coefficient = c("1", "2"), covariateId = c(1, 2) ))) expect_error(createGlmModel(coefficients = data.frame( coefficient = c(1, 2), covariateId = c("1", "2") ))) expect_error(createGlmModel(coefficients = data.frame( coefficient = c(1, 2), covariateId = c(1, 2) ), intercept = "2")) expect_error(createGlmModel(coefficients = data.frame( coefficient = c(1, 2), covariateId = c(1, 2) ), mapping = "linear")) model <- createGlmModel( coefficients = data.frame( coefficient = c(1, 2), covariateId = c(1, 2) ), intercept = 2, mapping = "logistic" ) expect_equal(attr(model, "modelType"), "binary") expect_equal(attr(model, "saveType"), "RToJson") expect_equal(attr(model, "predictionFunction"), "PatientLevelPrediction::predictGlm") }) test_that("Existing glm model works", { model <- createGlmModel(coefficients = data.frame( coefficient = c(0.05), covariateId = c(1002) ), intercept = -2.5) prediction <- predictPlp(model, testData, testData$labels) expect_correct_predictions(prediction, testData) })