R Under development (unstable) (2025-08-19 r88650 ucrt) -- "Unsuffered Consequences" Copyright (C) 2025 The R Foundation for Statistical Computing Platform: x86_64-w64-mingw32/x64 R is free software and comes with ABSOLUTELY NO WARRANTY. You are welcome to redistribute it under certain conditions. Type 'license()' or 'licence()' for distribution details. R is a collaborative project with many contributors. Type 'contributors()' for more information and 'citation()' on how to cite R or R packages in publications. Type 'demo()' for some demos, 'help()' for on-line help, or 'help.start()' for an HTML browser interface to help. Type 'q()' to quit R. > cat("sample splits testing") sample splits testing> library(qgcomp) > > data(metals) > metals$clust = sample(seq_len(floor(nrow(metals)/8)), nrow(metals), replace=TRUE) > > set.seed(1231124) > spl = split_data(metals) > Xnm <- c( + 'arsenic','barium','cadmium','calcium','chromium','copper', + 'iron','lead','magnesium','manganese','mercury','selenium','silver', + 'sodium','zinc' + ) > dim(spl$traindata) # 181 observations = 40% of total [1] 181 27 > dim(spl$validdata) # 271 observations = 60% of total [1] 271 27 > splitres <- qgcomp.partials(fun="qgcomp.noboot", f=y~., q=4, + traindata=spl$traindata,validdata=spl$validdata, expnms=Xnm, .fixbreaks = FALSE, .globalbreaks = TRUE) > splitres Variables with positive effect sizes in training data: barium, cadmium, calcium, chromium, iron, manganese, mercury, silver, zinc Variables with negative effect sizes in training data: arsenic, copper, lead, magnesium, selenium, sodium Partial effect sizes estimated in validation data Positive direction Mixture slope parameters (delta method CI): Estimate Std. Error Lower CI Upper CI t value Pr(>|t|) (Intercept) -0.39417 0.22601 -0.83714 0.048801 -1.7440 0.082329 psi1 0.32615 0.10373 0.12284 0.529463 3.1442 0.001858 Negative direction Mixture slope parameters (delta method CI): Estimate Std. Error Lower CI Upper CI t value Pr(>|t|) (Intercept) 0.0796297 0.2111589 -0.33423 0.49349 0.3771 0.7064 psi1 -0.0028262 0.0753330 -0.15048 0.14482 -0.0375 0.9701 > > # check for break preservation > posbr = splitres$pos.fit$breaks[[1]] > posnm = splitres$pos.fit$expnms[[1]] > negbr = splitres$neg.fit$breaks[[1]] > negnm = splitres$neg.fit$expnms[[1]] > posidx = which(splitres$train.fit$expnms == posnm) > negidx = which(splitres$train.fit$expnms == negnm) > stopifnot(all.equal(splitres$train.fit$breaks[[posidx]], posbr)) > stopifnot(all.equal(splitres$train.fit$breaks[[negidx]], negbr)) > > splitres2 <- qgcomp.partials(fun="qgcomp.noboot", f=y~., q=4, + traindata=spl$traindata,validdata=spl$validdata, expnms=Xnm, .fixbreaks = TRUE, .globalbreaks = FALSE) > splitres2 Variables with positive effect sizes in training data: barium, cadmium, calcium, chromium, iron, manganese, mercury, silver, zinc Variables with negative effect sizes in training data: arsenic, copper, lead, magnesium, selenium, sodium Partial effect sizes estimated in validation data Positive direction Mixture slope parameters (delta method CI): Estimate Std. Error Lower CI Upper CI t value Pr(>|t|) (Intercept) -0.35859 0.23924 -0.827496 0.11032 -1.4988 0.13512 psi1 0.27290 0.10609 0.064958 0.48084 2.5722 0.01066 Negative direction Mixture slope parameters (delta method CI): Estimate Std. Error Lower CI Upper CI t value Pr(>|t|) (Intercept) 0.120045 0.210972 -0.29345 0.53354 0.5690 0.5698 psi1 -0.014597 0.074770 -0.16114 0.13195 -0.1952 0.8454 > > # check for break preservation > posbr2 = splitres2$pos.fit$breaks[[1]] > posnm2 = splitres2$pos.fit$expnms[[1]] > negbr2 = splitres2$neg.fit$breaks[[1]] > negnm2 = splitres2$neg.fit$expnms[[1]] > posidx2 = which(splitres2$train.fit$expnms == posnm) > negidx2 = which(splitres2$train.fit$expnms == negnm) > stopifnot(all.equal(splitres2$train.fit$breaks[[posidx2]], posbr2)) > stopifnot(all.equal(splitres2$train.fit$breaks[[negidx2]], negbr2)) > > > # are clusters allocated equally across training/testing? > margdist = as.numeric(prop.table(table(metals$clust))) # 70/30 split > # distance between marginal distribution of cluster and split specific clustering > #sqrt(sum((as.numeric(prop.table(table(spl$traindata$clust))) - margdist)^2)) # invalid because this doesnt contain all clusters > #sqrt(sum((as.numeric(prop.table(table(spl$validdata$clust))) - margdist)^2)) # 0.04049317 > # do all clusters show up in both datasets? > length(table(spl$traindata$clust)) - length(margdist) [1] -2 > length(table(spl$validdata$clust)) - length(margdist) [1] -1 > > > spl2 = split_data(metals, cluster="clust") > dim(spl2$traindata) # 181 observations = 40% of total [1] 178 27 > dim(spl2$validdata) # 271 observations = 60% of total [1] 274 27 > # distance between marginal distribution of cluster and split specific clustering > > sqrt(sum((as.numeric(prop.table(table(spl2$traindata$clust))) - margdist)^2)) # 0.0116399 [1] 0.0119672 > sqrt(sum((as.numeric(prop.table(table(spl2$validdata$clust))) - margdist)^2)) # 0.007774251 [1] 0.007774315 > # do all clusters show up in both datasets? > length(table(spl2$traindata$clust)) - length(margdist) [1] 0 > length(table(spl2$validdata$clust)) - length(margdist) [1] 0 > > > > proc.time() user system elapsed 2.03 0.20 2.23