R Under development (unstable) (2025-08-19 r88650 ucrt) -- "Unsuffered Consequences"
Copyright (C) 2025 The R Foundation for Statistical Computing
Platform: x86_64-w64-mingw32/x64

R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.

R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.

Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.

> cat("sample splits testing")
sample splits testing> library(qgcomp)
> 
> data(metals)
> metals$clust = sample(seq_len(floor(nrow(metals)/8)), nrow(metals), replace=TRUE)
> 
> set.seed(1231124)
> spl = split_data(metals)
> Xnm <- c(
+   'arsenic','barium','cadmium','calcium','chromium','copper',
+  'iron','lead','magnesium','manganese','mercury','selenium','silver',
+  'sodium','zinc'
+ )
> dim(spl$traindata) # 181 observations = 40% of total
[1] 181  27
> dim(spl$validdata) # 271 observations = 60% of total
[1] 271  27
> splitres <- qgcomp.partials(fun="qgcomp.noboot", f=y~., q=4, 
+   traindata=spl$traindata,validdata=spl$validdata, expnms=Xnm, .fixbreaks = FALSE, .globalbreaks = TRUE)
> splitres

Variables with positive effect sizes in training data: barium, cadmium, calcium, chromium, iron, manganese, mercury, silver, zinc
Variables with negative effect sizes in training data: arsenic, copper, lead, magnesium, selenium, sodium
Partial effect sizes estimated in validation data
Positive direction Mixture slope parameters (delta method CI):

            Estimate Std. Error Lower CI Upper CI t value Pr(>|t|)
(Intercept) -0.39417    0.22601 -0.83714 0.048801 -1.7440 0.082329
psi1         0.32615    0.10373  0.12284 0.529463  3.1442 0.001858

Negative direction Mixture slope parameters (delta method CI):

              Estimate Std. Error Lower CI Upper CI t value Pr(>|t|)
(Intercept)  0.0796297  0.2111589 -0.33423  0.49349  0.3771   0.7064
psi1        -0.0028262  0.0753330 -0.15048  0.14482 -0.0375   0.9701
> 
> # check for break preservation
> posbr = splitres$pos.fit$breaks[[1]]
> posnm = splitres$pos.fit$expnms[[1]]
> negbr = splitres$neg.fit$breaks[[1]]
> negnm = splitres$neg.fit$expnms[[1]]
> posidx = which(splitres$train.fit$expnms == posnm)
> negidx = which(splitres$train.fit$expnms == negnm)
> stopifnot(all.equal(splitres$train.fit$breaks[[posidx]], posbr))
> stopifnot(all.equal(splitres$train.fit$breaks[[negidx]], negbr))
> 
> splitres2 <- qgcomp.partials(fun="qgcomp.noboot", f=y~., q=4, 
+                             traindata=spl$traindata,validdata=spl$validdata, expnms=Xnm, .fixbreaks = TRUE, .globalbreaks = FALSE)
> splitres2

Variables with positive effect sizes in training data: barium, cadmium, calcium, chromium, iron, manganese, mercury, silver, zinc
Variables with negative effect sizes in training data: arsenic, copper, lead, magnesium, selenium, sodium
Partial effect sizes estimated in validation data
Positive direction Mixture slope parameters (delta method CI):

            Estimate Std. Error  Lower CI Upper CI t value Pr(>|t|)
(Intercept) -0.35859    0.23924 -0.827496  0.11032 -1.4988  0.13512
psi1         0.27290    0.10609  0.064958  0.48084  2.5722  0.01066

Negative direction Mixture slope parameters (delta method CI):

             Estimate Std. Error Lower CI Upper CI t value Pr(>|t|)
(Intercept)  0.120045   0.210972 -0.29345  0.53354  0.5690   0.5698
psi1        -0.014597   0.074770 -0.16114  0.13195 -0.1952   0.8454
> 
> # check for break preservation
> posbr2 = splitres2$pos.fit$breaks[[1]]
> posnm2 = splitres2$pos.fit$expnms[[1]]
> negbr2 = splitres2$neg.fit$breaks[[1]]
> negnm2 = splitres2$neg.fit$expnms[[1]]
> posidx2 = which(splitres2$train.fit$expnms == posnm)
> negidx2 = which(splitres2$train.fit$expnms == negnm)
> stopifnot(all.equal(splitres2$train.fit$breaks[[posidx2]], posbr2))
> stopifnot(all.equal(splitres2$train.fit$breaks[[negidx2]], negbr2))
> 
> 
> # are clusters allocated equally across training/testing?
> margdist = as.numeric(prop.table(table(metals$clust))) # 70/30 split
> # distance between marginal distribution of cluster and split specific clustering
> #sqrt(sum((as.numeric(prop.table(table(spl$traindata$clust))) - margdist)^2)) #  invalid because this doesnt contain all clusters
> #sqrt(sum((as.numeric(prop.table(table(spl$validdata$clust))) - margdist)^2)) # 0.04049317
> # do all clusters show up in both datasets?
> length(table(spl$traindata$clust)) - length(margdist)
[1] -2
> length(table(spl$validdata$clust)) - length(margdist)
[1] -1
> 
> 
> spl2 = split_data(metals, cluster="clust")
> dim(spl2$traindata) # 181 observations = 40% of total
[1] 178  27
> dim(spl2$validdata) # 271 observations = 60% of total
[1] 274  27
> # distance between marginal distribution of cluster and split specific clustering
> 
> sqrt(sum((as.numeric(prop.table(table(spl2$traindata$clust))) - margdist)^2)) #  0.0116399
[1] 0.0119672
> sqrt(sum((as.numeric(prop.table(table(spl2$validdata$clust))) - margdist)^2)) # 0.007774251
[1] 0.007774315
> # do all clusters show up in both datasets?
> length(table(spl2$traindata$clust)) - length(margdist)
[1] 0
> length(table(spl2$validdata$clust)) - length(margdist)
[1] 0
> 
> 
> 
> proc.time()
   user  system elapsed 
   2.03    0.20    2.23