R Under development (unstable) (2025-01-21 r87610 ucrt) -- "Unsuffered Consequences"
Copyright (C) 2025 The R Foundation for Statistical Computing
Platform: x86_64-w64-mingw32/x64

R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.

R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.

Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.

> cat("sample splits testing")
sample splits testing> library(qgcomp)
> 
> data(metals)
> metals$clust = sample(seq_len(floor(nrow(metals)/8)), nrow(metals), replace=TRUE)
> 
> set.seed(1231124)
> spl = split_data(metals)
> Xnm <- c(
+   'arsenic','barium','cadmium','calcium','chromium','copper',
+  'iron','lead','magnesium','manganese','mercury','selenium','silver',
+  'sodium','zinc'
+ )
> dim(spl$traindata) # 181 observations = 40% of total
[1] 181  27
> dim(spl$validdata) # 271 observations = 60% of total
[1] 271  27
> splitres <- qgcomp.partials(fun="qgcomp.noboot", f=y~., q=4, 
+   traindata=spl$traindata,validdata=spl$validdata, expnms=Xnm, .fixbreaks = FALSE, .globalbreaks = TRUE)
> splitres

Variables with positive effect sizes in training data: barium, cadmium, calcium, chromium, iron, manganese, mercury, silver
Variables with negative effect sizes in training data: arsenic, copper, lead, magnesium, selenium, sodium, zinc
Partial effect sizes estimated in validation data
Positive direction Mixture slope parameters (delta method CI):

            Estimate Std. Error  Lower CI Upper CI t value Pr(>|t|)
(Intercept) -0.35857    0.22376 -0.797126 0.079983 -1.6025 0.110248
psi1         0.29741    0.10265  0.096221 0.498606  2.8973 0.004082

Negative direction Mixture slope parameters (delta method CI):

            Estimate Std. Error Lower CI Upper CI t value Pr(>|t|)
(Intercept) 0.021121   0.219381 -0.40886  0.45110  0.0963   0.9234
psi1        0.024673   0.078326 -0.12884  0.17819  0.3150   0.7530
> 
> # check for break preservation
> posbr = splitres$pos.fit$breaks[[1]]
> posnm = splitres$pos.fit$expnms[[1]]
> negbr = splitres$neg.fit$breaks[[1]]
> negnm = splitres$neg.fit$expnms[[1]]
> posidx = which(splitres$train.fit$expnms == posnm)
> negidx = which(splitres$train.fit$expnms == negnm)
> stopifnot(all.equal(splitres$train.fit$breaks[[posidx]], posbr))
> stopifnot(all.equal(splitres$train.fit$breaks[[negidx]], negbr))
> 
> splitres2 <- qgcomp.partials(fun="qgcomp.noboot", f=y~., q=4, 
+                             traindata=spl$traindata,validdata=spl$validdata, expnms=Xnm, .fixbreaks = TRUE, .globalbreaks = FALSE)
> splitres2

Variables with positive effect sizes in training data: barium, cadmium, calcium, chromium, iron, lead, manganese, mercury, silver
Variables with negative effect sizes in training data: arsenic, copper, magnesium, selenium, sodium, zinc
Partial effect sizes estimated in validation data
Positive direction Mixture slope parameters (delta method CI):

            Estimate Std. Error  Lower CI Upper CI t value Pr(>|t|)
(Intercept) -0.33063    0.23726 -0.795653  0.13440 -1.3935   0.1646
psi1         0.24951    0.10624  0.041278  0.45774  2.3485   0.0196

Negative direction Mixture slope parameters (delta method CI):

            Estimate Std. Error Lower CI Upper CI t value Pr(>|t|)
(Intercept) 0.060800   0.218993 -0.36842  0.49002  0.2776   0.7815
psi1        0.011238   0.074398 -0.13458  0.15706  0.1511   0.8800
> 
> # check for break preservation
> posbr2 = splitres2$pos.fit$breaks[[1]]
> posnm2 = splitres2$pos.fit$expnms[[1]]
> negbr2 = splitres2$neg.fit$breaks[[1]]
> negnm2 = splitres2$neg.fit$expnms[[1]]
> posidx2 = which(splitres2$train.fit$expnms == posnm)
> negidx2 = which(splitres2$train.fit$expnms == negnm)
> stopifnot(all.equal(splitres2$train.fit$breaks[[posidx2]], posbr2))
> stopifnot(all.equal(splitres2$train.fit$breaks[[negidx2]], negbr2))
> 
> 
> # are clusters allocated equally across training/testing?
> margdist = as.numeric(prop.table(table(metals$clust))) # 70/30 split
> # distance between marginal distribution of cluster and split specific clustering
> #sqrt(sum((as.numeric(prop.table(table(spl$traindata$clust))) - margdist)^2)) #  invalid because this doesnt contain all clusters
> #sqrt(sum((as.numeric(prop.table(table(spl$validdata$clust))) - margdist)^2)) # 0.04049317
> # do all clusters show up in both datasets?
> length(table(spl$traindata$clust)) - length(margdist)
[1] -3
> length(table(spl$validdata$clust)) - length(margdist)
[1] 0
> 
> 
> spl2 = split_data(metals, cluster="clust")
> dim(spl2$traindata) # 181 observations = 40% of total
[1] 182  27
> dim(spl2$validdata) # 271 observations = 60% of total
[1] 270  27
> # distance between marginal distribution of cluster and split specific clustering
> 
> sqrt(sum((as.numeric(prop.table(table(spl2$traindata$clust))) - margdist)^2)) #  0.0116399
[1] 0.01223595
> sqrt(sum((as.numeric(prop.table(table(spl2$validdata$clust))) - margdist)^2)) # 0.007774251
[1] 0.008247938
> # do all clusters show up in both datasets?
> length(table(spl2$traindata$clust)) - length(margdist)
[1] 0
> length(table(spl2$validdata$clust)) - length(margdist)
[1] 0
> 
> 
> 
> proc.time()
   user  system elapsed 
   2.35    0.42    2.76