R Under development (unstable) (2025-01-21 r87610 ucrt) -- "Unsuffered Consequences" Copyright (C) 2025 The R Foundation for Statistical Computing Platform: x86_64-w64-mingw32/x64 R is free software and comes with ABSOLUTELY NO WARRANTY. You are welcome to redistribute it under certain conditions. Type 'license()' or 'licence()' for distribution details. R is a collaborative project with many contributors. Type 'contributors()' for more information and 'citation()' on how to cite R or R packages in publications. Type 'demo()' for some demos, 'help()' for on-line help, or 'help.start()' for an HTML browser interface to help. Type 'q()' to quit R. > cat("sample splits testing") sample splits testing> library(qgcomp) > > data(metals) > metals$clust = sample(seq_len(floor(nrow(metals)/8)), nrow(metals), replace=TRUE) > > set.seed(1231124) > spl = split_data(metals) > Xnm <- c( + 'arsenic','barium','cadmium','calcium','chromium','copper', + 'iron','lead','magnesium','manganese','mercury','selenium','silver', + 'sodium','zinc' + ) > dim(spl$traindata) # 181 observations = 40% of total [1] 181 27 > dim(spl$validdata) # 271 observations = 60% of total [1] 271 27 > splitres <- qgcomp.partials(fun="qgcomp.noboot", f=y~., q=4, + traindata=spl$traindata,validdata=spl$validdata, expnms=Xnm, .fixbreaks = FALSE, .globalbreaks = TRUE) > splitres Variables with positive effect sizes in training data: barium, cadmium, calcium, chromium, iron, manganese, mercury, silver Variables with negative effect sizes in training data: arsenic, copper, lead, magnesium, selenium, sodium, zinc Partial effect sizes estimated in validation data Positive direction Mixture slope parameters (delta method CI): Estimate Std. Error Lower CI Upper CI t value Pr(>|t|) (Intercept) -0.35857 0.22376 -0.797126 0.079983 -1.6025 0.110248 psi1 0.29741 0.10265 0.096221 0.498606 2.8973 0.004082 Negative direction Mixture slope parameters (delta method CI): Estimate Std. Error Lower CI Upper CI t value Pr(>|t|) (Intercept) 0.021121 0.219381 -0.40886 0.45110 0.0963 0.9234 psi1 0.024673 0.078326 -0.12884 0.17819 0.3150 0.7530 > > # check for break preservation > posbr = splitres$pos.fit$breaks[[1]] > posnm = splitres$pos.fit$expnms[[1]] > negbr = splitres$neg.fit$breaks[[1]] > negnm = splitres$neg.fit$expnms[[1]] > posidx = which(splitres$train.fit$expnms == posnm) > negidx = which(splitres$train.fit$expnms == negnm) > stopifnot(all.equal(splitres$train.fit$breaks[[posidx]], posbr)) > stopifnot(all.equal(splitres$train.fit$breaks[[negidx]], negbr)) > > splitres2 <- qgcomp.partials(fun="qgcomp.noboot", f=y~., q=4, + traindata=spl$traindata,validdata=spl$validdata, expnms=Xnm, .fixbreaks = TRUE, .globalbreaks = FALSE) > splitres2 Variables with positive effect sizes in training data: barium, cadmium, calcium, chromium, iron, lead, manganese, mercury, silver Variables with negative effect sizes in training data: arsenic, copper, magnesium, selenium, sodium, zinc Partial effect sizes estimated in validation data Positive direction Mixture slope parameters (delta method CI): Estimate Std. Error Lower CI Upper CI t value Pr(>|t|) (Intercept) -0.33063 0.23726 -0.795653 0.13440 -1.3935 0.1646 psi1 0.24951 0.10624 0.041278 0.45774 2.3485 0.0196 Negative direction Mixture slope parameters (delta method CI): Estimate Std. Error Lower CI Upper CI t value Pr(>|t|) (Intercept) 0.060800 0.218993 -0.36842 0.49002 0.2776 0.7815 psi1 0.011238 0.074398 -0.13458 0.15706 0.1511 0.8800 > > # check for break preservation > posbr2 = splitres2$pos.fit$breaks[[1]] > posnm2 = splitres2$pos.fit$expnms[[1]] > negbr2 = splitres2$neg.fit$breaks[[1]] > negnm2 = splitres2$neg.fit$expnms[[1]] > posidx2 = which(splitres2$train.fit$expnms == posnm) > negidx2 = which(splitres2$train.fit$expnms == negnm) > stopifnot(all.equal(splitres2$train.fit$breaks[[posidx2]], posbr2)) > stopifnot(all.equal(splitres2$train.fit$breaks[[negidx2]], negbr2)) > > > # are clusters allocated equally across training/testing? > margdist = as.numeric(prop.table(table(metals$clust))) # 70/30 split > # distance between marginal distribution of cluster and split specific clustering > #sqrt(sum((as.numeric(prop.table(table(spl$traindata$clust))) - margdist)^2)) # invalid because this doesnt contain all clusters > #sqrt(sum((as.numeric(prop.table(table(spl$validdata$clust))) - margdist)^2)) # 0.04049317 > # do all clusters show up in both datasets? > length(table(spl$traindata$clust)) - length(margdist) [1] -3 > length(table(spl$validdata$clust)) - length(margdist) [1] 0 > > > spl2 = split_data(metals, cluster="clust") > dim(spl2$traindata) # 181 observations = 40% of total [1] 182 27 > dim(spl2$validdata) # 271 observations = 60% of total [1] 270 27 > # distance between marginal distribution of cluster and split specific clustering > > sqrt(sum((as.numeric(prop.table(table(spl2$traindata$clust))) - margdist)^2)) # 0.0116399 [1] 0.01223595 > sqrt(sum((as.numeric(prop.table(table(spl2$validdata$clust))) - margdist)^2)) # 0.007774251 [1] 0.008247938 > # do all clusters show up in both datasets? > length(table(spl2$traindata$clust)) - length(margdist) [1] 0 > length(table(spl2$validdata$clust)) - length(margdist) [1] 0 > > > > proc.time() user system elapsed 2.35 0.42 2.76