R Under development (unstable) (2023-11-26 r85638 ucrt) -- "Unsuffered Consequences" Copyright (C) 2023 The R Foundation for Statistical Computing Platform: x86_64-w64-mingw32/x64 R is free software and comes with ABSOLUTELY NO WARRANTY. You are welcome to redistribute it under certain conditions. Type 'license()' or 'licence()' for distribution details. R is a collaborative project with many contributors. Type 'contributors()' for more information and 'citation()' on how to cite R or R packages in publications. Type 'demo()' for some demos, 'help()' for on-line help, or 'help.start()' for an HTML browser interface to help. Type 'q()' to quit R. > ## Originally inspired by Kasper Fischer-Rasmussen 's clara_gower.html [html from Rmd] > > library(cluster) > packageDescription("cluster") Package: cluster Version: 2.1.5 Date: 2023-11-27 Priority: recommended Title: "Finding Groups in Data": Cluster Analysis Extended Rousseeuw et al. Description: Methods for Cluster analysis. Much extended the original from Peter Rousseeuw, Anja Struyf and Mia Hubert, based on Kaufman and Rousseeuw (1990) "Finding Groups in Data". Maintainer: Martin Maechler Authors@R: c(person("Martin","Maechler", role = c("aut","cre"), email="maechler@stat.math.ethz.ch", comment = c(ORCID = "0000-0002-8685-9910")) ,person("Peter", "Rousseeuw", role="aut", email="peter.rousseeuw@kuleuven.be", comment = c("Fortran original", ORCID = "0000-0002-3807-5353")) ,person("Anja", "Struyf", role="aut", comment= "S original") ,person("Mia", "Hubert", role="aut", email= "Mia.Hubert@uia.ua.ac.be", comment = c("S original", ORCID = "0000-0001-6398-4850")) ,person("Kurt", "Hornik", role=c("trl", "ctb"), email="Kurt.Hornik@R-project.org", comment=c("port to R; maintenance(1999-2000)", ORCID="0000-0003-4198-9911")) ,person("Matthias", "Studer", role="ctb") ,person("Pierre", "Roudier", role="ctb") ,person("Juan", "Gonzalez", role="ctb") ,person("Kamil", "Kozlowski", role="ctb") ,person("Erich", "Schubert", role="ctb", comment = c("fastpam options for pam()", ORCID = "0000-0001-9143-4880")) ,person("Keefe", "Murphy", role="ctb", comment = "volume.ellipsoid({d >= 3})") ,person("Fischer-Rasmussen", "Kasper", role = "ctb", comment = "Gower distance for CLARA") ) Depends: R (>= 3.5.0) Imports: graphics, grDevices, stats, utils Suggests: MASS, Matrix SuggestsNote: MASS: two examples using cov.rob() and mvrnorm(); Matrix tools for testing Enhances: mvoutlier, fpc, ellipse, sfsmisc EnhancesNote: xref-ed in man/*.Rd LazyLoad: yes LazyData: yes ByteCompile: yes BuildResaveData: no License: GPL (>= 2) URL: https://svn.r-project.org/R-packages/trunk/cluster/ NeedsCompilation: yes Packaged: 2023-11-27 15:04:10 UTC; maechler Author: Martin Maechler [aut, cre] (), Peter Rousseeuw [aut] (Fortran original, ), Anja Struyf [aut] (S original), Mia Hubert [aut] (S original, ), Kurt Hornik [trl, ctb] (port to R; maintenance(1999-2000), ), Matthias Studer [ctb], Pierre Roudier [ctb], Juan Gonzalez [ctb], Kamil Kozlowski [ctb], Erich Schubert [ctb] (fastpam options for pam(), ), Keefe Murphy [ctb] (volume.ellipsoid({d >= 3})), Fischer-Rasmussen Kasper [ctb] (Gower distance for CLARA) Built: R 4.4.0; x86_64-w64-mingw32; 2023-11-27 15:55:09 UTC; windows Archs: x64 -- File: D:/RCompile/CRANincoming/R-devel/lib/cluster/Meta/package.rds > > ## carefully getting 150 + 200 + 150 = 500 obs. from the 3 xclara clusters : > str(dd <- xclara[c(1:150, 1001:1200, 2101:2250), ]) 'data.frame': 500 obs. of 2 variables: $ V1: num 2.07 17.94 1.08 11.12 23.71 ... $ V2: num -3.24 15.78 7.32 14.41 2.56 ... > dim(dd) # 500 2 [1] 500 2 > > set.seed(47) > cl_manhat <- clara(dd, 3, metric = "manhattan", rngR=TRUE, pamLike=TRUE, samples = 500) > cl_gower <- clara(dd, 3, metric = "gower", rngR=TRUE, pamLike=TRUE, samples = 500) > > table(cl_manhat$cluster, + cl_gower $cluster) 1 2 3 1 150 0 0 2 0 200 0 3 0 0 150 > > stopifnot(exprs = { + ## Apart from [188], they are the same + ## usually even *including* [188], but not always ???? {FIXME ??? Random? even we use rngR?} + cl_manhat$cluster[-188] == cl_gower $cluster[-188] + identical(rle(unname(cl_manhat$cluster)), + structure(class = "rle", + list(lengths = c(29L, 1L, 120L, 80L, 1L, 119L, 150L), + values = c( 1L, 2L, 1L, 2L, 1L, 2L, 3L)))) + }) > ## ==> no distinction between the clusters wrt Manhattan vs. Gower's distance. > > > ## Using {cluster}'s built in tools to compute Gower's distance. > > cl_gower_full <- clara(dd, k = 3, metric = "gower", rngR = TRUE, pamLike = TRUE, samples = 500, sampsize = nrow(dd)) > dist_cl_full <- as.matrix(cl_gower_full$diss) > i_full <- rownames(dist_cl_full) > d_full <- data.frame(CLARA = as.vector(cl_gower_full$diss), + DAISY = as.vector(daisy(dd[i_full, ], metric = "gower"))) > > ## MM: instead of all this, just > all.equal(d_full$CLARA, + d_full$DAISY, tol=0) # "Mean relative difference: 2.17e-16" [1] "Mean relative difference: 2.171402e-16" > ## ... but sometimes *VERY* different (relative diff. 0.5xxx) > if(FALSE) + stopifnot( all.equal(d_full$CLARA, + d_full$DAISY, tol = 1e-15) ) ## equal up to 15 digits! > > ## We can see that the distance measurements are exactly identical when the > ## whole data is used in the clustering. This is because the Gower distance > ## scales the distances measurements with the range of each feature. Due to > ## the subsampling, approximate ranges are calculated based on each > ## subsample explaining the deviations. > > > ## MM: compare -- with pam(): > dGow <- daisy(dd, metric="gower") > cl_full <- clara(dd, k = 3, metric = "gower", rngR = TRUE, pamLike = TRUE, samples = 1, sampsize = nrow(dd)) > > all.equal(c(dGow) , c(cl_full$diss), tol=0) # "Mean relative difference: 2.171402e-16" [1] "Mean relative difference: 2.171402e-16" > > pam_3 <- pam(dGow, k = 3, variant = "faster") > ## FIXME !! -- bug !? > all.equal(pam_3 $ clustering, # we would want *identical* -- bug ?? + cl_full$ clustering) [1] TRUE > all.equal(c(dGow) , c(cl_full$diss), tol = 1e-15) [1] TRUE > if(FALSE) ## FIXME + stopifnot(exprs = { + identical(pam_3 $ clustering, + cl_full$ clustering) + all.equal(c(dGow) , c(cl_full$diss), tol = 1e-15) + }) > > > proc.time() user system elapsed 0.62 0.09 0.70