R Under development (unstable) (2023-11-26 r85638 ucrt) -- "Unsuffered Consequences"
Copyright (C) 2023 The R Foundation for Statistical Computing
Platform: x86_64-w64-mingw32/x64

R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.

R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.

Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.

> ## Originally inspired by  Kasper Fischer-Rasmussen 's  clara_gower.html  [html from Rmd]
> 
> library(cluster)
> packageDescription("cluster")
Package: cluster
Version: 2.1.5
Date: 2023-11-27
Priority: recommended
Title: "Finding Groups in Data": Cluster Analysis Extended Rousseeuw et
        al.
Description: Methods for Cluster analysis.  Much extended the original
        from Peter Rousseeuw, Anja Struyf and Mia Hubert, based on
        Kaufman and Rousseeuw (1990) "Finding Groups in Data".
Maintainer: Martin Maechler <maechler@stat.math.ethz.ch>
Authors@R: c(person("Martin","Maechler", role = c("aut","cre"),
        email="maechler@stat.math.ethz.ch", comment = c(ORCID =
        "0000-0002-8685-9910")) ,person("Peter", "Rousseeuw",
        role="aut", email="peter.rousseeuw@kuleuven.be", comment =
        c("Fortran original", ORCID = "0000-0002-3807-5353"))
        ,person("Anja", "Struyf", role="aut", comment= "S original")
        ,person("Mia", "Hubert", role="aut", email=
        "Mia.Hubert@uia.ua.ac.be", comment = c("S original", ORCID =
        "0000-0001-6398-4850")) ,person("Kurt", "Hornik", role=c("trl",
        "ctb"), email="Kurt.Hornik@R-project.org", comment=c("port to
        R; maintenance(1999-2000)", ORCID="0000-0003-4198-9911"))
        ,person("Matthias", "Studer", role="ctb") ,person("Pierre",
        "Roudier", role="ctb") ,person("Juan", "Gonzalez", role="ctb")
        ,person("Kamil", "Kozlowski", role="ctb") ,person("Erich",
        "Schubert", role="ctb", comment = c("fastpam options for
        pam()", ORCID = "0000-0001-9143-4880")) ,person("Keefe",
        "Murphy", role="ctb", comment = "volume.ellipsoid({d >= 3})")
        ,person("Fischer-Rasmussen", "Kasper", role = "ctb", comment =
        "Gower distance for CLARA") )
Depends: R (>= 3.5.0)
Imports: graphics, grDevices, stats, utils
Suggests: MASS, Matrix
SuggestsNote: MASS: two examples using cov.rob() and mvrnorm(); Matrix
        tools for testing
Enhances: mvoutlier, fpc, ellipse, sfsmisc
EnhancesNote: xref-ed in man/*.Rd
LazyLoad: yes
LazyData: yes
ByteCompile: yes
BuildResaveData: no
License: GPL (>= 2)
URL: https://svn.r-project.org/R-packages/trunk/cluster/
NeedsCompilation: yes
Packaged: 2023-11-27 15:04:10 UTC; maechler
Author: Martin Maechler [aut, cre]
        (<https://orcid.org/0000-0002-8685-9910>), Peter Rousseeuw
        [aut] (Fortran original,
        <https://orcid.org/0000-0002-3807-5353>), Anja Struyf [aut] (S
        original), Mia Hubert [aut] (S original,
        <https://orcid.org/0000-0001-6398-4850>), Kurt Hornik [trl,
        ctb] (port to R; maintenance(1999-2000),
        <https://orcid.org/0000-0003-4198-9911>), Matthias Studer
        [ctb], Pierre Roudier [ctb], Juan Gonzalez [ctb], Kamil
        Kozlowski [ctb], Erich Schubert [ctb] (fastpam options for
        pam(), <https://orcid.org/0000-0001-9143-4880>), Keefe Murphy
        [ctb] (volume.ellipsoid({d >= 3})), Fischer-Rasmussen Kasper
        [ctb] (Gower distance for CLARA)
Built: R 4.4.0; x86_64-w64-mingw32; 2023-11-27 15:55:09 UTC; windows
Archs: x64

-- File: D:/RCompile/CRANincoming/R-devel/lib/cluster/Meta/package.rds 
> 
> ## carefully getting  150 + 200 + 150 = 500 obs. from the 3  xclara clusters :
> str(dd <- xclara[c(1:150, 1001:1200, 2101:2250), ])
'data.frame':	500 obs. of  2 variables:
 $ V1: num  2.07 17.94 1.08 11.12 23.71 ...
 $ V2: num  -3.24 15.78 7.32 14.41 2.56 ...
> dim(dd) # 500 2
[1] 500   2
> 
> set.seed(47)
> cl_manhat <- clara(dd, 3, metric = "manhattan", rngR=TRUE, pamLike=TRUE, samples = 500)
> cl_gower  <- clara(dd, 3, metric = "gower",     rngR=TRUE, pamLike=TRUE, samples = 500)
> 
> table(cl_manhat$cluster,
+       cl_gower $cluster)
   
      1   2   3
  1 150   0   0
  2   0 200   0
  3   0   0 150
> 
> stopifnot(exprs = {
+     ## Apart from [188], they are the same
+     ##    usually even *including* [188], but not always ???? {FIXME ??? Random? even though we use rngR?}
+     cl_manhat$cluster[-188] == cl_gower $cluster[-188]
+     identical(rle(unname(cl_manhat$cluster)),
+               structure(class = "rle",
+                         list(lengths = c(29L, 1L, 120L, 80L, 1L, 119L, 150L),
+                              values  = c( 1L, 2L,   1L,  2L, 1L,   2L,   3L))))
+ })
> ## ==> no distinction between the clusters wrt Manhattan vs. Gower's distance.
> 
> 
> ## Using {cluster}'s built in tools to compute Gower's distance.
> 
> cl_gower_full <- clara(dd, k = 3, metric = "gower", rngR = TRUE, pamLike = TRUE, samples = 500, sampsize = nrow(dd))
> dist_cl_full <- as.matrix(cl_gower_full$diss)
> i_full <- rownames(dist_cl_full)
> d_full <- data.frame(CLARA = as.vector(cl_gower_full$diss),
+                      DAISY = as.vector(daisy(dd[i_full, ], metric = "gower")))
> 
> ## MM: instead of all this, just
> all.equal(d_full$CLARA,
+           d_full$DAISY, tol=0) # "Mean relative difference: 2.17e-16"
[1] "Mean relative difference: 2.171402e-16"
> ## ... but sometimes *VERY* different (relative diff.   0.5xxx)
> if(FALSE)
+ stopifnot( all.equal(d_full$CLARA,
+                      d_full$DAISY, tol = 1e-15) ) ## equal up to  15 digits!
> 
> ## We can see that the distance measurements are identical (up to floating
> ## point rounding) when the whole data is used in the clustering. This is
> ## because the Gower distance scales the distance measurements with the range
> ## of each feature: with sampsize = nrow(dd), these ranges are those of the
> ## full data. With subsampling, approximate ranges are calculated based on
> ## each subsample, which explains the deviations.
> 
> 
> ## MM: compare -- with pam():
> dGow <- daisy(dd, metric="gower")
> cl_full <- clara(dd, k = 3, metric = "gower", rngR = TRUE, pamLike = TRUE, samples = 1, sampsize = nrow(dd))
> 
> all.equal(c(dGow) , c(cl_full$diss), tol=0) # "Mean relative difference: 2.171402e-16"
[1] "Mean relative difference: 2.171402e-16"
> 
> pam_3 <- pam(dGow, k = 3, variant = "faster")
> ## FIXME !! -- bug !?
> all.equal(pam_3  $ clustering, # we would want *identical* -- bug ??
+           cl_full$ clustering)
[1] TRUE
> all.equal(c(dGow) , c(cl_full$diss), tol = 1e-15)
[1] TRUE
> if(FALSE) ## FIXME
+ stopifnot(exprs = {
+     identical(pam_3  $ clustering,
+               cl_full$ clustering)
+     all.equal(c(dGow) , c(cl_full$diss), tol = 1e-15)
+ })
> 
> 
> proc.time()
   user  system elapsed 
   0.62    0.09    0.70