R Under development (unstable) (2024-01-11 r85801 ucrt) -- "Unsuffered Consequences" Copyright (C) 2024 The R Foundation for Statistical Computing Platform: x86_64-w64-mingw32/x64 R is free software and comes with ABSOLUTELY NO WARRANTY. You are welcome to redistribute it under certain conditions. Type 'license()' or 'licence()' for distribution details. R is a collaborative project with many contributors. Type 'contributors()' for more information and 'citation()' on how to cite R or R packages in publications. Type 'demo()' for some demos, 'help()' for on-line help, or 'help.start()' for an HTML browser interface to help. Type 'q()' to quit R. > # fastcluster: Fast hierarchical clustering routines for R and Python > # > # Copyright: > # * Until package version 1.1.23: © 2011 Daniel Müllner > # * All changes from version 1.1.24 on: © Google Inc. > # > # Test script for the R interface > > seed = as.integer(runif(1, 0, 1e9)) > set.seed(seed) > cat(sprintf("Random seed: %d\n",seed)) Random seed: 445602103 > > print_seed <- function() { + return(sprintf(' + Please send a report to the author of the \'fastcluster\' package, Daniel Müllner. + For contact details, see . To make the error + reproducible, you must include the following number (the random seed value) in + your error report: %d.\n\n', seed)) + } > > hasWardD2 = getRversion() >= '3.1.0' > > # Compare two dendrograms and check whether they are equal, except that > # ties may be resolved differently. > compare <- function(dg1, dg2) { + h1 <- dg1$height + h2 <- dg2$height + # "height" vectors may have small numerical errors. + rdiffs <- abs(h1-h2)/pmax(abs(h1),abs(h2)) + rdiffs = rdiffs[complete.cases(rdiffs)] + rel_error <- max(rdiffs) + # We allow a relative error of 1e-13. + if (rel_error>1e-13) { + print(h1) + print(h2) + cat(sprintf('Height vectors differ! The maximum relative error is %e.\n', rel_error)) + return(FALSE) + } + # Filter the indices where consecutive merging distances are distinct. + d = diff(dg1$height) + b = (c(d,1)!=0 & c(1,d)!=0) + #cat(sprintf("Percentage of indices where we can test: %g.\n",100.0*length(b[b])/length(b))) + if (any(b)) { + m1 = dg1$merge[b,] + m2 = dg2$merge[b,] + + r = function(i) { + if (i<0) { + return(1) + } + else { + return(b[i]) + } + } + + f = sapply(m1,r) + fm1 = m1*f + fm2 = m2*f + # The "merge" matrices must be identical whereever indices are not ambiguous + # due to ties. + if (!identical(fm1,fm2)) { + cat('Merge matrices differ!\n') + return(FALSE) + } + # Compare the "order" vectors only if all merging distances were distinct. + if (all(b) && !identical(dg1$order,dg2$order)) { + cat('Order vectors differ!\n') + return(FALSE) + } + } + return(TRUE) + } > > # Generate uniformly distributed random data > generate.uniform <- function() { + n = sample(10:1000,1) + range_exp = runif(1,min=-10, max=10) + cat(sprintf("Number of sample points: %d\n",n)) + cat(sprintf("Dissimilarity range: [0,%g]\n",10^range_exp)) + d = runif(n*(n-1)/2, min=0, max=10^range_exp) + # Fake a compressed distance matrix + attributes(d) <- NULL + attr(d,"Size") <- n + attr(d, "call") <- 'N/A' + class(d) <- "dist" + return(d) + } > > # Generate normally distributed random data > generate.normal <- function() { + n = sample(10:1000,1) + dim = sample(2:20,1) + + cat (sprintf("Number of sample points: %d\n",n)) + cat (sprintf("Dimension: %d\n",dim)) + + pcd = matrix(rnorm(n*dim), c(n,dim)) + d = dist(pcd) + return(d) + } > > # Test the clustering functions when a distance matrix is given. > test.dm <- function(d) { + d2 = d + if (hasWardD2) { + methods = c('single','complete','average','mcquitty','ward.D','ward.D2','centroid','median') + } + else { + methods = c('single','complete','average','mcquitty','ward','centroid','median') + } + for (method in methods) { + cat(paste('Method :', method, '\n')) + dg_stats = stats::hclust(d, method=method) + if (method == 'ward') { + method = 'ward.D' + } + dg_fastcluster = fastcluster::hclust(d, method=method) + if (!identical(d,d2)) { + cat('Input array was corrupted!\n') + stop(print_seed()) + } + if (!compare(dg_stats, dg_fastcluster)) { + stop(print_seed()) + } + } + cat('Passed.\n') + } > > # Test the clustering functions for vector input in Euclidean space. > test.vector <- function() { + # generate test data + n = sample(10:1000,1) + dim = sample(2:20,1) + cat (sprintf("Number of sample points: %d\n",n)) + cat (sprintf("Dimension: %d\n",dim)) + + range_exp = runif(1,min=-10, max=10) + pcd = matrix(rnorm(n*dim, sd=10^range_exp), c(n,dim)) + pcd2 = pcd + # test + method='single' + cat(paste('Method:', method, '\n')) + for (metric in c('euclidean', 'maximum', 'manhattan', 'canberra', 'minkowski')) { + cat(paste(' Metric:', metric, '\n')) + if (metric=='minkowski') { + p = runif(1, min=1.0, max=10.0) + cat (sprintf(" p: %g\n",p)); + dg_fastcluster = fastcluster::hclust.vector(pcd, method=method, metric=metric, p=p) + d = dist(pcd, method=metric, p=p) + } + else { + dg_fastcluster = fastcluster::hclust.vector(pcd, method=method, metric=metric) + d = dist(pcd, method=metric) + } + d2 = d + dg_fastcluster_dist = fastcluster::hclust(d, method=method) + if (!identical(d,d2) || !identical(pcd,pcd2)) { + cat('Input array was corrupted!\n') + stop(print_seed()) + } + if (!compare(dg_fastcluster_dist, dg_fastcluster)) { + stop(print_seed()) + } + } + for (method in c('ward','centroid','median') ) { + cat(paste('Method:', method, '\n')) + dg_fastcluster = fastcluster::hclust.vector(pcd, method=method) + if (!identical(pcd,pcd2)) { + cat('Input array was corrupted!\n') + stop(print_seed()) + } + d = dist(pcd) + if(method == "ward" && hasWardD2) { + method = "ward.D2" + } + else + { + # Workaround: fastcluster::hclust expects _squared_ euclidean distances. + d = d^2 + } + d2 = d + dg_fastcluster_dist = fastcluster::hclust(d, method=method) + if (!identical(d,d2)) { + cat('Input array was corrupted!\n') + stop(print_seed()) + } + if(method != "ward.D2") { + dg_fastcluster_dist$height = sqrt(dg_fastcluster_dist$height) + } + # The Euclidean methods may have small numerical errors due to squaring/ + # taking the root in the Euclidean distances. + if (!compare(dg_fastcluster_dist, dg_fastcluster)) { + stop(print_seed()) + } + } + cat('Passed.\n') + } > > # Test the single linkage function with the "binary" metric > test.vector.binary <- function() { + # generate test data + cat (sprintf("Uniform sampling for the 'binary' metric:\n")) + n = sample(10:400,1) + dim = sample(n:(2*n),1) + cat (sprintf("Number of sample points: %d\n",n)) + cat (sprintf("Dimension: %d\n",dim)) + pcd = matrix(sample(-1:2, n*dim, replace=T), c(n,dim)) + pcd2 = pcd + # test + method='single' + metric='binary' + cat(paste('Method:', method, '\n')) + cat(paste(' Metric:', metric, '\n')) + dg_fastcluster = fastcluster::hclust.vector(pcd, method=method, metric=metric) + d = dist(pcd, method=metric) + d2 = d + dg_fastcluster_dist = fastcluster::hclust(d, method=method) + if (!identical(d,d2) || !identical(d,d2)) { + cat('Input array was corrupted!\n') + stop(print_seed()) + } + if (!compare(dg_fastcluster_dist, dg_fastcluster)) { + stop(print_seed()) + } + cat('Passed.\n') + } > > > N = 15 > for (i in (1:N)) { + if (i%%2==1) { + cat(sprintf('Random test %d of %d (uniform distribution of distances):\n',i,2*N)) + d = generate.uniform() + } + else { + cat(sprintf('Random test %d of %d (Gaussian density):\n',i,2*N)) + d = generate.normal() + } + test.dm(d) + } Random test 1 of 30 (uniform distribution of distances): Number of sample points: 557 Dissimilarity range: [0,1.63267e-06] Method : single Method : complete Method : average Method : mcquitty Method : ward.D Method : ward.D2 Method : centroid Method : median Passed. Random test 2 of 30 (Gaussian density): Number of sample points: 329 Dimension: 4 Method : single Method : complete Method : average Method : mcquitty Method : ward.D Method : ward.D2 Method : centroid Method : median Passed. Random test 3 of 30 (uniform distribution of distances): Number of sample points: 168 Dissimilarity range: [0,2.47854e+09] Method : single Method : complete Method : average Method : mcquitty Method : ward.D Method : ward.D2 Method : centroid Method : median Passed. Random test 4 of 30 (Gaussian density): Number of sample points: 477 Dimension: 16 Method : single Method : complete Method : average Method : mcquitty Method : ward.D Method : ward.D2 Method : centroid Method : median Passed. Random test 5 of 30 (uniform distribution of distances): Number of sample points: 220 Dissimilarity range: [0,2.2738e+07] Method : single Method : complete Method : average Method : mcquitty Method : ward.D Method : ward.D2 Method : centroid Method : median Passed. Random test 6 of 30 (Gaussian density): Number of sample points: 865 Dimension: 6 Method : single Method : complete Method : average Method : mcquitty Method : ward.D Method : ward.D2 Method : centroid Method : median Passed. Random test 7 of 30 (uniform distribution of distances): Number of sample points: 712 Dissimilarity range: [0,3.02168] Method : single Method : complete Method : average Method : mcquitty Method : ward.D Method : ward.D2 Method : centroid Method : median Passed. Random test 8 of 30 (Gaussian density): Number of sample points: 185 Dimension: 3 Method : single Method : complete Method : average Method : mcquitty Method : ward.D Method : ward.D2 Method : centroid Method : median Passed. Random test 9 of 30 (uniform distribution of distances): Number of sample points: 763 Dissimilarity range: [0,168684] Method : single Method : complete Method : average Method : mcquitty Method : ward.D Method : ward.D2 Method : centroid Method : median Passed. Random test 10 of 30 (Gaussian density): Number of sample points: 36 Dimension: 6 Method : single Method : complete Method : average Method : mcquitty Method : ward.D Method : ward.D2 Method : centroid Method : median Passed. Random test 11 of 30 (uniform distribution of distances): Number of sample points: 435 Dissimilarity range: [0,3.2354e-05] Method : single Method : complete Method : average Method : mcquitty Method : ward.D Method : ward.D2 Method : centroid Method : median Passed. Random test 12 of 30 (Gaussian density): Number of sample points: 111 Dimension: 3 Method : single Method : complete Method : average Method : mcquitty Method : ward.D Method : ward.D2 Method : centroid Method : median Passed. Random test 13 of 30 (uniform distribution of distances): Number of sample points: 317 Dissimilarity range: [0,273697] Method : single Method : complete Method : average Method : mcquitty Method : ward.D Method : ward.D2 Method : centroid Method : median Passed. Random test 14 of 30 (Gaussian density): Number of sample points: 112 Dimension: 16 Method : single Method : complete Method : average Method : mcquitty Method : ward.D Method : ward.D2 Method : centroid Method : median Passed. Random test 15 of 30 (uniform distribution of distances): Number of sample points: 290 Dissimilarity range: [0,1.77159e-05] Method : single Method : complete Method : average Method : mcquitty Method : ward.D Method : ward.D2 Method : centroid Method : median Passed. > for (i in (N+1:N)) { + cat(sprintf('Random test %d of %d (Gaussian density):\n',i,2*N)) + test.vector() + test.vector.binary() + } Random test 16 of 30 (Gaussian density): Number of sample points: 125 Dimension: 4 Method: single Metric: euclidean Metric: maximum Metric: manhattan Metric: canberra Metric: minkowski p: 3.88649 Method: ward Method: centroid Method: median Passed. Uniform sampling for the 'binary' metric: Number of sample points: 40 Dimension: 76 Method: single Metric: binary Passed. Random test 17 of 30 (Gaussian density): Number of sample points: 133 Dimension: 9 Method: single Metric: euclidean Metric: maximum Metric: manhattan Metric: canberra Metric: minkowski p: 3.77223 Method: ward Method: centroid Method: median Passed. Uniform sampling for the 'binary' metric: Number of sample points: 288 Dimension: 405 Method: single Metric: binary Passed. Random test 18 of 30 (Gaussian density): Number of sample points: 579 Dimension: 13 Method: single Metric: euclidean Metric: maximum Metric: manhattan Metric: canberra Metric: minkowski p: 4.78537 Method: ward Method: centroid Method: median Passed. Uniform sampling for the 'binary' metric: Number of sample points: 68 Dimension: 115 Method: single Metric: binary Passed. Random test 19 of 30 (Gaussian density): Number of sample points: 348 Dimension: 10 Method: single Metric: euclidean Metric: maximum Metric: manhattan Metric: canberra Metric: minkowski p: 5.52154 Method: ward Method: centroid Method: median Passed. Uniform sampling for the 'binary' metric: Number of sample points: 307 Dimension: 545 Method: single Metric: binary Passed. Random test 20 of 30 (Gaussian density): Number of sample points: 648 Dimension: 17 Method: single Metric: euclidean Metric: maximum Metric: manhattan Metric: canberra Metric: minkowski p: 5.56664 Method: ward Method: centroid Method: median Passed. Uniform sampling for the 'binary' metric: Number of sample points: 225 Dimension: 226 Method: single Metric: binary Passed. Random test 21 of 30 (Gaussian density): Number of sample points: 354 Dimension: 14 Method: single Metric: euclidean Metric: maximum Metric: manhattan Metric: canberra Metric: minkowski p: 7.0146 Method: ward Method: centroid Method: median Passed. Uniform sampling for the 'binary' metric: Number of sample points: 353 Dimension: 691 Method: single Metric: binary Passed. Random test 22 of 30 (Gaussian density): Number of sample points: 707 Dimension: 20 Method: single Metric: euclidean Metric: maximum Metric: manhattan Metric: canberra Metric: minkowski p: 2.1132 Method: ward Method: centroid Method: median Passed. Uniform sampling for the 'binary' metric: Number of sample points: 190 Dimension: 217 Method: single Metric: binary Passed. Random test 23 of 30 (Gaussian density): Number of sample points: 812 Dimension: 18 Method: single Metric: euclidean Metric: maximum Metric: manhattan Metric: canberra Metric: minkowski p: 1.0137 Method: ward Method: centroid Method: median Passed. Uniform sampling for the 'binary' metric: Number of sample points: 21 Dimension: 33 Method: single Metric: binary Passed. Random test 24 of 30 (Gaussian density): Number of sample points: 885 Dimension: 5 Method: single Metric: euclidean Metric: maximum Metric: manhattan Metric: canberra Metric: minkowski p: 2.73422 Method: ward Method: centroid Method: median Passed. Uniform sampling for the 'binary' metric: Number of sample points: 281 Dimension: 549 Method: single Metric: binary Passed. Random test 25 of 30 (Gaussian density): Number of sample points: 100 Dimension: 18 Method: single Metric: euclidean Metric: maximum Metric: manhattan Metric: canberra Metric: minkowski p: 9.89781 Method: ward Method: centroid Method: median Passed. Uniform sampling for the 'binary' metric: Number of sample points: 370 Dimension: 534 Method: single Metric: binary Passed. Random test 26 of 30 (Gaussian density): Number of sample points: 472 Dimension: 11 Method: single Metric: euclidean Metric: maximum Metric: manhattan Metric: canberra Metric: minkowski p: 8.37118 Method: ward Method: centroid Method: median Passed. Uniform sampling for the 'binary' metric: Number of sample points: 40 Dimension: 46 Method: single Metric: binary Passed. Random test 27 of 30 (Gaussian density): Number of sample points: 793 Dimension: 9 Method: single Metric: euclidean Metric: maximum Metric: manhattan Metric: canberra Metric: minkowski p: 9.82664 Method: ward Method: centroid Method: median Passed. Uniform sampling for the 'binary' metric: Number of sample points: 354 Dimension: 582 Method: single Metric: binary Passed. Random test 28 of 30 (Gaussian density): Number of sample points: 806 Dimension: 4 Method: single Metric: euclidean Metric: maximum Metric: manhattan Metric: canberra Metric: minkowski p: 2.23315 Method: ward Method: centroid Method: median Passed. Uniform sampling for the 'binary' metric: Number of sample points: 184 Dimension: 204 Method: single Metric: binary Passed. Random test 29 of 30 (Gaussian density): Number of sample points: 445 Dimension: 7 Method: single Metric: euclidean Metric: maximum Metric: manhattan Metric: canberra Metric: minkowski p: 5.13641 Method: ward Method: centroid Method: median Passed. Uniform sampling for the 'binary' metric: Number of sample points: 25 Dimension: 31 Method: single Metric: binary Passed. Random test 30 of 30 (Gaussian density): Number of sample points: 436 Dimension: 10 Method: single Metric: euclidean Metric: maximum Metric: manhattan Metric: canberra Metric: minkowski p: 3.19936 Method: ward Method: centroid Method: median Passed. Uniform sampling for the 'binary' metric: Number of sample points: 201 Dimension: 258 Method: single Metric: binary Passed. > > cat('Done.\n') Done. > > proc.time() user system elapsed 19.42 0.14 19.56