4. Clustering

[1]:

# Simulate a toy two-cluster data set with two numeric features.
#
# N: number of observations drawn per cluster (default 30).
#
# Returns a list with two elements:
#   X - data.frame with columns x1 and x2 and 2 * N rows; in each column the
#       first N values are drawn from N(2.5, sd 2.5) and the last N values
#       from N(10.5, sd 2.5).
#   y - numeric vector of ground-truth labels: 0 for the first N rows and
#       1 for the last N rows.
getData <- function(N=30) {
    # One feature column: N draws around 2.5 followed by N draws around 10.5.
    drawFeature <- function() {
        lowGroup <- rnorm(N, mean=2.5, sd=2.5)
        highGroup <- rnorm(N, mean=10.5, sd=2.5)
        c(lowGroup, highGroup)
    }

    # x1 is sampled completely before x2, so the random-number stream is
    # consumed column by column.
    x1 <- drawFeature()
    x2 <- drawFeature()

    features <- data.frame(x1=x1, x2=x2)
    labels <- c(rep(0, N), rep(1, N))

    list(X=features, y=labels)
}

# Draw the sample data set used by the clustering examples that follow
# (default N, i.e. 30 points per cluster).
D <- getData()

4.1. K-means

[2]:

# Partition the observations into two clusters with k-means.
m <- kmeans(x = D$X, centers = 2)

[3]:

# Show the k-means fit: cluster sizes, cluster means, the per-observation
# assignment vector and the within-cluster sums of squares.
print(m)

K-means clustering with 2 clusters of sizes 30, 30

Cluster means:
         x1        x2
1 10.519288 10.215782
2  1.618027  2.197318

Clustering vector:
 [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1
[39] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1

Within cluster sum of squares by cluster:
[1] 306.8271 403.1129
 (between_SS / total_SS =  75.2 %)

Available components:

[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
[6] "betweenss"    "size"         "iter"         "ifault"

[4]:

# Total sum of squares component of the k-means fit.
print(m[["totss"]])

[1] 2862.863

[5]:

# Request a square 5 x 5 plot area (repr.plot.* options; presumably read by
# the Jupyter R kernel - confirm if running elsewhere).
options(repr.plot.width=5, repr.plot.height=5)

# Scatter plot of x1 against x2, colored by the assigned cluster number.
plot(D$X, col=m$cluster)
# Overlay the two cluster centers as enlarged pch=8 symbols; center i uses
# color i, the same color as the points assigned to cluster i.
points(m$centers, col=1:2, pch=8, cex=2)

[6]:

# 'cluster' provides clusplot() (and silhouette(), used further down).
library('cluster')

# Square 5 x 5 plot area, as for the previous figure.
options(repr.plot.width=5, repr.plot.height=5)

# Cluster plot of the k-means assignment with colored, shaded ellipses.
# NOTE(review): per the clusplot documentation, labels=2 labels all points
# and ellipses and lines=0 draws no connecting lines - confirm against the
# installed 'cluster' version.
clusplot(D$X, m$cluster, color=TRUE, shade=TRUE, labels=2, lines=0)

[7]:

# 'fpc' provides plotcluster() (and cluster.stats(), used further down).
library('fpc')

# Square 5 x 5 plot area, as for the previous figures.
options(repr.plot.width=5, repr.plot.height=5)
# Plot the observations labelled by their k-means cluster using fpc's
# default projection method (see the plotcluster documentation).
plotcluster(D$X, m$cluster)

4.2. Ward hierarchical clustering

[8]:

# Pairwise Euclidean distances between observations, then agglomerative
# clustering with the "ward.D2" linkage method. Note that m is reused here
# and no longer holds the k-means fit.
d <- dist(x = D$X, method = "euclidean")
m <- hclust(d = d, method = "ward.D2")

[9]:

# Show the hierarchical clustering summary: the call, the cluster method,
# the distance used and the number of objects.
print(m)


Call:
hclust(d = d, method = "ward.D2")

Cluster method   : ward.D2
Distance         : euclidean
Number of objects: 60

[10]:

# Use a wider 10 x 8 plot area for this figure than the 5 x 5 used above.
options(repr.plot.width=10, repr.plot.height=8)

# Draw the dendrogram of the hierarchical clustering.
plot(m)

4.3. Model based

[11]:

# 'mclust' provides Mclust() for model-based (Gaussian mixture) clustering.
library('mclust')

# Fit with Mclust's defaults: neither the covariance model nor the number of
# components is specified here, so the fit reported by summary() is whatever
# Mclust selects. m is reused and no longer holds the hclust result.
m <- Mclust(D$X)

Package 'mclust' version 5.4.5
Type 'citation("mclust")' for citing this R package in publications.

[12]:

# Summarize the selected mixture model: model name, number of components,
# log-likelihood, BIC/ICL and the clustering table.
print(summary(m))

----------------------------------------------------
Gaussian finite mixture model fitted by EM algorithm
----------------------------------------------------

Mclust EII (spherical, equal volume) model with 2 components:

 log-likelihood  n df       BIC       ICL
      -318.0198 60  6 -660.6057 -661.7037

Clustering table:
 1  2
30 30

[13]:

# Draw the diagnostic plots for the Mclust fit.
# NOTE(review): in an interactive console plot() on an Mclust object may
# prompt for which plot to show - confirm behaviour outside the notebook.
plot(m)

4.4. Comparing clustering solutions

[14]:

# Fit the two clusterings that are to be compared on the same data.
m1 <- kmeans(D$X, 2)
m2 <- Mclust(D$X)

# Distance matrix on which the validation statistics are computed.
d <- dist(D$X, method='euclidean')

# cluster.stats(d, clustering, alt.clustering): statistics for the k-means
# solution plus agreement measures against the alternative clustering.
# Mclust objects keep their hard labels in `classification`; they have no
# `cluster` component, so passing `m2$cluster` hands over NULL and the
# comparison fields (corrected.rand, vi) come back NULL.
s <- cluster.stats(d, m1$cluster, m2$classification)
print(s)

$n
[1] 60

$cluster.number
[1] 2

$cluster.size
[1] 30 30

$min.cluster.size
[1] 30

$noisen
[1] 0

$diameter
[1] 10.89581 15.15251

$average.distance
[1] 4.034249 4.647682

$median.distance
[1] 3.649638 4.319535

$separation
[1] 3.633523 3.633523

$average.toother
[1] 12.4389 12.4389

$separation.matrix
         [,1]     [,2]
[1,] 0.000000 3.633523
[2,] 3.633523 0.000000

$ave.between.matrix
        [,1]    [,2]
[1,]  0.0000 12.4389
[2,] 12.4389  0.0000

$average.between
[1] 12.4389

$average.within
[1] 4.340965

$n.between
[1] 900

$n.within
[1] 870

$max.diameter
[1] 15.15251

$min.separation
[1] 3.633523

$within.cluster.ss
[1] 709.94

$clus.avg.silwidths
        1         2
0.6738004 0.6067246

$avg.silwidth
[1] 0.6402625

$g2
NULL

$g3
NULL

$pearsongamma
[1] 0.8017156

$dunn
[1] 0.2397967

$dunn2
[1] 2.676367

$entropy
[1] 0.6931472

$wb.ratio
[1] 0.348983

$ch
[1] 175.8875

$cwidegap
[1] 3.071360 5.367805

$widestgap
[1] 5.367805

$sindex
[1] 3.88928

$corrected.rand
NULL

$vi
NULL

4.5. Silhouette score

[15]:

# Silhouette widths of the k-means labels, computed from the distances in d;
# s is reused here for the silhouette object.
s <- silhouette(x = m1$cluster, dist = d)
print(summary(s))

Silhouette of 60 units in 2 clusters from silhouette.default(x = m1$cluster, dist = d) :
 Cluster sizes and average silhouette widths:
       30        30
0.6738004 0.6067246
Individual silhouette widths:
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
0.04566 0.60988 0.68303 0.64026 0.72343 0.75897

[16]:

# Silhouette plot, with one color supplied per cluster (two clusters here).
plot(s, col=c('red', 'blue'))