クラスター分析とデンドログラムの表示
library(readr)
library(dplyr)
library(magrittr)
library(reshape2)
library(ggplot2)
library(fpc)
# データの概要
label item1 item2 item3 item4 item5 item6 item7 item8 item9 item10
1 A 5 5 5 5 5 5 5 5 5 5
2 B 5 4 4 4 4 4 4 4 3 3
3 C 5 5 5 5 5 5 5 4 5 1
4 D 5 5 5 5 5 5 5 3 4 3
6 E 5 5 5 5 5 5 5 5 3 1
7 F 5 4 4 4 4 4 4 4 3 3
# Euclidean Distance
d <- dist(dat[, -1], method = "euclidean")
# Hierarchical Clustering with Ward Method
cluster <- hclust(d, method = "ward.D2")
# Dendrogram
plot(cluster, labels = dat$label, hang = -1)
# k = nのクラスター数で囲いを入れる
rect.hclust(cluster, k = 2, border = "red")

クラスター数の決定
##Calinski-Harabasz (CH) index(Pseudo F)
# total sum of squares
totss <- function(dmatrix){
grandmean <- apply(dmatrix, 2, FUN = mean)
sum(apply(dmatrix, 1, FUN = function(row){sqr_edist(row, grandmean)}))
}
# wss
Pseudo_F <- function(dmatrix, kmax){
npts <- dim(dmatrix)[1] # number of rows.
totss <- totss(dmatrix)
wss <- numeric(kmax)
crit <- numeric(kmax)
wss[1] <- (npts - 1) * sum(apply(dmatrix, 2, var))
for(k in 2 : kmax){
d <- dist(dmatrix, method = "euclidean")
pfit <- hclust(d, method = "ward.D2")
labels <- cutree(pfit, k = k)
wss[k] <- wss.total(dmatrix, labels)
}
bss <- totss - wss # bssの計算
crit.num <- bss / (0 : (kmax - 1)) # bss/k-1
crit.denom <- wss/(npts - 1 : kmax) # wss/n-k
list(crit = crit.num / crit.denom, wss = wss, totss = totss, test= crit.num) # crit = (bss/k-1) / (wss/n-k)
}
# 結果の可視化(CH indexが極大となるクラスター数を採用)
Fcrit <- Pseudo_F(dat[, -1], 10)
Fframe <- data.frame(k = 1 : 10, ch = Fcrit$crit)
Fframe <- melt(Fframe, id.vars = c("k"),
variable.name = "measure",
value.name = "score")
ggplot(Fframe, aes(x = k, y = score, color = measure)) +
geom_point(aes(shape = measure), color = "black", size = 6, shape = 20) +
geom_line(aes(linetype = measure), color = "black", size = 1) +
theme_bw(base_size = 16, base_family = '') +
scale_x_continuous(breaks = 1 : 10, labels = 1 : 10) +
labs(x = "Number of Clusters", y = "Calinski-Harabasz (CH) Index") +
theme(legend.position = 'none')
# クラスタの安定度
kbest.p <- 4 # 任意の数
cboot.hclust <- clusterboot(dat[, -1], clustermethod = hclustCBI, method = "ward.D2", k = kbest.p)
bootMean.data <- data.frame(cluster = 1 : kbest.p, bootMeans = cboot.hclust$bootmean)
round(bootMean.data, digit = 2)
