Adds a cluster analysis tutorial.

parent 4bee0016
@@ -41,6 +41,8 @@ navbar:
         href: tutorials/08-logistic-regression.html
       - text: "Análise discriminante"
         href: tutorials/09-linear-discriminant-analysis.html
+      - text: "Análise de agrupamento"
+        href: tutorials/10-clustering.html
       - text: "Scripts"
         icon: fa-file-text
         href: scripts/

---
title: "Cluster Analysis"
author: Prof. Walmes M. Zeviani & Prof. Eduardo V. Ferreira
date: 2017-10-26
#bibliography: ../config/Refs.bib
#csl: ../config/ABNT-UFPR-2011-Mendeley.csl
---
```{r, include = FALSE}
source("../config/setup.R")
opts_chunk$set(
    cache = FALSE,
    message = FALSE,
    warning = FALSE)
```
# K-means
* <https://www.r-statistics.com/2013/08/k-means-clustering-from-r-in-action/>;
* <https://datascienceplus.com/k-means-clustering-in-r/>;
* <https://rpubs.com/FelipeRego/K-Means-Clustering>;
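For reference, k-means looks for the partition $C_1, \ldots, C_k$ of the observations that minimizes the total within-cluster sum of squares,

$$
\mathrm{WSS} = \sum_{j = 1}^{k} \sum_{x_i \in C_j} \lVert x_i - \bar{x}_j \rVert^2,
$$

where $\bar{x}_j$ is the centroid of cluster $C_j$. This is the quantity that `kmeans()` reports in `withinss`/`tot.withinss` and that the `wssplot()` function defined below plots against the number of clusters.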
```{r}
library(lattice)
library(latticeExtra)
library(labestData)
# labestDataView()
# Prehistoric dogs from Thailand.
ManlyTb1.4
# Country development indices.
MingotiTb6.8
# Employment in European countries.
ManlyTb1.5
#-----------------------------------------------------------------------
# help(kmeans, help_type = "html")
# Metric variables only (drop the label column).
X <- ManlyTb1.4[, -1]
# Scatter plot matrix of the variables.
splom(X)
km <- kmeans(X, centers = 2, trace = TRUE)
class(km)
methods(class = class(km))
km
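# Main components of the fitted `kmeans` object: the cluster labels (used
# to color the points below), the centroids and the sums of squares.
km$centers       # centroid of each cluster.
km$size          # number of observations per cluster.
km$withinss      # within-cluster sum of squares, one value per cluster.
km$tot.withinss  # total within-cluster sum of squares.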
splom(X, groups = km$cluster)
#-----------------------------------------------------------------------
# Function to guide the choice of the number of clusters (elbow plot).
wssplot <- function(data, nc = 15, seed = 1234, ...) {
    # Within-cluster sum of squares. With a single cluster it equals the
    # total sum of squares: (n - 1) * sum of the column variances.
    wss <- (nrow(data) - 1) *
        sum(apply(data, MARGIN = 2, FUN = var))
    for (i in 2:nc) {
        set.seed(seed)
        wss[i] <- sum(kmeans(data, centers = i, ...)$withinss)
    }
    plot(x = 1:nc,
         y = wss,
         type = "b",
         xlab = "Number of clusters",
         ylab = "Within-cluster sum of squares")
}
# Metric variables only (drop the label columns).
X <- ManlyTb1.5[, -(1:2)]
Z <- scale(X)
# Original data.
wssplot(X, nc = 15, trace = TRUE)
# Standardized data.
wssplot(Z, nc = 15)
#-----------------------------------------------------------------------
# Determine the best number of clusters.
# NbClust Package for determining the best number of clusters.
library(NbClust)
# help(package = "NbClust", help_type = "html")
set.seed(1234)
nc <- NbClust(Z, min.nc = 2, max.nc = 15, method = "kmeans")
# Number of clusters chosen by each criterion.
table(nc$Best.nc[1, ])
splom(Z, groups = nc$Best.partition)
table(nc$Best.partition)
# Observation labels (first column of the dataset) in each cluster.
split(x = ManlyTb1.5[, 1], f = nc$Best.partition)
#-----------------------------------------------------------------------
# The clusters seen on the first principal component scores.
pc <- princomp(Z)
summary(pc)
screeplot(pc, type = "lines")
biplot(pc)
xyplot(pc$scores[, 2] ~ pc$scores[, 1],
       groups = nc$Best.partition)
splom(pc$scores[, 1:4],
      groups = nc$Best.partition)
```
# Hierarchical clustering
* <https://cran.r-project.org/web/packages/dendextend/vignettes/Cluster_Analysis.html>;
* <https://rpubs.com/gaston/dendrograms>;
* <http://stat.sys.i.kyoto-u.ac.jp/prog/pvclust/>;
* <https://stackoverflow.com/questions/15376075/cluster-analysis-in-r-determine-the-optimal-number-of-clusters>;
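The agglomerative methods listed in the next chunk differ only in how the distance between two groups $A$ and $B$ is computed from the pairwise distances. The average linkage (UPGMA) used below is

$$
d(A, B) = \frac{1}{|A| \, |B|} \sum_{a \in A} \sum_{b \in B} d(a, b),
$$

while single and complete linkage use the minimum and the maximum pairwise distance, respectively.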
```{r}
# Each block below scales the metric variables of one candidate dataset
# and labels its rows; the last one evaluated (MingotiTb6.8) is the Z
# used in the rest of this chunk.
ManlyTb1.4
Z <- scale(ManlyTb1.4[, -1])
rownames(Z) <- as.character(ManlyTb1.4[, 1])
Z
# Metric variables only (drop the label columns).
X <- ManlyTb1.5[, -(1:2)]
Z <- scale(X)
rownames(Z) <- ManlyTb1.5[, 1]
Z
X <- MingotiTb6.8[, -1]
Z <- scale(X)
rownames(Z) <- MingotiTb6.8[, 1]
# help(dist, help_type = "html")
# method = {"euclidean", "maximum", "manhattan", "canberra", "binary",
# "minkowski"}
# Distances between each pair of observations.
dis <- dist(as.matrix(Z), method = "euclidean")
dis
# help(hclust, help_type = "html")
# method = {"ward.D", "ward.D2", "single", "complete", "average"
# (=UPGMA), "mcquitty" (=WPGMA), "median" (=WPGMC), "centroid" (=
# UPGMC)}
layout(1)
hcl <- hclust(dis, method = "average")
# hcl <- hclust(dis, method = "ward.D")
plot(hcl, hang = -1)
# Cutting into 2 to 4 groups.
cutree(hcl, k = 2:4)
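# Extra check (not in the original script): group sizes under each cut.
apply(cutree(hcl, k = 2:4), MARGIN = 2, FUN = table)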
#-----------------------------------------------------------------------
# Improving the display by marking the cuts.
library(dendextend)
den <- as.dendrogram(hcl)
k <- 2
plot(den, ylab = "Distance")
rect.hclust(hcl, k = 2, border = "red")
den <- color_branches(dend = den, k = k)
plot(den, ylab = "Distance")
rect.dendrogram(den,
                k = k,
                lty = 2,
                border = "gray30")
# Determine the optimal number of clusters.
den_k <- find_k(den)
plot(den_k)
plot(color_branches(den, k = den_k$nc))
#-----------------------------------------------------------------------
# pvclust assesses the uncertainty of hierarchical clustering via
# multiscale bootstrap resampling (AU/BP p-values). It clusters the
# columns of the data, hence the transpose t(Z).
library(pvclust)
pvc <- pvclust(t(Z),
               method.dist = "euclidean",
               method.hclust = "average",
               nboot = 2000)
pvc
dim(Z)
plot(pvc)
pvrect(pvc, alpha = 0.9)
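# The clusters highlighted by pvrect() can also be extracted as a list of
# row labels; pvpick() is part of pvclust and alpha matches the call above.
pvpick(pvc, alpha = 0.9)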
```
# DBSCAN
* Density-Based Spatial Clustering of Applications with Noise (DBSCAN);
* <https://cran.r-project.org/web/packages/dbscan/vignettes/dbscan.pdf>;
* <https://github.com/mhahsler/dbscan>;
* <http://www.sthda.com/english/wiki/print.php?id=246>;
* <http://en.proft.me/2017/02/3/density-based-clustering-r/>;
```{r}
# Two packages implement the algorithm: dbscan and fpc.
# library(dbscan)
library(fpc)
# help(dbscan, help_type = "html")
# dbscan(data,
#        eps,
#        MinPts = 5,
#        scale = FALSE,
#        method = c("hybrid", "raw", "dist"))
# Using the iris data (first two variables, standardized).
X <- scale(iris[, 1:2])
set.seed(123)
db <- fpc::dbscan(X, eps = 0.25, MinPts = 3)
class(db)
methods(class = class(db))
db
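# Illustration (not in the original code) of what defines a core point:
# count, for each observation, how many points lie within distance eps
# (the point itself included, as in the usual DBSCAN definition). Points
# with at least MinPts neighbours are core ("seed") points; the remaining
# ones are border points or noise. Values match the eps and MinPts above.
nb <- rowSums(as.matrix(dist(X)) <= 0.25)
table(core = nb >= 3)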
# Plot DBSCAN results.
plot(db, X, main = "DBSCAN", frame = FALSE)
box()
# Prediction: the cluster label of each observation (0 = noise).
predict(db)
#-----------------------------------------------------------------------
# Create a grid crossing values of the tuning parameters.
grid <- expand.grid(eps = c(0.05, 0.1, 0.25, 0.5, 0.8),
                    MinPts = c(1, 2, 3, 5, 8, 13),
                    KEEP.OUT.ATTRS = FALSE)
# Apply the algorithm to each tuning condition (SIMPLIFY = FALSE keeps
# the result as a list of `dbscan` objects).
m <- mapply(FUN = dbscan,
            eps = grid$eps,
            MinPts = grid$MinPts,
            MoreArgs = list(data = X),
            SIMPLIFY = FALSE)
# Stack the predictions of each condition into a single vector.
pred <- c(sapply(m, predict))
# Bind the observations, the tuning conditions and the predictions.
pred <- cbind(X,
              grid[rep(seq_len(nrow(grid)),
                       each = nrow(X)), ],
              clas = pred)
str(pred)
# Convert the tuning parameters to factors (for conditioning in lattice).
pred <- transform(pred,
                  eps = factor(eps),
                  MinPts = factor(MinPts))
# grid is loaded for grid.text(), used inside the layer below.
library(grid)
useOuterStrips(
    xyplot(Sepal.Length ~ Sepal.Width | eps + MinPts,
           groups = clas,
           # auto.key = list(columns = 5),
           as.table = TRUE,
           data = pred)) +
    layer({
        # Number of distinct cluster labels (including noise) per panel.
        n <- nlevels(factor(groups[subscripts]))
        grid.text(x = 0.9, y = 0.9, label = n)
    })
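# Number of clusters found under each tuning condition (label 0 is noise
# in fpc::dbscan and is excluded from the count).
aggregate(clas ~ eps + MinPts, data = pred,
          FUN = function(x) length(unique(x[x != 0])))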
#-----------------------------------------------------------------------
# Choosing the tuning parameters: fix MinPts and look for the "knee" in
# the sorted k-nearest-neighbour distances; eps is taken near the knee.
MinPts <- 2
dbscan::kNNdistplot(X, k = MinPts)
eps <- 0.4
abline(h = eps, lty = 2)
set.seed(456)
db <- fpc::dbscan(X, eps = eps, MinPts = MinPts)
db
p <- predict(db)
# Plot DBSCAN results.
plot(db, X, frame = FALSE, asp = 1, type = "n")
points(X, col = p, pch = 19)
# Noise points (cluster label 0) are marked with a cross.
points(X[p == 0, ], pch = 4)
# Draw a circle of radius eps around each observation.
for (i in seq_len(nrow(X))) {
    plotrix::draw.circle(X[i, 1],
                         X[i, 2],
                         radius = eps,
                         border = "gray50")
}
box()
```
```{r, eval = FALSE}
# Using rpanel to control the tuning parameters interactively.
library(rpanel)
action <- function(panel) {
    with(panel, {
        EPS <- as.numeric(eps)
        MINPTS <- as.numeric(MinPts)
        db <- dbscan::dbscan(X,
                             eps = EPS,
                             minPts = MINPTS)
        p <- predict(db)
        plot(db, X, frame = FALSE, asp = 1, type = "n")
        points(X, col = p, pch = 19)
        points(X[p == 0, ], pch = 4)
        if (circle) {
            for (i in seq_len(nrow(X))) {
                plotrix::draw.circle(X[i, 1],
                                     X[i, 2],
                                     radius = EPS,
                                     border = "gray70")
            }
        }
        box()
    })
    return(panel)
}
# apropos("^rp\\.")
# help(package = "rpanel", help_type = "html")
panel <- rp.control()
# rp.textentry(panel = panel,
#              variable = eps,
#              action = action,
#              labels = c("eps"),
#              initval = c(0.4))
# rp.textentry(panel = panel,
#              variable = MinPts,
#              action = action,
#              labels = c("minPts"),
#              initval = c(5))
rp.checkbox(panel = panel,
            variable = circle,
            action = action,
            title = "Circles?",
            initval = FALSE)
rp.doublebutton(panel = panel,
                variable = eps,
                step = 0.05,
                initval = 0.4,
                range = c(0.05, 0.8),
                action = action,
                title = "eps",
                showvalue = TRUE)
rp.doublebutton(panel = panel,
                variable = MinPts,
                step = 1,
                initval = 4,
                range = c(1, 8),
                action = action,
                title = "MinPts",
                showvalue = TRUE)
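# Optional: force an initial draw of the plot (rp.do() is part of rpanel;
# this call is an addition, not in the original script).
rp.do(panel, action)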
```