| output | github_document |
|---|
00289_example_9.1_of_section_9.1.2.R
# example 9.1 of section 9.1.2
# (example 9.1 of section 9.1.2) : Unsupervised methods : Cluster analysis : Preparing the data
# Title: Reading the protein data
protein <- read.table("../Protein/protein.txt", sep = "\t", header=TRUE)
summary(protein)## Country RedMeat WhiteMeat Eggs
## Albania : 1 Min. : 4.400 Min. : 1.400 Min. :0.500
## Austria : 1 1st Qu.: 7.800 1st Qu.: 4.900 1st Qu.:2.700
## Belgium : 1 Median : 9.500 Median : 7.800 Median :2.900
## Bulgaria : 1 Mean : 9.828 Mean : 7.896 Mean :2.936
## Czechoslovakia: 1 3rd Qu.:10.600 3rd Qu.:10.800 3rd Qu.:3.700
## Denmark : 1 Max. :18.000 Max. :14.000 Max. :4.700
## (Other) :19
## Milk Fish Cereals Starch
## Min. : 4.90 Min. : 0.200 Min. :18.60 Min. :0.600
## 1st Qu.:11.10 1st Qu.: 2.100 1st Qu.:24.30 1st Qu.:3.100
## Median :17.60 Median : 3.400 Median :28.00 Median :4.700
## Mean :17.11 Mean : 4.284 Mean :32.25 Mean :4.276
## 3rd Qu.:23.30 3rd Qu.: 5.800 3rd Qu.:40.10 3rd Qu.:5.700
## Max. :33.70 Max. :14.200 Max. :56.70 Max. :6.500
##
## Nuts Fr.Veg
## Min. :0.700 Min. :1.400
## 1st Qu.:1.500 1st Qu.:2.900
## Median :2.400 Median :3.800
## Mean :3.072 Mean :4.136
## 3rd Qu.:4.700 3rd Qu.:4.900
## Max. :7.800 Max. :7.900
##
## Country RedMeat WhiteMeat Eggs
## Albania : 1 Min. : 4.400 Min. : 1.400 Min. :0.500
## Austria : 1 1st Qu.: 7.800 1st Qu.: 4.900 1st Qu.:2.700
## Belgium : 1 Median : 9.500 Median : 7.800 Median :2.900
## Bulgaria : 1 Mean : 9.828 Mean : 7.896 Mean :2.936
## Czechoslovakia: 1 3rd Qu.:10.600 3rd Qu.:10.800 3rd Qu.:3.700
## Denmark : 1 Max. :18.000 Max. :14.000 Max. :4.700
## (Other) :19
## Milk Fish Cereals Starch
## Min. : 4.90 Min. : 0.200 Min. :18.60 Min. :0.600
## 1st Qu.:11.10 1st Qu.: 2.100 1st Qu.:24.30 1st Qu.:3.100
## Median :17.60 Median : 3.400 Median :28.00 Median :4.700
## Mean :17.11 Mean : 4.284 Mean :32.25 Mean :4.276
## 3rd Qu.:23.30 3rd Qu.: 5.800 3rd Qu.:40.10 3rd Qu.:5.700
## Max. :33.70 Max. :14.200 Max. :56.70 Max. :6.500
##
## Nuts Fr.Veg
## Min. :0.700 Min. :1.400
## 1st Qu.:1.500 1st Qu.:2.900
## Median :2.400 Median :3.800
## Mean :3.072 Mean :4.136
## 3rd Qu.:4.700 3rd Qu.:4.900
## Max. :7.800 Max. :7.90000290_example_9.2_of_section_9.1.2.R
# example 9.2 of section 9.1.2
# (example 9.2 of section 9.1.2) : Unsupervised methods : Cluster analysis : Preparing the data
# Title: Rescaling the dataset
vars_to_use <- colnames(protein)[-1] # Note: 1
pmatrix <- scale(protein[, vars_to_use])
pcenter <- attr(pmatrix, "scaled:center") # Note: 2
pscale <- attr(pmatrix, "scaled:scale")
rm_scales <- function(scaled_matrix) { # Note: 3
attr(scaled_matrix, "scaled:center") <- NULL
attr(scaled_matrix, "scaled:scale") <- NULL
scaled_matrix
}
pmatrix <- rm_scales(pmatrix) # Note: 4
# Note 1:
# Use all the columns except the first
# (Country).
# Note 2:
# Store the scaling attributes.
# Note 3:
# Convenience function to remove scale attributes from a scaled matrix.
# Note 4:
# Null the scale attributes out for safety. 00291_example_9.3_of_section_9.1.3.R
# example 9.3 of section 9.1.3
# (example 9.3 of section 9.1.3) : Unsupervised methods : Cluster analysis : Hierarchical clustering with hclust
# Title: Hierarchical clustering
distmat <- dist(pmatrix, method = "euclidean") # Note: 1
pfit <- hclust(distmat, method = "ward.D") # Note: 2
plot(pfit, labels = protein$Country) # Note: 3# Note 1:
# Create the distance matrix.
# Note 2:
# Do the clustering.
# Note 3:
# Plot the dendrogram. 00293_example_9.4_of_section_9.1.3.R
# example 9.4 of section 9.1.3
# (example 9.4 of section 9.1.3) : Unsupervised methods : Cluster analysis : Hierarchical clustering with hclust
# Title: Extracting the clusters found by hclust()
groups <- cutree(pfit, k = 5)
print_clusters = function(data, groups, columns) { # Note: 1
groupedD = split(data, groups)
lapply(groupedD,
function(df) df[, columns])
}
cols_to_print = wrapr::qc(Country, RedMeat, Fish, Fr.Veg)
print_clusters(protein, groups, cols_to_print)## $`1`
## Country RedMeat Fish Fr.Veg
## 1 Albania 10.1 0.2 1.7
## 4 Bulgaria 7.8 1.2 4.2
## 18 Romania 6.2 1.0 2.8
## 25 Yugoslavia 4.4 0.6 3.2
##
## $`2`
## Country RedMeat Fish Fr.Veg
## 2 Austria 8.9 2.1 4.3
## 3 Belgium 13.5 4.5 4.0
## 9 France 18.0 5.7 6.5
## 12 Ireland 13.9 2.2 2.9
## 14 Netherlands 9.5 2.5 3.7
## 21 Switzerland 13.1 2.3 4.9
## 22 UK 17.4 4.3 3.3
## 24 W Germany 11.4 3.4 3.8
##
## $`3`
## Country RedMeat Fish Fr.Veg
## 5 Czechoslovakia 9.7 2.0 4.0
## 7 E Germany 8.4 5.4 3.6
## 11 Hungary 5.3 0.3 4.2
## 16 Poland 6.9 3.0 6.6
## 23 USSR 9.3 3.0 2.9
##
## $`4`
## Country RedMeat Fish Fr.Veg
## 6 Denmark 10.6 9.9 2.4
## 8 Finland 9.5 5.8 1.4
## 15 Norway 9.4 9.7 2.7
## 20 Sweden 9.9 7.5 2.0
##
## $`5`
## Country RedMeat Fish Fr.Veg
## 10 Greece 10.2 5.9 6.5
## 13 Italy 9.0 3.4 6.7
## 17 Portugal 6.2 14.2 7.9
## 19 Spain 7.1 7.0 7.2
## $`1`
## Country RedMeat Fish Fr.Veg
## 1 Albania 10.1 0.2 1.7
## 4 Bulgaria 7.8 1.2 4.2
## 18 Romania 6.2 1.0 2.8
## 25 Yugoslavia 4.4 0.6 3.2
##
## $`2`
## Country RedMeat Fish Fr.Veg
## 2 Austria 8.9 2.1 4.3
## 3 Belgium 13.5 4.5 4.0
## 9 France 18.0 5.7 6.5
## 12 Ireland 13.9 2.2 2.9
## 14 Netherlands 9.5 2.5 3.7
## 21 Switzerland 13.1 2.3 4.9
## 22 UK 17.4 4.3 3.3
## 24 W Germany 11.4 3.4 3.8
##
## $`3`
## Country RedMeat Fish Fr.Veg
## 5 Czechoslovakia 9.7 2.0 4.0
## 7 E Germany 8.4 5.4 3.6
## 11 Hungary 5.3 0.3 4.2
## 16 Poland 6.9 3.0 6.6
## 23 USSR 9.3 3.0 2.9
##
## $`4`
## Country RedMeat Fish Fr.Veg
## 6 Denmark 10.6 9.9 2.4
## 8 Finland 9.5 5.8 1.4
## 15 Norway 9.4 9.7 2.7
## 20 Sweden 9.9 7.5 2.0
##
## $`5`
## Country RedMeat Fish Fr.Veg
## 10 Greece 10.2 5.9 6.5
## 13 Italy 9.0 3.4 6.7
## 17 Portugal 6.2 14.2 7.9
## 19 Spain 7.1 7.0 7.2
# Note 1:
# A convenience function for printing out the
# countries in each cluster, along with the values
# for red meat, fish, and fruit/vegetable
# consumption. We’ll use this function throughout
# this section. Note the function assumes that the
# data is in a data.frame (not a matrix). 00294_example_9.5_of_section_9.1.3.R
# example 9.5 of section 9.1.3
# (example 9.5 of section 9.1.3) : Unsupervised methods : Cluster analysis : Hierarchical clustering with hclust
# Title: Projecting the clusters on the first two principal components
library(ggplot2)
princ <- prcomp(pmatrix) # Note: 1
nComp <- 2
project <- predict(princ, pmatrix)[, 1:nComp] # Note: 2
project_plus <- cbind(as.data.frame(project), # Note: 3
cluster = as.factor(groups),
country = protein$Country)
ggplot(project_plus, aes(x = PC1, y = PC2)) + # Note: 4
geom_point(data = as.data.frame(project), color = "darkgrey") +
geom_point() +
geom_text(aes(label = country),
hjust = 0, vjust = 1) +
facet_wrap(~ cluster, ncol = 3, labeller = label_both)# Note 1:
# Calculate the principal components of the
# data.
# Note 2:
# The predict() function will rotate the data
# into the coordinates described by the principal
# components. The first two columns of the rotated data
# are the projection of the data on the first two principal
# components.
# Note 3:
# Create a data frame with the transformed
# data, along with the cluster label and country
# label of each point.
# Note 4:
# Plot it. Put each cluster in a separate facet for legibility. 00295_example_9.6_of_section_9.1.3.R
# example 9.6 of section 9.1.3
# (example 9.6 of section 9.1.3) : Unsupervised methods : Cluster analysis : Hierarchical clustering with hclust
# Title: Running clusterboot() on the protein data
library(fpc) # Note: 1
kbest_p <- 5 # Note: 2
cboot_hclust <- clusterboot(pmatrix,
clustermethod = hclustCBI, # Note: 3
method = "ward.D",
k = kbest_p)## boot 1
## boot 2
## boot 3
## boot 4
## boot 5
## boot 6
## boot 7
## boot 8
## boot 9
## boot 10
## boot 11
## boot 12
## boot 13
## boot 14
## boot 15
## boot 16
## boot 17
## boot 18
## boot 19
## boot 20
## boot 21
## boot 22
## boot 23
## boot 24
## boot 25
## boot 26
## boot 27
## boot 28
## boot 29
## boot 30
## boot 31
## boot 32
## boot 33
## boot 34
## boot 35
## boot 36
## boot 37
## boot 38
## boot 39
## boot 40
## boot 41
## boot 42
## boot 43
## boot 44
## boot 45
## boot 46
## boot 47
## boot 48
## boot 49
## boot 50
## boot 51
## boot 52
## boot 53
## boot 54
## boot 55
## boot 56
## boot 57
## boot 58
## boot 59
## boot 60
## boot 61
## boot 62
## boot 63
## boot 64
## boot 65
## boot 66
## boot 67
## boot 68
## boot 69
## boot 70
## boot 71
## boot 72
## boot 73
## boot 74
## boot 75
## boot 76
## boot 77
## boot 78
## boot 79
## boot 80
## boot 81
## boot 82
## boot 83
## boot 84
## boot 85
## boot 86
## boot 87
## boot 88
## boot 89
## boot 90
## boot 91
## boot 92
## boot 93
## boot 94
## boot 95
## boot 96
## boot 97
## boot 98
## boot 99
## boot 100
summary(cboot_hclust$result) # Note: 4 ## Length Class Mode
## result 7 hclust list
## noise 1 -none- logical
## nc 1 -none- numeric
## clusterlist 5 -none- list
## partition 25 -none- numeric
## clustermethod 1 -none- character
## nccl 1 -none- numeric
## Length Class Mode
## result 7 hclust list
## noise 1 -none- logical
## nc 1 -none- numeric
## clusterlist 5 -none- list
## partition 25 -none- numeric
## clustermethod 1 -none- character
## nccl 1 -none- numeric
groups <- cboot_hclust$result$partition # Note: 5
print_clusters(protein, groups, cols_to_print) # Note: 6 ## $`1`
## Country RedMeat Fish Fr.Veg
## 1 Albania 10.1 0.2 1.7
## 4 Bulgaria 7.8 1.2 4.2
## 18 Romania 6.2 1.0 2.8
## 25 Yugoslavia 4.4 0.6 3.2
##
## $`2`
## Country RedMeat Fish Fr.Veg
## 2 Austria 8.9 2.1 4.3
## 3 Belgium 13.5 4.5 4.0
## 9 France 18.0 5.7 6.5
## 12 Ireland 13.9 2.2 2.9
## 14 Netherlands 9.5 2.5 3.7
## 21 Switzerland 13.1 2.3 4.9
## 22 UK 17.4 4.3 3.3
## 24 W Germany 11.4 3.4 3.8
##
## $`3`
## Country RedMeat Fish Fr.Veg
## 5 Czechoslovakia 9.7 2.0 4.0
## 7 E Germany 8.4 5.4 3.6
## 11 Hungary 5.3 0.3 4.2
## 16 Poland 6.9 3.0 6.6
## 23 USSR 9.3 3.0 2.9
##
## $`4`
## Country RedMeat Fish Fr.Veg
## 6 Denmark 10.6 9.9 2.4
## 8 Finland 9.5 5.8 1.4
## 15 Norway 9.4 9.7 2.7
## 20 Sweden 9.9 7.5 2.0
##
## $`5`
## Country RedMeat Fish Fr.Veg
## 10 Greece 10.2 5.9 6.5
## 13 Italy 9.0 3.4 6.7
## 17 Portugal 6.2 14.2 7.9
## 19 Spain 7.1 7.0 7.2
## $`1`
## Country RedMeat Fish Fr.Veg
## 1 Albania 10.1 0.2 1.7
## 4 Bulgaria 7.8 1.2 4.2
## 18 Romania 6.2 1.0 2.8
## 25 Yugoslavia 4.4 0.6 3.2
##
## $`2`
## Country RedMeat Fish Fr.Veg
## 2 Austria 8.9 2.1 4.3
## 3 Belgium 13.5 4.5 4.0
## 9 France 18.0 5.7 6.5
## 12 Ireland 13.9 2.2 2.9
## 14 Netherlands 9.5 2.5 3.7
## 21 Switzerland 13.1 2.3 4.9
## 22 UK 17.4 4.3 3.3
## 24 W Germany 11.4 3.4 3.8
##
## $`3`
## Country RedMeat Fish Fr.Veg
## 5 Czechoslovakia 9.7 2.0 4.0
## 7 E Germany 8.4 5.4 3.6
## 11 Hungary 5.3 0.3 4.2
## 16 Poland 6.9 3.0 6.6
## 23 USSR 9.3 3.0 2.9
##
## $`4`
## Country RedMeat Fish Fr.Veg
## 6 Denmark 10.6 9.9 2.4
## 8 Finland 9.5 5.8 1.4
## 15 Norway 9.4 9.7 2.7
## 20 Sweden 9.9 7.5 2.0
##
## $`5`
## Country RedMeat Fish Fr.Veg
## 10 Greece 10.2 5.9 6.5
## 13 Italy 9.0 3.4 6.7
## 17 Portugal 6.2 14.2 7.9
## 19 Spain 7.1 7.0 7.2
cboot_hclust$bootmean # Note: 7 ## [1] 0.8041667 0.7688452 0.6111627 0.9088690 0.7373333
## [1] 0.8090000 0.7939643 0.6247976 0.9366667 0.7815000
cboot_hclust$bootbrd # Note: 8 ## [1] 22 16 51 9 40
## [1] 19 14 45 9 30
# Note 1:
# Load the fpc package. You may have to
# install it first.
# Note 2:
# Set the desired number of clusters.
# Note 3:
# Run clusterboot() with hclust
# (clustermethod = hclustCBI) using Ward’s method
# (method = "ward.D") and kbest_p clusters
# (k = kbest_p). Return the results in an object
# called cboot_hclust.
# Note 4:
# The results of the clustering are in
# cboot_hclust$result.
# Note 5:
# cboot_hclust$result$partition returns a
# vector of clusterlabels.
# Note 6:
# The clusters are the same as those produced
# by a direct call to hclust().
# Note 7:
# The vector of cluster stabilities.
# Note 8:
# The count of how many times each cluster was
# dissolved. By default clusterboot() runs 100
# bootstrap iterations. 00296_example_9.7_of_section_9.1.3.R
# example 9.7 of section 9.1.3
# (example 9.7 of section 9.1.3) : Unsupervised methods : Cluster analysis : Hierarchical clustering with hclust
# Title: Calculating total within sum of squares
sqr_edist <- function(x, y) { # Note: 1
sum((x - y)^2)
}
wss_cluster <- function(clustermat) { # Note: 2
c0 <- colMeans(clustermat) # Note: 3
sum(apply(clustermat, 1, FUN = function(row) { sqr_edist(row, c0) })) # Note: 4
}
wss_total <- function(dmatrix, labels) { # Note: 5
wsstot <- 0
k <- length(unique(labels))
for(i in 1:k)
wsstot <- wsstot + wss_cluster(subset(dmatrix, labels == i)) # Note: 6
wsstot
}
wss_total(pmatrix, groups) # Note: 7 ## [1] 71.94342
## [1] 71.94342
# Note 1:
# Function to calculate squared distance
# between two vectors.
# Note 2:
# Function to calculate the WSS for a single
# cluster, which is represented as a matrix (one row
# for every point).
# Note 3:
# Calculate the centroid of the cluster (the
# mean of all the points).
# Note 4:
# Calculate the squared difference of every
# point in the cluster from the centroid, and sum
# all the distances.
# Note 5:
# Function to compute the total WSS from a set
# of data points and cluster labels.
# Note 6:
# Extract each cluster, calculate the
# cluster’s WSS, and sum all the values.
# Note 7:
# Calculate the total WSS for the current protein clustering. 00297_example_9.8_of_section_9.1.3.R
# example 9.8 of section 9.1.3
# (example 9.8 of section 9.1.3) : Unsupervised methods : Cluster analysis : Hierarchical clustering with hclust
# Title: Plot WSS for a range of k
get_wss <- function(dmatrix, max_clusters) { # Note: 1
wss = numeric(max_clusters)
wss[1] <- wss_cluster(dmatrix) # Note: 2
d <- dist(dmatrix, method = "euclidean")
pfit <- hclust(d, method = "ward.D") # Note: 3
for(k in 2:max_clusters) { # Note: 4
labels <- cutree(pfit, k = k)
wss[k] <- wss_total(dmatrix, labels)
}
wss
}
kmax <- 10
cluster_meas <- data.frame(nclusters = 1:kmax,
wss = get_wss(pmatrix, kmax))
breaks <- 1:kmax
ggplot(cluster_meas, aes(x=nclusters, y = wss)) + # Note: 5
geom_point() + geom_line() +
scale_x_continuous(breaks = breaks)# Note 1:
# A function to get the total WSS for a
# range of clusters from 1 to max
# Note 2:
# wss[1] is just the WSS of all the data
# Note 3:
# Cluster the data.
# Note 4:
# For each k, calculate the cluster labels and the cluster WSS
# Note 5:
# Plot WSS as a function of k 00299_example_9.9_of_section_9.1.3.R
# example 9.9 of section 9.1.3
# (example 9.9 of section 9.1.3) : Unsupervised methods : Cluster analysis : Hierarchical clustering with hclust
# Title: Plot BSS and WSS as a function of k
total_ss <- function(dmatrix) { # Note: 1
grandmean <- colMeans(dmatrix)
sum(apply(dmatrix, 1, FUN = function(row) { sqr_edist(row, grandmean) }))
}
tss <- total_ss(pmatrix)
cluster_meas$bss <- with(cluster_meas, tss - wss)
library(cdata) # Note: 2
cmlong <- unpivot_to_blocks(cluster_meas, # Note: 3
nameForNewKeyColumn = "measure",
nameForNewValueColumn = "value",
columnsToTakeFrom = c("wss", "bss"))
ggplot(cmlong, aes(x = nclusters, y = value)) +
geom_point() + geom_line() +
facet_wrap(~measure, ncol = 1, scale = "free_y") +
scale_x_continuous(breaks = 1:10)# Note 1:
# Calculate total sum of squares TSS.
# Note 2:
# Load the cdata package to reshape the data.
# Note 3:
# Reshape cluster_meas so that WSS and BSS are in the same column. 00302_example_9.10_of_section_9.1.3.R
# example 9.10 of section 9.1.3
# (example 9.10 of section 9.1.3) : Unsupervised methods : Cluster analysis : Hierarchical clustering with hclust
# Title: The Calinski-Harabasz index
cluster_meas$B <- with(cluster_meas, bss / (nclusters - 1)) # Note: 1
n = nrow(pmatrix)
cluster_meas$W <- with(cluster_meas, wss / (n - nclusters)) # Note: 2
cluster_meas$ch_crit <- with(cluster_meas, B / W) # Note: 3
ggplot(cluster_meas, aes(x = nclusters, y = ch_crit)) +
geom_point() + geom_line() +
scale_x_continuous(breaks = 1:kmax)## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_path).
# Note 1:
# Calculate the between-cluster variance B
# Note 2:
# Calculate the within-cluster variance W
# Note 3:
# Calculate the CH index 00303_example_9.11_of_section_9.1.4.R
# example 9.11 of section 9.1.4
# (example 9.11 of section 9.1.4) : Unsupervised methods : Cluster analysis : The k-means algorithm
# Title: Running k-means with k = 5
kbest_p <- 5
pclusters <- kmeans(pmatrix, kbest_p, nstart = 100, iter.max = 100) # Note: 1
summary(pclusters) # Note: 2 ## Length Class Mode
## cluster 25 -none- numeric
## centers 45 -none- numeric
## totss 1 -none- numeric
## withinss 5 -none- numeric
## tot.withinss 1 -none- numeric
## betweenss 1 -none- numeric
## size 5 -none- numeric
## iter 1 -none- numeric
## ifault 1 -none- numeric
## Length Class Mode
## cluster 25 -none- numeric
## centers 45 -none- numeric
## totss 1 -none- numeric
## withinss 5 -none- numeric
## tot.withinss 1 -none- numeric
## betweenss 1 -none- numeric
## size 5 -none- numeric
## iter 1 -none- numeric
## ifault 1 -none- numeric
pclusters$centers # Note: 3 ## RedMeat WhiteMeat Eggs Milk Fish Cereals
## 1 1.011180399 0.7421332 0.94084150 0.5700581 -0.2671539 -0.6877583
## 2 -0.807569986 -0.8719354 -1.55330561 -1.0783324 -1.0386379 1.7200335
## 3 -0.570049402 0.5803879 -0.08589708 -0.4604938 -0.4537795 0.3181839
## 4 -0.508801956 -1.1088009 -0.41248496 -0.8320414 0.9819154 0.1300253
## 5 0.006572897 -0.2290150 0.19147892 1.3458748 1.1582546 -0.8722721
## Starch Nuts Fr.Veg
## 1 0.2288743 -0.5083895 0.02161979
## 2 -1.4234267 0.9961313 -0.64360439
## 3 0.7857609 -0.2679180 0.06873983
## 4 -0.1842010 1.3108846 1.62924487
## 5 0.1676780 -0.9553392 -1.11480485
## RedMeat WhiteMeat Eggs Milk Fish Cereals
## 1 -0.570049402 0.5803879 -0.08589708 -0.4604938 -0.4537795 0.3181839
## 2 -0.508801956 -1.1088009 -0.41248496 -0.8320414 0.9819154 0.1300253
## 3 -0.807569986 -0.8719354 -1.55330561 -1.0783324 -1.0386379 1.7200335
## 4 0.006572897 -0.2290150 0.19147892 1.3458748 1.1582546 -0.8722721
## 5 1.011180399 0.7421332 0.94084150 0.5700581 -0.2671539 -0.6877583
## Starch Nuts Fr.Veg
## 1 0.7857609 -0.2679180 0.06873983
## 2 -0.1842010 1.3108846 1.62924487
## 3 -1.4234267 0.9961313 -0.64360439
## 4 0.1676780 -0.9553392 -1.11480485
## 5 0.2288743 -0.5083895 0.02161979
pclusters$size # Note: 4 ## [1] 8 4 5 4 4
## [1] 5 4 4 4 8
groups <- pclusters$cluster # Note: 5
cols_to_print = wrapr::qc(Country, RedMeat, Fish, Fr.Veg)
print_clusters(protein, groups, cols_to_print) # Note: 6 ## $`1`
## Country RedMeat Fish Fr.Veg
## 2 Austria 8.9 2.1 4.3
## 3 Belgium 13.5 4.5 4.0
## 9 France 18.0 5.7 6.5
## 12 Ireland 13.9 2.2 2.9
## 14 Netherlands 9.5 2.5 3.7
## 21 Switzerland 13.1 2.3 4.9
## 22 UK 17.4 4.3 3.3
## 24 W Germany 11.4 3.4 3.8
##
## $`2`
## Country RedMeat Fish Fr.Veg
## 1 Albania 10.1 0.2 1.7
## 4 Bulgaria 7.8 1.2 4.2
## 18 Romania 6.2 1.0 2.8
## 25 Yugoslavia 4.4 0.6 3.2
##
## $`3`
## Country RedMeat Fish Fr.Veg
## 5 Czechoslovakia 9.7 2.0 4.0
## 7 E Germany 8.4 5.4 3.6
## 11 Hungary 5.3 0.3 4.2
## 16 Poland 6.9 3.0 6.6
## 23 USSR 9.3 3.0 2.9
##
## $`4`
## Country RedMeat Fish Fr.Veg
## 10 Greece 10.2 5.9 6.5
## 13 Italy 9.0 3.4 6.7
## 17 Portugal 6.2 14.2 7.9
## 19 Spain 7.1 7.0 7.2
##
## $`5`
## Country RedMeat Fish Fr.Veg
## 6 Denmark 10.6 9.9 2.4
## 8 Finland 9.5 5.8 1.4
## 15 Norway 9.4 9.7 2.7
## 20 Sweden 9.9 7.5 2.0
## $`1`
## Country RedMeat Fish Fr.Veg
## 5 Czechoslovakia 9.7 2.0 4.0
## 7 E Germany 8.4 5.4 3.6
## 11 Hungary 5.3 0.3 4.2
## 16 Poland 6.9 3.0 6.6
## 23 USSR 9.3 3.0 2.9
##
## $`2`
## Country RedMeat Fish Fr.Veg
## 10 Greece 10.2 5.9 6.5
## 13 Italy 9.0 3.4 6.7
## 17 Portugal 6.2 14.2 7.9
## 19 Spain 7.1 7.0 7.2
##
## $`3`
## Country RedMeat Fish Fr.Veg
## 1 Albania 10.1 0.2 1.7
## 4 Bulgaria 7.8 1.2 4.2
## 18 Romania 6.2 1.0 2.8
## 25 Yugoslavia 4.4 0.6 3.2
##
## $`4`
## Country RedMeat Fish Fr.Veg
## 6 Denmark 10.6 9.9 2.4
## 8 Finland 9.5 5.8 1.4
## 15 Norway 9.4 9.7 2.7
## 20 Sweden 9.9 7.5 2.0
##
## $`5`
## Country RedMeat Fish Fr.Veg
## 2 Austria 8.9 2.1 4.3
## 3 Belgium 13.5 4.5 4.0
## 9 France 18.0 5.7 6.5
## 12 Ireland 13.9 2.2 2.9
## 14 Netherlands 9.5 2.5 3.7
## 21 Switzerland 13.1 2.3 4.9
## 22 UK 17.4 4.3 3.3
## 24 W Germany 11.4 3.4 3.8
# Note 1:
# Run kmeans() with five clusters (kbest_p = 5),
# 100 random starts, and 100 maximum iterations per
# run.
# Note 2:
# kmeans() returns all the sum of squares
# measures.
# Note 3:
# pclusters$centers is a matrix whose rows are
# the centroids of the clusters. Note that
# pclusters$centers is in the scaled coordinates,
# not the original protein coordinates.
# Note 4:
# pclusters$size returns the number of points
# in each cluster. Generally (though not always) a
# good clustering will be fairly well balanced: no
# extremely small clusters and no extremely large
# ones.
# Note 5:
# pclusters$cluster is a vector of cluster
# labels.
# Note 6:
# In this case, kmeans() and hclust() returned
# the same clustering. This won’t always be true. 00304_example_9.12_of_section_9.1.4.R
# example 9.12 of section 9.1.4
# (example 9.12 of section 9.1.4) : Unsupervised methods : Cluster analysis : The k-means algorithm
# Title: Plotting cluster criteria
clustering_ch <- kmeansruns(pmatrix, krange = 1:10, criterion = "ch") # Note: 1
clustering_ch$bestk # Note: 2 ## [1] 2
## [1] 2
clustering_asw <- kmeansruns(pmatrix, krange = 1:10, criterion = "asw") # Note: 3
clustering_asw$bestk## [1] 3
## [1] 3
clustering_asw$crit # Note: 4 ## [1] 0.0000000 0.3271084 0.3351694 0.2617868 0.2639450 0.2734815 0.2471165
## [8] 0.2429985 0.2412922 0.2388293
## [1] 0.0000000 0.3271084 0.3351694 0.2617868 0.2639450 0.2734815 0.2471165
## [8] 0.2429985 0.2412922 0.2388293
clustering_ch$crit # Note: 5 ## [1] 0.000000 14.094814 11.417985 10.418801 10.011797 9.964967 9.861682
## [8] 9.412089 9.166676 9.075569
## [1] 0.000000 14.094814 11.417985 10.418801 10.011797 9.964967 9.861682
## [8] 9.412089 9.166676 9.075569
cluster_meas$ch_crit # Note: 6 ## [1] NaN 12.215107 10.359587 9.690891 10.011797 9.964967 9.506978
## [8] 9.092065 8.822406 8.695065
## [1] NaN 12.215107 10.359587 9.690891 10.011797 9.964967 9.506978
## [8] 9.092065 8.822406 8.695065
summary(clustering_ch) # Note: 7 ## Length Class Mode
## cluster 25 -none- numeric
## centers 18 -none- numeric
## totss 1 -none- numeric
## withinss 2 -none- numeric
## tot.withinss 1 -none- numeric
## betweenss 1 -none- numeric
## size 2 -none- numeric
## iter 1 -none- numeric
## ifault 1 -none- numeric
## crit 10 -none- numeric
## bestk 1 -none- numeric
## Length Class Mode
## cluster 25 -none- numeric
## centers 18 -none- numeric
## totss 1 -none- numeric
## withinss 2 -none- numeric
## tot.withinss 1 -none- numeric
## betweenss 1 -none- numeric
## size 2 -none- numeric
## iter 1 -none- numeric
## ifault 1 -none- numeric
## crit 10 -none- numeric
## bestk 1 -none- numeric
# Note 1:
# Run kmeansruns() from 1–10 clusters, and the
# CH criterion. By default, kmeansruns() uses 100
# random starts and 100 maximum iterations per
# run.
# Note 2:
# The CH criterion picks two clusters.
# Note 3:
# Run kmeansruns() from 1–10 clusters, and the
# average silhouette width criterion. Average
# silhouette width picks 3 clusters.
# Note 4:
# Look at the values of the asw criterion as a function of k.
# Note 5:
# Look at the values of the CH criterion as a function of k.
# Note 6:
# Compare these to the CH values for the
# hclust() clustering. They’re not quite the same,
# because the two algorithms didn’t pick the same
# clusters.
# Note 7:
# kmeansruns() also returns the output of
# kmeans for k = bestk. 00305_example_9.13_of_section_9.1.4.R
# example 9.13 of section 9.1.4
# (example 9.13 of section 9.1.4) : Unsupervised methods : Cluster analysis : The k-means algorithm
# Title: Running clusterboot() with k-means
kbest_p <- 5
cboot <- clusterboot(pmatrix, clustermethod = kmeansCBI,
runs = 100,iter.max = 100,
krange = kbest_p, seed = 15555) # Note: 1 ## boot 1
## boot 2
## boot 3
## boot 4
## boot 5
## boot 6
## boot 7
## boot 8
## boot 9
## boot 10
## boot 11
## boot 12
## boot 13
## boot 14
## boot 15
## boot 16
## boot 17
## boot 18
## boot 19
## boot 20
## boot 21
## boot 22
## boot 23
## boot 24
## boot 25
## boot 26
## boot 27
## boot 28
## boot 29
## boot 30
## boot 31
## boot 32
## boot 33
## boot 34
## boot 35
## boot 36
## boot 37
## boot 38
## boot 39
## boot 40
## boot 41
## boot 42
## boot 43
## boot 44
## boot 45
## boot 46
## boot 47
## boot 48
## boot 49
## boot 50
## boot 51
## boot 52
## boot 53
## boot 54
## boot 55
## boot 56
## boot 57
## boot 58
## boot 59
## boot 60
## boot 61
## boot 62
## boot 63
## boot 64
## boot 65
## boot 66
## boot 67
## boot 68
## boot 69
## boot 70
## boot 71
## boot 72
## boot 73
## boot 74
## boot 75
## boot 76
## boot 77
## boot 78
## boot 79
## boot 80
## boot 81
## boot 82
## boot 83
## boot 84
## boot 85
## boot 86
## boot 87
## boot 88
## boot 89
## boot 90
## boot 91
## boot 92
## boot 93
## boot 94
## boot 95
## boot 96
## boot 97
## boot 98
## boot 99
## boot 100
groups <- cboot$result$partition
print_clusters(protein, groups, cols_to_print)## $`1`
## Country RedMeat Fish Fr.Veg
## 10 Greece 10.2 5.9 6.5
## 13 Italy 9.0 3.4 6.7
## 17 Portugal 6.2 14.2 7.9
## 19 Spain 7.1 7.0 7.2
##
## $`2`
## Country RedMeat Fish Fr.Veg
## 2 Austria 8.9 2.1 4.3
## 3 Belgium 13.5 4.5 4.0
## 9 France 18.0 5.7 6.5
## 12 Ireland 13.9 2.2 2.9
## 14 Netherlands 9.5 2.5 3.7
## 21 Switzerland 13.1 2.3 4.9
## 22 UK 17.4 4.3 3.3
## 24 W Germany 11.4 3.4 3.8
##
## $`3`
## Country RedMeat Fish Fr.Veg
## 5 Czechoslovakia 9.7 2.0 4.0
## 7 E Germany 8.4 5.4 3.6
## 11 Hungary 5.3 0.3 4.2
## 16 Poland 6.9 3.0 6.6
## 23 USSR 9.3 3.0 2.9
##
## $`4`
## Country RedMeat Fish Fr.Veg
## 6 Denmark 10.6 9.9 2.4
## 8 Finland 9.5 5.8 1.4
## 15 Norway 9.4 9.7 2.7
## 20 Sweden 9.9 7.5 2.0
##
## $`5`
## Country RedMeat Fish Fr.Veg
## 1 Albania 10.1 0.2 1.7
## 4 Bulgaria 7.8 1.2 4.2
## 18 Romania 6.2 1.0 2.8
## 25 Yugoslavia 4.4 0.6 3.2
## $`1`
## Country RedMeat Fish Fr.Veg
## 1 Albania 10.1 0.2 1.7
## 4 Bulgaria 7.8 1.2 4.2
## 18 Romania 6.2 1.0 2.8
## 25 Yugoslavia 4.4 0.6 3.2
##
## $`2`
## Country RedMeat Fish Fr.Veg
## 6 Denmark 10.6 9.9 2.4
## 8 Finland 9.5 5.8 1.4
## 15 Norway 9.4 9.7 2.7
## 20 Sweden 9.9 7.5 2.0
##
## $`3`
## Country RedMeat Fish Fr.Veg
## 5 Czechoslovakia 9.7 2.0 4.0
## 7 E Germany 8.4 5.4 3.6
## 11 Hungary 5.3 0.3 4.2
## 16 Poland 6.9 3.0 6.6
## 23 USSR 9.3 3.0 2.9
##
## $`4`
## Country RedMeat Fish Fr.Veg
## 2 Austria 8.9 2.1 4.3
## 3 Belgium 13.5 4.5 4.0
## 9 France 18.0 5.7 6.5
## 12 Ireland 13.9 2.2 2.9
## 14 Netherlands 9.5 2.5 3.7
## 21 Switzerland 13.1 2.3 4.9
## 22 UK 17.4 4.3 3.3
## 24 W Germany 11.4 3.4 3.8
##
## $`5`
## Country RedMeat Fish Fr.Veg
## 10 Greece 10.2 5.9 6.5
## 13 Italy 9.0 3.4 6.7
## 17 Portugal 6.2 14.2 7.9
## 19 Spain 7.1 7.0 7.2
cboot$bootmean## [1] 0.7540000 0.7441548 0.5965675 0.8716429 0.7971667
## [1] 0.8670000 0.8420714 0.6147024 0.7647341 0.7508333
cboot$bootbrd## [1] 28 15 53 16 21
## [1] 15 20 49 17 32
# Note 1:
# We’ve set the seed for the random generator
# so the results are reproducible. 00306_example_9.14_of_section_9.1.5.R
# example 9.14 of section 9.1.5
# (example 9.14 of section 9.1.5) : Unsupervised methods : Cluster analysis : Assigning new points to clusters
# Title: A function to assign points to a cluster
assign_cluster <- function(newpt, centers, xcenter = 0, xscale = 1) {
xpt <- (newpt - xcenter) / xscale # Note: 1
dists <- apply(centers, 1, FUN = function(c0) { sqr_edist(c0, xpt) }) # Note: 2
which.min(dists) # Note: 3
}
# Note 1:
# Center and scale the new data point.
# Note 2:
# Calculate how far the new data point is from
# each of the cluster centers.
# Note 3:
# Return the cluster number of the closest
# centroid. 00307_example_9.15_of_section_9.1.5.R
# example 9.15 of section 9.1.5
# (example 9.15 of section 9.1.5) : Unsupervised methods : Cluster analysis : Assigning new points to clusters
# Title: Generate and cluster synthetic data for cluster assignment example
mean1 <- c(1, 1, 1) # Note: 1
sd1 <- c(1, 2, 1)
mean2 <- c(10, -3, 5)
sd2 <- c(2, 1, 2)
mean3 <- c(-5, -5, -5)
sd3 <- c(1.5, 2, 1)
library(MASS) # Note: 2
clust1 <- mvrnorm(100, mu = mean1, Sigma = diag(sd1))
clust2 <- mvrnorm(100, mu = mean2, Sigma = diag(sd2))
clust3 <- mvrnorm(100, mu = mean3, Sigma = diag(sd3))
toydata <- rbind(clust3, rbind(clust1, clust2))
tmatrix <- scale(toydata) # Note: 3
tcenter <- attr(tmatrix, "scaled:center") # Note: 4
tscale <-attr(tmatrix, "scaled:scale")
tmatrix <- rm_scales(tmatrix)
kbest_t <- 3
tclusters <- kmeans(tmatrix, kbest_t, nstart = 100, iter.max = 100) # Note: 5
tclusters$size # Note: 6 ## [1] 100 99 101
## [1] 101 100 99
# Note 1:
# Set the parameters for three 3D
# Gaussian clusters.
# Note 2:
# Use the mvrnorm() function from MASS package to generate
# three-dimensional axis-aligned Gaussian clusters.
# Note 3:
# Scale the synthetic data.
# Note 4:
# Get the scaling attributes, then remove them from the matrix.
# Note 5:
# Cluster the synthetic data into three clusters.
# Note 6:
# The generated clusters are consistent in size with the true clusters. 00308_example_9.16_of_section_9.1.5.R
# example 9.16 of section 9.1.5
# (example 9.16 of section 9.1.5) : Unsupervised methods : Cluster analysis : Assigning new points to clusters
# Title: Unscale the centers
unscaled = scale(tclusters$centers, center = FALSE, scale = 1 / tscale)
rm_scales(scale(unscaled, center = -tcenter, scale = FALSE)) ## [,1] [,2] [,3]
## 1 -4.7554083 -5.0841602 -5.0110629
## 2 9.9278210 -2.9398534 4.8330536
## 3 0.9833241 0.7977014 0.8149083
## [,1] [,2] [,3]
## 1 9.8234797 -3.005977 4.7662651
## 2 -4.9749654 -4.862436 -5.0577002
## 3 0.8926698 1.185734 0.833697700309_example_9.17_of_section_9.1.5.R
# example 9.17 of section 9.1.5
# (example 9.17 of section 9.1.5) : Unsupervised methods : Cluster analysis : Assigning new points to clusters
# Title: An example of assigning points to clusters
assign_cluster(mvrnorm(1, mean1, diag(sd1)), # Note: 1
tclusters$centers,
tcenter, tscale)## 3
## 3
## 3
## 3
assign_cluster(mvrnorm(1, mean2, diag(sd2)), # Note: 2
tclusters$centers,
tcenter, tscale)## 2
## 2
## 1
## 1
assign_cluster(mvrnorm(1, mean3, diag(sd3)), # Note: 3
tclusters$centers,
tcenter, tscale)## 1
## 1
## 2
## 2
# Note 1:
# # This should be assigned to cluster 3.
# Note 2:
# # This should be assigned to cluster 1.
# Note 3:
# # This should be assigned to cluster 2. 00311_example_9.18_of_section_9.2.3.R
# example 9.18 of section 9.2.3
# (example 9.18 of section 9.2.3) : Unsupervised methods : Association rules : Mining association rules with the arules package
# Title: Reading in the book data
library(arules) # Note: 1 ## Loading required package: Matrix
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
bookbaskets <- read.transactions("../Bookdata/bookdata.tsv.gz",
format = "single", # Note: 2
header = TRUE, # Note: 3
sep = "\t", # Note: 4
cols = c("userid", "title"), # Note: 5
rm.duplicates = TRUE) # Note: 6
# Note 1:
# Load the arules package.
# Note 2:
# Specify the file and the file format.
# Note 3:
# Specify that the input file has a header.
# Note 4:
# Specify the column separator (a tab).
# Note 5:
# Specify the column of transaction IDs and of
# item IDs, respectively.
# Note 6:
# Tell the function to look for and remove
# duplicate entries (for example, multiple entries
# for “The Hobbit” by the same user). 00312_example_9.19_of_section_9.2.3.R
# example 9.19 of section 9.2.3
# (example 9.19 of section 9.2.3) : Unsupervised methods : Association rules : Mining association rules with the arules package
# Title: Examining the transaction data
class(bookbaskets) # Note: 1 ## [1] "transactions"
## attr(,"package")
## [1] "arules"
## [1] "transactions"
## attr(,"package")
## [1] "arules"
bookbaskets # Note: 2 ## transactions in sparse format with
## 92108 transactions (rows) and
## 220447 items (columns)
## transactions in sparse format with
## 92108 transactions (rows) and
## 220447 items (columns)
dim(bookbaskets) # Note: 3 ## [1] 92108 220447
## [1] 92108 220447
colnames(bookbaskets)[1:5] # Note: 4 ## [1] " A Light in the Storm: The Civil War Diary of Amelia Martin, Fenwick Island, Delaware, 1861"
## [2] " Always Have Popsicles"
## [3] " Apple Magic"
## [4] " Ask Lily"
## [5] " Beyond IBM: Leadership Marketing and Finance for the 1990s"
## [1] " A Light in the Storm:[...]"
## [2] " Always Have Popsicles"
## [3] " Apple Magic"
## [4] " Ask Lily"
## [5] " Beyond IBM: Leadership Marketing and Finance for the 1990s"
rownames(bookbaskets)[1:5] # Note: 5 ## [1] "10" "1000" "100001" "100002" "100004"
## [1] "10" "1000" "100001" "100002" "100004"
# Note 1:
# The object is of class transactions.
# Note 2:
# Printing the object tells you its
# dimensions.
# Note 3:
# You can also use dim() to see the dimensions
# of the matrix.
# Note 4:
# The columns are labeled by book
# title.
# Note 5:
# The rows are labeled by customer. 00313_informalexample_9.10_of_section_9.2.3.R
# informalexample 9.10 of section 9.2.3
# (informalexample 9.10 of section 9.2.3) : Unsupervised methods : Association rules : Mining association rules with the arules package
basketSizes <- size(bookbaskets)
summary(basketSizes)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 1.0 1.0 11.1 4.0 10253.0
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 1.0 1.0 11.1 4.0 10250.000314_example_9.20_of_section_9.2.3.R
# example 9.20 of section 9.2.3
# (example 9.20 of section 9.2.3) : Unsupervised methods : Association rules : Mining association rules with the arules package
# Title: Examining the size distribution
quantile(basketSizes, probs = seq(0, 1, 0.1)) # Note: 1 ## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%
## 1 1 1 1 1 1 2 3 5 13 10253
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%
## 1 1 1 1 1 1 2 3 5 13 10253
library(ggplot2) # Note: 2
ggplot(data.frame(count = basketSizes)) +
geom_density(aes(x = count)) +
scale_x_log10()# Note 1:
# Look at the basket size distribution, in 10%
# increments.
# Note 2:
# Plot the distribution to get a better
# look. 00315_example_9.21_of_section_9.2.3.R
# example 9.21 of section 9.2.3
# (example 9.21 of section 9.2.3) : Unsupervised methods : Association rules : Mining association rules with the arules package
# Title: Count how often each book occurs
bookCount <- itemFrequency(bookbaskets, "absolute")
summary(bookCount)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 1.000 4.638 3.000 2502.000
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 1.000 4.638 3.000 2502.00000316_example_9.22_of_section_9.2.3.R
# example 9.22 of section 9.2.3
# (example 9.22 of section 9.2.3) : Unsupervised methods : Association rules : Mining association rules with the arules package
# Title: Finding the 10 most frequent books
orderedBooks <- sort(bookCount, decreasing = TRUE) # Note: 1
knitr::kable(orderedBooks[1:10]) # Note: 2 | x | |
|---|---|
| Wild Animus | 2502 |
| The Lovely Bones: A Novel | 1295 |
| She's Come Undone | 934 |
| The Da Vinci Code | 905 |
| Harry Potter and the Sorcerer's Stone | 832 |
| The Nanny Diaries: A Novel | 821 |
| A Painted House | 819 |
| Bridget Jones's Diary | 772 |
| The Secret Life of Bees | 762 |
| Divine Secrets of the Ya-Ya Sisterhood: A Novel | 737 |
# | | x|
# |:-----------------------------------------------|----:|
# |Wild Animus | 2502|
# |The Lovely Bones: A Novel | 1295|
# |She's Come Undone | 934|
# |The Da Vinci Code | 905|
# |Harry Potter and the Sorcerer's Stone | 832|
# |The Nanny Diaries: A Novel | 821|
# |A Painted House | 819|
# |Bridget Jones's Diary | 772|
# |The Secret Life of Bees | 762|
# |Divine Secrets of the Ya-Ya Sisterhood: A Novel | 737|
orderedBooks[1] / nrow(bookbaskets) # Note: 3 ## Wild Animus
## 0.02716376
## Wild Animus
## 0.02716376
# Note 1:
# Sort the counts in decreasing order.
# Note 2:
# Display the top 10 books in a nice format
# Note 3:
# The most popular book in the dataset
# occurred in fewer than 3% of the baskets. 00317_informalexample_9.11_of_section_9.2.3.R
# informalexample 9.11 of section 9.2.3
# (informalexample 9.11 of section 9.2.3) : Unsupervised methods : Association rules : Mining association rules with the arules package
bookbaskets_use <- bookbaskets[basketSizes > 1]
dim(bookbaskets_use)## [1] 40822 220447
## [1] 40822 22044700318_example_9.23_of_section_9.2.3.R
# example 9.23 of section 9.2.3
# (example 9.23 of section 9.2.3) : Unsupervised methods : Association rules : Mining association rules with the arules package
# Title: Finding the association rules
rules <- apriori(bookbaskets_use, # Note: 1
parameter = list(support = 0.002, confidence = 0.75))## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.75 0.1 1 none FALSE TRUE 5 0.002 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 81
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[216031 item(s), 40822 transaction(s)] done [1.04s].
## sorting and recoding items ... [1256 item(s)] done [0.03s].
## creating transaction tree ... done [0.02s].
## checking subsets of size 1 2 3 4 5 done [0.04s].
## writing ... [191 rule(s)] done [0.00s].
## creating S4 object ... done [0.06s].
summary(rules)## set of 191 rules
##
## rule length distribution (lhs + rhs):sizes
## 2 3 4 5
## 11 100 66 14
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.000 3.000 3.000 3.435 4.000 5.000
##
## summary of quality measures:
## support confidence lift count
## Min. :0.002009 Min. :0.7500 Min. : 40.89 Min. : 82.0
## 1st Qu.:0.002131 1st Qu.:0.8113 1st Qu.: 86.44 1st Qu.: 87.0
## Median :0.002278 Median :0.8468 Median :131.36 Median : 93.0
## Mean :0.002593 Mean :0.8569 Mean :129.68 Mean :105.8
## 3rd Qu.:0.002695 3rd Qu.:0.9065 3rd Qu.:158.77 3rd Qu.:110.0
## Max. :0.005830 Max. :0.9882 Max. :321.89 Max. :238.0
##
## mining info:
## data ntransactions support confidence
## bookbaskets_use 40822 0.002 0.75
## set of 191 rules # Note: 2
##
## rule length distribution (lhs + rhs):sizes # Note: 3
## 2 3 4 5
## 11 100 66 14
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.000 3.000 3.000 3.435 4.000 5.000
##
## summary of quality measures: # Note: 4
## support confidence lift count
## Min. :0.002009 Min. :0.7500 Min. : 40.89 Min. : 82.0
## 1st Qu.:0.002131 1st Qu.:0.8113 1st Qu.: 86.44 1st Qu.: 87.0
## Median :0.002278 Median :0.8468 Median :131.36 Median : 93.0
## Mean :0.002593 Mean :0.8569 Mean :129.68 Mean :105.8
## 3rd Qu.:0.002695 3rd Qu.:0.9065 3rd Qu.:158.77 3rd Qu.:110.0
## Max. :0.005830 Max. :0.9882 Max. :321.89 Max. :238.0
##
## mining info: # Note: 5
## data ntransactions support confidence
## bookbaskets_use 40822 0.002 0.75
# Note 1:
# Call apriori() with a minimum support of
# 0.002 and a minimum confidence of 0.75
# Note 2:
# The number of rules found
# Note 3:
# The distribution of rule lengths (in this
# example, most rules contain 3 items—2 on the left
# side, X (lhs), and one on the right side, Y
# (rhs))
# Note 4:
# A summary of rule quality measures,
# including support and confidence
# Note 5:
# Some information on how apriori() was
# called 00319_example_9.24_of_section_9.2.3.R
# example 9.24 of section 9.2.3
# (example 9.24 of section 9.2.3) : Unsupervised methods : Association rules : Mining association rules with the arules package
# Title: Scoring rules
measures <- interestMeasure(rules, # Note: 1
measure=c("coverage", "fishersExactTest"), # Note: 2
transactions = bookbaskets_use) # Note: 3
summary(measures)## coverage fishersExactTest
## Min. :0.002082 Min. : 0.000e+00
## 1st Qu.:0.002511 1st Qu.: 0.000e+00
## Median :0.002719 Median : 0.000e+00
## Mean :0.003039 Mean :5.080e-138
## 3rd Qu.:0.003160 3rd Qu.: 0.000e+00
## Max. :0.006982 Max. :9.702e-136
## coverage fishersExactTest
## Min. :0.002082 Min. : 0.000e+00
## 1st Qu.:0.002511 1st Qu.: 0.000e+00
## Median :0.002719 Median : 0.000e+00
## Mean :0.003039 Mean :5.080e-138
## 3rd Qu.:0.003160 3rd Qu.: 0.000e+00
## Max. :0.006982 Max. :9.702e-136
# Note 1:
# The first argument to interestMeasure() is
# the discovered rules
# Note 2:
# Second argument is a list of interest
# measures to apply
# Note 3:
# Last argument is a dataset to evaluate the
# interest measures over. This is usually the same
# set used to mine the rules, but it needn’t be. For
# instance, you can evaluate the rules over the full
# dataset, bookbaskets, to get coverage estimates
# that reflect all the customers, not just the ones
# who showed interest in more than one book. 00320_example_9.25_of_section_9.2.3.R
# example 9.25 of section 9.2.3
# (example 9.25 of section 9.2.3) : Unsupervised methods : Association rules : Mining association rules with the arules package
# Title: Get the five most confident rules
library(magrittr) # Note: 1
rules %>%
sort(., by = "confidence") %>% # Note: 2
head(., n = 5) %>% # Note: 3
inspect(.) # Note: 4## lhs rhs support confidence lift count
## [1] {Four to Score,
## High Five,
## Seven Up,
## Two for the Dough} => {Three To Get Deadly : A Stephanie Plum Novel} 0.002057714 0.9882353 165.33500 84
## [2] {Harry Potter and the Order of the Phoenix,
## Harry Potter and the Prisoner of Azkaban,
## Harry Potter and the Sorcerer's Stone} => {Harry Potter and the Chamber of Secrets} 0.002866102 0.9669421 72.82751 117
## [3] {Four to Score,
## High Five,
## One for the Money,
## Two for the Dough} => {Three To Get Deadly : A Stephanie Plum Novel} 0.002082211 0.9659091 161.59976 85
## [4] {Four to Score,
## Seven Up,
## Three To Get Deadly : A Stephanie Plum Novel,
## Two for the Dough} => {High Five} 0.002057714 0.9655172 180.79975 84
## [5] {High Five,
## Seven Up,
## Three To Get Deadly : A Stephanie Plum Novel,
## Two for the Dough} => {Four to Score} 0.002057714 0.9655172 167.72062 84
# Note 1:
# Attach magrittr to get pipe notation.
# Note 2:
# Sort rules by confidence.
# Note 3:
# Get the first 5 rules.
# Note 4:
# Call inspect() to pretty-print the rules. 00321_example_9.26_of_section_9.2.3.R
# example 9.26 of section 9.2.3
# (example 9.26 of section 9.2.3) : Unsupervised methods : Association rules : Mining association rules with the arules package
# Title: Finding rules with restrictions
brules <- apriori(bookbaskets_use,
parameter = list(support = 0.001, # Note: 1
confidence = 0.6),
appearance = list(rhs = c("The Lovely Bones: A Novel"), # Note: 2
default = "lhs")) # Note: 3 ## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.6 0.1 1 none FALSE TRUE 5 0.001 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 40
##
## set item appearances ...[1 item(s)] done [0.00s].
## set transactions ...[216031 item(s), 40822 transaction(s)] done [0.85s].
## sorting and recoding items ... [3172 item(s)] done [0.03s].
## creating transaction tree ... done [0.02s].
## checking subsets of size 1 2 3 4 5 6 7 8 done [0.22s].
## writing ... [46 rule(s)] done [0.04s].
## creating S4 object ... done [0.06s].
summary(brules)## set of 46 rules
##
## rule length distribution (lhs + rhs):sizes
## 3 4
## 44 2
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.000 3.000 3.000 3.043 3.000 4.000
##
## summary of quality measures:
## support confidence lift count
## Min. :0.001004 Min. :0.6000 Min. :21.81 Min. :41.00
## 1st Qu.:0.001029 1st Qu.:0.6118 1st Qu.:22.24 1st Qu.:42.00
## Median :0.001102 Median :0.6258 Median :22.75 Median :45.00
## Mean :0.001132 Mean :0.6365 Mean :23.14 Mean :46.22
## 3rd Qu.:0.001219 3rd Qu.:0.6457 3rd Qu.:23.47 3rd Qu.:49.75
## Max. :0.001396 Max. :0.7455 Max. :27.10 Max. :57.00
##
## mining info:
## data ntransactions support confidence
## bookbaskets_use 40822 0.001 0.6
## set of 46 rules
##
## rule length distribution (lhs + rhs):sizes
## 3 4
## 44 2
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.000 3.000 3.000 3.043 3.000 4.000
##
## summary of quality measures:
## support confidence lift count
## Min. :0.001004 Min. :0.6000 Min. :21.81 Min. :41.00
## 1st Qu.:0.001029 1st Qu.:0.6118 1st Qu.:22.24 1st Qu.:42.00
## Median :0.001102 Median :0.6258 Median :22.75 Median :45.00
## Mean :0.001132 Mean :0.6365 Mean :23.14 Mean :46.22
## 3rd Qu.:0.001219 3rd Qu.:0.6457 3rd Qu.:23.47 3rd Qu.:49.75
## Max. :0.001396 Max. :0.7455 Max. :27.10 Max. :57.00
##
## mining info:
## data ntransactions support confidence
## bookbaskets_use 40822 0.001 0.6
# Note 1:
# Relax the minimum support to 0.001 and the
# minimum confidence to 0.6.
# Note 2:
# Only “The Lovely Bones” is allowed to appear
# on the right side of the rules.
# Note 3:
# By default, all the books can go into the
# left side of the rules. 00322_example_9.27_of_section_9.2.3.R
# example 9.27 of section 9.2.3
# (example 9.27 of section 9.2.3) : Unsupervised methods : Association rules : Mining association rules with the arules package
# Title: Inspecting rules
brules %>%
sort(., by = "confidence") %>%
lhs(.) %>% # Note: 1
head(., n = 5) %>%
inspect(.) ## items
## [1] {Divine Secrets of the Ya-Ya Sisterhood: A Novel,
## Lucky : A Memoir}
## [2] {Lucky : A Memoir,
## The Notebook}
## [3] {Lucky : A Memoir,
## Wild Animus}
## [4] {Midwives: A Novel,
## Wicked: The Life and Times of the Wicked Witch of the West}
## [5] {Lucky : A Memoir,
## Summer Sisters}
## items
## 1 {Divine Secrets of the Ya-Ya Sisterhood: A Novel,
## Lucky : A Memoir}
## 2 {Lucky : A Memoir,
## The Notebook}
## 3 {Lucky : A Memoir,
## Wild Animus}
## 4 {Midwives: A Novel,
## Wicked: The Life and Times of the Wicked Witch of the West}
## 5 {Lucky : A Memoir,
## Summer Sisters}
# Note 1:
# Get the left hand side of the sorted rules. 00323_example_9.28_of_section_9.2.3.R
# example 9.28 of section 9.2.3
# (example 9.28 of section 9.2.3) : Unsupervised methods : Association rules : Mining association rules with the arules package
# Title: Inspecting rules with restrictions
brulesSub <- subset(brules, subset = !(lhs %in% "Lucky : A Memoir")) # Note: 1
brulesSub %>%
sort(., by = "confidence") %>%
lhs(.) %>%
head(., n = 5) %>%
inspect(.)## items
## [1] {Midwives: A Novel,
## Wicked: The Life and Times of the Wicked Witch of the West}
## [2] {She's Come Undone,
## The Secret Life of Bees,
## Wild Animus}
## [3] {A Walk to Remember,
## The Nanny Diaries: A Novel}
## [4] {Beloved,
## The Red Tent}
## [5] {The Da Vinci Code,
## The Reader}
brulesConf <- sort(brulesSub, by="confidence")
inspect(head(lhs(brulesConf), n = 5))## items
## [1] {Midwives: A Novel,
## Wicked: The Life and Times of the Wicked Witch of the West}
## [2] {She's Come Undone,
## The Secret Life of Bees,
## Wild Animus}
## [3] {A Walk to Remember,
## The Nanny Diaries: A Novel}
## [4] {Beloved,
## The Red Tent}
## [5] {The Da Vinci Code,
## The Reader}
## items
## 1 {Midwives: A Novel,
## Wicked: The Life and Times of the Wicked Witch of the West}
## 2 {She's Come Undone,
## The Secret Life of Bees,
## Wild Animus}
## 3 {A Walk to Remember,
## The Nanny Diaries: A Novel}
## 4 {Beloved,
## The Red Tent}
## 5 {The Da Vinci Code,
## The Reader}
# Note 1:
# Restrict to the subset of rules where
# Lucky is not in the left
# side. 




