Clustering
Data
In this exercise, we’ll use the same wine data introduced in the ensemble exercises. As noted before, all the numerical features have units. Additionally, the original objective of this dataset was to predict the wine quality from the other features (supervised learning). The data will only be used for an unsupervised task here.
First, load the data and scale the numerical features:
wine <- read.csv(here::here("labs/data/Wine.csv"))
row.names(wine) <- paste("W", c(1:nrow(wine)), sep="") # row names are used later
head(wine)
summary(wine)
wine[,-12] <- scale(wine[,-12]) ## scale all the features except "quality"
library(reticulate)
use_condaenv("MLBA")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score
import os

# Find the correct path to the data file
# Try different possible locations
possible_paths = [
    'labs/data/Wine.csv',       # From project root
    '../../data/Wine.csv',      # Relative to current file
    '../../../data/Wine.csv',   # One level up
    'data/Wine.csv',            # Direct in data folder
    'Wine.csv'                  # In current directory
]

wine_data = None
for path in possible_paths:
    try:
        wine_data = pd.read_csv(path)
        print(f"Successfully loaded data from: {path}")
        break
    except FileNotFoundError:
        continue

if wine_data is None:
    # If we can't find the file, create a small sample dataset for demonstration
    print("Could not find Wine.csv, creating sample data for demonstration")
    np.random.seed(42)
    wine_data = pd.DataFrame(
        np.random.randn(100, 11),
        columns=[f'feature_{i}' for i in range(11)]
    )
    wine_data['quality'] = np.random.randint(3, 9, size=100)

# Continue with the analysis using wine_data
wine = wine_data
wine.index = ["W" + str(i) for i in range(1, len(wine)+1)]
scaler = StandardScaler()
wine_scaled = pd.DataFrame(
    scaler.fit_transform(wine.iloc[:, :-1]),
    index=wine.index,
    columns=wine.columns[:-1]
)
X = wine_scaled.values
Please note that all the interpretations will be based on the R outputs (as for PCA and most other exercises). As usual, the Python outputs may be slightly different due to differences in the implementations of the algorithms.
Hierarchical clustering
Distances
We apply here an agglomerative hierarchical clustering (AGNES). Only the numerical features are used. First, we compute the distances and plot them; we use the Manhattan distance below.
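As a reminder, the Manhattan (city-block) distance between two wines \(x\) and \(y\) described by \(p\) features is
\[ d(x, y) = \sum_{j=1}^{p} \lvert x_j - y_j \rvert . \]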
library(reshape2) # contains the melt function
library(ggplot2)
wine_d <- dist(wine[,-12], method = "manhattan") # matrix of Manhattan distances
wine_melt <- melt(as.matrix(wine_d)) # create a data frame of the distances in long format
head(wine_melt)
ggplot(data = wine_melt, aes(x=Var1, y=Var2, fill=value)) +
geom_tile()
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# wine_d = pd.DataFrame(np.abs(wine.iloc[:, :-1].values[:, None] - wine.iloc[:, :-1].values), columns=wine.index, index=wine.index)
# from scipy.spatial.distance import pdist, squareform
# wine_d = pdist(wine.iloc[:, :-1], metric='cityblock')
# wine_d = squareform(wine_d)
# wine_d_df = pd.DataFrame(wine_d, index=wine.index, columns=wine.index)
from scipy.spatial.distance import cdist
wine_d = pd.DataFrame(cdist(wine.iloc[:, :-1], wine.iloc[:, :-1], metric='cityblock'), columns=wine.index, index=wine.index)

wine_melt = wine_d.reset_index().melt(id_vars='index', var_name='Var1', value_name='value')

plt.figure(figsize=(10, 8))
sns.heatmap(wine_d, cmap="coolwarm", center=0)
plt.show()
We can see that some wines are closer than others (darker color). However, it is not really possible to extract any information from such a graph.
Dendrogram
Now, we build a dendrogram using complete linkage.
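With complete linkage, the distance between two clusters \(A\) and \(B\) is the largest pairwise distance between their members,
\[ D(A, B) = \max_{a \in A, \; b \in B} d(a, b), \]
so two clusters are merged only when all their members are relatively close.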
wine_hc <- hclust(wine_d, method = "complete")
plot(wine_hc, hang=-1)
from scipy.cluster.hierarchy import dendrogram, linkage
wine_linkage = linkage(wine.iloc[:, :-1], method='complete', metric='cityblock')
plt.figure(figsize=(10, 8))
dendrogram(wine_linkage, labels=wine.index, orientation='top', color_threshold=0, leaf_font_size=10)
plt.show()
We cut the tree into 4 clusters and represent the result. We also extract the cluster assignment of each wine.
plot(wine_hc, hang=-1)
rect.hclust(wine_hc, k=4)
wine_clust <- cutree(wine_hc, k=4)
wine_clust
from scipy.cluster.hierarchy import fcluster
plt.figure(figsize=(10, 8))
dendrogram(wine_linkage, labels=wine.index, orientation='top', color_threshold=80, leaf_font_size=10)
plt.axhline(y=80, color='black', linestyle='--')
plt.show()

wine_clust = fcluster(wine_linkage, 4, criterion='maxclust')
wine_clust
Interpretation of the clusters
Now we analyze the clusters by looking at the distribution of the features within each cluster.
wine_comp <- data.frame(wine[,-12], Clust=factor(wine_clust), Id=row.names(wine))
wine_df <- melt(wine_comp, id=c("Id", "Clust"))
head(wine_df)
ggplot(wine_df, aes(y=value, group=Clust, fill=Clust)) +
geom_boxplot() +
facet_wrap(~variable, ncol=4, nrow=3)
wine_comp = wine.iloc[:, :-1].copy()
wine_comp['Clust'] = wine_clust
wine_comp['Id'] = wine.index
wine_melt = wine_comp.melt(id_vars=['Id', 'Clust'])

plt.figure(figsize=(14, 10))
sns.boxplot(x='variable', y='value', hue='Clust', data=wine_melt)
plt.xticks(rotation=90)
plt.legend(title='Cluster')
plt.show()
We can see, for example, that
- Cluster 4 (the smallest): small pH, large fixed.acidity and citric.acid, a large density, and a small alcohol. Also a large free.sulfur.dioxide.
- Cluster 2: also has a large fixed.acidity but not citric.acid. It looks less acidic than cluster 3.
- Cluster 3: apparently has a large alcohol.
- Etc.
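To back up this reading of the boxplots, the cluster sizes and the per-cluster means of the (scaled) features can also be tabulated directly; a minimal sketch in R, reusing the objects above:
table(wine_clust)  # cluster sizes
round(aggregate(wine[,-12], by = list(Clust = wine_clust), FUN = mean), 2)  # mean of each scaled feature per cluster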
Choice of the number of clusters
To choose the number of clusters, we can inspect the dendrogram (judgmental approach), or we can rely on a statistic. Below, we use the within sum-of-squares, the gap statistic, and the silhouette. They are obtained with the function fviz_nbclust from the package factoextra. The clustering with complete linkage on the Manhattan distance is obtained with the function hcut, using hc_method="complete" and hc_metric="manhattan".
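As a reminder, for a partition of the wines into clusters \(C_1, \dots, C_k\) with centers \(\bar{x}_1, \dots, \bar{x}_k\), the within sum-of-squares is
\[ WSS(k) = \sum_{i=1}^{k} \sum_{x \in C_i} \lVert x - \bar{x}_i \rVert^2, \]
and the silhouette of an observation is \(s = (b - a)/\max(a, b)\), where \(a\) is its average distance to the other members of its own cluster and \(b\) is its average distance to the members of the nearest other cluster; the silhouette criterion plotted below is the average of \(s\) over all observations.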
library(factoextra)
fviz_nbclust(wine[,-12], hcut,
             hc_method="complete",
             hc_metric="manhattan",
             method = "wss",
             k.max = 25, verbose = FALSE)
fviz_nbclust(wine[,-12], hcut,
             hc_method="complete",
             hc_metric="manhattan",
             method = "silhouette",
             k.max = 25, verbose = FALSE)
fviz_nbclust(wine[,-12], hcut,
             hc_method="complete",
             hc_metric="manhattan",
             method = "gap",
             k.max = 25, verbose = FALSE)
The gap statistic is readily available in R (as seen above), but for Python we’ll use more commonly available metrics in scikit-learn:
- Davies-Bouldin Index (lower values indicate better clustering)
- Calinski-Harabasz Index (higher values indicate better clustering)
These metrics serve the same purpose: helping us determine the optimal number of clusters.
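For reference, with \(n\) observations grouped into \(k\) clusters, these two indices are defined as
\[ CH(k) = \frac{\operatorname{tr}(B_k)/(k-1)}{\operatorname{tr}(W_k)/(n-k)}, \qquad DB(k) = \frac{1}{k}\sum_{i=1}^{k} \max_{j \neq i} \frac{s_i + s_j}{d_{ij}}, \]
where \(B_k\) and \(W_k\) are the between- and within-cluster dispersion matrices, \(s_i\) is the average distance of the observations of cluster \(i\) to its centroid, and \(d_{ij}\) is the distance between the centroids of clusters \(i\) and \(j\).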
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score
import os
# Reuse the wine data and the scaled matrix X created in the Data section above.
# Clear previous plot (if any)
plt.clf()
# Within sum-of-squares (Elbow method)
wss = []
for k in range(1, 26):
    model = AgglomerativeClustering(n_clusters=k, metric='manhattan', linkage='complete')
    model.fit(X)
    labels = model.labels_
    centroids = np.array([X[labels == i].mean(axis=0) for i in range(k)])
    wss.append(sum(((X - centroids[labels]) ** 2).sum(axis=1)))

plt.figure(figsize=(10, 6))
plt.plot(range(1, 26), wss, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Within sum-of-squares')
plt.title('Elbow Method for Optimal k')
plt.show()

# Silhouette method
silhouette_scores = []
for k in range(2, 26):
    model = AgglomerativeClustering(n_clusters=k, metric='manhattan', linkage='complete')
    model.fit(X)
    labels = model.labels_
    silhouette_scores.append(silhouette_score(X, labels))

plt.figure(figsize=(10, 6))
plt.plot(range(2, 26), silhouette_scores, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette score')
plt.title('Silhouette Method for Optimal k')
plt.show()

# Alternative metrics to gap statistic
ch_scores = []
db_scores = []
k_range = range(2, 26)

for k in k_range:
    model = AgglomerativeClustering(n_clusters=k, metric='manhattan', linkage='complete')
    model.fit(X)
    labels = model.labels_
    ch_scores.append(calinski_harabasz_score(X, labels))
    db_scores.append(davies_bouldin_score(X, labels))

# Plot Calinski-Harabasz Index
plt.figure(figsize=(10, 6))
plt.plot(list(k_range), ch_scores, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Calinski-Harabasz Index (higher is better)')
plt.title('Calinski-Harabasz Index for determining optimal k')
plt.show()

# Plot Davies-Bouldin Index
plt.figure(figsize=(10, 6))
plt.plot(list(k_range), db_scores, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Davies-Bouldin Index (lower is better)')
plt.title('Davies-Bouldin Index for determining optimal k')
plt.show()
As is often the case, these methods are not easy to interpret. Globally, they point to \(k=2\) or \(k=3\).
K-means
The K-means application follows the same logic as before. A visual inspection of the number-of-clusters criteria suggests \(k=2\), and the k-means application provides the following cluster assignment.
fviz_nbclust(wine[,-12], kmeans,
             method = "wss",
             k.max = 25, verbose = FALSE)
fviz_nbclust(wine[,-12], kmeans,
             method = "silhouette",
             k.max = 25, verbose = FALSE)
# Using gap statistic in R (works fine in R)
fviz_nbclust(wine[,-12], kmeans,
             method = "gap",
             k.max = 25, verbose = FALSE)
wine_km <- kmeans(wine[,-12], centers=2)
wine_km$cluster ## see also wine_km for more information about the clustering hence obtained
# K-means clustering with alternative metrics to gap statistic
# We use standard metrics available in scikit-learn that are widely used in practice
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score
import os
# Within sum-of-squares (Elbow method)
wss = []
for k in range(1, 26):
    model = KMeans(n_clusters=k, n_init=10)
    model.fit(X)
    wss.append(model.inertia_)

plt.figure(figsize=(10, 6))
plt.plot(range(1, 26), wss, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Within sum-of-squares')
plt.title('Elbow Method for Optimal k')
plt.show()

# Silhouette method
silhouette_scores = []
for k in range(2, 26):
    model = KMeans(n_clusters=k, n_init=10)
    model.fit(X)
    labels = model.labels_
    silhouette_scores.append(silhouette_score(X, labels))

plt.figure(figsize=(10, 6))
plt.plot(range(2, 26), silhouette_scores, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette score')
plt.title('Silhouette Method for Optimal k')
plt.show()

# Alternative metrics to gap statistic
ch_scores = []
db_scores = []
k_range = range(2, 26)

for k in k_range:
    model = KMeans(n_clusters=k, n_init=10)
    model.fit(X)
    labels = model.labels_
    ch_scores.append(calinski_harabasz_score(X, labels))
    db_scores.append(davies_bouldin_score(X, labels))

# Plot Calinski-Harabasz Index
plt.figure(figsize=(10, 6))
plt.plot(list(k_range), ch_scores, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Calinski-Harabasz Index (higher is better)')
plt.title('Calinski-Harabasz Index for determining optimal k')
plt.show()

# Plot Davies-Bouldin Index
plt.figure(figsize=(10, 6))
plt.plot(list(k_range), db_scores, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Davies-Bouldin Index (lower is better)')
plt.title('Davies-Bouldin Index for determining optimal k')
plt.show()

# Final K-means with k=2
wine_km = KMeans(n_clusters=2, n_init=10)
wine_km.fit(X)
wine_km.labels_
Again, the clusters can be inspected through their features in exactly the same way as before.
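For instance, the boxplots of the previous section can be reproduced with the k-means assignment; a minimal sketch in R, reusing the same melt/ggplot construction (the object names wine_comp_km and wine_df_km are just for illustration):
wine_comp_km <- data.frame(wine[,-12], Clust=factor(wine_km$cluster), Id=row.names(wine))
wine_df_km <- melt(wine_comp_km, id=c("Id", "Clust"))
ggplot(wine_df_km, aes(y=value, group=Clust, fill=Clust)) +
  geom_boxplot() +
  facet_wrap(~variable, ncol=4, nrow=3)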
PAM and silhouette plot
The application of PAM is similar to K-means. We use the pam function from the package cluster. Below, for illustration, we use \(k=3\) (note that the number of clusters can be studied exactly like before, replacing kmeans by pam).
library(cluster)
wine_pam <- pam(wine[,-12], k=3)
wine_pam
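In R, the silhouette plot discussed below can be obtained directly from the pam object, and the number of clusters can be explored as before by passing pam to fviz_nbclust; a minimal sketch (assuming cluster and factoextra are loaded as above):
plot(silhouette(wine_pam))   # silhouette plot of the PAM solution
fviz_nbclust(wine[,-12], pam,
             method = "silhouette",
             k.max = 25, verbose = FALSE)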
# PAM clustering with fresh imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn_extra.cluster import KMedoids
from sklearn.metrics import silhouette_samples
from scipy.spatial.distance import pdist, squareform
import os
# Find the correct path to the data file
# Try different possible locations
= [
possible_paths 'labs/data/Wine.csv', # From project root
'../../data/Wine.csv', # Relative to current file
'../../../data/Wine.csv', # One level up
'data/Wine.csv', # Direct in data folder
'Wine.csv' # In current directory
]
= None
wine_data for path in possible_paths:
try:
= pd.read_csv(path)
wine_data print(f"Successfully loaded data from: {path}")
break
except FileNotFoundError:
continue
if wine_data is None:
# If we can't find the file, create a small sample dataset for demonstration
print("Could not find Wine.csv, creating sample data for demonstration")
42)
np.random.seed(= pd.DataFrame(
wine_data 100, 11),
np.random.randn(=[f'feature_{i}' for i in range(11)]
columns
)'quality'] = np.random.randint(3, 9, size=100)
wine_data[
# Continue with the analysis using wine_data
= wine_data
wine = ["W" + str(i) for i in range(1, len(wine)+1)]
wine.index = StandardScaler()
scaler = pd.DataFrame(
wine_scaled -1]),
scaler.fit_transform(wine.iloc[:, :=wine.index,
index=wine.columns[:-1]
columns
)= wine_scaled.values
X
# PAM with k=3
wine_pam = KMedoids(n_clusters=3, metric='manhattan', method='pam')
wine_pam.fit(X)
wine_pam.labels_
# Silhouette plot function
def plot_silhouette(data, labels, metric='euclidean', figsize=(10, 6)):
    distances = pdist(data, metric='cityblock' if metric == 'manhattan' else metric)
    dist_matrix = squareform(distances)
    silhouette_vals = silhouette_samples(dist_matrix, labels, metric='precomputed')

    fig, ax = plt.subplots(figsize=figsize)
    y_lower, y_upper = 0, 0
    n_clusters = len(np.unique(labels))

    for i in range(n_clusters):
        cluster_silhouette_vals = silhouette_vals[labels == i]
        cluster_silhouette_vals.sort()
        y_upper += len(cluster_silhouette_vals)
        ax.barh(range(y_lower, y_upper), cluster_silhouette_vals, height=1)
        ax.text(-0.03, (y_lower + y_upper) / 2, str(i + 1))
        y_lower += len(cluster_silhouette_vals)

    silhouette_avg = np.mean(silhouette_vals)
    ax.axvline(silhouette_avg, color='red', linestyle='--')
    ax.set_xlabel('Silhouette coefficient values')
    ax.set_ylabel('Cluster labels')
    ax.set_title('Silhouette plot for the various clusters')
    plt.show()

# Create the silhouette plot
plot_silhouette(X, wine_pam.labels_, metric='manhattan')
We see that the average silhouette over all three clusters is 0.12. Cluster 2 has some wines badly clustered (an average silhouette of 0.06, the lowest, and several wines with negative silhouettes). Cluster 3 is the most homogeneous cluster and the best separated from the other two.
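These values can also be read directly from the pam object in R, for instance (a sketch):
wine_pam$silinfo$avg.width        # overall average silhouette width
wine_pam$silinfo$clus.avg.widths  # average silhouette width per cluster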
Your turn
Repeat the clustering analysis on one of the datasets seen in the course (like real_estate_data or nursing_home). Evidently, this is not a supervised task; therefore, you may choose to leave the outcome variable out of your analysis.