Clustering

Data

In this exercise, we use the same wine data introduced in the ensemble exercises. As noted before, all the numerical features have units (hence the scaling below). Additionally, the original objective of this dataset was to predict the wine quality from the other features (supervised learning). Here, the data are only used for an unsupervised task.

First, load the data and scale the numerical features:

wine <- read.csv(here::here("labs/data/Wine.csv"))
row.names(wine) <- paste("W", c(1:nrow(wine)), sep="") # row names are used later on
head(wine)
summary(wine)
wine[,-12] <- scale(wine[,-12]) ## scale all the features except "quality"
library(reticulate)
use_condaenv("MLBA")

# Start fresh with all necessary imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score
import os

# Find the correct path to the data file
# Try different possible locations
possible_paths = [
    'labs/data/Wine.csv',  # From project root
    '../../data/Wine.csv',  # Relative to current file
    '../../../data/Wine.csv',  # One level up
    'data/Wine.csv',  # Direct in data folder
    'Wine.csv'  # In current directory
]

wine_data = None
for path in possible_paths:
    try:
        wine_data = pd.read_csv(path)
        print(f"Successfully loaded data from: {path}")
        break
    except FileNotFoundError:
        continue

if wine_data is None:
    # If we can't find the file, create a small sample dataset for demonstration
    print("Could not find Wine.csv, creating sample data for demonstration")
    np.random.seed(42)
    wine_data = pd.DataFrame(
        np.random.randn(100, 11),
        columns=[f'feature_{i}' for i in range(11)]
    )
    wine_data['quality'] = np.random.randint(3, 9, size=100)

# Continue with the analysis using wine_data
wine = wine_data
wine.index = ["W" + str(i) for i in range(1, len(wine)+1)]
scaler = StandardScaler()
wine_scaled = pd.DataFrame(
    scaler.fit_transform(wine.iloc[:, :-1]),
    index=wine.index,
    columns=wine.columns[:-1]
)
X = wine_scaled.values

Warning

Please note that all the interpretations will be based on the R outputs (as with PCA and most other exercises). As usual, the Python outputs may differ slightly due to differences in the implementations of the algorithms.

Hierarchical clustering

Distances

We apply here an agglomerative hierarchical clustering (AGNES). Only the numerical features are used. First, we compute the pairwise distances and plot them; we use the Manhattan distance below.
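For reference, the Manhattan (city-block) distance between two wines \(x\) and \(y\) described by \(p\) scaled features is

\[
d(x, y) = \sum_{j=1}^{p} \lvert x_j - y_j \rvert .
\]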

library(reshape2) # contains the melt function
library(ggplot2)
wine_d <- dist(wine[,-12], method = "manhattan") # matrix of Manhattan distances 

wine_melt <- melt(as.matrix(wine_d)) # create a data frame of the distances in long format
head(wine_melt)

ggplot(data = wine_melt, aes(x=Var1, y=Var2, fill=value)) + 
  geom_tile() 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from scipy.spatial.distance import cdist

wine_d = pd.DataFrame(cdist(wine_scaled, wine_scaled, metric='cityblock'), columns=wine.index, index=wine.index) # use the scaled features, as in R

wine_melt = wine_d.reset_index().melt(id_vars='index', var_name='Var1', value_name='value')

plt.figure(figsize=(10, 8))
sns.heatmap(wine_d, cmap="coolwarm", center=0)
plt.show()

We can see that some wines are closer to each other than others (darker colors). However, it is not really possible to extract any useful information from such a graph.

Dendrogram

Now, we build a dendrogram using complete linkage.
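With complete linkage, the distance between two clusters \(A\) and \(B\) is the largest pairwise distance between their members,

\[
d(A, B) = \max_{a \in A,\, b \in B} d(a, b),
\]

so two clusters are merged only when all of their members are relatively close to each other.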

wine_hc <- hclust(wine_d, method = "complete")
plot(wine_hc, hang=-1)
from scipy.cluster.hierarchy import dendrogram, linkage

wine_linkage = linkage(wine_scaled, method='complete', metric='cityblock') # scaled features, as in R
plt.figure(figsize=(10, 8))
dendrogram(wine_linkage, labels=wine.index, orientation='top', color_threshold=0, leaf_font_size=10)
plt.show()

We cut the tree into 4 clusters and represent the result. We also extract the cluster assignment of each wine.

plot(wine_hc, hang=-1)
rect.hclust(wine_hc, k=4)
wine_clust <- cutree(wine_hc, k=4)
wine_clust
from scipy.cluster.hierarchy import fcluster

plt.figure(figsize=(10, 8))
dendrogram(wine_linkage, labels=wine.index, orientation='top', color_threshold=80, leaf_font_size=10)
plt.axhline(y=80, color='black', linestyle='--')
plt.show()

wine_clust = fcluster(wine_linkage, 4, criterion='maxclust')
wine_clust
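Before interpreting the clusters, it can be useful to check their sizes. A minimal check in Python, assuming wine_clust and the pandas import (pd) from the chunks above:

# Number of wines assigned to each of the 4 clusters
print(pd.Series(wine_clust).value_counts().sort_index())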

Interpretation of the clusters

Now we analyze the clusters by looking at the distribution of the features within each cluster.

wine_comp <- data.frame(wine[,-12], Clust=factor(wine_clust), Id=row.names(wine))
wine_df <- melt(wine_comp, id=c("Id", "Clust"))
head(wine_df)

ggplot(wine_df, aes(y=value, group=Clust, fill=Clust)) +
  geom_boxplot() +
  facet_wrap(~variable, ncol=4, nrow=3)
wine_comp = wine_scaled.copy()  # use the scaled features, as in R
wine_comp['Clust'] = wine_clust
wine_comp['Id'] = wine.index
wine_melt = wine_comp.melt(id_vars=['Id', 'Clust'])

plt.figure(figsize=(14, 10))
sns.boxplot(x='variable', y='value', hue='Clust', data=wine_melt)
plt.xticks(rotation=90)
plt.legend(title='Cluster')
plt.show()

We can see, for example, that:

- Cluster 4 (the smallest): small pH, large fixed.acidity and citric.acid, a large density, and a small alcohol. Also a large free.sulfur.dioxide.
- Cluster 2: also has a large fixed.acidity but not citric.acid. It appears less acidic than cluster 3.
- Cluster 3: apparently has a large alcohol.
- Etc.

A quick numerical summary supporting this reading is sketched below.
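To complement the boxplots, one can compare the average feature values per cluster. A minimal sketch in Python, assuming the wine_comp data frame built in the previous chunk (scaled features plus the Clust and Id columns):

# Mean of each (scaled) feature within each cluster
cluster_means = wine_comp.drop(columns='Id').groupby('Clust').mean()
print(cluster_means.round(2))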

Choice of the number of clusters

To choose the number of clusters, we can inspect the dendrogram (judgmental approach), or we can rely on a statistic. Below, we use the within sum-of-squares, the gap statistic, and the silhouette. They are obtained with the function fviz_nbclust from package factoextra. The dendrogram with complete linkage on the Manhattan distance is obtained using the function hcut with hc_method="complete" and hc_metric="manhattan".
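As a reminder of the two main criteria: for a partition into \(k\) clusters \(C_1, \dots, C_k\) with centroids \(\bar{x}_1, \dots, \bar{x}_k\), the within sum-of-squares is

\[
WSS(k) = \sum_{c=1}^{k} \sum_{i \in C_c} \lVert x_i - \bar{x}_c \rVert^2 ,
\]

and the silhouette of observation \(i\) is

\[
s(i) = \frac{b(i) - a(i)}{\max\{a(i), b(i)\}},
\]

where \(a(i)\) is the average distance from \(i\) to the other members of its cluster and \(b(i)\) is the smallest average distance from \(i\) to the members of another cluster.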

library(factoextra)
fviz_nbclust(wine[,-12],
             hcut, hc_method="complete",
             hc_metric="manhattan",
             method = "wss", 
             k.max = 25, verbose = FALSE)
fviz_nbclust(wine[,-12],
             hcut, hc_method="complete",
             hc_metric="manhattan",
             method = "silhouette", 
             k.max = 25, verbose = FALSE)
fviz_nbclust(wine[,-12],
             hcut, hc_method="complete",
             hc_metric="manhattan",
             method = "gap", 
             k.max = 25, verbose = FALSE)

The gap statistic is readily available in R (as seen above), but for Python we’ll use metrics more commonly available in scikit-learn:

- Davies-Bouldin Index (lower values indicate better clustering)
- Calinski-Harabasz Index (higher values indicate better clustering)

These metrics serve the same purpose: helping us determine the optimal number of clusters.
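For reference (these are the standard definitions, as implemented in scikit-learn), the Calinski-Harabasz index compares between-cluster to within-cluster dispersion,

\[
CH(k) = \frac{\operatorname{tr}(B_k) / (k-1)}{\operatorname{tr}(W_k) / (n-k)},
\]

where \(B_k\) and \(W_k\) are the between- and within-cluster dispersion matrices and \(n\) is the number of observations. The Davies-Bouldin index averages, over clusters, the worst-case ratio of within-cluster spread to between-centroid separation, so smaller values indicate more compact and better-separated clusters.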

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score
import os

# Find the correct path to the data file
# Try different possible locations
possible_paths = [
    'labs/data/Wine.csv',  # From project root
    '../../data/Wine.csv',  # Relative to current file
    '../../../data/Wine.csv',  # One level up
    'data/Wine.csv',  # Direct in data folder
    'Wine.csv'  # In current directory
]

wine_data = None
for path in possible_paths:
    try:
        wine_data = pd.read_csv(path)
        print(f"Successfully loaded data from: {path}")
        break
    except FileNotFoundError:
        continue

# Continue with the analysis using wine_data
wine = wine_data
wine.index = ["W" + str(i) for i in range(1, len(wine)+1)]
scaler = StandardScaler()
wine_scaled = pd.DataFrame(
    scaler.fit_transform(wine.iloc[:, :-1]),
    index=wine.index,
    columns=wine.columns[:-1]
)
X = wine_scaled.values

# Clear previous plot (if any)
plt.clf()

# Within sum-of-squares (Elbow method)
wss = []
for k in range(1, 26):
    model = AgglomerativeClustering(n_clusters=k, metric='manhattan', linkage='complete')
    model.fit(X)
    labels = model.labels_
    centroids = np.array([X[labels == i].mean(axis=0) for i in range(k)])
    wss.append(sum(((X - centroids[labels]) ** 2).sum(axis=1)))

plt.figure(figsize=(10, 6))
plt.plot(range(1, 26), wss, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Within sum-of-squares')
plt.title('Elbow Method for Optimal k')
plt.show()

# Silhouette method
silhouette_scores = []
for k in range(2, 26):
    model = AgglomerativeClustering(n_clusters=k, metric='manhattan', linkage='complete')
    model.fit(X)
    labels = model.labels_
    silhouette_scores.append(silhouette_score(X, labels))

plt.figure(figsize=(10, 6))
plt.plot(range(2, 26), silhouette_scores, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette score')
plt.title('Silhouette Method for Optimal k')
plt.show()

# Alternative metrics to gap statistic
ch_scores = []
db_scores = []
k_range = range(2, 26)

for k in k_range:
    model = AgglomerativeClustering(n_clusters=k, metric='manhattan', linkage='complete')
    model.fit(X)
    labels = model.labels_
    ch_scores.append(calinski_harabasz_score(X, labels))
    db_scores.append(davies_bouldin_score(X, labels))

# Plot Calinski-Harabasz Index
plt.figure(figsize=(10, 6))
plt.plot(list(k_range), ch_scores, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Calinski-Harabasz Index (higher is better)')
plt.title('Calinski-Harabasz Index for determining optimal k')
plt.show()

# Plot Davies-Bouldin Index
plt.figure(figsize=(10, 6))
plt.plot(list(k_range), db_scores, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Davies-Bouldin Index (lower is better)')
plt.title('Davies-Bouldin Index for determining optimal k')
plt.show()

As is often the case, these methods are not easy to interpret. Overall, they point to \(k=2\) or \(k=3\).

K-means

The K-means application follows the same logic as before; note that K-means directly minimizes the within sum-of-squares criterion shown earlier. Visual inspection of the number-of-clusters criteria suggests \(k=2\), and a K-means application provides the following cluster assignment.

fviz_nbclust(wine[,-12],
             kmeans,
             method = "wss", 
             k.max = 25, verbose = FALSE)
fviz_nbclust(wine[,-12],
             kmeans, 
             method = "silhouette", 
             k.max = 25, verbose = FALSE)
# Using gap statistic in R (works fine in R)
fviz_nbclust(wine[,-12],
             kmeans,
             method = "gap", 
             k.max = 25, verbose = FALSE)

wine_km <- kmeans(wine[,-12], centers=2)
wine_km$cluster ## see also wine_km for more information about the clustering hence obtained
# K-means clustering with alternative metrics to gap statistic
# We use standard metrics available in scikit-learn that are widely used in practice
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score
import os


# Within sum-of-squares (Elbow method)
wss = []
for k in range(1, 26):
    model = KMeans(n_clusters=k, n_init=10)
    model.fit(X)
    wss.append(model.inertia_)

plt.figure(figsize=(10, 6))
plt.plot(range(1, 26), wss, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Within sum-of-squares')
plt.title('Elbow Method for Optimal k')
plt.show()

# Silhouette method
silhouette_scores = []
for k in range(2, 26):
    model = KMeans(n_clusters=k, n_init=10)
    model.fit(X)
    labels = model.labels_
    silhouette_scores.append(silhouette_score(X, labels))

plt.figure(figsize=(10, 6))
plt.plot(range(2, 26), silhouette_scores, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette score')
plt.title('Silhouette Method for Optimal k')
plt.show()

# Alternative metrics to gap statistic
ch_scores = []
db_scores = []
k_range = range(2, 26)

for k in k_range:
    model = KMeans(n_clusters=k, n_init=10)
    model.fit(X)
    labels = model.labels_
    ch_scores.append(calinski_harabasz_score(X, labels))
    db_scores.append(davies_bouldin_score(X, labels))

# Plot Calinski-Harabasz Index
plt.figure(figsize=(10, 6))
plt.plot(list(k_range), ch_scores, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Calinski-Harabasz Index (higher is better)')
plt.title('Calinski-Harabasz Index for determining optimal k')
plt.show()

# Plot Davies-Bouldin Index
plt.figure(figsize=(10, 6))
plt.plot(list(k_range), db_scores, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Davies-Bouldin Index (lower is better)')
plt.title('Davies-Bouldin Index for determining optimal k')
plt.show()

# Final K-means with k=2
wine_km = KMeans(n_clusters=2, n_init=10)
wine_km.fit(X)
wine_km.labels_

Again, the clusters can be inspected through their features in exactly the same way as before.
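A minimal sketch of such an inspection in Python, assuming the fitted wine_km and the scaled data frame wine_scaled from the chunks above (the labels could equally be fed into the boxplot code used earlier):

# Average scaled feature values per K-means cluster
km_profile = wine_scaled.copy()
km_profile['Clust'] = wine_km.labels_
print(km_profile.groupby('Clust').mean().round(2))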

PAM and silhouette plot

The application of PAM is similar to that of K-means. We use the pam function from package cluster. Below, for illustration, we use \(k=3\) (note that the number of clusters can be studied exactly as before, replacing kmeans with pam; a Python sketch is given after the silhouette plot).

library(cluster)
wine_pam <- pam(wine[,-12], k=3)
wine_pam
# PAM clustering with fresh imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn_extra.cluster import KMedoids
from sklearn.metrics import silhouette_samples
from scipy.spatial.distance import pdist, squareform
import os

# Find the correct path to the data file
# Try different possible locations
possible_paths = [
    'labs/data/Wine.csv',  # From project root
    '../../data/Wine.csv',  # Relative to current file
    '../../../data/Wine.csv',  # One level up
    'data/Wine.csv',  # Direct in data folder
    'Wine.csv'  # In current directory
]

wine_data = None
for path in possible_paths:
    try:
        wine_data = pd.read_csv(path)
        print(f"Successfully loaded data from: {path}")
        break
    except FileNotFoundError:
        continue

if wine_data is None:
    # If we can't find the file, create a small sample dataset for demonstration
    print("Could not find Wine.csv, creating sample data for demonstration")
    np.random.seed(42)
    wine_data = pd.DataFrame(
        np.random.randn(100, 11),
        columns=[f'feature_{i}' for i in range(11)]
    )
    wine_data['quality'] = np.random.randint(3, 9, size=100)

# Continue with the analysis using wine_data
wine = wine_data
wine.index = ["W" + str(i) for i in range(1, len(wine)+1)]
scaler = StandardScaler()
wine_scaled = pd.DataFrame(
    scaler.fit_transform(wine.iloc[:, :-1]),
    index=wine.index,
    columns=wine.columns[:-1]
)
X = wine_scaled.values

# PAM with k=3
wine_pam = KMedoids(n_clusters=3, metric='manhattan', method='pam')
wine_pam.fit(X)
wine_pam.labels_

# Silhouette plot function
def plot_silhouette(data, labels, metric='euclidean', figsize=(10, 6)):
    distances = pdist(data, metric='cityblock' if metric == 'manhattan' else metric)
    dist_matrix = squareform(distances)
    silhouette_vals = silhouette_samples(dist_matrix, labels, metric='precomputed')
    
    fig, ax = plt.subplots(figsize=figsize)
    y_lower, y_upper = 0, 0
    n_clusters = len(np.unique(labels))
    
    for i in range(n_clusters):
        cluster_silhouette_vals = silhouette_vals[labels == i]
        cluster_silhouette_vals.sort()
        y_upper += len(cluster_silhouette_vals)
        ax.barh(range(y_lower, y_upper), cluster_silhouette_vals, height=1)
        ax.text(-0.03, (y_lower + y_upper) / 2, str(i + 1))
        y_lower += len(cluster_silhouette_vals)
    
    silhouette_avg = np.mean(silhouette_vals)
    ax.axvline(silhouette_avg, color='red', linestyle='--')
    ax.set_xlabel('Silhouette coefficient values')
    ax.set_ylabel('Cluster labels')
    ax.set_title('Silhouette plot for the various clusters')
    plt.show()

# Create the silhouette plot
plot_silhouette(X, wine_pam.labels_, metric='manhattan')
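As noted above, the number of clusters for PAM can be studied exactly as for K-means. A minimal sketch, assuming KMedoids and the scaled matrix X defined above, tracking the average silhouette over a range of values of k:

from sklearn.metrics import silhouette_score

pam_sil = []
for k in range(2, 11):
    pam_k = KMedoids(n_clusters=k, metric='manhattan', method='pam', random_state=0)
    pam_k.fit(X)
    pam_sil.append(silhouette_score(X, pam_k.labels_, metric='manhattan'))

plt.figure(figsize=(10, 6))
plt.plot(range(2, 11), pam_sil, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Average silhouette (PAM)')
plt.title('Silhouette method for PAM')
plt.show()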

We see that the average silhouette over all three clusters is 0.12. Cluster 2 contains some badly clustered wines (an average silhouette of 0.06, the lowest, and several wines with negative silhouettes). Cluster 3 is the most homogeneous cluster and the best separated from the other two.

Your turn

Repeat the clustering analysis on one of the datasets seen in the course (such as real_estate_data or nursing_home). Since this is not a supervised task, you may choose to leave the outcome variable out of your analysis.