Clustering

Data

In this exercise, we use the same wine data introduced in the ensemble exercises. As noted before, all the numerical features have units (hence the scaling below). Additionally, the original objective of this dataset was to predict the wine quality from the other features (supervised learning). Here, the data are only used for an unsupervised task.

First, load the data and scale the numerical features:

wine <- read.csv(here::here("labs/data/Wine.csv"))
row.names(wine) <- paste("W", c(1:nrow(wine)), sep="") # row names are used later on
head(wine)
summary(wine)
wine[,-12] <- scale(wine[,-12]) ## scale all the features except "quality"
library(reticulate)
use_condaenv("MLBA")

# Start fresh with all necessary imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score
import os

# Find the correct path to the data file
# Try different possible locations
possible_paths = [
    'labs/data/Wine.csv',  # From project root
    '../../data/Wine.csv',  # Relative to current file
    '../../../data/Wine.csv',  # One level up
    'data/Wine.csv',  # Direct in data folder
    'Wine.csv'  # In current directory
]

wine_data = None
for path in possible_paths:
    try:
        wine_data = pd.read_csv(path)
        print(f"Successfully loaded data from: {path}")
        break
    except FileNotFoundError:
        continue

if wine_data is None:
    # If we can't find the file, create a small sample dataset for demonstration
    print("Could not find Wine.csv, creating sample data for demonstration")
    np.random.seed(42)
    wine_data = pd.DataFrame(
        np.random.randn(100, 11),
        columns=[f'feature_{i}' for i in range(11)]
    )
    wine_data['quality'] = np.random.randint(3, 9, size=100)

# Continue with the analysis using wine_data
wine = wine_data
wine.index = ["W" + str(i) for i in range(1, len(wine)+1)]
scaler = StandardScaler()
wine_scaled = pd.DataFrame(
    scaler.fit_transform(wine.iloc[:, :-1]),
    index=wine.index,
    columns=wine.columns[:-1]
)
X = wine_scaled.values

Warning

Please note that all the interpretations will be based on the R outputs (as with PCA and most other exercises). As usual, the Python outputs may differ slightly due to differences in the implementations of the algorithms.

Hierarchical clustering

Distances

We apply here an agglomerative hierarchical clustering (AGNES). Only the numerical features are used. First, we compute the pairwise distances and plot them; we use the Manhattan distance below.
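For reference, the Manhattan (city-block) distance between two wines \(x\) and \(y\) described by \(p\) scaled features is

\[
d(x, y) = \sum_{j=1}^{p} \lvert x_j - y_j \rvert .
\]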

library(reshape2) # contains the melt function
library(ggplot2)
wine_d <- dist(wine[,-12], method = "manhattan") # matrix of Manhattan distances 

wine_melt <- melt(as.matrix(wine_d)) # create a data frame of the distances in long format
head(wine_melt)

ggplot(data = wine_melt, aes(x=Var1, y=Var2, fill=value)) + 
  geom_tile() 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from scipy.spatial.distance import cdist

wine_d = pd.DataFrame(cdist(wine_scaled, wine_scaled, metric='cityblock'), columns=wine.index, index=wine.index) # use the scaled features, as in R

wine_melt = wine_d.reset_index().melt(id_vars='index', var_name='Var1', value_name='value')

plt.figure(figsize=(10, 8))
sns.heatmap(wine_d, cmap="coolwarm", center=0)
plt.show()

We can see that some wines are closer to each other than others (darker colors). However, it is not really possible to extract any useful information from such a graph.

Dendrogram

Now, we build a dendrogram using complete linkage.
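With complete linkage, the distance between two clusters \(A\) and \(B\) is the largest pairwise distance between their members,

\[
d(A, B) = \max_{a \in A,\, b \in B} d(a, b),
\]

so two clusters are merged only when all of their members are relatively close to each other.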

wine_hc <- hclust(wine_d, method = "complete")
plot(wine_hc, hang=-1)
from scipy.cluster.hierarchy import dendrogram, linkage

wine_linkage = linkage(wine_scaled, method='complete', metric='cityblock') # scaled features, as in R
plt.figure(figsize=(10, 8))
dendrogram(wine_linkage, labels=wine.index, orientation='top', color_threshold=0, leaf_font_size=10)
plt.show()

We cut the tree into 4 clusters and represent the result. We also extract the cluster assignment of each wine.

plot(wine_hc, hang=-1)
rect.hclust(wine_hc, k=4)
wine_clust <- cutree(wine_hc, k=4)
wine_clust
from scipy.cluster.hierarchy import fcluster

plt.figure(figsize=(10, 8))
dendrogram(wine_linkage, labels=wine.index, orientation='top', color_threshold=80, leaf_font_size=10)
plt.axhline(y=80, color='black', linestyle='--')
plt.show()

wine_clust = fcluster(wine_linkage, 4, criterion='maxclust')
wine_clust
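Before interpreting the clusters, it can be useful to check their sizes. A minimal check in Python, assuming wine_clust and the pandas import (pd) from the chunks above:

# Number of wines assigned to each of the 4 clusters
print(pd.Series(wine_clust).value_counts().sort_index())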

Interpretation of the clusters

Now we analyze the clusters by looking at the distribution of the features within each cluster.

wine_comp <- data.frame(wine[,-12], Clust=factor(wine_clust), Id=row.names(wine))
wine_df <- melt(wine_comp, id=c("Id", "Clust"))
head(wine_df)

ggplot(wine_df, aes(y=value, group=Clust, fill=Clust)) +
  geom_boxplot() +
  facet_wrap(~variable, ncol=4, nrow=3)
wine_comp = wine_scaled.copy()  # use the scaled features, as in R
wine_comp['Clust'] = wine_clust
wine_comp['Id'] = wine.index
wine_melt = wine_comp.melt(id_vars=['Id', 'Clust'])

plt.figure(figsize=(14, 10))
sns.boxplot(x='variable', y='value', hue='Clust', data=wine_melt)
plt.xticks(rotation=90)
plt.legend(title='Cluster')
plt.show()

We can see, for example, that:

- Cluster 4 (the smallest): small pH, large fixed.acidity and citric.acid, a large density, and a small alcohol. Also a large free.sulfur.dioxide.
- Cluster 2: also has a large fixed.acidity but not citric.acid. It appears less acidic than cluster 3.
- Cluster 3: apparently has a large alcohol.
- Etc.

A quick numerical summary supporting this reading is sketched below.
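To complement the boxplots, one can compare the average feature values per cluster. A minimal sketch in Python, assuming the wine_comp data frame built in the previous chunk (scaled features plus the Clust and Id columns):

# Mean of each (scaled) feature within each cluster
cluster_means = wine_comp.drop(columns='Id').groupby('Clust').mean()
print(cluster_means.round(2))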

Choice of the number of clusters

To choose the number of clusters, we can inspect the dendrogram (judgmental approach), or we can rely on a statistic. Below, we use the within sum-of-squares, the gap statistic, and the silhouette. They are obtained with the function fviz_nbclust from package factoextra. The dendrogram with complete linkage on the Manhattan distance is obtained using the function hcut with hc_method="complete" and hc_metric="manhattan".
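As a reminder of the two main criteria: for a partition into \(k\) clusters \(C_1, \dots, C_k\) with centroids \(\bar{x}_1, \dots, \bar{x}_k\), the within sum-of-squares is

\[
WSS(k) = \sum_{c=1}^{k} \sum_{i \in C_c} \lVert x_i - \bar{x}_c \rVert^2 ,
\]

and the silhouette of observation \(i\) is

\[
s(i) = \frac{b(i) - a(i)}{\max\{a(i), b(i)\}},
\]

where \(a(i)\) is the average distance from \(i\) to the other members of its cluster and \(b(i)\) is the smallest average distance from \(i\) to the members of another cluster.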

library(factoextra)
fviz_nbclust(wine[,-12],
             hcut, hc_method="complete",
             hc_metric="manhattan",
             method = "wss", 
             k.max = 25, verbose = FALSE)
fviz_nbclust(wine[,-12],
             hcut, hc_method="complete",
             hc_metric="manhattan",
             method = "silhouette", 
             k.max = 25, verbose = FALSE)
fviz_nbclust(wine[,-12],
             hcut, hc_method="complete",
             hc_metric="manhattan",
             method = "gap", 
             k.max = 25, verbose = FALSE)

The gap statistic is readily available in R (as seen above), but for Python we’ll use metrics more commonly available in scikit-learn:

- Davies-Bouldin Index (lower values indicate better clustering)
- Calinski-Harabasz Index (higher values indicate better clustering)

These metrics serve the same purpose: helping us determine the optimal number of clusters.
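For reference (these are the standard definitions, as implemented in scikit-learn), the Calinski-Harabasz index compares between-cluster to within-cluster dispersion,

\[
CH(k) = \frac{\operatorname{tr}(B_k) / (k-1)}{\operatorname{tr}(W_k) / (n-k)},
\]

where \(B_k\) and \(W_k\) are the between- and within-cluster dispersion matrices and \(n\) is the number of observations. The Davies-Bouldin index averages, over clusters, the worst-case ratio of within-cluster spread to between-centroid separation, so smaller values indicate more compact and better-separated clusters.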

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score
import os

# Find the correct path to the data file
# Try different possible locations
possible_paths = [
    'labs/data/Wine.csv',  # From project root
    '../../data/Wine.csv',  # Relative to current file
    '../../../data/Wine.csv',  # One level up
    'data/Wine.csv',  # Direct in data folder
    'Wine.csv'  # In current directory
]

wine_data = None
for path in possible_paths:
    try:
        wine_data = pd.read_csv(path)
        print(f"Successfully loaded data from: {path}")
        break
    except FileNotFoundError:
        continue

# Continue with the analysis using wine_data
wine = wine_data
wine.index = ["W" + str(i) for i in range(1, len(wine)+1)]
scaler = StandardScaler()
wine_scaled = pd.DataFrame(
    scaler.fit_transform(wine.iloc[:, :-1]),
    index=wine.index,
    columns=wine.columns[:-1]
)
X = wine_scaled.values

# Clear previous plot (if any)
plt.clf()

# Within sum-of-squares (Elbow method)
wss = []
for k in range(1, 26):
    model = AgglomerativeClustering(n_clusters=k, metric='manhattan', linkage='complete')
    model.fit(X)
    labels = model.labels_
    centroids = np.array([X[labels == i].mean(axis=0) for i in range(k)])
    wss.append(sum(((X - centroids[labels]) ** 2).sum(axis=1)))

plt.figure(figsize=(10, 6))
plt.plot(range(1, 26), wss, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Within sum-of-squares')
plt.title('Elbow Method for Optimal k')
plt.show()

# Silhouette method
silhouette_scores = []
for k in range(2, 26):
    model = AgglomerativeClustering(n_clusters=k, metric='manhattan', linkage='complete')
    model.fit(X)
    labels = model.labels_
    silhouette_scores.append(silhouette_score(X, labels))

plt.figure(figsize=(10, 6))
plt.plot(range(2, 26), silhouette_scores, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette score')
plt.title('Silhouette Method for Optimal k')
plt.show()

# Alternative metrics to gap statistic
ch_scores = []
db_scores = []
k_range = range(2, 26)

for k in k_range:
    model = AgglomerativeClustering(n_clusters=k, metric='manhattan', linkage='complete')
    model.fit(X)
    labels = model.labels_
    ch_scores.append(calinski_harabasz_score(X, labels))
    db_scores.append(davies_bouldin_score(X, labels))

# Plot Calinski-Harabasz Index
plt.figure(figsize=(10, 6))
plt.plot(list(k_range), ch_scores, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Calinski-Harabasz Index (higher is better)')
plt.title('Calinski-Harabasz Index for determining optimal k')
plt.show()

# Plot Davies-Bouldin Index
plt.figure(figsize=(10, 6))
plt.plot(list(k_range), db_scores, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Davies-Bouldin Index (lower is better)')
plt.title('Davies-Bouldin Index for determining optimal k')
plt.show()

As is often the case, these methods are not easy to interpret. Overall, they point to \(k=2\) or \(k=3\).

K-means

The K-means application follows the same logic as before; note that K-means directly minimizes the within sum-of-squares criterion shown earlier. Visual inspection of the number-of-clusters criteria suggests \(k=2\), and a K-means application provides the following cluster assignment.

fviz_nbclust(wine[,-12],
             kmeans,
             method = "wss", 
             k.max = 25, verbose = FALSE)
fviz_nbclust(wine[,-12],
             kmeans, 
             method = "silhouette", 
             k.max = 25, verbose = FALSE)
# Using gap statistic in R (works fine in R)
fviz_nbclust(wine[,-12],
             kmeans,
             method = "gap", 
             k.max = 25, verbose = FALSE)

wine_km <- kmeans(wine[,-12], centers=2)
wine_km$cluster ## see also wine_km for more information about the clustering hence obtained
# K-means clustering with alternative metrics to gap statistic
# We use standard metrics available in scikit-learn that are widely used in practice
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score
import os


# Within sum-of-squares (Elbow method)
wss = []
for k in range(1, 26):
    model = KMeans(n_clusters=k, n_init=10)
    model.fit(X)
    wss.append(model.inertia_)

plt.figure(figsize=(10, 6))
plt.plot(range(1, 26), wss, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Within sum-of-squares')
plt.title('Elbow Method for Optimal k')
plt.show()

# Silhouette method
silhouette_scores = []
for k in range(2, 26):
    model = KMeans(n_clusters=k, n_init=10)
    model.fit(X)
    labels = model.labels_
    silhouette_scores.append(silhouette_score(X, labels))

plt.figure(figsize=(10, 6))
plt.plot(range(2, 26), silhouette_scores, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette score')
plt.title('Silhouette Method for Optimal k')
plt.show()

# Alternative metrics to gap statistic
ch_scores = []
db_scores = []
k_range = range(2, 26)

for k in k_range:
    model = KMeans(n_clusters=k, n_init=10)
    model.fit(X)
    labels = model.labels_
    ch_scores.append(calinski_harabasz_score(X, labels))
    db_scores.append(davies_bouldin_score(X, labels))

# Plot Calinski-Harabasz Index
plt.figure(figsize=(10, 6))
plt.plot(list(k_range), ch_scores, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Calinski-Harabasz Index (higher is better)')
plt.title('Calinski-Harabasz Index for determining optimal k')
plt.show()

# Plot Davies-Bouldin Index
plt.figure(figsize=(10, 6))
plt.plot(list(k_range), db_scores, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Davies-Bouldin Index (lower is better)')
plt.title('Davies-Bouldin Index for determining optimal k')
plt.show()

# Final K-means with k=2
wine_km = KMeans(n_clusters=2, n_init=10)
wine_km.fit(X)
wine_km.labels_

Again, the clusters can be inspected through their features in exactly the same way as before.
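A minimal sketch of such an inspection in Python, assuming the fitted wine_km and the scaled data frame wine_scaled from the chunks above (the labels could equally be fed into the boxplot code used earlier):

# Average scaled feature values per K-means cluster
km_profile = wine_scaled.copy()
km_profile['Clust'] = wine_km.labels_
print(km_profile.groupby('Clust').mean().round(2))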

PAM and silhouette plot

The application of PAM is similar to that of K-means. We use the pam function from package cluster. Below, for illustration, we use \(k=3\) (note that the number of clusters can be studied exactly as before, replacing kmeans with pam; a Python sketch is given after the silhouette plot).

library(cluster)
wine_pam <- pam(wine[,-12], k=3)
wine_pam
# PAM clustering with fresh imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn_extra.cluster import KMedoids
from sklearn.metrics import silhouette_samples
from scipy.spatial.distance import pdist, squareform
import os

# Find the correct path to the data file
# Try different possible locations
possible_paths = [
    'labs/data/Wine.csv',  # From project root
    '../../data/Wine.csv',  # Relative to current file
    '../../../data/Wine.csv',  # One level up
    'data/Wine.csv',  # Direct in data folder
    'Wine.csv'  # In current directory
]

wine_data = None
for path in possible_paths:
    try:
        wine_data = pd.read_csv(path)
        print(f"Successfully loaded data from: {path}")
        break
    except FileNotFoundError:
        continue

if wine_data is None:
    # If we can't find the file, create a small sample dataset for demonstration
    print("Could not find Wine.csv, creating sample data for demonstration")
    np.random.seed(42)
    wine_data = pd.DataFrame(
        np.random.randn(100, 11),
        columns=[f'feature_{i}' for i in range(11)]
    )
    wine_data['quality'] = np.random.randint(3, 9, size=100)

# Continue with the analysis using wine_data
wine = wine_data
wine.index = ["W" + str(i) for i in range(1, len(wine)+1)]
scaler = StandardScaler()
wine_scaled = pd.DataFrame(
    scaler.fit_transform(wine.iloc[:, :-1]),
    index=wine.index,
    columns=wine.columns[:-1]
)
X = wine_scaled.values

# PAM with k=3
wine_pam = KMedoids(n_clusters=3, metric='manhattan', method='pam')
wine_pam.fit(X)
wine_pam.labels_

# Silhouette plot function
def plot_silhouette(data, labels, metric='euclidean', figsize=(10, 6)):
    distances = pdist(data, metric='cityblock' if metric == 'manhattan' else metric)
    dist_matrix = squareform(distances)
    silhouette_vals = silhouette_samples(dist_matrix, labels, metric='precomputed')
    
    fig, ax = plt.subplots(figsize=figsize)
    y_lower, y_upper = 0, 0
    n_clusters = len(np.unique(labels))
    
    for i in range(n_clusters):
        cluster_silhouette_vals = silhouette_vals[labels == i]
        cluster_silhouette_vals.sort()
        y_upper += len(cluster_silhouette_vals)
        ax.barh(range(y_lower, y_upper), cluster_silhouette_vals, height=1)
        ax.text(-0.03, (y_lower + y_upper) / 2, str(i + 1))
        y_lower += len(cluster_silhouette_vals)
    
    silhouette_avg = np.mean(silhouette_vals)
    ax.axvline(silhouette_avg, color='red', linestyle='--')
    ax.set_xlabel('Silhouette coefficient values')
    ax.set_ylabel('Cluster labels')
    ax.set_title('Silhouette plot for the various clusters')
    plt.show()

# Create the silhouette plot
plot_silhouette(X, wine_pam.labels_, metric='manhattan')
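As noted above, the number of clusters for PAM can be studied exactly as for K-means. A minimal sketch, assuming KMedoids and the scaled matrix X defined above, tracking the average silhouette over a range of values of k:

from sklearn.metrics import silhouette_score

pam_sil = []
for k in range(2, 11):
    pam_k = KMedoids(n_clusters=k, metric='manhattan', method='pam', random_state=0)
    pam_k.fit(X)
    pam_sil.append(silhouette_score(X, pam_k.labels_, metric='manhattan'))

plt.figure(figsize=(10, 6))
plt.plot(range(2, 11), pam_sil, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Average silhouette (PAM)')
plt.title('Silhouette method for PAM')
plt.show()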

We see that the average silhouette over all three clusters is 0.12. Cluster 2 contains some badly clustered wines (an average silhouette of 0.06, the lowest, and several wines with negative silhouettes). Cluster 3 is the most homogeneous cluster and the best separated from the other two.

Your turn

Repeat the clustering analysis on one of the datasets seen in the course (such as real_estate_data or nursing_home). Since this is not a supervised task, you may choose to leave the outcome variable out of your analysis.