import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.optimize import linear_sum_assignment
from sklearn import datasets, preprocessing
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.pipeline import make_pipeline

%matplotlib widget
if not os.getenv(
    "NBGRADER_EXECUTION"
):
    %load_ext jupyter_ai
    %ai update chatgpt dive:chat
    # %ai update chatgpt dive-azure:gpt4o

In this notebook, we will consider different methods of evaluating clustering solutions.

Intrinsic measures

The intrinsic measures of cluster quality are helpful when the ground truth is unavailable or should not be used. For example, to determine the number of clusters, we can compare the intrinsic measures of the clustering solutions for different numbers of clusters.

Elbow Method

The following diagram shows the KnowledgeFlow interface of Weka that applies SimpleKMeans to the iris.2D.arff dataset for different choices of $k$.

Elbow method

Implement the above KnowledgeFlow in Weka to answer the following exercise.

df_WSS = pd.DataFrame(columns=["k", "WSS"])
df_WSS["k"] = np.arange(1, 5, dtype=int)
df_WSS["WSS"] = df_WSS["WSS"].astype(float)
# YOUR CODE HERE
raise NotImplementedError

# plot WSS as a function of k
fig, ax = plt.subplots(clear=True, figsize=(10, 10), layout="constrained", num=1)
df_WSS.plot(x="k", y="WSS", xlabel=r"$k$", ylabel="WSS", legend=False, ax=ax)
# plt.xlabel("k")
# plt.ylabel("WSS")
plt.show()
df_WSS
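If you would like to cross-check the Weka results in Python, the following sketch is one possibility, not necessarily the intended solution. It assumes the WSS can be taken as the inertia_ attribute of sklearn's KMeans fitted on the min-max normalized petal features of the iris dataset (mirroring iris.2D.arff); df_WSS_check is a hypothetical name used here to avoid clobbering df_WSS above.

# One possible cross-check (an assumption, not the official solution):
# take WSS as KMeans.inertia_, the within-cluster sum of squared distances,
# computed on the min-max normalized petal features of the iris dataset.
iris = datasets.load_iris()
X = pd.DataFrame(iris.data, columns=iris.feature_names)[
    ["petal length (cm)", "petal width (cm)"]
]
df_WSS_check = pd.DataFrame({"k": np.arange(1, 5, dtype=int), "WSS": np.nan})
for i, k in enumerate(df_WSS_check["k"]):
    pipe = make_pipeline(
        preprocessing.MinMaxScaler(), KMeans(n_clusters=int(k), random_state=0)
    )
    pipe.fit(X)
    # inertia_ of the fitted KMeans step is its within-cluster sum of squares
    df_WSS_check.loc[i, "WSS"] = pipe.named_steps["kmeans"].inertia_
df_WSS_check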
%%ai chatgpt -f text
When using the elbow method to determine the number of clusters, how should
the number of clusters be chosen if there are multiple elbows?

Silhouette analysis

An alternative to the elbow method is silhouette analysis, which is based on the silhouette coefficient defined below:
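For a sample $\M{p}$ whose cluster contains more than one sample, the silhouette coefficient is

\begin{align}
s(\M{p}) := \frac{b(\M{p})-a(\M{p})}{\max\{a(\M{p}),b(\M{p})\}},
\end{align}

where $a(\M{p})$ is the mean distance from $\M{p}$ to the other samples in its own cluster and $b(\M{p})$ is the smallest mean distance from $\M{p}$ to the samples of any other cluster. If $\M{p}$ is the only sample in its cluster, $s(\M{p})$ is taken to be 0. The silhouette score of a clustering solution is the mean of $s(\M{p})$ over all samples.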

%%ai chatgpt -f text
Why should the silhouette coefficient be undefined, rather than 0, when the 
number of clusters is 1, unlike when the size of the cluster is 1?
%%ai chatgpt -f text
Why does the silhouette coefficient use distance instead of squared distance?

We will use the existing implementation in sklearn. First, import the iris dataset from sklearn.datasets and store it as a DataFrame:

# load the dataset from sklearn
dataset = datasets.load_iris()

# create a DataFrame to help further analysis
df = pd.DataFrame(data=dataset.data, columns=dataset.feature_names)
df["target"] = dataset.target
df.target = df.target.astype("category")
df.target = df.target.cat.rename_categories(dataset.target_names)
df

To cluster the data using $k$-means clustering:

kmeans_minmax_normalized = make_pipeline(
    preprocessing.MinMaxScaler(), KMeans(n_clusters=3, random_state=0)
)
kmeans_minmax_normalized

feature1, feature2 = "petal length (cm)", "petal width (cm)"
labels = kmeans_minmax_normalized.fit_predict(df[[feature1, feature2]])

plt.figure(figsize=(10, 5), clear=True, num=2)

_ = plt.subplot(121, title="Cluster assignment", xlabel=feature1, ylabel=feature2)
plt.scatter(df[feature1], df[feature2], c=labels)

plt.subplot(122, title="Class (ground truth)", xlabel=feature1, sharey=_)
plt.scatter(df[feature1], df[feature2], c=dataset["target"])

plt.show()
df_silouette = pd.DataFrame(columns=["k", "s"])
df_silouette["k"] = np.arange(2, 11, dtype=int)
df_silouette["s"] = df_silouette["s"].astype(float)
for i in range(len(df_silouette)):
    labels_ = make_pipeline(
        preprocessing.MinMaxScaler(),
        KMeans(n_clusters=df_silouette.loc[i, "k"], random_state=0),
    ).fit_predict(df[[feature1, feature2]])
    # YOUR CODE HERE
    raise NotImplementedError

# plot the silhouette score as a function of k
fig, ax = plt.subplots(clear=True, figsize=(10, 10), layout="constrained", num=4)
df_silouette.plot(x="k", y="s", ax=ax)
plt.xlabel("k")
plt.ylabel("silouette score")
plt.show()
df_silouette
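For reference, here is one way (not necessarily the intended solution) the loop above could fill in the scores: cluster the min-max normalized features for each k and pass the same normalized features to silhouette_score, which is already imported from sklearn.metrics. This assumes the earlier cells defining df, feature1, and feature2 have been run.

# One possible way to fill in the scores (an assumption, not the official
# solution): compute the silhouette score on the same normalized features
# that the k-means step sees.
X_normalized = preprocessing.MinMaxScaler().fit_transform(df[[feature1, feature2]])
for i in range(len(df_silouette)):
    k = int(df_silouette.loc[i, "k"])
    labels_ = KMeans(n_clusters=k, random_state=0).fit_predict(X_normalized)
    df_silouette.loc[i, "s"] = silhouette_score(X_normalized, labels_)
df_silouette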
%%ai chatgpt -f text
Explain how to determine the number of clusters using silhouette analysis. 
Why is this method better than simply looking at the average silhouette 
coefficient?

Extrinsic measures of cluster quality

Suppose $L(\M{p})$ and $C(\M{p})$ are the class and cluster labels of each sample $\M{p}\in D$. An extrinsic measure compares how similar the corresponding partitions $\Set{L_i}$ and $\Set{C_i}$ are, where

\begin{align}
L_i &:= \Set{\M{p}|L(\M{p})=i}\\
C_i &:= \Set{\M{p}|C(\M{p})=i}.
\end{align}

Pairwise correctness

Define the indicator of correct clustering for a pair $\M{p}, \M{q}\in D$ as

\begin{align}
\op{correctness}(\M{p},\M{q}) :=
\begin{cases}
1 & L(\M{p})=L(\M{q}) \iff C(\M{p})=C(\M{q})\\
0 & \text{otherwise.}
\end{cases}
\end{align}

In other words, the clustering for the pair of samples is correct when

  • the samples have equal class labels and equal cluster labels, or
  • the samples have different class labels and different cluster labels.

The following function returns a boolean matrix $[\op{correctness}(\M{p},\M{q})]$, with rows and columns indexed by $\M{p}$ and $\M{q}$, respectively.

def correctness(class_labels, cluster_labels):
    """Returns the pairwise correctness matrix for the class and cluster labels.

    Parameters:
    -----------
    class_labels (array): non-negative integer class labels for certain samples
    cluster_labels (array): corresponding non-negative integer cluster labels

    Returns:
    --------
    A matrix (2D array) of correctness(p,q) with rows and columns indexed by
    p and q, respectively, in the samples. correctness(p,q) indicates whether
      - p and q have equal class labels and equal cluster labels, or
      - p and q have different class labels and different cluster labels.
    """
    class_labels = np.asarray(class_labels)
    cluster_labels = np.asarray(cluster_labels)

    eq_class = class_labels.reshape(-1, 1) == class_labels.reshape(1, -1)
    eq_cluster = cluster_labels.reshape(-1, 1) == cluster_labels.reshape(1, -1)

    return (eq_class & eq_cluster) | ~(eq_class | eq_cluster)

For instance, consider the following class and cluster labels:

| index $i$ | class label $L(\M{p}_i)$ | cluster label $C(\M{p}_i)$ |
|---|---|---|
| 0 | 0 | 1 |
| 1 | 0 | 1 |
| 2 | 1 | 0 |

The correctness matrix and the fraction of correctly clustered pairs are:

correctness_matrix = correctness(class_labels=[0, 0, 1], cluster_labels=[1, 1, 0])
correctness_accuracy = correctness_matrix.mean()
print(f"Correctness matrix:\n {correctness_matrix}")
print(f"Accuracy: {correctness_matrix.mean():.3g}")
# YOUR CODE HERE
raise NotImplementedError
correctness_accuracy
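A minimal sketch for the exercise above, assuming (my assumption) that the goal is to evaluate the $k$-means clustering of the iris dataset computed earlier against the ground-truth classes:

# Assuming the exercise asks for the pairwise correctness of the iris
# clustering above (labels) against the ground-truth classes (dataset["target"]).
correctness_matrix = correctness(class_labels=dataset["target"], cluster_labels=labels)
correctness_accuracy = correctness_matrix.mean()
correctness_accuracy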
%%ai chatgpt -f text
When evaluating a clustering solution with many clusters, why is the accuracy 
from the pairwise correctness matrix misleading?

B-Cubed metrics

The accuracy computed from the pairwise correctness matrix can be misleading when there are many clusters. Similar to the class imbalance problem for classification, precision and recall can be used instead.
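For each sample $\M{p}$, the B-Cubed precision is the fraction of samples in $\M{p}$'s cluster that share $\M{p}$'s class, and the B-Cubed recall is the fraction of samples in $\M{p}$'s class that share $\M{p}$'s cluster:

\begin{align}
\op{precision}(\M{p}) &:= \frac{\left|L_{L(\M{p})}\cap C_{C(\M{p})}\right|}{\left|C_{C(\M{p})}\right|}\\
\op{recall}(\M{p}) &:= \frac{\left|L_{L(\M{p})}\cap C_{C(\M{p})}\right|}{\left|L_{L(\M{p})}\right|},
\end{align}

where $L_i$ and $C_i$ are the partitions defined earlier. The overall precision and recall are the averages of these values over all samples.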

For instance, consider the following class and cluster labels:

| index $i$ | class label $L(\M{p}_i)$ | cluster label $C(\M{p}_i)$ |
|---|---|---|
| 0 | 0 | 0 |
| 1 | 0 | 1 |
| 2 | 1 | 2 |
| 3 | 1 | 2 |

The precision and recall for each point are

| index $i$ | precision for $\M{p}_i$ | recall for $\M{p}_i$ |
|---|---|---|
| 0 | 1 | 0.5 |
| 1 | 1 | 0.5 |
| 2 | 1 | 1 |
| 3 | 1 | 1 |

and so the overall precision and recall are 1 and 0.75, respectively.

class BCubed:
    """Compute B-Cubed precision and recall.

    Parameters
    -----------
    class_labels: int array
        Non-negative integer class labels for certain samples.
    cluster_labels: int array
        Corresponding non-negative integer cluster labels.

    Attributes
    -----------
    precisions: array of float
        B-Cubed precisions for each sample.
    recalls: array of float
        B-Cubed recalls for each sample.
    precision: float
        Overall B-Cubed precision.
    recall: float
        Overall B-Cubed recall.
    """

    def __init__(self, class_labels, cluster_labels):
        self.class_labels = np.asarray(class_labels)
        self.cluster_labels = np.asarray(cluster_labels)

        eq_class = self.class_labels[:, None] == self.class_labels[None, :]
        eq_cluster = self.cluster_labels[:, None] == self.cluster_labels[None, :]

        TPs = (eq_class & eq_cluster).sum(axis=1)
        # YOUR CODE HERE
        raise NotImplementedError
        self.precision = self.precisions.mean()
        self.recall = self.recalls.mean()

    def __repr__(self):
        return f"Precision: {self.precision:.3g}\n" + f"Recall: {self.recall:.3g}"
# tests
# simple case
bcubed_eval = BCubed(class_labels=[0, 0, 1, 1], cluster_labels=[0, 1, 2, 2])
assert np.isclose(bcubed_eval.precisions, [1, 1, 1, 1], rtol=1e-3).all()
assert np.isclose(bcubed_eval.recalls, [0.5, 0.5, 1, 1], rtol=1e-3).all()
# test on iris
bcubed_eval = BCubed(dataset["target"], labels)
print(bcubed_eval)
assert np.isclose(bcubed_eval.precision, 0.9252136752136751, rtol=1e-3)
assert np.isclose(bcubed_eval.recall, 0.9253333333333335, rtol=1e-3)
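To sanity-check your own attempt, the per-sample values can be computed directly from the pairwise equality matrices, following the definitions above: divide the true-positive counts by the cluster sizes for precision and by the class sizes for recall. The standalone function below is a sketch of this idea, not necessarily the intended implementation of the class.

# A standalone sketch (not necessarily the intended implementation of BCubed):
def bcubed_sketch(class_labels, cluster_labels):
    class_labels = np.asarray(class_labels)
    cluster_labels = np.asarray(cluster_labels)
    eq_class = class_labels[:, None] == class_labels[None, :]
    eq_cluster = cluster_labels[:, None] == cluster_labels[None, :]
    TPs = (eq_class & eq_cluster).sum(axis=1)  # pairs with same class and cluster
    precisions = TPs / eq_cluster.sum(axis=1)  # divide by cluster sizes
    recalls = TPs / eq_class.sum(axis=1)       # divide by class sizes
    return precisions, recalls


bcubed_sketch([0, 0, 1, 1], [0, 1, 2, 2])  # expect [1, 1, 1, 1] and [0.5, 0.5, 1, 1]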
%%ai chatgpt -f text
Why are BCubed precision and recall for cluster evaluation called BCubed?

Classes to Clusters Evaluation

Instead of using pairwise comparison, Weka uses the classes-to-clusters evaluation. The computation of its accuracy can be cast as a linear sum assignment problem, also known as maximum-weight matching in bipartite graphs. More precisely:
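Writing $n_{ij}$ for the number of samples with class label $i$ and cluster label $j$ (computed below), the accuracy of a clustering solution is

\begin{align}
\op{accuracy} := \frac{1}{|D|} \max_{\sigma} \sum_{i} n_{i\sigma(i)},
\end{align}

where the maximum is over all one-to-one assignments $\sigma$ of class labels to distinct cluster labels, as found by linear_sum_assignment below.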

It will be useful to compute the matrix of counts $n_{ij}$ of samples with class label $i$ as the row index and cluster label $j$ as the column index. The following function implements this computation.

def class_cluster_counts(class_labels, cluster_labels):
    """Returns the class-cluster count matrix with rows and columns indexed by
    class and cluster labels, respectively.

    Parameters:
    -----------
    class_labels (array): non-negative integer class labels for certain samples
    cluster_labels (array): corresponding non-negative integer cluster labels

    Returns:
    --------
    counts: a matrix of counts of samples with rows indexed by class labels and
            columns indexed by cluster labels.
    """
    class_labels = np.asarray(class_labels)
    cluster_labels = np.asarray(cluster_labels)
    counts = np.zeros((class_labels.max() + 1, cluster_labels.max() + 1), dtype=int)
    for i, j in np.column_stack((class_labels, cluster_labels)):
        counts[i, j] += 1
    return counts

For the $k$-means clustering on the iris dataset, the matrix $[n_{ij}]$ of class-cluster counts is:

counts = class_cluster_counts(dataset["target"], labels)
df_counts = pd.DataFrame(counts)
df_counts

We can use linear_sum_assignment from the scipy.optimize module to find the optimal assignment as follows:

from scipy.optimize import linear_sum_assignment
classes, clusters = linear_sum_assignment(counts, maximize=True)

The following highlights the optimal assignment on the class-cluster count matrix.

def highlight(data):
    attr = "background-color: yellow"
    return pd.DataFrame(
        [
            [attr if i == j else "" for j in range(len(data.columns))]
            for i in range(len(data.index))
        ],
        index=data.index,
        columns=data.columns,
    )


df_counts.style.apply(highlight, axis=None, subset=(classes, clusters))
class Classes2ClustersEvaluation:
    """Classes-to-clusters evaluation.

    Parameters:
    -----------
    class_labels: array
        Non-negative integer class labels for certain samples.
    cluster_labels: array
        Corresponding non-negative integer cluster labels.

    Attributes:
    -----------
    accuracy: float
        fraction of correctly clustered instances.
    classes: int array
        Assigned class labels sorted in ascending order.
    clusters: int array
        Cluster labels corresponding to the assigned class labels.
    """

    def __init__(self, class_labels, cluster_labels):
        self.class_labels = np.asarray(class_labels)
        self.cluster_labels = np.asarray(cluster_labels)
        # YOUR CODE HERE
        raise NotImplementedError

    def __repr__(self):
        s = f"Accuracy: {self.accuracy:.3g}\n"
        for i, j in np.column_stack((self.classes, self.clusters)):
            s += f"Class #{i} --> Cluster #{j}\n"
        return s
# tests
# simple case
c2c_eval = Classes2ClustersEvaluation(np.array([0, 0, 1, 1]), np.array([0, 1, 2, 2]))
assert (c2c_eval.classes == [0, 1]).all()
assert (c2c_eval.clusters == [0, 2]).all()
assert np.isclose(c2c_eval.accuracy, 0.75, rtol=1e-3)
# test on iris
c2c_eval = Classes2ClustersEvaluation(dataset["target"], labels)
print(c2c_eval)
assert np.isclose(c2c_eval.accuracy, 0.96, rtol=1e-3)
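For reference, the same quantities can be obtained directly from class_cluster_counts and linear_sum_assignment; the standalone function below is a sketch of one possible approach, not necessarily the intended implementation of the class.

# A standalone sketch (not necessarily the intended implementation of
# Classes2ClustersEvaluation): the matched counts are the correctly
# clustered samples under the optimal class-to-cluster assignment.
def classes_to_clusters_sketch(class_labels, cluster_labels):
    class_labels = np.asarray(class_labels)
    cluster_labels = np.asarray(cluster_labels)
    counts = class_cluster_counts(class_labels, cluster_labels)
    classes, clusters = linear_sum_assignment(counts, maximize=True)
    accuracy = counts[classes, clusters].sum() / len(class_labels)
    return accuracy, classes, clusters


classes_to_clusters_sketch(dataset["target"], labels)  # accuracy should be about 0.96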
%%ai chatgpt -f text
What is the time complexity of the linear sum assignment problem? Is it
considered scalable to large datasets? If not, are there better implementations
of the classes-to-clusters evaluation?