import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.optimize import linear_sum_assignment
from sklearn import datasets, preprocessing
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.pipeline import make_pipeline
%matplotlib widget
if not os.getenv("NBGRADER_EXECUTION"):
    %load_ext jupyter_ai
    %ai update chatgpt dive:chat
    # %ai update chatgpt dive-azure:gpt4o
In this notebook, we will consider different methods of evaluating clustering solutions.
Intrinsic measures
The intrinsic measures of cluster quality are helpful when the ground truth is unavailable or should not be used. For example, to determine the number of clusters, we can compare the intrinsic measures of the clustering solutions for different numbers of clusters.
Elbow Method
The following diagram shows the KnowledgeFlow interface of Weka that applies SimpleKMeans to the iris.2D.arff dataset for different choices of $k$. Implement the above using Weka to answer the following exercise.
df_WSS = pd.DataFrame(columns=["k", "WSS"])
df_WSS["k"] = np.arange(1, 5, dtype=int)
df_WSS["WSS"] = df_WSS["WSS"].astype(float)
# YOUR CODE HERE
raise NotImplementedError
# plot WSS as a function of k
fig, ax = plt.subplots(clear=True, figsize=(10, 10), layout="constrained", num=1)
df_WSS.plot(x="k", y="WSS", xlabel=r"$k$", ylabel="WSS", legend=False, ax=ax)
# plt.xlabel("k")
# plt.ylabel("WSS")
plt.show()
df_WSS
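If you would like to sanity-check the WSS values obtained from Weka, the following sketch (an assumption about the setup, not the required Weka solution) computes them with sklearn's KMeans, whose inertia_ attribute is the within-cluster sum of squared errors. It approximates iris.2D.arff with the two petal attributes of sklearn's iris dataset, so the numbers may differ slightly from Weka's SimpleKMeans defaults.
# hedged cross-check of the WSS values using sklearn (illustrative only)
X = datasets.load_iris().data[:, 2:]  # petal length (cm) and petal width (cm)
for i, k in enumerate(df_WSS["k"]):
    km = KMeans(n_clusters=int(k), random_state=0).fit(X)
    df_WSS.loc[i, "WSS"] = km.inertia_  # inertia_ is the within-cluster sum of squares
df_WSS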
%%ai chatgpt -f text
When using the elbow method to determine the number of clusters, how to
determine the number of clusters if there are multiple elbows?
Silhouette analysis
An alternative to the elbow method is silhouette analysis, which is based on the silhouette coefficient defined below: for each sample $p$ belonging to a cluster with at least two samples,

$$s(p) := \frac{b(p) - a(p)}{\max\{a(p), b(p)\}},$$

where $a(p)$ is the mean distance from $p$ to the other samples in its own cluster and $b(p)$ is the smallest mean distance from $p$ to the samples of any other cluster. The silhouette score of a clustering solution is the mean of $s(p)$ over all samples.
%%ai chatgpt -f text
Why should the silhouette coefficient be undefined, rather than 0, when the
number of clusters is 1, unlike when the size of the cluster is 1?
%%ai chatgpt -f text
Why does the silhouette coefficient use distance instead of squared distance?
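For intuition, here is a minimal from-scratch sketch of the definition above; silhouette_of_sample is a hypothetical helper for illustration only, and the sklearn implementation is used for the actual analysis below.
# hypothetical helper illustrating s(p); not used in the graded analysis
def silhouette_of_sample(X, labels, p):
    X, labels = np.asarray(X, dtype=float), np.asarray(labels)
    d = np.linalg.norm(X - X[p], axis=1)  # Euclidean distances from sample p
    same = labels == labels[p]
    if same.sum() == 1:  # singleton cluster: s(p) is 0 by convention
        return 0.0
    a = d[same].sum() / (same.sum() - 1)  # mean distance within p's cluster, excluding p
    # the minimum below is over the other clusters; it fails when there is only
    # one cluster, consistent with s(p) being undefined in that case
    b = min(d[labels == c].mean() for c in np.unique(labels) if c != labels[p])
    return (b - a) / max(a, b)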
We will use the existing implementation in sklearn. First, import the iris dataset from sklearn.datasets and store it as a DataFrame:
# load the dataset from sklearn
dataset = datasets.load_iris()
# create a DataFrame to help further analysis
df = pd.DataFrame(data=dataset.data, columns=dataset.feature_names)
df["target"] = dataset.target
df.target = df.target.astype("category")
df.target = df.target.cat.rename_categories(dataset.target_names)
df
To cluster the data using $k$-means clustering:
kmeans_minmax_normalized = make_pipeline(
preprocessing.MinMaxScaler(), KMeans(n_clusters=3, random_state=0)
)
kmeans_minmax_normalized
feature1, feature2 = "petal length (cm)", "petal width (cm)"
labels = kmeans_minmax_normalized.fit_predict(df[[feature1, feature2]])
plt.figure(figsize=(10, 5), clear=True, num=2)
_ = plt.subplot(121, title="Cluster assignment", xlabel=feature1, ylabel=feature2)
plt.scatter(df[feature1], df[feature2], c=labels)
plt.subplot(122, title="Class (ground truth)", xlabel=feature1, sharey=_)
plt.scatter(df[feature1], df[feature2], c=dataset["target"])
plt.show()
df_silouette = pd.DataFrame(columns=["k", "s"])
df_silouette["k"] = np.arange(2, 11, dtype=int)
df_silouette["s"] = df_silouette["s"].astype(float)
for i in range(len(df_silouette)):
labels_ = make_pipeline(
preprocessing.MinMaxScaler(),
KMeans(n_clusters=df_silouette.loc[i, "k"], random_state=0),
).fit_predict(df[[feature1, feature2]])
# YOUR CODE HERE
raise NotImplementedError
# plot the silhouette score as a function of k
fig, ax = plt.subplots(clear=True, figsize=(10, 10), layout="constrained", num=4)
df_silouette.plot(x="k", y="s", ax=ax)
plt.xlabel("k")
plt.ylabel("silouette score")
plt.show()
df_silouette
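One possible way to complete the graded loop above (a sketch under the assumption that sklearn's silhouette_score is applied to the min-max-normalized features, i.e., the space in which KMeans operates):
# hedged sketch: mean silhouette coefficient for each k on the normalized features
X_norm = preprocessing.MinMaxScaler().fit_transform(df[[feature1, feature2]])
for i in range(len(df_silouette)):
    k = int(df_silouette.loc[i, "k"])
    labels_k = KMeans(n_clusters=k, random_state=0).fit_predict(X_norm)
    df_silouette.loc[i, "s"] = silhouette_score(X_norm, labels_k)
df_silouette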
%%ai chatgpt -f text
Explain how to determine the number of clusters using silhouette analysis.
Why is this method better than simply looking at the average silhouette
coefficient?
Extrinsic measures of cluster quality
Suppose $\ell(p)$ and $c(p)$ are, respectively, the class and cluster labels of each sample $p$ in the set $\mathcal{S}$ of samples. An extrinsic measure compares how similar the corresponding partitions $\{\ell^{-1}(i)\}_i$ and $\{c^{-1}(j)\}_j$ are, where

$$\ell^{-1}(i) := \{p \in \mathcal{S} \mid \ell(p) = i\} \quad \text{and} \quad c^{-1}(j) := \{p \in \mathcal{S} \mid c(p) = j\}.$$
Pairwise correctness
Define the indicator of correct clustering for a pair $(p, q) \in \mathcal{S}^2$ as

$$\operatorname{correctness}(p, q) := \begin{cases} 1 & \text{if } \ell(p) = \ell(q) \text{ and } c(p) = c(q),\\ 1 & \text{if } \ell(p) \neq \ell(q) \text{ and } c(p) \neq c(q),\\ 0 & \text{otherwise.} \end{cases}$$

In other words, the clustering for the pair $(p, q)$ of samples is correct when
- the samples have equal class labels and equal cluster labels, or
- the samples have different class labels and different cluster labels.
The following function returns a boolean matrix of $\operatorname{correctness}(p, q)$, with rows and columns indexed by $p$ and $q$, respectively.
def correctness(class_labels, cluster_labels):
"""Returns the pairwise correctness matrix for the class and cluster labels.
Parameters:
-----------
class_labels (array): non-negative integer class labels for certain samples
cluster_labels (array): corresponding non-negative integer cluster labels
Returns:
--------
A matrix (2D array) of correctness(p,q) with rows and columns indexed by
p and q, respectively, in the samples. correctness(p,q) indicates whether
- p and q have equal class labels and equal cluster labels, or
- p and q have different class labels and different cluster labels.
"""
class_labels = np.asarray(class_labels)
cluster_labels = np.asarray(cluster_labels)
eq_class = class_labels.reshape(-1, 1) == class_labels.reshape(1, -1)
eq_cluster = cluster_labels.reshape(-1, 1) == cluster_labels.reshape(1, -1)
return (eq_class & eq_cluster) | ~(eq_class | eq_cluster)
For instance, consider the following class and cluster labels:
index | class label | cluster label |
---|---|---|
0 | 0 | 1 |
1 | 0 | 1 |
2 | 1 | 0 |
The correctness matrix and the fraction of correctly clustered pairs are:
correctness_matrix = correctness(class_labels=[0, 0, 1], cluster_labels=[1, 1, 0])
correctness_accuracy = correctness_matrix.mean()
print(f"Correctness matrix:\n {correctness_matrix}")
print(f"Accuracy: {correctness_matrix.mean():.3g}")
# YOUR CODE HERE
raise NotImplementedError
correctness_accuracy
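The graded cell above leaves the computation blank. One plausible reading, assumed here, is that it asks for the pairwise-correctness accuracy of the $k$-means clustering of the iris data computed earlier; a sketch of that computation is:
# hedged sketch (assuming the exercise targets the iris clustering from above)
correctness_matrix = correctness(dataset["target"], labels)
correctness_accuracy = correctness_matrix.mean()
correctness_accuracy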
%%ai chatgpt -f text
When evaluating a clustering solution with many clusters, why is the accuracy
from the pairwise correctness matrix misleading?
B-Cubed metrics
The accuracy computed from the pairwise correctness matrix can be misleading when there are many clusters. Similar to the class imbalance problem for classification, precision and recall can be used instead.
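Concretely, for each sample $p$, the B-Cubed precision and recall (stated here using the standard definitions, which the table below follows) are

$$\operatorname{precision}(p) := \frac{|\{q \mid c(q) = c(p) \text{ and } \ell(q) = \ell(p)\}|}{|\{q \mid c(q) = c(p)\}|}, \qquad \operatorname{recall}(p) := \frac{|\{q \mid c(q) = c(p) \text{ and } \ell(q) = \ell(p)\}|}{|\{q \mid \ell(q) = \ell(p)\}|},$$

and the overall precision and recall are the means of $\operatorname{precision}(p)$ and $\operatorname{recall}(p)$ over all samples $p$.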
For instance, consider the following class and cluster labels:
index | class label | cluster label |
---|---|---|
0 | 0 | 0 |
1 | 0 | 1 |
2 | 1 | 2 |
3 | 1 | 2 |
The precision and recall for each point are
index $p$ | precision for $p$ | recall for $p$ |
---|---|---|
0 | 1 | 0.5 |
1 | 1 | 0.5 |
2 | 1 | 1 |
3 | 1 | 1 |
and so the overall precision and recall are 1 and 0.75, respectively.
class BCubed:
"""Compute B-Cubed precision and recall.
Parameters
-----------
class_labels: int array
Non-negative integer class labels for certain samples.
cluster_labels: int array
Corresponding non-negative integer cluster labels.
Attributes
-----------
precisions: array of float
B-Cubed precisions for each sample.
recalls: array of float
B-Cubed recalls for each sample.
precision: float
Overall B-Cubed precision.
recall: float
Overall B-Cubed recall.
"""
def __init__(self, class_labels, cluster_labels):
self.class_labels = np.asarray(class_labels)
self.cluster_labels = np.asarray(cluster_labels)
eq_class = self.class_labels[:, None] == self.class_labels[None, :]
eq_cluster = self.cluster_labels[:, None] == self.cluster_labels[None, :]
TPs = (eq_class & eq_cluster).sum(axis=1)
# YOUR CODE HERE
raise NotImplementedError
self.precision = self.precisions.mean()
self.recall = self.recalls.mean()
def __repr__(self):
return f"Precision: {self.precision:.3g}\n" + f"Recall: {self.recall:.3g}"
# tests
# simple case
bcubed_eval = BCubed(class_labels=[0, 0, 1, 1], cluster_labels=[0, 1, 2, 2])
assert np.isclose(bcubed_eval.precisions, [1, 1, 1, 1], rtol=1e-3).all()
assert np.isclose(bcubed_eval.recalls, [0.5, 0.5, 1, 1], rtol=1e-3).all()
# test on iris
bcubed_eval = BCubed(dataset["target"], labels)
print(bcubed_eval)
assert np.isclose(bcubed_eval.precision, 0.9252136752136751, rtol=1e-3)
assert np.isclose(bcubed_eval.recall, 0.9253333333333335, rtol=1e-3)
%%ai chatgpt -f text
Why are BCubed precision and recall for cluster evaluation called BCubed?
Classes to Clusters Evaluation
Instead of using pairwise comparison, Weka uses the classes-to-clusters evaluation. The computation of the accuracy can be cast as a linear sum assignment problem, also known as the maximum weight matching in bipartite graphs. More precisely,

$$\text{accuracy} := \max_{\sigma} \frac{1}{|\mathcal{S}|} \sum_{i} \big|\{p \in \mathcal{S} \mid \ell(p) = i \text{ and } c(p) = \sigma(i)\}\big|,$$

where the maximization is over all one-to-one assignments $\sigma$ of class labels to cluster labels.
It will be useful to compute a matrix of the counts of samples with class label $i$ as the row index and cluster label $j$ as the column index. The following function implements this computation.
def class_cluster_counts(class_labels, cluster_labels):
"""Returns the class-cluster count matrix with rows and columns indexed by
class and cluster labels, respectively.
Parameters:
-----------
class_labels (array): non-negative integer class labels for certain samples
cluster_labels (array): corresponding non-negative integer cluster labels
Returns:
--------
counts: a matrix of counts of samples with rows indexed by class labels and
columns indexed by cluster labels.
"""
counts = np.zeros((class_labels.max() + 1, cluster_labels.max() + 1), dtype=int)
for i, j in np.column_stack((class_labels, cluster_labels)):
counts[i, j] += 1
return counts
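As a side note, the Python loop above can be replaced by a single vectorized call; the following sketch (under the same assumption of non-negative integer labels) is equivalent:
# equivalent vectorized sketch using np.add.at (illustrative only)
def class_cluster_counts_vectorized(class_labels, cluster_labels):
    class_labels = np.asarray(class_labels)
    cluster_labels = np.asarray(cluster_labels)
    counts = np.zeros((class_labels.max() + 1, cluster_labels.max() + 1), dtype=int)
    np.add.at(counts, (class_labels, cluster_labels), 1)  # unbuffered in-place addition
    return counts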
For the $k$-means clustering on the iris dataset, the matrix of class-cluster counts is:
counts = class_cluster_counts(dataset["target"], labels)
df_counts = pd.DataFrame(counts)
df_counts
We can use linear_sum_assignment from the scipy.optimize module to find the optimal assignment as follows:
from scipy.optimize import linear_sum_assignment
classes, clusters = linear_sum_assignment(counts, maximize=True)
The following highlights the optimal assignment on the class-cluster count matrix.
def highlight(data):
attr = "background-color: yellow"
return pd.DataFrame(
[
[attr if i == j else "" for j in range(len(data.columns))]
for i in range(len(data.index))
],
index=data.index,
columns=data.columns,
)
df_counts.style.apply(highlight, axis=None, subset=(classes, clusters))
class Classes2ClustersEvaluation:
"""Classes-to-clusters evaluation.
Parameters:
-----------
class_labels: array
Non-negative integer class labels for certain samples.
cluster_labels: array
Corresponding non-negative integer cluster labels.
Attributes:
-----------
accuracy: float
fraction of correctly clustered instances.
classes: int array
Assigned class labels sorted in ascending order.
clusters: int array
Cluster labels corresponding to the assigned class labels.
"""
def __init__(self, class_labels, cluster_labels):
self.class_labels = np.asarray(class_labels)
self.cluster_labels = np.asarray(cluster_labels)
# YOUR CODE HERE
raise NotImplementedError
def __repr__(self):
s = f"Accuracy: {self.accuracy:.3g}\n"
for i, j in np.column_stack((self.classes, self.clusters)):
s += f"Class #{i} --> Cluster #{j}\n"
return s
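The graded part of the constructor is left blank above. A standalone sketch of what it needs to compute (assuming the classes-to-clusters accuracy defined earlier; not necessarily the intended solution): build the class-cluster count matrix, solve the assignment, and divide the total matched count by the number of samples.
# hedged standalone sketch of the classes-to-clusters accuracy (illustrative only)
def classes_to_clusters_accuracy(class_labels, cluster_labels):
    class_labels = np.asarray(class_labels)
    cluster_labels = np.asarray(cluster_labels)
    counts = class_cluster_counts(class_labels, cluster_labels)
    classes, clusters = linear_sum_assignment(counts, maximize=True)
    accuracy = counts[classes, clusters].sum() / len(class_labels)
    return accuracy, classes, clusters

acc, cls_, clu_ = classes_to_clusters_accuracy([0, 0, 1, 1], [0, 1, 2, 2])
print(acc)  # 0.75, matching the simple test below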
# tests
# simple case
c2c_eval = Classes2ClustersEvaluation(np.array([0, 0, 1, 1]), np.array([0, 1, 2, 2]))
assert (c2c_eval.classes == [0, 1]).all()
assert (c2c_eval.clusters == [0, 2]).all()
assert np.isclose(c2c_eval.accuracy, 0.75, rtol=1e-3)
# test on iris
c2c_eval = Classes2ClustersEvaluation(dataset["target"], labels)
print(c2c_eval)
assert np.isclose(c2c_eval.accuracy, 0.96, rtol=1e-3)
%%ai chatgpt -f text
What is the time complexity of the linear sum assignment problem? Is it
considered scalable to large datasets? If not, are there better implementations
of the classes-to-clusters evaluation?