import os
import ipywidgets as widgets
import matplotlib.pyplot as plt
import pandas as pd
from ipywidgets import interact
from sklearn import datasets, preprocessing
from sklearn.cluster import AgglomerativeClustering
from sklearn.pipeline import make_pipeline
from util import plot_dendrogram
%matplotlib widget
if not os.getenv(
"NBGRADER_EXECUTION"
):
%load_ext jupyter_ai
%ai update chatgpt dive:chat
# %ai update chatgpt dive-azure:gpt4o
In this notebook, we continue to cluster the instances in the iris 2D dataset using scikit-learn.
Agglomerative clustering
We first import the iris dataset from sklearn.datasets
and store it as a DataFrame.
# Load the iris dataset through sklearn.datasets.
dataset = datasets.load_iris()

# Build a DataFrame with one column per feature name to ease further analysis.
df = pd.DataFrame(data=dataset.data, columns=dataset.feature_names)

# Append the class labels as a categorical column, then rename its categories
# to the class names supplied by the dataset.
df["target"] = dataset.target
df["target"] = (
    df["target"].astype("category").cat.rename_categories(dataset.target_names)
)

df  # display an overview of the data
To normalize the features followed by agglomerative clustering, we create a pipeline as follows:
from sklearn.cluster import AgglomerativeClustering

# Pipeline with two steps: min-max scaling of the features, followed by
# agglomerative clustering.
agnes_minmax_normalized = make_pipeline(
    preprocessing.MinMaxScaler(),
    AgglomerativeClustering(
        n_clusters=3,  # number of clusters to return
        linkage="complete",  # complete-linkage criterion
        memory="private",  # cache location
        compute_distances=True,  # needed later for the dendrogram
    ),
)

agnes_minmax_normalized  # display the pipeline
In the above, we configured the agglomerative clustering algorithm to use complete-linkage to return 3 clusters.
- By setting the parameter `memory`, the solution will be cached to the specified folder `private`.
- By setting `compute_distances` to `True`, the cluster distances are computed for dendrogram visualization later on.
To cluster based on the two input features petal length (cm)
and petal width (cm)
:
# The two input features used for clustering and for the plot axes.
feature1 = "petal length (cm)"
feature2 = "petal width (cm)"

# Fit the pipeline on the two selected columns and keep the cluster labels.
cluster_labels = agnes_minmax_normalized.fit_predict(df[[feature1, feature2]])

# Figure 1 holds two side-by-side scatter plots of the same points.
plt.figure(num=1, figsize=(10, 5), clear=True)

# Left panel: points colored by the assigned cluster.
plt.subplot(1, 2, 1)
plt.scatter(df[feature1], df[feature2], c=cluster_labels)
plt.xlabel(feature1)
plt.ylabel(feature2)
plt.title("Cluster assignment")

# Right panel: points colored by the class label from the dataset.
plt.subplot(1, 2, 2)
plt.scatter(df[feature1], df[feature2], c=dataset.target)
plt.xlabel(feature1)
plt.title("Class (ground truth)")

plt.show()
The `fit_predict` method fits the pipeline to the given data and returns the cluster label of each instance.
YOUR ANSWER HERE
%%ai chatgpt -f text
How to apply an existing clustering solution from the complete linkage method
to predict the cluster index of a new data point?
Dendrogram
Further details of a clustering solution can be obtained from the fitted properties:[1]
# Look up the clustering step of the pipeline by its step name.
agnes = agnes_minmax_normalized.named_steps["agglomerativeclustering"]

# Print the attributes of the clustering step; array-valued attributes are
# unpacked so that their elements are printed separated by spaces.
print("Cluster labels for each sample:", *agnes.labels_)
print("Children of each non-leaf node:", *agnes.children_)
print("Distances between nodes:", *agnes.distances_.round(decimals=3))
print("Number of leaves:", agnes.n_leaves_)
print("Number of connected components:", agnes.n_connected_components_)
To plot the dendrogram, we will use the function `plot_dendrogram`, imported as follows:
from util import plot_dendrogram
The code is adapted from an example in sklearn
, which uses the function dendrogram
from scipy.cluster.hierarchy
. To generate the dendrogram:
# Figure 2 holds the dendrogram of the clustering step `agnes`.
plt.figure(num=2, figsize=(10, 5), clear=True)

# NOTE(review): `distance_sort` is presumably forwarded to
# scipy.cluster.hierarchy.dendrogram by the helper -- confirm in util.py.
plot_dendrogram(agnes, distance_sort=True)

plt.xlabel("instance index")
plt.ylabel("cophenetic distance")
plt.title("Dendrogram for complete-linkage method on iris dataset")
plt.show()
# Figure 3 is created once and reused by every invocation of the callback.
fig = plt.figure(num=3, figsize=(10, 10), clear=True)


@interact(
    linkage=["ward", "complete", "average", "single"],
    feature1=dataset.feature_names,
    feature2=dataset.feature_names,
)
def analyze_agnes(
    linkage,
    feature1=dataset.feature_names[2],
    feature2=dataset.feature_names[3],
    k=widgets.IntSlider(3, 1, 4, continuous_update=False),
):
    """Interactively plot an agglomerative clustering of the iris dataset.

    The plotting code draws three panels on figure 3: a dendrogram on top,
    a scatter plot colored by cluster assignment at the bottom left, and a
    scatter plot colored by the class labels at the bottom right.

    Parameters
    ----------
    linkage
        Linkage name chosen from the dropdown; the plotting code only uses
        it in the dendrogram title.
    feature1, feature2
        Names of the columns of ``df`` plotted on the x- and y-axes.
    k
        Integer slider value from 1 to 4, 3 by default. It is not used by
        the plotting code; presumably the exercise code should use it as
        the number of clusters -- confirm against the exercise statement.

    Raises
    ------
    NotImplementedError
        Always, until the exercise placeholder below is replaced.

    Notes
    -----
    The plotting code reads ``agnes_minmax_normalized``. Presumably the
    exercise code is expected to bind a fitted pipeline to that name inside
    this function; otherwise the module-level pipeline is used.
    """
    # YOUR CODE HERE
    raise NotImplementedError
    agnes = agnes_minmax_normalized["agglomerativeclustering"]
    # The figure outlives each call, so remove the axes drawn by the previous
    # invocation; otherwise new subplots pile up on top of the stale ones.
    fig.clear()
    # Create subplots
    ax1 = fig.add_subplot(211)
    ax2 = fig.add_subplot(223)
    ax3 = fig.add_subplot(224)
    # Plot dendrogram
    plot_dendrogram(agnes, ax=ax1)
    ax1.set_title(f"Dendrogram for {linkage}-linkage method on iris dataset")
    ax1.set_ylabel("cophenetic distance")
    ax1.set_xlabel("instance index")
    # Plot cluster assignment
    ax2.scatter(df[feature1], df[feature2], c=agnes.labels_)
    ax2.set_title("Cluster assignment")
    ax2.set_ylabel(feature2)
    ax2.set_xlabel(feature1)
    # Plot ground truth
    ax3.scatter(df[feature1], df[feature2], c=dataset["target"])
    ax3.set_title("Class (ground truth)")
    ax3.set_xlabel(feature1)
    plt.show()
%%ai chatgpt -f text
Does sklearn implement a divisive clustering algorithm? If not, why not?
What are the pros and cons of agglomerative vs divisive clustering?
sklearn uses a trailing underscore to denote fitted properties.