import os
import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from ipywidgets import interact
from sklearn import datasets, preprocessing
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline
from util import plot_cluster_regions
%matplotlib widget
if not os.getenv(
"NBGRADER_EXECUTION"
):
%load_ext jupyter_ai
%ai update chatgpt dive:chat
# %ai update chatgpt dive-azure:gpt4o
Recall that the classification problem for the iris dataset is to classify the iris species based on the lengths and widths of the petals and sepals. In this notebook, we will cluster the instances in the iris dataset with the class attribute removed. The purpose is to evaluate whether a partitional clustering algorithm can identify the 3 different iris species without looking at the class attribute.
%%ai chatgpt -f text
Is it meaningful to cluster data with the class attribute removed?
Why not train a classifier instead?
Clustering using Weka
The Explorer interface has a `Cluster` panel for clustering. Follow the procedure below to cluster the `iris.2D` dataset:
- Using the `Preprocess` panel, load `iris.2D.arff` from the Weka data folder.
- Using the `Cluster` panel, choose the `Clusterer` as `SimpleKMeans`, which implements the $k$-means clustering algorithm.
- The default number of clusters is $k=2$. Change it to $k=3$ instead, i.e., set `numClusters` to 3.
- Click the `ignore attributes` button below `Cluster mode` and select `class`.
- Click `Start` to run the clustering algorithm.
%%ai chatgpt -f text
How to use python-weka-wrapper3 to cluster the iris.2D.arff dataset?
Source
# Exercise placeholder: the code written here is expected to define `wss`,
# which is displayed right after this stub.
# YOUR CODE HERE
raise NotImplementedError
wss
Source
# Start from an empty float-typed table with one column per petal feature.
# The exercise code below is expected to fill it in (presumably with the
# cluster centroids, judging by the name); it is displayed after this stub.
df_centroids = pd.DataFrame(columns=["petallength", "petalwidth"], dtype=float)
# YOUR CODE HERE
raise NotImplementedError
df_centroids
YOUR ANSWER HERE
YOUR ANSWER HERE
Source
# Exercise placeholder: the code written here is expected to define
# `error_rate`, which is displayed right after this stub.
# YOUR CODE HERE
raise NotImplementedError
error_rate
Explain how the error rate is calculated.
YOUR ANSWER HERE
Clustering using scikit-learn
We first import the iris dataset from sklearn.datasets and store it as a DataFrame.
# Fetch the iris data bundled with scikit-learn.
dataset = datasets.load_iris()
# Tabulate the measurements, one column per feature, to ease later analysis.
df = pd.DataFrame(data=dataset.data, columns=dataset.feature_names)
# Attach the class as a categorical column: the integer codes become the
# categories and are then relabelled with the names in dataset.target_names.
df["target"] = dataset.target
df["target"] = (
    df["target"].astype("category").cat.rename_categories(dataset.target_names)
)
df # display an overview of the data
To normalize the features and then apply $k$-means clustering, we create a pipeline as follows:
from sklearn.cluster import KMeans

# Two-step pipeline: min-max scale the features, then run k-means with
# 3 clusters on the scaled values.
kmeans_minmax_normalized = make_pipeline(
    preprocessing.MinMaxScaler(), KMeans(n_clusters=3)
)
kmeans_minmax_normalized
To cluster based on the two input features `petal length (cm)` and `petal width (cm)`:
feature1, feature2 = "petal length (cm)", "petal width (cm)"

# Fit the pipeline on the two petal features and label the same instances.
petal_xy = df[[feature1, feature2]]
kmeans_minmax_normalized.fit(petal_xy)
cluster_ids = kmeans_minmax_normalized.predict(petal_xy)

plt.figure(num=1, figsize=(10, 5))

# Left panel: points coloured by the cluster the fitted pipeline assigns.
plt.subplot(1, 2, 1)
plt.scatter(df[feature1], df[feature2], c=cluster_ids)
plt.title("Cluster assignment")
plt.xlabel(feature1)
plt.ylabel(feature2)

# Right panel: the same points coloured by the dataset's target codes.
plt.subplot(1, 2, 2)
plt.scatter(df[feature1], df[feature2], c=dataset["target"])
plt.title("Class (ground truth)")
plt.xlabel(feature1)
plt.show()
Since clustering is unsupervised, unlike classification,
- the `fit` method of `kmeans_minmax_normalized` does not take the target attribute as an argument, and
- the `predict` method returns cluster labels that may not be associated with the class labels.
Further details can be obtained from the fitted properties (those with a trailing underscore):
# Pull the k-means step out of the pipeline by its step name.
kmeans = kmeans_minmax_normalized.named_steps["kmeans"]
# Report the fitted attributes; the arrays are unpacked so that each element
# is printed as a separate space-separated item.
print("Cluster labels:", *kmeans.labels_)
print("Cluster centers:", *kmeans.cluster_centers_)
print("WSS:", kmeans.inertia_)
print("# iterations:", kmeans.n_iter_)
Similar to plotting the decision regions for a classifier, we provide the function `plot_cluster_regions` in `util.py` to plot the cluster regions for a clusterer.
from util import plot_cluster_regions
?plot_cluster_regions
Complete the following code to assign to `kmeans` a trained and pipelined $k$-means clusterer that
- normalizes the features according to the method specified by `normalization`, and
- clusters the instances of features specified by `feature1` and `feature2` into `k` clusters.
Observe the change in the clustering solution when you change the normalization, features, and number of clusters.
if not os.getenv("NBGRADER_EXECUTION"):
fig, ax = plt.subplots(
nrows=1,
ncols=1,
clear=True,
figsize=(10, 10),
layout="constrained",
num=2,
sharey=True,
)
@interact(
normalization=["None", "Min-max", "Standard"],
feature1=dataset.feature_names,
feature2=dataset.feature_names,
k=widgets.IntSlider(3, 1, 4, continuous_update=False),
resolution=widgets.IntSlider(1, 1, 4, continuous_update=False),
)
def cluster_regions_kmeans(
normalization,
feature1=dataset.feature_names[2],
feature2=dataset.feature_names[3],
k=3,
resolution=1,
):
scaler = {
"Min-max": preprocessing.MinMaxScaler,
"Standard": preprocessing.StandardScaler,
}
# YOUR CODE HERE
raise NotImplementedError
ax.clear()
plot_cluster_regions(
df[[feature1, feature2]], df.target, kmeans, N=resolution * 100, ax=ax
)
ax.set_title("Cluster regions for k-means clustering")
ax.set_xlabel(feature1)
ax.set_ylabel(feature2)
plt.show()%%ai chatgpt -f text
How to visualize the clustering solution if the dataset has very high dimension?