12.11.6: Python Code for K-Means Clustering

Last updated
Save as PDF

Page ID: 142087

\( \newcommand{\vecs}[1]{\overset { \scriptstyle \rightharpoonup} {\mathbf{#1}} } \)

\( \newcommand{\vecd}[1]{\overset{-\!-\!\rightharpoonup}{\vphantom{a}\smash {#1}}} \)

\( \newcommand{\dsum}{\displaystyle\sum\limits} \)

\( \newcommand{\dint}{\displaystyle\int\limits} \)

\( \newcommand{\dlim}{\displaystyle\lim\limits} \)

\( \newcommand{\id}{\mathrm{id}}\) \( \newcommand{\Span}{\mathrm{span}}\)

\( \newcommand{\kernel}{\mathrm{null}\,}\) \( \newcommand{\range}{\mathrm{range}\,}\)

\( \newcommand{\RealPart}{\mathrm{Re}}\) \( \newcommand{\ImaginaryPart}{\mathrm{Im}}\)

\( \newcommand{\Argument}{\mathrm{Arg}}\) \( \newcommand{\norm}[1]{\| #1 \|}\)

\( \newcommand{\inner}[2]{\langle #1, #2 \rangle}\)

\( \newcommand{\Span}{\mathrm{span}}\)

\( \newcommand{\id}{\mathrm{id}}\)

\( \newcommand{\Span}{\mathrm{span}}\)

\( \newcommand{\kernel}{\mathrm{null}\,}\)

\( \newcommand{\range}{\mathrm{range}\,}\)

\( \newcommand{\RealPart}{\mathrm{Re}}\)

\( \newcommand{\ImaginaryPart}{\mathrm{Im}}\)

\( \newcommand{\Argument}{\mathrm{Arg}}\)

\( \newcommand{\norm}[1]{\| #1 \|}\)

\( \newcommand{\inner}[2]{\langle #1, #2 \rangle}\)

\( \newcommand{\Span}{\mathrm{span}}\) \( \newcommand{\AA}{\unicode[.8,0]{x212B}}\)

\( \newcommand{\vectorA}[1]{\vec{#1}} % arrow\)

\( \newcommand{\vectorAt}[1]{\vec{\text{#1}}} % arrow\)

\( \newcommand{\vectorB}[1]{\overset { \scriptstyle \rightharpoonup} {\mathbf{#1}} } \)

\( \newcommand{\vectorC}[1]{\textbf{#1}} \)

\( \newcommand{\vectorD}[1]{\overrightarrow{#1}} \)

\( \newcommand{\vectorDt}[1]{\overrightarrow{\text{#1}}} \)

\( \newcommand{\vectE}[1]{\overset{-\!-\!\rightharpoonup}{\vphantom{a}\smash{\mathbf {#1}}}} \)

\( \newcommand{\vecs}[1]{\overset { \scriptstyle \rightharpoonup} {\mathbf{#1}} } \)

\(\newcommand{\longvect}{\overrightarrow}\)

\( \newcommand{\vecd}[1]{\overset{-\!-\!\rightharpoonup}{\vphantom{a}\smash {#1}}} \)

\(\newcommand{\avec}{\mathbf a}\) \(\newcommand{\bvec}{\mathbf b}\) \(\newcommand{\cvec}{\mathbf c}\) \(\newcommand{\dvec}{\mathbf d}\) \(\newcommand{\dtil}{\widetilde{\mathbf d}}\) \(\newcommand{\evec}{\mathbf e}\) \(\newcommand{\fvec}{\mathbf f}\) \(\newcommand{\nvec}{\mathbf n}\) \(\newcommand{\pvec}{\mathbf p}\) \(\newcommand{\qvec}{\mathbf q}\) \(\newcommand{\svec}{\mathbf s}\) \(\newcommand{\tvec}{\mathbf t}\) \(\newcommand{\uvec}{\mathbf u}\) \(\newcommand{\vvec}{\mathbf v}\) \(\newcommand{\wvec}{\mathbf w}\) \(\newcommand{\xvec}{\mathbf x}\) \(\newcommand{\yvec}{\mathbf y}\) \(\newcommand{\zvec}{\mathbf z}\) \(\newcommand{\rvec}{\mathbf r}\) \(\newcommand{\mvec}{\mathbf m}\) \(\newcommand{\zerovec}{\mathbf 0}\) \(\newcommand{\onevec}{\mathbf 1}\) \(\newcommand{\real}{\mathbb R}\) \(\newcommand{\twovec}[2]{\left[\begin{array}{r}#1 \\ #2 \end{array}\right]}\) \(\newcommand{\ctwovec}[2]{\left[\begin{array}{c}#1 \\ #2 \end{array}\right]}\) \(\newcommand{\threevec}[3]{\left[\begin{array}{r}#1 \\ #2 \\ #3 \end{array}\right]}\) \(\newcommand{\cthreevec}[3]{\left[\begin{array}{c}#1 \\ #2 \\ #3 \end{array}\right]}\) \(\newcommand{\fourvec}[4]{\left[\begin{array}{r}#1 \\ #2 \\ #3 \\ #4 \end{array}\right]}\) \(\newcommand{\cfourvec}[4]{\left[\begin{array}{c}#1 \\ #2 \\ #3 \\ #4 \end{array}\right]}\) \(\newcommand{\fivevec}[5]{\left[\begin{array}{r}#1 \\ #2 \\ #3 \\ #4 \\ #5 \\ \end{array}\right]}\) \(\newcommand{\cfivevec}[5]{\left[\begin{array}{c}#1 \\ #2 \\ #3 \\ #4 \\ #5 \\ \end{array}\right]}\) \(\newcommand{\mattwo}[4]{\left[\begin{array}{rr}#1 \amp #2 \\ #3 \amp #4 \\ \end{array}\right]}\) \(\newcommand{\laspan}[1]{\text{Span}\{#1\}}\) \(\newcommand{\bcal}{\cal B}\) \(\newcommand{\ccal}{\cal C}\) \(\newcommand{\scal}{\cal S}\) \(\newcommand{\wcal}{\cal W}\) \(\newcommand{\ecal}{\cal E}\) \(\newcommand{\coords}[2]{\left\{#1\right\}_{#2}}\) \(\newcommand{\gray}[1]{\color{gray}{#1}}\) \(\newcommand{\lgray}[1]{\color{lightgray}{#1}}\) 
\(\newcommand{\rank}{\operatorname{rank}}\) \(\newcommand{\row}{\text{Row}}\) \(\newcommand{\col}{\text{Col}}\) \(\renewcommand{\row}{\text{Row}}\) \(\newcommand{\nul}{\text{Nul}}\) \(\newcommand{\var}{\text{Var}}\) \(\newcommand{\corr}{\text{corr}}\) \(\newcommand{\len}[1]{\left|#1\right|}\) \(\newcommand{\bbar}{\overline{\bvec}}\) \(\newcommand{\bhat}{\widehat{\bvec}}\) \(\newcommand{\bperp}{\bvec^\perp}\) \(\newcommand{\xhat}{\widehat{\xvec}}\) \(\newcommand{\vhat}{\widehat{\vvec}}\) \(\newcommand{\uhat}{\widehat{\uvec}}\) \(\newcommand{\what}{\widehat{\wvec}}\) \(\newcommand{\Sighat}{\widehat{\Sigma}}\) \(\newcommand{\lt}{<}\) \(\newcommand{\gt}{>}\) \(\newcommand{\amp}{&}\) \(\definecolor{fillinmathshade}{gray}{0.9}\)

# K-means (k=3) clustering on Z_Scores, silhouette score, 2D PCA plot, and export of original data with cluster labels to Excel
# Requirements: pandas, numpy, scikit-learn, matplotlib

# Step 1: Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

# Step 2: Configuration
# Workbook location and the two worksheets the script reads from it.
EXCEL_PATH = "C:/1_A_OER/Chapter 13 - Cluster Analysis/Consumer Dataset.xlsx"  # edit to match your machine
SHEET_RAW = "Household_Data"  # original records; cluster labels are appended to these
SHEET_Z = "Z_Scores"  # standardized features used for modeling

# Clustering settings.
RANDOM_STATE = 42  # seed handed to KMeans and PCA
K = 3  # Specify the number of clusters

##################### NO USER INPUT BEYOND THIS POINT ##################################

# Step 3: Load data
# Read the standardized sheet first, then the raw sheet, from the same workbook.
z_df, raw_df = (
    pd.read_excel(EXCEL_PATH, sheet_name=sheet) for sheet in (SHEET_Z, SHEET_RAW)
)

# Modeling matrix: an independent copy of the numeric columns of Z_Scores.
# Non-numeric columns are dropped here.
# NOTE(review): a numeric identifier column in Z_Scores would also be kept and
# clustered on -- confirm the sheet contains features only.
X = z_df.select_dtypes(include="number").copy()

# Step 4: Fit K-means with K clusters and get labels
kmeans = KMeans(n_clusters=K, random_state=RANDOM_STATE, n_init=10)
labels_0 = kmeans.fit_predict(X)  # 0-based labels, one per row of X
labels = labels_0 + 1  # 1-based for presentation (1..K)

# Step 5: Compute silhouette score on standardized feature space
# The score depends only on which rows share a label, so the 0-based labels
# give the same value as the 1-based ones.
sil = silhouette_score(X, labels_0)
# Report the configured K rather than a literal 3 so the message stays correct
# when K is changed in the configuration section.
print(f"Silhouette score (k={K}): {sil:.3f}")

# Step 6: 2D PCA projection for visualization (does not affect clustering)
# PCA is fitted on the same matrix that was clustered; the K-means centroids are
# then mapped into that 2D space so points and centroids share one coordinate system.
pca = PCA(n_components=2, random_state=RANDOM_STATE)
X_2d = pca.fit_transform(X)
# Wrap the centroids with X's column names: PCA was fitted on a DataFrame, and
# passing a bare array would trigger scikit-learn's feature-name warning.
centers_2d = pca.transform(pd.DataFrame(kmeans.cluster_centers_, columns=X.columns))

# Step 7: Plot clusters and centroids
plt.figure(figsize=(8, 6))

# The first three colors mirror the earlier graphic (red, blue, green). The other
# entries are used only when K > 3; the palette repeats if K exceeds its length.
palette = ['red', 'blue', 'green', 'orange', 'purple', 'brown', 'pink', 'gray', 'olive', 'cyan']
cluster_ids = range(1, K + 1)  # 1-based ids, matching `labels`
cluster_colors = {c: palette[(c - 1) % len(palette)] for c in cluster_ids}

for c in cluster_ids:
    idx = (labels == c)  # boolean mask selecting the rows assigned to cluster c
    plt.scatter(X_2d[idx, 0], X_2d[idx, 1], s=30, c=cluster_colors[c], alpha=0.6, label=f"Cluster {c}")

# centroids in bold black with white edge
plt.scatter(centers_2d[:, 0], centers_2d[:, 1], s=220, c='black', marker='X', edgecolors='white', linewidths=2)

# draw simple rings around each centroid
# Row i of centers_2d belongs to 0-based label i, i.e. presentation id i + 1.
# The ring radius is a fixed 1.5 units in PCA space; it is decorative and is not
# derived from the spread of the cluster.
for i, c in enumerate(cluster_ids):
    circ = plt.Circle((centers_2d[i, 0], centers_2d[i, 1]), radius=1.5, color=cluster_colors[c], fill=False, linewidth=2, linestyle='--')
    plt.gca().add_patch(circ)
    # label near centroid
    plt.text(centers_2d[i, 0], centers_2d[i, 1] + 0.25, str(c), fontsize=12, ha='center', color='black')

plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.title(f"K-means (k={K}) clustering: PCA projection")
plt.legend()
plt.tight_layout()
# plt.show() # uncomment to display
# plt.savefig("kmeans_k3_pca.png", dpi=300) # optional

# Step 8: Export original data with cluster labels to Excel
# The path is relative, so the workbook is written to the current working directory.
out_path = "households_with_clusters.xlsx"
# `assign` returns a new frame carrying the extra "Cluster" column; raw_df itself
# is left unchanged.
# NOTE(review): labels are attached by row position, which presumes the
# Household_Data and Z_Scores sheets list households in the same order -- verify.
out = raw_df.assign(Cluster=labels)
out.to_excel(out_path, index=False)
print(f"Wrote clusters to: {out_path}")

Search

Text Color

Text Size

Margin Size

Font Type