10.8.3: Python Code used for SVM Classification Case Study
- Page ID
- 142000
\( \newcommand{\vecs}[1]{\overset { \scriptstyle \rightharpoonup} {\mathbf{#1}} } \)
\( \newcommand{\vecd}[1]{\overset{-\!-\!\rightharpoonup}{\vphantom{a}\smash {#1}}} \)
\( \newcommand{\dsum}{\displaystyle\sum\limits} \)
\( \newcommand{\dint}{\displaystyle\int\limits} \)
\( \newcommand{\dlim}{\displaystyle\lim\limits} \)
\( \newcommand{\id}{\mathrm{id}}\) \( \newcommand{\Span}{\mathrm{span}}\)
\( \newcommand{\kernel}{\mathrm{null}\,}\) \( \newcommand{\range}{\mathrm{range}\,}\)
\( \newcommand{\RealPart}{\mathrm{Re}}\) \( \newcommand{\ImaginaryPart}{\mathrm{Im}}\)
\( \newcommand{\Argument}{\mathrm{Arg}}\) \( \newcommand{\norm}[1]{\| #1 \|}\)
\( \newcommand{\inner}[2]{\langle #1, #2 \rangle}\)
\( \newcommand{\Span}{\mathrm{span}}\)
\( \newcommand{\id}{\mathrm{id}}\)
\( \newcommand{\Span}{\mathrm{span}}\)
\( \newcommand{\kernel}{\mathrm{null}\,}\)
\( \newcommand{\range}{\mathrm{range}\,}\)
\( \newcommand{\RealPart}{\mathrm{Re}}\)
\( \newcommand{\ImaginaryPart}{\mathrm{Im}}\)
\( \newcommand{\Argument}{\mathrm{Arg}}\)
\( \newcommand{\norm}[1]{\| #1 \|}\)
\( \newcommand{\inner}[2]{\langle #1, #2 \rangle}\)
\( \newcommand{\Span}{\mathrm{span}}\) \( \newcommand{\AA}{\unicode[.8,0]{x212B}}\)
\( \newcommand{\vectorA}[1]{\vec{#1}} % arrow\)
\( \newcommand{\vectorAt}[1]{\vec{\text{#1}}} % arrow\)
\( \newcommand{\vectorB}[1]{\overset { \scriptstyle \rightharpoonup} {\mathbf{#1}} } \)
\( \newcommand{\vectorC}[1]{\textbf{#1}} \)
\( \newcommand{\vectorD}[1]{\overrightarrow{#1}} \)
\( \newcommand{\vectorDt}[1]{\overrightarrow{\text{#1}}} \)
\( \newcommand{\vectE}[1]{\overset{-\!-\!\rightharpoonup}{\vphantom{a}\smash{\mathbf {#1}}}} \)
\( \newcommand{\vecs}[1]{\overset { \scriptstyle \rightharpoonup} {\mathbf{#1}} } \)
\(\newcommand{\longvect}{\overrightarrow}\)
\( \newcommand{\vecd}[1]{\overset{-\!-\!\rightharpoonup}{\vphantom{a}\smash {#1}}} \)
\(\newcommand{\avec}{\mathbf a}\) \(\newcommand{\bvec}{\mathbf b}\) \(\newcommand{\cvec}{\mathbf c}\) \(\newcommand{\dvec}{\mathbf d}\) \(\newcommand{\dtil}{\widetilde{\mathbf d}}\) \(\newcommand{\evec}{\mathbf e}\) \(\newcommand{\fvec}{\mathbf f}\) \(\newcommand{\nvec}{\mathbf n}\) \(\newcommand{\pvec}{\mathbf p}\) \(\newcommand{\qvec}{\mathbf q}\) \(\newcommand{\svec}{\mathbf s}\) \(\newcommand{\tvec}{\mathbf t}\) \(\newcommand{\uvec}{\mathbf u}\) \(\newcommand{\vvec}{\mathbf v}\) \(\newcommand{\wvec}{\mathbf w}\) \(\newcommand{\xvec}{\mathbf x}\) \(\newcommand{\yvec}{\mathbf y}\) \(\newcommand{\zvec}{\mathbf z}\) \(\newcommand{\rvec}{\mathbf r}\) \(\newcommand{\mvec}{\mathbf m}\) \(\newcommand{\zerovec}{\mathbf 0}\) \(\newcommand{\onevec}{\mathbf 1}\) \(\newcommand{\real}{\mathbb R}\) \(\newcommand{\twovec}[2]{\left[\begin{array}{r}#1 \\ #2 \end{array}\right]}\) \(\newcommand{\ctwovec}[2]{\left[\begin{array}{c}#1 \\ #2 \end{array}\right]}\) \(\newcommand{\threevec}[3]{\left[\begin{array}{r}#1 \\ #2 \\ #3 \end{array}\right]}\) \(\newcommand{\cthreevec}[3]{\left[\begin{array}{c}#1 \\ #2 \\ #3 \end{array}\right]}\) \(\newcommand{\fourvec}[4]{\left[\begin{array}{r}#1 \\ #2 \\ #3 \\ #4 \end{array}\right]}\) \(\newcommand{\cfourvec}[4]{\left[\begin{array}{c}#1 \\ #2 \\ #3 \\ #4 \end{array}\right]}\) \(\newcommand{\fivevec}[5]{\left[\begin{array}{r}#1 \\ #2 \\ #3 \\ #4 \\ #5 \\ \end{array}\right]}\) \(\newcommand{\cfivevec}[5]{\left[\begin{array}{c}#1 \\ #2 \\ #3 \\ #4 \\ #5 \\ \end{array}\right]}\) \(\newcommand{\mattwo}[4]{\left[\begin{array}{rr}#1 \amp #2 \\ #3 \amp #4 \\ \end{array}\right]}\) \(\newcommand{\laspan}[1]{\text{Span}\{#1\}}\) \(\newcommand{\bcal}{\cal B}\) \(\newcommand{\ccal}{\cal C}\) \(\newcommand{\scal}{\cal S}\) \(\newcommand{\wcal}{\cal W}\) \(\newcommand{\ecal}{\cal E}\) \(\newcommand{\coords}[2]{\left\{#1\right\}_{#2}}\) \(\newcommand{\gray}[1]{\color{gray}{#1}}\) \(\newcommand{\lgray}[1]{\color{lightgray}{#1}}\) 
\(\newcommand{\rank}{\operatorname{rank}}\) \(\newcommand{\row}{\text{Row}}\) \(\newcommand{\col}{\text{Col}}\) \(\renewcommand{\row}{\text{Row}}\) \(\newcommand{\nul}{\text{Nul}}\) \(\newcommand{\var}{\text{Var}}\) \(\newcommand{\corr}{\text{corr}}\) \(\newcommand{\len}[1]{\left|#1\right|}\) \(\newcommand{\bbar}{\overline{\bvec}}\) \(\newcommand{\bhat}{\widehat{\bvec}}\) \(\newcommand{\bperp}{\bvec^\perp}\) \(\newcommand{\xhat}{\widehat{\xvec}}\) \(\newcommand{\vhat}{\widehat{\vvec}}\) \(\newcommand{\uhat}{\widehat{\uvec}}\) \(\newcommand{\what}{\widehat{\wvec}}\) \(\newcommand{\Sighat}{\widehat{\Sigma}}\) \(\newcommand{\lt}{<}\) \(\newcommand{\gt}{>}\) \(\newcommand{\amp}{&}\) \(\definecolor{fillinmathshade}{gray}{0.9}\)# =========================================
# Support Vector Machine (SVM) for Churn
# =========================================
# This script trains and evaluates an SVM classifier to predict "Churn" using the
# remaining variables as predictors. It searches over multiple kernels and selects
# the best model by cross-validated ROC-AUC. Final outputs include:
# - Selected Kernel and key hyperparameters
# - Number of support vectors (total and by class)
# - Confusion Matrix in (1,0) order
# - Accuracy, Sensitivity (Recall for 1), Specificity (for 0)
# - ROC curve displayed and AUC on test set
#
# =========================================
# ----------------------------
# Step 0: Imports & Settings
# Purpose: Load required libraries and set global settings
# ----------------------------
import warnings # suppress optional warnings
warnings.filterwarnings("ignore")  # silence sklearn/pandas warnings for clean console output
import numpy as np # numerical operations
import pandas as pd # data loading and wrangling
from pathlib import Path # file path handling
from typing import Tuple # type hints for clarity (currently unused; kept for compatibility)
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    roc_curve,
    roc_auc_score
)
from sklearn.svm import SVC
import matplotlib.pyplot as plt
RANDOM_STATE = 42 # for reproducibility (train/test split, CV folds)
# ----------------------------
# Step 1: User Inputs
# Purpose: Specify dataset location, target column, and (optionally) sheet for Excel
# NOTE: The path and sheet have been pre-filled per your request.
# ----------------------------
# >>>>> USER INPUT (pre-filled for you) <<<<<
DATA_PATH = Path(r"C:/1_A_OER/Chapter 10 - Support Vector Machines/Classification_Business_Dataset.xlsx")
SHEET_NAME = "With_Dummies" # Excel sheet to use
TARGET = "Churn" # target column name
TEST_SIZE = 0.20 # test set proportion (e.g., 0.30 = 30% test)
# >>>>> No User Input is Required Beyond this Point <<<<<
# ----------------------------
# Step 2: Load Data
# Purpose: Read the dataset into a DataFrame. Supports Excel or CSV based on file extension.
# ----------------------------
def load_data(path: Path, sheet_name=None) -> pd.DataFrame:
    """Read the dataset into a DataFrame from Excel (.xls/.xlsx) or CSV.

    Parameters
    ----------
    path : Path
        Location of the data file; the suffix selects the reader.
    sheet_name : str, optional
        Excel sheet to read. Defaults to the first sheet. (Forwarding
        None straight to pandas would return a dict of *all* sheets,
        not a DataFrame, so None is mapped to sheet index 0 here.)

    Returns
    -------
    pd.DataFrame
        The loaded data.

    Raises
    ------
    ValueError
        If the extension is not .csv, .xls, or .xlsx.
    """
    ext = path.suffix.lower()
    if ext in [".xls", ".xlsx"]:
        # sheet_name=None would make read_excel return {sheet: DataFrame};
        # force the first sheet so the return type stays a single DataFrame.
        return pd.read_excel(path, sheet_name=0 if sheet_name is None else sheet_name)
    elif ext == ".csv":
        return pd.read_csv(path)
    else:
        raise ValueError(f"Unsupported file extension: {ext}. Use .csv, .xls, or .xlsx")
# Load the dataset once at module level (Excel sheet chosen in Step 1)
df = load_data(DATA_PATH, SHEET_NAME)
# ----------------------------
# Step 3: Clean Target & Basic Preprocessing
# Purpose: Ensure the TARGET is binary (0/1). If target is text (e.g., 'Yes'/'No'),
# map to 1/0. Also, optionally one-hot encode any remaining object columns.
# ----------------------------
if TARGET not in df.columns:
    raise KeyError(f"TARGET column '{TARGET}' not found in data. Please check the name.")
# Make a copy to avoid modifying original
data = df.copy()
# Coerce target to 0/1 if it's not numeric
if not np.issubdtype(data[TARGET].dtype, np.number):
    # normalize text then map common positive/negative encodings;
    # anything not in the map becomes NaN and is handled below
    data[TARGET] = (
        data[TARGET]
        .astype(str)
        .str.strip()
        .str.lower()
        .map({"1":1,"yes":1,"y":1,"true":1,"t":1,
              "0":0,"no":0,"n":0,"false":0,"f":0})
    )
# If any unmapped values remain, try converting directly to numeric
# (errors="coerce" turns anything still unparseable into NaN)
data[TARGET] = pd.to_numeric(data[TARGET], errors="coerce")
# Drop rows with missing target (NaNs from the mapping/coercion above included)
data = data.dropna(subset=[TARGET])
# Optionally one-hot encode any remaining object columns (if your sheet is already "With_Dummies",
# this will generally be a no-op)
obj_cols = data.select_dtypes(include=["object", "category"]).columns.tolist()
obj_cols = [c for c in obj_cols if c != TARGET]
if len(obj_cols) > 0:
    # drop_first=True avoids the dummy-variable trap (perfect collinearity)
    data = pd.get_dummies(data, columns=obj_cols, drop_first=True)
# ----------------------------
# Step 4: Define X, y and Train/Test Split
# Purpose: Separate predictors (X) and target (y), then split with stratification.
# ----------------------------
X = data.drop(columns=[TARGET])
y = data[TARGET].astype(int)  # guaranteed numeric 0/1 by Step 3
# stratify=y keeps the churn rate equal in train and test; fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, stratify=y, random_state=RANDOM_STATE
)
# ----------------------------
# Step 5: Build Pipeline & Hyperparameter Grid
# Purpose: Create a preprocessing + SVM pipeline and a parameter grid checking multiple kernels.
# ----------------------------
# StandardScaler cannot center sparse data (with_mean must be False).
# Check the dtypes explicitly instead of `hasattr(X, "sparse") and X.sparse`,
# which relied on hasattr() swallowing the .sparse accessor's exception and on
# the truthiness of an accessor object. (DataFrame.sparse only works when all
# columns are sparse, which is what this check reproduces.)
_all_sparse = len(X.dtypes) > 0 and all(isinstance(dt, pd.SparseDtype) for dt in X.dtypes)
pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),  # robust to missing values
    ("scaler", StandardScaler(with_mean=False) if _all_sparse else StandardScaler()),
    ("svc", SVC())  # kernel and other hyperparameters tuned via GridSearchCV
])
# Candidate regularization strengths, shared across every kernel family.
C_GRID = [0.1, 1, 10, 100]
# Parameter grids for different kernels; GridSearchCV tries each dict in turn.
param_grid = [
    # Linear kernel: only C matters
    {"svc__kernel": ["linear"], "svc__C": C_GRID},
    # RBF (Gaussian): tune C and the kernel width gamma
    {
        "svc__kernel": ["rbf"],
        "svc__C": C_GRID,
        "svc__gamma": ["scale", "auto", 0.01, 0.1, 1.0],
    },
    # Polynomial: tune C, polynomial degree, and gamma
    {
        "svc__kernel": ["poly"],
        "svc__C": C_GRID,
        "svc__degree": [2, 3, 4],
        "svc__gamma": ["scale", "auto"],
    },
    # Sigmoid: tune C, gamma, and the independent term coef0
    {
        "svc__kernel": ["sigmoid"],
        "svc__C": C_GRID,
        "svc__gamma": ["scale", "auto"],
        "svc__coef0": [0.0, 0.5, 1.0],
    },
]
# Stratified folds preserve the churn rate in every CV split.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
# ----------------------------
# Step 6: Grid Search for Best Kernel/Hyperparameters
# Purpose: Use ROC-AUC as selection metric to find best SVM configuration.
# ----------------------------
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring="roc_auc",  # select by cross-validated AUC rather than accuracy
    cv=cv,
    n_jobs=-1,  # use all available CPU cores
    refit=True, # fit final model on full training set with the best params
    verbose=0
)
grid.fit(X_train, y_train)
best_model = grid.best_estimator_  # the refit pipeline (imputer + scaler + SVC)
best_params = grid.best_params_    # winning kernel/hyperparameter combination
best_cv_auc = grid.best_score_     # mean CV AUC of that combination
# Helper to describe kernel succinctly
def kernel_description(params: dict) -> str:
    """Return a human-readable label for the kernel chosen in *params*.

    Expects GridSearchCV-style keys ("svc__kernel", "svc__degree"); any
    unrecognized kernel name is returned unchanged as a string.
    """
    kernel = params.get("svc__kernel", "unknown")
    # Polynomial is the only kernel whose label embeds another hyperparameter.
    if kernel == "poly":
        return f"Polynomial (degree={params.get('svc__degree', '?')})"
    labels = {"linear": "Linear", "rbf": "RBF (Gaussian)", "sigmoid": "Sigmoid"}
    return labels.get(kernel, str(kernel))
# ----------------------------
# Step 7: Evaluation on Test Set
# Purpose: Compute predictions, confusion matrix (1,0 order), accuracy, sensitivity, specificity,
# and ROC/AUC. Plot ROC curve.
# ----------------------------
# Predict class labels
y_pred = best_model.predict(X_test)
# Confusion matrix with (1,0) for rows and columns, so the positive class
# (churn = 1) occupies the first row/column.
cm = confusion_matrix(y_test, y_pred, labels=[1, 0])
# With labels=[1, 0]: row 0 = actual 1, row 1 = actual 0; col 0 = predicted 1, col 1 = predicted 0
TP = cm[0, 0]  # actual 1, predicted 1
FN = cm[0, 1]  # actual 1, predicted 0
FP = cm[1, 0]  # actual 0, predicted 1
TN = cm[1, 1]  # actual 0, predicted 0
accuracy = accuracy_score(y_test, y_pred)
# Guard against zero denominators (e.g., a test split containing no positives)
sensitivity = TP / (TP + FN) if (TP + FN) > 0 else np.nan # recall for class 1
specificity = TN / (TN + FP) if (TN + FP) > 0 else np.nan # recall for class 0
# Scores for ROC/AUC (decision_function preferred; fallback to predict_proba if available)
def get_scores(model, X) -> np.ndarray:
    """Return continuous scores for ROC/AUC from a fitted pipeline.

    Preference order: decision_function (standard for SVC), then
    predict_proba's positive-class column, then raw predicted labels.
    """
    final_svc = model.named_steps["svc"]
    if hasattr(final_svc, "decision_function"):
        return model.decision_function(X)
    if hasattr(final_svc, "predict_proba"):
        # probability=True is not set by default for SVC; using decision_function is standard for ROC
        return model.predict_proba(X)[:, 1]
    # As a last resort, use predicted labels (not ideal for ROC, but prevents errors)
    return model.predict(X)
# Continuous scores on the held-out test set, then AUC and the ROC curve points
y_scores = get_scores(best_model, X_test)
test_auc = roc_auc_score(y_test, y_scores)
# ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_scores)
# ----------------------------
# Step 8: Support Vector Info & Human-Readable Summary
# Purpose: Print final model choice, support vectors count, and all metrics in logical order.
# ----------------------------
svc_final = best_model.named_steps["svc"]
n_support_by_class = svc_final.n_support_  # array of support-vector counts, one entry per class
total_support = int(np.sum(n_support_by_class))
print("========== Final Model Summary ==========")
print(f"Selected Kernel: {kernel_description(best_params)}")
# Also show key hyperparameters for reproducibility
print("Best Hyperparameters:")
for k, v in best_params.items():
    print(f" - {k}: {v}")
print(f"Cross-validated AUC (train CV): {best_cv_auc:.4f}")
print(f"Number of Support Vectors (by class): {n_support_by_class}")
print(f"Total Number of Support Vectors: {total_support}")
print("\n========== Test Set Performance ==========")
print("Confusion Matrix (rows/cols in order [1, 0]):")
print(cm)
print(f"\nAccuracy: {accuracy:.4f}")
print(f"Sensitivity: {sensitivity:.4f} # Recall for class 1")
print(f"Specificity: {specificity:.4f} # Recall for class 0")
print(f"AUC: {test_auc:.4f}")
# ----------------------------
# Step 9: Plot ROC Curve
# Purpose: Visualize trade-off between TPR (sensitivity) and FPR (1 - specificity).
# ----------------------------
plt.figure()
plt.plot(fpr, tpr, linewidth=2, label=f"ROC curve (AUC = {test_auc:.4f})")
# Diagonal reference line = performance of a random classifier
plt.plot([0, 1], [0, 1], linestyle="--", linewidth=1, label="Chance")
plt.xlabel("False Positive Rate (1 - Specificity)")
plt.ylabel("True Positive Rate (Sensitivity)")
plt.title("SVM ROC Curve - Test Set")
plt.legend(loc="lower right")
plt.grid(True, linestyle=":", linewidth=0.5)
plt.tight_layout()
plt.show()
# ----------------------------
# No User Input is Required Beyond this Point
# ----------------------------


