Skip to main content
Business LibreTexts

10.8.3: Python Code used for SVM Classification Case Study

  • Page ID
    142000
  • \( \newcommand{\vecs}[1]{\overset { \scriptstyle \rightharpoonup} {\mathbf{#1}} } \)

    \( \newcommand{\vecd}[1]{\overset{-\!-\!\rightharpoonup}{\vphantom{a}\smash {#1}}} \)

    \( \newcommand{\dsum}{\displaystyle\sum\limits} \)

    \( \newcommand{\dint}{\displaystyle\int\limits} \)

    \( \newcommand{\dlim}{\displaystyle\lim\limits} \)

    \( \newcommand{\id}{\mathrm{id}}\) \( \newcommand{\Span}{\mathrm{span}}\)

    \( \newcommand{\kernel}{\mathrm{null}\,}\) \( \newcommand{\range}{\mathrm{range}\,}\)

    \( \newcommand{\RealPart}{\mathrm{Re}}\) \( \newcommand{\ImaginaryPart}{\mathrm{Im}}\)

    \( \newcommand{\Argument}{\mathrm{Arg}}\) \( \newcommand{\norm}[1]{\| #1 \|}\)

    \( \newcommand{\inner}[2]{\langle #1, #2 \rangle}\)

    \( \newcommand{\Span}{\mathrm{span}}\)

    \( \newcommand{\id}{\mathrm{id}}\)

    \( \newcommand{\Span}{\mathrm{span}}\)

    \( \newcommand{\kernel}{\mathrm{null}\,}\)

    \( \newcommand{\range}{\mathrm{range}\,}\)

    \( \newcommand{\RealPart}{\mathrm{Re}}\)

    \( \newcommand{\ImaginaryPart}{\mathrm{Im}}\)

    \( \newcommand{\Argument}{\mathrm{Arg}}\)

    \( \newcommand{\norm}[1]{\| #1 \|}\)

    \( \newcommand{\inner}[2]{\langle #1, #2 \rangle}\)

    \( \newcommand{\Span}{\mathrm{span}}\) \( \newcommand{\AA}{\unicode[.8,0]{x212B}}\)

    \( \newcommand{\vectorA}[1]{\vec{#1}}      % arrow\)

    \( \newcommand{\vectorAt}[1]{\vec{\text{#1}}}      % arrow\)

    \( \newcommand{\vectorB}[1]{\overset { \scriptstyle \rightharpoonup} {\mathbf{#1}} } \)

    \( \newcommand{\vectorC}[1]{\textbf{#1}} \)

    \( \newcommand{\vectorD}[1]{\overrightarrow{#1}} \)

    \( \newcommand{\vectorDt}[1]{\overrightarrow{\text{#1}}} \)

    \( \newcommand{\vectE}[1]{\overset{-\!-\!\rightharpoonup}{\vphantom{a}\smash{\mathbf {#1}}}} \)

    \( \newcommand{\vecs}[1]{\overset { \scriptstyle \rightharpoonup} {\mathbf{#1}} } \)

    \(\newcommand{\longvect}{\overrightarrow}\)

    \( \newcommand{\vecd}[1]{\overset{-\!-\!\rightharpoonup}{\vphantom{a}\smash {#1}}} \)

    \(\newcommand{\avec}{\mathbf a}\) \(\newcommand{\bvec}{\mathbf b}\) \(\newcommand{\cvec}{\mathbf c}\) \(\newcommand{\dvec}{\mathbf d}\) \(\newcommand{\dtil}{\widetilde{\mathbf d}}\) \(\newcommand{\evec}{\mathbf e}\) \(\newcommand{\fvec}{\mathbf f}\) \(\newcommand{\nvec}{\mathbf n}\) \(\newcommand{\pvec}{\mathbf p}\) \(\newcommand{\qvec}{\mathbf q}\) \(\newcommand{\svec}{\mathbf s}\) \(\newcommand{\tvec}{\mathbf t}\) \(\newcommand{\uvec}{\mathbf u}\) \(\newcommand{\vvec}{\mathbf v}\) \(\newcommand{\wvec}{\mathbf w}\) \(\newcommand{\xvec}{\mathbf x}\) \(\newcommand{\yvec}{\mathbf y}\) \(\newcommand{\zvec}{\mathbf z}\) \(\newcommand{\rvec}{\mathbf r}\) \(\newcommand{\mvec}{\mathbf m}\) \(\newcommand{\zerovec}{\mathbf 0}\) \(\newcommand{\onevec}{\mathbf 1}\) \(\newcommand{\real}{\mathbb R}\) \(\newcommand{\twovec}[2]{\left[\begin{array}{r}#1 \\ #2 \end{array}\right]}\) \(\newcommand{\ctwovec}[2]{\left[\begin{array}{c}#1 \\ #2 \end{array}\right]}\) \(\newcommand{\threevec}[3]{\left[\begin{array}{r}#1 \\ #2 \\ #3 \end{array}\right]}\) \(\newcommand{\cthreevec}[3]{\left[\begin{array}{c}#1 \\ #2 \\ #3 \end{array}\right]}\) \(\newcommand{\fourvec}[4]{\left[\begin{array}{r}#1 \\ #2 \\ #3 \\ #4 \end{array}\right]}\) \(\newcommand{\cfourvec}[4]{\left[\begin{array}{c}#1 \\ #2 \\ #3 \\ #4 \end{array}\right]}\) \(\newcommand{\fivevec}[5]{\left[\begin{array}{r}#1 \\ #2 \\ #3 \\ #4 \\ #5 \\ \end{array}\right]}\) \(\newcommand{\cfivevec}[5]{\left[\begin{array}{c}#1 \\ #2 \\ #3 \\ #4 \\ #5 \\ \end{array}\right]}\) \(\newcommand{\mattwo}[4]{\left[\begin{array}{rr}#1 \amp #2 \\ #3 \amp #4 \\ \end{array}\right]}\) \(\newcommand{\laspan}[1]{\text{Span}\{#1\}}\) \(\newcommand{\bcal}{\cal B}\) \(\newcommand{\ccal}{\cal C}\) \(\newcommand{\scal}{\cal S}\) \(\newcommand{\wcal}{\cal W}\) \(\newcommand{\ecal}{\cal E}\) \(\newcommand{\coords}[2]{\left\{#1\right\}_{#2}}\) \(\newcommand{\gray}[1]{\color{gray}{#1}}\) \(\newcommand{\lgray}[1]{\color{lightgray}{#1}}\) 
\(\newcommand{\rank}{\operatorname{rank}}\) \(\newcommand{\row}{\text{Row}}\) \(\newcommand{\col}{\text{Col}}\) \(\renewcommand{\row}{\text{Row}}\) \(\newcommand{\nul}{\text{Nul}}\) \(\newcommand{\var}{\text{Var}}\) \(\newcommand{\corr}{\text{corr}}\) \(\newcommand{\len}[1]{\left|#1\right|}\) \(\newcommand{\bbar}{\overline{\bvec}}\) \(\newcommand{\bhat}{\widehat{\bvec}}\) \(\newcommand{\bperp}{\bvec^\perp}\) \(\newcommand{\xhat}{\widehat{\xvec}}\) \(\newcommand{\vhat}{\widehat{\vvec}}\) \(\newcommand{\uhat}{\widehat{\uvec}}\) \(\newcommand{\what}{\widehat{\wvec}}\) \(\newcommand{\Sighat}{\widehat{\Sigma}}\) \(\newcommand{\lt}{<}\) \(\newcommand{\gt}{>}\) \(\newcommand{\amp}{&}\) \(\definecolor{fillinmathshade}{gray}{0.9}\)

    # =========================================
    # Support Vector Machine (SVM) for Churn
    # =========================================
    # This script trains and evaluates an SVM classifier to predict "Churn" using the
    # remaining variables as predictors. It searches over multiple kernels and selects
    # the best model by cross-validated ROC-AUC. Final outputs include:
    # - Selected Kernel and key hyperparameters
    # - Number of support vectors (total and by class)
    # - Confusion Matrix in (1,0) order
    # - Accuracy, Sensitivity (Recall for 1), Specificity (for 0)
    # - ROC curve displayed and AUC on test set
    #
    # =========================================

    # ----------------------------
    # Step 0: Imports & Settings
    # Purpose: Load required libraries and set global settings
    # ----------------------------
    import warnings # suppress optional warnings
    warnings.filterwarnings("ignore")

    import numpy as np # numerical operations
    import pandas as pd # data loading and wrangling
    from pathlib import Path # file path handling
    from typing import Tuple # type hints for clarity

    from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline
    from sklearn.impute import SimpleImputer
    from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    roc_curve,
    roc_auc_score
    )
    from sklearn.svm import SVC
    import matplotlib.pyplot as plt

    RANDOM_STATE = 42 # for reproducibility

    # ----------------------------
    # Step 1: User Inputs
    # Purpose: Specify dataset location, target column, and (optionally) sheet for Excel.
    # These four constants are the only values a user is expected to edit.
    # NOTE: The path and sheet have been pre-filled per your request.
    # ----------------------------

    # >>>>> USER INPUT (pre-filled for you) <<<<<
    DATA_PATH = Path(r"C:/1_A_OER/Chapter 10 - Support Vector Machines/Classification_Business_Dataset.xlsx")
    SHEET_NAME = "With_Dummies" # Excel sheet to use (presumably already dummy-coded; Step 3 handles leftovers)
    TARGET = "Churn" # target column name (must be binary, or text mappable to 0/1 in Step 3)
    TEST_SIZE = 0.20 # test set proportion (e.g., 0.30 = 30% test)

    # >>>>> No User Input is Required Beyond this Point <<<<<

    # ----------------------------
    # Step 2: Load Data
    # Purpose: Read the dataset into a DataFrame. Supports Excel or CSV based on file extension.
    # ----------------------------
    def load_data(path: Path, sheet_name: str = None) -> pd.DataFrame:
        """Read the dataset at ``path`` into a DataFrame.

        path: file ending in .csv, .xls, or .xlsx (the extension selects the reader).
        sheet_name: Excel sheet to read; ignored for CSV. NOTE(review): if left as
            None for an Excel file, pandas returns a dict of *all* sheets rather
            than a DataFrame — the script always passes SHEET_NAME, so this path
            is not exercised here.
        Raises ValueError for any other extension.
        """
        ext = path.suffix.lower()  # normalize so ".XLSX" etc. still match
        if ext in (".xls", ".xlsx"):
            return pd.read_excel(path, sheet_name=sheet_name)
        if ext == ".csv":
            return pd.read_csv(path)
        raise ValueError(f"Unsupported file extension: {ext}. Use .csv, .xls, or .xlsx")

    df = load_data(DATA_PATH, SHEET_NAME)

    # ----------------------------
    # Step 3: Clean Target & Basic Preprocessing
    # Purpose: Ensure the TARGET is binary (0/1). If target is text (e.g., 'Yes'/'No'),
    # map to 1/0. Also, optionally one-hot encode any remaining object columns.
    # ----------------------------
    if TARGET not in df.columns:
        raise KeyError(f"TARGET column '{TARGET}' not found in data. Please check the name.")

    # Make a copy to avoid modifying original
    data = df.copy()

    # Coerce target to 0/1 if it's not numeric
    if not np.issubdtype(data[TARGET].dtype, np.number):
        # normalize text (strip/lowercase) then map common positive/negative encodings;
        # anything unrecognized becomes NaN and is handled below
        data[TARGET] = (
            data[TARGET]
            .astype(str)
            .str.strip()
            .str.lower()
            .map({"1": 1, "yes": 1, "y": 1, "true": 1, "t": 1,
                  "0": 0, "no": 0, "n": 0, "false": 0, "f": 0})
        )

    # If any unmapped values remain, try converting directly to numeric
    data[TARGET] = pd.to_numeric(data[TARGET], errors="coerce")

    # Drop rows whose target could not be resolved to 0/1 (NaN after coercion)
    data = data.dropna(subset=[TARGET])

    # Optionally one-hot encode any remaining object columns (if your sheet is already
    # "With_Dummies", this will generally be a no-op)
    obj_cols = [c for c in data.select_dtypes(include=["object", "category"]).columns
                if c != TARGET]
    if obj_cols:
        data = pd.get_dummies(data, columns=obj_cols, drop_first=True)

    # ----------------------------
    # Step 4: Define X, y and Train/Test Split
    # Purpose: Separate predictors (X) and target (y), then split with stratification
    # so both partitions keep the original churn rate.
    # ----------------------------
    y = data[TARGET].astype(int)
    X = data.drop(columns=[TARGET])

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=TEST_SIZE,
        stratify=y,                  # preserve class balance in both splits
        random_state=RANDOM_STATE,   # reproducible partition
    )

    # ----------------------------
    # Step 5: Build Pipeline & Hyperparameter Grid
    # Purpose: Create a preprocessing + SVM pipeline and a parameter grid checking multiple kernels.
    # ----------------------------
    # Skip mean-centering when X carries sparse columns (centering would densify them);
    # otherwise use a plain z-score scaler.
    _scaler = StandardScaler(with_mean=False) if hasattr(X, "sparse") and X.sparse else StandardScaler()

    pipe = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),  # robust to missing values
        ("scaler", _scaler),
        ("svc", SVC()),  # kernel and other hyperparameters tuned via GridSearchCV
    ])

    # One sub-grid per candidate kernel; GridSearchCV searches their union.
    _C_VALUES = [0.1, 1, 10, 100]
    param_grid = [
        # Linear
        {"svc__kernel": ["linear"],
         "svc__C": _C_VALUES},
        # RBF (Gaussian)
        {"svc__kernel": ["rbf"],
         "svc__C": _C_VALUES,
         "svc__gamma": ["scale", "auto", 0.01, 0.1, 1.0]},
        # Polynomial
        {"svc__kernel": ["poly"],
         "svc__C": _C_VALUES,
         "svc__degree": [2, 3, 4],
         "svc__gamma": ["scale", "auto"]},
        # Sigmoid
        {"svc__kernel": ["sigmoid"],
         "svc__C": _C_VALUES,
         "svc__gamma": ["scale", "auto"],
         "svc__coef0": [0.0, 0.5, 1.0]},
    ]

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

    # ----------------------------
    # Step 6: Grid Search for Best Kernel/Hyperparameters
    # Purpose: Use ROC-AUC as selection metric to find best SVM configuration.
    # ----------------------------
    grid = GridSearchCV(
        pipe,
        param_grid,
        scoring="roc_auc",  # select by cross-validated ROC-AUC
        cv=cv,
        n_jobs=-1,          # parallelize across all cores
        refit=True,         # refit the winning configuration on the full training set
        verbose=0,
    )
    fitted = grid.fit(X_train, y_train)  # fit() returns the search object itself

    best_model, best_params, best_cv_auc = (
        fitted.best_estimator_,
        fitted.best_params_,
        fitted.best_score_,
    )

    # Helper to describe kernel succinctly
    def kernel_description(params: dict) -> str:
        """Return a short human-readable label for the selected SVM kernel.

        params: a GridSearchCV ``best_params_`` mapping (keys like 'svc__kernel',
            'svc__degree'). Unknown kernels fall back to their raw string form.
        """
        k = params.get("svc__kernel", "unknown")
        if k == "linear":
            return "Linear"
        if k == "rbf":
            return "RBF (Gaussian)"
        if k == "poly":
            deg = params.get("svc__degree", "?")  # "?" if degree was not tuned
            return f"Polynomial (degree={deg})"
        if k == "sigmoid":
            return "Sigmoid"
        return str(k)

    # ----------------------------
    # Step 7: Evaluation on Test Set
    # Purpose: Compute predictions, confusion matrix (1,0 order), accuracy, sensitivity,
    # specificity, and ROC/AUC. Plot ROC curve.
    # ----------------------------
    # Hard class labels on the held-out test set
    y_pred = best_model.predict(X_test)

    # Confusion matrix with class 1 listed first, so rows/cols are in (1, 0) order
    cm = confusion_matrix(y_test, y_pred, labels=[1, 0])
    (TP, FN), (FP, TN) = cm  # row 0 = actual 1, row 1 = actual 0

    accuracy = accuracy_score(y_test, y_pred)
    # Guard the ratios against a test set with no actual positives / negatives
    sensitivity = np.nan if (TP + FN) == 0 else TP / (TP + FN)  # recall for class 1
    specificity = np.nan if (TN + FP) == 0 else TN / (TN + FP)  # recall for class 0

    # Scores for ROC/AUC (decision_function preferred; fallback to predict_proba if available)
    def get_scores(model, X) -> np.ndarray:
        """Return continuous scores for ROC/AUC from a fitted pipeline.

        model: pipeline whose "svc" step decides which scoring method exists.
        Preference order: decision_function (SVM margin), then positive-class
        probability, then hard labels as a last resort.
        NOTE(review): a fitted SVC always exposes decision_function, so the
        fallbacks are defensive rather than expected paths here.
        """
        svc = model.named_steps["svc"]
        if hasattr(svc, "decision_function"):
            return model.decision_function(X)
        if hasattr(svc, "predict_proba"):
            # probability=True is not set by default for SVC; using decision_function is standard for ROC
            return model.predict_proba(X)[:, 1]
        # As a last resort, use predicted labels (not ideal for ROC, but prevents errors)
        return model.predict(X)

    y_scores = get_scores(best_model, X_test)
    test_auc = roc_auc_score(y_test, y_scores)

    # ROC curve points (FPR/TPR across all score thresholds)
    fpr, tpr, thresholds = roc_curve(y_test, y_scores)

    # ----------------------------
    # Step 8: Support Vector Info & Human-Readable Summary
    # Purpose: Print final model choice, support vectors count, and all metrics in logical order.
    # ----------------------------
    svc_final = best_model.named_steps["svc"]
    n_support_by_class = svc_final.n_support_  # per-class support-vector counts
    total_support = int(np.sum(n_support_by_class))

    print("========== Final Model Summary ==========")
    print(f"Selected Kernel: {kernel_description(best_params)}")
    # Also show key hyperparameters for reproducibility
    print("Best Hyperparameters:")
    for k, v in best_params.items():
        print(f" - {k}: {v}")
    print(f"Cross-validated AUC (train CV): {best_cv_auc:.4f}")
    print(f"Number of Support Vectors (by class): {n_support_by_class}")
    print(f"Total Number of Support Vectors: {total_support}")

    print("\n========== Test Set Performance ==========")
    print("Confusion Matrix (rows/cols in order [1, 0]):")
    print(cm)
    print(f"\nAccuracy: {accuracy:.4f}")
    print(f"Sensitivity: {sensitivity:.4f} # Recall for class 1")
    print(f"Specificity: {specificity:.4f} # Recall for class 0")
    print(f"AUC: {test_auc:.4f}")

    # ----------------------------
    # Step 9: Plot ROC Curve
    # Purpose: Visualize trade-off between TPR (sensitivity) and FPR (1 - specificity).
    # ----------------------------
    plt.figure()
    # Model curve plus the 45-degree "chance" diagonal for reference
    plt.plot(fpr, tpr, linewidth=2, label=f"ROC curve (AUC = {test_auc:.4f})")
    plt.plot([0, 1], [0, 1], linestyle="--", linewidth=1, label="Chance")
    # Labels, title, legend, and a light dotted grid
    plt.title("SVM ROC Curve - Test Set")
    plt.xlabel("False Positive Rate (1 - Specificity)")
    plt.ylabel("True Positive Rate (Sensitivity)")
    plt.grid(True, linestyle=":", linewidth=0.5)
    plt.legend(loc="lower right")
    plt.tight_layout()
    plt.show()

    # ----------------------------
    # No User Input is Required Beyond this Point
    # ----------------------------


    This page titled 10.8.3: Python Code used for SVM Classification Case Study is shared under a CC BY 4.0 license and was authored, remixed, and/or curated by Elbert L. Hearon, M.B.A., M.S..