E2 236 Foundations of Machine Learning¶
Lab 6: Support Vector Machines¶
Using CVXPY / SciPy Quadratic Programming¶
Course: Machine Learning
Student ID: ______________________ (used as random seed)
Name: ______________________
Overview¶
In this assignment you will implement SVMs from scratch by directly solving the primal and dual quadratic programs (QPs).
You will complete four implementations:
- Hard-Margin SVM — primal QP
- Soft-Margin SVM — primal QP with slack variables
- Regularized SVM — effect of C via the dual QP
- Kernel SVM — dual QP with an arbitrary kernel matrix
Installation¶
pip install cvxpy numpy matplotlib scipy
Setup¶
import numpy as np
import matplotlib.pyplot as plt
import cvxpy as cp
from sklearn.datasets import make_blobs, make_circles, make_moons # data generation only
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# -------------------------------------------------------
# TODO: Replace 12345 with your actual numeric Student ID
# -------------------------------------------------------
STUDENT_ID = 12345
np.random.seed(STUDENT_ID)
print("cvxpy version:", cp.__version__)
print("Seed:", STUDENT_ID)
Shared Utility: Decision Boundary Plotter¶
This function is provided. It works with any object that has a .predict(X) method.
def plot_decision_boundary(predictor, X, y, support_vectors=None,
title="SVM", ax=None, margin_fn=None):
"""Generic boundary plotter. predictor must expose predict(X) -> {-1,+1}."""
if ax is None:
_, ax = plt.subplots(figsize=(6, 5))
x0 = np.linspace(X[:,0].min()-0.5, X[:,0].max()+0.5, 300)
x1 = np.linspace(X[:,1].min()-0.5, X[:,1].max()+0.5, 300)
xx, yy = np.meshgrid(x0, x1)
grid = np.c_[xx.ravel(), yy.ravel()]
Z = predictor.predict(grid).reshape(xx.shape)
ax.contourf(xx, yy, Z, alpha=0.15, cmap='bwr', levels=[-2,0,2])
ax.contour(xx, yy, Z, levels=[0], colors='black', linewidths=2)
if margin_fn is not None:
Zm = margin_fn(grid).reshape(xx.shape)
ax.contour(xx, yy, Zm, levels=[-1,1], colors=['red','green'],
linestyles='--', linewidths=1.5)
ax.scatter(X[:,0], X[:,1], c=y, cmap='bwr', edgecolors='k', s=40, zorder=3)
if support_vectors is not None:
ax.scatter(support_vectors[:,0], support_vectors[:,1],
s=180, facecolors='none', edgecolors='gold',
linewidths=2, zorder=4, label='SV')
ax.legend()
ax.set_title(title, fontsize=12)
return ax
# ── Solver helper: tries CLARABEL → SCS → OSQP ────────────────────────────
def solve_qp(prob, **kwargs):
"""
Try a sequence of solvers in order, returning on first success.
OSQP is fast but struggles with dense kernel matrices (e.g. RBF/Poly).
SCS handles dense problems better at the cost of lower precision.
CLARABEL is the modern default in newer CVXPY and very robust.
"""
solvers = [
dict(solver=cp.CLARABEL), # best default (cvxpy >= 1.3)
dict(solver=cp.SCS, eps_abs=1e-6, eps_rel=1e-6, max_iters=20000), # dense-friendly fallback (SCS >= 3 uses eps_abs/eps_rel)
dict(solver=cp.OSQP, eps_abs=1e-6, eps_rel=1e-6,
max_iter=20000, adaptive_rho=True, polish=True), # last resort
]
last_err = None
for s_kwargs in solvers:
try:
prob.solve(**s_kwargs, **kwargs)
if prob.status in ("optimal", "optimal_inaccurate") and prob.value is not None:
return
except Exception as e:
last_err = e
raise cp.error.SolverError(
f"All solvers failed. Last error: {last_err}\n"
f"Final status: {prob.status}"
)
Part 1: Hard-Margin SVM via Primal QP¶
Mathematical Formulation¶
$$\min_{\mathbf{w}, b} \; \frac{1}{2}\|\mathbf{w}\|^2$$ $$\text{subject to} \quad y_i(\mathbf{w}^T \mathbf{x}_i + b) \geq 1, \quad \forall i = 1,\ldots,n$$
Variables: $\mathbf{w} \in \mathbb{R}^d$, $b \in \mathbb{R}$
This is a convex QP with linear inequality constraints.
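Before formulating this in CVXPY, it can help to see the same QP solved with a general-purpose solver. The sketch below is illustrative only: the four toy points and the use of `scipy.optimize.minimize` with SLSQP are our own choices, not part of the assignment, but the objective and constraints are exactly the primal above.

```python
import numpy as np
from scipy.optimize import minimize

# Hypothetical separable toy set: two points per class on the line x2 = x1
X = np.array([[2., 2.], [3., 3.], [-2., -2.], [-3., -3.]])
y = np.array([1., 1., -1., -1.])

def objective(z):                       # z = [w1, w2, b]
    return 0.5 * z[:2] @ z[:2]          # 0.5 * ||w||^2

# One inequality per point: y_i * (w @ x_i + b) - 1 >= 0
cons = [{'type': 'ineq', 'fun': lambda z, i=i: y[i] * (X[i] @ z[:2] + z[2]) - 1}
        for i in range(len(y))]

res = minimize(objective, np.zeros(3), constraints=cons, method='SLSQP')
w, b = res.x[:2], res.x[2]
print("w =", w, " b =", round(b, 4), " margin =", 2 / np.linalg.norm(w))
# w ≈ [0.25 0.25], b ≈ 0, margin width ≈ 5.66
```

The CVXPY formulation you write in 1.2 should recover the same w and b on this data; only the inner points [2,2] and [-2,-2] end up with active constraints.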
1.1 Generate Data¶
X_hard, y_raw = make_blobs(n_samples=80, centers=2, cluster_std=0.55,
n_features=2, random_state=STUDENT_ID)
# Convert labels to {-1, +1}
y_hard = np.where(y_raw == 0, -1, 1)
print("X shape:", X_hard.shape, " y unique:", np.unique(y_hard))
plt.figure(figsize=(5,4))
plt.scatter(X_hard[:,0], X_hard[:,1], c=y_hard, cmap='bwr', edgecolors='k')
plt.title("Training Data")
plt.show()
X shape: (80, 2) y unique: [-1 1]
1.2 Implement Hard-Margin SVM Solver¶
Complete the class below. Use cvxpy to formulate and solve the primal QP.
class HardMarginSVM:
"""
Hard-Margin SVM solved via the primal QP using CVXPY.
Attributes (set after fit):
w : np.ndarray, shape (d,) — weight vector
b : float — bias
support_vectors : np.ndarray — training points with margin <= 1+tol
"""
def fit(self, X, y):
"""
Solve the hard-margin SVM primal QP.
Steps:
1. Define cvxpy variables w (shape d) and b (scalar)
2. Write the objective: minimize 0.5 * cp.sum_squares(w)
3. Write constraints: y_i * (X[i] @ w + b) >= 1 for all i
Hint: vectorized form: cp.multiply(y, X @ w + b) >= 1
4. Create cp.Problem and solve it with the provided solve_qp helper
5. Extract w.value, b.value
6. Identify support vectors: points where y*(w@x+b) <= 1 + 1e-4
"""
n, d = X.shape
# YOUR CODE HERE
self.w = None
self.b = None
self.support_vectors = None
return self
def decision_function(self, X):
"""Return signed distance to hyperplane: X @ w + b"""
# YOUR CODE HERE
pass
def predict(self, X):
"""Return predicted labels in {-1, +1}"""
# YOUR CODE HERE
pass
1.3 Train and Evaluate¶
# TODO: Instantiate HardMarginSVM, call .fit(X_hard, y_hard)
# Print w, b, margin width (= 2 / ||w||), number of support vectors
# Print training accuracy
# YOUR CODE HERE
hm_svm = HardMarginSVM()
1.4 Visualize¶
# TODO: Call plot_decision_boundary with:
# - predictor = hm_svm
# - support_vectors = hm_svm.support_vectors
# - margin_fn = hm_svm.decision_function
# YOUR CODE HERE
w = [-0.15690685 -0.00825728]
b = 0.1049
Margin width = 12.7288
# SVs = 2
Train acc = 1.0000
Question 1: What is the relationship between the support vectors and the KKT conditions of the primal QP? Which constraints are active at the optimum?
Your answer:
Part 2: Soft-Margin SVM via Primal QP¶
Mathematical Formulation¶
$$\min_{\mathbf{w}, b, \boldsymbol{\xi}} \; \frac{1}{2}\|\mathbf{w}\|^2 + C \sum_{i=1}^n \xi_i$$ $$\text{subject to} \quad y_i(\mathbf{w}^T \mathbf{x}_i + b) \geq 1 - \xi_i, \quad \xi_i \geq 0, \quad \forall i$$
Slack variables $\xi_i$ allow violations of the margin constraint.
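To make the role of the slacks concrete, here is an illustrative scipy sketch of the same QP on a hand-made non-separable toy set (the points, the one-outlier-per-class setup, and the SLSQP solver are our own choices for demonstration, not the assignment's CVXPY route):

```python
import numpy as np
from scipy.optimize import minimize

# Hypothetical overlapping toy set: one outlier per class makes it non-separable
X = np.array([[2., 0.], [1.5, 0.], [-0.5, 0.],    # class +1 (last point an outlier)
              [-2., 0.], [-1.5, 0.], [0.3, 0.]])  # class -1 (last point an outlier)
y = np.array([1., 1., 1., -1., -1., -1.])
n, d = X.shape
C = 1.0

def objective(z):                      # z = [w (d), b, xi (n)]
    w, xi = z[:d], z[d + 1:]
    return 0.5 * w @ w + C * xi.sum()

# y_i * (w @ x_i + b) - 1 + xi_i >= 0; xi_i >= 0 enforced via bounds
cons = [{'type': 'ineq',
         'fun': lambda z, i=i: y[i] * (X[i] @ z[:d] + z[d]) - 1 + z[d + 1 + i]}
        for i in range(n)]
bounds = [(None, None)] * (d + 1) + [(0., None)] * n

res = minimize(objective, np.zeros(d + 1 + n), constraints=cons,
               bounds=bounds, method='SLSQP')
w, b, xi = res.x[:d], res.x[d], res.x[d + 1:]
print("w =", w, " b =", round(b, 3), " total slack =", round(xi.sum(), 3))
```

Because the two outliers straddle any linear boundary, the total slack is strictly positive here; the hard-margin problem from Part 1 would be infeasible on this data.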
2.1 Generate Overlapping Data¶
X_soft_raw, y_raw2 = make_blobs(n_samples=150, centers=2, cluster_std=1.4,
n_features=2, random_state=STUDENT_ID)
y_soft = np.where(y_raw2 == 0, -1, 1)
X_tr, X_te, y_tr, y_te = train_test_split(X_soft_raw, y_soft,
test_size=0.25, random_state=STUDENT_ID)
print("Train:", X_tr.shape, " Test:", X_te.shape)
Train: (112, 2) Test: (38, 2)
2.2 Implement Soft-Margin SVM Solver¶
class SoftMarginSVM:
"""
Soft-Margin SVM solved via the primal QP using CVXPY.
Parameters
----------
C : float — regularization strength (default 1.0)
"""
def __init__(self, C=1.0):
self.C = C
def fit(self, X, y):
"""
Solve the soft-margin SVM primal QP.
Steps:
1. Define cvxpy variables: w (d,), b (scalar), xi (n,)
2. Objective: minimize 0.5 * cp.sum_squares(w) + C * cp.sum(xi)
3. Constraints:
(a) cp.multiply(y, X @ w + b) >= 1 - xi
(b) xi >= 0
4. Solve and extract values
5. Support vectors: indices where xi > 1e-4 OR y*(w@x+b) <= 1+1e-4
6. Store self.xi_values = xi.value (slack values)
"""
n, d = X.shape
# YOUR CODE HERE
self.w = None
self.b = None
self.support_vectors = None
self.xi_values = None
return self
def decision_function(self, X):
# YOUR CODE HERE
pass
def predict(self, X):
# YOUR CODE HERE
pass
2.3 Train, Evaluate & Visualize¶
# TODO:
# 1. Fit SoftMarginSVM(C=1.0) on X_tr, y_tr
# 2. Print w, b, margin = 2/||w||, total slack sum, # support vectors
# 3. Print train and test accuracy
# 4. Plot decision boundary using plot_decision_boundary
# YOUR CODE HERE
sm_svm = SoftMarginSVM(C=1.0)
w = [-0.23471171 -0.01611776]
b = 0.1367
Margin width = 8.5011
Sum slack = 0.0000
# SVs = 2
Train acc = 1.0000
Test acc = 1.0000
Question 2: How does the sum of slack variables $\sum \xi_i$ relate to the number of misclassified points? Is $\sum \xi_i$ always an integer?
Your answer:
Part 2b: Hinge Loss Interpretation of Soft-Margin SVM¶
Theory¶
The soft-margin primal objective can be rewritten as an unconstrained empirical risk minimisation problem:
$$\min_{\mathbf{w}, b} \; \underbrace{\frac{1}{2}\|\mathbf{w}\|^2}_{\text{regulariser}} + C \sum_{i=1}^{n} \underbrace{\max(0,\, 1 - y_i(\mathbf{w}^T\mathbf{x}_i + b))}_{\text{hinge loss } \ell_H}$$
This equivalence holds because at the optimum the slack variable satisfies: $$\xi_i = \max(0,\, 1 - y_i f(\mathbf{x}_i))$$
Hinge loss properties:
- Zero when a point is correctly classified and outside the margin ($y_i f(\mathbf{x}_i) \geq 1$)
- Linear penalty when inside the margin or misclassified
- Non-differentiable at $y_i f(\mathbf{x}_i) = 1$ (the "hinge")
Connection to C:
- Large C → high penalty for margin violations → fits training data closely
- Small C → regularisation dominates → wider margin, more violations tolerated
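A quick numeric check of the definition (the labels and scores below are toy numbers of our own choosing):

```python
import numpy as np

y = np.array([1., 1., -1., -1.])
scores = np.array([2.0, 0.5, -0.2, 1.0])   # hypothetical decision values f(x)
losses = np.maximum(0.0, 1.0 - y * scores)
print(losses)   # [0.  0.5 0.8 2. ]
# first point: correct and outside the margin -> loss 0
# middle points: correct but inside the margin -> 0.5 and 0.8
# last point: misclassified (y=-1, f(x)=1) -> loss 2
```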
# ── 2b.1 Implement the hinge loss ────────────────────────────────────────
def hinge_loss(y, scores):
"""
Compute per-sample hinge loss.
Parameters
----------
y : np.ndarray (n,) — true labels in {-1, +1}
scores : np.ndarray (n,) — decision function values w^T x + b
Returns
-------
losses : np.ndarray (n,) — max(0, 1 - y * scores)
TODO: implement using numpy only (no loops)
"""
# YOUR CODE HERE
pass
# ── 2b.2 Verify equivalence: slack xi == hinge loss at optimum ───────────
# TODO:
# 1. After fitting sm_svm on X_tr, y_tr, compute:
# scores = sm_svm.decision_function(X_tr)
# h_loss = hinge_loss(y_tr, scores) # via your function
# 2. Compare h_loss with sm_svm.xi_values element-wise.
# Print max absolute difference — should be < 1e-4.
# 3. Print mean hinge loss and total slack sum.
# YOUR CODE HERE
# ── 2b.3 Implement SVM objective as pure hinge loss minimisation ─────────
class HingeLossSVM:
"""
Soft-Margin SVM reformulated as unconstrained hinge loss minimisation.
Solves: min_{w,b} 0.5*||w||^2 + C * sum_i max(0, 1 - y_i*(w^T x_i + b))
Use cvxpy with cp.pos() to represent max(0, ...) — this keeps the
problem as a convex QP that cvxpy can solve directly WITHOUT
introducing explicit xi variables or inequality constraints.
Hint:
hinge_terms = cp.sum(cp.pos(1 - cp.multiply(y, X @ w + b)))
objective = cp.Minimize(0.5 * cp.sum_squares(w) + C * hinge_terms)
"""
def __init__(self, C=1.0):
self.C = C
def fit(self, X, y):
# YOUR CODE HERE
self.w = None
self.b = None
return self
def decision_function(self, X):
# YOUR CODE HERE
pass
def predict(self, X):
# YOUR CODE HERE
pass
# ── 2b.4 Compare: SoftMarginSVM (explicit xi) vs HingeLossSVM (cp.pos) ──
# TODO:
# 1. Fit HingeLossSVM(C=1.0) on X_tr, y_tr
# 2. Print w and b for both formulations — they should be nearly identical.
# 3. Print train accuracy for both.
# 4. Plot both decision boundaries side by side (1×2 subplot).
# YOUR CODE HERE
# ── 2b.5 Visualise hinge loss as a function of margin ────────────────────
# TODO:
# Plot hinge loss ℓ_H(t) = max(0, 1-t) for t in [-1.5, 2.5]
# On the same axes, also plot:
# - Zero-one loss: ℓ_01(t) = 1 if t < 0 else 0
# - Squared hinge loss: ℓ_sq(t) = max(0, 1-t)^2
# Mark the hinge point at t=1 with a vertical dashed line.
# Label axes: x = 'Functional margin y·f(x)', y = 'Loss'
# Add a legend and title 'Hinge Loss vs Other Losses'
# YOUR CODE HERE
Comparison: SoftMarginSVM (xi) vs HingeLossSVM (cp.pos)
w (xi-form)   : [-0.23471171 -0.01611776]
w (hinge-form): [-0.23471171 -0.01611776]
b (xi-form)   : 0.13669
b (hinge-form): 0.13669
max |Δw| : 0.000000
|Δb| : 0.000000
Train acc (xi-form)   : 1.0000
Train acc (hinge-form): 1.0000
Question 2b: Why is the hinge loss preferred over the zero-one loss for training SVMs, even though zero-one loss directly measures classification error? What property of hinge loss makes it suitable for gradient-based or QP-based optimisation?
Your answer:
Part 3: Regularized SVM — Dual QP and Effect of C¶
Mathematical Formulation (Dual)¶
The dual of the soft-margin SVM is: $$\max_{\boldsymbol{\alpha}} \; \sum_{i=1}^n \alpha_i - \frac{1}{2} \sum_{i,j} \alpha_i \alpha_j y_i y_j \mathbf{x}_i^T \mathbf{x}_j$$ $$\text{subject to} \quad 0 \leq \alpha_i \leq C, \quad \sum_{i=1}^n \alpha_i y_i = 0$$
Recovery: $\mathbf{w} = \sum_i \alpha_i y_i \mathbf{x}_i$, $b$ from support vectors with $0 < \alpha_i < C$.
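The recovery formulas can be checked with plain numpy. The alpha values below are chosen by hand for illustration (not solved for), but they satisfy dual feasibility sum(alpha * y) = 0:

```python
import numpy as np

# Hypothetical dual solution for three points (hand-picked, not solved)
X = np.array([[1., 0.], [0., 1.], [-1., 0.]])
y = np.array([1., 1., -1.])
alpha = np.array([0.5, 0.0, 0.5])          # alpha_2 = 0: not a support vector

w = (alpha * y) @ X                        # w = sum_i alpha_i y_i x_i
K = X @ X.T                                # linear kernel matrix
i = 0                                      # a free support vector (0 < alpha_i < C)
b = y[i] - K[i] @ (alpha * y)
print("w =", w, " b =", b)                 # w = [1. 0.]  b = 0.0
```

Note that the point with alpha = 0 contributes nothing to w or b: only support vectors enter the recovered model.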
3.1 Implement the Dual SVM Solver¶
class DualSVM:
"""
Soft-Margin SVM solved via the DUAL QP using CVXPY.
Supports arbitrary kernels (defaults to linear).
Parameters
----------
C : float — regularization strength
(the kernel is not a constructor argument: pass a precomputed kernel matrix K to fit(), otherwise the linear kernel is used)
"""
def __init__(self, C=1.0):
self.C = C
def _linear_kernel(self, X1, X2):
"""Linear kernel: K[i,j] = x_i^T x_j"""
return X1 @ X2.T
def fit(self, X, y, K=None):
"""
Solve the dual QP.
Parameters
----------
X : training data (n, d)
y : labels in {-1, +1}
K : optional precomputed kernel matrix (n, n)
If None, uses linear kernel.
Steps:
1. Compute kernel matrix K if not provided
2. Form the QP matrix: Q[i,j] = y_i * y_j * K[i,j]
Then symmetrize: Q = (Q + Q.T) / 2 + 1e-8 * np.eye(n)
3. cvxpy variable: alpha (n,)
4. Objective: maximize sum(alpha) - 0.5 * cp.quad_form(alpha, cp.psd_wrap(Q))
(equivalently: minimize 0.5 * quad_form - sum(alpha))
5. Constraints:
(a) 0 <= alpha <= C
(b) y @ alpha == 0 (equality)
6. Solve and extract alpha.value
7. Identify support vectors: alpha_i > 1e-5
8. Recover w = sum_i alpha_i * y_i * X[i] (for linear kernel)
9. Recover b using free support vectors (1e-5 < alpha_i < C - 1e-5):
b = mean( y_sv - K_sv @ (alpha * y) )
10. Store self.alpha, self.X_train, self.y_train
"""
n, d = X.shape
self.X_train = X
self.y_train = y
y = y.astype(float) # CVXPY requires float for equality constraint
# YOUR CODE HERE
self.alpha = None
self.w = None
self.b = None
self.support_vectors = None
return self
def decision_function(self, X):
"""
For linear kernel: return X @ w + b
General form: sum_i alpha_i * y_i * K(x_i, x) + b
"""
# YOUR CODE HERE
pass
def predict(self, X):
# YOUR CODE HERE
pass
3.2 Sweep Over C Values¶
C_values = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
# TODO:
# For each C in C_values:
# 1. Train DualSVM(C=C) on X_tr, y_tr
# 2. Compute train & test accuracy
# 3. Store results
# Print a table: C | # SVs | margin width | train acc | test acc
# YOUR CODE HERE
dual_results = []
C # SVs Margin Train Test
------------------------------------------------
0.001 28 13.4610 1.0000 1.0000
0.01 6 10.0397 1.0000 1.0000
0.1 2 8.5011 1.0000 1.0000
1.0 2 8.5011 1.0000 1.0000
10.0 2 8.5011 1.0000 1.0000
100.0 2 8.5011 1.0000 1.0000
3.3 Plot Decision Boundaries and Accuracy Curve¶
# TODO: Create 1×6 subplot row — one boundary plot per C value
# TODO: Create a second figure: semilogx plot of train vs test accuracy vs C
# YOUR CODE HERE
Question 3: In the dual formulation, what do the $\alpha_i$ values represent geometrically? What does it mean when $\alpha_i = C$ vs $0 < \alpha_i < C$ vs $\alpha_i = 0$?
Your answer:
Part 4: Kernel SVM via Dual QP¶
The Kernel Trick¶
Replace $\mathbf{x}_i^T\mathbf{x}_j$ with a kernel function $K(\mathbf{x}_i, \mathbf{x}_j) = \phi(\mathbf{x}_i)^T\phi(\mathbf{x}_j)$.
The dual QP is identical — only the kernel matrix $K$ changes.
You will implement three kernels from scratch:
| Kernel | Formula |
|---|---|
| Linear | $K(x,z) = x^T z$ |
| RBF (Gaussian) | $K(x,z) = \exp(-\gamma \|x-z\|^2)$ |
| Polynomial | $K(x,z) = (\gamma x^T z + r)^d$ |
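The squared-distance expansion given in the RBF hint of 4.1 vectorises neatly with broadcasting. Here is one possible sketch (the function name `rbf_sketch` and the clipping at zero are our own choices; the values match the sanity checks below):

```python
import numpy as np

def rbf_sketch(X1, X2, gamma=1.0):
    # ||x - z||^2 = ||x||^2 + ||z||^2 - 2 x^T z; clip at 0 against round-off
    sq = ((X1**2).sum(axis=1)[:, None]
          + (X2**2).sum(axis=1)[None, :]
          - 2.0 * X1 @ X2.T)
    return np.exp(-gamma * np.maximum(sq, 0.0))

A = np.array([[1., 0.], [0., 1.]])
K = rbf_sketch(A, A, gamma=1.0)
print(K)
# diagonal is exp(0) = 1; off-diagonal is exp(-2) ≈ 0.1353
```

The clipping matters in practice: for nearly identical points the expansion can go slightly negative in floating point, which would otherwise produce K values above 1.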
4.1 Implement Kernel Functions¶
def linear_kernel(X1, X2):
"""
Linear kernel: K[i,j] = X1[i] . X2[j]
Returns: (n1, n2) matrix
"""
# YOUR CODE HERE
pass
def rbf_kernel(X1, X2, gamma=1.0):
"""
RBF kernel: K[i,j] = exp(-gamma * ||X1[i] - X2[j]||^2)
Hint: Use broadcasting or scipy.spatial.distance.cdist
||x - z||^2 = ||x||^2 + ||z||^2 - 2 x^T z
Returns: (n1, n2) matrix
"""
# YOUR CODE HERE
pass
def poly_kernel(X1, X2, degree=3, gamma=1.0, coef0=1.0):
"""
Polynomial kernel: K[i,j] = (gamma * X1[i].X2[j] + coef0)^degree
Returns: (n1, n2) matrix
"""
# YOUR CODE HERE
pass
# Sanity checks — run these after implementing
X_test_k = np.array([[1.0, 0.0], [0.0, 1.0]])
print("Linear kernel:\n", linear_kernel(X_test_k, X_test_k))
print("RBF kernel (g=1):\n", rbf_kernel(X_test_k, X_test_k, gamma=1.0))
print("Poly kernel (d=2):\n", poly_kernel(X_test_k, X_test_k, degree=2))
Linear kernel:
[[1. 0.]
[0. 1.]]
RBF kernel (g=1):
[[1. 0.13533528]
[0.13533528 1. ]]
Poly kernel (d=2):
[[4. 1.]
[1. 4.]]
4.2 Implement Kernel SVM (extends DualSVM)¶
class KernelSVM:
"""
Kernel SVM using the dual QP.
Parameters
----------
C : float
kernel : str in {'linear','rbf','poly'}
gamma : float (for RBF and Poly)
degree : int (for Poly)
coef0 : float (for Poly)
"""
def __init__(self, C=1.0, kernel='rbf', gamma=1.0, degree=3, coef0=1.0):
self.C = C
self.kernel = kernel
self.gamma = gamma
self.degree = degree
self.coef0 = coef0
def _compute_kernel(self, X1, X2):
"""
Dispatch to the correct kernel function based on self.kernel.
"""
# YOUR CODE HERE
pass
def fit(self, X, y):
"""
Solve the dual QP with the chosen kernel.
Steps: (same as DualSVM.fit, but use self._compute_kernel)
1. K = self._compute_kernel(X, X) — (n,n) train kernel matrix
2. Q[i,j] = y_i * y_j * K[i,j] — (n,n) QP matrix
3. cvxpy dual QP → solve for alpha
4. Find support vector indices: alpha > 1e-5
5. Compute b from free SVs (1e-5 < alpha < C - 1e-5):
For each free SV i: b_i = y_i - sum_j alpha_j*y_j*K[j,i]
b = mean(b_i)
6. Store self.alpha_sv, self.X_sv, self.y_sv, self.b
"""
n, d = X.shape
self.X_train = X
self.y_train = y
# YOUR CODE HERE
self.alpha = None
self.alpha_sv = None
self.X_sv = None
self.y_sv = None
self.b = None
self.support_vectors = None
return self
def decision_function(self, X):
"""
f(x) = sum_{i in SV} alpha_i * y_i * K(x_i, x) + b
Hint:
K_pred = self._compute_kernel(self.X_sv, X) — (n_sv, n_test)
scores = (self.alpha_sv * self.y_sv) @ K_pred + self.b
"""
# YOUR CODE HERE
pass
def predict(self, X):
# YOUR CODE HERE
pass
4.3 Generate Non-Linear Data¶
scaler = StandardScaler()
X_circ_raw, y_circ_raw = make_circles(n_samples=200, noise=0.1,
factor=0.4, random_state=STUDENT_ID)
X_circ = scaler.fit_transform(X_circ_raw)
y_circ = np.where(y_circ_raw == 0, -1, 1)
scaler2 = StandardScaler()
X_moon_raw, y_moon_raw = make_moons(n_samples=200, noise=0.15,
random_state=STUDENT_ID)
X_moon = scaler2.fit_transform(X_moon_raw)
y_moon = np.where(y_moon_raw == 0, -1, 1)
print("Circles:", X_circ.shape, " Moons:", X_moon.shape)
Circles: (200, 2) Moons: (200, 2)
4.4 Train All Kernels on Both Datasets¶
# TODO:
# For each dataset (circles, moons) and each kernel (linear, rbf, poly):
# 1. Train KernelSVM(C=1.0, kernel=...) — use gamma=0.5 for RBF, degree=3 for Poly
# 2. Compute accuracy
# 3. Store model in a dict for plotting
# YOUR CODE HERE
kernel_models = {} # e.g. kernel_models[('circles','rbf')] = fitted KernelSVM
Circles linear acc=0.6000 n_sv=198
Circles rbf acc=1.0000 n_sv=27
Circles poly acc=1.0000 n_sv=8
Moons linear acc=0.8600 n_sv=65
Moons rbf acc=0.9850 n_sv=52
Moons poly acc=0.9900 n_sv=19
4.5 Visualize (2×3 Grid)¶
# TODO: 2-row × 3-column subplot grid
# Row 0: circles + linear, rbf, poly
# Row 1: moons + linear, rbf, poly
# Each subplot: plot_decision_boundary + accuracy in title
# YOUR CODE HERE
4.6 Effect of Gamma on RBF Kernel (Circles)¶
gamma_values = [0.01, 0.1, 1.0, 5.0, 20.0]
# TODO:
# For each gamma, train KernelSVM(C=1, kernel='rbf', gamma=gamma) on X_circ, y_circ
# Plot decision boundary in a 1×5 row, title includes gamma and accuracy
# YOUR CODE HERE
4.7 Summary Table¶
# TODO: Print summary table:
# Dataset | Kernel | # SVs | Accuracy
# for all 6 combinations
# YOUR CODE HERE
Dataset Kernel # SVs Accuracy
------------------------------------------
Circles linear 198 0.6000
Circles rbf 27 1.0000
Circles poly 8 1.0000
Moons linear 65 0.8600
Moons rbf 52 0.9850
Moons poly 19 0.9900
Question 4: Derive why the prediction function uses only support vectors ($\alpha_i > 0$). What does this imply about scalability as $n$ grows?
Your answer:
Part 5: Verify Against sklearn (Sanity Check)¶
Compare your implementations against sklearn.svm.SVC to verify correctness.
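For reference, the sklearn side of the comparison on its own looks like this (a sketch: `random_state=0` here is arbitrary, whereas the TODO below uses your Student ID seed and your fitted KernelSVM):

```python
import numpy as np
from sklearn.datasets import make_circles
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Same data recipe as 4.3, but with an arbitrary seed for illustration
Xc, yc = make_circles(n_samples=200, noise=0.1, factor=0.4, random_state=0)
Xc = StandardScaler().fit_transform(Xc)
yc = np.where(yc == 0, -1, 1)

ref = SVC(C=1.0, kernel='rbf', gamma=0.5).fit(Xc, yc)
print("sklearn train acc:", ref.score(Xc, yc), " #SVs:", ref.n_support_.sum())
```

`n_support_` holds the per-class support-vector counts, so its sum is comparable to the number of alpha_i > 1e-5 in your dual solution.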
from sklearn.svm import SVC
# TODO: For the circles dataset, train:
# (a) Your KernelSVM(C=1, kernel='rbf', gamma=0.5)
# (b) sklearn SVC(C=1, kernel='rbf', gamma=0.5)
# Print both accuracies side by side.
# They should be within ~1-2% of each other.
# YOUR CODE HERE
Our KernelSVM accuracy : 1.0000 (#SVs=27)
sklearn SVC accuracy : 1.0000 (#SVs=27)
Difference : 0.0000
Question 5 (Reflection): Your CVXPY setup uses general-purpose convex solvers (CLARABEL/SCS/OSQP). sklearn uses a specialized SMO (Sequential Minimal Optimization) algorithm. List two reasons why SMO is preferred in practice over a general QP solver for large datasets.
Your answer: