E2 236 Foundations of ML¶

Lab 3b - Optimization¶

Part B — Logistic Regression: IRLS vs SGD vs Adam¶

Learning objectives

  • Derive and implement the logistic regression negative log-likelihood and its gradient
  • Implement three optimizers from scratch:
    • Iteratively Reweighted Least Squares (IRLS / Newton-Raphson)
    • Stochastic Gradient Descent (SGD) with momentum
    • Adam
  • Evaluate and compare all three on synthetic binary classification data

Instructions: Complete every cell marked # YOUR CODE HERE.


Imports & Setup¶

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import train_test_split

np.random.seed(0)
%matplotlib inline
plt.rcParams.update({'figure.dpi':110, 'font.size':11})

1. Synthetic Binary Classification Dataset¶

Q1 — Generate a 2-class dataset in $\mathbb{R}^2$:

  • Class 0: $\mathcal{N}(\boldsymbol{\mu}_0, \Sigma)$ with $\boldsymbol{\mu}_0 = [-1, -1]^\top$
  • Class 1: $\mathcal{N}(\boldsymbol{\mu}_1, \Sigma)$ with $\boldsymbol{\mu}_1 = [1, 1]^\top$
  • $\Sigma = 1.2^2 I_2$, 300 samples per class.

Append a bias column of ones to $X$, then split 80/20 train/test. Plot the training data.
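Hint — drawing isotropic Gaussian samples and appending a bias column can be sketched as below (illustrative names only, and not the full required solution — you still need both classes, labels, and the plot):

```python
import numpy as np

rng = np.random.default_rng(0)
n, sigma = 4, 1.2
# isotropic Gaussian N(mu, sigma^2 I): shift + scale standard-normal draws
pts = np.array([-1.0, -1.0]) + sigma * rng.standard_normal((n, 2))
X = np.hstack([pts, np.ones((n, 1))])   # append bias column -> shape (n, 3)
print(X.shape)  # (4, 3)
```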

In [ ]:
def generate_binary_data(n_per_class=300, sigma=1.2, seed=0):
    """
    Generate 2-class Gaussian data.

    Returns
    -------
    X : ndarray (2n, 3)   features + bias column
    y : ndarray (2n,)     binary labels
    """
    # YOUR CODE HERE
    raise NotImplementedError


X_all, y_all = generate_binary_data()
X_tr, X_te, y_tr, y_te = train_test_split(
    X_all, y_all, test_size=0.2, random_state=1)

print(f"Train: {X_tr.shape}, Test: {X_te.shape}")

# Plot training data
# YOUR CODE HERE
In [ ]:
 
Train: (480, 3), Test: (120, 3)
[figure: scatter plot of the two training classes]

2. Logistic Regression Objective¶

The negative log-likelihood (NLL) for logistic regression with parameters $\boldsymbol{w}$ is

$$\mathcal{L}(\boldsymbol{w}) = -\frac{1}{n}\sum_{i=1}^n \bigl[y_i \log \sigma(\mathbf{x}_i^\top \boldsymbol{w}) + (1-y_i)\log(1 - \sigma(\mathbf{x}_i^\top \boldsymbol{w}))\bigr]$$

Its gradient is

$$\nabla \mathcal{L} = \frac{1}{n} X^\top (\boldsymbol{\pi} - \mathbf{y})$$

where $\pi_i = \sigma(\mathbf{x}_i^\top \boldsymbol{w})$ and $\sigma(z) = 1/(1+e^{-z})$.
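Hint — a common way to make the sigmoid numerically stable is to branch on the sign of $z$ so that `exp` is only ever evaluated on non-positive arguments, avoiding overflow for large negative $z$. A minimal sketch (illustrative; not necessarily the exact form you must use):

```python
import numpy as np

def stable_sigmoid(z):
    # evaluate exp() only on non-positive arguments to avoid overflow
    z = np.asarray(z, dtype=float)
    out = np.empty_like(z)
    pos = z >= 0
    out[pos] = 1.0 / (1.0 + np.exp(-z[pos]))
    ez = np.exp(z[~pos])
    out[~pos] = ez / (1.0 + ez)
    return out

print(stable_sigmoid(np.array([-1000.0, 0.0, 1000.0])))  # no overflow at ±1000
```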

Q2 — Implement the sigmoid, NLL, and gradient.

In [ ]:
def sigmoid(z):
    """Numerically stable sigmoid."""
    # YOUR CODE HERE
    raise NotImplementedError


def nll(w, X, y):
    """
    Negative log-likelihood of logistic regression.

    Parameters
    ----------
    w : ndarray (d,)
    X : ndarray (n, d)
    y : ndarray (n,)   — binary {0, 1}

    Returns
    -------
    loss : float
    """
    # YOUR CODE HERE
    raise NotImplementedError


def nll_gradient(w, X, y):
    """
    Gradient of NLL w.r.t. w.

    Returns
    -------
    grad : ndarray (d,)
    """
    # YOUR CODE HERE
    raise NotImplementedError


# Sanity checks
w0 = np.zeros(X_tr.shape[1])
assert 0 < nll(w0, X_tr, y_tr) < 2,           "NLL out of expected range"
assert nll_gradient(w0, X_tr, y_tr).shape == w0.shape, "Gradient shape error"
print("NLL checks passed ✓")

3. Method 1 — IRLS (Newton-Raphson)¶

For logistic regression, the Hessian is

$$H = \frac{1}{n} X^\top W X, \quad W = \operatorname{diag}(\pi_i(1-\pi_i))$$

The Newton update is

$$\boldsymbol{w}^{(t+1)} = \boldsymbol{w}^{(t)} - H^{-1} \nabla\mathcal{L}$$

which is equivalent to a weighted least squares problem solved at each step (IRLS).

Q3 — Implement IRLS. Use the working response formulation: $$\mathbf{z} = X\boldsymbol{w} + W^{-1}(\mathbf{y} - \boldsymbol{\pi}), \quad \boldsymbol{w}^{\text{new}} = (X^\top W X)^{-1} X^\top W \mathbf{z}$$
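For orientation, a single step of the working-response update above can be sketched like this (illustrative names; `irls_step` is not part of the required API, and the small `lam * I` ridge term mirrors the `lam` parameter in the cell below):

```python
import numpy as np

def sigmoid_(z):
    return 1.0 / (1.0 + np.exp(-np.clip(z, -30.0, 30.0)))

def irls_step(w, X, y, lam=1e-4):
    """One IRLS / Newton step via the working response z."""
    pi = sigmoid_(X @ w)
    s = np.clip(pi * (1.0 - pi), 1e-10, None)        # diagonal of W, guarded
    z = X @ w + (y - pi) / s                         # working response
    A = X.T @ (s[:, None] * X) + lam * np.eye(X.shape[1])
    return np.linalg.solve(A, X.T @ (s * z))         # (X^T W X + lam I)^{-1} X^T W z

# toy demo: one step from w = 0 on four linearly separable points
X = np.array([[1., 1., 1.], [2., 1., 1.], [-1., -1., 1.], [-2., -1., 1.]])
y = np.array([1., 1., 0., 0.])
w1 = irls_step(np.zeros(3), X, y)
print(w1.shape)  # (3,)
```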

In [ ]:
def irls_logistic(X, y, n_iter=50, tol=1e-7, lam=1e-4):
    """
    IRLS (Newton-Raphson) for logistic regression.

    Parameters
    ----------
    X     : ndarray (n, d)
    y     : ndarray (n,)
    n_iter: int
    tol   : float   convergence tolerance on ||Δw||
    lam   : float   L2 regularisation (added to diagonal of Hessian)

    Returns
    -------
    w         : ndarray (d,)
    loss_hist : list[float]   NLL at each full iteration
    """
    # YOUR CODE HERE
    raise NotImplementedError


w_irls, loss_irls = irls_logistic(X_tr, y_tr)
print(f"IRLS  | Train NLL: {loss_irls[-1]:.4f} | "
      f"Test acc: {accuracy_score(y_te, (sigmoid(X_te@w_irls)>=0.5).astype(int)):.3f}")
In [ ]:

IRLS  | Train NLL: 0.2764 | Test acc: 0.892 | Iters: 7

4. Method 2 — SGD with Momentum¶

Mini-batch SGD with momentum: $$\mathbf{v}^{(t+1)} = \mu \mathbf{v}^{(t)} - \eta \, \hat{\nabla}\mathcal{L}(\boldsymbol{w}^{(t)})$$ $$\boldsymbol{w}^{(t+1)} = \boldsymbol{w}^{(t)} + \mathbf{v}^{(t+1)}$$

where $\hat{\nabla}$ is the gradient on a random mini-batch.
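To see the momentum update in isolation, here is a sketch of the two update lines above applied to a toy quadratic $f(\boldsymbol{w}) = \tfrac12\|\boldsymbol{w}\|^2$, whose gradient is simply $\boldsymbol{w}$ (this is not the lab solution, which must add mini-batching and regularisation):

```python
import numpy as np

w = np.array([5.0, -3.0])
v = np.zeros_like(w)
lr, mu = 0.1, 0.9
for _ in range(200):
    g = w                  # gradient of the toy objective f(w) = 0.5 ||w||^2
    v = mu * v - lr * g    # velocity update
    w = w + v              # parameter update
print(np.linalg.norm(w))   # converges toward 0
```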

Q4 — Implement mini-batch SGD with momentum. Track the full-batch NLL every epoch.

In [ ]:
def sgd_logistic(X, y, lr=0.1, momentum=0.9, n_epochs=200,
                 batch_size=32, lam=1e-4, seed=1):
    """
    Mini-batch SGD with momentum for logistic regression.

    Parameters
    ----------
    X          : ndarray (n, d)
    y          : ndarray (n,)
    lr         : float   learning rate η
    momentum   : float   μ
    n_epochs   : int
    batch_size : int
    lam        : float   L2 regularisation
    seed       : int

    Returns
    -------
    w         : ndarray (d,)
    loss_hist : list[float]   full-batch NLL per epoch
    """
    # YOUR CODE HERE
    raise NotImplementedError


w_sgd, loss_sgd = sgd_logistic(X_tr, y_tr)
print(f"SGD   | Train NLL: {loss_sgd[-1]:.4f} | "
      f"Test acc: {accuracy_score(y_te, (sigmoid(X_te@w_sgd)>=0.5).astype(int)):.3f}")

5. Method 3 — Adam¶

Adam update (Kingma & Ba, 2015):

$$\mathbf{m}^{(t)} = \beta_1 \mathbf{m}^{(t-1)} + (1-\beta_1)\mathbf{g}^{(t)}$$ $$\mathbf{v}^{(t)} = \beta_2 \mathbf{v}^{(t-1)} + (1-\beta_2)(\mathbf{g}^{(t)})^2$$ $$\hat{\mathbf{m}} = \mathbf{m}^{(t)} / (1-\beta_1^t), \quad \hat{\mathbf{v}} = \mathbf{v}^{(t)} / (1-\beta_2^t)$$ $$\boldsymbol{w}^{(t)} = \boldsymbol{w}^{(t-1)} - \alpha \, \hat{\mathbf{m}} / (\sqrt{\hat{\mathbf{v}}} + \varepsilon)$$
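The four update lines above, transcribed directly into NumPy on the same toy quadratic $f(\boldsymbol{w}) = \tfrac12\|\boldsymbol{w}\|^2$ with gradient $\boldsymbol{w}$ (a sketch of the update rule only, not the lab solution):

```python
import numpy as np

w = np.array([5.0, -3.0])
m = np.zeros_like(w)
v = np.zeros_like(w)
lr, b1, b2, eps = 0.1, 0.9, 0.999, 1e-8
for t in range(1, 501):                       # note: t starts at 1
    g = w                                     # gradient of the toy objective
    m = b1 * m + (1 - b1) * g                 # first-moment EMA
    v = b2 * v + (1 - b2) * g**2              # second-moment EMA
    m_hat = m / (1 - b1**t)                   # bias correction
    v_hat = v / (1 - b2**t)
    w = w - lr * m_hat / (np.sqrt(v_hat) + eps)
print(np.linalg.norm(w))  # small residual oscillation around the minimum
```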

Q5 — Implement Adam. Track the full-batch NLL every epoch.

In [ ]:
def adam_logistic(X, y, lr=0.01, beta1=0.9, beta2=0.999, eps=1e-8,
                  n_epochs=200, batch_size=32, lam=1e-4, seed=2):
    """
    Adam optimizer for logistic regression.

    Parameters
    ----------
    X, y        : data
    lr          : float   α (step-size)
    beta1, beta2: float   exponential decay rates
    eps         : float   numerical stability term
    n_epochs    : int
    batch_size  : int
    lam         : float   L2 regularisation

    Returns
    -------
    w         : ndarray (d,)
    loss_hist : list[float]   full-batch NLL per epoch
    """
    # YOUR CODE HERE
    raise NotImplementedError


w_adam, loss_adam = adam_logistic(X_tr, y_tr)
print(f"Adam  | Train NLL: {loss_adam[-1]:.4f} | "
      f"Test acc: {accuracy_score(y_te, (sigmoid(X_te@w_adam)>=0.5).astype(int)):.3f}")
In [ ]:

Adam  | Train NLL: 0.2764 | Test acc: 0.892 | Epochs: 200

6. Convergence Comparison¶

Q6.1 — On a single figure, plot training NLL vs iteration count (epoch for SGD/Adam, Newton step for IRLS) for all three methods.

Q6.2 — Plot the decision boundaries learned by each method side-by-side on the test set.

Q6.3 — Fill in the comparison table (test accuracy, final NLL, number of iterations/epochs).

In [ ]:
# Q6.1 — Convergence plot
# YOUR CODE HERE
In [ ]:
 
[figure: training NLL convergence curves for IRLS, SGD, and Adam]
In [ ]:
def plot_boundary(ax, w, X, y, title=''):
    """Plot decision boundary for a logistic model on 2-D + bias data."""
    h = 0.15   # coarser grid — fast and still smooth enough to see the boundary
    x0_min, x0_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
    x1_min, x1_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
    xx, yy = np.meshgrid(np.arange(x0_min, x0_max, h),
                         np.arange(x1_min, x1_max, h))
    grid = np.c_[xx.ravel(), yy.ravel(), np.ones(xx.size)]
    Z = (sigmoid(grid @ w) >= 0.5).astype(int).reshape(xx.shape)
    cmap_bg = ListedColormap(['#AAAAFF', '#FFAAAA'])
    ax.contourf(xx, yy, Z, alpha=0.35, cmap=cmap_bg)
    ax.scatter(X[y==0, 0], X[y==0, 1], s=12, alpha=0.7, c='steelblue', label='Class 0')
    ax.scatter(X[y==1, 0], X[y==1, 1], s=12, alpha=0.7, c='crimson',   label='Class 1')
    ax.set_xlim(x0_min, x0_max)
    ax.set_ylim(x1_min, x1_max)
    ax.set_title(title); ax.legend(fontsize=8)
In [ ]:
# Q6.3 — Comparison table
print(f"{'Method':8s} | {'Test Acc':9s} | {'Test NLL':9s} | {'Iters/Epochs':12s}")
print("-" * 50)
for name, w, n_iters in [
    ('IRLS', w_irls, len(loss_irls)),
    ('SGD',  w_sgd,  len(loss_sgd)),
    ('Adam', w_adam, len(loss_adam)),
]:
    preds  = (sigmoid(X_te @ w) >= 0.5).astype(int)
    acc    = accuracy_score(y_te, preds)
    t_nll  = nll(w, X_te, y_te)
    print(f"{name:8s} | {acc:.4f}    | {t_nll:.4f}    | {n_iters}")
In [ ]:

Method   | Test Acc  | Test NLL  | Iters/Epochs
----------------------------------------------------
IRLS     | 0.8917    | 0.2678    | 7
SGD      | 0.9000    | 0.2679    | 200
Adam     | 0.8917    | 0.2678    | 200

7. Hyperparameter Sensitivity¶

Q7.1 — For SGD, sweep learning rates $\eta \in \{0.001, 0.01, 0.1, 0.5\}$ and plot the final test accuracy for each.

Q7.2 — For Adam, sweep $\alpha \in \{0.001, 0.003, 0.01, 0.03\}$ similarly.

Q7.3 (written) — Compare the sensitivity of SGD and Adam to the learning rate choice.

In [ ]:
# Q7.1 — SGD LR sweep
# YOUR CODE HERE
In [ ]:
 
[figure: final test accuracy vs learning rate for the SGD sweep]
In [ ]:
# Q7.2 — Adam LR sweep
# YOUR CODE HERE

Q7.3 — Written answer:

[Your answer here]

8. Conceptual Questions¶

Q8.1 — Why does IRLS converge in far fewer iterations than SGD/Adam? What is the trade-off?

Q8.2 — What is the role of the bias-correction terms $(1-\beta_1^t)$ and $(1-\beta_2^t)$ in Adam?

Q8.3 — Suppose the two classes are linearly separable. What happens to the MLE weights, and how does L2 regularisation help?

Q8.1: > [Your answer here]

Q8.2: > [Your answer here]

Q8.3: > [Your answer here]


End of Part B¶