E2 236 Foundations of ML¶

Lab 5: Gaussian Processes¶

Regression & Classification with Laplace Approximation¶

Topics covered:

  • GP priors: mean functions and covariance kernels
  • GP regression (exact posterior)
  • GP classification via the Laplace approximation
  • Predictive distributions and uncertainty quantification

Instructions: Fill in all sections marked # YOUR CODE HERE.


0. Setup & Imports¶

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.linalg import solve, cho_factor, cho_solve
from scipy.special import expit   # sigmoid σ(x) = 1/(1+e^{-x})
from scipy.optimize import minimize
from sklearn.datasets import make_moons
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

np.random.seed(0)
%matplotlib inline
plt.rcParams.update({'figure.dpi': 110, 'font.size': 11})

1. The GP Prior¶

A Gaussian Process is specified by a mean function $m(\mathbf{x})$ and a kernel (covariance function) $k(\mathbf{x}, \mathbf{x}')$.

We write $f \sim \mathcal{GP}(m, k)$.

1.1 Implement the squared-exponential kernel¶

$$k_{\text{SE}}(x, x') = \sigma_f^2 \exp\!\left(-\frac{(x-x')^2}{2\ell^2}\right)$$

In [ ]:
def se_kernel(X1, X2, length_scale=1.0, signal_var=1.0):
    """
    Squared-exponential (RBF) kernel.

    Parameters
    ----------
    X1 : ndarray (n, d)
    X2 : ndarray (m, d)
    length_scale : float  ℓ
    signal_var   : float  σ_f²

    Returns
    -------
    K : ndarray (n, m)
    """
    # YOUR CODE HERE
    raise NotImplementedError
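For reference, one vectorised way to realise this formula — kept under a separate, illustrative name (`se_kernel_sketch`) so it does not pre-empt the exercise stub above:

```python
import numpy as np

def se_kernel_sketch(X1, X2, length_scale=1.0, signal_var=1.0):
    # Pairwise squared distances via broadcasting: (n,1,d) - (1,m,d) -> (n,m,d)
    sq_dists = np.sum((X1[:, None, :] - X2[None, :, :]) ** 2, axis=-1)
    # sigma_f^2 * exp(-d^2 / (2 * ell^2))
    return signal_var * np.exp(-0.5 * sq_dists / length_scale ** 2)
```

Note that the diagonal of $k(X, X)$ equals $\sigma_f^2$, a quick sanity check for any implementation.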

1.2 Sample from the GP Prior¶

Draw 5 functions from $\mathcal{GP}(0, k_{\text{SE}})$ on $x \in [-5, 5]$ for three length-scales: $\ell \in \{0.5, 1.0, 2.0\}$.

Hint: Compute the kernel matrix $K$, then sample $\mathbf{f} \sim \mathcal{N}(\mathbf{0}, K)$ by drawing $\mathbf{z} \sim \mathcal{N}(\mathbf{0}, I)$ and computing $L\mathbf{z}$ where $L = \text{chol}(K + \epsilon I)$.
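The hint above can be sketched as follows; `sample_gp_prior` is an illustrative helper (not a required name) that accepts any kernel with the `se_kernel` call signature:

```python
import numpy as np

def sample_gp_prior(X, kernel, n_samples=5, jitter=1e-8, rng=None):
    rng = np.random.default_rng(rng)
    # jitter * I keeps the Cholesky factorisation numerically stable
    K = kernel(X, X) + jitter * np.eye(len(X))
    L = np.linalg.cholesky(K)
    Z = rng.standard_normal((len(X), n_samples))  # z ~ N(0, I)
    return L @ Z                                  # each column ~ N(0, K)
```

Plotting the columns of the result for each $\ell$ then gives the three panels.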

In [ ]:
# YOUR CODE HERE
In [ ]:
 

2. GP Regression (Exact Posterior)¶

Given observations $(\mathbf{X}, \mathbf{y})$ with Gaussian noise $\varepsilon \sim \mathcal{N}(0, \sigma_n^2)$, the posterior is also a GP with

$$\boldsymbol{\mu}_* = K_*^\top (K + \sigma_n^2 I)^{-1} \mathbf{y}$$

$$\Sigma_* = K_{**} - K_*^\top (K + \sigma_n^2 I)^{-1} K_*$$

where $K = k(\mathbf{X}, \mathbf{X})$, $K_* = k(\mathbf{X}, \mathbf{X}_*)$, $K_{**} = k(\mathbf{X}_*, \mathbf{X}_*)$.
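These two equations translate almost line-for-line into code. A sketch using the Cholesky routines imported in §0 (the helper name `gp_posterior_sketch` is illustrative; the class below should organise the same computation into fit/predict):

```python
import numpy as np
from scipy.linalg import cho_factor, cho_solve

def gp_posterior_sketch(X, y, X_star, kernel, noise_var=0.04):
    K = kernel(X, X) + noise_var * np.eye(len(X))
    K_s = kernel(X, X_star)
    K_ss = kernel(X_star, X_star)
    c_low = cho_factor(K)                        # Cholesky of K + sig_n^2 I
    alpha = cho_solve(c_low, y)                  # (K + sig_n^2 I)^{-1} y
    mu = K_s.T @ alpha                           # posterior mean
    cov = K_ss - K_s.T @ cho_solve(c_low, K_s)   # posterior covariance
    return mu, cov
```

Using a Cholesky solve rather than forming $(K + \sigma_n^2 I)^{-1}$ explicitly is both faster and numerically safer.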

In [ ]:
class GaussianProcessRegressor:
    """
    Exact GP regression with a squared-exponential kernel.

    Parameters
    ----------
    length_scale : float
    signal_var   : float   σ_f²
    noise_var    : float   σ_n²
    """

    def __init__(self, length_scale=1.0, signal_var=1.0, noise_var=0.1):
        self.length_scale = length_scale
        self.signal_var   = signal_var
        self.noise_var    = noise_var

    def _kernel(self, X1, X2):
        return se_kernel(X1, X2,
                         length_scale=self.length_scale,
                         signal_var=self.signal_var)

    def fit(self, X, y):
        """
        Store training data and precompute (K + σ_n² I)^{-1} y.

        Parameters
        ----------
        X : ndarray (n, 1)
        y : ndarray (n,)
        """
        # YOUR CODE HERE
        raise NotImplementedError

    def predict(self, X_star, return_std=False):
        """
        Compute posterior mean (and optionally std) at X_star.

        Parameters
        ----------
        X_star     : ndarray (m, 1)
        return_std : bool

        Returns
        -------
        mu   : ndarray (m,)
        std  : ndarray (m,)  — only if return_std=True
        """
        # YOUR CODE HERE
        raise NotImplementedError
In [ ]:
 

2.1 Fit and visualise¶

  • Generate 20 noisy observations of $y = \sin(x)$ on $[-4, 4]$ with $\sigma_n = 0.2$.
  • Fit your GaussianProcessRegressor.
  • Plot the posterior mean ± 2 standard deviations and 5 posterior samples.
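The data-generation step might look like this (seed and variable names are illustrative; fitting and plotting then use your `GaussianProcessRegressor`):

```python
import numpy as np

rng = np.random.default_rng(1)                          # arbitrary seed
X_train = np.sort(rng.uniform(-4, 4, (20, 1)), axis=0)  # 20 inputs on [-4, 4]
y_train = np.sin(X_train[:, 0]) + 0.2 * rng.standard_normal(20)  # sig_n = 0.2
```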
In [ ]:
# YOUR CODE HERE
In [ ]:
 

2.2 Marginal Likelihood¶

The log marginal likelihood is

$$\log p(\mathbf{y} | \mathbf{X}) = -\tfrac{1}{2}\mathbf{y}^\top(K + \sigma_n^2 I)^{-1}\mathbf{y} - \tfrac{1}{2}\log|K + \sigma_n^2 I| - \tfrac{n}{2}\log 2\pi$$

Q2.2 — Implement log_marginal_likelihood and plot it as a function of length_scale (keep $\sigma_f^2 = 1$, $\sigma_n^2 = 0.04$ fixed). Mark the optimum.
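For reference, the three terms map directly onto a Cholesky factorisation ($\log|K+\sigma_n^2 I| = 2\sum_i \log L_{ii}$). A sketch under an illustrative name, so it does not replace the stub below:

```python
import numpy as np
from scipy.linalg import cho_factor, cho_solve

def lml_sketch(X, y, kernel, noise_var=0.04):
    n = len(X)
    K = kernel(X, X) + noise_var * np.eye(n)
    c, low = cho_factor(K)
    alpha = cho_solve((c, low), y)               # (K + sig_n^2 I)^{-1} y
    # log|K + sig_n^2 I| = 2 * sum(log diag(L)) for the Cholesky factor L
    logdet = 2.0 * np.sum(np.log(np.diag(c)))
    return -0.5 * y @ alpha - 0.5 * logdet - 0.5 * n * np.log(2 * np.pi)
```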

In [ ]:
def log_marginal_likelihood(X, y, length_scale, signal_var=1.0, noise_var=0.04):
    """
    Compute the log marginal likelihood of the GP.

    Returns
    -------
    lml : float
    """
    # YOUR CODE HERE
    raise NotImplementedError

# Plot log marginal likelihood vs length_scale
# YOUR CODE HERE
In [ ]:
 
Best length-scale: 1.750

3. GP Classification with Laplace Approximation¶

For classification, the likelihood $p(y_i | f_i)$ is non-Gaussian (e.g. Bernoulli with a sigmoid link), so the posterior over $\mathbf{f}$ is intractable. The Laplace approximation replaces it with a Gaussian centred at the posterior mode $\hat{\mathbf{f}}$, with covariance given by the inverse of the negative Hessian of the log-posterior at that mode.

Algorithm (binary classification, sigmoid likelihood):¶

  1. Find the posterior mode $\hat{\mathbf{f}}$ by Newton's method: $$\hat{\mathbf{f}} \leftarrow (K^{-1} + W)^{-1}(W\hat{\mathbf{f}} + \nabla \log p(\mathbf{y}|\hat{\mathbf{f}}))$$ where $W = -\nabla^2 \log p(\mathbf{y}|\hat{\mathbf{f}}) = \text{diag}(\pi_i(1-\pi_i))$ and $\pi_i = \sigma(\hat{f}_i)$.

  2. Approximate posterior covariance: $\Sigma = (K^{-1} + W)^{-1}$.

  3. Predictive distribution: $$\mu_* = K_*^\top K^{-1} \hat{\mathbf{f}}$$ $$\sigma_*^2 = k_{**} - K_*^\top (K + W^{-1})^{-1} K_*$$ $$p(y_*=1 | \mathbf{X}, \mathbf{y}, \mathbf{x}_*) \approx \sigma\!\left(\frac{\mu_*}{\sqrt{1 + \pi\sigma_*^2/8}}\right)$$
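Step 1, combined with the Woodbury form from the implementation notes in `fit` below, can be sketched as a standalone function (`find_mode_sketch` is an illustrative name; `K` is the precomputed kernel matrix):

```python
import numpy as np
from scipy.special import expit

def find_mode_sketch(K, y, max_iter=50, tol=1e-6):
    # Newton iterations for the posterior mode (Rasmussen & Williams, Alg. 3.1)
    n = len(y)
    f = np.zeros(n)
    for _ in range(max_iter):
        pi = expit(f)
        W = pi * (1.0 - pi)                # diagonal of W = -Hessian of log lik
        sqrt_W = np.sqrt(W)
        B = np.eye(n) + sqrt_W[:, None] * K * sqrt_W[None, :]
        L = np.linalg.cholesky(B)          # B is well-conditioned by construction
        b = W * f + (y - pi)               # Newton right-hand side
        tmp = np.linalg.solve(L.T, np.linalg.solve(L, sqrt_W * (K @ b)))
        f_new = K @ (b - sqrt_W * tmp)     # Woodbury step: never inverts K
        if np.max(np.abs(f_new - f)) < tol:
            return f_new
        f = f_new
    return f
```

At convergence the mode satisfies the self-consistency condition $\hat{\mathbf{f}} = K(\mathbf{y} - \boldsymbol{\pi})$, which is a useful unit test for your implementation.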

In [ ]:
class GPClassifierLaplace:
    """
    Binary GP classifier using the Laplace approximation.

    Parameters
    ----------
    length_scale : float
    signal_var   : float
    max_iter     : int    Newton iterations
    tol          : float  convergence tolerance
    """

    def __init__(self, length_scale=1.0, signal_var=1.0,
                 max_iter=50, tol=1e-6):
        self.length_scale = length_scale
        self.signal_var   = signal_var
        self.max_iter     = max_iter
        self.tol          = tol

    def _kernel(self, X1, X2):
        return se_kernel(X1, X2,
                         length_scale=self.length_scale,
                         signal_var=self.signal_var)

    def _log_likelihood_grad_hess(self, f, y):
        """
        Bernoulli log-likelihood with sigmoid link.

        Returns
        -------
        ll   : float     log p(y|f)
        grad : ndarray   ∂ll/∂f  =  y - σ(f)
        W    : ndarray   -∂²ll/∂f²  (diagonal vector, positive)
                         W_i = σ(f_i)(1 - σ(f_i))
        """
        # YOUR CODE HERE
        raise NotImplementedError

    def fit(self, X, y):
        """
        Run Newton iterations to find the posterior mode f_hat.

        Implementation notes
        --------------------
        1. Form B = I + W^{1/2} K W^{1/2}  (PD matrix of size n×n).
        2. Compute L = np.linalg.cholesky(B)  — plain lower-triangular factor.
           Do NOT call cho_factor on L again; use np.linalg.solve(L, ...) directly.
        3. Newton RHS: b = W*f + grad
        4. Woodbury step:
               rhs  = sqrt_W * (K @ b)
               tmp  = np.linalg.solve(L.T, np.linalg.solve(L, rhs))
               f_new = K @ (b - sqrt_W * tmp)
        5. Store self.f_hat_, self.sqrt_W_, self.L_B_ for use in predict_proba,
           and return self so calls like GPClassifierLaplace(...).fit(X, y)
           can be chained (the demo in §3.1 relies on this).

        y should be in {0, 1}.
        """
        # YOUR CODE HERE
        raise NotImplementedError

    def predict_proba(self, X_star):
        """
        Predictive probability p(y_*=1 | ...) using Laplace approximation.

        Implementation notes
        --------------------
        mu_s   = K_s.T @ (y - σ(f_hat))
        v      = np.linalg.solve(self.L_B_, self.sqrt_W_[:, None] * K_s)   # (n, m)
        var_s  = diag(K_ss) - einsum("ij,ij->j", v, v)
        kappa  = 1 / sqrt(1 + π·var_s/8)   (probit approximation)
        return σ(kappa * mu_s)
        """
        # YOUR CODE HERE
        raise NotImplementedError

    def predict(self, X_star, threshold=0.5):
        return (self.predict_proba(X_star) >= threshold).astype(int)
In [ ]:
 

3.1 Test on 1-D toy data¶

  • Create a binary dataset: $y=1$ if $\sin(x) > 0$, else $y=0$, with a few label flips for noise.
  • Fit your GPClassifierLaplace.
  • Plot the predictive probability $p(y_*=1|x_*)$ along with the training data.
In [ ]:
# YOUR CODE HERE
In [8]:
# 3.1 — 1-D toy dataset
rng4 = np.random.default_rng(3)
X1d  = rng4.uniform(-5, 5, (60, 1))
y1d  = (np.sin(X1d[:, 0]) > 0).astype(float)
# add ~10% label noise
flip = rng4.random(60) < 0.1
y1d[flip] = 1 - y1d[flip]

gpc1d = GPClassifierLaplace(length_scale=1.2).fit(X1d, y1d)

X_s1d = np.linspace(-6, 6, 300).reshape(-1, 1)
proba = gpc1d.predict_proba(X_s1d)

plt.figure(figsize=(9, 3))
plt.plot(X_s1d, proba, 'b-', lw=2, label='$p(y=1|x_*)$')
plt.scatter(X1d[y1d==1, 0], np.ones(int(y1d.sum())), marker='|', s=60,
            c='green', label='y=1')
plt.scatter(X1d[y1d==0, 0], np.zeros(int((y1d==0).sum())), marker='|',
            s=60, c='red', label='y=0')
plt.axhline(0.5, color='k', linestyle='--', alpha=0.4)
plt.ylim(-0.1, 1.1); plt.xlabel('$x$')
plt.title('GP Classifier — Laplace Approximation (1-D)')
plt.legend(); plt.tight_layout(); plt.show()

3.2 Test on 2-D make_moons¶

  • Load make_moons(n_samples=200, noise=0.2).
  • Fit GPClassifierLaplace with length_scale=0.5.
  • Plot the decision boundary and predictive probability as a heatmap.
  • Report test accuracy.
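Your results can be cross-checked against scikit-learn's `GaussianProcessClassifier`, which also uses a Laplace approximation internally. This baseline is not part of the required solution (that should use your `GPClassifierLaplace`), but gives a sanity-check accuracy for the same data and length-scale:

```python
import numpy as np
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.metrics import accuracy_score

X, y = make_moons(n_samples=200, noise=0.2, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=0)

# optimizer=None fixes the kernel hyperparameters, matching the lab setup
gpc = GaussianProcessClassifier(kernel=RBF(length_scale=0.5), optimizer=None)
gpc.fit(X_tr, y_tr)
baseline_acc = accuracy_score(y_te, gpc.predict(X_te))
```

Your implementation should land in the same accuracy ballpark as this baseline.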
In [ ]:
# YOUR CODE HERE
In [ ]:
 
Test accuracy: 1.000

4. Hyperparameter Tuning via Log Marginal Likelihood¶

Q4 (optional / advanced) — Implement gradient-based optimisation of the GP regression hyperparameters $\{\ell, \sigma_f^2, \sigma_n^2\}$ by maximising the log marginal likelihood. Use scipy.optimize.minimize with method='L-BFGS-B' on the negative log marginal likelihood. Plot the fitted posterior before and after optimisation.
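One way to sketch the optimisation (the helper name `neg_lml`, the synthetic data, and the initial values are all illustrative assumptions; optimising in log-space keeps every parameter positive without explicit bounds):

```python
import numpy as np
from scipy.linalg import cho_factor, cho_solve
from scipy.optimize import minimize

def neg_lml(log_params, X, y):
    # Unpack in log-space so all three hyperparameters stay positive
    ell, sf2, sn2 = np.exp(log_params)
    d = np.sum((X[:, None, :] - X[None, :, :]) ** 2, axis=-1)
    # small jitter guards the Cholesky when the optimiser probes tiny sn2
    K = sf2 * np.exp(-0.5 * d / ell ** 2) + (sn2 + 1e-9) * np.eye(len(X))
    c_low = cho_factor(K)
    alpha = cho_solve(c_low, y)
    logdet = 2.0 * np.sum(np.log(np.diag(c_low[0])))
    return 0.5 * y @ alpha + 0.5 * logdet + 0.5 * len(X) * np.log(2 * np.pi)

rng = np.random.default_rng(2)
X = rng.uniform(-4, 4, (20, 1))
y = np.sin(X[:, 0]) + 0.2 * rng.standard_normal(20)

x0 = np.log([1.0, 1.0, 0.1])                     # initial (ell, sf2, sn2)
res = minimize(neg_lml, x0, args=(X, y), method='L-BFGS-B')
ell_opt, sf2_opt, sn2_opt = np.exp(res.x)
```

Refitting the GP with the optimised hyperparameters, and plotting it next to the initial fit, completes the exercise.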

In [ ]:
# YOUR CODE HERE (optional)
In [ ]:
 
Optimised: ℓ=1.211, σ_f²=0.475, σ_n²=0.0219

5. Conceptual Questions¶

Q5.1 — What is the key difference between GP regression and GP classification in terms of tractability of the posterior?

Q5.2 — Explain the role of the noise variance $\sigma_n^2$ in GP regression. What happens when $\sigma_n^2 \to 0$?

Q5.3 — Why is the Laplace approximation not always accurate for GP classification? When would you expect it to fail?

Q5.1: > [Your answer here]

Q5.2: > [Your answer here]

Q5.3: > [Your answer here]


End of Part B¶