E2 236 Foundations of Machine Learning Lab 2¶
Regression¶
Student Name: _________________
Last five digits of your SR number: _________________
Date: _________________
Instructions¶
- Fill in the last five digits of your SR number above; this will be used as the random seed for reproducibility
- Complete all code cells marked with # YOUR CODE HERE
- Do not modify the test cells or data generation code
- This lab component will not be graded, so the solutions need not be submitted
# Import required libraries
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler # Added StandardScaler
from scipy.optimize import linprog, minimize
import warnings
warnings.filterwarnings('ignore')
# Set plot style
plt.style.use('seaborn-v0_8-darkgrid' if 'seaborn-v0_8-darkgrid' in plt.style.available else 'default')
# Example student ID for demonstration
STUDENT_ID = 12345 # This will vary for each student
np.random.seed(STUDENT_ID)
print(f"Random seed set to: {STUDENT_ID}")
Random seed set to: 12345
Task 1: Linear Regression and Ridge Regression on Gaussian Data¶
In this task, you will:
- Implement Ordinary Least Squares (OLS) Linear Regression from scratch
- Implement Ridge Regression (L2 Regularization) from scratch
- Compare their performance on synthetic Gaussian data
# Generate synthetic Gaussian data
np.random.seed(STUDENT_ID)
n_samples = 200
n_features = 1
X = np.random.randn(n_samples, n_features) * 2
true_weights = np.array([3.0])
true_bias = 2.0
noise = np.random.randn(n_samples) * 1.5
y = X @ true_weights + true_bias + noise
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.3, random_state=STUDENT_ID
)
# Visualize
plt.figure(figsize=(10, 6))
plt.scatter(X_train, y_train, alpha=0.6, label='Training data', edgecolors='k')
plt.scatter(X_test, y_test, alpha=0.6, label='Test data', edgecolors='k')
plt.xlabel('X')
plt.ylabel('y')
plt.title('Synthetic Gaussian Data for Regression')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
print(f"Training samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")
print(f"True parameters: weight={true_weights[0]}, bias={true_bias}")
Training samples: 140
Test samples: 60
True parameters: weight=3.0, bias=2.0
1.1 Least Squares Linear Regression¶
Implement least squares linear regression using the closed-form solution.
class LinearRegression:
def __init__(self):
"""
Initialize Linear Regression model.
"""
self.weights = None
self.bias = None
def fit(self, X, y):
"""
Fit the linear regression model using the closed-form solution.
Parameters:
-----------
X : numpy array of shape (n_samples, n_features)
Training data
y : numpy array of shape (n_samples,)
Target values
"""
# YOUR CODE HERE
# 1. Add a column of ones to X for the bias term
# 2. Compute beta = (X^T X)^(-1) X^T y
# 3. Extract weights and bias from beta
# Hint: Use np.linalg.inv() or np.linalg.solve()
pass
def predict(self, X):
"""
Make predictions.
Parameters:
-----------
X : numpy array of shape (n_samples, n_features)
Data to predict
Returns:
--------
predictions : numpy array of shape (n_samples,)
Predicted values
"""
# YOUR CODE HERE
# Return X @ weights + bias
pass
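For reference, a minimal sketch of the closed-form fit (one possible fill-in; the helper name _fit_ols is illustrative and not part of the required class API):
import numpy as np
def _fit_ols(X, y):
    """Illustrative helper: solve the normal equations with an explicit bias column."""
    X_aug = np.hstack([X, np.ones((X.shape[0], 1))])      # [X | 1], so the last coefficient is the bias
    beta = np.linalg.solve(X_aug.T @ X_aug, X_aug.T @ y)  # np.linalg.solve is more stable than explicit inversion
    return beta[:-1], beta[-1]                            # (weights, bias)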
# Train OLS Linear Regression
# YOUR CODE HERE
# 1. Create a LinearRegression instance
# 2. Fit it on training data
# 3. Make predictions on test data
# 4. Print the learned parameters
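One way to complete this cell, assuming fit and predict above have been filled in (the plotting cell below expects the fitted model to be named ols):
ols = LinearRegression()
ols.fit(X_train, y_train)
y_pred_ols = ols.predict(X_test)
print(f"Learned weight: {ols.weights[0]:.3f} (true: {true_weights[0]})")
print(f"Learned bias:   {ols.bias:.3f} (true: {true_bias})")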
# Visualization
plt.figure(figsize=(14, 5))
# Plot 1: All models
plt.subplot(1, 2, 1)
plt.scatter(X_test, y_test, alpha=0.5, label='Test data', edgecolors='k', c='red')
X_line = np.linspace(X_test.min(), X_test.max(), 100).reshape(-1, 1)
plt.plot(X_line, ols.predict(X_line), label='Least squares', linewidth=2)
plt.xlabel('X')
plt.ylabel('y')
plt.title('Least squares')
plt.legend()
plt.grid(True, alpha=0.3)
1.2 Ridge Regression (L2 Regularization)¶
Implement Ridge Regression using the closed-form solution:
$$\mathbf{w} = (\mathbf{X}^T \mathbf{X} + \lambda \mathbf{I})^{-1} \mathbf{X}^T \mathbf{y}$$
where $\lambda$ is the regularization parameter.
class RidgeRegression:
def __init__(self, alpha=1.0):
"""
Initialize Ridge Regression model.
Parameters:
-----------
alpha : float
Regularization strength (lambda)
"""
self.alpha = alpha
self.weights = None
self.bias = None
def fit(self, X, y):
"""
Fit the ridge regression model.
Parameters:
-----------
X : numpy array of shape (n_samples, n_features)
Training data
y : numpy array of shape (n_samples,)
Target values
"""
# YOUR CODE HERE
# 1. Add a column of ones to X for the bias term
# 2. Create identity matrix I (same size as X^T X)
# 3. Set the last diagonal element to 0 (don't regularize bias)
# 4. Compute beta = (X^T X + alpha * I)^(-1) X^T y
# 5. Extract weights and bias
pass
def predict(self, X):
"""
Make predictions.
Parameters:
-----------
X : numpy array of shape (n_samples, n_features)
Data to predict
Returns:
--------
predictions : numpy array of shape (n_samples,)
Predicted values
"""
# YOUR CODE HERE
pass
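For reference, a minimal sketch of the regularized normal equations (illustrative helper, not the required API; the last diagonal entry of the identity is zeroed so the bias is not penalized):
import numpy as np
def _fit_ridge(X, y, alpha):
    """Illustrative helper: ridge solution with an unpenalized bias term."""
    X_aug = np.hstack([X, np.ones((X.shape[0], 1))])      # [X | 1]
    I = np.eye(X_aug.shape[1])
    I[-1, -1] = 0.0                                       # do not regularize the bias
    beta = np.linalg.solve(X_aug.T @ X_aug + alpha * I, X_aug.T @ y)
    return beta[:-1], beta[-1]                            # (weights, bias)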
# Train Ridge Regression with different alpha values
# YOUR CODE HERE
# Try alpha values: [0.01, 0.1, 1.0, 10.0]
# Store models and predictions for comparison
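A possible completion (the plotting cell below assumes a list named alphas and a dict ridge_models keyed by alpha):
alphas = [0.01, 0.1, 1.0, 10.0]
ridge_models = {}
for alpha in alphas:
    model = RidgeRegression(alpha=alpha)
    model.fit(X_train, y_train)
    ridge_models[alpha] = model
    print(f"alpha={alpha:>5}: weight={model.weights[0]:.3f}, bias={model.bias:.3f}")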
# Visualization
plt.figure(figsize=(14, 5))
# Plot 1: All models
plt.subplot(1, 2, 1)
plt.scatter(X_test, y_test, alpha=0.5, label='Test data', edgecolors='k',c='red')
X_line = np.linspace(X_test.min(), X_test.max(), 100).reshape(-1, 1)
for alpha in alphas:
plt.plot(X_line, ridge_models[alpha].predict(X_line),
label=f'Ridge (α={alpha})', linewidth=2)
plt.xlabel('X')
plt.ylabel('y')
plt.title('Model Comparison - Ridge regression')
plt.legend()
plt.grid(True, alpha=0.3)
1.3 Evaluation and Comparison¶
def evaluate_regression(y_true, y_pred, model_name):
"""
Evaluate regression model.
Parameters:
-----------
y_true : array-like
True target values
y_pred : array-like
Predicted values
model_name : str
Name of the model
"""
# YOUR CODE HERE
# Compute and print:
# 1. Mean Squared Error (MSE)
# 2. Root Mean Squared Error (RMSE)
# 3. Mean Absolute Error (MAE)
# 4. R² Score
pass
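One possible body, using the metrics imported at the top of the notebook (returning the values is optional and not required by the skeleton):
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
def evaluate_regression(y_true, y_pred, model_name):
    """Illustrative implementation: compute and print standard regression metrics."""
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    print(f"{model_name}: MSE={mse:.4f}, RMSE={rmse:.4f}, MAE={mae:.4f}, R²={r2:.4f}")
    return mse, rmse, mae, r2
The next cell can then call evaluate_regression(y_test, y_pred_ols, 'OLS') and loop over ridge_models to evaluate each alpha.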
# YOUR CODE HERE
# Evaluate all models (OLS and Ridge with different alphas)
# Create a visualization comparing predictions
Task 2: Robust Regression with Outliers¶
In this task, you will:
- Generate data with outliers
- Implement MAE-based Linear Programming for robust regression
- Implement regression with Huber loss
- Compare robustness to outliers
# Generate data with outliers
np.random.seed(STUDENT_ID)
n_samples = 150
n_outliers = 15
# Generate clean data
X_clean = np.random.randn(n_samples, 1) * 2
y_clean = 2.5 * X_clean.ravel() + 1.5 + np.random.randn(n_samples) * 0.5
# Add outliers
outlier_indices = np.random.choice(n_samples, n_outliers, replace=False)
y_outliers = y_clean.copy()
y_outliers[outlier_indices] += np.random.choice([-1, 1], n_outliers) * np.random.uniform(8, 15, n_outliers)  # random sign so every selected point is shifted
# Split data
X_train_out, X_test_out, y_train_out, y_test_out = train_test_split(
X_clean, y_outliers, test_size=0.3, random_state=STUDENT_ID
)
# Visualize
plt.figure(figsize=(10, 6))
plt.scatter(X_train_out, y_train_out, alpha=0.6, label='Training data', edgecolors='k')
plt.scatter(X_test_out, y_test_out, alpha=0.6, label='Test data', edgecolors='k')
plt.xlabel('X')
plt.ylabel('y')
plt.title('Data with Outliers')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
print(f"Total samples: {n_samples}")
print(f"Number of outliers: {n_outliers}")
print(f"Outlier percentage: {n_outliers/n_samples*100:.1f}%")
Total samples: 150
Number of outliers: 15
Outlier percentage: 10.0%
2.1 MAE-based ERM with Linear Programming¶
Minimize the Mean Absolute Error using Linear Programming:
$$\min_{\mathbf{w},\; r_i^+ \geq 0,\; r_i^- \geq 0} \; \sum_{i=1}^{N} \left( r_i^+ + r_i^- \right)$$
subject to: $$\mathbf{w}^T \mathbf{x}_i + r_i^+ - r_i^- = y_i, \quad i = 1, 2, \cdots, N$$
where, at the optimum, $r_i^+ + r_i^- = |y_i - \mathbf{w}^T \mathbf{x}_i|$ is the absolute error for sample $i$. (The bias can be absorbed into $\mathbf{w}$ by appending a constant feature of 1 to each $\mathbf{x}_i$, or handled as a separate free variable in the LP.)
def mae_linear_program(X, y):
"""
Solve L1 regression (MAE minimization) using linear programming.
Parameters:
-----------
X : numpy array of shape (n_samples, n_features)
Training data
y : numpy array of shape (n_samples,)
Target values
Returns:
--------
weights : numpy array
Learned weights
bias : float
Learned bias
"""
n_samples, n_features = X.shape
# YOUR CODE HERE
# Use scipy.optimize.linprog
pass
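One possible completion using scipy.optimize.linprog, with decision vector [w, b, r+, r-] (the bias b is included as a free variable, consistent with the function returning both weights and bias):
import numpy as np
from scipy.optimize import linprog
def mae_linear_program(X, y):
    """L1 regression via LP: minimize sum(r+ + r-) subject to Xw + b + r+ - r- = y."""
    n_samples, n_features = X.shape
    # Objective: only the residual parts r+ and r- appear in the cost.
    c = np.zeros(n_features + 1 + 2 * n_samples)
    c[n_features + 1:] = 1.0
    # Equality constraints: X w + b + r+ - r- = y
    A_eq = np.hstack([X,
                      np.ones((n_samples, 1)),
                      np.eye(n_samples),
                      -np.eye(n_samples)])
    b_eq = y
    # w and b are free; r+ and r- are non-negative.
    bounds = [(None, None)] * (n_features + 1) + [(0, None)] * (2 * n_samples)
    res = linprog(c, A_eq=A_eq, b_eq=b_eq, bounds=bounds, method='highs')
    weights = res.x[:n_features]
    bias = res.x[n_features]
    return weights, bias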
# YOUR CODE HERE
# 1. Fit MAE regression using linear programming
# 2. Make predictions
# 3. Compare with OLS on the outlier data
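A possible completion (the plotting cell below expects an OLS model named ols_outlier fitted on the outlier data, plus LP parameters mae_weights and mae_bias):
ols_outlier = LinearRegression()
ols_outlier.fit(X_train_out, y_train_out)
mae_weights, mae_bias = mae_linear_program(X_train_out, y_train_out)
y_pred_ols_out = ols_outlier.predict(X_test_out)
y_pred_mae_out = X_test_out @ mae_weights + mae_bias
print(f"OLS : weight={ols_outlier.weights[0]:.3f}, bias={ols_outlier.bias:.3f}")
print(f"MAE : weight={mae_weights[0]:.3f}, bias={mae_bias:.3f}")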
# Visualization
import numpy as np
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 6))
# Plot test data with outliers
plt.scatter(X_test_out, y_test_out, alpha=0.6, label='Test data (with outliers)', edgecolors='k', c='red')
# Generate a range of X values for plotting regression lines
X_line_out = np.linspace(X_test_out.min() - 0.5, X_test_out.max() + 0.5, 100).reshape(-1, 1)
# Plot OLS regression line trained on outlier data
plt.plot(X_line_out, ols_outlier.predict(X_line_out), label='OLS on Outlier Data', linewidth=2, linestyle='--')
# Plot MAE regression line
if mae_weights is not None and mae_bias is not None:
plt.plot(X_line_out, X_line_out @ mae_weights + mae_bias, label='MAE (LP) on Outlier Data', linewidth=2, linestyle='-')
plt.xlabel('X')
plt.ylabel('y')
plt.title('OLS vs MAE Regression on Data with Outliers')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
2.3 Comparison of Robust Methods¶
# YOUR CODE HERE
# Create a comprehensive comparison:
# 1. Fit OLS, MAE (LP), and Huber regression on outlier data
# 2. Plot all three regression lines on the same plot
# 3. Compute and compare MSE and MAE for each method
# 4. Discuss which method is most robust to outliers
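Huber loss has no closed-form solution; a minimal sketch uses the scipy.optimize.minimize import from the top of the notebook (the function name huber_regression and the default delta=1.0 are illustrative choices, not prescribed by the lab):
import numpy as np
from scipy.optimize import minimize
def huber_regression(X, y, delta=1.0):
    """Illustrative sketch: minimize the mean Huber loss over (weights, bias)."""
    n_samples, n_features = X.shape
    def huber_objective(params):
        w, b = params[:-1], params[-1]
        r = y - (X @ w + b)
        quad = 0.5 * r ** 2                      # quadratic region, |r| <= delta
        lin = delta * (np.abs(r) - 0.5 * delta)  # linear region, |r| > delta
        return np.mean(np.where(np.abs(r) <= delta, quad, lin))
    x0 = np.zeros(n_features + 1)
    res = minimize(huber_objective, x0, method='BFGS')
    return res.x[:-1], res.x[-1]
Fitting huber_regression(X_train_out, y_train_out) alongside OLS and the LP solution, then calling evaluate_regression on each, gives the comparison asked for above.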
Task 3: Overfitting, Bias-Variance Tradeoff, and Regularization¶
In this task, you will:
- Explore overfitting using polynomial features
- Understand the bias-variance tradeoff
- Implement and compare L1 (Lasso), L2 (Ridge), and Lp norm regularization
# Generate data for overfitting demonstration
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(STUDENT_ID)
n_samples = 30 # Small dataset to encourage overfitting
X_poly = np.sort(np.random.uniform(-3, 3, n_samples))
y_true_poly = np.sin(X_poly) + 0.5 * X_poly
y_poly = y_true_poly + np.random.randn(n_samples) * 0.3
X_poly = X_poly.reshape(-1, 1)
# Create test data for evaluation
X_test_poly = np.linspace(-3, 3, 200).reshape(-1, 1)
y_test_true = np.sin(X_test_poly.ravel()) + 0.5 * X_test_poly.ravel()
# Visualize
plt.figure(figsize=(10, 6))
plt.scatter(X_poly, y_poly, alpha=0.8, s=80, edgecolors='k', label='Training data')
plt.plot(X_test_poly, y_test_true, 'g--', linewidth=2, label='True function')
plt.xlabel('X')
plt.ylabel('y')
plt.title('Data for Overfitting Analysis')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
print(f"Training samples: {n_samples}")
Training samples: 30
3.1 Demonstrating Overfitting with Polynomial Features¶
# YOUR CODE HERE
# 1. Fit polynomial regression with degrees [1, 3, 5, 10, 15]
# 2. For each degree:
# a. Transform features using PolynomialFeatures
# b. Fit OLS regression
# c. Compute training and test MSE
# d. Store results
# 3. Plot all polynomial fits on the same graph
# 4. Create a plot showing training vs test error as a function of degree
#
# Hint: Use sklearn.preprocessing.PolynomialFeatures
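One way to structure this experiment, sketched with the from-scratch LinearRegression class from Task 1 (degree 15 without feature scaling can be numerically ill-conditioned, which is part of what the exercise illustrates):
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
degrees = [1, 3, 5, 10, 15]
train_errors, test_errors = [], []
for degree in degrees:
    poly = PolynomialFeatures(degree=degree, include_bias=False)
    X_train_d = poly.fit_transform(X_poly)
    X_test_d = poly.transform(X_test_poly)
    model = LinearRegression()                  # the from-scratch class from Task 1
    model.fit(X_train_d, y_poly)
    train_errors.append(mean_squared_error(y_poly, model.predict(X_train_d)))
    test_errors.append(mean_squared_error(y_test_true, model.predict(X_test_d)))
for d, tr, te in zip(degrees, train_errors, test_errors):
    print(f"degree {d:>2}: train MSE={tr:.4f}, test MSE={te:.4f}")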
3.2 Lasso Regression (L1 Regularization)¶
Lasso regression minimizes:
$$\min_{w, b} \frac{1}{2n}\sum_{i=1}^{n}(y_i - (w^T x_i + b))^2 + \alpha \|w\|_1$$
where $\|w\|_1 = \sum_{j}|w_j|$
class LassoRegression:
def __init__(self, alpha=1.0, learning_rate=0.01, n_iterations=1000):
"""
Initialize Lasso Regression.
Parameters:
-----------
alpha : float
Regularization strength
learning_rate : float
Learning rate
n_iterations : int
Number of iterations
"""
self.alpha = alpha
self.learning_rate = learning_rate
self.n_iterations = n_iterations
self.weights = None
self.bias = None
self.losses = []
def soft_threshold(self, x, threshold):
"""
Soft thresholding operator for L1 regularization.
Parameters:
-----------
x : float or numpy array
Input value(s)
threshold : float
Threshold value
Returns:
--------
result : float or numpy array
Soft-thresholded value(s)
"""
# YOUR CODE HERE
# Implement: sign(x) * max(|x| - threshold, 0)
pass
def fit(self, X, y):
"""
Fit Lasso regression using coordinate descent or gradient descent.
"""
n_samples, n_features = X.shape
# YOUR CODE HERE
# Use gradient descent with soft thresholding:
# 1. Initialize weights and bias
# 2. For each iteration:
# a. Compute predictions and residuals
# b. Compute gradients
# c. Update weights using gradient and soft thresholding
# d. Update bias normally (no regularization)
# e. Compute and store loss
pass
def predict(self, X):
# YOUR CODE HERE
pass
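A compact sketch of the two pieces as standalone helpers (illustrative; the class methods follow the same logic):
import numpy as np
def soft_threshold(x, threshold):
    """Soft thresholding: sign(x) * max(|x| - threshold, 0)."""
    return np.sign(x) * np.maximum(np.abs(x) - threshold, 0.0)
def _fit_lasso(X, y, alpha=1.0, learning_rate=0.01, n_iterations=1000):
    """Illustrative proximal gradient descent for the Lasso objective."""
    n_samples, n_features = X.shape
    w = np.zeros(n_features)
    b = 0.0
    for _ in range(n_iterations):
        residuals = X @ w + b - y
        grad_w = X.T @ residuals / n_samples
        grad_b = residuals.mean()
        w = soft_threshold(w - learning_rate * grad_w, learning_rate * alpha)  # prox step handles the L1 term
        b -= learning_rate * grad_b                                            # bias is not regularized
    return w, b
For the next cell, the same loop pattern as the ridge comparison can be reused with LassoRegression(alpha=...) on PolynomialFeatures(degree=10) outputs; standardizing those features first (StandardScaler is already imported) usually helps the gradient steps converge.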
# YOUR CODE HERE
# Train Lasso with different alpha values on polynomial features (degree 10)
3.3 Lp Norm Regularization¶
Generalize to Lp norm regularization:
$$\min_{w, b} \frac{1}{2n}\sum_{i=1}^{n}(y_i - (w^T x_i + b))^2 + \alpha \|w\|_p^p$$
where $\|w\|_p = (\sum_{j}|w_j|^p)^{1/p}$
Test with $p \in \{0.5, 1, 1.5, 2\}$
class LpRegression:
def __init__(self, p=2, alpha=1.0, learning_rate=0.01, n_iterations=1000):
"""
Initialize Lp norm regularized regression.
Parameters:
-----------
p : float
Norm parameter (e.g., 1 for L1, 2 for L2)
alpha : float
Regularization strength
learning_rate : float
Learning rate
n_iterations : int
Number of iterations
"""
self.p = p
self.alpha = alpha
self.learning_rate = learning_rate
self.n_iterations = n_iterations
self.weights = None
self.bias = None
self.losses = []
def lp_gradient(self, weights):
"""
Compute gradient of Lp norm regularization term.
Parameters:
-----------
weights : numpy array
Current weights
Returns:
--------
gradient : numpy array
Gradient of Lp penalty
"""
# YOUR CODE HERE
# Gradient of ||w||_p^p is: p * |w|^(p-1) * sign(w)
pass
def fit(self, X, y):
"""
Fit Lp regularized regression.
"""
n_samples, n_features = X.shape
# YOUR CODE HERE
# Similar to Lasso but use lp_gradient instead of soft thresholding
pass
def predict(self, X):
# YOUR CODE HERE
pass
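A sketch of the penalty gradient and a plain gradient-descent fit (illustrative standalone helpers; the small eps caps the gradient blow-up near w_j = 0 when p < 1):
import numpy as np
def _lp_gradient(weights, p, eps=1e-8):
    """Gradient of ||w||_p^p: p * |w|^(p-1) * sign(w)."""
    return p * np.sign(weights) * (np.abs(weights) + eps) ** (p - 1)
def _fit_lp(X, y, p=2.0, alpha=1.0, learning_rate=0.01, n_iterations=1000):
    """Illustrative gradient descent for the Lp-regularized least-squares objective."""
    n_samples, n_features = X.shape
    w = np.zeros(n_features)
    b = 0.0
    for _ in range(n_iterations):
        residuals = X @ w + b - y
        grad_w = X.T @ residuals / n_samples + alpha * _lp_gradient(w, p)
        grad_b = residuals.mean()
        w -= learning_rate * grad_w
        b -= learning_rate * grad_b
    return w, b
For the next cell, counting coefficients with np.sum(np.abs(weights) > 1e-6) is one simple way to compare sparsity across p values.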
# YOUR CODE HERE
# Train models with p = [0.5, 1.0, 1.5, 2.0]
# Use polynomial features of degree 10
# Compare sparsity of learned weights
3.4 Comprehensive Comparison and Analysis¶
# YOUR CODE HERE
# Create comprehensive visualizations:
#
# 1. Plot showing all regularization methods on the same graph
# (No regularization, L1, L2, Lp with different p values)
#
# 2. Bar plot comparing number of non-zero coefficients for each method
# (to show sparsity)
#
# 3. Plot showing coefficient values for each method
# (to visualize shrinkage)
#
# 4. Table comparing train/test MSE for all methods
================================================================================
Method              Train MSE            Test MSE    Non-zero
--------------------------------------------------------------------------------
No Reg         31902756162.5732    25091332980.1736          10
Ridge (L2)        27003210.4675       22769894.4538          10
Lasso (L1)               0.0626              0.0659           5
L0.5                     0.0658              0.0652           7
L1.5                     0.0605              0.0647          10
================================================================================