E2 236: Foundations of Machine Learning¶
Lab 1 Classification¶
Student Name: _________________
Last five digits of your SR number: _________________
Date: _________________
Instructions¶
- Fill in the last five digits of your SR number above - this will be used as the random seed for reproducibility
- Complete all code cells marked with # YOUR CODE HERE
- Do not modify the test cells or data generation code
- This lab component will not be graded, so the solutions need not be submitted
# Import required libraries
import numpy as np
import matplotlib.pyplot as plt
from numpy.random import seed, multivariate_normal
from sklearn.datasets import make_classification, make_blobs, load_iris
from sklearn.model_selection import train_test_split
%config InlineBackend.figure_format = "retina"
plt.rcParams["figure.figsize"] = (12, 5)
# Example student ID for demonstration
STUDENT_ID = 12345  # Example value only - replace with the last five digits of your SR number
np.random.seed(STUDENT_ID)
print(f"Random seed set to: {STUDENT_ID}")
Task 1: Binary Classification on Gaussian Synthetic Dataset¶
# Generate binary classification dataset
cov = np.array([
[1, 0.25],
[0.25, 1]
])
mu1 = (3, 8)
mu2 = (3, 1)
seed(STUDENT_ID)
# The training dataset
X1 = multivariate_normal(mu1, cov, 500)
X2 = multivariate_normal(mu2, cov, 500)
y = np.concatenate([np.zeros(500), np.ones(500)])
X = np.concatenate([X1, X2])
# Get the number of samples
n_samples = X.shape[0]
# Generate a shuffled array of indices
shuffled_indices = np.random.permutation(n_samples)
# Apply the shuffled indices to X and y_binary
X_binary = X[shuffled_indices]
y_binary = y[shuffled_indices]
# Convert labels to {-1, 1} for perceptron and linear programming
y_binary_pm = 2 * y_binary - 1
# Split the data
X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(
X_binary, y_binary, test_size=0.3, random_state=STUDENT_ID
)
y_train_b_pm = 2 * y_train_b - 1
y_test_b_pm = 2 * y_test_b - 1
# Visualize the dataset
plt.figure(figsize=(8, 6))
plt.scatter(X_binary[y_binary == 0, 0], X_binary[y_binary == 0, 1],
c='gray', label='Class 0', alpha=0.6, edgecolors='k')
plt.scatter(X_binary[y_binary == 1, 0], X_binary[y_binary == 1, 1],
c='red', label='Class 1', alpha=0.6, edgecolors='k')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Binary Classification Dataset')
plt.legend()
plt.grid(True, alpha=0.7)
plt.show()
print(f"Training set size: {len(X_train_b)}")
print(f"Test set size: {len(X_test_b)}")
Training set size: 700
Test set size: 300
1.1 Perceptron Algorithm¶
Implement the perceptron algorithm from scratch. The perceptron updates weights as follows:
For each misclassified point $(x_i, y_i)$: $$w \leftarrow w + \eta \cdot y_i \cdot x_i$$ $$b \leftarrow b + \eta \cdot y_i$$
where $\eta$ is the learning rate.
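For reference, the update rule above can be exercised on a tiny toy dataset as in the minimal sketch below. The toy arrays and variable names are illustrative assumptions, not part of the lab data, and this is not the Perceptron class you are asked to implement next.
import numpy as np
# Illustrative toy data, linearly separable, labels in {-1, +1}
X_toy = np.array([[2.0, 3.0], [1.0, 0.5], [3.0, 4.0], [0.5, 1.0]])
y_toy = np.array([1, -1, 1, -1])
eta = 0.1                      # learning rate
w = np.zeros(X_toy.shape[1])   # weight vector
b = 0.0                        # bias
for _ in range(10):            # a few passes over the toy data
    for x_i, y_i in zip(X_toy, y_toy):
        # A point is misclassified when y_i * (w . x_i + b) <= 0
        if y_i * (np.dot(w, x_i) + b) <= 0:
            w += eta * y_i * x_i   # w <- w + eta * y_i * x_i
            b += eta * y_i         # b <- b + eta * y_i
print("weights:", w, "bias:", b)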
class Perceptron:
def __init__(self, learning_rate=0.01, n_iterations=1000):
"""
Initialize the Perceptron.
Parameters:
-----------
learning_rate : float
Learning rate for weight updates
n_iterations : int
Maximum number of iterations
"""
self.learning_rate = learning_rate
self.n_iterations = n_iterations
self.weights = None
self.bias = None
def fit(self, X, y):
"""
Train the perceptron on the given data.
Parameters:
-----------
X : numpy array of shape (n_samples, n_features)
Training data
y : numpy array of shape (n_samples,)
Labels (should be -1 or 1)
"""
# YOUR CODE HERE
# Initialize weights and bias
# Implement the perceptron learning algorithm
# Hint: Loop through iterations, and for each iteration,
# loop through all samples and update weights for misclassified points
pass
def predict(self, X):
"""
Predict class labels for samples in X.
Parameters:
-----------
X : numpy array of shape (n_samples, n_features)
Data to predict
Returns:
--------
predictions : numpy array of shape (n_samples,)
Predicted class labels (-1 or 1)
"""
# YOUR CODE HERE
# Compute: sign(X @ weights + bias)
pass
# Train the perceptron
perceptron = Perceptron(learning_rate=0.01, n_iterations=1000)
# YOUR CODE HERE: Fit the perceptron on training data
# Make predictions
# YOUR CODE HERE: Get predictions on test data
# Visualize the perceptron result
# The decision boundary is defined by w1*x1 + w2*x2 + b = 0
# Rearranging for x2 (y-axis) gives x2 = (-w1/w2)*x1 - (b/w2)
slope = -perceptron.weights[0] / perceptron.weights[1]
intercept = -perceptron.bias / perceptron.weights[1]
projected_space = np.linspace(X_binary[:, 0].min() - 1, X_binary[:, 0].max() + 1, 100) # Use a wider range for x-axis
projected_line = projected_space * slope + intercept
plt.figure(figsize=(8, 6))
plt.scatter(X_binary[y_binary == 0, 0], X_binary[y_binary == 0, 1],
c='gray', label='Class 0', alpha=0.6, edgecolors='k')
plt.scatter(X_binary[y_binary == 1, 0], X_binary[y_binary == 1, 1],
c='red', label='Class 1', alpha=0.6, edgecolors='k')
plt.plot(projected_space, projected_line, linestyle="dashed", c="black", linewidth=3, label='Decision Boundary')
plt.xlim(X_binary[:, 0].min() - 0.5, X_binary[:, 0].max() + 0.5)
plt.ylim(X_binary[:, 1].min() - 0.5, X_binary[:, 1].max() + 0.5)
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Perceptron Decision Boundary')
plt.legend()
plt.grid(True, alpha=0.7)
plt.show()
1.2 Linear Programming for Feasible Hyperplane¶
Use linear programming to find a separating hyperplane. We want to find $(w, b)$ such that:
$$y_i(w^T x_i + b) \geq 1, \quad \forall i$$
This can be formulated as a linear programming feasibility problem.
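As a hint on how the feasibility problem maps onto scipy.optimize.linprog, here is a minimal sketch on a tiny toy dataset. The toy arrays, the zero objective, and the unbounded variable bounds are illustrative assumptions, not the required solution.
import numpy as np
from scipy.optimize import linprog
# Illustrative toy data, labels in {-1, +1}
X_toy = np.array([[2.0, 3.0], [1.0, 0.5], [3.0, 4.0], [0.5, 1.0]])
y_toy = np.array([1, -1, 1, -1])
n_samples, n_features = X_toy.shape
# Decision variables: z = [w_1, ..., w_d, b]
c = np.zeros(n_features + 1)   # constant objective: pure feasibility problem
# y_i (w^T x_i + b) >= 1  is rewritten as  -y_i * [x_i, 1] @ z <= -1
A_ub = -y_toy[:, None] * np.hstack([X_toy, np.ones((n_samples, 1))])
b_ub = -np.ones(n_samples)
# linprog assumes variables >= 0 by default, so explicitly allow w and b to be negative
bounds = [(None, None)] * (n_features + 1)
res = linprog(c, A_ub=A_ub, b_ub=b_ub, bounds=bounds)
print("feasible:", res.success, "w:", res.x[:-1], "b:", res.x[-1])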
from scipy.optimize import linprog
def linear_programming_classifier(X_train, y_train):
"""
Find a separating hyperplane using linear programming.
Parameters:
-----------
X_train : numpy array of shape (n_samples, n_features)
Training data
y_train : numpy array of shape (n_samples,)
Labels (should be -1 or 1)
Returns:
--------
weights : numpy array of shape (n_features,)
Learned weights
bias : float
Learned bias
"""
n_samples, n_features = X_train.shape
# YOUR CODE HERE
# We want to find w and b such that y_i(w^T x_i + b) >= 1
# This can be rewritten as: -y_i(w^T x_i + b) <= -1
#
# Variables: [w1, w2, ..., wn, b]
# Objective: We can minimize sum of weights (arbitrary, just need feasibility)
# Constraints: For each sample i: -y_i * (w^T x_i + b) <= -1
#
# Use scipy.optimize.linprog(c, A_ub, b_ub, bounds)
# where:
# c: coefficients for objective function
# A_ub: inequality constraint matrix (A_ub @ x <= b_ub)
# b_ub: inequality constraint bounds
pass
# Find hyperplane using linear programming
# YOUR CODE HERE: Call linear_programming_classifier and store the result in lp_weights, lp_bias
# Make predictions
def predict_lp(X, weights, bias):
# YOUR CODE HERE
# Return sign(X @ weights + bias)
pass
# YOUR CODE HERE: Get predictions on test data
# Visualize the LP result
# The decision boundary is defined by w1*x1 + w2*x2 + b = 0
# Rearranging for x2 (y-axis) gives x2 = (-w1/w2)*x1 - (b/w2)
slope = -lp_weights[0] / lp_weights[1]
intercept = -lp_bias / lp_weights[1]
projected_space = np.linspace(X_binary[:, 0].min() - 1, X_binary[:, 0].max() + 1, 100) # Use a wider range for x-axis
projected_line = projected_space * slope + intercept
plt.figure(figsize=(8, 6))
plt.scatter(X_binary[y_binary == 0, 0], X_binary[y_binary == 0, 1],
c='gray', label='Class 0', alpha=0.6, edgecolors='k')
plt.scatter(X_binary[y_binary == 1, 0], X_binary[y_binary == 1, 1],
c='red', label='Class 1', alpha=0.6, edgecolors='k')
plt.plot(projected_space, projected_line, linestyle="dashed", c="black", linewidth=3, label='Decision Boundary')
plt.xlim(X_binary[:, 0].min() - 0.5, X_binary[:, 0].max() + 0.5)
plt.ylim(X_binary[:, 1].min() - 0.5, X_binary[:, 1].max() + 0.5)
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Linear Programming Decision Boundary')
plt.legend()
plt.grid(True, alpha=0.7)
plt.show()
1.3 Logistic Regression¶
Implement logistic regression from scratch using gradient descent. The sigmoid function is:
$$\sigma(z) = \frac{1}{1 + e^{-z}}$$
The cost function (cross-entropy loss) is:
$$J(w, b) = -\frac{1}{m}\sum_{i=1}^{m}[y_i \log(h(x_i)) + (1-y_i)\log(1-h(x_i))]$$
where $h(x_i) = \sigma(w^T x_i + b)$
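To make the gradient-descent recipe concrete, the following minimal sketch performs a single batch update on a tiny toy dataset. The toy data and names are illustrative assumptions; this is not the LogisticRegression class you implement below.
import numpy as np
# Illustrative toy data, labels in {0, 1}
X_toy = np.array([[2.0, 3.0], [1.0, 0.5], [3.0, 4.0], [0.5, 1.0]])
y_toy = np.array([1, 0, 1, 0])
def sigmoid(z):
    # Clip to avoid overflow in exp for large |z|
    return 1.0 / (1.0 + np.exp(-np.clip(z, -500, 500)))
m, d = X_toy.shape
w = np.zeros(d)
b = 0.0
lr = 0.1
# One gradient-descent step
h = sigmoid(X_toy @ w + b)                     # predictions h(x_i)
loss = -np.mean(y_toy * np.log(h) + (1 - y_toy) * np.log(1 - h))
dw = (1.0 / m) * X_toy.T @ (h - y_toy)         # gradient w.r.t. weights
db = (1.0 / m) * np.sum(h - y_toy)             # gradient w.r.t. bias
w -= lr * dw
b -= lr * db
print(f"initial loss: {loss:.4f}")
print("updated weights:", w, "updated bias:", b)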
class LogisticRegression:
def __init__(self, learning_rate=0.01, n_iterations=1000):
"""
Initialize Logistic Regression.
Parameters:
-----------
learning_rate : float
Learning rate for gradient descent
n_iterations : int
Number of iterations for gradient descent
"""
self.learning_rate = learning_rate
self.n_iterations = n_iterations
self.weights = None
self.bias = None
self.losses = []
def sigmoid(self, z):
# YOUR CODE HERE
# Implement sigmoid function: 1 / (1 + exp(-z))
# Tip: Use np.clip to avoid overflow
pass
def fit(self, X, y):
"""
Train logistic regression using gradient descent.
Parameters:
-----------
X : numpy array of shape (n_samples, n_features)
Training data
y : numpy array of shape (n_samples,)
Labels (should be 0 or 1)
"""
n_samples, n_features = X.shape
# YOUR CODE HERE
# Initialize weights and bias
# Implement gradient descent
# For each iteration:
# 1. Compute predictions using sigmoid(X @ weights + bias)
# 2. Compute gradients:
# dw = (1/m) * X.T @ (predictions - y)
# db = (1/m) * sum(predictions - y)
# 3. Update weights and bias
# 4. Compute and store the loss
pass
def predict(self, X):
"""
Predict class labels.
Parameters:
-----------
X : numpy array of shape (n_samples, n_features)
Data to predict
Returns:
--------
predictions : numpy array of shape (n_samples,)
Predicted class labels (0 or 1)
"""
# YOUR CODE HERE
# Compute sigmoid(X @ weights + bias) and threshold at 0.5
pass
# Train logistic regression
log_reg = LogisticRegression(learning_rate=0.1, n_iterations=1000)
# YOUR CODE HERE: Fit the model on training data (use y_train_b, not y_train_b_pm)
# Plot the loss curve
# YOUR CODE HERE: Plot log_reg.losses
# Make predictions
# YOUR CODE HERE: Get predictions on test data
# Visualize the logistic regression result
# The decision boundary is defined by w1*x1 + w2*x2 + b = 0
# Rearranging for x2 (y-axis) gives x2 = (-w1/w2)*x1 - (b/w2)
slope = -log_reg.weights[0] / log_reg.weights[1]
intercept = -log_reg.bias / log_reg.weights[1]
projected_space = np.linspace(X_binary[:, 0].min() - 1, X_binary[:, 0].max() + 1, 100) # Use a wider range for x-axis
projected_line = projected_space * slope + intercept
plt.figure(figsize=(8, 6))
plt.scatter(X_binary[y_binary == 0, 0], X_binary[y_binary == 0, 1],
c='gray', label='Class 0', alpha=0.6, edgecolors='k')
plt.scatter(X_binary[y_binary == 1, 0], X_binary[y_binary == 1, 1],
c='red', label='Class 1', alpha=0.6, edgecolors='k')
plt.plot(projected_space, projected_line, linestyle="dashed", c="black", linewidth=3, label='Decision Boundary')
plt.xlim(X_binary[:, 0].min() - 0.5, X_binary[:, 0].max() + 0.5)
plt.ylim(X_binary[:, 1].min() - 0.5, X_binary[:, 1].max() + 0.5)
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Logistic Regression Decision Boundary')
plt.legend()
plt.grid(True, alpha=0.7)
plt.show()
1.4 Evaluation and Comparison¶
Compute and compare the performance metrics for all three methods.
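As a reminder of the scikit-learn API suggested in the hints, here is a minimal sketch that computes the three metrics on illustrative toy labels, including the conversion from {-1, 1} to {0, 1}.
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score
# Illustrative toy labels in {-1, +1}; map to {0, 1} before computing metrics
y_true_pm = np.array([1, -1, 1, 1, -1, 1])
y_pred_pm = np.array([1, -1, -1, 1, -1, 1])
y_true = (y_true_pm + 1) // 2
y_pred = (y_pred_pm + 1) // 2
print(f"Accuracy:  {accuracy_score(y_true, y_pred):.3f}")
print(f"Precision: {precision_score(y_true, y_pred):.3f}")
print(f"Recall:    {recall_score(y_true, y_pred):.3f}")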
def evaluate_classifier(y_true, y_pred, method_name):
"""
Compute and display evaluation metrics.
Parameters:
-----------
y_true : array-like
True labels
y_pred : array-like
Predicted labels
method_name : str
Name of the classification method
"""
# YOUR CODE HERE
# Compute accuracy, precision, and recall
# Print the results in a formatted way
# Hint: Use accuracy_score, precision_score, recall_score from sklearn.metrics
# Note: For binary classification with {-1, 1}, convert to {0, 1} first
pass
print("="*60)
print("Binary Classification Results")
print("="*60)
# YOUR CODE HERE: Evaluate all three methods
# evaluate_classifier for perceptron, linear programming, and logistic regression
Task 2: Naive Bayes Classifier on Bernoulli Dataset¶
Implement a Bernoulli Naive Bayes classifier from scratch for binary features.
For Bernoulli Naive Bayes:
- Features are binary (0 or 1)
- We estimate: $P(x_j = 1 | y = c)$ for each feature $j$ and class $c$
- Classification uses: $\arg\max_c P(y=c) \prod_j P(x_j | y=c)$ (a runnable sketch of these estimates follows below)
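Here is a minimal sketch of the smoothed estimates and the log-posterior rule on an illustrative toy binary dataset. The arrays and names are assumptions for demonstration only, not the classifier you implement in 2.1.
import numpy as np
# Illustrative toy binary data: 6 samples, 3 binary features, 2 classes
X_toy = np.array([[1, 0, 1],
                  [1, 1, 0],
                  [0, 0, 1],
                  [0, 1, 0],
                  [1, 1, 1],
                  [0, 0, 0]])
y_toy = np.array([0, 0, 1, 1, 0, 1])
alpha = 1.0  # Laplace smoothing parameter
classes = np.unique(y_toy)
prior = np.array([np.mean(y_toy == c) for c in classes])  # P(y = c)
# P(x_j = 1 | y = c) with Laplace smoothing
feat_prob = np.array([
    (X_toy[y_toy == c].sum(axis=0) + alpha) / ((y_toy == c).sum() + 2 * alpha)
    for c in classes
])
# Log-posterior (up to a constant) for one sample x, maximised over classes
x = np.array([1, 0, 1])
log_post = np.log(prior) + (x * np.log(feat_prob) + (1 - x) * np.log(1 - feat_prob)).sum(axis=1)
print("predicted class:", classes[np.argmax(log_post)])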
# Generate Bernoulli dataset (binary features)
from sklearn.datasets import make_classification
X_bernoulli, y_bernoulli = make_classification(
n_samples=1000,
n_features=10,
n_informative=8,
n_redundant=0,
n_classes=2,
flip_y=0.1,
random_state=STUDENT_ID
)
# Convert to binary features (threshold at median)
X_bernoulli = (X_bernoulli > np.median(X_bernoulli, axis=0)).astype(int)
# Split the data
X_train_nb, X_test_nb, y_train_nb, y_test_nb = train_test_split(
X_bernoulli, y_bernoulli, test_size=0.3, random_state=STUDENT_ID
)
print(f"Training set size: {len(X_train_nb)}")
print(f"Test set size: {len(X_test_nb)}")
print(f"Number of features: {X_bernoulli.shape[1]}")
print(f"Feature values are binary: {np.all(np.isin(X_bernoulli, [0, 1]))}")
Training set size: 700
Test set size: 300
Number of features: 10
Feature values are binary: True
2.1 Implement Bernoulli Naive Bayes¶
class BernoulliNaiveBayes:
def __init__(self, alpha=1.0):
"""
Initialize Bernoulli Naive Bayes classifier.
Parameters:
-----------
alpha : float
Laplace smoothing parameter (default=1.0)
"""
self.alpha = alpha
self.class_prior = None
self.feature_prob = None
self.classes = None
def fit(self, X, y):
"""
Train the Bernoulli Naive Bayes classifier.
Parameters:
-----------
X : numpy array of shape (n_samples, n_features)
Training data (binary features)
y : numpy array of shape (n_samples,)
Labels
"""
n_samples, n_features = X.shape
self.classes = np.unique(y)
n_classes = len(self.classes)
# YOUR CODE HERE
# 1. Compute class priors: P(y = c) for each class c
# Store in self.class_prior (shape: n_classes)
#
# 2. Compute feature probabilities: P(x_j = 1 | y = c)
# For each class c and feature j:
# P(x_j = 1 | y = c) = (count(x_j=1 and y=c) + alpha) / (count(y=c) + 2*alpha)
# Store in self.feature_prob (shape: n_classes, n_features)
#
# Hint: Use Laplace smoothing with parameter alpha
pass
def predict(self, X):
"""
Predict class labels for samples in X.
Parameters:
-----------
X : numpy array of shape (n_samples, n_features)
Data to predict
Returns:
--------
predictions : numpy array of shape (n_samples,)
Predicted class labels
"""
# YOUR CODE HERE
# For each sample, compute log-posterior for each class:
# log P(y=c|x) ∝ log P(y=c) + sum_j [x_j * log P(x_j=1|y=c) + (1-x_j) * log P(x_j=0|y=c)]
# where P(x_j=0|y=c) = 1 - P(x_j=1|y=c)
#
# Return the class with highest log-posterior
# Hint: Use log probabilities to avoid numerical underflow
pass
# Train Bernoulli Naive Bayes
bnb = BernoulliNaiveBayes(alpha=1.0)
# YOUR CODE HERE: Fit the model and make predictions
2.2 Evaluation¶
# YOUR CODE HERE
# Evaluate the Bernoulli Naive Bayes classifier
# Compute and print accuracy, precision, and recall
# Display a confusion matrix (e.g., using ConfusionMatrixDisplay from sklearn.metrics)
print("="*60)
print("Bernoulli Naive Bayes Results")
print("="*60)
Task 3: Multinomial Softmax Regression¶
Implement multinomial logistic regression (softmax regression) from scratch for multi-class classification.
The softmax function is: $$\text{softmax}(z)_i = \frac{e^{z_i}}{\sum_{j} e^{z_j}}$$
The cross-entropy loss for multi-class is: $$J(W, b) = -\frac{1}{m}\sum_{i=1}^{m}\sum_{k=1}^{K} \mathbb{1}\{y_i = k\} \log(h_k(x_i))$$ where $h_k(x_i) = \text{softmax}(W^T x_i + b)_k$.
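For reference, here is a minimal sketch of a numerically stable softmax and the one-hot encoded cross-entropy loss on illustrative toy logits (the values below are assumptions, not part of the lab data).
import numpy as np
def stable_softmax(z):
    # Subtract the row-wise max before exponentiating to avoid overflow
    z_shift = z - z.max(axis=1, keepdims=True)
    exp_z = np.exp(z_shift)
    return exp_z / exp_z.sum(axis=1, keepdims=True)
# Toy logits for 3 samples and 4 classes
logits = np.array([[2.0, 1.0, 0.1, -1.0],
                   [0.0, 0.0, 0.0, 0.0],
                   [5.0, 1.0, -3.0, 2.0]])
probs = stable_softmax(logits)
print("rows sum to 1:", np.allclose(probs.sum(axis=1), 1.0))
# One-hot encoding of integer labels and the cross-entropy loss
y = np.array([0, 2, 3])
y_onehot = np.eye(logits.shape[1])[y]
loss = -np.mean(np.sum(y_onehot * np.log(probs), axis=1))
print(f"cross-entropy loss: {loss:.4f}")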
3.1 Softmax Regression on Gaussian Synthetic Dataset¶
# Generate multi-class Gaussian dataset
X_multi, y_multi = make_blobs(
n_samples=600,
n_features=2,
centers=4,
cluster_std=1.5,
random_state=STUDENT_ID
)
# Split the data
X_train_m, X_test_m, y_train_m, y_test_m = train_test_split(
X_multi, y_multi, test_size=0.3, random_state=STUDENT_ID
)
# Visualize the dataset
plt.figure(figsize=(8, 6))
for i in range(4):
plt.scatter(X_multi[y_multi == i, 0], X_multi[y_multi == i, 1],
label=f'Class {i}', alpha=0.6, edgecolors='k')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Multi-class Classification Dataset (Synthetic)')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
print(f"Training set size: {len(X_train_m)}")
print(f"Test set size: {len(X_test_m)}")
print(f"Number of classes: {len(np.unique(y_multi))}")
Training set size: 420
Test set size: 180
Number of classes: 4
class SoftmaxRegression:
def __init__(self, learning_rate=0.1, n_iterations=1000, reg_lambda=0.01):
"""
Initialize Softmax Regression.
Parameters:
-----------
learning_rate : float
Learning rate for gradient descent
n_iterations : int
Number of iterations
reg_lambda : float
L2 regularization parameter
"""
self.learning_rate = learning_rate
self.n_iterations = n_iterations
self.reg_lambda = reg_lambda
self.weights = None
self.bias = None
self.losses = []
def softmax(self, z):
"""
Compute softmax function.
Parameters:
-----------
z : numpy array of shape (n_samples, n_classes)
Logits
Returns:
--------
softmax : numpy array of shape (n_samples, n_classes)
Softmax probabilities
"""
# YOUR CODE HERE
# Implement softmax with numerical stability
# Hint: Subtract max(z) from z before computing exp
pass
def fit(self, X, y):
"""
Train softmax regression using gradient descent.
Parameters:
-----------
X : numpy array of shape (n_samples, n_features)
Training data
y : numpy array of shape (n_samples,)
Labels (0, 1, ..., K-1)
"""
n_samples, n_features = X.shape
self.n_classes = len(np.unique(y))
# YOUR CODE HERE
# 1. Initialize weights (n_features, n_classes) and bias (n_classes,)
# 2. Convert y to one-hot encoding
# 3. Implement gradient descent:
# For each iteration:
# a. Compute logits: z = X @ weights + bias
# b. Compute probabilities: softmax(z)
# c. Compute loss (cross-entropy + L2 regularization)
# d. Compute gradients:
# dW = (1/m) * X.T @ (probs - y_onehot) + reg_lambda * weights
# db = (1/m) * sum(probs - y_onehot)
# e. Update weights and bias
pass
def predict(self, X):
"""
Predict class labels.
Parameters:
-----------
X : numpy array of shape (n_samples, n_features)
Data to predict
Returns:
--------
predictions : numpy array of shape (n_samples,)
Predicted class labels
"""
# YOUR CODE HERE
# Compute softmax probabilities and return argmax
pass
# Train softmax regression on synthetic data
softmax_model = SoftmaxRegression(learning_rate=0.1, n_iterations=1000, reg_lambda=0.01)
# YOUR CODE HERE: Fit the model and make predictions
# Plot loss curve
# YOUR CODE HERE
3.2 Softmax Regression on Iris Dataset¶
# Load Iris dataset
iris = load_iris()
X_iris = iris.data
y_iris = iris.target
# Split the data
X_train_iris, X_test_iris, y_train_iris, y_test_iris = train_test_split(
X_iris, y_iris, test_size=0.3, random_state=STUDENT_ID
)
print(f"Iris dataset:")
print(f" Training set size: {len(X_train_iris)}")
print(f" Test set size: {len(X_test_iris)}")
print(f" Number of features: {X_iris.shape[1]}")
print(f" Number of classes: {len(np.unique(y_iris))}")
print(f" Classes: {iris.target_names}")
Iris dataset:
  Training set size: 105
  Test set size: 45
  Number of features: 4
  Number of classes: 3
  Classes: ['setosa' 'versicolor' 'virginica']
# Train softmax regression on Iris dataset
# YOUR CODE HERE
# Create a new SoftmaxRegression instance
# Fit it on the Iris training data
# Make predictions on the test data
# Print the class labels for a few test data points to verify your implementation