Chapter 2: Deep Learning Fundamentals and Convolutional Neural Networks
Learning Objectives
- Master the basic principles of deep learning
- Understand the structure and working principles of Convolutional Neural Networks (CNN)
- Become familiar with common CNN architectures (LeNet, AlexNet, VGG, ResNet, etc.)
- Understand the backpropagation algorithm and gradient descent optimization
2.1 Deep Learning Basic Principles
2.1.1 From Machine Learning to Deep Learning
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import torchvision.transforms as transforms
class DeepLearningFoundation:
def __init__(self):
self.learning_paradigms = {
"Traditional Machine Learning": {
"feature_engineering": "Hand-crafted features",
"model": "Shallow models (SVM, decision trees, etc.)",
"advantages": ["Strong interpretability", "Fast training", "Less data required"],
"disadvantages": ["Difficult feature design", "Limited expressiveness", "Poor generalization"]
},
"Deep Learning": {
"feature_engineering": "Automatic feature representation learning",
"model": "Multi-layer neural networks",
"advantages": ["End-to-end learning", "Powerful expressiveness", "Automatic feature extraction"],
"disadvantages": ["Requires large amounts of data", "High computational resource consumption", "Black box model"]
}
}
def compare_paradigms(self):
"""Compare learning paradigms"""
print("Machine Learning Paradigm Comparison:")
print("=" * 50)
for paradigm, details in self.learning_paradigms.items():
print(f"\n{paradigm}:")
for key, value in details.items():
if isinstance(value, list):
print(f" {key}: {', '.join(value)}")
else:
print(f" {key}: {value}")
def deep_learning_workflow(self):
"""Deep learning workflow"""
workflow = {
"1. Data Preparation": [
"Data collection and cleaning",
"Data augmentation and preprocessing",
"Train/validation/test set splitting"
],
"2. Model Design": [
"Network architecture design",
"Loss function selection",
"Optimizer configuration"
],
"3. Model Training": [
"Forward propagation",
"Loss calculation",
"Backpropagation",
"Parameter update"
],
"4. Model Evaluation": [
"Validation set performance evaluation",
"Overfitting detection",
"Hyperparameter tuning"
],
"5. Model Deployment": [
"Model optimization",
"Inference acceleration",
"Production environment deployment"
]
}
print("\nDeep Learning Workflow:")
print("=" * 30)
for stage, steps in workflow.items():
print(f"\n{stage}:")
for step in steps:
print(f" - {step}")
return workflow
# Example usage
foundation = DeepLearningFoundation()
foundation.compare_paradigms()
workflow = foundation.deep_learning_workflow()
2.1.2 Neurons and Multi-Layer Perceptron
class NeuralNetworkBasics:
def __init__(self):
pass
def artificial_neuron(self, inputs, weights, bias, activation='sigmoid'):
"""
Artificial neuron simulation
"""
# Linear combination
z = np.dot(inputs, weights) + bias
# Activation function
if activation == 'sigmoid':
output = 1 / (1 + np.exp(-z))
elif activation == 'tanh':
output = np.tanh(z)
elif activation == 'relu':
output = np.maximum(0, z)
elif activation == 'leaky_relu':
output = np.where(z > 0, z, 0.01 * z)
else:
output = z # linear activation
return output, z
def activation_functions(self):
"""Common activation functions"""
x = np.linspace(-5, 5, 100)
activations = {
'Sigmoid': 1 / (1 + np.exp(-x)),
'Tanh': np.tanh(x),
'ReLU': np.maximum(0, x),
'Leaky ReLU': np.where(x > 0, x, 0.01 * x),
'ELU': np.where(x > 0, x, np.exp(x) - 1),
'Swish': x * (1 / (1 + np.exp(-x)))
}
# Visualize activation functions
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()
for i, (name, y) in enumerate(activations.items()):
axes[i].plot(x, y, linewidth=2)
axes[i].set_title(name)
axes[i].grid(True)
axes[i].set_xlabel('x')
axes[i].set_ylabel('f(x)')
plt.tight_layout()
return fig, activations
def multilayer_perceptron(self):
"""Multi-layer perceptron implementation"""
class MLP(nn.Module):
def __init__(self, input_size, hidden_sizes, output_size, activation='relu'):
super(MLP, self).__init__()
layers = []
prev_size = input_size
# Hidden layers
for hidden_size in hidden_sizes:
layers.append(nn.Linear(prev_size, hidden_size))
if activation == 'relu':
layers.append(nn.ReLU())
elif activation == 'sigmoid':
layers.append(nn.Sigmoid())
elif activation == 'tanh':
layers.append(nn.Tanh())
prev_size = hidden_size
# Output layer
layers.append(nn.Linear(prev_size, output_size))
self.network = nn.Sequential(*layers)
def forward(self, x):
return self.network(x)
return MLP
def gradient_descent_demo(self):
"""Gradient descent demonstration"""
# Simple quadratic function optimization
def quadratic_function(x):
return x**2 + 2*x + 1
def gradient(x):
return 2*x + 2
# Gradient descent process
x_history = []
loss_history = []
x = 5.0 # Initial point
learning_rate = 0.1
for i in range(20):
loss = quadratic_function(x)
grad = gradient(x)
x_history.append(x)
loss_history.append(loss)
# Parameter update
x = x - learning_rate * grad
# Visualize optimization process
x_range = np.linspace(-3, 6, 100)
y_range = quadratic_function(x_range)
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(x_range, y_range, 'b-', label='f(x) = x² + 2x + 1')
plt.plot(x_history, [quadratic_function(x) for x in x_history],
'ro-', label='Optimization Path')
plt.xlabel('x')
plt.ylabel('f(x)')
plt.title('Gradient Descent Optimization')
plt.legend()
plt.grid(True)
plt.subplot(1, 2, 2)
plt.plot(loss_history, 'r-o')
plt.xlabel('Iteration')
plt.ylabel('Loss')
plt.title('Loss vs Iteration')
plt.grid(True)
plt.tight_layout()
return x_history, loss_history
# Example usage
nn_basics = NeuralNetworkBasics()
# Demonstrate neuron
inputs = np.array([1, 2, 3])
weights = np.array([0.5, -0.2, 0.1])
bias = 0.3
output, z = nn_basics.artificial_neuron(inputs, weights, bias, 'relu')
print(f"Neuron output: {output:.3f}")
# Activation function visualization
# fig, activations = nn_basics.activation_functions()
# plt.show()
# Create MLP
MLP = nn_basics.multilayer_perceptron()
model = MLP(input_size=784, hidden_sizes=[128, 64], output_size=10)
print(f"MLP architecture:\n{model}")
# Gradient descent demonstration
x_hist, loss_hist = nn_basics.gradient_descent_demo()
print(f"Final x value: {x_hist[-1]:.3f}")
2.2 Convolutional Neural Network Fundamentals
2.2.1 Convolution Operation Principles
class ConvolutionBasics:
def __init__(self):
pass
def convolution_2d(self, input_image, kernel, stride=1, padding=0):
"""
2D convolution operation implementation
"""
# Add padding
if padding > 0:
input_image = np.pad(input_image, padding, mode='constant')
input_h, input_w = input_image.shape
kernel_h, kernel_w = kernel.shape
# Calculate output size
output_h = (input_h - kernel_h) // stride + 1
output_w = (input_w - kernel_w) // stride + 1
output = np.zeros((output_h, output_w))
# Perform convolution
for i in range(0, output_h):
for j in range(0, output_w):
h_start = i * stride
h_end = h_start + kernel_h
w_start = j * stride
w_end = w_start + kernel_w
# Element-wise multiplication and summation
output[i, j] = np.sum(input_image[h_start:h_end, w_start:w_end] * kernel)
return output
def common_kernels(self):
"""Common convolution kernels"""
kernels = {
'Identity': np.array([[0, 0, 0],
[0, 1, 0],
[0, 0, 0]]),
'Edge Detection (Horizontal)': np.array([[-1, -1, -1],
[ 0, 0, 0],
[ 1, 1, 1]]),
'Edge Detection (Vertical)': np.array([[-1, 0, 1],
[-1, 0, 1],
[-1, 0, 1]]),
'Sharpen': np.array([[ 0, -1, 0],
[-1, 5, -1],
[ 0, -1, 0]]),
'Blur': np.array([[1, 1, 1],
[1, 1, 1],
[1, 1, 1]]) / 9,
'Gaussian Blur': np.array([[1, 2, 1],
[2, 4, 2],
[1, 2, 1]]) / 16
}
return kernels
def visualize_convolution(self, input_image, kernel, title="Convolution"):
"""Visualize convolution process"""
output = self.convolution_2d(input_image, kernel, padding=1)
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
# Input image
axes[0].imshow(input_image, cmap='gray')
axes[0].set_title('Input Image')
axes[0].axis('off')
# Kernel
im = axes[1].imshow(kernel, cmap='RdBu')
axes[1].set_title('Kernel')
for i in range(kernel.shape[0]):
for j in range(kernel.shape[1]):
axes[1].text(j, i, f'{kernel[i,j]:.1f}',
ha='center', va='center', fontsize=12)
# Output feature map
axes[2].imshow(output, cmap='gray')
axes[2].set_title('Output Feature Map')
axes[2].axis('off')
plt.tight_layout()
return fig, output
def convolution_properties(self):
"""Important properties of convolution"""
properties = {
"Translation Invariance": {
"definition": "When input translates, output translates accordingly",
"significance": "Robustness to object position changes",
"application": "Objects can appear anywhere in the image for object detection"
},
"Parameter Sharing": {
"definition": "Same convolution kernel reused across entire image",
"significance": "Dramatically reduces number of parameters",
"application": "Enables network to process inputs of different sizes"
},
"Local Connectivity": {
"definition": "Each neuron only connects to a local region",
"significance": "Preserves spatial structure information",
"application": "Extracts local features like edges, textures"
},
"Hierarchical Features": {
"definition": "Shallow layers extract basic features, deep layers extract complex features",
"significance": "Builds hierarchical feature structure",
"application": "Feature extraction from edges to shapes to objects"
}
}
print("Important Properties of Convolution:")
print("=" * 40)
for prop, details in properties.items():
print(f"\n{prop}:")
for key, value in details.items():
print(f" {key}: {value}")
return properties
# Example usage
conv_basics = ConvolutionBasics()
# Create example image
input_img = np.array([[1, 2, 3, 0, 1],
[0, 1, 2, 3, 0],
[0, 0, 1, 2, 3],
[1, 0, 0, 1, 2],
[2, 1, 0, 0, 1]])
# Get common kernels
kernels = conv_basics.common_kernels()
# Apply edge detection kernel
edge_kernel = kernels['Edge Detection (Horizontal)']
output = conv_basics.convolution_2d(input_img, edge_kernel, padding=1)
print("Input image:")
print(input_img)
print(f"\nKernel (Edge Detection):")
print(edge_kernel)
print(f"\nOutput feature map:")
print(output)
# Convolution properties
properties = conv_basics.convolution_properties()
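As a quick cross-check of the NumPy implementation above, the same input and kernel can be run through PyTorch's functional convolution. Like convolution_2d, F.conv2d computes cross-correlation (no kernel flip), so the two results should agree; torch.nn.functional is imported here only for this check.
import torch.nn.functional as F
# Reshape to (batch, channels, height, width) as expected by F.conv2d
inp = torch.from_numpy(input_img.astype(np.float32)).reshape(1, 1, 5, 5)
ker = torch.from_numpy(edge_kernel.astype(np.float32)).reshape(1, 1, 3, 3)
torch_out = F.conv2d(inp, ker, padding=1)
print("\nPyTorch F.conv2d result (should match the NumPy output above):")
print(torch_out.squeeze().numpy())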
2.2.2 Pooling and Other Operations
class CNNOperations:
def __init__(self):
pass
def max_pooling(self, input_feature, pool_size=2, stride=2):
"""Max pooling"""
input_h, input_w = input_feature.shape
output_h = (input_h - pool_size) // stride + 1
output_w = (input_w - pool_size) // stride + 1
output = np.zeros((output_h, output_w))
for i in range(output_h):
for j in range(output_w):
h_start = i * stride
h_end = h_start + pool_size
w_start = j * stride
w_end = w_start + pool_size
# Take maximum value
output[i, j] = np.max(input_feature[h_start:h_end, w_start:w_end])
return output
def average_pooling(self, input_feature, pool_size=2, stride=2):
"""Average pooling"""
input_h, input_w = input_feature.shape
output_h = (input_h - pool_size) // stride + 1
output_w = (input_w - pool_size) // stride + 1
output = np.zeros((output_h, output_w))
for i in range(output_h):
for j in range(output_w):
h_start = i * stride
h_end = h_start + pool_size
w_start = j * stride
w_end = w_start + pool_size
# Take average value
output[i, j] = np.mean(input_feature[h_start:h_end, w_start:w_end])
return output
def batch_normalization_concept(self):
"""Batch normalization concept"""
class BatchNormalization:
def __init__(self, num_features, eps=1e-5, momentum=0.1):
self.num_features = num_features
self.eps = eps
self.momentum = momentum
# Learnable parameters
self.gamma = np.ones(num_features) # Scale parameter
self.beta = np.zeros(num_features) # Shift parameter
# Moving averages
self.running_mean = np.zeros(num_features)
self.running_var = np.ones(num_features)
def forward(self, x, training=True):
if training:
# Calculate batch statistics
batch_mean = np.mean(x, axis=0)
batch_var = np.var(x, axis=0)
# Update moving averages
self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * batch_mean
self.running_var = (1 - self.momentum) * self.running_var + self.momentum * batch_var
# Normalize
x_norm = (x - batch_mean) / np.sqrt(batch_var + self.eps)
else:
# Use moving averages during inference
x_norm = (x - self.running_mean) / np.sqrt(self.running_var + self.eps)
# Scale and shift
output = self.gamma * x_norm + self.beta
return output
return BatchNormalization
def dropout_concept(self):
"""Dropout concept"""
def dropout(x, drop_rate=0.5, training=True):
if not training:
return x
# Generate random mask
keep_prob = 1 - drop_rate
mask = np.random.rand(*x.shape) < keep_prob
# Apply mask and scale
output = x * mask / keep_prob
return output
return dropout
def receptive_field_calculation(self):
"""Receptive field calculation"""
def calculate_receptive_field(layers_info):
"""
Calculate receptive field size for each layer in CNN
layers_info: [(layer_type, kernel_size, stride, padding), ...]
"""
rf = 1 # Initial receptive field
jump = 1 # Jump distance
results = [{'layer': 'input', 'receptive_field': rf, 'jump': jump}]
for i, (layer_type, kernel_size, stride, padding) in enumerate(layers_info):
if layer_type in ['conv', 'pool']:
# Update receptive field and jump distance
rf = rf + (kernel_size - 1) * jump
jump = jump * stride
results.append({
'layer': f'{layer_type}_{i+1}',
'kernel_size': kernel_size,
'stride': stride,
'receptive_field': rf,
'jump': jump
})
return results
# Example: Receptive field calculation for typical CNN architecture
example_layers = [
('conv', 3, 1, 1), # 3x3 conv, stride=1
('pool', 2, 2, 0), # 2x2 pool, stride=2
('conv', 3, 1, 1), # 3x3 conv, stride=1
('conv', 3, 1, 1), # 3x3 conv, stride=1
('pool', 2, 2, 0), # 2x2 pool, stride=2
]
rf_results = calculate_receptive_field(example_layers)
print("Receptive Field Calculation Results:")
print("-" * 50)
for result in rf_results:
layer = result['layer']
rf = result['receptive_field']
jump = result['jump']
if 'kernel_size' in result:
kernel = result['kernel_size']
stride = result['stride']
print(f"{layer:10} | Kernel:{kernel} Stride:{stride} | RF:{rf:2d} Jump:{jump}")
else:
print(f"{layer:10} | {'':15} | RF:{rf:2d} Jump:{jump}")
return rf_results
# Example usage
cnn_ops = CNNOperations()
# Pooling operation example
test_feature = np.array([[1, 3, 2, 4],
[5, 6, 1, 8],
[2, 1, 4, 3],
[7, 2, 6, 5]])
max_pooled = cnn_ops.max_pooling(test_feature, pool_size=2, stride=2)
avg_pooled = cnn_ops.average_pooling(test_feature, pool_size=2, stride=2)
print("Original feature map:")
print(test_feature)
print(f"\nMax pooling result (2x2):")
print(max_pooled)
print(f"\nAverage pooling result (2x2):")
print(avg_pooled)
# Receptive field calculation
rf_calculation = cnn_ops.receptive_field_calculation()
# Batch normalization example
BatchNorm = cnn_ops.batch_normalization_concept()
bn = BatchNorm(num_features=3)
# Example data (batch_size=4, features=3)
test_data = np.array([[1.0, 2.0, 3.0],
[4.0, 5.0, 6.0],
[7.0, 8.0, 9.0],
[2.0, 3.0, 4.0]])
normalized = bn.forward(test_data, training=True)
print(f"\nBefore batch normalization:")
print(test_data)
print(f"\nAfter batch normalization:")
print(normalized)
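The hand-written pooling above can likewise be checked against PyTorch's built-in pooling layer; the sketch below reuses only the test_feature array defined earlier.
# Cross-check max pooling against nn.MaxPool2d
pool_layer = nn.MaxPool2d(kernel_size=2, stride=2)
feature_tensor = torch.from_numpy(test_feature.astype(np.float32)).reshape(1, 1, 4, 4)
torch_max_pooled = pool_layer(feature_tensor)
print("\nnn.MaxPool2d result (should match the NumPy max pooling above):")
print(torch_max_pooled.squeeze().numpy())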
2.3 Classic CNN Architectures
2.3.1 LeNet-5
class LeNet5:
def __init__(self):
self.architecture_info = {
"year": "1998",
"author": "Yann LeCun",
"application": "Handwritten digit recognition",
"characteristics": ["First successful CNN", "Established basic CNN structure", "Conv-Pool-FC pattern"]
}
def build_lenet5(self, num_classes=10):
"""Build LeNet-5 model"""
class LeNet5Model(nn.Module):
def __init__(self, num_classes):
super(LeNet5Model, self).__init__()
# Feature extraction layers
self.features = nn.Sequential(
# C1: Convolution layer
nn.Conv2d(1, 6, kernel_size=5, stride=1), # 32x32 -> 28x28
nn.Tanh(),
# S2: Pooling layer
nn.AvgPool2d(kernel_size=2, stride=2), # 28x28 -> 14x14
# C3: Convolution layer
nn.Conv2d(6, 16, kernel_size=5, stride=1), # 14x14 -> 10x10
nn.Tanh(),
# S4: Pooling layer
nn.AvgPool2d(kernel_size=2, stride=2), # 10x10 -> 5x5
# C5: Convolution layer (equivalent to fully connected)
nn.Conv2d(16, 120, kernel_size=5, stride=1), # 5x5 -> 1x1
nn.Tanh()
)
# Classification layers
self.classifier = nn.Sequential(
nn.Linear(120, 84),
nn.Tanh(),
nn.Linear(84, num_classes)
)
def forward(self, x):
x = self.features(x)
x = x.view(x.size(0), -1) # Flatten
x = self.classifier(x)
return x
return LeNet5Model(num_classes)
def architecture_analysis(self):
"""Architecture analysis"""
layers = {
"Input layer": {
"size": "32x32x1",
"description": "Single channel grayscale image"
},
"C1 Conv": {
"parameters": "6 5x5 kernels",
"output": "28x28x6",
"param_count": "(5*5+1)*6 = 156"
},
"S2 Pool": {
"operation": "2x2 average pooling",
"output": "14x14x6",
"param_count": "0"
},
"C3 Conv": {
"parameters": "16 5x5 kernels",
"output": "10x10x16",
"param_count": "(5*5*6+1)*16 = 2416"
},
"S4 Pool": {
"operation": "2x2 average pooling",
"output": "5x5x16",
"param_count": "0"
},
"C5 Conv": {
"parameters": "120 5x5x16 kernels",
"output": "1x1x120",
"param_count": "(5*5*16+1)*120 = 48120"
},
"F6 FC": {
"parameters": "120->84",
"output": "84",
"param_count": "(120+1)*84 = 10164"
},
"Output layer": {
"parameters": "84->10",
"output": "10",
"param_count": "(84+1)*10 = 850"
}
}
total_params = 156 + 2416 + 48120 + 10164 + 850
print("LeNet-5 Architecture Detailed Analysis:")
print("=" * 50)
for layer, info in layers.items():
print(f"\n{layer}:")
for key, value in info.items():
print(f" {key}: {value}")
print(f"\nTotal parameters: {total_params:,}")
return layers, total_params
# Usage example
lenet = LeNet5()
model = lenet.build_lenet5(num_classes=10)
print("LeNet-5 Model Structure:")
print(model)
# Architecture analysis
layers_analysis, total_params = lenet.architecture_analysis()
# Calculate forward propagation for a sample
sample_input = torch.randn(1, 1, 32, 32)
with torch.no_grad():
output = model(sample_input)
print(f"\nInput size: {sample_input.shape}")
print(f"Output size: {output.shape}")
print(f"Output values: {output.squeeze()}")
2.3.2 AlexNet
class AlexNet:
def __init__(self):
self.architecture_info = {
"year": "2012",
"author": "Alex Krizhevsky",
"breakthrough": "Won ImageNet competition, deep learning renaissance",
"innovations": ["ReLU activation", "Dropout", "Data augmentation", "GPU training"]
}
def build_alexnet(self, num_classes=1000):
"""Build AlexNet model"""
class AlexNetModel(nn.Module):
def __init__(self, num_classes):
super(AlexNetModel, self).__init__()
self.features = nn.Sequential(
# Conv1: 96 11x11 kernels, stride=4
nn.Conv2d(3, 96, kernel_size=11, stride=4, padding=2),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=3, stride=2),
# Conv2: 256 5x5 kernels
nn.Conv2d(96, 256, kernel_size=5, padding=2),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=3, stride=2),
# Conv3: 384 3x3 kernels
nn.Conv2d(256, 384, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
# Conv4: 384 3x3 kernels
nn.Conv2d(384, 384, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
# Conv5: 256 3x3 kernels
nn.Conv2d(384, 256, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=3, stride=2),
)
self.avgpool = nn.AdaptiveAvgPool2d((6, 6))
self.classifier = nn.Sequential(
nn.Dropout(0.5),
nn.Linear(256 * 6 * 6, 4096),
nn.ReLU(inplace=True),
nn.Dropout(0.5),
nn.Linear(4096, 4096),
nn.ReLU(inplace=True),
nn.Linear(4096, num_classes),
)
def forward(self, x):
x = self.features(x)
x = self.avgpool(x)
x = torch.flatten(x, 1)
x = self.classifier(x)
return x
return AlexNetModel(num_classes)
def key_innovations(self):
"""Key innovation analysis"""
innovations = {
"ReLU Activation": {
"replaces": "Sigmoid/Tanh",
"advantages": ["Alleviates vanishing gradient", "Simple computation", "Sparse activation"],
"impact": "Became standard activation for deep networks"
},
"Dropout Regularization": {
"location": "Fully connected layers",
"effect": "Prevents overfitting",
"mechanism": "Randomly zeros neurons",
"dropout_rate": 0.5
},
"Data Augmentation": {
"methods": ["Random cropping", "Horizontal flipping", "Color jittering"],
"effect": "Increases data diversity, improves generalization",
"importance": "Essential technique for modern training"
},
"GPU Parallel Training": {
"hardware": "NVIDIA GTX 580",
"strategy": "Model parallelism + Data parallelism",
"significance": "Opened the GPU era for deep learning"
},
"Local Response Normalization": {
"location": "After convolution layers",
"effect": "Enhances generalization",
"current_status": "Replaced by Batch Normalization"
}
}
print("AlexNet Key Innovations:")
print("=" * 40)
for innovation, details in innovations.items():
print(f"\n{innovation}:")
for key, value in details.items():
if isinstance(value, list):
print(f" {key}: {', '.join(value)}")
else:
print(f" {key}: {value}")
return innovations
def performance_analysis(self):
"""Performance analysis"""
performance = {
"ImageNet2012 Results": {
"Top-1 error": "15.3%",
"Top-5 error": "15.3%",
"rank": "1st place",
"advantage": "Reduced error by 10.9 percentage points compared to 2nd place"
},
"Model Scale": {
"parameter_count": "60M",
"model_size": "240MB",
"FLOPs": "714M"
},
"Training Details": {
"training_time": "6 days",
"GPU_count": "2 GTX 580",
"batch_size": "128",
"learning_rate": "0.01"
}
}
return performance
# Usage example
alexnet = AlexNet()
model = alexnet.build_alexnet(num_classes=1000)
# Print model info
def count_parameters(model):
return sum(p.numel() for p in model.parameters() if p.requires_grad)
print("AlexNet Model Info:")
print(f"Total parameters: {count_parameters(model):,}")
# Key innovation analysis
innovations = alexnet.key_innovations()
# Performance analysis
performance = alexnet.performance_analysis()
print("\nAlexNet Performance:")
for category, metrics in performance.items():
print(f"\n{category}:")
for metric, value in metrics.items():
print(f" {metric}: {value}")
# Test forward propagation
sample_input = torch.randn(1, 3, 224, 224)
with torch.no_grad():
output = model(sample_input)
print(f"\nInput size: {sample_input.shape}")
print(f"Output size: {output.shape}")
2.3.3 VGG Network
class VGGNet:
def __init__(self):
self.architecture_info = {
"year": "2014",
"author": "Karen Simonyan & Andrew Zisserman",
"contribution": "Proved the importance of network depth",
"characteristics": ["Small kernels (3x3)", "Deep networks", "Simple architecture"]
}
def build_vgg(self, vgg_type='VGG16', num_classes=1000):
"""Build VGG model"""
# VGG configurations
cfg = {
'VGG11': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
'VGG13': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
'VGG16': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'],
'VGG19': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'],
}
class VGGModel(nn.Module):
def __init__(self, vgg_type, num_classes):
super(VGGModel, self).__init__()
self.features = self._make_layers(cfg[vgg_type])
self.avgpool = nn.AdaptiveAvgPool2d((7, 7))
self.classifier = nn.Sequential(
nn.Linear(512 * 7 * 7, 4096),
nn.ReLU(True),
nn.Dropout(),
nn.Linear(4096, 4096),
nn.ReLU(True),
nn.Dropout(),
nn.Linear(4096, num_classes),
)
def _make_layers(self, cfg):
layers = []
in_channels = 3
for v in cfg:
if v == 'M':
layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
else:
conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1)
layers += [conv2d, nn.ReLU(inplace=True)]
in_channels = v
return nn.Sequential(*layers)
def forward(self, x):
x = self.features(x)
x = self.avgpool(x)
x = torch.flatten(x, 1)
x = self.classifier(x)
return x
return VGGModel(vgg_type, num_classes)
def architecture_comparison(self):
"""Compare different VGG architectures"""
architectures = {
'VGG11': {
'conv_layers': 8,
'fc_layers': 3,
'total_layers': 11,
'param_count': '132M',
'characteristic': 'Lightest VGG'
},
'VGG13': {
'conv_layers': 10,
'fc_layers': 3,
'total_layers': 13,
'param_count': '133M',
'characteristic': 'Added conv layers to VGG11'
},
'VGG16': {
'conv_layers': 13,
'fc_layers': 3,
'total_layers': 16,
'param_count': '138M',
'characteristic': 'Most commonly used VGG version'
},
'VGG19': {
'conv_layers': 16,
'fc_layers': 3,
'total_layers': 19,
'param_count': '144M',
'characteristic': 'Deepest VGG, slight performance improvement'
}
}
print("VGG Architecture Comparison:")
print("=" * 60)
for arch, info in architectures.items():
print(f"\n{arch}:")
for key, value in info.items():
print(f" {key}: {value}")
return architectures
def design_principles(self):
"""Design principles analysis"""
principles = {
"3x3 Kernels": {
"principle": "Replace large kernels with multiple 3x3 convolutions",
"advantages": [
"Fewer parameters: 2 * 3x3 < 1 * 5x5",
"More nonlinearity: ReLU at each layer",
"Same receptive field: two 3x3 = one 5x5"
],
"calculation": "3x3x2 = 18 < 5x5 = 25"
},
"Increasing Depth": {
"strategy": "Gradually increase network depth",
"effect": "Improves expressiveness and performance",
"validation": "VGG11 < VGG13 < VGG16 < VGG19"
},
"Channel Doubling": {
"pattern": "64->128->256->512->512",
"principle": "As spatial size decreases, increase feature dimensions",
"balance": "Balance between computation and feature representation"
},
"Uniform Architecture": {
"characteristic": "All convolutions are 3x3, all pooling is 2x2",
"benefit": "Simple architecture, easy to understand and implement",
"impact": "Established CNN design standards"
}
}
print("\nVGG Design Principles:")
print("=" * 40)
for principle, details in principles.items():
print(f"\n{principle}:")
for key, value in details.items():
if isinstance(value, list):
print(f" {key}:")
for item in value:
print(f" - {item}")
else:
print(f" {key}: {value}")
return principles
def receptive_field_analysis(self):
"""Receptive field analysis"""
# Using VGG16 as example
layers = [
("conv1_1", 3, 1, 1),
("conv1_2", 3, 1, 1),
("pool1", 2, 2, 0),
("conv2_1", 3, 1, 1),
("conv2_2", 3, 1, 1),
("pool2", 2, 2, 0),
("conv3_1", 3, 1, 1),
("conv3_2", 3, 1, 1),
("conv3_3", 3, 1, 1),
("pool3", 2, 2, 0),
("conv4_1", 3, 1, 1),
("conv4_2", 3, 1, 1),
("conv4_3", 3, 1, 1),
("pool4", 2, 2, 0),
("conv5_1", 3, 1, 1),
("conv5_2", 3, 1, 1),
("conv5_3", 3, 1, 1),
("pool5", 2, 2, 0)
]
rf = 1
jump = 1
results = []
for name, kernel, stride, padding in layers:
if 'conv' in name:
rf = rf + (kernel - 1) * jump
elif 'pool' in name:
rf = rf + (kernel - 1) * jump
jump = jump * stride
results.append({
'layer': name,
'receptive_field': rf,
'jump': jump
})
print("\nVGG16 Receptive Field Analysis:")
print("-" * 40)
for result in results:
print(f"{result['layer']:10} | RF: {result['receptive_field']:3d} | Jump: {result['jump']:2d}")
return results
# Usage example
vgg = VGGNet()
# Build VGG16
model_vgg16 = vgg.build_vgg('VGG16', num_classes=1000)
print("VGG16 Model Structure:")
print(model_vgg16)
# Architecture comparison
arch_comparison = vgg.architecture_comparison()
# Design principles
design_principles = vgg.design_principles()
# Receptive field analysis
rf_analysis = vgg.receptive_field_analysis()
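To make the "3x3 Kernels" design principle concrete, the snippet below counts the weights of two stacked 3x3 convolutions against a single 5x5 convolution with the same input and output channels; the channel count of 64 is an illustrative choice, and both options cover the same 5x5 receptive field.
# Two stacked 3x3 convs vs one 5x5 conv with identical in/out channels
channels = 64
two_3x3 = nn.Sequential(
    nn.Conv2d(channels, channels, kernel_size=3, padding=1, bias=False),
    nn.Conv2d(channels, channels, kernel_size=3, padding=1, bias=False),
)
one_5x5 = nn.Conv2d(channels, channels, kernel_size=5, padding=2, bias=False)
params_3x3 = sum(p.numel() for p in two_3x3.parameters())
params_5x5 = sum(p.numel() for p in one_5x5.parameters())
print(f"Two 3x3 convs: {params_3x3:,} weights")   # 2 * 9 * 64 * 64 = 73,728
print(f"One 5x5 conv:  {params_5x5:,} weights")   # 25 * 64 * 64 = 163,840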
# Parameter statistics
def detailed_parameter_count(model):
total = 0
for name, param in model.named_parameters():
if param.requires_grad:
num_params = param.numel()
total += num_params
print(f"{name:30} | Shape: {str(list(param.shape)):20} | Params: {num_params:,}")
print(f"\nTotal parameters: {total:,}")
return total
print(f"\nVGG16 Detailed Parameter Statistics:")
print("-" * 80)
total_params = detailed_parameter_count(model_vgg16)
2.3.4 ResNet (Residual Networks)
class ResNet:
def __init__(self):
self.architecture_info = {
"year": "2015",
"author": "Kaiming He et al.",
"breakthrough": "Solved deep network degradation problem",
"core": "Residual connection (Skip Connection)"
}
def residual_block_concept(self):
"""Residual block concept explanation"""
concept = {
"Traditional Network Problem": {
"phenomenon": "As network gets deeper, performance degrades",
"reasons": ["Vanishing gradient", "Exploding gradient", "Optimization difficulty"],
"example": "56-layer network performs worse than 20-layer"
},
"Residual Learning": {
"idea": "Learn residual function rather than direct mapping",
"formula": "H(x) = F(x) + x",
"advantage": "Even if F(x)=0, there's identity mapping x"
},
"Skip Connection": {
"mechanism": "Input directly added to output",
"effects": ["Alleviates vanishing gradient", "Promotes information flow", "Makes network easier to optimize"],
"implementation": "element-wise addition"
}
}
print("ResNet Residual Block Concept:")
print("=" * 40)
for key, details in concept.items():
print(f"\n{key}:")
for k, v in details.items():
if isinstance(v, list):
print(f" {k}:")
for item in v:
print(f" - {item}")
else:
print(f" {k}: {v}")
return concept
    def build_resnet(self, layers=[3, 4, 6, 3], num_classes=1000, block_type='bottleneck'):
"""Build ResNet model"""
class BasicBlock(nn.Module):
expansion = 1
def __init__(self, in_planes, planes, stride=1):
super(BasicBlock, self).__init__()
self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3,
stride=stride, padding=1, bias=False)
self.bn1 = nn.BatchNorm2d(planes)
self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,
stride=1, padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(planes)
self.shortcut = nn.Sequential()
if stride != 1 or in_planes != self.expansion * planes:
self.shortcut = nn.Sequential(
nn.Conv2d(in_planes, self.expansion * planes,
kernel_size=1, stride=stride, bias=False),
nn.BatchNorm2d(self.expansion * planes)
)
def forward(self, x):
out = torch.relu(self.bn1(self.conv1(x)))
out = self.bn2(self.conv2(out))
out += self.shortcut(x) # Residual connection
out = torch.relu(out)
return out
class Bottleneck(nn.Module):
expansion = 4
def __init__(self, in_planes, planes, stride=1):
super(Bottleneck, self).__init__()
self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
self.bn1 = nn.BatchNorm2d(planes)
self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,
stride=stride, padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(planes)
self.conv3 = nn.Conv2d(planes, self.expansion * planes,
kernel_size=1, bias=False)
self.bn3 = nn.BatchNorm2d(self.expansion * planes)
self.shortcut = nn.Sequential()
if stride != 1 or in_planes != self.expansion * planes:
self.shortcut = nn.Sequential(
nn.Conv2d(in_planes, self.expansion * planes,
kernel_size=1, stride=stride, bias=False),
nn.BatchNorm2d(self.expansion * planes)
)
def forward(self, x):
out = torch.relu(self.bn1(self.conv1(x)))
out = torch.relu(self.bn2(self.conv2(out)))
out = self.bn3(self.conv3(out))
out += self.shortcut(x) # Residual connection
out = torch.relu(out)
return out
class ResNetModel(nn.Module):
def __init__(self, block, layers, num_classes):
super(ResNetModel, self).__init__()
self.in_planes = 64
# Initial convolution layer
self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2,
padding=3, bias=False)
self.bn1 = nn.BatchNorm2d(64)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
# Residual block groups
self.layer1 = self._make_layer(block, 64, layers[0], stride=1)
self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
# Global average pooling and classifier
self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
self.fc = nn.Linear(512 * block.expansion, num_classes)
def _make_layer(self, block, planes, blocks, stride):
strides = [stride] + [1] * (blocks - 1)
layers = []
for stride in strides:
layers.append(block(self.in_planes, planes, stride))
self.in_planes = planes * block.expansion
return nn.Sequential(*layers)
def forward(self, x):
x = torch.relu(self.bn1(self.conv1(x)))
x = self.maxpool(x)
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
x = self.avgpool(x)
x = torch.flatten(x, 1)
x = self.fc(x)
return x
        # Select block type: BasicBlock for ResNet18/34, Bottleneck for ResNet50/101/152
        if block_type == 'basic':   # e.g. layers=[2, 2, 2, 2] (ResNet18) or [3, 4, 6, 3] (ResNet34)
            return ResNetModel(BasicBlock, layers, num_classes)
        else:                       # Bottleneck blocks, e.g. layers=[3, 4, 6, 3] (ResNet50)
            return ResNetModel(Bottleneck, layers, num_classes)
def resnet_variants(self):
"""ResNet variants"""
variants = {
"ResNet18": {
"structure": [2, 2, 2, 2],
"block_type": "BasicBlock",
"param_count": "11.7M",
"characteristic": "Lightweight, suitable for resource-constrained environments"
},
"ResNet34": {
"structure": [3, 4, 6, 3],
"block_type": "BasicBlock",
"param_count": "21.8M",
"characteristic": "Medium scale, balanced performance"
},
"ResNet50": {
"structure": [3, 4, 6, 3],
"block_type": "Bottleneck",
"param_count": "25.6M",
"characteristic": "Classic version, widely used"
},
"ResNet101": {
"structure": [3, 4, 23, 3],
"block_type": "Bottleneck",
"param_count": "44.5M",
"characteristic": "Deeper network, better performance"
},
"ResNet152": {
"structure": [3, 8, 36, 3],
"block_type": "Bottleneck",
"param_count": "60.2M",
"characteristic": "Deepest version, best on ImageNet"
}
}
print("ResNet Variant Comparison:")
print("=" * 50)
for variant, info in variants.items():
print(f"\n{variant}:")
for key, value in info.items():
print(f" {key}: {value}")
return variants
def gradient_flow_analysis(self):
"""Gradient flow analysis"""
def simulate_gradient_flow():
# Simulate gradient flow at different depths
depths = [10, 20, 50, 100, 152]
# Gradient decay in vanilla networks (simplified simulation)
vanilla_gradients = []
resnet_gradients = []
for depth in depths:
# Assume decay rate per layer
vanilla_decay = 0.9 ** depth
resnet_decay = 0.95 ** depth # Residual connection alleviates decay
vanilla_gradients.append(vanilla_decay)
resnet_gradients.append(resnet_decay)
# Visualize gradient flow
plt.figure(figsize=(10, 6))
plt.plot(depths, vanilla_gradients, 'r-o', label='Vanilla CNN')
plt.plot(depths, resnet_gradients, 'b-o', label='ResNet')
plt.xlabel('Network Depth')
plt.ylabel('Gradient Magnitude (simulated)')
plt.title('Gradient Flow Comparison')
plt.legend()
plt.grid(True)
plt.yscale('log')
return depths, vanilla_gradients, resnet_gradients
depths, vanilla_grad, resnet_grad = simulate_gradient_flow()
print("Gradient Flow Analysis:")
print("-" * 30)
for i, depth in enumerate(depths):
print(f"Depth {depth:3d}: Vanilla={vanilla_grad[i]:.6f}, ResNet={resnet_grad[i]:.6f}")
return depths, vanilla_grad, resnet_grad
# Usage example
resnet = ResNet()
# Residual block concept
concept = resnet.residual_block_concept()
# Build ResNet50 ([3, 4, 6, 3] with Bottleneck blocks)
model_resnet50 = resnet.build_resnet(layers=[3, 4, 6, 3], num_classes=1000, block_type='bottleneck')
print(f"\nResNet50 Model Structure:")
print(model_resnet50)
# ResNet variants
variants = resnet.resnet_variants()
# Gradient flow analysis
gradient_analysis = resnet.gradient_flow_analysis()
# Test forward propagation
sample_input = torch.randn(1, 3, 224, 224)
with torch.no_grad():
output = model_resnet50(sample_input)
print(f"\nInput size: {sample_input.shape}")
print(f"Output size: {output.shape}")
# Parameter count
def count_parameters(model):
total = sum(p.numel() for p in model.parameters() if p.requires_grad)
return total
resnet50_params = count_parameters(model_resnet50)
print(f"\nResNet50 parameter count: {resnet50_params:,}")
2.4 Backpropagation and Optimization Algorithms
2.4.1 Backpropagation Algorithm
class BackpropagationDemo:
def __init__(self):
pass
def simple_network_example(self):
"""Simple network backpropagation demonstration"""
class SimpleNet:
def __init__(self):
# Weight initialization
self.W1 = np.random.randn(2, 3) * 0.01 # Input to hidden layer
self.b1 = np.zeros((1, 3))
self.W2 = np.random.randn(3, 1) * 0.01 # Hidden to output layer
self.b2 = np.zeros((1, 1))
# Save intermediate variables for backpropagation
self.z1 = None
self.a1 = None
self.z2 = None
self.a2 = None
def sigmoid(self, z):
"""Sigmoid activation function"""
return 1 / (1 + np.exp(-np.clip(z, -500, 500)))
def sigmoid_derivative(self, z):
"""Sigmoid derivative"""
s = self.sigmoid(z)
return s * (1 - s)
def forward(self, X):
"""Forward propagation"""
self.z1 = np.dot(X, self.W1) + self.b1
self.a1 = self.sigmoid(self.z1)
self.z2 = np.dot(self.a1, self.W2) + self.b2
self.a2 = self.sigmoid(self.z2)
return self.a2
def compute_cost(self, Y, A2):
"""Compute loss"""
m = Y.shape[0]
cost = -np.sum(Y * np.log(A2 + 1e-8) + (1 - Y) * np.log(1 - A2 + 1e-8)) / m
return cost
def backward(self, X, Y):
"""Backpropagation"""
m = X.shape[0]
# Output layer gradient
dZ2 = self.a2 - Y # For sigmoid + cross-entropy
dW2 = np.dot(self.a1.T, dZ2) / m
db2 = np.sum(dZ2, axis=0, keepdims=True) / m
# Hidden layer gradient
dA1 = np.dot(dZ2, self.W2.T)
dZ1 = dA1 * self.sigmoid_derivative(self.z1)
dW1 = np.dot(X.T, dZ1) / m
db1 = np.sum(dZ1, axis=0, keepdims=True) / m
gradients = {
"dW1": dW1, "db1": db1,
"dW2": dW2, "db2": db2
}
return gradients
def update_parameters(self, gradients, learning_rate):
"""Update parameters"""
self.W1 -= learning_rate * gradients["dW1"]
self.b1 -= learning_rate * gradients["db1"]
self.W2 -= learning_rate * gradients["dW2"]
self.b2 -= learning_rate * gradients["db2"]
def train(self, X, Y, epochs, learning_rate):
"""Training process"""
costs = []
for i in range(epochs):
# Forward propagation
A2 = self.forward(X)
# Compute loss
cost = self.compute_cost(Y, A2)
costs.append(cost)
# Backpropagation
gradients = self.backward(X, Y)
# Update parameters
self.update_parameters(gradients, learning_rate)
if i % 100 == 0:
print(f"Cost after epoch {i}: {cost:.6f}")
return costs
return SimpleNet
def gradient_checking(self):
"""Gradient checking"""
def numerical_gradient(f, x, h=1e-5):
"""Numerical gradient calculation"""
grad = np.zeros_like(x)
it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
while not it.finished:
idx = it.multi_index
old_value = x[idx]
x[idx] = old_value + h
fxh_pos = f(x)
x[idx] = old_value - h
fxh_neg = f(x)
grad[idx] = (fxh_pos - fxh_neg) / (2 * h)
x[idx] = old_value
it.iternext()
return grad
def gradient_check_example():
"""Gradient checking example"""
# Simple quadratic function f(x) = x^2 + 2x + 1
def f(x):
return np.sum(x**2 + 2*x + 1)
def analytical_grad(x):
return 2*x + 2
x = np.array([1.0, 2.0, -1.0])
# Analytical gradient
grad_analytical = analytical_grad(x)
# Numerical gradient
grad_numerical = numerical_gradient(f, x)
# Calculate difference
diff = np.linalg.norm(grad_analytical - grad_numerical) / \
(np.linalg.norm(grad_analytical) + np.linalg.norm(grad_numerical))
print("Gradient Checking Results:")
print(f"Analytical gradient: {grad_analytical}")
print(f"Numerical gradient: {grad_numerical}")
print(f"Relative error: {diff:.10f}")
if diff < 1e-7:
print("✓ Gradient check passed")
else:
print("✗ Gradient check failed")
return diff
return gradient_check_example
def backprop_intuition(self):
"""Backpropagation intuition"""
intuition = {
"Chain Rule": {
"math_foundation": "Basic rule for derivative of composite functions",
"formula": "∂L/∂w = (∂L/∂y) × (∂y/∂z) × (∂z/∂w)",
"meaning": "Gradient of loss w.r.t. weights equals product of layer derivatives"
},
"Computational Graph": {
"concept": "Represent computation process as directed acyclic graph",
"forward": "Calculate output in graph direction",
"backward": "Calculate gradients in reverse graph direction",
"advantage": "Can automate differentiation process"
},
"Vanishing Gradient": {
"cause": "Gradients multiply across layers in deep networks",
"problem": "Shallow layer parameters update very slowly, learning difficulty",
"solutions": ["ReLU activation", "Residual connections", "BatchNorm"]
},
"Exploding Gradient": {
"cause": "Gradients grow exponentially during propagation",
"problem": "Parameter updates too large, unstable training",
"solutions": ["Gradient clipping", "Weight initialization", "Learning rate adjustment"]
}
}
print("Backpropagation Intuition:")
print("=" * 40)
for concept, details in intuition.items():
print(f"\n{concept}:")
for key, value in details.items():
if isinstance(value, list):
print(f" {key}:")
for item in value:
print(f" - {item}")
else:
print(f" {key}: {value}")
return intuition
# Usage example
backprop_demo = BackpropagationDemo()
# Simple network example
SimpleNet = backprop_demo.simple_network_example()
net = SimpleNet()
# Generate training data (XOR problem)
np.random.seed(42)
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
Y = np.array([[0], [1], [1], [0]])
print("Training simple neural network to solve XOR problem:")
print("=" * 40)
# Train network
costs = net.train(X, Y, epochs=5000, learning_rate=1.0)
# Test results
predictions = net.forward(X)
print(f"\nFinal prediction results:")
for i in range(len(X)):
print(f"Input: {X[i]} -> Prediction: {predictions[i][0]:.6f}, Actual: {Y[i][0]}")
# Gradient checking
gradient_checker = backprop_demo.gradient_checking()
gradient_checker()
# Backpropagation intuition
intuition = backprop_demo.backprop_intuition()
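Of the remedies for exploding gradients listed above, gradient clipping is the most mechanical to apply. The sketch below shows where it sits in a standard PyTorch training step; the tiny linear model, random data, and max_norm value are placeholders for illustration.
# Gradient clipping in a PyTorch training step (illustrative placeholders)
clip_model = nn.Linear(10, 1)
clip_optimizer = optim.SGD(clip_model.parameters(), lr=0.1)
xb, yb = torch.randn(8, 10), torch.randn(8, 1)
clip_optimizer.zero_grad()
loss = nn.functional.mse_loss(clip_model(xb), yb)
loss.backward()
# Rescale gradients so their global norm does not exceed max_norm
torch.nn.utils.clip_grad_norm_(clip_model.parameters(), max_norm=1.0)
clip_optimizer.step()
print(f"Training loss for the clipped step: {loss.item():.4f}")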
2.4.2 Optimization Algorithms
class OptimizationAlgorithms:
def __init__(self):
pass
def gradient_descent_variants(self):
"""Gradient descent variants"""
class SGD:
def __init__(self, learning_rate=0.01):
self.learning_rate = learning_rate
def update(self, params, gradients):
"""Stochastic gradient descent update"""
updated_params = {}
for key in params:
updated_params[key] = params[key] - self.learning_rate * gradients[key]
return updated_params
class MomentumSGD:
def __init__(self, learning_rate=0.01, momentum=0.9):
self.learning_rate = learning_rate
self.momentum = momentum
self.velocity = {}
def update(self, params, gradients):
"""Momentum gradient descent"""
updated_params = {}
for key in params:
if key not in self.velocity:
self.velocity[key] = np.zeros_like(params[key])
# Update velocity
self.velocity[key] = self.momentum * self.velocity[key] - self.learning_rate * gradients[key]
# Update parameters
updated_params[key] = params[key] + self.velocity[key]
return updated_params
class AdaGrad:
def __init__(self, learning_rate=0.01, eps=1e-8):
self.learning_rate = learning_rate
self.eps = eps
self.accumulator = {}
def update(self, params, gradients):
"""AdaGrad optimizer"""
updated_params = {}
for key in params:
if key not in self.accumulator:
self.accumulator[key] = np.zeros_like(params[key])
# Accumulate squared gradient
self.accumulator[key] += gradients[key] ** 2
# Adaptive learning rate update
adapted_lr = self.learning_rate / (np.sqrt(self.accumulator[key]) + self.eps)
updated_params[key] = params[key] - adapted_lr * gradients[key]
return updated_params
class Adam:
def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
self.learning_rate = learning_rate
self.beta1 = beta1
self.beta2 = beta2
self.eps = eps
self.m = {} # First moment estimate
self.v = {} # Second moment estimate
self.t = 0 # Time step
def update(self, params, gradients):
"""Adam optimizer"""
self.t += 1
updated_params = {}
for key in params:
if key not in self.m:
self.m[key] = np.zeros_like(params[key])
self.v[key] = np.zeros_like(params[key])
# Update first and second moment estimates
self.m[key] = self.beta1 * self.m[key] + (1 - self.beta1) * gradients[key]
self.v[key] = self.beta2 * self.v[key] + (1 - self.beta2) * (gradients[key] ** 2)
# Bias correction
m_hat = self.m[key] / (1 - self.beta1 ** self.t)
v_hat = self.v[key] / (1 - self.beta2 ** self.t)
# Parameter update
updated_params[key] = params[key] - self.learning_rate * m_hat / (np.sqrt(v_hat) + self.eps)
return updated_params
return {
'SGD': SGD,
'MomentumSGD': MomentumSGD,
'AdaGrad': AdaGrad,
'Adam': Adam
}
def optimizer_comparison(self):
"""Optimizer comparison experiment"""
# Define optimization problem: Rosenbrock function
def rosenbrock(x, y):
return (1 - x)**2 + 100 * (y - x**2)**2
def rosenbrock_gradient(x, y):
dx = -2 * (1 - x) - 400 * x * (y - x**2)
dy = 200 * (y - x**2)
return np.array([dx, dy])
# Initialize optimizers
optimizers = self.gradient_descent_variants()
sgd = optimizers['SGD'](learning_rate=0.001)
momentum = optimizers['MomentumSGD'](learning_rate=0.001)
adagrad = optimizers['AdaGrad'](learning_rate=0.1)
adam = optimizers['Adam'](learning_rate=0.01)
# Optimization process
def optimize_function(optimizer, steps=1000):
params = {'xy': np.array([0.0, 0.0])} # Starting point
trajectory = [params['xy'].copy()]
losses = []
for i in range(steps):
x, y = params['xy']
loss = rosenbrock(x, y)
losses.append(loss)
gradients = {'xy': rosenbrock_gradient(x, y)}
params = optimizer.update(params, gradients)
trajectory.append(params['xy'].copy())
# Prevent divergence
if loss > 1e6:
break
return np.array(trajectory), losses
# Run comparison
results = {}
opt_instances = {
'SGD': sgd,
'Momentum': momentum,
'AdaGrad': adagrad,
'Adam': adam
}
print("Optimizer Performance Comparison (Rosenbrock function):")
print("=" * 50)
for name, opt in opt_instances.items():
trajectory, losses = optimize_function(opt, 1000)
final_loss = losses[-1]
results[name] = {
'trajectory': trajectory,
'losses': losses,
'final_loss': final_loss,
'converged': final_loss < 1.0
}
print(f"{name:10} | Final loss: {final_loss:8.6f} | Converged: {'✓' if final_loss < 1.0 else '✗'}")
# Visualize optimization paths
def plot_optimization_paths():
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
# Plot Rosenbrock function contours
x = np.linspace(-0.5, 1.5, 100)
y = np.linspace(-0.5, 1.5, 100)
X, Y = np.meshgrid(x, y)
Z = rosenbrock(X, Y)
colors = ['red', 'blue', 'green', 'orange']
for i, (name, result) in enumerate(results.items()):
trajectory = result['trajectory']
losses = result['losses']
# Optimization path
if i == 0:
ax1.contour(X, Y, Z, levels=50, alpha=0.3, colors='gray')
ax1.plot(trajectory[:, 0], trajectory[:, 1],
color=colors[i], label=name, linewidth=2)
ax1.plot(trajectory[0, 0], trajectory[0, 1],
'o', color=colors[i], markersize=8)
ax1.plot(trajectory[-1, 0], trajectory[-1, 1],
's', color=colors[i], markersize=8)
# Loss curve
ax2.plot(losses[:min(500, len(losses))],
color=colors[i], label=name, linewidth=2)
ax1.plot(1, 1, 'k*', markersize=15, label='Global Minimum')
ax1.set_xlabel('x')
ax1.set_ylabel('y')
ax1.set_title('Optimization Paths')
ax1.legend()
ax1.grid(True)
ax2.set_xlabel('Iterations')
ax2.set_ylabel('Loss')
ax2.set_title('Loss Curves')
ax2.set_yscale('log')
ax2.legend()
ax2.grid(True)
plt.tight_layout()
return fig
return results, plot_optimization_paths
def learning_rate_scheduling(self):
"""Learning rate scheduling strategies"""
class LRScheduler:
def __init__(self):
pass
def step_decay(self, initial_lr, epoch, drop_rate=0.5, epochs_drop=10):
"""Step decay"""
return initial_lr * (drop_rate ** (epoch // epochs_drop))
def exponential_decay(self, initial_lr, epoch, decay_rate=0.95):
"""Exponential decay"""
return initial_lr * (decay_rate ** epoch)
def cosine_annealing(self, initial_lr, epoch, max_epochs):
"""Cosine annealing"""
return initial_lr * (1 + np.cos(np.pi * epoch / max_epochs)) / 2
def warm_up_cosine(self, initial_lr, epoch, warmup_epochs, max_epochs):
"""Warm-up + cosine annealing"""
if epoch < warmup_epochs:
return initial_lr * epoch / warmup_epochs
else:
return self.cosine_annealing(initial_lr, epoch - warmup_epochs,
max_epochs - warmup_epochs)
def visualize_schedules(self, initial_lr=0.1, max_epochs=100):
"""Visualize different scheduling strategies"""
epochs = np.arange(max_epochs)
schedules = {
'Constant': [initial_lr] * max_epochs,
'Step Decay': [self.step_decay(initial_lr, e) for e in epochs],
'Exponential': [self.exponential_decay(initial_lr, e) for e in epochs],
'Cosine': [self.cosine_annealing(initial_lr, e, max_epochs) for e in epochs],
'Warm-up Cosine': [self.warm_up_cosine(initial_lr, e, 10, max_epochs) for e in epochs]
}
plt.figure(figsize=(12, 8))
for name, schedule in schedules.items():
plt.plot(epochs, schedule, label=name, linewidth=2)
plt.xlabel('Epoch')
plt.ylabel('Learning Rate')
plt.title('Learning Rate Scheduling Strategies')
plt.legend()
plt.grid(True)
plt.yscale('log')
return schedules
return LRScheduler()
# Usage example
opt_algorithms = OptimizationAlgorithms()
# Get optimizer classes
optimizers = opt_algorithms.gradient_descent_variants()
# Optimizer comparison
results, plot_function = opt_algorithms.optimizer_comparison()
# Visualize results (if needed)
# fig = plot_function()
# plt.show()
# Learning rate scheduling
lr_scheduler = opt_algorithms.learning_rate_scheduling()
schedules = lr_scheduler.visualize_schedules()
print(f"\nLearning rate scheduling example (first 10 epochs):")
print("-" * 40)
for name, schedule in schedules.items():
print(f"{name:15}: {schedule[:10]}")
2.5 Chapter Summary
2.5.1 Core Concepts Review
- Deep learning automatically learns feature representations through multi-layer networks
- Convolutional neural networks use convolution operations to extract spatial features
- Classic architectures demonstrate the evolution and design philosophy of CNNs
- Backpropagation is the core algorithm for deep network training
- Optimization algorithms determine network convergence performance
2.5.2 Important Technical Points
- Convolution operation: Parameter sharing, local connectivity, translation invariance
- Pooling operation: Dimensionality reduction, reduced computation, enhanced robustness
- Activation functions: Introduce nonlinearity, ReLU alleviates vanishing gradient
- Residual connections: Solve deep network degradation problem
- Batch normalization: Accelerates convergence, improves stability
2.5.3 Architecture Evolution Insights
- LeNet: Established basic CNN structure
- AlexNet: Proved the power of deep learning
- VGG: Advantages of small kernels
- ResNet: Breakthrough of residual learning
2.5.4 Optimization Algorithm Selection
- SGD: Simple and effective, suitable for large batches
- Momentum: Accelerates convergence, crosses saddle points
- Adam: Adaptive learning rate, widely applicable
- Learning rate scheduling: Key technique in training process
2.5.5 Next Chapter Preview
The next chapter studies the evolution of object detection and its classic algorithms, tracing the progression from traditional methods to deep learning approaches in preparation for an in-depth treatment of the YOLO algorithms. We will learn:
- Traditional object detection methods
- Two-stage detection algorithms (R-CNN series)
- Advantages of one-stage detection algorithms
- How these approaches lay the foundation for learning YOLO
Through this chapter, we have mastered the fundamentals of deep learning and CNNs, providing a solid technical foundation for subsequent study of YOLO algorithms.