Chapter 3: Object Detection Evolution and Classic Algorithms
Haiyue
60min
Learning Objectives
- Understand the evolution of object detection algorithms
- Master traditional object detection methods (HOG+SVM, DPM, etc.)
- Comprehend two-stage detection algorithms (R-CNN, Fast R-CNN, Faster R-CNN)
- Recognize the advantages of one-stage detection algorithms
3.1 Object Detection Evolution Overview
3.1.1 Development Timeline
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import matplotlib.dates as mdates
class ObjectDetectionHistory:
def __init__(self):
        # Use a list of (year, info) pairs rather than a dict keyed by year:
        # several years hold more than one milestone, and duplicate dict keys
        # would silently overwrite earlier entries.
        self.timeline = [
            (2001, {"Algorithm": "Viola-Jones", "Type": "Traditional", "Breakthrough": "Real-time face detection"}),
            (2005, {"Algorithm": "HOG", "Type": "Traditional", "Breakthrough": "Histogram of Oriented Gradients features"}),
            (2008, {"Algorithm": "DPM", "Type": "Traditional", "Breakthrough": "Deformable Part Models"}),
            (2012, {"Algorithm": "AlexNet", "Type": "Deep Learning", "Breakthrough": "CNN breakthrough in image classification"}),
            (2014, {"Algorithm": "R-CNN", "Type": "Two-Stage", "Breakthrough": "CNN for object detection"}),
            (2015, {"Algorithm": "Fast R-CNN", "Type": "Two-Stage", "Breakthrough": "End-to-end training"}),
            (2015, {"Algorithm": "Faster R-CNN", "Type": "Two-Stage", "Breakthrough": "RPN network"}),
            (2015, {"Algorithm": "YOLO v1", "Type": "One-Stage", "Breakthrough": "Real-time object detection"}),
            (2016, {"Algorithm": "SSD", "Type": "One-Stage", "Breakthrough": "Multi-scale detection"}),
            (2017, {"Algorithm": "RetinaNet", "Type": "One-Stage", "Breakthrough": "Focal Loss"}),
            (2017, {"Algorithm": "Mask R-CNN", "Type": "Two-Stage", "Breakthrough": "Instance segmentation"}),
            (2018, {"Algorithm": "YOLO v3", "Type": "One-Stage", "Breakthrough": "Multi-scale prediction"}),
            (2020, {"Algorithm": "EfficientDet", "Type": "One-Stage", "Breakthrough": "Efficient architecture design"}),
            (2020, {"Algorithm": "DETR", "Type": "Transformer", "Breakthrough": "End-to-end Transformer detection"})
        ]
def create_timeline_visualization(self):
"""Create development timeline visualization"""
        years = [year for year, _ in self.timeline]
        algorithms = [info["Algorithm"] for _, info in self.timeline]
        types = [info["Type"] for _, info in self.timeline]
# Color mapping
color_map = {
"Traditional": "red",
"Deep Learning": "blue",
"Two-Stage": "green",
"One-Stage": "orange",
"Transformer": "purple"
}
colors = [color_map[t] for t in types]
# Create timeline plot
fig, ax = plt.subplots(figsize=(15, 8))
# Draw timeline
ax.scatter(years, range(len(years)), c=colors, s=100, alpha=0.7)
# Add algorithm labels
for i, (year, alg) in enumerate(zip(years, algorithms)):
ax.annotate(f"{alg}\n({year})",
(year, i),
xytext=(10, 0),
textcoords='offset points',
ha='left',
fontsize=10,
bbox=dict(boxstyle='round,pad=0.3',
facecolor=colors[i],
alpha=0.3))
# Set plot properties
ax.set_xlabel("Year", fontsize=12)
ax.set_ylabel("Development Stage", fontsize=12)
ax.set_title("Object Detection Algorithm Timeline", fontsize=14, fontweight='bold')
ax.grid(True, alpha=0.3)
# Add legend
legend_elements = [plt.Line2D([0], [0], marker='o', color='w',
markerfacecolor=color, markersize=10,
label=method)
for method, color in color_map.items()]
ax.legend(handles=legend_elements, loc='upper left')
return fig
def development_phases(self):
"""Development phase analysis"""
phases = {
"Traditional Methods Era (2001-2012)": {
"Characteristics": [
"Based on hand-crafted features",
"Using traditional machine learning algorithms",
"High computational efficiency but limited accuracy"
],
"Representative Algorithms": ["Viola-Jones", "HOG+SVM", "DPM"],
"Main Contribution": "Established the foundational framework for object detection"
},
"Deep Learning Emergence (2012-2014)": {
"Characteristics": [
"CNN breakthrough in image classification",
"Introducing deep learning to object detection",
"Beginning of end-to-end learning"
],
"Representative Algorithms": ["AlexNet", "R-CNN"],
"Main Contribution": "Proved the advantage of deep learning in vision tasks"
},
"Two-Stage Methods Refinement (2014-2017)": {
"Characteristics": [
"Region proposals + classification and regression",
"Pursuing high accuracy",
"Relatively slow speed"
],
"Representative Algorithms": ["R-CNN", "Fast R-CNN", "Faster R-CNN", "Mask R-CNN"],
"Main Contribution": "Established the standard paradigm for two-stage detection"
},
"One-Stage Methods Development (2015-2020)": {
"Characteristics": [
"End-to-end direct prediction",
"Pursuing speed-accuracy balance",
"Suitable for real-time applications"
],
"Representative Algorithms": ["YOLO", "SSD", "RetinaNet"],
"Main Contribution": "Achieved real-time high-accuracy object detection"
},
"New Architecture Exploration (2020-)": {
"Characteristics": [
"Introduction of Transformer architecture",
"Self-attention mechanism",
"End-to-end anchor-free detection"
],
"Representative Algorithms": ["DETR", "DETR series"],
"Main Contribution": "Exploring new network architecture possibilities"
}
}
print("Object Detection Development Phase Analysis:")
print("=" * 50)
for phase, details in phases.items():
print(f"\n{phase}:")
print(f" Characteristics:")
for feature in details["Characteristics"]:
print(f" - {feature}")
print(f" Representative Algorithms: {', '.join(details['Representative Algorithms'])}")
print(f" Main Contribution: {details['Main Contribution']}")
return phases
# Usage example
history = ObjectDetectionHistory()
# Create timeline plot
# timeline_fig = history.create_timeline_visualization()
# plt.show()
# Development phase analysis
phases = history.development_phases()
3.1.2 Technology Evolution Analysis
class TechnologyEvolution:
def __init__(self):
self.evolution_aspects = {
"Feature Representation": {
"Traditional Methods": {
"Features": ["HOG", "SIFT", "SURF", "Haar-like"],
"Advantages": "Simple computation, strong interpretability",
"Disadvantages": "Limited expressive power, requires specialized design"
},
"Deep Learning": {
"Features": "CNN automatically learns features",
"Advantages": "Strong expressive power, end-to-end learning",
"Disadvantages": "Requires large amounts of data, complex computation"
}
},
"Region Proposal Generation": {
"Sliding Window": {
"Method": "Exhaustively search all positions and scales",
"Advantages": "Simple and straightforward",
"Disadvantages": "High computational cost, much redundancy"
},
"Selective Search": {
"Method": "Based on segmentation and merging strategy",
"Advantages": "Significantly reduces region proposals",
"Disadvantages": "Depends on segmentation quality"
},
"Learning-based Generation": {
"Method": "Neural network generation like RPN",
"Advantages": "End-to-end learning, high quality",
"Disadvantages": "Increases network complexity"
},
"Anchor-free": {
"Method": "Directly regress detection results",
"Advantages": "Fast speed, simple architecture",
"Disadvantages": "Relatively lower localization accuracy"
}
},
"Multi-scale Processing": {
"Image Pyramid": {
"Method": "Build images at different scales",
"Advantages": "Comprehensive processing",
"Disadvantages": "Computation cost multiplies"
},
"Feature Pyramid": {
"Method": "Utilize features from different CNN layers",
"Advantages": "Computationally efficient",
"Disadvantages": "Inconsistent feature semantics"
},
"Feature Pyramid Network": {
"Method": "Top-down and lateral connections",
"Advantages": "Strong semantics and high resolution coexist",
"Disadvantages": "Complex network structure"
}
}
}
def analyze_evolution(self):
"""Technology evolution analysis"""
print("Object Detection Technology Evolution Analysis:")
print("=" * 60)
for aspect, methods in self.evolution_aspects.items():
print(f"\n[{aspect}]")
for method, details in methods.items():
print(f"\n {method}:")
for key, value in details.items():
if isinstance(value, list):
print(f" {key}: {', '.join(value)}")
else:
print(f" {key}: {value}")
def performance_trends(self):
"""Performance development trends"""
        # Illustrative performance numbers (approximate, drawn from mixed benchmarks; for trend visualization only)
algorithms = [
"DPM", "R-CNN", "Fast R-CNN", "Faster R-CNN",
"YOLO v1", "SSD", "YOLO v2", "RetinaNet",
"YOLO v3", "EfficientDet", "YOLO v5"
]
# mAP values (approximate data)
map_values = [33.7, 53.3, 66.9, 73.2, 63.4, 74.3, 78.6, 80.0, 82.0, 84.3, 85.0]
# FPS values (approximate data)
fps_values = [0.07, 0.05, 0.5, 7, 45, 19, 40, 5, 20, 15, 60]
        # Publication years (for reference; not used in the plot)
        years = [2008, 2014, 2015, 2015, 2015, 2016, 2017, 2017, 2018, 2020, 2020]
# Create dual-axis plot
fig, ax1 = plt.subplots(figsize=(14, 8))
# mAP trend
color1 = 'tab:red'
ax1.set_xlabel('Algorithm Development Order')
ax1.set_ylabel('mAP (%)', color=color1)
line1 = ax1.plot(algorithms, map_values, 'o-', color=color1, linewidth=2, markersize=8, label='mAP')
ax1.tick_params(axis='y', labelcolor=color1)
ax1.set_ylim(30, 90)
# FPS trend
ax2 = ax1.twinx()
color2 = 'tab:blue'
ax2.set_ylabel('FPS', color=color2)
line2 = ax2.plot(algorithms, fps_values, 's-', color=color2, linewidth=2, markersize=8, label='FPS')
ax2.tick_params(axis='y', labelcolor=color2)
        ax2.set_yscale('log')  # FPS spans three orders of magnitude
        ax2.set_ylim(0.01, 100)  # a log axis cannot start at 0
# Set title and grid
plt.title('Object Detection Algorithm Performance Trends', fontsize=14, fontweight='bold')
ax1.grid(True, alpha=0.3)
# Rotate x-axis labels
plt.xticks(rotation=45, ha='right')
# Add legend
lines = line1 + line2
labels = ['mAP (%)', 'FPS']
ax1.legend(lines, labels, loc='upper left')
plt.tight_layout()
return fig
def complexity_analysis(self):
"""Computational complexity analysis"""
complexity_data = {
"Traditional Methods": {
"Time Complexity": "O(n²) - sliding window",
"Space Complexity": "O(1) - feature extraction",
"Parameters": "< 1M",
"Characteristics": "Simple computation but limited accuracy"
},
"Two-Stage Methods": {
"Time Complexity": "O(n) - number of region proposals",
"Space Complexity": "O(n) - CNN feature storage",
"Parameters": "100M+",
"Characteristics": "High accuracy but slow speed"
},
"One-Stage Methods": {
"Time Complexity": "O(1) - single forward pass",
"Space Complexity": "O(1) - fixed network structure",
"Parameters": "20-100M",
"Characteristics": "Fast speed, suitable for real-time applications"
}
}
print("Computational Complexity Comparison:")
print("=" * 40)
for method, analysis in complexity_data.items():
print(f"\n{method}:")
for key, value in analysis.items():
print(f" {key}: {value}")
return complexity_data
# Usage example
tech_evolution = TechnologyEvolution()
# Technology evolution analysis
tech_evolution.analyze_evolution()
# Performance trends
# performance_fig = tech_evolution.performance_trends()
# plt.show()
# Complexity analysis
complexity = tech_evolution.complexity_analysis()
3.2 Traditional Object Detection Methods
3.2.1 Viola-Jones Face Detection
class ViolaJonesDetector:
def __init__(self):
self.method_info = {
"Year": 2001,
"Authors": "Paul Viola & Michael Jones",
"Contribution": "First real-time object detection algorithm",
"Application": "Face detection"
}
def haar_like_features(self):
"""Haar-like features"""
features = {
"Edge Features": {
"Pattern": "Difference in pixel sums of adjacent rectangular regions",
"Types": ["Horizontal edges", "Vertical edges"],
"Computation": "White region pixel sum - Black region pixel sum"
},
"Line Features": {
"Pattern": "Contrast between middle and side regions",
"Types": ["Horizontal lines", "Vertical lines"],
"Usage": "Detect elongated structures"
},
"Center Features": {
"Pattern": "Contrast between center region and surrounding regions",
"Types": ["Four-rectangle features"],
"Usage": "Detect center-prominent structures"
}
}
# Simulate Haar-like feature computation
def compute_haar_feature(image, feature_type, position, scale):
"""
Compute Haar-like feature
image: Input image
feature_type: Feature type
position: Feature position (x, y)
scale: Feature scale
"""
x, y = position
w, h = scale
if feature_type == "edge_horizontal":
# Horizontal edge feature: top half - bottom half
top_sum = np.sum(image[y:y+h//2, x:x+w])
bottom_sum = np.sum(image[y+h//2:y+h, x:x+w])
return top_sum - bottom_sum
elif feature_type == "edge_vertical":
# Vertical edge feature: left half - right half
left_sum = np.sum(image[y:y+h, x:x+w//2])
right_sum = np.sum(image[y:y+h, x+w//2:x+w])
return left_sum - right_sum
elif feature_type == "line_horizontal":
# Horizontal line feature: middle - top and bottom
top_sum = np.sum(image[y:y+h//3, x:x+w])
middle_sum = np.sum(image[y+h//3:y+2*h//3, x:x+w])
bottom_sum = np.sum(image[y+2*h//3:y+h, x:x+w])
return middle_sum - (top_sum + bottom_sum)
return 0
print("Haar-like Feature Types:")
print("=" * 30)
for feature_type, details in features.items():
print(f"\n{feature_type}:")
for key, value in details.items():
if isinstance(value, list):
print(f" {key}: {', '.join(value)}")
else:
print(f" {key}: {value}")
return features, compute_haar_feature
def integral_image(self):
"""Integral image computation"""
def compute_integral_image(image):
"""Compute integral image"""
height, width = image.shape
integral = np.zeros((height + 1, width + 1))
for i in range(1, height + 1):
for j in range(1, width + 1):
integral[i][j] = (image[i-1][j-1] +
integral[i-1][j] +
integral[i][j-1] -
integral[i-1][j-1])
return integral
def rectangle_sum(integral, x, y, w, h):
"""Fast computation of rectangular region pixel sum using integral image"""
return (integral[y+h][x+w] -
integral[y][x+w] -
integral[y+h][x] +
integral[y][x])
# Demonstrate integral image advantages
print("Integral Image Advantages:")
print("- Direct computation: O(w×h) time complexity")
print("- Integral image: O(1) time complexity")
print("- For many rectangle calculations, efficiency improvement is significant")
return compute_integral_image, rectangle_sum
def adaboost_classifier(self):
"""AdaBoost classifier"""
class WeakClassifier:
def __init__(self, feature_type, threshold, polarity):
self.feature_type = feature_type
self.threshold = threshold
self.polarity = polarity # 1 or -1
def classify(self, feature_value):
"""Weak classifier decision"""
if self.polarity * feature_value < self.polarity * self.threshold:
return 1
else:
return 0
class AdaBoostClassifier:
def __init__(self):
self.weak_classifiers = []
self.alphas = [] # Weak classifier weights
def train(self, features, labels, n_estimators=100):
"""AdaBoost training process (simplified version)"""
n_samples = len(features)
weights = np.ones(n_samples) / n_samples # Initial weights
for t in range(n_estimators):
# Find current best weak classifier
best_classifier, best_error = self._find_best_classifier(
features, labels, weights)
# Compute weak classifier weight
alpha = 0.5 * np.log((1 - best_error) / (best_error + 1e-10))
# Save weak classifier and weight
self.weak_classifiers.append(best_classifier)
self.alphas.append(alpha)
# Update sample weights
predictions = [best_classifier.classify(f) for f in features]
for i in range(n_samples):
if predictions[i] != labels[i]:
weights[i] *= np.exp(alpha)
else:
weights[i] *= np.exp(-alpha)
# Normalize weights
weights /= np.sum(weights)
if best_error < 0.01: # Early stopping
break
def _find_best_classifier(self, features, labels, weights):
"""Find current best weak classifier (simplified implementation)"""
best_error = float('inf')
best_classifier = None
# Simplified: only consider threshold variation
for threshold in np.linspace(min(features), max(features), 50):
for polarity in [-1, 1]:
classifier = WeakClassifier("simple", threshold, polarity)
# Compute weighted error rate
error = 0
for i, feature in enumerate(features):
prediction = classifier.classify(feature)
if prediction != labels[i]:
error += weights[i]
if error < best_error:
best_error = error
best_classifier = classifier
return best_classifier, best_error
def predict(self, features):
"""Strong classifier prediction"""
if not isinstance(features, list):
features = [features]
predictions = []
for feature in features:
weighted_sum = sum(alpha * classifier.classify(feature)
for alpha, classifier in
zip(self.alphas, self.weak_classifiers))
threshold = sum(self.alphas) / 2
predictions.append(1 if weighted_sum >= threshold else 0)
return predictions
print("AdaBoost Classifier Characteristics:")
print("- Combines multiple weak classifiers into a strong classifier")
print("- Adaptively adjusts sample weights")
print("- Focuses on difficult samples")
print("- Has theoretically guaranteed generalization capability")
return AdaBoostClassifier
def cascade_classifier(self):
"""Cascade classifier"""
cascade_info = {
"Design Philosophy": {
"Goal": "Quickly reject obvious negative samples",
"Strategy": "Multi-level cascade, progressive filtering",
"Advantage": "Dramatically improves detection speed"
},
"Structure Characteristics": {
"Layers": "Typically 20-30 layers",
"Each Layer": "One AdaBoost strong classifier",
"Threshold Setting": "Ensures high detection rate",
"Rejection Rate": "Each layer rejects 50%+ negative samples"
},
"Detection Process": {
"Input": "Image patches in sliding window",
"First Layer": "Quick filtering with minimal features",
"Subsequent Layers": "Gradually increase feature complexity",
"Output": "Must pass all layers to be recognized as target"
},
"Performance Advantages": {
"Speed": "Average 10 features computed per window",
"Accuracy": "Real-time performance on face detection",
"Practicality": "Widely used in libraries like OpenCV"
}
}
print("Cascade Classifier Architecture:")
print("=" * 40)
for aspect, details in cascade_info.items():
print(f"\n{aspect}:")
for key, value in details.items():
print(f" {key}: {value}")
return cascade_info
# Usage example
viola_jones = ViolaJonesDetector()
# Haar-like features
features, compute_feature = viola_jones.haar_like_features()
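# A quick sanity check of compute_feature on a synthetic image (made-up values):
# the top half is bright and the bottom half dark, so the horizontal edge
# response should be strongly positive.
demo_img = np.vstack([np.full((4, 8), 200.0), np.full((4, 8), 50.0)])
response = compute_feature(demo_img, "edge_horizontal", position=(0, 0), scale=(8, 8))
print(f"Horizontal edge response on a top-bright image: {response}")  # 6400 - 1600 = 4800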
# Integral image
compute_integral, rectangle_sum = viola_jones.integral_image()
# AdaBoost classifier
AdaBoostClassifier = viola_jones.adaboost_classifier()
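# Toy run of the simplified AdaBoost on synthetic 1-D feature values
# (illustrative data only; real training uses Haar-feature responses)
np.random.seed(0)
toy_features = list(np.random.normal(2.0, 0.5, 20)) + list(np.random.normal(-2.0, 0.5, 20))
toy_labels = [1] * 20 + [0] * 20
ada = AdaBoostClassifier()
ada.train(toy_features, toy_labels, n_estimators=10)
toy_accuracy = np.mean(np.array(ada.predict(toy_features)) == np.array(toy_labels))
print(f"Toy AdaBoost training accuracy: {toy_accuracy:.2f}")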
# Cascade classifier
cascade_info = viola_jones.cascade_classifier()
# Demonstrate integral image computation
print("\nIntegral Image Demonstration:")
print("-" * 20)
sample_image = np.array([[1, 2, 3],
[4, 5, 6],
[7, 8, 9]])
integral = compute_integral(sample_image)
print("Original image:")
print(sample_image)
print("Integral image:")
print(integral[1:, 1:]) # Remove padding
# Compute rectangular region sum
rect_sum = rectangle_sum(integral, 0, 0, 2, 2) # Top-left 2x2 region
print(f"Top-left 2x2 region sum: {rect_sum}")
print(f"Direct computation verification: {np.sum(sample_image[:2, :2])}")
3.2.2 HOG+SVM Method
class HOGSVMDetector:
def __init__(self):
self.method_info = {
"Year": 2005,
"Authors": "Navneet Dalal & Bill Triggs",
"Contribution": "Proposed HOG feature descriptor",
"Application": "Pedestrian detection"
}
def hog_feature_extraction(self):
"""HOG feature extraction"""
def compute_gradients(image):
"""Compute image gradients"""
# Use Sobel operator to compute gradients
grad_x = np.array([[-1, 0, 1],
[-2, 0, 2],
[-1, 0, 1]])
grad_y = np.array([[-1, -2, -1],
[ 0, 0, 0],
[ 1, 2, 1]])
# Convolution to compute gradients
if len(image.shape) == 3:
image = np.mean(image, axis=2) # Convert to grayscale
gx = np.zeros_like(image)
gy = np.zeros_like(image)
for i in range(1, image.shape[0]-1):
for j in range(1, image.shape[1]-1):
gx[i, j] = np.sum(grad_x * image[i-1:i+2, j-1:j+2])
gy[i, j] = np.sum(grad_y * image[i-1:i+2, j-1:j+2])
# Compute gradient magnitude and direction
magnitude = np.sqrt(gx**2 + gy**2)
direction = np.arctan2(gy, gx) * 180 / np.pi
direction[direction < 0] += 180 # Convert to 0-180 degrees
return magnitude, direction
def compute_hog_descriptor(image, cell_size=(8, 8), block_size=(16, 16), nbins=9):
"""Compute HOG descriptor"""
            magnitude, direction = compute_gradients(image)
            height, width = image.shape[:2]  # [:2] so color images do not break unpacking
cell_h, cell_w = cell_size
block_h, block_w = block_size
# Calculate number of cells
n_cells_y = height // cell_h
n_cells_x = width // cell_w
# Compute histogram for each cell
cell_histograms = np.zeros((n_cells_y, n_cells_x, nbins))
for i in range(n_cells_y):
for j in range(n_cells_x):
# Current cell range
y_start = i * cell_h
y_end = (i + 1) * cell_h
x_start = j * cell_w
x_end = (j + 1) * cell_w
# Extract gradient information within cell
cell_mag = magnitude[y_start:y_end, x_start:x_end]
cell_dir = direction[y_start:y_end, x_start:x_end]
# Compute orientation histogram
hist = np.zeros(nbins)
bin_width = 180 / nbins
for y in range(cell_h):
for x in range(cell_w):
mag_val = cell_mag[y, x]
dir_val = cell_dir[y, x]
                            # Bilinear interpolation distributes magnitude to the two adjacent bins
                            bin_idx = dir_val / bin_width
                            bin_low = int(bin_idx) % nbins  # % guards against dir_val == 180
                            bin_high = (bin_low + 1) % nbins
                            weight_high = bin_idx - int(bin_idx)
                            weight_low = 1 - weight_high
hist[bin_low] += weight_low * mag_val
hist[bin_high] += weight_high * mag_val
cell_histograms[i, j] = hist
# Block normalization
blocks_per_row = n_cells_x - block_w // cell_w + 1
blocks_per_col = n_cells_y - block_h // cell_h + 1
hog_features = []
for i in range(blocks_per_col):
for j in range(blocks_per_row):
# Extract cell histograms within block
block_hist = cell_histograms[i:i+2, j:j+2].flatten()
# L2 normalization
norm = np.linalg.norm(block_hist)
if norm > 0:
block_hist = block_hist / norm
hog_features.extend(block_hist)
return np.array(hog_features)
return compute_hog_descriptor
def hog_parameters_analysis(self):
"""HOG parameter analysis"""
parameters = {
"Cell Size": {
"Common Value": "8x8 pixels",
"Role": "Basic unit for local gradient statistics",
"Impact": "Too small is noise-sensitive, too large loses detail"
},
"Block Size": {
"Common Value": "16x16 pixels (2x2 cells)",
"Role": "Basic unit for normalization",
"Impact": "Suppresses lighting variation effects"
},
"Orientation Bins": {
"Common Value": "9 bins (0-180 degrees)",
"Role": "Quantize gradient orientation",
"Impact": "Too few bins lose information, too many increase computation"
},
"Overlap Stride": {
"Common Value": "8 pixels (cell size)",
"Role": "Increase feature density",
"Impact": "Improves detection accuracy but increases computation"
},
"Normalization Method": {
"Common Value": "L2-norm",
"Role": "Suppress lighting variations",
"Impact": "Improves robustness"
}
}
print("HOG Parameter Detailed Analysis:")
print("=" * 40)
for param, details in parameters.items():
print(f"\n{param}:")
for key, value in details.items():
print(f" {key}: {value}")
return parameters
def svm_classifier(self):
"""SVM classifier"""
class SimpleSVM:
def __init__(self, C=1.0, kernel='linear'):
self.C = C
self.kernel = kernel
self.support_vectors = None
self.alphas = None
self.bias = None
def linear_kernel(self, X1, X2):
"""Linear kernel function"""
return np.dot(X1, X2.T)
def rbf_kernel(self, X1, X2, gamma=1.0):
"""RBF kernel function"""
pairwise_sq_dists = np.sum(X1**2, axis=1).reshape(-1, 1) + \
np.sum(X2**2, axis=1) - 2 * np.dot(X1, X2.T)
return np.exp(-gamma * pairwise_sq_dists)
def fit(self, X, y):
"""SVM training (simplified implementation)"""
# Here uses simplified SMO algorithm implementation
print("SVM Training Process (simplified implementation):")
print("1. Initialize Lagrange multipliers")
print("2. Select sample pairs that violate KKT conditions")
print("3. Optimize selected multipliers")
print("4. Update bias term")
print("5. Repeat until convergence")
# In actual implementation, there would be complete SMO algorithm
# Now just demonstrating the concept
self.support_vectors = X[:10] # Assume first 10 are support vectors
self.alphas = np.ones(10)
self.bias = 0.0
def predict(self, X):
"""Prediction"""
if self.kernel == 'linear':
kernel_values = self.linear_kernel(X, self.support_vectors)
else:
kernel_values = self.rbf_kernel(X, self.support_vectors)
decision_values = np.dot(kernel_values, self.alphas) + self.bias
return np.sign(decision_values)
svm_properties = {
"Core Idea": "Find maximum margin separating hyperplane",
"Support Vectors": "Key samples that determine decision boundary",
"Kernel Trick": "Handle non-linearly separable problems",
"Regularization": "C parameter controls balance between margin and misclassification",
"Sparsity": "Only support vectors affect decisions"
}
print("SVM Classifier Properties:")
print("=" * 30)
for prop, desc in svm_properties.items():
print(f" {prop}: {desc}")
return SimpleSVM, svm_properties
def sliding_window_detection(self):
"""Sliding window detection"""
def multi_scale_detection(image, detector, window_sizes, step_size=8):
"""Multi-scale sliding window detection"""
detections = []
for window_size in window_sizes:
w, h = window_size
# Sliding window
for y in range(0, image.shape[0] - h, step_size):
for x in range(0, image.shape[1] - w, step_size):
# Extract window
window = image[y:y+h, x:x+w]
# Feature extraction
features = detector.extract_features(window)
# Classification
confidence = detector.classify(features)
if confidence > detector.threshold:
detections.append({
'bbox': [x, y, w, h],
'confidence': confidence,
'scale': window_size
})
return detections
        def calculate_iou(box1, box2):
            """Calculate IoU of two boxes in [x, y, w, h] format"""
            x1, y1, w1, h1 = box1
            x2, y2, w2, h2 = box2
            # Compute intersection rectangle
            x_left = max(x1, x2)
            y_top = max(y1, y2)
            x_right = min(x1 + w1, x2 + w2)
            y_bottom = min(y1 + h1, y2 + h2)
            if x_right < x_left or y_bottom < y_top:
                return 0.0
            intersection = (x_right - x_left) * (y_bottom - y_top)
            union = w1 * h1 + w2 * h2 - intersection
            return intersection / union
        def non_maximum_suppression(detections, iou_threshold=0.5):
            """Non-maximum suppression"""
            if len(detections) == 0:
                return []
            # Sort by confidence
            detections = sorted(detections, key=lambda x: x['confidence'], reverse=True)
            keep = []
            while len(detections) > 0:
                # Keep the highest-confidence detection
                current = detections.pop(0)
                keep.append(current)
                # Suppress remaining detections that overlap it too much
                detections = [det for det in detections
                              if calculate_iou(current['bbox'], det['bbox']) < iou_threshold]
            return keep
detection_process = {
"Step 1": "Image pyramid construction - multi-scale processing",
"Step 2": "Sliding window scan - exhaustively search possible positions",
"Step 3": "Feature extraction - compute HOG for each window",
"Step 4": "Classification decision - SVM classifier judgment",
"Step 5": "Non-maximum suppression - remove duplicate detections"
}
print("Sliding Window Detection Process:")
print("=" * 30)
for step, desc in detection_process.items():
print(f" {step}: {desc}")
return multi_scale_detection, non_maximum_suppression
# Usage example
hog_svm = HOGSVMDetector()
# HOG feature extraction
compute_hog = hog_svm.hog_feature_extraction()
# HOG parameter analysis
hog_params = hog_svm.hog_parameters_analysis()
# SVM classifier
SimpleSVM, svm_props = hog_svm.svm_classifier()
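# Exercising the kernel functions on small random matrices (shape check only;
# the simplified fit above does not learn real weights)
svm_demo = SimpleSVM(kernel='linear')
X_a, X_b = np.random.rand(3, 5), np.random.rand(4, 5)
print("Linear kernel output shape:", svm_demo.linear_kernel(X_a, X_b).shape)  # (3, 4)
print("RBF kernel output shape:", svm_demo.rbf_kernel(X_a, X_b).shape)        # (3, 4)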
# Sliding window detection
multi_scale_detect, nms = hog_svm.sliding_window_detection()
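# Quick check of NMS on three made-up detections, two of which overlap heavily
toy_detections = [
    {'bbox': [10, 10, 50, 50], 'confidence': 0.9},
    {'bbox': [12, 12, 50, 50], 'confidence': 0.8},  # IoU ~0.85 with the first
    {'bbox': [100, 100, 40, 40], 'confidence': 0.7},
]
kept = nms(toy_detections, iou_threshold=0.5)
print(f"Detections before NMS: {len(toy_detections)}, after NMS: {len(kept)}")  # 3 -> 2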
# Demonstrate HOG feature extraction
print("\nHOG Feature Extraction Demonstration:")
print("-" * 20)
# Create test image
test_image = np.random.rand(64, 128) * 255 # 64x128 random image
hog_features = compute_hog(test_image)
print(f"Input image size: {test_image.shape}")
print(f"HOG feature dimension: {len(hog_features)}")
print(f"First 10 dimensions of feature vector: {hog_features[:10]}")
3.2.3 DPM (Deformable Part Models)
class DPMDetector:
def __init__(self):
self.method_info = {
"Year": 2008,
"Author": "Pedro Felzenszwalb",
"Contribution": "Deformable Part Models",
"Award": "PASCAL VOC 2007-2009 consecutive winner"
}
def dpm_architecture(self):
"""DPM architecture principles"""
architecture = {
"Root Filter": {
"Role": "Detect overall object shape",
"Features": "Low-resolution HOG features",
"Size": "Larger filter kernel"
},
"Part Filters": {
"Role": "Detect object local parts",
"Features": "High-resolution HOG features",
"Quantity": "Multiple parts per root filter"
},
"Spatial Model": {
"Role": "Constrain relative part positions",
"Parameters": "Position mean and deformation cost",
"Flexibility": "Allow parts to deform within a certain range"
},
"Mixture Model": {
"Role": "Handle viewpoint and pose variations",
"Strategy": "Multiple component combinations",
"Training": "Latent SVM training"
}
}
print("DPM Architecture Components:")
print("=" * 30)
for component, details in architecture.items():
print(f"\n{component}:")
for key, value in details.items():
print(f" {key}: {value}")
return architecture
def scoring_function(self):
"""DPM scoring function"""
def dpm_score(root_response, part_responses, part_positions, model_params):
"""
DPM scoring function
score = root_filter_score + Σ(part_filter_score - deformation_cost)
"""
# Root filter score
root_score = root_response
# Part score computation
part_score = 0
for i, (part_resp, part_pos) in enumerate(zip(part_responses, part_positions)):
# Part filter response
filter_score = part_resp
# Deformation cost computation
anchor_pos = model_params['anchors'][i] # Anchor position
deform_params = model_params['deformation'][i] # Deformation parameters
dx = part_pos[0] - anchor_pos[0]
dy = part_pos[1] - anchor_pos[1]
# Quadratic deformation cost: a*dx² + b*dx + c*dy² + d*dy
deform_cost = (deform_params['a'] * dx**2 +
deform_params['b'] * dx +
deform_params['c'] * dy**2 +
deform_params['d'] * dy)
part_score += filter_score - deform_cost
total_score = root_score + part_score
return total_score
# Advantages of scoring function
advantages = {
"Flexibility": "Allows variation in relative part positions",
"Robustness": "Robust to partial occlusion and deformation",
"Interpretability": "Explicit object structure representation",
"Generalization": "Applicable to multiple object categories"
}
print("DPM Scoring Function Advantages:")
print("=" * 25)
for adv, desc in advantages.items():
print(f" {adv}: {desc}")
return dpm_score, advantages
def latent_svm_training(self):
"""Latent SVM training"""
training_process = {
"Initialization": {
"Step": "Initialize with simple star model",
"Goal": "Provide initial parameters for root and part filters",
"Method": "Learn initial templates from positive samples"
},
"Iterative Optimization": {
"E-step": {
"Goal": "Fix model parameters, optimize latent variables",
"Operation": "Find best part positions for each positive sample",
"Method": "Dynamic programming or distance transform"
},
"M-step": {
"Goal": "Fix latent variables, optimize model parameters",
"Operation": "Update filter weights and deformation parameters",
"Method": "Standard SVM training"
}
},
"Hard Negative Mining": {
"Purpose": "Handle difficult negative samples",
"Strategy": "Select high-scoring negative samples to add to training set",
"Effect": "Improve model discriminative ability"
},
"Convergence Determination": {
"Condition": "Objective function change below threshold",
"Metric": "Detection accuracy on validation set",
"Termination": "Reach maximum iterations or convergence"
}
}
def latent_svm_algorithm():
"""Latent SVM algorithm framework"""
print("Latent SVM Training Algorithm:")
print("1. Initialization: Learn initial root template")
print("2. Repeat until convergence:")
print(" a. E-step: Optimize latent variables (part positions)")
print(" b. M-step: Optimize model parameters (filter weights)")
print(" c. Hard negative mining: Add difficult negative samples")
print("3. Output: Trained DPM model")
print("Latent SVM Training Process:")
print("=" * 30)
for phase, details in training_process.items():
print(f"\n{phase}:")
if isinstance(details, dict) and "Step" not in details:
for key, value in details.items():
print(f" {key}:")
if isinstance(value, dict):
for k, v in value.items():
print(f" {k}: {v}")
else:
print(f" {value}")
else:
for key, value in details.items():
print(f" {key}: {value}")
return latent_svm_algorithm
def dynamic_programming_inference(self):
"""Dynamic programming inference"""
def distance_transform_2d(cost_matrix):
"""2D distance transform"""
height, width = cost_matrix.shape
dt_result = np.zeros_like(cost_matrix)
# Simplified distance transform implementation
for i in range(height):
for j in range(width):
min_cost = float('inf')
# Find minimum cost in neighborhood
for di in range(-1, 2):
for dj in range(-1, 2):
ni, nj = i + di, j + dj
if 0 <= ni < height and 0 <= nj < width:
# Deformation cost + original cost
deform_cost = di**2 + dj**2 # Simplified deformation cost
total_cost = cost_matrix[ni, nj] + deform_cost
min_cost = min(min_cost, total_cost)
dt_result[i, j] = min_cost
return dt_result
def part_based_detection(root_scores, part_scores, deformation_params):
"""Part-based detection"""
# 1. Compute root filter response
root_response = root_scores
# 2. Compute optimal position for each part
part_contributions = []
for part_id, part_score in enumerate(part_scores):
# Build cost matrix (negative scores, because we want to maximize scores)
cost_matrix = -part_score
# Distance transform to find optimal position
dt_result = distance_transform_2d(cost_matrix)
# Optimal scores (convert back to scores by negating)
optimal_scores = -dt_result
part_contributions.append(optimal_scores)
# 3. Combine all scores
            # 3. Combine all scores (copy so the caller's root_scores is not modified in place)
            total_scores = np.array(root_response, dtype=float)
            for part_contrib in part_contributions:
                total_scores = total_scores + part_contrib
return total_scores
dp_advantages = {
"Efficiency": "O(n) time complexity, faster than brute force",
"Global Optimum": "Guarantees finding globally optimal part configuration",
"Scalability": "Easy to extend to more parts",
"Parallelization": "Different parts can be computed in parallel"
}
print("Dynamic Programming Inference Advantages:")
print("=" * 25)
for adv, desc in dp_advantages.items():
print(f" {adv}: {desc}")
return distance_transform_2d, part_based_detection
def dpm_vs_traditional_methods(self):
"""DPM vs traditional methods comparison"""
comparison = {
"Viola-Jones": {
"Advantages": ["Fast speed", "Good real-time performance"],
"Disadvantages": ["Only for specific objects", "Rigid templates"],
"vs DPM": "DPM is more flexible but slower"
},
"HOG+SVM": {
"Advantages": ["Good feature representation", "Strong generalization"],
"Disadvantages": ["Rigid detection window", "Sensitive to deformation"],
"vs DPM": "DPM adds deformation capability"
},
"Traditional Sliding Window": {
"Advantages": ["Simple implementation", "Easy to understand"],
"Disadvantages": ["High computational cost", "Complex multi-scale processing"],
"vs DPM": "DPM has better multi-scale processing"
}
}
dpm_innovations = {
"Deformable Modeling": "Allows variation in object part relative positions",
"Mixture Model": "Handles different viewpoints and poses",
"Latent Variable Learning": "Automatically learns optimal part configuration",
"Hierarchical Structure": "Two-level root-part structure",
"Dynamic Programming Inference": "Efficient optimization solving"
}
print("DPM vs Traditional Methods Comparison:")
print("=" * 35)
for method, details in comparison.items():
print(f"\n{method}:")
for key, value in details.items():
if isinstance(value, list):
print(f" {key}: {', '.join(value)}")
else:
print(f" {key}: {value}")
print(f"\nDPM Main Innovations:")
print("-" * 20)
for innovation, desc in dpm_innovations.items():
print(f" {innovation}: {desc}")
return comparison, dpm_innovations
# Usage example
dpm = DPMDetector()
# DPM architecture
architecture = dpm.dpm_architecture()
# Scoring function
scoring_func, advantages = dpm.scoring_function()
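# Exercising the scoring function with made-up responses and parameters
demo_params = {
    'anchors': [(4, 4), (12, 4)],  # expected part positions relative to the root
    'deformation': [{'a': 0.1, 'b': 0.0, 'c': 0.1, 'd': 0.0}] * 2
}
demo_score = scoring_func(
    root_response=2.5,
    part_responses=[1.2, 0.8],
    part_positions=[(5, 4), (11, 6)],  # parts displaced slightly from their anchors
    model_params=demo_params)
print(f"DPM total score: {demo_score:.2f}")  # 2.5 + (1.2 - 0.1) + (0.8 - 0.5) = 3.90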
# Latent SVM training
latent_svm_alg = dpm.latent_svm_training()
latent_svm_alg()
# Dynamic programming inference
distance_transform, part_detection = dpm.dynamic_programming_inference()
# Method comparison
comparison, innovations = dpm.dpm_vs_traditional_methods()
# Demonstrate distance transform
print(f"\nDistance Transform Demonstration:")
print("-" * 15)
# Create test cost matrix
test_costs = np.array([[1, 2, 3],
[2, 1, 2],
[3, 2, 1]])
print("Original cost matrix:")
print(test_costs)
dt_result = distance_transform(test_costs)
print("Distance transform result:")
print(dt_result)
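Putting the pieces together, a single-part toy detection shows how the distance-transformed part response is added to the root response (illustrative values only; the simplified implementation above ignores deformation_params):
root_scores = np.zeros((3, 3))
part_scores = [np.array([[0.0, 1.0, 0.0],
                         [1.0, 3.0, 1.0],
                         [0.0, 1.0, 0.0]])]
total_scores = part_detection(root_scores, part_scores, deformation_params=None)
print("Combined root + part score map:")
print(total_scores)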
3.3 Two-Stage Detection Algorithms
3.3.1 R-CNN
class RCNNDetector:
def __init__(self):
self.method_info = {
"Year": 2014,
"Authors": "Ross Girshick et al.",
"Contribution": "First application of CNN to object detection",
"Breakthrough": "Dramatically improved detection accuracy"
}
def rcnn_pipeline(self):
"""R-CNN pipeline"""
pipeline_steps = {
"Step 1: Region Proposal Generation": {
"Method": "Selective Search",
"Output": "~2000 region proposals",
"Role": "Reduce search space",
"Characteristics": "Based on segmentation and merging"
},
"Step 2: Feature Extraction": {
"Method": "CNN (AlexNet)",
"Preprocessing": "Resize region proposals to 227×227",
"Output": "4096-dimensional feature vector",
"Pretraining": "ImageNet pretrained weights"
},
"Step 3: Classification": {
"Method": "SVM classifier",
"Training": "Train one binary SVM per class",
"Output": "Class probabilities",
"Postprocessing": "Non-maximum suppression"
},
"Step 4: Bounding Box Regression": {
"Method": "Linear regression",
"Goal": "Refine bounding box locations",
"Input": "CNN features",
"Output": "Position offsets"
}
}
def selective_search_simulation():
"""Selective Search algorithm simulation"""
class SelectiveSearch:
def __init__(self):
self.similarity_measures = [
"Color similarity",
"Texture similarity",
"Size similarity",
"Fill similarity"
]
def generate_proposals(self, image):
"""Generate region proposals (simulated implementation)"""
proposals = []
print("Selective Search Process:")
print("1. Initial segmentation: Use Graph-based segmentation")
print("2. Similarity computation: Calculate adjacent region similarity")
print("3. Region merging: Merge most similar regions")
print("4. Repeat merging: Until only one region remains")
print("5. Extract bounding boxes: Record all regions during merging")
# Simulate generating some region proposals
height, width = image.shape[:2] if hasattr(image, 'shape') else (224, 224)
for i in range(2000): # R-CNN typically generates 2000 region proposals
x = np.random.randint(0, width - 50)
y = np.random.randint(0, height - 50)
w = np.random.randint(50, min(150, width - x))
h = np.random.randint(50, min(150, height - y))
proposals.append([x, y, x + w, y + h])
return proposals[:2000] # Return first 2000
def filter_proposals(self, proposals, min_size=20):
"""Filter region proposals"""
filtered = []
for proposal in proposals:
x1, y1, x2, y2 = proposal
if (x2 - x1) >= min_size and (y2 - y1) >= min_size:
filtered.append(proposal)
return filtered
return SelectiveSearch()
print("R-CNN Algorithm Pipeline:")
print("=" * 30)
for step, details in pipeline_steps.items():
print(f"\n{step}:")
for key, value in details.items():
print(f" {key}: {value}")
return pipeline_steps, selective_search_simulation()
def rcnn_training_process(self):
"""R-CNN training process"""
training_phases = {
"Phase 1: CNN Pretraining": {
"Data": "ImageNet classification data",
"Task": "1000-class image classification",
"Network": "AlexNet",
"Goal": "Learn general visual features"
},
"Phase 2: CNN Fine-tuning": {
"Data": "Detection dataset (PASCAL VOC)",
"Positive Samples": "Proposals with IoU ≥ 0.5 with GT",
"Negative Samples": "Proposals with IoU < 0.5 with GT",
"Modification": "Last layer changed to N+1 classes (N object classes + background)"
},
"Phase 3: SVM Training": {
"Positive Samples": "Ground Truth bounding boxes",
"Negative Samples": "Proposals with IoU < 0.3 with GT",
"Features": "fc7 layer output from fine-tuned CNN",
"Classifier": "Train one binary SVM per class"
},
"Phase 4: Bounding Box Regression": {
"Training Data": "Proposals with IoU ≥ 0.6 with GT",
"Input": "CNN features + proposal coordinates",
"Output": "4 regression values (dx, dy, dw, dh)",
"Loss": "Smooth L1 loss"
}
}
def bbox_regression_details():
"""Bounding box regression detailed explanation"""
regression_formulas = {
"Prediction Transform": {
"dx": "(Gx - Px) / Pw",
"dy": "(Gy - Py) / Ph",
"dw": "log(Gw / Pw)",
"dh": "log(Gh / Ph)"
},
"Apply Transform": {
"Gx_pred": "Px + Pw * dx",
"Gy_pred": "Py + Ph * dy",
"Gw_pred": "Pw * exp(dw)",
"Gh_pred": "Ph * exp(dh)"
}
}
print("Bounding Box Regression Formulas:")
print("-" * 20)
for transform_type, formulas in regression_formulas.items():
print(f"{transform_type}:")
for var, formula in formulas.items():
print(f" {var} = {formula}")
return regression_formulas
print("R-CNN Training Process:")
print("=" * 25)
for phase, details in training_phases.items():
print(f"\n{phase}:")
for key, value in details.items():
print(f" {key}: {value}")
bbox_formulas = bbox_regression_details()
return training_phases, bbox_formulas
def rcnn_limitations(self):
"""R-CNN limitations analysis"""
limitations = {
"Complex Training": {
"Problem": "Multi-stage training pipeline",
"Details": [
"CNN pretraining requires ImageNet data",
"CNN fine-tuning requires detection data",
"SVM needs separate training",
"Bounding box regression needs separate training"
],
"Impact": "Long training time, difficult debugging"
},
"Slow Inference": {
"Problem": "Each proposal must pass through CNN",
"Details": [
"2000 region proposals",
"Each requires one forward pass",
"Massive redundant computation"
],
"Data": "~13 seconds/image on GPU",
"Impact": "Cannot be used for real-time applications"
},
"Large Storage Requirements": {
"Problem": "Need to store large number of features",
"Details": [
"4096-dimensional features per proposal",
"2000×4096×4 bytes≈32MB/image"
],
"Impact": "High memory consumption"
},
"Limited Accuracy Improvement": {
"Problem": "Selective Search quality limitation",
"Details": [
"Region proposal quality not high",
"May miss small objects",
"Bounding boxes not precise enough"
],
"Impact": "Detection performance ceiling is limited"
}
}
improvements_needed = {
"Training Simplification": "Need end-to-end training approach",
"Speed Improvement": "Need to reduce redundant computation",
"Accuracy Enhancement": "Need better region proposal generation",
"System Integration": "Need unified framework"
}
print("R-CNN Main Limitations:")
print("=" * 25)
for limitation, details in limitations.items():
print(f"\n{limitation}:")
for key, value in details.items():
if isinstance(value, list):
print(f" {key}:")
for item in value:
print(f" - {item}")
else:
print(f" {key}: {value}")
print(f"\nImprovement Directions:")
print("-" * 10)
for improvement, desc in improvements_needed.items():
print(f" {improvement}: {desc}")
return limitations, improvements_needed
# Usage example
rcnn = RCNNDetector()
# R-CNN pipeline
pipeline, selective_search = rcnn.rcnn_pipeline()
# Training process
training_phases, bbox_formulas = rcnn.rcnn_training_process()
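# Numeric check of the regression formulas above: P is a proposal
# (center x, center y, width, height), G the matched ground truth (made-up values)
Px, Py, Pw, Ph = 50.0, 60.0, 100.0, 80.0
Gx, Gy, Gw, Gh = 55.0, 58.0, 110.0, 90.0
dx, dy = (Gx - Px) / Pw, (Gy - Py) / Ph
dw, dh = np.log(Gw / Pw), np.log(Gh / Ph)
print(f"Regression targets: dx={dx:.3f}, dy={dy:.3f}, dw={dw:.3f}, dh={dh:.3f}")
# Applying the inverse transform recovers the ground-truth box exactly
print(f"Recovered: ({Px + Pw * dx}, {Py + Ph * dy}, {Pw * np.exp(dw):.1f}, {Ph * np.exp(dh):.1f})")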
# Limitations analysis
limitations, improvements = rcnn.rcnn_limitations()
# Demonstrate Selective Search
print(f"\nSelective Search Demonstration:")
print("-" * 20)
# Simulated image
dummy_image = np.random.rand(224, 224, 3)
proposals = selective_search.generate_proposals(dummy_image)
print(f"Generated region proposals: {len(proposals)}")
print(f"First 5 region proposals: {proposals[:5]}")
# Filter region proposals
filtered_proposals = selective_search.filter_proposals(proposals)
print(f"Filtered region proposals: {len(filtered_proposals)}")
This completes the first part of Chapter 3. The remaining sections cover the rest of the two-stage detectors (Fast R-CNN, Faster R-CNN), the advantages of one-stage methods, and the chapter summary.