第3章:目标检测发展历程与经典算法
10/2/25 · About 24 min
第3章:目标检测发展历程与经典算法
学习目标
- 了解目标检测算法的发展历程
- 掌握传统目标检测方法(HOG+SVM、DPM等)
- 理解两阶段检测算法(R-CNN、Fast R-CNN、Faster R-CNN)
- 认识一阶段检测算法的优势
3.1 目标检测发展历程概览
3.1.1 发展时间线
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import matplotlib.dates as mdates
class ObjectDetectionHistory:
    """Chronology of landmark object-detection algorithms.

    Holds the list of milestones and offers a matplotlib timeline plot plus
    a printed summary of the main development phases.
    """

    def __init__(self):
        # Milestones as (year, info) pairs in chronological order. A list is
        # required because several algorithms share a publication year; a
        # dict literal keyed by year silently keeps only the last entry.
        self.timeline_events = [
            (2001, {"算法": "Viola-Jones", "类型": "传统方法", "突破": "实时人脸检测"}),
            (2005, {"算法": "HOG", "类型": "传统方法", "突破": "方向梯度直方图特征"}),
            (2008, {"算法": "DPM", "类型": "传统方法", "突破": "可变形部件模型"}),
            (2012, {"算法": "AlexNet", "类型": "深度学习", "突破": "CNN在图像分类上的突破"}),
            (2014, {"算法": "R-CNN", "类型": "两阶段", "突破": "CNN用于目标检测"}),
            (2015, {"算法": "Fast R-CNN", "类型": "两阶段", "突破": "端到端训练"}),
            (2015, {"算法": "YOLO v1", "类型": "一阶段", "突破": "实时目标检测"}),
            (2016, {"算法": "SSD", "类型": "一阶段", "突破": "多尺度检测"}),
            (2016, {"算法": "Faster R-CNN", "类型": "两阶段", "突破": "RPN网络"}),
            (2017, {"算法": "RetinaNet", "类型": "一阶段", "突破": "Focal Loss"}),
            (2017, {"算法": "Mask R-CNN", "类型": "两阶段", "突破": "实例分割"}),
            (2018, {"算法": "YOLO v3", "类型": "一阶段", "突破": "多尺度预测"}),
            (2020, {"算法": "EfficientDet", "类型": "一阶段", "突破": "高效架构设计"}),
            (2020, {"算法": "DETR", "类型": "Transformer", "突破": "端到端Transformer检测"}),
        ]
        # Year-keyed view kept for backward compatibility. When a year has
        # several milestones only the last one listed is reachable here.
        self.timeline = dict(self.timeline_events)

    def create_timeline_visualization(self):
        """Plot every milestone as a labelled point and return the figure.

        The x coordinate is the publication year, the y coordinate is the
        position of the milestone in ``timeline_events``.
        """
        events = self.timeline_events
        years = [year for year, _ in events]
        algorithms = [info["算法"] for _, info in events]
        types = [info["类型"] for _, info in events]

        # One colour per algorithm family.
        color_map = {
            "传统方法": "red",
            "深度学习": "blue",
            "两阶段": "green",
            "一阶段": "orange",
            "Transformer": "purple",
        }
        colors = [color_map[t] for t in types]

        fig, ax = plt.subplots(figsize=(15, 8))
        ax.scatter(years, range(len(years)), c=colors, s=100, alpha=0.7)

        # Label each point with the algorithm name and year.
        for i, (year, alg) in enumerate(zip(years, algorithms)):
            ax.annotate(
                f"{alg}\n({year})",
                (year, i),
                xytext=(10, 0),
                textcoords='offset points',
                ha='left',
                fontsize=10,
                bbox=dict(boxstyle='round,pad=0.3',
                          facecolor=colors[i],
                          alpha=0.3),
            )

        ax.set_xlabel("年份", fontsize=12)
        ax.set_ylabel("发展阶段", fontsize=12)
        ax.set_title("目标检测算法发展时间线", fontsize=14, fontweight='bold')
        ax.grid(True, alpha=0.3)

        # Proxy artists so the legend shows one entry per family.
        legend_elements = [
            plt.Line2D([0], [0], marker='o', color='w',
                       markerfacecolor=color, markersize=10,
                       label=method)
            for method, color in color_map.items()
        ]
        ax.legend(handles=legend_elements, loc='upper left')
        return fig

    def development_phases(self):
        """Print the development phases and return them as a dict."""
        phases = {
            "传统方法时代 (2001-2012)": {
                "特点": [
                    "基于手工特征设计",
                    "使用传统机器学习算法",
                    "计算效率高但精度有限"
                ],
                "代表算法": ["Viola-Jones", "HOG+SVM", "DPM"],
                "主要贡献": "建立了目标检测的基础框架"
            },
            "深度学习兴起 (2012-2014)": {
                "特点": [
                    "CNN在图像分类上的突破",
                    "为目标检测引入深度学习",
                    "开始端到端学习"
                ],
                "代表算法": ["AlexNet", "R-CNN"],
                "主要贡献": "证明了深度学习在视觉任务上的优势"
            },
            "两阶段方法完善 (2014-2017)": {
                "特点": [
                    "候选区域 + 分类回归",
                    "追求高精度",
                    "速度相对较慢"
                ],
                "代表算法": ["R-CNN", "Fast R-CNN", "Faster R-CNN", "Mask R-CNN"],
                "主要贡献": "确立了两阶段检测的标准范式"
            },
            "一阶段方法发展 (2015-2020)": {
                "特点": [
                    "端到端直接预测",
                    "追求速度与精度平衡",
                    "适合实时应用"
                ],
                "代表算法": ["YOLO", "SSD", "RetinaNet"],
                "主要贡献": "实现了实时高精度目标检测"
            },
            "新架构探索 (2020-)": {
                "特点": [
                    "Transformer架构引入",
                    "自注意力机制",
                    "端到端无锚框检测"
                ],
                "代表算法": ["DETR", "DETR系列"],
                "主要贡献": "探索新的网络架构可能性"
            }
        }

        print("目标检测发展阶段分析:")
        print("=" * 50)
        for phase, details in phases.items():
            print(f"\n{phase}:")
            print(" 特点:")
            for feature in details["特点"]:
                print(f" - {feature}")
            print(f" 代表算法: {', '.join(details['代表算法'])}")
            print(f" 主要贡献: {details['主要贡献']}")
        return phases
# Usage example
history = ObjectDetectionHistory()
# Timeline figure (the plotting calls are commented out here)
# timeline_fig = history.create_timeline_visualization()
# plt.show()
# Development-phase analysis
phases = history.development_phases()

3.1.2 技术演进分析
class TechnologyEvolution:
    """Comparison tables and plots describing how detection techniques evolved."""

    def __init__(self):
        # aspect -> method -> {property: description}; a property value is
        # either a string or a list of strings.
        self.evolution_aspects = {
            "特征表示": {
                "传统方法": {
                    "特征": ["HOG", "SIFT", "SURF", "Haar-like"],
                    "优点": "计算简单,可解释性强",
                    "缺点": "表达能力有限,需要专门设计"
                },
                "深度学习": {
                    "特征": "CNN自动学习特征",
                    "优点": "表达能力强,端到端学习",
                    "缺点": "需要大量数据,计算复杂"
                }
            },
            "候选区域生成": {
                "滑动窗口": {
                    "方法": "穷举所有位置和尺度",
                    "优点": "简单直接",
                    "缺点": "计算量大,冗余多"
                },
                "选择性搜索": {
                    "方法": "基于分割和合并策略",
                    "优点": "大幅减少候选区域",
                    "缺点": "依赖分割质量"
                },
                "学习式生成": {
                    "方法": "RPN等神经网络生成",
                    "优点": "端到端学习,质量高",
                    "缺点": "增加网络复杂度"
                },
                "无候选区域": {
                    "方法": "直接回归检测结果",
                    "优点": "速度快,架构简单",
                    "缺点": "定位精度相对较低"
                }
            },
            "多尺度处理": {
                "图像金字塔": {
                    "方法": "构建不同尺度的图像",
                    "优点": "处理全面",
                    "缺点": "计算量成倍增加"
                },
                "特征金字塔": {
                    "方法": "利用CNN不同层级特征",
                    "优点": "计算高效",
                    "缺点": "特征语义不一致"
                },
                "特征金字塔网络": {
                    "方法": "自顶向下和横向连接",
                    "优点": "强语义和高分辨率并存",
                    "缺点": "网络结构复杂"
                }
            }
        }

    def analyze_evolution(self):
        """Print every aspect/method/property stored in ``evolution_aspects``."""
        print("目标检测技术演进分析:")
        print("=" * 60)
        for aspect, methods in self.evolution_aspects.items():
            print(f"\n【{aspect}】")
            for method, details in methods.items():
                print(f"\n {method}:")
                for key, value in details.items():
                    # List-valued properties are shown comma separated.
                    shown = ', '.join(value) if isinstance(value, list) else value
                    print(f" {key}: {shown}")

    def performance_trends(self):
        """Plot approximate mAP and FPS per algorithm on twin y-axes.

        Returns:
            The matplotlib figure. mAP uses the left (linear) axis and FPS
            the right (logarithmic) axis.
        """
        # Approximate PASCAL VOC figures: (algorithm, year, mAP %, FPS).
        records = [
            ("DPM", 2008, 33.7, 0.07),
            ("R-CNN", 2014, 53.3, 0.05),
            ("Fast R-CNN", 2015, 66.9, 0.5),
            ("Faster R-CNN", 2016, 73.2, 7),
            ("YOLO v1", 2016, 63.4, 45),
            ("SSD", 2016, 74.3, 19),
            ("YOLO v2", 2017, 78.6, 40),
            ("RetinaNet", 2017, 80.0, 5),
            ("YOLO v3", 2018, 82.0, 20),
            ("EfficientDet", 2020, 84.3, 15),
            ("YOLO v5", 2020, 85.0, 60),
        ]
        algorithms = [name for name, _, _, _ in records]
        map_values = [m for _, _, m, _ in records]
        fps_values = [f for _, _, _, f in records]

        fig, ax1 = plt.subplots(figsize=(14, 8))

        # mAP trend on the primary axis.
        color1 = 'tab:red'
        ax1.set_xlabel('算法发展顺序')
        ax1.set_ylabel('mAP (%)', color=color1)
        line1 = ax1.plot(algorithms, map_values, 'o-', color=color1,
                         linewidth=2, markersize=8, label='mAP')
        ax1.tick_params(axis='y', labelcolor=color1)
        ax1.set_ylim(30, 90)

        # FPS trend on the twin axis.
        ax2 = ax1.twinx()
        color2 = 'tab:blue'
        ax2.set_ylabel('FPS', color=color2)
        line2 = ax2.plot(algorithms, fps_values, 's-', color=color2,
                         linewidth=2, markersize=8, label='FPS')
        ax2.tick_params(axis='y', labelcolor=color2)
        # Switch to log scale first, then set strictly positive limits:
        # zero is not representable on a logarithmic axis.
        ax2.set_yscale('log')
        ax2.set_ylim(0.01, 100)

        # Title, grid and tick rotation go on ax1 explicitly: after twinx()
        # the pyplot "current axes" is ax2, whose x-axis is hidden, so
        # plt.xticks(...) would not rotate the visible labels.
        ax1.set_title('目标检测算法性能发展趋势', fontsize=14, fontweight='bold')
        ax1.grid(True, alpha=0.3)
        plt.setp(ax1.get_xticklabels(), rotation=45, ha='right')

        # Single legend covering both lines.
        lines = line1 + line2
        labels = ['mAP (%)', 'FPS']
        ax1.legend(lines, labels, loc='upper left')
        fig.tight_layout()
        return fig

    def complexity_analysis(self):
        """Print and return a rough complexity comparison of the method families."""
        complexity_data = {
            "传统方法": {
                "时间复杂度": "O(n²) - 滑动窗口",
                "空间复杂度": "O(1) - 特征提取",
                "参数量": "< 1M",
                "特点": "计算简单但精度有限"
            },
            "两阶段方法": {
                "时间复杂度": "O(n) - 候选区域数量",
                "空间复杂度": "O(n) - CNN特征存储",
                "参数量": "100M+",
                "特点": "精度高但速度慢"
            },
            "一阶段方法": {
                "时间复杂度": "O(1) - 单次前向传播",
                "空间复杂度": "O(1) - 固定网络结构",
                "参数量": "20-100M",
                "特点": "速度快,适合实时应用"
            }
        }
        print("计算复杂度对比分析:")
        print("=" * 40)
        for method, analysis in complexity_data.items():
            print(f"\n{method}:")
            for key, value in analysis.items():
                print(f" {key}: {value}")
        return complexity_data
# Usage example
tech_evolution = TechnologyEvolution()
# Technology-evolution analysis
tech_evolution.analyze_evolution()
# Performance trend (the plotting calls are commented out here)
# performance_fig = tech_evolution.performance_trends()
# plt.show()
# Complexity analysis
complexity = tech_evolution.complexity_analysis()

3.2 传统目标检测方法
3.2.1 Viola-Jones人脸检测
class ViolaJonesDetector:
    """Teaching model of the Viola-Jones detector.

    Each method prints a short explanation and returns the helper functions
    or classes that illustrate one building block of the detector.
    """

    def __init__(self):
        self.method_info = {
            "年份": 2001,
            "作者": "Paul Viola & Michael Jones",
            "贡献": "首个实时目标检测算法",
            "应用": "人脸检测"
        }

    def haar_like_features(self):
        """Describe Haar-like features.

        Returns:
            ``(features, compute_haar_feature)``: the description dict and a
            function evaluating a feature directly on pixel values.
        """
        features = {
            "边缘特征": {
                "模式": "相邻矩形区域像素和的差",
                "类型": ["水平边缘", "垂直边缘"],
                "计算": "白色区域像素和 - 黑色区域像素和"
            },
            "线条特征": {
                "模式": "中间与两侧区域的对比",
                "类型": ["水平线条", "垂直线条"],
                "用途": "检测细长结构"
            },
            "中心特征": {
                "模式": "中心区域与周围区域的对比",
                "类型": ["四矩形特征"],
                "用途": "检测中心突出的结构"
            }
        }

        def compute_haar_feature(image, feature_type, position, scale):
            """Evaluate one Haar-like feature.

            Args:
                image: 2-D array indexed as ``image[row, col]``.
                feature_type: ``"edge_horizontal"``, ``"edge_vertical"`` or
                    ``"line_horizontal"``.
                position: ``(x, y)`` of the top-left corner.
                scale: ``(w, h)`` of the feature rectangle.

            Returns:
                The feature value, or 0 for an unrecognised ``feature_type``.
            """
            x, y = position
            w, h = scale
            if feature_type == "edge_horizontal":
                # Top half minus bottom half.
                top_sum = np.sum(image[y:y + h // 2, x:x + w])
                bottom_sum = np.sum(image[y + h // 2:y + h, x:x + w])
                return top_sum - bottom_sum
            if feature_type == "edge_vertical":
                # Left half minus right half.
                left_sum = np.sum(image[y:y + h, x:x + w // 2])
                right_sum = np.sum(image[y:y + h, x + w // 2:x + w])
                return left_sum - right_sum
            if feature_type == "line_horizontal":
                # Middle third minus the top and bottom thirds.
                top_sum = np.sum(image[y:y + h // 3, x:x + w])
                middle_sum = np.sum(image[y + h // 3:y + 2 * h // 3, x:x + w])
                bottom_sum = np.sum(image[y + 2 * h // 3:y + h, x:x + w])
                return middle_sum - (top_sum + bottom_sum)
            return 0

        print("Haar-like特征类型:")
        print("=" * 30)
        for feature_type, details in features.items():
            print(f"\n{feature_type}:")
            for key, value in details.items():
                shown = ', '.join(value) if isinstance(value, list) else value
                print(f" {key}: {shown}")
        return features, compute_haar_feature

    def integral_image(self):
        """Return ``(compute_integral_image, rectangle_sum)`` helpers."""

        def compute_integral_image(image):
            """Return the zero-padded integral image of a 2-D array.

            The result has shape ``(H + 1, W + 1)`` and float dtype; entry
            ``[i, j]`` is the sum of ``image[:i, :j]``.
            """
            height, width = image.shape
            integral = np.zeros((height + 1, width + 1))
            # Two cumulative sums give the same table as the classic
            # recurrence I[i,j] = p + I[i-1,j] + I[i,j-1] - I[i-1,j-1].
            integral[1:, 1:] = np.cumsum(
                np.cumsum(image, axis=0, dtype=np.float64), axis=1)
            return integral

        def rectangle_sum(integral, x, y, w, h):
            """Sum of the ``w`` x ``h`` rectangle at ``(x, y)`` using four lookups."""
            return (integral[y + h][x + w] -
                    integral[y][x + w] -
                    integral[y + h][x] +
                    integral[y][x])

        print("积分图像优势:")
        print("- 直接计算: O(w×h) 时间复杂度")
        print("- 积分图像: O(1) 时间复杂度")
        print("- 对于大量矩形计算,效率提升显著")
        return compute_integral_image, rectangle_sum

    def adaboost_classifier(self):
        """Print AdaBoost properties and return the ``AdaBoostClassifier`` class."""

        class WeakClassifier:
            """Decision stump on one scalar feature value."""

            def __init__(self, feature_type, threshold, polarity):
                self.feature_type = feature_type
                self.threshold = threshold
                self.polarity = polarity  # 1 or -1

            def classify(self, feature_value):
                """Return 1 if ``polarity * value < polarity * threshold`` else 0."""
                if self.polarity * feature_value < self.polarity * self.threshold:
                    return 1
                return 0

        class AdaBoostClassifier:
            """Simplified AdaBoost over 1-D feature values with labels in {0, 1}."""

            def __init__(self):
                self.weak_classifiers = []
                self.alphas = []  # vote weight of each weak classifier

            def train(self, features, labels, n_estimators=100):
                """Fit up to ``n_estimators`` stumps (simplified AdaBoost)."""
                n_samples = len(features)
                labels_arr = np.asarray(labels)
                weights = np.ones(n_samples) / n_samples  # uniform start
                for _ in range(n_estimators):
                    best_classifier, best_error = self._find_best_classifier(
                        features, labels, weights)
                    # Small epsilon avoids division by zero for a perfect stump.
                    alpha = 0.5 * np.log((1 - best_error) / (best_error + 1e-10))
                    self.weak_classifiers.append(best_classifier)
                    self.alphas.append(alpha)

                    # Raise the weight of misclassified samples, lower the rest.
                    predictions = np.array(
                        [best_classifier.classify(f) for f in features])
                    wrong = predictions != labels_arr
                    weights *= np.exp(np.where(wrong, alpha, -alpha))
                    weights /= np.sum(weights)

                    if best_error < 0.01:  # early stop
                        break

            def _find_best_classifier(self, features, labels, weights):
                """Search 50 thresholds x 2 polarities for the lowest weighted error."""
                best_error = float('inf')
                best_classifier = None
                labels_arr = np.asarray(labels)
                for threshold in np.linspace(min(features), max(features), 50):
                    for polarity in [-1, 1]:
                        classifier = WeakClassifier("simple", threshold, polarity)
                        predictions = np.array(
                            [classifier.classify(f) for f in features])
                        error = np.sum(weights[predictions != labels_arr])
                        if error < best_error:
                            best_error = error
                            best_classifier = classifier
                return best_classifier, best_error

            def predict(self, features):
                """Classify one feature value or a sequence of them.

                Accepts a scalar, a list or a NumPy array; returns a list of
                0/1 ints. (An ndarray used to be wrapped as a single sample,
                which raised on the ambiguous array truth value.)
                """
                samples = np.atleast_1d(features)
                # A sample is positive when the weighted vote reaches half
                # of the total classifier weight.
                threshold = sum(self.alphas) / 2
                predictions = []
                for feature in samples:
                    weighted_sum = sum(
                        alpha * classifier.classify(feature)
                        for alpha, classifier in
                        zip(self.alphas, self.weak_classifiers))
                    predictions.append(1 if weighted_sum >= threshold else 0)
                return predictions

        print("AdaBoost分类器特点:")
        print("- 将多个弱分类器组合成强分类器")
        print("- 自适应调整样本权重")
        print("- 关注困难样本")
        print("- 具有理论保证的泛化能力")
        return AdaBoostClassifier

    def cascade_classifier(self):
        """Print and return a description of the attentional cascade."""
        cascade_info = {
            "设计思想": {
                "目标": "快速拒绝明显的负样本",
                "策略": "多层级联,逐步筛选",
                "优势": "大幅提升检测速度"
            },
            "结构特点": {
                "层数": "通常20-30层",
                "每层": "一个AdaBoost强分类器",
                "阈值设置": "保证较高的检测率",
                "拒绝率": "每层拒绝50%以上负样本"
            },
            "检测过程": {
                "输入": "滑动窗口中的图像块",
                "第一层": "用最少特征快速筛选",
                "后续层": "逐渐增加特征复杂度",
                "输出": "通过所有层才认为是目标"
            },
            "性能优势": {
                "速度": "平均每个窗口计算10个特征",
                "精度": "在人脸检测上达到实时性能",
                "实用性": "OpenCV等库广泛使用"
            }
        }
        print("级联分类器架构:")
        print("=" * 40)
        for aspect, details in cascade_info.items():
            print(f"\n{aspect}:")
            for key, value in details.items():
                print(f" {key}: {value}")
        return cascade_info
# Usage example
viola_jones = ViolaJonesDetector()
# Haar-like features
features, compute_feature = viola_jones.haar_like_features()
# Integral image helpers
compute_integral, rectangle_sum = viola_jones.integral_image()
# AdaBoost classifier class
AdaBoostClassifier = viola_jones.adaboost_classifier()
# Cascade classifier description
cascade_info = viola_jones.cascade_classifier()
# Integral-image demonstration
print("\n积分图像演示:")
print("-" * 20)
sample_image = np.array([[1, 2, 3],
[4, 5, 6],
[7, 8, 9]])
integral = compute_integral(sample_image)
print("原始图像:")
print(sample_image)
print("积分图像:")
print(integral[1:, 1:]) # drop the zero padding row/column
# Sum of a rectangular region
rect_sum = rectangle_sum(integral, 0, 0, 2, 2) # top-left 2x2 region
print(f"左上角2x2区域和: {rect_sum}")
print(f"直接计算验证: {np.sum(sample_image[:2, :2])}")

3.2.2 HOG+SVM方法
class HOGSVMDetector:
    """Teaching model of the HOG + SVM sliding-window detector."""

    def __init__(self):
        self.method_info = {
            "年份": 2005,
            "作者": "Navneet Dalal & Bill Triggs",
            "贡献": "提出HOG特征描述符",
            "应用": "行人检测"
        }

    def hog_feature_extraction(self):
        """Return ``compute_hog_descriptor(image, cell_size, block_size, nbins)``."""

        def compute_gradients(image):
            """Return Sobel gradient magnitude and unsigned direction in degrees.

            Colour input (3-D) is averaged over the last axis. Border pixels
            are left at zero gradient.
            """
            sobel_x = np.array([[-1, 0, 1],
                                [-2, 0, 2],
                                [-1, 0, 1]])
            sobel_y = np.array([[-1, -2, -1],
                                [0, 0, 0],
                                [1, 2, 1]])
            # Work in float so integer images (e.g. uint8) neither overflow
            # nor truncate the gradient values.
            image = np.asarray(image, dtype=np.float64)
            if image.ndim == 3:
                image = np.mean(image, axis=2)  # to grayscale

            rows, cols = image.shape
            gx = np.zeros_like(image)
            gy = np.zeros_like(image)
            if rows >= 3 and cols >= 3:
                # Accumulate the 3x3 correlation from nine shifted views.
                for di in range(3):
                    for dj in range(3):
                        patch = image[di:di + rows - 2, dj:dj + cols - 2]
                        gx[1:-1, 1:-1] += sobel_x[di, dj] * patch
                        gy[1:-1, 1:-1] += sobel_y[di, dj] * patch

            magnitude = np.sqrt(gx ** 2 + gy ** 2)
            # Fold the signed angle into the unsigned range.
            direction = np.degrees(np.arctan2(gy, gx)) % 180.0
            return magnitude, direction

        def compute_hog_descriptor(image, cell_size=(8, 8), block_size=(16, 16), nbins=9):
            """Compute a HOG descriptor.

            Args:
                image: 2-D grayscale or 3-D colour array.
                cell_size: ``(height, width)`` of a cell in pixels.
                block_size: ``(height, width)`` of a block in pixels.
                nbins: number of orientation bins over 0-180 degrees.

            Returns:
                1-D array of concatenated L2-normalised block histograms;
                blocks slide with a stride of one cell.
            """
            magnitude, direction = compute_gradients(image)
            # Use the gradient shape so colour input works as well.
            height, width = magnitude.shape
            cell_h, cell_w = cell_size
            block_h, block_w = block_size
            cells_per_block_y = block_h // cell_h
            cells_per_block_x = block_w // cell_w

            n_cells_y = height // cell_h
            n_cells_x = width // cell_w
            bin_width = 180 / nbins

            cell_histograms = np.zeros((n_cells_y, n_cells_x, nbins))
            for i in range(n_cells_y):
                for j in range(n_cells_x):
                    rows = slice(i * cell_h, (i + 1) * cell_h)
                    cols = slice(j * cell_w, (j + 1) * cell_w)
                    cell_mag = magnitude[rows, cols].ravel()
                    cell_dir = direction[rows, cols].ravel()

                    # Split each vote linearly between the two nearest bins.
                    bin_pos = cell_dir / bin_width
                    bin_floor = np.floor(bin_pos).astype(int)
                    weight_high = bin_pos - bin_floor
                    # Wrap indices: a direction of exactly 180 degrees maps
                    # to bin 0 instead of indexing past the histogram.
                    bin_low = bin_floor % nbins
                    bin_high = (bin_floor + 1) % nbins

                    hist = np.zeros(nbins)
                    np.add.at(hist, bin_low, (1 - weight_high) * cell_mag)
                    np.add.at(hist, bin_high, weight_high * cell_mag)
                    cell_histograms[i, j] = hist

            # Block normalisation with a one-cell stride.
            blocks_per_row = n_cells_x - cells_per_block_x + 1
            blocks_per_col = n_cells_y - cells_per_block_y + 1
            hog_features = []
            for i in range(blocks_per_col):
                for j in range(blocks_per_row):
                    block_hist = cell_histograms[
                        i:i + cells_per_block_y,
                        j:j + cells_per_block_x].flatten()
                    norm = np.linalg.norm(block_hist)
                    if norm > 0:
                        block_hist = block_hist / norm
                    hog_features.extend(block_hist)
            return np.array(hog_features)

        return compute_hog_descriptor

    def hog_parameters_analysis(self):
        """Print and return a description of the HOG hyper-parameters."""
        parameters = {
            "Cell大小": {
                "常用值": "8x8像素",
                "作用": "局部梯度统计的基本单元",
                "影响": "太小噪声敏感,太大丢失细节"
            },
            "Block大小": {
                "常用值": "16x16像素 (2x2 cells)",
                "作用": "归一化的基本单元",
                "影响": "抑制光照变化影响"
            },
            "方向bins": {
                "常用值": "9个bins (0-180度)",
                "作用": "量化梯度方向",
                "影响": "bins太少丢失信息,太多计算复杂"
            },
            "重叠步长": {
                "常用值": "8像素 (cell大小)",
                "作用": "增加特征密度",
                "影响": "提升检测精度但增加计算量"
            },
            "归一化方法": {
                "常用值": "L2-norm",
                "作用": "抑制光照变化",
                "影响": "提升鲁棒性"
            }
        }
        print("HOG参数详细分析:")
        print("=" * 40)
        for param, details in parameters.items():
            print(f"\n{param}:")
            for key, value in details.items():
                print(f" {key}: {value}")
        return parameters

    def svm_classifier(self):
        """Return ``(SimpleSVM, svm_properties)``; ``SimpleSVM`` is a conceptual stub."""

        class SimpleSVM:
            """Placeholder SVM: ``fit`` does not optimise anything."""

            def __init__(self, C=1.0, kernel='linear'):
                self.C = C
                self.kernel = kernel
                self.support_vectors = None
                self.alphas = None
                self.bias = None

            def linear_kernel(self, X1, X2):
                """Linear kernel: pairwise dot products."""
                return np.dot(X1, X2.T)

            def rbf_kernel(self, X1, X2, gamma=1.0):
                """RBF kernel ``exp(-gamma * ||x1 - x2||^2)``."""
                pairwise_sq_dists = (np.sum(X1 ** 2, axis=1).reshape(-1, 1) +
                                     np.sum(X2 ** 2, axis=1) -
                                     2 * np.dot(X1, X2.T))
                return np.exp(-gamma * pairwise_sq_dists)

            def fit(self, X, y):
                """Print the SMO outline and store placeholder parameters.

                No optimisation is performed: up to the first 10 rows of
                ``X`` are taken as support vectors with unit multipliers.
                """
                print("SVM训练过程(简化实现):")
                print("1. 初始化拉格朗日乘子")
                print("2. 选择违反KKT条件的样本对")
                print("3. 优化选定的乘子")
                print("4. 更新偏置项")
                print("5. 重复直到收敛")
                self.support_vectors = X[:10]
                # Match the multiplier count to the rows actually kept so
                # predict() also works with fewer than 10 samples.
                self.alphas = np.ones(len(self.support_vectors))
                self.bias = 0.0

            def predict(self, X):
                """Return the sign of the kernel decision function for ``X``."""
                if self.kernel == 'linear':
                    kernel_values = self.linear_kernel(X, self.support_vectors)
                else:
                    kernel_values = self.rbf_kernel(X, self.support_vectors)
                decision_values = np.dot(kernel_values, self.alphas) + self.bias
                return np.sign(decision_values)

        svm_properties = {
            "核心思想": "寻找最大间隔分离超平面",
            "支持向量": "决定分类边界的关键样本",
            "核技巧": "处理非线性可分问题",
            "正则化": "C参数控制间隔与误分类的平衡",
            "稀疏性": "只有支持向量影响决策"
        }
        print("SVM分类器特性:")
        print("=" * 30)
        for prop, desc in svm_properties.items():
            print(f" {prop}: {desc}")
        return SimpleSVM, svm_properties

    def calculate_iou(self, box1, box2):
        """Intersection-over-union of two ``[x, y, w, h]`` boxes."""
        x1, y1, w1, h1 = box1
        x2, y2, w2, h2 = box2
        x_left = max(x1, x2)
        y_top = max(y1, y2)
        x_right = min(x1 + w1, x2 + w2)
        y_bottom = min(y1 + h1, y2 + h2)
        if x_right < x_left or y_bottom < y_top:
            return 0.0
        intersection = (x_right - x_left) * (y_bottom - y_top)
        union = w1 * h1 + w2 * h2 - intersection
        if union <= 0:  # both boxes are degenerate
            return 0.0
        return intersection / union

    def sliding_window_detection(self):
        """Return ``(multi_scale_detection, non_maximum_suppression)``."""

        def multi_scale_detection(image, detector, window_sizes, step_size=8):
            """Scan ``image`` with windows of every size in ``window_sizes``.

            ``detector`` must provide ``extract_features(window)``,
            ``classify(features)`` and a ``threshold`` attribute. Returns a
            list of dicts with ``bbox`` ([x, y, w, h]), ``confidence`` and
            ``scale``.
            """
            detections = []
            for window_size in window_sizes:
                w, h = window_size
                # "+ 1" keeps the last position where the window still fits.
                for y in range(0, image.shape[0] - h + 1, step_size):
                    for x in range(0, image.shape[1] - w + 1, step_size):
                        window = image[y:y + h, x:x + w]
                        features = detector.extract_features(window)
                        confidence = detector.classify(features)
                        if confidence > detector.threshold:
                            detections.append({
                                'bbox': [x, y, w, h],
                                'confidence': confidence,
                                'scale': window_size
                            })
            return detections

        def non_maximum_suppression(detections, iou_threshold=0.5):
            """Greedy NMS: keep the best box, drop overlaps, repeat."""
            pending = sorted(detections, key=lambda d: d['confidence'],
                             reverse=True)
            keep = []
            while pending:
                best, rest = pending[0], pending[1:]
                keep.append(best)
                pending = [
                    det for det in rest
                    if self.calculate_iou(best['bbox'], det['bbox']) < iou_threshold
                ]
            return keep

        detection_process = {
            "步骤1": "图像金字塔构建 - 多尺度处理",
            "步骤2": "滑动窗口扫描 - 穷举可能位置",
            "步骤3": "特征提取 - 每个窗口计算HOG",
            "步骤4": "分类判断 - SVM分类器判断",
            "步骤5": "非极大值抑制 - 去除重复检测"
        }
        print("滑动窗口检测流程:")
        print("=" * 30)
        for step, desc in detection_process.items():
            print(f" {step}: {desc}")
        return multi_scale_detection, non_maximum_suppression
# Usage example
hog_svm = HOGSVMDetector()
# HOG feature extraction function
compute_hog = hog_svm.hog_feature_extraction()
# HOG parameter analysis
hog_params = hog_svm.hog_parameters_analysis()
# SVM classifier class and its properties
SimpleSVM, svm_props = hog_svm.svm_classifier()
# Sliding-window detection helpers
multi_scale_detect, nms = hog_svm.sliding_window_detection()
# HOG feature extraction demonstration
print("\nHOG特征提取演示:")
print("-" * 20)
# Build a test image
test_image = np.random.rand(64, 128) * 255 # random image with shape (64, 128)
hog_features = compute_hog(test_image)
print(f"输入图像尺寸: {test_image.shape}")
print(f"HOG特征维度: {len(hog_features)}")
print(f"特征向量前10维: {hog_features[:10]}")

3.2.3 DPM (Deformable Part Models)
class DPMDetector:
    """Teaching model of Deformable Part Models (DPM)."""

    def __init__(self):
        self.method_info = {
            "年份": 2008,
            "作者": "Pedro Felzenszwalb",
            "贡献": "可变形部件模型",
            "获奖": "PASCAL VOC 2007-2009连续获胜"
        }

    def dpm_architecture(self):
        """Print and return a description of the DPM components."""
        architecture = {
            "根滤波器": {
                "作用": "检测整体物体形状",
                "特征": "低分辨率HOG特征",
                "尺寸": "较大的滤波器核"
            },
            "部件滤波器": {
                "作用": "检测物体局部部件",
                "特征": "高分辨率HOG特征",
                "数量": "每个根滤波器对应多个部件"
            },
            "空间模型": {
                "作用": "约束部件相对位置",
                "参数": "位置均值和变形代价",
                "灵活性": "允许部件在一定范围内变形"
            },
            "混合模型": {
                "作用": "处理视角和姿态变化",
                "策略": "多个组件组合",
                "训练": "潜在SVM训练"
            }
        }
        print("DPM架构组件:")
        print("=" * 30)
        for component, details in architecture.items():
            print(f"\n{component}:")
            for key, value in details.items():
                print(f" {key}: {value}")
        return architecture

    def scoring_function(self):
        """Return ``(dpm_score, advantages)``."""

        def dpm_score(root_response, part_responses, part_positions, model_params):
            """Score one part configuration.

            score = root_response + sum(part_response - deformation_cost)

            Args:
                root_response: root filter response.
                part_responses: one filter response per part.
                part_positions: one ``(x, y)`` placement per part.
                model_params: dict with ``'anchors'`` (per-part ``(x, y)``)
                    and ``'deformation'`` (per-part dict with coefficients
                    ``a``, ``b``, ``c``, ``d``).
            """
            part_score = 0
            for i, (part_resp, part_pos) in enumerate(
                    zip(part_responses, part_positions)):
                anchor_pos = model_params['anchors'][i]
                deform_params = model_params['deformation'][i]
                dx = part_pos[0] - anchor_pos[0]
                dy = part_pos[1] - anchor_pos[1]
                # Quadratic deformation cost: a*dx^2 + b*dx + c*dy^2 + d*dy
                deform_cost = (deform_params['a'] * dx ** 2 +
                               deform_params['b'] * dx +
                               deform_params['c'] * dy ** 2 +
                               deform_params['d'] * dy)
                part_score += part_resp - deform_cost
            return root_response + part_score

        advantages = {
            "灵活性": "允许部件相对位置的变化",
            "鲁棒性": "对部分遮挡和变形具有鲁棒性",
            "可解释性": "明确的物体结构表示",
            "泛化性": "适用于多种物体类别"
        }
        print("DPM评分函数优势:")
        print("=" * 25)
        for adv, desc in advantages.items():
            print(f" {adv}: {desc}")
        return dpm_score, advantages

    def latent_svm_training(self):
        """Print the latent-SVM training phases; return a function printing the outline."""
        training_process = {
            "初始化": {
                "步骤": "使用简单的星形模型初始化",
                "目标": "为根滤波器和部件滤波器提供初始参数",
                "方法": "从正样本中学习初始模板"
            },
            "迭代优化": {
                "E步": {
                    "目标": "固定模型参数,优化潜在变量",
                    "操作": "为每个正样本找到最佳部件位置",
                    "方法": "动态规划或距离变换"
                },
                "M步": {
                    "目标": "固定潜在变量,优化模型参数",
                    "操作": "更新滤波器权重和变形参数",
                    "方法": "标准SVM训练"
                }
            },
            "难例挖掘": {
                "目的": "处理困难负样本",
                "策略": "选择高得分的负样本加入训练集",
                "效果": "提升模型判别能力"
            },
            "收敛判定": {
                "条件": "目标函数变化小于阈值",
                "指标": "验证集上的检测精度",
                "终止": "达到最大迭代次数或收敛"
            }
        }

        def latent_svm_algorithm():
            """Print the outline of the latent-SVM training loop."""
            print("潜在SVM训练算法:")
            print("1. 初始化:学习初始根模板")
            print("2. 重复直到收敛:")
            print(" a. E步:优化潜在变量(部件位置)")
            print(" b. M步:优化模型参数(滤波器权重)")
            print(" c. 难例挖掘:添加困难负样本")
            print("3. 输出:训练好的DPM模型")

        print("潜在SVM训练过程:")
        print("=" * 30)
        for phase, details in training_process.items():
            print(f"\n{phase}:")
            if "步骤" in details:
                # Flat phase that carries a "步骤" entry: one line per item.
                for key, value in details.items():
                    print(f" {key}: {value}")
                continue
            # Other phases: key on its own line, then the value(s) below it.
            for key, value in details.items():
                print(f" {key}:")
                if isinstance(value, dict):
                    for k, v in value.items():
                        print(f" {k}: {v}")
                else:
                    print(f" {value}")
        return latent_svm_algorithm

    def dynamic_programming_inference(self):
        """Return ``(distance_transform_2d, part_based_detection)``."""

        def distance_transform_2d(cost_matrix):
            """Simplified distance transform over the 3x3 neighbourhood.

            Each output cell is the minimum over in-bounds neighbours of
            ``cost + di**2 + dj**2``. The result has the dtype of the input.
            """
            height, width = cost_matrix.shape
            dt_result = np.zeros_like(cost_matrix)
            for i in range(height):
                for j in range(width):
                    min_cost = float('inf')
                    for di in range(-1, 2):
                        for dj in range(-1, 2):
                            ni, nj = i + di, j + dj
                            if 0 <= ni < height and 0 <= nj < width:
                                # Simplified deformation cost plus raw cost.
                                total_cost = cost_matrix[ni, nj] + di ** 2 + dj ** 2
                                min_cost = min(min_cost, total_cost)
                    dt_result[i, j] = min_cost
            return dt_result

        def part_based_detection(root_scores, part_scores, deformation_params):
            """Combine the root score map with deformation-aware part maps.

            ``deformation_params`` is accepted for interface compatibility
            but not used by this simplified implementation. The input arrays
            are not modified.
            """
            # Work on a float copy so the caller's root map stays untouched.
            total_scores = np.array(root_scores, dtype=float)
            for part_score in part_scores:
                # Negate scores into costs, minimise, then negate back.
                cost_matrix = -np.asarray(part_score, dtype=float)
                total_scores += -distance_transform_2d(cost_matrix)
            return total_scores

        dp_advantages = {
            "效率": "O(n)时间复杂度,比暴力搜索快",
            "全局最优": "保证找到全局最优的部件配置",
            "可扩展": "容易扩展到更多部件",
            "并行化": "不同部件可以并行计算"
        }
        print("动态规划推理优势:")
        print("=" * 25)
        for adv, desc in dp_advantages.items():
            print(f" {adv}: {desc}")
        return distance_transform_2d, part_based_detection

    def dpm_vs_traditional_methods(self):
        """Print and return ``(comparison, dpm_innovations)``."""
        comparison = {
            "Viola-Jones": {
                "优势": ["速度快", "实时性好"],
                "劣势": ["只适用于特定物体", "刚性模板"],
                "vs DPM": "DPM更灵活,但速度较慢"
            },
            "HOG+SVM": {
                "优势": ["特征表达好", "泛化能力强"],
                "劣势": ["刚性检测窗口", "对变形敏感"],
                "vs DPM": "DPM增加了变形能力"
            },
            "传统滑动窗口": {
                "优势": ["实现简单", "易于理解"],
                "劣势": ["计算量大", "多尺度处理复杂"],
                "vs DPM": "DPM有更好的多尺度处理"
            }
        }
        dpm_innovations = {
            "可变形建模": "允许物体部件相对位置变化",
            "混合模型": "处理不同视角和姿态",
            "潜在变量学习": "自动学习最佳部件配置",
            "层次结构": "根-部件两级结构",
            "动态规划推理": "高效的最优化求解"
        }
        print("DPM vs 传统方法对比:")
        print("=" * 35)
        for method, details in comparison.items():
            print(f"\n{method}:")
            for key, value in details.items():
                shown = ', '.join(value) if isinstance(value, list) else value
                print(f" {key}: {shown}")
        print("\nDPM主要创新点:")
        print("-" * 20)
        for innovation, desc in dpm_innovations.items():
            print(f" {innovation}: {desc}")
        return comparison, dpm_innovations
# Usage example
dpm = DPMDetector()
# DPM architecture (method name was misspelled "dmp_architecture")
architecture = dpm.dpm_architecture()
# Scoring function
scoring_func, advantages = dpm.scoring_function()
# Latent SVM training
latent_svm_alg = dpm.latent_svm_training()
latent_svm_alg()
# Dynamic-programming inference (instance name was misspelled "dmp")
distance_transform, part_detection = dpm.dynamic_programming_inference()
# Comparison with earlier methods
comparison, innovations = dpm.dpm_vs_traditional_methods()
# Distance-transform demonstration
print("\n距离变换演示:")
print("-" * 15)
# Build a test cost matrix
test_costs = np.array([[1, 2, 3],
                       [2, 1, 2],
                       [3, 2, 1]])
print("原始代价矩阵:")
print(test_costs)
dt_result = distance_transform(test_costs)
print("距离变换结果:")
print(dt_result)

3.3 两阶段检测算法
3.3.1 R-CNN
class RCNNDetector:
    """Teaching model of the R-CNN detection pipeline."""

    def __init__(self):
        self.method_info = {
            "年份": 2014,
            "作者": "Ross Girshick et al.",
            "贡献": "首次将CNN用于目标检测",
            "突破": "大幅提升检测精度"
        }

    def rcnn_pipeline(self):
        """Print the pipeline steps.

        Returns:
            ``(pipeline_steps, selective_search)`` where the second item is
            an instance of the simulated Selective Search generator.
        """
        pipeline_steps = {
            "步骤1: 候选区域生成": {
                "方法": "Selective Search",
                "输出": "~2000个候选区域",
                "作用": "减少搜索空间",
                "特点": "基于图像分割和合并"
            },
            "步骤2: 特征提取": {
                "方法": "CNN (AlexNet)",
                "预处理": "将候选区域缩放到227×227",
                "输出": "4096维特征向量",
                "预训练": "ImageNet预训练权重"
            },
            "步骤3: 分类": {
                "方法": "SVM分类器",
                "训练": "每个类别训练一个二分类SVM",
                "输出": "类别概率",
                "后处理": "非极大值抑制"
            },
            "步骤4: 边界框回归": {
                "方法": "线性回归",
                "目标": "精细化边界框位置",
                "输入": "CNN特征",
                "输出": "位置偏移量"
            }
        }

        class SelectiveSearch:
            """Simulated Selective Search: proposals are random boxes."""

            def __init__(self):
                self.similarity_measures = [
                    "颜色相似度",
                    "纹理相似度",
                    "尺寸相似度",
                    "填充相似度"
                ]

            def generate_proposals(self, image, num_proposals=2000):
                """Return ``num_proposals`` random ``[x1, y1, x2, y2]`` boxes.

                Box sides are drawn from 50 up to (but excluding) 150 pixels
                and always stay inside the image. If an image side is 50
                pixels or less, boxes span that whole side. An ``image``
                without a ``shape`` attribute is treated as 224x224.
                """
                print("Selective Search过程:")
                print("1. 初始分割:使用Graph-based分割")
                print("2. 相似度计算:计算相邻区域相似度")
                print("3. 区域合并:合并最相似的区域")
                print("4. 重复合并:直到只剩一个区域")
                print("5. 提取边界框:记录合并过程中的所有区域")

                height, width = image.shape[:2] if hasattr(image, 'shape') else (224, 224)
                min_side, max_side = 50, 150
                # randint needs a non-empty range, so sides that cannot host
                # a min_side box are handled without sampling.
                wide = width > min_side
                tall = height > min_side

                proposals = []
                for _ in range(num_proposals):
                    x = np.random.randint(0, width - min_side) if wide else 0
                    y = np.random.randint(0, height - min_side) if tall else 0
                    w = (np.random.randint(min_side, min(max_side, width - x))
                         if wide else width)
                    h = (np.random.randint(min_side, min(max_side, height - y))
                         if tall else height)
                    proposals.append([x, y, x + w, y + h])
                return proposals

            def filter_proposals(self, proposals, min_size=20):
                """Keep proposals whose width and height are at least ``min_size``."""
                return [
                    [x1, y1, x2, y2]
                    for x1, y1, x2, y2 in proposals
                    if (x2 - x1) >= min_size and (y2 - y1) >= min_size
                ]

        print("R-CNN算法流水线:")
        print("=" * 30)
        for step, details in pipeline_steps.items():
            print(f"\n{step}:")
            for key, value in details.items():
                print(f" {key}: {value}")
        return pipeline_steps, SelectiveSearch()

    def rcnn_training_process(self):
        """Print the training stages; return ``(training_phases, bbox_formulas)``."""
        training_phases = {
            "阶段1: CNN预训练": {
                "数据": "ImageNet分类数据",
                "任务": "1000类图像分类",
                "网络": "AlexNet",
                "目标": "学习通用视觉特征"
            },
            "阶段2: CNN微调": {
                "数据": "检测数据集(PASCAL VOC)",
                "正样本": "与GT IoU ≥ 0.5的候选区域",
                "负样本": "与GT IoU < 0.5的候选区域",
                "修改": "最后一层改为N+1类(N个目标类+背景)"
            },
            "阶段3: SVM训练": {
                "正样本": "Ground Truth边界框",
                "负样本": "与GT IoU < 0.3的候选区域",
                "特征": "微调CNN的fc7层输出",
                "分类器": "每个类别训练一个二分类SVM"
            },
            "阶段4: 边界框回归": {
                "训练数据": "与GT IoU ≥ 0.6的候选区域",
                "输入": "CNN特征 + 候选区域坐标",
                "输出": "4个回归值(dx, dy, dw, dh)",
                # R-CNN fits the regressor with regularized least squares;
                # the smooth L1 loss was introduced later by Fast R-CNN.
                "损失": "正则化最小二乘(岭回归)"
            }
        }

        # P = proposal box, G = ground-truth box (centre x/y, width, height).
        regression_formulas = {
            "预测变换": {
                "dx": "(Gx - Px) / Pw",
                "dy": "(Gy - Py) / Ph",
                "dw": "log(Gw / Pw)",
                "dh": "log(Gh / Ph)"
            },
            "应用变换": {
                "Gx_pred": "Px + Pw * dx",
                "Gy_pred": "Py + Ph * dy",
                "Gw_pred": "Pw * exp(dw)",
                "Gh_pred": "Ph * exp(dh)"
            }
        }

        print("R-CNN训练过程:")
        print("=" * 25)
        for phase, details in training_phases.items():
            print(f"\n{phase}:")
            for key, value in details.items():
                print(f" {key}: {value}")

        print("边界框回归公式:")
        print("-" * 20)
        for transform_type, formulas in regression_formulas.items():
            print(f"{transform_type}:")
            for var, formula in formulas.items():
                print(f" {var} = {formula}")
        return training_phases, regression_formulas

    def rcnn_limitations(self):
        """Print and return ``(limitations, improvements_needed)``."""
        limitations = {
            "训练复杂": {
                "问题": "多阶段训练流程",
                "具体": [
                    "CNN预训练需要ImageNet数据",
                    "CNN微调需要检测数据",
                    "SVM需要单独训练",
                    "边界框回归需要单独训练"
                ],
                "影响": "训练时间长,调试困难"
            },
            "推理速度慢": {
                "问题": "每个候选区域都要过CNN",
                "具体": [
                    "2000个候选区域",
                    "每个都要前向传播一次",
                    "大量重复计算"
                ],
                "数据": "GPU上约13秒/张图",
                "影响": "无法实时应用"
            },
            "存储需求大": {
                "问题": "需要存储大量特征",
                "具体": [
                    "每个候选区域4096维特征",
                    "2000×4096×4字节≈32MB/图"
                ],
                "影响": "内存消耗大"
            },
            "精度提升有限": {
                "问题": "Selective Search质量限制",
                "具体": [
                    "候选区域质量不高",
                    "可能错过小目标",
                    "边界框不够精确"
                ],
                "影响": "检测性能上限受限"
            }
        }
        improvements_needed = {
            "训练简化": "需要端到端的训练方式",
            "速度提升": "需要减少重复计算",
            "精度改进": "需要更好的候选区域生成",
            "系统整合": "需要统一的框架"
        }
        print("R-CNN主要局限性:")
        print("=" * 25)
        for limitation, details in limitations.items():
            print(f"\n{limitation}:")
            for key, value in details.items():
                if isinstance(value, list):
                    print(f" {key}:")
                    for item in value:
                        print(f" - {item}")
                else:
                    print(f" {key}: {value}")
        print("\n改进方向:")
        print("-" * 10)
        for improvement, desc in improvements_needed.items():
            print(f" {improvement}: {desc}")
        return limitations, improvements_needed
# Usage example
rcnn = RCNNDetector()
# R-CNN pipeline and the simulated Selective Search object
pipeline, selective_search = rcnn.rcnn_pipeline()
# Training process
training_phases, bbox_formulas = rcnn.rcnn_training_process()
# Limitation analysis
limitations, improvements = rcnn.rcnn_limitations()
# Selective Search demonstration
print(f"\nSelective Search演示:")
print("-" * 20)
# Dummy image
dummy_image = np.random.rand(224, 224, 3)
proposals = selective_search.generate_proposals(dummy_image)
print(f"生成候选区域数量: {len(proposals)}")
print(f"前5个候选区域: {proposals[:5]}")
# Filter the proposals
filtered_proposals = selective_search.filter_proposals(proposals)
print(f"过滤后候选区域数量: {len(filtered_proposals)}")

现在已经完成了第3章的前半部分内容。我将继续完成两阶段检测算法的其余部分(Fast R-CNN、Faster R-CNN)和一阶段方法的优势分析,以及章节总结。
继续学习进度,完成YOLO课程的系统性学习...
