Chapter 5: YOLO Series Evolution (v2-v5)
Learning Objectives
- Master the key improvements of YOLO v2 (anchor boxes, batch normalization, multi-scale training, etc.)
- Understand the feature pyramid and multi-scale detection mechanism of YOLO v3
- Learn about the engineering tricks integration and performance optimization of YOLO v4
- Familiarize yourself with the practical improvements and deployment optimization of YOLO v5
5.1 YOLO v2 (YOLO9000) - Better, Faster, Stronger
5.1.1 Core Improvements
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import cv2  # used later by the Mosaic augmentation in Section 5.3
class YOLOv2Improvements:
def __init__(self):
self.improvements = {
"Batch Normalization": {
"Effect": "Accelerates convergence, improves stability",
"Location": "After each convolutional layer",
"Result": "mAP improved by 2%, removes Dropout"
},
"High Resolution Classifier": {
"Pretraining": "448×448 classification task",
"Result": "mAP improved by 4%",
"Principle": "Adapts to high resolution input"
},
"Anchor Mechanism": {
"Concept": "Predefined bounding box shapes",
"Quantity": "5 anchor boxes",
"Result": "Recall improved from 81% to 88%"
},
"Dimension Clustering": {
"Method": "K-means clustering to select anchors",
"Distance": "1-IoU as distance metric",
"Result": "Anchors better suited to dataset"
},
"Direct Location Prediction": {
"Problem": "Anchors can appear anywhere in image",
"Solution": "Use sigmoid to constrain offset",
"Stability": "More stable training"
},
"Fine-Grained Features": {
"Method": "Passthrough layer",
"Fusion": "26×26 features with 13×13 features",
"Result": "Improved small object detection"
},
"Multi-Scale Training": {
"Sizes": "320-608 pixels, 32-pixel intervals",
"Frequency": "Change size every 10 batches",
"Generalization": "Improves generalization at different scales"
}
}
def anchor_mechanism(self):
"""Detailed anchor mechanism"""
class AnchorGenerator:
def __init__(self, anchor_sizes, grid_size=13):
self.anchor_sizes = anchor_sizes # [(w1,h1), (w2,h2), ...]
self.grid_size = grid_size
def generate_anchors(self):
"""Generate all anchor boxes"""
anchors = []
for i in range(self.grid_size):
for j in range(self.grid_size):
                        for w, h in self.anchor_sizes:
                            # Anchor centered on its grid cell; (cx, cy) are normalized to [0,1],
                            # while (w, h) stay in grid-cell units, as in the original YOLOv2 config
                            cx = (j + 0.5) / self.grid_size
                            cy = (i + 0.5) / self.grid_size
                            anchors.append([cx, cy, w, h])
return np.array(anchors)
def kmeans_anchors(self, boxes, k=5):
"""Generate anchors using K-means clustering"""
# Extract width and height
widths = boxes[:, 2] - boxes[:, 0]
heights = boxes[:, 3] - boxes[:, 1]
sizes = np.column_stack([widths, heights])
# K-means clustering
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=k, random_state=42)
kmeans.fit(sizes)
# Return cluster centers as anchor sizes
anchor_sizes = kmeans.cluster_centers_
# Sort by area
areas = anchor_sizes[:, 0] * anchor_sizes[:, 1]
sorted_indices = np.argsort(areas)
return anchor_sizes[sorted_indices]
# Improved location prediction
def direct_location_prediction():
"""Direct location prediction"""
# YOLO v1 problem: predicting (x, y) can be unstable
# YOLO v2 solution: predict offset, constrain with sigmoid
def predict_bbox(tx, ty, tw, th, anchor_w, anchor_h, grid_x, grid_y, grid_size):
"""
tx, ty, tw, th: Network predictions
anchor_w, anchor_h: Anchor dimensions
grid_x, grid_y: Grid coordinates
"""
# Center point prediction (sigmoid constrains within grid)
bx = torch.sigmoid(tx) + grid_x
by = torch.sigmoid(ty) + grid_y
# Width and height prediction (exponential transform)
bw = anchor_w * torch.exp(tw)
bh = anchor_h * torch.exp(th)
                # Normalize the center to [0,1]; (bw, bh) remain in anchor/grid-cell units
                bx = bx / grid_size
                by = by / grid_size
                return bx, by, bw, bh
return predict_bbox
print("YOLO v2 Anchor Mechanism:")
print("=" * 25)
# Example anchor generation
anchor_sizes = [(1.3221, 1.73145), (3.19275, 4.00944), (5.05587, 8.09892),
(9.47112, 4.84053), (11.2364, 10.0071)]
anchor_gen = AnchorGenerator(anchor_sizes)
anchors = anchor_gen.generate_anchors()
print(f"Number of anchors: {len(anchors)}")
print(f"First 5 anchors: {anchors[:5]}")
return AnchorGenerator, direct_location_prediction()
def passthrough_layer(self):
"""Passthrough layer implementation"""
class PassthroughLayer(nn.Module):
def __init__(self, stride=2):
super(PassthroughLayer, self).__init__()
self.stride = stride
def forward(self, x):
"""
Reorganize 26×26×512 feature map to 13×13×2048
"""
batch_size, channels, height, width = x.size()
# Ensure dimensions are divisible by stride
assert height % self.stride == 0 and width % self.stride == 0
new_height = height // self.stride
new_width = width // self.stride
# Reorganize tensor
x = x.view(batch_size, channels, new_height, self.stride, new_width, self.stride)
x = x.permute(0, 1, 3, 5, 2, 4).contiguous()
x = x.view(batch_size, channels * self.stride * self.stride, new_height, new_width)
return x
# Feature fusion example
def feature_fusion_example():
"""Feature fusion example"""
# High resolution features (26×26×512)
high_res_feat = torch.randn(1, 512, 26, 26)
# Low resolution features (13×13×1024)
low_res_feat = torch.randn(1, 1024, 13, 13)
# Passthrough layer
passthrough = PassthroughLayer(stride=2)
transformed_feat = passthrough(high_res_feat)
print(f"High resolution features: {high_res_feat.shape}")
print(f"After Passthrough: {transformed_feat.shape}")
print(f"Low resolution features: {low_res_feat.shape}")
# Feature fusion
fused_feat = torch.cat([low_res_feat, transformed_feat], dim=1)
print(f"Fused features: {fused_feat.shape}")
return fused_feat
return PassthroughLayer, feature_fusion_example
def multi_scale_training(self):
"""Multi-scale training"""
class MultiScaleTraining:
def __init__(self, min_size=320, max_size=608, step=32):
self.min_size = min_size
self.max_size = max_size
self.step = step
self.scales = list(range(min_size, max_size + step, step))
self.current_scale = 416 # Default size
def get_random_scale(self):
"""Randomly select training size"""
return np.random.choice(self.scales)
def resize_batch(self, images, targets, new_size):
"""Resize batch dimensions"""
# Image resizing
resized_images = F.interpolate(images, size=(new_size, new_size),
mode='bilinear', align_corners=False)
# Target coordinate adjustment
scale_factor = new_size / images.size(-1)
if targets is not None:
# Assume targets format is [batch_idx, class, x, y, w, h]
targets[:, 2:] *= scale_factor
return resized_images, targets
def training_step(self, model, images, targets, step_count):
"""Training step (including size adjustment)"""
# Adjust size every 10 batches
if step_count % 10 == 0:
self.current_scale = self.get_random_scale()
print(f"Switching to size: {self.current_scale}")
# Adjust input size
resized_images, resized_targets = self.resize_batch(
images, targets, self.current_scale)
# Model forward pass
outputs = model(resized_images)
return outputs, resized_targets
multi_scale_benefits = {
"Robustness": "Adapts to different sized inputs",
"Generalization": "Improves performance at different resolutions",
"Practicality": "Same model works for multiple application scenarios",
"Efficiency": "Can adjust inference size based on accuracy requirements"
}
print("Multi-scale training advantages:")
print("=" * 20)
for benefit, desc in multi_scale_benefits.items():
print(f" {benefit}: {desc}")
return MultiScaleTraining, multi_scale_benefits
# Usage example
yolo_v2 = YOLOv2Improvements()
print("YOLO v2 Main Improvements:")
print("=" * 25)
for improvement, details in yolo_v2.improvements.items():
print(f"\n{improvement}:")
for key, value in details.items():
print(f" {key}: {value}")
# Anchor mechanism
AnchorGenerator, bbox_prediction = yolo_v2.anchor_mechanism()
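# Not in the original demo: decode one prediction with illustrative offset values,
# using YOLOv2's second clustered anchor (roughly 3.19×4.01 grid cells).
bx, by, bw, bh = bbox_prediction(torch.tensor(0.2), torch.tensor(-0.1),
                                 torch.tensor(0.3), torch.tensor(0.1),
                                 anchor_w=3.19, anchor_h=4.01,
                                 grid_x=6, grid_y=6, grid_size=13)
print(f"Decoded box: cx={bx:.3f}, cy={by:.3f} (normalized), w={bw:.3f}, h={bh:.3f} (grid units)")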
# Passthrough layer
PassthroughLayer, feature_fusion = yolo_v2.passthrough_layer()
# Multi-scale training
MultiScaleTraining, benefits = yolo_v2.multi_scale_training()
# Demonstrate feature fusion
print("\nFeature Fusion Demo:")
print("-" * 15)
fused_features = feature_fusion()
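# A small extra check (not in the original demo): sample the multi-scale
# training sizes described above (320-608 pixels in steps of 32).
ms_trainer = MultiScaleTraining()
print(f"\nCandidate training sizes: {ms_trainer.scales}")
print(f"Randomly selected size: {ms_trainer.get_random_scale()}")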
5.2 YOLO v3 - Multi-Scale Prediction
5.2.1 Darknet-53 Backbone
class YOLOv3Architecture:
def __init__(self):
self.key_features = {
"Multi-scale Prediction": "3 feature maps at different scales",
"Feature Pyramid": "FPN-like feature fusion",
"Darknet-53": "Backbone network with residual connections",
"Pointwise Convolution": "1×1 convolution for dimensionality reduction",
"Binary Classification Loss": "Independent sigmoid for each class"
}
def build_darknet53(self):
"""Build Darknet-53 backbone"""
class ConvBNLeaky(nn.Module):
def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0):
super(ConvBNLeaky, self).__init__()
self.conv = nn.Conv2d(in_channels, out_channels, kernel_size,
stride, padding, bias=False)
self.bn = nn.BatchNorm2d(out_channels)
self.leaky = nn.LeakyReLU(0.1, inplace=True)
def forward(self, x):
return self.leaky(self.bn(self.conv(x)))
class ResidualBlock(nn.Module):
def __init__(self, channels):
super(ResidualBlock, self).__init__()
self.conv1 = ConvBNLeaky(channels, channels // 2, 1)
self.conv2 = ConvBNLeaky(channels // 2, channels, 3, padding=1)
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.conv2(out)
return out + residual
class Darknet53(nn.Module):
def __init__(self):
super(Darknet53, self).__init__()
# Initial convolution
self.conv1 = ConvBNLeaky(3, 32, 3, padding=1)
self.conv2 = ConvBNLeaky(32, 64, 3, stride=2, padding=1)
# Residual block groups
self.res_block1 = self._make_layer(64, 1)
self.conv3 = ConvBNLeaky(64, 128, 3, stride=2, padding=1)
self.res_block2 = self._make_layer(128, 2)
self.conv4 = ConvBNLeaky(128, 256, 3, stride=2, padding=1)
self.res_block3 = self._make_layer(256, 8)
self.conv5 = ConvBNLeaky(256, 512, 3, stride=2, padding=1)
self.res_block4 = self._make_layer(512, 8)
self.conv6 = ConvBNLeaky(512, 1024, 3, stride=2, padding=1)
self.res_block5 = self._make_layer(1024, 4)
def _make_layer(self, channels, num_blocks):
layers = []
for _ in range(num_blocks):
layers.append(ResidualBlock(channels))
return nn.Sequential(*layers)
def forward(self, x):
x = self.conv1(x)
x = self.conv2(x)
x = self.res_block1(x)
x = self.conv3(x)
x = self.res_block2(x)
x = self.conv4(x)
x = self.res_block3(x)
route1 = x # 52×52 feature map
x = self.conv5(x)
x = self.res_block4(x)
route2 = x # 26×26 feature map
x = self.conv6(x)
x = self.res_block5(x) # 13×13 feature map
return route1, route2, x
return Darknet53, ConvBNLeaky
    def feature_pyramid_network(self):
        """Feature Pyramid Network"""
        # Build the backbone pieces at method level so the nested module
        # can close over Darknet53 and ConvBNLeaky
        Darknet53, ConvBNLeaky = self.build_darknet53()
        class YOLOv3FPN(nn.Module):
            def __init__(self, num_classes=80, num_anchors=3):
                super(YOLOv3FPN, self).__init__()
                self.num_classes = num_classes
                self.num_anchors = num_anchors
                # Darknet-53 backbone (from the enclosing method's closure)
                self.backbone = Darknet53()
# Detection heads
self.detection_head1 = self._make_detection_head(1024, 512)
self.detection_head2 = self._make_detection_head(768, 256) # 512 + 256
self.detection_head3 = self._make_detection_head(384, 128) # 256 + 128
# Upsampling
self.upsample1 = nn.Upsample(scale_factor=2, mode='nearest')
self.upsample2 = nn.Upsample(scale_factor=2, mode='nearest')
# 1×1 convolution for dimensionality reduction
self.conv_reduce1 = ConvBNLeaky(512, 256, 1)
self.conv_reduce2 = ConvBNLeaky(256, 128, 1)
def _make_detection_head(self, in_channels, mid_channels):
"""Create detection head"""
layers = []
# 5 convolutional layers
for i in range(5):
if i % 2 == 0:
layers.append(ConvBNLeaky(in_channels if i == 0 else mid_channels * 2,
mid_channels, 1))
else:
layers.append(ConvBNLeaky(mid_channels, mid_channels * 2, 3, padding=1))
# Detection convolution
detection_conv = nn.Conv2d(mid_channels,
self.num_anchors * (5 + self.num_classes),
1)
layers.append(detection_conv)
return nn.Sequential(*layers)
def forward(self, x):
# Backbone forward pass
route1, route2, x = self.backbone(x) # 52×52, 26×26, 13×13
# First scale detection (13×13)
detection1 = self.detection_head1(x)
                # Upsample and fuse (26×26); simplification: take the first 512 backbone
                # channels (real YOLOv3 routes from inside the detection head)
                x = self.conv_reduce1(x[:, :512])
x = self.upsample1(x)
x = torch.cat([x, route2], dim=1)
detection2 = self.detection_head2(x)
                # Upsample and fuse (52×52); same simplification with the first 256 channels
                x = self.conv_reduce2(x[:, :256])
x = self.upsample2(x)
x = torch.cat([x, route1], dim=1)
detection3 = self.detection_head3(x)
return detection1, detection2, detection3
return YOLOv3FPN
def multi_scale_anchors(self):
"""Multi-scale anchor design"""
# YOLOv3's 9 anchors (3 scales × 3 anchors)
anchors = {
"Large scale (13×13)": [(116, 90), (156, 198), (373, 326)],
"Medium scale (26×26)": [(30, 61), (62, 45), (59, 119)],
"Small scale (52×52)": [(10, 13), (16, 30), (33, 23)]
}
def assign_anchors_to_scales():
"""Anchor assignment strategy"""
assignment_strategy = {
"Principle": "Assign anchors to appropriate scales based on size",
"Large objects": "Assign to low-resolution feature map (13×13)",
"Medium objects": "Assign to medium-resolution feature map (26×26)",
"Small objects": "Assign to high-resolution feature map (52×52)",
"Advantage": "Each scale focuses on specific object sizes"
}
return assignment_strategy
print("YOLOv3 Multi-scale Anchors:")
print("=" * 25)
for scale, anchor_list in anchors.items():
print(f"\n{scale}:")
for i, (w, h) in enumerate(anchor_list):
print(f" Anchor{i+1}: {w}×{h}")
strategy = assign_anchors_to_scales()
print(f"\nAssignment strategy:")
for key, value in strategy.items():
print(f" {key}: {value}")
return anchors, strategy
# Usage example
yolo_v3 = YOLOv3Architecture()
print("YOLO v3 Key Features:")
print("=" * 25)
for feature, description in yolo_v3.key_features.items():
print(f" {feature}: {description}")
# Build Darknet-53
Darknet53, ConvBNLeaky = yolo_v3.build_darknet53()
backbone = Darknet53()
# Count parameters
def count_parameters(model):
return sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"\nDarknet-53 parameters: {count_parameters(backbone):,}")
# Test backbone
test_input = torch.randn(1, 3, 416, 416)
with torch.no_grad():
route1, route2, output = backbone(test_input)
print(f"\nFeature map sizes:")
print(f" route1 (52×52): {route1.shape}")
print(f" route2 (26×26): {route2.shape}")
print(f" output (13×13): {output.shape}")
# Multi-scale anchors
anchors, strategy = yolo_v3.multi_scale_anchors()
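# Smoke test of the full three-scale detector (a sketch; 80 COCO classes and
# 3 anchors per scale assumed, giving 3 × (5 + 80) = 255 output channels).
YOLOv3FPN = yolo_v3.feature_pyramid_network()
fpn_model = YOLOv3FPN(num_classes=80, num_anchors=3)
with torch.no_grad():
    d1, d2, d3 = fpn_model(torch.randn(1, 3, 416, 416))
print(f"\nDetection outputs:")
print(f"  13×13 head: {d1.shape}")
print(f"  26×26 head: {d2.shape}")
print(f"  52×52 head: {d3.shape}")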
5.3 YOLO v4 - Engineering Tricks Integration
5.3.1 Bag of Freebies and Specials
class YOLOv4Optimizations:
def __init__(self):
self.bag_of_freebies = {
"Data Augmentation": {
"Mosaic": "4-image stitching",
"CutMix": "Image cropping and mixing",
"MixUp": "Linear image mixing",
"Self-Adversarial Training": "Adversarial sample augmentation"
},
"Regularization": {
"DropBlock": "Structured Dropout",
"Label Smoothing": "Label smoothing",
"Class label smoothing": "Class label smoothing"
},
"Loss Functions": {
"CIoU Loss": "Complete IoU loss",
"Focal Loss": "Hard example mining loss",
"DIoU Loss": "Distance IoU loss"
}
}
self.bag_of_specials = {
"Activation Functions": {
"Mish": "Self-gated activation function",
"Swish": "Self-gated linear unit",
"ReLU6": "Truncated ReLU"
},
"Attention Mechanisms": {
"SE": "Squeeze-and-Excitation",
"CBAM": "Convolutional Block Attention Module",
"ECA": "Efficient Channel Attention"
},
"Normalization": {
"Cross-stage": "Cross-stage partial connections",
"Cross mini-Batch": "Cross mini-batch normalization"
},
"Skip Connections": {
"Residual": "Residual connections",
"Weighted residual": "Weighted residual connections",
"Multi-input weighted": "Multi-input weighted connections"
}
}
def mosaic_augmentation(self):
"""Mosaic data augmentation"""
class MosaicAugmentation:
def __init__(self, image_size=640):
self.image_size = image_size
def mosaic_augment(self, images, targets):
"""
Mosaic augmentation: stitch 4 images into one
images: List of 4 images
targets: Corresponding annotation list
"""
assert len(images) == 4, "Mosaic requires 4 images"
# Randomly choose stitching center point
cut_x = np.random.randint(self.image_size // 4, 3 * self.image_size // 4)
cut_y = np.random.randint(self.image_size // 4, 3 * self.image_size // 4)
# Create output image
mosaic_image = np.zeros((self.image_size, self.image_size, 3), dtype=np.uint8)
mosaic_targets = []
# Define positions for 4 quadrants
positions = [
(0, 0, cut_x, cut_y), # Top left
(cut_x, 0, self.image_size, cut_y), # Top right
(0, cut_y, cut_x, self.image_size), # Bottom left
(cut_x, cut_y, self.image_size, self.image_size) # Bottom right
]
for i, (image, target) in enumerate(zip(images, targets)):
x1, y1, x2, y2 = positions[i]
# Resize image to fit region
h, w = image.shape[:2]
scale = min((x2 - x1) / w, (y2 - y1) / h)
new_w = int(w * scale)
new_h = int(h * scale)
resized_image = cv2.resize(image, (new_w, new_h))
# Place image
mosaic_image[y1:y1+new_h, x1:x1+new_w] = resized_image
# Adjust annotations
if target is not None:
adjusted_target = target.copy()
adjusted_target[:, [0, 2]] = adjusted_target[:, [0, 2]] * scale + x1
adjusted_target[:, [1, 3]] = adjusted_target[:, [1, 3]] * scale + y1
mosaic_targets.append(adjusted_target)
# Merge all annotations
if mosaic_targets:
mosaic_targets = np.concatenate(mosaic_targets, axis=0)
return mosaic_image, mosaic_targets
def cutmix_augment(self, image1, target1, image2, target2, alpha=1.0):
"""CutMix augmentation"""
lam = np.random.beta(alpha, alpha)
h, w = image1.shape[:2]
cut_rat = np.sqrt(1. - lam)
cut_w = int(w * cut_rat)
cut_h = int(h * cut_rat)
# Randomly select cut position
cx = np.random.randint(w)
cy = np.random.randint(h)
bbx1 = np.clip(cx - cut_w // 2, 0, w)
bby1 = np.clip(cy - cut_h // 2, 0, h)
bbx2 = np.clip(cx + cut_w // 2, 0, w)
bby2 = np.clip(cy + cut_h // 2, 0, h)
# Execute CutMix
mixed_image = image1.copy()
mixed_image[bby1:bby2, bbx1:bbx2] = image2[bby1:bby2, bbx1:bbx2]
# Mix annotations
mixed_targets = []
if target1 is not None:
mixed_targets.append(target1)
if target2 is not None:
# Filter targets outside the cut region
valid_targets = []
for target in target2:
x1, y1, x2, y2 = target[:4]
if not (x2 < bbx1 or x1 > bbx2 or y2 < bby1 or y1 > bby2):
valid_targets.append(target)
if valid_targets:
mixed_targets.append(np.array(valid_targets))
if mixed_targets:
mixed_targets = np.concatenate(mixed_targets, axis=0)
return mixed_image, mixed_targets
return MosaicAugmentation
def mish_activation(self):
"""Mish activation function"""
class Mish(nn.Module):
def __init__(self):
super(Mish, self).__init__()
def forward(self, x):
return x * torch.tanh(F.softplus(x))
def mish_vs_others():
"""Mish vs other activation functions"""
x = torch.linspace(-3, 3, 1000)
activations = {
'ReLU': F.relu(x),
'Swish': x * torch.sigmoid(x),
'Mish': x * torch.tanh(F.softplus(x)),
'LeakyReLU': F.leaky_relu(x, 0.1)
}
            properties = {
                'ReLU': "Simple and fast, but negative inputs get zero gradient (dying ReLU)",
                'Swish': "Smooth, self-gated, good performance",
                'Mish': "Smoother still, better convergence, slightly higher accuracy",
                'LeakyReLU': "Small negative slope alleviates dying neurons, but not self-gated"
            }
print("Activation function comparison:")
print("=" * 25)
for name, prop in properties.items():
print(f" {name}: {prop}")
return activations, properties
return Mish, mish_vs_others
def ciou_loss(self):
"""Complete IoU Loss"""
def ciou_loss_function(pred_boxes, target_boxes):
"""
CIoU loss function
Considers overlap area, center distance, aspect ratio
"""
# Calculate IoU
def calculate_iou(box1, box2):
x1 = torch.max(box1[:, 0], box2[:, 0])
y1 = torch.max(box1[:, 1], box2[:, 1])
x2 = torch.min(box1[:, 2], box2[:, 2])
y2 = torch.min(box1[:, 3], box2[:, 3])
intersection = torch.clamp(x2 - x1, min=0) * torch.clamp(y2 - y1, min=0)
area1 = (box1[:, 2] - box1[:, 0]) * (box1[:, 3] - box1[:, 1])
area2 = (box2[:, 2] - box2[:, 0]) * (box2[:, 3] - box2[:, 1])
union = area1 + area2 - intersection
return intersection / (union + 1e-6)
# Calculate center distance
def center_distance(box1, box2):
center1_x = (box1[:, 0] + box1[:, 2]) / 2
center1_y = (box1[:, 1] + box1[:, 3]) / 2
center2_x = (box2[:, 0] + box2[:, 2]) / 2
center2_y = (box2[:, 1] + box2[:, 3]) / 2
return (center1_x - center2_x)**2 + (center1_y - center2_y)**2
# Calculate minimum enclosing box diagonal length
def diagonal_length(box1, box2):
c_x = torch.max(box1[:, 2], box2[:, 2]) - torch.min(box1[:, 0], box2[:, 0])
c_y = torch.max(box1[:, 3], box2[:, 3]) - torch.min(box1[:, 1], box2[:, 1])
return c_x**2 + c_y**2
# Calculate aspect ratio consistency
def aspect_ratio_consistency(box1, box2):
w1 = box1[:, 2] - box1[:, 0]
h1 = box1[:, 3] - box1[:, 1]
w2 = box2[:, 2] - box2[:, 0]
h2 = box2[:, 3] - box2[:, 1]
v = (4 / (torch.pi**2)) * torch.pow(torch.atan(w2/h2) - torch.atan(w1/h1), 2)
return v
# Calculate CIoU
iou = calculate_iou(pred_boxes, target_boxes)
rho2 = center_distance(pred_boxes, target_boxes)
c2 = diagonal_length(pred_boxes, target_boxes)
v = aspect_ratio_consistency(pred_boxes, target_boxes)
with torch.no_grad():
alpha = v / (1 - iou + v + 1e-6)
ciou = iou - rho2 / (c2 + 1e-6) - alpha * v
return 1 - ciou # CIoU loss
loss_comparison = {
"IoU Loss": "Only considers overlap area",
"GIoU Loss": "Considers minimum enclosing box",
"DIoU Loss": "Additionally considers center distance",
"CIoU Loss": "Also considers aspect ratio consistency",
"Advantage": "Faster convergence, more accurate regression"
}
print("CIoU Loss advantages:")
print("=" * 20)
for loss_type, description in loss_comparison.items():
print(f" {loss_type}: {description}")
return ciou_loss_function, loss_comparison
def csp_darknet53(self):
"""CSPDarknet53 backbone"""
class CSPBlock(nn.Module):
def __init__(self, in_channels, out_channels, num_blocks):
super(CSPBlock, self).__init__()
self.conv1 = nn.Conv2d(in_channels, out_channels // 2, 1, bias=False)
self.conv2 = nn.Conv2d(in_channels, out_channels // 2, 1, bias=False)
# Residual blocks
self.res_blocks = nn.ModuleList()
for _ in range(num_blocks):
self.res_blocks.append(nn.Sequential(
nn.Conv2d(out_channels // 2, out_channels // 2, 1, bias=False),
nn.BatchNorm2d(out_channels // 2),
nn.LeakyReLU(0.1, inplace=True),
nn.Conv2d(out_channels // 2, out_channels // 2, 3, padding=1, bias=False),
nn.BatchNorm2d(out_channels // 2),
nn.LeakyReLU(0.1, inplace=True)
))
self.conv3 = nn.Conv2d(out_channels, out_channels, 1, bias=False)
self.bn = nn.BatchNorm2d(out_channels)
self.activation = nn.LeakyReLU(0.1, inplace=True)
def forward(self, x):
# Split features
x1 = self.conv1(x)
x2 = self.conv2(x)
# Residual connections
for res_block in self.res_blocks:
x2 = x2 + res_block(x2)
# Feature fusion
out = torch.cat([x1, x2], dim=1)
out = self.conv3(out)
out = self.bn(out)
out = self.activation(out)
return out
        csp_advantages = {
            "Gradient Flow": "Splitting the feature map reduces duplicated gradient information",
            "Feature Reuse": "Better feature reuse via the partial shortcut path",
            "Parameter Efficiency": "Fewer parameters and FLOPs at comparable accuracy",
            "Inference Speed": "Faster inference"
        }
print("CSP advantages:")
print("=" * 10)
for advantage, description in csp_advantages.items():
print(f" {advantage}: {description}")
return CSPBlock, csp_advantages
# Usage example
yolo_v4 = YOLOv4Optimizations()
print("YOLO v4 Bag of Freebies:")
print("=" * 30)
for category, techniques in yolo_v4.bag_of_freebies.items():
print(f"\n{category}:")
for technique, description in techniques.items():
print(f" {technique}: {description}")
print("\nYOLO v4 Bag of Specials:")
print("=" * 30)
for category, techniques in yolo_v4.bag_of_specials.items():
print(f"\n{category}:")
for technique, description in techniques.items():
print(f" {technique}: {description}")
# Mosaic augmentation
MosaicAugmentation = yolo_v4.mosaic_augmentation()
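# Hypothetical Mosaic run on four random images (annotations omitted for brevity;
# real use would pass pixel-coordinate boxes so they get rescaled and shifted).
mosaic = MosaicAugmentation(image_size=416)
sample_images = [np.random.randint(0, 255, (300, 400, 3), dtype=np.uint8) for _ in range(4)]
mosaic_image, _ = mosaic.mosaic_augment(sample_images, [None] * 4)
print(f"\nMosaic output shape: {mosaic_image.shape}")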
# Mish activation function
Mish, mish_comparison = yolo_v4.mish_activation()
activations, properties = mish_comparison()
# CIoU loss
ciou_loss_fn, loss_comparison = yolo_v4.ciou_loss()
# CSP structure
CSPBlock, csp_advantages = yolo_v4.csp_darknet53()
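# Quick numeric checks with toy boxes/tensors (illustrative values, not from the text):
pred_boxes = torch.tensor([[10., 10., 50., 50.], [20., 20., 60., 80.]])
true_boxes = torch.tensor([[12., 12., 48., 52.], [25., 25., 55., 75.]])
print(f"\nCIoU loss per box pair: {ciou_loss_fn(pred_boxes, true_boxes)}")
csp_block = CSPBlock(64, 128, num_blocks=2)
with torch.no_grad():
    csp_out = csp_block(torch.randn(1, 64, 32, 32))
print(f"CSPBlock: (1, 64, 32, 32) -> {tuple(csp_out.shape)}")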
5.4 YOLO v5 - Engineering Optimization
5.4.1 Practical Improvements
class YOLOv5Improvements:
def __init__(self):
self.improvements = {
"Data Loading": {
"Adaptive Anchors": "Automatically calculate optimal anchors",
"Adaptive Image Scaling": "Aspect ratio preserving scaling",
"Efficient Data Loading": "Multi-process data loading optimization"
},
"Training Optimization": {
"Automatic Mixed Precision": "FP16 training acceleration",
"Exponential Moving Average": "Model weight smoothing",
"Cosine Learning Rate": "Better learning rate scheduling",
"Early Stopping": "Prevent overfitting"
},
"Model Architecture": {
"Focus Structure": "Efficient downsampling",
"CSP Structure": "Cross-stage partial connections",
"SPP Structure": "Spatial pyramid pooling",
"PANet": "Path aggregation network"
},
"Engineering": {
"Model Scaling": "Different sized model family",
"ONNX Export": "Easy deployment",
"TensorRT Optimization": "Inference acceleration",
"Mobile Optimization": "Lightweight versions"
}
}
def focus_structure(self):
"""Focus structure"""
class Focus(nn.Module):
def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding=0):
super(Focus, self).__init__()
self.conv = nn.Conv2d(in_channels * 4, out_channels, kernel_size, stride, padding, bias=False)
self.bn = nn.BatchNorm2d(out_channels)
self.act = nn.SiLU(inplace=True) # Swish/SiLU activation
def forward(self, x):
# Rearrange 2x2 pixel blocks into 4x channels
# Example: (B, 3, 640, 640) -> (B, 12, 320, 320)
                return self.act(self.bn(self.conv(torch.cat([
                    x[..., ::2, ::2],    # top-left (even rows, even cols)
                    x[..., 1::2, ::2],   # bottom-left (odd rows, even cols)
                    x[..., ::2, 1::2],   # top-right (even rows, odd cols)
                    x[..., 1::2, 1::2]   # bottom-right (odd rows, odd cols)
                ], 1))))
def focus_advantages():
"""Focus structure advantages"""
advantages = {
"No Information Loss": "Unlike conv stride=2, doesn't lose information",
"Computational Efficiency": "Reduces computation",
"Feature Preservation": "Preserves all pixel information",
"Compatibility": "Easy to integrate into existing architectures"
}
return advantages
return Focus, focus_advantages()
def adaptive_anchor(self):
"""Adaptive anchors"""
class AdaptiveAnchor:
def __init__(self, dataset, num_anchors=9, thr=4.0):
self.dataset = dataset
self.num_anchors = num_anchors
self.thr = thr
def check_anchor_order(self, anchors, targets, img_size):
"""Check anchor order"""
m = len(anchors)
bpr, aat = self.metric(anchors, targets)
                print(f'Best possible recall: {bpr:.3f}, anchors above threshold: {aat:.3f}')
if bpr < 0.98:
print('Running automatic anchor optimization...')
new_anchors = self.kmean_anchors(targets, n=m, img_size=img_size, thr=self.thr)
new_bpr, new_aat = self.metric(new_anchors, targets)
if new_bpr > bpr:
print(f'New anchor BPR: {new_bpr:.3f}, AAT: {new_aat:.3f}')
return new_anchors
else:
print('Keeping original anchors')
return anchors
return anchors
            def metric(self, anchors, targets):
                """Anchor quality metrics: BPR and anchors-above-threshold"""
                if len(targets) == 0:
                    return 0, 0
                twh = targets[:, 4:6]  # target widths and heights (normalized)
                # Ratio of every target to every anchor: (n_targets, n_anchors, 2)
                r = twh[:, None] / anchors[None]
                # An anchor "fits" a target if the worse of the w/h ratios is below the threshold
                j = torch.max(r, 1. / r).max(2)[0] < self.thr
                bpr = j.any(1).float().mean()   # best possible recall: targets with >= 1 fitting anchor
                aat = j.float().sum(1).mean()   # average number of fitting anchors per target
                return bpr, aat
def kmean_anchors(self, targets, n=9, img_size=640, thr=4.0, gen=1000):
"""K-means anchor clustering"""
from scipy.cluster.vq import kmeans
def fitness(k):
_, dist = kmeans(wh, k)
return 1 / dist
                # Extract width and height in pixels (accepts torch tensors or numpy arrays)
                wh = np.asarray(targets[:, 4:6]) * img_size
# K-means clustering
print(f'Performing K-means clustering with {len(wh)} targets...')
s = wh.std(0) # Standard deviation
k, dist = kmeans(wh / s, n, iter=30) # Cluster
k *= s
# Sort by area
k = k[np.argsort(k.prod(1))]
f = fitness(k)
print(f'Anchor fitness: {f:.3f}')
return k
return AdaptiveAnchor
def model_scaling(self):
"""Model scaling strategy"""
def create_model_variants():
"""Create different sized model variants"""
variants = {
'YOLOv5n': { # nano
'depth_multiple': 0.33,
'width_multiple': 0.25,
'parameters': '1.9M',
'gflops': '4.5',
'speed_cpu': '6.3ms',
'speed_gpu': '0.6ms'
},
'YOLOv5s': { # small
'depth_multiple': 0.33,
'width_multiple': 0.50,
'parameters': '7.2M',
'gflops': '16.5',
'speed_cpu': '11.9ms',
'speed_gpu': '0.9ms'
},
'YOLOv5m': { # medium
'depth_multiple': 0.67,
'width_multiple': 0.75,
'parameters': '21.2M',
'gflops': '49.0',
'speed_cpu': '25.1ms',
'speed_gpu': '1.7ms'
},
'YOLOv5l': { # large
'depth_multiple': 1.0,
'width_multiple': 1.0,
'parameters': '46.5M',
'gflops': '109.1',
'speed_cpu': '47.9ms',
'speed_gpu': '2.7ms'
},
'YOLOv5x': { # extra large
'depth_multiple': 1.33,
'width_multiple': 1.25,
'parameters': '86.7M',
'gflops': '205.7',
'speed_cpu': '95.2ms',
'speed_gpu': '4.6ms'
}
}
return variants
def scale_model(base_channels, base_depth, width_mult, depth_mult):
"""Adjust model based on scaling factors"""
scaled_channels = int(base_channels * width_mult)
scaled_depth = max(1, int(base_depth * depth_mult))
return scaled_channels, scaled_depth
variants = create_model_variants()
print("YOLOv5 Model Variants:")
print("=" * 25)
for model, specs in variants.items():
print(f"\n{model}:")
for key, value in specs.items():
print(f" {key}: {value}")
return variants, scale_model
def training_optimizations(self):
"""Training optimization techniques"""
class TrainingOptimizer:
def __init__(self):
self.techniques = {
"Automatic Mixed Precision": self.setup_amp,
"Exponential Moving Average": self.setup_ema,
"Cosine Learning Rate": self.setup_cosine_lr,
"Early Stopping": self.setup_early_stopping
}
def setup_amp(self):
"""Automatic mixed precision"""
from torch.cuda.amp import GradScaler, autocast
scaler = GradScaler()
            def training_step(model, loss_fn, optimizer, inputs, targets):
                optimizer.zero_grad()
                with autocast():
                    outputs = model(inputs)
                    loss = loss_fn(outputs, targets)
                # Scale the loss so FP16 gradients don't underflow
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
                return loss
return training_step
def setup_ema(self, model, decay=0.9999):
"""Exponential moving average"""
class ModelEMA:
def __init__(self, model, decay=0.9999):
self.ema = {k: v.clone().detach() for k, v in model.state_dict().items()}
self.decay = decay
def update(self, model):
with torch.no_grad():
for k, v in model.state_dict().items():
self.ema[k] = self.ema[k] * self.decay + v * (1 - self.decay)
def apply_shadow(self, model):
model.load_state_dict(self.ema)
return ModelEMA(model, decay)
def setup_cosine_lr(self, optimizer, T_max, eta_min=0):
"""Cosine learning rate scheduling"""
from torch.optim.lr_scheduler import CosineAnnealingLR
scheduler = CosineAnnealingLR(optimizer, T_max=T_max, eta_min=eta_min)
return scheduler
def setup_early_stopping(self, patience=10, min_delta=0.001):
"""Early stopping mechanism"""
class EarlyStopping:
def __init__(self, patience=10, min_delta=0.001):
self.patience = patience
self.min_delta = min_delta
self.counter = 0
self.best_loss = float('inf')
def __call__(self, val_loss):
if val_loss < self.best_loss - self.min_delta:
self.best_loss = val_loss
self.counter = 0
return False
else:
self.counter += 1
return self.counter >= self.patience
return EarlyStopping(patience, min_delta)
return TrainingOptimizer
# Usage example
yolo_v5 = YOLOv5Improvements()
print("YOLO v5 Improvements:")
print("=" * 20)
for category, improvements in yolo_v5.improvements.items():
print(f"\n{category}:")
for improvement, description in improvements.items():
print(f" {improvement}: {description}")
# Focus structure
Focus, focus_advantages = yolo_v5.focus_structure()
print(f"\nFocus structure advantages:")
print("-" * 15)
for advantage, description in focus_advantages.items():
print(f" {advantage}: {description}")
# Model scaling
variants, scale_model = yolo_v5.model_scaling()
# Training optimization
TrainingOptimizer = yolo_v5.training_optimizations()
optimizer = TrainingOptimizer()
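# Sketch: wiring the cosine schedule and EMA together on a toy model
# (nn.Linear stands in for a detector; hyperparameters are illustrative).
toy_model = nn.Linear(10, 2)
toy_opt = torch.optim.SGD(toy_model.parameters(), lr=0.01)
scheduler = optimizer.setup_cosine_lr(toy_opt, T_max=100)
ema = optimizer.setup_ema(toy_model)
for _ in range(3):
    toy_opt.step()
    scheduler.step()
    ema.update(toy_model)
print(f"\nLearning rate after 3 cosine steps: {scheduler.get_last_lr()[0]:.5f}")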
# Test Focus structure
focus_layer = Focus(3, 32)
test_input = torch.randn(1, 3, 640, 640)
with torch.no_grad():
output = focus_layer(test_input)
print(f"\nFocus test:")
print(f" Input: {test_input.shape}")
print(f" Output: {output.shape}")
Chapter Summary
5.5 YOLO Series Evolution Summary
class YOLOEvolutionSummary:
def __init__(self):
self.evolution_timeline = {
"YOLO v2 (2017)": {
"Core Improvements": ["Anchor mechanism", "Batch normalization", "Multi-scale training", "Fine-grained features"],
"Performance": "PASCAL VOC mAP 76.8%",
"Innovation": "Introduced anchor concept to YOLO"
},
"YOLO v3 (2018)": {
"Core Improvements": ["Multi-scale prediction", "Darknet-53", "Feature pyramid", "Binary classification loss"],
"Performance": "COCO mAP 57.9%",
"Innovation": "Multi-scale detection architecture"
},
"YOLO v4 (2020)": {
"Core Improvements": ["CSPDarknet53", "Mosaic augmentation", "CIoU loss", "Many tricks"],
"Performance": "COCO mAP 65.7%",
"Innovation": "Large-scale engineering tricks integration"
},
"YOLO v5 (2020)": {
"Core Improvements": ["Focus structure", "Adaptive anchors", "Model scaling", "Engineering optimization"],
"Performance": "COCO mAP 68.9%",
"Innovation": "Engineering and practicality"
}
}
def performance_comparison(self):
"""Performance comparison"""
comparison = {
"Metrics": ["Accuracy", "Speed", "Model Size", "Usability"],
"YOLO v2": ["Medium", "Fast", "Medium", "Fair"],
"YOLO v3": ["High", "Medium", "Large", "Fair"],
"YOLO v4": ["High", "Fast", "Large", "Good"],
"YOLO v5": ["High", "Fast", "Flexible", "Excellent"]
}
return comparison
def key_innovations(self):
"""Key innovations summary"""
innovations = {
"Network Architecture": {
"v2": "Darknet-19 + Anchors",
"v3": "Darknet-53 + FPN",
"v4": "CSPDarknet53 + SPP + PANet",
"v5": "CSP + Focus + PANet"
},
"Training Tricks": {
"v2": "Multi-scale training",
"v3": "Data augmentation optimization",
"v4": "Mosaic + CutMix + SAT",
"v5": "Adaptive training + AutoML"
},
"Loss Functions": {
"v2": "Improved IoU loss",
"v3": "Binary cross-entropy",
"v4": "CIoU + Focal Loss",
"v5": "Optimized CIoU"
},
"Engineering": {
"v2": "Basic engineering",
"v3": "Modular improvements",
"v4": "Tricks integration",
"v5": "Full engineering"
}
}
return innovations
# Summary display
summary = YOLOEvolutionSummary()
print("YOLO Series Evolution Timeline:")
print("=" * 30)
for version, details in summary.evolution_timeline.items():
print(f"\n{version}:")
for key, value in details.items():
if isinstance(value, list):
print(f" {key}: {', '.join(value)}")
else:
print(f" {key}: {value}")
# Performance comparison
comparison = summary.performance_comparison()
print(f"\nPerformance Comparison:")
print("=" * 15)
metrics = comparison["Metrics"]
for i, metric in enumerate(metrics):
print(f"\n{metric}:")
for version in ["YOLO v2", "YOLO v3", "YOLO v4", "YOLO v5"]:
print(f" {version}: {comparison[version][i]}")
# Key innovations
innovations = summary.key_innovations()
print(f"\nKey Innovations Summary:")
print("=" * 20)
for category, versions in innovations.items():
print(f"\n{category}:")
for version, innovation in versions.items():
print(f" {version}: {innovation}")
5.6 Next Chapter Preview
The next chapter will cover YOLO’s latest versions (v6-v11) and cutting-edge developments, exploring:
- YOLO v6-v8: Latest architecture designs and performance optimization
- YOLO v9-v11: Cutting-edge technologies and future developments
- New Technologies: Transformer, attention mechanisms, neural architecture search
- Application Extensions: Segmentation, pose estimation, 3D detection
Through this chapter, we have traced the evolution from YOLO v2 to v5. Each version made substantial improvements over its predecessor, advancing real-time object detection and providing reference points for later versions and other detection algorithms.