Chapter 6: YOLO Latest Versions (v6-v11) and Cutting-Edge Developments
Learning Objectives
- Understand the latest technical features of YOLO v6-v11
- Master network architecture optimizations in new versions
- Comprehend cutting-edge technologies in modern object detection
- Familiarize with trends in YOLO and Transformer integration
6.1 YOLO v6 (2022)
6.1.1 Industrial-Grade Optimization Design
YOLO v6, developed by the Meituan team, focuses on industrial deployment needs, achieving a better balance between accuracy and inference speed.
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
class YOLOv6Features:
    """Summary of YOLO v6's key design choices and its model family."""

    def __init__(self):
        # Headline architectural/training innovations introduced by YOLO v6.
        self.key_innovations = {
            "Backbone": "EfficientRep - Efficient Reparameterization Design",
            "Neck": "Rep-PAN - Reparameterized Path Aggregation Network",
            "Detection Head": "Efficient Decoupled Head",
            "Training Strategy": "Self-Distillation Training",
            "Anchor Strategy": "Anchor-free + SimOTA Label Assignment",
            "Loss Function": "VFL + DFL + GIoU Loss Combination",
        }
        # Published accuracy/speed/size trade-offs per model size.
        self.model_variants = {
            name: {"mAP": m, "Speed": s, "Params": p}
            for name, (m, s, p) in {
                "YOLOv6-N": (37.5, "1187 FPS", "4.7M"),
                "YOLOv6-T": (41.3, "425 FPS", "15.0M"),
                "YOLOv6-S": (45.0, "373 FPS", "18.5M"),
                "YOLOv6-M": (50.0, "231 FPS", "34.9M"),
                "YOLOv6-L": (52.8, "161 FPS", "59.6M"),
            }.items()
        }
# RepBlock - reparameterizable building block used by the EfficientRep backbone
class RepBlock(nn.Module):
    """Reparameterizable conv block (RepVGG-style).

    Trains with parallel branches (3x3 conv+BN, 1x1 conv+BN and, when
    shapes allow, an identity BN branch) and can be collapsed into a
    single 3x3 convolution for deployment via ``switch_to_deploy``.
    """

    def __init__(self, in_channels, out_channels, stride=1):
        super(RepBlock, self).__init__()
        self.stride = stride
        self.in_channels = in_channels
        self.out_channels = out_channels
        # Identity branch only exists when input/output shapes match.
        if stride == 1 and in_channels == out_channels:
            self.identity = nn.BatchNorm2d(in_channels)
        else:
            self.identity = None
        self.conv_3x3 = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, 3, stride, 1, bias=False),
            nn.BatchNorm2d(out_channels)
        )
        self.conv_1x1 = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, 1, stride, 0, bias=False),
            nn.BatchNorm2d(out_channels)
        )
        self.activation = nn.ReLU(inplace=True)
        # Populated by switch_to_deploy().
        self.deploy = False
        self.rep_conv = None

    def forward(self, x):
        if self.deploy:
            return self.activation(self.rep_conv(x))
        # Multi-branch path during training.
        out = self.conv_3x3(x) + self.conv_1x1(x)
        if self.identity is not None:
            out += self.identity(x)
        return self.activation(out)

    def switch_to_deploy(self):
        """Fuse the training branches into a single 3x3 conv, in place."""
        if self.deploy:
            return
        kernel, bias = self._get_equivalent_kernel_bias()
        self.rep_conv = nn.Conv2d(
            self.in_channels, self.out_channels, 3, self.stride, 1, bias=True
        )
        self.rep_conv.weight.data = kernel
        self.rep_conv.bias.data = bias
        # Drop the training-time branches; only rep_conv remains.
        self.__delattr__('conv_3x3')
        self.__delattr__('conv_1x1')
        if hasattr(self, 'identity'):
            self.__delattr__('identity')
        self.deploy = True

    def _get_equivalent_kernel_bias(self):
        """Return the (kernel, bias) of the fused 3x3 convolution."""
        kernel_3x3, bias_3x3 = self._fuse_bn_tensor(self.conv_3x3)
        kernel_1x1, bias_1x1 = self._fuse_bn_tensor(self.conv_1x1)
        # Zero-pad the 1x1 kernel to 3x3 so the branches can be summed.
        kernel_1x1 = F.pad(kernel_1x1, [1, 1, 1, 1])
        kernel_id, bias_id = 0, 0
        if self.identity is not None:
            kernel_id, bias_id = self._fuse_bn_tensor(self.identity)
            # BUG FIX: pad the BN-fused identity kernel to 3x3 instead of
            # overwriting it with a raw (unscaled) identity kernel, which
            # discarded the BN statistics and broke train/deploy equivalence.
            kernel_id = F.pad(kernel_id, [1, 1, 1, 1])
        return kernel_3x3 + kernel_1x1 + kernel_id, bias_3x3 + bias_1x1 + bias_id

    def _fuse_bn_tensor(self, branch):
        """Fold a BatchNorm (optionally preceded by a conv) into (kernel, bias)."""
        if isinstance(branch, nn.Sequential):
            kernel = branch[0].weight
            running_mean = branch[1].running_mean
            running_var = branch[1].running_var
            gamma = branch[1].weight
            beta = branch[1].bias
            eps = branch[1].eps
        else:  # BatchNorm-only identity branch: start from a 1x1 identity kernel.
            kernel = torch.eye(self.in_channels).view(self.in_channels, self.in_channels, 1, 1)
            running_mean = branch.running_mean
            running_var = branch.running_var
            gamma = branch.weight
            beta = branch.bias
            eps = branch.eps
        std = (running_var + eps).sqrt()
        t = (gamma / std).reshape(-1, 1, 1, 1)
        return kernel * t, beta - running_mean * gamma / std
# EfficientRep Backbone
class EfficientRep(nn.Module):
    """EfficientRep backbone: a RepBlock stem plus four RepBlock stages.

    ``forward`` returns the feature maps of the last three stages, which
    feed the FPN-style neck.
    """

    def __init__(self, channels_list=[64, 128, 256, 512, 1024], num_repeats=[1, 6, 12, 18, 6]):
        super(EfficientRep, self).__init__()
        stem_out = channels_list[0]
        # Stem: one stride-2 RepBlock followed by two stride-1 refinements.
        self.stem = nn.Sequential(
            RepBlock(3, stem_out // 2, 2),
            RepBlock(stem_out // 2, stem_out // 2, 1),
            RepBlock(stem_out // 2, stem_out, 1)
        )
        self.stages = nn.ModuleList()
        prev = stem_out
        for width, depth in zip(channels_list[1:], num_repeats[1:]):
            # Each stage: stride-2 downsampling then `depth` stride-1 blocks.
            blocks = [RepBlock(prev, width, 2)]
            blocks.extend(RepBlock(width, width, 1) for _ in range(depth))
            self.stages.append(nn.Sequential(*blocks))
            prev = width

    def forward(self, x):
        pyramid = []
        x = self.stem(x)
        for stage in self.stages:
            x = stage(x)
            pyramid.append(x)
        # Only the three deepest feature maps are consumed by the neck.
        return pyramid[-3:]
# SimOTA Label Assignment
class SimOTA:
    """SimOTA dynamic label assignment (introduced by YOLOX, used in YOLO v6).

    Each ground-truth box receives a dynamically chosen number k of positive
    anchors, where k is derived from the IoU quality of its best candidates.
    """

    def __init__(self, center_radius=2.5, candidate_topk=10):
        self.center_radius = center_radius    # center-prior radius
        self.candidate_topk = candidate_topk  # candidates considered per GT

    def assign(self, pred_scores, pred_bboxes, gt_bboxes, gt_labels):
        """
        Dynamic label assignment.

        pred_scores: (num_anchors, num_classes)
        pred_bboxes: (num_anchors, 4) xyxy
        gt_bboxes:   (num_gt, 4) xyxy
        gt_labels:   (num_gt,)

        Returns:
            matched_gt_inds: (num_anchors,) matched GT index, -1 for negatives
            matched_labels:  (num_anchors, num_gt) one-hot match matrix
        """
        num_gt = gt_bboxes.size(0)
        num_anchors = pred_scores.size(0)
        if num_gt == 0:
            # BUG FIX: no GT means every anchor is negative; use -1 to stay
            # consistent with the "unmatched" convention of the matcher below
            # (previously this path returned 0, i.e. "matched to GT 0").
            return torch.full((num_anchors,), -1, dtype=torch.long), \
                   torch.zeros(num_anchors, num_gt, dtype=torch.float)
        # 1. Geometric constraint (center prior)
        is_in_centers = self._get_in_centers_info(pred_bboxes, gt_bboxes)
        # 2. Pairwise IoU, reused for the cost matrix and dynamic-k estimation
        ious = self._compute_iou(pred_bboxes[:, None, :], gt_bboxes[None, :, :])
        # 3. Cost matrix
        cost_matrix = self._compute_cost_matrix(pred_scores, ious, gt_labels, is_in_centers)
        # 4. Dynamic k per GT (sum of top-k candidate IoUs)
        dynamic_ks = self._get_dynamic_k(ious, num_gt)
        # 5. Matching
        matched_gt_inds, matched_labels = self._dynamic_k_matching(
            cost_matrix, dynamic_ks, num_gt
        )
        return matched_gt_inds, matched_labels

    def _get_in_centers_info(self, anchors, gt_bboxes):
        """Boolean (num_anchors, num_gt) mask: anchor center near GT center."""
        anchor_centers = (anchors[:, :2] + anchors[:, 2:]) / 2
        gt_centers = (gt_bboxes[:, :2] + gt_bboxes[:, 2:]) / 2
        distances = torch.cdist(anchor_centers, gt_centers)
        return distances < self.center_radius

    def _compute_cost_matrix(self, pred_scores, ious, gt_labels, is_in_centers):
        """Combine classification and regression costs; mask out-of-center pairs."""
        cls_cost = -pred_scores[:, gt_labels]  # (num_anchors, num_gt)
        reg_cost = -ious                       # higher IoU -> lower cost
        cost_matrix = cls_cost + 3.0 * reg_cost
        # Anchors outside the center prior receive a prohibitive cost.
        cost_matrix = cost_matrix * is_in_centers.float() + \
                      1e8 * (~is_in_centers).float()
        return cost_matrix

    def _compute_iou(self, boxes1, boxes2):
        """Broadcasted pairwise IoU for xyxy boxes."""
        lt = torch.max(boxes1[..., :2], boxes2[..., :2])
        rb = torch.min(boxes1[..., 2:], boxes2[..., 2:])
        wh = (rb - lt).clamp(min=0)
        intersection = wh[..., 0] * wh[..., 1]
        area1 = (boxes1[..., 2] - boxes1[..., 0]) * (boxes1[..., 3] - boxes1[..., 1])
        area2 = (boxes2[..., 2] - boxes2[..., 0]) * (boxes2[..., 3] - boxes2[..., 1])
        union = area1 + area2 - intersection
        return intersection / union.clamp(min=1e-8)

    def _get_dynamic_k(self, ious, num_gt):
        """k for each GT = clamp(sum of its top-k candidate IoUs, min=1).

        BUG FIX: use the top-k IoUs of the real predictions. The previous
        version measured IoU against all-zero dummy boxes, which always
        yielded 0 and collapsed every k to 1.
        """
        dynamic_ks = []
        num_candidates = min(self.candidate_topk, ious.size(0))
        for gt_idx in range(num_gt):
            topk_ious, _ = torch.topk(ious[:, gt_idx], k=num_candidates)
            dynamic_ks.append(max(1, int(topk_ious.sum().item())))
        return dynamic_ks

    def _dynamic_k_matching(self, cost_matrix, dynamic_ks, num_gt):
        """Assign each GT its k lowest-cost anchors; unmatched anchors get -1."""
        num_anchors = cost_matrix.size(0)
        matched_gt_inds = torch.zeros(num_anchors, dtype=torch.long) - 1
        matched_labels = torch.zeros(num_anchors, num_gt, dtype=torch.float)
        for gt_idx in range(num_gt):
            k = min(dynamic_ks[gt_idx], num_anchors)
            _, topk_indices = torch.topk(
                cost_matrix[:, gt_idx], k=k, largest=False
            )
            matched_gt_inds[topk_indices] = gt_idx
            matched_labels[topk_indices, gt_idx] = 1.0
        return matched_gt_inds, matched_labels
# Self-Distillation Training
class SelfDistillation:
    """Self-distillation: a frozen teacher model guides student training."""

    def __init__(self, teacher_model, student_model, temperature=4.0, alpha=0.7):
        self.teacher_model = teacher_model
        self.student_model = student_model
        self.temperature = temperature  # softening temperature for logits
        self.alpha = alpha              # weight of task loss vs. KD loss
        # The teacher only provides targets; never update its weights.
        for param in self.teacher_model.parameters():
            param.requires_grad = False

    def compute_distillation_loss(self, student_outputs, teacher_outputs, targets):
        """Return (total, task, kd) losses for one batch."""
        task_loss = self._compute_task_loss(student_outputs, targets)
        kd_loss = self._compute_kd_loss(student_outputs, teacher_outputs)
        total_loss = self.alpha * task_loss + (1 - self.alpha) * kd_loss
        return total_loss, task_loss, kd_loss

    def _compute_task_loss(self, outputs, targets):
        """Original task loss (simplified to MSE for illustration)."""
        return F.mse_loss(outputs, targets)

    def _compute_kd_loss(self, student_outputs, teacher_outputs):
        """Temperature-scaled KL divergence between softened predictions.

        BUG FIX: use log_softmax for the student instead of softmax().log(),
        which is numerically unstable (log of probabilities that can
        underflow to exactly 0 produces -inf/NaN losses).
        """
        student_log_soft = F.log_softmax(student_outputs / self.temperature, dim=-1)
        teacher_soft = F.softmax(teacher_outputs / self.temperature, dim=-1)
        # F.kl_div expects log-probabilities for its first argument.
        kd_loss = F.kl_div(
            student_log_soft, teacher_soft, reduction='batchmean'
        ) * (self.temperature ** 2)
        return kd_loss
# Demonstrate YOLOv6 features
def demonstrate_yolov6_features():
    """Print YOLOv6's innovations, the variant table, and a reparameterization demo."""
    print("YOLOv6 Key Features:")
    features = YOLOv6Features()
    print("\nCore Innovations:")
    for innovation, description in features.key_innovations.items():
        print(f" {innovation}: {description}")
    print(f"\nModel Variants Performance:")
    separator = "-" * 60
    print(separator)
    print(f"{'Model':<12}{'mAP':<8}{'Speed':<12}{'Params':<10}")
    print(separator)
    for model, specs in features.model_variants.items():
        print(f"{model:<12}{specs['mAP']:<8}{specs['Speed']:<12}{specs['Params']:<10}")
    # Show the training-time and deploy-time outputs side by side.
    print(f"\nReparameterization Demonstration:")
    rep_block = RepBlock(64, 64, 1)
    x = torch.randn(1, 64, 32, 32)
    train_output = rep_block(x)
    print(f"Training mode output shape: {train_output.shape}")
    rep_block.switch_to_deploy()
    deploy_output = rep_block(x)
    print(f"Deployment mode output shape: {deploy_output.shape}")
    print(f"Output difference: {torch.mean(torch.abs(train_output - deploy_output)):.6f}")


# Run demonstration
demonstrate_yolov6_features()
6.2 YOLO v7 (2022)
6.2.1 Trainable Bag-of-Freebies
YOLO v7 introduces trainable bag-of-freebies, further improving model performance.
class YOLOv7Innovations:
    """Catalog of YOLO v7's contributions and reported benchmark numbers."""

    def __init__(self):
        # Grouped by the aspect of the pipeline each innovation touches.
        self.innovations = {
            "Architecture Design": [
                "Extended Efficient Layer Aggregation Networks (E-ELAN)",
                "Model Scaling for Concatenation-based Models",
                "Planned Re-parameterized Convolution",
            ],
            "Training Optimization": [
                "Trainable Bag-of-Freebies",
                "Label Assignment Optimization",
                "Auxiliary Head Training Strategy",
            ],
            "Performance Improvements": [
                "Better speed-accuracy balance",
                "More stable training process",
                "Stronger generalization capability",
            ],
        }
        # COCO accuracy vs. throughput for the published model sizes.
        self.performance = {
            "YOLOv7": {"mAP": 51.4, "FPS": 161, "Params": "36.9M"},
            "YOLOv7-X": {"mAP": 53.1, "FPS": 114, "Params": "71.3M"},
            "YOLOv7-W6": {"mAP": 54.9, "FPS": 84, "Params": "70.8M"},
            "YOLOv7-E6": {"mAP": 56.0, "FPS": 56, "Params": "97.2M"},
        }
# E-ELAN Module
class ELAN(nn.Module):
    """Extended Efficient Layer Aggregation Network block.

    Two 1x1 projections split the input; a chain of double-3x3 conv blocks
    progressively transforms one split, and every intermediate feature is
    concatenated before a final 1x1 fusion.
    """

    def __init__(self, in_channels, out_channels, num_blocks=4, expand_ratio=0.5):
        super(ELAN, self).__init__()
        hidden = int(out_channels * expand_ratio)
        self.conv1 = nn.Conv2d(in_channels, hidden, 1, bias=False)
        self.conv2 = nn.Conv2d(in_channels, hidden, 1, bias=False)

        # Each block: two 3x3 conv+BN+SiLU stages at constant width.
        def double_conv():
            return nn.Sequential(
                nn.Conv2d(hidden, hidden, 3, padding=1, bias=False),
                nn.BatchNorm2d(hidden),
                nn.SiLU(inplace=True),
                nn.Conv2d(hidden, hidden, 3, padding=1, bias=False),
                nn.BatchNorm2d(hidden),
                nn.SiLU(inplace=True),
            )

        self.blocks = nn.ModuleList(double_conv() for _ in range(num_blocks))
        # 2 splits + one output per block are concatenated before fusion.
        self.conv_final = nn.Conv2d(hidden * (2 + num_blocks), out_channels, 1, bias=False)
        self.bn_final = nn.BatchNorm2d(out_channels)
        self.act_final = nn.SiLU(inplace=True)

    def forward(self, x):
        left = self.conv1(x)
        right = self.conv2(x)
        collected = [left, right]
        # Each block consumes the previous block's output.
        for block in self.blocks:
            collected.append(block(collected[-1]))
        fused = torch.cat(collected, dim=1)
        return self.act_final(self.bn_final(self.conv_final(fused)))
# Trainable Bag-of-Freebies
class TrainableBagOfFreebies(nn.Module):
    """Trainable bag-of-freebies: learnable weights for label assignment,
    loss balancing and NMS thresholds."""

    def __init__(self, num_classes=80):
        super(TrainableBagOfFreebies, self).__init__()
        self.num_classes = num_classes
        # Learnable importance of cls/obj/box/iou terms in label assignment.
        self.label_assignment_weights = nn.Parameter(torch.ones(4))
        # Learnable balance of cls/box/obj loss terms.
        self.loss_weights = nn.Parameter(torch.tensor([1.0, 1.0, 1.0]))
        # Learnable post-processing thresholds (squashed by sigmoid when used).
        self.nms_conf_threshold = nn.Parameter(torch.tensor(0.25))
        self.nms_iou_threshold = nn.Parameter(torch.tensor(0.45))

    def adaptive_label_assignment(self, pred_cls, pred_box, pred_obj, targets):
        """Weight the four cost terms with learned (softmax-normalized) weights."""
        cls_w, obj_w, box_w, iou_w = F.softmax(self.label_assignment_weights, dim=0)
        total_cost = (
            self._compute_classification_cost(pred_cls, targets) * cls_w
            + self._compute_box_cost(pred_box, targets) * box_w
            + self._compute_objectness_cost(pred_obj, targets) * obj_w
            + self._compute_iou_cost(pred_box, targets) * iou_w
        )
        return self._hungarian_matching(total_cost)

    def adaptive_loss_weighting(self, cls_loss, box_loss, obj_loss):
        """Combine the three losses with learned softmax-normalized weights."""
        w = F.softmax(self.loss_weights, dim=0)
        return w[0] * cls_loss + w[1] * box_loss + w[2] * obj_loss

    def learnable_nms(self, predictions):
        """Run NMS with learned confidence/IoU thresholds."""
        return self._apply_nms(
            predictions,
            torch.sigmoid(self.nms_conf_threshold),
            torch.sigmoid(self.nms_iou_threshold),
        )

    def _compute_classification_cost(self, pred_cls, targets):
        """Per-sample cross-entropy classification cost (simplified)."""
        return F.cross_entropy(pred_cls, targets['labels'], reduction='none')

    def _compute_box_cost(self, pred_box, targets):
        """Per-sample L1 box cost (simplified)."""
        return F.l1_loss(pred_box, targets['boxes'], reduction='none').sum(-1)

    def _compute_objectness_cost(self, pred_obj, targets):
        """Per-sample objectness BCE cost (simplified)."""
        return F.binary_cross_entropy_with_logits(pred_obj, targets['objectness'], reduction='none')

    def _compute_iou_cost(self, pred_box, targets):
        """1 - IoU cost (simplified)."""
        return 1 - self._compute_iou(pred_box, targets['boxes'])

    def _hungarian_matching(self, cost_matrix):
        """Greedy stand-in for Hungarian matching (simplified)."""
        return torch.argmin(cost_matrix, dim=-1)

    def _apply_nms(self, predictions, conf_thresh, iou_thresh):
        """NMS stub (simplified: passes predictions through unchanged)."""
        return predictions

    def _compute_iou(self, boxes1, boxes2):
        """IoU stub (simplified: random values)."""
        return torch.rand(boxes1.size(0))
# Auxiliary Head Training
class AuxiliaryHead(nn.Module):
    """Auxiliary detection head used for deep supervision during training."""

    def __init__(self, in_channels, num_classes=80):
        super(AuxiliaryHead, self).__init__()
        self.num_classes = num_classes
        # Halve channels, then predict 3 anchors x (4 box + 1 obj + classes).
        mid_channels = in_channels // 2
        self.conv = nn.Sequential(
            nn.Conv2d(in_channels, mid_channels, 3, padding=1),
            nn.BatchNorm2d(mid_channels),
            nn.SiLU(inplace=True),
            nn.Conv2d(mid_channels, 3 * (5 + num_classes), 1),
        )

    def forward(self, x):
        return self.conv(x)
class YOLOv7(nn.Module):
    """YOLO v7 network (simplified illustrative implementation).

    During training an auxiliary head provides extra supervision; its loss
    is added to the main loss with a 0.4 weight.
    """

    def __init__(self, num_classes=80):
        super(YOLOv7, self).__init__()
        self.num_classes = num_classes
        # Backbone using E-ELAN
        self.backbone = self._build_backbone()
        # Neck network
        self.neck = self._build_neck()
        # Main detection head
        self.head = self._build_head()
        # BUG FIX: the backbone/neck output 1024 channels (final ELAN stage),
        # but the auxiliary head was built for 512 input channels, which
        # raised a shape error on the first training forward pass.
        self.aux_head = AuxiliaryHead(1024, num_classes)
        # Trainable bag-of-freebies
        self.bag_of_freebies = TrainableBagOfFreebies(num_classes)

    def _build_backbone(self):
        """Stacked E-ELAN stages with stride-2 convolutions between them."""
        return nn.Sequential(
            # Stem
            nn.Conv2d(3, 32, 3, stride=1, padding=1),
            nn.BatchNorm2d(32),
            nn.SiLU(inplace=True),
            # Stage 1
            ELAN(32, 64, num_blocks=2),
            nn.Conv2d(64, 128, 3, stride=2, padding=1),
            # Stage 2
            ELAN(128, 256, num_blocks=4),
            nn.Conv2d(256, 512, 3, stride=2, padding=1),
            # Stage 3
            ELAN(512, 1024, num_blocks=6),
        )

    def _build_neck(self):
        """Neck (simplified to identity; a real model uses FPN/PAN here)."""
        return nn.Identity()

    def _build_head(self):
        """Main detection head: 3 anchors x (5 + num_classes) channels."""
        return nn.Conv2d(1024, 3 * (5 + self.num_classes), 1)

    def forward(self, x, targets=None):
        """Return predictions; during training with targets, also the loss."""
        backbone_features = self.backbone(x)
        neck_features = self.neck(backbone_features)
        main_output = self.head(neck_features)
        if self.training and targets is not None:
            aux_output = self.aux_head(neck_features)
            main_loss = self._compute_loss(main_output, targets, is_main=True)
            aux_loss = self._compute_loss(aux_output, targets, is_main=False)
            # Auxiliary supervision is down-weighted relative to the main head.
            return main_output, main_loss + 0.4 * aux_loss
        else:
            return main_output

    def _compute_loss(self, predictions, targets, is_main=True):
        """Loss computation (simplified placeholder values)."""
        if is_main:
            # The main head's loss goes through the learnable loss weighting.
            return self.bag_of_freebies.adaptive_loss_weighting(
                torch.tensor(1.0), torch.tensor(1.0), torch.tensor(1.0)
            )
        else:
            return torch.tensor(1.0)
6.3 YOLO v8 (2023)
6.3.1 Unified Architecture Design
YOLO v8 adopts a unified architecture supporting multiple tasks such as detection, segmentation, and classification.
class YOLOv8Features:
    """Summary of YOLO v8's multi-task support and main improvements."""

    def __init__(self):
        # One backbone/neck serves all of these task-specific heads.
        self.unified_architecture = {
            "Detection": "Object Detection",
            "Segmentation": "Instance Segmentation",
            "Classification": "Image Classification",
            "Pose Estimation": "Keypoint Detection",
        }
        # Main changes relative to earlier YOLO generations.
        self.key_improvements = {
            "Architecture": "C2f Module + Anchor-free Design",
            "Loss Function": "VFL + DFL + CIoU Loss",
            "Data Augmentation": "Mosaic + MixUp + CopyPaste",
            "Label Assignment": "Task-Aligned Assigner (TAL)",
            "Optimizer": "AdamW + Cosine Annealing",
        }
# C2f Module - CSP Bottleneck with 2 Convolutions
class C2f(nn.Module):
    """C2f: lightweight CSP-style block with two convolutions.

    The input is projected and split in two; a chain of bottlenecks extends
    the second half, and all intermediate features are concatenated before
    the output projection.
    """

    def __init__(self, in_channels, out_channels, num_bottlenecks=1, shortcut=False, expansion=0.5):
        super(C2f, self).__init__()
        hidden = int(out_channels * expansion)
        self.conv1 = nn.Conv2d(in_channels, 2 * hidden, 1, bias=False)
        # (2 splits + one feature per bottleneck) are concatenated below.
        self.conv2 = nn.Conv2d((2 + num_bottlenecks) * hidden, out_channels, 1, bias=False)
        self.bottlenecks = nn.ModuleList(
            Bottleneck(hidden, hidden, shortcut, groups=1, expansion=1.0)
            for _ in range(num_bottlenecks)
        )

    def forward(self, x):
        parts = list(self.conv1(x).chunk(2, dim=1))
        # Each bottleneck consumes the most recent feature.
        for bottleneck in self.bottlenecks:
            parts.append(bottleneck(parts[-1]))
        return self.conv2(torch.cat(parts, dim=1))
# Task-Aligned Assigner
class TaskAlignedAssigner:
    """Task-Aligned label assigner (TAL), as used by YOLO v8.

    Positives are anchors whose alignment metric
    (cls_score^alpha * IoU^beta) exceeds a per-GT dynamic threshold derived
    from that GT's top-k candidates.
    """

    def __init__(self, topk=13, num_classes=80, alpha=1.0, beta=6.0):
        self.topk = topk
        self.num_classes = num_classes
        self.alpha = alpha  # exponent on the classification score
        self.beta = beta    # exponent on the IoU

    def assign(self, pred_scores, pred_bboxes, anchor_points, gt_bboxes, gt_labels):
        """Return per-anchor (labels, scores, bboxes) assignments."""
        num_anchors = len(anchor_points)
        num_gt = len(gt_bboxes)
        if num_gt == 0:
            # Nothing to match: all-background assignment.
            return torch.zeros(num_anchors, dtype=torch.long), \
                   torch.zeros(num_anchors), \
                   torch.zeros(num_anchors, 4)
        # Alignment quality of every (anchor, gt) pair.
        metrics = self._compute_alignment_metrics(
            pred_scores, pred_bboxes, gt_bboxes, gt_labels
        )
        # Per-GT dynamic threshold: mean metric of its top-k candidates.
        topk_metrics, _ = torch.topk(metrics, k=min(self.topk, num_anchors), dim=0)
        thresholds = topk_metrics.mean(dim=0, keepdim=True)
        positive_mask = metrics > thresholds
        assigned_labels = torch.zeros(num_anchors, dtype=torch.long)
        assigned_bboxes = torch.zeros(num_anchors, 4)
        assigned_scores = torch.zeros(num_anchors)
        # Later GTs overwrite earlier ones for anchors positive to several.
        for gt_idx in range(num_gt):
            hits = positive_mask[:, gt_idx].nonzero().squeeze(-1)
            if len(hits) > 0:
                assigned_labels[hits] = gt_labels[gt_idx]
                assigned_bboxes[hits] = gt_bboxes[gt_idx]
                assigned_scores[hits] = metrics[hits, gt_idx]
        return assigned_labels, assigned_scores, assigned_bboxes

    def _compute_alignment_metrics(self, pred_scores, pred_bboxes, gt_bboxes, gt_labels):
        """(num_anchors, num_gt) matrix of cls^alpha * IoU^beta."""
        num_anchors = pred_scores.size(0)
        # Score each anchor gives to each GT's class.
        cls_scores = pred_scores[torch.arange(num_anchors)[:, None], gt_labels[None, :]]
        iou_scores = self._compute_iou_matrix(pred_bboxes, gt_bboxes)
        return cls_scores.pow(self.alpha) * iou_scores.pow(self.beta)

    def _compute_iou_matrix(self, boxes1, boxes2):
        """Pairwise IoU between two xyxy box sets via broadcasting."""
        a = boxes1[:, None, :]  # (num_boxes1, 1, 4)
        b = boxes2[None, :, :]  # (1, num_boxes2, 4)
        lt = torch.max(a[..., :2], b[..., :2])
        rb = torch.min(a[..., 2:], b[..., 2:])
        wh = (rb - lt).clamp(min=0)
        inter = wh[..., 0] * wh[..., 1]
        area_a = (a[..., 2] - a[..., 0]) * (a[..., 3] - a[..., 1])
        area_b = (b[..., 2] - b[..., 0]) * (b[..., 3] - b[..., 1])
        union = area_a + area_b - inter
        return inter / union.clamp(min=1e-8)
# Distribution Focal Loss
class DistributionFocalLoss(nn.Module):
    """Distribution Focal Loss for distributional bounding-box regression.

    Box sides are regressed as discrete distributions over reg_max+1 bins;
    the loss is a target-probability-weighted cross-entropy against the
    target distribution's mode.
    """

    def __init__(self, reg_max=16):
        super(DistributionFocalLoss, self).__init__()
        self.reg_max = reg_max  # number of bins minus one per box side

    def forward(self, pred_dist, target_dist):
        """
        pred_dist:   (N, 4, reg_max+1) predicted logits per side
        target_dist: (N, 4, reg_max+1) target distribution per side
        """
        # Focal weight: probability mass the target puts on its own mode.
        mode = target_dist.argmax(dim=-1, keepdim=True)
        weight = target_dist.gather(dim=-1, index=mode).squeeze(-1)
        # Cross-entropy of predictions against the target's mode bin.
        ce = F.cross_entropy(
            pred_dist.view(-1, self.reg_max + 1),
            target_dist.view(-1, self.reg_max + 1).argmax(-1),
            reduction='none',
        )
        ce = ce.view(pred_dist.shape[:-1])  # back to (N, 4)
        return (ce * weight.pow(2)).mean()
# Unified YOLOv8 Architecture
class YOLOv8(nn.Module):
    """YOLOv8 unified architecture with task-specific heads.

    The same backbone/neck serves detection, segmentation, classification
    and pose estimation; only the head differs per task.
    """

    def __init__(self, num_classes=80, task='detect', depth_multiple=1.0, width_multiple=1.0):
        super(YOLOv8, self).__init__()
        self.num_classes = num_classes
        self.task = task
        self.backbone = self._build_backbone(depth_multiple, width_multiple)
        self.neck = self._build_neck(width_multiple)
        # Head selection by task name (no head is set for unknown tasks).
        head_builders = {
            'detect': self._build_detect_head,
            'segment': self._build_segment_head,
            'classify': self._build_classify_head,
            'pose': self._build_pose_head,
        }
        if task in head_builders:
            self.head = head_builders[task](width_multiple)

    def _build_backbone(self, depth_multiple, width_multiple):
        """Stem + four (downsample conv, C2f) stages, scaled by the multiples."""
        def widen(x, divisor=8):
            # Round scaled widths up to a multiple of 8 channels.
            return int(math.ceil(x / divisor) * divisor)

        def deepen(n):
            return max(round(n * depth_multiple), 1)

        widths = [widen(c * width_multiple) for c in (64, 128, 256, 512, 1024)]
        layers = [
            # Stem
            nn.Conv2d(3, widths[0], 3, stride=2, padding=1),
            nn.BatchNorm2d(widths[0]),
            nn.SiLU(inplace=True),
        ]
        # Four stages with base C2f repeat counts (3, 6, 6, 3).
        for prev, width, base_depth in zip(widths[:-1], widths[1:], (3, 6, 6, 3)):
            layers.append(nn.Conv2d(prev, width, 3, stride=2, padding=1))
            layers.append(C2f(width, width, deepen(base_depth), True))
        return nn.Sequential(*layers)

    def _build_neck(self, width_multiple):
        """Neck (simplified to identity; a real model uses FPN + PAN)."""
        return nn.Identity()

    def _build_detect_head(self, width_multiple):
        """Detection head: 3 anchors x (4 box + num_classes) channels."""
        return nn.Conv2d(int(1024 * width_multiple),
                         3 * (4 + self.num_classes), 1)

    def _build_segment_head(self, width_multiple):
        """Segmentation head: conv stack producing per-class masks."""
        return nn.Sequential(
            nn.Conv2d(int(1024 * width_multiple), 256, 3, padding=1),
            nn.BatchNorm2d(256),
            nn.SiLU(inplace=True),
            nn.Conv2d(256, self.num_classes, 1),
        )

    def _build_classify_head(self, width_multiple):
        """Classification head: global pool + linear classifier."""
        return nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Flatten(),
            nn.Linear(int(1024 * width_multiple), self.num_classes),
        )

    def _build_pose_head(self, width_multiple):
        """Pose head: 17 keypoints x (x, y, visibility) channels."""
        return nn.Conv2d(int(1024 * width_multiple), 17 * 3, 1)

    def forward(self, x):
        """Backbone -> neck -> task head."""
        return self.head(self.neck(self.backbone(x)))
6.4 YOLO v9-v11 Latest Developments
6.4.1 Cutting-Edge Technology Integration
class YOLOLatestVersions:
    """Quick-reference summary of YOLO v9, v10 and v11."""

    def __init__(self):
        # Core idea, headline features and benefit per release.
        self.versions_summary = {
            "YOLOv9 (2024)": {
                "Core Innovation": "Programmable Gradient Information (PGI)",
                "Key Features": ["Programmable Gradients", "GELAN Architecture", "Auxiliary Branch Training"],
                "Performance Improvement": "Better information flow and gradient propagation",
            },
            "YOLOv10 (2024)": {
                "Core Innovation": "NMS-free Training",
                "Key Features": ["Consistent Dual Assignment", "Holistic Feature Fusion", "Large Kernel Convolution"],
                "Performance Improvement": "Eliminate post-processing dependency, end-to-end optimization",
            },
            "YOLOv11 (2024)": {
                "Core Innovation": "Deep Attention Mechanism Integration",
                "Key Features": ["C3k2 Module", "C2PSA Attention", "Improved Detection Head"],
                "Performance Improvement": "Stronger feature representation and attention mechanisms",
            },
        }
# YOLOv9 PGI Mechanism
class ProgrammableGradientInformation(nn.Module):
    """Programmable Gradient Information (YOLO v9).

    Auxiliary branches re-encode each stage's features; their outputs are
    fused back into the main path via 1x1 convolutions so reliable gradient
    information reaches the main branch during training.
    """

    def __init__(self, channels_list):
        super(ProgrammableGradientInformation, self).__init__()
        self.channels_list = channels_list
        # One auxiliary bottleneck branch per feature level.
        self.aux_branches = nn.ModuleList(
            self._make_aux_branch(ch) for ch in channels_list
        )
        self.main_branch = self._make_main_branch()
        # 1x1 convs fusing (feature + aux output) at each level.
        self.info_fusion = nn.ModuleList(
            nn.Conv2d(ch, ch, 1) for ch in channels_list
        )

    def _make_aux_branch(self, channels):
        """Bottleneck (squeeze to half, expand back) with BN + SiLU."""
        half = channels // 2
        return nn.Sequential(
            nn.Conv2d(channels, half, 1),
            nn.BatchNorm2d(half),
            nn.SiLU(inplace=True),
            nn.Conv2d(half, channels, 3, padding=1),
            nn.BatchNorm2d(channels),
            nn.SiLU(inplace=True),
        )

    def _make_main_branch(self):
        """Main branch (simplified to identity)."""
        return nn.Identity()

    def forward(self, features):
        """features: list of per-stage maps. Returns (fused features, aux outputs)."""
        aux_outputs = []
        main_features = []
        for fusion, branch, feature in zip(self.info_fusion, self.aux_branches, features):
            aux_out = branch(feature)
            aux_outputs.append(aux_out)
            main_features.append(fusion(feature + aux_out))
        return main_features, aux_outputs
# YOLOv10 NMS-free Design
class NMSFreeHead(nn.Module):
    """NMS-free detection head (YOLO v10 consistent dual assignment).

    A one-to-many head is trained alongside a one-to-one head; only the
    one-to-one head runs at inference, removing the NMS post-processing step.
    """

    def __init__(self, num_classes, in_channels):
        super(NMSFreeHead, self).__init__()
        self.num_classes = num_classes
        self.one2one_head = nn.Conv2d(in_channels, 4 + num_classes, 1)
        self.one2many_head = nn.Conv2d(in_channels, 4 + num_classes, 1)

    def forward(self, x):
        if not self.training:
            # Inference: the one-to-one head alone yields unique predictions.
            return self.one2one_head(x)
        return self.one2one_head(x), self.one2many_head(x)
# YOLOv11 C2PSA Attention Module
class C2PSA(nn.Module):
    """C2f-style split block whose second branch passes through
    position-sensitive attention (YOLO v11)."""

    def __init__(self, in_channels, out_channels, num_heads=8, expansion=0.5):
        super(C2PSA, self).__init__()
        hidden = int(out_channels * expansion)
        self.conv1 = nn.Conv2d(in_channels, 2 * hidden, 1)
        self.conv2 = nn.Conv2d(2 * hidden, out_channels, 1)
        # Attention is applied only to the second half of the split.
        self.psa = PositionSensitiveAttention(hidden, num_heads)

    def forward(self, x):
        plain, attended = self.conv1(x).chunk(2, dim=1)
        attended = self.psa(attended)
        return self.conv2(torch.cat([plain, attended], dim=1))
class PositionSensitiveAttention(nn.Module):
    """Multi-head self-attention over spatial positions, with a depthwise
    conv providing a learned positional encoding for queries and keys."""

    def __init__(self, channels, num_heads=8):
        super(PositionSensitiveAttention, self).__init__()
        self.channels = channels
        self.num_heads = num_heads
        self.head_dim = channels // num_heads
        # Single projection producing Q, K and V stacked along channels.
        self.qkv = nn.Conv2d(channels, channels * 3, 1, bias=False)
        # Depthwise 3x3 conv acts as a learned positional encoding.
        self.pos_embed = nn.Conv2d(channels, channels, 3, padding=1, groups=channels)
        self.proj = nn.Conv2d(channels, channels, 1)
        self.scale = self.head_dim ** -0.5

    def forward(self, x):
        batch, ch, height, width = x.shape
        q, k, v = self.qkv(x).chunk(3, dim=1)
        # Inject positional information into queries and keys only.
        pos = self.pos_embed(x)
        q = q + pos
        k = k + pos
        hw = height * width
        # Reshape to (batch, heads, positions, head_dim) / transposed variants.
        q = q.view(batch, self.num_heads, self.head_dim, hw).transpose(-2, -1)
        k = k.view(batch, self.num_heads, self.head_dim, hw)
        v = v.view(batch, self.num_heads, self.head_dim, hw).transpose(-2, -1)
        # Scaled dot-product attention across all spatial positions.
        attn = F.softmax((q @ k) * self.scale, dim=-1)
        out = (attn @ v).transpose(-2, -1).contiguous().view(batch, ch, height, width)
        return self.proj(out)
# Transformer Fusion Trend
class YOLOTransformer(nn.Module):
    """YOLO and Transformer Integration Exploration.

    A simplified sketch: a small CNN backbone pools the image into a
    1024-d global feature, a Transformer encoder refines it as a
    one-token sequence, and a linear head predicts one detection
    (4 bbox coordinates + 80 class scores) per image.
    """

    def __init__(self, embed_dim=256, num_heads=8, num_layers=6):
        super(YOLOTransformer, self).__init__()
        # CNN feature extraction
        self.cnn_backbone = self._build_cnn_backbone()
        # Transformer encoder (batch_first: inputs are (B, seq, dim))
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim, nhead=num_heads, batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers)
        # Map the 1024-d backbone feature to the transformer width
        self.feature_proj = nn.Linear(1024, embed_dim)
        # Detection head
        self.detection_head = nn.Linear(embed_dim, 4 + 80)  # 4 bbox + 80 classes

    def _build_cnn_backbone(self):
        """Build a minimal CNN backbone emitting a 1024-d vector per image."""
        return nn.Sequential(
            nn.Conv2d(3, 64, 7, stride=2, padding=3),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(3, stride=2, padding=1),
            # ... more layers
            nn.AdaptiveAvgPool2d(1),
            nn.Flatten(),
            # Fix: the conv stage above produces 64 channels after global
            # pooling, so this projection must take 64 inputs. The original
            # nn.Linear(1024, 1024) crashed with a shape mismatch on any
            # forward pass.
            nn.Linear(64, 1024),
        )

    def forward(self, x):
        # CNN feature extraction
        cnn_features = self.cnn_backbone(x)  # (B, 1024)
        # Convert to a one-token transformer sequence
        transformer_input = self.feature_proj(cnn_features).unsqueeze(1)  # (B, 1, embed_dim)
        # Transformer encoding
        transformer_output = self.transformer(transformer_input)  # (B, 1, embed_dim)
        # Detection prediction
        predictions = self.detection_head(transformer_output.squeeze(1))  # (B, 84)
        return predictions
# Performance Comparison and Trend Analysis
class LatestYOLOComparison:
    """Latest YOLO Version Comparison.

    Holds the chapter's benchmark figures for the smallest ("nano")
    variant of each recent YOLO release and prints them as a fixed-width
    table together with the observed technical trends.
    """

    def __init__(self):
        # Per-model figures: COCO mAP, throughput, parameter count, release year
        self.performance_data = {
            "YOLOv8n": {"mAP": 37.3, "FPS": 1100, "Params": "3.2M", "Year": "2023"},
            "YOLOv9t": {"mAP": 38.3, "FPS": 1100, "Params": "2.0M", "Year": "2024"},
            "YOLOv10n": {"mAP": 39.5, "FPS": 1200, "Params": "2.3M", "Year": "2024"},
            "YOLOv11n": {"mAP": 39.9, "FPS": 1000, "Params": "2.6M", "Year": "2024"}
        }
        # Cross-version engineering themes
        self.technical_trends = [
            "Automated architecture search",
            "Widespread attention mechanisms",
            "End-to-end optimization",
            "Multi-task unification",
            "Hardware-friendly design",
            "Enhanced interpretability"
        ]

    def plot_evolution_trend(self):
        """Print the performance table and the numbered trend list.

        Fix: removed the unused ``models``/``maps``/``fps`` locals that
        were computed but never read.
        """
        print("Latest YOLO Version Performance Comparison:")
        print("-" * 50)
        print(f"{'Model':<10}{'mAP':<8}{'FPS':<8}{'Params':<10}{'Year':<8}")
        print("-" * 50)
        for model, data in self.performance_data.items():
            print(f"{model:<10}{data['mAP']:<8}{data['FPS']:<8}{data['Params']:<10}{data['Year']:<8}")
        print(f"\nTechnical Development Trends:")
        for i, trend in enumerate(self.technical_trends, 1):
            print(f"{i}. {trend}")
# Usage example
# Build the comparison helper and print the version table plus trend list.
comparison = LatestYOLOComparison()
comparison.plot_evolution_trend()
6.5 Cutting-Edge Technology Trends
6.5.1 Technology Development Directions
class FutureTrends:
    """Future Development Trends.

    Collects the chapter's forward-looking material: research directions
    grouped by theme, plus adjacent technologies expected to be folded
    into future detectors.
    """

    def __init__(self):
        # Research directions grouped by theme
        self.technical_directions = {
            "Architecture Innovation": [
                "Neural Architecture Search (NAS) automated design",
                "Deep fusion of Transformer and CNN",
                "Dynamic network architectures",
                "Differentiable architecture search",
            ],
            "Training Optimization": [
                "Self-supervised pre-training",
                "Unsupervised domain adaptation",
                "Continual learning capabilities",
                "Few-shot learning",
            ],
            "Inference Optimization": [
                "Model quantization and pruning",
                "Neural network compilers",
                "Edge device optimization",
                "Real-time performance improvement",
            ],
            "Application Extension": [
                "3D object detection",
                "Video understanding",
                "Multi-modal fusion",
                "Scene graph generation",
            ],
        }
        # Technologies from neighboring fields likely to be integrated
        self.emerging_technologies = [
            "Vision Transformer (ViT) fusion",
            "Diffusion model applications",
            "Large-scale pre-trained models",
            "Multi-modal large models",
            "Neural Radiance Fields (NeRF)",
            "Causal reasoning integration",
        ]

    def analyze_future_directions(self):
        """Print the themed direction lists, then the emerging-tech list."""
        print("YOLO Future Development Direction Analysis:")
        print("=" * 50)
        for theme, entries in self.technical_directions.items():
            print(f"\n{theme}:")
            for entry in entries:
                print(f" • {entry}")
        print(f"\nEmerging Technology Integration:")
        for entry in self.emerging_technologies:
            print(f" • {entry}")
# Challenges and Opportunities in Practical Applications
class ChallengesAndOpportunities:
    """Challenge and Opportunity Analysis.

    Stores the chapter's categorized challenge and opportunity lists and
    prints them as two labeled report sections.
    """

    def __init__(self):
        # Open problems, grouped by the kind of difficulty
        self.challenges = {
            "Technical Challenges": [
                "Small object detection still needs improvement",
                "Robustness in complex scenarios",
                "Balance between real-time and accuracy",
                "Long-tail distribution problem",
            ],
            "Engineering Challenges": [
                "Model deployment complexity",
                "Adaptation to different hardware platforms",
                "Version compatibility issues",
                "Performance tuning difficulty",
            ],
            "Application Challenges": [
                "Data privacy protection",
                "Model interpretability",
                "Edge computing limitations",
                "Real-world scenario complexity",
            ],
        }
        # Favorable factors, grouped the same way
        self.opportunities = {
            "Technical Opportunities": [
                "Transfer from large model pre-training",
                "Multi-modal information fusion",
                "Adaptive architecture design",
                "Edge-cloud collaborative inference",
            ],
            "Application Opportunities": [
                "Rapid autonomous driving development",
                "Growing demand for intelligent surveillance",
                "Industrial inspection automation",
                "Medical image analysis",
            ],
            "Ecosystem Opportunities": [
                "Active open-source community",
                "Hardware performance improvement",
                "Standardized tool chains",
                "Industry-academia-research collaboration",
            ],
        }

    def _print_section(self, header, groups, prefix):
        """Print one section header plus its categorized bullet lines."""
        print(header)
        for category, entries in groups.items():
            print(f"\n{category}:")
            for entry in entries:
                print(f" {prefix}: {entry}")

    def print_analysis(self):
        """Print analysis results: the challenge section, then opportunities."""
        print("YOLO Development Challenges and Opportunities:")
        print("=" * 50)
        self._print_section("\n[Challenge Analysis]", self.challenges, "Warning")
        self._print_section("\n[Opportunity Analysis]", self.opportunities, "Opportunity")
# Usage example
# Print the future-direction analysis, then the challenge/opportunity report.
trends = FutureTrends()
challenges = ChallengesAndOpportunities()
trends.analyze_future_directions()
print("\n")
challenges.print_analysis()
6.6 Chapter Summary
6.6.1 Core Features of Latest Versions
Through this chapter, we learned about the main features of YOLO v6-v11:
- YOLOv6: Industrial-grade optimization, reparameterization design, self-distillation training
- YOLOv7: Trainable bag-of-freebies, E-ELAN architecture, auxiliary head training
- YOLOv8: Unified architecture, multi-task support, task-aligned assignment
- YOLOv9: Programmable gradient information, information flow optimization
- YOLOv10: NMS-free design, end-to-end optimization
- YOLOv11: Deep attention integration, enhanced feature representation
6.6.2 Technology Evolution Patterns
def summarize_latest_evolution():
    """Summarize latest evolution patterns.

    Prints each observed evolution pattern with its short description,
    followed by a list of future-development predictions.
    """
    # Pattern name -> one-line description of the observed trend
    evolution_patterns = {
        "Continuous Accuracy Improvement": "mAP improved from 37% to 40%+",
        "Ongoing Speed Optimization": "Inference speed exceeds 1000+ FPS",
        "Increasingly Mature Architecture": "Modular, reusable design philosophy",
        "High Engineering Level": "Significantly improved usability and deployment convenience",
        "Multi-task Unification": "Unified architecture for detection, segmentation, classification",
        "Cutting-edge Technology Fusion": "New technologies like Transformer, attention mechanisms",
    }
    future_predictions = [
        "Stronger generalization and zero-shot learning",
        "More efficient model compression and acceleration techniques",
        "Smarter automated design and optimization",
        "Richer multi-modal understanding capabilities",
    ]
    print("Latest YOLO Evolution Patterns:")
    for name, detail in evolution_patterns.items():
        print(f" • {name}: {detail}")
    print(f"\nFuture Development Predictions:")
    for forecast in future_predictions:
        print(f" Crystal Ball: {forecast}")


summarize_latest_evolution()
6.6.3 Learning Checkpoints
After completing this chapter, you should be able to:
- Understand the main technical innovations of YOLO v6-v11
- Comprehend cutting-edge technologies like reparameterization and attention mechanisms
- Master the design philosophy of unified architecture and multi-task learning
- Recognize trends in NMS-free and other end-to-end optimizations
- Analyze the development direction of YOLO and Transformer fusion
- Grasp future technology trends in object detection
The latest YOLO versions demonstrate rapid development in object detection technology. From engineering optimization to architectural innovation, from single-task to multi-task unification, each version is pushing the boundaries of technology. With the fusion of cutting-edge technologies like Transformers and attention mechanisms, and exploration of NMS-free end-to-end optimization, YOLO is evolving toward more intelligent, efficient, and general-purpose directions.
In the next chapter, we will learn how to set up the YOLO development environment to prepare for actual model training and deployment.
Key Points: Master the core technologies of the latest YOLO versions, understand cutting-edge development trends, and lay the foundation for practical applications and further research.