Chapter 4: YOLO v1 Detailed Explanation
Haiyue
Learning Objectives
- Understand the core concepts and innovations of YOLO v1
- Master the network architecture design of YOLO v1
- Familiarize with the loss function design principles
- Learn about the training and inference process
4.1 Core Concepts of YOLO v1
4.1.1 “You Only Look Once” Revolutionary Concept
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
class YOLOv1Philosophy:
def __init__(self):
self.core_concepts = {
"Unified Detection": {
"Concept": "Redefine object detection as a single regression problem",
"Comparison": "Traditional methods require two steps: region proposal + classification",
"Advantage": "End-to-end training, simple architecture"
},
"Global Reasoning": {
"Concept": "See the entire image for prediction",
"Comparison": "Sliding windows only see local information",
"Advantage": "Reduce background false positives, utilize global context"
},
"Real-time Detection": {
"Concept": "Complete detection in a single forward pass",
"Performance": "45 FPS on Titan X",
"Significance": "First real-time high-accuracy object detection"
},
"Grid Prediction": {
"Concept": "Divide image into S×S grid",
"Responsibility": "Each grid cell responsible for detecting objects whose center falls in it",
"Simplification": "Avoid complex region proposal generation"
}
}
def paradigm_shift(self):
"""Detection paradigm shift analysis"""
traditional_vs_yolo = {
"Traditional Two-Stage Method": {
"Process": ["Region proposal", "Feature extraction", "Classification", "Regression"],
"Pros": ["High accuracy", "Mature and stable"],
"Cons": ["Slow speed", "Complex system", "Difficult optimization"],
"Representative": "R-CNN series"
},
"YOLO One-Stage Method": {
"Process": ["Single CNN", "Direct detection output"],
"Pros": ["Fast speed", "End-to-end", "Global optimization"],
"Cons": ["Slightly lower accuracy", "Small objects difficult"],
"Breakthrough": "Redefine the detection problem"
}
}
print("Object Detection Paradigm Shift:")
print("=" * 40)
for paradigm, details in traditional_vs_yolo.items():
print(f"\n{paradigm}:")
for key, value in details.items():
if isinstance(value, list):
print(f" {key}: {' -> '.join(value)}")
else:
print(f" {key}: {value}")
return traditional_vs_yolo
def detection_as_regression(self):
"""Detection as regression problem"""
regression_formulation = {
"Problem Redefinition": {
"Input": "H×W×3 image",
"Output": "S×S×(B×5+C) tensor",
"Meaning": "Each grid predicts B bounding boxes and C class probabilities"
},
"Output Interpretation": {
"Bounding Box": "(x, y, w, h) relative coordinates",
"Confidence": "P(Object) × IoU(pred, truth)",
"Class Probability": "P(Class_i | Object)",
"Final Prediction": "P(Class_i) × P(Object) × IoU"
},
"Grid Responsibility": {
"Principle": "Grid cell where object center falls is responsible for predicting that object",
"Advantage": "Avoid duplicate detections of the same object",
"Limitation": "Each grid can detect at most one object"
}
}
print("Detection as Regression Problem:")
print("=" * 30)
for aspect, details in regression_formulation.items():
print(f"\n{aspect}:")
for key, value in details.items():
print(f" {key}: {value}")
return regression_formulation
# Usage example
yolo_philosophy = YOLOv1Philosophy()
# Core concepts
print("YOLO v1 Core Concepts:")
print("=" * 25)
for concept, details in yolo_philosophy.core_concepts.items():
print(f"\n{concept}:")
for key, value in details.items():
print(f" {key}: {value}")
# Paradigm shift
paradigm_comparison = yolo_philosophy.paradigm_shift()
# Regression problem redefinition
regression_details = yolo_philosophy.detection_as_regression()
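The grid-responsibility rule above maps directly onto how training targets are built. Below is a minimal sketch of a hypothetical `encode_target` helper (not part of the original code) that places one ground-truth box, given as normalized `(cx, cy, w, h)` image coordinates, into an S×S×(B×5+C) target tensor. Note that it duplicates the box into every box slot of the responsible cell, which is the layout the loss implementation in section 4.3 assumes.

```python
import torch

def encode_target(box, class_idx, S=7, B=2, C=20):
    """Encode one ground-truth box (cx, cy, w, h in [0, 1] image coordinates)
    into an S x S x (B*5 + C) YOLO v1 target tensor."""
    target = torch.zeros(S, S, B * 5 + C)
    cx, cy, w, h = box
    # The grid cell containing the box center is responsible for the object
    col, row = int(cx * S), int(cy * S)
    # (x, y) become offsets within that cell; (w, h) stay image-relative
    x_cell, y_cell = cx * S - col, cy * S - row
    for b in range(B):  # replicate the box into every box slot of the cell
        target[row, col, b * 5:b * 5 + 4] = torch.tensor([x_cell, y_cell, w, h])
        target[row, col, b * 5 + 4] = 1.0  # objectness target
    target[row, col, B * 5 + class_idx] = 1.0  # one-hot class vector
    return target

# A box centered at (0.5, 0.5) lands in cell (row=3, col=3) of the 7x7 grid
t = encode_target((0.5, 0.5, 0.3, 0.4), class_idx=0)
print(t[3, 3])  # non-zero only in the responsible cell
```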
4.2 YOLO v1 Network Architecture
4.2.1 Overall Architecture Design
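Before reading the code, it helps to track how the 448×448 input reaches the 7×7 grid. The spatial size is halved six times, by the stride-2 7×7 conv, four 2×2 max-pools, and a stride-2 3×3 conv near the end: 448 → 224 → 112 → 56 → 28 → 14 → 7, a total reduction of 2⁶ = 64, so 448 / 64 = 7.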
class YOLOv1Architecture:
def __init__(self):
self.network_specs = {
"Input": "448×448×3",
"Grid Size": "7×7",
"Boxes per Grid": "2",
"Classes": "20 (PASCAL VOC)",
"Output": "7×7×30"
}
def build_yolov1_network(self, num_classes=20, num_boxes=2, grid_size=7):
"""Build YOLO v1 network"""
class YOLOv1(nn.Module):
def __init__(self, num_classes=20, num_boxes=2, grid_size=7):
super(YOLOv1, self).__init__()
self.num_classes = num_classes
self.num_boxes = num_boxes
self.grid_size = grid_size
# Convolutional feature extraction layers (inspired by GoogLeNet)
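                # Note: the original YOLO v1 paper used no batch normalization
                # (BN was introduced in YOLO v2); it is added here as a common
                # modern training aid.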
self.features = nn.Sequential(
# First conv group
nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3),
nn.BatchNorm2d(64),
nn.LeakyReLU(0.1, inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2),
# Second conv group
nn.Conv2d(64, 192, kernel_size=3, padding=1),
nn.BatchNorm2d(192),
nn.LeakyReLU(0.1, inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2),
# Third conv group
nn.Conv2d(192, 128, kernel_size=1),
nn.BatchNorm2d(128),
nn.LeakyReLU(0.1, inplace=True),
nn.Conv2d(128, 256, kernel_size=3, padding=1),
nn.BatchNorm2d(256),
nn.LeakyReLU(0.1, inplace=True),
nn.Conv2d(256, 256, kernel_size=1),
nn.BatchNorm2d(256),
nn.LeakyReLU(0.1, inplace=True),
nn.Conv2d(256, 512, kernel_size=3, padding=1),
nn.BatchNorm2d(512),
nn.LeakyReLU(0.1, inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2),
# Fourth conv group (alternating 1×1 and 3×3)
nn.Conv2d(512, 256, kernel_size=1),
nn.BatchNorm2d(256),
nn.LeakyReLU(0.1, inplace=True),
nn.Conv2d(256, 512, kernel_size=3, padding=1),
nn.BatchNorm2d(512),
nn.LeakyReLU(0.1, inplace=True),
nn.Conv2d(512, 256, kernel_size=1),
nn.BatchNorm2d(256),
nn.LeakyReLU(0.1, inplace=True),
nn.Conv2d(256, 512, kernel_size=3, padding=1),
nn.BatchNorm2d(512),
nn.LeakyReLU(0.1, inplace=True),
nn.Conv2d(512, 256, kernel_size=1),
nn.BatchNorm2d(256),
nn.LeakyReLU(0.1, inplace=True),
nn.Conv2d(256, 512, kernel_size=3, padding=1),
nn.BatchNorm2d(512),
nn.LeakyReLU(0.1, inplace=True),
nn.Conv2d(512, 256, kernel_size=1),
nn.BatchNorm2d(256),
nn.LeakyReLU(0.1, inplace=True),
nn.Conv2d(256, 512, kernel_size=3, padding=1),
nn.BatchNorm2d(512),
nn.LeakyReLU(0.1, inplace=True),
nn.Conv2d(512, 512, kernel_size=1),
nn.BatchNorm2d(512),
nn.LeakyReLU(0.1, inplace=True),
nn.Conv2d(512, 1024, kernel_size=3, padding=1),
nn.BatchNorm2d(1024),
nn.LeakyReLU(0.1, inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2),
# Fifth conv group
nn.Conv2d(1024, 512, kernel_size=1),
nn.BatchNorm2d(512),
nn.LeakyReLU(0.1, inplace=True),
nn.Conv2d(512, 1024, kernel_size=3, padding=1),
nn.BatchNorm2d(1024),
nn.LeakyReLU(0.1, inplace=True),
nn.Conv2d(1024, 512, kernel_size=1),
nn.BatchNorm2d(512),
nn.LeakyReLU(0.1, inplace=True),
nn.Conv2d(512, 1024, kernel_size=3, padding=1),
nn.BatchNorm2d(1024),
nn.LeakyReLU(0.1, inplace=True),
nn.Conv2d(1024, 1024, kernel_size=3, padding=1),
nn.BatchNorm2d(1024),
nn.LeakyReLU(0.1, inplace=True),
nn.Conv2d(1024, 1024, kernel_size=3, stride=2, padding=1),
nn.BatchNorm2d(1024),
nn.LeakyReLU(0.1, inplace=True),
# Final conv layers
nn.Conv2d(1024, 1024, kernel_size=3, padding=1),
nn.BatchNorm2d(1024),
nn.LeakyReLU(0.1, inplace=True),
nn.Conv2d(1024, 1024, kernel_size=3, padding=1),
nn.BatchNorm2d(1024),
nn.LeakyReLU(0.1, inplace=True),
)
# Fully connected detection layers
self.classifier = nn.Sequential(
nn.Flatten(),
nn.Linear(1024 * grid_size * grid_size, 4096),
nn.LeakyReLU(0.1, inplace=True),
nn.Dropout(0.5),
nn.Linear(4096, grid_size * grid_size * (num_boxes * 5 + num_classes)),
)
def forward(self, x):
x = self.features(x)
x = self.classifier(x)
# Reshape to (batch_size, grid_size, grid_size, num_boxes*5 + num_classes)
batch_size = x.size(0)
x = x.view(batch_size, self.grid_size, self.grid_size,
self.num_boxes * 5 + self.num_classes)
return x
return YOLOv1(num_classes, num_boxes, grid_size)
def architecture_analysis(self):
"""Detailed architecture analysis"""
layer_analysis = {
"Convolutional Layers": {
"Total Layers": "24 convolutional layers",
"Design Inspiration": "GoogLeNet architecture",
"Features": "1×1 conv for dimensionality reduction + 3×3 conv for feature extraction",
"Activation": "Leaky ReLU (α=0.1)"
},
"Fully Connected Layers": {
"Layers": "2 fully connected layers",
"First Layer": "4096 neurons",
"Second Layer": "7×7×30 = 1470 outputs",
"Dropout": "0.5 to prevent overfitting"
},
"Output Tensor": {
"Dimensions": "7×7×30",
"Bounding Boxes": "2 boxes per grid, 5 parameters each",
"Classes": "20 class probabilities",
"Calculation": "2×5 + 20 = 30"
},
"Parameters": {
"Total Parameters": "~45M parameters",
"Convolutional": "~40M parameters",
"Fully Connected": "~5M parameters"
}
}
print("YOLO v1 Architecture Analysis:")
print("=" * 30)
for aspect, details in layer_analysis.items():
print(f"\n{aspect}:")
for key, value in details.items():
print(f" {key}: {value}")
return layer_analysis
def output_interpretation(self):
"""Output interpretation"""
def parse_yolo_output(output_tensor, grid_size=7, num_boxes=2, num_classes=20):
"""Parse YOLO output tensor"""
batch_size = output_tensor.size(0)
# Separate bounding box and class predictions
            # .reshape (not .view): the slice along the last dim is non-contiguous
            bbox_predictions = output_tensor[:, :, :, :num_boxes*5].reshape(
                batch_size, grid_size, grid_size, num_boxes, 5)
class_predictions = output_tensor[:, :, :, num_boxes*5:]
# Bounding box parameters
bbox_coords = bbox_predictions[:, :, :, :, :4] # (x, y, w, h)
bbox_confidence = bbox_predictions[:, :, :, :, 4] # confidence
return {
'bbox_coords': bbox_coords,
'bbox_confidence': bbox_confidence,
'class_probs': class_predictions
}
output_format = {
"Grid Cell Output": {
"Bounding Box 1": "[x1, y1, w1, h1, conf1]",
"Bounding Box 2": "[x2, y2, w2, h2, conf2]",
"Class Probabilities": "[P(class1), P(class2), ..., P(class20)]"
},
"Coordinate Encoding": {
"x, y": "Offset relative to grid cell (0-1)",
"w, h": "Ratio relative to entire image (0-1)",
"Confidence": "P(Object) × IoU(pred, truth)"
},
"Class Prediction": {
"Shared": "Multiple boxes per grid share class prediction",
"Conditional Probability": "P(Class_i | Object)",
"Final Probability": "conf × P(Class_i | Object)"
}
}
print("YOLO Output Format:")
print("=" * 20)
for aspect, details in output_format.items():
print(f"\n{aspect}:")
for key, value in details.items():
print(f" {key}: {value}")
return parse_yolo_output, output_format
# Usage example
yolo_arch = YOLOv1Architecture()
# Build network
model = yolo_arch.build_yolov1_network()
print("YOLO v1 Network Structure:")
print("=" * 25)
print(model)
# Count parameters
def count_parameters(model):
return sum(p.numel() for p in model.parameters() if p.requires_grad)
param_count = count_parameters(model)
print(f"\nTotal Parameters: {param_count:,}")
# Architecture analysis
arch_analysis = yolo_arch.architecture_analysis()
# Output interpretation
parse_output, output_format = yolo_arch.output_interpretation()
# Test forward pass
test_input = torch.randn(1, 3, 448, 448)
with torch.no_grad():
output = model(test_input)
print(f"\nInput Shape: {test_input.shape}")
print(f"Output Shape: {output.shape}")
# Parse output
parsed = parse_output(output)
print(f"Bounding Box Coords Shape: {parsed['bbox_coords'].shape}")
print(f"Bounding Box Confidence Shape: {parsed['bbox_confidence'].shape}")
print(f"Class Probabilities Shape: {parsed['class_probs'].shape}")
4.3 Loss Function Design
4.3.1 Multi-task Loss Function
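For reference, here is the complete sum-squared-error loss from the original paper, which the implementation below reproduces term by term ($\mathbb{1}_{ij}^{\text{obj}}$ selects the responsible predictor $j$ in cell $i$, and $\mathbb{1}_{i}^{\text{obj}}$ marks cells that contain an object):

$$
\begin{aligned}
\mathcal{L} ={}& \lambda_{\text{coord}} \sum_{i=0}^{S^2}\sum_{j=0}^{B} \mathbb{1}_{ij}^{\text{obj}} \left[(x_i-\hat{x}_i)^2 + (y_i-\hat{y}_i)^2\right] \\
&+ \lambda_{\text{coord}} \sum_{i=0}^{S^2}\sum_{j=0}^{B} \mathbb{1}_{ij}^{\text{obj}} \left[\left(\sqrt{w_i}-\sqrt{\hat{w}_i}\right)^2 + \left(\sqrt{h_i}-\sqrt{\hat{h}_i}\right)^2\right] \\
&+ \sum_{i=0}^{S^2}\sum_{j=0}^{B} \mathbb{1}_{ij}^{\text{obj}} \left(C_i-\hat{C}_i\right)^2
 + \lambda_{\text{noobj}} \sum_{i=0}^{S^2}\sum_{j=0}^{B} \mathbb{1}_{ij}^{\text{noobj}} \left(C_i-\hat{C}_i\right)^2 \\
&+ \sum_{i=0}^{S^2} \mathbb{1}_{i}^{\text{obj}} \sum_{c\,\in\,\text{classes}} \left(p_i(c)-\hat{p}_i(c)\right)^2
\end{aligned}
$$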
class YOLOv1Loss:
def __init__(self, lambda_coord=5, lambda_noobj=0.5, grid_size=7, num_boxes=2, num_classes=20):
self.lambda_coord = lambda_coord # coordinate loss weight
self.lambda_noobj = lambda_noobj # no-object confidence loss weight
self.grid_size = grid_size
self.num_boxes = num_boxes
self.num_classes = num_classes
def yolo_loss_function(self, predictions, targets):
"""YOLO v1 loss function implementation"""
batch_size = predictions.size(0)
# Parse predictions
        # .reshape (not .view): the slice along the last dim is non-contiguous
        pred_boxes = predictions[:, :, :, :self.num_boxes*5].reshape(
            batch_size, self.grid_size, self.grid_size, self.num_boxes, 5)
pred_classes = predictions[:, :, :, self.num_boxes*5:]
# Parse targets
        target_boxes = targets[:, :, :, :self.num_boxes*5].reshape(
            batch_size, self.grid_size, self.grid_size, self.num_boxes, 5)
target_classes = targets[:, :, :, self.num_boxes*5:]
        # Loss components, initialized as tensors so that .item() below
        # works even for terms that are never accumulated
        coord_loss = torch.tensor(0.0)
        size_loss = torch.tensor(0.0)
        conf_loss_obj = torch.tensor(0.0)
        conf_loss_noobj = torch.tensor(0.0)
        class_loss = torch.tensor(0.0)
for b in range(batch_size):
for i in range(self.grid_size):
for j in range(self.grid_size):
# Check if object exists
target_confidence = target_boxes[b, i, j, :, 4]
has_object = torch.any(target_confidence > 0)
if has_object:
# Find responsible bounding box
responsible_box_idx = self._find_responsible_box(
pred_boxes[b, i, j], target_boxes[b, i, j])
# Coordinate loss (x, y)
pred_xy = pred_boxes[b, i, j, responsible_box_idx, :2]
target_xy = target_boxes[b, i, j, responsible_box_idx, :2]
coord_loss += F.mse_loss(pred_xy, target_xy)
# Size loss (w, h) - take square root
pred_wh = pred_boxes[b, i, j, responsible_box_idx, 2:4]
target_wh = target_boxes[b, i, j, responsible_box_idx, 2:4]
# Prevent negative and zero values
pred_wh = torch.clamp(pred_wh, min=1e-6)
target_wh = torch.clamp(target_wh, min=1e-6)
size_loss += F.mse_loss(torch.sqrt(pred_wh), torch.sqrt(target_wh))
# Object confidence loss
pred_conf = pred_boxes[b, i, j, responsible_box_idx, 4]
target_conf = target_boxes[b, i, j, responsible_box_idx, 4]
conf_loss_obj += F.mse_loss(pred_conf, target_conf)
# Class loss
pred_class = pred_classes[b, i, j]
target_class = target_classes[b, i, j]
class_loss += F.mse_loss(pred_class, target_class)
# Other boxes confidence loss (no object)
for box_idx in range(self.num_boxes):
if box_idx != responsible_box_idx:
pred_conf_noobj = pred_boxes[b, i, j, box_idx, 4]
conf_loss_noobj += F.mse_loss(pred_conf_noobj, torch.tensor(0.0))
else:
# No-object confidence loss
for box_idx in range(self.num_boxes):
pred_conf_noobj = pred_boxes[b, i, j, box_idx, 4]
conf_loss_noobj += F.mse_loss(pred_conf_noobj, torch.tensor(0.0))
# Total loss
total_loss = (self.lambda_coord * coord_loss +
self.lambda_coord * size_loss +
conf_loss_obj +
self.lambda_noobj * conf_loss_noobj +
class_loss)
loss_components = {
'coord_loss': coord_loss.item(),
'size_loss': size_loss.item(),
'conf_loss_obj': conf_loss_obj.item(),
'conf_loss_noobj': conf_loss_noobj.item(),
'class_loss': class_loss.item(),
'total_loss': total_loss.item()
}
return total_loss, loss_components
def _find_responsible_box(self, pred_boxes, target_boxes):
"""Find the responsible bounding box"""
max_iou = 0
responsible_idx = 0
for i in range(self.num_boxes):
if target_boxes[i, 4] > 0: # if object exists
iou = self._calculate_iou(pred_boxes[i, :4], target_boxes[i, :4])
if iou > max_iou:
max_iou = iou
responsible_idx = i
return responsible_idx
def _calculate_iou(self, box1, box2):
"""Calculate IoU"""
# Convert to corner coordinates
box1_x1 = box1[0] - box1[2] / 2
box1_y1 = box1[1] - box1[3] / 2
box1_x2 = box1[0] + box1[2] / 2
box1_y2 = box1[1] + box1[3] / 2
box2_x1 = box2[0] - box2[2] / 2
box2_y1 = box2[1] - box2[3] / 2
box2_x2 = box2[0] + box2[2] / 2
box2_y2 = box2[1] + box2[3] / 2
# Calculate intersection
inter_x1 = torch.max(box1_x1, box2_x1)
inter_y1 = torch.max(box1_y1, box2_y1)
inter_x2 = torch.min(box1_x2, box2_x2)
inter_y2 = torch.min(box1_y2, box2_y2)
inter_area = torch.clamp(inter_x2 - inter_x1, min=0) * torch.clamp(inter_y2 - inter_y1, min=0)
# Calculate union
box1_area = (box1_x2 - box1_x1) * (box1_y2 - box1_y1)
box2_area = (box2_x2 - box2_x1) * (box2_y2 - box2_y1)
union_area = box1_area + box2_area - inter_area
iou = inter_area / (union_area + 1e-6)
return iou
def loss_component_analysis(self):
"""Loss function component analysis"""
loss_components = {
"Coordinate Loss": {
"Formula": "λ_coord × Σ[(x_pred - x_true)² + (y_pred - y_true)²]",
"Weight": "λ_coord = 5",
"Purpose": "Regress bounding box center coordinates",
"Reason": "Coordinate prediction is important, given higher weight"
},
"Size Loss": {
"Formula": "λ_coord × Σ[(√w_pred - √w_true)² + (√h_pred - √h_true)²]",
"Weight": "λ_coord = 5",
"Square Root": "Reduce size difference impact for large objects",
"Purpose": "Regress bounding box width and height"
},
"Object Confidence Loss": {
"Formula": "Σ[(C_pred - IoU)²]",
"Weight": "1.0",
"Target": "IoU value as confidence label",
"Purpose": "Predict probability of containing object"
},
"No-Object Confidence Loss": {
"Formula": "λ_noobj × Σ[(C_pred - 0)²]",
"Weight": "λ_noobj = 0.5",
"Downweight": "Most grids have no objects, reduce weight for balance",
"Purpose": "Suppress confidence in background regions"
},
"Classification Loss": {
"Formula": "Σ[(P_pred(c) - P_true(c))²]",
"Weight": "1.0",
"Condition": "Only calculated for grids with objects",
"Purpose": "Predict object class probabilities"
}
}
print("YOLO v1 Loss Function Components:")
print("=" * 35)
for component, details in loss_components.items():
print(f"\n{component}:")
for key, value in details.items():
print(f" {key}: {value}")
return loss_components
def loss_balancing_strategy(self):
"""Loss balancing strategy"""
balancing_reasons = {
"λ_coord = 5": {
"Problem": "Coordinate loss has small proportion in total loss",
"Reason": "Most grids have no objects, classification and confidence loss dominate",
"Solution": "Increase coordinate loss weight, emphasize localization importance"
},
"λ_noobj = 0.5": {
"Problem": "Number of no-object grids far exceeds object grids",
"Reason": "7×7=49 grids, usually only 1-3 contain objects",
"Solution": "Reduce no-object confidence loss weight"
},
"Square Root Size": {
"Problem": "Size errors of large objects impact loss too much",
"Reason": "Few pixels offset in large objects differs from one pixel in small objects",
"Solution": "Take square root of width/height to reduce size difference"
},
"MSE Loss": {
"Choice": "All loss components use mean squared error",
"Pros": "Simple, stable, easy to optimize",
"Cons": "Sensitive to outliers"
}
}
print("Loss Balancing Strategy:")
print("=" * 20)
for strategy, details in balancing_reasons.items():
print(f"\n{strategy}:")
for key, value in details.items():
print(f" {key}: {value}")
return balancing_reasons
# Usage example
yolo_loss = YOLOv1Loss()
# Loss function component analysis
loss_analysis = yolo_loss.loss_component_analysis()
# Loss balancing strategy
balancing_strategy = yolo_loss.loss_balancing_strategy()
# Create simulated data to test loss function
print("\nLoss Function Test:")
print("-" * 15)
batch_size, grid_size, num_boxes, num_classes = 2, 7, 2, 20
output_size = num_boxes * 5 + num_classes
# Simulate predictions and targets
predictions = torch.randn(batch_size, grid_size, grid_size, output_size)
targets = torch.zeros(batch_size, grid_size, grid_size, output_size)
# Set some targets
targets[0, 3, 3, 4] = 0.8 # first bounding box confidence
targets[0, 3, 3, :4] = torch.tensor([0.5, 0.5, 0.3, 0.4]) # coordinates
targets[0, 3, 3, 10] = 1.0 # first class
# Calculate loss
total_loss, loss_components = yolo_loss.yolo_loss_function(predictions, targets)
print(f"Total Loss: {total_loss:.4f}")
print("Loss Components:")
for component, value in loss_components.items():
print(f" {component}: {value:.4f}")