Chapter 4: YOLO v1 Detailed Explanation

Haiyue

Learning Objectives

  1. Understand the core concepts and innovations of YOLO v1
  2. Master the network architecture design of YOLO v1
  3. Become familiar with the loss function design principles
  4. Learn about the training and inference process

4.1 Core Concepts of YOLO v1

4.1.1 “You Only Look Once” Revolutionary Concept

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt

class YOLOv1Philosophy:
    def __init__(self):
        self.core_concepts = {
            "Unified Detection": {
                "Concept": "Redefine object detection as a single regression problem",
                "Comparison": "Traditional methods require two steps: region proposal + classification",
                "Advantage": "End-to-end training, simple architecture"
            },
            "Global Reasoning": {
                "Concept": "See the entire image for prediction",
                "Comparison": "Sliding windows only see local information",
                "Advantage": "Reduce background false positives, utilize global context"
            },
            "Real-time Detection": {
                "Concept": "Complete detection in a single forward pass",
                "Performance": "45 FPS on Titan X",
                "Significance": "First real-time high-accuracy object detection"
            },
            "Grid Prediction": {
                "Concept": "Divide image into S×S grid",
                "Responsibility": "Each grid cell responsible for detecting objects whose center falls in it",
                "Simplification": "Avoid complex region proposal generation"
            }
        }

    def paradigm_shift(self):
        """Detection paradigm shift analysis"""
        traditional_vs_yolo = {
            "Traditional Two-Stage Method": {
                "Process": ["Region proposal", "Feature extraction", "Classification", "Regression"],
                "Pros": ["High accuracy", "Mature and stable"],
                "Cons": ["Slow speed", "Complex system", "Difficult optimization"],
                "Representative": "R-CNN series"
            },
            "YOLO One-Stage Method": {
                "Process": ["Single CNN", "Direct detection output"],
                "Pros": ["Fast speed", "End-to-end", "Global optimization"],
                "Cons": ["Slightly lower accuracy", "Small objects difficult"],
                "Breakthrough": "Redefine the detection problem"
            }
        }

        print("Object Detection Paradigm Shift:")
        print("=" * 40)

        for paradigm, details in traditional_vs_yolo.items():
            print(f"\n{paradigm}:")
            for key, value in details.items():
                if isinstance(value, list):
                    print(f"  {key}: {' -> '.join(value)}")
                else:
                    print(f"  {key}: {value}")

        return traditional_vs_yolo

    def detection_as_regression(self):
        """Detection as regression problem"""
        regression_formulation = {
            "Problem Redefinition": {
                "Input": "H×W×3 image",
                "Output": "S×S×(B×5+C) tensor",
                "Meaning": "Each grid predicts B bounding boxes and C class probabilities"
            },
            "Output Interpretation": {
                "Bounding Box": "(x, y, w, h) relative coordinates",
                "Confidence": "P(Object) × IoU(pred, truth)",
                "Class Probability": "P(Class_i | Object)",
                "Final Prediction": "P(Class_i) × P(Object) × IoU"
            },
            "Grid Responsibility": {
                "Principle": "Grid cell where object center falls is responsible for predicting that object",
                "Advantage": "Avoid duplicate detections of the same object",
                "Limitation": "Each grid can detect at most one object"
            }
        }

        print("Detection as Regression Problem:")
        print("=" * 30)

        for aspect, details in regression_formulation.items():
            print(f"\n{aspect}:")
            for key, value in details.items():
                print(f"  {key}: {value}")

        return regression_formulation

# Usage example
yolo_philosophy = YOLOv1Philosophy()

# Core concepts
print("YOLO v1 Core Concepts:")
print("=" * 25)
for concept, details in yolo_philosophy.core_concepts.items():
    print(f"\n{concept}:")
    for key, value in details.items():
        print(f"  {key}: {value}")

# Paradigm shift
paradigm_comparison = yolo_philosophy.paradigm_shift()

# Regression problem redefinition
regression_details = yolo_philosophy.detection_as_regression()
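
The score combination above is worth making concrete. Below is a minimal numeric sketch of the S×S×(B×5+C) layout and the final class-specific score; the tensor is random, standing in for a trained network's output:

import torch

# One prediction tensor in the S×S×(B×5+C) layout: S=7, B=2, C=20 -> 30 channels
S, B, C = 7, 2, 20
output = torch.rand(S, S, B * 5 + C)

cell = output[3, 3]                 # the 30 values of one grid cell
box1_conf = cell[4]                 # confidence of the cell's first box
class_probs = cell[B * 5:]          # P(Class_i | Object), 20 values

# Final class-specific score: P(Class_i | Object) × P(Object) × IoU,
# where the predicted confidence approximates P(Object) × IoU
class_scores = box1_conf * class_probs
print(class_scores.shape)           # torch.Size([20])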

4.2 YOLO v1 Network Architecture

4.2.1 Overall Architecture Design

class YOLOv1Architecture:
    def __init__(self):
        self.network_specs = {
            "Input": "448×448×3",
            "Grid Size": "7×7",
            "Boxes per Grid": "2",
            "Classes": "20 (PASCAL VOC)",
            "Output": "7×7×30"
        }

    def build_yolov1_network(self, num_classes=20, num_boxes=2, grid_size=7):
        """Build YOLO v1 network"""

        class YOLOv1(nn.Module):
            def __init__(self, num_classes=20, num_boxes=2, grid_size=7):
                super(YOLOv1, self).__init__()

                self.num_classes = num_classes
                self.num_boxes = num_boxes
                self.grid_size = grid_size

                # Convolutional feature extraction layers (inspired by GoogLeNet).
                # Note: the original paper predates batch norm; BatchNorm2d is
                # included here as a common modern training aid.
                self.features = nn.Sequential(
                    # First conv group
                    nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3),
                    nn.BatchNorm2d(64),
                    nn.LeakyReLU(0.1, inplace=True),
                    nn.MaxPool2d(kernel_size=2, stride=2),

                    # Second conv group
                    nn.Conv2d(64, 192, kernel_size=3, padding=1),
                    nn.BatchNorm2d(192),
                    nn.LeakyReLU(0.1, inplace=True),
                    nn.MaxPool2d(kernel_size=2, stride=2),

                    # Third conv group
                    nn.Conv2d(192, 128, kernel_size=1),
                    nn.BatchNorm2d(128),
                    nn.LeakyReLU(0.1, inplace=True),
                    nn.Conv2d(128, 256, kernel_size=3, padding=1),
                    nn.BatchNorm2d(256),
                    nn.LeakyReLU(0.1, inplace=True),
                    nn.Conv2d(256, 256, kernel_size=1),
                    nn.BatchNorm2d(256),
                    nn.LeakyReLU(0.1, inplace=True),
                    nn.Conv2d(256, 512, kernel_size=3, padding=1),
                    nn.BatchNorm2d(512),
                    nn.LeakyReLU(0.1, inplace=True),
                    nn.MaxPool2d(kernel_size=2, stride=2),

                    # Fourth conv group (alternating 1×1 and 3×3)
                    nn.Conv2d(512, 256, kernel_size=1),
                    nn.BatchNorm2d(256),
                    nn.LeakyReLU(0.1, inplace=True),
                    nn.Conv2d(256, 512, kernel_size=3, padding=1),
                    nn.BatchNorm2d(512),
                    nn.LeakyReLU(0.1, inplace=True),
                    nn.Conv2d(512, 256, kernel_size=1),
                    nn.BatchNorm2d(256),
                    nn.LeakyReLU(0.1, inplace=True),
                    nn.Conv2d(256, 512, kernel_size=3, padding=1),
                    nn.BatchNorm2d(512),
                    nn.LeakyReLU(0.1, inplace=True),
                    nn.Conv2d(512, 256, kernel_size=1),
                    nn.BatchNorm2d(256),
                    nn.LeakyReLU(0.1, inplace=True),
                    nn.Conv2d(256, 512, kernel_size=3, padding=1),
                    nn.BatchNorm2d(512),
                    nn.LeakyReLU(0.1, inplace=True),
                    nn.Conv2d(512, 256, kernel_size=1),
                    nn.BatchNorm2d(256),
                    nn.LeakyReLU(0.1, inplace=True),
                    nn.Conv2d(256, 512, kernel_size=3, padding=1),
                    nn.BatchNorm2d(512),
                    nn.LeakyReLU(0.1, inplace=True),
                    nn.Conv2d(512, 512, kernel_size=1),
                    nn.BatchNorm2d(512),
                    nn.LeakyReLU(0.1, inplace=True),
                    nn.Conv2d(512, 1024, kernel_size=3, padding=1),
                    nn.BatchNorm2d(1024),
                    nn.LeakyReLU(0.1, inplace=True),
                    nn.MaxPool2d(kernel_size=2, stride=2),

                    # Fifth conv group
                    nn.Conv2d(1024, 512, kernel_size=1),
                    nn.BatchNorm2d(512),
                    nn.LeakyReLU(0.1, inplace=True),
                    nn.Conv2d(512, 1024, kernel_size=3, padding=1),
                    nn.BatchNorm2d(1024),
                    nn.LeakyReLU(0.1, inplace=True),
                    nn.Conv2d(1024, 512, kernel_size=1),
                    nn.BatchNorm2d(512),
                    nn.LeakyReLU(0.1, inplace=True),
                    nn.Conv2d(512, 1024, kernel_size=3, padding=1),
                    nn.BatchNorm2d(1024),
                    nn.LeakyReLU(0.1, inplace=True),
                    nn.Conv2d(1024, 1024, kernel_size=3, padding=1),
                    nn.BatchNorm2d(1024),
                    nn.LeakyReLU(0.1, inplace=True),
                    nn.Conv2d(1024, 1024, kernel_size=3, stride=2, padding=1),
                    nn.BatchNorm2d(1024),
                    nn.LeakyReLU(0.1, inplace=True),

                    # Final conv layers
                    nn.Conv2d(1024, 1024, kernel_size=3, padding=1),
                    nn.BatchNorm2d(1024),
                    nn.LeakyReLU(0.1, inplace=True),
                    nn.Conv2d(1024, 1024, kernel_size=3, padding=1),
                    nn.BatchNorm2d(1024),
                    nn.LeakyReLU(0.1, inplace=True),
                )

                # Fully connected detection layers
                self.classifier = nn.Sequential(
                    nn.Flatten(),
                    nn.Linear(1024 * grid_size * grid_size, 4096),
                    nn.LeakyReLU(0.1, inplace=True),
                    nn.Dropout(0.5),
                    nn.Linear(4096, grid_size * grid_size * (num_boxes * 5 + num_classes)),
                )

            def forward(self, x):
                x = self.features(x)
                x = self.classifier(x)

                # Reshape to (batch_size, grid_size, grid_size, num_boxes*5 + num_classes)
                batch_size = x.size(0)
                x = x.view(batch_size, self.grid_size, self.grid_size,
                          self.num_boxes * 5 + self.num_classes)

                return x

        return YOLOv1(num_classes, num_boxes, grid_size)

    def architecture_analysis(self):
        """Detailed architecture analysis"""
        layer_analysis = {
            "Convolutional Layers": {
                "Total Layers": "24 convolutional layers",
                "Design Inspiration": "GoogLeNet architecture",
                "Features": "1×1 conv for dimensionality reduction + 3×3 conv for feature extraction",
                "Activation": "Leaky ReLU (α=0.1)"
            },
            "Fully Connected Layers": {
                "Layers": "2 fully connected layers",
                "First Layer": "4096 neurons",
                "Second Layer": "7×7×30 = 1470 outputs",
                "Dropout": "0.5 to prevent overfitting"
            },
            "Output Tensor": {
                "Dimensions": "7×7×30",
                "Bounding Boxes": "2 boxes per grid, 5 parameters each",
                "Classes": "20 class probabilities",
                "Calculation": "2×5 + 20 = 30"
            },
            "Parameters": {
                "Total Parameters": "~45M parameters",
                "Convolutional": "~40M parameters",
                "Fully Connected": "~5M parameters"
            }
        }

        print("YOLO v1 Architecture Analysis:")
        print("=" * 30)

        for aspect, details in layer_analysis.items():
            print(f"\n{aspect}:")
            for key, value in details.items():
                print(f"  {key}: {value}")

        return layer_analysis

    def output_interpretation(self):
        """Output interpretation"""

        def parse_yolo_output(output_tensor, grid_size=7, num_boxes=2, num_classes=20):
            """Parse YOLO output tensor"""
            batch_size = output_tensor.size(0)

            # Separate bounding box and class predictions
            bbox_predictions = output_tensor[:, :, :, :num_boxes*5].view(
                batch_size, grid_size, grid_size, num_boxes, 5)

            class_predictions = output_tensor[:, :, :, num_boxes*5:]

            # Bounding box parameters
            bbox_coords = bbox_predictions[:, :, :, :, :4]  # (x, y, w, h)
            bbox_confidence = bbox_predictions[:, :, :, :, 4]  # confidence

            return {
                'bbox_coords': bbox_coords,
                'bbox_confidence': bbox_confidence,
                'class_probs': class_predictions
            }

        output_format = {
            "Grid Cell Output": {
                "Bounding Box 1": "[x1, y1, w1, h1, conf1]",
                "Bounding Box 2": "[x2, y2, w2, h2, conf2]",
                "Class Probabilities": "[P(class1), P(class2), ..., P(class20)]"
            },
            "Coordinate Encoding": {
                "x, y": "Offset relative to grid cell (0-1)",
                "w, h": "Ratio relative to entire image (0-1)",
                "Confidence": "P(Object) × IoU(pred, truth)"
            },
            "Class Prediction": {
                "Shared": "Multiple boxes per grid share class prediction",
                "Conditional Probability": "P(Class_i | Object)",
                "Final Probability": "conf × P(Class_i | Object)"
            }
        }

        print("YOLO Output Format:")
        print("=" * 20)

        for aspect, details in output_format.items():
            print(f"\n{aspect}:")
            for key, value in details.items():
                print(f"  {key}: {value}")

        return parse_yolo_output, output_format

# Usage example
yolo_arch = YOLOv1Architecture()

# Build network
model = yolo_arch.build_yolov1_network()

print("YOLO v1 Network Structure:")
print("=" * 25)
print(model)

# Count parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

param_count = count_parameters(model)
print(f"\nTotal Parameters: {param_count:,}")

# Architecture analysis
arch_analysis = yolo_arch.architecture_analysis()

# Output interpretation
parse_output, output_format = yolo_arch.output_interpretation()

# Test forward pass
test_input = torch.randn(1, 3, 448, 448)
with torch.no_grad():
    output = model(test_input)
    print(f"\nInput Shape: {test_input.shape}")
    print(f"Output Shape: {output.shape}")

    # Parse output
    parsed = parse_output(output)
    print(f"Bounding Box Coords Shape: {parsed['bbox_coords'].shape}")
    print(f"Bounding Box Confidence Shape: {parsed['bbox_confidence'].shape}")
    print(f"Class Probabilities Shape: {parsed['class_probs'].shape}")

4.3 Loss Function Design

4.3.1 Multi-task Loss Function
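
Before the implementation, it helps to write out the full objective from the original paper, which the code below mirrors term by term (the indicator $\mathbb{1}_{ij}^{\text{obj}}$ selects the predictor $j$ in cell $i$ responsible for an object, and $\mathbb{1}_{ij}^{\text{noobj}}$ is its complement):

$$
\begin{aligned}
\mathcal{L} ={}& \lambda_{\text{coord}} \sum_{i=0}^{S^2}\sum_{j=0}^{B} \mathbb{1}_{ij}^{\text{obj}} \left[ (x_i - \hat{x}_i)^2 + (y_i - \hat{y}_i)^2 \right] \\
&+ \lambda_{\text{coord}} \sum_{i=0}^{S^2}\sum_{j=0}^{B} \mathbb{1}_{ij}^{\text{obj}} \left[ \left(\sqrt{w_i} - \sqrt{\hat{w}_i}\right)^2 + \left(\sqrt{h_i} - \sqrt{\hat{h}_i}\right)^2 \right] \\
&+ \sum_{i=0}^{S^2}\sum_{j=0}^{B} \mathbb{1}_{ij}^{\text{obj}} \left( C_i - \hat{C}_i \right)^2
+ \lambda_{\text{noobj}} \sum_{i=0}^{S^2}\sum_{j=0}^{B} \mathbb{1}_{ij}^{\text{noobj}} \left( C_i - \hat{C}_i \right)^2 \\
&+ \sum_{i=0}^{S^2} \mathbb{1}_{i}^{\text{obj}} \sum_{c \in \text{classes}} \left( p_i(c) - \hat{p}_i(c) \right)^2
\end{aligned}
$$

with $\lambda_{\text{coord}} = 5$ and $\lambda_{\text{noobj}} = 0.5$.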

class YOLOv1Loss:
    def __init__(self, lambda_coord=5, lambda_noobj=0.5, grid_size=7, num_boxes=2, num_classes=20):
        self.lambda_coord = lambda_coord  # coordinate loss weight
        self.lambda_noobj = lambda_noobj  # no-object confidence loss weight
        self.grid_size = grid_size
        self.num_boxes = num_boxes
        self.num_classes = num_classes

    def yolo_loss_function(self, predictions, targets):
        """YOLO v1 loss function implementation"""

        batch_size = predictions.size(0)

        # Parse predictions
        pred_boxes = predictions[:, :, :, :self.num_boxes*5].view(
            batch_size, self.grid_size, self.grid_size, self.num_boxes, 5)
        pred_classes = predictions[:, :, :, self.num_boxes*5:]

        # Parse targets
        target_boxes = targets[:, :, :, :self.num_boxes*5].view(
            batch_size, self.grid_size, self.grid_size, self.num_boxes, 5)
        target_classes = targets[:, :, :, self.num_boxes*5:]

        # Loss components
        coord_loss = 0
        size_loss = 0
        conf_loss_obj = 0
        conf_loss_noobj = 0
        class_loss = 0
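
        # NOTE: the explicit loops below mirror the loss definition cell by
        # cell for readability; a real implementation would vectorize these
        # operations with masks.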

        for b in range(batch_size):
            for i in range(self.grid_size):
                for j in range(self.grid_size):

                    # Check if object exists
                    target_confidence = target_boxes[b, i, j, :, 4]
                    has_object = torch.any(target_confidence > 0)

                    if has_object:
                        # Find responsible bounding box
                        responsible_box_idx = self._find_responsible_box(
                            pred_boxes[b, i, j], target_boxes[b, i, j])

                        # Coordinate loss (x, y)
                        pred_xy = pred_boxes[b, i, j, responsible_box_idx, :2]
                        target_xy = target_boxes[b, i, j, responsible_box_idx, :2]
                        coord_loss += F.mse_loss(pred_xy, target_xy)

                        # Size loss (w, h) - take square root
                        pred_wh = pred_boxes[b, i, j, responsible_box_idx, 2:4]
                        target_wh = target_boxes[b, i, j, responsible_box_idx, 2:4]

                        # Prevent negative and zero values
                        pred_wh = torch.clamp(pred_wh, min=1e-6)
                        target_wh = torch.clamp(target_wh, min=1e-6)

                        size_loss += F.mse_loss(torch.sqrt(pred_wh), torch.sqrt(target_wh))

                        # Object confidence loss
                        pred_conf = pred_boxes[b, i, j, responsible_box_idx, 4]
                        target_conf = target_boxes[b, i, j, responsible_box_idx, 4]
                        conf_loss_obj += F.mse_loss(pred_conf, target_conf)

                        # Class loss
                        pred_class = pred_classes[b, i, j]
                        target_class = target_classes[b, i, j]
                        class_loss += F.mse_loss(pred_class, target_class)

                        # Other boxes confidence loss (no object)
                        for box_idx in range(self.num_boxes):
                            if box_idx != responsible_box_idx:
                                pred_conf_noobj = pred_boxes[b, i, j, box_idx, 4]
                                conf_loss_noobj += F.mse_loss(pred_conf_noobj, torch.zeros_like(pred_conf_noobj))

                    else:
                        # No-object confidence loss
                        for box_idx in range(self.num_boxes):
                            pred_conf_noobj = pred_boxes[b, i, j, box_idx, 4]
                            conf_loss_noobj += F.mse_loss(pred_conf_noobj, torch.zeros_like(pred_conf_noobj))

        # Total loss
        total_loss = (self.lambda_coord * coord_loss +
                     self.lambda_coord * size_loss +
                     conf_loss_obj +
                     self.lambda_noobj * conf_loss_noobj +
                     class_loss)

        loss_components = {
            # float() tolerates both plain zeros (no-object batches) and 0-dim tensors
            'coord_loss': float(coord_loss),
            'size_loss': float(size_loss),
            'conf_loss_obj': float(conf_loss_obj),
            'conf_loss_noobj': float(conf_loss_noobj),
            'class_loss': float(class_loss),
            'total_loss': float(total_loss)
        }

        return total_loss, loss_components

    def _find_responsible_box(self, pred_boxes, target_boxes):
        """Find the responsible bounding box"""
        max_iou = 0
        responsible_idx = 0

        for i in range(self.num_boxes):
            if target_boxes[i, 4] > 0:  # if object exists
                iou = self._calculate_iou(pred_boxes[i, :4], target_boxes[i, :4])
                if iou > max_iou:
                    max_iou = iou
                    responsible_idx = i

        return responsible_idx

    def _calculate_iou(self, box1, box2):
        """Calculate IoU"""
        # Convert to corner coordinates
        box1_x1 = box1[0] - box1[2] / 2
        box1_y1 = box1[1] - box1[3] / 2
        box1_x2 = box1[0] + box1[2] / 2
        box1_y2 = box1[1] + box1[3] / 2

        box2_x1 = box2[0] - box2[2] / 2
        box2_y1 = box2[1] - box2[3] / 2
        box2_x2 = box2[0] + box2[2] / 2
        box2_y2 = box2[1] + box2[3] / 2

        # Calculate intersection
        inter_x1 = torch.max(box1_x1, box2_x1)
        inter_y1 = torch.max(box1_y1, box2_y1)
        inter_x2 = torch.min(box1_x2, box2_x2)
        inter_y2 = torch.min(box1_y2, box2_y2)

        inter_area = torch.clamp(inter_x2 - inter_x1, min=0) * torch.clamp(inter_y2 - inter_y1, min=0)

        # Calculate union
        box1_area = (box1_x2 - box1_x1) * (box1_y2 - box1_y1)
        box2_area = (box2_x2 - box2_x1) * (box2_y2 - box2_y1)
        union_area = box1_area + box2_area - inter_area

        iou = inter_area / (union_area + 1e-6)
        return iou

    def loss_component_analysis(self):
        """Loss function component analysis"""
        loss_components = {
            "Coordinate Loss": {
                "Formula": "λ_coord × Σ[(x_pred - x_true)² + (y_pred - y_true)²]",
                "Weight": "λ_coord = 5",
                "Purpose": "Regress bounding box center coordinates",
                "Reason": "Coordinate prediction is important, given higher weight"
            },
            "Size Loss": {
                "Formula": "λ_coord × Σ[(√w_pred - √w_true)² + (√h_pred - √h_true)²]",
                "Weight": "λ_coord = 5",
                "Square Root": "Reduce size difference impact for large objects",
                "Purpose": "Regress bounding box width and height"
            },
            "Object Confidence Loss": {
                "Formula": "Σ[(C_pred - IoU)²]",
                "Weight": "1.0",
                "Target": "IoU value as confidence label",
                "Purpose": "Predict probability of containing object"
            },
            "No-Object Confidence Loss": {
                "Formula": "λ_noobj × Σ[(C_pred - 0)²]",
                "Weight": "λ_noobj = 0.5",
                "Downweight": "Most grids have no objects, reduce weight for balance",
                "Purpose": "Suppress confidence in background regions"
            },
            "Classification Loss": {
                "Formula": "Σ[(P_pred(c) - P_true(c))²]",
                "Weight": "1.0",
                "Condition": "Only calculated for grids with objects",
                "Purpose": "Predict object class probabilities"
            }
        }

        print("YOLO v1 Loss Function Components:")
        print("=" * 35)

        for component, details in loss_components.items():
            print(f"\n{component}:")
            for key, value in details.items():
                print(f"  {key}: {value}")

        return loss_components

    def loss_balancing_strategy(self):
        """Loss balancing strategy"""
        balancing_reasons = {
            "λ_coord = 5": {
                "Problem": "Coordinate loss has small proportion in total loss",
                "Reason": "Most grids have no objects, classification and confidence loss dominate",
                "Solution": "Increase coordinate loss weight, emphasize localization importance"
            },
            "λ_noobj = 0.5": {
                "Problem": "Number of no-object grids far exceeds object grids",
                "Reason": "7×7=49 grids, usually only 1-3 contain objects",
                "Solution": "Reduce no-object confidence loss weight"
            },
            "Square Root Size": {
                "Problem": "Size errors of large objects impact loss too much",
                "Reason": "Few pixels offset in large objects differs from one pixel in small objects",
                "Solution": "Take square root of width/height to reduce size difference"
            },
            "MSE Loss": {
                "Choice": "All loss components use mean squared error",
                "Pros": "Simple, stable, easy to optimize",
                "Cons": "Sensitive to outliers"
            }
        }

        print("Loss Balancing Strategy:")
        print("=" * 20)

        for strategy, details in balancing_reasons.items():
            print(f"\n{strategy}:")
            for key, value in details.items():
                print(f"  {key}: {value}")

        return balancing_reasons

# Usage example
yolo_loss = YOLOv1Loss()

# Loss function component analysis
loss_analysis = yolo_loss.loss_component_analysis()

# Loss balancing strategy
balancing_strategy = yolo_loss.loss_balancing_strategy()

# Create simulated data to test loss function
print("\nLoss Function Test:")
print("-" * 15)

batch_size, grid_size, num_boxes, num_classes = 2, 7, 2, 20
output_size = num_boxes * 5 + num_classes

# Simulate predictions and targets
predictions = torch.randn(batch_size, grid_size, grid_size, output_size)
targets = torch.zeros(batch_size, grid_size, grid_size, output_size)

# Set some targets
targets[0, 3, 3, 4] = 0.8  # first bounding box confidence
targets[0, 3, 3, :4] = torch.tensor([0.5, 0.5, 0.3, 0.4])  # coordinates
targets[0, 3, 3, 10] = 1.0  # first class

# Calculate loss
total_loss, loss_components = yolo_loss.yolo_loss_function(predictions, targets)

print(f"Total Loss: {total_loss:.4f}")
print("Loss Components:")
for component, value in loss_components.items():
    print(f"  {component}: {value:.4f}")
