Chapter 5: YOLO Series Evolution (v2-v5)

Author: Haiyue
Estimated time: 50 minutes

Chapter 5: YOLO Series Evolution (v2-v5)

Learning Objectives

  1. Master the key improvements of YOLO v2 (anchor boxes, batch normalization, multi-scale training, etc.)
  2. Understand the feature pyramid and multi-scale detection mechanism of YOLO v3
  3. Learn about the engineering tricks integration and performance optimization of YOLO v4
  4. Familiarize yourself with the practical improvements and deployment optimization of YOLO v5

5.1 YOLO v2 (YOLO9000) - Better, Faster, Stronger

5.1.1 Core Improvements

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

class YOLOv2Improvements:
    """Catalogue of the YOLO v2 (YOLO9000) improvements, with small runnable
    demos of the anchor mechanism, the passthrough layer, and multi-scale
    training."""

    def __init__(self):
        # Summary table of the seven headline improvements; the module-level
        # demo code below only pretty-prints this dictionary.
        self.improvements = {
            "Batch Normalization": {
                "Effect": "Accelerates convergence, improves stability",
                "Location": "After each convolutional layer",
                "Result": "mAP improved by 2%, removes Dropout"
            },
            "High Resolution Classifier": {
                "Pretraining": "448×448 classification task",
                "Result": "mAP improved by 4%",
                "Principle": "Adapts to high resolution input"
            },
            "Anchor Mechanism": {
                "Concept": "Predefined bounding box shapes",
                "Quantity": "5 anchor boxes",
                "Result": "Recall improved from 81% to 88%"
            },
            "Dimension Clustering": {
                "Method": "K-means clustering to select anchors",
                "Distance": "1-IoU as distance metric",
                "Result": "Anchors better suited to dataset"
            },
            "Direct Location Prediction": {
                "Problem": "Anchors can appear anywhere in image",
                "Solution": "Use sigmoid to constrain offset",
                "Stability": "More stable training"
            },
            "Fine-Grained Features": {
                "Method": "Passthrough layer",
                "Fusion": "26×26 features with 13×13 features",
                "Result": "Improved small object detection"
            },
            "Multi-Scale Training": {
                "Sizes": "320-608 pixels, 32-pixel intervals",
                "Frequency": "Change size every 10 batches",
                "Generalization": "Improves generalization at different scales"
            }
        }

    def anchor_mechanism(self):
        """Demonstrate the YOLO v2 anchor mechanism.

        Returns:
            (AnchorGenerator, predict_bbox): the anchor-generator class and
            the direct-location-prediction decoding function.
        """

        class AnchorGenerator:
            """Generates grid-centred anchor boxes and clusters dataset boxes
            into anchor sizes."""

            def __init__(self, anchor_sizes, grid_size=13):
                self.anchor_sizes = anchor_sizes  # [(w1,h1), (w2,h2), ...]
                self.grid_size = grid_size  # feature-map cells per side

            def generate_anchors(self):
                """Generate all anchor boxes.

                Returns a (grid_size² × len(anchor_sizes), 4) array of
                [cx, cy, w, h].  cx/cy are normalised to [0, 1]; w/h are
                left in whatever units ``anchor_sizes`` uses (grid-cell
                units in the demo below) — note the mixed units.
                """
                anchors = []

                for i in range(self.grid_size):
                    for j in range(self.grid_size):
                        for w, h in self.anchor_sizes:
                            # Anchor center at grid center
                            cx = (j + 0.5) / self.grid_size
                            cy = (i + 0.5) / self.grid_size

                            anchors.append([cx, cy, w, h])

                return np.array(anchors)

            def kmeans_anchors(self, boxes, k=5):
                """Cluster the sizes of ``boxes`` ([x1, y1, x2, y2] rows)
                into ``k`` anchor sizes, sorted by area.

                NOTE(review): the YOLO v2 paper (and ``self.improvements``
                above) describes clustering with a 1-IoU distance; sklearn's
                KMeans uses Euclidean distance instead — confirm this
                simplification is intended.
                """
                # Extract width and height
                widths = boxes[:, 2] - boxes[:, 0]
                heights = boxes[:, 3] - boxes[:, 1]
                sizes = np.column_stack([widths, heights])

                # K-means clustering (local import keeps sklearn optional)
                from sklearn.cluster import KMeans
                kmeans = KMeans(n_clusters=k, random_state=42)
                kmeans.fit(sizes)

                # Return cluster centers as anchor sizes
                anchor_sizes = kmeans.cluster_centers_

                # Sort by area
                areas = anchor_sizes[:, 0] * anchor_sizes[:, 1]
                sorted_indices = np.argsort(areas)

                return anchor_sizes[sorted_indices]

        # Improved location prediction
        def direct_location_prediction():
            """Build and return the YOLO v2 box-decoding function."""

            # YOLO v1 problem: predicting (x, y) can be unstable
            # YOLO v2 solution: predict offset, constrain with sigmoid

            def predict_bbox(tx, ty, tw, th, anchor_w, anchor_h, grid_x, grid_y, grid_size):
                """
                Decode raw network outputs into a box.

                tx, ty, tw, th: Network predictions
                anchor_w, anchor_h: Anchor dimensions
                grid_x, grid_y: Grid coordinates

                NOTE(review): bx/by are normalised by grid_size below but
                bw/bh are returned in anchor units — confirm callers expect
                these mixed units.
                """
                # Center point prediction (sigmoid constrains within grid)
                bx = torch.sigmoid(tx) + grid_x
                by = torch.sigmoid(ty) + grid_y

                # Width and height prediction (exponential transform)
                bw = anchor_w * torch.exp(tw)
                bh = anchor_h * torch.exp(th)

                # Normalize to [0,1]
                bx = bx / grid_size
                by = by / grid_size

                return bx, by, bw, bh

            return predict_bbox

        print("YOLO v2 Anchor Mechanism:")
        print("=" * 25)

        # Example anchor generation — presumably the YOLOv2 VOC anchors in
        # 13×13 grid-cell units (all values < 13); TODO confirm
        anchor_sizes = [(1.3221, 1.73145), (3.19275, 4.00944), (5.05587, 8.09892),
                       (9.47112, 4.84053), (11.2364, 10.0071)]

        anchor_gen = AnchorGenerator(anchor_sizes)
        anchors = anchor_gen.generate_anchors()

        print(f"Number of anchors: {len(anchors)}")
        print(f"First 5 anchors: {anchors[:5]}")

        return AnchorGenerator, direct_location_prediction()

    def passthrough_layer(self):
        """Demonstrate the passthrough (space-to-depth) layer.

        Returns:
            (PassthroughLayer, feature_fusion_example)
        """

        class PassthroughLayer(nn.Module):
            """Space-to-depth reshuffle: trades spatial resolution for
            channels so a high-resolution map can be concatenated with a
            lower-resolution one."""

            def __init__(self, stride=2):
                super(PassthroughLayer, self).__init__()
                self.stride = stride

            def forward(self, x):
                """
                Reorganize 26×26×512 feature map to 13×13×2048
                (in general: H×W×C -> (H/s)×(W/s)×(C·s²)).
                """
                batch_size, channels, height, width = x.size()

                # Ensure dimensions are divisible by stride
                assert height % self.stride == 0 and width % self.stride == 0

                new_height = height // self.stride
                new_width = width // self.stride

                # Reorganize tensor: split each spatial axis into
                # (blocks, stride), then fold the stride factors into channels
                x = x.view(batch_size, channels, new_height, self.stride, new_width, self.stride)
                x = x.permute(0, 1, 3, 5, 2, 4).contiguous()
                x = x.view(batch_size, channels * self.stride * self.stride, new_height, new_width)

                return x

        # Feature fusion example
        def feature_fusion_example():
            """Show passthrough output concatenated with the 13×13 map."""

            # High resolution features (26×26×512)
            high_res_feat = torch.randn(1, 512, 26, 26)

            # Low resolution features (13×13×1024)
            low_res_feat = torch.randn(1, 1024, 13, 13)

            # Passthrough layer
            passthrough = PassthroughLayer(stride=2)
            transformed_feat = passthrough(high_res_feat)

            print(f"High resolution features: {high_res_feat.shape}")
            print(f"After Passthrough: {transformed_feat.shape}")
            print(f"Low resolution features: {low_res_feat.shape}")

            # Feature fusion (channel-wise concatenation)
            fused_feat = torch.cat([low_res_feat, transformed_feat], dim=1)
            print(f"Fused features: {fused_feat.shape}")

            return fused_feat

        return PassthroughLayer, feature_fusion_example

    def multi_scale_training(self):
        """Demonstrate YOLO v2 multi-scale training.

        Returns:
            (MultiScaleTraining, multi_scale_benefits)
        """

        class MultiScaleTraining:
            """Resizes training batches to a randomly chosen input scale
            every 10 steps, as in YOLO v2."""

            def __init__(self, min_size=320, max_size=608, step=32):
                self.min_size = min_size
                self.max_size = max_size
                self.step = step
                # All sizes from min to max inclusive, multiples of ``step``
                self.scales = list(range(min_size, max_size + step, step))
                self.current_scale = 416  # Default size

            def get_random_scale(self):
                """Randomly select training size"""
                return np.random.choice(self.scales)

            def resize_batch(self, images, targets, new_size):
                """Resize an image batch and its targets to ``new_size``.

                NOTE(review): ``targets`` is scaled in place (the caller's
                tensor is mutated), and the scale factor assumes square
                inputs (only the last dimension is read) — confirm both
                are intended.
                """
                # Image resizing
                resized_images = F.interpolate(images, size=(new_size, new_size),
                                             mode='bilinear', align_corners=False)

                # Target coordinate adjustment
                scale_factor = new_size / images.size(-1)

                if targets is not None:
                    # Assume targets format is [batch_idx, class, x, y, w, h]
                    targets[:, 2:] *= scale_factor

                return resized_images, targets

            def training_step(self, model, images, targets, step_count):
                """Training step (including size adjustment)"""

                # Adjust size every 10 batches
                if step_count % 10 == 0:
                    self.current_scale = self.get_random_scale()
                    print(f"Switching to size: {self.current_scale}")

                # Adjust input size
                resized_images, resized_targets = self.resize_batch(
                    images, targets, self.current_scale)

                # Model forward pass
                outputs = model(resized_images)

                return outputs, resized_targets

        multi_scale_benefits = {
            "Robustness": "Adapts to different sized inputs",
            "Generalization": "Improves performance at different resolutions",
            "Practicality": "Same model works for multiple application scenarios",
            "Efficiency": "Can adjust inference size based on accuracy requirements"
        }

        print("Multi-scale training advantages:")
        print("=" * 20)
        for benefit, desc in multi_scale_benefits.items():
            print(f"  {benefit}: {desc}")

        return MultiScaleTraining, multi_scale_benefits

# Usage example: walk through each YOLO v2 improvement demo.
yolo_v2 = YOLOv2Improvements()

print("YOLO v2 Main Improvements:")
print("=" * 25)
for title, fields in yolo_v2.improvements.items():
    print(f"\n{title}:")
    for label, text in fields.items():
        print(f"  {label}: {text}")

# Anchor mechanism demo
AnchorGenerator, bbox_prediction = yolo_v2.anchor_mechanism()

# Passthrough layer demo
PassthroughLayer, feature_fusion = yolo_v2.passthrough_layer()

# Multi-scale training demo
MultiScaleTraining, benefits = yolo_v2.multi_scale_training()

# Demonstrate feature fusion
print("\nFeature Fusion Demo:")
print("-" * 15)
fused_features = feature_fusion()

5.2 YOLO v3 - Multi-Scale Prediction

5.2.1 Darknet-53 Backbone

class YOLOv3Architecture:
    """YOLO v3 building blocks: the Darknet-53 backbone, an FPN-style
    multi-scale detection network, and the 9-anchor / 3-scale scheme."""

    def __init__(self):
        # Headline features (consumed by the demo printout at module level).
        self.key_features = {
            "Multi-scale Prediction": "3 feature maps at different scales",
            "Feature Pyramid": "FPN-like feature fusion",
            "Darknet-53": "Backbone network with residual connections",
            "Pointwise Convolution": "1×1 convolution for dimensionality reduction",
            "Binary Classification Loss": "Independent sigmoid for each class"
        }

    def build_darknet53(self):
        """Build the Darknet-53 backbone.

        Returns:
            (Darknet53, ConvBNLeaky): the backbone class and the basic
            Conv+BN+LeakyReLU block, exposed so other builders can reuse it.
        """

        class ConvBNLeaky(nn.Module):
            """Conv2d (no bias) + BatchNorm + LeakyReLU(0.1)."""

            def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0):
                super(ConvBNLeaky, self).__init__()
                # bias=False: BatchNorm supplies the affine shift.
                self.conv = nn.Conv2d(in_channels, out_channels, kernel_size,
                                    stride, padding, bias=False)
                self.bn = nn.BatchNorm2d(out_channels)
                self.leaky = nn.LeakyReLU(0.1, inplace=True)

            def forward(self, x):
                return self.leaky(self.bn(self.conv(x)))

        class ResidualBlock(nn.Module):
            """Darknet residual unit: 1×1 bottleneck then 3×3, plus identity."""

            def __init__(self, channels):
                super(ResidualBlock, self).__init__()
                self.conv1 = ConvBNLeaky(channels, channels // 2, 1)
                self.conv2 = ConvBNLeaky(channels // 2, channels, 3, padding=1)

            def forward(self, x):
                out = self.conv2(self.conv1(x))
                return out + x

        class Darknet53(nn.Module):
            """Darknet-53 backbone; returns feature maps at strides 8/16/32."""

            def __init__(self):
                super(Darknet53, self).__init__()

                # Stem
                self.conv1 = ConvBNLeaky(3, 32, 3, padding=1)
                self.conv2 = ConvBNLeaky(32, 64, 3, stride=2, padding=1)

                # Residual stages (1, 2, 8, 8, 4 blocks); each stage is
                # followed by a stride-2 conv that halves the resolution.
                self.res_block1 = self._make_layer(64, 1)
                self.conv3 = ConvBNLeaky(64, 128, 3, stride=2, padding=1)

                self.res_block2 = self._make_layer(128, 2)
                self.conv4 = ConvBNLeaky(128, 256, 3, stride=2, padding=1)

                self.res_block3 = self._make_layer(256, 8)
                self.conv5 = ConvBNLeaky(256, 512, 3, stride=2, padding=1)

                self.res_block4 = self._make_layer(512, 8)
                self.conv6 = ConvBNLeaky(512, 1024, 3, stride=2, padding=1)

                self.res_block5 = self._make_layer(1024, 4)

            def _make_layer(self, channels, num_blocks):
                # Stack num_blocks residual units at a fixed channel width.
                return nn.Sequential(*[ResidualBlock(channels) for _ in range(num_blocks)])

            def forward(self, x):
                x = self.conv2(self.conv1(x))

                x = self.conv3(self.res_block1(x))

                x = self.conv4(self.res_block2(x))

                x = self.res_block3(x)
                route1 = x  # stride-8 map (52×52 for a 416 input)
                x = self.conv5(x)

                x = self.res_block4(x)
                route2 = x  # stride-16 map (26×26 for a 416 input)
                x = self.conv6(x)

                x = self.res_block5(x)  # stride-32 map (13×13 for a 416 input)

                return route1, route2, x

        return Darknet53, ConvBNLeaky

    def feature_pyramid_network(self):
        """Build the YOLO v3 FPN-style detector class.

        Bug fix: the inner ``YOLOv3FPN`` previously called
        ``self.build_darknet53()`` — a method of ``YOLOv3Architecture``, not
        of ``nn.Module`` — and referenced ``ConvBNLeaky``, which was not in
        scope, so instantiating it raised at runtime.  Both names are now
        resolved here, at closure level, before the class is defined.

        Returns:
            YOLOv3FPN: an ``nn.Module`` subclass producing detections at
            three scales.
        """
        # Resolve backbone/building-block classes once; the inner class
        # captures them via closure.
        Darknet53, ConvBNLeaky = self.build_darknet53()

        class YOLOv3FPN(nn.Module):
            """Darknet-53 + top-down feature fusion + 3 detection heads."""

            def __init__(self, num_classes=80, num_anchors=3):
                super(YOLOv3FPN, self).__init__()
                self.num_classes = num_classes
                self.num_anchors = num_anchors

                # Darknet-53 backbone (class captured from enclosing scope)
                self.backbone = Darknet53()

                # Detection heads for strides 32 / 16 / 8
                self.detection_head1 = self._make_detection_head(1024, 512)
                self.detection_head2 = self._make_detection_head(768, 256)  # 512 + 256
                self.detection_head3 = self._make_detection_head(384, 128)  # 256 + 128

                # Upsampling for the top-down pathway
                self.upsample1 = nn.Upsample(scale_factor=2, mode='nearest')
                self.upsample2 = nn.Upsample(scale_factor=2, mode='nearest')

                # 1×1 convolutions for dimensionality reduction before upsampling
                self.conv_reduce1 = ConvBNLeaky(512, 256, 1)
                self.conv_reduce2 = ConvBNLeaky(256, 128, 1)

            def _make_detection_head(self, in_channels, mid_channels):
                """5 alternating 1×1 / 3×3 convs plus the final prediction conv."""
                layers = []

                # 5 convolutional layers
                for i in range(5):
                    if i % 2 == 0:
                        layers.append(ConvBNLeaky(in_channels if i == 0 else mid_channels * 2,
                                                mid_channels, 1))
                    else:
                        layers.append(ConvBNLeaky(mid_channels, mid_channels * 2, 3, padding=1))

                # num_anchors × (tx, ty, tw, th, objectness + class scores) per cell
                detection_conv = nn.Conv2d(mid_channels,
                                         self.num_anchors * (5 + self.num_classes),
                                         1)
                layers.append(detection_conv)

                return nn.Sequential(*layers)

            def forward(self, x):
                # Backbone forward pass (strides 8, 16, 32)
                route1, route2, x = self.backbone(x)

                # First scale detection on the coarsest map
                detection1 = self.detection_head1(x)

                # Upsample and fuse with the stride-16 route
                x = self.conv_reduce1(x[:, :512])  # take first 512 channels
                x = self.upsample1(x)
                x = torch.cat([x, route2], dim=1)
                detection2 = self.detection_head2(x)

                # Upsample and fuse with the stride-8 route
                x = self.conv_reduce2(x[:, :256])  # take first 256 channels
                x = self.upsample2(x)
                x = torch.cat([x, route1], dim=1)
                detection3 = self.detection_head3(x)

                return detection1, detection2, detection3

        return YOLOv3FPN

    def multi_scale_anchors(self):
        """Print and return YOLO v3's 9 anchors and the scale assignment.

        Returns:
            (anchors, strategy): dicts keyed by scale / rationale.
        """

        # YOLOv3's 9 anchors (3 scales × 3 anchors)
        anchors = {
            "Large scale (13×13)": [(116, 90), (156, 198), (373, 326)],
            "Medium scale (26×26)": [(30, 61), (62, 45), (59, 119)],
            "Small scale (52×52)": [(10, 13), (16, 30), (33, 23)]
        }

        def assign_anchors_to_scales():
            """Describe how anchors are distributed across feature-map scales."""
            assignment_strategy = {
                "Principle": "Assign anchors to appropriate scales based on size",
                "Large objects": "Assign to low-resolution feature map (13×13)",
                "Medium objects": "Assign to medium-resolution feature map (26×26)",
                "Small objects": "Assign to high-resolution feature map (52×52)",
                "Advantage": "Each scale focuses on specific object sizes"
            }

            return assignment_strategy

        print("YOLOv3 Multi-scale Anchors:")
        print("=" * 25)

        for scale, anchor_list in anchors.items():
            print(f"\n{scale}:")
            for i, (w, h) in enumerate(anchor_list):
                print(f"  Anchor{i+1}: {w}×{h}")

        strategy = assign_anchors_to_scales()
        print(f"\nAssignment strategy:")
        for key, value in strategy.items():
            print(f"  {key}: {value}")

        return anchors, strategy

# Usage example: print the feature table, then build the backbone.
yolo_v3 = YOLOv3Architecture()

print("YOLO v3 Key Features:")
print("=" * 25)
for name, desc in yolo_v3.key_features.items():
    print(f"  {name}: {desc}")

# Instantiate the Darknet-53 backbone
Darknet53, ConvBNLeaky = yolo_v3.build_darknet53()
backbone = Darknet53()

# Count parameters
def count_parameters(model):
    """Return the total number of trainable parameters in *model*."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

print(f"\nDarknet-53 parameters: {count_parameters(backbone):,}")

# Run a dummy image through the backbone to show the three output scales
test_input = torch.randn(1, 3, 416, 416)
with torch.no_grad():
    route1, route2, output = backbone(test_input)
print(f"\nFeature map sizes:")
print(f"  route1 (52×52): {route1.shape}")
print(f"  route2 (26×26): {route2.shape}")
print(f"  output (13×13): {output.shape}")

# Multi-scale anchors
anchors, strategy = yolo_v3.multi_scale_anchors()

5.3 YOLO v4 - Engineering Tricks Integration

5.3.1 Bag of Freebies and Specials

class YOLOv4Optimizations:
    """Catalogue of YOLO v4's "Bag of Freebies" (training-time tricks) and
    "Bag of Specials" (light inference-time modules), with demo
    implementations of Mosaic/CutMix, Mish, CIoU loss and a CSP block."""

    def __init__(self):
        # Training-time improvements that add no inference cost.
        self.bag_of_freebies = {
            "Data Augmentation": {
                "Mosaic": "4-image stitching",
                "CutMix": "Image cropping and mixing",
                "MixUp": "Linear image mixing",
                "Self-Adversarial Training": "Adversarial sample augmentation"
            },
            "Regularization": {
                "DropBlock": "Structured Dropout",
                "Label Smoothing": "Label smoothing",
                "Class label smoothing": "Class label smoothing"
            },
            "Loss Functions": {
                "CIoU Loss": "Complete IoU loss",
                "Focal Loss": "Hard example mining loss",
                "DIoU Loss": "Distance IoU loss"
            }
        }

        # Modules that trade a little inference compute for accuracy.
        self.bag_of_specials = {
            "Activation Functions": {
                "Mish": "Self-gated activation function",
                "Swish": "Self-gated linear unit",
                "ReLU6": "Truncated ReLU"
            },
            "Attention Mechanisms": {
                "SE": "Squeeze-and-Excitation",
                "CBAM": "Convolutional Block Attention Module",
                "ECA": "Efficient Channel Attention"
            },
            "Normalization": {
                "Cross-stage": "Cross-stage partial connections",
                "Cross mini-Batch": "Cross mini-batch normalization"
            },
            "Skip Connections": {
                "Residual": "Residual connections",
                "Weighted residual": "Weighted residual connections",
                "Multi-input weighted": "Multi-input weighted connections"
            }
        }

    def mosaic_augmentation(self):
        """Return the MosaicAugmentation helper class.

        NOTE(review): ``mosaic_augment`` calls ``cv2.resize`` but this file
        never imports cv2, so calling it as-is raises NameError — add
        ``import cv2`` before use.
        """

        class MosaicAugmentation:
            """Mosaic and CutMix data-augmentation demos (numpy images)."""

            def __init__(self, image_size=640):
                self.image_size = image_size  # side length of the output mosaic

            def mosaic_augment(self, images, targets):
                """
                Mosaic augmentation: stitch 4 images into one
                images: List of 4 images
                targets: Corresponding annotation list
                    (assumed absolute-pixel boxes with x in columns 0/2
                    and y in columns 1/3 — TODO confirm against callers)
                """
                assert len(images) == 4, "Mosaic requires 4 images"

                # Randomly choose stitching center point (kept in the middle
                # half of the canvas so no quadrant collapses)
                cut_x = np.random.randint(self.image_size // 4, 3 * self.image_size // 4)
                cut_y = np.random.randint(self.image_size // 4, 3 * self.image_size // 4)

                # Create output image
                mosaic_image = np.zeros((self.image_size, self.image_size, 3), dtype=np.uint8)
                mosaic_targets = []

                # Define positions for 4 quadrants
                positions = [
                    (0, 0, cut_x, cut_y),           # Top left
                    (cut_x, 0, self.image_size, cut_y),     # Top right
                    (0, cut_y, cut_x, self.image_size),     # Bottom left
                    (cut_x, cut_y, self.image_size, self.image_size)  # Bottom right
                ]

                for i, (image, target) in enumerate(zip(images, targets)):
                    x1, y1, x2, y2 = positions[i]

                    # Resize image to fit region (aspect ratio preserved)
                    h, w = image.shape[:2]
                    scale = min((x2 - x1) / w, (y2 - y1) / h)

                    new_w = int(w * scale)
                    new_h = int(h * scale)

                    resized_image = cv2.resize(image, (new_w, new_h))  # needs `import cv2` (missing at module level)

                    # Place image
                    mosaic_image[y1:y1+new_h, x1:x1+new_w] = resized_image

                    # Adjust annotations into mosaic coordinates
                    if target is not None:
                        adjusted_target = target.copy()
                        adjusted_target[:, [0, 2]] = adjusted_target[:, [0, 2]] * scale + x1
                        adjusted_target[:, [1, 3]] = adjusted_target[:, [1, 3]] * scale + y1
                        mosaic_targets.append(adjusted_target)

                # Merge all annotations
                if mosaic_targets:
                    mosaic_targets = np.concatenate(mosaic_targets, axis=0)

                return mosaic_image, mosaic_targets

            def cutmix_augment(self, image1, target1, image2, target2, alpha=1.0):
                """CutMix augmentation: paste a random crop of image2 into image1.

                NOTE(review): boxes from target1 that fall inside the pasted
                region are kept unchanged, and the mixing ratio ``lam`` is
                not returned — both are simplifications relative to the
                original CutMix recipe; confirm they are intended.
                """
                lam = np.random.beta(alpha, alpha)

                h, w = image1.shape[:2]
                # Cut size chosen so the pasted area is (1 - lam) of the image
                cut_rat = np.sqrt(1. - lam)
                cut_w = int(w * cut_rat)
                cut_h = int(h * cut_rat)

                # Randomly select cut position
                cx = np.random.randint(w)
                cy = np.random.randint(h)

                # Clip the cut rectangle to the image bounds
                bbx1 = np.clip(cx - cut_w // 2, 0, w)
                bby1 = np.clip(cy - cut_h // 2, 0, h)
                bbx2 = np.clip(cx + cut_w // 2, 0, w)
                bby2 = np.clip(cy + cut_h // 2, 0, h)

                # Execute CutMix
                mixed_image = image1.copy()
                mixed_image[bby1:bby2, bbx1:bbx2] = image2[bby1:bby2, bbx1:bbx2]

                # Mix annotations
                mixed_targets = []
                if target1 is not None:
                    mixed_targets.append(target1)
                if target2 is not None:
                    # Keep only image2 boxes that overlap the pasted region
                    valid_targets = []
                    for target in target2:
                        x1, y1, x2, y2 = target[:4]
                        if not (x2 < bbx1 or x1 > bbx2 or y2 < bby1 or y1 > bby2):
                            valid_targets.append(target)
                    if valid_targets:
                        mixed_targets.append(np.array(valid_targets))

                if mixed_targets:
                    mixed_targets = np.concatenate(mixed_targets, axis=0)

                return mixed_image, mixed_targets

        return MosaicAugmentation

    def mish_activation(self):
        """Return the Mish activation module and a comparison helper."""

        class Mish(nn.Module):
            """Mish activation: x * tanh(softplus(x))."""

            def __init__(self):
                super(Mish, self).__init__()

            def forward(self, x):
                return x * torch.tanh(F.softplus(x))

        def mish_vs_others():
            """Evaluate Mish/ReLU/Swish/LeakyReLU on a 1-D grid and print a
            qualitative comparison."""
            x = torch.linspace(-3, 3, 1000)

            activations = {
                'ReLU': F.relu(x),
                'Swish': x * torch.sigmoid(x),
                'Mish': x * torch.tanh(F.softplus(x)),
                'LeakyReLU': F.leaky_relu(x, 0.1)
            }

            properties = {
                'ReLU': "Simple and fast, but suffers from gradient vanishing",
                'Swish': "Smooth, self-gated, good performance",
                'Mish': "Smoother, better convergence, higher accuracy",
                'LeakyReLU': "Alleviates gradient vanishing, but not self-gated"
            }

            print("Activation function comparison:")
            print("=" * 25)
            for name, prop in properties.items():
                print(f"  {name}: {prop}")

            return activations, properties

        return Mish, mish_vs_others

    def ciou_loss(self):
        """Return the CIoU loss function and a loss-family comparison table."""

        def ciou_loss_function(pred_boxes, target_boxes):
            """
            CIoU loss function
            Considers overlap area, center distance, aspect ratio
            pred_boxes/target_boxes: (N, 4) tensors, assumed [x1, y1, x2, y2].
            """
            # Calculate IoU
            def calculate_iou(box1, box2):
                x1 = torch.max(box1[:, 0], box2[:, 0])
                y1 = torch.max(box1[:, 1], box2[:, 1])
                x2 = torch.min(box1[:, 2], box2[:, 2])
                y2 = torch.min(box1[:, 3], box2[:, 3])

                # clamp(min=0) zeroes out non-overlapping pairs
                intersection = torch.clamp(x2 - x1, min=0) * torch.clamp(y2 - y1, min=0)
                area1 = (box1[:, 2] - box1[:, 0]) * (box1[:, 3] - box1[:, 1])
                area2 = (box2[:, 2] - box2[:, 0]) * (box2[:, 3] - box2[:, 1])
                union = area1 + area2 - intersection

                return intersection / (union + 1e-6)

            # Calculate squared center distance (rho² in the CIoU formula)
            def center_distance(box1, box2):
                center1_x = (box1[:, 0] + box1[:, 2]) / 2
                center1_y = (box1[:, 1] + box1[:, 3]) / 2
                center2_x = (box2[:, 0] + box2[:, 2]) / 2
                center2_y = (box2[:, 1] + box2[:, 3]) / 2

                return (center1_x - center2_x)**2 + (center1_y - center2_y)**2

            # Calculate squared diagonal of the minimum enclosing box (c²)
            def diagonal_length(box1, box2):
                c_x = torch.max(box1[:, 2], box2[:, 2]) - torch.min(box1[:, 0], box2[:, 0])
                c_y = torch.max(box1[:, 3], box2[:, 3]) - torch.min(box1[:, 1], box2[:, 1])
                return c_x**2 + c_y**2

            # Calculate aspect ratio consistency term v
            def aspect_ratio_consistency(box1, box2):
                w1 = box1[:, 2] - box1[:, 0]
                h1 = box1[:, 3] - box1[:, 1]
                w2 = box2[:, 2] - box2[:, 0]
                h2 = box2[:, 3] - box2[:, 1]

                v = (4 / (torch.pi**2)) * torch.pow(torch.atan(w2/h2) - torch.atan(w1/h1), 2)
                return v

            # Calculate CIoU
            iou = calculate_iou(pred_boxes, target_boxes)
            rho2 = center_distance(pred_boxes, target_boxes)
            c2 = diagonal_length(pred_boxes, target_boxes)
            v = aspect_ratio_consistency(pred_boxes, target_boxes)

            # alpha is treated as a constant w.r.t. gradients (no_grad block)
            with torch.no_grad():
                alpha = v / (1 - iou + v + 1e-6)

            ciou = iou - rho2 / (c2 + 1e-6) - alpha * v

            return 1 - ciou  # CIoU loss

        loss_comparison = {
            "IoU Loss": "Only considers overlap area",
            "GIoU Loss": "Considers minimum enclosing box",
            "DIoU Loss": "Additionally considers center distance",
            "CIoU Loss": "Also considers aspect ratio consistency",
            "Advantage": "Faster convergence, more accurate regression"
        }

        print("CIoU Loss advantages:")
        print("=" * 20)
        for loss_type, description in loss_comparison.items():
            print(f"  {loss_type}: {description}")

        return ciou_loss_function, loss_comparison

    def csp_darknet53(self):
        """Return the CSP block class and its advantages table."""

        class CSPBlock(nn.Module):
            """Cross-Stage-Partial block: split the input into two 1×1
            branches, run residual units on one branch, then re-merge."""

            def __init__(self, in_channels, out_channels, num_blocks):
                super(CSPBlock, self).__init__()

                # Two parallel 1×1 projections, each to half the output width
                self.conv1 = nn.Conv2d(in_channels, out_channels // 2, 1, bias=False)
                self.conv2 = nn.Conv2d(in_channels, out_channels // 2, 1, bias=False)

                # Residual blocks (applied only to the conv2 branch)
                self.res_blocks = nn.ModuleList()
                for _ in range(num_blocks):
                    self.res_blocks.append(nn.Sequential(
                        nn.Conv2d(out_channels // 2, out_channels // 2, 1, bias=False),
                        nn.BatchNorm2d(out_channels // 2),
                        nn.LeakyReLU(0.1, inplace=True),
                        nn.Conv2d(out_channels // 2, out_channels // 2, 3, padding=1, bias=False),
                        nn.BatchNorm2d(out_channels // 2),
                        nn.LeakyReLU(0.1, inplace=True)
                    ))

                # Transition after re-merging the two branches
                self.conv3 = nn.Conv2d(out_channels, out_channels, 1, bias=False)
                self.bn = nn.BatchNorm2d(out_channels)
                self.activation = nn.LeakyReLU(0.1, inplace=True)

            def forward(self, x):
                # Split features into the two partial branches
                x1 = self.conv1(x)
                x2 = self.conv2(x)

                # Residual connections on the second branch only
                for res_block in self.res_blocks:
                    x2 = x2 + res_block(x2)

                # Feature fusion: concatenate branches, then 1×1 transition
                out = torch.cat([x1, x2], dim=1)
                out = self.conv3(out)
                out = self.bn(out)
                out = self.activation(out)

                return out

        csp_advantages = {
            "Gradient Flow": "Splits gradient flow, reduces computation",
            "Feature Reuse": "Better feature reuse",
            "Parameter Efficiency": "Fewer parameters for same accuracy",
            "Inference Speed": "Faster inference"
        }

        print("CSP advantages:")
        print("=" * 10)
        for advantage, description in csp_advantages.items():
            print(f"  {advantage}: {description}")

        return CSPBlock, csp_advantages

# Usage example: dump both trick tables, then build each demo component.
yolo_v4 = YOLOv4Optimizations()

def _print_trick_table(header, table):
    # Shared pretty-printer for the two "bag of ..." tables.
    print(header)
    print("=" * 30)
    for group, tricks in table.items():
        print(f"\n{group}:")
        for trick, note in tricks.items():
            print(f"  {trick}: {note}")

_print_trick_table("YOLO v4 Bag of Freebies:", yolo_v4.bag_of_freebies)
_print_trick_table("\nYOLO v4 Bag of Specials:", yolo_v4.bag_of_specials)

# Mosaic augmentation
MosaicAugmentation = yolo_v4.mosaic_augmentation()

# Mish activation function
Mish, mish_comparison = yolo_v4.mish_activation()
activations, properties = mish_comparison()

# CIoU loss
ciou_loss_fn, loss_comparison = yolo_v4.ciou_loss()

# CSP structure
CSPBlock, csp_advantages = yolo_v4.csp_darknet53()

5.4 YOLO v5 - Engineering Optimization

5.4.1 Practical Improvements

class YOLOv5Improvements:
    """Catalog of YOLO v5's practical (engineering) improvements.

    Each method builds and returns small runnable demos — the Focus
    downsampling module, adaptive-anchor utilities, the model-scaling
    table, and common training optimizations — so callers can
    instantiate and experiment with them directly.
    """

    def __init__(self):
        # Human-readable summary of the four improvement areas.
        self.improvements = {
            "Data Loading": {
                "Adaptive Anchors": "Automatically calculate optimal anchors",
                "Adaptive Image Scaling": "Aspect ratio preserving scaling",
                "Efficient Data Loading": "Multi-process data loading optimization"
            },
            "Training Optimization": {
                "Automatic Mixed Precision": "FP16 training acceleration",
                "Exponential Moving Average": "Model weight smoothing",
                "Cosine Learning Rate": "Better learning rate scheduling",
                "Early Stopping": "Prevent overfitting"
            },
            "Model Architecture": {
                "Focus Structure": "Efficient downsampling",
                "CSP Structure": "Cross-stage partial connections",
                "SPP Structure": "Spatial pyramid pooling",
                "PANet": "Path aggregation network"
            },
            "Engineering": {
                "Model Scaling": "Different sized model family",
                "ONNX Export": "Easy deployment",
                "TensorRT Optimization": "Inference acceleration",
                "Mobile Optimization": "Lightweight versions"
            }
        }

    def focus_structure(self):
        """Build the Focus module and return it with its advantage notes.

        Returns:
            (Focus, dict): the module class and an advantages summary.
        """

        class Focus(nn.Module):
            """Space-to-depth downsampling: rearrange each 2x2 pixel block
            into 4x channels (lossless), then fuse with a convolution."""

            def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding=0):
                super(Focus, self).__init__()
                # Conv sees 4x the input channels after the space-to-depth cat.
                self.conv = nn.Conv2d(in_channels * 4, out_channels, kernel_size, stride, padding, bias=False)
                self.bn = nn.BatchNorm2d(out_channels)
                self.act = nn.SiLU(inplace=True)  # Swish/SiLU activation

            def forward(self, x):
                # Rearrange 2x2 pixel blocks into 4x channels, halving H and W.
                # Example: (B, 3, 640, 640) -> cat -> (B, 12, 320, 320) -> conv.
                # NOTE: indexing is x[..., row, col]; the four slices below are
                # the four corners of every 2x2 block (labels fixed from the
                # original, which had the middle two swapped).
                return self.act(self.bn(self.conv(torch.cat([
                    x[..., ::2, ::2],    # Top left (even rows, even cols)
                    x[..., 1::2, ::2],   # Bottom left (odd rows, even cols)
                    x[..., ::2, 1::2],   # Top right (even rows, odd cols)
                    x[..., 1::2, 1::2]   # Bottom right (odd rows, odd cols)
                ], 1))))

        def focus_advantages():
            """Focus structure advantages."""
            advantages = {
                "No Information Loss": "Unlike conv stride=2, doesn't lose information",
                "Computational Efficiency": "Reduces computation",
                "Feature Preservation": "Preserves all pixel information",
                "Compatibility": "Easy to integrate into existing architectures"
            }

            return advantages

        return Focus, focus_advantages()

    def adaptive_anchor(self):
        """Return the AdaptiveAnchor utility class (auto-anchor checking
        and K-means re-clustering, as popularized by YOLOv5)."""

        class AdaptiveAnchor:
            def __init__(self, dataset, num_anchors=9, thr=4.0):
                # thr: max allowed width/height ratio between target and anchor.
                self.dataset = dataset
                self.num_anchors = num_anchors
                self.thr = thr

            def check_anchor_order(self, anchors, targets, img_size):
                """Evaluate the current anchors; re-cluster if recall is low.

                Returns either the original anchors or improved ones.
                """
                m = len(anchors)
                bpr, aat = self.metric(anchors, targets)

                # BUGFIX: labels were swapped in the original message —
                # bpr IS the best possible recall.
                print(f'Best possible recall: {bpr:.3f}, Anchors above threshold: {aat:.3f}')

                if bpr < 0.98:
                    print('Running automatic anchor optimization...')
                    new_anchors = self.kmean_anchors(targets, n=m, img_size=img_size, thr=self.thr)
                    new_bpr, new_aat = self.metric(new_anchors, targets)

                    if new_bpr > bpr:
                        print(f'New anchor BPR: {new_bpr:.3f}, AAT: {new_aat:.3f}')
                        return new_anchors
                    else:
                        print('Keeping original anchors')
                        return anchors
                return anchors

            def metric(self, anchors, targets):
                """Compute anchor-quality metrics against targets.

                Args:
                    anchors: (na, 2) anchor widths/heights.
                    targets: (nt, 6) rows assumed [img, cls, x, y, w, h]
                        with normalized coordinates — TODO confirm layout
                        against the dataset loader.

                Returns:
                    (bpr, aat): best possible recall (fraction of targets
                    matched by at least one anchor) and the mean number of
                    anchors above threshold per target.
                """
                if len(targets) == 0:
                    return 0, 0

                # Accept numpy anchors (e.g. fresh kmeans output) as well.
                anchors = torch.as_tensor(anchors, dtype=torch.float32)
                txy, twh = targets[:, 2:4], targets[:, 4:6]  # centers, sizes

                # Width/height ratio between every target and every anchor.
                r = twh[:, None] / anchors[None]                 # (nt, na, 2)
                j = torch.max(r, 1. / r).max(2)[0] < self.thr    # (nt, na)

                # Only count targets whose centers are well inside the image.
                center_ok = ((txy[:, None] > 0.1).all(2)
                             & (txy[:, None] < 0.9).all(2))      # (nt, 1)
                matched = j & center_ok

                # BUGFIX: the original computed bpr and aat with identical
                # expressions ('*' and '&' coincide on bool tensors), so
                # bpr always equaled aat. Recall needs any-anchor coverage.
                bpr = matched.any(1).float().mean()
                aat = matched.float().sum(1).mean()

                return bpr, aat

            def kmean_anchors(self, targets, n=9, img_size=640, thr=4.0, gen=1000):
                """K-means cluster target widths/heights into n anchors.

                Note: `gen` is kept for interface compatibility (evolution
                generations in full YOLOv5) but is unused in this demo.
                """
                from scipy.cluster.vq import kmeans

                # Width/height in pixels; scipy kmeans needs numpy input.
                wh = np.asarray(targets[:, 4:6]) * img_size

                def fitness(k):
                    # Mean best width/height ratio match (1.0 == perfect).
                    # BUGFIX: the original re-ran kmeans on UNSCALED data with
                    # rescaled centroids, which made the score meaningless.
                    r = wh[:, None] / k[None]
                    return np.minimum(r, 1. / r).min(2).max(1).mean()

                print(f'Performing K-means clustering with {len(wh)} targets...')
                s = wh.std(0)  # normalize so both dims weigh equally
                k, _ = kmeans(wh / s, n, iter=30)  # cluster in whitened space
                k *= s  # back to pixel units

                # Sort anchors small -> large by box area.
                k = k[np.argsort(k.prod(1))]

                f = fitness(k)
                print(f'Anchor fitness: {f:.3f}')

                return k

        return AdaptiveAnchor

    def model_scaling(self):
        """Model scaling strategy: one architecture, five sizes via
        depth/width multipliers. Returns (variants table, scaling fn)."""

        def create_model_variants():
            """Create different sized model variants."""
            variants = {
                'YOLOv5n': {  # nano
                    'depth_multiple': 0.33,
                    'width_multiple': 0.25,
                    'parameters': '1.9M',
                    'gflops': '4.5',
                    'speed_cpu': '6.3ms',
                    'speed_gpu': '0.6ms'
                },
                'YOLOv5s': {  # small
                    'depth_multiple': 0.33,
                    'width_multiple': 0.50,
                    'parameters': '7.2M',
                    'gflops': '16.5',
                    'speed_cpu': '11.9ms',
                    'speed_gpu': '0.9ms'
                },
                'YOLOv5m': {  # medium
                    'depth_multiple': 0.67,
                    'width_multiple': 0.75,
                    'parameters': '21.2M',
                    'gflops': '49.0',
                    'speed_cpu': '25.1ms',
                    'speed_gpu': '1.7ms'
                },
                'YOLOv5l': {  # large
                    'depth_multiple': 1.0,
                    'width_multiple': 1.0,
                    'parameters': '46.5M',
                    'gflops': '109.1',
                    'speed_cpu': '47.9ms',
                    'speed_gpu': '2.7ms'
                },
                'YOLOv5x': {  # extra large
                    'depth_multiple': 1.33,
                    'width_multiple': 1.25,
                    'parameters': '86.7M',
                    'gflops': '205.7',
                    'speed_cpu': '95.2ms',
                    'speed_gpu': '4.6ms'
                }
            }

            return variants

        def scale_model(base_channels, base_depth, width_mult, depth_mult):
            """Adjust channel count and block depth by the multipliers;
            depth is floored at 1 so no stage disappears entirely."""
            scaled_channels = int(base_channels * width_mult)
            scaled_depth = max(1, int(base_depth * depth_mult))

            return scaled_channels, scaled_depth

        variants = create_model_variants()

        print("YOLOv5 Model Variants:")
        print("=" * 25)
        for model, specs in variants.items():
            print(f"\n{model}:")
            for key, value in specs.items():
                print(f"  {key}: {value}")

        return variants, scale_model

    def training_optimizations(self):
        """Return a TrainingOptimizer class bundling common training
        techniques (AMP, EMA, cosine LR, early stopping)."""

        class TrainingOptimizer:
            def __init__(self):
                # Name -> setup-function dispatch table.
                self.techniques = {
                    "Automatic Mixed Precision": self.setup_amp,
                    "Exponential Moving Average": self.setup_ema,
                    "Cosine Learning Rate": self.setup_cosine_lr,
                    "Early Stopping": self.setup_early_stopping
                }

            def setup_amp(self):
                """Automatic mixed precision: FP16 forward/backward with a
                gradient scaler to avoid underflow. Returns a step function."""
                from torch.cuda.amp import GradScaler, autocast

                scaler = GradScaler()

                def training_step(model, loss_fn, optimizer, inputs, targets):
                    with autocast():
                        outputs = model(inputs)
                        loss = loss_fn(outputs, targets)

                    scaler.scale(loss).backward()
                    scaler.step(optimizer)
                    scaler.update()
                    # Clear gradients so the next step starts fresh.
                    optimizer.zero_grad()

                    return loss

                return training_step

            def setup_ema(self, model, decay=0.9999):
                """Exponential moving average of model weights; evaluating
                the EMA weights is usually more stable than the raw ones."""
                class ModelEMA:
                    def __init__(self, model, decay=0.9999):
                        self.ema = {k: v.clone().detach() for k, v in model.state_dict().items()}
                        self.decay = decay

                    def update(self, model):
                        with torch.no_grad():
                            for k, v in model.state_dict().items():
                                # BUGFIX: EMA math only applies to float
                                # tensors; integer buffers (e.g. BatchNorm's
                                # num_batches_tracked) are copied verbatim.
                                if v.dtype.is_floating_point:
                                    self.ema[k] = self.ema[k] * self.decay + v * (1 - self.decay)
                                else:
                                    self.ema[k] = v.clone()

                    def apply_shadow(self, model):
                        # Load the averaged weights into the model.
                        model.load_state_dict(self.ema)

                return ModelEMA(model, decay)

            def setup_cosine_lr(self, optimizer, T_max, eta_min=0):
                """Cosine learning rate scheduling (anneal to eta_min)."""
                from torch.optim.lr_scheduler import CosineAnnealingLR

                scheduler = CosineAnnealingLR(optimizer, T_max=T_max, eta_min=eta_min)
                return scheduler

            def setup_early_stopping(self, patience=10, min_delta=0.001):
                """Early stopping: stop once validation loss has not improved
                by min_delta for `patience` consecutive calls."""
                class EarlyStopping:
                    def __init__(self, patience=10, min_delta=0.001):
                        self.patience = patience
                        self.min_delta = min_delta
                        self.counter = 0
                        self.best_loss = float('inf')

                    def __call__(self, val_loss):
                        # Returns True when training should stop.
                        if val_loss < self.best_loss - self.min_delta:
                            self.best_loss = val_loss
                            self.counter = 0
                            return False
                        else:
                            self.counter += 1
                            return self.counter >= self.patience

                return EarlyStopping(patience, min_delta)

        return TrainingOptimizer

# Usage example
yolo_v5 = YOLOv5Improvements()

print("YOLO v5 Improvements:")
print("=" * 20)
for category, improvements in yolo_v5.improvements.items():
    print(f"\n{category}:")
    print("\n".join(f"  {name}: {desc}" for name, desc in improvements.items()))

# Focus structure
Focus, focus_advantages = yolo_v5.focus_structure()

print(f"\nFocus structure advantages:")
print("-" * 15)
print("\n".join(f"  {name}: {desc}" for name, desc in focus_advantages.items()))

# Model scaling
variants, scale_model = yolo_v5.model_scaling()

# Training optimization
TrainingOptimizer = yolo_v5.training_optimizations()
optimizer = TrainingOptimizer()

# Test Focus structure on a full-resolution dummy input.
focus_layer = Focus(3, 32)
test_input = torch.randn(1, 3, 640, 640)
with torch.no_grad():
    output = focus_layer(test_input)
print(f"\nFocus test:")
print(f"  Input: {test_input.shape}")
print(f"  Output: {output.shape}")

Chapter Summary

5.5 YOLO Series Evolution Summary

class YOLOEvolutionSummary:
    """Summary tables for the YOLO v2–v5 evolution: timeline,
    qualitative performance comparison, and key innovations."""

    def __init__(self):
        # Build each timeline entry from the same three-field template.
        def entry(improvements, performance, innovation):
            return {
                "Core Improvements": improvements,
                "Performance": performance,
                "Innovation": innovation,
            }

        self.evolution_timeline = {
            "YOLO v2 (2017)": entry(
                ["Anchor mechanism", "Batch normalization", "Multi-scale training", "Fine-grained features"],
                "PASCAL VOC mAP 76.8%",
                "Introduced anchor concept to YOLO",
            ),
            "YOLO v3 (2018)": entry(
                ["Multi-scale prediction", "Darknet-53", "Feature pyramid", "Binary classification loss"],
                "COCO mAP 57.9%",
                "Multi-scale detection architecture",
            ),
            "YOLO v4 (2020)": entry(
                ["CSPDarknet53", "Mosaic augmentation", "CIoU loss", "Many tricks"],
                "COCO mAP 65.7%",
                "Large-scale engineering tricks integration",
            ),
            "YOLO v5 (2020)": entry(
                ["Focus structure", "Adaptive anchors", "Model scaling", "Engineering optimization"],
                "COCO mAP 68.9%",
                "Engineering and practicality",
            ),
        }

    def performance_comparison(self):
        """Qualitative comparison of v2–v5 across four axes."""
        table = {"Metrics": ["Accuracy", "Speed", "Model Size", "Usability"]}
        table.update({
            "YOLO v2": ["Medium", "Fast", "Medium", "Fair"],
            "YOLO v3": ["High", "Medium", "Large", "Fair"],
            "YOLO v4": ["High", "Fast", "Large", "Good"],
            "YOLO v5": ["High", "Fast", "Flexible", "Excellent"],
        })
        return table

    def key_innovations(self):
        """Key innovations per area, keyed by version."""
        versions = ("v2", "v3", "v4", "v5")
        rows = [
            ("Network Architecture",
             ["Darknet-19 + Anchors", "Darknet-53 + FPN",
              "CSPDarknet53 + SPP + PANet", "CSP + Focus + PANet"]),
            ("Training Tricks",
             ["Multi-scale training", "Data augmentation optimization",
              "Mosaic + CutMix + SAT", "Adaptive training + AutoML"]),
            ("Loss Functions",
             ["Improved IoU loss", "Binary cross-entropy",
              "CIoU + Focal Loss", "Optimized CIoU"]),
            ("Engineering",
             ["Basic engineering", "Modular improvements",
              "Tricks integration", "Full engineering"]),
        ]
        return {area: dict(zip(versions, values)) for area, values in rows}

# Summary display
summary = YOLOEvolutionSummary()

print("YOLO Series Evolution Timeline:")
print("=" * 30)
for version, details in summary.evolution_timeline.items():
    print(f"\n{version}:")
    for key, value in details.items():
        # Lists are rendered as a comma-separated string.
        rendered = ", ".join(value) if isinstance(value, list) else value
        print(f"  {key}: {rendered}")

# Performance comparison, printed one metric at a time.
comparison = summary.performance_comparison()
print(f"\nPerformance Comparison:")
print("=" * 15)
for i, metric in enumerate(comparison["Metrics"]):
    print(f"\n{metric}:")
    for version in ("YOLO v2", "YOLO v3", "YOLO v4", "YOLO v5"):
        print(f"  {version}: {comparison[version][i]}")

# Key innovations, grouped by area.
innovations = summary.key_innovations()
print(f"\nKey Innovations Summary:")
print("=" * 20)
for category, per_version in innovations.items():
    print(f"\n{category}:")
    print("\n".join(f"  {v}: {text}" for v, text in per_version.items()))

5.6 Next Chapter Preview

The next chapter will cover YOLO’s latest versions (v6-v11) and cutting-edge developments, exploring:

  • YOLO v6-v8: Latest architecture designs and performance optimization
  • YOLO v9-v11: Cutting-edge technologies and future developments
  • New Technologies: Transformer, attention mechanisms, neural architecture search
  • Application Extensions: Segmentation, pose estimation, 3D detection

Through this chapter, we have comprehensively learned about the evolution of YOLO v2 to v5. Each version has made important improvements on the previous version, driving the development of real-time object detection technology. These improvements provide important references for subsequent versions and other detection algorithms.