Chapter 9: YOLO Model Training Practice

Learning Objectives

  1. Master the complete YOLO model training workflow
  2. Understand hyperparameter tuning strategies
  3. Learn training process monitoring and debugging techniques
  4. Familiarize with transfer learning and pretrained model usage

9.1 Training Environment Preparation

from ultralytics import YOLO
import torch
import yaml
from pathlib import Path
import matplotlib.pyplot as plt

class TrainingSetup:
    """Training Environment Setup"""

    def __init__(self):
        self.device = self.get_device()
        self.setup_reproducibility()

    def get_device(self):
        """Get training device"""
        if torch.cuda.is_available():
            device = 'cuda'
            print(f"Using GPU: {torch.cuda.get_device_name()}")
            print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
        else:
            device = 'cpu'
            print("Using CPU for training")
        return device

    def setup_reproducibility(self, seed=42):
        """Set random seed for reproducibility"""
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed(seed)
        print(f"Random seed set to {seed}")

    def create_training_config(self, data_path, model_size='n'):
        """Create training configuration"""
        config = {
            'model': f'yolov8{model_size}.pt',
            'data': data_path,
            'epochs': 100,
            'batch': 16,
            'imgsz': 640,
            'lr0': 0.01,
            'lrf': 0.01,
            'momentum': 0.937,
            'weight_decay': 0.0005,
            'warmup_epochs': 3.0,
            'warmup_momentum': 0.8,
            'warmup_bias_lr': 0.1,
            'box': 7.5,
            'cls': 0.5,
            'dfl': 1.5,
            'pose': 12.0,
            'kobj': 2.0,
            'label_smoothing': 0.0,
            'nbs': 64,
            'overlap_mask': True,
            'mask_ratio': 4,
            'dropout': 0.0,
            'val': True,
            'plots': True,
            'save': True,
            'save_period': -1,
            'cache': False,
            'device': self.device,
            'workers': 8,
            'project': 'runs/train',
            'name': 'exp',
            'exist_ok': False,
            'pretrained': True,
            'optimizer': 'SGD',
            'verbose': True,
            'seed': 0,
            'deterministic': True,
            'single_cls': False,
            'rect': False,
            'cos_lr': False,
            'close_mosaic': 10,
            'resume': False,
            'amp': True,
            'fraction': 1.0,
            'profile': False,
            'freeze': None,
        }
        return config

# Initialize training setup
trainer = TrainingSetup()
training_config = trainer.create_training_config('data.yaml')
print("Training environment preparation complete")

9.2 Basic Training Workflow

class YOLOTrainer:
    """YOLO Trainer"""

    def __init__(self, config):
        self.config = config
        self.model = None
        self.training_results = None

    def load_model(self):
        """Load model"""
        model_name = self.config.get('model', 'yolov8n.pt')
        self.model = YOLO(model_name)
        print(f"Model loaded: {model_name}")
        return self.model

    def start_training(self):
        """Start training"""
        if self.model is None:
            self.load_model()

        print("Starting training...")

        # Train model
        self.training_results = self.model.train(
            data=self.config['data'],
            epochs=self.config['epochs'],
            batch=self.config['batch'],
            imgsz=self.config['imgsz'],
            device=self.config['device'],
            workers=self.config['workers'],
            project=self.config['project'],
            name=self.config['name'],
            optimizer=self.config['optimizer'],
            lr0=self.config['lr0'],
            lrf=self.config['lrf'],
            momentum=self.config['momentum'],
            weight_decay=self.config['weight_decay'],
            warmup_epochs=self.config['warmup_epochs'],
            warmup_momentum=self.config['warmup_momentum'],
            warmup_bias_lr=self.config['warmup_bias_lr'],
            box=self.config['box'],
            cls=self.config['cls'],
            dfl=self.config['dfl'],
            save=self.config['save'],
            save_period=self.config['save_period'],
            cache=self.config['cache'],
            plots=self.config['plots'],
            val=self.config['val'],
            resume=self.config['resume'],
            amp=self.config['amp'],
            fraction=self.config['fraction'],
            profile=self.config['profile'],
            freeze=self.config['freeze'],
            cos_lr=self.config['cos_lr'],
            close_mosaic=self.config['close_mosaic'],
            overlap_mask=self.config['overlap_mask'],
            mask_ratio=self.config['mask_ratio'],
            dropout=self.config['dropout'],
            label_smoothing=self.config['label_smoothing'],
            nbs=self.config['nbs'],
            single_cls=self.config['single_cls'],
            rect=self.config['rect'],
            deterministic=self.config['deterministic'],
            verbose=self.config['verbose']
        )

        print("Training complete!")
        return self.training_results

    def evaluate_model(self):
        """Evaluate model"""
        if self.model is None:
            print("Please load model first")
            return None

        print("Starting validation...")
        validation_results = self.model.val(
            data=self.config['data'],
            imgsz=self.config['imgsz'],
            batch=self.config['batch'],
            device=self.config['device'],
            plots=True,
            verbose=True
        )

        print("Validation complete!")
        return validation_results

    def save_model(self, save_path):
        """Save model"""
        if self.model is None:
            print("No model to save")
            return

        self.model.save(save_path)
        print(f"Model saved to: {save_path}")

    def export_model(self, format='onnx', **kwargs):
        """Export model"""
        if self.model is None:
            print("Please load model first")
            return

        export_path = self.model.export(format=format, **kwargs)
        print(f"Model exported: {export_path}")
        return export_path

# Usage example (renamed to avoid shadowing the TrainingSetup instance above)
yolo_trainer = YOLOTrainer(training_config)
# results = yolo_trainer.start_training()
print("YOLO trainer initialized")

9.3 Hyperparameter Optimization

import optuna
from optuna.samplers import TPESampler

class HyperparameterOptimizer:
    """Hyperparameter Optimizer"""

    def __init__(self, data_path, base_config):
        self.data_path = data_path
        self.base_config = base_config
        self.study = None

    def objective(self, trial):
        """Optimization objective function"""
        # Define hyperparameter search space
        lr0 = trial.suggest_float('lr0', 1e-5, 1e-1, log=True)
        lrf = trial.suggest_float('lrf', 0.01, 1.0)
        momentum = trial.suggest_float('momentum', 0.6, 0.98)
        weight_decay = trial.suggest_float('weight_decay', 1e-6, 1e-2, log=True)
        warmup_epochs = trial.suggest_float('warmup_epochs', 0.0, 5.0)
        box_loss_gain = trial.suggest_float('box', 2.0, 20.0)  # brackets the chapter's default box gain of 7.5
        cls_loss_gain = trial.suggest_float('cls', 0.2, 4.0)
        dfl_loss_gain = trial.suggest_float('dfl', 0.4, 6.0)

        # Create configuration
        config = self.base_config.copy()
        config.update({
            'lr0': lr0,
            'lrf': lrf,
            'momentum': momentum,
            'weight_decay': weight_decay,
            'warmup_epochs': warmup_epochs,
            'box': box_loss_gain,
            'cls': cls_loss_gain,
            'dfl': dfl_loss_gain,
            'epochs': 30,  # Reduce epochs for faster optimization
            'name': f'trial_{trial.number}',
            'verbose': False
        })

        # Train model; pop 'model' first, since it names the checkpoint to load
        # and is not itself a train() argument
        try:
            train_args = config.copy()
            model = YOLO(train_args.pop('model'))
            results = model.train(**train_args)

            # Return validation mAP50-95 as the optimization target
            return results.results_dict['metrics/mAP50-95(B)']

        except Exception as e:
            print(f"Trial {trial.number} failed: {e}")
            return 0.0

    def optimize(self, n_trials=50):
        """Execute hyperparameter optimization"""
        self.study = optuna.create_study(
            direction='maximize',
            sampler=TPESampler(seed=42)
        )

        print(f"Starting hyperparameter optimization with {n_trials} trials...")
        self.study.optimize(self.objective, n_trials=n_trials)

        print("\nOptimization complete!")
        print(f"Best parameters: {self.study.best_params}")
        print(f"Best score: {self.study.best_value:.4f}")

        return self.study.best_params

    def plot_optimization_history(self):
        """Visualize optimization history"""
        if self.study is None:
            print("Please run optimization first")
            return

        fig = optuna.visualization.plot_optimization_history(self.study)
        fig.show()

    def plot_parameter_importances(self):
        """Visualize parameter importance"""
        if self.study is None:
            print("Please run optimization first")
            return

        fig = optuna.visualization.plot_param_importances(self.study)
        fig.show()
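
A typical usage sketch, following the commented-call convention used elsewhere in this chapter (optimize() is left commented out because every trial launches a full 30-epoch training run):

# Usage example (each trial trains a model, so a real study takes hours)
hp_optimizer = HyperparameterOptimizer('data.yaml', training_config)
# best_params = hp_optimizer.optimize(n_trials=20)
# hp_optimizer.plot_optimization_history()
# hp_optimizer.plot_parameter_importances()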

# Learning Rate Scheduling Strategies
class LearningRateScheduler:
    """Learning Rate Scheduler"""

    @staticmethod
    def cosine_annealing(epoch, total_epochs, lr0, lrf):
        """Cosine annealing"""
        import math
        return lrf + (lr0 - lrf) * (1 + math.cos(math.pi * epoch / total_epochs)) / 2

    @staticmethod
    def linear_decay(epoch, total_epochs, lr0, lrf):
        """Linear decay"""
        return lr0 * (1 - epoch / total_epochs) + lrf * (epoch / total_epochs)

    @staticmethod
    def exponential_decay(epoch, total_epochs, lr0, lrf):
        """Exponential decay"""
        import math
        decay_rate = -math.log(lrf / lr0) / total_epochs
        return lr0 * math.exp(-decay_rate * epoch)

    @staticmethod
    def step_decay(epoch, step_size=30, gamma=0.1, lr0=0.01):
        """Step decay"""
        return lr0 * (gamma ** (epoch // step_size))

    @staticmethod
    def plot_schedules(total_epochs=100, lr0=0.01, lrf=0.001):
        """Visualize different scheduling strategies"""
        epochs = list(range(total_epochs))

        cosine_lrs = [LearningRateScheduler.cosine_annealing(e, total_epochs, lr0, lrf) for e in epochs]
        linear_lrs = [LearningRateScheduler.linear_decay(e, total_epochs, lr0, lrf) for e in epochs]
        exp_lrs = [LearningRateScheduler.exponential_decay(e, total_epochs, lr0, lrf) for e in epochs]
        step_lrs = [LearningRateScheduler.step_decay(e, 30, 0.1, lr0) for e in epochs]

        plt.figure(figsize=(12, 8))
        plt.plot(epochs, cosine_lrs, label='Cosine Annealing', linewidth=2)
        plt.plot(epochs, linear_lrs, label='Linear Decay', linewidth=2)
        plt.plot(epochs, exp_lrs, label='Exponential Decay', linewidth=2)
        plt.plot(epochs, step_lrs, label='Step Decay', linewidth=2)

        plt.xlabel('Epoch')
        plt.ylabel('Learning Rate')
        plt.title('Learning Rate Schedules Comparison')
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.show()

# Visualize learning rate scheduling strategies
LearningRateScheduler.plot_schedules()
print("Hyperparameter optimization tools ready")

9.4 Training Monitoring and Debugging

import wandb
from torch.utils.tensorboard import SummaryWriter
import logging
from datetime import datetime

class TrainingMonitor:
    """Training Monitor"""

    def __init__(self, project_name="yolo_training", use_wandb=True, use_tensorboard=True):
        self.project_name = project_name
        self.use_wandb = use_wandb
        self.use_tensorboard = use_tensorboard

        # Initialize monitoring tools
        if self.use_wandb:
            wandb.init(project=project_name)

        if self.use_tensorboard:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            self.writer = SummaryWriter(f'runs/{project_name}_{timestamp}')

        # Setup logging
        self.setup_logging()

    def setup_logging(self):
        """Setup logging"""
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler(f'training_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'),
                logging.StreamHandler()
            ]
        )
        self.logger = logging.getLogger(__name__)

    def log_metrics(self, metrics_dict, step):
        """Log training metrics"""
        # Log to file
        metrics_str = ", ".join([f"{k}: {v:.4f}" for k, v in metrics_dict.items()])
        self.logger.info(f"Step {step} - {metrics_str}")

        # W&B logging
        if self.use_wandb:
            wandb.log(metrics_dict, step=step)

        # TensorBoard logging
        if self.use_tensorboard:
            for key, value in metrics_dict.items():
                self.writer.add_scalar(key, value, step)

    def log_learning_rate(self, lr, step):
        """Log learning rate"""
        self.log_metrics({'learning_rate': lr}, step)

    def log_loss_components(self, losses, step):
        """Log loss components"""
        loss_dict = {
            'train/box_loss': losses.get('train/box_loss', 0),
            'train/cls_loss': losses.get('train/cls_loss', 0),
            'train/dfl_loss': losses.get('train/dfl_loss', 0),
            'val/box_loss': losses.get('val/box_loss', 0),
            'val/cls_loss': losses.get('val/cls_loss', 0),
            'val/dfl_loss': losses.get('val/dfl_loss', 0)
        }
        self.log_metrics(loss_dict, step)

    def log_model_metrics(self, metrics, step):
        """Log model evaluation metrics"""
        metric_dict = {
            'metrics/precision': metrics.get('metrics/precision(B)', 0),
            'metrics/recall': metrics.get('metrics/recall(B)', 0),
            'metrics/mAP50': metrics.get('metrics/mAP50(B)', 0),
            'metrics/mAP50-95': metrics.get('metrics/mAP50-95(B)', 0)
        }
        self.log_metrics(metric_dict, step)

    def close(self):
        """Close monitor"""
        if self.use_tensorboard:
            self.writer.close()

        if self.use_wandb:
            wandb.finish()

class TrainingDebugger:
    """Training Debugging Tool"""

    def __init__(self):
        pass

    def diagnose_training_issues(self, results_dict):
        """Diagnose training issues"""
        issues = []

        # Check loss trends
        if 'train/box_loss' in results_dict:
            box_loss = results_dict['train/box_loss']
            if box_loss > 1.0:
                issues.append("Bounding box loss too high, may need to adjust box loss weight or learning rate")

        if 'train/cls_loss' in results_dict:
            cls_loss = results_dict['train/cls_loss']
            if cls_loss > 1.0:
                issues.append("Classification loss too high, check if class labels are correct")

        # Check mAP
        if 'metrics/mAP50-95(B)' in results_dict:
            mAP = results_dict['metrics/mAP50-95(B)']
            if mAP < 0.1:
                issues.append("mAP too low, check data quality or model configuration")

        # Check precision and recall
        if 'metrics/precision(B)' in results_dict and 'metrics/recall(B)' in results_dict:
            precision = results_dict['metrics/precision(B)']
            recall = results_dict['metrics/recall(B)']

            if precision < 0.3:
                issues.append("Low precision, may have too many false positives")
            if recall < 0.3:
                issues.append("Low recall, may be missing many targets")

        return issues

    def suggest_fixes(self, issues):
        """Suggest solutions"""
        suggestions = {
            "High Loss": [
                "Lower learning rate",
                "Check data annotation quality",
                "Adjust loss weights",
                "Increase warmup epochs"
            ],
            "Low mAP": [
                "Increase training epochs",
                "Use larger model",
                "Improve data augmentation",
                "Check anchor settings"
            ],
            "Low Precision": [
                "Increase confidence threshold",
                "Improve NMS settings",
                "Add negative samples"
            ],
            "Low Recall": [
                "Lower confidence threshold",
                "Increase data augmentation",
                "Use multi-scale training"
            ]
        }

        print("Training Issue Diagnosis:")
        print("=" * 40)

        if not issues:
            print("No obvious issues found")
            return

        for issue in issues:
            print(f"Issue: {issue}")

            # Match suggestions (case-insensitive; all keywords in the
            # category name must appear in the issue text)
            for category, suggestion_list in suggestions.items():
                if all(keyword.lower() in issue.lower() for keyword in category.split()):
                    print("Suggested Solutions:")
                    for suggestion in suggestion_list:
                        print(f"  • {suggestion}")
                    break
            print()

# Training Health Check
class TrainingHealthCheck:
    """Training Health Check"""

    def __init__(self):
        self.loss_history = []
        self.metric_history = []

    def check_convergence(self, loss_values, window_size=10):
        """Check convergence status"""
        if len(loss_values) < window_size * 2:
            return "Insufficient data"

        recent_losses = loss_values[-window_size:]
        previous_losses = loss_values[-window_size*2:-window_size]

        recent_avg = sum(recent_losses) / len(recent_losses)
        previous_avg = sum(previous_losses) / len(previous_losses)

        improvement = (previous_avg - recent_avg) / previous_avg

        if improvement > 0.05:
            return "Converging"
        elif improvement > -0.02:
            return "Slow convergence"
        else:
            return "Possibly diverging"

    def detect_overfitting(self, train_loss, val_loss, threshold=0.1):
        """Detect overfitting"""
        if len(train_loss) < 10 or len(val_loss) < 10:
            return "Insufficient data"

        train_trend = (train_loss[-1] - train_loss[-10]) / 10
        val_trend = (val_loss[-1] - val_loss[-10]) / 10

        if train_trend < -0.01 and val_trend > 0.01:
            return "Possible overfitting"
        elif abs(train_trend) < 0.001 and abs(val_trend) < 0.001:
            return "Training stable"
        else:
            return "Normal training"

    def check_learning_rate(self, loss_values, lr_values):
        """Check if learning rate is appropriate"""
        if len(loss_values) < 5:
            return "Insufficient data"

        recent_loss_change = (loss_values[-1] - loss_values[-5]) / 5
        current_lr = lr_values[-1] if lr_values else 0.01

        if recent_loss_change > 0.01:
            return f"Learning rate may be too high (current: {current_lr:.6f})"
        elif abs(recent_loss_change) < 0.0001:
            return f"Learning rate may be too low (current: {current_lr:.6f})"
        else:
            return f"Learning rate appropriate (current: {current_lr:.6f})"

print("Training monitoring and debugging tools ready")

9.5 Transfer Learning Strategies

class TransferLearningManager:
    """Transfer Learning Manager"""

    def __init__(self):
        self.pretrained_models = {
            'yolov8n': 'yolov8n.pt',
            'yolov8s': 'yolov8s.pt',
            'yolov8m': 'yolov8m.pt',
            'yolov8l': 'yolov8l.pt',
            'yolov8x': 'yolov8x.pt'
        }

    def select_pretrained_model(self, target_classes, data_size, compute_budget):
        """Select appropriate pretrained model"""
        recommendations = []

        if data_size < 1000:
            if compute_budget == 'low':
                recommendations.append('yolov8n - Small dataset, limited compute resources')
            else:
                recommendations.append('yolov8s - Small dataset, but larger model can improve performance')

        elif data_size < 10000:
            if compute_budget == 'low':
                recommendations.append('yolov8s - Medium dataset, reasonable model size')
            elif compute_budget == 'medium':
                recommendations.append('yolov8m - Balanced performance and efficiency')
            else:
                recommendations.append('yolov8l - Sufficient data, can use large model')

        else:  # data_size >= 10000
            if compute_budget == 'low':
                recommendations.append('yolov8s - Large dataset, but compute limited')
            elif compute_budget == 'medium':
                recommendations.append('yolov8m - Large dataset, medium model')
            elif compute_budget == 'high':
                recommendations.append('yolov8l - Large dataset, large model')
            else:
                recommendations.append('yolov8x - Maximum performance requirement')

        # Adjust recommendations based on class count
        if target_classes > 80:
            recommendations.append('Recommendation: Many classes, consider using larger model')
        elif target_classes < 10:
            recommendations.append('Recommendation: Few classes, can use smaller model')

        return recommendations

    def create_transfer_config(self, pretrained_model, freeze_layers=None):
        """Create transfer learning configuration"""
        config = {
            'model': pretrained_model,
            'pretrained': True,
            'freeze': freeze_layers,  # Number of layers to freeze, None means no freezing
        }

        # Adjust learning rate based on freezing
        if freeze_layers:
            config.update({
                'lr0': 0.001,  # Lower learning rate
                'warmup_epochs': 1.0,
            })
        else:
            config.update({
                'lr0': 0.01,   # Standard learning rate
                'warmup_epochs': 3.0,
            })

        return config

    def gradual_unfreezing_schedule(self, total_epochs):
        """Gradual unfreezing schedule"""
        schedule = []

        # Stage 1: Freeze backbone
        schedule.append({
            'epochs': total_epochs // 4,
            'freeze': 10,  # Freeze first 10 layers
            'lr': 0.001,
            'description': 'Freeze backbone, train detection head'
        })

        # Stage 2: Partial unfreezing
        schedule.append({
            'epochs': total_epochs // 4,
            'freeze': 5,   # Freeze only first 5 layers
            'lr': 0.0005,
            'description': 'Partial unfreezing, fine-tune later layers'
        })

        # Stage 3: Complete unfreezing
        schedule.append({
            'epochs': total_epochs // 2,
            'freeze': None,
            'lr': 0.0001,
            'description': 'Complete unfreezing, end-to-end fine-tuning'
        })

        return schedule

    def domain_adaptation_config(self, source_domain, target_domain):
        """Domain adaptation configuration"""
        adaptation_strategies = {
            ('general', 'medical'): {
                'data_augmentation': ['contrast', 'brightness', 'gaussian_blur'],
                'loss_weights': {'box': 7.5, 'cls': 1.0, 'dfl': 1.5},
                'learning_rate': 0.001,
                'freeze_backbone': True
            },
            ('general', 'industrial'): {
                'data_augmentation': ['rotation', 'scale', 'noise'],
                'loss_weights': {'box': 10.0, 'cls': 0.5, 'dfl': 2.0},
                'learning_rate': 0.005,
                'freeze_backbone': False
            },
            ('general', 'aerial'): {
                'data_augmentation': ['rotation', 'scale', 'flip'],
                'loss_weights': {'box': 5.0, 'cls': 1.5, 'dfl': 1.0},
                'learning_rate': 0.01,
                'freeze_backbone': False
            }
        }

        key = (source_domain, target_domain)
        if key in adaptation_strategies:
            return adaptation_strategies[key]
        else:
            # Default configuration
            return {
                'data_augmentation': ['horizontal_flip', 'scale', 'brightness'],
                'loss_weights': {'box': 7.5, 'cls': 0.5, 'dfl': 1.5},
                'learning_rate': 0.01,
                'freeze_backbone': False
            }

class FineTuningStrategies:
    """Fine-tuning Strategies"""

    @staticmethod
    def discriminative_learning_rates(base_lr=0.01, backbone_ratio=0.1, neck_ratio=0.5):
        """Discriminative learning rate strategy"""
        return {
            'backbone_lr': base_lr * backbone_ratio,
            'neck_lr': base_lr * neck_ratio,
            'head_lr': base_lr,
            'description': 'Backbone uses smaller learning rate, detection head uses larger learning rate'
        }

    @staticmethod
    def layer_wise_decay(base_lr=0.01, decay_rate=0.9, num_layers=24):
        """Layer-wise decay learning rate"""
        layer_lrs = []
        for i in range(num_layers):
            lr = base_lr * (decay_rate ** (num_layers - i - 1))
            layer_lrs.append(lr)

        return {
            'layer_learning_rates': layer_lrs,
            'description': 'Deeper layers use larger learning rate, shallower layers use smaller learning rate'
        }

    @staticmethod
    def cosine_restart_schedule(T_0=10, T_mult=2, eta_min=1e-6, eta_max=1e-2):
        """Cosine restart scheduling"""
        return {
            'scheduler': 'cosine_restart',
            'T_0': T_0,
            'T_mult': T_mult,
            'eta_min': eta_min,
            'eta_max': eta_max,
            'description': 'Periodic learning rate restart to avoid local optima'
        }
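
ultralytics builds its optimizer internally, so the dictionaries above are descriptive rather than drop-in arguments. In plain PyTorch, discriminative learning rates map onto optimizer parameter groups; the backbone/neck/head attribute names in this sketch are hypothetical and depend on the actual module layout of your network.

# Sketch: discriminative learning rates as PyTorch optimizer parameter groups.
# The .backbone/.neck/.head attribute names are hypothetical; a real model
# needs its own mapping from modules to groups.
def build_discriminative_optimizer(model, base_lr=0.01):
    lrs = FineTuningStrategies.discriminative_learning_rates(base_lr)
    param_groups = [
        {'params': model.backbone.parameters(), 'lr': lrs['backbone_lr']},
        {'params': model.neck.parameters(),     'lr': lrs['neck_lr']},
        {'params': model.head.parameters(),     'lr': lrs['head_lr']},
    ]
    return torch.optim.SGD(param_groups, momentum=0.937, weight_decay=0.0005)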

# Usage example
tl_manager = TransferLearningManager()

# Get model recommendations
recommendations = tl_manager.select_pretrained_model(
    target_classes=20,
    data_size=5000,
    compute_budget='medium'
)

print("Transfer Learning Model Recommendations:")
for rec in recommendations:
    print(f"  • {rec}")

# Create transfer learning configuration
transfer_config = tl_manager.create_transfer_config('yolov8m.pt', freeze_layers=5)
print(f"\nTransfer Learning Configuration: {transfer_config}")

# Gradual unfreezing schedule
schedule = tl_manager.gradual_unfreezing_schedule(total_epochs=100)
print(f"\nGradual Unfreezing Schedule:")
for i, stage in enumerate(schedule, 1):
    print(f"  Stage {i}: {stage['description']}")
    print(f"    Epochs: {stage['epochs']}, Freeze: {stage['freeze']}, LR: {stage['lr']}")

9.6 Advanced Training Techniques

class AdvancedTrainingTechniques:
    """Advanced Training Techniques"""

    def __init__(self):
        pass

    def mixed_precision_training(self):
        """Mixed precision training configuration"""
        return {
            'amp': True,  # Enable automatic mixed precision
            'description': 'Use FP16 and FP32 mixed precision, accelerate training and save memory',
            'benefits': [
                '1.5-2x training speed improvement',
                'Approximately 50% memory reduction',
                'Almost no accuracy loss'
            ],
            'requirements': [
                'GPU supports Tensor Cores (V100, RTX series)',
                'PyTorch 1.6+',
                'CUDA 10.2+'
            ]
        }

    def exponential_moving_average(self, decay=0.9999):
        """Exponential moving average configuration"""
        return {
            'ema_decay': decay,
            'description': 'Use EMA to smooth model weights, improve generalization',
            'implementation': '''
            # EMA update formula
            ema_weights = decay * ema_weights + (1 - decay) * current_weights
            ''',
            'benefits': [
                'Reduce model weight oscillation',
                'Improve validation performance',
                'More stable convergence'
            ]
        }

    def gradient_clipping(self, max_norm=10.0):
        """Gradient clipping configuration"""
        return {
            'max_norm': max_norm,
            'description': 'Limit gradient norm to prevent gradient explosion',
            'when_to_use': [
                'Gradient norm often >10',
                'Loss becomes NaN or Inf',
                'Training unstable'
            ]
        }

    def knowledge_distillation_setup(self, teacher_model, temperature=4.0, alpha=0.7):
        """Knowledge distillation setup"""
        return {
            'teacher_model': teacher_model,
            'temperature': temperature,
            'alpha': alpha,
            'description': 'Use large model to guide small model training',
            'loss_function': '''
            distillation_loss = alpha * KL_div(student_soft, teacher_soft) +
                              (1 - alpha) * cross_entropy(student, targets)
            where:
                student_soft = softmax(student_logits / temperature)
                teacher_soft = softmax(teacher_logits / temperature)
            '''
        }

    def multi_scale_training(self, scales=(320, 352, 384, 416, 448, 480, 512, 544, 576, 608, 640)):
        """Multi-scale training"""
        return {
            'scales': list(scales),
            'description': 'Randomly select input scale to improve multi-scale generalization',
            'strategy': {
                'random_scale': 'Randomly select a scale for each batch',
                'scheduled_scale': 'Change scale according to schedule',
                'progressive_scale': 'Gradually increase from small to large scale'
            }
        }

    def label_smoothing(self, smoothing=0.1):
        """Label smoothing"""
        return {
            'label_smoothing': smoothing,
            'description': 'Soften one-hot labels to improve generalization',
            'formula': f'y_smooth = (1 - {smoothing}) * y_hot + {smoothing} / num_classes',
            'benefits': [
                'Reduce overfitting',
                'Improve model calibration',
                'Enhance generalization'
            ]
        }

    def focal_loss_config(self, alpha=0.25, gamma=2.0):
        """Focal loss configuration"""
        return {
            'alpha': alpha,
            'gamma': gamma,
            'description': 'Address class imbalance and hard sample problems',
            'formula': 'FL(p_t) = -α(1 - p_t)^γ · log(p_t)',
            'use_cases': [
                'Severe class imbalance',
                'Many easy negative samples',
                'Need to focus on hard samples'
            ]
        }
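
ultralytics applies EMA internally during training, so nothing needs to be enabled manually; the standalone sketch below simply makes the update rule from exponential_moving_average() concrete in plain PyTorch.

# Sketch: a minimal standalone EMA of model weights in plain PyTorch.
# ultralytics maintains its own EMA during training; this is illustrative.
import copy

class SimpleEMA:
    def __init__(self, model, decay=0.9999):
        self.decay = decay
        self.ema = copy.deepcopy(model).eval()
        for p in self.ema.parameters():
            p.requires_grad_(False)

    @torch.no_grad()
    def update(self, model):
        # ema = decay * ema + (1 - decay) * current
        for e, p in zip(self.ema.state_dict().values(), model.state_dict().values()):
            if e.dtype.is_floating_point:
                e.mul_(self.decay).add_(p.detach(), alpha=1.0 - self.decay)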

class TrainingRecipes:
    """Training Recipe Collection"""

    @staticmethod
    def small_dataset_recipe(epochs=200):
        """Small dataset training recipe"""
        return {
            'name': 'Small Dataset Training Recipe',
            'epochs': epochs,
            'model': 'yolov8n.pt',
            'batch': 32,
            'lr0': 0.001,
            'lrf': 0.01,
            'warmup_epochs': 5.0,
            'label_smoothing': 0.1,
            'mixup': 0.5,
            'copy_paste': 0.3,
            'mosaic': 0.8,
            'freeze': 10,  # Freeze backbone
            'data_augmentation': 'aggressive',
            'description': 'Suitable for datasets with <1000 images'
        }

    @staticmethod
    def large_dataset_recipe(epochs=100):
        """Large dataset training recipe"""
        return {
            'name': 'Large Dataset Training Recipe',
            'epochs': epochs,
            'model': 'yolov8l.pt',
            'batch': 16,
            'lr0': 0.01,
            'lrf': 0.001,
            'warmup_epochs': 3.0,
            'label_smoothing': 0.0,
            'mixup': 0.0,
            'mosaic': 1.0,
            'freeze': None,  # No freezing
            'amp': True,
            'description': 'Suitable for datasets with >10000 images'
        }

    @staticmethod
    def production_ready_recipe(epochs=150):
        """Production environment training recipe"""
        return {
            'name': 'Production Environment Training Recipe',
            'epochs': epochs,
            'model': 'yolov8m.pt',
            'batch': 16,
            'lr0': 0.01,
            'lrf': 0.01,
            'warmup_epochs': 3.0,
            'cos_lr': True,
            'label_smoothing': 0.05,
            'amp': True,
            'ema_decay': 0.9999,
            'save_period': 10,
            'val': True,
            'plots': True,
            'deterministic': True,
            'description': 'Recommended production configuration, balanced speed and accuracy'
        }

    @staticmethod
    def quick_experiment_recipe(epochs=50):
        """Quick experiment recipe"""
        return {
            'name': 'Quick Experiment Recipe',
            'epochs': epochs,
            'model': 'yolov8n.pt',
            'batch': 64,
            'lr0': 0.01,
            'imgsz': 416,  # Smaller input size
            'cache': 'ram',  # Cache to memory
            'workers': 8,
            'amp': True,
            'val': False,  # Skip validation to accelerate training
            'plots': False,
            'description': 'Quick validation of ideas, suitable for hyperparameter search'
        }

# Usage example
advanced_techniques = AdvancedTrainingTechniques()
recipes = TrainingRecipes()

print("Advanced Training Techniques:")
print("=" * 40)

# Mixed precision training
mp_config = advanced_techniques.mixed_precision_training()
print(f"\nMixed Precision Training: {mp_config['description']}")
for benefit in mp_config['benefits']:
    print(f"  • {benefit}")

# Exponential moving average
ema_config = advanced_techniques.exponential_moving_average()
print(f"\nEMA: {ema_config['description']}")

# Training recipes
print(f"\nTraining Recipe Examples:")
small_recipe = recipes.small_dataset_recipe()
print(f"  {small_recipe['name']}: {small_recipe['description']}")

print("\nAdvanced training techniques ready")

9.7 Chapter Summary

After completing this chapter, you should be able to:

  1. Configure and start YOLO model training
  2. Understand and tune key hyperparameters
  3. Implement effective training monitoring
  4. Apply transfer learning strategies
  5. Use advanced training techniques
  6. Diagnose and solve training issues
  7. Select appropriate training recipes

def training_checklist():
    """Training Checklist"""
    checklist = {
        "Pre-training Preparation": [
            "□ Dataset format correct",
            "□ Data quality check passed",
            "□ Train/validation split reasonable",
            "□ Hardware environment confirmed",
            "□ Dependency library versions compatible"
        ],
        "Configuration Setup": [
            "□ Model size selection appropriate",
            "□ Batch size and learning rate matched",
            "□ Data augmentation strategy determined",
            "□ Loss weights adjusted",
            "□ Monitoring tools configured"
        ],
        "Training Process": [
            "□ Learning rate schedule reasonable",
            "□ Loss decreasing normally",
            "□ Validation metrics improving",
            "□ No overfitting signs",
            "□ Regular checkpoint saving"
        ],
        "Training Completion": [
            "□ Model performance meets requirements",
            "□ Best weights saved",
            "□ Training logs complete",
            "□ Model exported to deployment format",
            "□ Experimental results recorded"
        ]
    }

    print("YOLO Training Checklist:")
    print("=" * 40)

    for category, items in checklist.items():
        print(f"\n{category}:")
        for item in items:
            print(f"  {item}")

    print("\nKey Factors for Training Success:")
    success_factors = [
        "High-quality annotated data",
        "Appropriate model size",
        "Proper hyperparameter settings",
        "Sufficient training time",
        "Continuous monitoring and tuning"
    ]

    for factor in success_factors:
        print(f"  • {factor}")

training_checklist()
print("\nYOLO Model Training Practice Complete!")

Key Points: Master the complete YOLO model training workflow, from environment setup to advanced technique application, and build systematic training and tuning capabilities.