第9章：YOLO模型训练实战

Haiyue · 10/2/25 · About 14 min

第9章：YOLO模型训练实战

学习目标

掌握YOLO模型的完整训练流程
理解超参数调优策略
学习训练过程监控和调试技巧
熟悉迁移学习和预训练模型使用

9.1 训练环境准备

from ultralytics import YOLO
import torch
import yaml
from pathlib import Path
import matplotlib.pyplot as plt

class TrainingSetup:
    """Prepare the training environment: pick a device, seed the RNGs, build a config.

    Attributes:
        device: ``'cuda'`` when ``torch.cuda.is_available()`` is true, else ``'cpu'``.
        seed: The seed most recently applied by ``setup_reproducibility``.
    """

    def __init__(self):
        self.device = self.get_device()
        self.setup_reproducibility()

    def get_device(self):
        """Select the training device and print a short summary of it.

        Returns:
            ``'cuda'`` if a CUDA device is available, otherwise ``'cpu'``.
        """
        if not torch.cuda.is_available():
            print("Using CPU for training")
            return 'cpu'

        # Report name and memory for the same (current) device rather than
        # mixing the current device with a hard-coded index 0.
        index = torch.cuda.current_device()
        total_gib = torch.cuda.get_device_properties(index).total_memory / 1024**3
        print(f"Using GPU: {torch.cuda.get_device_name(index)}")
        print(f"GPU Memory: {total_gib:.1f} GB")
        return 'cuda'

    def setup_reproducibility(self, seed=42):
        """Seed Python's ``random`` and PyTorch (CPU and every CUDA device).

        The seed is also remembered on ``self.seed`` so that
        ``create_training_config`` emits the same value.

        Args:
            seed: Integer seed to apply.
        """
        import random  # local import: the file's import header is left untouched

        self.seed = seed
        random.seed(seed)
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            # manual_seed_all covers every visible GPU, not only the current one.
            torch.cuda.manual_seed_all(seed)
        print(f"Random seed set to {seed}")

    def create_training_config(self, data_path, model_size='n'):
        """Build the default training configuration dictionary.

        Args:
            data_path: Value stored under the ``'data'`` key.
            model_size: Suffix inserted into the weights name ``yolov8{model_size}.pt``.

        Returns:
            A new dict of training settings. The batch size is stored under
            ``'batch_size'``; ``'device'`` is ``self.device`` and ``'seed'`` is
            the seed applied by ``setup_reproducibility`` (0 if none was applied).
        """
        config = {
            'model': f'yolov8{model_size}.pt',
            'data': data_path,
            'epochs': 100,
            'batch_size': 16,
            'imgsz': 640,
            'lr0': 0.01,
            'lrf': 0.01,
            'momentum': 0.937,
            'weight_decay': 0.0005,
            'warmup_epochs': 3.0,
            'warmup_momentum': 0.8,
            'warmup_bias_lr': 0.1,
            'box': 7.5,
            'cls': 0.5,
            'dfl': 1.5,
            'pose': 12.0,
            'kobj': 2.0,
            'label_smoothing': 0.0,
            'nbs': 64,
            'overlap_mask': True,
            'mask_ratio': 4,
            'dropout': 0.0,
            'val': True,
            'plots': True,
            'save': True,
            'save_period': -1,
            'cache': False,
            'device': self.device,
            'workers': 8,
            'project': 'runs/train',
            'name': 'exp',
            'exist_ok': False,
            'pretrained': True,
            'optimizer': 'SGD',
            'verbose': True,
            # Keep the config in step with the seed that was actually applied.
            'seed': getattr(self, 'seed', 0),
            'deterministic': True,
            'single_cls': False,
            'rect': False,
            'cos_lr': False,
            'close_mosaic': 10,
            'resume': False,
            'amp': True,
            'fraction': 1.0,
            'profile': False,
            'freeze': None,
        }
        return config

# Initialize the training setup and derive the default configuration from it
trainer = TrainingSetup()
training_config = trainer.create_training_config('data.yaml')
print("训练环境准备完成")

9.2 基础训练流程

class YOLOTrainer:
    """Drive load / train / validate / save / export for one config dict."""

    def __init__(self, config):
        self.config = config
        self.model = None
        self.training_results = None

    def load_model(self):
        """Instantiate the model named by ``config['model']`` (default ``yolov8n.pt``)."""
        weights = self.config.get('model', 'yolov8n.pt')
        self.model = YOLO(weights)
        print(f"模型加载完成: {weights}")
        return self.model

    def start_training(self):
        """Train the model with the settings in ``self.config``.

        Loads the model first when none is loaded yet. The result of
        ``model.train`` is stored on ``self.training_results`` and returned.
        """
        if self.model is None:
            self.load_model()

        print("开始训练...")

        # (train keyword, config key) pairs; looked up in exactly this order.
        # Only ``batch`` is stored under a different key (``batch_size``).
        keyword_to_key = (
            ('data', 'data'),
            ('epochs', 'epochs'),
            ('batch', 'batch_size'),
            ('imgsz', 'imgsz'),
            ('device', 'device'),
            ('workers', 'workers'),
            ('project', 'project'),
            ('name', 'name'),
            ('optimizer', 'optimizer'),
            ('lr0', 'lr0'),
            ('lrf', 'lrf'),
            ('momentum', 'momentum'),
            ('weight_decay', 'weight_decay'),
            ('warmup_epochs', 'warmup_epochs'),
            ('warmup_momentum', 'warmup_momentum'),
            ('warmup_bias_lr', 'warmup_bias_lr'),
            ('box', 'box'),
            ('cls', 'cls'),
            ('dfl', 'dfl'),
            ('save', 'save'),
            ('save_period', 'save_period'),
            ('cache', 'cache'),
            ('plots', 'plots'),
            ('val', 'val'),
            ('resume', 'resume'),
            ('amp', 'amp'),
            ('fraction', 'fraction'),
            ('profile', 'profile'),
            ('freeze', 'freeze'),
            ('cos_lr', 'cos_lr'),
            ('close_mosaic', 'close_mosaic'),
            ('overlap_mask', 'overlap_mask'),
            ('mask_ratio', 'mask_ratio'),
            ('dropout', 'dropout'),
            ('label_smoothing', 'label_smoothing'),
            ('nbs', 'nbs'),
            ('single_cls', 'single_cls'),
            ('rect', 'rect'),
            ('deterministic', 'deterministic'),
            ('verbose', 'verbose'),
        )
        train_kwargs = {}
        for keyword, key in keyword_to_key:
            train_kwargs[keyword] = self.config[key]

        self.training_results = self.model.train(**train_kwargs)

        print("训练完成!")
        return self.training_results

    def evaluate_model(self):
        """Run validation; returns its result, or ``None`` when no model is loaded."""
        if self.model is None:
            print("请先加载模型")
            return None

        print("开始验证...")
        settings = self.config
        val_kwargs = dict(
            data=settings['data'],
            imgsz=settings['imgsz'],
            batch=settings['batch_size'],
            device=settings['device'],
            plots=True,
            verbose=True,
        )
        outcome = self.model.val(**val_kwargs)

        print("验证完成!")
        return outcome

    def save_model(self, save_path):
        """Save the loaded model to ``save_path``; only prints a notice if none is loaded."""
        if self.model is not None:
            self.model.save(save_path)
            print(f"模型已保存至: {save_path}")
        else:
            print("没有可保存的模型")

    def export_model(self, format='onnx', **kwargs):
        """Export the loaded model and return whatever ``model.export`` returns.

        Extra keyword arguments are forwarded unchanged. Returns ``None``
        (after printing a notice) when no model is loaded.
        """
        if self.model is not None:
            exported = self.model.export(format=format, **kwargs)
            print(f"模型已导出: {exported}")
            return exported

        print("请先加载模型")
        return None

# Usage example: build the trainer; the actual training call is left commented out
trainer = YOLOTrainer(training_config)
# results = trainer.start_training()
print("YOLO训练器初始化完成")

9.3 超参数优化

import optuna
from optuna.samplers import TPESampler

class HyperparameterOptimizer:
    """Search YOLO training hyperparameters with an Optuna study.

    Each trial samples hyperparameters, overlays them on a copy of
    ``base_config``, trains for a reduced number of epochs and reports the
    ``metrics/mAP50-95(B)`` entry of the training results as its score.
    """

    # Config keys whose name differs from the keyword ``model.train`` is called
    # with elsewhere in this file (``batch=self.config['batch_size']``).
    # Forwarding the raw key makes every trial fail.
    _CONFIG_KEY_ALIASES = {'batch_size': 'batch'}

    def __init__(self, data_path, base_config):
        """Store the dataset path and the base config; no study exists yet.

        Args:
            data_path: Dataset path used for every trial. When not ``None`` it
                overrides ``base_config['data']``.
            base_config: Dict of training settings copied for each trial.
        """
        self.data_path = data_path
        self.base_config = base_config
        self.study = None

    def _build_trial_config(self, trial):
        """Return a fresh config dict for ``trial`` (``base_config`` is not mutated)."""
        # Search space; sampled in this fixed order.
        sampled = {
            'lr0': trial.suggest_float('lr0', 1e-5, 1e-1, log=True),
            'lrf': trial.suggest_float('lrf', 0.01, 1.0),
            'momentum': trial.suggest_float('momentum', 0.6, 0.98),
            'weight_decay': trial.suggest_float('weight_decay', 1e-6, 1e-2, log=True),
            'warmup_epochs': trial.suggest_float('warmup_epochs', 0.0, 5.0),
            # NOTE(review): 0.02-0.2 is far below the box gain of 7.5 used by the
            # default config in this file — confirm the intended range.
            'box': trial.suggest_float('box', 0.02, 0.2),
            'cls': trial.suggest_float('cls', 0.2, 4.0),
            'dfl': trial.suggest_float('dfl', 0.4, 6.0),
        }

        config = dict(self.base_config)
        config.update(sampled)
        config.update({
            'epochs': 30,  # fewer epochs to speed up the search
            'name': f'trial_{trial.number}',
            'verbose': False,
        })

        if self.data_path is not None:
            config['data'] = self.data_path

        # Translate file-local key names into train() keyword names.
        for config_key, train_key in self._CONFIG_KEY_ALIASES.items():
            if config_key in config:
                value = config.pop(config_key)
                config.setdefault(train_key, value)

        return config

    def objective(self, trial):
        """Train once with sampled hyperparameters and return the validation mAP50-95.

        Args:
            trial: Optuna trial used to sample the hyperparameters.

        Returns:
            ``results.results_dict['metrics/mAP50-95(B)']`` on success, or
            ``0.0`` when model creation, training or the metric lookup raises.
        """
        config = self._build_trial_config(trial)
        # The weights name selects the model; it is not forwarded to train().
        weights = config.pop('model', 'yolov8n.pt')

        try:
            model = YOLO(weights)
            results = model.train(**config)

            # Validation mAP50-95 is the optimization target.
            return results.results_dict['metrics/mAP50-95(B)']

        except Exception as e:
            # Deliberate best-effort: a failed trial scores 0.0 so the study continues.
            print(f"Trial {trial.number} failed: {e}")
            return 0.0

    def optimize(self, n_trials=50):
        """Create a maximizing TPE study (seed 42), run it, and return the best params."""
        self.study = optuna.create_study(
            direction='maximize',
            sampler=TPESampler(seed=42)
        )

        print(f"开始超参数优化，总共{n_trials}次试验...")
        self.study.optimize(self.objective, n_trials=n_trials)

        print("\n优化完成!")
        print(f"最佳参数: {self.study.best_params}")
        print(f"最佳分数: {self.study.best_value:.4f}")

        return self.study.best_params

    def plot_optimization_history(self):
        """Show Optuna's optimization-history figure; prints a notice if no study exists."""
        if self.study is None:
            print("请先运行优化")
            return

        fig = optuna.visualization.plot_optimization_history(self.study)
        fig.show()

    def plot_parameter_importances(self):
        """Show Optuna's parameter-importance figure; prints a notice if no study exists."""
        if self.study is None:
            print("请先运行优化")
            return

        fig = optuna.visualization.plot_param_importances(self.study)
        fig.show()

# Learning-rate scheduling strategies
class LearningRateScheduler:
    """Closed-form learning-rate schedules plus a comparison plot.

    In every schedule here ``lr0`` is the starting rate and ``lrf`` the
    absolute rate reached at ``epoch == total_epochs``.
    """

    @staticmethod
    def cosine_annealing(epoch, total_epochs, lr0, lrf):
        """Half-cosine interpolation from ``lr0`` down to ``lrf``."""
        import math
        phase = math.pi * epoch / total_epochs
        return lrf + (lr0 - lrf) * (1 + math.cos(phase)) / 2

    @staticmethod
    def linear_decay(epoch, total_epochs, lr0, lrf):
        """Straight-line interpolation from ``lr0`` to ``lrf``."""
        progress = epoch / total_epochs
        return lr0 * (1 - progress) + lrf * progress

    @staticmethod
    def exponential_decay(epoch, total_epochs, lr0, lrf):
        """Exponential decay whose rate is chosen so the curve hits ``lrf`` at the end."""
        import math
        rate = -math.log(lrf / lr0) / total_epochs
        return lr0 * math.exp(-rate * epoch)

    @staticmethod
    def step_decay(epoch, step_size=30, gamma=0.1, lr0=0.01):
        """Multiply ``lr0`` by ``gamma`` once per completed ``step_size`` epochs."""
        completed_steps = epoch // step_size
        return lr0 * (gamma ** completed_steps)

    @staticmethod
    def plot_schedules(total_epochs=100, lr0=0.01, lrf=0.001):
        """Plot all four schedules over ``range(total_epochs)`` on one figure."""
        scheduler = LearningRateScheduler
        epochs = list(range(total_epochs))

        # (legend label, per-epoch rates); drawn in this order.
        curves = [
            ('Cosine Annealing',
             [scheduler.cosine_annealing(e, total_epochs, lr0, lrf) for e in epochs]),
            ('Linear Decay',
             [scheduler.linear_decay(e, total_epochs, lr0, lrf) for e in epochs]),
            ('Exponential Decay',
             [scheduler.exponential_decay(e, total_epochs, lr0, lrf) for e in epochs]),
            ('Step Decay',
             [scheduler.step_decay(e, 30, 0.1, lr0) for e in epochs]),
        ]

        plt.figure(figsize=(12, 8))
        for label, rates in curves:
            plt.plot(epochs, rates, label=label, linewidth=2)

        plt.xlabel('Epoch')
        plt.ylabel('Learning Rate')
        plt.title('Learning Rate Schedules Comparison')
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.show()

# Visualize the learning-rate schedules with their default settings
LearningRateScheduler.plot_schedules()
print("超参数优化工具准备完成")

9.4 训练监控与调试

import wandb
from torch.utils.tensorboard import SummaryWriter
import logging
from datetime import datetime

class TrainingMonitor:
    """Fan training metrics out to the log, Weights & Biases and TensorBoard."""

    def __init__(self, project_name="yolo_training", use_wandb=True, use_tensorboard=True):
        self.project_name = project_name
        self.use_wandb = use_wandb
        self.use_tensorboard = use_tensorboard

        # Start only the backends that were requested.
        if self.use_wandb:
            wandb.init(project=project_name)

        if self.use_tensorboard:
            stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            self.writer = SummaryWriter(f'runs/{project_name}_{stamp}')

        # Logging is always configured.
        self.setup_logging()

    def setup_logging(self):
        """Configure root logging (timestamped file + stream) and set ``self.logger``."""
        log_file = f'training_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'
        sinks = [logging.FileHandler(log_file), logging.StreamHandler()]
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=sinks,
        )
        self.logger = logging.getLogger(__name__)

    def log_metrics(self, metrics_dict, step):
        """Record every entry of ``metrics_dict`` for ``step`` on all active backends.

        Values are formatted with ``.4f`` for the log line.
        """
        # Log line first.
        rendered = []
        for name, value in metrics_dict.items():
            rendered.append(f"{name}: {value:.4f}")
        self.logger.info(f"Step {step} - {', '.join(rendered)}")

        # Then W&B.
        if self.use_wandb:
            wandb.log(metrics_dict, step=step)

        # Then TensorBoard, one scalar per metric.
        if self.use_tensorboard:
            for name, value in metrics_dict.items():
                self.writer.add_scalar(name, value, step)

    def log_learning_rate(self, lr, step):
        """Record ``lr`` under the ``learning_rate`` key."""
        self.log_metrics({'learning_rate': lr}, step)

    def log_loss_components(self, losses, step):
        """Record the six train/val loss components; missing ones are logged as 0."""
        component_keys = (
            'train/box_loss',
            'train/cls_loss',
            'train/dfl_loss',
            'val/box_loss',
            'val/cls_loss',
            'val/dfl_loss',
        )
        self.log_metrics({key: losses.get(key, 0) for key in component_keys}, step)

    def log_model_metrics(self, metrics, step):
        """Record precision/recall/mAP, read from the ``(B)``-suffixed keys (default 0)."""
        logged_to_source = (
            ('metrics/precision', 'metrics/precision(B)'),
            ('metrics/recall', 'metrics/recall(B)'),
            ('metrics/mAP50', 'metrics/mAP50(B)'),
            ('metrics/mAP50-95', 'metrics/mAP50-95(B)'),
        )
        payload = {logged: metrics.get(source, 0) for logged, source in logged_to_source}
        self.log_metrics(payload, step)

    def close(self):
        """Close the TensorBoard writer and finish the W&B run, where enabled."""
        if self.use_tensorboard:
            self.writer.close()

        if self.use_wandb:
            wandb.finish()

class TrainingDebugger:
    """Rule-based diagnosis of a training-results dict, with matching suggestions."""

    def __init__(self):
        pass

    def diagnose_training_issues(self, results_dict):
        """Return a list of issue messages found in ``results_dict``.

        Only keys that are present are checked:
        ``train/box_loss`` and ``train/cls_loss`` (flagged above 1.0),
        ``metrics/mAP50-95(B)`` (flagged below 0.1), and
        ``metrics/precision(B)`` / ``metrics/recall(B)`` (each flagged below
        0.3, checked only when both keys are present).
        """
        issues = []

        # Loss levels
        if 'train/box_loss' in results_dict:
            box_loss = results_dict['train/box_loss']
            if box_loss > 1.0:
                issues.append("边界框损失过高，可能需要调整box loss权重或学习率")

        if 'train/cls_loss' in results_dict:
            cls_loss = results_dict['train/cls_loss']
            if cls_loss > 1.0:
                issues.append("分类损失过高，检查类别标签是否正确")

        # mAP
        if 'metrics/mAP50-95(B)' in results_dict:
            mAP = results_dict['metrics/mAP50-95(B)']
            if mAP < 0.1:
                issues.append("mAP过低，检查数据质量或模型配置")

        # Precision and recall
        if 'metrics/precision(B)' in results_dict and 'metrics/recall(B)' in results_dict:
            precision = results_dict['metrics/precision(B)']
            recall = results_dict['metrics/recall(B)']

            if precision < 0.3:
                issues.append("精确率低，可能存在过多误检")
            if recall < 0.3:
                issues.append("召回率低，可能遗漏了很多目标")

        return issues

    def suggest_fixes(self, issues):
        """Print each issue followed by the suggestions of the first matching category.

        Args:
            issues: Issue messages, e.g. the list returned by
                ``diagnose_training_issues``. An empty list prints a
                "no obvious problems" notice.
        """
        suggestions = {
            "高损失": [
                "降低学习率",
                "检查数据标注质量",
                "调整损失权重",
                "增加warmup epochs"
            ],
            "低mAP": [
                "增加训练epochs",
                "使用更大的模型",
                "改进数据增强",
                "检查anchor设置"
            ],
            "低精确率": [
                "提高置信度阈值",
                "改进NMS设置",
                "增加负样本"
            ],
            "低召回率": [
                "降低置信度阈值",
                "增加数据增强",
                "使用多尺度训练"
            ]
        }

        # The category names never occur verbatim in the messages produced by
        # diagnose_training_issues ("…损失过高…", "mAP过低…", …), so matching on
        # the names alone printed no suggestions. Map each category to the
        # phrasing the messages actually use.
        issue_keywords = {
            "高损失": ("损失过高",),
            "低mAP": ("mAP过低",),
            "低精确率": ("精确率低",),
            "低召回率": ("召回率低",),
        }

        print("训练问题诊断:")
        print("=" * 40)

        if not issues:
            print("未发现明显问题")
            return

        for issue in issues:
            print(f"问题: {issue}")

            # First matching category wins.
            for category, suggestions_list in suggestions.items():
                keywords = category.split() + list(issue_keywords.get(category, ()))
                if any(keyword in issue for keyword in keywords):
                    print("建议解决方案:")
                    for suggestion in suggestions_list:
                        print(f"  • {suggestion}")
                    break
            print()

# Training health checks
class TrainingHealthCheck:
    """Heuristic checks on loss / learning-rate histories; each returns a status string."""

    def __init__(self):
        self.loss_history = []
        self.metric_history = []

    def check_convergence(self, loss_values, window_size=10):
        """Compare the mean of the last window with the mean of the window before it.

        Args:
            loss_values: Sequence of loss values, oldest first.
            window_size: Number of values per window; must be at least 1.

        Returns:
            "数据不足" with fewer than ``2 * window_size`` values; otherwise
            "正在收敛" (relative improvement > 5%), "收敛缓慢"
            (improvement > -2%) or "可能发散".

        Raises:
            ValueError: If ``window_size`` is less than 1.
        """
        if window_size < 1:
            raise ValueError("window_size must be at least 1")

        if len(loss_values) < window_size * 2:
            return "数据不足"

        recent_losses = loss_values[-window_size:]
        previous_losses = loss_values[-window_size*2:-window_size]

        recent_avg = sum(recent_losses) / len(recent_losses)
        previous_avg = sum(previous_losses) / len(previous_losses)

        if previous_avg == 0:
            # Relative improvement is undefined against a zero baseline, so
            # fall back to the sign of the absolute change.
            if recent_avg < previous_avg:
                return "正在收敛"
            if recent_avg == previous_avg:
                return "收敛缓慢"
            return "可能发散"

        improvement = (previous_avg - recent_avg) / previous_avg

        if improvement > 0.05:
            return "正在收敛"
        elif improvement > -0.02:
            return "收敛缓慢"
        else:
            return "可能发散"

    def detect_overfitting(self, train_loss, val_loss, threshold=0.1):
        """Flag falling training loss combined with rising validation loss.

        The trend of each series is ``(last - tenth_from_last) / 10``.

        Args:
            train_loss: Training-loss history, oldest first.
            val_loss: Validation-loss history, oldest first.
            threshold: Accepted for interface compatibility; not used by the
                current rules, which use fixed cut-offs.

        Returns:
            "数据不足" with fewer than 10 values in either series; otherwise
            "可能过拟合", "训练稳定" or "正常训练".
        """
        if len(train_loss) < 10 or len(val_loss) < 10:
            return "数据不足"

        train_trend = (train_loss[-1] - train_loss[-10]) / 10
        val_trend = (val_loss[-1] - val_loss[-10]) / 10

        if train_trend < -0.01 and val_trend > 0.01:
            return "可能过拟合"
        elif abs(train_trend) < 0.001 and abs(val_trend) < 0.001:
            return "训练稳定"
        else:
            return "正常训练"

    def check_learning_rate(self, loss_values, lr_values):
        """Judge the learning rate from the loss change over the last five values.

        Args:
            loss_values: Loss history, oldest first.
            lr_values: Learning-rate history; when empty, 0.01 is reported.

        Returns:
            "数据不足" with fewer than 5 loss values; otherwise a message that
            says the rate may be too high, may be too low, or is suitable,
            including the current rate.
        """
        if len(loss_values) < 5:
            return "数据不足"

        recent_loss_change = (loss_values[-1] - loss_values[-5]) / 5
        current_lr = lr_values[-1] if lr_values else 0.01

        if recent_loss_change > 0.01:
            return f"学习率可能过高 (当前: {current_lr:.6f})"
        elif abs(recent_loss_change) < 0.0001:
            return f"学习率可能过低 (当前: {current_lr:.6f})"
        else:
            return f"学习率合适 (当前: {current_lr:.6f})"

# Announce that the monitoring and debugging helpers are ready
print("训练监控和调试工具已准备完成")

9.5 迁移学习策略

class TransferLearningManager:
    """Helpers for choosing pretrained weights and planning transfer learning."""

    def __init__(self):
        # Short model name -> weights file name.
        self.pretrained_models = {
            'yolov8n': 'yolov8n.pt',
            'yolov8s': 'yolov8s.pt',
            'yolov8m': 'yolov8m.pt',
            'yolov8l': 'yolov8l.pt',
            'yolov8x': 'yolov8x.pt'
        }

    def select_pretrained_model(self, target_classes, data_size, compute_budget):
        """Recommend a model size from dataset size, compute budget and class count.

        Args:
            target_classes: Number of classes to detect.
            data_size: Number of images; bucketed at 1000 and 10000.
            compute_budget: ``'low'``, ``'medium'`` or ``'high'``; any other
                value falls into the last branch of its data-size bucket.

        Returns:
            A list of recommendation strings: one model recommendation, plus an
            extra hint when there are more than 80 or fewer than 10 classes.
        """
        recommendations = []

        if data_size < 1000:
            if compute_budget == 'low':
                recommendations.append('yolov8n - 数据量少，计算资源有限')
            else:
                recommendations.append('yolov8s - 数据量少，但可用更大模型提升性能')

        elif data_size < 10000:
            if compute_budget == 'low':
                recommendations.append('yolov8s - 中等数据量，合理的模型大小')
            elif compute_budget == 'medium':
                recommendations.append('yolov8m - 平衡性能和效率')
            else:
                recommendations.append('yolov8l - 数据充足，可用大模型')

        else:  # data_size >= 10000
            if compute_budget == 'low':
                recommendations.append('yolov8s - 大数据量，但计算受限')
            elif compute_budget == 'medium':
                recommendations.append('yolov8m - 大数据量，中等模型')
            elif compute_budget == 'high':
                recommendations.append('yolov8l - 大数据量，大模型')
            else:
                recommendations.append('yolov8x - 最大性能需求')

        # Adjust the advice by class count.
        if target_classes > 80:
            recommendations.append('建议：类别数多，考虑使用更大的模型')
        elif target_classes < 10:
            recommendations.append('建议：类别数少，可以使用较小的模型')

        return recommendations

    def create_transfer_config(self, pretrained_model, freeze_layers=None):
        """Build a transfer-learning config for ``pretrained_model``.

        Args:
            pretrained_model: Weights file name, or a short name listed in
                ``self.pretrained_models`` (e.g. ``'yolov8m'``), which is
                resolved to its weights file. Other values pass through as is.
            freeze_layers: Number of layers to freeze; ``None`` (or 0) means
                nothing is frozen.

        Returns:
            Dict with ``model``, ``pretrained``, ``freeze``, ``lr0`` and
            ``warmup_epochs``. Freezing selects a lower learning rate and a
            shorter warmup.
        """
        known_models = getattr(self, 'pretrained_models', {})
        config = {
            'model': known_models.get(pretrained_model, pretrained_model),
            'pretrained': True,
            'freeze': freeze_layers,  # None means no layers are frozen
        }

        # Learning rate depends on whether layers are frozen.
        if freeze_layers:
            config.update({
                'lr0': 0.001,  # lower learning rate
                'warmup_epochs': 1.0,
            })
        else:
            config.update({
                'lr0': 0.01,   # standard learning rate
                'warmup_epochs': 3.0,
            })

        return config

    def gradual_unfreezing_schedule(self, total_epochs):
        """Split ``total_epochs`` into three stages with progressively fewer frozen layers.

        The first two stages each get ``total_epochs // 4`` epochs; the last
        stage gets the remainder, so the stage epochs always sum to
        ``total_epochs``.

        Returns:
            List of three dicts with ``epochs``, ``freeze``, ``lr`` and
            ``description``.
        """
        quarter = total_epochs // 4
        # Give the final stage whatever is left, so no epochs are lost to
        # integer division when total_epochs is not a multiple of 4.
        final_stage_epochs = total_epochs - 2 * quarter

        schedule = []

        # Stage 1: backbone frozen
        schedule.append({
            'epochs': quarter,
            'freeze': 10,  # freeze the first 10 layers
            'lr': 0.001,
            'description': '冻结骨干网络，训练检测头'
        })

        # Stage 2: partially unfrozen
        schedule.append({
            'epochs': quarter,
            'freeze': 5,   # freeze only the first 5 layers
            'lr': 0.0005,
            'description': '部分解冻，精调后层'
        })

        # Stage 3: fully unfrozen
        schedule.append({
            'epochs': final_stage_epochs,
            'freeze': None,
            'lr': 0.0001,
            'description': '完全解冻，端到端精调'
        })

        return schedule

    def domain_adaptation_config(self, source_domain, target_domain):
        """Return the adaptation settings for a (source, target) domain pair.

        Known pairs are ``('general', 'medical')``, ``('general', 'industrial')``
        and ``('general', 'aerial')``; any other pair gets the default settings.

        Returns:
            Dict with ``data_augmentation``, ``loss_weights``,
            ``learning_rate`` and ``freeze_backbone``.
        """
        adaptation_strategies = {
            ('general', 'medical'): {
                'data_augmentation': ['contrast', 'brightness', 'gaussian_blur'],
                'loss_weights': {'box': 7.5, 'cls': 1.0, 'dfl': 1.5},
                'learning_rate': 0.001,
                'freeze_backbone': True
            },
            ('general', 'industrial'): {
                'data_augmentation': ['rotation', 'scale', 'noise'],
                'loss_weights': {'box': 10.0, 'cls': 0.5, 'dfl': 2.0},
                'learning_rate': 0.005,
                'freeze_backbone': False
            },
            ('general', 'aerial'): {
                'data_augmentation': ['rotation', 'scale', 'flip'],
                'loss_weights': {'box': 5.0, 'cls': 1.5, 'dfl': 1.0},
                'learning_rate': 0.01,
                'freeze_backbone': False
            }
        }

        key = (source_domain, target_domain)
        if key in adaptation_strategies:
            return adaptation_strategies[key]

        # Default settings for unknown domain pairs.
        return {
            'data_augmentation': ['horizontal_flip', 'scale', 'brightness'],
            'loss_weights': {'box': 7.5, 'cls': 0.5, 'dfl': 1.5},
            'learning_rate': 0.01,
            'freeze_backbone': False
        }

class FineTuningStrategies:
    """Static builders that describe fine-tuning learning-rate strategies as dicts."""

    @staticmethod
    def discriminative_learning_rates(base_lr=0.01, backbone_ratio=0.1, neck_ratio=0.5):
        """Scale ``base_lr`` per network part: backbone and neck by their ratios, head as is."""
        plan = {}
        plan['backbone_lr'] = base_lr * backbone_ratio
        plan['neck_lr'] = base_lr * neck_ratio
        plan['head_lr'] = base_lr
        plan['description'] = '骨干网络用较小学习率，检测头用较大学习率'
        return plan

    @staticmethod
    def layer_wise_decay(base_lr=0.01, decay_rate=0.9, num_layers=24):
        """Per-layer rates: the last layer gets ``base_lr``, each earlier one is decayed once more."""
        # Exponents count down from num_layers - 1 (first layer) to 0 (last layer).
        per_layer = [
            base_lr * (decay_rate ** exponent)
            for exponent in range(num_layers - 1, -1, -1)
        ]
        return dict(
            layer_learning_rates=per_layer,
            description='深层用较大学习率，浅层用较小学习率',
        )

    @staticmethod
    def cosine_restart_schedule(T_0=10, T_mult=2, eta_min=1e-6, eta_max=1e-2):
        """Describe a cosine schedule with restarts using the given parameters."""
        return dict(
            scheduler='cosine_restart',
            T_0=T_0,
            T_mult=T_mult,
            eta_min=eta_min,
            eta_max=eta_max,
            description='周期性重启学习率，避免局部最优',
        )

# Usage example
tl_manager = TransferLearningManager()

# Get model recommendations for a mid-sized dataset and a medium compute budget
recommendations = tl_manager.select_pretrained_model(
    target_classes=20, 
    data_size=5000, 
    compute_budget='medium'
)

print("迁移学习模型推荐:")
for rec in recommendations:
    print(f"  • {rec}")

# Build a transfer-learning config that freezes 5 layers
transfer_config = tl_manager.create_transfer_config('yolov8m.pt', freeze_layers=5)
print(f"\n迁移学习配置: {transfer_config}")

# Gradual unfreezing plan: print each stage's description and settings
schedule = tl_manager.gradual_unfreezing_schedule(total_epochs=100)
print(f"\n渐进式解冻计划:")
for i, stage in enumerate(schedule, 1):
    print(f"  阶段{i}: {stage['description']}")
    print(f"    Epochs: {stage['epochs']}, Freeze: {stage['freeze']}, LR: {stage['lr']}")

9.6 高级训练技巧

class AdvancedTrainingTechniques:
    """Descriptive configs for common advanced training techniques.

    Every method returns a plain dict that describes the technique; nothing
    here trains a model.
    """

    def __init__(self):
        pass

    def mixed_precision_training(self):
        """Return the mixed-precision description: ``amp`` flag, benefits, requirements."""
        return {
            'amp': True,  # enable automatic mixed precision
            'description': '使用FP16和FP32混合精度，加速训练并节省显存',
            'benefits': [
                '训练速度提升1.5-2倍',
                '显存使用减少约50%',
                '几乎不损失精度'
            ],
            'requirements': [
                'GPU支持Tensor Cores (V100, RTX系列)',
                'PyTorch 1.6+',
                'CUDA 10.2+'
            ]
        }

    def exponential_moving_average(self, decay=0.9999):
        """Return the EMA description with ``decay`` stored under ``ema_decay``."""
        return {
            'ema_decay': decay,
            'description': '使用EMA平滑模型权重，提升泛化性能',
            'implementation': '''
            # EMA更新公式
            ema_weights = decay * ema_weights + (1 - decay) * current_weights
            ''',
            'benefits': [
                '减少模型权重抖动',
                '提升验证集性能',
                '更稳定的收敛'
            ]
        }

    def gradient_clipping(self, max_norm=10.0):
        """Return the gradient-clipping description with the given ``max_norm``."""
        return {
            'max_norm': max_norm,
            'description': '限制梯度范数，防止梯度爆炸',
            'when_to_use': [
                '梯度范数经常>10',
                '损失出现NaN或Inf',
                '训练不稳定'
            ]
        }

    def knowledge_distillation_setup(self, teacher_model, temperature=4.0, alpha=0.7):
        """Return the distillation description: teacher, temperature, alpha, loss text."""
        return {
            'teacher_model': teacher_model,
            'temperature': temperature,
            'alpha': alpha,
            'description': '使用大模型指导小模型训练',
            'loss_function': '''
            distillation_loss = alpha * KL_div(student_soft, teacher_soft) + 
                              (1 - alpha) * cross_entropy(student, targets)
            where:
                student_soft = softmax(student_logits / temperature)
                teacher_soft = softmax(teacher_logits / temperature)
            '''
        }

    def multi_scale_training(self, scales=None):
        """Return the multi-scale training description.

        Args:
            scales: Input sizes to choose from. ``None`` selects the default
                320..640 in steps of 32; a fresh list is built on every call so
                that callers mutating the result cannot alter later defaults.
                A caller-supplied object is returned as passed.
        """
        if scales is None:
            scales = [320, 352, 384, 416, 448, 480, 512, 544, 576, 608, 640]
        return {
            'scales': scales,
            'description': '随机选择输入尺度，提升多尺度泛化能力',
            'strategy': {
                'random_scale': '每个batch随机选择一个尺度',
                'scheduled_scale': '按照计划改变尺度',
                'progressive_scale': '从小尺度逐渐增加到大尺度'
            }
        }

    def label_smoothing(self, smoothing=0.1):
        """Return the label-smoothing description, with ``smoothing`` rendered into the formula."""
        return {
            'label_smoothing': smoothing,
            'description': '软化one-hot标签，提高泛化性',
            'formula': f'y_smooth = (1 - {smoothing}) * y_hot + {smoothing} / num_classes',
            'benefits': [
                '减少过拟合',
                '提升模型校准',
                '增强泛化能力'
            ]
        }

    def focal_loss_config(self, alpha=0.25, gamma=2.0):
        """Return the focal-loss description with the given ``alpha`` and ``gamma``."""
        return {
            'alpha': alpha,
            'gamma': gamma,
            'description': '解决类别不平衡和困难样本问题',
            'formula': 'FL = -α(1-p)^γ * log(p)',
            'use_cases': [
                '类别严重不平衡',
                '存在大量简单负样本',
                '需要关注困难样本'
            ]
        }

class TrainingRecipes:
    """Ready-made training recipes, each returned as a new settings dict."""

    @staticmethod
    def small_dataset_recipe(epochs=200):
        """Recipe for small datasets: frozen backbone, heavy augmentation, long schedule."""
        recipe = dict(
            name='小数据集训练配方',
            epochs=epochs,
            model='yolov8n.pt',
            batch_size=32,
            lr0=0.001,
            lrf=0.01,
            warmup_epochs=5.0,
            label_smoothing=0.1,
            mixup=0.5,
            copy_paste=0.3,
            mosaic=0.8,
            freeze=10,  # freeze the backbone
            data_augmentation='aggressive',
            description='适用于<1000张图像的小数据集',
        )
        return recipe

    @staticmethod
    def large_dataset_recipe(epochs=100):
        """Recipe for large datasets: large model, nothing frozen, AMP on."""
        recipe = dict(
            name='大数据集训练配方',
            epochs=epochs,
            model='yolov8l.pt',
            batch_size=16,
            lr0=0.01,
            lrf=0.001,
            warmup_epochs=3.0,
            label_smoothing=0.0,
            mixup=0.0,
            mosaic=1.0,
            freeze=None,  # nothing frozen
            amp=True,
            description='适用于>10000张图像的大数据集',
        )
        return recipe

    @staticmethod
    def production_ready_recipe(epochs=150):
        """Recipe for production runs: medium model, cosine LR, periodic checkpoints."""
        recipe = dict(
            name='生产环境训练配方',
            epochs=epochs,
            model='yolov8m.pt',
            batch_size=16,
            lr0=0.01,
            lrf=0.01,
            warmup_epochs=3.0,
            cos_lr=True,
            label_smoothing=0.05,
            amp=True,
            ema_decay=0.9999,
            save_period=10,
            val=True,
            plots=True,
            deterministic=True,
            description='生产环境推荐配置，平衡速度和精度',
        )
        return recipe

    @staticmethod
    def quick_experiment_recipe(epochs=50):
        """Recipe for quick experiments: nano model, small images, validation skipped."""
        recipe = dict(
            name='快速实验配方',
            epochs=epochs,
            model='yolov8n.pt',
            batch_size=64,
            lr0=0.01,
            imgsz=416,  # smaller input size
            cache='ram',  # cache the dataset in memory
            workers=8,
            amp=True,
            val=False,  # skip validation to train faster
            plots=False,
            description='快速验证想法，适合超参数搜索',
        )
        return recipe

# Usage example
advanced_techniques = AdvancedTrainingTechniques()
recipes = TrainingRecipes()

print("高级训练技巧:")
print("=" * 40)

# Mixed-precision training: print the description and each listed benefit
mp_config = advanced_techniques.mixed_precision_training()
print(f"\n混合精度训练: {mp_config['description']}")
for benefit in mp_config['benefits']:
    print(f"  • {benefit}")

# Exponential moving average
ema_config = advanced_techniques.exponential_moving_average()
print(f"\nEMA: {ema_config['description']}")

# Training recipes: show the small-dataset recipe as an example
print(f"\n训练配方示例:")
small_recipe = recipes.small_dataset_recipe()
print(f"  {small_recipe['name']}: {small_recipe['description']}")

print("\n高级训练技巧准备完成")

9.7 章节总结

完成本章学习后，你应该能够：

✅ 配置和启动YOLO模型训练
✅ 理解和调优关键超参数
✅ 实施有效的训练监控
✅ 应用迁移学习策略
✅ 使用高级训练技巧
✅ 诊断和解决训练问题
✅ 选择适合的训练配方

def training_checklist():
    """Print the YOLO training checklist, section by section, then the key success factors."""
    # (section title, checklist entries), printed in this order.
    sections = (
        ("训练前准备", (
            "□ 数据集格式正确",
            "□ 数据质量检查通过",
            "□ 训练/验证集划分合理",
            "□ 硬件环境确认",
            "□ 依赖库版本兼容",
        )),
        ("配置设置", (
            "□ 模型大小选择合适",
            "□ 批大小和学习率匹配",
            "□ 数据增强策略确定",
            "□ 损失权重调整",
            "□ 监控工具配置",
        )),
        ("训练过程", (
            "□ 学习率调度合理",
            "□ 损失下降正常",
            "□ 验证指标提升",
            "□ 无过拟合迹象",
            "□ 定期保存检查点",
        )),
        ("训练完成", (
            "□ 模型性能满足要求",
            "□ 最佳权重已保存",
            "□ 训练日志完整",
            "□ 模型已导出部署格式",
            "□ 实验结果已记录",
        )),
    )
    key_factors = (
        "高质量的标注数据",
        "合适的模型大小",
        "恰当的超参数设置",
        "充分的训练时间",
        "持续的监控和调优",
    )

    print("YOLO训练检查清单:")
    print("=" * 40)

    for title, entries in sections:
        print(f"\n{title}:")
        for entry in entries:
            print(f"  {entry}")

    print("\n🎯 训练成功的关键因素:")
    for factor in key_factors:
        print(f"  • {factor}")

# Print the checklist, then the chapter's closing message
training_checklist()
print("\nYOLO模型训练实战完成！")

本章重点：掌握YOLO模型的完整训练流程，从环境搭建到高级技巧应用，建立系统性的训练和调优能力。