第9章:YOLO模型训练实战
10/2/25 · About 14 min
第9章:YOLO模型训练实战
学习目标
- 掌握YOLO模型的完整训练流程
- 理解超参数调优策略
- 学习训练过程监控和调试技巧
- 熟悉迁移学习和预训练模型使用
9.1 训练环境准备
from ultralytics import YOLO
import torch
import yaml
from pathlib import Path
import matplotlib.pyplot as plt
class TrainingSetup:
    """Prepare the training environment: device choice, seeding, base config.

    Attributes:
        device: ``'cuda'`` when ``torch.cuda.is_available()`` is true,
            otherwise ``'cpu'``.
        seed: Seed most recently applied by :meth:`setup_reproducibility`.
    """

    def __init__(self):
        self.device = self.get_device()
        self.setup_reproducibility()

    def get_device(self):
        """Return the training device name and print a short hardware summary."""
        if not torch.cuda.is_available():
            print("Using CPU for training")
            return 'cpu'

        print(f"Using GPU: {torch.cuda.get_device_name()}")
        total_gb = torch.cuda.get_device_properties(0).total_memory / 1024**3
        print(f"GPU Memory: {total_gb:.1f} GB")
        return 'cuda'

    def setup_reproducibility(self, seed=42):
        """Seed the Python and PyTorch random number generators.

        Args:
            seed: Seed value. It is also stored on ``self.seed`` so that
                :meth:`create_training_config` hands the same value to the
                trainer instead of a different hard-coded one.
        """
        import random  # local import: the file header does not import it

        self.seed = seed
        random.seed(seed)
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            # manual_seed_all seeds every visible GPU, not only the current one.
            torch.cuda.manual_seed_all(seed)
        print(f"Random seed set to {seed}")

    def create_training_config(self, data_path, model_size='n', **overrides):
        """Build the default training configuration dictionary.

        Args:
            data_path: Value stored under ``'data'`` (the dataset YAML path).
            model_size: Suffix inserted into the weights name
                ``yolov8{model_size}.pt``.
            **overrides: Optional entries that replace or extend the defaults.

        Returns:
            A new ``dict``. Note that the batch size is stored under
            ``'batch_size'``; consumers must pass it on as ``batch``.
        """
        config = {
            # model / data / run length
            'model': f'yolov8{model_size}.pt',
            'data': data_path,
            'epochs': 100,
            'batch_size': 16,
            'imgsz': 640,
            # optimizer and warm-up
            'lr0': 0.01,
            'lrf': 0.01,
            'momentum': 0.937,
            'weight_decay': 0.0005,
            'warmup_epochs': 3.0,
            'warmup_momentum': 0.8,
            'warmup_bias_lr': 0.1,
            # loss gains
            'box': 7.5,
            'cls': 0.5,
            'dfl': 1.5,
            'pose': 12.0,
            'kobj': 2.0,
            'label_smoothing': 0.0,
            'nbs': 64,
            'overlap_mask': True,
            'mask_ratio': 4,
            'dropout': 0.0,
            # validation / outputs
            'val': True,
            'plots': True,
            'save': True,
            'save_period': -1,
            'cache': False,
            # runtime
            'device': self.device,
            'workers': 8,
            'project': 'runs/train',
            'name': 'exp',
            'exist_ok': False,
            'pretrained': True,
            'optimizer': 'SGD',
            'verbose': True,
            # Same seed that setup_reproducibility applied, so both agree.
            'seed': self.seed,
            'deterministic': True,
            'single_cls': False,
            'rect': False,
            'cos_lr': False,
            'close_mosaic': 10,
            'resume': False,
            'amp': True,
            'fraction': 1.0,
            'profile': False,
            'freeze': None,
        }
        config.update(overrides)
        return config
# Initialise the training setup and build the default configuration
trainer = TrainingSetup()
training_config = trainer.create_training_config('data.yaml')
print("训练环境准备完成")
9.2 基础训练流程
class YOLOTrainer:
    """Drive an Ultralytics ``YOLO`` model from a plain configuration dict.

    Attributes:
        config: Mapping of training settings (see the key tuples below).
        model: The loaded ``YOLO`` instance, or ``None`` before loading.
        training_results: Whatever the last ``model.train`` call returned.
    """

    # Config keys forwarded unchanged to ``model.train`` when present.
    # 'seed', 'exist_ok' and 'pretrained' are included so that the values in
    # the config actually take effect instead of being silently dropped.
    _TRAIN_KEYS = (
        'data', 'epochs', 'imgsz', 'device', 'workers', 'project', 'name',
        'exist_ok', 'pretrained', 'optimizer', 'seed',
        'lr0', 'lrf', 'momentum', 'weight_decay',
        'warmup_epochs', 'warmup_momentum', 'warmup_bias_lr',
        'box', 'cls', 'dfl',
        'save', 'save_period', 'cache', 'plots', 'val',
        'resume', 'amp', 'fraction', 'profile', 'freeze',
        'cos_lr', 'close_mosaic', 'overlap_mask', 'mask_ratio', 'dropout',
        'label_smoothing', 'nbs', 'single_cls', 'rect',
        'deterministic', 'verbose',
    )
    # Config keys forwarded to ``model.val``.
    _VAL_KEYS = ('data', 'imgsz', 'device')

    def __init__(self, config):
        self.config = config
        self.model = None
        self.training_results = None

    def _collect(self, keys):
        """Return the config entries named by *keys*, plus the batch size."""
        kwargs = {key: self.config[key] for key in keys if key in self.config}
        # The config stores 'batch_size'; the model methods take 'batch'.
        for key in ('batch', 'batch_size'):
            if key in self.config:
                kwargs['batch'] = self.config[key]
                break
        return kwargs

    def load_model(self):
        """Create the ``YOLO`` model named by ``config['model']`` and return it."""
        model_name = self.config.get('model', 'yolov8n.pt')
        self.model = YOLO(model_name)
        print(f"模型加载完成: {model_name}")
        return self.model

    def start_training(self):
        """Train the model with the configured settings.

        Only keys that are present in the config are forwarded, so a partial
        config falls back to the library defaults for everything else.

        Returns:
            The value returned by ``model.train``; it is also stored on
            ``self.training_results``.

        Raises:
            KeyError: If the config has no ``'data'`` entry.
        """
        if 'data' not in self.config:
            raise KeyError("config is missing the required 'data' entry")
        if self.model is None:
            self.load_model()

        print("开始训练...")
        self.training_results = self.model.train(**self._collect(self._TRAIN_KEYS))
        print("训练完成!")
        return self.training_results

    def evaluate_model(self):
        """Run validation and return its result, or ``None`` if no model is loaded."""
        if self.model is None:
            print("请先加载模型")
            return None

        print("开始验证...")
        validation_results = self.model.val(
            plots=True,
            verbose=True,
            **self._collect(self._VAL_KEYS),
        )
        print("验证完成!")
        return validation_results

    def save_model(self, save_path):
        """Save the loaded model to *save_path*; does nothing without a model."""
        if self.model is None:
            print("没有可保存的模型")
            return
        self.model.save(save_path)
        print(f"模型已保存至: {save_path}")

    def export_model(self, format='onnx', **kwargs):
        """Export the loaded model and return the export path.

        Args:
            format: Export format name passed to ``model.export``.
            **kwargs: Extra arguments passed through to ``model.export``.

        Returns:
            The value returned by ``model.export``, or ``None`` if no model
            is loaded.
        """
        if self.model is None:
            print("请先加载模型")
            return
        export_path = self.model.export(format=format, **kwargs)
        print(f"模型已导出: {export_path}")
        return export_path
# Usage example (the training call is left commented out)
trainer = YOLOTrainer(training_config)
# results = trainer.start_training()
print("YOLO训练器初始化完成")
9.3 超参数优化
import optuna
from optuna.samplers import TPESampler
class HyperparameterOptimizer:
    """Search training hyperparameters with an Optuna study.

    Attributes:
        data_path: Dataset YAML path, used when the base config has no
            ``'data'`` entry of its own.
        base_config: Settings shared by every trial; never mutated.
        study: The Optuna study, or ``None`` until :meth:`optimize` has run.
    """

    def __init__(self, data_path, base_config):
        self.data_path = data_path
        self.base_config = base_config
        self.study = None

    def _build_trial_config(self, trial):
        """Sample one hyperparameter set; return ``(model_name, train_kwargs)``."""
        config = dict(self.base_config)
        config.update({
            'lr0': trial.suggest_float('lr0', 1e-5, 1e-1, log=True),
            'lrf': trial.suggest_float('lrf', 0.01, 1.0),
            'momentum': trial.suggest_float('momentum', 0.6, 0.98),
            'weight_decay': trial.suggest_float('weight_decay', 1e-6, 1e-2, log=True),
            'warmup_epochs': trial.suggest_float('warmup_epochs', 0.0, 5.0),
            # The box gain default used in this file is 7.5, so the range has
            # to bracket it; 0.02-0.2 would force a far smaller gain.
            'box': trial.suggest_float('box', 1.0, 20.0),
            'cls': trial.suggest_float('cls', 0.2, 4.0),
            'dfl': trial.suggest_float('dfl', 0.4, 6.0),
            'epochs': 30,  # fewer epochs per trial to keep the search fast
            'name': f'trial_{trial.number}',
            'verbose': False,
        })

        # The model name goes to the YOLO constructor, not to train().
        model_name = config.pop('model', 'yolov8n.pt')
        config.setdefault('data', self.data_path)
        # The base config stores 'batch_size' but train() takes 'batch';
        # forwarding the former makes every trial fail on argument checking.
        if 'batch_size' in config:
            batch = config.pop('batch_size')
            config.setdefault('batch', batch)
        return model_name, config

    def objective(self, trial):
        """Train once with sampled hyperparameters and return mAP50-95.

        A trial that raises is reported on stdout and scored ``0.0`` so the
        study keeps going.
        """
        model_name, train_kwargs = self._build_trial_config(trial)
        try:
            model = YOLO(model_name)
            results = model.train(**train_kwargs)
            # Validation mAP50-95 is the value the study maximises.
            return results.results_dict['metrics/mAP50-95(B)']
        except Exception as e:
            print(f"Trial {trial.number} failed: {e}")
            return 0.0

    def optimize(self, n_trials=50):
        """Run the study for *n_trials* trials and return the best parameters."""
        self.study = optuna.create_study(
            direction='maximize',
            sampler=TPESampler(seed=42),
        )
        print(f"开始超参数优化,总共{n_trials}次试验...")
        self.study.optimize(self.objective, n_trials=n_trials)
        print("\n优化完成!")
        print(f"最佳参数: {self.study.best_params}")
        print(f"最佳分数: {self.study.best_value:.4f}")
        return self.study.best_params

    def plot_optimization_history(self):
        """Show the optimisation-history figure; requires a finished study."""
        if self.study is None:
            print("请先运行优化")
            return
        fig = optuna.visualization.plot_optimization_history(self.study)
        fig.show()

    def plot_parameter_importances(self):
        """Show the parameter-importance figure; requires a finished study."""
        if self.study is None:
            print("请先运行优化")
            return
        fig = optuna.visualization.plot_param_importances(self.study)
        fig.show()
# 学习率调度策略
class LearningRateScheduler:
    """Closed-form learning-rate schedules plus a comparison plot.

    In every schedule ``lr0`` is the starting rate and ``lrf`` the rate the
    formula approaches as ``epoch`` reaches ``total_epochs``.
    """

    @staticmethod
    def cosine_annealing(epoch, total_epochs, lr0, lrf):
        """Return the cosine-annealed rate for *epoch*."""
        import math
        phase = math.pi * epoch / total_epochs
        span = lr0 - lrf
        return lrf + span * (1 + math.cos(phase)) / 2

    @staticmethod
    def linear_decay(epoch, total_epochs, lr0, lrf):
        """Return the linearly interpolated rate between ``lr0`` and ``lrf``."""
        fraction = epoch / total_epochs
        return lr0 * (1 - fraction) + lrf * fraction

    @staticmethod
    def exponential_decay(epoch, total_epochs, lr0, lrf):
        """Return the exponentially decayed rate for *epoch*."""
        import math
        # Rate chosen so that the curve equals lrf at epoch == total_epochs.
        decay_rate = -math.log(lrf / lr0) / total_epochs
        return lr0 * math.exp(-decay_rate * epoch)

    @staticmethod
    def step_decay(epoch, step_size=30, gamma=0.1, lr0=0.01):
        """Return ``lr0`` multiplied by ``gamma`` once per completed step."""
        completed_steps = epoch // step_size
        return lr0 * (gamma ** completed_steps)

    @staticmethod
    def plot_schedules(total_epochs=100, lr0=0.01, lrf=0.001):
        """Plot all four schedules on one figure and show it."""
        scheduler = LearningRateScheduler
        epochs = list(range(total_epochs))

        # (legend label, per-epoch rates), in plotting order.
        curves = (
            ('Cosine Annealing',
             [scheduler.cosine_annealing(e, total_epochs, lr0, lrf) for e in epochs]),
            ('Linear Decay',
             [scheduler.linear_decay(e, total_epochs, lr0, lrf) for e in epochs]),
            ('Exponential Decay',
             [scheduler.exponential_decay(e, total_epochs, lr0, lrf) for e in epochs]),
            ('Step Decay',
             [scheduler.step_decay(e, 30, 0.1, lr0) for e in epochs]),
        )

        plt.figure(figsize=(12, 8))
        for label, rates in curves:
            plt.plot(epochs, rates, label=label, linewidth=2)
        plt.xlabel('Epoch')
        plt.ylabel('Learning Rate')
        plt.title('Learning Rate Schedules Comparison')
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.show()
# Plot the learning-rate schedules side by side
LearningRateScheduler.plot_schedules()
print("超参数优化工具准备完成")
9.4 训练监控与调试
import wandb
from torch.utils.tensorboard import SummaryWriter
import logging
from datetime import datetime
class TrainingMonitor:
    """Send training metrics to a logger and, optionally, W&B and TensorBoard.

    Attributes:
        project_name: Name used for the W&B project and the TensorBoard run dir.
        use_wandb: Whether metrics are also sent to Weights & Biases.
        use_tensorboard: Whether metrics are also written via ``self.writer``.
        writer: ``SummaryWriter``; only set when ``use_tensorboard`` is true.
        logger: Logger created by :meth:`setup_logging`.
    """

    def __init__(self, project_name="yolo_training", use_wandb=True, use_tensorboard=True):
        self.project_name = project_name
        self.use_wandb = use_wandb
        self.use_tensorboard = use_tensorboard

        # Start only the backends that were asked for.
        if self.use_wandb:
            wandb.init(project=project_name)
        if self.use_tensorboard:
            run_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            self.writer = SummaryWriter(f'runs/{project_name}_{run_stamp}')

        self.setup_logging()

    def setup_logging(self):
        """Configure root logging to a timestamped file and the console."""
        log_file = f'training_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'
        sinks = [logging.FileHandler(log_file), logging.StreamHandler()]
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=sinks,
        )
        self.logger = logging.getLogger(__name__)

    def log_metrics(self, metrics_dict, step):
        """Record every entry of *metrics_dict* for training step *step*.

        Values are rendered with ``:.4f`` and therefore must be numeric.
        """
        rendered = []
        for name, value in metrics_dict.items():
            rendered.append(f"{name}: {value:.4f}")
        metrics_str = ", ".join(rendered)
        self.logger.info(f"Step {step} - {metrics_str}")

        if self.use_wandb:
            wandb.log(metrics_dict, step=step)

        if self.use_tensorboard:
            for name, value in metrics_dict.items():
                self.writer.add_scalar(name, value, step)

    def log_learning_rate(self, lr, step):
        """Record the learning rate under the ``learning_rate`` key."""
        self.log_metrics({'learning_rate': lr}, step)

    def log_loss_components(self, losses, step):
        """Record the six train/val loss components; missing ones count as 0."""
        loss_keys = (
            'train/box_loss', 'train/cls_loss', 'train/dfl_loss',
            'val/box_loss', 'val/cls_loss', 'val/dfl_loss',
        )
        self.log_metrics({key: losses.get(key, 0) for key in loss_keys}, step)

    def log_model_metrics(self, metrics, step):
        """Record precision, recall and mAP values; missing ones count as 0.

        The values are read from the ``(B)``-suffixed keys of *metrics* and
        logged under the same names without the suffix.
        """
        targets = (
            'metrics/precision', 'metrics/recall',
            'metrics/mAP50', 'metrics/mAP50-95',
        )
        self.log_metrics(
            {target: metrics.get(f'{target}(B)', 0) for target in targets},
            step,
        )

    def close(self):
        """Close the TensorBoard writer and finish the W&B run, if enabled."""
        if self.use_tensorboard:
            self.writer.close()
        if self.use_wandb:
            wandb.finish()
class TrainingDebugger:
    """Rule-based checks on a training-results dict, with suggested fixes."""

    # (category, trigger phrases, suggestions). The trigger phrases are the
    # wordings actually produced by diagnose_training_issues; matching on the
    # category name alone never fires because the issue texts use a different
    # word order (for example "损失过高" rather than "高损失").
    _FIX_TABLE = (
        ("高损失", ("损失过高",), [
            "降低学习率",
            "检查数据标注质量",
            "调整损失权重",
            "增加warmup epochs",
        ]),
        ("低mAP", ("mAP过低",), [
            "增加训练epochs",
            "使用更大的模型",
            "改进数据增强",
            "检查anchor设置",
        ]),
        ("低精确率", ("精确率低",), [
            "提高置信度阈值",
            "改进NMS设置",
            "增加负样本",
        ]),
        ("低召回率", ("召回率低",), [
            "降低置信度阈值",
            "增加数据增强",
            "使用多尺度训练",
        ]),
    )

    def __init__(self):
        pass

    def diagnose_training_issues(self, results_dict):
        """Return a list of issue descriptions found in *results_dict*.

        Checks performed (each only when its key is present):
            * ``train/box_loss`` or ``train/cls_loss`` greater than 1.0
            * ``metrics/mAP50-95(B)`` below 0.1
            * ``metrics/precision(B)`` / ``metrics/recall(B)`` below 0.3;
              these two are only examined when both keys are present.
        """
        issues = []

        # Loss levels
        if 'train/box_loss' in results_dict and results_dict['train/box_loss'] > 1.0:
            issues.append("边界框损失过高,可能需要调整box loss权重或学习率")
        if 'train/cls_loss' in results_dict and results_dict['train/cls_loss'] > 1.0:
            issues.append("分类损失过高,检查类别标签是否正确")

        # mAP
        if 'metrics/mAP50-95(B)' in results_dict and results_dict['metrics/mAP50-95(B)'] < 0.1:
            issues.append("mAP过低,检查数据质量或模型配置")

        # Precision and recall
        if 'metrics/precision(B)' in results_dict and 'metrics/recall(B)' in results_dict:
            if results_dict['metrics/precision(B)'] < 0.3:
                issues.append("精确率低,可能存在过多误检")
            if results_dict['metrics/recall(B)'] < 0.3:
                issues.append("召回率低,可能遗漏了很多目标")

        return issues

    def suggest_fixes(self, issues):
        """Print each issue followed by the suggestions of its first matching category.

        Args:
            issues: Issue strings, typically the return value of
                :meth:`diagnose_training_issues`. An issue that contains a
                category name itself (for example ``"高损失"``) also matches.
        """
        print("训练问题诊断:")
        print("=" * 40)

        if not issues:
            print("未发现明显问题")
            return

        for issue in issues:
            print(f"问题: {issue}")
            for category, triggers, fixes in self._FIX_TABLE:
                if category in issue or any(trigger in issue for trigger in triggers):
                    print("建议解决方案:")
                    for fix in fixes:
                        print(f" • {fix}")
                    break
            print()
# 训练状态检查
class TrainingHealthCheck:
    """Heuristic status checks computed from loss and learning-rate histories.

    Every check returns a human-readable status string.
    """

    def __init__(self):
        self.loss_history = []
        self.metric_history = []

    def check_convergence(self, loss_values, window_size=10):
        """Compare the mean loss of the last window with the window before it.

        Args:
            loss_values: Sequence of loss values, oldest first.
            window_size: Number of values per window; must be positive.

        Returns:
            ``"数据不足"`` when fewer than two windows are available, otherwise
            ``"正在收敛"`` (relative improvement above 5%), ``"收敛缓慢"``
            (above -2%) or ``"可能发散"``.

        Raises:
            ValueError: If *window_size* is not positive.
        """
        if window_size <= 0:
            raise ValueError("window_size must be positive")
        if len(loss_values) < window_size * 2:
            return "数据不足"

        recent_losses = loss_values[-window_size:]
        previous_losses = loss_values[-window_size * 2:-window_size]
        recent_avg = sum(recent_losses) / len(recent_losses)
        previous_avg = sum(previous_losses) / len(previous_losses)

        if previous_avg == 0:
            # Relative improvement is undefined here; classify by the sign of
            # the change instead of dividing by zero.
            if recent_avg == 0:
                return "收敛缓慢"
            return "可能发散" if recent_avg > 0 else "正在收敛"

        improvement = (previous_avg - recent_avg) / previous_avg
        if improvement > 0.05:
            return "正在收敛"
        if improvement > -0.02:
            return "收敛缓慢"
        return "可能发散"

    def detect_overfitting(self, train_loss, val_loss, threshold=0.1):
        """Flag falling training loss combined with rising validation loss.

        Args:
            train_loss: Training-loss history, oldest first.
            val_loss: Validation-loss history, oldest first.
            threshold: NOTE(review): accepted but not used by the current
                logic; the cut-offs below are fixed constants.

        Returns:
            ``"数据不足"`` with fewer than 10 points in either history,
            otherwise ``"可能过拟合"``, ``"训练稳定"`` or ``"正常训练"``.
        """
        if len(train_loss) < 10 or len(val_loss) < 10:
            return "数据不足"

        # Change across the last 10 recorded points, divided by 10.
        train_trend = (train_loss[-1] - train_loss[-10]) / 10
        val_trend = (val_loss[-1] - val_loss[-10]) / 10

        if train_trend < -0.01 and val_trend > 0.01:
            return "可能过拟合"
        if abs(train_trend) < 0.001 and abs(val_trend) < 0.001:
            return "训练稳定"
        return "正常训练"

    def check_learning_rate(self, loss_values, lr_values):
        """Judge the learning rate from the recent change in loss.

        Args:
            loss_values: Loss history, oldest first.
            lr_values: Learning-rate history; when empty, 0.01 is reported.

        Returns:
            ``"数据不足"`` with fewer than 5 loss values, otherwise a message
            saying the rate looks too high, too low or suitable, including
            the current rate.
        """
        if len(loss_values) < 5:
            return "数据不足"

        recent_loss_change = (loss_values[-1] - loss_values[-5]) / 5
        current_lr = lr_values[-1] if lr_values else 0.01

        if recent_loss_change > 0.01:
            return f"学习率可能过高 (当前: {current_lr:.6f})"
        if abs(recent_loss_change) < 0.0001:
            return f"学习率可能过低 (当前: {current_lr:.6f})"
        return f"学习率合适 (当前: {current_lr:.6f})"
print("训练监控和调试工具已准备完成")
9.5 迁移学习策略
class TransferLearningManager:
    """Helpers for choosing pretrained weights and planning fine-tuning.

    Attributes:
        pretrained_models: Maps a model name (``'yolov8n'`` … ``'yolov8x'``)
            to its weights file name.
    """

    def __init__(self):
        self.pretrained_models = {
            f'yolov8{size}': f'yolov8{size}.pt' for size in 'nsmlx'
        }

    def select_pretrained_model(self, target_classes, data_size, compute_budget):
        """Recommend a pretrained model.

        Args:
            target_classes: Number of classes in the target dataset.
            data_size: Number of training images.
            compute_budget: ``'low'``, ``'medium'`` or ``'high'``; any other
                value selects the largest option of the matching size tier.

        Returns:
            A list of recommendation strings: one model recommendation,
            optionally followed by a class-count note.
        """
        # For each dataset-size tier: recommendation per budget, plus the
        # fallback used for every budget not listed.
        if data_size < 1000:
            by_budget = {'low': 'yolov8n - 数据量少,计算资源有限'}
            fallback = 'yolov8s - 数据量少,但可用更大模型提升性能'
        elif data_size < 10000:
            by_budget = {
                'low': 'yolov8s - 中等数据量,合理的模型大小',
                'medium': 'yolov8m - 平衡性能和效率',
            }
            fallback = 'yolov8l - 数据充足,可用大模型'
        else:  # data_size >= 10000
            by_budget = {
                'low': 'yolov8s - 大数据量,但计算受限',
                'medium': 'yolov8m - 大数据量,中等模型',
                'high': 'yolov8l - 大数据量,大模型',
            }
            fallback = 'yolov8x - 最大性能需求'

        recommendations = [by_budget.get(compute_budget, fallback)]

        # Extra note based on the number of classes
        if target_classes > 80:
            recommendations.append('建议:类别数多,考虑使用更大的模型')
        elif target_classes < 10:
            recommendations.append('建议:类别数少,可以使用较小的模型')

        return recommendations

    def create_transfer_config(self, pretrained_model, freeze_layers=None):
        """Return a config fragment for fine-tuning *pretrained_model*.

        Args:
            pretrained_model: Weights file name stored under ``'model'``.
            freeze_layers: Value stored under ``'freeze'``. A truthy value
                selects a lower learning rate and shorter warm-up; ``None``
                (or 0) selects the standard ones.
        """
        config = {
            'model': pretrained_model,
            'pretrained': True,
            'freeze': freeze_layers,
        }
        if freeze_layers:
            config.update({'lr0': 0.001, 'warmup_epochs': 1.0})
        else:
            config.update({'lr0': 0.01, 'warmup_epochs': 3.0})
        return config

    def gradual_unfreezing_schedule(self, total_epochs):
        """Split *total_epochs* into three stages with decreasing freezing.

        The first two stages take a quarter each (rounded down); the last
        stage takes whatever remains, so the stage lengths always add up to
        *total_epochs*.

        Returns:
            A list of three dicts with keys ``'epochs'``, ``'freeze'``,
            ``'lr'`` and ``'description'``.
        """
        quarter = total_epochs // 4
        remainder = total_epochs - 2 * quarter
        return [
            {
                'epochs': quarter,
                'freeze': 10,  # freeze the first 10 layers
                'lr': 0.001,
                'description': '冻结骨干网络,训练检测头',
            },
            {
                'epochs': quarter,
                'freeze': 5,  # freeze only the first 5 layers
                'lr': 0.0005,
                'description': '部分解冻,精调后层',
            },
            {
                'epochs': remainder,
                'freeze': None,  # nothing frozen
                'lr': 0.0001,
                'description': '完全解冻,端到端精调',
            },
        ]

    def domain_adaptation_config(self, source_domain, target_domain):
        """Return adaptation settings for a (source, target) domain pair.

        Known pairs are ``('general', 'medical')``, ``('general',
        'industrial')`` and ``('general', 'aerial')``; every other pair gets
        the default settings.
        """
        adaptation_strategies = {
            ('general', 'medical'): {
                'data_augmentation': ['contrast', 'brightness', 'gaussian_blur'],
                'loss_weights': {'box': 7.5, 'cls': 1.0, 'dfl': 1.5},
                'learning_rate': 0.001,
                'freeze_backbone': True,
            },
            ('general', 'industrial'): {
                'data_augmentation': ['rotation', 'scale', 'noise'],
                'loss_weights': {'box': 10.0, 'cls': 0.5, 'dfl': 2.0},
                'learning_rate': 0.005,
                'freeze_backbone': False,
            },
            ('general', 'aerial'): {
                'data_augmentation': ['rotation', 'scale', 'flip'],
                'loss_weights': {'box': 5.0, 'cls': 1.5, 'dfl': 1.0},
                'learning_rate': 0.01,
                'freeze_backbone': False,
            },
        }
        default_strategy = {
            'data_augmentation': ['horizontal_flip', 'scale', 'brightness'],
            'loss_weights': {'box': 7.5, 'cls': 0.5, 'dfl': 1.5},
            'learning_rate': 0.01,
            'freeze_backbone': False,
        }
        return adaptation_strategies.get((source_domain, target_domain), default_strategy)
class FineTuningStrategies:
    """Static factories that describe fine-tuning learning-rate strategies."""

    @staticmethod
    def discriminative_learning_rates(base_lr=0.01, backbone_ratio=0.1, neck_ratio=0.5):
        """Return per-part learning rates derived from *base_lr*.

        The head uses *base_lr* unchanged; backbone and neck use it scaled
        by their respective ratios.
        """
        rates = {}
        rates['backbone_lr'] = base_lr * backbone_ratio
        rates['neck_lr'] = base_lr * neck_ratio
        rates['head_lr'] = base_lr
        rates['description'] = '骨干网络用较小学习率,检测头用较大学习率'
        return rates

    @staticmethod
    def layer_wise_decay(base_lr=0.01, decay_rate=0.9, num_layers=24):
        """Return one learning rate per layer, growing towards the last layer.

        Layer ``i`` gets ``base_lr * decay_rate ** (num_layers - i - 1)``, so
        the final layer receives *base_lr* itself.
        """
        # Exponents run from num_layers - 1 down to 0, one per layer.
        layer_lrs = [
            base_lr * (decay_rate ** exponent)
            for exponent in range(num_layers - 1, -1, -1)
        ]
        return {
            'layer_learning_rates': layer_lrs,
            'description': '深层用较大学习率,浅层用较小学习率',
        }

    @staticmethod
    def cosine_restart_schedule(T_0=10, T_mult=2, eta_min=1e-6, eta_max=1e-2):
        """Return the settings of a cosine schedule with restarts as a dict."""
        return dict(
            scheduler='cosine_restart',
            T_0=T_0,
            T_mult=T_mult,
            eta_min=eta_min,
            eta_max=eta_max,
            description='周期性重启学习率,避免局部最优',
        )
# Usage example
tl_manager = TransferLearningManager()
# Get model recommendations
recommendations = tl_manager.select_pretrained_model(
target_classes=20,
data_size=5000,
compute_budget='medium'
)
print("迁移学习模型推荐:")
for rec in recommendations:
print(f" • {rec}")
# Create a transfer-learning configuration
transfer_config = tl_manager.create_transfer_config('yolov8m.pt', freeze_layers=5)
print(f"\n迁移学习配置: {transfer_config}")
# Gradual unfreezing plan
schedule = tl_manager.gradual_unfreezing_schedule(total_epochs=100)
print(f"\n渐进式解冻计划:")
for i, stage in enumerate(schedule, 1):
print(f" 阶段{i}: {stage['description']}")
print(f" Epochs: {stage['epochs']}, Freeze: {stage['freeze']}, LR: {stage['lr']}")
9.6 高级训练技巧
class AdvancedTrainingTechniques:
    """Descriptive configuration dictionaries for advanced training techniques.

    Each method returns a fresh ``dict`` of settings and explanatory text;
    none of them touches a model.
    """

    # Default input sizes for multi_scale_training. Kept as an immutable
    # tuple so that callers cannot alter the default by mutating a result.
    _DEFAULT_SCALES = (320, 352, 384, 416, 448, 480, 512, 544, 576, 608, 640)

    def __init__(self):
        pass

    def mixed_precision_training(self):
        """Return the mixed-precision settings with benefits and requirements."""
        return {
            'amp': True,  # enable automatic mixed precision
            'description': '使用FP16和FP32混合精度,加速训练并节省显存',
            'benefits': [
                '训练速度提升1.5-2倍',
                '显存使用减少约50%',
                '几乎不损失精度',
            ],
            'requirements': [
                'GPU支持Tensor Cores (V100, RTX系列)',
                'PyTorch 1.6+',
                'CUDA 10.2+',
            ],
        }

    def exponential_moving_average(self, decay=0.9999):
        """Return the EMA settings for the given *decay*."""
        return {
            'ema_decay': decay,
            'description': '使用EMA平滑模型权重,提升泛化性能',
            'implementation': (
                '\n'
                '# EMA更新公式\n'
                'ema_weights = decay * ema_weights + (1 - decay) * current_weights\n'
            ),
            'benefits': [
                '减少模型权重抖动',
                '提升验证集性能',
                '更稳定的收敛',
            ],
        }

    def gradient_clipping(self, max_norm=10.0):
        """Return the gradient-clipping settings for the given *max_norm*."""
        return {
            'max_norm': max_norm,
            'description': '限制梯度范数,防止梯度爆炸',
            'when_to_use': [
                '梯度范数经常>10',
                '损失出现NaN或Inf',
                '训练不稳定',
            ],
        }

    def knowledge_distillation_setup(self, teacher_model, temperature=4.0, alpha=0.7):
        """Return the distillation settings and a textual loss formula.

        Args:
            teacher_model: Stored unchanged under ``'teacher_model'``.
            temperature: Stored under ``'temperature'``.
            alpha: Stored under ``'alpha'``.
        """
        return {
            'teacher_model': teacher_model,
            'temperature': temperature,
            'alpha': alpha,
            'description': '使用大模型指导小模型训练',
            'loss_function': (
                '\n'
                'distillation_loss = alpha * KL_div(student_soft, teacher_soft) +\n'
                '(1 - alpha) * cross_entropy(student, targets)\n'
                'where:\n'
                'student_soft = softmax(student_logits / temperature)\n'
                'teacher_soft = softmax(teacher_logits / temperature)\n'
            ),
        }

    def multi_scale_training(self, scales=None):
        """Return the multi-scale training settings.

        Args:
            scales: Input sizes to choose from. ``None`` selects the default
                sizes 320 to 640 in steps of 32; every call then gets its own
                list. A value passed in is stored as given.
        """
        if scales is None:
            scales = list(self._DEFAULT_SCALES)
        return {
            'scales': scales,
            'description': '随机选择输入尺度,提升多尺度泛化能力',
            'strategy': {
                'random_scale': '每个batch随机选择一个尺度',
                'scheduled_scale': '按照计划改变尺度',
                'progressive_scale': '从小尺度逐渐增加到大尺度',
            },
        }

    def label_smoothing(self, smoothing=0.1):
        """Return the label-smoothing settings, including the rendered formula."""
        return {
            'label_smoothing': smoothing,
            'description': '软化one-hot标签,提高泛化性',
            'formula': f'y_smooth = (1 - {smoothing}) * y_hot + {smoothing} / num_classes',
            'benefits': [
                '减少过拟合',
                '提升模型校准',
                '增强泛化能力',
            ],
        }

    def focal_loss_config(self, alpha=0.25, gamma=2.0):
        """Return the focal-loss settings for the given *alpha* and *gamma*."""
        return {
            'alpha': alpha,
            'gamma': gamma,
            'description': '解决类别不平衡和困难样本问题',
            'formula': 'FL = -α(1-p)^γ * log(p)',
            'use_cases': [
                '类别严重不平衡',
                '存在大量简单负样本',
                '需要关注困难样本',
            ],
        }
class TrainingRecipes:
    """Ready-made training recipes, each returned as a plain ``dict``."""

    @staticmethod
    def _recipe(name, epochs, description, **settings):
        """Assemble a recipe: name and epochs first, description last."""
        recipe = {'name': name, 'epochs': epochs}
        recipe.update(settings)
        recipe['description'] = description
        return recipe

    @staticmethod
    def small_dataset_recipe(epochs=200):
        """Return the recipe for small datasets."""
        return TrainingRecipes._recipe(
            '小数据集训练配方',
            epochs,
            '适用于<1000张图像的小数据集',
            model='yolov8n.pt',
            batch_size=32,
            lr0=0.001,
            lrf=0.01,
            warmup_epochs=5.0,
            label_smoothing=0.1,
            mixup=0.5,
            copy_paste=0.3,
            mosaic=0.8,
            freeze=10,  # freeze the backbone
            data_augmentation='aggressive',
        )

    @staticmethod
    def large_dataset_recipe(epochs=100):
        """Return the recipe for large datasets."""
        return TrainingRecipes._recipe(
            '大数据集训练配方',
            epochs,
            '适用于>10000张图像的大数据集',
            model='yolov8l.pt',
            batch_size=16,
            lr0=0.01,
            lrf=0.001,
            warmup_epochs=3.0,
            label_smoothing=0.0,
            mixup=0.0,
            mosaic=1.0,
            freeze=None,  # nothing frozen
            amp=True,
        )

    @staticmethod
    def production_ready_recipe(epochs=150):
        """Return the recipe recommended for production training runs."""
        return TrainingRecipes._recipe(
            '生产环境训练配方',
            epochs,
            '生产环境推荐配置,平衡速度和精度',
            model='yolov8m.pt',
            batch_size=16,
            lr0=0.01,
            lrf=0.01,
            warmup_epochs=3.0,
            cos_lr=True,
            label_smoothing=0.05,
            amp=True,
            ema_decay=0.9999,
            save_period=10,
            val=True,
            plots=True,
            deterministic=True,
        )

    @staticmethod
    def quick_experiment_recipe(epochs=50):
        """Return the recipe for quick experiments."""
        return TrainingRecipes._recipe(
            '快速实验配方',
            epochs,
            '快速验证想法,适合超参数搜索',
            model='yolov8n.pt',
            batch_size=64,
            lr0=0.01,
            imgsz=416,  # smaller input size
            cache='ram',  # cache the dataset in memory
            workers=8,
            amp=True,
            val=False,  # skip validation to train faster
            plots=False,
        )
# Usage example
advanced_techniques = AdvancedTrainingTechniques()
recipes = TrainingRecipes()
print("高级训练技巧:")
print("=" * 40)
# Mixed-precision training
mp_config = advanced_techniques.mixed_precision_training()
print(f"\n混合精度训练: {mp_config['description']}")
for benefit in mp_config['benefits']:
print(f" • {benefit}")
# Exponential moving average
ema_config = advanced_techniques.exponential_moving_average()
print(f"\nEMA: {ema_config['description']}")
# Training recipes
print(f"\n训练配方示例:")
small_recipe = recipes.small_dataset_recipe()
print(f" {small_recipe['name']}: {small_recipe['description']}")
print("\n高级训练技巧准备完成")
9.7 章节总结
完成本章学习后,你应该能够:
- ✅ 配置和启动YOLO模型训练
- ✅ 理解和调优关键超参数
- ✅ 实施有效的训练监控
- ✅ 应用迁移学习策略
- ✅ 使用高级训练技巧
- ✅ 诊断和解决训练问题
- ✅ 选择适合的训练配方
def training_checklist():
    """Print the training checklist followed by the key success factors."""
    # (section title, checklist items), in the order they are printed.
    sections = (
        ("训练前准备", (
            "□ 数据集格式正确",
            "□ 数据质量检查通过",
            "□ 训练/验证集划分合理",
            "□ 硬件环境确认",
            "□ 依赖库版本兼容",
        )),
        ("配置设置", (
            "□ 模型大小选择合适",
            "□ 批大小和学习率匹配",
            "□ 数据增强策略确定",
            "□ 损失权重调整",
            "□ 监控工具配置",
        )),
        ("训练过程", (
            "□ 学习率调度合理",
            "□ 损失下降正常",
            "□ 验证指标提升",
            "□ 无过拟合迹象",
            "□ 定期保存检查点",
        )),
        ("训练完成", (
            "□ 模型性能满足要求",
            "□ 最佳权重已保存",
            "□ 训练日志完整",
            "□ 模型已导出部署格式",
            "□ 实验结果已记录",
        )),
    )
    success_factors = (
        "高质量的标注数据",
        "合适的模型大小",
        "恰当的超参数设置",
        "充分的训练时间",
        "持续的监控和调优",
    )

    print("YOLO训练检查清单:")
    print("=" * 40)
    for title, entries in sections:
        print(f"\n{title}:")
        print("\n".join(f" {entry}" for entry in entries))

    print("\n🎯 训练成功的关键因素:")
    print("\n".join(f" • {factor}" for factor in success_factors))
training_checklist()
print("\nYOLO模型训练实战完成!")
本章重点:掌握YOLO模型的完整训练流程,从环境搭建到高级技巧应用,建立系统性的训练和调优能力。
