Chapter 04: DSPy Optimizers and Compilation


Learning Objectives
  • Understand how the DSPy compilation process works
  • Learn Bootstrap Few-Shot optimizer
  • Master the use of LabeledFewShot optimizer
  • Explore COPRO optimizer functionality
  • Understand optimizer evaluation and tuning strategies

Key Concepts

1. Overview of DSPy Compilation Mechanism

DSPy’s compilation process is the core mechanism that turns a high-level DSPy program into optimized prompts and demonstrations. Given the program’s structure and a set of training examples, the compiler (an optimizer, historically called a teleprompter) automatically selects demonstrations and refines prompt templates and reasoning strategies.

Core Components of the Compilation Process

# Basic flow of the compilation process
import dspy

# 1. Define language model
# (dspy.OpenAI is the older client API; recent DSPy releases use dspy.LM('openai/gpt-3.5-turbo'))
lm = dspy.OpenAI(model='gpt-3.5-turbo')
dspy.settings.configure(lm=lm)

# 2. Define program
class BasicQA(dspy.Module):
    def __init__(self):
        super().__init__()
        self.generate_answer = dspy.ChainOfThought("question -> answer")

    def forward(self, question):
        return self.generate_answer(question=question)

# 3. Prepare training data
trainset = [
    dspy.Example(question="What is Python?", answer="Python is a programming language").with_inputs('question'),
    dspy.Example(question="What is AI?", answer="AI is artificial intelligence").with_inputs('question')
]

# 4. Configure optimizer with a simple containment metric
def qa_metric(example, pred, trace=None):
    return example.answer.lower() in pred.answer.lower()

optimizer = dspy.BootstrapFewShot(metric=qa_metric)

# 5. Compile program
compiled_qa = optimizer.compile(BasicQA(), trainset=trainset)
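
Once compiled, the program is called like any other DSPy module and can be persisted for reuse; a minimal sketch (save and load are standard dspy.Module methods, and the filename is only an illustrative placeholder):

# 6. Use and persist the compiled program
prediction = compiled_qa(question="What is DSPy?")
print(prediction.answer)

compiled_qa.save("compiled_qa.json")  # illustrative path; restore with: qa = BasicQA(); qa.load("compiled_qa.json")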

How the Compiler Works

# Example of compiler internal workflow
class CompilationProcess:
    """Demonstrates the internal mechanism of DSPy compilation process"""

    def __init__(self, program, optimizer, trainset):
        self.program = program
        self.optimizer = optimizer
        self.trainset = trainset
        self.compiled_program = None

    def analyze_program_structure(self):
        """Analyze program structure, identify optimizable components"""
        # dspy.Module exposes its dspy.Predict sub-modules via .predictors()
        return list(self.program.predictors())

    def generate_demonstrations(self, predictors):
        """Generate examples for each predictor"""
        demonstrations = {}

        for predictor in predictors:
            # Generate high-quality examples using training data
            demos = []
            for example in self.trainset[:5]:  # Select first 5 as examples
                try:
                    # Run the original program under the currently configured LM
                    with dspy.context(lm=dspy.settings.lm):
                        result = self.program(question=example.question)
                        if self.optimizer.metric(example, result):
                            demos.append({
                                'input': example.question,
                                'output': result.answer,
                                # ChainOfThought exposes its reasoning as 'reasoning' (older versions: 'rationale')
                                'reasoning': getattr(result, 'reasoning', getattr(result, 'rationale', ''))
                            })
                except Exception as e:
                    continue
            demonstrations[predictor] = demos

        return demonstrations

    def optimize_prompts(self, demonstrations):
        """Optimize prompt templates based on examples"""
        optimized_predictors = {}

        for predictor, demos in demonstrations.items():
            if demos:
                # Build few-shot demos as dspy.Example objects matching the question/answer fields used above
                few_shot_examples = []
                for demo in demos:
                    few_shot_examples.append(
                        dspy.Example(question=demo['input'], answer=demo['output'])
                    )

                # Create optimized predictor
                optimized_predictor = dspy.Predict(predictor.signature)
                optimized_predictor.demos = few_shot_examples
                optimized_predictors[predictor] = optimized_predictor

        return optimized_predictors

    def compile(self):
        """Execute complete compilation process"""
        print("Starting program compilation...")

        # 1. Analyze program structure
        predictors = self.analyze_program_structure()
        print(f"Found {len(predictors)} predictors")

        # 2. Generate examples
        demonstrations = self.generate_demonstrations(predictors)
        print(f"Generated {sum(len(demos) for demos in demonstrations.values())} examples")

        # 3. Optimize prompts
        optimized_predictors = self.optimize_prompts(demonstrations)
        print(f"Optimized {len(optimized_predictors)} predictors")

        # 4. Rebuild program
        self.compiled_program = self._rebuild_program_with_optimizations(optimized_predictors)
        print("Compilation complete!")

        return self.compiled_program

    def _rebuild_program_with_optimizations(self, optimized_predictors):
        """Copy the optimized demos back onto the program's predictors"""
        # Keys are the program's own predictor modules, so they can be updated in place
        for predictor, optimized in optimized_predictors.items():
            predictor.demos = optimized.demos
        return self.program

# Usage example
program = BasicQA()
optimizer = dspy.BootstrapFewShot(metric=qa_metric)
compiler = CompilationProcess(program, optimizer, trainset)
compiled_program = compiler.compile()

2. Bootstrap Few-Shot Optimizer

BootstrapFewShot is one of the most commonly used optimizers in DSPy. It "bootstraps" few-shot demonstrations by running a teacher model (or the student program itself) on training inputs and keeping only the traces that pass the metric.
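
Before the extended implementation below, here is a minimal sketch using the built-in dspy.BootstrapFewShot, reusing BasicQA, trainset, and qa_metric from section 1 (constructor defaults may differ slightly across DSPy versions):

bootstrap = dspy.BootstrapFewShot(
    metric=qa_metric,
    max_bootstrapped_demos=4,   # demos generated by running the program and keeping passing traces
    max_labeled_demos=16,       # demos taken directly from the labeled trainset
    max_rounds=1                # bootstrapping passes over the trainset
)
compiled_qa = bootstrap.compile(BasicQA(), trainset=trainset)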

Basic Principles and Usage

import dspy
import random
from typing import List, Callable

class AdvancedBootstrapFewShot:
    """Enhanced Bootstrap Few-Shot optimizer"""

    def __init__(self,
                 metric: Callable,
                 teacher: dspy.LM = None,
                 max_bootstrapped_demos: int = 4,
                 max_labeled_demos: int = 16,
                 max_rounds: int = 1,
                 num_candidate_programs: int = 16,
                 num_threads: int = 6):

        self.metric = metric
        self.teacher = teacher
        self.max_bootstrapped_demos = max_bootstrapped_demos
        self.max_labeled_demos = max_labeled_demos
        self.max_rounds = max_rounds
        self.num_candidate_programs = num_candidate_programs
        self.num_threads = num_threads

    def bootstrap_one_example(self, program, example):
        """Generate bootstrap demonstration for a single example"""
        try:
            # Use teacher model (if available) or current model to generate predictions
            with dspy.context(lm=self.teacher if self.teacher else dspy.settings.lm):
                prediction = program(**example.inputs())

                # Validate prediction quality
                if self.metric(example, prediction):
                    # Build a demonstration from the example's fields plus the model's outputs
                    demo = dspy.Example(**{**example.toDict(), **prediction.toDict()})
                    demo = demo.with_inputs(*example.inputs().keys())
                    return demo
        except Exception as e:
            print(f"Bootstrap failed: {e}")

        return None

    def bootstrap_examples(self, program, trainset):
        """Batch generate bootstrap examples"""
        bootstrapped_examples = []

        for example in trainset:
            if len(bootstrapped_examples) >= self.max_bootstrapped_demos:
                break

            demo = self.bootstrap_one_example(program, example)
            if demo:
                bootstrapped_examples.append(demo)
                print(f"Generated example {len(bootstrapped_examples)}/{self.max_bootstrapped_demos}")

        return bootstrapped_examples

    def evaluate_program(self, program, devset):
        """Evaluate program performance"""
        correct = 0
        total = len(devset)

        for example in devset:
            try:
                prediction = program(**example.inputs())
                if self.metric(example, prediction):
                    correct += 1
            except Exception:
                continue

        return correct / total if total > 0 else 0.0

    def compile(self, student_program, trainset, valset=None):
        """Compile student program"""
        if valset is None:
            # If no validation set is provided, split one off a shuffled copy of the training set
            trainset = list(trainset)
            random.shuffle(trainset)
            split_point = int(len(trainset) * 0.8)
            trainset, valset = trainset[:split_point], trainset[split_point:]

        print(f"Starting Bootstrap optimization, training set: {len(trainset)}, validation set: {len(valset)}")

        best_program = None
        best_score = 0.0

        for round_idx in range(self.max_rounds):
            print(f"\nOptimization round {round_idx + 1}")

            # Generate bootstrap examples
            bootstrapped_demos = self.bootstrap_examples(student_program, trainset)
            print(f"Generated {len(bootstrapped_demos)} bootstrap examples")

            # Generate multiple candidate programs
            candidate_programs = []

            for candidate_idx in range(self.num_candidate_programs):
                # Randomly select example subset
                selected_demos = random.sample(
                    bootstrapped_demos,
                    min(len(bootstrapped_demos), self.max_bootstrapped_demos)
                )

                # Create candidate program
                candidate = student_program.deepcopy()

                # Attach the selected demos to each of the candidate's predictors
                for predictor in candidate.predictors():
                    predictor.demos = selected_demos

                candidate_programs.append(candidate)

            # Evaluate all candidate programs
            print("Evaluating candidate programs...")
            for i, candidate in enumerate(candidate_programs):
                score = self.evaluate_program(candidate, valset)
                print(f"Candidate program {i+1}: {score:.3f}")

                if score > best_score:
                    best_score = score
                    best_program = candidate

        print(f"\nBest program score: {best_score:.3f}")
        return best_program

# Practical application example
class MathWordProblem(dspy.Module):
    """Math word problem solver"""

    def __init__(self):
        super().__init__()
        self.solve = dspy.ChainOfThought("problem -> reasoning, answer")

    def forward(self, problem):
        result = self.solve(problem=problem)
        return dspy.Prediction(
            reasoning=result.reasoning,
            answer=result.answer
        )

# Prepare data
math_trainset = [
    dspy.Example(
        problem="Xiao Ming has 5 apples, ate 2, how many are left?",
        answer="3"
    ).with_inputs('problem'),
    dspy.Example(
        problem="A class has 30 students, 12 are boys, how many are girls?",
        answer="18"
    ).with_inputs('problem'),
    # ... more examples
]

# Define evaluation metric
def math_metric(example, prediction, trace=None):
    """Evaluation metric for math problems"""
    try:
        # Extract numeric answer
        import re
        pred_numbers = re.findall(r'\d+', prediction.answer)
        true_numbers = re.findall(r'\d+', example.answer)

        if pred_numbers and true_numbers:
            return pred_numbers[-1] == true_numbers[-1]
    except Exception:
        pass
    return False

# Use Bootstrap optimizer
math_program = MathWordProblem()
optimizer = AdvancedBootstrapFewShot(
    metric=math_metric,
    max_bootstrapped_demos=6,
    num_candidate_programs=10
)

compiled_math_program = optimizer.compile(math_program, math_trainset)
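
A quick smoke test of the compiled solver (the exact wording of the output depends on the configured language model):

result = compiled_math_program(problem="A farmer has 12 eggs and sells 5. How many are left?")
print(result.reasoning)
print(result.answer)  # expected numeric answer: 7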

3. LabeledFewShot Optimizer

The LabeledFewShot optimizer improves a program simply by attaching pre-labeled training examples to its predictors as few-shot demonstrations; it makes no model calls during compilation.
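
For reference, the built-in version lives in dspy.teleprompt; a minimal sketch reusing BasicQA and trainset from section 1 (the import path and keyword names may vary slightly by DSPy version):

from dspy.teleprompt import LabeledFewShot

teleprompter = LabeledFewShot(k=8)  # attach up to 8 labeled demos per predictor
compiled_qa = teleprompter.compile(BasicQA(), trainset=trainset)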

class LabeledFewShotOptimizer:
    """Detailed implementation of Labeled Few-Shot optimizer"""

    def __init__(self, k: int = 16):
        self.k = k  # Number of examples to use

    def select_examples(self, trainset, program_signature):
        """Select the examples to use as demos.

        This sketch defaults to the diversity-based strategy; alternatives are
        plain random selection (random.sample(trainset, min(self.k, len(trainset))))
        and the difficulty-balanced strategy implemented below."""
        return self.select_diverse_examples(trainset)

    def select_diverse_examples(self, trainset):
        """Select diverse examples"""
        if len(trainset) <= self.k:
            return trainset

        selected = []
        remaining = trainset.copy()

        # First randomly select one
        first = random.choice(remaining)
        selected.append(first)
        remaining.remove(first)

        # Iteratively select most dissimilar examples
        while len(selected) < self.k and remaining:
            best_candidate = None
            best_diversity_score = -1

            for candidate in remaining:
                # Calculate diversity score with already selected examples
                diversity_score = self.calculate_diversity(candidate, selected)

                if diversity_score > best_diversity_score:
                    best_diversity_score = diversity_score
                    best_candidate = candidate

            if best_candidate:
                selected.append(best_candidate)
                remaining.remove(best_candidate)

        return selected

    def calculate_diversity(self, candidate, selected_examples):
        """Calculate diversity score of example"""
        if not selected_examples:
            return 1.0

        # Simple diversity measure based on text length and vocabulary
        candidate_words = set(candidate.question.lower().split())

        diversity_scores = []
        for selected in selected_examples:
            selected_words = set(selected.question.lower().split())

            # Jaccard distance as diversity measure
            intersection = len(candidate_words & selected_words)
            union = len(candidate_words | selected_words)

            if union == 0:
                diversity = 1.0
            else:
                diversity = 1.0 - (intersection / union)

            diversity_scores.append(diversity)

        # Return average diversity
        return sum(diversity_scores) / len(diversity_scores)

    def select_balanced_examples(self, trainset):
        """Select difficulty-balanced examples"""
        # Classify by problem complexity
        simple_examples = []
        medium_examples = []
        complex_examples = []

        for example in trainset:
            complexity = self.estimate_complexity(example)
            if complexity < 0.3:
                simple_examples.append(example)
            elif complexity < 0.7:
                medium_examples.append(example)
            else:
                complex_examples.append(example)

        # Balanced selection
        selected = []
        target_simple = self.k // 3
        target_medium = self.k // 3
        target_complex = self.k - target_simple - target_medium

        selected.extend(random.sample(simple_examples, min(target_simple, len(simple_examples))))
        selected.extend(random.sample(medium_examples, min(target_medium, len(medium_examples))))
        selected.extend(random.sample(complex_examples, min(target_complex, len(complex_examples))))

        # If insufficient, supplement from remaining
        while len(selected) < self.k and len(selected) < len(trainset):
            remaining = [ex for ex in trainset if ex not in selected]
            if remaining:
                selected.append(random.choice(remaining))

        return selected[:self.k]

    def estimate_complexity(self, example):
        """Estimate example complexity"""
        question_length = len(example.question.split())
        answer_length = len(example.answer.split())

        # Simple complexity estimation based on length and special characters
        complexity = (question_length + answer_length) / 50.0

        # Add weight for special patterns
        if any(word in example.question.lower() for word in ['why', 'how', 'explain']):
            complexity += 0.3

        if any(char in example.question for char in ['?', '!', ';']):
            complexity += 0.1

        return min(complexity, 1.0)

    def compile(self, student_program, trainset):
        """Compile program using labeled examples"""
        print(f"Using LabeledFewShot optimizer, training set size: {len(trainset)}")

        # Select best examples
        selected_examples = self.select_examples(trainset, None)
        print(f"Selected {len(selected_examples)} examples")

        # Create optimized program
        optimized_program = student_program.deepcopy()

        # Attach the selected examples to each predictor
        for predictor in optimized_program.predictors():
            predictor.demos = selected_examples
            print(f"Added {len(selected_examples)} examples to predictor")

        return optimized_program

# Usage example
class QuestionClassifier(dspy.Module):
    """Question classifier"""

    def __init__(self):
        super().__init__()
        self.classify = dspy.Predict("question -> category")

    def forward(self, question):
        result = self.classify(question=question)
        return result

# Prepare classification training data
classification_trainset = [
    dspy.Example(question="What is the weather today?", category="weather").with_inputs('question'),
    dspy.Example(question="How do I cook pasta?", category="cooking").with_inputs('question'),
    dspy.Example(question="What is machine learning?", category="technology").with_inputs('question'),
    dspy.Example(question="Where is Paris?", category="geography").with_inputs('question'),
    # ... more examples
]

# Use LabeledFewShot optimization
classifier = QuestionClassifier()
labeled_optimizer = LabeledFewShotOptimizer(k=8)
optimized_classifier = labeled_optimizer.compile(classifier, classification_trainset)

# Test optimized classifier
test_questions = [
    "How is the weather in Tokyo?",
    "Recipe for chocolate cake",
    "Explain neural networks"
]

for question in test_questions:
    result = optimized_classifier(question=question)
    print(f"Question: {question}")
    print(f"Category: {result.category}\n")

4. COPRO Optimizer

COPRO is an advanced DSPy optimizer focused on prompt optimization: it proposes reworded variants of each signature's instruction, scores them against the metric, and keeps the best variants over several rounds of coordinate-ascent-style search.
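
For reference, a minimal sketch of the built-in dspy.teleprompt.COPRO, reusing BasicQA, trainset, and qa_metric from section 1 (keyword names per the public API; eval_kwargs is forwarded to DSPy's evaluator and may vary by version):

from dspy.teleprompt import COPRO

copro = COPRO(metric=qa_metric, breadth=8, depth=3, init_temperature=1.4)
compiled_qa = copro.compile(
    BasicQA(),
    trainset=trainset,
    eval_kwargs=dict(num_threads=4, display_progress=False)
)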

class COPROOptimizer:
    """Implementation of COPRO optimizer"""

    def __init__(self,
                 metric,
                 breadth: int = 10,
                 depth: int = 3,
                 init_temperature: float = 1.4,
                 verbose: bool = False):

        self.metric = metric
        self.breadth = breadth  # Number of candidates generated each time
        self.depth = depth      # Number of optimization rounds
        self.init_temperature = init_temperature
        self.verbose = verbose

    def generate_instruction_variants(self, original_instruction, num_variants=10):
        """Generate instruction variants"""
        # Use language model to generate instruction variants
        variation_prompt = f"""
Given the following instruction, please generate {num_variants} functionally equivalent but differently worded variant instructions.
Requirements:
1. Keep the core purpose of the original instruction unchanged
2. Use different wording and expressions
3. Each variant on one line

Original instruction: {original_instruction}

Variant instructions:
"""

        # This needs to call the language model
        # For demonstration, we use predefined variants
        variants = [
            f"Please carefully analyze and {original_instruction.lower()}",
            f"Based on the given information, {original_instruction.lower()}",
            f"Using the following content, please {original_instruction.lower()}",
            f"Please provide detailed {original_instruction.lower()}",
            f"After careful consideration, {original_instruction.lower()}",
        ]

        return variants[:num_variants]

    def optimize_signature_instructions(self, program, trainset):
        """Optimize instructions in signatures"""
        optimized_predictors = {}

        # .predictors() yields the program's dspy.Predict sub-modules
        for module in program.predictors():
            if isinstance(module, dspy.Predict):
                signature = module.signature
                original_instructions = getattr(signature, 'instructions', '')

                if self.verbose:
                    print(f"Optimizing predictor instruction: {original_instructions}")

                best_instruction = original_instructions
                best_score = self.evaluate_instruction(
                    module, best_instruction, trainset
                )

                # Generate instruction variants
                instruction_variants = self.generate_instruction_variants(
                    original_instructions, self.breadth
                )

                # Test each variant
                for variant in instruction_variants:
                    score = self.evaluate_instruction(module, variant, trainset)

                    if self.verbose:
                        print(f"  Variant: {variant[:50]}... Score: {score:.3f}")

                    if score > best_score:
                        best_score = score
                        best_instruction = variant

                # Save best instruction
                optimized_predictors[module] = {
                    'instruction': best_instruction,
                    'score': best_score
                }

        return optimized_predictors

    def evaluate_instruction(self, predictor, instruction, examples):
        """Evaluate performance of specific instruction"""
        # Create temporary predictor
        temp_predictor = predictor.deepcopy()

        # Update instruction (in newer DSPy versions signatures are immutable; there you would use
        # temp_predictor.signature = temp_predictor.signature.with_instructions(instruction))
        if hasattr(temp_predictor.signature, 'instructions'):
            temp_predictor.signature.instructions = instruction

        # Evaluate on example subset
        correct = 0
        total = min(len(examples), 20)  # Limit evaluation count for speed

        for example in examples[:total]:
            try:
                prediction = temp_predictor(**example.inputs())
                if self.metric(example, prediction):
                    correct += 1
            except Exception as e:
                if self.verbose:
                    print(f"Evaluation error: {e}")
                continue

        return correct / total if total > 0 else 0.0

    def progressive_optimization(self, program, trainset):
        """Progressive optimization"""
        current_program = program.deepcopy()

        for depth_level in range(self.depth):
            print(f"\nCOPRO optimization round {depth_level + 1}/{self.depth}")

            # Optimize current program instructions
            optimizations = self.optimize_signature_instructions(
                current_program, trainset
            )

            # Apply optimizations that actually change the current instruction
            improvements = 0
            for module, optimization in optimizations.items():
                current_instruction = getattr(module.signature, 'instructions', '')
                if optimization['instruction'] != current_instruction:
                    if hasattr(module.signature, 'instructions'):
                        module.signature.instructions = optimization['instruction']
                    improvements += 1

            print(f"Round {depth_level + 1} optimized {improvements} modules")

            # If no improvement, end early
            if improvements == 0:
                print("No further improvement, optimization ended")
                break

        return current_program

    def compile(self, student_program, trainset, valset=None):
        """Compile student program"""
        print(f"Starting COPRO optimization")
        print(f"Training set size: {len(trainset)}")

        if valset is None:
            # Split a validation subset off a shuffled copy of the training set
            shuffled = list(trainset)
            random.shuffle(shuffled)
            split_point = int(len(shuffled) * 0.8)
            train_subset, val_subset = shuffled[:split_point], shuffled[split_point:]
        else:
            train_subset = trainset
            val_subset = valset

        # Execute progressive optimization
        optimized_program = self.progressive_optimization(student_program, train_subset)

        # Final evaluation
        if val_subset:
            final_score = self.evaluate_program(optimized_program, val_subset)
            print(f"Final validation score: {final_score:.3f}")

        return optimized_program

    def evaluate_program(self, program, examples):
        """Evaluate entire program"""
        correct = 0
        total = len(examples)

        for example in examples:
            try:
                prediction = program(**example.inputs())
                if self.metric(example, prediction):
                    correct += 1
            except Exception:
                continue

        return correct / total if total > 0 else 0.0

# Practical application example
class SentimentAnalyzer(dspy.Module):
    """Sentiment analyzer"""

    def __init__(self):
        super().__init__()
        self.analyze = dspy.ChainOfThought(
            dspy.Signature(
                "text -> reasoning, sentiment",
                "Analyze the sentiment of the given text. Consider context, tone, and emotional indicators."
            )
        )

    def forward(self, text):
        result = self.analyze(text=text)
        return dspy.Prediction(
            reasoning=result.reasoning,
            sentiment=result.sentiment
        )

# Prepare sentiment analysis data
sentiment_trainset = [
    dspy.Example(text="I love this movie! It's amazing!", sentiment="positive").with_inputs('text'),
    dspy.Example(text="This is the worst experience ever.", sentiment="negative").with_inputs('text'),
    dspy.Example(text="The weather is okay today.", sentiment="neutral").with_inputs('text'),
    # ... more examples
]

def sentiment_metric(example, prediction, trace=None):
    """Sentiment analysis evaluation metric"""
    return example.sentiment.lower() in prediction.sentiment.lower()

# Use COPRO optimizer
sentiment_analyzer = SentimentAnalyzer()
copro_optimizer = COPROOptimizer(
    metric=sentiment_metric,
    breadth=8,
    depth=3,
    verbose=True
)

optimized_analyzer = copro_optimizer.compile(sentiment_analyzer, sentiment_trainset)
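
As with the other compiled programs, the optimized analyzer is called directly (the exact output depends on the configured LM):

result = optimized_analyzer(text="The service was slow, but the food more than made up for it.")
print(result.reasoning)
print(result.sentiment)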

5. Optimizer Evaluation and Tuning Strategies
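
DSPy also ships a built-in evaluation harness, dspy.evaluate.Evaluate, which the hand-rolled evaluator below mirrors; a minimal sketch, assuming a compiled program plus the testset and simple_metric defined later in this section:

from dspy.evaluate import Evaluate

evaluate = Evaluate(devset=testset, metric=simple_metric, num_threads=4, display_progress=True)
score = evaluate(compiled_program)  # runs the program over devset and aggregates the metric
print(score)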

class OptimizerEvaluator:
    """Optimizer evaluation and comparison tool"""

    def __init__(self, base_program, trainset, testset):
        self.base_program = base_program
        self.trainset = trainset
        self.testset = testset
        self.results = {}

    def evaluate_optimizer(self, optimizer_name, optimizer, metric):
        """Evaluate single optimizer"""
        print(f"\nEvaluating optimizer: {optimizer_name}")

        # Record start time
        import time
        start_time = time.time()

        try:
            # Compile program
            compiled_program = optimizer.compile(self.base_program, self.trainset)
            compilation_time = time.time() - start_time

            # Evaluate performance
            test_score = self.evaluate_program(compiled_program, self.testset, metric)
            train_score = self.evaluate_program(compiled_program, self.trainset, metric)

            # Record results
            self.results[optimizer_name] = {
                'test_score': test_score,
                'train_score': train_score,
                'compilation_time': compilation_time,
                'overfitting': abs(train_score - test_score)
            }

            print(f"{optimizer_name}:")
            print(f"   Train score: {train_score:.3f}")
            print(f"   Test score: {test_score:.3f}")
            print(f"   Compilation time: {compilation_time:.2f}s")
            print(f"   Overfitting degree: {abs(train_score - test_score):.3f}")

        except Exception as e:
            print(f"{optimizer_name} evaluation failed: {e}")
            self.results[optimizer_name] = {
                'error': str(e)
            }

    def evaluate_program(self, program, examples, metric):
        """Evaluate program performance on dataset"""
        correct = 0
        total = len(examples)

        for example in examples:
            try:
                prediction = program(**example.inputs())
                if metric(example, prediction):
                    correct += 1
            except Exception:
                continue

        return correct / total if total > 0 else 0.0

    def compare_optimizers(self, optimizers_config, metric):
        """Compare multiple optimizers"""
        print("Starting optimizer performance comparison")

        for name, optimizer in optimizers_config.items():
            self.evaluate_optimizer(name, optimizer, metric)

        # Generate comparison report
        self.generate_comparison_report()

    def generate_comparison_report(self):
        """Generate comparison report"""
        print("\nOptimizer Comparison Report")
        print("=" * 60)

        # Sort by test score
        valid_results = {k: v for k, v in self.results.items() if 'error' not in v}

        if not valid_results:
            print("No successful optimizer results")
            return

        sorted_results = sorted(
            valid_results.items(),
            key=lambda x: x[1]['test_score'],
            reverse=True
        )

        print(f"{'Optimizer':<20} {'Test Score':<10} {'Train Score':<10} {'Compile Time':<10} {'Overfitting':<10}")
        print("-" * 60)

        for name, result in sorted_results:
            print(f"{name:<20} {result['test_score']:<10.3f} {result['train_score']:<10.3f} "
                  f"{result['compilation_time']:<10.2f} {result['overfitting']:<10.3f}")

        # Recommend best optimizer
        best_optimizer = sorted_results[0]
        print(f"\nRecommended optimizer: {best_optimizer[0]}")

        # Analyze results
        print("\nAnalysis:")
        if best_optimizer[1]['overfitting'] > 0.1:
            print("Warning: Best optimizer may have overfitting issues. Suggestions:")
            print("   - Increase training data")
            print("   - Use regularization techniques")
            print("   - Reduce model complexity")

        if best_optimizer[1]['compilation_time'] > 300:  # 5 minutes
            print("Warning: Compilation time is long. Suggestions:")
            print("   - Reduce number of candidate programs")
            print("   - Use smaller training set for quick iteration")
            print("   - Consider parallelizing optimization")

# Practical evaluation example
def comprehensive_optimizer_evaluation():
    """Comprehensive optimizer evaluation example"""

    # Prepare test program
    class TestProgram(dspy.Module):
        def __init__(self):
            super().__init__()
            self.generate = dspy.ChainOfThought("question -> answer")

        def forward(self, question):
            return self.generate(question=question)

    # Prepare data
    import random
    full_dataset = [
        dspy.Example(question="What is 2+2?", answer="4").with_inputs('question'),
        dspy.Example(question="What is the capital of France?", answer="Paris").with_inputs('question'),
        # ... more examples
    ]

    # Split data
    random.shuffle(full_dataset)
    train_size = int(len(full_dataset) * 0.6)
    test_size = int(len(full_dataset) * 0.2)

    trainset = full_dataset[:train_size]
    testset = full_dataset[train_size:train_size + test_size]

    # Define evaluation metric
    def simple_metric(example, prediction, trace=None):
        return example.answer.lower() in prediction.answer.lower()

    # Configure optimizers
    optimizers = {
        'Bootstrap': AdvancedBootstrapFewShot(metric=simple_metric, max_bootstrapped_demos=4),
        'LabeledFewShot': LabeledFewShotOptimizer(k=8),
        'COPRO': COPROOptimizer(metric=simple_metric, breadth=6, depth=2)
    }

    # Execute evaluation
    evaluator = OptimizerEvaluator(TestProgram(), trainset, testset)
    evaluator.compare_optimizers(optimizers, simple_metric)

    return evaluator.results

# Run evaluation
# evaluation_results = comprehensive_optimizer_evaluation()

Practice Exercises

Exercise 1: Custom Optimizer

class CustomOptimizer:
    """Custom optimizer exercise"""

    def __init__(self, metric, strategy='random'):
        self.metric = metric
        self.strategy = strategy  # 'random', 'similarity', 'difficulty'

    def compile(self, program, trainset):
        """Implement your optimization strategy"""
        # TODO: Implement custom optimization logic
        pass

# Exercise tasks:
# 1. Implement similarity-based example selection strategy
# 2. Implement difficulty-based incremental example selection strategy
# 3. Compare performance differences of different strategies

Exercise 2: Multi-Objective Optimization

class MultiObjectiveOptimizer:
    """Multi-objective optimizer exercise"""

    def __init__(self, metrics_config):
        """
        metrics_config: {
            'accuracy': {'metric': accuracy_func, 'weight': 0.6},
            'speed': {'metric': speed_func, 'weight': 0.2},
            'robustness': {'metric': robustness_func, 'weight': 0.2}
        }
        """
        self.metrics_config = metrics_config

    def compile(self, program, trainset):
        """Implement multi-objective optimization"""
        # TODO: Implement optimization considering multiple metrics
        pass

# Exercise tasks:
# 1. Design multiple evaluation metrics
# 2. Implement Pareto optimal solution selection
# 3. Analyze the impact of different weight configurations

Best Practices

1. Optimizer Selection Guide

def select_optimizer_guide():
    """Optimizer selection guide"""

    guidelines = {
        'Bootstrap Few-Shot': {
            'Applicable Scenarios': [
                'Medium data size (100-1000 examples)',
                'Need to automatically generate high-quality examples',
                'Medium task complexity'
            ],
            'Advantages': [
                'High automation',
                'Usually achieves good performance',
                'No need for manual annotation of examples'
            ],
            'Disadvantages': [
                'Higher computational cost',
                'May overfit',
                'Needs good initial program'
            ]
        },

        'Labeled Few-Shot': {
            'Applicable Scenarios': [
                'Have high-quality annotated data',
                'Smaller data size (<100 examples)',
                'Need quick prototype validation'
            ],
            'Advantages': [
                'Simple and direct',
                'Low computational cost',
                'Strong controllability'
            ],
            'Disadvantages': [
                'Requires manual example selection',
                'Performance ceiling limited by example quality',
                'Cannot adaptively optimize'
            ]
        },

        'COPRO': {
            'Applicable Scenarios': [
                'Prompt-sensitive tasks',
                'Need fine-tuning',
                'Have sufficient computational resources'
            ],
            'Advantages': [
                'Can optimize prompts themselves',
                'Theoretically higher performance ceiling',
                'Suitable for complex reasoning tasks'
            ],
            'Disadvantages': [
                'Highest computational overhead',
                'Complex tuning',
                'May be unstable'
            ]
        }
    }

    return guidelines

# Decision tree
def choose_optimizer(data_size, quality, compute_budget, task_complexity):
    """Optimizer selection decision tree"""

    if compute_budget == 'low':
        return 'LabeledFewShot'
    elif data_size < 50:
        return 'LabeledFewShot'
    elif data_size > 500 and compute_budget == 'high':
        if task_complexity == 'high':
            return 'COPRO'
        else:
            return 'Bootstrap'
    else:
        return 'Bootstrap'
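
For example (the returned name is just a label for the guidelines above):

# ~200 labeled examples, a medium compute budget, and a moderately complex task
print(choose_optimizer(data_size=200, quality='high',
                       compute_budget='medium', task_complexity='medium'))
# -> 'Bootstrap'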

2. Performance Monitoring and Debugging

class OptimizationMonitor:
    """Optimization process monitor"""

    def __init__(self):
        self.metrics_history = []
        self.timing_info = {}

    def monitor_compilation(self, optimizer, program, trainset):
        """Monitor compilation process"""
        import time
        import memory_profiler  # third-party package: pip install memory-profiler

        start_time = time.time()
        start_memory = memory_profiler.memory_usage()[0]

        # Execute compilation
        compiled_program = optimizer.compile(program, trainset)

        end_time = time.time()
        end_memory = memory_profiler.memory_usage()[0]

        # Record performance metrics
        self.timing_info = {
            'compilation_time': end_time - start_time,
            'memory_usage': end_memory - start_memory,
            'trainset_size': len(trainset)
        }

        print(f"Compilation performance:")
        print(f"   Time: {self.timing_info['compilation_time']:.2f}s")
        print(f"   Memory: {self.timing_info['memory_usage']:.2f}MB")

        return compiled_program

    def track_convergence(self, scores_by_iteration):
        """Track convergence"""
        import matplotlib.pyplot as plt

        plt.figure(figsize=(10, 6))
        plt.plot(scores_by_iteration)
        plt.title('Optimizer Convergence Curve')
        plt.xlabel('Iteration')
        plt.ylabel('Performance Score')
        plt.grid(True)
        plt.show()

        # Detect convergence
        if len(scores_by_iteration) > 5:
            recent_improvement = scores_by_iteration[-1] - scores_by_iteration[-5]
            if recent_improvement < 0.01:
                print("Warning: Optimizer may have converged, suggest stopping iterations")

# Use monitor
monitor = OptimizationMonitor()
# compiled_program = monitor.monitor_compilation(optimizer, program, trainset)

This chapter covered the principles and usage of DSPy's main optimizers. Optimizers are a core feature of the DSPy framework: they improve program performance automatically. In practice, choose an optimization strategy based on the characteristics of your task, the scale of your data, and the computational resources available.