Chapter 12: Model Evaluation and Risk Management


Learning Objectives
  • Master diagnostic methods for Markov models
  • Perform model robustness testing
  • Implement dynamic risk measurement
  • Build model risk management framework

Knowledge Summary

1. Model Validation Framework

(Diagram: model validation framework)

2. Key Evaluation Metrics

Statistical Metrics:

  • Log-likelihood: $\ell = \sum_{t=1}^T \log P(y_t | y_{t-1}, \theta)$
  • AIC: $AIC = -2\ell + 2k$
  • BIC: $BIC = -2\ell + k\log T$
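
A minimal sketch of computing these criteria for a fitted first-order Markov chain is shown below; information_criteria, states, and P_hat are illustrative names, and every observed transition is assumed to have positive estimated probability.

import numpy as np

def information_criteria(states, P_hat):
    """Log-likelihood, AIC and BIC for a first-order Markov chain."""
    # Sum of log transition probabilities along the observed path
    ll = sum(np.log(P_hat[states[t], states[t + 1]]) for t in range(len(states) - 1))
    n_states = P_hat.shape[0]
    k = n_states * (n_states - 1)   # free parameters: each row of P sums to 1
    T = len(states) - 1             # number of observed transitions
    return ll, -2 * ll + 2 * k, -2 * ll + k * np.log(T)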

Prediction Accuracy:

  • RMSE: $\sqrt{\frac{1}{T}\sum_{t=1}^T (y_t - \hat{y}_t)^2}$
  • MAE: $\frac{1}{T}\sum_{t=1}^T |y_t - \hat{y}_t|$
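
Both are one-liners in NumPy; a small sketch is given below (equivalent helpers are available in sklearn.metrics).

import numpy as np

def rmse(y_true, y_pred):
    """Root mean squared error"""
    return np.sqrt(np.mean((np.asarray(y_true) - np.asarray(y_pred)) ** 2))

def mae(y_true, y_pred):
    """Mean absolute error"""
    return np.mean(np.abs(np.asarray(y_true) - np.asarray(y_pred)))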

Risk Metrics:

  • VaR: $P(L > VaR_\alpha) = 1 - \alpha$, where $\alpha$ is the confidence level
  • ES: $ES_\alpha = E[L | L > VaR_\alpha]$
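
A minimal historical-simulation sketch of both measures, assuming returns is an array of portfolio returns and alpha is the confidence level; the RiskManager class in Example 3 implements the same idea.

import numpy as np

def historical_var_es(returns, alpha=0.95):
    """Historical VaR and expected shortfall at confidence level alpha (return convention)."""
    returns = np.asarray(returns)
    var = np.percentile(returns, (1 - alpha) * 100)  # lower-tail return quantile
    tail = returns[returns <= var]                   # outcomes at least as bad as VaR
    es = tail.mean() if tail.size > 0 else var
    return var, es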

Example Code

Example 1: Model Diagnostics and Validation

import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.metrics import mean_squared_error, mean_absolute_error
import pandas as pd

class ModelValidator:
    """Markov model validator"""

    def __init__(self, model):
        self.model = model
        self.validation_results = {}

    def goodness_of_fit_test(self, observed_transitions, expected_transitions):
        """Goodness of fit test"""
        # Chi-square test (simplified: dof = number of cells - 1; for a fully
        # specified n-state transition matrix the usual dof is n*(n-1))
        chi2_stat = np.sum((observed_transitions - expected_transitions)**2 / expected_transitions)
        dof = len(observed_transitions.flatten()) - 1
        p_value = 1 - stats.chi2.cdf(chi2_stat, dof)

        return {
            'chi2_statistic': chi2_stat,
            'p_value': p_value,
            'degrees_of_freedom': dof
        }

    def ljung_box_test(self, residuals, lags=10):
        """Ljung-Box autocorrelation test"""
        from statsmodels.stats.diagnostic import acorr_ljungbox
        result = acorr_ljungbox(residuals, lags=lags, return_df=True)
        return result

    def arch_test(self, residuals, lags=5):
        """ARCH effect test"""
        squared_residuals = residuals**2
        from statsmodels.tsa.stattools import acf
        autocorrs = acf(squared_residuals, nlags=lags, fft=True)[1:]

        # Simplified ARCH test statistic
        n = len(residuals)
        lm_stat = n * np.sum(autocorrs**2)
        p_value = 1 - stats.chi2.cdf(lm_stat, lags)

        return {
            'lm_statistic': lm_stat,
            'p_value': p_value,
            'autocorrelations': autocorrs
        }

# Example: Model validation
np.random.seed(42)

# Generate test data
def generate_test_data(n_samples=1000):
    # True 2-state Markov chain
    true_P = np.array([[0.8, 0.2], [0.3, 0.7]])
    states = [0]

    for _ in range(n_samples-1):
        current_state = states[-1]
        next_state = np.random.choice([0, 1], p=true_P[current_state])
        states.append(next_state)

    return np.array(states), true_P

states, true_P = generate_test_data(2000)

# Estimate transition matrix
def estimate_transition_matrix(states):
    n_states = len(np.unique(states))
    transition_counts = np.zeros((n_states, n_states))

    for t in range(len(states)-1):
        transition_counts[states[t], states[t+1]] += 1

    row_sums = transition_counts.sum(axis=1, keepdims=True)
    estimated_P = transition_counts / row_sums

    return estimated_P, transition_counts

estimated_P, transition_counts = estimate_transition_matrix(states[:-500])
test_states = states[-500:]

print("Model Validation Analysis:")
print(f"True transition matrix:\n{true_P}")
print(f"Estimated transition matrix:\n{estimated_P}")
print(f"Estimation error:\n{np.abs(true_P - estimated_P)}")

# Create validator (these diagnostics do not require a fitted model object)
validator = ModelValidator(None)

# Goodness of fit test: compare observed transition counts with the counts
# expected under the hypothesized (true) transition matrix. Using estimated_P
# here would reproduce the observed counts exactly and make the test vacuous.
train_states = states[:-500]
state_visits = np.array([np.sum(train_states[:-1] == i) for i in range(2)])
expected_counts = state_visits[:, None] * true_P

gof_result = validator.goodness_of_fit_test(
    transition_counts.flatten(),
    expected_counts.flatten()
)

print(f"\nGoodness of Fit Test:")
print(f"Chi-square statistic: {gof_result['chi2_statistic']:.4f}")
print(f"p-value: {gof_result['p_value']:.4f}")

Example 2: Out-of-Sample Performance Evaluation

class OutOfSampleTester:
    """Out-of-sample testing"""

    def rolling_window_validation(self, data, model_func, window_size=500, step_size=50):
        """Rolling window validation"""
        results = []

        for start in range(0, len(data) - window_size - step_size, step_size):
            # Training window
            train_data = data[start:start + window_size]
            # Testing window
            test_data = data[start + window_size:start + window_size + step_size]

            # Train model
            model = model_func(train_data)

            # Predict
            predictions = self.predict_sequence(model, test_data)

            # Evaluate
            accuracy = np.mean(predictions == test_data[1:])
            log_likelihood = self.calculate_log_likelihood(model, test_data)

            results.append({
                'start_idx': start,
                'accuracy': accuracy,
                'log_likelihood': log_likelihood,
                'n_test': len(test_data) - 1
            })

        return pd.DataFrame(results)

    def predict_sequence(self, transition_matrix, test_sequence):
        """Predict state sequence"""
        predictions = []

        for t in range(len(test_sequence) - 1):
            current_state = test_sequence[t]
            # Predict next state (choose most probable state)
            next_state_probs = transition_matrix[current_state]
            predicted_state = np.argmax(next_state_probs)
            predictions.append(predicted_state)

        return np.array(predictions)

    def calculate_log_likelihood(self, transition_matrix, sequence):
        """Calculate log-likelihood"""
        log_likelihood = 0

        for t in range(len(sequence) - 1):
            current_state = sequence[t]
            next_state = sequence[t + 1]
            prob = transition_matrix[current_state, next_state]
            if prob > 0:
                log_likelihood += np.log(prob)
            else:
                log_likelihood += -np.inf

        return log_likelihood

# Out-of-sample testing
def simple_model_func(data):
    estimated_P, _ = estimate_transition_matrix(data)
    return estimated_P

tester = OutOfSampleTester()
oos_results = tester.rolling_window_validation(
    states, simple_model_func, window_size=300, step_size=50
)

print(f"\nOut-of-Sample Test Results:")
print(f"Average accuracy: {oos_results['accuracy'].mean():.3f}")
print(f"Accuracy standard deviation: {oos_results['accuracy'].std():.3f}")
print(f"Average log-likelihood: {oos_results['log_likelihood'].mean():.2f}")

# Visualize out-of-sample performance
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Accuracy over time
axes[0, 0].plot(oos_results['start_idx'], oos_results['accuracy'], 'b-o', markersize=4)
axes[0, 0].set_title('Out-of-Sample Prediction Accuracy')
axes[0, 0].set_xlabel('Starting Position')
axes[0, 0].set_ylabel('Accuracy')
axes[0, 0].grid(True, alpha=0.3)

# Log-likelihood over time
axes[0, 1].plot(oos_results['start_idx'], oos_results['log_likelihood'], 'r-o', markersize=4)
axes[0, 1].set_title('Out-of-Sample Log-Likelihood')
axes[0, 1].set_xlabel('Starting Position')
axes[0, 1].set_ylabel('Log-Likelihood')
axes[0, 1].grid(True, alpha=0.3)

# Accuracy distribution
axes[1, 0].hist(oos_results['accuracy'], bins=15, alpha=0.7, edgecolor='black')
axes[1, 0].axvline(oos_results['accuracy'].mean(), color='red', linestyle='--',
                   label=f'Mean: {oos_results["accuracy"].mean():.3f}')
axes[1, 0].set_title('Accuracy Distribution')
axes[1, 0].set_xlabel('Accuracy')
axes[1, 0].set_ylabel('Frequency')
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=0.3)

# Performance stability
rolling_mean = oos_results['accuracy'].rolling(window=5, center=True).mean()
rolling_std = oos_results['accuracy'].rolling(window=5, center=True).std()

axes[1, 1].plot(oos_results['start_idx'], oos_results['accuracy'], 'b-', alpha=0.3, label='Raw')
axes[1, 1].plot(oos_results['start_idx'], rolling_mean, 'r-', linewidth=2, label='5-period Moving Average')
axes[1, 1].fill_between(oos_results['start_idx'],
                        rolling_mean - rolling_std,
                        rolling_mean + rolling_std,
                        alpha=0.2, color='red', label='±1 Std Dev')
axes[1, 1].set_title('Prediction Performance Stability')
axes[1, 1].set_xlabel('Starting Position')
axes[1, 1].set_ylabel('Accuracy')
axes[1, 1].legend()
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

Example 3: Risk Management Framework

class RiskManager:
    """Risk management framework"""

    def __init__(self, confidence_level=0.95):
        self.confidence_level = confidence_level
        self.risk_metrics = {}

    def calculate_var(self, returns, method='historical'):
        """Calculate VaR as a return quantile (negative values indicate losses)"""
        if method == 'historical':
            var = np.percentile(returns, (1 - self.confidence_level) * 100)
        elif method == 'parametric':
            mean_return = np.mean(returns)
            std_return = np.std(returns)
            var = mean_return - stats.norm.ppf(self.confidence_level) * std_return
        else:
            raise ValueError("Method must be 'historical' or 'parametric'")

        return var

    def calculate_expected_shortfall(self, returns):
        """Calculate expected shortfall"""
        var = self.calculate_var(returns, method='historical')
        tail_losses = returns[returns <= var]
        if len(tail_losses) > 0:
            es = np.mean(tail_losses)
        else:
            es = var
        return es

    def stress_testing(self, model, base_scenario, stress_scenarios):
        """Stress testing"""
        results = {}

        # Base scenario
        base_returns = self.simulate_returns(model, base_scenario)
        results['base'] = {
            'var': self.calculate_var(base_returns),
            'es': self.calculate_expected_shortfall(base_returns),
            'returns': base_returns
        }

        # Stress scenarios
        for scenario_name, scenario_params in stress_scenarios.items():
            stress_returns = self.simulate_returns(model, scenario_params)
            results[scenario_name] = {
                'var': self.calculate_var(stress_returns),
                'es': self.calculate_expected_shortfall(stress_returns),
                'returns': stress_returns
            }

        return results

    def simulate_returns(self, transition_matrix, scenario_params):
        """Simulate returns"""
        n_simulations = scenario_params.get('n_simulations', 1000)
        n_periods = scenario_params.get('n_periods', 252)
        state_returns = scenario_params.get('state_returns', [-0.02, 0.015])
        initial_state = scenario_params.get('initial_state', 0)

        all_returns = []

        for _ in range(n_simulations):
            returns = []
            current_state = initial_state

            for _ in range(n_periods):
                # Generate returns
                state_return = state_returns[current_state]
                noise = np.random.normal(0, 0.01)  # Add noise
                period_return = state_return + noise
                returns.append(period_return)

                # Transition to next state
                next_state = np.random.choice(len(state_returns),
                                            p=transition_matrix[current_state])
                current_state = next_state

            all_returns.extend(returns)

        return np.array(all_returns)

    def model_risk_assessment(self, models, test_data):
        """Model risk assessment"""
        model_results = {}

        for model_name, model in models.items():
            # Predict
            predictions = self.predict_with_model(model, test_data)

            # Calculate accuracy
            accuracy = np.mean(predictions == test_data[1:])

            # Calculate log-likelihood
            log_likelihood = 0
            for t in range(len(test_data) - 1):
                current_state = test_data[t]
                next_state = test_data[t + 1]
                prob = model[current_state, next_state]
                if prob > 0:
                    log_likelihood += np.log(prob)

            model_results[model_name] = {
                'accuracy': accuracy,
                'log_likelihood': log_likelihood,
                'predictions': predictions
            }

        return model_results

    def predict_with_model(self, transition_matrix, sequence):
        """Predict using model"""
        predictions = []

        for t in range(len(sequence) - 1):
            current_state = sequence[t]
            next_state_probs = transition_matrix[current_state]
            predicted_state = np.argmax(next_state_probs)
            predictions.append(predicted_state)

        return np.array(predictions)

# Risk management analysis
risk_manager = RiskManager(confidence_level=0.95)

# Define scenarios
base_scenario = {
    'n_simulations': 1000,
    'n_periods': 252,
    'state_returns': [-0.01, 0.008],  # Normal market
    'initial_state': 1
}

stress_scenarios = {
    'market_crash': {
        'n_simulations': 1000,
        'n_periods': 252,
        'state_returns': [-0.05, -0.02],  # Market crash
        'initial_state': 0
    },
    'high_volatility': {
        'n_simulations': 1000,
        'n_periods': 252,
        'state_returns': [-0.03, 0.025],  # High volatility
        'initial_state': 0
    }
}

# Execute stress testing
stress_results = risk_manager.stress_testing(estimated_P, base_scenario, stress_scenarios)

print(f"\nRisk Management Analysis:")
print("=" * 50)
for scenario, results in stress_results.items():
    print(f"\n{scenario.upper()} Scenario:")
    print(f"  95% VaR: {results['var']:.4f}")
    print(f"  Expected Shortfall: {results['es']:.4f}")
    print(f"  Return mean: {np.mean(results['returns']):.4f}")
    print(f"  Return std dev: {np.std(results['returns']):.4f}")

# Model risk assessment
models = {
    'estimated_model': estimated_P,
    'true_model': true_P,
    'naive_model': np.array([[0.5, 0.5], [0.5, 0.5]])  # Naive model
}

model_risk_results = risk_manager.model_risk_assessment(models, test_states)

print(f"\nModel Risk Assessment:")
for model_name, results in model_risk_results.items():
    print(f"{model_name}:")
    print(f"  Accuracy: {results['accuracy']:.3f}")
    print(f"  Log-likelihood: {results['log_likelihood']:.2f}")

# Visualize risk analysis
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# VaR comparison
scenarios = list(stress_results.keys())
var_values = [stress_results[s]['var'] for s in scenarios]
colors = ['blue', 'red', 'orange']

axes[0, 0].bar(scenarios, var_values, color=colors, alpha=0.7)
axes[0, 0].set_title('VaR Under Different Scenarios')
axes[0, 0].set_ylabel('VaR')
axes[0, 0].tick_params(axis='x', rotation=45)
axes[0, 0].grid(True, alpha=0.3, axis='y')

# Return distribution comparison
for i, (scenario, color) in enumerate(zip(scenarios, colors)):
    returns = stress_results[scenario]['returns']
    axes[0, 1].hist(returns, bins=50, alpha=0.6, density=True,
                   color=color, label=scenario)

axes[0, 1].set_title('Return Distribution Under Different Scenarios')
axes[0, 1].set_xlabel('Return')
axes[0, 1].set_ylabel('Density')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)

# Model performance comparison
model_names = list(model_risk_results.keys())
accuracies = [model_risk_results[m]['accuracy'] for m in model_names]

axes[1, 0].bar(model_names, accuracies, color='green', alpha=0.7)
axes[1, 0].set_title('Model Prediction Accuracy Comparison')
axes[1, 0].set_ylabel('Accuracy')
axes[1, 0].tick_params(axis='x', rotation=45)
axes[1, 0].grid(True, alpha=0.3, axis='y')

# Risk metric trends
periods = range(1, 11)
base_returns = stress_results['base']['returns']
rolling_vars = []

for period in periods:
    period_returns = base_returns[:period*100]  # Take data of different lengths
    var = risk_manager.calculate_var(period_returns)
    rolling_vars.append(var)

axes[1, 1].plot(periods, rolling_vars, 'b-o', linewidth=2, markersize=6)
axes[1, 1].set_title('VaR vs Sample Size')
axes[1, 1].set_xlabel('Sample Period (×100)')
axes[1, 1].set_ylabel('VaR')
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"\nRisk Management Recommendations:")
print(f"1. Establish multi-scenario stress testing framework")
print(f"2. Regularly assess model prediction accuracy")
print(f"3. Monitor time-varying characteristics of model risk")
print(f"4. Establish early warning mechanism for model failure")

Mathematical Formula Summary

  1. Model Selection Criteria:

    • $AIC = -2\ell + 2k$
    • $BIC = -2\ell + k\log T$
    • $HQ = -2\ell + 2k\log\log T$
  2. Prediction Evaluation Metrics:

    • $RMSE = \sqrt{\frac{1}{T}\sum_{t=1}^T (y_t - \hat{y}_t)^2}$
    • $MAE = \frac{1}{T}\sum_{t=1}^T |y_t - \hat{y}_t|$
    • $MAPE = \frac{1}{T}\sum_{t=1}^T \frac{|y_t - \hat{y}_t|}{|y_t|}$
  3. Risk Measures:

    • $VaR_\alpha = \inf\{x : P(L \leq x) \geq \alpha\}$
    • $ES_\alpha = E[L | L > VaR_\alpha]$
  4. Model Confidence Intervals:

    • $\hat{\theta} \pm z_{\alpha/2} \sqrt{\text{Var}(\hat{\theta})}$
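
Applied to a Markov chain, this interval can be formed for each estimated transition probability using the binomial standard error $\sqrt{\hat{p}_{ij}(1-\hat{p}_{ij})/n_i}$; the sketch below relies on that normal-approximation assumption, and transition_prob_confidence_intervals is an illustrative name.

import numpy as np
from scipy import stats

def transition_prob_confidence_intervals(transition_counts, alpha=0.05):
    """Normal-approximation confidence intervals for estimated transition probabilities."""
    counts = np.asarray(transition_counts, dtype=float)
    row_totals = counts.sum(axis=1, keepdims=True)   # n_i: observed visits to each state
    p_hat = counts / row_totals                      # estimated transition matrix
    se = np.sqrt(p_hat * (1 - p_hat) / row_totals)   # binomial standard error
    z = stats.norm.ppf(1 - alpha / 2)
    return p_hat - z * se, p_hat + z * se
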
Key Points in Risk Management
  • Model validation is an ongoing process requiring regular updates
  • Out-of-sample testing is more important than in-sample fitting
  • Stress testing should include extreme but plausible scenarios
  • Model risk needs to be quantified and managed
  • Regulatory requirements are constantly evolving; stay informed of current standards