How to implement error handling and retry mechanisms in MCP? - 面试题

Implementing error handling and retry mechanisms in MCP is crucial for ensuring system stability and reliability. Here are detailed implementation strategies:

Error Handling Strategies

1. Error Classification

Retryable Errors: Network timeouts, temporary service unavailability, rate limits, etc.
Non-retryable Errors: Parameter errors, insufficient permissions, resource not found, etc.
Business Errors: Business logic-related errors requiring special handling

2. Error Response Format

json
{
  "jsonrpc": "2.0",
  "id": "req-123",
  "error": {
    "code": -32000,
    "message": "Server error",
    "data": {
      "retryable": true,
      "retryAfter": 5,
      "details": "Database connection timeout"
    }
  }
}

3. Error Handling Implementation

python
from typing import Optional
import asyncio

class MCPErrorHandler:
    def __init__(self):
        self.retryable_codes = [
            -32000,  # Server error
            -32001,  # Timeout
            -32002   # Rate limit
        ]
    
    def is_retryable(self, error: dict) -> bool:
        """Determine if error is retryable"""
        error_code = error.get("code")
        return error_code in self.retryable_codes
    
    def get_retry_delay(self, error: dict) -> int:
        """Get retry delay time"""
        error_data = error.get("data", {})
        return error_data.get("retryAfter", 1)

Retry Mechanisms

4. Exponential Backoff Retry

python
import time
import random

async def exponential_backoff_retry(
    func,
    max_retries: int = 3,
    base_delay: float = 1.0,
    max_delay: float = 32.0
):
    """Exponential backoff retry mechanism"""
    last_exception = None
    
    for attempt in range(max_retries):
        try:
            return await func()
        except Exception as e:
            last_exception = e
            
            if attempt == max_retries - 1:
                raise
            
            # Calculate delay (with random jitter)
            delay = min(
                base_delay * (2 ** attempt) + random.uniform(0, 1),
                max_delay
            )
            
            await asyncio.sleep(delay)
    
    raise last_exception

5. Intelligent Retry Strategy

python
class RetryStrategy:
    def __init__(
        self,
        max_retries: int = 3,
        backoff_factor: float = 2.0,
        jitter: bool = True
    ):
        self.max_retries = max_retries
        self.backoff_factor = backoff_factor
        self.jitter = jitter
    
    async def execute_with_retry(
        self,
        func,
        is_retryable: Optional[callable] = None
    ):
        """Execute function with intelligent retry strategy"""
        for attempt in range(self.max_retries):
            try:
                return await func()
            except Exception as e:
                if attempt == self.max_retries - 1:
                    raise
                
                if is_retryable and not is_retryable(e):
                    raise
                
                delay = self._calculate_delay(attempt)
                await asyncio.sleep(delay)
    
    def _calculate_delay(self, attempt: int) -> float:
        """Calculate retry delay"""
        delay = self.backoff_factor ** attempt
        
        if self.jitter:
            delay += random.uniform(0, delay * 0.1)
        
        return delay

Circuit Breaker Pattern

6. Implementing Circuit Breaker

python
from enum import Enum
import time

class CircuitState(Enum):
    CLOSED = "closed"
    OPEN = "open"
    HALF_OPEN = "half_open"

class CircuitBreaker:
    def __init__(
        self,
        failure_threshold: int = 5,
        recovery_timeout: float = 60.0
    ):
        self.failure_threshold = failure_threshold
        self.recovery_timeout = recovery_timeout
        self.state = CircuitState.CLOSED
        self.failure_count = 0
        self.last_failure_time = None
    
    async def call(self, func):
        """Call function through circuit breaker"""
        if self.state == CircuitState.OPEN:
            if self._should_attempt_reset():
                self.state = CircuitState.HALF_OPEN
            else:
                raise Exception("Circuit breaker is OPEN")
        
        try:
            result = await func()
            self._on_success()
            return result
        except Exception as e:
            self._on_failure()
            raise
    
    def _should_attempt_reset(self) -> bool:
        """Determine if circuit breaker should attempt reset"""
        if self.last_failure_time is None:
            return False
        
        elapsed = time.time() - self.last_failure_time
        return elapsed >= self.recovery_timeout
    
    def _on_success(self):
        """Handle success"""
        self.failure_count = 0
        if self.state == CircuitState.HALF_OPEN:
            self.state = CircuitState.CLOSED
    
    def _on_failure(self):
        """Handle failure"""
        self.failure_count += 1
        self.last_failure_time = time.time()
        
        if self.failure_count >= self.failure_threshold:
            self.state = CircuitState.OPEN

Monitoring and Logging

7. Error Monitoring

python
class ErrorMonitor:
    def __init__(self):
        self.error_counts = {}
        self.error_rates = {}
    
    def record_error(self, error_type: str):
        """Record error"""
        self.error_counts[error_type] = \
            self.error_counts.get(error_type, 0) + 1
    
    def get_error_rate(self, error_type: str) -> float:
        """Get error rate"""
        total = sum(self.error_counts.values())
        if total == 0:
            return 0.0
        
        return self.error_counts.get(error_type, 0) / total

Best Practices

Classify Error Types: Correctly identify retryable and non-retryable errors
Set Reasonable Retry Parameters: Adjust retry count and delay based on business scenarios
Implement Circuit Breaker: Prevent cascading failures
Detailed Logging: Record all errors and retry information
Monitoring and Alerting: Monitor error rates in real-time and set up alerts
Graceful Degradation: Provide fallback solutions when services are unavailable

Through these strategies, you can build a robust MCP system that effectively handles various error scenarios.