Files
aitbc/apps/exchange/health_monitor.py
oib 15427c96c0 chore: update file permissions to executable across repository
- Change file mode from 644 to 755 for all project files
- Add chain_id parameter to get_balance RPC endpoint with default "ait-devnet"
- Rename Miner.extra_meta_data to extra_metadata for consistency
2026-03-06 22:17:54 +01:00

276 lines
10 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Exchange Health Monitoring and Failover System
Monitors exchange health and provides automatic failover capabilities
"""
import asyncio
import time
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Any
from dataclasses import dataclass
from enum import Enum
import logging
from real_exchange_integration import exchange_manager, ExchangeStatus, ExchangeHealth
# Setup logging
# NOTE: basicConfig is called at import time, so merely importing this module
# configures the root logger at INFO level (per the stdlib docs this does
# nothing if the root logger already has handlers).
logging.basicConfig(level=logging.INFO)
# Module-level logger shared by everything in this file.
logger = logging.getLogger(__name__)
class FailoverStrategy(str, Enum):
    """How the monitor reacts once an exchange crosses the failure threshold.

    Members are also ``str`` instances, so they compare equal to their
    plain-string values.
    """

    # Only log that an operator has to intervene.
    MANUAL = "manual"

    # Fail over to the first other exchange that is still considered healthy.
    AUTOMATIC = "automatic"

    # Fail over following the configured priority order.
    PRIORITY_BASED = "priority_based"
@dataclass
class FailoverConfig:
    """Settings that control health checking and failover.

    Attributes are documented inline; only ``strategy`` is required.
    """

    # Which failover behaviour to apply when an exchange keeps failing.
    strategy: FailoverStrategy
    # Pause between two rounds of health checks, in seconds.
    health_check_interval: int = 30
    # Number of failures at which an exchange triggers failover.
    max_failures: int = 3
    # Seconds between recovery checks.
    # NOTE(review): not referenced by the monitor code visible in this file.
    recovery_check_interval: int = 60
    # Exchange names in failover preference order. ``None`` (the default)
    # means "not configured"; the annotation is Optional to say so honestly.
    priority_order: Optional[List[str]] = None
class ExchangeHealthMonitor:
    """Monitors exchange health and manages failover.

    A background asyncio task polls ``exchange_manager`` for the health of
    every exchange, records the results, counts failures (checks whose status
    is ``ExchangeStatus.ERROR``) and triggers the configured failover strategy
    once an exchange reaches ``config.max_failures``.
    """

    # Maximum number of health results retained per exchange.
    MAX_HISTORY = 100

    def __init__(self, config: FailoverConfig):
        """Create an idle monitor; nothing runs until ``start_monitoring``.

        Args:
            config: Strategy, check interval and failure threshold to use.
        """
        self.config = config
        # Most recent health results per exchange (capped at MAX_HISTORY).
        self.health_history: Dict[str, List[ExchangeHealth]] = {}
        # Failures since the last non-ERROR check, per exchange.
        self.failure_counts: Dict[str, int] = {}
        # Exchanges currently regarded as usable.
        self.active_exchanges: List[str] = []
        self.monitoring_task = None
        self.is_monitoring = False

    async def start_monitoring(self):
        """Start the background monitoring task (no-op if already running)."""
        if self.is_monitoring:
            logger.warning("⚠️ Health monitoring already running")
            return

        self.is_monitoring = True
        self.monitoring_task = asyncio.create_task(self._monitor_loop())
        logger.info("🔍 Exchange health monitoring started")

    async def stop_monitoring(self):
        """Stop the background task and wait for it to finish."""
        self.is_monitoring = False
        if self.monitoring_task:
            self.monitoring_task.cancel()
            try:
                await self.monitoring_task
            except asyncio.CancelledError:
                # Cancellation is the expected way for the task to end.
                pass
        logger.info("🔍 Exchange health monitoring stopped")

    async def _monitor_loop(self):
        """Check all exchanges, sleep, repeat until monitoring is stopped."""
        while self.is_monitoring:
            try:
                await self._check_all_exchanges()
                await asyncio.sleep(self.config.health_check_interval)
            except asyncio.CancelledError:
                break
            except Exception as e:
                # Keep the loop alive; retry after a short back-off.
                logger.error(f"❌ Monitoring error: {e}")
                await asyncio.sleep(5)

    async def _check_all_exchanges(self):
        """Fetch the health of every exchange and process each result.

        Any exception is logged and swallowed so that one bad round does not
        stop monitoring.
        """
        try:
            health_status = await exchange_manager.get_all_health_status()
            for exchange_name, health in health_status.items():
                await self._process_health_check(exchange_name, health)
        except Exception as e:
            logger.error(f"❌ Health check failed: {e}")

    def _is_healthy(self, exchange_name: str) -> bool:
        """Return True while the exchange is below the failure threshold."""
        return self.failure_counts.get(exchange_name, 0) < self.config.max_failures

    def _deactivate(self, exchange_name: str) -> None:
        """Remove the exchange from ``active_exchanges`` if it is listed."""
        if exchange_name in self.active_exchanges:
            self.active_exchanges.remove(exchange_name)

    async def _process_health_check(self, exchange_name: str, health: ExchangeHealth):
        """Record one health result and update failure / active state.

        Args:
            exchange_name: Name of the exchange the result belongs to.
            health: Health result reported for that exchange.
        """
        # Store health history, keeping only the most recent results.
        history = self.health_history.setdefault(exchange_name, [])
        history.append(health)
        if len(history) > self.MAX_HISTORY:
            del history[:-self.MAX_HISTORY]

        if health.status == ExchangeStatus.ERROR:
            failures = self.failure_counts.get(exchange_name, 0) + 1
            self.failure_counts[exchange_name] = failures
            logger.warning(f"⚠️ {exchange_name} failure #{failures}: {health.error_message}")

            if failures >= self.config.max_failures:
                # An exchange over the threshold must not keep being reported
                # as active; it is re-added on its next non-ERROR check.
                self._deactivate(exchange_name)
                await self._trigger_failover(exchange_name)
            return

        # Non-ERROR check: reset the failure count and mark as active.
        previous_failures = self.failure_counts.get(exchange_name, 0)
        if previous_failures > 0:
            logger.info(f"{exchange_name} recovered after {previous_failures} failures")
        self.failure_counts[exchange_name] = 0

        if exchange_name not in self.active_exchanges:
            self.active_exchanges.append(exchange_name)

    async def _trigger_failover(self, failed_exchange: str):
        """Dispatch to the failover routine selected by ``config.strategy``.

        Args:
            failed_exchange: Name of the exchange that reached the threshold.
        """
        # .get() so a direct call for an unknown exchange cannot raise KeyError.
        failures = self.failure_counts.get(failed_exchange, 0)
        logger.error(f"🚨 FAILOVER TRIGGERED: {failed_exchange} failed {failures} times")

        if self.config.strategy == FailoverStrategy.AUTOMATIC:
            await self._automatic_failover(failed_exchange)
        elif self.config.strategy == FailoverStrategy.PRIORITY_BASED:
            await self._priority_based_failover(failed_exchange)
        else:
            logger.info(f"📝 Manual failover required for {failed_exchange}")

    async def _automatic_failover(self, failed_exchange: str):
        """Fail over to the first other exchange that is still healthy."""
        healthy_exchanges = [
            name
            for name in exchange_manager.exchanges.keys()
            if name != failed_exchange and self._is_healthy(name)
        ]

        if not healthy_exchanges:
            logger.error("❌ No healthy exchanges available for failover")
            return

        backup = healthy_exchanges[0]
        # The separator keeps the two exchange names from running together.
        logger.info(f"🔄 Automatic failover: {failed_exchange} → {backup}")
        await self._redirect_orders(failed_exchange, backup)

    async def _priority_based_failover(self, failed_exchange: str):
        """Fail over to the first healthy exchange in ``config.priority_order``.

        Falls back to automatic failover when no priority order is configured.
        """
        if not self.config.priority_order:
            logger.warning("⚠️ No priority order configured, falling back to automatic")
            await self._automatic_failover(failed_exchange)
            return

        # Find next healthy, known exchange in priority order.
        for exchange in self.config.priority_order:
            if exchange == failed_exchange:
                continue
            if exchange in exchange_manager.exchanges and self._is_healthy(exchange):
                logger.info(f"🔄 Priority-based failover: {failed_exchange} → {exchange}")
                await self._redirect_orders(failed_exchange, exchange)
                return

        logger.error("❌ No healthy exchanges available in priority order")

    async def _redirect_orders(self, from_exchange: str, to_exchange: str):
        """Redirect orders from the failed exchange to the backup.

        Placeholder: currently only logs the redirection; the real work would
        depend on the order tracking system.
        """
        logger.info(f"📦 Redirecting orders from {from_exchange} to {to_exchange}")

    def get_health_summary(self) -> Dict[str, Any]:
        """Return a snapshot of monitoring state and per-exchange statistics.

        Returns:
            A dict with ``monitoring_active``, copies of ``active_exchanges``
            and ``failure_counts``, an ``exchange_health`` mapping with one
            entry per exchange that has recorded history, and ``uptime_stats``
            (always empty at present; nothing populates it).
        """
        summary = {
            "monitoring_active": self.is_monitoring,
            "active_exchanges": self.active_exchanges.copy(),
            "failure_counts": self.failure_counts.copy(),
            "exchange_health": {},
            "uptime_stats": {}
        }

        for exchange_name, history in self.health_history.items():
            if not history:
                continue

            total_checks = len(history)
            # Only CONNECTED results count as successful checks.
            connected = [h for h in history if h.status == ExchangeStatus.CONNECTED]
            successful_checks = len(connected)
            uptime_pct = (successful_checks / total_checks) * 100
            if successful_checks > 0:
                avg_latency = sum(h.latency_ms for h in connected) / successful_checks
            else:
                avg_latency = 0

            latest = history[-1]
            summary["exchange_health"][exchange_name] = {
                "status": latest.status.value,
                "last_check": latest.last_check.strftime('%H:%M:%S'),
                "avg_latency_ms": round(avg_latency, 2),
                "total_checks": total_checks,
                "successful_checks": successful_checks,
                "uptime_percentage": round(uptime_pct, 2)
            }

        return summary

    def get_alerts(self) -> List[Dict[str, Any]]:
        """Return one alert per exchange that has a non-zero failure count.

        Exchanges at or above ``config.max_failures`` yield a ``critical``
        alert; any other positive count yields a ``warning``.
        """
        alerts = []
        for exchange_name, count in self.failure_counts.items():
            if count >= self.config.max_failures:
                level = "critical"
                message = f"Exchange has failed {count} times"
            elif count > 0:
                level = "warning"
                message = f"Exchange has {count} recent failures"
            else:
                continue

            alerts.append({
                "level": level,
                "exchange": exchange_name,
                "message": message,
                "timestamp": datetime.now()
            })
        return alerts
# Global instance
# Shared configuration and monitor used by the module-level helper functions.
# priority_order is only consulted by the PRIORITY_BASED strategy, so with
# AUTOMATIC selected here it has no effect unless the strategy is changed.
default_config = FailoverConfig(
    strategy=FailoverStrategy.AUTOMATIC,
    health_check_interval=30,
    max_failures=3,
    priority_order=["binance", "coinbasepro", "kraken"],
)

health_monitor = ExchangeHealthMonitor(default_config)
# CLI Functions
async def start_health_monitoring():
    """Begin background health checks on the shared module-level monitor."""
    await health_monitor.start_monitoring()
async def stop_health_monitoring():
    """Halt background health checks on the shared module-level monitor."""
    await health_monitor.stop_monitoring()
def get_health_summary():
    """Return the health summary produced by the shared module-level monitor."""
    summary = health_monitor.get_health_summary()
    return summary
def get_alerts():
    """Return the alerts currently raised by the shared module-level monitor."""
    current_alerts = health_monitor.get_alerts()
    return current_alerts
# Test function
async def test_health_monitoring():
    """Smoke-test the monitor: start it, wait, print a report, stop it."""
    print("🧪 Testing Health Monitoring System...")

    await start_health_monitoring()
    print("✅ Health monitoring started")

    # Let the background loop run for a few seconds before sampling state.
    await asyncio.sleep(5)

    report = get_health_summary()
    print(f"📊 Health Summary: {report}")

    active_alerts = get_alerts()
    alert_count = len(active_alerts)
    print(f"🚨 Alerts: {alert_count}")

    await stop_health_monitoring()
    print("🔍 Health monitoring stopped")
if __name__ == "__main__":
    # Running this file directly executes the smoke test on a new event loop.
    asyncio.run(test_health_monitoring())