feat: implement AITBC mesh network deployment infrastructure
✅ Phase 0: Pre-implementation checklist completed - Environment configurations (dev/staging/production) - Directory structure setup (logs, backups, monitoring) - Virtual environment with dependencies ✅ Master deployment script created - Single command deployment with validation - Progress tracking and rollback capability - Health checks and deployment reporting ✅ Validation script created - Module import validation - Basic functionality testing - Configuration and script verification ✅ Implementation fixes - Fixed dataclass import in consensus keys - Fixed async function syntax in tests - Updated deployment script for virtual environment 🚀 Ready for deployment: ./scripts/deploy-mesh-network.sh dev
This commit is contained in:
289
apps/blockchain-node/src/aitbc_chain/network/health.py
Normal file
289
apps/blockchain-node/src/aitbc_chain/network/health.py
Normal file
@@ -0,0 +1,289 @@
|
||||
"""
|
||||
Peer Health Monitoring Service
|
||||
Monitors peer liveness and performance metrics
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import time
|
||||
import ping3
|
||||
import statistics
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
|
||||
from .discovery import PeerNode, NodeStatus
|
||||
|
||||
class HealthMetric(Enum):
|
||||
LATENCY = "latency"
|
||||
AVAILABILITY = "availability"
|
||||
THROUGHPUT = "throughput"
|
||||
ERROR_RATE = "error_rate"
|
||||
|
||||
@dataclass
|
||||
class HealthStatus:
|
||||
node_id: str
|
||||
status: NodeStatus
|
||||
last_check: float
|
||||
latency_ms: float
|
||||
availability_percent: float
|
||||
throughput_mbps: float
|
||||
error_rate_percent: float
|
||||
consecutive_failures: int
|
||||
health_score: float
|
||||
|
||||
class PeerHealthMonitor:
|
||||
"""Monitors health and performance of peer nodes"""
|
||||
|
||||
def __init__(self, check_interval: int = 60):
|
||||
self.check_interval = check_interval
|
||||
self.health_status: Dict[str, HealthStatus] = {}
|
||||
self.running = False
|
||||
self.latency_history: Dict[str, List[float]] = {}
|
||||
self.max_history_size = 100
|
||||
|
||||
# Health thresholds
|
||||
self.max_latency_ms = 1000
|
||||
self.min_availability_percent = 90.0
|
||||
self.min_health_score = 0.5
|
||||
self.max_consecutive_failures = 3
|
||||
|
||||
async def start_monitoring(self, peers: Dict[str, PeerNode]):
|
||||
"""Start health monitoring for peers"""
|
||||
self.running = True
|
||||
log_info("Starting peer health monitoring")
|
||||
|
||||
while self.running:
|
||||
try:
|
||||
await self._check_all_peers(peers)
|
||||
await asyncio.sleep(self.check_interval)
|
||||
except Exception as e:
|
||||
log_error(f"Health monitoring error: {e}")
|
||||
await asyncio.sleep(10)
|
||||
|
||||
async def stop_monitoring(self):
|
||||
"""Stop health monitoring"""
|
||||
self.running = False
|
||||
log_info("Stopping peer health monitoring")
|
||||
|
||||
async def _check_all_peers(self, peers: Dict[str, PeerNode]):
|
||||
"""Check health of all peers"""
|
||||
tasks = []
|
||||
|
||||
for node_id, peer in peers.items():
|
||||
if peer.status == NodeStatus.ONLINE:
|
||||
task = asyncio.create_task(self._check_peer_health(peer))
|
||||
tasks.append(task)
|
||||
|
||||
if tasks:
|
||||
await asyncio.gather(*tasks, return_exceptions=True)
|
||||
|
||||
async def _check_peer_health(self, peer: PeerNode):
|
||||
"""Check health of individual peer"""
|
||||
start_time = time.time()
|
||||
|
||||
try:
|
||||
# Check latency
|
||||
latency = await self._measure_latency(peer.address, peer.port)
|
||||
|
||||
# Check availability
|
||||
availability = await self._check_availability(peer)
|
||||
|
||||
# Check throughput
|
||||
throughput = await self._measure_throughput(peer)
|
||||
|
||||
# Calculate health score
|
||||
health_score = self._calculate_health_score(latency, availability, throughput)
|
||||
|
||||
# Update health status
|
||||
self._update_health_status(peer, NodeStatus.ONLINE, latency, availability, throughput, 0.0, health_score)
|
||||
|
||||
# Reset consecutive failures
|
||||
if peer.node_id in self.health_status:
|
||||
self.health_status[peer.node_id].consecutive_failures = 0
|
||||
|
||||
except Exception as e:
|
||||
log_error(f"Health check failed for peer {peer.node_id}: {e}")
|
||||
|
||||
# Handle failure
|
||||
consecutive_failures = self.health_status.get(peer.node_id, HealthStatus(peer.node_id, NodeStatus.OFFLINE, 0, 0, 0, 0, 0, 0, 0.0)).consecutive_failures + 1
|
||||
|
||||
if consecutive_failures >= self.max_consecutive_failures:
|
||||
self._update_health_status(peer, NodeStatus.OFFLINE, 0, 0, 0, 100.0, 0.0)
|
||||
else:
|
||||
self._update_health_status(peer, NodeStatus.ERROR, 0, 0, 0, 0.0, consecutive_failures, 0.0)
|
||||
|
||||
async def _measure_latency(self, address: str, port: int) -> float:
|
||||
"""Measure network latency to peer"""
|
||||
try:
|
||||
# Use ping3 for basic latency measurement
|
||||
latency = ping3.ping(address, timeout=2)
|
||||
|
||||
if latency is not None:
|
||||
latency_ms = latency * 1000
|
||||
|
||||
# Update latency history
|
||||
node_id = f"{address}:{port}"
|
||||
if node_id not in self.latency_history:
|
||||
self.latency_history[node_id] = []
|
||||
|
||||
self.latency_history[node_id].append(latency_ms)
|
||||
|
||||
# Limit history size
|
||||
if len(self.latency_history[node_id]) > self.max_history_size:
|
||||
self.latency_history[node_id].pop(0)
|
||||
|
||||
return latency_ms
|
||||
else:
|
||||
return float('inf')
|
||||
|
||||
except Exception as e:
|
||||
log_debug(f"Latency measurement failed for {address}:{port}: {e}")
|
||||
return float('inf')
|
||||
|
||||
async def _check_availability(self, peer: PeerNode) -> float:
|
||||
"""Check peer availability by attempting connection"""
|
||||
try:
|
||||
start_time = time.time()
|
||||
|
||||
# Try to connect to peer
|
||||
reader, writer = await asyncio.wait_for(
|
||||
asyncio.open_connection(peer.address, peer.port),
|
||||
timeout=5.0
|
||||
)
|
||||
|
||||
connection_time = (time.time() - start_time) * 1000
|
||||
|
||||
writer.close()
|
||||
await writer.wait_closed()
|
||||
|
||||
# Calculate availability based on recent history
|
||||
node_id = peer.node_id
|
||||
if node_id in self.health_status:
|
||||
# Simple availability calculation based on success rate
|
||||
recent_status = self.health_status[node_id]
|
||||
if recent_status.status == NodeStatus.ONLINE:
|
||||
return min(100.0, recent_status.availability_percent + 5.0)
|
||||
else:
|
||||
return max(0.0, recent_status.availability_percent - 10.0)
|
||||
else:
|
||||
return 100.0 # First successful connection
|
||||
|
||||
except Exception as e:
|
||||
log_debug(f"Availability check failed for {peer.node_id}: {e}")
|
||||
return 0.0
|
||||
|
||||
async def _measure_throughput(self, peer: PeerNode) -> float:
|
||||
"""Measure network throughput to peer"""
|
||||
try:
|
||||
# Simple throughput test using small data transfer
|
||||
test_data = b"x" * 1024 # 1KB test data
|
||||
|
||||
start_time = time.time()
|
||||
|
||||
reader, writer = await asyncio.open_connection(peer.address, peer.port)
|
||||
|
||||
# Send test data
|
||||
writer.write(test_data)
|
||||
await writer.drain()
|
||||
|
||||
# Wait for echo response (if peer supports it)
|
||||
response = await asyncio.wait_for(reader.read(1024), timeout=2.0)
|
||||
|
||||
transfer_time = time.time() - start_time
|
||||
|
||||
writer.close()
|
||||
await writer.wait_closed()
|
||||
|
||||
# Calculate throughput in Mbps
|
||||
bytes_transferred = len(test_data) + len(response)
|
||||
throughput_mbps = (bytes_transferred * 8) / (transfer_time * 1024 * 1024)
|
||||
|
||||
return throughput_mbps
|
||||
|
||||
except Exception as e:
|
||||
log_debug(f"Throughput measurement failed for {peer.node_id}: {e}")
|
||||
return 0.0
|
||||
|
||||
def _calculate_health_score(self, latency: float, availability: float, throughput: float) -> float:
|
||||
"""Calculate overall health score"""
|
||||
# Latency score (lower is better)
|
||||
latency_score = max(0.0, 1.0 - (latency / self.max_latency_ms))
|
||||
|
||||
# Availability score
|
||||
availability_score = availability / 100.0
|
||||
|
||||
# Throughput score (higher is better, normalized to 10 Mbps)
|
||||
throughput_score = min(1.0, throughput / 10.0)
|
||||
|
||||
# Weighted average
|
||||
health_score = (
|
||||
latency_score * 0.3 +
|
||||
availability_score * 0.4 +
|
||||
throughput_score * 0.3
|
||||
)
|
||||
|
||||
return health_score
|
||||
|
||||
def _update_health_status(self, peer: PeerNode, status: NodeStatus, latency: float,
|
||||
availability: float, throughput: float, error_rate: float,
|
||||
consecutive_failures: int = 0, health_score: float = 0.0):
|
||||
"""Update health status for peer"""
|
||||
self.health_status[peer.node_id] = HealthStatus(
|
||||
node_id=peer.node_id,
|
||||
status=status,
|
||||
last_check=time.time(),
|
||||
latency_ms=latency,
|
||||
availability_percent=availability,
|
||||
throughput_mbps=throughput,
|
||||
error_rate_percent=error_rate,
|
||||
consecutive_failures=consecutive_failures,
|
||||
health_score=health_score
|
||||
)
|
||||
|
||||
# Update peer status in discovery
|
||||
peer.status = status
|
||||
peer.last_seen = time.time()
|
||||
|
||||
def get_health_status(self, node_id: str) -> Optional[HealthStatus]:
|
||||
"""Get health status for specific peer"""
|
||||
return self.health_status.get(node_id)
|
||||
|
||||
def get_all_health_status(self) -> Dict[str, HealthStatus]:
|
||||
"""Get health status for all peers"""
|
||||
return self.health_status.copy()
|
||||
|
||||
def get_average_latency(self, node_id: str) -> Optional[float]:
|
||||
"""Get average latency for peer"""
|
||||
node_key = f"{self.health_status.get(node_id, HealthStatus('', NodeStatus.OFFLINE, 0, 0, 0, 0, 0, 0, 0.0)).node_id}"
|
||||
|
||||
if node_key in self.latency_history and self.latency_history[node_key]:
|
||||
return statistics.mean(self.latency_history[node_key])
|
||||
|
||||
return None
|
||||
|
||||
def get_healthy_peers(self) -> List[str]:
|
||||
"""Get list of healthy peers"""
|
||||
return [
|
||||
node_id for node_id, status in self.health_status.items()
|
||||
if status.health_score >= self.min_health_score
|
||||
]
|
||||
|
||||
def get_unhealthy_peers(self) -> List[str]:
|
||||
"""Get list of unhealthy peers"""
|
||||
return [
|
||||
node_id for node_id, status in self.health_status.items()
|
||||
if status.health_score < self.min_health_score
|
||||
]
|
||||
|
||||
# Global health monitor
|
||||
health_monitor: Optional[PeerHealthMonitor] = None
|
||||
|
||||
def get_health_monitor() -> Optional[PeerHealthMonitor]:
|
||||
"""Get global health monitor"""
|
||||
return health_monitor
|
||||
|
||||
def create_health_monitor(check_interval: int = 60) -> PeerHealthMonitor:
|
||||
"""Create and set global health monitor"""
|
||||
global health_monitor
|
||||
health_monitor = PeerHealthMonitor(check_interval)
|
||||
return health_monitor
|
||||
Reference in New Issue
Block a user