Files
aitbc/apps/blockchain-node/src/aitbc_chain/network/health.py
aitbc c876b0aa20 feat: implement AITBC mesh network deployment infrastructure
 Phase 0: Pre-implementation checklist completed
- Environment configurations (dev/staging/production)
- Directory structure setup (logs, backups, monitoring)
- Virtual environment with dependencies

 Master deployment script created
- Single command deployment with validation
- Progress tracking and rollback capability
- Health checks and deployment reporting

 Validation script created
- Module import validation
- Basic functionality testing
- Configuration and script verification

 Implementation fixes
- Fixed dataclass import in consensus keys
- Fixed async function syntax in tests
- Updated deployment script for virtual environment

🚀 Ready for deployment: ./scripts/deploy-mesh-network.sh dev
2026-04-02 12:08:15 +02:00

290 lines
10 KiB
Python

"""
Peer Health Monitoring Service
Monitors peer liveness and performance metrics
"""
import asyncio
import time
import ping3
import statistics
from typing import Dict, List, Optional, Tuple
from dataclasses import dataclass
from enum import Enum
from .discovery import PeerNode, NodeStatus
class HealthMetric(Enum):
LATENCY = "latency"
AVAILABILITY = "availability"
THROUGHPUT = "throughput"
ERROR_RATE = "error_rate"
@dataclass
class HealthStatus:
node_id: str
status: NodeStatus
last_check: float
latency_ms: float
availability_percent: float
throughput_mbps: float
error_rate_percent: float
consecutive_failures: int
health_score: float
class PeerHealthMonitor:
"""Monitors health and performance of peer nodes"""
def __init__(self, check_interval: int = 60):
self.check_interval = check_interval
self.health_status: Dict[str, HealthStatus] = {}
self.running = False
self.latency_history: Dict[str, List[float]] = {}
self.max_history_size = 100
# Health thresholds
self.max_latency_ms = 1000
self.min_availability_percent = 90.0
self.min_health_score = 0.5
self.max_consecutive_failures = 3
async def start_monitoring(self, peers: Dict[str, PeerNode]):
"""Start health monitoring for peers"""
self.running = True
log_info("Starting peer health monitoring")
while self.running:
try:
await self._check_all_peers(peers)
await asyncio.sleep(self.check_interval)
except Exception as e:
log_error(f"Health monitoring error: {e}")
await asyncio.sleep(10)
async def stop_monitoring(self):
"""Stop health monitoring"""
self.running = False
log_info("Stopping peer health monitoring")
async def _check_all_peers(self, peers: Dict[str, PeerNode]):
"""Check health of all peers"""
tasks = []
for node_id, peer in peers.items():
if peer.status == NodeStatus.ONLINE:
task = asyncio.create_task(self._check_peer_health(peer))
tasks.append(task)
if tasks:
await asyncio.gather(*tasks, return_exceptions=True)
async def _check_peer_health(self, peer: PeerNode):
"""Check health of individual peer"""
start_time = time.time()
try:
# Check latency
latency = await self._measure_latency(peer.address, peer.port)
# Check availability
availability = await self._check_availability(peer)
# Check throughput
throughput = await self._measure_throughput(peer)
# Calculate health score
health_score = self._calculate_health_score(latency, availability, throughput)
# Update health status
self._update_health_status(peer, NodeStatus.ONLINE, latency, availability, throughput, 0.0, health_score)
# Reset consecutive failures
if peer.node_id in self.health_status:
self.health_status[peer.node_id].consecutive_failures = 0
except Exception as e:
log_error(f"Health check failed for peer {peer.node_id}: {e}")
# Handle failure
consecutive_failures = self.health_status.get(peer.node_id, HealthStatus(peer.node_id, NodeStatus.OFFLINE, 0, 0, 0, 0, 0, 0, 0.0)).consecutive_failures + 1
if consecutive_failures >= self.max_consecutive_failures:
self._update_health_status(peer, NodeStatus.OFFLINE, 0, 0, 0, 100.0, 0.0)
else:
self._update_health_status(peer, NodeStatus.ERROR, 0, 0, 0, 0.0, consecutive_failures, 0.0)
async def _measure_latency(self, address: str, port: int) -> float:
"""Measure network latency to peer"""
try:
# Use ping3 for basic latency measurement
latency = ping3.ping(address, timeout=2)
if latency is not None:
latency_ms = latency * 1000
# Update latency history
node_id = f"{address}:{port}"
if node_id not in self.latency_history:
self.latency_history[node_id] = []
self.latency_history[node_id].append(latency_ms)
# Limit history size
if len(self.latency_history[node_id]) > self.max_history_size:
self.latency_history[node_id].pop(0)
return latency_ms
else:
return float('inf')
except Exception as e:
log_debug(f"Latency measurement failed for {address}:{port}: {e}")
return float('inf')
async def _check_availability(self, peer: PeerNode) -> float:
"""Check peer availability by attempting connection"""
try:
start_time = time.time()
# Try to connect to peer
reader, writer = await asyncio.wait_for(
asyncio.open_connection(peer.address, peer.port),
timeout=5.0
)
connection_time = (time.time() - start_time) * 1000
writer.close()
await writer.wait_closed()
# Calculate availability based on recent history
node_id = peer.node_id
if node_id in self.health_status:
# Simple availability calculation based on success rate
recent_status = self.health_status[node_id]
if recent_status.status == NodeStatus.ONLINE:
return min(100.0, recent_status.availability_percent + 5.0)
else:
return max(0.0, recent_status.availability_percent - 10.0)
else:
return 100.0 # First successful connection
except Exception as e:
log_debug(f"Availability check failed for {peer.node_id}: {e}")
return 0.0
async def _measure_throughput(self, peer: PeerNode) -> float:
"""Measure network throughput to peer"""
try:
# Simple throughput test using small data transfer
test_data = b"x" * 1024 # 1KB test data
start_time = time.time()
reader, writer = await asyncio.open_connection(peer.address, peer.port)
# Send test data
writer.write(test_data)
await writer.drain()
# Wait for echo response (if peer supports it)
response = await asyncio.wait_for(reader.read(1024), timeout=2.0)
transfer_time = time.time() - start_time
writer.close()
await writer.wait_closed()
# Calculate throughput in Mbps
bytes_transferred = len(test_data) + len(response)
throughput_mbps = (bytes_transferred * 8) / (transfer_time * 1024 * 1024)
return throughput_mbps
except Exception as e:
log_debug(f"Throughput measurement failed for {peer.node_id}: {e}")
return 0.0
def _calculate_health_score(self, latency: float, availability: float, throughput: float) -> float:
"""Calculate overall health score"""
# Latency score (lower is better)
latency_score = max(0.0, 1.0 - (latency / self.max_latency_ms))
# Availability score
availability_score = availability / 100.0
# Throughput score (higher is better, normalized to 10 Mbps)
throughput_score = min(1.0, throughput / 10.0)
# Weighted average
health_score = (
latency_score * 0.3 +
availability_score * 0.4 +
throughput_score * 0.3
)
return health_score
def _update_health_status(self, peer: PeerNode, status: NodeStatus, latency: float,
availability: float, throughput: float, error_rate: float,
consecutive_failures: int = 0, health_score: float = 0.0):
"""Update health status for peer"""
self.health_status[peer.node_id] = HealthStatus(
node_id=peer.node_id,
status=status,
last_check=time.time(),
latency_ms=latency,
availability_percent=availability,
throughput_mbps=throughput,
error_rate_percent=error_rate,
consecutive_failures=consecutive_failures,
health_score=health_score
)
# Update peer status in discovery
peer.status = status
peer.last_seen = time.time()
def get_health_status(self, node_id: str) -> Optional[HealthStatus]:
"""Get health status for specific peer"""
return self.health_status.get(node_id)
def get_all_health_status(self) -> Dict[str, HealthStatus]:
"""Get health status for all peers"""
return self.health_status.copy()
def get_average_latency(self, node_id: str) -> Optional[float]:
"""Get average latency for peer"""
node_key = f"{self.health_status.get(node_id, HealthStatus('', NodeStatus.OFFLINE, 0, 0, 0, 0, 0, 0, 0.0)).node_id}"
if node_key in self.latency_history and self.latency_history[node_key]:
return statistics.mean(self.latency_history[node_key])
return None
def get_healthy_peers(self) -> List[str]:
"""Get list of healthy peers"""
return [
node_id for node_id, status in self.health_status.items()
if status.health_score >= self.min_health_score
]
def get_unhealthy_peers(self) -> List[str]:
"""Get list of unhealthy peers"""
return [
node_id for node_id, status in self.health_status.items()
if status.health_score < self.min_health_score
]
# Global health monitor
health_monitor: Optional[PeerHealthMonitor] = None
def get_health_monitor() -> Optional[PeerHealthMonitor]:
"""Get global health monitor"""
return health_monitor
def create_health_monitor(check_interval: int = 60) -> PeerHealthMonitor:
"""Create and set global health monitor"""
global health_monitor
health_monitor = PeerHealthMonitor(check_interval)
return health_monitor