✅ Phase 0: Pre-implementation checklist completed - Environment configurations (dev/staging/production) - Directory structure setup (logs, backups, monitoring) - Virtual environment with dependencies ✅ Master deployment script created - Single command deployment with validation - Progress tracking and rollback capability - Health checks and deployment reporting ✅ Validation script created - Module import validation - Basic functionality testing - Configuration and script verification ✅ Implementation fixes - Fixed dataclass import in consensus keys - Fixed async function syntax in tests - Updated deployment script for virtual environment 🚀 Ready for deployment: ./scripts/deploy-mesh-network.sh dev
290 lines
10 KiB
Python
290 lines
10 KiB
Python
"""
|
|
Peer Health Monitoring Service
|
|
Monitors peer liveness and performance metrics
|
|
"""
|
|
|
|
import asyncio
|
|
import time
|
|
import ping3
|
|
import statistics
|
|
from typing import Dict, List, Optional, Tuple
|
|
from dataclasses import dataclass
|
|
from enum import Enum
|
|
|
|
from .discovery import PeerNode, NodeStatus
|
|
|
|
class HealthMetric(Enum):
|
|
LATENCY = "latency"
|
|
AVAILABILITY = "availability"
|
|
THROUGHPUT = "throughput"
|
|
ERROR_RATE = "error_rate"
|
|
|
|
@dataclass
|
|
class HealthStatus:
|
|
node_id: str
|
|
status: NodeStatus
|
|
last_check: float
|
|
latency_ms: float
|
|
availability_percent: float
|
|
throughput_mbps: float
|
|
error_rate_percent: float
|
|
consecutive_failures: int
|
|
health_score: float
|
|
|
|
class PeerHealthMonitor:
|
|
"""Monitors health and performance of peer nodes"""
|
|
|
|
def __init__(self, check_interval: int = 60):
|
|
self.check_interval = check_interval
|
|
self.health_status: Dict[str, HealthStatus] = {}
|
|
self.running = False
|
|
self.latency_history: Dict[str, List[float]] = {}
|
|
self.max_history_size = 100
|
|
|
|
# Health thresholds
|
|
self.max_latency_ms = 1000
|
|
self.min_availability_percent = 90.0
|
|
self.min_health_score = 0.5
|
|
self.max_consecutive_failures = 3
|
|
|
|
async def start_monitoring(self, peers: Dict[str, PeerNode]):
|
|
"""Start health monitoring for peers"""
|
|
self.running = True
|
|
log_info("Starting peer health monitoring")
|
|
|
|
while self.running:
|
|
try:
|
|
await self._check_all_peers(peers)
|
|
await asyncio.sleep(self.check_interval)
|
|
except Exception as e:
|
|
log_error(f"Health monitoring error: {e}")
|
|
await asyncio.sleep(10)
|
|
|
|
async def stop_monitoring(self):
|
|
"""Stop health monitoring"""
|
|
self.running = False
|
|
log_info("Stopping peer health monitoring")
|
|
|
|
async def _check_all_peers(self, peers: Dict[str, PeerNode]):
|
|
"""Check health of all peers"""
|
|
tasks = []
|
|
|
|
for node_id, peer in peers.items():
|
|
if peer.status == NodeStatus.ONLINE:
|
|
task = asyncio.create_task(self._check_peer_health(peer))
|
|
tasks.append(task)
|
|
|
|
if tasks:
|
|
await asyncio.gather(*tasks, return_exceptions=True)
|
|
|
|
async def _check_peer_health(self, peer: PeerNode):
|
|
"""Check health of individual peer"""
|
|
start_time = time.time()
|
|
|
|
try:
|
|
# Check latency
|
|
latency = await self._measure_latency(peer.address, peer.port)
|
|
|
|
# Check availability
|
|
availability = await self._check_availability(peer)
|
|
|
|
# Check throughput
|
|
throughput = await self._measure_throughput(peer)
|
|
|
|
# Calculate health score
|
|
health_score = self._calculate_health_score(latency, availability, throughput)
|
|
|
|
# Update health status
|
|
self._update_health_status(peer, NodeStatus.ONLINE, latency, availability, throughput, 0.0, health_score)
|
|
|
|
# Reset consecutive failures
|
|
if peer.node_id in self.health_status:
|
|
self.health_status[peer.node_id].consecutive_failures = 0
|
|
|
|
except Exception as e:
|
|
log_error(f"Health check failed for peer {peer.node_id}: {e}")
|
|
|
|
# Handle failure
|
|
consecutive_failures = self.health_status.get(peer.node_id, HealthStatus(peer.node_id, NodeStatus.OFFLINE, 0, 0, 0, 0, 0, 0, 0.0)).consecutive_failures + 1
|
|
|
|
if consecutive_failures >= self.max_consecutive_failures:
|
|
self._update_health_status(peer, NodeStatus.OFFLINE, 0, 0, 0, 100.0, 0.0)
|
|
else:
|
|
self._update_health_status(peer, NodeStatus.ERROR, 0, 0, 0, 0.0, consecutive_failures, 0.0)
|
|
|
|
async def _measure_latency(self, address: str, port: int) -> float:
|
|
"""Measure network latency to peer"""
|
|
try:
|
|
# Use ping3 for basic latency measurement
|
|
latency = ping3.ping(address, timeout=2)
|
|
|
|
if latency is not None:
|
|
latency_ms = latency * 1000
|
|
|
|
# Update latency history
|
|
node_id = f"{address}:{port}"
|
|
if node_id not in self.latency_history:
|
|
self.latency_history[node_id] = []
|
|
|
|
self.latency_history[node_id].append(latency_ms)
|
|
|
|
# Limit history size
|
|
if len(self.latency_history[node_id]) > self.max_history_size:
|
|
self.latency_history[node_id].pop(0)
|
|
|
|
return latency_ms
|
|
else:
|
|
return float('inf')
|
|
|
|
except Exception as e:
|
|
log_debug(f"Latency measurement failed for {address}:{port}: {e}")
|
|
return float('inf')
|
|
|
|
async def _check_availability(self, peer: PeerNode) -> float:
|
|
"""Check peer availability by attempting connection"""
|
|
try:
|
|
start_time = time.time()
|
|
|
|
# Try to connect to peer
|
|
reader, writer = await asyncio.wait_for(
|
|
asyncio.open_connection(peer.address, peer.port),
|
|
timeout=5.0
|
|
)
|
|
|
|
connection_time = (time.time() - start_time) * 1000
|
|
|
|
writer.close()
|
|
await writer.wait_closed()
|
|
|
|
# Calculate availability based on recent history
|
|
node_id = peer.node_id
|
|
if node_id in self.health_status:
|
|
# Simple availability calculation based on success rate
|
|
recent_status = self.health_status[node_id]
|
|
if recent_status.status == NodeStatus.ONLINE:
|
|
return min(100.0, recent_status.availability_percent + 5.0)
|
|
else:
|
|
return max(0.0, recent_status.availability_percent - 10.0)
|
|
else:
|
|
return 100.0 # First successful connection
|
|
|
|
except Exception as e:
|
|
log_debug(f"Availability check failed for {peer.node_id}: {e}")
|
|
return 0.0
|
|
|
|
async def _measure_throughput(self, peer: PeerNode) -> float:
|
|
"""Measure network throughput to peer"""
|
|
try:
|
|
# Simple throughput test using small data transfer
|
|
test_data = b"x" * 1024 # 1KB test data
|
|
|
|
start_time = time.time()
|
|
|
|
reader, writer = await asyncio.open_connection(peer.address, peer.port)
|
|
|
|
# Send test data
|
|
writer.write(test_data)
|
|
await writer.drain()
|
|
|
|
# Wait for echo response (if peer supports it)
|
|
response = await asyncio.wait_for(reader.read(1024), timeout=2.0)
|
|
|
|
transfer_time = time.time() - start_time
|
|
|
|
writer.close()
|
|
await writer.wait_closed()
|
|
|
|
# Calculate throughput in Mbps
|
|
bytes_transferred = len(test_data) + len(response)
|
|
throughput_mbps = (bytes_transferred * 8) / (transfer_time * 1024 * 1024)
|
|
|
|
return throughput_mbps
|
|
|
|
except Exception as e:
|
|
log_debug(f"Throughput measurement failed for {peer.node_id}: {e}")
|
|
return 0.0
|
|
|
|
def _calculate_health_score(self, latency: float, availability: float, throughput: float) -> float:
|
|
"""Calculate overall health score"""
|
|
# Latency score (lower is better)
|
|
latency_score = max(0.0, 1.0 - (latency / self.max_latency_ms))
|
|
|
|
# Availability score
|
|
availability_score = availability / 100.0
|
|
|
|
# Throughput score (higher is better, normalized to 10 Mbps)
|
|
throughput_score = min(1.0, throughput / 10.0)
|
|
|
|
# Weighted average
|
|
health_score = (
|
|
latency_score * 0.3 +
|
|
availability_score * 0.4 +
|
|
throughput_score * 0.3
|
|
)
|
|
|
|
return health_score
|
|
|
|
def _update_health_status(self, peer: PeerNode, status: NodeStatus, latency: float,
|
|
availability: float, throughput: float, error_rate: float,
|
|
consecutive_failures: int = 0, health_score: float = 0.0):
|
|
"""Update health status for peer"""
|
|
self.health_status[peer.node_id] = HealthStatus(
|
|
node_id=peer.node_id,
|
|
status=status,
|
|
last_check=time.time(),
|
|
latency_ms=latency,
|
|
availability_percent=availability,
|
|
throughput_mbps=throughput,
|
|
error_rate_percent=error_rate,
|
|
consecutive_failures=consecutive_failures,
|
|
health_score=health_score
|
|
)
|
|
|
|
# Update peer status in discovery
|
|
peer.status = status
|
|
peer.last_seen = time.time()
|
|
|
|
def get_health_status(self, node_id: str) -> Optional[HealthStatus]:
|
|
"""Get health status for specific peer"""
|
|
return self.health_status.get(node_id)
|
|
|
|
def get_all_health_status(self) -> Dict[str, HealthStatus]:
|
|
"""Get health status for all peers"""
|
|
return self.health_status.copy()
|
|
|
|
def get_average_latency(self, node_id: str) -> Optional[float]:
|
|
"""Get average latency for peer"""
|
|
node_key = f"{self.health_status.get(node_id, HealthStatus('', NodeStatus.OFFLINE, 0, 0, 0, 0, 0, 0, 0.0)).node_id}"
|
|
|
|
if node_key in self.latency_history and self.latency_history[node_key]:
|
|
return statistics.mean(self.latency_history[node_key])
|
|
|
|
return None
|
|
|
|
def get_healthy_peers(self) -> List[str]:
|
|
"""Get list of healthy peers"""
|
|
return [
|
|
node_id for node_id, status in self.health_status.items()
|
|
if status.health_score >= self.min_health_score
|
|
]
|
|
|
|
def get_unhealthy_peers(self) -> List[str]:
|
|
"""Get list of unhealthy peers"""
|
|
return [
|
|
node_id for node_id, status in self.health_status.items()
|
|
if status.health_score < self.min_health_score
|
|
]
|
|
|
|
# Global health monitor
|
|
health_monitor: Optional[PeerHealthMonitor] = None
|
|
|
|
def get_health_monitor() -> Optional[PeerHealthMonitor]:
|
|
"""Get global health monitor"""
|
|
return health_monitor
|
|
|
|
def create_health_monitor(check_interval: int = 60) -> PeerHealthMonitor:
|
|
"""Create and set global health monitor"""
|
|
global health_monitor
|
|
health_monitor = PeerHealthMonitor(check_interval)
|
|
return health_monitor
|