aitbc/apps/blockchain-node/src/aitbc_chain/network/health.py

"""
Peer Health Monitoring Service
Monitors peer liveness and performance metrics
"""

import asyncio
import time
import ping3
import statistics
from typing import Dict, List, Optional, Tuple
from dataclasses import dataclass
from enum import Enum

from .discovery import PeerNode, NodeStatus

class HealthMetric(Enum):
    LATENCY = "latency"
    AVAILABILITY = "availability"
    THROUGHPUT = "throughput"
    ERROR_RATE = "error_rate"

@dataclass
class HealthStatus:
    node_id: str
    status: NodeStatus
    last_check: float
    latency_ms: float
    availability_percent: float
    throughput_mbps: float
    error_rate_percent: float
    consecutive_failures: int
    health_score: float

class PeerHealthMonitor:
    """Monitors health and performance of peer nodes"""

    def __init__(self, check_interval: int = 60):
        self.check_interval = check_interval
        self.health_status: Dict[str, HealthStatus] = {}
        self.running = False
        self.latency_history: Dict[str, List[float]] = {}
        self.max_history_size = 100

        # Health thresholds
        self.max_latency_ms = 1000
        self.min_availability_percent = 90.0
        self.min_health_score = 0.5
        self.max_consecutive_failures = 3

    async def start_monitoring(self, peers: Dict[str, PeerNode]):
        """Start health monitoring for peers"""
        self.running = True
        log_info("Starting peer health monitoring")

        while self.running:
            try:
                await self._check_all_peers(peers)
                await asyncio.sleep(self.check_interval)
            except Exception as e:
                log_error(f"Health monitoring error: {e}")
                await asyncio.sleep(10)

    async def stop_monitoring(self):
        """Stop health monitoring"""
        self.running = False
        log_info("Stopping peer health monitoring")

    async def _check_all_peers(self, peers: Dict[str, PeerNode]):
        """Check health of all peers"""
        tasks = []

        for node_id, peer in peers.items():
            if peer.status == NodeStatus.ONLINE:
                task = asyncio.create_task(self._check_peer_health(peer))
                tasks.append(task)

        if tasks:
            await asyncio.gather(*tasks, return_exceptions=True)

    async def _check_peer_health(self, peer: PeerNode):
        """Check health of individual peer"""
        start_time = time.time()

        try:
            # Check latency
            latency = await self._measure_latency(peer.address, peer.port)

            # Check availability
            availability = await self._check_availability(peer)

            # Check throughput
            throughput = await self._measure_throughput(peer)

            # Calculate health score
            health_score = self._calculate_health_score(latency, availability, throughput)

            # Update health status
            self._update_health_status(peer, NodeStatus.ONLINE, latency, availability, throughput, 0.0, health_score)

            # Reset consecutive failures
            if peer.node_id in self.health_status:
                self.health_status[peer.node_id].consecutive_failures = 0

        except Exception as e:
            log_error(f"Health check failed for peer {peer.node_id}: {e}")

            # Handle failure
            consecutive_failures = self.health_status.get(peer.node_id, HealthStatus(peer.node_id, NodeStatus.OFFLINE, 0, 0, 0, 0, 0, 0, 0.0)).consecutive_failures + 1

            if consecutive_failures >= self.max_consecutive_failures:
                self._update_health_status(peer, NodeStatus.OFFLINE, 0, 0, 0, 100.0, 0.0)
            else:
                self._update_health_status(peer, NodeStatus.ERROR, 0, 0, 0, 0.0, consecutive_failures, 0.0)

    async def _measure_latency(self, address: str, port: int) -> float:
        """Measure network latency to peer"""
        try:
            # Use ping3 for basic latency measurement
            latency = ping3.ping(address, timeout=2)

            if latency is not None:
                latency_ms = latency * 1000

                # Update latency history
                node_id = f"{address}:{port}"
                if node_id not in self.latency_history:
                    self.latency_history[node_id] = []

                self.latency_history[node_id].append(latency_ms)

                # Limit history size
                if len(self.latency_history[node_id]) > self.max_history_size:
                    self.latency_history[node_id].pop(0)

                return latency_ms
            else:
                return float('inf')

        except Exception as e:
            log_debug(f"Latency measurement failed for {address}:{port}: {e}")
            return float('inf')

    async def _check_availability(self, peer: PeerNode) -> float:
        """Check peer availability by attempting connection"""
        try:
            start_time = time.time()

            # Try to connect to peer
            reader, writer = await asyncio.wait_for(
                asyncio.open_connection(peer.address, peer.port),
                timeout=5.0
            )

            connection_time = (time.time() - start_time) * 1000

            writer.close()
            await writer.wait_closed()

            # Calculate availability based on recent history
            node_id = peer.node_id
            if node_id in self.health_status:
                # Simple availability calculation based on success rate
                recent_status = self.health_status[node_id]
                if recent_status.status == NodeStatus.ONLINE:
                    return min(100.0, recent_status.availability_percent + 5.0)
                else:
                    return max(0.0, recent_status.availability_percent - 10.0)
            else:
                return 100.0  # First successful connection

        except Exception as e:
            log_debug(f"Availability check failed for {peer.node_id}: {e}")
            return 0.0

    async def _measure_throughput(self, peer: PeerNode) -> float:
        """Measure network throughput to peer"""
        try:
            # Simple throughput test using small data transfer
            test_data = b"x" * 1024  # 1KB test data

            start_time = time.time()

            reader, writer = await asyncio.open_connection(peer.address, peer.port)

            # Send test data
            writer.write(test_data)
            await writer.drain()

            # Wait for echo response (if peer supports it)
            response = await asyncio.wait_for(reader.read(1024), timeout=2.0)

            transfer_time = time.time() - start_time

            writer.close()
            await writer.wait_closed()

            # Calculate throughput in Mbps
            bytes_transferred = len(test_data) + len(response)
            throughput_mbps = (bytes_transferred * 8) / (transfer_time * 1024 * 1024)

            return throughput_mbps

        except Exception as e:
            log_debug(f"Throughput measurement failed for {peer.node_id}: {e}")
            return 0.0

    def _calculate_health_score(self, latency: float, availability: float, throughput: float) -> float:
        """Calculate overall health score"""
        # Latency score (lower is better)
        latency_score = max(0.0, 1.0 - (latency / self.max_latency_ms))

        # Availability score
        availability_score = availability / 100.0

        # Throughput score (higher is better, normalized to 10 Mbps)
        throughput_score = min(1.0, throughput / 10.0)

        # Weighted average
        health_score = (
            latency_score * 0.3 +
            availability_score * 0.4 +
            throughput_score * 0.3
        )

        return health_score

    def _update_health_status(self, peer: PeerNode, status: NodeStatus, latency: float,
                            availability: float, throughput: float, error_rate: float,
                            consecutive_failures: int = 0, health_score: float = 0.0):
        """Update health status for peer"""
        self.health_status[peer.node_id] = HealthStatus(
            node_id=peer.node_id,
            status=status,
            last_check=time.time(),
            latency_ms=latency,
            availability_percent=availability,
            throughput_mbps=throughput,
            error_rate_percent=error_rate,
            consecutive_failures=consecutive_failures,
            health_score=health_score
        )

        # Update peer status in discovery
        peer.status = status
        peer.last_seen = time.time()

    def get_health_status(self, node_id: str) -> Optional[HealthStatus]:
        """Get health status for specific peer"""
        return self.health_status.get(node_id)

    def get_all_health_status(self) -> Dict[str, HealthStatus]:
        """Get health status for all peers"""
        return self.health_status.copy()

    def get_average_latency(self, node_id: str) -> Optional[float]:
        """Get average latency for peer"""
        node_key = f"{self.health_status.get(node_id, HealthStatus('', NodeStatus.OFFLINE, 0, 0, 0, 0, 0, 0, 0.0)).node_id}"

        if node_key in self.latency_history and self.latency_history[node_key]:
            return statistics.mean(self.latency_history[node_key])

        return None

    def get_healthy_peers(self) -> List[str]:
        """Get list of healthy peers"""
        return [
            node_id for node_id, status in self.health_status.items()
            if status.health_score >= self.min_health_score
        ]

    def get_unhealthy_peers(self) -> List[str]:
        """Get list of unhealthy peers"""
        return [
            node_id for node_id, status in self.health_status.items()
            if status.health_score < self.min_health_score
        ]

# Global health monitor
health_monitor: Optional[PeerHealthMonitor] = None

def get_health_monitor() -> Optional[PeerHealthMonitor]:
    """Get global health monitor"""
    return health_monitor

def create_health_monitor(check_interval: int = 60) -> PeerHealthMonitor:
    """Create and set global health monitor"""
    global health_monitor
    health_monitor = PeerHealthMonitor(check_interval)
    return health_monitor