""" Network Partition Detection and Recovery Handles network split detection and automatic recovery """ import asyncio import time from typing import Dict, List, Set, Optional, Tuple from dataclasses import dataclass from enum import Enum from .discovery import P2PDiscovery, PeerNode, NodeStatus from .health import PeerHealthMonitor, HealthStatus class PartitionState(Enum): HEALTHY = "healthy" PARTITIONED = "partitioned" RECOVERING = "recovering" ISOLATED = "isolated" @dataclass class PartitionInfo: partition_id: str nodes: Set[str] leader: Optional[str] size: int created_at: float last_seen: float class NetworkPartitionManager: """Manages network partition detection and recovery""" def __init__(self, discovery: P2PDiscovery, health_monitor: PeerHealthMonitor): self.discovery = discovery self.health_monitor = health_monitor self.current_state = PartitionState.HEALTHY self.partitions: Dict[str, PartitionInfo] = {} self.local_partition_id = None self.detection_interval = 30 # seconds self.recovery_timeout = 300 # 5 minutes self.max_partition_size = 0.4 # Max 40% of network in one partition self.running = False # Partition detection thresholds self.min_connected_nodes = 3 self.partition_detection_threshold = 0.3 # 30% of network unreachable async def start_partition_monitoring(self): """Start partition monitoring service""" self.running = True log_info("Starting network partition monitoring") while self.running: try: await self._detect_partitions() await self._handle_partitions() await asyncio.sleep(self.detection_interval) except Exception as e: log_error(f"Partition monitoring error: {e}") await asyncio.sleep(10) async def stop_partition_monitoring(self): """Stop partition monitoring service""" self.running = False log_info("Stopping network partition monitoring") async def _detect_partitions(self): """Detect network partitions""" current_peers = self.discovery.get_peer_list() total_nodes = len(current_peers) + 1 # +1 for local node # Check connectivity reachable_nodes = set() unreachable_nodes = set() for peer in current_peers: health = self.health_monitor.get_health_status(peer.node_id) if health and health.status == NodeStatus.ONLINE: reachable_nodes.add(peer.node_id) else: unreachable_nodes.add(peer.node_id) # Calculate partition metrics reachable_ratio = len(reachable_nodes) / total_nodes if total_nodes > 0 else 0 log_info(f"Network connectivity: {len(reachable_nodes)}/{total_nodes} reachable ({reachable_ratio:.2%})") # Detect partition if reachable_ratio < (1 - self.partition_detection_threshold): await self._handle_partition_detected(reachable_nodes, unreachable_nodes) else: await self._handle_partition_healed() async def _handle_partition_detected(self, reachable_nodes: Set[str], unreachable_nodes: Set[str]): """Handle detected network partition""" if self.current_state == PartitionState.HEALTHY: log_warn(f"Network partition detected! Reachable: {len(reachable_nodes)}, Unreachable: {len(unreachable_nodes)}") self.current_state = PartitionState.PARTITIONED # Create partition info partition_id = self._generate_partition_id(reachable_nodes) self.local_partition_id = partition_id self.partitions[partition_id] = PartitionInfo( partition_id=partition_id, nodes=reachable_nodes.copy(), leader=None, size=len(reachable_nodes), created_at=time.time(), last_seen=time.time() ) # Start recovery procedures asyncio.create_task(self._start_partition_recovery()) async def _handle_partition_healed(self): """Handle healed network partition""" if self.current_state in [PartitionState.PARTITIONED, PartitionState.RECOVERING]: log_info("Network partition healed!") self.current_state = PartitionState.HEALTHY # Clear partition info self.partitions.clear() self.local_partition_id = None async def _handle_partitions(self): """Handle active partitions""" if self.current_state == PartitionState.PARTITIONED: await self._maintain_partition() elif self.current_state == PartitionState.RECOVERING: await self._monitor_recovery() async def _maintain_partition(self): """Maintain operations during partition""" if not self.local_partition_id: return partition = self.partitions.get(self.local_partition_id) if not partition: return # Update partition info current_peers = set(peer.node_id for peer in self.discovery.get_peer_list()) partition.nodes = current_peers partition.last_seen = time.time() partition.size = len(current_peers) # Select leader if none exists if not partition.leader: partition.leader = self._select_partition_leader(current_peers) log_info(f"Selected partition leader: {partition.leader}") async def _start_partition_recovery(self): """Start partition recovery procedures""" log_info("Starting partition recovery procedures") recovery_tasks = [ asyncio.create_task(self._attempt_reconnection()), asyncio.create_task(self._bootstrap_from_known_nodes()), asyncio.create_task(self._coordinate_with_other_partitions()) ] try: await asyncio.gather(*recovery_tasks, return_exceptions=True) except Exception as e: log_error(f"Partition recovery error: {e}") async def _attempt_reconnection(self): """Attempt to reconnect to unreachable nodes""" if not self.local_partition_id: return partition = self.partitions[self.local_partition_id] # Try to reconnect to known unreachable nodes all_known_peers = self.discovery.peers.copy() for node_id, peer in all_known_peers.items(): if node_id not in partition.nodes: # Try to reconnect success = await self.discovery._connect_to_peer(peer.address, peer.port) if success: log_info(f"Reconnected to node {node_id} during partition recovery") async def _bootstrap_from_known_nodes(self): """Bootstrap network from known good nodes""" # Try to connect to bootstrap nodes for address, port in self.discovery.bootstrap_nodes: try: success = await self.discovery._connect_to_peer(address, port) if success: log_info(f"Bootstrap successful to {address}:{port}") break except Exception as e: log_debug(f"Bootstrap failed to {address}:{port}: {e}") async def _coordinate_with_other_partitions(self): """Coordinate with other partitions (if detectable)""" # In a real implementation, this would use partition detection protocols # For now, just log the attempt log_info("Attempting to coordinate with other partitions") async def _monitor_recovery(self): """Monitor partition recovery progress""" if not self.local_partition_id: return partition = self.partitions[self.local_partition_id] # Check if recovery is taking too long if time.time() - partition.created_at > self.recovery_timeout: log_warn("Partition recovery timeout, considering extended recovery strategies") await self._extended_recovery_strategies() async def _extended_recovery_strategies(self): """Implement extended recovery strategies""" # Try alternative discovery methods await self._alternative_discovery() # Consider network reconfiguration await self._network_reconfiguration() async def _alternative_discovery(self): """Try alternative peer discovery methods""" log_info("Trying alternative discovery methods") # Try DNS-based discovery await self._dns_discovery() # Try multicast discovery await self._multicast_discovery() async def _dns_discovery(self): """DNS-based peer discovery""" # In a real implementation, this would query DNS records log_debug("Attempting DNS-based discovery") async def _multicast_discovery(self): """Multicast-based peer discovery""" # In a real implementation, this would use multicast packets log_debug("Attempting multicast discovery") async def _network_reconfiguration(self): """Reconfigure network for partition resilience""" log_info("Reconfiguring network for partition resilience") # Increase connection retry intervals # Adjust topology for better fault tolerance # Enable alternative communication channels def _generate_partition_id(self, nodes: Set[str]) -> str: """Generate unique partition ID""" import hashlib sorted_nodes = sorted(nodes) content = "|".join(sorted_nodes) return hashlib.sha256(content.encode()).hexdigest()[:16] def _select_partition_leader(self, nodes: Set[str]) -> Optional[str]: """Select leader for partition""" if not nodes: return None # Select node with highest reputation best_node = None best_reputation = 0 for node_id in nodes: peer = self.discovery.peers.get(node_id) if peer and peer.reputation > best_reputation: best_reputation = peer.reputation best_node = node_id return best_node def get_partition_status(self) -> Dict: """Get current partition status""" return { 'state': self.current_state.value, 'local_partition_id': self.local_partition_id, 'partition_count': len(self.partitions), 'partitions': { pid: { 'size': info.size, 'leader': info.leader, 'created_at': info.created_at, 'last_seen': info.last_seen } for pid, info in self.partitions.items() } } def is_partitioned(self) -> bool: """Check if network is currently partitioned""" return self.current_state in [PartitionState.PARTITIONED, PartitionState.RECOVERING] def get_local_partition_size(self) -> int: """Get size of local partition""" if not self.local_partition_id: return 0 partition = self.partitions.get(self.local_partition_id) return partition.size if partition else 0 # Global partition manager partition_manager: Optional[NetworkPartitionManager] = None def get_partition_manager() -> Optional[NetworkPartitionManager]: """Get global partition manager""" return partition_manager def create_partition_manager(discovery: P2PDiscovery, health_monitor: PeerHealthMonitor) -> NetworkPartitionManager: """Create and set global partition manager""" global partition_manager partition_manager = NetworkPartitionManager(discovery, health_monitor) return partition_manager