#!/bin/bash # # Node Failover Simulation Script # Simulates node shutdown and verifies network continues operating # Uses RPC endpoints only, no SSH access (check logic only) # set -e SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)" # Node Configuration NODES=( "aitbc:10.1.223.93" "aitbc1:10.1.223.40" "aitbc2:10.1.223.98" ) RPC_PORT=8006 LOG_DIR="/var/log/aitbc" LOG_FILE="${LOG_DIR}/failover-simulation.log" # Colors for output RED='\033[0;31m' GREEN='\033[0;32m' YELLOW='\033[1;33m' NC='\033[0m' # Logging functions log() { local level="$1" shift local message="$@" local timestamp=$(date '+%Y-%m-%d %H:%M:%S') echo "[${timestamp}] [${level}] ${message}" | tee -a "${LOG_FILE}" } log_success() { log "SUCCESS" "$@" echo -e "${GREEN}$@${NC}" } log_error() { log "ERROR" "$@" echo -e "${RED}$@${NC}" } log_warning() { log "WARNING" "$@" echo -e "${YELLOW}$@${NC}" } # Check RPC endpoint health check_rpc_health() { local node_name="$1" local node_ip="$2" if curl -f -s --max-time 5 "http://${node_ip}:${RPC_PORT}/health" > /dev/null 2>&1; then log_success "RPC healthy on ${node_name}" return 0 else log_error "RPC unhealthy on ${node_name}" return 1 fi } # Simulate node shutdown (check logic only) simulate_node_shutdown() { local node_name="$1" local node_ip="$2" log "=== SIMULATING shutdown of ${node_name} ===" log "Note: This is a simulation - no actual service shutdown" log "Marking ${node_name} as unavailable in test logic" # In a real scenario, we would stop the service here # For simulation, we just mark it as unavailable in our logic return 0 } # Simulate node reconnection (check logic only) simulate_node_reconnection() { local node_name="$1" local node_ip="$2" log "=== SIMULATING reconnection of ${node_name} ===" log "Note: This is a simulation - no actual service restart" log "Marking ${node_name} as available in test logic" # Check if RPC is actually available if check_rpc_health "$node_name" "$node_ip"; then log_success "${node_name} reconnected (RPC available)" return 0 else log_error "${node_name} failed to reconnect (RPC unavailable)" return 1 fi } # Verify network continues with node down verify_network_continues() { local down_node="$1" log "=== Verifying network continues with ${down_node} down ===" local available_nodes=0 for node_config in "${NODES[@]}"; do IFS=':' read -r node_name node_ip <<< "$node_config" # Skip the simulated down node if [ "$node_name" = "$down_node" ]; then log "Skipping ${node_name} (simulated down)" continue fi if check_rpc_health "$node_name" "$node_ip"; then ((available_nodes++)) fi done log "Available nodes: ${available_nodes} / 3" if [ $available_nodes -ge 2 ]; then log_success "Network continues operating with ${available_nodes} nodes" return 0 else log_error "Network not operating with insufficient nodes (${available_nodes})" return 1 fi } # Verify consensus with reduced node count verify_consensus() { local down_node="$1" log "=== Verifying consensus with ${down_node} down ===" # Get block heights from available nodes local heights=() for node_config in "${NODES[@]}"; do IFS=':' read -r node_name node_ip <<< "$node_config" # Skip the simulated down node if [ "$node_name" = "$down_node" ]; then continue fi local height=$(curl -s --max-time 5 "http://${node_ip}:${RPC_PORT}/rpc/head" 2>/dev/null | grep -o '"height":[0-9]*' | grep -o '[0-9]*' || echo "0") if [ "$height" != "0" ]; then heights+=("${node_name}:${height}") log "Block height on ${node_name}: ${height}" fi done # Check if heights are consistent if [ ${#heights[@]} -lt 2 ]; then log_error "Not enough nodes to verify consensus" return 1 fi local first_height=$(echo "${heights[0]}" | cut -d':' -f2) local consistent=true for height_info in "${heights[@]}"; do local h=$(echo "$height_info" | cut -d':' -f2) if [ "$h" != "$first_height" ]; then consistent=false log_warning "Height mismatch: ${height_info}" fi done if [ "$consistent" = true ]; then log_success "Consensus verified (all nodes at height ${first_height})" return 0 else log_error "Consensus failed (heights inconsistent)" return 1 fi } # Measure recovery time (simulated) measure_recovery_time() { local node_name="$1" local node_ip="$2" log "=== Measuring recovery time for ${node_name} ===" local start=$(date +%s) # Simulate reconnection check if simulate_node_reconnection "$node_name" "$node_ip"; then local end=$(date +%s) local recovery_time=$((end - start)) log "Recovery time for ${node_name}: ${recovery_time}s" echo "${recovery_time}" return 0 else log_error "Recovery failed for ${node_name}" echo "failed" return 1 fi } # Main execution main() { log "=== Node Failover Simulation Started ===" # Create log directory if it doesn't exist mkdir -p "${LOG_DIR}" local total_failures=0 # Check initial network health log "=== Checking initial network health ===" for node_config in "${NODES[@]}"; do IFS=':' read -r node_name node_ip <<< "$node_config" if ! check_rpc_health "$node_name" "$node_ip"; then ((total_failures++)) fi done if [ ${total_failures} -gt 0 ]; then log_error "Initial network health check failed" exit 1 fi # Simulate shutdown of each node sequentially for node_config in "${NODES[@]}"; do IFS=':' read -r node_name node_ip <<< "$node_config" log "" log "=== Testing failover for ${node_name} ===" # Simulate shutdown simulate_node_shutdown "$node_name" "$node_ip" # Verify network continues if ! verify_network_continues "$node_name"; then log_error "Network failed to continue without ${node_name}" ((total_failures++)) fi # Verify consensus if ! verify_consensus "$node_name"; then log_error "Consensus failed without ${node_name}" ((total_failures++)) fi # Simulate reconnection local recovery_time=$(measure_recovery_time "$node_name" "$node_ip") if [ "$recovery_time" = "failed" ]; then log_error "Recovery failed for ${node_name}" ((total_failures++)) fi done log "=== Node Failover Simulation Completed ===" log "Total failures: ${total_failures}" if [ ${total_failures} -eq 0 ]; then log_success "Node Failover Simulation passed" exit 0 else log_error "Node Failover Simulation failed with ${total_failures} failures" exit 1 fi } # Run main function main "$@"