feat: add medium-priority multi-node blockchain testing workflows
- Add cross-node transaction testing workflow (manual dispatch) - Add node failover simulation workflow (manual dispatch, check logic only) - Add multi-node stress testing workflow (manual dispatch) - All workflows use only RPC endpoints (no SSH access) - All workflows run on manual dispatch only - No remediation steps (monitoring/testing only) - Cross-node transaction testing uses real transactions from test wallet - Failover simulation uses check logic only (no actual shutdown) - Stress testing generates real transactions with configurable count/rate - Comprehensive logging to /var/log/aitbc/ - Proper wallet creation and cleanup
This commit is contained in:
275
scripts/multi-node/failover-simulation.sh
Executable file
275
scripts/multi-node/failover-simulation.sh
Executable file
@@ -0,0 +1,275 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Node Failover Simulation Script
|
||||
# Simulates node shutdown and verifies network continues operating
|
||||
# Uses RPC endpoints only, no SSH access (check logic only)
|
||||
#
|
||||
|
||||
set -e
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
|
||||
|
||||
# Node Configuration
|
||||
NODES=(
|
||||
"aitbc:10.1.223.93"
|
||||
"aitbc1:10.1.223.40"
|
||||
"aitbc2:10.1.223.98"
|
||||
)
|
||||
|
||||
RPC_PORT=8006
|
||||
LOG_DIR="/var/log/aitbc"
|
||||
LOG_FILE="${LOG_DIR}/failover-simulation.log"
|
||||
|
||||
# Colors for output
|
||||
RED='\033[0;31m'
|
||||
GREEN='\033[0;32m'
|
||||
YELLOW='\033[1;33m'
|
||||
NC='\033[0m'
|
||||
|
||||
# Logging functions
|
||||
log() {
|
||||
local level="$1"
|
||||
shift
|
||||
local message="$@"
|
||||
local timestamp=$(date '+%Y-%m-%d %H:%M:%S')
|
||||
echo "[${timestamp}] [${level}] ${message}" | tee -a "${LOG_FILE}"
|
||||
}
|
||||
|
||||
log_success() {
|
||||
log "SUCCESS" "$@"
|
||||
echo -e "${GREEN}$@${NC}"
|
||||
}
|
||||
|
||||
log_error() {
|
||||
log "ERROR" "$@"
|
||||
echo -e "${RED}$@${NC}"
|
||||
}
|
||||
|
||||
log_warning() {
|
||||
log "WARNING" "$@"
|
||||
echo -e "${YELLOW}$@${NC}"
|
||||
}
|
||||
|
||||
# Check RPC endpoint health
|
||||
check_rpc_health() {
|
||||
local node_name="$1"
|
||||
local node_ip="$2"
|
||||
|
||||
if curl -f -s --max-time 5 "http://${node_ip}:${RPC_PORT}/health" > /dev/null 2>&1; then
|
||||
log_success "RPC healthy on ${node_name}"
|
||||
return 0
|
||||
else
|
||||
log_error "RPC unhealthy on ${node_name}"
|
||||
return 1
|
||||
fi
|
||||
}
|
||||
|
||||
# Simulate node shutdown (check logic only)
|
||||
simulate_node_shutdown() {
|
||||
local node_name="$1"
|
||||
local node_ip="$2"
|
||||
|
||||
log "=== SIMULATING shutdown of ${node_name} ==="
|
||||
log "Note: This is a simulation - no actual service shutdown"
|
||||
log "Marking ${node_name} as unavailable in test logic"
|
||||
|
||||
# In a real scenario, we would stop the service here
|
||||
# For simulation, we just mark it as unavailable in our logic
|
||||
return 0
|
||||
}
|
||||
|
||||
# Simulate node reconnection (check logic only)
|
||||
simulate_node_reconnection() {
|
||||
local node_name="$1"
|
||||
local node_ip="$2"
|
||||
|
||||
log "=== SIMULATING reconnection of ${node_name} ==="
|
||||
log "Note: This is a simulation - no actual service restart"
|
||||
log "Marking ${node_name} as available in test logic"
|
||||
|
||||
# Check if RPC is actually available
|
||||
if check_rpc_health "$node_name" "$node_ip"; then
|
||||
log_success "${node_name} reconnected (RPC available)"
|
||||
return 0
|
||||
else
|
||||
log_error "${node_name} failed to reconnect (RPC unavailable)"
|
||||
return 1
|
||||
fi
|
||||
}
|
||||
|
||||
# Verify network continues with node down
|
||||
verify_network_continues() {
|
||||
local down_node="$1"
|
||||
|
||||
log "=== Verifying network continues with ${down_node} down ==="
|
||||
|
||||
local available_nodes=0
|
||||
|
||||
for node_config in "${NODES[@]}"; do
|
||||
IFS=':' read -r node_name node_ip <<< "$node_config"
|
||||
|
||||
# Skip the simulated down node
|
||||
if [ "$node_name" = "$down_node" ]; then
|
||||
log "Skipping ${node_name} (simulated down)"
|
||||
continue
|
||||
fi
|
||||
|
||||
if check_rpc_health "$node_name" "$node_ip"; then
|
||||
((available_nodes++))
|
||||
fi
|
||||
done
|
||||
|
||||
log "Available nodes: ${available_nodes} / 3"
|
||||
|
||||
if [ $available_nodes -ge 2 ]; then
|
||||
log_success "Network continues operating with ${available_nodes} nodes"
|
||||
return 0
|
||||
else
|
||||
log_error "Network not operating with insufficient nodes (${available_nodes})"
|
||||
return 1
|
||||
fi
|
||||
}
|
||||
|
||||
# Verify consensus with reduced node count
|
||||
verify_consensus() {
|
||||
local down_node="$1"
|
||||
|
||||
log "=== Verifying consensus with ${down_node} down ==="
|
||||
|
||||
# Get block heights from available nodes
|
||||
local heights=()
|
||||
|
||||
for node_config in "${NODES[@]}"; do
|
||||
IFS=':' read -r node_name node_ip <<< "$node_config"
|
||||
|
||||
# Skip the simulated down node
|
||||
if [ "$node_name" = "$down_node" ]; then
|
||||
continue
|
||||
fi
|
||||
|
||||
local height=$(curl -s --max-time 5 "http://${node_ip}:${RPC_PORT}/rpc/head" 2>/dev/null | grep -o '"height":[0-9]*' | grep -o '[0-9]*' || echo "0")
|
||||
|
||||
if [ "$height" != "0" ]; then
|
||||
heights+=("${node_name}:${height}")
|
||||
log "Block height on ${node_name}: ${height}"
|
||||
fi
|
||||
done
|
||||
|
||||
# Check if heights are consistent
|
||||
if [ ${#heights[@]} -lt 2 ]; then
|
||||
log_error "Not enough nodes to verify consensus"
|
||||
return 1
|
||||
fi
|
||||
|
||||
local first_height=$(echo "${heights[0]}" | cut -d':' -f2)
|
||||
local consistent=true
|
||||
|
||||
for height_info in "${heights[@]}"; do
|
||||
local h=$(echo "$height_info" | cut -d':' -f2)
|
||||
if [ "$h" != "$first_height" ]; then
|
||||
consistent=false
|
||||
log_warning "Height mismatch: ${height_info}"
|
||||
fi
|
||||
done
|
||||
|
||||
if [ "$consistent" = true ]; then
|
||||
log_success "Consensus verified (all nodes at height ${first_height})"
|
||||
return 0
|
||||
else
|
||||
log_error "Consensus failed (heights inconsistent)"
|
||||
return 1
|
||||
fi
|
||||
}
|
||||
|
||||
# Measure recovery time (simulated)
|
||||
measure_recovery_time() {
|
||||
local node_name="$1"
|
||||
local node_ip="$2"
|
||||
|
||||
log "=== Measuring recovery time for ${node_name} ==="
|
||||
|
||||
local start=$(date +%s)
|
||||
|
||||
# Simulate reconnection check
|
||||
if simulate_node_reconnection "$node_name" "$node_ip"; then
|
||||
local end=$(date +%s)
|
||||
local recovery_time=$((end - start))
|
||||
log "Recovery time for ${node_name}: ${recovery_time}s"
|
||||
echo "${recovery_time}"
|
||||
return 0
|
||||
else
|
||||
log_error "Recovery failed for ${node_name}"
|
||||
echo "failed"
|
||||
return 1
|
||||
fi
|
||||
}
|
||||
|
||||
# Main execution
|
||||
main() {
|
||||
log "=== Node Failover Simulation Started ==="
|
||||
|
||||
# Create log directory if it doesn't exist
|
||||
mkdir -p "${LOG_DIR}"
|
||||
|
||||
local total_failures=0
|
||||
|
||||
# Check initial network health
|
||||
log "=== Checking initial network health ==="
|
||||
for node_config in "${NODES[@]}"; do
|
||||
IFS=':' read -r node_name node_ip <<< "$node_config"
|
||||
if ! check_rpc_health "$node_name" "$node_ip"; then
|
||||
((total_failures++))
|
||||
fi
|
||||
done
|
||||
|
||||
if [ ${total_failures} -gt 0 ]; then
|
||||
log_error "Initial network health check failed"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Simulate shutdown of each node sequentially
|
||||
for node_config in "${NODES[@]}"; do
|
||||
IFS=':' read -r node_name node_ip <<< "$node_config"
|
||||
|
||||
log ""
|
||||
log "=== Testing failover for ${node_name} ==="
|
||||
|
||||
# Simulate shutdown
|
||||
simulate_node_shutdown "$node_name" "$node_ip"
|
||||
|
||||
# Verify network continues
|
||||
if ! verify_network_continues "$node_name"; then
|
||||
log_error "Network failed to continue without ${node_name}"
|
||||
((total_failures++))
|
||||
fi
|
||||
|
||||
# Verify consensus
|
||||
if ! verify_consensus "$node_name"; then
|
||||
log_error "Consensus failed without ${node_name}"
|
||||
((total_failures++))
|
||||
fi
|
||||
|
||||
# Simulate reconnection
|
||||
local recovery_time=$(measure_recovery_time "$node_name" "$node_ip")
|
||||
|
||||
if [ "$recovery_time" = "failed" ]; then
|
||||
log_error "Recovery failed for ${node_name}"
|
||||
((total_failures++))
|
||||
fi
|
||||
done
|
||||
|
||||
log "=== Node Failover Simulation Completed ==="
|
||||
log "Total failures: ${total_failures}"
|
||||
|
||||
if [ ${total_failures} -eq 0 ]; then
|
||||
log_success "Node Failover Simulation passed"
|
||||
exit 0
|
||||
else
|
||||
log_error "Node Failover Simulation failed with ${total_failures} failures"
|
||||
exit 1
|
||||
fi
|
||||
}
|
||||
|
||||
# Run main function
|
||||
main "$@"
|
||||
Reference in New Issue
Block a user