From c5525d7345a8526ad646e4550eeb72a0a83f72e6 Mon Sep 17 00:00:00 2001 From: aitbc Date: Mon, 20 Apr 2026 20:22:28 +0200 Subject: [PATCH] feat: add multi-node blockchain monitoring workflows for 3-node network - Create multi-node blockchain health monitoring workflow - Create P2P network verification workflow for all 3 nodes - Create blockchain synchronization verification workflow - Update blockchain-communication-test.sh to include aitbc2 (gitea-runner) - Add shared scripts directory with health check, P2P verification, and sync verification scripts - All workflows trigger on git push to main/develop branches - Workflows run on gitea-runner (has SSH access to all nodes) - Include automatic remediation for failed services and sync issues - Sync threshold set to 10 blocks - Logging to /var/log/aitbc/ and alerts in Gitea UI --- .../blockchain-sync-verification.yml | 67 ++++ .gitea/workflows/multi-node-health.yml | 67 ++++ .gitea/workflows/p2p-network-verification.yml | 67 ++++ scripts/blockchain-communication-test.sh | 91 +++-- scripts/multi-node/blockchain-health-check.sh | 288 +++++++++++++++ scripts/multi-node/p2p-verification.sh | 287 +++++++++++++++ scripts/multi-node/sync-verification.sh | 348 ++++++++++++++++++ 7 files changed, 1192 insertions(+), 23 deletions(-) create mode 100644 .gitea/workflows/blockchain-sync-verification.yml create mode 100644 .gitea/workflows/multi-node-health.yml create mode 100644 .gitea/workflows/p2p-network-verification.yml create mode 100755 scripts/multi-node/blockchain-health-check.sh create mode 100755 scripts/multi-node/p2p-verification.sh create mode 100755 scripts/multi-node/sync-verification.sh diff --git a/.gitea/workflows/blockchain-sync-verification.yml b/.gitea/workflows/blockchain-sync-verification.yml new file mode 100644 index 00000000..3ba37204 --- /dev/null +++ b/.gitea/workflows/blockchain-sync-verification.yml @@ -0,0 +1,67 @@ +name: Blockchain Synchronization Verification + +on: + push: + branches: [main, 
develop] + paths: + - 'apps/blockchain-node/**' + - 'scripts/multi-node/**' + - '.gitea/workflows/blockchain-sync-verification.yml' + pull_request: + branches: [main, develop] + workflow_dispatch: + schedule: + - cron: '0 */6 * * *' # Every 6 hours + +concurrency: + group: blockchain-sync-verification-${{ github.ref }} + cancel-in-progress: true + +jobs: + sync-verification: + runs-on: debian + timeout-minutes: 20 + + steps: + - name: Clone repository + run: | + WORKSPACE="/var/lib/aitbc-workspaces/blockchain-sync-verification" + rm -rf "$WORKSPACE" + mkdir -p "$WORKSPACE" + cd "$WORKSPACE" + git clone --depth 1 http://gitea.bubuit.net:3000/oib/aitbc.git repo + + - name: Initialize job logging + run: | + cd /var/lib/aitbc-workspaces/blockchain-sync-verification/repo + bash scripts/ci/setup-job-logging.sh + + - name: Setup Python environment + run: | + cd /var/lib/aitbc-workspaces/blockchain-sync-verification/repo + + # Remove any existing venv to avoid cache corruption issues + rm -rf venv + + bash scripts/ci/setup-python-venv.sh \ + --repo-dir "$PWD" \ + --venv-dir "$PWD/venv" \ + --skip-requirements \ + --extra-packages "requests psutil" + + - name: Run blockchain synchronization verification + run: | + cd /var/lib/aitbc-workspaces/blockchain-sync-verification/repo + bash scripts/multi-node/sync-verification.sh + + - name: Sync verification report + if: always() + run: | + echo "=== Blockchain Synchronization Verification Report ===" + if [ -f /var/log/aitbc/sync-verification.log ]; then + tail -50 /var/log/aitbc/sync-verification.log + fi + + - name: Cleanup + if: always() + run: rm -rf /var/lib/aitbc-workspaces/blockchain-sync-verification diff --git a/.gitea/workflows/multi-node-health.yml b/.gitea/workflows/multi-node-health.yml new file mode 100644 index 00000000..0dbc91c3 --- /dev/null +++ b/.gitea/workflows/multi-node-health.yml @@ -0,0 +1,67 @@ +name: Multi-Node Blockchain Health Monitoring + +on: + push: + branches: [main, develop] + paths: + - 
'apps/blockchain-node/**' + - 'scripts/multi-node/**' + - '.gitea/workflows/multi-node-health.yml' + pull_request: + branches: [main, develop] + workflow_dispatch: + schedule: + - cron: '0 */2 * * *' # Every 2 hours + +concurrency: + group: multi-node-health-${{ github.ref }} + cancel-in-progress: true + +jobs: + health-check: + runs-on: debian + timeout-minutes: 15 + + steps: + - name: Clone repository + run: | + WORKSPACE="/var/lib/aitbc-workspaces/multi-node-health" + rm -rf "$WORKSPACE" + mkdir -p "$WORKSPACE" + cd "$WORKSPACE" + git clone --depth 1 http://gitea.bubuit.net:3000/oib/aitbc.git repo + + - name: Initialize job logging + run: | + cd /var/lib/aitbc-workspaces/multi-node-health/repo + bash scripts/ci/setup-job-logging.sh + + - name: Setup Python environment + run: | + cd /var/lib/aitbc-workspaces/multi-node-health/repo + + # Remove any existing venv to avoid cache corruption issues + rm -rf venv + + bash scripts/ci/setup-python-venv.sh \ + --repo-dir "$PWD" \ + --venv-dir "$PWD/venv" \ + --skip-requirements \ + --extra-packages "requests psutil" + + - name: Run multi-node health check + run: | + cd /var/lib/aitbc-workspaces/multi-node-health/repo + bash scripts/multi-node/blockchain-health-check.sh + + - name: Health check report + if: always() + run: | + echo "=== Multi-Node Health Check Report ===" + if [ -f /var/log/aitbc/multi-node-health.log ]; then + tail -50 /var/log/aitbc/multi-node-health.log + fi + + - name: Cleanup + if: always() + run: rm -rf /var/lib/aitbc-workspaces/multi-node-health diff --git a/.gitea/workflows/p2p-network-verification.yml b/.gitea/workflows/p2p-network-verification.yml new file mode 100644 index 00000000..7616a386 --- /dev/null +++ b/.gitea/workflows/p2p-network-verification.yml @@ -0,0 +1,67 @@ +name: P2P Network Verification + +on: + push: + branches: [main, develop] + paths: + - 'apps/blockchain-node/**' + - 'scripts/multi-node/**' + - '.gitea/workflows/p2p-network-verification.yml' + pull_request: + branches: 
[main, develop] + workflow_dispatch: + schedule: + - cron: '0 */4 * * *' # Every 4 hours + +concurrency: + group: p2p-network-verification-${{ github.ref }} + cancel-in-progress: true + +jobs: + p2p-verification: + runs-on: debian + timeout-minutes: 15 + + steps: + - name: Clone repository + run: | + WORKSPACE="/var/lib/aitbc-workspaces/p2p-network-verification" + rm -rf "$WORKSPACE" + mkdir -p "$WORKSPACE" + cd "$WORKSPACE" + git clone --depth 1 http://gitea.bubuit.net:3000/oib/aitbc.git repo + + - name: Initialize job logging + run: | + cd /var/lib/aitbc-workspaces/p2p-network-verification/repo + bash scripts/ci/setup-job-logging.sh + + - name: Setup Python environment + run: | + cd /var/lib/aitbc-workspaces/p2p-network-verification/repo + + # Remove any existing venv to avoid cache corruption issues + rm -rf venv + + bash scripts/ci/setup-python-venv.sh \ + --repo-dir "$PWD" \ + --venv-dir "$PWD/venv" \ + --skip-requirements \ + --extra-packages "requests psutil" + + - name: Run P2P network verification + run: | + cd /var/lib/aitbc-workspaces/p2p-network-verification/repo + bash scripts/multi-node/p2p-verification.sh + + - name: P2P verification report + if: always() + run: | + echo "=== P2P Network Verification Report ===" + if [ -f /var/log/aitbc/p2p-verification.log ]; then + tail -50 /var/log/aitbc/p2p-verification.log + fi + + - name: Cleanup + if: always() + run: rm -rf /var/lib/aitbc-workspaces/p2p-network-verification diff --git a/scripts/blockchain-communication-test.sh b/scripts/blockchain-communication-test.sh index 9159ee6e..289035e0 100755 --- a/scripts/blockchain-communication-test.sh +++ b/scripts/blockchain-communication-test.sh @@ -1,8 +1,8 @@ #!/bin/bash # # Blockchain Communication Test Script -# Tests communication between aitbc (genesis) and aitbc1 (follower) nodes -# Both nodes run on port 8006 on different physical machines +# Tests communication between aitbc (genesis), aitbc1 (follower), and aitbc2 (gitea-runner) nodes +# All nodes run 
on port 8006 on different physical machines # set -e @@ -11,8 +11,9 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" # Configuration -GENESIS_IP="10.1.223.40" -FOLLOWER_IP="" # Replace with actual IP +GENESIS_IP="10.1.223.93" +FOLLOWER_IP="10.1.223.40" +FOLLOWER2_IP="10.1.223.98" # gitea-runner/aitbc2 PORT=8006 CLI_PATH="${CLI_PATH:-${REPO_ROOT}/aitbc-cli}" LOG_DIR="/var/log/aitbc" @@ -114,7 +115,7 @@ test_connectivity() { return 1 fi - # Test follower node + # Test follower node (aitbc1) log_debug "Testing follower node at ${FOLLOWER_IP}:${PORT}" if curl -f -s "http://${FOLLOWER_IP}:${PORT}/health" > /dev/null; then log_success "Follower node (aitbc1) is reachable" @@ -123,12 +124,27 @@ test_connectivity() { return 1 fi + # Test follower node (aitbc2/gitea-runner) + log_debug "Testing follower node (aitbc2/gitea-runner) at ${FOLLOWER2_IP}:${PORT}" + if curl -f -s "http://${FOLLOWER2_IP}:${PORT}/health" > /dev/null; then + log_success "Follower node (aitbc2/gitea-runner) is reachable" + else + log_error "Follower node (aitbc2/gitea-runner) is NOT reachable" + return 1 + fi + # Test P2P connectivity log_debug "Testing P2P connectivity" if ${CLI_PATH} network ping --node aitbc1 --host ${FOLLOWER_IP} --port ${PORT} --debug > /dev/null 2>&1; then - log_success "P2P connectivity between nodes is working" + log_success "P2P connectivity to aitbc1 is working" else - log_warning "P2P connectivity test failed (may not be critical)" + log_warning "P2P connectivity to aitbc1 test failed (may not be critical)" + fi + + if ${CLI_PATH} network ping --node aitbc2 --host ${FOLLOWER2_IP} --port ${PORT} --debug > /dev/null 2>&1; then + log_success "P2P connectivity to aitbc2 is working" + else + log_warning "P2P connectivity to aitbc2 test failed (may not be critical)" fi # Check peers @@ -146,23 +162,38 @@ test_blockchain_status() { GENESIS_HEIGHT=$(NODE_URL="http://${GENESIS_IP}:${PORT}" ${CLI_PATH} blockchain height 
--output json 2>/dev/null | grep -o '"height":[0-9]*' | grep -o '[0-9]*' || echo "0") log_info "Genesis node block height: ${GENESIS_HEIGHT}" - # Get follower node status - log_debug "Getting follower node blockchain info" + # Get follower node (aitbc1) status + log_debug "Getting follower node (aitbc1) blockchain info" FOLLOWER_HEIGHT=$(NODE_URL="http://${FOLLOWER_IP}:${PORT}" ${CLI_PATH} blockchain height --output json 2>/dev/null | grep -o '"height":[0-9]*' | grep -o '[0-9]*' || echo "0") - log_info "Follower node block height: ${FOLLOWER_HEIGHT}" + log_info "Follower node (aitbc1) block height: ${FOLLOWER_HEIGHT}" + + # Get follower node (aitbc2/gitea-runner) status + log_debug "Getting follower node (aitbc2/gitea-runner) blockchain info" + FOLLOWER2_HEIGHT=$(NODE_URL="http://${FOLLOWER2_IP}:${PORT}" ${CLI_PATH} blockchain height --output json 2>/dev/null | grep -o '"height":[0-9]*' | grep -o '[0-9]*' || echo "0") + log_info "Follower node (aitbc2/gitea-runner) block height: ${FOLLOWER2_HEIGHT}" # Compare heights - HEIGHT_DIFF=$((GENESIS_HEIGHT - FOLLOWER_HEIGHT)) - HEIGHT_DIFF=${HEIGHT_DIFF#-} # Absolute value + HEIGHT_DIFF1=$((GENESIS_HEIGHT - FOLLOWER_HEIGHT)) + HEIGHT_DIFF1=${HEIGHT_DIFF1#-} # Absolute value - if [ ${HEIGHT_DIFF} -le 2 ]; then - log_success "Block synchronization is good (diff: ${HEIGHT_DIFF} blocks)" + HEIGHT_DIFF2=$((GENESIS_HEIGHT - FOLLOWER2_HEIGHT)) + HEIGHT_DIFF2=${HEIGHT_DIFF2#-} # Absolute value + + HEIGHT_DIFF3=$((FOLLOWER_HEIGHT - FOLLOWER2_HEIGHT)) + HEIGHT_DIFF3=${HEIGHT_DIFF3#-} # Absolute value + + # Use the maximum difference + MAX_DIFF=$((HEIGHT_DIFF1 > HEIGHT_DIFF2 ? HEIGHT_DIFF1 : HEIGHT_DIFF2)) + MAX_DIFF=$((MAX_DIFF > HEIGHT_DIFF3 ? 
MAX_DIFF : HEIGHT_DIFF3)) + + if [ ${MAX_DIFF} -le 2 ]; then + log_success "Block synchronization is good (max diff: ${MAX_DIFF} blocks)" return 0 - elif [ ${HEIGHT_DIFF} -le 10 ]; then - log_warning "Block synchronization lag (diff: ${HEIGHT_DIFF} blocks)" + elif [ ${MAX_DIFF} -le 10 ]; then + log_warning "Block synchronization lag (max diff: ${MAX_DIFF} blocks)" return 1 else - log_error "Block synchronization severely lagged (diff: ${HEIGHT_DIFF} blocks)" + log_error "Block synchronization severely lagged (max diff: ${MAX_DIFF} blocks)" return 1 fi } @@ -259,23 +290,37 @@ test_sync() { log_warning "Genesis node has uncommitted changes" fi - # Check git status on follower - log_debug "Checking git status on follower node" + # Check git status on follower (aitbc1) + log_debug "Checking git status on follower node (aitbc1)" FOLLOWER_STATUS=$(ssh aitbc1 'cd /opt/aitbc && git status --porcelain 2>/dev/null' || echo "error") if [ "${FOLLOWER_STATUS}" = "error" ]; then - log_error "Git status check failed on follower node" + log_error "Git status check failed on follower node (aitbc1)" return 1 elif [ -z "${FOLLOWER_STATUS}" ]; then - log_success "Follower node git status is clean" + log_success "Follower node (aitbc1) git status is clean" else - log_warning "Follower node has uncommitted changes" + log_warning "Follower node (aitbc1) has uncommitted changes" + fi + + # Check git status on follower (aitbc2/gitea-runner) + log_debug "Checking git status on follower node (aitbc2/gitea-runner)" + FOLLOWER2_STATUS=$(ssh gitea-runner 'cd /opt/aitbc && git status --porcelain 2>/dev/null' || echo "error") + + if [ "${FOLLOWER2_STATUS}" = "error" ]; then + log_error "Git status check failed on follower node (aitbc2/gitea-runner)" + return 1 + elif [ -z "${FOLLOWER2_STATUS}" ]; then + log_success "Follower node (aitbc2/gitea-runner) git status is clean" + else + log_warning "Follower node (aitbc2/gitea-runner) has uncommitted changes" fi # Test git pull log_debug "Testing git 
pull from Gitea" git pull origin main --verbose >> "${LOG_FILE}" 2>&1 ssh aitbc1 'cd /opt/aitbc && git pull origin main --verbose' >> "${LOG_FILE}" 2>&1 + ssh gitea-runner 'cd /opt/aitbc && git pull origin main --verbose' >> "${LOG_FILE}" 2>&1 log_success "Git synchronization test completed" return 0 @@ -347,7 +392,7 @@ run_monitor() { # Main execution main() { log_info "Blockchain Communication Test Script" - log_info "Genesis IP: ${GENESIS_IP}, Follower IP: ${FOLLOWER_IP}, Port: ${PORT}" + log_info "Genesis IP: ${GENESIS_IP}, Follower IP: ${FOLLOWER_IP}, Follower2 IP: ${FOLLOWER2_IP}, Port: ${PORT}" # Create log directory if it doesn't exist mkdir -p "${LOG_DIR}" diff --git a/scripts/multi-node/blockchain-health-check.sh b/scripts/multi-node/blockchain-health-check.sh new file mode 100755 index 00000000..1793d4ae --- /dev/null +++ b/scripts/multi-node/blockchain-health-check.sh @@ -0,0 +1,288 @@ +#!/bin/bash +# +# Multi-Node Blockchain Health Check Script +# Checks health of all 3 blockchain nodes (aitbc, aitbc1, aitbc2) +# Provides automatic remediation for failed services +# + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." 
&& pwd)" +LOG_DIR="/var/log/aitbc" +LOG_FILE="${LOG_DIR}/multi-node-health.log" + +# Node Configuration +NODES=( + "aitbc:10.1.223.93" + "aitbc1:10.1.223.40" + "aitbc2:10.1.223.98" +) + +RPC_PORT=8006 +REDIS_HOST="10.1.223.93" +REDIS_PORT=6379 + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' + +# Logging functions +log() { + local level="$1" + shift + local message="$@" + local timestamp=$(date '+%Y-%m-%d %H:%M:%S') + echo "[${timestamp}] [${level}] ${message}" | tee -a "${LOG_FILE}" +} + +log_success() { + log "SUCCESS" "$@" + echo -e "${GREEN}$@${NC}" +} + +log_error() { + log "ERROR" "$@" + echo -e "${RED}$@${NC}" +} + +log_warning() { + log "WARNING" "$@" + echo -e "${YELLOW}$@${NC}" +} + +# SSH execution helper +ssh_exec() { + local node="$1" + local command="$2" + ssh -o ConnectTimeout=10 -o StrictHostKeyChecking=no "$node" "$command" 2>&1 || return 1 +} + +# Check RPC endpoint health +check_rpc_health() { + local node_name="$1" + local node_ip="$2" + + log "Checking RPC health for ${node_name} (${node_ip}:${RPC_PORT})" + + if curl -f -s --max-time 5 "http://${node_ip}:${RPC_PORT}/health" > /dev/null 2>&1; then + log_success "RPC endpoint healthy on ${node_name}" + return 0 + else + log_error "RPC endpoint unhealthy on ${node_name}" + return 1 + fi +} + +# Check systemd service status +check_service_status() { + local node="$1" + local service="$2" + + log "Checking ${service} status on ${node}" + + status=$(ssh_exec "$node" "systemctl is-active ${service}" 2>&1 || echo "inactive") + + if [ "$status" = "active" ]; then + log_success "${service} is active on ${node}" + return 0 + else + log_error "${service} is ${status} on ${node}" + return 1 + fi +} + +# Check resource usage +check_resource_usage() { + local node="$1" + + log "Checking resource usage on ${node}" + + memory=$(ssh_exec "$node" "free | grep Mem | awk '{printf \"%.1f\", (\$3/\$2)*100}'" 2>&1 || echo "0") + cpu=$(ssh_exec "$node" "top -bn1 | grep 
'Cpu(s)' | awk '{print \$2}' | cut -d'%' -f1" 2>&1 || echo "0") + disk=$(ssh_exec "$node" "df /var/lib/aitbc | tail -1 | awk '{print \$5}' | cut -d'%' -f1" 2>&1 || echo "0") + + log "Resource usage on ${node}: CPU ${cpu}%, Memory ${memory}%, Disk ${disk}%" + + # Check thresholds + if [ "${disk%.*}" -gt 90 ]; then + log_warning "Disk usage critical on ${node}: ${disk}%" + return 1 + fi + + if [ "${memory%.*}" -gt 90 ]; then + log_warning "Memory usage critical on ${node}: ${memory}%" + return 1 + fi + + return 0 +} + +# Check Redis connectivity +check_redis_connectivity() { + log "Checking Redis connectivity (${REDIS_HOST}:${REDIS_PORT})" + + if redis-cli -h "${REDIS_HOST}" -p "${REDIS_PORT}" ping > /dev/null 2>&1; then + log_success "Redis connectivity OK" + return 0 + else + log_error "Redis connectivity failed" + return 1 + fi +} + +# Remediation functions +restart_rpc_service() { + local node="$1" + log "Attempting to restart aitbc-blockchain-rpc on ${node}" + + ssh_exec "$node" "systemctl restart aitbc-blockchain-rpc" 2>&1 | tee -a "${LOG_FILE}" + sleep 5 + + if ssh_exec "$node" "systemctl is-active aitbc-blockchain-rpc" 2>&1 | grep -q "active"; then + log_success "Successfully restarted aitbc-blockchain-rpc on ${node}" + return 0 + else + log_error "Failed to restart aitbc-blockchain-rpc on ${node}" + return 1 + fi +} + +restart_p2p_service() { + local node="$1" + log "Attempting to restart aitbc-blockchain-p2p on ${node}" + + ssh_exec "$node" "systemctl restart aitbc-blockchain-p2p" 2>&1 | tee -a "${LOG_FILE}" + sleep 5 + + if ssh_exec "$node" "systemctl is-active aitbc-blockchain-p2p" 2>&1 | grep -q "active"; then + log_success "Successfully restarted aitbc-blockchain-p2p on ${node}" + return 0 + else + log_error "Failed to restart aitbc-blockchain-p2p on ${node}" + return 1 + fi +} + +restart_node_service() { + local node="$1" + log "Attempting to restart aitbc-blockchain-node on ${node}" + + ssh_exec "$node" "systemctl restart aitbc-blockchain-node" 2>&1 | 
tee -a "${LOG_FILE}" + sleep 10 + + if ssh_exec "$node" "systemctl is-active aitbc-blockchain-node" 2>&1 | grep -q "active"; then + log_success "Successfully restarted aitbc-blockchain-node on ${node}" + return 0 + else + log_error "Failed to restart aitbc-blockchain-node on ${node}" + return 1 + fi +} + +# Main health check for a node +check_node_health() { + local node_name="$1" + local node_ip="$2" + local node="${node_name}" + + local failures=0 + + # Check RPC health + if ! check_rpc_health "$node_name" "$node_ip"; then + ((failures++)) + log "Attempting remediation for RPC on ${node_name}" + if restart_rpc_service "$node"; then + # Retry RPC check + if ! check_rpc_health "$node_name" "$node_ip"; then + log_error "RPC remediation failed on ${node_name}" + else + log_success "RPC remediation successful on ${node_name}" + ((failures--)) + fi + fi + fi + + # Check blockchain node service + if ! check_service_status "$node" "aitbc-blockchain-node"; then + ((failures++)) + log "Attempting remediation for blockchain node on ${node_name}" + if restart_node_service "$node"; then + # Retry service check + if check_service_status "$node" "aitbc-blockchain-node"; then + log_success "Blockchain node remediation successful on ${node_name}" + ((failures--)) + fi + fi + fi + + # Check P2P service + if ! check_service_status "$node" "aitbc-blockchain-p2p"; then + ((failures++)) + log "Attempting remediation for P2P on ${node_name}" + if restart_p2p_service "$node"; then + # Retry service check + if check_service_status "$node" "aitbc-blockchain-p2p"; then + log_success "P2P remediation successful on ${node_name}" + ((failures--)) + fi + fi + fi + + # Check resource usage + if ! 
check_resource_usage "$node"; then + ((failures++)) + log_warning "Resource usage issues on ${node_name}" + fi + + return $failures +} + +# Main execution +main() { + log "=== Multi-Node Blockchain Health Check Started ===" + + # Create log directory if it doesn't exist + mkdir -p "${LOG_DIR}" + + local total_failures=0 + + # Check Redis connectivity (shared resource) + if ! check_redis_connectivity; then + log_error "Redis connectivity failed - this affects all nodes" + ((total_failures++)) + fi + + # Check each node + for node_config in "${NODES[@]}"; do + IFS=':' read -r node_name node_ip <<< "$node_config" + + log "=== Checking node: ${node_name} (${node_ip}) ===" + + if check_node_health "$node_name" "$node_ip"; then + log_success "Node ${node_name} is healthy" + else + failures=$? + log_error "Node ${node_name} has ${failures} health issues" + ((total_failures+=failures)) + fi + + echo "" | tee -a "${LOG_FILE}" + done + + log "=== Multi-Node Blockchain Health Check Completed ===" + log "Total failures: ${total_failures}" + + if [ ${total_failures} -eq 0 ]; then + log_success "All nodes are healthy" + exit 0 + else + log_error "Health check completed with ${total_failures} failures" + exit 1 + fi +} + +# Run main function +main "$@" diff --git a/scripts/multi-node/p2p-verification.sh b/scripts/multi-node/p2p-verification.sh new file mode 100755 index 00000000..cddcb2dc --- /dev/null +++ b/scripts/multi-node/p2p-verification.sh @@ -0,0 +1,287 @@ +#!/bin/bash +# +# P2P Network Verification Script +# Verifies P2P network connectivity across all 3 blockchain nodes +# + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." 
&& pwd)" +LOG_DIR="/var/log/aitbc" +LOG_FILE="${LOG_DIR}/p2p-verification.log" + +# Node Configuration +NODES=( + "aitbc:10.1.223.93" + "aitbc1:10.1.223.40" + "aitbc2:10.1.223.98" +) + +P2P_PORT=7070 +REDIS_HOST="10.1.223.93" +REDIS_PORT=6379 + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' + +# Logging functions +log() { + local level="$1" + shift + local message="$@" + local timestamp=$(date '+%Y-%m-%d %H:%M:%S') + echo "[${timestamp}] [${level}] ${message}" | tee -a "${LOG_FILE}" +} + +log_success() { + log "SUCCESS" "$@" + echo -e "${GREEN}$@${NC}" +} + +log_error() { + log "ERROR" "$@" + echo -e "${RED}$@${NC}" +} + +log_warning() { + log "WARNING" "$@" + echo -e "${YELLOW}$@${NC}" +} + +# SSH execution helper +ssh_exec() { + local node="$1" + local command="$2" + ssh -o ConnectTimeout=10 -o StrictHostKeyChecking=no "$node" "$command" 2>&1 || return 1 +} + +# Check P2P peer list on a node +check_p2p_peers() { + local node="$1" + local node_name="$2" + + log "Checking P2P peers on ${node_name}" + + # Read node.env to get expected peers + peers=$(ssh_exec "$node" "grep '^p2p_peers=' /etc/aitbc/node.env | cut -d'=' -f2" 2>&1 || echo "") + + if [ -z "$peers" ]; then + log_error "No p2p_peers configured on ${node_name}" + return 1 + fi + + log "Expected peers on ${node_name}: ${peers}" + + # Check P2P service status + if ! 
ssh_exec "$node" "systemctl is-active aitbc-blockchain-p2p" | grep -q "active"; then + log_error "P2P service not active on ${node_name}" + return 1 + fi + + log_success "P2P peers configured on ${node_name}" + return 0 +} + +# Check P2P connectivity between nodes +check_p2p_connectivity() { + local source_node="$1" + local source_name="$2" + local target_node="$3" + local target_name="$4" + + log "Checking P2P connectivity from ${source_name} to ${target_name}" + + # Try to connect to target P2P port + if ssh_exec "$source_node" "timeout 5 bash -c '</dev/tcp/${target_node}/${P2P_PORT}'" 2>&1; then + log_success "P2P connectivity OK from ${source_name} to ${target_name}" + return 0 + else + log_error "P2P connectivity FAILED from ${source_name} to ${target_name}" + return 1 + fi +} + +# Check Redis gossip backend connectivity +check_gossip_backend() { + log "Checking Redis gossip backend connectivity (${REDIS_HOST}:${REDIS_PORT})" + + if redis-cli -h "${REDIS_HOST}" -p "${REDIS_PORT}" ping > /dev/null 2>&1; then + log_success "Redis gossip backend connectivity OK" + return 0 + else + log_error "Redis gossip backend connectivity failed" + return 1 + fi +} + +# Check for P2P handshake errors in logs +check_p2p_logs() { + local node="$1" + local node_name="$2" + + log "Checking P2P logs for errors on ${node_name}" + + # Check for handshake errors + errors=$(ssh_exec "$node" "journalctl -u aitbc-blockchain-p2p --since '1 hour ago' | grep -i 'handshake\|error\|failed' | tail -5" 2>&1 || echo "") + + if [ -n "$errors" ]; then + log_warning "P2P errors found on ${node_name}:" + echo "$errors" | tee -a "${LOG_FILE}" + return 1 + else + log_success "No P2P errors found on ${node_name}" + return 0 + fi +} + +# Remediation: Restart P2P service +remediate_p2p_service() { + local node="$1" + local node_name="$2" + + log "Attempting P2P remediation on ${node_name}" + + ssh_exec "$node" "systemctl restart aitbc-blockchain-p2p" 2>&1 | tee -a "${LOG_FILE}" + sleep 5 + + if ssh_exec "$node" "systemctl is-active 
aitbc-blockchain-p2p" | grep -q "active"; then + log_success "P2P service remediation successful on ${node_name}" + return 0 + else + log_error "P2P service remediation failed on ${node_name}" + return 1 + fi +} + +# Update p2p_peers configuration if needed +update_p2p_peers() { + local node="$1" + local node_name="$2" + + log "Updating p2p_peers configuration on ${node_name}" + + # Determine correct peers based on node name + case "$node_name" in + "aitbc") + peers="aitbc1:7070,aitbc2:7070" + ;; + "aitbc1") + peers="aitbc:7070,aitbc2:7070" + ;; + "aitbc2") + peers="aitbc:7070,aitbc1:7070" + ;; + *) + log_error "Unknown node name: ${node_name}" + return 1 + ;; + esac + + # Update node.env + ssh_exec "$node" "sed -i 's/^p2p_peers=.*/p2p_peers=${peers}/' /etc/aitbc/node.env" 2>&1 | tee -a "${LOG_FILE}" + + # Restart P2P service to apply changes + ssh_exec "$node" "systemctl restart aitbc-blockchain-p2p" 2>&1 | tee -a "${LOG_FILE}" + sleep 5 + + log_success "Updated p2p_peers on ${node_name} to: ${peers}" + return 0 +} + +# Main verification for a node +verify_node_p2p() { + local node_name="$1" + local node_ip="$2" + local node="${node_name}" + + local failures=0 + + # Check P2P peers configuration + if ! check_p2p_peers "$node" "$node_name"; then + ((failures++)) + log "Attempting remediation for P2P peers on ${node_name}" + update_p2p_peers "$node" "$node_name" || true + fi + + # Check P2P logs for errors + if ! check_p2p_logs "$node" "$node_name"; then + ((failures++)) + log "Attempting remediation for P2P errors on ${node_name}" + remediate_p2p_service "$node" "$node_name" || true + fi + + return $failures +} + +# Main execution +main() { + log "=== P2P Network Verification Started ===" + + # Create log directory if it doesn't exist + mkdir -p "${LOG_DIR}" + + local total_failures=0 + + # Check Redis gossip backend + if ! 
check_gossip_backend; then + log_error "Gossip backend connectivity failed" + ((total_failures++)) + fi + + # Check each node's P2P configuration + for node_config in "${NODES[@]}"; do + IFS=':' read -r node_name node_ip <<< "$node_config" + + log "=== Verifying P2P on node: ${node_name} (${node_ip}) ===" + + if verify_node_p2p "$node_name" "$node_ip"; then + log_success "P2P verification passed for ${node_name}" + else + failures=$? + log_error "P2P verification failed for ${node_name} with ${failures} issues" + ((total_failures+=failures)) + fi + + echo "" | tee -a "${LOG_FILE}" + done + + # Check P2P connectivity between all node pairs + log "=== Checking P2P connectivity between node pairs ===" + + for source_config in "${NODES[@]}"; do + IFS=':' read -r source_name source_ip <<< "$source_config" + + for target_config in "${NODES[@]}"; do + IFS=':' read -r target_name target_ip <<< "$target_config" + + # Skip self-connectivity check + if [ "$source_name" = "$target_name" ]; then + continue + fi + + if ! 
check_p2p_connectivity "$source_name" "$source_name" "$target_ip" "$target_name"; then + ((total_failures++)) + log "Attempting remediation for P2P connectivity" + remediate_p2p_service "$source_name" "$source_name" || true + fi + done + done + + log "=== P2P Network Verification Completed ===" + log "Total failures: ${total_failures}" + + if [ ${total_failures} -eq 0 ]; then + log_success "P2P network verification passed" + exit 0 + else + log_error "P2P network verification failed with ${total_failures} failures" + exit 1 + fi +} + +# Run main function +main "$@" diff --git a/scripts/multi-node/sync-verification.sh b/scripts/multi-node/sync-verification.sh new file mode 100755 index 00000000..47f7e7e5 --- /dev/null +++ b/scripts/multi-node/sync-verification.sh @@ -0,0 +1,348 @@ +#!/bin/bash +# +# Blockchain Synchronization Verification Script +# Verifies blockchain synchronization across all 3 nodes +# Provides automatic remediation by forcing sync from healthy node +# + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." 
&& pwd)" +LOG_DIR="/var/log/aitbc" +LOG_FILE="${LOG_DIR}/sync-verification.log" + +# Node Configuration +NODES=( + "aitbc:10.1.223.93" + "aitbc1:10.1.223.40" + "aitbc2:10.1.223.98" +) + +RPC_PORT=8006 +SYNC_THRESHOLD=10 + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' + +# Logging functions +log() { + local level="$1" + shift + local message="$@" + local timestamp=$(date '+%Y-%m-%d %H:%M:%S') + echo "[${timestamp}] [${level}] ${message}" | tee -a "${LOG_FILE}" +} + +log_success() { + log "SUCCESS" "$@" + echo -e "${GREEN}$@${NC}" +} + +log_error() { + log "ERROR" "$@" + echo -e "${RED}$@${NC}" +} + +log_warning() { + log "WARNING" "$@" + echo -e "${YELLOW}$@${NC}" +} + +# SSH execution helper +ssh_exec() { + local node="$1" + local command="$2" + ssh -o ConnectTimeout=10 -o StrictHostKeyChecking=no "$node" "$command" 2>&1 || return 1 +} + +# Get block height from RPC endpoint +get_block_height() { + local node_ip="$1" + + # Try to get block height from RPC + height=$(curl -s --max-time 5 "http://${node_ip}:${RPC_PORT}/blockchain/height" 2>/dev/null | grep -o '[0-9]*' || echo "0") + + if [ -z "$height" ] || [ "$height" = "0" ]; then + # Try alternative endpoint + height=$(curl -s --max-time 5 "http://${node_ip}:${RPC_PORT}/height" 2>/dev/null | grep -o '[0-9]*' || echo "0") + fi + + echo "$height" +} + +# Get chain ID from RPC endpoint +get_chain_id() { + local node_ip="$1" + + chain_id=$(curl -s --max-time 5 "http://${node_ip}:${RPC_PORT}/blockchain/chain-id" 2>/dev/null || echo "") + + if [ -z "$chain_id" ]; then + chain_id=$(curl -s --max-time 5 "http://${node_ip}:${RPC_PORT}/chain-id" 2>/dev/null || echo "") + fi + + echo "$chain_id" +} + +# Get block hash at specific height +get_block_hash() { + local node_ip="$1" + local height="$2" + + hash=$(curl -s --max-time 5 "http://${node_ip}:${RPC_PORT}/blockchain/block/${height}/hash" 2>/dev/null || echo "") + echo "$hash" +} + +# Check chain ID consistency 
+check_chain_id_consistency() { + log "Checking chain ID consistency across nodes" + + local first_chain_id="" + local consistent=true + + for node_config in "${NODES[@]}"; do + IFS=':' read -r node_name node_ip <<< "$node_config" + + chain_id=$(get_chain_id "$node_ip") + + if [ -z "$chain_id" ]; then + log_error "Could not get chain ID from ${node_name}" + consistent=false + continue + fi + + log "Chain ID on ${node_name}: ${chain_id}" + + if [ -z "$first_chain_id" ]; then + first_chain_id="$chain_id" + elif [ "$chain_id" != "$first_chain_id" ]; then + log_error "Chain ID mismatch on ${node_name}: ${chain_id} vs ${first_chain_id}" + consistent=false + fi + done + + if [ "$consistent" = true ]; then + log_success "Chain ID consistent across all nodes" + return 0 + else + log_error "Chain ID inconsistent across nodes" + return 1 + fi +} + +# Check block synchronization +check_block_sync() { + log "Checking block synchronization across nodes" + + local heights=() + local max_height=0 + local min_height=999999 + + for node_config in "${NODES[@]}"; do + IFS=':' read -r node_name node_ip <<< "$node_config" + + height=$(get_block_height "$node_ip") + + if [ -z "$height" ] || [ "$height" = "0" ]; then + log_error "Could not get block height from ${node_name}" + return 1 + fi + + heights+=("${node_name}:${height}") + log "Block height on ${node_name}: ${height}" + + if [ "$height" -gt "$max_height" ]; then + max_height=$height + max_node="${node_name}" + max_ip="${node_ip}" + fi + + if [ "$height" -lt "$min_height" ]; then + min_height=$height + fi + done + + local height_diff=$((max_height - min_height)) + + log "Max height: ${max_height} (${max_node}), Min height: ${min_height}, Diff: ${height_diff}" + + if [ "$height_diff" -le "$SYNC_THRESHOLD" ]; then + log_success "Block synchronization within threshold (diff: ${height_diff})" + return 0 + else + log_error "Block synchronization exceeds threshold (diff: ${height_diff})" + return 1 + fi +} + +# Check block hash 
#######################################
# Check block hash consistency at the lowest height reported by the nodes.
# Globals:   NODES (read)
# Returns:   0 when at least two nodes returned a hash at the comparison
#            height and all returned hashes are equal; 1 otherwise
#######################################
check_block_hash_consistency() {
    log "INFO" "Checking block hash consistency"

    local node_config node_name node_ip height hash
    local target_height=""

    # Find the minimum height to compare at. Nodes that report no usable
    # height are skipped here so the comparison is not made at height 0.
    for node_config in "${NODES[@]}"; do
        IFS=':' read -r node_name node_ip <<< "$node_config"
        height=$(get_block_height "$node_ip") || height=""

        if ! [[ "$height" =~ ^[0-9]+$ ]] || [ "$height" -eq 0 ]; then
            log_warning "Could not get block height from ${node_name}"
            continue
        fi
        height=$((10#$height))

        if [ -z "$target_height" ] || [ "$height" -lt "$target_height" ]; then
            target_height=$height
        fi
    done

    if [ -z "$target_height" ]; then
        log_error "No node reported a block height - cannot compare block hashes"
        return 1
    fi

    log "INFO" "Comparing block hashes at height ${target_height}"

    local first_hash=""
    local consistent=true
    local compared=0

    for node_config in "${NODES[@]}"; do
        IFS=':' read -r node_name node_ip <<< "$node_config"

        hash=$(get_block_hash "$node_ip" "$target_height") || hash=""

        if [ -z "$hash" ]; then
            log_warning "Could not get block hash from ${node_name} at height ${target_height}"
            continue
        fi

        log "INFO" "Block hash on ${node_name} at height ${target_height}: ${hash}"
        compared=$((compared + 1))

        if [ -z "$first_hash" ]; then
            first_hash="$hash"
        elif [ "$hash" != "$first_hash" ]; then
            log_error "Block hash mismatch on ${node_name} at height ${target_height}"
            consistent=false
        fi
    done

    # With fewer than two hashes nothing was actually compared, so claiming
    # consistency would be a false positive.
    if [ "$compared" -lt 2 ]; then
        log_error "Block hashes inconsistent"
        log_error "Only ${compared} node(s) returned a block hash at height ${target_height}"
        return 1
    fi

    if [ "$consistent" = true ]; then
        log_success "Block hashes consistent at height ${target_height}"
        return 0
    else
        log_error "Block hashes inconsistent"
        return 1
    fi
}

# Run a remote command through ssh_exec, copy its output to stdout and
# LOG_FILE, and return ssh_exec's own status (a plain `cmd | tee` pipeline
# would report tee's status instead).
# Arguments: $1 - ssh destination, $2 - remote command line
_sync_remote_logged() {
    local output
    local status=0

    output=$(ssh_exec "$1" "$2" 2>&1) || status=$?
    if [ -n "$output" ]; then
        printf '%s\n' "$output" | tee -a "${LOG_FILE}"
    fi
    return "$status"
}

#######################################
# Remediation: force sync from a healthy node.
# Stops the blockchain service on the target, replaces its chain.db with a
# copy streamed from the source node, restarts the service and verifies it.
# Arguments: $1 - target ssh destination   $2 - target display name
#            $3 - source ssh destination   $4 - source display name
# Globals:   LOG_FILE (appended)
# Returns:   0 when the service is active on the target after the copy,
#            1 on any failure (the target's previous chain.db is kept when
#            the copy itself fails)
#######################################
force_sync_from_source() {
    local target_node="$1"
    local target_name="$2"
    local source_node="$3"
    local source_name="$4"

    local service="aitbc-blockchain-node"
    local db_path="/var/lib/aitbc/data/chain.db"
    local tmp_path="${db_path}.sync-tmp"
    # Same connection options as ssh_exec plus BatchMode (never prompt) and
    # LogLevel=ERROR (no known-hosts warnings).
    local -a ssh_opts=(
        -o ConnectTimeout=10
        -o StrictHostKeyChecking=no
        -o BatchMode=yes
        -o LogLevel=ERROR
    )

    if [ -z "$target_node" ] || [ -z "$source_node" ]; then
        log_error "Cannot force sync: source or target node is empty"
        return 1
    fi
    if [ "$target_node" = "$source_node" ]; then
        log_error "Cannot force sync: source and target are the same node (${target_name})"
        return 1
    fi

    log "INFO" "Forcing sync from ${source_name} to ${target_name}"

    # Stop blockchain service on target
    log "INFO" "Stopping blockchain service on ${target_name}"
    if ! _sync_remote_logged "$target_node" "systemctl stop ${service}"; then
        log_error "Failed to stop blockchain service on ${target_name}"
        return 1
    fi
    sleep 5

    # Copy chain.db from source to target.
    # The data stream uses ssh directly instead of ssh_exec: ssh_exec merges
    # stderr into stdout, which would write ssh diagnostics into the database.
    # The copy lands in a temporary file first so that a failed transfer
    # cannot leave a truncated chain.db behind.
    # NOTE(review): the source service keeps running while its chain.db is
    # read; if that file can change mid-copy (e.g. a database with a separate
    # journal/WAL file) the copy may be inconsistent - confirm with the node's
    # storage format.
    log "INFO" "Copying chain.db from ${source_name} to ${target_name}"
    if ! (
        set -o pipefail
        ssh -n "${ssh_opts[@]}" "$source_node" "cat '${db_path}'" \
            | ssh "${ssh_opts[@]}" "$target_node" "cat > '${tmp_path}'"
    ) 2> >(tee -a "${LOG_FILE}" >&2); then
        log_error "Failed to copy chain.db from ${source_name} to ${target_name}"
        # Best effort: drop the partial copy and bring the node back up on its
        # previous database instead of leaving it stopped.
        _sync_remote_logged "$target_node" "rm -f '${tmp_path}'; systemctl start ${service}" || true
        return 1
    fi

    # Move the copy into place, keeping owner and mode of the existing file
    # (chown/chmod --reference are GNU coreutils options).
    local install_cmd="if [ -e '${db_path}' ]; then chown --reference='${db_path}' '${tmp_path}' && chmod --reference='${db_path}' '${tmp_path}'; fi && mv -f '${tmp_path}' '${db_path}'"
    if ! _sync_remote_logged "$target_node" "$install_cmd"; then
        log_error "Failed to install copied chain.db on ${target_name}"
        _sync_remote_logged "$target_node" "rm -f '${tmp_path}'; systemctl start ${service}" || true
        return 1
    fi

    # Start blockchain service on target
    log "INFO" "Starting blockchain service on ${target_name}"
    _sync_remote_logged "$target_node" "systemctl start ${service}" || true
    sleep 10

    # Verify service is running. The exit status of `is-active --quiet` is
    # used rather than grepping for "active", which also matches "inactive"
    # and "activating".
    if ssh_exec "$target_node" "systemctl is-active --quiet ${service}" > /dev/null; then
        log_success "Sync completed successfully on ${target_name}"
        return 0
    else
        log_error "Failed to start blockchain service on ${target_name} after sync"
        return 1
    fi
}

#######################################
# Main sync verification.
# Runs the chain ID, block height and block hash checks, attempts one forced
# sync when the height spread exceeds SYNC_THRESHOLD, and exits 0 only when
# no check failed.
# Globals:   NODES, LOG_DIR, SYNC_THRESHOLD (read)
#######################################
main() {
    # Create log directory if it doesn't exist - before the first log call,
    # otherwise the opening line cannot be appended to the log file.
    mkdir -p "${LOG_DIR}"

    log "INFO" "=== Blockchain Synchronization Verification Started ==="

    local total_failures=0

    # Check chain ID consistency
    if ! check_chain_id_consistency; then
        log_error "Chain ID inconsistency detected - this is critical"
        # Plain assignment: ((total_failures++)) returns status 1 when the
        # counter is 0, which aborts the script if errexit is enabled.
        total_failures=$((total_failures + 1))
    fi

    # Check block synchronization
    if ! check_block_sync; then
        log_error "Block synchronization issue detected"
        total_failures=$((total_failures + 1))

        # Determine source and target nodes for remediation.
        # Min/max are seeded from the first node instead of fixed sentinels so
        # that the lowest node is always identified, whatever the chain height.
        local max_height="" max_node="" max_ip=""
        local min_height="" min_node="" min_ip=""
        local node_config node_name node_ip height

        for node_config in "${NODES[@]}"; do
            IFS=':' read -r node_name node_ip <<< "$node_config"
            height=$(get_block_height "$node_ip") || height=0
            # A node without a usable height counts as height 0, which makes
            # it the remediation target.
            # NOTE(review): this forces a chain.db overwrite on a node whose
            # RPC merely did not answer - confirm this is intended.
            if [[ "$height" =~ ^[0-9]+$ ]]; then
                height=$((10#$height))
            else
                height=0
            fi

            if [ -z "$max_height" ] || [ "$height" -gt "$max_height" ]; then
                max_height=$height
                max_node="${node_name}"
                max_ip="${node_ip}"
            fi

            if [ -z "$min_height" ] || [ "$height" -lt "$min_height" ]; then
                min_height=$height
                min_node="${node_name}"
                min_ip="${node_ip}"
            fi
        done

        # Attempt remediation if difference exceeds threshold
        local height_diff=0
        if [ -n "$max_height" ]; then
            height_diff=$((max_height - min_height))
        fi
        if [ "$height_diff" -gt "$SYNC_THRESHOLD" ]; then
            log "INFO" "Attempting remediation: sync from ${max_node} to ${min_node}"
            if force_sync_from_source "$min_ip" "$min_node" "$max_ip" "$max_node"; then
                log_success "Remediation successful"
                # Re-check sync after remediation
                if check_block_sync; then
                    log_success "Sync verification passed after remediation"
                else
                    log_error "Sync still fails after remediation"
                    total_failures=$((total_failures + 1))
                fi
            else
                log_error "Remediation failed"
                total_failures=$((total_failures + 1))
            fi
        fi
    fi

    # Check block hash consistency
    if ! check_block_hash_consistency; then
        log_error "Block hash inconsistency detected"
        total_failures=$((total_failures + 1))
    fi

    log "INFO" "=== Blockchain Synchronization Verification Completed ==="
    log "INFO" "Total failures: ${total_failures}"

    if [ "${total_failures}" -eq 0 ]; then
        log_success "Blockchain synchronization verification passed"
        exit 0
    else
        log_error "Blockchain synchronization verification failed with ${total_failures} failures"
        exit 1
    fi
}

# Run main function
main "$@"