feat: add multi-node blockchain monitoring workflows for 3-node network
- Create multi-node blockchain health monitoring workflow - Create P2P network verification workflow for all 3 nodes - Create blockchain synchronization verification workflow - Update blockchain-communication-test.sh to include aitbc2 (gitea-runner) - Add shared scripts directory with health check, P2P verification, and sync verification scripts - All workflows trigger on git push to main/develop branches - Workflows run on gitea-runner (has SSH access to all nodes) - Include automatic remediation for failed services and sync issues - Sync threshold set to 10 blocks - Logging to /var/log/aitbc/ and alerts in Gitea UI
This commit is contained in:
67
.gitea/workflows/blockchain-sync-verification.yml
Normal file
67
.gitea/workflows/blockchain-sync-verification.yml
Normal file
@@ -0,0 +1,67 @@
|
||||
name: Blockchain Synchronization Verification
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [main, develop]
|
||||
paths:
|
||||
- 'apps/blockchain-node/**'
|
||||
- 'scripts/multi-node/**'
|
||||
- '.gitea/workflows/blockchain-sync-verification.yml'
|
||||
pull_request:
|
||||
branches: [main, develop]
|
||||
workflow_dispatch:
|
||||
schedule:
|
||||
- cron: '0 */6 * * *' # Every 6 hours
|
||||
|
||||
# Serialize every run of this workflow, whatever ref triggered it: the job
# always works in the same fixed directory
# (/var/lib/aitbc-workspaces/blockchain-sync-verification) and starts by
# deleting it, so runs on different refs must not overlap.
concurrency:
  group: blockchain-sync-verification
  cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
sync-verification:
|
||||
runs-on: debian
|
||||
timeout-minutes: 20
|
||||
|
||||
steps:
|
||||
- name: Clone repository
|
||||
run: |
|
||||
WORKSPACE="/var/lib/aitbc-workspaces/blockchain-sync-verification"
|
||||
rm -rf "$WORKSPACE"
|
||||
mkdir -p "$WORKSPACE"
|
||||
cd "$WORKSPACE"
|
||||
git clone --depth 1 http://gitea.bubuit.net:3000/oib/aitbc.git repo
|
||||
|
||||
- name: Initialize job logging
|
||||
run: |
|
||||
cd /var/lib/aitbc-workspaces/blockchain-sync-verification/repo
|
||||
bash scripts/ci/setup-job-logging.sh
|
||||
|
||||
- name: Setup Python environment
|
||||
run: |
|
||||
cd /var/lib/aitbc-workspaces/blockchain-sync-verification/repo
|
||||
|
||||
# Remove any existing venv to avoid cache corruption issues
|
||||
rm -rf venv
|
||||
|
||||
bash scripts/ci/setup-python-venv.sh \
|
||||
--repo-dir "$PWD" \
|
||||
--venv-dir "$PWD/venv" \
|
||||
--skip-requirements \
|
||||
--extra-packages "requests psutil"
|
||||
|
||||
- name: Run blockchain synchronization verification
|
||||
run: |
|
||||
cd /var/lib/aitbc-workspaces/blockchain-sync-verification/repo
|
||||
bash scripts/multi-node/sync-verification.sh
|
||||
|
||||
- name: Sync verification report
|
||||
if: always()
|
||||
run: |
|
||||
echo "=== Blockchain Synchronization Verification Report ==="
|
||||
if [ -f /var/log/aitbc/sync-verification.log ]; then
|
||||
tail -50 /var/log/aitbc/sync-verification.log
|
||||
fi
|
||||
|
||||
- name: Cleanup
|
||||
if: always()
|
||||
run: rm -rf /var/lib/aitbc-workspaces/blockchain-sync-verification
|
||||
67
.gitea/workflows/multi-node-health.yml
Normal file
67
.gitea/workflows/multi-node-health.yml
Normal file
@@ -0,0 +1,67 @@
|
||||
name: Multi-Node Blockchain Health Monitoring
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [main, develop]
|
||||
paths:
|
||||
- 'apps/blockchain-node/**'
|
||||
- 'scripts/multi-node/**'
|
||||
- '.gitea/workflows/multi-node-health.yml'
|
||||
pull_request:
|
||||
branches: [main, develop]
|
||||
workflow_dispatch:
|
||||
schedule:
|
||||
- cron: '0 */2 * * *' # Every 2 hours
|
||||
|
||||
# Serialize every run of this workflow, whatever ref triggered it: the job
# always works in the same fixed directory
# (/var/lib/aitbc-workspaces/multi-node-health) and starts by deleting it,
# so runs on different refs must not overlap.
concurrency:
  group: multi-node-health
  cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
health-check:
|
||||
runs-on: debian
|
||||
timeout-minutes: 15
|
||||
|
||||
steps:
|
||||
- name: Clone repository
|
||||
run: |
|
||||
WORKSPACE="/var/lib/aitbc-workspaces/multi-node-health"
|
||||
rm -rf "$WORKSPACE"
|
||||
mkdir -p "$WORKSPACE"
|
||||
cd "$WORKSPACE"
|
||||
git clone --depth 1 http://gitea.bubuit.net:3000/oib/aitbc.git repo
|
||||
|
||||
- name: Initialize job logging
|
||||
run: |
|
||||
cd /var/lib/aitbc-workspaces/multi-node-health/repo
|
||||
bash scripts/ci/setup-job-logging.sh
|
||||
|
||||
- name: Setup Python environment
|
||||
run: |
|
||||
cd /var/lib/aitbc-workspaces/multi-node-health/repo
|
||||
|
||||
# Remove any existing venv to avoid cache corruption issues
|
||||
rm -rf venv
|
||||
|
||||
bash scripts/ci/setup-python-venv.sh \
|
||||
--repo-dir "$PWD" \
|
||||
--venv-dir "$PWD/venv" \
|
||||
--skip-requirements \
|
||||
--extra-packages "requests psutil"
|
||||
|
||||
- name: Run multi-node health check
|
||||
run: |
|
||||
cd /var/lib/aitbc-workspaces/multi-node-health/repo
|
||||
bash scripts/multi-node/blockchain-health-check.sh
|
||||
|
||||
- name: Health check report
|
||||
if: always()
|
||||
run: |
|
||||
echo "=== Multi-Node Health Check Report ==="
|
||||
if [ -f /var/log/aitbc/multi-node-health.log ]; then
|
||||
tail -50 /var/log/aitbc/multi-node-health.log
|
||||
fi
|
||||
|
||||
- name: Cleanup
|
||||
if: always()
|
||||
run: rm -rf /var/lib/aitbc-workspaces/multi-node-health
|
||||
67
.gitea/workflows/p2p-network-verification.yml
Normal file
67
.gitea/workflows/p2p-network-verification.yml
Normal file
@@ -0,0 +1,67 @@
|
||||
name: P2P Network Verification
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [main, develop]
|
||||
paths:
|
||||
- 'apps/blockchain-node/**'
|
||||
- 'scripts/multi-node/**'
|
||||
- '.gitea/workflows/p2p-network-verification.yml'
|
||||
pull_request:
|
||||
branches: [main, develop]
|
||||
workflow_dispatch:
|
||||
schedule:
|
||||
- cron: '0 */4 * * *' # Every 4 hours
|
||||
|
||||
# Serialize every run of this workflow, whatever ref triggered it: the job
# always works in the same fixed directory
# (/var/lib/aitbc-workspaces/p2p-network-verification) and starts by deleting
# it, so runs on different refs must not overlap.
concurrency:
  group: p2p-network-verification
  cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
p2p-verification:
|
||||
runs-on: debian
|
||||
timeout-minutes: 15
|
||||
|
||||
steps:
|
||||
- name: Clone repository
|
||||
run: |
|
||||
WORKSPACE="/var/lib/aitbc-workspaces/p2p-network-verification"
|
||||
rm -rf "$WORKSPACE"
|
||||
mkdir -p "$WORKSPACE"
|
||||
cd "$WORKSPACE"
|
||||
git clone --depth 1 http://gitea.bubuit.net:3000/oib/aitbc.git repo
|
||||
|
||||
- name: Initialize job logging
|
||||
run: |
|
||||
cd /var/lib/aitbc-workspaces/p2p-network-verification/repo
|
||||
bash scripts/ci/setup-job-logging.sh
|
||||
|
||||
- name: Setup Python environment
|
||||
run: |
|
||||
cd /var/lib/aitbc-workspaces/p2p-network-verification/repo
|
||||
|
||||
# Remove any existing venv to avoid cache corruption issues
|
||||
rm -rf venv
|
||||
|
||||
bash scripts/ci/setup-python-venv.sh \
|
||||
--repo-dir "$PWD" \
|
||||
--venv-dir "$PWD/venv" \
|
||||
--skip-requirements \
|
||||
--extra-packages "requests psutil"
|
||||
|
||||
- name: Run P2P network verification
|
||||
run: |
|
||||
cd /var/lib/aitbc-workspaces/p2p-network-verification/repo
|
||||
bash scripts/multi-node/p2p-verification.sh
|
||||
|
||||
- name: P2P verification report
|
||||
if: always()
|
||||
run: |
|
||||
echo "=== P2P Network Verification Report ==="
|
||||
if [ -f /var/log/aitbc/p2p-verification.log ]; then
|
||||
tail -50 /var/log/aitbc/p2p-verification.log
|
||||
fi
|
||||
|
||||
- name: Cleanup
|
||||
if: always()
|
||||
run: rm -rf /var/lib/aitbc-workspaces/p2p-network-verification
|
||||
@@ -1,8 +1,8 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Blockchain Communication Test Script
|
||||
# Tests communication between aitbc (genesis) and aitbc1 (follower) nodes
|
||||
# Both nodes run on port 8006 on different physical machines
|
||||
# Tests communication between aitbc (genesis), aitbc1 (follower), and aitbc2 (gitea-runner) nodes
|
||||
# All nodes run on port 8006 on different physical machines
|
||||
#
|
||||
|
||||
set -e
|
||||
@@ -11,8 +11,9 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
|
||||
|
||||
# Configuration
|
||||
GENESIS_IP="10.1.223.40"
|
||||
FOLLOWER_IP="<aitbc1-ip>" # Replace with actual IP
|
||||
GENESIS_IP="10.1.223.93"
|
||||
FOLLOWER_IP="10.1.223.40"
|
||||
FOLLOWER2_IP="10.1.223.98" # gitea-runner/aitbc2
|
||||
PORT=8006
|
||||
CLI_PATH="${CLI_PATH:-${REPO_ROOT}/aitbc-cli}"
|
||||
LOG_DIR="/var/log/aitbc"
|
||||
@@ -114,7 +115,7 @@ test_connectivity() {
|
||||
return 1
|
||||
fi
|
||||
|
||||
# Test follower node
|
||||
# Test follower node (aitbc1)
|
||||
log_debug "Testing follower node at ${FOLLOWER_IP}:${PORT}"
|
||||
if curl -f -s "http://${FOLLOWER_IP}:${PORT}/health" > /dev/null; then
|
||||
log_success "Follower node (aitbc1) is reachable"
|
||||
@@ -123,12 +124,27 @@ test_connectivity() {
|
||||
return 1
|
||||
fi
|
||||
|
||||
# Test follower node (aitbc2/gitea-runner)
|
||||
log_debug "Testing follower node (aitbc2/gitea-runner) at ${FOLLOWER2_IP}:${PORT}"
|
||||
if curl -f -s "http://${FOLLOWER2_IP}:${PORT}/health" > /dev/null; then
|
||||
log_success "Follower node (aitbc2/gitea-runner) is reachable"
|
||||
else
|
||||
log_error "Follower node (aitbc2/gitea-runner) is NOT reachable"
|
||||
return 1
|
||||
fi
|
||||
|
||||
# Test P2P connectivity
|
||||
log_debug "Testing P2P connectivity"
|
||||
if ${CLI_PATH} network ping --node aitbc1 --host ${FOLLOWER_IP} --port ${PORT} --debug > /dev/null 2>&1; then
|
||||
log_success "P2P connectivity between nodes is working"
|
||||
log_success "P2P connectivity to aitbc1 is working"
|
||||
else
|
||||
log_warning "P2P connectivity test failed (may not be critical)"
|
||||
log_warning "P2P connectivity to aitbc1 test failed (may not be critical)"
|
||||
fi
|
||||
|
||||
if ${CLI_PATH} network ping --node aitbc2 --host ${FOLLOWER2_IP} --port ${PORT} --debug > /dev/null 2>&1; then
|
||||
log_success "P2P connectivity to aitbc2 is working"
|
||||
else
|
||||
log_warning "P2P connectivity to aitbc2 test failed (may not be critical)"
|
||||
fi
|
||||
|
||||
# Check peers
|
||||
@@ -146,23 +162,38 @@ test_blockchain_status() {
|
||||
GENESIS_HEIGHT=$(NODE_URL="http://${GENESIS_IP}:${PORT}" ${CLI_PATH} blockchain height --output json 2>/dev/null | grep -o '"height":[0-9]*' | grep -o '[0-9]*' || echo "0")
|
||||
log_info "Genesis node block height: ${GENESIS_HEIGHT}"
|
||||
|
||||
# Get follower node status
|
||||
log_debug "Getting follower node blockchain info"
|
||||
# Get follower node (aitbc1) status
|
||||
log_debug "Getting follower node (aitbc1) blockchain info"
|
||||
FOLLOWER_HEIGHT=$(NODE_URL="http://${FOLLOWER_IP}:${PORT}" ${CLI_PATH} blockchain height --output json 2>/dev/null | grep -o '"height":[0-9]*' | grep -o '[0-9]*' || echo "0")
|
||||
log_info "Follower node block height: ${FOLLOWER_HEIGHT}"
|
||||
log_info "Follower node (aitbc1) block height: ${FOLLOWER_HEIGHT}"
|
||||
|
||||
# Get follower node (aitbc2/gitea-runner) status
|
||||
log_debug "Getting follower node (aitbc2/gitea-runner) blockchain info"
|
||||
FOLLOWER2_HEIGHT=$(NODE_URL="http://${FOLLOWER2_IP}:${PORT}" ${CLI_PATH} blockchain height --output json 2>/dev/null | grep -o '"height":[0-9]*' | grep -o '[0-9]*' || echo "0")
|
||||
log_info "Follower node (aitbc2/gitea-runner) block height: ${FOLLOWER2_HEIGHT}"
|
||||
|
||||
# Compare heights
|
||||
HEIGHT_DIFF=$((GENESIS_HEIGHT - FOLLOWER_HEIGHT))
|
||||
HEIGHT_DIFF=${HEIGHT_DIFF#-} # Absolute value
|
||||
HEIGHT_DIFF1=$((GENESIS_HEIGHT - FOLLOWER_HEIGHT))
|
||||
HEIGHT_DIFF1=${HEIGHT_DIFF1#-} # Absolute value
|
||||
|
||||
if [ ${HEIGHT_DIFF} -le 2 ]; then
|
||||
log_success "Block synchronization is good (diff: ${HEIGHT_DIFF} blocks)"
|
||||
HEIGHT_DIFF2=$((GENESIS_HEIGHT - FOLLOWER2_HEIGHT))
|
||||
HEIGHT_DIFF2=${HEIGHT_DIFF2#-} # Absolute value
|
||||
|
||||
HEIGHT_DIFF3=$((FOLLOWER_HEIGHT - FOLLOWER2_HEIGHT))
|
||||
HEIGHT_DIFF3=${HEIGHT_DIFF3#-} # Absolute value
|
||||
|
||||
# Use the maximum difference
|
||||
MAX_DIFF=$((HEIGHT_DIFF1 > HEIGHT_DIFF2 ? HEIGHT_DIFF1 : HEIGHT_DIFF2))
|
||||
MAX_DIFF=$((MAX_DIFF > HEIGHT_DIFF3 ? MAX_DIFF : HEIGHT_DIFF3))
|
||||
|
||||
if [ ${MAX_DIFF} -le 2 ]; then
|
||||
log_success "Block synchronization is good (max diff: ${MAX_DIFF} blocks)"
|
||||
return 0
|
||||
elif [ ${HEIGHT_DIFF} -le 10 ]; then
|
||||
log_warning "Block synchronization lag (diff: ${HEIGHT_DIFF} blocks)"
|
||||
elif [ ${MAX_DIFF} -le 10 ]; then
|
||||
log_warning "Block synchronization lag (max diff: ${MAX_DIFF} blocks)"
|
||||
return 1
|
||||
else
|
||||
log_error "Block synchronization severely lagged (diff: ${HEIGHT_DIFF} blocks)"
|
||||
log_error "Block synchronization severely lagged (max diff: ${MAX_DIFF} blocks)"
|
||||
return 1
|
||||
fi
|
||||
}
|
||||
@@ -259,23 +290,37 @@ test_sync() {
|
||||
log_warning "Genesis node has uncommitted changes"
|
||||
fi
|
||||
|
||||
# Check git status on follower
|
||||
log_debug "Checking git status on follower node"
|
||||
# Check git status on follower (aitbc1)
|
||||
log_debug "Checking git status on follower node (aitbc1)"
|
||||
FOLLOWER_STATUS=$(ssh aitbc1 'cd /opt/aitbc && git status --porcelain 2>/dev/null' || echo "error")
|
||||
|
||||
if [ "${FOLLOWER_STATUS}" = "error" ]; then
|
||||
log_error "Git status check failed on follower node"
|
||||
log_error "Git status check failed on follower node (aitbc1)"
|
||||
return 1
|
||||
elif [ -z "${FOLLOWER_STATUS}" ]; then
|
||||
log_success "Follower node git status is clean"
|
||||
log_success "Follower node (aitbc1) git status is clean"
|
||||
else
|
||||
log_warning "Follower node has uncommitted changes"
|
||||
log_warning "Follower node (aitbc1) has uncommitted changes"
|
||||
fi
|
||||
|
||||
# Check git status on follower (aitbc2/gitea-runner)
|
||||
log_debug "Checking git status on follower node (aitbc2/gitea-runner)"
|
||||
FOLLOWER2_STATUS=$(ssh gitea-runner 'cd /opt/aitbc && git status --porcelain 2>/dev/null' || echo "error")
|
||||
|
||||
if [ "${FOLLOWER2_STATUS}" = "error" ]; then
|
||||
log_error "Git status check failed on follower node (aitbc2/gitea-runner)"
|
||||
return 1
|
||||
elif [ -z "${FOLLOWER2_STATUS}" ]; then
|
||||
log_success "Follower node (aitbc2/gitea-runner) git status is clean"
|
||||
else
|
||||
log_warning "Follower node (aitbc2/gitea-runner) has uncommitted changes"
|
||||
fi
|
||||
|
||||
# Test git pull
|
||||
log_debug "Testing git pull from Gitea"
|
||||
git pull origin main --verbose >> "${LOG_FILE}" 2>&1
|
||||
ssh aitbc1 'cd /opt/aitbc && git pull origin main --verbose' >> "${LOG_FILE}" 2>&1
|
||||
ssh gitea-runner 'cd /opt/aitbc && git pull origin main --verbose' >> "${LOG_FILE}" 2>&1
|
||||
|
||||
log_success "Git synchronization test completed"
|
||||
return 0
|
||||
@@ -347,7 +392,7 @@ run_monitor() {
|
||||
# Main execution
|
||||
main() {
|
||||
log_info "Blockchain Communication Test Script"
|
||||
log_info "Genesis IP: ${GENESIS_IP}, Follower IP: ${FOLLOWER_IP}, Port: ${PORT}"
|
||||
log_info "Genesis IP: ${GENESIS_IP}, Follower IP: ${FOLLOWER_IP}, Follower2 IP: ${FOLLOWER2_IP}, Port: ${PORT}"
|
||||
|
||||
# Create log directory if it doesn't exist
|
||||
mkdir -p "${LOG_DIR}"
|
||||
|
||||
288
scripts/multi-node/blockchain-health-check.sh
Executable file
288
scripts/multi-node/blockchain-health-check.sh
Executable file
@@ -0,0 +1,288 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Multi-Node Blockchain Health Check Script
|
||||
# Checks health of all 3 blockchain nodes (aitbc, aitbc1, aitbc2)
|
||||
# Provides automatic remediation for failed services
|
||||
#
|
||||
|
||||
set -e
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
|
||||
LOG_DIR="/var/log/aitbc"
|
||||
LOG_FILE="${LOG_DIR}/multi-node-health.log"
|
||||
|
||||
# Node Configuration
|
||||
NODES=(
|
||||
"aitbc:10.1.223.93"
|
||||
"aitbc1:10.1.223.40"
|
||||
"aitbc2:10.1.223.98"
|
||||
)
|
||||
|
||||
RPC_PORT=8006
|
||||
REDIS_HOST="10.1.223.93"
|
||||
REDIS_PORT=6379
|
||||
|
||||
# Colors for output
|
||||
RED='\033[0;31m'
|
||||
GREEN='\033[0;32m'
|
||||
YELLOW='\033[1;33m'
|
||||
NC='\033[0m'
|
||||
|
||||
# Logging functions

# Print a timestamped, level-tagged line to stdout and append it to LOG_FILE.
# Usage: log LEVEL MESSAGE...   or   log MESSAGE   (level defaults to INFO)
log() {
    local level="INFO"
    # Only treat $1 as the level when a message follows it; a lone argument
    # is the message itself, otherwise it would be printed as "[message] ".
    if [ "$#" -ge 2 ]; then
        level="$1"
        shift
    fi
    local message="$*"
    local timestamp
    timestamp=$(date '+%Y-%m-%d %H:%M:%S')
    echo "[${timestamp}] [${level}] ${message}" | tee -a "${LOG_FILE}"
}

# Log at SUCCESS level, then echo the message again in green.
log_success() {
    log "SUCCESS" "$*"
    echo -e "${GREEN}$*${NC}"
}

# Log at ERROR level, then echo the message again in red.
log_error() {
    log "ERROR" "$*"
    echo -e "${RED}$*${NC}"
}

# Log at WARNING level, then echo the message again in yellow.
log_warning() {
    log "WARNING" "$*"
    echo -e "${YELLOW}$*${NC}"
}
|
||||
|
||||
# SSH execution helper
# Run a command on a node over SSH with stderr merged into stdout.
# Arguments: $1 - SSH host or alias, $2 - remote command string
# Returns:   0 on success, 1 on any ssh or remote-command failure
ssh_exec() {
    local node="$1"
    local command="$2"
    # BatchMode makes ssh fail immediately instead of waiting for a password
    # or passphrase prompt that nobody can answer in an unattended job.
    # NOTE(review): StrictHostKeyChecking=no skips host key verification;
    # consider provisioning known_hosts on the runner instead.
    ssh -o BatchMode=yes -o ConnectTimeout=10 -o StrictHostKeyChecking=no \
        "$node" "$command" 2>&1 || return 1
}
|
||||
|
||||
# Check RPC endpoint health
# Probe http://<node_ip>:<RPC_PORT>/health with a 5 second limit.
# Arguments: $1 - node name (used in messages), $2 - node IP
# Returns:   0 if the request succeeds, 1 otherwise
check_rpc_health() {
    local node_name="$1"
    local node_ip="$2"

    # Pass the level explicitly: log() takes the level as its first argument.
    log "INFO" "Checking RPC health for ${node_name} (${node_ip}:${RPC_PORT})"

    if curl -f -s --max-time 5 "http://${node_ip}:${RPC_PORT}/health" > /dev/null 2>&1; then
        log_success "RPC endpoint healthy on ${node_name}"
        return 0
    else
        log_error "RPC endpoint unhealthy on ${node_name}"
        return 1
    fi
}
|
||||
|
||||
# Check systemd service status
# Ask a node whether a systemd unit is active.
# Arguments: $1 - SSH host or alias, $2 - unit name
# Returns:   0 if the reported state is exactly "active", 1 otherwise
check_service_status() {
    local node="$1"
    local service="$2"
    local status

    log "INFO" "Checking ${service} status on ${node}"

    # `systemctl is-active` prints the state and exits non-zero for anything
    # but "active", so keep whatever it printed and only fall back to
    # "inactive" when nothing came back (appending the fallback would yield
    # a two-line "inactive\ninactive" state in the error message).
    status=$(ssh_exec "$node" "systemctl is-active ${service}") || true
    status="${status:-inactive}"

    if [ "$status" = "active" ]; then
        log_success "${service} is active on ${node}"
        return 0
    else
        log_error "${service} is ${status} on ${node}"
        return 1
    fi
}
|
||||
|
||||
# Check resource usage
# Collect memory, CPU and /var/lib/aitbc disk usage from a node and compare
# disk and memory against a 90% threshold.
# Arguments: $1 - SSH host or alias
# Returns:   1 if disk or memory usage is above 90%, 0 otherwise
check_resource_usage() {
    local node="$1"
    local memory cpu disk
    local memory_int disk_int

    log "INFO" "Checking resource usage on ${node}"

    memory=$(ssh_exec "$node" "free | grep Mem | awk '{printf \"%.1f\", (\$3/\$2)*100}'") || memory="0"
    cpu=$(ssh_exec "$node" "top -bn1 | grep 'Cpu(s)' | awk '{print \$2}' | cut -d'%' -f1") || cpu="0"
    disk=$(ssh_exec "$node" "df /var/lib/aitbc | tail -1 | awk '{print \$5}' | cut -d'%' -f1") || disk="0"

    log "INFO" "Resource usage on ${node}: CPU ${cpu}%, Memory ${memory}%, Disk ${disk}%"

    # ssh_exec merges stderr into stdout, so a value may be error text rather
    # than a number; reduce to an integer part and fall back to 0 so the
    # numeric tests below never see a non-integer operand.
    memory_int="${memory%.*}"
    disk_int="${disk%.*}"
    if ! [[ "$disk_int" =~ ^[0-9]+$ ]]; then
        log_warning "Could not parse disk usage on ${node}"
        disk_int=0
    fi
    if ! [[ "$memory_int" =~ ^[0-9]+$ ]]; then
        log_warning "Could not parse memory usage on ${node}"
        memory_int=0
    fi

    # Check thresholds
    if [ "$disk_int" -gt 90 ]; then
        log_warning "Disk usage critical on ${node}: ${disk}%"
        return 1
    fi

    if [ "$memory_int" -gt 90 ]; then
        log_warning "Memory usage critical on ${node}: ${memory}%"
        return 1
    fi

    return 0
}
|
||||
|
||||
# Check Redis connectivity
# Send PING to REDIS_HOST:REDIS_PORT and require a PONG reply.
# Returns: 0 if the server answered PONG, 1 otherwise
check_redis_connectivity() {
    local reply

    log "INFO" "Checking Redis connectivity (${REDIS_HOST}:${REDIS_PORT})"

    # Compare the reply instead of trusting the exit status alone.
    # NOTE(review): some redis-cli builds are reported to exit 0 on error
    # replies such as NOAUTH — confirm against the installed version.
    reply=$(redis-cli -h "${REDIS_HOST}" -p "${REDIS_PORT}" ping 2>/dev/null) || reply=""

    if [ "$reply" = "PONG" ]; then
        log_success "Redis connectivity OK"
        return 0
    else
        log_error "Redis connectivity failed"
        return 1
    fi
}
|
||||
|
||||
# Remediation functions

# Restart a systemd unit on a node, wait, then confirm it is active.
# Arguments: $1 - SSH host or alias, $2 - unit name, $3 - seconds to wait
# Returns:   0 if the unit reports "active" after the restart, 1 otherwise
_restart_service() {
    local node="$1"
    local service="$2"
    local settle_seconds="$3"

    log "INFO" "Attempting to restart ${service} on ${node}"

    ssh_exec "$node" "systemctl restart ${service}" 2>&1 | tee -a "${LOG_FILE}"
    sleep "$settle_seconds"

    # Match the whole line: an unanchored "active" pattern also matches
    # "inactive", which would report a failed restart as a success.
    if ssh_exec "$node" "systemctl is-active ${service}" 2>&1 | grep -qx "active"; then
        log_success "Successfully restarted ${service} on ${node}"
        return 0
    else
        log_error "Failed to restart ${service} on ${node}"
        return 1
    fi
}

# Restart aitbc-blockchain-rpc on node $1 (5 second settle time).
restart_rpc_service() {
    _restart_service "$1" "aitbc-blockchain-rpc" 5
}

# Restart aitbc-blockchain-p2p on node $1 (5 second settle time).
restart_p2p_service() {
    _restart_service "$1" "aitbc-blockchain-p2p" 5
}

# Restart aitbc-blockchain-node on node $1 (10 second settle time).
restart_node_service() {
    _restart_service "$1" "aitbc-blockchain-node" 10
}
|
||||
|
||||
# Main health check for a node
# Run the RPC, node-service, P2P-service and resource checks for one node,
# attempting a restart when one of the first three fails.
# Arguments: $1 - node name (also used as the SSH host), $2 - node IP
# Returns:   the number of checks still failing after remediation
check_node_health() {
    local node_name="$1"
    local node_ip="$2"
    local node="${node_name}"

    # Counter updates use $(( )) assignments: (( failures++ )) returns a
    # non-zero status when the counter was 0, which aborts the script under
    # `set -e` whenever this function is not called from a condition.
    local failures=0

    # Check RPC health
    if ! check_rpc_health "$node_name" "$node_ip"; then
        failures=$((failures + 1))
        log "INFO" "Attempting remediation for RPC on ${node_name}"
        if restart_rpc_service "$node"; then
            # Retry RPC check
            if ! check_rpc_health "$node_name" "$node_ip"; then
                log_error "RPC remediation failed on ${node_name}"
            else
                log_success "RPC remediation successful on ${node_name}"
                failures=$((failures - 1))
            fi
        fi
    fi

    # Check blockchain node service
    if ! check_service_status "$node" "aitbc-blockchain-node"; then
        failures=$((failures + 1))
        log "INFO" "Attempting remediation for blockchain node on ${node_name}"
        if restart_node_service "$node"; then
            # Retry service check
            if check_service_status "$node" "aitbc-blockchain-node"; then
                log_success "Blockchain node remediation successful on ${node_name}"
                failures=$((failures - 1))
            fi
        fi
    fi

    # Check P2P service
    if ! check_service_status "$node" "aitbc-blockchain-p2p"; then
        failures=$((failures + 1))
        log "INFO" "Attempting remediation for P2P on ${node_name}"
        if restart_p2p_service "$node"; then
            # Retry service check
            if check_service_status "$node" "aitbc-blockchain-p2p"; then
                log_success "P2P remediation successful on ${node_name}"
                failures=$((failures - 1))
            fi
        fi
    fi

    # Check resource usage (no remediation is attempted for this one)
    if ! check_resource_usage "$node"; then
        failures=$((failures + 1))
        log_warning "Resource usage issues on ${node_name}"
    fi

    return "$failures"
}
|
||||
|
||||
# Main execution
# Check Redis once, then every entry in NODES, and exit 0 only when no
# failures were counted (exit 1 otherwise).
main() {
    # Create the log directory before the first log call: log() appends to
    # LOG_FILE through tee, which fails (and trips `set -e`) while the
    # directory does not exist.
    mkdir -p "${LOG_DIR}"

    log "INFO" "=== Multi-Node Blockchain Health Check Started ==="

    local total_failures=0
    local node_config node_name node_ip failures

    # Check Redis connectivity (shared resource)
    if ! check_redis_connectivity; then
        log_error "Redis connectivity failed - this affects all nodes"
        # Plain assignment: (( total_failures++ )) returns non-zero when the
        # counter was 0 and would end the script here under `set -e`.
        total_failures=$((total_failures + 1))
    fi

    # Check each node
    for node_config in "${NODES[@]}"; do
        IFS=':' read -r node_name node_ip <<< "$node_config"

        log "INFO" "=== Checking node: ${node_name} (${node_ip}) ==="

        if check_node_health "$node_name" "$node_ip"; then
            log_success "Node ${node_name} is healthy"
        else
            # The function's return value is its count of failing checks.
            failures=$?
            log_error "Node ${node_name} has ${failures} health issues"
            total_failures=$((total_failures + failures))
        fi

        echo "" | tee -a "${LOG_FILE}"
    done

    log "INFO" "=== Multi-Node Blockchain Health Check Completed ==="
    log "INFO" "Total failures: ${total_failures}"

    if [ "${total_failures}" -eq 0 ]; then
        log_success "All nodes are healthy"
        exit 0
    else
        log_error "Health check completed with ${total_failures} failures"
        exit 1
    fi
}
|
||||
|
||||
# Run main function
|
||||
main "$@"
|
||||
287
scripts/multi-node/p2p-verification.sh
Executable file
287
scripts/multi-node/p2p-verification.sh
Executable file
@@ -0,0 +1,287 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# P2P Network Verification Script
|
||||
# Verifies P2P network connectivity across all 3 blockchain nodes
|
||||
#
|
||||
|
||||
set -e
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
|
||||
LOG_DIR="/var/log/aitbc"
|
||||
LOG_FILE="${LOG_DIR}/p2p-verification.log"
|
||||
|
||||
# Node Configuration
|
||||
NODES=(
|
||||
"aitbc:10.1.223.93"
|
||||
"aitbc1:10.1.223.40"
|
||||
"aitbc2:10.1.223.98"
|
||||
)
|
||||
|
||||
P2P_PORT=7070
|
||||
REDIS_HOST="10.1.223.93"
|
||||
REDIS_PORT=6379
|
||||
|
||||
# Colors for output
|
||||
RED='\033[0;31m'
|
||||
GREEN='\033[0;32m'
|
||||
YELLOW='\033[1;33m'
|
||||
NC='\033[0m'
|
||||
|
||||
# Logging functions

# Print a timestamped, level-tagged line to stdout and append it to LOG_FILE.
# Usage: log LEVEL MESSAGE...   or   log MESSAGE   (level defaults to INFO)
log() {
    local level="INFO"
    # Only treat $1 as the level when a message follows it; a lone argument
    # is the message itself, otherwise it would be printed as "[message] ".
    if [ "$#" -ge 2 ]; then
        level="$1"
        shift
    fi
    local message="$*"
    local timestamp
    timestamp=$(date '+%Y-%m-%d %H:%M:%S')
    echo "[${timestamp}] [${level}] ${message}" | tee -a "${LOG_FILE}"
}

# Log at SUCCESS level, then echo the message again in green.
log_success() {
    log "SUCCESS" "$*"
    echo -e "${GREEN}$*${NC}"
}

# Log at ERROR level, then echo the message again in red.
log_error() {
    log "ERROR" "$*"
    echo -e "${RED}$*${NC}"
}

# Log at WARNING level, then echo the message again in yellow.
log_warning() {
    log "WARNING" "$*"
    echo -e "${YELLOW}$*${NC}"
}
|
||||
|
||||
# SSH execution helper
# Run a command on a node over SSH with stderr merged into stdout.
# Arguments: $1 - SSH host or alias, $2 - remote command string
# Returns:   0 on success, 1 on any ssh or remote-command failure
ssh_exec() {
    local node="$1"
    local command="$2"
    # BatchMode makes ssh fail immediately instead of waiting for a password
    # or passphrase prompt that nobody can answer in an unattended job.
    # NOTE(review): StrictHostKeyChecking=no skips host key verification;
    # consider provisioning known_hosts on the runner instead.
    ssh -o BatchMode=yes -o ConnectTimeout=10 -o StrictHostKeyChecking=no \
        "$node" "$command" 2>&1 || return 1
}
|
||||
|
||||
# Check P2P peer list on a node
# Read the p2p_peers entry from /etc/aitbc/node.env on the node and confirm
# that the aitbc-blockchain-p2p unit is active.
# Arguments: $1 - SSH host or alias, $2 - node name (used in messages)
# Returns:   0 if peers are configured and the service is active, 1 otherwise
check_p2p_peers() {
    local node="$1"
    local node_name="$2"
    local peers

    log "INFO" "Checking P2P peers on ${node_name}"

    # Read node.env to get expected peers. ssh_exec merges stderr into
    # stdout, so on failure the captured text is an error message, not a
    # peer list — report that instead of treating it as configuration.
    if ! peers=$(ssh_exec "$node" "grep '^p2p_peers=' /etc/aitbc/node.env | cut -d'=' -f2"); then
        log_error "Could not read p2p_peers from ${node_name}"
        return 1
    fi

    if [ -z "$peers" ]; then
        log_error "No p2p_peers configured on ${node_name}"
        return 1
    fi

    log "INFO" "Expected peers on ${node_name}: ${peers}"

    # Check P2P service status. Match the whole line: an unanchored "active"
    # pattern also matches "inactive".
    if ! ssh_exec "$node" "systemctl is-active aitbc-blockchain-p2p" | grep -qx "active"; then
        log_error "P2P service not active on ${node_name}"
        return 1
    fi

    log_success "P2P peers configured on ${node_name}"
    return 0
}
|
||||
|
||||
# Check P2P connectivity between nodes
# From the source node, try to open a TCP connection to the target's P2P_PORT
# with a 5 second timeout.
# Arguments: $1 - source SSH host, $2 - source name (messages),
#            $3 - target address (anything up to a ':' is stripped),
#            $4 - target name (messages)
# Returns:   0 if the connection opens, 1 otherwise
check_p2p_connectivity() {
    local source_node="$1"
    local source_name="$2"
    local target_node="$3"
    local target_name="$4"

    # Pass the level explicitly: log() takes the level as its first argument.
    log "INFO" "Checking P2P connectivity from ${source_name} to ${target_name}"

    # Try to connect to target P2P port
    if ssh_exec "$source_node" "timeout 5 bash -c '</dev/tcp/${target_node#*:}/${P2P_PORT}'" 2>&1; then
        log_success "P2P connectivity OK from ${source_name} to ${target_name}"
        return 0
    else
        log_error "P2P connectivity FAILED from ${source_name} to ${target_name}"
        return 1
    fi
}
|
||||
|
||||
# Check Redis gossip backend connectivity
# Send PING to REDIS_HOST:REDIS_PORT and require a PONG reply.
# Returns: 0 if the server answered PONG, 1 otherwise
check_gossip_backend() {
    local reply

    log "INFO" "Checking Redis gossip backend connectivity (${REDIS_HOST}:${REDIS_PORT})"

    # Compare the reply instead of trusting the exit status alone.
    # NOTE(review): some redis-cli builds are reported to exit 0 on error
    # replies such as NOAUTH — confirm against the installed version.
    reply=$(redis-cli -h "${REDIS_HOST}" -p "${REDIS_PORT}" ping 2>/dev/null) || reply=""

    if [ "$reply" = "PONG" ]; then
        log_success "Redis gossip backend connectivity OK"
        return 0
    else
        log_error "Redis gossip backend connectivity failed"
        return 1
    fi
}
|
||||
|
||||
# Check for P2P handshake errors in logs
# Scan the last hour of the aitbc-blockchain-p2p journal on a node for lines
# containing "handshake", "error" or "failed" (case-insensitive) and show up
# to the last five matches.
# Arguments: $1 - SSH host or alias, $2 - node name (used in messages)
# Returns:   1 if any matching text came back, 0 otherwise
check_p2p_logs() {
    local node="$1"
    local node_name="$2"
    local errors

    log "INFO" "Checking P2P logs for errors on ${node_name}"

    # Check for handshake errors
    errors=$(ssh_exec "$node" "journalctl -u aitbc-blockchain-p2p --since '1 hour ago' | grep -i 'handshake\|error\|failed' | tail -5" 2>&1 || echo "")

    if [ -n "$errors" ]; then
        log_warning "P2P errors found on ${node_name}:"
        echo "$errors" | tee -a "${LOG_FILE}"
        return 1
    else
        log_success "No P2P errors found on ${node_name}"
        return 0
    fi
}
|
||||
|
||||
# Remediation: Restart P2P service
# Restart aitbc-blockchain-p2p on a node, wait 5 seconds, then confirm the
# unit is active.
# Arguments: $1 - SSH host or alias, $2 - node name (used in messages)
# Returns:   0 if the unit reports "active" after the restart, 1 otherwise
remediate_p2p_service() {
    local node="$1"
    local node_name="$2"

    log "INFO" "Attempting P2P remediation on ${node_name}"

    ssh_exec "$node" "systemctl restart aitbc-blockchain-p2p" 2>&1 | tee -a "${LOG_FILE}"
    sleep 5

    # Match the whole line: an unanchored "active" pattern also matches
    # "inactive", which would report a failed restart as a success.
    if ssh_exec "$node" "systemctl is-active aitbc-blockchain-p2p" | grep -qx "active"; then
        log_success "P2P service remediation successful on ${node_name}"
        return 0
    else
        log_error "P2P service remediation failed on ${node_name}"
        return 1
    fi
}
|
||||
|
||||
# Update p2p_peers configuration if needed
# Write the expected p2p_peers value for the named node into
# /etc/aitbc/node.env on that node and restart aitbc-blockchain-p2p.
# Arguments: $1 - SSH host or alias, $2 - node name (aitbc, aitbc1 or aitbc2)
# Returns:   0 on success; 1 for an unknown node name or when the remote
#            update or restart fails
update_p2p_peers() {
    local node="$1"
    local node_name="$2"
    local env_file="/etc/aitbc/node.env"
    local peers

    log "INFO" "Updating p2p_peers configuration on ${node_name}"

    # Determine correct peers based on node name
    case "$node_name" in
        "aitbc")
            peers="aitbc1:7070,aitbc2:7070"
            ;;
        "aitbc1")
            peers="aitbc:7070,aitbc2:7070"
            ;;
        "aitbc2")
            peers="aitbc:7070,aitbc1:7070"
            ;;
        *)
            log_error "Unknown node name: ${node_name}"
            return 1
            ;;
    esac

    # Update node.env. Replace the existing entry, or append one when the
    # key is absent: a bare `sed s/...` changes nothing in that case and the
    # remediation would silently be a no-op.
    ssh_exec "$node" "if grep -q '^p2p_peers=' ${env_file}; then sed -i 's/^p2p_peers=.*/p2p_peers=${peers}/' ${env_file}; else echo 'p2p_peers=${peers}' >> ${env_file}; fi" 2>&1 | tee -a "${LOG_FILE}"
    # The pipeline's own status is tee's; read the ssh_exec status directly.
    if [ "${PIPESTATUS[0]}" -ne 0 ]; then
        log_error "Failed to update p2p_peers on ${node_name}"
        return 1
    fi

    # Restart P2P service to apply changes
    ssh_exec "$node" "systemctl restart aitbc-blockchain-p2p" 2>&1 | tee -a "${LOG_FILE}"
    if [ "${PIPESTATUS[0]}" -ne 0 ]; then
        log_error "Failed to restart aitbc-blockchain-p2p on ${node_name}"
        return 1
    fi
    sleep 5

    log_success "Updated p2p_peers on ${node_name} to: ${peers}"
    return 0
}
|
||||
|
||||
# Main verification for a node
# Run the peer-configuration and log checks for one node, attempting the
# matching remediation after each failed check.
# Arguments: $1 - node name (also used as the SSH host), $2 - node IP (unused)
# Returns:   the number of checks that failed (remediation does not reduce it)
verify_node_p2p() {
    local node_name="$1"
    local node_ip="$2"
    local node="${node_name}"

    # Counter updates use $(( )) assignments: (( failures++ )) returns a
    # non-zero status when the counter was 0, which aborts the script under
    # `set -e` whenever this function is not called from a condition.
    local failures=0

    # Check P2P peers configuration
    if ! check_p2p_peers "$node" "$node_name"; then
        failures=$((failures + 1))
        log "INFO" "Attempting remediation for P2P peers on ${node_name}"
        update_p2p_peers "$node" "$node_name" || true
    fi

    # Check P2P logs for errors
    if ! check_p2p_logs "$node" "$node_name"; then
        failures=$((failures + 1))
        log "INFO" "Attempting remediation for P2P errors on ${node_name}"
        remediate_p2p_service "$node" "$node_name" || true
    fi

    return "$failures"
}
|
||||
|
||||
# Main execution
# Check the Redis gossip backend, verify every entry in NODES, then test
# P2P connectivity for each ordered pair of distinct nodes. Exits 0 only
# when no failures were counted (exit 1 otherwise).
main() {
    # Create the log directory before the first log call: log() appends to
    # LOG_FILE through tee, which fails (and trips `set -e`) while the
    # directory does not exist.
    mkdir -p "${LOG_DIR}"

    log "INFO" "=== P2P Network Verification Started ==="

    local total_failures=0
    local node_config node_name node_ip failures
    local source_config source_name source_ip
    local target_config target_name target_ip

    # Check Redis gossip backend
    if ! check_gossip_backend; then
        log_error "Gossip backend connectivity failed"
        # Plain assignment: (( total_failures++ )) returns non-zero when the
        # counter was 0 and would end the script here under `set -e`.
        total_failures=$((total_failures + 1))
    fi

    # Check each node's P2P configuration
    for node_config in "${NODES[@]}"; do
        IFS=':' read -r node_name node_ip <<< "$node_config"

        log "INFO" "=== Verifying P2P on node: ${node_name} (${node_ip}) ==="

        if verify_node_p2p "$node_name" "$node_ip"; then
            log_success "P2P verification passed for ${node_name}"
        else
            # The function's return value is its count of failed checks.
            failures=$?
            log_error "P2P verification failed for ${node_name} with ${failures} issues"
            total_failures=$((total_failures + failures))
        fi

        echo "" | tee -a "${LOG_FILE}"
    done

    # Check P2P connectivity between all node pairs
    log "INFO" "=== Checking P2P connectivity between node pairs ==="

    for source_config in "${NODES[@]}"; do
        IFS=':' read -r source_name source_ip <<< "$source_config"

        for target_config in "${NODES[@]}"; do
            IFS=':' read -r target_name target_ip <<< "$target_config"

            # Skip self-connectivity check
            if [ "$source_name" = "$target_name" ]; then
                continue
            fi

            # The source is addressed by name (SSH host), the target by IP.
            if ! check_p2p_connectivity "$source_name" "$source_name" "$target_ip" "$target_name"; then
                total_failures=$((total_failures + 1))
                log "INFO" "Attempting remediation for P2P connectivity"
                remediate_p2p_service "$source_name" "$source_name" || true
            fi
        done
    done

    log "INFO" "=== P2P Network Verification Completed ==="
    log "INFO" "Total failures: ${total_failures}"

    if [ "${total_failures}" -eq 0 ]; then
        log_success "P2P network verification passed"
        exit 0
    else
        log_error "P2P network verification failed with ${total_failures} failures"
        exit 1
    fi
}
|
||||
|
||||
# Run main function
|
||||
main "$@"
|
||||
348
scripts/multi-node/sync-verification.sh
Executable file
348
scripts/multi-node/sync-verification.sh
Executable file
@@ -0,0 +1,348 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Blockchain Synchronization Verification Script
|
||||
# Verifies blockchain synchronization across all 3 nodes
|
||||
# Provides automatic remediation by forcing sync from healthy node
|
||||
#
|
||||
|
||||
# Abort the script as soon as an unhandled command fails.
set -o errexit

# Absolute path of the directory containing this script, and of the
# repository root two levels above it.
SCRIPT_DIR="$(dirname "${BASH_SOURCE[0]}")"
SCRIPT_DIR="$(cd "${SCRIPT_DIR}" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"

# Log destination used by the logging helpers below.
LOG_DIR="/var/log/aitbc"
LOG_FILE="${LOG_DIR}/sync-verification.log"

# Nodes to verify, one "name:ip" entry per node.
NODES=("aitbc:10.1.223.93" "aitbc1:10.1.223.40" "aitbc2:10.1.223.98")

# RPC port queried on every node.
RPC_PORT=8006
# Largest block-height spread between nodes that still counts as "in sync".
SYNC_THRESHOLD=10

# ANSI colour escapes (expanded by `echo -e`) for console output.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'
|
||||
|
||||
# Logging functions
|
||||
# Print a timestamped, level-tagged line to stdout and append it to LOG_FILE.
#
# Arguments:
#   With two or more arguments: $1 is the level tag (e.g. ERROR) and the
#   remaining arguments form the message, joined by single spaces.
#   With a single argument: it is the message and the level defaults to INFO.
#   (Most call sites in this script pass only a message; previously that
#   message was consumed as the level and the logged text was empty.)
# Globals:
#   LOG_FILE (read) - file the line is appended to via tee.
# Returns:
#   Exit status of tee, i.e. non-zero if LOG_FILE cannot be written.
log() {
    local level="INFO"
    if [ "$#" -ge 2 ]; then
        level="$1"
        shift
    fi

    local message="$*"

    # Declared separately so a failing `date` is not masked by `local`.
    local timestamp
    timestamp="$(date '+%Y-%m-%d %H:%M:%S')"

    echo "[${timestamp}] [${level}] ${message}" | tee -a "${LOG_FILE}"
}
|
||||
|
||||
# Record a SUCCESS entry through log, then echo the same text in green.
# Arguments: $@ - message words.
# Globals:   GREEN, NC (read).
log_success() {
    log "SUCCESS" "$@"

    # Join the words with single spaces regardless of the caller's IFS,
    # matching how echo separates its arguments.
    local IFS=' '
    local painted="${GREEN}$*${NC}"
    echo -e "${painted}"
}
|
||||
|
||||
# Record an ERROR entry through log, then echo the same text in red.
# Arguments: $@ - message words.
# Globals:   RED, NC (read).
log_error() {
    log "ERROR" "$@"

    # Join the words with single spaces regardless of the caller's IFS,
    # matching how echo separates its arguments.
    local IFS=' '
    local painted="${RED}$*${NC}"
    echo -e "${painted}"
}
|
||||
|
||||
# Record a WARNING entry through log, then echo the same text in yellow.
# Arguments: $@ - message words.
# Globals:   YELLOW, NC (read).
log_warning() {
    log "WARNING" "$@"

    # Join the words with single spaces regardless of the caller's IFS,
    # matching how echo separates its arguments.
    local IFS=' '
    local painted="${YELLOW}$*${NC}"
    echo -e "${painted}"
}
|
||||
|
||||
# SSH execution helper
|
||||
# Run a single command on a remote host over SSH.
# Arguments:
#   $1 - host (name or address) to connect to
#   $2 - command line executed on that host
# Outputs:
#   The remote command's stdout with ssh's stderr merged into it.
# Returns:
#   0 when ssh succeeds, 1 for any ssh / remote failure.
ssh_exec() {
    local host="$1"
    local remote_cmd="$2"
    # 10 s connect timeout; host key verification is switched off.
    local -a ssh_opts=(-o ConnectTimeout=10 -o StrictHostKeyChecking=no)

    if ! ssh "${ssh_opts[@]}" "$host" "$remote_cmd" 2>&1; then
        return 1
    fi
}
|
||||
|
||||
# Get block height from RPC endpoint
|
||||
# Query a node's RPC interface for its current block height.
#
# Tries /blockchain/height first and /height as a fallback.
# Arguments:
#   $1 - node IP address
# Globals:
#   RPC_PORT (read)
# Outputs:
#   A single non-negative integer on stdout; "0" when no height could be
#   obtained (callers treat "0" as "unknown").
# Returns:
#   Always 0.
get_block_height() {
    local node_ip="$1"
    local endpoint body height=""

    for endpoint in "blockchain/height" "height"; do
        # --fail: an HTTP error page must not be parsed as if it were data.
        body="$(curl -sf --max-time 5 "http://${node_ip}:${RPC_PORT}/${endpoint}" 2>/dev/null)" || body=""

        # NOTE(review): the RPC response format is not visible in this script.
        # Prefer a JSON "height" field when present, otherwise fall back to the
        # first integer in the body. Either way exactly one number is emitted,
        # so callers' integer comparisons cannot break on multi-number bodies.
        height="$(printf '%s\n' "$body" \
            | grep -oE '"height"[[:space:]]*:[[:space:]]*"?[0-9]+' \
            | grep -oE '[0-9]+$' \
            | head -n 1)" || height=""
        if [ -z "$height" ]; then
            height="$(printf '%s\n' "$body" | grep -oE '[0-9]+' | head -n 1)" || height=""
        fi

        if [ -n "$height" ] && [ "$height" != "0" ]; then
            break
        fi
    done

    echo "${height:-0}"
}
|
||||
|
||||
# Get chain ID from RPC endpoint
|
||||
# Query a node's RPC interface for its chain ID.
#
# Tries /blockchain/chain-id first and /chain-id as a fallback.
# Arguments:
#   $1 - node IP address
# Globals:
#   RPC_PORT (read)
# Outputs:
#   The raw response body on stdout, or an empty line when neither endpoint
#   answered successfully.
# Returns:
#   Always 0.
get_chain_id() {
    local node_ip="$1"
    local endpoint chain_id=""

    for endpoint in "blockchain/chain-id" "chain-id"; do
        # --fail makes curl return non-zero on HTTP errors instead of handing
        # back the error page; otherwise a 404 body would be taken for the
        # chain ID and the fallback endpoint would never be tried.
        chain_id="$(curl -sf --max-time 5 "http://${node_ip}:${RPC_PORT}/${endpoint}" 2>/dev/null)" || chain_id=""
        if [ -n "$chain_id" ]; then
            break
        fi
    done

    echo "$chain_id"
}
|
||||
|
||||
# Get block hash at specific height
|
||||
# Query a node's RPC interface for the hash of the block at a given height.
# Arguments:
#   $1 - node IP address
#   $2 - block height
# Globals:
#   RPC_PORT (read)
# Outputs:
#   The raw response body on stdout, or an empty line when the request failed.
# Returns:
#   Always 0.
get_block_hash() {
    local node_ip="$1"
    local height="$2"
    local hash

    # --fail: without it an HTTP error page would be returned as the "hash",
    # and identical error pages from every node would compare as consistent.
    hash="$(curl -sf --max-time 5 "http://${node_ip}:${RPC_PORT}/blockchain/block/${height}/hash" 2>/dev/null)" || hash=""

    echo "$hash"
}
|
||||
|
||||
# Check chain ID consistency
|
||||
# Verify that every configured node reports the same chain ID.
#
# A node whose chain ID cannot be retrieved counts as an inconsistency.
# Globals:
#   NODES (read) - "name:ip" entries
# Returns:
#   0 when all nodes answered with the same chain ID, 1 otherwise.
check_chain_id_consistency() {
    log "Checking chain ID consistency across nodes"

    # All working variables are local so the caller's loop variables of the
    # same names are not overwritten.
    local node_config node_name node_ip chain_id
    local first_chain_id=""
    local consistent=true

    for node_config in "${NODES[@]}"; do
        IFS=':' read -r node_name node_ip <<< "$node_config"

        chain_id="$(get_chain_id "$node_ip")" || chain_id=""

        if [ -z "$chain_id" ]; then
            log_error "Could not get chain ID from ${node_name}"
            consistent=false
            continue
        fi

        log "Chain ID on ${node_name}: ${chain_id}"

        # The first node that answers becomes the reference value.
        if [ -z "$first_chain_id" ]; then
            first_chain_id="$chain_id"
        elif [ "$chain_id" != "$first_chain_id" ]; then
            log_error "Chain ID mismatch on ${node_name}: ${chain_id} vs ${first_chain_id}"
            consistent=false
        fi
    done

    if [ "$consistent" = true ]; then
        log_success "Chain ID consistent across all nodes"
        return 0
    fi

    log_error "Chain ID inconsistent across nodes"
    return 1
}
|
||||
|
||||
# Check block synchronization
|
||||
# Verify that the block heights of all nodes lie within SYNC_THRESHOLD blocks
# of each other.
#
# Globals:
#   NODES (read)          - "name:ip" entries
#   SYNC_THRESHOLD (read) - maximum tolerated height difference
# Returns:
#   0 when max height - min height <= SYNC_THRESHOLD.
#   1 when the spread exceeds the threshold, when any node's height is
#   unavailable ("0", empty or non-numeric), or when no node is configured.
check_block_sync() {
    log "Checking block synchronization across nodes"

    local node_config node_name node_ip height
    # Empty means "nothing seen yet"; a numeric sentinel such as 999999 would
    # break as soon as the chain grows past it.
    local max_height="" min_height="" max_node=""

    for node_config in "${NODES[@]}"; do
        IFS=':' read -r node_name node_ip <<< "$node_config"

        height="$(get_block_height "$node_ip")" || height=""

        # Reject anything that is not a positive integer before it reaches the
        # integer comparisons below.
        case "$height" in
            ''|0|*[!0-9]*)
                log_error "Could not get block height from ${node_name}"
                return 1
                ;;
        esac

        log "Block height on ${node_name}: ${height}"

        if [ -z "$max_height" ] || [ "$height" -gt "$max_height" ]; then
            max_height=$height
            max_node="${node_name}"
        fi

        if [ -z "$min_height" ] || [ "$height" -lt "$min_height" ]; then
            min_height=$height
        fi
    done

    if [ -z "$max_height" ]; then
        log_error "No block heights collected - no nodes configured"
        return 1
    fi

    local height_diff=$((max_height - min_height))

    log "Max height: ${max_height} (${max_node}), Min height: ${min_height}, Diff: ${height_diff}"

    if [ "$height_diff" -le "$SYNC_THRESHOLD" ]; then
        log_success "Block synchronization within threshold (diff: ${height_diff})"
        return 0
    fi

    log_error "Block synchronization exceeds threshold (diff: ${height_diff})"
    return 1
}
|
||||
|
||||
# Check block hash consistency at current height
|
||||
# Verify that all nodes hold the same block at the lowest height they share.
#
# The comparison height is the minimum height among the nodes whose height
# could be read; nodes with an unavailable height ("0", empty or non-numeric)
# are ignored for that selection so one unreachable node does not drag the
# comparison down to height 0.
# Globals:
#   NODES (read) - "name:ip" entries
# Returns:
#   0 when every retrieved hash matches.
#   1 on a mismatch, when no comparison height could be determined, or when
#   several nodes are configured but fewer than two hashes were retrieved
#   (nothing was actually compared).
check_block_hash_consistency() {
    log "Checking block hash consistency"

    local node_config node_name node_ip height hash
    local target_height=""

    # Find the minimum height to compare at.
    for node_config in "${NODES[@]}"; do
        IFS=':' read -r node_name node_ip <<< "$node_config"
        height="$(get_block_height "$node_ip")" || height=""

        case "$height" in
            ''|0|*[!0-9]*)
                log_warning "Could not get block height from ${node_name}"
                continue
                ;;
        esac

        if [ -z "$target_height" ] || [ "$height" -lt "$target_height" ]; then
            target_height=$height
        fi
    done

    if [ -z "$target_height" ]; then
        log_error "Could not determine a block height to compare hashes at"
        return 1
    fi

    log "Comparing block hashes at height ${target_height}"

    local first_hash=""
    local consistent=true
    local compared=0

    for node_config in "${NODES[@]}"; do
        IFS=':' read -r node_name node_ip <<< "$node_config"

        hash="$(get_block_hash "$node_ip" "$target_height")" || hash=""

        if [ -z "$hash" ]; then
            log_warning "Could not get block hash from ${node_name} at height ${target_height}"
            continue
        fi

        log "Block hash on ${node_name} at height ${target_height}: ${hash}"
        compared=$((compared + 1))

        # The first hash retrieved is the reference for all later nodes.
        if [ -z "$first_hash" ]; then
            first_hash="$hash"
        elif [ "$hash" != "$first_hash" ]; then
            log_error "Block hash mismatch on ${node_name} at height ${target_height}"
            consistent=false
        fi
    done

    # With fewer than two hashes nothing was compared; do not report success.
    if [ "${#NODES[@]}" -gt 1 ] && [ "$compared" -lt 2 ]; then
        log_error "Only ${compared} block hash(es) retrieved at height ${target_height} - cannot verify consistency"
        return 1
    fi

    if [ "$consistent" = true ]; then
        log_success "Block hashes consistent at height ${target_height}"
        return 0
    fi

    log_error "Block hashes inconsistent"
    return 1
}
|
||||
|
||||
# Remediation: Force sync from healthy node
|
||||
# Remediation: replace the target node's chain.db with the source node's copy.
#
# Steps: stop the blockchain service on the target, stream chain.db from the
# source into a staging file on the target, install it over the live file only
# if the transfer succeeded and produced a non-empty file, then restart the
# service and check that it is active.
# Arguments:
#   $1 - target node address (SSH destination)
#   $2 - target node name (for log messages)
#   $3 - source node address (SSH destination)
#   $4 - source node name (for log messages)
# Globals:
#   LOG_FILE (appended to)
# Returns:
#   0 when the database was replaced and the service reports "active";
#   1 when the copy failed (the existing database is left in place and the
#   service is restarted) or the service is not active afterwards.
# NOTE(review): the source service keeps running while its chain.db is read;
# confirm that a copy taken from a live database is acceptable here.
force_sync_from_source() {
    local target_node="$1"
    local target_name="$2"
    local source_node="$3"
    local source_name="$4"

    local db_path="/var/lib/aitbc/data/chain.db"
    local staged_path="${db_path}.sync-tmp"
    # Same connection options as ssh_exec.
    local -a ssh_opts=(-o ConnectTimeout=10 -o StrictHostKeyChecking=no)
    local copy_ok=true

    log "Forcing sync from ${source_name} to ${target_name}"

    # Stop blockchain service on target
    log "Stopping blockchain service on ${target_name}"
    ssh_exec "$target_node" "systemctl stop aitbc-blockchain-node" 2>&1 | tee -a "${LOG_FILE}"
    sleep 5

    # Copy chain.db from source to target.
    # ssh is called directly here: ssh_exec merges stderr into stdout, which
    # would write ssh diagnostics into the database bytes. pipefail (scoped to
    # the subshell) makes a failure on either side of the pipe visible.
    log "Copying chain.db from ${source_name} to ${target_name}"
    if ! (
        set -o pipefail
        ssh "${ssh_opts[@]}" "$source_node" "cat '${db_path}'" 2>>"${LOG_FILE}" \
            | ssh "${ssh_opts[@]}" "$target_node" "cat > '${staged_path}'" 2>>"${LOG_FILE}"
    ); then
        copy_ok=false
    # Install the staged copy by writing into the existing file, as the
    # original in-place copy did, and only when the staged file is non-empty.
    elif ! ssh_exec "$target_node" "test -s '${staged_path}' && cat '${staged_path}' > '${db_path}'" >>"${LOG_FILE}" 2>&1; then
        copy_ok=false
    fi

    # Best-effort cleanup of the staging file; a failure here is not fatal.
    ssh_exec "$target_node" "rm -f '${staged_path}'" >>"${LOG_FILE}" 2>&1 || true

    if [ "$copy_ok" != true ]; then
        log_error "Failed to copy chain.db from ${source_name} to ${target_name}"
        log "Starting blockchain service on ${target_name}"
        ssh_exec "$target_node" "systemctl start aitbc-blockchain-node" 2>&1 | tee -a "${LOG_FILE}"
        return 1
    fi

    # Start blockchain service on target
    log "Starting blockchain service on ${target_name}"
    ssh_exec "$target_node" "systemctl start aitbc-blockchain-node" 2>&1 | tee -a "${LOG_FILE}"
    sleep 10

    # Verify service is running. -x requires a whole line equal to "active";
    # a substring match would also accept "inactive".
    if ssh_exec "$target_node" "systemctl is-active aitbc-blockchain-node" | grep -qx "active"; then
        log_success "Sync completed successfully on ${target_name}"
        return 0
    fi

    log_error "Failed to start blockchain service on ${target_name} after sync"
    return 1
}
|
||||
|
||||
# Main sync verification
|
||||
# Main sync verification: run all checks, attempt remediation when the block
# heights have drifted apart, and exit with the overall result.
#
# Checks, in order: chain ID consistency, block height synchronization (with
# one force-sync attempt from the highest node to the lowest node when the
# spread exceeds SYNC_THRESHOLD), block hash consistency.
# Globals:
#   LOG_DIR, NODES, SYNC_THRESHOLD (read)
# Exits:
#   0 when no check failed, 1 otherwise. Never returns to the caller.
main() {
    # Create the log directory before the first log call: log tees into
    # LOG_FILE, and a failing tee would abort the script under `set -e`.
    mkdir -p "${LOG_DIR}"

    log "=== Blockchain Synchronization Verification Started ==="

    # Counters are advanced with var=$((var + 1)); `((var++))` yields exit
    # status 1 when the old value is 0, which under `set -e` would terminate
    # the script at the very first recorded failure.
    local total_failures=0

    local node_config node_name node_ip height height_diff
    local max_height="" max_node="" max_ip=""
    local min_height="" min_node="" min_ip=""

    # Check chain ID consistency
    if ! check_chain_id_consistency; then
        log_error "Chain ID inconsistency detected - this is critical"
        total_failures=$((total_failures + 1))
    fi

    # Check block synchronization
    if ! check_block_sync; then
        log_error "Block synchronization issue detected"
        total_failures=$((total_failures + 1))

        # Determine source (highest) and target (lowest) nodes for remediation.
        # Nodes whose height is unavailable are skipped: "0" means "unknown",
        # and an unanswered probe is no reason to overwrite a node's database.
        for node_config in "${NODES[@]}"; do
            IFS=':' read -r node_name node_ip <<< "$node_config"
            height="$(get_block_height "$node_ip")" || height=""

            case "$height" in
                ''|0|*[!0-9]*)
                    log_warning "Skipping ${node_name} for remediation: block height unavailable"
                    continue
                    ;;
            esac

            if [ -z "$max_height" ] || [ "$height" -gt "$max_height" ]; then
                max_height=$height
                max_node="${node_name}"
                max_ip="${node_ip}"
            fi

            if [ -z "$min_height" ] || [ "$height" -lt "$min_height" ]; then
                min_height=$height
                min_node="${node_name}"
                min_ip="${node_ip}"
            fi
        done

        # Attempt remediation if difference exceeds threshold
        height_diff=0
        if [ -n "$max_height" ] && [ -n "$min_height" ]; then
            height_diff=$((max_height - min_height))
        fi

        if [ "$height_diff" -gt "$SYNC_THRESHOLD" ]; then
            log "Attempting remediation: sync from ${max_node} to ${min_node}"
            if force_sync_from_source "$min_ip" "$min_node" "$max_ip" "$max_node"; then
                log_success "Remediation successful"
                # Re-check sync after remediation
                if check_block_sync; then
                    log_success "Sync verification passed after remediation"
                else
                    log_error "Sync still fails after remediation"
                    total_failures=$((total_failures + 1))
                fi
            else
                log_error "Remediation failed"
                total_failures=$((total_failures + 1))
            fi
        fi
    fi

    # Check block hash consistency
    if ! check_block_hash_consistency; then
        log_error "Block hash inconsistency detected"
        total_failures=$((total_failures + 1))
    fi

    log "=== Blockchain Synchronization Verification Completed ==="
    log "Total failures: ${total_failures}"

    if [ "${total_failures}" -eq 0 ]; then
        log_success "Blockchain synchronization verification passed"
        exit 0
    fi

    log_error "Blockchain synchronization verification failed with ${total_failures} failures"
    exit 1
}
|
||||
|
||||
# Run main function
|
||||
# Script entry point: forward every command-line argument to main unchanged.
# main terminates the script itself via exit 0 (no failures) or exit 1.
main "$@"
|
||||
Reference in New Issue
Block a user