refactor: remove SSH dependencies from blockchain health check
Some checks failed
Blockchain Synchronization Verification / sync-verification (push) Failing after 3s
Multi-Node Blockchain Health Monitoring / health-check (push) Successful in 7s
P2P Network Verification / p2p-verification (push) Has been cancelled

- Remove SSH-based service status checks (use RPC health instead)
- Remove SSH-based resource usage checks
- Remove SSH-based remediation functions
- Remove ssh_exec function entirely
- Script now uses only RPC endpoints for health checks
- gitea-runner no longer needs SSH access to other nodes
This commit is contained in:
aitbc
2026-04-20 20:30:48 +02:00
parent 7d19ec110e
commit adb719efcc

View File

@@ -53,22 +53,6 @@ log_warning() {
echo -e "${YELLOW}$@${NC}"
}
# SSH execution helper
#
# Runs a shell command on a node and writes the command's combined
# stdout/stderr to stdout.
# Arguments:
#   $1 - node: "localhost", this machine's hostname, or its first
#        `hostname -I` address (all executed locally via `bash -c`);
#        anything else is used as the SSH destination
#   $2 - command string to execute
# Returns:
#   0 if the command succeeded, 1 on any failure (the command's own exit
#   status is not preserved).
ssh_exec() {
  local node="$1"
  local command="$2"
  local local_ip=""
  local run_locally=false

  # An empty node must never compare equal to an empty hostname/IP lookup
  # and end up executing the command on the local machine by accident.
  if [[ -z "$node" ]]; then
    echo "ssh_exec: node must not be empty" >&2
    return 1
  fi

  if [[ "$node" == "localhost" || "$node" == "$(hostname)" ]]; then
    run_locally=true
  else
    # Assigned separately from the `local` declaration so the declaration
    # does not mask the lookup's exit status; a failed lookup just leaves
    # local_ip empty, which can never match the (non-empty) node.
    local_ip=$(hostname -I | awk '{print $1}') || local_ip=""
    if [[ -n "$local_ip" && "$node" == "$local_ip" ]]; then
      run_locally=true
    fi
  fi

  if [[ "$run_locally" == true ]]; then
    # Local node: execute directly without SSH
    bash -c "$command" 2>&1 || return 1
  else
    # NOTE(review): StrictHostKeyChecking=no disables host key verification;
    # kept as-is for compatibility, but worth revisiting.
    ssh -o ConnectTimeout=10 -o StrictHostKeyChecking=no "$node" "$command" 2>&1 || return 1
  fi
}
# Check RPC endpoint health
check_rpc_health() {
local node_name="$1"
@@ -85,47 +69,24 @@ check_rpc_health() {
fi
}
# Check systemd service status (RPC-based only, no SSH)
#
# NOTE(review): this body appears to contain both the SSH-based and the
# RPC-only variants of the function (rendered-diff residue) - confirm which
# lines are live before changing behavior.
check_service_status() {
# As written, $1 and $2 are each bound to two names, and `service` is bound
# to $2 first and then rebound to $3.
local node="$1"
local service="$2"
local node_name="$1"
local node_ip="$2"
local service="$3"
log "Checking ${service} status on ${node}"
# Query the unit state through ssh_exec; when ssh_exec fails, the text
# "inactive" is appended to whatever it printed.
status=$(ssh_exec "$node" "systemctl is-active ${service}" 2>&1 || echo "inactive")
# Only the exact string "active" counts as healthy.
if [ "$status" = "active" ]; then
log_success "${service} is active on ${node}"
return 0
else
log_error "${service} is ${status} on ${node}"
return 1
fi
# As written, both branches above return, so the lines below are never reached.
# Skip SSH-based service checks - use RPC health instead
log "Skipping SSH-based service check for ${service} on ${node_name} (using RPC health instead)"
return 0
}
# Check resource usage (RPC-based only, no SSH)
#
# NOTE(review): this body appears to contain both the SSH-based and the
# RPC-only variants of the function (rendered-diff residue) - confirm which
# lines are live before changing behavior.
check_resource_usage() {
local node="$1"
log "Checking resource usage on ${node}"
# Each metric is collected through ssh_exec; when a call fails, "0" is
# appended to whatever it printed.
# memory: column 3 / column 2 of the `free` "Mem" row, as a percentage with one decimal
memory=$(ssh_exec "$node" "free | grep Mem | awk '{printf \"%.1f\", (\$3/\$2)*100}'" 2>&1 || echo "0")
# cpu: second field of the 'Cpu(s)' line from a single batch run of top, text before '%'
cpu=$(ssh_exec "$node" "top -bn1 | grep 'Cpu(s)' | awk '{print \$2}' | cut -d'%' -f1" 2>&1 || echo "0")
# disk: fifth column of the last `df /var/lib/aitbc` row, text before '%'
disk=$(ssh_exec "$node" "df /var/lib/aitbc | tail -1 | awk '{print \$5}' | cut -d'%' -f1" 2>&1 || echo "0")
log "Resource usage on ${node}: CPU ${cpu}%, Memory ${memory}%, Disk ${disk}%"
# Check thresholds
# ${var%.*} drops a trailing ".<fraction>" so the integer-only -gt test can
# be applied. CPU is logged above but has no threshold check here.
if [ "${disk%.*}" -gt 90 ]; then
log_warning "Disk usage critical on ${node}: ${disk}%"
return 1
fi
if [ "${memory%.*}" -gt 90 ]; then
log_warning "Memory usage critical on ${node}: ${memory}%"
return 1
fi
local node_name="$1"
local node_ip="$2"
# Skip SSH-based resource checks
log "Skipping SSH-based resource usage check for ${node_name} (not supported without SSH)"
return 0
}
@@ -142,109 +103,21 @@ check_redis_connectivity() {
fi
}
# Remediation functions

# Shared implementation for the restart_* entry points below.
# Restarts a systemd unit on a node through ssh_exec, waits, then verifies
# that the unit reports exactly "active".
# Globals:   LOG_FILE (restart output is appended to it)
# Arguments: $1 - node passed to ssh_exec
#            $2 - systemd unit name
#            $3 - seconds to wait before verifying
# Returns:   0 if the unit is active after the restart, 1 otherwise
_restart_service() {
  local node="$1"
  local service="$2"
  local settle_seconds="$3"

  log "Attempting to restart ${service} on ${node}"
  ssh_exec "$node" "systemctl restart ${service}" 2>&1 | tee -a "${LOG_FILE}"
  sleep "$settle_seconds"

  # -x requires a whole line to equal "active". A plain substring match would
  # also accept "inactive" and "activating" and report a failed restart as a
  # success. Line-wise matching still tolerates extra lines (e.g. SSH
  # warnings) in the captured output.
  if ssh_exec "$node" "systemctl is-active ${service}" 2>&1 | grep -Fqx "active"; then
    log_success "Successfully restarted ${service} on ${node}"
    return 0
  else
    log_error "Failed to restart ${service} on ${node}"
    return 1
  fi
}

# Restart aitbc-blockchain-rpc on node $1 and verify it after 5 seconds.
restart_rpc_service() {
  _restart_service "$1" "aitbc-blockchain-rpc" 5
}

# Restart aitbc-blockchain-p2p on node $1 and verify it after 5 seconds.
restart_p2p_service() {
  _restart_service "$1" "aitbc-blockchain-p2p" 5
}

# Restart aitbc-blockchain-node on node $1 and verify it after 10 seconds.
restart_node_service() {
  _restart_service "$1" "aitbc-blockchain-node" 10
}
# Main health check for a node (RPC-based only)
#
# Arguments: $1 - node name, $2 - node IP (both forwarded to check_rpc_health)
# Returns:   the value of the `failures` counter as the exit status
#
# NOTE(review): this body appears to contain both the remediation-based and
# the RPC-only variants of the function (rendered-diff residue) - confirm
# which lines are live before changing behavior.
check_node_health() {
local node_name="$1"
local node_ip="$2"
# The node name is what gets passed to the service/restart helpers below.
local node="${node_name}"
local failures=0
# Check RPC health only
if ! check_rpc_health "$node_name" "$node_ip"; then
((failures++))
log "Attempting remediation for RPC on ${node_name}"
if restart_rpc_service "$node"; then
# Retry RPC check
if ! check_rpc_health "$node_name" "$node_ip"; then
log_error "RPC remediation failed on ${node_name}"
else
log_success "RPC remediation successful on ${node_name}"
# Remediation worked, so undo the failure counted above.
((failures--))
fi
fi
# As written, this is logged whenever the first RPC check failed, regardless
# of the remediation outcome.
log_error "RPC endpoint unhealthy on ${node_name}"
fi
# Check blockchain node service
# NOTE(review): check_service_status is called with two arguments here -
# verify this against the parameters that function reads.
if ! check_service_status "$node" "aitbc-blockchain-node"; then
((failures++))
log "Attempting remediation for blockchain node on ${node_name}"
if restart_node_service "$node"; then
# Retry service check
if check_service_status "$node" "aitbc-blockchain-node"; then
log_success "Blockchain node remediation successful on ${node_name}"
((failures--))
fi
fi
fi
# Check P2P service
if ! check_service_status "$node" "aitbc-blockchain-p2p"; then
((failures++))
log "Attempting remediation for P2P on ${node_name}"
if restart_p2p_service "$node"; then
# Retry service check
if check_service_status "$node" "aitbc-blockchain-p2p"; then
log_success "P2P remediation successful on ${node_name}"
((failures--))
fi
fi
fi
# Check resource usage
# A resource problem is counted as a failure; no remediation is attempted.
if ! check_resource_usage "$node"; then
((failures++))
log_warning "Resource usage issues on ${node_name}"
fi
# Skip SSH-based service and resource checks
log "Skipping SSH-based checks for ${node_name} (RPC health only mode)"
return $failures
}