From adb719efccfa54d6841d3ef2093b10a5d42a0644 Mon Sep 17 00:00:00 2001 From: aitbc Date: Mon, 20 Apr 2026 20:30:48 +0200 Subject: [PATCH] refactor: remove SSH dependencies from blockchain health check - Remove SSH-based service status checks (use RPC health instead) - Remove SSH-based resource usage checks - Remove SSH-based remediation functions - Remove ssh_exec function entirely - Script now uses only RPC endpoints for health checks - gitea-runner no longer needs SSH access to other nodes --- scripts/multi-node/blockchain-health-check.sh | 161 ++---------------- 1 file changed, 17 insertions(+), 144 deletions(-) diff --git a/scripts/multi-node/blockchain-health-check.sh b/scripts/multi-node/blockchain-health-check.sh index a4764ec8..51cd4ad7 100755 --- a/scripts/multi-node/blockchain-health-check.sh +++ b/scripts/multi-node/blockchain-health-check.sh @@ -53,22 +53,6 @@ log_warning() { echo -e "${YELLOW}$@${NC}" } -# SSH execution helper -ssh_exec() { - local node="$1" - local command="$2" - - # Get local IP address - local local_ip=$(hostname -I | awk '{print $1}') - - # If node is localhost or local IP, execute directly without SSH - if [ "$node" = "localhost" ] || [ "$node" = "$(hostname)" ] || [ "$node" = "$local_ip" ]; then - bash -c "$command" 2>&1 || return 1 - else - ssh -o ConnectTimeout=10 -o StrictHostKeyChecking=no "$node" "$command" 2>&1 || return 1 - fi -} - # Check RPC endpoint health check_rpc_health() { local node_name="$1" @@ -85,47 +69,24 @@ check_rpc_health() { fi } -# Check systemd service status +# Check systemd service status (RPC-based only, no SSH) check_service_status() { - local node="$1" - local service="$2" + local node_name="$1" + local node_ip="$2" + local service="$3" - log "Checking ${service} status on ${node}" - - status=$(ssh_exec "$node" "systemctl is-active ${service}" 2>&1 || echo "inactive") - - if [ "$status" = "active" ]; then - log_success "${service} is active on ${node}" - return 0 - else - log_error "${service} is ${status} on ${node}" - return 1 - fi + # Skip SSH-based service checks - use RPC health instead + log "Skipping SSH-based service check for ${service} on ${node_name} (using RPC health instead)" + return 0 } -# Check resource usage +# Check resource usage (RPC-based only, no SSH) check_resource_usage() { - local node="$1" - - log "Checking resource usage on ${node}" - - memory=$(ssh_exec "$node" "free | grep Mem | awk '{printf \"%.1f\", (\$3/\$2)*100}'" 2>&1 || echo "0") - cpu=$(ssh_exec "$node" "top -bn1 | grep 'Cpu(s)' | awk '{print \$2}' | cut -d'%' -f1" 2>&1 || echo "0") - disk=$(ssh_exec "$node" "df /var/lib/aitbc | tail -1 | awk '{print \$5}' | cut -d'%' -f1" 2>&1 || echo "0") - - log "Resource usage on ${node}: CPU ${cpu}%, Memory ${memory}%, Disk ${disk}%" - - # Check thresholds - if [ "${disk%.*}" -gt 90 ]; then - log_warning "Disk usage critical on ${node}: ${disk}%" - return 1 - fi - - if [ "${memory%.*}" -gt 90 ]; then - log_warning "Memory usage critical on ${node}: ${memory}%" - return 1 - fi + local node_name="$1" + local node_ip="$2" + # Skip SSH-based resource checks + log "Skipping SSH-based resource usage check for ${node_name} (not supported without SSH)" return 0 } @@ -142,109 +103,21 @@ check_redis_connectivity() { fi } -# Remediation functions -restart_rpc_service() { - local node="$1" - log "Attempting to restart aitbc-blockchain-rpc on ${node}" - - ssh_exec "$node" "systemctl restart aitbc-blockchain-rpc" 2>&1 | tee -a "${LOG_FILE}" - sleep 5 - - if ssh_exec "$node" "systemctl is-active aitbc-blockchain-rpc" 2>&1 | grep -q "active"; then - log_success "Successfully restarted aitbc-blockchain-rpc on ${node}" - return 0 - else - log_error "Failed to restart aitbc-blockchain-rpc on ${node}" - return 1 - fi -} - -restart_p2p_service() { - local node="$1" - log "Attempting to restart aitbc-blockchain-p2p on ${node}" - - ssh_exec "$node" "systemctl restart aitbc-blockchain-p2p" 2>&1 | tee -a "${LOG_FILE}" - sleep 5 - - if ssh_exec "$node" "systemctl is-active aitbc-blockchain-p2p" 2>&1 | grep -q "active"; then - log_success "Successfully restarted aitbc-blockchain-p2p on ${node}" - return 0 - else - log_error "Failed to restart aitbc-blockchain-p2p on ${node}" - return 1 - fi -} - -restart_node_service() { - local node="$1" - log "Attempting to restart aitbc-blockchain-node on ${node}" - - ssh_exec "$node" "systemctl restart aitbc-blockchain-node" 2>&1 | tee -a "${LOG_FILE}" - sleep 10 - - if ssh_exec "$node" "systemctl is-active aitbc-blockchain-node" 2>&1 | grep -q "active"; then - log_success "Successfully restarted aitbc-blockchain-node on ${node}" - return 0 - else - log_error "Failed to restart aitbc-blockchain-node on ${node}" - return 1 - fi -} - -# Main health check for a node +# Main health check for a node (RPC-based only) check_node_health() { local node_name="$1" local node_ip="$2" - local node="${node_name}" local failures=0 - # Check RPC health + # Check RPC health only if ! check_rpc_health "$node_name" "$node_ip"; then ((failures++)) - log "Attempting remediation for RPC on ${node_name}" - if restart_rpc_service "$node"; then - # Retry RPC check - if ! check_rpc_health "$node_name" "$node_ip"; then - log_error "RPC remediation failed on ${node_name}" - else - log_success "RPC remediation successful on ${node_name}" - ((failures--)) - fi - fi + log_error "RPC endpoint unhealthy on ${node_name}" fi - # Check blockchain node service - if ! check_service_status "$node" "aitbc-blockchain-node"; then - ((failures++)) - log "Attempting remediation for blockchain node on ${node_name}" - if restart_node_service "$node"; then - # Retry service check - if check_service_status "$node" "aitbc-blockchain-node"; then - log_success "Blockchain node remediation successful on ${node_name}" - ((failures--)) - fi - fi - fi - - # Check P2P service - if ! check_service_status "$node" "aitbc-blockchain-p2p"; then - ((failures++)) - log "Attempting remediation for P2P on ${node_name}" - if restart_p2p_service "$node"; then - # Retry service check - if check_service_status "$node" "aitbc-blockchain-p2p"; then - log_success "P2P remediation successful on ${node_name}" - ((failures--)) - fi - fi - fi - - # Check resource usage - if ! check_resource_usage "$node"; then - ((failures++)) - log_warning "Resource usage issues on ${node_name}" - fi + # Skip SSH-based service and resource checks + log "Skipping SSH-based checks for ${node_name} (RPC health only mode)" return $failures }