refactor: remove SSH dependencies from blockchain health check
- Remove SSH-based service status checks (use RPC health instead)
- Remove SSH-based resource usage checks
- Remove SSH-based remediation functions
- Remove ssh_exec function entirely
- Script now uses only RPC endpoints for health checks
- gitea-runner no longer needs SSH access to other nodes
This commit is contained in:
@@ -53,22 +53,6 @@ log_warning() {
|
|||||||
echo -e "${YELLOW}$@${NC}"
|
echo -e "${YELLOW}$@${NC}"
|
||||||
}
|
}
|
||||||
|
|
||||||
# SSH execution helper
|
|
||||||
ssh_exec() {
|
|
||||||
local node="$1"
|
|
||||||
local command="$2"
|
|
||||||
|
|
||||||
# Get local IP address
|
|
||||||
local local_ip=$(hostname -I | awk '{print $1}')
|
|
||||||
|
|
||||||
# If node is localhost or local IP, execute directly without SSH
|
|
||||||
if [ "$node" = "localhost" ] || [ "$node" = "$(hostname)" ] || [ "$node" = "$local_ip" ]; then
|
|
||||||
bash -c "$command" 2>&1 || return 1
|
|
||||||
else
|
|
||||||
ssh -o ConnectTimeout=10 -o StrictHostKeyChecking=no "$node" "$command" 2>&1 || return 1
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
|
|
||||||
# Check RPC endpoint health
|
# Check RPC endpoint health
|
||||||
check_rpc_health() {
|
check_rpc_health() {
|
||||||
local node_name="$1"
|
local node_name="$1"
|
||||||
@@ -85,47 +69,24 @@ check_rpc_health() {
|
|||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
# Check systemd service status
|
# Check systemd service status (RPC-based only, no SSH)
|
||||||
check_service_status() {
|
check_service_status() {
|
||||||
local node="$1"
|
local node_name="$1"
|
||||||
local service="$2"
|
local node_ip="$2"
|
||||||
|
local service="$3"
|
||||||
|
|
||||||
log "Checking ${service} status on ${node}"
|
# Skip SSH-based service checks - use RPC health instead
|
||||||
|
log "Skipping SSH-based service check for ${service} on ${node_name} (using RPC health instead)"
|
||||||
status=$(ssh_exec "$node" "systemctl is-active ${service}" 2>&1 || echo "inactive")
|
return 0
|
||||||
|
|
||||||
if [ "$status" = "active" ]; then
|
|
||||||
log_success "${service} is active on ${node}"
|
|
||||||
return 0
|
|
||||||
else
|
|
||||||
log_error "${service} is ${status} on ${node}"
|
|
||||||
return 1
|
|
||||||
fi
|
|
||||||
}
|
}
|
||||||
|
|
||||||
# Check resource usage
|
# Check resource usage (RPC-based only, no SSH)
|
||||||
check_resource_usage() {
|
check_resource_usage() {
|
||||||
local node="$1"
|
local node_name="$1"
|
||||||
|
local node_ip="$2"
|
||||||
log "Checking resource usage on ${node}"
|
|
||||||
|
|
||||||
memory=$(ssh_exec "$node" "free | grep Mem | awk '{printf \"%.1f\", (\$3/\$2)*100}'" 2>&1 || echo "0")
|
|
||||||
cpu=$(ssh_exec "$node" "top -bn1 | grep 'Cpu(s)' | awk '{print \$2}' | cut -d'%' -f1" 2>&1 || echo "0")
|
|
||||||
disk=$(ssh_exec "$node" "df /var/lib/aitbc | tail -1 | awk '{print \$5}' | cut -d'%' -f1" 2>&1 || echo "0")
|
|
||||||
|
|
||||||
log "Resource usage on ${node}: CPU ${cpu}%, Memory ${memory}%, Disk ${disk}%"
|
|
||||||
|
|
||||||
# Check thresholds
|
|
||||||
if [ "${disk%.*}" -gt 90 ]; then
|
|
||||||
log_warning "Disk usage critical on ${node}: ${disk}%"
|
|
||||||
return 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ "${memory%.*}" -gt 90 ]; then
|
|
||||||
log_warning "Memory usage critical on ${node}: ${memory}%"
|
|
||||||
return 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
|
# Skip SSH-based resource checks
|
||||||
|
log "Skipping SSH-based resource usage check for ${node_name} (not supported without SSH)"
|
||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -142,109 +103,21 @@ check_redis_connectivity() {
|
|||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
# Remediation functions
|
# Main health check for a node (RPC-based only)
|
||||||
restart_rpc_service() {
|
|
||||||
local node="$1"
|
|
||||||
log "Attempting to restart aitbc-blockchain-rpc on ${node}"
|
|
||||||
|
|
||||||
ssh_exec "$node" "systemctl restart aitbc-blockchain-rpc" 2>&1 | tee -a "${LOG_FILE}"
|
|
||||||
sleep 5
|
|
||||||
|
|
||||||
if ssh_exec "$node" "systemctl is-active aitbc-blockchain-rpc" 2>&1 | grep -q "active"; then
|
|
||||||
log_success "Successfully restarted aitbc-blockchain-rpc on ${node}"
|
|
||||||
return 0
|
|
||||||
else
|
|
||||||
log_error "Failed to restart aitbc-blockchain-rpc on ${node}"
|
|
||||||
return 1
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
|
|
||||||
restart_p2p_service() {
|
|
||||||
local node="$1"
|
|
||||||
log "Attempting to restart aitbc-blockchain-p2p on ${node}"
|
|
||||||
|
|
||||||
ssh_exec "$node" "systemctl restart aitbc-blockchain-p2p" 2>&1 | tee -a "${LOG_FILE}"
|
|
||||||
sleep 5
|
|
||||||
|
|
||||||
if ssh_exec "$node" "systemctl is-active aitbc-blockchain-p2p" 2>&1 | grep -q "active"; then
|
|
||||||
log_success "Successfully restarted aitbc-blockchain-p2p on ${node}"
|
|
||||||
return 0
|
|
||||||
else
|
|
||||||
log_error "Failed to restart aitbc-blockchain-p2p on ${node}"
|
|
||||||
return 1
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
|
|
||||||
restart_node_service() {
|
|
||||||
local node="$1"
|
|
||||||
log "Attempting to restart aitbc-blockchain-node on ${node}"
|
|
||||||
|
|
||||||
ssh_exec "$node" "systemctl restart aitbc-blockchain-node" 2>&1 | tee -a "${LOG_FILE}"
|
|
||||||
sleep 10
|
|
||||||
|
|
||||||
if ssh_exec "$node" "systemctl is-active aitbc-blockchain-node" 2>&1 | grep -q "active"; then
|
|
||||||
log_success "Successfully restarted aitbc-blockchain-node on ${node}"
|
|
||||||
return 0
|
|
||||||
else
|
|
||||||
log_error "Failed to restart aitbc-blockchain-node on ${node}"
|
|
||||||
return 1
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
|
|
||||||
# Main health check for a node
|
|
||||||
check_node_health() {
|
check_node_health() {
|
||||||
local node_name="$1"
|
local node_name="$1"
|
||||||
local node_ip="$2"
|
local node_ip="$2"
|
||||||
local node="${node_name}"
|
|
||||||
|
|
||||||
local failures=0
|
local failures=0
|
||||||
|
|
||||||
# Check RPC health
|
# Check RPC health only
|
||||||
if ! check_rpc_health "$node_name" "$node_ip"; then
|
if ! check_rpc_health "$node_name" "$node_ip"; then
|
||||||
((failures++))
|
((failures++))
|
||||||
log "Attempting remediation for RPC on ${node_name}"
|
log_error "RPC endpoint unhealthy on ${node_name}"
|
||||||
if restart_rpc_service "$node"; then
|
|
||||||
# Retry RPC check
|
|
||||||
if ! check_rpc_health "$node_name" "$node_ip"; then
|
|
||||||
log_error "RPC remediation failed on ${node_name}"
|
|
||||||
else
|
|
||||||
log_success "RPC remediation successful on ${node_name}"
|
|
||||||
((failures--))
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Check blockchain node service
|
# Skip SSH-based service and resource checks
|
||||||
if ! check_service_status "$node" "aitbc-blockchain-node"; then
|
log "Skipping SSH-based checks for ${node_name} (RPC health only mode)"
|
||||||
((failures++))
|
|
||||||
log "Attempting remediation for blockchain node on ${node_name}"
|
|
||||||
if restart_node_service "$node"; then
|
|
||||||
# Retry service check
|
|
||||||
if check_service_status "$node" "aitbc-blockchain-node"; then
|
|
||||||
log_success "Blockchain node remediation successful on ${node_name}"
|
|
||||||
((failures--))
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Check P2P service
|
|
||||||
if ! check_service_status "$node" "aitbc-blockchain-p2p"; then
|
|
||||||
((failures++))
|
|
||||||
log "Attempting remediation for P2P on ${node_name}"
|
|
||||||
if restart_p2p_service "$node"; then
|
|
||||||
# Retry service check
|
|
||||||
if check_service_status "$node" "aitbc-blockchain-p2p"; then
|
|
||||||
log_success "P2P remediation successful on ${node_name}"
|
|
||||||
((failures--))
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Check resource usage
|
|
||||||
if ! check_resource_usage "$node"; then
|
|
||||||
((failures++))
|
|
||||||
log_warning "Resource usage issues on ${node_name}"
|
|
||||||
fi
|
|
||||||
|
|
||||||
return $failures
|
return $failures
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user