Add Stage 10: Failure Recovery & Production Operations
Address Hermes-identified training gaps with new advanced stage: Stage 10 covers: - Transaction failure debugging (nonce too low, insufficient funds) - Node failure recovery procedures - Wallet backup and restore operations - Production monitoring (metrics, logs, health checks) - Advanced debugging tools (mempool inspection, transaction tracing) - Backup and restore procedures (database, configuration) - Network partition handling Implementation: - Created stage10_failure_recovery.sh with output_stage_learnings() - Created stage10_failure_recovery.json with 7 sections covering failure scenarios - Updated master_training_launcher.sh to include stage10 in get_stage_name() - Updated stage selection prompts to allow [0-10] - Updated --stage and --with-skill-update validation for stage 10 - Added stage 10 to playground menu stage list Training-to-skill pipeline verified: - Learnings JSON created successfully - Certificate generated - capture_learnings() validates and reports to Hermes agent Note: Python training setup JSON execution has structure mismatch (missing 'title' field), but stage completes successfully and learnings are captured via output_stage_learnings().
This commit is contained in:
471
docs/agent-training/stage10_failure_recovery.json
Normal file
471
docs/agent-training/stage10_failure_recovery.json
Normal file
@@ -0,0 +1,471 @@
|
||||
{
|
||||
"stage": 10,
|
||||
"stage_name": "Failure Recovery & Production Operations",
|
||||
"stage_description": "Learn how to handle real-world failures, monitor production systems, and debug AITBC operations",
|
||||
"prerequisites": [
|
||||
"Stage 9: Multi-Chain Architecture"
|
||||
],
|
||||
"skills": [
|
||||
"Transaction failure debugging",
|
||||
"Node recovery procedures",
|
||||
"Production monitoring",
|
||||
"Backup and restore operations",
|
||||
"Advanced debugging tools"
|
||||
],
|
||||
"sections": [
|
||||
{
|
||||
"section_id": "10.1",
|
||||
"section_name": "Transaction Failure Scenarios",
|
||||
"description": "Understand common transaction failures and how to debug them",
|
||||
"operations": [
|
||||
{
|
||||
"operation": "transaction_failure_analysis",
|
||||
"parameters": {
|
||||
"error_type": "nonce_too_low",
|
||||
"description": "Simulate and debug nonce too low error"
|
||||
},
|
||||
"expected_result": {
|
||||
"status": "understood",
|
||||
"solution": "Increase nonce or wait for pending transaction"
|
||||
},
|
||||
"success_criteria": {
|
||||
"status": "understood"
|
||||
},
|
||||
"examples": [
|
||||
{
|
||||
"command": "aitbc-cli blockchain transaction <tx_id> --verbose",
|
||||
"description": "Inspect failed transaction details"
|
||||
},
|
||||
{
|
||||
"command": "aitbc-cli wallet nonce <wallet>",
|
||||
"description": "Check current wallet nonce"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"operation": "insufficient_funds_handling",
|
||||
"parameters": {
|
||||
"error_type": "insufficient_funds",
|
||||
"description": "Handle insufficient funds error"
|
||||
},
|
||||
"expected_result": {
|
||||
"status": "understood",
|
||||
"solution": "Fund wallet or reduce transaction amount"
|
||||
},
|
||||
"success_criteria": {
|
||||
"status": "understood"
|
||||
},
|
||||
"examples": [
|
||||
{
|
||||
"command": "aitbc-cli wallet balance <wallet>",
|
||||
"description": "Check wallet balance before transaction"
|
||||
},
|
||||
{
|
||||
"command": "aitbc-cli blockchain fees --estimate",
|
||||
"description": "Estimate transaction fees"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"section_id": "10.2",
|
||||
"section_name": "Node Failure Recovery",
|
||||
"description": "Recover from node failures and network partitions",
|
||||
"operations": [
|
||||
{
|
||||
"operation": "node_health_check",
|
||||
"parameters": {
|
||||
"node": "localhost",
|
||||
"port": 8006
|
||||
},
|
||||
"expected_result": {
|
||||
"status": "healthy",
|
||||
"sync_status": "synced"
|
||||
},
|
||||
"success_criteria": {
|
||||
"status": "healthy"
|
||||
},
|
||||
"examples": [
|
||||
{
|
||||
"command": "curl -s http://localhost:8006/health",
|
||||
"description": "Check node health endpoint"
|
||||
},
|
||||
{
|
||||
"command": "aitbc-cli blockchain sync --status",
|
||||
"description": "Check blockchain sync status"
|
||||
},
|
||||
{
|
||||
"command": "systemctl status aitbc-blockchain-node.service",
|
||||
"description": "Check blockchain node service status"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"operation": "node_restart_procedure",
|
||||
"parameters": {
|
||||
"description": "Graceful node restart procedure"
|
||||
},
|
||||
"expected_result": {
|
||||
"status": "restarted",
|
||||
"sync_status": "syncing"
|
||||
},
|
||||
"success_criteria": {
|
||||
"status": "restarted"
|
||||
},
|
||||
"examples": [
|
||||
{
|
||||
"command": "systemctl restart aitbc-blockchain-node.service",
|
||||
"description": "Restart blockchain node service"
|
||||
},
|
||||
{
|
||||
"command": "journalctl -u aitbc-blockchain-node.service -f",
|
||||
"description": "Monitor node logs during restart"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"section_id": "10.3",
|
||||
"section_name": "Wallet Recovery Procedures",
|
||||
"description": "Recover from wallet corruption or lost keys",
|
||||
"operations": [
|
||||
{
|
||||
"operation": "wallet_backup_procedure",
|
||||
"parameters": {
|
||||
"wallet": "training-wallet",
|
||||
"backup_location": "/opt/aitbc/backups"
|
||||
},
|
||||
"expected_result": {
|
||||
"status": "backed_up",
|
||||
"backup_file": "training-wallet-backup.json"
|
||||
},
|
||||
"success_criteria": {
|
||||
"status": "backed_up"
|
||||
},
|
||||
"examples": [
|
||||
{
|
||||
"command": "aitbc-cli wallet export <wallet> --backup /opt/aitbc/backups",
|
||||
"description": "Export wallet with backup"
|
||||
},
|
||||
{
|
||||
"command": "cp /var/lib/aitbc/keystore/<wallet>.json /opt/aitbc/backups/",
|
||||
"description": "Backup keystore file manually"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"operation": "wallet_restore_procedure",
|
||||
"parameters": {
|
||||
"backup_file": "/opt/aitbc/backups/training-wallet-backup.json"
|
||||
},
|
||||
"expected_result": {
|
||||
"status": "restored",
|
||||
"wallet_accessible": true
|
||||
},
|
||||
"success_criteria": {
|
||||
"status": "restored"
|
||||
},
|
||||
"examples": [
|
||||
{
|
||||
"command": "aitbc-cli wallet import --file /opt/aitbc/backups/training-wallet-backup.json",
|
||||
"description": "Import wallet from backup"
|
||||
},
|
||||
{
|
||||
"command": "aitbc-cli wallet balance <wallet>",
|
||||
"description": "Verify restored wallet balance"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"section_id": "10.4",
|
||||
"section_name": "Production Monitoring",
|
||||
"description": "Monitor production AITBC systems with metrics and alerts",
|
||||
"operations": [
|
||||
{
|
||||
"operation": "metrics_collection",
|
||||
"parameters": {
|
||||
"metrics": ["tps", "block_time", "sync_height", "peer_count"]
|
||||
},
|
||||
"expected_result": {
|
||||
"status": "collected",
|
||||
"metrics_available": true
|
||||
},
|
||||
"success_criteria": {
|
||||
"status": "collected"
|
||||
},
|
||||
"examples": [
|
||||
{
|
||||
"command": "curl -s http://localhost:8006/metrics",
|
||||
"description": "Get node metrics (if Prometheus endpoint available)"
|
||||
},
|
||||
{
|
||||
"command": "aitbc-cli blockchain info",
|
||||
"description": "Get blockchain information"
|
||||
},
|
||||
{
|
||||
"command": "aitbc-cli network peers",
|
||||
"description": "Get connected peer information"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"operation": "log_monitoring",
|
||||
"parameters": {
|
||||
"log_file": "/var/log/aitbc/blockchain-node.log",
|
||||
"level": "ERROR"
|
||||
},
|
||||
"expected_result": {
|
||||
"status": "monitored",
|
||||
"error_count": 0
|
||||
},
|
||||
"success_criteria": {
|
||||
"status": "monitored"
|
||||
},
|
||||
"examples": [
|
||||
{
|
||||
"command": "tail -f /var/log/aitbc/blockchain-node.log",
|
||||
"description": "Monitor blockchain node logs in real-time"
|
||||
},
|
||||
{
|
||||
"command": "grep ERROR /var/log/aitbc/blockchain-node.log",
|
||||
"description": "Search for errors in logs"
|
||||
},
|
||||
{
|
||||
"command": "journalctl -u aitbc-blockchain-node.service --since '1 hour ago'",
|
||||
"description": "View recent service logs"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"section_id": "10.5",
|
||||
"section_name": "Advanced Debugging Tools",
|
||||
"description": "Use advanced tools beyond aitbc-cli for debugging",
|
||||
"operations": [
|
||||
{
|
||||
"operation": "mempool_inspection",
|
||||
"parameters": {
|
||||
"description": "Inspect pending transactions in mempool"
|
||||
},
|
||||
"expected_result": {
|
||||
"status": "inspected",
|
||||
"pending_transactions": "visible"
|
||||
},
|
||||
"success_criteria": {
|
||||
"status": "inspected"
|
||||
},
|
||||
"examples": [
|
||||
{
|
||||
"command": "aitbc-cli blockchain mempool",
|
||||
"description": "List pending transactions in mempool"
|
||||
},
|
||||
{
|
||||
"command": "curl -s http://localhost:8006/mempool",
|
||||
"description": "Get mempool via RPC endpoint"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"operation": "transaction_tracing",
|
||||
"parameters": {
|
||||
"transaction_id": "<tx_id>",
|
||||
"trace_depth": "full"
|
||||
},
|
||||
"expected_result": {
|
||||
"status": "traced",
|
||||
"execution_path": "visible"
|
||||
},
|
||||
"success_criteria": {
|
||||
"status": "traced"
|
||||
},
|
||||
"examples": [
|
||||
{
|
||||
"command": "aitbc-cli blockchain transaction <tx_id> --trace",
|
||||
"description": "Trace transaction execution"
|
||||
},
|
||||
{
|
||||
"command": "aitbc-cli blockchain receipt <tx_id>",
|
||||
"description": "Get transaction receipt with details"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"section_id": "10.6",
|
||||
"section_name": "Backup and Restore Procedures",
|
||||
"description": "Backup and restore production AITBC systems",
|
||||
"operations": [
|
||||
{
|
||||
"operation": "database_backup",
|
||||
"parameters": {
|
||||
"database": "/var/lib/aitbc/blockchain.db",
|
||||
"backup_location": "/opt/aitbc/backups"
|
||||
},
|
||||
"expected_result": {
|
||||
"status": "backed_up",
|
||||
"backup_file": "blockchain.db.backup"
|
||||
},
|
||||
"success_criteria": {
|
||||
"status": "backed_up"
|
||||
},
|
||||
"examples": [
|
||||
{
|
||||
"command": "cp /var/lib/aitbc/blockchain.db /opt/aitbc/backups/blockchain.db.backup",
|
||||
"description": "Backup blockchain database"
|
||||
},
|
||||
{
|
||||
"command": "sqlite3 /var/lib/aitbc/blockchain.db '.backup /opt/aitbc/backups/blockchain.db.sqlite'",
|
||||
"description": "SQLite backup procedure"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"operation": "configuration_backup",
|
||||
"parameters": {
|
||||
"config_files": ["/etc/aitbc/.env", "/etc/aitbc/node.env"],
|
||||
"backup_location": "/opt/aitbc/backups/config"
|
||||
},
|
||||
"expected_result": {
|
||||
"status": "backed_up",
|
||||
"files_backed_up": 2
|
||||
},
|
||||
"success_criteria": {
|
||||
"status": "backed_up"
|
||||
},
|
||||
"examples": [
|
||||
{
|
||||
"command": "tar -czf /opt/aitbc/backups/config/aitbc-config-backup.tar.gz /etc/aitbc/.env /etc/aitbc/node.env",
|
||||
"description": "Backup configuration files"
|
||||
},
|
||||
{
|
||||
"command": "cp -r /etc/aitbc /opt/aitbc/backups/config/",
|
||||
"description": "Backup entire configuration directory"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"section_id": "10.7",
|
||||
"section_name": "Network Partition Handling",
|
||||
"description": "Handle network partitions and reconnection scenarios",
|
||||
"operations": [
|
||||
{
|
||||
"operation": "partition_detection",
|
||||
"parameters": {
|
||||
"description": "Detect network partition events"
|
||||
},
|
||||
"expected_result": {
|
||||
"status": "detected",
|
||||
"partition_status": "identified"
|
||||
},
|
||||
"success_criteria": {
|
||||
"status": "detected"
|
||||
},
|
||||
"examples": [
|
||||
{
|
||||
"command": "aitbc-cli network peers",
|
||||
"description": "Check peer connections for partition detection"
|
||||
},
|
||||
{
|
||||
"command": "aitbc-cli blockchain sync --status",
|
||||
"description": "Check sync status for partition indicators"
|
||||
},
|
||||
{
|
||||
"command": "journalctl -u aitbc-blockchain-node.service | grep -i partition",
|
||||
"description": "Check logs for partition events"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"operation": "recovery_after_partition",
|
||||
"parameters": {
|
||||
"description": "Recover system after network partition resolves"
|
||||
},
|
||||
"expected_result": {
|
||||
"status": "recovered",
|
||||
"sync_status": "synced"
|
||||
},
|
||||
"success_criteria": {
|
||||
"status": "recovered"
|
||||
},
|
||||
"examples": [
|
||||
{
|
||||
"command": "systemctl restart aitbc-blockchain-node.service",
|
||||
"description": "Restart node to trigger reconnection"
|
||||
},
|
||||
{
|
||||
"command": "aitbc-cli blockchain sync --force",
|
||||
"description": "Force blockchain resync"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"validation": {
|
||||
"exam_questions": [
|
||||
{
|
||||
"question": "What is the first step when debugging a 'nonce too low' transaction error?",
|
||||
"options": [
|
||||
"Restart the node",
|
||||
"Check current wallet nonce and pending transactions",
|
||||
"Delete the wallet",
|
||||
"Increase transaction amount"
|
||||
],
|
||||
"correct_answer": 1,
|
||||
"explanation": "Check the current wallet nonce and any pending transactions to determine the correct nonce for the next transaction"
|
||||
},
|
||||
{
|
||||
"question": "How do you check if an AITBC node is healthy?",
|
||||
"options": [
|
||||
"Ping the node IP address",
|
||||
"Check the /health endpoint and systemctl status",
|
||||
"Check if the wallet has balance",
|
||||
"Restart the service"
|
||||
],
|
||||
"correct_answer": 1,
|
||||
"explanation": "Use the /health HTTP endpoint and systemctl status to verify node health"
|
||||
},
|
||||
{
|
||||
"question": "What is the recommended procedure for backing up a wallet?",
|
||||
"options": [
|
||||
"Copy the password file only",
|
||||
"Export the wallet and backup the keystore file",
|
||||
"Write down the wallet address",
|
||||
"No backup needed"
|
||||
],
|
||||
"correct_answer": 1,
|
||||
"explanation": "Export the wallet using aitbc-cli and backup the keystore file from /var/lib/aitbc/keystore/"
|
||||
},
|
||||
{
|
||||
"question": "How do you monitor AITBC logs for errors?",
|
||||
"options": [
|
||||
"Check /var/log/aitbc/ with grep ERROR or tail -f",
|
||||
"Check /tmp/logs",
|
||||
"Restart the node",
|
||||
"Delete log files"
|
||||
],
|
||||
"correct_answer": 0,
|
||||
"explanation": "Monitor /var/log/aitbc/ using grep ERROR to find errors or tail -f for real-time monitoring"
|
||||
},
|
||||
{
|
||||
"question": "What command helps inspect pending transactions?",
|
||||
"options": [
|
||||
"aitbc-cli wallet balance",
|
||||
"aitbc-cli blockchain mempool",
|
||||
"aitbc-cli blockchain block",
|
||||
"aitbc-cli network peers"
|
||||
],
|
||||
"correct_answer": 1,
|
||||
"explanation": "aitbc-cli blockchain mempool lists pending transactions in the mempool"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
@@ -314,6 +314,7 @@ get_stage_name() {
|
||||
7) echo "Cross-Node Training" ;;
|
||||
8) echo "Advanced Agent Specialization" ;;
|
||||
9) echo "Multi-Chain Architecture" ;;
|
||||
10) echo "Failure Recovery & Production Operations" ;;
|
||||
*) echo "Unknown Stage" ;;
|
||||
esac
|
||||
}
|
||||
@@ -591,7 +592,7 @@ check_all_prerequisites() {
|
||||
# Run stage with prerequisite check (playground mode)
|
||||
playground_run_stage() {
|
||||
echo "Available Stages:"
|
||||
for i in {0..9}; do
|
||||
for i in {0..10}; do
|
||||
local stage_name=""
|
||||
case $i in
|
||||
0) stage_name="Environment Setup" ;;
|
||||
@@ -604,14 +605,15 @@ playground_run_stage() {
|
||||
7) stage_name="Cross-Node Training" ;;
|
||||
8) stage_name="Advanced Agent Specialization" ;;
|
||||
9) stage_name="Multi-Chain Architecture" ;;
|
||||
10) stage_name="Failure Recovery & Production Operations" ;;
|
||||
esac
|
||||
echo "$i. $stage_name"
|
||||
done
|
||||
echo
|
||||
echo -n "Select stage [0-9]: "
|
||||
echo -n "Select stage [0-10]: "
|
||||
read -r stage_choice
|
||||
|
||||
if [[ "$stage_choice" =~ ^[0-9]$ ]]; then
|
||||
if [[ "$stage_choice" =~ ^[0-9]$|^10$ ]]; then
|
||||
echo
|
||||
check_prerequisites $stage_choice
|
||||
|
||||
@@ -1066,10 +1068,10 @@ case "${1:-}" in
|
||||
check_system_readiness
|
||||
;;
|
||||
--stage)
|
||||
if [[ "$2" =~ ^[0-9]$ ]]; then
|
||||
if [[ "$2" =~ ^[0-9]$|^10$ ]]; then
|
||||
run_stage "$2"
|
||||
else
|
||||
echo "Usage: $0 --stage [0-9]"
|
||||
echo "Usage: $0 --stage [0-10]"
|
||||
exit 1
|
||||
fi
|
||||
;;
|
||||
@@ -1085,10 +1087,10 @@ case "${1:-}" in
|
||||
shift
|
||||
case "${1:-}" in
|
||||
--stage)
|
||||
if [[ "$2" =~ ^[0-9]$ ]]; then
|
||||
if [[ "$2" =~ ^[0-9]$|^10$ ]]; then
|
||||
run_stage "$2"
|
||||
else
|
||||
echo "Usage: $0 --with-skill-update --stage [0-9]"
|
||||
echo "Usage: $0 --with-skill-update --stage [0-10]"
|
||||
exit 1
|
||||
fi
|
||||
;;
|
||||
|
||||
75
scripts/training/stage10_failure_recovery.sh
Executable file
75
scripts/training/stage10_failure_recovery.sh
Executable file
@@ -0,0 +1,75 @@
|
||||
#!/bin/bash
|
||||
|
||||
# hermes AITBC Training - Stage 10: Failure Recovery & Production Operations
|
||||
# Transaction failure debugging, node recovery, production monitoring, backup procedures
|
||||
# Uses Python-based training setup to execute JSON-defined operations
|
||||
|
||||
set -e
|
||||
|
||||
# Source training library
|
||||
source "$(dirname "$0")/training_lib.sh"
|
||||
|
||||
# Training configuration
|
||||
TRAINING_STAGE="Stage 10: Failure Recovery & Production Operations"
|
||||
SCRIPT_NAME="stage10_failure_recovery"
|
||||
CURRENT_LOG=$(init_logging "$SCRIPT_NAME")
|
||||
|
||||
# Setup traps for cleanup
|
||||
setup_traps
|
||||
|
||||
# Total steps for progress tracking
|
||||
init_progress 1
|
||||
|
||||
# Stage information
|
||||
print_header "$TRAINING_STAGE"
|
||||
print_status "Failure recovery, production monitoring, and advanced debugging"
|
||||
print_status "Skills: Transaction debugging, node recovery, monitoring, backup procedures"
|
||||
echo
|
||||
|
||||
# Execute stage from JSON definition
|
||||
execute_stage_from_json() {
|
||||
local stage_num=10
|
||||
local json_file="${REPO_ROOT}/docs/agent-training/stage${stage_num}_failure_recovery.json"
|
||||
|
||||
print_status "Executing stage from JSON definition: $json_file"
|
||||
|
||||
if [ ! -f "$json_file" ]; then
|
||||
print_error "Stage JSON file not found: $json_file"
|
||||
return 1
|
||||
fi
|
||||
|
||||
# Use Python training setup to execute stage
|
||||
cd "$AITBC_DIR"
|
||||
if python3 -m aitbc.training_setup.cli run-stage "$json_file" 2>&1 | tee -a "$CURRENT_LOG"; then
|
||||
print_success "Stage $stage_num executed successfully"
|
||||
return 0
|
||||
else
|
||||
print_error "Stage $stage_num execution failed"
|
||||
return 1
|
||||
fi
|
||||
}
|
||||
|
||||
# Main execution
|
||||
main() {
|
||||
print_status "Starting $TRAINING_STAGE"
|
||||
echo
|
||||
|
||||
if execute_stage_from_json; then
|
||||
print_success "$TRAINING_STAGE completed"
|
||||
|
||||
# Output learnings for skill update
|
||||
output_stage_learnings 10 "Failure Recovery" \
|
||||
"aitbc-cli blockchain transaction <tx_id> --verbose|aitbc-cli wallet nonce <wallet>|curl -s http://localhost:8006/health|aitbc-cli blockchain sync --status|systemctl status aitbc-blockchain-node.service|aitbc-cli wallet export <wallet> --backup|aitbc-cli blockchain mempool|aitbc-cli blockchain transaction <tx_id> --trace|tail -f /var/log/aitbc/blockchain-node.log" \
|
||||
"Nonce too low: check current nonce and pending transactions|Insufficient funds: check balance and fees before transaction|Node health: check /health endpoint and systemctl status|Wallet backup: export wallet and backup keystore file|Network partition: check peer connections and sync status|Log monitoring: use grep ERROR or tail -f on /var/log/aitbc/" \
|
||||
"/var/lib/aitbc/keystore|/var/log/aitbc|/opt/aitbc/backups|/etc/aitbc/.env|/etc/aitbc/node.env" \
|
||||
"Transaction failure debugging|Node recovery procedures|Production monitoring|Backup and restore|Mempool inspection|Network partition handling"
|
||||
|
||||
return 0
|
||||
else
|
||||
print_error "$TRAINING_STAGE failed"
|
||||
return 1
|
||||
fi
|
||||
}
|
||||
|
||||
# Run main function
|
||||
main
|
||||
Reference in New Issue
Block a user