aitbc/scripts/production_monitoring.sh

#!/bin/bash
#
# Production Monitoring Setup for AITBC Platform
# Configures monitoring, alerting, and observability
#

set -euo pipefail

# ANSI color sequences. They are stored as literal backslash escapes and only
# turned into real control characters by `echo -e` when a message is printed.
GREEN='\033[0;32m'
BLUE='\033[0;34m'
YELLOW='\033[1;33m'
NC='\033[0m'

# Print an informational message prefixed with a blue [YYYY-MM-DD HH:MM:SS] stamp.
# Arguments: $1 - message text (backslash escapes are interpreted)
log() {
    local stamp
    stamp=$(date +'%Y-%m-%d %H:%M:%S')
    echo -e "${BLUE}[${stamp}]${NC} $1"
}

# Print a green success line.
# Arguments: $1 - message text (backslash escapes are interpreted)
success() {
    local text="✅ $1"
    echo -e "${GREEN}${text}${NC}"
}

# Print a yellow warning line.
# Arguments: $1 - message text (backslash escapes are interpreted)
warning() {
    local text="⚠️  $1"
    echo -e "${YELLOW}${text}${NC}"
}

# Directory that receives the generated monitoring scripts; it is created
# up front (parents included) so the setup functions can write into it.
MONITORING_DIR=/opt/aitbc/monitoring
mkdir -p -- "${MONITORING_DIR}"

# Setup system metrics collection
#
# Writes $MONITORING_DIR/collect_metrics.sh, marks it executable and registers
# it in the invoking user's crontab to run every 2 minutes.
# Globals:   MONITORING_DIR (read)
# Outputs:   progress messages through log / success
# Notes:     safe to re-run; the cron entry is only added when it is missing.
setup_system_metrics() {
    log "Setting up system metrics collection..."

    local collector="$MONITORING_DIR/collect_metrics.sh"

    # Create metrics collection script (quoted delimiter: nothing is expanded here)
    cat > "$collector" << 'EOF'
#!/bin/bash
# System metrics collection for AITBC platform

METRICS_FILE="/opt/aitbc/monitoring/metrics.log"
HEALTH_URL="https://aitbc.bubuit.net/api/v1/health"
TIMESTAMP=$(date -Iseconds)

# System metrics
CPU_USAGE=$(top -bn1 | grep "Cpu(s)" | awk '{print $2}' | sed 's/%us,//')
MEM_USAGE=$(free | grep Mem | awk '{printf "%.1f", $3/$2 * 100.0}')
DISK_USAGE=$(df -h / | awk 'NR==2{print $5}' | sed 's/%//')

# Service metrics
COORDINATOR_STATUS=$(systemctl is-active aitbc-coordinator)
BLOCKCHAIN_STATUS=$(systemctl is-active blockchain-node)

# API metrics: a single request provides both the status code and the timing.
# curl still prints its write-out ("000" and the elapsed time) when the request
# fails, so no "|| echo" fallback is appended (it would duplicate the value).
# The defaults below only apply when curl printed nothing at all.
API_RESULT=$(curl -o /dev/null -s --max-time 10 -w '%{http_code} %{time_total}' "$HEALTH_URL" 2>/dev/null)
API_STATUS=${API_RESULT%% *}
API_RESPONSE_TIME=${API_RESULT#* }
API_STATUS=${API_STATUS:-000}
API_RESPONSE_TIME=${API_RESPONSE_TIME:-0}

# Write metrics
echo "$TIMESTAMP,cpu:$CPU_USAGE,memory:$MEM_USAGE,disk:$DISK_USAGE,coordinator:$COORDINATOR_STATUS,blockchain:$BLOCKCHAIN_STATUS,api_time:$API_RESPONSE_TIME,api_status:$API_STATUS" >> "$METRICS_FILE"

# Keep only last 1000 lines
tail -n 1000 "$METRICS_FILE" > "$METRICS_FILE.tmp" && mv "$METRICS_FILE.tmp" "$METRICS_FILE"
EOF

    chmod +x "$collector"

    # Add to crontab (every 2 minutes).
    # `crontab -l` exits non-zero when the user has no crontab yet; that is
    # treated as an empty table instead of a failure. The entry is appended
    # only if the exact line is not already present, so re-running the setup
    # does not pile up duplicates.
    local cron_entry="*/2 * * * * $collector"
    local current_cron
    current_cron=$(crontab -l 2>/dev/null || true)
    if ! grep -Fxq -- "$cron_entry" <<< "$current_cron"; then
        {
            if [[ -n "$current_cron" ]]; then
                printf '%s\n' "$current_cron"
            fi
            printf '%s\n' "$cron_entry"
        } | crontab -
    fi

    success "System metrics collection configured"
}

# Setup alerting system
#
# Writes $MONITORING_DIR/check_alerts.sh, marks it executable and registers it
# in the invoking user's crontab to run every 5 minutes.
# Globals:   MONITORING_DIR (read)
# Outputs:   progress messages through log / success
# Notes:     safe to re-run; the cron entry is only added when it is missing.
setup_alerting() {
    log "Setting up alerting system..."

    local checker="$MONITORING_DIR/check_alerts.sh"

    # Create alerting script (quoted delimiter: nothing is expanded here)
    cat > "$checker" << 'EOF'
#!/bin/bash
# Alert checking for AITBC platform

ALERT_LOG="/opt/aitbc/monitoring/alerts.log"
HEALTH_URL="https://aitbc.bubuit.net/api/v1/health"
TIMESTAMP=$(date -Iseconds)
ALERT_TRIGGERED=false

# Check service status
check_service() {
    local service=$1
    local status
    # is-active prints the state itself (e.g. "inactive") even when it exits
    # non-zero, so a fallback is used only when it printed nothing; appending
    # one unconditionally would produce a two-line value and break the log.
    status=$(systemctl is-active "$service" 2>/dev/null)
    [[ -n "$status" ]] || status="failed"

    if [[ "$status" != "active" ]]; then
        echo "$TIMESTAMP,SERVICE,$service is $status" >> "$ALERT_LOG"
        echo "🚨 ALERT: Service $service is $status"
        ALERT_TRIGGERED=true
    fi
}

# Check API health
check_api() {
    local response
    # curl reports "000" by itself when the request fails; the default below
    # only covers the case where curl produced no output at all.
    response=$(curl -s -o /dev/null --max-time 10 -w '%{http_code}' "$HEALTH_URL" 2>/dev/null)
    [[ -n "$response" ]] || response="000"

    if [[ "$response" != "200" ]]; then
        echo "$TIMESTAMP,API,Health endpoint returned $response" >> "$ALERT_LOG"
        echo "🚨 ALERT: API health check failed (HTTP $response)"
        ALERT_TRIGGERED=true
    fi
}

# Check disk space
check_disk() {
    local usage
    # -P keeps each filesystem on one line so the column position is stable
    usage=$(df -P / | awk 'NR==2{print $5}' | sed 's/%//')

    # Only compare when a number was actually parsed
    if [[ "$usage" =~ ^[0-9]+$ ]] && (( usage > 80 )); then
        echo "$TIMESTAMP,DISK,Disk usage is ${usage}%" >> "$ALERT_LOG"
        echo "🚨 ALERT: Disk usage is ${usage}%"
        ALERT_TRIGGERED=true
    fi
}

# Check memory usage
check_memory() {
    local usage
    usage=$(free | grep Mem | awk '{printf "%.0f", $3/$2 * 100.0}')

    # Only compare when a number was actually parsed
    if [[ "$usage" =~ ^[0-9]+$ ]] && (( usage > 90 )); then
        echo "$TIMESTAMP,MEMORY,Memory usage is ${usage}%" >> "$ALERT_LOG"
        echo "🚨 ALERT: Memory usage is ${usage}%"
        ALERT_TRIGGERED=true
    fi
}

# Run checks
check_service "aitbc-coordinator"
check_service "blockchain-node"
check_api
check_disk
check_memory

# If no alerts, log all clear
if [[ "$ALERT_TRIGGERED" == "false" ]]; then
    echo "$TIMESTAMP,ALL_CLEAR,All systems operational" >> "$ALERT_LOG"
fi
EOF

    chmod +x "$checker"

    # Add to crontab (every 5 minutes).
    # `crontab -l` exits non-zero when the user has no crontab yet; that is
    # treated as an empty table instead of a failure. The entry is appended
    # only if the exact line is not already present, so re-running the setup
    # does not pile up duplicates.
    local cron_entry="*/5 * * * * $checker"
    local current_cron
    current_cron=$(crontab -l 2>/dev/null || true)
    if ! grep -Fxq -- "$cron_entry" <<< "$current_cron"; then
        {
            if [[ -n "$current_cron" ]]; then
                printf '%s\n' "$current_cron"
            fi
            printf '%s\n' "$cron_entry"
        } | crontab -
    fi

    success "Alerting system configured"
}

# Setup performance dashboard
#
# Writes $MONITORING_DIR/dashboard.sh and marks it executable. The generated
# script redraws a status screen every 30 seconds; `dashboard.sh --once`
# renders a single frame and exits.
# Globals:   MONITORING_DIR (read)
# Outputs:   progress messages through log / success
setup_dashboard() {
    log "Setting up performance dashboard..."

    local dashboard="$MONITORING_DIR/dashboard.sh"

    # Create dashboard script (quoted delimiter: nothing is expanded here)
    cat > "$dashboard" << 'EOF'
#!/bin/bash
# Performance dashboard for AITBC platform
# Usage: dashboard.sh [--once]
#   --once  render a single frame and exit instead of refreshing every 30s

MONITORING_DIR="/opt/aitbc/monitoring"
ALERT_LOG="$MONITORING_DIR/alerts.log"
METRICS_LOG="$MONITORING_DIR/metrics.log"
HEALTH_URL="https://aitbc.bubuit.net/api/v1/health"

# Print one status line for a systemd unit.
# --quiet keeps systemctl from printing the raw state into the dashboard.
service_line() {
    local unit=$1 label=$2
    if systemctl is-active --quiet "$unit"; then
        echo "✅ $label: Active"
    else
        echo "❌ $label: Inactive"
    fi
}

# Draw one complete dashboard frame
render_dashboard() {
    local api_time recent alert_count timestamp message

    clear
    echo "🔍 AITBC Platform Performance Dashboard"
    echo "========================================"
    echo "Last Updated: $(date)"
    echo ""

    # System Status
    echo "📊 System Status:"
    echo "CPU: $(top -bn1 | grep "Cpu(s)" | awk '{print $2}' | sed 's/%us,//')% used"
    echo "Memory: $(free -h | grep Mem | awk '{print $3"/"$2}')"
    echo "Disk: $(df -h / | awk 'NR==2{print $3"/"$2" ("$5")"}')"
    echo ""

    # Service Status
    echo "🔧 Service Status:"
    service_line aitbc-coordinator "Coordinator API"
    service_line blockchain-node "Blockchain Node"
    service_line nginx "Nginx"
    echo ""

    # API Performance
    # curl prints the elapsed time even when the request fails, so the
    # default only applies when curl produced no output at all.
    echo "🌐 API Performance:"
    api_time=$(curl -o /dev/null -s --max-time 10 -w '%{time_total}' "$HEALTH_URL" 2>/dev/null)
    echo "Health Endpoint: ${api_time:-0.000}s"
    echo ""

    # Recent Alerts (last 10; ALL_CLEAR heartbeat lines are not alerts)
    echo "🚨 Recent Alerts:"
    recent=""
    if [[ -f "$ALERT_LOG" ]]; then
        recent=$(grep -v ',ALL_CLEAR,' "$ALERT_LOG" | tail -n 10)
    fi
    if [[ -n "$recent" ]]; then
        while IFS=',' read -r timestamp _ message; do
            echo "  $timestamp: $message"
        done <<< "$recent"
    else
        echo "  No alerts logged"
    fi
    echo ""

    # Quick Stats
    echo "📈 Quick Stats:"
    if [[ -f "$METRICS_LOG" ]]; then
        echo "  Metrics collected: $(wc -l < "$METRICS_LOG") entries"
        # Every line that is not an ALL_CLEAR heartbeat is one alert.
        # grep -c prints the count itself (including 0), so the default is
        # only used when the alert log does not exist.
        alert_count=$(grep -vc ',ALL_CLEAR,' "$ALERT_LOG" 2>/dev/null)
        echo "  Alerts triggered: ${alert_count:-0}"
    fi
}

# Refresh in a loop rather than re-exec'ing "$0", which fails when the script
# was started as `bash dashboard.sh` from another directory's PATH lookup.
while true; do
    render_dashboard
    if [[ "${1:-}" == "--once" ]]; then
        break
    fi
    echo ""
    echo "Press Ctrl+C to exit, or refresh in 30 seconds..."
    sleep 30
done
EOF

    chmod +x "$dashboard"

    success "Performance dashboard created"
}

# Setup log analysis
#
# Writes $MONITORING_DIR/analyze_logs.sh, marks it executable and registers it
# in the invoking user's crontab to run at the top of every hour.
# Globals:   MONITORING_DIR (read)
# Outputs:   progress messages through log / success
# Notes:     safe to re-run; the cron entry is only added when it is missing.
setup_log_analysis() {
    log "Setting up log analysis..."

    local analyzer="$MONITORING_DIR/analyze_logs.sh"

    # Create log analysis script (quoted delimiter: nothing is expanded here)
    cat > "$analyzer" << 'EOF'
#!/bin/bash
# Log analysis for AITBC platform

LOG_DIR="/var/log"
ANALYSIS_FILE="/opt/aitbc/monitoring/log_analysis.txt"
TIMESTAMP=$(date -Iseconds)
ACCESS_LOG="$LOG_DIR/nginx/access.log"

echo "=== Log Analysis - $TIMESTAMP ===" >> "$ANALYSIS_FILE"

# Analyze nginx logs
if [[ -f "$ACCESS_LOG" ]]; then
    {
        echo ""
        echo "NGINX Access Analysis:"

        # Top 10 endpoints
        echo "Top 10 endpoints:"
        awk '{print $7}' "$ACCESS_LOG" | sort | uniq -c | sort -nr | head -10

        # HTTP status codes
        echo ""
        echo "HTTP Status Codes:"
        awk '{print $9}' "$ACCESS_LOG" | sort | uniq -c | sort -nr

        # Error rate: share of requests whose status field is a 4xx/5xx code.
        # Computed in one awk pass (no bc needed); an empty log reports 0.00%
        # instead of dividing by zero. The value is rounded to two decimals.
        echo ""
        awk '$9 ~ /^[45][0-9][0-9]$/ { errors++ }
             END {
                 rate = (NR > 0) ? errors * 100 / NR : 0
                 printf "Error Rate: %.2f%%\n", rate
             }' "$ACCESS_LOG"
    } >> "$ANALYSIS_FILE"
fi

# Analyze application logs (journal is queried once and the result reused)
APP_ERRORS=$(journalctl -u aitbc-coordinator --since "1 hour ago" | grep "ERROR" | tail -5)
if [[ -n "$APP_ERRORS" ]]; then
    {
        echo ""
        echo "Application Errors (last hour):"
        echo "$APP_ERRORS"
    } >> "$ANALYSIS_FILE"
fi

echo "Analysis complete" >> "$ANALYSIS_FILE"
EOF

    chmod +x "$analyzer"

    # Add to crontab (hourly).
    # `crontab -l` exits non-zero when the user has no crontab yet; that is
    # treated as an empty table instead of a failure. The entry is appended
    # only if the exact line is not already present, so re-running the setup
    # does not pile up duplicates.
    local cron_entry="0 * * * * $analyzer"
    local current_cron
    current_cron=$(crontab -l 2>/dev/null || true)
    if ! grep -Fxq -- "$cron_entry" <<< "$current_cron"; then
        {
            if [[ -n "$current_cron" ]]; then
                printf '%s\n' "$current_cron"
            fi
            printf '%s\n' "$cron_entry"
        } | crontab -
    fi

    success "Log analysis configured"
}

# Main execution
#
# Runs the four setup steps in order and then prints a summary listing the
# schedules, the generated commands and the files they write.
# Globals: MONITORING_DIR (read)
main() {
    log "Setting up AITBC Production Monitoring..."

    local step
    for step in \
        setup_system_metrics \
        setup_alerting \
        setup_dashboard \
        setup_log_analysis; do
        "$step"
    done

    success "Production monitoring setup complete!"

    # One printf argument per output line; "" yields the blank separators
    printf '%s\n' \
        "" \
        "📊 MONITORING SUMMARY:" \
        "   ✅ System metrics collection (every 2 minutes)" \
        "   ✅ Alert checking (every 5 minutes)" \
        "   ✅ Performance dashboard" \
        "   ✅ Log analysis (hourly)" \
        "" \
        "🔧 MONITORING COMMANDS:" \
        "   Dashboard: $MONITORING_DIR/dashboard.sh" \
        "   Metrics: $MONITORING_DIR/collect_metrics.sh" \
        "   Alerts: $MONITORING_DIR/check_alerts.sh" \
        "   Log Analysis: $MONITORING_DIR/analyze_logs.sh" \
        "" \
        "📁 MONITORING FILES:" \
        "   Metrics: $MONITORING_DIR/metrics.log" \
        "   Alerts: $MONITORING_DIR/alerts.log" \
        "   Analysis: $MONITORING_DIR/log_analysis.txt"
}

main "$@"