chore: remove outdated documentation and reference files
Some checks failed
AITBC CI/CD Pipeline / lint-and-test (3.11) (push) Has been cancelled
AITBC CI/CD Pipeline / lint-and-test (3.12) (push) Has been cancelled
AITBC CI/CD Pipeline / lint-and-test (3.13) (push) Has been cancelled
AITBC CI/CD Pipeline / test-cli (push) Has been cancelled
AITBC CI/CD Pipeline / test-services (push) Has been cancelled
AITBC CI/CD Pipeline / test-production-services (push) Has been cancelled
AITBC CI/CD Pipeline / security-scan (push) Has been cancelled
AITBC CI/CD Pipeline / build (push) Has been cancelled
AITBC CI/CD Pipeline / deploy-staging (push) Has been cancelled
AITBC CI/CD Pipeline / deploy-production (push) Has been cancelled
AITBC CI/CD Pipeline / performance-test (push) Has been cancelled
AITBC CI/CD Pipeline / docs (push) Has been cancelled
AITBC CI/CD Pipeline / release (push) Has been cancelled
AITBC CI/CD Pipeline / notify (push) Has been cancelled
Security Scanning / Bandit Security Scan (apps/coordinator-api/src) (push) Has been cancelled
Security Scanning / Bandit Security Scan (cli/aitbc_cli) (push) Has been cancelled
Security Scanning / Bandit Security Scan (packages/py/aitbc-core/src) (push) Has been cancelled
Security Scanning / Bandit Security Scan (packages/py/aitbc-crypto/src) (push) Has been cancelled
Security Scanning / Bandit Security Scan (packages/py/aitbc-sdk/src) (push) Has been cancelled
Security Scanning / Bandit Security Scan (tests) (push) Has been cancelled
Security Scanning / CodeQL Security Analysis (javascript) (push) Has been cancelled
Security Scanning / CodeQL Security Analysis (python) (push) Has been cancelled
Security Scanning / Dependency Security Scan (push) Has been cancelled
Security Scanning / Container Security Scan (push) Has been cancelled
Security Scanning / OSSF Scorecard (push) Has been cancelled
Security Scanning / Security Summary Report (push) Has been cancelled
AITBC CLI Level 1 Commands Test / test-cli-level1 (3.11) (push) Has been cancelled
AITBC CLI Level 1 Commands Test / test-cli-level1 (3.12) (push) Has been cancelled
AITBC CLI Level 1 Commands Test / test-cli-level1 (3.13) (push) Has been cancelled
AITBC CLI Level 1 Commands Test / test-summary (push) Has been cancelled
Some checks failed
AITBC CI/CD Pipeline / lint-and-test (3.11) (push) Has been cancelled
AITBC CI/CD Pipeline / lint-and-test (3.12) (push) Has been cancelled
AITBC CI/CD Pipeline / lint-and-test (3.13) (push) Has been cancelled
AITBC CI/CD Pipeline / test-cli (push) Has been cancelled
AITBC CI/CD Pipeline / test-services (push) Has been cancelled
AITBC CI/CD Pipeline / test-production-services (push) Has been cancelled
AITBC CI/CD Pipeline / security-scan (push) Has been cancelled
AITBC CI/CD Pipeline / build (push) Has been cancelled
AITBC CI/CD Pipeline / deploy-staging (push) Has been cancelled
AITBC CI/CD Pipeline / deploy-production (push) Has been cancelled
AITBC CI/CD Pipeline / performance-test (push) Has been cancelled
AITBC CI/CD Pipeline / docs (push) Has been cancelled
AITBC CI/CD Pipeline / release (push) Has been cancelled
AITBC CI/CD Pipeline / notify (push) Has been cancelled
Security Scanning / Bandit Security Scan (apps/coordinator-api/src) (push) Has been cancelled
Security Scanning / Bandit Security Scan (cli/aitbc_cli) (push) Has been cancelled
Security Scanning / Bandit Security Scan (packages/py/aitbc-core/src) (push) Has been cancelled
Security Scanning / Bandit Security Scan (packages/py/aitbc-crypto/src) (push) Has been cancelled
Security Scanning / Bandit Security Scan (packages/py/aitbc-sdk/src) (push) Has been cancelled
Security Scanning / Bandit Security Scan (tests) (push) Has been cancelled
Security Scanning / CodeQL Security Analysis (javascript) (push) Has been cancelled
Security Scanning / CodeQL Security Analysis (python) (push) Has been cancelled
Security Scanning / Dependency Security Scan (push) Has been cancelled
Security Scanning / Container Security Scan (push) Has been cancelled
Security Scanning / OSSF Scorecard (push) Has been cancelled
Security Scanning / Security Summary Report (push) Has been cancelled
AITBC CLI Level 1 Commands Test / test-cli-level1 (3.11) (push) Has been cancelled
AITBC CLI Level 1 Commands Test / test-cli-level1 (3.12) (push) Has been cancelled
AITBC CLI Level 1 Commands Test / test-cli-level1 (3.13) (push) Has been cancelled
AITBC CLI Level 1 Commands Test / test-summary (push) Has been cancelled
- Remove debugging service documentation (DEBUgging_SERVICES.md)
- Remove development logs policy and quick reference guides
- Remove E2E test creation summary
- Remove gift certificate example file
- Remove GitHub pull summary documentation
This commit is contained in:
253
scripts/monitoring/monitor-prs.py
Executable file
253
scripts/monitoring/monitor-prs.py
Executable file
@@ -0,0 +1,253 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Enhanced monitor for Gitea PRs:
|
||||
- Auto-request review from sibling on my PRs
|
||||
- Auto-validate sibling's PRs and approve if passing checks, with stability ring awareness
|
||||
- Monitor CI statuses and report failures
|
||||
- Release claim branches when associated PRs merge, close, or EXPIRE
|
||||
"""
|
||||
import json
import os
import pathlib
import shutil
import subprocess
import tempfile
from datetime import datetime, timezone
|
||||
|
||||
# Gitea connection/configuration.
# SECURITY FIX: the previous fallback hard-coded an API token directly in this
# file; any token committed to the repository must be considered leaked and
# rotated on the Gitea server. The token now comes exclusively from the
# environment (empty string when unset, which makes API calls fail cleanly).
GITEA_TOKEN = os.getenv('GITEA_TOKEN', '')
REPO = 'oib/aitbc'
API_BASE = os.getenv('GITEA_API_BASE', 'http://gitea.bubuit.net:3000/api/v1')
# This monitor runs as one of two sibling agents ('aitbc' / 'aitbc1') that
# review each other's pull requests.
MY_AGENT = os.getenv('AGENT_NAME', 'aitbc1')
SIBLING_AGENT = 'aitbc' if MY_AGENT == 'aitbc1' else 'aitbc1'
# Claim-lock state shared with claim-task.py.
CLAIM_STATE_FILE = '/opt/aitbc/.claim-state.json'
CLAIM_TTL_SECONDS = 7200  # Must match claim-task.py
|
||||
|
||||
def query_api(path, method='GET', data=None):
    """Call the Gitea REST API via curl and return the parsed JSON response.

    Args:
        path: API path relative to API_BASE (no leading slash).
        method: HTTP method, e.g. 'GET' or 'POST'.
        data: optional dict sent as a JSON request body.

    Returns:
        Decoded JSON (dict/list), or None on any failure — curl error,
        timeout, or a non-JSON response body — so callers can treat every
        API problem uniformly.
    """
    url = f"{API_BASE}/{path}"
    cmd = ['curl', '-s', '-H', f'Authorization: token {GITEA_TOKEN}', '-X', method]
    if data:
        cmd += ['-d', json.dumps(data), '-H', 'Content-Type: application/json']
    cmd.append(url)
    try:
        # Bound the call: previously there was no timeout, so a hung server
        # could stall the whole monitoring loop indefinitely.
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
    except subprocess.TimeoutExpired:
        return None
    if result.returncode != 0:
        return None
    try:
        return json.loads(result.stdout)
    except json.JSONDecodeError:
        return None
|
||||
|
||||
def get_pr_files(pr_number):
    """Return the changed-file objects for a PR; [] when the API call fails."""
    files = query_api(f'repos/{REPO}/pulls/{pr_number}/files')
    return files or []
|
||||
|
||||
def detect_ring(path):
    """Map a repository path to its stability ring.

    Ring 0 is the most sensitive (core packages); ring 3 the least
    (experiments). Paths matching no known prefix default to ring 2.
    """
    ring_prefixes = (
        (0, ('packages/py/aitbc-core/', 'packages/py/aitbc-sdk/',
             'packages/py/aitbc-agent-sdk/', 'packages/py/aitbc-crypto/')),
        (1, ('apps/coordinator-api/', 'apps/blockchain-node/',
             'apps/analytics/', 'services/')),
        (2, ('cli/', 'scripts/', 'tools/')),
        (3, ('experiments/', 'playground/', 'prototypes/', 'examples/')),
    )
    for ring, prefixes in ring_prefixes:
        # str.startswith accepts a tuple of candidate prefixes.
        if path.startswith(prefixes):
            return ring
    return 2
|
||||
|
||||
def load_claim_state():
    """Return the persisted claim-lock state dict, or {} when unavailable.

    Robustness fix: the original crashed the whole monitor if the state file
    was corrupt (json.JSONDecodeError) or disappeared between the existence
    check and the open (race). EAFP handles both: any unreadable/invalid
    state is treated as "no claim".
    """
    try:
        with open(CLAIM_STATE_FILE) as f:
            return json.load(f)
    except (OSError, json.JSONDecodeError):
        return {}
|
||||
|
||||
def save_claim_state(state):
    """Persist the claim-lock state to CLAIM_STATE_FILE as pretty-printed JSON."""
    serialized = json.dumps(state, indent=2)
    with open(CLAIM_STATE_FILE, 'w') as fh:
        fh.write(serialized)
|
||||
|
||||
def release_claim(issue_number, claim_branch):
    """Delete the remote claim branch (if it exists) and clear matching local state.

    Best-effort: git failures are swallowed (output captured, return codes
    ignored on delete) so a flaky remote cannot crash the monitor.
    """
    probe = subprocess.run(
        ['git', 'ls-remote', '--heads', 'origin', claim_branch],
        capture_output=True, text=True, cwd='/opt/aitbc')
    branch_exists = probe.returncode == 0 and bool(probe.stdout.strip())
    if branch_exists:
        subprocess.run(
            ['git', 'push', 'origin', '--delete', claim_branch],
            capture_output=True, cwd='/opt/aitbc')
    # Only clear local state when it actually refers to this issue.
    state = load_claim_state()
    if state.get('current_claim') == issue_number:
        state.clear()
        save_claim_state(state)
    print(f"✅ Released claim for issue #{issue_number} (deleted branch {claim_branch})")
|
||||
|
||||
def is_claim_expired(state):
    """Return True if the claim in *state* has passed its TTL deadline.

    ``expires_at`` is stored as a Unix epoch timestamp (see claim-task.py —
    assumed; confirm against the writer), so it must be compared against the
    current epoch time.

    Bug fix: the original used ``datetime.utcnow().timestamp()``.
    ``utcnow()`` returns a *naive* datetime which ``.timestamp()`` interprets
    as local time, skewing the comparison by the machine's UTC offset.
    ``datetime.now(timezone.utc).timestamp()`` yields the true epoch time.
    """
    expires_at = state.get('expires_at')
    if not expires_at:
        return False
    return datetime.now(timezone.utc).timestamp() > expires_at
|
||||
|
||||
def get_open_prs():
    """Return the repository's open pull requests; [] when the API call fails."""
    prs = query_api(f'repos/{REPO}/pulls?state=open')
    return prs or []
|
||||
|
||||
def get_all_prs(state='all'):
    """Return pull requests filtered by *state* ('all', 'open', 'closed'); [] on failure."""
    response = query_api(f'repos/{REPO}/pulls?state={state}')
    return response or []
|
||||
|
||||
def get_pr_reviews(pr_number):
    """Return the review objects attached to a PR; [] when the API call fails."""
    reviews = query_api(f'repos/{REPO}/pulls/{pr_number}/reviews')
    return reviews or []
|
||||
|
||||
def get_commit_statuses(pr_number):
    """Return commit-status objects for the PR's head SHA.

    Returns [] when the PR lookup fails or the statuses response is missing
    or not a list.
    """
    pr = query_api(f'repos/{REPO}/pulls/{pr_number}')
    if not pr:
        return []
    head_sha = pr['head']['sha']
    statuses = query_api(f'repos/{REPO}/commits/{head_sha}/statuses')
    return statuses if isinstance(statuses, list) else []
|
||||
|
||||
def request_reviewer(pr_number, reviewer):
    """Add *reviewer* to the PR's requested reviewers via the Gitea API."""
    payload = {"reviewers": [reviewer]}
    return query_api(f'repos/{REPO}/pulls/{pr_number}/requested_reviewers',
                     method='POST', data=payload)
|
||||
|
||||
def post_review(pr_number, state, body=''):
    """Submit a PR review of type *state* (e.g. APPROVED, CHANGES_REQUESTED, COMMENT)."""
    payload = {"body": body, "event": state}
    return query_api(f'repos/{REPO}/pulls/{pr_number}/reviews',
                     method='POST', data=payload)
|
||||
|
||||
def validate_pr_branch(pr):
    """Shallow-clone the PR's head branch and syntax-check its Python files.

    Args:
        pr: Gitea pull-request object (must contain head.ref; head.repo is
            used when present, falling back to REPO).

    Returns:
        (passed, message): passed is True when the clone succeeded and the
        first 20 Python files compile; message explains the result.
    """
    head = pr['head']
    ref = head['ref']
    repo = head.get('repo', {}).get('full_name', REPO)
    tmpdir = tempfile.mkdtemp(prefix='aitbc-pr-')
    try:
        clone_url = f"git@gitea.bubuit.net:{repo}.git"
        result = subprocess.run(['git', 'clone', '-b', ref, '--depth', '1', clone_url, tmpdir],
                                capture_output=True, text=True, timeout=60)
        if result.returncode != 0:
            return False, f"Clone failed: {result.stderr.strip()}"
        # Improvement: enumerate files with pathlib instead of shelling out to
        # `find`, and sort so the 20-file cap is deterministic across runs.
        py_files = sorted(str(p) for p in pathlib.Path(tmpdir).rglob('*.py'))
        for f in py_files[:20]:
            # Per-file timeout so one pathological file cannot hang the monitor.
            res = subprocess.run(['python3', '-m', 'py_compile', f],
                                 capture_output=True, text=True, cwd=tmpdir, timeout=30)
            if res.returncode != 0:
                return False, f"Syntax error in `{f}`: {res.stderr.strip()}"
        return True, "Automated validation passed."
    except Exception as e:
        return False, f"Validation error: {str(e)}"
    finally:
        # Always remove the scratch clone, even on failure/timeout.
        shutil.rmtree(tmpdir, ignore_errors=True)
|
||||
|
||||
def main():
    """Run one monitoring pass.

    Steps: reconcile the local claim lock (release on expiry or merged/closed
    PR), review/triage open PRs (auto-review sibling PRs, request reviews on
    our own, report failing CI statuses), then garbage-collect stale claim
    branches. Intended to be invoked periodically (e.g. from cron/systemd).
    """
    # One timestamp for the whole pass; utcnow() is naive, so tzinfo is
    # attached explicitly to obtain a correct aware UTC datetime.
    now = datetime.utcnow().replace(tzinfo=timezone.utc)
    now_iso = now.isoformat()
    now_ts = now.timestamp()
    print(f"[{now_iso}] Monitoring PRs and claim locks...")

    # 0. Check claim state: if we have a current claim, see if it expired or PR merged
    state = load_claim_state()
    if state.get('current_claim'):
        issue_num = state['current_claim']
        work_branch = state.get('work_branch')
        claim_branch = state.get('claim_branch')
        # Check expiration
        if is_claim_expired(state):
            print(f"Claim for issue #{issue_num} has expired. Releasing.")
            release_claim(issue_num, claim_branch)
        else:
            # Check if PR merged/closed. NOTE(review): this assumes merged PRs
            # are reported with state 'closed' — confirm against the Gitea API.
            all_prs = get_all_prs(state='all')
            matched_pr = None
            for pr in all_prs:
                if pr['head']['ref'] == work_branch:
                    matched_pr = pr
                    break
            if matched_pr and matched_pr['state'] == 'closed':
                release_claim(issue_num, claim_branch)

    # 1. Process open PRs
    open_prs = get_open_prs()
    notifications = []  # human-readable alert lines, printed at the end

    for pr in open_prs:
        number = pr['number']
        title = pr['title']  # currently unused
        author = pr['user']['login']
        head_ref = pr['head']['ref']  # currently unused

        # A. If PR from sibling, consider for review (only when we have not
        # reviewed this PR before).
        if author == SIBLING_AGENT:
            reviews = get_pr_reviews(number)
            my_reviews = [r for r in reviews if r['user']['login'] == MY_AGENT]
            if not my_reviews:
                files = get_pr_files(number)
                # Removed files are excluded from ring classification.
                rings = [detect_ring(f['filename']) for f in files if f.get('status') != 'removed']
                max_ring = max(rings) if rings else 2
                # NOTE(review): because max() is used, a PR counts as Ring 0
                # only when EVERY changed file is Ring 0; mixing in a single
                # higher-ring file bypasses the manual-review gate. If the
                # intent is "any core file triggers manual review", this
                # should be min(rings) — confirm the intended policy.
                if max_ring == 0:
                    body = "Automated analysis: This PR modifies core (Ring 0) components. Manual review and a design specification are required before merge. No auto-approval."
                    post_review(number, 'COMMENT', body=body)
                    notifications.append(f"PR #{number} (Ring 0) flagged for manual review")
                else:
                    passed, msg = validate_pr_branch(pr)
                    if passed:
                        post_review(number, 'APPROVED', body=f"Automated peer review: branch validated.\n\n✅ Syntax checks passed.\nRing {max_ring} change — auto-approved. CI must still pass.")
                        notifications.append(f"Auto-approved PR #{number} from @{author} (Ring {max_ring})")
                    else:
                        post_review(number, 'CHANGES_REQUESTED', body=f"Automated peer review detected issues:\n\n{msg}\n\nPlease fix and push.")
                        notifications.append(f"Requested changes on PR #{number} from @{author}: {msg[:100]}")

        # B. If PR from me, ensure sibling is requested as reviewer
        if author == MY_AGENT:
            pr_full = query_api(f'repos/{REPO}/pulls/{number}')
            requested = pr_full.get('requested_reviewers', []) if pr_full else []
            if not any(r.get('login') == SIBLING_AGENT for r in requested):
                request_reviewer(number, SIBLING_AGENT)
                notifications.append(f"Requested review from @{SIBLING_AGENT} for my PR #{number}")

        # C. Check CI statuses for any PR. Anything that is neither 'success'
        # nor 'pending' (e.g. failure/error) is reported.
        statuses = get_commit_statuses(number)
        failing = [s for s in statuses if s.get('status') not in ('success', 'pending')]
        if failing:
            for s in failing:
                notifications.append(f"PR #{number} status check failure: {s.get('context','unknown')} - {s.get('status','unknown')}")

    # 2. Global cleanup of stale claim branches (orphaned, older than TTL)
    cleanup_global_expired_claims(now_ts)

    if notifications:
        print("\n".join(notifications))
    else:
        print("No new alerts.")
|
||||
|
||||
def cleanup_global_expired_claims(now_ts=None):
    """Delete remote claim branches that are older than TTL, even if state file is gone.

    Args:
        now_ts: current Unix epoch time; computed when omitted.

    Bug fix: the fallback used ``datetime.utcnow().timestamp()``, which
    interprets the naive UTC datetime as *local* time. Commit timestamps from
    ``git show --format=%ct`` are true epoch seconds, so the age computation
    was skewed by the machine's UTC offset. ``datetime.now(timezone.utc)``
    yields the correct epoch time.
    """
    if now_ts is None:
        now_ts = datetime.now(timezone.utc).timestamp()
    # List all remote claim branches
    result = subprocess.run(['git', 'ls-remote', '--heads', 'origin', 'claim/*'],
                            capture_output=True, text=True, cwd='/opt/aitbc')
    if result.returncode != 0 or not result.stdout.strip():
        return
    lines = result.stdout.strip().split('\n')
    cleaned = 0
    for line in lines:
        if not line.strip():
            continue
        # Each ls-remote line is "<sha>\t<ref>".
        parts = line.split()
        if len(parts) < 2:
            continue
        sha, branch = parts[0], parts[1]
        # Get commit timestamp (epoch seconds, UTC) for the branch tip.
        ts_result = subprocess.run(['git', 'show', '-s', '--format=%ct', sha],
                                   capture_output=True, text=True, cwd='/opt/aitbc')
        if ts_result.returncode == 0 and ts_result.stdout.strip():
            commit_ts = int(ts_result.stdout.strip())
            age = now_ts - commit_ts
            if age > CLAIM_TTL_SECONDS:
                print(f"Expired claim branch: {branch} (age {age/3600:.1f}h). Deleting.")
                subprocess.run(['git', 'push', 'origin', '--delete', branch],
                               capture_output=True, cwd='/opt/aitbc')
                cleaned += 1
    if cleaned == 0:
        print(" cleanup_global_expired_claims: none")
    else:
        print(f" cleanup_global_expired_claims: removed {cleaned} expired branch(es)")
|
||||
|
||||
# Entry point: run a single monitoring pass when executed as a script.
if __name__ == '__main__':
    main()
|
||||
53
scripts/monitoring/nightly_health_check.sh
Normal file
53
scripts/monitoring/nightly_health_check.sh
Normal file
@@ -0,0 +1,53 @@
|
||||
#!/bin/bash
#
# AITBC Nightly Health Check
# Runs master planning cleanup and reports documentation/planning cleanliness.
#
set -e

PROJECT_ROOT="/opt/aitbc"
PLANNING_DIR="$PROJECT_ROOT/docs/10_plan"
DOCS_DIR="$PROJECT_ROOT/docs"
MASTER_WORKFLOW="$PROJECT_ROOT/scripts/run_master_planning_cleanup.sh"

# ANSI colors for log levels
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
RED='\033[0;31m'
NC='\033[0m'

log_info() { echo -e "${GREEN}[INFO]${NC} $1"; }
log_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; }
log_err() { echo -e "${RED}[ERROR]${NC} $1"; }

# Count markdown files under a directory.
count_md() { find "$1" -name "*.md" | wc -l; }

log_info "Starting nightly health check..."

# Run the master cleanup workflow first, when installed and executable.
if [[ -x "$MASTER_WORKFLOW" ]]; then
    log_info "Running master planning cleanup workflow..."
    "$MASTER_WORKFLOW" || log_warn "Master workflow reported issues; continuing to collect stats."
else
    log_warn "Master workflow script not found or not executable: $MASTER_WORKFLOW"
fi

log_info "Collecting documentation/planning stats..."
planning_files=$(count_md "$PLANNING_DIR")
completed_files=$(count_md "$DOCS_DIR/completed")
archive_files=$(count_md "$DOCS_DIR/archive")
documented_files=$(find "$DOCS_DIR" -name "documented_*.md" | wc -l)
completion_markers=$(find "$PLANNING_DIR" -name "*.md" -exec grep -l "✅" {} \; | wc -l)

echo "--- Nightly Health Check Summary ---"
echo "Planning files (docs/10_plan): $planning_files"
echo "Completed files (docs/completed): $completed_files"
echo "Archive files (docs/archive): $archive_files"
echo "Documented files (docs/): $documented_files"
echo "Files with completion markers: $completion_markers"

if [[ $completion_markers -eq 0 ]]; then
    log_info "Planning cleanliness OK (0 completion markers)."
else
    log_warn "Completion markers remain in planning files ($completion_markers)."
fi

log_info "Nightly health check completed."
|
||||
291
scripts/monitoring/production_monitoring.sh
Executable file
291
scripts/monitoring/production_monitoring.sh
Executable file
@@ -0,0 +1,291 @@
|
||||
#!/bin/bash
#
# Production Monitoring Setup for AITBC Platform
# Configures monitoring, alerting, and observability
#

# Strict mode: exit on error, unset-variable use, or pipeline failure.
set -euo pipefail

# Colors
GREEN='\033[0;32m'
BLUE='\033[0;34m'
YELLOW='\033[1;33m'
NC='\033[0m'

# Logging helpers: timestamped info, success/warning with status glyphs.
log() { echo -e "${BLUE}[$(date +'%Y-%m-%d %H:%M:%S')]${NC} $1"; }
success() { echo -e "${GREEN}✅ $1${NC}"; }
warning() { echo -e "${YELLOW}⚠️ $1${NC}"; }

# Create monitoring directory
# All generated scripts, metrics, and alert logs live under this path.
MONITORING_DIR="/opt/aitbc/monitoring"
mkdir -p "$MONITORING_DIR"
|
||||
|
||||
# Setup system metrics collection
|
||||
# Setup system metrics collection: writes collect_metrics.sh and schedules it.
setup_system_metrics() {
    log "Setting up system metrics collection..."

    # Create metrics collection script (quoted heredoc: no expansion here;
    # everything expands when the generated script runs).
    cat > "$MONITORING_DIR/collect_metrics.sh" << 'EOF'
#!/bin/bash
# System metrics collection for AITBC platform

METRICS_FILE="/opt/aitbc/monitoring/metrics.log"
TIMESTAMP=$(date -Iseconds)

# System metrics
CPU_USAGE=$(top -bn1 | grep "Cpu(s)" | awk '{print $2}' | sed 's/%us,//')
MEM_USAGE=$(free | grep Mem | awk '{printf "%.1f", $3/$2 * 100.0}')
DISK_USAGE=$(df -h / | awk 'NR==2{print $5}' | sed 's/%//')

# Service metrics
COORDINATOR_STATUS=$(systemctl is-active aitbc-coordinator)
BLOCKCHAIN_STATUS=$(systemctl is-active blockchain-node)

# API metrics
API_RESPONSE_TIME=$(curl -o /dev/null -s -w '%{time_total}' https://aitbc.bubuit.net/api/v1/health 2>/dev/null || echo "0")
API_STATUS=$(curl -o /dev/null -s -w '%{http_code}' https://aitbc.bubuit.net/api/v1/health 2>/dev/null || echo "000")

# Write metrics
echo "$TIMESTAMP,cpu:$CPU_USAGE,memory:$MEM_USAGE,disk:$DISK_USAGE,coordinator:$COORDINATOR_STATUS,blockchain:$BLOCKCHAIN_STATUS,api_time:$API_RESPONSE_TIME,api_status:$API_STATUS" >> "$METRICS_FILE"

# Keep only last 1000 lines
tail -n 1000 "$METRICS_FILE" > "$METRICS_FILE.tmp" && mv "$METRICS_FILE.tmp" "$METRICS_FILE"
EOF

    chmod +x "$MONITORING_DIR/collect_metrics.sh"

    # Add to crontab (every 2 minutes) — idempotently. BUGFIX: the entry was
    # previously appended unconditionally, so re-running this setup script
    # accumulated duplicate cron entries.
    if ! crontab -l 2>/dev/null | grep -qF "$MONITORING_DIR/collect_metrics.sh"; then
        (crontab -l 2>/dev/null; echo "*/2 * * * * $MONITORING_DIR/collect_metrics.sh") | crontab -
    fi

    success "System metrics collection configured"
}
|
||||
|
||||
# Setup alerting system
|
||||
# Setup alerting system: writes check_alerts.sh and schedules it.
setup_alerting() {
    log "Setting up alerting system..."

    # Create alerting script (quoted heredoc: expansion happens at run time).
    cat > "$MONITORING_DIR/check_alerts.sh" << 'EOF'
#!/bin/bash
# Alert checking for AITBC platform

ALERT_LOG="/opt/aitbc/monitoring/alerts.log"
TIMESTAMP=$(date -Iseconds)
ALERT_TRIGGERED=false

# Check service status
check_service() {
    local service=$1
    local status=$(systemctl is-active "$service" 2>/dev/null || echo "failed")

    if [[ "$status" != "active" ]]; then
        echo "$TIMESTAMP,SERVICE,$service is $status" >> "$ALERT_LOG"
        echo "🚨 ALERT: Service $service is $status"
        ALERT_TRIGGERED=true
    fi
}

# Check API health
check_api() {
    local response=$(curl -s -o /dev/null -w '%{http_code}' https://aitbc.bubuit.net/api/v1/health 2>/dev/null || echo "000")

    if [[ "$response" != "200" ]]; then
        echo "$TIMESTAMP,API,Health endpoint returned $response" >> "$ALERT_LOG"
        echo "🚨 ALERT: API health check failed (HTTP $response)"
        ALERT_TRIGGERED=true
    fi
}

# Check disk space
check_disk() {
    local usage=$(df / | awk 'NR==2{print $5}' | sed 's/%//')

    if [[ $usage -gt 80 ]]; then
        echo "$TIMESTAMP,DISK,Disk usage is ${usage}%" >> "$ALERT_LOG"
        echo "🚨 ALERT: Disk usage is ${usage}%"
        ALERT_TRIGGERED=true
    fi
}

# Check memory usage
check_memory() {
    local usage=$(free | grep Mem | awk '{printf "%.0f", $3/$2 * 100.0}')

    if [[ $usage -gt 90 ]]; then
        echo "$TIMESTAMP,MEMORY,Memory usage is ${usage}%" >> "$ALERT_LOG"
        echo "🚨 ALERT: Memory usage is ${usage}%"
        ALERT_TRIGGERED=true
    fi
}

# Run checks
check_service "aitbc-coordinator"
check_service "blockchain-node"
check_api
check_disk
check_memory

# If no alerts, log all clear
if [[ "$ALERT_TRIGGERED" == "false" ]]; then
    echo "$TIMESTAMP,ALL_CLEAR,All systems operational" >> "$ALERT_LOG"
fi
EOF

    chmod +x "$MONITORING_DIR/check_alerts.sh"

    # Add to crontab (every 5 minutes) — idempotently. BUGFIX: previously the
    # entry was appended on every run, creating duplicate cron jobs.
    if ! crontab -l 2>/dev/null | grep -qF "$MONITORING_DIR/check_alerts.sh"; then
        (crontab -l 2>/dev/null; echo "*/5 * * * * $MONITORING_DIR/check_alerts.sh") | crontab -
    fi

    success "Alerting system configured"
}
|
||||
|
||||
# Setup performance dashboard
|
||||
# Setup performance dashboard
# Writes a self-refreshing terminal dashboard script (sleeps 30s, then
# re-execs itself) to $MONITORING_DIR/dashboard.sh.
setup_dashboard() {
    log "Setting up performance dashboard..."

    # Create dashboard script (quoted heredoc: all $(...) and $VARS expand
    # when dashboard.sh runs, not here).
    cat > "$MONITORING_DIR/dashboard.sh" << 'EOF'
#!/bin/bash
# Performance dashboard for AITBC platform

clear
echo "🔍 AITBC Platform Performance Dashboard"
echo "========================================"
echo "Last Updated: $(date)"
echo ""

# System Status
echo "📊 System Status:"
echo "CPU: $(top -bn1 | grep "Cpu(s)" | awk '{print $2}' | sed 's/%us,//')% used"
echo "Memory: $(free -h | grep Mem | awk '{print $3"/"$2}')"
echo "Disk: $(df -h / | awk 'NR==2{print $3"/"$2" ("$5")"}')"
echo ""

# Service Status
echo "🔧 Service Status:"
systemctl is-active aitbc-coordinator && echo "✅ Coordinator API: Active" || echo "❌ Coordinator API: Inactive"
systemctl is-active blockchain-node && echo "✅ Blockchain Node: Active" || echo "❌ Blockchain Node: Inactive"
systemctl is-active nginx && echo "✅ Nginx: Active" || echo "❌ Nginx: Inactive"
echo ""

# API Performance
echo "🌐 API Performance:"
API_TIME=$(curl -o /dev/null -s -w '%{time_total}' https://aitbc.bubuit.net/api/v1/health 2>/dev/null || echo "0.000")
echo "Health Endpoint: ${API_TIME}s"
echo ""

# Recent Alerts (last 10)
echo "🚨 Recent Alerts:"
if [[ -f /opt/aitbc/monitoring/alerts.log ]]; then
    tail -n 10 /opt/aitbc/monitoring/alerts.log | while IFS=',' read -r timestamp type message; do
        echo " $timestamp: $message"
    done
else
    echo " No alerts logged"
fi
echo ""

# Quick Stats
echo "📈 Quick Stats:"
if [[ -f /opt/aitbc/monitoring/metrics.log ]]; then
    echo " Metrics collected: $(wc -l < /opt/aitbc/monitoring/metrics.log) entries"
    echo " Alerts triggered: $(grep -c "ALERT" /opt/aitbc/monitoring/alerts.log 2>/dev/null || echo "0")"
fi

echo ""
echo "Press Ctrl+C to exit, or refresh in 30 seconds..."
sleep 30
exec "$0"
EOF

    chmod +x "$MONITORING_DIR/dashboard.sh"

    success "Performance dashboard created"
}
|
||||
|
||||
# Setup log analysis
|
||||
# Setup log analysis: writes analyze_logs.sh and schedules it hourly.
setup_log_analysis() {
    log "Setting up log analysis..."

    # Create log analysis script.
    # BUGFIX: the generated script previously declared `local total/errors/
    # error_rate` at top level inside the nginx `if` block; `local` is only
    # legal inside a shell function, so the script failed at runtime. The
    # `local` keywords are removed, and the error-rate division is guarded
    # against an empty access log (bc division by zero).
    cat > "$MONITORING_DIR/analyze_logs.sh" << 'EOF'
#!/bin/bash
# Log analysis for AITBC platform

LOG_DIR="/var/log"
ANALYSIS_FILE="/opt/aitbc/monitoring/log_analysis.txt"
TIMESTAMP=$(date -Iseconds)

echo "=== Log Analysis - $TIMESTAMP ===" >> "$ANALYSIS_FILE"

# Analyze nginx logs
if [[ -f "$LOG_DIR/nginx/access.log" ]]; then
    echo "" >> "$ANALYSIS_FILE"
    echo "NGINX Access Analysis:" >> "$ANALYSIS_FILE"

    # Top 10 endpoints
    echo "Top 10 endpoints:" >> "$ANALYSIS_FILE"
    awk '{print $7}' "$LOG_DIR/nginx/access.log" | sort | uniq -c | sort -nr | head -10 >> "$ANALYSIS_FILE"

    # HTTP status codes
    echo "" >> "$ANALYSIS_FILE"
    echo "HTTP Status Codes:" >> "$ANALYSIS_FILE"
    awk '{print $9}' "$LOG_DIR/nginx/access.log" | sort | uniq -c | sort -nr >> "$ANALYSIS_FILE"

    # Error rate
    total=$(wc -l < "$LOG_DIR/nginx/access.log")
    errors=$(awk '$9 >= 400 {print}' "$LOG_DIR/nginx/access.log" | wc -l)
    if [[ $total -gt 0 ]]; then
        error_rate=$(echo "scale=2; $errors * 100 / $total" | bc)
    else
        error_rate="0"
    fi
    echo "" >> "$ANALYSIS_FILE"
    echo "Error Rate: ${error_rate}%" >> "$ANALYSIS_FILE"
fi

# Analyze application logs
if journalctl -u aitbc-coordinator --since "1 hour ago" | grep -q "ERROR"; then
    echo "" >> "$ANALYSIS_FILE"
    echo "Application Errors (last hour):" >> "$ANALYSIS_FILE"
    journalctl -u aitbc-coordinator --since "1 hour ago" | grep "ERROR" | tail -5 >> "$ANALYSIS_FILE"
fi

echo "Analysis complete" >> "$ANALYSIS_FILE"
EOF

    chmod +x "$MONITORING_DIR/analyze_logs.sh"

    # Add to crontab (hourly) — idempotently, so re-running setup does not
    # append duplicate cron entries.
    if ! crontab -l 2>/dev/null | grep -qF "$MONITORING_DIR/analyze_logs.sh"; then
        (crontab -l 2>/dev/null; echo "0 * * * * $MONITORING_DIR/analyze_logs.sh") | crontab -
    fi

    success "Log analysis configured"
}
|
||||
|
||||
# Main execution
|
||||
# Main execution
# Installs all four monitoring components, then prints a usage summary.
main() {
    log "Setting up AITBC Production Monitoring..."

    setup_system_metrics
    setup_alerting
    setup_dashboard
    setup_log_analysis

    success "Production monitoring setup complete!"

    # Summary of what was installed and where to find it.
    echo
    echo "📊 MONITORING SUMMARY:"
    echo " ✅ System metrics collection (every 2 minutes)"
    echo " ✅ Alert checking (every 5 minutes)"
    echo " ✅ Performance dashboard"
    echo " ✅ Log analysis (hourly)"
    echo
    echo "🔧 MONITORING COMMANDS:"
    echo " Dashboard: $MONITORING_DIR/dashboard.sh"
    echo " Metrics: $MONITORING_DIR/collect_metrics.sh"
    echo " Alerts: $MONITORING_DIR/check_alerts.sh"
    echo " Log Analysis: $MONITORING_DIR/analyze_logs.sh"
    echo
    echo "📁 MONITORING FILES:"
    echo " Metrics: $MONITORING_DIR/metrics.log"
    echo " Alerts: $MONITORING_DIR/alerts.log"
    echo " Analysis: $MONITORING_DIR/log_analysis.txt"
}
|
||||
|
||||
main "$@"
|
||||
Reference in New Issue
Block a user