Files
aitbc/monitoring/advanced-features-monitoring.yml
oib 864ef4343e refactor(contracts): remove deprecated AIPowerRental contract in favor of bounty system
- Delete AIPowerRental.sol (566 lines) - replaced by AgentBounty.sol
- Remove rental agreement system with provider/consumer model
- Remove performance metrics and SLA tracking
- Remove dispute resolution mechanism
- Remove ZK-proof verification for performance
- Remove provider/consumer authorization system
- Bounty system provides superior developer incentive structure
2026-02-27 21:46:54 +01:00

393 lines
11 KiB
YAML

---
# AITBC Advanced Agent Features Production Monitoring
# Docker Compose configuration for production monitoring services
#
# Review fixes applied:
#   - restored 2-space indentation (the file had been flattened to column 0)
#   - healthchecks that embedded "|| exit 1" inside a single exec-form
#     argument now use CMD-SHELL so the shell actually evaluates it
#   - fluent-bit healthcheck split into proper exec-form argv
#   - Grafana container port aligned with the "3001:3000" mapping
#   - IPAM gateway moved into the same config entry as its subnet
#   - invalid top-level `logging:` key converted to an x- extension field
#   - process-exporter: removed node-exporter-only `--path.procfs` flag
#   - blackbox-exporter: `--config.file` now points at the mounted config
version: '3.8'

# Shared logging defaults.
# NOTE(review): this was a top-level `logging:` mapping, which Compose
# rejects as an unknown key. Converted to an extension field with an
# anchor; individual services can opt in via `logging: *default-logging`.
x-logging: &default-logging
  driver: "json-file"
  options:
    max-size: "10m"
    max-file: "3"

services:
  # Cross-Chain Reputation Monitoring
  reputation-monitor:
    image: prom/prometheus:v2.45.0
    container_name: reputation-monitor
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
      - ./rules:/etc/prometheus/rules
      - prometheus-data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--storage.tsdb.retention.time=200h'
      - '--storage.tsdb.retention.size=50GB'
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      - '--web.enable-lifecycle'
      - '--web.enable-admin-api'
    restart: unless-stopped
    networks:
      - monitoring
    healthcheck:
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9090/-/healthy"]
      interval: 30s
      timeout: 10s
      retries: 3
    labels:
      - "service=monitoring"
      - "component=prometheus"

  # Agent Communication Monitoring
  communication-monitor:
    image: grafana/grafana:10.0.0
    container_name: communication-monitor
    ports:
      # Host 3001 -> container 3000 (Grafana's default HTTP port).
      - "3001:3000"
    environment:
      - GF_SECURITY_ADMIN_USER=admin
      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD}
      - GF_USERS_ALLOW_SIGN_UP=false
      - GF_USERS_ALLOW_ORG_CREATE=false
      - GF_INSTALL_PLUGINS=grafana-piechart-panel,grafana-worldmap-panel,grafana-clock-panel,grafana-simple-json-datasource
      # Root URL reflects the host-side port mapping above.
      - GF_SERVER_ROOT_URL=http://localhost:3001
      - GF_SERVER_DOMAIN=localhost
      - GF_SERVER_PROTOCOL=http
      # Was 3001, which contradicted the "3001:3000" port mapping —
      # the container must listen on 3000 for the mapping to work.
      - GF_SERVER_HTTP_PORT=3000
      - GF_SERVER_ENFORCE_DOMAIN=false
      - GF_SMTP_ENABLED=false
      - GF_LOG_LEVEL=info
      - GF_LOG_MODE=file
      - GF_PATHS_LOGS=/var/log/grafana
      - GF_PATHS_PLUGINS=/var/lib/grafana/plugins
      - GF_PATHS_PROVISIONING=/etc/grafana/provisioning
      - GF_PATHS_DATA=/var/lib/grafana
    volumes:
      - ./grafana/provisioning:/etc/grafana/provisioning
      - ./grafana/dashboards:/var/lib/grafana/dashboards
      - grafana-data:/var/lib/grafana
      - grafana-logs:/var/log/grafana
    restart: unless-stopped
    networks:
      - monitoring
    healthcheck:
      # wget (busybox) is present in the Alpine-based Grafana image;
      # curl is not guaranteed. Probe the in-container port (3000).
      test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:3000/api/health || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 3
    labels:
      - "service=monitoring"
      - "component=grafana"

  # Advanced Learning Monitoring
  learning-monitor:
    image: node:18-alpine
    container_name: learning-monitor
    working_dir: /app
    volumes:
      - ./learning-monitor:/app
      - learning-monitor-logs:/app/logs
    command: npm start
    restart: unless-stopped
    networks:
      - monitoring
    healthcheck:
      # CMD-SHELL so the "|| exit 1" fallback is actually evaluated
      # (exec form passed it as part of the URL argument).
      test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:8002/health || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 3
    labels:
      - "service=monitoring"
      - "component=learning-monitor"

  # Agent Collaboration Monitoring
  collaboration-monitor:
    image: node:18-alpine
    container_name: collaboration-monitor
    working_dir: /app
    volumes:
      - ./collaboration-monitor:/app
      - collaboration-monitor-logs:/app/logs
    command: npm start
    restart: unless-stopped
    networks:
      - monitoring
    healthcheck:
      test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:8003/health || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 3
    labels:
      - "service=monitoring"
      - "component=collaboration-monitor"

  # Log Aggregation
  log-aggregator:
    image: fluent/fluent-bit:v2.2.0
    container_name: log-aggregator
    volumes:
      - ./fluent-bit.conf:/fluent-bit/etc/fluent-bit.conf
      - /var/log:/var/log:ro
      - /var/log/containers:/var/log/containers:ro
      - /var/log/pods:/var/log/pods:ro
      - fluent-bit-data:/var/log/fluent-bit
    restart: unless-stopped
    networks:
      - monitoring
    healthcheck:
      # Was ["CMD", "fluent-bit --version"] — a single exec-form token
      # names a non-existent binary; arguments must be separate items.
      test: ["CMD", "fluent-bit", "--version"]
      interval: 30s
      timeout: 10s
      retries: 3
    labels:
      - "service=monitoring"
      - "component=fluent-bit"

  # Alert Manager
  alert-manager:
    image: prom/alertmanager:v0.25.0
    container_name: alert-manager
    ports:
      - "9093:9093"
    volumes:
      - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml
      - alertmanager-data:/alertmanager
    command:
      - '--config.file=/etc/alertmanager/alertmanager.yml'
      - '--storage.path=/alertmanager'
      - '--web.external-url=http://localhost:9093'
      - '--web.route-prefix=/'
    restart: unless-stopped
    networks:
      - monitoring
    healthcheck:
      test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:9093/-/healthy || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 3
    labels:
      - "service=monitoring"
      - "component=alertmanager"

  # Node Exporter (System Metrics)
  node-exporter:
    image: prom/node-exporter:v1.6.0
    container_name: node-exporter
    ports:
      - "9100:9100"
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    command:
      - '--path.procfs=/host/proc'
      - '--path.rootfs=/rootfs'
      - '--path.sysfs=/host/sys'
      # $$ is Compose escaping for a literal $ in the regex.
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
    restart: unless-stopped
    networks:
      - monitoring
    healthcheck:
      test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:9100/metrics || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 3
    labels:
      - "service=monitoring"
      - "component=node-exporter"

  # Process Exporter (Application Metrics)
  process-exporter:
    image: ncabatoff/process-exporter:v0.7.0
    container_name: process-exporter
    ports:
      - "9256:9256"
    volumes:
      - /proc:/host/proc:ro
    command:
      # Removed '--path.procfs' — that is a node-exporter flag;
      # process-exporter only understands '--procfs'.
      # NOTE(review): process-exporter normally also needs '-config.path'
      # or '-procnames' to select processes — confirm intended config.
      - '--procfs=/host/proc'
    restart: unless-stopped
    networks:
      - monitoring
    healthcheck:
      test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:9256/metrics || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 3
    labels:
      - "service=monitoring"
      - "component=process-exporter"

  # Blackbox Exporter (External Monitoring)
  blackbox-exporter:
    image: prom/blackbox-exporter:v0.22.0
    container_name: blackbox-exporter
    ports:
      - "9115:9115"
    volumes:
      - ./blackbox.yml:/etc/blackbox/blackbox.yml
    command:
      # Without this flag the exporter reads its built-in default path
      # (/etc/blackbox_exporter/config.yml) and the mount above is ignored.
      - '--config.file=/etc/blackbox/blackbox.yml'
    restart: unless-stopped
    networks:
      - monitoring
    healthcheck:
      test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:9115/metrics || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 3
    labels:
      - "service=monitoring"
      - "component=blackbox-exporter"

  # Loki (Log Aggregation)
  loki:
    image: grafana/loki:2.9.0
    container_name: loki
    ports:
      - "3100:3100"
    volumes:
      - ./loki.yml:/etc/loki/local-config.yaml
      - loki-data:/loki
    command:
      - '-config.file=/etc/loki/local-config.yaml'
      - '-config.expand-env=true'
    restart: unless-stopped
    networks:
      - monitoring
    healthcheck:
      test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:3100/ready || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 3
    labels:
      - "service=monitoring"
      - "component=loki"

  # Promtail (Log Collection)
  promtail:
    image: grafana/promtail:2.9.0
    container_name: promtail
    volumes:
      - ./promtail.yml:/etc/promtail/config.yml
      - /var/log:/var/log:ro
      - /var/log/containers:/var/log/containers:ro
      - /var/log/pods:/var/log/pods:ro
    command:
      - '-config.file=/etc/promtail/config.yml'
      - '-config.expand-env=true'
    restart: unless-stopped
    networks:
      - monitoring
    healthcheck:
      # NOTE(review): port 9081 must match http_listen_port in
      # promtail.yml (Promtail's stock default is 9080) — confirm.
      test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:9081/ready || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 3
    labels:
      - "service=monitoring"
      - "component=promtail"

  # Jaeger (Tracing)
  jaeger:
    image: jaegertracing/all-in-one:1.47.0
    container_name: jaeger
    ports:
      - "16686:16686"
      - "14250:14250"
      - "14268:14268"
      - "14269:14269"
      - "5778:5778"
      - "5775:5775"
    environment:
      - COLLECTOR_ZIPKIN_HOST_PORT=9411
      # NOTE(review): COLLECTOR_OTLP_HOST_PORT, COLLECTOR_JAEGER_AGENT_HOST_PORT
      # and BADGER_EPHEMERAL_SPREAD do not match documented Jaeger env vars
      # (cf. COLLECTOR_OTLP_ENABLED, BADGER_EPHEMERAL) — verify against the
      # Jaeger 1.47 deployment docs before relying on them.
      - COLLECTOR_OTLP_HOST_PORT=14250
      - COLLECTOR_JAEGER_AGENT_HOST_PORT=14268
      - QUERY_BASE_PATH=/
      - SPAN_STORAGE_TYPE=badger
      - BADGER_EPHEMERAL_SPREAD=2000000
    restart: unless-stopped
    networks:
      - monitoring
    healthcheck:
      test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:16686/ || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 3
    labels:
      - "service=monitoring"
      - "component=jaeger"

  # Redis (Cache)
  redis:
    image: redis:7.2.3-alpine
    container_name: redis
    ports:
      - "6379:6379"
    volumes:
      - redis-data:/data
    command: redis-server --appendonly yes --maxmemory 512mb --maxmemory-policy allkeys-lru
    restart: unless-stopped
    networks:
      - monitoring
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 30s
      timeout: 10s
      retries: 3
    labels:
      - "service=monitoring"
      - "component=redis"

  # Grafana Image Renderer (service name kept for compatibility)
  grafana-loki-datasource:
    image: grafana/grafana-image-renderer:3.8.2
    container_name: grafana-loki-datasource
    ports:
      - "8081:8081"
    environment:
      # NOTE(review): RENDERING_SERVER_URL / RENDERING_CALLBACK_URL are
      # normally Grafana-side (plugin) settings, not renderer settings —
      # confirm these are consumed by this image.
      - RENDERING_SERVER_URL=http://localhost:8081/render
      - RENDERING_CALLBACK_URL=http://localhost:3001/
      - RENDERING_HOST=localhost
    restart: unless-stopped
    networks:
      - monitoring
    healthcheck:
      test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:8081/health || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 3
    labels:
      - "service=monitoring"
      - "component=grafana-image-renderer"

networks:
  monitoring:
    driver: bridge
    ipam:
      config:
        # gateway belongs in the same config entry as its subnet;
        # it was previously a separate (invalid) list item.
        - subnet: 172.20.0.0/16
          gateway: 172.20.0.1

volumes:
  prometheus-data:
    driver: local
  grafana-data:
    driver: local
  alertmanager-data:
    driver: local
  fluent-bit-data:
    driver: local
  loki-data:
    driver: local
  redis-data:
    driver: local
  grafana-logs:
    driver: local
  learning-monitor-logs:
    driver: local
  collaboration-monitor-logs:
    driver: local