---
# AITBC Advanced Agent Features Production Monitoring
# Docker Compose configuration for production monitoring services.
version: '3.8'

# Shared logging defaults.
# NOTE(review): the original file declared `logging:` at the top level, which
# is not a valid Compose key (per-service only). It is preserved here as an
# extension field; attach it to a service with `logging: *default-logging`.
x-logging: &default-logging
  driver: json-file
  options:
    max-size: "10m"
    max-file: "3"

services:
  # Cross-Chain Reputation Monitoring (Prometheus)
  reputation-monitor:
    image: prom/prometheus:v2.45.0
    container_name: reputation-monitor
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
      - ./rules:/etc/prometheus/rules
      - prometheus-data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--storage.tsdb.retention.time=200h'
      - '--storage.tsdb.retention.size=50GB'
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      - '--web.enable-lifecycle'
      - '--web.enable-admin-api'
    restart: unless-stopped
    networks:
      - monitoring
    healthcheck:
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9090/-/healthy"]
      interval: 30s
      timeout: 10s
      retries: 3
    labels:
      - "service=monitoring"
      - "component=prometheus"

  # Agent Communication Monitoring (Grafana)
  communication-monitor:
    image: grafana/grafana:10.0.0
    container_name: communication-monitor
    ports:
      # GF_SERVER_HTTP_PORT below makes Grafana listen on 3001 inside the
      # container, so the mapping must target 3001 (original "3001:3000"
      # pointed at a port nothing listens on).
      - "3001:3001"
    environment:
      - GF_SECURITY_ADMIN_USER=admin
      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD}
      - GF_USERS_ALLOW_SIGN_UP=false
      - GF_USERS_ALLOW_ORG_CREATE=false
      - GF_INSTALL_PLUGINS=grafana-piechart-panel,grafana-worldmap-panel,grafana-clock-panel,grafana-simple-json-datasource
      - GF_SERVER_ROOT_URL=http://localhost:3001
      - GF_SERVER_DOMAIN=localhost
      - GF_SERVER_PROTOCOL=http
      - GF_SERVER_HTTP_PORT=3001
      - GF_SERVER_ENFORCE_DOMAIN=false
      - GF_SMTP_ENABLED=false
      - GF_LOG_LEVEL=info
      - GF_LOG_MODE=file
      - GF_PATHS_LOGS=/var/log/grafana
      - GF_PATHS_PLUGINS=/var/lib/grafana/plugins
      - GF_PATHS_PROVISIONING=/etc/grafana/provisioning
      - GF_PATHS_DATA=/var/lib/grafana
    volumes:
      - ./grafana/provisioning:/etc/grafana/provisioning
      - ./grafana/dashboards:/var/lib/grafana/dashboards
      - grafana-data:/var/lib/grafana
      - grafana-logs:/var/log/grafana
    restart: unless-stopped
    networks:
      - monitoring
    healthcheck:
      # NOTE(review): assumes curl is available in the Grafana image; if not,
      # switch to busybox wget — confirm against the image contents.
      test: ["CMD-SHELL", "curl -f http://localhost:3001/api/health || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 3
    labels:
      - "service=monitoring"
      - "component=grafana"

  # Advanced Learning Monitoring (Node.js app mounted from ./learning-monitor)
  learning-monitor:
    image: node:18-alpine
    container_name: learning-monitor
    working_dir: /app
    volumes:
      - ./learning-monitor:/app
      - learning-monitor-logs:/app/logs
    command: npm start
    restart: unless-stopped
    networks:
      - monitoring
    healthcheck:
      # Exec-form healthcheck: no shell, so no "|| exit 1" (the original put
      # it inside the URL argument, making the probe always fail). wget
      # already exits non-zero when the request fails.
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:8002/health"]
      interval: 30s
      timeout: 10s
      retries: 3
    labels:
      - "service=monitoring"
      - "component=learning-monitor"

  # Agent Collaboration Monitoring (Node.js app mounted from ./collaboration-monitor)
  collaboration-monitor:
    image: node:18-alpine
    container_name: collaboration-monitor
    working_dir: /app
    volumes:
      - ./collaboration-monitor:/app
      - collaboration-monitor-logs:/app/logs
    command: npm start
    restart: unless-stopped
    networks:
      - monitoring
    healthcheck:
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:8003/health"]
      interval: 30s
      timeout: 10s
      retries: 3
    labels:
      - "service=monitoring"
      - "component=collaboration-monitor"

  # Log Aggregation (Fluent Bit)
  log-aggregator:
    image: fluent/fluent-bit:v2.2.0
    container_name: log-aggregator
    volumes:
      - ./fluent-bit.conf:/fluent-bit/etc/fluent-bit.conf
      - /var/log:/var/log:ro
      - /var/log/containers:/var/log/containers:ro
      - /var/log/pods:/var/log/pods:ro
      - fluent-bit-data:/var/log/fluent-bit
    restart: unless-stopped
    networks:
      - monitoring
    healthcheck:
      # Exec form takes one argv entry per element; the original single
      # "fluent-bit --version" string named a non-existent executable.
      test: ["CMD", "fluent-bit", "--version"]
      interval: 30s
      timeout: 10s
      retries: 3
    labels:
      - "service=monitoring"
      - "component=fluent-bit"

  # Alert Manager
  alert-manager:
    image: prom/alertmanager:v0.25.0
    container_name: alert-manager
    ports:
      - "9093:9093"
    volumes:
      - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml
      - alertmanager-data:/alertmanager
    command:
      - '--config.file=/etc/alertmanager/alertmanager.yml'
      - '--storage.path=/alertmanager'
      - '--web.external-url=http://localhost:9093'
      - '--web.route-prefix=/'
    restart: unless-stopped
    networks:
      - monitoring
    healthcheck:
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9093/-/healthy"]
      interval: 30s
      timeout: 10s
      retries: 3
    labels:
      - "service=monitoring"
      - "component=alertmanager"

  # Node Exporter (System Metrics)
  node-exporter:
    image: prom/node-exporter:v1.6.0
    container_name: node-exporter
    ports:
      - "9100:9100"
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    command:
      - '--path.procfs=/host/proc'
      - '--path.rootfs=/rootfs'
      - '--path.sysfs=/host/sys'
      # $$ escapes $ for Compose variable interpolation.
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
    restart: unless-stopped
    networks:
      - monitoring
    healthcheck:
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9100/metrics"]
      interval: 30s
      timeout: 10s
      retries: 3
    labels:
      - "service=monitoring"
      - "component=node-exporter"

  # Process Exporter (Application Metrics)
  process-exporter:
    image: ncabatoff/process-exporter:v0.7.0
    container_name: process-exporter
    ports:
      - "9256:9256"
    volumes:
      - /proc:/host/proc:ro
    command:
      # process-exporter only understands -procfs; the original also passed
      # node_exporter's --path.procfs, which aborts startup as an unknown flag.
      - '--procfs=/host/proc'
    restart: unless-stopped
    networks:
      - monitoring
    healthcheck:
      # NOTE(review): assumes wget exists in this image — confirm; the image
      # may be minimal/scratch-based.
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9256/metrics"]
      interval: 30s
      timeout: 10s
      retries: 3
    labels:
      - "service=monitoring"
      - "component=process-exporter"

  # Blackbox Exporter (External Monitoring)
  blackbox-exporter:
    image: prom/blackbox-exporter:v0.22.0
    container_name: blackbox-exporter
    ports:
      - "9115:9115"
    volumes:
      - ./blackbox.yml:/etc/blackbox/blackbox.yml
    command:
      # Without this flag the exporter reads its built-in default path and the
      # mounted ./blackbox.yml is silently ignored.
      - '--config.file=/etc/blackbox/blackbox.yml'
    restart: unless-stopped
    networks:
      - monitoring
    healthcheck:
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9115/metrics"]
      interval: 30s
      timeout: 10s
      retries: 3
    labels:
      - "service=monitoring"
      - "component=blackbox-exporter"

  # Loki (Log Aggregation)
  loki:
    image: grafana/loki:2.9.0
    container_name: loki
    ports:
      - "3100:3100"
    volumes:
      - ./loki.yml:/etc/loki/local-config.yaml
      - loki-data:/loki
    command:
      - '-config.file=/etc/loki/local-config.yaml'
      - '-config.expand-env=true'
    restart: unless-stopped
    networks:
      - monitoring
    healthcheck:
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3100/ready"]
      interval: 30s
      timeout: 10s
      retries: 3
    labels:
      - "service=monitoring"
      - "component=loki"

  # Promtail (Log Collection)
  promtail:
    image: grafana/promtail:2.9.0
    container_name: promtail
    volumes:
      - ./promtail.yml:/etc/promtail/config.yml
      - /var/log:/var/log:ro
      - /var/log/containers:/var/log/containers:ro
      - /var/log/pods:/var/log/pods:ro
    command:
      - '-config.file=/etc/promtail/config.yml'
      - '-config.expand-env=true'
    restart: unless-stopped
    networks:
      - monitoring
    healthcheck:
      # NOTE(review): port 9081 must match http_listen_port in promtail.yml
      # (promtail's default is 9080) — verify against that config file.
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9081/ready"]
      interval: 30s
      timeout: 10s
      retries: 3
    labels:
      - "service=monitoring"
      - "component=promtail"

  # Jaeger (Tracing)
  jaeger:
    image: jaegertracing/all-in-one:1.47.0
    container_name: jaeger
    ports:
      - "16686:16686"
      - "14250:14250"
      - "14268:14268"
      - "14269:14269"
      - "5778:5778"
      - "5775:5775"
    environment:
      - COLLECTOR_ZIPKIN_HOST_PORT=9411
      # NOTE(review): COLLECTOR_OTLP_HOST_PORT, COLLECTOR_JAEGER_AGENT_HOST_PORT
      # and BADGER_EPHEMERAL_SPREAD do not match documented Jaeger env vars
      # (e.g. OTLP is enabled via COLLECTOR_OTLP_ENABLED) — verify against the
      # Jaeger 1.47 deployment docs before relying on them.
      - COLLECTOR_OTLP_HOST_PORT=14250
      - COLLECTOR_JAEGER_AGENT_HOST_PORT=14268
      - QUERY_BASE_PATH=/
      - SPAN_STORAGE_TYPE=badger
      - BADGER_EPHEMERAL_SPREAD=2000000
    restart: unless-stopped
    networks:
      - monitoring
    healthcheck:
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:16686/"]
      interval: 30s
      timeout: 10s
      retries: 3
    labels:
      - "service=monitoring"
      - "component=jaeger"

  # Redis (Cache)
  redis:
    image: redis:7.2.3-alpine
    container_name: redis
    ports:
      - "6379:6379"
    volumes:
      - redis-data:/data
    command: redis-server --appendonly yes --maxmemory 512mb --maxmemory-policy allkeys-lru
    restart: unless-stopped
    networks:
      - monitoring
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 30s
      timeout: 10s
      retries: 3
    labels:
      - "service=monitoring"
      - "component=redis"

  # Grafana Image Renderer (remote rendering backend for Grafana)
  grafana-loki-datasource:
    image: grafana/grafana-image-renderer:3.8.2
    container_name: grafana-loki-datasource
    ports:
      - "8081:8081"
    environment:
      - RENDERING_SERVER_URL=http://localhost:8081/render
      # The callback must reach Grafana over the compose network; "localhost"
      # inside this container is the renderer itself, not Grafana.
      - RENDERING_CALLBACK_URL=http://communication-monitor:3001/
      - RENDERING_HOST=localhost
    restart: unless-stopped
    networks:
      - monitoring
    healthcheck:
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:8081/health"]
      interval: 30s
      timeout: 10s
      retries: 3
    labels:
      - "service=monitoring"
      - "component=grafana-image-renderer"

networks:
  monitoring:
    driver: bridge
    ipam:
      config:
        # gateway belongs in the same mapping item as its subnet; the original
        # emitted it as a separate list entry, which fails schema validation.
        - subnet: 172.20.0.0/16
          gateway: 172.20.0.1

volumes:
  prometheus-data:
    driver: local
  grafana-data:
    driver: local
  alertmanager-data:
    driver: local
  fluent-bit-data:
    driver: local
  loki-data:
    driver: local
  redis-data:
    driver: local
  grafana-logs:
    driver: local
  learning-monitor-logs:
    driver: local
  collaboration-monitor-logs:
    driver: local