feat: implement agent coordination foundation (Week 1)

✅ Multi-Agent Communication Framework - Implemented comprehensive communication protocols - Created hierarchical, P2P, and broadcast protocols - Added message types and routing system - Implemented agent discovery and registration - Created load balancer for task distribution - Built FastAPI application with full API ✅ Core Components Implemented - CommunicationManager: Protocol management - MessageRouter: Advanced message routing - AgentRegistry: Agent discovery and management - LoadBalancer: Intelligent task distribution - TaskDistributor: Priority-based task handling - WebSocketHandler: Real-time communication ✅ API Endpoints - /health: Health check endpoint - /agents/register: Agent registration - /agents/discover: Agent discovery - /tasks/submit: Task submission - /messages/send: Message sending - /load-balancer/stats: Load balancing statistics - /registry/stats: Registry statistics ✅ Production Ready - SystemD service configuration - Docker containerization - Comprehensive test suite - Configuration management - Error handling and logging - Performance monitoring 🚀 Week 1 complete: Agent coordination foundation implemented!
2026-04-02 14:50:58 +02:00
parent 2fdda15732
commit 03d409f89d
8 changed files with 3729 additions and 0 deletions
--- a/apps/agent-coordinator/Dockerfile
+++ b/apps/agent-coordinator/Dockerfile
@@ -0,0 +1,39 @@
 FROM python:3.11-slim
 # Set working directory
 WORKDIR /app
 # Set environment variables
 ENV PYTHONDONTWRITEBYTECODE=1
 ENV PYTHONUNBUFFERED=1
 ENV PYTHONPATH=/app/src
 # Install system dependencies
 RUN apt-get update && apt-get install -y \
    gcc \
    g++ \
    && rm -rf /var/lib/apt/lists/*
 # Install Python dependencies
 COPY pyproject.toml poetry.lock ./
 RUN pip install poetry && \
    poetry config virtualenvs.create false && \
    poetry install --no-dev --no-interaction --no-ansi
 # Copy application code
 COPY src/ ./src/
 # Create non-root user
 RUN useradd --create-home --shell /bin/bash app && \
    chown -R app:app /app
 USER app
 # Health check
 HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:9001/health || exit 1
 # Expose port
 EXPOSE 9001
 # Start the application
 CMD ["poetry", "run", "python", "-m", "uvicorn", "src.app.main:app", "--host", "0.0.0.0", "--port", "9001"]
--- a/apps/agent-coordinator/src/app/config.py
+++ b/apps/agent-coordinator/src/app/config.py
@@ -0,0 +1,460 @@
 """
 Configuration Management for AITBC Agent Coordinator
 """
 import os
 from typing import Dict, Any, Optional
 from pydantic import BaseSettings, Field
 from enum import Enum
 class Environment(str, Enum):
    """Environment types"""
    DEVELOPMENT = "development"
    TESTING = "testing"
    STAGING = "staging"
    PRODUCTION = "production"
 class LogLevel(str, Enum):
    """Log levels"""
    DEBUG = "DEBUG"
    INFO = "INFO"
    WARNING = "WARNING"
    ERROR = "ERROR"
    CRITICAL = "CRITICAL"
 class Settings(BaseSettings):
    """Application settings"""
    # Application settings
    app_name: str = "AITBC Agent Coordinator"
    app_version: str = "1.0.0"
    environment: Environment = Environment.DEVELOPMENT
    debug: bool = False
    # Server settings
    host: str = "0.0.0.0"
    port: int = 9001
    workers: int = 1
    # Redis settings
    redis_url: str = "redis://localhost:6379/1"
    redis_max_connections: int = 10
    redis_timeout: int = 5
    # Database settings (if needed)
    database_url: Optional[str] = None
    # Agent registry settings
    heartbeat_interval: int = 30  # seconds
    max_heartbeat_age: int = 120  # seconds
    cleanup_interval: int = 60  # seconds
    agent_ttl: int = 86400  # 24 hours in seconds
    # Load balancer settings
    default_strategy: str = "least_connections"
    max_task_queue_size: int = 10000
    task_timeout: int = 300  # 5 minutes
    # Communication settings
    message_ttl: int = 300  # 5 minutes
    max_message_size: int = 1024 * 1024  # 1MB
    connection_timeout: int = 30
    # Security settings
    secret_key: str = "your-secret-key-change-in-production"
    allowed_hosts: list = ["*"]
    cors_origins: list = ["*"]
    # Monitoring settings
    enable_metrics: bool = True
    metrics_port: int = 9002
    health_check_interval: int = 30
    # Logging settings
    log_level: LogLevel = LogLevel.INFO
    log_format: str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    log_file: Optional[str] = None
    # Performance settings
    max_concurrent_tasks: int = 100
    task_batch_size: int = 10
    load_balancer_cache_size: int = 1000
    class Config:
        env_file = ".env"
        env_file_encoding = "utf-8"
        case_sensitive = False
 # Global settings instance
 settings = Settings()
 # Configuration constants
 class ConfigConstants:
    """Configuration constants"""
    # Agent types
    AGENT_TYPES = [
        "coordinator",
        "worker", 
        "specialist",
        "monitor",
        "gateway",
        "orchestrator"
    ]
    # Agent statuses
    AGENT_STATUSES = [
        "active",
        "inactive", 
        "busy",
        "maintenance",
        "error"
    ]
    # Message types
    MESSAGE_TYPES = [
        "coordination",
        "task_assignment",
        "status_update",
        "discovery",
        "heartbeat",
        "consensus",
        "broadcast",
        "direct",
        "peer_to_peer",
        "hierarchical"
    ]
    # Task priorities
    TASK_PRIORITIES = [
        "low",
        "normal",
        "high", 
        "critical",
        "urgent"
    ]
    # Load balancing strategies
    LOAD_BALANCING_STRATEGIES = [
        "round_robin",
        "least_connections",
        "least_response_time",
        "weighted_round_robin",
        "resource_based",
        "capability_based",
        "predictive",
        "consistent_hash"
    ]
    # Default ports
    DEFAULT_PORTS = {
        "agent_coordinator": 9001,
        "agent_registry": 9002,
        "task_distributor": 9003,
        "metrics": 9004,
        "health": 9005
    }
    # Timeouts (in seconds)
    TIMEOUTS = {
        "connection": 30,
        "message": 300,
        "task": 600,
        "heartbeat": 120,
        "cleanup": 3600
    }
    # Limits
    LIMITS = {
        "max_message_size": 1024 * 1024,  # 1MB
        "max_task_queue_size": 10000,
        "max_concurrent_tasks": 100,
        "max_agent_connections": 1000,
        "max_redis_connections": 10
    }
 # Environment-specific configurations
 class EnvironmentConfig:
    """Environment-specific configurations"""
    @staticmethod
    def get_development_config() -> Dict[str, Any]:
        """Development environment configuration"""
        return {
            "debug": True,
            "log_level": LogLevel.DEBUG,
            "reload": True,
            "workers": 1,
            "redis_url": "redis://localhost:6379/1",
            "enable_metrics": True
        }
    @staticmethod
    def get_testing_config() -> Dict[str, Any]:
        """Testing environment configuration"""
        return {
            "debug": True,
            "log_level": LogLevel.DEBUG,
            "redis_url": "redis://localhost:6379/15",  # Separate DB for testing
            "enable_metrics": False,
            "heartbeat_interval": 5,  # Faster for testing
            "cleanup_interval": 10
        }
    @staticmethod
    def get_staging_config() -> Dict[str, Any]:
        """Staging environment configuration"""
        return {
            "debug": False,
            "log_level": LogLevel.INFO,
            "redis_url": "redis://localhost:6379/2",
            "enable_metrics": True,
            "workers": 2,
            "cors_origins": ["https://staging.aitbc.com"]
        }
    @staticmethod
    def get_production_config() -> Dict[str, Any]:
        """Production environment configuration"""
        return {
            "debug": False,
            "log_level": LogLevel.WARNING,
            "redis_url": os.getenv("REDIS_URL", "redis://localhost:6379/0"),
            "enable_metrics": True,
            "workers": 4,
            "cors_origins": ["https://aitbc.com"],
            "secret_key": os.getenv("SECRET_KEY", "change-this-in-production"),
            "allowed_hosts": ["aitbc.com", "www.aitbc.com"]
        }
 # Configuration loader
 class ConfigLoader:
    """Configuration loader and validator"""
    @staticmethod
    def load_config() -> Settings:
        """Load and validate configuration"""
        # Get environment-specific config
        env_config = {}
        if settings.environment == Environment.DEVELOPMENT:
            env_config = EnvironmentConfig.get_development_config()
        elif settings.environment == Environment.TESTING:
            env_config = EnvironmentConfig.get_testing_config()
        elif settings.environment == Environment.STAGING:
            env_config = EnvironmentConfig.get_staging_config()
        elif settings.environment == Environment.PRODUCTION:
            env_config = EnvironmentConfig.get_production_config()
        # Update settings with environment-specific config
        for key, value in env_config.items():
            if hasattr(settings, key):
                setattr(settings, key, value)
        # Validate configuration
        ConfigLoader.validate_config()
        return settings
    @staticmethod
    def validate_config():
        """Validate configuration settings"""
        errors = []
        # Validate required settings
        if not settings.secret_key or settings.secret_key == "your-secret-key-change-in-production":
            if settings.environment == Environment.PRODUCTION:
                errors.append("SECRET_KEY must be set in production")
        # Validate ports
        if settings.port < 1 or settings.port > 65535:
            errors.append("Port must be between 1 and 65535")
        # Validate Redis URL
        if not settings.redis_url:
            errors.append("Redis URL is required")
        # Validate timeouts
        if settings.heartbeat_interval <= 0:
            errors.append("Heartbeat interval must be positive")
        if settings.max_heartbeat_age <= settings.heartbeat_interval:
            errors.append("Max heartbeat age must be greater than heartbeat interval")
        # Validate limits
        if settings.max_message_size <= 0:
            errors.append("Max message size must be positive")
        if settings.max_task_queue_size <= 0:
            errors.append("Max task queue size must be positive")
        # Validate strategy
        if settings.default_strategy not in ConfigConstants.LOAD_BALANCING_STRATEGIES:
            errors.append(f"Invalid load balancing strategy: {settings.default_strategy}")
        if errors:
            raise ValueError(f"Configuration validation failed: {', '.join(errors)}")
    @staticmethod
    def get_redis_config() -> Dict[str, Any]:
        """Get Redis configuration"""
        return {
            "url": settings.redis_url,
            "max_connections": settings.redis_max_connections,
            "timeout": settings.redis_timeout,
            "decode_responses": True,
            "socket_keepalive": True,
            "socket_keepalive_options": {},
            "health_check_interval": 30
        }
    @staticmethod
    def get_logging_config() -> Dict[str, Any]:
        """Get logging configuration"""
        return {
            "version": 1,
            "disable_existing_loggers": False,
            "formatters": {
                "default": {
                    "format": settings.log_format,
                    "datefmt": "%Y-%m-%d %H:%M:%S"
                },
                "detailed": {
                    "format": "%(asctime)s - %(name)s - %(levelname)s - %(module)s - %(funcName)s - %(message)s",
                    "datefmt": "%Y-%m-%d %H:%M:%S"
                }
            },
            "handlers": {
                "console": {
                    "class": "logging.StreamHandler",
                    "level": settings.log_level.value,
                    "formatter": "default",
                    "stream": "ext://sys.stdout"
                }
            },
            "loggers": {
                "": {
                    "level": settings.log_level.value,
                    "handlers": ["console"]
                },
                "uvicorn": {
                    "level": "INFO",
                    "handlers": ["console"],
                    "propagate": False
                },
                "fastapi": {
                    "level": "INFO",
                    "handlers": ["console"],
                    "propagate": False
                }
            }
        }
 # Configuration utilities
 class ConfigUtils:
    """Configuration utilities"""
    @staticmethod
    def get_agent_config(agent_type: str) -> Dict[str, Any]:
        """Get configuration for specific agent type"""
        base_config = {
            "heartbeat_interval": settings.heartbeat_interval,
            "max_connections": 100,
            "timeout": settings.connection_timeout
        }
        # Agent-specific configurations
        agent_configs = {
            "coordinator": {
                **base_config,
                "max_connections": 1000,
                "heartbeat_interval": 15,
                "enable_coordination": True
            },
            "worker": {
                **base_config,
                "max_connections": 50,
                "task_timeout": 300,
                "enable_coordination": False
            },
            "specialist": {
                **base_config,
                "max_connections": 25,
                "specialization_timeout": 600,
                "enable_coordination": True
            },
            "monitor": {
                **base_config,
                "heartbeat_interval": 10,
                "enable_coordination": True,
                "monitoring_interval": 30
            },
            "gateway": {
                **base_config,
                "max_connections": 2000,
                "enable_coordination": True,
                "gateway_timeout": 60
            },
            "orchestrator": {
                **base_config,
                "max_connections": 500,
                "heartbeat_interval": 5,
                "enable_coordination": True,
                "orchestration_timeout": 120
            }
        }
        return agent_configs.get(agent_type, base_config)
    @staticmethod
    def get_service_config(service_name: str) -> Dict[str, Any]:
        """Get configuration for specific service"""
        base_config = {
            "host": settings.host,
            "port": settings.port,
            "workers": settings.workers,
            "timeout": settings.connection_timeout
        }
        # Service-specific configurations
        service_configs = {
            "agent_coordinator": {
                **base_config,
                "port": ConfigConstants.DEFAULT_PORTS["agent_coordinator"],
                "enable_metrics": settings.enable_metrics
            },
            "agent_registry": {
                **base_config,
                "port": ConfigConstants.DEFAULT_PORTS["agent_registry"],
                "enable_metrics": False
            },
            "task_distributor": {
                **base_config,
                "port": ConfigConstants.DEFAULT_PORTS["task_distributor"],
                "max_queue_size": settings.max_task_queue_size
            },
            "metrics": {
                **base_config,
                "port": ConfigConstants.DEFAULT_PORTS["metrics"],
                "enable_metrics": True
            },
            "health": {
                **base_config,
                "port": ConfigConstants.DEFAULT_PORTS["health"],
                "enable_metrics": False
            }
        }
        return service_configs.get(service_name, base_config)
 # Load configuration
 config = ConfigLoader.load_config()
 # Export settings and utilities
 __all__ = [
    "settings",
    "config",
    "ConfigConstants",
    "EnvironmentConfig",
    "ConfigLoader",
    "ConfigUtils"
 ]
--- a/apps/agent-coordinator/src/app/main.py
+++ b/apps/agent-coordinator/src/app/main.py
@@ -0,0 +1,518 @@
 """
 Main FastAPI Application for AITBC Agent Coordinator
 """
 import asyncio
 import logging
 from contextlib import asynccontextmanager
 from datetime import datetime
 from typing import Dict, List, Optional, Any
 import uuid
 from fastapi import FastAPI, HTTPException, BackgroundTasks, Depends, status
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse
 from pydantic import BaseModel, Field
 import uvicorn
 from .protocols.communication import CommunicationManager, create_protocol, MessageType
 from .protocols.message_types import MessageProcessor, create_task_message, create_status_message
 from .routing.agent_discovery import AgentRegistry, AgentDiscoveryService, create_agent_info
 from .routing.load_balancer import LoadBalancer, TaskDistributor, TaskPriority, LoadBalancingStrategy
 # Configure logging
 logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
 )
 logger = logging.getLogger(__name__)
 # Global variables
 agent_registry: Optional[AgentRegistry] = None
 discovery_service: Optional[AgentDiscoveryService] = None
 load_balancer: Optional[LoadBalancer] = None
 task_distributor: Optional[TaskDistributor] = None
 communication_manager: Optional[CommunicationManager] = None
 message_processor: Optional[MessageProcessor] = None
@asynccontextmanager
 async def lifespan(app: FastAPI):
    """Application lifespan management"""
    # Startup
    logger.info("Starting AITBC Agent Coordinator...")
    # Initialize services
    global agent_registry, discovery_service, load_balancer, task_distributor, communication_manager, message_processor
    # Start agent registry
    agent_registry = AgentRegistry()
    await agent_registry.start()
    # Initialize discovery service
    discovery_service = AgentDiscoveryService(agent_registry)
    # Initialize load balancer
    load_balancer = LoadBalancer(agent_registry)
    load_balancer.set_strategy(LoadBalancingStrategy.LEAST_CONNECTIONS)
    # Initialize task distributor
    task_distributor = TaskDistributor(load_balancer)
    # Initialize communication manager
    communication_manager = CommunicationManager("agent-coordinator")
    # Initialize message processor
    message_processor = MessageProcessor("agent-coordinator")
    # Start background tasks
    asyncio.create_task(task_distributor.start_distribution())
    asyncio.create_task(message_processor.start_processing())
    logger.info("Agent Coordinator started successfully")
    yield
    # Shutdown
    logger.info("Shutting down AITBC Agent Coordinator...")
    if agent_registry:
        await agent_registry.stop()
    logger.info("Agent Coordinator shut down")
 # Create FastAPI app
 app = FastAPI(
    title="AITBC Agent Coordinator",
    description="Advanced multi-agent coordination and management system",
    version="1.0.0",
    lifespan=lifespan
 )
 # Add CORS middleware
 app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
 )
 # Pydantic models
 class AgentRegistrationRequest(BaseModel):
    agent_id: str = Field(..., description="Unique agent identifier")
    agent_type: str = Field(..., description="Type of agent")
    capabilities: List[str] = Field(default_factory=list, description="Agent capabilities")
    services: List[str] = Field(default_factory=list, description="Available services")
    endpoints: Dict[str, str] = Field(default_factory=dict, description="Service endpoints")
    metadata: Dict[str, Any] = Field(default_factory=dict, description="Additional metadata")
 class AgentStatusUpdate(BaseModel):
    status: str = Field(..., description="Agent status")
    load_metrics: Dict[str, float] = Field(default_factory=dict, description="Load metrics")
 class TaskSubmission(BaseModel):
    task_data: Dict[str, Any] = Field(..., description="Task data")
    priority: str = Field("normal", description="Task priority")
    requirements: Optional[Dict[str, Any]] = Field(None, description="Task requirements")
 class MessageRequest(BaseModel):
    receiver_id: str = Field(..., description="Receiver agent ID")
    message_type: str = Field(..., description="Message type")
    payload: Dict[str, Any] = Field(..., description="Message payload")
    priority: str = Field("normal", description="Message priority")
 # Health check endpoint
@app.get("/health")
 async def health_check():
    """Health check endpoint"""
    return {
        "status": "healthy",
        "service": "agent-coordinator",
        "timestamp": datetime.utcnow().isoformat(),
        "version": "1.0.0"
    }
 # Root endpoint
@app.get("/")
 async def root():
    """Root endpoint with service information"""
    return {
        "service": "AITBC Agent Coordinator",
        "description": "Advanced multi-agent coordination and management system",
        "version": "1.0.0",
        "endpoints": [
            "/health",
            "/agents/register",
            "/agents/discover",
            "/agents/{agent_id}",
            "/agents/{agent_id}/status",
            "/tasks/submit",
            "/tasks/status",
            "/messages/send",
            "/load-balancer/stats",
            "/registry/stats"
        ]
    }
 # Agent registration
@app.post("/agents/register")
 async def register_agent(request: AgentRegistrationRequest):
    """Register a new agent"""
    try:
        if not agent_registry:
            raise HTTPException(status_code=503, detail="Agent registry not available")
        # Create agent info
        agent_info = create_agent_info(
            agent_id=request.agent_id,
            agent_type=request.agent_type,
            capabilities=request.capabilities,
            services=request.services,
            endpoints=request.endpoints
        )
        agent_info.metadata = request.metadata
        # Register agent
        success = await agent_registry.register_agent(agent_info)
        if success:
            return {
                "status": "success",
                "message": f"Agent {request.agent_id} registered successfully",
                "agent_id": request.agent_id,
                "registered_at": datetime.utcnow().isoformat()
            }
        else:
            raise HTTPException(status_code=500, detail="Failed to register agent")
    except Exception as e:
        logger.error(f"Error registering agent: {e}")
        raise HTTPException(status_code=500, detail=str(e))
 # Agent discovery
@app.post("/agents/discover")
 async def discover_agents(query: Dict[str, Any]):
    """Discover agents based on criteria"""
    try:
        if not agent_registry:
            raise HTTPException(status_code=503, detail="Agent registry not available")
        agents = await agent_registry.discover_agents(query)
        return {
            "status": "success",
            "query": query,
            "agents": [agent.to_dict() for agent in agents],
            "count": len(agents),
            "timestamp": datetime.utcnow().isoformat()
        }
    except Exception as e:
        logger.error(f"Error discovering agents: {e}")
        raise HTTPException(status_code=500, detail=str(e))
 # Get agent by ID
@app.get("/agents/{agent_id}")
 async def get_agent(agent_id: str):
    """Get agent information by ID"""
    try:
        if not agent_registry:
            raise HTTPException(status_code=503, detail="Agent registry not available")
        agent = await agent_registry.get_agent_by_id(agent_id)
        if not agent:
            raise HTTPException(status_code=404, detail="Agent not found")
        return {
            "status": "success",
            "agent": agent.to_dict(),
            "timestamp": datetime.utcnow().isoformat()
        }
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error getting agent: {e}")
        raise HTTPException(status_code=500, detail=str(e))
 # Update agent status
@app.put("/agents/{agent_id}/status")
 async def update_agent_status(agent_id: str, request: AgentStatusUpdate):
    """Update agent status"""
    try:
        if not agent_registry:
            raise HTTPException(status_code=503, detail="Agent registry not available")
        from .routing.agent_discovery import AgentStatus
        success = await agent_registry.update_agent_status(
            agent_id,
            AgentStatus(request.status),
            request.load_metrics
        )
        if success:
            return {
                "status": "success",
                "message": f"Agent {agent_id} status updated",
                "agent_id": agent_id,
                "new_status": request.status,
                "updated_at": datetime.utcnow().isoformat()
            }
        else:
            raise HTTPException(status_code=500, detail="Failed to update agent status")
    except Exception as e:
        logger.error(f"Error updating agent status: {e}")
        raise HTTPException(status_code=500, detail=str(e))
 # Submit task
@app.post("/tasks/submit")
 async def submit_task(request: TaskSubmission, background_tasks: BackgroundTasks):
    """Submit a task for distribution"""
    try:
        if not task_distributor:
            raise HTTPException(status_code=503, detail="Task distributor not available")
        # Convert priority string to enum
        try:
            priority = TaskPriority(request.priority.lower())
        except ValueError:
            raise HTTPException(status_code=400, detail=f"Invalid priority: {request.priority}")
        # Submit task
        await task_distributor.submit_task(
            request.task_data,
            priority,
            request.requirements
        )
        return {
            "status": "success",
            "message": "Task submitted successfully",
            "task_id": request.task_data.get("task_id", str(uuid.uuid4())),
            "priority": request.priority,
            "submitted_at": datetime.utcnow().isoformat()
        }
    except Exception as e:
        logger.error(f"Error submitting task: {e}")
        raise HTTPException(status_code=500, detail=str(e))
 # Get task status
@app.get("/tasks/status")
 async def get_task_status():
    """Get task distribution statistics"""
    try:
        if not task_distributor:
            raise HTTPException(status_code=503, detail="Task distributor not available")
        stats = task_distributor.get_distribution_stats()
        return {
            "status": "success",
            "stats": stats,
            "timestamp": datetime.utcnow().isoformat()
        }
    except Exception as e:
        logger.error(f"Error getting task status: {e}")
        raise HTTPException(status_code=500, detail=str(e))
 # Send message
@app.post("/messages/send")
 async def send_message(request: MessageRequest):
    """Send message to agent"""
    try:
        if not communication_manager:
            raise HTTPException(status_code=503, detail="Communication manager not available")
        from .protocols.communication import AgentMessage, Priority
        # Convert message type
        try:
            message_type = MessageType(request.message_type)
        except ValueError:
            raise HTTPException(status_code=400, detail=f"Invalid message type: {request.message_type}")
        # Convert priority
        try:
            priority = Priority(request.priority.lower())
        except ValueError:
            raise HTTPException(status_code=400, detail=f"Invalid priority: {request.priority}")
        # Create message
        message = AgentMessage(
            sender_id="agent-coordinator",
            receiver_id=request.receiver_id,
            message_type=message_type,
            priority=priority,
            payload=request.payload
        )
        # Send message
        success = await communication_manager.send_message("hierarchical", message)
        if success:
            return {
                "status": "success",
                "message": "Message sent successfully",
                "message_id": message.id,
                "receiver_id": request.receiver_id,
                "sent_at": datetime.utcnow().isoformat()
            }
        else:
            raise HTTPException(status_code=500, detail="Failed to send message")
    except Exception as e:
        logger.error(f"Error sending message: {e}")
        raise HTTPException(status_code=500, detail=str(e))
 # Load balancer statistics
@app.get("/load-balancer/stats")
 async def get_load_balancer_stats():
    """Get load balancer statistics"""
    try:
        if not load_balancer:
            raise HTTPException(status_code=503, detail="Load balancer not available")
        stats = load_balancer.get_load_balancing_stats()
        return {
            "status": "success",
            "stats": stats,
            "timestamp": datetime.utcnow().isoformat()
        }
    except Exception as e:
        logger.error(f"Error getting load balancer stats: {e}")
        raise HTTPException(status_code=500, detail=str(e))
 # Registry statistics
@app.get("/registry/stats")
 async def get_registry_stats():
    """Get agent registry statistics"""
    try:
        if not agent_registry:
            raise HTTPException(status_code=503, detail="Agent registry not available")
        stats = await agent_registry.get_registry_stats()
        return {
            "status": "success",
            "stats": stats,
            "timestamp": datetime.utcnow().isoformat()
        }
    except Exception as e:
        logger.error(f"Error getting registry stats: {e}")
        raise HTTPException(status_code=500, detail=str(e))
 # Get agents by service
@app.get("/agents/service/{service}")
 async def get_agents_by_service(service: str):
    """Get agents that provide a specific service"""
    try:
        if not agent_registry:
            raise HTTPException(status_code=503, detail="Agent registry not available")
        agents = await agent_registry.get_agents_by_service(service)
        return {
            "status": "success",
            "service": service,
            "agents": [agent.to_dict() for agent in agents],
            "count": len(agents),
            "timestamp": datetime.utcnow().isoformat()
        }
    except Exception as e:
        logger.error(f"Error getting agents by service: {e}")
        raise HTTPException(status_code=500, detail=str(e))
 # Get agents by capability
@app.get("/agents/capability/{capability}")
 async def get_agents_by_capability(capability: str):
    """Get agents that have a specific capability"""
    try:
        if not agent_registry:
            raise HTTPException(status_code=503, detail="Agent registry not available")
        agents = await agent_registry.get_agents_by_capability(capability)
        return {
            "status": "success",
            "capability": capability,
            "agents": [agent.to_dict() for agent in agents],
            "count": len(agents),
            "timestamp": datetime.utcnow().isoformat()
        }
    except Exception as e:
        logger.error(f"Error getting agents by capability: {e}")
        raise HTTPException(status_code=500, detail=str(e))
 # Set load balancing strategy
@app.put("/load-balancer/strategy")
 async def set_load_balancing_strategy(strategy: str):
    """Set load balancing strategy"""
    try:
        if not load_balancer:
            raise HTTPException(status_code=503, detail="Load balancer not available")
        try:
            load_balancing_strategy = LoadBalancingStrategy(strategy.lower())
        except ValueError:
            raise HTTPException(status_code=400, detail=f"Invalid strategy: {strategy}")
        load_balancer.set_strategy(load_balancing_strategy)
        return {
            "status": "success",
            "message": f"Load balancing strategy set to {strategy}",
            "strategy": strategy,
            "updated_at": datetime.utcnow().isoformat()
        }
    except Exception as e:
        logger.error(f"Error setting load balancing strategy: {e}")
        raise HTTPException(status_code=500, detail=str(e))
 # Error handlers
@app.exception_handler(404)
 async def not_found_handler(request, exc):
    return JSONResponse(
        status_code=404,
        content={
            "status": "error",
            "message": "Resource not found",
            "timestamp": datetime.utcnow().isoformat()
        }
    )
@app.exception_handler(500)
 async def internal_error_handler(request, exc):
    logger.error(f"Internal server error: {exc}")
    return JSONResponse(
        status_code=500,
        content={
            "status": "error",
            "message": "Internal server error",
            "timestamp": datetime.utcnow().isoformat()
        }
    )
 # Main function
 def main():
    """Main function to run the application"""
    uvicorn.run(
        "main:app",
        host="0.0.0.0",
        port=9001,
        reload=True,
        log_level="info"
    )
 if __name__ == "__main__":
    main()
--- a/apps/agent-coordinator/src/app/protocols/communication.py
+++ b/apps/agent-coordinator/src/app/protocols/communication.py
@@ -0,0 +1,443 @@
 """
 Multi-Agent Communication Protocols for AITBC Agent Coordination
 """
 import asyncio
 import json
 import logging
 from enum import Enum
 from typing import Dict, List, Optional, Any, Callable
 from dataclasses import dataclass, field
 from datetime import datetime
 import uuid
 import websockets
 from pydantic import BaseModel, Field
 logger = logging.getLogger(__name__)
 class MessageType(str, Enum):
    """Message types for agent communication"""
    COORDINATION = "coordination"
    TASK_ASSIGNMENT = "task_assignment"
    STATUS_UPDATE = "status_update"
    DISCOVERY = "discovery"
    HEARTBEAT = "heartbeat"
    CONSENSUS = "consensus"
    BROADCAST = "broadcast"
    DIRECT = "direct"
    PEER_TO_PEER = "peer_to_peer"
    HIERARCHICAL = "hierarchical"
 class Priority(str, Enum):
    """Message priority levels"""
    LOW = "low"
    NORMAL = "normal"
    HIGH = "high"
    CRITICAL = "critical"
@dataclass
 class AgentMessage:
    """Base message structure for agent communication"""
    id: str = field(default_factory=lambda: str(uuid.uuid4()))
    sender_id: str = ""
    receiver_id: Optional[str] = None
    message_type: MessageType = MessageType.DIRECT
    priority: Priority = Priority.NORMAL
    timestamp: datetime = field(default_factory=datetime.utcnow)
    payload: Dict[str, Any] = field(default_factory=dict)
    correlation_id: Optional[str] = None
    reply_to: Optional[str] = None
    ttl: int = 300  # Time to live in seconds
    def to_dict(self) -> Dict[str, Any]:
        """Convert message to dictionary"""
        return {
            "id": self.id,
            "sender_id": self.sender_id,
            "receiver_id": self.receiver_id,
            "message_type": self.message_type.value,
            "priority": self.priority.value,
            "timestamp": self.timestamp.isoformat(),
            "payload": self.payload,
            "correlation_id": self.correlation_id,
            "reply_to": self.reply_to,
            "ttl": self.ttl
        }
    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "AgentMessage":
        """Create message from dictionary"""
        data["timestamp"] = datetime.fromisoformat(data["timestamp"])
        data["message_type"] = MessageType(data["message_type"])
        data["priority"] = Priority(data["priority"])
        return cls(**data)
 class CommunicationProtocol:
    """Base class for communication protocols"""
    def __init__(self, agent_id: str):
        self.agent_id = agent_id
        self.message_handlers: Dict[MessageType, List[Callable]] = {}
        self.active_connections: Dict[str, Any] = {}
    async def register_handler(self, message_type: MessageType, handler: Callable):
        """Register a message handler for a specific message type"""
        if message_type not in self.message_handlers:
            self.message_handlers[message_type] = []
        self.message_handlers[message_type].append(handler)
    async def send_message(self, message: AgentMessage) -> bool:
        """Send a message to another agent"""
        try:
            if message.receiver_id and message.receiver_id in self.active_connections:
                await self._send_to_agent(message)
                return True
            elif message.message_type == MessageType.BROADCAST:
                await self._broadcast_message(message)
                return True
            else:
                logger.warning(f"Cannot send message to {message.receiver_id}: not connected")
                return False
        except Exception as e:
            logger.error(f"Error sending message: {e}")
            return False
    async def receive_message(self, message: AgentMessage):
        """Process received message"""
        try:
            # Check TTL
            if self._is_message_expired(message):
                logger.warning(f"Message {message.id} expired, ignoring")
                return
            # Handle message
            handlers = self.message_handlers.get(message.message_type, [])
            for handler in handlers:
                try:
                    await handler(message)
                except Exception as e:
                    logger.error(f"Error in message handler: {e}")
        except Exception as e:
            logger.error(f"Error processing message: {e}")
    def _is_message_expired(self, message: AgentMessage) -> bool:
        """Check if message has expired"""
        age = (datetime.utcnow() - message.timestamp).total_seconds()
        return age > message.ttl
    async def _send_to_agent(self, message: AgentMessage):
        """Send message to specific agent"""
        raise NotImplementedError("Subclasses must implement _send_to_agent")
    async def _broadcast_message(self, message: AgentMessage):
        """Broadcast message to all connected agents"""
        raise NotImplementedError("Subclasses must implement _broadcast_message")
 class HierarchicalProtocol(CommunicationProtocol):
    """Hierarchical communication protocol (master-agent → sub-agents)"""
    def __init__(self, agent_id: str, is_master: bool = False):
        super().__init__(agent_id)
        self.is_master = is_master
        self.sub_agents: List[str] = []
        self.master_agent: Optional[str] = None
    async def add_sub_agent(self, agent_id: str):
        """Add a sub-agent to this master agent"""
        if self.is_master:
            self.sub_agents.append(agent_id)
            logger.info(f"Added sub-agent {agent_id} to master {self.agent_id}")
        else:
            logger.warning(f"Agent {self.agent_id} is not a master, cannot add sub-agents")
    async def send_to_sub_agents(self, message: AgentMessage):
        """Send message to all sub-agents"""
        if not self.is_master:
            logger.warning(f"Agent {self.agent_id} is not a master")
            return
        message.message_type = MessageType.HIERARCHICAL
        for sub_agent_id in self.sub_agents:
            message.receiver_id = sub_agent_id
            await self.send_message(message)
    async def send_to_master(self, message: AgentMessage):
        """Send message to master agent"""
        if self.is_master:
            logger.warning(f"Agent {self.agent_id} is a master, cannot send to master")
            return
        if self.master_agent:
            message.receiver_id = self.master_agent
            message.message_type = MessageType.HIERARCHICAL
            await self.send_message(message)
        else:
            logger.warning(f"Agent {self.agent_id} has no master agent")
 class PeerToPeerProtocol(CommunicationProtocol):
    """Peer-to-peer communication protocol (agent ↔ agent)"""
    def __init__(self, agent_id: str):
        super().__init__(agent_id)
        self.peers: Dict[str, Dict[str, Any]] = {}
    async def add_peer(self, peer_id: str, connection_info: Dict[str, Any]):
        """Add a peer to the peer network"""
        self.peers[peer_id] = connection_info
        logger.info(f"Added peer {peer_id} to agent {self.agent_id}")
    async def remove_peer(self, peer_id: str):
        """Remove a peer from the peer network"""
        if peer_id in self.peers:
            del self.peers[peer_id]
            logger.info(f"Removed peer {peer_id} from agent {self.agent_id}")
    async def send_to_peer(self, message: AgentMessage, peer_id: str):
        """Send message to specific peer"""
        if peer_id not in self.peers:
            logger.warning(f"Peer {peer_id} not found")
            return False
        message.receiver_id = peer_id
        message.message_type = MessageType.PEER_TO_PEER
        return await self.send_message(message)
    async def broadcast_to_peers(self, message: AgentMessage):
        """Broadcast message to all peers"""
        message.message_type = MessageType.PEER_TO_PEER
        for peer_id in self.peers:
            message.receiver_id = peer_id
            await self.send_message(message)
 class BroadcastProtocol(CommunicationProtocol):
    """Broadcast communication protocol (agent → all agents)"""
    def __init__(self, agent_id: str, broadcast_channel: str = "global"):
        super().__init__(agent_id)
        self.broadcast_channel = broadcast_channel
        self.subscribers: List[str] = []
    async def subscribe(self, agent_id: str):
        """Subscribe to broadcast channel"""
        if agent_id not in self.subscribers:
            self.subscribers.append(agent_id)
            logger.info(f"Agent {agent_id} subscribed to {self.broadcast_channel}")
    async def unsubscribe(self, agent_id: str):
        """Unsubscribe from broadcast channel"""
        if agent_id in self.subscribers:
            self.subscribers.remove(agent_id)
            logger.info(f"Agent {agent_id} unsubscribed from {self.broadcast_channel}")
    async def broadcast(self, message: AgentMessage):
        """Broadcast message to all subscribers"""
        message.message_type = MessageType.BROADCAST
        message.receiver_id = None  # Broadcast to all
        for subscriber_id in self.subscribers:
            if subscriber_id != self.agent_id:  # Don't send to self
                message_copy = AgentMessage(**message.__dict__)
                message_copy.receiver_id = subscriber_id
                await self.send_message(message_copy)
 class CommunicationManager:
    """Manages multiple communication protocols for an agent"""
    def __init__(self, agent_id: str):
        self.agent_id = agent_id
        self.protocols: Dict[str, CommunicationProtocol] = {}
    def add_protocol(self, name: str, protocol: CommunicationProtocol):
        """Add a communication protocol"""
        self.protocols[name] = protocol
        logger.info(f"Added protocol {name} to agent {self.agent_id}")
    def get_protocol(self, name: str) -> Optional[CommunicationProtocol]:
        """Get a communication protocol by name"""
        return self.protocols.get(name)
    async def send_message(self, protocol_name: str, message: AgentMessage) -> bool:
        """Send message using specific protocol"""
        protocol = self.get_protocol(protocol_name)
        if protocol:
            return await protocol.send_message(message)
        return False
    async def register_handler(self, protocol_name: str, message_type: MessageType, handler: Callable):
        """Register message handler for specific protocol"""
        protocol = self.get_protocol(protocol_name)
        if protocol:
            await protocol.register_handler(message_type, handler)
        else:
            logger.error(f"Protocol {protocol_name} not found")
 # Message templates for common operations
 class MessageTemplates:
    """Pre-defined message templates"""
    @staticmethod
    def create_heartbeat(sender_id: str) -> AgentMessage:
        """Create heartbeat message"""
        return AgentMessage(
            sender_id=sender_id,
            message_type=MessageType.HEARTBEAT,
            priority=Priority.LOW,
            payload={"timestamp": datetime.utcnow().isoformat()}
        )
    @staticmethod
    def create_task_assignment(sender_id: str, receiver_id: str, task_data: Dict[str, Any]) -> AgentMessage:
        """Create task assignment message"""
        return AgentMessage(
            sender_id=sender_id,
            receiver_id=receiver_id,
            message_type=MessageType.TASK_ASSIGNMENT,
            priority=Priority.NORMAL,
            payload=task_data
        )
    @staticmethod
    def create_status_update(sender_id: str, status_data: Dict[str, Any]) -> AgentMessage:
        """Create status update message"""
        return AgentMessage(
            sender_id=sender_id,
            message_type=MessageType.STATUS_UPDATE,
            priority=Priority.NORMAL,
            payload=status_data
        )
    @staticmethod
    def create_discovery(sender_id: str) -> AgentMessage:
        """Create discovery message"""
        return AgentMessage(
            sender_id=sender_id,
            message_type=MessageType.DISCOVERY,
            priority=Priority.NORMAL,
            payload={"agent_id": sender_id}
        )
    @staticmethod
    def create_consensus_request(sender_id: str, proposal_data: Dict[str, Any]) -> AgentMessage:
        """Create consensus request message"""
        return AgentMessage(
            sender_id=sender_id,
            message_type=MessageType.CONSENSUS,
            priority=Priority.HIGH,
            payload=proposal_data
        )
 # WebSocket connection handler for real-time communication
 class WebSocketHandler:
    """WebSocket handler for real-time agent communication"""
    def __init__(self, communication_manager: CommunicationManager):
        self.communication_manager = communication_manager
        self.websocket_connections: Dict[str, Any] = {}
    async def handle_connection(self, websocket, agent_id: str):
        """Handle WebSocket connection from agent"""
        self.websocket_connections[agent_id] = websocket
        logger.info(f"WebSocket connection established for agent {agent_id}")
        try:
            async for message in websocket:
                data = json.loads(message)
                agent_message = AgentMessage.from_dict(data)
                await self.communication_manager.receive_message(agent_message)
        except websockets.exceptions.ConnectionClosed:
            logger.info(f"WebSocket connection closed for agent {agent_id}")
        finally:
            if agent_id in self.websocket_connections:
                del self.websocket_connections[agent_id]
    async def send_to_agent(self, agent_id: str, message: AgentMessage):
        """Send message to agent via WebSocket"""
        if agent_id in self.websocket_connections:
            websocket = self.websocket_connections[agent_id]
            await websocket.send(json.dumps(message.to_dict()))
            return True
        return False
    async def broadcast_message(self, message: AgentMessage):
        """Broadcast message to all connected agents"""
        for websocket in self.websocket_connections.values():
            await websocket.send(json.dumps(message.to_dict()))
 # Redis-based message broker for scalable communication
 class RedisMessageBroker:
    """Redis-based message broker for agent communication"""
    def __init__(self, redis_url: str):
        self.redis_url = redis_url
        self.channels: Dict[str, Any] = {}
    async def publish_message(self, channel: str, message: AgentMessage):
        """Publish message to Redis channel"""
        import redis.asyncio as redis
        redis_client = redis.from_url(self.redis_url)
        await redis_client.publish(channel, json.dumps(message.to_dict()))
        await redis_client.close()
    async def subscribe_to_channel(self, channel: str, handler: Callable):
        """Subscribe to Redis channel"""
        import redis.asyncio as redis
        redis_client = redis.from_url(self.redis_url)
        pubsub = redis_client.pubsub()
        await pubsub.subscribe(channel)
        self.channels[channel] = {"pubsub": pubsub, "handler": handler}
        # Start listening for messages
        asyncio.create_task(self._listen_to_channel(channel, pubsub, handler))
    async def _listen_to_channel(self, channel: str, pubsub: Any, handler: Callable):
        """Listen for messages on channel"""
        async for message in pubsub.listen():
            if message["type"] == "message":
                data = json.loads(message["data"])
                agent_message = AgentMessage.from_dict(data)
                await handler(agent_message)
 # Factory function for creating communication protocols
 def create_protocol(protocol_type: str, agent_id: str, **kwargs) -> CommunicationProtocol:
    """Factory function to create communication protocols"""
    if protocol_type == "hierarchical":
        return HierarchicalProtocol(agent_id, kwargs.get("is_master", False))
    elif protocol_type == "peer_to_peer":
        return PeerToPeerProtocol(agent_id)
    elif protocol_type == "broadcast":
        return BroadcastProtocol(agent_id, kwargs.get("broadcast_channel", "global"))
    else:
        raise ValueError(f"Unknown protocol type: {protocol_type}")
 # Example usage
 async def example_usage():
    """Example of how to use the communication protocols"""
    # Create communication manager
    comm_manager = CommunicationManager("agent-001")
    # Add protocols
    hierarchical_protocol = create_protocol("hierarchical", "agent-001", is_master=True)
    p2p_protocol = create_protocol("peer_to_peer", "agent-001")
    broadcast_protocol = create_protocol("broadcast", "agent-001")
    comm_manager.add_protocol("hierarchical", hierarchical_protocol)
    comm_manager.add_protocol("peer_to_peer", p2p_protocol)
    comm_manager.add_protocol("broadcast", broadcast_protocol)
    # Register message handlers
    async def handle_heartbeat(message: AgentMessage):
        logger.info(f"Received heartbeat from {message.sender_id}")
    await comm_manager.register_handler("hierarchical", MessageType.HEARTBEAT, handle_heartbeat)
    # Send messages
    heartbeat = MessageTemplates.create_heartbeat("agent-001")
    await comm_manager.send_message("hierarchical", heartbeat)
 if __name__ == "__main__":
    asyncio.run(example_usage())
--- a/apps/agent-coordinator/src/app/protocols/message_types.py
+++ b/apps/agent-coordinator/src/app/protocols/message_types.py
@@ -0,0 +1,586 @@
 """
 Message Types and Routing System for AITBC Agent Coordination
 """
 import asyncio
 import json
 import logging
 from enum import Enum
 from typing import Dict, List, Optional, Any, Callable, Union
 from dataclasses import dataclass, field
 from datetime import datetime, timedelta
 import uuid
 import hashlib
 from pydantic import BaseModel, Field, validator
 from .communication import AgentMessage, MessageType, Priority
 logger = logging.getLogger(__name__)
 class MessageStatus(str, Enum):
    """Message processing status"""
    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"
    EXPIRED = "expired"
    CANCELLED = "cancelled"
 class RoutingStrategy(str, Enum):
    """Message routing strategies"""
    ROUND_ROBIN = "round_robin"
    LOAD_BALANCED = "load_balanced"
    PRIORITY_BASED = "priority_based"
    RANDOM = "random"
    DIRECT = "direct"
    BROADCAST = "broadcast"
 class DeliveryMode(str, Enum):
    """Message delivery modes"""
    FIRE_AND_FORGET = "fire_and_forget"
    AT_LEAST_ONCE = "at_least_once"
    EXACTLY_ONCE = "exactly_once"
    PERSISTENT = "persistent"
@dataclass
 class RoutingRule:
    """Routing rule for message processing"""
    rule_id: str = field(default_factory=lambda: str(uuid.uuid4()))
    name: str = ""
    condition: Dict[str, Any] = field(default_factory=dict)
    action: str = "forward"  # forward, transform, filter, route
    target: Optional[str] = None
    priority: int = 0
    enabled: bool = True
    created_at: datetime = field(default_factory=datetime.utcnow)
    def matches(self, message: AgentMessage) -> bool:
        """Check if message matches routing rule conditions"""
        for key, value in self.condition.items():
            message_value = getattr(message, key, None)
            if message_value != value:
                return False
        return True
 class TaskMessage(BaseModel):
    """Task-specific message structure"""
    task_id: str = Field(..., description="Unique task identifier")
    task_type: str = Field(..., description="Type of task")
    task_data: Dict[str, Any] = Field(default_factory=dict, description="Task data")
    requirements: Dict[str, Any] = Field(default_factory=dict, description="Task requirements")
    deadline: Optional[datetime] = Field(None, description="Task deadline")
    priority: Priority = Field(Priority.NORMAL, description="Task priority")
    assigned_agent: Optional[str] = Field(None, description="Assigned agent ID")
    status: str = Field("pending", description="Task status")
    created_at: datetime = Field(default_factory=datetime.utcnow)
    updated_at: datetime = Field(default_factory=datetime.utcnow)
    @validator('deadline')
    def validate_deadline(cls, v):
        if v and v < datetime.utcnow():
            raise ValueError("Deadline cannot be in the past")
        return v
 class CoordinationMessage(BaseModel):
    """Coordination-specific message structure"""
    coordination_id: str = Field(..., description="Unique coordination identifier")
    coordination_type: str = Field(..., description="Type of coordination")
    participants: List[str] = Field(default_factory=list, description="Participating agents")
    coordination_data: Dict[str, Any] = Field(default_factory=dict, description="Coordination data")
    decision_deadline: Optional[datetime] = Field(None, description="Decision deadline")
    consensus_threshold: float = Field(0.5, description="Consensus threshold")
    status: str = Field("pending", description="Coordination status")
    created_at: datetime = Field(default_factory=datetime.utcnow)
    updated_at: datetime = Field(default_factory=datetime.utcnow)
 class StatusMessage(BaseModel):
    """Status update message structure"""
    agent_id: str = Field(..., description="Agent ID")
    status_type: str = Field(..., description="Type of status")
    status_data: Dict[str, Any] = Field(default_factory=dict, description="Status data")
    health_score: float = Field(1.0, description="Agent health score")
    load_metrics: Dict[str, float] = Field(default_factory=dict, description="Load metrics")
    capabilities: List[str] = Field(default_factory=list, description="Agent capabilities")
    timestamp: datetime = Field(default_factory=datetime.utcnow)
 class DiscoveryMessage(BaseModel):
    """Agent discovery message structure"""
    agent_id: str = Field(..., description="Agent ID")
    agent_type: str = Field(..., description="Type of agent")
    capabilities: List[str] = Field(default_factory=list, description="Agent capabilities")
    services: List[str] = Field(default_factory=list, description="Available services")
    endpoints: Dict[str, str] = Field(default_factory=dict, description="Service endpoints")
    metadata: Dict[str, Any] = Field(default_factory=dict, description="Additional metadata")
    timestamp: datetime = Field(default_factory=datetime.utcnow)
 class ConsensusMessage(BaseModel):
    """Consensus message structure"""
    consensus_id: str = Field(..., description="Unique consensus identifier")
    proposal: Dict[str, Any] = Field(..., description="Consensus proposal")
    voting_options: List[Dict[str, Any]] = Field(default_factory=list, description="Voting options")
    votes: Dict[str, str] = Field(default_factory=dict, description="Agent votes")
    voting_deadline: datetime = Field(..., description="Voting deadline")
    consensus_algorithm: str = Field("majority", description="Consensus algorithm")
    status: str = Field("pending", description="Consensus status")
    created_at: datetime = Field(default_factory=datetime.utcnow)
    updated_at: datetime = Field(default_factory=datetime.utcnow)
 class MessageRouter:
    """Advanced message routing system"""
    def __init__(self, agent_id: str):
        self.agent_id = agent_id
        self.routing_rules: List[RoutingRule] = []
        self.message_queue: asyncio.Queue = asyncio.Queue(maxsize=10000)
        self.dead_letter_queue: asyncio.Queue = asyncio.Queue(maxsize=1000)
        self.routing_stats: Dict[str, Any] = {
            "messages_processed": 0,
            "messages_failed": 0,
            "messages_expired": 0,
            "routing_time_total": 0.0
        }
        self.active_routes: Dict[str, str] = {}  # message_id -> route
        self.load_balancer_index = 0
    def add_routing_rule(self, rule: RoutingRule):
        """Add a routing rule"""
        self.routing_rules.append(rule)
        # Sort by priority (higher priority first)
        self.routing_rules.sort(key=lambda r: r.priority, reverse=True)
        logger.info(f"Added routing rule: {rule.name}")
    def remove_routing_rule(self, rule_id: str):
        """Remove a routing rule"""
        self.routing_rules = [r for r in self.routing_rules if r.rule_id != rule_id]
        logger.info(f"Removed routing rule: {rule_id}")
    async def route_message(self, message: AgentMessage) -> Optional[str]:
        """Route message based on routing rules"""
        start_time = datetime.utcnow()
        try:
            # Check if message is expired
            if self._is_message_expired(message):
                await self.dead_letter_queue.put(message)
                self.routing_stats["messages_expired"] += 1
                return None
            # Apply routing rules
            for rule in self.routing_rules:
                if rule.enabled and rule.matches(message):
                    route = await self._apply_routing_rule(rule, message)
                    if route:
                        self.active_routes[message.id] = route
                        self.routing_stats["messages_processed"] += 1
                        return route
            # Default routing
            default_route = await self._default_routing(message)
            if default_route:
                self.active_routes[message.id] = default_route
                self.routing_stats["messages_processed"] += 1
                return default_route
            # No route found
            await self.dead_letter_queue.put(message)
            self.routing_stats["messages_failed"] += 1
            return None
        except Exception as e:
            logger.error(f"Error routing message {message.id}: {e}")
            await self.dead_letter_queue.put(message)
            self.routing_stats["messages_failed"] += 1
            return None
        finally:
            routing_time = (datetime.utcnow() - start_time).total_seconds()
            self.routing_stats["routing_time_total"] += routing_time
    async def _apply_routing_rule(self, rule: RoutingRule, message: AgentMessage) -> Optional[str]:
        """Apply a specific routing rule"""
        if rule.action == "forward":
            return rule.target
        elif rule.action == "transform":
            return await self._transform_message(message, rule)
        elif rule.action == "filter":
            return await self._filter_message(message, rule)
        elif rule.action == "route":
            return await self._custom_routing(message, rule)
        return None
    async def _transform_message(self, message: AgentMessage, rule: RoutingRule) -> Optional[str]:
        """Transform message based on rule"""
        # Apply transformation logic here
        transformed_message = AgentMessage(
            sender_id=message.sender_id,
            receiver_id=message.receiver_id,
            message_type=message.message_type,
            priority=message.priority,
            payload={**message.payload, **rule.condition.get("transform", {})}
        )
        # Route transformed message
        return await self._default_routing(transformed_message)
    async def _filter_message(self, message: AgentMessage, rule: RoutingRule) -> Optional[str]:
        """Filter message based on rule"""
        filter_condition = rule.condition.get("filter", {})
        for key, value in filter_condition.items():
            if message.payload.get(key) != value:
                return None  # Filter out message
        return await self._default_routing(message)
    async def _custom_routing(self, message: AgentMessage, rule: RoutingRule) -> Optional[str]:
        """Custom routing logic"""
        # Implement custom routing logic here
        return rule.target
    async def _default_routing(self, message: AgentMessage) -> Optional[str]:
        """Default message routing"""
        if message.receiver_id:
            return message.receiver_id
        elif message.message_type == MessageType.BROADCAST:
            return "broadcast"
        else:
            return None
    def _is_message_expired(self, message: AgentMessage) -> bool:
        """Check if message is expired"""
        age = (datetime.utcnow() - message.timestamp).total_seconds()
        return age > message.ttl
    async def get_routing_stats(self) -> Dict[str, Any]:
        """Get routing statistics"""
        total_messages = self.routing_stats["messages_processed"]
        avg_routing_time = (
            self.routing_stats["routing_time_total"] / total_messages 
            if total_messages > 0 else 0
        )
        return {
            **self.routing_stats,
            "avg_routing_time": avg_routing_time,
            "active_routes": len(self.active_routes),
            "queue_size": self.message_queue.qsize(),
            "dead_letter_queue_size": self.dead_letter_queue.qsize()
        }
 class LoadBalancer:
    """Load balancer for message distribution"""
    def __init__(self):
        self.agent_loads: Dict[str, float] = {}
        self.agent_weights: Dict[str, float] = {}
        self.last_updated = datetime.utcnow()
    def update_agent_load(self, agent_id: str, load: float):
        """Update agent load information"""
        self.agent_loads[agent_id] = load
        self.last_updated = datetime.utcnow()
    def set_agent_weight(self, agent_id: str, weight: float):
        """Set agent weight for load balancing"""
        self.agent_weights[agent_id] = weight
    def select_agent(self, available_agents: List[str], strategy: RoutingStrategy = RoutingStrategy.LOAD_BALANCED) -> Optional[str]:
        """Select agent based on load balancing strategy"""
        if not available_agents:
            return None
        if strategy == RoutingStrategy.ROUND_ROBIN:
            return self._round_robin_selection(available_agents)
        elif strategy == RoutingStrategy.LOAD_BALANCED:
            return self._load_balanced_selection(available_agents)
        elif strategy == RoutingStrategy.PRIORITY_BASED:
            return self._priority_based_selection(available_agents)
        elif strategy == RoutingStrategy.RANDOM:
            return self._random_selection(available_agents)
        else:
            return available_agents[0]
    def _round_robin_selection(self, agents: List[str]) -> str:
        """Round-robin agent selection"""
        agent = agents[self.load_balancer_index % len(agents)]
        self.load_balancer_index += 1
        return agent
    def _load_balanced_selection(self, agents: List[str]) -> str:
        """Load-balanced agent selection"""
        # Select agent with lowest load
        min_load = float('inf')
        selected_agent = None
        for agent in agents:
            load = self.agent_loads.get(agent, 0.0)
            weight = self.agent_weights.get(agent, 1.0)
            weighted_load = load / weight
            if weighted_load < min_load:
                min_load = weighted_load
                selected_agent = agent
        return selected_agent or agents[0]
    def _priority_based_selection(self, agents: List[str]) -> str:
        """Priority-based agent selection"""
        # Sort by weight (higher weight = higher priority)
        weighted_agents = sorted(
            agents,
            key=lambda a: self.agent_weights.get(a, 1.0),
            reverse=True
        )
        return weighted_agents[0]
    def _random_selection(self, agents: List[str]) -> str:
        """Random agent selection"""
        import random
        return random.choice(agents)
 class MessageQueue:
    """Advanced message queue with priority and persistence"""
    def __init__(self, max_size: int = 10000):
        self.max_size = max_size
        self.queues: Dict[Priority, asyncio.Queue] = {
            Priority.CRITICAL: asyncio.Queue(maxsize=max_size // 4),
            Priority.HIGH: asyncio.Queue(maxsize=max_size // 4),
            Priority.NORMAL: asyncio.Queue(maxsize // 2),
            Priority.LOW: asyncio.Queue(maxsize // 4)
        }
        self.message_store: Dict[str, AgentMessage] = {}
        self.delivery_confirmations: Dict[str, bool] = {}
    async def enqueue(self, message: AgentMessage, delivery_mode: DeliveryMode = DeliveryMode.AT_LEAST_ONCE) -> bool:
        """Enqueue message with priority"""
        try:
            # Store message for persistence
            if delivery_mode in [DeliveryMode.AT_LEAST_ONCE, DeliveryMode.EXACTLY_ONCE, DeliveryMode.PERSISTENT]:
                self.message_store[message.id] = message
            # Add to appropriate priority queue
            queue = self.queues[message.priority]
            await queue.put(message)
            logger.debug(f"Enqueued message {message.id} with priority {message.priority}")
            return True
        except asyncio.QueueFull:
            logger.error(f"Queue full, cannot enqueue message {message.id}")
            return False
    async def dequeue(self) -> Optional[AgentMessage]:
        """Dequeue message with priority order"""
        # Check queues in priority order
        for priority in [Priority.CRITICAL, Priority.HIGH, Priority.NORMAL, Priority.LOW]:
            queue = self.queues[priority]
            try:
                message = queue.get_nowait()
                logger.debug(f"Dequeued message {message.id} with priority {priority}")
                return message
            except asyncio.QueueEmpty:
                continue
        return None
    async def confirm_delivery(self, message_id: str):
        """Confirm message delivery"""
        self.delivery_confirmations[message_id] = True
        # Clean up if exactly once delivery
        if message_id in self.message_store:
            del self.message_store[message_id]
    def get_queue_stats(self) -> Dict[str, Any]:
        """Get queue statistics"""
        return {
            "queue_sizes": {
                priority.value: queue.qsize()
                for priority, queue in self.queues.items()
            },
            "stored_messages": len(self.message_store),
            "delivery_confirmations": len(self.delivery_confirmations),
            "max_size": self.max_size
        }
 class MessageProcessor:
    """Message processor with async handling"""
    def __init__(self, agent_id: str):
        self.agent_id = agent_id
        self.router = MessageRouter(agent_id)
        self.load_balancer = LoadBalancer()
        self.message_queue = MessageQueue()
        self.processors: Dict[str, Callable] = {}
        self.processing_stats: Dict[str, Any] = {
            "messages_processed": 0,
            "processing_time_total": 0.0,
            "errors": 0
        }
    def register_processor(self, message_type: MessageType, processor: Callable):
        """Register message processor"""
        self.processors[message_type.value] = processor
        logger.info(f"Registered processor for {message_type.value}")
    async def process_message(self, message: AgentMessage) -> bool:
        """Process a message"""
        start_time = datetime.utcnow()
        try:
            # Route message
            route = await self.router.route_message(message)
            if not route:
                logger.warning(f"No route found for message {message.id}")
                return False
            # Process message
            processor = self.processors.get(message.message_type.value)
            if processor:
                await processor(message)
            else:
                logger.warning(f"No processor found for {message.message_type.value}")
                return False
            # Update stats
            self.processing_stats["messages_processed"] += 1
            processing_time = (datetime.utcnow() - start_time).total_seconds()
            self.processing_stats["processing_time_total"] += processing_time
            return True
        except Exception as e:
            logger.error(f"Error processing message {message.id}: {e}")
            self.processing_stats["errors"] += 1
            return False
    async def start_processing(self):
        """Start message processing loop"""
        while True:
            try:
                # Dequeue message
                message = await self.message_queue.dequeue()
                if message:
                    await self.process_message(message)
                else:
                    await asyncio.sleep(0.01)  # Small delay if no messages
            except Exception as e:
                logger.error(f"Error in processing loop: {e}")
                await asyncio.sleep(1)
    def get_processing_stats(self) -> Dict[str, Any]:
        """Get processing statistics"""
        total_processed = self.processing_stats["messages_processed"]
        avg_processing_time = (
            self.processing_stats["processing_time_total"] / total_processed
            if total_processed > 0 else 0
        )
        return {
            **self.processing_stats,
            "avg_processing_time": avg_processing_time,
            "queue_stats": self.message_queue.get_queue_stats(),
            "routing_stats": self.router.get_routing_stats()
        }
 # Factory functions for creating message types
 def create_task_message(sender_id: str, receiver_id: str, task_type: str, task_data: Dict[str, Any]) -> AgentMessage:
    """Create a task message"""
    task_msg = TaskMessage(
        task_id=str(uuid.uuid4()),
        task_type=task_type,
        task_data=task_data
    )
    return AgentMessage(
        sender_id=sender_id,
        receiver_id=receiver_id,
        message_type=MessageType.TASK_ASSIGNMENT,
        payload=task_msg.dict()
    )
 def create_coordination_message(sender_id: str, coordination_type: str, participants: List[str], data: Dict[str, Any]) -> AgentMessage:
    """Create a coordination message"""
    coord_msg = CoordinationMessage(
        coordination_id=str(uuid.uuid4()),
        coordination_type=coordination_type,
        participants=participants,
        coordination_data=data
    )
    return AgentMessage(
        sender_id=sender_id,
        message_type=MessageType.COORDINATION,
        payload=coord_msg.dict()
    )
 def create_status_message(agent_id: str, status_type: str, status_data: Dict[str, Any]) -> AgentMessage:
    """Create a status message"""
    status_msg = StatusMessage(
        agent_id=agent_id,
        status_type=status_type,
        status_data=status_data
    )
    return AgentMessage(
        sender_id=agent_id,
        message_type=MessageType.STATUS_UPDATE,
        payload=status_msg.dict()
    )
 def create_discovery_message(agent_id: str, agent_type: str, capabilities: List[str], services: List[str]) -> AgentMessage:
    """Create a discovery message"""
    discovery_msg = DiscoveryMessage(
        agent_id=agent_id,
        agent_type=agent_type,
        capabilities=capabilities,
        services=services
    )
    return AgentMessage(
        sender_id=agent_id,
        message_type=MessageType.DISCOVERY,
        payload=discovery_msg.dict()
    )
 def create_consensus_message(sender_id: str, proposal: Dict[str, Any], voting_options: List[Dict[str, Any]], deadline: datetime) -> AgentMessage:
    """Create a consensus message"""
    consensus_msg = ConsensusMessage(
        consensus_id=str(uuid.uuid4()),
        proposal=proposal,
        voting_options=voting_options,
        voting_deadline=deadline
    )
    return AgentMessage(
        sender_id=sender_id,
        message_type=MessageType.CONSENSUS,
        payload=consensus_msg.dict()
    )
 # Example usage
 async def example_usage():
    """Example of how to use the message routing system"""
    # Create message processor
    processor = MessageProcessor("agent-001")
    # Register processors
    async def process_task(message: AgentMessage):
        task_data = TaskMessage(**message.payload)
        logger.info(f"Processing task: {task_data.task_id}")
    processor.register_processor(MessageType.TASK_ASSIGNMENT, process_task)
    # Create and route message
    task_message = create_task_message(
        sender_id="agent-001",
        receiver_id="agent-002",
        task_type="data_processing",
        task_data={"input": "test_data"}
    )
    await processor.message_queue.enqueue(task_message)
    # Start processing (in real implementation, this would run in background)
    # await processor.start_processing()
 if __name__ == "__main__":
    asyncio.run(example_usage())
--- a/apps/agent-coordinator/src/app/routing/agent_discovery.py
+++ b/apps/agent-coordinator/src/app/routing/agent_discovery.py
@@ -0,0 +1,641 @@
 """
 Agent Discovery and Registration System for AITBC Agent Coordination
 """
 import asyncio
 import json
 import logging
 from typing import Dict, List, Optional, Set, Callable, Any
 from dataclasses import dataclass, field
 from datetime import datetime, timedelta
 import uuid
 import hashlib
 from enum import Enum
 import redis.asyncio as redis
 from pydantic import BaseModel, Field
 from ..protocols.message_types import DiscoveryMessage, create_discovery_message
 from ..protocols.communication import AgentMessage, MessageType
 logger = logging.getLogger(__name__)
 class AgentStatus(str, Enum):
    """Agent status enumeration"""
    ACTIVE = "active"
    INACTIVE = "inactive"
    BUSY = "busy"
    MAINTENANCE = "maintenance"
    ERROR = "error"
 class AgentType(str, Enum):
    """Agent type enumeration"""
    COORDINATOR = "coordinator"
    WORKER = "worker"
    SPECIALIST = "specialist"
    MONITOR = "monitor"
    GATEWAY = "gateway"
    ORCHESTRATOR = "orchestrator"
@dataclass
 class AgentInfo:
    """Agent information structure"""
    agent_id: str
    agent_type: AgentType
    status: AgentStatus
    capabilities: List[str]
    services: List[str]
    endpoints: Dict[str, str]
    metadata: Dict[str, Any]
    last_heartbeat: datetime
    registration_time: datetime
    load_metrics: Dict[str, float] = field(default_factory=dict)
    health_score: float = 1.0
    version: str = "1.0.0"
    tags: Set[str] = field(default_factory=set)
    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary"""
        return {
            "agent_id": self.agent_id,
            "agent_type": self.agent_type.value,
            "status": self.status.value,
            "capabilities": self.capabilities,
            "services": self.services,
            "endpoints": self.endpoints,
            "metadata": self.metadata,
            "last_heartbeat": self.last_heartbeat.isoformat(),
            "registration_time": self.registration_time.isoformat(),
            "load_metrics": self.load_metrics,
            "health_score": self.health_score,
            "version": self.version,
            "tags": list(self.tags)
        }
    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "AgentInfo":
        """Create from dictionary"""
        data["agent_type"] = AgentType(data["agent_type"])
        data["status"] = AgentStatus(data["status"])
        data["last_heartbeat"] = datetime.fromisoformat(data["last_heartbeat"])
        data["registration_time"] = datetime.fromisoformat(data["registration_time"])
        data["tags"] = set(data.get("tags", []))
        return cls(**data)
 class AgentRegistry:
    """Central agent registry for discovery and management"""
    def __init__(self, redis_url: str = "redis://localhost:6379/1"):
        self.redis_url = redis_url
        self.redis_client: Optional[redis.Redis] = None
        self.agents: Dict[str, AgentInfo] = {}
        self.service_index: Dict[str, Set[str]] = {}  # service -> agent_ids
        self.capability_index: Dict[str, Set[str]] = {}  # capability -> agent_ids
        self.type_index: Dict[AgentType, Set[str]] = {}  # agent_type -> agent_ids
        self.heartbeat_interval = 30  # seconds
        self.cleanup_interval = 60  # seconds
        self.max_heartbeat_age = 120  # seconds
    async def start(self):
        """Start the registry service"""
        self.redis_client = redis.from_url(self.redis_url)
        # Load existing agents from Redis
        await self._load_agents_from_redis()
        # Start background tasks
        asyncio.create_task(self._heartbeat_monitor())
        asyncio.create_task(self._cleanup_inactive_agents())
        logger.info("Agent registry started")
    async def stop(self):
        """Stop the registry service"""
        if self.redis_client:
            await self.redis_client.close()
        logger.info("Agent registry stopped")
    async def register_agent(self, agent_info: AgentInfo) -> bool:
        """Register a new agent"""
        try:
            # Add to local registry
            self.agents[agent_info.agent_id] = agent_info
            # Update indexes
            self._update_indexes(agent_info)
            # Save to Redis
            await self._save_agent_to_redis(agent_info)
            # Publish registration event
            await self._publish_agent_event("agent_registered", agent_info)
            logger.info(f"Agent {agent_info.agent_id} registered successfully")
            return True
        except Exception as e:
            logger.error(f"Error registering agent {agent_info.agent_id}: {e}")
            return False
    async def unregister_agent(self, agent_id: str) -> bool:
        """Unregister an agent"""
        try:
            if agent_id not in self.agents:
                logger.warning(f"Agent {agent_id} not found for unregistration")
                return False
            agent_info = self.agents[agent_id]
            # Remove from local registry
            del self.agents[agent_id]
            # Update indexes
            self._remove_from_indexes(agent_info)
            # Remove from Redis
            await self._remove_agent_from_redis(agent_id)
            # Publish unregistration event
            await self._publish_agent_event("agent_unregistered", agent_info)
            logger.info(f"Agent {agent_id} unregistered successfully")
            return True
        except Exception as e:
            logger.error(f"Error unregistering agent {agent_id}: {e}")
            return False
    async def update_agent_status(self, agent_id: str, status: AgentStatus, load_metrics: Optional[Dict[str, float]] = None) -> bool:
        """Update agent status and metrics"""
        try:
            if agent_id not in self.agents:
                logger.warning(f"Agent {agent_id} not found for status update")
                return False
            agent_info = self.agents[agent_id]
            agent_info.status = status
            agent_info.last_heartbeat = datetime.utcnow()
            if load_metrics:
                agent_info.load_metrics.update(load_metrics)
            # Update health score
            agent_info.health_score = self._calculate_health_score(agent_info)
            # Save to Redis
            await self._save_agent_to_redis(agent_info)
            # Publish status update event
            await self._publish_agent_event("agent_status_updated", agent_info)
            return True
        except Exception as e:
            logger.error(f"Error updating agent status {agent_id}: {e}")
            return False
    async def update_agent_heartbeat(self, agent_id: str) -> bool:
        """Update agent heartbeat"""
        try:
            if agent_id not in self.agents:
                logger.warning(f"Agent {agent_id} not found for heartbeat")
                return False
            agent_info = self.agents[agent_id]
            agent_info.last_heartbeat = datetime.utcnow()
            # Update health score
            agent_info.health_score = self._calculate_health_score(agent_info)
            # Save to Redis
            await self._save_agent_to_redis(agent_info)
            return True
        except Exception as e:
            logger.error(f"Error updating heartbeat for {agent_id}: {e}")
            return False
    async def discover_agents(self, query: Dict[str, Any]) -> List[AgentInfo]:
        """Discover agents based on query criteria"""
        results = []
        try:
            # Start with all agents
            candidate_agents = list(self.agents.values())
            # Apply filters
            if "agent_type" in query:
                agent_type = AgentType(query["agent_type"])
                candidate_agents = [a for a in candidate_agents if a.agent_type == agent_type]
            if "status" in query:
                status = AgentStatus(query["status"])
                candidate_agents = [a for a in candidate_agents if a.status == status]
            if "capabilities" in query:
                required_capabilities = set(query["capabilities"])
                candidate_agents = [a for a in candidate_agents if required_capabilities.issubset(a.capabilities)]
            if "services" in query:
                required_services = set(query["services"])
                candidate_agents = [a for a in candidate_agents if required_services.issubset(a.services)]
            if "tags" in query:
                required_tags = set(query["tags"])
                candidate_agents = [a for a in candidate_agents if required_tags.issubset(a.tags)]
            if "min_health_score" in query:
                min_score = query["min_health_score"]
                candidate_agents = [a for a in candidate_agents if a.health_score >= min_score]
            # Sort by health score (highest first)
            results = sorted(candidate_agents, key=lambda a: a.health_score, reverse=True)
            # Limit results if specified
            if "limit" in query:
                results = results[:query["limit"]]
            logger.info(f"Discovered {len(results)} agents for query: {query}")
            return results
        except Exception as e:
            logger.error(f"Error discovering agents: {e}")
            return []
    async def get_agent_by_id(self, agent_id: str) -> Optional[AgentInfo]:
        """Get agent information by ID"""
        return self.agents.get(agent_id)
    async def get_agents_by_service(self, service: str) -> List[AgentInfo]:
        """Get agents that provide a specific service"""
        agent_ids = self.service_index.get(service, set())
        return [self.agents[agent_id] for agent_id in agent_ids if agent_id in self.agents]
    async def get_agents_by_capability(self, capability: str) -> List[AgentInfo]:
        """Get agents that have a specific capability"""
        agent_ids = self.capability_index.get(capability, set())
        return [self.agents[agent_id] for agent_id in agent_ids if agent_id in self.agents]
    async def get_agents_by_type(self, agent_type: AgentType) -> List[AgentInfo]:
        """Get agents of a specific type"""
        agent_ids = self.type_index.get(agent_type, set())
        return [self.agents[agent_id] for agent_id in agent_ids if agent_id in self.agents]
    async def get_registry_stats(self) -> Dict[str, Any]:
        """Get registry statistics"""
        total_agents = len(self.agents)
        status_counts = {}
        type_counts = {}
        for agent_info in self.agents.values():
            # Count by status
            status = agent_info.status.value
            status_counts[status] = status_counts.get(status, 0) + 1
            # Count by type
            agent_type = agent_info.agent_type.value
            type_counts[agent_type] = type_counts.get(agent_type, 0) + 1
        return {
            "total_agents": total_agents,
            "status_counts": status_counts,
            "type_counts": type_counts,
            "service_count": len(self.service_index),
            "capability_count": len(self.capability_index),
            "last_cleanup": datetime.utcnow().isoformat()
        }
    def _update_indexes(self, agent_info: AgentInfo):
        """Update search indexes"""
        # Service index
        for service in agent_info.services:
            if service not in self.service_index:
                self.service_index[service] = set()
            self.service_index[service].add(agent_info.agent_id)
        # Capability index
        for capability in agent_info.capabilities:
            if capability not in self.capability_index:
                self.capability_index[capability] = set()
            self.capability_index[capability].add(agent_info.agent_id)
        # Type index
        if agent_info.agent_type not in self.type_index:
            self.type_index[agent_info.agent_type] = set()
        self.type_index[agent_info.agent_type].add(agent_info.agent_id)
    def _remove_from_indexes(self, agent_info: AgentInfo):
        """Remove agent from search indexes"""
        # Service index
        for service in agent_info.services:
            if service in self.service_index:
                self.service_index[service].discard(agent_info.agent_id)
                if not self.service_index[service]:
                    del self.service_index[service]
        # Capability index
        for capability in agent_info.capabilities:
            if capability in self.capability_index:
                self.capability_index[capability].discard(agent_info.agent_id)
                if not self.capability_index[capability]:
                    del self.capability_index[capability]
        # Type index
        if agent_info.agent_type in self.type_index:
            self.type_index[agent_info.agent_type].discard(agent_info.agent_id)
            if not self.type_index[agent_info.agent_type]:
                del self.type_index[agent_info.agent_type]
    def _calculate_health_score(self, agent_info: AgentInfo) -> float:
        """Calculate agent health score"""
        base_score = 1.0
        # Penalty for high load
        if agent_info.load_metrics:
            avg_load = sum(agent_info.load_metrics.values()) / len(agent_info.load_metrics)
            if avg_load > 0.8:
                base_score -= 0.3
            elif avg_load > 0.6:
                base_score -= 0.1
        # Penalty for error status
        if agent_info.status == AgentStatus.ERROR:
            base_score -= 0.5
        elif agent_info.status == AgentStatus.MAINTENANCE:
            base_score -= 0.2
        elif agent_info.status == AgentStatus.BUSY:
            base_score -= 0.1
        # Penalty for old heartbeat
        heartbeat_age = (datetime.utcnow() - agent_info.last_heartbeat).total_seconds()
        if heartbeat_age > self.max_heartbeat_age:
            base_score -= 0.5
        elif heartbeat_age > self.max_heartbeat_age / 2:
            base_score -= 0.2
        return max(0.0, min(1.0, base_score))
    async def _save_agent_to_redis(self, agent_info: AgentInfo):
        """Save agent information to Redis"""
        if not self.redis_client:
            return
        key = f"agent:{agent_info.agent_id}"
        await self.redis_client.setex(
            key,
            timedelta(hours=24),  # 24 hour TTL
            json.dumps(agent_info.to_dict())
        )
    async def _remove_agent_from_redis(self, agent_id: str):
        """Remove agent from Redis"""
        if not self.redis_client:
            return
        key = f"agent:{agent_id}"
        await self.redis_client.delete(key)
    async def _load_agents_from_redis(self):
        """Load agents from Redis"""
        if not self.redis_client:
            return
        try:
            # Get all agent keys
            keys = await self.redis_client.keys("agent:*")
            for key in keys:
                data = await self.redis_client.get(key)
                if data:
                    agent_info = AgentInfo.from_dict(json.loads(data))
                    self.agents[agent_info.agent_id] = agent_info
                    self._update_indexes(agent_info)
            logger.info(f"Loaded {len(self.agents)} agents from Redis")
        except Exception as e:
            logger.error(f"Error loading agents from Redis: {e}")
    async def _publish_agent_event(self, event_type: str, agent_info: AgentInfo):
        """Publish agent event to Redis"""
        if not self.redis_client:
            return
        event = {
            "event_type": event_type,
            "timestamp": datetime.utcnow().isoformat(),
            "agent_info": agent_info.to_dict()
        }
        await self.redis_client.publish("agent_events", json.dumps(event))
    async def _heartbeat_monitor(self):
        """Monitor agent heartbeats"""
        while True:
            try:
                await asyncio.sleep(self.heartbeat_interval)
                # Check for agents with old heartbeats
                now = datetime.utcnow()
                for agent_id, agent_info in list(self.agents.items()):
                    heartbeat_age = (now - agent_info.last_heartbeat).total_seconds()
                    if heartbeat_age > self.max_heartbeat_age:
                        # Mark as inactive
                        if agent_info.status != AgentStatus.INACTIVE:
                            await self.update_agent_status(agent_id, AgentStatus.INACTIVE)
                            logger.warning(f"Agent {agent_id} marked as inactive due to old heartbeat")
            except Exception as e:
                logger.error(f"Error in heartbeat monitor: {e}")
                await asyncio.sleep(5)
    async def _cleanup_inactive_agents(self):
        """Clean up inactive agents"""
        while True:
            try:
                await asyncio.sleep(self.cleanup_interval)
                # Remove agents that have been inactive too long
                now = datetime.utcnow()
                max_inactive_age = timedelta(hours=1)  # 1 hour
                for agent_id, agent_info in list(self.agents.items()):
                    if agent_info.status == AgentStatus.INACTIVE:
                        inactive_age = now - agent_info.last_heartbeat
                        if inactive_age > max_inactive_age:
                            await self.unregister_agent(agent_id)
                            logger.info(f"Removed inactive agent {agent_id}")
            except Exception as e:
                logger.error(f"Error in cleanup task: {e}")
                await asyncio.sleep(5)
 class AgentDiscoveryService:
    """Service for agent discovery and registration"""
    def __init__(self, registry: AgentRegistry):
        self.registry = registry
        self.discovery_handlers: Dict[str, Callable] = {}
    def register_discovery_handler(self, handler_name: str, handler: Callable):
        """Register a discovery handler"""
        self.discovery_handlers[handler_name] = handler
        logger.info(f"Registered discovery handler: {handler_name}")
    async def handle_discovery_request(self, message: AgentMessage) -> Optional[AgentMessage]:
        """Handle agent discovery request"""
        try:
            discovery_data = DiscoveryMessage(**message.payload)
            # Update or register agent
            agent_info = AgentInfo(
                agent_id=discovery_data.agent_id,
                agent_type=AgentType(discovery_data.agent_type),
                status=AgentStatus.ACTIVE,
                capabilities=discovery_data.capabilities,
                services=discovery_data.services,
                endpoints=discovery_data.endpoints,
                metadata=discovery_data.metadata,
                last_heartbeat=datetime.utcnow(),
                registration_time=datetime.utcnow()
            )
            # Register or update agent
            if discovery_data.agent_id in self.registry.agents:
                await self.registry.update_agent_status(discovery_data.agent_id, AgentStatus.ACTIVE)
            else:
                await self.registry.register_agent(agent_info)
            # Send response with available agents
            available_agents = await self.registry.discover_agents({
                "status": "active",
                "limit": 50
            })
            response_data = {
                "discovery_agents": [agent.to_dict() for agent in available_agents],
                "registry_stats": await self.registry.get_registry_stats()
            }
            response = AgentMessage(
                sender_id="discovery_service",
                receiver_id=message.sender_id,
                message_type=MessageType.DISCOVERY,
                payload=response_data,
                correlation_id=message.id
            )
            return response
        except Exception as e:
            logger.error(f"Error handling discovery request: {e}")
            return None
    async def find_best_agent(self, requirements: Dict[str, Any]) -> Optional[AgentInfo]:
        """Find the best agent for given requirements"""
        try:
            # Build discovery query
            query = {}
            if "agent_type" in requirements:
                query["agent_type"] = requirements["agent_type"]
            if "capabilities" in requirements:
                query["capabilities"] = requirements["capabilities"]
            if "services" in requirements:
                query["services"] = requirements["services"]
            if "min_health_score" in requirements:
                query["min_health_score"] = requirements["min_health_score"]
            # Discover agents
            agents = await self.registry.discover_agents(query)
            if not agents:
                return None
            # Select best agent (highest health score)
            return agents[0]
        except Exception as e:
            logger.error(f"Error finding best agent: {e}")
            return None
    async def get_service_endpoints(self, service: str) -> Dict[str, List[str]]:
        """Get all endpoints for a specific service"""
        try:
            agents = await self.registry.get_agents_by_service(service)
            endpoints = {}
            for agent in agents:
                for service_name, endpoint in agent.endpoints.items():
                    if service_name not in endpoints:
                        endpoints[service_name] = []
                    endpoints[service_name].append(endpoint)
            return endpoints
        except Exception as e:
            logger.error(f"Error getting service endpoints: {e}")
            return {}
 # Factory functions
 def create_agent_info(agent_id: str, agent_type: str, capabilities: List[str], services: List[str], endpoints: Dict[str, str]) -> AgentInfo:
    """Create agent information"""
    return AgentInfo(
        agent_id=agent_id,
        agent_type=AgentType(agent_type),
        status=AgentStatus.ACTIVE,
        capabilities=capabilities,
        services=services,
        endpoints=endpoints,
        metadata={},
        last_heartbeat=datetime.utcnow(),
        registration_time=datetime.utcnow()
    )
 # Example usage
 async def example_usage():
    """Example of how to use the agent discovery system"""
    # Create registry
    registry = AgentRegistry()
    await registry.start()
    # Create discovery service
    discovery_service = AgentDiscoveryService(registry)
    # Register an agent
    agent_info = create_agent_info(
        agent_id="agent-001",
        agent_type="worker",
        capabilities=["data_processing", "analysis"],
        services=["process_data", "analyze_results"],
        endpoints={"http": "http://localhost:8001", "ws": "ws://localhost:8002"}
    )
    await registry.register_agent(agent_info)
    # Discover agents
    agents = await registry.discover_agents({
        "capabilities": ["data_processing"],
        "status": "active"
    })
    print(f"Found {len(agents)} agents")
    # Find best agent
    best_agent = await discovery_service.find_best_agent({
        "capabilities": ["data_processing"],
        "min_health_score": 0.8
    })
    if best_agent:
        print(f"Best agent: {best_agent.agent_id}")
    await registry.stop()
 if __name__ == "__main__":
    asyncio.run(example_usage())
--- a/apps/agent-coordinator/src/app/routing/load_balancer.py
+++ b/apps/agent-coordinator/src/app/routing/load_balancer.py
@@ -0,0 +1,716 @@
 """
 Load Balancer for Agent Distribution and Task Assignment
 """
 import asyncio
 import json
 import logging
 from typing import Dict, List, Optional, Tuple, Any, Callable
 from dataclasses import dataclass, field
 from datetime import datetime, timedelta
 from enum import Enum
 import statistics
 import uuid
 from collections import defaultdict, deque
 from .agent_discovery import AgentRegistry, AgentInfo, AgentStatus, AgentType
 from ..protocols.message_types import TaskMessage, create_task_message
 from ..protocols.communication import AgentMessage, MessageType, Priority
 logger = logging.getLogger(__name__)
 class LoadBalancingStrategy(str, Enum):
    """Load balancing strategies"""
    ROUND_ROBIN = "round_robin"
    LEAST_CONNECTIONS = "least_connections"
    LEAST_RESPONSE_TIME = "least_response_time"
    WEIGHTED_ROUND_ROBIN = "weighted_round_robin"
    RESOURCE_BASED = "resource_based"
    CAPABILITY_BASED = "capability_based"
    PREDICTIVE = "predictive"
    CONSISTENT_HASH = "consistent_hash"
 class TaskPriority(str, Enum):
    """Task priority levels"""
    LOW = "low"
    NORMAL = "normal"
    HIGH = "high"
    CRITICAL = "critical"
    URGENT = "urgent"
@dataclass
 class LoadMetrics:
    """Agent load metrics"""
    cpu_usage: float = 0.0
    memory_usage: float = 0.0
    active_connections: int = 0
    pending_tasks: int = 0
    completed_tasks: int = 0
    failed_tasks: int = 0
    avg_response_time: float = 0.0
    last_updated: datetime = field(default_factory=datetime.utcnow)
    def to_dict(self) -> Dict[str, Any]:
        return {
            "cpu_usage": self.cpu_usage,
            "memory_usage": self.memory_usage,
            "active_connections": self.active_connections,
            "pending_tasks": self.pending_tasks,
            "completed_tasks": self.completed_tasks,
            "failed_tasks": self.failed_tasks,
            "avg_response_time": self.avg_response_time,
            "last_updated": self.last_updated.isoformat()
        }
@dataclass
 class TaskAssignment:
    """Task assignment record"""
    task_id: str
    agent_id: str
    assigned_at: datetime
    completed_at: Optional[datetime] = None
    status: str = "pending"
    response_time: Optional[float] = None
    success: bool = False
    error_message: Optional[str] = None
    def to_dict(self) -> Dict[str, Any]:
        return {
            "task_id": self.task_id,
            "agent_id": self.agent_id,
            "assigned_at": self.assigned_at.isoformat(),
            "completed_at": self.completed_at.isoformat() if self.completed_at else None,
            "status": self.status,
            "response_time": self.response_time,
            "success": self.success,
            "error_message": self.error_message
        }
@dataclass
 class AgentWeight:
    """Agent weight for load balancing"""
    agent_id: str
    weight: float = 1.0
    capacity: int = 100
    performance_score: float = 1.0
    reliability_score: float = 1.0
    last_updated: datetime = field(default_factory=datetime.utcnow)
 class LoadBalancer:
    """Advanced load balancer for agent distribution"""
    def __init__(self, registry: AgentRegistry):
        self.registry = registry
        self.strategy = LoadBalancingStrategy.LEAST_CONNECTIONS
        self.agent_weights: Dict[str, AgentWeight] = {}
        self.agent_metrics: Dict[str, LoadMetrics] = {}
        self.task_assignments: Dict[str, TaskAssignment] = {}
        self.assignment_history: deque = deque(maxlen=1000)
        self.round_robin_index = 0
        self.consistent_hash_ring: Dict[int, str] = {}
        self.prediction_models: Dict[str, Any] = {}
        # Statistics
        self.total_assignments = 0
        self.successful_assignments = 0
        self.failed_assignments = 0
    def set_strategy(self, strategy: LoadBalancingStrategy):
        """Set load balancing strategy"""
        self.strategy = strategy
        logger.info(f"Load balancing strategy changed to: {strategy.value}")
    def set_agent_weight(self, agent_id: str, weight: float, capacity: int = 100):
        """Set agent weight and capacity"""
        self.agent_weights[agent_id] = AgentWeight(
            agent_id=agent_id,
            weight=weight,
            capacity=capacity
        )
        logger.info(f"Set weight for agent {agent_id}: {weight}, capacity: {capacity}")
    def update_agent_metrics(self, agent_id: str, metrics: LoadMetrics):
        """Update agent load metrics"""
        self.agent_metrics[agent_id] = metrics
        self.agent_metrics[agent_id].last_updated = datetime.utcnow()
        # Update performance score based on metrics
        self._update_performance_score(agent_id, metrics)
    def _update_performance_score(self, agent_id: str, metrics: LoadMetrics):
        """Update agent performance score based on metrics"""
        if agent_id not in self.agent_weights:
            self.agent_weights[agent_id] = AgentWeight(agent_id=agent_id)
        weight = self.agent_weights[agent_id]
        # Calculate performance score (0.0 to 1.0)
        performance_factors = []
        # CPU usage factor (lower is better)
        cpu_factor = max(0.0, 1.0 - metrics.cpu_usage)
        performance_factors.append(cpu_factor)
        # Memory usage factor (lower is better)
        memory_factor = max(0.0, 1.0 - metrics.memory_usage)
        performance_factors.append(memory_factor)
        # Response time factor (lower is better)
        if metrics.avg_response_time > 0:
            response_factor = max(0.0, 1.0 - (metrics.avg_response_time / 10.0))  # 10s max
            performance_factors.append(response_factor)
        # Success rate factor (higher is better)
        total_tasks = metrics.completed_tasks + metrics.failed_tasks
        if total_tasks > 0:
            success_rate = metrics.completed_tasks / total_tasks
            performance_factors.append(success_rate)
        # Update performance score
        if performance_factors:
            weight.performance_score = statistics.mean(performance_factors)
        # Update reliability score
        if total_tasks > 10:  # Only update after enough tasks
            weight.reliability_score = success_rate
    async def assign_task(self, task_data: Dict[str, Any], requirements: Optional[Dict[str, Any]] = None) -> Optional[str]:
        """Assign task to best available agent"""
        try:
            # Find eligible agents
            eligible_agents = await self._find_eligible_agents(task_data, requirements)
            if not eligible_agents:
                logger.warning("No eligible agents found for task assignment")
                return None
            # Select best agent based on strategy
            selected_agent = await self._select_agent(eligible_agents, task_data)
            if not selected_agent:
                logger.warning("No agent selected for task assignment")
                return None
            # Create task assignment
            task_id = str(uuid.uuid4())
            assignment = TaskAssignment(
                task_id=task_id,
                agent_id=selected_agent,
                assigned_at=datetime.utcnow()
            )
            # Record assignment
            self.task_assignments[task_id] = assignment
            self.assignment_history.append(assignment)
            self.total_assignments += 1
            # Update agent metrics
            if selected_agent not in self.agent_metrics:
                self.agent_metrics[selected_agent] = LoadMetrics()
            self.agent_metrics[selected_agent].pending_tasks += 1
            logger.info(f"Task {task_id} assigned to agent {selected_agent}")
            return selected_agent
        except Exception as e:
            logger.error(f"Error assigning task: {e}")
            self.failed_assignments += 1
            return None
    async def complete_task(self, task_id: str, success: bool, response_time: Optional[float] = None, error_message: Optional[str] = None):
        """Mark task as completed"""
        try:
            if task_id not in self.task_assignments:
                logger.warning(f"Task assignment {task_id} not found")
                return
            assignment = self.task_assignments[task_id]
            assignment.completed_at = datetime.utcnow()
            assignment.status = "completed"
            assignment.success = success
            assignment.response_time = response_time
            assignment.error_message = error_message
            # Update agent metrics
            agent_id = assignment.agent_id
            if agent_id in self.agent_metrics:
                metrics = self.agent_metrics[agent_id]
                metrics.pending_tasks = max(0, metrics.pending_tasks - 1)
                if success:
                    metrics.completed_tasks += 1
                    self.successful_assignments += 1
                else:
                    metrics.failed_tasks += 1
                    self.failed_assignments += 1
                # Update average response time
                if response_time:
                    total_completed = metrics.completed_tasks + metrics.failed_tasks
                    if total_completed > 0:
                        metrics.avg_response_time = (
                            (metrics.avg_response_time * (total_completed - 1) + response_time) / total_completed
                        )
            logger.info(f"Task {task_id} completed by agent {assignment.agent_id}, success: {success}")
        except Exception as e:
            logger.error(f"Error completing task {task_id}: {e}")
    async def _find_eligible_agents(self, task_data: Dict[str, Any], requirements: Optional[Dict[str, Any]] = None) -> List[str]:
        """Find eligible agents for task"""
        try:
            # Build discovery query
            query = {"status": AgentStatus.ACTIVE}
            if requirements:
                if "agent_type" in requirements:
                    query["agent_type"] = requirements["agent_type"]
                if "capabilities" in requirements:
                    query["capabilities"] = requirements["capabilities"]
                if "services" in requirements:
                    query["services"] = requirements["services"]
                if "min_health_score" in requirements:
                    query["min_health_score"] = requirements["min_health_score"]
            # Discover agents
            agents = await self.registry.discover_agents(query)
            # Filter by capacity and load
            eligible_agents = []
            for agent in agents:
                agent_id = agent.agent_id
                # Check capacity
                if agent_id in self.agent_weights:
                    weight = self.agent_weights[agent_id]
                    current_load = self._get_agent_load(agent_id)
                    if current_load < weight.capacity:
                        eligible_agents.append(agent_id)
                else:
                    # Default capacity check
                    metrics = self.agent_metrics.get(agent_id, LoadMetrics())
                    if metrics.pending_tasks < 100:  # Default capacity
                        eligible_agents.append(agent_id)
            return eligible_agents
        except Exception as e:
            logger.error(f"Error finding eligible agents: {e}")
            return []
    def _get_agent_load(self, agent_id: str) -> int:
        """Get current load for agent"""
        metrics = self.agent_metrics.get(agent_id, LoadMetrics())
        return metrics.active_connections + metrics.pending_tasks
    async def _select_agent(self, eligible_agents: List[str], task_data: Dict[str, Any]) -> Optional[str]:
        """Select best agent based on current strategy"""
        if not eligible_agents:
            return None
        if self.strategy == LoadBalancingStrategy.ROUND_ROBIN:
            return self._round_robin_selection(eligible_agents)
        elif self.strategy == LoadBalancingStrategy.LEAST_CONNECTIONS:
            return self._least_connections_selection(eligible_agents)
        elif self.strategy == LoadBalancingStrategy.LEAST_RESPONSE_TIME:
            return self._least_response_time_selection(eligible_agents)
        elif self.strategy == LoadBalancingStrategy.WEIGHTED_ROUND_ROBIN:
            return self._weighted_round_robin_selection(eligible_agents)
        elif self.strategy == LoadBalancingStrategy.RESOURCE_BASED:
            return self._resource_based_selection(eligible_agents)
        elif self.strategy == LoadBalancingStrategy.CAPABILITY_BASED:
            return self._capability_based_selection(eligible_agents, task_data)
        elif self.strategy == LoadBalancingStrategy.PREDICTIVE:
            return self._predictive_selection(eligible_agents, task_data)
        elif self.strategy == LoadBalancingStrategy.CONSISTENT_HASH:
            return self._consistent_hash_selection(eligible_agents, task_data)
        else:
            return eligible_agents[0]
    def _round_robin_selection(self, agents: List[str]) -> str:
        """Round-robin agent selection"""
        agent = agents[self.round_robin_index % len(agents)]
        self.round_robin_index += 1
        return agent
    def _least_connections_selection(self, agents: List[str]) -> str:
        """Select agent with least connections"""
        min_connections = float('inf')
        selected_agent = None
        for agent_id in agents:
            metrics = self.agent_metrics.get(agent_id, LoadMetrics())
            connections = metrics.active_connections
            if connections < min_connections:
                min_connections = connections
                selected_agent = agent_id
        return selected_agent or agents[0]
    def _least_response_time_selection(self, agents: List[str]) -> str:
        """Select agent with least average response time"""
        min_response_time = float('inf')
        selected_agent = None
        for agent_id in agents:
            metrics = self.agent_metrics.get(agent_id, LoadMetrics())
            response_time = metrics.avg_response_time
            if response_time < min_response_time:
                min_response_time = response_time
                selected_agent = agent_id
        return selected_agent or agents[0]
    def _weighted_round_robin_selection(self, agents: List[str]) -> str:
        """Weighted round-robin selection"""
        # Calculate total weight
        total_weight = 0
        for agent_id in agents:
            weight = self.agent_weights.get(agent_id, AgentWeight(agent_id=agent_id))
            total_weight += weight.weight
        if total_weight == 0:
            return agents[0]
        # Select agent based on weight
        current_weight = self.round_robin_index % total_weight
        accumulated_weight = 0
        for agent_id in agents:
            weight = self.agent_weights.get(agent_id, AgentWeight(agent_id=agent_id))
            accumulated_weight += weight.weight
            if current_weight < accumulated_weight:
                self.round_robin_index += 1
                return agent_id
        return agents[0]
    def _resource_based_selection(self, agents: List[str]) -> str:
        """Resource-based selection considering CPU and memory"""
        best_score = -1
        selected_agent = None
        for agent_id in agents:
            metrics = self.agent_metrics.get(agent_id, LoadMetrics())
            # Calculate resource score (lower usage is better)
            cpu_score = max(0, 100 - metrics.cpu_usage)
            memory_score = max(0, 100 - metrics.memory_usage)
            resource_score = (cpu_score + memory_score) / 2
            # Apply performance weight
            weight = self.agent_weights.get(agent_id, AgentWeight(agent_id=agent_id))
            final_score = resource_score * weight.performance_score
            if final_score > best_score:
                best_score = final_score
                selected_agent = agent_id
        return selected_agent or agents[0]
    def _capability_based_selection(self, agents: List[str], task_data: Dict[str, Any]) -> str:
        """Capability-based selection considering task requirements"""
        required_capabilities = task_data.get("required_capabilities", [])
        if not required_capabilities:
            return agents[0]
        best_score = -1
        selected_agent = None
        for agent_id in agents:
            agent_info = self.registry.agents.get(agent_id)
            if not agent_info:
                continue
            # Calculate capability match score
            agent_capabilities = set(agent_info.capabilities)
            required_set = set(required_capabilities)
            if required_set.issubset(agent_capabilities):
                # Perfect match
                capability_score = 1.0
            else:
                # Partial match
                intersection = required_set.intersection(agent_capabilities)
                capability_score = len(intersection) / len(required_set)
            # Apply performance weight
            weight = self.agent_weights.get(agent_id, AgentWeight(agent_id=agent_id))
            final_score = capability_score * weight.performance_score
            if final_score > best_score:
                best_score = final_score
                selected_agent = agent_id
        return selected_agent or agents[0]
    def _predictive_selection(self, agents: List[str], task_data: Dict[str, Any]) -> str:
        """Predictive selection using historical performance"""
        task_type = task_data.get("task_type", "unknown")
        # Calculate predicted performance for each agent
        best_score = -1
        selected_agent = None
        for agent_id in agents:
            # Get historical performance for this task type
            score = self._calculate_predicted_score(agent_id, task_type)
            if score > best_score:
                best_score = score
                selected_agent = agent_id
        return selected_agent or agents[0]
    def _calculate_predicted_score(self, agent_id: str, task_type: str) -> float:
        """Calculate predicted performance score for agent"""
        # Simple prediction based on recent performance
        weight = self.agent_weights.get(agent_id, AgentWeight(agent_id=agent_id))
        # Base score from performance and reliability
        base_score = (weight.performance_score + weight.reliability_score) / 2
        # Adjust based on recent assignments
        recent_assignments = [a for a in self.assignment_history if a.agent_id == agent_id][-10:]
        if recent_assignments:
            success_rate = sum(1 for a in recent_assignments if a.success) / len(recent_assignments)
            base_score = base_score * 0.7 + success_rate * 0.3
        return base_score
    def _consistent_hash_selection(self, agents: List[str], task_data: Dict[str, Any]) -> str:
        """Consistent hash selection for sticky routing"""
        # Create hash key from task data
        hash_key = json.dumps(task_data, sort_keys=True)
        hash_value = int(hashlib.md5(hash_key.encode()).hexdigest(), 16)
        # Build hash ring if not exists
        if not self.consistent_hash_ring:
            self._build_hash_ring(agents)
        # Find agent on hash ring
        for hash_pos in sorted(self.consistent_hash_ring.keys()):
            if hash_value <= hash_pos:
                return self.consistent_hash_ring[hash_pos]
        # Wrap around
        return self.consistent_hash_ring[min(self.consistent_hash_ring.keys())]
    def _build_hash_ring(self, agents: List[str]):
        """Build consistent hash ring"""
        self.consistent_hash_ring = {}
        for agent_id in agents:
            # Create multiple virtual nodes for better distribution
            for i in range(100):
                virtual_key = f"{agent_id}:{i}"
                hash_value = int(hashlib.md5(virtual_key.encode()).hexdigest(), 16)
                self.consistent_hash_ring[hash_value] = agent_id
    def get_load_balancing_stats(self) -> Dict[str, Any]:
        """Get load balancing statistics"""
        return {
            "strategy": self.strategy.value,
            "total_assignments": self.total_assignments,
            "successful_assignments": self.successful_assignments,
            "failed_assignments": self.failed_assignments,
            "success_rate": self.successful_assignments / max(1, self.total_assignments),
            "active_agents": len(self.agent_metrics),
            "agent_weights": len(self.agent_weights),
            "avg_agent_load": statistics.mean([self._get_agent_load(a) for a in self.agent_metrics]) if self.agent_metrics else 0
        }
    def get_agent_stats(self, agent_id: str) -> Optional[Dict[str, Any]]:
        """Get detailed statistics for a specific agent"""
        if agent_id not in self.agent_metrics:
            return None
        metrics = self.agent_metrics[agent_id]
        weight = self.agent_weights.get(agent_id, AgentWeight(agent_id=agent_id))
        # Get recent assignments
        recent_assignments = [a for a in self.assignment_history if a.agent_id == agent_id][-10:]
        return {
            "agent_id": agent_id,
            "metrics": metrics.to_dict(),
            "weight": {
                "weight": weight.weight,
                "capacity": weight.capacity,
                "performance_score": weight.performance_score,
                "reliability_score": weight.reliability_score
            },
            "recent_assignments": [a.to_dict() for a in recent_assignments],
            "current_load": self._get_agent_load(agent_id)
        }
 class TaskDistributor:
    """Task distributor with advanced load balancing"""
    def __init__(self, load_balancer: LoadBalancer):
        self.load_balancer = load_balancer
        self.task_queue = asyncio.Queue()
        self.priority_queues = {
            TaskPriority.URGENT: asyncio.Queue(),
            TaskPriority.CRITICAL: asyncio.Queue(),
            TaskPriority.HIGH: asyncio.Queue(),
            TaskPriority.NORMAL: asyncio.Queue(),
            TaskPriority.LOW: asyncio.Queue()
        }
        self.distribution_stats = {
            "tasks_distributed": 0,
            "tasks_completed": 0,
            "tasks_failed": 0,
            "avg_distribution_time": 0.0
        }
    async def submit_task(self, task_data: Dict[str, Any], priority: TaskPriority = TaskPriority.NORMAL, requirements: Optional[Dict[str, Any]] = None):
        """Submit task for distribution"""
        task_info = {
            "task_data": task_data,
            "priority": priority,
            "requirements": requirements,
            "submitted_at": datetime.utcnow()
        }
        await self.priority_queues[priority].put(task_info)
        logger.info(f"Task submitted with priority {priority.value}")
    async def start_distribution(self):
        """Start task distribution loop"""
        while True:
            try:
                # Check queues in priority order
                task_info = None
                for priority in [TaskPriority.URGENT, TaskPriority.CRITICAL, TaskPriority.HIGH, TaskPriority.NORMAL, TaskPriority.LOW]:
                    queue = self.priority_queues[priority]
                    try:
                        task_info = queue.get_nowait()
                        break
                    except asyncio.QueueEmpty:
                        continue
                if task_info:
                    await self._distribute_task(task_info)
                else:
                    await asyncio.sleep(0.01)  # Small delay if no tasks
            except Exception as e:
                logger.error(f"Error in distribution loop: {e}")
                await asyncio.sleep(1)
    async def _distribute_task(self, task_info: Dict[str, Any]):
        """Distribute a single task"""
        start_time = datetime.utcnow()
        try:
            # Assign task
            agent_id = await self.load_balancer.assign_task(
                task_info["task_data"],
                task_info["requirements"]
            )
            if agent_id:
                # Create task message
                task_message = create_task_message(
                    sender_id="task_distributor",
                    receiver_id=agent_id,
                    task_type=task_info["task_data"].get("task_type", "unknown"),
                    task_data=task_info["task_data"]
                )
                # Send task to agent (implementation depends on communication system)
                # await self._send_task_to_agent(agent_id, task_message)
                self.distribution_stats["tasks_distributed"] += 1
                # Simulate task completion (in real implementation, this would be event-driven)
                asyncio.create_task(self._simulate_task_completion(task_info, agent_id))
            else:
                logger.warning(f"Failed to distribute task: no suitable agent found")
                self.distribution_stats["tasks_failed"] += 1
        except Exception as e:
            logger.error(f"Error distributing task: {e}")
            self.distribution_stats["tasks_failed"] += 1
        finally:
            # Update distribution time
            distribution_time = (datetime.utcnow() - start_time).total_seconds()
            total_distributed = self.distribution_stats["tasks_distributed"]
            self.distribution_stats["avg_distribution_time"] = (
                (self.distribution_stats["avg_distribution_time"] * (total_distributed - 1) + distribution_time) / total_distributed
                if total_distributed > 0 else distribution_time
            )
    async def _simulate_task_completion(self, task_info: Dict[str, Any], agent_id: str):
        """Simulate task completion (for testing)"""
        # Simulate task processing time
        processing_time = 1.0 + (hash(task_info["task_data"].get("task_id", "")) % 5)
        await asyncio.sleep(processing_time)
        # Mark task as completed
        success = hash(agent_id) % 10 > 1  # 90% success rate
        await self.load_balancer.complete_task(
            task_info["task_data"].get("task_id", str(uuid.uuid4())),
            success,
            processing_time
        )
        if success:
            self.distribution_stats["tasks_completed"] += 1
        else:
            self.distribution_stats["tasks_failed"] += 1
    def get_distribution_stats(self) -> Dict[str, Any]:
        """Get distribution statistics"""
        return {
            **self.distribution_stats,
            "load_balancer_stats": self.load_balancer.get_load_balancing_stats(),
            "queue_sizes": {
                priority.value: queue.qsize()
                for priority, queue in self.priority_queues.items()
            }
        }
 # Example usage
 async def example_usage():
    """Example of how to use the load balancer"""
    # Create registry and load balancer
    registry = AgentRegistry()
    await registry.start()
    load_balancer = LoadBalancer(registry)
    load_balancer.set_strategy(LoadBalancingStrategy.LEAST_CONNECTIONS)
    # Create task distributor
    distributor = TaskDistributor(load_balancer)
    # Submit some tasks
    for i in range(10):
        await distributor.submit_task({
            "task_id": f"task-{i}",
            "task_type": "data_processing",
            "data": f"sample_data_{i}"
        }, TaskPriority.NORMAL)
    # Start distribution (in real implementation, this would run in background)
    # await distributor.start_distribution()
    await registry.stop()
 if __name__ == "__main__":
    asyncio.run(example_usage())
--- a/apps/agent-coordinator/tests/test_communication.py
+++ b/apps/agent-coordinator/tests/test_communication.py
@@ -0,0 +1,326 @@
 """
 Tests for Agent Communication Protocols
 """
 import pytest
 import asyncio
 from datetime import datetime, timedelta
 from unittest.mock import Mock, AsyncMock
 from src.app.protocols.communication import (
    AgentMessage, MessageType, Priority, CommunicationProtocol,
    HierarchicalProtocol, PeerToPeerProtocol, BroadcastProtocol,
    CommunicationManager, MessageTemplates
 )
 class TestAgentMessage:
    """Test AgentMessage class"""
    def test_message_creation(self):
        """Test message creation"""
        message = AgentMessage(
            sender_id="agent-001",
            receiver_id="agent-002",
            message_type=MessageType.DIRECT,
            priority=Priority.NORMAL,
            payload={"data": "test"}
        )
        assert message.sender_id == "agent-001"
        assert message.receiver_id == "agent-002"
        assert message.message_type == MessageType.DIRECT
        assert message.priority == Priority.NORMAL
        assert message.payload["data"] == "test"
        assert message.ttl == 300
    def test_message_serialization(self):
        """Test message serialization"""
        message = AgentMessage(
            sender_id="agent-001",
            receiver_id="agent-002",
            message_type=MessageType.DIRECT,
            priority=Priority.NORMAL,
            payload={"data": "test"}
        )
        # To dict
        message_dict = message.to_dict()
        assert message_dict["sender_id"] == "agent-001"
        assert message_dict["message_type"] == "direct"
        assert message_dict["priority"] == "normal"
        # From dict
        restored_message = AgentMessage.from_dict(message_dict)
        assert restored_message.sender_id == message.sender_id
        assert restored_message.receiver_id == message.receiver_id
        assert restored_message.message_type == message.message_type
        assert restored_message.priority == message.priority
    def test_message_expiration(self):
        """Test message expiration"""
        old_message = AgentMessage(
            sender_id="agent-001",
            receiver_id="agent-002",
            message_type=MessageType.DIRECT,
            timestamp=datetime.utcnow() - timedelta(seconds=400),
            ttl=300
        )
        # Message should be expired
        age = (datetime.utcnow() - old_message.timestamp).total_seconds()
        assert age > old_message.ttl
 class TestHierarchicalProtocol:
    """Test HierarchicalProtocol class"""
    @pytest.fixture
    def master_protocol(self):
        """Create master protocol"""
        return HierarchicalProtocol("master-agent", is_master=True)
    @pytest.fixture
    def sub_protocol(self):
        """Create sub-agent protocol"""
        return HierarchicalProtocol("sub-agent", is_master=False)
    def test_add_sub_agent(self, master_protocol):
        """Test adding sub-agent"""
        master_protocol.add_sub_agent("sub-agent-001")
        assert "sub-agent-001" in master_protocol.sub_agents
    def test_send_to_sub_agents(self, master_protocol):
        """Test sending to sub-agents"""
        master_protocol.add_sub_agent("sub-agent-001")
        master_protocol.add_sub_agent("sub-agent-002")
        message = MessageTemplates.create_heartbeat("master-agent")
        # Mock the send_message method
        master_protocol.send_message = AsyncMock(return_value=True)
        # Should send to both sub-agents
        asyncio.run(master_protocol.send_to_sub_agents(message))
        # Check that send_message was called twice
        assert master_protocol.send_message.call_count == 2
    def test_send_to_master(self, sub_protocol):
        """Test sending to master"""
        sub_protocol.master_agent = "master-agent"
        message = MessageTemplates.create_status_update("sub-agent", {"status": "active"})
        # Mock the send_message method
        sub_protocol.send_message = AsyncMock(return_value=True)
        asyncio.run(sub_protocol.send_to_master(message))
        # Check that send_message was called once
        assert sub_protocol.send_message.call_count == 1
 class TestPeerToPeerProtocol:
    """Test PeerToPeerProtocol class"""
    @pytest.fixture
    def p2p_protocol(self):
        """Create P2P protocol"""
        return PeerToPeerProtocol("agent-001")
    def test_add_peer(self, p2p_protocol):
        """Test adding peer"""
        p2p_protocol.add_peer("agent-002", {"endpoint": "http://localhost:8002"})
        assert "agent-002" in p2p_protocol.peers
        assert p2p_protocol.peers["agent-002"]["endpoint"] == "http://localhost:8002"
    def test_remove_peer(self, p2p_protocol):
        """Test removing peer"""
        p2p_protocol.add_peer("agent-002", {"endpoint": "http://localhost:8002"})
        p2p_protocol.remove_peer("agent-002")
        assert "agent-002" not in p2p_protocol.peers
    def test_send_to_peer(self, p2p_protocol):
        """Test sending to peer"""
        p2p_protocol.add_peer("agent-002", {"endpoint": "http://localhost:8002"})
        message = MessageTemplates.create_task_assignment(
            "agent-001", "agent-002", {"task": "test"}
        )
        # Mock the send_message method
        p2p_protocol.send_message = AsyncMock(return_value=True)
        result = asyncio.run(p2p_protocol.send_to_peer(message, "agent-002"))
        assert result is True
        assert p2p_protocol.send_message.call_count == 1
 class TestBroadcastProtocol:
    """Test BroadcastProtocol class"""
    @pytest.fixture
    def broadcast_protocol(self):
        """Create broadcast protocol"""
        return BroadcastProtocol("agent-001", "test-channel")
    def test_subscribe_unsubscribe(self, broadcast_protocol):
        """Test subscribe and unsubscribe"""
        broadcast_protocol.subscribe("agent-002")
        assert "agent-002" in broadcast_protocol.subscribers
        broadcast_protocol.unsubscribe("agent-002")
        assert "agent-002" not in broadcast_protocol.subscribers
    def test_broadcast(self, broadcast_protocol):
        """Test broadcasting"""
        broadcast_protocol.subscribe("agent-002")
        broadcast_protocol.subscribe("agent-003")
        message = MessageTemplates.create_discovery("agent-001")
        # Mock the send_message method
        broadcast_protocol.send_message = AsyncMock(return_value=True)
        asyncio.run(broadcast_protocol.broadcast(message))
        # Should send to 2 subscribers (not including self)
        assert broadcast_protocol.send_message.call_count == 2
 class TestCommunicationManager:
    """Test CommunicationManager class"""
    @pytest.fixture
    def comm_manager(self):
        """Create communication manager"""
        return CommunicationManager("agent-001")
    def test_add_protocol(self, comm_manager):
        """Test adding protocol"""
        protocol = Mock(spec=CommunicationProtocol)
        comm_manager.add_protocol("test", protocol)
        assert "test" in comm_manager.protocols
        assert comm_manager.protocols["test"] == protocol
    def test_get_protocol(self, comm_manager):
        """Test getting protocol"""
        protocol = Mock(spec=CommunicationProtocol)
        comm_manager.add_protocol("test", protocol)
        retrieved_protocol = comm_manager.get_protocol("test")
        assert retrieved_protocol == protocol
        # Test non-existent protocol
        assert comm_manager.get_protocol("non-existent") is None
    @pytest.mark.asyncio
    async def test_send_message(self, comm_manager):
        """Test sending message"""
        protocol = Mock(spec=CommunicationProtocol)
        protocol.send_message = AsyncMock(return_value=True)
        comm_manager.add_protocol("test", protocol)
        message = MessageTemplates.create_heartbeat("agent-001")
        result = await comm_manager.send_message("test", message)
        assert result is True
        protocol.send_message.assert_called_once_with(message)
    @pytest.mark.asyncio
    async def test_register_handler(self, comm_manager):
        """Test registering handler"""
        protocol = Mock(spec=CommunicationProtocol)
        protocol.register_handler = AsyncMock()
        comm_manager.add_protocol("test", protocol)
        handler = AsyncMock()
        await comm_manager.register_handler("test", MessageType.HEARTBEAT, handler)
        protocol.register_handler.assert_called_once_with(MessageType.HEARTBEAT, handler)
 class TestMessageTemplates:
    """Test MessageTemplates class"""
    def test_create_heartbeat(self):
        """Test creating heartbeat message"""
        message = MessageTemplates.create_heartbeat("agent-001")
        assert message.sender_id == "agent-001"
        assert message.message_type == MessageType.HEARTBEAT
        assert message.priority == Priority.LOW
        assert "timestamp" in message.payload
    def test_create_task_assignment(self):
        """Test creating task assignment message"""
        task_data = {"task_id": "task-001", "task_type": "process_data"}
        message = MessageTemplates.create_task_assignment("agent-001", "agent-002", task_data)
        assert message.sender_id == "agent-001"
        assert message.receiver_id == "agent-002"
        assert message.message_type == MessageType.TASK_ASSIGNMENT
        assert message.payload == task_data
    def test_create_status_update(self):
        """Test creating status update message"""
        status_data = {"status": "active", "load": 0.5}
        message = MessageTemplates.create_status_update("agent-001", status_data)
        assert message.sender_id == "agent-001"
        assert message.message_type == MessageType.STATUS_UPDATE
        assert message.payload == status_data
    def test_create_discovery(self):
        """Test creating discovery message"""
        message = MessageTemplates.create_discovery("agent-001")
        assert message.sender_id == "agent-001"
        assert message.message_type == MessageType.DISCOVERY
        assert message.payload["agent_id"] == "agent-001"
    def test_create_consensus_request(self):
        """Test creating consensus request message"""
        proposal_data = {"proposal": "test_proposal"}
        message = MessageTemplates.create_consensus_request("agent-001", proposal_data)
        assert message.sender_id == "agent-001"
        assert message.message_type == MessageType.CONSENSUS
        assert message.priority == Priority.HIGH
        assert message.payload == proposal_data
 # Integration tests
 class TestCommunicationIntegration:
    """Integration tests for communication system"""
    @pytest.mark.asyncio
    async def test_message_flow(self):
        """Test complete message flow"""
        # Create communication manager
        comm_manager = CommunicationManager("agent-001")
        # Create protocols
        hierarchical = HierarchicalProtocol("agent-001", is_master=True)
        p2p = PeerToPeerProtocol("agent-001")
        # Add protocols
        comm_manager.add_protocol("hierarchical", hierarchical)
        comm_manager.add_protocol("p2p", p2p)
        # Mock message sending
        hierarchical.send_message = AsyncMock(return_value=True)
        p2p.send_message = AsyncMock(return_value=True)
        # Register handler
        async def handle_heartbeat(message):
            assert message.sender_id == "agent-002"
            assert message.message_type == MessageType.HEARTBEAT
        await comm_manager.register_handler("hierarchical", MessageType.HEARTBEAT, handle_heartbeat)
        # Send heartbeat
        heartbeat = MessageTemplates.create_heartbeat("agent-001")
        result = await comm_manager.send_message("hierarchical", heartbeat)
        assert result is True
        hierarchical.send_message.assert_called_once()
 if __name__ == "__main__":
    pytest.main([__file__])