Update database paths and fix foreign key references across coordinator API
- Change SQLite database path from `/home/oib/windsurf/aitbc/data/` to `/opt/data/`
- Fix foreign key references to use correct table names (users, wallets, gpu_registry)
- Replace governance router with new governance and community routers
- Add multi-modal RL router to main application
- Simplify DEPLOYMENT_READINESS_REPORT.md to focus on production deployment status
- Update governance router with decentralized DAO voting
This commit is contained in:
468
gpu_acceleration/parallel_processing/distributed_framework.py
Normal file
468
gpu_acceleration/parallel_processing/distributed_framework.py
Normal file
@@ -0,0 +1,468 @@
|
||||
"""
|
||||
Distributed Agent Processing Framework
|
||||
Implements a scalable, fault-tolerant framework for distributed AI agent tasks across the AITBC network.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import uuid
|
||||
import time
|
||||
import logging
|
||||
import json
|
||||
import hashlib
|
||||
from typing import Dict, List, Optional, Any, Callable, Awaitable
|
||||
from datetime import datetime
|
||||
from enum import Enum
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class TaskStatus(str, Enum):
    """Lifecycle states of a DistributedTask, from submission to completion."""
    PENDING = "pending"        # submitted and waiting in the priority queue
    SCHEDULED = "scheduled"    # assigned to a worker but not yet started
    PROCESSING = "processing"  # currently executing on a worker
    COMPLETED = "completed"    # finished successfully; `result` is populated
    FAILED = "failed"          # exhausted max_retries; `error` is populated
    TIMEOUT = "timeout"        # grouped with FAILED by status consumers; NOTE(review):
                               # not set anywhere in this module's visible code
    RETRYING = "retrying"      # failed an attempt and re-queued for another try
|
||||
|
||||
class WorkerStatus(str, Enum):
    """Availability states of a WorkerNode, as tracked by the coordinator."""
    IDLE = "idle"              # registered, no active tasks
    BUSY = "busy"              # has active tasks but spare capacity
    OFFLINE = "offline"        # missed heartbeats (>60s); excluded from scheduling
    OVERLOADED = "overloaded"  # at capacity or high CPU load; skipped by the scheduler
|
||||
|
||||
class DistributedTask:
    """A single unit of agent work to be executed somewhere on the cluster.

    Carries the payload plus all scheduling metadata: priority, GPU
    requirement, timeout, retry budget, lifecycle timestamps, and a
    deterministic content hash used for result caching / deduplication.
    """

    def __init__(
        self,
        task_id: str,
        agent_id: str,
        payload: Dict[str, Any],
        priority: int = 1,
        requires_gpu: bool = False,
        timeout_ms: int = 30000,
        max_retries: int = 3
    ):
        # Deterministic digest of the payload: identical payloads map to the
        # same hash, letting the coordinator serve repeats from its cache.
        serialized_payload = json.dumps(payload, sort_keys=True)
        self.content_hash = hashlib.sha256(serialized_payload.encode()).hexdigest()

        # Identity and configuration (a falsy task_id gets a generated one).
        self.task_id = task_id if task_id else f"dt_{uuid.uuid4().hex[:12]}"
        self.agent_id = agent_id
        self.payload = payload
        self.priority = priority
        self.requires_gpu = requires_gpu
        self.timeout_ms = timeout_ms
        self.max_retries = max_retries

        # Lifecycle state: timestamps (epoch seconds) are filled in as the
        # task moves through the scheduling pipeline.
        self.status = TaskStatus.PENDING
        self.created_at = time.time()
        self.scheduled_at = None
        self.started_at = None
        self.completed_at = None

        # Execution outcome, populated by the coordinator.
        self.assigned_worker_id = None
        self.result = None
        self.error = None
        self.retries = 0
|
||||
|
||||
class WorkerNode:
    """State record for one compute node participating in the cluster.

    Holds the node's static capabilities alongside the runtime bookkeeping
    (status, active task ids, heartbeat time, performance score) that the
    coordinator maintains on its behalf.
    """

    def __init__(
        self,
        worker_id: str,
        capabilities: List[str],
        has_gpu: bool = False,
        max_concurrent_tasks: int = 4
    ):
        # Static identity and declared capacity.
        self.worker_id = worker_id
        self.capabilities = capabilities
        self.has_gpu = has_gpu
        self.max_concurrent_tasks = max_concurrent_tasks

        # Runtime state, updated by the coordinator as tasks flow through.
        self.status = WorkerStatus.IDLE
        self.active_tasks = []           # task_ids currently assigned here
        self.last_heartbeat = time.time()
        self.total_completed = 0
        self.performance_score = 1.0  # 0.0 to 1.0 based on success rate and speed
|
||||
|
||||
class DistributedProcessingCoordinator:
    """
    Coordinates distributed task execution across available worker nodes.

    Implements priority-based scheduling, result caching by payload hash,
    retry-based fault tolerance, heartbeat-driven health monitoring, and
    load-balanced worker selection.
    """

    def __init__(self):
        # Every task ever submitted (pending, running, and finished), by id.
        self.tasks: Dict[str, DistributedTask] = {}
        # Registered worker nodes, by worker_id.
        self.workers: Dict[str, WorkerNode] = {}
        # Entries are (queue_priority, created_at, task_id); lowest tuple first.
        self.task_queue = asyncio.PriorityQueue()

        # Result cache (content_hash -> result)
        self.result_cache: Dict[str, Any] = {}

        self.is_running = False
        self._scheduler_task = None
        self._monitor_task = None

    async def start(self):
        """Start the coordinator background tasks (idempotent)."""
        if self.is_running:
            return

        self.is_running = True
        self._scheduler_task = asyncio.create_task(self._scheduling_loop())
        self._monitor_task = asyncio.create_task(self._health_monitor_loop())
        logger.info("Distributed Processing Coordinator started")

    async def stop(self):
        """Stop the coordinator gracefully, waiting for background loops to exit."""
        self.is_running = False
        # FIX: the background tasks were previously cancelled but never
        # awaited, so stop() could return while the loops were still
        # unwinding; await each cancellation for a deterministic shutdown.
        for bg_task in (self._scheduler_task, self._monitor_task):
            if bg_task:
                bg_task.cancel()
                try:
                    await bg_task
                except asyncio.CancelledError:
                    pass
        logger.info("Distributed Processing Coordinator stopped")

    def register_worker(self, worker_id: str, capabilities: List[str], has_gpu: bool = False, max_tasks: int = 4):
        """Register a new worker node, or refresh an existing registration.

        Re-registering also counts as a heartbeat; an OFFLINE worker is
        brought back to IDLE.
        """
        if worker_id not in self.workers:
            self.workers[worker_id] = WorkerNode(worker_id, capabilities, has_gpu, max_tasks)
            logger.info(f"Registered new worker node: {worker_id} (GPU: {has_gpu})")
        else:
            # Update existing worker in place.
            worker = self.workers[worker_id]
            worker.capabilities = capabilities
            worker.has_gpu = has_gpu
            worker.max_concurrent_tasks = max_tasks
            worker.last_heartbeat = time.time()
            if worker.status == WorkerStatus.OFFLINE:
                worker.status = WorkerStatus.IDLE

    def heartbeat(self, worker_id: str, metrics: Optional[Dict[str, Any]] = None):
        """Record a heartbeat from a worker node.

        Unknown worker ids are silently ignored. When `metrics` is provided
        (expects an optional 'cpu_load' float in [0, 1]), the worker status
        is re-derived from load and active task count.
        """
        if worker_id in self.workers:
            worker = self.workers[worker_id]
            worker.last_heartbeat = time.time()

            # Update status based on metrics if provided
            if metrics:
                cpu_load = metrics.get('cpu_load', 0.0)
                if cpu_load > 0.9 or len(worker.active_tasks) >= worker.max_concurrent_tasks:
                    worker.status = WorkerStatus.OVERLOADED
                elif len(worker.active_tasks) > 0:
                    worker.status = WorkerStatus.BUSY
                else:
                    worker.status = WorkerStatus.IDLE

    async def submit_task(self, task: DistributedTask) -> str:
        """Submit a task for distributed execution; returns its task_id.

        If an identical payload (same content hash) was already computed,
        the task completes immediately from the result cache without
        touching the queue.
        """
        # Check cache first
        if task.content_hash in self.result_cache:
            task.status = TaskStatus.COMPLETED
            task.result = self.result_cache[task.content_hash]
            task.completed_at = time.time()
            self.tasks[task.task_id] = task
            logger.debug(f"Task {task.task_id} fulfilled from cache")
            return task.task_id

        self.tasks[task.task_id] = task
        # Priority Queue uses lowest number first, so we invert user priority
        queue_priority = 100 - min(task.priority, 100)

        await self.task_queue.put((queue_priority, task.created_at, task.task_id))
        logger.debug(f"Task {task.task_id} queued with priority {task.priority}")

        return task.task_id

    async def get_task_status(self, task_id: str) -> Optional[Dict[str, Any]]:
        """Return the current status (plus result or error) of a task.

        Returns None for unknown task ids.
        """
        if task_id not in self.tasks:
            return None

        task = self.tasks[task_id]

        response = {
            'task_id': task.task_id,
            'status': task.status,
            'created_at': task.created_at
        }

        if task.status == TaskStatus.COMPLETED:
            response['result'] = task.result
            response['completed_at'] = task.completed_at
            # Cache hits never start, so fall back to created_at for duration.
            response['duration_ms'] = int((task.completed_at - (task.started_at or task.created_at)) * 1000)
        elif task.status in [TaskStatus.FAILED, TaskStatus.TIMEOUT]:
            response['error'] = str(task.error)

        if task.assigned_worker_id:
            response['worker_id'] = task.assigned_worker_id

        return response

    async def _scheduling_loop(self):
        """Background task that assigns queued tasks to available workers."""
        while self.is_running:
            try:
                # Poll rather than block indefinitely on get(), so the loop
                # can observe is_running and shut down promptly.
                if self.task_queue.empty():
                    await asyncio.sleep(0.1)
                    continue

                priority, _, task_id = await self.task_queue.get()

                # Task may have been purged since it was queued.
                if task_id not in self.tasks:
                    self.task_queue.task_done()
                    continue

                task = self.tasks[task_id]

                # If task was cancelled while in queue
                if task.status != TaskStatus.PENDING and task.status != TaskStatus.RETRYING:
                    self.task_queue.task_done()
                    continue

                # Find best worker
                best_worker = self._find_best_worker(task)

                if best_worker:
                    await self._assign_task(task, best_worker)
                else:
                    # No worker available right now, put back in queue with slight delay
                    # Use a background task to not block the scheduling loop
                    asyncio.create_task(self._requeue_delayed(priority, task))

                self.task_queue.task_done()

            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error(f"Error in scheduling loop: {e}")
                await asyncio.sleep(1.0)

    async def _requeue_delayed(self, priority: int, task: DistributedTask):
        """Put a task back in the queue after a short delay."""
        await asyncio.sleep(0.5)
        if self.is_running and task.status in [TaskStatus.PENDING, TaskStatus.RETRYING]:
            await self.task_queue.put((priority, task.created_at, task.task_id))

    def _find_best_worker(self, task: DistributedTask) -> Optional[WorkerNode]:
        """Find the optimal worker for a task based on requirements and load.

        Returns None when no eligible worker is currently available.
        """
        candidates = []

        for worker in self.workers.values():
            # Skip offline or overloaded workers
            if worker.status in [WorkerStatus.OFFLINE, WorkerStatus.OVERLOADED]:
                continue

            # Skip if worker is at capacity
            if len(worker.active_tasks) >= worker.max_concurrent_tasks:
                continue

            # Check GPU requirement
            if task.requires_gpu and not worker.has_gpu:
                continue

            # Required capability check could be added here

            # Calculate score for worker
            score = worker.performance_score * 100

            # Penalize slightly based on current load to balance distribution
            load_factor = len(worker.active_tasks) / worker.max_concurrent_tasks
            score -= (load_factor * 20)

            # Prefer GPU workers for GPU tasks, penalize GPU workers for CPU tasks
            # to keep them free for GPU workloads
            if worker.has_gpu and not task.requires_gpu:
                score -= 30

            candidates.append((score, worker))

        if not candidates:
            return None

        # Pick the highest-scoring worker directly (O(n)) instead of sorting
        # the whole candidate list; ties resolve to the first-seen worker,
        # matching the previous stable-sort behavior.
        return max(candidates, key=lambda c: c[0])[1]

    async def _assign_task(self, task: DistributedTask, worker: WorkerNode):
        """Assign a task to a specific worker and dispatch it asynchronously."""
        task.status = TaskStatus.SCHEDULED
        task.assigned_worker_id = worker.worker_id
        task.scheduled_at = time.time()

        worker.active_tasks.append(task.task_id)
        if len(worker.active_tasks) >= worker.max_concurrent_tasks:
            worker.status = WorkerStatus.OVERLOADED
        elif worker.status == WorkerStatus.IDLE:
            worker.status = WorkerStatus.BUSY

        logger.debug(f"Assigned task {task.task_id} to worker {worker.worker_id}")

        # In a real system, this would make an RPC/network call to the worker
        # Here we simulate the network dispatch asynchronously
        asyncio.create_task(self._simulate_worker_execution(task, worker))

    async def _simulate_worker_execution(self, task: DistributedTask, worker: WorkerNode):
        """Simulate the execution on the remote worker node."""
        task.status = TaskStatus.PROCESSING
        task.started_at = time.time()

        try:
            # Simulate processing time based on task complexity
            # Real implementation would await the actual RPC response
            complexity = task.payload.get('complexity', 1.0)
            base_time = 0.5

            if worker.has_gpu and task.requires_gpu:
                # GPU processes faster
                processing_time = base_time * complexity * 0.2
            else:
                processing_time = base_time * complexity

            # Simulate potential network/node failure
            if worker.performance_score < 0.5 and time.time() % 10 < 1:
                raise ConnectionError("Worker node network failure")

            await asyncio.sleep(processing_time)

            # Success
            self.report_task_success(task.task_id, {"result_data": "simulated_success", "processed_by": worker.worker_id})

        except Exception as e:
            self.report_task_failure(task.task_id, str(e))

    def report_task_success(self, task_id: str, result: Any):
        """Called by a worker when a task completes successfully.

        Records the result, caches it by content hash, and updates the
        assigned worker's load/status/performance bookkeeping.
        """
        if task_id not in self.tasks:
            return

        task = self.tasks[task_id]
        if task.status in [TaskStatus.COMPLETED, TaskStatus.FAILED, TaskStatus.TIMEOUT]:
            return  # Already finished

        task.status = TaskStatus.COMPLETED
        task.result = result
        task.completed_at = time.time()

        # Cache the result
        self.result_cache[task.content_hash] = result

        # Update worker metrics
        if task.assigned_worker_id and task.assigned_worker_id in self.workers:
            worker = self.workers[task.assigned_worker_id]
            if task_id in worker.active_tasks:
                worker.active_tasks.remove(task_id)
            worker.total_completed += 1
            # Increase performance score slightly (max 1.0)
            worker.performance_score = min(1.0, worker.performance_score + 0.01)

            if len(worker.active_tasks) < worker.max_concurrent_tasks and worker.status == WorkerStatus.OVERLOADED:
                worker.status = WorkerStatus.BUSY
            if len(worker.active_tasks) == 0:
                worker.status = WorkerStatus.IDLE

        logger.info(f"Task {task_id} completed successfully")

    def report_task_failure(self, task_id: str, error: str):
        """Called when a task fails execution.

        Penalizes the assigned worker's performance score, then either
        re-queues the task (status RETRYING) or marks it permanently
        FAILED once max_retries is exhausted.
        """
        if task_id not in self.tasks:
            return

        task = self.tasks[task_id]

        # Update worker metrics
        if task.assigned_worker_id and task.assigned_worker_id in self.workers:
            worker = self.workers[task.assigned_worker_id]
            if task_id in worker.active_tasks:
                worker.active_tasks.remove(task_id)
            # Decrease performance score heavily on failure
            worker.performance_score = max(0.1, worker.performance_score - 0.05)

        # Handle retry logic
        if task.retries < task.max_retries:
            task.retries += 1
            task.status = TaskStatus.RETRYING
            task.assigned_worker_id = None
            task.error = f"Attempt {task.retries} failed: {error}"

            logger.warning(f"Task {task_id} failed, scheduling retry {task.retries}/{task.max_retries}")

            # Put back in queue with slightly lower priority
            queue_priority = (100 - min(task.priority, 100)) + (task.retries * 5)
            asyncio.create_task(self.task_queue.put((queue_priority, time.time(), task.task_id)))
        else:
            task.status = TaskStatus.FAILED
            task.error = f"Max retries exceeded. Final error: {error}"
            task.completed_at = time.time()
            logger.error(f"Task {task_id} failed permanently")

    async def _health_monitor_loop(self):
        """Background task that monitors worker health and task timeouts."""
        while self.is_running:
            try:
                current_time = time.time()

                # 1. Check worker health
                for worker_id, worker in self.workers.items():
                    # If no heartbeat for 60 seconds, mark offline
                    if current_time - worker.last_heartbeat > 60.0:
                        if worker.status != WorkerStatus.OFFLINE:
                            logger.warning(f"Worker {worker_id} went offline (missed heartbeats)")
                        worker.status = WorkerStatus.OFFLINE

                        # Re-queue all active tasks for this worker.
                        # FIX: iterate over a snapshot — report_task_failure()
                        # removes entries from worker.active_tasks, and mutating
                        # the list while iterating it would skip tasks.
                        for task_id in list(worker.active_tasks):
                            if task_id in self.tasks:
                                self.report_task_failure(task_id, "Worker node disconnected")
                        worker.active_tasks.clear()

                # 2. Check task timeouts
                for task_id, task in self.tasks.items():
                    if task.status in [TaskStatus.SCHEDULED, TaskStatus.PROCESSING]:
                        start_time = task.started_at or task.scheduled_at
                        if start_time and (current_time - start_time) * 1000 > task.timeout_ms:
                            logger.warning(f"Task {task_id} timed out")
                            self.report_task_failure(task_id, f"Execution timed out after {task.timeout_ms}ms")

                await asyncio.sleep(5.0)  # Check every 5 seconds

            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error(f"Error in health monitor loop: {e}")
                await asyncio.sleep(5.0)

    def get_cluster_status(self) -> Dict[str, Any]:
        """Get the overall status of the distributed cluster."""
        total_workers = len(self.workers)
        active_workers = sum(1 for w in self.workers.values() if w.status != WorkerStatus.OFFLINE)
        gpu_workers = sum(1 for w in self.workers.values() if w.has_gpu and w.status != WorkerStatus.OFFLINE)

        pending_tasks = sum(1 for t in self.tasks.values() if t.status == TaskStatus.PENDING)
        processing_tasks = sum(1 for t in self.tasks.values() if t.status in [TaskStatus.SCHEDULED, TaskStatus.PROCESSING])
        completed_tasks = sum(1 for t in self.tasks.values() if t.status == TaskStatus.COMPLETED)
        failed_tasks = sum(1 for t in self.tasks.values() if t.status in [TaskStatus.FAILED, TaskStatus.TIMEOUT])

        # Calculate cluster utilization
        total_capacity = sum(w.max_concurrent_tasks for w in self.workers.values() if w.status != WorkerStatus.OFFLINE)
        current_load = sum(len(w.active_tasks) for w in self.workers.values() if w.status != WorkerStatus.OFFLINE)

        utilization = (current_load / total_capacity * 100) if total_capacity > 0 else 0

        return {
            "cluster_health": "healthy" if active_workers > 0 else "offline",
            "nodes": {
                "total": total_workers,
                "active": active_workers,
                "with_gpu": gpu_workers
            },
            "tasks": {
                "pending": pending_tasks,
                "processing": processing_tasks,
                "completed": completed_tasks,
                "failed": failed_tasks
            },
            "performance": {
                "utilization_percent": round(utilization, 2),
                "cache_size": len(self.result_cache)
            },
            # NOTE(review): datetime.utcnow() is deprecated in Python 3.12;
            # kept here to preserve the exact timestamp format (no tz offset).
            "timestamp": datetime.utcnow().isoformat()
        }
|
||||
Reference in New Issue
Block a user