Update database paths and fix foreign key references across coordinator API
- Change SQLite database path from `/home/oib/windsurf/aitbc/data/` to `/opt/data/` - Fix foreign key references to use correct table names (users, wallets, gpu_registry) - Replace governance router with new governance and community routers - Add multi-modal RL router to main application - Simplify DEPLOYMENT_READINESS_REPORT.md to focus on production deployment status - Update governance router with decentralized DAO voting
This commit is contained in:
468
gpu_acceleration/parallel_processing/distributed_framework.py
Normal file
468
gpu_acceleration/parallel_processing/distributed_framework.py
Normal file
@@ -0,0 +1,468 @@
|
||||
"""
|
||||
Distributed Agent Processing Framework
|
||||
Implements a scalable, fault-tolerant framework for distributed AI agent tasks across the AITBC network.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import uuid
|
||||
import time
|
||||
import logging
|
||||
import json
|
||||
import hashlib
|
||||
from typing import Dict, List, Optional, Any, Callable, Awaitable
|
||||
from datetime import datetime
|
||||
from enum import Enum
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class TaskStatus(str, Enum):
    """Lifecycle states of a distributed task (string-valued)."""
    PENDING = "pending"        # submitted, waiting in the scheduler queue
    SCHEDULED = "scheduled"    # assigned to a worker, not yet running
    PROCESSING = "processing"  # currently executing on a worker
    COMPLETED = "completed"    # finished successfully (terminal)
    FAILED = "failed"          # retries exhausted (terminal)
    TIMEOUT = "timeout"        # exceeded timeout_ms (terminal)
    RETRYING = "retrying"      # a failed attempt has been re-queued
|
||||
|
||||
class WorkerStatus(str, Enum):
    """Health/load states of a worker node (string-valued)."""
    IDLE = "idle"              # registered, no active tasks
    BUSY = "busy"              # has active tasks, below capacity
    OFFLINE = "offline"        # missed heartbeats
    OVERLOADED = "overloaded"  # at task capacity or high CPU load
|
||||
|
||||
class DistributedTask:
    """A single unit of work dispatched to a remote worker node.

    Carries the request payload plus all lifecycle/retry bookkeeping, and a
    SHA-256 content hash used for result caching and deduplication.
    """

    def __init__(
        self,
        task_id: str,
        agent_id: str,
        payload: Dict[str, Any],
        priority: int = 1,
        requires_gpu: bool = False,
        timeout_ms: int = 30000,
        max_retries: int = 3
    ):
        # A falsy task_id means "generate one".
        self.task_id = task_id or f"dt_{uuid.uuid4().hex[:12]}"
        self.agent_id = agent_id
        self.payload = payload
        self.priority = priority
        self.requires_gpu = requires_gpu
        self.timeout_ms = timeout_ms
        self.max_retries = max_retries

        # Lifecycle bookkeeping; timestamps are filled in as the task moves
        # through the scheduler.
        self.status = TaskStatus.PENDING
        self.created_at = time.time()
        self.scheduled_at = None
        self.started_at = None
        self.completed_at = None

        # Execution outcome.
        self.assigned_worker_id = None
        self.result = None
        self.error = None
        self.retries = 0

        # Deterministic payload hash (sorted keys) for caching/deduplication.
        canonical = json.dumps(payload, sort_keys=True)
        self.content_hash = hashlib.sha256(canonical.encode()).hexdigest()
|
||||
|
||||
class WorkerNode:
    """A registered compute node in the cluster.

    Holds static registration data (capabilities, GPU flag, capacity) plus
    runtime state maintained by the coordinator.
    """

    def __init__(self, worker_id: str, capabilities: List[str], has_gpu: bool = False, max_concurrent_tasks: int = 4):
        # Registration data.
        self.worker_id = worker_id
        self.capabilities = capabilities
        self.has_gpu = has_gpu
        self.max_concurrent_tasks = max_concurrent_tasks

        # Runtime state managed by the coordinator.
        self.status = WorkerStatus.IDLE
        self.active_tasks = []  # task_ids currently assigned to this node
        self.last_heartbeat = time.time()
        self.total_completed = 0
        # Rolling quality score in [0.0, 1.0] based on success rate and speed.
        self.performance_score = 1.0
|
||||
|
||||
class DistributedProcessingCoordinator:
    """
    Coordinates distributed task execution across available worker nodes.

    Responsibilities:
      * priority scheduling of DistributedTask objects onto workers
      * worker registration, heartbeat tracking and health monitoring
      * retry handling, timeout enforcement and result caching
    """

    def __init__(self):
        # All submitted tasks by id (any status; never pruned here).
        self.tasks: Dict[str, DistributedTask] = {}
        # Registered workers by id.
        self.workers: Dict[str, WorkerNode] = {}
        # Queue entries are (queue_priority, created_at, task_id) tuples;
        # PriorityQueue pops the lowest tuple first.
        self.task_queue = asyncio.PriorityQueue()

        # Result cache (content_hash -> result) so identical payloads are
        # served without re-execution.
        self.result_cache: Dict[str, Any] = {}

        self.is_running = False
        self._scheduler_task = None
        self._monitor_task = None

    async def start(self):
        """Start the scheduler and health-monitor background tasks (idempotent)."""
        if self.is_running:
            return

        self.is_running = True
        self._scheduler_task = asyncio.create_task(self._scheduling_loop())
        self._monitor_task = asyncio.create_task(self._health_monitor_loop())
        logger.info("Distributed Processing Coordinator started")

    async def stop(self):
        """Stop the coordinator gracefully by cancelling its background tasks."""
        self.is_running = False
        if self._scheduler_task:
            self._scheduler_task.cancel()
        if self._monitor_task:
            self._monitor_task.cancel()
        logger.info("Distributed Processing Coordinator stopped")

    def register_worker(self, worker_id: str, capabilities: List[str], has_gpu: bool = False, max_tasks: int = 4):
        """Register a new worker node, or refresh an existing registration.

        Re-registering an OFFLINE worker brings it back to IDLE.
        """
        if worker_id not in self.workers:
            self.workers[worker_id] = WorkerNode(worker_id, capabilities, has_gpu, max_tasks)
            logger.info(f"Registered new worker node: {worker_id} (GPU: {has_gpu})")
        else:
            # Update existing worker in place so active-task tracking survives.
            worker = self.workers[worker_id]
            worker.capabilities = capabilities
            worker.has_gpu = has_gpu
            worker.max_concurrent_tasks = max_tasks
            worker.last_heartbeat = time.time()
            if worker.status == WorkerStatus.OFFLINE:
                worker.status = WorkerStatus.IDLE

    def heartbeat(self, worker_id: str, metrics: Optional[Dict[str, Any]] = None):
        """Record a heartbeat from a worker node.

        When metrics are supplied, only 'cpu_load' is inspected; combined
        with queue depth it determines the worker status.
        Heartbeats from unknown workers are silently ignored.
        """
        if worker_id in self.workers:
            worker = self.workers[worker_id]
            worker.last_heartbeat = time.time()

            # Update status based on metrics if provided
            if metrics:
                cpu_load = metrics.get('cpu_load', 0.0)
                if cpu_load > 0.9 or len(worker.active_tasks) >= worker.max_concurrent_tasks:
                    worker.status = WorkerStatus.OVERLOADED
                elif len(worker.active_tasks) > 0:
                    worker.status = WorkerStatus.BUSY
                else:
                    worker.status = WorkerStatus.IDLE

    async def submit_task(self, task: DistributedTask) -> str:
        """Submit a new task and return its id.

        Tasks whose payload hash is already in the result cache are marked
        COMPLETED immediately without touching the queue.
        """
        # Check cache first
        if task.content_hash in self.result_cache:
            task.status = TaskStatus.COMPLETED
            task.result = self.result_cache[task.content_hash]
            task.completed_at = time.time()
            self.tasks[task.task_id] = task
            logger.debug(f"Task {task.task_id} fulfilled from cache")
            return task.task_id

        self.tasks[task.task_id] = task
        # PriorityQueue pops the lowest number first, so invert user priority
        # (user priority 100 -> queue priority 0 = most urgent).
        queue_priority = 100 - min(task.priority, 100)

        await self.task_queue.put((queue_priority, task.created_at, task.task_id))
        logger.debug(f"Task {task.task_id} queued with priority {task.priority}")

        return task.task_id

    async def get_task_status(self, task_id: str) -> Optional[Dict[str, Any]]:
        """Return status (plus result/error if finished) of a task, or None if unknown."""
        if task_id not in self.tasks:
            return None

        task = self.tasks[task_id]

        response = {
            'task_id': task.task_id,
            'status': task.status,
            'created_at': task.created_at
        }

        if task.status == TaskStatus.COMPLETED:
            response['result'] = task.result
            response['completed_at'] = task.completed_at
            # Cache hits never started executing; fall back to created_at.
            response['duration_ms'] = int((task.completed_at - (task.started_at or task.created_at)) * 1000)
        elif task.status in [TaskStatus.FAILED, TaskStatus.TIMEOUT]:
            response['error'] = str(task.error)

        if task.assigned_worker_id:
            response['worker_id'] = task.assigned_worker_id

        return response

    async def _scheduling_loop(self):
        """Background task that assigns queued tasks to available workers."""
        while self.is_running:
            try:
                # Poll instead of blocking on get() so the loop notices
                # shutdown promptly.
                if self.task_queue.empty():
                    await asyncio.sleep(0.1)
                    continue

                priority, _, task_id = await self.task_queue.get()

                # Task may no longer exist by the time it is dequeued.
                if task_id not in self.tasks:
                    self.task_queue.task_done()
                    continue

                task = self.tasks[task_id]

                # Skip tasks that left the schedulable states while queued
                # (e.g. completed from cache or otherwise finalized).
                if task.status != TaskStatus.PENDING and task.status != TaskStatus.RETRYING:
                    self.task_queue.task_done()
                    continue

                # Find best worker
                best_worker = self._find_best_worker(task)

                if best_worker:
                    await self._assign_task(task, best_worker)
                else:
                    # No worker available right now; requeue after a short
                    # delay in a background task so this loop is not blocked.
                    asyncio.create_task(self._requeue_delayed(priority, task))

                self.task_queue.task_done()

            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error(f"Error in scheduling loop: {e}")
                await asyncio.sleep(1.0)

    async def _requeue_delayed(self, priority: int, task: DistributedTask):
        """Put a task back in the queue after a short delay."""
        await asyncio.sleep(0.5)
        if self.is_running and task.status in [TaskStatus.PENDING, TaskStatus.RETRYING]:
            await self.task_queue.put((priority, task.created_at, task.task_id))

    def _find_best_worker(self, task: DistributedTask) -> Optional[WorkerNode]:
        """Find the optimal worker for a task based on requirements and load.

        Returns None when no eligible worker is currently available.
        """
        candidates = []

        for worker in self.workers.values():
            # Skip offline or overloaded workers
            if worker.status in [WorkerStatus.OFFLINE, WorkerStatus.OVERLOADED]:
                continue

            # Skip if worker is at capacity
            if len(worker.active_tasks) >= worker.max_concurrent_tasks:
                continue

            # Check GPU requirement
            if task.requires_gpu and not worker.has_gpu:
                continue

            # Required capability check could be added here

            # Base score from historical performance.
            score = worker.performance_score * 100

            # Penalize slightly based on current load to balance distribution
            load_factor = len(worker.active_tasks) / worker.max_concurrent_tasks
            score -= (load_factor * 20)

            # Keep GPU workers free for GPU workloads: penalize them for
            # CPU-only tasks.
            if worker.has_gpu and not task.requires_gpu:
                score -= 30

            candidates.append((score, worker))

        if not candidates:
            return None

        # Return worker with highest score
        candidates.sort(key=lambda x: x[0], reverse=True)
        return candidates[0][1]

    async def _assign_task(self, task: DistributedTask, worker: WorkerNode):
        """Assign a task to a specific worker and dispatch it."""
        task.status = TaskStatus.SCHEDULED
        task.assigned_worker_id = worker.worker_id
        task.scheduled_at = time.time()

        worker.active_tasks.append(task.task_id)
        if len(worker.active_tasks) >= worker.max_concurrent_tasks:
            worker.status = WorkerStatus.OVERLOADED
        elif worker.status == WorkerStatus.IDLE:
            worker.status = WorkerStatus.BUSY

        logger.debug(f"Assigned task {task.task_id} to worker {worker.worker_id}")

        # In a real system, this would make an RPC/network call to the worker.
        # Here we simulate the network dispatch asynchronously.
        asyncio.create_task(self._simulate_worker_execution(task, worker))

    async def _simulate_worker_execution(self, task: DistributedTask, worker: WorkerNode):
        """Simulate the execution on the remote worker node."""
        task.status = TaskStatus.PROCESSING
        task.started_at = time.time()

        try:
            # Simulate processing time based on task complexity.
            # A real implementation would await the actual RPC response.
            complexity = task.payload.get('complexity', 1.0)
            base_time = 0.5

            if worker.has_gpu and task.requires_gpu:
                # GPU processes faster
                processing_time = base_time * complexity * 0.2
            else:
                processing_time = base_time * complexity

            # Simulate occasional network/node failure for low-scoring workers.
            if worker.performance_score < 0.5 and time.time() % 10 < 1:
                raise ConnectionError("Worker node network failure")

            await asyncio.sleep(processing_time)

            # Success
            self.report_task_success(task.task_id, {"result_data": "simulated_success", "processed_by": worker.worker_id})

        except Exception as e:
            self.report_task_failure(task.task_id, str(e))

    def report_task_success(self, task_id: str, result: Any):
        """Called by a worker when a task completes successfully.

        No-ops for unknown tasks and tasks already in a terminal state.
        """
        if task_id not in self.tasks:
            return

        task = self.tasks[task_id]
        if task.status in [TaskStatus.COMPLETED, TaskStatus.FAILED, TaskStatus.TIMEOUT]:
            return  # Already finished

        task.status = TaskStatus.COMPLETED
        task.result = result
        task.completed_at = time.time()

        # Cache the result so identical future payloads skip execution.
        self.result_cache[task.content_hash] = result

        # Update worker metrics
        if task.assigned_worker_id and task.assigned_worker_id in self.workers:
            worker = self.workers[task.assigned_worker_id]
            if task_id in worker.active_tasks:
                worker.active_tasks.remove(task_id)
            worker.total_completed += 1
            # Increase performance score slightly (max 1.0)
            worker.performance_score = min(1.0, worker.performance_score + 0.01)

            if len(worker.active_tasks) < worker.max_concurrent_tasks and worker.status == WorkerStatus.OVERLOADED:
                worker.status = WorkerStatus.BUSY
            if len(worker.active_tasks) == 0:
                worker.status = WorkerStatus.IDLE

        logger.info(f"Task {task_id} completed successfully")

    def report_task_failure(self, task_id: str, error: str):
        """Called when a task fails execution; requeues it or marks it FAILED.

        No-ops for unknown tasks and tasks already in a terminal state.
        """
        if task_id not in self.tasks:
            return

        task = self.tasks[task_id]
        # FIX: guard terminal states so a late failure report (e.g. a timeout
        # racing a success) cannot clobber a finished task — mirrors the
        # guard in report_task_success.
        if task.status in [TaskStatus.COMPLETED, TaskStatus.FAILED, TaskStatus.TIMEOUT]:
            return

        # Update worker metrics
        if task.assigned_worker_id and task.assigned_worker_id in self.workers:
            worker = self.workers[task.assigned_worker_id]
            if task_id in worker.active_tasks:
                worker.active_tasks.remove(task_id)
            # Decrease performance score heavily on failure
            worker.performance_score = max(0.1, worker.performance_score - 0.05)

        # Handle retry logic
        if task.retries < task.max_retries:
            task.retries += 1
            task.status = TaskStatus.RETRYING
            task.assigned_worker_id = None
            task.error = f"Attempt {task.retries} failed: {error}"

            logger.warning(f"Task {task_id} failed, scheduling retry {task.retries}/{task.max_retries}")

            # Put back in queue with slightly lower priority
            queue_priority = (100 - min(task.priority, 100)) + (task.retries * 5)
            asyncio.create_task(self.task_queue.put((queue_priority, time.time(), task.task_id)))
        else:
            task.status = TaskStatus.FAILED
            task.error = f"Max retries exceeded. Final error: {error}"
            task.completed_at = time.time()
            logger.error(f"Task {task_id} failed permanently")

    async def _health_monitor_loop(self):
        """Background task that monitors worker health and task timeouts."""
        while self.is_running:
            try:
                current_time = time.time()

                # 1. Check worker health
                for worker_id, worker in self.workers.items():
                    # If no heartbeat for 60 seconds, mark offline
                    if current_time - worker.last_heartbeat > 60.0:
                        if worker.status != WorkerStatus.OFFLINE:
                            logger.warning(f"Worker {worker_id} went offline (missed heartbeats)")
                            worker.status = WorkerStatus.OFFLINE

                            # Re-queue all active tasks for this worker.
                            # FIX: iterate a snapshot — report_task_failure
                            # removes entries from worker.active_tasks, and
                            # mutating a list while iterating it skips items.
                            for task_id in list(worker.active_tasks):
                                if task_id in self.tasks:
                                    self.report_task_failure(task_id, "Worker node disconnected")
                            worker.active_tasks.clear()

                # 2. Check task timeouts (snapshot for the same defensive reason)
                for task_id, task in list(self.tasks.items()):
                    if task.status in [TaskStatus.SCHEDULED, TaskStatus.PROCESSING]:
                        start_time = task.started_at or task.scheduled_at
                        if start_time and (current_time - start_time) * 1000 > task.timeout_ms:
                            logger.warning(f"Task {task_id} timed out")
                            self.report_task_failure(task_id, f"Execution timed out after {task.timeout_ms}ms")

                await asyncio.sleep(5.0)  # Check every 5 seconds

            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error(f"Error in health monitor loop: {e}")
                await asyncio.sleep(5.0)

    def get_cluster_status(self) -> Dict[str, Any]:
        """Get the overall status of the distributed cluster as a plain dict."""
        total_workers = len(self.workers)
        active_workers = sum(1 for w in self.workers.values() if w.status != WorkerStatus.OFFLINE)
        gpu_workers = sum(1 for w in self.workers.values() if w.has_gpu and w.status != WorkerStatus.OFFLINE)

        pending_tasks = sum(1 for t in self.tasks.values() if t.status == TaskStatus.PENDING)
        processing_tasks = sum(1 for t in self.tasks.values() if t.status in [TaskStatus.SCHEDULED, TaskStatus.PROCESSING])
        completed_tasks = sum(1 for t in self.tasks.values() if t.status == TaskStatus.COMPLETED)
        failed_tasks = sum(1 for t in self.tasks.values() if t.status in [TaskStatus.FAILED, TaskStatus.TIMEOUT])

        # Cluster utilization = active task slots in use / total slots.
        total_capacity = sum(w.max_concurrent_tasks for w in self.workers.values() if w.status != WorkerStatus.OFFLINE)
        current_load = sum(len(w.active_tasks) for w in self.workers.values() if w.status != WorkerStatus.OFFLINE)

        utilization = (current_load / total_capacity * 100) if total_capacity > 0 else 0

        return {
            "cluster_health": "healthy" if active_workers > 0 else "offline",
            "nodes": {
                "total": total_workers,
                "active": active_workers,
                "with_gpu": gpu_workers
            },
            "tasks": {
                "pending": pending_tasks,
                "processing": processing_tasks,
                "completed": completed_tasks,
                "failed": failed_tasks
            },
            "performance": {
                "utilization_percent": round(utilization, 2),
                "cache_size": len(self.result_cache)
            },
            # NOTE: utcnow() is deprecated in Python 3.12+; kept so the naive
            # ISO timestamp format stays unchanged for existing consumers.
            "timestamp": datetime.utcnow().isoformat()
        }
|
||||
@@ -0,0 +1,246 @@
|
||||
"""
|
||||
Marketplace Caching & Optimization Service
|
||||
Implements advanced caching, indexing, and data optimization for the AITBC marketplace.
|
||||
"""
|
||||
|
||||
import json
|
||||
import time
|
||||
import hashlib
|
||||
import logging
|
||||
from typing import Dict, List, Optional, Any, Union, Set
|
||||
from collections import OrderedDict
|
||||
from datetime import datetime
|
||||
|
||||
import redis.asyncio as redis
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class LFU_LRU_Cache:
    """Hybrid Least-Frequently/Least-Recently Used cache for in-memory optimization.

    Eviction removes the least frequently used key; ties are broken by
    least-recent use (insertion order within a frequency bucket).
    Note: None cannot be distinguished from a miss by get().
    """

    def __init__(self, capacity: int):
        # capacity == 0 disables the cache entirely.
        self.capacity = capacity
        self.cache = {}            # key -> value
        self.frequencies = {}      # key -> access frequency
        self.frequency_lists = {}  # frequency -> OrderedDict of keys (LRU order)
        self.min_freq = 0          # lowest frequency currently present

    def get(self, key: str) -> Optional[Any]:
        """Return the cached value and bump its frequency, or None on a miss."""
        if key not in self.cache:
            return None

        freq = self.frequencies[key]
        val = self.cache[key]

        # Move key from its current frequency bucket to the next one.
        # FIX: OrderedDict has no .remove() — the original raised
        # AttributeError on every cache hit. Use del instead.
        del self.frequency_lists[freq][key]
        if not self.frequency_lists[freq]:
            if self.min_freq == freq:
                self.min_freq += 1
            # FIX: drop emptied buckets so stale empty OrderedDicts do not
            # accumulate indefinitely.
            del self.frequency_lists[freq]

        # Add to next frequency bucket (most-recent position).
        new_freq = freq + 1
        self.frequencies[key] = new_freq
        self.frequency_lists.setdefault(new_freq, OrderedDict())[key] = None

        return val

    def put(self, key: str, value: Any):
        """Insert or update a value, evicting the LFU (LRU on ties) entry when full."""
        if self.capacity == 0:
            return

        if key in self.cache:
            self.cache[key] = value
            self.get(key)  # refresh frequency/recency
            return

        if len(self.cache) >= self.capacity:
            # Evict least frequently used item (if tie, least recently used)
            evict_key, _ = self.frequency_lists[self.min_freq].popitem(last=False)
            del self.cache[evict_key]
            del self.frequencies[evict_key]
            if not self.frequency_lists[self.min_freq]:
                del self.frequency_lists[self.min_freq]

        # Add new item at frequency 1.
        self.cache[key] = value
        self.frequencies[key] = 1
        self.min_freq = 1
        self.frequency_lists.setdefault(1, OrderedDict())[key] = None
|
||||
|
||||
class MarketplaceDataOptimizer:
    """Advanced optimization engine for marketplace data access.

    Two-tier read-through cache: a fast in-process L1 (LFU/LRU) backed by a
    Redis L2. Degrades gracefully to L1-only when Redis is unreachable.
    """

    def __init__(self, redis_url: str = "redis://localhost:6379/0"):
        self.redis_url = redis_url
        self.redis_client = None

        # Two-tier cache: Fast L1 (Memory), Slower L2 (Redis)
        self.l1_cache = LFU_LRU_Cache(capacity=1000)
        self.is_connected = False

        # Per-namespace cache TTL defaults (seconds).
        self.ttls = {
            'order_book': 5,        # Very dynamic, 5 seconds
            'provider_status': 15,  # 15 seconds
            'market_stats': 60,     # 1 minute
            'historical_data': 3600 # 1 hour
        }

    async def connect(self):
        """Establish connection to the Redis L2 cache; fall back to L1-only on failure."""
        try:
            self.redis_client = redis.from_url(self.redis_url, decode_responses=True)
            await self.redis_client.ping()
            self.is_connected = True
            logger.info("Connected to Redis L2 cache")
        except Exception as e:
            logger.error(f"Failed to connect to Redis: {e}. Falling back to L1 cache only.")
            self.is_connected = False

    async def disconnect(self):
        """Close the Redis connection if one was opened."""
        if self.redis_client:
            await self.redis_client.close()
        self.is_connected = False

    def _generate_cache_key(self, namespace: str, params: Dict[str, Any]) -> str:
        """Generate a deterministic cache key from parameters.

        Keys are sorted before hashing so equal param dicts always map to the
        same key. MD5 is used only as a key digest, not for security.
        """
        param_str = json.dumps(params, sort_keys=True)
        param_hash = hashlib.md5(param_str.encode()).hexdigest()
        return f"mkpt:{namespace}:{param_hash}"

    async def get_cached_data(self, namespace: str, params: Dict[str, Any]) -> Optional[Any]:
        """Retrieve data from the multi-tier cache (L1 first, then L2); None on miss."""
        key = self._generate_cache_key(namespace, params)

        # 1. Try L1 Memory Cache (fastest). Expired entries are ignored and
        # left to be evicted naturally.
        l1_result = self.l1_cache.get(key)
        if l1_result is not None:
            if l1_result['expires_at'] > time.time():
                logger.debug(f"L1 Cache hit for {key}")
                return l1_result['data']

        # 2. Try L2 Redis Cache
        if self.is_connected:
            try:
                l2_result_str = await self.redis_client.get(key)
                if l2_result_str:
                    logger.debug(f"L2 Cache hit for {key}")
                    data = json.loads(l2_result_str)

                    # Backfill L1 cache with a shorter lifetime than L2 so
                    # L1 never outlives the authoritative L2 entry.
                    ttl = self.ttls.get(namespace, 60)
                    self.l1_cache.put(key, {
                        'data': data,
                        'expires_at': time.time() + min(ttl, 10)
                    })
                    return data
            except Exception as e:
                logger.warning(f"Redis get failed: {e}")

        return None

    async def set_cached_data(self, namespace: str, params: Dict[str, Any], data: Any, custom_ttl: Optional[int] = None):
        """Store data in both cache tiers.

        custom_ttl overrides the namespace default TTL (seconds).
        Data must be JSON-serializable for the L2 tier.
        """
        key = self._generate_cache_key(namespace, params)
        ttl = custom_ttl or self.ttls.get(namespace, 60)

        # 1. Update L1 Cache
        self.l1_cache.put(key, {
            'data': data,
            'expires_at': time.time() + ttl
        })

        # 2. Update L2 Redis Cache. This is awaited inline; to keep request
        # latency down in FastAPI, callers could move it to BackgroundTasks.
        if self.is_connected:
            try:
                await self.redis_client.setex(
                    key,
                    ttl,
                    json.dumps(data)
                )
            except Exception as e:
                logger.warning(f"Redis set failed: {e}")

    async def invalidate_namespace(self, namespace: str):
        """Invalidate all L2 entries for a namespace via SCAN + DELETE.

        L1 entries are not swept (that would require scanning the whole
        dict); they expire naturally or get evicted.
        """
        if self.is_connected:
            try:
                # Find all keys matching namespace pattern
                cursor = 0
                pattern = f"mkpt:{namespace}:*"

                while True:
                    cursor, keys = await self.redis_client.scan(cursor=cursor, match=pattern, count=100)
                    if keys:
                        await self.redis_client.delete(*keys)
                    if cursor == 0:
                        break

                logger.info(f"Invalidated L2 cache namespace: {namespace}")
            except Exception as e:
                logger.error(f"Failed to invalidate namespace {namespace}: {e}")

    async def precompute_market_stats(self, db_session) -> Dict[str, Any]:
        """Background task to precompute expensive market statistics and cache them.

        This would normally run periodically via Celery/Celery Beat. The DB
        aggregations are currently simulated with fixed values.
        """
        start_time = time.time()

        # Simulated expensive DB aggregations
        # In reality: SELECT AVG(price), SUM(volume) FROM trades WHERE created_at > NOW() - 24h
        stats = {
            "24h_volume": 1250000.50,
            "active_providers": 450,
            "average_price_per_tflop": 0.005,
            "network_utilization": 0.76,
            "computed_at": datetime.utcnow().isoformat(),
            "computation_time_ms": int((time.time() - start_time) * 1000)
        }

        # Cache the precomputed stats
        await self.set_cached_data('market_stats', {'period': '24h'}, stats, custom_ttl=300)

        return stats

    def optimize_order_book_response(self, raw_orders: List[Dict], depth: int = 50) -> Dict[str, List]:
        """
        Optimize the raw order book for client delivery.
        Groups similar prices (rounded to 4 decimals), limits depth, and
        formats as compact [price, amount] pairs.
        """
        buy_orders = [o for o in raw_orders if o['type'] == 'buy']
        sell_orders = [o for o in raw_orders if o['type'] == 'sell']

        # Aggregate by price level to reduce payload size
        agg_buys = {}
        for order in buy_orders:
            price = round(order['price'], 4)
            if price not in agg_buys:
                agg_buys[price] = 0
            agg_buys[price] += order['amount']

        agg_sells = {}
        for order in sell_orders:
            price = round(order['price'], 4)
            if price not in agg_sells:
                agg_sells[price] = 0
            agg_sells[price] += order['amount']

        # Bids best-first (highest price), asks best-first (lowest price).
        formatted_buys = [[p, q] for p, q in sorted(agg_buys.items(), reverse=True)[:depth]]
        formatted_sells = [[p, q] for p, q in sorted(agg_sells.items())[:depth]]

        return {
            "bids": formatted_buys,
            "asks": formatted_sells,
            "timestamp": time.time()
        }
|
||||
236
gpu_acceleration/parallel_processing/marketplace_monitor.py
Normal file
236
gpu_acceleration/parallel_processing/marketplace_monitor.py
Normal file
@@ -0,0 +1,236 @@
|
||||
"""
|
||||
Marketplace Real-time Performance Monitor
|
||||
Implements comprehensive real-time monitoring and analytics for the AITBC marketplace.
|
||||
"""
|
||||
|
||||
import asyncio
import collections
import logging
import time
from datetime import datetime, timedelta
# FIX: `collections` was imported from typing, which only works because the
# typing module happens to import collections internally; the proper
# `import collections` already exists below and is kept.
from typing import Any, Dict, List, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class TimeSeriesData:
    """Efficient in-memory time series data structure for real-time metrics.

    Parallel bounded deques hold timestamps and values; maxlen gives O(1)
    rolling-window behavior (oldest points drop off automatically).
    """

    def __init__(self, max_points: int = 3600):  # Default 1 hour of second-level data
        self.max_points = max_points
        self.timestamps = collections.deque(maxlen=max_points)
        self.values = collections.deque(maxlen=max_points)

    def add(self, value: float, timestamp: Optional[float] = None):
        """Append a data point; uses the current time when timestamp is None.

        FIX: the original used `timestamp or time.time()`, which silently
        replaced a legitimate 0.0 (epoch) timestamp with "now".
        """
        self.timestamps.append(time.time() if timestamp is None else timestamp)
        self.values.append(value)

    def get_latest(self) -> Optional[float]:
        """Return the most recent value, or None when the series is empty."""
        return self.values[-1] if self.values else None

    def get_average(self, window_seconds: int = 60) -> float:
        """Mean of values recorded within the last window_seconds (0.0 when none)."""
        if not self.values:
            return 0.0

        cutoff = time.time() - window_seconds
        valid_values = [v for t, v in zip(self.timestamps, self.values) if t >= cutoff]

        return sum(valid_values) / len(valid_values) if valid_values else 0.0

    def get_percentile(self, percentile: float, window_seconds: int = 60) -> float:
        """Approximate percentile (0.0-1.0) of values in the window (0.0 when none)."""
        if not self.values:
            return 0.0

        cutoff = time.time() - window_seconds
        valid_values = sorted(v for t, v in zip(self.timestamps, self.values) if t >= cutoff)

        if not valid_values:
            return 0.0

        # Nearest-rank style index, clamped to the valid range.
        idx = int(len(valid_values) * percentile)
        idx = min(max(idx, 0), len(valid_values) - 1)
        return valid_values[idx]
|
||||
|
||||
class MarketplaceMonitor:
|
||||
"""Real-time performance monitoring system for the marketplace"""
|
||||
|
||||
def __init__(self):
|
||||
# API Metrics
|
||||
self.api_latency_ms = TimeSeriesData()
|
||||
self.api_requests_per_sec = TimeSeriesData()
|
||||
self.api_error_rate = TimeSeriesData()
|
||||
|
||||
# Trading Metrics
|
||||
self.order_matching_time_ms = TimeSeriesData()
|
||||
self.trades_per_sec = TimeSeriesData()
|
||||
self.active_orders = TimeSeriesData()
|
||||
|
||||
# Resource Metrics
|
||||
self.gpu_utilization_pct = TimeSeriesData()
|
||||
self.network_bandwidth_mbps = TimeSeriesData()
|
||||
self.active_providers = TimeSeriesData()
|
||||
|
||||
# internal tracking
|
||||
self._request_counter = 0
|
||||
self._error_counter = 0
|
||||
self._trade_counter = 0
|
||||
self._last_tick = time.time()
|
||||
|
||||
self.is_running = False
|
||||
self._monitor_task = None
|
||||
|
||||
# Alert thresholds
|
||||
self.alert_thresholds = {
|
||||
'api_latency_p95_ms': 500.0,
|
||||
'api_error_rate_pct': 5.0,
|
||||
'gpu_utilization_pct': 90.0,
|
||||
'matching_time_ms': 100.0
|
||||
}
|
||||
|
||||
self.active_alerts = []
|
||||
|
||||
async def start(self):
|
||||
if self.is_running:
|
||||
return
|
||||
self.is_running = True
|
||||
self._monitor_task = asyncio.create_task(self._metric_tick_loop())
|
||||
logger.info("Marketplace Monitor started")
|
||||
|
||||
async def stop(self):
|
||||
self.is_running = False
|
||||
if self._monitor_task:
|
||||
self._monitor_task.cancel()
|
||||
logger.info("Marketplace Monitor stopped")
|
||||
|
||||
def record_api_call(self, latency_ms: float, is_error: bool = False):
|
||||
"""Record an API request for monitoring"""
|
||||
self.api_latency_ms.add(latency_ms)
|
||||
self._request_counter += 1
|
||||
if is_error:
|
||||
self._error_counter += 1
|
||||
|
||||
def record_trade(self, matching_time_ms: float):
|
||||
"""Record a successful trade match"""
|
||||
self.order_matching_time_ms.add(matching_time_ms)
|
||||
self._trade_counter += 1
|
||||
|
||||
def update_resource_metrics(self, gpu_util: float, bandwidth: float, providers: int, orders: int):
|
||||
"""Update system resource metrics"""
|
||||
self.gpu_utilization_pct.add(gpu_util)
|
||||
self.network_bandwidth_mbps.add(bandwidth)
|
||||
self.active_providers.add(providers)
|
||||
self.active_orders.add(orders)
|
||||
|
||||
async def _metric_tick_loop(self):
|
||||
"""Background task that aggregates metrics every second"""
|
||||
while self.is_running:
|
||||
try:
|
||||
now = time.time()
|
||||
elapsed = now - self._last_tick
|
||||
|
||||
if elapsed >= 1.0:
|
||||
# Calculate rates
|
||||
req_per_sec = self._request_counter / elapsed
|
||||
trades_per_sec = self._trade_counter / elapsed
|
||||
error_rate = (self._error_counter / max(1, self._request_counter)) * 100
|
||||
|
||||
# Store metrics
|
||||
self.api_requests_per_sec.add(req_per_sec)
|
||||
self.trades_per_sec.add(trades_per_sec)
|
||||
self.api_error_rate.add(error_rate)
|
||||
|
||||
# Reset counters
|
||||
self._request_counter = 0
|
||||
self._error_counter = 0
|
||||
self._trade_counter = 0
|
||||
self._last_tick = now
|
||||
|
||||
# Evaluate alerts
|
||||
self._evaluate_alerts()
|
||||
|
||||
await asyncio.sleep(1.0 - (time.time() - now)) # Sleep for remainder of second
|
||||
|
||||
except asyncio.CancelledError:
|
||||
break
|
||||
except Exception as e:
|
||||
logger.error(f"Error in monitor tick loop: {e}")
|
||||
await asyncio.sleep(1.0)
|
||||
|
||||
def _evaluate_alerts(self):
|
||||
"""Check metrics against thresholds and generate alerts"""
|
||||
current_alerts = []
|
||||
|
||||
# API Latency Alert
|
||||
p95_latency = self.api_latency_ms.get_percentile(0.95, window_seconds=60)
|
||||
if p95_latency > self.alert_thresholds['api_latency_p95_ms']:
|
||||
current_alerts.append({
|
||||
'id': f"alert_latency_{int(time.time())}",
|
||||
'severity': 'high' if p95_latency > self.alert_thresholds['api_latency_p95_ms'] * 2 else 'medium',
|
||||
'metric': 'api_latency',
|
||||
'value': p95_latency,
|
||||
'threshold': self.alert_thresholds['api_latency_p95_ms'],
|
||||
'message': f"High API Latency (p95): {p95_latency:.2f}ms",
|
||||
'timestamp': datetime.utcnow().isoformat()
|
||||
})
|
||||
|
||||
# Error Rate Alert
|
||||
avg_error_rate = self.api_error_rate.get_average(window_seconds=60)
|
||||
if avg_error_rate > self.alert_thresholds['api_error_rate_pct']:
|
||||
current_alerts.append({
|
||||
'id': f"alert_error_{int(time.time())}",
|
||||
'severity': 'critical',
|
||||
'metric': 'error_rate',
|
||||
'value': avg_error_rate,
|
||||
'threshold': self.alert_thresholds['api_error_rate_pct'],
|
||||
'message': f"High API Error Rate: {avg_error_rate:.2f}%",
|
||||
'timestamp': datetime.utcnow().isoformat()
|
||||
})
|
||||
|
||||
# Matching Time Alert
|
||||
avg_matching = self.order_matching_time_ms.get_average(window_seconds=60)
|
||||
if avg_matching > self.alert_thresholds['matching_time_ms']:
|
||||
current_alerts.append({
|
||||
'id': f"alert_matching_{int(time.time())}",
|
||||
'severity': 'medium',
|
||||
'metric': 'matching_time',
|
||||
'value': avg_matching,
|
||||
'threshold': self.alert_thresholds['matching_time_ms'],
|
||||
'message': f"Slow Order Matching: {avg_matching:.2f}ms",
|
||||
'timestamp': datetime.utcnow().isoformat()
|
||||
})
|
||||
|
||||
self.active_alerts = current_alerts
|
||||
|
||||
if current_alerts:
|
||||
# In a real system, this would trigger webhooks, Slack/Discord messages, etc.
|
||||
for alert in current_alerts:
|
||||
if alert['severity'] in ['high', 'critical']:
|
||||
logger.warning(f"MARKETPLACE ALERT: {alert['message']}")
|
||||
|
||||
def get_realtime_dashboard_data(self) -> Dict[str, Any]:
|
||||
"""Get aggregated data formatted for the frontend dashboard"""
|
||||
return {
|
||||
'status': 'degraded' if any(a['severity'] in ['high', 'critical'] for a in self.active_alerts) else 'healthy',
|
||||
'timestamp': datetime.utcnow().isoformat(),
|
||||
'current_metrics': {
|
||||
'api': {
|
||||
'rps': round(self.api_requests_per_sec.get_latest() or 0, 2),
|
||||
'latency_p50_ms': round(self.api_latency_ms.get_percentile(0.50, 60), 2),
|
||||
'latency_p95_ms': round(self.api_latency_ms.get_percentile(0.95, 60), 2),
|
||||
'error_rate_pct': round(self.api_error_rate.get_average(60), 2)
|
||||
},
|
||||
'trading': {
|
||||
'tps': round(self.trades_per_sec.get_latest() or 0, 2),
|
||||
'matching_time_ms': round(self.order_matching_time_ms.get_average(60), 2),
|
||||
'active_orders': int(self.active_orders.get_latest() or 0)
|
||||
},
|
||||
'network': {
|
||||
'active_providers': int(self.active_providers.get_latest() or 0),
|
||||
'gpu_utilization_pct': round(self.gpu_utilization_pct.get_latest() or 0, 2),
|
||||
'bandwidth_mbps': round(self.network_bandwidth_mbps.get_latest() or 0, 2)
|
||||
}
|
||||
},
|
||||
'alerts': self.active_alerts
|
||||
}
|
||||
|
||||
# Global instance
# Module-level singleton so API routes and background services share one monitor.
monitor = MarketplaceMonitor()
||||
265
gpu_acceleration/parallel_processing/marketplace_scaler.py
Normal file
265
gpu_acceleration/parallel_processing/marketplace_scaler.py
Normal file
@@ -0,0 +1,265 @@
|
||||
"""
|
||||
Marketplace Adaptive Resource Scaler
|
||||
Implements predictive and reactive auto-scaling of marketplace resources based on demand.
|
||||
"""
|
||||
|
||||
import time
|
||||
import asyncio
|
||||
import logging
|
||||
from typing import Dict, List, Optional, Any, Tuple
|
||||
from datetime import datetime, timedelta
|
||||
import math
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class ScalingPolicy:
    """Configuration for scaling behavior."""

    def __init__(
        self,
        min_nodes: int = 2,
        max_nodes: int = 100,
        target_utilization: float = 0.75,
        scale_up_threshold: float = 0.85,
        scale_down_threshold: float = 0.40,
        cooldown_period_sec: int = 300,  # 5 minutes between scaling actions
        predictive_scaling: bool = True
    ):
        # Hard bounds on cluster size.
        self.min_nodes = min_nodes
        self.max_nodes = max_nodes
        # Utilization level the scaler steers toward, plus the trigger bands
        # above/below it that cause scale-up / scale-down decisions.
        self.target_utilization = target_utilization
        self.scale_up_threshold = scale_up_threshold
        self.scale_down_threshold = scale_down_threshold
        # Minimum number of seconds between two consecutive scaling actions.
        self.cooldown_period_sec = cooldown_period_sec
        # Whether historical demand may trigger proactive scale-ups.
        self.predictive_scaling = predictive_scaling
||||
class ResourceScaler:
    """Adaptive resource scaling engine for the AITBC marketplace.

    Combines reactive scaling (current utilization / queue depth) with
    optional predictive scaling based on an hour-of-week demand history.
    Node provisioning is simulated; see `_execute_scaling`.
    """

    def __init__(self, policy: Optional[ScalingPolicy] = None):
        self.policy = policy or ScalingPolicy()

        # Current state (node counts are simulated in this implementation).
        self.current_nodes = self.policy.min_nodes
        self.active_gpu_nodes = 0
        self.active_cpu_nodes = self.policy.min_nodes

        # Epoch timestamp of the last scaling action; enforces the cooldown.
        self.last_scaling_action_time = 0
        self.scaling_history = []

        # Historical demand tracking for predictive scaling
        # Format: hour_of_week (0-167) -> avg_utilization
        self.historical_demand = {}

        self.is_running = False
        self._scaler_task = None

    async def start(self):
        """Start the background scaling loop (idempotent)."""
        if self.is_running:
            return
        self.is_running = True
        self._scaler_task = asyncio.create_task(self._scaling_loop())
        logger.info(f"Resource Scaler started (Min: {self.policy.min_nodes}, Max: {self.policy.max_nodes})")

    async def stop(self):
        """Stop the scaling loop and cancel the background task."""
        self.is_running = False
        if self._scaler_task:
            self._scaler_task.cancel()
        logger.info("Resource Scaler stopped")

    def update_historical_demand(self, utilization: float):
        """Fold a utilization sample into the hour-of-week demand history."""
        now = datetime.utcnow()
        hour_of_week = now.weekday() * 24 + now.hour

        if hour_of_week not in self.historical_demand:
            self.historical_demand[hour_of_week] = utilization
        else:
            # Exponential moving average (favor recent data)
            current_avg = self.historical_demand[hour_of_week]
            self.historical_demand[hour_of_week] = (current_avg * 0.9) + (utilization * 0.1)

    def _predict_demand(self, lookahead_hours: int = 1) -> float:
        """Predict expected utilization based on historical patterns.

        Returns 0.0 when predictive scaling is disabled or no history exists.
        Falls back to the overall historical mean when the exact target hour
        has no recorded sample yet.
        """
        if not self.policy.predictive_scaling or not self.historical_demand:
            return 0.0

        now = datetime.utcnow()
        target_hour = (now.weekday() * 24 + now.hour + lookahead_hours) % 168

        # If we have exact data for that hour
        if target_hour in self.historical_demand:
            return self.historical_demand[target_hour]

        # Simplistic interpolation: overall mean of all recorded hours.
        # (historical_demand is guaranteed non-empty by the guard above, so
        # the previous sorted-keys emptiness check was dead code.)
        return sum(self.historical_demand.values()) / len(self.historical_demand)

    async def _scaling_loop(self):
        """Background task that evaluates scaling rules every 10 seconds."""
        while self.is_running:
            try:
                # In a real system, we'd fetch this from the Monitor or
                # Coordinator; here we simulate fetching current metrics.
                current_utilization = self._get_current_utilization()
                current_queue_depth = self._get_queue_depth()

                self.update_historical_demand(current_utilization)

                await self.evaluate_scaling(current_utilization, current_queue_depth)

                # Check every 10 seconds
                await asyncio.sleep(10.0)

            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error(f"Error in scaling loop: {e}")
                await asyncio.sleep(10.0)

    async def evaluate_scaling(self, current_utilization: float, queue_depth: int) -> Optional[Dict[str, Any]]:
        """Evaluate scaling rules and execute an action if warranted.

        Args:
            current_utilization: Cluster utilization, expected in [0, 1].
            queue_depth: Number of queued tasks awaiting capacity.

        Returns:
            A record describing the executed scaling action, or None when no
            action was taken (cooldown active or thresholds not crossed).
        """
        now = time.time()

        # Respect the cooldown window between consecutive scaling actions.
        if now - self.last_scaling_action_time < self.policy.cooldown_period_sec:
            return None

        predicted_utilization = self._predict_demand()

        # Determine target node count
        target_nodes = self.current_nodes
        action = None
        reason = ""

        # Scale UP conditions: reactive (hot cluster or deep queue) ...
        if current_utilization > self.policy.scale_up_threshold or queue_depth > self.current_nodes * 5:
            desired_increase = math.ceil(self.current_nodes * (current_utilization / self.policy.target_utilization - 1.0))
            # Ensure we add at least 1, but bounded by queue depth and max_nodes
            nodes_to_add = max(1, min(desired_increase, max(1, queue_depth // 2)))

            target_nodes = min(self.policy.max_nodes, self.current_nodes + nodes_to_add)

            if target_nodes > self.current_nodes:
                action = "scale_up"
                reason = f"High utilization ({current_utilization*100:.1f}%) or queue depth ({queue_depth})"

        # ... or predictive (proactively add one node ahead of expected demand).
        elif self.policy.predictive_scaling and predicted_utilization > self.policy.scale_up_threshold:
            target_nodes = min(self.policy.max_nodes, self.current_nodes + 1)

            if target_nodes > self.current_nodes:
                action = "scale_up"
                reason = f"Predictive scaling (expected {predicted_utilization*100:.1f}% util)"

        # Scale DOWN conditions: only when idle AND (if predicting) demand is
        # expected to stay low.
        elif current_utilization < self.policy.scale_down_threshold and queue_depth == 0:
            if not self.policy.predictive_scaling or predicted_utilization < self.policy.target_utilization:
                # Remove at most ~20% of nodes at a time, never below min_nodes.
                nodes_to_remove = max(1, int(self.current_nodes * 0.2))
                target_nodes = max(self.policy.min_nodes, self.current_nodes - nodes_to_remove)

                if target_nodes < self.current_nodes:
                    action = "scale_down"
                    reason = f"Low utilization ({current_utilization*100:.1f}%)"

        # Execute scaling if needed
        if action and target_nodes != self.current_nodes:
            diff = abs(target_nodes - self.current_nodes)

            await self._execute_scaling(action, diff, target_nodes)

            record = {
                "timestamp": datetime.utcnow().isoformat(),
                "action": action,
                "nodes_changed": diff,
                "new_total": target_nodes,
                "reason": reason,
                "metrics_at_time": {
                    "utilization": current_utilization,
                    "queue_depth": queue_depth,
                    "predicted_utilization": predicted_utilization
                }
            }

            self.scaling_history.append(record)
            # Keep history manageable
            if len(self.scaling_history) > 1000:
                self.scaling_history = self.scaling_history[-1000:]

            self.last_scaling_action_time = now
            self.current_nodes = target_nodes

            logger.info(f"Auto-scaler: {action.upper()} to {target_nodes} nodes. Reason: {reason}")
            return record

        return None

    async def _execute_scaling(self, action: str, count: int, new_total: int) -> bool:
        """Execute the actual scaling action.

        In production this would call cloud/orchestrator APIs (AWS
        AutoScaling, Kubernetes scale, etc.); here we simulate the
        provisioning delay and adjust the simulated node mix.
        """
        logger.debug(f"Executing {action} by {count} nodes...")

        # Simulate API delay
        await asyncio.sleep(2.0)

        if action == "scale_up":
            # Provision a mix of GPU and CPU instances based on demand.
            new_gpus = count // 2
            new_cpus = count - new_gpus
            self.active_gpu_nodes += new_gpus
            self.active_cpu_nodes += new_cpus
        elif action == "scale_down":
            # De-provision, preferring CPU nodes first while honoring min_nodes.
            remove_cpus = min(count, max(0, self.active_cpu_nodes - self.policy.min_nodes))
            remove_gpus = count - remove_cpus

            self.active_cpu_nodes -= remove_cpus
            self.active_gpu_nodes = max(0, self.active_gpu_nodes - remove_gpus)

        return True

    # --- Simulation helpers ---
    def _get_current_utilization(self) -> float:
        """Simulate getting current cluster utilization (clamped to 0.1-0.99)."""
        # In reality, fetch from MarketplaceMonitor or Coordinator
        import random
        # Base utilization with some noise
        base = 0.6
        return max(0.1, min(0.99, base + random.uniform(-0.2, 0.3)))

    def _get_queue_depth(self) -> int:
        """Simulate getting current queue depth (occasionally bursty)."""
        import random
        if random.random() > 0.8:
            return random.randint(10, 50)
        return random.randint(0, 5)

    def get_status(self) -> Dict[str, Any]:
        """Get current scaler status for API/monitoring consumers."""
        return {
            "status": "running" if self.is_running else "stopped",
            "current_nodes": {
                "total": self.current_nodes,
                "cpu_nodes": self.active_cpu_nodes,
                "gpu_nodes": self.active_gpu_nodes
            },
            "policy": {
                "min_nodes": self.policy.min_nodes,
                "max_nodes": self.policy.max_nodes,
                "target_utilization": self.policy.target_utilization
            },
            "last_action": self.scaling_history[-1] if self.scaling_history else None,
            "prediction": {
                "next_hour_utilization_estimate": round(self._predict_demand(1), 3)
            }
        }
|
||||
Reference in New Issue
Block a user