Update database paths and fix foreign key references across coordinator API
- Change SQLite database path from `/home/oib/windsurf/aitbc/data/` to `/opt/data/` - Fix foreign key references to use correct table names (users, wallets, gpu_registry) - Replace governance router with new governance and community routers - Add multi-modal RL router to main application - Simplify DEPLOYMENT_READINESS_REPORT.md to focus on production deployment status - Update governance router with decentralized DAO voting
This commit is contained in:
468
gpu_acceleration/parallel_processing/distributed_framework.py
Normal file
468
gpu_acceleration/parallel_processing/distributed_framework.py
Normal file
@@ -0,0 +1,468 @@
|
||||
"""
|
||||
Distributed Agent Processing Framework
|
||||
Implements a scalable, fault-tolerant framework for distributed AI agent tasks across the AITBC network.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import uuid
|
||||
import time
|
||||
import logging
|
||||
import json
|
||||
import hashlib
|
||||
from typing import Dict, List, Optional, Any, Callable, Awaitable
|
||||
from datetime import datetime
|
||||
from enum import Enum
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class TaskStatus(str, Enum):
    """Lifecycle states of a distributed task (string-valued)."""
    PENDING = "pending"        # submitted, waiting in the scheduler queue
    SCHEDULED = "scheduled"    # assigned to a worker, not yet running
    PROCESSING = "processing"  # currently executing on a worker
    COMPLETED = "completed"    # finished successfully (terminal)
    FAILED = "failed"          # retries exhausted (terminal)
    TIMEOUT = "timeout"        # exceeded timeout_ms (terminal)
    RETRYING = "retrying"      # a failed attempt has been re-queued
|
||||
|
||||
class WorkerStatus(str, Enum):
    """Health/load states of a worker node (string-valued)."""
    IDLE = "idle"              # registered, no active tasks
    BUSY = "busy"              # has active tasks, below capacity
    OFFLINE = "offline"        # missed heartbeats
    OVERLOADED = "overloaded"  # at task capacity or high CPU load
|
||||
|
||||
class DistributedTask:
    """A single unit of work dispatched to a remote worker node.

    Carries the request payload plus all lifecycle/retry bookkeeping, and a
    SHA-256 content hash used for result caching and deduplication.
    """

    def __init__(
        self,
        task_id: str,
        agent_id: str,
        payload: Dict[str, Any],
        priority: int = 1,
        requires_gpu: bool = False,
        timeout_ms: int = 30000,
        max_retries: int = 3
    ):
        # A falsy task_id means "generate one".
        self.task_id = task_id or f"dt_{uuid.uuid4().hex[:12]}"
        self.agent_id = agent_id
        self.payload = payload
        self.priority = priority
        self.requires_gpu = requires_gpu
        self.timeout_ms = timeout_ms
        self.max_retries = max_retries

        # Lifecycle bookkeeping; timestamps are filled in as the task moves
        # through the scheduler.
        self.status = TaskStatus.PENDING
        self.created_at = time.time()
        self.scheduled_at = None
        self.started_at = None
        self.completed_at = None

        # Execution outcome.
        self.assigned_worker_id = None
        self.result = None
        self.error = None
        self.retries = 0

        # Deterministic payload hash (sorted keys) for caching/deduplication.
        canonical = json.dumps(payload, sort_keys=True)
        self.content_hash = hashlib.sha256(canonical.encode()).hexdigest()
|
||||
|
||||
class WorkerNode:
    """A registered compute node in the cluster.

    Holds static registration data (capabilities, GPU flag, capacity) plus
    runtime state maintained by the coordinator.
    """

    def __init__(self, worker_id: str, capabilities: List[str], has_gpu: bool = False, max_concurrent_tasks: int = 4):
        # Registration data.
        self.worker_id = worker_id
        self.capabilities = capabilities
        self.has_gpu = has_gpu
        self.max_concurrent_tasks = max_concurrent_tasks

        # Runtime state managed by the coordinator.
        self.status = WorkerStatus.IDLE
        self.active_tasks = []  # task_ids currently assigned to this node
        self.last_heartbeat = time.time()
        self.total_completed = 0
        # Rolling quality score in [0.0, 1.0] based on success rate and speed.
        self.performance_score = 1.0
|
||||
|
||||
class DistributedProcessingCoordinator:
    """
    Coordinates distributed task execution across available worker nodes.

    Responsibilities:
      * priority scheduling of DistributedTask objects onto workers
      * worker registration, heartbeat tracking and health monitoring
      * retry handling, timeout enforcement and result caching
    """

    def __init__(self):
        # All submitted tasks by id (any status; never pruned here).
        self.tasks: Dict[str, DistributedTask] = {}
        # Registered workers by id.
        self.workers: Dict[str, WorkerNode] = {}
        # Queue entries are (queue_priority, created_at, task_id) tuples;
        # PriorityQueue pops the lowest tuple first.
        self.task_queue = asyncio.PriorityQueue()

        # Result cache (content_hash -> result) so identical payloads are
        # served without re-execution.
        self.result_cache: Dict[str, Any] = {}

        self.is_running = False
        self._scheduler_task = None
        self._monitor_task = None

    async def start(self):
        """Start the scheduler and health-monitor background tasks (idempotent)."""
        if self.is_running:
            return

        self.is_running = True
        self._scheduler_task = asyncio.create_task(self._scheduling_loop())
        self._monitor_task = asyncio.create_task(self._health_monitor_loop())
        logger.info("Distributed Processing Coordinator started")

    async def stop(self):
        """Stop the coordinator gracefully by cancelling its background tasks."""
        self.is_running = False
        if self._scheduler_task:
            self._scheduler_task.cancel()
        if self._monitor_task:
            self._monitor_task.cancel()
        logger.info("Distributed Processing Coordinator stopped")

    def register_worker(self, worker_id: str, capabilities: List[str], has_gpu: bool = False, max_tasks: int = 4):
        """Register a new worker node, or refresh an existing registration.

        Re-registering an OFFLINE worker brings it back to IDLE.
        """
        if worker_id not in self.workers:
            self.workers[worker_id] = WorkerNode(worker_id, capabilities, has_gpu, max_tasks)
            logger.info(f"Registered new worker node: {worker_id} (GPU: {has_gpu})")
        else:
            # Update existing worker in place so active-task tracking survives.
            worker = self.workers[worker_id]
            worker.capabilities = capabilities
            worker.has_gpu = has_gpu
            worker.max_concurrent_tasks = max_tasks
            worker.last_heartbeat = time.time()
            if worker.status == WorkerStatus.OFFLINE:
                worker.status = WorkerStatus.IDLE

    def heartbeat(self, worker_id: str, metrics: Optional[Dict[str, Any]] = None):
        """Record a heartbeat from a worker node.

        When metrics are supplied, only 'cpu_load' is inspected; combined
        with queue depth it determines the worker status.
        Heartbeats from unknown workers are silently ignored.
        """
        if worker_id in self.workers:
            worker = self.workers[worker_id]
            worker.last_heartbeat = time.time()

            # Update status based on metrics if provided
            if metrics:
                cpu_load = metrics.get('cpu_load', 0.0)
                if cpu_load > 0.9 or len(worker.active_tasks) >= worker.max_concurrent_tasks:
                    worker.status = WorkerStatus.OVERLOADED
                elif len(worker.active_tasks) > 0:
                    worker.status = WorkerStatus.BUSY
                else:
                    worker.status = WorkerStatus.IDLE

    async def submit_task(self, task: DistributedTask) -> str:
        """Submit a new task and return its id.

        Tasks whose payload hash is already in the result cache are marked
        COMPLETED immediately without touching the queue.
        """
        # Check cache first
        if task.content_hash in self.result_cache:
            task.status = TaskStatus.COMPLETED
            task.result = self.result_cache[task.content_hash]
            task.completed_at = time.time()
            self.tasks[task.task_id] = task
            logger.debug(f"Task {task.task_id} fulfilled from cache")
            return task.task_id

        self.tasks[task.task_id] = task
        # PriorityQueue pops the lowest number first, so invert user priority
        # (user priority 100 -> queue priority 0 = most urgent).
        queue_priority = 100 - min(task.priority, 100)

        await self.task_queue.put((queue_priority, task.created_at, task.task_id))
        logger.debug(f"Task {task.task_id} queued with priority {task.priority}")

        return task.task_id

    async def get_task_status(self, task_id: str) -> Optional[Dict[str, Any]]:
        """Return status (plus result/error if finished) of a task, or None if unknown."""
        if task_id not in self.tasks:
            return None

        task = self.tasks[task_id]

        response = {
            'task_id': task.task_id,
            'status': task.status,
            'created_at': task.created_at
        }

        if task.status == TaskStatus.COMPLETED:
            response['result'] = task.result
            response['completed_at'] = task.completed_at
            # Cache hits never started executing; fall back to created_at.
            response['duration_ms'] = int((task.completed_at - (task.started_at or task.created_at)) * 1000)
        elif task.status in [TaskStatus.FAILED, TaskStatus.TIMEOUT]:
            response['error'] = str(task.error)

        if task.assigned_worker_id:
            response['worker_id'] = task.assigned_worker_id

        return response

    async def _scheduling_loop(self):
        """Background task that assigns queued tasks to available workers."""
        while self.is_running:
            try:
                # Poll instead of blocking on get() so the loop notices
                # shutdown promptly.
                if self.task_queue.empty():
                    await asyncio.sleep(0.1)
                    continue

                priority, _, task_id = await self.task_queue.get()

                # Task may no longer exist by the time it is dequeued.
                if task_id not in self.tasks:
                    self.task_queue.task_done()
                    continue

                task = self.tasks[task_id]

                # Skip tasks that left the schedulable states while queued
                # (e.g. completed from cache or otherwise finalized).
                if task.status != TaskStatus.PENDING and task.status != TaskStatus.RETRYING:
                    self.task_queue.task_done()
                    continue

                # Find best worker
                best_worker = self._find_best_worker(task)

                if best_worker:
                    await self._assign_task(task, best_worker)
                else:
                    # No worker available right now; requeue after a short
                    # delay in a background task so this loop is not blocked.
                    asyncio.create_task(self._requeue_delayed(priority, task))

                self.task_queue.task_done()

            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error(f"Error in scheduling loop: {e}")
                await asyncio.sleep(1.0)

    async def _requeue_delayed(self, priority: int, task: DistributedTask):
        """Put a task back in the queue after a short delay."""
        await asyncio.sleep(0.5)
        if self.is_running and task.status in [TaskStatus.PENDING, TaskStatus.RETRYING]:
            await self.task_queue.put((priority, task.created_at, task.task_id))

    def _find_best_worker(self, task: DistributedTask) -> Optional[WorkerNode]:
        """Find the optimal worker for a task based on requirements and load.

        Returns None when no eligible worker is currently available.
        """
        candidates = []

        for worker in self.workers.values():
            # Skip offline or overloaded workers
            if worker.status in [WorkerStatus.OFFLINE, WorkerStatus.OVERLOADED]:
                continue

            # Skip if worker is at capacity
            if len(worker.active_tasks) >= worker.max_concurrent_tasks:
                continue

            # Check GPU requirement
            if task.requires_gpu and not worker.has_gpu:
                continue

            # Required capability check could be added here

            # Base score from historical performance.
            score = worker.performance_score * 100

            # Penalize slightly based on current load to balance distribution
            load_factor = len(worker.active_tasks) / worker.max_concurrent_tasks
            score -= (load_factor * 20)

            # Keep GPU workers free for GPU workloads: penalize them for
            # CPU-only tasks.
            if worker.has_gpu and not task.requires_gpu:
                score -= 30

            candidates.append((score, worker))

        if not candidates:
            return None

        # Return worker with highest score
        candidates.sort(key=lambda x: x[0], reverse=True)
        return candidates[0][1]

    async def _assign_task(self, task: DistributedTask, worker: WorkerNode):
        """Assign a task to a specific worker and dispatch it."""
        task.status = TaskStatus.SCHEDULED
        task.assigned_worker_id = worker.worker_id
        task.scheduled_at = time.time()

        worker.active_tasks.append(task.task_id)
        if len(worker.active_tasks) >= worker.max_concurrent_tasks:
            worker.status = WorkerStatus.OVERLOADED
        elif worker.status == WorkerStatus.IDLE:
            worker.status = WorkerStatus.BUSY

        logger.debug(f"Assigned task {task.task_id} to worker {worker.worker_id}")

        # In a real system, this would make an RPC/network call to the worker.
        # Here we simulate the network dispatch asynchronously.
        asyncio.create_task(self._simulate_worker_execution(task, worker))

    async def _simulate_worker_execution(self, task: DistributedTask, worker: WorkerNode):
        """Simulate the execution on the remote worker node."""
        task.status = TaskStatus.PROCESSING
        task.started_at = time.time()

        try:
            # Simulate processing time based on task complexity.
            # A real implementation would await the actual RPC response.
            complexity = task.payload.get('complexity', 1.0)
            base_time = 0.5

            if worker.has_gpu and task.requires_gpu:
                # GPU processes faster
                processing_time = base_time * complexity * 0.2
            else:
                processing_time = base_time * complexity

            # Simulate occasional network/node failure for low-scoring workers.
            if worker.performance_score < 0.5 and time.time() % 10 < 1:
                raise ConnectionError("Worker node network failure")

            await asyncio.sleep(processing_time)

            # Success
            self.report_task_success(task.task_id, {"result_data": "simulated_success", "processed_by": worker.worker_id})

        except Exception as e:
            self.report_task_failure(task.task_id, str(e))

    def report_task_success(self, task_id: str, result: Any):
        """Called by a worker when a task completes successfully.

        No-ops for unknown tasks and tasks already in a terminal state.
        """
        if task_id not in self.tasks:
            return

        task = self.tasks[task_id]
        if task.status in [TaskStatus.COMPLETED, TaskStatus.FAILED, TaskStatus.TIMEOUT]:
            return  # Already finished

        task.status = TaskStatus.COMPLETED
        task.result = result
        task.completed_at = time.time()

        # Cache the result so identical future payloads skip execution.
        self.result_cache[task.content_hash] = result

        # Update worker metrics
        if task.assigned_worker_id and task.assigned_worker_id in self.workers:
            worker = self.workers[task.assigned_worker_id]
            if task_id in worker.active_tasks:
                worker.active_tasks.remove(task_id)
            worker.total_completed += 1
            # Increase performance score slightly (max 1.0)
            worker.performance_score = min(1.0, worker.performance_score + 0.01)

            if len(worker.active_tasks) < worker.max_concurrent_tasks and worker.status == WorkerStatus.OVERLOADED:
                worker.status = WorkerStatus.BUSY
            if len(worker.active_tasks) == 0:
                worker.status = WorkerStatus.IDLE

        logger.info(f"Task {task_id} completed successfully")

    def report_task_failure(self, task_id: str, error: str):
        """Called when a task fails execution; requeues it or marks it FAILED.

        No-ops for unknown tasks and tasks already in a terminal state.
        """
        if task_id not in self.tasks:
            return

        task = self.tasks[task_id]
        # FIX: guard terminal states so a late failure report (e.g. a timeout
        # racing a success) cannot clobber a finished task — mirrors the
        # guard in report_task_success.
        if task.status in [TaskStatus.COMPLETED, TaskStatus.FAILED, TaskStatus.TIMEOUT]:
            return

        # Update worker metrics
        if task.assigned_worker_id and task.assigned_worker_id in self.workers:
            worker = self.workers[task.assigned_worker_id]
            if task_id in worker.active_tasks:
                worker.active_tasks.remove(task_id)
            # Decrease performance score heavily on failure
            worker.performance_score = max(0.1, worker.performance_score - 0.05)

        # Handle retry logic
        if task.retries < task.max_retries:
            task.retries += 1
            task.status = TaskStatus.RETRYING
            task.assigned_worker_id = None
            task.error = f"Attempt {task.retries} failed: {error}"

            logger.warning(f"Task {task_id} failed, scheduling retry {task.retries}/{task.max_retries}")

            # Put back in queue with slightly lower priority
            queue_priority = (100 - min(task.priority, 100)) + (task.retries * 5)
            asyncio.create_task(self.task_queue.put((queue_priority, time.time(), task.task_id)))
        else:
            task.status = TaskStatus.FAILED
            task.error = f"Max retries exceeded. Final error: {error}"
            task.completed_at = time.time()
            logger.error(f"Task {task_id} failed permanently")

    async def _health_monitor_loop(self):
        """Background task that monitors worker health and task timeouts."""
        while self.is_running:
            try:
                current_time = time.time()

                # 1. Check worker health
                for worker_id, worker in self.workers.items():
                    # If no heartbeat for 60 seconds, mark offline
                    if current_time - worker.last_heartbeat > 60.0:
                        if worker.status != WorkerStatus.OFFLINE:
                            logger.warning(f"Worker {worker_id} went offline (missed heartbeats)")
                            worker.status = WorkerStatus.OFFLINE

                            # Re-queue all active tasks for this worker.
                            # FIX: iterate a snapshot — report_task_failure
                            # removes entries from worker.active_tasks, and
                            # mutating a list while iterating it skips items.
                            for task_id in list(worker.active_tasks):
                                if task_id in self.tasks:
                                    self.report_task_failure(task_id, "Worker node disconnected")
                            worker.active_tasks.clear()

                # 2. Check task timeouts (snapshot for the same defensive reason)
                for task_id, task in list(self.tasks.items()):
                    if task.status in [TaskStatus.SCHEDULED, TaskStatus.PROCESSING]:
                        start_time = task.started_at or task.scheduled_at
                        if start_time and (current_time - start_time) * 1000 > task.timeout_ms:
                            logger.warning(f"Task {task_id} timed out")
                            self.report_task_failure(task_id, f"Execution timed out after {task.timeout_ms}ms")

                await asyncio.sleep(5.0)  # Check every 5 seconds

            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error(f"Error in health monitor loop: {e}")
                await asyncio.sleep(5.0)

    def get_cluster_status(self) -> Dict[str, Any]:
        """Get the overall status of the distributed cluster as a plain dict."""
        total_workers = len(self.workers)
        active_workers = sum(1 for w in self.workers.values() if w.status != WorkerStatus.OFFLINE)
        gpu_workers = sum(1 for w in self.workers.values() if w.has_gpu and w.status != WorkerStatus.OFFLINE)

        pending_tasks = sum(1 for t in self.tasks.values() if t.status == TaskStatus.PENDING)
        processing_tasks = sum(1 for t in self.tasks.values() if t.status in [TaskStatus.SCHEDULED, TaskStatus.PROCESSING])
        completed_tasks = sum(1 for t in self.tasks.values() if t.status == TaskStatus.COMPLETED)
        failed_tasks = sum(1 for t in self.tasks.values() if t.status in [TaskStatus.FAILED, TaskStatus.TIMEOUT])

        # Cluster utilization = active task slots in use / total slots.
        total_capacity = sum(w.max_concurrent_tasks for w in self.workers.values() if w.status != WorkerStatus.OFFLINE)
        current_load = sum(len(w.active_tasks) for w in self.workers.values() if w.status != WorkerStatus.OFFLINE)

        utilization = (current_load / total_capacity * 100) if total_capacity > 0 else 0

        return {
            "cluster_health": "healthy" if active_workers > 0 else "offline",
            "nodes": {
                "total": total_workers,
                "active": active_workers,
                "with_gpu": gpu_workers
            },
            "tasks": {
                "pending": pending_tasks,
                "processing": processing_tasks,
                "completed": completed_tasks,
                "failed": failed_tasks
            },
            "performance": {
                "utilization_percent": round(utilization, 2),
                "cache_size": len(self.result_cache)
            },
            # NOTE: utcnow() is deprecated in Python 3.12+; kept so the naive
            # ISO timestamp format stays unchanged for existing consumers.
            "timestamp": datetime.utcnow().isoformat()
        }
|
||||
@@ -0,0 +1,246 @@
|
||||
"""
|
||||
Marketplace Caching & Optimization Service
|
||||
Implements advanced caching, indexing, and data optimization for the AITBC marketplace.
|
||||
"""
|
||||
|
||||
import json
|
||||
import time
|
||||
import hashlib
|
||||
import logging
|
||||
from typing import Dict, List, Optional, Any, Union, Set
|
||||
from collections import OrderedDict
|
||||
from datetime import datetime
|
||||
|
||||
import redis.asyncio as redis
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class LFU_LRU_Cache:
    """Hybrid Least-Frequently/Least-Recently Used cache for in-memory optimization.

    Eviction removes the least frequently used key; ties are broken by
    least-recent use (insertion order within a frequency bucket).
    Note: None cannot be distinguished from a miss by get().
    """

    def __init__(self, capacity: int):
        # capacity == 0 disables the cache entirely.
        self.capacity = capacity
        self.cache = {}            # key -> value
        self.frequencies = {}      # key -> access frequency
        self.frequency_lists = {}  # frequency -> OrderedDict of keys (LRU order)
        self.min_freq = 0          # lowest frequency currently present

    def get(self, key: str) -> Optional[Any]:
        """Return the cached value and bump its frequency, or None on a miss."""
        if key not in self.cache:
            return None

        freq = self.frequencies[key]
        val = self.cache[key]

        # Move key from its current frequency bucket to the next one.
        # FIX: OrderedDict has no .remove() — the original raised
        # AttributeError on every cache hit. Use del instead.
        del self.frequency_lists[freq][key]
        if not self.frequency_lists[freq]:
            if self.min_freq == freq:
                self.min_freq += 1
            # FIX: drop emptied buckets so stale empty OrderedDicts do not
            # accumulate indefinitely.
            del self.frequency_lists[freq]

        # Add to next frequency bucket (most-recent position).
        new_freq = freq + 1
        self.frequencies[key] = new_freq
        self.frequency_lists.setdefault(new_freq, OrderedDict())[key] = None

        return val

    def put(self, key: str, value: Any):
        """Insert or update a value, evicting the LFU (LRU on ties) entry when full."""
        if self.capacity == 0:
            return

        if key in self.cache:
            self.cache[key] = value
            self.get(key)  # refresh frequency/recency
            return

        if len(self.cache) >= self.capacity:
            # Evict least frequently used item (if tie, least recently used)
            evict_key, _ = self.frequency_lists[self.min_freq].popitem(last=False)
            del self.cache[evict_key]
            del self.frequencies[evict_key]
            if not self.frequency_lists[self.min_freq]:
                del self.frequency_lists[self.min_freq]

        # Add new item at frequency 1.
        self.cache[key] = value
        self.frequencies[key] = 1
        self.min_freq = 1
        self.frequency_lists.setdefault(1, OrderedDict())[key] = None
|
||||
|
||||
class MarketplaceDataOptimizer:
    """Advanced optimization engine for marketplace data access.

    Two-tier read-through cache: a fast in-process L1 (LFU/LRU) backed by a
    Redis L2. Degrades gracefully to L1-only when Redis is unreachable.
    """

    def __init__(self, redis_url: str = "redis://localhost:6379/0"):
        self.redis_url = redis_url
        self.redis_client = None

        # Two-tier cache: Fast L1 (Memory), Slower L2 (Redis)
        self.l1_cache = LFU_LRU_Cache(capacity=1000)
        self.is_connected = False

        # Per-namespace cache TTL defaults (seconds).
        self.ttls = {
            'order_book': 5,        # Very dynamic, 5 seconds
            'provider_status': 15,  # 15 seconds
            'market_stats': 60,     # 1 minute
            'historical_data': 3600 # 1 hour
        }

    async def connect(self):
        """Establish connection to the Redis L2 cache; fall back to L1-only on failure."""
        try:
            self.redis_client = redis.from_url(self.redis_url, decode_responses=True)
            await self.redis_client.ping()
            self.is_connected = True
            logger.info("Connected to Redis L2 cache")
        except Exception as e:
            logger.error(f"Failed to connect to Redis: {e}. Falling back to L1 cache only.")
            self.is_connected = False

    async def disconnect(self):
        """Close the Redis connection if one was opened."""
        if self.redis_client:
            await self.redis_client.close()
        self.is_connected = False

    def _generate_cache_key(self, namespace: str, params: Dict[str, Any]) -> str:
        """Generate a deterministic cache key from parameters.

        Keys are sorted before hashing so equal param dicts always map to the
        same key. MD5 is used only as a key digest, not for security.
        """
        param_str = json.dumps(params, sort_keys=True)
        param_hash = hashlib.md5(param_str.encode()).hexdigest()
        return f"mkpt:{namespace}:{param_hash}"

    async def get_cached_data(self, namespace: str, params: Dict[str, Any]) -> Optional[Any]:
        """Retrieve data from the multi-tier cache (L1 first, then L2); None on miss."""
        key = self._generate_cache_key(namespace, params)

        # 1. Try L1 Memory Cache (fastest). Expired entries are ignored and
        # left to be evicted naturally.
        l1_result = self.l1_cache.get(key)
        if l1_result is not None:
            if l1_result['expires_at'] > time.time():
                logger.debug(f"L1 Cache hit for {key}")
                return l1_result['data']

        # 2. Try L2 Redis Cache
        if self.is_connected:
            try:
                l2_result_str = await self.redis_client.get(key)
                if l2_result_str:
                    logger.debug(f"L2 Cache hit for {key}")
                    data = json.loads(l2_result_str)

                    # Backfill L1 cache with a shorter lifetime than L2 so
                    # L1 never outlives the authoritative L2 entry.
                    ttl = self.ttls.get(namespace, 60)
                    self.l1_cache.put(key, {
                        'data': data,
                        'expires_at': time.time() + min(ttl, 10)
                    })
                    return data
            except Exception as e:
                logger.warning(f"Redis get failed: {e}")

        return None

    async def set_cached_data(self, namespace: str, params: Dict[str, Any], data: Any, custom_ttl: Optional[int] = None):
        """Store data in both cache tiers.

        custom_ttl overrides the namespace default TTL (seconds).
        Data must be JSON-serializable for the L2 tier.
        """
        key = self._generate_cache_key(namespace, params)
        ttl = custom_ttl or self.ttls.get(namespace, 60)

        # 1. Update L1 Cache
        self.l1_cache.put(key, {
            'data': data,
            'expires_at': time.time() + ttl
        })

        # 2. Update L2 Redis Cache. This is awaited inline; to keep request
        # latency down in FastAPI, callers could move it to BackgroundTasks.
        if self.is_connected:
            try:
                await self.redis_client.setex(
                    key,
                    ttl,
                    json.dumps(data)
                )
            except Exception as e:
                logger.warning(f"Redis set failed: {e}")

    async def invalidate_namespace(self, namespace: str):
        """Invalidate all L2 entries for a namespace via SCAN + DELETE.

        L1 entries are not swept (that would require scanning the whole
        dict); they expire naturally or get evicted.
        """
        if self.is_connected:
            try:
                # Find all keys matching namespace pattern
                cursor = 0
                pattern = f"mkpt:{namespace}:*"

                while True:
                    cursor, keys = await self.redis_client.scan(cursor=cursor, match=pattern, count=100)
                    if keys:
                        await self.redis_client.delete(*keys)
                    if cursor == 0:
                        break

                logger.info(f"Invalidated L2 cache namespace: {namespace}")
            except Exception as e:
                logger.error(f"Failed to invalidate namespace {namespace}: {e}")

    async def precompute_market_stats(self, db_session) -> Dict[str, Any]:
        """Background task to precompute expensive market statistics and cache them.

        This would normally run periodically via Celery/Celery Beat. The DB
        aggregations are currently simulated with fixed values.
        """
        start_time = time.time()

        # Simulated expensive DB aggregations
        # In reality: SELECT AVG(price), SUM(volume) FROM trades WHERE created_at > NOW() - 24h
        stats = {
            "24h_volume": 1250000.50,
            "active_providers": 450,
            "average_price_per_tflop": 0.005,
            "network_utilization": 0.76,
            "computed_at": datetime.utcnow().isoformat(),
            "computation_time_ms": int((time.time() - start_time) * 1000)
        }

        # Cache the precomputed stats
        await self.set_cached_data('market_stats', {'period': '24h'}, stats, custom_ttl=300)

        return stats

    def optimize_order_book_response(self, raw_orders: List[Dict], depth: int = 50) -> Dict[str, List]:
        """
        Optimize the raw order book for client delivery.
        Groups similar prices (rounded to 4 decimals), limits depth, and
        formats as compact [price, amount] pairs.
        """
        buy_orders = [o for o in raw_orders if o['type'] == 'buy']
        sell_orders = [o for o in raw_orders if o['type'] == 'sell']

        # Aggregate by price level to reduce payload size
        agg_buys = {}
        for order in buy_orders:
            price = round(order['price'], 4)
            if price not in agg_buys:
                agg_buys[price] = 0
            agg_buys[price] += order['amount']

        agg_sells = {}
        for order in sell_orders:
            price = round(order['price'], 4)
            if price not in agg_sells:
                agg_sells[price] = 0
            agg_sells[price] += order['amount']

        # Bids best-first (highest price), asks best-first (lowest price).
        formatted_buys = [[p, q] for p, q in sorted(agg_buys.items(), reverse=True)[:depth]]
        formatted_sells = [[p, q] for p, q in sorted(agg_sells.items())[:depth]]

        return {
            "bids": formatted_buys,
            "asks": formatted_sells,
            "timestamp": time.time()
        }
|
||||
236
gpu_acceleration/parallel_processing/marketplace_monitor.py
Normal file
236
gpu_acceleration/parallel_processing/marketplace_monitor.py
Normal file
@@ -0,0 +1,236 @@
|
||||
"""
|
||||
Marketplace Real-time Performance Monitor
|
||||
Implements comprehensive real-time monitoring and analytics for the AITBC marketplace.
|
||||
"""
|
||||
|
||||
import asyncio
import collections
import logging
import time
from datetime import datetime, timedelta
# FIX: `collections` was imported from typing, which only works because the
# typing module happens to import collections internally; the proper
# `import collections` already exists below and is kept.
from typing import Any, Dict, List, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class TimeSeriesData:
    """Efficient in-memory time series data structure for real-time metrics.

    Parallel bounded deques hold timestamps and values; maxlen gives O(1)
    rolling-window behavior (oldest points drop off automatically).
    """

    def __init__(self, max_points: int = 3600):  # Default 1 hour of second-level data
        self.max_points = max_points
        self.timestamps = collections.deque(maxlen=max_points)
        self.values = collections.deque(maxlen=max_points)

    def add(self, value: float, timestamp: Optional[float] = None):
        """Append a data point; uses the current time when timestamp is None.

        FIX: the original used `timestamp or time.time()`, which silently
        replaced a legitimate 0.0 (epoch) timestamp with "now".
        """
        self.timestamps.append(time.time() if timestamp is None else timestamp)
        self.values.append(value)

    def get_latest(self) -> Optional[float]:
        """Return the most recent value, or None when the series is empty."""
        return self.values[-1] if self.values else None

    def get_average(self, window_seconds: int = 60) -> float:
        """Mean of values recorded within the last window_seconds (0.0 when none)."""
        if not self.values:
            return 0.0

        cutoff = time.time() - window_seconds
        valid_values = [v for t, v in zip(self.timestamps, self.values) if t >= cutoff]

        return sum(valid_values) / len(valid_values) if valid_values else 0.0

    def get_percentile(self, percentile: float, window_seconds: int = 60) -> float:
        """Approximate percentile (0.0-1.0) of values in the window (0.0 when none)."""
        if not self.values:
            return 0.0

        cutoff = time.time() - window_seconds
        valid_values = sorted(v for t, v in zip(self.timestamps, self.values) if t >= cutoff)

        if not valid_values:
            return 0.0

        # Nearest-rank style index, clamped to the valid range.
        idx = int(len(valid_values) * percentile)
        idx = min(max(idx, 0), len(valid_values) - 1)
        return valid_values[idx]
|
||||
|
||||
class MarketplaceMonitor:
|
||||
"""Real-time performance monitoring system for the marketplace"""
|
||||
|
||||
def __init__(self):
|
||||
# API Metrics
|
||||
self.api_latency_ms = TimeSeriesData()
|
||||
self.api_requests_per_sec = TimeSeriesData()
|
||||
self.api_error_rate = TimeSeriesData()
|
||||
|
||||
# Trading Metrics
|
||||
self.order_matching_time_ms = TimeSeriesData()
|
||||
self.trades_per_sec = TimeSeriesData()
|
||||
self.active_orders = TimeSeriesData()
|
||||
|
||||
# Resource Metrics
|
||||
self.gpu_utilization_pct = TimeSeriesData()
|
||||
self.network_bandwidth_mbps = TimeSeriesData()
|
||||
self.active_providers = TimeSeriesData()
|
||||
|
||||
# internal tracking
|
||||
self._request_counter = 0
|
||||
self._error_counter = 0
|
||||
self._trade_counter = 0
|
||||
self._last_tick = time.time()
|
||||
|
||||
self.is_running = False
|
||||
self._monitor_task = None
|
||||
|
||||
# Alert thresholds
|
||||
self.alert_thresholds = {
|
||||
'api_latency_p95_ms': 500.0,
|
||||
'api_error_rate_pct': 5.0,
|
||||
'gpu_utilization_pct': 90.0,
|
||||
'matching_time_ms': 100.0
|
||||
}
|
||||
|
||||
self.active_alerts = []
|
||||
|
||||
async def start(self):
|
||||
if self.is_running:
|
||||
return
|
||||
self.is_running = True
|
||||
self._monitor_task = asyncio.create_task(self._metric_tick_loop())
|
||||
logger.info("Marketplace Monitor started")
|
||||
|
||||
async def stop(self):
|
||||
self.is_running = False
|
||||
if self._monitor_task:
|
||||
self._monitor_task.cancel()
|
||||
logger.info("Marketplace Monitor stopped")
|
||||
|
||||
def record_api_call(self, latency_ms: float, is_error: bool = False):
|
||||
"""Record an API request for monitoring"""
|
||||
self.api_latency_ms.add(latency_ms)
|
||||
self._request_counter += 1
|
||||
if is_error:
|
||||
self._error_counter += 1
|
||||
|
||||
def record_trade(self, matching_time_ms: float):
|
||||
"""Record a successful trade match"""
|
||||
self.order_matching_time_ms.add(matching_time_ms)
|
||||
self._trade_counter += 1
|
||||
|
||||
def update_resource_metrics(self, gpu_util: float, bandwidth: float, providers: int, orders: int):
|
||||
"""Update system resource metrics"""
|
||||
self.gpu_utilization_pct.add(gpu_util)
|
||||
self.network_bandwidth_mbps.add(bandwidth)
|
||||
self.active_providers.add(providers)
|
||||
self.active_orders.add(orders)
|
||||
|
||||
async def _metric_tick_loop(self):
|
||||
"""Background task that aggregates metrics every second"""
|
||||
while self.is_running:
|
||||
try:
|
||||
now = time.time()
|
||||
elapsed = now - self._last_tick
|
||||
|
||||
if elapsed >= 1.0:
|
||||
# Calculate rates
|
||||
req_per_sec = self._request_counter / elapsed
|
||||
trades_per_sec = self._trade_counter / elapsed
|
||||
error_rate = (self._error_counter / max(1, self._request_counter)) * 100
|
||||
|
||||
# Store metrics
|
||||
self.api_requests_per_sec.add(req_per_sec)
|
||||
self.trades_per_sec.add(trades_per_sec)
|
||||
self.api_error_rate.add(error_rate)
|
||||
|
||||
# Reset counters
|
||||
self._request_counter = 0
|
||||
self._error_counter = 0
|
||||
self._trade_counter = 0
|
||||
self._last_tick = now
|
||||
|
||||
# Evaluate alerts
|
||||
self._evaluate_alerts()
|
||||
|
||||
await asyncio.sleep(1.0 - (time.time() - now)) # Sleep for remainder of second
|
||||
|
||||
except asyncio.CancelledError:
|
||||
break
|
||||
except Exception as e:
|
||||
logger.error(f"Error in monitor tick loop: {e}")
|
||||
await asyncio.sleep(1.0)
|
||||
|
||||
def _evaluate_alerts(self):
|
||||
"""Check metrics against thresholds and generate alerts"""
|
||||
current_alerts = []
|
||||
|
||||
# API Latency Alert
|
||||
p95_latency = self.api_latency_ms.get_percentile(0.95, window_seconds=60)
|
||||
if p95_latency > self.alert_thresholds['api_latency_p95_ms']:
|
||||
current_alerts.append({
|
||||
'id': f"alert_latency_{int(time.time())}",
|
||||
'severity': 'high' if p95_latency > self.alert_thresholds['api_latency_p95_ms'] * 2 else 'medium',
|
||||
'metric': 'api_latency',
|
||||
'value': p95_latency,
|
||||
'threshold': self.alert_thresholds['api_latency_p95_ms'],
|
||||
'message': f"High API Latency (p95): {p95_latency:.2f}ms",
|
||||
'timestamp': datetime.utcnow().isoformat()
|
||||
})
|
||||
|
||||
# Error Rate Alert
|
||||
avg_error_rate = self.api_error_rate.get_average(window_seconds=60)
|
||||
if avg_error_rate > self.alert_thresholds['api_error_rate_pct']:
|
||||
current_alerts.append({
|
||||
'id': f"alert_error_{int(time.time())}",
|
||||
'severity': 'critical',
|
||||
'metric': 'error_rate',
|
||||
'value': avg_error_rate,
|
||||
'threshold': self.alert_thresholds['api_error_rate_pct'],
|
||||
'message': f"High API Error Rate: {avg_error_rate:.2f}%",
|
||||
'timestamp': datetime.utcnow().isoformat()
|
||||
})
|
||||
|
||||
# Matching Time Alert
|
||||
avg_matching = self.order_matching_time_ms.get_average(window_seconds=60)
|
||||
if avg_matching > self.alert_thresholds['matching_time_ms']:
|
||||
current_alerts.append({
|
||||
'id': f"alert_matching_{int(time.time())}",
|
||||
'severity': 'medium',
|
||||
'metric': 'matching_time',
|
||||
'value': avg_matching,
|
||||
'threshold': self.alert_thresholds['matching_time_ms'],
|
||||
'message': f"Slow Order Matching: {avg_matching:.2f}ms",
|
||||
'timestamp': datetime.utcnow().isoformat()
|
||||
})
|
||||
|
||||
self.active_alerts = current_alerts
|
||||
|
||||
if current_alerts:
|
||||
# In a real system, this would trigger webhooks, Slack/Discord messages, etc.
|
||||
for alert in current_alerts:
|
||||
if alert['severity'] in ['high', 'critical']:
|
||||
logger.warning(f"MARKETPLACE ALERT: {alert['message']}")
|
||||
|
||||
def get_realtime_dashboard_data(self) -> Dict[str, Any]:
|
||||
"""Get aggregated data formatted for the frontend dashboard"""
|
||||
return {
|
||||
'status': 'degraded' if any(a['severity'] in ['high', 'critical'] for a in self.active_alerts) else 'healthy',
|
||||
'timestamp': datetime.utcnow().isoformat(),
|
||||
'current_metrics': {
|
||||
'api': {
|
||||
'rps': round(self.api_requests_per_sec.get_latest() or 0, 2),
|
||||
'latency_p50_ms': round(self.api_latency_ms.get_percentile(0.50, 60), 2),
|
||||
'latency_p95_ms': round(self.api_latency_ms.get_percentile(0.95, 60), 2),
|
||||
'error_rate_pct': round(self.api_error_rate.get_average(60), 2)
|
||||
},
|
||||
'trading': {
|
||||
'tps': round(self.trades_per_sec.get_latest() or 0, 2),
|
||||
'matching_time_ms': round(self.order_matching_time_ms.get_average(60), 2),
|
||||
'active_orders': int(self.active_orders.get_latest() or 0)
|
||||
},
|
||||
'network': {
|
||||
'active_providers': int(self.active_providers.get_latest() or 0),
|
||||
'gpu_utilization_pct': round(self.gpu_utilization_pct.get_latest() or 0, 2),
|
||||
'bandwidth_mbps': round(self.network_bandwidth_mbps.get_latest() or 0, 2)
|
||||
}
|
||||
},
|
||||
'alerts': self.active_alerts
|
||||
}
|
||||
|
||||
# Global instance
# Module-level singleton so API routes and background services share one monitor.
monitor = MarketplaceMonitor()
||||
265
gpu_acceleration/parallel_processing/marketplace_scaler.py
Normal file
265
gpu_acceleration/parallel_processing/marketplace_scaler.py
Normal file
@@ -0,0 +1,265 @@
|
||||
"""
|
||||
Marketplace Adaptive Resource Scaler
|
||||
Implements predictive and reactive auto-scaling of marketplace resources based on demand.
|
||||
"""
|
||||
|
||||
import time
|
||||
import asyncio
|
||||
import logging
|
||||
from typing import Dict, List, Optional, Any, Tuple
|
||||
from datetime import datetime, timedelta
|
||||
import math
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class ScalingPolicy:
    """Configuration for scaling behavior."""

    def __init__(
        self,
        min_nodes: int = 2,
        max_nodes: int = 100,
        target_utilization: float = 0.75,
        scale_up_threshold: float = 0.85,
        scale_down_threshold: float = 0.40,
        cooldown_period_sec: int = 300,  # 5 minutes between scaling actions
        predictive_scaling: bool = True
    ):
        # Hard bounds on cluster size.
        self.min_nodes = min_nodes
        self.max_nodes = max_nodes
        # Utilization level the scaler steers toward, plus the trigger bands
        # above/below it that cause scale-up / scale-down decisions.
        self.target_utilization = target_utilization
        self.scale_up_threshold = scale_up_threshold
        self.scale_down_threshold = scale_down_threshold
        # Minimum number of seconds between two consecutive scaling actions.
        self.cooldown_period_sec = cooldown_period_sec
        # Whether historical demand may trigger proactive scale-ups.
        self.predictive_scaling = predictive_scaling
||||
class ResourceScaler:
    """Adaptive resource scaling engine for the AITBC marketplace.

    Combines reactive scaling (current utilization / queue depth) with
    optional predictive scaling based on an hour-of-week demand history.
    Node provisioning is simulated; see `_execute_scaling`.
    """

    def __init__(self, policy: Optional[ScalingPolicy] = None):
        self.policy = policy or ScalingPolicy()

        # Current state (node counts are simulated in this implementation).
        self.current_nodes = self.policy.min_nodes
        self.active_gpu_nodes = 0
        self.active_cpu_nodes = self.policy.min_nodes

        # Epoch timestamp of the last scaling action; enforces the cooldown.
        self.last_scaling_action_time = 0
        self.scaling_history = []

        # Historical demand tracking for predictive scaling
        # Format: hour_of_week (0-167) -> avg_utilization
        self.historical_demand = {}

        self.is_running = False
        self._scaler_task = None

    async def start(self):
        """Start the background scaling loop (idempotent)."""
        if self.is_running:
            return
        self.is_running = True
        self._scaler_task = asyncio.create_task(self._scaling_loop())
        logger.info(f"Resource Scaler started (Min: {self.policy.min_nodes}, Max: {self.policy.max_nodes})")

    async def stop(self):
        """Stop the scaling loop and cancel the background task."""
        self.is_running = False
        if self._scaler_task:
            self._scaler_task.cancel()
        logger.info("Resource Scaler stopped")

    def update_historical_demand(self, utilization: float):
        """Fold a utilization sample into the hour-of-week demand history."""
        now = datetime.utcnow()
        hour_of_week = now.weekday() * 24 + now.hour

        if hour_of_week not in self.historical_demand:
            self.historical_demand[hour_of_week] = utilization
        else:
            # Exponential moving average (favor recent data)
            current_avg = self.historical_demand[hour_of_week]
            self.historical_demand[hour_of_week] = (current_avg * 0.9) + (utilization * 0.1)

    def _predict_demand(self, lookahead_hours: int = 1) -> float:
        """Predict expected utilization based on historical patterns.

        Returns 0.0 when predictive scaling is disabled or no history exists.
        Falls back to the overall historical mean when the exact target hour
        has no recorded sample yet.
        """
        if not self.policy.predictive_scaling or not self.historical_demand:
            return 0.0

        now = datetime.utcnow()
        target_hour = (now.weekday() * 24 + now.hour + lookahead_hours) % 168

        # If we have exact data for that hour
        if target_hour in self.historical_demand:
            return self.historical_demand[target_hour]

        # Simplistic interpolation: overall mean of all recorded hours.
        # (historical_demand is guaranteed non-empty by the guard above, so
        # the previous sorted-keys emptiness check was dead code.)
        return sum(self.historical_demand.values()) / len(self.historical_demand)

    async def _scaling_loop(self):
        """Background task that evaluates scaling rules every 10 seconds."""
        while self.is_running:
            try:
                # In a real system, we'd fetch this from the Monitor or
                # Coordinator; here we simulate fetching current metrics.
                current_utilization = self._get_current_utilization()
                current_queue_depth = self._get_queue_depth()

                self.update_historical_demand(current_utilization)

                await self.evaluate_scaling(current_utilization, current_queue_depth)

                # Check every 10 seconds
                await asyncio.sleep(10.0)

            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error(f"Error in scaling loop: {e}")
                await asyncio.sleep(10.0)

    async def evaluate_scaling(self, current_utilization: float, queue_depth: int) -> Optional[Dict[str, Any]]:
        """Evaluate scaling rules and execute an action if warranted.

        Args:
            current_utilization: Cluster utilization, expected in [0, 1].
            queue_depth: Number of queued tasks awaiting capacity.

        Returns:
            A record describing the executed scaling action, or None when no
            action was taken (cooldown active or thresholds not crossed).
        """
        now = time.time()

        # Respect the cooldown window between consecutive scaling actions.
        if now - self.last_scaling_action_time < self.policy.cooldown_period_sec:
            return None

        predicted_utilization = self._predict_demand()

        # Determine target node count
        target_nodes = self.current_nodes
        action = None
        reason = ""

        # Scale UP conditions: reactive (hot cluster or deep queue) ...
        if current_utilization > self.policy.scale_up_threshold or queue_depth > self.current_nodes * 5:
            desired_increase = math.ceil(self.current_nodes * (current_utilization / self.policy.target_utilization - 1.0))
            # Ensure we add at least 1, but bounded by queue depth and max_nodes
            nodes_to_add = max(1, min(desired_increase, max(1, queue_depth // 2)))

            target_nodes = min(self.policy.max_nodes, self.current_nodes + nodes_to_add)

            if target_nodes > self.current_nodes:
                action = "scale_up"
                reason = f"High utilization ({current_utilization*100:.1f}%) or queue depth ({queue_depth})"

        # ... or predictive (proactively add one node ahead of expected demand).
        elif self.policy.predictive_scaling and predicted_utilization > self.policy.scale_up_threshold:
            target_nodes = min(self.policy.max_nodes, self.current_nodes + 1)

            if target_nodes > self.current_nodes:
                action = "scale_up"
                reason = f"Predictive scaling (expected {predicted_utilization*100:.1f}% util)"

        # Scale DOWN conditions: only when idle AND (if predicting) demand is
        # expected to stay low.
        elif current_utilization < self.policy.scale_down_threshold and queue_depth == 0:
            if not self.policy.predictive_scaling or predicted_utilization < self.policy.target_utilization:
                # Remove at most ~20% of nodes at a time, never below min_nodes.
                nodes_to_remove = max(1, int(self.current_nodes * 0.2))
                target_nodes = max(self.policy.min_nodes, self.current_nodes - nodes_to_remove)

                if target_nodes < self.current_nodes:
                    action = "scale_down"
                    reason = f"Low utilization ({current_utilization*100:.1f}%)"

        # Execute scaling if needed
        if action and target_nodes != self.current_nodes:
            diff = abs(target_nodes - self.current_nodes)

            await self._execute_scaling(action, diff, target_nodes)

            record = {
                "timestamp": datetime.utcnow().isoformat(),
                "action": action,
                "nodes_changed": diff,
                "new_total": target_nodes,
                "reason": reason,
                "metrics_at_time": {
                    "utilization": current_utilization,
                    "queue_depth": queue_depth,
                    "predicted_utilization": predicted_utilization
                }
            }

            self.scaling_history.append(record)
            # Keep history manageable
            if len(self.scaling_history) > 1000:
                self.scaling_history = self.scaling_history[-1000:]

            self.last_scaling_action_time = now
            self.current_nodes = target_nodes

            logger.info(f"Auto-scaler: {action.upper()} to {target_nodes} nodes. Reason: {reason}")
            return record

        return None

    async def _execute_scaling(self, action: str, count: int, new_total: int) -> bool:
        """Execute the actual scaling action.

        In production this would call cloud/orchestrator APIs (AWS
        AutoScaling, Kubernetes scale, etc.); here we simulate the
        provisioning delay and adjust the simulated node mix.
        """
        logger.debug(f"Executing {action} by {count} nodes...")

        # Simulate API delay
        await asyncio.sleep(2.0)

        if action == "scale_up":
            # Provision a mix of GPU and CPU instances based on demand.
            new_gpus = count // 2
            new_cpus = count - new_gpus
            self.active_gpu_nodes += new_gpus
            self.active_cpu_nodes += new_cpus
        elif action == "scale_down":
            # De-provision, preferring CPU nodes first while honoring min_nodes.
            remove_cpus = min(count, max(0, self.active_cpu_nodes - self.policy.min_nodes))
            remove_gpus = count - remove_cpus

            self.active_cpu_nodes -= remove_cpus
            self.active_gpu_nodes = max(0, self.active_gpu_nodes - remove_gpus)

        return True

    # --- Simulation helpers ---
    def _get_current_utilization(self) -> float:
        """Simulate getting current cluster utilization (clamped to 0.1-0.99)."""
        # In reality, fetch from MarketplaceMonitor or Coordinator
        import random
        # Base utilization with some noise
        base = 0.6
        return max(0.1, min(0.99, base + random.uniform(-0.2, 0.3)))

    def _get_queue_depth(self) -> int:
        """Simulate getting current queue depth (occasionally bursty)."""
        import random
        if random.random() > 0.8:
            return random.randint(10, 50)
        return random.randint(0, 5)

    def get_status(self) -> Dict[str, Any]:
        """Get current scaler status for API/monitoring consumers."""
        return {
            "status": "running" if self.is_running else "stopped",
            "current_nodes": {
                "total": self.current_nodes,
                "cpu_nodes": self.active_cpu_nodes,
                "gpu_nodes": self.active_gpu_nodes
            },
            "policy": {
                "min_nodes": self.policy.min_nodes,
                "max_nodes": self.policy.max_nodes,
                "target_utilization": self.policy.target_utilization
            },
            "last_action": self.scaling_history[-1] if self.scaling_history else None,
            "prediction": {
                "next_hour_utilization_estimate": round(self._predict_demand(1), 3)
            }
        }
|
||||
Reference in New Issue
Block a user