Update database paths and fix foreign key references across coordinator API

- Change SQLite database path from `/home/oib/windsurf/aitbc/data/` to `/opt/data/`
- Fix foreign key references to use correct table names (users, wallets, gpu_registry)
- Replace governance router with new governance and community routers
- Add multi-modal RL router to main application
- Simplify DEPLOYMENT_READINESS_REPORT.md to focus on production deployment status
- Update governance router with decentralized DAO voting
This commit is contained in:
oib
2026-02-26 19:32:06 +01:00
parent 1e2ea0bb9d
commit 7bb2905cca
89 changed files with 38245 additions and 1260 deletions

View File

@@ -0,0 +1,576 @@
"""
Marketplace GPU Resource Optimizer
Optimizes GPU acceleration and resource utilization specifically for marketplace AI power trading
"""
import asyncio
import json
import logging
import multiprocessing
import os
import sys
import threading
import time
import uuid
from datetime import datetime
from typing import Dict, List, Optional, Any, Tuple

import numpy as np
# Try to import pycuda, fallback if not available
try:
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
CUDA_AVAILABLE = True
except ImportError:
CUDA_AVAILABLE = False
print("Warning: PyCUDA not available. GPU optimization will run in simulation mode.")
logger = logging.getLogger(__name__)
class MarketplaceGPUOptimizer:
    """Optimizes GPU resources for marketplace AI power trading.

    Maintains a fleet of GPU devices (real via PyCUDA, or simulated when CUDA
    is unavailable), a per-device best-fit/scatter memory allocator, and
    priority-aware job scheduling with preemption and on-demand
    defragmentation.
    """

    def __init__(self, simulation_mode: bool = not CUDA_AVAILABLE):
        # When True, devices and their telemetry are synthetic.
        self.simulation_mode = simulation_mode
        self.gpu_devices: List[Dict[str, Any]] = []
        # gpu_id -> allocator state: {'allocated_blocks', 'free_blocks', 'fragmentation'}
        self.gpu_memory_pools: Dict[int, Dict[str, Any]] = {}
        # job_id -> allocation record for currently running jobs
        self.active_jobs: Dict[str, Dict[str, Any]] = {}
        self.resource_metrics = {
            'total_utilization': 0.0,
            'memory_utilization': 0.0,
            'compute_utilization': 0.0,
            'energy_efficiency': 0.0,
            'jobs_processed': 0,
            'failed_jobs': 0
        }
        # Optimization configuration
        self.config = {
            'memory_fragmentation_threshold': 0.15,  # defragment above 15%
            'dynamic_batching_enabled': True,
            'max_batch_size': 128,
            'idle_power_state': 'P8',
            'active_power_state': 'P0',
            'thermal_throttle_threshold': 85.0  # Celsius
        }
        # (fix) RLock instead of Lock: preemption re-enters locking methods on
        # the same thread (_preempt_low_priority_jobs -> release_resources);
        # a non-reentrant Lock would deadlock on that re-acquisition.
        self.lock = threading.RLock()
        self._initialize_gpu_devices()

    def _initialize_gpu_devices(self):
        """Initialize available GPU devices (simulated, or real via PyCUDA)."""
        if self.simulation_mode:
            # Create simulated GPUs
            self.gpu_devices = [
                {
                    'id': 0,
                    'name': 'Simulated RTX 4090',
                    'total_memory': 24 * 1024 * 1024 * 1024,  # 24GB
                    'free_memory': 24 * 1024 * 1024 * 1024,
                    'compute_capability': (8, 9),
                    'utilization': 0.0,
                    'temperature': 45.0,
                    'power_draw': 30.0,
                    'power_limit': 450.0,
                    'status': 'idle'
                },
                {
                    'id': 1,
                    'name': 'Simulated RTX 4090',
                    'total_memory': 24 * 1024 * 1024 * 1024,
                    'free_memory': 24 * 1024 * 1024 * 1024,
                    'compute_capability': (8, 9),
                    'utilization': 0.0,
                    'temperature': 42.0,
                    'power_draw': 28.0,
                    'power_limit': 450.0,
                    'status': 'idle'
                }
            ]
            logger.info(f"Initialized {len(self.gpu_devices)} simulated GPU devices")
        else:
            try:
                # Initialize real GPUs via PyCUDA
                num_devices = cuda.Device.count()
                for i in range(num_devices):
                    dev = cuda.Device(i)
                    # NOTE(review): cuda.mem_get_info() reports the *current*
                    # context's device, not necessarily device i -- confirm
                    # against PyCUDA docs if multi-GPU accuracy matters.
                    free_mem, total_mem = cuda.mem_get_info()
                    self.gpu_devices.append({
                        'id': i,
                        'name': dev.name(),
                        'total_memory': total_mem,
                        'free_memory': free_mem,
                        'compute_capability': dev.compute_capability(),
                        'utilization': 0.0,  # Would need NVML for real utilization
                        'temperature': 0.0,  # Would need NVML
                        'power_draw': 0.0,   # Would need NVML
                        'power_limit': 0.0,  # Would need NVML
                        'status': 'idle'
                    })
                logger.info(f"Initialized {len(self.gpu_devices)} real GPU devices")
            except Exception as e:
                logger.error(f"Error initializing GPUs: {e}")
                self.simulation_mode = True
                self._initialize_gpu_devices()  # Fallback to simulation
        # Initialize memory pools for each device (idempotent: the fallback
        # recursion above may run this loop twice with identical results).
        for gpu in self.gpu_devices:
            self.gpu_memory_pools[gpu['id']] = {
                'allocated_blocks': [],
                'free_blocks': [{'start': 0, 'size': gpu['total_memory']}],
                'fragmentation': 0.0
            }

    async def optimize_resource_allocation(self, job_requirements: Dict[str, Any]) -> Dict[str, Any]:
        """
        Optimize GPU resource allocation for a new marketplace job.

        Returns the allocation plan, or a rejection/queued dict if resources
        are unavailable even after defragmentation and preemption.
        """
        required_memory = job_requirements.get('memory_bytes', 1024 * 1024 * 1024)  # Default 1GB
        required_compute = job_requirements.get('compute_units', 1.0)
        max_latency = job_requirements.get('max_latency_ms', 1000)  # currently informational only
        priority = job_requirements.get('priority', 1)  # 1 (low) to 10 (high)
        with self.lock:
            # 1. Find optimal GPU
            best_gpu_id = -1
            best_score = -1.0
            for gpu in self.gpu_devices:
                # Check constraints
                if gpu['free_memory'] < required_memory:
                    continue
                if gpu['temperature'] > self.config['thermal_throttle_threshold'] and priority < 8:
                    continue  # Reserve hot GPUs for high priority only
                # Optimization score (higher is better): balance load while
                # minimizing fragmentation; penalize heat and current load.
                mem_utilization = 1.0 - (gpu['free_memory'] / gpu['total_memory'])
                comp_utilization = gpu['utilization']
                score = 100.0
                score -= (comp_utilization * 40.0)
                score -= ((gpu['temperature'] - 40.0) * 1.5)
                # Memory fit score: tighter fit is better to reduce fragmentation
                mem_fit_ratio = required_memory / gpu['free_memory']
                score += (mem_fit_ratio * 20.0)
                if score > best_score:
                    best_score = score
                    best_gpu_id = gpu['id']
            if best_gpu_id != -1:
                # 2. Allocate resources on best GPU.
                # (fix) uuid.uuid4 -- the original called a never-imported uuid4().
                job_id = job_requirements['job_id'] if 'job_id' in job_requirements else f"job_{uuid.uuid4().hex[:8]}"
                allocation = self._allocate_memory(best_gpu_id, required_memory, job_id)
                if not allocation['success']:
                    return {
                        'success': False,
                        'reason': 'Memory allocation failed due to fragmentation',
                        'queued': True
                    }
                # 3. Update device and job state
                for i, gpu in enumerate(self.gpu_devices):
                    if gpu['id'] == best_gpu_id:
                        self.gpu_devices[i]['free_memory'] -= required_memory
                        self.gpu_devices[i]['utilization'] = min(1.0, self.gpu_devices[i]['utilization'] + (required_compute * 0.1))
                        self.gpu_devices[i]['status'] = 'active'
                        break
                self.active_jobs[job_id] = {
                    'gpu_id': best_gpu_id,
                    'memory_allocated': required_memory,
                    'compute_allocated': required_compute,
                    'priority': priority,
                    'start_time': time.time(),
                    'status': 'running'
                }
                self._update_metrics()
                return {
                    'success': True,
                    'job_id': job_id,
                    'gpu_id': best_gpu_id,
                    'allocation_plan': {
                        'memory_blocks': allocation['blocks'],
                        'dynamic_batching': self.config['dynamic_batching_enabled'],
                        'power_state_enforced': self.config['active_power_state']
                    },
                    'estimated_completion_ms': int(required_compute * 100)
                }
        # (fix) No GPU could take the job. Run recovery strategies *outside*
        # the lock: both paths re-enter locking methods (and the original
        # awaited and recursed while still holding the lock, which deadlocked
        # with a plain Lock and stalled other schedulers regardless).
        if await self._attempt_memory_defragmentation():
            return await self.optimize_resource_allocation(job_requirements)
        if await self._preempt_low_priority_jobs(priority, required_memory):
            return await self.optimize_resource_allocation(job_requirements)
        return {
            'success': False,
            'reason': 'Insufficient GPU resources available even after optimization',
            'queued': True,
            'estimated_wait_ms': 5000
        }

    def _allocate_memory(self, gpu_id: int, size: int, job_id: str) -> Dict[str, Any]:
        """Custom memory allocator designed to minimize fragmentation.

        Tries a contiguous best-fit allocation first; falls back to scatter
        allocation across several free blocks when only fragmented space
        remains. Returns {'success': bool, 'blocks': [...]} (plus
        'fragmented': True for scatter allocations).
        """
        pool = self.gpu_memory_pools[gpu_id]
        # Sort free blocks by size (Best Fit algorithm)
        pool['free_blocks'].sort(key=lambda x: x['size'])
        allocated_blocks = []
        remaining_size = size
        # Try contiguous allocation first (Best Fit: smallest block that fits)
        for i, block in enumerate(pool['free_blocks']):
            if block['size'] >= size:
                allocated_block = {
                    'job_id': job_id,
                    'start': block['start'],
                    'size': size
                }
                allocated_blocks.append(allocated_block)
                pool['allocated_blocks'].append(allocated_block)
                # Shrink (or consume) the free block we carved from
                if block['size'] == size:
                    pool['free_blocks'].pop(i)
                else:
                    block['start'] += size
                    block['size'] -= size
                self._recalculate_fragmentation(gpu_id)
                return {'success': True, 'blocks': allocated_blocks}
        # No single block fits: scatter allocation (virtual memory mapping).
        # Less performant, but prevents OOM on fragmented memory.
        if sum(b['size'] for b in pool['free_blocks']) >= size:
            blocks_to_remove = []
            for i, block in enumerate(pool['free_blocks']):
                if remaining_size <= 0:
                    break
                take_size = min(block['size'], remaining_size)
                allocated_block = {
                    'job_id': job_id,
                    'start': block['start'],
                    'size': take_size
                }
                allocated_blocks.append(allocated_block)
                pool['allocated_blocks'].append(allocated_block)
                if take_size == block['size']:
                    blocks_to_remove.append(i)
                else:
                    block['start'] += take_size
                    block['size'] -= take_size
                remaining_size -= take_size
            # Remove fully consumed free blocks (reverse order keeps indices valid)
            for i in reversed(blocks_to_remove):
                pool['free_blocks'].pop(i)
            self._recalculate_fragmentation(gpu_id)
            return {'success': True, 'blocks': allocated_blocks, 'fragmented': True}
        return {'success': False}

    def release_resources(self, job_id: str) -> bool:
        """Release memory/compute held by a job; returns False if unknown."""
        with self.lock:
            if job_id not in self.active_jobs:
                return False
            job = self.active_jobs[job_id]
            gpu_id = job['gpu_id']
            pool = self.gpu_memory_pools[gpu_id]
            # Return this job's blocks to the free list
            blocks_to_free = []
            new_allocated = []
            for block in pool['allocated_blocks']:
                if block['job_id'] == job_id:
                    blocks_to_free.append({'start': block['start'], 'size': block['size']})
                else:
                    new_allocated.append(block)
            pool['allocated_blocks'] = new_allocated
            pool['free_blocks'].extend(blocks_to_free)
            self._merge_free_blocks(gpu_id)
            # Update GPU state
            for i, gpu in enumerate(self.gpu_devices):
                if gpu['id'] == gpu_id:
                    self.gpu_devices[i]['free_memory'] += job['memory_allocated']
                    self.gpu_devices[i]['utilization'] = max(0.0, self.gpu_devices[i]['utilization'] - (job['compute_allocated'] * 0.1))
                    if self.gpu_devices[i]['utilization'] <= 0.05:
                        self.gpu_devices[i]['status'] = 'idle'
                    break
            # Update metrics
            self.resource_metrics['jobs_processed'] += 1
            if job['status'] == 'failed':
                self.resource_metrics['failed_jobs'] += 1
            del self.active_jobs[job_id]
            self._update_metrics()
            return True

    def _merge_free_blocks(self, gpu_id: int):
        """Merge adjacent free memory blocks to reduce fragmentation."""
        pool = self.gpu_memory_pools[gpu_id]
        if len(pool['free_blocks']) <= 1:
            return
        # Sort by start address so adjacency can be detected pairwise
        pool['free_blocks'].sort(key=lambda x: x['start'])
        merged = [pool['free_blocks'][0]]
        for current in pool['free_blocks'][1:]:
            previous = merged[-1]
            if previous['start'] + previous['size'] == current['start']:
                previous['size'] += current['size']
            else:
                merged.append(current)
        pool['free_blocks'] = merged
        self._recalculate_fragmentation(gpu_id)

    def _recalculate_fragmentation(self, gpu_id: int):
        """Recompute the memory fragmentation index (0.0 contiguous .. 1.0)."""
        pool = self.gpu_memory_pools[gpu_id]
        if not pool['free_blocks']:
            pool['fragmentation'] = 0.0
            return
        total_free = sum(b['size'] for b in pool['free_blocks'])
        if total_free == 0:
            pool['fragmentation'] = 0.0
            return
        max_block = max(b['size'] for b in pool['free_blocks'])
        # High when the largest free block is much smaller than total free memory
        pool['fragmentation'] = 1.0 - (max_block / total_free)

    async def _attempt_memory_defragmentation(self) -> bool:
        """Compact allocations on GPUs whose fragmentation exceeds the threshold.

        In a real scenario this would pause kernels and use
        cudaMemcpyDeviceToDevice; here we simulate a perfect compaction.
        Returns True if any device was defragmented.
        """
        defrag_occurred = False
        for gpu_id, pool in self.gpu_memory_pools.items():
            if pool['fragmentation'] > self.config['memory_fragmentation_threshold']:
                logger.info(f"Defragmenting GPU {gpu_id} (frag: {pool['fragmentation']:.2f})")
                await asyncio.sleep(0.1)  # Simulate defrag time
                total_allocated = sum(b['size'] for b in pool['allocated_blocks'])
                # Repack every allocation contiguously from address 0
                new_allocated = []
                current_ptr = 0
                for block in pool['allocated_blocks']:
                    new_allocated.append({
                        'job_id': block['job_id'],
                        'start': current_ptr,
                        'size': block['size']
                    })
                    current_ptr += block['size']
                pool['allocated_blocks'] = new_allocated
                gpu = next((g for g in self.gpu_devices if g['id'] == gpu_id), None)
                if gpu:
                    # All free space is now one contiguous tail block
                    pool['free_blocks'] = [{
                        'start': total_allocated,
                        'size': gpu['total_memory'] - total_allocated
                    }]
                    pool['fragmentation'] = 0.0
                    defrag_occurred = True
        return defrag_occurred

    async def schedule_job(self, job_id: str, priority: int, memory_required: int, computation_complexity: float) -> bool:
        """Dynamic priority scheduling: place a job, preempting lower-priority
        jobs when no GPU currently has enough free memory.

        Returns True if the job was allocated and started, False otherwise.
        """
        job_data = {
            'job_id': job_id,
            'priority': priority,
            'memory_required': memory_required,
            'computation_complexity': computation_complexity,
            # (fix) release_resources() reads 'compute_allocated'; the original
            # never set it, so releasing a scheduled job raised KeyError.
            'compute_allocated': computation_complexity,
            'status': 'queued',
            'submitted_at': datetime.utcnow().isoformat()
        }

        def _try_allocate() -> bool:
            """Pick the GPU with the most relative free memory and allocate there."""
            with self.lock:
                best_gpu = -1
                best_score = -float('inf')
                # (fix) the original iterated a non-existent self.gpu_status and
                # read pool['total_memory'] / pool['allocated_memory'], keys the
                # pools never had; use the device records instead.
                for gpu in self.gpu_devices:
                    if gpu['free_memory'] >= memory_required:
                        score = (gpu['free_memory'] / gpu['total_memory']) * 100
                        if score > best_score:
                            best_score = score
                            best_gpu = gpu['id']
                if best_gpu < 0:
                    return False
                alloc_result = self._allocate_memory(best_gpu, memory_required, job_id)
                if not alloc_result['success']:
                    return False
                # Mirror optimize_resource_allocation's bookkeeping so that
                # release_resources() later restores a consistent state.
                for gpu in self.gpu_devices:
                    if gpu['id'] == best_gpu:
                        gpu['free_memory'] -= memory_required
                        gpu['status'] = 'active'
                        break
                job_data['status'] = 'running'
                job_data['gpu_id'] = best_gpu
                job_data['memory_allocated'] = memory_required
                self.active_jobs[job_id] = job_data
                return True

        # Direct allocation if some GPU has enough free memory
        if _try_allocate():
            return True
        # Otherwise try to preempt lower priority jobs (outside the lock:
        # preemption calls release_resources, which locks internally).
        logger.info(f"No GPU has {memory_required} bytes free for job {job_id}. Attempting preemption...")
        if await self._preempt_low_priority_jobs(priority, memory_required) and _try_allocate():
            return True
        logger.warning(f"Job {job_id} remains queued. Insufficient resources even after preemption.")
        return False

    async def _preempt_low_priority_jobs(self, incoming_priority: int, required_memory: int) -> bool:
        """Preempt lower priority jobs to make room for higher priority ones.

        Frees the smallest set of strictly-lower-priority jobs whose combined
        memory covers required_memory. Returns True only if enough memory
        could actually be freed (nothing is preempted otherwise).
        """
        preemptable_jobs = []
        for job_id, job in self.active_jobs.items():
            if job['priority'] < incoming_priority:
                preemptable_jobs.append((job_id, job))
        # Sort by priority (lowest first) then memory (largest first)
        preemptable_jobs.sort(key=lambda x: (x[1]['priority'], -x[1]['memory_allocated']))
        freed_memory = 0
        jobs_to_preempt = []
        for job_id, job in preemptable_jobs:
            jobs_to_preempt.append(job_id)
            freed_memory += job['memory_allocated']
            if freed_memory >= required_memory:
                break
        if freed_memory >= required_memory:
            for job_id in jobs_to_preempt:
                logger.info(f"Preempting low priority job {job_id} for higher priority request")
                # In real scenario, would save state/checkpoint before killing
                self.release_resources(job_id)
                # Notify job owner (simulated)
                # event_bus.publish('job_preempted', {'job_id': job_id})
            return True
        return False

    def _update_metrics(self):
        """Update fleet-wide utilization/efficiency metrics (and simulated telemetry)."""
        total_util = 0.0
        total_mem_util = 0.0
        for gpu in self.gpu_devices:
            mem_util = 1.0 - (gpu['free_memory'] / gpu['total_memory'])
            total_mem_util += mem_util
            total_util += gpu['utilization']
            # Simulated devices: low-pass filter temperature/power toward a
            # utilization-driven target so readings change smoothly.
            if self.simulation_mode:
                target_temp = 35.0 + (gpu['utilization'] * 50.0)
                gpu['temperature'] = gpu['temperature'] * 0.9 + target_temp * 0.1
                target_power = 20.0 + (gpu['utilization'] * (gpu['power_limit'] - 20.0))
                gpu['power_draw'] = gpu['power_draw'] * 0.8 + target_power * 0.2
        n_gpus = len(self.gpu_devices)
        if n_gpus > 0:
            self.resource_metrics['compute_utilization'] = total_util / n_gpus
            self.resource_metrics['memory_utilization'] = total_mem_util / n_gpus
            self.resource_metrics['total_utilization'] = (self.resource_metrics['compute_utilization'] + self.resource_metrics['memory_utilization']) / 2
        # Energy efficiency: rough utilization-per-watt proxy
        total_power = sum(g['power_draw'] for g in self.gpu_devices)
        if total_power > 0:
            self.resource_metrics['energy_efficiency'] = (self.resource_metrics['compute_utilization'] * 100) / total_power

    def get_system_status(self) -> Dict[str, Any]:
        """Get current per-device status and fleet metrics as a JSON-safe dict."""
        with self.lock:
            self._update_metrics()
            devices_info = []
            for gpu in self.gpu_devices:
                pool = self.gpu_memory_pools[gpu['id']]
                devices_info.append({
                    'id': gpu['id'],
                    'name': gpu['name'],
                    'utilization': round(gpu['utilization'] * 100, 2),
                    'memory_used_gb': round((gpu['total_memory'] - gpu['free_memory']) / (1024**3), 2),
                    'memory_total_gb': round(gpu['total_memory'] / (1024**3), 2),
                    'temperature_c': round(gpu['temperature'], 1),
                    'power_draw_w': round(gpu['power_draw'], 1),
                    'status': gpu['status'],
                    'fragmentation': round(pool['fragmentation'] * 100, 2)
                })
            return {
                'timestamp': datetime.utcnow().isoformat(),
                'active_jobs': len(self.active_jobs),
                'metrics': {
                    'overall_utilization_pct': round(self.resource_metrics['total_utilization'] * 100, 2),
                    'compute_utilization_pct': round(self.resource_metrics['compute_utilization'] * 100, 2),
                    'memory_utilization_pct': round(self.resource_metrics['memory_utilization'] * 100, 2),
                    'energy_efficiency_score': round(self.resource_metrics['energy_efficiency'], 4),
                    'jobs_processed_total': self.resource_metrics['jobs_processed']
                },
                'devices': devices_info
            }
# Example usage function
async def optimize_marketplace_batch(jobs: List[Dict[str, Any]]):
    """Run a batch of marketplace jobs through a fresh optimizer.

    Jobs are allocated sequentially (each allocation sees the state left by
    the previous one). Returns a tuple of (per-job allocation results,
    final system status snapshot).
    """
    gpu_optimizer = MarketplaceGPUOptimizer()
    outcomes = [await gpu_optimizer.optimize_resource_allocation(spec) for spec in jobs]
    return outcomes, gpu_optimizer.get_system_status()

View File

@@ -0,0 +1,468 @@
"""
Distributed Agent Processing Framework
Implements a scalable, fault-tolerant framework for distributed AI agent tasks across the AITBC network.
"""
import asyncio
import uuid
import time
import logging
import json
import hashlib
from typing import Dict, List, Optional, Any, Callable, Awaitable
from datetime import datetime
from enum import Enum
logger = logging.getLogger(__name__)
class TaskStatus(str, Enum):
    """Lifecycle states of a distributed task (str-valued so they serialize as plain JSON strings)."""
    PENDING = "pending"        # submitted, waiting in the priority queue
    SCHEDULED = "scheduled"    # assigned to a worker, not yet executing
    PROCESSING = "processing"  # currently executing on a worker
    COMPLETED = "completed"    # finished successfully; result available
    FAILED = "failed"          # failed permanently (retries exhausted)
    TIMEOUT = "timeout"        # exceeded its timeout_ms budget
    RETRYING = "retrying"      # attempt failed; re-queued for another try
class WorkerStatus(str, Enum):
    """Health/load states of a worker node (str-valued for JSON serialization)."""
    IDLE = "idle"              # no active tasks
    BUSY = "busy"              # running tasks, below capacity
    OFFLINE = "offline"        # missed heartbeats; excluded from scheduling
    OVERLOADED = "overloaded"  # at capacity or high CPU load; excluded from scheduling
class DistributedTask:
    """A single unit of agent work tracked through the distributed framework.

    Holds the task's payload, scheduling parameters, lifecycle timestamps,
    and a content hash used for result caching/deduplication.
    """

    def __init__(
        self,
        task_id: str,
        agent_id: str,
        payload: Dict[str, Any],
        priority: int = 1,
        requires_gpu: bool = False,
        timeout_ms: int = 30000,
        max_retries: int = 3
    ):
        # Falsy task_id (e.g. "" or None) gets an auto-generated identifier.
        self.task_id = task_id or f"dt_{uuid.uuid4().hex[:12]}"
        self.agent_id = agent_id
        self.payload = payload
        self.priority = priority          # higher number = more urgent
        self.requires_gpu = requires_gpu  # restricts scheduling to GPU workers
        self.timeout_ms = timeout_ms
        self.max_retries = max_retries
        self.status = TaskStatus.PENDING
        # Lifecycle timestamps (time.time() seconds); None until reached.
        self.created_at = time.time()
        self.scheduled_at = None
        self.started_at = None
        self.completed_at = None
        self.assigned_worker_id = None    # worker currently holding the task
        self.result = None                # set on success
        self.error = None                 # set on failure/retry
        self.retries = 0                  # failed attempts so far
        # Calculate content hash for caching/deduplication: identical
        # payloads (key order ignored via sort_keys) share cached results.
        content = json.dumps(payload, sort_keys=True)
        self.content_hash = hashlib.sha256(content.encode()).hexdigest()
class WorkerNode:
    """Coordinator-side record of a registered worker node and its load/health."""

    def __init__(
        self,
        worker_id: str,
        capabilities: List[str],
        has_gpu: bool = False,
        max_concurrent_tasks: int = 4
    ):
        self.worker_id = worker_id
        self.capabilities = capabilities              # advertised capability tags
        self.has_gpu = has_gpu
        self.max_concurrent_tasks = max_concurrent_tasks
        self.status = WorkerStatus.IDLE
        self.active_tasks = []                        # task_ids currently assigned here
        self.last_heartbeat = time.time()             # used to detect offline workers
        self.total_completed = 0
        # 0.0 to 1.0, adjusted by the coordinator on success/failure; used
        # both for scheduling preference and simulated failure injection.
        self.performance_score = 1.0
class DistributedProcessingCoordinator:
    """
    Coordinates distributed task execution across available worker nodes.
    Implements advanced scheduling, fault tolerance, and load balancing.
    """

    def __init__(self):
        self.tasks: Dict[str, DistributedTask] = {}
        self.workers: Dict[str, WorkerNode] = {}
        # Entries are (queue_priority, created_at, task_id); lowest tuple first,
        # so created_at breaks ties FIFO within a priority.
        self.task_queue = asyncio.PriorityQueue()
        # Result cache (content_hash -> result)
        # NOTE(review): unbounded -- consider eviction for long-lived processes.
        self.result_cache: Dict[str, Any] = {}
        self.is_running = False
        self._scheduler_task = None
        self._monitor_task = None

    async def start(self):
        """Start the coordinator background tasks (idempotent)."""
        if self.is_running:
            return
        self.is_running = True
        self._scheduler_task = asyncio.create_task(self._scheduling_loop())
        self._monitor_task = asyncio.create_task(self._health_monitor_loop())
        logger.info("Distributed Processing Coordinator started")

    async def stop(self):
        """Stop the coordinator gracefully.

        Cancels the background loops; in-flight simulated executions are left
        to finish on their own.
        """
        self.is_running = False
        if self._scheduler_task:
            self._scheduler_task.cancel()
        if self._monitor_task:
            self._monitor_task.cancel()
        logger.info("Distributed Processing Coordinator stopped")

    def register_worker(self, worker_id: str, capabilities: List[str], has_gpu: bool = False, max_tasks: int = 4):
        """Register a new worker node in the cluster, or refresh an existing one."""
        if worker_id not in self.workers:
            self.workers[worker_id] = WorkerNode(worker_id, capabilities, has_gpu, max_tasks)
            logger.info(f"Registered new worker node: {worker_id} (GPU: {has_gpu})")
        else:
            # Update existing worker; re-registration also counts as liveness
            worker = self.workers[worker_id]
            worker.capabilities = capabilities
            worker.has_gpu = has_gpu
            worker.max_concurrent_tasks = max_tasks
            worker.last_heartbeat = time.time()
            if worker.status == WorkerStatus.OFFLINE:
                worker.status = WorkerStatus.IDLE

    def heartbeat(self, worker_id: str, metrics: Optional[Dict[str, Any]] = None):
        """Record a heartbeat from a worker node; unknown workers are ignored."""
        if worker_id in self.workers:
            worker = self.workers[worker_id]
            worker.last_heartbeat = time.time()
            # Update status based on metrics if provided
            if metrics:
                cpu_load = metrics.get('cpu_load', 0.0)
                if cpu_load > 0.9 or len(worker.active_tasks) >= worker.max_concurrent_tasks:
                    worker.status = WorkerStatus.OVERLOADED
                elif len(worker.active_tasks) > 0:
                    worker.status = WorkerStatus.BUSY
                else:
                    worker.status = WorkerStatus.IDLE

    async def submit_task(self, task: DistributedTask) -> str:
        """Submit a new task to the distributed framework; returns its task_id.

        Identical payloads (by content hash) are served from the result cache
        without ever touching the queue.
        """
        # Check cache first
        if task.content_hash in self.result_cache:
            task.status = TaskStatus.COMPLETED
            task.result = self.result_cache[task.content_hash]
            task.completed_at = time.time()
            self.tasks[task.task_id] = task
            logger.debug(f"Task {task.task_id} fulfilled from cache")
            return task.task_id
        self.tasks[task.task_id] = task
        # Priority Queue uses lowest number first, so we invert user priority
        queue_priority = 100 - min(task.priority, 100)
        await self.task_queue.put((queue_priority, task.created_at, task.task_id))
        logger.debug(f"Task {task.task_id} queued with priority {task.priority}")
        return task.task_id

    async def get_task_status(self, task_id: str) -> Optional[Dict[str, Any]]:
        """Get the current status (and result or error) of a task, or None if unknown."""
        if task_id not in self.tasks:
            return None
        task = self.tasks[task_id]
        response = {
            'task_id': task.task_id,
            'status': task.status,
            'created_at': task.created_at
        }
        if task.status == TaskStatus.COMPLETED:
            response['result'] = task.result
            response['completed_at'] = task.completed_at
            # Cache hits never started; fall back to creation time for duration
            response['duration_ms'] = int((task.completed_at - (task.started_at or task.created_at)) * 1000)
        elif task.status in [TaskStatus.FAILED, TaskStatus.TIMEOUT]:
            response['error'] = str(task.error)
        if task.assigned_worker_id:
            response['worker_id'] = task.assigned_worker_id
        return response

    async def _scheduling_loop(self):
        """Background task that assigns queued tasks to available workers."""
        while self.is_running:
            try:
                # Poll instead of blocking forever so the loop notices shutdown
                if self.task_queue.empty():
                    await asyncio.sleep(0.1)
                    continue
                priority, _, task_id = await self.task_queue.get()
                if task_id not in self.tasks:
                    self.task_queue.task_done()
                    continue
                task = self.tasks[task_id]
                # Skip tasks resolved (cancelled/completed) while waiting in the queue
                if task.status != TaskStatus.PENDING and task.status != TaskStatus.RETRYING:
                    self.task_queue.task_done()
                    continue
                # Find best worker
                best_worker = self._find_best_worker(task)
                if best_worker:
                    await self._assign_task(task, best_worker)
                else:
                    # No worker available right now; requeue after a delay via a
                    # background task so this loop is not blocked.
                    asyncio.create_task(self._requeue_delayed(priority, task))
                self.task_queue.task_done()
            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error(f"Error in scheduling loop: {e}")
                await asyncio.sleep(1.0)

    async def _requeue_delayed(self, priority: int, task: DistributedTask):
        """Put a task back in the queue after a short delay (if still schedulable)."""
        await asyncio.sleep(0.5)
        if self.is_running and task.status in [TaskStatus.PENDING, TaskStatus.RETRYING]:
            await self.task_queue.put((priority, task.created_at, task.task_id))

    def _find_best_worker(self, task: DistributedTask) -> Optional[WorkerNode]:
        """Find the optimal worker for a task based on requirements and load."""
        candidates = []
        for worker in self.workers.values():
            # Skip offline or overloaded workers
            if worker.status in [WorkerStatus.OFFLINE, WorkerStatus.OVERLOADED]:
                continue
            # Skip if worker is at capacity
            if len(worker.active_tasks) >= worker.max_concurrent_tasks:
                continue
            # Check GPU requirement
            if task.requires_gpu and not worker.has_gpu:
                continue
            # Required capability check could be added here
            # Score: historical performance, minus current load, minus a
            # penalty for occupying a GPU worker with CPU-only work.
            score = worker.performance_score * 100
            load_factor = len(worker.active_tasks) / worker.max_concurrent_tasks
            score -= (load_factor * 20)
            if worker.has_gpu and not task.requires_gpu:
                score -= 30
            candidates.append((score, worker))
        if not candidates:
            return None
        # Return worker with highest score
        candidates.sort(key=lambda x: x[0], reverse=True)
        return candidates[0][1]

    async def _assign_task(self, task: DistributedTask, worker: WorkerNode):
        """Assign a task to a specific worker and dispatch it asynchronously."""
        task.status = TaskStatus.SCHEDULED
        task.assigned_worker_id = worker.worker_id
        task.scheduled_at = time.time()
        worker.active_tasks.append(task.task_id)
        if len(worker.active_tasks) >= worker.max_concurrent_tasks:
            worker.status = WorkerStatus.OVERLOADED
        elif worker.status == WorkerStatus.IDLE:
            worker.status = WorkerStatus.BUSY
        logger.debug(f"Assigned task {task.task_id} to worker {worker.worker_id}")
        # In a real system, this would make an RPC/network call to the worker.
        # Here we simulate the network dispatch asynchronously.
        asyncio.create_task(self._simulate_worker_execution(task, worker))

    async def _simulate_worker_execution(self, task: DistributedTask, worker: WorkerNode):
        """Simulate the execution on the remote worker node."""
        task.status = TaskStatus.PROCESSING
        task.started_at = time.time()
        try:
            # Simulate processing time based on task complexity
            # Real implementation would await the actual RPC response
            complexity = task.payload.get('complexity', 1.0)
            base_time = 0.5
            if worker.has_gpu and task.requires_gpu:
                # GPU processes faster
                processing_time = base_time * complexity * 0.2
            else:
                processing_time = base_time * complexity
            # Simulate potential network/node failure: low-score workers fail
            # during ~1s out of every 10s wall-clock window.
            if worker.performance_score < 0.5 and time.time() % 10 < 1:
                raise ConnectionError("Worker node network failure")
            await asyncio.sleep(processing_time)
            # Success
            self.report_task_success(task.task_id, {"result_data": "simulated_success", "processed_by": worker.worker_id})
        except Exception as e:
            self.report_task_failure(task.task_id, str(e))

    def report_task_success(self, task_id: str, result: Any):
        """Called by a worker when a task completes successfully."""
        if task_id not in self.tasks:
            return
        task = self.tasks[task_id]
        if task.status in [TaskStatus.COMPLETED, TaskStatus.FAILED, TaskStatus.TIMEOUT]:
            return  # Already finished (e.g. timed out before the result arrived)
        task.status = TaskStatus.COMPLETED
        task.result = result
        task.completed_at = time.time()
        # Cache the result for future deduplicated submissions
        self.result_cache[task.content_hash] = result
        # Update worker metrics
        if task.assigned_worker_id and task.assigned_worker_id in self.workers:
            worker = self.workers[task.assigned_worker_id]
            if task_id in worker.active_tasks:
                worker.active_tasks.remove(task_id)
            worker.total_completed += 1
            # Increase performance score slightly (max 1.0)
            worker.performance_score = min(1.0, worker.performance_score + 0.01)
            if len(worker.active_tasks) < worker.max_concurrent_tasks and worker.status == WorkerStatus.OVERLOADED:
                worker.status = WorkerStatus.BUSY
            if len(worker.active_tasks) == 0:
                worker.status = WorkerStatus.IDLE
        logger.info(f"Task {task_id} completed successfully")

    def report_task_failure(self, task_id: str, error: str):
        """Called when a task fails execution; requeues until max_retries is hit."""
        if task_id not in self.tasks:
            return
        task = self.tasks[task_id]
        # Update worker metrics
        if task.assigned_worker_id and task.assigned_worker_id in self.workers:
            worker = self.workers[task.assigned_worker_id]
            if task_id in worker.active_tasks:
                worker.active_tasks.remove(task_id)
            # Decrease performance score heavily on failure
            worker.performance_score = max(0.1, worker.performance_score - 0.05)
        # Handle retry logic
        if task.retries < task.max_retries:
            task.retries += 1
            task.status = TaskStatus.RETRYING
            task.assigned_worker_id = None
            task.error = f"Attempt {task.retries} failed: {error}"
            logger.warning(f"Task {task_id} failed, scheduling retry {task.retries}/{task.max_retries}")
            # Put back in queue with slightly lower priority
            queue_priority = (100 - min(task.priority, 100)) + (task.retries * 5)
            asyncio.create_task(self.task_queue.put((queue_priority, time.time(), task.task_id)))
        else:
            task.status = TaskStatus.FAILED
            task.error = f"Max retries exceeded. Final error: {error}"
            task.completed_at = time.time()
            logger.error(f"Task {task_id} failed permanently")

    async def _health_monitor_loop(self):
        """Background task that monitors worker health and task timeouts."""
        while self.is_running:
            try:
                current_time = time.time()
                # 1. Check worker health
                for worker_id, worker in self.workers.items():
                    # If no heartbeat for 60 seconds, mark offline
                    if current_time - worker.last_heartbeat > 60.0:
                        if worker.status != WorkerStatus.OFFLINE:
                            logger.warning(f"Worker {worker_id} went offline (missed heartbeats)")
                            worker.status = WorkerStatus.OFFLINE
                            # Fail over every task running on this worker.
                            # (fix) iterate over a snapshot: report_task_failure()
                            # removes entries from worker.active_tasks, and mutating
                            # the list mid-iteration silently skipped every other task.
                            for task_id in list(worker.active_tasks):
                                if task_id in self.tasks:
                                    self.report_task_failure(task_id, "Worker node disconnected")
                            worker.active_tasks.clear()
                # 2. Check task timeouts (snapshot: failure handlers run mid-scan)
                for task_id, task in list(self.tasks.items()):
                    if task.status in [TaskStatus.SCHEDULED, TaskStatus.PROCESSING]:
                        start_time = task.started_at or task.scheduled_at
                        if start_time and (current_time - start_time) * 1000 > task.timeout_ms:
                            logger.warning(f"Task {task_id} timed out")
                            self.report_task_failure(task_id, f"Execution timed out after {task.timeout_ms}ms")
                await asyncio.sleep(5.0)  # Check every 5 seconds
            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error(f"Error in health monitor loop: {e}")
                await asyncio.sleep(5.0)

    def get_cluster_status(self) -> Dict[str, Any]:
        """Get the overall status of the distributed cluster."""
        total_workers = len(self.workers)
        active_workers = sum(1 for w in self.workers.values() if w.status != WorkerStatus.OFFLINE)
        gpu_workers = sum(1 for w in self.workers.values() if w.has_gpu and w.status != WorkerStatus.OFFLINE)
        pending_tasks = sum(1 for t in self.tasks.values() if t.status == TaskStatus.PENDING)
        processing_tasks = sum(1 for t in self.tasks.values() if t.status in [TaskStatus.SCHEDULED, TaskStatus.PROCESSING])
        completed_tasks = sum(1 for t in self.tasks.values() if t.status == TaskStatus.COMPLETED)
        failed_tasks = sum(1 for t in self.tasks.values() if t.status in [TaskStatus.FAILED, TaskStatus.TIMEOUT])
        # Cluster utilization = task slots in use / total slots on live nodes
        total_capacity = sum(w.max_concurrent_tasks for w in self.workers.values() if w.status != WorkerStatus.OFFLINE)
        current_load = sum(len(w.active_tasks) for w in self.workers.values() if w.status != WorkerStatus.OFFLINE)
        utilization = (current_load / total_capacity * 100) if total_capacity > 0 else 0
        return {
            "cluster_health": "healthy" if active_workers > 0 else "offline",
            "nodes": {
                "total": total_workers,
                "active": active_workers,
                "with_gpu": gpu_workers
            },
            "tasks": {
                "pending": pending_tasks,
                "processing": processing_tasks,
                "completed": completed_tasks,
                "failed": failed_tasks
            },
            "performance": {
                "utilization_percent": round(utilization, 2),
                "cache_size": len(self.result_cache)
            },
            "timestamp": datetime.utcnow().isoformat()
        }

View File

@@ -0,0 +1,246 @@
"""
Marketplace Caching & Optimization Service
Implements advanced caching, indexing, and data optimization for the AITBC marketplace.
"""
import json
import time
import hashlib
import logging
from typing import Dict, List, Optional, Any, Union, Set
from collections import OrderedDict
from datetime import datetime
import redis.asyncio as redis
logger = logging.getLogger(__name__)
class LFU_LRU_Cache:
    """Hybrid Least-Frequently/Least-Recently Used cache for in-memory optimization.

    Keys are grouped into per-frequency ``OrderedDict`` buckets; within a
    bucket, insertion order provides LRU ordering.  Eviction therefore removes
    the least-recently-used key among the least-frequently-used ones, in O(1).
    """

    def __init__(self, capacity: int):
        self.capacity = capacity
        self.cache = {}             # key -> value
        self.frequencies = {}       # key -> access count
        self.frequency_lists = {}   # access count -> OrderedDict of keys (LRU order)
        self.min_freq = 0           # smallest access count currently present

    def get(self, key: str) -> Optional[Any]:
        """Return the cached value for *key*, bumping its frequency; None on miss."""
        if key not in self.cache:
            return None
        freq = self.frequencies[key]
        val = self.cache[key]
        # Move key out of its current frequency bucket.
        # BUGFIX: OrderedDict has no .remove(); delete by key instead
        # (the original `self.frequency_lists[freq].remove(key)` raised
        # AttributeError on every cache hit).
        del self.frequency_lists[freq][key]
        if not self.frequency_lists[freq]:
            # Drop the empty bucket and advance min_freq if it pointed here.
            del self.frequency_lists[freq]
            if self.min_freq == freq:
                self.min_freq += 1
        # Add the key to the next frequency bucket (created on demand).
        new_freq = freq + 1
        self.frequencies[key] = new_freq
        if new_freq not in self.frequency_lists:
            self.frequency_lists[new_freq] = OrderedDict()
        self.frequency_lists[new_freq][key] = None
        return val

    def put(self, key: str, value: Any):
        """Insert or update *key*; evicts the LFU (tie: LRU) entry when full."""
        if self.capacity == 0:
            return
        if key in self.cache:
            self.cache[key] = value
            self.get(key)  # bump frequency / recency for the updated key
            return
        if len(self.cache) >= self.capacity:
            # Evict least frequently used item (if tie, least recently used)
            evict_key, _ = self.frequency_lists[self.min_freq].popitem(last=False)
            del self.cache[evict_key]
            del self.frequencies[evict_key]
            if not self.frequency_lists[self.min_freq]:
                del self.frequency_lists[self.min_freq]
        # Add new item; a fresh key always enters at frequency 1.
        self.cache[key] = value
        self.frequencies[key] = 1
        self.min_freq = 1
        if 1 not in self.frequency_lists:
            self.frequency_lists[1] = OrderedDict()
        self.frequency_lists[1][key] = None
class MarketplaceDataOptimizer:
    """Advanced optimization engine for marketplace data access.

    Implements a two-tier read cache — L1 in-process (LFU/LRU) and L2 Redis —
    plus response-shaping helpers for hot marketplace endpoints.  Every cache
    failure degrades gracefully: L2 errors fall back to L1 / the caller.
    """

    def __init__(self, redis_url: str = "redis://localhost:6379/0"):
        self.redis_url = redis_url
        self.redis_client = None
        # Two-tier cache: Fast L1 (Memory), Slower L2 (Redis)
        self.l1_cache = LFU_LRU_Cache(capacity=1000)
        self.is_connected = False
        # Default TTLs in seconds per data namespace; more volatile data expires faster.
        self.ttls = {
            'order_book': 5,         # Very dynamic, 5 seconds
            'provider_status': 15,   # 15 seconds
            'market_stats': 60,      # 1 minute
            'historical_data': 3600  # 1 hour
        }

    async def connect(self):
        """Establish connection to the Redis L2 cache.

        On failure the optimizer keeps working with the L1 memory cache only.
        """
        try:
            self.redis_client = redis.from_url(self.redis_url, decode_responses=True)
            await self.redis_client.ping()
            self.is_connected = True
            logger.info("Connected to Redis L2 cache")
        except Exception as e:
            logger.error(f"Failed to connect to Redis: {e}. Falling back to L1 cache only.")
            self.is_connected = False

    async def disconnect(self):
        """Close the Redis connection, if any."""
        if self.redis_client:
            await self.redis_client.close()
        self.is_connected = False

    def _generate_cache_key(self, namespace: str, params: Dict[str, Any]) -> str:
        """Generate a deterministic cache key from parameters.

        Params are serialized with sorted keys so logically-equal dicts map to
        the same key.  MD5 is used purely as a compact non-cryptographic hash.
        """
        param_str = json.dumps(params, sort_keys=True)
        param_hash = hashlib.md5(param_str.encode()).hexdigest()
        return f"mkpt:{namespace}:{param_hash}"

    async def get_cached_data(self, namespace: str, params: Dict[str, Any]) -> Optional[Any]:
        """Retrieve data from the multi-tier cache, or None on a full miss."""
        key = self._generate_cache_key(namespace, params)
        # 1. Try L1 Memory Cache (fastest). Expired entries are skipped but
        #    left in place; they are overwritten on backfill or evicted by
        #    the LFU/LRU policy.
        l1_result = self.l1_cache.get(key)
        if l1_result is not None:
            if l1_result['expires_at'] > time.time():
                logger.debug(f"L1 Cache hit for {key}")
                return l1_result['data']
        # 2. Try L2 Redis Cache (TTL handled server-side by Redis).
        if self.is_connected:
            try:
                l2_result_str = await self.redis_client.get(key)
                if l2_result_str:
                    logger.debug(f"L2 Cache hit for {key}")
                    data = json.loads(l2_result_str)
                    # Backfill L1 so subsequent reads skip the network hop.
                    ttl = self.ttls.get(namespace, 60)
                    self.l1_cache.put(key, {
                        'data': data,
                        'expires_at': time.time() + min(ttl, 10)  # L1 expires sooner than L2
                    })
                    return data
            except Exception as e:
                logger.warning(f"Redis get failed: {e}")
        return None

    async def set_cached_data(self, namespace: str, params: Dict[str, Any], data: Any, custom_ttl: Optional[int] = None):
        """Store *data* in both cache tiers.

        ``custom_ttl`` (seconds) overrides the namespace default when given.
        """
        key = self._generate_cache_key(namespace, params)
        ttl = custom_ttl or self.ttls.get(namespace, 60)
        # 1. Update L1 Cache
        self.l1_cache.put(key, {
            'data': data,
            'expires_at': time.time() + ttl
        })
        # 2. Update L2 Redis Cache; failures are logged, never raised.
        if self.is_connected:
            try:
                await self.redis_client.setex(
                    key,
                    ttl,
                    json.dumps(data)
                )
            except Exception as e:
                logger.warning(f"Redis set failed: {e}")

    async def invalidate_namespace(self, namespace: str):
        """Invalidate all L2 cached items for *namespace*.

        Uses cursor-based SCAN (non-blocking) rather than KEYS.  L1 entries
        are not scanned; they expire naturally or get evicted.
        """
        if self.is_connected:
            try:
                cursor = 0
                pattern = f"mkpt:{namespace}:*"
                while True:
                    cursor, keys = await self.redis_client.scan(cursor=cursor, match=pattern, count=100)
                    if keys:
                        await self.redis_client.delete(*keys)
                    if cursor == 0:
                        break
                logger.info(f"Invalidated L2 cache namespace: {namespace}")
            except Exception as e:
                logger.error(f"Failed to invalidate namespace {namespace}: {e}")

    async def precompute_market_stats(self, db_session) -> Dict[str, Any]:
        """Precompute expensive market statistics and cache them.

        Intended to run periodically (e.g. via Celery Beat).  The DB
        aggregation is currently simulated with fixed values.
        """
        start_time = time.time()
        # Simulated expensive DB aggregations
        # In reality: SELECT AVG(price), SUM(volume) FROM trades WHERE created_at > NOW() - 24h
        stats = {
            "24h_volume": 1250000.50,
            "active_providers": 450,
            "average_price_per_tflop": 0.005,
            "network_utilization": 0.76,
            "computed_at": datetime.utcnow().isoformat(),
            "computation_time_ms": int((time.time() - start_time) * 1000)
        }
        # Cache with a longer TTL than live stats since recomputation is costly.
        await self.set_cached_data('market_stats', {'period': '24h'}, stats, custom_ttl=300)
        return stats

    def optimize_order_book_response(self, raw_orders: List[Dict], depth: int = 50) -> Dict[str, List]:
        """Optimize the raw order book for client delivery.

        Aggregates amounts at 4-decimal price levels to shrink the payload,
        truncates to *depth* levels per side, and returns ``[price, qty]``
        pairs: bids sorted descending, asks ascending.
        """
        buy_orders = [o for o in raw_orders if o['type'] == 'buy']
        sell_orders = [o for o in raw_orders if o['type'] == 'sell']
        # Aggregate by price level to reduce payload size
        agg_buys: Dict[float, float] = {}
        for order in buy_orders:
            price = round(order['price'], 4)
            agg_buys[price] = agg_buys.get(price, 0) + order['amount']
        agg_sells: Dict[float, float] = {}
        for order in sell_orders:
            price = round(order['price'], 4)
            agg_sells[price] = agg_sells.get(price, 0) + order['amount']
        # Format and sort: best bid (highest) and best ask (lowest) first.
        formatted_buys = [[p, q] for p, q in sorted(agg_buys.items(), reverse=True)[:depth]]
        formatted_sells = [[p, q] for p, q in sorted(agg_sells.items())[:depth]]
        return {
            "bids": formatted_buys,
            "asks": formatted_sells,
            "timestamp": time.time()
        }

View File

@@ -0,0 +1,236 @@
"""
Marketplace Real-time Performance Monitor
Implements comprehensive real-time monitoring and analytics for the AITBC marketplace.
"""
import asyncio
import collections
import logging
import time
from datetime import datetime, timedelta
from typing import Any, Dict, List, Optional
logger = logging.getLogger(__name__)
class TimeSeriesData:
    """Efficient in-memory time series data structure for real-time metrics.

    Two parallel deques (timestamps/values) share a ``maxlen`` so the oldest
    points fall off automatically; window queries are O(n) linear scans.
    """

    def __init__(self, max_points: int = 3600):  # Default 1 hour of second-level data
        self.max_points = max_points
        self.timestamps = collections.deque(maxlen=max_points)
        self.values = collections.deque(maxlen=max_points)

    def add(self, value: float, timestamp: Optional[float] = None):
        """Append a sample; *timestamp* defaults to the current time.

        BUGFIX: use an explicit None check — the previous ``timestamp or
        time.time()`` silently replaced a legitimate ``timestamp=0.0`` (epoch)
        with the current time.
        """
        self.timestamps.append(time.time() if timestamp is None else timestamp)
        self.values.append(value)

    def get_latest(self) -> Optional[float]:
        """Most recent value, or None if no samples exist."""
        return self.values[-1] if self.values else None

    def get_average(self, window_seconds: int = 60) -> float:
        """Mean of samples within the trailing window; 0.0 if none."""
        if not self.values:
            return 0.0
        cutoff = time.time() - window_seconds
        valid_values = [v for t, v in zip(self.timestamps, self.values) if t >= cutoff]
        return sum(valid_values) / len(valid_values) if valid_values else 0.0

    def get_percentile(self, percentile: float, window_seconds: int = 60) -> float:
        """Nearest-rank percentile (0.0-1.0) within the trailing window; 0.0 if none."""
        if not self.values:
            return 0.0
        cutoff = time.time() - window_seconds
        valid_values = sorted([v for t, v in zip(self.timestamps, self.values) if t >= cutoff])
        if not valid_values:
            return 0.0
        # Clamp the rank index into the valid range for edge percentiles (0.0 / 1.0).
        idx = int(len(valid_values) * percentile)
        idx = min(max(idx, 0), len(valid_values) - 1)
        return valid_values[idx]
class MarketplaceMonitor:
    """Real-time performance monitoring system for the marketplace.

    Callers feed raw events in via record_api_call()/record_trade()/
    update_resource_metrics(); a background task aggregates them into
    per-second rates every tick and evaluates alert thresholds.
    """
    def __init__(self):
        # API Metrics
        self.api_latency_ms = TimeSeriesData()
        self.api_requests_per_sec = TimeSeriesData()
        self.api_error_rate = TimeSeriesData()
        # Trading Metrics
        self.order_matching_time_ms = TimeSeriesData()
        self.trades_per_sec = TimeSeriesData()
        self.active_orders = TimeSeriesData()
        # Resource Metrics
        self.gpu_utilization_pct = TimeSeriesData()
        self.network_bandwidth_mbps = TimeSeriesData()
        self.active_providers = TimeSeriesData()
        # Internal counters, reset each ~1s tick by _metric_tick_loop
        self._request_counter = 0
        self._error_counter = 0
        self._trade_counter = 0
        self._last_tick = time.time()
        self.is_running = False
        self._monitor_task = None
        # Alert thresholds (units encoded in the key names)
        self.alert_thresholds = {
            'api_latency_p95_ms': 500.0,
            'api_error_rate_pct': 5.0,
            'gpu_utilization_pct': 90.0,
            'matching_time_ms': 100.0
        }
        self.active_alerts = []
    async def start(self):
        """Start the background aggregation loop (idempotent)."""
        if self.is_running:
            return
        self.is_running = True
        self._monitor_task = asyncio.create_task(self._metric_tick_loop())
        logger.info("Marketplace Monitor started")
    async def stop(self):
        """Stop the background aggregation loop."""
        self.is_running = False
        if self._monitor_task:
            self._monitor_task.cancel()
        logger.info("Marketplace Monitor stopped")
    def record_api_call(self, latency_ms: float, is_error: bool = False):
        """Record an API request for monitoring"""
        self.api_latency_ms.add(latency_ms)
        self._request_counter += 1
        if is_error:
            self._error_counter += 1
    def record_trade(self, matching_time_ms: float):
        """Record a successful trade match"""
        self.order_matching_time_ms.add(matching_time_ms)
        self._trade_counter += 1
    def update_resource_metrics(self, gpu_util: float, bandwidth: float, providers: int, orders: int):
        """Update system resource metrics"""
        self.gpu_utilization_pct.add(gpu_util)
        self.network_bandwidth_mbps.add(bandwidth)
        self.active_providers.add(providers)
        self.active_orders.add(orders)
    async def _metric_tick_loop(self):
        """Background task that aggregates metrics every second.

        Converts the raw event counters into per-second rates, resets
        them, and re-evaluates alerts.  Counter reset and _last_tick
        update must stay together so rates use a consistent window.
        """
        while self.is_running:
            try:
                now = time.time()
                elapsed = now - self._last_tick
                if elapsed >= 1.0:
                    # Calculate rates over the actual elapsed window (may be >1s)
                    req_per_sec = self._request_counter / elapsed
                    trades_per_sec = self._trade_counter / elapsed
                    # max(1, ...) guards division by zero on an idle tick
                    error_rate = (self._error_counter / max(1, self._request_counter)) * 100
                    # Store metrics
                    self.api_requests_per_sec.add(req_per_sec)
                    self.trades_per_sec.add(trades_per_sec)
                    self.api_error_rate.add(error_rate)
                    # Reset counters
                    self._request_counter = 0
                    self._error_counter = 0
                    self._trade_counter = 0
                    self._last_tick = now
                    # Evaluate alerts
                    self._evaluate_alerts()
                # NOTE(review): the argument can go negative if processing took
                # >1s; asyncio.sleep treats that as an immediate yield.
                await asyncio.sleep(1.0 - (time.time() - now))  # Sleep for remainder of second
            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error(f"Error in monitor tick loop: {e}")
                await asyncio.sleep(1.0)
    def _evaluate_alerts(self):
        """Check metrics against thresholds and generate alerts.

        Rebuilds self.active_alerts from scratch each tick, so alerts
        auto-clear as soon as the underlying metric recovers.
        """
        current_alerts = []
        # API Latency Alert (p95 over the last 60s)
        p95_latency = self.api_latency_ms.get_percentile(0.95, window_seconds=60)
        if p95_latency > self.alert_thresholds['api_latency_p95_ms']:
            current_alerts.append({
                'id': f"alert_latency_{int(time.time())}",
                # Escalate to 'high' when latency is double the threshold
                'severity': 'high' if p95_latency > self.alert_thresholds['api_latency_p95_ms'] * 2 else 'medium',
                'metric': 'api_latency',
                'value': p95_latency,
                'threshold': self.alert_thresholds['api_latency_p95_ms'],
                'message': f"High API Latency (p95): {p95_latency:.2f}ms",
                'timestamp': datetime.utcnow().isoformat()
            })
        # Error Rate Alert (60s average)
        avg_error_rate = self.api_error_rate.get_average(window_seconds=60)
        if avg_error_rate > self.alert_thresholds['api_error_rate_pct']:
            current_alerts.append({
                'id': f"alert_error_{int(time.time())}",
                'severity': 'critical',
                'metric': 'error_rate',
                'value': avg_error_rate,
                'threshold': self.alert_thresholds['api_error_rate_pct'],
                'message': f"High API Error Rate: {avg_error_rate:.2f}%",
                'timestamp': datetime.utcnow().isoformat()
            })
        # Matching Time Alert (60s average)
        avg_matching = self.order_matching_time_ms.get_average(window_seconds=60)
        if avg_matching > self.alert_thresholds['matching_time_ms']:
            current_alerts.append({
                'id': f"alert_matching_{int(time.time())}",
                'severity': 'medium',
                'metric': 'matching_time',
                'value': avg_matching,
                'threshold': self.alert_thresholds['matching_time_ms'],
                'message': f"Slow Order Matching: {avg_matching:.2f}ms",
                'timestamp': datetime.utcnow().isoformat()
            })
        self.active_alerts = current_alerts
        if current_alerts:
            # In a real system, this would trigger webhooks, Slack/Discord messages, etc.
            for alert in current_alerts:
                if alert['severity'] in ['high', 'critical']:
                    logger.warning(f"MARKETPLACE ALERT: {alert['message']}")
    def get_realtime_dashboard_data(self) -> Dict[str, Any]:
        """Get aggregated data formatted for the frontend dashboard.

        Overall status is 'degraded' while any high/critical alert is
        active; latest values fall back to 0 when no samples exist yet.
        """
        return {
            'status': 'degraded' if any(a['severity'] in ['high', 'critical'] for a in self.active_alerts) else 'healthy',
            'timestamp': datetime.utcnow().isoformat(),
            'current_metrics': {
                'api': {
                    'rps': round(self.api_requests_per_sec.get_latest() or 0, 2),
                    'latency_p50_ms': round(self.api_latency_ms.get_percentile(0.50, 60), 2),
                    'latency_p95_ms': round(self.api_latency_ms.get_percentile(0.95, 60), 2),
                    'error_rate_pct': round(self.api_error_rate.get_average(60), 2)
                },
                'trading': {
                    'tps': round(self.trades_per_sec.get_latest() or 0, 2),
                    'matching_time_ms': round(self.order_matching_time_ms.get_average(60), 2),
                    'active_orders': int(self.active_orders.get_latest() or 0)
                },
                'network': {
                    'active_providers': int(self.active_providers.get_latest() or 0),
                    'gpu_utilization_pct': round(self.gpu_utilization_pct.get_latest() or 0, 2),
                    'bandwidth_mbps': round(self.network_bandwidth_mbps.get_latest() or 0, 2)
                }
            },
            'alerts': self.active_alerts
        }
# Module-level singleton: import and share this `monitor` instance so all
# handlers aggregate into one set of metrics (do not construct new ones).
monitor = MarketplaceMonitor()

View File

@@ -0,0 +1,265 @@
"""
Marketplace Adaptive Resource Scaler
Implements predictive and reactive auto-scaling of marketplace resources based on demand.
"""
import time
import asyncio
import logging
from typing import Dict, List, Optional, Any, Tuple
from datetime import datetime, timedelta
import math
logger = logging.getLogger(__name__)
class ScalingPolicy:
    """Tunable parameters governing auto-scaling behavior.

    Defaults: fleet of 2-100 nodes targeting 75% utilization, scaling up
    above 85%, scaling down below 40%, with a 5-minute cooldown between
    actions and predictive scaling enabled.
    """
    def __init__(
        self,
        min_nodes: int = 2,
        max_nodes: int = 100,
        target_utilization: float = 0.75,
        scale_up_threshold: float = 0.85,
        scale_down_threshold: float = 0.40,
        cooldown_period_sec: int = 300,  # 5 minutes between scaling actions
        predictive_scaling: bool = True
    ):
        # Fleet size bounds
        self.min_nodes = min_nodes
        self.max_nodes = max_nodes
        # Utilization targets and triggers
        self.target_utilization = target_utilization
        self.scale_up_threshold = scale_up_threshold
        self.scale_down_threshold = scale_down_threshold
        # Behavioral knobs
        self.cooldown_period_sec = cooldown_period_sec
        self.predictive_scaling = predictive_scaling
class ResourceScaler:
    """Adaptive resource scaling engine for the AITBC marketplace.

    Combines reactive scaling (current utilization / queue depth) with
    predictive scaling based on an hour-of-week demand profile.  The
    metric sources here are simulated; in production they would come
    from the monitor/coordinator, and _execute_scaling would call the
    actual infrastructure APIs.
    """
    def __init__(self, policy: Optional[ScalingPolicy] = None):
        self.policy = policy or ScalingPolicy()
        # Current state
        self.current_nodes = self.policy.min_nodes
        self.active_gpu_nodes = 0
        self.active_cpu_nodes = self.policy.min_nodes
        self.last_scaling_action_time = 0
        self.scaling_history = []
        # Historical demand tracking for predictive scaling
        # Format: hour_of_week (0-167) -> avg_utilization
        self.historical_demand = {}
        self.is_running = False
        self._scaler_task = None
    async def start(self):
        """Start the background scaling loop (idempotent)."""
        if self.is_running:
            return
        self.is_running = True
        self._scaler_task = asyncio.create_task(self._scaling_loop())
        logger.info(f"Resource Scaler started (Min: {self.policy.min_nodes}, Max: {self.policy.max_nodes})")
    async def stop(self):
        """Stop the background scaling loop."""
        self.is_running = False
        if self._scaler_task:
            self._scaler_task.cancel()
        logger.info("Resource Scaler stopped")
    def update_historical_demand(self, utilization: float):
        """Update historical data for predictive scaling"""
        now = datetime.utcnow()
        # Bucket by hour-of-week (0-167) to capture weekly seasonality
        hour_of_week = now.weekday() * 24 + now.hour
        if hour_of_week not in self.historical_demand:
            self.historical_demand[hour_of_week] = utilization
        else:
            # Exponential moving average (favor recent data)
            current_avg = self.historical_demand[hour_of_week]
            self.historical_demand[hour_of_week] = (current_avg * 0.9) + (utilization * 0.1)
    def _predict_demand(self, lookahead_hours: int = 1) -> float:
        """Predict expected utilization based on historical patterns.

        Returns 0.0 when predictive scaling is disabled or no history
        exists yet (callers treat 0.0 as "no prediction").
        """
        if not self.policy.predictive_scaling or not self.historical_demand:
            return 0.0
        now = datetime.utcnow()
        # Wrap around the 168-hour week
        target_hour = (now.weekday() * 24 + now.hour + lookahead_hours) % 168
        # If we have exact data for that hour
        if target_hour in self.historical_demand:
            return self.historical_demand[target_hour]
        # Find nearest available data points
        available_hours = sorted(self.historical_demand.keys())
        if not available_hours:
            return 0.0
        # Simplistic interpolation: fall back to the global average
        return sum(self.historical_demand.values()) / len(self.historical_demand)
    async def _scaling_loop(self):
        """Background task that evaluates scaling rules periodically"""
        while self.is_running:
            try:
                # In a real system, we'd fetch this from the Monitor or Coordinator
                # Here we simulate fetching current metrics
                current_utilization = self._get_current_utilization()
                current_queue_depth = self._get_queue_depth()
                self.update_historical_demand(current_utilization)
                await self.evaluate_scaling(current_utilization, current_queue_depth)
                # Check every 10 seconds
                await asyncio.sleep(10.0)
            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error(f"Error in scaling loop: {e}")
                await asyncio.sleep(10.0)
    async def evaluate_scaling(self, current_utilization: float, queue_depth: int) -> Optional[Dict[str, Any]]:
        """Evaluate if scaling action is needed and execute if necessary.

        Returns the scaling-history record when an action was taken,
        otherwise None (cooldown active, or no change required).
        Branch priority: reactive scale-up > predictive scale-up >
        scale-down.
        """
        now = time.time()
        # Check cooldown
        if now - self.last_scaling_action_time < self.policy.cooldown_period_sec:
            return None
        predicted_utilization = self._predict_demand()
        # Determine target node count
        target_nodes = self.current_nodes
        action = None
        reason = ""
        # Scale UP conditions
        if current_utilization > self.policy.scale_up_threshold or queue_depth > self.current_nodes * 5:
            # Reactive scale up: size the increase proportionally to how far
            # we are above the target utilization
            desired_increase = math.ceil(self.current_nodes * (current_utilization / self.policy.target_utilization - 1.0))
            # Ensure we add at least 1, but bounded by queue depth and max_nodes
            nodes_to_add = max(1, min(desired_increase, max(1, queue_depth // 2)))
            target_nodes = min(self.policy.max_nodes, self.current_nodes + nodes_to_add)
            if target_nodes > self.current_nodes:
                action = "scale_up"
                reason = f"High utilization ({current_utilization*100:.1f}%) or queue depth ({queue_depth})"
        elif self.policy.predictive_scaling and predicted_utilization > self.policy.scale_up_threshold:
            # Predictive scale up (proactive)
            # Add nodes more conservatively for predictive scaling
            target_nodes = min(self.policy.max_nodes, self.current_nodes + 1)
            if target_nodes > self.current_nodes:
                action = "scale_up"
                reason = f"Predictive scaling (expected {predicted_utilization*100:.1f}% util)"
        # Scale DOWN conditions
        elif current_utilization < self.policy.scale_down_threshold and queue_depth == 0:
            # Only scale down if predicted utilization is also low
            if not self.policy.predictive_scaling or predicted_utilization < self.policy.target_utilization:
                # Remove nodes conservatively (20% of fleet, at least 1)
                nodes_to_remove = max(1, int(self.current_nodes * 0.2))
                target_nodes = max(self.policy.min_nodes, self.current_nodes - nodes_to_remove)
                if target_nodes < self.current_nodes:
                    action = "scale_down"
                    reason = f"Low utilization ({current_utilization*100:.1f}%)"
        # Execute scaling if needed
        if action and target_nodes != self.current_nodes:
            diff = abs(target_nodes - self.current_nodes)
            result = await self._execute_scaling(action, diff, target_nodes)
            record = {
                "timestamp": datetime.utcnow().isoformat(),
                "action": action,
                "nodes_changed": diff,
                "new_total": target_nodes,
                "reason": reason,
                "metrics_at_time": {
                    "utilization": current_utilization,
                    "queue_depth": queue_depth,
                    "predicted_utilization": predicted_utilization
                }
            }
            self.scaling_history.append(record)
            # Keep history manageable
            if len(self.scaling_history) > 1000:
                self.scaling_history = self.scaling_history[-1000:]
            # State updates happen only after _execute_scaling succeeds/returns
            self.last_scaling_action_time = now
            self.current_nodes = target_nodes
            logger.info(f"Auto-scaler: {action.upper()} to {target_nodes} nodes. Reason: {reason}")
            return record
        return None
    async def _execute_scaling(self, action: str, count: int, new_total: int) -> bool:
        """Execute the actual scaling action (e.g. interacting with Kubernetes/Docker/Cloud provider)"""
        # In this implementation, we simulate the scaling delay
        # In production, this would call cloud APIs (AWS AutoScaling, K8s Scale, etc.)
        logger.debug(f"Executing {action} by {count} nodes...")
        # Simulate API delay
        await asyncio.sleep(2.0)
        if action == "scale_up":
            # Simulate provisioning new instances
            # We assume a mix of CPU and GPU instances based on demand
            new_gpus = count // 2
            new_cpus = count - new_gpus
            self.active_gpu_nodes += new_gpus
            self.active_cpu_nodes += new_cpus
        elif action == "scale_down":
            # Simulate de-provisioning
            # Prefer removing CPU nodes first if we have GPU ones
            remove_cpus = min(count, max(0, self.active_cpu_nodes - self.policy.min_nodes))
            remove_gpus = count - remove_cpus
            self.active_cpu_nodes -= remove_cpus
            self.active_gpu_nodes = max(0, self.active_gpu_nodes - remove_gpus)
        return True
    # --- Simulation helpers ---
    def _get_current_utilization(self) -> float:
        """Simulate getting current cluster utilization"""
        # In reality, fetch from MarketplaceMonitor or Coordinator
        import random
        # Base utilization with some noise, clamped to [0.1, 0.99]
        base = 0.6
        return max(0.1, min(0.99, base + random.uniform(-0.2, 0.3)))
    def _get_queue_depth(self) -> int:
        """Simulate getting current queue depth"""
        import random
        # ~20% chance of a deep queue to exercise the scale-up path
        if random.random() > 0.8:
            return random.randint(10, 50)
        return random.randint(0, 5)
    def get_status(self) -> Dict[str, Any]:
        """Get current scaler status"""
        return {
            "status": "running" if self.is_running else "stopped",
            "current_nodes": {
                "total": self.current_nodes,
                "cpu_nodes": self.active_cpu_nodes,
                "gpu_nodes": self.active_gpu_nodes
            },
            "policy": {
                "min_nodes": self.policy.min_nodes,
                "max_nodes": self.policy.max_nodes,
                "target_utilization": self.policy.target_utilization
            },
            "last_action": self.scaling_history[-1] if self.scaling_history else None,
            "prediction": {
                "next_hour_utilization_estimate": round(self._predict_demand(1), 3)
            }
        }