chore(security): enhance environment configuration, CI workflows, and wallet daemon with security improvements

- Restructure .env.example with security-focused documentation, service-specific environment file references, and AWS Secrets Manager integration
- Update CLI tests workflow to single Python 3.13 version, add pytest-mock dependency, and consolidate test execution with coverage
- Add comprehensive security validation to package publishing workflow with manual approval gates, secret scanning, and release integrity checks
This commit is contained in:
oib
2026-03-03 10:33:46 +01:00
parent 00d00cb964
commit f353e00172
220 changed files with 42506 additions and 921 deletions

47
dev/cache/aitbc_cache/__init__.py vendored Normal file
View File

@@ -0,0 +1,47 @@
"""
AITBC Event-Driven Cache Package
Provides distributed caching with event-driven invalidation for GPU marketplace
and other real-time data that needs immediate propagation across edge nodes.
"""
from .event_driven_cache import (
EventDrivenCacheManager,
CacheEventType,
CacheEvent,
CacheConfig,
cache_manager,
cached_result
)
from .gpu_marketplace_cache import (
GPUMarketplaceCacheManager,
GPUInfo,
BookingInfo,
MarketStats,
init_marketplace_cache,
get_marketplace_cache,
marketplace_cache
)
__version__ = "1.0.0"
__author__ = "AITBC Team"
__all__ = [
# Core event-driven caching
"EventDrivenCacheManager",
"CacheEventType",
"CacheEvent",
"CacheConfig",
"cache_manager",
"cached_result",
# GPU marketplace caching
"GPUMarketplaceCacheManager",
"GPUInfo",
"BookingInfo",
"MarketStats",
"init_marketplace_cache",
"get_marketplace_cache",
"marketplace_cache"
]

343
dev/cache/aitbc_cache/config.py vendored Normal file
View File

@@ -0,0 +1,343 @@
"""
Cache Configuration for AITBC Event-Driven Caching System
Configuration settings for Redis distributed caching with event-driven invalidation
across global edge nodes for GPU marketplace and real-time data.
"""
import os
from typing import Dict, Any, Optional
from dataclasses import dataclass
@dataclass
class RedisConfig:
    """Redis connection configuration.

    Holds connection and pooling parameters for the L2 Redis cache.
    """
    host: str = "localhost"          # Redis server hostname or IP
    port: int = 6379                 # Redis TCP port
    db: int = 0                      # logical database index (0-15)
    password: Optional[str] = None   # AUTH password; None disables auth
    ssl: bool = False                # use TLS for the connection
    max_connections: int = 20        # connection-pool upper bound
    socket_timeout: int = 5          # per-operation socket timeout (seconds)
    socket_connect_timeout: int = 5  # connect timeout (seconds)
    retry_on_timeout: bool = True    # retry a command once after a timeout
    health_check_interval: int = 30  # seconds between pool health pings
@dataclass
class CacheConfig:
    """Cache behavior configuration.

    Tuning knobs for the in-process L1 cache and the invalidation
    event pipeline.
    """
    l1_cache_size: int = 1000             # max entries in in-process L1 cache
    l1_ttl_multiplier: float = 0.5  # L1 cache TTL as fraction of L2 TTL
    event_queue_size: int = 10000         # max pending invalidation events
    event_processing_timeout: int = 30    # seconds before event processing gives up
    invalidation_batch_size: int = 100    # keys deleted per Redis SCAN batch
    stats_retention_hours: int = 24       # how long statistics are kept
    health_check_interval: int = 60       # seconds between cache health checks
@dataclass
class EdgeNodeConfig:
    """Edge node configuration.

    Identity and placement metadata for the node this cache runs on.
    """
    node_id: Optional[str] = None            # stable node id; None -> auto-generated
    region: str = "default"                  # deployment region label
    datacenter: str = "default"              # datacenter label
    rack: Optional[str] = None               # physical rack, when known
    availability_zone: Optional[str] = None  # cloud AZ, when known
    network_tier: str = "standard"  # standard, premium, edge
    cache_tier: str = "edge"  # edge, regional, global
@dataclass
class EventDrivenCacheSettings:
    """Complete event-driven cache settings.

    Aggregates the Redis, cache-behavior, and edge-node sections plus
    top-level feature flags, performance knobs, and security options.
    """
    redis: RedisConfig          # L2 Redis connection settings
    cache: CacheConfig          # L1/event-pipeline tuning
    edge_node: EdgeNodeConfig   # node identity/placement
    # Feature flags
    enable_l1_cache: bool = True                   # keep an in-process L1 cache
    enable_event_driven_invalidation: bool = True  # pub/sub-driven invalidation
    enable_compression: bool = True                # compress cached payloads
    enable_metrics: bool = True                    # collect cache statistics
    enable_health_checks: bool = True              # periodic health probing
    # Performance settings
    connection_pool_size: int = 20       # Redis connection pool size
    max_event_queue_size: int = 10000    # hard cap on queued events
    event_processing_workers: int = 4    # concurrent event consumers
    cache_warmup_enabled: bool = True    # pre-populate caches on start
    # Security settings
    enable_tls: bool = False             # TLS for cache traffic
    require_auth: bool = False           # require an auth token
    auth_token: Optional[str] = None     # shared token when require_auth is set
def load_config_from_env() -> EventDrivenCacheSettings:
    """Build the complete cache settings from environment variables.

    Every variable is optional; unset variables fall back to the same
    defaults documented on the dataclasses. Boolean variables accept the
    literal string "true" (case-insensitive); anything else is False.
    """
    # Small parsers so each setting below reads as one line.
    def _int(name: str, default: str) -> int:
        return int(os.getenv(name, default))

    def _float(name: str, default: str) -> float:
        return float(os.getenv(name, default))

    def _flag(name: str, default: str) -> bool:
        return os.getenv(name, default).lower() == "true"

    redis_cfg = RedisConfig(
        host=os.getenv("REDIS_HOST", "localhost"),
        port=_int("REDIS_PORT", "6379"),
        db=_int("REDIS_DB", "0"),
        password=os.getenv("REDIS_PASSWORD"),
        ssl=_flag("REDIS_SSL", "false"),
        max_connections=_int("REDIS_MAX_CONNECTIONS", "20"),
        socket_timeout=_int("REDIS_SOCKET_TIMEOUT", "5"),
        socket_connect_timeout=_int("REDIS_SOCKET_CONNECT_TIMEOUT", "5"),
        retry_on_timeout=_flag("REDIS_RETRY_ON_TIMEOUT", "true"),
        health_check_interval=_int("REDIS_HEALTH_CHECK_INTERVAL", "30"),
    )
    cache_cfg = CacheConfig(
        l1_cache_size=_int("CACHE_L1_SIZE", "1000"),
        l1_ttl_multiplier=_float("CACHE_L1_TTL_MULTIPLIER", "0.5"),
        event_queue_size=_int("CACHE_EVENT_QUEUE_SIZE", "10000"),
        event_processing_timeout=_int("CACHE_EVENT_PROCESSING_TIMEOUT", "30"),
        invalidation_batch_size=_int("CACHE_INVALIDATION_BATCH_SIZE", "100"),
        stats_retention_hours=_int("CACHE_STATS_RETENTION_HOURS", "24"),
        health_check_interval=_int("CACHE_HEALTH_CHECK_INTERVAL", "60"),
    )
    edge_cfg = EdgeNodeConfig(
        node_id=os.getenv("EDGE_NODE_ID"),
        region=os.getenv("EDGE_NODE_REGION", "default"),
        datacenter=os.getenv("EDGE_NODE_DATACENTER", "default"),
        rack=os.getenv("EDGE_NODE_RACK"),
        availability_zone=os.getenv("EDGE_NODE_AVAILABILITY_ZONE"),
        network_tier=os.getenv("EDGE_NODE_NETWORK_TIER", "standard"),
        cache_tier=os.getenv("EDGE_NODE_CACHE_TIER", "edge"),
    )
    return EventDrivenCacheSettings(
        redis=redis_cfg,
        cache=cache_cfg,
        edge_node=edge_cfg,
        # Feature flags
        enable_l1_cache=_flag("CACHE_ENABLE_L1", "true"),
        enable_event_driven_invalidation=_flag("CACHE_ENABLE_EVENT_DRIVEN", "true"),
        enable_compression=_flag("CACHE_ENABLE_COMPRESSION", "true"),
        enable_metrics=_flag("CACHE_ENABLE_METRICS", "true"),
        enable_health_checks=_flag("CACHE_ENABLE_HEALTH_CHECKS", "true"),
        # Performance settings
        connection_pool_size=_int("CACHE_CONNECTION_POOL_SIZE", "20"),
        max_event_queue_size=_int("CACHE_MAX_EVENT_QUEUE_SIZE", "10000"),
        event_processing_workers=_int("CACHE_EVENT_PROCESSING_WORKERS", "4"),
        cache_warmup_enabled=_flag("CACHE_WARMUP_ENABLED", "true"),
        # Security settings
        enable_tls=_flag("CACHE_ENABLE_TLS", "false"),
        require_auth=_flag("CACHE_REQUIRE_AUTH", "false"),
        auth_token=os.getenv("CACHE_AUTH_TOKEN"),
    )
def get_redis_url(config: "RedisConfig") -> str:
    """Construct a Redis connection URL from *config*.

    Fix: the password is now percent-encoded. Previously a password
    containing URL-reserved characters (``@``, ``/``, ``:`` ...) produced
    an unparseable URL; passwords without such characters are unchanged.

    Args:
        config: RedisConfig (or any object with host/port/db/ssl/password).

    Returns:
        ``redis://`` or ``rediss://`` URL including credentials when set.
    """
    from urllib.parse import quote  # local import keeps module surface unchanged

    auth_part = ""
    if config.password:
        # safe="" encodes every reserved character, including "/" and "@".
        auth_part = f":{quote(config.password, safe='')}@"
    ssl_part = "s" if config.ssl else ""
    return f"redis{ssl_part}://{auth_part}{config.host}:{config.port}/{config.db}"
# Default configurations for different environments
def get_development_config() -> EventDrivenCacheSettings:
    """Development environment configuration.

    Localhost Redis on DB 1 (keeps dev data apart from default DB 0),
    a small L1 cache, and metrics/health checks disabled to cut overhead.
    """
    redis_cfg = RedisConfig(
        host="localhost",
        port=6379,
        db=1,  # separate DB so dev runs never touch production keys
        ssl=False,
    )
    cache_cfg = CacheConfig(
        l1_cache_size=100,  # small footprint for local runs
        l1_ttl_multiplier=0.3,
        event_queue_size=1000,
    )
    node_cfg = EdgeNodeConfig(node_id="dev_node", region="development")
    return EventDrivenCacheSettings(
        redis=redis_cfg,
        cache=cache_cfg,
        edge_node=node_cfg,
        enable_metrics=False,  # skip collection overhead in development
        enable_health_checks=False,
    )
def get_staging_config() -> EventDrivenCacheSettings:
    """Staging environment configuration.

    TLS Redis at the internal staging host, mid-sized caches, and full
    metrics/health checks enabled to mirror production behavior.
    """
    redis_cfg = RedisConfig(
        host="redis-staging.internal",
        port=6379,
        db=0,
        ssl=True,
    )
    cache_cfg = CacheConfig(
        l1_cache_size=500,
        l1_ttl_multiplier=0.4,
        event_queue_size=5000,
    )
    node_cfg = EdgeNodeConfig(
        node_id=None,  # auto-generated at runtime
        region="staging",
    )
    return EventDrivenCacheSettings(
        redis=redis_cfg,
        cache=cache_cfg,
        edge_node=node_cfg,
        enable_metrics=True,
        enable_health_checks=True,
    )
def get_production_config() -> EventDrivenCacheSettings:
    """Production environment configuration.

    Cluster Redis over TLS, larger caches and pools, auth required.

    Fix: ``EDGE_NODE_DATACENTER`` now falls back to ``"default"`` when
    unset. Previously ``os.getenv`` returned ``None``, violating the
    declared ``EdgeNodeConfig.datacenter: str`` field and diverging from
    ``load_config_from_env``, which already uses ``"default"``.
    """
    return EventDrivenCacheSettings(
        redis=RedisConfig(
            host=os.getenv("REDIS_CLUSTER_HOST", "redis-cluster.internal"),
            port=int(os.getenv("REDIS_CLUSTER_PORT", "6379")),
            db=0,
            password=os.getenv("REDIS_CLUSTER_PASSWORD"),
            ssl=True,
            max_connections=50,
            socket_timeout=10,
            socket_connect_timeout=10,
            health_check_interval=15
        ),
        cache=CacheConfig(
            l1_cache_size=2000,
            l1_ttl_multiplier=0.6,
            event_queue_size=20000,
            event_processing_timeout=60,
            invalidation_batch_size=200,
            health_check_interval=30
        ),
        edge_node=EdgeNodeConfig(
            node_id=None,  # Auto-generate from hostname/IP
            region=os.getenv("EDGE_NODE_REGION", "global"),
            datacenter=os.getenv("EDGE_NODE_DATACENTER", "default"),  # fix: honor str default
            availability_zone=os.getenv("EDGE_NODE_AZ"),
            network_tier="premium",
            cache_tier="edge"
        ),
        enable_l1_cache=True,
        enable_event_driven_invalidation=True,
        enable_compression=True,
        enable_metrics=True,
        enable_health_checks=True,
        connection_pool_size=50,
        max_event_queue_size=20000,
        event_processing_workers=8,
        cache_warmup_enabled=True,
        enable_tls=True,
        require_auth=True,
        auth_token=os.getenv("CACHE_AUTH_TOKEN")
    )
def get_edge_node_config(region: str) -> EventDrivenCacheSettings:
    """Configuration for edge nodes in specific regions.

    Derives from the production config, then narrows it for an edge node:
    smaller L1 cache, fewer event workers, edge network/cache tier.
    """
    cfg = get_production_config()
    # Placement overrides for this node.
    cfg.edge_node.region = region
    cfg.edge_node.cache_tier = "edge"
    cfg.edge_node.network_tier = "edge"
    # Edge nodes trade cache size for faster, leaner event processing.
    cfg.cache.l1_cache_size = 500
    cfg.cache.l1_ttl_multiplier = 0.3
    cfg.event_processing_workers = 2
    return cfg
def get_regional_cache_config(region: str) -> EventDrivenCacheSettings:
    """Configuration for regional cache nodes.

    Derives from the production config, then widens it for a regional
    tier: larger L1 cache, longer L1 TTLs, more event workers.
    """
    cfg = get_production_config()
    # Placement overrides for this node.
    cfg.edge_node.region = region
    cfg.edge_node.cache_tier = "regional"
    cfg.edge_node.network_tier = "premium"
    # Regional caches hold more data locally than edge nodes.
    cfg.cache.l1_cache_size = 5000
    cfg.cache.l1_ttl_multiplier = 0.8
    cfg.event_processing_workers = 6
    return cfg
# Configuration validation
def validate_config(config: EventDrivenCacheSettings) -> bool:
    """Validate cache configuration.

    Checks every section and collects all problems before failing, so a
    single run reports every invalid value at once.

    Returns:
        True when the configuration is valid.

    Raises:
        ValueError: listing every validation problem found.
    """
    problems = []
    redis_cfg, cache_cfg, node_cfg = config.redis, config.cache, config.edge_node
    # Redis section
    if not redis_cfg.host:
        problems.append("Redis host is required")
    if not (1 <= redis_cfg.port <= 65535):
        problems.append("Redis port must be between 1 and 65535")
    if not (0 <= redis_cfg.db <= 15):
        problems.append("Redis DB must be between 0 and 15")
    # Cache section
    if cache_cfg.l1_cache_size <= 0:
        problems.append("L1 cache size must be positive")
    if not (0.1 <= cache_cfg.l1_ttl_multiplier <= 1.0):
        problems.append("L1 TTL multiplier must be between 0.1 and 1.0")
    if cache_cfg.event_queue_size <= 0:
        problems.append("Event queue size must be positive")
    # Edge node section
    if not node_cfg.region:
        problems.append("Edge node region is required")
    if node_cfg.cache_tier not in ("edge", "regional", "global"):
        problems.append("Cache tier must be one of: edge, regional, global")
    if problems:
        raise ValueError(f"Configuration validation failed: {', '.join(problems)}")
    return True
# Environment-specific configuration loader
def get_config_for_environment(env: Optional[str] = None) -> EventDrivenCacheSettings:
    """Get configuration for specific environment.

    Args:
        env: "production", "staging", or "development". When None, the
            ENVIRONMENT variable is used (lower-cased, defaulting to
            "development"). Any other value falls back to pure
            environment-variable loading.

    Returns:
        The settings object for the selected environment.
    """
    # NOTE(review): an explicitly passed env is not lower-cased (only the
    # ENVIRONMENT variable is) — confirm callers always pass lowercase.
    env = env or os.getenv("ENVIRONMENT", "development").lower()
    if env == "production":
        return get_production_config()
    elif env == "staging":
        return get_staging_config()
    elif env == "development":
        return get_development_config()
    else:
        # Unknown name: build entirely from environment variables.
        return load_config_from_env()

View File

@@ -0,0 +1,587 @@
"""
Event-Driven Redis Caching Strategy for Distributed Edge Nodes
Implements a distributed caching system with event-driven cache invalidation
for GPU availability and pricing data that changes on booking/cancellation.
"""
import json
import asyncio
import logging
import time
from typing import Dict, List, Optional, Any, Set, Callable
from dataclasses import dataclass, asdict
from enum import Enum
from datetime import datetime, timedelta
import hashlib
import uuid
import redis.asyncio as redis
from redis.asyncio import ConnectionPool
logger = logging.getLogger(__name__)
class CacheEventType(Enum):
    """Types of cache events.

    Published on the 'cache_invalidation_events' channel so every node
    can invalidate the affected namespaces.
    """
    GPU_AVAILABILITY_CHANGED = "gpu_availability_changed"  # a GPU's status flipped
    PRICING_UPDATED = "pricing_updated"                    # a GPU type's price changed
    BOOKING_CREATED = "booking_created"                    # new booking placed
    BOOKING_CANCELLED = "booking_cancelled"                # existing booking cancelled
    PROVIDER_STATUS_CHANGED = "provider_status_changed"    # provider went on/offline
    MARKET_STATS_UPDATED = "market_stats_updated"          # aggregate stats recomputed
    ORDER_BOOK_UPDATED = "order_book_updated"              # order book changed
    MANUAL_INVALIDATION = "manual_invalidation"            # explicit/ad-hoc invalidation
@dataclass
class CacheEvent:
    """Cache invalidation event.

    Serialized to JSON (via asdict) and published over Redis pub/sub.
    """
    event_type: CacheEventType        # what happened
    resource_id: str                  # identifier of the affected resource
    data: Dict[str, Any]              # event-specific payload
    timestamp: float                  # publish time (epoch seconds)
    source_node: str                  # node_id of the publisher (used to skip self)
    event_id: str                     # unique id (uuid4)
    affected_namespaces: List[str]    # cache namespaces to invalidate
@dataclass
class CacheConfig:
    """Cache configuration for different data types.

    One instance per logical cache type (see
    EventDrivenCacheManager._init_cache_configs).
    """
    namespace: str        # key prefix for this cache type
    ttl_seconds: int      # L2 (Redis) TTL
    event_driven: bool    # invalidated by pub/sub events
    critical_data: bool  # Data that needs immediate propagation
    max_memory_mb: int    # memory budget hint for this cache type
class EventDrivenCacheManager:
    """
    Event-driven cache manager for distributed edge nodes
    Features:
    - Redis pub/sub for real-time cache invalidation
    - Multi-tier caching (L1 memory + L2 Redis)
    - Event-driven updates for critical data
    - Automatic failover and recovery
    - Distributed cache coordination
    """
    def __init__(self,
                 redis_url: str = "redis://localhost:6379/0",
                 node_id: Optional[str] = None,
                 edge_node_region: str = "default"):
        """Create a manager; no I/O happens until connect() is awaited.

        Args:
            redis_url: Redis URL for the L2 cache and pub/sub bus.
            node_id: Stable node identifier; auto-generated when None.
            edge_node_region: Region label for this node.
        """
        self.redis_url = redis_url
        # Auto-generated id lets this node recognize (and skip) its own events.
        self.node_id = node_id or f"edge_node_{uuid.uuid4().hex[:8]}"
        self.edge_node_region = edge_node_region
        # Redis connections (populated by connect())
        self.redis_client = None
        self.pubsub = None
        self.connection_pool = None
        # Event handling
        self.event_handlers: Dict[CacheEventType, List[Callable]] = {}
        self.event_queue = asyncio.Queue()
        self.is_running = False
        # Local L1 cache for critical data (in-process, per-node)
        self.l1_cache: Dict[str, Dict] = {}
        self.l1_max_size = 1000
        # Cache configurations
        self.cache_configs = self._init_cache_configs()
        # Statistics
        self.stats = {
            'events_processed': 0,
            'cache_hits': 0,
            'cache_misses': 0,
            'invalidations': 0,
            'last_event_time': None
        }
    def _init_cache_configs(self) -> Dict[str, CacheConfig]:
        """Initialize cache configurations for different data types.

        Returns a mapping of cache-type name -> CacheConfig; these names
        are the valid `cache_type` arguments for get/set/invalidate.
        """
        return {
            # GPU availability - changes frequently, needs immediate propagation
            'gpu_availability': CacheConfig(
                namespace='gpu_avail',
                ttl_seconds=30,  # Short TTL, but event-driven invalidation
                event_driven=True,
                critical_data=True,
                max_memory_mb=100
            ),
            # GPU pricing - changes on booking/cancellation
            'gpu_pricing': CacheConfig(
                namespace='gpu_pricing',
                ttl_seconds=60,  # Medium TTL with event-driven updates
                event_driven=True,
                critical_data=True,
                max_memory_mb=50
            ),
            # Order book - very dynamic
            'order_book': CacheConfig(
                namespace='order_book',
                ttl_seconds=5,  # Very short TTL
                event_driven=True,
                critical_data=True,
                max_memory_mb=200
            ),
            # Provider status - changes on provider state changes
            'provider_status': CacheConfig(
                namespace='provider_status',
                ttl_seconds=120,  # Longer TTL with event-driven updates
                event_driven=True,
                critical_data=False,
                max_memory_mb=50
            ),
            # Market statistics - computed periodically
            'market_stats': CacheConfig(
                namespace='market_stats',
                ttl_seconds=300,  # 5 minutes
                event_driven=True,
                critical_data=False,
                max_memory_mb=100
            ),
            # Historical data - static, longer TTL
            'historical_data': CacheConfig(
                namespace='historical',
                ttl_seconds=3600,  # 1 hour
                event_driven=False,
                critical_data=False,
                max_memory_mb=500
            )
        }
    async def connect(self):
        """Connect to Redis and setup pub/sub.

        Starts two background tasks: one draining the local event queue
        and one listening on the 'cache_invalidation_events' channel.

        Raises:
            Exception: re-raises any connection/ping failure after logging.
        """
        try:
            # Create connection pool
            self.connection_pool = ConnectionPool.from_url(
                self.redis_url,
                decode_responses=True,
                max_connections=20
            )
            # Create Redis client
            self.redis_client = redis.Redis(connection_pool=self.connection_pool)
            # Test connection
            await self.redis_client.ping()
            # Setup pub/sub for cache invalidation events
            self.pubsub = self.redis_client.pubsub()
            await self.pubsub.subscribe('cache_invalidation_events')
            # Start event processing
            self.is_running = True
            # NOTE(review): task handles are not stored, so they cannot be
            # awaited or cancelled on shutdown — confirm this is acceptable.
            asyncio.create_task(self._process_events())
            asyncio.create_task(self._listen_for_events())
            logger.info(f"Connected to Redis cache manager. Node ID: {self.node_id}")
        except Exception as e:
            logger.error(f"Failed to connect to Redis: {e}")
            raise
    async def disconnect(self):
        """Disconnect from Redis and cleanup.

        Stops the background loops (via is_running) and closes pub/sub,
        client, and pool in that order.
        """
        self.is_running = False
        if self.pubsub:
            await self.pubsub.unsubscribe('cache_invalidation_events')
            await self.pubsub.close()
        if self.redis_client:
            await self.redis_client.close()
        if self.connection_pool:
            await self.connection_pool.disconnect()
        logger.info("Disconnected from Redis cache manager")
    def _generate_cache_key(self, namespace: str, params: Dict[str, Any]) -> str:
        """Generate deterministic cache key.

        sort_keys makes the hash stable regardless of param-dict ordering;
        the same (namespace, params) pair always yields the same key.
        """
        param_str = json.dumps(params, sort_keys=True)
        param_hash = hashlib.sha256(param_str.encode()).hexdigest()
        return f"{namespace}:{param_hash}"
    async def get(self, cache_type: str, params: Dict[str, Any]) -> Optional[Any]:
        """Get data from cache with L1/L2 fallback.

        Args:
            cache_type: A key of self.cache_configs.
            params: Parameters identifying the cached entry.

        Returns:
            The cached value, or None on a miss (note: a stored None is
            indistinguishable from a miss).

        Raises:
            ValueError: if cache_type is unknown.
        """
        config = self.cache_configs.get(cache_type)
        if not config:
            raise ValueError(f"Unknown cache type: {cache_type}")
        cache_key = self._generate_cache_key(config.namespace, params)
        # 1. Try L1 memory cache first (fastest)
        if cache_key in self.l1_cache:
            cache_entry = self.l1_cache[cache_key]
            if cache_entry['expires_at'] > time.time():
                self.stats['cache_hits'] += 1
                logger.debug(f"L1 cache hit for {cache_key}")
                return cache_entry['data']
            else:
                # Expired, remove from L1
                del self.l1_cache[cache_key]
        # 2. Try L2 Redis cache
        if self.redis_client:
            try:
                cached_data = await self.redis_client.get(cache_key)
                if cached_data:
                    self.stats['cache_hits'] += 1
                    logger.debug(f"L2 cache hit for {cache_key}")
                    data = json.loads(cached_data)
                    # Backfill L1 cache for critical data
                    if config.critical_data and len(self.l1_cache) < self.l1_max_size:
                        self.l1_cache[cache_key] = {
                            'data': data,
                            # Cap the L1 TTL at 60s so backfilled entries
                            # never outlive short invalidation windows.
                            'expires_at': time.time() + min(config.ttl_seconds, 60)
                        }
                    return data
            except Exception as e:
                # Redis failures degrade to a cache miss rather than erroring.
                logger.warning(f"Redis get failed: {e}")
        self.stats['cache_misses'] += 1
        return None
    async def set(self, cache_type: str, params: Dict[str, Any], data: Any,
                  custom_ttl: Optional[int] = None, publish_event: bool = True):
        """Set data in cache with optional event publishing.

        Args:
            cache_type: A key of self.cache_configs.
            params: Parameters identifying the cached entry.
            data: JSON-serializable value (non-serializable parts are
                stringified via default=str).
            custom_ttl: TTL override in seconds; None uses the type default.
            publish_event: Also broadcast an invalidation event when the
                cache type is event-driven.

        Raises:
            ValueError: if cache_type is unknown.
        """
        config = self.cache_configs.get(cache_type)
        if not config:
            raise ValueError(f"Unknown cache type: {cache_type}")
        cache_key = self._generate_cache_key(config.namespace, params)
        ttl = custom_ttl or config.ttl_seconds
        # 1. Set L1 cache for critical data
        if config.critical_data:
            self._update_l1_cache(cache_key, data, ttl)
        # 2. Set L2 Redis cache
        if self.redis_client:
            try:
                serialized_data = json.dumps(data, default=str)
                await self.redis_client.setex(cache_key, ttl, serialized_data)
                # Publish invalidation event if event-driven
                if publish_event and config.event_driven:
                    await self._publish_invalidation_event(
                        CacheEventType.MANUAL_INVALIDATION,
                        cache_type,
                        {'cache_key': cache_key, 'action': 'updated'},
                        [config.namespace]
                    )
            except Exception as e:
                # Writes are best-effort; a Redis outage only logs.
                logger.error(f"Redis set failed: {e}")
    def _update_l1_cache(self, cache_key: str, data: Any, ttl: int):
        """Update L1 cache with size management.

        Evicts the entries closest to expiry until there is room, then
        stores the new entry with its own expiry timestamp.
        """
        # Remove oldest entries if cache is full
        while len(self.l1_cache) >= self.l1_max_size:
            oldest_key = min(self.l1_cache.keys(),
                             key=lambda k: self.l1_cache[k]['expires_at'])
            del self.l1_cache[oldest_key]
        self.l1_cache[cache_key] = {
            'data': data,
            'expires_at': time.time() + ttl
        }
    async def invalidate_cache(self, cache_type: str, resource_id: Optional[str] = None,
                               reason: str = "manual"):
        """Invalidate cache entries and publish event.

        Clears matching L1 entries, SCAN-deletes matching L2 keys, then
        broadcasts a MANUAL_INVALIDATION event to other nodes.

        Args:
            cache_type: A key of self.cache_configs.
            resource_id: When given, only keys containing this substring
                are invalidated; None clears the whole namespace.
            reason: Free-text reason recorded in the event payload.

        Raises:
            ValueError: if cache_type is unknown.
        """
        config = self.cache_configs.get(cache_type)
        if not config:
            raise ValueError(f"Unknown cache type: {cache_type}")
        # Invalidate L1 cache
        keys_to_remove = []
        for key in self.l1_cache:
            if key.startswith(config.namespace):
                if resource_id is None or resource_id in key:
                    keys_to_remove.append(key)
        for key in keys_to_remove:
            del self.l1_cache[key]
        # Invalidate L2 Redis cache
        if self.redis_client:
            try:
                pattern = f"{config.namespace}:*"
                if resource_id:
                    pattern = f"{config.namespace}:*{resource_id}*"
                # SCAN (not KEYS) to avoid blocking Redis on large keyspaces.
                cursor = 0
                while True:
                    cursor, keys = await self.redis_client.scan(
                        cursor=cursor, match=pattern, count=100
                    )
                    if keys:
                        await self.redis_client.delete(*keys)
                    if cursor == 0:
                        break
                self.stats['invalidations'] += 1
                # Publish invalidation event
                await self._publish_invalidation_event(
                    CacheEventType.MANUAL_INVALIDATION,
                    cache_type,
                    {'resource_id': resource_id, 'reason': reason},
                    [config.namespace]
                )
                logger.info(f"Invalidated {cache_type} cache: {reason}")
            except Exception as e:
                logger.error(f"Cache invalidation failed: {e}")
    async def _publish_invalidation_event(self, event_type: CacheEventType,
                                          resource_id: str, data: Dict[str, Any],
                                          affected_namespaces: List[str]):
        """Publish cache invalidation event to Redis pub/sub.

        Builds a CacheEvent stamped with this node's id (so subscribers,
        including this node, can filter out their own events) and
        publishes it as JSON; failures are logged, not raised.
        """
        event = CacheEvent(
            event_type=event_type,
            resource_id=resource_id,
            data=data,
            timestamp=time.time(),
            source_node=self.node_id,
            event_id=str(uuid.uuid4()),
            affected_namespaces=affected_namespaces
        )
        try:
            event_json = json.dumps(asdict(event), default=str)
            await self.redis_client.publish('cache_invalidation_events', event_json)
            logger.debug(f"Published invalidation event: {event_type.value}")
        except Exception as e:
            logger.error(f"Failed to publish event: {e}")
    async def _listen_for_events(self):
        """Listen for cache invalidation events from other nodes.

        Background loop started by connect(); runs until is_running is
        cleared. Errors are logged and followed by a 1s backoff.
        """
        while self.is_running:
            try:
                message = await self.pubsub.get_message(timeout=1.0)
                # Only 'message' entries carry payloads (subscribe
                # confirmations etc. are skipped here).
                if message and message['type'] == 'message':
                    await self._handle_invalidation_event(message['data'])
            except Exception as e:
                logger.error(f"Event listener error: {e}")
                await asyncio.sleep(1)
    async def _handle_invalidation_event(self, event_json: str):
        """Handle incoming cache invalidation event.

        Parses the JSON payload, drops self-originated events, and queues
        the rest for the processing loop.
        """
        try:
            event_data = json.loads(event_json)
            # Ignore events from this node
            if event_data.get('source_node') == self.node_id:
                return
            # Queue event for processing
            await self.event_queue.put(event_data)
        except Exception as e:
            logger.error(f"Failed to handle invalidation event: {e}")
    async def _process_events(self):
        """Process queued invalidation events.

        Background loop started by connect(); the 1s wait_for timeout
        lets the loop re-check is_running periodically.
        """
        while self.is_running:
            try:
                event_data = await asyncio.wait_for(
                    self.event_queue.get(), timeout=1.0
                )
                await self._process_invalidation_event(event_data)
                self.stats['events_processed'] += 1
                self.stats['last_event_time'] = time.time()
            except asyncio.TimeoutError:
                # No event this second; loop again so shutdown is noticed.
                continue
            except Exception as e:
                logger.error(f"Event processing error: {e}")
    async def _process_invalidation_event(self, event_data: Dict[str, Any]):
        """Process a single invalidation event.

        Drops every L1 and L2 entry in each affected namespace.
        Raises ValueError (uncaught) if event_type is not a known
        CacheEventType value.
        """
        event_type = CacheEventType(event_data['event_type'])
        affected_namespaces = event_data['affected_namespaces']
        # Invalidate L1 cache entries
        for namespace in affected_namespaces:
            keys_to_remove = []
            for key in self.l1_cache:
                if key.startswith(namespace):
                    keys_to_remove.append(key)
            for key in keys_to_remove:
                del self.l1_cache[key]
        # Invalidate L2 cache entries
        if self.redis_client:
            try:
                for namespace in affected_namespaces:
                    pattern = f"{namespace}:*"
                    cursor = 0
                    while True:
                        cursor, keys = await self.redis_client.scan(
                            cursor=cursor, match=pattern, count=100
                        )
                        if keys:
                            await self.redis_client.delete(*keys)
                        if cursor == 0:
                            break
                logger.debug(f"Processed invalidation event: {event_type.value}")
            except Exception as e:
                logger.error(f"Failed to process invalidation event: {e}")
    # Event-specific methods for common operations
    async def notify_gpu_availability_change(self, gpu_id: str, new_status: str):
        """Notify about GPU availability change (invalidates 'gpu_avail')."""
        await self._publish_invalidation_event(
            CacheEventType.GPU_AVAILABILITY_CHANGED,
            f"gpu_{gpu_id}",
            {'gpu_id': gpu_id, 'status': new_status},
            ['gpu_avail']
        )
    async def notify_pricing_update(self, gpu_type: str, new_price: float):
        """Notify about GPU pricing update (invalidates 'gpu_pricing')."""
        await self._publish_invalidation_event(
            CacheEventType.PRICING_UPDATED,
            f"price_{gpu_type}",
            {'gpu_type': gpu_type, 'price': new_price},
            ['gpu_pricing']
        )
    async def notify_booking_created(self, booking_id: str, gpu_id: str):
        """Notify about new booking creation.

        Bookings affect availability, pricing, and the order book, so all
        three namespaces are invalidated.
        """
        await self._publish_invalidation_event(
            CacheEventType.BOOKING_CREATED,
            f"booking_{booking_id}",
            {'booking_id': booking_id, 'gpu_id': gpu_id},
            ['gpu_avail', 'gpu_pricing', 'order_book']
        )
    async def notify_booking_cancelled(self, booking_id: str, gpu_id: str):
        """Notify about booking cancellation (same namespaces as creation)."""
        await self._publish_invalidation_event(
            CacheEventType.BOOKING_CANCELLED,
            f"booking_{booking_id}",
            {'booking_id': booking_id, 'gpu_id': gpu_id},
            ['gpu_avail', 'gpu_pricing', 'order_book']
        )
    async def get_cache_stats(self) -> Dict[str, Any]:
        """Get cache performance statistics.

        Returns a copy of self.stats augmented with L1 size and, when
        Redis is reachable, Redis memory/client info.
        """
        stats = self.stats.copy()
        # Add L1 cache size
        stats['l1_cache_size'] = len(self.l1_cache)
        stats['l1_cache_max_size'] = self.l1_max_size
        # Add Redis info if available
        if self.redis_client:
            try:
                info = await self.redis_client.info('memory')
                # NOTE(review): assumes 'used_memory' is always present in
                # the INFO memory section; a missing key is caught below.
                stats['redis_memory_used_mb'] = info['used_memory'] / (1024 * 1024)
                stats['redis_connected_clients'] = info.get('connected_clients', 0)
            except Exception as e:
                logger.warning(f"Failed to get Redis info: {e}")
        return stats
    async def health_check(self) -> Dict[str, Any]:
        """Perform health check of the cache system.

        Returns:
            Dict with overall 'status' ('healthy' / 'degraded' /
            'unhealthy') plus connectivity and queue diagnostics; never
            raises — failures are reported in the 'error' field.
        """
        health = {
            'status': 'healthy',
            'redis_connected': False,
            'pubsub_active': False,
            'event_queue_size': 0,
            'last_event_age': None
        }
        try:
            # Check Redis connection
            if self.redis_client:
                await self.redis_client.ping()
                health['redis_connected'] = True
            # Check pub/sub
            if self.pubsub and self.is_running:
                health['pubsub_active'] = True
            # Check event queue
            health['event_queue_size'] = self.event_queue.qsize()
            # Check last event time
            if self.stats['last_event_time']:
                health['last_event_age'] = time.time() - self.stats['last_event_time']
            # Overall status
            if not health['redis_connected']:
                health['status'] = 'degraded'
            if not health['pubsub_active']:
                health['status'] = 'unhealthy'
        except Exception as e:
            health['status'] = 'unhealthy'
            health['error'] = str(e)
        return health
# Global cache manager instance
cache_manager = EventDrivenCacheManager()
# Decorator for automatic cache management
def cached_result(cache_type: str, ttl: Optional[int] = None, key_params: Optional[List[str]] = None):
    """
    Decorator to automatically cache async function results via cache_manager.

    Fixes: the wrapper now uses functools.wraps, so the decorated
    coroutine keeps its __name__/__doc__ (it previously appeared as
    'wrapper' in logs and tooling); the local variable that shadowed the
    decorator's own name has been renamed.

    Args:
        cache_type: Type of cache to use (a key of cache_manager.cache_configs)
        ttl: Custom TTL override in seconds; None uses the cache-type default
        key_params: List of parameter names to include in cache key; when
            None, all positional and keyword arguments form the key
    """
    from functools import wraps  # local import keeps the module import list unchanged

    def decorator(func):
        @wraps(func)  # preserve the wrapped coroutine's metadata
        async def wrapper(*args, **kwargs):
            # Build the cache-key parameter dict.
            if key_params:
                cache_key_params = {}
                for i, param_name in enumerate(key_params):
                    if i < len(args):
                        cache_key_params[param_name] = args[i]
                    elif param_name in kwargs:
                        cache_key_params[param_name] = kwargs[param_name]
            else:
                cache_key_params = {'args': args, 'kwargs': kwargs}
            # Serve from cache when possible. None means miss; falsy
            # results other than None are cached and returned correctly.
            hit = await cache_manager.get(cache_type, cache_key_params)
            if hit is not None:
                return hit
            # Execute function and cache result
            result = await func(*args, **kwargs)
            await cache_manager.set(cache_type, cache_key_params, result, ttl)
            return result
        return wrapper
    return decorator

View File

@@ -0,0 +1,498 @@
"""
GPU Marketplace Cache Manager
Specialized cache manager for GPU marketplace data with event-driven invalidation
for availability and pricing changes on booking/cancellation.
"""
import asyncio
import logging
from typing import Dict, List, Optional, Any, Tuple
from dataclasses import dataclass
from datetime import datetime, timedelta
import json
from .event_driven_cache import (
EventDrivenCacheManager,
CacheEventType,
cached_result
)
logger = logging.getLogger(__name__)
@dataclass
class GPUInfo:
    """GPU information structure.

    One record per marketplace GPU; serialized with asdict() when cached.
    """
    gpu_id: str                    # unique GPU identifier
    provider_id: str               # owning provider's identifier
    gpu_type: str                  # hardware model/class label
    memory_gb: int                 # onboard memory in GB
    cuda_cores: int                # CUDA core count
    base_price_per_hour: float     # list price before dynamic adjustment
    current_price_per_hour: float  # currently effective hourly price
    availability_status: str  # 'available', 'busy', 'offline', 'maintenance'
    region: str                    # deployment region
    performance_score: float       # relative performance ranking value
    last_updated: datetime         # when this record last changed
@dataclass
class BookingInfo:
    """Booking information structure.

    One record per GPU rental booking.
    """
    booking_id: str       # unique booking identifier
    gpu_id: str           # booked GPU
    user_id: str          # booking user
    start_time: datetime  # rental start
    end_time: datetime    # rental end
    status: str  # 'active', 'completed', 'cancelled'
    total_cost: float     # full price for the booking period
    created_at: datetime  # when the booking was placed
@dataclass
class MarketStats:
    """Market statistics structure.

    Aggregated marketplace metrics, recomputed periodically.
    """
    total_gpus: int                # all GPUs known to the marketplace
    available_gpus: int            # GPUs currently bookable
    busy_gpus: int                 # GPUs currently rented
    average_price_per_hour: float  # mean hourly price across GPUs
    total_bookings_24h: int        # bookings in the last 24 hours
    total_volume_24h: float        # revenue volume in the last 24 hours
    utilization_rate: float        # fraction of GPUs in use
    last_updated: datetime         # when these stats were computed
class GPUMarketplaceCacheManager:
"""
Specialized cache manager for GPU marketplace
Features:
- Real-time GPU availability tracking
- Dynamic pricing with immediate propagation
- Event-driven cache invalidation on booking changes
- Regional cache optimization
- Performance-based GPU ranking
"""
def __init__(self, cache_manager: EventDrivenCacheManager):
    """Wrap an EventDrivenCacheManager with GPU-marketplace helpers.

    Args:
        cache_manager: Event-driven cache used for all reads, writes,
            and invalidation broadcasts.
    """
    self.cache = cache_manager
    # Regions / GPU types observed so far; grow as availability is cached.
    self.regions = set()
    self.gpu_types = set()
    # Register event handlers
    self._register_event_handlers()
def _register_event_handlers(self):
    """Register handlers for cache invalidation events.

    Maps each marketplace-relevant event type to its single handler;
    handlers run when matching events arrive from other nodes.
    """
    handler_map = {
        CacheEventType.GPU_AVAILABILITY_CHANGED: self._handle_gpu_availability_change,
        CacheEventType.PRICING_UPDATED: self._handle_pricing_update,
        CacheEventType.BOOKING_CREATED: self._handle_booking_created,
        CacheEventType.BOOKING_CANCELLED: self._handle_booking_cancelled,
    }
    # One-element handler lists, matching the manager's handler registry shape.
    for event_type, handler in handler_map.items():
        self.cache.event_handlers[event_type] = [handler]
# GPU Availability Methods
async def get_gpu_availability(self,
                               region: str = None,
                               gpu_type: str = None,
                               include_busy: bool = False) -> List[GPUInfo]:
    """Get GPU availability with filtering options.

    Fix: the cache-key params no longer embed datetime.utcnow(). The old
    key was unique on every call, so lookups could never hit and never
    matched the keys written by set_gpu_availability. The key is now
    built from the non-None filters only, mirroring the writer's keys
    ({}, {'region': ...}, {'gpu_type': ...}).

    Args:
        region: Restrict to one region; None means all regions.
        gpu_type: Restrict to one GPU type; None means all types.
        include_busy: Also include non-available GPUs.

    Returns:
        List of GPUInfo records, or [] on a cache miss.
    """
    params = {}
    if region is not None:
        params['region'] = region
    if gpu_type is not None:
        params['gpu_type'] = gpu_type
    if include_busy:
        # NOTE(review): set_gpu_availability never writes include_busy
        # variants, so these lookups still miss — confirm intended behavior.
        params['include_busy'] = include_busy
    cached_data = await self.cache.get('gpu_availability', params)
    if cached_data:
        return [GPUInfo(**gpu) for gpu in cached_data]
    # In real implementation, this would query the database
    # For now, return empty list to be populated by real data
    return []
async def set_gpu_availability(self, gpus: List[GPUInfo]):
    """Set GPU availability data and refresh the filtered cache views.

    Fix: `asdict` was used without being imported — this module imports
    only `dataclass` from dataclasses, so every call raised NameError.
    It is now imported locally to keep the module import list unchanged.

    Args:
        gpus: Full snapshot of GPU records to cache.
    """
    from dataclasses import asdict  # fix: name was never imported in this module

    gpu_data = [asdict(gpu) for gpu in gpus]
    # Track every region / GPU type ever seen so the filtered views below
    # are refreshed (sets accumulate for the process lifetime).
    for gpu in gpus:
        self.regions.add(gpu.region)
        self.gpu_types.add(gpu.gpu_type)
    # Unfiltered view
    await self.cache.set('gpu_availability', {}, gpu_data)
    # Per-region views
    for region in self.regions:
        region_gpus = [asdict(gpu) for gpu in gpus if gpu.region == region]
        await self.cache.set('gpu_availability',
                             {'region': region}, region_gpus)
    # Per-GPU-type views
    for gpu_type in self.gpu_types:
        type_gpus = [asdict(gpu) for gpu in gpus if gpu.gpu_type == gpu_type]
        await self.cache.set('gpu_availability',
                             {'gpu_type': gpu_type}, type_gpus)
async def update_gpu_status(self, gpu_id: str, new_status: str):
    """Update a single GPU's status, re-cache the fleet, and broadcast.

    A no-op when the GPU id is unknown in the current snapshot.
    """
    # Load the current fleet snapshot and locate the target GPU.
    fleet = await self.get_gpu_availability()
    target = next((g for g in fleet if g.gpu_id == gpu_id), None)
    if target is None:
        return
    target.availability_status = new_status
    target.last_updated = datetime.utcnow()
    # Persist the updated snapshot, then broadcast so edge nodes
    # invalidate immediately.
    await self.set_gpu_availability(fleet)
    await self.cache.notify_gpu_availability_change(gpu_id, new_status)
    logger.info(f"Updated GPU {gpu_id} status to {new_status}")
# Pricing Methods
async def get_gpu_pricing(self,
                          gpu_type: str = None,
                          region: str = None) -> Dict[str, float]:
    """Get current GPU pricing.

    Fix: the cache-key params no longer embed datetime.utcnow(). The old
    key was unique on every call, so lookups could never hit and never
    matched the key written by update_gpu_pricing. The key is now exactly
    {'gpu_type', 'region'}, matching the writer.

    Args:
        gpu_type: Restrict to one GPU type; None means all types.
        region: Restrict to one region; None means all regions.

    Returns:
        Mapping of pricing key -> hourly price, or {} on a cache miss.
    """
    params = {
        'gpu_type': gpu_type,
        'region': region
    }
    cached_data = await self.cache.get('gpu_pricing', params)
    if cached_data:
        return cached_data
    # Return empty pricing to be populated by real data
    return {}
async def update_gpu_pricing(self, gpu_type: str, new_price: float, region: str = None):
"""Update GPU pricing and notify"""
# Get current pricing
current_pricing = await self.get_gpu_pricing(gpu_type, region)
pricing_key = f"{gpu_type}_{region}" if region else gpu_type
current_pricing[pricing_key] = new_price
# Update cache
await self.cache.set('gpu_pricing',
{'gpu_type': gpu_type, 'region': region},
current_pricing)
# Publish event for immediate propagation
await self.cache.notify_pricing_update(gpu_type, new_price)
logger.info(f"Updated {gpu_type} pricing to {new_price}")
async def get_dynamic_pricing(self, gpu_id: str) -> float:
"""Get dynamic pricing for a specific GPU"""
params = {'gpu_id': gpu_id}
cached_price = await self.cache.get('gpu_pricing', params)
if cached_price:
return cached_price
# Calculate dynamic pricing based on demand and availability
gpus = await self.get_gpu_availability()
target_gpu = next((gpu for gpu in gpus if gpu.gpu_id == gpu_id), None)
if not target_gpu:
return 0.0
# Simple dynamic pricing logic
base_price = target_gpu.base_price_per_hour
availability_multiplier = 1.0
# Increase price based on demand (lower availability)
total_gpus = len(gpus)
available_gpus = len([g for g in gpus if g.availability_status == 'available'])
if total_gpus > 0:
availability_ratio = available_gpus / total_gpus
if availability_ratio < 0.1: # Less than 10% available
availability_multiplier = 2.0
elif availability_ratio < 0.3: # Less than 30% available
availability_multiplier = 1.5
elif availability_ratio < 0.5: # Less than 50% available
availability_multiplier = 1.2
dynamic_price = base_price * availability_multiplier
# Cache the calculated price
await self.cache.set('gpu_pricing', params, {'price': dynamic_price})
return dynamic_price
# Booking Methods
async def create_booking(self, booking: BookingInfo) -> bool:
"""Create a new booking and update caches"""
try:
# In real implementation, save to database first
# For now, just update caches
# Update GPU availability
await self.update_gpu_status(booking.gpu_id, 'busy')
# Update pricing (might change due to reduced availability)
gpus = await self.get_gpu_availability()
target_gpu = next((gpu for gpu in gpus if gpu.gpu_id == booking.gpu_id), None)
if target_gpu:
new_price = await self.get_dynamic_pricing(booking.gpu_id)
await self.update_gpu_pricing(target_gpu.gpu_type, new_price, target_gpu.region)
# Publish booking creation event
await self.cache.notify_booking_created(booking.booking_id, booking.gpu_id)
# Invalidate relevant caches
await self.cache.invalidate_cache('order_book')
await self.cache.invalidate_cache('market_stats')
logger.info(f"Created booking {booking.booking_id} for GPU {booking.gpu_id}")
return True
except Exception as e:
logger.error(f"Failed to create booking: {e}")
return False
async def cancel_booking(self, booking_id: str, gpu_id: str) -> bool:
"""Cancel a booking and update caches"""
try:
# Update GPU availability
await self.update_gpu_status(gpu_id, 'available')
# Update pricing (might change due to increased availability)
gpus = await self.get_gpu_availability()
target_gpu = next((gpu for gpu in gpus if gpu.gpu_id == gpu_id), None)
if target_gpu:
new_price = await self.get_dynamic_pricing(gpu_id)
await self.update_gpu_pricing(target_gpu.gpu_type, new_price, target_gpu.region)
# Publish booking cancellation event
await self.cache.notify_booking_cancelled(booking_id, gpu_id)
# Invalidate relevant caches
await self.cache.invalidate_cache('order_book')
await self.cache.invalidate_cache('market_stats')
logger.info(f"Cancelled booking {booking_id} for GPU {gpu_id}")
return True
except Exception as e:
logger.error(f"Failed to cancel booking: {e}")
return False
# Market Statistics
async def get_market_stats(self) -> MarketStats:
"""Get current market statistics"""
params = {'timestamp': datetime.utcnow().isoformat()}
cached_data = await self.cache.get('market_stats', params)
if cached_data:
return MarketStats(**cached_data)
# Calculate statistics from current data
gpus = await self.get_gpu_availability()
total_gpus = len(gpus)
available_gpus = len([g for g in gpus if g.availability_status == 'available'])
busy_gpus = len([g for g in gpus if g.availability_status == 'busy'])
# Calculate average price
prices = [g.current_price_per_hour for g in gpus if g.availability_status == 'available']
avg_price = sum(prices) / len(prices) if prices else 0.0
utilization_rate = busy_gpus / total_gpus if total_gpus > 0 else 0.0
stats = MarketStats(
total_gpus=total_gpus,
available_gpus=available_gpus,
busy_gpus=busy_gpus,
average_price_per_hour=avg_price,
total_bookings_24h=0, # Would be calculated from database
total_volume_24h=0.0, # Would be calculated from database
utilization_rate=utilization_rate,
last_updated=datetime.utcnow()
)
# Cache the statistics
await self.cache.set('market_stats', params, asdict(stats))
return stats
# Event Handlers
async def _handle_gpu_availability_change(self, event_data: Dict[str, Any]):
"""Handle GPU availability change event"""
gpu_id = event_data['data']['gpu_id']
new_status = event_data['data']['status']
# Invalidate GPU availability cache
await self.cache.invalidate_cache('gpu_availability')
# Invalidate market stats
await self.cache.invalidate_cache('market_stats')
logger.debug(f"Handled GPU availability change: {gpu_id} -> {new_status}")
async def _handle_pricing_update(self, event_data: Dict[str, Any]):
"""Handle pricing update event"""
gpu_type = event_data['data']['gpu_type']
new_price = event_data['data']['price']
# Invalidate pricing cache
await self.cache.invalidate_cache('gpu_pricing')
# Invalidate market stats
await self.cache.invalidate_cache('market_stats')
logger.debug(f"Handled pricing update: {gpu_type} -> {new_price}")
async def _handle_booking_created(self, event_data: Dict[str, Any]):
"""Handle booking creation event"""
booking_id = event_data['data']['booking_id']
gpu_id = event_data['data']['gpu_id']
# Invalidate caches affected by new booking
await self.cache.invalidate_cache('gpu_availability')
await self.cache.invalidate_cache('gpu_pricing')
await self.cache.invalidate_cache('order_book')
await self.cache.invalidate_cache('market_stats')
logger.debug(f"Handled booking creation: {booking_id}")
async def _handle_booking_cancelled(self, event_data: Dict[str, Any]):
"""Handle booking cancellation event"""
booking_id = event_data['data']['booking_id']
gpu_id = event_data['data']['gpu_id']
# Invalidate caches affected by cancellation
await self.cache.invalidate_cache('gpu_availability')
await self.cache.invalidate_cache('gpu_pricing')
await self.cache.invalidate_cache('order_book')
await self.cache.invalidate_cache('market_stats')
logger.debug(f"Handled booking cancellation: {booking_id}")
# Utility Methods
async def get_top_performing_gpus(self, limit: int = 10) -> List[GPUInfo]:
"""Get top performing GPUs by performance score"""
gpus = await self.get_gpu_availability()
# Filter available GPUs and sort by performance score
available_gpus = [gpu for gpu in gpus if gpu.availability_status == 'available']
sorted_gpus = sorted(available_gpus,
key=lambda gpu: gpu.performance_score,
reverse=True)
return sorted_gpus[:limit]
async def get_cheapest_gpus(self, limit: int = 10, gpu_type: str = None) -> List[GPUInfo]:
"""Get cheapest available GPUs"""
gpus = await self.get_gpu_availability(gpu_type=gpu_type)
# Filter available GPUs and sort by price
available_gpus = [gpu for gpu in gpus if gpu.availability_status == 'available']
sorted_gpus = sorted(available_gpus,
key=lambda gpu: gpu.current_price_per_hour)
return sorted_gpus[:limit]
async def search_gpus(self,
min_memory: int = None,
min_cuda_cores: int = None,
max_price: float = None,
region: str = None) -> List[GPUInfo]:
"""Search GPUs with specific criteria"""
gpus = await self.get_gpu_availability(region=region)
filtered_gpus = []
for gpu in gpus:
if gpu.availability_status != 'available':
continue
if min_memory and gpu.memory_gb < min_memory:
continue
if min_cuda_cores and gpu.cuda_cores < min_cuda_cores:
continue
if max_price and gpu.current_price_per_hour > max_price:
continue
filtered_gpus.append(gpu)
return filtered_gpus
async def get_cache_health(self) -> Dict[str, Any]:
"""Get comprehensive cache health report"""
health = await self.cache.health_check()
# Add marketplace-specific metrics
marketplace_metrics = {
'regions_count': len(self.regions),
'gpu_types_count': len(self.gpu_types),
'last_gpu_update': None,
'last_pricing_update': None
}
# Get last update times from cache stats
stats = await self.cache.get_cache_stats()
if stats['last_event_time']:
marketplace_metrics['last_event_age'] = time.time() - stats['last_event_time']
health['marketplace_metrics'] = marketplace_metrics
health['cache_stats'] = stats
return health
# Global marketplace cache manager instance.
# Populated by init_marketplace_cache(); read via get_marketplace_cache(),
# which raises RuntimeError while this is still None.
marketplace_cache = None
async def init_marketplace_cache(redis_url: str = "redis://localhost:6379/0",
                                 node_id: str = None,
                                 region: str = "default") -> GPUMarketplaceCacheManager:
    """Initialize the global marketplace cache manager"""
    global marketplace_cache

    # Bring up the underlying event-driven cache layer first.
    event_cache = EventDrivenCacheManager(redis_url, node_id, region)
    await event_cache.connect()

    # Wrap it with the marketplace-specific layer and publish it globally.
    marketplace_cache = GPUMarketplaceCacheManager(event_cache)
    logger.info("GPU Marketplace Cache Manager initialized")
    return marketplace_cache
async def get_marketplace_cache() -> GPUMarketplaceCacheManager:
    """Get the global marketplace cache manager"""
    # Guard clause: the global must have been set by init_marketplace_cache().
    if marketplace_cache is not None:
        return marketplace_cache
    raise RuntimeError("Marketplace cache not initialized. Call init_marketplace_cache() first.")