# aitbc/gpu_acceleration/parallel_processing/marketplace_monitor.py

"""
Marketplace Real-time Performance Monitor
Implements comprehensive real-time monitoring and analytics for the AITBC marketplace.
"""

import asyncio
import collections
import logging
import time
from datetime import datetime, timedelta, timezone
from typing import Any, Dict, List, Optional

# Module-level logger, named after this module so handlers/levels can be
# configured per module by the application.
logger = logging.getLogger(__name__)

class TimeSeriesData:
    """Efficient in-memory time series data structure for real-time metrics.

    Samples are held in two parallel bounded deques (``timestamps`` and
    ``values``).  Once ``max_points`` samples are stored, appending a new
    sample evicts the oldest one.
    """

    def __init__(self, max_points: int = 3600):  # Default 1 hour of second-level data
        """Create an empty series that retains at most ``max_points`` samples."""
        self.max_points = max_points
        self.timestamps = collections.deque(maxlen=max_points)
        self.values = collections.deque(maxlen=max_points)

    def add(self, value: float, timestamp: Optional[float] = None):
        """Append a sample.

        Args:
            value: The metric value to record.
            timestamp: Seconds since the epoch for the sample.  When omitted
                (``None``), the current ``time.time()`` is used.
        """
        # Compare against None explicitly: a legitimate timestamp of 0 is
        # falsy and must not be replaced by the current time.
        if timestamp is None:
            timestamp = time.time()
        self.timestamps.append(timestamp)
        self.values.append(value)

    def get_latest(self) -> Optional[float]:
        """Return the most recently added value, or ``None`` if empty."""
        return self.values[-1] if self.values else None

    def _values_in_window(self, window_seconds: float) -> List[float]:
        """Return values whose timestamp is within the last ``window_seconds``."""
        cutoff = time.time() - window_seconds
        return [v for t, v in zip(self.timestamps, self.values) if t >= cutoff]

    def get_average(self, window_seconds: int = 60) -> float:
        """Return the mean of the samples in the trailing window.

        Returns ``0.0`` when the series is empty or no sample falls inside
        the window.
        """
        recent = self._values_in_window(window_seconds)
        return sum(recent) / len(recent) if recent else 0.0

    def get_percentile(self, percentile: float, window_seconds: int = 60) -> float:
        """Return a percentile of the samples in the trailing window.

        Args:
            percentile: Fraction in ``[0, 1]`` (e.g. ``0.95`` for p95).
            window_seconds: Size of the trailing window in seconds.

        Returns:
            The element at index ``int(count * percentile)`` of the sorted
            window values, with the index clamped into range, or ``0.0``
            when no sample falls inside the window.
        """
        recent = sorted(self._values_in_window(window_seconds))
        if not recent:
            return 0.0

        # Clamp so percentile == 1.0 (or out-of-range input) stays in bounds.
        idx = int(len(recent) * percentile)
        idx = min(max(idx, 0), len(recent) - 1)
        return recent[idx]

class MarketplaceMonitor:
    """Real-time performance monitoring system for the marketplace.

    Callers push raw observations through ``record_api_call``,
    ``record_trade`` and ``update_resource_metrics``.  After ``start()`` has
    been awaited, a background asyncio task converts the request/error/trade
    counters into per-second rates about once a second and re-evaluates the
    alert thresholds.  ``get_realtime_dashboard_data`` returns a snapshot of
    the current metrics and active alerts.
    """

    def __init__(self):
        """Create the metric series, counters and default alert thresholds.

        No background task is started here; call ``start()`` for that.
        """
        # API Metrics
        self.api_latency_ms = TimeSeriesData()
        self.api_requests_per_sec = TimeSeriesData()
        self.api_error_rate = TimeSeriesData()

        # Trading Metrics
        self.order_matching_time_ms = TimeSeriesData()
        self.trades_per_sec = TimeSeriesData()
        self.active_orders = TimeSeriesData()

        # Resource Metrics
        self.gpu_utilization_pct = TimeSeriesData()
        self.network_bandwidth_mbps = TimeSeriesData()
        self.active_providers = TimeSeriesData()

        # Internal tracking: counters accumulated between ticks.
        self._request_counter = 0
        self._error_counter = 0
        self._trade_counter = 0
        # Monotonic clock: only used to measure the interval between ticks,
        # so it must not be affected by wall-clock adjustments.
        self._last_tick = time.monotonic()

        self.is_running = False
        self._monitor_task: Optional[asyncio.Task] = None

        # Alert thresholds
        self.alert_thresholds = {
            'api_latency_p95_ms': 500.0,
            'api_error_rate_pct': 5.0,
            'gpu_utilization_pct': 90.0,
            'matching_time_ms': 100.0
        }

        self.active_alerts: List[Dict[str, Any]] = []

    async def start(self) -> None:
        """Start the background aggregation task (no-op if already running)."""
        if self.is_running:
            return
        self.is_running = True
        self._monitor_task = asyncio.create_task(self._metric_tick_loop())
        logger.info("Marketplace Monitor started")

    async def stop(self) -> None:
        """Stop the background task and wait for it to finish."""
        self.is_running = False
        task, self._monitor_task = self._monitor_task, None
        if task is not None:
            task.cancel()
            # Wait for the task to actually finish so it is not left pending
            # when the event loop shuts down.  return_exceptions=True absorbs
            # the task's own CancelledError/exception while still letting a
            # cancellation of stop() itself propagate.
            await asyncio.gather(task, return_exceptions=True)
        logger.info("Marketplace Monitor stopped")

    def record_api_call(self, latency_ms: float, is_error: bool = False) -> None:
        """Record an API request for monitoring.

        Args:
            latency_ms: Observed request latency in milliseconds.
            is_error: Whether the request counts towards the error rate.
        """
        self.api_latency_ms.add(latency_ms)
        self._request_counter += 1
        if is_error:
            self._error_counter += 1

    def record_trade(self, matching_time_ms: float) -> None:
        """Record a successful trade match and its matching time."""
        self.order_matching_time_ms.add(matching_time_ms)
        self._trade_counter += 1

    def update_resource_metrics(self, gpu_util: float, bandwidth: float, providers: int, orders: int) -> None:
        """Update system resource metrics with one sample each."""
        self.gpu_utilization_pct.add(gpu_util)
        self.network_bandwidth_mbps.add(bandwidth)
        self.active_providers.add(providers)
        self.active_orders.add(orders)

    async def _metric_tick_loop(self) -> None:
        """Background task that aggregates metrics every second.

        Runs until ``is_running`` becomes false or the task is cancelled.
        Unexpected errors are logged and the loop retries after one second.
        """
        while self.is_running:
            try:
                now = time.monotonic()
                elapsed = now - self._last_tick

                if elapsed >= 1.0:
                    # Calculate rates over the real elapsed interval
                    req_per_sec = self._request_counter / elapsed
                    trades_per_sec = self._trade_counter / elapsed
                    # max(1, ...) avoids dividing by zero on an idle tick
                    error_rate = (self._error_counter / max(1, self._request_counter)) * 100

                    # Store metrics
                    self.api_requests_per_sec.add(req_per_sec)
                    self.trades_per_sec.add(trades_per_sec)
                    self.api_error_rate.add(error_rate)

                    # Reset counters
                    self._request_counter = 0
                    self._error_counter = 0
                    self._trade_counter = 0
                    self._last_tick = now

                    # Evaluate alerts
                    self._evaluate_alerts()

                # Sleep for the remainder of the second (never negative)
                await asyncio.sleep(max(0.0, 1.0 - (time.monotonic() - now)))

            except asyncio.CancelledError:
                break
            except Exception as e:
                # logger.exception keeps the traceback for diagnosis
                logger.exception("Error in monitor tick loop: %s", e)
                await asyncio.sleep(1.0)

    @staticmethod
    def _utc_timestamp() -> str:
        """Return the current UTC time as a naive ISO-8601 string.

        Produces the same format as the deprecated
        ``datetime.utcnow().isoformat()``.
        """
        return datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

    def _build_alert(self, alert_key: str, severity: str, metric: str,
                     value: float, threshold_key: str, message: str) -> Dict[str, Any]:
        """Assemble one alert dict in the shape exposed to the dashboard."""
        return {
            'id': f"alert_{alert_key}_{int(time.time())}",
            'severity': severity,
            'metric': metric,
            'value': value,
            'threshold': self.alert_thresholds[threshold_key],
            'message': message,
            'timestamp': self._utc_timestamp()
        }

    def _evaluate_alerts(self) -> None:
        """Check metrics against thresholds and generate alerts.

        Replaces ``active_alerts`` with the alerts that currently apply
        (all computed over the trailing 60 seconds) and logs a warning for
        each alert whose severity is ``high`` or ``critical``.
        """
        thresholds = self.alert_thresholds
        current_alerts = []

        # API Latency Alert
        p95_latency = self.api_latency_ms.get_percentile(0.95, window_seconds=60)
        if p95_latency > thresholds['api_latency_p95_ms']:
            # Escalate to 'high' once latency exceeds twice the threshold
            severity = 'high' if p95_latency > thresholds['api_latency_p95_ms'] * 2 else 'medium'
            current_alerts.append(self._build_alert(
                'latency', severity, 'api_latency', p95_latency,
                'api_latency_p95_ms',
                f"High API Latency (p95): {p95_latency:.2f}ms"))

        # Error Rate Alert
        avg_error_rate = self.api_error_rate.get_average(window_seconds=60)
        if avg_error_rate > thresholds['api_error_rate_pct']:
            current_alerts.append(self._build_alert(
                'error', 'critical', 'error_rate', avg_error_rate,
                'api_error_rate_pct',
                f"High API Error Rate: {avg_error_rate:.2f}%"))

        # Matching Time Alert
        avg_matching = self.order_matching_time_ms.get_average(window_seconds=60)
        if avg_matching > thresholds['matching_time_ms']:
            current_alerts.append(self._build_alert(
                'matching', 'medium', 'matching_time', avg_matching,
                'matching_time_ms',
                f"Slow Order Matching: {avg_matching:.2f}ms"))

        # GPU Utilization Alert: the threshold is configured in
        # alert_thresholds, so evaluate it like the other metrics.
        avg_gpu = self.gpu_utilization_pct.get_average(window_seconds=60)
        if avg_gpu > thresholds['gpu_utilization_pct']:
            current_alerts.append(self._build_alert(
                'gpu', 'medium', 'gpu_utilization', avg_gpu,
                'gpu_utilization_pct',
                f"High GPU Utilization: {avg_gpu:.2f}%"))

        self.active_alerts = current_alerts

        # In a real system, this would trigger webhooks, Slack/Discord messages, etc.
        for alert in current_alerts:
            if alert['severity'] in ('high', 'critical'):
                logger.warning("MARKETPLACE ALERT: %s", alert['message'])

    def get_realtime_dashboard_data(self) -> Dict[str, Any]:
        """Get aggregated data formatted for the frontend dashboard.

        Returns:
            A dict with ``status`` (``'degraded'`` if any active alert is
            ``high``/``critical``, else ``'healthy'``), a UTC ``timestamp``,
            the ``current_metrics`` grouped by area, and the ``alerts`` list.
            Metrics with no data yet are reported as ``0``.
        """
        has_severe_alert = any(
            a['severity'] in ('high', 'critical') for a in self.active_alerts
        )
        return {
            'status': 'degraded' if has_severe_alert else 'healthy',
            'timestamp': self._utc_timestamp(),
            'current_metrics': {
                'api': {
                    'rps': round(self.api_requests_per_sec.get_latest() or 0, 2),
                    'latency_p50_ms': round(self.api_latency_ms.get_percentile(0.50, 60), 2),
                    'latency_p95_ms': round(self.api_latency_ms.get_percentile(0.95, 60), 2),
                    'error_rate_pct': round(self.api_error_rate.get_average(60), 2)
                },
                'trading': {
                    'tps': round(self.trades_per_sec.get_latest() or 0, 2),
                    'matching_time_ms': round(self.order_matching_time_ms.get_average(60), 2),
                    'active_orders': int(self.active_orders.get_latest() or 0)
                },
                'network': {
                    'active_providers': int(self.active_providers.get_latest() or 0),
                    'gpu_utilization_pct': round(self.gpu_utilization_pct.get_latest() or 0, 2),
                    'bandwidth_mbps': round(self.network_bandwidth_mbps.get_latest() or 0, 2)
                }
            },
            'alerts': self.active_alerts
        }

# Global instance: a single module-level monitor created at import time.
# Construction only sets up in-memory state; the background tick task is not
# created until `await monitor.start()` is called.
monitor = MarketplaceMonitor()