feat: implement CLI blockchain features and pool hub enhancements
Some checks failed
API Endpoint Tests / test-api-endpoints (push) Successful in 11s
CLI Tests / test-cli (push) Failing after 7s
Documentation Validation / validate-docs (push) Successful in 8s
Documentation Validation / validate-policies-strict (push) Successful in 3s
Integration Tests / test-service-integration (push) Successful in 38s
Python Tests / test-python (push) Successful in 11s
Security Scanning / security-scan (push) Successful in 29s
Multi-Node Blockchain Health Monitoring / health-check (push) Successful in 1s
Some checks failed
API Endpoint Tests / test-api-endpoints (push) Successful in 11s
CLI Tests / test-cli (push) Failing after 7s
Documentation Validation / validate-docs (push) Successful in 8s
Documentation Validation / validate-policies-strict (push) Successful in 3s
Integration Tests / test-service-integration (push) Successful in 38s
Python Tests / test-python (push) Successful in 11s
Security Scanning / security-scan (push) Successful in 29s
Multi-Node Blockchain Health Monitoring / health-check (push) Successful in 1s
CLI Blockchain Features: - Added block operations: import, export, import-chain, blocks-range - Added messaging system commands (deploy, state, topics, create-topic, messages, post, vote, search, reputation, moderate) - Added network force-sync operation - Replaced marketplace handlers with actual RPC calls - Replaced AI handlers with actual RPC calls - Added account operations (account get) - Added transaction query operations - Added mempool query operations - Created keystore_auth.py for authentication - Removed extended features interception - All handlers use keystore credentials for authenticated endpoints Pool Hub Enhancements: - Added SLA monitoring and capacity tables - Added billing integration service - Added SLA collector service - Added SLA router endpoints - Updated pool hub models and settings - Added integration tests for billing and SLA - Updated documentation with SLA monitoring guide
This commit is contained in:
325
apps/pool-hub/src/poolhub/services/billing_integration.py
Normal file
325
apps/pool-hub/src/poolhub/services/billing_integration.py
Normal file
@@ -0,0 +1,325 @@
|
||||
"""
|
||||
Billing Integration Service for Pool-Hub
|
||||
Integrates pool-hub usage data with coordinator-api's billing system.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
from datetime import datetime, timedelta
|
||||
from decimal import Decimal
|
||||
from typing import Dict, List, Optional, Any
|
||||
import httpx
|
||||
|
||||
from sqlalchemy import and_, func, select
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from ..models import Miner, ServiceConfig, MatchRequest, MatchResult, Feedback
|
||||
from ..settings import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class BillingIntegration:
    """Service for integrating pool-hub with coordinator-api billing.

    Reads usage signals recorded by pool-hub (match requests / match results)
    and forwards them as billing events to coordinator-api's billing HTTP API.
    """

    def __init__(self, db: Session):
        """
        Args:
            db: Synchronous SQLAlchemy session used to read pool-hub usage data.
        """
        self.db = db
        # Fall back to local defaults when settings does not define these keys.
        self.coordinator_billing_url = getattr(
            settings, "coordinator_billing_url", "http://localhost:8011"
        )
        self.coordinator_api_key = getattr(
            settings, "coordinator_api_key", None
        )
        self.logger = logging.getLogger(__name__)

        # Resource type mappings (pool-hub name -> coordinator-api name).
        # Currently the identity mapping; kept so a rename on either side
        # only requires editing this table.
        self.resource_type_mapping = {
            "gpu_hours": "gpu_hours",
            "storage_gb": "storage_gb",
            "api_calls": "api_calls",
            "compute_hours": "compute_hours",
        }

        # Pricing configuration (fallback if coordinator-api pricing not available)
        self.fallback_pricing = {
            "gpu_hours": {"unit_price": Decimal("0.50")},
            "storage_gb": {"unit_price": Decimal("0.02")},
            "api_calls": {"unit_price": Decimal("0.0001")},
            "compute_hours": {"unit_price": Decimal("0.30")},
        }

    async def record_usage(
        self,
        tenant_id: str,
        resource_type: str,
        quantity: Decimal,
        unit_price: Optional[Decimal] = None,
        job_id: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> Dict[str, Any]:
        """Record usage data to the coordinator-api billing system.

        Args:
            tenant_id: Tenant the usage is billed to.
            resource_type: Resource key (see ``fallback_pricing``).
            quantity: Amount of the resource consumed.
            unit_price: Price per unit; when ``None``, a fallback price is
                looked up (``Decimal("0")`` for unknown resource types).
            job_id: Optional job identifier attached to the event.
            metadata: Optional extra fields stored with the event.

        Returns:
            The coordinator-api response body, or
            ``{"status": "failed", "error": ...}`` when delivery failed.
        """
        # Use fallback pricing only when no price was supplied. An explicit
        # Decimal("0") is a legitimate price and must not be overridden, so
        # test identity with None rather than truthiness.
        if unit_price is None:
            pricing_config = self.fallback_pricing.get(resource_type, {})
            unit_price = pricing_config.get("unit_price", Decimal("0"))

        # Calculate total cost
        total_cost = unit_price * quantity

        # Prepare billing event payload; Decimals become floats so the
        # payload is plain-JSON serializable.
        billing_event = {
            "tenant_id": tenant_id,
            "event_type": "usage",
            "resource_type": resource_type,
            "quantity": float(quantity),
            "unit_price": float(unit_price),
            "total_amount": float(total_cost),
            "currency": "USD",
            "timestamp": datetime.utcnow().isoformat(),
            "metadata": metadata or {},
        }

        if job_id:
            billing_event["job_id"] = job_id

        # Send to coordinator-api; delivery failure is reported in the
        # return value rather than raised so callers can continue syncing.
        try:
            response = await self._send_billing_event(billing_event)
            self.logger.info(
                f"Recorded usage: tenant={tenant_id}, resource={resource_type}, "
                f"quantity={quantity}, cost={total_cost}"
            )
            return response
        except Exception as e:
            self.logger.error(f"Failed to record usage: {e}")
            # Queue for retry in production
            return {"status": "failed", "error": str(e)}

    async def sync_miner_usage(
        self, miner_id: str, start_date: datetime, end_date: datetime
    ) -> Dict[str, Any]:
        """Sync usage data for one miner to coordinator-api billing.

        Args:
            miner_id: Miner whose usage is synced.
            start_date: Start of the collection window (inclusive).
            end_date: End of the collection window (inclusive).

        Returns:
            Summary dict with the period, record count, and per-record results.

        Raises:
            ValueError: If no miner with ``miner_id`` exists.
        """
        # Get miner information
        stmt = select(Miner).where(Miner.miner_id == miner_id)
        miner = self.db.execute(stmt).scalar_one_or_none()

        if not miner:
            raise ValueError(f"Miner not found: {miner_id}")

        # Map miner to tenant (simplified - in production, use proper mapping)
        tenant_id = miner_id  # For now, use miner_id as tenant_id

        # Collect usage data from pool-hub
        usage_data = await self._collect_miner_usage(miner_id, start_date, end_date)

        # Send each non-zero usage record to coordinator-api
        results = []
        for resource_type, quantity in usage_data.items():
            if quantity > 0:
                result = await self.record_usage(
                    tenant_id=tenant_id,
                    resource_type=resource_type,
                    quantity=Decimal(str(quantity)),
                    metadata={"miner_id": miner_id, "sync_type": "miner_usage"},
                )
                results.append(result)

        return {
            "miner_id": miner_id,
            "tenant_id": tenant_id,
            "period": {"start": start_date.isoformat(), "end": end_date.isoformat()},
            "usage_records": len(results),
            "results": results,
        }

    async def sync_all_miners_usage(
        self, hours_back: int = 24
    ) -> Dict[str, Any]:
        """Sync usage data for all miners over the trailing window.

        Args:
            hours_back: Size of the collection window ending now (UTC).

        Returns:
            Aggregate summary with processed/failed counts and per-miner details.
        """
        end_date = datetime.utcnow()
        start_date = end_date - timedelta(hours=hours_back)

        # Get all miners
        stmt = select(Miner)
        miners = self.db.execute(stmt).scalars().all()

        results = {
            "sync_period": {"start": start_date.isoformat(), "end": end_date.isoformat()},
            "miners_processed": 0,
            "miners_failed": 0,
            "total_usage_records": 0,
            "details": [],
        }

        # A failure for one miner must not abort the whole sweep.
        for miner in miners:
            try:
                result = await self.sync_miner_usage(miner.miner_id, start_date, end_date)
                results["details"].append(result)
                results["miners_processed"] += 1
                results["total_usage_records"] += result["usage_records"]
            except Exception as e:
                self.logger.error(f"Failed to sync usage for miner {miner.miner_id}: {e}")
                results["miners_failed"] += 1

        self.logger.info(
            f"Usage sync complete: processed={results['miners_processed']}, "
            f"failed={results['miners_failed']}, records={results['total_usage_records']}"
        )

        return results

    async def _collect_miner_usage(
        self, miner_id: str, start_date: datetime, end_date: datetime
    ) -> Dict[str, float]:
        """Collect usage data for a miner from pool-hub tables.

        Returns:
            Mapping of resource type to estimated quantity for the window.
        """
        usage_data = {
            "gpu_hours": 0.0,
            "api_calls": 0.0,
            "compute_hours": 0.0,
        }

        # Count match requests as API calls
        stmt = select(func.count(MatchRequest.id)).where(
            and_(
                MatchRequest.created_at >= start_date,
                MatchRequest.created_at <= end_date,
            )
        )
        # Filter by miner_id if match requests have that field
        # For now, count all requests (simplified)
        api_calls = self.db.execute(stmt).scalar() or 0
        usage_data["api_calls"] = float(api_calls)

        # Calculate compute hours from match results.
        # NOTE: `is_not` is the SQLAlchemy null-test operator; the previous
        # `isnot_` spelling does not exist and raised AttributeError.
        stmt = (
            select(MatchResult)
            .where(
                and_(
                    MatchResult.miner_id == miner_id,
                    MatchResult.created_at >= start_date,
                    MatchResult.created_at <= end_date,
                )
            )
            .where(MatchResult.eta_ms.is_not(None))
        )

        results = self.db.execute(stmt).scalars().all()

        # Estimate compute hours from response times (simplified)
        # In production, use actual job duration
        total_compute_time_ms = sum(r.eta_ms for r in results if r.eta_ms)
        compute_hours = (total_compute_time_ms / 1000 / 3600) if results else 0.0
        usage_data["compute_hours"] = compute_hours

        # Estimate GPU hours from miner capacity and compute hours
        # In production, use actual GPU utilization data
        gpu_hours = compute_hours * 1.5  # Estimate 1.5 GPUs per job on average
        usage_data["gpu_hours"] = gpu_hours

        return usage_data

    async def _send_billing_event(self, billing_event: Dict[str, Any]) -> Dict[str, Any]:
        """POST a billing event to coordinator-api and return its JSON body.

        Raises:
            httpx.HTTPStatusError: On non-2xx responses.
            httpx.HTTPError: On transport failures or timeout (30s).
        """
        url = f"{self.coordinator_billing_url}/api/billing/usage"

        headers = {"Content-Type": "application/json"}
        if self.coordinator_api_key:
            headers["Authorization"] = f"Bearer {self.coordinator_api_key}"

        async with httpx.AsyncClient(timeout=30.0) as client:
            response = await client.post(url, json=billing_event, headers=headers)
            response.raise_for_status()

        return response.json()

    async def get_billing_metrics(
        self, tenant_id: Optional[str] = None, hours: int = 24
    ) -> Dict[str, Any]:
        """Fetch billing metrics from coordinator-api.

        Args:
            tenant_id: Restrict metrics to one tenant when given.
            hours: Trailing window requested from the API.

        Raises:
            httpx.HTTPError: On transport or HTTP-status failure.
        """
        url = f"{self.coordinator_billing_url}/api/billing/metrics"

        params = {"hours": hours}
        if tenant_id:
            params["tenant_id"] = tenant_id

        headers = {}
        if self.coordinator_api_key:
            headers["Authorization"] = f"Bearer {self.coordinator_api_key}"

        async with httpx.AsyncClient(timeout=30.0) as client:
            response = await client.get(url, params=params, headers=headers)
            response.raise_for_status()

        return response.json()

    async def trigger_invoice_generation(
        self, tenant_id: str, period_start: datetime, period_end: datetime
    ) -> Dict[str, Any]:
        """Trigger invoice generation in coordinator-api for a billing period.

        Raises:
            httpx.HTTPError: On transport or HTTP-status failure.
        """
        url = f"{self.coordinator_billing_url}/api/billing/invoice"

        payload = {
            "tenant_id": tenant_id,
            "period_start": period_start.isoformat(),
            "period_end": period_end.isoformat(),
        }

        headers = {"Content-Type": "application/json"}
        if self.coordinator_api_key:
            headers["Authorization"] = f"Bearer {self.coordinator_api_key}"

        async with httpx.AsyncClient(timeout=30.0) as client:
            response = await client.post(url, json=payload, headers=headers)
            response.raise_for_status()

        return response.json()
|
||||
|
||||
|
||||
class BillingIntegrationScheduler:
    """Scheduler for automated billing synchronization.

    Runs ``BillingIntegration.sync_all_miners_usage`` on a fixed interval in
    a background asyncio task.
    """

    def __init__(self, billing_integration: BillingIntegration):
        self.billing_integration = billing_integration
        self.logger = logging.getLogger(__name__)
        self.running = False
        # Hold a strong reference to the loop task: the event loop keeps only
        # weak references to tasks, so an unreferenced task can be garbage
        # collected mid-flight. Also lets stop() cancel the loop promptly.
        self._task: Optional[asyncio.Task] = None

    async def start(self, sync_interval_hours: int = 1):
        """Start the billing synchronization scheduler (idempotent).

        Args:
            sync_interval_hours: Interval between syncs, also used as the
                trailing window passed to the sync call.
        """
        if self.running:
            return

        self.running = True
        self.logger.info("Billing Integration scheduler started")

        # Start sync loop, keeping the task referenced (see __init__).
        self._task = asyncio.create_task(self._sync_loop(sync_interval_hours))

    async def stop(self):
        """Stop the scheduler, cancelling any in-progress wait."""
        self.running = False
        if self._task is not None:
            # Cancel instead of waiting for the (up to one hour) sleep to
            # elapse before the loop notices ``running`` went False.
            self._task.cancel()
            try:
                await self._task
            except asyncio.CancelledError:
                pass
            self._task = None
        self.logger.info("Billing Integration scheduler stopped")

    async def _sync_loop(self, interval_hours: int):
        """Background task that syncs usage data periodically."""
        while self.running:
            try:
                await self.billing_integration.sync_all_miners_usage(
                    hours_back=interval_hours
                )

                # Wait for next sync interval
                await asyncio.sleep(interval_hours * 3600)

            except asyncio.CancelledError:
                # Propagate cancellation so stop() can await cleanly.
                raise
            except Exception as e:
                self.logger.error(f"Error in billing sync loop: {e}")
                await asyncio.sleep(300)  # Retry in 5 minutes
||||
405
apps/pool-hub/src/poolhub/services/sla_collector.py
Normal file
405
apps/pool-hub/src/poolhub/services/sla_collector.py
Normal file
@@ -0,0 +1,405 @@
|
||||
"""
|
||||
SLA Metrics Collection Service for Pool-Hub
|
||||
Collects and tracks SLA metrics for miners including uptime, response time, job completion rate, and capacity availability.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
from datetime import datetime, timedelta
|
||||
from decimal import Decimal
|
||||
from typing import Dict, List, Optional, Any
|
||||
|
||||
from sqlalchemy import and_, desc, func, select
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from ..models import (
|
||||
Miner,
|
||||
MinerStatus,
|
||||
SLAMetric,
|
||||
SLAViolation,
|
||||
Feedback,
|
||||
MatchRequest,
|
||||
MatchResult,
|
||||
CapacitySnapshot,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class SLACollector:
    """Service for collecting and tracking SLA metrics for miners.

    Collects uptime, response time, job completion rate, and capacity
    availability, persists them as ``SLAMetric`` rows, and records
    ``SLAViolation`` rows when thresholds are breached.

    NOTE(review): every DB call in this class awaits ``execute``/``commit``,
    so ``db`` must be an async session at runtime despite the ``Session``
    annotation — confirm the annotation against the caller.
    """

    def __init__(self, db: Session):
        self.db = db
        # Threshold per metric type; direction of comparison is defined in
        # _check_violation (percentages: below is a violation; response
        # time: above is a violation).
        self.sla_thresholds = {
            "uptime_pct": 95.0,
            "response_time_ms": 1000.0,
            "completion_rate_pct": 90.0,
            "capacity_availability_pct": 80.0,
        }

    async def record_sla_metric(
        self,
        miner_id: str,
        metric_type: str,
        metric_value: float,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> SLAMetric:
        """Record an SLA metric for a miner, flagging threshold violations.

        Args:
            miner_id: Miner the metric belongs to.
            metric_type: Key into ``sla_thresholds`` (unknown types default
                to a 100.0 threshold and never violate).
            metric_value: Measured value.
            metadata: Optional extra context stored with the metric.

        Returns:
            The persisted ``SLAMetric`` row.
        """
        threshold = self.sla_thresholds.get(metric_type, 100.0)
        is_violation = self._check_violation(metric_type, metric_value, threshold)

        # Create SLA metric record
        sla_metric = SLAMetric(
            miner_id=miner_id,
            metric_type=metric_type,
            metric_value=metric_value,
            threshold=threshold,
            is_violation=is_violation,
            timestamp=datetime.utcnow(),
            meta_data=metadata or {},
        )

        self.db.add(sla_metric)
        await self.db.commit()

        # Create violation record if threshold breached
        if is_violation:
            await self._record_violation(
                miner_id, metric_type, metric_value, threshold, metadata
            )

        logger.info(
            f"Recorded SLA metric: miner={miner_id}, type={metric_type}, "
            f"value={metric_value}, violation={is_violation}"
        )

        return sla_metric

    async def collect_miner_uptime(self, miner_id: str) -> float:
        """Calculate and record miner uptime based on heartbeat recency.

        Returns:
            Uptime percentage in [0, 100]; 0.0 when the miner has no status
            row or has never sent a heartbeat.
        """
        # Get miner status
        stmt = select(MinerStatus).where(MinerStatus.miner_id == miner_id)
        miner_status = (await self.db.execute(stmt)).scalar_one_or_none()

        if not miner_status:
            return 0.0

        # Calculate uptime based on last heartbeat
        if miner_status.last_heartbeat_at:
            time_since_heartbeat = (
                datetime.utcnow() - miner_status.last_heartbeat_at
            ).total_seconds()

            # Consider miner down if no heartbeat for 5 minutes; otherwise
            # decay uptime linearly from 100 at t=0 to 0 at t=300s.
            if time_since_heartbeat > 300:
                uptime_pct = 0.0
            else:
                uptime_pct = 100.0 - (time_since_heartbeat / 300.0) * 100.0
                uptime_pct = max(0.0, min(100.0, uptime_pct))
        else:
            uptime_pct = 0.0

        # Update miner status with uptime. The commit must be awaited like
        # every other session call in this class; the previous un-awaited
        # self.db.commit() never ran against an async session.
        miner_status.uptime_pct = uptime_pct
        await self.db.commit()

        # Record SLA metric
        await self.record_sla_metric(
            miner_id, "uptime_pct", uptime_pct, {"method": "heartbeat_based"}
        )

        return uptime_pct

    async def collect_response_time(self, miner_id: str) -> Optional[float]:
        """Calculate and record the miner's average response time.

        Returns:
            Mean ``eta_ms`` over the 100 most recent match results, or
            ``None`` when there is no usable data.
        """
        # Get recent match results for this miner
        stmt = (
            select(MatchResult)
            .where(MatchResult.miner_id == miner_id)
            .order_by(desc(MatchResult.created_at))
            .limit(100)
        )
        results = (await self.db.execute(stmt)).scalars().all()

        if not results:
            return None

        # Calculate average response time (eta_ms)
        response_times = [r.eta_ms for r in results if r.eta_ms is not None]

        if not response_times:
            return None

        avg_response_time = sum(response_times) / len(response_times)

        # Record SLA metric
        await self.record_sla_metric(
            miner_id,
            "response_time_ms",
            avg_response_time,
            {"method": "match_results", "sample_size": len(response_times)},
        )

        return avg_response_time

    async def collect_completion_rate(self, miner_id: str) -> Optional[float]:
        """Calculate and record the miner's job completion rate from feedback.

        Returns:
            Percentage of "success" outcomes over up to 100 feedback records
            from the last 7 days, or ``None`` when there is no feedback.
        """
        # Get recent feedback for this miner
        stmt = (
            select(Feedback)
            .where(Feedback.miner_id == miner_id)
            .where(Feedback.created_at >= datetime.utcnow() - timedelta(days=7))
            .order_by(Feedback.created_at.desc())
            .limit(100)
        )
        feedback_records = (await self.db.execute(stmt)).scalars().all()

        if not feedback_records:
            return None

        # Calculate completion rate (successful outcomes)
        successful = sum(1 for f in feedback_records if f.outcome == "success")
        completion_rate = (successful / len(feedback_records)) * 100.0

        # Record SLA metric
        await self.record_sla_metric(
            miner_id,
            "completion_rate_pct",
            completion_rate,
            {"method": "feedback", "sample_size": len(feedback_records)},
        )

        return completion_rate

    async def collect_capacity_availability(self) -> Dict[str, Any]:
        """Collect capacity availability across all miners and snapshot it.

        Returns:
            Dict with ``total_miners``, ``active_miners`` (not busy), and
            ``capacity_availability_pct``.
        """
        # Get all miner statuses
        stmt = select(MinerStatus)
        miner_statuses = (await self.db.execute(stmt)).scalars().all()

        if not miner_statuses:
            return {
                "total_miners": 0,
                "active_miners": 0,
                "capacity_availability_pct": 0.0,
            }

        total_miners = len(miner_statuses)
        active_miners = sum(1 for ms in miner_statuses if not ms.busy)
        capacity_availability_pct = (active_miners / total_miners) * 100.0

        # Fetch miners once for the capacity sum instead of nesting the
        # query inside the snapshot constructor.
        miners = (await self.db.execute(select(Miner))).scalars().all()

        # Record capacity snapshot
        snapshot = CapacitySnapshot(
            total_miners=total_miners,
            active_miners=active_miners,
            total_parallel_capacity=sum(m.max_parallel for m in miners),
            total_queue_length=sum(ms.queue_len for ms in miner_statuses),
            capacity_utilization_pct=100.0 - capacity_availability_pct,
            forecast_capacity=total_miners,  # Would be calculated from forecasting
            recommended_scaling="stable",
            scaling_reason="Capacity within normal range",
            timestamp=datetime.utcnow(),
            meta_data={"method": "real_time_collection"},
        )

        self.db.add(snapshot)
        await self.db.commit()

        logger.info(
            f"Capacity snapshot: total={total_miners}, active={active_miners}, "
            f"availability={capacity_availability_pct:.2f}%"
        )

        return {
            "total_miners": total_miners,
            "active_miners": active_miners,
            "capacity_availability_pct": capacity_availability_pct,
        }

    async def collect_all_miner_metrics(self) -> Dict[str, Any]:
        """Collect all SLA metric types for every miner.

        Returns:
            Summary with per-miner metrics, capacity data, and the count of
            unresolved violations created in the last hour.
        """
        # Get all miners. Must be awaited — the previous sync-style
        # self.db.execute(...) returned an un-awaited coroutine here.
        stmt = select(Miner)
        miners = (await self.db.execute(stmt)).scalars().all()

        results = {
            "miners_processed": 0,
            "metrics_collected": [],
            "violations_detected": 0,
        }

        # One miner failing must not abort collection for the rest.
        for miner in miners:
            try:
                # Collect each metric type
                uptime = await self.collect_miner_uptime(miner.miner_id)
                response_time = await self.collect_response_time(miner.miner_id)
                completion_rate = await self.collect_completion_rate(miner.miner_id)

                results["metrics_collected"].append(
                    {
                        "miner_id": miner.miner_id,
                        "uptime_pct": uptime,
                        "response_time_ms": response_time,
                        "completion_rate_pct": completion_rate,
                    }
                )

                results["miners_processed"] += 1

            except Exception as e:
                logger.error(f"Failed to collect metrics for miner {miner.miner_id}: {e}")

        # Collect capacity metrics
        capacity = await self.collect_capacity_availability()
        results["capacity"] = capacity

        # Count unresolved violations created in the last hour
        stmt = (
            select(func.count(SLAViolation.id))
            .where(SLAViolation.resolved_at.is_(None))
            .where(SLAViolation.created_at >= datetime.utcnow() - timedelta(hours=1))
        )
        results["violations_detected"] = (await self.db.execute(stmt)).scalar() or 0

        logger.info(
            f"SLA collection complete: processed={results['miners_processed']}, "
            f"violations={results['violations_detected']}"
        )

        return results

    async def get_sla_metrics(
        self, miner_id: Optional[str] = None, hours: int = 24
    ) -> List[SLAMetric]:
        """Get SLA metrics for a miner (or all miners), newest first.

        Args:
            miner_id: Restrict to one miner when given.
            hours: Trailing window to include.
        """
        cutoff = datetime.utcnow() - timedelta(hours=hours)

        stmt = select(SLAMetric).where(SLAMetric.timestamp >= cutoff)

        if miner_id:
            stmt = stmt.where(SLAMetric.miner_id == miner_id)

        stmt = stmt.order_by(desc(SLAMetric.timestamp))

        return (await self.db.execute(stmt)).scalars().all()

    async def get_sla_violations(
        self, miner_id: Optional[str] = None, resolved: bool = False
    ) -> List[SLAViolation]:
        """Get SLA violations for a miner (or all miners), newest first.

        Args:
            miner_id: Restrict to one miner when given.
            resolved: When True return resolved violations, otherwise open ones.
        """
        stmt = select(SLAViolation)

        if miner_id:
            stmt = stmt.where(SLAViolation.miner_id == miner_id)

        # `is_not` is the SQLAlchemy null-test operator; the previous
        # `isnot_` spelling does not exist and raised AttributeError.
        if resolved:
            stmt = stmt.where(SLAViolation.resolved_at.is_not(None))
        else:
            stmt = stmt.where(SLAViolation.resolved_at.is_(None))

        stmt = stmt.order_by(desc(SLAViolation.created_at))

        return (await self.db.execute(stmt)).scalars().all()

    def _check_violation(self, metric_type: str, value: float, threshold: float) -> bool:
        """Return True when ``value`` violates the SLA threshold.

        Percentage metrics are higher-is-better (violation below threshold);
        response time is lower-is-better (violation above threshold).
        Unknown metric types never violate.
        """
        if metric_type in ["uptime_pct", "completion_rate_pct", "capacity_availability_pct"]:
            # Higher is better - violation if below threshold
            return value < threshold
        elif metric_type in ["response_time_ms"]:
            # Lower is better - violation if above threshold
            return value > threshold

        return False

    async def _record_violation(
        self,
        miner_id: str,
        metric_type: str,
        metric_value: float,
        threshold: float,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> SLAViolation:
        """Persist an SLA violation with a severity derived from the breach size."""
        # Determine severity: percentage metrics are critical below 80% of
        # the threshold; response time is critical above 2x the threshold.
        if metric_type in ["uptime_pct", "completion_rate_pct"]:
            severity = "critical" if metric_value < threshold * 0.8 else "high"
        elif metric_type == "response_time_ms":
            severity = "critical" if metric_value > threshold * 2 else "high"
        else:
            severity = "medium"

        violation = SLAViolation(
            miner_id=miner_id,
            violation_type=metric_type,
            severity=severity,
            metric_value=metric_value,
            threshold=threshold,
            violation_duration_ms=None,  # Will be updated when resolved
            created_at=datetime.utcnow(),
            meta_data=metadata or {},
        )

        self.db.add(violation)
        await self.db.commit()

        logger.warning(
            f"SLA violation recorded: miner={miner_id}, type={metric_type}, "
            f"severity={severity}, value={metric_value}, threshold={threshold}"
        )

        return violation
|
||||
|
||||
|
||||
class SLACollectorScheduler:
    """Scheduler for automated SLA metric collection.

    Runs ``SLACollector.collect_all_miner_metrics`` on a fixed interval in a
    background asyncio task.
    """

    def __init__(self, sla_collector: SLACollector):
        self.sla_collector = sla_collector
        self.logger = logging.getLogger(__name__)
        self.running = False
        # Hold a strong reference to the loop task: the event loop keeps only
        # weak references to tasks, so an unreferenced task can be garbage
        # collected mid-flight. Also lets stop() cancel the loop promptly.
        self._task: Optional[asyncio.Task] = None

    async def start(self, collection_interval_seconds: int = 300):
        """Start the SLA collection scheduler (idempotent).

        Args:
            collection_interval_seconds: Delay between collection passes.
        """
        if self.running:
            return

        self.running = True
        self.logger.info("SLA Collector scheduler started")

        # Start collection loop, keeping the task referenced (see __init__).
        self._task = asyncio.create_task(
            self._collection_loop(collection_interval_seconds)
        )

    async def stop(self):
        """Stop the scheduler, cancelling any in-progress wait."""
        self.running = False
        if self._task is not None:
            # Cancel instead of waiting out the interval sleep before the
            # loop notices ``running`` went False.
            self._task.cancel()
            try:
                await self._task
            except asyncio.CancelledError:
                pass
            self._task = None
        self.logger.info("SLA Collector scheduler stopped")

    async def _collection_loop(self, interval_seconds: int):
        """Background task that collects SLA metrics periodically."""
        while self.running:
            try:
                await self.sla_collector.collect_all_miner_metrics()

                # Wait for next collection interval
                await asyncio.sleep(interval_seconds)

            except asyncio.CancelledError:
                # Propagate cancellation so stop() can await cleanly.
                raise
            except Exception as e:
                self.logger.error(f"Error in SLA collection loop: {e}")
                await asyncio.sleep(60)  # Retry in 1 minute
|
||||
Reference in New Issue
Block a user