feat: add marketplace metrics, privacy features, and service registry endpoints

- Add Prometheus metrics for marketplace API throughput and error rates with new dashboard panels
- Implement confidential transaction models with encryption support and access control
- Add key management system with registration, rotation, and audit logging
- Create services and registry routers for service discovery and management
- Integrate ZK proof generation for privacy-preserving receipts
- Add metrics instrumentation
oib
2025-12-22 10:33:23 +01:00
parent d98b2c7772
commit c8be9d7414
260 changed files with 59033 additions and 351 deletions


@@ -298,6 +298,124 @@
],
"title": "Miner Error Rate",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 16
},
"id": 6,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"expr": "rate(marketplace_requests_total[1m])",
"refId": "A"
}
],
"title": "Marketplace API Throughput",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 5
},
{
"color": "red",
"value": 10
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 16
},
"id": 7,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"expr": "rate(marketplace_errors_total[1m])",
"refId": "A"
}
],
"title": "Marketplace API Error Rate",
"type": "timeseries"
}
],
"refresh": "10s",


@@ -0,0 +1,277 @@
#!/usr/bin/env python3
"""
Blockchain Node Throughput Benchmark
This script simulates sustained load on the blockchain node to measure:
- Transactions per second (TPS)
- Latency percentiles (p50, p95, p99)
- CPU and memory usage
- Queue depth and saturation points
Usage:
python benchmark_throughput.py --concurrent-clients 100 --duration 60 --target-url http://localhost:8080
"""
import asyncio
import aiohttp
import time
import statistics
import psutil
import argparse
import json
from typing import List, Dict, Any, Optional
from dataclasses import dataclass
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
@dataclass
class BenchmarkResult:
"""Results from a benchmark run"""
total_transactions: int
duration: float
tps: float
latency_p50: float
latency_p95: float
latency_p99: float
cpu_usage: float
memory_usage: float
errors: int
class BlockchainBenchmark:
"""Benchmark client for blockchain node"""
def __init__(self, base_url: str):
self.base_url = base_url.rstrip('/')
self.session = None
async def __aenter__(self):
self.session = aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=30))
return self
async def __aexit__(self, exc_type, exc_val, exc_tb):
if self.session:
await self.session.close()
async def submit_transaction(self, payload: Dict[str, Any]) -> Dict[str, Any]:
"""Submit a single transaction"""
start_time = time.time()
try:
async with self.session.post(
f"{self.base_url}/v1/transactions",
json=payload
) as response:
if response.status == 200:
result = await response.json()
latency = (time.time() - start_time) * 1000 # ms
return {"success": True, "latency": latency, "tx_id": result.get("tx_id")}
else:
return {"success": False, "error": f"HTTP {response.status}"}
except Exception as e:
return {"success": False, "error": str(e)}
async def get_block_height(self) -> int:
"""Get current block height"""
try:
async with self.session.get(f"{self.base_url}/v1/blocks/head") as response:
if response.status == 200:
data = await response.json()
return data.get("height", 0)
except Exception:
pass
return 0
def generate_test_transaction(i: int) -> Dict[str, Any]:
"""Generate a test transaction"""
return {
"from": f"0xtest_sender_{i % 100:040x}",
"to": f"0xtest_receiver_{i % 50:040x}",
"value": str((i + 1) * 1000),
"nonce": i,
"data": f"0x{hash(i) % 1000000:06x}",
"gas_limit": 21000,
"gas_price": "1000000000" # 1 gwei
}
async def worker_task(
benchmark: BlockchainBenchmark,
worker_id: int,
transactions_per_worker: int,
results: List[Dict[str, Any]]
) -> None:
"""Worker task that submits transactions"""
logger.info(f"Worker {worker_id} starting")
for i in range(transactions_per_worker):
tx = generate_test_transaction(worker_id * transactions_per_worker + i)
result = await benchmark.submit_transaction(tx)
results.append(result)
if not result["success"]:
logger.warning(f"Worker {worker_id} transaction failed: {result.get('error', 'unknown')}")
logger.info(f"Worker {worker_id} completed")
async def run_benchmark(
base_url: str,
concurrent_clients: int,
duration: int,
target_tps: Optional[int] = None
) -> BenchmarkResult:
"""Run the benchmark"""
logger.info(f"Starting benchmark: {concurrent_clients} concurrent clients for {duration}s")
# Start resource monitoring
process = psutil.Process()
cpu_samples = []
memory_samples = []
async def monitor_resources():
while True:
            # NOTE: psutil here samples the benchmark client process itself; to
            # profile the node, attach psutil.Process(<node pid>) instead
            cpu_samples.append(process.cpu_percent())
            memory_samples.append(process.memory_info().rss / 1024 / 1024)  # MB
await asyncio.sleep(1)
# Calculate transactions needed
if target_tps:
total_transactions = target_tps * duration
else:
total_transactions = concurrent_clients * 100 # Default: 100 tx per client
transactions_per_worker = total_transactions // concurrent_clients
results = []
async with BlockchainBenchmark(base_url) as benchmark:
# Start resource monitor
monitor_task = asyncio.create_task(monitor_resources())
# Record start block height
start_height = await benchmark.get_block_height()
# Start benchmark
start_time = time.time()
# Create worker tasks
        tasks = [
            asyncio.create_task(worker_task(benchmark, i, transactions_per_worker, results))
            for i in range(concurrent_clients)
        ]
        # Wait for all tasks to complete or timeout; create_task wrapping makes
        # the workers real Tasks, so cancel() below is valid on timeout
        try:
            await asyncio.wait_for(asyncio.gather(*tasks), timeout=duration)
        except asyncio.TimeoutError:
            logger.warning("Benchmark timed out")
            for task in tasks:
                task.cancel()
end_time = time.time()
actual_duration = end_time - start_time
# Stop resource monitor
monitor_task.cancel()
# Get final block height
end_height = await benchmark.get_block_height()
# Calculate metrics
successful_tx = [r for r in results if r["success"]]
latencies = [r["latency"] for r in successful_tx if "latency" in r]
if latencies:
latency_p50 = statistics.median(latencies)
latency_p95 = statistics.quantiles(latencies, n=20)[18] # 95th percentile
latency_p99 = statistics.quantiles(latencies, n=100)[98] # 99th percentile
else:
latency_p50 = latency_p95 = latency_p99 = 0
tps = len(successful_tx) / actual_duration if actual_duration > 0 else 0
avg_cpu = statistics.mean(cpu_samples) if cpu_samples else 0
avg_memory = statistics.mean(memory_samples) if memory_samples else 0
errors = len(results) - len(successful_tx)
logger.info(f"Benchmark completed:")
logger.info(f" Duration: {actual_duration:.2f}s")
logger.info(f" Transactions: {len(successful_tx)} successful, {errors} failed")
logger.info(f" TPS: {tps:.2f}")
logger.info(f" Latency p50/p95/p99: {latency_p50:.2f}/{latency_p95:.2f}/{latency_p99:.2f}ms")
logger.info(f" CPU Usage: {avg_cpu:.1f}%")
logger.info(f" Memory Usage: {avg_memory:.1f}MB")
logger.info(f" Blocks processed: {end_height - start_height}")
return BenchmarkResult(
total_transactions=len(successful_tx),
duration=actual_duration,
tps=tps,
latency_p50=latency_p50,
latency_p95=latency_p95,
latency_p99=latency_p99,
cpu_usage=avg_cpu,
memory_usage=avg_memory,
errors=errors
)
async def main():
parser = argparse.ArgumentParser(description="Blockchain Node Throughput Benchmark")
parser.add_argument("--target-url", default="http://localhost:8080",
help="Blockchain node RPC URL")
parser.add_argument("--concurrent-clients", type=int, default=50,
help="Number of concurrent client connections")
parser.add_argument("--duration", type=int, default=60,
help="Benchmark duration in seconds")
parser.add_argument("--target-tps", type=int,
help="Target TPS to achieve (calculates transaction count)")
parser.add_argument("--output", help="Output results to JSON file")
args = parser.parse_args()
# Run benchmark
result = await run_benchmark(
base_url=args.target_url,
concurrent_clients=args.concurrent_clients,
duration=args.duration,
target_tps=args.target_tps
)
# Output results
if args.output:
with open(args.output, "w") as f:
json.dump({
"total_transactions": result.total_transactions,
"duration": result.duration,
"tps": result.tps,
"latency_p50": result.latency_p50,
"latency_p95": result.latency_p95,
"latency_p99": result.latency_p99,
"cpu_usage": result.cpu_usage,
"memory_usage": result.memory_usage,
"errors": result.errors
}, f, indent=2)
logger.info(f"Results saved to {args.output}")
# Provide scaling recommendations
logger.info("\n=== Scaling Recommendations ===")
if result.tps < 100:
logger.info("• Low TPS detected. Consider optimizing transaction processing")
if result.latency_p95 > 1000:
logger.info("• High latency detected. Consider increasing resources or optimizing database queries")
if result.cpu_usage > 80:
logger.info("• High CPU usage. Horizontal scaling recommended")
if result.memory_usage > 1024:
logger.info("• High memory usage. Monitor for memory leaks")
logger.info(f"\nRecommended minimum resources for current load:")
logger.info(f"• CPU: {result.cpu_usage * 1.5:.0f}% (with headroom)")
logger.info(f"• Memory: {result.memory_usage * 1.5:.0f}MB (with headroom)")
logger.info(f"• Horizontal scaling threshold: ~{result.tps * 0.7:.0f} TPS per node")
if __name__ == "__main__":
asyncio.run(main())
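
Because `run_benchmark` is an importable coroutine, saturation points can also be found by sweeping concurrency levels programmatically. A small sketch, assuming the script is saved as `benchmark_throughput.py` (per its usage string) and a node listening on the default URL:

```python
# Hedged sketch: sweep concurrency levels by reusing run_benchmark directly.
# Assumes the benchmark script above is saved as benchmark_throughput.py.
import asyncio
from benchmark_throughput import run_benchmark

async def sweep():
    for clients in (10, 50, 100, 200):
        result = await run_benchmark(
            base_url="http://localhost:8080",
            concurrent_clients=clients,
            duration=60,
        )
        print(f"{clients:>4} clients -> {result.tps:8.1f} TPS, "
              f"p95 {result.latency_p95:7.1f} ms, errors {result.errors}")

if __name__ == "__main__":
    asyncio.run(sweep())
```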


@@ -0,0 +1,279 @@
#!/usr/bin/env python3
"""
Autoscaling Validation Script
This script generates synthetic traffic to test and validate HPA behavior.
It monitors pod counts and metrics while generating load to ensure autoscaling works as expected.
Usage:
python test_autoscaling.py --service coordinator --namespace default --target-url http://localhost:8011 --duration 300
"""
import asyncio
import aiohttp
import time
import argparse
import logging
import json
from typing import List, Dict, Any
from datetime import datetime
import subprocess
import sys
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
class AutoscalingTest:
"""Test suite for validating autoscaling behavior"""
def __init__(self, service_name: str, namespace: str, target_url: str):
self.service_name = service_name
self.namespace = namespace
self.target_url = target_url
self.session = None
async def __aenter__(self):
self.session = aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=30))
return self
async def __aexit__(self, exc_type, exc_val, exc_tb):
if self.session:
await self.session.close()
async def get_pod_count(self) -> int:
"""Get current number of pods for the service"""
cmd = [
"kubectl", "get", "pods",
"-n", self.namespace,
"-l", f"app.kubernetes.io/name={self.service_name}",
"-o", "jsonpath='{.items[*].status.phase}'"
]
try:
result = subprocess.run(cmd, capture_output=True, text=True, check=True)
# Count Running pods
phases = result.stdout.strip().strip("'").split()
return len([p for p in phases if p == "Running"])
except subprocess.CalledProcessError as e:
logger.error(f"Failed to get pod count: {e}")
return 0
async def get_hpa_status(self) -> Dict[str, Any]:
"""Get current HPA status"""
cmd = [
"kubectl", "get", "hpa",
"-n", self.namespace,
f"{self.service_name}",
"-o", "json"
]
try:
result = subprocess.run(cmd, capture_output=True, text=True, check=True)
data = json.loads(result.stdout)
            # Extract target CPU from the autoscaling/v2 metrics spec before returning
            target_cpu = None
            for metric in data["spec"].get("metrics", []):
                if metric["type"] == "Resource" and metric["resource"]["name"] == "cpu":
                    target_cpu = metric["resource"]["target"]["averageUtilization"]
                    break
            return {
                "min_replicas": data["spec"]["minReplicas"],
                "max_replicas": data["spec"]["maxReplicas"],
                "current_replicas": data["status"]["currentReplicas"],
                "desired_replicas": data["status"]["desiredReplicas"],
                "current_cpu": data["status"].get("currentCPUUtilizationPercentage"),
                "target_cpu": target_cpu
            }
except subprocess.CalledProcessError as e:
logger.error(f"Failed to get HPA status: {e}")
return {}
async def generate_load(self, duration: int, concurrent_requests: int = 50):
"""Generate sustained load on the service"""
logger.info(f"Generating load for {duration}s with {concurrent_requests} concurrent requests")
async def make_request():
try:
if self.service_name == "coordinator":
# Test marketplace endpoints
endpoints = [
"/v1/marketplace/offers",
"/v1/marketplace/stats"
]
endpoint = endpoints[hash(time.time()) % len(endpoints)]
async with self.session.get(f"{self.target_url}{endpoint}") as response:
return response.status == 200
elif self.service_name == "blockchain-node":
# Test blockchain endpoints
payload = {
"from": "0xtest_sender",
"to": "0xtest_receiver",
"value": "1000",
"nonce": int(time.time()),
"data": "0x",
"gas_limit": 21000,
"gas_price": "1000000000"
}
async with self.session.post(f"{self.target_url}/v1/transactions", json=payload) as response:
return response.status == 200
else:
# Generic health check
async with self.session.get(f"{self.target_url}/v1/health") as response:
return response.status == 200
except Exception as e:
logger.debug(f"Request failed: {e}")
return False
# Generate sustained load
        start_time = time.time()
        while time.time() - start_time < duration:
            # Create a batch of concurrent requests and wait for it to complete;
            # gather awaits each coroutine exactly once, so no task list is kept
            batch = [make_request() for _ in range(concurrent_requests)]
            await asyncio.gather(*batch, return_exceptions=True)
            # Brief pause between batches
            await asyncio.sleep(0.1)
        logger.info("Load generation completed")
async def monitor_scaling(self, duration: int, interval: int = 10):
"""Monitor pod scaling during load test"""
logger.info(f"Monitoring scaling for {duration}s")
results = []
start_time = time.time()
while time.time() - start_time < duration:
timestamp = datetime.now().isoformat()
pod_count = await self.get_pod_count()
hpa_status = await self.get_hpa_status()
result = {
"timestamp": timestamp,
"pod_count": pod_count,
"hpa_status": hpa_status
}
results.append(result)
logger.info(f"[{timestamp}] Pods: {pod_count}, HPA: {hpa_status}")
await asyncio.sleep(interval)
return results
async def run_test(self, load_duration: int = 300, monitor_duration: int = 400):
"""Run complete autoscaling test"""
logger.info(f"Starting autoscaling test for {self.service_name}")
# Record initial state
initial_pods = await self.get_pod_count()
initial_hpa = await self.get_hpa_status()
logger.info(f"Initial state - Pods: {initial_pods}, HPA: {initial_hpa}")
# Start monitoring in background
monitor_task = asyncio.create_task(
self.monitor_scaling(monitor_duration)
)
# Wait a bit to establish baseline
await asyncio.sleep(30)
# Generate load
await self.generate_load(load_duration)
# Wait for scaling to stabilize
await asyncio.sleep(60)
# Get monitoring results
monitoring_results = await monitor_task
# Analyze results
max_pods = max(r["pod_count"] for r in monitoring_results)
min_pods = min(r["pod_count"] for r in monitoring_results)
scaled_up = max_pods > initial_pods
logger.info("\n=== Test Results ===")
logger.info(f"Initial pods: {initial_pods}")
logger.info(f"Min pods during test: {min_pods}")
logger.info(f"Max pods during test: {max_pods}")
logger.info(f"Scaling occurred: {scaled_up}")
if scaled_up:
logger.info("✅ Autoscaling test PASSED - Service scaled up under load")
else:
logger.warning("⚠️ Autoscaling test FAILED - Service did not scale up")
logger.warning("Check:")
logger.warning(" - HPA configuration")
logger.warning(" - Metrics server is running")
logger.warning(" - Resource requests/limits are set")
logger.warning(" - Load was sufficient to trigger scaling")
# Save results
results_file = f"autoscaling_test_{self.service_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
with open(results_file, "w") as f:
json.dump({
"service": self.service_name,
"namespace": self.namespace,
"initial_pods": initial_pods,
"max_pods": max_pods,
"min_pods": min_pods,
"scaled_up": scaled_up,
"monitoring_data": monitoring_results
}, f, indent=2)
logger.info(f"Detailed results saved to: {results_file}")
return scaled_up
async def main():
parser = argparse.ArgumentParser(description="Autoscaling Validation Test")
parser.add_argument("--service", required=True,
choices=["coordinator", "blockchain-node", "wallet-daemon"],
help="Service to test")
parser.add_argument("--namespace", default="default",
help="Kubernetes namespace")
parser.add_argument("--target-url", required=True,
help="Service URL to generate load against")
parser.add_argument("--load-duration", type=int, default=300,
help="Duration of load generation in seconds")
parser.add_argument("--monitor-duration", type=int, default=400,
help="Total monitoring duration in seconds")
parser.add_argument("--local-mode", action="store_true",
help="Run in local mode without Kubernetes (load test only)")
args = parser.parse_args()
if not args.local_mode:
# Verify kubectl is available
try:
            # --client checks the binary without requiring a reachable API server
            subprocess.run(["kubectl", "version", "--client"], capture_output=True, check=True)
except (subprocess.CalledProcessError, FileNotFoundError):
logger.error("kubectl is not available or not configured")
logger.info("Use --local-mode to run load test without Kubernetes monitoring")
sys.exit(1)
# Run test
async with AutoscalingTest(args.service, args.namespace, args.target_url) as test:
if args.local_mode:
# Local mode: just test load generation
logger.info(f"Running load test for {args.service} in local mode")
await test.generate_load(args.load_duration)
logger.info("Load test completed successfully")
success = True
else:
# Full autoscaling test
success = await test.run_test(args.load_duration, args.monitor_duration)
sys.exit(0 if success else 1)
if __name__ == "__main__":
asyncio.run(main())
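
`get_hpa_status()` parses the JSON that `kubectl get hpa <name> -o json` returns for an autoscaling/v2 HPA. A trimmed sketch of the fields it reads, with illustrative values (not taken from the repository's manifests):

```python
# Hedged sketch: the subset of `kubectl get hpa -o json` output consumed by
# get_hpa_status(). Field paths follow autoscaling/v2; numbers are illustrative.
hpa_json = {
    "spec": {
        "minReplicas": 2,
        "maxReplicas": 10,
        "metrics": [
            {
                "type": "Resource",
                "resource": {
                    "name": "cpu",
                    "target": {"type": "Utilization", "averageUtilization": 70},
                },
            }
        ],
    },
    "status": {
        "currentReplicas": 2,
        "desiredReplicas": 2,
        # "currentCPUUtilizationPercentage" exists only in autoscaling/v1
        # responses, which is why the script reads it with .get()
    },
}
```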


@@ -15,7 +15,7 @@ class ChainSettings(BaseSettings):
rpc_bind_host: str = "127.0.0.1"
rpc_bind_port: int = 8080
-p2p_bind_host: str = "0.0.0.0"
+p2p_bind_host: str = "127.0.0.2"
p2p_bind_port: int = 7070
proposer_id: str = "ait-devnet-proposer"
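
Since `ChainSettings` extends pydantic's `BaseSettings`, the new loopback default can still be overridden per deployment through the environment rather than a code change. A standalone sketch, assuming pydantic's default env-var mapping with no prefix (the class's env configuration is not shown in this hunk):

```python
# Hedged illustration: BaseSettings resolves fields from the environment before
# falling back to coded defaults. The real settings module path is not shown in
# the diff, so an analogous class is defined inline here.
import os
from pydantic_settings import BaseSettings  # on pydantic v1: from pydantic import BaseSettings

class ChainSettings(BaseSettings):
    p2p_bind_host: str = "127.0.0.2"
    p2p_bind_port: int = 7070

os.environ["P2P_BIND_HOST"] = "0.0.0.0"  # e.g. a node that must accept external peers
print(ChainSettings().p2p_bind_host)  # -> 0.0.0.0
```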