feat: add marketplace metrics, privacy features, and service registry endpoints

- Add Prometheus metrics for marketplace API throughput and error rates with new dashboard panels
- Implement confidential transaction models with encryption support and access control
- Add key management system with registration, rotation, and audit logging
- Create services and registry routers for service discovery and management
- Integrate ZK proof generation for privacy-preserving receipts
- Add metrics instrumentation
oib
2025-12-22 10:33:23 +01:00
parent d98b2c7772
commit c8be9d7414
260 changed files with 59033 additions and 351 deletions


@@ -298,6 +298,124 @@
],
"title": "Miner Error Rate",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 16
},
"id": 6,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"expr": "rate(marketplace_requests_total[1m])",
"refId": "A"
}
],
"title": "Marketplace API Throughput",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 5
},
{
"color": "red",
"value": 10
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 16
},
"id": 7,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"expr": "rate(marketplace_errors_total[1m])",
"refId": "A"
}
],
"title": "Marketplace API Error Rate",
"type": "timeseries"
}
],
"refresh": "10s",


@@ -0,0 +1,277 @@
#!/usr/bin/env python3
"""
Blockchain Node Throughput Benchmark
This script simulates sustained load on the blockchain node to measure:
- Transactions per second (TPS)
- Latency percentiles (p50, p95, p99)
- CPU and memory usage
- Queue depth and saturation points
Usage:
python benchmark_throughput.py --concurrent-clients 100 --duration 60 --target-url http://localhost:8080
"""
import asyncio
import aiohttp
import time
import statistics
import psutil
import argparse
import json
from typing import List, Dict, Any, Optional
from dataclasses import dataclass
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
@dataclass
class BenchmarkResult:
"""Results from a benchmark run"""
total_transactions: int
duration: float
tps: float
latency_p50: float
latency_p95: float
latency_p99: float
cpu_usage: float
memory_usage: float
errors: int
class BlockchainBenchmark:
"""Benchmark client for blockchain node"""
def __init__(self, base_url: str):
self.base_url = base_url.rstrip('/')
self.session = None
async def __aenter__(self):
self.session = aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=30))
return self
async def __aexit__(self, exc_type, exc_val, exc_tb):
if self.session:
await self.session.close()
async def submit_transaction(self, payload: Dict[str, Any]) -> Dict[str, Any]:
"""Submit a single transaction"""
start_time = time.time()
try:
async with self.session.post(
f"{self.base_url}/v1/transactions",
json=payload
) as response:
if response.status == 200:
result = await response.json()
latency = (time.time() - start_time) * 1000 # ms
return {"success": True, "latency": latency, "tx_id": result.get("tx_id")}
else:
return {"success": False, "error": f"HTTP {response.status}"}
except Exception as e:
return {"success": False, "error": str(e)}
async def get_block_height(self) -> int:
"""Get current block height"""
try:
async with self.session.get(f"{self.base_url}/v1/blocks/head") as response:
if response.status == 200:
data = await response.json()
return data.get("height", 0)
except Exception:
pass
return 0
def generate_test_transaction(i: int) -> Dict[str, Any]:
"""Generate a test transaction"""
return {
"from": f"0xtest_sender_{i % 100:040x}",
"to": f"0xtest_receiver_{i % 50:040x}",
"value": str((i + 1) * 1000),
"nonce": i,
"data": f"0x{hash(i) % 1000000:06x}",
"gas_limit": 21000,
"gas_price": "1000000000" # 1 gwei
}
async def worker_task(
benchmark: BlockchainBenchmark,
worker_id: int,
transactions_per_worker: int,
results: List[Dict[str, Any]]
) -> None:
"""Worker task that submits transactions"""
logger.info(f"Worker {worker_id} starting")
for i in range(transactions_per_worker):
tx = generate_test_transaction(worker_id * transactions_per_worker + i)
result = await benchmark.submit_transaction(tx)
results.append(result)
if not result["success"]:
logger.warning(f"Worker {worker_id} transaction failed: {result.get('error', 'unknown')}")
logger.info(f"Worker {worker_id} completed")
async def run_benchmark(
base_url: str,
concurrent_clients: int,
duration: int,
target_tps: Optional[int] = None
) -> BenchmarkResult:
"""Run the benchmark"""
logger.info(f"Starting benchmark: {concurrent_clients} concurrent clients for {duration}s")
# Start resource monitoring
process = psutil.Process()
cpu_samples = []
memory_samples = []
async def monitor_resources():
while True:
            # NOTE: psutil here samples the benchmark client process itself; to
            # profile the node, attach psutil.Process(<node pid>) instead
            cpu_samples.append(process.cpu_percent())
            memory_samples.append(process.memory_info().rss / 1024 / 1024)  # MB
await asyncio.sleep(1)
# Calculate transactions needed
if target_tps:
total_transactions = target_tps * duration
else:
total_transactions = concurrent_clients * 100 # Default: 100 tx per client
transactions_per_worker = total_transactions // concurrent_clients
results = []
async with BlockchainBenchmark(base_url) as benchmark:
# Start resource monitor
monitor_task = asyncio.create_task(monitor_resources())
# Record start block height
start_height = await benchmark.get_block_height()
# Start benchmark
start_time = time.time()
# Create worker tasks
        tasks = [
            asyncio.create_task(worker_task(benchmark, i, transactions_per_worker, results))
            for i in range(concurrent_clients)
        ]
        # Wait for all tasks to complete or timeout; create_task wrapping makes
        # the workers real Tasks, so cancel() below is valid on timeout
        try:
            await asyncio.wait_for(asyncio.gather(*tasks), timeout=duration)
        except asyncio.TimeoutError:
            logger.warning("Benchmark timed out")
            for task in tasks:
                task.cancel()
end_time = time.time()
actual_duration = end_time - start_time
# Stop resource monitor
monitor_task.cancel()
# Get final block height
end_height = await benchmark.get_block_height()
# Calculate metrics
successful_tx = [r for r in results if r["success"]]
latencies = [r["latency"] for r in successful_tx if "latency" in r]
if latencies:
latency_p50 = statistics.median(latencies)
latency_p95 = statistics.quantiles(latencies, n=20)[18] # 95th percentile
latency_p99 = statistics.quantiles(latencies, n=100)[98] # 99th percentile
else:
latency_p50 = latency_p95 = latency_p99 = 0
tps = len(successful_tx) / actual_duration if actual_duration > 0 else 0
avg_cpu = statistics.mean(cpu_samples) if cpu_samples else 0
avg_memory = statistics.mean(memory_samples) if memory_samples else 0
errors = len(results) - len(successful_tx)
logger.info(f"Benchmark completed:")
logger.info(f" Duration: {actual_duration:.2f}s")
logger.info(f" Transactions: {len(successful_tx)} successful, {errors} failed")
logger.info(f" TPS: {tps:.2f}")
logger.info(f" Latency p50/p95/p99: {latency_p50:.2f}/{latency_p95:.2f}/{latency_p99:.2f}ms")
logger.info(f" CPU Usage: {avg_cpu:.1f}%")
logger.info(f" Memory Usage: {avg_memory:.1f}MB")
logger.info(f" Blocks processed: {end_height - start_height}")
return BenchmarkResult(
total_transactions=len(successful_tx),
duration=actual_duration,
tps=tps,
latency_p50=latency_p50,
latency_p95=latency_p95,
latency_p99=latency_p99,
cpu_usage=avg_cpu,
memory_usage=avg_memory,
errors=errors
)
async def main():
parser = argparse.ArgumentParser(description="Blockchain Node Throughput Benchmark")
parser.add_argument("--target-url", default="http://localhost:8080",
help="Blockchain node RPC URL")
parser.add_argument("--concurrent-clients", type=int, default=50,
help="Number of concurrent client connections")
parser.add_argument("--duration", type=int, default=60,
help="Benchmark duration in seconds")
parser.add_argument("--target-tps", type=int,
help="Target TPS to achieve (calculates transaction count)")
parser.add_argument("--output", help="Output results to JSON file")
args = parser.parse_args()
# Run benchmark
result = await run_benchmark(
base_url=args.target_url,
concurrent_clients=args.concurrent_clients,
duration=args.duration,
target_tps=args.target_tps
)
# Output results
if args.output:
with open(args.output, "w") as f:
json.dump({
"total_transactions": result.total_transactions,
"duration": result.duration,
"tps": result.tps,
"latency_p50": result.latency_p50,
"latency_p95": result.latency_p95,
"latency_p99": result.latency_p99,
"cpu_usage": result.cpu_usage,
"memory_usage": result.memory_usage,
"errors": result.errors
}, f, indent=2)
logger.info(f"Results saved to {args.output}")
# Provide scaling recommendations
logger.info("\n=== Scaling Recommendations ===")
if result.tps < 100:
logger.info("• Low TPS detected. Consider optimizing transaction processing")
if result.latency_p95 > 1000:
logger.info("• High latency detected. Consider increasing resources or optimizing database queries")
if result.cpu_usage > 80:
logger.info("• High CPU usage. Horizontal scaling recommended")
if result.memory_usage > 1024:
logger.info("• High memory usage. Monitor for memory leaks")
logger.info(f"\nRecommended minimum resources for current load:")
logger.info(f"• CPU: {result.cpu_usage * 1.5:.0f}% (with headroom)")
logger.info(f"• Memory: {result.memory_usage * 1.5:.0f}MB (with headroom)")
logger.info(f"• Horizontal scaling threshold: ~{result.tps * 0.7:.0f} TPS per node")
if __name__ == "__main__":
asyncio.run(main())
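
Because `run_benchmark` is an importable coroutine, saturation points can also be found by sweeping concurrency levels programmatically. A small sketch, assuming the script is saved as `benchmark_throughput.py` (per its usage string) and a node listening on the default URL:

```python
# Hedged sketch: sweep concurrency levels by reusing run_benchmark directly.
# Assumes the benchmark script above is saved as benchmark_throughput.py.
import asyncio
from benchmark_throughput import run_benchmark

async def sweep():
    for clients in (10, 50, 100, 200):
        result = await run_benchmark(
            base_url="http://localhost:8080",
            concurrent_clients=clients,
            duration=60,
        )
        print(f"{clients:>4} clients -> {result.tps:8.1f} TPS, "
              f"p95 {result.latency_p95:7.1f} ms, errors {result.errors}")

if __name__ == "__main__":
    asyncio.run(sweep())
```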


@@ -0,0 +1,279 @@
#!/usr/bin/env python3
"""
Autoscaling Validation Script
This script generates synthetic traffic to test and validate HPA behavior.
It monitors pod counts and metrics while generating load to ensure autoscaling works as expected.
Usage:
python test_autoscaling.py --service coordinator --namespace default --target-url http://localhost:8011 --duration 300
"""
import asyncio
import aiohttp
import time
import argparse
import logging
import json
from typing import List, Dict, Any
from datetime import datetime
import subprocess
import sys
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
class AutoscalingTest:
"""Test suite for validating autoscaling behavior"""
def __init__(self, service_name: str, namespace: str, target_url: str):
self.service_name = service_name
self.namespace = namespace
self.target_url = target_url
self.session = None
async def __aenter__(self):
self.session = aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=30))
return self
async def __aexit__(self, exc_type, exc_val, exc_tb):
if self.session:
await self.session.close()
async def get_pod_count(self) -> int:
"""Get current number of pods for the service"""
cmd = [
"kubectl", "get", "pods",
"-n", self.namespace,
"-l", f"app.kubernetes.io/name={self.service_name}",
"-o", "jsonpath='{.items[*].status.phase}'"
]
try:
result = subprocess.run(cmd, capture_output=True, text=True, check=True)
# Count Running pods
phases = result.stdout.strip().strip("'").split()
return len([p for p in phases if p == "Running"])
except subprocess.CalledProcessError as e:
logger.error(f"Failed to get pod count: {e}")
return 0
async def get_hpa_status(self) -> Dict[str, Any]:
"""Get current HPA status"""
cmd = [
"kubectl", "get", "hpa",
"-n", self.namespace,
f"{self.service_name}",
"-o", "json"
]
try:
result = subprocess.run(cmd, capture_output=True, text=True, check=True)
data = json.loads(result.stdout)
            # Extract target CPU from the autoscaling/v2 metrics spec before returning
            target_cpu = None
            for metric in data["spec"].get("metrics", []):
                if metric["type"] == "Resource" and metric["resource"]["name"] == "cpu":
                    target_cpu = metric["resource"]["target"]["averageUtilization"]
                    break
            return {
                "min_replicas": data["spec"]["minReplicas"],
                "max_replicas": data["spec"]["maxReplicas"],
                "current_replicas": data["status"]["currentReplicas"],
                "desired_replicas": data["status"]["desiredReplicas"],
                "current_cpu": data["status"].get("currentCPUUtilizationPercentage"),
                "target_cpu": target_cpu
            }
except subprocess.CalledProcessError as e:
logger.error(f"Failed to get HPA status: {e}")
return {}
async def generate_load(self, duration: int, concurrent_requests: int = 50):
"""Generate sustained load on the service"""
logger.info(f"Generating load for {duration}s with {concurrent_requests} concurrent requests")
async def make_request():
try:
if self.service_name == "coordinator":
# Test marketplace endpoints
endpoints = [
"/v1/marketplace/offers",
"/v1/marketplace/stats"
]
endpoint = endpoints[hash(time.time()) % len(endpoints)]
async with self.session.get(f"{self.target_url}{endpoint}") as response:
return response.status == 200
elif self.service_name == "blockchain-node":
# Test blockchain endpoints
payload = {
"from": "0xtest_sender",
"to": "0xtest_receiver",
"value": "1000",
"nonce": int(time.time()),
"data": "0x",
"gas_limit": 21000,
"gas_price": "1000000000"
}
async with self.session.post(f"{self.target_url}/v1/transactions", json=payload) as response:
return response.status == 200
else:
# Generic health check
async with self.session.get(f"{self.target_url}/v1/health") as response:
return response.status == 200
except Exception as e:
logger.debug(f"Request failed: {e}")
return False
# Generate sustained load
        start_time = time.time()
        while time.time() - start_time < duration:
            # Create a batch of concurrent requests and wait for it to complete;
            # gather awaits each coroutine exactly once, so no task list is kept
            batch = [make_request() for _ in range(concurrent_requests)]
            await asyncio.gather(*batch, return_exceptions=True)
            # Brief pause between batches
            await asyncio.sleep(0.1)
        logger.info("Load generation completed")
async def monitor_scaling(self, duration: int, interval: int = 10):
"""Monitor pod scaling during load test"""
logger.info(f"Monitoring scaling for {duration}s")
results = []
start_time = time.time()
while time.time() - start_time < duration:
timestamp = datetime.now().isoformat()
pod_count = await self.get_pod_count()
hpa_status = await self.get_hpa_status()
result = {
"timestamp": timestamp,
"pod_count": pod_count,
"hpa_status": hpa_status
}
results.append(result)
logger.info(f"[{timestamp}] Pods: {pod_count}, HPA: {hpa_status}")
await asyncio.sleep(interval)
return results
async def run_test(self, load_duration: int = 300, monitor_duration: int = 400):
"""Run complete autoscaling test"""
logger.info(f"Starting autoscaling test for {self.service_name}")
# Record initial state
initial_pods = await self.get_pod_count()
initial_hpa = await self.get_hpa_status()
logger.info(f"Initial state - Pods: {initial_pods}, HPA: {initial_hpa}")
# Start monitoring in background
monitor_task = asyncio.create_task(
self.monitor_scaling(monitor_duration)
)
# Wait a bit to establish baseline
await asyncio.sleep(30)
# Generate load
await self.generate_load(load_duration)
# Wait for scaling to stabilize
await asyncio.sleep(60)
# Get monitoring results
monitoring_results = await monitor_task
# Analyze results
max_pods = max(r["pod_count"] for r in monitoring_results)
min_pods = min(r["pod_count"] for r in monitoring_results)
scaled_up = max_pods > initial_pods
logger.info("\n=== Test Results ===")
logger.info(f"Initial pods: {initial_pods}")
logger.info(f"Min pods during test: {min_pods}")
logger.info(f"Max pods during test: {max_pods}")
logger.info(f"Scaling occurred: {scaled_up}")
if scaled_up:
logger.info("✅ Autoscaling test PASSED - Service scaled up under load")
else:
logger.warning("⚠️ Autoscaling test FAILED - Service did not scale up")
logger.warning("Check:")
logger.warning(" - HPA configuration")
logger.warning(" - Metrics server is running")
logger.warning(" - Resource requests/limits are set")
logger.warning(" - Load was sufficient to trigger scaling")
# Save results
results_file = f"autoscaling_test_{self.service_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
with open(results_file, "w") as f:
json.dump({
"service": self.service_name,
"namespace": self.namespace,
"initial_pods": initial_pods,
"max_pods": max_pods,
"min_pods": min_pods,
"scaled_up": scaled_up,
"monitoring_data": monitoring_results
}, f, indent=2)
logger.info(f"Detailed results saved to: {results_file}")
return scaled_up
async def main():
parser = argparse.ArgumentParser(description="Autoscaling Validation Test")
parser.add_argument("--service", required=True,
choices=["coordinator", "blockchain-node", "wallet-daemon"],
help="Service to test")
parser.add_argument("--namespace", default="default",
help="Kubernetes namespace")
parser.add_argument("--target-url", required=True,
help="Service URL to generate load against")
parser.add_argument("--load-duration", type=int, default=300,
help="Duration of load generation in seconds")
parser.add_argument("--monitor-duration", type=int, default=400,
help="Total monitoring duration in seconds")
parser.add_argument("--local-mode", action="store_true",
help="Run in local mode without Kubernetes (load test only)")
args = parser.parse_args()
if not args.local_mode:
# Verify kubectl is available
try:
            # --client checks the binary without requiring a reachable API server
            subprocess.run(["kubectl", "version", "--client"], capture_output=True, check=True)
except (subprocess.CalledProcessError, FileNotFoundError):
logger.error("kubectl is not available or not configured")
logger.info("Use --local-mode to run load test without Kubernetes monitoring")
sys.exit(1)
# Run test
async with AutoscalingTest(args.service, args.namespace, args.target_url) as test:
if args.local_mode:
# Local mode: just test load generation
logger.info(f"Running load test for {args.service} in local mode")
await test.generate_load(args.load_duration)
logger.info("Load test completed successfully")
success = True
else:
# Full autoscaling test
success = await test.run_test(args.load_duration, args.monitor_duration)
sys.exit(0 if success else 1)
if __name__ == "__main__":
asyncio.run(main())
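
`get_hpa_status()` parses the JSON that `kubectl get hpa <name> -o json` returns for an autoscaling/v2 HPA. A trimmed sketch of the fields it reads, with illustrative values (not taken from the repository's manifests):

```python
# Hedged sketch: the subset of `kubectl get hpa -o json` output consumed by
# get_hpa_status(). Field paths follow autoscaling/v2; numbers are illustrative.
hpa_json = {
    "spec": {
        "minReplicas": 2,
        "maxReplicas": 10,
        "metrics": [
            {
                "type": "Resource",
                "resource": {
                    "name": "cpu",
                    "target": {"type": "Utilization", "averageUtilization": 70},
                },
            }
        ],
    },
    "status": {
        "currentReplicas": 2,
        "desiredReplicas": 2,
        # "currentCPUUtilizationPercentage" exists only in autoscaling/v1
        # responses, which is why the script reads it with .get()
    },
}
```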


@@ -15,7 +15,7 @@ class ChainSettings(BaseSettings):
rpc_bind_host: str = "127.0.0.1"
rpc_bind_port: int = 8080
-p2p_bind_host: str = "0.0.0.0"
+p2p_bind_host: str = "127.0.0.2"
p2p_bind_port: int = 7070
proposer_id: str = "ait-devnet-proposer"
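
Since `ChainSettings` extends pydantic's `BaseSettings`, the new loopback default can still be overridden per deployment through the environment rather than a code change. A standalone sketch, assuming pydantic's default env-var mapping with no prefix (the class's env configuration is not shown in this hunk):

```python
# Hedged illustration: BaseSettings resolves fields from the environment before
# falling back to coded defaults. The real settings module path is not shown in
# the diff, so an analogous class is defined inline here.
import os
from pydantic_settings import BaseSettings  # on pydantic v1: from pydantic import BaseSettings

class ChainSettings(BaseSettings):
    p2p_bind_host: str = "127.0.0.2"
    p2p_bind_port: int = 7070

os.environ["P2P_BIND_HOST"] = "0.0.0.0"  # e.g. a node that must accept external peers
print(ChainSettings().p2p_bind_host)  # -> 0.0.0.0
```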