feat: add marketplace metrics, privacy features, and service registry endpoints
- Add Prometheus metrics for marketplace API throughput and error rates with new dashboard panels
- Implement confidential transaction models with encryption support and access control
- Add key management system with registration, rotation, and audit logging
- Create services and registry routers for service discovery and management
- Integrate ZK proof generation for privacy-preserving receipts
- Add metrics instrumentation …
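The marketplace metrics themselves are not part of the diff shown below; as a rough sketch of the kind of instrumentation the first bullet describes, the following uses `prometheus_client` with metric and label names that are purely illustrative, not taken from the actual commit:

from prometheus_client import Counter, Histogram

# Illustrative names only; the real metric definitions live in the
# marketplace service code, which is not included in this diff.
MARKETPLACE_REQUESTS = Counter(
    "marketplace_api_requests_total",
    "Total marketplace API requests",
    ["endpoint", "status"],
)
MARKETPLACE_LATENCY = Histogram(
    "marketplace_api_request_duration_seconds",
    "Marketplace API request latency in seconds",
    ["endpoint"],
)

def record_request(endpoint: str, status: int, duration: float) -> None:
    """Record one request so throughput and error rate can be derived in Prometheus."""
    MARKETPLACE_REQUESTS.labels(endpoint=endpoint, status=str(status)).inc()
    MARKETPLACE_LATENCY.labels(endpoint=endpoint).observe(duration)

Throughput and error rate would then be computed in PromQL, e.g. `rate(marketplace_api_requests_total{status=~"5.."}[5m])` divided by the total request rate.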
287
infra/scripts/chaos_test_coordinator.py
Executable file
@@ -0,0 +1,287 @@
#!/usr/bin/env python3
"""
Chaos Testing Script - Coordinator API Outage

Tests system resilience when the coordinator API becomes unavailable.
"""

import argparse
import asyncio
import json
import logging
import subprocess
import sys
import time
from datetime import datetime
from typing import List

import aiohttp

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


class ChaosTestCoordinator:
    """Chaos testing for coordinator API outage scenarios"""

    def __init__(self, namespace: str = "default"):
        self.namespace = namespace
        self.session = None
        self.metrics = {
            "test_start": None,
            "test_end": None,
            "outage_start": None,
            "outage_end": None,
            "recovery_time": None,
            "mttr": None,
            "error_count": 0,
            "success_count": 0,
            "scenario": "coordinator_outage"
        }

    async def __aenter__(self):
        self.session = aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=10))
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        if self.session:
            await self.session.close()

    def get_coordinator_pods(self) -> List[str]:
        """Get the list of coordinator pod names"""
        cmd = [
            "kubectl", "get", "pods",
            "-n", self.namespace,
            "-l", "app.kubernetes.io/name=coordinator",
            "-o", "jsonpath={.items[*].metadata.name}"
        ]

        try:
            result = subprocess.run(cmd, capture_output=True, text=True, check=True)
            return result.stdout.strip().split()
        except subprocess.CalledProcessError as e:
            logger.error(f"Failed to get coordinator pods: {e}")
            return []

    def delete_coordinator_pods(self) -> bool:
        """Delete all coordinator pods to simulate an outage"""
        try:
            cmd = [
                "kubectl", "delete", "pods",
                "-n", self.namespace,
                "-l", "app.kubernetes.io/name=coordinator",
                "--force", "--grace-period=0"
            ]
            subprocess.run(cmd, check=True)
            logger.info("Coordinator pods deleted successfully")
            return True
        except subprocess.CalledProcessError as e:
            logger.error(f"Failed to delete coordinator pods: {e}")
            return False

    async def wait_for_pods_termination(self, timeout: int = 60) -> bool:
        """Wait for all coordinator pods to terminate"""
        start_time = time.time()

        while time.time() - start_time < timeout:
            if not self.get_coordinator_pods():
                logger.info("All coordinator pods terminated")
                return True
            await asyncio.sleep(2)

        logger.error("Timeout waiting for pods to terminate")
        return False

    async def wait_for_recovery(self, timeout: int = 300) -> bool:
        """Wait for the coordinator service to recover"""
        start_time = time.time()

        while time.time() - start_time < timeout:
            try:
                # Check whether any pods exist yet
                if not self.get_coordinator_pods():
                    await asyncio.sleep(5)
                    continue

                # Check whether at least one pod is in the Running phase
                ready_cmd = [
                    "kubectl", "get", "pods",
                    "-n", self.namespace,
                    "-l", "app.kubernetes.io/name=coordinator",
                    "-o", 'jsonpath={.items[?(@.status.phase=="Running")].metadata.name}'
                ]
                result = subprocess.run(ready_cmd, capture_output=True, text=True)
                if result.stdout.strip():
                    # Confirm the API itself is healthy
                    if self.test_health_endpoint():
                        recovery_time = time.time() - start_time
                        self.metrics["recovery_time"] = recovery_time
                        logger.info(f"Service recovered in {recovery_time:.2f} seconds")
                        return True

            except Exception as e:
                logger.debug(f"Recovery check failed: {e}")

            await asyncio.sleep(5)

        logger.error("Service did not recover within timeout")
        return False

    def test_health_endpoint(self) -> bool:
        """Test whether the coordinator health endpoint is responding"""
        try:
            # Resolve the service URL from the cluster
            cmd = [
                "kubectl", "get", "svc", "coordinator",
                "-n", self.namespace,
                "-o", "jsonpath={.spec.clusterIP}:{.spec.ports[0].port}"
            ]
            result = subprocess.run(cmd, capture_output=True, text=True, check=True)
            service_url = f"http://{result.stdout.strip()}/v1/health"

            # Probe the health endpoint
            response = subprocess.run(
                ["curl", "-s", "--max-time", "5", service_url],
                capture_output=True, text=True
            )

            return response.returncode == 0 and "ok" in response.stdout
        except Exception:
            return False

    async def generate_load(self, duration: int, concurrent: int = 10):
        """Generate synthetic load against the coordinator API"""
        logger.info(f"Generating load for {duration} seconds with {concurrent} concurrent requests")

        # Resolve the service URL
        cmd = [
            "kubectl", "get", "svc", "coordinator",
            "-n", self.namespace,
            "-o", "jsonpath={.spec.clusterIP}:{.spec.ports[0].port}"
        ]
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
        base_url = f"http://{result.stdout.strip()}"

        start_time = time.time()

        async def make_request():
            try:
                async with self.session.get(f"{base_url}/v1/marketplace/stats") as response:
                    if response.status == 200:
                        self.metrics["success_count"] += 1
                    else:
                        self.metrics["error_count"] += 1
            except Exception:
                self.metrics["error_count"] += 1

        while time.time() - start_time < duration:
            # Fire a batch of concurrent requests and wait for it to complete
            batch = [make_request() for _ in range(concurrent)]
            await asyncio.gather(*batch, return_exceptions=True)

            # Brief pause between batches
            await asyncio.sleep(1)

        logger.info(f"Load generation completed. Success: {self.metrics['success_count']}, Errors: {self.metrics['error_count']}")

    async def run_test(self, outage_duration: int = 60, load_duration: int = 120) -> bool:
        """Run the complete chaos test"""
        logger.info("Starting coordinator outage chaos test")
        self.metrics["test_start"] = datetime.utcnow().isoformat()

        # Phase 1: Generate initial load
        logger.info("Phase 1: Generating initial load")
        await self.generate_load(30)

        # Phase 2: Induce outage
        logger.info("Phase 2: Inducing coordinator outage")
        self.metrics["outage_start"] = datetime.utcnow().isoformat()

        if not self.delete_coordinator_pods():
            logger.error("Failed to induce outage")
            return False

        if not await self.wait_for_pods_termination():
            logger.error("Pods did not terminate")
            return False

        # Hold the outage for the requested duration
        logger.info(f"Waiting {outage_duration} seconds for the outage to persist")
        await asyncio.sleep(outage_duration)

        # Phase 3: Monitor recovery
        logger.info("Phase 3: Monitoring service recovery")
        self.metrics["outage_end"] = datetime.utcnow().isoformat()

        if not await self.wait_for_recovery():
            logger.error("Service did not recover")
            return False

        # Phase 4: Post-recovery load test
        logger.info("Phase 4: Post-recovery load test")
        await self.generate_load(load_duration)

        # Finalize metrics
        self.metrics["test_end"] = datetime.utcnow().isoformat()
        self.metrics["mttr"] = self.metrics["recovery_time"]

        # Save results
        self.save_results()

        logger.info("Chaos test completed successfully")
        return True

    def save_results(self):
        """Save test results to a JSON file"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"chaos_test_coordinator_{timestamp}.json"

        with open(filename, "w") as f:
            json.dump(self.metrics, f, indent=2)

        logger.info(f"Test results saved to: {filename}")

        # Print summary
        print("\n=== Chaos Test Summary ===")
        print(f"Scenario: {self.metrics['scenario']}")
        print(f"Test Duration: {self.metrics['test_start']} to {self.metrics['test_end']}")
        print(f"Outage Duration: {self.metrics['outage_start']} to {self.metrics['outage_end']}")
        if self.metrics["mttr"] is not None:
            print(f"MTTR: {self.metrics['mttr']:.2f} seconds")
        else:
            print("MTTR: N/A")
        print(f"Success Requests: {self.metrics['success_count']}")
        print(f"Error Requests: {self.metrics['error_count']}")
        total = self.metrics["success_count"] + self.metrics["error_count"]
        # Guard against division by zero when no requests were made
        if total:
            print(f"Error Rate: {self.metrics['error_count'] / total * 100:.2f}%")
        else:
            print("Error Rate: N/A")


async def main():
    parser = argparse.ArgumentParser(description="Chaos test for coordinator API outage")
    parser.add_argument("--namespace", default="default", help="Kubernetes namespace")
    parser.add_argument("--outage-duration", type=int, default=60, help="Outage duration in seconds")
    parser.add_argument("--load-duration", type=int, default=120, help="Post-recovery load test duration")
    parser.add_argument("--dry-run", action="store_true", help="Dry run without actual chaos")

    args = parser.parse_args()

    if args.dry_run:
        logger.info("DRY RUN: Would test coordinator outage without actual deletion")
        return

    # Verify kubectl is available
    try:
        subprocess.run(["kubectl", "version"], capture_output=True, check=True)
    except (subprocess.CalledProcessError, FileNotFoundError):
        logger.error("kubectl is not available or not configured")
        sys.exit(1)

    # Run test
    async with ChaosTestCoordinator(args.namespace) as test:
        success = await test.run_test(args.outage_duration, args.load_duration)
        sys.exit(0 if success else 1)


if __name__ == "__main__":
    asyncio.run(main())
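For reference, a typical dry-run and full invocation of the script (flag names come from `main()` above; the `staging` namespace value is illustrative):

# Validate arguments and wiring without deleting any pods
python3 infra/scripts/chaos_test_coordinator.py --dry-run

# Full test: 60s outage, then a 120s post-recovery load test
python3 infra/scripts/chaos_test_coordinator.py --namespace staging --outage-duration 60 --load-duration 120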