- Add Prometheus metrics for marketplace API throughput and error rates with new dashboard panels
- Implement confidential transaction models with encryption support and access control
- Add key management system with registration, rotation, and audit logging
- Create services and registry routers for service discovery and management
- Integrate ZK proof generation for privacy-preserving receipts
- Add metrics instru
#!/usr/bin/env python3
"""
Chaos Testing Script - Database Failure
Tests system resilience when the PostgreSQL database becomes unavailable.
"""

import argparse
import asyncio
import json
import logging
import subprocess
import sys
import time
from datetime import datetime
from typing import Dict, List, Optional

import aiohttp

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


class ChaosTestDatabase:
    """Chaos testing for database failure scenarios"""

    def __init__(self, namespace: str = "default"):
        self.namespace = namespace
        self.session = None
        self.metrics = {
            "test_start": None,
            "test_end": None,
            "failure_start": None,
            "failure_end": None,
            "recovery_time": None,
            "mttr": None,
            "error_count": 0,
            "success_count": 0,
            "scenario": "database_failure",
            "failure_type": None
        }

    async def __aenter__(self):
        self.session = aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=10))
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        if self.session:
            await self.session.close()

    def get_postgresql_pod(self) -> Optional[str]:
        """Get PostgreSQL pod name"""
        cmd = [
            "kubectl", "get", "pods",
            "-n", self.namespace,
            "-l", "app.kubernetes.io/name=postgresql",
            "-o", "jsonpath={.items[0].metadata.name}"
        ]

        try:
            result = subprocess.run(cmd, capture_output=True, text=True, check=True)
            pod = result.stdout.strip()
            return pod if pod else None
        except subprocess.CalledProcessError as e:
            logger.error(f"Failed to get PostgreSQL pod: {e}")
            return None
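
    # NOTE (added commentary, assumption about the deployment): the failure-injection
    # helpers below run iptables/tc inside the PostgreSQL pod via `kubectl exec`, which
    # only works if the container image ships those binaries and the pod is granted
    # sufficient privileges (e.g. NET_ADMIN). Verify this against your cluster before
    # relying on these scenarios.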
    def simulate_database_connection_failure(self) -> bool:
        """Simulate database connection failure by blocking port 5432"""
        pod = self.get_postgresql_pod()
        if not pod:
            return False

        try:
            # Block incoming connections to PostgreSQL
            cmd = [
                "kubectl", "exec", "-n", self.namespace, pod, "--",
                "iptables", "-A", "INPUT", "-p", "tcp", "--dport", "5432", "-j", "DROP"
            ]
            subprocess.run(cmd, check=True)

            # Block outgoing connections from PostgreSQL
            cmd = [
                "kubectl", "exec", "-n", self.namespace, pod, "--",
                "iptables", "-A", "OUTPUT", "-p", "tcp", "--sport", "5432", "-j", "DROP"
            ]
            subprocess.run(cmd, check=True)

            logger.info(f"Blocked PostgreSQL connections on pod {pod}")
            self.metrics["failure_type"] = "connection_blocked"
            return True

        except subprocess.CalledProcessError as e:
            logger.error(f"Failed to block PostgreSQL connections: {e}")
            return False

    def simulate_database_high_latency(self, latency_ms: int = 5000) -> bool:
        """Simulate high database latency using netem"""
        pod = self.get_postgresql_pod()
        if not pod:
            return False

        try:
            # Add latency to PostgreSQL traffic
            cmd = [
                "kubectl", "exec", "-n", self.namespace, pod, "--",
                "tc", "qdisc", "add", "dev", "eth0", "root", "netem", "delay", f"{latency_ms}ms"
            ]
            subprocess.run(cmd, check=True)

            logger.info(f"Added {latency_ms}ms latency to PostgreSQL on pod {pod}")
            self.metrics["failure_type"] = "high_latency"
            return True

        except subprocess.CalledProcessError as e:
            logger.error(f"Failed to add latency to PostgreSQL: {e}")
            return False

    def restore_database(self) -> bool:
        """Restore database connections"""
        pod = self.get_postgresql_pod()
        if not pod:
            return False

        try:
            # Remove iptables rules
            cmd = [
                "kubectl", "exec", "-n", self.namespace, pod, "--",
                "iptables", "-F", "INPUT"
            ]
            subprocess.run(cmd, check=False)  # May fail if rules don't exist

            cmd = [
                "kubectl", "exec", "-n", self.namespace, pod, "--",
                "iptables", "-F", "OUTPUT"
            ]
            subprocess.run(cmd, check=False)

            # Remove netem qdisc
            cmd = [
                "kubectl", "exec", "-n", self.namespace, pod, "--",
                "tc", "qdisc", "del", "dev", "eth0", "root"
            ]
            subprocess.run(cmd, check=False)

            logger.info(f"Restored PostgreSQL connections on pod {pod}")
            return True

        except subprocess.CalledProcessError as e:
            logger.error(f"Failed to restore PostgreSQL: {e}")
            return False

    async def test_database_connectivity(self) -> bool:
        """Test if coordinator can connect to database"""
        try:
            # Get coordinator pod
            cmd = [
                "kubectl", "get", "pods",
                "-n", self.namespace,
                "-l", "app.kubernetes.io/name=coordinator",
                "-o", "jsonpath={.items[0].metadata.name}"
            ]
            result = subprocess.run(cmd, capture_output=True, text=True, check=True)
            coordinator_pod = result.stdout.strip()

            if not coordinator_pod:
                return False

            # Test database connection from coordinator
            cmd = [
                "kubectl", "exec", "-n", self.namespace, coordinator_pod, "--",
                "python", "-c", "import psycopg2; psycopg2.connect('postgresql://aitbc:password@postgresql:5432/aitbc'); print('OK')"
            ]
            result = subprocess.run(cmd, capture_output=True, text=True)

            return result.returncode == 0 and "OK" in result.stdout

        except Exception:
            return False

    async def test_api_health(self) -> bool:
        """Test if coordinator API is healthy"""
        try:
            # Get service URL
            cmd = [
                "kubectl", "get", "svc", "coordinator",
                "-n", self.namespace,
                "-o", "jsonpath={.spec.clusterIP}:{.spec.ports[0].port}"
            ]
            result = subprocess.run(cmd, capture_output=True, text=True, check=True)
            service_url = f"http://{result.stdout.strip()}/v1/health"

            # Test health endpoint
            response = subprocess.run(
                ["curl", "-s", "--max-time", "5", service_url],
                capture_output=True, text=True
            )

            return response.returncode == 0 and "ok" in response.stdout

        except Exception:
            return False
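
    # The load generator below fires batches of `concurrent` GET requests against
    # /v1/marketplace/offers, waits for each batch, sleeps one second, and repeats
    # until `duration` elapses; outcomes are tallied into success_count/error_count.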
    async def generate_load(self, duration: int, concurrent: int = 10):
        """Generate synthetic load on coordinator API"""
        logger.info(f"Generating load for {duration} seconds with {concurrent} concurrent requests")

        # Get service URL
        cmd = [
            "kubectl", "get", "svc", "coordinator",
            "-n", self.namespace,
            "-o", "jsonpath={.spec.clusterIP}:{.spec.ports[0].port}"
        ]
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
        base_url = f"http://{result.stdout.strip()}"

        start_time = time.time()

        async def make_request():
            try:
                async with self.session.get(f"{base_url}/v1/marketplace/offers") as response:
                    if response.status == 200:
                        self.metrics["success_count"] += 1
                    else:
                        self.metrics["error_count"] += 1
            except Exception:
                self.metrics["error_count"] += 1

        while time.time() - start_time < duration:
            # Create a batch of requests and wait for it to complete
            batch = [make_request() for _ in range(concurrent)]
            await asyncio.gather(*batch, return_exceptions=True)

            # Brief pause between batches
            await asyncio.sleep(1)

        logger.info(f"Load generation completed. Success: {self.metrics['success_count']}, Errors: {self.metrics['error_count']}")

    async def wait_for_recovery(self, timeout: int = 300) -> bool:
        """Wait for database and API to recover"""
        start_time = time.time()

        while time.time() - start_time < timeout:
            # Test database connectivity
            db_connected = await self.test_database_connectivity()

            # Test API health
            api_healthy = await self.test_api_health()

            if db_connected and api_healthy:
                recovery_time = time.time() - start_time
                self.metrics["recovery_time"] = recovery_time
                logger.info(f"Database and API recovered in {recovery_time:.2f} seconds")
                return True

            await asyncio.sleep(5)

        logger.error("Database and API did not recover within timeout")
        return False

    async def run_test(self, failure_type: str = "connection", failure_duration: int = 60):
        """Run the complete database chaos test"""
        logger.info(f"Starting database chaos test - failure type: {failure_type}")
        self.metrics["test_start"] = datetime.utcnow().isoformat()

        # Phase 1: Baseline test
        logger.info("Phase 1: Baseline connectivity test")
        db_connected = await self.test_database_connectivity()
        api_healthy = await self.test_api_health()

        if not db_connected or not api_healthy:
            logger.error("Baseline test failed - database or API not healthy")
            return False

        logger.info("Baseline: Database and API are healthy")

        # Phase 2: Generate initial load
        logger.info("Phase 2: Generating initial load")
        await self.generate_load(30)

        # Phase 3: Induce database failure
        logger.info("Phase 3: Inducing database failure")
        self.metrics["failure_start"] = datetime.utcnow().isoformat()

        if failure_type == "connection":
            if not self.simulate_database_connection_failure():
                logger.error("Failed to induce database connection failure")
                return False
        elif failure_type == "latency":
            if not self.simulate_database_high_latency():
                logger.error("Failed to induce database latency")
                return False
        else:
            logger.error(f"Unknown failure type: {failure_type}")
            return False

        # Verify failure is effective
        await asyncio.sleep(5)
        db_connected = await self.test_database_connectivity()
        api_healthy = await self.test_api_health()

        logger.info(f"During failure - DB connected: {db_connected}, API healthy: {api_healthy}")

        # Phase 4: Monitor during failure
        logger.info(f"Phase 4: Monitoring system during {failure_duration}s failure")

        # Generate load during failure
        await self.generate_load(failure_duration)

        # Phase 5: Restore database and monitor recovery
        logger.info("Phase 5: Restoring database")
        self.metrics["failure_end"] = datetime.utcnow().isoformat()

        if not self.restore_database():
            logger.error("Failed to restore database")
            return False

        # Wait for recovery
        if not await self.wait_for_recovery():
            logger.error("System did not recover after database restoration")
            return False

        # Phase 6: Post-recovery load test
        logger.info("Phase 6: Post-recovery load test")
        await self.generate_load(60)

        # Final metrics
        self.metrics["test_end"] = datetime.utcnow().isoformat()
        self.metrics["mttr"] = self.metrics["recovery_time"]

        # Save results
        self.save_results()

        logger.info("Database chaos test completed successfully")
        return True
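
    # Illustrative shape of the JSON written by save_results() (field names come from
    # self.metrics; the values shown are placeholders, not real output):
    #   {
    #     "test_start": "2024-01-01T00:00:00", "test_end": "2024-01-01T00:10:00",
    #     "failure_start": "...", "failure_end": "...",
    #     "recovery_time": 42.0, "mttr": 42.0,
    #     "error_count": 12, "success_count": 480,
    #     "scenario": "database_failure", "failure_type": "connection_blocked"
    #   }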
    def save_results(self):
        """Save test results to file"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"chaos_test_database_{timestamp}.json"

        with open(filename, "w") as f:
            json.dump(self.metrics, f, indent=2)

        logger.info(f"Test results saved to: {filename}")

        # Print summary
        print("\n=== Chaos Test Summary ===")
        print(f"Scenario: {self.metrics['scenario']}")
        print(f"Failure Type: {self.metrics['failure_type']}")
        print(f"Test Duration: {self.metrics['test_start']} to {self.metrics['test_end']}")
        print(f"Failure Duration: {self.metrics['failure_start']} to {self.metrics['failure_end']}")
        print(f"MTTR: {self.metrics['mttr']:.2f} seconds" if self.metrics['mttr'] else "MTTR: N/A")
        print(f"Success Requests: {self.metrics['success_count']}")
        print(f"Error Requests: {self.metrics['error_count']}")


async def main():
    parser = argparse.ArgumentParser(description="Chaos test for database failure")
    parser.add_argument("--namespace", default="default", help="Kubernetes namespace")
    parser.add_argument("--failure-type", choices=["connection", "latency"], default="connection", help="Type of failure to simulate")
    parser.add_argument("--failure-duration", type=int, default=60, help="Failure duration in seconds")
    parser.add_argument("--dry-run", action="store_true", help="Dry run without actual chaos")

    args = parser.parse_args()

    if args.dry_run:
        logger.info(f"DRY RUN: Would simulate {args.failure_type} database failure for {args.failure_duration} seconds")
        return

    # Verify kubectl is available
    try:
        subprocess.run(["kubectl", "version"], capture_output=True, check=True)
    except (subprocess.CalledProcessError, FileNotFoundError):
        logger.error("kubectl is not available or not configured")
        sys.exit(1)

    # Run test
    async with ChaosTestDatabase(args.namespace) as test:
        success = await test.run_test(args.failure_type, args.failure_duration)
        sys.exit(0 if success else 1)


if __name__ == "__main__":
    asyncio.run(main())