#!/usr/bin/env python3 """ Chaos Testing Script - Database Failure Tests system resilience when PostgreSQL database becomes unavailable """ import asyncio import aiohttp import argparse import json import time import logging import subprocess import sys from datetime import datetime from typing import Dict, List, Optional logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') logger = logging.getLogger(__name__) class ChaosTestDatabase: """Chaos testing for database failure scenarios""" def __init__(self, namespace: str = "default"): self.namespace = namespace self.session = None self.metrics = { "test_start": None, "test_end": None, "failure_start": None, "failure_end": None, "recovery_time": None, "mttr": None, "error_count": 0, "success_count": 0, "scenario": "database_failure", "failure_type": None } async def __aenter__(self): self.session = aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=10)) return self async def __aexit__(self, exc_type, exc_val, exc_tb): if self.session: await self.session.close() def get_postgresql_pod(self) -> Optional[str]: """Get PostgreSQL pod name""" cmd = [ "kubectl", "get", "pods", "-n", self.namespace, "-l", "app.kubernetes.io/name=postgresql", "-o", "jsonpath={.items[0].metadata.name}" ] try: result = subprocess.run(cmd, capture_output=True, text=True, check=True) pod = result.stdout.strip() return pod if pod else None except subprocess.CalledProcessError as e: logger.error(f"Failed to get PostgreSQL pod: {e}") return None def simulate_database_connection_failure(self) -> bool: """Simulate database connection failure by blocking port 5432""" pod = self.get_postgresql_pod() if not pod: return False try: # Block incoming connections to PostgreSQL cmd = [ "kubectl", "exec", "-n", self.namespace, pod, "--", "iptables", "-A", "INPUT", "-p", "tcp", "--dport", "5432", "-j", "DROP" ] subprocess.run(cmd, check=True) # Block outgoing connections from PostgreSQL cmd = [ "kubectl", "exec", "-n", self.namespace, pod, "--", "iptables", "-A", "OUTPUT", "-p", "tcp", "--sport", "5432", "-j", "DROP" ] subprocess.run(cmd, check=True) logger.info(f"Blocked PostgreSQL connections on pod {pod}") self.metrics["failure_type"] = "connection_blocked" return True except subprocess.CalledProcessError as e: logger.error(f"Failed to block PostgreSQL connections: {e}") return False def simulate_database_high_latency(self, latency_ms: int = 5000) -> bool: """Simulate high database latency using netem""" pod = self.get_postgresql_pod() if not pod: return False try: # Add latency to PostgreSQL traffic cmd = [ "kubectl", "exec", "-n", self.namespace, pod, "--", "tc", "qdisc", "add", "dev", "eth0", "root", "netem", "delay", f"{latency_ms}ms" ] subprocess.run(cmd, check=True) logger.info(f"Added {latency_ms}ms latency to PostgreSQL on pod {pod}") self.metrics["failure_type"] = "high_latency" return True except subprocess.CalledProcessError as e: logger.error(f"Failed to add latency to PostgreSQL: {e}") return False def restore_database(self) -> bool: """Restore database connections""" pod = self.get_postgresql_pod() if not pod: return False try: # Remove iptables rules cmd = [ "kubectl", "exec", "-n", self.namespace, pod, "--", "iptables", "-F", "INPUT" ] subprocess.run(cmd, check=False) # May fail if rules don't exist cmd = [ "kubectl", "exec", "-n", self.namespace, pod, "--", "iptables", "-F", "OUTPUT" ] subprocess.run(cmd, check=False) # Remove netem qdisc cmd = [ "kubectl", "exec", "-n", self.namespace, pod, "--", "tc", "qdisc", "del", "dev", "eth0", "root" ] subprocess.run(cmd, check=False) logger.info(f"Restored PostgreSQL connections on pod {pod}") return True except subprocess.CalledProcessError as e: logger.error(f"Failed to restore PostgreSQL: {e}") return False async def test_database_connectivity(self) -> bool: """Test if coordinator can connect to database""" try: # Get coordinator pod cmd = [ "kubectl", "get", "pods", "-n", self.namespace, "-l", "app.kubernetes.io/name=coordinator", "-o", "jsonpath={.items[0].metadata.name}" ] result = subprocess.run(cmd, capture_output=True, text=True, check=True) coordinator_pod = result.stdout.strip() if not coordinator_pod: return False # Test database connection from coordinator cmd = [ "kubectl", "exec", "-n", self.namespace, coordinator_pod, "--", "python", "-c", "import psycopg2; psycopg2.connect('postgresql://aitbc:password@postgresql:5432/aitbc'); print('OK')" ] result = subprocess.run(cmd, capture_output=True, text=True) return result.returncode == 0 and "OK" in result.stdout except Exception: return False async def test_api_health(self) -> bool: """Test if coordinator API is healthy""" try: # Get service URL cmd = [ "kubectl", "get", "svc", "coordinator", "-n", self.namespace, "-o", "jsonpath={.spec.clusterIP}:{.spec.ports[0].port}" ] result = subprocess.run(cmd, capture_output=True, text=True, check=True) service_url = f"http://{result.stdout.strip()}/v1/health" # Test health endpoint response = subprocess.run( ["curl", "-s", "--max-time", "5", service_url], capture_output=True, text=True ) return response.returncode == 0 and "ok" in response.stdout except Exception: return False async def generate_load(self, duration: int, concurrent: int = 10): """Generate synthetic load on coordinator API""" logger.info(f"Generating load for {duration} seconds with {concurrent} concurrent requests") # Get service URL cmd = [ "kubectl", "get", "svc", "coordinator", "-n", self.namespace, "-o", "jsonpath={.spec.clusterIP}:{.spec.ports[0].port}" ] result = subprocess.run(cmd, capture_output=True, text=True, check=True) base_url = f"http://{result.stdout.strip()}" start_time = time.time() tasks = [] async def make_request(): try: async with self.session.get(f"{base_url}/v1/marketplace/offers") as response: if response.status == 200: self.metrics["success_count"] += 1 else: self.metrics["error_count"] += 1 except Exception: self.metrics["error_count"] += 1 while time.time() - start_time < duration: # Create batch of requests batch = [make_request() for _ in range(concurrent)] tasks.extend(batch) # Wait for batch to complete await asyncio.gather(*batch, return_exceptions=True) # Brief pause await asyncio.sleep(1) logger.info(f"Load generation completed. Success: {self.metrics['success_count']}, Errors: {self.metrics['error_count']}") async def wait_for_recovery(self, timeout: int = 300) -> bool: """Wait for database and API to recover""" start_time = time.time() while time.time() - start_time < timeout: # Test database connectivity db_connected = await self.test_database_connectivity() # Test API health api_healthy = await self.test_api_health() if db_connected and api_healthy: recovery_time = time.time() - start_time self.metrics["recovery_time"] = recovery_time logger.info(f"Database and API recovered in {recovery_time:.2f} seconds") return True await asyncio.sleep(5) logger.error("Database and API did not recover within timeout") return False async def run_test(self, failure_type: str = "connection", failure_duration: int = 60): """Run the complete database chaos test""" logger.info(f"Starting database chaos test - failure type: {failure_type}") self.metrics["test_start"] = datetime.utcnow().isoformat() # Phase 1: Baseline test logger.info("Phase 1: Baseline connectivity test") db_connected = await self.test_database_connectivity() api_healthy = await self.test_api_health() if not db_connected or not api_healthy: logger.error("Baseline test failed - database or API not healthy") return False logger.info("Baseline: Database and API are healthy") # Phase 2: Generate initial load logger.info("Phase 2: Generating initial load") await self.generate_load(30) # Phase 3: Induce database failure logger.info("Phase 3: Inducing database failure") self.metrics["failure_start"] = datetime.utcnow().isoformat() if failure_type == "connection": if not self.simulate_database_connection_failure(): logger.error("Failed to induce database connection failure") return False elif failure_type == "latency": if not self.simulate_database_high_latency(): logger.error("Failed to induce database latency") return False else: logger.error(f"Unknown failure type: {failure_type}") return False # Verify failure is effective await asyncio.sleep(5) db_connected = await self.test_database_connectivity() api_healthy = await self.test_api_health() logger.info(f"During failure - DB connected: {db_connected}, API healthy: {api_healthy}") # Phase 4: Monitor during failure logger.info(f"Phase 4: Monitoring system during {failure_duration}s failure") # Generate load during failure await self.generate_load(failure_duration) # Phase 5: Restore database and monitor recovery logger.info("Phase 5: Restoring database") self.metrics["failure_end"] = datetime.utcnow().isoformat() if not self.restore_database(): logger.error("Failed to restore database") return False # Wait for recovery if not await self.wait_for_recovery(): logger.error("System did not recover after database restoration") return False # Phase 6: Post-recovery load test logger.info("Phase 6: Post-recovery load test") await self.generate_load(60) # Final metrics self.metrics["test_end"] = datetime.utcnow().isoformat() self.metrics["mttr"] = self.metrics["recovery_time"] # Save results self.save_results() logger.info("Database chaos test completed successfully") return True def save_results(self): """Save test results to file""" timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") filename = f"chaos_test_database_{timestamp}.json" with open(filename, "w") as f: json.dump(self.metrics, f, indent=2) logger.info(f"Test results saved to: {filename}") # Print summary print("\n=== Chaos Test Summary ===") print(f"Scenario: {self.metrics['scenario']}") print(f"Failure Type: {self.metrics['failure_type']}") print(f"Test Duration: {self.metrics['test_start']} to {self.metrics['test_end']}") print(f"Failure Duration: {self.metrics['failure_start']} to {self.metrics['failure_end']}") print(f"MTTR: {self.metrics['mttr']:.2f} seconds" if self.metrics['mttr'] else "MTTR: N/A") print(f"Success Requests: {self.metrics['success_count']}") print(f"Error Requests: {self.metrics['error_count']}") async def main(): parser = argparse.ArgumentParser(description="Chaos test for database failure") parser.add_argument("--namespace", default="default", help="Kubernetes namespace") parser.add_argument("--failure-type", choices=["connection", "latency"], default="connection", help="Type of failure to simulate") parser.add_argument("--failure-duration", type=int, default=60, help="Failure duration in seconds") parser.add_argument("--dry-run", action="store_true", help="Dry run without actual chaos") args = parser.parse_args() if args.dry_run: logger.info(f"DRY RUN: Would simulate {args.failure_type} database failure for {args.failure_duration} seconds") return # Verify kubectl is available try: subprocess.run(["kubectl", "version"], capture_output=True, check=True) except (subprocess.CalledProcessError, FileNotFoundError): logger.error("kubectl is not available or not configured") sys.exit(1) # Run test async with ChaosTestDatabase(args.namespace) as test: success = await test.run_test(args.failure_type, args.failure_duration) sys.exit(0 if success else 1) if __name__ == "__main__": asyncio.run(main())