#!/usr/bin/env python3
"""
Chaos Testing Script - Coordinator API Outage
Tests system resilience when the coordinator API becomes unavailable.
"""
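# Example invocation (the script name below is illustrative; use this file's
# actual path). Assumes kubectl is installed and configured for the target
# cluster:
#
#   ./chaos_test_coordinator.py --namespace staging --outage-duration 90
#   ./chaos_test_coordinator.py --dry-run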
import asyncio
import aiohttp
import argparse
import json
import time
import logging
import subprocess
import sys
from datetime import datetime
from typing import Dict, List, Optional

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


class ChaosTestCoordinator:
    """Chaos testing for coordinator API outage scenarios"""

    def __init__(self, namespace: str = "default"):
        self.namespace = namespace
        self.session: Optional[aiohttp.ClientSession] = None
        self.metrics = {
            "test_start": None,
            "test_end": None,
            "outage_start": None,
            "outage_end": None,
            "recovery_time": None,
            "mttr": None,
            "error_count": 0,
            "success_count": 0,
            "scenario": "coordinator_outage"
        }

    async def __aenter__(self):
        self.session = aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=10))
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        if self.session:
            await self.session.close()
    def get_coordinator_pods(self) -> List[str]:
        """Get list of coordinator pod names"""
        cmd = [
            "kubectl", "get", "pods",
            "-n", self.namespace,
            "-l", "app.kubernetes.io/name=coordinator",
            "-o", "jsonpath={.items[*].metadata.name}"
        ]

        try:
            result = subprocess.run(cmd, capture_output=True, text=True, check=True)
            pods = result.stdout.strip().split()
            return pods
        except subprocess.CalledProcessError as e:
            logger.error(f"Failed to get coordinator pods: {e}")
            return []
    def delete_coordinator_pods(self) -> bool:
        """Delete all coordinator pods to simulate an outage"""
        try:
            cmd = [
                "kubectl", "delete", "pods",
                "-n", self.namespace,
                "-l", "app.kubernetes.io/name=coordinator",
                "--force", "--grace-period=0"
            ]
            subprocess.run(cmd, check=True)
            logger.info("Coordinator pods deleted successfully")
            return True
        except subprocess.CalledProcessError as e:
            logger.error(f"Failed to delete coordinator pods: {e}")
            return False
    async def wait_for_pods_termination(self, timeout: int = 60) -> bool:
        """Wait for all coordinator pods to terminate"""
        start_time = time.time()

        while time.time() - start_time < timeout:
            pods = self.get_coordinator_pods()
            if not pods:
                logger.info("All coordinator pods terminated")
                return True
            await asyncio.sleep(2)

        logger.error("Timeout waiting for pods to terminate")
        return False
    async def wait_for_recovery(self, timeout: int = 300) -> bool:
        """Wait for the coordinator service to recover"""
        start_time = time.time()

        while time.time() - start_time < timeout:
            try:
                # Check whether any coordinator pods exist yet
                pods = self.get_coordinator_pods()
                if not pods:
                    await asyncio.sleep(5)
                    continue

                # Check if at least one pod has reached the Running phase
                ready_cmd = [
                    "kubectl", "get", "pods",
                    "-n", self.namespace,
                    "-l", "app.kubernetes.io/name=coordinator",
                    "-o", "jsonpath={.items[?(@.status.phase=='Running')].metadata.name}"
                ]
                result = subprocess.run(ready_cmd, capture_output=True, text=True)
                if result.stdout.strip():
                    # Confirm the API answers on its health endpoint
                    if self.test_health_endpoint():
                        recovery_time = time.time() - start_time
                        self.metrics["recovery_time"] = recovery_time
                        logger.info(f"Service recovered in {recovery_time:.2f} seconds")
                        return True

            except Exception as e:
                logger.debug(f"Recovery check failed: {e}")

            await asyncio.sleep(5)

        logger.error("Service did not recover within timeout")
        return False
    def test_health_endpoint(self) -> bool:
        """Test whether the coordinator health endpoint is responding"""
        try:
            # Resolve the coordinator service URL (ClusterIP and port)
            cmd = [
                "kubectl", "get", "svc", "coordinator",
                "-n", self.namespace,
                "-o", "jsonpath={.spec.clusterIP}:{.spec.ports[0].port}"
            ]
            result = subprocess.run(cmd, capture_output=True, text=True, check=True)
            service_url = f"http://{result.stdout.strip()}/v1/health"
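            # Note: this assumes the ClusterIP is reachable from wherever this
            # script runs (e.g. from inside the cluster or over a routed cluster
            # network); otherwise the curl probe below will always fail.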
            # Test health endpoint
            response = subprocess.run(
                ["curl", "-s", "--max-time", "5", service_url],
                capture_output=True, text=True
            )

            return response.returncode == 0 and "ok" in response.stdout
        except Exception:
            return False
    async def generate_load(self, duration: int, concurrent: int = 10):
        """Generate synthetic load on the coordinator API"""
        logger.info(f"Generating load for {duration} seconds with {concurrent} concurrent requests")

        # Resolve the coordinator service URL (ClusterIP and port)
        cmd = [
            "kubectl", "get", "svc", "coordinator",
            "-n", self.namespace,
            "-o", "jsonpath={.spec.clusterIP}:{.spec.ports[0].port}"
        ]
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
        base_url = f"http://{result.stdout.strip()}"

        start_time = time.time()

        async def make_request():
            try:
                async with self.session.get(f"{base_url}/v1/marketplace/stats") as response:
                    if response.status == 200:
                        self.metrics["success_count"] += 1
                    else:
                        self.metrics["error_count"] += 1
            except Exception:
                self.metrics["error_count"] += 1

        while time.time() - start_time < duration:
            # Fire a batch of concurrent requests and wait for it to complete
            batch = [make_request() for _ in range(concurrent)]
            await asyncio.gather(*batch, return_exceptions=True)

            # Brief pause between batches
            await asyncio.sleep(1)

        logger.info(f"Load generation completed. Success: {self.metrics['success_count']}, Errors: {self.metrics['error_count']}")
    async def run_test(self, outage_duration: int = 60, load_duration: int = 120) -> bool:
        """Run the complete chaos test"""
        logger.info("Starting coordinator outage chaos test")
        self.metrics["test_start"] = datetime.utcnow().isoformat()

        # Phase 1: Generate initial load
        logger.info("Phase 1: Generating initial load")
        await self.generate_load(30)

        # Phase 2: Induce outage
        logger.info("Phase 2: Inducing coordinator outage")
        self.metrics["outage_start"] = datetime.utcnow().isoformat()

        if not self.delete_coordinator_pods():
            logger.error("Failed to induce outage")
            return False

        if not await self.wait_for_pods_termination():
            logger.error("Pods did not terminate")
            return False

        # Hold the outage for the requested duration
        logger.info(f"Waiting {outage_duration} seconds for the outage duration")
        await asyncio.sleep(outage_duration)

        # Phase 3: Monitor recovery
        logger.info("Phase 3: Monitoring service recovery")
        self.metrics["outage_end"] = datetime.utcnow().isoformat()

        if not await self.wait_for_recovery():
            logger.error("Service did not recover")
            return False

        # Phase 4: Post-recovery load test
        logger.info("Phase 4: Post-recovery load test")
        await self.generate_load(load_duration)

        # Calculate metrics (single run, so MTTR equals the measured recovery time)
        self.metrics["test_end"] = datetime.utcnow().isoformat()
        self.metrics["mttr"] = self.metrics["recovery_time"]

        # Save results
        self.save_results()

        logger.info("Chaos test completed successfully")
        return True
    def save_results(self):
        """Save test results to a JSON file and print a summary"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"chaos_test_coordinator_{timestamp}.json"

        with open(filename, "w") as f:
            json.dump(self.metrics, f, indent=2)

        logger.info(f"Test results saved to: {filename}")
        # Print summary
        total = self.metrics["success_count"] + self.metrics["error_count"]
        error_rate = (self.metrics["error_count"] / total * 100) if total else 0.0

        print("\n=== Chaos Test Summary ===")
        print(f"Scenario: {self.metrics['scenario']}")
        print(f"Test Duration: {self.metrics['test_start']} to {self.metrics['test_end']}")
        print(f"Outage Duration: {self.metrics['outage_start']} to {self.metrics['outage_end']}")
        if self.metrics["mttr"] is not None:
            print(f"MTTR: {self.metrics['mttr']:.2f} seconds")
        else:
            print("MTTR: N/A")
        print(f"Success Requests: {self.metrics['success_count']}")
        print(f"Error Requests: {self.metrics['error_count']}")
        print(f"Error Rate: {error_rate:.2f}%")


async def main():
    parser = argparse.ArgumentParser(description="Chaos test for coordinator API outage")
    parser.add_argument("--namespace", default="default", help="Kubernetes namespace")
    parser.add_argument("--outage-duration", type=int, default=60, help="Outage duration in seconds")
    parser.add_argument("--load-duration", type=int, default=120, help="Post-recovery load test duration in seconds")
    parser.add_argument("--dry-run", action="store_true", help="Dry run without inducing actual chaos")

    args = parser.parse_args()

    if args.dry_run:
        logger.info("DRY RUN: Would test coordinator outage without actually deleting pods")
        return

    # Verify kubectl is available
    try:
        subprocess.run(["kubectl", "version"], capture_output=True, check=True)
    except (subprocess.CalledProcessError, FileNotFoundError):
        logger.error("kubectl is not available or not configured")
        sys.exit(1)

    # Run test
    async with ChaosTestCoordinator(args.namespace) as test:
        success = await test.run_test(args.outage_duration, args.load_duration)
        sys.exit(0 if success else 1)


if __name__ == "__main__":
    asyncio.run(main())