aitbc/infra/scripts/chaos_orchestrator.py

#!/usr/bin/env python3
"""
Chaos Testing Orchestrator
Runs multiple chaos test scenarios and aggregates MTTR metrics
"""
import asyncio
import argparse
import json
import logging
import random
import subprocess
import sys
import time
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Optional

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


class ChaosOrchestrator:
    """Orchestrates multiple chaos test scenarios"""

    def __init__(self, namespace: str = "default"):
        self.namespace = namespace
        self.results = {
            "orchestration_start": None,
            "orchestration_end": None,
            "scenarios": [],
            "summary": {
                "total_scenarios": 0,
                "successful_scenarios": 0,
                "failed_scenarios": 0,
                "average_mttr": 0,
                "max_mttr": 0,
                "min_mttr": float('inf')
            }
        }

    async def run_scenario(self, script: str, args: List[str]) -> Optional[Dict]:
        """Run a single chaos test scenario"""
        scenario_name = Path(script).stem.replace("chaos_test_", "")
        logger.info(f"Running scenario: {scenario_name}")

        cmd = ["python3", script] + args
        start_time = time.time()

        try:
            # Run the chaos test script
            process = await asyncio.create_subprocess_exec(
                *cmd,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE
            )
            stdout, stderr = await process.communicate()

            if process.returncode != 0:
                logger.error(f"Scenario {scenario_name} failed with exit code {process.returncode}")
                logger.error(f"Error: {stderr.decode()}")
                return None

            # Find the results file
            result_files = list(Path(".").glob(f"chaos_test_{scenario_name}_*.json"))
            if not result_files:
                logger.error(f"No results file found for scenario {scenario_name}")
                return None

            # Load the most recent result file
            result_file = max(result_files, key=lambda p: p.stat().st_mtime)
            with open(result_file, 'r') as f:
                results = json.load(f)

            # Add execution metadata
            results["execution_time"] = time.time() - start_time
            results["scenario_name"] = scenario_name

            logger.info(f"Scenario {scenario_name} completed successfully")
            return results

        except Exception as e:
            logger.error(f"Failed to run scenario {scenario_name}: {e}")
            return None
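
    # The orchestrator expects each chaos_test_<scenario>_*.json result file to
    # contain at least the fields read above and in generate_report(). A sketch
    # of that assumed schema (field names inferred from this file, values illustrative):
    #   {
    #     "mttr": 47.3,                  # seconds to recover; null if recovery was not observed
    #     "error_count": 12,             # used for the network-partition recommendation
    #     "success_count": 340,
    #     "failure_type": "connection"   # used for the database-failure recommendation
    #   }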

    def calculate_summary_metrics(self):
        """Calculate summary metrics across all scenarios"""
        mttr_values = []
        for scenario in self.results["scenarios"]:
            if scenario.get("mttr"):
                mttr_values.append(scenario["mttr"])

        if mttr_values:
            self.results["summary"]["average_mttr"] = sum(mttr_values) / len(mttr_values)
            self.results["summary"]["max_mttr"] = max(mttr_values)
            self.results["summary"]["min_mttr"] = min(mttr_values)

        self.results["summary"]["total_scenarios"] = len(self.results["scenarios"])
        self.results["summary"]["successful_scenarios"] = sum(
            1 for s in self.results["scenarios"] if s.get("mttr") is not None
        )
        self.results["summary"]["failed_scenarios"] = (
            self.results["summary"]["total_scenarios"] -
            self.results["summary"]["successful_scenarios"]
        )

    def generate_report(self, output_file: Optional[str] = None):
        """Generate a comprehensive chaos test report"""
        report = {
            "report_generated": datetime.utcnow().isoformat(),
            "namespace": self.namespace,
            "orchestration": self.results,
            "recommendations": []
        }

        # Add recommendations based on results
        if self.results["summary"]["average_mttr"] > 120:
            report["recommendations"].append(
                "Average MTTR exceeds 2 minutes. Consider improving recovery automation."
            )
        if self.results["summary"]["max_mttr"] > 300:
            report["recommendations"].append(
                "Maximum MTTR exceeds 5 minutes. Review slowest recovery scenario."
            )
        if self.results["summary"]["failed_scenarios"] > 0:
            report["recommendations"].append(
                f"{self.results['summary']['failed_scenarios']} scenario(s) failed. Review test configuration."
            )

        # Check for specific scenario issues (names match those set by run_scenario)
        for scenario in self.results["scenarios"]:
            if scenario.get("scenario_name") == "coordinator":
                if (scenario.get("mttr") or 0) > 180:
                    report["recommendations"].append(
                        "Coordinator recovery is slow. Consider reducing pod startup time."
                    )
            elif scenario.get("scenario_name") == "network":
                if scenario.get("error_count", 0) > scenario.get("success_count", 0):
                    report["recommendations"].append(
                        "High error rate during network partition. Improve error handling."
                    )
            elif scenario.get("scenario_name") == "database":
                if scenario.get("failure_type") == "connection":
                    report["recommendations"].append(
                        "Consider implementing database connection pooling and retry logic."
                    )

        # Save report
        if output_file:
            with open(output_file, 'w') as f:
                json.dump(report, f, indent=2)
            logger.info(f"Chaos test report saved to: {output_file}")

        # Print summary
        self.print_summary()

        return report
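
    # Rough shape of the report written above (keys come from generate_report();
    # values here are illustrative placeholders only):
    #   {
    #     "report_generated": "2025-01-01T00:00:00",
    #     "namespace": "default",
    #     "orchestration": {"scenarios": [...], "summary": {"average_mttr": 85.2, ...}},
    #     "recommendations": ["Average MTTR exceeds 2 minutes. ..."]
    #   }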

    def print_summary(self):
        """Print a summary of all chaos test results"""
        print("\n" + "=" * 60)
        print("CHAOS TESTING SUMMARY REPORT")
        print("=" * 60)
        print(f"\nTest Execution: {self.results['orchestration_start']} to {self.results['orchestration_end']}")
        print(f"Namespace: {self.namespace}")

        print("\nScenario Results:")
        print("-" * 40)
        for scenario in self.results["scenarios"]:
            name = scenario.get("scenario_name", "Unknown")
            mttr = scenario.get("mttr")
            mttr_display = f"{mttr:.2f}s" if mttr is not None else "N/A"
            print(f"  {name:20} MTTR: {mttr_display}")

        print("\nSummary Metrics:")
        print("-" * 40)
        print(f"  Total Scenarios: {self.results['summary']['total_scenarios']}")
        print(f"  Successful: {self.results['summary']['successful_scenarios']}")
        print(f"  Failed: {self.results['summary']['failed_scenarios']}")

        if self.results["summary"]["average_mttr"] > 0:
            print(f"  Average MTTR: {self.results['summary']['average_mttr']:.2f}s")
            print(f"  Maximum MTTR: {self.results['summary']['max_mttr']:.2f}s")
            print(f"  Minimum MTTR: {self.results['summary']['min_mttr']:.2f}s")

        # SLO compliance
        print("\nSLO Compliance:")
        print("-" * 40)
        slo_target = 120  # 2 minutes
        if self.results["summary"]["successful_scenarios"] == 0:
            print("  - No MTTR data collected; SLO compliance cannot be assessed")
        elif self.results["summary"]["average_mttr"] <= slo_target:
            print(f"  ✓ Average MTTR within SLO ({slo_target}s)")
        else:
            print(f"  ✗ Average MTTR exceeds SLO ({slo_target}s)")

        print("\n" + "=" * 60)

    async def run_all_scenarios(self, scenarios: List[str], scenario_args: Dict[str, List[str]]):
        """Run all specified chaos test scenarios"""
        logger.info("Starting chaos testing orchestration")
        self.results["orchestration_start"] = datetime.utcnow().isoformat()

        for scenario in scenarios:
            args = list(scenario_args.get(scenario, []))
            # Add namespace to all scenarios
            args.extend(["--namespace", self.namespace])

            result = await self.run_scenario(scenario, args)
            if result:
                self.results["scenarios"].append(result)
            else:
                # Record the failure so summary counts reflect scenarios that did not complete
                self.results["scenarios"].append({
                    "scenario_name": Path(scenario).stem.replace("chaos_test_", ""),
                    "mttr": None
                })

        self.results["orchestration_end"] = datetime.utcnow().isoformat()

        # Calculate summary metrics
        self.calculate_summary_metrics()

        # Generate report
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        report_file = f"chaos_test_report_{timestamp}.json"
        self.generate_report(report_file)

        logger.info("Chaos testing orchestration completed")

    async def run_continuous_chaos(self, duration_hours: int = 24, interval_minutes: int = 60):
        """Run chaos tests continuously over time"""
        logger.info(f"Starting continuous chaos testing for {duration_hours} hours")

        end_time = datetime.now() + timedelta(hours=duration_hours)
        interval_seconds = interval_minutes * 60
        all_results = []

        while datetime.now() < end_time:
            cycle_start = datetime.now()
            logger.info(f"Starting chaos test cycle at {cycle_start}")

            # Run a random scenario
            scenarios = [
                "chaos_test_coordinator.py",
                "chaos_test_network.py",
                "chaos_test_database.py"
            ]
            selected_scenario = random.choice(scenarios)

            # Run scenario with reduced duration for continuous testing
            args = ["--namespace", self.namespace]
            if "coordinator" in selected_scenario:
                args.extend(["--outage-duration", "30", "--load-duration", "60"])
            elif "network" in selected_scenario:
                args.extend(["--partition-duration", "30", "--partition-ratio", "0.3"])
            elif "database" in selected_scenario:
                args.extend(["--failure-duration", "30", "--failure-type", "connection"])

            result = await self.run_scenario(selected_scenario, args)
            if result:
                result["cycle_time"] = cycle_start.isoformat()
                all_results.append(result)

            # Wait for next cycle
            elapsed = (datetime.now() - cycle_start).total_seconds()
            if elapsed < interval_seconds:
                wait_time = interval_seconds - elapsed
                logger.info(f"Waiting {wait_time:.0f}s for next cycle")
                await asyncio.sleep(wait_time)

        # Generate continuous testing report
        continuous_report = {
            "continuous_testing": True,
            "duration_hours": duration_hours,
            "interval_minutes": interval_minutes,
            "total_cycles": len(all_results),
            "cycles": all_results
        }

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        report_file = f"continuous_chaos_report_{timestamp}.json"
        with open(report_file, 'w') as f:
            json.dump(continuous_report, f, indent=2)

        logger.info(f"Continuous chaos testing completed. Report saved to: {report_file}")


async def main():
    parser = argparse.ArgumentParser(description="Chaos testing orchestrator")
    parser.add_argument("--namespace", default="default", help="Kubernetes namespace")
    parser.add_argument("--scenarios", nargs="+",
                        choices=["coordinator", "network", "database"],
                        default=["coordinator", "network", "database"],
                        help="Scenarios to run")
    parser.add_argument("--continuous", action="store_true", help="Run continuous chaos testing")
    parser.add_argument("--duration", type=int, default=24, help="Duration in hours for continuous testing")
    parser.add_argument("--interval", type=int, default=60, help="Interval in minutes for continuous testing")
    parser.add_argument("--dry-run", action="store_true", help="Dry run without actual chaos")
    args = parser.parse_args()

    # Verify kubectl is available
    try:
        subprocess.run(["kubectl", "version"], capture_output=True, check=True)
    except (subprocess.CalledProcessError, FileNotFoundError):
        logger.error("kubectl is not available or not configured")
        sys.exit(1)

    orchestrator = ChaosOrchestrator(args.namespace)

    if args.dry_run:
        logger.info(f"DRY RUN: Would run scenarios: {', '.join(args.scenarios)}")
        return

    if args.continuous:
        await orchestrator.run_continuous_chaos(args.duration, args.interval)
    else:
        # Map scenario names to script files
        scenario_map = {
            "coordinator": "chaos_test_coordinator.py",
            "network": "chaos_test_network.py",
            "database": "chaos_test_database.py"
        }

        # Get script files
        scripts = [scenario_map[s] for s in args.scenarios]

        # Default arguments for each scenario
        scenario_args = {
            "chaos_test_coordinator.py": ["--outage-duration", "60", "--load-duration", "120"],
            "chaos_test_network.py": ["--partition-duration", "60", "--partition-ratio", "0.5"],
            "chaos_test_database.py": ["--failure-duration", "60", "--failure-type", "connection"]
        }

        await orchestrator.run_all_scenarios(scripts, scenario_args)


if __name__ == "__main__":
    asyncio.run(main())