#!/usr/bin/env python3
"""
Chaos Testing Script - Network Partition

Tests system resilience when blockchain nodes experience network partitions.
"""
import asyncio
import aiohttp
import argparse
import json
import time
import logging
import subprocess
import sys
from datetime import datetime, timezone
from typing import Dict, List, Optional

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


class ChaosTestNetwork:
    """Chaos testing for network partition scenarios"""
    def __init__(self, namespace: str = "default"):
        self.namespace = namespace
        self.session = None
        self.metrics = {
            "test_start": None,
            "test_end": None,
            "partition_start": None,
            "partition_end": None,
            "recovery_time": None,
            "mttr": None,
            "error_count": 0,
            "success_count": 0,
            "scenario": "network_partition",
            "affected_nodes": []
        }

    async def __aenter__(self):
        self.session = aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=10))
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        if self.session:
            await self.session.close()

    def get_blockchain_pods(self) -> List[str]:
        """Get list of blockchain node pods"""
        cmd = [
            "kubectl", "get", "pods",
            "-n", self.namespace,
            "-l", "app.kubernetes.io/name=blockchain-node",
            "-o", "jsonpath={.items[*].metadata.name}"
        ]

        try:
            result = subprocess.run(cmd, capture_output=True, text=True, check=True)
            pods = result.stdout.strip().split()
            return pods
        except subprocess.CalledProcessError as e:
            logger.error(f"Failed to get blockchain pods: {e}")
            return []

    def get_coordinator_pods(self) -> List[str]:
        """Get list of coordinator pods"""
        cmd = [
            "kubectl", "get", "pods",
            "-n", self.namespace,
            "-l", "app.kubernetes.io/name=coordinator",
            "-o", "jsonpath={.items[*].metadata.name}"
        ]

        try:
            result = subprocess.run(cmd, capture_output=True, text=True, check=True)
            pods = result.stdout.strip().split()
            return pods
        except subprocess.CalledProcessError as e:
            logger.error(f"Failed to get coordinator pods: {e}")
            return []

    def apply_network_partition(self, pods: List[str], target_pods: List[str]) -> bool:
        """Apply network partition using iptables"""
        logger.info(f"Applying network partition: blocking traffic between {len(pods)} and {len(target_pods)} pods")

        for pod in pods:
            if pod in target_pods:
                continue

            # Block traffic from this pod to target pods
            for target_pod in target_pods:
                try:
                    # Get target pod IP
                    cmd = [
                        "kubectl", "get", "pod", target_pod,
                        "-n", self.namespace,
                        "-o", "jsonpath={.status.podIP}"
                    ]
                    result = subprocess.run(cmd, capture_output=True, text=True, check=True)
                    target_ip = result.stdout.strip()

                    if not target_ip:
                        continue

                    # Apply iptables rule to block traffic
                    iptables_cmd = [
                        "kubectl", "exec", "-n", self.namespace, pod, "--",
                        "iptables", "-A", "OUTPUT", "-d", target_ip, "-j", "DROP"
                    ]
                    subprocess.run(iptables_cmd, check=True)

                    logger.info(f"Blocked traffic from {pod} to {target_pod} ({target_ip})")

                except subprocess.CalledProcessError as e:
                    logger.error(f"Failed to block traffic from {pod} to {target_pod}: {e}")
                    return False

        self.metrics["affected_nodes"] = pods + target_pods
        return True

    def remove_network_partition(self, pods: List[str]) -> bool:
        """Remove network partition rules"""
        logger.info("Removing network partition rules")

        for pod in pods:
            try:
                # Flush OUTPUT chain (remove all rules)
                cmd = [
                    "kubectl", "exec", "-n", self.namespace, pod, "--",
                    "iptables", "-F", "OUTPUT"
                ]
                subprocess.run(cmd, check=True)
                logger.info(f"Removed network rules from {pod}")

            except subprocess.CalledProcessError as e:
                logger.error(f"Failed to remove network rules from {pod}: {e}")
                return False

        return True

    async def test_connectivity(self, pods: List[str]) -> Dict[str, bool]:
        """Test connectivity between pods"""
        results = {}

        for pod in pods:
            try:
                # Test if pod can reach coordinator
                cmd = [
                    "kubectl", "exec", "-n", self.namespace, pod, "--",
                    "curl", "-s", "--max-time", "5", "http://coordinator:8011/v1/health"
                ]
                result = subprocess.run(cmd, capture_output=True, text=True)
                results[pod] = result.returncode == 0 and "ok" in result.stdout

            except Exception:
                results[pod] = False

        return results

    async def monitor_consensus(self, duration: int = 60) -> bool:
        """Monitor blockchain consensus health"""
        logger.info(f"Monitoring consensus for {duration} seconds")

        start_time = time.time()
        last_height = 0

        while time.time() - start_time < duration:
            try:
                # Get block height from the first available pod
                pods = self.get_blockchain_pods()
                if not pods:
                    await asyncio.sleep(5)
                    continue

                # Use first pod to check height
                cmd = [
                    "kubectl", "exec", "-n", self.namespace, pods[0], "--",
                    "curl", "-s", "http://localhost:8080/v1/blocks/head"
                ]
                result = subprocess.run(cmd, capture_output=True, text=True)

                if result.returncode == 0:
                    try:
                        data = json.loads(result.stdout)
                        current_height = data.get("height", 0)

                        # Check if blockchain is progressing
                        if current_height > last_height:
                            last_height = current_height
                            logger.info(f"Blockchain progressing, height: {current_height}")
                        elif time.time() - start_time > 30:  # Allow 30s for initial sync
                            logger.warning(f"Blockchain stuck at height {current_height}")

                    except json.JSONDecodeError:
                        pass

            except Exception as e:
                logger.debug(f"Consensus check failed: {e}")

            await asyncio.sleep(5)

        return last_height > 0

    async def generate_load(self, duration: int, concurrent: int = 5):
        """Generate synthetic load on blockchain nodes"""
        logger.info(f"Generating load for {duration} seconds with {concurrent} concurrent requests")

        # Get service URL
        cmd = [
            "kubectl", "get", "svc", "blockchain-node",
            "-n", self.namespace,
            "-o", "jsonpath={.spec.clusterIP}:{.spec.ports[0].port}"
        ]
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
        base_url = f"http://{result.stdout.strip()}"

        start_time = time.time()

        async def make_request():
            try:
                async with self.session.get(f"{base_url}/v1/blocks/head") as response:
                    if response.status == 200:
                        self.metrics["success_count"] += 1
                    else:
                        self.metrics["error_count"] += 1
            except Exception:
                self.metrics["error_count"] += 1

        while time.time() - start_time < duration:
            # Create a batch of requests and wait for it to complete
            batch = [make_request() for _ in range(concurrent)]
            await asyncio.gather(*batch, return_exceptions=True)

            # Brief pause between batches
            await asyncio.sleep(1)

        logger.info(f"Load generation completed. Success: {self.metrics['success_count']}, Errors: {self.metrics['error_count']}")

    async def run_test(self, partition_duration: int = 60, partition_ratio: float = 0.5):
        """Run the complete network partition chaos test"""
        logger.info("Starting network partition chaos test")
        self.metrics["test_start"] = datetime.now(timezone.utc).isoformat()

        # Get all blockchain pods
        all_pods = self.get_blockchain_pods()
        if not all_pods:
            logger.error("No blockchain pods found")
            return False

        # Determine which pods to partition
        num_partition = int(len(all_pods) * partition_ratio)
        partition_pods = all_pods[:num_partition]
        remaining_pods = all_pods[num_partition:]

        logger.info(f"Partitioning {len(partition_pods)} pods out of {len(all_pods)} total")

        # Phase 1: Baseline test
        logger.info("Phase 1: Baseline connectivity test")
        baseline_connectivity = await self.test_connectivity(all_pods)
        logger.info(f"Baseline connectivity: {sum(baseline_connectivity.values())}/{len(all_pods)} pods connected")

        # Phase 2: Generate initial load
        logger.info("Phase 2: Generating initial load")
        await self.generate_load(30)

        # Phase 3: Apply network partition
        logger.info("Phase 3: Applying network partition")
        self.metrics["partition_start"] = datetime.now(timezone.utc).isoformat()

        if not self.apply_network_partition(remaining_pods, partition_pods):
            logger.error("Failed to apply network partition")
            return False

        # Verify partition is effective
        await asyncio.sleep(5)
        partitioned_connectivity = await self.test_connectivity(all_pods)
        logger.info(f"Partitioned connectivity: {sum(partitioned_connectivity.values())}/{len(all_pods)} pods connected")

        # Phase 4: Monitor during partition
        logger.info(f"Phase 4: Monitoring system during {partition_duration}s partition")
        consensus_healthy = await self.monitor_consensus(partition_duration)

        # Phase 5: Remove partition and monitor recovery
        logger.info("Phase 5: Removing network partition")
        self.metrics["partition_end"] = datetime.now(timezone.utc).isoformat()

        if not self.remove_network_partition(all_pods):
            logger.error("Failed to remove network partition")
            return False

        # Wait for recovery
        logger.info("Waiting for network recovery...")
        await asyncio.sleep(10)
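        # Note: the fixed 10 s wait above puts a floor of roughly 10 seconds (plus the
        # connectivity-check duration) on the recovery time and MTTR reported below.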
        # Test connectivity after recovery
        recovery_connectivity = await self.test_connectivity(all_pods)
        recovery_time = time.time()

        # Calculate recovery metrics
        all_connected = all(recovery_connectivity.values())
        if all_connected:
            self.metrics["recovery_time"] = recovery_time - datetime.fromisoformat(self.metrics["partition_end"]).timestamp()
            logger.info(f"Network recovered in {self.metrics['recovery_time']:.2f} seconds")

        # Phase 6: Post-recovery load test
        logger.info("Phase 6: Post-recovery load test")
        await self.generate_load(60)

        # Final metrics
        self.metrics["test_end"] = datetime.now(timezone.utc).isoformat()
        self.metrics["mttr"] = self.metrics["recovery_time"]

        # Save results
        self.save_results()

        logger.info("Network partition chaos test completed successfully")
        return True

    def save_results(self):
        """Save test results to file"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"chaos_test_network_{timestamp}.json"

        with open(filename, "w") as f:
            json.dump(self.metrics, f, indent=2)

        logger.info(f"Test results saved to: {filename}")

        # Print summary
        print("\n=== Chaos Test Summary ===")
        print(f"Scenario: {self.metrics['scenario']}")
        print(f"Test Duration: {self.metrics['test_start']} to {self.metrics['test_end']}")
        print(f"Partition Duration: {self.metrics['partition_start']} to {self.metrics['partition_end']}")
        if self.metrics['mttr'] is not None:
            print(f"MTTR: {self.metrics['mttr']:.2f} seconds")
        else:
            print("MTTR: N/A")
        print(f"Affected Nodes: {len(self.metrics['affected_nodes'])}")
        print(f"Successful Requests: {self.metrics['success_count']}")
        print(f"Failed Requests: {self.metrics['error_count']}")


async def main():
    parser = argparse.ArgumentParser(description="Chaos test for network partition")
    parser.add_argument("--namespace", default="default", help="Kubernetes namespace")
    parser.add_argument("--partition-duration", type=int, default=60, help="Partition duration in seconds")
    parser.add_argument("--partition-ratio", type=float, default=0.5, help="Fraction of nodes to partition (0.0-1.0)")
    parser.add_argument("--dry-run", action="store_true", help="Dry run without applying actual chaos")

    args = parser.parse_args()

    if args.dry_run:
        logger.info(f"DRY RUN: Would partition {args.partition_ratio * 100}% of nodes for {args.partition_duration} seconds")
        return

    # Verify kubectl is available
    try:
        subprocess.run(["kubectl", "version"], capture_output=True, check=True)
    except (subprocess.CalledProcessError, FileNotFoundError):
        logger.error("kubectl is not available or not configured")
        sys.exit(1)

    # Run test
    async with ChaosTestNetwork(args.namespace) as test:
        success = await test.run_test(args.partition_duration, args.partition_ratio)
        sys.exit(0 if success else 1)


if __name__ == "__main__":
    asyncio.run(main())