feat: add marketplace metrics, privacy features, and service registry endpoints
- Add Prometheus metrics for marketplace API throughput and error rates, with new dashboard panels
- Implement confidential transaction models with encryption support and access control
- Add a key management system with registration, rotation, and audit logging
- Create services and registry routers for service discovery and management
- Integrate ZK proof generation for privacy-preserving receipts
- Add metrics instrumentation
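
The marketplace instrumentation itself is not part of the file below; as a rough sketch of what the throughput and error-rate metrics from the first bullet could look like with prometheus_client (metric names, labels, and the port are illustrative, not taken from this diff):

from prometheus_client import Counter, Histogram, start_http_server

# Requests by endpoint and status code; an error rate can then be derived in
# PromQL, e.g. the rate over status=~"5.." divided by the total request rate.
API_REQUESTS = Counter(
    "marketplace_api_requests_total",
    "Marketplace API requests by endpoint and HTTP status",
    ["endpoint", "status"],
)
API_LATENCY = Histogram(
    "marketplace_api_request_duration_seconds",
    "Marketplace API request latency in seconds",
    ["endpoint"],
)

def record_request(endpoint: str, status: int, duration_s: float) -> None:
    """Record one marketplace API call."""
    API_REQUESTS.labels(endpoint=endpoint, status=str(status)).inc()
    API_LATENCY.labels(endpoint=endpoint).observe(duration_s)

start_http_server(9102)  # exposes /metrics for Prometheus to scrape
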
infra/scripts/chaos_test_network.py (new executable file)

@@ -0,0 +1,372 @@
#!/usr/bin/env python3
"""
Chaos Testing Script - Network Partition

Tests system resilience when blockchain nodes experience network partitions.
"""

import argparse
import asyncio
import json
import logging
import subprocess
import sys
import time
from datetime import datetime, timezone
from typing import Dict, List

import aiohttp

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


class ChaosTestNetwork:
    """Chaos testing for network partition scenarios"""

    def __init__(self, namespace: str = "default"):
        self.namespace = namespace
        self.session = None
        self.metrics = {
            "test_start": None,
            "test_end": None,
            "partition_start": None,
            "partition_end": None,
            "recovery_time": None,
            "mttr": None,
            "error_count": 0,
            "success_count": 0,
            "scenario": "network_partition",
            "affected_nodes": []
        }

    async def __aenter__(self):
        self.session = aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=10))
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        if self.session:
            await self.session.close()

    def get_blockchain_pods(self) -> List[str]:
        """Get list of blockchain node pods"""
        cmd = [
            "kubectl", "get", "pods",
            "-n", self.namespace,
            "-l", "app.kubernetes.io/name=blockchain-node",
            "-o", "jsonpath={.items[*].metadata.name}"
        ]

        try:
            result = subprocess.run(cmd, capture_output=True, text=True, check=True)
            return result.stdout.strip().split()
        except subprocess.CalledProcessError as e:
            logger.error(f"Failed to get blockchain pods: {e}")
            return []

    def get_coordinator_pods(self) -> List[str]:
        """Get list of coordinator pods"""
        cmd = [
            "kubectl", "get", "pods",
            "-n", self.namespace,
            "-l", "app.kubernetes.io/name=coordinator",
            "-o", "jsonpath={.items[*].metadata.name}"
        ]

        try:
            result = subprocess.run(cmd, capture_output=True, text=True, check=True)
            return result.stdout.strip().split()
        except subprocess.CalledProcessError as e:
            logger.error(f"Failed to get coordinator pods: {e}")
            return []

    def apply_network_partition(self, pods: List[str], target_pods: List[str]) -> bool:
        """Apply network partition using iptables"""
        logger.info(f"Applying network partition: blocking traffic between {len(pods)} and {len(target_pods)} pods")

        for pod in pods:
            if pod in target_pods:
                continue

            # Block traffic from this pod to each target pod
            for target_pod in target_pods:
                try:
                    # Get target pod IP
                    cmd = [
                        "kubectl", "get", "pod", target_pod,
                        "-n", self.namespace,
                        "-o", "jsonpath={.status.podIP}"
                    ]
                    result = subprocess.run(cmd, capture_output=True, text=True, check=True)
                    target_ip = result.stdout.strip()

                    if not target_ip:
                        continue

                    # Apply iptables rule to block traffic
                    iptables_cmd = [
                        "kubectl", "exec", "-n", self.namespace, pod, "--",
                        "iptables", "-A", "OUTPUT", "-d", target_ip, "-j", "DROP"
                    ]
                    subprocess.run(iptables_cmd, check=True)

                    logger.info(f"Blocked traffic from {pod} to {target_pod} ({target_ip})")

                except subprocess.CalledProcessError as e:
                    logger.error(f"Failed to block traffic from {pod} to {target_pod}: {e}")
                    return False

        self.metrics["affected_nodes"] = pods + target_pods
        return True

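    # Note: the partition above is one-way. Each rule is the equivalent of
    # running
    #   iptables -A OUTPUT -d <target_ip> -j DROP
    # inside the source pod, so only traffic *from* `pods` *to* `target_pods`
    # is dropped, and the pods must have sufficient privileges (CAP_NET_ADMIN)
    # to run iptables at all.
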
    def remove_network_partition(self, pods: List[str]) -> bool:
        """Remove network partition rules"""
        logger.info("Removing network partition rules")

        for pod in pods:
            try:
                # Flush the OUTPUT chain. This removes *all* OUTPUT rules in
                # the pod, not just the ones added by this script.
                cmd = [
                    "kubectl", "exec", "-n", self.namespace, pod, "--",
                    "iptables", "-F", "OUTPUT"
                ]
                subprocess.run(cmd, check=True)
                logger.info(f"Removed network rules from {pod}")

            except subprocess.CalledProcessError as e:
                logger.error(f"Failed to remove network rules from {pod}: {e}")
                return False

        return True

    async def test_connectivity(self, pods: List[str]) -> Dict[str, bool]:
        """Test connectivity between pods"""
        results = {}

        for pod in pods:
            try:
                # Test if the pod can reach the coordinator
                cmd = [
                    "kubectl", "exec", "-n", self.namespace, pod, "--",
                    "curl", "-s", "--max-time", "5", "http://coordinator:8011/v1/health"
                ]
                result = subprocess.run(cmd, capture_output=True, text=True)
                results[pod] = result.returncode == 0 and "ok" in result.stdout

            except Exception:
                results[pod] = False

        return results

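    # The probe above assumes an in-cluster Service named `coordinator` on
    # port 8011 whose /v1/health response body contains "ok"; a pod counts as
    # connected only if curl exits 0 within the 5-second timeout.
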
    async def monitor_consensus(self, duration: int = 60) -> bool:
        """Monitor blockchain consensus health"""
        logger.info(f"Monitoring consensus for {duration} seconds")

        start_time = time.time()
        last_height = 0

        while time.time() - start_time < duration:
            try:
                pods = self.get_blockchain_pods()
                if not pods:
                    await asyncio.sleep(5)
                    continue

                # Use the first pod to check the chain head height
                cmd = [
                    "kubectl", "exec", "-n", self.namespace, pods[0], "--",
                    "curl", "-s", "http://localhost:8080/v1/blocks/head"
                ]
                result = subprocess.run(cmd, capture_output=True, text=True)

                if result.returncode == 0:
                    try:
                        data = json.loads(result.stdout)
                        current_height = data.get("height", 0)

                        # Consensus is considered healthy if the height keeps advancing
                        if current_height > last_height:
                            last_height = current_height
                            logger.info(f"Blockchain progressing, height: {current_height}")
                        elif time.time() - start_time > 30:  # Allow 30s for initial sync
                            logger.warning(f"Blockchain stuck at height {current_height}")

                    except json.JSONDecodeError:
                        pass

            except Exception as e:
                logger.debug(f"Consensus check failed: {e}")

            await asyncio.sleep(5)

        return last_height > 0

    async def generate_load(self, duration: int, concurrent: int = 5):
        """Generate synthetic load on blockchain nodes"""
        logger.info(f"Generating load for {duration} seconds with {concurrent} concurrent requests")

        # Resolve the blockchain-node service URL
        cmd = [
            "kubectl", "get", "svc", "blockchain-node",
            "-n", self.namespace,
            "-o", "jsonpath={.spec.clusterIP}:{.spec.ports[0].port}"
        ]
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
        base_url = f"http://{result.stdout.strip()}"

        start_time = time.time()

        async def make_request():
            try:
                async with self.session.get(f"{base_url}/v1/blocks/head") as response:
                    if response.status == 200:
                        self.metrics["success_count"] += 1
                    else:
                        self.metrics["error_count"] += 1
            except Exception:
                self.metrics["error_count"] += 1

        while time.time() - start_time < duration:
            # Fire a batch of requests and wait for the whole batch to complete
            batch = [make_request() for _ in range(concurrent)]
            await asyncio.gather(*batch, return_exceptions=True)

            # Brief pause between batches
            await asyncio.sleep(1)

        logger.info(f"Load generation completed. Success: {self.metrics['success_count']}, Errors: {self.metrics['error_count']}")

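    # Load generation is closed-loop: each iteration issues `concurrent`
    # requests, waits for the batch, then sleeps for a second, so the offered
    # load tops out near `concurrent` requests/second rather than behaving
    # like a fixed-rate open-loop generator.
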
    async def run_test(self, partition_duration: int = 60, partition_ratio: float = 0.5):
        """Run the complete network partition chaos test"""
        logger.info("Starting network partition chaos test")
        self.metrics["test_start"] = datetime.now(timezone.utc).isoformat()

        # Get all blockchain pods
        all_pods = self.get_blockchain_pods()
        if not all_pods:
            logger.error("No blockchain pods found")
            return False

        # Determine which pods to partition
        num_partition = int(len(all_pods) * partition_ratio)
        partition_pods = all_pods[:num_partition]
        remaining_pods = all_pods[num_partition:]

        logger.info(f"Partitioning {len(partition_pods)} pods out of {len(all_pods)} total")

        # Phase 1: Baseline connectivity test
        logger.info("Phase 1: Baseline connectivity test")
        baseline_connectivity = await self.test_connectivity(all_pods)
        logger.info(f"Baseline connectivity: {sum(baseline_connectivity.values())}/{len(all_pods)} pods connected")

        # Phase 2: Generate initial load
        logger.info("Phase 2: Generating initial load")
        await self.generate_load(30)

        # Phase 3: Apply network partition
        logger.info("Phase 3: Applying network partition")
        self.metrics["partition_start"] = datetime.now(timezone.utc).isoformat()

        if not self.apply_network_partition(remaining_pods, partition_pods):
            logger.error("Failed to apply network partition")
            return False

        # Verify the partition is effective
        await asyncio.sleep(5)
        partitioned_connectivity = await self.test_connectivity(all_pods)
        logger.info(f"Partitioned connectivity: {sum(partitioned_connectivity.values())}/{len(all_pods)} pods connected")

        # Phase 4: Monitor during partition
        logger.info(f"Phase 4: Monitoring system during {partition_duration}s partition")
        consensus_healthy = await self.monitor_consensus(partition_duration)
        logger.info(f"Consensus progressed during partition: {consensus_healthy}")

        # Phase 5: Remove partition and monitor recovery
        logger.info("Phase 5: Removing network partition")
        self.metrics["partition_end"] = datetime.now(timezone.utc).isoformat()

        if not self.remove_network_partition(all_pods):
            logger.error("Failed to remove network partition")
            return False

        # Wait for recovery
        logger.info("Waiting for network recovery...")
        await asyncio.sleep(10)

        # Test connectivity after recovery
        recovery_connectivity = await self.test_connectivity(all_pods)
        recovery_time = time.time()

        # Recovery time (and hence MTTR) is measured from partition removal
        # until every pod passes the health check again; the 10s sleep above
        # bounds its resolution. Both timestamps are UTC-aware, so the epoch
        # subtraction is correct regardless of the local timezone.
        all_connected = all(recovery_connectivity.values())
        if all_connected:
            partition_end_ts = datetime.fromisoformat(self.metrics["partition_end"]).timestamp()
            self.metrics["recovery_time"] = recovery_time - partition_end_ts
            logger.info(f"Network recovered in {self.metrics['recovery_time']:.2f} seconds")

        # Phase 6: Post-recovery load test
        logger.info("Phase 6: Post-recovery load test")
        await self.generate_load(60)

        # Final metrics
        self.metrics["test_end"] = datetime.now(timezone.utc).isoformat()
        self.metrics["mttr"] = self.metrics["recovery_time"]

        # Save results
        self.save_results()

        logger.info("Network partition chaos test completed successfully")
        return True

    def save_results(self):
        """Save test results to file"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"chaos_test_network_{timestamp}.json"

        with open(filename, "w") as f:
            json.dump(self.metrics, f, indent=2)

        logger.info(f"Test results saved to: {filename}")

        # Print summary
        print("\n=== Chaos Test Summary ===")
        print(f"Scenario: {self.metrics['scenario']}")
        print(f"Test Duration: {self.metrics['test_start']} to {self.metrics['test_end']}")
        print(f"Partition Duration: {self.metrics['partition_start']} to {self.metrics['partition_end']}")
        if self.metrics['mttr'] is not None:
            print(f"MTTR: {self.metrics['mttr']:.2f} seconds")
        else:
            print("MTTR: N/A")
        print(f"Affected Nodes: {len(self.metrics['affected_nodes'])}")
        print(f"Successful Requests: {self.metrics['success_count']}")
        print(f"Failed Requests: {self.metrics['error_count']}")


async def main():
    parser = argparse.ArgumentParser(description="Chaos test for network partition")
    parser.add_argument("--namespace", default="default", help="Kubernetes namespace")
    parser.add_argument("--partition-duration", type=int, default=60, help="Partition duration in seconds")
    parser.add_argument("--partition-ratio", type=float, default=0.5, help="Fraction of nodes to partition (0.0-1.0)")
    parser.add_argument("--dry-run", action="store_true", help="Dry run without actual chaos")

    args = parser.parse_args()

    if args.dry_run:
        logger.info(f"DRY RUN: Would partition {args.partition_ratio * 100}% of nodes for {args.partition_duration} seconds")
        return

    # Verify kubectl is available
    try:
        subprocess.run(["kubectl", "version"], capture_output=True, check=True)
    except (subprocess.CalledProcessError, FileNotFoundError):
        logger.error("kubectl is not available or not configured")
        sys.exit(1)

    # Run the test
    async with ChaosTestNetwork(args.namespace) as test:
        success = await test.run_test(args.partition_duration, args.partition_ratio)
        sys.exit(0 if success else 1)


if __name__ == "__main__":
    asyncio.run(main())
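
Usage: the script is CLI-driven (see main() above for the --namespace, --partition-duration, --partition-ratio, and --dry-run flags), but the class can also be driven directly. A minimal sketch, assuming the file is importable as a module and kubectl has access to a namespace named staging (both assumptions, not shown in this diff):

import asyncio

from chaos_test_network import ChaosTestNetwork  # assumes the script is on the import path

async def demo() -> None:
    # Partition a third of the blockchain nodes for two minutes.
    async with ChaosTestNetwork(namespace="staging") as test:
        ok = await test.run_test(partition_duration=120, partition_ratio=0.33)
        print("test passed" if ok else "test failed")

asyncio.run(demo())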