Files
aitbc/apps/multi-region-load-balancer/main.py
AITBC System b033923756 chore: normalize file permissions across repository
- Remove executable permissions from configuration files (.editorconfig, .env.example, .gitignore)
- Remove executable permissions from documentation files (README.md, LICENSE, SECURITY.md)
- Remove executable permissions from web assets (HTML, CSS, JS files)
- Remove executable permissions from data files (JSON, SQL, YAML, requirements.txt)
- Remove executable permissions from source code files across all apps
- Add executable permissions to Python
2026-03-08 11:26:18 +01:00

697 lines
25 KiB
Python
Executable File

"""
Multi-Region Load Balancing Service for AITBC
Handles intelligent load distribution across global regions
"""
import asyncio
import json
import logging
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, Any, List, Optional
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
# Logging setup: INFO-level root configuration plus a module-scoped logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# The FastAPI application every route in this module is registered on.
app = FastAPI(
    title="AITBC Multi-Region Load Balancer",
    description="Intelligent load balancing across global regions",
    version="1.0.0",
)
# Data models
class LoadBalancingRule(BaseModel):
    """Payload describing a single load balancing rule."""

    rule_id: str
    name: str
    # Algorithm name: weighted_round_robin, least_connections, geographic
    # or performance_based.
    algorithm: str
    target_regions: List[str]
    # Weight per region, keyed by region id.
    weights: Dict[str, float]
    health_check_path: str
    failover_enabled: bool
    session_affinity: bool
class RegionHealth(BaseModel):
    """Health snapshot reported for one region."""

    region_id: str
    # Status label: healthy, unhealthy or degraded.
    status: str
    response_time_ms: float
    success_rate: float
    active_connections: int
    last_check: datetime
class LoadBalancingMetrics(BaseModel):
    """Performance sample submitted for one balancer."""

    balancer_id: str
    timestamp: datetime
    total_requests: int
    # Request count per region, keyed by region id.
    requests_per_region: Dict[str, int]
    average_response_time: float
    error_rate: float
    throughput: float
class GeographicRule(BaseModel):
    """Payload describing a geographic routing rule."""

    rule_id: str
    source_regions: List[str]
    target_regions: List[str]
    # Lower number = higher priority.
    priority: int
    latency_threshold_ms: float
# In-memory storage (in production, use database).
# Every store is a module-level dict, so its contents last only as long as
# the process.
load_balancing_rules: Dict[str, Dict] = {}      # rule_id -> rule record
region_health_status: Dict[str, RegionHealth] = {}  # region_id -> latest health
balancing_metrics: Dict[str, List[Dict]] = {}   # balancer_id -> metric records
geographic_rules: Dict[str, Dict] = {}          # rule_id -> geographic rule record
session_affinity_data: Dict[str, Dict] = {}     # not referenced in the visible code
@app.get("/")
async def root():
    """Return the service banner: name, run state, current time and version."""
    banner = {
        "service": "AITBC Multi-Region Load Balancer",
        "status": "running",
    }
    banner["timestamp"] = datetime.utcnow().isoformat()
    banner["version"] = "1.0.0"
    return banner
@app.get("/health")
async def health_check():
    """Report a fixed "healthy" status together with rule and region counters."""
    active_rule_count = sum(
        1 for record in load_balancing_rules.values() if record["status"] == "active"
    )
    healthy_region_count = sum(
        1 for entry in region_health_status.values() if entry.status == "healthy"
    )
    return {
        "status": "healthy",
        "total_rules": len(load_balancing_rules),
        "active_rules": active_rule_count,
        "monitored_regions": len(region_health_status),
        "healthy_regions": healthy_region_count,
    }
# Strong references to the health-monitor tasks started by rule creation.
# The event loop keeps only weak references to tasks, so a task whose result
# is discarded may be garbage collected while it is still running.
_rule_monitor_tasks: set = set()


@app.post("/api/v1/rules/create")
async def create_load_balancing_rule(rule: LoadBalancingRule):
    """Create a new load balancing rule and start monitoring its regions.

    Args:
        rule: The rule definition supplied by the caller.

    Returns:
        A summary dict with the rule id, name, algorithm and creation time.

    Raises:
        HTTPException: 400 when a rule with the same ``rule_id`` already exists.
    """
    if rule.rule_id in load_balancing_rules:
        raise HTTPException(status_code=400, detail="Load balancing rule already exists")

    # One timestamp for both fields so a fresh rule has created_at == last_updated.
    now = datetime.utcnow().isoformat()
    rule_record = {
        "rule_id": rule.rule_id,
        "name": rule.name,
        "algorithm": rule.algorithm,
        "target_regions": rule.target_regions,
        "weights": rule.weights,
        "health_check_path": rule.health_check_path,
        "failover_enabled": rule.failover_enabled,
        "session_affinity": rule.session_affinity,
        "status": "active",
        "created_at": now,
        "total_requests": 0,
        "failed_requests": 0,
        "last_updated": now,
    }
    load_balancing_rules[rule.rule_id] = rule_record

    # Start health monitoring for the target regions. Hold the task in the
    # module-level set until it finishes, then drop it again.
    monitor_task = asyncio.create_task(start_health_monitoring(rule.rule_id))
    _rule_monitor_tasks.add(monitor_task)
    monitor_task.add_done_callback(_rule_monitor_tasks.discard)

    logger.info(f"Load balancing rule created: {rule.name} ({rule.rule_id})")
    return {
        "rule_id": rule.rule_id,
        "status": "created",
        "name": rule.name,
        "algorithm": rule.algorithm,
        "created_at": rule_record["created_at"],
    }
@app.get("/api/v1/rules")
async def list_load_balancing_rules():
    """Return every stored rule record plus total and active counts."""
    records = list(load_balancing_rules.values())
    active_count = sum(1 for record in records if record["status"] == "active")
    return {
        "rules": records,
        "total_rules": len(load_balancing_rules),
        "active_rules": active_count,
    }
@app.get("/api/v1/rules/{rule_id}")
async def get_load_balancing_rule(rule_id: str):
    """Return one rule record enriched with region health and metrics.

    Raises:
        HTTPException: 404 when ``rule_id`` is unknown.
    """
    if rule_id not in load_balancing_rules:
        raise HTTPException(status_code=404, detail="Load balancing rule not found")

    # Shallow copy so the extra keys below never land in the stored record.
    details = dict(load_balancing_rules[rule_id])

    # Health is attached only for target regions that have a recorded status.
    health_by_region = {}
    for region_id in details["target_regions"]:
        if region_id in region_health_status:
            health_by_region[region_id] = region_health_status[region_id]
    details["region_health"] = health_by_region

    # Metrics stored under the same id as the rule (empty list when none).
    details["performance_metrics"] = balancing_metrics.get(rule_id, [])
    return details
@app.post("/api/v1/rules/{rule_id}/update-weights")
async def update_rule_weights(rule_id: str, weights: Dict[str, float]):
    """Replace a rule's weights with the normalised form of ``weights``.

    Args:
        rule_id: Identifier of the rule to update.
        weights: New weight per region; each value is divided by the sum of
            all values so the stored weights add up to 1.

    Returns:
        A dict with the rule id, the normalised weights and the update time.

    Raises:
        HTTPException: 404 when the rule is unknown; 400 when any weight is
            negative or non-finite, or when the weights sum to zero.
    """
    import math

    if rule_id not in load_balancing_rules:
        raise HTTPException(status_code=404, detail="Load balancing rule not found")
    rule = load_balancing_rules[rule_id]

    # A negative or NaN/inf weight would survive normalisation and corrupt the
    # weighted selection, so refuse it up front.
    if any((not math.isfinite(value)) or value < 0 for value in weights.values()):
        raise HTTPException(
            status_code=400,
            detail="Weights must be non-negative finite numbers",
        )

    total_weight = sum(weights.values())
    if total_weight == 0:
        raise HTTPException(status_code=400, detail="Total weight cannot be zero")

    normalized_weights = {k: v / total_weight for k, v in weights.items()}

    rule["weights"] = normalized_weights
    rule["last_updated"] = datetime.utcnow().isoformat()
    logger.info(f"Weights updated for rule {rule_id}: {normalized_weights}")
    return {
        "rule_id": rule_id,
        "new_weights": normalized_weights,
        "updated_at": rule["last_updated"],
    }
@app.post("/api/v1/health/register")
async def register_region_health(health: RegionHealth):
    """Register or update a region's health and apply or undo failover.

    For every rule with failover enabled:

    * an "unhealthy" report for a region in the rule's targets removes that
      region through ``enable_failover`` and remembers its former position
      under the rule's ``failed_over_regions`` key;
    * any later report that is not "unhealthy" puts a remembered region back
      at (or as near as possible to) its former position. Without this step
      the removal would be permanent, because a removed region is no longer
      among the targets and would never be matched again.

    Args:
        health: The health snapshot reported for one region.

    Returns:
        A dict with the region id, the reported status and the registration time.
    """
    region_id = health.region_id
    region_health_status[region_id] = health

    for rule_id, rule in load_balancing_rules.items():
        if not rule["failover_enabled"]:
            continue
        targets = rule["target_regions"]
        # region id -> index it had in target_regions when it was failed over
        parked = rule.get("failed_over_regions", {})

        if health.status == "unhealthy":
            if region_id in targets:
                original_index = targets.index(region_id)
                logger.warning(f"Region {health.region_id} unhealthy, enabling failover for rule {rule_id}")
                enable_failover(rule_id, region_id)
                rule["failed_over_regions"] = {**parked, region_id: original_index}
        elif region_id in parked:
            if region_id not in targets:
                targets.insert(min(parked[region_id], len(targets)), region_id)
            rule["failed_over_regions"] = {
                name: index for name, index in parked.items() if name != region_id
            }
            logger.info(f"Region {region_id} recovered, restored to load balancing rule {rule_id}")

    return {
        "region_id": health.region_id,
        "status": health.status,
        "registered_at": datetime.utcnow().isoformat(),
    }
@app.get("/api/v1/health")
async def get_all_region_health():
    """Return each region's health record plus counts per status label."""
    snapshot = {}
    tally = {"healthy": 0, "unhealthy": 0, "degraded": 0}
    for region_id, health in region_health_status.items():
        snapshot[region_id] = health.dict()
        # Statuses outside the three known labels are not counted.
        if health.status in tally:
            tally[health.status] += 1
    return {
        "region_health": snapshot,
        "total_regions": len(region_health_status),
        "healthy_regions": tally["healthy"],
        "unhealthy_regions": tally["unhealthy"],
        "degraded_regions": tally["degraded"],
    }
@app.post("/api/v1/geographic-rules/create")
async def create_geographic_rule(rule: GeographicRule):
    """Store a new geographic routing rule.

    Raises:
        HTTPException: 400 when a geographic rule with this id already exists.
    """
    new_id = rule.rule_id
    if new_id in geographic_rules:
        raise HTTPException(status_code=400, detail="Geographic rule already exists")

    # New rules start active with a zero usage counter.
    stored = {
        "rule_id": new_id,
        "source_regions": rule.source_regions,
        "target_regions": rule.target_regions,
        "priority": rule.priority,
        "latency_threshold_ms": rule.latency_threshold_ms,
        "status": "active",
        "created_at": datetime.utcnow().isoformat(),
        "usage_count": 0,
    }
    geographic_rules[new_id] = stored
    logger.info(f"Geographic rule created: {rule.rule_id}")

    summary = {"rule_id": new_id, "status": "created"}
    summary["priority"] = rule.priority
    summary["created_at"] = stored["created_at"]
    return summary
@app.get("/api/v1/route/{client_region}")
async def get_optimal_region(client_region: str, rule_id: Optional[str] = None):
    """Pick a target region for ``client_region``.

    With a ``rule_id`` the rule's algorithm decides; without one the
    geographic rules decide.

    Raises:
        HTTPException: 404 when a ``rule_id`` is given but unknown.
    """
    if rule_id:
        if rule_id not in load_balancing_rules:
            raise HTTPException(status_code=404, detail="Load balancing rule not found")
        chosen = select_region_by_algorithm(rule_id, client_region)
    else:
        chosen = select_region_geographically(client_region)

    reason = get_selection_reason(chosen, client_region, rule_id)
    return {
        "client_region": client_region,
        "optimal_region": chosen,
        "rule_id": rule_id,
        "selection_reason": reason,
        "timestamp": datetime.utcnow().isoformat(),
    }
@app.post("/api/v1/metrics/record")
async def record_balancing_metrics(metrics: LoadBalancingMetrics):
    """Append one performance sample to its balancer's history.

    Args:
        metrics: The sample to store; it is filed under ``metrics.balancer_id``.

    Returns:
        A dict with the generated metrics id, the status "recorded" and the
        sample's timestamp in ISO format.
    """
    import time

    # time.time_ns() is a true epoch value with nanosecond resolution.
    # The previous id came from datetime.utcnow().timestamp(), which reads the
    # naive UTC value as local time and repeats for samples in the same second.
    metrics_record = {
        "metrics_id": f"metrics_{time.time_ns()}",
        "balancer_id": metrics.balancer_id,
        "timestamp": metrics.timestamp.isoformat(),
        "total_requests": metrics.total_requests,
        "requests_per_region": metrics.requests_per_region,
        "average_response_time": metrics.average_response_time,
        "error_rate": metrics.error_rate,
        "throughput": metrics.throughput,
    }

    history = balancing_metrics.setdefault(metrics.balancer_id, [])
    history.append(metrics_record)
    # Keep only the last 1000 records per balancer.
    if len(history) > 1000:
        del history[:-1000]

    return {
        "metrics_id": metrics_record["metrics_id"],
        "status": "recorded",
        "timestamp": metrics_record["timestamp"],
    }
@app.get("/api/v1/metrics/{rule_id}")
async def get_balancing_metrics(rule_id: str, hours: int = 24):
    """Return the metrics recorded for a rule within the last ``hours`` hours.

    Args:
        rule_id: Rule whose metrics are requested; also the key used to look
            the samples up in the metrics store.
        hours: Size of the look-back window in hours.

    Returns:
        A dict with the matching samples and their averaged statistics.

    Raises:
        HTTPException: 404 when the rule is unknown.
    """
    from datetime import timezone

    if rule_id not in load_balancing_rules:
        raise HTTPException(status_code=404, detail="Load balancing rule not found")

    def _naive_utc(stamp: str) -> datetime:
        # Stored timestamps come from client-supplied datetimes and may carry
        # a UTC offset. Comparing such a value with the naive cutoff below
        # raises TypeError, so convert aware values to naive UTC first.
        parsed = datetime.fromisoformat(stamp)
        if parsed.utcoffset() is not None:
            parsed = parsed.astimezone(timezone.utc).replace(tzinfo=None)
        return parsed

    cutoff_time = datetime.utcnow() - timedelta(hours=hours)
    recent_metrics = [
        m for m in balancing_metrics.get(rule_id, [])
        if _naive_utc(m["timestamp"]) > cutoff_time
    ]

    # Calculate statistics (all zero when the window is empty).
    if recent_metrics:
        sample_count = len(recent_metrics)
        avg_response_time = sum(m["average_response_time"] for m in recent_metrics) / sample_count
        avg_error_rate = sum(m["error_rate"] for m in recent_metrics) / sample_count
        avg_throughput = sum(m["throughput"] for m in recent_metrics) / sample_count
        total_requests = sum(m["total_requests"] for m in recent_metrics)
    else:
        avg_response_time = avg_error_rate = avg_throughput = total_requests = 0.0

    return {
        "rule_id": rule_id,
        "period_hours": hours,
        "metrics": recent_metrics,
        "statistics": {
            "average_response_time_ms": round(avg_response_time, 3),
            "average_error_rate": round(avg_error_rate, 4),
            "average_throughput": round(avg_throughput, 2),
            "total_requests": int(total_requests),
            "total_samples": len(recent_metrics),
        },
        "generated_at": datetime.utcnow().isoformat(),
    }
@app.get("/api/v1/dashboard")
async def get_load_balancing_dashboard():
    """Assemble the dashboard: overview, region health, performance, activity."""
    # Rule counters.
    rule_records = list(load_balancing_rules.values())
    active_rules = sum(1 for record in rule_records if record["status"] == "active")

    # Region counts per known status label.
    status_totals = {"healthy": 0, "unhealthy": 0, "degraded": 0}
    for entry in region_health_status.values():
        if entry.status in status_totals:
            status_totals[entry.status] += 1
    health_summary = {
        "total_regions": len(region_health_status),
        "healthy": status_totals["healthy"],
        "unhealthy": status_totals["unhealthy"],
        "degraded": status_totals["degraded"],
    }

    # Latest sample of every balancer that has at least one record.
    summary_fields = ("total_requests", "average_response_time", "error_rate", "throughput")
    performance_summary = {
        balancer_id: {field: history[-1][field] for field in summary_fields}
        for balancer_id, history in balancing_metrics.items()
        if history
    }

    # How many rules use each algorithm.
    algorithm_distribution = {}
    for record in rule_records:
        algorithm_distribution.setdefault(record["algorithm"], 0)
        algorithm_distribution[record["algorithm"]] += 1

    overview = {
        "total_rules": len(load_balancing_rules),
        "active_rules": active_rules,
        "geographic_rules": len(geographic_rules),
        "algorithm_distribution": algorithm_distribution,
    }
    dashboard = {
        "overview": overview,
        "region_health": health_summary,
        "performance": performance_summary,
        "recent_activity": get_recent_activity(),
    }
    return {
        "dashboard": dashboard,
        "generated_at": datetime.utcnow().isoformat(),
    }
# Core load balancing functions
def select_region_by_algorithm(rule_id: str, client_region: str) -> Optional[str]:
    """Select a region for ``client_region`` using the rule's algorithm.

    Candidates are the rule's target regions whose recorded status is
    "healthy"; when none qualifies, all target regions are used instead.

    Args:
        rule_id: Identifier of the load balancing rule to apply.
        client_region: Region the client is in (used by the geographic algorithm).

    Returns:
        The chosen region id, or None when the rule is unknown or has no
        target regions left to choose from.
    """
    rule = load_balancing_rules.get(rule_id)
    if rule is None:
        return None

    target_regions = rule["target_regions"]
    healthy_regions = [
        region for region in target_regions
        if region in region_health_status and region_health_status[region].status == "healthy"
    ]
    # Fall back to any target region if no healthy ones are known.
    candidates = healthy_regions or target_regions
    if not candidates:
        # The selection helpers index candidates[0]; with an empty target list
        # (for example after failover removed every region) that would raise
        # IndexError, so report "no region" instead.
        return None

    algorithm = rule["algorithm"]
    if algorithm == "weighted_round_robin":
        return select_weighted_round_robin(rule_id, candidates)
    if algorithm == "least_connections":
        return select_least_connections(candidates)
    if algorithm == "geographic":
        return select_geographic_optimal(client_region, candidates)
    if algorithm == "performance_based":
        return select_performance_optimal(candidates)
    # Unknown algorithm: use the first candidate.
    return candidates[0]
def select_weighted_round_robin(rule_id: str, regions: List[str]) -> str:
    """Pick a region at random, in proportion to the rule's weights.

    Only regions that have an entry in the rule's weights take part; when
    none does, the first region is returned.
    """
    import random

    configured = load_balancing_rules[rule_id]["weights"]
    candidates = {name: configured[name] for name in regions if name in configured}
    if not candidates:
        return regions[0]

    # Simple weighted selection (in production, use proper round robin state):
    # draw a point in [0, total] and walk the cumulative weights up to it.
    pick = random.uniform(0, sum(candidates.values()))
    running_total = 0
    for name, share in candidates.items():
        running_total += share
        if pick <= running_total:
            return name
    return list(candidates)[-1]
def select_least_connections(regions: List[str]) -> str:
    """Return the region with the fewest active connections.

    Regions without a recorded health entry are ignored; when no region has
    one, the first region is returned. Ties go to the earliest region.
    """
    tracked = [name for name in regions if name in region_health_status]
    if not tracked:
        return regions[0]
    quietest = min(tracked, key=lambda name: region_health_status[name].active_connections)
    return quietest or regions[0]
def select_geographic_optimal(client_region: str, target_regions: List[str]) -> str:
    """Return the first nearby region that is also a target region.

    An area matches when the lower-cased ``client_region`` is a substring of
    the area name. Areas and their nearby regions are tried in the order
    listed; if nothing matches, the first target region is returned.
    """
    # Simplified geographic mapping (in production, use actual geographic data).
    nearby_by_area = {
        "us-east": ["us-east-1", "us-west-1"],
        "us-west": ["us-west-1", "us-east-1"],
        "europe": ["eu-west-1", "eu-central-1"],
        "asia": ["ap-southeast-1", "ap-northeast-1"],
    }
    needle = client_region.lower()
    for area, nearby in nearby_by_area.items():
        if needle not in area.lower():
            continue
        match = next((candidate for candidate in nearby if candidate in target_regions), None)
        if match is not None:
            return match
    # Fallback to the first target region.
    return target_regions[0]
def select_performance_optimal(regions: List[str]) -> str:
    """Select the region with the best recorded performance.

    The primary score is ``response_time_ms * (1 - success_rate)`` (lower is
    better). That score is 0 for every region with a 100% success rate, which
    used to make the first such region win regardless of latency, so the
    response time itself is used as a tie-breaker.

    Args:
        regions: Candidate region ids; entries without a recorded health
            status are skipped.

    Returns:
        The best-scoring region, or the first candidate when none of them
        has a recorded health status.
    """
    best_region = None
    best_key = (float('inf'), float('inf'))
    for region in regions:
        health = region_health_status.get(region)
        if health is None:
            continue
        # (score, latency): compared left to right, so latency only matters
        # when two regions share the same score.
        key = (health.response_time_ms * (1 - health.success_rate), health.response_time_ms)
        if key < best_key:
            best_key = key
            best_region = region
    return best_region or regions[0]
def select_region_geographically(client_region: str) -> Optional[str]:
    """Select a region for ``client_region`` from the geographic rules.

    Active rules listing the client region as a source are tried in priority
    order (lower number first). Within a rule, the target with the lowest
    recorded response time below the rule's latency threshold wins, and the
    rule's usage counter is incremented. If no rule yields a target, the
    first region recorded as "healthy" is returned, or None when there is none.
    """
    matching_rules = sorted(
        (
            candidate for candidate in geographic_rules.values()
            if client_region in candidate["source_regions"] and candidate["status"] == "active"
        ),
        key=lambda candidate: candidate["priority"],
    )

    for geo_rule in matching_rules:
        threshold = geo_rule["latency_threshold_ms"]
        # (latency, region) for every monitored target under the threshold.
        eligible = [
            (region_health_status[target].response_time_ms, target)
            for target in geo_rule["target_regions"]
            if target in region_health_status
            and region_health_status[target].response_time_ms < threshold
        ]
        # inf is never accepted by the strict comparison this replaces.
        eligible = [pair for pair in eligible if pair[0] < float('inf')]
        if not eligible:
            continue
        winner = min(eligible, key=lambda pair: pair[0])[1]
        if winner:
            geo_rule["usage_count"] += 1
            return winner

    # Fallback to any healthy region.
    return next(
        (name for name, health in region_health_status.items() if health.status == "healthy"),
        None,
    )
def get_selection_reason(region: str, client_region: str, rule_id: Optional[str]) -> str:
    """Describe why a region was selected.

    A known ``rule_id`` yields a message naming the rule's algorithm and name;
    otherwise the message refers to geographic proximity. ``region`` is
    accepted but not used in the message.
    """
    rule = load_balancing_rules.get(rule_id) if rule_id else None
    if rule is None:
        return f"Selected based on geographic proximity from {client_region}"
    return f"Selected by {rule['algorithm']} algorithm using rule {rule['name']}"
def enable_failover(rule_id: str, unhealthy_region: str):
    """Remove ``unhealthy_region`` from the rule's target regions.

    The stored list is changed in place. Nothing happens, and nothing is
    logged, when the region is not among the targets.
    """
    targets = load_balancing_rules[rule_id]["target_regions"]
    try:
        targets.remove(unhealthy_region)
    except ValueError:
        # Region is not part of this rule's rotation.
        return
    logger.warning(f"Region {unhealthy_region} removed from load balancing rule {rule_id}")
def get_recent_activity() -> List[Dict]:
    """Return up to 20 activity entries from the last hour, newest first.

    Two kinds of entries are produced: a "health_check" per region whose
    ``last_check`` is less than an hour old, and a "rule_update" per rule
    whose ``last_updated`` is less than an hour old.

    Timestamps may be timezone-aware (client-supplied health reports) or
    naive UTC (values generated by this service). Both are normalised to
    naive UTC before being compared or sorted; subtracting an aware value
    from the naive ``utcnow()`` would raise TypeError.
    """
    from datetime import timezone

    def _naive_utc(moment: datetime) -> datetime:
        # Convert aware datetimes to naive UTC; naive ones are taken as UTC.
        if moment.utcoffset() is not None:
            return moment.astimezone(timezone.utc).replace(tzinfo=None)
        return moment

    now = datetime.utcnow()
    dated_entries = []  # (normalised time, entry) pairs

    # Recent health changes
    for region_id, health in region_health_status.items():
        checked_at = _naive_utc(health.last_check)
        if (now - checked_at).total_seconds() < 3600:  # Last hour
            dated_entries.append((checked_at, {
                "type": "health_check",
                "region": region_id,
                "status": health.status,
                "timestamp": health.last_check.isoformat(),
            }))

    # Recent rule updates
    for rule_id, rule in load_balancing_rules.items():
        updated_at = _naive_utc(datetime.fromisoformat(rule["last_updated"]))
        if (now - updated_at).total_seconds() < 3600:
            dated_entries.append((updated_at, {
                "type": "rule_update",
                "rule_id": rule_id,
                "name": rule["name"],
                "timestamp": rule["last_updated"],
            }))

    # Sort on the normalised time rather than the ISO string, which would
    # mis-order values that carry different UTC offsets (most recent first).
    dated_entries.sort(key=lambda pair: pair[0], reverse=True)
    return [entry for _, entry in dated_entries[:20]]
# Background task for health monitoring
async def start_health_monitoring(rule_id: str):
    """Poll the health of a rule's target regions while the rule is active.

    Each pass checks every target region and then sleeps 30 seconds. If a
    pass raises, the error is logged and the loop retries after 10 seconds.
    The loop ends once the rule's status is no longer "active".
    """
    rule = load_balancing_rules[rule_id]
    while True:
        if rule["status"] != "active":
            return
        try:
            # Check health of all target regions
            for region_id in rule["target_regions"]:
                await check_region_health(region_id)
            await asyncio.sleep(30)  # Check every 30 seconds
        except Exception as e:
            logger.error(f"Health monitoring error for rule {rule_id}: {str(e)}")
            await asyncio.sleep(10)
async def check_region_health(region_id: str):
    """Store a simulated health record for ``region_id``.

    No real probe is made: response time, success rate and connection count
    are random draws (in production, this would be actual health checks).
    """
    import random

    # Simulated metrics.
    latency_ms = random.uniform(20, 200)
    ok_ratio = random.uniform(0.95, 1.0)
    open_connections = random.randint(100, 1000)

    # Classify: anything that is neither healthy nor degraded is unhealthy.
    status = "unhealthy"
    if latency_ms < 100 and ok_ratio > 0.99:
        status = "healthy"
    elif latency_ms < 200 and ok_ratio > 0.95:
        status = "degraded"

    region_health_status[region_id] = RegionHealth(
        region_id=region_id,
        status=status,
        response_time_ms=latency_ms,
        success_rate=ok_ratio,
        active_connections=open_connections,
        last_check=datetime.utcnow(),
    )
# Initialize with some default rules
# Strong references to the health-monitor tasks started at startup. The event
# loop keeps only weak references to tasks, so a task whose result is
# discarded may be garbage collected while it is still running.
_startup_monitor_tasks: set = set()


@app.on_event("startup")
async def startup_event():
    """Seed the default rules and start health monitoring for them.

    Registers two default load balancing rules (each with its own monitoring
    task) and two default geographic rules. Existing entries with the same
    ids are overwritten.
    """
    logger.info("Starting AITBC Multi-Region Load Balancer")

    # Initialize default load balancing rules
    default_rules = [
        {
            "rule_id": "global-web-rule",
            "name": "Global Web Load Balancer",
            "algorithm": "weighted_round_robin",
            "target_regions": ["us-east-1", "eu-west-1", "ap-southeast-1"],
            "weights": {"us-east-1": 0.4, "eu-west-1": 0.35, "ap-southeast-1": 0.25},
            "health_check_path": "/health",
            "failover_enabled": True,
            "session_affinity": False,
        },
        {
            "rule_id": "api-performance-rule",
            "name": "API Performance Optimizer",
            "algorithm": "performance_based",
            "target_regions": ["us-east-1", "eu-west-1"],
            "weights": {"us-east-1": 0.5, "eu-west-1": 0.5},
            "health_check_path": "/api/health",
            "failover_enabled": True,
            "session_affinity": True,
        },
    ]
    for rule_data in default_rules:
        # Validate through the model, exactly as a request payload would be.
        rule = LoadBalancingRule(**rule_data)
        now = datetime.utcnow().isoformat()
        load_balancing_rules[rule.rule_id] = {
            "rule_id": rule.rule_id,
            "name": rule.name,
            "algorithm": rule.algorithm,
            "target_regions": rule.target_regions,
            "weights": rule.weights,
            "health_check_path": rule.health_check_path,
            "failover_enabled": rule.failover_enabled,
            "session_affinity": rule.session_affinity,
            "status": "active",
            "created_at": now,
            "total_requests": 0,
            "failed_requests": 0,
            "last_updated": now,
        }
        # Start health monitoring and hold the task until it finishes.
        monitor_task = asyncio.create_task(start_health_monitoring(rule.rule_id))
        _startup_monitor_tasks.add(monitor_task)
        monitor_task.add_done_callback(_startup_monitor_tasks.discard)

    # Initialize default geographic rules
    default_geo_rules = [
        {
            "rule_id": "us-to-us",
            "source_regions": ["us-east", "us-west", "north-america"],
            "target_regions": ["us-east-1", "us-west-1"],
            "priority": 1,
            "latency_threshold_ms": 50,
        },
        {
            "rule_id": "eu-to-eu",
            "source_regions": ["europe", "eu-west", "eu-central"],
            "target_regions": ["eu-west-1", "eu-central-1"],
            "priority": 1,
            "latency_threshold_ms": 30,
        },
    ]
    for geo_rule_data in default_geo_rules:
        geo_rule = GeographicRule(**geo_rule_data)
        geographic_rules[geo_rule.rule_id] = {
            "rule_id": geo_rule.rule_id,
            "source_regions": geo_rule.source_regions,
            "target_regions": geo_rule.target_regions,
            "priority": geo_rule.priority,
            "latency_threshold_ms": geo_rule.latency_threshold_ms,
            "status": "active",
            "created_at": datetime.utcnow().isoformat(),
            "usage_count": 0,
        }
@app.on_event("shutdown")
async def shutdown_event():
    """Log that the service is shutting down; nothing else is done here."""
    logger.info("Shutting down AITBC Multi-Region Load Balancer")
if __name__ == "__main__":
    # Script entry point: serve the app with uvicorn on all interfaces, port 8019.
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8019, log_level="info")